Vendor import of llvm release_34 branch r197841 (effectively, 3.4 RC3):

https://llvm.org/svn/llvm-project/llvm/branches/release_34@197841
author: dim <dim@FreeBSD.org> 2013-12-22 00:04:03 +0000
committer: dim <dim@FreeBSD.org> 2013-12-22 00:04:03 +0000
commit: 8cf58e3ee36bd550746fca361a894e2727485200 (patch)
tree: 2ba0398b4c42ad4f55561327538044fd2c925a8b /lib
parent: aa45f148926e3461a1fd8b10c990f0a51a908cc9 (diff)
download: FreeBSD-src-8cf58e3ee36bd550746fca361a894e2727485200.zip
FreeBSD-src-8cf58e3ee36bd550746fca361a894e2727485200.tar.gz
1251 files changed, 147957 insertions, 82357 deletions
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 210b80a..b8b6d37 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -26,6 +26,7 @@
 
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/BasicBlock.h"
@@ -361,24 +362,6 @@ AliasAnalysis::getModRefInfo(const AtomicRMWInst *RMW, const Location &Loc) {
 }
 
 namespace {
-  // Conservatively return true. Return false, if there is a single path
-  // starting from "From" and the path does not reach "To".
-  static bool hasPath(const BasicBlock *From, const BasicBlock *To) {
-    const unsigned MaxCheck = 5;
-    const BasicBlock *Current = From;
-    for (unsigned I = 0; I < MaxCheck; I++) {
-      unsigned NumSuccs = Current->getTerminator()->getNumSuccessors();
-      if (NumSuccs > 1)
-        return true;
-      if (NumSuccs == 0)
-        return false;
-      Current = Current->getTerminator()->getSuccessor(0);
-      if (Current == To)
-        return true;
-    }
-    return true;
-  }
-
   /// Only find pointer captures which happen before the given instruction. Uses
   /// the dominator tree to determine whether one instruction is before another.
   /// Only support the case where the Value is defined in the same basic block
@@ -400,7 +383,7 @@ namespace {
       // there is no need to explore the use if BeforeHere dominates use.
       // Check whether there is a path from I to BeforeHere.
       if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
-          !hasPath(BB, BeforeHere->getParent()))
+          !isPotentiallyReachable(I, BeforeHere, DT))
         return false;
       return true;
     }
@@ -412,7 +395,7 @@ namespace {
       if (BeforeHere != I && !DT->isReachableFromEntry(BB))
         return false;
       if (BeforeHere != I && DT->dominates(BeforeHere, I) &&
-          !hasPath(BB, BeforeHere->getParent()))
+          !isPotentiallyReachable(I, BeforeHere, DT))
         return false;
       Captured = true;
       return true;
@@ -450,6 +433,7 @@ AliasAnalysis::callCapturesBefore(const Instruction *I,
     return AliasAnalysis::ModRef;
 
   unsigned ArgNo = 0;
+  AliasAnalysis::ModRefResult R = AliasAnalysis::NoModRef;
   for (ImmutableCallSite::arg_iterator CI = CS.arg_begin(), CE = CS.arg_end();
        CI != CE; ++CI, ++ArgNo) {
     // Only look at the no-capture or byval pointer arguments.  If this
@@ -463,12 +447,18 @@ AliasAnalysis::callCapturesBefore(const Instruction *I,
     // is impossible to alias the pointer we're checking.  If not, we have to
     // assume that the call could touch the pointer, even though it doesn't
     // escape.
-    if (!isNoAlias(AliasAnalysis::Location(*CI),
-                   AliasAnalysis::Location(Object))) {
-      return AliasAnalysis::ModRef;
+    if (isNoAlias(AliasAnalysis::Location(*CI),
+		  AliasAnalysis::Location(Object)))
+      continue;
+    if (CS.doesNotAccessMemory(ArgNo))
+      continue;
+    if (CS.onlyReadsMemory(ArgNo)) {
+      R = AliasAnalysis::Ref;
+      continue;
     }
+    return AliasAnalysis::ModRef;
   }
-  return AliasAnalysis::NoModRef;
+  return R;
 }
 
 // AliasAnalysis destructor: DO NOT move this to the header file for
@@ -537,6 +527,15 @@ bool llvm::isNoAliasCall(const Value *V) {
   return false;
 }
 
+/// isNoAliasArgument - Return true if this is an argument with the noalias
+/// attribute.
+bool llvm::isNoAliasArgument(const Value *V)
+{
+  if (const Argument *A = dyn_cast<Argument>(V))
+    return A->hasNoAliasAttr();
+  return false;
+}
+
 /// isIdentifiedObject - Return true if this pointer refers to a distinct and
 /// identifiable object.  This returns true for:
 ///    Global Variables and Functions (but not Global Aliases)
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 5910526..2289c12 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -299,7 +299,6 @@ bool AliasSetTracker::add(Value *Ptr, uint64_t Size, const MDNode *TBAAInfo) {
 bool AliasSetTracker::add(LoadInst *LI) {
   if (LI->getOrdering() > Monotonic) return addUnknown(LI);
   AliasSet::AccessType ATy = AliasSet::Refs;
-  if (!LI->isUnordered()) ATy = AliasSet::ModRef;
   bool NewPtr;
   AliasSet &AS = addPointer(LI->getOperand(0),
                             AA.getTypeStoreSize(LI->getType()),
@@ -312,7 +311,6 @@ bool AliasSetTracker::add(LoadInst *LI) {
 bool AliasSetTracker::add(StoreInst *SI) {
   if (SI->getOrdering() > Monotonic) return addUnknown(SI);
   AliasSet::AccessType ATy = AliasSet::Mods;
-  if (!SI->isUnordered()) ATy = AliasSet::ModRef;
   bool NewPtr;
   Value *Val = SI->getOperand(0);
   AliasSet &AS = addPointer(SI->getOperand(1),
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 349c417..98f2a55 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -34,6 +34,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeCFGOnlyViewerPass(Registry);
   initializeCFGOnlyPrinterPass(Registry);
   initializeDependenceAnalysisPass(Registry);
+  initializeDelinearizationPass(Registry);
   initializeDominanceFrontierPass(Registry);
   initializeDomViewerPass(Registry);
   initializeDomPrinterPass(Registry);
@@ -54,16 +55,6 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeMemoryDependenceAnalysisPass(Registry);
   initializeModuleDebugInfoPrinterPass(Registry);
   initializePostDominatorTreePass(Registry);
-  initializeProfileEstimatorPassPass(Registry);
-  initializeNoProfileInfoPass(Registry);
-  initializeNoPathProfileInfoPass(Registry);
-  initializeProfileInfoAnalysisGroup(Registry);
-  initializePathProfileInfoAnalysisGroup(Registry);
-  initializeLoaderPassPass(Registry);
-  initializePathProfileLoaderPassPass(Registry);
-  initializeProfileVerifierPassPass(Registry);
-  initializePathProfileVerifierPass(Registry);
-  initializeProfileMetadataLoaderPassPass(Registry);
   initializeRegionInfoPass(Registry);
   initializeRegionViewerPass(Registry);
   initializeRegionPrinterPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index f8509dd..b2c2011 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -122,7 +122,7 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
   //      question (in this case rewind to p), or
   //    - just give up. It is up to caller to make sure the pointer is pointing
   //      to the base address the object.
-  // 
+  //
   // We go for 2nd option for simplicity.
   if (!isIdentifiedObject(V))
     return false;
@@ -130,7 +130,7 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
   // This function needs to use the aligned object size because we allow
   // reads a bit past the end given sufficient alignment.
   uint64_t ObjectSize = getObjectSize(V, TD, TLI, /*RoundToAlign*/true);
-  
+
   return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize < Size;
 }
 
@@ -142,6 +142,17 @@ static bool isObjectSize(const Value *V, uint64_t Size,
   return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize == Size;
 }
 
+/// isIdentifiedFunctionLocal - Return true if V is umabigously identified
+/// at the function-level. Different IdentifiedFunctionLocals can't alias.
+/// Further, an IdentifiedFunctionLocal can not alias with any function
+/// arguments other than itself, which is not neccessarily true for
+/// IdentifiedObjects.
+static bool isIdentifiedFunctionLocal(const Value *V)
+{
+  return isa<AllocaInst>(V) || isNoAliasCall(V) || isNoAliasArgument(V);
+}
+
+
 //===----------------------------------------------------------------------===//
 // GetElementPtr Instruction Decomposition and Analysis
 //===----------------------------------------------------------------------===//
@@ -152,7 +163,7 @@ namespace {
     EK_SignExt,
     EK_ZeroExt
   };
-  
+
   struct VariableGEPIndex {
     const Value *V;
     ExtensionKind Extension;
@@ -189,7 +200,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
     Offset = 0;
     return V;
   }
-  
+
   if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(V)) {
     if (ConstantInt *RHSC = dyn_cast<ConstantInt>(BOp->getOperand(1))) {
       switch (BOp->getOpcode()) {
@@ -220,7 +231,7 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
       }
     }
   }
-  
+
   // Since GEP indices are sign extended anyway, we don't care about the high
   // bits of a sign or zero extended value - just scales and offsets.  The
   // extensions have to be consistent though.
@@ -237,10 +248,10 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
                                         TD, Depth+1);
     Scale = Scale.zext(OldWidth);
     Offset = Offset.zext(OldWidth);
-    
+
     return Result;
   }
-  
+
   Scale = 1;
   Offset = 0;
   return V;
@@ -265,7 +276,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
                        const DataLayout *TD) {
   // Limit recursion depth to limit compile time in crazy cases.
   unsigned MaxLookup = 6;
-  
+
   BaseOffs = 0;
   do {
     // See if this is a bitcast or GEP.
@@ -280,7 +291,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       }
       return V;
     }
-    
+
     if (Op->getOpcode() == Instruction::BitCast) {
       V = Op->getOperand(0);
       continue;
@@ -297,15 +308,14 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
           V = Simplified;
           continue;
         }
-    
+
       return V;
     }
-    
+
     // Don't attempt to analyze GEPs over unsized objects.
-    if (!cast<PointerType>(GEPOp->getOperand(0)->getType())
-        ->getElementType()->isSized())
+    if (!GEPOp->getOperand(0)->getType()->getPointerElementType()->isSized())
       return V;
-    
+
     // If we are lacking DataLayout information, we can't compute the offets of
     // elements computed by GEPs.  However, we can handle bitcast equivalent
     // GEPs.
@@ -315,7 +325,8 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
       V = GEPOp->getOperand(0);
       continue;
     }
-    
+
+    unsigned AS = GEPOp->getPointerAddressSpace();
     // Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
     gep_type_iterator GTI = gep_type_begin(GEPOp);
     for (User::const_op_iterator I = GEPOp->op_begin()+1,
@@ -326,38 +337,37 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
         // For a struct, add the member offset.
         unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
         if (FieldNo == 0) continue;
-        
+
         BaseOffs += TD->getStructLayout(STy)->getElementOffset(FieldNo);
         continue;
       }
-      
+
       // For an array/pointer, add the element offset, explicitly scaled.
       if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Index)) {
         if (CIdx->isZero()) continue;
         BaseOffs += TD->getTypeAllocSize(*GTI)*CIdx->getSExtValue();
         continue;
       }
-      
+
       uint64_t Scale = TD->getTypeAllocSize(*GTI);
       ExtensionKind Extension = EK_NotExtended;
-      
+
       // If the integer type is smaller than the pointer size, it is implicitly
       // sign extended to pointer size.
-      unsigned Width = cast<IntegerType>(Index->getType())->getBitWidth();
-      if (TD->getPointerSizeInBits() > Width)
+      unsigned Width = Index->getType()->getIntegerBitWidth();
+      if (TD->getPointerSizeInBits(AS) > Width)
         Extension = EK_SignExt;
-      
+
       // Use GetLinearExpression to decompose the index into a C1*V+C2 form.
       APInt IndexScale(Width, 0), IndexOffset(Width, 0);
       Index = GetLinearExpression(Index, IndexScale, IndexOffset, Extension,
                                   *TD, 0);
-      
+
       // The GEP index scale ("Scale") scales C1*V+C2, yielding (C1*V+C2)*Scale.
       // This gives us an aggregate computation of (C1*Scale)*V + C2*Scale.
       BaseOffs += IndexOffset.getSExtValue()*Scale;
       Scale *= IndexScale.getSExtValue();
-      
-      
+
       // If we already had an occurrence of this index variable, merge this
       // scale into it.  For example, we want to handle:
       //   A[x][x] -> x*16 + x*4 -> x*20
@@ -370,25 +380,25 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
           break;
         }
       }
-      
+
       // Make sure that we have a scale that makes sense for this target's
       // pointer size.
-      if (unsigned ShiftBits = 64-TD->getPointerSizeInBits()) {
+      if (unsigned ShiftBits = 64 - TD->getPointerSizeInBits(AS)) {
         Scale <<= ShiftBits;
         Scale = (int64_t)Scale >> ShiftBits;
       }
-      
+
       if (Scale) {
         VariableGEPIndex Entry = {Index, Extension,
                                   static_cast<int64_t>(Scale)};
         VarIndices.push_back(Entry);
       }
     }
-    
+
     // Analyze the base pointer next.
     V = GEPOp->getOperand(0);
   } while (--MaxLookup);
-  
+
   // If the chain of expressions is too deep, just return early.
   return V;
 }
@@ -396,7 +406,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
 /// GetIndexDifference - Dest and Src are the variable indices from two
 /// decomposed GetElementPtr instructions GEP1 and GEP2 which have common base
 /// pointers.  Subtract the GEP2 indices from GEP1 to find the symbolic
-/// difference between the two pointers. 
+/// difference between the two pointers.
 static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
                                const SmallVectorImpl<VariableGEPIndex> &Src) {
   if (Src.empty()) return;
@@ -405,12 +415,12 @@ static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
     const Value *V = Src[i].V;
     ExtensionKind Extension = Src[i].Extension;
     int64_t Scale = Src[i].Scale;
-    
+
     // Find V in Dest.  This is N^2, but pointer indices almost never have more
     // than a few variable indexes.
     for (unsigned j = 0, e = Dest.size(); j != e; ++j) {
       if (Dest[j].V != V || Dest[j].Extension != Extension) continue;
-      
+
       // If we found it, subtract off Scale V's from the entry in Dest.  If it
       // goes to zero, remove the entry.
       if (Dest[j].Scale != Scale)
@@ -420,7 +430,7 @@ static void GetIndexDifference(SmallVectorImpl<VariableGEPIndex> &Dest,
       Scale = 0;
       break;
     }
-    
+
     // If we didn't consume this entry, add it to the end of the Dest list.
     if (Scale) {
       VariableGEPIndex Entry = { V, Extension, -Scale };
@@ -515,7 +525,7 @@ namespace {
         return (AliasAnalysis*)this;
       return this;
     }
-    
+
   private:
     // AliasCache - Track alias queries to guard against recursion.
     typedef std::pair<Location, Location> LocPair;
@@ -685,7 +695,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
          "AliasAnalysis query involving multiple functions!");
 
   const Value *Object = GetUnderlyingObject(Loc.Ptr, TD);
-  
+
   // If this is a tail call and Loc.Ptr points to a stack location, we know that
   // the tail call cannot access or modify the local stack.
   // We cannot exclude byval arguments here; these belong to the caller of
@@ -695,7 +705,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
     if (const CallInst *CI = dyn_cast<CallInst>(CS.getInstruction()))
       if (CI->isTailCall())
         return NoModRef;
-  
+
   // If the pointer is to a locally allocated object that does not escape,
   // then the call can not mod/ref the pointer unless the call takes the pointer
   // as an argument, and itself doesn't capture it.
@@ -711,7 +721,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
       if (!(*CI)->getType()->isPointerTy() ||
           (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo)))
         continue;
-      
+
       // If this is a no-capture pointer argument, see if we can tell that it
       // is impossible to alias the pointer we're checking.  If not, we have to
       // assume that the call could touch the pointer, even though it doesn't
@@ -721,7 +731,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
         break;
       }
     }
-    
+
     if (!PassedAsArg)
       return NoModRef;
   }
@@ -810,7 +820,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
     }
 
   // We can bound the aliasing properties of memset_pattern16 just as we can
-  // for memcpy/memset.  This is particularly important because the 
+  // for memcpy/memset.  This is particularly important because the
   // LoopIdiomRecognizer likes to turn loops into calls to memset_pattern16
   // whenever possible.
   else if (TLI.has(LibFunc::memset_pattern16) &&
@@ -846,8 +856,8 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
   return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min);
 }
 
-static bool areVarIndicesEqual(SmallVector<VariableGEPIndex, 4> &Indices1,
-                               SmallVector<VariableGEPIndex, 4> &Indices2) {
+static bool areVarIndicesEqual(SmallVectorImpl<VariableGEPIndex> &Indices1,
+                               SmallVectorImpl<VariableGEPIndex> &Indices2) {
   unsigned Size1 = Indices1.size();
   unsigned Size2 = Indices2.size();
 
@@ -914,22 +924,22 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
         GEP1VariableIndices.clear();
       }
     }
-    
+
     // If we get a No or May, then return it immediately, no amount of analysis
     // will improve this situation.
     if (BaseAlias != MustAlias) return BaseAlias;
-    
+
     // Otherwise, we have a MustAlias.  Since the base pointers alias each other
     // exactly, see if the computed offset from the common pointer tells us
     // about the relation of the resulting pointer.
     const Value *GEP1BasePtr =
       DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
-    
+
     int64_t GEP2BaseOffset;
     SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
     const Value *GEP2BasePtr =
       DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
-    
+
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
     if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
@@ -937,12 +947,12 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
              "DecomposeGEPExpression and GetUnderlyingObject disagree!");
       return MayAlias;
     }
-    
+
     // Subtract the GEP2 pointer from the GEP1 pointer to find out their
     // symbolic difference.
     GEP1BaseOffset -= GEP2BaseOffset;
     GetIndexDifference(GEP1VariableIndices, GEP2VariableIndices);
-    
+
   } else {
     // Check to see if these two pointers are related by the getelementptr
     // instruction.  If one pointer is a GEP with a non-zero index of the other
@@ -964,7 +974,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
 
     const Value *GEP1BasePtr =
       DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
-    
+
     // DecomposeGEPExpression and GetUnderlyingObject should return the
     // same result except when DecomposeGEPExpression has no DataLayout.
     if (GEP1BasePtr != UnderlyingV1) {
@@ -973,7 +983,7 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
       return MayAlias;
     }
   }
-  
+
   // In the two GEP Case, if there is no difference in the offsets of the
   // computed pointers, the resultant pointers are a must alias.  This
   // hapens when we have two lexically identical GEP's (for example).
@@ -1205,17 +1215,17 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
         (isa<Constant>(O2) && isIdentifiedObject(O1) && !isa<Constant>(O1)))
       return NoAlias;
 
-    // Arguments can't alias with local allocations or noalias calls
-    // in the same function.
-    if (((isa<Argument>(O1) && (isa<AllocaInst>(O2) || isNoAliasCall(O2))) ||
-         (isa<Argument>(O2) && (isa<AllocaInst>(O1) || isNoAliasCall(O1)))))
+    // Function arguments can't alias with things that are known to be
+    // unambigously identified at the function level.
+    if ((isa<Argument>(O1) && isIdentifiedFunctionLocal(O2)) ||
+        (isa<Argument>(O2) && isIdentifiedFunctionLocal(O1)))
       return NoAlias;
 
     // Most objects can't alias null.
     if ((isa<ConstantPointerNull>(O2) && isKnownNonNull(O1)) ||
         (isa<ConstantPointerNull>(O1) && isKnownNonNull(O2)))
       return NoAlias;
-  
+
     // If one pointer is the result of a call/invoke or load and the other is a
     // non-escaping local object within the same function, then we know the
     // object couldn't escape to a point where the call could return it.
@@ -1237,7 +1247,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
     if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD, *TLI)) ||
         (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD, *TLI)))
       return NoAlias;
-  
+
   // Check the cache before climbing up use-def chains. This also terminates
   // otherwise infinitely recursive queries.
   LocPair Locs(Location(V1, V1Size, V1TBAAInfo),
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 100e5c8..62f3ab1 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -1,4 +1,4 @@
-//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------=======//
+//=======-------- BlockFrequencyInfo.cpp - Block Frequency Analysis -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,14 +17,97 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
 
 using namespace llvm;
 
-INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
-                      true, true)
+#ifndef NDEBUG
+enum GVDAGType {
+  GVDT_None,
+  GVDT_Fraction,
+  GVDT_Integer
+};
+
+static cl::opt<GVDAGType>
+ViewBlockFreqPropagationDAG("view-block-freq-propagation-dags", cl::Hidden,
+          cl::desc("Pop up a window to show a dag displaying how block "
+                   "frequencies propagation through the CFG."),
+          cl::values(
+            clEnumValN(GVDT_None, "none",
+                       "do not display graphs."),
+            clEnumValN(GVDT_Fraction, "fraction", "display a graph using the "
+                       "fractional block frequency representation."),
+            clEnumValN(GVDT_Integer, "integer", "display a graph using the raw "
+                       "integer fractional block frequency representation."),
+            clEnumValEnd));
+
+namespace llvm {
+
+template <>
+struct GraphTraits<BlockFrequencyInfo *> {
+  typedef const BasicBlock NodeType;
+  typedef succ_const_iterator ChildIteratorType;
+  typedef Function::const_iterator nodes_iterator;
+
+  static inline const NodeType *getEntryNode(const BlockFrequencyInfo *G) {
+    return G->getFunction()->begin();
+  }
+  static ChildIteratorType child_begin(const NodeType *N) {
+    return succ_begin(N);
+  }
+  static ChildIteratorType child_end(const NodeType *N) {
+    return succ_end(N);
+  }
+  static nodes_iterator nodes_begin(const BlockFrequencyInfo *G) {
+    return G->getFunction()->begin();
+  }
+  static nodes_iterator nodes_end(const BlockFrequencyInfo *G) {
+    return G->getFunction()->end();
+  }
+};
+
+template<>
+struct DOTGraphTraits<BlockFrequencyInfo*> : public DefaultDOTGraphTraits {
+  explicit DOTGraphTraits(bool isSimple=false) :
+    DefaultDOTGraphTraits(isSimple) {}
+
+  static std::string getGraphName(const BlockFrequencyInfo *G) {
+    return G->getFunction()->getName();
+  }
+
+  std::string getNodeLabel(const BasicBlock *Node,
+                           const BlockFrequencyInfo *Graph) {
+    std::string Result;
+    raw_string_ostream OS(Result);
+
+    OS << Node->getName().str() << ":";
+    switch (ViewBlockFreqPropagationDAG) {
+    case GVDT_Fraction:
+      Graph->getBlockFreq(Node).print(OS);
+      break;
+    case GVDT_Integer:
+      OS << Graph->getBlockFreq(Node).getFrequency();
+      break;
+    case GVDT_None:
+      llvm_unreachable("If we are not supposed to render a graph we should "
+                       "never reach this point.");
+    }
+
+    return Result;
+  }
+};
+
+} // end namespace llvm
+#endif
+
+INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq",
+                      "Block Frequency Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
-INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq", "Block Frequency Analysis",
-                    true, true)
+INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq",
+                    "Block Frequency Analysis", true, true)
 
 char BlockFrequencyInfo::ID = 0;
 
@@ -46,6 +129,10 @@ void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
 bool BlockFrequencyInfo::runOnFunction(Function &F) {
   BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
   BFI->doFunction(&F, &BPI);
+#ifndef NDEBUG
+  if (ViewBlockFreqPropagationDAG != GVDT_None)
+    view();
+#endif
   return false;
 }
 
@@ -53,11 +140,22 @@ void BlockFrequencyInfo::print(raw_ostream &O, const Module *) const {
   if (BFI) BFI->print(O);
 }
 
-/// getblockFreq - Return block frequency. Return 0 if we don't have the
-/// information. Please note that initial frequency is equal to 1024. It means
-/// that we should not rely on the value itself, but only on the comparison to
-/// the other block frequencies. We do this to avoid using of floating points.
-///
 BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
   return BFI->getBlockFreq(BB);
 }
+
+/// Pop up a ghostview window with the current block frequency propagation
+/// rendered using dot.
+void BlockFrequencyInfo::view() const {
+// This code is only for debugging.
+#ifndef NDEBUG
+  ViewGraph(const_cast<BlockFrequencyInfo *>(this), "BlockFrequencyDAGs");
+#else
+  errs() << "BlockFrequencyInfo::view is only available in debug builds on "
+            "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+const Function *BlockFrequencyInfo::getFunction() const {
+  return BFI->Fn;
+}
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 6c58856..86560ca 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -69,6 +69,20 @@ static const uint32_t UR_TAKEN_WEIGHT = 1;
 /// easily subsume it.
 static const uint32_t UR_NONTAKEN_WEIGHT = 1024*1024 - 1;
 
+/// \brief Weight for a branch taken going into a cold block.
+///
+/// This is the weight for a branch taken toward a block marked
+/// cold.  A block is marked cold if it's postdominated by a
+/// block containing a call to a cold function.  Cold functions
+/// are those marked with attribute 'cold'.
+static const uint32_t CC_TAKEN_WEIGHT = 4;
+
+/// \brief Weight for a branch not-taken into a cold block.
+///
+/// This is the weight for a branch not taken toward a block marked
+/// cold.
+static const uint32_t CC_NONTAKEN_WEIGHT = 64;
+
 static const uint32_t PH_TAKEN_WEIGHT = 20;
 static const uint32_t PH_NONTAKEN_WEIGHT = 12;
 
@@ -137,8 +151,8 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
 
   uint32_t UnreachableWeight =
     std::max(UR_TAKEN_WEIGHT / (unsigned)UnreachableEdges.size(), MIN_WEIGHT);
-  for (SmallVector<unsigned, 4>::iterator I = UnreachableEdges.begin(),
-                                          E = UnreachableEdges.end();
+  for (SmallVectorImpl<unsigned>::iterator I = UnreachableEdges.begin(),
+                                           E = UnreachableEdges.end();
        I != E; ++I)
     setEdgeWeight(BB, *I, UnreachableWeight);
 
@@ -147,8 +161,8 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
   uint32_t ReachableWeight =
     std::max(UR_NONTAKEN_WEIGHT / (unsigned)ReachableEdges.size(),
              NORMAL_WEIGHT);
-  for (SmallVector<unsigned, 4>::iterator I = ReachableEdges.begin(),
-                                          E = ReachableEdges.end();
+  for (SmallVectorImpl<unsigned>::iterator I = ReachableEdges.begin(),
+                                           E = ReachableEdges.end();
        I != E; ++I)
     setEdgeWeight(BB, *I, ReachableWeight);
 
@@ -193,6 +207,67 @@ bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) {
   return true;
 }
 
+/// \brief Calculate edge weights for edges leading to cold blocks.
+///
+/// A cold block is one post-dominated by  a block with a call to a
+/// cold function.  Those edges are unlikely to be taken, so we give
+/// them relatively low weight.
+///
+/// Return true if we could compute the weights for cold edges.
+/// Return false, otherwise.
+bool BranchProbabilityInfo::calcColdCallHeuristics(BasicBlock *BB) {
+  TerminatorInst *TI = BB->getTerminator();
+  if (TI->getNumSuccessors() == 0)
+    return false;
+
+  // Determine which successors are post-dominated by a cold block.
+  SmallVector<unsigned, 4> ColdEdges;
+  SmallVector<unsigned, 4> NormalEdges;
+  for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+    if (PostDominatedByColdCall.count(*I))
+      ColdEdges.push_back(I.getSuccessorIndex());
+    else
+      NormalEdges.push_back(I.getSuccessorIndex());
+
+  // If all successors are in the set of blocks post-dominated by cold calls,
+  // this block is in the set post-dominated by cold calls.
+  if (ColdEdges.size() == TI->getNumSuccessors())
+    PostDominatedByColdCall.insert(BB);
+  else {
+    // Otherwise, if the block itself contains a cold function, add it to the
+    // set of blocks postdominated by a cold call.
+    assert(!PostDominatedByColdCall.count(BB));
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      if (CallInst *CI = dyn_cast<CallInst>(I))
+        if (CI->hasFnAttr(Attribute::Cold)) {
+          PostDominatedByColdCall.insert(BB);
+          break;
+        }
+  }
+
+  // Skip probabilities if this block has a single successor.
+  if (TI->getNumSuccessors() == 1 || ColdEdges.empty())
+    return false;
+
+  uint32_t ColdWeight =
+      std::max(CC_TAKEN_WEIGHT / (unsigned) ColdEdges.size(), MIN_WEIGHT);
+  for (SmallVectorImpl<unsigned>::iterator I = ColdEdges.begin(),
+                                           E = ColdEdges.end();
+       I != E; ++I)
+    setEdgeWeight(BB, *I, ColdWeight);
+
+  if (NormalEdges.empty())
+    return true;
+  uint32_t NormalWeight = std::max(
+      CC_NONTAKEN_WEIGHT / (unsigned) NormalEdges.size(), NORMAL_WEIGHT);
+  for (SmallVectorImpl<unsigned>::iterator I = NormalEdges.begin(),
+                                           E = NormalEdges.end();
+       I != E; ++I)
+    setEdgeWeight(BB, *I, NormalWeight);
+
+  return true;
+}
+
 // Calculate Edge Weights using "Pointer Heuristics". Predict a comparsion
 // between two pointer or pointer and NULL will fail.
 bool BranchProbabilityInfo::calcPointerHeuristics(BasicBlock *BB) {
@@ -251,7 +326,7 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
     if (backWeight < NORMAL_WEIGHT)
       backWeight = NORMAL_WEIGHT;
 
-    for (SmallVector<unsigned, 8>::iterator EI = BackEdges.begin(),
+    for (SmallVectorImpl<unsigned>::iterator EI = BackEdges.begin(),
          EE = BackEdges.end(); EI != EE; ++EI) {
       setEdgeWeight(BB, *EI, backWeight);
     }
@@ -262,7 +337,7 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
     if (inWeight < NORMAL_WEIGHT)
       inWeight = NORMAL_WEIGHT;
 
-    for (SmallVector<unsigned, 8>::iterator EI = InEdges.begin(),
+    for (SmallVectorImpl<unsigned>::iterator EI = InEdges.begin(),
          EE = InEdges.end(); EI != EE; ++EI) {
       setEdgeWeight(BB, *EI, inWeight);
     }
@@ -273,7 +348,7 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
     if (exitWeight < MIN_WEIGHT)
       exitWeight = MIN_WEIGHT;
 
-    for (SmallVector<unsigned, 8>::iterator EI = ExitingEdges.begin(),
+    for (SmallVectorImpl<unsigned>::iterator EI = ExitingEdges.begin(),
          EE = ExitingEdges.end(); EI != EE; ++EI) {
       setEdgeWeight(BB, *EI, exitWeight);
     }
@@ -323,10 +398,24 @@ bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) {
     // InstCombine canonicalizes X <= 0 into X < 1.
     // X <= 0   ->  Unlikely
     isProb = false;
-  } else if (CV->isAllOnesValue() && CI->getPredicate() == CmpInst::ICMP_SGT) {
-    // InstCombine canonicalizes X >= 0 into X > -1.
-    // X >= 0   ->  Likely
-    isProb = true;
+  } else if (CV->isAllOnesValue()) {
+    switch (CI->getPredicate()) {
+    case CmpInst::ICMP_EQ:
+      // X == -1  ->  Unlikely
+      isProb = false;
+      break;
+    case CmpInst::ICMP_NE:
+      // X != -1  ->  Likely
+      isProb = true;
+      break;
+    case CmpInst::ICMP_SGT:
+      // InstCombine canonicalizes X >= 0 into X > -1.
+      // X >= 0   ->  Likely
+      isProb = true;
+      break;
+    default:
+      return false;
+    }
   } else {
     return false;
   }
@@ -397,6 +486,7 @@ bool BranchProbabilityInfo::runOnFunction(Function &F) {
   LastF = &F; // Store the last function we ran on for printing.
   LI = &getAnalysis<LoopInfo>();
   assert(PostDominatedByUnreachable.empty());
+  assert(PostDominatedByColdCall.empty());
 
   // Walk the basic blocks in post-order so that we can build up state about
   // the successors of a block iteratively.
@@ -408,6 +498,8 @@ bool BranchProbabilityInfo::runOnFunction(Function &F) {
       continue;
     if (calcMetadataWeights(*I))
       continue;
+    if (calcColdCallHeuristics(*I))
+      continue;
     if (calcLoopBranchHeuristics(*I))
       continue;
     if (calcPointerHeuristics(*I))
@@ -420,6 +512,7 @@ bool BranchProbabilityInfo::runOnFunction(Function &F) {
   }
 
   PostDominatedByUnreachable.clear();
+  PostDominatedByColdCall.clear();
   return false;
 }
 
diff --git a/lib/Analysis/CFG.cpp b/lib/Analysis/CFG.cpp
new file mode 100644
index 0000000..c3f32d3
--- /dev/null
+++ b/lib/Analysis/CFG.cpp
@@ -0,0 +1,245 @@
+//===-- CFG.cpp - BasicBlock analysis --------------------------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs analyses on basic blocks, and instructions
+// contained within basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/CFG.h"
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+
+using namespace llvm;
+
+/// FindFunctionBackedges - Analyze the specified function to find all of the
+/// loop backedges in the function and return them.  This is a relatively cheap
+/// (compared to computing dominators and loop info) analysis.
+///
+/// The output is added to Result, as pairs of <from,to> edge info.
+void llvm::FindFunctionBackedges(const Function &F,
+     SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) {
+  const BasicBlock *BB = &F.getEntryBlock();
+  if (succ_begin(BB) == succ_end(BB))
+    return;
+
+  SmallPtrSet<const BasicBlock*, 8> Visited;
+  SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
+  SmallPtrSet<const BasicBlock*, 8> InStack;
+
+  Visited.insert(BB);
+  VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+  InStack.insert(BB);
+  do {
+    std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
+    const BasicBlock *ParentBB = Top.first;
+    succ_const_iterator &I = Top.second;
+
+    bool FoundNew = false;
+    while (I != succ_end(ParentBB)) {
+      BB = *I++;
+      if (Visited.insert(BB)) {
+        FoundNew = true;
+        break;
+      }
+      // Successor is in VisitStack, it's a back edge.
+      if (InStack.count(BB))
+        Result.push_back(std::make_pair(ParentBB, BB));
+    }
+
+    if (FoundNew) {
+      // Go down one level if there is a unvisited successor.
+      InStack.insert(BB);
+      VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
+    } else {
+      // Go up one level.
+      InStack.erase(VisitStack.pop_back_val().first);
+    }
+  } while (!VisitStack.empty());
+}
+
+/// GetSuccessorNumber - Search for the specified successor of basic block BB
+/// and return its position in the terminator instruction's list of
+/// successors.  It is an error to call this with a block that is not a
+/// successor.
+unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) {
+  TerminatorInst *Term = BB->getTerminator();
+#ifndef NDEBUG
+  unsigned e = Term->getNumSuccessors();
+#endif
+  for (unsigned i = 0; ; ++i) {
+    assert(i != e && "Didn't find edge?");
+    if (Term->getSuccessor(i) == Succ)
+      return i;
+  }
+}
+
+/// isCriticalEdge - Return true if the specified edge is a critical edge.
+/// Critical edges are edges from a block with multiple successors to a block
+/// with multiple predecessors.
+bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
+                          bool AllowIdenticalEdges) {
+  assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
+  if (TI->getNumSuccessors() == 1) return false;
+
+  const BasicBlock *Dest = TI->getSuccessor(SuccNum);
+  const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest);
+
+  // If there is more than one predecessor, this is a critical edge...
+  assert(I != E && "No preds, but we have an edge to the block?");
+  const BasicBlock *FirstPred = *I;
+  ++I;        // Skip one edge due to the incoming arc from TI.
+  if (!AllowIdenticalEdges)
+    return I != E;
+
+  // If AllowIdenticalEdges is true, then we allow this edge to be considered
+  // non-critical iff all preds come from TI's block.
+  while (I != E) {
+    const BasicBlock *P = *I;
+    if (P != FirstPred)
+      return true;
+    // Note: leave this as is until no one ever compiles with either gcc 4.0.1
+    // or Xcode 2. This seems to work around the pred_iterator assert in PR 2207
+    E = pred_end(P);
+    ++I;
+  }
+  return false;
+}
+
+// LoopInfo contains a mapping from basic block to the innermost loop. Find
+// the outermost loop in the loop nest that contains BB.
+static const Loop *getOutermostLoop(const LoopInfo *LI, const BasicBlock *BB) {
+  const Loop *L = LI->getLoopFor(BB);
+  if (L) {
+    while (const Loop *Parent = L->getParentLoop())
+      L = Parent;
+  }
+  return L;
+}
+
+// True if there is a loop which contains both BB1 and BB2.
+static bool loopContainsBoth(const LoopInfo *LI,
+                             const BasicBlock *BB1, const BasicBlock *BB2) {
+  const Loop *L1 = getOutermostLoop(LI, BB1);
+  const Loop *L2 = getOutermostLoop(LI, BB2);
+  return L1 != NULL && L1 == L2;
+}
+
+static bool isPotentiallyReachableInner(SmallVectorImpl<BasicBlock *> &Worklist,
+                                        BasicBlock *StopBB,
+                                        const DominatorTree *DT,
+                                        const LoopInfo *LI) {
+  // When the stop block is unreachable, it's dominated from everywhere,
+  // regardless of whether there's a path between the two blocks.
+  if (DT && !DT->isReachableFromEntry(StopBB))
+    DT = 0;
+
+  // Limit the number of blocks we visit. The goal is to avoid run-away compile
+  // times on large CFGs without hampering sensible code. Arbitrarily chosen.
+  unsigned Limit = 32;
+  SmallSet<const BasicBlock*, 64> Visited;
+  do {
+    BasicBlock *BB = Worklist.pop_back_val();
+    if (!Visited.insert(BB))
+      continue;
+    if (BB == StopBB)
+      return true;
+    if (DT && DT->dominates(BB, StopBB))
+      return true;
+    if (LI && loopContainsBoth(LI, BB, StopBB))
+      return true;
+
+    if (!--Limit) {
+      // We haven't been able to prove it one way or the other. Conservatively
+      // answer true -- that there is potentially a path.
+      return true;
+    }
+
+    if (const Loop *Outer = LI ? getOutermostLoop(LI, BB) : 0) {
+      // All blocks in a single loop are reachable from all other blocks. From
+      // any of these blocks, we can skip directly to the exits of the loop,
+      // ignoring any other blocks inside the loop body.
+      Outer->getExitBlocks(Worklist);
+    } else {
+      for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+        Worklist.push_back(*I);
+    }
+  } while (!Worklist.empty());
+
+  // We have exhausted all possible paths and are certain that 'To' can not be
+  // reached from 'From'.
+  return false;
+}
+
+bool llvm::isPotentiallyReachable(const BasicBlock *A, const BasicBlock *B,
+                                  const DominatorTree *DT, const LoopInfo *LI) {
+  assert(A->getParent() == B->getParent() &&
+         "This analysis is function-local!");
+
+  SmallVector<BasicBlock*, 32> Worklist;
+  Worklist.push_back(const_cast<BasicBlock*>(A));
+
+  return isPotentiallyReachableInner(Worklist, const_cast<BasicBlock*>(B),
+                                     DT, LI);
+}
+
+bool llvm::isPotentiallyReachable(const Instruction *A, const Instruction *B,
+                                  const DominatorTree *DT, const LoopInfo *LI) {
+  assert(A->getParent()->getParent() == B->getParent()->getParent() &&
+         "This analysis is function-local!");
+
+  SmallVector<BasicBlock*, 32> Worklist;
+
+  if (A->getParent() == B->getParent()) {
+    // The same block case is special because it's the only time we're looking
+    // within a single block to see which instruction comes first. Once we
+    // start looking at multiple blocks, the first instruction of the block is
+    // reachable, so we only need to determine reachability between whole
+    // blocks.
+    BasicBlock *BB = const_cast<BasicBlock *>(A->getParent());
+
+    // If the block is in a loop then we can reach any instruction in the block
+    // from any other instruction in the block by going around a backedge.
+    if (LI && LI->getLoopFor(BB) != 0)
+      return true;
+
+    // Linear scan, start at 'A', see whether we hit 'B' or the end first.
+    for (BasicBlock::const_iterator I = A, E = BB->end(); I != E; ++I) {
+      if (&*I == B)
+        return true;
+    }
+
+    // Can't be in a loop if it's the entry block -- the entry block may not
+    // have predecessors.
+    if (BB == &BB->getParent()->getEntryBlock())
+      return false;
+
+    // Otherwise, continue doing the normal per-BB CFG walk.
+    for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+      Worklist.push_back(*I);
+
+    if (Worklist.empty()) {
+      // We've proven that there's no path!
+      return false;
+    }
+  } else {
+    Worklist.push_back(const_cast<BasicBlock*>(A->getParent()));
+  }
+
+  if (A->getParent() == &A->getParent()->getParent()->getEntryBlock())
+    return true;
+  if (B->getParent() == &A->getParent()->getParent()->getEntryBlock())
+    return false;
+
+  return isPotentiallyReachableInner(Worklist,
+                                     const_cast<BasicBlock*>(B->getParent()),
+                                     DT, LI);
+}
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 597c767..3624aac 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -8,11 +8,13 @@ add_llvm_library(LLVMAnalysis
   BasicAliasAnalysis.cpp
   BlockFrequencyInfo.cpp
   BranchProbabilityInfo.cpp
+  CFG.cpp
   CFGPrinter.cpp
   CaptureTracking.cpp
   CostModel.cpp
   CodeMetrics.cpp
   ConstantFolding.cpp
+  Delinearization.cpp
   DependenceAnalysis.cpp
   DomPrinter.cpp
   DominanceFrontier.cpp
@@ -34,17 +36,7 @@ add_llvm_library(LLVMAnalysis
   ModuleDebugInfoPrinter.cpp
   NoAliasAnalysis.cpp
   PHITransAddr.cpp
-  PathNumbering.cpp
-  PathProfileInfo.cpp
-  PathProfileVerifier.cpp
   PostDominators.cpp
-  ProfileEstimatorPass.cpp
-  ProfileInfo.cpp
-  ProfileInfoLoader.cpp
-  ProfileInfoLoaderPass.cpp
-  ProfileVerifierPass.cpp
-  ProfileDataLoader.cpp
-  ProfileDataLoaderPass.cpp
   PtrUseVisitor.cpp
   RegionInfo.cpp
   RegionPass.cpp
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index a729270..79fab1b 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -146,8 +146,14 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
     case Instruction::PHI:
     case Instruction::Select:
       // The original value is not captured via this if the new value isn't.
+      Count = 0;
       for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
            UI != UE; ++UI) {
+        // If there are lots of uses, conservatively say that the value
+        // is captured to avoid taking too much compile time.
+        if (Count++ >= Threshold)
+          return Tracker->tooManyUses();
+
         Use *U = &UI.getUse();
         if (Visited.insert(U))
           if (Tracker->shouldExplore(U))
@@ -158,10 +164,10 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) {
       // Don't count comparisons of a no-alias return value against null as
       // captures. This allows us to ignore comparisons of malloc results
       // with null, for example.
-      if (isNoAliasCall(V->stripPointerCasts()))
-        if (ConstantPointerNull *CPN =
-              dyn_cast<ConstantPointerNull>(I->getOperand(1)))
-          if (CPN->getType()->getAddressSpace() == 0)
+      if (ConstantPointerNull *CPN =
+          dyn_cast<ConstantPointerNull>(I->getOperand(1)))
+        if (CPN->getType()->getAddressSpace() == 0)
+          if (isNoAliasCall(V->stripPointerCasts()))
             break;
       // Otherwise, be conservative. There are crazy ways to capture pointers
       // using comparisons.
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index bc0dffc..3d32232 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -224,7 +224,8 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
                                        APInt &Offset, const DataLayout &TD) {
   // Trivial case, constant is the global.
   if ((GV = dyn_cast<GlobalValue>(C))) {
-    Offset.clearAllBits();
+    unsigned BitWidth = TD.getPointerTypeSizeInBits(GV->getType());
+    Offset = APInt(BitWidth, 0);
     return true;
   }
 
@@ -238,16 +239,23 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
     return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
 
   // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
-  if (GEPOperator *GEP = dyn_cast<GEPOperator>(CE)) {
-    // If the base isn't a global+constant, we aren't either.
-    if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD))
-      return false;
+  GEPOperator *GEP = dyn_cast<GEPOperator>(CE);
+  if (!GEP)
+    return false;
 
-    // Otherwise, add any offset that our operands provide.
-    return GEP->accumulateConstantOffset(TD, Offset);
-  }
+  unsigned BitWidth = TD.getPointerTypeSizeInBits(GEP->getType());
+  APInt TmpOffset(BitWidth, 0);
 
-  return false;
+  // If the base isn't a global+constant, we aren't either.
+  if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, TmpOffset, TD))
+    return false;
+
+  // Otherwise, add any offset that our operands provide.
+  if (!GEP->accumulateConstantOffset(TD, TmpOffset))
+    return false;
+
+  Offset = TmpOffset;
+  return true;
 }
 
 /// ReadDataFromGlobal - Recursive helper to read bits out of global.  C is the
@@ -324,12 +332,12 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
       // If we read all of the bytes we needed from this element we're done.
       uint64_t NextEltOffset = SL->getElementOffset(Index);
 
-      if (BytesLeft <= NextEltOffset-CurEltOffset-ByteOffset)
+      if (BytesLeft <= NextEltOffset - CurEltOffset - ByteOffset)
         return true;
 
       // Move to the next element of the struct.
-      CurPtr += NextEltOffset-CurEltOffset-ByteOffset;
-      BytesLeft -= NextEltOffset-CurEltOffset-ByteOffset;
+      CurPtr += NextEltOffset - CurEltOffset - ByteOffset;
+      BytesLeft -= NextEltOffset - CurEltOffset - ByteOffset;
       ByteOffset = 0;
       CurEltOffset = NextEltOffset;
     }
@@ -338,7 +346,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
 
   if (isa<ConstantArray>(C) || isa<ConstantVector>(C) ||
       isa<ConstantDataSequential>(C)) {
-    Type *EltTy = cast<SequentialType>(C->getType())->getElementType();
+    Type *EltTy = C->getType()->getSequentialElementType();
     uint64_t EltSize = TD.getTypeAllocSize(EltTy);
     uint64_t Index = ByteOffset / EltSize;
     uint64_t Offset = ByteOffset - Index * EltSize;
@@ -346,7 +354,7 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
     if (ArrayType *AT = dyn_cast<ArrayType>(C->getType()))
       NumElts = AT->getNumElements();
     else
-      NumElts = cast<VectorType>(C->getType())->getNumElements();
+      NumElts = C->getType()->getVectorNumElements();
 
     for (; Index != NumElts; ++Index) {
       if (!ReadDataFromGlobal(C->getAggregateElement(Index), Offset, CurPtr,
@@ -367,9 +375,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
 
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
     if (CE->getOpcode() == Instruction::IntToPtr &&
-        CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getContext()))
+        CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getType())) {
       return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr,
                                 BytesLeft, TD);
+    }
   }
 
   // Otherwise, unknown initializer type.
@@ -378,26 +387,29 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
 
 static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
                                                  const DataLayout &TD) {
-  Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
+  PointerType *PTy = cast<PointerType>(C->getType());
+  Type *LoadTy = PTy->getElementType();
   IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
 
   // If this isn't an integer load we can't fold it directly.
   if (!IntType) {
+    unsigned AS = PTy->getAddressSpace();
+
     // If this is a float/double load, we can try folding it as an int32/64 load
     // and then bitcast the result.  This can be useful for union cases.  Note
     // that address spaces don't matter here since we're not going to result in
     // an actual new load.
     Type *MapTy;
     if (LoadTy->isHalfTy())
-      MapTy = Type::getInt16PtrTy(C->getContext());
+      MapTy = Type::getInt16PtrTy(C->getContext(), AS);
     else if (LoadTy->isFloatTy())
-      MapTy = Type::getInt32PtrTy(C->getContext());
+      MapTy = Type::getInt32PtrTy(C->getContext(), AS);
     else if (LoadTy->isDoubleTy())
-      MapTy = Type::getInt64PtrTy(C->getContext());
+      MapTy = Type::getInt64PtrTy(C->getContext(), AS);
     else if (LoadTy->isVectorTy()) {
-      MapTy = IntegerType::get(C->getContext(),
-                               TD.getTypeAllocSizeInBits(LoadTy));
-      MapTy = PointerType::getUnqual(MapTy);
+      MapTy = PointerType::getIntNPtrTy(C->getContext(),
+                                        TD.getTypeAllocSizeInBits(LoadTy),
+                                        AS);
     } else
       return 0;
 
@@ -408,10 +420,11 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
   }
 
   unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8;
-  if (BytesLoaded > 32 || BytesLoaded == 0) return 0;
+  if (BytesLoaded > 32 || BytesLoaded == 0)
+    return 0;
 
   GlobalValue *GVal;
-  APInt Offset(TD.getPointerSizeInBits(), 0);
+  APInt Offset;
   if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD))
     return 0;
 
@@ -422,7 +435,8 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
 
   // If we're loading off the beginning of the global, some bytes may be valid,
   // but we don't try to handle this.
-  if (Offset.isNegative()) return 0;
+  if (Offset.isNegative())
+    return 0;
 
   // If we're not accessing anything in this constant, the result is undefined.
   if (Offset.getZExtValue() >=
@@ -439,7 +453,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
     ResultVal = RawBytes[BytesLoaded - 1];
     for (unsigned i = 1; i != BytesLoaded; ++i) {
       ResultVal <<= 8;
-      ResultVal |= RawBytes[BytesLoaded-1-i];
+      ResultVal |= RawBytes[BytesLoaded - 1 - i];
     }
   } else {
     ResultVal = RawBytes[0];
@@ -464,14 +478,17 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
 
   // If the loaded value isn't a constant expr, we can't handle it.
   ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
-  if (!CE) return 0;
+  if (!CE)
+    return 0;
 
   if (CE->getOpcode() == Instruction::GetElementPtr) {
-    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
-      if (GV->isConstant() && GV->hasDefinitiveInitializer())
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
+      if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
         if (Constant *V =
              ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
           return V;
+      }
+    }
   }
 
   // Instead of loading constant c string, use corresponding integer value
@@ -576,13 +593,13 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
   // constant.  This happens frequently when iterating over a global array.
   if (Opc == Instruction::Sub && DL) {
     GlobalValue *GV1, *GV2;
-    unsigned PtrSize = DL->getPointerSizeInBits();
-    unsigned OpSize = DL->getTypeSizeInBits(Op0->getType());
-    APInt Offs1(PtrSize, 0), Offs2(PtrSize, 0);
+    APInt Offs1, Offs2;
 
     if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *DL))
       if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *DL) &&
           GV1 == GV2) {
+        unsigned OpSize = DL->getTypeSizeInBits(Op0->getType());
+
         // (&GV+C1) - (&GV+C2) -> C1-C2, pointer arithmetic cannot overflow.
         // PtrToInt may change the bitwidth so we have convert to the right size
         // first.
@@ -600,15 +617,18 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
 static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
                                 Type *ResultTy, const DataLayout *TD,
                                 const TargetLibraryInfo *TLI) {
-  if (!TD) return 0;
-  Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext());
+  if (!TD)
+    return 0;
+
+  Type *IntPtrTy = TD->getIntPtrType(ResultTy);
 
   bool Any = false;
   SmallVector<Constant*, 32> NewIdxs;
   for (unsigned i = 1, e = Ops.size(); i != e; ++i) {
     if ((i == 1 ||
-         !isa<StructType>(GetElementPtrInst::getIndexedType(Ops[0]->getType(),
-                                                        Ops.slice(1, i-1)))) &&
+         !isa<StructType>(GetElementPtrInst::getIndexedType(
+                            Ops[0]->getType(),
+                            Ops.slice(1, i - 1)))) &&
         Ops[i]->getType() != IntPtrTy) {
       Any = true;
       NewIdxs.push_back(ConstantExpr::getCast(CastInst::getCastOpcode(Ops[i],
@@ -619,13 +639,16 @@ static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
     } else
       NewIdxs.push_back(Ops[i]);
   }
-  if (!Any) return 0;
 
-  Constant *C =
-    ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+  if (!Any)
+    return 0;
+
+  Constant *C = ConstantExpr::getGetElementPtr(Ops[0], NewIdxs);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
     if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
       C = Folded;
+  }
+
   return C;
 }
 
@@ -640,7 +663,7 @@ static Constant* StripPtrCastKeepAS(Constant* Ptr) {
   if (NewPtrTy->getAddressSpace() != OldPtrTy->getAddressSpace()) {
     NewPtrTy = NewPtrTy->getElementType()->getPointerTo(
       OldPtrTy->getAddressSpace());
-    Ptr = ConstantExpr::getBitCast(Ptr, NewPtrTy);
+    Ptr = ConstantExpr::getPointerCast(Ptr, NewPtrTy);
   }
   return Ptr;
 }
@@ -651,11 +674,12 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
                                          Type *ResultTy, const DataLayout *TD,
                                          const TargetLibraryInfo *TLI) {
   Constant *Ptr = Ops[0];
-  if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized() ||
+  if (!TD || !Ptr->getType()->getPointerElementType()->isSized() ||
       !Ptr->getType()->isPointerTy())
     return 0;
 
-  Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext());
+  Type *IntPtrTy = TD->getIntPtrType(Ptr->getType());
+  Type *ResultElementTy = ResultTy->getPointerElementType();
 
   // If this is a constant expr gep that is effectively computing an
   // "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
@@ -664,8 +688,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
 
       // If this is "gep i8* Ptr, (sub 0, V)", fold this as:
       // "inttoptr (sub (ptrtoint Ptr), V)"
-      if (Ops.size() == 2 &&
-          cast<PointerType>(ResultTy)->getElementType()->isIntegerTy(8)) {
+      if (Ops.size() == 2 && ResultElementTy->isIntegerTy(8)) {
         ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[1]);
         assert((CE == 0 || CE->getType() == IntPtrTy) &&
                "CastGEPIndices didn't canonicalize index types!");
@@ -692,7 +715,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
 
   // If this is a GEP of a GEP, fold it all into a single GEP.
   while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
-    SmallVector<Value *, 4> NestedOps(GEP->op_begin()+1, GEP->op_end());
+    SmallVector<Value *, 4> NestedOps(GEP->op_begin() + 1, GEP->op_end());
 
     // Do not try the incorporate the sub-GEP if some index is not a number.
     bool AllConstantInt = true;
@@ -713,12 +736,15 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
   // If the base value for this address is a literal integer value, fold the
   // getelementptr to the resulting integer value casted to the pointer type.
   APInt BasePtr(BitWidth, 0);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
-    if (CE->getOpcode() == Instruction::IntToPtr)
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+    if (CE->getOpcode() == Instruction::IntToPtr) {
       if (ConstantInt *Base = dyn_cast<ConstantInt>(CE->getOperand(0)))
         BasePtr = Base->getValue().zextOrTrunc(BitWidth);
+    }
+  }
+
   if (Ptr->isNullValue() || BasePtr != 0) {
-    Constant *C = ConstantInt::get(Ptr->getContext(), Offset+BasePtr);
+    Constant *C = ConstantInt::get(Ptr->getContext(), Offset + BasePtr);
     return ConstantExpr::getIntToPtr(C, ResultTy);
   }
 
@@ -728,7 +754,8 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
   // Also, this helps GlobalOpt do SROA on GlobalVariables.
   Type *Ty = Ptr->getType();
   assert(Ty->isPointerTy() && "Forming regular GEP of non-pointer type");
-  SmallVector<Constant*, 32> NewIdxs;
+  SmallVector<Constant *, 32> NewIdxs;
+
   do {
     if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) {
       if (ATy->isPointerTy()) {
@@ -743,7 +770,6 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
 
       // Determine which element of the array the offset points into.
       APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType()));
-      IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext());
       if (ElemSize == 0)
         // The element size is 0. This may be [0 x Ty]*, so just use a zero
         // index for this level and proceed to the next level to see if it can
@@ -778,7 +804,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
       // We've reached some non-indexable type.
       break;
     }
-  } while (Ty != cast<PointerType>(ResultTy)->getElementType());
+  } while (Ty != ResultElementTy);
 
   // If we haven't used up the entire offset by descending the static
   // type, then the offset is pointing into the middle of an indivisible
@@ -787,14 +813,13 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
     return 0;
 
   // Create a GEP.
-  Constant *C =
-    ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
-  assert(cast<PointerType>(C->getType())->getElementType() == Ty &&
+  Constant *C = ConstantExpr::getGetElementPtr(Ptr, NewIdxs);
+  assert(C->getType()->getPointerElementType() == Ty &&
          "Computed GetElementPtr has unexpected type!");
 
   // If we ended up indexing a member with a type that doesn't match
   // the type of what the original indices indexed, add a cast.
-  if (Ty != cast<PointerType>(ResultTy)->getElementType())
+  if (Ty != ResultElementTy)
     C = FoldBitCast(C, ResultTy, *TD);
 
   return C;
@@ -867,16 +892,18 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I,
   if (const LoadInst *LI = dyn_cast<LoadInst>(I))
     return ConstantFoldLoadInst(LI, TD);
 
-  if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I))
+  if (InsertValueInst *IVI = dyn_cast<InsertValueInst>(I)) {
     return ConstantExpr::getInsertValue(
                                 cast<Constant>(IVI->getAggregateOperand()),
                                 cast<Constant>(IVI->getInsertedValueOperand()),
                                 IVI->getIndices());
+  }
 
-  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I))
+  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(I)) {
     return ConstantExpr::getExtractValue(
                                     cast<Constant>(EVI->getAggregateOperand()),
                                     EVI->getIndices());
+  }
 
   return ConstantFoldInstOperands(I->getOpcode(), I->getType(), Ops, TD, TLI);
 }
@@ -930,9 +957,10 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
                                          const TargetLibraryInfo *TLI) {
   // Handle easy binops first.
   if (Instruction::isBinaryOp(Opcode)) {
-    if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1]))
+    if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1])) {
       if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], TD))
         return C;
+    }
 
     return ConstantExpr::get(Opcode, Ops[0], Ops[1]);
   }
@@ -953,10 +981,11 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
       if (TD && CE->getOpcode() == Instruction::IntToPtr) {
         Constant *Input = CE->getOperand(0);
         unsigned InWidth = Input->getType()->getScalarSizeInBits();
-        if (TD->getPointerSizeInBits() < InWidth) {
+        unsigned PtrWidth = TD->getPointerTypeSizeInBits(CE->getType());
+        if (PtrWidth < InWidth) {
           Constant *Mask =
-            ConstantInt::get(CE->getContext(), APInt::getLowBitsSet(InWidth,
-                                                  TD->getPointerSizeInBits()));
+            ConstantInt::get(CE->getContext(),
+                             APInt::getLowBitsSet(InWidth, PtrWidth));
           Input = ConstantExpr::getAnd(Input, Mask);
         }
         // Do a zext or trunc to get to the dest size.
@@ -966,13 +995,22 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
     return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
   case Instruction::IntToPtr:
     // If the input is a ptrtoint, turn the pair into a ptr to ptr bitcast if
-    // the int size is >= the ptr size.  This requires knowing the width of a
-    // pointer, so it can't be done in ConstantExpr::getCast.
-    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0]))
-      if (TD &&
-          TD->getPointerSizeInBits() <= CE->getType()->getScalarSizeInBits() &&
-          CE->getOpcode() == Instruction::PtrToInt)
-        return FoldBitCast(CE->getOperand(0), DestTy, *TD);
+    // the int size is >= the ptr size and the address spaces are the same.
+    // This requires knowing the width of a pointer, so it can't be done in
+    // ConstantExpr::getCast.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) {
+      if (TD && CE->getOpcode() == Instruction::PtrToInt) {
+        Constant *SrcPtr = CE->getOperand(0);
+        unsigned SrcPtrSize = TD->getPointerTypeSizeInBits(SrcPtr->getType());
+        unsigned MidIntSize = CE->getType()->getScalarSizeInBits();
+
+        if (MidIntSize >= SrcPtrSize) {
+          unsigned SrcAS = SrcPtr->getType()->getPointerAddressSpace();
+          if (SrcAS == DestTy->getPointerAddressSpace())
+            return FoldBitCast(CE->getOperand(0), DestTy, *TD);
+        }
+      }
+    }
 
     return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
   case Instruction::Trunc:
@@ -984,6 +1022,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
   case Instruction::SIToFP:
   case Instruction::FPToUI:
   case Instruction::FPToSI:
+  case Instruction::AddrSpaceCast:
       return ConstantExpr::getCast(Opcode, Ops[0], DestTy);
   case Instruction::BitCast:
     if (TD)
@@ -1024,8 +1063,8 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
   // around to know if bit truncation is happening.
   if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops0)) {
     if (TD && Ops1->isNullValue()) {
-      Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
       if (CE0->getOpcode() == Instruction::IntToPtr) {
+        Type *IntPtrTy = TD->getIntPtrType(CE0->getType());
         // Convert the integer value to the right size to ensure we get the
         // proper extension or truncation.
         Constant *C = ConstantExpr::getIntegerCast(CE0->getOperand(0),
@@ -1036,19 +1075,21 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
 
       // Only do this transformation if the int is intptrty in size, otherwise
       // there is a truncation or extension that we aren't modeling.
-      if (CE0->getOpcode() == Instruction::PtrToInt &&
-          CE0->getType() == IntPtrTy) {
-        Constant *C = CE0->getOperand(0);
-        Constant *Null = Constant::getNullValue(C->getType());
-        return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+      if (CE0->getOpcode() == Instruction::PtrToInt) {
+        Type *IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType());
+        if (CE0->getType() == IntPtrTy) {
+          Constant *C = CE0->getOperand(0);
+          Constant *Null = Constant::getNullValue(C->getType());
+          return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
+        }
       }
     }
 
     if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
       if (TD && CE0->getOpcode() == CE1->getOpcode()) {
-        Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
-
         if (CE0->getOpcode() == Instruction::IntToPtr) {
+          Type *IntPtrTy = TD->getIntPtrType(CE0->getType());
+
           // Convert the integer value to the right size to ensure we get the
           // proper extension or truncation.
           Constant *C0 = ConstantExpr::getIntegerCast(CE0->getOperand(0),
@@ -1060,11 +1101,17 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
 
         // Only do this transformation if the int is intptrty in size, otherwise
         // there is a truncation or extension that we aren't modeling.
-        if ((CE0->getOpcode() == Instruction::PtrToInt &&
-             CE0->getType() == IntPtrTy &&
-             CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()))
-          return ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0),
-                                                 CE1->getOperand(0), TD, TLI);
+        if (CE0->getOpcode() == Instruction::PtrToInt) {
+          Type *IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType());
+          if (CE0->getType() == IntPtrTy &&
+              CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()) {
+            return ConstantFoldCompareInstOperands(Predicate,
+                                                   CE0->getOperand(0),
+                                                   CE1->getOperand(0),
+                                                   TD,
+                                                   TLI);
+          }
+        }
       }
     }
 
@@ -1101,7 +1148,8 @@ Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
   // addressing.
   for (unsigned i = 2, e = CE->getNumOperands(); i != e; ++i) {
     C = C->getAggregateElement(CE->getOperand(i));
-    if (C == 0) return 0;
+    if (C == 0)
+      return 0;
   }
   return C;
 }
@@ -1116,7 +1164,8 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
   // addressing.
   for (unsigned i = 0, e = Indices.size(); i != e; ++i) {
     C = C->getAggregateElement(Indices[i]);
-    if (C == 0) return 0;
+    if (C == 0)
+      return 0;
   }
   return C;
 }
@@ -1128,8 +1177,7 @@ Constant *llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
 
 /// canConstantFoldCallTo - Return true if its even possible to fold a call to
 /// the specified function.
-bool
-llvm::canConstantFoldCallTo(const Function *F) {
+bool llvm::canConstantFoldCallTo(const Function *F) {
   switch (F->getIntrinsicID()) {
   case Intrinsic::fabs:
   case Intrinsic::log:
@@ -1167,7 +1215,8 @@ llvm::canConstantFoldCallTo(const Function *F) {
   case 0: break;
   }
 
-  if (!F->hasName()) return false;
+  if (!F->hasName())
+    return false;
   StringRef Name = F->getName();
 
   // In these cases, the check of the length is required.  We don't want to
@@ -1250,7 +1299,7 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
 static Constant *ConstantFoldConvertToInt(const APFloat &Val,
                                           bool roundTowardZero, Type *Ty) {
   // All of these conversion intrinsics form an integer of at most 64bits.
-  unsigned ResultWidth = cast<IntegerType>(Ty)->getBitWidth();
+  unsigned ResultWidth = Ty->getIntegerBitWidth();
   assert(ResultWidth <= 64 &&
          "Can only constant fold conversions to 64 and 32 bit ints");
 
@@ -1271,7 +1320,8 @@ static Constant *ConstantFoldConvertToInt(const APFloat &Val,
 Constant *
 llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
                        const TargetLibraryInfo *TLI) {
-  if (!F->hasName()) return 0;
+  if (!F->hasName())
+    return 0;
   StringRef Name = F->getName();
 
   Type *Ty = F->getReturnType();
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 98a7780..f943258 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -19,6 +19,7 @@
 
 #define CM_NAME "cost-model"
 #define DEBUG_TYPE CM_NAME
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Function.h"
@@ -26,10 +27,15 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
+                                     cl::Hidden,
+                                     cl::desc("Recognize reduction patterns."));
+
 namespace {
   class CostModelAnalysis : public FunctionPass {
 
@@ -81,7 +87,7 @@ CostModelAnalysis::runOnFunction(Function &F) {
  return false;
 }
 
-static bool isReverseVectorMask(SmallVector<int, 16> &Mask) {
+static bool isReverseVectorMask(SmallVectorImpl<int> &Mask) {
   for (unsigned i = 0, MaskSize = Mask.size(); i < MaskSize; ++i)
     if (Mask[i] > 0 && Mask[i] != (int)(MaskSize - 1 - i))
       return false;
@@ -105,6 +111,260 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
   return OpInfo;
 }
 
+static bool matchMask(SmallVectorImpl<int> &M1, SmallVectorImpl<int> &M2) {
+  if (M1.size() != M2.size())
+    return false;
+
+  for (unsigned i = 0, e = M1.size(); i != e; ++i)
+    if (M1[i] != M2[i])
+      return false;
+
+  return true;
+}
+
+static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
+                                     unsigned Level) {
+  // We don't need a shuffle if we just want to have element 0 in position 0 of
+  // the vector.
+  if (!SI && Level == 0 && IsLeft)
+    return true;
+  else if (!SI)
+    return false;
+
+  SmallVector<int, 32> Mask(SI->getType()->getVectorNumElements(), -1);
+
+  // Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
+  // we look at the left or right side.
+  for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
+    Mask[i] = val;
+
+  SmallVector<int, 16> ActualMask = SI->getShuffleMask();
+  if (!matchMask(Mask, ActualMask))
+    return false;
+
+  return true;
+}
+
+static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp,
+                                          unsigned Level, unsigned NumLevels) {
+  // Match one level of pairwise operations.
+  // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+  // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  if (BinOp == 0)
+    return false;
+
+  assert(BinOp->getType()->isVectorTy() && "Expecting a vector type");
+
+  unsigned Opcode = BinOp->getOpcode();
+  Value *L = BinOp->getOperand(0);
+  Value *R = BinOp->getOperand(1);
+
+  ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);
+  if (!LS && Level)
+    return false;
+  ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(R);
+  if (!RS && Level)
+    return false;
+
+  // On level 0 we can omit one shufflevector instruction.
+  if (!Level && !RS && !LS)
+    return false;
+
+  // Shuffle inputs must match.
+  Value *NextLevelOpL = LS ? LS->getOperand(0) : 0;
+  Value *NextLevelOpR = RS ? RS->getOperand(0) : 0;
+  Value *NextLevelOp = 0;
+  if (NextLevelOpR && NextLevelOpL) {
+    // If we have two shuffles their operands must match.
+    if (NextLevelOpL != NextLevelOpR)
+      return false;
+
+    NextLevelOp = NextLevelOpL;
+  } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
+    // On the first level we can omit the shufflevector <0, undef,...>. So the
+    // input to the other shufflevector <1, undef> must match with one of the
+    // inputs to the current binary operation.
+    // Example:
+    //  %NextLevelOpL = shufflevector %R, <1, undef ...>
+    //  %BinOp        = fadd          %NextLevelOpL, %R
+    if (NextLevelOpL && NextLevelOpL != R)
+      return false;
+    else if (NextLevelOpR && NextLevelOpR != L)
+      return false;
+
+    NextLevelOp = NextLevelOpL ? R : L;
+  } else
+    return false;
+
+  // Check that the next levels binary operation exists and matches with the
+  // current one.
+  BinaryOperator *NextLevelBinOp = 0;
+  if (Level + 1 != NumLevels) {
+    if (!(NextLevelBinOp = dyn_cast<BinaryOperator>(NextLevelOp)))
+      return false;
+    else if (NextLevelBinOp->getOpcode() != Opcode)
+      return false;
+  }
+
+  // Shuffle mask for pairwise operation must match.
+  if (matchPairwiseShuffleMask(LS, true, Level)) {
+    if (!matchPairwiseShuffleMask(RS, false, Level))
+      return false;
+  } else if (matchPairwiseShuffleMask(RS, true, Level)) {
+    if (!matchPairwiseShuffleMask(LS, false, Level))
+      return false;
+  } else
+    return false;
+
+  if (++Level == NumLevels)
+    return true;
+
+  // Match next level.
+  return matchPairwiseReductionAtLevel(NextLevelBinOp, Level, NumLevels);
+}
+
+static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
+                                   unsigned &Opcode, Type *&Ty) {
+  if (!EnableReduxCost)
+    return false;
+
+  // Need to extract the first element.
+  ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+  unsigned Idx = ~0u;
+  if (CI)
+    Idx = CI->getZExtValue();
+  if (Idx != 0)
+    return false;
+
+  BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+  if (!RdxStart)
+    return false;
+
+  Type *VecTy = ReduxRoot->getOperand(0)->getType();
+  unsigned NumVecElems = VecTy->getVectorNumElements();
+  if (!isPowerOf2_32(NumVecElems))
+    return false;
+
+  // We look for a sequence of shuffle,shuffle,add triples like the following
+  // that builds a pairwise reduction tree.
+  //
+  //  (X0, X1, X2, X3)
+  //   (X0 + X1, X2 + X3, undef, undef)
+  //    ((X0 + X1) + (X2 + X3), undef, undef, undef)
+  //
+  // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+  // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //       <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+  // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+  // %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+  //       <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+  // %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+  //       <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+  // %r = extractelement <4 x float> %bin.rdx8, i32 0
+  if (!matchPairwiseReductionAtLevel(RdxStart, 0,  Log2_32(NumVecElems)))
+    return false;
+
+  Opcode = RdxStart->getOpcode();
+  Ty = VecTy;
+
+  return true;
+}
+
+static std::pair<Value *, ShuffleVectorInst *>
+getShuffleAndOtherOprd(BinaryOperator *B) {
+
+  Value *L = B->getOperand(0);
+  Value *R = B->getOperand(1);
+  ShuffleVectorInst *S = 0;
+
+  if ((S = dyn_cast<ShuffleVectorInst>(L)))
+    return std::make_pair(R, S);
+
+  S = dyn_cast<ShuffleVectorInst>(R);
+  return std::make_pair(L, S);
+}
+
+static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
+                                          unsigned &Opcode, Type *&Ty) {
+  if (!EnableReduxCost)
+    return false;
+
+  // Need to extract the first element.
+  ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+  unsigned Idx = ~0u;
+  if (CI)
+    Idx = CI->getZExtValue();
+  if (Idx != 0)
+    return false;
+
+  BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+  if (!RdxStart)
+    return false;
+  unsigned RdxOpcode = RdxStart->getOpcode();
+
+  Type *VecTy = ReduxRoot->getOperand(0)->getType();
+  unsigned NumVecElems = VecTy->getVectorNumElements();
+  if (!isPowerOf2_32(NumVecElems))
+    return false;
+
+  // We look for a sequence of shuffles and adds like the following matching one
+  // fadd, shuffle vector pair at a time.
+  //
+  // %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
+  //                           <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  // %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+  // %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
+  //                          <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  // %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+  // %r = extractelement <4 x float> %bin.rdx8, i32 0
+
+  unsigned MaskStart = 1;
+  Value *RdxOp = RdxStart;
+  SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
+  unsigned NumVecElemsRemain = NumVecElems;
+  while (NumVecElemsRemain - 1) {
+    // Check for the right reduction operation.
+    BinaryOperator *BinOp;
+    if (!(BinOp = dyn_cast<BinaryOperator>(RdxOp)))
+      return false;
+    if (BinOp->getOpcode() != RdxOpcode)
+      return false;
+
+    Value *NextRdxOp;
+    ShuffleVectorInst *Shuffle;
+    tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp);
+
+    // Check the current reduction operation and the shuffle use the same value.
+    if (Shuffle == 0)
+      return false;
+    if (Shuffle->getOperand(0) != NextRdxOp)
+      return false;
+
+    // Check that shuffle masks matches.
+    for (unsigned j = 0; j != MaskStart; ++j)
+      ShuffleMask[j] = MaskStart + j;
+    // Fill the rest of the mask with -1 for undef.
+    std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
+
+    SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+    if (!matchMask(ShuffleMask, Mask))
+      return false;
+
+    RdxOp = NextRdxOp;
+    NumVecElemsRemain /= 2;
+    MaskStart *= 2;
+  }
+
+  Opcode = RdxOpcode;
+  Ty = VecTy;
+  return true;
+}
+
 unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
   if (!TTI)
     return -1;
@@ -189,18 +449,29 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
     unsigned Idx = -1;
     if (CI)
       Idx = CI->getZExtValue();
+
+    // Try to match a reduction sequence (series of shufflevector and vector
+    // adds followed by a extractelement).
+    unsigned ReduxOpCode;
+    Type *ReduxType;
+
+    if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType))
+      return TTI->getReductionCost(ReduxOpCode, ReduxType, false);
+    else if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType))
+      return TTI->getReductionCost(ReduxOpCode, ReduxType, true);
+
     return TTI->getVectorInstrCost(I->getOpcode(),
                                    EEI->getOperand(0)->getType(), Idx);
   }
   case Instruction::InsertElement: {
-      const InsertElementInst * IE = cast<InsertElementInst>(I);
-      ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
-      unsigned Idx = -1;
-      if (CI)
-        Idx = CI->getZExtValue();
-      return TTI->getVectorInstrCost(I->getOpcode(),
-                                     IE->getType(), Idx);
-    }
+    const InsertElementInst * IE = cast<InsertElementInst>(I);
+    ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
+    unsigned Idx = -1;
+    if (CI)
+      Idx = CI->getZExtValue();
+    return TTI->getVectorInstrCost(I->getOpcode(),
+                                   IE->getType(), Idx);
+  }
   case Instruction::ShuffleVector: {
     const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
     Type *VecTypOp0 = Shuffle->getOperand(0)->getType();
diff --git a/lib/Analysis/Delinearization.cpp b/lib/Analysis/Delinearization.cpp
new file mode 100644
index 0000000..3ed0609
--- /dev/null
+++ b/lib/Analysis/Delinearization.cpp
@@ -0,0 +1,133 @@
+//===---- Delinearization.cpp - MultiDimensional Index Delinearization ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements an analysis pass that tries to delinearize all GEP
+// instructions in all loops using the SCEV analysis functionality. This pass is
+// only used for testing purposes: if your pass needs delinearization, please
+// use the on-demand SCEVAddRecExpr::delinearize() function.
+//
+//===----------------------------------------------------------------------===//
+
+#define DL_NAME "delinearize"
+#define DEBUG_TYPE DL_NAME
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+class Delinearization : public FunctionPass {
+  Delinearization(const Delinearization &); // do not implement
+protected:
+  Function *F;
+  LoopInfo *LI;
+  ScalarEvolution *SE;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+
+  Delinearization() : FunctionPass(ID) {
+    initializeDelinearizationPass(*PassRegistry::getPassRegistry());
+  }
+  virtual bool runOnFunction(Function &F);
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+  virtual void print(raw_ostream &O, const Module *M = 0) const;
+};
+
+} // end anonymous namespace
+
+void Delinearization::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<LoopInfo>();
+  AU.addRequired<ScalarEvolution>();
+}
+
+bool Delinearization::runOnFunction(Function &F) {
+  this->F = &F;
+  SE = &getAnalysis<ScalarEvolution>();
+  LI = &getAnalysis<LoopInfo>();
+  return false;
+}
+
+static Value *getPointerOperand(Instruction &Inst) {
+  if (LoadInst *Load = dyn_cast<LoadInst>(&Inst))
+    return Load->getPointerOperand();
+  else if (StoreInst *Store = dyn_cast<StoreInst>(&Inst))
+    return Store->getPointerOperand();
+  else if (GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(&Inst))
+    return Gep->getPointerOperand();
+  return NULL;
+}
+
+void Delinearization::print(raw_ostream &O, const Module *) const {
+  O << "Delinearization on function " << F->getName() << ":\n";
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+    Instruction *Inst = &(*I);
+
+    // Only analyze loads and stores.
+    if (!isa<StoreInst>(Inst) && !isa<LoadInst>(Inst) &&
+        !isa<GetElementPtrInst>(Inst))
+      continue;
+
+    const BasicBlock *BB = Inst->getParent();
+    // Delinearize the memory access as analyzed in all the surrounding loops.
+    // Do not analyze memory accesses outside loops.
+    for (Loop *L = LI->getLoopFor(BB); L != NULL; L = L->getParentLoop()) {
+      const SCEV *AccessFn = SE->getSCEVAtScope(getPointerOperand(*Inst), L);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(AccessFn);
+
+      // Do not try to delinearize memory accesses that are not AddRecs.
+      if (!AR)
+        break;
+
+      O << "AddRec: " << *AR << "\n";
+
+      SmallVector<const SCEV *, 3> Subscripts, Sizes;
+      const SCEV *Res = AR->delinearize(*SE, Subscripts, Sizes);
+      int Size = Subscripts.size();
+      if (Res == AR || Size == 0) {
+        O << "failed to delinearize\n";
+        continue;
+      }
+      O << "Base offset: " << *Res << "\n";
+      O << "ArrayDecl[UnknownSize]";
+      for (int i = 0; i < Size - 1; i++)
+        O << "[" << *Sizes[i] << "]";
+      O << " with elements of " << *Sizes[Size - 1] << " bytes.\n";
+
+      O << "ArrayRef";
+      for (int i = 0; i < Size; i++)
+        O << "[" << *Subscripts[i] << "]";
+      O << "\n";
+    }
+  }
+}
+
+char Delinearization::ID = 0;
+static const char delinearization_name[] = "Delinearization";
+INITIALIZE_PASS_BEGIN(Delinearization, DL_NAME, delinearization_name, true,
+                      true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_END(Delinearization, DL_NAME, delinearization_name, true, true)
+
+FunctionPass *llvm::createDelinearizationPass() { return new Delinearization; }
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
index cbc71bd..3b3e2ef 100644
--- a/lib/Analysis/DependenceAnalysis.cpp
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -24,11 +24,11 @@
 // Both of these are conservative weaknesses;
 // that is, not a source of correctness problems.
 //
-// The implementation depends on the GEP instruction to
-// differentiate subscripts. Since Clang linearizes subscripts
-// for most arrays, we give up some precision (though the existing MIV tests
-// will help). We trust that the GEP instruction will eventually be extended.
-// In the meantime, we should explore Maslov's ideas about delinearization.
+// The implementation depends on the GEP instruction to differentiate
+// subscripts. Since Clang linearizes some array subscripts, the dependence
+// analysis is using SCEV->delinearize to recover the representation of multiple
+// subscripts, and thus avoid the more expensive and less precise MIV tests. The
+// delinearization is controlled by the flag -da-delinearize.
 //
 // We should pay some careful attention to the possibility of integer overflow
 // in the implementation of the various tests. This could happen with Add,
@@ -61,6 +61,7 @@
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/InstIterator.h"
@@ -104,6 +105,10 @@ STATISTIC(BanerjeeApplications, "Banerjee applications");
 STATISTIC(BanerjeeIndependence, "Banerjee independence");
 STATISTIC(BanerjeeSuccesses, "Banerjee successes");
 
+static cl::opt<bool>
+Delinearize("da-delinearize", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+            cl::desc("Try to delinearize array references."));
+
 //===----------------------------------------------------------------------===//
 // basics
 
@@ -508,7 +513,7 @@ bool DependenceAnalysis::intersectConstraints(Constraint *X,
       APInt Xr = Xtop; // though they're just going to be overwritten
       APInt::sdivrem(Xtop, Xbot, Xq, Xr);
       APInt Yq = Ytop;
-      APInt Yr = Ytop;;
+      APInt Yr = Ytop;
       APInt::sdivrem(Ytop, Ybot, Yq, Yr);
       if (Xr != 0 || Yr != 0) {
         X->setEmpty();
@@ -2951,6 +2956,11 @@ const SCEV *DependenceAnalysis::addToCoefficient(const SCEV *Expr,
                              AddRec->getLoop(),
                              AddRec->getNoWrapFlags());
   }
+  if (SE->isLoopInvariant(AddRec, TargetLoop))
+    return SE->getAddRecExpr(AddRec,
+			     Value,
+			     TargetLoop,
+			     SCEV::FlagAnyWrap);
   return SE->getAddRecExpr(addToCoefficient(AddRec->getStart(),
                                             TargetLoop, Value),
                            AddRec->getStepRecurrence(*SE),
@@ -2972,7 +2982,7 @@ const SCEV *DependenceAnalysis::addToCoefficient(const SCEV *Expr,
 bool DependenceAnalysis::propagate(const SCEV *&Src,
                                    const SCEV *&Dst,
                                    SmallBitVector &Loops,
-                                   SmallVector<Constraint, 4> &Constraints,
+                                   SmallVectorImpl<Constraint> &Constraints,
                                    bool &Consistent) {
   bool Result = false;
   for (int LI = Loops.find_first(); LI >= 0; LI = Loops.find_next(LI)) {
@@ -3166,6 +3176,55 @@ void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level,
     llvm_unreachable("constraint has unexpected kind");
 }
 
+/// Check if we can delinearize the subscripts. If the SCEVs representing the
+/// source and destination array references are recurrences on a nested loop,
+/// this function flattens the nested recurrences into seperate recurrences
+/// for each loop level.
+bool
+DependenceAnalysis::tryDelinearize(const SCEV *SrcSCEV, const SCEV *DstSCEV,
+                                   SmallVectorImpl<Subscript> &Pair) const {
+  const SCEVAddRecExpr *SrcAR = dyn_cast<SCEVAddRecExpr>(SrcSCEV);
+  const SCEVAddRecExpr *DstAR = dyn_cast<SCEVAddRecExpr>(DstSCEV);
+  if (!SrcAR || !DstAR || !SrcAR->isAffine() || !DstAR->isAffine())
+    return false;
+
+  SmallVector<const SCEV *, 4> SrcSubscripts, DstSubscripts, SrcSizes, DstSizes;
+  SrcAR->delinearize(*SE, SrcSubscripts, SrcSizes);
+  DstAR->delinearize(*SE, DstSubscripts, DstSizes);
+
+  int size = SrcSubscripts.size();
+  int dstSize = DstSubscripts.size();
+  if (size != dstSize || size < 2)
+    return false;
+
+#ifndef NDEBUG
+  DEBUG(errs() << "\nSrcSubscripts: ");
+  for (int i = 0; i < size; i++)
+    DEBUG(errs() << *SrcSubscripts[i]);
+  DEBUG(errs() << "\nDstSubscripts: ");
+  for (int i = 0; i < size; i++)
+    DEBUG(errs() << *DstSubscripts[i]);
+#endif
+
+  // The delinearization transforms a single-subscript MIV dependence test into
+  // a multi-subscript SIV dependence test that is easier to compute. So we
+  // resize Pair to contain as many pairs of subscripts as the delinearization
+  // has found, and then initialize the pairs following the delinearization.
+  Pair.resize(size);
+  for (int i = 0; i < size; ++i) {
+    Pair[i].Src = SrcSubscripts[i];
+    Pair[i].Dst = DstSubscripts[i];
+
+    // FIXME: we should record the bounds SrcSizes[i] and DstSizes[i] that the
+    // delinearization has found, and add these constraints to the dependence
+    // check to avoid memory accesses overflow from one dimension into another.
+    // This is related to the problem of determining the existence of data
+    // dependences in array accesses using a different number of subscripts: in
+    // C one can access an array A[100][100]; as A[0][9999], *A[9999], etc.
+  }
+
+  return true;
+}
 
 //===----------------------------------------------------------------------===//
 
@@ -3275,6 +3334,12 @@ Dependence *DependenceAnalysis::depends(Instruction *Src,
     Pair[0].Dst = DstSCEV;
   }
 
+  if (Delinearize && Pairs == 1 && CommonLevels > 1 &&
+      tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) {
+    DEBUG(dbgs() << "    delinerized GEP\n");
+    Pairs = Pair.size();
+  }
+
   for (unsigned P = 0; P < Pairs; ++P) {
     Pair[P].Loops.resize(MaxLevels + 1);
     Pair[P].GroupLoops.resize(MaxLevels + 1);
@@ -3693,6 +3758,12 @@ const  SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep,
     Pair[0].Dst = DstSCEV;
   }
 
+  if (Delinearize && Pairs == 1 && CommonLevels > 1 &&
+      tryDelinearize(Pair[0].Src, Pair[0].Dst, Pair)) {
+    DEBUG(dbgs() << "    delinerized GEP\n");
+    Pairs = Pair.size();
+  }
+
   for (unsigned P = 0; P < Pairs; ++P) {
     Pair[P].Loops.resize(MaxLevels + 1);
     Pair[P].GroupLoops.resize(MaxLevels + 1);
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 7620fd9..f042964 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -6,11 +6,6 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file implements the CallGraph class and provides the BasicCallGraph
-// default implementation.
-//
-//===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Instructions.h"
@@ -21,168 +16,92 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-namespace {
+CallGraph::CallGraph()
+    : ModulePass(ID), Root(0), ExternalCallingNode(0), CallsExternalNode(0) {
+  initializeCallGraphPass(*PassRegistry::getPassRegistry());
+}
 
-//===----------------------------------------------------------------------===//
-// BasicCallGraph class definition
-//
-class BasicCallGraph : public ModulePass, public CallGraph {
-  // Root is root of the call graph, or the external node if a 'main' function
-  // couldn't be found.
-  //
-  CallGraphNode *Root;
-
-  // ExternalCallingNode - This node has edges to all external functions and
-  // those internal functions that have their address taken.
-  CallGraphNode *ExternalCallingNode;
-
-  // CallsExternalNode - This node has edges to it from all functions making
-  // indirect calls or calling an external function.
-  CallGraphNode *CallsExternalNode;
-
-public:
-  static char ID; // Class identification, replacement for typeinfo
-  BasicCallGraph() : ModulePass(ID), Root(0), 
-    ExternalCallingNode(0), CallsExternalNode(0) {
-      initializeBasicCallGraphPass(*PassRegistry::getPassRegistry());
-    }
+void CallGraph::addToCallGraph(Function *F) {
+  CallGraphNode *Node = getOrInsertFunction(F);
 
-  // runOnModule - Compute the call graph for the specified module.
-  virtual bool runOnModule(Module &M) {
-    CallGraph::initialize(M);
-    
-    ExternalCallingNode = getOrInsertFunction(0);
-    CallsExternalNode = new CallGraphNode(0);
-    Root = 0;
-  
-    // Add every function to the call graph.
-    for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-      addToCallGraph(I);
-  
-    // If we didn't find a main function, use the external call graph node
-    if (Root == 0) Root = ExternalCallingNode;
-    
-    return false;
-  }
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.setPreservesAll();
-  }
+  // If this function has external linkage, anything could call it.
+  if (!F->hasLocalLinkage()) {
+    ExternalCallingNode->addCalledFunction(CallSite(), Node);
 
-  virtual void print(raw_ostream &OS, const Module *) const {
-    OS << "CallGraph Root is: ";
-    if (Function *F = getRoot()->getFunction())
-      OS << F->getName() << "\n";
-    else {
-      OS << "<<null function: 0x" << getRoot() << ">>\n";
+    // Found the entry point?
+    if (F->getName() == "main") {
+      if (Root) // Found multiple external mains?  Don't pick one.
+        Root = ExternalCallingNode;
+      else
+        Root = Node; // Found a main, keep track of it!
     }
-    
-    CallGraph::print(OS, 0);
   }
 
-  virtual void releaseMemory() {
-    destroy();
-  }
-  
-  /// getAdjustedAnalysisPointer - This method is used when a pass implements
-  /// an analysis interface through multiple inheritance.  If needed, it should
-  /// override this to adjust the this pointer as needed for the specified pass
-  /// info.
-  virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-    if (PI == &CallGraph::ID)
-      return (CallGraph*)this;
-    return this;
-  }
-  
-  CallGraphNode* getExternalCallingNode() const { return ExternalCallingNode; }
-  CallGraphNode* getCallsExternalNode()   const { return CallsExternalNode; }
-
-  // getRoot - Return the root of the call graph, which is either main, or if
-  // main cannot be found, the external node.
-  //
-  CallGraphNode *getRoot()             { return Root; }
-  const CallGraphNode *getRoot() const { return Root; }
-
-private:
-  //===---------------------------------------------------------------------
-  // Implementation of CallGraph construction
-  //
-
-  // addToCallGraph - Add a function to the call graph, and link the node to all
-  // of the functions that it calls.
-  //
-  void addToCallGraph(Function *F) {
-    CallGraphNode *Node = getOrInsertFunction(F);
-
-    // If this function has external linkage, anything could call it.
-    if (!F->hasLocalLinkage()) {
-      ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
-      // Found the entry point?
-      if (F->getName() == "main") {
-        if (Root)    // Found multiple external mains?  Don't pick one.
-          Root = ExternalCallingNode;
-        else
-          Root = Node;          // Found a main, keep track of it!
+  // If this function has its address taken, anything could call it.
+  if (F->hasAddressTaken())
+    ExternalCallingNode->addCalledFunction(CallSite(), Node);
+
+  // If this function is not defined in this translation unit, it could call
+  // anything.
+  if (F->isDeclaration() && !F->isIntrinsic())
+    Node->addCalledFunction(CallSite(), CallsExternalNode);
+
+  // Look for calls by this function.
+  for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+         ++II) {
+      CallSite CS(cast<Value>(II));
+      if (CS) {
+        const Function *Callee = CS.getCalledFunction();
+        if (!Callee)
+          // Indirect calls of intrinsics are not allowed so no need to check.
+          Node->addCalledFunction(CS, CallsExternalNode);
+        else if (!Callee->isIntrinsic())
+          Node->addCalledFunction(CS, getOrInsertFunction(Callee));
       }
     }
+}
 
-    // If this function has its address taken, anything could call it.
-    if (F->hasAddressTaken())
-      ExternalCallingNode->addCalledFunction(CallSite(), Node);
-
-    // If this function is not defined in this translation unit, it could call
-    // anything.
-    if (F->isDeclaration() && !F->isIntrinsic())
-      Node->addCalledFunction(CallSite(), CallsExternalNode);
-
-    // Look for calls by this function.
-    for (Function::iterator BB = F->begin(), BBE = F->end(); BB != BBE; ++BB)
-      for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
-           II != IE; ++II) {
-        CallSite CS(cast<Value>(II));
-        if (CS) {
-          const Function *Callee = CS.getCalledFunction();
-          if (!Callee)
-            // Indirect calls of intrinsics are not allowed so no need to check.
-            Node->addCalledFunction(CS, CallsExternalNode);
-          else if (!Callee->isIntrinsic())
-            Node->addCalledFunction(CS, getOrInsertFunction(Callee));
-        }
-      }
-  }
+void CallGraph::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+}
 
-  //
-  // destroy - Release memory for the call graph
-  virtual void destroy() {
-    /// CallsExternalNode is not in the function map, delete it explicitly.
-    if (CallsExternalNode) {
-      CallsExternalNode->allReferencesDropped();
-      delete CallsExternalNode;
-      CallsExternalNode = 0;
-    }
-    CallGraph::destroy();
-  }
-};
+bool CallGraph::runOnModule(Module &M) {
+  Mod = &M;
 
-} //End anonymous namespace
+  ExternalCallingNode = getOrInsertFunction(0);
+  assert(!CallsExternalNode);
+  CallsExternalNode = new CallGraphNode(0);
+  Root = 0;
 
-INITIALIZE_ANALYSIS_GROUP(CallGraph, "Call Graph", BasicCallGraph)
-INITIALIZE_AG_PASS(BasicCallGraph, CallGraph, "basiccg",
-                   "Basic CallGraph Construction", false, true, true)
+  // Add every function to the call graph.
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    addToCallGraph(I);
 
-char CallGraph::ID = 0;
-char BasicCallGraph::ID = 0;
+  // If we didn't find a main function, use the external call graph node
+  if (Root == 0)
+    Root = ExternalCallingNode;
 
-void CallGraph::initialize(Module &M) {
-  Mod = &M;
+  return false;
 }
 
-void CallGraph::destroy() {
-  if (FunctionMap.empty()) return;
-  
-  // Reset all node's use counts to zero before deleting them to prevent an
-  // assertion from firing.
+INITIALIZE_PASS(CallGraph, "basiccg", "CallGraph Construction", false, true)
+
+char CallGraph::ID = 0;
+
+void CallGraph::releaseMemory() {
+  /// CallsExternalNode is not in the function map, delete it explicitly.
+  if (CallsExternalNode) {
+    CallsExternalNode->allReferencesDropped();
+    delete CallsExternalNode;
+    CallsExternalNode = 0;
+  }
+
+  if (FunctionMap.empty())
+    return;
+
+// Reset all node's use counts to zero before deleting them to prevent an
+// assertion from firing.
 #ifndef NDEBUG
   for (FunctionMapTy::iterator I = FunctionMap.begin(), E = FunctionMap.end();
        I != E; ++I)
@@ -195,7 +114,14 @@ void CallGraph::destroy() {
   FunctionMap.clear();
 }
 
-void CallGraph::print(raw_ostream &OS, Module*) const {
+void CallGraph::print(raw_ostream &OS, const Module*) const {
+  OS << "CallGraph Root is: ";
+  if (Function *F = Root->getFunction())
+    OS << F->getName() << "\n";
+  else {
+    OS << "<<null function: 0x" << Root << ">>\n";
+  }
+
   for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
     I->second->print(OS);
 }
diff --git a/lib/Analysis/IPA/CallGraphSCCPass.cpp b/lib/Analysis/IPA/CallGraphSCCPass.cpp
index a0d788f..182beca 100644
--- a/lib/Analysis/IPA/CallGraphSCCPass.cpp
+++ b/lib/Analysis/IPA/CallGraphSCCPass.cpp
@@ -22,7 +22,7 @@
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/PassManagers.h"
+#include "llvm/IR/LegacyPassManagers.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Timer.h"
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index 92d0d23..7ec4644 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -189,7 +189,7 @@ char GlobalsModRef::ID = 0;
 INITIALIZE_AG_PASS_BEGIN(GlobalsModRef, AliasAnalysis,
                 "globalsmodref-aa", "Simple mod/ref analysis for globals",    
                 false, true, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_AG_PASS_END(GlobalsModRef, AliasAnalysis,
                 "globalsmodref-aa", "Simple mod/ref analysis for globals",    
                 false, true, false)
diff --git a/lib/Analysis/IPA/IPA.cpp b/lib/Analysis/IPA/IPA.cpp
index 1c1816d..47357cf 100644
--- a/lib/Analysis/IPA/IPA.cpp
+++ b/lib/Analysis/IPA/IPA.cpp
@@ -19,8 +19,7 @@ using namespace llvm;
 
 /// initializeIPA - Initialize all passes linked into the IPA library.
 void llvm::initializeIPA(PassRegistry &Registry) {
-  initializeBasicCallGraphPass(Registry);
-  initializeCallGraphAnalysisGroup(Registry);
+  initializeCallGraphPass(Registry);
   initializeCallGraphPrinterPass(Registry);
   initializeCallGraphViewerPass(Registry);
   initializeFindUsedTypesPass(Registry);
diff --git a/lib/Analysis/IPA/InlineCost.cpp b/lib/Analysis/IPA/InlineCost.cpp
index 35c45e6..3bc796e 100644
--- a/lib/Analysis/IPA/InlineCost.cpp
+++ b/lib/Analysis/IPA/InlineCost.cpp
@@ -59,6 +59,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool ExposesReturnsTwice;
   bool HasDynamicAlloca;
   bool ContainsNoDuplicateCall;
+  bool HasReturn;
+  bool HasIndirectBr;
 
   /// Number of bytes allocated statically by the callee.
   uint64_t AllocatedSize;
@@ -124,7 +126,7 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitIntToPtr(IntToPtrInst &I);
   bool visitCastInst(CastInst &I);
   bool visitUnaryInstruction(UnaryInstruction &I);
-  bool visitICmp(ICmpInst &I);
+  bool visitCmpInst(CmpInst &I);
   bool visitSub(BinaryOperator &I);
   bool visitBinaryOperator(BinaryOperator &I);
   bool visitLoad(LoadInst &I);
@@ -132,6 +134,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   bool visitExtractValue(ExtractValueInst &I);
   bool visitInsertValue(InsertValueInst &I);
   bool visitCallSite(CallSite CS);
+  bool visitReturnInst(ReturnInst &RI);
+  bool visitBranchInst(BranchInst &BI);
+  bool visitSwitchInst(SwitchInst &SI);
+  bool visitIndirectBrInst(IndirectBrInst &IBI);
+  bool visitResumeInst(ResumeInst &RI);
+  bool visitUnreachableInst(UnreachableInst &I);
 
 public:
   CallAnalyzer(const DataLayout *TD, const TargetTransformInfo &TTI,
@@ -139,12 +147,13 @@ public:
       : TD(TD), TTI(TTI), F(Callee), Threshold(Threshold), Cost(0),
         IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
-        ContainsNoDuplicateCall(false), AllocatedSize(0), NumInstructions(0),
-        NumVectorInstructions(0), FiftyPercentVectorBonus(0),
-        TenPercentVectorBonus(0), VectorBonus(0), NumConstantArgs(0),
-        NumConstantOffsetPtrArgs(0), NumAllocaArgs(0), NumConstantPtrCmps(0),
-        NumConstantPtrDiffs(0), NumInstructionsSimplified(0),
-        SROACostSavings(0), SROACostSavingsLost(0) {}
+        ContainsNoDuplicateCall(false), HasReturn(false), HasIndirectBr(false),
+        AllocatedSize(0), NumInstructions(0), NumVectorInstructions(0),
+        FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
+        NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
+        NumConstantPtrCmps(0), NumConstantPtrDiffs(0),
+        NumInstructionsSimplified(0), SROACostSavings(0),
+        SROACostSavingsLost(0) {}
 
   bool analyzeCall(CallSite CS);
 
@@ -490,7 +499,7 @@ bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
   return false;
 }
 
-bool CallAnalyzer::visitICmp(ICmpInst &I) {
+bool CallAnalyzer::visitCmpInst(CmpInst &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   // First try to handle simplified comparisons.
   if (!isa<Constant>(LHS))
@@ -499,12 +508,16 @@ bool CallAnalyzer::visitICmp(ICmpInst &I) {
   if (!isa<Constant>(RHS))
     if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
       RHS = SimpleRHS;
-  if (Constant *CLHS = dyn_cast<Constant>(LHS))
+  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
     if (Constant *CRHS = dyn_cast<Constant>(RHS))
-      if (Constant *C = ConstantExpr::getICmp(I.getPredicate(), CLHS, CRHS)) {
+      if (Constant *C = ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
         SimplifiedValues[&I] = C;
         return true;
       }
+  }
+
+  if (I.getOpcode() == Instruction::FCmp)
+    return false;
 
   // Otherwise look for a comparison between constant offset pointers with
   // a common base.
@@ -700,7 +713,7 @@ bool CallAnalyzer::simplifyCallSite(Function *F, CallSite CS) {
 }
 
 bool CallAnalyzer::visitCallSite(CallSite CS) {
-  if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
+  if (CS.hasFnAttr(Attribute::ReturnsTwice) &&
       !F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                       Attribute::ReturnsTwice)) {
     // This aborts the entire analysis.
@@ -781,6 +794,60 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
   return Base::visitCallSite(CS);
 }
 
+bool CallAnalyzer::visitReturnInst(ReturnInst &RI) {
+  // At least one return instruction will be free after inlining.
+  bool Free = !HasReturn;
+  HasReturn = true;
+  return Free;
+}
+
+bool CallAnalyzer::visitBranchInst(BranchInst &BI) {
+  // We model unconditional branches as essentially free -- they really
+  // shouldn't exist at all, but handling them makes the behavior of the
+  // inliner more regular and predictable. Interestingly, conditional branches
+  // which will fold away are also free.
+  return BI.isUnconditional() || isa<ConstantInt>(BI.getCondition()) ||
+         dyn_cast_or_null<ConstantInt>(
+             SimplifiedValues.lookup(BI.getCondition()));
+}
+
+bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
+  // We model unconditional switches as free, see the comments on handling
+  // branches.
+  return isa<ConstantInt>(SI.getCondition()) ||
+         dyn_cast_or_null<ConstantInt>(
+             SimplifiedValues.lookup(SI.getCondition()));
+}
+
+bool CallAnalyzer::visitIndirectBrInst(IndirectBrInst &IBI) {
+  // We never want to inline functions that contain an indirectbr.  This is
+  // incorrect because all the blockaddress's (in static global initializers
+  // for example) would be referring to the original function, and this
+  // indirect jump would jump from the inlined copy of the function into the
+  // original function which is extremely undefined behavior.
+  // FIXME: This logic isn't really right; we can safely inline functions with
+  // indirectbr's as long as no other function or global references the
+  // blockaddress of a block within the current function.  And as a QOI issue,
+  // if someone is using a blockaddress without an indirectbr, and that
+  // reference somehow ends up in another function or global, we probably don't
+  // want to inline this function.
+  HasIndirectBr = true;
+  return false;
+}
+
+bool CallAnalyzer::visitResumeInst(ResumeInst &RI) {
+  // FIXME: It's not clear that a single instruction is an accurate model for
+  // the inline cost of a resume instruction.
+  return false;
+}
+
+bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
+  // FIXME: It might be reasonably to discount the cost of instructions leading
+  // to unreachable as they have the lowest possible impact on both runtime and
+  // code size.
+  return true; // No actual code is needed for unreachable.
+}
+
 bool CallAnalyzer::visitInstruction(Instruction &I) {
   // Some instructions are free. All of the free intrinsics can also be
   // handled by SROA, etc.
@@ -804,8 +871,7 @@ bool CallAnalyzer::visitInstruction(Instruction &I) {
 /// construct has been detected. It returns false if inlining is no longer
 /// viable, and true if inlining remains viable.
 bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
-  for (BasicBlock::iterator I = BB->begin(), E = llvm::prior(BB->end());
-       I != E; ++I) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
     ++NumInstructions;
     if (isa<ExtractElementInst>(I) || I->getType()->isVectorTy())
       ++NumVectorInstructions;
@@ -821,7 +887,8 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
       Cost += InlineConstants::InstrCost;
 
     // If the visit this instruction detected an uninlinable pattern, abort.
-    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+    if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+        HasIndirectBr)
       return false;
 
     // If the caller is a recursive function then we don't want to inline
@@ -985,10 +1052,6 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
     }
   }
 
-  // Track whether we've seen a return instruction. The first return
-  // instruction is free, as at least one will usually disappear in inlining.
-  bool HasReturn = false;
-
   // Populate our simplified values by mapping from function arguments to call
   // arguments with known important simplifications.
   CallSite::arg_iterator CAI = CS.arg_begin();
@@ -1035,33 +1098,11 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
     if (BB->empty())
       continue;
 
-    // Handle the terminator cost here where we can track returns and other
-    // function-wide constructs.
-    TerminatorInst *TI = BB->getTerminator();
-
-    // We never want to inline functions that contain an indirectbr.  This is
-    // incorrect because all the blockaddress's (in static global initializers
-    // for example) would be referring to the original function, and this
-    // indirect jump would jump from the inlined copy of the function into the 
-    // original function which is extremely undefined behavior.
-    // FIXME: This logic isn't really right; we can safely inline functions
-    // with indirectbr's as long as no other function or global references the
-    // blockaddress of a block within the current function.  And as a QOI issue,
-    // if someone is using a blockaddress without an indirectbr, and that
-    // reference somehow ends up in another function or global, we probably
-    // don't want to inline this function.
-    if (isa<IndirectBrInst>(TI))
-      return false;
-
-    if (!HasReturn && isa<ReturnInst>(TI))
-      HasReturn = true;
-    else
-      Cost += InlineConstants::InstrCost;
-
     // Analyze the cost of this block. If we blow through the threshold, this
     // returns false, and we can bail on out.
     if (!analyzeBlock(BB)) {
-      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+      if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
+          HasIndirectBr)
         return false;
 
       // If the caller is a recursive function then we don't want to inline
@@ -1074,6 +1115,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
       break;
     }
 
+    TerminatorInst *TI = BB->getTerminator();
+
     // Add in the live successors by first checking whether we have terminator
     // that may be simplified based on the values simplified by this call.
     if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -1167,6 +1210,22 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, int Threshold) {
   return getInlineCost(CS, CS.getCalledFunction(), Threshold);
 }
 
+/// \brief Test that two functions either have or have not the given attribute
+///        at the same time.
+static bool attributeMatches(Function *F1, Function *F2,
+                             Attribute::AttrKind Attr) {
+  return F1->hasFnAttribute(Attr) == F2->hasFnAttribute(Attr);
+}
+
+/// \brief Test that there are no attribute conflicts between Caller and Callee
+///        that prevent inlining.
+static bool functionsHaveCompatibleAttributes(Function *Caller,
+                                              Function *Callee) {
+  return attributeMatches(Caller, Callee, Attribute::SanitizeAddress) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeMemory) &&
+         attributeMatches(Caller, Callee, Attribute::SanitizeThread);
+}
+
 InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
                                              int Threshold) {
   // Cannot inline indirect calls.
@@ -1175,20 +1234,26 @@ InlineCost InlineCostAnalysis::getInlineCost(CallSite CS, Function *Callee,
 
   // Calls to functions with always-inline attributes should be inlined
   // whenever possible.
-  if (Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::AlwaysInline)) {
+  if (Callee->hasFnAttribute(Attribute::AlwaysInline)) {
     if (isInlineViable(*Callee))
       return llvm::InlineCost::getAlways();
     return llvm::InlineCost::getNever();
   }
 
+  // Never inline functions with conflicting attributes (unless callee has
+  // always-inline attribute).
+  if (!functionsHaveCompatibleAttributes(CS.getCaller(), Callee))
+    return llvm::InlineCost::getNever();
+
+  // Don't inline this call if the caller has the optnone attribute.
+  if (CS.getCaller()->hasFnAttribute(Attribute::OptimizeNone))
+    return llvm::InlineCost::getNever();
+
   // Don't inline functions which can be redefined at link-time to mean
   // something else.  Don't inline functions marked noinline or call sites
   // marked noinline.
   if (Callee->mayBeOverridden() ||
-      Callee->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::NoInline) ||
-      CS.isNoInline())
+      Callee->hasFnAttribute(Attribute::NoInline) || CS.isNoInline())
     return llvm::InlineCost::getNever();
 
   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index bf77451..b867af1 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -668,7 +668,8 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
 /// follow non-inbounds geps. This allows it to remain usable for icmp ult/etc.
 /// folding.
 static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
-                                                Value *&V) {
+                                                Value *&V,
+                                                bool AllowNonInbounds = false) {
   assert(V->getType()->getScalarType()->isPointerTy());
 
   // Without DataLayout, just be conservative for now. Theoretically, more could
@@ -676,8 +677,8 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
   if (!TD)
     return ConstantInt::get(IntegerType::get(V->getContext(), 64), 0);
 
-  unsigned IntPtrWidth = TD->getPointerSizeInBits();
-  APInt Offset = APInt::getNullValue(IntPtrWidth);
+  Type *IntPtrTy = TD->getIntPtrType(V->getType())->getScalarType();
+  APInt Offset = APInt::getNullValue(IntPtrTy->getIntegerBitWidth());
 
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
@@ -685,7 +686,8 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
   Visited.insert(V);
   do {
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
-      if (!GEP->isInBounds() || !GEP->accumulateConstantOffset(*TD, Offset))
+      if ((!AllowNonInbounds && !GEP->isInBounds()) ||
+          !GEP->accumulateConstantOffset(*TD, Offset))
         break;
       V = GEP->getPointerOperand();
     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
@@ -701,7 +703,6 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout *TD,
            "Unexpected operand type!");
   } while (Visited.insert(V));
 
-  Type *IntPtrTy = TD->getIntPtrType(V->getContext());
   Constant *OffsetIntPtr = ConstantInt::get(IntPtrTy, Offset);
   if (V->getType()->isVectorTy())
     return ConstantVector::getSplat(V->getType()->getVectorNumElements(),
@@ -1363,6 +1364,10 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
   if (Value *V = SimplifyShift(Instruction::LShr, Op0, Op1, Q, MaxRecurse))
     return V;
 
+  // X >> X -> 0
+  if (Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
   // undef >>l X -> 0
   if (match(Op0, m_Undef()))
     return Constant::getNullValue(Op0->getType());
@@ -1391,6 +1396,10 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
   if (Value *V = SimplifyShift(Instruction::AShr, Op0, Op1, Q, MaxRecurse))
     return V;
 
+  // X >> X -> 0
+  if (Op0 == Op1)
+    return Constant::getNullValue(Op0->getType());
+
   // all ones >>a X -> all ones
   if (match(Op0, m_AllOnes()))
     return Op0;
@@ -1730,7 +1739,7 @@ static Constant *computePointerICmp(const DataLayout *TD,
   RHS = RHS->stripPointerCasts();
 
   // A non-null pointer is not equal to a null pointer.
-  if (llvm::isKnownNonNull(LHS) && isa<ConstantPointerNull>(RHS) &&
+  if (llvm::isKnownNonNull(LHS, TLI) && isa<ConstantPointerNull>(RHS) &&
       (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE))
     return ConstantInt::get(GetCompareTy(LHS),
                             !CmpInst::isTrueWhenEqual(Pred));
@@ -1830,6 +1839,17 @@ static Constant *computePointerICmp(const DataLayout *TD,
         return ConstantInt::get(GetCompareTy(LHS),
                                 !CmpInst::isTrueWhenEqual(Pred));
     }
+
+    // Even if an non-inbounds GEP occurs along the path we can still optimize
+    // equality comparisons concerning the result. We avoid walking the whole
+    // chain again by starting where the last calls to
+    // stripAndComputeConstantOffsets left off and accumulate the offsets.
+    Constant *LHSNoBound = stripAndComputeConstantOffsets(TD, LHS, true);
+    Constant *RHSNoBound = stripAndComputeConstantOffsets(TD, RHS, true);
+    if (LHS == RHS)
+      return ConstantExpr::getICmp(Pred,
+                                   ConstantExpr::getAdd(LHSOffset, LHSNoBound),
+                                   ConstantExpr::getAdd(RHSOffset, RHSNoBound));
   }
 
   // Otherwise, fail.
@@ -2026,7 +2046,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     // Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input
     // if the integer type is the same size as the pointer type.
     if (MaxRecurse && Q.TD && isa<PtrToIntInst>(LI) &&
-        Q.TD->getPointerSizeInBits() == DstTy->getPrimitiveSizeInBits()) {
+        Q.TD->getTypeSizeInBits(SrcTy) == DstTy->getPrimitiveSizeInBits()) {
       if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
         // Transfer the cast to the constant.
         if (Value *V = SimplifyICmpInst(Pred, SrcOp,
@@ -2238,6 +2258,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
     }
   }
 
+  // icmp pred (urem X, Y), Y
   if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) {
     bool KnownNonNegative, KnownNegative;
     switch (Pred) {
@@ -2245,7 +2266,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       break;
     case ICmpInst::ICMP_SGT:
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD);
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2255,7 +2276,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       return getFalse(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD);
+      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2265,6 +2286,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       return getTrue(ITy);
     }
   }
+
+  // icmp pred X, (urem Y, X)
   if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) {
     bool KnownNonNegative, KnownNegative;
     switch (Pred) {
@@ -2272,7 +2295,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       break;
     case ICmpInst::ICMP_SGT:
     case ICmpInst::ICMP_SGE:
-      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD);
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2282,7 +2305,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
       return getTrue(ITy);
     case ICmpInst::ICMP_SLT:
     case ICmpInst::ICMP_SLE:
-      ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.TD);
+      ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.TD);
       if (!KnownNonNegative)
         break;
       // fall-through
@@ -2936,6 +2959,7 @@ static bool IsIdempotent(Intrinsic::ID ID) {
   case Intrinsic::trunc:
   case Intrinsic::rint:
   case Intrinsic::nearbyint:
+  case Intrinsic::round:
     return true;
   }
 }
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 66b5e85..b6970af 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -421,8 +421,8 @@ void LVIValueHandle::deleted() {
     if (I->second == getValPtr())
       ToErase.push_back(*I);
   }
-  
-  for (SmallVector<OverDefinedPairTy, 4>::iterator I = ToErase.begin(),
+
+  for (SmallVectorImpl<OverDefinedPairTy>::iterator I = ToErase.begin(),
        E = ToErase.end(); I != E; ++I)
     Parent->OverDefinedCache.erase(*I);
   
@@ -444,8 +444,8 @@ void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
     if (I->first == BB)
       ToErase.push_back(*I);
   }
-  
-  for (SmallVector<OverDefinedPairTy, 4>::iterator I = ToErase.begin(),
+
+  for (SmallVectorImpl<OverDefinedPairTy>::iterator I = ToErase.begin(),
        E = ToErase.end(); I != E; ++I)
     OverDefinedCache.erase(*I);
 
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index 9393508..ec17f47 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -207,7 +207,7 @@ void Lint::visitCallSite(CallSite CS) {
             &I);
 
     FunctionType *FT = F->getFunctionType();
-    unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+    unsigned NumActualArgs = CS.arg_size();
 
     Assert1(FT->isVarArg() ?
               FT->getNumParams() <= NumActualArgs :
@@ -504,14 +504,42 @@ void Lint::visitShl(BinaryOperator &I) {
             "Undefined result: Shift count out of range", &I);
 }
 
-static bool isZero(Value *V, DataLayout *TD) {
+static bool isZero(Value *V, DataLayout *DL) {
   // Assume undef could be zero.
-  if (isa<UndefValue>(V)) return true;
+  if (isa<UndefValue>(V))
+    return true;
+
+  VectorType *VecTy = dyn_cast<VectorType>(V->getType());
+  if (!VecTy) {
+    unsigned BitWidth = V->getType()->getIntegerBitWidth();
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    ComputeMaskedBits(V, KnownZero, KnownOne, DL);
+    return KnownZero.isAllOnesValue();
+  }
+
+  // Per-component check doesn't work with zeroinitializer
+  Constant *C = dyn_cast<Constant>(V);
+  if (!C)
+    return false;
+
+  if (C->isZeroValue())
+    return true;
+
+  // For a vector, KnownZero will only be true if all values are zero, so check
+  // this per component
+  unsigned BitWidth = VecTy->getElementType()->getIntegerBitWidth();
+  for (unsigned I = 0, N = VecTy->getNumElements(); I != N; ++I) {
+    Constant *Elem = C->getAggregateElement(I);
+    if (isa<UndefValue>(Elem))
+      return true;
+
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    ComputeMaskedBits(Elem, KnownZero, KnownOne, DL);
+    if (KnownZero.isAllOnesValue())
+      return true;
+  }
 
-  unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
-  APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-  ComputeMaskedBits(V, KnownZero, KnownOne, TD);
-  return KnownZero.isAllOnesValue();
+  return false;
 }
 
 void Lint::visitSDiv(BinaryOperator &I) {
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index f1ad650..e369633 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -50,6 +50,9 @@ INITIALIZE_PASS_BEGIN(LoopInfo, "loops", "Natural Loop Information", true, true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_END(LoopInfo, "loops", "Natural Loop Information", true, true)
 
+// Loop identifier metadata name.
+static const char *const LoopMDName = "llvm.loop";
+
 //===----------------------------------------------------------------------===//
 // Loop implementation
 //
@@ -174,10 +177,6 @@ PHINode *Loop::getCanonicalInductionVariable() const {
 
 /// isLCSSAForm - Return true if the Loop is in LCSSA form
 bool Loop::isLCSSAForm(DominatorTree &DT) const {
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallPtrSet<BasicBlock*, 16> LoopBBs(block_begin(), block_end());
-
   for (block_iterator BI = block_begin(), E = block_end(); BI != E; ++BI) {
     BasicBlock *BB = *BI;
     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;++I)
@@ -193,7 +192,7 @@ bool Loop::isLCSSAForm(DominatorTree &DT) const {
         // block they are defined in.  Also, blocks not reachable from the
         // entry are special; uses in them don't need to go through PHIs.
         if (UserBB != BB &&
-            !LoopBBs.count(UserBB) &&
+            !contains(UserBB) &&
             DT.isReachableFromEntry(UserBB))
           return false;
       }
@@ -217,12 +216,12 @@ bool Loop::isSafeToClone() const {
   // Return false if any loop blocks contain indirectbrs, or there are any calls
   // to noduplicate functions.
   for (Loop::block_iterator I = block_begin(), E = block_end(); I != E; ++I) {
-    if (isa<IndirectBrInst>((*I)->getTerminator())) {
+    if (isa<IndirectBrInst>((*I)->getTerminator()))
       return false;
-    } else if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator())) {
+
+    if (const InvokeInst *II = dyn_cast<InvokeInst>((*I)->getTerminator()))
       if (II->hasFnAttr(Attribute::NoDuplicate))
         return false;
-    }
 
     for (BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); BI != BE; ++BI) {
       if (const CallInst *CI = dyn_cast<CallInst>(BI)) {
@@ -234,14 +233,62 @@ bool Loop::isSafeToClone() const {
   return true;
 }
 
-bool Loop::isAnnotatedParallel() const {
+MDNode *Loop::getLoopID() const {
+  MDNode *LoopID = 0;
+  if (isLoopSimplifyForm()) {
+    LoopID = getLoopLatch()->getTerminator()->getMetadata(LoopMDName);
+  } else {
+    // Go through each predecessor of the loop header and check the
+    // terminator for the metadata.
+    BasicBlock *H = getHeader();
+    for (block_iterator I = block_begin(), IE = block_end(); I != IE; ++I) {
+      TerminatorInst *TI = (*I)->getTerminator();
+      MDNode *MD = 0;
+
+      // Check if this terminator branches to the loop header.
+      for (unsigned i = 0, ie = TI->getNumSuccessors(); i != ie; ++i) {
+        if (TI->getSuccessor(i) == H) {
+          MD = TI->getMetadata(LoopMDName);
+          break;
+        }
+      }
+      if (!MD)
+        return 0;
 
-  BasicBlock *latch = getLoopLatch();
-  if (latch == NULL)
-    return false;
+      if (!LoopID)
+        LoopID = MD;
+      else if (MD != LoopID)
+        return 0;
+    }
+  }
+  if (!LoopID || LoopID->getNumOperands() == 0 ||
+      LoopID->getOperand(0) != LoopID)
+    return 0;
+  return LoopID;
+}
 
-  MDNode *desiredLoopIdMetadata =
-    latch->getTerminator()->getMetadata("llvm.loop.parallel");
+void Loop::setLoopID(MDNode *LoopID) const {
+  assert(LoopID && "Loop ID should not be null");
+  assert(LoopID->getNumOperands() > 0 && "Loop ID needs at least one operand");
+  assert(LoopID->getOperand(0) == LoopID && "Loop ID should refer to itself");
+
+  if (isLoopSimplifyForm()) {
+    getLoopLatch()->getTerminator()->setMetadata(LoopMDName, LoopID);
+    return;
+  }
+
+  BasicBlock *H = getHeader();
+  for (block_iterator I = block_begin(), IE = block_end(); I != IE; ++I) {
+    TerminatorInst *TI = (*I)->getTerminator();
+    for (unsigned i = 0, ie = TI->getNumSuccessors(); i != ie; ++i) {
+      if (TI->getSuccessor(i) == H)
+        TI->setMetadata(LoopMDName, LoopID);
+    }
+  }
+}
+
+bool Loop::isAnnotatedParallel() const {
+  MDNode *desiredLoopIdMetadata = getLoopID();
 
   if (!desiredLoopIdMetadata)
       return false;
@@ -258,15 +305,15 @@ bool Loop::isAnnotatedParallel() const {
       if (!II->mayReadOrWriteMemory())
         continue;
 
-      if (!II->getMetadata("llvm.mem.parallel_loop_access"))
-        return false;
-
       // The memory instruction can refer to the loop identifier metadata
       // directly or indirectly through another list metadata (in case of
       // nested parallel loops). The loop identifier metadata refers to
       // itself so we can check both cases with the same routine.
-      MDNode *loopIdMD =
-          dyn_cast<MDNode>(II->getMetadata("llvm.mem.parallel_loop_access"));
+      MDNode *loopIdMD = II->getMetadata("llvm.mem.parallel_loop_access");
+
+      if (!loopIdMD)
+        return false;
+
       bool loopIdMDFound = false;
       for (unsigned i = 0, e = loopIdMD->getNumOperands(); i < e; ++i) {
         if (loopIdMD->getOperand(i) == desiredLoopIdMetadata) {
@@ -286,9 +333,6 @@ bool Loop::isAnnotatedParallel() const {
 /// hasDedicatedExits - Return true if no exit block for the loop
 /// has a predecessor that is outside the loop.
 bool Loop::hasDedicatedExits() const {
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallPtrSet<BasicBlock *, 16> LoopBBs(block_begin(), block_end());
   // Each predecessor of each exit block of a normal loop is contained
   // within the loop.
   SmallVector<BasicBlock *, 4> ExitBlocks;
@@ -296,7 +340,7 @@ bool Loop::hasDedicatedExits() const {
   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
     for (pred_iterator PI = pred_begin(ExitBlocks[i]),
          PE = pred_end(ExitBlocks[i]); PI != PE; ++PI)
-      if (!LoopBBs.count(*PI))
+      if (!contains(*PI))
         return false;
   // All the requirements are met.
   return true;
@@ -311,11 +355,6 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
   assert(hasDedicatedExits() &&
          "getUniqueExitBlocks assumes the loop has canonical form exits!");
 
-  // Sort the blocks vector so that we can use binary search to do quick
-  // lookups.
-  SmallVector<BasicBlock *, 128> LoopBBs(block_begin(), block_end());
-  std::sort(LoopBBs.begin(), LoopBBs.end());
-
   SmallVector<BasicBlock *, 32> switchExitBlocks;
 
   for (block_iterator BI = block_begin(), BE = block_end(); BI != BE; ++BI) {
@@ -325,7 +364,7 @@ Loop::getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const {
 
     for (succ_iterator I = succ_begin(*BI), E = succ_end(*BI); I != E; ++I) {
       // If block is inside the loop then it is not a exit block.
-      if (std::binary_search(LoopBBs.begin(), LoopBBs.end(), *I))
+      if (contains(*I))
         continue;
 
       pred_iterator PI = pred_begin(*I);
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 1540112..acf2ba6 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -188,6 +188,10 @@ bool LPPassManager::runOnFunction(Function &F) {
   // advantage in deleting uses in a later loop before optimizing the
   // definitions in an earlier loop. If we find a clear reason to process in
   // forward order, then a forward variant of LoopPassManager should be created.
+  //
+  // Note that LoopInfo::iterator visits loops in reverse program
+  // order. Here, reverse_iterator gives us a forward order, and the LoopQueue
+  // reverses the order a third time by popping from the back.
   for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I)
     addLoopIntoQueue(*I, LQ);
 
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index 9c0d8ac..1db0f63 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -31,12 +31,13 @@
 using namespace llvm;
 
 enum AllocType {
-  MallocLike         = 1<<0, // allocates
-  CallocLike         = 1<<1, // allocates + bzero
-  ReallocLike        = 1<<2, // reallocates
-  StrDupLike         = 1<<3,
+  OpNewLike          = 1<<0, // allocates; never returns null
+  MallocLike         = 1<<1 | OpNewLike, // allocates; may return null
+  CallocLike         = 1<<2, // allocates + bzero
+  ReallocLike        = 1<<3, // reallocates
+  StrDupLike         = 1<<4,
   AllocLike          = MallocLike | CallocLike | StrDupLike,
-  AnyAlloc           = MallocLike | CallocLike | ReallocLike | StrDupLike
+  AnyAlloc           = AllocLike | ReallocLike
 };
 
 struct AllocFnsTy {
@@ -52,20 +53,20 @@ struct AllocFnsTy {
 static const AllocFnsTy AllocationFnData[] = {
   {LibFunc::malloc,              MallocLike,  1, 0,  -1},
   {LibFunc::valloc,              MallocLike,  1, 0,  -1},
-  {LibFunc::Znwj,                MallocLike,  1, 0,  -1}, // new(unsigned int)
+  {LibFunc::Znwj,                OpNewLike,   1, 0,  -1}, // new(unsigned int)
   {LibFunc::ZnwjRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new(unsigned int, nothrow)
-  {LibFunc::Znwm,                MallocLike,  1, 0,  -1}, // new(unsigned long)
+  {LibFunc::Znwm,                OpNewLike,   1, 0,  -1}, // new(unsigned long)
   {LibFunc::ZnwmRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new(unsigned long, nothrow)
-  {LibFunc::Znaj,                MallocLike,  1, 0,  -1}, // new[](unsigned int)
+  {LibFunc::Znaj,                OpNewLike,   1, 0,  -1}, // new[](unsigned int)
   {LibFunc::ZnajRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new[](unsigned int, nothrow)
-  {LibFunc::Znam,                MallocLike,  1, 0,  -1}, // new[](unsigned long)
+  {LibFunc::Znam,                OpNewLike,   1, 0,  -1}, // new[](unsigned long)
   {LibFunc::ZnamRKSt9nothrow_t,  MallocLike,  2, 0,  -1}, // new[](unsigned long, nothrow)
-  {LibFunc::posix_memalign,      MallocLike,  3, 2,  -1},
   {LibFunc::calloc,              CallocLike,  2, 0,   1},
   {LibFunc::realloc,             ReallocLike, 2, 1,  -1},
   {LibFunc::reallocf,            ReallocLike, 2, 1,  -1},
   {LibFunc::strdup,              StrDupLike,  1, -1, -1},
   {LibFunc::strndup,             StrDupLike,  2, 1,  -1}
+  // TODO: Handle "int posix_memalign(void **, size_t, size_t)"
 };
 
 
@@ -77,6 +78,9 @@ static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) {
   if (!CS.getInstruction())
     return 0;
 
+  if (CS.isNoBuiltin())
+    return 0;
+
   Function *Callee = CS.getCalledFunction();
   if (!Callee || !Callee->isDeclaration())
     return 0;
@@ -114,7 +118,7 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
     return 0;
 
   const AllocFnsTy *FnData = &AllocationFnData[i];
-  if ((FnData->AllocTy & AllocTy) == 0)
+  if ((FnData->AllocTy & AllocTy) != FnData->AllocTy)
     return 0;
 
   // Check function prototype.
@@ -186,6 +190,13 @@ bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
   return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast);
 }
 
+/// \brief Tests if a value is a call or invoke to a library function that
+/// allocates memory and never returns null (such as operator new).
+bool llvm::isOperatorNewLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+                               bool LookThroughBitCast) {
+  return getAllocationData(V, OpNewLike, TLI, LookThroughBitCast);
+}
+
 /// extractMallocCall - Returns the corresponding CallInst if the instruction
 /// is a malloc call.  Since CallInst::CreateMalloc() only creates calls, we
 /// ignore InvokeInst here.
@@ -194,7 +205,7 @@ const CallInst *llvm::extractMallocCall(const Value *I,
   return isMallocLikeFn(I, TLI) ? dyn_cast<CallInst>(I) : 0;
 }
 
-static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
+static Value *computeArraySize(const CallInst *CI, const DataLayout *DL,
                                const TargetLibraryInfo *TLI,
                                bool LookThroughSExt = false) {
   if (!CI)
@@ -202,12 +213,12 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
 
   // The size of the malloc's result type must be known to determine array size.
   Type *T = getMallocAllocatedType(CI, TLI);
-  if (!T || !T->isSized() || !TD)
+  if (!T || !T->isSized() || !DL)
     return 0;
 
-  unsigned ElementSize = TD->getTypeAllocSize(T);
+  unsigned ElementSize = DL->getTypeAllocSize(T);
   if (StructType *ST = dyn_cast<StructType>(T))
-    ElementSize = TD->getStructLayout(ST)->getSizeInBytes();
+    ElementSize = DL->getStructLayout(ST)->getSizeInBytes();
 
   // If malloc call's arg can be determined to be a multiple of ElementSize,
   // return the multiple.  Otherwise, return NULL.
@@ -224,10 +235,10 @@ static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
 /// is a call to malloc whose array size can be determined and the array size
 /// is not constant 1.  Otherwise, return NULL.
 const CallInst *llvm::isArrayMalloc(const Value *I,
-                                    const DataLayout *TD,
+                                    const DataLayout *DL,
                                     const TargetLibraryInfo *TLI) {
   const CallInst *CI = extractMallocCall(I, TLI);
-  Value *ArraySize = computeArraySize(CI, TD, TLI);
+  Value *ArraySize = computeArraySize(CI, DL, TLI);
 
   if (ConstantInt *ConstSize = dyn_cast_or_null<ConstantInt>(ArraySize))
     if (ConstSize->isOne())
@@ -285,11 +296,11 @@ Type *llvm::getMallocAllocatedType(const CallInst *CI,
 /// then return that multiple.  For non-array mallocs, the multiple is
 /// constant 1.  Otherwise, return NULL for mallocs whose array size cannot be
 /// determined.
-Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *TD,
+Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *DL,
                                 const TargetLibraryInfo *TLI,
                                 bool LookThroughSExt) {
   assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call");
-  return computeArraySize(CI, TD, TLI, LookThroughSExt);
+  return computeArraySize(CI, DL, TLI, LookThroughSExt);
 }
 
 
@@ -315,9 +326,15 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
   if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
     return 0;
 
-  if (TLIFn != LibFunc::free &&
-      TLIFn != LibFunc::ZdlPv && // operator delete(void*)
-      TLIFn != LibFunc::ZdaPv)   // operator delete[](void*)
+  unsigned ExpectedNumParams;
+  if (TLIFn == LibFunc::free ||
+      TLIFn == LibFunc::ZdlPv || // operator delete(void*)
+      TLIFn == LibFunc::ZdaPv)   // operator delete[](void*)
+    ExpectedNumParams = 1;
+  else if (TLIFn == LibFunc::ZdlPvRKSt9nothrow_t || // delete(void*, nothrow)
+           TLIFn == LibFunc::ZdaPvRKSt9nothrow_t)   // delete[](void*, nothrow)
+    ExpectedNumParams = 2;
+  else
     return 0;
 
   // Check free prototype.
@@ -326,7 +343,7 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
   FunctionType *FTy = Callee->getFunctionType();
   if (!FTy->getReturnType()->isVoidTy())
     return 0;
-  if (FTy->getNumParams() != 1)
+  if (FTy->getNumParams() != ExpectedNumParams)
     return 0;
   if (FTy->getParamType(0) != Type::getInt8PtrTy(Callee->getContext()))
     return 0;
@@ -345,12 +362,12 @@ const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
 /// object size in Size if successful, and false otherwise.
 /// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
 /// byval arguments, and global variables.
-bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *TD,
+bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *DL,
                          const TargetLibraryInfo *TLI, bool RoundToAlign) {
-  if (!TD)
+  if (!DL)
     return false;
 
-  ObjectSizeOffsetVisitor Visitor(TD, TLI, Ptr->getContext(), RoundToAlign);
+  ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), RoundToAlign);
   SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
   if (!Visitor.bothKnown(Data))
     return false;
@@ -377,12 +394,12 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
   return Size;
 }
 
-ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *TD,
+ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *DL,
                                                  const TargetLibraryInfo *TLI,
                                                  LLVMContext &Context,
                                                  bool RoundToAlign)
-: TD(TD), TLI(TLI), RoundToAlign(RoundToAlign) {
-  IntegerType *IntTy = TD->getIntPtrType(Context);
+: DL(DL), TLI(TLI), RoundToAlign(RoundToAlign) {
+  IntegerType *IntTy = DL->getIntPtrType(Context);
   IntTyBits = IntTy->getBitWidth();
   Zero = APInt::getNullValue(IntTyBits);
 }
@@ -425,7 +442,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
   if (!I.getAllocatedType()->isSized())
     return unknown();
 
-  APInt Size(IntTyBits, TD->getTypeAllocSize(I.getAllocatedType()));
+  APInt Size(IntTyBits, DL->getTypeAllocSize(I.getAllocatedType()));
   if (!I.isArrayAllocation())
     return std::make_pair(align(Size, I.getAlignment()), Zero);
 
@@ -444,7 +461,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
     return unknown();
   }
   PointerType *PT = cast<PointerType>(A.getType());
-  APInt Size(IntTyBits, TD->getTypeAllocSize(PT->getElementType()));
+  APInt Size(IntTyBits, DL->getTypeAllocSize(PT->getElementType()));
   return std::make_pair(align(Size, A.getParamAlignment()), Zero);
 }
 
@@ -517,7 +534,7 @@ ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) {
 SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) {
   SizeOffsetType PtrData = compute(GEP.getPointerOperand());
   APInt Offset(IntTyBits, 0);
-  if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(*TD, Offset))
+  if (!bothKnown(PtrData) || !GEP.accumulateConstantOffset(*DL, Offset))
     return unknown();
 
   return std::make_pair(PtrData.first, PtrData.second + Offset);
@@ -533,7 +550,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitGlobalVariable(GlobalVariable &GV){
   if (!GV.hasDefinitiveInitializer())
     return unknown();
 
-  APInt Size(IntTyBits, TD->getTypeAllocSize(GV.getType()->getElementType()));
+  APInt Size(IntTyBits, DL->getTypeAllocSize(GV.getType()->getElementType()));
   return std::make_pair(align(Size, GV.getAlignment()), Zero);
 }
 
@@ -569,12 +586,13 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) {
   return unknown();
 }
 
-
-ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *TD,
-                                                   const TargetLibraryInfo *TLI,
-                                                     LLVMContext &Context)
-: TD(TD), TLI(TLI), Context(Context), Builder(Context, TargetFolder(TD)) {
-  IntTy = TD->getIntPtrType(Context);
+ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *DL,
+                                                     const TargetLibraryInfo *TLI,
+                                                     LLVMContext &Context,
+                                                     bool RoundToAlign)
+: DL(DL), TLI(TLI), Context(Context), Builder(Context, TargetFolder(DL)),
+  RoundToAlign(RoundToAlign) {
+  IntTy = DL->getIntPtrType(Context);
   Zero = ConstantInt::get(IntTy, 0);
 }
 
@@ -598,7 +616,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
 }
 
 SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
-  ObjectSizeOffsetVisitor Visitor(TD, TLI, Context);
+  ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, RoundToAlign);
   SizeOffsetType Const = Visitor.compute(V);
   if (Visitor.bothKnown(Const))
     return std::make_pair(ConstantInt::get(Context, Const.first),
@@ -617,13 +635,15 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
   if (Instruction *I = dyn_cast<Instruction>(V))
     Builder.SetInsertPoint(I);
 
-  // record the pointers that were handled in this run, so that they can be
-  // cleaned later if something fails
-  SeenVals.insert(V);
-
   // now compute the size and offset
   SizeOffsetEvalType Result;
-  if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+
+  // Record the pointers that were handled in this run, so that they can be
+  // cleaned later if something fails. We also use this set to break cycles that
+  // can occur in dead code.
+  if (!SeenVals.insert(V)) {
+    Result = unknown();
+  } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
     Result = visitGEPOperator(*GEP);
   } else if (Instruction *I = dyn_cast<Instruction>(V)) {
     Result = visit(*I);
@@ -656,7 +676,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
   assert(I.isArrayAllocation());
   Value *ArraySize = I.getArraySize();
   Value *Size = ConstantInt::get(ArraySize->getType(),
-                                 TD->getTypeAllocSize(I.getAllocatedType()));
+                                 DL->getTypeAllocSize(I.getAllocatedType()));
   Size = Builder.CreateMul(Size, ArraySize);
   return std::make_pair(Size, Zero);
 }
@@ -708,7 +728,7 @@ ObjectSizeOffsetEvaluator::visitGEPOperator(GEPOperator &GEP) {
   if (!bothKnown(PtrData))
     return unknown();
 
-  Value *Offset = EmitGEPOffset(&Builder, *TD, &GEP, /*NoAssumptions=*/true);
+  Value *Offset = EmitGEPOffset(&Builder, *DL, &GEP, /*NoAssumptions=*/true);
   Offset = Builder.CreateAdd(PtrData.second, Offset);
   return std::make_pair(PtrData.first, Offset);
 }
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index c0009cb..84ff2ee 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -1,4 +1,4 @@
-//===- MemoryDependenceAnalysis.cpp - Mem Deps Implementation  --*- C++ -*-===//
+//===- MemoryDependenceAnalysis.cpp - Mem Deps Implementation -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -89,7 +89,7 @@ bool MemoryDependenceAnalysis::runOnFunction(Function &) {
   AA = &getAnalysis<AliasAnalysis>();
   TD = getAnalysisIfAvailable<DataLayout>();
   DT = getAnalysisIfAvailable<DominatorTree>();
-  if (PredCache == 0)
+  if (!PredCache)
     PredCache.reset(new PredIteratorCache());
   return false;
 }
@@ -371,18 +371,19 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
 
   // Walk backwards through the basic block, looking for dependencies.
   while (ScanIt != BB->begin()) {
+    Instruction *Inst = --ScanIt;
+
+    if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
+      // Debug intrinsics don't (and can't) cause dependencies.
+      if (isa<DbgInfoIntrinsic>(II)) continue;
+
     // Limit the amount of scanning we do so we don't end up with quadratic
     // running time on extreme testcases.
     --Limit;
     if (!Limit)
       return MemDepResult::getUnknown();
 
-    Instruction *Inst = --ScanIt;
-
     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-      // Debug intrinsics don't (and can't) cause dependences.
-      if (isa<DbgInfoIntrinsic>(II)) continue;
-
       // If we reach a lifetime begin or end marker, then the query ends here
       // because the value is undefined.
       if (II->getIntrinsicID() == Intrinsic::lifetime_start) {
diff --git a/lib/Analysis/PathNumbering.cpp b/lib/Analysis/PathNumbering.cpp
deleted file mode 100644
index 30d213b..0000000
--- a/lib/Analysis/PathNumbering.cpp
+++ /dev/null
@@ -1,521 +0,0 @@
-//===- PathNumbering.cpp --------------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Ball-Larus path numbers uniquely identify paths through a directed acyclic
-// graph (DAG) [Ball96].  For a CFG backedges are removed and replaced by phony
-// edges to obtain a DAG, and thus the unique path numbers [Ball96].
-//
-// The purpose of this analysis is to enumerate the edges in a CFG in order
-// to obtain paths from path numbers in a convenient manner.  As described in
-// [Ball96] edges can be enumerated such that given a path number by following
-// the CFG and updating the path number, the path is obtained.
-//
-// [Ball96]
-//  T. Ball and J. R. Larus. "Efficient Path Profiling."
-//  International Symposium on Microarchitecture, pages 46-57, 1996.
-//  http://portal.acm.org/citation.cfm?id=243857
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "ball-larus-numbering"
-
-#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <queue>
-#include <sstream>
-#include <stack>
-#include <string>
-#include <utility>
-
-using namespace llvm;
-
-// Are we enabling early termination
-static cl::opt<bool> ProcessEarlyTermination(
-  "path-profile-early-termination", cl::Hidden,
-  cl::desc("In path profiling, insert extra instrumentation to account for "
-           "unexpected function termination."));
-
-// Returns the basic block for the BallLarusNode
-BasicBlock* BallLarusNode::getBlock() {
-  return(_basicBlock);
-}
-
-// Returns the number of paths to the exit starting at the node.
-unsigned BallLarusNode::getNumberPaths() {
-  return(_numberPaths);
-}
-
-// Sets the number of paths to the exit starting at the node.
-void BallLarusNode::setNumberPaths(unsigned numberPaths) {
-  _numberPaths = numberPaths;
-}
-
-// Gets the NodeColor used in graph algorithms.
-BallLarusNode::NodeColor BallLarusNode::getColor() {
-  return(_color);
-}
-
-// Sets the NodeColor used in graph algorithms.
-void BallLarusNode::setColor(BallLarusNode::NodeColor color) {
-  _color = color;
-}
-
-// Returns an iterator over predecessor edges. Includes phony and
-// backedges.
-BLEdgeIterator BallLarusNode::predBegin() {
-  return(_predEdges.begin());
-}
-
-// Returns the end sentinel for the predecessor iterator.
-BLEdgeIterator BallLarusNode::predEnd() {
-  return(_predEdges.end());
-}
-
-// Returns the number of predecessor edges.  Includes phony and
-// backedges.
-unsigned BallLarusNode::getNumberPredEdges() {
-  return(_predEdges.size());
-}
-
-// Returns an iterator over successor edges. Includes phony and
-// backedges.
-BLEdgeIterator BallLarusNode::succBegin() {
-  return(_succEdges.begin());
-}
-
-// Returns the end sentinel for the successor iterator.
-BLEdgeIterator BallLarusNode::succEnd() {
-  return(_succEdges.end());
-}
-
-// Returns the number of successor edges.  Includes phony and
-// backedges.
-unsigned BallLarusNode::getNumberSuccEdges() {
-  return(_succEdges.size());
-}
-
-// Add an edge to the predecessor list.
-void BallLarusNode::addPredEdge(BallLarusEdge* edge) {
-  _predEdges.push_back(edge);
-}
-
-// Remove an edge from the predecessor list.
-void BallLarusNode::removePredEdge(BallLarusEdge* edge) {
-  removeEdge(_predEdges, edge);
-}
-
-// Add an edge to the successor list.
-void BallLarusNode::addSuccEdge(BallLarusEdge* edge) {
-  _succEdges.push_back(edge);
-}
-
-// Remove an edge from the successor list.
-void BallLarusNode::removeSuccEdge(BallLarusEdge* edge) {
-  removeEdge(_succEdges, edge);
-}
-
-// Returns the name of the BasicBlock being represented.  If BasicBlock
-// is null then returns "<null>".  If BasicBlock has no name, then
-// "<unnamed>" is returned.  Intended for use with debug output.
-std::string BallLarusNode::getName() {
-  std::stringstream name;
-
-  if(getBlock() != NULL) {
-    if(getBlock()->hasName()) {
-      std::string tempName(getBlock()->getName());
-      name << tempName.c_str() << " (" << _uid << ")";
-    } else
-      name << "<unnamed> (" << _uid << ")";
-  } else
-    name << "<null> (" << _uid << ")";
-
-  return name.str();
-}
-
-// Removes an edge from an edgeVector.  Used by removePredEdge and
-// removeSuccEdge.
-void BallLarusNode::removeEdge(BLEdgeVector& v, BallLarusEdge* e) {
-  // TODO: Avoid linear scan by using a set instead
-  for(BLEdgeIterator i = v.begin(),
-        end = v.end();
-      i != end;
-      ++i) {
-    if((*i) == e) {
-      v.erase(i);
-      break;
-    }
-  }
-}
-
-// Returns the source node of this edge.
-BallLarusNode* BallLarusEdge::getSource() const {
-  return(_source);
-}
-
-// Returns the target node of this edge.
-BallLarusNode* BallLarusEdge::getTarget() const {
-  return(_target);
-}
-
-// Sets the type of the edge.
-BallLarusEdge::EdgeType BallLarusEdge::getType() const {
-  return _edgeType;
-}
-
-// Gets the type of the edge.
-void BallLarusEdge::setType(EdgeType type) {
-  _edgeType = type;
-}
-
-// Returns the weight of this edge.  Used to decode path numbers to sequences
-// of basic blocks.
-unsigned BallLarusEdge::getWeight() {
-  return(_weight);
-}
-
-// Sets the weight of the edge.  Used during path numbering.
-void BallLarusEdge::setWeight(unsigned weight) {
-  _weight = weight;
-}
-
-// Gets the phony edge originating at the root.
-BallLarusEdge* BallLarusEdge::getPhonyRoot() {
-  return _phonyRoot;
-}
-
-// Sets the phony edge originating at the root.
-void BallLarusEdge::setPhonyRoot(BallLarusEdge* phonyRoot) {
-  _phonyRoot = phonyRoot;
-}
-
-// Gets the phony edge terminating at the exit.
-BallLarusEdge* BallLarusEdge::getPhonyExit() {
-  return _phonyExit;
-}
-
-// Sets the phony edge terminating at the exit.
-void BallLarusEdge::setPhonyExit(BallLarusEdge* phonyExit) {
-  _phonyExit = phonyExit;
-}
-
-// Gets the associated real edge if this is a phony edge.
-BallLarusEdge* BallLarusEdge::getRealEdge() {
-  return _realEdge;
-}
-
-// Sets the associated real edge if this is a phony edge.
-void BallLarusEdge::setRealEdge(BallLarusEdge* realEdge) {
-  _realEdge = realEdge;
-}
-
-// Returns the duplicate number of the edge.
-unsigned BallLarusEdge::getDuplicateNumber() {
-  return(_duplicateNumber);
-}
-
-// Initialization that requires virtual functions which are not fully
-// functional in the constructor.
-void BallLarusDag::init() {
-  BLBlockNodeMap inDag;
-  std::stack<BallLarusNode*> dfsStack;
-
-  _root = addNode(&(_function.getEntryBlock()));
-  _exit = addNode(NULL);
-
-  // start search from root
-  dfsStack.push(getRoot());
-
-  // dfs to add each bb into the dag
-  while(dfsStack.size())
-    buildNode(inDag, dfsStack);
-
-  // put in the final edge
-  addEdge(getExit(),getRoot(),0);
-}
-
-// Frees all memory associated with the DAG.
-BallLarusDag::~BallLarusDag() {
-  for(BLEdgeIterator edge = _edges.begin(), end = _edges.end(); edge != end;
-      ++edge)
-    delete (*edge);
-
-  for(BLNodeIterator node = _nodes.begin(), end = _nodes.end(); node != end;
-      ++node)
-    delete (*node);
-}
-
-// Calculate the path numbers by assigning edge increments as prescribed
-// in Ball-Larus path profiling.
-void BallLarusDag::calculatePathNumbers() {
-  BallLarusNode* node;
-  std::queue<BallLarusNode*> bfsQueue;
-  bfsQueue.push(getExit());
-
-  while(bfsQueue.size() > 0) {
-    node = bfsQueue.front();
-
-    DEBUG(dbgs() << "calculatePathNumbers on " << node->getName() << "\n");
-
-    bfsQueue.pop();
-    unsigned prevPathNumber = node->getNumberPaths();
-    calculatePathNumbersFrom(node);
-
-    // Check for DAG splitting
-    if( node->getNumberPaths() > 100000000 && node != getRoot() ) {
-      // Add new phony edge from the split-node to the DAG's exit
-      BallLarusEdge* exitEdge = addEdge(node, getExit(), 0);
-      exitEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
-
-      // Counters to handle the possibility of a multi-graph
-      BasicBlock* oldTarget = 0;
-      unsigned duplicateNumber = 0;
-
-      // Iterate through each successor edge, adding phony edges
-      for( BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
-           succ != end; oldTarget = (*succ)->getTarget()->getBlock(), succ++ ) {
-
-        if( (*succ)->getType() == BallLarusEdge::NORMAL ) {
-          // is this edge a duplicate?
-          if( oldTarget != (*succ)->getTarget()->getBlock() )
-            duplicateNumber = 0;
-
-          // create the new phony edge: root -> succ
-          BallLarusEdge* rootEdge =
-            addEdge(getRoot(), (*succ)->getTarget(), duplicateNumber++);
-          rootEdge->setType(BallLarusEdge::SPLITEDGE_PHONY);
-          rootEdge->setRealEdge(*succ);
-
-          // split on this edge and reference it's exit/root phony edges
-          (*succ)->setType(BallLarusEdge::SPLITEDGE);
-          (*succ)->setPhonyRoot(rootEdge);
-          (*succ)->setPhonyExit(exitEdge);
-          (*succ)->setWeight(0);
-        }
-      }
-
-      calculatePathNumbersFrom(node);
-    }
-
-    DEBUG(dbgs() << "prev, new number paths " << prevPathNumber << ", "
-          << node->getNumberPaths() << ".\n");
-
-    if(prevPathNumber == 0 && node->getNumberPaths() != 0) {
-      DEBUG(dbgs() << "node ready : " << node->getName() << "\n");
-      for(BLEdgeIterator pred = node->predBegin(), end = node->predEnd();
-          pred != end; pred++) {
-        if( (*pred)->getType() == BallLarusEdge::BACKEDGE ||
-            (*pred)->getType() == BallLarusEdge::SPLITEDGE )
-          continue;
-
-        BallLarusNode* nextNode = (*pred)->getSource();
-        // not yet visited?
-        if(nextNode->getNumberPaths() == 0)
-          bfsQueue.push(nextNode);
-      }
-    }
-  }
-
-  DEBUG(dbgs() << "\tNumber of paths: " << getRoot()->getNumberPaths() << "\n");
-}
-
-// Returns the number of paths for the Dag.
-unsigned BallLarusDag::getNumberOfPaths() {
-  return(getRoot()->getNumberPaths());
-}
-
-// Returns the root (i.e. entry) node for the DAG.
-BallLarusNode* BallLarusDag::getRoot() {
-  return _root;
-}
-
-// Returns the exit node for the DAG.
-BallLarusNode* BallLarusDag::getExit() {
-  return _exit;
-}
-
-// Returns the function for the DAG.
-Function& BallLarusDag::getFunction() {
-  return(_function);
-}
-
-// Clears the node colors.
-void BallLarusDag::clearColors(BallLarusNode::NodeColor color) {
-  for (BLNodeIterator nodeIt = _nodes.begin(); nodeIt != _nodes.end(); nodeIt++)
-    (*nodeIt)->setColor(color);
-}
-
-// Processes one node and its imediate edges for building the DAG.
-void BallLarusDag::buildNode(BLBlockNodeMap& inDag, BLNodeStack& dfsStack) {
-  BallLarusNode* currentNode = dfsStack.top();
-  BasicBlock* currentBlock = currentNode->getBlock();
-
-  if(currentNode->getColor() != BallLarusNode::WHITE) {
-    // we have already visited this node
-    dfsStack.pop();
-    currentNode->setColor(BallLarusNode::BLACK);
-  } else {
-    // are there any external procedure calls?
-    if( ProcessEarlyTermination ) {
-      for( BasicBlock::iterator bbCurrent = currentNode->getBlock()->begin(),
-             bbEnd = currentNode->getBlock()->end(); bbCurrent != bbEnd;
-           bbCurrent++ ) {
-        Instruction& instr = *bbCurrent;
-        if( instr.getOpcode() == Instruction::Call ) {
-          BallLarusEdge* callEdge = addEdge(currentNode, getExit(), 0);
-          callEdge->setType(BallLarusEdge::CALLEDGE_PHONY);
-          break;
-        }
-      }
-    }
-
-    TerminatorInst* terminator = currentNode->getBlock()->getTerminator();
-    if(isa<ReturnInst>(terminator) || isa<UnreachableInst>(terminator) ||
-       isa<ResumeInst>(terminator))
-      addEdge(currentNode, getExit(),0);
-
-    currentNode->setColor(BallLarusNode::GRAY);
-    inDag[currentBlock] = currentNode;
-
-    BasicBlock* oldSuccessor = 0;
-    unsigned duplicateNumber = 0;
-
-    // iterate through this node's successors
-    for(succ_iterator successor = succ_begin(currentBlock),
-          succEnd = succ_end(currentBlock); successor != succEnd;
-        oldSuccessor = *successor, ++successor ) {
-      BasicBlock* succBB = *successor;
-
-      // is this edge a duplicate?
-      if (oldSuccessor == succBB)
-        duplicateNumber++;
-      else
-        duplicateNumber = 0;
-
-      buildEdge(inDag, dfsStack, currentNode, succBB, duplicateNumber);
-    }
-  }
-}
-
-// Process an edge in the CFG for DAG building.
-void BallLarusDag::buildEdge(BLBlockNodeMap& inDag, std::stack<BallLarusNode*>&
-                             dfsStack, BallLarusNode* currentNode,
-                             BasicBlock* succBB, unsigned duplicateCount) {
-  BallLarusNode* succNode = inDag[succBB];
-
-  if(succNode && succNode->getColor() == BallLarusNode::BLACK) {
-    // visited node and forward edge
-    addEdge(currentNode, succNode, duplicateCount);
-  } else if(succNode && succNode->getColor() == BallLarusNode::GRAY) {
-    // visited node and back edge
-    DEBUG(dbgs() << "Backedge detected.\n");
-    addBackedge(currentNode, succNode, duplicateCount);
-  } else {
-    BallLarusNode* childNode;
-    // not visited node and forward edge
-    if(succNode) // an unvisited node that is child of a gray node
-      childNode = succNode;
-    else { // an unvisited node that is a child of a an unvisted node
-      childNode = addNode(succBB);
-      inDag[succBB] = childNode;
-    }
-    addEdge(currentNode, childNode, duplicateCount);
-    dfsStack.push(childNode);
-  }
-}
-
-// The weight on each edge is the increment required along any path that
-// contains that edge.
-void BallLarusDag::calculatePathNumbersFrom(BallLarusNode* node) {
-  if(node == getExit())
-    // The Exit node must be base case
-    node->setNumberPaths(1);
-  else {
-    unsigned sumPaths = 0;
-    BallLarusNode* succNode;
-
-    for(BLEdgeIterator succ = node->succBegin(), end = node->succEnd();
-        succ != end; succ++) {
-      if( (*succ)->getType() == BallLarusEdge::BACKEDGE ||
-          (*succ)->getType() == BallLarusEdge::SPLITEDGE )
-        continue;
-
-      (*succ)->setWeight(sumPaths);
-      succNode = (*succ)->getTarget();
-
-      if( !succNode->getNumberPaths() )
-        return;
-      sumPaths += succNode->getNumberPaths();
-    }
-
-    node->setNumberPaths(sumPaths);
-  }
-}
-
-// Allows subclasses to determine which type of Node is created.
-// Override this method to produce subclasses of BallLarusNode if
-// necessary. The destructor of BallLarusDag will call free on each
-// pointer created.
-BallLarusNode* BallLarusDag::createNode(BasicBlock* BB) {
-  return( new BallLarusNode(BB) );
-}
-
-// Allows subclasses to determine which type of Edge is created.
-// Override this method to produce subclasses of BallLarusEdge if
-// necessary. The destructor of BallLarusDag will call free on each
-// pointer created.
-BallLarusEdge* BallLarusDag::createEdge(BallLarusNode* source,
-                                        BallLarusNode* target,
-                                        unsigned duplicateCount) {
-  return( new BallLarusEdge(source, target, duplicateCount) );
-}
-
-// Proxy to node's constructor.  Updates the DAG state.
-BallLarusNode* BallLarusDag::addNode(BasicBlock* BB) {
-  BallLarusNode* newNode = createNode(BB);
-  _nodes.push_back(newNode);
-  return( newNode );
-}
-
-// Proxy to edge's constructor. Updates the DAG state.
-BallLarusEdge* BallLarusDag::addEdge(BallLarusNode* source,
-                                     BallLarusNode* target,
-                                     unsigned duplicateCount) {
-  BallLarusEdge* newEdge = createEdge(source, target, duplicateCount);
-  _edges.push_back(newEdge);
-  source->addSuccEdge(newEdge);
-  target->addPredEdge(newEdge);
-  return(newEdge);
-}
-
-// Adds a backedge with its phony edges. Updates the DAG state.
-void BallLarusDag::addBackedge(BallLarusNode* source, BallLarusNode* target,
-                               unsigned duplicateCount) {
-  BallLarusEdge* childEdge = addEdge(source, target, duplicateCount);
-  childEdge->setType(BallLarusEdge::BACKEDGE);
-
-  childEdge->setPhonyRoot(addEdge(getRoot(), target,0));
-  childEdge->setPhonyExit(addEdge(source, getExit(),0));
-
-  childEdge->getPhonyRoot()->setRealEdge(childEdge);
-  childEdge->getPhonyRoot()->setType(BallLarusEdge::BACKEDGE_PHONY);
-
-  childEdge->getPhonyExit()->setRealEdge(childEdge);
-  childEdge->getPhonyExit()->setType(BallLarusEdge::BACKEDGE_PHONY);
-  _backEdges.push_back(childEdge);
-}
diff --git a/lib/Analysis/PathProfileInfo.cpp b/lib/Analysis/PathProfileInfo.cpp
deleted file mode 100644
index bc53221..0000000
--- a/lib/Analysis/PathProfileInfo.cpp
+++ /dev/null
@@ -1,433 +0,0 @@
-//===- PathProfileInfo.cpp ------------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interface used by optimizers to load path profiles,
-// and provides a loader pass which reads a path profile file.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "path-profile-info"
-
-#include "llvm/Analysis/PathProfileInfo.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdio>
-
-using namespace llvm;
-
-// command line option for loading path profiles
-static cl::opt<std::string>
-PathProfileInfoFilename("path-profile-loader-file", cl::init("llvmprof.out"),
-  cl::value_desc("filename"),
-  cl::desc("Path profile file loaded by -path-profile-loader"), cl::Hidden);
-
-namespace {
-  class PathProfileLoaderPass : public ModulePass, public PathProfileInfo {
-  public:
-    PathProfileLoaderPass() : ModulePass(ID) { }
-    ~PathProfileLoaderPass();
-
-    // this pass doesn't change anything (only loads information)
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-    }
-
-    // the full name of the loader pass
-    virtual const char* getPassName() const {
-      return "Path Profiling Information Loader";
-    }
-
-    // required since this pass implements multiple inheritance
-                virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &PathProfileInfo::ID)
-        return (PathProfileInfo*)this;
-      return this;
-    }
-
-    // entry point to run the pass
-    bool runOnModule(Module &M);
-
-    // pass identification
-    static char ID;
-
-  private:
-    // make a reference table to refer to function by number
-    void buildFunctionRefs(Module &M);
-
-    // process argument info of a program from the input file
-    void handleArgumentInfo();
-
-    // process path number information from the input file
-    void handlePathInfo();
-
-    // array of references to the functions in the module
-    std::vector<Function*> _functions;
-
-    // path profile file handle
-    FILE* _file;
-
-    // path profile file name
-    std::string _filename;
-  };
-}
-
-// register PathLoader
-char PathProfileLoaderPass::ID = 0;
-
-INITIALIZE_ANALYSIS_GROUP(PathProfileInfo, "Path Profile Information",
-                          NoPathProfileInfo)
-INITIALIZE_AG_PASS(PathProfileLoaderPass, PathProfileInfo,
-                   "path-profile-loader",
-                   "Load path profile information from file",
-                   false, true, false)
-
-char &llvm::PathProfileLoaderPassID = PathProfileLoaderPass::ID;
-
-// link PathLoader as a pass, and make it available as an optimisation
-ModulePass *llvm::createPathProfileLoaderPass() {
-  return new PathProfileLoaderPass;
-}
-
-// ----------------------------------------------------------------------------
-// PathEdge implementation
-//
-ProfilePathEdge::ProfilePathEdge (BasicBlock* source, BasicBlock* target,
-                                  unsigned duplicateNumber)
-  : _source(source), _target(target), _duplicateNumber(duplicateNumber) {}
-
-// ----------------------------------------------------------------------------
-// Path implementation
-//
-
-ProfilePath::ProfilePath (unsigned int number, unsigned int count,
-                          double countStdDev,   PathProfileInfo* ppi)
-  : _number(number) , _count(count), _countStdDev(countStdDev), _ppi(ppi) {}
-
-double ProfilePath::getFrequency() const {
-  return 100 * double(_count) /
-    double(_ppi->_functionPathCounts[_ppi->_currentFunction]);
-}
-
-static BallLarusEdge* getNextEdge (BallLarusNode* node,
-                                   unsigned int pathNumber) {
-  BallLarusEdge* best = 0;
-
-  for( BLEdgeIterator next = node->succBegin(),
-         end = node->succEnd(); next != end; next++ ) {
-    if( (*next)->getType() != BallLarusEdge::BACKEDGE && // no backedges
-        (*next)->getType() != BallLarusEdge::SPLITEDGE && // no split edges
-        (*next)->getWeight() <= pathNumber && // weight must be <= pathNumber
-        (!best || (best->getWeight() < (*next)->getWeight())) ) // best one?
-      best = *next;
-  }
-
-  return best;
-}
-
-ProfilePathEdgeVector* ProfilePath::getPathEdges() const {
-  BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
-  unsigned int increment = _number;
-  ProfilePathEdgeVector* pev = new ProfilePathEdgeVector;
-
-  while (currentNode != _ppi->_currentDag->getExit()) {
-    BallLarusEdge* next = getNextEdge(currentNode, increment);
-
-    increment -= next->getWeight();
-
-    if( next->getType() != BallLarusEdge::BACKEDGE_PHONY &&
-        next->getType() != BallLarusEdge::SPLITEDGE_PHONY &&
-        next->getTarget() != _ppi->_currentDag->getExit() )
-      pev->push_back(ProfilePathEdge(
-                       next->getSource()->getBlock(),
-                       next->getTarget()->getBlock(),
-                       next->getDuplicateNumber()));
-
-    if( next->getType() == BallLarusEdge::BACKEDGE_PHONY &&
-        next->getTarget() == _ppi->_currentDag->getExit() )
-      pev->push_back(ProfilePathEdge(
-                       next->getRealEdge()->getSource()->getBlock(),
-                       next->getRealEdge()->getTarget()->getBlock(),
-                       next->getDuplicateNumber()));
-
-    if( next->getType() == BallLarusEdge::SPLITEDGE_PHONY &&
-        next->getSource() == _ppi->_currentDag->getRoot() )
-      pev->push_back(ProfilePathEdge(
-                       next->getRealEdge()->getSource()->getBlock(),
-                       next->getRealEdge()->getTarget()->getBlock(),
-                       next->getDuplicateNumber()));
-
-    // set the new node
-    currentNode = next->getTarget();
-  }
-
-  return pev;
-}
-
-ProfilePathBlockVector* ProfilePath::getPathBlocks() const {
-  BallLarusNode* currentNode = _ppi->_currentDag->getRoot ();
-  unsigned int increment = _number;
-  ProfilePathBlockVector* pbv = new ProfilePathBlockVector;
-
-  while (currentNode != _ppi->_currentDag->getExit()) {
-    BallLarusEdge* next = getNextEdge(currentNode, increment);
-    increment -= next->getWeight();
-
-    // add block to the block list if it is a real edge
-    if( next->getType() == BallLarusEdge::NORMAL)
-      pbv->push_back (currentNode->getBlock());
-    // make the back edge the last edge since we are at the end
-    else if( next->getTarget() == _ppi->_currentDag->getExit() ) {
-      pbv->push_back (currentNode->getBlock());
-      pbv->push_back (next->getRealEdge()->getTarget()->getBlock());
-    }
-
-    // set the new node
-    currentNode = next->getTarget();
-  }
-
-  return pbv;
-}
-
-BasicBlock* ProfilePath::getFirstBlockInPath() const {
-  BallLarusNode* root = _ppi->_currentDag->getRoot();
-  BallLarusEdge* edge = getNextEdge(root, _number);
-
-  if( edge && (edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
-               edge->getType() == BallLarusEdge::SPLITEDGE_PHONY) )
-    return edge->getTarget()->getBlock();
-
-  return root->getBlock();
-}
-
-// ----------------------------------------------------------------------------
-// PathProfileInfo implementation
-//
-
-// Pass identification
-char llvm::PathProfileInfo::ID = 0;
-
-PathProfileInfo::PathProfileInfo () : _currentDag(0) , _currentFunction(0) {
-}
-
-PathProfileInfo::~PathProfileInfo() {
-  if (_currentDag)
-    delete _currentDag;
-}
-
-// set the function for which paths are currently begin processed
-void PathProfileInfo::setCurrentFunction(Function* F) {
-  // Make sure it exists
-  if (!F) return;
-
-  if (_currentDag)
-    delete _currentDag;
-
-  _currentFunction = F;
-  _currentDag = new BallLarusDag(*F);
-  _currentDag->init();
-  _currentDag->calculatePathNumbers();
-}
-
-// get the function for which paths are currently being processed
-Function* PathProfileInfo::getCurrentFunction() const {
-  return _currentFunction;
-}
-
-// get the entry block of the function
-BasicBlock* PathProfileInfo::getCurrentFunctionEntry() {
-  return _currentDag->getRoot()->getBlock();
-}
-
-// return the path based on its number
-ProfilePath* PathProfileInfo::getPath(unsigned int number) {
-  return _functionPaths[_currentFunction][number];
-}
-
-// return the number of paths which a function may potentially execute
-unsigned int PathProfileInfo::getPotentialPathCount() {
-  return _currentDag ? _currentDag->getNumberOfPaths() : 0;
-}
-
-// return an iterator for the beginning of a functions executed paths
-ProfilePathIterator PathProfileInfo::pathBegin() {
-  return _functionPaths[_currentFunction].begin();
-}
-
-// return an iterator for the end of a functions executed paths
-ProfilePathIterator PathProfileInfo::pathEnd() {
-  return _functionPaths[_currentFunction].end();
-}
-
-// returns the total number of paths run in the function
-unsigned int PathProfileInfo::pathsRun() {
-  return _currentFunction ? _functionPaths[_currentFunction].size() : 0;
-}
-
-// ----------------------------------------------------------------------------
-// PathLoader implementation
-//
-
-// remove all generated paths
-PathProfileLoaderPass::~PathProfileLoaderPass() {
-  for( FunctionPathIterator funcNext = _functionPaths.begin(),
-         funcEnd = _functionPaths.end(); funcNext != funcEnd; funcNext++)
-    for( ProfilePathIterator pathNext = funcNext->second.begin(),
-           pathEnd = funcNext->second.end(); pathNext != pathEnd; pathNext++)
-      delete pathNext->second;
-}
-
-// entry point of the pass; this loads and parses a file
-bool PathProfileLoaderPass::runOnModule(Module &M) {
-  // get the filename and setup the module's function references
-  _filename = PathProfileInfoFilename;
-  buildFunctionRefs (M);
-
-  if (!(_file = fopen(_filename.c_str(), "rb"))) {
-    errs () << "error: input '" << _filename << "' file does not exist.\n";
-    return false;
-  }
-
-  ProfilingType profType;
-
-  while( fread(&profType, sizeof(ProfilingType), 1, _file) ) {
-    switch (profType) {
-    case ArgumentInfo:
-      handleArgumentInfo ();
-      break;
-    case PathInfo:
-      handlePathInfo ();
-      break;
-    default:
-      errs () << "error: bad path profiling file syntax, " << profType << "\n";
-      fclose (_file);
-      return false;
-    }
-  }
-
-  fclose (_file);
-
-  return true;
-}
-
-// create a reference table for functions defined in the path profile file
-void PathProfileLoaderPass::buildFunctionRefs (Module &M) {
-  _functions.push_back(0); // make the 0 index a null pointer
-
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
-    if (F->isDeclaration())
-      continue;
-    _functions.push_back(F);
-  }
-}
-
-// handle command like argument infor in the output file
-void PathProfileLoaderPass::handleArgumentInfo() {
-  // get the argument list's length
-  unsigned savedArgsLength;
-  if( fread(&savedArgsLength, sizeof(unsigned), 1, _file) != 1 ) {
-    errs() << "warning: argument info header/data mismatch\n";
-    return;
-  }
-
-  // allocate a buffer, and get the arguments
-  char* args = new char[savedArgsLength+1];
-  if( fread(args, 1, savedArgsLength, _file) != savedArgsLength )
-    errs() << "warning: argument info header/data mismatch\n";
-
-  args[savedArgsLength] = '\0';
-  argList = std::string(args);
-  delete [] args; // cleanup dynamic string
-
-  // byte alignment
-  if (savedArgsLength & 3)
-    fseek(_file, 4-(savedArgsLength&3), SEEK_CUR);
-}
-
-// Handle path profile information in the output file
-void PathProfileLoaderPass::handlePathInfo () {
-  // get the number of functions in this profile
-  unsigned functionCount;
-  if( fread(&functionCount, sizeof(functionCount), 1, _file) != 1 ) {
-    errs() << "warning: path info header/data mismatch\n";
-    return;
-  }
-
-  // gather path information for each function
-  for (unsigned i = 0; i < functionCount; i++) {
-    PathProfileHeader pathHeader;
-    if( fread(&pathHeader, sizeof(pathHeader), 1, _file) != 1 ) {
-      errs() << "warning: bad header for path function info\n";
-      break;
-    }
-
-    Function* f = _functions[pathHeader.fnNumber];
-
-    // dynamically allocate a table to store path numbers
-    PathProfileTableEntry* pathTable =
-      new PathProfileTableEntry[pathHeader.numEntries];
-
-    if( fread(pathTable, sizeof(PathProfileTableEntry),
-              pathHeader.numEntries, _file) != pathHeader.numEntries) {
-      delete [] pathTable;
-      errs() << "warning: path function info header/data mismatch\n";
-      return;
-    }
-
-    // Build a new path for the current function
-    unsigned int totalPaths = 0;
-    for (unsigned int j = 0; j < pathHeader.numEntries; j++) {
-      totalPaths += pathTable[j].pathCounter;
-      _functionPaths[f][pathTable[j].pathNumber]
-        = new ProfilePath(pathTable[j].pathNumber, pathTable[j].pathCounter,
-                          0, this);
-    }
-
-    _functionPathCounts[f] = totalPaths;
-
-    delete [] pathTable;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-//  NoProfile PathProfileInfo implementation
-//
-
-namespace {
-  struct NoPathProfileInfo : public ImmutablePass, public PathProfileInfo {
-    static char ID; // Class identification, replacement for typeinfo
-    NoPathProfileInfo() : ImmutablePass(ID) {
-      initializeNoPathProfileInfoPass(*PassRegistry::getPassRegistry());
-    }
-
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &PathProfileInfo::ID)
-        return (PathProfileInfo*)this;
-      return this;
-    }
-
-    virtual const char *getPassName() const {
-      return "NoPathProfileInfo";
-    }
-  };
-}  // End of anonymous namespace
-
-char NoPathProfileInfo::ID = 0;
-// Register this pass...
-INITIALIZE_AG_PASS(NoPathProfileInfo, PathProfileInfo, "no-path-profile",
-                   "No Path Profile Information", false, true, true)
-
-ImmutablePass *llvm::createNoPathProfileInfoPass() { return new NoPathProfileInfo(); }
diff --git a/lib/Analysis/PathProfileVerifier.cpp b/lib/Analysis/PathProfileVerifier.cpp
deleted file mode 100644
index 48d7d05..0000000
--- a/lib/Analysis/PathProfileVerifier.cpp
+++ /dev/null
@@ -1,206 +0,0 @@
-//===- PathProfileVerifier.cpp --------------------------------*- C++ -*---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This verifier derives an edge profile file from current path profile
-// information
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "path-profile-verifier"
-
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/PathProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <stdio.h>
-
-using namespace llvm;
-
-namespace {
-  class PathProfileVerifier : public ModulePass {
-  private:
-    bool runOnModule(Module &M);
-
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    PathProfileVerifier() : ModulePass(ID) {
-      initializePathProfileVerifierPass(*PassRegistry::getPassRegistry());
-    }
-
-
-    virtual const char *getPassName() const {
-      return "Path Profiler Verifier";
-    }
-
-    // The verifier requires the path profile and edge profile.
-    virtual void getAnalysisUsage(AnalysisUsage& AU) const;
-  };
-}
-
-static cl::opt<std::string>
-EdgeProfileFilename("path-profile-verifier-file",
-  cl::init("edgefrompath.llvmprof.out"),
-  cl::value_desc("filename"),
-  cl::desc("Edge profile file generated by -path-profile-verifier"),
-  cl::Hidden);
-
-char PathProfileVerifier::ID = 0;
-INITIALIZE_PASS(PathProfileVerifier, "path-profile-verifier",
-                "Compare the path profile derived edge profile against the "
-                "edge profile.", true, true)
-
-ModulePass *llvm::createPathProfileVerifierPass() {
-  return new PathProfileVerifier();
-}
-
-// The verifier requires the path profile and edge profile.
-void PathProfileVerifier::getAnalysisUsage(AnalysisUsage& AU) const {
-  AU.addRequired<PathProfileInfo>();
-  AU.addPreserved<PathProfileInfo>();
-}
-
-typedef std::map<unsigned, unsigned> DuplicateToIndexMap;
-typedef std::map<BasicBlock*,DuplicateToIndexMap> BlockToDuplicateMap;
-typedef std::map<BasicBlock*,BlockToDuplicateMap> NestedBlockToIndexMap;
-
-// the verifier iterates through each path to gather the total
-// number of edge frequencies
-bool PathProfileVerifier::runOnModule (Module &M) {
-  PathProfileInfo& pathProfileInfo = getAnalysis<PathProfileInfo>();
-
-  // setup a data structure to map path edges which index an
-  // array of edge counters
-  NestedBlockToIndexMap arrayMap;
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-
-    arrayMap[(BasicBlock*)0][F->begin()][0] = i++;
-
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
-
-      unsigned duplicate = 0;
-      BasicBlock* prev = 0;
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e;
-           prev = TI->getSuccessor(s), ++s) {
-        if (prev == TI->getSuccessor(s))
-          duplicate++;
-        else duplicate = 0;
-
-        arrayMap[BB][TI->getSuccessor(s)][duplicate] = i++;
-      }
-    }
-  }
-
-  std::vector<unsigned> edgeArray(i);
-
-  // iterate through each path and increment the edge counters as needed
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-
-    pathProfileInfo.setCurrentFunction(F);
-
-    DEBUG(dbgs() << "function '" << F->getName() << "' ran "
-          << pathProfileInfo.pathsRun()
-          << "/" << pathProfileInfo.getPotentialPathCount()
-          << " potential paths\n");
-
-    for( ProfilePathIterator nextPath = pathProfileInfo.pathBegin(),
-           endPath = pathProfileInfo.pathEnd();
-         nextPath != endPath; nextPath++ ) {
-      ProfilePath* currentPath = nextPath->second;
-
-      ProfilePathEdgeVector* pev = currentPath->getPathEdges();
-      DEBUG(dbgs () << "path #" << currentPath->getNumber() << ": "
-            << currentPath->getCount() << "\n");
-      // setup the entry edge (normally path profiling doesn't care about this)
-      if (currentPath->getFirstBlockInPath() == &F->getEntryBlock())
-        edgeArray[arrayMap[(BasicBlock*)0][currentPath->getFirstBlockInPath()][0]]
-          += currentPath->getCount();
-
-      for( ProfilePathEdgeIterator nextEdge = pev->begin(),
-             endEdge = pev->end(); nextEdge != endEdge; nextEdge++ ) {
-        if (nextEdge != pev->begin())
-          DEBUG(dbgs() << " :: ");
-
-        BasicBlock* source = nextEdge->getSource();
-        BasicBlock* target = nextEdge->getTarget();
-        unsigned duplicateNumber = nextEdge->getDuplicateNumber();
-        DEBUG(dbgs() << source->getName() << " --{" << duplicateNumber
-                     << "}--> " << target->getName());
-
-        // Ensure all the referenced edges exist
-        // TODO: make this a separate function
-        if( !arrayMap.count(source) ) {
-          errs() << "  error [" << F->getName() << "()]: source '"
-                 << source->getName()
-                 << "' does not exist in the array map.\n";
-        } else if( !arrayMap[source].count(target) ) {
-          errs() << "  error [" << F->getName() << "()]: target '"
-                 << target->getName()
-                 << "' does not exist in the array map.\n";
-        } else if( !arrayMap[source][target].count(duplicateNumber) ) {
-          errs() << "  error [" << F->getName() << "()]: edge "
-                 << source->getName() << " -> " << target->getName()
-                 << " duplicate number " << duplicateNumber
-                 << " does not exist in the array map.\n";
-        } else {
-          edgeArray[arrayMap[source][target][duplicateNumber]]
-            += currentPath->getCount();
-        }
-      }
-
-      DEBUG(errs() << "\n");
-
-      delete pev;
-    }
-  }
-
-  std::string errorInfo;
-  std::string filename = EdgeProfileFilename;
-
-  // Open a handle to the file
-  FILE* edgeFile = fopen(filename.c_str(),"wb");
-
-  if (!edgeFile) {
-    errs() << "error: unable to open file '" << filename << "' for output.\n";
-    return false;
-  }
-
-  errs() << "Generating edge profile '" << filename << "' ...\n";
-
-  // write argument info
-  unsigned type = ArgumentInfo;
-  unsigned num = pathProfileInfo.argList.size();
-  int zeros = 0;
-
-  fwrite(&type,sizeof(unsigned),1,edgeFile);
-  fwrite(&num,sizeof(unsigned),1,edgeFile);
-  fwrite(pathProfileInfo.argList.c_str(),1,num,edgeFile);
-  if (num&3)
-    fwrite(&zeros, 1, 4-(num&3), edgeFile);
-
-  type = EdgeInfo;
-  num = edgeArray.size();
-  fwrite(&type,sizeof(unsigned),1,edgeFile);
-  fwrite(&num,sizeof(unsigned),1,edgeFile);
-
-  // write each edge to the file
-  for( std::vector<unsigned>::iterator s = edgeArray.begin(),
-         e = edgeArray.end(); s != e; s++)
-    fwrite(&*s, sizeof (unsigned), 1, edgeFile);
-
-  fclose (edgeFile);
-
-  return true;
-}
diff --git a/lib/Analysis/ProfileDataLoader.cpp b/lib/Analysis/ProfileDataLoader.cpp
deleted file mode 100644
index d7f444b..0000000
--- a/lib/Analysis/ProfileDataLoader.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===- ProfileDataLoader.cpp - Load profile information from disk ---------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileDataLoader class is used to load raw profiling data from the dump
-// file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/ProfileDataLoader.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/Analysis/ProfileDataTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
-#include <cstdio>
-#include <cstdlib>
-using namespace llvm;
-
-raw_ostream &llvm::operator<<(raw_ostream &O, std::pair<const BasicBlock *,
-                                                        const BasicBlock *> E) {
-  O << "(";
-
-  if (E.first)
-    O << E.first->getName();
-  else
-    O << "0";
-
-  O << ",";
-
-  if (E.second)
-    O << E.second->getName();
-  else
-    O << "0";
-
-  return O << ")";
-}
-
-/// AddCounts - Add 'A' and 'B', accounting for the fact that the value of one
-/// (or both) may not be defined.
-static unsigned AddCounts(unsigned A, unsigned B) {
-  // If either value is undefined, use the other.
-  // Undefined + undefined = undefined.
-  if (A == ProfileDataLoader::Uncounted) return B;
-  if (B == ProfileDataLoader::Uncounted) return A;
-
-  return A + B;
-}
-
-/// ReadProfilingData - Load 'NumEntries' items of type 'T' from file 'F'
-template <typename T>
-static void ReadProfilingData(const char *ToolName, FILE *F,
-                              T *Data, size_t NumEntries) {
-  // Read in the block of data...
-  if (fread(Data, sizeof(T), NumEntries, F) != NumEntries)
-    report_fatal_error(Twine(ToolName) + ": Profiling data truncated");
-}
-
-/// ReadProfilingNumEntries - Read how many entries are in this profiling data
-/// packet.
-static unsigned ReadProfilingNumEntries(const char *ToolName, FILE *F,
-                                        bool ShouldByteSwap) {
-  unsigned Entry;
-  ReadProfilingData<unsigned>(ToolName, F, &Entry, 1);
-  return ShouldByteSwap ? ByteSwap_32(Entry) : Entry;
-}
-
-/// ReadProfilingBlock - Read the number of entries in the next profiling data
-/// packet and then accumulate the entries into 'Data'.
-static void ReadProfilingBlock(const char *ToolName, FILE *F,
-                               bool ShouldByteSwap,
-                               SmallVector<unsigned, 32> &Data) {
-  // Read the number of entries...
-  unsigned NumEntries = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
-
-  // Read in the data.
-  SmallVector<unsigned, 8> TempSpace(NumEntries);
-  ReadProfilingData<unsigned>(ToolName, F, TempSpace.data(), NumEntries);
-
-  // Make sure we have enough space ...
-  if (Data.size() < NumEntries)
-    Data.resize(NumEntries, ProfileDataLoader::Uncounted);
-
-  // Accumulate the data we just read into the existing data.
-  for (unsigned i = 0; i < NumEntries; ++i) {
-    unsigned Entry = ShouldByteSwap ? ByteSwap_32(TempSpace[i]) : TempSpace[i];
-    Data[i] = AddCounts(Entry, Data[i]);
-  }
-}
-
-/// ReadProfilingArgBlock - Read the command line arguments that the progam was
-/// run with when the current profiling data packet(s) were generated.
-static void ReadProfilingArgBlock(const char *ToolName, FILE *F,
-                                  bool ShouldByteSwap,
-                                  SmallVector<std::string, 1> &CommandLines) {
-  // Read the number of bytes ...
-  unsigned ArgLength = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
-
-  // Read in the arguments (if there are any to read).  Round up the length to
-  // the nearest 4-byte multiple.
-  SmallVector<char, 8> Args(ArgLength+4);
-  if (ArgLength)
-    ReadProfilingData<char>(ToolName, F, Args.data(), (ArgLength+3) & ~3);
-
-  // Store the arguments.
-  CommandLines.push_back(std::string(&Args[0], &Args[ArgLength]));
-}
-
-const unsigned ProfileDataLoader::Uncounted = ~0U;
-
-/// ProfileDataLoader ctor - Read the specified profiling data file, reporting
-/// a fatal error if the file is invalid or broken.
-ProfileDataLoader::ProfileDataLoader(const char *ToolName,
-                                     const std::string &Filename)
-  : Filename(Filename) {
-  FILE *F = fopen(Filename.c_str(), "rb");
-  if (F == 0)
-    report_fatal_error(Twine(ToolName) + ": Error opening '" +
-                       Filename + "': ");
-
-  // Keep reading packets until we run out of them.
-  unsigned PacketType;
-  while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
-    // If the low eight bits of the packet are zero, we must be dealing with an
-    // endianness mismatch.  Byteswap all words read from the profiling
-    // information.  This can happen when the compiler host and target have
-    // different endianness.
-    bool ShouldByteSwap = (char)PacketType == 0;
-    PacketType = ShouldByteSwap ? ByteSwap_32(PacketType) : PacketType;
-
-    switch (PacketType) {
-      case ArgumentInfo:
-        ReadProfilingArgBlock(ToolName, F, ShouldByteSwap, CommandLines);
-        break;
-
-      case EdgeInfo:
-        ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
-        break;
-
-      default:
-        report_fatal_error(std::string(ToolName)
-                           + ": Unknown profiling packet type");
-        break;
-    }
-  }
-
-  fclose(F);
-}
diff --git a/lib/Analysis/ProfileDataLoaderPass.cpp b/lib/Analysis/ProfileDataLoaderPass.cpp
deleted file mode 100644
index 2ee0093..0000000
--- a/lib/Analysis/ProfileDataLoaderPass.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-//===- ProfileDataLoaderPass.cpp - Set branch weight metadata from prof ---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass loads profiling data from a dump file and sets branch weight
-// metadata.
-//
-// TODO: Replace all "profile-metadata-loader" strings with "profile-loader"
-// once ProfileInfo etc. has been removed.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-metadata-loader"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileDataLoader.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumEdgesRead, "The # of edges read.");
-STATISTIC(NumTermsAnnotated, "The # of terminator instructions annotated.");
-
-static cl::opt<std::string>
-ProfileMetadataFilename("profile-file", cl::init("llvmprof.out"),
-                  cl::value_desc("filename"),
-                  cl::desc("Profile file loaded by -profile-metadata-loader"));
-
-namespace {
-  /// This pass loads profiling data from a dump file and sets branch weight
-  /// metadata.
-  class ProfileMetadataLoaderPass : public ModulePass {
-    std::string Filename;
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    explicit ProfileMetadataLoaderPass(const std::string &filename = "")
-        : ModulePass(ID), Filename(filename) {
-      initializeProfileMetadataLoaderPassPass(*PassRegistry::getPassRegistry());
-      if (filename.empty()) Filename = ProfileMetadataFilename;
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-    }
-
-    virtual const char *getPassName() const {
-      return "Profile loader";
-    }
-
-    virtual void readEdge(unsigned, ProfileData&, ProfileData::Edge,
-                          ArrayRef<unsigned>);
-    virtual unsigned matchEdges(Module&, ProfileData&, ArrayRef<unsigned>);
-    virtual void setBranchWeightMetadata(Module&, ProfileData&);
-
-    virtual bool runOnModule(Module &M);
-  };
-}  // End of anonymous namespace
-
-char ProfileMetadataLoaderPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ProfileMetadataLoaderPass, "profile-metadata-loader",
-              "Load profile information from llvmprof.out", false, true)
-INITIALIZE_PASS_END(ProfileMetadataLoaderPass, "profile-metadata-loader",
-              "Load profile information from llvmprof.out", false, true)
-
-char &llvm::ProfileMetadataLoaderPassID = ProfileMetadataLoaderPass::ID;
-
-/// createProfileMetadataLoaderPass - This function returns a Pass that loads
-/// the profiling information for the module from the specified filename,
-/// making it available to the optimizers.
-ModulePass *llvm::createProfileMetadataLoaderPass() { 
-    return new ProfileMetadataLoaderPass();
-}
-ModulePass *llvm::createProfileMetadataLoaderPass(const std::string &Filename) {
-  return new ProfileMetadataLoaderPass(Filename);
-}
-
-/// readEdge - Take the value from a profile counter and assign it to an edge.
-void ProfileMetadataLoaderPass::readEdge(unsigned ReadCount,
-                                         ProfileData &PB, ProfileData::Edge e,
-                                         ArrayRef<unsigned> Counters) {
-  if (ReadCount >= Counters.size()) return;
-
-  unsigned weight = Counters[ReadCount];
-  assert(weight != ProfileDataLoader::Uncounted);
-  PB.addEdgeWeight(e, weight);
-
-  DEBUG(dbgs() << "-- Read Edge Counter for " << e
-               << " (# "<< (ReadCount) << "): "
-               << PB.getEdgeWeight(e) << "\n");
-}
-
-/// matchEdges - Link every profile counter with an edge.
-unsigned ProfileMetadataLoaderPass::matchEdges(Module &M, ProfileData &PB,
-                                               ArrayRef<unsigned> Counters) {
-  if (Counters.size() == 0) return 0;
-
-  unsigned ReadCount = 0;
-
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Loading edges in '" << F->getName() << "'\n");
-    readEdge(ReadCount++, PB, PB.getEdge(0, &F->getEntryBlock()), Counters);
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-        readEdge(ReadCount++, PB, PB.getEdge(BB,TI->getSuccessor(s)),
-                 Counters);
-      }
-    }
-  }
-
-  return ReadCount;
-}
-
-/// setBranchWeightMetadata - Translate the counter values associated with each
-/// edge into branch weights for each conditional branch (a branch with 2 or
-/// more desinations).
-void ProfileMetadataLoaderPass::setBranchWeightMetadata(Module &M,
-                                                        ProfileData &PB) {
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Setting branch metadata in '" << F->getName() << "'\n");
-
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      TerminatorInst *TI = BB->getTerminator();
-      unsigned NumSuccessors = TI->getNumSuccessors();
-
-      // If there is only one successor then we can not set a branch
-      // probability as the target is certain.
-      if (NumSuccessors < 2) continue;
-
-      // Load the weights of all edges leading from this terminator.
-      DEBUG(dbgs() << "-- Terminator with " << NumSuccessors
-                   << " successors:\n");
-      SmallVector<uint32_t, 4> Weights(NumSuccessors);
-      for (unsigned s = 0 ; s < NumSuccessors ; ++s) {
-          ProfileData::Edge edge = PB.getEdge(BB, TI->getSuccessor(s));
-          Weights[s] = (uint32_t)PB.getEdgeWeight(edge);
-          DEBUG(dbgs() << "---- Edge '" << edge << "' has weight "
-                       << Weights[s] << "\n");
-      }
-
-      // Set branch weight metadata.  This will set branch probabilities of
-      // 100%/0% if that is true of the dynamic execution.
-      // BranchProbabilityInfo can account for this when it loads this metadata
-      // (it gives the unexectuted branch a weight of 1 for the purposes of
-      // probability calculations).
-      MDBuilder MDB(TI->getContext());
-      MDNode *Node = MDB.createBranchWeights(Weights);
-      TI->setMetadata(LLVMContext::MD_prof, Node);
-      NumTermsAnnotated++;
-    }
-  }
-}
-
-bool ProfileMetadataLoaderPass::runOnModule(Module &M) {
-  ProfileDataLoader PDL("profile-data-loader", Filename);
-  ProfileData PB;
-
-  ArrayRef<unsigned> Counters = PDL.getRawEdgeCounts();
-
-  unsigned ReadCount = matchEdges(M, PB, Counters);
-
-  if (ReadCount != Counters.size()) {
-    errs() << "WARNING: profile information is inconsistent with "
-           << "the current program!\n";
-  }
-  NumEdgesRead = ReadCount;
-
-  setBranchWeightMetadata(M, PB);
-
-  return ReadCount > 0;
-}
diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp
deleted file mode 100644
index b284b99..0000000
--- a/lib/Analysis/ProfileEstimatorPass.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-//===- ProfileEstimatorPass.cpp - LLVM Pass to estimate profile info ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a concrete implementation of profiling information that
-// estimates the profiling information in a very crude and unimaginative way.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-estimator"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-static cl::opt<double>
-LoopWeight(
-    "profile-estimator-loop-weight", cl::init(10),
-    cl::value_desc("loop-weight"),
-    cl::desc("Number of loop executions used for profile-estimator")
-);
-
-namespace {
-  class ProfileEstimatorPass : public FunctionPass, public ProfileInfo {
-    double ExecCount;
-    LoopInfo *LI;
-    std::set<BasicBlock*>  BBToVisit;
-    std::map<Loop*,double> LoopExitWeights;
-    std::map<Edge,double>  MinimalWeight;
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    explicit ProfileEstimatorPass(const double execcount = 0)
-        : FunctionPass(ID), ExecCount(execcount) {
-      initializeProfileEstimatorPassPass(*PassRegistry::getPassRegistry());
-      if (execcount == 0) ExecCount = LoopWeight;
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-      AU.addRequired<LoopInfo>();
-    }
-
-    virtual const char *getPassName() const {
-      return "Profiling information estimator";
-    }
-
-    /// run - Estimate the profile information from the specified file.
-    virtual bool runOnFunction(Function &F);
-
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &ProfileInfo::ID)
-        return (ProfileInfo*)this;
-      return this;
-    }
-    
-    virtual void recurseBasicBlock(BasicBlock *BB);
-
-    void inline printEdgeWeight(Edge);
-  };
-}  // End of anonymous namespace
-
-char ProfileEstimatorPass::ID = 0;
-INITIALIZE_AG_PASS_BEGIN(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
-                "Estimate profiling information", false, true, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfo)
-INITIALIZE_AG_PASS_END(ProfileEstimatorPass, ProfileInfo, "profile-estimator",
-                "Estimate profiling information", false, true, false)
-
-namespace llvm {
-  char &ProfileEstimatorPassID = ProfileEstimatorPass::ID;
-
-  FunctionPass *createProfileEstimatorPass() {
-    return new ProfileEstimatorPass();
-  }
-
-  /// createProfileEstimatorPass - This function returns a Pass that estimates
-  /// profiling information using the given loop execution count.
-  Pass *createProfileEstimatorPass(const unsigned execcount) {
-    return new ProfileEstimatorPass(execcount);
-  }
-}
-
-static double ignoreMissing(double w) {
-  if (w == ProfileInfo::MissingValue) return 0;
-  return w;
-}
-
-static void inline printEdgeError(ProfileInfo::Edge e, const char *M) {
-  DEBUG(dbgs() << "-- Edge " << e << " is not calculated, " << M << "\n");
-}
-
-void inline ProfileEstimatorPass::printEdgeWeight(Edge E) {
-  DEBUG(dbgs() << "-- Weight of Edge " << E << ":"
-               << format("%20.20g", getEdgeWeight(E)) << "\n");
-}
-
-// recurseBasicBlock() - This calculates the ProfileInfo estimation for a
-// single block and then recurses into the successors.
-// The algorithm preserves the flow condition, meaning that the sum of the
-// weight of the incoming edges must be equal the block weight which must in
-// turn be equal to the sume of the weights of the outgoing edges.
-// Since the flow of an block is deterimined from the current state of the
-// flow, once an edge has a flow assigned this flow is never changed again,
-// otherwise it would be possible to violate the flow condition in another
-// block.
-void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
-
-  // Break the recursion if this BasicBlock was already visited.
-  if (BBToVisit.find(BB) == BBToVisit.end()) return;
-
-  // Read the LoopInfo for this block.
-  bool  BBisHeader = LI->isLoopHeader(BB);
-  Loop* BBLoop     = LI->getLoopFor(BB);
-
-  // To get the block weight, read all incoming edges.
-  double BBWeight = 0;
-  std::set<BasicBlock*> ProcessedPreds;
-  for ( pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-        bbi != bbe; ++bbi ) {
-    // If this block was not considered already, add weight.
-    Edge edge = getEdge(*bbi,BB);
-    double w = getEdgeWeight(edge);
-    if (ProcessedPreds.insert(*bbi).second) {
-      BBWeight += ignoreMissing(w);
-    }
-    // If this block is a loop header and the predecessor is contained in this
-    // loop, thus the edge is a backedge, continue and do not check if the
-    // value is valid.
-    if (BBisHeader && BBLoop->contains(*bbi)) {
-      printEdgeError(edge, "but is backedge, continuing");
-      continue;
-    }
-    // If the edges value is missing (and this is no loop header, and this is
-    // no backedge) return, this block is currently non estimatable.
-    if (w == MissingValue) {
-      printEdgeError(edge, "returning");
-      return;
-    }
-  }
-  if (getExecutionCount(BB) != MissingValue) {
-    BBWeight = getExecutionCount(BB);
-  }
-
-  // Fetch all necessary information for current block.
-  SmallVector<Edge, 8> ExitEdges;
-  SmallVector<Edge, 8> Edges;
-  if (BBLoop) {
-    BBLoop->getExitEdges(ExitEdges);
-  }
-
-  // If this is a loop header, consider the following:
-  // Exactly the flow that is entering this block, must exit this block too. So
-  // do the following: 
-  // *) get all the exit edges, read the flow that is already leaving this
-  // loop, remember the edges that do not have any flow on them right now.
-  // (The edges that have already flow on them are most likely exiting edges of
-  // other loops, do not touch those flows because the previously caclulated
-  // loopheaders would not be exact anymore.)
-  // *) In case there is not a single exiting edge left, create one at the loop
-  // latch to prevent the flow from building up in the loop.
-  // *) Take the flow that is not leaving the loop already and distribute it on
-  // the remaining exiting edges.
-  // (This ensures that all flow that enters the loop also leaves it.)
-  // *) Increase the flow into the loop by increasing the weight of this block.
-  // There is at least one incoming backedge that will bring us this flow later
-  // on. (So that the flow condition in this node is valid again.)
-  if (BBisHeader) {
-    double incoming = BBWeight;
-    // Subtract the flow leaving the loop.
-    std::set<Edge> ProcessedExits;
-    for (SmallVector<Edge, 8>::iterator ei = ExitEdges.begin(),
-         ee = ExitEdges.end(); ei != ee; ++ei) {
-      if (ProcessedExits.insert(*ei).second) {
-        double w = getEdgeWeight(*ei);
-        if (w == MissingValue) {
-          Edges.push_back(*ei);
-          // Check if there is a necessary minimal weight, if yes, subtract it 
-          // from weight.
-          if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
-            incoming -= MinimalWeight[*ei];
-            DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
-          }
-        } else {
-          incoming -= w;
-        }
-      }
-    }
-    // If no exit edges, create one:
-    if (Edges.size() == 0) {
-      BasicBlock *Latch = BBLoop->getLoopLatch();
-      if (Latch) {
-        Edge edge = getEdge(Latch,0);
-        EdgeInformation[BB->getParent()][edge] = BBWeight;
-        printEdgeWeight(edge);
-        edge = getEdge(Latch, BB);
-        EdgeInformation[BB->getParent()][edge] = BBWeight * ExecCount;
-        printEdgeWeight(edge);
-      }
-    }
-
-    // Distribute remaining weight to the exting edges. To prevent fractions
-    // from building up and provoking precision problems the weight which is to
-    // be distributed is split and the rounded, the last edge gets a somewhat
-    // bigger value, but we are close enough for an estimation.
-    double fraction = floor(incoming/Edges.size());
-    for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
-         ei != ee; ++ei) {
-      double w = 0;
-      if (ei != (ee-1)) {
-        w = fraction;
-        incoming -= fraction;
-      } else {
-        w = incoming;
-      }
-      EdgeInformation[BB->getParent()][*ei] += w;
-      // Read necessary minimal weight.
-      if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
-        EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
-        DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
-      }
-      printEdgeWeight(*ei);
-      
-      // Add minimal weight to paths to all exit edges, this is used to ensure
-      // that enough flow is reaching this edges.
-      Path p;
-      const BasicBlock *Dest = GetPath(BB, (*ei).first, p, GetPathToDest);
-      while (Dest != BB) {
-        const BasicBlock *Parent = p.find(Dest)->second;
-        Edge e = getEdge(Parent, Dest);
-        if (MinimalWeight.find(e) == MinimalWeight.end()) {
-          MinimalWeight[e] = 0;
-        }
-        MinimalWeight[e] += w;
-        DEBUG(dbgs() << "Minimal Weight for " << e << ": " << format("%.20g",MinimalWeight[e]) << "\n");
-        Dest = Parent;
-      }
-    }
-    // Increase flow into the loop.
-    BBWeight *= (ExecCount+1);
-  }
-
-  BlockInformation[BB->getParent()][BB] = BBWeight;
-  // Up until now we considered only the loop exiting edges, now we have a
-  // definite block weight and must distribute this onto the outgoing edges.
-  // Since there may be already flow attached to some of the edges, read this
-  // flow first and remember the edges that have still now flow attached.
-  Edges.clear();
-  std::set<BasicBlock*> ProcessedSuccs;
-
-  succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-  // Also check for (BB,0) edges that may already contain some flow. (But only
-  // in case there are no successors.)
-  if (bbi == bbe) {
-    Edge edge = getEdge(BB,0);
-    EdgeInformation[BB->getParent()][edge] = BBWeight;
-    printEdgeWeight(edge);
-  }
-  for ( ; bbi != bbe; ++bbi ) {
-    if (ProcessedSuccs.insert(*bbi).second) {
-      Edge edge = getEdge(BB,*bbi);
-      double w = getEdgeWeight(edge);
-      if (w != MissingValue) {
-        BBWeight -= getEdgeWeight(edge);
-      } else {
-        Edges.push_back(edge);
-        // If minimal weight is necessary, reserve weight by subtracting weight
-        // from block weight, this is readded later on.
-        if (MinimalWeight.find(edge) != MinimalWeight.end()) {
-          BBWeight -= MinimalWeight[edge];
-          DEBUG(dbgs() << "Reserving " << format("%.20g",MinimalWeight[edge]) << " at " << edge << "\n");
-        }
-      }
-    }
-  }
-
-  double fraction = Edges.size() ? floor(BBWeight/Edges.size()) : 0.0;
-  // Finally we know what flow is still not leaving the block, distribute this
-  // flow onto the empty edges.
-  for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
-       ei != ee; ++ei) {
-    if (ei != (ee-1)) {
-      EdgeInformation[BB->getParent()][*ei] += fraction;
-      BBWeight -= fraction;
-    } else {
-      EdgeInformation[BB->getParent()][*ei] += BBWeight;
-    }
-    // Readd minial necessary weight.
-    if (MinimalWeight.find(*ei) != MinimalWeight.end()) {
-      EdgeInformation[BB->getParent()][*ei] += MinimalWeight[*ei];
-      DEBUG(dbgs() << "Additionally " << format("%.20g",MinimalWeight[*ei]) << " at " << (*ei) << "\n");
-    }
-    printEdgeWeight(*ei);
-  }
-
-  // This block is visited, mark this before the recursion.
-  BBToVisit.erase(BB);
-
-  // Recurse into successors.
-  for (succ_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-       bbi != bbe; ++bbi) {
-    recurseBasicBlock(*bbi);
-  }
-}
-
-bool ProfileEstimatorPass::runOnFunction(Function &F) {
-  if (F.isDeclaration()) return false;
-
-  // Fetch LoopInfo and clear ProfileInfo for this function.
-  LI = &getAnalysis<LoopInfo>();
-  FunctionInformation.erase(&F);
-  BlockInformation[&F].clear();
-  EdgeInformation[&F].clear();
-  BBToVisit.clear();
-
-  // Mark all blocks as to visit.
-  for (Function::iterator bi = F.begin(), be = F.end(); bi != be; ++bi)
-    BBToVisit.insert(bi);
-
-  // Clear Minimal Edges.
-  MinimalWeight.clear();
-
-  DEBUG(dbgs() << "Working on function " << F.getName() << "\n");
-
-  // Since the entry block is the first one and has no predecessors, the edge
-  // (0,entry) is inserted with the starting weight of 1.
-  BasicBlock *entry = &F.getEntryBlock();
-  BlockInformation[&F][entry] = pow(2.0, 32.0);
-  Edge edge = getEdge(0,entry);
-  EdgeInformation[&F][edge] = BlockInformation[&F][entry];
-  printEdgeWeight(edge);
-
-  // Since recurseBasicBlock() maybe returns with a block which was not fully
-  // estimated, use recurseBasicBlock() until everything is calculated.
-  bool cleanup = false;
-  recurseBasicBlock(entry);
-  while (BBToVisit.size() > 0 && !cleanup) {
-    // Remember number of open blocks, this is later used to check if progress
-    // was made.
-    unsigned size = BBToVisit.size();
-
-    // Try to calculate all blocks in turn.
-    for (std::set<BasicBlock*>::iterator bi = BBToVisit.begin(),
-         be = BBToVisit.end(); bi != be; ++bi) {
-      recurseBasicBlock(*bi);
-      // If at least one block was finished, break because iterator may be
-      // invalid.
-      if (BBToVisit.size() < size) break;
-    }
-
-    // If there was not a single block resolved, make some assumptions.
-    if (BBToVisit.size() == size) {
-      bool found = false;
-      for (std::set<BasicBlock*>::iterator BBI = BBToVisit.begin(), BBE = BBToVisit.end(); 
-           (BBI != BBE) && (!found); ++BBI) {
-        BasicBlock *BB = *BBI;
-        // Try each predecessor if it can be assumend.
-        for (pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-             (bbi != bbe) && (!found); ++bbi) {
-          Edge e = getEdge(*bbi,BB);
-          double w = getEdgeWeight(e);
-          // Check that edge from predecessor is still free.
-          if (w == MissingValue) {
-            // Check if there is a circle from this block to predecessor.
-            Path P;
-            const BasicBlock *Dest = GetPath(BB, *bbi, P, GetPathToDest);
-            if (Dest != *bbi) {
-              // If there is no circle, just set edge weight to 0
-              EdgeInformation[&F][e] = 0;
-              DEBUG(dbgs() << "Assuming edge weight: ");
-              printEdgeWeight(e);
-              found = true;
-            }
-          }
-        }
-      }
-      if (!found) {
-        cleanup = true;
-        DEBUG(dbgs() << "No assumption possible in Fuction "<<F.getName()<<", setting all to zero\n");
-      }
-    }
-  }
-  // In case there was no safe way to assume edges, set as a last measure, 
-  // set _everything_ to zero.
-  if (cleanup) {
-    FunctionInformation[&F] = 0;
-    BlockInformation[&F].clear();
-    EdgeInformation[&F].clear();
-    for (Function::const_iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) {
-      const BasicBlock *BB = &(*FI);
-      BlockInformation[&F][BB] = 0;
-      const_pred_iterator predi = pred_begin(BB), prede = pred_end(BB);
-      if (predi == prede) {
-        Edge e = getEdge(0,BB);
-        setEdgeWeight(e,0);
-      }
-      for (;predi != prede; ++predi) {
-        Edge e = getEdge(*predi,BB);
-        setEdgeWeight(e,0);
-      }
-      succ_const_iterator succi = succ_begin(BB), succe = succ_end(BB);
-      if (succi == succe) {
-        Edge e = getEdge(BB,0);
-        setEdgeWeight(e,0);
-      }
-      for (;succi != succe; ++succi) {
-        Edge e = getEdge(*succi,BB);
-        setEdgeWeight(e,0);
-      }
-    }
-  }
-
-  return false;
-}
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
deleted file mode 100644
index 9626a48..0000000
--- a/lib/Analysis/ProfileInfo.cpp
+++ /dev/null
@@ -1,1079 +0,0 @@
-//===- ProfileInfo.cpp - Profile Info Interface ---------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the abstract ProfileInfo interface, and the default
-// "no profile" implementation.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-info"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include <limits>
-#include <queue>
-#include <set>
-using namespace llvm;
-
-namespace llvm {
-  template<> char ProfileInfoT<Function,BasicBlock>::ID = 0;
-}
-
-// Register the ProfileInfo interface, providing a nice name to refer to.
-INITIALIZE_ANALYSIS_GROUP(ProfileInfo, "Profile Information", NoProfileInfo)
-
-namespace llvm {
-
-template <>
-ProfileInfoT<MachineFunction, MachineBasicBlock>::ProfileInfoT() {}
-template <>
-ProfileInfoT<MachineFunction, MachineBasicBlock>::~ProfileInfoT() {}
-
-template <>
-ProfileInfoT<Function, BasicBlock>::ProfileInfoT() {
-  MachineProfile = 0;
-}
-template <>
-ProfileInfoT<Function, BasicBlock>::~ProfileInfoT() {
-  if (MachineProfile) delete MachineProfile;
-}
-
-template<>
-char ProfileInfoT<MachineFunction, MachineBasicBlock>::ID = 0;
-
-template<>
-const double ProfileInfoT<Function,BasicBlock>::MissingValue = -1;
-
-template<> const
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::MissingValue = -1;
-
-template<> double
-ProfileInfoT<Function,BasicBlock>::getExecutionCount(const BasicBlock *BB) {
-  std::map<const Function*, BlockCounts>::iterator J =
-    BlockInformation.find(BB->getParent());
-  if (J != BlockInformation.end()) {
-    BlockCounts::iterator I = J->second.find(BB);
-    if (I != J->second.end())
-      return I->second;
-  }
-
-  double Count = MissingValue;
-
-  const_pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
-
-  // Are there zero predecessors of this block?
-  if (PI == PE) {
-    Edge e = getEdge(0, BB);
-    Count = getEdgeWeight(e);
-  } else {
-    // Otherwise, if there are predecessors, the execution count of this block is
-    // the sum of the edge frequencies from the incoming edges.
-    std::set<const BasicBlock*> ProcessedPreds;
-    Count = 0;
-    for (; PI != PE; ++PI) {
-      const BasicBlock *P = *PI;
-      if (ProcessedPreds.insert(P).second) {
-        double w = getEdgeWeight(getEdge(P, BB));
-        if (w == MissingValue) {
-          Count = MissingValue;
-          break;
-        }
-        Count += w;
-      }
-    }
-  }
-
-  // If the predecessors did not suffice to get block weight, try successors.
-  if (Count == MissingValue) {
-
-    succ_const_iterator SI = succ_begin(BB), SE = succ_end(BB);
-
-    // Are there zero successors of this block?
-    if (SI == SE) {
-      Edge e = getEdge(BB,0);
-      Count = getEdgeWeight(e);
-    } else {
-      std::set<const BasicBlock*> ProcessedSuccs;
-      Count = 0;
-      for (; SI != SE; ++SI)
-        if (ProcessedSuccs.insert(*SI).second) {
-          double w = getEdgeWeight(getEdge(BB, *SI));
-          if (w == MissingValue) {
-            Count = MissingValue;
-            break;
-          }
-          Count += w;
-        }
-    }
-  }
-
-  if (Count != MissingValue) BlockInformation[BB->getParent()][BB] = Count;
-  return Count;
-}
-
-template<>
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::
-        getExecutionCount(const MachineBasicBlock *MBB) {
-  std::map<const MachineFunction*, BlockCounts>::iterator J =
-    BlockInformation.find(MBB->getParent());
-  if (J != BlockInformation.end()) {
-    BlockCounts::iterator I = J->second.find(MBB);
-    if (I != J->second.end())
-      return I->second;
-  }
-
-  return MissingValue;
-}
-
-template<>
-double ProfileInfoT<Function,BasicBlock>::getExecutionCount(const Function *F) {
-  std::map<const Function*, double>::iterator J =
-    FunctionInformation.find(F);
-  if (J != FunctionInformation.end())
-    return J->second;
-
-  // isDeclaration() is checked here and not at start of function to allow
-  // functions without a body still to have a execution count.
-  if (F->isDeclaration()) return MissingValue;
-
-  double Count = getExecutionCount(&F->getEntryBlock());
-  if (Count != MissingValue) FunctionInformation[F] = Count;
-  return Count;
-}
-
-template<>
-double ProfileInfoT<MachineFunction, MachineBasicBlock>::
-        getExecutionCount(const MachineFunction *MF) {
-  std::map<const MachineFunction*, double>::iterator J =
-    FunctionInformation.find(MF);
-  if (J != FunctionInformation.end())
-    return J->second;
-
-  double Count = getExecutionCount(&MF->front());
-  if (Count != MissingValue) FunctionInformation[MF] = Count;
-  return Count;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        setExecutionCount(const BasicBlock *BB, double w) {
-  DEBUG(dbgs() << "Creating Block " << BB->getName() 
-               << " (weight: " << format("%.20g",w) << ")\n");
-  BlockInformation[BB->getParent()][BB] = w;
-}
-
-template<>
-void ProfileInfoT<MachineFunction, MachineBasicBlock>::
-        setExecutionCount(const MachineBasicBlock *MBB, double w) {
-  DEBUG(dbgs() << "Creating Block " << MBB->getBasicBlock()->getName()
-               << " (weight: " << format("%.20g",w) << ")\n");
-  BlockInformation[MBB->getParent()][MBB] = w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::addEdgeWeight(Edge e, double w) {
-  double oldw = getEdgeWeight(e);
-  assert (oldw != MissingValue && "Adding weight to Edge with no previous weight");
-  DEBUG(dbgs() << "Adding to Edge " << e
-               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
-  EdgeInformation[getFunction(e)][e] = oldw + w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        addExecutionCount(const BasicBlock *BB, double w) {
-  double oldw = getExecutionCount(BB);
-  assert (oldw != MissingValue && "Adding weight to Block with no previous weight");
-  DEBUG(dbgs() << "Adding to Block " << BB->getName()
-               << " (new weight: " << format("%.20g",oldw + w) << ")\n");
-  BlockInformation[BB->getParent()][BB] = oldw + w;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::removeBlock(const BasicBlock *BB) {
-  std::map<const Function*, BlockCounts>::iterator J =
-    BlockInformation.find(BB->getParent());
-  if (J == BlockInformation.end()) return;
-
-  DEBUG(dbgs() << "Deleting " << BB->getName() << "\n");
-  J->second.erase(BB);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::removeEdge(Edge e) {
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(getFunction(e));
-  if (J == EdgeInformation.end()) return;
-
-  DEBUG(dbgs() << "Deleting" << e << "\n");
-  J->second.erase(e);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        replaceEdge(const Edge &oldedge, const Edge &newedge) {
-  double w;
-  if ((w = getEdgeWeight(newedge)) == MissingValue) {
-    w = getEdgeWeight(oldedge);
-    DEBUG(dbgs() << "Replacing " << oldedge << " with " << newedge  << "\n");
-  } else {
-    w += getEdgeWeight(oldedge);
-    DEBUG(dbgs() << "Adding " << oldedge << " to " << newedge  << "\n");
-  }
-  setEdgeWeight(newedge,w);
-  removeEdge(oldedge);
-}
-
-template<>
-const BasicBlock *ProfileInfoT<Function,BasicBlock>::
-        GetPath(const BasicBlock *Src, const BasicBlock *Dest,
-                Path &P, unsigned Mode) {
-  const BasicBlock *BB = 0;
-  bool hasFoundPath = false;
-
-  std::queue<const BasicBlock *> BFS;
-  BFS.push(Src);
-
-  while(BFS.size() && !hasFoundPath) {
-    BB = BFS.front();
-    BFS.pop();
-
-    succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
-    if (Succ == End) {
-      P[(const BasicBlock*)0] = BB;
-      if (Mode & GetPathToExit) {
-        hasFoundPath = true;
-        BB = 0;
-      }
-    }
-    for(;Succ != End; ++Succ) {
-      if (P.find(*Succ) != P.end()) continue;
-      Edge e = getEdge(BB,*Succ);
-      if ((Mode & GetPathWithNewEdges) && (getEdgeWeight(e) != MissingValue)) continue;
-      P[*Succ] = BB;
-      BFS.push(*Succ);
-      if ((Mode & GetPathToDest) && *Succ == Dest) {
-        hasFoundPath = true;
-        BB = *Succ;
-        break;
-      }
-      if ((Mode & GetPathToValue) && (getExecutionCount(*Succ) != MissingValue)) {
-        hasFoundPath = true;
-        BB = *Succ;
-        break;
-      }
-    }
-  }
-
-  return BB;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        divertFlow(const Edge &oldedge, const Edge &newedge) {
-  DEBUG(dbgs() << "Diverting " << oldedge << " via " << newedge );
-
-  // First check if the old edge was taken, if not, just delete it...
-  if (getEdgeWeight(oldedge) == 0) {
-    removeEdge(oldedge);
-    return;
-  }
-
-  Path P;
-  P[newedge.first] = 0;
-  P[newedge.second] = newedge.first;
-  const BasicBlock *BB = GetPath(newedge.second,oldedge.second,P,GetPathToExit | GetPathToDest);
-
-  double w = getEdgeWeight (oldedge);
-  DEBUG(dbgs() << ", Weight: " << format("%.20g",w) << "\n");
-  do {
-    const BasicBlock *Parent = P.find(BB)->second;
-    Edge e = getEdge(Parent,BB);
-    double oldw = getEdgeWeight(e);
-    double oldc = getExecutionCount(e.first);
-    setEdgeWeight(e, w+oldw);
-    if (Parent != oldedge.first) {
-      setExecutionCount(e.first, w+oldc);
-    }
-    BB = Parent;
-  } while (BB != newedge.first);
-  removeEdge(oldedge);
-}
-
-/// Replaces all occurrences of RmBB in the ProfilingInfo with DestBB.
-/// This checks all edges of the function the blocks reside in and replaces the
-/// occurrences of RmBB with DestBB.
-template<>
-void ProfileInfoT<Function,BasicBlock>::
-        replaceAllUses(const BasicBlock *RmBB, const BasicBlock *DestBB) {
-  DEBUG(dbgs() << "Replacing " << RmBB->getName()
-               << " with " << DestBB->getName() << "\n");
-  const Function *F = DestBB->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  Edge e, newedge;
-  bool erasededge = false;
-  EdgeWeights::iterator I = J->second.begin(), E = J->second.end();
-  while(I != E) {
-    e = (I++)->first;
-    bool foundedge = false; bool eraseedge = false;
-    if (e.first == RmBB) {
-      if (e.second == DestBB) {
-        eraseedge = true;
-      } else {
-        newedge = getEdge(DestBB, e.second);
-        foundedge = true;
-      }
-    }
-    if (e.second == RmBB) {
-      if (e.first == DestBB) {
-        eraseedge = true;
-      } else {
-        newedge = getEdge(e.first, DestBB);
-        foundedge = true;
-      }
-    }
-    if (foundedge) {
-      replaceEdge(e, newedge);
-    }
-    if (eraseedge) {
-      if (erasededge) {
-        Edge newedge = getEdge(DestBB, DestBB);
-        replaceEdge(e, newedge);
-      } else {
-        removeEdge(e);
-        erasededge = true;
-      }
-    }
-  }
-}
-
-/// Splits an edge in the ProfileInfo and redirects flow over NewBB.
-/// Since its possible that there is more than one edge in the CFG from FristBB
-/// to SecondBB its necessary to redirect the flow proporionally.
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitEdge(const BasicBlock *FirstBB,
-                                                  const BasicBlock *SecondBB,
-                                                  const BasicBlock *NewBB,
-                                                  bool MergeIdenticalEdges) {
-  const Function *F = FirstBB->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  // Generate edges and read current weight.
-  Edge e  = getEdge(FirstBB, SecondBB);
-  Edge n1 = getEdge(FirstBB, NewBB);
-  Edge n2 = getEdge(NewBB, SecondBB);
-  EdgeWeights &ECs = J->second;
-  double w = ECs[e];
-
-  int succ_count = 0;
-  if (!MergeIdenticalEdges) {
-    // First count the edges from FristBB to SecondBB, if there is more than
-    // one, only slice out a proporional part for NewBB.
-    for(succ_const_iterator BBI = succ_begin(FirstBB), BBE = succ_end(FirstBB);
-        BBI != BBE; ++BBI) {
-      if (*BBI == SecondBB) succ_count++;  
-    }
-    // When the NewBB is completely new, increment the count by one so that
-    // the counts are properly distributed.
-    if (getExecutionCount(NewBB) == ProfileInfo::MissingValue) succ_count++;
-  } else {
-    // When the edges are merged anyway, then redirect all flow.
-    succ_count = 1;
-  }
-
-  // We know now how many edges there are from FirstBB to SecondBB, reroute a
-  // proportional part of the edge weight over NewBB.
-  double neww = floor(w / succ_count);
-  ECs[n1] += neww;
-  ECs[n2] += neww;
-  BlockInformation[F][NewBB] += neww;
-  if (succ_count == 1) {
-    ECs.erase(e);
-  } else {
-    ECs[e] -= neww;
-  }
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *Old,
-                                                   const BasicBlock* New) {
-  const Function *F = Old->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  DEBUG(dbgs() << "Splitting " << Old->getName() << " to " << New->getName() << "\n");
-
-  std::set<Edge> Edges;
-  for (EdgeWeights::iterator ewi = J->second.begin(), ewe = J->second.end(); 
-       ewi != ewe; ++ewi) {
-    Edge old = ewi->first;
-    if (old.first == Old) {
-      Edges.insert(old);
-    }
-  }
-  for (std::set<Edge>::iterator EI = Edges.begin(), EE = Edges.end(); 
-       EI != EE; ++EI) {
-    Edge newedge = getEdge(New, EI->second);
-    replaceEdge(*EI, newedge);
-  }
-
-  double w = getExecutionCount(Old);
-  setEdgeWeight(getEdge(Old, New), w);
-  setExecutionCount(New, w);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::splitBlock(const BasicBlock *BB,
-                                                   const BasicBlock* NewBB,
-                                                   BasicBlock *const *Preds,
-                                                   unsigned NumPreds) {
-  const Function *F = BB->getParent();
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(F);
-  if (J == EdgeInformation.end()) return;
-
-  DEBUG(dbgs() << "Splitting " << NumPreds << " Edges from " << BB->getName() 
-               << " to " << NewBB->getName() << "\n");
-
-  // Collect weight that was redirected over NewBB.
-  double newweight = 0;
-  
-  std::set<const BasicBlock *> ProcessedPreds;
-  // For all requestes Predecessors.
-  for (unsigned pred = 0; pred < NumPreds; ++pred) {
-    const BasicBlock * Pred = Preds[pred];
-    if (ProcessedPreds.insert(Pred).second) {
-      // Create edges and read old weight.
-      Edge oldedge = getEdge(Pred, BB);
-      Edge newedge = getEdge(Pred, NewBB);
-
-      // Remember how much weight was redirected.
-      newweight += getEdgeWeight(oldedge);
-    
-      replaceEdge(oldedge,newedge);
-    }
-  }
-
-  Edge newedge = getEdge(NewBB,BB);
-  setEdgeWeight(newedge, newweight);
-  setExecutionCount(NewBB, newweight);
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::transfer(const Function *Old,
-                                                 const Function *New) {
-  DEBUG(dbgs() << "Replacing Function " << Old->getName() << " with "
-               << New->getName() << "\n");
-  std::map<const Function*, EdgeWeights>::iterator J =
-    EdgeInformation.find(Old);
-  if(J != EdgeInformation.end()) {
-    EdgeInformation[New] = J->second;
-  }
-  EdgeInformation.erase(Old);
-  BlockInformation.erase(Old);
-  FunctionInformation.erase(Old);
-}
-
-static double readEdgeOrRemember(ProfileInfo::Edge edge, double w,
-                                 ProfileInfo::Edge &tocalc, unsigned &uncalc) {
-  if (w == ProfileInfo::MissingValue) {
-    tocalc = edge;
-    uncalc++;
-    return 0;
-  } else {
-    return w;
-  }
-}
-
-template<>
-bool ProfileInfoT<Function,BasicBlock>::
-        CalculateMissingEdge(const BasicBlock *BB, Edge &removed,
-                             bool assumeEmptySelf) {
-  Edge edgetocalc;
-  unsigned uncalculated = 0;
-
-  // collect weights of all incoming and outgoing edges, rememer edges that
-  // have no value
-  double incount = 0;
-  SmallSet<const BasicBlock*,8> pred_visited;
-  const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-  if (bbi==bbe) {
-    Edge e = getEdge(0,BB);
-    incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
-  }
-  for (;bbi != bbe; ++bbi) {
-    if (pred_visited.insert(*bbi)) {
-      Edge e = getEdge(*bbi,BB);
-      incount += readEdgeOrRemember(e, getEdgeWeight(e) ,edgetocalc,uncalculated);
-    }
-  }
-
-  double outcount = 0;
-  SmallSet<const BasicBlock*,8> succ_visited;
-  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
-  if (sbbi==sbbe) {
-    Edge e = getEdge(BB,0);
-    if (getEdgeWeight(e) == MissingValue) {
-      double w = getExecutionCount(BB);
-      if (w != MissingValue) {
-        setEdgeWeight(e,w);
-        removed = e;
-      }
-    }
-    outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
-  }
-  for (;sbbi != sbbe; ++sbbi) {
-    if (succ_visited.insert(*sbbi)) {
-      Edge e = getEdge(BB,*sbbi);
-      outcount += readEdgeOrRemember(e, getEdgeWeight(e), edgetocalc, uncalculated);
-    }
-  }
-
-  // if exactly one edge weight was missing, calculate it and remove it from
-  // spanning tree
-  if (uncalculated == 0 ) {
-    return true;
-  } else
-  if (uncalculated == 1) {
-    if (incount < outcount) {
-      EdgeInformation[BB->getParent()][edgetocalc] = outcount-incount;
-    } else {
-      EdgeInformation[BB->getParent()][edgetocalc] = incount-outcount;
-    }
-    DEBUG(dbgs() << "--Calc Edge Counter for " << edgetocalc << ": "
-                 << format("%.20g", getEdgeWeight(edgetocalc)) << "\n");
-    removed = edgetocalc;
-    return true;
-  } else 
-  if (uncalculated == 2 && assumeEmptySelf && edgetocalc.first == edgetocalc.second && incount == outcount) {
-    setEdgeWeight(edgetocalc, incount * 10);
-    removed = edgetocalc;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-static void readEdge(ProfileInfo *PI, ProfileInfo::Edge e, double &calcw, std::set<ProfileInfo::Edge> &misscount) {
-  double w = PI->getEdgeWeight(e);
-  if (w != ProfileInfo::MissingValue) {
-    calcw += w;
-  } else {
-    misscount.insert(e);
-  }
-}
-
-template<>
-bool ProfileInfoT<Function,BasicBlock>::EstimateMissingEdges(const BasicBlock *BB) {
-  double inWeight = 0;
-  std::set<Edge> inMissing;
-  std::set<const BasicBlock*> ProcessedPreds;
-  const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-  if (bbi == bbe) {
-    readEdge(this,getEdge(0,BB),inWeight,inMissing);
-  }
-  for( ; bbi != bbe; ++bbi ) {
-    if (ProcessedPreds.insert(*bbi).second) {
-      readEdge(this,getEdge(*bbi,BB),inWeight,inMissing);
-    }
-  }
-
-  double outWeight = 0;
-  std::set<Edge> outMissing;
-  std::set<const BasicBlock*> ProcessedSuccs;
-  succ_const_iterator sbbi = succ_begin(BB), sbbe = succ_end(BB);
-  if (sbbi == sbbe)
-    readEdge(this,getEdge(BB,0),outWeight,outMissing);
-  for ( ; sbbi != sbbe; ++sbbi ) {
-    if (ProcessedSuccs.insert(*sbbi).second) {
-      readEdge(this,getEdge(BB,*sbbi),outWeight,outMissing);
-    }
-  }
-
-  double share;
-  std::set<Edge>::iterator ei,ee;
-  if (inMissing.size() == 0 && outMissing.size() > 0) {
-    ei = outMissing.begin();
-    ee = outMissing.end();
-    share = inWeight/outMissing.size();
-    setExecutionCount(BB,inWeight);
-  } else
-  if (inMissing.size() > 0 && outMissing.size() == 0 && outWeight == 0) {
-    ei = inMissing.begin();
-    ee = inMissing.end();
-    share = 0;
-    setExecutionCount(BB,0);
-  } else
-  if (inMissing.size() == 0 && outMissing.size() == 0) {
-    setExecutionCount(BB,outWeight);
-    return true;
-  } else {
-    return false;
-  }
-  for ( ; ei != ee; ++ei ) {
-    setEdgeWeight(*ei,share);
-  }
-  return true;
-}
-
-template<>
-void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) {
-//  if (getExecutionCount(&(F->getEntryBlock())) == 0) {
-//    for (Function::const_iterator FI = F->begin(), FE = F->end();
-//         FI != FE; ++FI) {
-//      const BasicBlock* BB = &(*FI);
-//      {
-//        const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-//        if (NBB == End) {
-//          setEdgeWeight(getEdge(0,BB),0);
-//        }
-//        for(;NBB != End; ++NBB) {
-//          setEdgeWeight(getEdge(*NBB,BB),0);
-//        }
-//      }
-//      {
-//        succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-//        if (NBB == End) {
-//          setEdgeWeight(getEdge(0,BB),0);
-//        }
-//        for(;NBB != End; ++NBB) {
-//          setEdgeWeight(getEdge(*NBB,BB),0);
-//        }
-//      }
-//    }
-//    return;
-//  }
-  // The set of BasicBlocks that are still unvisited.
-  std::set<const BasicBlock*> Unvisited;
-
-  // The set of return edges (Edges with no successors).
-  std::set<Edge> ReturnEdges;
-  double ReturnWeight = 0;
-  
-  // First iterate over the whole function and collect:
-  // 1) The blocks in this function in the Unvisited set.
-  // 2) The return edges in the ReturnEdges set.
-  // 3) The flow that is leaving the function already via return edges.
-
-  // Data structure for searching the function.
-  std::queue<const BasicBlock *> BFS;
-  const BasicBlock *BB = &(F->getEntryBlock());
-  BFS.push(BB);
-  Unvisited.insert(BB);
-
-  while (BFS.size()) {
-    BB = BFS.front(); BFS.pop();
-    succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-    if (NBB == End) {
-      Edge e = getEdge(BB,0);
-      double w = getEdgeWeight(e);
-      if (w == MissingValue) {
-        // If the return edge has no value, try to read value from block.
-        double bw = getExecutionCount(BB);
-        if (bw != MissingValue) {
-          setEdgeWeight(e,bw);
-          ReturnWeight += bw;
-        } else {
-          // If both return edge and block provide no value, collect edge.
-          ReturnEdges.insert(e);
-        }
-      } else {
-        // If the return edge has a proper value, collect it.
-        ReturnWeight += w;
-      }
-    }
-    for (;NBB != End; ++NBB) {
-      if (Unvisited.insert(*NBB).second) {
-        BFS.push(*NBB);
-      }
-    }
-  }
-
-  while (Unvisited.size() > 0) {
-    unsigned oldUnvisitedCount = Unvisited.size();
-    bool FoundPath = false;
-
-    // If there is only one edge left, calculate it.
-    if (ReturnEdges.size() == 1) {
-      ReturnWeight = getExecutionCount(&(F->getEntryBlock())) - ReturnWeight;
-
-      Edge e = *ReturnEdges.begin();
-      setEdgeWeight(e,ReturnWeight);
-      setExecutionCount(e.first,ReturnWeight);
-
-      Unvisited.erase(e.first);
-      ReturnEdges.erase(e);
-      continue;
-    }
-
-    // Calculate all blocks where only one edge is missing, this may also
-    // resolve furhter return edges.
-    std::set<const BasicBlock *>::iterator FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE) {
-      const BasicBlock *BB = *FI; ++FI;
-      Edge e;
-      if(CalculateMissingEdge(BB,e,true)) {
-        if (BlockInformation[F].find(BB) == BlockInformation[F].end()) {
-          setExecutionCount(BB,getExecutionCount(BB));
-        }
-        Unvisited.erase(BB);
-        if (e.first != 0 && e.second == 0) {
-          ReturnEdges.erase(e);
-          ReturnWeight += getEdgeWeight(e);
-        }
-      }
-    }
-    if (oldUnvisitedCount > Unvisited.size()) continue;
-
-    // Estimate edge weights by dividing the flow proportionally.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE) {
-      const BasicBlock *BB = *FI; ++FI;
-      const BasicBlock *Dest = 0;
-      bool AllEdgesHaveSameReturn = true;
-      // Check each Successor, these must all end up in the same or an empty
-      // return block otherwise its dangerous to do an estimation on them.
-      for (succ_const_iterator Succ = succ_begin(BB), End = succ_end(BB);
-           Succ != End; ++Succ) {
-        Path P;
-        GetPath(*Succ, 0, P, GetPathToExit);
-        if (Dest && Dest != P[(const BasicBlock*)0]) {
-          AllEdgesHaveSameReturn = false;
-        }
-        Dest = P[(const BasicBlock*)0];
-      }
-      if (AllEdgesHaveSameReturn) {
-        if(EstimateMissingEdges(BB)) {
-          Unvisited.erase(BB);
-          break;
-        }
-      }
-    }
-    if (oldUnvisitedCount > Unvisited.size()) continue;
-
-    // Check if there is a path to an block that has a known value and redirect
-    // flow accordingly.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      // Fetch path.
-      const BasicBlock *BB = *FI; ++FI;
-      Path P;
-      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToValue);
-
-      // Calculate incoming flow.
-      double iw = 0; unsigned inmissing = 0; unsigned incount = 0; unsigned invalid = 0;
-      std::set<const BasicBlock *> Processed;
-      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-           NBB != End; ++NBB) {
-        if (Processed.insert(*NBB).second) {
-          Edge e = getEdge(*NBB, BB);
-          double ew = getEdgeWeight(e);
-          if (ew != MissingValue) {
-            iw += ew;
-            invalid++;
-          } else {
-            // If the path contains the successor, this means its a backedge,
-            // do not count as missing.
-            if (P.find(*NBB) == P.end())
-              inmissing++;
-          }
-          incount++;
-        }
-      }
-      if (inmissing == incount) continue;
-      if (invalid == 0) continue;
-
-      // Subtract (already) outgoing flow.
-      Processed.clear();
-      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-           NBB != End; ++NBB) {
-        if (Processed.insert(*NBB).second) {
-          Edge e = getEdge(BB, *NBB);
-          double ew = getEdgeWeight(e);
-          if (ew != MissingValue) {
-            iw -= ew;
-          }
-        }
-      }
-      if (iw < 0) continue;
-
-      // Check the receiving end of the path if it can handle the flow.
-      double ow = getExecutionCount(Dest);
-      Processed.clear();
-      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-           NBB != End; ++NBB) {
-        if (Processed.insert(*NBB).second) {
-          Edge e = getEdge(BB, *NBB);
-          double ew = getEdgeWeight(e);
-          if (ew != MissingValue) {
-            ow -= ew;
-          }
-        }
-      }
-      if (ow < 0) continue;
-
-      // Determine how much flow shall be used.
-      double ew = getEdgeWeight(getEdge(P[Dest],Dest));
-      if (ew != MissingValue) {
-        ew = ew<ow?ew:ow;
-        ew = ew<iw?ew:iw;
-      } else {
-        if (inmissing == 0)
-          ew = iw;
-      }
-
-      // Create flow.
-      if (ew != MissingValue) {
-        do {
-          Edge e = getEdge(P[Dest],Dest);
-          if (getEdgeWeight(e) == MissingValue) {
-            setEdgeWeight(e,ew);
-            FoundPath = true;
-          }
-          Dest = P[Dest];
-        } while (Dest != BB);
-      }
-    }
-    if (FoundPath) continue;
-
-    // Calculate a block with self loop.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-      bool SelfEdgeFound = false;
-      for (succ_const_iterator NBB = succ_begin(BB), End = succ_end(BB);
-           NBB != End; ++NBB) {
-        if (*NBB == BB) {
-          SelfEdgeFound = true;
-          break;
-        }
-      }
-      if (SelfEdgeFound) {
-        Edge e = getEdge(BB,BB);
-        if (getEdgeWeight(e) == MissingValue) {
-          double iw = 0;
-          std::set<const BasicBlock *> Processed;
-          for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-               NBB != End; ++NBB) {
-            if (Processed.insert(*NBB).second) {
-              Edge e = getEdge(*NBB, BB);
-              double ew = getEdgeWeight(e);
-              if (ew != MissingValue) {
-                iw += ew;
-              }
-            }
-          }
-          setEdgeWeight(e,iw * 10);
-          FoundPath = true;
-        }
-      }
-    }
-    if (FoundPath) continue;
-
-    // Determine backedges, set them to zero.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-      const BasicBlock *Dest = 0;
-      Path P;
-      bool BackEdgeFound = false;
-      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-           NBB != End; ++NBB) {
-        Dest = GetPath(BB, *NBB, P, GetPathToDest | GetPathWithNewEdges);
-        if (Dest == *NBB) {
-          BackEdgeFound = true;
-          break;
-        }
-      }
-      if (BackEdgeFound) {
-        Edge e = getEdge(Dest,BB);
-        double w = getEdgeWeight(e);
-        if (w == MissingValue) {
-          setEdgeWeight(e,0);
-          FoundPath = true;
-        }
-        do {
-          Edge e = getEdge(P[Dest], Dest);
-          double w = getEdgeWeight(e);
-          if (w == MissingValue) {
-            setEdgeWeight(e,0);
-            FoundPath = true;
-          }
-          Dest = P[Dest];
-        } while (Dest != BB);
-      }
-    }
-    if (FoundPath) continue;
-
-    // Channel flow to return block.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-
-      Path P;
-      const BasicBlock *Dest = GetPath(BB, 0, P, GetPathToExit | GetPathWithNewEdges);
-      Dest = P[(const BasicBlock*)0];
-      if (!Dest) continue;
-
-      if (getEdgeWeight(getEdge(Dest,0)) == MissingValue) {
-        // Calculate incoming flow.
-        double iw = 0;
-        std::set<const BasicBlock *> Processed;
-        for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-             NBB != End; ++NBB) {
-          if (Processed.insert(*NBB).second) {
-            Edge e = getEdge(*NBB, BB);
-            double ew = getEdgeWeight(e);
-            if (ew != MissingValue) {
-              iw += ew;
-            }
-          }
-        }
-        do {
-          Edge e = getEdge(P[Dest], Dest);
-          double w = getEdgeWeight(e);
-          if (w == MissingValue) {
-            setEdgeWeight(e,iw);
-            FoundPath = true;
-          } else {
-            assert(0 && "Edge should not have value already!");
-          }
-          Dest = P[Dest];
-        } while (Dest != BB);
-      }
-    }
-    if (FoundPath) continue;
-
-    // Speculatively set edges to zero.
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE && !FoundPath) {
-      const BasicBlock *BB = *FI; ++FI;
-
-      for (const_pred_iterator NBB = pred_begin(BB), End = pred_end(BB);
-           NBB != End; ++NBB) {
-        Edge e = getEdge(*NBB,BB);
-        double w = getEdgeWeight(e);
-        if (w == MissingValue) {
-          setEdgeWeight(e,0);
-          FoundPath = true;
-          break;
-        }
-      }
-    }
-    if (FoundPath) continue;
-
-    errs() << "{";
-    FI = Unvisited.begin(), FE = Unvisited.end();
-    while(FI != FE) {
-      const BasicBlock *BB = *FI; ++FI;
-      dbgs() << BB->getName();
-      if (FI != FE)
-        dbgs() << ",";
-    }
-    errs() << "}";
-
-    errs() << "ASSERT: could not repair function";
-    assert(0 && "could not repair function");
-  }
-
-  EdgeWeights J = EdgeInformation[F];
-  for (EdgeWeights::iterator EI = J.begin(), EE = J.end(); EI != EE; ++EI) {
-    Edge e = EI->first;
-
-    bool SuccFound = false;
-    if (e.first != 0) {
-      succ_const_iterator NBB = succ_begin(e.first), End = succ_end(e.first);
-      if (NBB == End) {
-        if (0 == e.second) {
-          SuccFound = true;
-        }
-      }
-      for (;NBB != End; ++NBB) {
-        if (*NBB == e.second) {
-          SuccFound = true;
-          break;
-        }
-      }
-      if (!SuccFound) {
-        removeEdge(e);
-      }
-    }
-  }
-}
-
-raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF) {
-  return O << MF->getFunction()->getName() << "(MF)";
-}
-
-raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB) {
-  return O << MBB->getBasicBlock()->getName() << "(MB)";
-}
-
-raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E) {
-  O << "(";
-
-  if (E.first)
-    O << E.first;
-  else
-    O << "0";
-
-  O << ",";
-
-  if (E.second)
-    O << E.second;
-  else
-    O << "0";
-
-  return O << ")";
-}
-
-} // namespace llvm
-
-//===----------------------------------------------------------------------===//
-//  NoProfile ProfileInfo implementation
-//
-
-namespace {
-  struct NoProfileInfo : public ImmutablePass, public ProfileInfo {
-    static char ID; // Class identification, replacement for typeinfo
-    NoProfileInfo() : ImmutablePass(ID) {
-      initializeNoProfileInfoPass(*PassRegistry::getPassRegistry());
-    }
-    
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &ProfileInfo::ID)
-        return (ProfileInfo*)this;
-      return this;
-    }
-    
-    virtual const char *getPassName() const {
-      return "NoProfileInfo";
-    }
-  };
-}  // End of anonymous namespace
-
-char NoProfileInfo::ID = 0;
-// Register this pass...
-INITIALIZE_AG_PASS(NoProfileInfo, ProfileInfo, "no-profile",
-                   "No Profile Information", false, true, true)
-
-ImmutablePass *llvm::createNoProfileInfoPass() { return new NoProfileInfo(); }
diff --git a/lib/Analysis/ProfileInfoLoader.cpp b/lib/Analysis/ProfileInfoLoader.cpp
deleted file mode 100644
index f1f3e940..0000000
--- a/lib/Analysis/ProfileInfoLoader.cpp
+++ /dev/null
@@ -1,155 +0,0 @@
-//===- ProfileInfoLoad.cpp - Load profile information from disk -----------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// The ProfileInfoLoader class is used to load and represent profiling
-// information read in from the dump file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/Analysis/ProfileInfoTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdio>
-#include <cstdlib>
-using namespace llvm;
-
-// ByteSwap - Byteswap 'Var' if 'Really' is true.
-//
-static inline unsigned ByteSwap(unsigned Var, bool Really) {
-  if (!Really) return Var;
-  return ((Var & (255U<< 0U)) << 24U) |
-         ((Var & (255U<< 8U)) <<  8U) |
-         ((Var & (255U<<16U)) >>  8U) |
-         ((Var & (255U<<24U)) >> 24U);
-}
-
-static unsigned AddCounts(unsigned A, unsigned B) {
-  // If either value is undefined, use the other.
-  if (A == ProfileInfoLoader::Uncounted) return B;
-  if (B == ProfileInfoLoader::Uncounted) return A;
-  return A + B;
-}
-
-static void ReadProfilingBlock(const char *ToolName, FILE *F,
-                               bool ShouldByteSwap,
-                               std::vector<unsigned> &Data) {
-  // Read the number of entries...
-  unsigned NumEntries;
-  if (fread(&NumEntries, sizeof(unsigned), 1, F) != 1) {
-    errs() << ToolName << ": data packet truncated!\n";
-    perror(0);
-    exit(1);
-  }
-  NumEntries = ByteSwap(NumEntries, ShouldByteSwap);
-
-  // Read the counts...
-  std::vector<unsigned> TempSpace(NumEntries);
-
-  // Read in the block of data...
-  if (fread(&TempSpace[0], sizeof(unsigned)*NumEntries, 1, F) != 1) {
-    errs() << ToolName << ": data packet truncated!\n";
-    perror(0);
-    exit(1);
-  }
-
-  // Make sure we have enough space... The space is initialised to -1 to
-  // facitiltate the loading of missing values for OptimalEdgeProfiling.
-  if (Data.size() < NumEntries)
-    Data.resize(NumEntries, ProfileInfoLoader::Uncounted);
-
-  // Accumulate the data we just read into the data.
-  if (!ShouldByteSwap) {
-    for (unsigned i = 0; i != NumEntries; ++i) {
-      Data[i] = AddCounts(TempSpace[i], Data[i]);
-    }
-  } else {
-    for (unsigned i = 0; i != NumEntries; ++i) {
-      Data[i] = AddCounts(ByteSwap(TempSpace[i], true), Data[i]);
-    }
-  }
-}
-
-const unsigned ProfileInfoLoader::Uncounted = ~0U;
-
-// ProfileInfoLoader ctor - Read the specified profiling data file, exiting the
-// program if the file is invalid or broken.
-//
-ProfileInfoLoader::ProfileInfoLoader(const char *ToolName,
-                                     const std::string &Filename)
-  : Filename(Filename) {
-  FILE *F = fopen(Filename.c_str(), "rb");
-  if (F == 0) {
-    errs() << ToolName << ": Error opening '" << Filename << "': ";
-    perror(0);
-    exit(1);
-  }
-
-  // Keep reading packets until we run out of them.
-  unsigned PacketType;
-  while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
-    // If the low eight bits of the packet are zero, we must be dealing with an
-    // endianness mismatch.  Byteswap all words read from the profiling
-    // information.
-    bool ShouldByteSwap = (char)PacketType == 0;
-    PacketType = ByteSwap(PacketType, ShouldByteSwap);
-
-    switch (PacketType) {
-    case ArgumentInfo: {
-      unsigned ArgLength;
-      if (fread(&ArgLength, sizeof(unsigned), 1, F) != 1) {
-        errs() << ToolName << ": arguments packet truncated!\n";
-        perror(0);
-        exit(1);
-      }
-      ArgLength = ByteSwap(ArgLength, ShouldByteSwap);
-
-      // Read in the arguments...
-      std::vector<char> Chars(ArgLength+4);
-
-      if (ArgLength)
-        if (fread(&Chars[0], (ArgLength+3) & ~3, 1, F) != 1) {
-          errs() << ToolName << ": arguments packet truncated!\n";
-          perror(0);
-          exit(1);
-        }
-      CommandLines.push_back(std::string(&Chars[0], &Chars[ArgLength]));
-      break;
-    }
-
-    case FunctionInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, FunctionCounts);
-      break;
-
-    case BlockInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, BlockCounts);
-      break;
-
-    case EdgeInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
-      break;
-
-    case OptEdgeInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, OptimalEdgeCounts);
-      break;
-
-    case BBTraceInfo:
-      ReadProfilingBlock(ToolName, F, ShouldByteSwap, BBTrace);
-      break;
-
-    default:
-      errs() << ToolName << ": Unknown packet type #" << PacketType << "!\n";
-      exit(1);
-    }
-  }
-
-  fclose(F);
-}
-
diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp
deleted file mode 100644
index 346f8d6..0000000
--- a/lib/Analysis/ProfileInfoLoaderPass.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-//===- ProfileInfoLoaderPass.cpp - LLVM Pass to load profile info ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a concrete implementation of profiling information that
-// loads the information from a profile dump file.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-loader"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumEdgesRead, "The # of edges read.");
-
-static cl::opt<std::string>
-ProfileInfoFilename("profile-info-file", cl::init("llvmprof.out"),
-                    cl::value_desc("filename"),
-                    cl::desc("Profile file loaded by -profile-loader"));
-
-namespace {
-  class LoaderPass : public ModulePass, public ProfileInfo {
-    std::string Filename;
-    std::set<Edge> SpanningTree;
-    std::set<const BasicBlock*> BBisUnvisited;
-    unsigned ReadCount;
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-    explicit LoaderPass(const std::string &filename = "")
-      : ModulePass(ID), Filename(filename) {
-      initializeLoaderPassPass(*PassRegistry::getPassRegistry());
-      if (filename.empty()) Filename = ProfileInfoFilename;
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-    }
-
-    virtual const char *getPassName() const {
-      return "Profiling information loader";
-    }
-
-    // recurseBasicBlock() - Calculates the edge weights for as much basic
-    // blocks as possbile.
-    virtual void recurseBasicBlock(const BasicBlock *BB);
-    virtual void readEdgeOrRemember(Edge, Edge&, unsigned &, double &);
-    virtual void readEdge(ProfileInfo::Edge, std::vector<unsigned>&);
-
-    /// getAdjustedAnalysisPointer - This method is used when a pass implements
-    /// an analysis interface through multiple inheritance.  If needed, it
-    /// should override this to adjust the this pointer as needed for the
-    /// specified pass info.
-    virtual void *getAdjustedAnalysisPointer(AnalysisID PI) {
-      if (PI == &ProfileInfo::ID)
-        return (ProfileInfo*)this;
-      return this;
-    }
-    
-    /// run - Load the profile information from the specified file.
-    virtual bool runOnModule(Module &M);
-  };
-}  // End of anonymous namespace
-
-char LoaderPass::ID = 0;
-INITIALIZE_AG_PASS(LoaderPass, ProfileInfo, "profile-loader",
-              "Load profile information from llvmprof.out", false, true, false)
-
-char &llvm::ProfileLoaderPassID = LoaderPass::ID;
-
-ModulePass *llvm::createProfileLoaderPass() { return new LoaderPass(); }
-
-/// createProfileLoaderPass - This function returns a Pass that loads the
-/// profiling information for the module from the specified filename, making it
-/// available to the optimizers.
-Pass *llvm::createProfileLoaderPass(const std::string &Filename) {
-  return new LoaderPass(Filename);
-}
-
-void LoaderPass::readEdgeOrRemember(Edge edge, Edge &tocalc, 
-                                    unsigned &uncalc, double &count) {
-  double w;
-  if ((w = getEdgeWeight(edge)) == MissingValue) {
-    tocalc = edge;
-    uncalc++;
-  } else {
-    count+=w;
-  }
-}
-
-// recurseBasicBlock - Visits all neighbours of a block and then tries to
-// calculate the missing edge values.
-void LoaderPass::recurseBasicBlock(const BasicBlock *BB) {
-
-  // break recursion if already visited
-  if (BBisUnvisited.find(BB) == BBisUnvisited.end()) return;
-  BBisUnvisited.erase(BB);
-  if (!BB) return;
-
-  for (succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-       bbi != bbe; ++bbi) {
-    recurseBasicBlock(*bbi);
-  }
-  for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-       bbi != bbe; ++bbi) {
-    recurseBasicBlock(*bbi);
-  }
-
-  Edge tocalc;
-  if (CalculateMissingEdge(BB, tocalc)) {
-    SpanningTree.erase(tocalc);
-  }
-}
-
-void LoaderPass::readEdge(ProfileInfo::Edge e,
-                          std::vector<unsigned> &ECs) {
-  if (ReadCount < ECs.size()) {
-    double weight = ECs[ReadCount++];
-    if (weight != ProfileInfoLoader::Uncounted) {
-      // Here the data realm changes from the unsigned of the file to the
-      // double of the ProfileInfo. This conversion is save because we know
-      // that everything thats representable in unsinged is also representable
-      // in double.
-      EdgeInformation[getFunction(e)][e] += (double)weight;
-
-      DEBUG(dbgs() << "--Read Edge Counter for " << e
-                   << " (# "<< (ReadCount-1) << "): "
-                   << (unsigned)getEdgeWeight(e) << "\n");
-    } else {
-      // This happens only if reading optimal profiling information, not when
-      // reading regular profiling information.
-      SpanningTree.insert(e);
-    }
-  }
-}
-
-bool LoaderPass::runOnModule(Module &M) {
-  ProfileInfoLoader PIL("profile-loader", Filename);
-
-  EdgeInformation.clear();
-  std::vector<unsigned> Counters = PIL.getRawEdgeCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-      readEdge(getEdge(0,&F->getEntryBlock()), Counters);
-      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-        TerminatorInst *TI = BB->getTerminator();
-        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-          readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
-        }
-      }
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-    NumEdgesRead = ReadCount;
-  }
-
-  Counters = PIL.getRawOptimalEdgeCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-      readEdge(getEdge(0,&F->getEntryBlock()), Counters);
-      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-        TerminatorInst *TI = BB->getTerminator();
-        if (TI->getNumSuccessors() == 0) {
-          readEdge(getEdge(BB,0), Counters);
-        }
-        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-          readEdge(getEdge(BB,TI->getSuccessor(s)), Counters);
-        }
-      }
-      while (SpanningTree.size() > 0) {
-
-        unsigned size = SpanningTree.size();
-
-        BBisUnvisited.clear();
-        for (std::set<Edge>::iterator ei = SpanningTree.begin(),
-             ee = SpanningTree.end(); ei != ee; ++ei) {
-          BBisUnvisited.insert(ei->first);
-          BBisUnvisited.insert(ei->second);
-        }
-        while (BBisUnvisited.size() > 0) {
-          recurseBasicBlock(*BBisUnvisited.begin());
-        }
-
-        if (SpanningTree.size() == size) {
-          DEBUG(dbgs()<<"{");
-          for (std::set<Edge>::iterator ei = SpanningTree.begin(),
-               ee = SpanningTree.end(); ei != ee; ++ei) {
-            DEBUG(dbgs()<< *ei <<",");
-          }
-          assert(0 && "No edge calculated!");
-        }
-
-      }
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-    NumEdgesRead = ReadCount;
-  }
-
-  BlockInformation.clear();
-  Counters = PIL.getRawBlockCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
-        if (ReadCount < Counters.size())
-          // Here the data realm changes from the unsigned of the file to the
-          // double of the ProfileInfo. This conversion is save because we know
-          // that everything thats representable in unsinged is also
-          // representable in double.
-          BlockInformation[F][BB] = (double)Counters[ReadCount++];
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-  }
-
-  FunctionInformation.clear();
-  Counters = PIL.getRawFunctionCounts();
-  if (Counters.size() > 0) {
-    ReadCount = 0;
-    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-      if (F->isDeclaration()) continue;
-      if (ReadCount < Counters.size())
-        // Here the data realm changes from the unsigned of the file to the
-        // double of the ProfileInfo. This conversion is save because we know
-        // that everything thats representable in unsinged is also
-        // representable in double.
-        FunctionInformation[F] = (double)Counters[ReadCount++];
-    }
-    if (ReadCount != Counters.size()) {
-      errs() << "WARNING: profile information is inconsistent with "
-             << "the current program!\n";
-    }
-  }
-
-  return false;
-}
diff --git a/lib/Analysis/ProfileVerifierPass.cpp b/lib/Analysis/ProfileVerifierPass.cpp
deleted file mode 100644
index c8896de..0000000
--- a/lib/Analysis/ProfileVerifierPass.cpp
+++ /dev/null
@@ -1,383 +0,0 @@
-//===- ProfileVerifierPass.cpp - LLVM Pass to estimate profile info -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a pass that checks profiling information for 
-// plausibility.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "profile-verifier"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/InstIterator.h"
-#include "llvm/Support/raw_ostream.h"
-#include <set>
-using namespace llvm;
-
-static cl::opt<bool,false>
-ProfileVerifierDisableAssertions("profile-verifier-noassert",
-     cl::desc("Disable assertions"));
-
-namespace {
-  template<class FType, class BType>
-  class ProfileVerifierPassT : public FunctionPass {
-
-    struct DetailedBlockInfo {
-      const BType *BB;
-      double      BBWeight;
-      double      inWeight;
-      int         inCount;
-      double      outWeight;
-      int         outCount;
-    };
-
-    ProfileInfoT<FType, BType> *PI;
-    std::set<const BType*> BBisVisited;
-    std::set<const FType*>   FisVisited;
-    bool DisableAssertions;
-
-    // When debugging is enabled, the verifier prints a whole slew of debug
-    // information, otherwise its just the assert. These are all the helper
-    // functions.
-    bool PrintedDebugTree;
-    std::set<const BType*> BBisPrinted;
-    void debugEntry(DetailedBlockInfo*);
-    void printDebugInfo(const BType *BB);
-
-  public:
-    static char ID; // Class identification, replacement for typeinfo
-
-    explicit ProfileVerifierPassT () : FunctionPass(ID) {
-      initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
-      DisableAssertions = ProfileVerifierDisableAssertions;
-    }
-    explicit ProfileVerifierPassT (bool da) : FunctionPass(ID), 
-                                              DisableAssertions(da) {
-      initializeProfileVerifierPassPass(*PassRegistry::getPassRegistry());
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesAll();
-      AU.addRequired<ProfileInfoT<FType, BType> >();
-    }
-
-    const char *getPassName() const {
-      return "Profiling information verifier";
-    }
-
-    /// run - Verify the profile information.
-    bool runOnFunction(FType &F);
-    void recurseBasicBlock(const BType*);
-
-    bool   exitReachable(const FType*);
-    double ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge);
-    void   CheckValue(bool, const char*, DetailedBlockInfo*);
-  };
-
-  typedef ProfileVerifierPassT<Function, BasicBlock> ProfileVerifierPass;
-
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::printDebugInfo(const BType *BB) {
-
-    if (BBisPrinted.find(BB) != BBisPrinted.end()) return;
-
-    double BBWeight = PI->getExecutionCount(BB);
-    if (BBWeight == ProfileInfoT<FType, BType>::MissingValue) { BBWeight = 0; }
-    double inWeight = 0;
-    int inCount = 0;
-    std::set<const BType*> ProcessedPreds;
-    for (const_pred_iterator bbi = pred_begin(BB), bbe = pred_end(BB);
-         bbi != bbe; ++bbi ) {
-      if (ProcessedPreds.insert(*bbi).second) {
-        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(*bbi,BB);
-        double EdgeWeight = PI->getEdgeWeight(E);
-        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
-        dbgs() << "calculated in-edge " << E << ": " 
-               << format("%20.20g",EdgeWeight) << "\n";
-        inWeight += EdgeWeight;
-        inCount++;
-      }
-    }
-    double outWeight = 0;
-    int outCount = 0;
-    std::set<const BType*> ProcessedSuccs;
-    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-          bbi != bbe; ++bbi ) {
-      if (ProcessedSuccs.insert(*bbi).second) {
-        typename ProfileInfoT<FType, BType>::Edge E = PI->getEdge(BB,*bbi);
-        double EdgeWeight = PI->getEdgeWeight(E);
-        if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { EdgeWeight = 0; }
-        dbgs() << "calculated out-edge " << E << ": " 
-               << format("%20.20g",EdgeWeight) << "\n";
-        outWeight += EdgeWeight;
-        outCount++;
-      }
-    }
-    dbgs() << "Block " << BB->getName()                   << " in "
-           << BB->getParent()->getName()                  << ":"
-           << "BBWeight="  << format("%20.20g",BBWeight)  << ","
-           << "inWeight="  << format("%20.20g",inWeight)  << ","
-           << "inCount="   << inCount                     << ","
-           << "outWeight=" << format("%20.20g",outWeight) << ","
-           << "outCount"   << outCount                    << "\n";
-
-    // mark as visited and recurse into subnodes
-    BBisPrinted.insert(BB);
-    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
-          bbi != bbe; ++bbi ) {
-      printDebugInfo(*bbi);
-    }
-  }
-
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::debugEntry (DetailedBlockInfo *DI) {
-    dbgs() << "TROUBLE: Block " << DI->BB->getName()          << " in "
-           << DI->BB->getParent()->getName()                  << ":"
-           << "BBWeight="  << format("%20.20g",DI->BBWeight)  << ","
-           << "inWeight="  << format("%20.20g",DI->inWeight)  << ","
-           << "inCount="   << DI->inCount                     << ","
-           << "outWeight=" << format("%20.20g",DI->outWeight) << ","
-           << "outCount="  << DI->outCount                    << "\n";
-    if (!PrintedDebugTree) {
-      PrintedDebugTree = true;
-      printDebugInfo(&(DI->BB->getParent()->getEntryBlock()));
-    }
-  }
-
-  // This compares A and B for equality.
-  static bool Equals(double A, double B) {
-    return A == B;
-  }
-
-  // This checks if the function "exit" is reachable from an given function
-  // via calls, this is necessary to check if a profile is valid despite the
-  // counts not fitting exactly.
-  template<class FType, class BType>
-  bool ProfileVerifierPassT<FType, BType>::exitReachable(const FType *F) {
-    if (!F) return false;
-
-    if (FisVisited.count(F)) return false;
-
-    FType *Exit = F->getParent()->getFunction("exit");
-    if (Exit == F) {
-      return true;
-    }
-
-    FisVisited.insert(F);
-    bool exits = false;
-    for (const_inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
-      if (const CallInst *CI = dyn_cast<CallInst>(&*I)) {
-        FType *F = CI->getCalledFunction();
-        if (F) {
-          exits |= exitReachable(F);
-        } else {
-          // This is a call to a pointer, all bets are off...
-          exits = true;
-        }
-        if (exits) break;
-      }
-    }
-    return exits;
-  }
-
-  #define ASSERTMESSAGE(M) \
-    { dbgs() << "ASSERT:" << (M) << "\n"; \
-      if (!DisableAssertions) assert(0 && (M)); }
-
-  template<class FType, class BType>
-  double ProfileVerifierPassT<FType, BType>::ReadOrAssert(typename ProfileInfoT<FType, BType>::Edge E) {
-    double EdgeWeight = PI->getEdgeWeight(E);
-    if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) {
-      dbgs() << "Edge " << E << " in Function " 
-             << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
-      ASSERTMESSAGE("Edge has missing value");
-      return 0;
-    } else {
-      if (EdgeWeight < 0) {
-        dbgs() << "Edge " << E << " in Function " 
-               << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": ";
-        ASSERTMESSAGE("Edge has negative value");
-      }
-      return EdgeWeight;
-    }
-  }
-
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::CheckValue(bool Error, 
-                                                      const char *Message,
-                                                      DetailedBlockInfo *DI) {
-    if (Error) {
-      DEBUG(debugEntry(DI));
-      dbgs() << "Block " << DI->BB->getName() << " in Function "
-             << DI->BB->getParent()->getName() << ": ";
-      ASSERTMESSAGE(Message);
-    }
-    return;
-  }
-
-  // This calculates the Information for a block and then recurses into the
-  // successors.
-  template<class FType, class BType>
-  void ProfileVerifierPassT<FType, BType>::recurseBasicBlock(const BType *BB) {
-
-    // Break the recursion by remembering all visited blocks.
-    if (BBisVisited.find(BB) != BBisVisited.end()) return;
-
-    // Use a data structure to store all the information, this can then be handed
-    // to debug printers.
-    DetailedBlockInfo DI;
-    DI.BB = BB;
-    DI.outCount = DI.inCount = 0;
-    DI.inWeight = DI.outWeight = 0;
-
-    // Read predecessors.
-    std::set<const BType*> ProcessedPreds;
-    const_pred_iterator bpi = pred_begin(BB), bpe = pred_end(BB);
-    // If there are none, check for (0,BB) edge.
-    if (bpi == bpe) {
-      DI.inWeight += ReadOrAssert(PI->getEdge(0,BB));
-      DI.inCount++;
-    }
-    for (;bpi != bpe; ++bpi) {
-      if (ProcessedPreds.insert(*bpi).second) {
-        DI.inWeight += ReadOrAssert(PI->getEdge(*bpi,BB));
-        DI.inCount++;
-      }
-    }
-
-    // Read successors.
-    std::set<const BType*> ProcessedSuccs;
-    succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB);
-    // If there is an (0,BB) edge, consider it too. (This is done not only when
-    // there are no successors, but every time; not every function contains
-    // return blocks with no successors (think loop latch as return block)).
-    double w = PI->getEdgeWeight(PI->getEdge(BB,0));
-    if (w != ProfileInfoT<FType, BType>::MissingValue) {
-      DI.outWeight += w;
-      DI.outCount++;
-    }
-    for (;bbi != bbe; ++bbi) {
-      if (ProcessedSuccs.insert(*bbi).second) {
-        DI.outWeight += ReadOrAssert(PI->getEdge(BB,*bbi));
-        DI.outCount++;
-      }
-    }
-
-    // Read block weight.
-    DI.BBWeight = PI->getExecutionCount(BB);
-    CheckValue(DI.BBWeight == ProfileInfoT<FType, BType>::MissingValue,
-               "BasicBlock has missing value", &DI);
-    CheckValue(DI.BBWeight < 0,
-               "BasicBlock has negative value", &DI);
-
-    // Check if this block is a setjmp target.
-    bool isSetJmpTarget = false;
-    if (DI.outWeight > DI.inWeight) {
-      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
-           i != ie; ++i) {
-        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
-          FType *F = CI->getCalledFunction();
-          if (F && (F->getName() == "_setjmp")) {
-            isSetJmpTarget = true; break;
-          }
-        }
-      }
-    }
-    // Check if this block is eventually reaching exit.
-    bool isExitReachable = false;
-    if (DI.inWeight > DI.outWeight) {
-      for (typename BType::const_iterator i = BB->begin(), ie = BB->end();
-           i != ie; ++i) {
-        if (const CallInst *CI = dyn_cast<CallInst>(&*i)) {
-          FType *F = CI->getCalledFunction();
-          if (F) {
-            FisVisited.clear();
-            isExitReachable |= exitReachable(F);
-          } else {
-            // This is a call to a pointer, all bets are off...
-            isExitReachable = true;
-          }
-          if (isExitReachable) break;
-        }
-      }
-    }
-
-    if (DI.inCount > 0 && DI.outCount == 0) {
-       // If this is a block with no successors.
-      if (!isSetJmpTarget) {
-        CheckValue(!Equals(DI.inWeight,DI.BBWeight), 
-                   "inWeight and BBWeight do not match", &DI);
-      }
-    } else if (DI.inCount == 0 && DI.outCount > 0) {
-      // If this is a block with no predecessors.
-      if (!isExitReachable)
-        CheckValue(!Equals(DI.BBWeight,DI.outWeight), 
-                   "BBWeight and outWeight do not match", &DI);
-    } else {
-      // If this block has successors and predecessors.
-      if (DI.inWeight > DI.outWeight && !isExitReachable)
-        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
-                   "inWeight and outWeight do not match", &DI);
-      if (DI.inWeight < DI.outWeight && !isSetJmpTarget)
-        CheckValue(!Equals(DI.inWeight,DI.outWeight), 
-                   "inWeight and outWeight do not match", &DI);
-    }
-
-
-    // Mark this block as visited, rescurse into successors.
-    BBisVisited.insert(BB);
-    for ( succ_const_iterator bbi = succ_begin(BB), bbe = succ_end(BB); 
-          bbi != bbe; ++bbi ) {
-      recurseBasicBlock(*bbi);
-    }
-  }
-
-  template<class FType, class BType>
-  bool ProfileVerifierPassT<FType, BType>::runOnFunction(FType &F) {
-    PI = getAnalysisIfAvailable<ProfileInfoT<FType, BType> >();
-    if (!PI)
-      ASSERTMESSAGE("No ProfileInfo available");
-
-    // Prepare global variables.
-    PrintedDebugTree = false;
-    BBisVisited.clear();
-
-    // Fetch entry block and recurse into it.
-    const BType *entry = &F.getEntryBlock();
-    recurseBasicBlock(entry);
-
-    if (PI->getExecutionCount(&F) != PI->getExecutionCount(entry))
-      ASSERTMESSAGE("Function count and entry block count do not match");
-
-    return false;
-  }
-
-  template<class FType, class BType>
-  char ProfileVerifierPassT<FType, BType>::ID = 0;
-}
-
-INITIALIZE_PASS_BEGIN(ProfileVerifierPass, "profile-verifier",
-                "Verify profiling information", false, true)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(ProfileVerifierPass, "profile-verifier",
-                "Verify profiling information", false, true)
-
-namespace llvm {
-  FunctionPass *createProfileVerifierPass() {
-    return new ProfileVerifierPass(ProfileVerifierDisableAssertions); 
-  }
-}
-
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 8577025..5635688 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -9,6 +9,7 @@
 // Detects single entry single exit regions in the control flow graph.
 //===----------------------------------------------------------------------===//
 
+#define DEBUG_TYPE "region"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Statistic.h"
@@ -17,12 +18,9 @@
 #include "llvm/Assembly/Writer.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
-
-#define DEBUG_TYPE "region"
 #include "llvm/Support/Debug.h"
-
-#include <set>
 #include <algorithm>
+#include <set>
 
 using namespace llvm;
 
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index f876748..0a02f4e 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -585,6 +585,9 @@ namespace {
 
         // Lexicographically compare n-ary expressions.
         unsigned LNumOps = LC->getNumOperands(), RNumOps = RC->getNumOperands();
+        if (LNumOps != RNumOps)
+          return (int)LNumOps - (int)RNumOps;
+
         for (unsigned i = 0; i != LNumOps; ++i) {
           if (i >= RNumOps)
             return 1;
@@ -758,7 +761,7 @@ static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
   unsigned CalculationBits = W + T;
 
   // Calculate 2^T, at width T+W.
-  APInt DivFactor = APInt(CalculationBits, 1).shl(T);
+  APInt DivFactor = APInt::getOneBitSet(CalculationBits, T);
 
   // Calculate the multiplicative inverse of K! / 2^T;
   // this multiplication factor will perform the exact division by
@@ -1380,7 +1383,7 @@ const SCEV *ScalarEvolution::getAnyExtendExpr(const SCEV *Op,
 ///
 static bool
 CollectAddOperandsWithScales(DenseMap<const SCEV *, APInt> &M,
-                             SmallVector<const SCEV *, 8> &NewOps,
+                             SmallVectorImpl<const SCEV *> &NewOps,
                              APInt &AccumulatedConstant,
                              const SCEV *const *Ops, size_t NumOperands,
                              const APInt &Scale,
@@ -1628,7 +1631,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
       // re-generate the operands list. Group the operands by constant scale,
       // to avoid multiplying by the same constant scale multiple times.
       std::map<APInt, SmallVector<const SCEV *, 4>, APIntCompare> MulOpLists;
-      for (SmallVector<const SCEV *, 8>::const_iterator I = NewOps.begin(),
+      for (SmallVectorImpl<const SCEV *>::const_iterator I = NewOps.begin(),
            E = NewOps.end(); I != E; ++I)
         MulOpLists[M.find(*I)->second].push_back(*I);
       // Re-generate the operands list.
@@ -2587,55 +2590,39 @@ const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
   return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS)));
 }
 
-const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy) {
+const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
   // If we have DataLayout, we can bypass creating a target-independent
   // constant expression and then folding it back into a ConstantInt.
   // This is just a compile-time optimization.
   if (TD)
-    return getConstant(TD->getIntPtrType(getContext()),
-                       TD->getTypeAllocSize(AllocTy));
+    return getConstant(IntTy, TD->getTypeAllocSize(AllocTy));
 
   Constant *C = ConstantExpr::getSizeOf(AllocTy);
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
     if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
       C = Folded;
   Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
+  assert(Ty == IntTy && "Effective SCEV type doesn't match");
   return getTruncateOrZeroExtend(getSCEV(C), Ty);
 }
 
-const SCEV *ScalarEvolution::getAlignOfExpr(Type *AllocTy) {
-  Constant *C = ConstantExpr::getAlignOf(AllocTy);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
-    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
-      C = Folded;
-  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(AllocTy));
-  return getTruncateOrZeroExtend(getSCEV(C), Ty);
-}
-
-const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy,
+const SCEV *ScalarEvolution::getOffsetOfExpr(Type *IntTy,
+                                             StructType *STy,
                                              unsigned FieldNo) {
   // If we have DataLayout, we can bypass creating a target-independent
   // constant expression and then folding it back into a ConstantInt.
   // This is just a compile-time optimization.
-  if (TD)
-    return getConstant(TD->getIntPtrType(getContext()),
+  if (TD) {
+    return getConstant(IntTy,
                        TD->getStructLayout(STy)->getElementOffset(FieldNo));
+  }
 
   Constant *C = ConstantExpr::getOffsetOf(STy, FieldNo);
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
     if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
       C = Folded;
-  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
-  return getTruncateOrZeroExtend(getSCEV(C), Ty);
-}
 
-const SCEV *ScalarEvolution::getOffsetOfExpr(Type *CTy,
-                                             Constant *FieldNo) {
-  Constant *C = ConstantExpr::getOffsetOf(CTy, FieldNo);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
-    if (Constant *Folded = ConstantFoldConstantExpression(CE, TD, TLI))
-      C = Folded;
-  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(CTy));
+  Type *Ty = getEffectiveSCEVType(PointerType::getUnqual(STy));
   return getTruncateOrZeroExtend(getSCEV(C), Ty);
 }
 
@@ -2700,12 +2687,15 @@ uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
 Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
   assert(isSCEVable(Ty) && "Type is not SCEVable!");
 
-  if (Ty->isIntegerTy())
+  if (Ty->isIntegerTy()) {
     return Ty;
+  }
 
   // The only other support type is pointer.
   assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!");
-  if (TD) return TD->getIntPtrType(getContext());
+
+  if (TD)
+    return TD->getIntPtrType(Ty);
 
   // Without DataLayout, conservatively assume pointers are 64-bit.
   return Type::getInt64Ty(getContext());
@@ -2715,13 +2705,51 @@ const SCEV *ScalarEvolution::getCouldNotCompute() {
   return &CouldNotCompute;
 }
 
+namespace {
+  // Helper class working with SCEVTraversal to figure out if a SCEV contains
+  // a SCEVUnknown with null value-pointer. FindInvalidSCEVUnknown::FindOne
+  // is set iff if find such SCEVUnknown.
+  //
+  struct FindInvalidSCEVUnknown {
+    bool FindOne;
+    FindInvalidSCEVUnknown() { FindOne = false; }
+    bool follow(const SCEV *S) {
+      switch (S->getSCEVType()) {
+      case scConstant:
+        return false;
+      case scUnknown:
+        if (!cast<SCEVUnknown>(S)->getValue())
+          FindOne = true;
+        return false;
+      default:
+        return true;
+      }
+    }
+    bool isDone() const { return FindOne; }
+  };
+}
+
+bool ScalarEvolution::checkValidity(const SCEV *S) const {
+  FindInvalidSCEVUnknown F;
+  SCEVTraversal<FindInvalidSCEVUnknown> ST(F);
+  ST.visitAll(S);
+
+  return !F.FindOne;
+}
+
 /// getSCEV - Return an existing SCEV if it exists, otherwise analyze the
 /// expression and create a new one.
 const SCEV *ScalarEvolution::getSCEV(Value *V) {
   assert(isSCEVable(V->getType()) && "Value is not SCEVable!");
 
-  ValueExprMapType::const_iterator I = ValueExprMap.find_as(V);
-  if (I != ValueExprMap.end()) return I->second;
+  ValueExprMapType::iterator I = ValueExprMap.find_as(V);
+  if (I != ValueExprMap.end()) {
+    const SCEV *S = I->second;
+    if (checkValidity(S))
+      return S;
+    else
+      ValueExprMap.erase(I);
+  }
   const SCEV *S = createSCEV(V);
 
   // The process of creating a SCEV for V may have caused other SCEVs
@@ -3060,15 +3088,26 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
                   Flags = setFlags(Flags, SCEV::FlagNUW);
                 if (OBO->hasNoSignedWrap())
                   Flags = setFlags(Flags, SCEV::FlagNSW);
-              } else if (const GEPOperator *GEP =
-                         dyn_cast<GEPOperator>(BEValueV)) {
+              } else if (GEPOperator *GEP = dyn_cast<GEPOperator>(BEValueV)) {
                 // If the increment is an inbounds GEP, then we know the address
                 // space cannot be wrapped around. We cannot make any guarantee
                 // about signed or unsigned overflow because pointers are
                 // unsigned but we may have a negative index from the base
-                // pointer.
-                if (GEP->isInBounds())
+                // pointer. We can guarantee that no unsigned wrap occurs if the
+                // indices form a positive value.
+                if (GEP->isInBounds()) {
                   Flags = setFlags(Flags, SCEV::FlagNW);
+
+                  const SCEV *Ptr = getSCEV(GEP->getPointerOperand());
+                  if (isKnownPositive(getMinusSCEV(getSCEV(GEP), Ptr)))
+                    Flags = setFlags(Flags, SCEV::FlagNUW);
+                }
+              } else if (const SubOperator *OBO =
+                           dyn_cast<SubOperator>(BEValueV)) {
+                if (OBO->hasNoUnsignedWrap())
+                  Flags = setFlags(Flags, SCEV::FlagNUW);
+                if (OBO->hasNoSignedWrap())
+                  Flags = setFlags(Flags, SCEV::FlagNSW);
               }
 
               const SCEV *StartVal = getSCEV(StartValueV);
@@ -3136,18 +3175,18 @@ const SCEV *ScalarEvolution::createNodeForPHI(PHINode *PN) {
 /// operations. This allows them to be analyzed by regular SCEV code.
 ///
 const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
+  Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
+  Value *Base = GEP->getOperand(0);
+  // Don't attempt to analyze GEPs over unsized objects.
+  if (!Base->getType()->getPointerElementType()->isSized())
+    return getUnknown(GEP);
 
   // Don't blindly transfer the inbounds flag from the GEP instruction to the
   // Add expression, because the Instruction may be guarded by control flow
   // and the no-overflow bits may not be valid for the expression in any
   // context.
-  bool isInBounds = GEP->isInBounds();
+  SCEV::NoWrapFlags Wrap = GEP->isInBounds() ? SCEV::FlagNSW : SCEV::FlagAnyWrap;
 
-  Type *IntPtrTy = getEffectiveSCEVType(GEP->getType());
-  Value *Base = GEP->getOperand(0);
-  // Don't attempt to analyze GEPs over unsized objects.
-  if (!cast<PointerType>(Base->getType())->getElementType()->isSized())
-    return getUnknown(GEP);
   const SCEV *TotalOffset = getConstant(IntPtrTy, 0);
   gep_type_iterator GTI = gep_type_begin(GEP);
   for (GetElementPtrInst::op_iterator I = llvm::next(GEP->op_begin()),
@@ -3158,21 +3197,19 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
     if (StructType *STy = dyn_cast<StructType>(*GTI++)) {
       // For a struct, add the member offset.
       unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue();
-      const SCEV *FieldOffset = getOffsetOfExpr(STy, FieldNo);
+      const SCEV *FieldOffset = getOffsetOfExpr(IntPtrTy, STy, FieldNo);
 
       // Add the field offset to the running total offset.
       TotalOffset = getAddExpr(TotalOffset, FieldOffset);
     } else {
       // For an array, add the element offset, explicitly scaled.
-      const SCEV *ElementSize = getSizeOfExpr(*GTI);
+      const SCEV *ElementSize = getSizeOfExpr(IntPtrTy, *GTI);
       const SCEV *IndexS = getSCEV(Index);
       // Getelementptr indices are signed.
       IndexS = getTruncateOrSignExtend(IndexS, IntPtrTy);
 
       // Multiply the index by the element size to compute the element offset.
-      const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize,
-                                           isInBounds ? SCEV::FlagNSW :
-                                           SCEV::FlagAnyWrap);
+      const SCEV *LocalOffset = getMulExpr(IndexS, ElementSize, Wrap);
 
       // Add the element offset to the running total offset.
       TotalOffset = getAddExpr(TotalOffset, LocalOffset);
@@ -3183,8 +3220,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
   const SCEV *BaseS = getSCEV(Base);
 
   // Add the total offset from all the GEP indices to the base.
-  return getAddExpr(BaseS, TotalOffset,
-                    isInBounds ? SCEV::FlagNSW : SCEV::FlagAnyWrap);
+  return getAddExpr(BaseS, TotalOffset, Wrap);
 }
 
 /// GetMinTrailingZeros - Determine the minimum number of zero bits that S is
@@ -3551,7 +3587,7 @@ ScalarEvolution::getSignedRange(const SCEV *S) {
     if (!U->getValue()->getType()->isIntegerTy() && !TD)
       return setSignedRange(U, ConservativeResult);
     unsigned NS = ComputeNumSignBits(U->getValue(), TD);
-    if (NS == 1)
+    if (NS <= 1)
       return setSignedRange(U, ConservativeResult);
     return setSignedRange(U, ConservativeResult.intersectWith(
       ConstantRange(APInt::getSignedMinValue(BitWidth).ashr(NS - 1),
@@ -3751,7 +3787,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
         break;
 
       Constant *X = ConstantInt::get(getContext(),
-        APInt(BitWidth, 1).shl(SA->getZExtValue()));
+        APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
       return getMulExpr(getSCEV(U->getOperand(0)), getSCEV(X));
     }
     break;
@@ -3769,7 +3805,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
         break;
 
       Constant *X = ConstantInt::get(getContext(),
-        APInt(BitWidth, 1).shl(SA->getZExtValue()));
+        APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
       return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(X));
     }
     break;
@@ -3947,7 +3983,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
 /// depends on a NSW assumption, and we would only fall back to a conservative
 /// trip count in that case.
 unsigned ScalarEvolution::
-getSmallConstantTripCount(Loop *L, BasicBlock */*ExitingBlock*/) {
+getSmallConstantTripCount(Loop *L, BasicBlock * /*ExitingBlock*/) {
   const SCEVConstant *ExitCount =
     dyn_cast<SCEVConstant>(getBackedgeTakenCount(L));
   if (!ExitCount)
@@ -3976,7 +4012,7 @@ getSmallConstantTripCount(Loop *L, BasicBlock */*ExitingBlock*/) {
 /// As explained in the comments for getSmallConstantTripCount, this assumes
 /// that control exits the loop via ExitingBlock.
 unsigned ScalarEvolution::
-getSmallConstantTripMultiple(Loop *L, BasicBlock */*ExitingBlock*/) {
+getSmallConstantTripMultiple(Loop *L, BasicBlock * /*ExitingBlock*/) {
   const SCEV *ExitCount = getBackedgeTakenCount(L);
   if (ExitCount == getCouldNotCompute())
     return 1;
@@ -4575,25 +4611,17 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
     if (EL.hasAnyInfo()) return EL;
     break;
   }
-  case ICmpInst::ICMP_SLT: {
-    ExitLimit EL = HowManyLessThans(LHS, RHS, L, true, IsSubExpr);
-    if (EL.hasAnyInfo()) return EL;
-    break;
-  }
-  case ICmpInst::ICMP_SGT: {
-    ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
-                                    getNotSCEV(RHS), L, true, IsSubExpr);
-    if (EL.hasAnyInfo()) return EL;
-    break;
-  }
-  case ICmpInst::ICMP_ULT: {
-    ExitLimit EL = HowManyLessThans(LHS, RHS, L, false, IsSubExpr);
+  case ICmpInst::ICMP_SLT:
+  case ICmpInst::ICMP_ULT: {                    // while (X < Y)
+    bool IsSigned = Cond == ICmpInst::ICMP_SLT;
+    ExitLimit EL = HowManyLessThans(LHS, RHS, L, IsSigned, IsSubExpr);
     if (EL.hasAnyInfo()) return EL;
     break;
   }
-  case ICmpInst::ICMP_UGT: {
-    ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
-                                    getNotSCEV(RHS), L, false, IsSubExpr);
+  case ICmpInst::ICMP_SGT:
+  case ICmpInst::ICMP_UGT: {                    // while (X > Y)
+    bool IsSigned = Cond == ICmpInst::ICMP_SGT;
+    ExitLimit EL = HowManyGreaterThans(LHS, RHS, L, IsSigned, IsSubExpr);
     if (EL.hasAnyInfo()) return EL;
     break;
   }
@@ -5031,15 +5059,21 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L,
 /// original value V is returned.
 const SCEV *ScalarEvolution::getSCEVAtScope(const SCEV *V, const Loop *L) {
   // Check to see if we've folded this expression at this loop before.
-  std::map<const Loop *, const SCEV *> &Values = ValuesAtScopes[V];
-  std::pair<std::map<const Loop *, const SCEV *>::iterator, bool> Pair =
-    Values.insert(std::make_pair(L, static_cast<const SCEV *>(0)));
-  if (!Pair.second)
-    return Pair.first->second ? Pair.first->second : V;
-
+  SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values = ValuesAtScopes[V];
+  for (unsigned u = 0; u < Values.size(); u++) {
+    if (Values[u].first == L)
+      return Values[u].second ? Values[u].second : V;
+  }
+  Values.push_back(std::make_pair(L, static_cast<const SCEV *>(0)));
   // Otherwise compute it.
   const SCEV *C = computeSCEVAtScope(V, L);
-  ValuesAtScopes[V][L] = C;
+  SmallVector<std::pair<const Loop *, const SCEV *>, 2> &Values2 = ValuesAtScopes[V];
+  for (unsigned u = Values2.size(); u > 0; u--) {
+    if (Values2[u - 1].first == L) {
+      Values2[u - 1].second = C;
+      break;
+    }
+  }
   return C;
 }
 
@@ -5078,18 +5112,23 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
     case scAddExpr: {
       const SCEVAddExpr *SA = cast<SCEVAddExpr>(V);
       if (Constant *C = BuildConstantFromSCEV(SA->getOperand(0))) {
-        if (C->getType()->isPointerTy())
-          C = ConstantExpr::getBitCast(C, Type::getInt8PtrTy(C->getContext()));
+        if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
+          unsigned AS = PTy->getAddressSpace();
+          Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
+          C = ConstantExpr::getBitCast(C, DestPtrTy);
+        }
         for (unsigned i = 1, e = SA->getNumOperands(); i != e; ++i) {
           Constant *C2 = BuildConstantFromSCEV(SA->getOperand(i));
           if (!C2) return 0;
 
           // First pointer!
           if (!C->getType()->isPointerTy() && C2->getType()->isPointerTy()) {
+            unsigned AS = C2->getType()->getPointerAddressSpace();
             std::swap(C, C2);
+            Type *DestPtrTy = Type::getInt8PtrTy(C->getContext(), AS);
             // The offsets have been converted to bytes.  We can add bytes to an
             // i8* by GEP with the byte count in the first index.
-            C = ConstantExpr::getBitCast(C,Type::getInt8PtrTy(C->getContext()));
+            C = ConstantExpr::getBitCast(C, DestPtrTy);
           }
 
           // Don't bother trying to sum two pointers. We probably can't
@@ -5097,8 +5136,8 @@ static Constant *BuildConstantFromSCEV(const SCEV *V) {
           if (C2->getType()->isPointerTy())
             return 0;
 
-          if (C->getType()->isPointerTy()) {
-            if (cast<PointerType>(C->getType())->getElementType()->isStructTy())
+          if (PointerType *PTy = dyn_cast<PointerType>(C->getType())) {
+            if (PTy->getElementType()->isStructTy())
               C2 = ConstantExpr::getIntegerCast(
                   C2, Type::getInt32Ty(C->getContext()), true);
             C = ConstantExpr::getGetElementPtr(C, C2);
@@ -6295,45 +6334,72 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
   return false;
 }
 
-/// getBECount - Subtract the end and start values and divide by the step,
-/// rounding up, to get the number of times the backedge is executed. Return
-/// CouldNotCompute if an intermediate computation overflows.
-const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
-                                        const SCEV *End,
-                                        const SCEV *Step,
-                                        bool NoWrap) {
-  assert(!isKnownNegative(Step) &&
-         "This code doesn't handle negative strides yet!");
-
-  Type *Ty = Start->getType();
-
-  // When Start == End, we have an exact BECount == 0. Short-circuit this case
-  // here because SCEV may not be able to determine that the unsigned division
-  // after rounding is zero.
-  if (Start == End)
-    return getConstant(Ty, 0);
-
-  const SCEV *NegOne = getConstant(Ty, (uint64_t)-1);
-  const SCEV *Diff = getMinusSCEV(End, Start);
-  const SCEV *RoundUp = getAddExpr(Step, NegOne);
-
-  // Add an adjustment to the difference between End and Start so that
-  // the division will effectively round up.
-  const SCEV *Add = getAddExpr(Diff, RoundUp);
-
-  if (!NoWrap) {
-    // Check Add for unsigned overflow.
-    // TODO: More sophisticated things could be done here.
-    Type *WideTy = IntegerType::get(getContext(),
-                                          getTypeSizeInBits(Ty) + 1);
-    const SCEV *EDiff = getZeroExtendExpr(Diff, WideTy);
-    const SCEV *ERoundUp = getZeroExtendExpr(RoundUp, WideTy);
-    const SCEV *OperandExtendedAdd = getAddExpr(EDiff, ERoundUp);
-    if (getZeroExtendExpr(Add, WideTy) != OperandExtendedAdd)
-      return getCouldNotCompute();
+// Verify if an linear IV with positive stride can overflow when in a 
+// less-than comparison, knowing the invariant term of the comparison, the 
+// stride and the knowledge of NSW/NUW flags on the recurrence.
+bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride,
+                                         bool IsSigned, bool NoWrap) {
+  if (NoWrap) return false;
+
+  unsigned BitWidth = getTypeSizeInBits(RHS->getType());
+  const SCEV *One = getConstant(Stride->getType(), 1);
+
+  if (IsSigned) {
+    APInt MaxRHS = getSignedRange(RHS).getSignedMax();
+    APInt MaxValue = APInt::getSignedMaxValue(BitWidth);
+    APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
+                                .getSignedMax();
+
+    // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow!
+    return (MaxValue - MaxStrideMinusOne).slt(MaxRHS);
   }
 
-  return getUDivExpr(Add, Step);
+  APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax();
+  APInt MaxValue = APInt::getMaxValue(BitWidth);
+  APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
+                              .getUnsignedMax();
+
+  // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow!
+  return (MaxValue - MaxStrideMinusOne).ult(MaxRHS);
+}
+
+// Verify if an linear IV with negative stride can overflow when in a 
+// greater-than comparison, knowing the invariant term of the comparison,
+// the stride and the knowledge of NSW/NUW flags on the recurrence.
+bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride,
+                                         bool IsSigned, bool NoWrap) {
+  if (NoWrap) return false;
+
+  unsigned BitWidth = getTypeSizeInBits(RHS->getType());
+  const SCEV *One = getConstant(Stride->getType(), 1);
+
+  if (IsSigned) {
+    APInt MinRHS = getSignedRange(RHS).getSignedMin();
+    APInt MinValue = APInt::getSignedMinValue(BitWidth);
+    APInt MaxStrideMinusOne = getSignedRange(getMinusSCEV(Stride, One))
+                               .getSignedMax();
+
+    // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow!
+    return (MinValue + MaxStrideMinusOne).sgt(MinRHS);
+  }
+
+  APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin();
+  APInt MinValue = APInt::getMinValue(BitWidth);
+  APInt MaxStrideMinusOne = getUnsignedRange(getMinusSCEV(Stride, One))
+                            .getUnsignedMax();
+
+  // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow!
+  return (MinValue + MaxStrideMinusOne).ugt(MinRHS);
+}
+
+// Compute the backedge taken count knowing the interval difference, the
+// stride and presence of the equality in the comparison.
+const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step, 
+                                            bool Equality) {
+  const SCEV *One = getConstant(Step->getType(), 1);
+  Delta = Equality ? getAddExpr(Delta, Step)
+                   : getAddExpr(Delta, getMinusSCEV(Step, One));
+  return getUDivExpr(Delta, Step);
 }
 
 /// HowManyLessThans - Return the number of times a backedge containing the
@@ -6345,119 +6411,144 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
 /// a subexpression that cannot overflow before evaluating true.
 ScalarEvolution::ExitLimit
 ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
-                                  const Loop *L, bool isSigned,
+                                  const Loop *L, bool IsSigned,
                                   bool IsSubExpr) {
-  // Only handle:  "ADDREC < LoopInvariant".
-  if (!isLoopInvariant(RHS, L)) return getCouldNotCompute();
+  // We handle only IV < Invariant
+  if (!isLoopInvariant(RHS, L))
+    return getCouldNotCompute();
 
-  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(LHS);
-  if (!AddRec || AddRec->getLoop() != L)
+  const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+
+  // Avoid weird loops
+  if (!IV || IV->getLoop() != L || !IV->isAffine())
     return getCouldNotCompute();
 
-  // Check to see if we have a flag which makes analysis easy.
-  bool NoWrap = false;
-  if (!IsSubExpr) {
-    NoWrap = AddRec->getNoWrapFlags(
-      (SCEV::NoWrapFlags)(((isSigned ? SCEV::FlagNSW : SCEV::FlagNUW))
-                          | SCEV::FlagNW));
-  }
-  if (AddRec->isAffine()) {
-    unsigned BitWidth = getTypeSizeInBits(AddRec->getType());
-    const SCEV *Step = AddRec->getStepRecurrence(*this);
+  bool NoWrap = !IsSubExpr &&
+                IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
 
-    if (Step->isZero())
-      return getCouldNotCompute();
-    if (Step->isOne()) {
-      // With unit stride, the iteration never steps past the limit value.
-    } else if (isKnownPositive(Step)) {
-      // Test whether a positive iteration can step past the limit
-      // value and past the maximum value for its type in a single step.
-      // Note that it's not sufficient to check NoWrap here, because even
-      // though the value after a wrap is undefined, it's not undefined
-      // behavior, so if wrap does occur, the loop could either terminate or
-      // loop infinitely, but in either case, the loop is guaranteed to
-      // iterate at least until the iteration where the wrapping occurs.
-      const SCEV *One = getConstant(Step->getType(), 1);
-      if (isSigned) {
-        APInt Max = APInt::getSignedMaxValue(BitWidth);
-        if ((Max - getSignedRange(getMinusSCEV(Step, One)).getSignedMax())
-              .slt(getSignedRange(RHS).getSignedMax()))
-          return getCouldNotCompute();
-      } else {
-        APInt Max = APInt::getMaxValue(BitWidth);
-        if ((Max - getUnsignedRange(getMinusSCEV(Step, One)).getUnsignedMax())
-              .ult(getUnsignedRange(RHS).getUnsignedMax()))
-          return getCouldNotCompute();
-      }
-    } else
-      // TODO: Handle negative strides here and below.
-      return getCouldNotCompute();
+  const SCEV *Stride = IV->getStepRecurrence(*this);
 
-    // We know the LHS is of the form {n,+,s} and the RHS is some loop-invariant
-    // m.  So, we count the number of iterations in which {n,+,s} < m is true.
-    // Note that we cannot simply return max(m-n,0)/s because it's not safe to
-    // treat m-n as signed nor unsigned due to overflow possibility.
-
-    // First, we get the value of the LHS in the first iteration: n
-    const SCEV *Start = AddRec->getOperand(0);
-
-    // Determine the minimum constant start value.
-    const SCEV *MinStart = getConstant(isSigned ?
-      getSignedRange(Start).getSignedMin() :
-      getUnsignedRange(Start).getUnsignedMin());
-
-    // If we know that the condition is true in order to enter the loop,
-    // then we know that it will run exactly (m-n)/s times. Otherwise, we
-    // only know that it will execute (max(m,n)-n)/s times. In both cases,
-    // the division must round up.
-    const SCEV *End = RHS;
-    if (!isLoopEntryGuardedByCond(L,
-                                  isSigned ? ICmpInst::ICMP_SLT :
-                                             ICmpInst::ICMP_ULT,
-                                  getMinusSCEV(Start, Step), RHS))
-      End = isSigned ? getSMaxExpr(RHS, Start)
-                     : getUMaxExpr(RHS, Start);
-
-    // Determine the maximum constant end value.
-    const SCEV *MaxEnd = getConstant(isSigned ?
-      getSignedRange(End).getSignedMax() :
-      getUnsignedRange(End).getUnsignedMax());
-
-    // If MaxEnd is within a step of the maximum integer value in its type,
-    // adjust it down to the minimum value which would produce the same effect.
-    // This allows the subsequent ceiling division of (N+(step-1))/step to
-    // compute the correct value.
-    const SCEV *StepMinusOne = getMinusSCEV(Step,
-                                            getConstant(Step->getType(), 1));
-    MaxEnd = isSigned ?
-      getSMinExpr(MaxEnd,
-                  getMinusSCEV(getConstant(APInt::getSignedMaxValue(BitWidth)),
-                               StepMinusOne)) :
-      getUMinExpr(MaxEnd,
-                  getMinusSCEV(getConstant(APInt::getMaxValue(BitWidth)),
-                               StepMinusOne));
-
-    // Finally, we subtract these two values and divide, rounding up, to get
-    // the number of times the backedge is executed.
-    const SCEV *BECount = getBECount(Start, End, Step, NoWrap);
-
-    // The maximum backedge count is similar, except using the minimum start
-    // value and the maximum end value.
-    // If we already have an exact constant BECount, use it instead.
-    const SCEV *MaxBECount = isa<SCEVConstant>(BECount) ? BECount
-      : getBECount(MinStart, MaxEnd, Step, NoWrap);
-
-    // If the stride is nonconstant, and NoWrap == true, then
-    // getBECount(MinStart, MaxEnd) may not compute. This would result in an
-    // exact BECount and invalid MaxBECount, which should be avoided to catch
-    // more optimization opportunities.
-    if (isa<SCEVCouldNotCompute>(MaxBECount))
-      MaxBECount = BECount;
-
-    return ExitLimit(BECount, MaxBECount);
-  }
+  // Avoid negative or zero stride values
+  if (!isKnownPositive(Stride))
+    return getCouldNotCompute();
 
-  return getCouldNotCompute();
+  // Avoid proven overflow cases: this will ensure that the backedge taken count
+  // will not generate any unsigned overflow. Relaxed no-overflow conditions
+  // exploit NoWrapFlags, allowing to optimize in presence of undefined 
+  // behaviors like the case of C language.
+  if (!Stride->isOne() && doesIVOverflowOnLT(RHS, Stride, IsSigned, NoWrap))
+    return getCouldNotCompute();
+
+  ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT
+                                      : ICmpInst::ICMP_ULT;
+  const SCEV *Start = IV->getStart();
+  const SCEV *End = RHS;
+  if (!isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(Start, Stride), RHS))
+    End = IsSigned ? getSMaxExpr(RHS, Start)
+                   : getUMaxExpr(RHS, Start);
+
+  const SCEV *BECount = computeBECount(getMinusSCEV(End, Start), Stride, false);
+
+  APInt MinStart = IsSigned ? getSignedRange(Start).getSignedMin()
+                            : getUnsignedRange(Start).getUnsignedMin();
+
+  APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
+                             : getUnsignedRange(Stride).getUnsignedMin();
+
+  unsigned BitWidth = getTypeSizeInBits(LHS->getType());
+  APInt Limit = IsSigned ? APInt::getSignedMaxValue(BitWidth) - (MinStride - 1)
+                         : APInt::getMaxValue(BitWidth) - (MinStride - 1);
+
+  // Although End can be a MAX expression we estimate MaxEnd considering only
+  // the case End = RHS. This is safe because in the other case (End - Start)
+  // is zero, leading to a zero maximum backedge taken count.
+  APInt MaxEnd =
+    IsSigned ? APIntOps::smin(getSignedRange(RHS).getSignedMax(), Limit)
+             : APIntOps::umin(getUnsignedRange(RHS).getUnsignedMax(), Limit);
+
+  const SCEV *MaxBECount = getCouldNotCompute();
+  if (isa<SCEVConstant>(BECount))
+    MaxBECount = BECount;
+  else
+    MaxBECount = computeBECount(getConstant(MaxEnd - MinStart),
+                                getConstant(MinStride), false);
+
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    MaxBECount = BECount;
+
+  return ExitLimit(BECount, MaxBECount);
+}
+
+ScalarEvolution::ExitLimit
+ScalarEvolution::HowManyGreaterThans(const SCEV *LHS, const SCEV *RHS,
+                                     const Loop *L, bool IsSigned,
+                                     bool IsSubExpr) {
+  // We handle only IV > Invariant
+  if (!isLoopInvariant(RHS, L))
+    return getCouldNotCompute();
+
+  const SCEVAddRecExpr *IV = dyn_cast<SCEVAddRecExpr>(LHS);
+
+  // Avoid weird loops
+  if (!IV || IV->getLoop() != L || !IV->isAffine())
+    return getCouldNotCompute();
+
+  bool NoWrap = !IsSubExpr &&
+                IV->getNoWrapFlags(IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW);
+
+  const SCEV *Stride = getNegativeSCEV(IV->getStepRecurrence(*this));
+
+  // Avoid negative or zero stride values
+  if (!isKnownPositive(Stride))
+    return getCouldNotCompute();
+
+  // Avoid proven overflow cases: this will ensure that the backedge taken count
+  // will not generate any unsigned overflow. Relaxed no-overflow conditions
+  // exploit NoWrapFlags, allowing to optimize in presence of undefined 
+  // behaviors like the case of C language.
+  if (!Stride->isOne() && doesIVOverflowOnGT(RHS, Stride, IsSigned, NoWrap))
+    return getCouldNotCompute();
+
+  ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SGT
+                                      : ICmpInst::ICMP_UGT;
+
+  const SCEV *Start = IV->getStart();
+  const SCEV *End = RHS;
+  if (!isLoopEntryGuardedByCond(L, Cond, getAddExpr(Start, Stride), RHS))
+    End = IsSigned ? getSMinExpr(RHS, Start)
+                   : getUMinExpr(RHS, Start);
+
+  const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride, false);
+
+  APInt MaxStart = IsSigned ? getSignedRange(Start).getSignedMax()
+                            : getUnsignedRange(Start).getUnsignedMax();
+
+  APInt MinStride = IsSigned ? getSignedRange(Stride).getSignedMin()
+                             : getUnsignedRange(Stride).getUnsignedMin();
+
+  unsigned BitWidth = getTypeSizeInBits(LHS->getType());
+  APInt Limit = IsSigned ? APInt::getSignedMinValue(BitWidth) + (MinStride - 1)
+                         : APInt::getMinValue(BitWidth) + (MinStride - 1);
+
+  // Although End can be a MIN expression we estimate MinEnd considering only
+  // the case End = RHS. This is safe because in the other case (Start - End)
+  // is zero, leading to a zero maximum backedge taken count.
+  APInt MinEnd =
+    IsSigned ? APIntOps::smax(getSignedRange(RHS).getSignedMin(), Limit)
+             : APIntOps::umax(getUnsignedRange(RHS).getUnsignedMin(), Limit);
+
+
+  const SCEV *MaxBECount = getCouldNotCompute();
+  if (isa<SCEVConstant>(BECount))
+    MaxBECount = BECount;
+  else
+    MaxBECount = computeBECount(getConstant(MaxStart - MinEnd), 
+                                getConstant(MinStride), false);
+
+  if (isa<SCEVCouldNotCompute>(MaxBECount))
+    MaxBECount = BECount;
+
+  return ExitLimit(BECount, MaxBECount);
 }
 
 /// getNumIterationsInRange - Return the number of iterations of this loop that
@@ -6586,7 +6677,534 @@ const SCEV *SCEVAddRecExpr::getNumIterationsInRange(ConstantRange Range,
   return SE.getCouldNotCompute();
 }
 
+static const APInt gcd(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue().abs();
+  APInt B = C2->getValue()->getValue().abs();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
 
+  if (ABW > BBW)
+    B = B.zext(ABW);
+  else if (ABW < BBW)
+    A = A.zext(BBW);
+
+  return APIntOps::GreatestCommonDivisor(A, B);
+}
+
+static const APInt srem(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.sext(ABW);
+  else if (ABW < BBW)
+    A = A.sext(BBW);
+
+  return APIntOps::srem(A, B);
+}
+
+static const APInt sdiv(const SCEVConstant *C1, const SCEVConstant *C2) {
+  APInt A = C1->getValue()->getValue();
+  APInt B = C2->getValue()->getValue();
+  uint32_t ABW = A.getBitWidth();
+  uint32_t BBW = B.getBitWidth();
+
+  if (ABW > BBW)
+    B = B.sext(ABW);
+  else if (ABW < BBW)
+    A = A.sext(BBW);
+
+  return APIntOps::sdiv(A, B);
+}
+
+namespace {
+struct SCEVGCD : public SCEVVisitor<SCEVGCD, const SCEV *> {
+public:
+  // Pattern match Step into Start. When Step is a multiply expression, find
+  // the largest subexpression of Step that appears in Start. When Start is an
+  // add expression, try to match Step in the subexpressions of Start, non
+  // matching subexpressions are returned under Remainder.
+  static const SCEV *findGCD(ScalarEvolution &SE, const SCEV *Start,
+                             const SCEV *Step, const SCEV **Remainder) {
+    assert(Remainder && "Remainder should not be NULL");
+    SCEVGCD R(SE, Step, SE.getConstant(Step->getType(), 0));
+    const SCEV *Res = R.visit(Start);
+    *Remainder = R.Remainder;
+    return Res;
+  }
+
+  SCEVGCD(ScalarEvolution &S, const SCEV *G, const SCEV *R)
+      : SE(S), GCD(G), Remainder(R) {
+    Zero = SE.getConstant(GCD->getType(), 0);
+    One = SE.getConstant(GCD->getType(), 1);
+  }
+
+  const SCEV *visitConstant(const SCEVConstant *Constant) {
+    if (GCD == Constant || Constant == Zero)
+      return GCD;
+
+    if (const SCEVConstant *CGCD = dyn_cast<SCEVConstant>(GCD)) {
+      const SCEV *Res = SE.getConstant(gcd(Constant, CGCD));
+      if (Res != One)
+        return Res;
+
+      Remainder = SE.getConstant(srem(Constant, CGCD));
+      Constant = cast<SCEVConstant>(SE.getMinusSCEV(Constant, Remainder));
+      Res = SE.getConstant(gcd(Constant, CGCD));
+      return Res;
+    }
+
+    // When GCD is not a constant, it could be that the GCD is an Add, Mul,
+    // AddRec, etc., in which case we want to find out how many times the
+    // Constant divides the GCD: we then return that as the new GCD.
+    const SCEV *Rem = Zero;
+    const SCEV *Res = findGCD(SE, GCD, Constant, &Rem);
+
+    if (Res == One || Rem != Zero) {
+      Remainder = Constant;
+      return One;
+    }
+
+    assert(isa<SCEVConstant>(Res) && "Res should be a constant");
+    Remainder = SE.getConstant(srem(Constant, cast<SCEVConstant>(Res)));
+    return Res;
+  }
+
+  const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+    if (GCD == Expr)
+      return GCD;
+
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+      const SCEV *Rem = Zero;
+      const SCEV *Res = findGCD(SE, Expr->getOperand(e - 1 - i), GCD, &Rem);
+
+      // FIXME: There may be ambiguous situations: for instance,
+      // GCD(-4 + (3 * %m), 2 * %m) where 2 divides -4 and %m divides (3 * %m).
+      // The order in which the AddExpr is traversed computes a different GCD
+      // and Remainder.
+      if (Res != One)
+        GCD = Res;
+      if (Rem != Zero)
+        Remainder = SE.getAddExpr(Remainder, Rem);
+    }
+
+    return GCD;
+  }
+
+  const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+    if (GCD == Expr)
+      return GCD;
+
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+      if (Expr->getOperand(i) == GCD)
+        return GCD;
+    }
+
+    // If we have not returned yet, it means that GCD is not part of Expr.
+    const SCEV *PartialGCD = One;
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+      const SCEV *Rem = Zero;
+      const SCEV *Res = findGCD(SE, Expr->getOperand(i), GCD, &Rem);
+      if (Rem != Zero)
+        // GCD does not divide Expr->getOperand(i).
+        continue;
+
+      if (Res == GCD)
+        return GCD;
+      PartialGCD = SE.getMulExpr(PartialGCD, Res);
+      if (PartialGCD == GCD)
+        return GCD;
+    }
+
+    if (PartialGCD != One)
+      return PartialGCD;
+
+    Remainder = Expr;
+    const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(GCD);
+    if (!Mul)
+      return PartialGCD;
+
+    // When the GCD is a multiply expression, try to decompose it:
+    // this occurs when Step does not divide the Start expression
+    // as in: {(-4 + (3 * %m)),+,(2 * %m)}
+    for (int i = 0, e = Mul->getNumOperands(); i < e; ++i) {
+      const SCEV *Rem = Zero;
+      const SCEV *Res = findGCD(SE, Expr, Mul->getOperand(i), &Rem);
+      if (Rem == Zero) {
+        Remainder = Rem;
+        return Res;
+      }
+    }
+
+    return PartialGCD;
+  }
+
+  const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    if (GCD == Expr)
+      return GCD;
+
+    if (!Expr->isAffine()) {
+      Remainder = Expr;
+      return GCD;
+    }
+
+    const SCEV *Rem = Zero;
+    const SCEV *Res = findGCD(SE, Expr->getOperand(0), GCD, &Rem);
+    if (Rem != Zero)
+      Remainder = SE.getAddExpr(Remainder, Rem);
+
+    Rem = Zero;
+    Res = findGCD(SE, Expr->getOperand(1), Res, &Rem);
+    if (Rem != Zero) {
+      Remainder = Expr;
+      return GCD;
+    }
+
+    return Res;
+  }
+
+  const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+    if (GCD != Expr)
+      Remainder = Expr;
+    return GCD;
+  }
+
+  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+    return One;
+  }
+
+private:
+  ScalarEvolution &SE;
+  const SCEV *GCD, *Remainder, *Zero, *One;
+};
+
+struct SCEVDivision : public SCEVVisitor<SCEVDivision, const SCEV *> {
+public:
+  // Remove from Start all multiples of Step.
+  static const SCEV *divide(ScalarEvolution &SE, const SCEV *Start,
+                            const SCEV *Step) {
+    SCEVDivision D(SE, Step);
+    const SCEV *Rem = D.Zero;
+    (void)Rem;
+    // The division is guaranteed to succeed: Step should divide Start with no
+    // remainder.
+    assert(Step == SCEVGCD::findGCD(SE, Start, Step, &Rem) && Rem == D.Zero &&
+           "Step should divide Start with no remainder.");
+    return D.visit(Start);
+  }
+
+  SCEVDivision(ScalarEvolution &S, const SCEV *G) : SE(S), GCD(G) {
+    Zero = SE.getConstant(GCD->getType(), 0);
+    One = SE.getConstant(GCD->getType(), 1);
+  }
+
+  const SCEV *visitConstant(const SCEVConstant *Constant) {
+    if (GCD == Constant)
+      return One;
+
+    if (const SCEVConstant *CGCD = dyn_cast<SCEVConstant>(GCD))
+      return SE.getConstant(sdiv(Constant, CGCD));
+    return Constant;
+  }
+
+  const SCEV *visitTruncateExpr(const SCEVTruncateExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitZeroExtendExpr(const SCEVZeroExtendExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitSignExtendExpr(const SCEVSignExtendExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitAddExpr(const SCEVAddExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+
+    SmallVector<const SCEV *, 2> Operands;
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+      Operands.push_back(divide(SE, Expr->getOperand(i), GCD));
+
+    if (Operands.size() == 1)
+      return Operands[0];
+    return SE.getAddExpr(Operands);
+  }
+
+  const SCEV *visitMulExpr(const SCEVMulExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+
+    bool FoundGCDTerm = false;
+    for (int i = 0, e = Expr->getNumOperands(); i < e; ++i)
+      if (Expr->getOperand(i) == GCD)
+        FoundGCDTerm = true;
+
+    SmallVector<const SCEV *, 2> Operands;
+    if (FoundGCDTerm) {
+      FoundGCDTerm = false;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+        if (FoundGCDTerm)
+          Operands.push_back(Expr->getOperand(i));
+        else if (Expr->getOperand(i) == GCD)
+          FoundGCDTerm = true;
+        else
+          Operands.push_back(Expr->getOperand(i));
+      }
+    } else {
+      FoundGCDTerm = false;
+      const SCEV *PartialGCD = One;
+      for (int i = 0, e = Expr->getNumOperands(); i < e; ++i) {
+        if (PartialGCD == GCD) {
+          Operands.push_back(Expr->getOperand(i));
+          continue;
+        }
+
+        const SCEV *Rem = Zero;
+        const SCEV *Res = SCEVGCD::findGCD(SE, Expr->getOperand(i), GCD, &Rem);
+        if (Rem == Zero) {
+          PartialGCD = SE.getMulExpr(PartialGCD, Res);
+          Operands.push_back(divide(SE, Expr->getOperand(i), GCD));
+        } else {
+          Operands.push_back(Expr->getOperand(i));
+        }
+      }
+    }
+
+    if (Operands.size() == 1)
+      return Operands[0];
+    return SE.getMulExpr(Operands);
+  }
+
+  const SCEV *visitUDivExpr(const SCEVUDivExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+
+    assert(Expr->isAffine() && "Expr should be affine");
+
+    const SCEV *Start = divide(SE, Expr->getStart(), GCD);
+    const SCEV *Step = divide(SE, Expr->getStepRecurrence(SE), GCD);
+
+    return SE.getAddRecExpr(Start, Step, Expr->getLoop(),
+                            Expr->getNoWrapFlags());
+  }
+
+  const SCEV *visitSMaxExpr(const SCEVSMaxExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitUMaxExpr(const SCEVUMaxExpr *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+    if (GCD == Expr)
+      return One;
+    return Expr;
+  }
+
+  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *Expr) {
+    return Expr;
+  }
+
+private:
+  ScalarEvolution &SE;
+  const SCEV *GCD, *Zero, *One;
+};
+}
+
+/// Splits the SCEV into two vectors of SCEVs representing the subscripts and
+/// sizes of an array access. Returns the remainder of the delinearization that
+/// is the offset start of the array.  The SCEV->delinearize algorithm computes
+/// the multiples of SCEV coefficients: that is a pattern matching of sub
+/// expressions in the stride and base of a SCEV corresponding to the
+/// computation of a GCD (greatest common divisor) of base and stride.  When
+/// SCEV->delinearize fails, it returns the SCEV unchanged.
+///
+/// For example: when analyzing the memory access A[i][j][k] in this loop nest
+///
+///  void foo(long n, long m, long o, double A[n][m][o]) {
+///
+///    for (long i = 0; i < n; i++)
+///      for (long j = 0; j < m; j++)
+///        for (long k = 0; k < o; k++)
+///          A[i][j][k] = 1.0;
+///  }
+///
+/// the delinearization input is the following AddRec SCEV:
+///
+///  AddRec: {{{%A,+,(8 * %m * %o)}<%for.i>,+,(8 * %o)}<%for.j>,+,8}<%for.k>
+///
+/// From this SCEV, we are able to say that the base offset of the access is %A
+/// because it appears as an offset that does not divide any of the strides in
+/// the loops:
+///
+///  CHECK: Base offset: %A
+///
+/// and then SCEV->delinearize determines the size of some of the dimensions of
+/// the array as these are the multiples by which the strides are happening:
+///
+///  CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of sizeof(double) bytes.
+///
+/// Note that the outermost dimension remains of UnknownSize because there are
+/// no strides that would help identifying the size of the last dimension: when
+/// the array has been statically allocated, one could compute the size of that
+/// dimension by dividing the overall size of the array by the size of the known
+/// dimensions: %m * %o * 8.
+///
+/// Finally delinearize provides the access functions for the array reference
+/// that does correspond to A[i][j][k] of the above C testcase:
+///
+///  CHECK: ArrayRef[{0,+,1}<%for.i>][{0,+,1}<%for.j>][{0,+,1}<%for.k>]
+///
+/// The testcases are checking the output of a function pass:
+/// DelinearizationPass that walks through all loads and stores of a function
+/// asking for the SCEV of the memory access with respect to all enclosing
+/// loops, calling SCEV->delinearize on that and printing the results.
+
+const SCEV *
+SCEVAddRecExpr::delinearize(ScalarEvolution &SE,
+                            SmallVectorImpl<const SCEV *> &Subscripts,
+                            SmallVectorImpl<const SCEV *> &Sizes) const {
+  // Early exit in case this SCEV is not an affine multivariate function.
+  if (!this->isAffine())
+    return this;
+
+  const SCEV *Start = this->getStart();
+  const SCEV *Step = this->getStepRecurrence(SE);
+
+  // Build the SCEV representation of the cannonical induction variable in the
+  // loop of this SCEV.
+  const SCEV *Zero = SE.getConstant(this->getType(), 0);
+  const SCEV *One = SE.getConstant(this->getType(), 1);
+  const SCEV *IV =
+      SE.getAddRecExpr(Zero, One, this->getLoop(), this->getNoWrapFlags());
+
+  DEBUG(dbgs() << "(delinearize: " << *this << "\n");
+
+  // Currently we fail to delinearize when the stride of this SCEV is 1. We
+  // could decide to not fail in this case: we could just return 1 for the size
+  // of the subscript, and this same SCEV for the access function.
+  if (Step == One) {
+    DEBUG(dbgs() << "failed to delinearize " << *this << "\n)\n");
+    return this;
+  }
+
+  // Find the GCD and Remainder of the Start and Step coefficients of this SCEV.
+  const SCEV *Remainder = NULL;
+  const SCEV *GCD = SCEVGCD::findGCD(SE, Start, Step, &Remainder);
+
+  DEBUG(dbgs() << "GCD: " << *GCD << "\n");
+  DEBUG(dbgs() << "Remainder: " << *Remainder << "\n");
+
+  // Same remark as above: we currently fail the delinearization, although we
+  // can very well handle this special case.
+  if (GCD == One) {
+    DEBUG(dbgs() << "failed to delinearize " << *this << "\n)\n");
+    return this;
+  }
+
+  // As findGCD computed Remainder, GCD divides "Start - Remainder." The
+  // Quotient is then this SCEV without Remainder, scaled down by the GCD.  The
+  // Quotient is what will be used in the next subscript delinearization.
+  const SCEV *Quotient =
+      SCEVDivision::divide(SE, SE.getMinusSCEV(Start, Remainder), GCD);
+  DEBUG(dbgs() << "Quotient: " << *Quotient << "\n");
+
+  const SCEV *Rem;
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Quotient))
+    // Recursively call delinearize on the Quotient until there are no more
+    // multiples that can be recognized.
+    Rem = AR->delinearize(SE, Subscripts, Sizes);
+  else
+    Rem = Quotient;
+
+  // Scale up the cannonical induction variable IV by whatever remains from the
+  // Step after division by the GCD: the GCD is the size of all the sub-array.
+  if (Step != GCD) {
+    Step = SCEVDivision::divide(SE, Step, GCD);
+    IV = SE.getMulExpr(IV, Step);
+  }
+  // The access function in the current subscript is computed as the cannonical
+  // induction variable IV (potentially scaled up by the step) and offset by
+  // Rem, the offset of delinearization in the sub-array.
+  const SCEV *Index = SE.getAddExpr(IV, Rem);
+
+  // Record the access function and the size of the current subscript.
+  Subscripts.push_back(Index);
+  Sizes.push_back(GCD);
+
+#ifndef NDEBUG
+  int Size = Sizes.size();
+  DEBUG(dbgs() << "succeeded to delinearize " << *this << "\n");
+  DEBUG(dbgs() << "ArrayDecl[UnknownSize]");
+  for (int i = 0; i < Size - 1; i++)
+    DEBUG(dbgs() << "[" << *Sizes[i] << "]");
+  DEBUG(dbgs() << " with elements of " << *Sizes[Size - 1] << " bytes.\n");
+
+  DEBUG(dbgs() << "ArrayRef");
+  for (int i = 0; i < Size; i++)
+    DEBUG(dbgs() << "[" << *Subscripts[i] << "]");
+  DEBUG(dbgs() << "\n)\n");
+#endif
+
+  return Remainder;
+}
 
 //===----------------------------------------------------------------------===//
 //                   SCEVCallbackVH Class Implementation
@@ -6642,7 +7260,7 @@ ScalarEvolution::SCEVCallbackVH::SCEVCallbackVH(Value *V, ScalarEvolution *se)
 //===----------------------------------------------------------------------===//
 
 ScalarEvolution::ScalarEvolution()
-  : FunctionPass(ID), FirstUnknown(0) {
+  : FunctionPass(ID), ValuesAtScopes(64), LoopDispositions(64), BlockDispositions(64), FirstUnknown(0) {
   initializeScalarEvolutionPass(*PassRegistry::getPassRegistry());
 }
 
@@ -6780,14 +7398,21 @@ void ScalarEvolution::print(raw_ostream &OS, const Module *) const {
 
 ScalarEvolution::LoopDisposition
 ScalarEvolution::getLoopDisposition(const SCEV *S, const Loop *L) {
-  std::map<const Loop *, LoopDisposition> &Values = LoopDispositions[S];
-  std::pair<std::map<const Loop *, LoopDisposition>::iterator, bool> Pair =
-    Values.insert(std::make_pair(L, LoopVariant));
-  if (!Pair.second)
-    return Pair.first->second;
-
+  SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values = LoopDispositions[S];
+  for (unsigned u = 0; u < Values.size(); u++) {
+    if (Values[u].first == L)
+      return Values[u].second;
+  }
+  Values.push_back(std::make_pair(L, LoopVariant));
   LoopDisposition D = computeLoopDisposition(S, L);
-  return LoopDispositions[S][L] = D;
+  SmallVector<std::pair<const Loop *, LoopDisposition>, 2> &Values2 = LoopDispositions[S];
+  for (unsigned u = Values2.size(); u > 0; u--) {
+    if (Values2[u - 1].first == L) {
+      Values2[u - 1].second = D;
+      break;
+    }
+  }
+  return D;
 }
 
 ScalarEvolution::LoopDisposition
@@ -6879,14 +7504,21 @@ bool ScalarEvolution::hasComputableLoopEvolution(const SCEV *S, const Loop *L) {
 
 ScalarEvolution::BlockDisposition
 ScalarEvolution::getBlockDisposition(const SCEV *S, const BasicBlock *BB) {
-  std::map<const BasicBlock *, BlockDisposition> &Values = BlockDispositions[S];
-  std::pair<std::map<const BasicBlock *, BlockDisposition>::iterator, bool>
-    Pair = Values.insert(std::make_pair(BB, DoesNotDominateBlock));
-  if (!Pair.second)
-    return Pair.first->second;
-
+  SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values = BlockDispositions[S];
+  for (unsigned u = 0; u < Values.size(); u++) {
+    if (Values[u].first == BB)
+      return Values[u].second;
+  }
+  Values.push_back(std::make_pair(BB, DoesNotDominateBlock));
   BlockDisposition D = computeBlockDisposition(S, BB);
-  return BlockDispositions[S][BB] = D;
+  SmallVector<std::pair<const BasicBlock *, BlockDisposition>, 2> &Values2 = BlockDispositions[S];
+  for (unsigned u = Values2.size(); u > 0; u--) {
+    if (Values2[u - 1].first == BB) {
+      Values2[u - 1].second = D;
+      break;
+    }
+  }
+  return D;
 }
 
 ScalarEvolution::BlockDisposition
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index fcd7ce2..86a557b 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -176,8 +177,8 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
   }
 
   // Save the original insertion point so we can restore it when we're done.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
+  BuilderType::InsertPointGuard Guard(Builder);
 
   // Move the insertion point out of as many loops as we can.
   while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -191,13 +192,9 @@ Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
 
   // If we haven't found this binop, insert it.
   Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
-  BO->setDebugLoc(SaveInsertPt->getDebugLoc());
+  BO->setDebugLoc(Loc);
   rememberInstruction(BO);
 
-  // Restore the original insert point.
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
   return BO;
 }
 
@@ -294,8 +291,8 @@ static bool FactorOutConstant(const SCEV *&S,
     const SCEV *Start = A->getStart();
     if (!FactorOutConstant(Start, Remainder, Factor, SE, TD))
       return false;
-    // FIXME: can use A->getNoWrapFlags(FlagNW)
-    S = SE.getAddRecExpr(Start, Step, A->getLoop(), SCEV::FlagAnyWrap);
+    S = SE.getAddRecExpr(Start, Step, A->getLoop(),
+                         A->getNoWrapFlags(SCEV::FlagNW));
     return true;
   }
 
@@ -348,8 +345,7 @@ static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
       AddRecs.push_back(SE.getAddRecExpr(Zero,
                                          A->getStepRecurrence(SE),
                                          A->getLoop(),
-                                         // FIXME: A->getNoWrapFlags(FlagNW)
-                                         SCEV::FlagAnyWrap));
+                                         A->getNoWrapFlags(SCEV::FlagNW)));
       if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) {
         Ops[i] = Zero;
         Ops.append(Add->op_begin(), Add->op_end());
@@ -407,6 +403,10 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
   // without the other.
   SplitAddRecs(Ops, Ty, SE);
 
+  Type *IntPtrTy = SE.TD
+                 ? SE.TD->getIntPtrType(PTy)
+                 : Type::getInt64Ty(PTy->getContext());
+
   // Descend down the pointer's type and attempt to convert the other
   // operands into GEP indices, at each level. The first index in a GEP
   // indexes into the array implied by the pointer operand; the rest of
@@ -417,7 +417,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
     // array indexing.
     SmallVector<const SCEV *, 8> ScaledOps;
     if (ElTy->isSized()) {
-      const SCEV *ElSize = SE.getSizeOfExpr(ElTy);
+      const SCEV *ElSize = SE.getSizeOfExpr(IntPtrTy, ElTy);
       if (!ElSize->isZero()) {
         SmallVector<const SCEV *, 8> NewOps;
         for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
@@ -549,8 +549,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
     }
 
     // Save the original insertion point so we can restore it when we're done.
-    BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-    BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+    BuilderType::InsertPointGuard Guard(Builder);
 
     // Move the insertion point out of as many loops as we can.
     while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -566,16 +565,11 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
     Value *GEP = Builder.CreateGEP(V, Idx, "uglygep");
     rememberInstruction(GEP);
 
-    // Restore the original insert point.
-    if (SaveInsertBB)
-      restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
     return GEP;
   }
 
   // Save the original insertion point so we can restore it when we're done.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPoint SaveInsertPt = Builder.saveIP();
 
   // Move the insertion point out of as many loops as we can.
   while (const Loop *L = SE.LI->getLoopFor(Builder.GetInsertBlock())) {
@@ -611,8 +605,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
   rememberInstruction(GEP);
 
   // Restore the original insert point.
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+  Builder.restoreIP(SaveInsertPt);
 
   return expand(SE.getAddExpr(Ops));
 }
@@ -846,8 +839,7 @@ static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
                          SE.getAddRecExpr(SE.getConstant(A->getType(), 0),
                                           A->getStepRecurrence(SE),
                                           A->getLoop(),
-                                          // FIXME: A->getNoWrapFlags(FlagNW)
-                                          SCEV::FlagAnyWrap));
+                                          A->getNoWrapFlags(SCEV::FlagNW)));
   }
   if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
     Base = A->getOperand(A->getNumOperands()-1);
@@ -1078,8 +1070,7 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
   }
 
   // Save the original insertion point so we can restore it when we're done.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPointGuard Guard(Builder);
 
   // Another AddRec may need to be recursively expanded below. For example, if
   // this AddRec is quadratic, the StepV may itself be an AddRec in this
@@ -1137,14 +1128,15 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
       IVIncInsertPos : Pred->getTerminator();
     Builder.SetInsertPoint(InsertPos);
     Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
-
+    if (isa<OverflowingBinaryOperator>(IncV)) {
+      if (Normalized->getNoWrapFlags(SCEV::FlagNUW))
+        cast<BinaryOperator>(IncV)->setHasNoUnsignedWrap();
+      if (Normalized->getNoWrapFlags(SCEV::FlagNSW))
+        cast<BinaryOperator>(IncV)->setHasNoSignedWrap();
+    }
     PN->addIncoming(IncV, Pred);
   }
 
-  // Restore the original insert point.
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
-
   // After expanding subexpressions, restore the PostIncLoops set so the caller
   // can ensure that IVIncrement dominates the current uses.
   PostIncLoops = SavedPostIncLoops;
@@ -1180,8 +1172,7 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
     Normalized = cast<SCEVAddRecExpr>(
       SE.getAddRecExpr(Start, Normalized->getStepRecurrence(SE),
                        Normalized->getLoop(),
-                       // FIXME: Normalized->getNoWrapFlags(FlagNW)
-                       SCEV::FlagAnyWrap));
+                       Normalized->getNoWrapFlags(SCEV::FlagNW)));
   }
 
   // Strip off any non-loop-dominating component from the addrec step.
@@ -1191,11 +1182,9 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
     PostLoopScale = Step;
     Step = SE.getConstant(Normalized->getType(), 1);
     Normalized =
-      cast<SCEVAddRecExpr>(SE.getAddRecExpr(Start, Step,
-                                            Normalized->getLoop(),
-                                            // FIXME: Normalized
-                                            // ->getNoWrapFlags(FlagNW)
-                                            SCEV::FlagAnyWrap));
+      cast<SCEVAddRecExpr>(SE.getAddRecExpr(
+                             Start, Step, Normalized->getLoop(),
+                             Normalized->getNoWrapFlags(SCEV::FlagNW)));
   }
 
   // Expand the core addrec. If we need post-loop scaling, force it to
@@ -1232,19 +1221,19 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
         !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
       if (useSubtract)
         Step = SE.getNegativeSCEV(Step);
-      // Expand the step somewhere that dominates the loop header.
-      BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-      BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
-      Value *StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
-      // Restore the insertion point to the place where the caller has
-      // determined dominates all uses.
-      restoreInsertPoint(SaveInsertBB, SaveInsertPt);
+      Value *StepV;
+      {
+        // Expand the step somewhere that dominates the loop header.
+        BuilderType::InsertPointGuard Guard(Builder);
+        StepV = expandCodeFor(Step, IntTy, L->getHeader()->begin());
+      }
       Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
     }
   }
 
   // Re-apply any non-loop-dominating scale.
   if (PostLoopScale) {
+    assert(S->isAffine() && "Can't linearly scale non-affine recurrences.");
     Result = InsertNoopCastOfTo(Result, IntTy);
     Result = Builder.CreateMul(Result,
                                expandCodeFor(PostLoopScale, IntTy));
@@ -1288,18 +1277,15 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
     for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i)
       NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
     Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
-                                       // FIXME: S->getNoWrapFlags(FlagNW)
-                                       SCEV::FlagAnyWrap));
-    BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-    BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+                                       S->getNoWrapFlags(SCEV::FlagNW)));
     BasicBlock::iterator NewInsertPt =
       llvm::next(BasicBlock::iterator(cast<Instruction>(V)));
+    BuilderType::InsertPointGuard Guard(Builder);
     while (isa<PHINode>(NewInsertPt) || isa<DbgInfoIntrinsic>(NewInsertPt) ||
            isa<LandingPadInst>(NewInsertPt))
       ++NewInsertPt;
     V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), 0,
                       NewInsertPt);
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
     return V;
   }
 
@@ -1307,8 +1293,8 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
   if (!S->getStart()->isZero()) {
     SmallVector<const SCEV *, 4> NewOps(S->op_begin(), S->op_end());
     NewOps[0] = SE.getConstant(Ty, 0);
-    // FIXME: can use S->getNoWrapFlags()
-    const SCEV *Rest = SE.getAddRecExpr(NewOps, L, SCEV::FlagAnyWrap);
+    const SCEV *Rest = SE.getAddRecExpr(NewOps, L,
+                                        S->getNoWrapFlags(SCEV::FlagNW));
 
     // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
     // comments on expandAddToGEP for details.
@@ -1343,9 +1329,13 @@ Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
                                   Header->begin());
     rememberInstruction(CanonicalIV);
 
+    SmallSet<BasicBlock *, 4> PredSeen;
     Constant *One = ConstantInt::get(Ty, 1);
     for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
       BasicBlock *HP = *HPI;
+      if (!PredSeen.insert(HP))
+        continue;
+
       if (L->contains(HP)) {
         // Insert a unit add instruction right before the terminator
         // corresponding to the back-edge.
@@ -1528,8 +1518,7 @@ Value *SCEVExpander::expand(const SCEV *S) {
   if (I != InsertedExpressions.end())
     return I->second;
 
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPointGuard Guard(Builder);
   Builder.SetInsertPoint(InsertPt->getParent(), InsertPt);
 
   // Expand the expression into instructions.
@@ -1542,8 +1531,6 @@ Value *SCEVExpander::expand(const SCEV *S) {
   // a postinc expansion, it could be reused by a non postinc user, but only if
   // its insertion point was already at the head of the loop.
   InsertedExpressions[std::make_pair(S, InsertPt)] = V;
-
-  restoreInsertPoint(SaveInsertBB, SaveInsertPt);
   return V;
 }
 
@@ -1554,10 +1541,6 @@ void SCEVExpander::rememberInstruction(Value *I) {
     InsertedValues.insert(I);
 }
 
-void SCEVExpander::restoreInsertPoint(BasicBlock *BB, BasicBlock::iterator I) {
-  Builder.SetInsertPoint(BB, I);
-}
-
 /// getOrInsertCanonicalInductionVariable - This method returns the
 /// canonical induction variable of the specified type for the specified
 /// loop (inserting one if there is none).  A canonical induction variable
@@ -1573,11 +1556,8 @@ SCEVExpander::getOrInsertCanonicalInductionVariable(const Loop *L,
                                    SE.getConstant(Ty, 1), L, SCEV::FlagAnyWrap);
 
   // Emit code for it.
-  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
-  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  BuilderType::InsertPointGuard Guard(Builder);
   PHINode *V = cast<PHINode>(expandCodeFor(H, 0, L->getHeader()->begin()));
-  if (SaveInsertBB)
-    restoreInsertPoint(SaveInsertBB, SaveInsertPt);
 
   return V;
 }
@@ -1725,28 +1705,43 @@ namespace {
 // Currently, we only allow division by a nonzero constant here. If this is
 // inadequate, we could easily allow division by SCEVUnknown by using
 // ValueTracking to check isKnownNonZero().
+//
+// We cannot generally expand recurrences unless the step dominates the loop
+// header. The expander handles the special case of affine recurrences by
+// scaling the recurrence outside the loop, but this technique isn't generally
+// applicable. Expanding a nested recurrence outside a loop requires computing
+// binomial coefficients. This could be done, but the recurrence has to be in a
+// perfectly reduced form, which can't be guaranteed.
 struct SCEVFindUnsafe {
+  ScalarEvolution &SE;
   bool IsUnsafe;
 
-  SCEVFindUnsafe(): IsUnsafe(false) {}
+  SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
 
   bool follow(const SCEV *S) {
-    const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S);
-    if (!D)
-      return true;
-    const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
-    if (SC && !SC->getValue()->isZero())
-      return true;
-    IsUnsafe = true;
-    return false;
+    if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+      const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
+      if (!SC || SC->getValue()->isZero()) {
+        IsUnsafe = true;
+        return false;
+      }
+    }
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+      const SCEV *Step = AR->getStepRecurrence(SE);
+      if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) {
+        IsUnsafe = true;
+        return false;
+      }
+    }
+    return true;
   }
   bool isDone() const { return IsUnsafe; }
 };
 }
 
 namespace llvm {
-bool isSafeToExpand(const SCEV *S) {
-  SCEVFindUnsafe Search;
+bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
+  SCEVFindUnsafe Search(SE);
   visitAll(S, Search);
   return !Search.IsUnsafe;
 }
diff --git a/lib/Analysis/ScalarEvolutionNormalization.cpp b/lib/Analysis/ScalarEvolutionNormalization.cpp
index dd2ed4f..f110616 100644
--- a/lib/Analysis/ScalarEvolutionNormalization.cpp
+++ b/lib/Analysis/ScalarEvolutionNormalization.cpp
@@ -119,11 +119,19 @@ TransformImpl(const SCEV *S, Instruction *User, Value *OperandValToReplace) {
     const SCEV *Result = SE.getAddRecExpr(Operands, L, SCEV::FlagAnyWrap);
     switch (Kind) {
     case NormalizeAutodetect:
-      if (IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
-        const SCEV *TransformedStep =
-          TransformSubExpr(AR->getStepRecurrence(SE),
-                           User, OperandValToReplace);
-        Result = SE.getMinusSCEV(Result, TransformedStep);
+      // Normalize this SCEV by subtracting the expression for the final step.
+      // We only allow affine AddRecs to be normalized, otherwise we would not
+      // be able to correctly denormalize.
+      // e.g. {1,+,3,+,2} == {-2,+,1,+,2} + {3,+,2}
+      // Normalized form:   {-2,+,1,+,2}
+      // Denormalized form: {1,+,3,+,2}
+      //
+      // However, denormalization would use the a different step expression than
+      // normalization (see getPostIncExpr), generating the wrong final
+      // expression: {-2,+,1,+,2} + {1,+,2} => {-1,+,3,+,2}
+      if (AR->isAffine() &&
+          IVUseShouldUsePostIncValue(User, OperandValToReplace, L, &DT)) {
+        Result = SE.getMinusSCEV(Result, AR->getStepRecurrence(SE));
         Loops.insert(L);
       }
 #if 0
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 64f8e96..0353295 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -88,10 +88,19 @@ unsigned TargetTransformInfo::getUserCost(const User *U) const {
   return PrevTTI->getUserCost(U);
 }
 
+bool TargetTransformInfo::hasBranchDivergence() const {
+  return PrevTTI->hasBranchDivergence();
+}
+
 bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
   return PrevTTI->isLoweredToCall(F);
 }
 
+void TargetTransformInfo::getUnrollingPreferences(Loop *L,
+                            UnrollingPreferences &UP) const {
+  PrevTTI->getUnrollingPreferences(L, UP);
+}
+
 bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
   return PrevTTI->isLegalAddImmediate(Imm);
 }
@@ -108,6 +117,14 @@ bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                         Scale);
 }
 
+int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                                              int64_t BaseOffset,
+                                              bool HasBaseReg,
+                                              int64_t Scale) const {
+  return PrevTTI->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
+                                       Scale);
+}
+
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
   return PrevTTI->isTruncateFree(Ty1, Ty2);
 }
@@ -133,6 +150,10 @@ TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return PrevTTI->getPopcntSupport(IntTyWidthInBit);
 }
 
+bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
+  return PrevTTI->haveFastSqrt(Ty);
+}
+
 unsigned TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
   return PrevTTI->getIntImmCost(Imm, Ty);
 }
@@ -198,8 +219,14 @@ unsigned TargetTransformInfo::getNumberOfParts(Type *Tp) const {
   return PrevTTI->getNumberOfParts(Tp);
 }
 
-unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp) const {
-  return PrevTTI->getAddressComputationCost(Tp);
+unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
+                                                        bool IsComplex) const {
+  return PrevTTI->getAddressComputationCost(Tp, IsComplex);
+}
+
+unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
+                                               bool IsPairwise) const {
+  return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
 }
 
 namespace {
@@ -252,26 +279,34 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
       // Otherwise, the default basic cost is used.
       return TCC_Basic;
 
-    case Instruction::IntToPtr:
+    case Instruction::IntToPtr: {
+      if (!DL)
+        return TCC_Basic;
+
       // An inttoptr cast is free so long as the input is a legal integer type
       // which doesn't contain values outside the range of a pointer.
-      if (DL && DL->isLegalInteger(OpTy->getScalarSizeInBits()) &&
-          OpTy->getScalarSizeInBits() <= DL->getPointerSizeInBits())
+      unsigned OpSize = OpTy->getScalarSizeInBits();
+      if (DL->isLegalInteger(OpSize) &&
+          OpSize <= DL->getPointerTypeSizeInBits(Ty))
         return TCC_Free;
 
       // Otherwise it's not a no-op.
       return TCC_Basic;
+    }
+    case Instruction::PtrToInt: {
+      if (!DL)
+        return TCC_Basic;
 
-    case Instruction::PtrToInt:
       // A ptrtoint cast is free so long as the result is large enough to store
       // the pointer, and a legal integer type.
-      if (DL && DL->isLegalInteger(Ty->getScalarSizeInBits()) &&
-          Ty->getScalarSizeInBits() >= DL->getPointerSizeInBits())
+      unsigned DestSize = Ty->getScalarSizeInBits();
+      if (DL->isLegalInteger(DestSize) &&
+          DestSize >= DL->getPointerTypeSizeInBits(OpTy))
         return TCC_Free;
 
       // Otherwise it's not a no-op.
       return TCC_Basic;
-
+    }
     case Instruction::Trunc:
       // trunc to a native type is free (assuming the target has compare and
       // shift-right of the same width).
@@ -411,6 +446,8 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
                                 U->getOperand(0)->getType() : 0);
   }
 
+  bool hasBranchDivergence() const { return false; }
+
   bool isLoweredToCall(const Function *F) const {
     // FIXME: These should almost certainly not be handled here, and instead
     // handled with the help of TLI or the target itself. This was largely
@@ -442,6 +479,8 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return true;
   }
 
+  void getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+
   bool isLegalAddImmediate(int64_t Imm) const {
     return false;
   }
@@ -457,6 +496,15 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return !BaseGV && BaseOffset == 0 && Scale <= 1;
   }
 
+  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
+                           bool HasBaseReg, int64_t Scale) const {
+    // Guess that all legal addressing mode are free.
+    if(isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg, Scale))
+      return 0;
+    return -1;
+  }
+
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) const {
     return false;
   }
@@ -481,6 +529,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return PSK_Software;
   }
 
+  bool haveFastSqrt(Type *Ty) const {
+    return false;
+  }
+
   unsigned getIntImmCost(const APInt &Imm, Type *Ty) const {
     return 1;
   }
@@ -542,9 +594,13 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
     return 0;
   }
 
-  unsigned getAddressComputationCost(Type *Tp) const {
+  unsigned getAddressComputationCost(Type *Tp, bool) const {
     return 0;
   }
+
+  unsigned getReductionCost(unsigned, Type *, bool) const {
+    return 1;
+  }
 };
 
 } // end anonymous namespace
diff --git a/lib/Analysis/TypeBasedAliasAnalysis.cpp b/lib/Analysis/TypeBasedAliasAnalysis.cpp
index bbf3c3a..6791d4b 100644
--- a/lib/Analysis/TypeBasedAliasAnalysis.cpp
+++ b/lib/Analysis/TypeBasedAliasAnalysis.cpp
@@ -16,7 +16,12 @@
 // typical C/C++ TBAA, but it can also be used to implement custom alias
 // analysis behavior for other languages.
 //
-// The current metadata format is very simple. TBAA MDNodes have up to
+// We now support two types of metadata format: scalar TBAA and struct-path
+// aware TBAA. After all testing cases are upgraded to use struct-path aware
+// TBAA and we can auto-upgrade existing bc files, the support for scalar TBAA
+// can be dropped.
+//
+// The scalar TBAA metadata format is very simple. TBAA MDNodes have up to
 // three fields, e.g.:
 //   !0 = metadata !{ metadata !"an example type tree" }
 //   !1 = metadata !{ metadata !"int", metadata !0 }
@@ -40,6 +45,65 @@
 // should return true; see
 // http://llvm.org/docs/AliasAnalysis.html#OtherItfs).
 //
+// With struct-path aware TBAA, the MDNodes attached to an instruction using
+// "!tbaa" are called path tag nodes.
+//
+// The path tag node has 4 fields with the last field being optional.
+//
+// The first field is the base type node, it can be a struct type node
+// or a scalar type node. The second field is the access type node, it
+// must be a scalar type node. The third field is the offset into the base type.
+// The last field has the same meaning as the last field of our scalar TBAA:
+// it's an integer which if equal to 1 indicates that the access is "constant".
+//
+// The struct type node has a name and a list of pairs, one pair for each member
+// of the struct. The first element of each pair is a type node (a struct type
+// node or a sclar type node), specifying the type of the member, the second
+// element of each pair is the offset of the member.
+//
+// Given an example
+// typedef struct {
+//   short s;
+// } A;
+// typedef struct {
+//   uint16_t s;
+//   A a;
+// } B;
+//
+// For an acess to B.a.s, we attach !5 (a path tag node) to the load/store
+// instruction. The base type is !4 (struct B), the access type is !2 (scalar
+// type short) and the offset is 4.
+//
+// !0 = metadata !{metadata !"Simple C/C++ TBAA"}
+// !1 = metadata !{metadata !"omnipotent char", metadata !0} // Scalar type node
+// !2 = metadata !{metadata !"short", metadata !1}           // Scalar type node
+// !3 = metadata !{metadata !"A", metadata !2, i64 0}        // Struct type node
+// !4 = metadata !{metadata !"B", metadata !2, i64 0, metadata !3, i64 4}
+//                                                           // Struct type node
+// !5 = metadata !{metadata !4, metadata !2, i64 4}          // Path tag node
+//
+// The struct type nodes and the scalar type nodes form a type DAG.
+//         Root (!0)
+//         char (!1)  -- edge to Root
+//         short (!2) -- edge to char
+//         A (!3) -- edge with offset 0 to short
+//         B (!4) -- edge with offset 0 to short and edge with offset 4 to A
+//
+// To check if two tags (tagX and tagY) can alias, we start from the base type
+// of tagX, follow the edge with the correct offset in the type DAG and adjust
+// the offset until we reach the base type of tagY or until we reach the Root
+// node.
+// If we reach the base type of tagY, compare the adjusted offset with
+// offset of tagY, return Alias if the offsets are the same, return NoAlias
+// otherwise.
+// If we reach the Root node, perform the above starting from base type of tagY
+// to see if we reach base type of tagX.
+//
+// If they have different roots, they're part of different potentially
+// unrelated type systems, so we return Alias to be conservative.
+// If neither node is an ancestor of the other and they have the same root,
+// then we say NoAlias.
+//
 // TODO: The current metadata format doesn't support struct
 // fields. For example:
 //   struct X {
@@ -71,7 +135,6 @@ using namespace llvm;
 // achieved by stripping the !tbaa tags from IR, but this option is sometimes
 // more convenient.
 static cl::opt<bool> EnableTBAA("enable-tbaa", cl::init(true));
-static cl::opt<bool> EnableStructPathTBAA("struct-path-tbaa", cl::init(false));
 
 namespace {
   /// TBAANode - This is a simple wrapper around an MDNode which provides a
@@ -168,8 +231,12 @@ namespace {
       if (Node->getNumOperands() < 2)
         return TBAAStructTypeNode();
 
-      // Special handling for a scalar type node. 
+      // Fast path for a scalar type node and a struct type node with a single
+      // field.
       if (Node->getNumOperands() <= 3) {
+        uint64_t Cur = Node->getNumOperands() == 2 ? 0 :
+                       cast<ConstantInt>(Node->getOperand(2))->getZExtValue();
+        Offset -= Cur;
         MDNode *P = dyn_cast_or_null<MDNode>(Node->getOperand(1));
         if (!P)
           return TBAAStructTypeNode();
@@ -259,12 +326,21 @@ TypeBasedAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
   AliasAnalysis::getAnalysisUsage(AU);
 }
 
+/// Check the first operand of the tbaa tag node, if it is a MDNode, we treat
+/// it as struct-path aware TBAA format, otherwise, we treat it as scalar TBAA
+/// format.
+static bool isStructPathTBAA(const MDNode *MD) {
+  // Anonymous TBAA root starts with a MDNode and dragonegg uses it as
+  // a TBAA tag.
+  return isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3;
+}
+
 /// Aliases - Test whether the type represented by A may alias the
 /// type represented by B.
 bool
 TypeBasedAliasAnalysis::Aliases(const MDNode *A,
                                 const MDNode *B) const {
-  if (EnableStructPathTBAA)
+  if (isStructPathTBAA(A))
     return PathAliases(A, B);
 
   // Keep track of the root node for A and B.
@@ -397,8 +473,8 @@ bool TypeBasedAliasAnalysis::pointsToConstantMemory(const Location &Loc,
 
   // If this is an "immutable" type, we can assume the pointer is pointing
   // to constant memory.
-  if ((!EnableStructPathTBAA && TBAANode(M).TypeIsImmutable()) ||
-      (EnableStructPathTBAA && TBAAStructTagNode(M).TypeIsImmutable()))
+  if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) ||
+      (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable()))
     return true;
 
   return AliasAnalysis::pointsToConstantMemory(Loc, OrLocal);
@@ -414,8 +490,8 @@ TypeBasedAliasAnalysis::getModRefBehavior(ImmutableCallSite CS) {
   // If this is an "immutable" type, we can assume the call doesn't write
   // to memory.
   if (const MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_tbaa))
-    if ((!EnableStructPathTBAA && TBAANode(M).TypeIsImmutable()) ||
-        (EnableStructPathTBAA && TBAAStructTagNode(M).TypeIsImmutable()))
+    if ((!isStructPathTBAA(M) && TBAANode(M).TypeIsImmutable()) ||
+        (isStructPathTBAA(M) && TBAAStructTagNode(M).TypeIsImmutable()))
       Min = OnlyReadsMemory;
 
   return ModRefBehavior(AliasAnalysis::getModRefBehavior(CS) & Min);
@@ -458,6 +534,25 @@ TypeBasedAliasAnalysis::getModRefInfo(ImmutableCallSite CS1,
   return AliasAnalysis::getModRefInfo(CS1, CS2);
 }
 
+bool MDNode::isTBAAVtableAccess() const {
+  if (!isStructPathTBAA(this)) {
+    if (getNumOperands() < 1) return false;
+    if (MDString *Tag1 = dyn_cast<MDString>(getOperand(0))) {
+      if (Tag1->getString() == "vtable pointer") return true;
+    }
+    return false;
+  }
+
+  // For struct-path aware TBAA, we use the access type of the tag.
+  if (getNumOperands() < 2) return false;
+  MDNode *Tag = cast_or_null<MDNode>(getOperand(1));
+  if (!Tag) return false;
+  if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
+    if (Tag1->getString() == "vtable pointer") return true;
+  }
+  return false;  
+}
+
 MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
   if (!A || !B)
     return NULL;
@@ -466,7 +561,8 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
     return A;
 
   // For struct-path aware TBAA, we use the access type of the tag.
-  if (EnableStructPathTBAA) {
+  bool StructPath = isStructPathTBAA(A);
+  if (StructPath) {
     A = cast_or_null<MDNode>(A->getOperand(1));
     if (!A) return 0;
     B = cast_or_null<MDNode>(B->getOperand(1));
@@ -499,7 +595,7 @@ MDNode *MDNode::getMostGenericTBAA(MDNode *A, MDNode *B) {
     --IA;
     --IB;
   }
-  if (!EnableStructPathTBAA)
+  if (!StructPath)
     return Ret;
 
   if (!Ret)
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 45dcc5e..e39ee62 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalAlias.h"
@@ -39,8 +40,8 @@ const unsigned MaxDepth = 6;
 static unsigned getBitWidth(Type *Ty, const DataLayout *TD) {
   if (unsigned BitWidth = Ty->getScalarSizeInBits())
     return BitWidth;
-  assert(isa<PointerType>(Ty) && "Expected a pointer type!");
-  return TD ? TD->getPointerSizeInBits() : 0;
+
+  return TD ? TD->getPointerTypeSizeInBits(Ty) : 0;
 }
 
 static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
@@ -290,7 +291,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
     }
     if (Align > 0)
       KnownZero = APInt::getLowBitsSet(BitWidth,
-                                       CountTrailingZeros_32(Align));
+                                       countTrailingZeros(Align));
     else
       KnownZero.clearAllBits();
     KnownOne.clearAllBits();
@@ -321,7 +322,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
     }
 
     if (Align)
-      KnownZero = APInt::getLowBitsSet(BitWidth, CountTrailingZeros_32(Align));
+      KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
     return;
   }
 
@@ -613,7 +614,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
       Align = TD->getABITypeAlignment(AI->getType()->getElementType());
 
     if (Align > 0)
-      KnownZero = APInt::getLowBitsSet(BitWidth, CountTrailingZeros_32(Align));
+      KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
     break;
   }
   case Instruction::GetElementPtr: {
@@ -629,12 +630,22 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
       Value *Index = I->getOperand(i);
       if (StructType *STy = dyn_cast<StructType>(*GTI)) {
         // Handle struct member offset arithmetic.
-        if (!TD) return;
-        const StructLayout *SL = TD->getStructLayout(STy);
+        if (!TD)
+          return;
+
+        // Handle case when index is vector zeroinitializer
+        Constant *CIndex = cast<Constant>(Index);
+        if (CIndex->isZeroValue())
+          continue;
+
+        if (CIndex->getType()->isVectorTy())
+          Index = CIndex->getSplatValue();
+
         unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
+        const StructLayout *SL = TD->getStructLayout(STy);
         uint64_t Offset = SL->getElementOffset(Idx);
-        TrailZ = std::min(TrailZ,
-                          CountTrailingZeros_64(Offset));
+        TrailZ = std::min<unsigned>(TrailZ,
+                                    countTrailingZeros(Offset));
       } else {
         // Handle array index arithmetic.
         Type *IndexedTy = GTI.getIndexedType();
@@ -644,7 +655,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
         LocalKnownZero = LocalKnownOne = APInt(GEPOpiBits, 0);
         ComputeMaskedBits(Index, LocalKnownZero, LocalKnownOne, TD, Depth+1);
         TrailZ = std::min(TrailZ,
-                          unsigned(CountTrailingZeros_64(TypeSize) +
+                          unsigned(countTrailingZeros(TypeSize) +
                                    LocalKnownZero.countTrailingOnes()));
       }
     }
@@ -749,7 +760,6 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
         KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
         break;
       }
-      case Intrinsic::x86_sse42_crc32_64_8:
       case Intrinsic::x86_sse42_crc32_64_64:
         KnownZero = APInt::getHighBitsSet(64, 32);
         break;
@@ -855,6 +865,37 @@ bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth) {
     return false;
   }
 
+  // Adding a power-of-two or zero to the same power-of-two or zero yields
+  // either the original power-of-two, a larger power-of-two or zero.
+  if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
+    OverflowingBinaryOperator *VOBO = cast<OverflowingBinaryOperator>(V);
+    if (OrZero || VOBO->hasNoUnsignedWrap() || VOBO->hasNoSignedWrap()) {
+      if (match(X, m_And(m_Specific(Y), m_Value())) ||
+          match(X, m_And(m_Value(), m_Specific(Y))))
+        if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth))
+          return true;
+      if (match(Y, m_And(m_Specific(X), m_Value())) ||
+          match(Y, m_And(m_Value(), m_Specific(X))))
+        if (isKnownToBeAPowerOfTwo(X, OrZero, Depth))
+          return true;
+
+      unsigned BitWidth = V->getType()->getScalarSizeInBits();
+      APInt LHSZeroBits(BitWidth, 0), LHSOneBits(BitWidth, 0);
+      ComputeMaskedBits(X, LHSZeroBits, LHSOneBits, 0, Depth);
+
+      APInt RHSZeroBits(BitWidth, 0), RHSOneBits(BitWidth, 0);
+      ComputeMaskedBits(Y, RHSZeroBits, RHSOneBits, 0, Depth);
+      // If i8 V is a power of two or zero:
+      //  ZeroBits: 1 1 1 0 1 1 1 1
+      // ~ZeroBits: 0 0 0 1 0 0 0 0
+      if ((~(LHSZeroBits & RHSZeroBits)).isPowerOf2())
+        // If OrZero isn't set, we cannot give back a zero result.
+        // Make sure either the LHS or RHS has a bit set.
+        if (OrZero || RHSOneBits.getBoolValue() || LHSOneBits.getBoolValue())
+          return true;
+    }
+  }
+
   // An exact divide or right shift can only shift off zero bits, so the result
   // is a power of two only if the first operand is a power of two and not
   // copying a sign bit (sdiv int_min, 2).
@@ -1509,7 +1550,7 @@ Value *llvm::isBytewiseValue(Value *V) {
 // struct. To is the result struct built so far, new insertvalue instructions
 // build on that.
 static Value *BuildSubAggregate(Value *From, Value* To, Type *IndexedType,
-                                SmallVector<unsigned, 10> &Idxs,
+                                SmallVectorImpl<unsigned> &Idxs,
                                 unsigned IdxSkip,
                                 Instruction *InsertBefore) {
   llvm::StructType *STy = dyn_cast<llvm::StructType>(IndexedType);
@@ -1673,20 +1714,24 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
 /// it can be expressed as a base pointer plus a constant offset.  Return the
 /// base and offset to the caller.
 Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
-                                              const DataLayout *TD) {
+                                              const DataLayout *DL) {
   // Without DataLayout, conservatively assume 64-bit offsets, which is
   // the widest we support.
-  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
+  unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(Ptr->getType()) : 64;
   APInt ByteOffset(BitWidth, 0);
   while (1) {
     if (Ptr->getType()->isVectorTy())
       break;
 
     if (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
-      APInt GEPOffset(BitWidth, 0);
-      if (TD && !GEP->accumulateConstantOffset(*TD, GEPOffset))
-        break;
-      ByteOffset += GEPOffset;
+      if (DL) {
+        APInt GEPOffset(BitWidth, 0);
+        if (!GEP->accumulateConstantOffset(*DL, GEPOffset))
+          break;
+
+        ByteOffset += GEPOffset;
+      }
+
       Ptr = GEP->getPointerOperand();
     } else if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
       Ptr = cast<Operator>(Ptr)->getOperand(0);
@@ -2019,7 +2064,7 @@ bool llvm::isSafeToSpeculativelyExecute(const Value *V,
 
 /// isKnownNonNull - Return true if we know that the specified value is never
 /// null.
-bool llvm::isKnownNonNull(const Value *V) {
+bool llvm::isKnownNonNull(const Value *V, const TargetLibraryInfo *TLI) {
   // Alloca never returns null, malloc might.
   if (isa<AllocaInst>(V)) return true;
 
@@ -2030,5 +2075,10 @@ bool llvm::isKnownNonNull(const Value *V) {
   // Global values are not null unless extern weak.
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
     return !GV->hasExternalWeakLinkage();
+
+  // operator new never returns null.
+  if (isOperatorNewLikeFn(V, TLI, /*LookThroughBitCast=*/true))
+    return true;
+
   return false;
 }
diff --git a/lib/Archive/Archive.cpp b/lib/Archive/Archive.cpp
deleted file mode 100644
index 1f36a00..0000000
--- a/lib/Archive/Archive.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-//===-- Archive.cpp - Generic LLVM archive functions ------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the Archive and ArchiveMember
-// classes that is common to both reading and writing archives..
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Bitcode/Archive.h"
-#include "ArchiveInternals.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/system_error.h"
-#include <cstring>
-#include <memory>
-using namespace llvm;
-
-// getMemberSize - compute the actual physical size of the file member as seen
-// on disk. This isn't the size of member's payload. Use getSize() for that.
-unsigned
-ArchiveMember::getMemberSize() const {
-  // Basically its the file size plus the header size
-  unsigned result =  info.fileSize + sizeof(ArchiveMemberHeader);
-
-  // If it has a long filename, include the name length
-  if (hasLongFilename())
-    result += path.str().length() + 1;
-
-  // If its now odd lengthed, include the padding byte
-  if (result % 2 != 0 )
-    result++;
-
-  return result;
-}
-
-// This default constructor is only use by the ilist when it creates its
-// sentry node. We give it specific static values to make it stand out a bit.
-ArchiveMember::ArchiveMember()
-  : parent(0), path("--invalid--"), flags(0), data(0)
-{
-  info.user = sys::Process::GetCurrentUserId();
-  info.group = sys::Process::GetCurrentGroupId();
-  info.mode = 0777;
-  info.fileSize = 0;
-  info.modTime = sys::TimeValue::now();
-}
-
-// This is the constructor that the Archive class uses when it is building or
-// reading an archive. It just defaults a few things and ensures the parent is
-// set for the iplist. The Archive class fills in the ArchiveMember's data.
-// This is required because correctly setting the data may depend on other
-// things in the Archive.
-ArchiveMember::ArchiveMember(Archive* PAR)
-  : parent(PAR), path(), flags(0), data(0)
-{
-}
-
-// This method allows an ArchiveMember to be replaced with the data for a
-// different file, presumably as an update to the member. It also makes sure
-// the flags are reset correctly.
-bool ArchiveMember::replaceWith(const sys::Path& newFile, std::string* ErrMsg) {
-  bool Exists;
-  if (sys::fs::exists(newFile.str(), Exists) || !Exists) {
-    if (ErrMsg)
-      *ErrMsg = "Can not replace an archive member with a non-existent file";
-    return true;
-  }
-
-  data = 0;
-  path = newFile;
-
-  // SVR4 symbol tables have an empty name
-  if (path.str() == ARFILE_SVR4_SYMTAB_NAME)
-    flags |= SVR4SymbolTableFlag;
-  else
-    flags &= ~SVR4SymbolTableFlag;
-
-  // BSD4.4 symbol tables have a special name
-  if (path.str() == ARFILE_BSD4_SYMTAB_NAME)
-    flags |= BSD4SymbolTableFlag;
-  else
-    flags &= ~BSD4SymbolTableFlag;
-
-  // LLVM symbol tables have a very specific name
-  if (path.str() == ARFILE_LLVM_SYMTAB_NAME)
-    flags |= LLVMSymbolTableFlag;
-  else
-    flags &= ~LLVMSymbolTableFlag;
-
-  // String table name
-  if (path.str() == ARFILE_STRTAB_NAME)
-    flags |= StringTableFlag;
-  else
-    flags &= ~StringTableFlag;
-
-  // If it has a slash then it has a path
-  bool hasSlash = path.str().find('/') != std::string::npos;
-  if (hasSlash)
-    flags |= HasPathFlag;
-  else
-    flags &= ~HasPathFlag;
-
-  // If it has a slash or its over 15 chars then its a long filename format
-  if (hasSlash || path.str().length() > 15)
-    flags |= HasLongFilenameFlag;
-  else
-    flags &= ~HasLongFilenameFlag;
-
-  // Get the signature and status info
-  const char* signature = (const char*) data;
-  SmallString<4> magic;
-  if (!signature) {
-    sys::fs::get_magic(path.str(), magic.capacity(), magic);
-    signature = magic.c_str();
-    const sys::FileStatus *FSinfo = path.getFileStatus(false, ErrMsg);
-    if (FSinfo)
-      info = *FSinfo;
-    else
-      return true;
-  }
-
-  // Determine what kind of file it is.
-  switch (sys::IdentifyFileType(signature,4)) {
-    case sys::Bitcode_FileType:
-      flags |= BitcodeFlag;
-      break;
-    default:
-      flags &= ~BitcodeFlag;
-      break;
-  }
-  return false;
-}
-
-// Archive constructor - this is the only constructor that gets used for the
-// Archive class. Everything else (default,copy) is deprecated. This just
-// initializes and maps the file into memory, if requested.
-Archive::Archive(const sys::Path& filename, LLVMContext& C)
-  : archPath(filename), members(), mapfile(0), base(0), symTab(), strtab(),
-    symTabSize(0), firstFileOffset(0), modules(), foreignST(0), Context(C) {
-}
-
-bool
-Archive::mapToMemory(std::string* ErrMsg) {
-  OwningPtr<MemoryBuffer> File;
-  if (error_code ec = MemoryBuffer::getFile(archPath.c_str(), File)) {
-    if (ErrMsg)
-      *ErrMsg = ec.message();
-    return true;
-  }
-  mapfile = File.take();
-  base = mapfile->getBufferStart();
-  return false;
-}
-
-void Archive::cleanUpMemory() {
-  // Shutdown the file mapping
-  delete mapfile;
-  mapfile = 0;
-  base = 0;
-
-  // Forget the entire symbol table
-  symTab.clear();
-  symTabSize = 0;
-
-  firstFileOffset = 0;
-
-  // Free the foreign symbol table member
-  if (foreignST) {
-    delete foreignST;
-    foreignST = 0;
-  }
-
-  // Delete any Modules and ArchiveMember's we've allocated as a result of
-  // symbol table searches.
-  for (ModuleMap::iterator I=modules.begin(), E=modules.end(); I != E; ++I ) {
-    delete I->second.first;
-    delete I->second.second;
-  }
-}
-
-// Archive destructor - just clean up memory
-Archive::~Archive() {
-  cleanUpMemory();
-}
-
-
-
-static void getSymbols(Module*M, std::vector<std::string>& symbols) {
-  // Loop over global variables
-  for (Module::global_iterator GI = M->global_begin(), GE=M->global_end(); GI != GE; ++GI)
-    if (!GI->isDeclaration() && !GI->hasLocalLinkage())
-      if (!GI->getName().empty())
-        symbols.push_back(GI->getName());
-
-  // Loop over functions
-  for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; ++FI)
-    if (!FI->isDeclaration() && !FI->hasLocalLinkage())
-      if (!FI->getName().empty())
-        symbols.push_back(FI->getName());
-
-  // Loop over aliases
-  for (Module::alias_iterator AI = M->alias_begin(), AE = M->alias_end();
-       AI != AE; ++AI) {
-    if (AI->hasName())
-      symbols.push_back(AI->getName());
-  }
-}
-
-// Get just the externally visible defined symbols from the bitcode
-bool llvm::GetBitcodeSymbols(const sys::Path& fName,
-                             LLVMContext& Context,
-                             std::vector<std::string>& symbols,
-                             std::string* ErrMsg) {
-  OwningPtr<MemoryBuffer> Buffer;
-  if (error_code ec = MemoryBuffer::getFileOrSTDIN(fName.c_str(), Buffer)) {
-    if (ErrMsg) *ErrMsg = "Could not open file '" + fName.str() + "'" + ": "
-                        + ec.message();
-    return true;
-  }
-
-  Module *M = ParseBitcodeFile(Buffer.get(), Context, ErrMsg);
-  if (!M)
-    return true;
-
-  // Get the symbols
-  getSymbols(M, symbols);
-
-  // Done with the module.
-  delete M;
-  return true;
-}
-
-Module*
-llvm::GetBitcodeSymbols(const char *BufPtr, unsigned Length,
-                        const std::string& ModuleID,
-                        LLVMContext& Context,
-                        std::vector<std::string>& symbols,
-                        std::string* ErrMsg) {
-  // Get the module.
-  OwningPtr<MemoryBuffer> Buffer(
-    MemoryBuffer::getMemBufferCopy(StringRef(BufPtr, Length),ModuleID.c_str()));
-
-  Module *M = ParseBitcodeFile(Buffer.get(), Context, ErrMsg);
-  if (!M)
-    return 0;
-
-  // Get the symbols
-  getSymbols(M, symbols);
-
-  // Done with the module. Note that it's the caller's responsibility to delete
-  // the Module.
-  return M;
-}
diff --git a/lib/Archive/ArchiveInternals.h b/lib/Archive/ArchiveInternals.h
deleted file mode 100644
index f6c87e8..0000000
--- a/lib/Archive/ArchiveInternals.h
+++ /dev/null
@@ -1,88 +0,0 @@
-//===-- lib/Archive/ArchiveInternals.h -------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Internal implementation header for LLVM Archive files.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LIB_ARCHIVE_ARCHIVEINTERNALS_H
-#define LIB_ARCHIVE_ARCHIVEINTERNALS_H
-
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Bitcode/Archive.h"
-#include "llvm/Support/TimeValue.h"
-#include <cstring>
-
-#define ARFILE_MAGIC "!<arch>\n"                   ///< magic string
-#define ARFILE_MAGIC_LEN (sizeof(ARFILE_MAGIC)-1)  ///< length of magic string
-#define ARFILE_SVR4_SYMTAB_NAME "/               " ///< SVR4 symtab entry name
-#define ARFILE_LLVM_SYMTAB_NAME "#_LLVM_SYM_TAB_#" ///< LLVM symtab entry name
-#define ARFILE_BSD4_SYMTAB_NAME "__.SYMDEF SORTED" ///< BSD4 symtab entry name
-#define ARFILE_STRTAB_NAME      "//              " ///< Name of string table
-#define ARFILE_PAD "\n"                            ///< inter-file align padding
-#define ARFILE_MEMBER_MAGIC "`\n"                  ///< fmag field magic #
-
-namespace llvm {
-
-  class LLVMContext;
-
-  /// The ArchiveMemberHeader structure is used internally for bitcode
-  /// archives.
-  /// The header precedes each file member in the archive. This structure is
-  /// defined using character arrays for direct and correct interpretation
-  /// regardless of the endianess of the machine that produced it.
-  /// @brief Archive File Member Header
-  class ArchiveMemberHeader {
-    /// @name Data
-    /// @{
-    public:
-      char name[16];  ///< Name of the file member.
-      char date[12];  ///< File date, decimal seconds since Epoch
-      char uid[6];    ///< user id in ASCII decimal
-      char gid[6];    ///< group id in ASCII decimal
-      char mode[8];   ///< file mode in ASCII octal
-      char size[10];  ///< file size in ASCII decimal
-      char fmag[2];   ///< Always contains ARFILE_MAGIC_TERMINATOR
-
-    /// @}
-    /// @name Methods
-    /// @{
-    public:
-    void init() {
-      memset(name,' ',16);
-      memset(date,' ',12);
-      memset(uid,' ',6);
-      memset(gid,' ',6);
-      memset(mode,' ',8);
-      memset(size,' ',10);
-      fmag[0] = '`';
-      fmag[1] = '\n';
-    }
-
-    bool checkSignature() const {
-      return 0 == memcmp(fmag, ARFILE_MEMBER_MAGIC,2);
-    }
-  };
-  
-  // Get just the externally visible defined symbols from the bitcode
-  bool GetBitcodeSymbols(const sys::Path& fName,
-                          LLVMContext& Context,
-                          std::vector<std::string>& symbols,
-                          std::string* ErrMsg);
-  
-  Module* GetBitcodeSymbols(const char *Buffer, unsigned Length,
-                            const std::string& ModuleID,
-                            LLVMContext& Context,
-                            std::vector<std::string>& symbols,
-                            std::string* ErrMsg);
-}
-
-#endif
-
-// vim: sw=2 ai
diff --git a/lib/Archive/ArchiveReader.cpp b/lib/Archive/ArchiveReader.cpp
deleted file mode 100644
index 14713e6..0000000
--- a/lib/Archive/ArchiveReader.cpp
+++ /dev/null
@@ -1,633 +0,0 @@
-//===-- ArchiveReader.cpp - Read LLVM archive files -------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Builds up standard unix archive files (.a) containing LLVM bitcode.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Bitcode/Archive.h"
-#include "ArchiveInternals.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include <cstdio>
-#include <cstdlib>
-using namespace llvm;
-
-/// Read a variable-bit-rate encoded unsigned integer
-static inline unsigned readInteger(const char*&At, const char*End) {
-  unsigned Shift = 0;
-  unsigned Result = 0;
-
-  do {
-    if (At == End)
-      return Result;
-    Result |= (unsigned)((*At++) & 0x7F) << Shift;
-    Shift += 7;
-  } while (At[-1] & 0x80);
-  return Result;
-}
-
-// Completely parse the Archive's symbol table and populate symTab member var.
-bool
-Archive::parseSymbolTable(const void* data, unsigned size, std::string* error) {
-  const char* At = (const char*) data;
-  const char* End = At + size;
-  while (At < End) {
-    unsigned offset = readInteger(At, End);
-    if (At == End) {
-      if (error)
-        *error = "Ran out of data reading vbr_uint for symtab offset!";
-      return false;
-    }
-    unsigned length = readInteger(At, End);
-    if (At == End) {
-      if (error)
-        *error = "Ran out of data reading vbr_uint for symtab length!";
-      return false;
-    }
-    if (At + length > End) {
-      if (error)
-        *error = "Malformed symbol table: length not consistent with size";
-      return false;
-    }
-    // we don't care if it can't be inserted (duplicate entry)
-    symTab.insert(std::make_pair(std::string(At, length), offset));
-    At += length;
-  }
-  symTabSize = size;
-  return true;
-}
-
-// This member parses an ArchiveMemberHeader that is presumed to be pointed to
-// by At. The At pointer is updated to the byte just after the header, which
-// can be variable in size.
-ArchiveMember*
-Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
-{
-  if (At + sizeof(ArchiveMemberHeader) >= End) {
-    if (error)
-      *error = "Unexpected end of file";
-    return 0;
-  }
-
-  // Cast archive member header
-  const ArchiveMemberHeader* Hdr = (const ArchiveMemberHeader*)At;
-  At += sizeof(ArchiveMemberHeader);
-
-  int flags = 0;
-  int MemberSize = atoi(Hdr->size);
-  assert(MemberSize >= 0);
-
-  // Check the size of the member for sanity
-  if (At + MemberSize > End) {
-    if (error)
-      *error = "invalid member length in archive file";
-    return 0;
-  }
-
-  // Check the member signature
-  if (!Hdr->checkSignature()) {
-    if (error)
-      *error = "invalid file member signature";
-    return 0;
-  }
-
-  // Convert and check the member name
-  // The empty name ( '/' and 15 blanks) is for a foreign (non-LLVM) symbol
-  // table. The special name "//" and 14 blanks is for a string table, used
-  // for long file names. This library doesn't generate either of those but
-  // it will accept them. If the name starts with #1/ and the remainder is
-  // digits, then those digits specify the length of the name that is
-  // stored immediately following the header. The special name
-  // __LLVM_SYM_TAB__ identifies the symbol table for LLVM bitcode.
-  // Anything else is a regular, short filename that is terminated with
-  // a '/' and blanks.
-
-  std::string pathname;
-  switch (Hdr->name[0]) {
-    case '#':
-      if (Hdr->name[1] == '1' && Hdr->name[2] == '/') {
-        if (isdigit(Hdr->name[3])) {
-          unsigned len = atoi(&Hdr->name[3]);
-          const char *nulp = (const char *)memchr(At, '\0', len);
-          pathname.assign(At, nulp != 0 ? (uintptr_t)(nulp - At) : len);
-          At += len;
-          MemberSize -= len;
-          flags |= ArchiveMember::HasLongFilenameFlag;
-        } else {
-          if (error)
-            *error = "invalid long filename";
-          return 0;
-        }
-      } else if (Hdr->name[1] == '_' &&
-                 (0 == memcmp(Hdr->name, ARFILE_LLVM_SYMTAB_NAME, 16))) {
-        // The member is using a long file name (>15 chars) format.
-        // This format is standard for 4.4BSD and Mac OSX operating
-        // systems. LLVM uses it similarly. In this format, the
-        // remainder of the name field (after #1/) specifies the
-        // length of the file name which occupy the first bytes of
-        // the member's data. The pathname already has the #1/ stripped.
-        pathname.assign(ARFILE_LLVM_SYMTAB_NAME);
-        flags |= ArchiveMember::LLVMSymbolTableFlag;
-      }
-      break;
-    case '/':
-      if (Hdr->name[1]== '/') {
-        if (0 == memcmp(Hdr->name, ARFILE_STRTAB_NAME, 16)) {
-          pathname.assign(ARFILE_STRTAB_NAME);
-          flags |= ArchiveMember::StringTableFlag;
-        } else {
-          if (error)
-            *error = "invalid string table name";
-          return 0;
-        }
-      } else if (Hdr->name[1] == ' ') {
-        if (0 == memcmp(Hdr->name, ARFILE_SVR4_SYMTAB_NAME, 16)) {
-          pathname.assign(ARFILE_SVR4_SYMTAB_NAME);
-          flags |= ArchiveMember::SVR4SymbolTableFlag;
-        } else {
-          if (error)
-            *error = "invalid SVR4 symbol table name";
-          return 0;
-        }
-      } else if (isdigit(Hdr->name[1])) {
-        unsigned index = atoi(&Hdr->name[1]);
-        if (index < strtab.length()) {
-          const char* namep = strtab.c_str() + index;
-          const char* endp = strtab.c_str() + strtab.length();
-          const char* p = namep;
-          const char* last_p = p;
-          while (p < endp) {
-            if (*p == '\n' && *last_p == '/') {
-              pathname.assign(namep, last_p - namep);
-              flags |= ArchiveMember::HasLongFilenameFlag;
-              break;
-            }
-            last_p = p;
-            p++;
-          }
-          if (p >= endp) {
-            if (error)
-              *error = "missing name terminator in string table";
-            return 0;
-          }
-        } else {
-          if (error)
-            *error = "name index beyond string table";
-          return 0;
-        }
-      }
-      break;
-    case '_':
-      if (Hdr->name[1] == '_' &&
-          (0 == memcmp(Hdr->name, ARFILE_BSD4_SYMTAB_NAME, 16))) {
-        pathname.assign(ARFILE_BSD4_SYMTAB_NAME);
-        flags |= ArchiveMember::BSD4SymbolTableFlag;
-        break;
-      }
-      /* FALL THROUGH */
-
-    default:
-      const char* slash = (const char*) memchr(Hdr->name, '/', 16);
-      if (slash == 0)
-        slash = Hdr->name + 16;
-      pathname.assign(Hdr->name, slash - Hdr->name);
-      break;
-  }
-
-  // Determine if this is a bitcode file
-  switch (sys::IdentifyFileType(At, 4)) {
-    case sys::Bitcode_FileType:
-      flags |= ArchiveMember::BitcodeFlag;
-      break;
-    default:
-      flags &= ~ArchiveMember::BitcodeFlag;
-      break;
-  }
-
-  // Instantiate the ArchiveMember to be filled
-  ArchiveMember* member = new ArchiveMember(this);
-
-  // Fill in fields of the ArchiveMember
-  member->parent = this;
-  member->path.set(pathname);
-  member->info.fileSize = MemberSize;
-  member->info.modTime.fromEpochTime(atoi(Hdr->date));
-  unsigned int mode;
-  sscanf(Hdr->mode, "%o", &mode);
-  member->info.mode = mode;
-  member->info.user = atoi(Hdr->uid);
-  member->info.group = atoi(Hdr->gid);
-  member->flags = flags;
-  member->data = At;
-
-  return member;
-}
-
-bool
-Archive::checkSignature(std::string* error) {
-  // Check the magic string at file's header
-  if (mapfile->getBufferSize() < 8 || memcmp(base, ARFILE_MAGIC, 8)) {
-    if (error)
-      *error = "invalid signature for an archive file";
-    return false;
-  }
-  return true;
-}
-
-// This function loads the entire archive and fully populates its ilist with
-// the members of the archive file. This is typically used in preparation for
-// editing the contents of the archive.
-bool
-Archive::loadArchive(std::string* error) {
-
-  // Set up parsing
-  members.clear();
-  symTab.clear();
-  const char *At = base;
-  const char *End = mapfile->getBufferEnd();
-
-  if (!checkSignature(error))
-    return false;
-
-  At += 8;  // Skip the magic string.
-
-  bool seenSymbolTable = false;
-  bool foundFirstFile = false;
-  while (At < End) {
-    // parse the member header
-    const char* Save = At;
-    ArchiveMember* mbr = parseMemberHeader(At, End, error);
-    if (!mbr)
-      return false;
-
-    // check if this is the foreign symbol table
-    if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
-      // We just save this but don't do anything special
-      // with it. It doesn't count as the "first file".
-      if (foreignST) {
-        // What? Multiple foreign symbol tables? Just chuck it
-        // and retain the last one found.
-        delete foreignST;
-      }
-      foreignST = mbr;
-      At += mbr->getSize();
-      if ((intptr_t(At) & 1) == 1)
-        At++;
-    } else if (mbr->isStringTable()) {
-      // Simply suck the entire string table into a string
-      // variable. This will be used to get the names of the
-      // members that use the "/ddd" format for their names
-      // (SVR4 style long names).
-      strtab.assign(At, mbr->getSize());
-      At += mbr->getSize();
-      if ((intptr_t(At) & 1) == 1)
-        At++;
-      delete mbr;
-    } else if (mbr->isLLVMSymbolTable()) {
-      // This is the LLVM symbol table for the archive. If we've seen it
-      // already, its an error. Otherwise, parse the symbol table and move on.
-      if (seenSymbolTable) {
-        if (error)
-          *error = "invalid archive: multiple symbol tables";
-        return false;
-      }
-      if (!parseSymbolTable(mbr->getData(), mbr->getSize(), error))
-        return false;
-      seenSymbolTable = true;
-      At += mbr->getSize();
-      if ((intptr_t(At) & 1) == 1)
-        At++;
-      delete mbr; // We don't need this member in the list of members.
-    } else {
-      // This is just a regular file. If its the first one, save its offset.
-      // Otherwise just push it on the list and move on to the next file.
-      if (!foundFirstFile) {
-        firstFileOffset = Save - base;
-        foundFirstFile = true;
-      }
-      members.push_back(mbr);
-      At += mbr->getSize();
-      if ((intptr_t(At) & 1) == 1)
-        At++;
-    }
-  }
-  return true;
-}
-
-// Open and completely load the archive file.
-Archive*
-Archive::OpenAndLoad(const sys::Path& File, LLVMContext& C,
-                     std::string* ErrorMessage) {
-  OwningPtr<Archive> result ( new Archive(File, C));
-  if (result->mapToMemory(ErrorMessage))
-    return NULL;
-  if (!result->loadArchive(ErrorMessage))
-    return NULL;
-  return result.take();
-}
-
-// Get all the bitcode modules from the archive
-bool
-Archive::getAllModules(std::vector<Module*>& Modules,
-                       std::string* ErrMessage) {
-
-  for (iterator I=begin(), E=end(); I != E; ++I) {
-    if (I->isBitcode()) {
-      std::string FullMemberName = archPath.str() +
-        "(" + I->getPath().str() + ")";
-      MemoryBuffer *Buffer =
-        MemoryBuffer::getMemBufferCopy(StringRef(I->getData(), I->getSize()),
-                                       FullMemberName.c_str());
-      
-      Module *M = ParseBitcodeFile(Buffer, Context, ErrMessage);
-      delete Buffer;
-      if (!M)
-        return true;
-
-      Modules.push_back(M);
-    }
-  }
-  return false;
-}
-
-// Load just the symbol table from the archive file
-bool
-Archive::loadSymbolTable(std::string* ErrorMsg) {
-
-  // Set up parsing
-  members.clear();
-  symTab.clear();
-  const char *At = base;
-  const char *End = mapfile->getBufferEnd();
-
-  // Make sure we're dealing with an archive
-  if (!checkSignature(ErrorMsg))
-    return false;
-
-  At += 8; // Skip signature
-
-  // Parse the first file member header
-  const char* FirstFile = At;
-  ArchiveMember* mbr = parseMemberHeader(At, End, ErrorMsg);
-  if (!mbr)
-    return false;
-
-  if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
-    // Skip the foreign symbol table, we don't do anything with it
-    At += mbr->getSize();
-    if ((intptr_t(At) & 1) == 1)
-      At++;
-    delete mbr;
-
-    // Read the next one
-    FirstFile = At;
-    mbr = parseMemberHeader(At, End, ErrorMsg);
-    if (!mbr) {
-      delete mbr;
-      return false;
-    }
-  }
-
-  if (mbr->isStringTable()) {
-    // Process the string table entry
-    strtab.assign((const char*)mbr->getData(), mbr->getSize());
-    At += mbr->getSize();
-    if ((intptr_t(At) & 1) == 1)
-      At++;
-    delete mbr;
-    // Get the next one
-    FirstFile = At;
-    mbr = parseMemberHeader(At, End, ErrorMsg);
-    if (!mbr) {
-      delete mbr;
-      return false;
-    }
-  }
-
-  // See if its the symbol table
-  if (mbr->isLLVMSymbolTable()) {
-    if (!parseSymbolTable(mbr->getData(), mbr->getSize(), ErrorMsg)) {
-      delete mbr;
-      return false;
-    }
-
-    At += mbr->getSize();
-    if ((intptr_t(At) & 1) == 1)
-      At++;
-    delete mbr;
-    // Can't be any more symtab headers so just advance
-    FirstFile = At;
-  } else {
-    // There's no symbol table in the file. We have to rebuild it from scratch
-    // because the intent of this method is to get the symbol table loaded so
-    // it can be searched efficiently.
-    // Add the member to the members list
-    members.push_back(mbr);
-  }
-
-  firstFileOffset = FirstFile - base;
-  return true;
-}
-
-// Open the archive and load just the symbol tables
-Archive* Archive::OpenAndLoadSymbols(const sys::Path& File,
-                                     LLVMContext& C,
-                                     std::string* ErrorMessage) {
-  OwningPtr<Archive> result ( new Archive(File, C) );
-  if (result->mapToMemory(ErrorMessage))
-    return NULL;
-  if (!result->loadSymbolTable(ErrorMessage))
-    return NULL;
-  return result.take();
-}
-
-// Look up one symbol in the symbol table and return the module that defines
-// that symbol.
-Module*
-Archive::findModuleDefiningSymbol(const std::string& symbol, 
-                                  std::string* ErrMsg) {
-  SymTabType::iterator SI = symTab.find(symbol);
-  if (SI == symTab.end())
-    return 0;
-
-  // The symbol table was previously constructed assuming that the members were
-  // written without the symbol table header. Because VBR encoding is used, the
-  // values could not be adjusted to account for the offset of the symbol table
-  // because that could affect the size of the symbol table due to VBR encoding.
-  // We now have to account for this by adjusting the offset by the size of the
-  // symbol table and its header.
-  unsigned fileOffset =
-    SI->second +                // offset in symbol-table-less file
-    firstFileOffset;            // add offset to first "real" file in archive
-
-  // See if the module is already loaded
-  ModuleMap::iterator MI = modules.find(fileOffset);
-  if (MI != modules.end())
-    return MI->second.first;
-
-  // Module hasn't been loaded yet, we need to load it
-  const char* modptr = base + fileOffset;
-  ArchiveMember* mbr = parseMemberHeader(modptr, mapfile->getBufferEnd(),
-                                         ErrMsg);
-  if (!mbr)
-    return 0;
-
-  // Now, load the bitcode module to get the Module.
-  std::string FullMemberName = archPath.str() + "(" +
-    mbr->getPath().str() + ")";
-  MemoryBuffer *Buffer =
-    MemoryBuffer::getMemBufferCopy(StringRef(mbr->getData(), mbr->getSize()),
-                                   FullMemberName.c_str());
-  
-  Module *m = getLazyBitcodeModule(Buffer, Context, ErrMsg);
-  if (!m)
-    return 0;
-
-  modules.insert(std::make_pair(fileOffset, std::make_pair(m, mbr)));
-
-  return m;
-}
-
-// Look up multiple symbols in the symbol table and return a set of
-// Modules that define those symbols.
-bool
-Archive::findModulesDefiningSymbols(std::set<std::string>& symbols,
-                                    SmallVectorImpl<Module*>& result,
-                                    std::string* error) {
-  if (!mapfile || !base) {
-    if (error)
-      *error = "Empty archive invalid for finding modules defining symbols";
-    return false;
-  }
-
-  if (symTab.empty()) {
-    // We don't have a symbol table, so we must build it now but lets also
-    // make sure that we populate the modules table as we do this to ensure
-    // that we don't load them twice when findModuleDefiningSymbol is called
-    // below.
-
-    // Get a pointer to the first file
-    const char* At  = base + firstFileOffset;
-    const char* End = mapfile->getBufferEnd();
-
-    while ( At < End) {
-      // Compute the offset to be put in the symbol table
-      unsigned offset = At - base - firstFileOffset;
-
-      // Parse the file's header
-      ArchiveMember* mbr = parseMemberHeader(At, End, error);
-      if (!mbr)
-        return false;
-
-      // If it contains symbols
-      if (mbr->isBitcode()) {
-        // Get the symbols
-        std::vector<std::string> symbols;
-        std::string FullMemberName = archPath.str() + "(" +
-          mbr->getPath().str() + ")";
-        Module* M = 
-          GetBitcodeSymbols(At, mbr->getSize(), FullMemberName, Context,
-                            symbols, error);
-
-        if (M) {
-          // Insert the module's symbols into the symbol table
-          for (std::vector<std::string>::iterator I = symbols.begin(),
-               E=symbols.end(); I != E; ++I ) {
-            symTab.insert(std::make_pair(*I, offset));
-          }
-          // Insert the Module and the ArchiveMember into the table of
-          // modules.
-          modules.insert(std::make_pair(offset, std::make_pair(M, mbr)));
-        } else {
-          if (error)
-            *error = "Can't parse bitcode member: " + 
-              mbr->getPath().str() + ": " + *error;
-          delete mbr;
-          return false;
-        }
-      }
-
-      // Go to the next file location
-      At += mbr->getSize();
-      if ((intptr_t(At) & 1) == 1)
-        At++;
-    }
-  }
-
-  // At this point we have a valid symbol table (one way or another) so we
-  // just use it to quickly find the symbols requested.
-
-  SmallPtrSet<Module*, 16> Added;
-  for (std::set<std::string>::iterator I=symbols.begin(),
-         Next = I,
-         E=symbols.end(); I != E; I = Next) {
-    // Increment Next before we invalidate it.
-    ++Next;
-
-    // See if this symbol exists
-    Module* m = findModuleDefiningSymbol(*I,error);
-    if (!m)
-      continue;
-    bool NewMember = Added.insert(m);
-    if (!NewMember)
-      continue;
-
-    // The symbol exists, insert the Module into our result.
-    result.push_back(m);
-
-    // Remove the symbol now that its been resolved.
-    symbols.erase(I);
-  }
-  return true;
-}
-
-bool Archive::isBitcodeArchive() {
-  // Make sure the symTab has been loaded. In most cases this should have been
-  // done when the archive was constructed, but still,  this is just in case.
-  if (symTab.empty())
-    if (!loadSymbolTable(0))
-      return false;
-
-  // Now that we know it's been loaded, return true
-  // if it has a size
-  if (symTab.size()) return true;
-
-  // We still can't be sure it isn't a bitcode archive
-  if (!loadArchive(0))
-    return false;
-
-  std::vector<Module *> Modules;
-  std::string ErrorMessage;
-
-  // Scan the archive, trying to load a bitcode member.  We only load one to
-  // see if this works.
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    if (!I->isBitcode())
-      continue;
-    
-    std::string FullMemberName = 
-      archPath.str() + "(" + I->getPath().str() + ")";
-
-    MemoryBuffer *Buffer =
-      MemoryBuffer::getMemBufferCopy(StringRef(I->getData(), I->getSize()),
-                                     FullMemberName.c_str());
-    Module *M = ParseBitcodeFile(Buffer, Context);
-    delete Buffer;
-    if (!M)
-      return false;  // Couldn't parse bitcode, not a bitcode archive.
-    delete M;
-    return true;
-  }
-  
-  return false;
-}
diff --git a/lib/Archive/ArchiveWriter.cpp b/lib/Archive/ArchiveWriter.cpp
deleted file mode 100644
index 3eba701..0000000
--- a/lib/Archive/ArchiveWriter.cpp
+++ /dev/null
@@ -1,489 +0,0 @@
-//===-- ArchiveWriter.cpp - Write LLVM archive files ----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Builds up an LLVM archive file (.a) containing LLVM bitcode.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Bitcode/Archive.h"
-#include "ArchiveInternals.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Process.h"
-#include "llvm/Support/Signals.h"
-#include "llvm/Support/system_error.h"
-#include <fstream>
-#include <iomanip>
-#include <ostream>
-using namespace llvm;
-
-// Write an integer using variable bit rate encoding. This saves a few bytes
-// per entry in the symbol table.
-static inline void writeInteger(unsigned num, std::ofstream& ARFile) {
-  while (1) {
-    if (num < 0x80) { // done?
-      ARFile << (unsigned char)num;
-      return;
-    }
-
-    // Nope, we are bigger than a character, output the next 7 bits and set the
-    // high bit to say that there is more coming...
-    ARFile << (unsigned char)(0x80 | ((unsigned char)num & 0x7F));
-    num >>= 7;  // Shift out 7 bits now...
-  }
-}
-
-// Compute how many bytes are taken by a given VBR encoded value. This is needed
-// to pre-compute the size of the symbol table.
-static inline unsigned numVbrBytes(unsigned num) {
-
-  // Note that the following nested ifs are somewhat equivalent to a binary
-  // search. We split it in half by comparing against 2^14 first. This allows
-  // most reasonable values to be done in 2 comparisons instead of 1 for
-  // small ones and four for large ones. We expect this to access file offsets
-  // in the 2^10 to 2^24 range and symbol lengths in the 2^0 to 2^8 range,
-  // so this approach is reasonable.
-  if (num < 1<<14) {
-    if (num < 1<<7)
-      return 1;
-    else
-      return 2;
-  }
-  if (num < 1<<21)
-    return 3;
-
-  if (num < 1<<28)
-    return 4;
-  return 5; // anything >= 2^28 takes 5 bytes
-}
-
-// Create an empty archive.
-Archive* Archive::CreateEmpty(const sys::Path& FilePath, LLVMContext& C) {
-  Archive* result = new Archive(FilePath, C);
-  return result;
-}
-
-// Fill the ArchiveMemberHeader with the information from a member. If
-// TruncateNames is true, names are flattened to 15 chars or less. The sz field
-// is provided here instead of coming from the mbr because the member might be
-// stored compressed and the compressed size is not the ArchiveMember's size.
-// Furthermore compressed files have negative size fields to identify them as
-// compressed.
-bool
-Archive::fillHeader(const ArchiveMember &mbr, ArchiveMemberHeader& hdr,
-                    int sz, bool TruncateNames) const {
-
-  // Set the permissions mode, uid and gid
-  hdr.init();
-  char buffer[32];
-  sprintf(buffer, "%-8o", mbr.getMode());
-  memcpy(hdr.mode,buffer,8);
-  sprintf(buffer,  "%-6u", mbr.getUser());
-  memcpy(hdr.uid,buffer,6);
-  sprintf(buffer,  "%-6u", mbr.getGroup());
-  memcpy(hdr.gid,buffer,6);
-
-  // Set the last modification date
-  uint64_t secondsSinceEpoch = mbr.getModTime().toEpochTime();
-  sprintf(buffer,"%-12u", unsigned(secondsSinceEpoch));
-  memcpy(hdr.date,buffer,12);
-
-  // Get rid of trailing blanks in the name
-  std::string mbrPath = mbr.getPath().str();
-  size_t mbrLen = mbrPath.length();
-  while (mbrLen > 0 && mbrPath[mbrLen-1] == ' ') {
-    mbrPath.erase(mbrLen-1,1);
-    mbrLen--;
-  }
-
-  // Set the name field in one of its various flavors.
-  bool writeLongName = false;
-  if (mbr.isStringTable()) {
-    memcpy(hdr.name,ARFILE_STRTAB_NAME,16);
-  } else if (mbr.isSVR4SymbolTable()) {
-    memcpy(hdr.name,ARFILE_SVR4_SYMTAB_NAME,16);
-  } else if (mbr.isBSD4SymbolTable()) {
-    memcpy(hdr.name,ARFILE_BSD4_SYMTAB_NAME,16);
-  } else if (mbr.isLLVMSymbolTable()) {
-    memcpy(hdr.name,ARFILE_LLVM_SYMTAB_NAME,16);
-  } else if (TruncateNames) {
-    const char* nm = mbrPath.c_str();
-    unsigned len = mbrPath.length();
-    size_t slashpos = mbrPath.rfind('/');
-    if (slashpos != std::string::npos) {
-      nm += slashpos + 1;
-      len -= slashpos +1;
-    }
-    if (len > 15)
-      len = 15;
-    memcpy(hdr.name,nm,len);
-    hdr.name[len] = '/';
-  } else if (mbrPath.length() < 16 && mbrPath.find('/') == std::string::npos) {
-    memcpy(hdr.name,mbrPath.c_str(),mbrPath.length());
-    hdr.name[mbrPath.length()] = '/';
-  } else {
-    std::string nm = "#1/";
-    nm += utostr(mbrPath.length());
-    memcpy(hdr.name,nm.data(),nm.length());
-    if (sz < 0)
-      sz -= mbrPath.length();
-    else
-      sz += mbrPath.length();
-    writeLongName = true;
-  }
-
-  // Set the size field
-  if (sz < 0) {
-    buffer[0] = '-';
-    sprintf(&buffer[1],"%-9u",(unsigned)-sz);
-  } else {
-    sprintf(buffer, "%-10u", (unsigned)sz);
-  }
-  memcpy(hdr.size,buffer,10);
-
-  return writeLongName;
-}
-
-// Insert a file into the archive before some other member. This also takes care
-// of extracting the necessary flags and information from the file.
-bool
-Archive::addFileBefore(const sys::Path& filePath, iterator where,
-                        std::string* ErrMsg) {
-  bool Exists;
-  if (sys::fs::exists(filePath.str(), Exists) || !Exists) {
-    if (ErrMsg)
-      *ErrMsg = "Can not add a non-existent file to archive";
-    return true;
-  }
-
-  ArchiveMember* mbr = new ArchiveMember(this);
-
-  mbr->data = 0;
-  mbr->path = filePath;
-  const sys::FileStatus *FSInfo = mbr->path.getFileStatus(false, ErrMsg);
-  if (!FSInfo) {
-    delete mbr;
-    return true;
-  }
-  mbr->info = *FSInfo;
-
-  unsigned flags = 0;
-  bool hasSlash = filePath.str().find('/') != std::string::npos;
-  if (hasSlash)
-    flags |= ArchiveMember::HasPathFlag;
-  if (hasSlash || filePath.str().length() > 15)
-    flags |= ArchiveMember::HasLongFilenameFlag;
-
-  sys::fs::file_magic type;
-  if (sys::fs::identify_magic(mbr->path.str(), type))
-    type = sys::fs::file_magic::unknown;
-  switch (type) {
-    case sys::fs::file_magic::bitcode:
-      flags |= ArchiveMember::BitcodeFlag;
-      break;
-    default:
-      break;
-  }
-  mbr->flags = flags;
-  members.insert(where,mbr);
-  return false;
-}
-
-// Write one member out to the file.
-bool
-Archive::writeMember(
-  const ArchiveMember& member,
-  std::ofstream& ARFile,
-  bool CreateSymbolTable,
-  bool TruncateNames,
-  std::string* ErrMsg
-) {
-
-  unsigned filepos = ARFile.tellp();
-  filepos -= 8;
-
-  // Get the data and its size either from the
-  // member's in-memory data or directly from the file.
-  size_t fSize = member.getSize();
-  const char *data = (const char*)member.getData();
-  MemoryBuffer *mFile = 0;
-  if (!data) {
-    OwningPtr<MemoryBuffer> File;
-    if (error_code ec = MemoryBuffer::getFile(member.getPath().c_str(), File)) {
-      if (ErrMsg)
-        *ErrMsg = ec.message();
-      return true;
-    }
-    mFile = File.take();
-    data = mFile->getBufferStart();
-    fSize = mFile->getBufferSize();
-  }
-
-  // Now that we have the data in memory, update the
-  // symbol table if it's a bitcode file.
-  if (CreateSymbolTable && member.isBitcode()) {
-    std::vector<std::string> symbols;
-    std::string FullMemberName = archPath.str() + "(" + member.getPath().str()
-      + ")";
-    Module* M =
-      GetBitcodeSymbols(data, fSize, FullMemberName, Context, symbols, ErrMsg);
-
-    // If the bitcode parsed successfully
-    if ( M ) {
-      for (std::vector<std::string>::iterator SI = symbols.begin(),
-           SE = symbols.end(); SI != SE; ++SI) {
-
-        std::pair<SymTabType::iterator,bool> Res =
-          symTab.insert(std::make_pair(*SI,filepos));
-
-        if (Res.second) {
-          symTabSize += SI->length() +
-                        numVbrBytes(SI->length()) +
-                        numVbrBytes(filepos);
-        }
-      }
-      // We don't need this module any more.
-      delete M;
-    } else {
-      delete mFile;
-      if (ErrMsg)
-        *ErrMsg = "Can't parse bitcode member: " + member.getPath().str()
-          + ": " + *ErrMsg;
-      return true;
-    }
-  }
-
-  int hdrSize = fSize;
-
-  // Compute the fields of the header
-  ArchiveMemberHeader Hdr;
-  bool writeLongName = fillHeader(member,Hdr,hdrSize,TruncateNames);
-
-  // Write header to archive file
-  ARFile.write((char*)&Hdr, sizeof(Hdr));
-
-  // Write the long filename if its long
-  if (writeLongName) {
-    ARFile.write(member.getPath().str().data(),
-                 member.getPath().str().length());
-  }
-
-  // Write the (possibly compressed) member's content to the file.
-  ARFile.write(data,fSize);
-
-  // Make sure the member is an even length
-  if ((ARFile.tellp() & 1) == 1)
-    ARFile << ARFILE_PAD;
-
-  // Close the mapped file if it was opened
-  delete mFile;
-  return false;
-}
-
-// Write out the LLVM symbol table as an archive member to the file.
-void
-Archive::writeSymbolTable(std::ofstream& ARFile) {
-
-  // Construct the symbol table's header
-  ArchiveMemberHeader Hdr;
-  Hdr.init();
-  memcpy(Hdr.name,ARFILE_LLVM_SYMTAB_NAME,16);
-  uint64_t secondsSinceEpoch = sys::TimeValue::now().toEpochTime();
-  char buffer[32];
-  sprintf(buffer, "%-8o", 0644);
-  memcpy(Hdr.mode,buffer,8);
-  sprintf(buffer, "%-6u", sys::Process::GetCurrentUserId());
-  memcpy(Hdr.uid,buffer,6);
-  sprintf(buffer, "%-6u", sys::Process::GetCurrentGroupId());
-  memcpy(Hdr.gid,buffer,6);
-  sprintf(buffer,"%-12u", unsigned(secondsSinceEpoch));
-  memcpy(Hdr.date,buffer,12);
-  sprintf(buffer,"%-10u",symTabSize);
-  memcpy(Hdr.size,buffer,10);
-
-  // Write the header
-  ARFile.write((char*)&Hdr, sizeof(Hdr));
-
-#ifndef NDEBUG
-  // Save the starting position of the symbol tables data content.
-  unsigned startpos = ARFile.tellp();
-#endif
-
-  // Write out the symbols sequentially
-  for ( Archive::SymTabType::iterator I = symTab.begin(), E = symTab.end();
-        I != E; ++I)
-  {
-    // Write out the file index
-    writeInteger(I->second, ARFile);
-    // Write out the length of the symbol
-    writeInteger(I->first.length(), ARFile);
-    // Write out the symbol
-    ARFile.write(I->first.data(), I->first.length());
-  }
-
-#ifndef NDEBUG
-  // Now that we're done with the symbol table, get the ending file position
-  unsigned endpos = ARFile.tellp();
-#endif
-
-  // Make sure that the amount we wrote is what we pre-computed. This is
-  // critical for file integrity purposes.
-  assert(endpos - startpos == symTabSize && "Invalid symTabSize computation");
-
-  // Make sure the symbol table is even sized
-  if (symTabSize % 2 != 0 )
-    ARFile << ARFILE_PAD;
-}
-
-// Write the entire archive to the file specified when the archive was created.
-// This writes to a temporary file first. Options are for creating a symbol
-// table, flattening the file names (no directories, 15 chars max) and
-// compressing each archive member.
-bool
-Archive::writeToDisk(bool CreateSymbolTable, bool TruncateNames,
-                     std::string* ErrMsg)
-{
-  // Make sure they haven't opened up the file, not loaded it,
-  // but are now trying to write it which would wipe out the file.
-  if (members.empty() && mapfile && mapfile->getBufferSize() > 8) {
-    if (ErrMsg)
-      *ErrMsg = "Can't write an archive not opened for writing";
-    return true;
-  }
-
-  // Create a temporary file to store the archive in
-  sys::Path TmpArchive = archPath;
-  if (TmpArchive.createTemporaryFileOnDisk(ErrMsg))
-    return true;
-
-  // Make sure the temporary gets removed if we crash
-  sys::RemoveFileOnSignal(TmpArchive);
-
-  // Create archive file for output.
-  std::ios::openmode io_mode = std::ios::out | std::ios::trunc |
-                               std::ios::binary;
-  std::ofstream ArchiveFile(TmpArchive.c_str(), io_mode);
-
-  // Check for errors opening or creating archive file.
-  if (!ArchiveFile.is_open() || ArchiveFile.bad()) {
-    TmpArchive.eraseFromDisk();
-    if (ErrMsg)
-      *ErrMsg = "Error opening archive file: " + archPath.str();
-    return true;
-  }
-
-  // If we're creating a symbol table, reset it now
-  if (CreateSymbolTable) {
-    symTabSize = 0;
-    symTab.clear();
-  }
-
-  // Write magic string to archive.
-  ArchiveFile << ARFILE_MAGIC;
-
-  // Loop over all member files, and write them out. Note that this also
-  // builds the symbol table, symTab.
-  for (MembersList::iterator I = begin(), E = end(); I != E; ++I) {
-    if (writeMember(*I, ArchiveFile, CreateSymbolTable,
-                     TruncateNames, ErrMsg)) {
-      TmpArchive.eraseFromDisk();
-      ArchiveFile.close();
-      return true;
-    }
-  }
-
-  // Close archive file.
-  ArchiveFile.close();
-
-  // Write the symbol table
-  if (CreateSymbolTable) {
-    // At this point we have written a file that is a legal archive but it
-    // doesn't have a symbol table in it. To aid in faster reading and to
-    // ensure compatibility with other archivers we need to put the symbol
-    // table first in the file. Unfortunately, this means mapping the file
-    // we just wrote back in and copying it to the destination file.
-    sys::Path FinalFilePath = archPath;
-
-    // Map in the archive we just wrote.
-    {
-    OwningPtr<MemoryBuffer> arch;
-    if (error_code ec = MemoryBuffer::getFile(TmpArchive.c_str(), arch)) {
-      if (ErrMsg)
-        *ErrMsg = ec.message();
-      return true;
-    }
-    const char* base = arch->getBufferStart();
-
-    // Open another temporary file in order to avoid invalidating the
-    // mmapped data
-    if (FinalFilePath.createTemporaryFileOnDisk(ErrMsg))
-      return true;
-    sys::RemoveFileOnSignal(FinalFilePath);
-
-    std::ofstream FinalFile(FinalFilePath.c_str(), io_mode);
-    if (!FinalFile.is_open() || FinalFile.bad()) {
-      TmpArchive.eraseFromDisk();
-      if (ErrMsg)
-        *ErrMsg = "Error opening archive file: " + FinalFilePath.str();
-      return true;
-    }
-
-    // Write the file magic number
-    FinalFile << ARFILE_MAGIC;
-
-    // If there is a foreign symbol table, put it into the file now. Most
-    // ar(1) implementations require the symbol table to be first but llvm-ar
-    // can deal with it being after a foreign symbol table. This ensures
-    // compatibility with other ar(1) implementations as well as allowing the
-    // archive to store both native .o and LLVM .bc files, both indexed.
-    if (foreignST) {
-      if (writeMember(*foreignST, FinalFile, false, false, ErrMsg)) {
-        FinalFile.close();
-        TmpArchive.eraseFromDisk();
-        return true;
-      }
-    }
-
-    // Put out the LLVM symbol table now.
-    writeSymbolTable(FinalFile);
-
-    // Copy the temporary file contents being sure to skip the file's magic
-    // number.
-    FinalFile.write(base + sizeof(ARFILE_MAGIC)-1,
-      arch->getBufferSize()-sizeof(ARFILE_MAGIC)+1);
-
-    // Close up shop
-    FinalFile.close();
-    } // free arch.
-
-    // Move the final file over top of TmpArchive
-    if (FinalFilePath.renamePathOnDisk(TmpArchive, ErrMsg))
-      return true;
-  }
-
-  // Before we replace the actual archive, we need to forget all the
-  // members, since they point to data in that old archive. We need to do
-  // this because we cannot replace an open file on Windows.
-  cleanUpMemory();
-
-  if (TmpArchive.renamePathOnDisk(archPath, ErrMsg))
-    return true;
-
-  // Set correct read and write permissions after temporary file is moved
-  // to final destination path.
-  if (archPath.makeReadableOnDisk(ErrMsg))
-    return true;
-  if (archPath.makeWriteableOnDisk(ErrMsg))
-    return true;
-
-  return false;
-}
diff --git a/lib/Archive/CMakeLists.txt b/lib/Archive/CMakeLists.txt
deleted file mode 100644
index 7ff478a..0000000
--- a/lib/Archive/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-add_llvm_library(LLVMArchive
-  Archive.cpp
-  ArchiveReader.cpp
-  ArchiveWriter.cpp
-  )
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index e7a9f2a..1e6085b 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -478,12 +478,10 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(private);
   KEYWORD(linker_private);
   KEYWORD(linker_private_weak);
-  KEYWORD(linker_private_weak_def_auto); // FIXME: For backwards compatibility.
   KEYWORD(internal);
   KEYWORD(available_externally);
   KEYWORD(linkonce);
   KEYWORD(linkonce_odr);
-  KEYWORD(linkonce_odr_auto_hide);
   KEYWORD(weak);
   KEYWORD(weak_odr);
   KEYWORD(appending);
@@ -540,6 +538,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(alignstack);
   KEYWORD(inteldialect);
   KEYWORD(gc);
+  KEYWORD(prefix);
 
   KEYWORD(ccc);
   KEYWORD(fastcc);
@@ -556,6 +555,10 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(spir_kernel);
   KEYWORD(spir_func);
   KEYWORD(intel_ocl_bicc);
+  KEYWORD(x86_64_sysvcc);
+  KEYWORD(x86_64_win64cc);
+  KEYWORD(webkit_jscc);
+  KEYWORD(anyregcc);
 
   KEYWORD(cc);
   KEYWORD(c);
@@ -563,7 +566,9 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(attributes);
 
   KEYWORD(alwaysinline);
+  KEYWORD(builtin);
   KEYWORD(byval);
+  KEYWORD(cold);
   KEYWORD(inlinehint);
   KEYWORD(inreg);
   KEYWORD(minsize);
@@ -579,6 +584,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(noredzone);
   KEYWORD(noreturn);
   KEYWORD(nounwind);
+  KEYWORD(optnone);
   KEYWORD(optsize);
   KEYWORD(readnone);
   KEYWORD(readonly);
@@ -659,6 +665,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   INSTKEYWORD(inttoptr,    IntToPtr);
   INSTKEYWORD(ptrtoint,    PtrToInt);
   INSTKEYWORD(bitcast,     BitCast);
+  INSTKEYWORD(addrspacecast, AddrSpaceCast);
   INSTKEYWORD(select,      Select);
   INSTKEYWORD(va_arg,      VAArg);
   INSTKEYWORD(ret,         Ret);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 62d8070d..3b903cd 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -19,6 +19,7 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/ValueSymbolTable.h"
@@ -65,6 +66,9 @@ bool LLParser::ValidateEndOfModule() {
     ForwardRefInstMetadata.clear();
   }
 
+  for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+    UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
   // Handle any function attribute group forward references.
   for (std::map<Value*, std::vector<unsigned> >::iterator
          I = ForwardRefAttrGroups.begin(), E = ForwardRefAttrGroups.end();
@@ -178,6 +182,8 @@ bool LLParser::ValidateEndOfModule() {
   for (Module::iterator FI = M->begin(), FE = M->end(); FI != FE; )
     UpgradeCallsToIntrinsic(FI++); // must be post-increment, as we remove
 
+  UpgradeDebugInfo(*M);
+
   return false;
 }
 
@@ -242,13 +248,11 @@ bool LLParser::ParseTopLevelEntities() {
     case lltok::kw_private:             // OptionalLinkage
     case lltok::kw_linker_private:      // OptionalLinkage
     case lltok::kw_linker_private_weak: // OptionalLinkage
-    case lltok::kw_linker_private_weak_def_auto: // FIXME: backwards compat.
     case lltok::kw_internal:            // OptionalLinkage
     case lltok::kw_weak:                // OptionalLinkage
     case lltok::kw_weak_odr:            // OptionalLinkage
     case lltok::kw_linkonce:            // OptionalLinkage
     case lltok::kw_linkonce_odr:        // OptionalLinkage
-    case lltok::kw_linkonce_odr_auto_hide: // OptionalLinkage
     case lltok::kw_appending:           // OptionalLinkage
     case lltok::kw_dllexport:           // OptionalLinkage
     case lltok::kw_common:              // OptionalLinkage
@@ -623,18 +627,14 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
                           unsigned Visibility) {
   assert(Lex.getKind() == lltok::kw_alias);
   Lex.Lex();
-  unsigned Linkage;
   LocTy LinkageLoc = Lex.getLoc();
-  if (ParseOptionalLinkage(Linkage))
+  unsigned L;
+  if (ParseOptionalLinkage(L))
     return true;
 
-  if (Linkage != GlobalValue::ExternalLinkage &&
-      Linkage != GlobalValue::WeakAnyLinkage &&
-      Linkage != GlobalValue::WeakODRLinkage &&
-      Linkage != GlobalValue::InternalLinkage &&
-      Linkage != GlobalValue::PrivateLinkage &&
-      Linkage != GlobalValue::LinkerPrivateLinkage &&
-      Linkage != GlobalValue::LinkerPrivateWeakLinkage)
+  GlobalValue::LinkageTypes Linkage = (GlobalValue::LinkageTypes) L;
+
+  if(!GlobalAlias::isValidLinkage(Linkage))
     return Error(LinkageLoc, "invalid linkage type for alias");
 
   Constant *Aliasee;
@@ -810,13 +810,13 @@ bool LLParser::ParseUnnamedAttrGrp() {
   assert(Lex.getKind() == lltok::AttrGrpID);
   unsigned VarID = Lex.getUIntVal();
   std::vector<unsigned> unused;
-  LocTy NoBuiltinLoc;
+  LocTy BuiltinLoc;
   Lex.Lex();
 
   if (ParseToken(lltok::equal, "expected '=' here") ||
       ParseToken(lltok::lbrace, "expected '{' here") ||
       ParseFnAttributeValuePairs(NumberedAttrBuilders[VarID], unused, true,
-                                 NoBuiltinLoc) ||
+                                 BuiltinLoc) ||
       ParseToken(lltok::rbrace, "expected end of attribute group"))
     return true;
 
@@ -830,15 +830,15 @@ bool LLParser::ParseUnnamedAttrGrp() {
 ///   ::= <attr> | <attr> '=' <value>
 bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
                                           std::vector<unsigned> &FwdRefAttrGrps,
-                                          bool inAttrGrp, LocTy &NoBuiltinLoc) {
+                                          bool inAttrGrp, LocTy &BuiltinLoc) {
   bool HaveError = false;
 
   B.clear();
 
   while (true) {
     lltok::Kind Token = Lex.getKind();
-    if (Token == lltok::kw_nobuiltin)
-      NoBuiltinLoc = Lex.getLoc();
+    if (Token == lltok::kw_builtin)
+      BuiltinLoc = Lex.getLoc();
     switch (Token) {
     default:
       if (!inAttrGrp) return HaveError;
@@ -909,6 +909,8 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
       continue;
     }
     case lltok::kw_alwaysinline:      B.addAttribute(Attribute::AlwaysInline); break;
+    case lltok::kw_builtin:           B.addAttribute(Attribute::Builtin); break;
+    case lltok::kw_cold:              B.addAttribute(Attribute::Cold); break;
     case lltok::kw_inlinehint:        B.addAttribute(Attribute::InlineHint); break;
     case lltok::kw_minsize:           B.addAttribute(Attribute::MinSize); break;
     case lltok::kw_naked:             B.addAttribute(Attribute::Naked); break;
@@ -920,6 +922,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
     case lltok::kw_noredzone:         B.addAttribute(Attribute::NoRedZone); break;
     case lltok::kw_noreturn:          B.addAttribute(Attribute::NoReturn); break;
     case lltok::kw_nounwind:          B.addAttribute(Attribute::NoUnwind); break;
+    case lltok::kw_optnone:           B.addAttribute(Attribute::OptimizeNone); break;
     case lltok::kw_optsize:           B.addAttribute(Attribute::OptimizeForSize); break;
     case lltok::kw_readnone:          B.addAttribute(Attribute::ReadNone); break;
     case lltok::kw_readonly:          B.addAttribute(Attribute::ReadOnly); break;
@@ -1157,6 +1160,8 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
     case lltok::kw_nest:            B.addAttribute(Attribute::Nest); break;
     case lltok::kw_noalias:         B.addAttribute(Attribute::NoAlias); break;
     case lltok::kw_nocapture:       B.addAttribute(Attribute::NoCapture); break;
+    case lltok::kw_readnone:        B.addAttribute(Attribute::ReadNone); break;
+    case lltok::kw_readonly:        B.addAttribute(Attribute::ReadOnly); break;
     case lltok::kw_returned:        B.addAttribute(Attribute::Returned); break;
     case lltok::kw_signext:         B.addAttribute(Attribute::SExt); break;
     case lltok::kw_sret:            B.addAttribute(Attribute::StructRet); break;
@@ -1164,6 +1169,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
 
     case lltok::kw_alignstack:
     case lltok::kw_alwaysinline:
+    case lltok::kw_builtin:
     case lltok::kw_inlinehint:
     case lltok::kw_minsize:
     case lltok::kw_naked:
@@ -1175,9 +1181,8 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
     case lltok::kw_noredzone:
     case lltok::kw_noreturn:
     case lltok::kw_nounwind:
+    case lltok::kw_optnone:
     case lltok::kw_optsize:
-    case lltok::kw_readnone:
-    case lltok::kw_readonly:
     case lltok::kw_returns_twice:
     case lltok::kw_sanitize_address:
     case lltok::kw_sanitize_memory:
@@ -1222,6 +1227,8 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
 
     case lltok::kw_alignstack:
     case lltok::kw_alwaysinline:
+    case lltok::kw_builtin:
+    case lltok::kw_cold:
     case lltok::kw_inlinehint:
     case lltok::kw_minsize:
     case lltok::kw_naked:
@@ -1233,9 +1240,8 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_noredzone:
     case lltok::kw_noreturn:
     case lltok::kw_nounwind:
+    case lltok::kw_optnone:
     case lltok::kw_optsize:
-    case lltok::kw_readnone:
-    case lltok::kw_readonly:
     case lltok::kw_returns_twice:
     case lltok::kw_sanitize_address:
     case lltok::kw_sanitize_memory:
@@ -1246,6 +1252,10 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
     case lltok::kw_uwtable:
       HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
       break;
+
+    case lltok::kw_readnone:
+    case lltok::kw_readonly:
+      HaveError |= Error(Lex.getLoc(), "invalid use of attribute on return type");
     }
 
     Lex.Lex();
@@ -1262,7 +1272,6 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
 ///   ::= 'weak_odr'
 ///   ::= 'linkonce'
 ///   ::= 'linkonce_odr'
-///   ::= 'linkonce_odr_auto_hide'
 ///   ::= 'available_externally'
 ///   ::= 'appending'
 ///   ::= 'dllexport'
@@ -1284,10 +1293,6 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
   case lltok::kw_weak_odr:       Res = GlobalValue::WeakODRLinkage;       break;
   case lltok::kw_linkonce:       Res = GlobalValue::LinkOnceAnyLinkage;   break;
   case lltok::kw_linkonce_odr:   Res = GlobalValue::LinkOnceODRLinkage;   break;
-  case lltok::kw_linkonce_odr_auto_hide:
-  case lltok::kw_linker_private_weak_def_auto: // FIXME: For backwards compat.
-    Res = GlobalValue::LinkOnceODRAutoHideLinkage;
-    break;
   case lltok::kw_available_externally:
     Res = GlobalValue::AvailableExternallyLinkage;
     break;
@@ -1337,6 +1342,10 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
 ///   ::= 'ptx_device'
 ///   ::= 'spir_func'
 ///   ::= 'spir_kernel'
+///   ::= 'x86_64_sysvcc'
+///   ::= 'x86_64_win64cc'
+///   ::= 'webkit_jscc'
+///   ::= 'anyregcc'
 ///   ::= 'cc' UINT
 ///
 bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
@@ -1357,6 +1366,10 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
   case lltok::kw_spir_kernel:    CC = CallingConv::SPIR_KERNEL; break;
   case lltok::kw_spir_func:      CC = CallingConv::SPIR_FUNC; break;
   case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
+  case lltok::kw_x86_64_sysvcc:  CC = CallingConv::X86_64_SysV; break;
+  case lltok::kw_x86_64_win64cc: CC = CallingConv::X86_64_Win64; break;
+  case lltok::kw_webkit_jscc:    CC = CallingConv::WebKit_JS; break;
+  case lltok::kw_anyregcc:       CC = CallingConv::AnyReg; break;
   case lltok::kw_cc: {
       unsigned ArbitraryCC;
       Lex.Lex();
@@ -1413,6 +1426,9 @@ bool LLParser::ParseInstructionMetadata(Instruction *Inst,
       }
     }
 
+    if (MDK == LLVMContext::MD_tbaa)
+      InstsWithTBAATag.push_back(Inst);
+
     // If this is the end of the list, we're done.
   } while (EatIfPresent(lltok::comma));
   return false;
@@ -2372,7 +2388,6 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
     Lex.Lex();
 
     ValID Fn, Label;
-    LocTy FnLoc, LabelLoc;
 
     if (ParseToken(lltok::lparen, "expected '(' in block address expression") ||
         ParseValID(Fn) ||
@@ -2402,6 +2417,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
   case lltok::kw_fptrunc:
   case lltok::kw_fpext:
   case lltok::kw_bitcast:
+  case lltok::kw_addrspacecast:
   case lltok::kw_uitofp:
   case lltok::kw_sitofp:
   case lltok::kw_fptoui:
@@ -2908,7 +2924,7 @@ bool LLParser::ParseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc,
 /// FunctionHeader
 ///   ::= OptionalLinkage OptionalVisibility OptionalCallingConv OptRetAttrs
 ///       OptUnnamedAddr Type GlobalName '(' ArgList ')' OptFuncAttrs OptSection
-///       OptionalAlign OptGC
+///       OptionalAlign OptGC OptionalPrefix
 bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   // Parse the linkage.
   LocTy LinkageLoc = Lex.getLoc();
@@ -2942,7 +2958,6 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   case GlobalValue::AvailableExternallyLinkage:
   case GlobalValue::LinkOnceAnyLinkage:
   case GlobalValue::LinkOnceODRLinkage:
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
   case GlobalValue::WeakAnyLinkage:
   case GlobalValue::WeakODRLinkage:
   case GlobalValue::DLLExportLinkage:
@@ -2981,27 +2996,30 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   bool isVarArg;
   AttrBuilder FuncAttrs;
   std::vector<unsigned> FwdRefAttrGrps;
-  LocTy NoBuiltinLoc;
+  LocTy BuiltinLoc;
   std::string Section;
   unsigned Alignment;
   std::string GC;
   bool UnnamedAddr;
   LocTy UnnamedAddrLoc;
+  Constant *Prefix = 0;
 
   if (ParseArgumentList(ArgList, isVarArg) ||
       ParseOptionalToken(lltok::kw_unnamed_addr, UnnamedAddr,
                          &UnnamedAddrLoc) ||
       ParseFnAttributeValuePairs(FuncAttrs, FwdRefAttrGrps, false,
-                                 NoBuiltinLoc) ||
+                                 BuiltinLoc) ||
       (EatIfPresent(lltok::kw_section) &&
        ParseStringConstant(Section)) ||
       ParseOptionalAlignment(Alignment) ||
       (EatIfPresent(lltok::kw_gc) &&
-       ParseStringConstant(GC)))
+       ParseStringConstant(GC)) ||
+      (EatIfPresent(lltok::kw_prefix) &&
+       ParseGlobalTypeAndValue(Prefix)))
     return true;
 
-  if (FuncAttrs.contains(Attribute::NoBuiltin))
-    return Error(NoBuiltinLoc, "'nobuiltin' attribute not valid on function");
+  if (FuncAttrs.contains(Attribute::Builtin))
+    return Error(BuiltinLoc, "'builtin' attribute not valid on function");
 
   // If the alignment was parsed as an attribute, move to the alignment field.
   if (FuncAttrs.hasAlignmentAttr()) {
@@ -3095,6 +3113,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   Fn->setAlignment(Alignment);
   Fn->setSection(Section);
   if (!GC.empty()) Fn->setGC(GC.c_str());
+  Fn->setPrefixData(Prefix);
   ForwardRefAttrGroups[Fn] = FwdRefAttrGrps;
 
   // Add all of the arguments we parsed to the function.
@@ -3160,7 +3179,6 @@ bool LLParser::ParseBasicBlock(PerFunctionState &PFS) {
 
   // Parse the instructions in this block until we get a terminator.
   Instruction *Inst;
-  SmallVector<std::pair<unsigned, MDNode *>, 4> MetadataOnInst;
   do {
     // This instruction may have three possibilities for a name: a) none
     // specified, b) name specified "%foo =", c) number specified: "%4 =".
@@ -3288,6 +3306,7 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
   case lltok::kw_fptrunc:
   case lltok::kw_fpext:
   case lltok::kw_bitcast:
+  case lltok::kw_addrspacecast:
   case lltok::kw_uitofp:
   case lltok::kw_sitofp:
   case lltok::kw_fptoui:
@@ -3925,7 +3944,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
                          bool isTail) {
   AttrBuilder RetAttrs, FnAttrs;
   std::vector<unsigned> FwdRefAttrGrps;
-  LocTy NoBuiltinLoc;
+  LocTy BuiltinLoc;
   CallingConv::ID CC;
   Type *RetType = 0;
   LocTy RetTypeLoc;
@@ -3940,7 +3959,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
       ParseValID(CalleeID) ||
       ParseParameterList(ArgList, PFS) ||
       ParseFnAttributeValuePairs(FnAttrs, FwdRefAttrGrps, false,
-                                 NoBuiltinLoc))
+                                 BuiltinLoc))
     return true;
 
   // If RetType is a non-function pointer type, then this is the short syntax
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 1f2879e..ded776c 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -107,6 +107,8 @@ namespace llvm {
     };
     DenseMap<Instruction*, std::vector<MDRef> > ForwardRefInstMetadata;
 
+    SmallVector<Instruction*, 64> InstsWithTBAATag;
+
     // Type resolution handling data structures.  The location is set when we
     // have processed a use of the type but not a definition yet.
     StringMap<std::pair<Type*, LocTy> > NamedTypes;
@@ -242,7 +244,7 @@ namespace llvm {
     bool ParseUnnamedAttrGrp();
     bool ParseFnAttributeValuePairs(AttrBuilder &B,
                                     std::vector<unsigned> &FwdRefAttrGrps,
-                                    bool inAttrGrp, LocTy &NoBuiltinLoc);
+                                    bool inAttrGrp, LocTy &BuiltinLoc);
 
     // Type Parsing.
     bool ParseType(Type *&Result, bool AllowVoid = false);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 3bf54fa..786d84d 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -38,9 +38,8 @@ namespace lltok {
     kw_global,  kw_constant,
 
     kw_private, kw_linker_private, kw_linker_private_weak,
-    kw_linker_private_weak_def_auto, // FIXME: For backwards compatibility.
     kw_internal,
-    kw_linkonce, kw_linkonce_odr, kw_linkonce_odr_auto_hide,
+    kw_linkonce, kw_linkonce_odr,
     kw_weak, kw_weak_odr, kw_appending,
     kw_dllimport, kw_dllexport, kw_common, kw_available_externally,
     kw_default, kw_hidden, kw_protected,
@@ -81,21 +80,26 @@ namespace lltok {
     kw_alignstack,
     kw_inteldialect,
     kw_gc,
+    kw_prefix,
     kw_c,
 
     kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
-	  kw_intel_ocl_bicc,
+    kw_intel_ocl_bicc,
     kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
     kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
     kw_msp430_intrcc,
     kw_ptx_kernel, kw_ptx_device,
     kw_spir_kernel, kw_spir_func,
+    kw_x86_64_sysvcc, kw_x86_64_win64cc,
+    kw_webkit_jscc, kw_anyregcc,
 
     // Attributes:
     kw_attributes,
     kw_alwaysinline,
     kw_sanitize_address,
+    kw_builtin,
     kw_byval,
+    kw_cold,
     kw_inlinehint,
     kw_inreg,
     kw_minsize,
@@ -111,6 +115,7 @@ namespace lltok {
     kw_noredzone,
     kw_noreturn,
     kw_nounwind,
+    kw_optnone,
     kw_optsize,
     kw_readnone,
     kw_readonly,
@@ -145,6 +150,7 @@ namespace lltok {
     kw_phi, kw_call,
     kw_trunc, kw_zext, kw_sext, kw_fptrunc, kw_fpext, kw_uitofp, kw_sitofp,
     kw_fptoui, kw_fptosi, kw_inttoptr, kw_ptrtoint, kw_bitcast,
+    kw_addrspacecast,
     kw_select, kw_va_arg,
 
     kw_landingpad, kw_personality, kw_cleanup, kw_catch, kw_filter,
diff --git a/lib/AsmParser/Parser.cpp b/lib/AsmParser/Parser.cpp
index bb4f03b..d777ab9 100644
--- a/lib/AsmParser/Parser.cpp
+++ b/lib/AsmParser/Parser.cpp
@@ -43,7 +43,7 @@ Module *llvm::ParseAssembly(MemoryBuffer *F,
 Module *llvm::ParseAssemblyFile(const std::string &Filename, SMDiagnostic &Err,
                                 LLVMContext &Context) {
   OwningPtr<MemoryBuffer> File;
-  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) {
     Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
                        "Could not open input file: " + ec.message());
     return 0;
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index e6ff4b4..ce3b7d1 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -12,16 +12,19 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/AutoUpgrade.h"
+#include "llvm/Bitcode/LLVMBitCodes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/Support/DataStream.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
 enum {
@@ -87,7 +90,6 @@ static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) {
   case 12: return GlobalValue::AvailableExternallyLinkage;
   case 13: return GlobalValue::LinkerPrivateLinkage;
   case 14: return GlobalValue::LinkerPrivateWeakLinkage;
-  case 15: return GlobalValue::LinkOnceODRAutoHideLinkage;
   }
 }
 
@@ -126,6 +128,7 @@ static int GetDecodedCastOpcode(unsigned Val) {
   case bitc::CAST_PTRTOINT: return Instruction::PtrToInt;
   case bitc::CAST_INTTOPTR: return Instruction::IntToPtr;
   case bitc::CAST_BITCAST : return Instruction::BitCast;
+  case bitc::CAST_ADDRSPACECAST: return Instruction::AddrSpaceCast;
   }
 }
 static int GetDecodedBinaryOpcode(unsigned Val, Type *Ty) {
@@ -448,12 +451,12 @@ static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
                 (EncodedAttrs & 0xffff));
 }
 
-bool BitcodeReader::ParseAttributeBlock() {
+error_code BitcodeReader::ParseAttributeBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   if (!MAttributes.empty())
-    return Error("Multiple PARAMATTR blocks found!");
+    return Error(InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -466,9 +469,9 @@ bool BitcodeReader::ParseAttributeBlock() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("Error at end of PARAMATTR block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -482,7 +485,7 @@ bool BitcodeReader::ParseAttributeBlock() {
     case bitc::PARAMATTR_CODE_ENTRY_OLD: { // ENTRY: [paramidx0, attr0, ...]
       // FIXME: Remove in 4.0.
       if (Record.size() & 1)
-        return Error("Invalid ENTRY record");
+        return Error(InvalidRecord);
 
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         AttrBuilder B;
@@ -506,12 +509,102 @@ bool BitcodeReader::ParseAttributeBlock() {
   }
 }
 
-bool BitcodeReader::ParseAttributeGroupBlock() {
+// Returns Attribute::None on unrecognized codes.
+static Attribute::AttrKind GetAttrFromCode(uint64_t Code) {
+  switch (Code) {
+  default:
+    return Attribute::None;
+  case bitc::ATTR_KIND_ALIGNMENT:
+    return Attribute::Alignment;
+  case bitc::ATTR_KIND_ALWAYS_INLINE:
+    return Attribute::AlwaysInline;
+  case bitc::ATTR_KIND_BUILTIN:
+    return Attribute::Builtin;
+  case bitc::ATTR_KIND_BY_VAL:
+    return Attribute::ByVal;
+  case bitc::ATTR_KIND_COLD:
+    return Attribute::Cold;
+  case bitc::ATTR_KIND_INLINE_HINT:
+    return Attribute::InlineHint;
+  case bitc::ATTR_KIND_IN_REG:
+    return Attribute::InReg;
+  case bitc::ATTR_KIND_MIN_SIZE:
+    return Attribute::MinSize;
+  case bitc::ATTR_KIND_NAKED:
+    return Attribute::Naked;
+  case bitc::ATTR_KIND_NEST:
+    return Attribute::Nest;
+  case bitc::ATTR_KIND_NO_ALIAS:
+    return Attribute::NoAlias;
+  case bitc::ATTR_KIND_NO_BUILTIN:
+    return Attribute::NoBuiltin;
+  case bitc::ATTR_KIND_NO_CAPTURE:
+    return Attribute::NoCapture;
+  case bitc::ATTR_KIND_NO_DUPLICATE:
+    return Attribute::NoDuplicate;
+  case bitc::ATTR_KIND_NO_IMPLICIT_FLOAT:
+    return Attribute::NoImplicitFloat;
+  case bitc::ATTR_KIND_NO_INLINE:
+    return Attribute::NoInline;
+  case bitc::ATTR_KIND_NON_LAZY_BIND:
+    return Attribute::NonLazyBind;
+  case bitc::ATTR_KIND_NO_RED_ZONE:
+    return Attribute::NoRedZone;
+  case bitc::ATTR_KIND_NO_RETURN:
+    return Attribute::NoReturn;
+  case bitc::ATTR_KIND_NO_UNWIND:
+    return Attribute::NoUnwind;
+  case bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE:
+    return Attribute::OptimizeForSize;
+  case bitc::ATTR_KIND_OPTIMIZE_NONE:
+    return Attribute::OptimizeNone;
+  case bitc::ATTR_KIND_READ_NONE:
+    return Attribute::ReadNone;
+  case bitc::ATTR_KIND_READ_ONLY:
+    return Attribute::ReadOnly;
+  case bitc::ATTR_KIND_RETURNED:
+    return Attribute::Returned;
+  case bitc::ATTR_KIND_RETURNS_TWICE:
+    return Attribute::ReturnsTwice;
+  case bitc::ATTR_KIND_S_EXT:
+    return Attribute::SExt;
+  case bitc::ATTR_KIND_STACK_ALIGNMENT:
+    return Attribute::StackAlignment;
+  case bitc::ATTR_KIND_STACK_PROTECT:
+    return Attribute::StackProtect;
+  case bitc::ATTR_KIND_STACK_PROTECT_REQ:
+    return Attribute::StackProtectReq;
+  case bitc::ATTR_KIND_STACK_PROTECT_STRONG:
+    return Attribute::StackProtectStrong;
+  case bitc::ATTR_KIND_STRUCT_RET:
+    return Attribute::StructRet;
+  case bitc::ATTR_KIND_SANITIZE_ADDRESS:
+    return Attribute::SanitizeAddress;
+  case bitc::ATTR_KIND_SANITIZE_THREAD:
+    return Attribute::SanitizeThread;
+  case bitc::ATTR_KIND_SANITIZE_MEMORY:
+    return Attribute::SanitizeMemory;
+  case bitc::ATTR_KIND_UW_TABLE:
+    return Attribute::UWTable;
+  case bitc::ATTR_KIND_Z_EXT:
+    return Attribute::ZExt;
+  }
+}
+
+error_code BitcodeReader::ParseAttrKind(uint64_t Code,
+                                        Attribute::AttrKind *Kind) {
+  *Kind = GetAttrFromCode(Code);
+  if (*Kind == Attribute::None)
+    return Error(InvalidValue);
+  return error_code::success();
+}
+
+error_code BitcodeReader::ParseAttributeGroupBlock() {
   if (Stream.EnterSubBlock(bitc::PARAMATTR_GROUP_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   if (!MAttributeGroups.empty())
-    return Error("Multiple PARAMATTR_GROUP blocks found!");
+    return Error(InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -522,9 +615,9 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("Error at end of PARAMATTR_GROUP block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -537,7 +630,7 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
       break;
     case bitc::PARAMATTR_GRP_CODE_ENTRY: { // ENTRY: [grpid, idx, a0, a1, ...]
       if (Record.size() < 3)
-        return Error("Invalid ENTRY record");
+        return Error(InvalidRecord);
 
       uint64_t GrpID = Record[0];
       uint64_t Idx = Record[1]; // Index of the object this attribute refers to.
@@ -545,9 +638,16 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
       AttrBuilder B;
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
         if (Record[i] == 0) {        // Enum attribute
-          B.addAttribute(Attribute::AttrKind(Record[++i]));
+          Attribute::AttrKind Kind;
+          if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+            return EC;
+
+          B.addAttribute(Kind);
         } else if (Record[i] == 1) { // Align attribute
-          if (Attribute::AttrKind(Record[++i]) == Attribute::Alignment)
+          Attribute::AttrKind Kind;
+          if (error_code EC = ParseAttrKind(Record[++i], &Kind))
+            return EC;
+          if (Kind == Attribute::Alignment)
             B.addAlignmentAttr(Record[++i]);
           else
             B.addStackAlignmentAttr(Record[++i]);
@@ -581,16 +681,16 @@ bool BitcodeReader::ParseAttributeGroupBlock() {
   }
 }
 
-bool BitcodeReader::ParseTypeTable() {
+error_code BitcodeReader::ParseTypeTable() {
   if (Stream.EnterSubBlock(bitc::TYPE_BLOCK_ID_NEW))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   return ParseTypeTableBody();
 }
 
-bool BitcodeReader::ParseTypeTableBody() {
+error_code BitcodeReader::ParseTypeTableBody() {
   if (!TypeList.empty())
-    return Error("Multiple TYPE_BLOCKs found!");
+    return Error(InvalidMultipleBlocks);
 
   SmallVector<uint64_t, 64> Record;
   unsigned NumRecords = 0;
@@ -604,12 +704,11 @@ bool BitcodeReader::ParseTypeTableBody() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      Error("Error in the type table block");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       if (NumRecords != TypeList.size())
-        return Error("Invalid type forward reference in TYPE_BLOCK");
-      return false;
+        return Error(MalformedBlock);
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -619,12 +718,13 @@ bool BitcodeReader::ParseTypeTableBody() {
     Record.clear();
     Type *ResultTy = 0;
     switch (Stream.readRecord(Entry.ID, Record)) {
-    default: return Error("unknown type in type table");
+    default:
+      return Error(InvalidValue);
     case bitc::TYPE_CODE_NUMENTRY: // TYPE_CODE_NUMENTRY: [numentries]
       // TYPE_CODE_NUMENTRY contains a count of the number of types in the
       // type list.  This allows us to reserve space.
       if (Record.size() < 1)
-        return Error("Invalid TYPE_CODE_NUMENTRY record");
+        return Error(InvalidRecord);
       TypeList.resize(Record[0]);
       continue;
     case bitc::TYPE_CODE_VOID:      // VOID
@@ -659,19 +759,20 @@ bool BitcodeReader::ParseTypeTableBody() {
       break;
     case bitc::TYPE_CODE_INTEGER:   // INTEGER: [width]
       if (Record.size() < 1)
-        return Error("Invalid Integer type record");
+        return Error(InvalidRecord);
 
       ResultTy = IntegerType::get(Context, Record[0]);
       break;
     case bitc::TYPE_CODE_POINTER: { // POINTER: [pointee type] or
                                     //          [pointee type, address space]
       if (Record.size() < 1)
-        return Error("Invalid POINTER type record");
+        return Error(InvalidRecord);
       unsigned AddressSpace = 0;
       if (Record.size() == 2)
         AddressSpace = Record[1];
       ResultTy = getTypeByID(Record[0]);
-      if (ResultTy == 0) return Error("invalid element type in pointer type");
+      if (ResultTy == 0)
+        return Error(InvalidType);
       ResultTy = PointerType::get(ResultTy, AddressSpace);
       break;
     }
@@ -679,7 +780,7 @@ bool BitcodeReader::ParseTypeTableBody() {
       // FIXME: attrid is dead, remove it in LLVM 4.0
       // FUNCTION: [vararg, attrid, retty, paramty x N]
       if (Record.size() < 3)
-        return Error("Invalid FUNCTION type record");
+        return Error(InvalidRecord);
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 3, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -690,7 +791,7 @@ bool BitcodeReader::ParseTypeTableBody() {
 
       ResultTy = getTypeByID(Record[2]);
       if (ResultTy == 0 || ArgTys.size() < Record.size()-3)
-        return Error("invalid type in function type");
+        return Error(InvalidType);
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
@@ -698,7 +799,7 @@ bool BitcodeReader::ParseTypeTableBody() {
     case bitc::TYPE_CODE_FUNCTION: {
       // FUNCTION: [vararg, retty, paramty x N]
       if (Record.size() < 2)
-        return Error("Invalid FUNCTION type record");
+        return Error(InvalidRecord);
       SmallVector<Type*, 8> ArgTys;
       for (unsigned i = 2, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -709,14 +810,14 @@ bool BitcodeReader::ParseTypeTableBody() {
 
       ResultTy = getTypeByID(Record[1]);
       if (ResultTy == 0 || ArgTys.size() < Record.size()-2)
-        return Error("invalid type in function type");
+        return Error(InvalidType);
 
       ResultTy = FunctionType::get(ResultTy, ArgTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_ANON: {  // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error("Invalid STRUCT type record");
+        return Error(InvalidRecord);
       SmallVector<Type*, 8> EltTys;
       for (unsigned i = 1, e = Record.size(); i != e; ++i) {
         if (Type *T = getTypeByID(Record[i]))
@@ -725,21 +826,21 @@ bool BitcodeReader::ParseTypeTableBody() {
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error("invalid type in struct type");
+        return Error(InvalidType);
       ResultTy = StructType::get(Context, EltTys, Record[0]);
       break;
     }
     case bitc::TYPE_CODE_STRUCT_NAME:   // STRUCT_NAME: [strchr x N]
       if (ConvertToString(Record, 0, TypeName))
-        return Error("Invalid STRUCT_NAME record");
+        return Error(InvalidRecord);
       continue;
 
     case bitc::TYPE_CODE_STRUCT_NAMED: { // STRUCT: [ispacked, eltty x N]
       if (Record.size() < 1)
-        return Error("Invalid STRUCT type record");
+        return Error(InvalidRecord);
 
       if (NumRecords >= TypeList.size())
-        return Error("invalid TYPE table");
+        return Error(InvalidTYPETable);
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -758,17 +859,17 @@ bool BitcodeReader::ParseTypeTableBody() {
           break;
       }
       if (EltTys.size() != Record.size()-1)
-        return Error("invalid STRUCT type record");
+        return Error(InvalidRecord);
       Res->setBody(EltTys, Record[0]);
       ResultTy = Res;
       break;
     }
     case bitc::TYPE_CODE_OPAQUE: {       // OPAQUE: []
       if (Record.size() != 1)
-        return Error("Invalid OPAQUE type record");
+        return Error(InvalidRecord);
 
       if (NumRecords >= TypeList.size())
-        return Error("invalid TYPE table");
+        return Error(InvalidTYPETable);
 
       // Check to see if this was forward referenced, if so fill in the temp.
       StructType *Res = cast_or_null<StructType>(TypeList[NumRecords]);
@@ -783,33 +884,33 @@ bool BitcodeReader::ParseTypeTableBody() {
     }
     case bitc::TYPE_CODE_ARRAY:     // ARRAY: [numelts, eltty]
       if (Record.size() < 2)
-        return Error("Invalid ARRAY type record");
+        return Error(InvalidRecord);
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = ArrayType::get(ResultTy, Record[0]);
       else
-        return Error("Invalid ARRAY type element");
+        return Error(InvalidType);
       break;
     case bitc::TYPE_CODE_VECTOR:    // VECTOR: [numelts, eltty]
       if (Record.size() < 2)
-        return Error("Invalid VECTOR type record");
+        return Error(InvalidRecord);
       if ((ResultTy = getTypeByID(Record[1])))
         ResultTy = VectorType::get(ResultTy, Record[0]);
       else
-        return Error("Invalid ARRAY type element");
+        return Error(InvalidType);
       break;
     }
 
     if (NumRecords >= TypeList.size())
-      return Error("invalid TYPE table");
+      return Error(InvalidTYPETable);
     assert(ResultTy && "Didn't read a type?");
     assert(TypeList[NumRecords] == 0 && "Already read type?");
     TypeList[NumRecords++] = ResultTy;
   }
 }
 
-bool BitcodeReader::ParseValueSymbolTable() {
+error_code BitcodeReader::ParseValueSymbolTable() {
   if (Stream.EnterSubBlock(bitc::VALUE_SYMTAB_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -821,9 +922,9 @@ bool BitcodeReader::ParseValueSymbolTable() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed value symbol table block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -836,10 +937,10 @@ bool BitcodeReader::ParseValueSymbolTable() {
       break;
     case bitc::VST_CODE_ENTRY: {  // VST_ENTRY: [valueid, namechar x N]
       if (ConvertToString(Record, 1, ValueName))
-        return Error("Invalid VST_ENTRY record");
+        return Error(InvalidRecord);
       unsigned ValueID = Record[0];
       if (ValueID >= ValueList.size())
-        return Error("Invalid Value ID in VST_ENTRY record");
+        return Error(InvalidRecord);
       Value *V = ValueList[ValueID];
 
       V->setName(StringRef(ValueName.data(), ValueName.size()));
@@ -848,10 +949,10 @@ bool BitcodeReader::ParseValueSymbolTable() {
     }
     case bitc::VST_CODE_BBENTRY: {
       if (ConvertToString(Record, 1, ValueName))
-        return Error("Invalid VST_BBENTRY record");
+        return Error(InvalidRecord);
       BasicBlock *BB = getBasicBlock(Record[0]);
       if (BB == 0)
-        return Error("Invalid BB ID in VST_BBENTRY record");
+        return Error(InvalidRecord);
 
       BB->setName(StringRef(ValueName.data(), ValueName.size()));
       ValueName.clear();
@@ -861,11 +962,11 @@ bool BitcodeReader::ParseValueSymbolTable() {
   }
 }
 
-bool BitcodeReader::ParseMetadata() {
+error_code BitcodeReader::ParseMetadata() {
   unsigned NextMDValueNo = MDValueList.size();
 
   if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -876,10 +977,9 @@ bool BitcodeReader::ParseMetadata() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      Error("malformed metadata block");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -908,7 +1008,7 @@ bool BitcodeReader::ParseMetadata() {
       for (unsigned i = 0; i != Size; ++i) {
         MDNode *MD = dyn_cast<MDNode>(MDValueList.getValueFwdRef(Record[i]));
         if (MD == 0)
-          return Error("Malformed metadata record");
+          return Error(InvalidRecord);
         NMD->addOperand(MD);
       }
       break;
@@ -918,13 +1018,14 @@ bool BitcodeReader::ParseMetadata() {
       // fall-through
     case bitc::METADATA_NODE: {
       if (Record.size() % 2 == 1)
-        return Error("Invalid METADATA_NODE record");
+        return Error(InvalidRecord);
 
       unsigned Size = Record.size();
       SmallVector<Value*, 8> Elts;
       for (unsigned i = 0; i != Size; i += 2) {
         Type *Ty = getTypeByID(Record[i]);
-        if (!Ty) return Error("Invalid METADATA_NODE record");
+        if (!Ty)
+          return Error(InvalidRecord);
         if (Ty->isMetadataTy())
           Elts.push_back(MDValueList.getValueFwdRef(Record[i+1]));
         else if (!Ty->isVoidTy())
@@ -945,14 +1046,14 @@ bool BitcodeReader::ParseMetadata() {
     }
     case bitc::METADATA_KIND: {
       if (Record.size() < 2)
-        return Error("Invalid METADATA_KIND record");
+        return Error(InvalidRecord);
 
       unsigned Kind = Record[0];
       SmallString<8> Name(Record.begin()+1, Record.end());
 
       unsigned NewKind = TheModule->getMDKindID(Name.str());
       if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
-        return Error("Conflicting METADATA_KIND records");
+        return Error(ConflictingMETADATA_KINDRecords);
       break;
     }
     }
@@ -972,12 +1073,14 @@ uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
 
 /// ResolveGlobalAndAliasInits - Resolve all of the initializers for global
 /// values and aliases that we can.
-bool BitcodeReader::ResolveGlobalAndAliasInits() {
+error_code BitcodeReader::ResolveGlobalAndAliasInits() {
   std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInitWorklist;
   std::vector<std::pair<GlobalAlias*, unsigned> > AliasInitWorklist;
+  std::vector<std::pair<Function*, unsigned> > FunctionPrefixWorklist;
 
   GlobalInitWorklist.swap(GlobalInits);
   AliasInitWorklist.swap(AliasInits);
+  FunctionPrefixWorklist.swap(FunctionPrefixes);
 
   while (!GlobalInitWorklist.empty()) {
     unsigned ValID = GlobalInitWorklist.back().second;
@@ -988,7 +1091,7 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
       if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
         GlobalInitWorklist.back().first->setInitializer(C);
       else
-        return Error("Global variable initializer is not a constant!");
+        return Error(ExpectedConstant);
     }
     GlobalInitWorklist.pop_back();
   }
@@ -1001,11 +1104,25 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
       if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
         AliasInitWorklist.back().first->setAliasee(C);
       else
-        return Error("Alias initializer is not a constant!");
+        return Error(ExpectedConstant);
     }
     AliasInitWorklist.pop_back();
   }
-  return false;
+
+  while (!FunctionPrefixWorklist.empty()) {
+    unsigned ValID = FunctionPrefixWorklist.back().second;
+    if (ValID >= ValueList.size()) {
+      FunctionPrefixes.push_back(FunctionPrefixWorklist.back());
+    } else {
+      if (Constant *C = dyn_cast<Constant>(ValueList[ValID]))
+        FunctionPrefixWorklist.back().first->setPrefixData(C);
+      else
+        return Error(ExpectedConstant);
+    }
+    FunctionPrefixWorklist.pop_back();
+  }
+
+  return error_code::success();
 }
 
 static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
@@ -1016,9 +1133,9 @@ static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
   return APInt(TypeBits, Words);
 }
 
-bool BitcodeReader::ParseConstants() {
+error_code BitcodeReader::ParseConstants() {
   if (Stream.EnterSubBlock(bitc::CONSTANTS_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1031,15 +1148,15 @@ bool BitcodeReader::ParseConstants() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed block record in AST file");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       if (NextCstNo != ValueList.size())
-        return Error("Invalid constant reference!");
+        return Error(InvalidConstantReference);
 
       // Once all the constants have been read, go through and resolve forward
       // references.
       ValueList.ResolveConstantForwardRefs();
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -1056,9 +1173,9 @@ bool BitcodeReader::ParseConstants() {
       break;
     case bitc::CST_CODE_SETTYPE:   // SETTYPE: [typeid]
       if (Record.empty())
-        return Error("Malformed CST_SETTYPE record");
+        return Error(InvalidRecord);
       if (Record[0] >= TypeList.size())
-        return Error("Invalid Type ID in CST_SETTYPE record");
+        return Error(InvalidRecord);
       CurTy = TypeList[Record[0]];
       continue;  // Skip the ValueList manipulation.
     case bitc::CST_CODE_NULL:      // NULL
@@ -1066,12 +1183,12 @@ bool BitcodeReader::ParseConstants() {
       break;
     case bitc::CST_CODE_INTEGER:   // INTEGER: [intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error("Invalid CST_INTEGER record");
+        return Error(InvalidRecord);
       V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
       break;
     case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
       if (!CurTy->isIntegerTy() || Record.empty())
-        return Error("Invalid WIDE_INTEGER record");
+        return Error(InvalidRecord);
 
       APInt VInt = ReadWideAPInt(Record,
                                  cast<IntegerType>(CurTy)->getBitWidth());
@@ -1081,7 +1198,7 @@ bool BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_FLOAT: {    // FLOAT: [fpval]
       if (Record.empty())
-        return Error("Invalid FLOAT record");
+        return Error(InvalidRecord);
       if (CurTy->isHalfTy())
         V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf,
                                              APInt(16, (uint16_t)Record[0])));
@@ -1111,7 +1228,7 @@ bool BitcodeReader::ParseConstants() {
 
     case bitc::CST_CODE_AGGREGATE: {// AGGREGATE: [n x value number]
       if (Record.empty())
-        return Error("Invalid CST_AGGREGATE record");
+        return Error(InvalidRecord);
 
       unsigned Size = Record.size();
       SmallVector<Constant*, 16> Elts;
@@ -1139,7 +1256,7 @@ bool BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_STRING:    // STRING: [values]
     case bitc::CST_CODE_CSTRING: { // CSTRING: [values]
       if (Record.empty())
-        return Error("Invalid CST_STRING record");
+        return Error(InvalidRecord);
 
       SmallString<16> Elts(Record.begin(), Record.end());
       V = ConstantDataArray::getString(Context, Elts,
@@ -1148,7 +1265,7 @@ bool BitcodeReader::ParseConstants() {
     }
     case bitc::CST_CODE_DATA: {// DATA: [n x value]
       if (Record.empty())
-        return Error("Invalid CST_DATA record");
+        return Error(InvalidRecord);
 
       Type *EltTy = cast<SequentialType>(CurTy)->getElementType();
       unsigned Size = Record.size();
@@ -1193,13 +1310,14 @@ bool BitcodeReader::ParseConstants() {
         else
           V = ConstantDataArray::get(Context, Elts);
       } else {
-        return Error("Unknown element type in CE_DATA");
+        return Error(InvalidTypeForValue);
       }
       break;
     }
 
     case bitc::CST_CODE_CE_BINOP: {  // CE_BINOP: [opcode, opval, opval]
-      if (Record.size() < 3) return Error("Invalid CE_BINOP record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       int Opc = GetDecodedBinaryOpcode(Record[0], CurTy);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown binop.
@@ -1229,25 +1347,30 @@ bool BitcodeReader::ParseConstants() {
       break;
     }
     case bitc::CST_CODE_CE_CAST: {  // CE_CAST: [opcode, opty, opval]
-      if (Record.size() < 3) return Error("Invalid CE_CAST record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       int Opc = GetDecodedCastOpcode(Record[0]);
       if (Opc < 0) {
         V = UndefValue::get(CurTy);  // Unknown cast.
       } else {
         Type *OpTy = getTypeByID(Record[1]);
-        if (!OpTy) return Error("Invalid CE_CAST record");
+        if (!OpTy)
+          return Error(InvalidRecord);
         Constant *Op = ValueList.getConstantFwdRef(Record[2], OpTy);
-        V = ConstantExpr::getCast(Opc, Op, CurTy);
+        V = UpgradeBitCastExpr(Opc, Op, CurTy);
+        if (!V) V = ConstantExpr::getCast(Opc, Op, CurTy);
       }
       break;
     }
     case bitc::CST_CODE_CE_INBOUNDS_GEP:
     case bitc::CST_CODE_CE_GEP: {  // CE_GEP:        [n x operands]
-      if (Record.size() & 1) return Error("Invalid CE_GEP record");
+      if (Record.size() & 1)
+        return Error(InvalidRecord);
       SmallVector<Constant*, 16> Elts;
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         Type *ElTy = getTypeByID(Record[i]);
-        if (!ElTy) return Error("Invalid CE_GEP record");
+        if (!ElTy)
+          return Error(InvalidRecord);
         Elts.push_back(ValueList.getConstantFwdRef(Record[i+1], ElTy));
       }
       ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
@@ -1256,19 +1379,31 @@ bool BitcodeReader::ParseConstants() {
                                            bitc::CST_CODE_CE_INBOUNDS_GEP);
       break;
     }
-    case bitc::CST_CODE_CE_SELECT:  // CE_SELECT: [opval#, opval#, opval#]
-      if (Record.size() < 3) return Error("Invalid CE_SELECT record");
-      V = ConstantExpr::getSelect(
-                          ValueList.getConstantFwdRef(Record[0],
-                                                      Type::getInt1Ty(Context)),
-                          ValueList.getConstantFwdRef(Record[1],CurTy),
-                          ValueList.getConstantFwdRef(Record[2],CurTy));
+    case bitc::CST_CODE_CE_SELECT: {  // CE_SELECT: [opval#, opval#, opval#]
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
+
+      Type *SelectorTy = Type::getInt1Ty(Context);
+
+      // If CurTy is a vector of length n, then Record[0] must be a <n x i1>
+      // vector. Otherwise, it must be a single bit.
+      if (VectorType *VTy = dyn_cast<VectorType>(CurTy))
+        SelectorTy = VectorType::get(Type::getInt1Ty(Context),
+                                     VTy->getNumElements());
+
+      V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0],
+                                                              SelectorTy),
+                                  ValueList.getConstantFwdRef(Record[1],CurTy),
+                                  ValueList.getConstantFwdRef(Record[2],CurTy));
       break;
+    }
     case bitc::CST_CODE_CE_EXTRACTELT: { // CE_EXTRACTELT: [opty, opval, opval]
-      if (Record.size() < 3) return Error("Invalid CE_EXTRACTELT record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
-      if (OpTy == 0) return Error("Invalid CE_EXTRACTELT record");
+      if (OpTy == 0)
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2],
                                                   Type::getInt32Ty(Context));
@@ -1278,7 +1413,7 @@ bool BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_CE_INSERTELT: { // CE_INSERTELT: [opval, opval, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || OpTy == 0)
-        return Error("Invalid CE_INSERTELT record");
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1],
                                                   OpTy->getElementType());
@@ -1290,7 +1425,7 @@ bool BitcodeReader::ParseConstants() {
     case bitc::CST_CODE_CE_SHUFFLEVEC: { // CE_SHUFFLEVEC: [opval, opval, opval]
       VectorType *OpTy = dyn_cast<VectorType>(CurTy);
       if (Record.size() < 3 || OpTy == 0)
-        return Error("Invalid CE_SHUFFLEVEC record");
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1304,7 +1439,7 @@ bool BitcodeReader::ParseConstants() {
       VectorType *OpTy =
         dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
       if (Record.size() < 4 || RTy == 0 || OpTy == 0)
-        return Error("Invalid CE_SHUFVEC_EX record");
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
       Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
@@ -1314,9 +1449,11 @@ bool BitcodeReader::ParseConstants() {
       break;
     }
     case bitc::CST_CODE_CE_CMP: {     // CE_CMP: [opty, opval, opval, pred]
-      if (Record.size() < 4) return Error("Invalid CE_CMP record");
+      if (Record.size() < 4)
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
-      if (OpTy == 0) return Error("Invalid CE_CMP record");
+      if (OpTy == 0)
+        return Error(InvalidRecord);
       Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
       Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
 
@@ -1329,16 +1466,17 @@ bool BitcodeReader::ParseConstants() {
     // This maintains backward compatibility, pre-asm dialect keywords.
     // FIXME: Remove with the 4.0 release.
     case bitc::CST_CODE_INLINEASM_OLD: {
-      if (Record.size() < 2) return Error("Invalid INLINEASM record");
+      if (Record.size() < 2)
+        return Error(InvalidRecord);
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = Record[0] >> 1;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1352,17 +1490,18 @@ bool BitcodeReader::ParseConstants() {
     // This version adds support for the asm dialect keywords (e.g.,
     // inteldialect).
     case bitc::CST_CODE_INLINEASM: {
-      if (Record.size() < 2) return Error("Invalid INLINEASM record");
+      if (Record.size() < 2)
+        return Error(InvalidRecord);
       std::string AsmStr, ConstrStr;
       bool HasSideEffects = Record[0] & 1;
       bool IsAlignStack = (Record[0] >> 1) & 1;
       unsigned AsmDialect = Record[0] >> 2;
       unsigned AsmStrSize = Record[1];
       if (2+AsmStrSize >= Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
       unsigned ConstStrSize = Record[2+AsmStrSize];
       if (3+AsmStrSize+ConstStrSize > Record.size())
-        return Error("Invalid INLINEASM record");
+        return Error(InvalidRecord);
 
       for (unsigned i = 0; i != AsmStrSize; ++i)
         AsmStr += (char)Record[2+i];
@@ -1375,12 +1514,15 @@ bool BitcodeReader::ParseConstants() {
       break;
     }
     case bitc::CST_CODE_BLOCKADDRESS:{
-      if (Record.size() < 3) return Error("Invalid CE_BLOCKADDRESS record");
+      if (Record.size() < 3)
+        return Error(InvalidRecord);
       Type *FnTy = getTypeByID(Record[0]);
-      if (FnTy == 0) return Error("Invalid CE_BLOCKADDRESS record");
+      if (FnTy == 0)
+        return Error(InvalidRecord);
       Function *Fn =
         dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
-      if (Fn == 0) return Error("Invalid CE_BLOCKADDRESS record");
+      if (Fn == 0)
+        return Error(InvalidRecord);
 
       // If the function is already parsed we can insert the block address right
       // away.
@@ -1388,7 +1530,7 @@ bool BitcodeReader::ParseConstants() {
         Function::iterator BBI = Fn->begin(), BBE = Fn->end();
         for (size_t I = 0, E = Record[2]; I != E; ++I) {
           if (BBI == BBE)
-            return Error("Invalid blockaddress block #");
+            return Error(InvalidID);
           ++BBI;
         }
         V = BlockAddress::get(Fn, BBI);
@@ -1411,9 +1553,9 @@ bool BitcodeReader::ParseConstants() {
   }
 }
 
-bool BitcodeReader::ParseUseLists() {
+error_code BitcodeReader::ParseUseLists() {
   if (Stream.EnterSubBlock(bitc::USELIST_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1424,9 +1566,9 @@ bool BitcodeReader::ParseUseLists() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed use list block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -1440,7 +1582,7 @@ bool BitcodeReader::ParseUseLists() {
     case bitc::USELIST_CODE_ENTRY: { // USELIST_CODE_ENTRY: TBD.
       unsigned RecordLength = Record.size();
       if (RecordLength < 1)
-        return Error ("Invalid UseList reader!");
+        return Error(InvalidRecord);
       UseListRecords.push_back(Record);
       break;
     }
@@ -1451,10 +1593,10 @@ bool BitcodeReader::ParseUseLists() {
 /// RememberAndSkipFunctionBody - When we see the block for a function body,
 /// remember where it is and then skip it.  This lets us lazily deserialize the
 /// functions.
-bool BitcodeReader::RememberAndSkipFunctionBody() {
+error_code BitcodeReader::RememberAndSkipFunctionBody() {
   // Get the function we are talking about.
   if (FunctionsWithBodies.empty())
-    return Error("Insufficient function protos");
+    return Error(InsufficientFunctionProtos);
 
   Function *Fn = FunctionsWithBodies.back();
   FunctionsWithBodies.pop_back();
@@ -1465,15 +1607,15 @@ bool BitcodeReader::RememberAndSkipFunctionBody() {
 
   // Skip over the function block for now.
   if (Stream.SkipBlock())
-    return Error("Malformed block record");
-  return false;
+    return Error(InvalidRecord);
+  return error_code::success();
 }
 
-bool BitcodeReader::GlobalCleanup() {
+error_code BitcodeReader::GlobalCleanup() {
   // Patch the initializers for globals and aliases up.
   ResolveGlobalAndAliasInits();
   if (!GlobalInits.empty() || !AliasInits.empty())
-    return Error("Malformed global initializer set");
+    return Error(MalformedGlobalInitializerSet);
 
   // Look for intrinsic functions which need to be upgraded at some point
   for (Module::iterator FI = TheModule->begin(), FE = TheModule->end();
@@ -1492,14 +1634,14 @@ bool BitcodeReader::GlobalCleanup() {
   // want lazy deserialization.
   std::vector<std::pair<GlobalVariable*, unsigned> >().swap(GlobalInits);
   std::vector<std::pair<GlobalAlias*, unsigned> >().swap(AliasInits);
-  return false;
+  return error_code::success();
 }
 
-bool BitcodeReader::ParseModule(bool Resume) {
+error_code BitcodeReader::ParseModule(bool Resume) {
   if (Resume)
     Stream.JumpToBit(NextUnreadBit);
   else if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
   std::vector<std::string> SectionTable;
@@ -1511,8 +1653,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      Error("malformed module block");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       return GlobalCleanup();
 
@@ -1520,49 +1661,51 @@ bool BitcodeReader::ParseModule(bool Resume) {
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error("Malformed block record");
+          return Error(InvalidRecord);
         break;
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error("Malformed BlockInfoBlock");
+          return Error(MalformedBlock);
         break;
       case bitc::PARAMATTR_BLOCK_ID:
-        if (ParseAttributeBlock())
-          return true;
+        if (error_code EC = ParseAttributeBlock())
+          return EC;
         break;
       case bitc::PARAMATTR_GROUP_BLOCK_ID:
-        if (ParseAttributeGroupBlock())
-          return true;
+        if (error_code EC = ParseAttributeGroupBlock())
+          return EC;
         break;
       case bitc::TYPE_BLOCK_ID_NEW:
-        if (ParseTypeTable())
-          return true;
+        if (error_code EC = ParseTypeTable())
+          return EC;
         break;
       case bitc::VALUE_SYMTAB_BLOCK_ID:
-        if (ParseValueSymbolTable())
-          return true;
+        if (error_code EC = ParseValueSymbolTable())
+          return EC;
         SeenValueSymbolTable = true;
         break;
       case bitc::CONSTANTS_BLOCK_ID:
-        if (ParseConstants() || ResolveGlobalAndAliasInits())
-          return true;
+        if (error_code EC = ParseConstants())
+          return EC;
+        if (error_code EC = ResolveGlobalAndAliasInits())
+          return EC;
         break;
       case bitc::METADATA_BLOCK_ID:
-        if (ParseMetadata())
-          return true;
+        if (error_code EC = ParseMetadata())
+          return EC;
         break;
       case bitc::FUNCTION_BLOCK_ID:
         // If this is the first function body we've seen, reverse the
         // FunctionsWithBodies list.
         if (!SeenFirstFunctionBody) {
           std::reverse(FunctionsWithBodies.begin(), FunctionsWithBodies.end());
-          if (GlobalCleanup())
-            return true;
+          if (error_code EC = GlobalCleanup())
+            return EC;
           SeenFirstFunctionBody = true;
         }
 
-        if (RememberAndSkipFunctionBody())
-          return true;
+        if (error_code EC = RememberAndSkipFunctionBody())
+          return EC;
         // For streaming bitcode, suspend parsing when we reach the function
         // bodies. Subsequent materialization calls will resume it when
         // necessary. For streaming, the function bodies must be at the end of
@@ -1571,12 +1714,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
         // just finish the parse now.
         if (LazyStreamer && SeenValueSymbolTable) {
           NextUnreadBit = Stream.GetCurrentBitNo();
-          return false;
+          return error_code::success();
         }
         break;
       case bitc::USELIST_BLOCK_ID:
-        if (ParseUseLists())
-          return true;
+        if (error_code EC = ParseUseLists())
+          return EC;
         break;
       }
       continue;
@@ -1592,11 +1735,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
     default: break;  // Default behavior, ignore unknown content.
     case bitc::MODULE_CODE_VERSION: {  // VERSION: [version#]
       if (Record.size() < 1)
-        return Error("Malformed MODULE_CODE_VERSION");
+        return Error(InvalidRecord);
       // Only version #0 and #1 are supported so far.
       unsigned module_version = Record[0];
       switch (module_version) {
-        default: return Error("Unknown bitstream version!");
+        default:
+          return Error(InvalidValue);
         case 0:
           UseRelativeIDs = false;
           break;
@@ -1609,21 +1753,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_TRIPLE record");
+        return Error(InvalidRecord);
       TheModule->setTargetTriple(S);
       break;
     }
     case bitc::MODULE_CODE_DATALAYOUT: {  // DATALAYOUT: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_DATALAYOUT record");
+        return Error(InvalidRecord);
       TheModule->setDataLayout(S);
       break;
     }
     case bitc::MODULE_CODE_ASM: {  // ASM: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_ASM record");
+        return Error(InvalidRecord);
       TheModule->setModuleInlineAsm(S);
       break;
     }
@@ -1631,21 +1775,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
       // FIXME: Remove in 4.0.
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_DEPLIB record");
+        return Error(InvalidRecord);
       // Ignore value.
       break;
     }
     case bitc::MODULE_CODE_SECTIONNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_SECTIONNAME record");
+        return Error(InvalidRecord);
       SectionTable.push_back(S);
       break;
     }
     case bitc::MODULE_CODE_GCNAME: {  // SECTIONNAME: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_GCNAME record");
+        return Error(InvalidRecord);
       GCTable.push_back(S);
       break;
     }
@@ -1654,11 +1798,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
     //             unnamed_addr]
     case bitc::MODULE_CODE_GLOBALVAR: {
       if (Record.size() < 6)
-        return Error("Invalid MODULE_CODE_GLOBALVAR record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid MODULE_CODE_GLOBALVAR record");
+      if (!Ty)
+        return Error(InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error("Global not a pointer type!");
+        return Error(InvalidTypeForValue);
       unsigned AddressSpace = cast<PointerType>(Ty)->getAddressSpace();
       Ty = cast<PointerType>(Ty)->getElementType();
 
@@ -1668,7 +1813,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
       std::string Section;
       if (Record[5]) {
         if (Record[5]-1 >= SectionTable.size())
-          return Error("Invalid section ID");
+          return Error(InvalidID);
         Section = SectionTable[Record[5]-1];
       }
       GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
@@ -1707,15 +1852,16 @@ bool BitcodeReader::ParseModule(bool Resume) {
     //             alignment, section, visibility, gc, unnamed_addr]
     case bitc::MODULE_CODE_FUNCTION: {
       if (Record.size() < 8)
-        return Error("Invalid MODULE_CODE_FUNCTION record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid MODULE_CODE_FUNCTION record");
+      if (!Ty)
+        return Error(InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error("Function not a pointer type!");
+        return Error(InvalidTypeForValue);
       FunctionType *FTy =
         dyn_cast<FunctionType>(cast<PointerType>(Ty)->getElementType());
       if (!FTy)
-        return Error("Function not a pointer to function type!");
+        return Error(InvalidTypeForValue);
 
       Function *Func = Function::Create(FTy, GlobalValue::ExternalLinkage,
                                         "", TheModule);
@@ -1728,19 +1874,21 @@ bool BitcodeReader::ParseModule(bool Resume) {
       Func->setAlignment((1 << Record[5]) >> 1);
       if (Record[6]) {
         if (Record[6]-1 >= SectionTable.size())
-          return Error("Invalid section ID");
+          return Error(InvalidID);
         Func->setSection(SectionTable[Record[6]-1]);
       }
       Func->setVisibility(GetDecodedVisibility(Record[7]));
       if (Record.size() > 8 && Record[8]) {
         if (Record[8]-1 > GCTable.size())
-          return Error("Invalid GC ID");
+          return Error(InvalidID);
         Func->setGC(GCTable[Record[8]-1].c_str());
       }
       bool UnnamedAddr = false;
       if (Record.size() > 9)
         UnnamedAddr = Record[9];
       Func->setUnnamedAddr(UnnamedAddr);
+      if (Record.size() > 10 && Record[10] != 0)
+        FunctionPrefixes.push_back(std::make_pair(Func, Record[10]-1));
       ValueList.push_back(Func);
 
       // If this is a function with a body, remember the prototype we are
@@ -1755,11 +1903,12 @@ bool BitcodeReader::ParseModule(bool Resume) {
     // ALIAS: [alias type, aliasee val#, linkage, visibility]
     case bitc::MODULE_CODE_ALIAS: {
       if (Record.size() < 3)
-        return Error("Invalid MODULE_ALIAS record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid MODULE_ALIAS record");
+      if (!Ty)
+        return Error(InvalidRecord);
       if (!Ty->isPointerTy())
-        return Error("Function not a pointer type!");
+        return Error(InvalidTypeForValue);
 
       GlobalAlias *NewGA = new GlobalAlias(Ty, GetDecodedLinkage(Record[2]),
                                            "", 0, TheModule);
@@ -1774,7 +1923,7 @@ bool BitcodeReader::ParseModule(bool Resume) {
     case bitc::MODULE_CODE_PURGEVALS:
       // Trim down the value list to the specified size.
       if (Record.size() < 1 || Record[0] > ValueList.size())
-        return Error("Invalid MODULE_PURGEVALS record");
+        return Error(InvalidRecord);
       ValueList.shrinkTo(Record[0]);
       break;
     }
@@ -1782,10 +1931,11 @@ bool BitcodeReader::ParseModule(bool Resume) {
   }
 }
 
-bool BitcodeReader::ParseBitcodeInto(Module *M) {
+error_code BitcodeReader::ParseBitcodeInto(Module *M) {
   TheModule = 0;
 
-  if (InitStream()) return true;
+  if (error_code EC = InitStream())
+    return EC;
 
   // Sniff for the signature.
   if (Stream.Read(8) != 'B' ||
@@ -1794,42 +1944,42 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error("Invalid bitcode signature");
+    return Error(InvalidBitcodeSignature);
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
   while (1) {
     if (Stream.AtEndOfStream())
-      return false;
+      return error_code::success();
 
     BitstreamEntry Entry =
       Stream.advance(BitstreamCursor::AF_DontAutoprocessAbbrevs);
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      Error("malformed module file");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
 
     case BitstreamEntry::SubBlock:
       switch (Entry.ID) {
       case bitc::BLOCKINFO_BLOCK_ID:
         if (Stream.ReadBlockInfoBlock())
-          return Error("Malformed BlockInfoBlock");
+          return Error(MalformedBlock);
         break;
       case bitc::MODULE_BLOCK_ID:
         // Reject multiple MODULE_BLOCK's in a single bitstream.
         if (TheModule)
-          return Error("Multiple MODULE_BLOCKs in same stream");
+          return Error(InvalidMultipleBlocks);
         TheModule = M;
-        if (ParseModule(false))
-          return true;
-        if (LazyStreamer) return false;
+        if (error_code EC = ParseModule(false))
+          return EC;
+        if (LazyStreamer)
+          return error_code::success();
         break;
       default:
         if (Stream.SkipBlock())
-          return Error("Malformed block record");
+          return Error(InvalidRecord);
         break;
       }
       continue;
@@ -1842,16 +1992,16 @@ bool BitcodeReader::ParseBitcodeInto(Module *M) {
       if (Stream.getAbbrevIDWidth() == 2 && Entry.ID == 2 &&
           Stream.Read(6) == 2 && Stream.Read(24) == 0xa0a0a &&
           Stream.AtEndOfStream())
-        return false;
+        return error_code::success();
 
-      return Error("Invalid record at top-level");
+      return Error(InvalidRecord);
     }
   }
 }
 
-bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
+error_code BitcodeReader::ParseModuleTriple(std::string &Triple) {
   if (Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
 
@@ -1862,9 +2012,9 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed module block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -1876,7 +2026,7 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
     case bitc::MODULE_CODE_TRIPLE: {  // TRIPLE: [strchr x N]
       std::string S;
       if (ConvertToString(Record, 0, S))
-        return Error("Invalid MODULE_CODE_TRIPLE record");
+        return Error(InvalidRecord);
       Triple = S;
       break;
     }
@@ -1885,8 +2035,9 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
   }
 }
 
-bool BitcodeReader::ParseTriple(std::string &Triple) {
-  if (InitStream()) return true;
+error_code BitcodeReader::ParseTriple(std::string &Triple) {
+  if (error_code EC = InitStream())
+    return EC;
 
   // Sniff for the signature.
   if (Stream.Read(8) != 'B' ||
@@ -1895,7 +2046,7 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
       Stream.Read(4) != 0xC ||
       Stream.Read(4) != 0xE ||
       Stream.Read(4) != 0xD)
-    return Error("Invalid bitcode signature");
+    return Error(InvalidBitcodeSignature);
 
   // We expect a number of well-defined blocks, though we don't necessarily
   // need to understand them all.
@@ -1904,20 +2055,17 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      Error("malformed module file");
-      return true;
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
 
     case BitstreamEntry::SubBlock:
       if (Entry.ID == bitc::MODULE_BLOCK_ID)
         return ParseModuleTriple(Triple);
 
       // Ignore other sub-blocks.
-      if (Stream.SkipBlock()) {
-        Error("malformed block record in AST file");
-        return true;
-      }
+      if (Stream.SkipBlock())
+        return Error(MalformedBlock);
       continue;
 
     case BitstreamEntry::Record:
@@ -1928,9 +2076,9 @@ bool BitcodeReader::ParseTriple(std::string &Triple) {
 }
 
 /// ParseMetadataAttachment - Parse metadata attachments.
-bool BitcodeReader::ParseMetadataAttachment() {
+error_code BitcodeReader::ParseMetadataAttachment() {
   if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   SmallVector<uint64_t, 64> Record;
   while (1) {
@@ -1939,9 +2087,9 @@ bool BitcodeReader::ParseMetadataAttachment() {
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return Error("malformed metadata block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
-      return false;
+      return error_code::success();
     case BitstreamEntry::Record:
       // The interesting case.
       break;
@@ -1955,16 +2103,18 @@ bool BitcodeReader::ParseMetadataAttachment() {
     case bitc::METADATA_ATTACHMENT: {
       unsigned RecordLength = Record.size();
       if (Record.empty() || (RecordLength - 1) % 2 == 1)
-        return Error ("Invalid METADATA_ATTACHMENT reader!");
+        return Error(InvalidRecord);
       Instruction *Inst = InstructionList[Record[0]];
       for (unsigned i = 1; i != RecordLength; i = i+2) {
         unsigned Kind = Record[i];
         DenseMap<unsigned, unsigned>::iterator I =
           MDKindMap.find(Kind);
         if (I == MDKindMap.end())
-          return Error("Invalid metadata kind ID");
+          return Error(InvalidID);
         Value *Node = MDValueList.getValueFwdRef(Record[i+1]);
         Inst->setMetadata(I->second, cast<MDNode>(Node));
+        if (I->second == LLVMContext::MD_tbaa)
+          InstsWithTBAATag.push_back(Inst);
       }
       break;
     }
@@ -1973,9 +2123,9 @@ bool BitcodeReader::ParseMetadataAttachment() {
 }
 
 /// ParseFunctionBody - Lazily parse the specified function body block.
-bool BitcodeReader::ParseFunctionBody(Function *F) {
+error_code BitcodeReader::ParseFunctionBody(Function *F) {
   if (Stream.EnterSubBlock(bitc::FUNCTION_BLOCK_ID))
-    return Error("Malformed block record");
+    return Error(InvalidRecord);
 
   InstructionList.clear();
   unsigned ModuleValueListSize = ValueList.size();
@@ -1998,7 +2148,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
     switch (Entry.Kind) {
     case BitstreamEntry::Error:
-      return Error("Bitcode error in function block");
+      return Error(MalformedBlock);
     case BitstreamEntry::EndBlock:
       goto OutOfRecordLoop;
 
@@ -2006,20 +2156,24 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       switch (Entry.ID) {
       default:  // Skip unknown content.
         if (Stream.SkipBlock())
-          return Error("Malformed block record");
+          return Error(InvalidRecord);
         break;
       case bitc::CONSTANTS_BLOCK_ID:
-        if (ParseConstants()) return true;
+        if (error_code EC = ParseConstants())
+          return EC;
         NextValueNo = ValueList.size();
         break;
       case bitc::VALUE_SYMTAB_BLOCK_ID:
-        if (ParseValueSymbolTable()) return true;
+        if (error_code EC = ParseValueSymbolTable())
+          return EC;
         break;
       case bitc::METADATA_ATTACHMENT_ID:
-        if (ParseMetadataAttachment()) return true;
+        if (error_code EC = ParseMetadataAttachment())
+          return EC;
         break;
       case bitc::METADATA_BLOCK_ID:
-        if (ParseMetadata()) return true;
+        if (error_code EC = ParseMetadata())
+          return EC;
         break;
       }
       continue;
@@ -2035,10 +2189,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     unsigned BitCode = Stream.readRecord(Entry.ID, Record);
     switch (BitCode) {
     default: // Default behavior: reject
-      return Error("Unknown instruction");
+      return Error(InvalidValue);
     case bitc::FUNC_CODE_DECLAREBLOCKS:     // DECLAREBLOCKS: [nblocks]
       if (Record.size() < 1 || Record[0] == 0)
-        return Error("Invalid DECLAREBLOCKS record");
+        return Error(InvalidRecord);
       // Create all the basic blocks for the function.
       FunctionBBs.resize(Record[0]);
       for (unsigned i = 0, e = FunctionBBs.size(); i != e; ++i)
@@ -2058,7 +2212,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
                !FunctionBBs[CurBBNo-1]->empty())
         I = &FunctionBBs[CurBBNo-1]->back();
 
-      if (I == 0) return Error("Invalid DEBUG_LOC_AGAIN record");
+      if (I == 0)
+        return Error(InvalidRecord);
       I->setDebugLoc(LastLoc);
       I = 0;
       continue;
@@ -2071,7 +2226,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
                !FunctionBBs[CurBBNo-1]->empty())
         I = &FunctionBBs[CurBBNo-1]->back();
       if (I == 0 || Record.size() < 4)
-        return Error("Invalid FUNC_CODE_DEBUG_LOC record");
+        return Error(InvalidRecord);
 
       unsigned Line = Record[0], Col = Record[1];
       unsigned ScopeID = Record[2], IAID = Record[3];
@@ -2091,10 +2246,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 > Record.size())
-        return Error("Invalid BINOP record");
+        return Error(InvalidRecord);
 
       int Opc = GetDecodedBinaryOpcode(Record[OpNum++], LHS->getType());
-      if (Opc == -1) return Error("Invalid BINOP record");
+      if (Opc == -1)
+        return Error(InvalidRecord);
       I = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
       InstructionList.push_back(I);
       if (OpNum < Record.size()) {
@@ -2136,13 +2292,21 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+2 != Record.size())
-        return Error("Invalid CAST record");
+        return Error(InvalidRecord);
 
       Type *ResTy = getTypeByID(Record[OpNum]);
       int Opc = GetDecodedCastOpcode(Record[OpNum+1]);
       if (Opc == -1 || ResTy == 0)
-        return Error("Invalid CAST record");
-      I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+        return Error(InvalidRecord);
+      Instruction *Temp = 0;
+      if ((I = UpgradeBitCastInst(Opc, Op, ResTy, Temp))) {
+        if (Temp) {
+          InstructionList.push_back(Temp);
+          CurBB->getInstList().push_back(Temp);
+        }
+      } else {
+        I = CastInst::Create((Instruction::CastOps)Opc, Op, ResTy);
+      }
       InstructionList.push_back(I);
       break;
     }
@@ -2151,13 +2315,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *BasePtr;
       if (getValueTypePair(Record, OpNum, NextValueNo, BasePtr))
-        return Error("Invalid GEP record");
+        return Error(InvalidRecord);
 
       SmallVector<Value*, 16> GEPIdx;
       while (OpNum != Record.size()) {
         Value *Op;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error("Invalid GEP record");
+          return Error(InvalidRecord);
         GEPIdx.push_back(Op);
       }
 
@@ -2173,14 +2337,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error("Invalid EXTRACTVAL record");
+        return Error(InvalidRecord);
 
       SmallVector<unsigned, 4> EXTRACTVALIdx;
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
         uint64_t Index = Record[OpNum];
         if ((unsigned)Index != Index)
-          return Error("Invalid EXTRACTVAL index");
+          return Error(InvalidValue);
         EXTRACTVALIdx.push_back((unsigned)Index);
       }
 
@@ -2194,17 +2358,17 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 0;
       Value *Agg;
       if (getValueTypePair(Record, OpNum, NextValueNo, Agg))
-        return Error("Invalid INSERTVAL record");
+        return Error(InvalidRecord);
       Value *Val;
       if (getValueTypePair(Record, OpNum, NextValueNo, Val))
-        return Error("Invalid INSERTVAL record");
+        return Error(InvalidRecord);
 
       SmallVector<unsigned, 4> INSERTVALIdx;
       for (unsigned RecSize = Record.size();
            OpNum != RecSize; ++OpNum) {
         uint64_t Index = Record[OpNum];
         if ((unsigned)Index != Index)
-          return Error("Invalid INSERTVAL index");
+          return Error(InvalidValue);
         INSERTVALIdx.push_back((unsigned)Index);
       }
 
@@ -2221,7 +2385,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond))
-        return Error("Invalid SELECT record");
+        return Error(InvalidRecord);
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
       InstructionList.push_back(I);
@@ -2236,18 +2400,18 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
           popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
           getValueTypePair(Record, OpNum, NextValueNo, Cond))
-        return Error("Invalid SELECT record");
+        return Error(InvalidRecord);
 
       // select condition can be either i1 or [N x i1]
       if (VectorType* vector_type =
           dyn_cast<VectorType>(Cond->getType())) {
         // expect <n x i1>
         if (vector_type->getElementType() != Type::getInt1Ty(Context))
-          return Error("Invalid SELECT condition type");
+          return Error(InvalidTypeForValue);
       } else {
         // expect i1
         if (Cond->getType() != Type::getInt1Ty(Context))
-          return Error("Invalid SELECT condition type");
+          return Error(InvalidTypeForValue);
       }
 
       I = SelectInst::Create(Cond, TrueVal, FalseVal);
@@ -2260,7 +2424,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Vec, *Idx;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
-        return Error("Invalid EXTRACTELT record");
+        return Error(InvalidRecord);
       I = ExtractElementInst::Create(Vec, Idx);
       InstructionList.push_back(I);
       break;
@@ -2273,7 +2437,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                    cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
           popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
-        return Error("Invalid INSERTELT record");
+        return Error(InvalidRecord);
       I = InsertElementInst::Create(Vec, Elt, Idx);
       InstructionList.push_back(I);
       break;
@@ -2284,10 +2448,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Vec1, *Vec2, *Mask;
       if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
           popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2))
-        return Error("Invalid SHUFFLEVEC record");
+        return Error(InvalidRecord);
 
       if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
-        return Error("Invalid SHUFFLEVEC record");
+        return Error(InvalidRecord);
       I = new ShuffleVectorInst(Vec1, Vec2, Mask);
       InstructionList.push_back(I);
       break;
@@ -2305,7 +2469,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
           popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
           OpNum+1 != Record.size())
-        return Error("Invalid CMP record");
+        return Error(InvalidRecord);
 
       if (LHS->getType()->isFPOrFPVectorTy())
         I = new FCmpInst((FCmpInst::Predicate)Record[OpNum], LHS, RHS);
@@ -2327,9 +2491,9 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         unsigned OpNum = 0;
         Value *Op = NULL;
         if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-          return Error("Invalid RET record");
+          return Error(InvalidRecord);
         if (OpNum != Record.size())
-          return Error("Invalid RET record");
+          return Error(InvalidRecord);
 
         I = ReturnInst::Create(Context, Op);
         InstructionList.push_back(I);
@@ -2337,10 +2501,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       }
     case bitc::FUNC_CODE_INST_BR: { // BR: [bb#, bb#, opval] or [bb#]
       if (Record.size() != 1 && Record.size() != 3)
-        return Error("Invalid BR record");
+        return Error(InvalidRecord);
       BasicBlock *TrueDest = getBasicBlock(Record[0]);
       if (TrueDest == 0)
-        return Error("Invalid BR record");
+        return Error(InvalidRecord);
 
       if (Record.size() == 1) {
         I = BranchInst::Create(TrueDest);
@@ -2351,7 +2515,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         Value *Cond = getValue(Record, 2, NextValueNo,
                                Type::getInt1Ty(Context));
         if (FalseDest == 0 || Cond == 0)
-          return Error("Invalid BR record");
+          return Error(InvalidRecord);
         I = BranchInst::Create(TrueDest, FalseDest, Cond);
         InstructionList.push_back(I);
       }
@@ -2360,7 +2524,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     case bitc::FUNC_CODE_INST_SWITCH: { // SWITCH: [opty, op0, op1, ...]
       // Check magic
       if ((Record[0] >> 16) == SWITCH_INST_MAGIC) {
-        // New SwitchInst format with case ranges.
+        // "New" SwitchInst format with case ranges. The changes to write this
+        // format were reverted but we still recognize bitcode that uses it.
+        // Hopefully someday we will have support for case ranges and can use
+        // this format again.
 
         Type *OpTy = getTypeByID(Record[1]);
         unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth();
@@ -2368,7 +2535,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         Value *Cond = getValue(Record, 2, NextValueNo, OpTy);
         BasicBlock *Default = getBasicBlock(Record[3]);
         if (OpTy == 0 || Cond == 0 || Default == 0)
-          return Error("Invalid SWITCH record");
+          return Error(InvalidRecord);
 
         unsigned NumCases = Record[4];
 
@@ -2377,7 +2544,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
         unsigned CurIdx = 5;
         for (unsigned i = 0; i != NumCases; ++i) {
-          IntegersSubsetToBB CaseBuilder;
+          SmallVector<ConstantInt*, 1> CaseVals;
           unsigned NumItems = Record[CurIdx++];
           for (unsigned ci = 0; ci != NumItems; ++ci) {
             bool isSingleNumber = Record[CurIdx++];
@@ -2397,20 +2564,22 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
               APInt High =
                   ReadWideAPInt(makeArrayRef(&Record[CurIdx], ActiveWords),
                                 ValueBitWidth);
-
-              CaseBuilder.add(IntItem::fromType(OpTy, Low),
-                              IntItem::fromType(OpTy, High));
               CurIdx += ActiveWords;
+
+              // FIXME: It is not clear whether values in the range should be
+              // compared as signed or unsigned values. The partially
+              // implemented changes that used this format in the past used
+              // unsigned comparisons.
+              for ( ; Low.ule(High); ++Low)
+                CaseVals.push_back(ConstantInt::get(Context, Low));
             } else
-              CaseBuilder.add(IntItem::fromType(OpTy, Low));
+              CaseVals.push_back(ConstantInt::get(Context, Low));
           }
           BasicBlock *DestBB = getBasicBlock(Record[CurIdx++]);
-          IntegersSubset Case = CaseBuilder.getCase();
-          SI->addCase(Case, DestBB);
+          for (SmallVector<ConstantInt*, 1>::iterator cvi = CaseVals.begin(),
+                 cve = CaseVals.end(); cvi != cve; ++cvi)
+            SI->addCase(*cvi, DestBB);
         }
-        uint16_t Hash = SI->hash();
-        if (Hash != (Record[0] & 0xFFFF))
-          return Error("Invalid SWITCH record");
         I = SI;
         break;
       }
@@ -2418,12 +2587,12 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       // Old SwitchInst format without case ranges.
 
       if (Record.size() < 3 || (Record.size() & 1) == 0)
-        return Error("Invalid SWITCH record");
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Cond = getValue(Record, 1, NextValueNo, OpTy);
       BasicBlock *Default = getBasicBlock(Record[2]);
       if (OpTy == 0 || Cond == 0 || Default == 0)
-        return Error("Invalid SWITCH record");
+        return Error(InvalidRecord);
       unsigned NumCases = (Record.size()-3)/2;
       SwitchInst *SI = SwitchInst::Create(Cond, Default, NumCases);
       InstructionList.push_back(SI);
@@ -2433,7 +2602,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         BasicBlock *DestBB = getBasicBlock(Record[1+3+i*2]);
         if (CaseVal == 0 || DestBB == 0) {
           delete SI;
-          return Error("Invalid SWITCH record!");
+          return Error(InvalidRecord);
         }
         SI->addCase(CaseVal, DestBB);
       }
@@ -2442,11 +2611,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_INDIRECTBR: { // INDIRECTBR: [opty, op0, op1, ...]
       if (Record.size() < 2)
-        return Error("Invalid INDIRECTBR record");
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Address = getValue(Record, 1, NextValueNo, OpTy);
       if (OpTy == 0 || Address == 0)
-        return Error("Invalid INDIRECTBR record");
+        return Error(InvalidRecord);
       unsigned NumDests = Record.size()-2;
       IndirectBrInst *IBI = IndirectBrInst::Create(Address, NumDests);
       InstructionList.push_back(IBI);
@@ -2455,7 +2624,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           IBI->addDestination(DestBB);
         } else {
           delete IBI;
-          return Error("Invalid INDIRECTBR record!");
+          return Error(InvalidRecord);
         }
       }
       I = IBI;
@@ -2464,7 +2633,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
     case bitc::FUNC_CODE_INST_INVOKE: {
       // INVOKE: [attrs, cc, normBB, unwindBB, fnty, op0,op1,op2, ...]
-      if (Record.size() < 4) return Error("Invalid INVOKE record");
+      if (Record.size() < 4)
+        return Error(InvalidRecord);
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
       BasicBlock *NormalBB = getBasicBlock(Record[2]);
@@ -2473,7 +2643,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 4;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error("Invalid INVOKE record");
+        return Error(InvalidRecord);
 
       PointerType *CalleeTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = !CalleeTy ? 0 :
@@ -2482,24 +2652,25 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       // Check that the right number of fixed parameters are here.
       if (FTy == 0 || NormalBB == 0 || UnwindBB == 0 ||
           Record.size() < OpNum+FTy->getNumParams())
-        return Error("Invalid INVOKE record");
+        return Error(InvalidRecord);
 
       SmallVector<Value*, 16> Ops;
       for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
         Ops.push_back(getValue(Record, OpNum, NextValueNo,
                                FTy->getParamType(i)));
-        if (Ops.back() == 0) return Error("Invalid INVOKE record");
+        if (Ops.back() == 0)
+          return Error(InvalidRecord);
       }
 
       if (!FTy->isVarArg()) {
         if (Record.size() != OpNum)
-          return Error("Invalid INVOKE record");
+          return Error(InvalidRecord);
       } else {
         // Read type/value pairs for varargs params.
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error("Invalid INVOKE record");
+            return Error(InvalidRecord);
           Ops.push_back(Op);
         }
       }
@@ -2515,7 +2686,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned Idx = 0;
       Value *Val = 0;
       if (getValueTypePair(Record, Idx, NextValueNo, Val))
-        return Error("Invalid RESUME record");
+        return Error(InvalidRecord);
       I = ResumeInst::Create(Val);
       InstructionList.push_back(I);
       break;
@@ -2526,9 +2697,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       break;
     case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
       if (Record.size() < 1 || ((Record.size()-1)&1))
-        return Error("Invalid PHI record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[0]);
-      if (!Ty) return Error("Invalid PHI record");
+      if (!Ty)
+        return Error(InvalidRecord);
 
       PHINode *PN = PHINode::Create(Ty, (Record.size()-1)/2);
       InstructionList.push_back(PN);
@@ -2543,7 +2715,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         else
           V = getValue(Record, 1+i, NextValueNo, Ty);
         BasicBlock *BB = getBasicBlock(Record[2+i]);
-        if (!V || !BB) return Error("Invalid PHI record");
+        if (!V || !BB)
+          return Error(InvalidRecord);
         PN->addIncoming(V, BB);
       }
       I = PN;
@@ -2554,12 +2727,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       // LANDINGPAD: [ty, val, val, num, (id0,val0 ...)?]
       unsigned Idx = 0;
       if (Record.size() < 4)
-        return Error("Invalid LANDINGPAD record");
+        return Error(InvalidRecord);
       Type *Ty = getTypeByID(Record[Idx++]);
-      if (!Ty) return Error("Invalid LANDINGPAD record");
+      if (!Ty)
+        return Error(InvalidRecord);
       Value *PersFn = 0;
       if (getValueTypePair(Record, Idx, NextValueNo, PersFn))
-        return Error("Invalid LANDINGPAD record");
+        return Error(InvalidRecord);
 
       bool IsCleanup = !!Record[Idx++];
       unsigned NumClauses = Record[Idx++];
@@ -2572,7 +2746,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
         if (getValueTypePair(Record, Idx, NextValueNo, Val)) {
           delete LP;
-          return Error("Invalid LANDINGPAD record");
+          return Error(InvalidRecord);
         }
 
         assert((CT != LandingPadInst::Catch ||
@@ -2591,13 +2765,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
 
     case bitc::FUNC_CODE_INST_ALLOCA: { // ALLOCA: [instty, opty, op, align]
       if (Record.size() != 4)
-        return Error("Invalid ALLOCA record");
+        return Error(InvalidRecord);
       PointerType *Ty =
         dyn_cast_or_null<PointerType>(getTypeByID(Record[0]));
       Type *OpTy = getTypeByID(Record[1]);
       Value *Size = getFnValueByID(Record[2], OpTy);
       unsigned Align = Record[3];
-      if (!Ty || !Size) return Error("Invalid ALLOCA record");
+      if (!Ty || !Size)
+        return Error(InvalidRecord);
       I = new AllocaInst(Ty->getElementType(), Size, (1 << Align) >> 1);
       InstructionList.push_back(I);
       break;
@@ -2607,7 +2782,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+2 != Record.size())
-        return Error("Invalid LOAD record");
+        return Error(InvalidRecord);
 
       I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1);
       InstructionList.push_back(I);
@@ -2619,15 +2794,15 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       Value *Op;
       if (getValueTypePair(Record, OpNum, NextValueNo, Op) ||
           OpNum+4 != Record.size())
-        return Error("Invalid LOADATOMIC record");
+        return Error(InvalidRecord);
 
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Release ||
           Ordering == AcquireRelease)
-        return Error("Invalid LOADATOMIC record");
+        return Error(InvalidRecord);
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error("Invalid LOADATOMIC record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
 
       I = new LoadInst(Op, "", Record[OpNum+1], (1 << Record[OpNum]) >> 1,
@@ -2642,7 +2817,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+2 != Record.size())
-        return Error("Invalid STORE record");
+        return Error(InvalidRecord);
 
       I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1);
       InstructionList.push_back(I);
@@ -2656,15 +2831,15 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error("Invalid STOREATOMIC record");
+        return Error(InvalidRecord);
 
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Acquire ||
           Ordering == AcquireRelease)
-        return Error("Invalid STOREATOMIC record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       if (Ordering != NotAtomic && Record[OpNum] == 0)
-        return Error("Invalid STOREATOMIC record");
+        return Error(InvalidRecord);
 
       I = new StoreInst(Val, Ptr, Record[OpNum+1], (1 << Record[OpNum]) >> 1,
                         Ordering, SynchScope);
@@ -2681,10 +2856,10 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), New) ||
           OpNum+3 != Record.size())
-        return Error("Invalid CMPXCHG record");
+        return Error(InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+1]);
       if (Ordering == NotAtomic || Ordering == Unordered)
-        return Error("Invalid CMPXCHG record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+2]);
       I = new AtomicCmpXchgInst(Ptr, Cmp, New, Ordering, SynchScope);
       cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
@@ -2699,14 +2874,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
           popValue(Record, OpNum, NextValueNo,
                     cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
           OpNum+4 != Record.size())
-        return Error("Invalid ATOMICRMW record");
+        return Error(InvalidRecord);
       AtomicRMWInst::BinOp Operation = GetDecodedRMWOperation(Record[OpNum]);
       if (Operation < AtomicRMWInst::FIRST_BINOP ||
           Operation > AtomicRMWInst::LAST_BINOP)
-        return Error("Invalid ATOMICRMW record");
+        return Error(InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[OpNum+2]);
       if (Ordering == NotAtomic || Ordering == Unordered)
-        return Error("Invalid ATOMICRMW record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[OpNum+3]);
       I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SynchScope);
       cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
@@ -2715,11 +2890,11 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_FENCE: { // FENCE:[ordering, synchscope]
       if (2 != Record.size())
-        return Error("Invalid FENCE record");
+        return Error(InvalidRecord);
       AtomicOrdering Ordering = GetDecodedOrdering(Record[0]);
       if (Ordering == NotAtomic || Ordering == Unordered ||
           Ordering == Monotonic)
-        return Error("Invalid FENCE record");
+        return Error(InvalidRecord);
       SynchronizationScope SynchScope = GetDecodedSynchScope(Record[1]);
       I = new FenceInst(Context, Ordering, SynchScope);
       InstructionList.push_back(I);
@@ -2728,7 +2903,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     case bitc::FUNC_CODE_INST_CALL: {
       // CALL: [paramattrs, cc, fnty, fnid, arg0, arg1...]
       if (Record.size() < 3)
-        return Error("Invalid CALL record");
+        return Error(InvalidRecord);
 
       AttributeSet PAL = getAttributes(Record[0]);
       unsigned CCInfo = Record[1];
@@ -2736,13 +2911,13 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
       unsigned OpNum = 2;
       Value *Callee;
       if (getValueTypePair(Record, OpNum, NextValueNo, Callee))
-        return Error("Invalid CALL record");
+        return Error(InvalidRecord);
 
       PointerType *OpTy = dyn_cast<PointerType>(Callee->getType());
       FunctionType *FTy = 0;
       if (OpTy) FTy = dyn_cast<FunctionType>(OpTy->getElementType());
       if (!FTy || Record.size() < FTy->getNumParams()+OpNum)
-        return Error("Invalid CALL record");
+        return Error(InvalidRecord);
 
       SmallVector<Value*, 16> Args;
       // Read the fixed params.
@@ -2752,18 +2927,19 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
         else
           Args.push_back(getValue(Record, OpNum, NextValueNo,
                                   FTy->getParamType(i)));
-        if (Args.back() == 0) return Error("Invalid CALL record");
+        if (Args.back() == 0)
+          return Error(InvalidRecord);
       }
 
       // Read type/value pairs for varargs params.
       if (!FTy->isVarArg()) {
         if (OpNum != Record.size())
-          return Error("Invalid CALL record");
+          return Error(InvalidRecord);
       } else {
         while (OpNum != Record.size()) {
           Value *Op;
           if (getValueTypePair(Record, OpNum, NextValueNo, Op))
-            return Error("Invalid CALL record");
+            return Error(InvalidRecord);
           Args.push_back(Op);
         }
       }
@@ -2778,12 +2954,12 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     }
     case bitc::FUNC_CODE_INST_VAARG: { // VAARG: [valistty, valist, instty]
       if (Record.size() < 3)
-        return Error("Invalid VAARG record");
+        return Error(InvalidRecord);
       Type *OpTy = getTypeByID(Record[0]);
       Value *Op = getValue(Record, 1, NextValueNo, OpTy);
       Type *ResTy = getTypeByID(Record[2]);
       if (!OpTy || !Op || !ResTy)
-        return Error("Invalid VAARG record");
+        return Error(InvalidRecord);
       I = new VAArgInst(Op, ResTy);
       InstructionList.push_back(I);
       break;
@@ -2794,7 +2970,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
     // this file.
     if (CurBB == 0) {
       delete I;
-      return Error("Invalid instruction with no BB");
+      return Error(InvalidInstructionWithNoBB);
     }
     CurBB->getInstList().push_back(I);
 
@@ -2821,7 +2997,7 @@ OutOfRecordLoop:
           delete A;
         }
       }
-      return Error("Never resolved value found in function!");
+      return Error(NeverResolvedValueFoundInFunction);
     }
   }
 
@@ -2837,7 +3013,7 @@ OutOfRecordLoop:
     for (unsigned i = 0, e = RefList.size(); i != e; ++i) {
       unsigned BlockIdx = RefList[i].first;
       if (BlockIdx >= FunctionBBs.size())
-        return Error("Invalid blockaddress block #");
+        return Error(InvalidID);
 
       GlobalVariable *FwdRef = RefList[i].second;
       FwdRef->replaceAllUsesWith(BlockAddress::get(F, FunctionBBs[BlockIdx]));
@@ -2851,20 +3027,21 @@ OutOfRecordLoop:
   ValueList.shrinkTo(ModuleValueListSize);
   MDValueList.shrinkTo(ModuleMDValueListSize);
   std::vector<BasicBlock*>().swap(FunctionBBs);
-  return false;
+  return error_code::success();
 }
 
-/// FindFunctionInStream - Find the function body in the bitcode stream
-bool BitcodeReader::FindFunctionInStream(Function *F,
+/// Find the function body in the bitcode stream
+error_code BitcodeReader::FindFunctionInStream(Function *F,
        DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator) {
   while (DeferredFunctionInfoIterator->second == 0) {
     if (Stream.AtEndOfStream())
-      return Error("Could not find Function in stream");
+      return Error(CouldNotFindFunctionInStream);
     // ParseModule will parse the next body in the stream and set its
     // position in the DeferredFunctionInfo map.
-    if (ParseModule(true)) return true;
+    if (error_code EC = ParseModule(true))
+      return EC;
   }
-  return false;
+  return error_code::success();
 }
 
 //===----------------------------------------------------------------------===//
@@ -2880,25 +3057,25 @@ bool BitcodeReader::isMaterializable(const GlobalValue *GV) const {
   return false;
 }
 
-bool BitcodeReader::Materialize(GlobalValue *GV, std::string *ErrInfo) {
+error_code BitcodeReader::Materialize(GlobalValue *GV) {
   Function *F = dyn_cast<Function>(GV);
   // If it's not a function or is already material, ignore the request.
-  if (!F || !F->isMaterializable()) return false;
+  if (!F || !F->isMaterializable())
+    return error_code::success();
 
   DenseMap<Function*, uint64_t>::iterator DFII = DeferredFunctionInfo.find(F);
   assert(DFII != DeferredFunctionInfo.end() && "Deferred function not found!");
   // If its position is recorded as 0, its body is somewhere in the stream
   // but we haven't seen it yet.
-  if (DFII->second == 0)
-    if (LazyStreamer && FindFunctionInStream(F, DFII)) return true;
+  if (DFII->second == 0 && LazyStreamer)
+    if (error_code EC = FindFunctionInStream(F, DFII))
+      return EC;
 
   // Move the bit stream to the saved position of the deferred function body.
   Stream.JumpToBit(DFII->second);
 
-  if (ParseFunctionBody(F)) {
-    if (ErrInfo) *ErrInfo = ErrorString;
-    return true;
-  }
+  if (error_code EC = ParseFunctionBody(F))
+    return EC;
 
   // Upgrade any old intrinsic calls in the function.
   for (UpgradedIntrinsicMap::iterator I = UpgradedIntrinsics.begin(),
@@ -2912,7 +3089,7 @@ bool BitcodeReader::Materialize(GlobalValue *GV, std::string *ErrInfo) {
     }
   }
 
-  return false;
+  return error_code::success();
 }
 
 bool BitcodeReader::isDematerializable(const GlobalValue *GV) const {
@@ -2935,17 +3112,18 @@ void BitcodeReader::Dematerialize(GlobalValue *GV) {
 }
 
 
-bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
+error_code BitcodeReader::MaterializeModule(Module *M) {
   assert(M == TheModule &&
          "Can only Materialize the Module this BitcodeReader is attached to.");
   // Iterate over the module, deserializing any functions that are still on
   // disk.
   for (Module::iterator F = TheModule->begin(), E = TheModule->end();
-       F != E; ++F)
-    if (F->isMaterializable() &&
-        Materialize(F, ErrInfo))
-      return true;
-
+       F != E; ++F) {
+    if (F->isMaterializable()) {
+      if (error_code EC = Materialize(F))
+        return EC;
+    }
+  }
   // At this point, if there are any function bodies, the current bit is
   // pointing to the END_BLOCK record after them. Now make sure the rest
   // of the bits in the module have been read.
@@ -2971,38 +3149,43 @@ bool BitcodeReader::MaterializeModule(Module *M, std::string *ErrInfo) {
   }
   std::vector<std::pair<Function*, Function*> >().swap(UpgradedIntrinsics);
 
-  return false;
+  for (unsigned I = 0, E = InstsWithTBAATag.size(); I < E; I++)
+    UpgradeInstWithTBAATag(InstsWithTBAATag[I]);
+
+  UpgradeDebugInfo(*M);
+  return error_code::success();
 }
 
-bool BitcodeReader::InitStream() {
-  if (LazyStreamer) return InitLazyStream();
+error_code BitcodeReader::InitStream() {
+  if (LazyStreamer)
+    return InitLazyStream();
   return InitStreamFromBuffer();
 }
 
-bool BitcodeReader::InitStreamFromBuffer() {
+error_code BitcodeReader::InitStreamFromBuffer() {
   const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
   const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
 
   if (Buffer->getBufferSize() & 3) {
     if (!isRawBitcode(BufPtr, BufEnd) && !isBitcodeWrapper(BufPtr, BufEnd))
-      return Error("Invalid bitcode signature");
+      return Error(InvalidBitcodeSignature);
     else
-      return Error("Bitcode stream should be a multiple of 4 bytes in length");
+      return Error(BitcodeStreamInvalidSize);
   }
 
   // If we have a wrapper header, parse it and ignore the non-bc file contents.
   // The magic number is 0x0B17C0DE stored in little endian.
   if (isBitcodeWrapper(BufPtr, BufEnd))
     if (SkipBitcodeWrapperHeader(BufPtr, BufEnd, true))
-      return Error("Invalid bitcode wrapper header");
+      return Error(InvalidBitcodeWrapperHeader);
 
   StreamFile.reset(new BitstreamReader(BufPtr, BufEnd));
   Stream.init(*StreamFile);
 
-  return false;
+  return error_code::success();
 }
 
-bool BitcodeReader::InitLazyStream() {
+error_code BitcodeReader::InitLazyStream() {
   // Check and strip off the bitcode wrapper; BitstreamReader expects never to
   // see it.
   StreamingMemoryObject *Bytes = new StreamingMemoryObject(LazyStreamer);
@@ -3010,11 +3193,11 @@ bool BitcodeReader::InitLazyStream() {
   Stream.init(*StreamFile);
 
   unsigned char buf[16];
-  if (Bytes->readBytes(0, 16, buf, NULL) == -1)
-    return Error("Bitcode stream must be at least 16 bytes in length");
+  if (Bytes->readBytes(0, 16, buf) == -1)
+    return Error(BitcodeStreamInvalidSize);
 
   if (!isBitcode(buf, buf + 16))
-    return Error("Invalid bitcode signature");
+    return Error(InvalidBitcodeSignature);
 
   if (isBitcodeWrapper(buf, buf + 4)) {
     const unsigned char *bitcodeStart = buf;
@@ -3023,7 +3206,64 @@ bool BitcodeReader::InitLazyStream() {
     Bytes->dropLeadingBytes(bitcodeStart - buf);
     Bytes->setKnownObjectSize(bitcodeEnd - bitcodeStart);
   }
-  return false;
+  return error_code::success();
+}
+
+namespace {
+class BitcodeErrorCategoryType : public _do_message {
+  const char *name() const LLVM_OVERRIDE {
+    return "llvm.bitcode";
+  }
+  std::string message(int IE) const LLVM_OVERRIDE {
+    BitcodeReader::ErrorType E = static_cast<BitcodeReader::ErrorType>(IE);
+    switch (E) {
+    case BitcodeReader::BitcodeStreamInvalidSize:
+      return "Bitcode stream length should be >= 16 bytes and a multiple of 4";
+    case BitcodeReader::ConflictingMETADATA_KINDRecords:
+      return "Conflicting METADATA_KIND records";
+    case BitcodeReader::CouldNotFindFunctionInStream:
+      return "Could not find function in stream";
+    case BitcodeReader::ExpectedConstant:
+      return "Expected a constant";
+    case BitcodeReader::InsufficientFunctionProtos:
+      return "Insufficient function protos";
+    case BitcodeReader::InvalidBitcodeSignature:
+      return "Invalid bitcode signature";
+    case BitcodeReader::InvalidBitcodeWrapperHeader:
+      return "Invalid bitcode wrapper header";
+    case BitcodeReader::InvalidConstantReference:
+      return "Invalid ronstant reference";
+    case BitcodeReader::InvalidID:
+      return "Invalid ID";
+    case BitcodeReader::InvalidInstructionWithNoBB:
+      return "Invalid instruction with no BB";
+    case BitcodeReader::InvalidRecord:
+      return "Invalid record";
+    case BitcodeReader::InvalidTypeForValue:
+      return "Invalid type for value";
+    case BitcodeReader::InvalidTYPETable:
+      return "Invalid TYPE table";
+    case BitcodeReader::InvalidType:
+      return "Invalid type";
+    case BitcodeReader::MalformedBlock:
+      return "Malformed block";
+    case BitcodeReader::MalformedGlobalInitializerSet:
+      return "Malformed global initializer set";
+    case BitcodeReader::InvalidMultipleBlocks:
+      return "Invalid multiple blocks";
+    case BitcodeReader::NeverResolvedValueFoundInFunction:
+      return "Never resolved value found in function";
+    case BitcodeReader::InvalidValue:
+      return "Invalid value";
+    }
+    llvm_unreachable("Unknown error type!");
+  }
+};
+}
+
+const error_category &BitcodeReader::BitcodeErrorCategory() {
+  static BitcodeErrorCategoryType O;
+  return O;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3038,9 +3278,9 @@ Module *llvm::getLazyBitcodeModule(MemoryBuffer *Buffer,
   Module *M = new Module(Buffer->getBufferIdentifier(), Context);
   BitcodeReader *R = new BitcodeReader(Buffer, Context);
   M->setMaterializer(R);
-  if (R->ParseBitcodeInto(M)) {
+  if (error_code EC = R->ParseBitcodeInto(M)) {
     if (ErrMsg)
-      *ErrMsg = R->getErrorString();
+      *ErrMsg = EC.message();
 
     delete M;  // Also deletes R.
     return 0;
@@ -3061,9 +3301,9 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name,
   Module *M = new Module(name, Context);
   BitcodeReader *R = new BitcodeReader(streamer, Context);
   M->setMaterializer(R);
-  if (R->ParseBitcodeInto(M)) {
+  if (error_code EC = R->ParseBitcodeInto(M)) {
     if (ErrMsg)
-      *ErrMsg = R->getErrorString();
+      *ErrMsg = EC.message();
     delete M;  // Also deletes R.
     return 0;
   }
@@ -3102,9 +3342,9 @@ std::string llvm::getBitcodeTargetTriple(MemoryBuffer *Buffer,
   R->setBufferOwned(false);
 
   std::string Triple("");
-  if (R->ParseTriple(Triple))
+  if (error_code EC = R->ParseTriple(Triple))
     if (ErrMsg)
-      *ErrMsg = R->getErrorString();
+      *ErrMsg = EC.message();
 
   delete R;
   return Triple;
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index 28674eb..c5d345b 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -21,6 +21,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/OperandTraits.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/system_error.h"
 #include "llvm/Support/ValueHandle.h"
 #include <vector>
 
@@ -132,8 +133,6 @@ class BitcodeReader : public GVMaterializer {
   uint64_t NextUnreadBit;
   bool SeenValueSymbolTable;
 
-  const char *ErrorString;
-
   std::vector<Type*> TypeList;
   BitcodeReaderValueList ValueList;
   BitcodeReaderMDValueList MDValueList;
@@ -142,6 +141,9 @@ class BitcodeReader : public GVMaterializer {
 
   std::vector<std::pair<GlobalVariable*, unsigned> > GlobalInits;
   std::vector<std::pair<GlobalAlias*, unsigned> > AliasInits;
+  std::vector<std::pair<Function*, unsigned> > FunctionPrefixes;
+
+  SmallVector<Instruction*, 64> InstsWithTBAATag;
 
   /// MAttributes - The set of attributes by index.  Index zero in the
   /// file is for null, and is thus not represented here.  As such all indices
@@ -191,17 +193,46 @@ class BitcodeReader : public GVMaterializer {
   /// not need this flag.
   bool UseRelativeIDs;
 
+  static const error_category &BitcodeErrorCategory();
+
 public:
+  enum ErrorType {
+    BitcodeStreamInvalidSize,
+    ConflictingMETADATA_KINDRecords,
+    CouldNotFindFunctionInStream,
+    ExpectedConstant,
+    InsufficientFunctionProtos,
+    InvalidBitcodeSignature,
+    InvalidBitcodeWrapperHeader,
+    InvalidConstantReference,
+    InvalidID, // A read identifier is not found in the table it should be in.
+    InvalidInstructionWithNoBB,
+    InvalidRecord, // A read record doesn't have the expected size or structure
+    InvalidTypeForValue, // Type read OK, but is invalid for its use
+    InvalidTYPETable,
+    InvalidType, // We were unable to read a type
+    MalformedBlock, // We are unable to advance in the stream.
+    MalformedGlobalInitializerSet,
+    InvalidMultipleBlocks, // We found multiple blocks of a kind that should
+                           // have only one
+    NeverResolvedValueFoundInFunction,
+    InvalidValue // Invalid version, inst number, attr number, etc
+  };
+
+  error_code Error(ErrorType E) {
+    return error_code(E, BitcodeErrorCategory());
+  }
+
   explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
     : Context(C), TheModule(0), Buffer(buffer), BufferOwned(false),
       LazyStreamer(0), NextUnreadBit(0), SeenValueSymbolTable(false),
-      ErrorString(0), ValueList(C), MDValueList(C),
+      ValueList(C), MDValueList(C),
       SeenFirstFunctionBody(false), UseRelativeIDs(false) {
   }
   explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
     : Context(C), TheModule(0), Buffer(0), BufferOwned(false),
       LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false),
-      ErrorString(0), ValueList(C), MDValueList(C),
+      ValueList(C), MDValueList(C),
       SeenFirstFunctionBody(false), UseRelativeIDs(false) {
   }
   ~BitcodeReader() {
@@ -218,23 +249,17 @@ public:
 
   virtual bool isMaterializable(const GlobalValue *GV) const;
   virtual bool isDematerializable(const GlobalValue *GV) const;
-  virtual bool Materialize(GlobalValue *GV, std::string *ErrInfo = 0);
-  virtual bool MaterializeModule(Module *M, std::string *ErrInfo = 0);
+  virtual error_code Materialize(GlobalValue *GV);
+  virtual error_code MaterializeModule(Module *M);
   virtual void Dematerialize(GlobalValue *GV);
 
-  bool Error(const char *Str) {
-    ErrorString = Str;
-    return true;
-  }
-  const char *getErrorString() const { return ErrorString; }
-
   /// @brief Main interface to parsing a bitcode buffer.
   /// @returns true if an error occurred.
-  bool ParseBitcodeInto(Module *M);
+  error_code ParseBitcodeInto(Module *M);
 
   /// @brief Cheap mechanism to just extract module triple
   /// @returns true if an error occurred.
-  bool ParseTriple(std::string &Triple);
+  error_code ParseTriple(std::string &Triple);
 
   static uint64_t decodeSignRotatedValue(uint64_t V);
 
@@ -258,7 +283,7 @@ private:
   /// getValueTypePair - Read a value/type pair out of the specified record from
   /// slot 'Slot'.  Increment Slot past the number of slots used in the record.
   /// Return true on failure.
-  bool getValueTypePair(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
+  bool getValueTypePair(SmallVectorImpl<uint64_t> &Record, unsigned &Slot,
                         unsigned InstNum, Value *&ResVal) {
     if (Slot == Record.size()) return true;
     unsigned ValNo = (unsigned)Record[Slot++];
@@ -282,7 +307,7 @@ private:
   /// popValue - Read a value out of the specified record from slot 'Slot'.
   /// Increment Slot past the number of slots used by the value in the record.
   /// Return true if there is an error.
-  bool popValue(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
+  bool popValue(SmallVectorImpl<uint64_t> &Record, unsigned &Slot,
                 unsigned InstNum, Type *Ty, Value *&ResVal) {
     if (getValue(Record, Slot, InstNum, Ty, ResVal))
       return true;
@@ -292,7 +317,7 @@ private:
   }
 
   /// getValue -- Like popValue, but does not increment the Slot number.
-  bool getValue(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+  bool getValue(SmallVectorImpl<uint64_t> &Record, unsigned Slot,
                 unsigned InstNum, Type *Ty, Value *&ResVal) {
     ResVal = getValue(Record, Slot, InstNum, Ty);
     return ResVal == 0;
@@ -300,7 +325,7 @@ private:
 
   /// getValue -- Version of getValue that returns ResVal directly,
   /// or 0 if there is an error.
-  Value *getValue(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+  Value *getValue(SmallVectorImpl<uint64_t> &Record, unsigned Slot,
                   unsigned InstNum, Type *Ty) {
     if (Slot == Record.size()) return 0;
     unsigned ValNo = (unsigned)Record[Slot];
@@ -311,7 +336,7 @@ private:
   }
 
   /// getValueSigned -- Like getValue, but decodes signed VBRs.
-  Value *getValueSigned(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+  Value *getValueSigned(SmallVectorImpl<uint64_t> &Record, unsigned Slot,
                         unsigned InstNum, Type *Ty) {
     if (Slot == Record.size()) return 0;
     unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]);
@@ -321,26 +346,27 @@ private:
     return getFnValueByID(ValNo, Ty);
   }
 
-  bool ParseModule(bool Resume);
-  bool ParseAttributeBlock();
-  bool ParseAttributeGroupBlock();
-  bool ParseTypeTable();
-  bool ParseTypeTableBody();
-
-  bool ParseValueSymbolTable();
-  bool ParseConstants();
-  bool RememberAndSkipFunctionBody();
-  bool ParseFunctionBody(Function *F);
-  bool GlobalCleanup();
-  bool ResolveGlobalAndAliasInits();
-  bool ParseMetadata();
-  bool ParseMetadataAttachment();
-  bool ParseModuleTriple(std::string &Triple);
-  bool ParseUseLists();
-  bool InitStream();
-  bool InitStreamFromBuffer();
-  bool InitLazyStream();
-  bool FindFunctionInStream(Function *F,
+  error_code ParseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
+  error_code ParseModule(bool Resume);
+  error_code ParseAttributeBlock();
+  error_code ParseAttributeGroupBlock();
+  error_code ParseTypeTable();
+  error_code ParseTypeTableBody();
+
+  error_code ParseValueSymbolTable();
+  error_code ParseConstants();
+  error_code RememberAndSkipFunctionBody();
+  error_code ParseFunctionBody(Function *F);
+  error_code GlobalCleanup();
+  error_code ResolveGlobalAndAliasInits();
+  error_code ParseMetadata();
+  error_code ParseMetadataAttachment();
+  error_code ParseModuleTriple(std::string &Triple);
+  error_code ParseUseLists();
+  error_code InitStream();
+  error_code InitStreamFromBuffer();
+  error_code InitLazyStream();
+  error_code FindFunctionInStream(Function *F,
          DenseMap<Function*, uint64_t>::iterator DeferredFunctionInfoIterator);
 };
 
diff --git a/lib/Bitcode/Reader/BitstreamReader.cpp b/lib/Bitcode/Reader/BitstreamReader.cpp
index 9dafe2a..1fd9abd 100644
--- a/lib/Bitcode/Reader/BitstreamReader.cpp
+++ b/lib/Bitcode/Reader/BitstreamReader.cpp
@@ -204,7 +204,16 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID,
 
   const BitCodeAbbrev *Abbv = getAbbrev(AbbrevID);
 
-  for (unsigned i = 0, e = Abbv->getNumOperandInfos(); i != e; ++i) {
+  // Read the record code first.
+  assert(Abbv->getNumOperandInfos() != 0 && "no record code in abbreviation?");
+  const BitCodeAbbrevOp &CodeOp = Abbv->getOperandInfo(0);
+  if (CodeOp.isLiteral())
+    readAbbreviatedLiteral(CodeOp, Vals);
+  else
+    readAbbreviatedField(CodeOp, Vals);
+  unsigned Code = (unsigned)Vals.pop_back_val();
+
+  for (unsigned i = 1, e = Abbv->getNumOperandInfos(); i != e; ++i) {
     const BitCodeAbbrevOp &Op = Abbv->getOperandInfo(i);
     if (Op.isLiteral()) {
       readAbbreviatedLiteral(Op, Vals);
@@ -264,8 +273,6 @@ unsigned BitstreamCursor::readRecord(unsigned AbbrevID,
     JumpToBit(NewEnd);
   }
 
-  unsigned Code = (unsigned)Vals[0];
-  Vals.erase(Vals.begin());
   return Code;
 }
 
diff --git a/lib/Bitcode/Writer/BitWriter.cpp b/lib/Bitcode/Writer/BitWriter.cpp
index 985208c..cd1ada2 100644
--- a/lib/Bitcode/Writer/BitWriter.cpp
+++ b/lib/Bitcode/Writer/BitWriter.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
 
 int LLVMWriteBitcodeToFile(LLVMModuleRef M, const char *Path) {
   std::string ErrorInfo;
-  raw_fd_ostream OS(Path, ErrorInfo, raw_fd_ostream::F_Binary);
+  raw_fd_ostream OS(Path, ErrorInfo, sys::fs::F_Binary);
 
   if (!ErrorInfo.empty())
     return -1;
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1b73f23..4cfc6bd 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -60,10 +60,7 @@ enum {
   FUNCTION_INST_CAST_ABBREV,
   FUNCTION_INST_RET_VOID_ABBREV,
   FUNCTION_INST_RET_VAL_ABBREV,
-  FUNCTION_INST_UNREACHABLE_ABBREV,
-
-  // SwitchInst Magic
-  SWITCH_INST_MAGIC = 0x4B5 // May 2012 => 1205 => Hex
+  FUNCTION_INST_UNREACHABLE_ABBREV
 };
 
 static unsigned GetEncodedCastOpcode(unsigned Opcode) {
@@ -81,6 +78,7 @@ static unsigned GetEncodedCastOpcode(unsigned Opcode) {
   case Instruction::PtrToInt: return bitc::CAST_PTRTOINT;
   case Instruction::IntToPtr: return bitc::CAST_INTTOPTR;
   case Instruction::BitCast : return bitc::CAST_BITCAST;
+  case Instruction::AddrSpaceCast: return bitc::CAST_ADDRSPACECAST;
   }
 }
 
@@ -161,6 +159,91 @@ static void WriteStringRecord(unsigned Code, StringRef Str,
   Stream.EmitRecord(Code, Vals, AbbrevToUse);
 }
 
+static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
+  switch (Kind) {
+  case Attribute::Alignment:
+    return bitc::ATTR_KIND_ALIGNMENT;
+  case Attribute::AlwaysInline:
+    return bitc::ATTR_KIND_ALWAYS_INLINE;
+  case Attribute::Builtin:
+    return bitc::ATTR_KIND_BUILTIN;
+  case Attribute::ByVal:
+    return bitc::ATTR_KIND_BY_VAL;
+  case Attribute::Cold:
+    return bitc::ATTR_KIND_COLD;
+  case Attribute::InlineHint:
+    return bitc::ATTR_KIND_INLINE_HINT;
+  case Attribute::InReg:
+    return bitc::ATTR_KIND_IN_REG;
+  case Attribute::MinSize:
+    return bitc::ATTR_KIND_MIN_SIZE;
+  case Attribute::Naked:
+    return bitc::ATTR_KIND_NAKED;
+  case Attribute::Nest:
+    return bitc::ATTR_KIND_NEST;
+  case Attribute::NoAlias:
+    return bitc::ATTR_KIND_NO_ALIAS;
+  case Attribute::NoBuiltin:
+    return bitc::ATTR_KIND_NO_BUILTIN;
+  case Attribute::NoCapture:
+    return bitc::ATTR_KIND_NO_CAPTURE;
+  case Attribute::NoDuplicate:
+    return bitc::ATTR_KIND_NO_DUPLICATE;
+  case Attribute::NoImplicitFloat:
+    return bitc::ATTR_KIND_NO_IMPLICIT_FLOAT;
+  case Attribute::NoInline:
+    return bitc::ATTR_KIND_NO_INLINE;
+  case Attribute::NonLazyBind:
+    return bitc::ATTR_KIND_NON_LAZY_BIND;
+  case Attribute::NoRedZone:
+    return bitc::ATTR_KIND_NO_RED_ZONE;
+  case Attribute::NoReturn:
+    return bitc::ATTR_KIND_NO_RETURN;
+  case Attribute::NoUnwind:
+    return bitc::ATTR_KIND_NO_UNWIND;
+  case Attribute::OptimizeForSize:
+    return bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE;
+  case Attribute::OptimizeNone:
+    return bitc::ATTR_KIND_OPTIMIZE_NONE;
+  case Attribute::ReadNone:
+    return bitc::ATTR_KIND_READ_NONE;
+  case Attribute::ReadOnly:
+    return bitc::ATTR_KIND_READ_ONLY;
+  case Attribute::Returned:
+    return bitc::ATTR_KIND_RETURNED;
+  case Attribute::ReturnsTwice:
+    return bitc::ATTR_KIND_RETURNS_TWICE;
+  case Attribute::SExt:
+    return bitc::ATTR_KIND_S_EXT;
+  case Attribute::StackAlignment:
+    return bitc::ATTR_KIND_STACK_ALIGNMENT;
+  case Attribute::StackProtect:
+    return bitc::ATTR_KIND_STACK_PROTECT;
+  case Attribute::StackProtectReq:
+    return bitc::ATTR_KIND_STACK_PROTECT_REQ;
+  case Attribute::StackProtectStrong:
+    return bitc::ATTR_KIND_STACK_PROTECT_STRONG;
+  case Attribute::StructRet:
+    return bitc::ATTR_KIND_STRUCT_RET;
+  case Attribute::SanitizeAddress:
+    return bitc::ATTR_KIND_SANITIZE_ADDRESS;
+  case Attribute::SanitizeThread:
+    return bitc::ATTR_KIND_SANITIZE_THREAD;
+  case Attribute::SanitizeMemory:
+    return bitc::ATTR_KIND_SANITIZE_MEMORY;
+  case Attribute::UWTable:
+    return bitc::ATTR_KIND_UW_TABLE;
+  case Attribute::ZExt:
+    return bitc::ATTR_KIND_Z_EXT;
+  case Attribute::EndAttrKinds:
+    llvm_unreachable("Can not encode end-attribute kinds marker.");
+  case Attribute::None:
+    llvm_unreachable("Can not encode none-attribute.");
+  }
+
+  llvm_unreachable("Trying to encode unknown attribute");
+}
+
 static void WriteAttributeGroupTable(const ValueEnumerator &VE,
                                      BitstreamWriter &Stream) {
   const std::vector<AttributeSet> &AttrGrps = VE.getAttributeGroups();
@@ -182,10 +265,10 @@ static void WriteAttributeGroupTable(const ValueEnumerator &VE,
         Attribute Attr = *I;
         if (Attr.isEnumAttribute()) {
           Record.push_back(0);
-          Record.push_back(Attr.getKindAsEnum());
+          Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
         } else if (Attr.isAlignAttribute()) {
           Record.push_back(1);
-          Record.push_back(Attr.getKindAsEnum());
+          Record.push_back(getAttrKindEncoding(Attr.getKindAsEnum()));
           Record.push_back(Attr.getValueAsInt());
         } else {
           StringRef Kind = Attr.getKindAsString();
@@ -407,7 +490,6 @@ static unsigned getEncodedLinkage(const GlobalValue *GV) {
   case GlobalValue::AvailableExternallyLinkage:      return 12;
   case GlobalValue::LinkerPrivateLinkage:            return 13;
   case GlobalValue::LinkerPrivateWeakLinkage:        return 14;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:      return 15;
   }
   llvm_unreachable("Invalid linkage");
 }
@@ -524,7 +606,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
 
     // GLOBALVAR: [type, isconst, initid,
     //             linkage, alignment, section, visibility, threadlocal,
-    //             unnamed_addr]
+    //             unnamed_addr, externally_initialized]
     Vals.push_back(VE.getTypeID(GV->getType()));
     Vals.push_back(GV->isConstant());
     Vals.push_back(GV->isDeclaration() ? 0 :
@@ -550,7 +632,7 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
   // Emit the function proto information.
   for (Module::const_iterator F = M->begin(), E = M->end(); F != E; ++F) {
     // FUNCTION:  [type, callingconv, isproto, linkage, paramattrs, alignment,
-    //             section, visibility, gc, unnamed_addr]
+    //             section, visibility, gc, unnamed_addr, prefix]
     Vals.push_back(VE.getTypeID(F->getType()));
     Vals.push_back(F->getCallingConv());
     Vals.push_back(F->isDeclaration());
@@ -561,6 +643,8 @@ static void WriteModuleInfo(const Module *M, const ValueEnumerator &VE,
     Vals.push_back(getEncodedVisibility(F));
     Vals.push_back(F->hasGC() ? GCMap[F->getGC()] : 0);
     Vals.push_back(F->hasUnnamedAddr());
+    Vals.push_back(F->hasPrefixData() ? (VE.getValueID(F->getPrefixData()) + 1)
+                                      : 0);
 
     unsigned AbbrevToUse = 0;
     Stream.EmitRecord(bitc::MODULE_CODE_FUNCTION, Vals, AbbrevToUse);
@@ -614,7 +698,7 @@ static uint64_t GetOptimizationFlags(const Value *V) {
 static void WriteMDNode(const MDNode *N,
                         const ValueEnumerator &VE,
                         BitstreamWriter &Stream,
-                        SmallVector<uint64_t, 64> &Record) {
+                        SmallVectorImpl<uint64_t> &Record) {
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     if (N->getOperand(i)) {
       Record.push_back(VE.getTypeID(N->getOperand(i)->getType()));
@@ -701,7 +785,7 @@ static void WriteFunctionLocalMetadata(const Function &F,
                                        BitstreamWriter &Stream) {
   bool StartedMetadataBlock = false;
   SmallVector<uint64_t, 64> Record;
-  const SmallVector<const MDNode *, 8> &Vals = VE.getFunctionLocalMDValues();
+  const SmallVectorImpl<const MDNode *> &Vals = VE.getFunctionLocalMDValues();
   for (unsigned i = 0, e = Vals.size(); i != e; ++i)
     if (const MDNode *N = Vals[i])
       if (N->isFunctionLocal() && N->getFunction() == &F) {
@@ -780,34 +864,6 @@ static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
     Vals.push_back((-V << 1) | 1);
 }
 
-static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals,
-                      unsigned &Code, unsigned &AbbrevToUse, const APInt &Val,
-                      bool EmitSizeForWideNumbers = false
-                      ) {
-  if (Val.getBitWidth() <= 64) {
-    uint64_t V = Val.getSExtValue();
-    emitSignedInt64(Vals, V);
-    Code = bitc::CST_CODE_INTEGER;
-    AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
-  } else {
-    // Wide integers, > 64 bits in size.
-    // We have an arbitrary precision integer value to write whose
-    // bit width is > 64. However, in canonical unsigned integer
-    // format it is likely that the high bits are going to be zero.
-    // So, we only write the number of active words.
-    unsigned NWords = Val.getActiveWords();
-
-    if (EmitSizeForWideNumbers)
-      Vals.push_back(NWords);
-
-    const uint64_t *RawWords = Val.getRawData();
-    for (unsigned i = 0; i != NWords; ++i) {
-      emitSignedInt64(Vals, RawWords[i]);
-    }
-    Code = bitc::CST_CODE_WIDE_INTEGER;
-  }
-}
-
 static void WriteConstants(unsigned FirstVal, unsigned LastVal,
                            const ValueEnumerator &VE,
                            BitstreamWriter &Stream, bool isGlobal) {
@@ -891,7 +947,23 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
     } else if (isa<UndefValue>(C)) {
       Code = bitc::CST_CODE_UNDEF;
     } else if (const ConstantInt *IV = dyn_cast<ConstantInt>(C)) {
-      EmitAPInt(Record, Code, AbbrevToUse, IV->getValue());
+      if (IV->getBitWidth() <= 64) {
+        uint64_t V = IV->getSExtValue();
+        emitSignedInt64(Record, V);
+        Code = bitc::CST_CODE_INTEGER;
+        AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
+      } else {                             // Wide integers, > 64 bits in size.
+        // We have an arbitrary precision integer value to write whose
+        // bit width is > 64. However, in canonical unsigned integer
+        // format it is likely that the high bits are going to be zero.
+        // So, we only write the number of active words.
+        unsigned NWords = IV->getValue().getActiveWords();
+        const uint64_t *RawWords = IV->getValue().getRawData();
+        for (unsigned i = 0; i != NWords; ++i) {
+          emitSignedInt64(Record, RawWords[i]);
+        }
+        Code = bitc::CST_CODE_WIDE_INTEGER;
+      }
     } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
       Code = bitc::CST_CODE_FLOAT;
       Type *Ty = CFP->getType();
@@ -1078,7 +1150,7 @@ static void WriteModuleConstants(const ValueEnumerator &VE,
 /// instruction ID, then it is a forward reference, and it also includes the
 /// type ID.  The value ID that is written is encoded relative to the InstID.
 static bool PushValueAndType(const Value *V, unsigned InstID,
-                             SmallVector<unsigned, 64> &Vals,
+                             SmallVectorImpl<unsigned> &Vals,
                              ValueEnumerator &VE) {
   unsigned ValID = VE.getValueID(V);
   // Make encoding relative to the InstID.
@@ -1093,21 +1165,14 @@ static bool PushValueAndType(const Value *V, unsigned InstID,
 /// pushValue - Like PushValueAndType, but where the type of the value is
 /// omitted (perhaps it was already encoded in an earlier operand).
 static void pushValue(const Value *V, unsigned InstID,
-                      SmallVector<unsigned, 64> &Vals,
+                      SmallVectorImpl<unsigned> &Vals,
                       ValueEnumerator &VE) {
   unsigned ValID = VE.getValueID(V);
   Vals.push_back(InstID - ValID);
 }
 
-static void pushValue64(const Value *V, unsigned InstID,
-                        SmallVector<uint64_t, 128> &Vals,
-                        ValueEnumerator &VE) {
-  uint64_t ValID = VE.getValueID(V);
-  Vals.push_back(InstID - ValID);
-}
-
 static void pushValueSigned(const Value *V, unsigned InstID,
-                            SmallVector<uint64_t, 128> &Vals,
+                            SmallVectorImpl<uint64_t> &Vals,
                             ValueEnumerator &VE) {
   unsigned ValID = VE.getValueID(V);
   int64_t diff = ((int32_t)InstID - (int32_t)ValID);
@@ -1117,7 +1182,7 @@ static void pushValueSigned(const Value *V, unsigned InstID,
 /// WriteInstruction - Emit an instruction to the specified stream.
 static void WriteInstruction(const Instruction &I, unsigned InstID,
                              ValueEnumerator &VE, BitstreamWriter &Stream,
-                             SmallVector<unsigned, 64> &Vals) {
+                             SmallVectorImpl<unsigned> &Vals) {
   unsigned Code = 0;
   unsigned AbbrevToUse = 0;
   VE.setInstructionID(&I);
@@ -1229,63 +1294,16 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
     break;
   case Instruction::Switch:
     {
-      // Redefine Vals, since here we need to use 64 bit values
-      // explicitly to store large APInt numbers.
-      SmallVector<uint64_t, 128> Vals64;
-
       Code = bitc::FUNC_CODE_INST_SWITCH;
       const SwitchInst &SI = cast<SwitchInst>(I);
-
-      uint32_t SwitchRecordHeader = SI.hash() | (SWITCH_INST_MAGIC << 16);
-      Vals64.push_back(SwitchRecordHeader);
-
-      Vals64.push_back(VE.getTypeID(SI.getCondition()->getType()));
-      pushValue64(SI.getCondition(), InstID, Vals64, VE);
-      Vals64.push_back(VE.getValueID(SI.getDefaultDest()));
-      Vals64.push_back(SI.getNumCases());
+      Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
+      pushValue(SI.getCondition(), InstID, Vals, VE);
+      Vals.push_back(VE.getValueID(SI.getDefaultDest()));
       for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
            i != e; ++i) {
-        const IntegersSubset& CaseRanges = i.getCaseValueEx();
-        unsigned Code, Abbrev; // will unused.
-
-        if (CaseRanges.isSingleNumber()) {
-          Vals64.push_back(1/*NumItems = 1*/);
-          Vals64.push_back(true/*IsSingleNumber = true*/);
-          EmitAPInt(Vals64, Code, Abbrev, CaseRanges.getSingleNumber(0), true);
-        } else {
-
-          Vals64.push_back(CaseRanges.getNumItems());
-
-          if (CaseRanges.isSingleNumbersOnly()) {
-            for (unsigned ri = 0, rn = CaseRanges.getNumItems();
-                 ri != rn; ++ri) {
-
-              Vals64.push_back(true/*IsSingleNumber = true*/);
-
-              EmitAPInt(Vals64, Code, Abbrev,
-                        CaseRanges.getSingleNumber(ri), true);
-            }
-          } else
-            for (unsigned ri = 0, rn = CaseRanges.getNumItems();
-                 ri != rn; ++ri) {
-              IntegersSubset::Range r = CaseRanges.getItem(ri);
-              bool IsSingleNumber = CaseRanges.isSingleNumber(ri);
-
-              Vals64.push_back(IsSingleNumber);
-
-              EmitAPInt(Vals64, Code, Abbrev, r.getLow(), true);
-              if (!IsSingleNumber)
-                EmitAPInt(Vals64, Code, Abbrev, r.getHigh(), true);
-            }
-        }
-        Vals64.push_back(VE.getValueID(i.getCaseSuccessor()));
+        Vals.push_back(VE.getValueID(i.getCaseValue()));
+        Vals.push_back(VE.getValueID(i.getCaseSuccessor()));
       }
-
-      Stream.EmitRecord(Code, Vals64, AbbrevToUse);
-
-      // Also do expected action - clear external Vals collection:
-      Vals.clear();
-      return;
     }
     break;
   case Instruction::IndirectBr:
@@ -1847,6 +1865,8 @@ static void WriteModuleUseLists(const Module *M, ValueEnumerator &VE,
     WriteUseList(FI, VE, Stream);
     if (!FI->isDeclaration())
       WriteFunctionUseList(FI, VE, Stream);
+    if (FI->hasPrefixData())
+      WriteUseList(FI->getPrefixData(), VE, Stream);
   }
 
   // Write the aliases.
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index 8bac6da..a164104 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -60,6 +60,11 @@ ValueEnumerator::ValueEnumerator(const Module *M) {
        I != E; ++I)
     EnumerateValue(I->getAliasee());
 
+  // Enumerate the prefix data constants.
+  for (Module::const_iterator I = M->begin(), E = M->end(); I != E; ++I)
+    if (I->hasPrefixData())
+      EnumerateValue(I->getPrefixData());
+
   // Insert constants and metadata that are named at module level into the slot
   // pool so that the module symbol table can refer to them...
   EnumerateValueSymbolTable(M->getValueSymbolTable());
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index 0af6164c..d1ca15f 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -125,7 +125,7 @@ public:
 
   const ValueList &getValues() const { return Values; }
   const ValueList &getMDValues() const { return MDValues; }
-  const SmallVector<const MDNode *, 8> &getFunctionLocalMDValues() const {
+  const SmallVectorImpl<const MDNode *> &getFunctionLocalMDValues() const {
     return FunctionLocalMDs;
   }
   const TypeList &getTypes() const { return Types; }
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 76ebe9a..7fbf123 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -7,6 +7,7 @@ add_subdirectory(Bitcode)
 add_subdirectory(Transforms)
 add_subdirectory(Linker)
 add_subdirectory(Analysis)
+add_subdirectory(LTO)
 add_subdirectory(MC)
 add_subdirectory(Object)
 add_subdirectory(Option)
@@ -14,4 +15,3 @@ add_subdirectory(DebugInfo)
 add_subdirectory(ExecutionEngine)
 add_subdirectory(Target)
 add_subdirectory(AsmParser)
-add_subdirectory(Archive)
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index c50f8b5..2ee7767 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -247,8 +247,8 @@ void AggressiveAntiDepBreaker::GetPassthruRegs(MachineInstr *MI,
     if ((MO.isDef() && MI->isRegTiedToUseOperand(i)) ||
         IsImplicitDefUse(MI, MO)) {
       const unsigned Reg = MO.getReg();
-      PassthruRegs.insert(Reg);
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+      for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+           SubRegs.isValid(); ++SubRegs)
         PassthruRegs.insert(*SubRegs);
     }
   }
@@ -782,7 +782,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
     if (MI == CriticalPathMI) {
       CriticalPathSU = CriticalPathStep(CriticalPathSU);
       CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : 0;
-    } else {
+    } else if (CriticalPathSet.any()) {
       ExcludeRegs = &CriticalPathSet;
     }
 
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 4731af5..1600c67 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -202,161 +202,272 @@ ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
 }
 
 static bool isNoopBitcast(Type *T1, Type *T2,
-                          const TargetLowering& TLI) {
+                          const TargetLoweringBase& TLI) {
   return T1 == T2 || (T1->isPointerTy() && T2->isPointerTy()) ||
          (isa<VectorType>(T1) && isa<VectorType>(T2) &&
           TLI.isTypeLegal(EVT::getEVT(T1)) && TLI.isTypeLegal(EVT::getEVT(T2)));
 }
 
-/// sameNoopInput - Return true if V1 == V2, else if either V1 or V2 is a noop
-/// (i.e., lowers to no machine code), look through it (and any transitive noop
-/// operands to it) and check if it has the same noop input value.  This is
-/// used to determine if a tail call can be formed.
-static bool sameNoopInput(const Value *V1, const Value *V2,
-                          SmallVectorImpl<unsigned> &Els1,
-                          SmallVectorImpl<unsigned> &Els2,
-                          const TargetLowering &TLI) {
-  using std::swap;
-  bool swapParity = false;
-  bool equalEls = Els1 == Els2;
+/// Look through operations that will be free to find the earliest source of
+/// this value.
+///
+/// @param ValLoc If V has aggegate type, we will be interested in a particular
+/// scalar component. This records its address; the reverse of this list gives a
+/// sequence of indices appropriate for an extractvalue to locate the important
+/// value. This value is updated during the function and on exit will indicate
+/// similar information for the Value returned.
+///
+/// @param DataBits If this function looks through truncate instructions, this
+/// will record the smallest size attained.
+static const Value *getNoopInput(const Value *V,
+                                 SmallVectorImpl<unsigned> &ValLoc,
+                                 unsigned &DataBits,
+                                 const TargetLoweringBase &TLI) {
   while (true) {
-    if ((equalEls && V1 == V2) || isa<UndefValue>(V1) || isa<UndefValue>(V2)) {
-      if (swapParity)
-        // Revert to original Els1 and Els2 to avoid confusing recursive calls
-        swap(Els1, Els2);
-      return true;
-    }
-
     // Try to look through V1; if V1 is not an instruction, it can't be looked
     // through.
-    const Instruction *I = dyn_cast<Instruction>(V1);
+    const Instruction *I = dyn_cast<Instruction>(V);
+    if (!I || I->getNumOperands() == 0) return V;
     const Value *NoopInput = 0;
-    if (I != 0 && I->getNumOperands() > 0) {
-     Value *Op = I->getOperand(0);
-      if (isa<TruncInst>(I)) {
-        // Look through truly no-op truncates.
-        if (TLI.isTruncateFree(Op->getType(), I->getType()))
-          NoopInput = Op;
-      } else if (isa<BitCastInst>(I)) {
-        // Look through truly no-op bitcasts.
-        if (isNoopBitcast(Op->getType(), I->getType(), TLI))
-          NoopInput = Op;
-      } else if (isa<GetElementPtrInst>(I)) {
-        // Look through getelementptr
-        if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
-          NoopInput = Op;
-      } else if (isa<IntToPtrInst>(I)) {
-        // Look through inttoptr.
-        // Make sure this isn't a truncating or extending cast.  We could
-        // support this eventually, but don't bother for now.
-        if (!isa<VectorType>(I->getType()) &&
-            TLI.getPointerTy().getSizeInBits() == 
-              cast<IntegerType>(Op->getType())->getBitWidth())
-          NoopInput = Op;
-      } else if (isa<PtrToIntInst>(I)) {
-        // Look through ptrtoint.
-        // Make sure this isn't a truncating or extending cast.  We could
-        // support this eventually, but don't bother for now.
-        if (!isa<VectorType>(I->getType()) &&
-            TLI.getPointerTy().getSizeInBits() == 
-              cast<IntegerType>(I->getType())->getBitWidth())
-          NoopInput = Op;
-      } else if (isa<CallInst>(I)) {
-        // Look through call
-        for (User::const_op_iterator i = I->op_begin(),
-                                     // Skip Callee
-                                     e = I->op_end() - 1;
-             i != e; ++i) {
-          unsigned attrInd = i - I->op_begin() + 1;
-          if (cast<CallInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
-              isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
-            NoopInput = *i;
-            break;
-          }
+
+    Value *Op = I->getOperand(0);
+    if (isa<BitCastInst>(I)) {
+      // Look through truly no-op bitcasts.
+      if (isNoopBitcast(Op->getType(), I->getType(), TLI))
+        NoopInput = Op;
+    } else if (isa<GetElementPtrInst>(I)) {
+      // Look through getelementptr
+      if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+        NoopInput = Op;
+    } else if (isa<IntToPtrInst>(I)) {
+      // Look through inttoptr.
+      // Make sure this isn't a truncating or extending cast.  We could
+      // support this eventually, but don't bother for now.
+      if (!isa<VectorType>(I->getType()) &&
+          TLI.getPointerTy().getSizeInBits() ==
+          cast<IntegerType>(Op->getType())->getBitWidth())
+        NoopInput = Op;
+    } else if (isa<PtrToIntInst>(I)) {
+      // Look through ptrtoint.
+      // Make sure this isn't a truncating or extending cast.  We could
+      // support this eventually, but don't bother for now.
+      if (!isa<VectorType>(I->getType()) &&
+          TLI.getPointerTy().getSizeInBits() ==
+          cast<IntegerType>(I->getType())->getBitWidth())
+        NoopInput = Op;
+    } else if (isa<TruncInst>(I) &&
+               TLI.allowTruncateForTailCall(Op->getType(), I->getType())) {
+      DataBits = std::min(DataBits, I->getType()->getPrimitiveSizeInBits());
+      NoopInput = Op;
+    } else if (isa<CallInst>(I)) {
+      // Look through call (skipping callee)
+      for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 1;
+           i != e; ++i) {
+        unsigned attrInd = i - I->op_begin() + 1;
+        if (cast<CallInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
+            isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
+          NoopInput = *i;
+          break;
         }
-      } else if (isa<InvokeInst>(I)) {
-        // Look through invoke
-        for (User::const_op_iterator i = I->op_begin(),
-                                     // Skip BB, BB, Callee
-                                     e = I->op_end() - 3;
-             i != e; ++i) {
-          unsigned attrInd = i - I->op_begin() + 1;
-          if (cast<InvokeInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
-              isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
-            NoopInput = *i;
-            break;
-          }
+      }
+    } else if (isa<InvokeInst>(I)) {
+      // Look through invoke (skipping BB, BB, Callee)
+      for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 3;
+           i != e; ++i) {
+        unsigned attrInd = i - I->op_begin() + 1;
+        if (cast<InvokeInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
+            isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
+          NoopInput = *i;
+          break;
         }
       }
+    } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(V)) {
+      // Value may come from either the aggregate or the scalar
+      ArrayRef<unsigned> InsertLoc = IVI->getIndices();
+      if (std::equal(InsertLoc.rbegin(), InsertLoc.rend(),
+                     ValLoc.rbegin())) {
+        // The type being inserted is a nested sub-type of the aggregate; we
+        // have to remove those initial indices to get the location we're
+        // interested in for the operand.
+        ValLoc.resize(ValLoc.size() - InsertLoc.size());
+        NoopInput = IVI->getInsertedValueOperand();
+      } else {
+        // The struct we're inserting into has the value we're interested in, no
+        // change of address.
+        NoopInput = Op;
+      }
+    } else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+      // The part we're interested in will inevitably be some sub-section of the
+      // previous aggregate. Combine the two paths to obtain the true address of
+      // our element.
+      ArrayRef<unsigned> ExtractLoc = EVI->getIndices();
+      std::copy(ExtractLoc.rbegin(), ExtractLoc.rend(),
+                std::back_inserter(ValLoc));
+      NoopInput = Op;
     }
+    // Terminate if we couldn't find anything to look through.
+    if (!NoopInput)
+      return V;
 
-    if (NoopInput) {
-      V1 = NoopInput;
-      continue;
-    }
+    V = NoopInput;
+  }
+}
+
+/// Return true if this scalar return value only has bits discarded on its path
+/// from the "tail call" to the "ret". This includes the obvious noop
+/// instructions handled by getNoopInput above as well as free truncations (or
+/// extensions prior to the call).
+static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
+                                 SmallVectorImpl<unsigned> &RetIndices,
+                                 SmallVectorImpl<unsigned> &CallIndices,
+                                 bool AllowDifferingSizes,
+                                 const TargetLoweringBase &TLI) {
+
+  // Trace the sub-value needed by the return value as far back up the graph as
+  // possible, in the hope that it will intersect with the value produced by the
+  // call. In the simple case with no "returned" attribute, the hope is actually
+  // that we end up back at the tail call instruction itself.
+  unsigned BitsRequired = UINT_MAX;
+  RetVal = getNoopInput(RetVal, RetIndices, BitsRequired, TLI);
+
+  // If this slot in the value returned is undef, it doesn't matter what the
+  // call puts there, it'll be fine.
+  if (isa<UndefValue>(RetVal))
+    return true;
 
-    // If we already swapped, avoid infinite loop
-    if (swapParity)
-      break;
+  // Now do a similar search up through the graph to find where the value
+  // actually returned by the "tail call" comes from. In the simple case without
+  // a "returned" attribute, the search will be blocked immediately and the loop
+  // a Noop.
+  unsigned BitsProvided = UINT_MAX;
+  CallVal = getNoopInput(CallVal, CallIndices, BitsProvided, TLI);
+
+  // There's no hope if we can't actually trace them to (the same part of!) the
+  // same value.
+  if (CallVal != RetVal || CallIndices != RetIndices)
+    return false;
+
+  // However, intervening truncates may have made the call non-tail. Make sure
+  // all the bits that are needed by the "ret" have been provided by the "tail
+  // call". FIXME: with sufficiently cunning bit-tracking, we could look through
+  // extensions too.
+  if (BitsProvided < BitsRequired ||
+      (!AllowDifferingSizes && BitsProvided != BitsRequired))
+    return false;
 
-    // Otherwise, swap V1<->V2, Els1<->Els2
-    swap(V1, V2);
-    swap(Els1, Els2);
-    swapParity = !swapParity;
+  return true;
+}
+
+/// For an aggregate type, determine whether a given index is within bounds or
+/// not.
+static bool indexReallyValid(CompositeType *T, unsigned Idx) {
+  if (ArrayType *AT = dyn_cast<ArrayType>(T))
+    return Idx < AT->getNumElements();
+
+  return Idx < cast<StructType>(T)->getNumElements();
+}
+
+/// Move the given iterators to the next leaf type in depth first traversal.
+///
+/// Performs a depth-first traversal of the type as specified by its arguments,
+/// stopping at the next leaf node (which may be a legitimate scalar type or an
+/// empty struct or array).
+///
+/// @param SubTypes List of the partial components making up the type from
+/// outermost to innermost non-empty aggregate. The element currently
+/// represented is SubTypes.back()->getTypeAtIndex(Path.back() - 1).
+///
+/// @param Path Set of extractvalue indices leading from the outermost type
+/// (SubTypes[0]) to the leaf node currently represented.
+///
+/// @returns true if a new type was found, false otherwise. Calling this
+/// function again on a finished iterator will repeatedly return
+/// false. SubTypes.back()->getTypeAtIndex(Path.back()) is either an empty
+/// aggregate or a non-aggregate
+static bool advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
+                                  SmallVectorImpl<unsigned> &Path) {
+  // First march back up the tree until we can successfully increment one of the
+  // coordinates in Path.
+  while (!Path.empty() && !indexReallyValid(SubTypes.back(), Path.back() + 1)) {
+    Path.pop_back();
+    SubTypes.pop_back();
   }
 
-  for (unsigned n = 0; n < 2; ++n) {
-    if (isa<InsertValueInst>(V1)) {
-      if (isa<StructType>(V1->getType())) {
-        // Look through insertvalue
-        unsigned i, e;
-        for (i = 0, e = cast<StructType>(V1->getType())->getNumElements();
-             i != e; ++i) {
-          const Value *InScalar = FindInsertedValue(const_cast<Value*>(V1), i);
-          if (InScalar == 0)
-            break;
-          Els1.push_back(i);
-          if (!sameNoopInput(InScalar, V2, Els1, Els2, TLI)) {
-            Els1.pop_back();
-            break;
-          }
-          Els1.pop_back();
-        }
-        if (i == e) {
-          if (swapParity)
-            swap(Els1, Els2);
-          return true;
-        }
-      }
-    } else if (!Els1.empty() && isa<ExtractValueInst>(V1)) {
-      const ExtractValueInst *EVI = cast<ExtractValueInst>(V1);
-      unsigned i = Els1.back();
-      // If the scalar value being inserted is an extractvalue of the right
-      // index from the call, then everything is good.
-      if (isa<StructType>(EVI->getOperand(0)->getType()) &&
-          EVI->getNumIndices() == 1 && EVI->getIndices()[0] == i) {
-        // Look through extractvalue
-        Els1.pop_back();
-        if (sameNoopInput(EVI->getOperand(0), V2, Els1, Els2, TLI)) {
-          Els1.push_back(i);
-          if (swapParity)
-            swap(Els1, Els2);
-          return true;
-        }
-        Els1.push_back(i);
-      }
-    }
+  // If we reached the top, then the iterator is done.
+  if (Path.empty())
+    return false;
 
-    swap(V1, V2);
-    swap(Els1, Els2);
-    swapParity = !swapParity;
+  // We know there's *some* valid leaf now, so march back down the tree picking
+  // out the left-most element at each node.
+  ++Path.back();
+  Type *DeeperType = SubTypes.back()->getTypeAtIndex(Path.back());
+  while (DeeperType->isAggregateType()) {
+    CompositeType *CT = cast<CompositeType>(DeeperType);
+    if (!indexReallyValid(CT, 0))
+      return true;
+
+    SubTypes.push_back(CT);
+    Path.push_back(0);
+
+    DeeperType = CT->getTypeAtIndex(0U);
   }
 
-  if (swapParity)
-    swap(Els1, Els2);
-  return false;
+  return true;
 }
 
+/// Find the first non-empty, scalar-like type in Next and setup the iterator
+/// components.
+///
+/// Assuming Next is an aggregate of some kind, this function will traverse the
+/// tree from left to right (i.e. depth-first) looking for the first
+/// non-aggregate type which will play a role in function return.
+///
+/// For example, if Next was {[0 x i64], {{}, i32, {}}, i32} then we would setup
+/// Path as [1, 1] and SubTypes as [Next, {{}, i32, {}}] to represent the first
+/// i32 in that type.
+static bool firstRealType(Type *Next,
+                          SmallVectorImpl<CompositeType *> &SubTypes,
+                          SmallVectorImpl<unsigned> &Path) {
+  // First initialise the iterator components to the first "leaf" node
+  // (i.e. node with no valid sub-type at any index, so {} does count as a leaf
+  // despite nominally being an aggregate).
+  while (Next->isAggregateType() &&
+         indexReallyValid(cast<CompositeType>(Next), 0)) {
+    SubTypes.push_back(cast<CompositeType>(Next));
+    Path.push_back(0);
+    Next = cast<CompositeType>(Next)->getTypeAtIndex(0U);
+  }
+
+  // If there's no Path now, Next was originally scalar already (or empty
+  // leaf). We're done.
+  if (Path.empty())
+    return true;
+
+  // Otherwise, use normal iteration to keep looking through the tree until we
+  // find a non-aggregate type.
+  while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType()) {
+    if (!advanceToNextLeafType(SubTypes, Path))
+      return false;
+  }
+
+  return true;
+}
+
+/// Set the iterator data-structures to the next non-empty, non-aggregate
+/// subtype.
+static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
+                         SmallVectorImpl<unsigned> &Path) {
+  do {
+    if (!advanceToNextLeafType(SubTypes, Path))
+      return false;
+
+    assert(!Path.empty() && "found a leaf but didn't set the path?");
+  } while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType());
+
+  return true;
+}
+
+
 /// Test if the given instruction is in a position to be optimized
 /// with a tail-call. This roughly means that it's in a block with
 /// a return and there's nothing that needs to be scheduled
@@ -399,6 +510,13 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
         return false;
     }
 
+  return returnTypeIsEligibleForTailCall(ExitBB->getParent(), I, Ret, TLI);
+}
+
+bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
+                                           const Instruction *I,
+                                           const ReturnInst *Ret,
+                                           const TargetLoweringBase &TLI) {
   // If the block ends with a void return or unreachable, it doesn't matter
   // what the call's return type is.
   if (!Ret || Ret->getNumOperands() == 0) return true;
@@ -407,22 +525,85 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS,
   // return type is.
   if (isa<UndefValue>(Ret->getOperand(0))) return true;
 
-  // Conservatively require the attributes of the call to match those of
-  // the return. Ignore noalias because it doesn't affect the call sequence.
-  const Function *F = ExitBB->getParent();
-  AttributeSet CallerAttrs = F->getAttributes();
-  if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
-        removeAttribute(Attribute::NoAlias) !=
-      AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex).
-        removeAttribute(Attribute::NoAlias))
-    return false;
+  // Make sure the attributes attached to each return are compatible.
+  AttrBuilder CallerAttrs(F->getAttributes(),
+                          AttributeSet::ReturnIndex);
+  AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
+                          AttributeSet::ReturnIndex);
+
+  // Noalias is completely benign as far as calling convention goes, it
+  // shouldn't affect whether the call is a tail call.
+  CallerAttrs = CallerAttrs.removeAttribute(Attribute::NoAlias);
+  CalleeAttrs = CalleeAttrs.removeAttribute(Attribute::NoAlias);
+
+  bool AllowDifferingSizes = true;
+  if (CallerAttrs.contains(Attribute::ZExt)) {
+    if (!CalleeAttrs.contains(Attribute::ZExt))
+      return false;
 
-  // It's not safe to eliminate the sign / zero extension of the return value.
-  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
-      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+    AllowDifferingSizes = false;
+    CallerAttrs.removeAttribute(Attribute::ZExt);
+    CalleeAttrs.removeAttribute(Attribute::ZExt);
+  } else if (CallerAttrs.contains(Attribute::SExt)) {
+    if (!CalleeAttrs.contains(Attribute::SExt))
+      return false;
+
+    AllowDifferingSizes = false;
+    CallerAttrs.removeAttribute(Attribute::SExt);
+    CalleeAttrs.removeAttribute(Attribute::SExt);
+  }
+
+  // If they're still different, there's some facet we don't understand
+  // (currently only "inreg", but in future who knows). It may be OK but the
+  // only safe option is to reject the tail call.
+  if (CallerAttrs != CalleeAttrs)
     return false;
 
-  // Otherwise, make sure the return value and I have the same value
-  SmallVector<unsigned, 4> Els1, Els2;
-  return sameNoopInput(Ret->getOperand(0), I, Els1, Els2, TLI);
+  const Value *RetVal = Ret->getOperand(0), *CallVal = I;
+  SmallVector<unsigned, 4> RetPath, CallPath;
+  SmallVector<CompositeType *, 4> RetSubTypes, CallSubTypes;
+
+  bool RetEmpty = !firstRealType(RetVal->getType(), RetSubTypes, RetPath);
+  bool CallEmpty = !firstRealType(CallVal->getType(), CallSubTypes, CallPath);
+
+  // Nothing's actually returned, it doesn't matter what the callee put there
+  // it's a valid tail call.
+  if (RetEmpty)
+    return true;
+
+  // Iterate pairwise through each of the value types making up the tail call
+  // and the corresponding return. For each one we want to know whether it's
+  // essentially going directly from the tail call to the ret, via operations
+  // that end up not generating any code.
+  //
+  // We allow a certain amount of covariance here. For example it's permitted
+  // for the tail call to define more bits than the ret actually cares about
+  // (e.g. via a truncate).
+  do {
+    if (CallEmpty) {
+      // We've exhausted the values produced by the tail call instruction, the
+      // rest are essentially undef. The type doesn't really matter, but we need
+      // *something*.
+      Type *SlotType = RetSubTypes.back()->getTypeAtIndex(RetPath.back());
+      CallVal = UndefValue::get(SlotType);
+    }
+
+    // The manipulations performed when we're looking through an insertvalue or
+    // an extractvalue would happen at the front of the RetPath list, so since
+    // we have to copy it anyway it's more efficient to create a reversed copy.
+    using std::copy;
+    SmallVector<unsigned, 4> TmpRetPath, TmpCallPath;
+    copy(RetPath.rbegin(), RetPath.rend(), std::back_inserter(TmpRetPath));
+    copy(CallPath.rbegin(), CallPath.rend(), std::back_inserter(TmpCallPath));
+
+    // Finally, we can check whether the value produced by the tail call at this
+    // index is compatible with the value we return.
+    if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath,
+                              AllowDifferingSizes, TLI))
+      return false;
+
+    CallEmpty  = !nextRealType(CallSubTypes, CallPath);
+  } while(nextRealType(RetSubTypes, RetPath));
+
+  return true;
 }
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 188047d..5d82dd9 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -47,13 +47,18 @@ ARMException::ARMException(AsmPrinter *A)
 
 ARMException::~ARMException() {}
 
+ARMTargetStreamer &ARMException::getTargetStreamer() {
+  MCTargetStreamer &TS = Asm->OutStreamer.getTargetStreamer();
+  return static_cast<ARMTargetStreamer &>(TS);
+}
+
 void ARMException::EndModule() {
 }
 
 /// BeginFunction - Gather pre-function exception information. Assumes it's
 /// being emitted immediately after the function entry point.
 void ARMException::BeginFunction(const MachineFunction *MF) {
-  Asm->OutStreamer.EmitFnStart();
+  getTargetStreamer().emitFnStart();
   if (Asm->MF->getFunction()->needsUnwindTableEntry())
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin",
                                                   Asm->getFunctionNumber()));
@@ -62,8 +67,9 @@ void ARMException::BeginFunction(const MachineFunction *MF) {
 /// EndFunction - Gather and emit post-function exception information.
 ///
 void ARMException::EndFunction() {
+  ARMTargetStreamer &ATS = getTargetStreamer();
   if (!Asm->MF->getFunction()->needsUnwindTableEntry())
-    Asm->OutStreamer.EmitCantUnwind();
+    ATS.emitCantUnwind();
   else {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_end",
                                                   Asm->getFunctionNumber()));
@@ -76,13 +82,13 @@ void ARMException::EndFunction() {
         // Emit references to personality.
         if (const Function * Personality =
             MMI->getPersonalities()[MMI->getPersonalityIndex()]) {
-          MCSymbol *PerSym = Asm->Mang->getSymbol(Personality);
+          MCSymbol *PerSym = Asm->getSymbol(Personality);
           Asm->OutStreamer.EmitSymbolAttribute(PerSym, MCSA_Global);
-          Asm->OutStreamer.EmitPersonality(PerSym);
+          ATS.emitPersonality(PerSym);
         }
 
         // Emit .handlerdata directive.
-        Asm->OutStreamer.EmitHandlerData();
+        ATS.emitHandlerData();
 
         // Emit actual exception table
         EmitExceptionTable();
@@ -90,7 +96,7 @@ void ARMException::EndFunction() {
     }
   }
 
-  Asm->OutStreamer.EmitFnEnd();
+  ATS.emitFnEnd();
 }
 
 void ARMException::EmitTypeInfos(unsigned TTypeEncoding) {
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 84162ac..308b0e0 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -42,16 +42,18 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
 using namespace llvm;
 
-static const char *DWARFGroupName = "DWARF Emission";
-static const char *DbgTimerName = "DWARF Debug Writer";
-static const char *EHTimerName = "DWARF Exception Writer";
+static const char *const DWARFGroupName = "DWARF Emission";
+static const char *const DbgTimerName = "DWARF Debug Writer";
+static const char *const EHTimerName = "DWARF Exception Writer";
 
 STATISTIC(EmittedInsts, "Number of machine instrs printed");
 
@@ -93,11 +95,11 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &TD,
 
 AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
   : MachineFunctionPass(ID),
-    TM(tm), MAI(tm.getMCAsmInfo()),
+    TM(tm), MAI(tm.getMCAsmInfo()), MII(tm.getInstrInfo()),
     OutContext(Streamer.getContext()),
     OutStreamer(Streamer),
     LastMI(0), LastFn(0), Counter(~0U), SetCounter(0) {
-  DD = 0; DE = 0; MMI = 0; LI = 0;
+  DD = 0; DE = 0; MMI = 0; LI = 0; MF = 0;
   CurrentFnSym = CurrentFnSymForSize = 0;
   GCMetadataPrinters = 0;
   VerboseAsm = Streamer.isVerboseAsm();
@@ -154,8 +156,6 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 bool AsmPrinter::doInitialization(Module &M) {
-  OutStreamer.InitStreamer();
-
   MMI = getAnalysisIfAvailable<MachineModuleInfo>();
   MMI->AnalyzeModule(M);
 
@@ -163,7 +163,9 @@ bool AsmPrinter::doInitialization(Module &M) {
   const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
     .Initialize(OutContext, TM);
 
-  Mang = new Mangler(OutContext, *TM.getDataLayout());
+  OutStreamer.InitStreamer();
+
+  Mang = new Mangler(&TM);
 
   // Allow the target to emit any magic that it wants at the start of the file.
   EmitStartOfAsmFile(M);
@@ -211,12 +213,12 @@ bool AsmPrinter::doInitialization(Module &M) {
   llvm_unreachable("Unknown exception type.");
 }
 
-void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
-  switch ((GlobalValue::LinkageTypes)Linkage) {
+void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
+  GlobalValue::LinkageTypes Linkage = GV->getLinkage();
+  switch (Linkage) {
   case GlobalValue::CommonLinkage:
   case GlobalValue::LinkOnceAnyLinkage:
   case GlobalValue::LinkOnceODRLinkage:
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
   case GlobalValue::WeakAnyLinkage:
   case GlobalValue::WeakODRLinkage:
   case GlobalValue::LinkerPrivateWeakLinkage:
@@ -224,8 +226,19 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
       // .globl _foo
       OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
 
-      if ((GlobalValue::LinkageTypes)Linkage !=
-          GlobalValue::LinkOnceODRAutoHideLinkage)
+      bool CanBeHidden = false;
+
+      if (Linkage == GlobalValue::LinkOnceODRLinkage) {
+        if (GV->hasUnnamedAddr()) {
+          CanBeHidden = true;
+        } else {
+          GlobalStatus GS;
+          if (!GlobalStatus::analyzeGlobal(GV, GS) && !GS.IsCompared)
+            CanBeHidden = true;
+        }
+      }
+
+      if (!CanBeHidden)
         // .weak_definition _foo
         OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition);
       else
@@ -238,7 +251,7 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
       // .weak _foo
       OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Weak);
     }
-    break;
+    return;
   case GlobalValue::DLLExportLinkage:
   case GlobalValue::AppendingLinkage:
     // FIXME: appending linkage variables should go into a section of
@@ -247,16 +260,23 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
     // If external or appending, declare as a global symbol.
     // .globl _foo
     OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
-    break;
+    return;
   case GlobalValue::PrivateLinkage:
   case GlobalValue::InternalLinkage:
   case GlobalValue::LinkerPrivateLinkage:
-    break;
-  default:
-    llvm_unreachable("Unknown linkage type!");
+    return;
+  case GlobalValue::AvailableExternallyLinkage:
+    llvm_unreachable("Should never emit this");
+  case GlobalValue::DLLImportLinkage:
+  case GlobalValue::ExternalWeakLinkage:
+    llvm_unreachable("Don't know how to emit these");
   }
+  llvm_unreachable("Unknown linkage type!");
 }
 
+MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
+  return getObjFileLowering().getSymbol(*Mang, GV);
+}
 
 /// EmitGlobalVariable - Emit the specified global variable to the .s file.
 void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
@@ -272,7 +292,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     }
   }
 
-  MCSymbol *GVSym = Mang->getSymbol(GV);
+  MCSymbol *GVSym = getSymbol(GV);
   EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
 
   if (!GV->hasInitializer())   // External globals require no extra code.
@@ -283,13 +303,16 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
 
-  const DataLayout *TD = TM.getDataLayout();
-  uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+  const DataLayout *DL = TM.getDataLayout();
+  uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType());
 
   // If the alignment is specified, we *must* obey it.  Overaligning a global
   // with a specified alignment is a prompt way to break globals emitted to
   // sections and expected to be contiguous (e.g. ObjC metadata).
-  unsigned AlignLog = getGVAlignmentLog2(GV, *TD);
+  unsigned AlignLog = getGVAlignmentLog2(GV, *DL);
+
+  if (DD)
+    DD->setSymbolSize(GVSym, Size);
 
   // Handle common and BSS local symbols (.lcomm).
   if (GVKind.isCommon() || GVKind.isBSSLocal()) {
@@ -367,9 +390,10 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
     MCSymbol *MangSym =
       OutContext.GetOrCreateSymbol(GVSym->getName() + Twine("$tlv$init"));
 
-    if (GVKind.isThreadBSS())
+    if (GVKind.isThreadBSS()) {
+      TheSection = getObjFileLowering().getTLSBSSSection();
       OutStreamer.EmitTBSSSymbol(TheSection, MangSym, Size, 1 << AlignLog);
-    else if (GVKind.isThreadData()) {
+    } else if (GVKind.isThreadData()) {
       OutStreamer.SwitchSection(TheSection);
 
       EmitAlignment(AlignLog, GV);
@@ -386,16 +410,16 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
     OutStreamer.SwitchSection(TLVSect);
     // Emit the linkage here.
-    EmitLinkage(GV->getLinkage(), GVSym);
+    EmitLinkage(GV, GVSym);
     OutStreamer.EmitLabel(GVSym);
 
     // Three pointers in size:
     //   - __tlv_bootstrap - used to make sure support exists
     //   - spare pointer, used when mapped by the runtime
     //   - pointer to mangled symbol above with initializer
-    unsigned PtrSize = TD->getPointerSizeInBits()/8;
+    unsigned PtrSize = DL->getPointerTypeSize(GV->getType());
     OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
-				PtrSize);
+                                PtrSize);
     OutStreamer.EmitIntValue(0, PtrSize);
     OutStreamer.EmitSymbolValue(MangSym, PtrSize);
 
@@ -405,7 +429,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   OutStreamer.SwitchSection(TheSection);
 
-  EmitLinkage(GV->getLinkage(), GVSym);
+  EmitLinkage(GV, GVSym);
   EmitAlignment(AlignLog, GV);
 
   OutStreamer.EmitLabel(GVSym);
@@ -431,7 +455,7 @@ void AsmPrinter::EmitFunctionHeader() {
   OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
   EmitVisibility(CurrentFnSym, F->getVisibility());
 
-  EmitLinkage(F->getLinkage(), CurrentFnSym);
+  EmitLinkage(F, CurrentFnSym);
   EmitAlignment(MF->getAlignment(), F);
 
   if (MAI->hasDotTypeDotSizeDirective())
@@ -457,16 +481,6 @@ void AsmPrinter::EmitFunctionHeader() {
     OutStreamer.EmitLabel(DeadBlockSyms[i]);
   }
 
-  // Add some workaround for linkonce linkage on Cygwin\MinGW.
-  if (MAI->getLinkOnceDirective() != 0 &&
-      (F->hasLinkOnceLinkage() || F->hasWeakLinkage())) {
-    // FIXME: What is this?
-    MCSymbol *FakeStub =
-      OutContext.GetOrCreateSymbol(Twine("Lllvm$workaround$fake$stub$")+
-                                   CurrentFnSym->getName());
-    OutStreamer.EmitLabel(FakeStub);
-  }
-
   // Emit pre-function debug and/or EH information.
   if (DE) {
     NamedRegionTimer T(EHTimerName, DWARFGroupName, TimePassesIsEnabled);
@@ -476,6 +490,10 @@ void AsmPrinter::EmitFunctionHeader() {
     NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
     DD->beginFunction(MF);
   }
+
+  // Emit the prefix data.
+  if (F->hasPrefixData())
+    EmitGlobalConstant(F->getPrefixData());
 }
 
 /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
@@ -528,11 +546,11 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
 
 /// emitImplicitDef - This method emits the specified machine instruction
 /// that is an implicit def.
-static void emitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
+void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  AP.OutStreamer.AddComment(Twine("implicit-def: ") +
-                            AP.TM.getRegisterInfo()->getName(RegNo));
-  AP.OutStreamer.AddBlankLine();
+  OutStreamer.AddComment(Twine("implicit-def: ") +
+                         TM.getRegisterInfo()->getName(RegNo));
+  OutStreamer.AddBlankLine();
 }
 
 static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
@@ -562,10 +580,17 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
 
   // cast away const; DIetc do not take const operands for some reason.
   DIVariable V(const_cast<MDNode*>(MI->getOperand(2).getMetadata()));
-  if (V.getContext().isSubprogram())
-    OS << DISubprogram(V.getContext()).getDisplayName() << ":";
+  if (V.getContext().isSubprogram()) {
+    StringRef Name = DISubprogram(V.getContext()).getDisplayName();
+    if (!Name.empty())
+      OS << Name << ":";
+  }
   OS << V.getName() << " <- ";
 
+  // The second operand is only an offset if it's an immediate.
+  bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
+  int64_t Offset = Deref ? MI->getOperand(1).getImm() : 0;
+
   // Register or immediate value. Register 0 means undef.
   if (MI->getOperand(0).isFPImm()) {
     APFloat APF = APFloat(MI->getOperand(0).getFPImm()->getValueAPF());
@@ -586,18 +611,31 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
   } else if (MI->getOperand(0).isCImm()) {
     MI->getOperand(0).getCImm()->getValue().print(OS, false /*isSigned*/);
   } else {
-    assert(MI->getOperand(0).isReg() && "Unknown operand type");
-    if (MI->getOperand(0).getReg() == 0) {
+    unsigned Reg;
+    if (MI->getOperand(0).isReg()) {
+      Reg = MI->getOperand(0).getReg();
+    } else {
+      assert(MI->getOperand(0).isFI() && "Unknown operand type");
+      const TargetFrameLowering *TFI = AP.TM.getFrameLowering();
+      Offset += TFI->getFrameIndexReference(*AP.MF,
+                                            MI->getOperand(0).getIndex(), Reg);
+      Deref = true;
+    }
+    if (Reg == 0) {
       // Suppress offset, it is not meaningful here.
       OS << "undef";
       // NOTE: Want this comment at start of line, don't emit with AddComment.
       AP.OutStreamer.EmitRawText(OS.str());
       return true;
     }
-    OS << AP.TM.getRegisterInfo()->getName(MI->getOperand(0).getReg());
+    if (Deref)
+      OS << '[';
+    OS << AP.TM.getRegisterInfo()->getName(Reg);
   }
 
-  OS << '+' << MI->getOperand(1).getImm();
+  if (Deref)
+    OS << '+' << Offset << ']';
+
   // NOTE: Want this comment at start of line, don't emit with AddComment.
   AP.OutStreamer.EmitRawText(OS.str());
   return true;
@@ -624,7 +662,7 @@ bool AsmPrinter::needsRelocationsForDwarfStringPool() const {
 }
 
 void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
-  MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+  const MCSymbol *Label = MI.getOperand(0).getMCSymbol();
 
   if (MAI->getExceptionHandlingType() != ExceptionHandling::DwarfCFI)
     return;
@@ -635,14 +673,14 @@ void AsmPrinter::emitPrologLabel(const MachineInstr &MI) {
   if (MMI->getCompactUnwindEncoding() != 0)
     OutStreamer.EmitCompactUnwindEncoding(MMI->getCompactUnwindEncoding());
 
-  MachineModuleInfo &MMI = MF->getMMI();
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  const MachineModuleInfo &MMI = MF->getMMI();
+  const std::vector<MCCFIInstruction> &Instrs = MMI.getFrameInstructions();
   bool FoundOne = false;
   (void)FoundOne;
-  for (std::vector<MachineMove>::iterator I = Moves.begin(),
-         E = Moves.end(); I != E; ++I) {
+  for (std::vector<MCCFIInstruction>::const_iterator I = Instrs.begin(),
+         E = Instrs.end(); I != E; ++I) {
     if (I->getLabel() == Label) {
-      EmitCFIFrameMove(*I);
+      emitCFIInstruction(*I);
       FoundOne = true;
     }
   }
@@ -702,7 +740,7 @@ void AsmPrinter::EmitFunctionBody() {
         }
         break;
       case TargetOpcode::IMPLICIT_DEF:
-        if (isVerbose()) emitImplicitDef(II, *this);
+        if (isVerbose()) emitImplicitDef(II);
         break;
       case TargetOpcode::KILL:
         if (isVerbose()) emitKill(II, *this);
@@ -790,16 +828,9 @@ void AsmPrinter::EmitFunctionBody() {
   OutStreamer.AddBlankLine();
 }
 
-/// getDebugValueLocation - Get location information encoded by DBG_VALUE
-/// operands.
-MachineLocation AsmPrinter::
-getDebugValueLocation(const MachineInstr *MI) const {
-  // Target specific DBG_VALUE instructions are handled by each target.
-  return MachineLocation();
-}
-
 /// EmitDwarfRegOp - Emit dwarf register operation.
-void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc,
+                                bool Indirect) const {
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
   int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
 
@@ -817,7 +848,7 @@ void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
   // caller might be in the middle of an dwarf expression. We should
   // probably assert that Reg >= 0 once debug info generation is more mature.
 
-  if (MLoc.isIndirect()) {
+  if (MLoc.isIndirect() || Indirect) {
     if (Reg < 32) {
       OutStreamer.AddComment(
         dwarf::OperationEncodingString(dwarf::DW_OP_breg0 + Reg));
@@ -828,7 +859,9 @@ void AsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
       OutStreamer.AddComment(Twine(Reg));
       EmitULEB128(Reg);
     }
-    EmitSLEB128(MLoc.getOffset());
+    EmitSLEB128(!MLoc.isIndirect() ? 0 : MLoc.getOffset());
+    if (MLoc.isIndirect() && Indirect)
+      EmitInt8(dwarf::DW_OP_deref);
   } else {
     if (Reg < 32) {
       OutStreamer.AddComment(
@@ -860,7 +893,7 @@ bool AsmPrinter::doFinalization(Module &M) {
     if (V == GlobalValue::DefaultVisibility)
       continue;
 
-    MCSymbol *Name = Mang->getSymbol(&F);
+    MCSymbol *Name = getSymbol(&F);
     EmitVisibility(Name, V, false);
   }
 
@@ -870,6 +903,9 @@ bool AsmPrinter::doFinalization(Module &M) {
   if (!ModuleFlags.empty())
     getObjFileLowering().emitModuleFlags(OutStreamer, ModuleFlags, Mang, TM);
 
+  // Make sure we wrote out everything we need.
+  OutStreamer.Flush();
+
   // Finalize debug and EH information.
   if (DE) {
     {
@@ -897,12 +933,12 @@ bool AsmPrinter::doFinalization(Module &M) {
     for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
          I != E; ++I) {
       if (!I->hasExternalWeakLinkage()) continue;
-      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+      OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference);
     }
 
     for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I) {
       if (!I->hasExternalWeakLinkage()) continue;
-      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(I), MCSA_WeakReference);
+      OutStreamer.EmitSymbolAttribute(getSymbol(I), MCSA_WeakReference);
     }
   }
 
@@ -910,14 +946,19 @@ bool AsmPrinter::doFinalization(Module &M) {
     OutStreamer.AddBlankLine();
     for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
          I != E; ++I) {
-      MCSymbol *Name = Mang->getSymbol(I);
+      MCSymbol *Name = getSymbol(I);
 
       const GlobalValue *GV = I->getAliasedGlobal();
-      MCSymbol *Target = Mang->getSymbol(GV);
+      if (GV->isDeclaration()) {
+        report_fatal_error(Name->getName() +
+                           ": Target doesn't support aliases to declarations");
+      }
+
+      MCSymbol *Target = getSymbol(GV);
 
       if (I->hasExternalLinkage() || !MAI->getWeakRefDirective())
         OutStreamer.EmitSymbolAttribute(Name, MCSA_Global);
-      else if (I->hasWeakLinkage())
+      else if (I->hasWeakLinkage() || I->hasLinkOnceLinkage())
         OutStreamer.EmitSymbolAttribute(Name, MCSA_WeakReference);
       else
         assert(I->hasLocalLinkage() && "Invalid alias linkage");
@@ -936,6 +977,9 @@ bool AsmPrinter::doFinalization(Module &M) {
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*--I))
       MP->finishAssembly(*this);
 
+  // Emit llvm.ident metadata in an '.ident' directive.
+  EmitModuleIdents(M);
+
   // If we don't have any trampolines, then we don't require stack memory
   // to be executable. Some targets have a directive to declare this.
   Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
@@ -959,7 +1003,7 @@ bool AsmPrinter::doFinalization(Module &M) {
 void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
   this->MF = &MF;
   // Get the function symbol.
-  CurrentFnSym = Mang->getSymbol(MF.getFunction());
+  CurrentFnSym = getSymbol(MF.getFunction());
   CurrentFnSymForSize = CurrentFnSym;
 
   if (isVerbose())
@@ -1266,16 +1310,10 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
     const GlobalValue *GV =
       dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
     if (GV && getObjFileLowering().shouldEmitUsedDirectiveFor(GV, Mang))
-      OutStreamer.EmitSymbolAttribute(Mang->getSymbol(GV), MCSA_NoDeadStrip);
+      OutStreamer.EmitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
   }
 }
 
-typedef std::pair<unsigned, Constant*> Structor;
-
-static bool priority_order(const Structor& lhs, const Structor& rhs) {
-  return lhs.first < rhs.first;
-}
-
 /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
 /// priority.
 void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
@@ -1292,6 +1330,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
       !isa<PointerType>(ETy->getTypeAtIndex(1U))) return; // Not (int, ptr).
 
   // Gather the structors in a form that's convenient for sorting by priority.
+  typedef std::pair<unsigned, Constant *> Structor;
   SmallVector<Structor, 8> Structors;
   for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
     ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i));
@@ -1305,9 +1344,9 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
   }
 
   // Emit the function pointers in the target-specific order
-  const DataLayout *TD = TM.getDataLayout();
-  unsigned Align = Log2_32(TD->getPointerPrefAlignment());
-  std::stable_sort(Structors.begin(), Structors.end(), priority_order);
+  const DataLayout *DL = TM.getDataLayout();
+  unsigned Align = Log2_32(DL->getPointerPrefAlignment());
+  std::stable_sort(Structors.begin(), Structors.end(), less_first());
   for (unsigned i = 0, e = Structors.size(); i != e; ++i) {
     const MCSection *OutputSection =
       (isCtor ?
@@ -1320,6 +1359,21 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
   }
 }
 
+void AsmPrinter::EmitModuleIdents(Module &M) {
+  if (!MAI->hasIdentDirective())
+    return;
+
+  if (const NamedMDNode *NMD = M.getNamedMetadata("llvm.ident")) {
+    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+      const MDNode *N = NMD->getOperand(i);
+      assert(N->getNumOperands() == 1 && 
+             "llvm.ident metadata entry can have only one operand");
+      const MDString *S = cast<MDString>(N->getOperand(0));
+      OutStreamer.EmitIdent(S->getString());
+    }
+  }
+}
+
 //===--------------------------------------------------------------------===//
 // Emission and print routines
 //
@@ -1385,12 +1439,12 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
                             OutContext);
 
   if (!MAI->hasSetDirective())
-    OutStreamer.EmitValue(Diff, 4);
+    OutStreamer.EmitValue(Diff, Size);
   else {
     // Otherwise, emit with .set (aka assignment).
     MCSymbol *SetLabel = GetTempSymbol("set", SetCounter++);
     OutStreamer.EmitAssignment(SetLabel, Diff);
-    OutStreamer.EmitSymbolValue(SetLabel, 4);
+    OutStreamer.EmitSymbolValue(SetLabel, Size);
   }
 }
 
@@ -1398,8 +1452,12 @@ void AsmPrinter::EmitLabelOffsetDifference(const MCSymbol *Hi, uint64_t Offset,
 /// where the size in bytes of the directive is specified by Size and Label
 /// specifies the label.  This implicitly uses .set if it is available.
 void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
-                                      unsigned Size)
+                                      unsigned Size, bool IsSectionRelative)
   const {
+  if (MAI->needsDwarfSectionOffsetDirective() && IsSectionRelative) {
+    OutStreamer.EmitCOFFSecRel32(Label);
+    return;
+  }
 
   // Emit Label+Offset (or just Label if Offset is zero)
   const MCExpr *Expr = MCSymbolRefExpr::Create(Label, OutContext);
@@ -1447,7 +1505,7 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
     return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
@@ -1477,10 +1535,10 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
       report_fatal_error(OS.str());
     }
   case Instruction::GetElementPtr: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getDataLayout();
     // Generate a symbolic expression for the byte address
-    APInt OffsetAI(TD.getPointerSizeInBits(), 0);
-    cast<GEPOperator>(CE)->accumulateConstantOffset(TD, OffsetAI);
+    APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
+    cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
 
     const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
     if (!OffsetAI)
@@ -1501,17 +1559,17 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
     return lowerConstant(CE->getOperand(0), AP);
 
   case Instruction::IntToPtr: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getDataLayout();
     // Handle casts to pointers by changing them into casts to the appropriate
     // integer type.  This promotes constant folding and simplifies this code.
     Constant *Op = CE->getOperand(0);
-    Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
+    Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
                                       false/*ZExt*/);
     return lowerConstant(Op, AP);
   }
 
   case Instruction::PtrToInt: {
-    const DataLayout &TD = *AP.TM.getDataLayout();
+    const DataLayout &DL = *AP.TM.getDataLayout();
     // Support only foldable casts to/from pointers that can be eliminated by
     // changing the pointer to the appropriately sized integer type.
     Constant *Op = CE->getOperand(0);
@@ -1521,13 +1579,13 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
 
     // We can emit the pointer value into this slot if the slot is an
     // integer slot equal to the size of the pointer.
-    if (TD.getTypeAllocSize(Ty) == TD.getTypeAllocSize(Op->getType()))
+    if (DL.getTypeAllocSize(Ty) == DL.getTypeAllocSize(Op->getType()))
       return OpExpr;
 
     // Otherwise the pointer is smaller than the resultant integer, mask off
     // the high bits so we are sure to get a proper truncation if the input is
     // a constant expr.
-    unsigned InBits = TD.getTypeAllocSizeInBits(Op->getType());
+    unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
     const MCExpr *MaskExpr = MCConstantExpr::Create(~0ULL >> (64-InBits), Ctx);
     return MCBinaryExpr::CreateAnd(OpExpr, MaskExpr, Ctx);
   }
@@ -1561,8 +1619,7 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
   }
 }
 
-static void emitGlobalConstantImpl(const Constant *C, unsigned AddrSpace,
-                                   AsmPrinter &AP);
+static void emitGlobalConstantImpl(const Constant *C, AsmPrinter &AP);
 
 /// isRepeatedByteSequence - Determine whether the given value is
 /// composed of a repeated sequence of identical bytes and return the
@@ -1624,7 +1681,7 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) {
 }
 
 static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
-                                             unsigned AddrSpace,AsmPrinter &AP){
+                                             AsmPrinter &AP){
 
   // See if we can aggregate this into a .fill, if so, emit it as such.
   int Value = isRepeatedByteSequence(CDS, AP.TM);
@@ -1632,12 +1689,12 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
     uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CDS->getType());
     // Don't emit a 1-byte object as a .fill.
     if (Bytes > 1)
-      return AP.OutStreamer.EmitFill(Bytes, Value, AddrSpace);
+      return AP.OutStreamer.EmitFill(Bytes, Value);
   }
 
   // If this can be emitted with .ascii/.asciz, emit it as such.
   if (CDS->isString())
-    return AP.OutStreamer.EmitBytes(CDS->getAsString(), AddrSpace);
+    return AP.OutStreamer.EmitBytes(CDS->getAsString());
 
   // Otherwise, emit the values in successive locations.
   unsigned ElementByteSize = CDS->getElementByteSize();
@@ -1647,7 +1704,7 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
         AP.OutStreamer.GetCommentOS() << format("0x%" PRIx64 "\n",
                                                 CDS->getElementAsInteger(i));
       AP.OutStreamer.EmitIntValue(CDS->getElementAsInteger(i),
-                                  ElementByteSize, AddrSpace);
+                                  ElementByteSize);
     }
   } else if (ElementByteSize == 4) {
     // FP Constants are printed as integer constants to avoid losing
@@ -1662,7 +1719,7 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
       F = CDS->getElementAsFloat(i);
       if (AP.isVerbose())
         AP.OutStreamer.GetCommentOS() << "float " << F << '\n';
-      AP.OutStreamer.EmitIntValue(I, 4, AddrSpace);
+      AP.OutStreamer.EmitIntValue(I, 4);
     }
   } else {
     assert(CDS->getElementType()->isDoubleTy());
@@ -1675,78 +1732,74 @@ static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
       F = CDS->getElementAsDouble(i);
       if (AP.isVerbose())
         AP.OutStreamer.GetCommentOS() << "double " << F << '\n';
-      AP.OutStreamer.EmitIntValue(I, 8, AddrSpace);
+      AP.OutStreamer.EmitIntValue(I, 8);
     }
   }
 
-  const DataLayout &TD = *AP.TM.getDataLayout();
-  unsigned Size = TD.getTypeAllocSize(CDS->getType());
-  unsigned EmittedSize = TD.getTypeAllocSize(CDS->getType()->getElementType()) *
+  const DataLayout &DL = *AP.TM.getDataLayout();
+  unsigned Size = DL.getTypeAllocSize(CDS->getType());
+  unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
                         CDS->getNumElements();
   if (unsigned Padding = Size - EmittedSize)
-    AP.OutStreamer.EmitZeros(Padding, AddrSpace);
+    AP.OutStreamer.EmitZeros(Padding);
 
 }
 
-static void emitGlobalConstantArray(const ConstantArray *CA, unsigned AddrSpace,
-                                    AsmPrinter &AP) {
+static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP) {
   // See if we can aggregate some values.  Make sure it can be
   // represented as a series of bytes of the constant value.
   int Value = isRepeatedByteSequence(CA, AP.TM);
 
   if (Value != -1) {
     uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CA->getType());
-    AP.OutStreamer.EmitFill(Bytes, Value, AddrSpace);
+    AP.OutStreamer.EmitFill(Bytes, Value);
   }
   else {
     for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
-      emitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
+      emitGlobalConstantImpl(CA->getOperand(i), AP);
   }
 }
 
-static void emitGlobalConstantVector(const ConstantVector *CV,
-                                     unsigned AddrSpace, AsmPrinter &AP) {
+static void emitGlobalConstantVector(const ConstantVector *CV, AsmPrinter &AP) {
   for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
-    emitGlobalConstantImpl(CV->getOperand(i), AddrSpace, AP);
+    emitGlobalConstantImpl(CV->getOperand(i), AP);
 
-  const DataLayout &TD = *AP.TM.getDataLayout();
-  unsigned Size = TD.getTypeAllocSize(CV->getType());
-  unsigned EmittedSize = TD.getTypeAllocSize(CV->getType()->getElementType()) *
+  const DataLayout &DL = *AP.TM.getDataLayout();
+  unsigned Size = DL.getTypeAllocSize(CV->getType());
+  unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
                          CV->getType()->getNumElements();
   if (unsigned Padding = Size - EmittedSize)
-    AP.OutStreamer.EmitZeros(Padding, AddrSpace);
+    AP.OutStreamer.EmitZeros(Padding);
 }
 
-static void emitGlobalConstantStruct(const ConstantStruct *CS,
-                                     unsigned AddrSpace, AsmPrinter &AP) {
+static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP) {
   // Print the fields in successive locations. Pad to align if needed!
-  const DataLayout *TD = AP.TM.getDataLayout();
-  unsigned Size = TD->getTypeAllocSize(CS->getType());
-  const StructLayout *Layout = TD->getStructLayout(CS->getType());
+  const DataLayout *DL = AP.TM.getDataLayout();
+  unsigned Size = DL->getTypeAllocSize(CS->getType());
+  const StructLayout *Layout = DL->getStructLayout(CS->getType());
   uint64_t SizeSoFar = 0;
   for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
     const Constant *Field = CS->getOperand(i);
 
     // Check if padding is needed and insert one or more 0s.
-    uint64_t FieldSize = TD->getTypeAllocSize(Field->getType());
+    uint64_t FieldSize = DL->getTypeAllocSize(Field->getType());
     uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1))
                         - Layout->getElementOffset(i)) - FieldSize;
     SizeSoFar += FieldSize + PadSize;
 
     // Now print the actual field value.
-    emitGlobalConstantImpl(Field, AddrSpace, AP);
+    emitGlobalConstantImpl(Field, AP);
 
     // Insert padding - this may include padding to increase the size of the
     // current field up to the ABI size (if the struct is not packed) as well
     // as padding to ensure that the next field starts at the right offset.
-    AP.OutStreamer.EmitZeros(PadSize, AddrSpace);
+    AP.OutStreamer.EmitZeros(PadSize);
   }
   assert(SizeSoFar == Layout->getSizeInBytes() &&
          "Layout of constant struct may be incorrect!");
 }
 
-static void emitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
-                                 AsmPrinter &AP) {
+static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
   APInt API = CFP->getValueAPF().bitcastToAPInt();
 
   // First print a comment with what we think the original floating-point value
@@ -1772,47 +1825,86 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
     int Chunk = API.getNumWords() - 1;
 
     if (TrailingBytes)
-      AP.OutStreamer.EmitIntValue(p[Chunk--], TrailingBytes, AddrSpace);
+      AP.OutStreamer.EmitIntValue(p[Chunk--], TrailingBytes);
 
     for (; Chunk >= 0; --Chunk)
-      AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t), AddrSpace);
+      AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t));
   } else {
     unsigned Chunk;
     for (Chunk = 0; Chunk < NumBytes / sizeof(uint64_t); ++Chunk)
-      AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t), AddrSpace);
+      AP.OutStreamer.EmitIntValue(p[Chunk], sizeof(uint64_t));
 
     if (TrailingBytes)
-      AP.OutStreamer.EmitIntValue(p[Chunk], TrailingBytes, AddrSpace);
+      AP.OutStreamer.EmitIntValue(p[Chunk], TrailingBytes);
   }
 
   // Emit the tail padding for the long double.
-  const DataLayout &TD = *AP.TM.getDataLayout();
-  AP.OutStreamer.EmitZeros(TD.getTypeAllocSize(CFP->getType()) -
-                           TD.getTypeStoreSize(CFP->getType()), AddrSpace);
+  const DataLayout &DL = *AP.TM.getDataLayout();
+  AP.OutStreamer.EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
+                           DL.getTypeStoreSize(CFP->getType()));
 }
 
-static void emitGlobalConstantLargeInt(const ConstantInt *CI,
-                                       unsigned AddrSpace, AsmPrinter &AP) {
-  const DataLayout *TD = AP.TM.getDataLayout();
+static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
+  const DataLayout *DL = AP.TM.getDataLayout();
   unsigned BitWidth = CI->getBitWidth();
-  assert((BitWidth & 63) == 0 && "only support multiples of 64-bits");
+
+  // Copy the value as we may massage the layout for constants whose bit width
+  // is not a multiple of 64-bits.
+  APInt Realigned(CI->getValue());
+  uint64_t ExtraBits = 0;
+  unsigned ExtraBitsSize = BitWidth & 63;
+
+  if (ExtraBitsSize) {
+    // The bit width of the data is not a multiple of 64-bits.
+    // The extra bits are expected to be at the end of the chunk of the memory.
+    // Little endian:
+    // * Nothing to be done, just record the extra bits to emit.
+    // Big endian:
+    // * Record the extra bits to emit.
+    // * Realign the raw data to emit the chunks of 64-bits.
+    if (DL->isBigEndian()) {
+      // Basically the structure of the raw data is a chunk of 64-bits cells:
+      //    0        1         BitWidth / 64
+      // [chunk1][chunk2] ... [chunkN].
+      // The most significant chunk is chunkN and it should be emitted first.
+      // However, due to the alignment issue chunkN contains useless bits.
+      // Realign the chunks so that they contain only useless information:
+      // ExtraBits     0       1       (BitWidth / 64) - 1
+      //       chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
+      ExtraBits = Realigned.getRawData()[0] &
+        (((uint64_t)-1) >> (64 - ExtraBitsSize));
+      Realigned = Realigned.lshr(ExtraBitsSize);
+    } else
+      ExtraBits = Realigned.getRawData()[BitWidth / 64];
+  }
 
   // We don't expect assemblers to support integer data directives
   // for more than 64 bits, so we emit the data in at most 64-bit
   // quantities at a time.
-  const uint64_t *RawData = CI->getValue().getRawData();
+  const uint64_t *RawData = Realigned.getRawData();
   for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
-    uint64_t Val = TD->isBigEndian() ? RawData[e - i - 1] : RawData[i];
-    AP.OutStreamer.EmitIntValue(Val, 8, AddrSpace);
+    uint64_t Val = DL->isBigEndian() ? RawData[e - i - 1] : RawData[i];
+    AP.OutStreamer.EmitIntValue(Val, 8);
+  }
+
+  if (ExtraBitsSize) {
+    // Emit the extra bits after the 64-bits chunks.
+
+    // Emit a directive that fills the expected size.
+    uint64_t Size = AP.TM.getDataLayout()->getTypeAllocSize(CI->getType());
+    Size -= (BitWidth / 64) * 8;
+    assert(Size && Size * 8 >= ExtraBitsSize &&
+           (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
+           == ExtraBits && "Directive too small for extra bits.");
+    AP.OutStreamer.EmitIntValue(ExtraBits, Size);
   }
 }
 
-static void emitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
-                                   AsmPrinter &AP) {
-  const DataLayout *TD = AP.TM.getDataLayout();
-  uint64_t Size = TD->getTypeAllocSize(CV->getType());
+static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
+  const DataLayout *DL = AP.TM.getDataLayout();
+  uint64_t Size = DL->getTypeAllocSize(CV->getType());
   if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
-    return AP.OutStreamer.EmitZeros(Size, AddrSpace);
+    return AP.OutStreamer.EmitZeros(Size);
 
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
     switch (Size) {
@@ -1823,64 +1915,64 @@ static void emitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
       if (AP.isVerbose())
         AP.OutStreamer.GetCommentOS() << format("0x%" PRIx64 "\n",
                                                 CI->getZExtValue());
-      AP.OutStreamer.EmitIntValue(CI->getZExtValue(), Size, AddrSpace);
+      AP.OutStreamer.EmitIntValue(CI->getZExtValue(), Size);
       return;
     default:
-      emitGlobalConstantLargeInt(CI, AddrSpace, AP);
+      emitGlobalConstantLargeInt(CI, AP);
       return;
     }
   }
 
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
-    return emitGlobalConstantFP(CFP, AddrSpace, AP);
+    return emitGlobalConstantFP(CFP, AP);
 
   if (isa<ConstantPointerNull>(CV)) {
-    AP.OutStreamer.EmitIntValue(0, Size, AddrSpace);
+    AP.OutStreamer.EmitIntValue(0, Size);
     return;
   }
 
   if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV))
-    return emitGlobalConstantDataSequential(CDS, AddrSpace, AP);
+    return emitGlobalConstantDataSequential(CDS, AP);
 
   if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
-    return emitGlobalConstantArray(CVA, AddrSpace, AP);
+    return emitGlobalConstantArray(CVA, AP);
 
   if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
-    return emitGlobalConstantStruct(CVS, AddrSpace, AP);
+    return emitGlobalConstantStruct(CVS, AP);
 
   if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
     // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
     // vectors).
     if (CE->getOpcode() == Instruction::BitCast)
-      return emitGlobalConstantImpl(CE->getOperand(0), AddrSpace, AP);
+      return emitGlobalConstantImpl(CE->getOperand(0), AP);
 
     if (Size > 8) {
       // If the constant expression's size is greater than 64-bits, then we have
       // to emit the value in chunks. Try to constant fold the value and emit it
       // that way.
-      Constant *New = ConstantFoldConstantExpression(CE, TD);
+      Constant *New = ConstantFoldConstantExpression(CE, DL);
       if (New && New != CE)
-        return emitGlobalConstantImpl(New, AddrSpace, AP);
+        return emitGlobalConstantImpl(New, AP);
     }
   }
 
   if (const ConstantVector *V = dyn_cast<ConstantVector>(CV))
-    return emitGlobalConstantVector(V, AddrSpace, AP);
+    return emitGlobalConstantVector(V, AP);
 
   // Otherwise, it must be a ConstantExpr.  Lower it to an MCExpr, then emit it
   // thread the streamer with EmitValue.
-  AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size, AddrSpace);
+  AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size);
 }
 
 /// EmitGlobalConstant - Print a general LLVM constant to the .s file.
-void AsmPrinter::EmitGlobalConstant(const Constant *CV, unsigned AddrSpace) {
+void AsmPrinter::EmitGlobalConstant(const Constant *CV) {
   uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
   if (Size)
-    emitGlobalConstantImpl(CV, AddrSpace, *this);
+    emitGlobalConstantImpl(CV, *this);
   else if (MAI->hasSubsectionsViaSymbols()) {
     // If the global has zero size, emit a single byte so that two labels don't
     // look like they are at the same location.
-    OutStreamer.EmitIntValue(0, 1, AddrSpace);
+    OutStreamer.EmitIntValue(0, 1);
   }
 }
 
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 31e42d4..b92f49c 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
 //===----------------------------------------------------------------------===//
 
 /// EmitSLEB128 - emit the specified signed leb128 value.
-void AsmPrinter::EmitSLEB128(int Value, const char *Desc) const {
+void AsmPrinter::EmitSLEB128(int64_t Value, const char *Desc) const {
   if (isVerbose() && Desc)
     OutStreamer.AddComment(Desc);
 
@@ -41,7 +41,7 @@ void AsmPrinter::EmitSLEB128(int Value, const char *Desc) const {
 }
 
 /// EmitULEB128 - emit the specified signed leb128 value.
-void AsmPrinter::EmitULEB128(unsigned Value, const char *Desc,
+void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc,
                              unsigned PadTo) const {
   if (isVerbose() && Desc)
     OutStreamer.AddComment(Desc);
@@ -169,28 +169,27 @@ void AsmPrinter::EmitSectionOffset(const MCSymbol *Label,
 // Dwarf Lowering Routines
 //===----------------------------------------------------------------------===//
 
-/// EmitCFIFrameMove - Emit a frame instruction.
-void AsmPrinter::EmitCFIFrameMove(const MachineMove &Move) const {
-  const TargetRegisterInfo *RI = TM.getRegisterInfo();
-
-  const MachineLocation &Dst = Move.getDestination();
-  const MachineLocation &Src = Move.getSource();
-
-  // If advancing cfa.
-  if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-    if (Src.getReg() == MachineLocation::VirtualFP) {
-      OutStreamer.EmitCFIDefCfaOffset(-Src.getOffset());
-    } else {
-      // Reg + Offset
-      OutStreamer.EmitCFIDefCfa(RI->getDwarfRegNum(Src.getReg(), true),
-                                Src.getOffset());
-    }
-  } else if (Src.isReg() && Src.getReg() == MachineLocation::VirtualFP) {
-    assert(Dst.isReg() && "Machine move not supported yet.");
-    OutStreamer.EmitCFIDefCfaRegister(RI->getDwarfRegNum(Dst.getReg(), true));
-  } else {
-    assert(!Dst.isReg() && "Machine move not supported yet.");
-    OutStreamer.EmitCFIOffset(RI->getDwarfRegNum(Src.getReg(), true),
-                              Dst.getOffset());
+void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
+  switch (Inst.getOperation()) {
+  default:
+    llvm_unreachable("Unexpected instruction");
+  case MCCFIInstruction::OpDefCfaOffset:
+    OutStreamer.EmitCFIDefCfaOffset(Inst.getOffset());
+    break;
+  case MCCFIInstruction::OpDefCfa:
+    OutStreamer.EmitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
+    break;
+  case MCCFIInstruction::OpDefCfaRegister:
+    OutStreamer.EmitCFIDefCfaRegister(Inst.getRegister());
+    break;
+  case MCCFIInstruction::OpOffset:
+    OutStreamer.EmitCFIOffset(Inst.getRegister(), Inst.getOffset());
+    break;
+  case MCCFIInstruction::OpRegister:
+    OutStreamer.EmitCFIRegister(Inst.getRegister(), Inst.getRegister2());
+    break;
+  case MCCFIInstruction::OpWindowSave:
+    OutStreamer.EmitCFIWindowSave();
+    break;
   }
 }
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index abfa330..4f927f6 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -123,7 +123,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
                                              TM.getTargetCPU(),
                                              TM.getTargetFeatureString()));
   OwningPtr<MCTargetAsmParser>
-    TAP(TM.getTarget().createMCAsmParser(*STI, *Parser));
+    TAP(TM.getTarget().createMCAsmParser(*STI, *Parser, *MII));
   if (!TAP)
     report_fatal_error("Inline asm not supported by this streamer because"
                        " we don't have an asm parser for this target\n");
@@ -213,7 +213,7 @@ static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
       } else {
         unsigned OpFlags = MI->getOperand(OpNo).getImm();
         ++OpNo;  // Skip over the ID number.
-        
+
         if (InlineAsm::isMemKind(OpFlags)) {
           Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
                                             /*Modifier*/ 0, OS);
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index 8d15c06..be484a6 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMAsmPrinter
   AsmPrinterDwarf.cpp
   AsmPrinterInlineAsm.cpp
   DIE.cpp
+  DIEHash.cpp
   DwarfAccelTable.cpp
   DwarfCFIException.cpp
   DwarfCompileUnit.cpp
@@ -13,3 +14,5 @@ add_llvm_library(LLVMAsmPrinter
   OcamlGCPrinter.cpp
   Win64Exception.cpp
   )
+
+add_dependencies(LLVMAsmPrinter intrinsics_gen)
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 673867a..6944428 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "DIE.h"
+#include "DwarfDebug.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/IR/DataLayout.h"
@@ -23,6 +24,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MD5.h"
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -32,8 +34,10 @@ using namespace llvm;
 /// Profile - Used to gather unique data for the abbreviation folding set.
 ///
 void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
-  ID.AddInteger(Attribute);
-  ID.AddInteger(Form);
+  // Explicitly cast to an integer type for which FoldingSetNodeID has
+  // overloads.  Otherwise MSVC 2010 thinks this call is ambiguous.
+  ID.AddInteger(unsigned(Attribute));
+  ID.AddInteger(unsigned(Form));
 }
 
 //===----------------------------------------------------------------------===//
@@ -43,7 +47,7 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
 /// Profile - Used to gather unique data for the abbreviation folding set.
 ///
 void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
-  ID.AddInteger(Tag);
+  ID.AddInteger(unsigned(Tag));
   ID.AddInteger(ChildrenFlag);
 
   // For each attribute description.
@@ -55,11 +59,9 @@ void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
 ///
 void DIEAbbrev::Emit(AsmPrinter *AP) const {
   // Emit its Dwarf tag type.
-  // FIXME: Doing work even in non-asm-verbose runs.
   AP->EmitULEB128(Tag, dwarf::TagString(Tag));
 
   // Emit whether it has children DIEs.
-  // FIXME: Doing work even in non-asm-verbose runs.
   AP->EmitULEB128(ChildrenFlag, dwarf::ChildrenString(ChildrenFlag));
 
   // For each attribute description.
@@ -67,12 +69,10 @@ void DIEAbbrev::Emit(AsmPrinter *AP) const {
     const DIEAbbrevData &AttrData = Data[i];
 
     // Emit attribute type.
-    // FIXME: Doing work even in non-asm-verbose runs.
     AP->EmitULEB128(AttrData.getAttribute(),
                     dwarf::AttributeString(AttrData.getAttribute()));
 
     // Emit form type.
-    // FIXME: Doing work even in non-asm-verbose runs.
     AP->EmitULEB128(AttrData.getForm(),
                     dwarf::FormEncodingString(AttrData.getForm()));
   }
@@ -114,14 +114,34 @@ DIE::~DIE() {
 
 /// Climb up the parent chain to get the compile unit DIE to which this DIE
 /// belongs.
-DIE *DIE::getCompileUnit() const {
-  DIE *p = getParent();
+const DIE *DIE::getCompileUnit() const {
+  const DIE *Cu = getCompileUnitOrNull();
+  assert(Cu && "We should not have orphaned DIEs.");
+  return Cu;
+}
+
+/// Climb up the parent chain to get the compile unit DIE this DIE belongs
+/// to. Return NULL if DIE is not added to an owner yet.
+const DIE *DIE::getCompileUnitOrNull() const {
+  const DIE *p = this;
   while (p) {
     if (p->getTag() == dwarf::DW_TAG_compile_unit)
       return p;
     p = p->getParent();
   }
-  llvm_unreachable("We should not have orphaned DIEs.");
+  return NULL;
+}
+
+DIEValue *DIE::findAttribute(uint16_t Attribute) {
+  const SmallVectorImpl<DIEValue *> &Values = getValues();
+  const DIEAbbrev &Abbrevs = getAbbrev();
+
+  // Iterate through all the attributes until we find the one we're
+  // looking for, if we can't find it return NULL.
+  for (size_t i = 0; i < Values.size(); ++i)
+    if (Abbrevs.getData()[i].getAttribute() == Attribute)
+      return Values[i];
+  return NULL;
 }
 
 #ifndef NDEBUG
@@ -178,7 +198,7 @@ void DIE::dump() {
 void DIEValue::anchor() { }
 
 #ifndef NDEBUG
-void DIEValue::dump() {
+void DIEValue::dump() const {
   print(dbgs());
 }
 #endif
@@ -189,14 +209,14 @@ void DIEValue::dump() {
 
 /// EmitValue - Emit integer of appropriate size.
 ///
-void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+void DIEInteger::EmitValue(AsmPrinter *Asm, dwarf::Form Form) const {
   unsigned Size = ~0U;
   switch (Form) {
   case dwarf::DW_FORM_flag_present:
     // Emit something to keep the lines and comments in sync.
     // FIXME: Is there a better way to do this?
     if (Asm->OutStreamer.hasRawTextSupport())
-      Asm->OutStreamer.EmitRawText(StringRef(""));
+      Asm->OutStreamer.EmitRawText("");
     return;
   case dwarf::DW_FORM_flag:  // Fall thru
   case dwarf::DW_FORM_ref1:  // Fall thru
@@ -221,7 +241,7 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
 
 /// SizeOf - Determine size of integer value in bytes.
 ///
-unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEInteger::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   switch (Form) {
   case dwarf::DW_FORM_flag_present: return 0;
   case dwarf::DW_FORM_flag:  // Fall thru
@@ -244,25 +264,54 @@ unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
 }
 
 #ifndef NDEBUG
-void DIEInteger::print(raw_ostream &O) {
+void DIEInteger::print(raw_ostream &O) const {
   O << "Int: " << (int64_t)Integer << "  0x";
   O.write_hex(Integer);
 }
 #endif
 
 //===----------------------------------------------------------------------===//
+// DIEExpr Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit expression value.
+///
+void DIEExpr::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
+  AP->OutStreamer.EmitValue(Expr, SizeOf(AP, Form));
+}
+
+/// SizeOf - Determine size of expression value in bytes.
+///
+unsigned DIEExpr::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
+  if (Form == dwarf::DW_FORM_data4) return 4;
+  if (Form == dwarf::DW_FORM_sec_offset) return 4;
+  if (Form == dwarf::DW_FORM_strp) return 4;
+  return AP->getDataLayout().getPointerSize();
+}
+
+#ifndef NDEBUG
+void DIEExpr::print(raw_ostream &O) const {
+  O << "Expr: ";
+  Expr->print(O);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
 // DIELabel Implementation
 //===----------------------------------------------------------------------===//
 
 /// EmitValue - Emit label value.
 ///
-void DIELabel::EmitValue(AsmPrinter *AP, unsigned Form) const {
-  AP->OutStreamer.EmitSymbolValue(Label, SizeOf(AP, Form));
+void DIELabel::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
+  AP->EmitLabelReference(Label, SizeOf(AP, Form),
+                         Form == dwarf::DW_FORM_strp ||
+                             Form == dwarf::DW_FORM_sec_offset ||
+                             Form == dwarf::DW_FORM_ref_addr);
 }
 
 /// SizeOf - Determine size of label value in bytes.
 ///
-unsigned DIELabel::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIELabel::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   if (Form == dwarf::DW_FORM_data4) return 4;
   if (Form == dwarf::DW_FORM_sec_offset) return 4;
   if (Form == dwarf::DW_FORM_strp) return 4;
@@ -270,7 +319,7 @@ unsigned DIELabel::SizeOf(AsmPrinter *AP, unsigned Form) const {
 }
 
 #ifndef NDEBUG
-void DIELabel::print(raw_ostream &O) {
+void DIELabel::print(raw_ostream &O) const {
   O << "Lbl: " << Label->getName();
 }
 #endif
@@ -281,36 +330,69 @@ void DIELabel::print(raw_ostream &O) {
 
 /// EmitValue - Emit delta value.
 ///
-void DIEDelta::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEDelta::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
   AP->EmitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
 }
 
 /// SizeOf - Determine size of delta value in bytes.
 ///
-unsigned DIEDelta::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEDelta::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   if (Form == dwarf::DW_FORM_data4) return 4;
   if (Form == dwarf::DW_FORM_strp) return 4;
   return AP->getDataLayout().getPointerSize();
 }
 
 #ifndef NDEBUG
-void DIEDelta::print(raw_ostream &O) {
+void DIEDelta::print(raw_ostream &O) const {
   O << "Del: " << LabelHi->getName() << "-" << LabelLo->getName();
 }
 #endif
 
 //===----------------------------------------------------------------------===//
+// DIEString Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit string value.
+///
+void DIEString::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
+  Access->EmitValue(AP, Form);
+}
+
+/// SizeOf - Determine size of delta value in bytes.
+///
+unsigned DIEString::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
+  return Access->SizeOf(AP, Form);
+}
+
+#ifndef NDEBUG
+void DIEString::print(raw_ostream &O) const {
+  O << "String: " << Str << "\tSymbol: ";
+  Access->print(O);
+}
+#endif
+
+//===----------------------------------------------------------------------===//
 // DIEEntry Implementation
 //===----------------------------------------------------------------------===//
 
 /// EmitValue - Emit debug information entry offset.
 ///
-void DIEEntry::EmitValue(AsmPrinter *AP, unsigned Form) const {
+void DIEEntry::EmitValue(AsmPrinter *AP, dwarf::Form Form) const {
   AP->EmitInt32(Entry->getOffset());
 }
 
+unsigned DIEEntry::getRefAddrSize(AsmPrinter *AP) {
+  // DWARF4: References that use the attribute form DW_FORM_ref_addr are
+  // specified to be four bytes in the DWARF 32-bit format and eight bytes
+  // in the DWARF 64-bit format, while DWARF Version 2 specifies that such
+  // references have the same size as an address on the target system.
+  if (AP->getDwarfDebug()->getDwarfVersion() == 2)
+    return AP->getDataLayout().getPointerSize();
+  return sizeof(int32_t);
+}
+
 #ifndef NDEBUG
-void DIEEntry::print(raw_ostream &O) {
+void DIEEntry::print(raw_ostream &O) const {
   O << format("Die: 0x%lx", (long)(intptr_t)Entry);
 }
 #endif
@@ -333,7 +415,7 @@ unsigned DIEBlock::ComputeSize(AsmPrinter *AP) {
 
 /// EmitValue - Emit block data.
 ///
-void DIEBlock::EmitValue(AsmPrinter *Asm, unsigned Form) const {
+void DIEBlock::EmitValue(AsmPrinter *Asm, dwarf::Form Form) const {
   switch (Form) {
   default: llvm_unreachable("Improper form for block");
   case dwarf::DW_FORM_block1: Asm->EmitInt8(Size);    break;
@@ -349,7 +431,7 @@ void DIEBlock::EmitValue(AsmPrinter *Asm, unsigned Form) const {
 
 /// SizeOf - Determine size of block data in bytes.
 ///
-unsigned DIEBlock::SizeOf(AsmPrinter *AP, unsigned Form) const {
+unsigned DIEBlock::SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
   switch (Form) {
   case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
   case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
@@ -360,7 +442,7 @@ unsigned DIEBlock::SizeOf(AsmPrinter *AP, unsigned Form) const {
 }
 
 #ifndef NDEBUG
-void DIEBlock::print(raw_ostream &O) {
+void DIEBlock::print(raw_ostream &O) const {
   O << "Blk: ";
   DIE::print(O, 5);
 }
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index 3c06001..f4fa326 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -18,30 +18,32 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/MC/MCExpr.h"
 #include <vector>
 
 namespace llvm {
   class AsmPrinter;
   class MCSymbol;
+  class MCSymbolRefExpr;
   class raw_ostream;
 
   //===--------------------------------------------------------------------===//
-  /// DIEAbbrevData - Dwarf abbreviation data, describes the one attribute of a
+  /// DIEAbbrevData - Dwarf abbreviation data, describes one attribute of a
   /// Dwarf abbreviation.
   class DIEAbbrevData {
     /// Attribute - Dwarf attribute code.
     ///
-    uint16_t Attribute;
+    dwarf::Attribute Attribute;
 
     /// Form - Dwarf form code.
     ///
-    uint16_t Form;
+    dwarf::Form Form;
   public:
-    DIEAbbrevData(uint16_t A, uint16_t F) : Attribute(A), Form(F) {}
+    DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) : Attribute(A), Form(F) {}
 
     // Accessors.
-    uint16_t getAttribute() const { return Attribute; }
-    uint16_t getForm()      const { return Form; }
+    dwarf::Attribute getAttribute() const { return Attribute; }
+    dwarf::Form getForm() const { return Form; }
 
     /// Profile - Used to gather unique data for the abbreviation folding set.
     ///
@@ -54,7 +56,7 @@ namespace llvm {
   class DIEAbbrev : public FoldingSetNode {
     /// Tag - Dwarf tag code.
     ///
-    uint16_t Tag;
+    dwarf::Tag Tag;
 
     /// ChildrenFlag - Dwarf children flag.
     ///
@@ -69,29 +71,22 @@ namespace llvm {
     SmallVector<DIEAbbrevData, 12> Data;
 
   public:
-    DIEAbbrev(uint16_t T, uint16_t C) : Tag(T), ChildrenFlag(C), Data() {}
+    DIEAbbrev(dwarf::Tag T, uint16_t C) : Tag(T), ChildrenFlag(C), Data() {}
 
     // Accessors.
-    uint16_t getTag() const { return Tag; }
+    dwarf::Tag getTag() const { return Tag; }
     unsigned getNumber() const { return Number; }
     uint16_t getChildrenFlag() const { return ChildrenFlag; }
     const SmallVectorImpl<DIEAbbrevData> &getData() const { return Data; }
-    void setTag(uint16_t T) { Tag = T; }
     void setChildrenFlag(uint16_t CF) { ChildrenFlag = CF; }
     void setNumber(unsigned N) { Number = N; }
 
     /// AddAttribute - Adds another set of attribute information to the
     /// abbreviation.
-    void AddAttribute(uint16_t Attribute, uint16_t Form) {
+    void AddAttribute(dwarf::Attribute Attribute, dwarf::Form Form) {
       Data.push_back(DIEAbbrevData(Attribute, Form));
     }
 
-    /// AddFirstAttribute - Adds a set of attribute information to the front
-    /// of the abbreviation.
-    void AddFirstAttribute(uint16_t Attribute, uint16_t Form) {
-      Data.insert(Data.begin(), DIEAbbrevData(Attribute, Form));
-    }
-
     /// Profile - Used to gather unique data for the abbreviation folding set.
     ///
     void Profile(FoldingSetNodeID &ID) const;
@@ -135,17 +130,17 @@ namespace llvm {
     ///
     SmallVector<DIEValue*, 12> Values;
 
-    // Private data for print()
-    mutable unsigned IndentCount;
   public:
     explicit DIE(unsigned Tag)
-      : Offset(0), Size(0), Abbrev(Tag, dwarf::DW_CHILDREN_no), Parent(0) {}
+        : Offset(0), Size(0), Abbrev((dwarf::Tag)Tag, dwarf::DW_CHILDREN_no),
+          Parent(0) {}
     virtual ~DIE();
 
     // Accessors.
     DIEAbbrev &getAbbrev() { return Abbrev; }
+    const DIEAbbrev &getAbbrev() const { return Abbrev; }
     unsigned getAbbrevNumber() const { return Abbrev.getNumber(); }
-    unsigned getTag() const { return Abbrev.getTag(); }
+    dwarf::Tag getTag() const { return Abbrev.getTag(); }
     unsigned getOffset() const { return Offset; }
     unsigned getSize() const { return Size; }
     const std::vector<DIE *> &getChildren() const { return Children; }
@@ -153,14 +148,17 @@ namespace llvm {
     DIE *getParent() const { return Parent; }
     /// Climb up the parent chain to get the compile unit DIE this DIE belongs
     /// to.
-    DIE *getCompileUnit() const;
-    void setTag(unsigned Tag) { Abbrev.setTag(Tag); }
+    const DIE *getCompileUnit() const;
+    /// Similar to getCompileUnit, returns null when DIE is not added to an
+    /// owner yet.
+    const DIE *getCompileUnitOrNull() const;
     void setOffset(unsigned O) { Offset = O; }
     void setSize(unsigned S) { Size = S; }
 
     /// addValue - Add a value and attributes to a DIE.
     ///
-    void addValue(unsigned Attribute, unsigned Form, DIEValue *Value) {
+    void addValue(dwarf::Attribute Attribute, dwarf::Form Form,
+                  DIEValue *Value) {
       Abbrev.AddAttribute(Attribute, Form);
       Values.push_back(Value);
     }
@@ -168,15 +166,16 @@ namespace llvm {
     /// addChild - Add a child to the DIE.
     ///
     void addChild(DIE *Child) {
-      if (Child->getParent()) {
-        assert (Child->getParent() == this && "Unexpected DIE Parent!");
-        return;
-      }
+      assert(!Child->getParent());
       Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
       Children.push_back(Child);
       Child->Parent = this;
     }
 
+    /// findAttribute - Find a value in the DIE with the attribute given, returns NULL
+    /// if no such attribute exists.
+    DIEValue *findAttribute(uint16_t Attribute);
+
 #ifndef NDEBUG
     void print(raw_ostream &O, unsigned IndentCount = 0) const;
     void dump();
@@ -192,6 +191,7 @@ namespace llvm {
     enum {
       isInteger,
       isString,
+      isExpr,
       isLabel,
       isDelta,
       isEntry,
@@ -210,15 +210,15 @@ namespace llvm {
 
     /// EmitValue - Emit value via the Dwarf writer.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const = 0;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const = 0;
 
     /// SizeOf - Return the size of a value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const = 0;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const = 0;
 
 #ifndef NDEBUG
-    virtual void print(raw_ostream &O) = 0;
-    void dump();
+    virtual void print(raw_ostream &O) const = 0;
+    void dump() const;
 #endif
   };
 
@@ -232,7 +232,7 @@ namespace llvm {
 
     /// BestForm - Choose the best form for integer.
     ///
-    static unsigned BestForm(bool IsSigned, uint64_t Int) {
+    static dwarf::Form BestForm(bool IsSigned, uint64_t Int) {
       if (IsSigned) {
         const int64_t SignedInt = Int;
         if ((char)Int == SignedInt)     return dwarf::DW_FORM_data1;
@@ -248,24 +248,52 @@ namespace llvm {
 
     /// EmitValue - Emit integer of appropriate size.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     uint64_t getValue() const { return Integer; }
 
     /// SizeOf - Determine size of integer value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
 
 #ifndef NDEBUG
-    virtual void print(raw_ostream &O);
+    virtual void print(raw_ostream &O) const;
 #endif
   };
 
   //===--------------------------------------------------------------------===//
-  /// DIELabel - A label expression DIE.
+  /// DIEExpr - An expression DIE.
+  //
+  class DIEExpr : public DIEValue {
+    const MCExpr *Expr;
+  public:
+    explicit DIEExpr(const MCExpr *E) : DIEValue(isExpr), Expr(E) {}
+
+    /// EmitValue - Emit expression value.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
+
+    /// getValue - Get MCExpr.
+    ///
+    const MCExpr *getValue() const { return Expr; }
+
+    /// SizeOf - Determine size of expression value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *E) { return E->getType() == isExpr; }
+
+#ifndef NDEBUG
+    virtual void print(raw_ostream &O) const;
+#endif
+  };
+
+  //===--------------------------------------------------------------------===//
+  /// DIELabel - A label DIE.
   //
   class DIELabel : public DIEValue {
     const MCSymbol *Label;
@@ -274,21 +302,21 @@ namespace llvm {
 
     /// EmitValue - Emit label value.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// getValue - Get MCSymbol.
     ///
-    const MCSymbol *getValue()       const { return Label; }
+    const MCSymbol *getValue() const { return Label; }
 
     /// SizeOf - Determine size of label value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
 
 #ifndef NDEBUG
-    virtual void print(raw_ostream &O);
+    virtual void print(raw_ostream &O) const;
 #endif
   };
 
@@ -304,46 +332,82 @@ namespace llvm {
 
     /// EmitValue - Emit delta value.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// SizeOf - Determine size of delta value in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
 
 #ifndef NDEBUG
-    virtual void print(raw_ostream &O);
+    virtual void print(raw_ostream &O) const;
 #endif
   };
 
   //===--------------------------------------------------------------------===//
+  /// DIEString - A container for string values.
+  ///
+  class DIEString : public DIEValue {
+    const DIEValue *Access;
+    const StringRef Str;
+
+  public:
+    DIEString(const DIEValue *Acc, const StringRef S)
+        : DIEValue(isString), Access(Acc), Str(S) {}
+
+    /// getString - Grab the string out of the object.
+    StringRef getString() const { return Str; }
+
+    /// EmitValue - Emit delta value.
+    ///
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
+
+    /// SizeOf - Determine size of delta value in bytes.
+    ///
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
+
+    // Implement isa/cast/dyncast.
+    static bool classof(const DIEValue *D) { return D->getType() == isString; }
+
+  #ifndef NDEBUG
+    virtual void print(raw_ostream &O) const;
+  #endif
+  };
+
+  //===--------------------------------------------------------------------===//
   /// DIEEntry - A pointer to another debug information entry.  An instance of
   /// this class can also be used as a proxy for a debug information entry not
   /// yet defined (ie. types.)
   class DIEEntry : public DIEValue {
     DIE *const Entry;
   public:
-    explicit DIEEntry(DIE *E) : DIEValue(isEntry), Entry(E) {}
+    explicit DIEEntry(DIE *E) : DIEValue(isEntry), Entry(E) {
+      assert(E && "Cannot construct a DIEEntry with a null DIE");
+    }
 
     DIE *getEntry() const { return Entry; }
 
     /// EmitValue - Emit debug information entry offset.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// SizeOf - Determine size of debug information entry in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const {
-      return sizeof(int32_t);
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const {
+      return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP)
+                                             : sizeof(int32_t);
     }
 
+    /// Returns size of a ref_addr entry.
+    static unsigned getRefAddrSize(AsmPrinter *AP);
+
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *E) { return E->getType() == isEntry; }
 
 #ifndef NDEBUG
-    virtual void print(raw_ostream &O);
+    virtual void print(raw_ostream &O) const;
 #endif
   };
 
@@ -353,9 +417,7 @@ namespace llvm {
   class DIEBlock : public DIEValue, public DIE {
     unsigned Size;                // Size in bytes excluding size header.
   public:
-    DIEBlock()
-      : DIEValue(isBlock), DIE(0), Size(0) {}
-    virtual ~DIEBlock() {}
+    DIEBlock() : DIEValue(isBlock), DIE(0), Size(0) {}
 
     /// ComputeSize - calculate the size of the block.
     ///
@@ -363,7 +425,7 @@ namespace llvm {
 
     /// BestForm - Choose the best form for data.
     ///
-    unsigned BestForm() const {
+    dwarf::Form BestForm() const {
       if ((unsigned char)Size == Size)  return dwarf::DW_FORM_block1;
       if ((unsigned short)Size == Size) return dwarf::DW_FORM_block2;
       if ((unsigned int)Size == Size)   return dwarf::DW_FORM_block4;
@@ -372,17 +434,17 @@ namespace llvm {
 
     /// EmitValue - Emit block data.
     ///
-    virtual void EmitValue(AsmPrinter *AP, unsigned Form) const;
+    virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const;
 
     /// SizeOf - Determine size of block data in bytes.
     ///
-    virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
+    virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const;
 
     // Implement isa/cast/dyncast.
     static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
 
 #ifndef NDEBUG
-    virtual void print(raw_ostream &O);
+    virtual void print(raw_ostream &O) const;
 #endif
   };
 
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
new file mode 100644
index 0000000..95eca90
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -0,0 +1,507 @@
+//===-- llvm/CodeGen/DIEHash.cpp - Dwarf Hashing Framework ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dwarfdebug"
+
+#include "DIEHash.h"
+
+#include "DIE.h"
+#include "DwarfCompileUnit.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// \brief Grabs the string in whichever attribute is passed in and returns
+/// a reference to it.
+static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) {
+  const SmallVectorImpl<DIEValue *> &Values = Die.getValues();
+  const DIEAbbrev &Abbrevs = Die.getAbbrev();
+
+  // Iterate through all the attributes until we find the one we're
+  // looking for, if we can't find it return an empty string.
+  for (size_t i = 0; i < Values.size(); ++i) {
+    if (Abbrevs.getData()[i].getAttribute() == Attr) {
+      DIEValue *V = Values[i];
+      assert(isa<DIEString>(V) && "String requested. Not a string.");
+      DIEString *S = cast<DIEString>(V);
+      return S->getString();
+    }
+  }
+  return StringRef("");
+}
+
+/// \brief Adds the string in \p Str to the hash. This also hashes
+/// a trailing NULL with the string.
+void DIEHash::addString(StringRef Str) {
+  DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
+  Hash.update(Str);
+  Hash.update(makeArrayRef((uint8_t)'\0'));
+}
+
+// FIXME: The LEB128 routines are copied and only slightly modified out of
+// LEB128.h.
+
+/// \brief Adds the unsigned in \p Value to the hash encoded as a ULEB128.
+void DIEHash::addULEB128(uint64_t Value) {
+  DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+  do {
+    uint8_t Byte = Value & 0x7f;
+    Value >>= 7;
+    if (Value != 0)
+      Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+    Hash.update(Byte);
+  } while (Value != 0);
+}
+
+void DIEHash::addSLEB128(int64_t Value) {
+  DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
+  bool More;
+  do {
+    uint8_t Byte = Value & 0x7f;
+    Value >>= 7;
+    More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
+              ((Value == -1) && ((Byte & 0x40) != 0))));
+    if (More)
+      Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+    Hash.update(Byte);
+  } while (More);
+}
+
+/// \brief Including \p Parent adds the context of Parent to the hash..
+void DIEHash::addParentContext(const DIE &Parent) {
+
+  DEBUG(dbgs() << "Adding parent context to hash...\n");
+
+  // [7.27.2] For each surrounding type or namespace beginning with the
+  // outermost such construct...
+  SmallVector<const DIE *, 1> Parents;
+  const DIE *Cur = &Parent;
+  while (Cur->getTag() != dwarf::DW_TAG_compile_unit) {
+    Parents.push_back(Cur);
+    Cur = Cur->getParent();
+  }
+
+  // Reverse iterate over our list to go from the outermost construct to the
+  // innermost.
+  for (SmallVectorImpl<const DIE *>::reverse_iterator I = Parents.rbegin(),
+                                                      E = Parents.rend();
+       I != E; ++I) {
+    const DIE &Die = **I;
+
+    // ... Append the letter "C" to the sequence...
+    addULEB128('C');
+
+    // ... Followed by the DWARF tag of the construct...
+    addULEB128(Die.getTag());
+
+    // ... Then the name, taken from the DW_AT_name attribute.
+    StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
+    DEBUG(dbgs() << "... adding context: " << Name << "\n");
+    if (!Name.empty())
+      addString(Name);
+  }
+}
+
+// Collect all of the attributes for a particular DIE in single structure.
+void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) {
+  const SmallVectorImpl<DIEValue *> &Values = Die.getValues();
+  const DIEAbbrev &Abbrevs = Die.getAbbrev();
+
+#define COLLECT_ATTR(NAME)                                                     \
+  case dwarf::NAME:                                                            \
+    Attrs.NAME.Val = Values[i];                                                \
+    Attrs.NAME.Desc = &Abbrevs.getData()[i];                                   \
+    break
+
+  for (size_t i = 0, e = Values.size(); i != e; ++i) {
+    DEBUG(dbgs() << "Attribute: "
+                 << dwarf::AttributeString(Abbrevs.getData()[i].getAttribute())
+                 << " added.\n");
+    switch (Abbrevs.getData()[i].getAttribute()) {
+    COLLECT_ATTR(DW_AT_name);
+    COLLECT_ATTR(DW_AT_accessibility);
+    COLLECT_ATTR(DW_AT_address_class);
+    COLLECT_ATTR(DW_AT_allocated);
+    COLLECT_ATTR(DW_AT_artificial);
+    COLLECT_ATTR(DW_AT_associated);
+    COLLECT_ATTR(DW_AT_binary_scale);
+    COLLECT_ATTR(DW_AT_bit_offset);
+    COLLECT_ATTR(DW_AT_bit_size);
+    COLLECT_ATTR(DW_AT_bit_stride);
+    COLLECT_ATTR(DW_AT_byte_size);
+    COLLECT_ATTR(DW_AT_byte_stride);
+    COLLECT_ATTR(DW_AT_const_expr);
+    COLLECT_ATTR(DW_AT_const_value);
+    COLLECT_ATTR(DW_AT_containing_type);
+    COLLECT_ATTR(DW_AT_count);
+    COLLECT_ATTR(DW_AT_data_bit_offset);
+    COLLECT_ATTR(DW_AT_data_location);
+    COLLECT_ATTR(DW_AT_data_member_location);
+    COLLECT_ATTR(DW_AT_decimal_scale);
+    COLLECT_ATTR(DW_AT_decimal_sign);
+    COLLECT_ATTR(DW_AT_default_value);
+    COLLECT_ATTR(DW_AT_digit_count);
+    COLLECT_ATTR(DW_AT_discr);
+    COLLECT_ATTR(DW_AT_discr_list);
+    COLLECT_ATTR(DW_AT_discr_value);
+    COLLECT_ATTR(DW_AT_encoding);
+    COLLECT_ATTR(DW_AT_enum_class);
+    COLLECT_ATTR(DW_AT_endianity);
+    COLLECT_ATTR(DW_AT_explicit);
+    COLLECT_ATTR(DW_AT_is_optional);
+    COLLECT_ATTR(DW_AT_location);
+    COLLECT_ATTR(DW_AT_lower_bound);
+    COLLECT_ATTR(DW_AT_mutable);
+    COLLECT_ATTR(DW_AT_ordering);
+    COLLECT_ATTR(DW_AT_picture_string);
+    COLLECT_ATTR(DW_AT_prototyped);
+    COLLECT_ATTR(DW_AT_small);
+    COLLECT_ATTR(DW_AT_segment);
+    COLLECT_ATTR(DW_AT_string_length);
+    COLLECT_ATTR(DW_AT_threads_scaled);
+    COLLECT_ATTR(DW_AT_upper_bound);
+    COLLECT_ATTR(DW_AT_use_location);
+    COLLECT_ATTR(DW_AT_use_UTF8);
+    COLLECT_ATTR(DW_AT_variable_parameter);
+    COLLECT_ATTR(DW_AT_virtuality);
+    COLLECT_ATTR(DW_AT_visibility);
+    COLLECT_ATTR(DW_AT_vtable_elem_location);
+    COLLECT_ATTR(DW_AT_type);
+    default:
+      break;
+    }
+  }
+}
+
+void DIEHash::hashShallowTypeReference(dwarf::Attribute Attribute,
+                                       const DIE &Entry, StringRef Name) {
+  // append the letter 'N'
+  addULEB128('N');
+
+  // the DWARF attribute code (DW_AT_type or DW_AT_friend),
+  addULEB128(Attribute);
+
+  // the context of the tag,
+  if (const DIE *Parent = Entry.getParent())
+    addParentContext(*Parent);
+
+  // the letter 'E',
+  addULEB128('E');
+
+  // and the name of the type.
+  addString(Name);
+
+  // Currently DW_TAG_friends are not used by Clang, but if they do become so,
+  // here's the relevant spec text to implement:
+  //
+  // For DW_TAG_friend, if the referenced entry is the DW_TAG_subprogram,
+  // the context is omitted and the name to be used is the ABI-specific name
+  // of the subprogram (e.g., the mangled linker name).
+}
+
+void DIEHash::hashRepeatedTypeReference(dwarf::Attribute Attribute,
+                                        unsigned DieNumber) {
+  // a) If T is in the list of [previously hashed types], use the letter
+  // 'R' as the marker
+  addULEB128('R');
+
+  addULEB128(Attribute);
+
+  // and use the unsigned LEB128 encoding of [the index of T in the
+  // list] as the attribute value;
+  addULEB128(DieNumber);
+}
+
+void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+                           const DIE &Entry) {
+  assert(Tag != dwarf::DW_TAG_friend && "No current LLVM clients emit friend "
+                                        "tags. Add support here when there's "
+                                        "a use case");
+  // Step 5
+  // If the tag in Step 3 is one of [the below tags]
+  if ((Tag == dwarf::DW_TAG_pointer_type ||
+       Tag == dwarf::DW_TAG_reference_type ||
+       Tag == dwarf::DW_TAG_rvalue_reference_type ||
+       Tag == dwarf::DW_TAG_ptr_to_member_type) &&
+      // and the referenced type (via the [below attributes])
+      // FIXME: This seems overly restrictive, and causes hash mismatches
+      // there's a decl/def difference in the containing type of a
+      // ptr_to_member_type, but it's what DWARF says, for some reason.
+      Attribute == dwarf::DW_AT_type) {
+    // ... has a DW_AT_name attribute,
+    StringRef Name = getDIEStringAttr(Entry, dwarf::DW_AT_name);
+    if (!Name.empty()) {
+      hashShallowTypeReference(Attribute, Entry, Name);
+      return;
+    }
+  }
+
+  unsigned &DieNumber = Numbering[&Entry];
+  if (DieNumber) {
+    hashRepeatedTypeReference(Attribute, DieNumber);
+    return;
+  }
+
+  // otherwise, b) use the letter 'T' as a the marker, ...
+  addULEB128('T');
+
+  addULEB128(Attribute);
+
+  // ... process the type T recursively by performing Steps 2 through 7, and
+  // use the result as the attribute value.
+  DieNumber = Numbering.size();
+  computeHash(Entry);
+}
+
+// Hash an individual attribute \param Attr based on the type of attribute and
+// the form.
+void DIEHash::hashAttribute(AttrEntry Attr, dwarf::Tag Tag) {
+  const DIEValue *Value = Attr.Val;
+  const DIEAbbrevData *Desc = Attr.Desc;
+  dwarf::Attribute Attribute = Desc->getAttribute();
+
+  // 7.27 Step 3
+  // ... An attribute that refers to another type entry T is processed as
+  // follows:
+  if (const DIEEntry *EntryAttr = dyn_cast<DIEEntry>(Value)) {
+    hashDIEEntry(Attribute, Tag, *EntryAttr->getEntry());
+    return;
+  }
+
+  // Other attribute values use the letter 'A' as the marker, ...
+  addULEB128('A');
+
+  addULEB128(Attribute);
+
+  // ... and the value consists of the form code (encoded as an unsigned LEB128
+  // value) followed by the encoding of the value according to the form code. To
+  // ensure reproducibility of the signature, the set of forms used in the
+  // signature computation is limited to the following: DW_FORM_sdata,
+  // DW_FORM_flag, DW_FORM_string, and DW_FORM_block.
+  switch (Desc->getForm()) {
+  case dwarf::DW_FORM_string:
+    llvm_unreachable(
+        "Add support for DW_FORM_string if we ever start emitting them again");
+  case dwarf::DW_FORM_GNU_str_index:
+  case dwarf::DW_FORM_strp:
+    addULEB128(dwarf::DW_FORM_string);
+    addString(cast<DIEString>(Value)->getString());
+    break;
+  case dwarf::DW_FORM_data1:
+  case dwarf::DW_FORM_data2:
+  case dwarf::DW_FORM_data4:
+  case dwarf::DW_FORM_data8:
+  case dwarf::DW_FORM_udata:
+    addULEB128(dwarf::DW_FORM_sdata);
+    addSLEB128((int64_t)cast<DIEInteger>(Value)->getValue());
+    break;
+  default:
+    llvm_unreachable("Add support for additional forms");
+  }
+}
+
+// Go through the attributes from \param Attrs in the order specified in 7.27.4
+// and hash them.
+void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) {
+#define ADD_ATTR(ATTR)                                                         \
+  {                                                                            \
+    if (ATTR.Val != 0)                                                         \
+      hashAttribute(ATTR, Tag);                                                \
+  }
+
+  ADD_ATTR(Attrs.DW_AT_name);
+  ADD_ATTR(Attrs.DW_AT_accessibility);
+  ADD_ATTR(Attrs.DW_AT_address_class);
+  ADD_ATTR(Attrs.DW_AT_allocated);
+  ADD_ATTR(Attrs.DW_AT_artificial);
+  ADD_ATTR(Attrs.DW_AT_associated);
+  ADD_ATTR(Attrs.DW_AT_binary_scale);
+  ADD_ATTR(Attrs.DW_AT_bit_offset);
+  ADD_ATTR(Attrs.DW_AT_bit_size);
+  ADD_ATTR(Attrs.DW_AT_bit_stride);
+  ADD_ATTR(Attrs.DW_AT_byte_size);
+  ADD_ATTR(Attrs.DW_AT_byte_stride);
+  ADD_ATTR(Attrs.DW_AT_const_expr);
+  ADD_ATTR(Attrs.DW_AT_const_value);
+  ADD_ATTR(Attrs.DW_AT_containing_type);
+  ADD_ATTR(Attrs.DW_AT_count);
+  ADD_ATTR(Attrs.DW_AT_data_bit_offset);
+  ADD_ATTR(Attrs.DW_AT_data_location);
+  ADD_ATTR(Attrs.DW_AT_data_member_location);
+  ADD_ATTR(Attrs.DW_AT_decimal_scale);
+  ADD_ATTR(Attrs.DW_AT_decimal_sign);
+  ADD_ATTR(Attrs.DW_AT_default_value);
+  ADD_ATTR(Attrs.DW_AT_digit_count);
+  ADD_ATTR(Attrs.DW_AT_discr);
+  ADD_ATTR(Attrs.DW_AT_discr_list);
+  ADD_ATTR(Attrs.DW_AT_discr_value);
+  ADD_ATTR(Attrs.DW_AT_encoding);
+  ADD_ATTR(Attrs.DW_AT_enum_class);
+  ADD_ATTR(Attrs.DW_AT_endianity);
+  ADD_ATTR(Attrs.DW_AT_explicit);
+  ADD_ATTR(Attrs.DW_AT_is_optional);
+  ADD_ATTR(Attrs.DW_AT_location);
+  ADD_ATTR(Attrs.DW_AT_lower_bound);
+  ADD_ATTR(Attrs.DW_AT_mutable);
+  ADD_ATTR(Attrs.DW_AT_ordering);
+  ADD_ATTR(Attrs.DW_AT_picture_string);
+  ADD_ATTR(Attrs.DW_AT_prototyped);
+  ADD_ATTR(Attrs.DW_AT_small);
+  ADD_ATTR(Attrs.DW_AT_segment);
+  ADD_ATTR(Attrs.DW_AT_string_length);
+  ADD_ATTR(Attrs.DW_AT_threads_scaled);
+  ADD_ATTR(Attrs.DW_AT_upper_bound);
+  ADD_ATTR(Attrs.DW_AT_use_location);
+  ADD_ATTR(Attrs.DW_AT_use_UTF8);
+  ADD_ATTR(Attrs.DW_AT_variable_parameter);
+  ADD_ATTR(Attrs.DW_AT_virtuality);
+  ADD_ATTR(Attrs.DW_AT_visibility);
+  ADD_ATTR(Attrs.DW_AT_vtable_elem_location);
+  ADD_ATTR(Attrs.DW_AT_type);
+
+  // FIXME: Add the extended attributes.
+}
+
+// Add all of the attributes for \param Die to the hash.
+void DIEHash::addAttributes(const DIE &Die) {
+  DIEAttrs Attrs = {};
+  collectAttributes(Die, Attrs);
+  hashAttributes(Attrs, Die.getTag());
+}
+
+void DIEHash::hashNestedType(const DIE &Die, StringRef Name) {
+  // 7.27 Step 7
+  // ... append the letter 'S',
+  addULEB128('S');
+
+  // the tag of C,
+  addULEB128(Die.getTag());
+
+  // and the name.
+  addString(Name);
+}
+
+// Compute the hash of a DIE. This is based on the type signature computation
+// given in section 7.27 of the DWARF4 standard. It is the md5 hash of a
+// flattened description of the DIE.
+void DIEHash::computeHash(const DIE &Die) {
+  // Append the letter 'D', followed by the DWARF tag of the DIE.
+  addULEB128('D');
+  addULEB128(Die.getTag());
+
+  // Add each of the attributes of the DIE.
+  addAttributes(Die);
+
+  // Then hash each of the children of the DIE.
+  for (std::vector<DIE *>::const_iterator I = Die.getChildren().begin(),
+                                          E = Die.getChildren().end();
+       I != E; ++I) {
+    // 7.27 Step 7
+    // If C is a nested type entry or a member function entry, ...
+    if (isType((*I)->getTag()) || (*I)->getTag() == dwarf::DW_TAG_subprogram) {
+      StringRef Name = getDIEStringAttr(**I, dwarf::DW_AT_name);
+      // ... and has a DW_AT_name attribute
+      if (!Name.empty()) {
+        hashNestedType(**I, Name);
+        continue;
+      }
+    }
+    computeHash(**I);
+  }
+
+  // Following the last (or if there are no children), append a zero byte.
+  Hash.update(makeArrayRef((uint8_t)'\0'));
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is the md5 hash of a flattened description of the DIE
+/// with the exception that we are hashing only the context and the name of the
+/// type.
+uint64_t DIEHash::computeDIEODRSignature(const DIE &Die) {
+
+  // Add the contexts to the hash. We won't be computing the ODR hash for
+  // function local types so it's safe to use the generic context hashing
+  // algorithm here.
+  // FIXME: If we figure out how to account for linkage in some way we could
+  // actually do this with a slight modification to the parent hash algorithm.
+  if (const DIE *Parent = Die.getParent())
+    addParentContext(*Parent);
+
+  // Add the current DIE information.
+
+  // Add the DWARF tag of the DIE.
+  addULEB128(Die.getTag());
+
+  // Add the name of the type to the hash.
+  addString(getDIEStringAttr(Die, dwarf::DW_AT_name));
+
+  // Now get the result.
+  MD5::MD5Result Result;
+  Hash.final(Result);
+
+  // ... take the least significant 8 bytes and return those. Our MD5
+  // implementation always returns its results in little endian, swap bytes
+  // appropriately.
+  return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of the full CU and all top level CU entities.
+// TODO: Initialize the type chain at 0 instead of 1 for CU signatures.
+uint64_t DIEHash::computeCUSignature(const DIE &Die) {
+  Numbering.clear();
+  Numbering[&Die] = 1;
+
+  // Hash the DIE.
+  computeHash(Die);
+
+  // Now return the result.
+  MD5::MD5Result Result;
+  Hash.final(Result);
+
+  // ... take the least significant 8 bytes and return those. Our MD5
+  // implementation always returns its results in little endian, swap bytes
+  // appropriately.
+  return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of additional forms not specifically called out in the
+/// standard.
+uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
+  Numbering.clear();
+  Numbering[&Die] = 1;
+
+  if (const DIE *Parent = Die.getParent())
+    addParentContext(*Parent);
+
+  // Hash the DIE.
+  computeHash(Die);
+
+  // Now return the result.
+  MD5::MD5Result Result;
+  Hash.final(Result);
+
+  // ... take the least significant 8 bytes and return those. Our MD5
+  // implementation always returns its results in little endian, swap bytes
+  // appropriately.
+  return *reinterpret_cast<support::ulittle64_t *>(Result + 8);
+}
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
new file mode 100644
index 0000000..f0c4ef9
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -0,0 +1,147 @@
+//===-- llvm/CodeGen/DIEHash.h - Dwarf Hashing Framework -------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DIE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/MD5.h"
+
+namespace llvm {
+
+class CompileUnit;
+
+/// \brief An object containing the capability of hashing and adding hash
+/// attributes onto a DIE.
+class DIEHash {
+  // The entry for a particular attribute.
+  struct AttrEntry {
+    const DIEValue *Val;
+    const DIEAbbrevData *Desc;
+  };
+
+  // Collection of all attributes used in hashing a particular DIE.
+  struct DIEAttrs {
+    AttrEntry DW_AT_name;
+    AttrEntry DW_AT_accessibility;
+    AttrEntry DW_AT_address_class;
+    AttrEntry DW_AT_allocated;
+    AttrEntry DW_AT_artificial;
+    AttrEntry DW_AT_associated;
+    AttrEntry DW_AT_binary_scale;
+    AttrEntry DW_AT_bit_offset;
+    AttrEntry DW_AT_bit_size;
+    AttrEntry DW_AT_bit_stride;
+    AttrEntry DW_AT_byte_size;
+    AttrEntry DW_AT_byte_stride;
+    AttrEntry DW_AT_const_expr;
+    AttrEntry DW_AT_const_value;
+    AttrEntry DW_AT_containing_type;
+    AttrEntry DW_AT_count;
+    AttrEntry DW_AT_data_bit_offset;
+    AttrEntry DW_AT_data_location;
+    AttrEntry DW_AT_data_member_location;
+    AttrEntry DW_AT_decimal_scale;
+    AttrEntry DW_AT_decimal_sign;
+    AttrEntry DW_AT_default_value;
+    AttrEntry DW_AT_digit_count;
+    AttrEntry DW_AT_discr;
+    AttrEntry DW_AT_discr_list;
+    AttrEntry DW_AT_discr_value;
+    AttrEntry DW_AT_encoding;
+    AttrEntry DW_AT_enum_class;
+    AttrEntry DW_AT_endianity;
+    AttrEntry DW_AT_explicit;
+    AttrEntry DW_AT_is_optional;
+    AttrEntry DW_AT_location;
+    AttrEntry DW_AT_lower_bound;
+    AttrEntry DW_AT_mutable;
+    AttrEntry DW_AT_ordering;
+    AttrEntry DW_AT_picture_string;
+    AttrEntry DW_AT_prototyped;
+    AttrEntry DW_AT_small;
+    AttrEntry DW_AT_segment;
+    AttrEntry DW_AT_string_length;
+    AttrEntry DW_AT_threads_scaled;
+    AttrEntry DW_AT_upper_bound;
+    AttrEntry DW_AT_use_location;
+    AttrEntry DW_AT_use_UTF8;
+    AttrEntry DW_AT_variable_parameter;
+    AttrEntry DW_AT_virtuality;
+    AttrEntry DW_AT_visibility;
+    AttrEntry DW_AT_vtable_elem_location;
+    AttrEntry DW_AT_type;
+
+    // Insert any additional ones here...
+  };
+
+public:
+  /// \brief Computes the ODR signature.
+  uint64_t computeDIEODRSignature(const DIE &Die);
+
+  /// \brief Computes the CU signature.
+  uint64_t computeCUSignature(const DIE &Die);
+
+  /// \brief Computes the type signature.
+  uint64_t computeTypeSignature(const DIE &Die);
+
+  // Helper routines to process parts of a DIE.
+private:
+  /// \brief Adds the parent context of \param Die to the hash.
+  void addParentContext(const DIE &Die);
+
+  /// \brief Adds the attributes of \param Die to the hash.
+  void addAttributes(const DIE &Die);
+
+  /// \brief Computes the full DWARF4 7.27 hash of the DIE.
+  void computeHash(const DIE &Die);
+
+  // Routines that add DIEValues to the hash.
+private:
+  /// \brief Encodes and adds \param Value to the hash as a ULEB128.
+  void addULEB128(uint64_t Value);
+
+  /// \brief Encodes and adds \param Value to the hash as a SLEB128.
+  void addSLEB128(int64_t Value);
+
+  /// \brief Adds \param Str to the hash and includes a NULL byte.
+  void addString(StringRef Str);
+
+  /// \brief Collects the attributes of DIE \param Die into the \param Attrs
+  /// structure.
+  void collectAttributes(const DIE &Die, DIEAttrs &Attrs);
+
+  /// \brief Hashes the attributes in \param Attrs in order.
+  void hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag);
+
+  /// \brief Hashes an individual attribute.
+  void hashAttribute(AttrEntry Attr, dwarf::Tag Tag);
+
+  /// \brief Hashes an attribute that refers to another DIE.
+  void hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+                    const DIE &Entry);
+
+  /// \brief Hashes a reference to a named type in such a way that is
+  /// independent of whether that type is described by a declaration or a
+  /// definition.
+  void hashShallowTypeReference(dwarf::Attribute Attribute, const DIE &Entry,
+                                StringRef Name);
+
+  /// \brief Hashes a reference to a previously referenced type DIE.
+  void hashRepeatedTypeReference(dwarf::Attribute Attribute, unsigned DieNumber);
+
+  void hashNestedType(const DIE &Die, StringRef Name);
+
+private:
+  MD5 Hash;
+  DenseMap<const DIE *, unsigned> Numbering;
+};
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
index f58ec9b..689aeda 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -24,27 +24,14 @@
 
 using namespace llvm;
 
-const char *DwarfAccelTable::Atom::AtomTypeString(enum AtomType AT) {
-  switch (AT) {
-  case eAtomTypeNULL: return "eAtomTypeNULL";
-  case eAtomTypeDIEOffset: return "eAtomTypeDIEOffset";
-  case eAtomTypeCUOffset: return "eAtomTypeCUOffset";
-  case eAtomTypeTag: return "eAtomTypeTag";
-  case eAtomTypeNameFlags: return "eAtomTypeNameFlags";
-  case eAtomTypeTypeFlags: return "eAtomTypeTypeFlags";
-  }
-  llvm_unreachable("invalid AtomType!");
-}
-
 // The length of the header data is always going to be 4 + 4 + 4*NumAtoms.
-DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList) :
-  Header(8 + (atomList.size() * 4)),
-  HeaderData(atomList),
-  Entries(Allocator) { }
+DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList)
+    : Header(8 + (atomList.size() * 4)), HeaderData(atomList),
+      Entries(Allocator) {}
 
-DwarfAccelTable::~DwarfAccelTable() { }
+DwarfAccelTable::~DwarfAccelTable() {}
 
-void DwarfAccelTable::AddName(StringRef Name, DIE* die, char Flags) {
+void DwarfAccelTable::AddName(StringRef Name, DIE *die, char Flags) {
   assert(Data.empty() && "Already finalized!");
   // If the string is in the list already then add this die to the list
   // otherwise add a new one.
@@ -59,13 +46,16 @@ void DwarfAccelTable::ComputeBucketCount(void) {
     uniques[i] = Data[i]->HashValue;
   array_pod_sort(uniques.begin(), uniques.end());
   std::vector<uint32_t>::iterator p =
-    std::unique(uniques.begin(), uniques.end());
+      std::unique(uniques.begin(), uniques.end());
   uint32_t num = std::distance(uniques.begin(), p);
 
   // Then compute the bucket size, minimum of 1 bucket.
-  if (num > 1024) Header.bucket_count = num/4;
-  if (num > 16) Header.bucket_count = num/2;
-  else Header.bucket_count = num > 0 ? num : 1;
+  if (num > 1024)
+    Header.bucket_count = num / 4;
+  if (num > 16)
+    Header.bucket_count = num / 2;
+  else
+    Header.bucket_count = num > 0 ? num : 1;
 
   Header.hashes_count = num;
 }
@@ -76,15 +66,15 @@ static bool compareDIEs(const DwarfAccelTable::HashDataContents *A,
   return A->Die->getOffset() < B->Die->getOffset();
 }
 
-void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, const char *Prefix) {
+void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, StringRef Prefix) {
   // Create the individual hash data outputs.
-  for (StringMap<DataArray>::iterator
-         EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+  for (StringMap<DataArray>::iterator EI = Entries.begin(), EE = Entries.end();
+       EI != EE; ++EI) {
 
     // Unique the entries.
     std::stable_sort(EI->second.begin(), EI->second.end(), compareDIEs);
     EI->second.erase(std::unique(EI->second.begin(), EI->second.end()),
-                       EI->second.end());
+                     EI->second.end());
 
     HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second);
     Data.push_back(Entry);
@@ -126,7 +116,7 @@ void DwarfAccelTable::EmitHeader(AsmPrinter *Asm) {
   Asm->EmitInt32(HeaderData.Atoms.size());
   for (size_t i = 0; i < HeaderData.Atoms.size(); i++) {
     Atom A = HeaderData.Atoms[i];
-    Asm->OutStreamer.AddComment(Atom::AtomTypeString(A.type));
+    Asm->OutStreamer.AddComment(dwarf::AtomTypeString(A.type));
     Asm->EmitInt16(A.type);
     Asm->OutStreamer.AddComment(dwarf::FormEncodingString(A.form));
     Asm->EmitInt16(A.form);
@@ -152,7 +142,8 @@ void DwarfAccelTable::EmitBuckets(AsmPrinter *Asm) {
 void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
   for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI) {
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI) {
       Asm->OutStreamer.AddComment("Hash in Bucket " + Twine(i));
       Asm->EmitInt32((*HI)->HashValue);
     }
@@ -166,13 +157,13 @@ void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
 void DwarfAccelTable::EmitOffsets(AsmPrinter *Asm, MCSymbol *SecBegin) {
   for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI) {
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI) {
       Asm->OutStreamer.AddComment("Offset in Bucket " + Twine(i));
       MCContext &Context = Asm->OutStreamer.getContext();
-      const MCExpr *Sub =
-        MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create((*HI)->Sym, Context),
-                                MCSymbolRefExpr::Create(SecBegin, Context),
-                                Context);
+      const MCExpr *Sub = MCBinaryExpr::CreateSub(
+          MCSymbolRefExpr::Create((*HI)->Sym, Context),
+          MCSymbolRefExpr::Create(SecBegin, Context), Context);
       Asm->OutStreamer.EmitValue(Sub, sizeof(uint32_t));
     }
   }
@@ -185,7 +176,8 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
   uint64_t PrevHash = UINT64_MAX;
   for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI) {
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI) {
       // Remember to emit the label for our offset.
       Asm->OutStreamer.EmitLabel((*HI)->Sym);
       Asm->OutStreamer.AddComment((*HI)->Str);
@@ -193,8 +185,9 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
                              D->getStringPoolSym());
       Asm->OutStreamer.AddComment("Num DIEs");
       Asm->EmitInt32((*HI)->Data.size());
-      for (ArrayRef<HashDataContents*>::const_iterator
-             DI = (*HI)->Data.begin(), DE = (*HI)->Data.end();
+      for (ArrayRef<HashDataContents *>::const_iterator
+               DI = (*HI)->Data.begin(),
+               DE = (*HI)->Data.end();
            DI != DE; ++DI) {
         // Emit the DIE offset
         Asm->EmitInt32((*DI)->Die->getOffset());
@@ -214,8 +207,7 @@ void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfUnits *D) {
 }
 
 // Emit the entire data structure to the output file.
-void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin,
-                           DwarfUnits *D) {
+void DwarfAccelTable::Emit(AsmPrinter *Asm, MCSymbol *SecBegin, DwarfUnits *D) {
   // Emit the header.
   EmitHeader(Asm);
 
@@ -239,11 +231,12 @@ void DwarfAccelTable::print(raw_ostream &O) {
   HeaderData.print(O);
 
   O << "Entries: \n";
-  for (StringMap<DataArray>::const_iterator
-         EI = Entries.begin(), EE = Entries.end(); EI != EE; ++EI) {
+  for (StringMap<DataArray>::const_iterator EI = Entries.begin(),
+                                            EE = Entries.end();
+       EI != EE; ++EI) {
     O << "Name: " << EI->getKeyData() << "\n";
     for (DataArray::const_iterator DI = EI->second.begin(),
-           DE = EI->second.end();
+                                   DE = EI->second.end();
          DI != DE; ++DI)
       (*DI)->print(O);
   }
@@ -251,14 +244,14 @@ void DwarfAccelTable::print(raw_ostream &O) {
   O << "Buckets and Hashes: \n";
   for (size_t i = 0, e = Buckets.size(); i < e; ++i)
     for (HashList::const_iterator HI = Buckets[i].begin(),
-           HE = Buckets[i].end(); HI != HE; ++HI)
+                                  HE = Buckets[i].end();
+         HI != HE; ++HI)
       (*HI)->print(O);
 
   O << "Data: \n";
-    for (std::vector<HashData*>::const_iterator
-           DI = Data.begin(), DE = Data.end(); DI != DE; ++DI)
-      (*DI)->print(O);
-
-
+  for (std::vector<HashData *>::const_iterator DI = Data.begin(),
+                                               DE = Data.end();
+       DI != DE; ++DI)
+    (*DI)->print(O);
 }
 #endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 9915bca..7627313 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -67,11 +67,7 @@ class DwarfUnits;
 
 class DwarfAccelTable {
 
-  enum HashFunctionType {
-    eHashFunctionDJB = 0u
-  };
-
-  static uint32_t HashDJB (StringRef Str) {
+  static uint32_t HashDJB(StringRef Str) {
     uint32_t h = 5381;
     for (unsigned i = 0, e = Str.size(); i != e; ++i)
       h = ((h << 5) + h) + Str[i];
@@ -80,25 +76,25 @@ class DwarfAccelTable {
 
   // Helper function to compute the number of buckets needed based on
   // the number of unique hashes.
-  void ComputeBucketCount (void);
+  void ComputeBucketCount(void);
 
   struct TableHeader {
-    uint32_t   magic;           // 'HASH' magic value to allow endian detection
-    uint16_t   version;         // Version number.
-    uint16_t   hash_function;   // The hash function enumeration that was used.
-    uint32_t   bucket_count;    // The number of buckets in this hash table.
-    uint32_t   hashes_count;    // The total number of unique hash values
-                                // and hash data offsets in this table.
-    uint32_t   header_data_len; // The bytes to skip to get to the hash
-                                // indexes (buckets) for correct alignment.
+    uint32_t magic;           // 'HASH' magic value to allow endian detection
+    uint16_t version;         // Version number.
+    uint16_t hash_function;   // The hash function enumeration that was used.
+    uint32_t bucket_count;    // The number of buckets in this hash table.
+    uint32_t hashes_count;    // The total number of unique hash values
+                              // and hash data offsets in this table.
+    uint32_t header_data_len; // The bytes to skip to get to the hash
+                              // indexes (buckets) for correct alignment.
     // Also written to disk is the implementation specific header data.
 
     static const uint32_t MagicHash = 0x48415348;
 
-    TableHeader (uint32_t data_len) :
-      magic (MagicHash), version (1), hash_function (eHashFunctionDJB),
-      bucket_count (0), hashes_count (0), header_data_len (data_len)
-    {}
+    TableHeader(uint32_t data_len)
+        : magic(MagicHash), version(1),
+          hash_function(dwarf::DW_hash_function_djb), bucket_count(0),
+          hashes_count(0), header_data_len(data_len) {}
 
 #ifndef NDEBUG
     void print(raw_ostream &O) {
@@ -124,62 +120,38 @@ public:
   // uint32_t die_offset_base
   // uint32_t atom_count
   // atom_count Atoms
-  enum AtomType {
-    eAtomTypeNULL       = 0u,
-    eAtomTypeDIEOffset  = 1u,   // DIE offset, check form for encoding
-    eAtomTypeCUOffset   = 2u,   // DIE offset of the compiler unit header that
-                                // contains the item in question
-    eAtomTypeTag        = 3u,   // DW_TAG_xxx value, should be encoded as
-                                // DW_FORM_data1 (if no tags exceed 255) or
-                                // DW_FORM_data2.
-    eAtomTypeNameFlags  = 4u,   // Flags from enum NameFlags
-    eAtomTypeTypeFlags  = 5u    // Flags from enum TypeFlags
-  };
-
-  enum TypeFlags {
-    eTypeFlagClassMask = 0x0000000fu,
-
-    // Always set for C++, only set for ObjC if this is the
-    // @implementation for a class.
-    eTypeFlagClassIsImplementation  = ( 1u << 1 )
-  };
 
   // Make these public so that they can be used as a general interface to
   // the class.
   struct Atom {
-    AtomType type; // enum AtomType
+    uint16_t type; // enum AtomType
     uint16_t form; // DWARF DW_FORM_ defines
 
-    Atom(AtomType type, uint16_t form) : type(type), form(form) {}
-    static const char * AtomTypeString(enum AtomType);
+    Atom(uint16_t type, uint16_t form) : type(type), form(form) {}
 #ifndef NDEBUG
     void print(raw_ostream &O) {
-      O << "Type: " << AtomTypeString(type) << "\n"
+      O << "Type: " << dwarf::AtomTypeString(type) << "\n"
         << "Form: " << dwarf::FormEncodingString(form) << "\n";
     }
-    void dump() {
-      print(dbgs());
-    }
+    void dump() { print(dbgs()); }
 #endif
   };
 
- private:
+private:
   struct TableHeaderData {
     uint32_t die_offset_base;
     SmallVector<Atom, 1> Atoms;
 
     TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0)
-      : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) { }
+        : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) {}
 
 #ifndef NDEBUG
-    void print (raw_ostream &O) {
+    void print(raw_ostream &O) {
       O << "die_offset_base: " << die_offset_base << "\n";
       for (size_t i = 0; i < Atoms.size(); i++)
         Atoms[i].print(O);
     }
-    void dump() {
-      print(dbgs());
-    }
+    void dump() { print(dbgs()); }
 #endif
   };
 
@@ -193,37 +165,38 @@ public:
   // HashData[hash_data_count]
 public:
   struct HashDataContents {
-    DIE *Die; // Offsets
+    DIE *Die;   // Offsets
     char Flags; // Specific flags to output
 
-    HashDataContents(DIE *D, char Flags) :
-      Die(D),
-      Flags(Flags) { }
-    #ifndef NDEBUG
+    HashDataContents(DIE *D, char Flags) : Die(D), Flags(Flags) {}
+#ifndef NDEBUG
     void print(raw_ostream &O) const {
       O << "  Offset: " << Die->getOffset() << "\n";
       O << "  Tag: " << dwarf::TagString(Die->getTag()) << "\n";
       O << "  Flags: " << Flags << "\n";
     }
-    #endif
+#endif
   };
+
 private:
   struct HashData {
     StringRef Str;
     uint32_t HashValue;
     MCSymbol *Sym;
-    ArrayRef<HashDataContents*> Data; // offsets
-    HashData(StringRef S, ArrayRef<HashDataContents*> Data)
-      : Str(S), Data(Data) {
+    ArrayRef<HashDataContents *> Data; // offsets
+    HashData(StringRef S, ArrayRef<HashDataContents *> Data)
+        : Str(S), Data(Data) {
       HashValue = DwarfAccelTable::HashDJB(S);
     }
-    #ifndef NDEBUG
+#ifndef NDEBUG
     void print(raw_ostream &O) {
       O << "Name: " << Str << "\n";
       O << "  Hash Value: " << format("0x%x", HashValue) << "\n";
-      O << "  Symbol: " ;
-      if (Sym) Sym->print(O);
-      else O << "<none>";
+      O << "  Symbol: ";
+      if (Sym)
+        Sym->print(O);
+      else
+        O << "<none>";
       O << "\n";
       for (size_t i = 0; i < Data.size(); i++) {
         O << "  Offset: " << Data[i]->Die->getOffset() << "\n";
@@ -231,14 +204,12 @@ private:
         O << "  Flags: " << Data[i]->Flags << "\n";
       }
     }
-    void dump() {
-      print(dbgs());
-    }
-    #endif
+    void dump() { print(dbgs()); }
+#endif
   };
 
-  DwarfAccelTable(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
-  void operator=(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
+  DwarfAccelTable(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
+  void operator=(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
 
   // Internal Functions
   void EmitHeader(AsmPrinter *);
@@ -253,31 +224,30 @@ private:
   // Output Variables
   TableHeader Header;
   TableHeaderData HeaderData;
-  std::vector<HashData*> Data;
+  std::vector<HashData *> Data;
 
   // String Data
-  typedef std::vector<HashDataContents*> DataArray;
-  typedef StringMap<DataArray, BumpPtrAllocator&> StringEntries;
+  typedef std::vector<HashDataContents *> DataArray;
+  typedef StringMap<DataArray, BumpPtrAllocator &> StringEntries;
   StringEntries Entries;
 
   // Buckets/Hashes/Offsets
-  typedef std::vector<HashData*> HashList;
+  typedef std::vector<HashData *> HashList;
   typedef std::vector<HashList> BucketList;
   BucketList Buckets;
   HashList Hashes;
 
   // Public Implementation
- public:
+public:
   DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom>);
   ~DwarfAccelTable();
-  void AddName(StringRef, DIE*, char = 0);
-  void FinalizeTable(AsmPrinter *, const char *);
+  void AddName(StringRef, DIE *, char = 0);
+  void FinalizeTable(AsmPrinter *, StringRef);
   void Emit(AsmPrinter *, MCSymbol *, DwarfUnits *);
 #ifndef NDEBUG
   void print(raw_ostream &O);
   void dump() { print(dbgs()); }
 #endif
 };
-
 }
 #endif
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index fec5ced..8918f3d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -68,7 +68,7 @@ void DwarfCFIException::EndModule() {
   for (size_t i = 0, e = Personalities.size(); i != e; ++i) {
     if (!Personalities[i])
       continue;
-    MCSymbol *Sym = Asm->Mang->getSymbol(Personalities[i]);
+    MCSymbol *Sym = Asm->getSymbol(Personalities[i]);
     TLOF.emitPersonalityValue(Asm->OutStreamer, Asm->TM, Sym);
     AtLeastOne = true;
   }
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 89abcff..a6ff953 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -22,21 +22,23 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/Target/Mangler.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
 
 /// CompileUnit - Compile unit constructor.
-CompileUnit::CompileUnit(unsigned UID, unsigned L, DIE *D, AsmPrinter *A,
-                         DwarfDebug *DW, DwarfUnits *DWU)
-  : UniqueID(UID), Language(L), CUDie(D), Asm(A), DD(DW), DU(DWU),
-    IndexTyDie(0), DebugInfoOffset(0) {
+CompileUnit::CompileUnit(unsigned UID, DIE *D, DICompileUnit Node,
+                         AsmPrinter *A, DwarfDebug *DW, DwarfUnits *DWU)
+    : UniqueID(UID), Node(Node), CUDie(D), Asm(A), DD(DW), DU(DWU),
+      IndexTyDie(0), DebugInfoOffset(0) {
   DIEIntegerOne = new (DIEValueAllocator) DIEInteger(1);
+  insertDIE(Node, D);
 }
 
 /// ~CompileUnit - Destructor for compile unit.
@@ -55,7 +57,7 @@ DIEEntry *CompileUnit::createDIEEntry(DIE *Entry) {
 /// getDefaultLowerBound - Return the default lower bound for an array. If the
 /// DWARF version doesn't handle the language, return -1.
 int64_t CompileUnit::getDefaultLowerBound() const {
-  switch (Language) {
+  switch (getLanguage()) {
   default:
     break;
 
@@ -96,32 +98,71 @@ int64_t CompileUnit::getDefaultLowerBound() const {
   return -1;
 }
 
+/// Check whether the DIE for this MDNode can be shared across CUs.
+static bool isShareableAcrossCUs(DIDescriptor D) {
+  // When the MDNode can be part of the type system, the DIE can be
+  // shared across CUs.
+  return D.isType() ||
+         (D.isSubprogram() && !DISubprogram(D).isDefinition());
+}
+
+/// getDIE - Returns the debug information entry map slot for the
+/// specified debug variable. We delegate the request to DwarfDebug
+/// when the DIE for this MDNode can be shared across CUs. The mappings
+/// will be kept in DwarfDebug for shareable DIEs.
+DIE *CompileUnit::getDIE(DIDescriptor D) const {
+  if (isShareableAcrossCUs(D))
+    return DD->getDIE(D);
+  return MDNodeToDieMap.lookup(D);
+}
+
+/// insertDIE - Insert DIE into the map. We delegate the request to DwarfDebug
+/// when the DIE for this MDNode can be shared across CUs. The mappings
+/// will be kept in DwarfDebug for shareable DIEs.
+void CompileUnit::insertDIE(DIDescriptor Desc, DIE *D) {
+  if (isShareableAcrossCUs(Desc)) {
+    DD->insertDIE(Desc, D);
+    return;
+  }
+  MDNodeToDieMap.insert(std::make_pair(Desc, D));
+}
+
 /// addFlag - Add a flag that is true.
-void CompileUnit::addFlag(DIE *Die, unsigned Attribute) {
-  if (!DD->useDarwinGDBCompat())
-    Die->addValue(Attribute, dwarf::DW_FORM_flag_present,
-                  DIEIntegerOne);
+void CompileUnit::addFlag(DIE *Die, dwarf::Attribute Attribute) {
+  if (DD->getDwarfVersion() >= 4)
+    Die->addValue(Attribute, dwarf::DW_FORM_flag_present, DIEIntegerOne);
   else
-    addUInt(Die, Attribute, dwarf::DW_FORM_flag, 1);
+    Die->addValue(Attribute, dwarf::DW_FORM_flag, DIEIntegerOne);
 }
 
 /// addUInt - Add an unsigned integer attribute data and value.
 ///
-void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
-                          unsigned Form, uint64_t Integer) {
-  if (!Form) Form = DIEInteger::BestForm(false, Integer);
-  DIEValue *Value = Integer == 1 ?
-    DIEIntegerOne : new (DIEValueAllocator) DIEInteger(Integer);
-  Die->addValue(Attribute, Form, Value);
+void CompileUnit::addUInt(DIE *Die, dwarf::Attribute Attribute,
+                          Optional<dwarf::Form> Form, uint64_t Integer) {
+  if (!Form)
+    Form = DIEInteger::BestForm(false, Integer);
+  DIEValue *Value = Integer == 1 ? DIEIntegerOne : new (DIEValueAllocator)
+                        DIEInteger(Integer);
+  Die->addValue(Attribute, *Form, Value);
+}
+
+void CompileUnit::addUInt(DIEBlock *Block, dwarf::Form Form, uint64_t Integer) {
+  addUInt(Block, (dwarf::Attribute)0, Form, Integer);
 }
 
 /// addSInt - Add an signed integer attribute data and value.
 ///
-void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
-                          unsigned Form, int64_t Integer) {
-  if (!Form) Form = DIEInteger::BestForm(true, Integer);
+void CompileUnit::addSInt(DIE *Die, dwarf::Attribute Attribute,
+                          Optional<dwarf::Form> Form, int64_t Integer) {
+  if (!Form)
+    Form = DIEInteger::BestForm(true, Integer);
   DIEValue *Value = new (DIEValueAllocator) DIEInteger(Integer);
-  Die->addValue(Attribute, Form, Value);
+  Die->addValue(Attribute, *Form, Value);
+}
+
+void CompileUnit::addSInt(DIEBlock *Die, Optional<dwarf::Form> Form,
+                          int64_t Integer) {
+  addSInt(Die, (dwarf::Attribute)0, Form, Integer);
 }
 
 /// addString - Add a string attribute data and value. We always emit a
@@ -129,27 +170,31 @@ void CompileUnit::addSInt(DIE *Die, unsigned Attribute,
 /// more predictable sizes. In the case of split dwarf we emit an index
 /// into another table which gets us the static offset into the string
 /// table.
-void CompileUnit::addString(DIE *Die, unsigned Attribute, StringRef String) {
+void CompileUnit::addString(DIE *Die, dwarf::Attribute Attribute,
+                            StringRef String) {
+  DIEValue *Value;
+  dwarf::Form Form;
   if (!DD->useSplitDwarf()) {
     MCSymbol *Symb = DU->getStringPoolEntry(String);
-    DIEValue *Value;
     if (Asm->needsRelocationsForDwarfStringPool())
       Value = new (DIEValueAllocator) DIELabel(Symb);
     else {
       MCSymbol *StringPool = DU->getStringPoolSym();
       Value = new (DIEValueAllocator) DIEDelta(Symb, StringPool);
     }
-    Die->addValue(Attribute, dwarf::DW_FORM_strp, Value);
+    Form = dwarf::DW_FORM_strp;
   } else {
     unsigned idx = DU->getStringPoolIndex(String);
-    DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
-    Die->addValue(Attribute, dwarf::DW_FORM_GNU_str_index, Value);
+    Value = new (DIEValueAllocator) DIEInteger(idx);
+    Form = dwarf::DW_FORM_GNU_str_index;
   }
+  DIEValue *Str = new (DIEValueAllocator) DIEString(Value, String);
+  Die->addValue(Attribute, Form, Str);
 }
 
 /// addLocalString - Add a string attribute data and value. This is guaranteed
 /// to be in the local string pool instead of indirected.
-void CompileUnit::addLocalString(DIE *Die, unsigned Attribute,
+void CompileUnit::addLocalString(DIE *Die, dwarf::Attribute Attribute,
                                  StringRef String) {
   MCSymbol *Symb = DU->getStringPoolEntry(String);
   DIEValue *Value;
@@ -162,19 +207,34 @@ void CompileUnit::addLocalString(DIE *Die, unsigned Attribute,
   Die->addValue(Attribute, dwarf::DW_FORM_strp, Value);
 }
 
+/// addExpr - Add a Dwarf expression attribute data and value.
+///
+void CompileUnit::addExpr(DIEBlock *Die, dwarf::Form Form, const MCExpr *Expr) {
+  DIEValue *Value = new (DIEValueAllocator) DIEExpr(Expr);
+  Die->addValue((dwarf::Attribute)0, Form, Value);
+}
+
 /// addLabel - Add a Dwarf label attribute data and value.
 ///
-void CompileUnit::addLabel(DIE *Die, unsigned Attribute, unsigned Form,
-                           const MCSymbol *Label) {
+void CompileUnit::addLabel(DIE *Die, dwarf::Attribute Attribute,
+                           dwarf::Form Form, const MCSymbol *Label) {
   DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
   Die->addValue(Attribute, Form, Value);
 }
 
+void CompileUnit::addLabel(DIEBlock *Die, dwarf::Form Form,
+                           const MCSymbol *Label) {
+  addLabel(Die, (dwarf::Attribute)0, Form, Label);
+}
+
 /// addLabelAddress - Add a dwarf label attribute data and value using
 /// DW_FORM_addr or DW_FORM_GNU_addr_index.
 ///
-void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute,
+void CompileUnit::addLabelAddress(DIE *Die, dwarf::Attribute Attribute,
                                   MCSymbol *Label) {
+  if (Label)
+    DD->addArangeLabel(SymbolCU(this, Label));
+
   if (!DD->useSplitDwarf()) {
     if (Label != NULL) {
       DIEValue *Value = new (DIEValueAllocator) DIELabel(Label);
@@ -193,37 +253,60 @@ void CompileUnit::addLabelAddress(DIE *Die, unsigned Attribute,
 /// addOpAddress - Add a dwarf op address data and value using the
 /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
 ///
-void CompileUnit::addOpAddress(DIE *Die, MCSymbol *Sym) {
-
+void CompileUnit::addOpAddress(DIEBlock *Die, const MCSymbol *Sym) {
+  DD->addArangeLabel(SymbolCU(this, Sym));
   if (!DD->useSplitDwarf()) {
-    addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
-    addLabel(Die, 0, dwarf::DW_FORM_udata, Sym);
+    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+    addLabel(Die, dwarf::DW_FORM_udata, Sym);
   } else {
-    unsigned idx = DU->getAddrPoolIndex(Sym);
-    DIEValue *Value = new (DIEValueAllocator) DIEInteger(idx);
-    addUInt(Die, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
-    Die->addValue(0, dwarf::DW_FORM_GNU_addr_index, Value);
+    addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
+    addUInt(Die, dwarf::DW_FORM_GNU_addr_index, DU->getAddrPoolIndex(Sym));
   }
 }
 
 /// addDelta - Add a label delta attribute data and value.
 ///
-void CompileUnit::addDelta(DIE *Die, unsigned Attribute, unsigned Form,
-                           const MCSymbol *Hi, const MCSymbol *Lo) {
+void CompileUnit::addDelta(DIE *Die, dwarf::Attribute Attribute,
+                           dwarf::Form Form, const MCSymbol *Hi,
+                           const MCSymbol *Lo) {
   DIEValue *Value = new (DIEValueAllocator) DIEDelta(Hi, Lo);
   Die->addValue(Attribute, Form, Value);
 }
 
 /// addDIEEntry - Add a DIE attribute data and value.
 ///
-void CompileUnit::addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form,
+void CompileUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute,
                               DIE *Entry) {
-  Die->addValue(Attribute, Form, createDIEEntry(Entry));
+  addDIEEntry(Die, Attribute, createDIEEntry(Entry));
+}
+
+void CompileUnit::addDIEEntry(DIE *Die, dwarf::Attribute Attribute,
+                              DIEEntry *Entry) {
+  const DIE *DieCU = Die->getCompileUnitOrNull();
+  const DIE *EntryCU = Entry->getEntry()->getCompileUnitOrNull();
+  if (!DieCU)
+    // We assume that Die belongs to this CU, if it is not linked to any CU yet.
+    DieCU = getCUDie();
+  if (!EntryCU)
+    EntryCU = getCUDie();
+  Die->addValue(Attribute, EntryCU == DieCU ? dwarf::DW_FORM_ref4
+                                            : dwarf::DW_FORM_ref_addr,
+                Entry);
+}
+
+/// Create a DIE with the given Tag, add the DIE to its parent, and
+/// call insertDIE if MD is not null.
+DIE *CompileUnit::createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N) {
+  DIE *Die = new DIE(Tag);
+  Parent.addChild(Die);
+  if (N)
+    insertDIE(N, Die);
+  return Die;
 }
 
 /// addBlock - Add block data.
 ///
-void CompileUnit::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
+void CompileUnit::addBlock(DIE *Die, dwarf::Attribute Attribute,
                            DIEBlock *Block) {
   Block->ComputeSize(Asm);
   DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
@@ -234,42 +317,42 @@ void CompileUnit::addBlock(DIE *Die, unsigned Attribute, unsigned Form,
 /// entry.
 void CompileUnit::addSourceLine(DIE *Die, DIVariable V) {
   // Verify variable.
-  if (!V.Verify())
+  if (!V.isVariable())
     return;
 
   unsigned Line = V.getLineNumber();
   if (Line == 0)
     return;
-  unsigned FileID = DD->getOrCreateSourceID(V.getContext().getFilename(),
-                                            V.getContext().getDirectory(),
-                                            getUniqueID());
+  unsigned FileID =
+      DD->getOrCreateSourceID(V.getContext().getFilename(),
+                              V.getContext().getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
 void CompileUnit::addSourceLine(DIE *Die, DIGlobalVariable G) {
   // Verify global variable.
-  if (!G.Verify())
+  if (!G.isGlobalVariable())
     return;
 
   unsigned Line = G.getLineNumber();
   if (Line == 0)
     return;
-  unsigned FileID = DD->getOrCreateSourceID(G.getFilename(), G.getDirectory(),
-                                            getUniqueID());
+  unsigned FileID =
+      DD->getOrCreateSourceID(G.getFilename(), G.getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
 void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) {
   // Verify subprogram.
-  if (!SP.Verify())
+  if (!SP.isSubprogram())
     return;
 
   // If the line number is 0, don't add it.
@@ -277,35 +360,35 @@ void CompileUnit::addSourceLine(DIE *Die, DISubprogram SP) {
   if (Line == 0)
     return;
 
-  unsigned FileID = DD->getOrCreateSourceID(SP.getFilename(),
-                                            SP.getDirectory(), getUniqueID());
+  unsigned FileID = DD->getOrCreateSourceID(SP.getFilename(), SP.getDirectory(),
+                                            getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
 void CompileUnit::addSourceLine(DIE *Die, DIType Ty) {
   // Verify type.
-  if (!Ty.Verify())
+  if (!Ty.isType())
     return;
 
   unsigned Line = Ty.getLineNumber();
   if (Line == 0)
     return;
-  unsigned FileID = DD->getOrCreateSourceID(Ty.getFilename(),
-                                            Ty.getDirectory(), getUniqueID());
+  unsigned FileID = DD->getOrCreateSourceID(Ty.getFilename(), Ty.getDirectory(),
+                                            getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
 /// entry.
 void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
   // Verify type.
-  if (!Ty.Verify())
+  if (!Ty.isObjCProperty())
     return;
 
   unsigned Line = Ty.getLineNumber();
@@ -315,8 +398,8 @@ void CompileUnit::addSourceLine(DIE *Die, DIObjCProperty Ty) {
   unsigned FileID = DD->getOrCreateSourceID(File.getFilename(),
                                             File.getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addSourceLine - Add location information to specified debug information
@@ -331,68 +414,73 @@ void CompileUnit::addSourceLine(DIE *Die, DINameSpace NS) {
     return;
   StringRef FN = NS.getFilename();
 
-  unsigned FileID = DD->getOrCreateSourceID(FN, NS.getDirectory(),
-                                            getUniqueID());
+  unsigned FileID =
+      DD->getOrCreateSourceID(FN, NS.getDirectory(), getUniqueID());
   assert(FileID && "Invalid file id");
-  addUInt(Die, dwarf::DW_AT_decl_file, 0, FileID);
-  addUInt(Die, dwarf::DW_AT_decl_line, 0, Line);
+  addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+  addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
 }
 
 /// addVariableAddress - Add DW_AT_location attribute for a
 /// DbgVariable based on provided MachineLocation.
-void CompileUnit::addVariableAddress(DbgVariable *&DV, DIE *Die,
+void CompileUnit::addVariableAddress(const DbgVariable &DV, DIE *Die,
                                      MachineLocation Location) {
-  if (DV->variableHasComplexAddress())
+  if (DV.variableHasComplexAddress())
     addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
-  else if (DV->isBlockByrefVariable())
+  else if (DV.isBlockByrefVariable())
     addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
   else
-    addAddress(Die, dwarf::DW_AT_location, Location);
+    addAddress(Die, dwarf::DW_AT_location, Location,
+               DV.getVariable().isIndirect());
 }
 
 /// addRegisterOp - Add register operand.
-void CompileUnit::addRegisterOp(DIE *TheDie, unsigned Reg) {
+void CompileUnit::addRegisterOp(DIEBlock *TheDie, unsigned Reg) {
   const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
   unsigned DWReg = RI->getDwarfRegNum(Reg, false);
   if (DWReg < 32)
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
   else {
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
-    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
+    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
   }
 }
 
 /// addRegisterOffset - Add register offset.
-void CompileUnit::addRegisterOffset(DIE *TheDie, unsigned Reg,
+void CompileUnit::addRegisterOffset(DIEBlock *TheDie, unsigned Reg,
                                     int64_t Offset) {
   const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo();
   unsigned DWReg = RI->getDwarfRegNum(Reg, false);
   const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
   if (Reg == TRI->getFrameRegister(*Asm->MF))
     // If variable offset is based in frame register then use fbreg.
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
   else if (DWReg < 32)
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
   else {
-    addUInt(TheDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
-    addUInt(TheDie, 0, dwarf::DW_FORM_udata, DWReg);
+    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
+    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
   }
-  addSInt(TheDie, 0, dwarf::DW_FORM_sdata, Offset);
+  addSInt(TheDie, dwarf::DW_FORM_sdata, Offset);
 }
 
 /// addAddress - Add an address attribute to a die based on the location
 /// provided.
-void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
-                             const MachineLocation &Location) {
+void CompileUnit::addAddress(DIE *Die, dwarf::Attribute Attribute,
+                             const MachineLocation &Location, bool Indirect) {
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
 
-  if (Location.isReg())
+  if (Location.isReg() && !Indirect)
     addRegisterOp(Block, Location.getReg());
-  else
+  else {
     addRegisterOffset(Block, Location.getReg(), Location.getOffset());
+    if (Indirect && !Location.isReg()) {
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    }
+  }
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
+  addBlock(Die, Attribute, Block);
 }
 
 /// addComplexAddress - Start with the address based on the location provided,
@@ -400,37 +488,37 @@ void CompileUnit::addAddress(DIE *Die, unsigned Attribute,
 /// given the extra address information encoded in the DIVariable, starting from
 /// the starting location.  Add the DWARF information to the die.
 ///
-void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die,
-                                    unsigned Attribute,
+void CompileUnit::addComplexAddress(const DbgVariable &DV, DIE *Die,
+                                    dwarf::Attribute Attribute,
                                     const MachineLocation &Location) {
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-  unsigned N = DV->getNumAddrElements();
+  unsigned N = DV.getNumAddrElements();
   unsigned i = 0;
   if (Location.isReg()) {
-    if (N >= 2 && DV->getAddrElement(0) == DIBuilder::OpPlus) {
+    if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
       // If first address element is OpPlus then emit
       // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-      addRegisterOffset(Block, Location.getReg(), DV->getAddrElement(1));
+      addRegisterOffset(Block, Location.getReg(), DV.getAddrElement(1));
       i = 2;
     } else
       addRegisterOp(Block, Location.getReg());
-  }
-  else
+  } else
     addRegisterOffset(Block, Location.getReg(), Location.getOffset());
 
-  for (;i < N; ++i) {
-    uint64_t Element = DV->getAddrElement(i);
+  for (; i < N; ++i) {
+    uint64_t Element = DV.getAddrElement(i);
     if (Element == DIBuilder::OpPlus) {
-      addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-      addUInt(Block, 0, dwarf::DW_FORM_udata, DV->getAddrElement(++i));
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+      addUInt(Block, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
     } else if (Element == DIBuilder::OpDeref) {
       if (!Location.isReg())
-        addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    } else llvm_unreachable("unknown DIBuilder Opcode");
+        addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    } else
+      llvm_unreachable("unknown DIBuilder Opcode");
   }
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
+  addBlock(Die, Attribute, Block);
 }
 
 /* Byref variables, in Blocks, are declared by the programmer as "SomeType
@@ -493,45 +581,42 @@ void CompileUnit::addComplexAddress(DbgVariable *&DV, DIE *Die,
 /// starting location.  Add the DWARF information to the die.  For
 /// more information, read large comment just above here.
 ///
-void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
-                                       unsigned Attribute,
+void CompileUnit::addBlockByrefAddress(const DbgVariable &DV, DIE *Die,
+                                       dwarf::Attribute Attribute,
                                        const MachineLocation &Location) {
-  DIType Ty = DV->getType();
+  DIType Ty = DV.getType();
   DIType TmpTy = Ty;
-  unsigned Tag = Ty.getTag();
+  uint16_t Tag = Ty.getTag();
   bool isPointer = false;
 
-  StringRef varName = DV->getName();
+  StringRef varName = DV.getName();
 
   if (Tag == dwarf::DW_TAG_pointer_type) {
-    DIDerivedType DTy = DIDerivedType(Ty);
-    TmpTy = DTy.getTypeDerivedFrom();
+    DIDerivedType DTy(Ty);
+    TmpTy = resolve(DTy.getTypeDerivedFrom());
     isPointer = true;
   }
 
-  DICompositeType blockStruct = DICompositeType(TmpTy);
+  DICompositeType blockStruct(TmpTy);
 
   // Find the __forwarding field and the variable field in the __Block_byref
   // struct.
   DIArray Fields = blockStruct.getTypeArray();
-  DIDescriptor varField = DIDescriptor();
-  DIDescriptor forwardingField = DIDescriptor();
+  DIDerivedType varField;
+  DIDerivedType forwardingField;
 
   for (unsigned i = 0, N = Fields.getNumElements(); i < N; ++i) {
-    DIDescriptor Element = Fields.getElement(i);
-    DIDerivedType DT = DIDerivedType(Element);
+    DIDerivedType DT(Fields.getElement(i));
     StringRef fieldName = DT.getName();
     if (fieldName == "__forwarding")
-      forwardingField = Element;
+      forwardingField = DT;
     else if (fieldName == varName)
-      varField = Element;
+      varField = DT;
   }
 
   // Get the offsets for the forwarding field and the variable field.
-  unsigned forwardingFieldOffset =
-    DIDerivedType(forwardingField).getOffsetInBits() >> 3;
-  unsigned varFieldOffset =
-    DIDerivedType(varField).getOffsetInBits() >> 3;
+  unsigned forwardingFieldOffset = forwardingField.getOffsetInBits() >> 3;
+  unsigned varFieldOffset = varField.getOffsetInBits() >> 2;
 
   // Decode the original location, and use that as the start of the byref
   // variable's location.
@@ -545,76 +630,139 @@ void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die,
   // If we started with a pointer to the __Block_byref... struct, then
   // the first thing we need to do is dereference the pointer (DW_OP_deref).
   if (isPointer)
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
 
   // Next add the offset for the '__forwarding' field:
   // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in
   // adding the offset if it's 0.
   if (forwardingFieldOffset > 0) {
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, forwardingFieldOffset);
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, dwarf::DW_FORM_udata, forwardingFieldOffset);
   }
 
   // Now dereference the __forwarding field to get to the real __Block_byref
   // struct:  DW_OP_deref.
-  addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+  addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
 
   // Now that we've got the real __Block_byref... struct, add the offset
   // for the variable's field to get to the location of the actual variable:
   // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.
   if (varFieldOffset > 0) {
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, varFieldOffset);
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+    addUInt(Block, dwarf::DW_FORM_udata, varFieldOffset);
   }
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, 0, Block);
+  addBlock(Die, Attribute, Block);
 }
 
 /// isTypeSigned - Return true if the type is signed.
-static bool isTypeSigned(DIType Ty, int *SizeInBits) {
+static bool isTypeSigned(DwarfDebug *DD, DIType Ty, int *SizeInBits) {
   if (Ty.isDerivedType())
-    return isTypeSigned(DIDerivedType(Ty).getTypeDerivedFrom(), SizeInBits);
+    return isTypeSigned(DD, DD->resolve(DIDerivedType(Ty).getTypeDerivedFrom()),
+                        SizeInBits);
   if (Ty.isBasicType())
-    if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed
-        || DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
+    if (DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed ||
+        DIBasicType(Ty).getEncoding() == dwarf::DW_ATE_signed_char) {
       *SizeInBits = Ty.getSizeInBits();
       return true;
     }
   return false;
 }
 
+/// Return true if type encoding is unsigned.
+static bool isUnsignedDIType(DwarfDebug *DD, DIType Ty) {
+  DIDerivedType DTy(Ty);
+  if (DTy.isDerivedType())
+    return isUnsignedDIType(DD, DD->resolve(DTy.getTypeDerivedFrom()));
+
+  DIBasicType BTy(Ty);
+  if (BTy.isBasicType()) {
+    unsigned Encoding = BTy.getEncoding();
+    if (Encoding == dwarf::DW_ATE_unsigned ||
+        Encoding == dwarf::DW_ATE_unsigned_char ||
+        Encoding == dwarf::DW_ATE_boolean)
+      return true;
+  }
+  return false;
+}
+
+/// If this type is derived from a base type then return base type size.
+static uint64_t getBaseTypeSize(DwarfDebug *DD, DIDerivedType Ty) {
+  unsigned Tag = Ty.getTag();
+
+  if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+      Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+      Tag != dwarf::DW_TAG_restrict_type)
+    return Ty.getSizeInBits();
+
+  DIType BaseType = DD->resolve(Ty.getTypeDerivedFrom());
+
+  // If this type is not derived from any type then take conservative approach.
+  if (!BaseType.isValid())
+    return Ty.getSizeInBits();
+
+  // If this is a derived type, go ahead and get the base type, unless it's a
+  // reference then it's just the size of the field. Pointer types have no need
+  // of this since they're a different type of qualification on the type.
+  if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
+      BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
+    return Ty.getSizeInBits();
+
+  if (BaseType.isDerivedType())
+    return getBaseTypeSize(DD, DIDerivedType(BaseType));
+
+  return BaseType.getSizeInBits();
+}
+
 /// addConstantValue - Add constant value entry in variable DIE.
-bool CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
+void CompileUnit::addConstantValue(DIE *Die, const MachineOperand &MO,
                                    DIType Ty) {
+  // FIXME: This is a bit conservative/simple - it emits negative values at
+  // their maximum bit width which is a bit unfortunate (& doesn't prefer
+  // udata/sdata over dataN as suggested by the DWARF spec)
   assert(MO.isImm() && "Invalid machine operand!");
-  DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
   int SizeInBits = -1;
-  bool SignedConstant = isTypeSigned(Ty, &SizeInBits);
-  unsigned Form = SignedConstant ? dwarf::DW_FORM_sdata : dwarf::DW_FORM_udata;
-  switch (SizeInBits) {
-    case 8:  Form = dwarf::DW_FORM_data1; break;
-    case 16: Form = dwarf::DW_FORM_data2; break;
-    case 32: Form = dwarf::DW_FORM_data4; break;
-    case 64: Form = dwarf::DW_FORM_data8; break;
-    default: break;
+  bool SignedConstant = isTypeSigned(DD, Ty, &SizeInBits);
+  dwarf::Form Form;
+
+  // If we're a signed constant definitely use sdata.
+  if (SignedConstant) {
+    addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, MO.getImm());
+    return;
   }
-  SignedConstant ? addSInt(Block, 0, Form, MO.getImm())
-    : addUInt(Block, 0, Form, MO.getImm());
 
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
-  return true;
+  // Else use data for now unless it's larger than we can deal with.
+  switch (SizeInBits) {
+  case 8:
+    Form = dwarf::DW_FORM_data1;
+    break;
+  case 16:
+    Form = dwarf::DW_FORM_data2;
+    break;
+  case 32:
+    Form = dwarf::DW_FORM_data4;
+    break;
+  case 64:
+    Form = dwarf::DW_FORM_data8;
+    break;
+  default:
+    Form = dwarf::DW_FORM_udata;
+    addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
+    return;
+  }
+  addUInt(Die, dwarf::DW_AT_const_value, Form, MO.getImm());
 }
 
 /// addConstantFPValue - Add constant value entry in variable DIE.
-bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
-  assert (MO.isFPImm() && "Invalid machine operand!");
+void CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
+  assert(MO.isFPImm() && "Invalid machine operand!");
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
   APFloat FPImm = MO.getFPImm()->getValueAPF();
 
   // Get the raw data form of the floating point.
   const APInt FltVal = FPImm.bitcastToAPInt();
-  const char *FltPtr = (const char*)FltVal.getRawData();
+  const char *FltPtr = (const char *)FltVal.getRawData();
 
   int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
   bool LittleEndian = Asm->getDataLayout().isLittleEndian();
@@ -624,43 +772,56 @@ bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
 
   // Output the constant to DWARF one byte at a time.
   for (; Start != Stop; Start += Incr)
-    addUInt(Block, 0, dwarf::DW_FORM_data1,
-            (unsigned char)0xFF & FltPtr[Start]);
+    addUInt(Block, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]);
 
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
-  return true;
+  addBlock(Die, dwarf::DW_AT_const_value, Block);
 }
 
 /// addConstantFPValue - Add constant value entry in variable DIE.
-bool CompileUnit::addConstantFPValue(DIE *Die, const ConstantFP *CFP) {
-  return addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), false);
+void CompileUnit::addConstantFPValue(DIE *Die, const ConstantFP *CFP) {
+  // Pass this down to addConstantValue as an unsigned bag of bits.
+  addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), true);
 }
 
 /// addConstantValue - Add constant value entry in variable DIE.
-bool CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI,
+void CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI,
                                    bool Unsigned) {
-  return addConstantValue(Die, CI->getValue(), Unsigned);
+  addConstantValue(Die, CI->getValue(), Unsigned);
 }
 
 // addConstantValue - Add constant value entry in variable DIE.
-bool CompileUnit::addConstantValue(DIE *Die, const APInt &Val,
-                                   bool Unsigned) {
+void CompileUnit::addConstantValue(DIE *Die, const APInt &Val, bool Unsigned) {
   unsigned CIBitWidth = Val.getBitWidth();
   if (CIBitWidth <= 64) {
-    unsigned form = 0;
+    // If we're a signed constant definitely use sdata.
+    if (!Unsigned) {
+      addSInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
+              Val.getSExtValue());
+      return;
+    }
+
+    // Else use data for now unless it's larger than we can deal with.
+    dwarf::Form Form;
     switch (CIBitWidth) {
-    case 8: form = dwarf::DW_FORM_data1; break;
-    case 16: form = dwarf::DW_FORM_data2; break;
-    case 32: form = dwarf::DW_FORM_data4; break;
-    case 64: form = dwarf::DW_FORM_data8; break;
+    case 8:
+      Form = dwarf::DW_FORM_data1;
+      break;
+    case 16:
+      Form = dwarf::DW_FORM_data2;
+      break;
+    case 32:
+      Form = dwarf::DW_FORM_data4;
+      break;
+    case 64:
+      Form = dwarf::DW_FORM_data8;
+      break;
     default:
-      form = Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata;
+      addUInt(Die, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
+              Val.getZExtValue());
+      return;
     }
-    if (Unsigned)
-      addUInt(Die, dwarf::DW_AT_const_value, form, Val.getZExtValue());
-    else
-      addSInt(Die, dwarf::DW_AT_const_value, form, Val.getSExtValue());
-    return true;
+    addUInt(Die, dwarf::DW_AT_const_value, Form, Val.getZExtValue());
+    return;
   }
 
   DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
@@ -678,11 +839,10 @@ bool CompileUnit::addConstantValue(DIE *Die, const APInt &Val,
       c = Ptr64[i / 8] >> (8 * (i & 7));
     else
       c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, c);
+    addUInt(Block, dwarf::DW_FORM_data1, c);
   }
 
-  addBlock(Die, dwarf::DW_AT_const_value, 0, Block);
-  return true;
+  addBlock(Die, dwarf::DW_AT_const_value, Block);
 }
 
 /// addTemplateParams - Add template parameters into buffer.
@@ -691,47 +851,48 @@ void CompileUnit::addTemplateParams(DIE &Buffer, DIArray TParams) {
   for (unsigned i = 0, e = TParams.getNumElements(); i != e; ++i) {
     DIDescriptor Element = TParams.getElement(i);
     if (Element.isTemplateTypeParameter())
-      Buffer.addChild(getOrCreateTemplateTypeParameterDIE(
-                        DITemplateTypeParameter(Element)));
+      constructTemplateTypeParameterDIE(Buffer,
+                                        DITemplateTypeParameter(Element));
     else if (Element.isTemplateValueParameter())
-      Buffer.addChild(getOrCreateTemplateValueParameterDIE(
-                        DITemplateValueParameter(Element)));
+      constructTemplateValueParameterDIE(Buffer,
+                                         DITemplateValueParameter(Element));
   }
 }
 
 /// getOrCreateContextDIE - Get context owner's DIE.
-DIE *CompileUnit::getOrCreateContextDIE(DIDescriptor Context) {
+DIE *CompileUnit::getOrCreateContextDIE(DIScope Context) {
+  if (!Context || Context.isFile())
+    return getCUDie();
   if (Context.isType())
     return getOrCreateTypeDIE(DIType(Context));
-  else if (Context.isNameSpace())
+  if (Context.isNameSpace())
     return getOrCreateNameSpace(DINameSpace(Context));
-  else if (Context.isSubprogram())
+  if (Context.isSubprogram())
     return getOrCreateSubprogramDIE(DISubprogram(Context));
-  else
-    return getDIE(Context);
-}
-
-/// addToContextOwner - Add Die into the list of its context owner's children.
-void CompileUnit::addToContextOwner(DIE *Die, DIDescriptor Context) {
-  if (DIE *ContextDIE = getOrCreateContextDIE(Context))
-    ContextDIE->addChild(Die);
-  else
-    addDie(Die);
+  return getDIE(Context);
 }
 
 /// getOrCreateTypeDIE - Find existing DIE or create new DIE for the
 /// given DIType.
 DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
-  DIType Ty(TyNode);
-  if (!Ty.Verify())
+  if (!TyNode)
     return NULL;
+
+  DIType Ty(TyNode);
+  assert(Ty.isType());
+
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE.
+  DIE *ContextDIE = getOrCreateContextDIE(resolve(Ty.getContext()));
+  assert(ContextDIE);
+
   DIE *TyDIE = getDIE(Ty);
   if (TyDIE)
     return TyDIE;
 
   // Create new type.
-  TyDIE = new DIE(dwarf::DW_TAG_base_type);
-  insertDIE(Ty, TyDIE);
+  TyDIE = createAndAddDIE(Ty.getTag(), *ContextDIE, Ty);
+
   if (Ty.isBasicType())
     constructTypeDIE(*TyDIE, DIBasicType(Ty));
   else if (Ty.isCompositeType())
@@ -748,28 +909,24 @@ DIE *CompileUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
       DICompositeType CT(Ty);
       // A runtime language of 0 actually means C/C++ and that any
       // non-negative value is some version of Objective-C/C++.
-      IsImplementation = (CT.getRunTimeLang() == 0) ||
-        CT.isObjcClassComplete();
+      IsImplementation = (CT.getRunTimeLang() == 0) || CT.isObjcClassComplete();
     }
-    unsigned Flags = IsImplementation ?
-                     DwarfAccelTable::eTypeFlagClassIsImplementation : 0;
+    unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
     addAccelType(Ty.getName(), std::make_pair(TyDIE, Flags));
   }
 
-  addToContextOwner(TyDIE, Ty.getContext());
   return TyDIE;
 }
 
 /// addType - Add a new type attribute to the specified entity.
-void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
-  if (!Ty.Verify())
-    return;
+void CompileUnit::addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute) {
+  assert(Ty && "Trying to add a type that doesn't exist?");
 
   // Check for pre-existence.
   DIEEntry *Entry = getDIEEntry(Ty);
   // If it exists then use the existing value.
   if (Entry) {
-    Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+    addDIEEntry(Entity, Attribute, Entry);
     return;
   }
 
@@ -779,35 +936,112 @@ void CompileUnit::addType(DIE *Entity, DIType Ty, unsigned Attribute) {
   // Set up proxy.
   Entry = createDIEEntry(Buffer);
   insertDIEEntry(Ty, Entry);
-  Entity->addValue(Attribute, dwarf::DW_FORM_ref4, Entry);
+  addDIEEntry(Entity, Attribute, Entry);
 
   // If this is a complete composite type then include it in the
   // list of global types.
   addGlobalType(Ty);
 }
 
+// Accelerator table mutators - add each name along with its companion
+// DIE to the proper table while ensuring that the name that we're going
+// to reference is in the string table. We do this since the names we
+// add may not only be identical to the names in the DIE.
+void CompileUnit::addAccelName(StringRef Name, DIE *Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<DIE *> &DIEs = AccelNames[Name];
+  DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelObjC(StringRef Name, DIE *Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<DIE *> &DIEs = AccelObjC[Name];
+  DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelNamespace(StringRef Name, DIE *Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<DIE *> &DIEs = AccelNamespace[Name];
+  DIEs.push_back(Die);
+}
+
+void CompileUnit::addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die) {
+  DU->getStringPoolEntry(Name);
+  std::vector<std::pair<DIE *, unsigned> > &DIEs = AccelTypes[Name];
+  DIEs.push_back(Die);
+}
+
+/// addGlobalName - Add a new global name to the compile unit.
+void CompileUnit::addGlobalName(StringRef Name, DIE *Die, DIScope Context) {
+  std::string FullName = getParentContextString(Context) + Name.str();
+  GlobalNames[FullName] = Die;
+}
+
 /// addGlobalType - Add a new global type to the compile unit.
 ///
 void CompileUnit::addGlobalType(DIType Ty) {
-  DIDescriptor Context = Ty.getContext();
-  if (Ty.isCompositeType() && !Ty.getName().empty() && !Ty.isForwardDecl()
-      && (!Context || Context.isCompileUnit() || Context.isFile()
-          || Context.isNameSpace()))
-    if (DIEEntry *Entry = getDIEEntry(Ty))
-      GlobalTypes[Ty.getName()] = Entry->getEntry();
+  DIScope Context = resolve(Ty.getContext());
+  if (!Ty.getName().empty() && !Ty.isForwardDecl() &&
+      (!Context || Context.isCompileUnit() || Context.isFile() ||
+       Context.isNameSpace()))
+    if (DIEEntry *Entry = getDIEEntry(Ty)) {
+      std::string FullName =
+          getParentContextString(Context) + Ty.getName().str();
+      GlobalTypes[FullName] = Entry->getEntry();
+    }
 }
 
-/// addPubTypes - Add type for pubtypes section.
+/// getParentContextString - Walks the metadata parent chain in a language
+/// specific manner (using the compile unit language) and returns
+/// it as a string. This is done at the metadata level because DIEs may
+/// not currently have been added to the parent context and walking the
+/// DIEs looking for names is more expensive than walking the metadata.
+std::string CompileUnit::getParentContextString(DIScope Context) const {
+  if (!Context)
+    return "";
+
+  // FIXME: Decide whether to implement this for non-C++ languages.
+  if (getLanguage() != dwarf::DW_LANG_C_plus_plus)
+    return "";
+
+  std::string CS;
+  SmallVector<DIScope, 1> Parents;
+  while (!Context.isCompileUnit()) {
+    Parents.push_back(Context);
+    if (Context.getContext())
+      Context = resolve(Context.getContext());
+    else
+      // Structure, etc types will have a NULL context if they're at the top
+      // level.
+      break;
+  }
+
+  // Reverse iterate over our list to go from the outermost construct to the
+  // innermost.
+  for (SmallVectorImpl<DIScope>::reverse_iterator I = Parents.rbegin(),
+                                                  E = Parents.rend();
+       I != E; ++I) {
+    DIScope Ctx = *I;
+    StringRef Name = Ctx.getName();
+    if (!Name.empty()) {
+      CS += Name;
+      CS += "::";
+    }
+  }
+  return CS;
+}
+
+/// addPubTypes - Add subprogram argument types for pubtypes section.
 void CompileUnit::addPubTypes(DISubprogram SP) {
   DICompositeType SPTy = SP.getType();
-  unsigned SPTag = SPTy.getTag();
+  uint16_t SPTag = SPTy.getTag();
   if (SPTag != dwarf::DW_TAG_subroutine_type)
     return;
 
   DIArray Args = SPTy.getTypeArray();
   for (unsigned i = 0, e = Args.getNumElements(); i != e; ++i) {
     DIType ATy(Args.getElement(i));
-    if (!ATy.Verify())
+    if (!ATy.isType())
       continue;
     addGlobalType(ATy);
   }
@@ -821,18 +1055,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIBasicType BTy) {
   if (!Name.empty())
     addString(&Buffer, dwarf::DW_AT_name, Name);
 
-  if (BTy.getTag() == dwarf::DW_TAG_unspecified_type) {
-    Buffer.setTag(dwarf::DW_TAG_unspecified_type);
-    // Unspecified types has only name, nothing else.
+  // An unspecified type only has a name attribute.
+  if (BTy.getTag() == dwarf::DW_TAG_unspecified_type)
     return;
-  }
 
-  Buffer.setTag(dwarf::DW_TAG_base_type);
   addUInt(&Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
           BTy.getEncoding());
 
   uint64_t Size = BTy.getSizeInBits() >> 3;
-  addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+  addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
 }
 
 /// constructTypeDIE - Construct derived type die from DIDerivedType.
@@ -840,16 +1071,12 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
   // Get core information.
   StringRef Name = DTy.getName();
   uint64_t Size = DTy.getSizeInBits() >> 3;
-  unsigned Tag = DTy.getTag();
-
-  // FIXME - Workaround for templates.
-  if (Tag == dwarf::DW_TAG_inheritance) Tag = dwarf::DW_TAG_reference_type;
-
-  Buffer.setTag(Tag);
+  uint16_t Tag = Buffer.getTag();
 
   // Map to main type, void will not have a type.
-  DIType FromTy = DTy.getTypeDerivedFrom();
-  addType(&Buffer, FromTy);
+  DIType FromTy = resolve(DTy.getTypeDerivedFrom());
+  if (FromTy)
+    addType(&Buffer, FromTy);
 
   // Add name if not anonymous or intermediate type.
   if (!Name.empty())
@@ -857,97 +1084,102 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DIDerivedType DTy) {
 
   // Add size if non-zero (derived types might be zero-sized.)
   if (Size && Tag != dwarf::DW_TAG_pointer_type)
-    addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+    addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
 
   if (Tag == dwarf::DW_TAG_ptr_to_member_type)
-      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
-                  getOrCreateTypeDIE(DTy.getClassType()));
+    addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
+                getOrCreateTypeDIE(resolve(DTy.getClassType())));
   // Add source line info if available and TyDesc is not a forward declaration.
   if (!DTy.isForwardDecl())
     addSourceLine(&Buffer, DTy);
 }
 
+/// Return true if the type is appropriately scoped to be contained inside
+/// its own type unit.
+static bool isTypeUnitScoped(DIType Ty, const DwarfDebug *DD) {
+  DIScope Parent = DD->resolve(Ty.getContext());
+  while (Parent) {
+    // Don't generate a hash for anything scoped inside a function.
+    if (Parent.isSubprogram())
+      return false;
+    Parent = DD->resolve(Parent.getContext());
+  }
+  return true;
+}
+
+/// Return true if the type should be split out into a type unit.
+static bool shouldCreateTypeUnit(DICompositeType CTy, const DwarfDebug *DD) {
+  uint16_t Tag = CTy.getTag();
+
+  switch (Tag) {
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_union_type:
+  case dwarf::DW_TAG_enumeration_type:
+  case dwarf::DW_TAG_class_type:
+    // If this is a class, structure, union, or enumeration type
+    // that is a definition (not a declaration), and not scoped
+    // inside a function then separate this out as a type unit.
+    return !CTy.isForwardDecl() && isTypeUnitScoped(CTy, DD);
+  default:
+    return false;
+  }
+}
+
 /// constructTypeDIE - Construct type DIE from DICompositeType.
 void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
   // Get core information.
   StringRef Name = CTy.getName();
 
   uint64_t Size = CTy.getSizeInBits() >> 3;
-  unsigned Tag = CTy.getTag();
-  Buffer.setTag(Tag);
+  uint16_t Tag = Buffer.getTag();
 
   switch (Tag) {
   case dwarf::DW_TAG_array_type:
-    constructArrayTypeDIE(Buffer, &CTy);
+    constructArrayTypeDIE(Buffer, CTy);
     break;
-  case dwarf::DW_TAG_enumeration_type: {
-    DIArray Elements = CTy.getTypeArray();
-
-    // Add enumerators to enumeration type.
-    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
-      DIE *ElemDie = NULL;
-      DIDescriptor Enum(Elements.getElement(i));
-      if (Enum.isEnumerator()) {
-        ElemDie = constructEnumTypeDIE(DIEnumerator(Enum));
-        Buffer.addChild(ElemDie);
-      }
-    }
-    DIType DTy = CTy.getTypeDerivedFrom();
-    if (DTy.Verify()) {
-      addType(&Buffer, DTy);
-      addUInt(&Buffer, dwarf::DW_AT_enum_class, dwarf::DW_FORM_flag, 1);
-    }
-  }
+  case dwarf::DW_TAG_enumeration_type:
+    constructEnumTypeDIE(Buffer, CTy);
     break;
   case dwarf::DW_TAG_subroutine_type: {
-    // Add return type.
+    // Add return type. A void return won't have a type.
     DIArray Elements = CTy.getTypeArray();
-    DIDescriptor RTy = Elements.getElement(0);
-    addType(&Buffer, DIType(RTy));
+    DIType RTy(Elements.getElement(0));
+    if (RTy)
+      addType(&Buffer, RTy);
 
     bool isPrototyped = true;
     // Add arguments.
     for (unsigned i = 1, N = Elements.getNumElements(); i < N; ++i) {
       DIDescriptor Ty = Elements.getElement(i);
       if (Ty.isUnspecifiedParameter()) {
-        DIE *Arg = new DIE(dwarf::DW_TAG_unspecified_parameters);
-        Buffer.addChild(Arg);
+        createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
         isPrototyped = false;
       } else {
-        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+        DIE *Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
         addType(Arg, DIType(Ty));
         if (DIType(Ty).isArtificial())
           addFlag(Arg, dwarf::DW_AT_artificial);
-        Buffer.addChild(Arg);
       }
     }
     // Add prototype flag if we're dealing with a C language and the
     // function has been prototyped.
+    uint16_t Language = getLanguage();
     if (isPrototyped &&
-        (Language == dwarf::DW_LANG_C89 ||
-         Language == dwarf::DW_LANG_C99 ||
+        (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
          Language == dwarf::DW_LANG_ObjC))
       addFlag(&Buffer, dwarf::DW_AT_prototyped);
-  }
-    break;
+  } break;
   case dwarf::DW_TAG_structure_type:
   case dwarf::DW_TAG_union_type:
   case dwarf::DW_TAG_class_type: {
     // Add elements to structure type.
     DIArray Elements = CTy.getTypeArray();
-
-    // A forward struct declared type may not have elements available.
-    unsigned N = Elements.getNumElements();
-    if (N == 0)
-      break;
-
-    // Add elements to structure type.
-    for (unsigned i = 0; i < N; ++i) {
+    for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
       DIDescriptor Element = Elements.getElement(i);
       DIE *ElemDie = NULL;
       if (Element.isSubprogram()) {
         DISubprogram SP(Element);
-        ElemDie = getOrCreateSubprogramDIE(DISubprogram(Element));
+        ElemDie = getOrCreateSubprogramDIE(SP);
         if (SP.isProtected())
           addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
                   dwarf::DW_ACCESS_protected);
@@ -956,21 +1188,23 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
                   dwarf::DW_ACCESS_private);
         else
           addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
-            dwarf::DW_ACCESS_public);
+                  dwarf::DW_ACCESS_public);
         if (SP.isExplicit())
           addFlag(ElemDie, dwarf::DW_AT_explicit);
       } else if (Element.isDerivedType()) {
         DIDerivedType DDTy(Element);
         if (DDTy.getTag() == dwarf::DW_TAG_friend) {
-          ElemDie = new DIE(dwarf::DW_TAG_friend);
-          addType(ElemDie, DDTy.getTypeDerivedFrom(), dwarf::DW_AT_friend);
-        } else if (DDTy.isStaticMember())
-          ElemDie = createStaticMemberDIE(DDTy);
-        else
-          ElemDie = createMemberDIE(DDTy);
+          ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
+          addType(ElemDie, resolve(DDTy.getTypeDerivedFrom()),
+                  dwarf::DW_AT_friend);
+        } else if (DDTy.isStaticMember()) {
+          getOrCreateStaticMemberDIE(DDTy);
+        } else {
+          constructMemberDIE(Buffer, DDTy);
+        }
       } else if (Element.isObjCProperty()) {
         DIObjCProperty Property(Element);
-        ElemDie = new DIE(Property.getTag());
+        ElemDie = createAndAddDIE(Property.getTag(), Buffer);
         StringRef PropertyName = Property.getObjCPropertyName();
         addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
         addType(ElemDie, Property.getType());
@@ -995,8 +1229,8 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
         if (Property.isNonAtomicObjCProperty())
           PropertyAttributes |= dwarf::DW_APPLE_PROPERTY_nonatomic;
         if (PropertyAttributes)
-          addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, 0,
-                 PropertyAttributes);
+          addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, None,
+                  PropertyAttributes);
 
         DIEEntry *Entry = getDIEEntry(Element);
         if (!Entry) {
@@ -1005,20 +1239,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
         }
       } else
         continue;
-      Buffer.addChild(ElemDie);
     }
 
     if (CTy.isAppleBlockExtension())
       addFlag(&Buffer, dwarf::DW_AT_APPLE_block);
 
-    DICompositeType ContainingType = CTy.getContainingType();
-    if (DIDescriptor(ContainingType).isCompositeType())
-      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4,
-                  getOrCreateTypeDIE(DIType(ContainingType)));
-    else {
-      DIDescriptor Context = CTy.getContext();
-      addToContextOwner(&Buffer, Context);
-    }
+    DICompositeType ContainingType(resolve(CTy.getContainingType()));
+    if (ContainingType)
+      addDIEEntry(&Buffer, dwarf::DW_AT_containing_type,
+                  getOrCreateTypeDIE(ContainingType));
 
     if (CTy.isObjcClassComplete())
       addFlag(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
@@ -1026,8 +1255,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     // Add template parameters to a class, structure or union types.
     // FIXME: The support isn't in the metadata for this yet.
     if (Tag == dwarf::DW_TAG_class_type ||
-        Tag == dwarf::DW_TAG_structure_type ||
-        Tag == dwarf::DW_TAG_union_type)
+        Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
       addTemplateParams(Buffer, CTy.getTemplateParams());
 
     break;
@@ -1041,16 +1269,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     addString(&Buffer, dwarf::DW_AT_name, Name);
 
   if (Tag == dwarf::DW_TAG_enumeration_type ||
-      Tag == dwarf::DW_TAG_class_type ||
-      Tag == dwarf::DW_TAG_structure_type ||
+      Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
       Tag == dwarf::DW_TAG_union_type) {
     // Add size if non-zero (derived types might be zero-sized.)
     // TODO: Do we care about size for enum forward declarations?
     if (Size)
-      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, Size);
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, None, Size);
     else if (!CTy.isForwardDecl())
       // Add zero size if it is not a forward declaration.
-      addUInt(&Buffer, dwarf::DW_AT_byte_size, 0, 0);
+      addUInt(&Buffer, dwarf::DW_AT_byte_size, None, 0);
 
     // If we're a forward decl, say so.
     if (CTy.isForwardDecl())
@@ -1063,117 +1290,128 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
     // No harm in adding the runtime language to the declaration.
     unsigned RLang = CTy.getRunTimeLang();
     if (RLang)
-      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class,
-              dwarf::DW_FORM_data1, RLang);
+      addUInt(&Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
+              RLang);
   }
+  // If this is a type applicable to a type unit it then add it to the
+  // list of types we'll compute a hash for later.
+  if (shouldCreateTypeUnit(CTy, DD))
+    DD->addTypeUnitType(&Buffer);
 }
 
-/// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
-/// for the given DITemplateTypeParameter.
-DIE *
-CompileUnit::getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP) {
-  DIE *ParamDIE = getDIE(TP);
-  if (ParamDIE)
-    return ParamDIE;
-
-  ParamDIE = new DIE(dwarf::DW_TAG_template_type_parameter);
-  addType(ParamDIE, TP.getType());
-  addString(ParamDIE, dwarf::DW_AT_name, TP.getName());
-  return ParamDIE;
+/// constructTemplateTypeParameterDIE - Construct new DIE for the given
+/// DITemplateTypeParameter.
+void
+CompileUnit::constructTemplateTypeParameterDIE(DIE &Buffer,
+                                               DITemplateTypeParameter TP) {
+  DIE *ParamDIE =
+      createAndAddDIE(dwarf::DW_TAG_template_type_parameter, Buffer);
+  // Add the type if it exists, it could be void and therefore no type.
+  if (TP.getType())
+    addType(ParamDIE, resolve(TP.getType()));
+  if (!TP.getName().empty())
+    addString(ParamDIE, dwarf::DW_AT_name, TP.getName());
 }
 
-/// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
-/// for the given DITemplateValueParameter.
-DIE *
-CompileUnit::getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TPV){
-  DIE *ParamDIE = getDIE(TPV);
-  if (ParamDIE)
-    return ParamDIE;
-
-  ParamDIE = new DIE(dwarf::DW_TAG_template_value_parameter);
-  addType(ParamDIE, TPV.getType());
-  if (!TPV.getName().empty())
-    addString(ParamDIE, dwarf::DW_AT_name, TPV.getName());
-  addUInt(ParamDIE, dwarf::DW_AT_const_value, dwarf::DW_FORM_udata,
-          TPV.getValue());
-  return ParamDIE;
+/// constructTemplateValueParameterDIE - Construct new DIE for the given
+/// DITemplateValueParameter.
+void
+CompileUnit::constructTemplateValueParameterDIE(DIE &Buffer,
+                                                DITemplateValueParameter VP) {
+  DIE *ParamDIE = createAndAddDIE(VP.getTag(), Buffer);
+
+  // Add the type if there is one, template template and template parameter
+  // packs will not have a type.
+  if (VP.getTag() == dwarf::DW_TAG_template_value_parameter)
+    addType(ParamDIE, resolve(VP.getType()));
+  if (!VP.getName().empty())
+    addString(ParamDIE, dwarf::DW_AT_name, VP.getName());
+  if (Value *Val = VP.getValue()) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Val))
+      addConstantValue(ParamDIE, CI,
+                       isUnsignedDIType(DD, resolve(VP.getType())));
+    else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val)) {
+      // For declaration non-type template parameters (such as global values and
+      // functions)
+      DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
+      addOpAddress(Block, Asm->getSymbol(GV));
+      // Emit DW_OP_stack_value to use the address as the immediate value of the
+      // parameter, rather than a pointer to it.
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+      addBlock(ParamDIE, dwarf::DW_AT_location, Block);
+    } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_template_param) {
+      assert(isa<MDString>(Val));
+      addString(ParamDIE, dwarf::DW_AT_GNU_template_name,
+                cast<MDString>(Val)->getString());
+    } else if (VP.getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
+      assert(isa<MDNode>(Val));
+      DIArray A(cast<MDNode>(Val));
+      addTemplateParams(*ParamDIE, A);
+    }
+  }
 }
 
 /// getOrCreateNameSpace - Create a DIE for DINameSpace.
 DIE *CompileUnit::getOrCreateNameSpace(DINameSpace NS) {
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE.
+  DIE *ContextDIE = getOrCreateContextDIE(NS.getContext());
+
   DIE *NDie = getDIE(NS);
   if (NDie)
     return NDie;
-  NDie = new DIE(dwarf::DW_TAG_namespace);
-  insertDIE(NS, NDie);
+  NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
+
   if (!NS.getName().empty()) {
     addString(NDie, dwarf::DW_AT_name, NS.getName());
     addAccelNamespace(NS.getName(), NDie);
+    addGlobalName(NS.getName(), NDie, NS.getContext());
   } else
     addAccelNamespace("(anonymous namespace)", NDie);
   addSourceLine(NDie, NS);
-  addToContextOwner(NDie, NS.getContext());
   return NDie;
 }
 
-/// getRealLinkageName - If special LLVM prefix that is used to inform the asm
-/// printer to not emit usual symbol prefix before the symbol name is used then
-/// return linkage name after skipping this special LLVM prefix.
-static StringRef getRealLinkageName(StringRef LinkageName) {
-  char One = '\1';
-  if (LinkageName.startswith(StringRef(&One, 1)))
-    return LinkageName.substr(1);
-  return LinkageName;
-}
-
 /// getOrCreateSubprogramDIE - Create new DIE using SP.
 DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE (as is the case for member function
+  // declarations).
+  DIE *ContextDIE = getOrCreateContextDIE(resolve(SP.getContext()));
+
   DIE *SPDie = getDIE(SP);
   if (SPDie)
     return SPDie;
 
-  SPDie = new DIE(dwarf::DW_TAG_subprogram);
+  DISubprogram SPDecl = SP.getFunctionDeclaration();
+  if (SPDecl.isSubprogram())
+    // Add subprogram definitions to the CU die directly.
+    ContextDIE = CUDie.get();
 
   // DW_TAG_inlined_subroutine may refer to this DIE.
-  insertDIE(SP, SPDie);
+  SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
 
-  DISubprogram SPDecl = SP.getFunctionDeclaration();
   DIE *DeclDie = NULL;
-  if (SPDecl.isSubprogram()) {
+  if (SPDecl.isSubprogram())
     DeclDie = getOrCreateSubprogramDIE(SPDecl);
-  }
-
-  // Add to context owner.
-  addToContextOwner(SPDie, SP.getContext());
 
   // Add function template parameters.
   addTemplateParams(*SPDie, SP.getTemplateParams());
 
-  // Unfortunately this code needs to stay here instead of below the
-  // AT_specification code in order to work around a bug in older
-  // gdbs that requires the linkage name to resolve multiple template
-  // functions.
-  // TODO: Remove this set of code when we get rid of the old gdb
-  // compatibility.
-  StringRef LinkageName = SP.getLinkageName();
-  if (!LinkageName.empty() && DD->useDarwinGDBCompat())
-    addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
-              getRealLinkageName(LinkageName));
-
   // If this DIE is going to refer declaration info using AT_specification
   // then there is no need to add other attributes.
   if (DeclDie) {
     // Refer function declaration directly.
-    addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
-                DeclDie);
+    addDIEEntry(SPDie, dwarf::DW_AT_specification, DeclDie);
 
     return SPDie;
   }
 
   // Add the linkage name if we have one.
-  if (!LinkageName.empty() && !DD->useDarwinGDBCompat())
+  StringRef LinkageName = SP.getLinkageName();
+  if (!LinkageName.empty())
     addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
-              getRealLinkageName(LinkageName));
+              GlobalValue::getRealLinkageName(LinkageName));
 
   // Constructors and operators for anonymous aggregates do not have names.
   if (!SP.getName().empty())
@@ -1183,31 +1421,31 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
 
   // Add the prototype if we have a prototype and we have a C like
   // language.
+  uint16_t Language = getLanguage();
   if (SP.isPrototyped() &&
-      (Language == dwarf::DW_LANG_C89 ||
-       Language == dwarf::DW_LANG_C99 ||
+      (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
        Language == dwarf::DW_LANG_ObjC))
     addFlag(SPDie, dwarf::DW_AT_prototyped);
 
-  // Add Return Type.
   DICompositeType SPTy = SP.getType();
-  DIArray Args = SPTy.getTypeArray();
-  unsigned SPTag = SPTy.getTag();
+  assert(SPTy.getTag() == dwarf::DW_TAG_subroutine_type &&
+         "the type of a subprogram should be a subroutine");
 
-  if (Args.getNumElements() == 0 || SPTag != dwarf::DW_TAG_subroutine_type)
-    addType(SPDie, SPTy);
-  else
+  DIArray Args = SPTy.getTypeArray();
+  // Add a return type. If this is a type like a C/C++ void type we don't add a
+  // return type.
+  if (Args.getElement(0))
     addType(SPDie, DIType(Args.getElement(0)));
 
   unsigned VK = SP.getVirtuality();
   if (VK) {
     addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
     DIEBlock *Block = getDIEBlock();
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(Block, 0, dwarf::DW_FORM_udata, SP.getVirtualIndex());
-    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, 0, Block);
-    ContainingTypeMap.insert(std::make_pair(SPDie,
-                                            SP.getContainingType()));
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(Block, dwarf::DW_FORM_udata, SP.getVirtualIndex());
+    addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
+    ContainingTypeMap.insert(
+        std::make_pair(SPDie, resolve(SP.getContainingType())));
   }
 
   if (!SP.isDefinition()) {
@@ -1215,19 +1453,13 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
 
     // Add arguments. Do not add arguments for subprogram definition. They will
     // be handled while processing variables.
-    DICompositeType SPTy = SP.getType();
-    DIArray Args = SPTy.getTypeArray();
-    unsigned SPTag = SPTy.getTag();
-
-    if (SPTag == dwarf::DW_TAG_subroutine_type)
-      for (unsigned i = 1, N =  Args.getNumElements(); i < N; ++i) {
-        DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-        DIType ATy = DIType(Args.getElement(i));
-        addType(Arg, ATy);
-        if (ATy.isArtificial())
-          addFlag(Arg, dwarf::DW_AT_artificial);
-        SPDie->addChild(Arg);
-      }
+    for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+      DIE *Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, *SPDie);
+      DIType ATy(Args.getElement(i));
+      addType(Arg, ATy);
+      if (ATy.isArtificial())
+        addFlag(Arg, dwarf::DW_AT_artificial);
+    }
   }
 
   if (SP.isArtificial())
@@ -1274,16 +1506,16 @@ static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
 }
 
 /// createGlobalVariableDIE - create global variable DIE.
-void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
+void CompileUnit::createGlobalVariableDIE(DIGlobalVariable GV) {
+
   // Check for pre-existence.
-  if (getDIE(N))
+  if (getDIE(GV))
     return;
 
-  DIGlobalVariable GV(N);
-  if (!GV.Verify())
+  if (!GV.isGlobalVariable())
     return;
 
-  DIDescriptor GVContext = GV.getContext();
+  DIScope GVContext = GV.getContext();
   DIType GTy = GV.getType();
 
   // If this is a static data member definition, some attributes belong
@@ -1294,35 +1526,30 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
   if (SDMDecl.Verify()) {
     assert(SDMDecl.isStaticMember() && "Expected static member decl");
     // We need the declaration DIE that is in the static member's class.
-    // But that class might not exist in the DWARF yet.
-    // Creating the class will create the static member decl DIE.
-    getOrCreateContextDIE(SDMDecl.getContext());
-    VariableDIE = getDIE(SDMDecl);
-    assert(VariableDIE && "Static member decl has no context?");
+    VariableDIE = getOrCreateStaticMemberDIE(SDMDecl);
     IsStaticMember = true;
   }
 
   // If this is not a static data member definition, create the variable
   // DIE and add the initial set of attributes to it.
   if (!VariableDIE) {
-    VariableDIE = new DIE(GV.getTag());
+    // Construct the context before querying for the existence of the DIE in
+    // case such construction creates the DIE.
+    DIE *ContextDIE = getOrCreateContextDIE(GVContext);
+
     // Add to map.
-    insertDIE(N, VariableDIE);
+    VariableDIE = createAndAddDIE(GV.getTag(), *ContextDIE, GV);
 
     // Add name and type.
     addString(VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
     addType(VariableDIE, GTy);
 
     // Add scoping info.
-    if (!GV.isLocalToUnit()) {
+    if (!GV.isLocalToUnit())
       addFlag(VariableDIE, dwarf::DW_AT_external);
-      addGlobalName(GV.getName(), VariableDIE);
-    }
 
     // Add line number info.
     addSourceLine(VariableDIE, GV);
-    // Add to context owner.
-    addToContextOwner(VariableDIE, GVContext);
   }
 
   // Add location.
@@ -1332,57 +1559,73 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
   if (isGlobalVariable) {
     addToAccelTable = true;
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
-    addOpAddress(Block, Asm->Mang->getSymbol(GV.getGlobal()));
+    const MCSymbol *Sym = Asm->getSymbol(GV.getGlobal());
+    if (GV.getGlobal()->isThreadLocal()) {
+      // FIXME: Make this work with -gsplit-dwarf.
+      unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+      assert((PointerSize == 4 || PointerSize == 8) &&
+             "Add support for other sizes if necessary");
+      const MCExpr *Expr =
+          Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym);
+      // Based on GCC's support for TLS:
+      if (!DD->useSplitDwarf()) {
+        // 1) Start with a constNu of the appropriate pointer size
+        addUInt(Block, dwarf::DW_FORM_data1,
+                PointerSize == 4 ? dwarf::DW_OP_const4u : dwarf::DW_OP_const8u);
+        // 2) containing the (relocated) offset of the TLS variable
+        //    within the module's TLS block.
+        addExpr(Block, dwarf::DW_FORM_udata, Expr);
+      } else {
+        addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+        addUInt(Block, dwarf::DW_FORM_udata, DU->getAddrPoolIndex(Expr));
+      }
+      // 3) followed by a custom OP to make the debugger do a TLS lookup.
+      addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_push_tls_address);
+    } else
+      addOpAddress(Block, Sym);
     // Do not create specification DIE if context is either compile unit
     // or a subprogram.
     if (GVContext && GV.isDefinition() && !GVContext.isCompileUnit() &&
-        !GVContext.isFile() && !isSubprogramContext(GVContext)) {
+        !GVContext.isFile() && !DD->isSubprogramContext(GVContext)) {
       // Create specification DIE.
-      VariableSpecDIE = new DIE(dwarf::DW_TAG_variable);
-      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
-                  dwarf::DW_FORM_ref4, VariableDIE);
-      addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
+      VariableSpecDIE = createAndAddDIE(dwarf::DW_TAG_variable, *CUDie);
+      addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification, VariableDIE);
+      addBlock(VariableSpecDIE, dwarf::DW_AT_location, Block);
       // A static member's declaration is already flagged as such.
       if (!SDMDecl.Verify())
         addFlag(VariableDIE, dwarf::DW_AT_declaration);
-      addDie(VariableSpecDIE);
     } else {
-      addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+      addBlock(VariableDIE, dwarf::DW_AT_location, Block);
     }
-    // Add linkage name.
+    // Add the linkage name.
     StringRef LinkageName = GV.getLinkageName();
-    if (!LinkageName.empty()) {
+    if (!LinkageName.empty())
       // From DWARF4: DIEs to which DW_AT_linkage_name may apply include:
       // TAG_common_block, TAG_constant, TAG_entry_point, TAG_subprogram and
       // TAG_variable.
-      addString(IsStaticMember && VariableSpecDIE ?
-                VariableSpecDIE : VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
-                getRealLinkageName(LinkageName));
-      // In compatibility mode with older gdbs we put the linkage name on both
-      // the TAG_variable DIE and on the TAG_member DIE.
-      if (IsStaticMember && VariableSpecDIE && DD->useDarwinGDBCompat())
-        addString(VariableDIE, dwarf::DW_AT_MIPS_linkage_name,
-                  getRealLinkageName(LinkageName));
-    }
+      addString(IsStaticMember && VariableSpecDIE ? VariableSpecDIE
+                                                  : VariableDIE,
+                dwarf::DW_AT_MIPS_linkage_name,
+                GlobalValue::getRealLinkageName(LinkageName));
   } else if (const ConstantInt *CI =
-             dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
+                 dyn_cast_or_null<ConstantInt>(GV.getConstant())) {
     // AT_const_value was added when the static member was created. To avoid
     // emitting AT_const_value multiple times, we only add AT_const_value when
     // it is not a static member.
     if (!IsStaticMember)
-      addConstantValue(VariableDIE, CI, GTy.isUnsignedDIType());
-  } else if (const ConstantExpr *CE = getMergedGlobalExpr(N->getOperand(11))) {
+      addConstantValue(VariableDIE, CI, isUnsignedDIType(DD, GTy));
+  } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getOperand(11))) {
     addToAccelTable = true;
     // GV is a merged global.
     DIEBlock *Block = new (DIEValueAllocator) DIEBlock();
     Value *Ptr = CE->getOperand(0);
-    addOpAddress(Block, Asm->Mang->getSymbol(cast<GlobalValue>(Ptr)));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    SmallVector<Value*, 3> Idx(CE->op_begin()+1, CE->op_end());
-    addUInt(Block, 0, dwarf::DW_FORM_udata,
-                   Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
-    addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-    addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
+    addOpAddress(Block, Asm->getSymbol(cast<GlobalValue>(Ptr)));
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
+    addUInt(Block, dwarf::DW_FORM_udata,
+            Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
+    addUInt(Block, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+    addBlock(VariableDIE, dwarf::DW_AT_location, Block);
   }
 
   if (addToAccelTable) {
@@ -1395,14 +1638,16 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
       addAccelName(GV.getLinkageName(), AddrDIE);
   }
 
-  return;
+  if (!GV.isLocalToUnit())
+    addGlobalName(GV.getName(), VariableSpecDIE ? VariableSpecDIE : VariableDIE,
+                  GV.getContext());
 }
 
 /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
 void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
                                        DIE *IndexTy) {
-  DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type);
-  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy);
+  DIE *DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
+  addDIEEntry(DW_Subrange, dwarf::DW_AT_type, IndexTy);
 
   // The LowerBound value defines the lower bounds which is typically zero for
   // C/C++. The Count value is the number of elements.  Values are 64 bit. If
@@ -1415,26 +1660,22 @@ void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR,
   int64_t Count = SR.getCount();
 
   if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
-    addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, LowerBound);
+    addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
 
   if (Count != -1 && Count != 0)
     // FIXME: An unbounded array should reference the expression that defines
     // the array.
-    addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, LowerBound + Count - 1);
-
-  Buffer.addChild(DW_Subrange);
+    addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, None,
+            LowerBound + Count - 1);
 }
 
 /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
-                                        DICompositeType *CTy) {
-  Buffer.setTag(dwarf::DW_TAG_array_type);
-  if (CTy->isVector())
+void CompileUnit::constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy) {
+  if (CTy.isVector())
     addFlag(&Buffer, dwarf::DW_AT_GNU_vector);
 
-  // Emit derived type.
-  addType(&Buffer, CTy->getTypeDerivedFrom());
-  DIArray Elements = CTy->getTypeArray();
+  // Emit the element type.
+  addType(&Buffer, resolve(CTy.getTypeDerivedFrom()));
 
   // Get an anonymous type for index type.
   // FIXME: This type should be passed down from the front end
@@ -1442,16 +1683,16 @@ void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
   DIE *IdxTy = getIndexTyDie();
   if (!IdxTy) {
     // Construct an anonymous type for index type.
-    IdxTy = new DIE(dwarf::DW_TAG_base_type);
+    IdxTy = createAndAddDIE(dwarf::DW_TAG_base_type, *CUDie.get());
     addString(IdxTy, dwarf::DW_AT_name, "int");
-    addUInt(IdxTy, dwarf::DW_AT_byte_size, 0, sizeof(int32_t));
+    addUInt(IdxTy, dwarf::DW_AT_byte_size, None, sizeof(int32_t));
     addUInt(IdxTy, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
             dwarf::DW_ATE_signed);
-    addDie(IdxTy);
     setIndexTyDie(IdxTy);
   }
 
   // Add subranges to array type.
+  DIArray Elements = CTy.getTypeArray();
   for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
     DIDescriptor Element = Elements.getElement(i);
     if (Element.getTag() == dwarf::DW_TAG_subrange_type)
@@ -1459,195 +1700,180 @@ void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
   }
 }
 
-/// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-DIE *CompileUnit::constructEnumTypeDIE(DIEnumerator ETy) {
-  DIE *Enumerator = new DIE(dwarf::DW_TAG_enumerator);
-  StringRef Name = ETy.getName();
-  addString(Enumerator, dwarf::DW_AT_name, Name);
-  int64_t Value = ETy.getEnumValue();
-  addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
-  return Enumerator;
+/// constructEnumTypeDIE - Construct an enum type DIE from DICompositeType.
+void CompileUnit::constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy) {
+  DIArray Elements = CTy.getTypeArray();
+
+  // Add enumerators to enumeration type.
+  for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+    DIEnumerator Enum(Elements.getElement(i));
+    if (Enum.isEnumerator()) {
+      DIE *Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
+      StringRef Name = Enum.getName();
+      addString(Enumerator, dwarf::DW_AT_name, Name);
+      int64_t Value = Enum.getEnumValue();
+      addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata, Value);
+    }
+  }
+  DIType DTy = resolve(CTy.getTypeDerivedFrom());
+  if (DTy) {
+    addType(&Buffer, DTy);
+    addFlag(&Buffer, dwarf::DW_AT_enum_class);
+  }
 }
 
 /// constructContainingTypeDIEs - Construct DIEs for types that contain
 /// vtables.
 void CompileUnit::constructContainingTypeDIEs() {
   for (DenseMap<DIE *, const MDNode *>::iterator CI = ContainingTypeMap.begin(),
-         CE = ContainingTypeMap.end(); CI != CE; ++CI) {
+                                                 CE = ContainingTypeMap.end();
+       CI != CE; ++CI) {
     DIE *SPDie = CI->first;
-    const MDNode *N = CI->second;
-    if (!N) continue;
-    DIE *NDie = getDIE(N);
-    if (!NDie) continue;
-    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, dwarf::DW_FORM_ref4, NDie);
+    DIDescriptor D(CI->second);
+    if (!D)
+      continue;
+    DIE *NDie = getDIE(D);
+    if (!NDie)
+      continue;
+    addDIEEntry(SPDie, dwarf::DW_AT_containing_type, NDie);
   }
 }
 
 /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-DIE *CompileUnit::constructVariableDIE(DbgVariable *DV, bool isScopeAbstract) {
-  StringRef Name = DV->getName();
-
-  // Translate tag to proper Dwarf tag.
-  unsigned Tag = DV->getTag();
+DIE *CompileUnit::constructVariableDIE(DbgVariable &DV, bool isScopeAbstract) {
+  StringRef Name = DV.getName();
 
   // Define variable debug information entry.
-  DIE *VariableDie = new DIE(Tag);
-  DbgVariable *AbsVar = DV->getAbstractVariable();
+  DIE *VariableDie = new DIE(DV.getTag());
+  DbgVariable *AbsVar = DV.getAbstractVariable();
   DIE *AbsDIE = AbsVar ? AbsVar->getDIE() : NULL;
   if (AbsDIE)
-    addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin,
-                            dwarf::DW_FORM_ref4, AbsDIE);
+    addDIEEntry(VariableDie, dwarf::DW_AT_abstract_origin, AbsDIE);
   else {
-    addString(VariableDie, dwarf::DW_AT_name, Name);
-    addSourceLine(VariableDie, DV->getVariable());
-    addType(VariableDie, DV->getType());
+    if (!Name.empty())
+      addString(VariableDie, dwarf::DW_AT_name, Name);
+    addSourceLine(VariableDie, DV.getVariable());
+    addType(VariableDie, DV.getType());
   }
 
-  if (DV->isArtificial())
+  if (DV.isArtificial())
     addFlag(VariableDie, dwarf::DW_AT_artificial);
 
   if (isScopeAbstract) {
-    DV->setDIE(VariableDie);
+    DV.setDIE(VariableDie);
     return VariableDie;
   }
 
   // Add variable address.
 
-  unsigned Offset = DV->getDotDebugLocOffset();
+  unsigned Offset = DV.getDotDebugLocOffset();
   if (Offset != ~0U) {
     addLabel(VariableDie, dwarf::DW_AT_location,
-                         dwarf::DW_FORM_data4,
-                         Asm->GetTempSymbol("debug_loc", Offset));
-    DV->setDIE(VariableDie);
+             DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+                                        : dwarf::DW_FORM_data4,
+             Asm->GetTempSymbol("debug_loc", Offset));
+    DV.setDIE(VariableDie);
     return VariableDie;
   }
 
   // Check if variable is described by a DBG_VALUE instruction.
-  if (const MachineInstr *DVInsn = DV->getMInsn()) {
-    bool updated = false;
-    if (DVInsn->getNumOperands() == 3) {
-      if (DVInsn->getOperand(0).isReg()) {
-        const MachineOperand RegOp = DVInsn->getOperand(0);
-        const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
-        if (DVInsn->getOperand(1).isImm() &&
-            TRI->getFrameRegister(*Asm->MF) == RegOp.getReg()) {
-          unsigned FrameReg = 0;
-          const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-          int Offset =
-            TFI->getFrameIndexReference(*Asm->MF,
-                                        DVInsn->getOperand(1).getImm(),
-                                        FrameReg);
-          MachineLocation Location(FrameReg, Offset);
-          addVariableAddress(DV, VariableDie, Location);
-
-        } else if (RegOp.getReg())
-          addVariableAddress(DV, VariableDie,
-                                         MachineLocation(RegOp.getReg()));
-        updated = true;
-      }
-      else if (DVInsn->getOperand(0).isImm())
-        updated =
-          addConstantValue(VariableDie, DVInsn->getOperand(0),
-                                       DV->getType());
-      else if (DVInsn->getOperand(0).isFPImm())
-        updated =
-          addConstantFPValue(VariableDie, DVInsn->getOperand(0));
-      else if (DVInsn->getOperand(0).isCImm())
-        updated =
-          addConstantValue(VariableDie,
-                                       DVInsn->getOperand(0).getCImm(),
-                                       DV->getType().isUnsignedDIType());
-    } else {
-      addVariableAddress(DV, VariableDie,
-                                     Asm->getDebugValueLocation(DVInsn));
-      updated = true;
-    }
-    if (!updated) {
-      // If variableDie is not updated then DBG_VALUE instruction does not
-      // have valid variable info.
-      delete VariableDie;
-      return NULL;
-    }
-    DV->setDIE(VariableDie);
+  if (const MachineInstr *DVInsn = DV.getMInsn()) {
+    assert(DVInsn->getNumOperands() == 3);
+    if (DVInsn->getOperand(0).isReg()) {
+      const MachineOperand RegOp = DVInsn->getOperand(0);
+      // If the second operand is an immediate, this is an indirect value.
+      if (DVInsn->getOperand(1).isImm()) {
+        MachineLocation Location(RegOp.getReg(),
+                                 DVInsn->getOperand(1).getImm());
+        addVariableAddress(DV, VariableDie, Location);
+      } else if (RegOp.getReg())
+        addVariableAddress(DV, VariableDie, MachineLocation(RegOp.getReg()));
+    } else if (DVInsn->getOperand(0).isImm())
+      addConstantValue(VariableDie, DVInsn->getOperand(0), DV.getType());
+    else if (DVInsn->getOperand(0).isFPImm())
+      addConstantFPValue(VariableDie, DVInsn->getOperand(0));
+    else if (DVInsn->getOperand(0).isCImm())
+      addConstantValue(VariableDie, DVInsn->getOperand(0).getCImm(),
+                       isUnsignedDIType(DD, DV.getType()));
+
+    DV.setDIE(VariableDie);
     return VariableDie;
   } else {
     // .. else use frame index.
-    int FI = DV->getFrameIndex();
+    int FI = DV.getFrameIndex();
     if (FI != ~0) {
       unsigned FrameReg = 0;
       const TargetFrameLowering *TFI = Asm->TM.getFrameLowering();
-      int Offset =
-        TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+      int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
       MachineLocation Location(FrameReg, Offset);
       addVariableAddress(DV, VariableDie, Location);
     }
   }
 
-  DV->setDIE(VariableDie);
+  DV.setDIE(VariableDie);
   return VariableDie;
 }
 
-/// createMemberDIE - Create new member DIE.
-DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
-  DIE *MemberDie = new DIE(DT.getTag());
+/// constructMemberDIE - Construct member DIE from DIDerivedType.
+void CompileUnit::constructMemberDIE(DIE &Buffer, DIDerivedType DT) {
+  DIE *MemberDie = createAndAddDIE(DT.getTag(), Buffer);
   StringRef Name = DT.getName();
   if (!Name.empty())
     addString(MemberDie, dwarf::DW_AT_name, Name);
 
-  addType(MemberDie, DT.getTypeDerivedFrom());
+  addType(MemberDie, resolve(DT.getTypeDerivedFrom()));
 
   addSourceLine(MemberDie, DT);
 
   DIEBlock *MemLocationDie = new (DIEValueAllocator) DIEBlock();
-  addUInt(MemLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-
-  uint64_t Size = DT.getSizeInBits();
-  uint64_t FieldSize = DT.getOriginalTypeSize();
-
-  if (Size != FieldSize) {
-    // Handle bitfield.
-    addUInt(MemberDie, dwarf::DW_AT_byte_size, 0, DT.getOriginalTypeSize()>>3);
-    addUInt(MemberDie, dwarf::DW_AT_bit_size, 0, DT.getSizeInBits());
-
-    uint64_t Offset = DT.getOffsetInBits();
-    uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
-    uint64_t HiMark = (Offset + FieldSize) & AlignMask;
-    uint64_t FieldOffset = (HiMark - FieldSize);
-    Offset -= FieldOffset;
-
-    // Maybe we need to work from the other end.
-    if (Asm->getDataLayout().isLittleEndian())
-      Offset = FieldSize - (Offset + Size);
-    addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
+  addUInt(MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
 
-    // Here WD_AT_data_member_location points to the anonymous
-    // field that includes this bit field.
-    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, FieldOffset >> 3);
-
-  } else
-    // This is not a bitfield.
-    addUInt(MemLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits() >> 3);
-
-  if (DT.getTag() == dwarf::DW_TAG_inheritance
-      && DT.isVirtual()) {
+  if (DT.getTag() == dwarf::DW_TAG_inheritance && DT.isVirtual()) {
 
     // For C++, virtual base classes are not at fixed offset. Use following
     // expression to extract appropriate offset from vtable.
     // BaseAddr = ObAddr + *((*ObAddr) - Offset)
 
     DIEBlock *VBaseLocationDie = new (DIEValueAllocator) DIEBlock();
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_udata, DT.getOffsetInBits());
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    addUInt(VBaseLocationDie, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
-
-    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0,
-             VBaseLocationDie);
-  } else
-    addBlock(MemberDie, dwarf::DW_AT_data_member_location, 0, MemLocationDie);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_udata, DT.getOffsetInBits());
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+    addUInt(VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+    addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie);
+  } else {
+    uint64_t Size = DT.getSizeInBits();
+    uint64_t FieldSize = getBaseTypeSize(DD, DT);
+    uint64_t OffsetInBytes;
+
+    if (Size != FieldSize) {
+      // Handle bitfield.
+      addUInt(MemberDie, dwarf::DW_AT_byte_size, None,
+              getBaseTypeSize(DD, DT) >> 3);
+      addUInt(MemberDie, dwarf::DW_AT_bit_size, None, DT.getSizeInBits());
+
+      uint64_t Offset = DT.getOffsetInBits();
+      uint64_t AlignMask = ~(DT.getAlignInBits() - 1);
+      uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+      uint64_t FieldOffset = (HiMark - FieldSize);
+      Offset -= FieldOffset;
+
+      // Maybe we need to work from the other end.
+      if (Asm->getDataLayout().isLittleEndian())
+        Offset = FieldSize - (Offset + Size);
+      addUInt(MemberDie, dwarf::DW_AT_bit_offset, None, Offset);
+
+      // Here WD_AT_data_member_location points to the anonymous
+      // field that includes this bit field.
+      OffsetInBytes = FieldOffset >> 3;
+    } else
+      // This is not a bitfield.
+      OffsetInBytes = DT.getOffsetInBits() >> 3;
+    addUInt(MemberDie, dwarf::DW_AT_data_member_location, None, OffsetInBytes);
+  }
 
   if (DT.isProtected())
     addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
@@ -1671,17 +1897,26 @@ DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
 
   if (DT.isArtificial())
     addFlag(MemberDie, dwarf::DW_AT_artificial);
-
-  return MemberDie;
 }
 
-/// createStaticMemberDIE - Create new DIE for C++ static member.
-DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
+/// getOrCreateStaticMemberDIE - Create new DIE for C++ static member.
+DIE *CompileUnit::getOrCreateStaticMemberDIE(DIDerivedType DT) {
   if (!DT.Verify())
     return NULL;
 
-  DIE *StaticMemberDIE = new DIE(DT.getTag());
-  DIType Ty = DT.getTypeDerivedFrom();
+  // Construct the context before querying for the existence of the DIE in case
+  // such construction creates the DIE.
+  DIE *ContextDIE = getOrCreateContextDIE(resolve(DT.getContext()));
+  assert(dwarf::isType(ContextDIE->getTag()) &&
+         "Static member should belong to a type.");
+
+  DIE *StaticMemberDIE = getDIE(DT);
+  if (StaticMemberDIE)
+    return StaticMemberDIE;
+
+  StaticMemberDIE = createAndAddDIE(DT.getTag(), *ContextDIE, DT);
+
+  DIType Ty = resolve(DT.getTypeDerivedFrom());
 
   addString(StaticMemberDIE, dwarf::DW_AT_name, DT.getName());
   addType(StaticMemberDIE, Ty);
@@ -1702,10 +1937,20 @@ DIE *CompileUnit::createStaticMemberDIE(const DIDerivedType DT) {
             dwarf::DW_ACCESS_public);
 
   if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT.getConstant()))
-    addConstantValue(StaticMemberDIE, CI, Ty.isUnsignedDIType());
+    addConstantValue(StaticMemberDIE, CI, isUnsignedDIType(DD, Ty));
   if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT.getConstant()))
     addConstantFPValue(StaticMemberDIE, CFP);
 
-  insertDIE(DT, StaticMemberDIE);
   return StaticMemberDIE;
 }
+
+void CompileUnit::emitHeader(const MCSection *ASection,
+                             const MCSymbol *ASectionSym) {
+  Asm->OutStreamer.AddComment("DWARF version number");
+  Asm->EmitInt16(DD->getDwarfVersion());
+  Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
+  Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()),
+                         ASectionSym);
+  Asm->OutStreamer.AddComment("Address Size (in bytes)");
+  Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 8f08c63..d782c88 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,15 +15,16 @@
 #define CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
 
 #include "DIE.h"
+#include "DwarfDebug.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/DebugInfo.h"
+#include "llvm/MC/MCExpr.h"
 
 namespace llvm {
 
-class DwarfDebug;
-class DwarfUnits;
 class MachineLocation;
 class MachineOperand;
 class ConstantInt;
@@ -38,11 +39,10 @@ class CompileUnit {
   ///
   unsigned UniqueID;
 
-  /// Language - The DW_AT_language of the compile unit
-  ///
-  unsigned Language;
+  /// Node - MDNode for the compile unit.
+  DICompileUnit Node;
 
-  /// Die - Compile unit debug information entry.
+  /// CUDie - Compile unit debug information entry.
   ///
   const OwningPtr<DIE> CUDie;
 
@@ -56,28 +56,28 @@ class CompileUnit {
   /// IndexTyDie - An anonymous type for index type.  Owned by CUDie.
   DIE *IndexTyDie;
 
-  /// MDNodeToDieMap - Tracks the mapping of unit level debug informaton
+  /// MDNodeToDieMap - Tracks the mapping of unit level debug information
   /// variables to debug information entries.
   DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
 
-  /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug informaton
+  /// MDNodeToDIEEntryMap - Tracks the mapping of unit level debug information
   /// descriptors to debug information entries using a DIEEntry proxy.
   DenseMap<const MDNode *, DIEEntry *> MDNodeToDIEEntryMap;
 
   /// GlobalNames - A map of globally visible named entities for this unit.
   ///
-  StringMap<DIE*> GlobalNames;
+  StringMap<DIE *> GlobalNames;
 
   /// GlobalTypes - A map of globally visible types for this unit.
   ///
-  StringMap<DIE*> GlobalTypes;
+  StringMap<DIE *> GlobalTypes;
 
   /// AccelNames - A map of names for the name accelerator table.
   ///
-  StringMap<std::vector<DIE*> > AccelNames;
-  StringMap<std::vector<DIE*> > AccelObjC;
-  StringMap<std::vector<DIE*> > AccelNamespace;
-  StringMap<std::vector<std::pair<DIE*, unsigned> > > AccelTypes;
+  StringMap<std::vector<DIE *> > AccelNames;
+  StringMap<std::vector<DIE *> > AccelObjC;
+  StringMap<std::vector<DIE *> > AccelNamespace;
+  StringMap<std::vector<std::pair<DIE *, unsigned> > > AccelTypes;
 
   /// DIEBlocks - A list of all the DIEBlocks in use.
   std::vector<DIEBlock *> DIEBlocks;
@@ -87,163 +87,154 @@ class CompileUnit {
   /// corresponds to the MDNode mapped with the subprogram DIE.
   DenseMap<DIE *, const MDNode *> ContainingTypeMap;
 
-  /// Offset of the CUDie from beginning of debug info section.
-  unsigned DebugInfoOffset;
+  // DIEValueAllocator - All DIEValues are allocated through this allocator.
+  BumpPtrAllocator DIEValueAllocator;
 
-  /// getLowerBoundDefault - Return the default lower bound for an array. If the
-  /// DWARF version doesn't handle the language, return -1.
-  int64_t getDefaultLowerBound() const;
+  // DIEIntegerOne - A preallocated DIEValue because 1 is used frequently.
+  DIEInteger *DIEIntegerOne;
 
 public:
-  CompileUnit(unsigned UID, unsigned L, DIE *D, AsmPrinter *A, DwarfDebug *DW,
-              DwarfUnits *);
+  CompileUnit(unsigned UID, DIE *D, DICompileUnit CU, AsmPrinter *A,
+              DwarfDebug *DW, DwarfUnits *DWU);
   ~CompileUnit();
 
   // Accessors.
-  unsigned getUniqueID()            const { return UniqueID; }
-  unsigned getLanguage()            const { return Language; }
-  DIE* getCUDie()                   const { return CUDie.get(); }
-  unsigned getDebugInfoOffset()     const { return DebugInfoOffset; }
-  const StringMap<DIE*> &getGlobalNames() const { return GlobalNames; }
-  const StringMap<DIE*> &getGlobalTypes() const { return GlobalTypes; }
-
-  const StringMap<std::vector<DIE*> > &getAccelNames() const {
+  unsigned getUniqueID() const { return UniqueID; }
+  uint16_t getLanguage() const { return Node.getLanguage(); }
+  DICompileUnit getNode() const { return Node; }
+  DIE *getCUDie() const { return CUDie.get(); }
+  const StringMap<DIE *> &getGlobalNames() const { return GlobalNames; }
+  const StringMap<DIE *> &getGlobalTypes() const { return GlobalTypes; }
+
+  const StringMap<std::vector<DIE *> > &getAccelNames() const {
     return AccelNames;
   }
-  const StringMap<std::vector<DIE*> > &getAccelObjC() const {
+  const StringMap<std::vector<DIE *> > &getAccelObjC() const {
     return AccelObjC;
   }
-  const StringMap<std::vector<DIE*> > &getAccelNamespace() const {
+  const StringMap<std::vector<DIE *> > &getAccelNamespace() const {
     return AccelNamespace;
   }
-  const StringMap<std::vector<std::pair<DIE*, unsigned > > >
-  &getAccelTypes() const {
+  const StringMap<std::vector<std::pair<DIE *, unsigned> > > &
+  getAccelTypes() const {
     return AccelTypes;
   }
 
+  unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
   void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
+
   /// hasContent - Return true if this compile unit has something to write out.
   ///
   bool hasContent() const { return !CUDie->getChildren().empty(); }
 
+  /// getParentContextString - Get a string containing the language specific
+  /// context for a global name.
+  std::string getParentContextString(DIScope Context) const;
+
   /// addGlobalName - Add a new global entity to the compile unit.
   ///
-  void addGlobalName(StringRef Name, DIE *Die) { GlobalNames[Name] = Die; }
+  void addGlobalName(StringRef Name, DIE *Die, DIScope Context);
 
   /// addGlobalType - Add a new global type to the compile unit.
   ///
   void addGlobalType(DIType Ty);
 
+  /// addPubTypes - Add a set of types from the subprogram to the global types.
+  void addPubTypes(DISubprogram SP);
 
   /// addAccelName - Add a new name to the name accelerator table.
-  void addAccelName(StringRef Name, DIE *Die) {
-    std::vector<DIE*> &DIEs = AccelNames[Name];
-    DIEs.push_back(Die);
-  }
-  void addAccelObjC(StringRef Name, DIE *Die) {
-    std::vector<DIE*> &DIEs = AccelObjC[Name];
-    DIEs.push_back(Die);
-  }
-  void addAccelNamespace(StringRef Name, DIE *Die) {
-    std::vector<DIE*> &DIEs = AccelNamespace[Name];
-    DIEs.push_back(Die);
-  }
-  void addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die) {
-    std::vector<std::pair<DIE*, unsigned > > &DIEs = AccelTypes[Name];
-    DIEs.push_back(Die);
-  }
+  void addAccelName(StringRef Name, DIE *Die);
 
-  /// getDIE - Returns the debug information entry map slot for the
-  /// specified debug variable.
-  DIE *getDIE(const MDNode *N) { return MDNodeToDieMap.lookup(N); }
+  /// addAccelObjC - Add a new name to the ObjC accelerator table.
+  void addAccelObjC(StringRef Name, DIE *Die);
 
-  DIEBlock *getDIEBlock() {
-    return new (DIEValueAllocator) DIEBlock();
-  }
+  /// addAccelNamespace - Add a new name to the namespace accelerator table.
+  void addAccelNamespace(StringRef Name, DIE *Die);
 
-  /// insertDIE - Insert DIE into the map.
-  void insertDIE(const MDNode *N, DIE *D) {
-    MDNodeToDieMap.insert(std::make_pair(N, D));
-  }
+  /// addAccelType - Add a new type to the type accelerator table.
+  void addAccelType(StringRef Name, std::pair<DIE *, unsigned> Die);
 
-  /// getDIEEntry - Returns the debug information entry for the specified
-  /// debug variable.
-  DIEEntry *getDIEEntry(const MDNode *N) {
-    DenseMap<const MDNode *, DIEEntry *>::iterator I =
-      MDNodeToDIEEntryMap.find(N);
-    if (I == MDNodeToDIEEntryMap.end())
-      return NULL;
-    return I->second;
-  }
+  /// getDIE - Returns the debug information entry map slot for the
+  /// specified debug variable. We delegate the request to DwarfDebug
+  /// when the MDNode can be part of the type system, since DIEs for
+  /// the type system can be shared across CUs and the mappings are
+  /// kept in DwarfDebug.
+  DIE *getDIE(DIDescriptor D) const;
 
-  /// insertDIEEntry - Insert debug information entry into the map.
-  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
-    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
-  }
+  DIEBlock *getDIEBlock() { return new (DIEValueAllocator) DIEBlock(); }
+
+  /// insertDIE - Insert DIE into the map. We delegate the request to DwarfDebug
+  /// when the MDNode can be part of the type system, since DIEs for
+  /// the type system can be shared across CUs and the mappings are
+  /// kept in DwarfDebug.
+  void insertDIE(DIDescriptor Desc, DIE *D);
 
   /// addDie - Adds or interns the DIE to the compile unit.
   ///
-  void addDie(DIE *Buffer) {
-    this->CUDie->addChild(Buffer);
-  }
-
-  // getIndexTyDie - Get an anonymous type for index type.
-  DIE *getIndexTyDie() {
-    return IndexTyDie;
-  }
-
-  // setIndexTyDie - Set D as anonymous type for index which can be reused
-  // later.
-  void setIndexTyDie(DIE *D) {
-    IndexTyDie = D;
-  }
+  void addDie(DIE *Buffer) { CUDie->addChild(Buffer); }
 
   /// addFlag - Add a flag that is true to the DIE.
-  void addFlag(DIE *Die, unsigned Attribute);
+  void addFlag(DIE *Die, dwarf::Attribute Attribute);
 
   /// addUInt - Add an unsigned integer attribute data and value.
   ///
-  void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
+  void addUInt(DIE *Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
+               uint64_t Integer);
+
+  void addUInt(DIEBlock *Block, dwarf::Form Form, uint64_t Integer);
 
   /// addSInt - Add an signed integer attribute data and value.
   ///
-  void addSInt(DIE *Die, unsigned Attribute, unsigned Form, int64_t Integer);
+  void addSInt(DIE *Die, dwarf::Attribute Attribute, Optional<dwarf::Form> Form,
+               int64_t Integer);
+
+  void addSInt(DIEBlock *Die, Optional<dwarf::Form> Form, int64_t Integer);
 
   /// addString - Add a string attribute data and value.
   ///
-  void addString(DIE *Die, unsigned Attribute, const StringRef Str);
+  void addString(DIE *Die, dwarf::Attribute Attribute, const StringRef Str);
 
   /// addLocalString - Add a string attribute data and value.
   ///
-  void addLocalString(DIE *Die, unsigned Attribute, const StringRef Str);
+  void addLocalString(DIE *Die, dwarf::Attribute Attribute, const StringRef Str);
+
+  /// addExpr - Add a Dwarf expression attribute data and value.
+  ///
+  void addExpr(DIEBlock *Die, dwarf::Form Form, const MCExpr *Expr);
 
   /// addLabel - Add a Dwarf label attribute data and value.
   ///
-  void addLabel(DIE *Die, unsigned Attribute, unsigned Form,
+  void addLabel(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form,
                 const MCSymbol *Label);
 
+  void addLabel(DIEBlock *Die, dwarf::Form Form, const MCSymbol *Label);
+
   /// addLabelAddress - Add a dwarf label attribute data and value using
   /// either DW_FORM_addr or DW_FORM_GNU_addr_index.
   ///
-  void addLabelAddress(DIE *Die, unsigned Attribute, MCSymbol *Label);
+  void addLabelAddress(DIE *Die, dwarf::Attribute Attribute, MCSymbol *Label);
 
   /// addOpAddress - Add a dwarf op address data and value using the
   /// form given and an op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
   ///
-  void addOpAddress(DIE *Die, MCSymbol *Label);
+  void addOpAddress(DIEBlock *Die, const MCSymbol *Label);
 
   /// addDelta - Add a label delta attribute data and value.
   ///
-  void addDelta(DIE *Die, unsigned Attribute, unsigned Form,
-                const MCSymbol *Hi, const MCSymbol *Lo);
+  void addDelta(DIE *Die, dwarf::Attribute Attribute, dwarf::Form Form, const MCSymbol *Hi,
+                const MCSymbol *Lo);
 
   /// addDIEEntry - Add a DIE attribute data and value.
   ///
-  void addDIEEntry(DIE *Die, unsigned Attribute, unsigned Form, DIE *Entry);
+  void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIE *Entry);
+
+  /// addDIEEntry - Add a DIE attribute data and value.
+  ///
+  void addDIEEntry(DIE *Die, dwarf::Attribute Attribute, DIEEntry *Entry);
 
   /// addBlock - Add block data.
   ///
-  void addBlock(DIE *Die, unsigned Attribute, unsigned Form, DIEBlock *Block);
+  void addBlock(DIE *Die, dwarf::Attribute Attribute, DIEBlock *Block);
 
   /// addSourceLine - Add location information to specified debug information
   /// entry.
@@ -256,33 +247,33 @@ public:
 
   /// addAddress - Add an address attribute to a die based on the location
   /// provided.
-  void addAddress(DIE *Die, unsigned Attribute,
-                  const MachineLocation &Location);
+  void addAddress(DIE *Die, dwarf::Attribute Attribute, const MachineLocation &Location,
+                  bool Indirect = false);
 
   /// addConstantValue - Add constant value entry in variable DIE.
-  bool addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty);
-  bool addConstantValue(DIE *Die, const ConstantInt *CI, bool Unsigned);
-  bool addConstantValue(DIE *Die, const APInt &Val, bool Unsigned);
+  void addConstantValue(DIE *Die, const MachineOperand &MO, DIType Ty);
+  void addConstantValue(DIE *Die, const ConstantInt *CI, bool Unsigned);
+  void addConstantValue(DIE *Die, const APInt &Val, bool Unsigned);
 
   /// addConstantFPValue - Add constant value entry in variable DIE.
-  bool addConstantFPValue(DIE *Die, const MachineOperand &MO);
-  bool addConstantFPValue(DIE *Die, const ConstantFP *CFP);
+  void addConstantFPValue(DIE *Die, const MachineOperand &MO);
+  void addConstantFPValue(DIE *Die, const ConstantFP *CFP);
 
   /// addTemplateParams - Add template parameters in buffer.
   void addTemplateParams(DIE &Buffer, DIArray TParams);
 
   /// addRegisterOp - Add register operand.
-  void addRegisterOp(DIE *TheDie, unsigned Reg);
+  void addRegisterOp(DIEBlock *TheDie, unsigned Reg);
 
   /// addRegisterOffset - Add register offset.
-  void addRegisterOffset(DIE *TheDie, unsigned Reg, int64_t Offset);
+  void addRegisterOffset(DIEBlock *TheDie, unsigned Reg, int64_t Offset);
 
   /// addComplexAddress - Start with the address based on the location provided,
   /// and generate the DWARF information necessary to find the actual variable
   /// (navigating the extra location information encoded in the type) based on
   /// the starting location.  Add the DWARF information to the die.
   ///
-  void addComplexAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+  void addComplexAddress(const DbgVariable &DV, DIE *Die, dwarf::Attribute Attribute,
                          const MachineLocation &Location);
 
   // FIXME: Should be reformulated in terms of addComplexAddress.
@@ -292,20 +283,18 @@ public:
   /// starting location.  Add the DWARF information to the die.  Obsolete,
   /// please use addComplexAddress instead.
   ///
-  void addBlockByrefAddress(DbgVariable *&DV, DIE *Die, unsigned Attribute,
+  void addBlockByrefAddress(const DbgVariable &DV, DIE *Die, dwarf::Attribute Attribute,
                             const MachineLocation &Location);
 
   /// addVariableAddress - Add DW_AT_location attribute for a
   /// DbgVariable based on provided MachineLocation.
-  void addVariableAddress(DbgVariable *&DV, DIE *Die, MachineLocation Location);
-
-  /// addToContextOwner - Add Die into the list of its context owner's children.
-  void addToContextOwner(DIE *Die, DIDescriptor Context);
+  void addVariableAddress(const DbgVariable &DV, DIE *Die,
+                          MachineLocation Location);
 
   /// addType - Add a new type attribute to the specified entity. This takes
   /// and attribute parameter because DW_AT_friend attributes are also
   /// type references.
-  void addType(DIE *Entity, DIType Ty, unsigned Attribute = dwarf::DW_AT_type);
+  void addType(DIE *Entity, DIType Ty, dwarf::Attribute Attribute = dwarf::DW_AT_type);
 
   /// getOrCreateNameSpace - Create a DIE for DINameSpace.
   DIE *getOrCreateNameSpace(DINameSpace NS);
@@ -317,66 +306,103 @@ public:
   /// given DIType.
   DIE *getOrCreateTypeDIE(const MDNode *N);
 
-  /// getOrCreateTemplateTypeParameterDIE - Find existing DIE or create new DIE
-  /// for the given DITemplateTypeParameter.
-  DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
+  /// getOrCreateContextDIE - Get context owner's DIE.
+  DIE *getOrCreateContextDIE(DIScope Context);
 
-  /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create
-  /// new DIE for the given DITemplateValueParameter.
-  DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
+  /// createGlobalVariableDIE - create global variable DIE.
+  void createGlobalVariableDIE(DIGlobalVariable GV);
 
-  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
-  /// information entry.
-  DIEEntry *createDIEEntry(DIE *Entry);
+  /// constructContainingTypeDIEs - Construct DIEs for types that contain
+  /// vtables.
+  void constructContainingTypeDIEs();
 
-  /// createGlobalVariableDIE - create global variable DIE.
-  void createGlobalVariableDIE(const MDNode *N);
+  /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+  DIE *constructVariableDIE(DbgVariable &DV, bool isScopeAbstract);
+
+  /// Create a DIE with the given Tag, add the DIE to its parent, and
+  /// call insertDIE if MD is not null.
+  DIE *createAndAddDIE(unsigned Tag, DIE &Parent, DIDescriptor N = DIDescriptor());
+
+  /// Compute the size of a header for this unit, not including the initial
+  /// length field.
+  unsigned getHeaderSize() const {
+    return sizeof(int16_t) + // DWARF version number
+           sizeof(int32_t) + // Offset Into Abbrev. Section
+           sizeof(int8_t);   // Pointer Size (in bytes)
+  }
 
-  void addPubTypes(DISubprogram SP);
+  /// Emit the header for this unit, not including the initial length field.
+  void emitHeader(const MCSection *ASection, const MCSymbol *ASectionSym);
 
+private:
   /// constructTypeDIE - Construct basic type die from DIBasicType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIBasicType BTy);
+  void constructTypeDIE(DIE &Buffer, DIBasicType BTy);
 
   /// constructTypeDIE - Construct derived type die from DIDerivedType.
-  void constructTypeDIE(DIE &Buffer,
-                        DIDerivedType DTy);
+  void constructTypeDIE(DIE &Buffer, DIDerivedType DTy);
 
   /// constructTypeDIE - Construct type DIE from DICompositeType.
-  void constructTypeDIE(DIE &Buffer,
-                        DICompositeType CTy);
+  void constructTypeDIE(DIE &Buffer, DICompositeType CTy);
 
   /// constructSubrangeDIE - Construct subrange DIE from DISubrange.
   void constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy);
 
   /// constructArrayTypeDIE - Construct array type DIE from DICompositeType.
-  void constructArrayTypeDIE(DIE &Buffer,
-                             DICompositeType *CTy);
+  void constructArrayTypeDIE(DIE &Buffer, DICompositeType CTy);
 
   /// constructEnumTypeDIE - Construct enum type DIE from DIEnumerator.
-  DIE *constructEnumTypeDIE(DIEnumerator ETy);
+  void constructEnumTypeDIE(DIE &Buffer, DICompositeType CTy);
 
-  /// constructContainingTypeDIEs - Construct DIEs for types that contain
-  /// vtables.
-  void constructContainingTypeDIEs();
+  /// constructMemberDIE - Construct member DIE from DIDerivedType.
+  void constructMemberDIE(DIE &Buffer, DIDerivedType DT);
 
-  /// constructVariableDIE - Construct a DIE for the given DbgVariable.
-  DIE *constructVariableDIE(DbgVariable *DV, bool isScopeAbstract);
+  /// constructTemplateTypeParameterDIE - Construct new DIE for the given
+  /// DITemplateTypeParameter.
+  void constructTemplateTypeParameterDIE(DIE &Buffer,
+                                         DITemplateTypeParameter TP);
 
-  /// createMemberDIE - Create new member DIE.
-  DIE *createMemberDIE(DIDerivedType DT);
+  /// constructTemplateValueParameterDIE - Construct new DIE for the given
+  /// DITemplateValueParameter.
+  void constructTemplateValueParameterDIE(DIE &Buffer,
+                                          DITemplateValueParameter TVP);
 
-  /// createStaticMemberDIE - Create new static data member DIE.
-  DIE *createStaticMemberDIE(DIDerivedType DT);
+  /// getOrCreateStaticMemberDIE - Create new static data member DIE.
+  DIE *getOrCreateStaticMemberDIE(DIDerivedType DT);
 
-  /// getOrCreateContextDIE - Get context owner's DIE.
-  DIE *getOrCreateContextDIE(DIDescriptor Context);
+  /// Offset of the CUDie from beginning of debug info section.
+  unsigned DebugInfoOffset;
 
-private:
+  /// getLowerBoundDefault - Return the default lower bound for an array. If the
+  /// DWARF version doesn't handle the language, return -1.
+  int64_t getDefaultLowerBound() const;
 
-  // DIEValueAllocator - All DIEValues are allocated through this allocator.
-  BumpPtrAllocator DIEValueAllocator;
-  DIEInteger *DIEIntegerOne;
+  /// getDIEEntry - Returns the debug information entry for the specified
+  /// debug variable.
+  DIEEntry *getDIEEntry(const MDNode *N) const {
+    return MDNodeToDIEEntryMap.lookup(N);
+  }
+
+  /// insertDIEEntry - Insert debug information entry into the map.
+  void insertDIEEntry(const MDNode *N, DIEEntry *E) {
+    MDNodeToDIEEntryMap.insert(std::make_pair(N, E));
+  }
+
+  // getIndexTyDie - Get an anonymous type for index type.
+  DIE *getIndexTyDie() { return IndexTyDie; }
+
+  // setIndexTyDie - Set D as anonymous type for index which can be reused
+  // later.
+  void setIndexTyDie(DIE *D) { IndexTyDie = D; }
+
+  /// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
+  /// information entry.
+  DIEEntry *createDIEEntry(DIE *Entry);
+
+  /// resolve - Look in the DwarfDebug map for the MDNode that
+  /// corresponds to the reference.
+  template <typename T> T resolve(DIRef<T> Ref) const {
+    return DD->resolve(Ref);
+  }
 };
 
 } // end llvm namespace
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 73bba69..24e2c05 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -14,6 +14,7 @@
 #define DEBUG_TYPE "dwarfdebug"
 #include "DwarfDebug.h"
 #include "DIE.h"
+#include "DIEHash.h"
 #include "DwarfAccelTable.h"
 #include "DwarfCompileUnit.h"
 #include "llvm/ADT/STLExtras.h"
@@ -34,8 +35,10 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MD5.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Support/ValueHandle.h"
@@ -46,61 +49,64 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
-static cl::opt<bool> DisableDebugInfoPrinting("disable-debug-info-print",
-                                              cl::Hidden,
-     cl::desc("Disable debug info printing"));
+static cl::opt<bool>
+DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden,
+                         cl::desc("Disable debug info printing"));
 
-static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden,
-     cl::desc("Make an absence of debug location information explicit."),
-     cl::init(false));
+static cl::opt<bool> UnknownLocations(
+    "use-unknown-locations", cl::Hidden,
+    cl::desc("Make an absence of debug location information explicit."),
+    cl::init(false));
 
-static cl::opt<bool> GenerateDwarfPubNamesSection("generate-dwarf-pubnames",
-     cl::Hidden, cl::init(false),
-     cl::desc("Generate DWARF pubnames section"));
+static cl::opt<bool>
+GenerateODRHash("generate-odr-hash", cl::Hidden,
+                cl::desc("Add an ODR hash to external type DIEs."),
+                cl::init(false));
 
-namespace {
-  enum DefaultOnOff {
-    Default, Enable, Disable
-  };
-}
-
-static cl::opt<DefaultOnOff> DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
-     cl::desc("Output prototype dwarf accelerator tables."),
-     cl::values(
-                clEnumVal(Default, "Default for platform"),
-                clEnumVal(Enable, "Enabled"),
-                clEnumVal(Disable, "Disabled"),
-                clEnumValEnd),
-     cl::init(Default));
-
-static cl::opt<DefaultOnOff> DarwinGDBCompat("darwin-gdb-compat", cl::Hidden,
-     cl::desc("Compatibility with Darwin gdb."),
-     cl::values(
-                clEnumVal(Default, "Default for platform"),
-                clEnumVal(Enable, "Enabled"),
-                clEnumVal(Disable, "Disabled"),
-                clEnumValEnd),
-     cl::init(Default));
-
-static cl::opt<DefaultOnOff> SplitDwarf("split-dwarf", cl::Hidden,
-     cl::desc("Output prototype dwarf split debug info."),
-     cl::values(
-                clEnumVal(Default, "Default for platform"),
-                clEnumVal(Enable, "Enabled"),
-                clEnumVal(Disable, "Disabled"),
-                clEnumValEnd),
-     cl::init(Default));
+static cl::opt<bool>
+GenerateCUHash("generate-cu-hash", cl::Hidden,
+               cl::desc("Add the CU hash as the dwo_id."),
+               cl::init(false));
 
-namespace {
-  const char *DWARFGroupName = "DWARF Emission";
-  const char *DbgTimerName = "DWARF Debug Writer";
+static cl::opt<bool>
+GenerateGnuPubSections("generate-gnu-dwarf-pub-sections", cl::Hidden,
+                       cl::desc("Generate GNU-style pubnames and pubtypes"),
+                       cl::init(false));
 
-  struct CompareFirst {
-    template <typename T> bool operator()(const T &lhs, const T &rhs) const {
-      return lhs.first < rhs.first;
-    }
-  };
-} // end anonymous namespace
+namespace {
+enum DefaultOnOff {
+  Default,
+  Enable,
+  Disable
+};
+}
+
+static cl::opt<DefaultOnOff>
+DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
+                 cl::desc("Output prototype dwarf accelerator tables."),
+                 cl::values(clEnumVal(Default, "Default for platform"),
+                            clEnumVal(Enable, "Enabled"),
+                            clEnumVal(Disable, "Disabled"), clEnumValEnd),
+                 cl::init(Default));
+
+static cl::opt<DefaultOnOff>
+SplitDwarf("split-dwarf", cl::Hidden,
+           cl::desc("Output prototype dwarf split debug info."),
+           cl::values(clEnumVal(Default, "Default for platform"),
+                      clEnumVal(Enable, "Enabled"),
+                      clEnumVal(Disable, "Disabled"), clEnumValEnd),
+           cl::init(Default));
+
+static cl::opt<DefaultOnOff>
+DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden,
+                 cl::desc("Generate DWARF pubnames and pubtypes sections"),
+                 cl::values(clEnumVal(Default, "Default for platform"),
+                            clEnumVal(Enable, "Enabled"),
+                            clEnumVal(Disable, "Disabled"), clEnumValEnd),
+                 cl::init(Default));
+
+static const char *const DWARFGroupName = "DWARF Emission";
+static const char *const DbgTimerName = "DWARF Debug Writer";
 
 //===----------------------------------------------------------------------===//
 
@@ -110,6 +116,13 @@ static const unsigned InitAbbreviationsSetSize = 9; // log2(512)
 
 namespace llvm {
 
+/// resolve - Look in the DwarfDebug map for the MDNode that
+/// corresponds to the reference.
+template <typename T>
+T DbgVariable::resolve(DIRef<T> Ref) const {
+  return DD->resolve(Ref);
+}
+
 DIType DbgVariable::getType() const {
   DIType Ty = Var.getType();
   // FIXME: isBlockByrefVariable should be reformulated in terms of complex
@@ -140,21 +153,16 @@ DIType DbgVariable::getType() const {
        the pointers and __Block_byref_x_VarName struct to find the actual
        value of the variable.  The function addBlockByrefType does this.  */
     DIType subType = Ty;
-    unsigned tag = Ty.getTag();
-
-    if (tag == dwarf::DW_TAG_pointer_type) {
-      DIDerivedType DTy = DIDerivedType(Ty);
-      subType = DTy.getTypeDerivedFrom();
-    }
+    uint16_t tag = Ty.getTag();
 
-    DICompositeType blockStruct = DICompositeType(subType);
-    DIArray Elements = blockStruct.getTypeArray();
+    if (tag == dwarf::DW_TAG_pointer_type)
+      subType = resolve(DIDerivedType(Ty).getTypeDerivedFrom());
 
+    DIArray Elements = DICompositeType(subType).getTypeArray();
     for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
-      DIDescriptor Element = Elements.getElement(i);
-      DIDerivedType DT = DIDerivedType(Element);
+      DIDerivedType DT(Elements.getElement(i));
       if (getName() == DT.getName())
-        return (DT.getTypeDerivedFrom());
+        return (resolve(DT.getTypeDerivedFrom()));
     }
   }
   return Ty;
@@ -162,15 +170,23 @@ DIType DbgVariable::getType() const {
 
 } // end llvm namespace
 
+/// Return Dwarf Version by checking module flags.
+static unsigned getDwarfVersionFromModule(const Module *M) {
+  Value *Val = M->getModuleFlag("Dwarf Version");
+  if (!Val)
+    return dwarf::DWARF_VERSION;
+  return cast<ConstantInt>(Val)->getZExtValue();
+}
+
 DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
   : Asm(A), MMI(Asm->MMI), FirstCU(0),
     AbbreviationsSet(InitAbbreviationsSetSize),
     SourceIdMap(DIEValueAllocator),
     PrevLabel(NULL), GlobalCUIndexCount(0),
-    InfoHolder(A, &AbbreviationsSet, &Abbreviations, "info_string",
+    InfoHolder(A, &AbbreviationsSet, Abbreviations, "info_string",
                DIEValueAllocator),
     SkeletonAbbrevSet(InitAbbreviationsSetSize),
-    SkeletonHolder(A, &SkeletonAbbrevSet, &SkeletonAbbrevs, "skel_string",
+    SkeletonHolder(A, &SkeletonAbbrevSet, SkeletonAbbrevs, "skel_string",
                    DIEValueAllocator) {
 
   DwarfInfoSectionSym = DwarfAbbrevSectionSym = 0;
@@ -180,37 +196,32 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
   DwarfAbbrevDWOSectionSym = DwarfStrDWOSectionSym = 0;
   FunctionBeginSym = FunctionEndSym = 0;
 
-  // Turn on accelerator tables and older gdb compatibility
-  // for Darwin.
+  // Turn on accelerator tables for Darwin by default, pubnames by
+  // default for non-Darwin, and handle split dwarf.
   bool IsDarwin = Triple(A->getTargetTriple()).isOSDarwin();
-  if (DarwinGDBCompat == Default) {
-    if (IsDarwin)
-      IsDarwinGDBCompat = true;
-    else
-      IsDarwinGDBCompat = false;
-  } else
-    IsDarwinGDBCompat = DarwinGDBCompat == Enable ? true : false;
 
-  if (DwarfAccelTables == Default) {
-    if (IsDarwin)
-      HasDwarfAccelTables = true;
-    else
-      HasDwarfAccelTables = false;
-  } else
-    HasDwarfAccelTables = DwarfAccelTables == Enable ? true : false;
+  if (DwarfAccelTables == Default)
+    HasDwarfAccelTables = IsDarwin;
+  else
+    HasDwarfAccelTables = DwarfAccelTables == Enable;
 
   if (SplitDwarf == Default)
     HasSplitDwarf = false;
   else
-    HasSplitDwarf = SplitDwarf == Enable ? true : false;
+    HasSplitDwarf = SplitDwarf == Enable;
+
+  if (DwarfPubSections == Default)
+    HasDwarfPubSections = !IsDarwin;
+  else
+    HasDwarfPubSections = DwarfPubSections == Enable;
+
+  DwarfVersion = getDwarfVersionFromModule(MMI->getModule());
 
   {
     NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
     beginModule();
   }
 }
-DwarfDebug::~DwarfDebug() {
-}
 
 // Switch to the specified MCSection and emit an assembler
 // temporary label to it if SymbolStem is specified.
@@ -247,48 +258,37 @@ unsigned DwarfUnits::getStringPoolIndex(StringRef Str) {
   return Entry.second;
 }
 
-unsigned DwarfUnits::getAddrPoolIndex(MCSymbol *Sym) {
-  std::pair<MCSymbol*, unsigned> &Entry = AddressPool[Sym];
-  if (Entry.first) return Entry.second;
+unsigned DwarfUnits::getAddrPoolIndex(const MCSymbol *Sym) {
+  return getAddrPoolIndex(MCSymbolRefExpr::Create(Sym, Asm->OutContext));
+}
 
-  Entry.second = NextAddrPoolNumber++;
-  Entry.first = Sym;
-  return Entry.second;
+unsigned DwarfUnits::getAddrPoolIndex(const MCExpr *Sym) {
+  std::pair<DenseMap<const MCExpr *, unsigned>::iterator, bool> P =
+      AddressPool.insert(std::make_pair(Sym, NextAddrPoolNumber));
+  if (P.second)
+    ++NextAddrPoolNumber;
+  return P.first->second;
 }
 
 // Define a unique number for the abbreviation.
 //
 void DwarfUnits::assignAbbrevNumber(DIEAbbrev &Abbrev) {
-  // Profile the node so that we can make it unique.
-  FoldingSetNodeID ID;
-  Abbrev.Profile(ID);
-
   // Check the set for priors.
   DIEAbbrev *InSet = AbbreviationsSet->GetOrInsertNode(&Abbrev);
 
   // If it's newly added.
   if (InSet == &Abbrev) {
     // Add to abbreviation list.
-    Abbreviations->push_back(&Abbrev);
+    Abbreviations.push_back(&Abbrev);
 
     // Assign the vector position + 1 as its number.
-    Abbrev.setNumber(Abbreviations->size());
+    Abbrev.setNumber(Abbreviations.size());
   } else {
     // Assign existing abbreviation number.
     Abbrev.setNumber(InSet->getNumber());
   }
 }
 
-// If special LLVM prefix that is used to inform the asm
-// printer to not emit usual symbol prefix before the symbol name is used then
-// return linkage name after skipping this special LLVM prefix.
-static StringRef getRealLinkageName(StringRef LinkageName) {
-  char One = '\1';
-  if (LinkageName.startswith(StringRef(&One, 1)))
-    return LinkageName.substr(1);
-  return LinkageName;
-}
-
 static bool isObjCClass(StringRef Name) {
   return Name.startswith("+") || Name.startswith("-");
 }
@@ -296,12 +296,7 @@ static bool isObjCClass(StringRef Name) {
 static bool hasObjCCategory(StringRef Name) {
   if (!isObjCClass(Name)) return false;
 
-  size_t pos = Name.find(')');
-  if (pos != std::string::npos) {
-    if (Name[pos+1] != ' ') return false;
-    return true;
-  }
-  return false;
+  return Name.find(") ") != StringRef::npos;
 }
 
 static void getObjCClassCategory(StringRef In, StringRef &Class,
@@ -321,11 +316,20 @@ static StringRef getObjCMethodName(StringRef In) {
   return In.slice(In.find(' ') + 1, In.find(']'));
 }
 
+// Helper for sorting sections into a stable output order.
+static bool SectionSort(const MCSection *A, const MCSection *B) {
+    std::string LA = (A ? A->getLabelBeginName() : "");
+    std::string LB = (B ? B->getLabelBeginName() : "");
+    return LA < LB;
+}
+
 // Add the various names to the Dwarf accelerator table names.
+// TODO: Determine whether or not we should add names for programs
+// that do not have a DW_AT_name or DW_AT_linkage_name field - this
+// is only slightly different than the lookup of non-standard ObjC names.
 static void addSubprogramNames(CompileUnit *TheCU, DISubprogram SP,
                                DIE* Die) {
   if (!SP.isDefinition()) return;
-
   TheCU->addAccelName(SP.getName(), Die);
 
   // If the linkage name is different than the name, go ahead and output
@@ -346,30 +350,34 @@ static void addSubprogramNames(CompileUnit *TheCU, DISubprogram SP,
   }
 }
 
+/// isSubprogramContext - Return true if Context is either a subprogram
+/// or another context nested inside a subprogram.
+bool DwarfDebug::isSubprogramContext(const MDNode *Context) {
+  if (!Context)
+    return false;
+  DIDescriptor D(Context);
+  if (D.isSubprogram())
+    return true;
+  if (D.isType())
+    return isSubprogramContext(resolve(DIType(Context).getContext()));
+  return false;
+}
+
 // Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
 // and DW_AT_high_pc attributes. If there are global variables in this
 // scope then create and insert DIEs for these variables.
-DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
-                                          const MDNode *SPNode) {
-  DIE *SPDie = SPCU->getDIE(SPNode);
+DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU, DISubprogram SP) {
+  DIE *SPDie = SPCU->getDIE(SP);
 
   assert(SPDie && "Unable to find subprogram DIE!");
-  DISubprogram SP(SPNode);
 
   // If we're updating an abstract DIE, then we will be adding the children and
   // object pointer later on. But what we don't want to do is process the
   // concrete DIE twice.
-  DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode);
-  if (AbsSPDIE) {
-    bool InSameCU = (AbsSPDIE->getCompileUnit() == SPCU->getCUDie());
+  if (DIE *AbsSPDIE = AbstractSPDies.lookup(SP)) {
     // Pick up abstract subprogram DIE.
-    SPDie = new DIE(dwarf::DW_TAG_subprogram);
-    // If AbsSPDIE belongs to a different CU, use DW_FORM_ref_addr instead of
-    // DW_FORM_ref4.
-    SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
-                      InSameCU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr,
-                      AbsSPDIE);
-    SPCU->addDie(SPDie);
+    SPDie = SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *SPCU->getCUDie());
+    SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin, AbsSPDIE);
   } else {
     DISubprogram SPDecl = SP.getFunctionDeclaration();
     if (!SPDecl.isSubprogram()) {
@@ -378,32 +386,31 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
       // function then gdb prefers the definition at top level and but does not
       // expect specification DIE in parent function. So avoid creating
       // specification DIE for a function defined inside a function.
-      if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
-          !SP.getContext().isFile() &&
-          !isSubprogramContext(SP.getContext())) {
+      DIScope SPContext = resolve(SP.getContext());
+      if (SP.isDefinition() && !SPContext.isCompileUnit() &&
+          !SPContext.isFile() &&
+          !isSubprogramContext(SPContext)) {
         SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
 
         // Add arguments.
         DICompositeType SPTy = SP.getType();
         DIArray Args = SPTy.getTypeArray();
-        unsigned SPTag = SPTy.getTag();
+        uint16_t SPTag = SPTy.getTag();
         if (SPTag == dwarf::DW_TAG_subroutine_type)
           for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
-            DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
-            DIType ATy = DIType(Args.getElement(i));
+            DIE *Arg =
+                SPCU->createAndAddDIE(dwarf::DW_TAG_formal_parameter, *SPDie);
+            DIType ATy(Args.getElement(i));
             SPCU->addType(Arg, ATy);
             if (ATy.isArtificial())
               SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
             if (ATy.isObjectPointer())
-              SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer,
-                                dwarf::DW_FORM_ref4, Arg);
-            SPDie->addChild(Arg);
+              SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer, Arg);
           }
         DIE *SPDeclDie = SPDie;
-        SPDie = new DIE(dwarf::DW_TAG_subprogram);
-        SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification,
-                          dwarf::DW_FORM_ref4, SPDeclDie);
-        SPCU->addDie(SPDie);
+        SPDie =
+            SPCU->createAndAddDIE(dwarf::DW_TAG_subprogram, *SPCU->getCUDie());
+        SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, SPDeclDie);
       }
     }
   }
@@ -425,19 +432,40 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
   return SPDie;
 }
 
+/// Check whether we should create a DIE for the given Scope, return true
+/// if we don't create a DIE (the corresponding DIE is null).
+bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) {
+  if (Scope->isAbstractScope())
+    return false;
+
+  // We don't create a DIE if there is no Range.
+  const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
+  if (Ranges.empty())
+    return true;
+
+  if (Ranges.size() > 1)
+    return false;
+
+  // We don't create a DIE if we have a single Range and the end label
+  // is null.
+  SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
+  MCSymbol *End = getLabelAfterInsn(RI->second);
+  return !End;
+}
+
 // Construct new DW_TAG_lexical_block for this scope and attach
 // DW_AT_low_pc/DW_AT_high_pc labels.
 DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
                                           LexicalScope *Scope) {
+  if (isLexicalScopeDIENull(Scope))
+    return 0;
+
   DIE *ScopeDIE = new DIE(dwarf::DW_TAG_lexical_block);
   if (Scope->isAbstractScope())
     return ScopeDIE;
 
-  const SmallVector<InsnRange, 4> &Ranges = Scope->getRanges();
-  if (Ranges.empty())
-    return 0;
-
-  SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin();
+  const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
+  // If we have multiple ranges, emit them into the range section.
   if (Ranges.size() > 1) {
     // .debug_range section has not been laid out yet. Emit offset in
     // .debug_range as a uint, size 4, for now. emitDIE will handle
@@ -445,20 +473,23 @@ DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
     TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
                    DebugRangeSymbols.size()
                    * Asm->getDataLayout().getPointerSize());
-    for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
+    for (SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin(),
          RE = Ranges.end(); RI != RE; ++RI) {
       DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
       DebugRangeSymbols.push_back(getLabelAfterInsn(RI->second));
     }
+
+    // Terminate the range list.
     DebugRangeSymbols.push_back(NULL);
     DebugRangeSymbols.push_back(NULL);
     return ScopeDIE;
   }
 
+  // Construct the address range for this DIE.
+  SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
   MCSymbol *Start = getLabelBeforeInsn(RI->first);
   MCSymbol *End = getLabelAfterInsn(RI->second);
-
-  if (End == 0) return 0;
+  assert(End && "End label should not be null!");
 
   assert(Start->isDefined() && "Invalid starting label for an inlined scope!");
   assert(End->isDefined() && "Invalid end label for an inlined scope!");
@@ -473,7 +504,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
 // represent this concrete inlined copy of the function.
 DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
                                           LexicalScope *Scope) {
-  const SmallVector<InsnRange, 4> &Ranges = Scope->getRanges();
+  const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
   assert(Ranges.empty() == false &&
          "LexicalScope does not have instruction markers!");
 
@@ -487,21 +518,8 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
     return NULL;
   }
 
-  SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin();
-  MCSymbol *StartLabel = getLabelBeforeInsn(RI->first);
-  MCSymbol *EndLabel = getLabelAfterInsn(RI->second);
-
-  if (StartLabel == 0 || EndLabel == 0) {
-    llvm_unreachable("Unexpected Start and End labels for an inlined scope!");
-  }
-  assert(StartLabel->isDefined() &&
-         "Invalid starting label for an inlined scope!");
-  assert(EndLabel->isDefined() &&
-         "Invalid end label for an inlined scope!");
-
   DIE *ScopeDIE = new DIE(dwarf::DW_TAG_inlined_subroutine);
-  TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin,
-                     dwarf::DW_FORM_ref4, OriginDIE);
+  TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_abstract_origin, OriginDIE);
 
   if (Ranges.size() > 1) {
     // .debug_range section has not been laid out yet. Emit offset in
@@ -510,7 +528,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
     TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
                    DebugRangeSymbols.size()
                    * Asm->getDataLayout().getPointerSize());
-    for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
+    for (SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin(),
          RE = Ranges.end(); RI != RE; ++RI) {
       DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
       DebugRangeSymbols.push_back(getLabelAfterInsn(RI->second));
@@ -518,31 +536,29 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
     DebugRangeSymbols.push_back(NULL);
     DebugRangeSymbols.push_back(NULL);
   } else {
+    SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin();
+    MCSymbol *StartLabel = getLabelBeforeInsn(RI->first);
+    MCSymbol *EndLabel = getLabelAfterInsn(RI->second);
+
+    if (StartLabel == 0 || EndLabel == 0)
+      llvm_unreachable("Unexpected Start and End labels for an inlined scope!");
+
+    assert(StartLabel->isDefined() &&
+           "Invalid starting label for an inlined scope!");
+    assert(EndLabel->isDefined() && "Invalid end label for an inlined scope!");
+
     TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_low_pc, StartLabel);
     TheCU->addLabelAddress(ScopeDIE, dwarf::DW_AT_high_pc, EndLabel);
   }
 
   InlinedSubprogramDIEs.insert(OriginDIE);
 
-  // Track the start label for this inlined function.
-  //.debug_inlined section specification does not clearly state how
-  // to emit inlined scope that is split into multiple instruction ranges.
-  // For now, use first instruction range and emit low_pc/high_pc pair and
-  // corresponding .debug_inlined section entry for this pair.
-  DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> >::iterator
-    I = InlineInfo.find(InlinedSP);
-
-  if (I == InlineInfo.end()) {
-    InlineInfo[InlinedSP].push_back(std::make_pair(StartLabel, ScopeDIE));
-    InlinedSPNodes.push_back(InlinedSP);
-  } else
-    I->second.push_back(std::make_pair(StartLabel, ScopeDIE));
-
+  // Add the call site information to the DIE.
   DILocation DL(Scope->getInlinedAt());
-  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, 0,
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_file, None,
                  getOrCreateSourceID(DL.getFilename(), DL.getDirectory(),
                                      TheCU->getUniqueID()));
-  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, 0, DL.getLineNumber());
+  TheCU->addUInt(ScopeDIE, dwarf::DW_AT_call_line, None, DL.getLineNumber());
 
   // Add name to the name table, we do this here because we're guaranteed
   // to have concrete versions of our DW_TAG_inlined_subprogram nodes.
@@ -551,42 +567,49 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
   return ScopeDIE;
 }
 
-// Construct a DIE for this scope.
-DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
-  if (!Scope || !Scope->getScopeNode())
-    return NULL;
-
-  DIScope DS(Scope->getScopeNode());
-  // Early return to avoid creating dangling variable|scope DIEs.
-  if (!Scope->getInlinedAt() && DS.isSubprogram() && Scope->isAbstractScope() &&
-      !TheCU->getDIE(DS))
-    return NULL;
-
-  SmallVector<DIE *, 8> Children;
-  DIE *ObjectPointer = NULL;
+DIE *DwarfDebug::createScopeChildrenDIE(CompileUnit *TheCU, LexicalScope *Scope,
+                                        SmallVectorImpl<DIE*> &Children) {
+    DIE *ObjectPointer = NULL;
 
   // Collect arguments for current function.
   if (LScopes.isCurrentFunctionScope(Scope))
     for (unsigned i = 0, N = CurrentFnArguments.size(); i < N; ++i)
       if (DbgVariable *ArgDV = CurrentFnArguments[i])
         if (DIE *Arg =
-            TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope())) {
+            TheCU->constructVariableDIE(*ArgDV, Scope->isAbstractScope())) {
           Children.push_back(Arg);
           if (ArgDV->isObjectPointer()) ObjectPointer = Arg;
         }
 
   // Collect lexical scope children first.
-  const SmallVector<DbgVariable *, 8> &Variables = ScopeVariables.lookup(Scope);
+  const SmallVectorImpl<DbgVariable *> &Variables =ScopeVariables.lookup(Scope);
   for (unsigned i = 0, N = Variables.size(); i < N; ++i)
     if (DIE *Variable =
-        TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope())) {
+        TheCU->constructVariableDIE(*Variables[i], Scope->isAbstractScope())) {
       Children.push_back(Variable);
       if (Variables[i]->isObjectPointer()) ObjectPointer = Variable;
     }
-  const SmallVector<LexicalScope *, 4> &Scopes = Scope->getChildren();
+  const SmallVectorImpl<LexicalScope *> &Scopes = Scope->getChildren();
   for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
     if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
       Children.push_back(Nested);
+  return ObjectPointer;
+}
+
+// Construct a DIE for this scope.
+DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
+  if (!Scope || !Scope->getScopeNode())
+    return NULL;
+
+  DIScope DS(Scope->getScopeNode());
+
+  SmallVector<DIE *, 8> Children;
+  DIE *ObjectPointer = NULL;
+  bool ChildrenCreated = false;
+
+  // We try to create the scope DIE first, then the children DIEs. This will
+  // avoid creating un-used children then removing them later when we find out
+  // the scope DIE is null.
   DIE *ScopeDIE = NULL;
   if (Scope->getInlinedAt())
     ScopeDIE = constructInlinedScopeDIE(TheCU, Scope);
@@ -597,34 +620,49 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
       // Note down abstract DIE.
       if (ScopeDIE)
         AbstractSPDies.insert(std::make_pair(DS, ScopeDIE));
-    }
-    else
-      ScopeDIE = updateSubprogramScopeDIE(TheCU, DS);
-  }
-  else {
+    } else
+      ScopeDIE = updateSubprogramScopeDIE(TheCU, DISubprogram(DS));
+  } else {
+    // Early exit when we know the scope DIE is going to be null.
+    if (isLexicalScopeDIENull(Scope))
+      return NULL;
+
+    // We create children here when we know the scope DIE is not going to be
+    // null and the children will be added to the scope DIE.
+    ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children);
+    ChildrenCreated = true;
+
     // There is no need to emit empty lexical block DIE.
     std::pair<ImportedEntityMap::const_iterator,
               ImportedEntityMap::const_iterator> Range = std::equal_range(
         ScopesWithImportedEntities.begin(), ScopesWithImportedEntities.end(),
         std::pair<const MDNode *, const MDNode *>(DS, (const MDNode*)0),
-        CompareFirst());
+        less_first());
     if (Children.empty() && Range.first == Range.second)
       return NULL;
     ScopeDIE = constructLexicalScopeDIE(TheCU, Scope);
-    for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second; ++i)
-      constructImportedModuleDIE(TheCU, i->second, ScopeDIE);
+    assert(ScopeDIE && "Scope DIE should not be null.");
+    for (ImportedEntityMap::const_iterator i = Range.first; i != Range.second;
+         ++i)
+      constructImportedEntityDIE(TheCU, i->second, ScopeDIE);
   }
 
-  if (!ScopeDIE) return NULL;
+  if (!ScopeDIE) {
+    assert(Children.empty() &&
+           "We create children only when the scope DIE is not null.");
+    return NULL;
+  }
+  if (!ChildrenCreated)
+    // We create children when the scope DIE is not null.
+    ObjectPointer = createScopeChildrenDIE(TheCU, Scope, Children);
 
   // Add children
-  for (SmallVector<DIE *, 8>::iterator I = Children.begin(),
+  for (SmallVectorImpl<DIE *>::iterator I = Children.begin(),
          E = Children.end(); I != E; ++I)
     ScopeDIE->addChild(*I);
 
   if (DS.isSubprogram() && ObjectPointer != NULL)
-    TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer,
-                       dwarf::DW_FORM_ref4, ObjectPointer);
+    TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, ObjectPointer);
 
   if (DS.isSubprogram())
     TheCU->addPubTypes(DISubprogram(DS));
@@ -640,8 +678,10 @@ unsigned DwarfDebug::getOrCreateSourceID(StringRef FileName,
                                          StringRef DirName, unsigned CUID) {
   // If we use .loc in assembly, we can't separate .file entries according to
   // compile units. Thus all files will belong to the default compile unit.
-  if (Asm->TM.hasMCUseLoc() &&
-      Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+
+  // FIXME: add a better feature test than hasRawTextSupport. Even better,
+  // extend .file to support this.
+  if (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport())
     CUID = 0;
 
   // If FE did not provide a file name, then assume stdin.
@@ -676,14 +716,12 @@ unsigned DwarfDebug::getOrCreateSourceID(StringRef FileName,
 
 // Create new CompileUnit for the given metadata node with tag
 // DW_TAG_compile_unit.
-CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
-  DICompileUnit DIUnit(N);
+CompileUnit *DwarfDebug::constructCompileUnit(DICompileUnit DIUnit) {
   StringRef FN = DIUnit.getFilename();
   CompilationDir = DIUnit.getDirectory();
 
   DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
-  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
-                                       DIUnit.getLanguage(), Die, Asm,
+  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++, Die, DIUnit, Asm,
                                        this, &InfoHolder);
 
   FileIDCUMap[NewCU->getUniqueID()] = 0;
@@ -710,31 +748,57 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
 
   // Use a single line table if we are using .loc and generating assembly.
   bool UseTheFirstCU =
-    (Asm->TM.hasMCUseLoc() &&
-     Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) ||
-    (NewCU->getUniqueID() == 0);
+      (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport()) ||
+      (NewCU->getUniqueID() == 0);
 
-  // DW_AT_stmt_list is a offset of line number information for this
-  // compile unit in debug_line section. For split dwarf this is
-  // left in the skeleton CU and so not included.
-  // The line table entries are not always emitted in assembly, so it
-  // is not okay to use line_table_start here.
   if (!useSplitDwarf()) {
+    // DW_AT_stmt_list is a offset of line number information for this
+    // compile unit in debug_line section. For split dwarf this is
+    // left in the skeleton CU and so not included.
+    // The line table entries are not always emitted in assembly, so it
+    // is not okay to use line_table_start here.
     if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
-      NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
-                      UseTheFirstCU ?
-                      Asm->GetTempSymbol("section_line") : LineTableStartSym);
+      NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_sec_offset,
+                      UseTheFirstCU ? Asm->GetTempSymbol("section_line")
+                                    : LineTableStartSym);
     else if (UseTheFirstCU)
       NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
     else
       NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
                       LineTableStartSym, DwarfLineSectionSym);
+
+    // If we're using split dwarf the compilation dir is going to be in the
+    // skeleton CU and so we don't need to duplicate it here.
+    if (!CompilationDir.empty())
+      NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+
+    // Flags to let the linker know we have emitted new style pubnames. Only
+    // emit it here if we don't have a skeleton CU for split dwarf.
+    if (GenerateGnuPubSections) {
+      if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+        NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubnames,
+                        dwarf::DW_FORM_sec_offset,
+                        Asm->GetTempSymbol("gnu_pubnames",
+                                           NewCU->getUniqueID()));
+      else
+        NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_data4,
+                        Asm->GetTempSymbol("gnu_pubnames",
+                                           NewCU->getUniqueID()),
+                        DwarfGnuPubNamesSectionSym);
+
+      if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+        NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubtypes,
+                        dwarf::DW_FORM_sec_offset,
+                        Asm->GetTempSymbol("gnu_pubtypes",
+                                           NewCU->getUniqueID()));
+      else
+        NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_data4,
+                        Asm->GetTempSymbol("gnu_pubtypes",
+                                           NewCU->getUniqueID()),
+                        DwarfGnuPubTypesSectionSym);
+    }
   }
 
-  // If we're using split dwarf the compilation dir is going to be in the
-  // skeleton CU and so we don't need to duplicate it here.
-  if (!useSplitDwarf() && !CompilationDir.empty())
-    NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
   if (DIUnit.isOptimized())
     NewCU->addFlag(Die, dwarf::DW_AT_APPLE_optimized);
 
@@ -751,13 +815,17 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
 
   InfoHolder.addUnit(NewCU);
 
-  CUMap.insert(std::make_pair(N, NewCU));
+  CUMap.insert(std::make_pair(DIUnit, NewCU));
+  CUDieMap.insert(std::make_pair(Die, NewCU));
   return NewCU;
 }
 
 // Construct subprogram DIE.
-void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
-                                        const MDNode *N) {
+void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N) {
+  // FIXME: We should only call this routine once, however, during LTO if a
+  // program is defined in multiple CUs we could end up calling it out of
+  // beginModule as we walk the CUs.
+
   CompileUnit *&CURef = SPMap[N];
   if (CURef)
     return;
@@ -771,49 +839,54 @@ void DwarfDebug::constructSubprogramDIE(CompileUnit *TheCU,
 
   DIE *SubprogramDie = TheCU->getOrCreateSubprogramDIE(SP);
 
-  // Add to map.
-  TheCU->insertDIE(N, SubprogramDie);
-
-  // Add to context owner.
-  TheCU->addToContextOwner(SubprogramDie, SP.getContext());
-
-  // Expose as global, if requested.
-  if (GenerateDwarfPubNamesSection)
-    TheCU->addGlobalName(SP.getName(), SubprogramDie);
+  // Expose as a global name.
+  TheCU->addGlobalName(SP.getName(), SubprogramDie, resolve(SP.getContext()));
 }
 
-void DwarfDebug::constructImportedModuleDIE(CompileUnit *TheCU,
+void DwarfDebug::constructImportedEntityDIE(CompileUnit *TheCU,
                                             const MDNode *N) {
-  DIImportedModule Module(N);
+  DIImportedEntity Module(N);
   if (!Module.Verify())
     return;
   if (DIE *D = TheCU->getOrCreateContextDIE(Module.getContext()))
-    constructImportedModuleDIE(TheCU, Module, D);
+    constructImportedEntityDIE(TheCU, Module, D);
 }
 
-void DwarfDebug::constructImportedModuleDIE(CompileUnit *TheCU, const MDNode *N,
+void DwarfDebug::constructImportedEntityDIE(CompileUnit *TheCU, const MDNode *N,
                                             DIE *Context) {
-  DIImportedModule Module(N);
+  DIImportedEntity Module(N);
   if (!Module.Verify())
     return;
-  return constructImportedModuleDIE(TheCU, Module, Context);
+  return constructImportedEntityDIE(TheCU, Module, Context);
 }
 
-void DwarfDebug::constructImportedModuleDIE(CompileUnit *TheCU,
-                                            const DIImportedModule &Module,
+void DwarfDebug::constructImportedEntityDIE(CompileUnit *TheCU,
+                                            const DIImportedEntity &Module,
                                             DIE *Context) {
   assert(Module.Verify() &&
          "Use one of the MDNode * overloads to handle invalid metadata");
   assert(Context && "Should always have a context for an imported_module");
-  DIE *IMDie = new DIE(dwarf::DW_TAG_imported_module);
+  DIE *IMDie = new DIE(Module.getTag());
   TheCU->insertDIE(Module, IMDie);
-  DIE *NSDie = TheCU->getOrCreateNameSpace(Module.getNameSpace());
+  DIE *EntityDie;
+  DIDescriptor Entity = Module.getEntity();
+  if (Entity.isNameSpace())
+    EntityDie = TheCU->getOrCreateNameSpace(DINameSpace(Entity));
+  else if (Entity.isSubprogram())
+    EntityDie = TheCU->getOrCreateSubprogramDIE(DISubprogram(Entity));
+  else if (Entity.isType())
+    EntityDie = TheCU->getOrCreateTypeDIE(DIType(Entity));
+  else
+    EntityDie = TheCU->getDIE(Entity);
   unsigned FileID = getOrCreateSourceID(Module.getContext().getFilename(),
                                         Module.getContext().getDirectory(),
                                         TheCU->getUniqueID());
-  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_file, 0, FileID);
-  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_line, 0, Module.getLineNumber());
-  TheCU->addDIEEntry(IMDie, dwarf::DW_AT_import, dwarf::DW_FORM_ref4, NSDie);
+  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_file, None, FileID);
+  TheCU->addUInt(IMDie, dwarf::DW_AT_decl_line, None, Module.getLineNumber());
+  TheCU->addDIEEntry(IMDie, dwarf::DW_AT_import, EntityDie);
+  StringRef Name = Module.getName();
+  if (!Name.empty())
+    TheCU->addString(IMDie, dwarf::DW_AT_name, Name);
   Context->addChild(IMDie);
 }
 
@@ -831,6 +904,7 @@ void DwarfDebug::beginModule() {
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
   if (!CU_Nodes)
     return;
+  TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
 
   // Emit initial sections so we can reference labels later.
   emitSectionLabels();
@@ -838,16 +912,16 @@ void DwarfDebug::beginModule() {
   for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
     DICompileUnit CUNode(CU_Nodes->getOperand(i));
     CompileUnit *CU = constructCompileUnit(CUNode);
-    DIArray ImportedModules = CUNode.getImportedModules();
-    for (unsigned i = 0, e = ImportedModules.getNumElements(); i != e; ++i)
+    DIArray ImportedEntities = CUNode.getImportedEntities();
+    for (unsigned i = 0, e = ImportedEntities.getNumElements(); i != e; ++i)
       ScopesWithImportedEntities.push_back(std::make_pair(
-          DIImportedModule(ImportedModules.getElement(i)).getContext(),
-          ImportedModules.getElement(i)));
+          DIImportedEntity(ImportedEntities.getElement(i)).getContext(),
+          ImportedEntities.getElement(i)));
     std::sort(ScopesWithImportedEntities.begin(),
-              ScopesWithImportedEntities.end(), CompareFirst());
+              ScopesWithImportedEntities.end(), less_first());
     DIArray GVs = CUNode.getGlobalVariables();
     for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i)
-      CU->createGlobalVariableDIE(GVs.getElement(i));
+      CU->createGlobalVariableDIE(DIGlobalVariable(GVs.getElement(i)));
     DIArray SPs = CUNode.getSubprograms();
     for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
       constructSubprogramDIE(CU, SPs.getElement(i));
@@ -859,24 +933,15 @@ void DwarfDebug::beginModule() {
       CU->getOrCreateTypeDIE(RetainedTypes.getElement(i));
     // Emit imported_modules last so that the relevant context is already
     // available.
-    for (unsigned i = 0, e = ImportedModules.getNumElements(); i != e; ++i)
-      constructImportedModuleDIE(CU, ImportedModules.getElement(i));
-    // If we're splitting the dwarf out now that we've got the entire
-    // CU then construct a skeleton CU based upon it.
-    if (useSplitDwarf()) {
-      // This should be a unique identifier when we want to build .dwp files.
-      CU->addUInt(CU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
-                  dwarf::DW_FORM_data8, 0);
-      // Now construct the skeleton CU associated.
-      constructSkeletonCU(CUNode);
-    }
+    for (unsigned i = 0, e = ImportedEntities.getNumElements(); i != e; ++i)
+      constructImportedEntityDIE(CU, ImportedEntities.getElement(i));
   }
 
   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
 
   // Prime section data.
-  SectionMap.insert(Asm->getObjFileLowering().getTextSection());
+  SectionMap[Asm->getObjFileLowering().getTextSection()];
 }
 
 // Attach DW_AT_inline attribute with inlined subprogram DIEs.
@@ -885,21 +950,20 @@ void DwarfDebug::computeInlinedDIEs() {
   for (SmallPtrSet<DIE *, 4>::iterator AI = InlinedSubprogramDIEs.begin(),
          AE = InlinedSubprogramDIEs.end(); AI != AE; ++AI) {
     DIE *ISP = *AI;
-    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
   }
   for (DenseMap<const MDNode *, DIE *>::iterator AI = AbstractSPDies.begin(),
          AE = AbstractSPDies.end(); AI != AE; ++AI) {
     DIE *ISP = AI->second;
     if (InlinedSubprogramDIEs.count(ISP))
       continue;
-    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, 0, dwarf::DW_INL_inlined);
+    FirstCU->addUInt(ISP, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
   }
 }
 
 // Collect info for variables that were optimized out.
 void DwarfDebug::collectDeadVariables() {
   const Module *M = MMI->getModule();
-  DenseMap<const MDNode *, LexicalScope *> DeadFnScopeMap;
 
   if (NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu")) {
     for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
@@ -907,33 +971,70 @@ void DwarfDebug::collectDeadVariables() {
       DIArray Subprograms = TheCU.getSubprograms();
       for (unsigned i = 0, e = Subprograms.getNumElements(); i != e; ++i) {
         DISubprogram SP(Subprograms.getElement(i));
-        if (ProcessedSPNodes.count(SP) != 0) continue;
-        if (!SP.Verify()) continue;
-        if (!SP.isDefinition()) continue;
+        if (ProcessedSPNodes.count(SP) != 0)
+          continue;
+        if (!SP.isSubprogram())
+          continue;
+        if (!SP.isDefinition())
+          continue;
         DIArray Variables = SP.getVariables();
-        if (Variables.getNumElements() == 0) continue;
-
-        LexicalScope *Scope =
-          new LexicalScope(NULL, DIDescriptor(SP), NULL, false);
-        DeadFnScopeMap[SP] = Scope;
+        if (Variables.getNumElements() == 0)
+          continue;
 
         // Construct subprogram DIE and add variables DIEs.
         CompileUnit *SPCU = CUMap.lookup(TheCU);
         assert(SPCU && "Unable to find Compile Unit!");
+        // FIXME: See the comment in constructSubprogramDIE about duplicate
+        // subprogram DIEs.
         constructSubprogramDIE(SPCU, SP);
-        DIE *ScopeDIE = SPCU->getDIE(SP);
+        DIE *SPDIE = SPCU->getDIE(SP);
         for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
           DIVariable DV(Variables.getElement(vi));
-          if (!DV.Verify()) continue;
-          DbgVariable *NewVar = new DbgVariable(DV, NULL);
+          if (!DV.isVariable())
+            continue;
+          DbgVariable NewVar(DV, NULL, this);
           if (DIE *VariableDIE =
-              SPCU->constructVariableDIE(NewVar, Scope->isAbstractScope()))
-            ScopeDIE->addChild(VariableDIE);
+                  SPCU->constructVariableDIE(NewVar, false))
+            SPDIE->addChild(VariableDIE);
         }
       }
     }
   }
-  DeleteContainerSeconds(DeadFnScopeMap);
+}
+
+// Type Signature [7.27] and ODR Hash code.
+
+/// \brief Grabs the string in whichever attribute is passed in and returns
+/// a reference to it. Returns "" if the attribute doesn't exist.
+static StringRef getDIEStringAttr(DIE *Die, unsigned Attr) {
+  DIEValue *V = Die->findAttribute(Attr);
+
+  if (DIEString *S = dyn_cast_or_null<DIEString>(V))
+    return S->getString();
+
+  return StringRef("");
+}
+
+/// Return true if the current DIE is contained within an anonymous namespace.
+static bool isContainedInAnonNamespace(DIE *Die) {
+  DIE *Parent = Die->getParent();
+
+  while (Parent) {
+    if (Parent->getTag() == dwarf::DW_TAG_namespace &&
+        getDIEStringAttr(Parent, dwarf::DW_AT_name) == "")
+      return true;
+    Parent = Parent->getParent();
+  }
+
+  return false;
+}
+
+/// Test if the current CU language is C++ and that we have
+/// a named type that is not contained in an anonymous namespace.
+static bool shouldAddODRHash(CompileUnit *CU, DIE *Die) {
+  return CU->getLanguage() == dwarf::DW_LANG_C_plus_plus &&
+         getDIEStringAttr(Die, dwarf::DW_AT_name) != "" &&
+         !isContainedInAnonNamespace(Die);
 }
 
 void DwarfDebug::finalizeModuleInfo() {
@@ -943,31 +1044,102 @@ void DwarfDebug::finalizeModuleInfo() {
   // Attach DW_AT_inline attribute with inlined subprogram DIEs.
   computeInlinedDIEs();
 
-  // Emit DW_AT_containing_type attribute to connect types with their
-  // vtable holding type.
+  // Split out type units and conditionally add an ODR tag to the split
+  // out type.
+  // FIXME: Do type splitting.
+  for (unsigned i = 0, e = TypeUnits.size(); i != e; ++i) {
+    DIE *Die = TypeUnits[i];
+    DIEHash Hash;
+    // If we've requested ODR hashes and it's applicable for an ODR hash then
+    // add the ODR signature now.
+    // FIXME: This should be added onto the type unit, not the type, but this
+    // works as an intermediate stage.
+    if (GenerateODRHash && shouldAddODRHash(CUMap.begin()->second, Die))
+      CUMap.begin()->second->addUInt(Die, dwarf::DW_AT_GNU_odr_signature,
+                                     dwarf::DW_FORM_data8,
+                                     Hash.computeDIEODRSignature(*Die));
+  }
+
+  // Handle anything that needs to be done on a per-cu basis.
   for (DenseMap<const MDNode *, CompileUnit *>::iterator CUI = CUMap.begin(),
-         CUE = CUMap.end(); CUI != CUE; ++CUI) {
+                                                         CUE = CUMap.end();
+       CUI != CUE; ++CUI) {
     CompileUnit *TheCU = CUI->second;
+    // Emit DW_AT_containing_type attribute to connect types with their
+    // vtable holding type.
     TheCU->constructContainingTypeDIEs();
+
+    // If we're splitting the dwarf out now that we've got the entire
+    // CU then construct a skeleton CU based upon it.
+    if (useSplitDwarf()) {
+      uint64_t ID = 0;
+      if (GenerateCUHash) {
+        DIEHash CUHash;
+        ID = CUHash.computeCUSignature(*TheCU->getCUDie());
+      }
+      // This should be a unique identifier when we want to build .dwp files.
+      TheCU->addUInt(TheCU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
+                     dwarf::DW_FORM_data8, ID);
+      // Now construct the skeleton CU associated.
+      CompileUnit *SkCU = constructSkeletonCU(TheCU);
+      // This should be a unique identifier when we want to build .dwp files.
+      SkCU->addUInt(SkCU->getCUDie(), dwarf::DW_AT_GNU_dwo_id,
+                    dwarf::DW_FORM_data8, ID);
+    }
   }
 
-   // Compute DIE offsets and sizes.
+  // Compute DIE offsets and sizes.
   InfoHolder.computeSizeAndOffsets();
   if (useSplitDwarf())
     SkeletonHolder.computeSizeAndOffsets();
 }
 
 void DwarfDebug::endSections() {
-  // Standard sections final addresses.
-  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getTextSection());
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("text_end"));
-  Asm->OutStreamer.SwitchSection(Asm->getObjFileLowering().getDataSection());
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("data_end"));
+   // Filter labels by section.
+  for (size_t n = 0; n < ArangeLabels.size(); n++) {
+    const SymbolCU &SCU = ArangeLabels[n];
+    if (SCU.Sym->isInSection()) {
+      // Make a note of this symbol and it's section.
+      const MCSection *Section = &SCU.Sym->getSection();
+      if (!Section->getKind().isMetadata())
+        SectionMap[Section].push_back(SCU);
+    } else {
+      // Some symbols (e.g. common/bss on mach-o) can have no section but still
+      // appear in the output. This sucks as we rely on sections to build
+      // arange spans. We can do it without, but it's icky.
+      SectionMap[NULL].push_back(SCU);
+    }
+  }
+
+  // Build a list of sections used.
+  std::vector<const MCSection *> Sections;
+  for (SectionMapType::iterator it = SectionMap.begin(); it != SectionMap.end();
+       it++) {
+    const MCSection *Section = it->first;
+    Sections.push_back(Section);
+  }
+
+  // Sort the sections into order.
+  // This is only done to ensure consistent output order across different runs.
+  std::sort(Sections.begin(), Sections.end(), SectionSort);
+
+  // Add terminating symbols for each section.
+  for (unsigned ID=0;ID<Sections.size();ID++) {
+    const MCSection *Section = Sections[ID];
+    MCSymbol *Sym = NULL;
+
+    if (Section) {
+      // We can't call MCSection::getLabelEndName, as it's only safe to do so
+      // if we know the section name up-front. For user-created sections, the resulting
+      // label may not be valid to use as a label. (section names can use a greater
+      // set of characters on some systems)
+      Sym = Asm->GetTempSymbol("debug_end", ID);
+      Asm->OutStreamer.SwitchSection(Section);
+      Asm->OutStreamer.EmitLabel(Sym);
+    }
 
-  // End text sections.
-  for (unsigned I = 0, E = SectionMap.size(); I != E; ++I) {
-    Asm->OutStreamer.SwitchSection(SectionMap[I]);
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", I+1));
+    // Insert a final terminator.
+    SectionMap[Section].push_back(SymbolCU(NULL, Sym));
   }
 }
 
@@ -984,6 +1156,8 @@ void DwarfDebug::endModule() {
   finalizeModuleInfo();
 
   if (!useSplitDwarf()) {
+    emitDebugStr();
+
     // Emit all the DIEs into a debug info section.
     emitDebugInfo();
 
@@ -1002,15 +1176,12 @@ void DwarfDebug::endModule() {
     // Emit info into a debug macinfo section.
     emitDebugMacInfo();
 
-    // Emit inline info.
-    // TODO: When we don't need the option anymore we
-    // can remove all of the code that this section
-    // depends upon.
-    if (useDarwinGDBCompat())
-      emitDebugInlineInfo();
   } else {
     // TODO: Fill this in for separated debug sections and separate
     // out information into new sections.
+    emitDebugStr();
+    if (useSplitDwarf())
+      emitDebugStrDWO();
 
     // Emit the debug info section and compile units.
     emitDebugInfo();
@@ -1035,12 +1206,6 @@ void DwarfDebug::endModule() {
     // Emit DWO addresses.
     InfoHolder.emitAddresses(Asm->getObjFileLowering().getDwarfAddrSection());
 
-    // Emit inline info.
-    // TODO: When we don't need the option anymore we
-    // can remove all of the code that this section
-    // depends upon.
-    if (useDarwinGDBCompat())
-      emitDebugInlineInfo();
   }
 
   // Emit info into the dwarf accelerator table sections.
@@ -1051,20 +1216,11 @@ void DwarfDebug::endModule() {
     emitAccelTypes();
   }
 
-  // Emit info into a debug pubnames section, if requested.
-  if (GenerateDwarfPubNamesSection)
-    emitDebugPubnames();
-
-  // Emit info into a debug pubtypes section.
-  // TODO: When we don't need the option anymore we can
-  // remove all of the code that adds to the table.
-  if (useDarwinGDBCompat())
-    emitDebugPubTypes();
-
-  // Finally emit string information into a string table.
-  emitDebugStr();
-  if (useSplitDwarf())
-    emitDebugStrDWO();
+  // Emit the pubnames and pubtypes sections if requested.
+  if (HasDwarfPubSections) {
+    emitDebugPubNames(GenerateGnuPubSections);
+    emitDebugPubTypes(GenerateGnuPubSections);
+  }
 
   // clean up.
   SPMap.clear();
@@ -1072,7 +1228,7 @@ void DwarfDebug::endModule() {
          E = CUMap.end(); I != E; ++I)
     delete I->second;
 
-  for (SmallVector<CompileUnit *, 1>::iterator I = SkeletonCUs.begin(),
+  for (SmallVectorImpl<CompileUnit *>::iterator I = SkeletonCUs.begin(),
          E = SkeletonCUs.end(); I != E; ++I)
     delete *I;
 
@@ -1094,7 +1250,7 @@ DbgVariable *DwarfDebug::findAbstractVariable(DIVariable &DV,
   if (!Scope)
     return NULL;
 
-  AbsDbgVariable = new DbgVariable(Var, NULL);
+  AbsDbgVariable = new DbgVariable(Var, NULL, this);
   addScopeVariable(Scope, AbsDbgVariable);
   AbstractVariables[Var] = AbsDbgVariable;
   return AbsDbgVariable;
@@ -1143,7 +1299,7 @@ DwarfDebug::collectVariableInfoFromMMITable(const MachineFunction *MF,
       continue;
 
     DbgVariable *AbsDbgVariable = findAbstractVariable(DV, VP.second);
-    DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable);
+    DbgVariable *RegVar = new DbgVariable(DV, AbsDbgVariable, this);
     RegVar->setFrameIndex(VP.first);
     if (!addCurrentFnArgument(MF, RegVar, Scope))
       addScopeVariable(Scope, RegVar);
@@ -1158,7 +1314,8 @@ static bool isDbgValueInDefinedReg(const MachineInstr *MI) {
   assert(MI->isDebugValue() && "Invalid DBG_VALUE machine instruction!");
   return MI->getNumOperands() == 3 &&
          MI->getOperand(0).isReg() && MI->getOperand(0).getReg() &&
-         MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0;
+         (MI->getOperand(1).isImm() ||
+          (MI->getOperand(1).isReg() && MI->getOperand(1).getReg() == 0U));
 }
 
 // Get .debug_loc entry for the instruction range starting at MI.
@@ -1168,16 +1325,12 @@ static DotDebugLocEntry getDebugLocEntry(AsmPrinter *Asm,
                                          const MachineInstr *MI) {
   const MDNode *Var =  MI->getOperand(MI->getNumOperands() - 1).getMetadata();
 
-  if (MI->getNumOperands() != 3) {
-    MachineLocation MLoc = Asm->getDebugValueLocation(MI);
-    return DotDebugLocEntry(FLabel, SLabel, MLoc, Var);
-  }
-  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm()) {
+  assert(MI->getNumOperands() == 3);
+  if (MI->getOperand(0).isReg()) {
     MachineLocation MLoc;
-    // TODO: Currently an offset of 0 in a DBG_VALUE means
-    // we need to generate a direct register value.
-    // There is no way to specify an indirect value with offset 0.
-    if (MI->getOperand(1).getImm() == 0)
+    // If the second operand is an immediate, this is a
+    // register-indirect address.
+    if (!MI->getOperand(1).isImm())
       MLoc.set(MI->getOperand(0).getReg());
     else
       MLoc.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
@@ -1198,7 +1351,7 @@ void
 DwarfDebug::collectVariableInfo(const MachineFunction *MF,
                                 SmallPtrSet<const MDNode *, 16> &Processed) {
 
-  // collection info from MMI table.
+  // Grab the variable info that was squirreled away in the MMI side-table.
   collectVariableInfoFromMMITable(MF, Processed);
 
   for (SmallVectorImpl<const MDNode*>::const_iterator
@@ -1231,7 +1384,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
     Processed.insert(DV);
     assert(MInsn->isDebugValue() && "History must begin with debug value");
     DbgVariable *AbsVar = findAbstractVariable(DV, MInsn->getDebugLoc());
-    DbgVariable *RegVar = new DbgVariable(DV, AbsVar);
+    DbgVariable *RegVar = new DbgVariable(DV, AbsVar, this);
     if (!addCurrentFnArgument(MF, RegVar, Scope))
       addScopeVariable(Scope, RegVar);
     if (AbsVar)
@@ -1291,10 +1444,10 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
   DIArray Variables = DISubprogram(FnScope->getScopeNode()).getVariables();
   for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
     DIVariable DV(Variables.getElement(i));
-    if (!DV || !DV.Verify() || !Processed.insert(DV))
+    if (!DV || !DV.isVariable() || !Processed.insert(DV))
       continue;
     if (LexicalScope *Scope = LScopes.findLexicalScope(DV.getContext()))
-      addScopeVariable(Scope, new DbgVariable(DV, NULL));
+      addScopeVariable(Scope, new DbgVariable(DV, NULL, this));
   }
 }
 
@@ -1388,19 +1541,19 @@ void DwarfDebug::identifyScopeMarkers() {
   while (!WorkList.empty()) {
     LexicalScope *S = WorkList.pop_back_val();
 
-    const SmallVector<LexicalScope *, 4> &Children = S->getChildren();
+    const SmallVectorImpl<LexicalScope *> &Children = S->getChildren();
     if (!Children.empty())
-      for (SmallVector<LexicalScope *, 4>::const_iterator SI = Children.begin(),
+      for (SmallVectorImpl<LexicalScope *>::const_iterator SI = Children.begin(),
              SE = Children.end(); SI != SE; ++SI)
         WorkList.push_back(*SI);
 
     if (S->isAbstractScope())
       continue;
 
-    const SmallVector<InsnRange, 4> &Ranges = S->getRanges();
+    const SmallVectorImpl<InsnRange> &Ranges = S->getRanges();
     if (Ranges.empty())
       continue;
-    for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
+    for (SmallVectorImpl<InsnRange>::const_iterator RI = Ranges.begin(),
            RE = Ranges.end(); RI != RE; ++RI) {
       assert(RI->first && "InsnRange does not have first instruction!");
       assert(RI->second && "InsnRange does not have second instruction!");
@@ -1422,7 +1575,7 @@ static MDNode *getScopeNode(DebugLoc DL, const LLVMContext &Ctx) {
 static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
   const MDNode *Scope = getScopeNode(DL, Ctx);
   DISubprogram SP = getDISubprogram(Scope);
-  if (SP.Verify()) {
+  if (SP.isSubprogram()) {
     // Check for number of operands since the compatibility is
     // cheap here.
     if (SP->getNumOperands() > 19)
@@ -1437,36 +1590,45 @@ static DebugLoc getFnDebugLoc(DebugLoc DL, const LLVMContext &Ctx) {
 // Gather pre-function debug information.  Assumes being called immediately
 // after the function entry point has been emitted.
 void DwarfDebug::beginFunction(const MachineFunction *MF) {
-  if (!MMI->hasDebugInfo()) return;
+
+  // If there's no debug info for the function we're not going to do anything.
+  if (!MMI->hasDebugInfo())
+    return;
+
+  // Grab the lexical scopes for the function, if we don't have any of those
+  // then we're not going to be able to do anything.
   LScopes.initialize(*MF);
-  if (LScopes.empty()) return;
+  if (LScopes.empty())
+    return;
+
+  assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
+
+  // Make sure that each lexical scope will have a begin/end label.
   identifyScopeMarkers();
 
   // Set DwarfCompileUnitID in MCContext to the Compile Unit this function
-  // belongs to.
+  // belongs to so that we add to the correct per-cu line table in the
+  // non-asm case.
   LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
   CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
   assert(TheCU && "Unable to find compile unit!");
-  if (Asm->TM.hasMCUseLoc() &&
-      Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+  if (Asm->TM.hasMCUseLoc() && Asm->OutStreamer.hasRawTextSupport())
     // Use a single line table if we are using .loc and generating assembly.
     Asm->OutStreamer.getContext().setDwarfCompileUnitID(0);
   else
     Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
 
-  FunctionBeginSym = Asm->GetTempSymbol("func_begin",
-                                        Asm->getFunctionNumber());
+  // Emit a label for the function so that we have a beginning address.
+  FunctionBeginSym = Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber());
   // Assumes in correct section after the entry point.
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);
 
-  assert(UserVariables.empty() && DbgValues.empty() && "Maps weren't cleaned");
-
   const TargetRegisterInfo *TRI = Asm->TM.getRegisterInfo();
   // LiveUserVar - Map physreg numbers to the MDNode they contain.
-  std::vector<const MDNode*> LiveUserVar(TRI->getNumRegs());
+  std::vector<const MDNode *> LiveUserVar(TRI->getNumRegs());
 
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-       I != E; ++I) {
+  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end(); I != E;
+       ++I) {
     bool AtBlockEntry = true;
     for (MachineBasicBlock::const_iterator II = I->begin(), IE = I->end();
          II != IE; ++II) {
@@ -1477,22 +1639,21 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
         // Keep track of user variables.
         const MDNode *Var =
-          MI->getOperand(MI->getNumOperands() - 1).getMetadata();
+            MI->getOperand(MI->getNumOperands() - 1).getMetadata();
 
         // Variable is in a register, we need to check for clobbers.
         if (isDbgValueInDefinedReg(MI))
           LiveUserVar[MI->getOperand(0).getReg()] = Var;
 
         // Check the history of this variable.
-        SmallVectorImpl<const MachineInstr*> &History = DbgValues[Var];
+        SmallVectorImpl<const MachineInstr *> &History = DbgValues[Var];
         if (History.empty()) {
           UserVariables.push_back(Var);
           // The first mention of a function argument gets the FunctionBeginSym
           // label, so arguments are visible when breaking at function entry.
           DIVariable DV(Var);
-          if (DV.Verify() && DV.getTag() == dwarf::DW_TAG_arg_variable &&
-              DISubprogram(getDISubprogram(DV.getContext()))
-                .describes(MF->getFunction()))
+          if (DV.isVariable() && DV.getTag() == dwarf::DW_TAG_arg_variable &&
+              getDISubprogram(DV.getContext()).describes(MF->getFunction()))
             LabelsBeforeInsn[MI] = FunctionBeginSym;
         } else {
           // We have seen this variable before. Try to coalesce DBG_VALUEs.
@@ -1502,8 +1663,8 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
             if (History.size() >= 2 &&
                 Prev->isIdenticalTo(History[History.size() - 2])) {
               DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
-                    << "\t" << *Prev
-                    << "\t" << *History[History.size() - 2] << "\n");
+                           << "\t" << *Prev << "\t"
+                           << *History[History.size() - 2] << "\n");
               History.pop_back();
             }
 
@@ -1514,17 +1675,15 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
               // Previous register assignment needs to terminate at the end of
               // its basic block.
               MachineBasicBlock::const_iterator LastMI =
-                PrevMBB->getLastNonDebugInstr();
+                  PrevMBB->getLastNonDebugInstr();
               if (LastMI == PrevMBB->end()) {
                 // Drop DBG_VALUE for empty range.
                 DEBUG(dbgs() << "Dropping DBG_VALUE for empty range:\n"
-                      << "\t" << *Prev << "\n");
+                             << "\t" << *Prev << "\n");
                 History.pop_back();
-              }
-              else {
+              } else if (llvm::next(PrevMBB) != PrevMBB->getParent()->end())
                 // Terminate after LastMI.
                 History.push_back(LastMI);
-              }
             }
           }
         }
@@ -1542,11 +1701,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
         // Check if the instruction clobbers any registers with debug vars.
         for (MachineInstr::const_mop_iterator MOI = MI->operands_begin(),
-               MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+                                              MOE = MI->operands_end();
+             MOI != MOE; ++MOI) {
           if (!MOI->isReg() || !MOI->isDef() || !MOI->getReg())
             continue;
-          for (MCRegAliasIterator AI(MOI->getReg(), TRI, true);
-               AI.isValid(); ++AI) {
+          for (MCRegAliasIterator AI(MOI->getReg(), TRI, true); AI.isValid();
+               ++AI) {
             unsigned Reg = *AI;
             const MDNode *Var = LiveUserVar[Reg];
             if (!Var)
@@ -1558,7 +1718,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
             DbgValueHistoryMap::iterator HistI = DbgValues.find(Var);
             if (HistI == DbgValues.end())
               continue;
-            SmallVectorImpl<const MachineInstr*> &History = HistI->second;
+            SmallVectorImpl<const MachineInstr *> &History = HistI->second;
             if (History.empty())
               continue;
             const MachineInstr *Prev = History.back();
@@ -1580,7 +1740,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
   for (DbgValueHistoryMap::iterator I = DbgValues.begin(), E = DbgValues.end();
        I != E; ++I) {
-    SmallVectorImpl<const MachineInstr*> &History = I->second;
+    SmallVectorImpl<const MachineInstr *> &History = I->second;
     if (History.empty())
       continue;
 
@@ -1589,11 +1749,11 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
     if (Prev->isDebugValue() && isDbgValueInDefinedReg(Prev)) {
       const MachineBasicBlock *PrevMBB = Prev->getParent();
       MachineBasicBlock::const_iterator LastMI =
-        PrevMBB->getLastNonDebugInstr();
+          PrevMBB->getLastNonDebugInstr();
       if (LastMI == PrevMBB->end())
         // Drop DBG_VALUE for empty range.
         History.pop_back();
-      else {
+      else if (PrevMBB != &PrevMBB->getParent()->back()) {
         // Terminate after LastMI.
         History.push_back(LastMI);
       }
@@ -1613,20 +1773,46 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
 
   // Record beginning of function.
   if (!PrologEndLoc.isUnknown()) {
-    DebugLoc FnStartDL = getFnDebugLoc(PrologEndLoc,
-                                       MF->getFunction()->getContext());
-    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
-                     FnStartDL.getScope(MF->getFunction()->getContext()),
-    // We'd like to list the prologue as "not statements" but GDB behaves
-    // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
-                     DWARF2_FLAG_IS_STMT);
+    DebugLoc FnStartDL =
+        getFnDebugLoc(PrologEndLoc, MF->getFunction()->getContext());
+    recordSourceLine(
+        FnStartDL.getLine(), FnStartDL.getCol(),
+        FnStartDL.getScope(MF->getFunction()->getContext()),
+        // We'd like to list the prologue as "not statements" but GDB behaves
+        // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+        DWARF2_FLAG_IS_STMT);
   }
 }
 
 void DwarfDebug::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
-//  SmallVector<DbgVariable *, 8> &Vars = ScopeVariables.lookup(LS);
-  ScopeVariables[LS].push_back(Var);
-//  Vars.push_back(Var);
+  SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
+  DIVariable DV = Var->getVariable();
+  // Variables with positive arg numbers are parameters.
+  if (unsigned ArgNum = DV.getArgNumber()) {
+    // Keep all parameters in order at the start of the variable list to ensure
+    // function types are correct (no out-of-order parameters)
+    //
+    // This could be improved by only doing it for optimized builds (unoptimized
+    // builds have the right order to begin with), searching from the back (this
+    // would catch the unoptimized case quickly), or doing a binary search
+    // rather than linear search.
+    SmallVectorImpl<DbgVariable *>::iterator I = Vars.begin();
+    while (I != Vars.end()) {
+      unsigned CurNum = (*I)->getVariable().getArgNumber();
+      // A local (non-parameter) variable has been found, insert immediately
+      // before it.
+      if (CurNum == 0)
+        break;
+      // A later indexed parameter has been found, insert immediately before it.
+      if (CurNum > ArgNum)
+        break;
+      ++I;
+    }
+    Vars.insert(I, Var);
+    return;
+  }
+
+  Vars.push_back(Var);
 }
 
 // Gather and emit post-function debug information.
@@ -1653,12 +1839,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
   for (unsigned i = 0, e = AList.size(); i != e; ++i) {
     LexicalScope *AScope = AList[i];
     DISubprogram SP(AScope->getScopeNode());
-    if (SP.Verify()) {
+    if (SP.isSubprogram()) {
       // Collect info for variables that were optimized out.
       DIArray Variables = SP.getVariables();
       for (unsigned i = 0, e = Variables.getNumElements(); i != e; ++i) {
         DIVariable DV(Variables.getElement(i));
-        if (!DV || !DV.Verify() || !ProcessedVars.insert(DV))
+        if (!DV || !DV.isVariable() || !ProcessedVars.insert(DV))
           continue;
         // Check that DbgVariable for DV wasn't created earlier, when
         // findAbstractVariable() was called for inlined instance of DV.
@@ -1667,7 +1853,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
         if (AbstractVariables.lookup(CleanDV))
           continue;
         if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext()))
-          addScopeVariable(Scope, new DbgVariable(DV, NULL));
+          addScopeVariable(Scope, new DbgVariable(DV, NULL, this));
       }
     }
     if (ProcessedSPNodes.count(AScope->getScopeNode()) == 0)
@@ -1679,11 +1865,8 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
   if (!MF->getTarget().Options.DisableFramePointerElim(*MF))
     TheCU->addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr);
 
-  DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
-                                               MMI->getFrameMoves()));
-
   // Clear debug info
-  for (DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> >::iterator
+  for (ScopeVariablesMap::iterator
          I = ScopeVariables.begin(), E = ScopeVariables.end(); I != E; ++I)
     DeleteContainerPointers(I->second);
   ScopeVariables.clear();
@@ -1739,7 +1922,8 @@ void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
 // Emit Methods
 //===----------------------------------------------------------------------===//
 
-// Compute the size and offset of a DIE.
+// Compute the size and offset of a DIE. The offset is relative to start of the
+// CU. It returns the offset after laying out the DIE.
 unsigned
 DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
   // Get the children.
@@ -1750,7 +1934,7 @@ DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
 
   // Get the abbreviation for this DIE.
   unsigned AbbrevNumber = Die->getAbbrevNumber();
-  const DIEAbbrev *Abbrev = Abbreviations->at(AbbrevNumber - 1);
+  const DIEAbbrev *Abbrev = Abbreviations[AbbrevNumber - 1];
 
   // Set DIE offset
   Die->setOffset(Offset);
@@ -1782,21 +1966,25 @@ DwarfUnits::computeSizeAndOffset(DIE *Die, unsigned Offset) {
   return Offset;
 }
 
-// Compute the size and offset of all the DIEs.
+// Compute the size and offset for each DIE.
 void DwarfUnits::computeSizeAndOffsets() {
-  // Offset from the beginning of debug info section.
-  unsigned AccuOffset = 0;
+  // Offset from the first CU in the debug info section is 0 initially.
+  unsigned SecOffset = 0;
+
+  // Iterate over each compile unit and set the size and offsets for each
+  // DIE within each compile unit. All offsets are CU relative.
   for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
          E = CUs.end(); I != E; ++I) {
-    (*I)->setDebugInfoOffset(AccuOffset);
-    unsigned Offset =
-      sizeof(int32_t) + // Length of Compilation Unit Info
-      sizeof(int16_t) + // DWARF version number
-      sizeof(int32_t) + // Offset Into Abbrev. Section
-      sizeof(int8_t);   // Pointer Size (in bytes)
+    (*I)->setDebugInfoOffset(SecOffset);
 
+    // CU-relative offset is reset to 0 here.
+    unsigned Offset = sizeof(int32_t) + // Length of Unit Info
+                      (*I)->getHeaderSize(); // Unit-specific headers
+
+    // EndOffset here is CU-relative, after laying out
+    // all of the CU DIE.
     unsigned EndOffset = computeSizeAndOffset((*I)->getCUDie(), Offset);
-    AccuOffset += EndOffset;
+    SecOffset += EndOffset;
   }
 }
 
@@ -1821,9 +2009,16 @@ void DwarfDebug::emitSectionLabels() {
   DwarfLineSectionSym =
     emitSectionSym(Asm, TLOF.getDwarfLineSection(), "section_line");
   emitSectionSym(Asm, TLOF.getDwarfLocSection());
-  if (GenerateDwarfPubNamesSection)
+  if (GenerateGnuPubSections) {
+    DwarfGnuPubNamesSectionSym =
+        emitSectionSym(Asm, TLOF.getDwarfGnuPubNamesSection());
+    DwarfGnuPubTypesSectionSym =
+        emitSectionSym(Asm, TLOF.getDwarfGnuPubTypesSection());
+  } else if (HasDwarfPubSections) {
     emitSectionSym(Asm, TLOF.getDwarfPubNamesSection());
-  emitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
+    emitSectionSym(Asm, TLOF.getDwarfPubTypesSection());
+  }
+
   DwarfStrSectionSym =
     emitSectionSym(Asm, TLOF.getDwarfStrSection(), "info_string");
   if (useSplitDwarf()) {
@@ -1843,10 +2038,10 @@ void DwarfDebug::emitSectionLabels() {
 }
 
 // Recursively emits a debug information entry.
-void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
+void DwarfDebug::emitDIE(DIE *Die, ArrayRef<DIEAbbrev *> Abbrevs) {
   // Get the abbreviation for this DIE.
   unsigned AbbrevNumber = Die->getAbbrevNumber();
-  const DIEAbbrev *Abbrev = Abbrevs->at(AbbrevNumber - 1);
+  const DIEAbbrev *Abbrev = Abbrevs[AbbrevNumber - 1];
 
   // Emit the code (index) for the abbreviation.
   if (Asm->isVerbose())
@@ -1861,26 +2056,44 @@ void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
 
   // Emit the DIE attribute values.
   for (unsigned i = 0, N = Values.size(); i < N; ++i) {
-    unsigned Attr = AbbrevData[i].getAttribute();
-    unsigned Form = AbbrevData[i].getForm();
+    dwarf::Attribute Attr = AbbrevData[i].getAttribute();
+    dwarf::Form Form = AbbrevData[i].getForm();
     assert(Form && "Too many attributes for DIE (check abbreviation)");
 
     if (Asm->isVerbose())
       Asm->OutStreamer.AddComment(dwarf::AttributeString(Attr));
 
     switch (Attr) {
-    case dwarf::DW_AT_abstract_origin: {
+    case dwarf::DW_AT_abstract_origin:
+    case dwarf::DW_AT_type:
+    case dwarf::DW_AT_friend:
+    case dwarf::DW_AT_specification:
+    case dwarf::DW_AT_import:
+    case dwarf::DW_AT_containing_type: {
       DIEEntry *E = cast<DIEEntry>(Values[i]);
       DIE *Origin = E->getEntry();
       unsigned Addr = Origin->getOffset();
       if (Form == dwarf::DW_FORM_ref_addr) {
+        assert(!useSplitDwarf() && "TODO: dwo files can't have relocations.");
         // For DW_FORM_ref_addr, output the offset from beginning of debug info
         // section. Origin->getOffset() returns the offset from start of the
         // compile unit.
-        DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
-        Addr += Holder.getCUOffset(Origin->getCompileUnit());
+        CompileUnit *CU = CUDieMap.lookup(Origin->getCompileUnit());
+        assert(CU && "CUDie should belong to a CU.");
+        Addr += CU->getDebugInfoOffset();
+        if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+          Asm->EmitLabelPlusOffset(DwarfInfoSectionSym, Addr,
+                                   DIEEntry::getRefAddrSize(Asm));
+        else
+          Asm->EmitLabelOffsetDifference(DwarfInfoSectionSym, Addr,
+                                         DwarfInfoSectionSym,
+                                         DIEEntry::getRefAddrSize(Asm));
+      } else {
+        // Make sure Origin belong to the same CU.
+        assert(Die->getCompileUnit() == Origin->getCompileUnit() &&
+               "The referenced DIE should belong to the same CU in ref4");
+        Asm->EmitInt32(Addr);
       }
-      Asm->EmitInt32(Addr);
       break;
     }
     case dwarf::DW_AT_ranges: {
@@ -1902,7 +2115,7 @@ void DwarfDebug::emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs) {
     case dwarf::DW_AT_location: {
       if (DIELabel *L = dyn_cast<DIELabel>(Values[i])) {
         if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
-          Asm->EmitLabelReference(L->getValue(), 4);
+          Asm->EmitSectionOffset(L->getValue(), DwarfDebugLocSectionSym);
         else
           Asm->EmitLabelDifference(L->getValue(), DwarfDebugLocSectionSym, 4);
       } else {
@@ -1956,20 +2169,10 @@ void DwarfUnits::emitUnits(DwarfDebug *DD,
                                     TheCU->getUniqueID()));
 
     // Emit size of content not including length itself
-    unsigned ContentSize = Die->getSize() +
-      sizeof(int16_t) + // DWARF version number
-      sizeof(int32_t) + // Offset Into Abbrev. Section
-      sizeof(int8_t);   // Pointer Size (in bytes)
+    Asm->OutStreamer.AddComment("Length of Unit");
+    Asm->EmitInt32(TheCU->getHeaderSize() + Die->getSize());
 
-    Asm->OutStreamer.AddComment("Length of Compilation Unit Info");
-    Asm->EmitInt32(ContentSize);
-    Asm->OutStreamer.AddComment("DWARF version number");
-    Asm->EmitInt16(dwarf::DWARF_VERSION);
-    Asm->OutStreamer.AddComment("Offset Into Abbrev. Section");
-    Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()),
-                           ASectionSym);
-    Asm->OutStreamer.AddComment("Address Size (in bytes)");
-    Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+    TheCU->emitHeader(ASection, ASectionSym);
 
     DD->emitDIE(Die, Abbreviations);
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(USection->getLabelEndName(),
@@ -1977,19 +2180,6 @@ void DwarfUnits::emitUnits(DwarfDebug *DD,
   }
 }
 
-/// For a given compile unit DIE, returns offset from beginning of debug info.
-unsigned DwarfUnits::getCUOffset(DIE *Die) {
-  assert(Die->getTag() == dwarf::DW_TAG_compile_unit  &&
-         "Input DIE should be compile unit in getCUOffset.");
-  for (SmallVectorImpl<CompileUnit *>::iterator I = CUs.begin(),
-       E = CUs.end(); I != E; ++I) {
-    CompileUnit *TheCU = *I;
-    if (TheCU->getCUDie() == Die)
-      return TheCU->getDebugInfoOffset();
-  }
-  llvm_unreachable("The compile unit DIE should belong to CUs in DwarfUnits.");
-}
-
 // Emit the debug info section.
 void DwarfDebug::emitDebugInfo() {
   DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
@@ -2063,7 +2253,7 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
 
 // Emit visible names into a hashed accelerator table section.
 void DwarfDebug::emitAccelNames() {
-  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                            dwarf::DW_FORM_data4));
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
@@ -2071,7 +2261,7 @@ void DwarfDebug::emitAccelNames() {
     const StringMap<std::vector<DIE*> > &Names = TheCU->getAccelNames();
     for (StringMap<std::vector<DIE*> >::const_iterator
            GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
-      const char *Name = GI->getKeyData();
+      StringRef Name = GI->getKey();
       const std::vector<DIE *> &Entities = GI->second;
       for (std::vector<DIE *>::const_iterator DI = Entities.begin(),
              DE = Entities.end(); DI != DE; ++DI)
@@ -2092,7 +2282,7 @@ void DwarfDebug::emitAccelNames() {
 // Emit objective C classes and categories into a hashed accelerator table
 // section.
 void DwarfDebug::emitAccelObjC() {
-  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                            dwarf::DW_FORM_data4));
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
@@ -2100,7 +2290,7 @@ void DwarfDebug::emitAccelObjC() {
     const StringMap<std::vector<DIE*> > &Names = TheCU->getAccelObjC();
     for (StringMap<std::vector<DIE*> >::const_iterator
            GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
-      const char *Name = GI->getKeyData();
+      StringRef Name = GI->getKey();
       const std::vector<DIE *> &Entities = GI->second;
       for (std::vector<DIE *>::const_iterator DI = Entities.begin(),
              DE = Entities.end(); DI != DE; ++DI)
@@ -2120,7 +2310,7 @@ void DwarfDebug::emitAccelObjC() {
 
 // Emit namespace dies into a hashed accelerator table.
 void DwarfDebug::emitAccelNamespaces() {
-  DwarfAccelTable AT(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  DwarfAccelTable AT(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                            dwarf::DW_FORM_data4));
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
          E = CUMap.end(); I != E; ++I) {
@@ -2128,7 +2318,7 @@ void DwarfDebug::emitAccelNamespaces() {
     const StringMap<std::vector<DIE*> > &Names = TheCU->getAccelNamespace();
     for (StringMap<std::vector<DIE*> >::const_iterator
            GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
-      const char *Name = GI->getKeyData();
+      StringRef Name = GI->getKey();
       const std::vector<DIE *> &Entities = GI->second;
       for (std::vector<DIE *>::const_iterator DI = Entities.begin(),
              DE = Entities.end(); DI != DE; ++DI)
@@ -2149,11 +2339,11 @@ void DwarfDebug::emitAccelNamespaces() {
 // Emit type dies into a hashed accelerator table.
 void DwarfDebug::emitAccelTypes() {
   std::vector<DwarfAccelTable::Atom> Atoms;
-  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeDIEOffset,
+  Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
                                         dwarf::DW_FORM_data4));
-  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTag,
+  Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag,
                                         dwarf::DW_FORM_data2));
-  Atoms.push_back(DwarfAccelTable::Atom(DwarfAccelTable::eAtomTypeTypeFlags,
+  Atoms.push_back(DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags,
                                         dwarf::DW_FORM_data1));
   DwarfAccelTable AT(Atoms);
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
@@ -2163,7 +2353,7 @@ void DwarfDebug::emitAccelTypes() {
       = TheCU->getAccelTypes();
     for (StringMap<std::vector<std::pair<DIE*, unsigned> > >::const_iterator
            GI = Names.begin(), GE = Names.end(); GI != GE; ++GI) {
-      const char *Name = GI->getKeyData();
+      StringRef Name = GI->getKey();
       const std::vector<std::pair<DIE *, unsigned> > &Entities = GI->second;
       for (std::vector<std::pair<DIE *, unsigned> >::const_iterator DI
              = Entities.begin(), DE = Entities.end(); DI !=DE; ++DI)
@@ -2181,23 +2371,85 @@ void DwarfDebug::emitAccelTypes() {
   AT.Emit(Asm, SectionBegin, &InfoHolder);
 }
 
-/// emitDebugPubnames - Emit visible names into a debug pubnames section.
+// Public name handling.
+// The format for the various pubnames:
+//
+// dwarf pubnames - offset/name pairs where the offset is the offset into the CU
+// for the DIE that is named.
+//
+// gnu pubnames - offset/index value/name tuples where the offset is the offset
+// into the CU and the index value is computed according to the type of value
+// for the DIE that is named.
+//
+// For type units the offset is the offset of the skeleton DIE. For split dwarf
+// it's the offset within the debug_info/debug_types dwo section, however, the
+// reference in the pubname header doesn't change.
+
+/// computeIndexValue - Compute the gdb index value for the DIE and CU.
+static dwarf::PubIndexEntryDescriptor computeIndexValue(CompileUnit *CU,
+                                                        DIE *Die) {
+  dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
+
+  // We could have a specification DIE that has our most of our knowledge,
+  // look for that now.
+  DIEValue *SpecVal = Die->findAttribute(dwarf::DW_AT_specification);
+  if (SpecVal) {
+    DIE *SpecDIE = cast<DIEEntry>(SpecVal)->getEntry();
+    if (SpecDIE->findAttribute(dwarf::DW_AT_external))
+      Linkage = dwarf::GIEL_EXTERNAL;
+  } else if (Die->findAttribute(dwarf::DW_AT_external))
+    Linkage = dwarf::GIEL_EXTERNAL;
+
+  switch (Die->getTag()) {
+  case dwarf::DW_TAG_class_type:
+  case dwarf::DW_TAG_structure_type:
+  case dwarf::DW_TAG_union_type:
+  case dwarf::DW_TAG_enumeration_type:
+    return dwarf::PubIndexEntryDescriptor(
+        dwarf::GIEK_TYPE, CU->getLanguage() != dwarf::DW_LANG_C_plus_plus
+                              ? dwarf::GIEL_STATIC
+                              : dwarf::GIEL_EXTERNAL);
+  case dwarf::DW_TAG_typedef:
+  case dwarf::DW_TAG_base_type:
+  case dwarf::DW_TAG_subrange_type:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, dwarf::GIEL_STATIC);
+  case dwarf::DW_TAG_namespace:
+    return dwarf::GIEK_TYPE;
+  case dwarf::DW_TAG_subprogram:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_FUNCTION, Linkage);
+  case dwarf::DW_TAG_constant:
+  case dwarf::DW_TAG_variable:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE, Linkage);
+  case dwarf::DW_TAG_enumerator:
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE,
+                                          dwarf::GIEL_STATIC);
+  default:
+    return dwarf::GIEK_NONE;
+  }
+}
+
+/// emitDebugPubNames - Emit visible names into a debug pubnames section.
 ///
-void DwarfDebug::emitDebugPubnames() {
+void DwarfDebug::emitDebugPubNames(bool GnuStyle) {
   const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+  const MCSection *PSec =
+      GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubNamesSection()
+               : Asm->getObjFileLowering().getDwarfPubNamesSection();
 
   typedef DenseMap<const MDNode*, CompileUnit*> CUMapType;
   for (CUMapType::iterator I = CUMap.begin(), E = CUMap.end(); I != E; ++I) {
     CompileUnit *TheCU = I->second;
     unsigned ID = TheCU->getUniqueID();
 
-    if (TheCU->getGlobalNames().empty())
-      continue;
-
     // Start the dwarf pubnames section.
-    Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfPubNamesSection());
+    Asm->OutStreamer.SwitchSection(PSec);
+
+    // Emit a label so we can reference the beginning of this pubname section.
+    if (GnuStyle)
+      Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("gnu_pubnames",
+                                                    TheCU->getUniqueID()));
 
+    // Emit the header.
     Asm->OutStreamer.AddComment("Length of Public Names Info");
     Asm->EmitLabelDifference(Asm->GetTempSymbol("pubnames_end", ID),
                              Asm->GetTempSymbol("pubnames_begin", ID), 4);
@@ -2205,7 +2457,7 @@ void DwarfDebug::emitDebugPubnames() {
     Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubnames_begin", ID));
 
     Asm->OutStreamer.AddComment("DWARF Version");
-    Asm->EmitInt16(dwarf::DWARF_VERSION);
+    Asm->EmitInt16(dwarf::DW_PUBNAMES_VERSION);
 
     Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
     Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
@@ -2216,18 +2468,27 @@ void DwarfDebug::emitDebugPubnames() {
                              Asm->GetTempSymbol(ISec->getLabelBeginName(), ID),
                              4);
 
+    // Emit the pubnames for this compilation unit.
     const StringMap<DIE*> &Globals = TheCU->getGlobalNames();
     for (StringMap<DIE*>::const_iterator
            GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
       const char *Name = GI->getKeyData();
-      const DIE *Entity = GI->second;
+      DIE *Entity = GI->second;
 
       Asm->OutStreamer.AddComment("DIE offset");
       Asm->EmitInt32(Entity->getOffset());
 
+      if (GnuStyle) {
+        dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheCU, Entity);
+        Asm->OutStreamer.AddComment(
+            Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
+            dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+        Asm->EmitInt8(Desc.toBits());
+      }
+
       if (Asm->isVerbose())
         Asm->OutStreamer.AddComment("External Name");
-      Asm->OutStreamer.EmitBytes(StringRef(Name, strlen(Name)+1), 0);
+      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
     }
 
     Asm->OutStreamer.AddComment("End Mark");
@@ -2236,55 +2497,78 @@ void DwarfDebug::emitDebugPubnames() {
   }
 }
 
-void DwarfDebug::emitDebugPubTypes() {
+void DwarfDebug::emitDebugPubTypes(bool GnuStyle) {
+  const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+  const MCSection *PSec =
+      GnuStyle ? Asm->getObjFileLowering().getDwarfGnuPubTypesSection()
+               : Asm->getObjFileLowering().getDwarfPubTypesSection();
+
   for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(),
-         E = CUMap.end(); I != E; ++I) {
+                                                         E = CUMap.end();
+       I != E; ++I) {
     CompileUnit *TheCU = I->second;
     // Start the dwarf pubtypes section.
-    Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfPubTypesSection());
+    Asm->OutStreamer.SwitchSection(PSec);
+
+    // Emit a label so we can reference the beginning of this pubtype section.
+    if (GnuStyle)
+      Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("gnu_pubtypes",
+                                                    TheCU->getUniqueID()));
+
+    // Emit the header.
     Asm->OutStreamer.AddComment("Length of Public Types Info");
     Asm->EmitLabelDifference(
-      Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()),
-      Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()), 4);
+        Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()),
+        Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()), 4);
 
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_begin",
-                                                  TheCU->getUniqueID()));
+    Asm->OutStreamer.EmitLabel(
+        Asm->GetTempSymbol("pubtypes_begin", TheCU->getUniqueID()));
 
-    if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DWARF Version");
-    Asm->EmitInt16(dwarf::DWARF_VERSION);
+    if (Asm->isVerbose())
+      Asm->OutStreamer.AddComment("DWARF Version");
+    Asm->EmitInt16(dwarf::DW_PUBTYPES_VERSION);
 
     Asm->OutStreamer.AddComment("Offset of Compilation Unit Info");
-    const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
-    Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(),
-                                              TheCU->getUniqueID()),
-                           DwarfInfoSectionSym);
+    Asm->EmitSectionOffset(
+        Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()),
+        DwarfInfoSectionSym);
 
     Asm->OutStreamer.AddComment("Compilation Unit Length");
-    Asm->EmitLabelDifference(Asm->GetTempSymbol(ISec->getLabelEndName(),
-                                                TheCU->getUniqueID()),
-                             Asm->GetTempSymbol(ISec->getLabelBeginName(),
-                                                TheCU->getUniqueID()),
-                             4);
-
-    const StringMap<DIE*> &Globals = TheCU->getGlobalTypes();
-    for (StringMap<DIE*>::const_iterator
-           GI = Globals.begin(), GE = Globals.end(); GI != GE; ++GI) {
+    Asm->EmitLabelDifference(
+        Asm->GetTempSymbol(ISec->getLabelEndName(), TheCU->getUniqueID()),
+        Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()), 4);
+
+    // Emit the pubtypes.
+    const StringMap<DIE *> &Globals = TheCU->getGlobalTypes();
+    for (StringMap<DIE *>::const_iterator GI = Globals.begin(),
+                                          GE = Globals.end();
+         GI != GE; ++GI) {
       const char *Name = GI->getKeyData();
       DIE *Entity = GI->second;
 
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
+      if (Asm->isVerbose())
+        Asm->OutStreamer.AddComment("DIE offset");
       Asm->EmitInt32(Entity->getOffset());
 
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("External Name");
+      if (GnuStyle) {
+        dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheCU, Entity);
+        Asm->OutStreamer.AddComment(
+            Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
+            dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+        Asm->EmitInt8(Desc.toBits());
+      }
+
+      if (Asm->isVerbose())
+        Asm->OutStreamer.AddComment("External Name");
+
       // Emit the name with a terminating null byte.
-      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength()+1));
+      Asm->OutStreamer.EmitBytes(StringRef(Name, GI->getKeyLength() + 1));
     }
 
     Asm->OutStreamer.AddComment("End Mark");
     Asm->EmitInt32(0);
-    Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("pubtypes_end",
-                                                  TheCU->getUniqueID()));
+    Asm->OutStreamer.EmitLabel(
+        Asm->GetTempSymbol("pubtypes_end", TheCU->getUniqueID()));
   }
 }
 
@@ -2339,24 +2623,18 @@ void DwarfUnits::emitAddresses(const MCSection *AddrSection) {
   // Start the dwarf addr section.
   Asm->OutStreamer.SwitchSection(AddrSection);
 
-  // Get all of the string pool entries and put them in an array by their ID so
-  // we can sort them.
-  SmallVector<std::pair<unsigned,
-                        std::pair<MCSymbol*, unsigned>* >, 64> Entries;
+  // Order the address pool entries by ID
+  SmallVector<const MCExpr *, 64> Entries(AddressPool.size());
 
-  for (DenseMap<MCSymbol*, std::pair<MCSymbol*, unsigned> >::iterator
-         I = AddressPool.begin(), E = AddressPool.end();
+  for (DenseMap<const MCExpr *, unsigned>::iterator I = AddressPool.begin(),
+                                                    E = AddressPool.end();
        I != E; ++I)
-    Entries.push_back(std::make_pair(I->second.second, &(I->second)));
-
-  array_pod_sort(Entries.begin(), Entries.end());
+    Entries[I->second] = I->first;
 
   for (unsigned i = 0, e = Entries.size(); i != e; ++i) {
-    // Emit a label for reference from debug information entries.
-    MCSymbol *Sym = Entries[i].second->first;
-    if (Sym)
-      Asm->EmitLabelReference(Entries[i].second->first,
-                              Asm->getDataLayout().getPointerSize());
+    // Emit an expression for reference from debug information entries.
+    if (const MCExpr *Expr = Entries[i])
+      Asm->OutStreamer.EmitValue(Expr, Asm->getDataLayout().getPointerSize());
     else
       Asm->OutStreamer.EmitIntValue(0, Asm->getDataLayout().getPointerSize());
   }
@@ -2369,7 +2647,7 @@ void DwarfDebug::emitDebugStr() {
   Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection());
 }
 
-// Emit visible names into a debug loc section.
+// Emit locations into the debug loc section.
 void DwarfDebug::emitDebugLoc() {
   if (DotDebugLocEntries.empty())
     return;
@@ -2398,9 +2676,9 @@ void DwarfDebug::emitDebugLoc() {
       Asm->OutStreamer.EmitIntValue(0, Size);
       Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", index));
     } else {
-      Asm->OutStreamer.EmitSymbolValue(Entry.Begin, Size);
-      Asm->OutStreamer.EmitSymbolValue(Entry.End, Size);
-      DIVariable DV(Entry.Variable);
+      Asm->OutStreamer.EmitSymbolValue(Entry.getBeginSym(), Size);
+      Asm->OutStreamer.EmitSymbolValue(Entry.getEndSym(), Size);
+      DIVariable DV(Entry.getVariable());
       Asm->OutStreamer.AddComment("Loc expr size");
       MCSymbol *begin = Asm->OutStreamer.getContext().CreateTempSymbol();
       MCSymbol *end = Asm->OutStreamer.getContext().CreateTempSymbol();
@@ -2420,17 +2698,18 @@ void DwarfDebug::emitDebugLoc() {
           Asm->EmitULEB128(Entry.getInt());
         }
       } else if (Entry.isLocation()) {
+        MachineLocation Loc = Entry.getLoc();
         if (!DV.hasComplexAddress())
           // Regular entry.
-          Asm->EmitDwarfRegOp(Entry.Loc);
+          Asm->EmitDwarfRegOp(Loc, DV.isIndirect());
         else {
           // Complex address entry.
           unsigned N = DV.getNumAddrElements();
           unsigned i = 0;
           if (N >= 2 && DV.getAddrElement(0) == DIBuilder::OpPlus) {
-            if (Entry.Loc.getOffset()) {
+            if (Loc.getOffset()) {
               i = 2;
-              Asm->EmitDwarfRegOp(Entry.Loc);
+              Asm->EmitDwarfRegOp(Loc, DV.isIndirect());
               Asm->OutStreamer.AddComment("DW_OP_deref");
               Asm->EmitInt8(dwarf::DW_OP_deref);
               Asm->OutStreamer.AddComment("DW_OP_plus_uconst");
@@ -2439,12 +2718,12 @@ void DwarfDebug::emitDebugLoc() {
             } else {
               // If first address element is OpPlus then emit
               // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-              MachineLocation Loc(Entry.Loc.getReg(), DV.getAddrElement(1));
-              Asm->EmitDwarfRegOp(Loc);
+              MachineLocation TLoc(Loc.getReg(), DV.getAddrElement(1));
+              Asm->EmitDwarfRegOp(TLoc, DV.isIndirect());
               i = 2;
             }
           } else {
-            Asm->EmitDwarfRegOp(Entry.Loc);
+            Asm->EmitDwarfRegOp(Loc, DV.isIndirect());
           }
 
           // Emit remaining complex address elements.
@@ -2454,7 +2733,7 @@ void DwarfDebug::emitDebugLoc() {
               Asm->EmitInt8(dwarf::DW_OP_plus_uconst);
               Asm->EmitULEB128(DV.getAddrElement(++i));
             } else if (Element == DIBuilder::OpDeref) {
-              if (!Entry.Loc.isReg())
+              if (!Loc.isReg())
                 Asm->EmitInt8(dwarf::DW_OP_deref);
             } else
               llvm_unreachable("unknown Opcode found in complex address");
@@ -2468,18 +2747,178 @@ void DwarfDebug::emitDebugLoc() {
   }
 }
 
-// Emit visible names into a debug aranges section.
+struct SymbolCUSorter {
+  SymbolCUSorter(const MCStreamer &s) : Streamer(s) {}
+  const MCStreamer &Streamer;
+
+  bool operator() (const SymbolCU &A, const SymbolCU &B) {
+    unsigned IA = A.Sym ? Streamer.GetSymbolOrder(A.Sym) : 0;
+    unsigned IB = B.Sym ? Streamer.GetSymbolOrder(B.Sym) : 0;
+
+    // Symbols with no order assigned should be placed at the end.
+    // (e.g. section end labels)
+    if (IA == 0)
+      IA = (unsigned)(-1);
+    if (IB == 0)
+      IB = (unsigned)(-1);
+    return IA < IB;
+  }
+};
+
+static bool CUSort(const CompileUnit *A, const CompileUnit *B) {
+    return (A->getUniqueID() < B->getUniqueID());
+}
+
+struct ArangeSpan {
+  const MCSymbol *Start, *End;
+};
+
+// Emit a debug aranges section, containing a CU lookup for any
+// address we can tie back to a CU.
 void DwarfDebug::emitDebugARanges() {
   // Start the dwarf aranges section.
-  Asm->OutStreamer.SwitchSection(
-                          Asm->getObjFileLowering().getDwarfARangesSection());
+  Asm->OutStreamer
+      .SwitchSection(Asm->getObjFileLowering().getDwarfARangesSection());
+
+  typedef DenseMap<CompileUnit *, std::vector<ArangeSpan> > SpansType;
+
+  SpansType Spans;
+
+  // Build a list of sections used.
+  std::vector<const MCSection *> Sections;
+  for (SectionMapType::iterator it = SectionMap.begin(); it != SectionMap.end();
+       it++) {
+    const MCSection *Section = it->first;
+    Sections.push_back(Section);
+  }
+
+  // Sort the sections into order.
+  // This is only done to ensure consistent output order across different runs.
+  std::sort(Sections.begin(), Sections.end(), SectionSort);
+
+  // Build a set of address spans, sorted by CU.
+  for (size_t SecIdx=0;SecIdx<Sections.size();SecIdx++) {
+    const MCSection *Section = Sections[SecIdx];
+    SmallVector<SymbolCU, 8> &List = SectionMap[Section];
+    if (List.size() < 2)
+      continue;
+
+    // Sort the symbols by offset within the section.
+    SymbolCUSorter sorter(Asm->OutStreamer);
+    std::sort(List.begin(), List.end(), sorter);
+
+    // If we have no section (e.g. common), just write out
+    // individual spans for each symbol.
+    if (Section == NULL) {
+      for (size_t n = 0; n < List.size(); n++) {
+        const SymbolCU &Cur = List[n];
+
+        ArangeSpan Span;
+        Span.Start = Cur.Sym;
+        Span.End = NULL;
+        if (Cur.CU)
+          Spans[Cur.CU].push_back(Span);
+      }
+    } else {
+      // Build spans between each label.
+      const MCSymbol *StartSym = List[0].Sym;
+      for (size_t n = 1; n < List.size(); n++) {
+        const SymbolCU &Prev = List[n - 1];
+        const SymbolCU &Cur = List[n];
+
+        // Try and build the longest span we can within the same CU.
+        if (Cur.CU != Prev.CU) {
+          ArangeSpan Span;
+          Span.Start = StartSym;
+          Span.End = Cur.Sym;
+          Spans[Prev.CU].push_back(Span);
+          StartSym = Cur.Sym;
+        }
+      }
+    }
+  }
+
+  const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection();
+  unsigned PtrSize = Asm->getDataLayout().getPointerSize();
+
+  // Build a list of CUs used.
+  std::vector<CompileUnit *> CUs;
+  for (SpansType::iterator it = Spans.begin(); it != Spans.end(); it++) {
+    CompileUnit *CU = it->first;
+    CUs.push_back(CU);
+  }
+
+  // Sort the CU list (again, to ensure consistent output order).
+  std::sort(CUs.begin(), CUs.end(), CUSort);
+
+  // Emit an arange table for each CU we used.
+  for (size_t CUIdx=0;CUIdx<CUs.size();CUIdx++) {
+    CompileUnit *CU = CUs[CUIdx];
+    std::vector<ArangeSpan> &List = Spans[CU];
+
+    // Emit size of content not including length itself.
+    unsigned ContentSize
+        = sizeof(int16_t) // DWARF ARange version number
+        + sizeof(int32_t) // Offset of CU in the .debug_info section
+        + sizeof(int8_t)  // Pointer Size (in bytes)
+        + sizeof(int8_t); // Segment Size (in bytes)
+
+    unsigned TupleSize = PtrSize * 2;
+
+    // 7.20 in the Dwarf specs requires the table to be aligned to a tuple.
+    unsigned Padding = 0;
+    while (((sizeof(int32_t) + ContentSize + Padding) % TupleSize) != 0)
+      Padding++;
+
+    ContentSize += Padding;
+    ContentSize += (List.size() + 1) * TupleSize;
+
+    // For each compile unit, write the list of spans it covers.
+    Asm->OutStreamer.AddComment("Length of ARange Set");
+    Asm->EmitInt32(ContentSize);
+    Asm->OutStreamer.AddComment("DWARF Arange version number");
+    Asm->EmitInt16(dwarf::DW_ARANGES_VERSION);
+    Asm->OutStreamer.AddComment("Offset Into Debug Info Section");
+    Asm->EmitSectionOffset(
+        Asm->GetTempSymbol(ISec->getLabelBeginName(), CU->getUniqueID()),
+        DwarfInfoSectionSym);
+    Asm->OutStreamer.AddComment("Address Size (in bytes)");
+    Asm->EmitInt8(PtrSize);
+    Asm->OutStreamer.AddComment("Segment Size (in bytes)");
+    Asm->EmitInt8(0);
+
+    for (unsigned n = 0; n < Padding; n++)
+      Asm->EmitInt8(0xff);
+
+    for (unsigned n = 0; n < List.size(); n++) {
+      const ArangeSpan &Span = List[n];
+      Asm->EmitLabelReference(Span.Start, PtrSize);
+
+      // Calculate the size as being from the span start to it's end.
+      if (Span.End) {
+        Asm->EmitLabelDifference(Span.End, Span.Start, PtrSize);
+      } else {
+        // For symbols without an end marker (e.g. common), we
+        // write a single arange entry containing just that one symbol.
+        uint64_t Size = SymSize[Span.Start];
+        if (Size == 0)
+          Size = 1;
+
+        Asm->OutStreamer.EmitIntValue(Size, PtrSize);
+      }
+    }
+
+    Asm->OutStreamer.AddComment("ARange terminator");
+    Asm->OutStreamer.EmitIntValue(0, PtrSize);
+    Asm->OutStreamer.EmitIntValue(0, PtrSize);
+  }
 }
 
 // Emit visible names into a debug ranges section.
 void DwarfDebug::emitDebugRanges() {
   // Start the dwarf ranges section.
-  Asm->OutStreamer.SwitchSection(
-    Asm->getObjFileLowering().getDwarfRangesSection());
+  Asm->OutStreamer
+      .SwitchSection(Asm->getObjFileLowering().getDwarfRangesSection());
   unsigned char Size = Asm->getDataLayout().getPointerSize();
   for (SmallVectorImpl<const MCSymbol *>::iterator
          I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end();
@@ -2500,104 +2939,19 @@ void DwarfDebug::emitDebugMacInfo() {
   }
 }
 
-// Emit inline info using following format.
-// Section Header:
-// 1. length of section
-// 2. Dwarf version number
-// 3. address size.
-//
-// Entries (one "entry" for each function that was inlined):
-//
-// 1. offset into __debug_str section for MIPS linkage name, if exists;
-//   otherwise offset into __debug_str for regular function name.
-// 2. offset into __debug_str section for regular function name.
-// 3. an unsigned LEB128 number indicating the number of distinct inlining
-// instances for the function.
-//
-// The rest of the entry consists of a {die_offset, low_pc} pair for each
-// inlined instance; the die_offset points to the inlined_subroutine die in the
-// __debug_info section, and the low_pc is the starting address for the
-// inlining instance.
-void DwarfDebug::emitDebugInlineInfo() {
-  if (!Asm->MAI->doesDwarfUseInlineInfoSection())
-    return;
-
-  if (!FirstCU)
-    return;
-
-  Asm->OutStreamer.SwitchSection(
-                        Asm->getObjFileLowering().getDwarfDebugInlineSection());
-
-  Asm->OutStreamer.AddComment("Length of Debug Inlined Information Entry");
-  Asm->EmitLabelDifference(Asm->GetTempSymbol("debug_inlined_end", 1),
-                           Asm->GetTempSymbol("debug_inlined_begin", 1), 4);
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_begin", 1));
-
-  Asm->OutStreamer.AddComment("Dwarf Version");
-  Asm->EmitInt16(dwarf::DWARF_VERSION);
-  Asm->OutStreamer.AddComment("Address Size (in bytes)");
-  Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
-
-  for (SmallVectorImpl<const MDNode *>::iterator I = InlinedSPNodes.begin(),
-         E = InlinedSPNodes.end(); I != E; ++I) {
-
-    const MDNode *Node = *I;
-    DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> >::iterator II
-      = InlineInfo.find(Node);
-    SmallVectorImpl<InlineInfoLabels> &Labels = II->second;
-    DISubprogram SP(Node);
-    StringRef LName = SP.getLinkageName();
-    StringRef Name = SP.getName();
-
-    Asm->OutStreamer.AddComment("MIPS linkage name");
-    if (LName.empty())
-      Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
-                             DwarfStrSectionSym);
-    else
-      Asm->EmitSectionOffset(InfoHolder
-                             .getStringPoolEntry(getRealLinkageName(LName)),
-                             DwarfStrSectionSym);
-
-    Asm->OutStreamer.AddComment("Function name");
-    Asm->EmitSectionOffset(InfoHolder.getStringPoolEntry(Name),
-                           DwarfStrSectionSym);
-    Asm->EmitULEB128(Labels.size(), "Inline count");
-
-    for (SmallVectorImpl<InlineInfoLabels>::iterator LI = Labels.begin(),
-           LE = Labels.end(); LI != LE; ++LI) {
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("DIE offset");
-      Asm->EmitInt32(LI->second->getOffset());
-
-      if (Asm->isVerbose()) Asm->OutStreamer.AddComment("low_pc");
-      Asm->OutStreamer.EmitSymbolValue(LI->first,
-                                       Asm->getDataLayout().getPointerSize());
-    }
-  }
-
-  Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_end", 1));
-}
-
 // DWARF5 Experimental Separate Dwarf emitters.
 
 // This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list,
 // DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id,
-// DW_AT_ranges_base, DW_AT_addr_base. If DW_AT_ranges is present,
-// DW_AT_low_pc and DW_AT_high_pc are not used, and vice versa.
-CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
-  DICompileUnit DIUnit(N);
-  CompilationDir = DIUnit.getDirectory();
+// DW_AT_ranges_base, DW_AT_addr_base.
+CompileUnit *DwarfDebug::constructSkeletonCU(const CompileUnit *CU) {
 
   DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
-  CompileUnit *NewCU = new CompileUnit(GlobalCUIndexCount++,
-                                       DIUnit.getLanguage(), Die, Asm,
-                                       this, &SkeletonHolder);
+  CompileUnit *NewCU = new CompileUnit(CU->getUniqueID(), Die, CU->getNode(),
+                                       Asm, this, &SkeletonHolder);
 
   NewCU->addLocalString(Die, dwarf::DW_AT_GNU_dwo_name,
-                        DIUnit.getSplitDebugFilename());
-
-  // This should be a unique identifier when we want to build .dwp files.
-  NewCU->addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, 0);
+                        CU->getNode().getSplitDebugFilename());
 
   // Relocate to the beginning of the addr_base section, else 0 for the
   // beginning of the one for this compile unit.
@@ -2624,6 +2978,35 @@ CompileUnit *DwarfDebug::constructSkeletonCU(const MDNode *N) {
   if (!CompilationDir.empty())
     NewCU->addLocalString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
 
+  // Flags to let the linker know we have emitted new style pubnames.
+  if (GenerateGnuPubSections) {
+    if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+      NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_sec_offset,
+                      Asm->GetTempSymbol("gnu_pubnames", NewCU->getUniqueID()));
+    else
+      NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubnames, dwarf::DW_FORM_data4,
+                      Asm->GetTempSymbol("gnu_pubnames", NewCU->getUniqueID()),
+                      DwarfGnuPubNamesSectionSym);
+
+    if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+      NewCU->addLabel(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_sec_offset,
+                      Asm->GetTempSymbol("gnu_pubtypes", NewCU->getUniqueID()));
+    else
+      NewCU->addDelta(Die, dwarf::DW_AT_GNU_pubtypes, dwarf::DW_FORM_data4,
+                      Asm->GetTempSymbol("gnu_pubtypes", NewCU->getUniqueID()),
+                      DwarfGnuPubTypesSectionSym);
+  }
+
+  // Flag if we've emitted any ranges and their location for the compile unit.
+  if (DebugRangeSymbols.size()) {
+    if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+      NewCU->addLabel(Die, dwarf::DW_AT_GNU_ranges_base,
+                      dwarf::DW_FORM_sec_offset, DwarfDebugRangeSectionSym);
+    else
+      NewCU->addUInt(Die, dwarf::DW_AT_GNU_ranges_base, dwarf::DW_FORM_data4,
+                     0);
+  }
+
   SkeletonHolder.addUnit(NewCU);
   SkeletonCUs.push_back(NewCU);
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 24f758d..cebac39 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -41,7 +41,6 @@ class DIEAbbrev;
 class DIE;
 class DIEBlock;
 class DIEEntry;
-class DwarfDebug;
 
 //===----------------------------------------------------------------------===//
 /// \brief This class is used to record source line correspondence.
@@ -63,13 +62,12 @@ public:
 
 /// \brief This struct describes location entries emitted in the .debug_loc
 /// section.
-typedef struct DotDebugLocEntry {
+class DotDebugLocEntry {
+  // Begin and end symbols for the address range that this location is valid.
   const MCSymbol *Begin;
   const MCSymbol *End;
-  MachineLocation Loc;
-  const MDNode *Variable;
-  bool Merged;
-  bool Constant;
+
+  // Type of entry that this represents.
   enum EntryType {
     E_Location,
     E_Integer,
@@ -83,23 +81,42 @@ typedef struct DotDebugLocEntry {
     const ConstantFP *CFP;
     const ConstantInt *CIP;
   } Constants;
-  DotDebugLocEntry()
-    : Begin(0), End(0), Variable(0), Merged(false),
-      Constant(false) { Constants.Int = 0;}
+
+  // The location in the machine frame.
+  MachineLocation Loc;
+
+  // The variable to which this location entry corresponds.
+  const MDNode *Variable;
+
+  // Whether this location has been merged.
+  bool Merged;
+
+public:
+  DotDebugLocEntry() : Begin(0), End(0), Variable(0), Merged(false) {
+    Constants.Int = 0;
+  }
   DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, MachineLocation &L,
                    const MDNode *V)
-    : Begin(B), End(E), Loc(L), Variable(V), Merged(false),
-      Constant(false) { Constants.Int = 0; EntryKind = E_Location; }
+      : Begin(B), End(E), Loc(L), Variable(V), Merged(false) {
+    Constants.Int = 0;
+    EntryKind = E_Location;
+  }
   DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, int64_t i)
-    : Begin(B), End(E), Variable(0), Merged(false),
-      Constant(true) { Constants.Int = i; EntryKind = E_Integer; }
+      : Begin(B), End(E), Variable(0), Merged(false) {
+    Constants.Int = i;
+    EntryKind = E_Integer;
+  }
   DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, const ConstantFP *FPtr)
-    : Begin(B), End(E), Variable(0), Merged(false),
-      Constant(true) { Constants.CFP = FPtr; EntryKind = E_ConstantFP; }
+      : Begin(B), End(E), Variable(0), Merged(false) {
+    Constants.CFP = FPtr;
+    EntryKind = E_ConstantFP;
+  }
   DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E,
                    const ConstantInt *IPtr)
-    : Begin(B), End(E), Variable(0), Merged(false),
-      Constant(true) { Constants.CIP = IPtr; EntryKind = E_ConstantInt; }
+      : Begin(B), End(E), Variable(0), Merged(false) {
+    Constants.CIP = IPtr;
+    EntryKind = E_ConstantInt;
+  }
 
   /// \brief Empty entries are also used as a trigger to emit temp label. Such
   /// labels are referenced is used to find debug_loc offset for a given DIE.
@@ -115,10 +132,14 @@ typedef struct DotDebugLocEntry {
   bool isInt() const         { return EntryKind == E_Integer; }
   bool isConstantFP() const  { return EntryKind == E_ConstantFP; }
   bool isConstantInt() const { return EntryKind == E_ConstantInt; }
-  int64_t getInt()                    { return Constants.Int; }
-  const ConstantFP *getConstantFP()   { return Constants.CFP; }
-  const ConstantInt *getConstantInt() { return Constants.CIP; }
-} DotDebugLocEntry;
+  int64_t getInt() const                    { return Constants.Int; }
+  const ConstantFP *getConstantFP() const   { return Constants.CFP; }
+  const ConstantInt *getConstantInt() const { return Constants.CIP; }
+  const MDNode *getVariable() const { return Variable; }
+  const MCSymbol *getBeginSym() const { return Begin; }
+  const MCSymbol *getEndSym() const { return End; }
+  MachineLocation getLoc() const { return Loc; }
+};
 
 //===----------------------------------------------------------------------===//
 /// \brief This class is used to track local variable information.
@@ -129,11 +150,12 @@ class DbgVariable {
   DbgVariable *AbsVar;               // Corresponding Abstract variable, if any.
   const MachineInstr *MInsn;         // DBG_VALUE instruction of the variable.
   int FrameIndex;
+  DwarfDebug *DD;
 public:
   // AbsVar may be NULL.
-  DbgVariable(DIVariable V, DbgVariable *AV)
+  DbgVariable(DIVariable V, DbgVariable *AV, DwarfDebug *DD)
     : Var(V), TheDIE(0), DotDebugLocOffset(~0U), AbsVar(AV), MInsn(0),
-      FrameIndex(~0) {}
+      FrameIndex(~0), DD(DD) {}
 
   // Accessors.
   DIVariable getVariable()           const { return Var; }
@@ -148,7 +170,7 @@ public:
   int getFrameIndex()                const { return FrameIndex; }
   void setFrameIndex(int FI)               { FrameIndex = FI; }
   // Translate tag to proper Dwarf tag.
-  unsigned getTag()                  const {
+  uint16_t getTag()                  const {
     if (Var.getTag() == dwarf::DW_TAG_arg_variable)
       return dwarf::DW_TAG_formal_parameter;
 
@@ -172,32 +194,27 @@ public:
   }
 
   bool variableHasComplexAddress()   const {
-    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    assert(Var.isVariable() && "Invalid complex DbgVariable!");
     return Var.hasComplexAddress();
   }
   bool isBlockByrefVariable()        const {
-    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    assert(Var.isVariable() && "Invalid complex DbgVariable!");
     return Var.isBlockByrefVariable();
   }
   unsigned getNumAddrElements()      const {
-    assert(Var.Verify() && "Invalid complex DbgVariable!");
+    assert(Var.isVariable() && "Invalid complex DbgVariable!");
     return Var.getNumAddrElements();
   }
   uint64_t getAddrElement(unsigned i) const {
     return Var.getAddrElement(i);
   }
   DIType getType() const;
-};
 
-
-// A String->Symbol mapping of strings used by indirect
-// references.
-typedef StringMap<std::pair<MCSymbol*, unsigned>,
-                  BumpPtrAllocator&> StrPool;
-
-// A Symbol->pair<Symbol, unsigned> mapping of addresses used by indirect
-// references.
-typedef DenseMap<MCSymbol *, std::pair<MCSymbol *, unsigned> > AddrPool;
+private:
+  /// resolve - Look in the DwarfDebug map for the MDNode that
+  /// corresponds to the reference.
+  template <typename T> T resolve(DIRef<T> Ref) const;
+};
 
 /// \brief Collects and handles information specific to a particular
 /// collection of units.
@@ -209,27 +226,34 @@ class DwarfUnits {
   FoldingSet<DIEAbbrev> *AbbreviationsSet;
 
   // A list of all the unique abbreviations in use.
-  std::vector<DIEAbbrev *> *Abbreviations;
+  std::vector<DIEAbbrev *> &Abbreviations;
 
   // A pointer to all units in the section.
   SmallVector<CompileUnit *, 1> CUs;
 
   // Collection of strings for this unit and assorted symbols.
+  // A String->Symbol mapping of strings used by indirect
+  // references.
+  typedef StringMap<std::pair<MCSymbol*, unsigned>,
+                    BumpPtrAllocator&> StrPool;
   StrPool StringPool;
   unsigned NextStringPoolNumber;
   std::string StringPref;
 
   // Collection of addresses for this unit and assorted labels.
+  // A Symbol->unsigned mapping of addresses used by indirect
+  // references.
+  typedef DenseMap<const MCExpr *, unsigned> AddrPool;
   AddrPool AddressPool;
   unsigned NextAddrPoolNumber;
 
 public:
   DwarfUnits(AsmPrinter *AP, FoldingSet<DIEAbbrev> *AS,
-             std::vector<DIEAbbrev *> *A, const char *Pref,
-             BumpPtrAllocator &DA) :
-    Asm(AP), AbbreviationsSet(AS), Abbreviations(A),
-    StringPool(DA), NextStringPoolNumber(0), StringPref(Pref),
-    AddressPool(), NextAddrPoolNumber(0) {}
+             std::vector<DIEAbbrev *> &A, const char *Pref,
+             BumpPtrAllocator &DA)
+      : Asm(AP), AbbreviationsSet(AS), Abbreviations(A), StringPool(DA),
+        NextStringPoolNumber(0), StringPref(Pref), AddressPool(),
+        NextAddrPoolNumber(0) {}
 
   /// \brief Compute the size and offset of a DIE given an incoming Offset.
   unsigned computeSizeAndOffset(DIE *Die, unsigned Offset);
@@ -245,14 +269,15 @@ public:
 
   /// \brief Emit all of the units to the section listed with the given
   /// abbreviation section.
-  void emitUnits(DwarfDebug *, const MCSection *, const MCSection *,
-                 const MCSymbol *);
+  void emitUnits(DwarfDebug *DD, const MCSection *USection,
+                 const MCSection *ASection, const MCSymbol *ASectionSym);
 
   /// \brief Emit all of the strings to the section given.
-  void emitStrings(const MCSection *, const MCSection *, const MCSymbol *);
+  void emitStrings(const MCSection *StrSection, const MCSection *OffsetSection,
+                   const MCSymbol *StrSecSym);
 
   /// \brief Emit all of the addresses to the section given.
-  void emitAddresses(const MCSection *);
+  void emitAddresses(const MCSection *AddrSection);
 
   /// \brief Returns the entry into the start of the pool.
   MCSymbol *getStringPoolSym();
@@ -270,14 +295,18 @@ public:
 
   /// \brief Returns the index into the address pool with the given
   /// label/symbol.
-  unsigned getAddrPoolIndex(MCSymbol *);
+  unsigned getAddrPoolIndex(const MCExpr *Sym);
+  unsigned getAddrPoolIndex(const MCSymbol *Sym);
 
   /// \brief Returns the address pool.
   AddrPool *getAddrPool() { return &AddressPool; }
+};
 
-  /// \brief for a given compile unit DIE, returns offset from beginning of
-  /// debug info.
-  unsigned getCUOffset(DIE *Die);
+/// \brief Helper used to pair up a symbol and its DWARF compile unit.
+struct SymbolCU {
+  SymbolCU(CompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {}
+  const MCSymbol *Sym;
+  CompileUnit *CU;
 };
 
 /// \brief Collects and handles dwarf debug information.
@@ -291,10 +320,7 @@ class DwarfDebug {
   // All DIEValues are allocated through this allocator.
   BumpPtrAllocator DIEValueAllocator;
 
-  //===--------------------------------------------------------------------===//
-  // Attribute used to construct specific Dwarf sections.
-  //
-
+  // Handle to the a compile unit used for the inline extension handling.
   CompileUnit *FirstCU;
 
   // Maps MDNode with its corresponding CompileUnit.
@@ -303,6 +329,14 @@ class DwarfDebug {
   // Maps subprogram MDNode with its corresponding CompileUnit.
   DenseMap <const MDNode *, CompileUnit *> SPMap;
 
+  // Maps a CU DIE with its corresponding CompileUnit.
+  DenseMap <const DIE *, CompileUnit *> CUDieMap;
+
+  /// Maps MDNodes for type sysstem with the corresponding DIEs. These DIEs can
+  /// be shared across CUs, that is why we keep the map here instead
+  /// of in CompileUnit.
+  DenseMap<const MDNode *, DIE *> MDTypeNodeToDieMap;
+
   // Used to uniquely define abbreviations.
   FoldingSet<DIEAbbrev> AbbreviationsSet;
 
@@ -315,10 +349,17 @@ class DwarfDebug {
   // separated by a zero byte, mapped to a unique id.
   StringMap<unsigned, BumpPtrAllocator&> SourceIdMap;
 
+  // List of all labels used in aranges generation.
+  std::vector<SymbolCU> ArangeLabels;
+
+  // Size of each symbol emitted (for those symbols that have a specific size).
+  DenseMap <const MCSymbol *, uint64_t> SymSize;
+
   // Provides a unique id per text section.
-  SetVector<const MCSection*> SectionMap;
+  typedef DenseMap<const MCSection *, SmallVector<SymbolCU, 8> > SectionMapType;
+  SectionMapType SectionMap;
 
-  // List of Arguments (DbgValues) for current function.
+  // List of arguments for current function.
   SmallVector<DbgVariable *, 8> CurrentFnArguments;
 
   LexicalScopes LScopes;
@@ -327,7 +368,9 @@ class DwarfDebug {
   DenseMap<const MDNode *, DIE *> AbstractSPDies;
 
   // Collection of dbg variables of a scope.
-  DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8> > ScopeVariables;
+  typedef DenseMap<LexicalScope *,
+                   SmallVector<DbgVariable *, 8> > ScopeVariablesMap;
+  ScopeVariablesMap ScopeVariables;
 
   // Collection of abstract variables.
   DenseMap<const MDNode *, DbgVariable *> AbstractVariables;
@@ -339,12 +382,6 @@ class DwarfDebug {
   // as DW_AT_inline.
   SmallPtrSet<DIE *, 4> InlinedSubprogramDIEs;
 
-  // Keep track of inlined functions and their location.  This
-  // information is used to populate the debug_inlined section.
-  typedef std::pair<const MCSymbol *, DIE *> InlineInfoLabels;
-  DenseMap<const MDNode *, SmallVector<InlineInfoLabels, 4> > InlineInfo;
-  SmallVector<const MDNode *, 4> InlinedSPNodes;
-
   // This is a collection of subprogram MDNodes that are processed to
   // create DIEs.
   SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
@@ -377,16 +414,6 @@ class DwarfDebug {
   // body.
   DebugLoc PrologEndLoc;
 
-  struct FunctionDebugFrameInfo {
-    unsigned Number;
-    std::vector<MachineMove> Moves;
-
-    FunctionDebugFrameInfo(unsigned Num, const std::vector<MachineMove> &M)
-      : Number(Num), Moves(M) {}
-  };
-
-  std::vector<FunctionDebugFrameInfo> DebugFrames;
-
   // Section Symbols: these are assembler temporary labels that are emitted at
   // the beginning of each supported dwarf section.  These are used to form
   // section offsets and are created by EmitSectionLabels.
@@ -395,9 +422,10 @@ class DwarfDebug {
   MCSymbol *DwarfDebugLocSectionSym, *DwarfLineSectionSym, *DwarfAddrSectionSym;
   MCSymbol *FunctionBeginSym, *FunctionEndSym;
   MCSymbol *DwarfAbbrevDWOSectionSym, *DwarfStrDWOSectionSym;
+  MCSymbol *DwarfGnuPubNamesSectionSym, *DwarfGnuPubTypesSectionSym;
 
   // As an optimization, there is no need to emit an entry in the directory
-  // table for the same directory as DW_at_comp_dir.
+  // table for the same directory as DW_AT_comp_dir.
   StringRef CompilationDir;
 
   // Counter for assigning globally unique IDs for CUs.
@@ -409,8 +437,19 @@ class DwarfDebug {
   // Holders for the various debug information flags that we might need to
   // have exposed. See accessor functions below for description.
 
-  // Whether or not we're emitting info for older versions of gdb on darwin.
-  bool IsDarwinGDBCompat;
+  // Holder for imported entities.
+  typedef SmallVector<std::pair<const MDNode *, const MDNode *>, 32>
+    ImportedEntityMap;
+  ImportedEntityMap ScopesWithImportedEntities;
+
+  // Holder for types that are going to be extracted out into a type unit.
+  std::vector<DIE *> TypeUnits;
+
+  // Whether to emit the pubnames/pubtypes sections.
+  bool HasDwarfPubSections;
+
+  // Version of dwarf we're emitting.
+  unsigned DwarfVersion;
 
   // DWARF5 Experimental Options
   bool HasDwarfAccelTables;
@@ -433,9 +472,8 @@ class DwarfDebug {
   // Holder for the skeleton information.
   DwarfUnits SkeletonHolder;
 
-  typedef SmallVector<std::pair<const MDNode *, const MDNode *>, 32>
-    ImportedEntityMap;
-  ImportedEntityMap ScopesWithImportedEntities;
+  // Maps from a type identifier to the actual MDNode.
+  DITypeIdentifierMap TypeIdentifierMap;
 
 private:
 
@@ -448,11 +486,14 @@ private:
   /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
   /// variables in this scope then create and insert DIEs for these
   /// variables.
-  DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, const MDNode *SPNode);
+  DIE *updateSubprogramScopeDIE(CompileUnit *SPCU, DISubprogram SP);
 
   /// \brief Construct new DW_TAG_lexical_block for this scope and
   /// attach DW_AT_low_pc/DW_AT_high_pc labels.
   DIE *constructLexicalScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+  /// A helper function to check whether the DIE for a given Scope is going
+  /// to be null.
+  bool isLexicalScopeDIENull(LexicalScope *Scope);
 
   /// \brief This scope represents inlined body of a function. Construct
   /// DIE to represent this concrete inlined copy of the function.
@@ -460,6 +501,9 @@ private:
 
   /// \brief Construct a DIE for this scope.
   DIE *constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope);
+  /// A helper function to create children of a Scope DIE.
+  DIE *createScopeChildrenDIE(CompileUnit *TheCU, LexicalScope *Scope,
+                              SmallVectorImpl<DIE*> &Children);
 
   /// \brief Emit initial Dwarf sections with a label at the start of each one.
   void emitSectionLabels();
@@ -511,10 +555,16 @@ private:
   void emitAccelTypes();
 
   /// \brief Emit visible names into a debug pubnames section.
-  void emitDebugPubnames();
+  /// \param GnuStyle determines whether or not we want to emit
+  /// additional information into the table ala newer gcc for gdb
+  /// index.
+  void emitDebugPubNames(bool GnuStyle = false);
 
   /// \brief Emit visible types into a debug pubtypes section.
-  void emitDebugPubTypes();
+  /// \param GnuStyle determines whether or not we want to emit
+  /// additional information into the table ala newer gcc for gdb
+  /// index.
+  void emitDebugPubTypes(bool GnuStyle = false);
 
   /// \brief Emit visible names into a debug str section.
   void emitDebugStr();
@@ -538,7 +588,7 @@ private:
 
   /// \brief Construct the split debug info compile unit for the debug info
   /// section.
-  CompileUnit *constructSkeletonCU(const MDNode *);
+  CompileUnit *constructSkeletonCU(const CompileUnit *CU);
 
   /// \brief Emit the local split abbreviations.
   void emitSkeletonAbbrevs(const MCSection *);
@@ -554,21 +604,21 @@ private:
 
   /// \brief Create new CompileUnit for the given metadata node with tag
   /// DW_TAG_compile_unit.
-  CompileUnit *constructCompileUnit(const MDNode *N);
+  CompileUnit *constructCompileUnit(DICompileUnit DIUnit);
 
   /// \brief Construct subprogram DIE.
   void constructSubprogramDIE(CompileUnit *TheCU, const MDNode *N);
 
-  /// \brief Construct import_module DIE.
-  void constructImportedModuleDIE(CompileUnit *TheCU, const MDNode *N);
+  /// \brief Construct imported_module or imported_declaration DIE.
+  void constructImportedEntityDIE(CompileUnit *TheCU, const MDNode *N);
 
   /// \brief Construct import_module DIE.
-  void constructImportedModuleDIE(CompileUnit *TheCU, const MDNode *N,
+  void constructImportedEntityDIE(CompileUnit *TheCU, const MDNode *N,
                                   DIE *Context);
 
   /// \brief Construct import_module DIE.
-  void constructImportedModuleDIE(CompileUnit *TheCU,
-                                  const DIImportedModule &Module,
+  void constructImportedEntityDIE(CompileUnit *TheCU,
+                                  const DIImportedEntity &Module,
                                   DIE *Context);
 
   /// \brief Register a source line with debug info. Returns the unique
@@ -616,7 +666,13 @@ public:
   // Main entry points.
   //
   DwarfDebug(AsmPrinter *A, Module *M);
-  ~DwarfDebug();
+
+  void insertDIE(const MDNode *TypeMD, DIE *Die) {
+    MDTypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
+  }
+  DIE *getDIE(const MDNode *TypeMD) {
+    return MDTypeNodeToDieMap.lookup(TypeMD);
+  }
 
   /// \brief Emit all Dwarf sections that should come prior to the
   /// content.
@@ -637,6 +693,17 @@ public:
   /// \brief Process end of an instruction.
   void endInstruction(const MachineInstr *MI);
 
+  /// \brief Add a DIE to the set of types that we're going to pull into
+  /// type units.
+  void addTypeUnitType(DIE *Die) { TypeUnits.push_back(Die); }
+
+  /// \brief Add a label so that arange data can be generated for it.
+  void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); }
+
+  /// \brief For symbols that have a size designated (e.g. common symbols),
+  /// this tracks that size.
+  void setSymbolSize(const MCSymbol *Sym, uint64_t Size) { SymSize[Sym] = Size;}
+
   /// \brief Look up the source id with the given directory and source file
   /// names. If none currently exists, create a new id and insert it in the
   /// SourceIds map.
@@ -644,11 +711,7 @@ public:
                                unsigned CUID);
 
   /// \brief Recursively Emits a debug information entry.
-  void emitDIE(DIE *Die, std::vector<DIEAbbrev *> *Abbrevs);
-
-  /// \brief Returns whether or not to limit some of our debug
-  /// output to the limitations of darwin gdb.
-  bool useDarwinGDBCompat() { return IsDarwinGDBCompat; }
+  void emitDIE(DIE *Die, ArrayRef<DIEAbbrev *> Abbrevs);
 
   // Experimental DWARF5 features.
 
@@ -659,6 +722,19 @@ public:
   /// \brief Returns whether or not to change the current debug info for the
   /// split dwarf proposal support.
   bool useSplitDwarf() { return HasSplitDwarf; }
+
+  /// Returns the Dwarf Version.
+  unsigned getDwarfVersion() const { return DwarfVersion; }
+
+  /// Find the MDNode for the given reference.
+  template <typename T> T resolve(DIRef<T> Ref) const {
+    return Ref.resolve(TypeIdentifierMap);
+  }
+
+  /// isSubprogramContext - Return true if Context is either a subprogram
+  /// or another context nested inside a subprogram.
+  bool isSubprogramContext(const MDNode *Context);
+
 };
 } // End of namespace llvm
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 74b1b13..1575161 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -23,13 +23,13 @@ namespace llvm {
 template <typename T> class SmallVectorImpl;
 struct LandingPadInfo;
 class MachineModuleInfo;
-class MachineMove;
 class MachineInstr;
 class MachineFunction;
 class MCAsmInfo;
 class MCExpr;
 class MCSymbol;
 class Function;
+class ARMTargetStreamer;
 class AsmPrinter;
 
 //===----------------------------------------------------------------------===//
@@ -178,6 +178,8 @@ public:
 
 class ARMException : public DwarfException {
   void EmitTypeInfos(unsigned TTypeEncoding);
+  ARMTargetStreamer &getTargetStreamer();
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 4a99184..24aa1ab 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -26,18 +26,20 @@ using namespace llvm;
 namespace {
 
 class BasicTTI : public ImmutablePass, public TargetTransformInfo {
-  const TargetLoweringBase *TLI;
+  const TargetMachine *TM;
 
   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
   /// are set if the result needs to be inserted and/or extracted from vectors.
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
+  const TargetLoweringBase *getTLI() const { return TM->getTargetLowering(); }
+
 public:
-  BasicTTI() : ImmutablePass(ID), TLI(0) {
+  BasicTTI() : ImmutablePass(ID), TM(0) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
-  BasicTTI(const TargetLoweringBase *TLI) : ImmutablePass(ID), TLI(TLI) {
+  BasicTTI(const TargetMachine *TM) : ImmutablePass(ID), TM(TM) {
     initializeBasicTTIPass(*PassRegistry::getPassRegistry());
   }
 
@@ -63,6 +65,8 @@ public:
     return this;
   }
 
+  virtual bool hasBranchDivergence() const;
+
   /// \name Scalar TTI Implementations
   /// @{
 
@@ -71,11 +75,16 @@ public:
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale) const;
+  virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                                   int64_t BaseOffset, bool HasBaseReg,
+                                   int64_t Scale) const;
   virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
   virtual bool isTypeLegal(Type *Ty) const;
   virtual unsigned getJumpBufAlignment() const;
   virtual unsigned getJumpBufSize() const;
   virtual bool shouldBuildLookupTables() const;
+  virtual bool haveFastSqrt(Type *Ty) const;
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
 
   /// @}
 
@@ -103,7 +112,8 @@ public:
   virtual unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
                                          ArrayRef<Type*> Tys) const;
   virtual unsigned getNumberOfParts(Type *Tp) const;
-  virtual unsigned getAddressComputationCost(Type *Ty) const;
+  virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
+  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) const;
 
   /// @}
 };
@@ -115,17 +125,18 @@ INITIALIZE_AG_PASS(BasicTTI, TargetTransformInfo, "basictti",
 char BasicTTI::ID = 0;
 
 ImmutablePass *
-llvm::createBasicTargetTransformInfoPass(const TargetLoweringBase *TLI) {
-  return new BasicTTI(TLI);
+llvm::createBasicTargetTransformInfoPass(const TargetMachine *TM) {
+  return new BasicTTI(TM);
 }
 
+bool BasicTTI::hasBranchDivergence() const { return false; }
 
 bool BasicTTI::isLegalAddImmediate(int64_t imm) const {
-  return TLI->isLegalAddImmediate(imm);
+  return getTLI()->isLegalAddImmediate(imm);
 }
 
 bool BasicTTI::isLegalICmpImmediate(int64_t imm) const {
-  return TLI->isLegalICmpImmediate(imm);
+  return getTLI()->isLegalICmpImmediate(imm);
 }
 
 bool BasicTTI::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -136,32 +147,52 @@ bool BasicTTI::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
   AM.BaseOffs = BaseOffset;
   AM.HasBaseReg = HasBaseReg;
   AM.Scale = Scale;
-  return TLI->isLegalAddressingMode(AM, Ty);
+  return getTLI()->isLegalAddressingMode(AM, Ty);
+}
+
+int BasicTTI::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
+                                   int64_t BaseOffset, bool HasBaseReg,
+                                   int64_t Scale) const {
+  TargetLoweringBase::AddrMode AM;
+  AM.BaseGV = BaseGV;
+  AM.BaseOffs = BaseOffset;
+  AM.HasBaseReg = HasBaseReg;
+  AM.Scale = Scale;
+  return getTLI()->getScalingFactorCost(AM, Ty);
 }
 
 bool BasicTTI::isTruncateFree(Type *Ty1, Type *Ty2) const {
-  return TLI->isTruncateFree(Ty1, Ty2);
+  return getTLI()->isTruncateFree(Ty1, Ty2);
 }
 
 bool BasicTTI::isTypeLegal(Type *Ty) const {
-  EVT T = TLI->getValueType(Ty);
-  return TLI->isTypeLegal(T);
+  EVT T = getTLI()->getValueType(Ty);
+  return getTLI()->isTypeLegal(T);
 }
 
 unsigned BasicTTI::getJumpBufAlignment() const {
-  return TLI->getJumpBufAlignment();
+  return getTLI()->getJumpBufAlignment();
 }
 
 unsigned BasicTTI::getJumpBufSize() const {
-  return TLI->getJumpBufSize();
+  return getTLI()->getJumpBufSize();
 }
 
 bool BasicTTI::shouldBuildLookupTables() const {
+  const TargetLoweringBase *TLI = getTLI();
   return TLI->supportJumpTables() &&
       (TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
        TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
 }
 
+bool BasicTTI::haveFastSqrt(Type *Ty) const {
+  const TargetLoweringBase *TLI = getTLI();
+  EVT VT = TLI->getValueType(Ty);
+  return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
+}
+
+void BasicTTI::getUnrollingPreferences(Loop *, UnrollingPreferences &) const { }
+
 //===----------------------------------------------------------------------===//
 //
 // Calls used by the vectorizers.
@@ -199,6 +230,7 @@ unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                           OperandValueKind,
                                           OperandValueKind) const {
   // Check if any of the operands are vector operands.
+  const TargetLoweringBase *TLI = getTLI();
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -245,6 +277,7 @@ unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
 
 unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
                                     Type *Src) const {
+  const TargetLoweringBase *TLI = getTLI();
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -338,6 +371,7 @@ unsigned BasicTTI::getCFInstrCost(unsigned Opcode) const {
 
 unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
                                       Type *CondTy) const {
+  const TargetLoweringBase *TLI = getTLI();
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -382,7 +416,7 @@ unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
                                    unsigned Alignment,
                                    unsigned AddressSpace) const {
   assert(!Src->isVoidTy() && "Invalid type");
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Src);
 
   // Assume that all loads of legal types cost 1.
   return LT.first;
@@ -420,15 +454,23 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
   case Intrinsic::log10:   ISD = ISD::FLOG10; break;
   case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
   case Intrinsic::fabs:    ISD = ISD::FABS;   break;
+  case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break;
   case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
   case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
   case Intrinsic::trunc:   ISD = ISD::FTRUNC; break;
+  case Intrinsic::nearbyint:
+                           ISD = ISD::FNEARBYINT; break;
   case Intrinsic::rint:    ISD = ISD::FRINT;  break;
+  case Intrinsic::round:   ISD = ISD::FROUND; break;
   case Intrinsic::pow:     ISD = ISD::FPOW;   break;
   case Intrinsic::fma:     ISD = ISD::FMA;    break;
   case Intrinsic::fmuladd: ISD = ISD::FMA;    break; // FIXME: mul + add?
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+    return 0;
   }
 
+  const TargetLoweringBase *TLI = getTLI();
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
 
   if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
@@ -462,10 +504,24 @@ unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
 }
 
 unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Tp);
   return LT.first;
 }
 
-unsigned BasicTTI::getAddressComputationCost(Type *Ty) const {
+unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
   return 0;
 }
+
+unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
+                                    bool IsPairwise) const {
+  assert(Ty->isVectorTy() && "Expect a vector type");
+  unsigned NumVecElts = Ty->getVectorNumElements();
+  unsigned NumReduxLevels = Log2_32(NumVecElts);
+  unsigned ArithCost = NumReduxLevels *
+    TopTTI->getArithmeticInstrCost(Opcode, Ty);
+  // Assume the pairwise shuffles add a cost.
+  unsigned ShuffleCost =
+      NumReduxLevels * (IsPairwise + 1) *
+      TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+  return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index f8cc3b3..9cd4208 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -135,8 +135,8 @@ bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
     if (!I->isImplicitDef())
       break;
     unsigned Reg = I->getOperand(0).getReg();
-    ImpDefRegs.insert(Reg);
-    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+    for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+         SubRegs.isValid(); ++SubRegs)
       ImpDefRegs.insert(*SubRegs);
     ++I;
   }
@@ -406,7 +406,8 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
 /// MBB so that the part before the iterator falls into the part starting at the
 /// iterator.  This returns the new MBB.
 MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
-                                            MachineBasicBlock::iterator BBI1) {
+                                            MachineBasicBlock::iterator BBI1,
+                                            const BasicBlock *BB) {
   if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
     return 0;
 
@@ -414,7 +415,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
 
   // Create the fall-through block.
   MachineFunction::iterator MBBI = &CurMBB;
-  MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(CurMBB.getBasicBlock());
+  MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(BB);
   CurMBB.getParent()->insert(++MBBI, NewMBB);
 
   // Move all the successors of this block to the specified block.
@@ -647,6 +648,7 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
 /// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
 /// only of the common tail.  Create a block that does by splitting one.
 bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+                                             MachineBasicBlock *SuccBB,
                                              unsigned maxCommonTailLength,
                                              unsigned &commonTailIndex) {
   commonTailIndex = 0;
@@ -676,7 +678,12 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
   DEBUG(dbgs() << "\nSplitting BB#" << MBB->getNumber() << ", size "
                << maxCommonTailLength);
 
-  MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI);
+  // If the split block unconditionally falls-thru to SuccBB, it will be
+  // merged. In control flow terms it should then take SuccBB's name. e.g. If
+  // SuccBB is an inner loop, the common tail is still part of the inner loop.
+  const BasicBlock *BB = (SuccBB && MBB->succ_size() == 1) ?
+    SuccBB->getBasicBlock() : MBB->getBasicBlock();
+  MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI, BB);
   if (!newMBB) {
     DEBUG(dbgs() << "... failed!");
     return false;
@@ -784,7 +791,7 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
          !SameTails[commonTailIndex].tailIsWholeBlock())) {
       // None of the blocks consist entirely of the common tail.
       // Split a block so that one does.
-      if (!CreateCommonTailOnlyBlock(PredBB,
+      if (!CreateCommonTailOnlyBlock(PredBB, SuccBB,
                                      maxCommonTailLength, commonTailIndex)) {
         RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
         continue;
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index df795df..0d15ed7 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -1,4 +1,4 @@
-//===-- BranchFolding.h - Fold machine code branch instructions --*- C++ -*===//
+//===-- BranchFolding.h - Fold machine code branch instructions -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -100,13 +100,15 @@ namespace llvm {
     void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
                                  MachineBasicBlock *NewDest);
     MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
-                                  MachineBasicBlock::iterator BBI1);
+                                  MachineBasicBlock::iterator BBI1,
+                                  const BasicBlock *BB);
     unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
                               MachineBasicBlock *SuccBB,
                               MachineBasicBlock *PredBB);
     void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
                                                 MachineBasicBlock* PredBB);
     bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+                                   MachineBasicBlock *SuccBB,
                                    unsigned maxCommonTailLength,
                                    unsigned &commonTailIndex);
 
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 56aa330..10cc9ff 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(LLVMCodeGen
   LiveRangeCalc.cpp
   LiveRangeEdit.cpp
   LiveRegMatrix.cpp
+  LiveRegUnits.cpp
   LiveStackAnalysis.cpp
   LiveVariables.cpp
   LocalStackSlotAllocation.cpp
@@ -88,7 +89,6 @@ add_llvm_library(LLVMCodeGen
   ScheduleDAGPrinter.cpp
   ScoreboardHazardRecognizer.cpp
   ShadowStackGC.cpp
-  ShrinkWrapping.cpp
   SjLjEHPrepare.cpp
   SlotIndexes.cpp
   SpillPlacement.cpp
@@ -97,7 +97,7 @@ add_llvm_library(LLVMCodeGen
   StackColoring.cpp
   StackProtector.cpp
   StackSlotColoring.cpp
-  StrongPHIElimination.cpp
+  StackMaps.cpp
   TailDuplication.cpp
   TargetFrameLoweringImpl.cpp
   TargetInstrInfo.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 38ae17d..4925c4d 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -9,13 +9,12 @@
 
 #define DEBUG_TYPE "calcspillweights"
 
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -23,36 +22,22 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
-char CalculateSpillWeights::ID = 0;
-INITIALIZE_PASS_BEGIN(CalculateSpillWeights, "calcspillweights",
-                "Calculate spill weights", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_END(CalculateSpillWeights, "calcspillweights",
-                "Calculate spill weights", false, false)
-
-void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
-  au.addRequired<LiveIntervals>();
-  au.addRequired<MachineLoopInfo>();
-  au.setPreservesAll();
-  MachineFunctionPass::getAnalysisUsage(au);
-}
-
-bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) {
-
+void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
+                           MachineFunction &MF,
+                           const MachineLoopInfo &MLI,
+                           const MachineBlockFrequencyInfo &MBFI,
+                           VirtRegAuxInfo::NormalizingFn norm) {
   DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
                << "********** Function: " << MF.getName() << '\n');
 
-  LiveIntervals &LIS = getAnalysis<LiveIntervals>();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  VirtRegAuxInfo VRAI(MF, LIS, getAnalysis<MachineLoopInfo>());
+  VirtRegAuxInfo VRAI(MF, LIS, MLI, MBFI, norm);
   for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (MRI.reg_nodbg_empty(Reg))
       continue;
-    VRAI.CalculateWeightAndHint(LIS.getInterval(Reg));
+    VRAI.calculateSpillWeightAndHint(LIS.getInterval(Reg));
   }
-  return false;
 }
 
 // Return the preferred allocation register for reg, given a COPY instruction.
@@ -107,12 +92,12 @@ static bool isRematerializable(const LiveInterval &LI,
   return true;
 }
 
-void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
+void
+VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
   MachineRegisterInfo &mri = MF.getRegInfo();
   const TargetRegisterInfo &tri = *MF.getTarget().getRegisterInfo();
   MachineBasicBlock *mbb = 0;
   MachineLoop *loop = 0;
-  unsigned loopDepth = 0;
   bool isExiting = false;
   float totalWeight = 0;
   SmallPtrSet<MachineInstr*, 8> visited;
@@ -140,14 +125,14 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
       if (mi->getParent() != mbb) {
         mbb = mi->getParent();
         loop = Loops.getLoopFor(mbb);
-        loopDepth = loop ? loop->getLoopDepth() : 0;
         isExiting = loop ? loop->isLoopExiting(mbb) : false;
       }
 
       // Calculate instr weight.
       bool reads, writes;
       tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg);
-      weight = LiveIntervals::getSpillWeight(writes, reads, loopDepth);
+      weight = LiveIntervals::getSpillWeight(
+          writes, reads, MBFI.getBlockFreq(mi->getParent()));
 
       // Give extra weight to what looks like a loop induction variable update.
       if (writes && isExiting && LIS.isLiveOutOfMBB(li, mbb))
@@ -198,5 +183,5 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
   if (isRematerializable(li, LIS, *MF.getTarget().getInstrInfo()))
     totalWeight *= 0.5F;
 
-  li.weight = normalizeSpillWeight(totalWeight, li.getSize());
+  li.weight = normalize(totalWeight, li.getSize());
 }
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 75f4b96..fcfc9dc 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -24,7 +24,7 @@
 using namespace llvm;
 
 CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
-                 const TargetMachine &tm, SmallVector<CCValAssign, 16> &locs,
+                 const TargetMachine &tm, SmallVectorImpl<CCValAssign> &locs,
                  LLVMContext &C)
   : CallingConv(CC), IsVarArg(isVarArg), MF(mf), TM(tm),
     TRI(*TM.getRegisterInfo()), Locs(locs), Context(C),
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index c641991..7430c53 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -22,7 +22,6 @@ using namespace llvm;
 void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeBasicTTIPass(Registry);
   initializeBranchFolderPassPass(Registry);
-  initializeCalculateSpillWeightsPass(Registry);
   initializeDeadMachineInstructionElimPass(Registry);
   initializeEarlyIfConverterPass(Registry);
   initializeExpandPostRAPass(Registry);
@@ -60,7 +59,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeStackProtectorPass(Registry);
   initializeStackColoringPass(Registry);
   initializeStackSlotColoringPass(Registry);
-  initializeStrongPHIEliminationPass(Registry);
   initializeTailDuplicatePassPass(Registry);
   initializeTargetPassConfigPass(Registry);
   initializeTwoAddressInstructionPassPass(Registry);
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 0eb74a4..18c8e0a 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -201,8 +201,8 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr *MI) {
 
     if (MO.isUse() && Special) {
       if (!KeepRegs.test(Reg)) {
-        KeepRegs.set(Reg);
-        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+        for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+             SubRegs.isValid(); ++SubRegs)
           KeepRegs.set(*SubRegs);
       }
     }
@@ -361,7 +361,7 @@ findSuitableFreeRegister(RegRefIter RegRefBegin,
                          unsigned AntiDepReg,
                          unsigned LastNewReg,
                          const TargetRegisterClass *RC,
-                         SmallVector<unsigned, 2> &Forbid)
+                         SmallVectorImpl<unsigned> &Forbid)
 {
   ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(RC);
   for (unsigned i = 0; i != Order.size(); ++i) {
@@ -388,7 +388,7 @@ findSuitableFreeRegister(RegRefIter RegRefBegin,
       continue;
     // If NewReg overlaps any of the forbidden registers, we can't use it.
     bool Forbidden = false;
-    for (SmallVector<unsigned, 2>::iterator it = Forbid.begin(),
+    for (SmallVectorImpl<unsigned>::iterator it = Forbid.begin(),
            ite = Forbid.end(); it != ite; ++it)
       if (TRI->regsOverlap(NewReg, *it)) {
         Forbidden = true;
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.h b/lib/CodeGen/CriticalAntiDepBreaker.h
index df13dd3..565d20b 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -103,7 +103,7 @@ class TargetRegisterInfo;
                                       unsigned AntiDepReg,
                                       unsigned LastNewReg,
                                       const TargetRegisterClass *RC,
-                                      SmallVector<unsigned, 2> &Forbid);
+                                      SmallVectorImpl<unsigned> &Forbid);
   };
 }
 
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index 840a101..6619bcf 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -160,7 +160,8 @@ void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
                                       MachineBasicBlock::iterator EndItr) {
   assert(VLIWScheduler && "VLIW Scheduler is not initialized!");
   VLIWScheduler->startBlock(MBB);
-  VLIWScheduler->enterRegion(MBB, BeginItr, EndItr, MBB->size());
+  VLIWScheduler->enterRegion(MBB, BeginItr, EndItr,
+                             std::distance(BeginItr, EndItr));
   VLIWScheduler->schedule();
 
   // Generate MI -> SU map.
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index a54217f..5efe1ff 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -154,11 +154,11 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
         if (MO.isReg() && MO.isDef()) {
           unsigned Reg = MO.getReg();
           if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-            LivePhysRegs.reset(Reg);
             // Check the subreg set, not the alias set, because a def
             // of a super-register may still be partially live after
             // this def.
-            for (MCSubRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+            for (MCSubRegIterator SR(Reg, TRI,/*IncludeSelf=*/true);
+                 SR.isValid(); ++SR)
               LivePhysRegs.reset(*SR);
           }
         } else if (MO.isRegMask()) {
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index f27ec77..c7c1752 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -33,7 +33,6 @@ STATISTIC(NumResumesLowered, "Number of resume calls lowered");
 namespace {
   class DwarfEHPrepare : public FunctionPass {
     const TargetMachine *TM;
-    const TargetLoweringBase *TLI;
 
     // RewindFunction - _Unwind_Resume or the target equivalent.
     Constant *RewindFunction;
@@ -43,9 +42,8 @@ namespace {
 
   public:
     static char ID; // Pass identification, replacement for typeid.
-    DwarfEHPrepare(const TargetMachine *tm) :
-      FunctionPass(ID), TM(tm), TLI(TM->getTargetLowering()),
-      RewindFunction(0) {
+    DwarfEHPrepare(const TargetMachine *TM) :
+      FunctionPass(ID), TM(TM), RewindFunction(0) {
         initializeDominatorTreePass(*PassRegistry::getPassRegistry());
       }
 
@@ -61,8 +59,8 @@ namespace {
 
 char DwarfEHPrepare::ID = 0;
 
-FunctionPass *llvm::createDwarfEHPass(const TargetMachine *tm) {
-  return new DwarfEHPrepare(tm);
+FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) {
+  return new DwarfEHPrepare(TM);
 }
 
 /// GetExceptionObject - Return the exception object from the value passed into
@@ -108,20 +106,18 @@ Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
 /// InsertUnwindResumeCalls - Convert the ResumeInsts that are still present
 /// into calls to the appropriate _Unwind_Resume function.
 bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
-  bool UsesNewEH = false;
   SmallVector<ResumeInst*, 16> Resumes;
   for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
     TerminatorInst *TI = I->getTerminator();
     if (ResumeInst *RI = dyn_cast<ResumeInst>(TI))
       Resumes.push_back(RI);
-    else if (InvokeInst *II = dyn_cast<InvokeInst>(TI))
-      UsesNewEH = II->getUnwindDest()->isLandingPad();
   }
 
   if (Resumes.empty())
-    return UsesNewEH;
+    return false;
 
   // Find the rewind function if we didn't already.
+  const TargetLowering *TLI = TM->getTargetLowering();
   if (!RewindFunction) {
     LLVMContext &Ctx = Resumes[0]->getContext();
     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 9b0e76f..031f19c 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -23,6 +23,7 @@
 #define DEBUG_TYPE "execution-fix"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Allocator.h"
@@ -91,7 +92,7 @@ struct DomainValue {
 
   // First domain available.
   unsigned getFirstDomain() const {
-    return CountTrailingZeros_32(AvailableDomains);
+    return countTrailingZeros(AvailableDomains);
   }
 
   DomainValue() : Refs(0) { clear(); }
@@ -136,6 +137,12 @@ class ExeDepsFix : public MachineFunctionPass {
   typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
   LiveOutMap LiveOuts;
 
+  /// List of undefined register reads in this block in forward order.
+  std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
+
+  /// Storage for register unit liveness.
+  LiveRegUnits LiveUnits;
+
   /// Current instruction number.
   /// The first instruction in each basic block is 0.
   int CurInstr;
@@ -185,6 +192,8 @@ private:
   void processDefs(MachineInstr*, bool Kill);
   void visitSoftInstr(MachineInstr*, unsigned mask);
   void visitHardInstr(MachineInstr*, unsigned domain);
+  bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
+  void processUndefReads(MachineBasicBlock*);
 };
 }
 
@@ -341,6 +350,10 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
   // Reset instruction counter in each basic block.
   CurInstr = 0;
 
+  // Set up UndefReads to track undefined register reads.
+  UndefReads.clear();
+  LiveUnits.clear();
+
   // Set up LiveRegs to represent registers entering MBB.
   if (!LiveRegs)
     LiveRegs = new LiveReg[NumRegs];
@@ -448,10 +461,46 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
   processDefs(MI, !DomP.first);
 }
 
+/// \brief Return true to if it makes sense to break dependence on a partial def
+/// or undef use.
+bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+                                       unsigned Pref) {
+  int rx = regIndex(MI->getOperand(OpIdx).getReg());
+  if (rx < 0)
+    return false;
+
+  unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+  DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+
+  if (Pref > Clearance) {
+    DEBUG(dbgs() << ": Break dependency.\n");
+    return true;
+  }
+  // The current clearance seems OK, but we may be ignoring a def from a
+  // back-edge.
+  if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+    DEBUG(dbgs() << ": OK .\n");
+    return false;
+  }
+  // A def from an unprocessed back-edge may make us break this dependency.
+  DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+  return false;
+}
+
 // Update def-ages for registers defined by MI.
 // If Kill is set, also kill off DomainValues clobbered by the defs.
+//
+// Also break dependencies on partial defs and undef uses.
 void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
   assert(!MI->isDebugValue() && "Won't process debug values");
+
+  // Break dependence on undef uses. Do this before updating LiveRegs below.
+  unsigned OpNum;
+  unsigned Pref = TII->getUndefRegClearance(MI, OpNum, TRI);
+  if (Pref) {
+    if (shouldBreakDependence(MI, OpNum, Pref))
+      UndefReads.push_back(std::make_pair(MI, OpNum));
+  }
   const MCInstrDesc &MCID = MI->getDesc();
   for (unsigned i = 0,
          e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
@@ -471,37 +520,58 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
     DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
                  << '\t' << *MI);
 
+    // Check clearance before partial register updates.
+    // Call breakDependence before setting LiveRegs[rx].Def.
+    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+    if (Pref && shouldBreakDependence(MI, i, Pref))
+      TII->breakPartialRegDependency(MI, i, TRI);
+
     // How many instructions since rx was last written?
-    unsigned Clearance = CurInstr - LiveRegs[rx].Def;
     LiveRegs[rx].Def = CurInstr;
 
     // Kill off domains redefined by generic instructions.
     if (Kill)
       kill(rx);
+  }
+  ++CurInstr;
+}
 
-    // Verify clearance before partial register updates.
-    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
-    if (!Pref)
-      continue;
-    DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
-    if (Pref > Clearance) {
-      DEBUG(dbgs() << ": Break dependency.\n");
-      TII->breakPartialRegDependency(MI, i, TRI);
-      continue;
-    }
-
-    // The current clearance seems OK, but we may be ignoring a def from a
-    // back-edge.
-    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
-      DEBUG(dbgs() << ": OK.\n");
-      continue;
-    }
+/// \break Break false dependencies on undefined register reads.
+///
+/// Walk the block backward computing precise liveness. This is expensive, so we
+/// only do it on demand. Note that the occurrence of undefined register reads
+/// that should be broken is very rare, but when they occur we may have many in
+/// a single block.
+void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
+  if (UndefReads.empty())
+    return;
 
-    // A def from an unprocessed back-edge may make us break this dependency.
-    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+  // Collect this block's live out register units.
+  LiveUnits.init(TRI);
+  for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    LiveUnits.addLiveIns(*SI, *TRI);
   }
+  MachineInstr *UndefMI = UndefReads.back().first;
+  unsigned OpIdx = UndefReads.back().second;
 
-  ++CurInstr;
+  for (MachineBasicBlock::reverse_iterator I = MBB->rbegin(), E = MBB->rend();
+       I != E; ++I) {
+    // Update liveness, including the current instrucion's defs.
+    LiveUnits.stepBackward(*I, *TRI);
+
+    if (UndefMI == &*I) {
+      if (!LiveUnits.contains(UndefMI->getOperand(OpIdx).getReg(), *TRI))
+        TII->breakPartialRegDependency(UndefMI, OpIdx, TRI);
+
+      UndefReads.pop_back();
+      if (UndefReads.empty())
+        return;
+
+      UndefMI = UndefReads.back().first;
+      OpIdx = UndefReads.back().second;
+    }
+  }
 }
 
 // A hard instruction only works in one domain. All input registers will be
@@ -549,7 +619,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
         // Is it possible to use this collapsed register for free?
         if (dv->isCollapsed()) {
           // Restrict available domains to the ones in common with the operand.
-          // If there are no common domains, we must pay the cross-domain 
+          // If there are no common domains, we must pay the cross-domain
           // penalty for this operand.
           if (common) available = common;
         } else if (common)
@@ -564,7 +634,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
 
   // If the collapsed operands force a single domain, propagate the collapse.
   if (isPowerOf2_32(available)) {
-    unsigned domain = CountTrailingZeros_32(available);
+    unsigned domain = countTrailingZeros(available);
     TII->setExecutionDomain(mi, domain);
     visitHardInstr(mi, domain);
     return;
@@ -573,7 +643,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
   // Kill off any remaining uses that don't match available, and build a list of
   // incoming DomainValues that we want to merge.
   SmallVector<LiveReg, 4> Regs;
-  for (SmallVector<int, 4>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
+  for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
     int rx = *i;
     const LiveReg &LR = LiveRegs[rx];
     // This useless DomainValue could have been missed above.
@@ -583,7 +653,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
     }
     // Sorted insertion.
     bool Inserted = false;
-    for (SmallVector<LiveReg, 4>::iterator i = Regs.begin(), e = Regs.end();
+    for (SmallVectorImpl<LiveReg>::iterator i = Regs.begin(), e = Regs.end();
            i != e && !Inserted; ++i) {
       if (LR.Def < i->Def) {
         Inserted = true;
@@ -614,7 +684,7 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
       continue;
 
     // If latest didn't merge, it is useless now. Kill all registers using it.
-    for (SmallVector<int,4>::iterator i=used.begin(), e=used.end(); i != e; ++i)
+    for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i)
       if (LiveRegs[*i].Value == Latest)
         kill(*i);
   }
@@ -686,6 +756,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
     for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
         ++I)
       visitInstr(I);
+    processUndefReads(MBB);
     leaveBasicBlock(MBB);
   }
 
@@ -698,6 +769,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
         ++I)
       if (!I->isDebugValue())
         processDefs(I, false);
+    processUndefReads(MBB);
     leaveBasicBlock(MBB);
   }
 
@@ -713,6 +785,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
     delete[] FI->second;
   }
   LiveOuts.clear();
+  UndefReads.clear();
   Avail.clear();
   Allocator.DestroyAll();
 
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 1611db8..6c73fff 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -104,7 +104,7 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
   }
 
   if (DstSubReg == InsReg) {
-    // No need to insert an identify copy instruction.
+    // No need to insert an identity copy instruction.
     // Watch out for case like this:
     // %RAX<def> = SUBREG_TO_REG 0, %EAX<kill>, 3
     // We must leave %RAX live.
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 8264d6d..e2d0eb4 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -22,6 +22,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -31,6 +33,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
 using namespace llvm;
 
 // Hidden options for help debugging.
@@ -150,14 +154,17 @@ namespace {
     /// BBAnalysis - Results of if-conversion feasibility analysis indexed by
     /// basic block number.
     std::vector<BBInfo> BBAnalysis;
+    TargetSchedModel SchedModel;
 
     const TargetLoweringBase *TLI;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
-    const InstrItineraryData *InstrItins;
     const MachineBranchProbabilityInfo *MBPI;
     MachineRegisterInfo *MRI;
 
+    LiveRegUnits Redefs;
+    LiveRegUnits DontKill;
+
     bool PreRegAlloc;
     bool MadeChange;
     int FnNum;
@@ -198,11 +205,9 @@ namespace {
     void PredicateBlock(BBInfo &BBI,
                         MachineBasicBlock::iterator E,
                         SmallVectorImpl<MachineOperand> &Cond,
-                        SmallSet<unsigned, 4> &Redefs,
                         SmallSet<unsigned, 4> *LaterRedefs = 0);
     void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
                                SmallVectorImpl<MachineOperand> &Cond,
-                               SmallSet<unsigned, 4> &Redefs,
                                bool IgnoreBr = false);
     void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);
 
@@ -267,7 +272,11 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
   TRI = MF.getTarget().getRegisterInfo();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   MRI = &MF.getRegInfo();
-  InstrItins = MF.getTarget().getInstrItineraryData();
+
+  const TargetSubtargetInfo &ST =
+    MF.getTarget().getSubtarget<TargetSubtargetInfo>();
+  SchedModel.init(*ST.getSchedModel(), &ST, TII);
+
   if (!TII) return false;
 
   PreRegAlloc = MRI->isSSA();
@@ -666,32 +675,28 @@ void IfConverter::ScanInstructions(BBInfo &BBI) {
     bool isPredicated = TII->isPredicated(I);
     bool isCondBr = BBI.IsBrAnalyzable && I->isConditionalBranch();
 
-    if (!isCondBr) {
-      if (!isPredicated) {
-        BBI.NonPredSize++;
-        unsigned ExtraPredCost = 0;
-        unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I,
-                                                  &ExtraPredCost);
-        if (NumCycles > 1)
-          BBI.ExtraCost += NumCycles-1;
-        BBI.ExtraCost2 += ExtraPredCost;
-      } else if (!AlreadyPredicated) {
-        // FIXME: This instruction is already predicated before the
-        // if-conversion pass. It's probably something like a conditional move.
-        // Mark this block unpredicable for now.
-        BBI.IsUnpredicable = true;
-        return;
-      }
+    // A conditional branch is not predicable, but it may be eliminated.
+    if (isCondBr)
+      continue;
+
+    if (!isPredicated) {
+      BBI.NonPredSize++;
+      unsigned ExtraPredCost = TII->getPredicationCost(&*I);
+      unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
+      if (NumCycles > 1)
+        BBI.ExtraCost += NumCycles-1;
+      BBI.ExtraCost2 += ExtraPredCost;
+    } else if (!AlreadyPredicated) {
+      // FIXME: This instruction is already predicated before the
+      // if-conversion pass. It's probably something like a conditional move.
+      // Mark this block unpredicable for now.
+      BBI.IsUnpredicable = true;
+      return;
     }
 
     if (BBI.ClobbersPred && !isPredicated) {
       // Predicate modification instruction should end the block (except for
       // already predicated instructions and end of block branches).
-      if (isCondBr) {
-        // A conditional branch is not predicable, but it may be eliminated.
-        continue;
-      }
-
       // Predicate may have been modified, the subsequent (currently)
       // unpredicated instructions cannot be correctly predicated.
       BBI.IsUnpredicable = true;
@@ -720,9 +725,9 @@ bool IfConverter::FeasibilityAnalysis(BBInfo &BBI,
   if (BBI.IsDone || BBI.IsUnpredicable)
     return false;
 
-  // If it is already predicated, check if its predicate subsumes the new
-  // predicate.
-  if (BBI.Predicate.size() && !TII->SubsumesPredicate(BBI.Predicate, Pred))
+  // If it is already predicated, check if the new predicate subsumes
+  // its predicate.
+  if (BBI.Predicate.size() && !TII->SubsumesPredicate(Pred, BBI.Predicate))
     return false;
 
   if (BBI.BrCond.size()) {
@@ -961,64 +966,58 @@ void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
     BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
 }
 
-/// InitPredRedefs / UpdatePredRedefs - Defs by predicated instructions are
-/// modeled as read + write (sort like two-address instructions). These
-/// routines track register liveness and add implicit uses to if-converted
-/// instructions to conform to the model.
-static void InitPredRedefs(MachineBasicBlock *BB, SmallSet<unsigned,4> &Redefs,
-                           const TargetRegisterInfo *TRI) {
-  for (MachineBasicBlock::livein_iterator I = BB->livein_begin(),
-         E = BB->livein_end(); I != E; ++I) {
-    unsigned Reg = *I;
-    Redefs.insert(Reg);
-    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
-      Redefs.insert(*SubRegs);
-  }
-}
-
-static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
-                             const TargetRegisterInfo *TRI,
-                             bool AddImpUse = false) {
-  SmallVector<unsigned, 4> Defs;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg())
+/// Behaves like LiveRegUnits::StepForward() but also adds implicit uses to all
+/// values defined in MI which are not live/used by MI.
+static void UpdatePredRedefs(MachineInstr *MI, LiveRegUnits &Redefs,
+                             const TargetRegisterInfo *TRI) {
+  for (ConstMIBundleOperands Ops(MI); Ops.isValid(); ++Ops) {
+    if (!Ops->isReg() || !Ops->isKill())
       continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
+    unsigned Reg = Ops->getReg();
+    if (Reg == 0)
       continue;
-    if (MO.isDef())
-      Defs.push_back(Reg);
-    else if (MO.isKill()) {
-      Redefs.erase(Reg);
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
-        Redefs.erase(*SubRegs);
-    }
+    Redefs.removeReg(Reg, *TRI);
   }
-  MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
-  for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
-    unsigned Reg = Defs[i];
-    if (!Redefs.insert(Reg)) {
-      if (AddImpUse)
-        // Treat predicated update as read + write.
-        MIB.addReg(Reg, RegState::Implicit | RegState::Undef);
-    } else {
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
-        Redefs.insert(*SubRegs);
-    }
+  for (MIBundleOperands Ops(MI); Ops.isValid(); ++Ops) {
+    if (!Ops->isReg() || !Ops->isDef())
+      continue;
+    unsigned Reg = Ops->getReg();
+    if (Reg == 0 || Redefs.contains(Reg, *TRI))
+      continue;
+    Redefs.addReg(Reg, *TRI);
+
+    MachineOperand &Op = *Ops;
+    MachineInstr *MI = Op.getParent();
+    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+    MIB.addReg(Reg, RegState::Implicit | RegState::Undef);
   }
 }
 
-static void UpdatePredRedefs(MachineBasicBlock::iterator I,
-                             MachineBasicBlock::iterator E,
-                             SmallSet<unsigned,4> &Redefs,
-                             const TargetRegisterInfo *TRI) {
-  while (I != E) {
-    UpdatePredRedefs(I, Redefs, TRI);
-    ++I;
+/**
+ * Remove kill flags from operands with a registers in the @p DontKill set.
+ */
+static void RemoveKills(MachineInstr &MI, const LiveRegUnits &DontKill,
+                        const MCRegisterInfo &MCRI) {
+  for (MIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (!O->isReg() || !O->isKill())
+      continue;
+    if (DontKill.contains(O->getReg(), MCRI))
+      O->setIsKill(false);
   }
 }
 
+/**
+ * Walks a range of machine instructions and removes kill flags for registers
+ * in the @p DontKill set.
+ */
+static void RemoveKills(MachineBasicBlock::iterator I,
+                        MachineBasicBlock::iterator E,
+                        const LiveRegUnits &DontKill,
+                        const MCRegisterInfo &MCRI) {
+  for ( ; I != E; ++I)
+    RemoveKills(*I, DontKill, MCRI);
+}
+
 /// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG.
 ///
 bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
@@ -1049,21 +1048,27 @@ bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
 
   // Initialize liveins to the first BB. These are potentiall redefined by
   // predicated instructions.
-  SmallSet<unsigned, 4> Redefs;
-  InitPredRedefs(CvtBBI->BB, Redefs, TRI);
-  InitPredRedefs(NextBBI->BB, Redefs, TRI);
+  Redefs.init(TRI);
+  Redefs.addLiveIns(CvtBBI->BB, *TRI);
+  Redefs.addLiveIns(NextBBI->BB, *TRI);
+
+  // Compute a set of registers which must not be killed by instructions in
+  // BB1: This is everything live-in to BB2.
+  DontKill.init(TRI);
+  DontKill.addLiveIns(NextBBI->BB, *TRI);
 
   if (CvtBBI->BB->pred_size() > 1) {
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
     // Copy instructions in the true block, predicate them, and add them to
     // the entry block.
-    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs);
+    CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
 
     // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
     // explicitly remove CvtBBI as a successor.
     BBI.BB->removeSuccessor(CvtBBI->BB);
   } else {
-    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+    RemoveKills(CvtBBI->BB->begin(), CvtBBI->BB->end(), DontKill, *TRI);
+    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
 
     // Merge converted block into entry block.
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -1148,16 +1153,18 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
 
   // Initialize liveins to the first BB. These are potentially redefined by
   // predicated instructions.
-  SmallSet<unsigned, 4> Redefs;
-  InitPredRedefs(CvtBBI->BB, Redefs, TRI);
-  InitPredRedefs(NextBBI->BB, Redefs, TRI);
+  Redefs.init(TRI);
+  Redefs.addLiveIns(CvtBBI->BB, *TRI);
+  Redefs.addLiveIns(NextBBI->BB, *TRI);
+
+  DontKill.clear();
 
   bool HasEarlyExit = CvtBBI->FalseBB != NULL;
   if (CvtBBI->BB->pred_size() > 1) {
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
     // Copy instructions in the true block, predicate them, and add them to
     // the entry block.
-    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, Redefs, true);
+    CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
 
     // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
     // explicitly remove CvtBBI as a successor.
@@ -1165,7 +1172,7 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
   } else {
     // Predicate the 'true' block after removing its branch.
     CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB);
-    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond, Redefs);
+    PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
 
     // Now merge the entry of the triangle with the true block.
     BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
@@ -1276,8 +1283,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
 
   // Initialize liveins to the first BB. These are potentially redefined by
   // predicated instructions.
-  SmallSet<unsigned, 4> Redefs;
-  InitPredRedefs(BBI1->BB, Redefs, TRI);
+  Redefs.init(TRI);
+  Redefs.addLiveIns(BBI1->BB, *TRI);
 
   // Remove the duplicated instructions at the beginnings of both paths.
   MachineBasicBlock::iterator DI1 = BBI1->BB->begin();
@@ -1304,7 +1311,19 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
       --NumDups1;
   }
 
-  UpdatePredRedefs(BBI1->BB->begin(), DI1, Redefs, TRI);
+  // Compute a set of registers which must not be killed by instructions in BB1:
+  // This is everything used+live in BB2 after the duplicated instructions. We
+  // can compute this set by simulating liveness backwards from the end of BB2.
+  DontKill.init(TRI);
+  for (MachineBasicBlock::reverse_iterator I = BBI2->BB->rbegin(),
+       E = MachineBasicBlock::reverse_iterator(DI2); I != E; ++I) {
+    DontKill.stepBackward(*I, *TRI);
+  }
+
+  for (MachineBasicBlock::const_iterator I = BBI1->BB->begin(), E = DI1; I != E;
+       ++I) {
+    Redefs.stepForward(*I, *TRI);
+  }
   BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1);
   BBI2->BB->erase(BBI2->BB->begin(), DI2);
 
@@ -1322,6 +1341,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
   }
   BBI1->BB->erase(DI1, BBI1->BB->end());
 
+  // Kill flags in the true block for registers living into the false block
+  // must be removed.
+  RemoveKills(BBI1->BB->begin(), BBI1->BB->end(), DontKill, *TRI);
+
   // Remove 'false' block branch and find the last instruction to predicate.
   BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
   DI2 = BBI2->BB->end();
@@ -1362,8 +1385,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
         } else if (!RedefsByFalse.count(Reg)) {
           // These are defined before ctrl flow reach the 'false' instructions.
           // They cannot be modified by the 'true' instructions.
-          ExtUses.insert(Reg);
-          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+          for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+               SubRegs.isValid(); ++SubRegs)
             ExtUses.insert(*SubRegs);
         }
       }
@@ -1371,8 +1394,8 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
       for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
         unsigned Reg = Defs[i];
         if (!ExtUses.count(Reg)) {
-          RedefsByFalse.insert(Reg);
-          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+          for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+               SubRegs.isValid(); ++SubRegs)
             RedefsByFalse.insert(*SubRegs);
         }
       }
@@ -1380,10 +1403,10 @@ bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
   }
 
   // Predicate the 'true' block.
-  PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, Redefs, &RedefsByFalse);
+  PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, &RedefsByFalse);
 
   // Predicate the 'false' block.
-  PredicateBlock(*BBI2, DI2, *Cond2, Redefs);
+  PredicateBlock(*BBI2, DI2, *Cond2);
 
   // Merge the true block into the entry of the diamond.
   MergeBlocks(BBI, *BBI1, TailBB == 0);
@@ -1458,7 +1481,6 @@ static bool MaySpeculate(const MachineInstr *MI,
 void IfConverter::PredicateBlock(BBInfo &BBI,
                                  MachineBasicBlock::iterator E,
                                  SmallVectorImpl<MachineOperand> &Cond,
-                                 SmallSet<unsigned, 4> &Redefs,
                                  SmallSet<unsigned, 4> *LaterRedefs) {
   bool AnyUnpred = false;
   bool MaySpec = LaterRedefs != 0;
@@ -1484,7 +1506,7 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
 
     // If the predicated instruction now redefines a register as the result of
     // if-conversion, add an implicit kill.
-    UpdatePredRedefs(I, Redefs, TRI, true);
+    UpdatePredRedefs(I, Redefs, TRI);
   }
 
   std::copy(Cond.begin(), Cond.end(), std::back_inserter(BBI.Predicate));
@@ -1501,7 +1523,6 @@ void IfConverter::PredicateBlock(BBInfo &BBI,
 /// the destination block. Skip end of block branches if IgnoreBr is true.
 void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
                                         SmallVectorImpl<MachineOperand> &Cond,
-                                        SmallSet<unsigned, 4> &Redefs,
                                         bool IgnoreBr) {
   MachineFunction &MF = *ToBBI.BB->getParent();
 
@@ -1514,8 +1535,8 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
     MachineInstr *MI = MF.CloneMachineInstr(I);
     ToBBI.BB->insert(ToBBI.BB->end(), MI);
     ToBBI.NonPredSize++;
-    unsigned ExtraPredCost = 0;
-    unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I, &ExtraPredCost);
+    unsigned ExtraPredCost = TII->getPredicationCost(&*I);
+    unsigned NumCycles = SchedModel.computeInstrLatency(&*I, false);
     if (NumCycles > 1)
       ToBBI.ExtraCost += NumCycles-1;
     ToBBI.ExtraCost2 += ExtraPredCost;
@@ -1531,7 +1552,11 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
 
     // If the predicated instruction now redefines a register as the result of
     // if-conversion, add an implicit kill.
-    UpdatePredRedefs(MI, Redefs, TRI, true);
+    UpdatePredRedefs(MI, Redefs, TRI);
+
+    // Some kill flags may not be correct anymore.
+    if (!DontKill.empty())
+      RemoveKills(*MI, DontKill, *TRI);
   }
 
   if (!IgnoreBr) {
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 35295fe..bb0e642 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -14,6 +14,7 @@
 
 #define DEBUG_TYPE "regalloc"
 #include "Spiller.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -21,8 +22,10 @@
 #include "llvm/CodeGen/LiveRangeEdit.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -63,6 +66,7 @@ class InlineSpiller : public Spiller {
   MachineRegisterInfo &MRI;
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
+  const MachineBlockFrequencyInfo &MBFI;
 
   // Variables that are valid during spill(), but used by multiple methods.
   LiveRangeEdit *Edit;
@@ -146,7 +150,8 @@ public:
       MFI(*mf.getFrameInfo()),
       MRI(mf.getRegInfo()),
       TII(*mf.getTarget().getInstrInfo()),
-      TRI(*mf.getTarget().getRegisterInfo()) {}
+      TRI(*mf.getTarget().getRegisterInfo()),
+      MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()) {}
 
   void spill(LiveRangeEdit &);
 
@@ -174,10 +179,8 @@ private:
   bool coalesceStackAccess(MachineInstr *MI, unsigned Reg);
   bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> >,
                          MachineInstr *LoadMI = 0);
-  void insertReload(LiveInterval &NewLI, SlotIndex,
-                    MachineBasicBlock::iterator MI);
-  void insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
-                   SlotIndex, MachineBasicBlock::iterator MI);
+  void insertReload(unsigned VReg, SlotIndex, MachineBasicBlock::iterator MI);
+  void insertSpill(unsigned VReg, bool isKill, MachineBasicBlock::iterator MI);
 
   void spillAroundUses(unsigned Reg);
   void spillAll();
@@ -337,10 +340,12 @@ static raw_ostream &operator<<(raw_ostream &OS,
 /// propagateSiblingValue - Propagate the value in SVI to dependents if it is
 /// known.  Otherwise remember the dependency for later.
 ///
-/// @param SVI SibValues entry to propagate.
+/// @param SVIIter SibValues entry to propagate.
 /// @param VNI Dependent value, or NULL to propagate to all saved dependents.
-void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVI,
+void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVIIter,
                                           VNInfo *VNI) {
+  SibValueMap::value_type *SVI = &*SVIIter;
+
   // When VNI is non-NULL, add it to SVI's deps, and only propagate to that.
   TinyPtrVector<VNInfo*> FirstDeps;
   if (VNI) {
@@ -352,14 +357,12 @@ void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVI,
   if (!SVI->second.hasDef())
     return;
 
-  // Work list of values to propagate.  It would be nice to use a SetVector
-  // here, but then we would be forced to use a SmallSet.
-  SmallVector<SibValueMap::iterator, 8> WorkList(1, SVI);
-  SmallPtrSet<VNInfo*, 8> WorkSet;
+  // Work list of values to propagate.
+  SmallSetVector<SibValueMap::value_type *, 8> WorkList;
+  WorkList.insert(SVI);
 
   do {
     SVI = WorkList.pop_back_val();
-    WorkSet.erase(SVI->first);
     TinyPtrVector<VNInfo*> *Deps = VNI ? &FirstDeps : &SVI->second.Deps;
     VNI = 0;
 
@@ -450,8 +453,7 @@ void InlineSpiller::propagateSiblingValue(SibValueMap::iterator SVI,
         continue;
 
       // Something changed in DepSVI. Propagate to dependents.
-      if (WorkSet.insert(DepSVI->first))
-        WorkList.push_back(DepSVI);
+      WorkList.insert(&*DepSVI);
 
       DEBUG(dbgs() << "  update " << DepSVI->first->id << '@'
             << DepSVI->first->def << " to:\t" << DepSV);
@@ -576,7 +578,7 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
     if (unsigned SrcReg = isFullCopyOf(MI, Reg)) {
       if (isSibling(SrcReg)) {
         LiveInterval &SrcLI = LIS.getInterval(SrcReg);
-        LiveRangeQuery SrcQ(SrcLI, VNI->def);
+        LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
         assert(SrcQ.valueIn() && "Copy from non-existing value");
         // Check if this COPY kills its source.
         SVI->second.KillsSource = SrcQ.isKill();
@@ -881,12 +883,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   }
 
   // Alocate a new register for the remat.
-  LiveInterval &NewLI = Edit->createFrom(Original);
-  NewLI.markNotSpillable();
+  unsigned NewVReg = Edit->createFrom(Original);
 
   // Finally we can rematerialize OrigMI before MI.
-  SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewLI.reg, RM,
+  SlotIndex DefIdx = Edit->rematerializeAt(*MI->getParent(), MI, NewVReg, RM,
                                            TRI);
+  (void)DefIdx;
   DEBUG(dbgs() << "\tremat:  " << DefIdx << '\t'
                << *LIS.getInstructionFromIndex(DefIdx));
 
@@ -894,15 +896,12 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
   for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
     MachineOperand &MO = MI->getOperand(Ops[i].second);
     if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) {
-      MO.setReg(NewLI.reg);
+      MO.setReg(NewVReg);
       MO.setIsKill();
     }
   }
-  DEBUG(dbgs() << "\t        " << UseIdx << '\t' << *MI);
+  DEBUG(dbgs() << "\t        " << UseIdx << '\t' << *MI << '\n');
 
-  VNInfo *DefVNI = NewLI.getNextValue(DefIdx, LIS.getVNInfoAllocator());
-  NewLI.addRange(LiveRange(DefIdx, UseIdx.getRegSlot(), DefVNI));
-  DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
   ++NumRemats;
   return true;
 }
@@ -1005,6 +1004,40 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
   return true;
 }
 
+#if !defined(NDEBUG)
+// Dump the range of instructions from B to E with their slot indexes.
+static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
+                                               MachineBasicBlock::iterator E,
+                                               LiveIntervals const &LIS,
+                                               const char *const header,
+                                               unsigned VReg =0) {
+  char NextLine = '\n';
+  char SlotIndent = '\t';
+
+  if (llvm::next(B) == E) {
+    NextLine = ' ';
+    SlotIndent = ' ';
+  }
+
+  dbgs() << '\t' << header << ": " << NextLine;
+
+  for (MachineBasicBlock::iterator I = B; I != E; ++I) {
+    SlotIndex Idx = LIS.getInstructionIndex(I).getRegSlot();
+
+    // If a register was passed in and this instruction has it as a
+    // destination that is marked as an early clobber, print the
+    // early-clobber slot index.
+    if (VReg) {
+      MachineOperand *MO = I->findRegisterDefOperand(VReg);
+      if (MO && MO->isEarlyClobber())
+        Idx = Idx.getRegSlot(true);
+    }
+
+    dbgs() << SlotIndent << Idx << '\t' << *I;
+  }
+}
+#endif
+
 /// foldMemoryOperand - Try folding stack slot references in Ops into their
 /// instructions.
 ///
@@ -1024,6 +1057,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
   bool WasCopy = MI->isCopy();
   unsigned ImpReg = 0;
 
+  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+                       MI->getOpcode() == TargetOpcode::STACKMAP);
+
   // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
   // operands.
   SmallVector<unsigned, 8> FoldOps;
@@ -1035,7 +1071,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
       continue;
     }
     // FIXME: Teach targets to deal with subregs.
-    if (MO.getSubReg())
+    if (!SpillSubRegs && MO.getSubReg())
       return false;
     // We cannot fold a load instruction into a def.
     if (LoadMI && MO.isDef())
@@ -1045,14 +1081,52 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
       FoldOps.push_back(Idx);
   }
 
+  MachineInstrSpan MIS(MI);
+
   MachineInstr *FoldMI =
                 LoadMI ? TII.foldMemoryOperand(MI, FoldOps, LoadMI)
                        : TII.foldMemoryOperand(MI, FoldOps, StackSlot);
   if (!FoldMI)
     return false;
+
+  // Remove LIS for any dead defs in the original MI not in FoldMI.
+  for (MIBundleOperands MO(MI); MO.isValid(); ++MO) {
+    if (!MO->isReg())
+      continue;
+    unsigned Reg = MO->getReg();
+    if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
+        MRI.isReserved(Reg)) {
+      continue;
+    }
+    MIBundleOperands::PhysRegInfo RI =
+      MIBundleOperands(FoldMI).analyzePhysReg(Reg, &TRI);
+    if (MO->readsReg()) {
+      assert(RI.Reads && "Cannot fold physreg reader");
+      continue;
+    }
+    if (RI.Defines)
+      continue;
+    // FoldMI does not define this physreg. Remove the LI segment.
+    assert(MO->isDead() && "Cannot fold physreg def");
+    for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) {
+      if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
+        SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+        if (VNInfo *VNI = LR->getVNInfoAt(Idx))
+          LR->removeValNo(VNI);
+      }
+    }
+  }
+
   LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
   MI->eraseFromParent();
 
+  // Insert any new instructions other than FoldMI into the LIS maps.
+  assert(!MIS.empty() && "Unexpected empty span of instructions!");
+  for (MachineBasicBlock::iterator MII = MIS.begin(), End = MIS.end();
+       MII != End; ++MII)
+    if (&*MII != FoldMI)
+      LIS.InsertMachineInstrInMaps(&*MII);
+
   // TII.foldMemoryOperand may have left some implicit operands on the
   // instruction.  Strip them.
   if (ImpReg)
@@ -1064,8 +1138,9 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
         FoldMI->RemoveOperand(i - 1);
     }
 
-  DEBUG(dbgs() << "\tfolded:  " << LIS.getInstructionIndex(FoldMI) << '\t'
-               << *FoldMI);
+  DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS,
+                                           "folded"));
+
   if (!WasCopy)
     ++NumFolded;
   else if (Ops.front().second == 0)
@@ -1075,36 +1150,35 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
   return true;
 }
 
-/// insertReload - Insert a reload of NewLI.reg before MI.
-void InlineSpiller::insertReload(LiveInterval &NewLI,
+void InlineSpiller::insertReload(unsigned NewVReg,
                                  SlotIndex Idx,
                                  MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
-  TII.loadRegFromStackSlot(MBB, MI, NewLI.reg, StackSlot,
-                           MRI.getRegClass(NewLI.reg), &TRI);
-  --MI; // Point to load instruction.
-  SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
-  // Some (out-of-tree) targets have EC reload instructions.
-  if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg))
-    if (MO->isEarlyClobber())
-      LoadIdx = LoadIdx.getRegSlot(true);
-  DEBUG(dbgs() << "\treload:  " << LoadIdx << '\t' << *MI);
-  VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator());
-  NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI));
+
+  MachineInstrSpan MIS(MI);
+  TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
+                           MRI.getRegClass(NewVReg), &TRI);
+
+  LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
+
+  DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MI, LIS, "reload",
+                                           NewVReg));
   ++NumReloads;
 }
 
-/// insertSpill - Insert a spill of NewLI.reg after MI.
-void InlineSpiller::insertSpill(LiveInterval &NewLI, const LiveInterval &OldLI,
-                                SlotIndex Idx, MachineBasicBlock::iterator MI) {
+/// insertSpill - Insert a spill of NewVReg after MI.
+void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
+                                 MachineBasicBlock::iterator MI) {
   MachineBasicBlock &MBB = *MI->getParent();
-  TII.storeRegToStackSlot(MBB, ++MI, NewLI.reg, true, StackSlot,
-                          MRI.getRegClass(NewLI.reg), &TRI);
-  --MI; // Point to store instruction.
-  SlotIndex StoreIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot();
-  DEBUG(dbgs() << "\tspilled: " << StoreIdx << '\t' << *MI);
-  VNInfo *StoreVNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
-  NewLI.addRange(LiveRange(Idx, StoreIdx, StoreVNI));
+
+  MachineInstrSpan MIS(MI);
+  TII.storeRegToStackSlot(MBB, llvm::next(MI), NewVReg, isKill, StackSlot,
+                          MRI.getRegClass(NewVReg), &TRI);
+
+  LIS.InsertMachineInstrRangeInMaps(llvm::next(MI), MIS.end());
+
+  DEBUG(dumpMachineInstrRangeWithSlotIndex(llvm::next(MI), MIS.end(), LIS,
+                                           "spill"));
   ++NumSpills;
 }
 
@@ -1120,18 +1194,14 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
     // Debug values are not allowed to affect codegen.
     if (MI->isDebugValue()) {
       // Modify DBG_VALUE now that the value is in a spill slot.
-      uint64_t Offset = MI->getOperand(1).getImm();
+      bool IsIndirect = MI->isIndirectDebugValue();
+      uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
       const MDNode *MDPtr = MI->getOperand(2).getMetadata();
       DebugLoc DL = MI->getDebugLoc();
-      if (MachineInstr *NewDV = TII.emitFrameIndexDebugValue(MF, StackSlot,
-                                                           Offset, MDPtr, DL)) {
-        DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
-        MachineBasicBlock *MBB = MI->getParent();
-        MBB->insert(MBB->erase(MI), NewDV);
-      } else {
-        DEBUG(dbgs() << "Removing debug info due to spill:" << "\t" << *MI);
-        MI->eraseFromParent();
-      }
+      DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
+      MachineBasicBlock *MBB = MI->getParent();
+      BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE))
+          .addFrameIndex(StackSlot).addImm(Offset).addMetadata(MDPtr);
       continue;
     }
 
@@ -1184,19 +1254,18 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
     if (foldMemoryOperand(Ops))
       continue;
 
-    // Allocate interval around instruction.
+    // Create a new virtual register for spill/fill.
     // FIXME: Infer regclass from instruction alone.
-    LiveInterval &NewLI = Edit->createFrom(Reg);
-    NewLI.markNotSpillable();
+    unsigned NewVReg = Edit->createFrom(Reg);
 
     if (RI.Reads)
-      insertReload(NewLI, Idx, MI);
+      insertReload(NewVReg, Idx, MI);
 
     // Rewrite instruction operands.
     bool hasLiveDef = false;
     for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
       MachineOperand &MO = Ops[i].first->getOperand(Ops[i].second);
-      MO.setReg(NewLI.reg);
+      MO.setReg(NewVReg);
       if (MO.isUse()) {
         if (!Ops[i].first->isRegTiedToDefOperand(Ops[i].second))
           MO.setIsKill();
@@ -1205,21 +1274,12 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
           hasLiveDef = true;
       }
     }
-    DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI);
+    DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n');
 
     // FIXME: Use a second vreg if instruction has no tied ops.
-    if (RI.Writes) {
+    if (RI.Writes)
       if (hasLiveDef)
-        insertSpill(NewLI, OldLI, Idx, MI);
-      else {
-        // This instruction defines a dead value.  We don't need to spill it,
-        // but do create a live range for the dead value.
-        VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
-        NewLI.addRange(LiveRange(Idx, Idx.getDeadSlot(), VNI));
-      }
-    }
-
-    DEBUG(dbgs() << "\tinterval: " << NewLI << '\n');
+        insertSpill(NewVReg, true, MI);
   }
 }
 
@@ -1238,8 +1298,8 @@ void InlineSpiller::spillAll() {
 
   assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
   for (unsigned i = 0, e = RegsToSpill.size(); i != e; ++i)
-    StackInt->MergeRangesInAsValue(LIS.getInterval(RegsToSpill[i]),
-                                   StackInt->getValNumInfo(0));
+    StackInt->MergeSegmentsInAsValue(LIS.getInterval(RegsToSpill[i]),
+                                     StackInt->getValNumInfo(0));
   DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
 
   // Spill around uses of all RegsToSpill.
@@ -1280,8 +1340,8 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
 
   DEBUG(dbgs() << "Inline spilling "
                << MRI.getRegClass(edit.getReg())->getName()
-               << ':' << PrintReg(edit.getReg()) << ' ' << edit.getParent()
-               << "\nFrom original " << LIS.getInterval(Original) << '\n');
+               << ':' << edit.getParent()
+               << "\nFrom original " << PrintReg(Original) << '\n');
   assert(edit.getParent().isSpillable() &&
          "Attempting to spill already spilled value.");
   assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
@@ -1294,5 +1354,5 @@ void InlineSpiller::spill(LiveRangeEdit &edit) {
   if (!RegsToSpill.empty())
     spillAll();
 
-  Edit->calculateRegClassAndHint(MF, Loops);
+  Edit->calculateRegClassAndHint(MF, Loops, MBFI);
 }
diff --git a/lib/CodeGen/InterferenceCache.cpp b/lib/CodeGen/InterferenceCache.cpp
index a8e711e..427225d 100644
--- a/lib/CodeGen/InterferenceCache.cpp
+++ b/lib/CodeGen/InterferenceCache.cpp
@@ -204,11 +204,11 @@ void InterferenceCache::Entry::update(unsigned MBBNum) {
   // Fixed interference.
   for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
     LiveInterval::iterator &I = RegUnits[i].FixedI;
-    LiveInterval *LI = RegUnits[i].Fixed;
-    if (I == LI->end() || I->start >= Stop)
+    LiveRange *LR = RegUnits[i].Fixed;
+    if (I == LR->end() || I->start >= Stop)
       continue;
-    I = LI->advanceTo(I, Stop);
-    bool Backup = I == LI->end() || I->start >= Stop;
+    I = LR->advanceTo(I, Stop);
+    bool Backup = I == LR->end() || I->start >= Stop;
     if (Backup)
       --I;
     SlotIndex StopI = I->end;
diff --git a/lib/CodeGen/InterferenceCache.h b/lib/CodeGen/InterferenceCache.h
index c02fb9a..800f705 100644
--- a/lib/CodeGen/InterferenceCache.h
+++ b/lib/CodeGen/InterferenceCache.h
@@ -72,7 +72,7 @@ class InterferenceCache {
       unsigned VirtTag;
 
       /// Fixed interference in RegUnit.
-      LiveInterval *Fixed;
+      LiveRange *Fixed;
 
       /// Iterator pointing into the fixed RegUnit interference.
       LiveInterval::iterator FixedI;
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index d894f66..c38d4fb 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -485,11 +485,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
     break;
   }
   case Intrinsic::memset: {
-    Type *IntPtr = TD.getIntPtrType(Context);
+    Value *Op0 = CI->getArgOperand(0);
+    Type *IntPtr = TD.getIntPtrType(Op0->getType());
     Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
                                         /* isSigned */ false);
     Value *Ops[3];
-    Ops[0] = CI->getArgOperand(0);
+    Ops[0] = Op0;
     // Extend the amount to i32.
     Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1),
                                    Type::getInt32Ty(Context),
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 1a09837..ad2c553 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -62,6 +62,17 @@ static bool getVerboseAsm() {
   llvm_unreachable("Invalid verbose asm state");
 }
 
+void LLVMTargetMachine::initAsmInfo() {
+  AsmInfo = TheTarget.createMCAsmInfo(*getRegisterInfo(), TargetTriple);
+  // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
+  // and if the old one gets included then MCAsmInfo will be NULL and
+  // we'll crash later.
+  // Provide the user with a useful error message about what's wrong.
+  assert(AsmInfo && "MCAsmInfo not initialized. "
+         "Make sure you include the correct TargetSelect.h"
+         "and that InitializeAllTargetMCs() is being invoked!");
+}
+
 LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
                                      StringRef CPU, StringRef FS,
                                      TargetOptions Options,
@@ -69,18 +80,10 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple,
                                      CodeGenOpt::Level OL)
   : TargetMachine(T, Triple, CPU, FS, Options) {
   CodeGenInfo = T.createMCCodeGenInfo(Triple, RM, CM, OL);
-  AsmInfo = T.createMCAsmInfo(Triple);
-  // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
-  // and if the old one gets included then MCAsmInfo will be NULL and
-  // we'll crash later.
-  // Provide the user with a useful error message about what's wrong.
-  assert(AsmInfo && "MCAsmInfo not initialized."
-         "Make sure you include the correct TargetSelect.h"
-         "and that InitializeAllTargetMCs() is being invoked!");
 }
 
 void LLVMTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createBasicTargetTransformInfoPass(this));
 }
 
 /// addPassesToX helper drives creation and initialization of TargetPassConfig.
@@ -112,7 +115,6 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
     new MachineModuleInfo(*TM->getMCAsmInfo(), *TM->getRegisterInfo(),
                           &TM->getTargetLowering()->getObjFileLowering());
   PM.add(MMI);
-  MCContext *Context = &MMI->getContext(); // Return the MCContext by-ref.
 
   // Set up a MachineFunction for the rest of CodeGen to work on.
   PM.add(new MachineFunctionAnalysis(*TM));
@@ -131,7 +133,7 @@ static MCContext *addPassesToGenerateCode(LLVMTargetMachine *TM,
 
   PassConfig->setInitialized();
 
-  return Context;
+  return &MMI->getContext();
 }
 
 bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
@@ -161,6 +163,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 
   const MCAsmInfo &MAI = *getMCAsmInfo();
   const MCRegisterInfo &MRI = *getRegisterInfo();
+  const MCInstrInfo &MII = *getInstrInfo();
   const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
   OwningPtr<MCStreamer> AsmStreamer;
 
@@ -168,19 +171,15 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
   case CGFT_AssemblyFile: {
     MCInstPrinter *InstPrinter =
       getTarget().createMCInstPrinter(MAI.getAssemblerDialect(), MAI,
-                                      *getInstrInfo(),
-                                      Context->getRegisterInfo(), STI);
+                                      MII, MRI, STI);
 
     // Create a code emitter if asked to show the encoding.
     MCCodeEmitter *MCE = 0;
-    MCAsmBackend *MAB = 0;
-    if (ShowMCEncoding) {
-      const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
-      MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, STI,
-                                            *Context);
-      MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
-    }
+    if (ShowMCEncoding)
+      MCE = getTarget().createMCCodeEmitter(MII, MRI, STI, *Context);
 
+    MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
+                                                       TargetCPU);
     MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
                                                   getVerboseAsm(),
                                                   hasMCUseLoc(),
@@ -195,9 +194,9 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
   case CGFT_ObjectFile: {
     // Create the code emitter for the target if it exists.  If not, .o file
     // emission fails.
-    MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
-                                                         STI, *Context);
-    MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(),
+    MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, STI,
+                                                         *Context);
+    MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
                                                        TargetCPU);
     if (MCE == 0 || MAB == 0)
       return true;
@@ -232,7 +231,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
 /// addPassesToEmitMachineCode - Add passes to the specified pass manager to
 /// get machine code emitted.  This uses a JITCodeEmitter object to handle
 /// actually outputting the machine code and resolving things like the address
-/// of functions.  This method should returns true if machine code emission is
+/// of functions.  This method should return true if machine code emission is
 /// not supported.
 ///
 bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM,
@@ -271,7 +270,8 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
   const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
   MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
                                                        STI, *Ctx);
-  MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
+  MCAsmBackend *MAB = getTarget().createMCAsmBackend(MRI, getTargetTriple(),
+                                                     TargetCPU);
   if (MCE == 0 || MAB == 0)
     return true;
 
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index 8172154..ffe407a 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -212,15 +212,15 @@ LexicalScope *LexicalScopes::getOrCreateAbstractScope(const MDNode *N) {
 
 /// constructScopeNest
 void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
-  assert (Scope && "Unable to calculate scop edominance graph!");
+  assert (Scope && "Unable to calculate scope dominance graph!");
   SmallVector<LexicalScope *, 4> WorkStack;
   WorkStack.push_back(Scope);
   unsigned Counter = 0;
   while (!WorkStack.empty()) {
     LexicalScope *WS = WorkStack.back();
-    const SmallVector<LexicalScope *, 4> &Children = WS->getChildren();
+    const SmallVectorImpl<LexicalScope *> &Children = WS->getChildren();
     bool visitedChildren = false;
-    for (SmallVector<LexicalScope *, 4>::const_iterator SI = Children.begin(),
+    for (SmallVectorImpl<LexicalScope *>::const_iterator SI = Children.begin(),
            SE = Children.end(); SI != SE; ++SI) {
       LexicalScope *ChildScope = *SI;
       if (!ChildScope->getDFSOut()) {
@@ -279,8 +279,8 @@ getMachineBasicBlocks(DebugLoc DL,
     return;
   }
 
-  SmallVector<InsnRange, 4> &InsnRanges = Scope->getRanges();
-  for (SmallVector<InsnRange, 4>::iterator I = InsnRanges.begin(),
+  SmallVectorImpl<InsnRange> &InsnRanges = Scope->getRanges();
+  for (SmallVectorImpl<InsnRange>::iterator I = InsnRanges.begin(),
          E = InsnRanges.end(); I != E; ++I) {
     InsnRange &R = *I;
     MBBs.insert(R.first->getParent());
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 0b117ac..25645e0 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -108,6 +108,7 @@ class LDVImpl;
 class UserValue {
   const MDNode *variable; ///< The debug info variable we are part of.
   unsigned offset;        ///< Byte offset into variable.
+  bool IsIndirect;        ///< true if this is a register-indirect+offset value.
   DebugLoc dl;            ///< The debug location for the variable. This is
                           ///< used by dwarf writer to find lexical scope.
   UserValue *leader;      ///< Equivalence class leader.
@@ -130,13 +131,15 @@ class UserValue {
 
   /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
   /// is live. Returns true if any changes were made.
-  bool splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+  bool splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+                     LiveIntervals &LIS);
 
 public:
   /// UserValue - Create a new UserValue.
-  UserValue(const MDNode *var, unsigned o, DebugLoc L,
+  UserValue(const MDNode *var, unsigned o, bool i, DebugLoc L,
             LocMap::Allocator &alloc)
-    : variable(var), offset(o), dl(L), leader(this), next(0), locInts(alloc)
+    : variable(var), offset(o), IsIndirect(i), dl(L), leader(this),
+      next(0), locInts(alloc)
   {}
 
   /// getLeader - Get the leader of this value's equivalence class.
@@ -217,13 +220,13 @@ public:
   /// End points where VNI is no longer live are added to Kills.
   /// @param Idx   Starting point for the definition.
   /// @param LocNo Location number to propagate.
-  /// @param LI    Restrict liveness to where LI has the value VNI. May be null.
-  /// @param VNI   When LI is not null, this is the value to restrict to.
+  /// @param LR    Restrict liveness to where LR has the value VNI. May be null.
+  /// @param VNI   When LR is not null, this is the value to restrict to.
   /// @param Kills Append end points of VNI's live range to Kills.
   /// @param LIS   Live intervals analysis.
   /// @param MDT   Dominator tree.
   void extendDef(SlotIndex Idx, unsigned LocNo,
-                 LiveInterval *LI, const VNInfo *VNI,
+                 LiveRange *LR, const VNInfo *VNI,
                  SmallVectorImpl<SlotIndex> *Kills,
                  LiveIntervals &LIS, MachineDominatorTree &MDT,
                  UserValueScopes &UVS);
@@ -249,7 +252,8 @@ public:
 
   /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
   /// live. Returns true if any changes were made.
-  bool splitRegister(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs);
+  bool splitRegister(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+                     LiveIntervals &LIS);
 
   /// rewriteLocations - Rewrite virtual register locations according to the
   /// provided virtual register map.
@@ -299,7 +303,8 @@ class LDVImpl {
   UVMap userVarMap;
 
   /// getUserValue - Find or create a UserValue.
-  UserValue *getUserValue(const MDNode *Var, unsigned Offset, DebugLoc DL);
+  UserValue *getUserValue(const MDNode *Var, unsigned Offset,
+                          bool IsIndirect, DebugLoc DL);
 
   /// lookupVirtReg - Find the EC leader for VirtReg or null.
   UserValue *lookupVirtReg(unsigned VirtReg);
@@ -342,7 +347,7 @@ public:
   void mapVirtReg(unsigned VirtReg, UserValue *EC);
 
   /// splitRegister -  Replace all references to OldReg with NewRegs.
-  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+  void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs);
 
   /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
   void emitDebugValues(VirtRegMap *VRM);
@@ -414,7 +419,7 @@ void UserValue::mapVirtRegs(LDVImpl *LDV) {
 }
 
 UserValue *LDVImpl::getUserValue(const MDNode *Var, unsigned Offset,
-                                 DebugLoc DL) {
+                                 bool IsIndirect, DebugLoc DL) {
   UserValue *&Leader = userVarMap[Var];
   if (Leader) {
     UserValue *UV = Leader->getLeader();
@@ -424,7 +429,7 @@ UserValue *LDVImpl::getUserValue(const MDNode *Var, unsigned Offset,
         return UV;
   }
 
-  UserValue *UV = new UserValue(Var, Offset, DL, allocator);
+  UserValue *UV = new UserValue(Var, Offset, IsIndirect, DL, allocator);
   userValues.push_back(UV);
   Leader = UserValue::merge(Leader, UV);
   return UV;
@@ -445,15 +450,18 @@ UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) {
 bool LDVImpl::handleDebugValue(MachineInstr *MI, SlotIndex Idx) {
   // DBG_VALUE loc, offset, variable
   if (MI->getNumOperands() != 3 ||
-      !MI->getOperand(1).isImm() || !MI->getOperand(2).isMetadata()) {
+      !(MI->getOperand(1).isReg() || MI->getOperand(1).isImm()) ||
+      !MI->getOperand(2).isMetadata()) {
     DEBUG(dbgs() << "Can't handle " << *MI);
     return false;
   }
 
   // Get or create the UserValue for (variable,offset).
-  unsigned Offset = MI->getOperand(1).getImm();
+  bool IsIndirect = MI->isIndirectDebugValue();
+  unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
   const MDNode *Var = MI->getOperand(2).getMetadata();
-  UserValue *UV = getUserValue(Var, Offset, MI->getDebugLoc());
+  //here.
+  UserValue *UV = getUserValue(Var, Offset, IsIndirect, MI->getDebugLoc());
   UV->addDef(Idx, MI->getOperand(0));
   return true;
 }
@@ -487,7 +495,7 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) {
 }
 
 void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
-                          LiveInterval *LI, const VNInfo *VNI,
+                          LiveRange *LR, const VNInfo *VNI,
                           SmallVectorImpl<SlotIndex> *Kills,
                           LiveIntervals &LIS, MachineDominatorTree &MDT,
                           UserValueScopes &UVS) {
@@ -501,15 +509,15 @@ void UserValue::extendDef(SlotIndex Idx, unsigned LocNo,
 
     // Limit to VNI's live range.
     bool ToEnd = true;
-    if (LI && VNI) {
-      LiveRange *Range = LI->getLiveRangeContaining(Start);
-      if (!Range || Range->valno != VNI) {
+    if (LR && VNI) {
+      LiveInterval::Segment *Segment = LR->getSegmentContaining(Start);
+      if (!Segment || Segment->valno != VNI) {
         if (Kills)
           Kills->push_back(Start);
         continue;
       }
-      if (Range->end < Stop)
-        Stop = Range->end, ToEnd = false;
+      if (Segment->end < Stop)
+        Stop = Segment->end, ToEnd = false;
     }
 
     // There could already be a short def at Start.
@@ -661,10 +669,10 @@ UserValue::computeIntervals(MachineRegisterInfo &MRI,
 
     // For physregs, use the live range of the first regunit as a guide.
     unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI);
-    LiveInterval *LI = &LIS.getRegUnit(Unit);
-    const VNInfo *VNI = LI->getVNInfoAt(Idx);
+    LiveRange *LR = &LIS.getRegUnit(Unit);
+    const VNInfo *VNI = LR->getVNInfoAt(Idx);
     // Don't track copies from physregs, it is too expensive.
-    extendDef(Idx, LocNo, LI, VNI, 0, LIS, MDT, UVS);
+    extendDef(Idx, LocNo, LR, VNI, 0, LIS, MDT, UVS);
   }
 
   // Finally, erase all the undefs.
@@ -724,7 +732,8 @@ LiveDebugVariables::~LiveDebugVariables() {
 //===----------------------------------------------------------------------===//
 
 bool
-UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+                         LiveIntervals& LIS) {
   DEBUG({
     dbgs() << "Splitting Loc" << OldLocNo << '\t';
     print(dbgs(), 0);
@@ -733,7 +742,7 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
   LocMap::iterator LocMapI;
   LocMapI.setMap(locInts);
   for (unsigned i = 0; i != NewRegs.size(); ++i) {
-    LiveInterval *LI = NewRegs[i];
+    LiveInterval *LI = &LIS.getInterval(NewRegs[i]);
     if (LI->empty())
       continue;
 
@@ -822,7 +831,8 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<LiveInterval*> NewRegs) {
 }
 
 bool
-UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+UserValue::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+                         LiveIntervals &LIS) {
   bool DidChange = false;
   // Split locations referring to OldReg. Iterate backwards so splitLocation can
   // safely erase unused locations.
@@ -831,15 +841,15 @@ UserValue::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
     const MachineOperand *Loc = &locations[LocNo];
     if (!Loc->isReg() || Loc->getReg() != OldReg)
       continue;
-    DidChange |= splitLocation(LocNo, NewRegs);
+    DidChange |= splitLocation(LocNo, NewRegs, LIS);
   }
   return DidChange;
 }
 
-void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs) {
   bool DidChange = false;
   for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
-    DidChange |= UV->splitRegister(OldReg, NewRegs);
+    DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
 
   if (!DidChange)
     return;
@@ -847,11 +857,11 @@ void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
   // Map all of the new virtual registers.
   UserValue *UV = lookupVirtReg(OldReg);
   for (unsigned i = 0; i != NewRegs.size(); ++i)
-    mapVirtReg(NewRegs[i]->reg, UV);
+    mapVirtReg(NewRegs[i], UV);
 }
 
 void LiveDebugVariables::
-splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs) {
+splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs, LiveIntervals &LIS) {
   if (pImpl)
     static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
 }
@@ -921,19 +931,12 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,
   MachineOperand &Loc = locations[LocNo];
   ++NumInsertedDebugValues;
 
-  // Frame index locations may require a target callback.
-  if (Loc.isFI()) {
-    MachineInstr *MI = TII.emitFrameIndexDebugValue(*MBB->getParent(),
-                                          Loc.getIndex(), offset, variable, 
-                                                    findDebugLoc());
-    if (MI) {
-      MBB->insert(I, MI);
-      return;
-    }
-  }
-  // This is not a frame index, or the target is happy with a standard FI.
-  BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
-    .addOperand(Loc).addImm(offset).addMetadata(variable);
+  if (Loc.isReg())
+    BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
+            IsIndirect, Loc.getReg(), offset, variable);
+  else
+    BuildMI(*MBB, I, findDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
+      .addOperand(Loc).addImm(offset).addMetadata(variable);
 }
 
 void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
@@ -992,4 +995,3 @@ void LiveDebugVariables::dump() {
     static_cast<LDVImpl*>(pImpl)->print(dbgs());
 }
 #endif
-
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index 3ce3c39..58a3f0f 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -27,6 +27,7 @@
 namespace llvm {
 
 class LiveInterval;
+class LiveIntervals;
 class VirtRegMap;
 
 class LiveDebugVariables : public MachineFunctionPass {
@@ -47,7 +48,8 @@ public:
   /// splitRegister - Move any user variables in OldReg to the live ranges in
   /// NewRegs where they are live. Mark the values as unavailable where no new
   /// register is live.
-  void splitRegister(unsigned OldReg, ArrayRef<LiveInterval*> NewRegs);
+  void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+                     LiveIntervals &LIS);
 
   /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
   /// that happened during register allocation.
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index dccd847..2b8feb8 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -9,12 +9,12 @@
 //
 // This file implements the LiveRange and LiveInterval classes.  Given some
 // numbering of each the machine instructions an interval [i, j) is said to be a
-// live interval for register v if there is no instruction with number j' > j
+// live range for register v if there is no instruction with number j' >= j
 // such that v is live at j' and there is no instruction with number i' < i such
-// that v is live at i'. In this implementation intervals can have holes,
-// i.e. an interval might look like [1,20), [50,65), [1000,1001).  Each
-// individual range is represented as an instance of LiveRange, and the whole
-// interval is represented as an instance of LiveInterval.
+// that v is live at i'. In this implementation ranges can have holes,
+// i.e. a range might look like [1,20), [50,65), [1000,1001).  Each
+// individual segment is represented as an instance of LiveRange::Segment,
+// and the whole range is represented as an instance of LiveRange.
 //
 //===----------------------------------------------------------------------===//
 
@@ -31,14 +31,14 @@
 #include <algorithm>
 using namespace llvm;
 
-LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
+LiveRange::iterator LiveRange::find(SlotIndex Pos) {
   // This algorithm is basically std::upper_bound.
   // Unfortunately, std::upper_bound cannot be used with mixed types until we
   // adopt C++0x. Many libraries can do it, but not all.
   if (empty() || Pos >= endIndex())
     return end();
   iterator I = begin();
-  size_t Len = ranges.size();
+  size_t Len = size();
   do {
     size_t Mid = Len >> 1;
     if (Pos < I[Mid].end)
@@ -49,13 +49,13 @@ LiveInterval::iterator LiveInterval::find(SlotIndex Pos) {
   return I;
 }
 
-VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
-                                    VNInfo::Allocator &VNInfoAllocator) {
+VNInfo *LiveRange::createDeadDef(SlotIndex Def,
+                                  VNInfo::Allocator &VNInfoAllocator) {
   assert(!Def.isDead() && "Cannot define a value at the dead slot");
   iterator I = find(Def);
   if (I == end()) {
     VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-    ranges.push_back(LiveRange(Def, Def.getDeadSlot(), VNI));
+    segments.push_back(Segment(Def, Def.getDeadSlot(), VNI));
     return VNI;
   }
   if (SlotIndex::isSameInstr(Def, I->start)) {
@@ -73,11 +73,11 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
   }
   assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
   VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-  ranges.insert(I, LiveRange(Def, Def.getDeadSlot(), VNI));
+  segments.insert(I, Segment(Def, Def.getDeadSlot(), VNI));
   return VNI;
 }
 
-// overlaps - Return true if the intersection of the two live intervals is
+// overlaps - Return true if the intersection of the two live ranges is
 // not empty.
 //
 // An example for overlaps():
@@ -86,7 +86,7 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
 // 4: B = ...
 // 8: C = A + B ;; last use of A
 //
-// The live intervals should look like:
+// The live ranges should look like:
 //
 // A = [3, 11)
 // B = [7, x)
@@ -95,9 +95,9 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
 // A->overlaps(C) should return false since we want to be able to join
 // A and C.
 //
-bool LiveInterval::overlapsFrom(const LiveInterval& other,
-                                const_iterator StartPos) const {
-  assert(!empty() && "empty interval");
+bool LiveRange::overlapsFrom(const LiveRange& other,
+                             const_iterator StartPos) const {
+  assert(!empty() && "empty range");
   const_iterator i = begin();
   const_iterator ie = end();
   const_iterator j = StartPos;
@@ -108,13 +108,13 @@ bool LiveInterval::overlapsFrom(const LiveInterval& other,
 
   if (i->start < j->start) {
     i = std::upper_bound(i, ie, j->start);
-    if (i != ranges.begin()) --i;
+    if (i != begin()) --i;
   } else if (j->start < i->start) {
     ++StartPos;
     if (StartPos != other.end() && StartPos->start <= i->start) {
       assert(StartPos < other.end() && i < end());
       j = std::upper_bound(j, je, i->start);
-      if (j != other.ranges.begin()) --j;
+      if (j != other.begin()) --j;
     }
   } else {
     return true;
@@ -136,10 +136,9 @@ bool LiveInterval::overlapsFrom(const LiveInterval& other,
   return false;
 }
 
-bool LiveInterval::overlaps(const LiveInterval &Other,
-                            const CoalescerPair &CP,
-                            const SlotIndexes &Indexes) const {
-  assert(!empty() && "empty interval");
+bool LiveRange::overlaps(const LiveRange &Other, const CoalescerPair &CP,
+                         const SlotIndexes &Indexes) const {
+  assert(!empty() && "empty range");
   if (Other.empty())
     return false;
 
@@ -178,9 +177,9 @@ bool LiveInterval::overlaps(const LiveInterval &Other,
   }
 }
 
-/// overlaps - Return true if the live interval overlaps a range specified
+/// overlaps - Return true if the live range overlaps an interval specified
 /// by [Start, End).
-bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
+bool LiveRange::overlaps(SlotIndex Start, SlotIndex End) const {
   assert(Start < End && "Invalid range");
   const_iterator I = std::lower_bound(begin(), end(), End);
   return I != begin() && (--I)->end > Start;
@@ -190,7 +189,7 @@ bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
 /// ValNo is dead, remove it.  If it is the largest value number, just nuke it
 /// (and any other deleted values neighboring it), otherwise mark it as ~1U so
 /// it can be nuked later.
-void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
+void LiveRange::markValNoForDeletion(VNInfo *ValNo) {
   if (ValNo->id == getNumValNums()-1) {
     do {
       valnos.pop_back();
@@ -202,137 +201,135 @@ void LiveInterval::markValNoForDeletion(VNInfo *ValNo) {
 
 /// RenumberValues - Renumber all values in order of appearance and delete the
 /// remaining unused values.
-void LiveInterval::RenumberValues(LiveIntervals &lis) {
+void LiveRange::RenumberValues() {
   SmallPtrSet<VNInfo*, 8> Seen;
   valnos.clear();
   for (const_iterator I = begin(), E = end(); I != E; ++I) {
     VNInfo *VNI = I->valno;
     if (!Seen.insert(VNI))
       continue;
-    assert(!VNI->isUnused() && "Unused valno used by live range");
+    assert(!VNI->isUnused() && "Unused valno used by live segment");
     VNI->id = (unsigned)valnos.size();
     valnos.push_back(VNI);
   }
 }
 
-/// extendIntervalEndTo - This method is used when we want to extend the range
-/// specified by I to end at the specified endpoint.  To do this, we should
-/// merge and eliminate all ranges that this will overlap with.  The iterator is
-/// not invalidated.
-void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) {
-  assert(I != ranges.end() && "Not a valid interval!");
+/// This method is used when we want to extend the segment specified by I to end
+/// at the specified endpoint.  To do this, we should merge and eliminate all
+/// segments that this will overlap with.  The iterator is not invalidated.
+void LiveRange::extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
+  assert(I != end() && "Not a valid segment!");
   VNInfo *ValNo = I->valno;
 
-  // Search for the first interval that we can't merge with.
-  Ranges::iterator MergeTo = llvm::next(I);
-  for (; MergeTo != ranges.end() && NewEnd >= MergeTo->end; ++MergeTo) {
+  // Search for the first segment that we can't merge with.
+  iterator MergeTo = llvm::next(I);
+  for (; MergeTo != end() && NewEnd >= MergeTo->end; ++MergeTo) {
     assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
   }
 
-  // If NewEnd was in the middle of an interval, make sure to get its endpoint.
+  // If NewEnd was in the middle of a segment, make sure to get its endpoint.
   I->end = std::max(NewEnd, prior(MergeTo)->end);
 
-  // If the newly formed range now touches the range after it and if they have
-  // the same value number, merge the two ranges into one range.
-  if (MergeTo != ranges.end() && MergeTo->start <= I->end &&
+  // If the newly formed segment now touches the segment after it and if they
+  // have the same value number, merge the two segments into one segment.
+  if (MergeTo != end() && MergeTo->start <= I->end &&
       MergeTo->valno == ValNo) {
     I->end = MergeTo->end;
     ++MergeTo;
   }
 
-  // Erase any dead ranges.
-  ranges.erase(llvm::next(I), MergeTo);
+  // Erase any dead segments.
+  segments.erase(llvm::next(I), MergeTo);
 }
 
 
-/// extendIntervalStartTo - This method is used when we want to extend the range
-/// specified by I to start at the specified endpoint.  To do this, we should
-/// merge and eliminate all ranges that this will overlap with.
-LiveInterval::Ranges::iterator
-LiveInterval::extendIntervalStartTo(Ranges::iterator I, SlotIndex NewStart) {
-  assert(I != ranges.end() && "Not a valid interval!");
+/// This method is used when we want to extend the segment specified by I to
+/// start at the specified endpoint.  To do this, we should merge and eliminate
+/// all segments that this will overlap with.
+LiveRange::iterator
+LiveRange::extendSegmentStartTo(iterator I, SlotIndex NewStart) {
+  assert(I != end() && "Not a valid segment!");
   VNInfo *ValNo = I->valno;
 
-  // Search for the first interval that we can't merge with.
-  Ranges::iterator MergeTo = I;
+  // Search for the first segment that we can't merge with.
+  iterator MergeTo = I;
   do {
-    if (MergeTo == ranges.begin()) {
+    if (MergeTo == begin()) {
       I->start = NewStart;
-      ranges.erase(MergeTo, I);
+      segments.erase(MergeTo, I);
       return I;
     }
     assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
     --MergeTo;
   } while (NewStart <= MergeTo->start);
 
-  // If we start in the middle of another interval, just delete a range and
-  // extend that interval.
+  // If we start in the middle of another segment, just delete a range and
+  // extend that segment.
   if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
     MergeTo->end = I->end;
   } else {
-    // Otherwise, extend the interval right after.
+    // Otherwise, extend the segment right after.
     ++MergeTo;
     MergeTo->start = NewStart;
     MergeTo->end = I->end;
   }
 
-  ranges.erase(llvm::next(MergeTo), llvm::next(I));
+  segments.erase(llvm::next(MergeTo), llvm::next(I));
   return MergeTo;
 }
 
-LiveInterval::iterator
-LiveInterval::addRangeFrom(LiveRange LR, iterator From) {
-  SlotIndex Start = LR.start, End = LR.end;
-  iterator it = std::upper_bound(From, ranges.end(), Start);
+LiveRange::iterator LiveRange::addSegmentFrom(Segment S, iterator From) {
+  SlotIndex Start = S.start, End = S.end;
+  iterator it = std::upper_bound(From, end(), Start);
 
-  // If the inserted interval starts in the middle or right at the end of
-  // another interval, just extend that interval to contain the range of LR.
-  if (it != ranges.begin()) {
+  // If the inserted segment starts in the middle or right at the end of
+  // another segment, just extend that segment to contain the segment of S.
+  if (it != begin()) {
     iterator B = prior(it);
-    if (LR.valno == B->valno) {
+    if (S.valno == B->valno) {
       if (B->start <= Start && B->end >= Start) {
-        extendIntervalEndTo(B, End);
+        extendSegmentEndTo(B, End);
         return B;
       }
     } else {
-      // Check to make sure that we are not overlapping two live ranges with
+      // Check to make sure that we are not overlapping two live segments with
       // different valno's.
       assert(B->end <= Start &&
-             "Cannot overlap two LiveRanges with differing ValID's"
+             "Cannot overlap two segments with differing ValID's"
              " (did you def the same reg twice in a MachineInstr?)");
     }
   }
 
-  // Otherwise, if this range ends in the middle of, or right next to, another
-  // interval, merge it into that interval.
-  if (it != ranges.end()) {
-    if (LR.valno == it->valno) {
+  // Otherwise, if this segment ends in the middle of, or right next to, another
+  // segment, merge it into that segment.
+  if (it != end()) {
+    if (S.valno == it->valno) {
       if (it->start <= End) {
-        it = extendIntervalStartTo(it, Start);
+        it = extendSegmentStartTo(it, Start);
 
-        // If LR is a complete superset of an interval, we may need to grow its
+        // If S is a complete superset of a segment, we may need to grow its
         // endpoint as well.
         if (End > it->end)
-          extendIntervalEndTo(it, End);
+          extendSegmentEndTo(it, End);
         return it;
       }
     } else {
-      // Check to make sure that we are not overlapping two live ranges with
+      // Check to make sure that we are not overlapping two live segments with
       // different valno's.
       assert(it->start >= End &&
-             "Cannot overlap two LiveRanges with differing ValID's");
+             "Cannot overlap two segments with differing ValID's");
     }
   }
 
-  // Otherwise, this is just a new range that doesn't interact with anything.
+  // Otherwise, this is just a new segment that doesn't interact with anything.
   // Insert it.
-  return ranges.insert(it, LR);
+  return segments.insert(it, S);
 }
 
-/// extendInBlock - If this interval is live before Kill in the basic
+/// extendInBlock - If this range is live before Kill in the basic
 /// block that starts at StartIdx, extend it to be live up to Kill and return
 /// the value. If there is no live range before Kill, return NULL.
-VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
+VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
   if (empty())
     return 0;
   iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot());
@@ -342,20 +339,21 @@ VNInfo *LiveInterval::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
   if (I->end <= StartIdx)
     return 0;
   if (I->end < Kill)
-    extendIntervalEndTo(I, Kill);
+    extendSegmentEndTo(I, Kill);
   return I->valno;
 }
 
-/// removeRange - Remove the specified range from this interval.  Note that
-/// the range must be in a single LiveRange in its entirety.
-void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
-                               bool RemoveDeadValNo) {
-  // Find the LiveRange containing this span.
-  Ranges::iterator I = find(Start);
-  assert(I != ranges.end() && "Range is not in interval!");
-  assert(I->containsRange(Start, End) && "Range is not entirely in interval!");
+/// Remove the specified segment from this range.  Note that the segment must
+/// be in a single Segment in its entirety.
+void LiveRange::removeSegment(SlotIndex Start, SlotIndex End,
+                              bool RemoveDeadValNo) {
+  // Find the Segment containing this span.
+  iterator I = find(Start);
+  assert(I != end() && "Segment is not in range!");
+  assert(I->containsInterval(Start, End)
+         && "Segment is not entirely in range!");
 
-  // If the span we are removing is at the start of the LiveRange, adjust it.
+  // If the span we are removing is at the start of the Segment, adjust it.
   VNInfo *ValNo = I->valno;
   if (I->start == Start) {
     if (I->end == End) {
@@ -373,54 +371,50 @@ void LiveInterval::removeRange(SlotIndex Start, SlotIndex End,
         }
       }
 
-      ranges.erase(I);  // Removed the whole LiveRange.
+      segments.erase(I);  // Removed the whole Segment.
     } else
       I->start = End;
     return;
   }
 
-  // Otherwise if the span we are removing is at the end of the LiveRange,
+  // Otherwise if the span we are removing is at the end of the Segment,
   // adjust the other way.
   if (I->end == End) {
     I->end = Start;
     return;
   }
 
-  // Otherwise, we are splitting the LiveRange into two pieces.
+  // Otherwise, we are splitting the Segment into two pieces.
   SlotIndex OldEnd = I->end;
-  I->end = Start;   // Trim the old interval.
+  I->end = Start;   // Trim the old segment.
 
   // Insert the new one.
-  ranges.insert(llvm::next(I), LiveRange(End, OldEnd, ValNo));
+  segments.insert(llvm::next(I), Segment(End, OldEnd, ValNo));
 }
 
-/// removeValNo - Remove all the ranges defined by the specified value#.
+/// removeValNo - Remove all the segments defined by the specified value#.
 /// Also remove the value# from value# list.
-void LiveInterval::removeValNo(VNInfo *ValNo) {
+void LiveRange::removeValNo(VNInfo *ValNo) {
   if (empty()) return;
-  Ranges::iterator I = ranges.end();
-  Ranges::iterator E = ranges.begin();
+  iterator I = end();
+  iterator E = begin();
   do {
     --I;
     if (I->valno == ValNo)
-      ranges.erase(I);
+      segments.erase(I);
   } while (I != E);
   // Now that ValNo is dead, remove it.
   markValNoForDeletion(ValNo);
 }
 
-/// join - Join two live intervals (this, and other) together.  This applies
-/// mappings to the value numbers in the LHS/RHS intervals as specified.  If
-/// the intervals are not joinable, this aborts.
-void LiveInterval::join(LiveInterval &Other,
-                        const int *LHSValNoAssignments,
-                        const int *RHSValNoAssignments,
-                        SmallVector<VNInfo*, 16> &NewVNInfo,
-                        MachineRegisterInfo *MRI) {
+void LiveRange::join(LiveRange &Other,
+                     const int *LHSValNoAssignments,
+                     const int *RHSValNoAssignments,
+                     SmallVectorImpl<VNInfo *> &NewVNInfo) {
   verify();
 
-  // Determine if any of our live range values are mapped.  This is uncommon, so
-  // we want to avoid the interval scan if not.
+  // Determine if any of our values are mapped.  This is uncommon, so we want
+  // to avoid the range scan if not.
   bool MustMapCurValNos = false;
   unsigned NumVals = getNumValNums();
   unsigned NumNewVals = NewVNInfo.size();
@@ -433,8 +427,7 @@ void LiveInterval::join(LiveInterval &Other,
     }
   }
 
-  // If we have to apply a mapping to our base interval assignment, rewrite it
-  // now.
+  // If we have to apply a mapping to our base range assignment, rewrite it now.
   if (MustMapCurValNos && !empty()) {
     // Map the first live range.
 
@@ -445,12 +438,12 @@ void LiveInterval::join(LiveInterval &Other,
       assert(nextValNo != 0 && "Huh?");
 
       // If this live range has the same value # as its immediate predecessor,
-      // and if they are neighbors, remove one LiveRange.  This happens when we
+      // and if they are neighbors, remove one Segment.  This happens when we
       // have [0,4:0)[4,7:1) and map 0/1 onto the same value #.
       if (OutIt->valno == nextValNo && OutIt->end == I->start) {
         OutIt->end = I->end;
       } else {
-        // Didn't merge. Move OutIt to the next interval,
+        // Didn't merge. Move OutIt to the next segment,
         ++OutIt;
         OutIt->valno = nextValNo;
         if (OutIt != I) {
@@ -459,9 +452,9 @@ void LiveInterval::join(LiveInterval &Other,
         }
       }
     }
-    // If we merge some live ranges, chop off the end.
+    // If we merge some segments, chop off the end.
     ++OutIt;
-    ranges.erase(OutIt, end());
+    segments.erase(OutIt, end());
   }
 
   // Rewrite Other values before changing the VNInfo ids.
@@ -472,7 +465,7 @@ void LiveInterval::join(LiveInterval &Other,
     I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]];
 
   // Update val# info. Renumber them and make sure they all belong to this
-  // LiveInterval now. Also remove dead val#'s.
+  // LiveRange now. Also remove dead val#'s.
   unsigned NumValNos = 0;
   for (unsigned i = 0; i < NumNewVals; ++i) {
     VNInfo *VNI = NewVNInfo[i];
@@ -487,31 +480,31 @@ void LiveInterval::join(LiveInterval &Other,
   if (NumNewVals < NumVals)
     valnos.resize(NumNewVals);  // shrinkify
 
-  // Okay, now insert the RHS live ranges into the LHS.
+  // Okay, now insert the RHS live segments into the LHS.
   LiveRangeUpdater Updater(this);
   for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
     Updater.add(*I);
 }
 
-/// MergeRangesInAsValue - Merge all of the intervals in RHS into this live
-/// interval as the specified value number.  The LiveRanges in RHS are
-/// allowed to overlap with LiveRanges in the current interval, but only if
-/// the overlapping LiveRanges have the specified value number.
-void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS,
-                                        VNInfo *LHSValNo) {
+/// Merge all of the segments in RHS into this live range as the specified
+/// value number.  The segments in RHS are allowed to overlap with segments in
+/// the current range, but only if the overlapping segments have the
+/// specified value number.
+void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
+                                       VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
   for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
     Updater.add(I->start, I->end, LHSValNo);
 }
 
-/// MergeValueInAsValue - Merge all of the live ranges of a specific val#
-/// in RHS into this live interval as the specified value number.
-/// The LiveRanges in RHS are allowed to overlap with LiveRanges in the
-/// current interval, it will replace the value numbers of the overlaped
-/// live ranges with the specified value number.
-void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
-                                       const VNInfo *RHSValNo,
-                                       VNInfo *LHSValNo) {
+/// MergeValueInAsValue - Merge all of the live segments of a specific val#
+/// in RHS into this live range as the specified value number.
+/// The segments in RHS are allowed to overlap with segments in the
+/// current range, it will replace the value numbers of the overlaped
+/// segments with the specified value number.
+void LiveRange::MergeValueInAsValue(const LiveRange &RHS,
+                                    const VNInfo *RHSValNo,
+                                    VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
   for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
     if (I->valno == RHSValNo)
@@ -520,9 +513,9 @@ void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS,
 
 /// MergeValueNumberInto - This method is called when two value nubmers
 /// are found to be equivalent.  This eliminates V1, replacing all
-/// LiveRanges with the V1 value number with the V2 value number.  This can
+/// segments with the V1 value number with the V2 value number.  This can
 /// cause merging of V1/V2 values numbers and compaction of the value space.
-VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+VNInfo *LiveRange::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
   assert(V1 != V2 && "Identical value#'s are always equivalent!");
 
   // This code actually merges the (numerically) larger value number into the
@@ -536,37 +529,37 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
     std::swap(V1, V2);
   }
 
-  // Merge V1 live ranges into V2.
+  // Merge V1 segments into V2.
   for (iterator I = begin(); I != end(); ) {
-    iterator LR = I++;
-    if (LR->valno != V1) continue;  // Not a V1 LiveRange.
+    iterator S = I++;
+    if (S->valno != V1) continue;  // Not a V1 Segment.
 
     // Okay, we found a V1 live range.  If it had a previous, touching, V2 live
     // range, extend it.
-    if (LR != begin()) {
-      iterator Prev = LR-1;
-      if (Prev->valno == V2 && Prev->end == LR->start) {
-        Prev->end = LR->end;
+    if (S != begin()) {
+      iterator Prev = S-1;
+      if (Prev->valno == V2 && Prev->end == S->start) {
+        Prev->end = S->end;
 
         // Erase this live-range.
-        ranges.erase(LR);
+        segments.erase(S);
         I = Prev+1;
-        LR = Prev;
+        S = Prev;
       }
     }
 
     // Okay, now we have a V1 or V2 live range that is maximally merged forward.
     // Ensure that it is a V2 live-range.
-    LR->valno = V2;
+    S->valno = V2;
 
-    // If we can merge it into later V2 live ranges, do so now.  We ignore any
-    // following V1 live ranges, as they will be merged in subsequent iterations
+    // If we can merge it into later V2 segments, do so now.  We ignore any
+    // following V1 segments, as they will be merged in subsequent iterations
     // of the loop.
     if (I != end()) {
-      if (I->start == LR->end && I->valno == V2) {
-        LR->end = I->end;
-        ranges.erase(I);
-        I = LR+1;
+      if (I->start == S->end && I->valno == V2) {
+        S->end = I->end;
+        segments.erase(I);
+        I = S+1;
       }
     }
   }
@@ -584,22 +577,21 @@ unsigned LiveInterval::getSize() const {
   return Sum;
 }
 
-raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
-  return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
+raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange::Segment &S) {
+  return os << '[' << S.start << ',' << S.end << ':' << S.valno->id << ")";
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LiveRange::dump() const {
+void LiveRange::Segment::dump() const {
   dbgs() << *this << "\n";
 }
 #endif
 
-void LiveInterval::print(raw_ostream &OS) const {
+void LiveRange::print(raw_ostream &OS) const {
   if (empty())
     OS << "EMPTY";
   else {
-    for (LiveInterval::Ranges::const_iterator I = ranges.begin(),
-           E = ranges.end(); I != E; ++I) {
+    for (const_iterator I = begin(), E = end(); I != E; ++I) {
       OS << *I;
       assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
     }
@@ -625,19 +617,29 @@ void LiveInterval::print(raw_ostream &OS) const {
   }
 }
 
+void LiveInterval::print(raw_ostream &OS) const {
+  OS << PrintReg(reg) << ' ';
+  super::print(OS);
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveRange::dump() const {
+  dbgs() << *this << "\n";
+}
+
 void LiveInterval::dump() const {
   dbgs() << *this << "\n";
 }
 #endif
 
 #ifndef NDEBUG
-void LiveInterval::verify() const {
+void LiveRange::verify() const {
   for (const_iterator I = begin(), E = end(); I != E; ++I) {
     assert(I->start.isValid());
     assert(I->end.isValid());
     assert(I->start < I->end);
     assert(I->valno != 0);
+    assert(I->valno->id < valnos.size());
     assert(I->valno == valnos[I->valno->id]);
     if (llvm::next(I) != E) {
       assert(I->end <= llvm::next(I)->start);
@@ -649,10 +651,6 @@ void LiveInterval::verify() const {
 #endif
 
 
-void LiveRange::print(raw_ostream &os) const {
-  os << *this;
-}
-
 //===----------------------------------------------------------------------===//
 //                           LiveRangeUpdater class
 //===----------------------------------------------------------------------===//
@@ -665,11 +663,11 @@ void LiveRange::print(raw_ostream &os) const {
 //
 // Otherwise, segments are kept in three separate areas:
 //
-// 1. [begin; WriteI) at the front of LI.
-// 2. [ReadI; end) at the back of LI.
+// 1. [begin; WriteI) at the front of LR.
+// 2. [ReadI; end) at the back of LR.
 // 3. Spills.
 //
-// - LI.begin() <= WriteI <= ReadI <= LI.end().
+// - LR.begin() <= WriteI <= ReadI <= LR.end().
 // - Segments in all three areas are fully ordered and coalesced.
 // - Segments in area 1 precede and can't coalesce with segments in area 2.
 // - Segments in Spills precede and can't coalesce with segments in area 2.
@@ -684,23 +682,23 @@ void LiveRange::print(raw_ostream &os) const {
 
 void LiveRangeUpdater::print(raw_ostream &OS) const {
   if (!isDirty()) {
-    if (LI)
-      OS << "Clean " << PrintReg(LI->reg) << " updater: " << *LI << '\n';
+    if (LR)
+      OS << "Clean updater: " << *LR << '\n';
     else
       OS << "Null updater.\n";
     return;
   }
-  assert(LI && "Can't have null LI in dirty updater.");
-  OS << PrintReg(LI->reg) << " updater with gap = " << (ReadI - WriteI)
+  assert(LR && "Can't have null LR in dirty updater.");
+  OS << " updater with gap = " << (ReadI - WriteI)
      << ", last start = " << LastStart
      << ":\n  Area 1:";
-  for (LiveInterval::const_iterator I = LI->begin(); I != WriteI; ++I)
+  for (LiveRange::const_iterator I = LR->begin(); I != WriteI; ++I)
     OS << ' ' << *I;
   OS << "\n  Spills:";
   for (unsigned I = 0, E = Spills.size(); I != E; ++I)
     OS << ' ' << Spills[I];
   OS << "\n  Area 2:";
-  for (LiveInterval::const_iterator I = ReadI, E = LI->end(); I != E; ++I)
+  for (LiveRange::const_iterator I = ReadI, E = LR->end(); I != E; ++I)
     OS << ' ' << *I;
   OS << '\n';
 }
@@ -711,8 +709,9 @@ void LiveRangeUpdater::dump() const
 }
 
 // Determine if A and B should be coalesced.
-static inline bool coalescable(const LiveRange &A, const LiveRange &B) {
-  assert(A.start <= B.start && "Unordered live ranges.");
+static inline bool coalescable(const LiveRange::Segment &A,
+                               const LiveRange::Segment &B) {
+  assert(A.start <= B.start && "Unordered live segments.");
   if (A.end == B.start)
     return A.valno == B.valno;
   if (A.end < B.start)
@@ -721,8 +720,8 @@ static inline bool coalescable(const LiveRange &A, const LiveRange &B) {
   return true;
 }
 
-void LiveRangeUpdater::add(LiveRange Seg) {
-  assert(LI && "Cannot add to a null destination");
+void LiveRangeUpdater::add(LiveRange::Segment Seg) {
+  assert(LR && "Cannot add to a null destination");
 
   // Flush the state if Start moves backwards.
   if (!LastStart.isValid() || LastStart > Seg.start) {
@@ -730,21 +729,21 @@ void LiveRangeUpdater::add(LiveRange Seg) {
       flush();
     // This brings us to an uninitialized state. Reinitialize.
     assert(Spills.empty() && "Leftover spilled segments");
-    WriteI = ReadI = LI->begin();
+    WriteI = ReadI = LR->begin();
   }
 
   // Remember start for next time.
   LastStart = Seg.start;
 
   // Advance ReadI until it ends after Seg.start.
-  LiveInterval::iterator E = LI->end();
+  LiveRange::iterator E = LR->end();
   if (ReadI != E && ReadI->end <= Seg.start) {
     // First try to close the gap between WriteI and ReadI with spills.
     if (ReadI != WriteI)
       mergeSpills();
     // Then advance ReadI.
     if (ReadI == WriteI)
-      ReadI = WriteI = LI->find(Seg.start);
+      ReadI = WriteI = LR->find(Seg.start);
     else
       while (ReadI != E && ReadI->end <= Seg.start)
         *WriteI++ = *ReadI++;
@@ -777,7 +776,7 @@ void LiveRangeUpdater::add(LiveRange Seg) {
   }
 
   // Try coalescing Seg into WriteI[-1].
-  if (WriteI != LI->begin() && coalescable(WriteI[-1], Seg)) {
+  if (WriteI != LR->begin() && coalescable(WriteI[-1], Seg)) {
     WriteI[-1].end = std::max(WriteI[-1].end, Seg.end);
     return;
   }
@@ -788,10 +787,10 @@ void LiveRangeUpdater::add(LiveRange Seg) {
     return;
   }
 
-  // Finally, append to LI or Spills.
+  // Finally, append to LR or Spills.
   if (WriteI == E) {
-    LI->ranges.push_back(Seg);
-    WriteI = ReadI = LI->ranges.end();
+    LR->segments.push_back(Seg);
+    WriteI = ReadI = LR->end();
   } else
     Spills.push_back(Seg);
 }
@@ -802,10 +801,10 @@ void LiveRangeUpdater::mergeSpills() {
   // Perform a backwards merge of Spills and [SpillI;WriteI).
   size_t GapSize = ReadI - WriteI;
   size_t NumMoved = std::min(Spills.size(), GapSize);
-  LiveInterval::iterator Src = WriteI;
-  LiveInterval::iterator Dst = Src + NumMoved;
-  LiveInterval::iterator SpillSrc = Spills.end();
-  LiveInterval::iterator B = LI->begin();
+  LiveRange::iterator Src = WriteI;
+  LiveRange::iterator Dst = Src + NumMoved;
+  LiveRange::iterator SpillSrc = Spills.end();
+  LiveRange::iterator B = LR->begin();
 
   // This is the new WriteI position after merging spills.
   WriteI = Dst;
@@ -827,12 +826,12 @@ void LiveRangeUpdater::flush() {
   // Clear the dirty state.
   LastStart = SlotIndex();
 
-  assert(LI && "Cannot add to a null destination");
+  assert(LR && "Cannot add to a null destination");
 
   // Nothing to merge?
   if (Spills.empty()) {
-    LI->ranges.erase(WriteI, ReadI);
-    LI->verify();
+    LR->segments.erase(WriteI, ReadI);
+    LR->verify();
     return;
   }
 
@@ -840,17 +839,17 @@ void LiveRangeUpdater::flush() {
   size_t GapSize = ReadI - WriteI;
   if (GapSize < Spills.size()) {
     // The gap is too small. Make some room.
-    size_t WritePos = WriteI - LI->begin();
-    LI->ranges.insert(ReadI, Spills.size() - GapSize, LiveRange());
+    size_t WritePos = WriteI - LR->begin();
+    LR->segments.insert(ReadI, Spills.size() - GapSize, LiveRange::Segment());
     // This also invalidated ReadI, but it is recomputed below.
-    WriteI = LI->ranges.begin() + WritePos;
+    WriteI = LR->begin() + WritePos;
   } else {
     // Shrink the gap if necessary.
-    LI->ranges.erase(WriteI + Spills.size(), ReadI);
+    LR->segments.erase(WriteI + Spills.size(), ReadI);
   }
   ReadI = WriteI + Spills.size();
   mergeSpills();
-  LI->verify();
+  LR->verify();
 }
 
 unsigned ConnectedVNInfoEqClasses::Classify(const LiveInterval *LI) {
@@ -909,8 +908,16 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
     MachineOperand &MO = RI.getOperand();
     MachineInstr *MI = MO.getParent();
     ++RI;
-    // DBG_VALUE instructions should have been eliminated earlier.
-    LiveRangeQuery LRQ(LI, LIS.getInstructionIndex(MI));
+    // DBG_VALUE instructions don't have slot indexes, so get the index of the
+    // instruction before them.
+    // Normally, DBG_VALUE instructions are removed before this function is
+    // called, but it is not a requirement.
+    SlotIndex Idx;
+    if (MI->isDebugValue())
+      Idx = LIS.getSlotIndexes()->getIndexBefore(MI);
+    else
+      Idx = LIS.getInstructionIndex(MI);
+    LiveQueryResult LRQ = LI.Query(Idx);
     const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
     // In the case of an <undef> use that isn't tied to any def, VNI will be
     // NULL. If the use is tied to a def, VNI will be the defined value.
@@ -927,11 +934,11 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[],
     if (unsigned eq = EqClass[I->valno->id]) {
       assert((LIV[eq]->empty() || LIV[eq]->expiredAt(I->start)) &&
              "New intervals should be empty");
-      LIV[eq]->ranges.push_back(*I);
+      LIV[eq]->segments.push_back(*I);
     } else
       *J++ = *I;
   }
-  LI.ranges.erase(J, E);
+  LI.segments.erase(J, E);
 
   // Transfer VNInfos to their new owners and renumber them.
   unsigned j = 0, e = LI.getNumValNums();
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index f1b8394..e1c3217 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Support/BlockFrequency.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -51,6 +52,14 @@ INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_END(LiveIntervals, "liveintervals",
                 "Live Interval Analysis", false, false)
 
+#ifndef NDEBUG
+static cl::opt<bool> EnablePrecomputePhysRegs(
+  "precompute-phys-liveness", cl::Hidden,
+  cl::desc("Eagerly compute live intervals for all physreg units."));
+#else
+static bool EnablePrecomputePhysRegs = false;
+#endif // NDEBUG
+
 void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addRequired<AliasAnalysis>();
@@ -86,15 +95,15 @@ void LiveIntervals::releaseMemory() {
   RegMaskBits.clear();
   RegMaskBlocks.clear();
 
-  for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
-    delete RegUnitIntervals[i];
-  RegUnitIntervals.clear();
+  for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+    delete RegUnitRanges[i];
+  RegUnitRanges.clear();
 
   // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
   VNInfoAllocator.Reset();
 }
 
-/// runOnMachineFunction - Register allocate the whole function
+/// runOnMachineFunction - calculates LiveIntervals
 ///
 bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
   MF = &fn;
@@ -115,6 +124,12 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
   computeRegMasks();
   computeLiveInRegUnits();
 
+  if (EnablePrecomputePhysRegs) {
+    // For stress testing, precompute live ranges of all physical register
+    // units, including reserved registers.
+    for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
+      getRegUnit(i);
+  }
   DEBUG(dump());
   return true;
 }
@@ -124,15 +139,15 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
   OS << "********** INTERVALS **********\n";
 
   // Dump the regunits.
-  for (unsigned i = 0, e = RegUnitIntervals.size(); i != e; ++i)
-    if (LiveInterval *LI = RegUnitIntervals[i])
-      OS << PrintRegUnit(i, TRI) << " = " << *LI << '\n';
+  for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+    if (LiveRange *LR = RegUnitRanges[i])
+      OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n';
 
   // Dump the virtregs.
   for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (hasInterval(Reg))
-      OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
+      OS << getInterval(Reg) << '\n';
   }
 
   OS << "RegMasks:";
@@ -155,16 +170,17 @@ void LiveIntervals::dumpInstrs() const {
 #endif
 
 LiveInterval* LiveIntervals::createInterval(unsigned reg) {
-  float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? HUGE_VALF : 0.0F;
+  float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ?
+                  llvm::huge_valf : 0.0F;
   return new LiveInterval(reg, Weight);
 }
 
 
 /// computeVirtRegInterval - Compute the live interval of a virtual register,
 /// based on defs and uses.
-void LiveIntervals::computeVirtRegInterval(LiveInterval *LI) {
+void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
   assert(LRCalc && "LRCalc not initialized.");
-  assert(LI->empty() && "Should only compute empty intervals.");
+  assert(LI.empty() && "Should only compute empty intervals.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
   LRCalc->createDeadDefs(LI);
   LRCalc->extendToUses(LI);
@@ -175,9 +191,7 @@ void LiveIntervals::computeVirtRegs() {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (MRI->reg_nodbg_empty(Reg))
       continue;
-    LiveInterval *LI = createInterval(Reg);
-    VirtRegIntervals[Reg] = LI;
-    computeVirtRegInterval(LI);
+    createAndComputeVirtRegInterval(Reg);
   }
 }
 
@@ -214,12 +228,10 @@ void LiveIntervals::computeRegMasks() {
 // interference.
 //
 
-/// computeRegUnitInterval - Compute the live interval of a register unit, based
-/// on the uses and defs of aliasing registers.  The interval should be empty,
+/// computeRegUnitInterval - Compute the live range of a register unit, based
+/// on the uses and defs of aliasing registers.  The range should be empty,
 /// or contain only dead phi-defs from ABI blocks.
-void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
-  unsigned Unit = LI->reg;
-
+void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
   assert(LRCalc && "LRCalc not initialized.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
 
@@ -229,25 +241,21 @@ void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
   // idempotent. It is very rare for a register unit to have multiple roots, so
   // uniquing super-registers is probably not worthwhile.
   for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
-    unsigned Root = *Roots;
-    if (!MRI->reg_empty(Root))
-      LRCalc->createDeadDefs(LI, Root);
-    for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) {
+    for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
+         Supers.isValid(); ++Supers) {
       if (!MRI->reg_empty(*Supers))
-        LRCalc->createDeadDefs(LI, *Supers);
+        LRCalc->createDeadDefs(LR, *Supers);
     }
   }
 
-  // Now extend LI to reach all uses.
+  // Now extend LR to reach all uses.
   // Ignore uses of reserved registers. We only track defs of those.
   for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
-    unsigned Root = *Roots;
-    if (!MRI->isReserved(Root) && !MRI->reg_empty(Root))
-      LRCalc->extendToUses(LI, Root);
-    for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) {
+    for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
+         Supers.isValid(); ++Supers) {
       unsigned Reg = *Supers;
       if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
-        LRCalc->extendToUses(LI, Reg);
+        LRCalc->extendToUses(LR, Reg);
     }
   }
 }
@@ -258,11 +266,11 @@ void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
 /// without a corresponding def when entering the entry block or a landing pad.
 ///
 void LiveIntervals::computeLiveInRegUnits() {
-  RegUnitIntervals.resize(TRI->getNumRegUnits());
+  RegUnitRanges.resize(TRI->getNumRegUnits());
   DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
 
-  // Keep track of the intervals allocated.
-  SmallVector<LiveInterval*, 8> NewIntvs;
+  // Keep track of the live range sets allocated.
+  SmallVector<unsigned, 8> NewRanges;
 
   // Check all basic blocks for live-ins.
   for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
@@ -280,23 +288,25 @@ void LiveIntervals::computeLiveInRegUnits() {
          LIE = MBB->livein_end(); LII != LIE; ++LII) {
       for (MCRegUnitIterator Units(*LII, TRI); Units.isValid(); ++Units) {
         unsigned Unit = *Units;
-        LiveInterval *Intv = RegUnitIntervals[Unit];
-        if (!Intv) {
-          Intv = RegUnitIntervals[Unit] = new LiveInterval(Unit, HUGE_VALF);
-          NewIntvs.push_back(Intv);
+        LiveRange *LR = RegUnitRanges[Unit];
+        if (!LR) {
+          LR = RegUnitRanges[Unit] = new LiveRange();
+          NewRanges.push_back(Unit);
         }
-        VNInfo *VNI = Intv->createDeadDef(Begin, getVNInfoAllocator());
+        VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
         (void)VNI;
         DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id);
       }
     }
     DEBUG(dbgs() << '\n');
   }
-  DEBUG(dbgs() << "Created " << NewIntvs.size() << " new intervals.\n");
+  DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
 
-  // Compute the 'normal' part of the intervals.
-  for (unsigned i = 0, e = NewIntvs.size(); i != e; ++i)
-    computeRegUnitInterval(NewIntvs[i]);
+  // Compute the 'normal' part of the ranges.
+  for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) {
+    unsigned Unit = NewRanges[i];
+    computeRegUnitRange(*RegUnitRanges[Unit], Unit);
+  }
 }
 
 
@@ -320,7 +330,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
       continue;
     SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
-    LiveRangeQuery LRQ(*li, Idx);
+    LiveQueryResult LRQ = li->Query(Idx);
     VNInfo *VNI = LRQ.valueIn();
     if (!VNI) {
       // This shouldn't happen: readsVirtualRegister returns true, but there is
@@ -339,14 +349,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     WorkList.push_back(std::make_pair(Idx, VNI));
   }
 
-  // Create a new live interval with only minimal live segments per def.
-  LiveInterval NewLI(li->reg, 0);
+  // Create new live ranges with only minimal live segments per def.
+  LiveRange NewLR;
   for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
        I != E; ++I) {
     VNInfo *VNI = *I;
     if (VNI->isUnused())
       continue;
-    NewLI.addRange(LiveRange(VNI->def, VNI->def.getDeadSlot(), VNI));
+    NewLR.addSegment(LiveRange::Segment(VNI->def, VNI->def.getDeadSlot(), VNI));
   }
 
   // Keep track of the PHIs that are in use.
@@ -361,7 +371,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     SlotIndex BlockStart = getMBBStartIdx(MBB);
 
     // Extend the live range for VNI to be live at Idx.
-    if (VNInfo *ExtVNI = NewLI.extendInBlock(BlockStart, Idx)) {
+    if (VNInfo *ExtVNI = NewLR.extendInBlock(BlockStart, Idx)) {
       (void)ExtVNI;
       assert(ExtVNI == VNI && "Unexpected existing value number");
       // Is this a PHIDef we haven't seen before?
@@ -382,7 +392,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
 
     // VNI is live-in to MBB.
     DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
-    NewLI.addRange(LiveRange(BlockStart, Idx, VNI));
+    NewLR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
 
     // Make sure VNI is live-out from the predecessors.
     for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
@@ -403,14 +413,14 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     VNInfo *VNI = *I;
     if (VNI->isUnused())
       continue;
-    LiveInterval::iterator LII = NewLI.FindLiveRangeContaining(VNI->def);
-    assert(LII != NewLI.end() && "Missing live range for PHI");
-    if (LII->end != VNI->def.getDeadSlot())
+    LiveRange::iterator LRI = NewLR.FindSegmentContaining(VNI->def);
+    assert(LRI != NewLR.end() && "Missing segment for PHI");
+    if (LRI->end != VNI->def.getDeadSlot())
       continue;
     if (VNI->isPHIDef()) {
       // This is a dead PHI. Remove it.
       VNI->markUnused();
-      NewLI.removeRange(*LII);
+      NewLR.removeSegment(LRI->start, LRI->end);
       DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
       CanSeparate = true;
     } else {
@@ -425,23 +435,23 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
     }
   }
 
-  // Move the trimmed ranges back.
-  li->ranges.swap(NewLI.ranges);
+  // Move the trimmed segments back.
+  li->segments.swap(NewLR.segments);
   DEBUG(dbgs() << "Shrunk: " << *li << '\n');
   return CanSeparate;
 }
 
-void LiveIntervals::extendToIndices(LiveInterval *LI,
+void LiveIntervals::extendToIndices(LiveRange &LR,
                                     ArrayRef<SlotIndex> Indices) {
   assert(LRCalc && "LRCalc not initialized.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
   for (unsigned i = 0, e = Indices.size(); i != e; ++i)
-    LRCalc->extend(LI, Indices[i]);
+    LRCalc->extend(LR, Indices[i]);
 }
 
 void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
                                SmallVectorImpl<SlotIndex> *EndPoints) {
-  LiveRangeQuery LRQ(*LI, Kill);
+  LiveQueryResult LRQ = LI->Query(Kill);
   VNInfo *VNI = LRQ.valueOut();
   if (!VNI)
     return;
@@ -452,13 +462,13 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
   // If VNI isn't live out from KillMBB, the value is trivially pruned.
   if (LRQ.endPoint() < MBBEnd) {
-    LI->removeRange(Kill, LRQ.endPoint());
+    LI->removeSegment(Kill, LRQ.endPoint());
     if (EndPoints) EndPoints->push_back(LRQ.endPoint());
     return;
   }
 
   // VNI is live out of KillMBB.
-  LI->removeRange(Kill, MBBEnd);
+  LI->removeSegment(Kill, MBBEnd);
   if (EndPoints) EndPoints->push_back(MBBEnd);
 
   // Find all blocks that are reachable from KillMBB without leaving VNI's live
@@ -476,23 +486,23 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
       // Check if VNI is live in to MBB.
       tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
-      LiveRangeQuery LRQ(*LI, MBBStart);
+      LiveQueryResult LRQ = LI->Query(MBBStart);
       if (LRQ.valueIn() != VNI) {
-        // This block isn't part of the VNI live range. Prune the search.
+        // This block isn't part of the VNI segment. Prune the search.
         I.skipChildren();
         continue;
       }
 
       // Prune the search if VNI is killed in MBB.
       if (LRQ.endPoint() < MBBEnd) {
-        LI->removeRange(MBBStart, LRQ.endPoint());
+        LI->removeSegment(MBBStart, LRQ.endPoint());
         if (EndPoints) EndPoints->push_back(LRQ.endPoint());
         I.skipChildren();
         continue;
       }
 
       // VNI is live through MBB.
-      LI->removeRange(MBBStart, MBBEnd);
+      LI->removeSegment(MBBStart, MBBEnd);
       if (EndPoints) EndPoints->push_back(MBBEnd);
       ++I;
     }
@@ -505,7 +515,7 @@ void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
 
 void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
   // Keep track of regunit ranges.
-  SmallVector<std::pair<LiveInterval*, LiveInterval::iterator>, 8> RU;
+  SmallVector<std::pair<LiveRange*, LiveRange::iterator>, 8> RU;
 
   for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
@@ -520,13 +530,14 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
     RU.clear();
     for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
          ++Units) {
-      LiveInterval *RUInt = &getRegUnit(*Units);
-      if (RUInt->empty())
+      LiveRange &RURanges = getRegUnit(*Units);
+      if (RURanges.empty())
         continue;
-      RU.push_back(std::make_pair(RUInt, RUInt->find(LI->begin()->end)));
+      RU.push_back(std::make_pair(&RURanges, RURanges.find(LI->begin()->end)));
     }
 
-    // Every instruction that kills Reg corresponds to a live range end point.
+    // Every instruction that kills Reg corresponds to a segment range end
+    // point.
     for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
          ++RI) {
       // A block index indicates an MBB edge.
@@ -536,7 +547,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
       if (!MI)
         continue;
 
-      // Check if any of the reguints are live beyond the end of RI. That could
+      // Check if any of the regunits are live beyond the end of RI. That could
       // happen when a physreg is defined as a copy of a virtreg:
       //
       //   %EAX = COPY %vreg5
@@ -546,12 +557,12 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
       // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
       bool CancelKill = false;
       for (unsigned u = 0, e = RU.size(); u != e; ++u) {
-        LiveInterval *RInt = RU[u].first;
-        LiveInterval::iterator &I = RU[u].second;
-        if (I == RInt->end())
+        LiveRange &RRanges = *RU[u].first;
+        LiveRange::iterator &I = RU[u].second;
+        if (I == RRanges.end())
           continue;
-        I = RInt->advanceTo(I, RI->end);
-        if (I == RInt->end() || I->start >= RI->end)
+        I = RRanges.advanceTo(I, RI->end);
+        if (I == RRanges.end() || I->start >= RI->end)
           continue;
         // I is overlapping RI.
         CancelKill = true;
@@ -609,35 +620,23 @@ LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
 }
 
 float
-LiveIntervals::getSpillWeight(bool isDef, bool isUse, unsigned loopDepth) {
-  // Limit the loop depth ridiculousness.
-  if (loopDepth > 200)
-    loopDepth = 200;
-
-  // The loop depth is used to roughly estimate the number of times the
-  // instruction is executed. Something like 10^d is simple, but will quickly
-  // overflow a float. This expression behaves like 10^d for small d, but is
-  // more tempered for large d. At d=200 we get 6.7e33 which leaves a bit of
-  // headroom before overflow.
-  // By the way, powf() might be unavailable here. For consistency,
-  // We may take pow(double,double).
-  float lc = std::pow(1 + (100.0 / (loopDepth + 10)), (double)loopDepth);
-
-  return (isDef + isUse) * lc;
+LiveIntervals::getSpillWeight(bool isDef, bool isUse, BlockFrequency freq) {
+  const float Scale = 1.0f / BlockFrequency::getEntryFrequency();
+  return (isDef + isUse) * (freq.getFrequency() * Scale);
 }
 
-LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg,
-                                                  MachineInstr* startInst) {
-  LiveInterval& Interval = getOrCreateInterval(reg);
+LiveRange::Segment
+LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr* startInst) {
+  LiveInterval& Interval = createEmptyInterval(reg);
   VNInfo* VN = Interval.getNextValue(
     SlotIndex(getInstructionIndex(startInst).getRegSlot()),
     getVNInfoAllocator());
-  LiveRange LR(
+  LiveRange::Segment S(
      SlotIndex(getInstructionIndex(startInst).getRegSlot()),
      getMBBEndIdx(startInst->getParent()), VN);
-  Interval.addRange(LR);
+  Interval.addSegment(S);
 
-  return LR;
+  return S;
 }
 
 
@@ -712,7 +711,7 @@ private:
   const TargetRegisterInfo& TRI;
   SlotIndex OldIdx;
   SlotIndex NewIdx;
-  SmallPtrSet<LiveInterval*, 8> Updated;
+  SmallPtrSet<LiveRange*, 8> Updated;
   bool UpdateFlags;
 
 public:
@@ -726,7 +725,7 @@ public:
   // physregs, even those that aren't needed for regalloc, in order to update
   // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
   // flags, and postRA passes will use a live register utility instead.
-  LiveInterval *getRegUnitLI(unsigned Unit) {
+  LiveRange *getRegUnitLI(unsigned Unit) {
     if (UpdateFlags)
       return &LIS.getRegUnit(Unit);
     return LIS.getCachedRegUnit(Unit);
@@ -751,15 +750,16 @@ public:
       if (!Reg)
         continue;
       if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        updateRange(LIS.getInterval(Reg));
+        LiveInterval &LI = LIS.getInterval(Reg);
+        updateRange(LI, Reg);
         continue;
       }
 
       // For physregs, only update the regunits that actually have a
       // precomputed live range.
       for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
-        if (LiveInterval *LI = getRegUnitLI(*Units))
-          updateRange(*LI);
+        if (LiveRange *LR = getRegUnitLI(*Units))
+          updateRange(*LR, *Units);
     }
     if (hasRegMask)
       updateRegMaskSlots();
@@ -768,26 +768,26 @@ public:
 private:
   /// Update a single live range, assuming an instruction has been moved from
   /// OldIdx to NewIdx.
-  void updateRange(LiveInterval &LI) {
-    if (!Updated.insert(&LI))
+  void updateRange(LiveRange &LR, unsigned Reg) {
+    if (!Updated.insert(&LR))
       return;
     DEBUG({
       dbgs() << "     ";
-      if (TargetRegisterInfo::isVirtualRegister(LI.reg))
-        dbgs() << PrintReg(LI.reg);
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        dbgs() << PrintReg(Reg);
       else
-        dbgs() << PrintRegUnit(LI.reg, &TRI);
-      dbgs() << ":\t" << LI << '\n';
+        dbgs() << PrintRegUnit(Reg, &TRI);
+      dbgs() << ":\t" << LR << '\n';
     });
     if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
-      handleMoveDown(LI);
+      handleMoveDown(LR);
     else
-      handleMoveUp(LI);
-    DEBUG(dbgs() << "        -->\t" << LI << '\n');
-    LI.verify();
+      handleMoveUp(LR, Reg);
+    DEBUG(dbgs() << "        -->\t" << LR << '\n');
+    LR.verify();
   }
 
-  /// Update LI to reflect an instruction has been moved downwards from OldIdx
+  /// Update LR to reflect an instruction has been moved downwards from OldIdx
   /// to NewIdx.
   ///
   /// 1. Live def at OldIdx:
@@ -801,17 +801,17 @@ private:
   ///    Move def to NewIdx, possibly across another live value.
   ///
   /// 4. Def at OldIdx AND at NewIdx:
-  ///    Remove live range [OldIdx;NewIdx) and value defined at OldIdx.
+  ///    Remove segment [OldIdx;NewIdx) and value defined at OldIdx.
   ///    (Happens when bundling multiple defs together).
   ///
   /// 5. Value read at OldIdx, killed before NewIdx:
   ///    Extend kill to NewIdx.
   ///
-  void handleMoveDown(LiveInterval &LI) {
+  void handleMoveDown(LiveRange &LR) {
     // First look for a kill at OldIdx.
-    LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
-    LiveInterval::iterator E = LI.end();
-    // Is LI even live at OldIdx?
+    LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
+    LiveRange::iterator E = LR.end();
+    // Is LR even live at OldIdx?
     if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
       return;
 
@@ -828,7 +828,7 @@ private:
         for (MIBundleOperands MO(KillMI); MO.isValid(); ++MO)
           if (MO->isReg() && MO->isUse())
             MO->setIsKill(false);
-      // Adjust I->end to reach NewIdx. This may temporarily make LI invalid by
+      // Adjust I->end to reach NewIdx. This may temporarily make LR invalid by
       // overlapping ranges. Case 5 above.
       I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
       // If this was a kill, there may also be a def. Otherwise we're done.
@@ -857,24 +857,25 @@ private:
     assert((I->end == OldIdx.getDeadSlot() ||
             SlotIndex::isSameInstr(I->end, NewIdx)) &&
             "Cannot move def below kill");
-    LiveInterval::iterator NewI = LI.advanceTo(I, NewIdx.getRegSlot());
+    LiveRange::iterator NewI = LR.advanceTo(I, NewIdx.getRegSlot());
     if (NewI != E && SlotIndex::isSameInstr(NewI->start, NewIdx)) {
       // There is an existing def at NewIdx, case 4 above. The def at OldIdx is
       // coalesced into that value.
       assert(NewI->valno != DefVNI && "Multiple defs of value?");
-      LI.removeValNo(DefVNI);
+      LR.removeValNo(DefVNI);
       return;
     }
     // There was no existing def at NewIdx. Turn *I into a dead def at NewIdx.
-    // If the def at OldIdx was dead, we allow it to be moved across other LI
+    // If the def at OldIdx was dead, we allow it to be moved across other LR
     // values. The new range should be placed immediately before NewI, move any
     // intermediate ranges up.
     assert(NewI != I && "Inconsistent iterators");
     std::copy(llvm::next(I), NewI, I);
-    *llvm::prior(NewI) = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+    *llvm::prior(NewI)
+      = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
   }
 
-  /// Update LI to reflect an instruction has been moved upwards from OldIdx
+  /// Update LR to reflect an instruction has been moved upwards from OldIdx
   /// to NewIdx.
   ///
   /// 1. Live def at OldIdx:
@@ -894,11 +895,11 @@ private:
   ///    Hoist kill to NewIdx, then scan for last kill between NewIdx and
   ///    OldIdx.
   ///
-  void handleMoveUp(LiveInterval &LI) {
+  void handleMoveUp(LiveRange &LR, unsigned Reg) {
     // First look for a kill at OldIdx.
-    LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
-    LiveInterval::iterator E = LI.end();
-    // Is LI even live at OldIdx?
+    LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
+    LiveRange::iterator E = LR.end();
+    // Is LR even live at OldIdx?
     if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
       return;
 
@@ -915,7 +916,7 @@ private:
       if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
         // No def, search for the new kill.
         // This can never be an early clobber kill since there is no def.
-        llvm::prior(I)->end = findLastUseBefore(LI.reg).getRegSlot();
+        llvm::prior(I)->end = findLastUseBefore(Reg).getRegSlot();
         return;
       }
     }
@@ -927,18 +928,18 @@ private:
     DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
 
     // Check for an existing def at NewIdx.
-    LiveInterval::iterator NewI = LI.find(NewIdx.getRegSlot());
+    LiveRange::iterator NewI = LR.find(NewIdx.getRegSlot());
     if (SlotIndex::isSameInstr(NewI->start, NewIdx)) {
       assert(NewI->valno != DefVNI && "Same value defined more than once?");
       // There is an existing def at NewIdx.
       if (I->end.isDead()) {
         // Case 3: Remove the dead def at OldIdx.
-        LI.removeValNo(DefVNI);
+        LR.removeValNo(DefVNI);
         return;
       }
       // Case 4: Replace def at NewIdx with live def at OldIdx.
       I->start = DefVNI->def;
-      LI.removeValNo(NewI->valno);
+      LR.removeValNo(NewI->valno);
       return;
     }
 
@@ -949,10 +950,10 @@ private:
       return;
     }
 
-    // DefVNI is a dead def. It may have been moved across other values in LI,
+    // DefVNI is a dead def. It may have been moved across other values in LR,
     // so move I up to NewI. Slide [NewI;I) down one position.
     std::copy_backward(NewI, I, llvm::next(I));
-    *NewI = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
+    *NewI = LiveRange::Segment(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
   }
 
   void updateRegMaskSlots() {
@@ -1075,8 +1076,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
       if (MOI->isReg() &&
           TargetRegisterInfo::isVirtualRegister(MOI->getReg()) &&
           !hasInterval(MOI->getReg())) {
-        LiveInterval &LI = getOrCreateInterval(MOI->getReg());
-        computeVirtRegInterval(&LI);
+        createAndComputeVirtRegInterval(MOI->getReg());
       }
     }
   }
@@ -1123,9 +1123,9 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
               if (LII != LI.begin())
                 prevStart = llvm::prior(LII)->start;
 
-              // FIXME: This could be more efficient if there was a removeRange
-              // method that returned an iterator.
-              LI.removeRange(*LII, true);
+              // FIXME: This could be more efficient if there was a
+              // removeSegment method that returned an iterator.
+              LI.removeSegment(*LII, true);
               if (prevStart.isValid())
                 LII = LI.find(prevStart);
               else
@@ -1144,13 +1144,14 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
           if (!lastUseIdx.isValid()) {
             VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
                                           VNInfoAllocator);
-            LiveRange LR(instrIdx.getRegSlot(), instrIdx.getDeadSlot(), VNI);
-            LII = LI.addRange(LR);
+            LiveRange::Segment S(instrIdx.getRegSlot(),
+                                 instrIdx.getDeadSlot(), VNI);
+            LII = LI.addSegment(S);
           } else if (LII->start != instrIdx.getRegSlot()) {
             VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
                                           VNInfoAllocator);
-            LiveRange LR(instrIdx.getRegSlot(), lastUseIdx, VNI);
-            LII = LI.addRange(LR);
+            LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
+            LII = LI.addSegment(S);
           }
 
           if (MO.getSubReg() && !MO.isUndef())
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index dede490..ae086bc 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -36,11 +36,11 @@ void LiveRangeCalc::reset(const MachineFunction *mf,
 }
 
 
-void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
+void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
   assert(MRI && Indexes && "call reset() first");
 
   // Visit all def operands. If the same instruction has multiple defs of Reg,
-  // LI->createDeadDef() will deduplicate.
+  // LR.createDeadDef() will deduplicate.
   for (MachineRegisterInfo::def_iterator
        I = MRI->def_begin(Reg), E = MRI->def_end(); I != E; ++I) {
     const MachineInstr *MI = &*I;
@@ -54,13 +54,13 @@ void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) {
       Idx = Indexes->getInstructionIndex(MI)
         .getRegSlot(I.getOperand().isEarlyClobber());
 
-    // Create the def in LI. This may find an existing def.
-    LI->createDeadDef(Idx, *Alloc);
+    // Create the def in LR. This may find an existing def.
+    LR.createDeadDef(Idx, *Alloc);
   }
 }
 
 
-void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
+void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg) {
   assert(MRI && Indexes && "call reset() first");
 
   // Visit all operands that read Reg. This may include partial defs.
@@ -99,7 +99,7 @@ void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
           Idx = Idx.getRegSlot(true);
       }
     }
-    extend(LI, Idx, Reg);
+    extend(LR, Idx, Reg);
   }
 }
 
@@ -125,17 +125,14 @@ void LiveRangeCalc::updateLiveIns() {
       assert(Seen.test(MBB->getNumber()));
       LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)0);
     }
-    Updater.setDest(I->LI);
+    Updater.setDest(&I->LR);
     Updater.add(Start, End, I->Value);
   }
   LiveIn.clear();
 }
 
 
-void LiveRangeCalc::extend(LiveInterval *LI,
-                           SlotIndex Kill,
-                           unsigned PhysReg) {
-  assert(LI && "Missing live range");
+void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg) {
   assert(Kill.isValid() && "Invalid SlotIndex");
   assert(Indexes && "Missing SlotIndexes");
   assert(DomTree && "Missing dominator tree");
@@ -144,14 +141,14 @@ void LiveRangeCalc::extend(LiveInterval *LI,
   assert(KillMBB && "No MBB at Kill");
 
   // Is there a def in the same MBB we can extend?
-  if (LI->extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
+  if (LR.extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
     return;
 
   // Find the single reaching def, or determine if Kill is jointly dominated by
   // multiple values, and we may need to create even more phi-defs to preserve
   // VNInfo SSA form.  Perform a search for all predecessor blocks where we
   // know the dominating VNInfo.
-  if (findReachingDefs(LI, KillMBB, Kill, PhysReg))
+  if (findReachingDefs(LR, *KillMBB, Kill, PhysReg))
     return;
 
   // When there were multiple different values, we may need new PHIs.
@@ -170,13 +167,11 @@ void LiveRangeCalc::calculateValues() {
 }
 
 
-bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
-                                     MachineBasicBlock *KillMBB,
-                                     SlotIndex Kill,
-                                     unsigned PhysReg) {
-  unsigned KillMBBNum = KillMBB->getNumber();
+bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+                                     SlotIndex Kill, unsigned PhysReg) {
+  unsigned KillMBBNum = KillMBB.getNumber();
 
-  // Block numbers where LI should be live-in.
+  // Block numbers where LR should be live-in.
   SmallVector<unsigned, 16> WorkList(1, KillMBBNum);
 
   // Remember if we have seen more than one value.
@@ -203,7 +198,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 #endif
 
     for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-           PE = MBB->pred_end(); PI != PE; ++PI) {
+         PE = MBB->pred_end(); PI != PE; ++PI) {
        MachineBasicBlock *Pred = *PI;
 
        // Is this a known live-out block?
@@ -221,7 +216,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 
        // First time we see Pred.  Try to determine the live-out value, but set
        // it as null if Pred is live-through with an unknown value.
-       VNInfo *VNI = LI->extendInBlock(Start, End);
+       VNInfo *VNI = LR.extendInBlock(Start, End);
        setLiveOutValue(Pred, VNI);
        if (VNI) {
          if (TheVNI && TheVNI != VNI)
@@ -231,7 +226,7 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
        }
 
        // No, we need a live-in value for Pred as well
-       if (Pred != KillMBB)
+       if (Pred != &KillMBB)
           WorkList.push_back(Pred->getNumber());
        else
           // Loopback to KillMBB, so value is really live through.
@@ -248,9 +243,9 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
 
   // If a unique reaching def was found, blit in the live ranges immediately.
   if (UniqueVNI) {
-    LiveRangeUpdater Updater(LI);
-    for (SmallVectorImpl<unsigned>::const_iterator
-         I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+    LiveRangeUpdater Updater(&LR);
+    for (SmallVectorImpl<unsigned>::const_iterator I = WorkList.begin(),
+         E = WorkList.end(); I != E; ++I) {
        SlotIndex Start, End;
        tie(Start, End) = Indexes->getMBBRange(*I);
        // Trim the live range in KillMBB.
@@ -270,8 +265,8 @@ bool LiveRangeCalc::findReachingDefs(LiveInterval *LI,
   for (SmallVectorImpl<unsigned>::const_iterator
        I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
     MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
-    addLiveInBlock(LI, DomTree->getNode(MBB));
-    if (MBB == KillMBB)
+    addLiveInBlock(LR, DomTree->getNode(MBB));
+    if (MBB == &KillMBB)
       LiveIn.back().Kill = Kill;
   }
 
@@ -348,16 +343,17 @@ void LiveRangeCalc::updateSSA() {
         assert(Alloc && "Need VNInfo allocator to create PHI-defs");
         SlotIndex Start, End;
         tie(Start, End) = Indexes->getMBBRange(MBB);
-        VNInfo *VNI = I->LI->getNextValue(Start, *Alloc);
+        LiveRange &LR = I->LR;
+        VNInfo *VNI = LR.getNextValue(Start, *Alloc);
         I->Value = VNI;
         // This block is done, we know the final value.
         I->DomNode = 0;
 
         // Add liveness since updateLiveIns now skips this node.
         if (I->Kill.isValid())
-          I->LI->addRange(LiveRange(Start, I->Kill, VNI));
+          LR.addSegment(LiveInterval::Segment(Start, I->Kill, VNI));
         else {
-          I->LI->addRange(LiveRange(Start, End, VNI));
+          LR.addSegment(LiveInterval::Segment(Start, End, VNI));
           LOP = LiveOutPair(VNI, Node);
         }
       } else if (IDomValue.first) {
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 57cab7b..a3a3fbb 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -75,9 +75,9 @@ class LiveRangeCalc {
   /// LiveInBlock - Information about a basic block where a live range is known
   /// to be live-in, but the value has not yet been determined.
   struct LiveInBlock {
-    // LI - The live range that is live-in to this block.  The algorithms can
+    // The live range set that is live-in to this block.  The algorithms can
     // handle multiple non-overlapping live ranges simultaneously.
-    LiveInterval *LI;
+    LiveRange &LR;
 
     // DomNode - Dominator tree node for the block.
     // Cleared when the final value has been determined and LI has been updated.
@@ -91,8 +91,8 @@ class LiveRangeCalc {
     // Live-in value filled in by updateSSA once it is known.
     VNInfo *Value;
 
-    LiveInBlock(LiveInterval *li, MachineDomTreeNode *node, SlotIndex kill)
-      : LI(li), DomNode(node), Kill(kill), Value(0) {}
+    LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill)
+      : LR(LR), DomNode(node), Kill(kill), Value(0) {}
   };
 
   /// LiveIn - Work list of blocks where the live-in value has yet to be
@@ -111,10 +111,8 @@ class LiveRangeCalc {
   /// are added to the LiveIn array, and the function returns false.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  bool findReachingDefs(LiveInterval *LI,
-                        MachineBasicBlock *KillMBB,
-                        SlotIndex Kill,
-                        unsigned PhysReg);
+  bool findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+                        SlotIndex Kill, unsigned PhysReg);
 
   /// updateSSA - Compute the values that will be live in to all requested
   /// blocks in LiveIn.  Create PHI-def values as required to preserve SSA form.
@@ -146,10 +144,6 @@ public:
              MachineDominatorTree*,
              VNInfo::Allocator*);
 
-  /// calculate - Calculate the live range of a virtual register from its defs
-  /// and uses.  LI must be empty with no values.
-  void calculate(LiveInterval *LI);
-
   //===--------------------------------------------------------------------===//
   // Mid-level interface.
   //===--------------------------------------------------------------------===//
@@ -165,27 +159,27 @@ public:
   /// single existing value, Alloc may be null.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0);
+  void extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg = 0);
 
   /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
   /// Each instruction defining Reg gets a new VNInfo with a corresponding
   /// minimal live range.
-  void createDeadDefs(LiveInterval *LI, unsigned Reg);
+  void createDeadDefs(LiveRange &LR, unsigned Reg);
 
   /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
-  void createDeadDefs(LiveInterval *LI) {
-    createDeadDefs(LI, LI->reg);
+  void createDeadDefs(LiveInterval &LI) {
+    createDeadDefs(LI, LI.reg);
   }
 
   /// extendToUses - Extend the live range of LI to reach all uses of Reg.
   ///
   /// All uses must be jointly dominated by existing liveness.  PHI-defs are
   /// inserted as needed to preserve SSA form.
-  void extendToUses(LiveInterval *LI, unsigned Reg);
+  void extendToUses(LiveRange &LR, unsigned Reg);
 
   /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
-  void extendToUses(LiveInterval *LI) {
-    extendToUses(LI, LI->reg);
+  void extendToUses(LiveInterval &LI) {
+    extendToUses(LI, LI.reg);
   }
 
   //===--------------------------------------------------------------------===//
@@ -216,15 +210,15 @@ public:
   /// function can only be called once per basic block.  Once the live-in value
   /// has been determined, calculateValues() will add liveness to LI.
   ///
-  /// @param LI      The live range that is live-in to the block.
+  /// @param LR      The live range that is live-in to the block.
   /// @param DomNode The domtree node for the block.
   /// @param Kill    Index in block where LI is killed.  If the value is
   ///                live-through, set Kill = SLotIndex() and also call
   ///                setLiveOutValue(MBB, 0).
-  void addLiveInBlock(LiveInterval *LI,
+  void addLiveInBlock(LiveRange &LR,
                       MachineDomTreeNode *DomNode,
                       SlotIndex Kill = SlotIndex()) {
-    LiveIn.push_back(LiveInBlock(LI, DomNode, Kill));
+    LiveIn.push_back(LiveInBlock(LR, DomNode, Kill));
   }
 
   /// calculateValues - Calculate the value that will be live-in to each block
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 7793e96..cb70c43 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -13,7 +13,6 @@
 
 #define DEBUG_TYPE "regalloc"
 #include "llvm/CodeGen/LiveRangeEdit.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -31,17 +30,23 @@ STATISTIC(NumFracRanges,     "Number of live ranges fractured by DCE");
 
 void LiveRangeEdit::Delegate::anchor() { }
 
-LiveInterval &LiveRangeEdit::createFrom(unsigned OldReg) {
+LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
   unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
   if (VRM) {
-    VRM->grow();
     VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
   }
-  LiveInterval &LI = LIS.getOrCreateInterval(VReg);
-  NewRegs.push_back(&LI);
+  LiveInterval &LI = LIS.createEmptyInterval(VReg);
   return LI;
 }
 
+unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
+  unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+  if (VRM) {
+    VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+  }
+  return VReg;
+}
+
 bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
                                           const MachineInstr *DefMI,
                                           AliasAnalysis *aa) {
@@ -216,108 +221,122 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
   return true;
 }
 
-void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
-                                      ArrayRef<unsigned> RegsBeingSpilled) {
-  SetVector<LiveInterval*,
-            SmallVector<LiveInterval*, 8>,
-            SmallPtrSet<LiveInterval*, 8> > ToShrink;
+/// Find all live intervals that need to shrink, then remove the instruction.
+void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink) {
+  assert(MI->allDefsAreDead() && "Def isn't really dead");
+  SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
 
-  for (;;) {
-    // Erase all dead defs.
-    while (!Dead.empty()) {
-      MachineInstr *MI = Dead.pop_back_val();
-      assert(MI->allDefsAreDead() && "Def isn't really dead");
-      SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
-
-      // Never delete inline asm.
-      if (MI->isInlineAsm()) {
-        DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
-        continue;
-      }
+  // Never delete a bundled instruction.
+  if (MI->isBundled()) {
+    return;
+  }
+  // Never delete inline asm.
+  if (MI->isInlineAsm()) {
+    DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
+    return;
+  }
 
-      // Use the same criteria as DeadMachineInstructionElim.
-      bool SawStore = false;
-      if (!MI->isSafeToMove(&TII, 0, SawStore)) {
-        DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
-        continue;
-      }
+  // Use the same criteria as DeadMachineInstructionElim.
+  bool SawStore = false;
+  if (!MI->isSafeToMove(&TII, 0, SawStore)) {
+    DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
+    return;
+  }
 
-      DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
-
-      // Collect virtual registers to be erased after MI is gone.
-      SmallVector<unsigned, 8> RegsToErase;
-      bool ReadsPhysRegs = false;
-
-      // Check for live intervals that may shrink
-      for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
-             MOE = MI->operands_end(); MOI != MOE; ++MOI) {
-        if (!MOI->isReg())
-          continue;
-        unsigned Reg = MOI->getReg();
-        if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
-          // Check if MI reads any unreserved physregs.
-          if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
-            ReadsPhysRegs = true;
-          continue;
-        }
-        LiveInterval &LI = LIS.getInterval(Reg);
-
-        // Shrink read registers, unless it is likely to be expensive and
-        // unlikely to change anything. We typically don't want to shrink the
-        // PIC base register that has lots of uses everywhere.
-        // Always shrink COPY uses that probably come from live range splitting.
-        if (MI->readsVirtualRegister(Reg) &&
-            (MI->isCopy() || MOI->isDef() || MRI.hasOneNonDBGUse(Reg) ||
-             LI.killedAt(Idx)))
-          ToShrink.insert(&LI);
-
-        // Remove defined value.
-        if (MOI->isDef()) {
-          if (VNInfo *VNI = LI.getVNInfoAt(Idx)) {
-            if (TheDelegate)
-              TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
-            LI.removeValNo(VNI);
-            if (LI.empty())
-              RegsToErase.push_back(Reg);
+  DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
+
+  // Collect virtual registers to be erased after MI is gone.
+  SmallVector<unsigned, 8> RegsToErase;
+  bool ReadsPhysRegs = false;
+
+  // Check for live intervals that may shrink
+  for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+         MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+    if (!MOI->isReg())
+      continue;
+    unsigned Reg = MOI->getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+      // Check if MI reads any unreserved physregs.
+      if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
+        ReadsPhysRegs = true;
+      else if (MOI->isDef()) {
+        for (MCRegUnitIterator Units(Reg, MRI.getTargetRegisterInfo());
+             Units.isValid(); ++Units) {
+          if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
+            if (VNInfo *VNI = LR->getVNInfoAt(Idx))
+              LR->removeValNo(VNI);
           }
         }
       }
-
-      // Currently, we don't support DCE of physreg live ranges. If MI reads
-      // any unreserved physregs, don't erase the instruction, but turn it into
-      // a KILL instead. This way, the physreg live ranges don't end up
-      // dangling.
-      // FIXME: It would be better to have something like shrinkToUses() for
-      // physregs. That could potentially enable more DCE and it would free up
-      // the physreg. It would not happen often, though.
-      if (ReadsPhysRegs) {
-        MI->setDesc(TII.get(TargetOpcode::KILL));
-        // Remove all operands that aren't physregs.
-        for (unsigned i = MI->getNumOperands(); i; --i) {
-          const MachineOperand &MO = MI->getOperand(i-1);
-          if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
-            continue;
-          MI->RemoveOperand(i-1);
-        }
-        DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
-      } else {
+      continue;
+    }
+    LiveInterval &LI = LIS.getInterval(Reg);
+
+    // Shrink read registers, unless it is likely to be expensive and
+    // unlikely to change anything. We typically don't want to shrink the
+    // PIC base register that has lots of uses everywhere.
+    // Always shrink COPY uses that probably come from live range splitting.
+    if (MI->readsVirtualRegister(Reg) &&
+        (MI->isCopy() || MOI->isDef() || MRI.hasOneNonDBGUse(Reg) ||
+         LI.Query(Idx).isKill()))
+      ToShrink.insert(&LI);
+
+    // Remove defined value.
+    if (MOI->isDef()) {
+      if (VNInfo *VNI = LI.getVNInfoAt(Idx)) {
         if (TheDelegate)
-          TheDelegate->LRE_WillEraseInstruction(MI);
-        LIS.RemoveMachineInstrFromMaps(MI);
-        MI->eraseFromParent();
-        ++NumDCEDeleted;
+          TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
+        LI.removeValNo(VNI);
+        if (LI.empty())
+          RegsToErase.push_back(Reg);
       }
+    }
+  }
 
-      // Erase any virtregs that are now empty and unused. There may be <undef>
-      // uses around. Keep the empty live range in that case.
-      for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
-        unsigned Reg = RegsToErase[i];
-        if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
-          ToShrink.remove(&LIS.getInterval(Reg));
-          eraseVirtReg(Reg);
-        }
-      }
+  // Currently, we don't support DCE of physreg live ranges. If MI reads
+  // any unreserved physregs, don't erase the instruction, but turn it into
+  // a KILL instead. This way, the physreg live ranges don't end up
+  // dangling.
+  // FIXME: It would be better to have something like shrinkToUses() for
+  // physregs. That could potentially enable more DCE and it would free up
+  // the physreg. It would not happen often, though.
+  if (ReadsPhysRegs) {
+    MI->setDesc(TII.get(TargetOpcode::KILL));
+    // Remove all operands that aren't physregs.
+    for (unsigned i = MI->getNumOperands(); i; --i) {
+      const MachineOperand &MO = MI->getOperand(i-1);
+      if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+        continue;
+      MI->RemoveOperand(i-1);
     }
+    DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
+  } else {
+    if (TheDelegate)
+      TheDelegate->LRE_WillEraseInstruction(MI);
+    LIS.RemoveMachineInstrFromMaps(MI);
+    MI->eraseFromParent();
+    ++NumDCEDeleted;
+  }
+
+  // Erase any virtregs that are now empty and unused. There may be <undef>
+  // uses around. Keep the empty live range in that case.
+  for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
+    unsigned Reg = RegsToErase[i];
+    if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
+      ToShrink.remove(&LIS.getInterval(Reg));
+      eraseVirtReg(Reg);
+    }
+  }
+}
+
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
+                                      ArrayRef<unsigned> RegsBeingSpilled) {
+  ToShrinkSet ToShrink;
+
+  for (;;) {
+    // Erase all dead defs.
+    while (!Dead.empty())
+      eliminateDeadDef(Dead.pop_back_val(), ToShrink);
 
     if (ToShrink.empty())
       break;
@@ -331,7 +350,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
       TheDelegate->LRE_WillShrinkVirtReg(LI->reg);
     if (!LIS.shrinkToUses(LI, &Dead))
       continue;
-    
+
     // Don't create new intervals for a register being spilled.
     // The new intervals would have to be spilled anyway so its not worth it.
     // Also they currently aren't spilled so creating them and not spilling
@@ -343,11 +362,11 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
         break;
       }
     }
-    
+
     if (BeingSpilled) continue;
 
     // LI may have been separated, create new intervals.
-    LI->RenumberValues(LIS);
+    LI->RenumberValues();
     ConnectedVNInfoEqClasses ConEQ(LIS);
     unsigned NumComp = ConEQ.Classify(LI);
     if (NumComp <= 1)
@@ -357,7 +376,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
     DEBUG(dbgs() << NumComp << " components: " << *LI << '\n');
     SmallVector<LiveInterval*, 8> Dups(1, LI);
     for (unsigned i = 1; i != NumComp; ++i) {
-      Dups.push_back(&createFrom(LI->reg));
+      Dups.push_back(&createEmptyIntervalFrom(LI->reg));
       // If LI is an original interval that hasn't been split yet, make the new
       // intervals their own originals instead of referring to LI. The original
       // interval must contain all the split products, and LI doesn't.
@@ -374,14 +393,27 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
   }
 }
 
-void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
-                                             const MachineLoopInfo &Loops) {
-  VirtRegAuxInfo VRAI(MF, LIS, Loops);
-  for (iterator I = begin(), E = end(); I != E; ++I) {
-    LiveInterval &LI = **I;
+// Keep track of new virtual registers created via
+// MachineRegisterInfo::createVirtualRegister.
+void
+LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)
+{
+  if (VRM)
+    VRM->grow();
+
+  NewRegs.push_back(VReg);
+}
+
+void
+LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
+                                        const MachineLoopInfo &Loops,
+                                        const MachineBlockFrequencyInfo &MBFI) {
+  VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI);
+  for (unsigned I = 0, Size = size(); I < Size; ++I) {
+    LiveInterval &LI = LIS.getInterval(get(I));
     if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
       DEBUG(dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
                    << MRI.getRegClass(LI.reg)->getName() << '\n');
-    VRAI.CalculateWeightAndHint(LI);
+    VRAI.calculateSpillWeightAndHint(LI);
   }
 }
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index 0ef069f..1d801ac 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -119,9 +119,11 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
   if (VirtReg.empty())
     return false;
   CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
-    if (VirtReg.overlaps(LIS->getRegUnit(*Units), CP, *LIS->getSlotIndexes()))
+  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+    const LiveRange &UnitRange = LIS->getRegUnit(*Units);
+    if (VirtReg.overlaps(UnitRange, CP, *LIS->getSlotIndexes()))
       return true;
+  }
   return false;
 }
 
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
new file mode 100644
index 0000000..6221ca2
--- /dev/null
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -0,0 +1,111 @@
+//===-- LiveInterval.cpp - Live Interval Representation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRegUnits utility for tracking liveness of
+// physical register units across machine instructions in forward or backward
+// order.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+using namespace llvm;
+
+/// Return true if the given MachineOperand clobbers the given register unit.
+/// A register unit is only clobbered if all its super-registers are clobbered.
+static bool operClobbersUnit(const MachineOperand *MO, unsigned Unit,
+                             const MCRegisterInfo *MCRI) {
+  for (MCRegUnitRootIterator RI(Unit, MCRI); RI.isValid(); ++RI) {
+    for (MCSuperRegIterator SI(*RI, MCRI, true); SI.isValid(); ++SI) {
+      if (!MO->clobbersPhysReg(*SI))
+        return false;
+    }
+  }
+  return true;
+}
+
+/// We assume the high bits of a physical super register are not preserved
+/// unless the instruction has an implicit-use operand reading the
+/// super-register or a register unit for the upper bits is available.
+void LiveRegUnits::removeRegsInMask(const MachineOperand &Op,
+                                    const MCRegisterInfo &MCRI) {
+  SparseSet<unsigned>::iterator LUI = LiveUnits.begin();
+  while (LUI != LiveUnits.end()) {
+    if (operClobbersUnit(&Op, *LUI, &MCRI))
+      LUI = LiveUnits.erase(LUI);
+    else
+      ++LUI;
+  }
+}
+
+void LiveRegUnits::stepBackward(const MachineInstr &MI,
+                                const MCRegisterInfo &MCRI) {
+  // Remove defined registers and regmask kills from the set.
+  for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (O->isReg()) {
+      if (!O->isDef())
+        continue;
+      unsigned Reg = O->getReg();
+      if (Reg == 0)
+        continue;
+      removeReg(Reg, MCRI);
+    } else if (O->isRegMask()) {
+      removeRegsInMask(*O, MCRI);
+    }
+  }
+  // Add uses to the set.
+  for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (!O->isReg() || !O->readsReg() || O->isUndef())
+      continue;
+    unsigned Reg = O->getReg();
+    if (Reg == 0)
+      continue;
+    addReg(Reg, MCRI);
+  }
+}
+
+/// Uses with kill flag get removed from the set, defs added. If possible
+/// use StepBackward() instead of this function because some kill flags may
+/// be missing.
+void LiveRegUnits::stepForward(const MachineInstr &MI,
+                               const MCRegisterInfo &MCRI) {
+  SmallVector<unsigned, 4> Defs;
+  // Remove killed registers from the set.
+  for (ConstMIBundleOperands O(&MI); O.isValid(); ++O) {
+    if (O->isReg()) {
+      unsigned Reg = O->getReg();
+      if (Reg == 0)
+        continue;
+      if (O->isDef()) {
+        if (!O->isDead())
+          Defs.push_back(Reg);
+      } else {
+        if (!O->isKill())
+          continue;
+        assert(O->isUse());
+        removeReg(Reg, MCRI);
+      }
+    } else if (O->isRegMask()) {
+      removeRegsInMask(*O, MCRI);
+    }
+  }
+  // Add defs to the set.
+  for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+    addReg(Defs[i], MCRI);
+  }
+}
+
+/// Adds all registers in the live-in list of block @p BB.
+void LiveRegUnits::addLiveIns(const MachineBasicBlock *MBB,
+                              const MCRegisterInfo &MCRI) {
+  for (MachineBasicBlock::livein_iterator L = MBB->livein_begin(),
+         LE = MBB->livein_end(); L != LE; ++L) {
+    addReg(*L, MCRI);
+  }
+}
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 789eddc..ed55d7a 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -217,8 +217,8 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
       continue;
     unsigned DefReg = MO.getReg();
     if (TRI->isSubRegister(Reg, DefReg)) {
-      PartDefRegs.insert(DefReg);
-      for (MCSubRegIterator SubRegs(DefReg, TRI); SubRegs.isValid(); ++SubRegs)
+      for (MCSubRegIterator SubRegs(DefReg, TRI, /*IncludeSelf=*/true);
+           SubRegs.isValid(); ++SubRegs)
         PartDefRegs.insert(*SubRegs);
     }
   }
@@ -271,8 +271,8 @@ void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr *MI) {
                                                   true/*IsImp*/));
 
   // Remember this use.
-  PhysRegUse[Reg]  = MI;
-  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+  for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+       SubRegs.isValid(); ++SubRegs)
     PhysRegUse[*SubRegs] =  MI;
 }
 
@@ -350,8 +350,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
       continue;
     }
     if (MachineInstr *Use = PhysRegUse[SubReg]) {
-      PartUses.insert(SubReg);
-      for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+      for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true); SS.isValid();
+           ++SS)
         PartUses.insert(*SS);
       unsigned Dist = DistanceMap[Use];
       if (Dist > LastRefOrPartRefDist) {
@@ -387,8 +387,8 @@ bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
         LastSubRef->addRegisterKilled(SubReg, TRI, true);
       else {
         LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true);
-        PhysRegUse[SubReg] = LastRefOrPartRef;
-        for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+        for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true);
+             SS.isValid(); ++SS)
           PhysRegUse[*SS] = LastRefOrPartRef;
       }
       for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
@@ -441,12 +441,12 @@ void LiveVariables::HandleRegMask(const MachineOperand &MO) {
 }
 
 void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
-                                     SmallVector<unsigned, 4> &Defs) {
+                                     SmallVectorImpl<unsigned> &Defs) {
   // What parts of the register are previously defined?
   SmallSet<unsigned, 32> Live;
   if (PhysRegDef[Reg] || PhysRegUse[Reg]) {
-    Live.insert(Reg);
-    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+    for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+         SubRegs.isValid(); ++SubRegs)
       Live.insert(*SubRegs);
   } else {
     for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
@@ -460,8 +460,8 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
       if (Live.count(SubReg))
         continue;
       if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) {
-        Live.insert(SubReg);
-        for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+        for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true);
+             SS.isValid(); ++SS)
           Live.insert(*SS);
       }
     }
@@ -484,13 +484,12 @@ void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
 }
 
 void LiveVariables::UpdatePhysRegDefs(MachineInstr *MI,
-                                      SmallVector<unsigned, 4> &Defs) {
+                                      SmallVectorImpl<unsigned> &Defs) {
   while (!Defs.empty()) {
     unsigned Reg = Defs.back();
     Defs.pop_back();
-    PhysRegDef[Reg]  = MI;
-    PhysRegUse[Reg]  = NULL;
-    for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+    for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+         SubRegs.isValid(); ++SubRegs) {
       unsigned SubReg = *SubRegs;
       PhysRegDef[SubReg]  = MI;
       PhysRegUse[SubReg]  = NULL;
@@ -610,9 +609,9 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
     // if they have PHI nodes, and if so, we simulate an assignment at the end
     // of the current block.
     if (!PHIVarInfo[MBB->getNumber()].empty()) {
-      SmallVector<unsigned, 4>& VarInfoVec = PHIVarInfo[MBB->getNumber()];
+      SmallVectorImpl<unsigned> &VarInfoVec = PHIVarInfo[MBB->getNumber()];
 
-      for (SmallVector<unsigned, 4>::iterator I = VarInfoVec.begin(),
+      for (SmallVectorImpl<unsigned>::iterator I = VarInfoVec.begin(),
              E = VarInfoVec.end(); I != E; ++I)
         // Mark it alive only in the block we are representing.
         MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(),
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 78e9950..ca71e3b 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -19,6 +19,7 @@
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SlotIndexes.h"
@@ -51,7 +52,7 @@ MCSymbol *MachineBasicBlock::getSymbol() const {
   if (!CachedMCSymbol) {
     const MachineFunction *MF = getParent();
     MCContext &Ctx = MF->getContext();
-    const char *Prefix = Ctx.getAsmInfo().getPrivateGlobalPrefix();
+    const char *Prefix = Ctx.getAsmInfo()->getPrivateGlobalPrefix();
     CachedMCSymbol = Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" +
                                            Twine(MF->getFunctionNumber()) +
                                            "_" + Twine(getNumber()));
@@ -341,6 +342,38 @@ bool MachineBasicBlock::isLiveIn(unsigned Reg) const {
   return I != livein_end();
 }
 
+unsigned
+MachineBasicBlock::addLiveIn(unsigned PhysReg, const TargetRegisterClass *RC) {
+  assert(getParent() && "MBB must be inserted in function");
+  assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Expected physreg");
+  assert(RC && "Register class is required");
+  assert((isLandingPad() || this == &getParent()->front()) &&
+         "Only the entry block and landing pads can have physreg live ins");
+
+  bool LiveIn = isLiveIn(PhysReg);
+  iterator I = SkipPHIsAndLabels(begin()), E = end();
+  MachineRegisterInfo &MRI = getParent()->getRegInfo();
+  const TargetInstrInfo &TII = *getParent()->getTarget().getInstrInfo();
+
+  // Look for an existing copy.
+  if (LiveIn)
+    for (;I != E && I->isCopy(); ++I)
+      if (I->getOperand(1).getReg() == PhysReg) {
+        unsigned VirtReg = I->getOperand(0).getReg();
+        if (!MRI.constrainRegClass(VirtReg, RC))
+          llvm_unreachable("Incompatible live-in register class.");
+        return VirtReg;
+      }
+
+  // No luck, create a virtual register.
+  unsigned VirtReg = MRI.createVirtualRegister(RC);
+  BuildMI(*this, I, DebugLoc(), TII.get(TargetOpcode::COPY), VirtReg)
+    .addReg(PhysReg, RegState::Kill);
+  if (!LiveIn)
+    addLiveIn(PhysReg);
+  return VirtReg;
+}
+
 void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
   getParent()->splice(NewAfter, this);
 }
@@ -828,7 +861,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
           LiveInterval &LI = LIS->getInterval(Reg);
           VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
           assert(VNI && "PHI sources should be live out of their predecessors.");
-          LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+          LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
         }
       }
     }
@@ -847,9 +880,9 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) {
       if (isLiveOut && isLastMBB) {
         VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
         assert(VNI && "LiveInterval should have VNInfo where it is live.");
-        LI.addRange(LiveRange(StartIndex, EndIndex, VNI));
+        LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
       } else if (!isLiveOut && !isLastMBB) {
-        LI.removeRange(StartIndex, EndIndex);
+        LI.removeSegment(StartIndex, EndIndex);
       }
     }
 
diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index 070daf2..e269d24 100644
--- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -50,11 +50,6 @@ bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
   return false;
 }
 
-/// getblockFreq - Return block frequency. Return 0 if we don't have the
-/// information. Please note that initial frequency is equal to 1024. It means
-/// that we should not rely on the value itself, but only on the comparison to
-/// the other block frequencies. We do this to avoid using of floating points.
-///
 BlockFrequency MachineBlockFrequencyInfo::
 getBlockFreq(const MachineBasicBlock *MBB) const {
   return MBFI->getBlockFreq(MBB);
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index bfba503..4b0f7f3 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -991,6 +991,28 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
     Cond.clear();
     MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
     if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+      // The "PrevBB" is not yet updated to reflect current code layout, so,
+      //   o. it may fall-through to a block without explict "goto" instruction
+      //      before layout, and no longer fall-through it after layout; or 
+      //   o. just opposite.
+      // 
+      // AnalyzeBranch() may return erroneous value for FBB when these two
+      // situations take place. For the first scenario FBB is mistakenly set
+      // NULL; for the 2nd scenario, the FBB, which is expected to be NULL,
+      // is mistakenly pointing to "*BI".
+      //
+      bool needUpdateBr = true;
+      if (!Cond.empty() && (!FBB || FBB == *BI)) {
+        PrevBB->updateTerminator();
+        needUpdateBr = false;
+        Cond.clear();
+        TBB = FBB = 0;
+        if (TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+          // FIXME: This should never take place.
+          TBB = FBB = 0;
+        }
+      }
+
       // If PrevBB has a two-way branch, try to re-order the branches
       // such that we branch to the successor with higher weight first.
       if (TBB && !Cond.empty() && FBB &&
@@ -1003,8 +1025,10 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
         DebugLoc dl;  // FIXME: this is nowhere
         TII->RemoveBranch(*PrevBB);
         TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl);
+        needUpdateBr = true;
       }
-      PrevBB->updateTerminator();
+      if (needUpdateBr)
+        PrevBB->updateTerminator();
     }
   }
 
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index 61d8d38..d228286 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -84,11 +84,11 @@ namespace {
     bool hasLivePhysRegDefUses(const MachineInstr *MI,
                                const MachineBasicBlock *MBB,
                                SmallSet<unsigned,8> &PhysRefs,
-                               SmallVector<unsigned,2> &PhysDefs,
+                               SmallVectorImpl<unsigned> &PhysDefs,
                                bool &PhysUseDef) const;
     bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
                           SmallSet<unsigned,8> &PhysRefs,
-                          SmallVector<unsigned,2> &PhysDefs,
+                          SmallVectorImpl<unsigned> &PhysDefs,
                           bool &NonLocal) const;
     bool isCSECandidate(MachineInstr *MI);
     bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
@@ -193,7 +193,7 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
 bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
                                        const MachineBasicBlock *MBB,
                                        SmallSet<unsigned,8> &PhysRefs,
-                                       SmallVector<unsigned,2> &PhysDefs,
+                                       SmallVectorImpl<unsigned> &PhysDefs,
                                        bool &PhysUseDef) const{
   // First, add all uses to PhysRefs.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
@@ -244,7 +244,7 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
 
 bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
                                   SmallSet<unsigned,8> &PhysRefs,
-                                  SmallVector<unsigned,2> &PhysDefs,
+                                  SmallVectorImpl<unsigned> &PhysDefs,
                                   bool &NonLocal) const {
   // For now conservatively returns false if the common subexpression is
   // not in the same basic block as the given instruction. The only exception
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index dc8a224..4f48e2c 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -213,9 +213,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
         CopyMap.erase(*AI);
         AvailCopyMap.erase(*AI);
       }
-      CopyMap[Def] = MI;
-      AvailCopyMap[Def] = MI;
-      for (MCSubRegIterator SR(Def, TRI); SR.isValid(); ++SR) {
+      for (MCSubRegIterator SR(Def, TRI, /*IncludeSelf=*/true); SR.isValid();
+           ++SR) {
         CopyMap[*SR] = MI;
         AvailCopyMap[*SR] = MI;
       }
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 04321f3..0703df0 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Assembly/Writer.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -54,23 +55,28 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
                                  GCModuleInfo* gmi)
   : Fn(F), Target(TM), Ctx(mmi.getContext()), MMI(mmi), GMI(gmi) {
   if (TM.getRegisterInfo())
-    RegInfo = new (Allocator) MachineRegisterInfo(*TM.getRegisterInfo());
+    RegInfo = new (Allocator) MachineRegisterInfo(TM);
   else
     RegInfo = 0;
+
   MFInfo = 0;
-  FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameLowering(),
-                                               TM.Options.RealignStack);
+  FrameInfo =
+    new (Allocator) MachineFrameInfo(TM,!F->hasFnAttribute("no-realign-stack"));
+
   if (Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                        Attribute::StackAlignment))
     FrameInfo->ensureMaxAlignment(Fn->getAttributes().
                                 getStackAlignment(AttributeSet::FunctionIndex));
-  ConstantPool = new (Allocator) MachineConstantPool(TM.getDataLayout());
+
+  ConstantPool = new (Allocator) MachineConstantPool(TM);
   Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
+
   // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
   if (!Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                         Attribute::OptimizeForSize))
     Alignment = std::max(Alignment,
                          TM.getTargetLowering()->getPrefFunctionAlignment());
+
   FunctionNumber = FunctionNum;
   JumpTableInfo = 0;
 }
@@ -456,11 +462,15 @@ MCSymbol *MachineFunction::getPICBaseSymbol() const {
 //  MachineFrameInfo implementation
 //===----------------------------------------------------------------------===//
 
+const TargetFrameLowering *MachineFrameInfo::getFrameLowering() const {
+  return TM.getFrameLowering();
+}
+
 /// ensureMaxAlignment - Make sure the function is at least Align bytes
 /// aligned.
 void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
-  if (!TFI.isStackRealignable() || !RealignOption)
-    assert(Align <= TFI.getStackAlignment() &&
+  if (!getFrameLowering()->isStackRealignable() || !RealignOption)
+    assert(Align <= getFrameLowering()->getStackAlignment() &&
            "For targets without stack realignment, Align is out of limit!");
   if (MaxAlignment < Align) MaxAlignment = Align;
 }
@@ -482,8 +492,10 @@ static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
 int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
                       bool isSS, bool MayNeedSP, const AllocaInst *Alloca) {
   assert(Size != 0 && "Cannot allocate zero size stack objects!");
-  Alignment = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
-                                  Alignment, TFI.getStackAlignment());
+  Alignment =
+    clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
+                          !RealignOption,
+                        Alignment, getFrameLowering()->getStackAlignment());
   Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP,
                                 Alloca));
   int Index = (int)Objects.size() - NumFixedObjects - 1;
@@ -498,8 +510,10 @@ int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
 ///
 int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
                                              unsigned Alignment) {
-  Alignment = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
-                                  Alignment, TFI.getStackAlignment()); 
+  Alignment =
+    clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
+                          !RealignOption,
+                        Alignment, getFrameLowering()->getStackAlignment()); 
   CreateStackObject(Size, Alignment, true, false);
   int Index = (int)Objects.size() - NumFixedObjects - 1;
   ensureMaxAlignment(Alignment);
@@ -513,8 +527,10 @@ int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
 ///
 int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment) {
   HasVarSizedObjects = true;
-  Alignment = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
-                                  Alignment, TFI.getStackAlignment()); 
+  Alignment =
+    clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
+                          !RealignOption,
+                        Alignment, getFrameLowering()->getStackAlignment()); 
   Objects.push_back(StackObject(0, Alignment, 0, false, false, true, 0));
   ensureMaxAlignment(Alignment);
   return (int)Objects.size()-NumFixedObjects-1;
@@ -532,10 +548,12 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
   // the incoming frame position.  If the frame object is at offset 32 and
   // the stack is guaranteed to be 16-byte aligned, then we know that the
   // object is 16-byte aligned.
-  unsigned StackAlign = TFI.getStackAlignment();
+  unsigned StackAlign = getFrameLowering()->getStackAlignment();
   unsigned Align = MinAlign(SPOffset, StackAlign);
-  Align = clampStackAlignment(!TFI.isStackRealignable() || !RealignOption,
-                              Align, TFI.getStackAlignment()); 
+  Align =
+    clampStackAlignment(!getFrameLowering()->isStackRealignable() ||
+                          !RealignOption,
+                        Align, getFrameLowering()->getStackAlignment()); 
   Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
                                               /*isSS*/   false,
                                               /*NeedSP*/ false,
@@ -769,6 +787,10 @@ void MachineJumpTableInfo::dump() const { print(dbgs()); }
 
 void MachineConstantPoolValue::anchor() { }
 
+const DataLayout *MachineConstantPool::getDataLayout() const {
+  return TM.getDataLayout();
+}
+
 Type *MachineConstantPoolEntry::getType() const {
   if (isMachineConstantPoolEntry())
     return Val.MachineCPVal->getType();
@@ -850,7 +872,8 @@ unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
   // FIXME, this could be made much more efficient for large constant pools.
   for (unsigned i = 0, e = Constants.size(); i != e; ++i)
     if (!Constants[i].isMachineConstantPoolEntry() &&
-        CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, TD)) {
+        CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C,
+                                  getDataLayout())) {
       if ((unsigned)Constants[i].getAlignment() < Alignment)
         Constants[i].Alignment = Alignment;
       return i;
@@ -887,7 +910,7 @@ void MachineConstantPool::print(raw_ostream &OS) const {
     if (Constants[i].isMachineConstantPoolEntry())
       Constants[i].Val.MachineCPVal->print(OS);
     else
-      OS << *(const Value*)Constants[i].Val.ConstVal;
+      WriteAsOperand(OS, Constants[i].Val.ConstVal, /*PrintType=*/false);
     OS << ", align=" << Constants[i].getAlignment();
     OS << "\n";
   }
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 32d0668..295b450 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -647,12 +647,15 @@ void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
     }
   }
 
+#ifndef NDEBUG
+  bool isMetaDataOp = Op.getType() == MachineOperand::MO_Metadata;
   // OpNo now points as the desired insertion point.  Unless this is a variadic
   // instruction, only implicit regs are allowed beyond MCID->getNumOperands().
   // RegMask operands go between the explicit and implicit operands.
   assert((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
-          OpNo < MCID->getNumOperands()) &&
+          OpNo < MCID->getNumOperands() || isMetaDataOp) &&
          "Trying to add an operand to a machine instr that is already done!");
+#endif
 
   MachineRegisterInfo *MRI = getRegInfo();
 
@@ -1253,32 +1256,6 @@ bool MachineInstr::isSafeToMove(const TargetInstrInfo *TII,
   return true;
 }
 
-/// isSafeToReMat - Return true if it's safe to rematerialize the specified
-/// instruction which defined the specified register instead of copying it.
-bool MachineInstr::isSafeToReMat(const TargetInstrInfo *TII,
-                                 AliasAnalysis *AA,
-                                 unsigned DstReg) const {
-  bool SawStore = false;
-  if (!TII->isTriviallyReMaterializable(this, AA) ||
-      !isSafeToMove(TII, AA, SawStore))
-    return false;
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = getOperand(i);
-    if (!MO.isReg())
-      continue;
-    // FIXME: For now, do not remat any instruction with register operands.
-    // Later on, we can loosen the restriction is the register operands have
-    // not been modified between the def and use. Note, this is different from
-    // MachineSink because the code is no longer in two-address form (at least
-    // partially).
-    if (MO.isUse())
-      return false;
-    else if (!MO.isDead() && MO.getReg() != DstReg)
-      return false;
-  }
-  return true;
-}
-
 /// hasOrderedMemoryRef - Return true if this instruction may have an ordered
 /// or volatile memory reference, or if the information describing the memory
 /// reference is not available. Return false if it is known to have no ordered
@@ -1411,8 +1388,10 @@ static void printDebugLoc(DebugLoc DL, const MachineFunction *MF,
   const LLVMContext &Ctx = MF->getFunction()->getContext();
   if (!DL.isUnknown()) {          // Print source line info.
     DIScope Scope(DL.getScope(Ctx));
+    assert((!Scope || Scope.isScope()) &&
+      "Scope of a DebugLoc should be null or a DIScope.");
     // Omit the directory, because it's likely to be long and uninteresting.
-    if (Scope.Verify())
+    if (Scope)
       CommentOS << Scope.getFilename();
     else
       CommentOS << "<unknown>";
@@ -1726,31 +1705,31 @@ void MachineInstr::clearRegisterKills(unsigned Reg,
   }
 }
 
-bool MachineInstr::addRegisterDead(unsigned IncomingReg,
+bool MachineInstr::addRegisterDead(unsigned Reg,
                                    const TargetRegisterInfo *RegInfo,
                                    bool AddIfNotFound) {
-  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(Reg);
   bool hasAliases = isPhysReg &&
-    MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
+    MCRegAliasIterator(Reg, RegInfo, false).isValid();
   bool Found = false;
   SmallVector<unsigned,4> DeadOps;
   for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
     MachineOperand &MO = getOperand(i);
     if (!MO.isReg() || !MO.isDef())
       continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
+    unsigned MOReg = MO.getReg();
+    if (!MOReg)
       continue;
 
-    if (Reg == IncomingReg) {
+    if (MOReg == Reg) {
       MO.setIsDead();
       Found = true;
     } else if (hasAliases && MO.isDead() &&
-               TargetRegisterInfo::isPhysicalRegister(Reg)) {
+               TargetRegisterInfo::isPhysicalRegister(MOReg)) {
       // There exists a super-register that's marked dead.
-      if (RegInfo->isSuperRegister(IncomingReg, Reg))
+      if (RegInfo->isSuperRegister(Reg, MOReg))
         return true;
-      if (RegInfo->isSubRegister(IncomingReg, Reg))
+      if (RegInfo->isSubRegister(Reg, MOReg))
         DeadOps.push_back(i);
     }
   }
@@ -1770,7 +1749,7 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
   if (Found || !AddIfNotFound)
     return Found;
 
-  addOperand(MachineOperand::CreateReg(IncomingReg,
+  addOperand(MachineOperand::CreateReg(Reg,
                                        true  /*IsDef*/,
                                        true  /*IsImp*/,
                                        false /*IsKill*/,
@@ -1778,21 +1757,21 @@ bool MachineInstr::addRegisterDead(unsigned IncomingReg,
   return true;
 }
 
-void MachineInstr::addRegisterDefined(unsigned IncomingReg,
+void MachineInstr::addRegisterDefined(unsigned Reg,
                                       const TargetRegisterInfo *RegInfo) {
-  if (TargetRegisterInfo::isPhysicalRegister(IncomingReg)) {
-    MachineOperand *MO = findRegisterDefOperand(IncomingReg, false, RegInfo);
+  if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+    MachineOperand *MO = findRegisterDefOperand(Reg, false, RegInfo);
     if (MO)
       return;
   } else {
     for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
       const MachineOperand &MO = getOperand(i);
-      if (MO.isReg() && MO.getReg() == IncomingReg && MO.isDef() &&
+      if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
           MO.getSubReg() == 0)
         return;
     }
   }
-  addOperand(MachineOperand::CreateReg(IncomingReg,
+  addOperand(MachineOperand::CreateReg(Reg,
                                        true  /*IsDef*/,
                                        true  /*IsImp*/));
 }
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index ed3ed4d..104eacd 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -172,7 +172,7 @@ namespace {
                    BitVector &PhysRegDefs,
                    BitVector &PhysRegClobbers,
                    SmallSet<int, 32> &StoredFIs,
-                   SmallVector<CandidateInfo, 32> &Candidates);
+                   SmallVectorImpl<CandidateInfo> &Candidates);
 
     /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the
     /// current loop.
@@ -404,7 +404,7 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
                             BitVector &PhysRegDefs,
                             BitVector &PhysRegClobbers,
                             SmallSet<int, 32> &StoredFIs,
-                            SmallVector<CandidateInfo, 32> &Candidates) {
+                            SmallVectorImpl<CandidateInfo> &Candidates) {
   bool RuledOut = false;
   bool HasNonInvariantUse = false;
   unsigned Def = 0;
@@ -468,12 +468,12 @@ void MachineLICM::ProcessMI(MachineInstr *MI,
     for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
       if (PhysRegDefs.test(*AS))
         PhysRegClobbers.set(*AS);
-      if (PhysRegClobbers.test(*AS))
-        // MI defined register is seen defined by another instruction in
-        // the loop, it cannot be a LICM candidate.
-        RuledOut = true;
       PhysRegDefs.set(*AS);
     }
+    if (PhysRegClobbers.test(Reg))
+      // MI defined register is seen defined by another instruction in
+      // the loop, it cannot be a LICM candidate.
+      RuledOut = true;
   }
 
   // Only consider reloads for now and remats which do not have register
@@ -502,7 +502,7 @@ void MachineLICM::HoistRegionPostRA() {
 
   // Walk the entire region, count number of defs for each register, and
   // collect potential LICM candidates.
-  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
     MachineBasicBlock *BB = Blocks[i];
 
@@ -584,7 +584,7 @@ void MachineLICM::HoistRegionPostRA() {
 /// AddToLiveIns - Add register 'Reg' to the livein sets of BBs in the current
 /// loop, and make sure it is not killed by any instructions in the loop.
 void MachineLICM::AddToLiveIns(unsigned Reg) {
-  const std::vector<MachineBasicBlock*> Blocks = CurLoop->getBlocks();
+  const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
     MachineBasicBlock *BB = Blocks[i];
     if (!BB->isLiveIn(Reg))
@@ -1084,7 +1084,7 @@ bool MachineLICM::CanCauseHighRegPressure(DenseMap<unsigned, int> &Cost,
       return true;
 
     for (unsigned i = BackTrace.size(); i != 0; --i) {
-      SmallVector<unsigned, 8> &RP = BackTrace[i-1];
+      SmallVectorImpl<unsigned> &RP = BackTrace[i-1];
       if (RP[RCId] + Cost >= Limit)
         return true;
     }
@@ -1130,7 +1130,7 @@ void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) {
 
   // Update register pressure of blocks from loop header to current block.
   for (unsigned i = 0, e = BackTrace.size(); i != e; ++i) {
-    SmallVector<unsigned, 8> &RP = BackTrace[i];
+    SmallVectorImpl<unsigned> &RP = BackTrace[i];
     for (DenseMap<unsigned, int>::iterator CI = Cost.begin(), CE = Cost.end();
          CI != CE; ++CI) {
       unsigned RCId = CI->first;
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 8af9d05..bb54284 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -253,13 +253,12 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
 MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI,
                                      const MCRegisterInfo &MRI,
                                      const MCObjectFileInfo *MOFI)
-  : ImmutablePass(ID), Context(MAI, MRI, MOFI, 0, false) {
+  : ImmutablePass(ID), Context(&MAI, &MRI, MOFI, 0, false) {
   initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
 }
 
 MachineModuleInfo::MachineModuleInfo()
-  : ImmutablePass(ID),
-    Context(*(MCAsmInfo*)0, *(MCRegisterInfo*)0, (MCObjectFileInfo*)0) {
+  : ImmutablePass(ID), Context(0, 0, 0) {
   llvm_unreachable("This MachineModuleInfo constructor should never be called, "
                    "MMI should always be explicitly constructed by "
                    "LLVMTargetMachine");
@@ -303,7 +302,7 @@ bool MachineModuleInfo::doFinalization(Module &M) {
 ///
 void MachineModuleInfo::EndFunction() {
   // Clean up frame info.
-  FrameMoves.clear();
+  FrameInstructions.clear();
 
   // Clean up exception info.
   LandingPads.clear();
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 68372f6..f8b8796 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -19,16 +19,21 @@
 
 using namespace llvm;
 
-MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
-  : TRI(&TRI), IsSSA(true), TracksLiveness(true) {
+// Pin the vtable to this file.
+void MachineRegisterInfo::Delegate::anchor() {}
+
+MachineRegisterInfo::MachineRegisterInfo(const TargetMachine &TM)
+  : TM(TM), TheDelegate(0), IsSSA(true), TracksLiveness(true) {
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
-  UsedRegUnits.resize(TRI.getNumRegUnits());
-  UsedPhysRegMask.resize(TRI.getNumRegs());
+  UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
+  UsedPhysRegMask.resize(getTargetRegisterInfo()->getNumRegs());
 
   // Create the physreg use/def lists.
-  PhysRegUseDefLists = new MachineOperand*[TRI.getNumRegs()];
-  memset(PhysRegUseDefLists, 0, sizeof(MachineOperand*)*TRI.getNumRegs());
+  PhysRegUseDefLists =
+    new MachineOperand*[getTargetRegisterInfo()->getNumRegs()];
+  memset(PhysRegUseDefLists, 0,
+         sizeof(MachineOperand*)*getTargetRegisterInfo()->getNumRegs());
 }
 
 MachineRegisterInfo::~MachineRegisterInfo() {
@@ -50,7 +55,8 @@ MachineRegisterInfo::constrainRegClass(unsigned Reg,
   const TargetRegisterClass *OldRC = getRegClass(Reg);
   if (OldRC == RC)
     return RC;
-  const TargetRegisterClass *NewRC = TRI->getCommonSubClass(OldRC, RC);
+  const TargetRegisterClass *NewRC =
+    getTargetRegisterInfo()->getCommonSubClass(OldRC, RC);
   if (!NewRC || NewRC == OldRC)
     return NewRC;
   if (NewRC->getNumRegs() < MinNumRegs)
@@ -63,7 +69,8 @@ bool
 MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) {
   const TargetInstrInfo *TII = TM.getInstrInfo();
   const TargetRegisterClass *OldRC = getRegClass(Reg);
-  const TargetRegisterClass *NewRC = TRI->getLargestLegalSuperClass(OldRC);
+  const TargetRegisterClass *NewRC =
+    getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC);
 
   // Stop early if there is no room to grow.
   if (NewRC == OldRC)
@@ -73,14 +80,16 @@ MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) {
   for (reg_nodbg_iterator I = reg_nodbg_begin(Reg), E = reg_nodbg_end(); I != E;
        ++I) {
     const TargetRegisterClass *OpRC =
-      I->getRegClassConstraint(I.getOperandNo(), TII, TRI);
+      I->getRegClassConstraint(I.getOperandNo(), TII,
+                               getTargetRegisterInfo());
     if (unsigned SubIdx = I.getOperand().getSubReg()) {
       if (OpRC)
-        NewRC = TRI->getMatchingSuperRegClass(NewRC, OpRC, SubIdx);
+        NewRC = getTargetRegisterInfo()->getMatchingSuperRegClass(NewRC, OpRC,
+                                                                  SubIdx);
       else
-        NewRC = TRI->getSubClassWithSubReg(NewRC, SubIdx);
+        NewRC = getTargetRegisterInfo()->getSubClassWithSubReg(NewRC, SubIdx);
     } else if (OpRC)
-      NewRC = TRI->getCommonSubClass(NewRC, OpRC);
+      NewRC = getTargetRegisterInfo()->getCommonSubClass(NewRC, OpRC);
     if (!NewRC || NewRC == OldRC)
       return false;
   }
@@ -102,6 +111,8 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
   VRegInfo.grow(Reg);
   VRegInfo[Reg].first = RegClass;
   RegAllocHints.grow(Reg);
+  if (TheDelegate)
+    TheDelegate->MRI_NoteNewVirtualRegister(Reg);
   return Reg;
 }
 
@@ -126,24 +137,28 @@ void MachineRegisterInfo::verifyUseList(unsigned Reg) const {
     MachineOperand *MO = &I.getOperand();
     MachineInstr *MI = MO->getParent();
     if (!MI) {
-      errs() << PrintReg(Reg, TRI) << " use list MachineOperand " << MO
+      errs() << PrintReg(Reg, getTargetRegisterInfo())
+             << " use list MachineOperand " << MO
              << " has no parent instruction.\n";
       Valid = false;
     }
     MachineOperand *MO0 = &MI->getOperand(0);
     unsigned NumOps = MI->getNumOperands();
     if (!(MO >= MO0 && MO < MO0+NumOps)) {
-      errs() << PrintReg(Reg, TRI) << " use list MachineOperand " << MO
+      errs() << PrintReg(Reg, getTargetRegisterInfo())
+             << " use list MachineOperand " << MO
              << " doesn't belong to parent MI: " << *MI;
       Valid = false;
     }
     if (!MO->isReg()) {
-      errs() << PrintReg(Reg, TRI) << " MachineOperand " << MO << ": " << *MO
+      errs() << PrintReg(Reg, getTargetRegisterInfo())
+             << " MachineOperand " << MO << ": " << *MO
              << " is not a register\n";
       Valid = false;
     }
     if (MO->getReg() != Reg) {
-      errs() << PrintReg(Reg, TRI) << " use-list MachineOperand " << MO << ": "
+      errs() << PrintReg(Reg, getTargetRegisterInfo())
+             << " use-list MachineOperand " << MO << ": "
              << *MO << " is the wrong register\n";
       Valid = false;
     }
@@ -156,7 +171,7 @@ void MachineRegisterInfo::verifyUseLists() const {
 #ifndef NDEBUG
   for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i)
     verifyUseList(TargetRegisterInfo::index2VirtReg(i));
-  for (unsigned i = 1, e = TRI->getNumRegs(); i != e; ++i)
+  for (unsigned i = 1, e = getTargetRegisterInfo()->getNumRegs(); i != e; ++i)
     verifyUseList(i);
 #endif
 }
@@ -390,8 +405,8 @@ void MachineRegisterInfo::dumpUses(unsigned Reg) const {
 #endif
 
 void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
-  ReservedRegs = TRI->getReservedRegs(MF);
-  assert(ReservedRegs.size() == TRI->getNumRegs() &&
+  ReservedRegs = getTargetRegisterInfo()->getReservedRegs(MF);
+  assert(ReservedRegs.size() == getTargetRegisterInfo()->getNumRegs() &&
          "Invalid ReservedRegs vector from target");
 }
 
@@ -401,7 +416,8 @@ bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg,
 
   // Check if any overlapping register is modified, or allocatable so it may be
   // used later.
-  for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI)
+  for (MCRegAliasIterator AI(PhysReg, getTargetRegisterInfo(), true);
+       AI.isValid(); ++AI)
     if (!def_empty(*AI) || isAllocatable(*AI))
       return false;
   return true;
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index bb6aad7..17f0af8 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -77,7 +77,7 @@ unsigned MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
 
 static
 unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
-          SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> &PredValues) {
+        SmallVectorImpl<std::pair<MachineBasicBlock*, unsigned> > &PredValues) {
   if (BB->empty())
     return 0;
 
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index fff6b2b..e71c4df 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/ScheduleDFS.h"
@@ -30,6 +31,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include <queue>
 
 using namespace llvm;
@@ -51,10 +53,11 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG
 
-// FIXME: remove this flag after initial testing. It should always be a good
-// thing.
-static cl::opt<bool> EnableCopyConstrain("misched-vcopy", cl::Hidden,
-    cl::desc("Constrain vreg copies."), cl::init(true));
+static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
+  cl::desc("Enable register pressure scheduling."), cl::init(true));
+
+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+  cl::desc("Enable cyclic critical path analysis."), cl::init(true));
 
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."), cl::init(true));
@@ -69,6 +72,10 @@ static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
 // DAG subtrees must have at least this many nodes.
 static const unsigned MinSubtreeSize = 8;
 
+// Pin the vtables to this file.
+void MachineSchedStrategy::anchor() {}
+void ScheduleDAGMutation::anchor() {}
+
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
 //===----------------------------------------------------------------------===//
@@ -98,6 +105,9 @@ public:
   virtual void print(raw_ostream &O, const Module* = 0) const;
 
   static char ID; // Class identification, replacement for typeinfo
+
+protected:
+  ScheduleDAGInstrs *createMachineScheduler();
 };
 } // namespace
 
@@ -152,12 +162,13 @@ DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
 
 /// Forward declare the standard machine scheduler. This will be used as the
 /// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C);
+static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C);
 
 
 /// Decrement this iterator until reaching the top or a non-debug instr.
-static MachineBasicBlock::iterator
-priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+static MachineBasicBlock::const_iterator
+priorNonDebug(MachineBasicBlock::const_iterator I,
+              MachineBasicBlock::const_iterator Beg) {
   assert(I != Beg && "reached the top of the region, cannot decrement");
   while (--I != Beg) {
     if (!I->isDebugValue())
@@ -166,10 +177,19 @@ priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
   return I;
 }
 
+/// Non-const version.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I,
+              MachineBasicBlock::const_iterator Beg) {
+  return const_cast<MachineInstr*>(
+    &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
+}
+
 /// If this iterator is a debug value, increment until reaching the End or a
 /// non-debug instruction.
-static MachineBasicBlock::iterator
-nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+            MachineBasicBlock::const_iterator End) {
   for(; I != End; ++I) {
     if (!I->isDebugValue())
       break;
@@ -177,6 +197,34 @@ nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
   return I;
 }
 
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+            MachineBasicBlock::const_iterator End) {
+  // Cast the return value to nonconst MachineInstr, then cast to an
+  // instr_iterator, which does not check for null, finally return a
+  // bundle_iterator.
+  return MachineBasicBlock::instr_iterator(
+    const_cast<MachineInstr*>(
+      &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
+ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
+  // Select the scheduler, or set the default.
+  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
+  if (Ctor != useDefaultMachineSched)
+    return Ctor(this);
+
+  // Get the default scheduler set by the target for this function.
+  ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
+  if (Scheduler)
+    return Scheduler;
+
+  // Default to GenericScheduler.
+  return createGenericSched(this);
+}
+
 /// Top-level MachineScheduler pass driver.
 ///
 /// Visit blocks in function order. Divide each block into scheduling regions
@@ -207,23 +255,14 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
 
   if (VerifyScheduling) {
-    DEBUG(LIS->print(dbgs()));
+    DEBUG(LIS->dump());
     MF->verify(this, "Before machine scheduling.");
   }
   RegClassInfo->runOnMachineFunction(*MF);
 
-  // Select the scheduler, or set the default.
-  MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
-  if (Ctor == useDefaultMachineSched) {
-    // Get the default scheduler set by the target.
-    Ctor = MachineSchedRegistry::getDefault();
-    if (!Ctor) {
-      Ctor = createConvergingSched;
-      MachineSchedRegistry::setDefault(Ctor);
-    }
-  }
-  // Instantiate the selected scheduler.
-  OwningPtr<ScheduleDAGInstrs> Scheduler(Ctor(this));
+  // Instantiate the selected scheduler for this target, function, and
+  // optimization level.
+  OwningPtr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
 
   // Visit all machine basic blocks.
   //
@@ -258,14 +297,15 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
 
       // The next region starts above the previous region. Look backward in the
       // instruction stream until we find the nearest boundary.
+      unsigned NumRegionInstrs = 0;
       MachineBasicBlock::iterator I = RegionEnd;
-      for(;I != MBB->begin(); --I, --RemainingInstrs) {
+      for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
         if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
           break;
       }
       // Notify the scheduler of the region, even if we may skip scheduling
       // it. Perhaps it still needs to be bundled.
-      Scheduler->enterRegion(MBB, I, RegionEnd, RemainingInstrs);
+      Scheduler->enterRegion(MBB, I, RegionEnd, NumRegionInstrs);
 
       // Skip empty scheduling regions (0 or 1 schedulable instructions).
       if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
@@ -280,7 +320,8 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
             << "\n  From: " << *I << "    To: ";
             if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
             else dbgs() << "End";
-            dbgs() << " Remaining: " << RemainingInstrs << "\n");
+            dbgs() << " RegionInstrs: " << NumRegionInstrs
+            << " Remaining: " << RemainingInstrs << "\n");
 
       // Schedule a region: possibly reorder instructions.
       // This invalidates 'RegionEnd' and 'I'.
@@ -297,7 +338,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
     Scheduler->finishBlock();
   }
   Scheduler->finalizeSchedule();
-  DEBUG(LIS->print(dbgs()));
+  DEBUG(LIS->dump());
   if (VerifyScheduling)
     MF->verify(this, "After machine scheduling.");
   return true;
@@ -309,7 +350,7 @@ void MachineScheduler::print(raw_ostream &O, const Module* m) const {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void ReadyQueue::dump() {
-  dbgs() << "  " << Name << ": ";
+  dbgs() << Name << ": ";
   for (unsigned i = 0, e = Queue.size(); i < e; ++i)
     dbgs() << Queue[i]->NodeNum << " ";
   dbgs() << "\n";
@@ -449,13 +490,19 @@ bool ScheduleDAGMI::checkSchedLimit() {
 void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
                                 MachineBasicBlock::iterator begin,
                                 MachineBasicBlock::iterator end,
-                                unsigned endcount)
+                                unsigned regioninstrs)
 {
-  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
 
   // For convenience remember the end of the liveness region.
   LiveRegionEnd =
     (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+
+  SUPressureDiffs.clear();
+
+  SchedImpl->initPolicy(begin, end, regioninstrs);
+
+  ShouldTrackPressure = SchedImpl->shouldTrackPressure();
 }
 
 // Setup the register pressure trackers for the top scheduled top and bottom
@@ -467,7 +514,7 @@ void ScheduleDAGMI::initRegPressure() {
   // Close the RPTracker to finalize live ins.
   RPTracker.closeRegion();
 
-  DEBUG(RPTracker.getPressure().dump(TRI));
+  DEBUG(RPTracker.dump());
 
   // Initialize the live ins and live outs.
   TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
@@ -479,9 +526,23 @@ void ScheduleDAGMI::initRegPressure() {
   TopRPTracker.closeTop();
   BotRPTracker.closeBottom();
 
+  BotRPTracker.initLiveThru(RPTracker);
+  if (!BotRPTracker.getLiveThru().empty()) {
+    TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
+    DEBUG(dbgs() << "Live Thru: ";
+          dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
+  };
+
+  // For each live out vreg reduce the pressure change associated with other
+  // uses of the same vreg below the live-out reaching def.
+  updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
+
   // Account for liveness generated by the region boundary.
-  if (LiveRegionEnd != RegionEnd)
-    BotRPTracker.recede();
+  if (LiveRegionEnd != RegionEnd) {
+    SmallVector<unsigned, 8> LiveUses;
+    BotRPTracker.recede(&LiveUses);
+    updatePressureDiffs(LiveUses);
+  }
 
   assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
 
@@ -491,38 +552,88 @@ void ScheduleDAGMI::initRegPressure() {
   const std::vector<unsigned> &RegionPressure =
     RPTracker.getPressure().MaxSetPressure;
   for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
-    unsigned Limit = TRI->getRegPressureSetLimit(i);
-    DEBUG(dbgs() << TRI->getRegPressureSetName(i)
-          << "Limit " << Limit
-          << " Actual " << RegionPressure[i] << "\n");
-    if (RegionPressure[i] > Limit)
-      RegionCriticalPSets.push_back(PressureElement(i, 0));
+    unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
+    if (RegionPressure[i] > Limit) {
+      DEBUG(dbgs() << TRI->getRegPressureSetName(i)
+            << " Limit " << Limit
+            << " Actual " << RegionPressure[i] << "\n");
+      RegionCriticalPSets.push_back(PressureChange(i));
+    }
   }
   DEBUG(dbgs() << "Excess PSets: ";
         for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
           dbgs() << TRI->getRegPressureSetName(
-            RegionCriticalPSets[i].PSetID) << " ";
+            RegionCriticalPSets[i].getPSet()) << " ";
         dbgs() << "\n");
 }
 
-// FIXME: When the pressure tracker deals in pressure differences then we won't
-// iterate over all RegionCriticalPSets[i].
 void ScheduleDAGMI::
-updateScheduledPressure(const std::vector<unsigned> &NewMaxPressure) {
-  for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
-    unsigned ID = RegionCriticalPSets[i].PSetID;
-    int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
-    if ((int)NewMaxPressure[ID] > MaxUnits)
-      MaxUnits = NewMaxPressure[ID];
+updateScheduledPressure(const SUnit *SU,
+                        const std::vector<unsigned> &NewMaxPressure) {
+  const PressureDiff &PDiff = getPressureDiff(SU);
+  unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
+  for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
+       I != E; ++I) {
+    if (!I->isValid())
+      break;
+    unsigned ID = I->getPSet();
+    while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
+      ++CritIdx;
+    if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
+      if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
+          && NewMaxPressure[ID] <= INT16_MAX)
+        RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
+    }
+    unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
+    if (NewMaxPressure[ID] >= Limit - 2) {
+      DEBUG(dbgs() << "  " << TRI->getRegPressureSetName(ID) << ": "
+            << NewMaxPressure[ID] << " > " << Limit << "(+ "
+            << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
+    }
   }
-  DEBUG(
-    for (unsigned i = 0, e = NewMaxPressure.size(); i < e; ++i) {
-      unsigned Limit = TRI->getRegPressureSetLimit(i);
-      if (NewMaxPressure[i] > Limit ) {
-        dbgs() << "  " << TRI->getRegPressureSetName(i) << ": "
-               << NewMaxPressure[i] << " > " << Limit << "\n";
+}
+
+/// Update the PressureDiff array for liveness after scheduling this
+/// instruction.
+void ScheduleDAGMI::updatePressureDiffs(ArrayRef<unsigned> LiveUses) {
+  for (unsigned LUIdx = 0, LUEnd = LiveUses.size(); LUIdx != LUEnd; ++LUIdx) {
+    /// FIXME: Currently assuming single-use physregs.
+    unsigned Reg = LiveUses[LUIdx];
+    DEBUG(dbgs() << "  LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
+    if (!TRI->isVirtualRegister(Reg))
+      continue;
+
+    // This may be called before CurrentBottom has been initialized. However,
+    // BotRPTracker must have a valid position. We want the value live into the
+    // instruction or live out of the block, so ask for the previous
+    // instruction's live-out.
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    VNInfo *VNI;
+    MachineBasicBlock::const_iterator I =
+      nextIfDebug(BotRPTracker.getPos(), BB->end());
+    if (I == BB->end())
+      VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+    else {
+      LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(I));
+      VNI = LRQ.valueIn();
+    }
+    // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
+    assert(VNI && "No live value at use.");
+    for (VReg2UseMap::iterator
+           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+      SUnit *SU = UI->SU;
+      DEBUG(dbgs() << "  UpdateRegP: SU(" << SU->NodeNum << ") "
+            << *SU->getInstr());
+      // If this use comes before the reaching def, it cannot be a last use, so
+      // descrease its pressure change.
+      if (!SU->isScheduled && SU != &ExitSU) {
+        LiveQueryResult LRQ
+          = LI.Query(LIS->getInstructionIndex(SU->getInstr()));
+        if (LRQ.valueIn() == VNI)
+          getPressureDiff(SU).addPressureChange(Reg, true, &MRI);
       }
-    });
+    }
+  }
 }
 
 /// schedule - Called back from MachineScheduler::runOnMachineFunction
@@ -580,15 +691,23 @@ void ScheduleDAGMI::schedule() {
 
 /// Build the DAG and setup three register pressure trackers.
 void ScheduleDAGMI::buildDAGWithRegPressure() {
+  if (!ShouldTrackPressure) {
+    RPTracker.reset();
+    RegionCriticalPSets.clear();
+    buildSchedGraph(AA);
+    return;
+  }
+
   // Initialize the register pressure tracker used by buildSchedGraph.
-  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+  RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
+                 /*TrackUntiedDefs=*/true);
 
   // Account for liveness generate by the region boundary.
   if (LiveRegionEnd != RegionEnd)
     RPTracker.recede();
 
   // Build the DAG, and compute current register pressure.
-  buildSchedGraph(AA, &RPTracker);
+  buildSchedGraph(AA, &RPTracker, &SUPressureDiffs);
 
   // Initialize top/bottom trackers after computing region pressure.
   initRegPressure();
@@ -631,6 +750,91 @@ void ScheduleDAGMI::findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
   ExitSU.biasCriticalPath();
 }
 
+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single block loops. To handle loops that
+/// span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an epomymous virtual register:
+///
+/// a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is a two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+unsigned ScheduleDAGMI::computeCyclicCriticalPath() {
+  // This only applies to single block loop.
+  if (!BB->isSuccessor(BB))
+    return 0;
+
+  unsigned MaxCyclicLatency = 0;
+  // Visit each live out vreg def to find def/use pairs that cross iterations.
+  ArrayRef<unsigned> LiveOuts = RPTracker.getPressure().LiveOutRegs;
+  for (ArrayRef<unsigned>::iterator RI = LiveOuts.begin(), RE = LiveOuts.end();
+       RI != RE; ++RI) {
+    unsigned Reg = *RI;
+    if (!TRI->isVirtualRegister(Reg))
+        continue;
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+    if (!DefVNI)
+      continue;
+
+    MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+    const SUnit *DefSU = getSUnit(DefMI);
+    if (!DefSU)
+      continue;
+
+    unsigned LiveOutHeight = DefSU->getHeight();
+    unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+    // Visit all local users of the vreg def.
+    for (VReg2UseMap::iterator
+           UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+      if (UI->SU == &ExitSU)
+        continue;
+
+      // Only consider uses of the phi.
+      LiveQueryResult LRQ =
+        LI.Query(LIS->getInstructionIndex(UI->SU->getInstr()));
+      if (!LRQ.valueIn()->isPHIDef())
+        continue;
+
+      // Assume that a path spanning two iterations is a cycle, which could
+      // overestimate in strange cases. This allows cyclic latency to be
+      // estimated as the minimum slack of the vreg's depth or height.
+      unsigned CyclicLatency = 0;
+      if (LiveOutDepth > UI->SU->getDepth())
+        CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+
+      unsigned LiveInHeight = UI->SU->getHeight() + DefSU->Latency;
+      if (LiveInHeight > LiveOutHeight) {
+        if (LiveInHeight - LiveOutHeight < CyclicLatency)
+          CyclicLatency = LiveInHeight - LiveOutHeight;
+      }
+      else
+        CyclicLatency = 0;
+
+      DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+            << UI->SU->NodeNum << ") = " << CyclicLatency << "c\n");
+      if (CyclicLatency > MaxCyclicLatency)
+        MaxCyclicLatency = CyclicLatency;
+    }
+  }
+  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+  return MaxCyclicLatency;
+}
+
 /// Identify DAG roots and setup scheduler queues.
 void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
                                ArrayRef<SUnit*> BotRoots) {
@@ -658,11 +862,13 @@ void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
   SchedImpl->registerRoots();
 
   // Advance past initial DebugValues.
-  assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
   CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
-  TopRPTracker.setPos(CurrentTop);
-
   CurrentBottom = RegionEnd;
+
+  if (ShouldTrackPressure) {
+    assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+    TopRPTracker.setPos(CurrentTop);
+  }
 }
 
 /// Move an instruction and update register pressure.
@@ -679,10 +885,12 @@ void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
       TopRPTracker.setPos(MI);
     }
 
-    // Update top scheduled pressure.
-    TopRPTracker.advance();
-    assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
-    updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+    if (ShouldTrackPressure) {
+      // Update top scheduled pressure.
+      TopRPTracker.advance();
+      assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+      updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
+    }
   }
   else {
     assert(SU->isBottomReady() && "node still has unscheduled dependencies");
@@ -698,10 +906,14 @@ void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
       moveInstruction(MI, CurrentBottom);
       CurrentBottom = MI;
     }
-    // Update bottom scheduled pressure.
-    BotRPTracker.recede();
-    assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
-    updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+    if (ShouldTrackPressure) {
+      // Update bottom scheduled pressure.
+      SmallVector<unsigned, 8> LiveUses;
+      BotRPTracker.recede(&LiveUses);
+      assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+      updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
+      updatePressureDiffs(LiveUses);
+    }
   }
 }
 
@@ -1019,6 +1231,12 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMI *DAG) {
                                GlobalSegment->start)) {
       return;
     }
+    // If the prior global segment may be defined by the same two-address
+    // instruction that also defines LocalLI, then can't make a hole here.
+    if (SlotIndex::isSameInstr(llvm::prior(GlobalSegment)->start,
+                               LocalLI->beginIndex())) {
+      return;
+    }
     // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
     // it would be a disconnected component in the live range.
     assert(llvm::prior(GlobalSegment)->start < LocalLI->beginIndex() &&
@@ -1101,24 +1319,23 @@ void CopyConstrain::apply(ScheduleDAGMI *DAG) {
 }
 
 //===----------------------------------------------------------------------===//
-// ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
+// GenericScheduler - Implementation of the generic MachineSchedStrategy.
 //===----------------------------------------------------------------------===//
 
 namespace {
-/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
+/// GenericScheduler shrinks the unscheduled zone using heuristics to balance
 /// the schedule.
-class ConvergingScheduler : public MachineSchedStrategy {
+class GenericScheduler : public MachineSchedStrategy {
 public:
   /// Represent the type of SchedCandidate found within a single queue.
   /// pickNodeBidirectional depends on these listed by decreasing priority.
   enum CandReason {
-    NoCand, PhysRegCopy, SingleExcess, SingleCritical, Cluster, Weak,
+    NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
     ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
-    TopDepthReduce, TopPathReduce, SingleMax, MultiPressure, NextDefUse,
-    NodeOrder};
+    TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
 
 #ifndef NDEBUG
-  static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
+  static const char *getReasonStr(GenericScheduler::CandReason Reason);
 #endif
 
   /// Policy for scheduling the next instruction in the candidate's zone.
@@ -1149,7 +1366,7 @@ public:
     }
   };
 
-  /// Store the state used by ConvergingScheduler heuristics, required for the
+  /// Store the state used by GenericScheduler heuristics, required for the
   /// lifetime of one invocation of pickNode().
   struct SchedCandidate {
     CandPolicy Policy;
@@ -1160,6 +1377,9 @@ public:
     // The reason for this candidate.
     CandReason Reason;
 
+    // Set of reasons that apply to multiple candidates.
+    uint32_t RepeatReasonSet;
+
     // Register pressure values for the best candidate.
     RegPressureDelta RPDelta;
 
@@ -1167,7 +1387,7 @@ public:
     SchedResourceDelta ResDelta;
 
     SchedCandidate(const CandPolicy &policy)
-    : Policy(policy), SU(NULL), Reason(NoCand) {}
+      : Policy(policy), SU(NULL), Reason(NoCand), RepeatReasonSet(0) {}
 
     bool isValid() const { return SU; }
 
@@ -1180,6 +1400,9 @@ public:
       ResDelta = Best.ResDelta;
     }
 
+    bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
+    void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }
+
     void initResourceDelta(const ScheduleDAGMI *DAG,
                            const TargetSchedModel *SchedModel);
   };
@@ -1188,33 +1411,27 @@ public:
   struct SchedRemainder {
     // Critical path through the DAG in expected latency.
     unsigned CriticalPath;
+    unsigned CyclicCritPath;
+
+    // Scaled count of micro-ops left to schedule.
+    unsigned RemIssueCount;
+
+    bool IsAcyclicLatencyLimited;
 
     // Unscheduled resources
     SmallVector<unsigned, 16> RemainingCounts;
-    // Critical resource for the unscheduled zone.
-    unsigned CritResIdx;
-    // Number of micro-ops left to schedule.
-    unsigned RemainingMicroOps;
 
     void reset() {
       CriticalPath = 0;
+      CyclicCritPath = 0;
+      RemIssueCount = 0;
+      IsAcyclicLatencyLimited = false;
       RemainingCounts.clear();
-      CritResIdx = 0;
-      RemainingMicroOps = 0;
     }
 
     SchedRemainder() { reset(); }
 
     void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
-
-    unsigned getMaxRemainingCount(const TargetSchedModel *SchedModel) const {
-      if (!SchedModel->hasInstrSchedModel())
-        return 0;
-
-      return std::max(
-        RemainingMicroOps * SchedModel->getMicroOpFactor(),
-        RemainingCounts[CritResIdx]);
-    }
   };
 
   /// Each Scheduling boundary is associated with ready queues. It tracks the
@@ -1235,8 +1452,13 @@ public:
 
     ScheduleHazardRecognizer *HazardRec;
 
+    /// Number of cycles it takes to issue the instructions scheduled in this
+    /// zone. It is defined as: scheduled-micro-ops / issue-width + stalls.
+    /// See getStalls().
     unsigned CurrCycle;
-    unsigned IssueCount;
+
+    /// Micro-ops issued in the current cycle
+    unsigned CurrMOps;
 
     /// MinReadyCycle - Cycle of the soonest available instruction.
     unsigned MinReadyCycle;
@@ -1244,52 +1466,71 @@ public:
     // The expected latency of the critical path in this scheduled zone.
     unsigned ExpectedLatency;
 
-    // Resources used in the scheduled zone beyond this boundary.
-    SmallVector<unsigned, 16> ResourceCounts;
+    // The latency of dependence chains leading into this zone.
+    // For each node scheduled bottom-up: DLat = max DLat, N.Depth.
+    // For each cycle scheduled: DLat -= 1.
+    unsigned DependentLatency;
+
+    /// Count the scheduled (issued) micro-ops that can be retired by
+    /// time=CurrCycle assuming the first scheduled instr is retired at time=0.
+    unsigned RetiredMOps;
+
+    // Count scheduled resources that have been executed. Resources are
+    // considered executed if they become ready in the time that it takes to
+    // saturate any resource including the one in question. Counts are scaled
+    // for direct comparison with other resources. Counts can be compared with
+    // MOps * getMicroOpFactor and Latency * getLatencyFactor.
+    SmallVector<unsigned, 16> ExecutedResCounts;
+
+    /// Cache the max count for a single resource.
+    unsigned MaxExecutedResCount;
 
     // Cache the critical resources ID in this scheduled zone.
-    unsigned CritResIdx;
+    unsigned ZoneCritResIdx;
 
     // Is the scheduled region resource limited vs. latency limited.
     bool IsResourceLimited;
 
-    unsigned ExpectedCount;
-
 #ifndef NDEBUG
-    // Remember the greatest min operand latency.
-    unsigned MaxMinLatency;
+    // Remember the greatest operand latency as an upper bound on the number of
+    // times we should retry the pending queue because of a hazard.
+    unsigned MaxObservedLatency;
 #endif
 
     void reset() {
       // A new HazardRec is created for each DAG and owned by SchedBoundary.
-      delete HazardRec;
-
+      // Destroying and reconstructing it is very expensive though. So keep
+      // invalid, placeholder HazardRecs.
+      if (HazardRec && HazardRec->isEnabled()) {
+        delete HazardRec;
+        HazardRec = 0;
+      }
       Available.clear();
       Pending.clear();
       CheckPending = false;
       NextSUs.clear();
-      HazardRec = 0;
       CurrCycle = 0;
-      IssueCount = 0;
+      CurrMOps = 0;
       MinReadyCycle = UINT_MAX;
       ExpectedLatency = 0;
-      ResourceCounts.resize(1);
-      assert(!ResourceCounts[0] && "nonzero count for bad resource");
-      CritResIdx = 0;
+      DependentLatency = 0;
+      RetiredMOps = 0;
+      MaxExecutedResCount = 0;
+      ZoneCritResIdx = 0;
       IsResourceLimited = false;
-      ExpectedCount = 0;
 #ifndef NDEBUG
-      MaxMinLatency = 0;
+      MaxObservedLatency = 0;
 #endif
       // Reserve a zero-count for invalid CritResIdx.
-      ResourceCounts.resize(1);
+      ExecutedResCounts.resize(1);
+      assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
     }
 
     /// Pending queues extend the ready queues with the same ID and the
     /// PendingFlag set.
     SchedBoundary(unsigned ID, const Twine &Name):
       DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
-      Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
+      Pending(ID << GenericScheduler::LogMaxQID, Name+".P"),
       HazardRec(0) {
       reset();
     }
@@ -1300,28 +1541,63 @@ public:
               SchedRemainder *rem);
 
     bool isTop() const {
-      return Available.getID() == ConvergingScheduler::TopQID;
+      return Available.getID() == GenericScheduler::TopQID;
+    }
+
+#ifndef NDEBUG
+    const char *getResourceName(unsigned PIdx) {
+      if (!PIdx)
+        return "MOps";
+      return SchedModel->getProcResource(PIdx)->Name;
+    }
+#endif
+
+    /// Get the number of latency cycles "covered" by the scheduled
+    /// instructions. This is the larger of the critical path within the zone
+    /// and the number of cycles required to issue the instructions.
+    unsigned getScheduledLatency() const {
+      return std::max(ExpectedLatency, CurrCycle);
     }
 
     unsigned getUnscheduledLatency(SUnit *SU) const {
-      if (isTop())
-        return SU->getHeight();
-      return SU->getDepth() + SU->Latency;
+      return isTop() ? SU->getHeight() : SU->getDepth();
+    }
+
+    unsigned getResourceCount(unsigned ResIdx) const {
+      return ExecutedResCounts[ResIdx];
     }
 
+    /// Get the scaled count of scheduled micro-ops and resources, including
+    /// executed resources.
     unsigned getCriticalCount() const {
-      return ResourceCounts[CritResIdx];
+      if (!ZoneCritResIdx)
+        return RetiredMOps * SchedModel->getMicroOpFactor();
+      return getResourceCount(ZoneCritResIdx);
+    }
+
+    /// Get a scaled count for the minimum execution time of the scheduled
+    /// micro-ops that are ready to execute by getExecutedCount. Notice the
+    /// feedback loop.
+    unsigned getExecutedCount() const {
+      return std::max(CurrCycle * SchedModel->getLatencyFactor(),
+                      MaxExecutedResCount);
     }
 
     bool checkHazard(SUnit *SU);
 
-    void setLatencyPolicy(CandPolicy &Policy);
+    unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
+
+    unsigned getOtherResourceCount(unsigned &OtherCritIdx);
+
+    void setPolicy(CandPolicy &Policy, SchedBoundary &OtherZone);
 
     void releaseNode(SUnit *SU, unsigned ReadyCycle);
 
-    void bumpCycle();
+    void bumpCycle(unsigned NextCycle);
 
-    void countResource(unsigned PIdx, unsigned Cycles);
+    void incExecutedResources(unsigned PIdx, unsigned Count);
+
+    unsigned countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle);
 
     void bumpNode(SUnit *SU);
 
@@ -1330,9 +1606,14 @@ public:
     void removeReady(SUnit *SU);
 
     SUnit *pickOnlyChoice();
+
+#ifndef NDEBUG
+    void dumpScheduledState();
+#endif
   };
 
 private:
+  const MachineSchedContext *Context;
   ScheduleDAGMI *DAG;
   const TargetSchedModel *SchedModel;
   const TargetRegisterInfo *TRI;
@@ -1342,6 +1623,7 @@ private:
   SchedBoundary Top;
   SchedBoundary Bot;
 
+  MachineSchedPolicy RegionPolicy;
 public:
   /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
   enum {
@@ -1350,8 +1632,15 @@ public:
     LogMaxQID = 2
   };
 
-  ConvergingScheduler():
-    DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+  GenericScheduler(const MachineSchedContext *C):
+    Context(C), DAG(0), SchedModel(0), TRI(0),
+    Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+  virtual void initPolicy(MachineBasicBlock::iterator Begin,
+                          MachineBasicBlock::iterator End,
+                          unsigned NumRegionInstrs);
+
+  bool shouldTrackPressure() const { return RegionPolicy.ShouldTrackPressure; }
 
   virtual void initialize(ScheduleDAGMI *dag);
 
@@ -1366,14 +1655,7 @@ public:
   virtual void registerRoots();
 
 protected:
-  void balanceZones(
-    ConvergingScheduler::SchedBoundary &CriticalZone,
-    ConvergingScheduler::SchedCandidate &CriticalCand,
-    ConvergingScheduler::SchedBoundary &OppositeZone,
-    ConvergingScheduler::SchedCandidate &OppositeCand);
-
-  void checkResourceLimits(ConvergingScheduler::SchedCandidate &TopCand,
-                           ConvergingScheduler::SchedCandidate &BotCand);
+  void checkAcyclicLatency();
 
   void tryCandidate(SchedCandidate &Cand,
                     SchedCandidate &TryCand,
@@ -1395,7 +1677,7 @@ protected:
 };
 } // namespace
 
-void ConvergingScheduler::SchedRemainder::
+void GenericScheduler::SchedRemainder::
 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
   reset();
   if (!SchedModel->hasInstrSchedModel())
@@ -1404,7 +1686,8 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
   for (std::vector<SUnit>::iterator
          I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
     const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
-    RemainingMicroOps += SchedModel->getNumMicroOps(I->getInstr(), SC);
+    RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC)
+      * SchedModel->getMicroOpFactor();
     for (TargetSchedModel::ProcResIter
            PI = SchedModel->getWriteProcResBegin(SC),
            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
@@ -1413,26 +1696,61 @@ init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
       RemainingCounts[PIdx] += (Factor * PI->Cycles);
     }
   }
-  for (unsigned PIdx = 0, PEnd = SchedModel->getNumProcResourceKinds();
-       PIdx != PEnd; ++PIdx) {
-    if ((int)(RemainingCounts[PIdx] - RemainingCounts[CritResIdx])
-        >= (int)SchedModel->getLatencyFactor()) {
-      CritResIdx = PIdx;
-    }
-  }
 }
 
-void ConvergingScheduler::SchedBoundary::
+void GenericScheduler::SchedBoundary::
 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
   reset();
   DAG = dag;
   SchedModel = smodel;
   Rem = rem;
   if (SchedModel->hasInstrSchedModel())
-    ResourceCounts.resize(SchedModel->getNumProcResourceKinds());
+    ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds());
+}
+
+/// Initialize the per-region scheduling policy.
+void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+                                     MachineBasicBlock::iterator End,
+                                     unsigned NumRegionInstrs) {
+  const TargetMachine &TM = Context->MF->getTarget();
+
+  // Avoid setting up the register pressure tracker for small regions to save
+  // compile time. As a rough heuristic, only track pressure when the number of
+  // schedulable instructions exceeds half the integer register file.
+  unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
+    TM.getTargetLowering()->getRegClassFor(MVT::i32));
+
+  RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
+
+  // For generic targets, we default to bottom-up, because it's simpler and more
+  // compile-time optimizations have been implemented in that direction.
+  RegionPolicy.OnlyBottomUp = true;
+
+  // Allow the subtarget to override default policy.
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
+
+  // After subtarget overrides, apply command line options.
+  if (!EnableRegPressure)
+    RegionPolicy.ShouldTrackPressure = false;
+
+  // Check -misched-topdown/bottomup can force or unforce scheduling direction.
+  // e.g. -misched-bottomup=false allows scheduling in both directions.
+  assert((!ForceTopDown || !ForceBottomUp) &&
+         "-misched-topdown incompatible with -misched-bottomup");
+  if (ForceBottomUp.getNumOccurrences() > 0) {
+    RegionPolicy.OnlyBottomUp = ForceBottomUp;
+    if (RegionPolicy.OnlyBottomUp)
+      RegionPolicy.OnlyTopDown = false;
+  }
+  if (ForceTopDown.getNumOccurrences() > 0) {
+    RegionPolicy.OnlyTopDown = ForceTopDown;
+    if (RegionPolicy.OnlyTopDown)
+      RegionPolicy.OnlyBottomUp = false;
+  }
 }
 
-void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
+void GenericScheduler::initialize(ScheduleDAGMI *dag) {
   DAG = dag;
   SchedModel = DAG->getSchedModel();
   TRI = DAG->TRI;
@@ -1447,31 +1765,36 @@ void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
   // are disabled, then these HazardRecs will be disabled.
   const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
   const TargetMachine &TM = DAG->MF.getTarget();
-  Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
-  Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
-
-  assert((!ForceTopDown || !ForceBottomUp) &&
-         "-misched-topdown incompatible with -misched-bottomup");
+  if (!Top.HazardRec) {
+    Top.HazardRec =
+      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  }
+  if (!Bot.HazardRec) {
+    Bot.HazardRec =
+      TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+  }
 }
 
-void ConvergingScheduler::releaseTopNode(SUnit *SU) {
+void GenericScheduler::releaseTopNode(SUnit *SU) {
   if (SU->isScheduled)
     return;
 
   for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
+    if (I->isWeak())
+      continue;
     unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
-    unsigned MinLatency = I->getMinLatency();
+    unsigned Latency = I->getLatency();
 #ifndef NDEBUG
-    Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+    Top.MaxObservedLatency = std::max(Latency, Top.MaxObservedLatency);
 #endif
-    if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
-      SU->TopReadyCycle = PredReadyCycle + MinLatency;
+    if (SU->TopReadyCycle < PredReadyCycle + Latency)
+      SU->TopReadyCycle = PredReadyCycle + Latency;
   }
   Top.releaseNode(SU, SU->TopReadyCycle);
 }
 
-void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
+void GenericScheduler::releaseBottomNode(SUnit *SU) {
   if (SU->isScheduled)
     return;
 
@@ -1482,18 +1805,56 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
     if (I->isWeak())
       continue;
     unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
-    unsigned MinLatency = I->getMinLatency();
+    unsigned Latency = I->getLatency();
 #ifndef NDEBUG
-    Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+    Bot.MaxObservedLatency = std::max(Latency, Bot.MaxObservedLatency);
 #endif
-    if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
-      SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+    if (SU->BotReadyCycle < SuccReadyCycle + Latency)
+      SU->BotReadyCycle = SuccReadyCycle + Latency;
   }
   Bot.releaseNode(SU, SU->BotReadyCycle);
 }
 
-void ConvergingScheduler::registerRoots() {
+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bounds on in-flight instructions as:
+///
+/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+/// InFlightIterations = AcyclicPath / CyclesPerIteration
+/// InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
+void GenericScheduler::checkAcyclicLatency() {
+  if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+    return;
+
+  // Scaled number of cycles per loop iteration.
+  unsigned IterCount =
+    std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+             Rem.RemIssueCount);
+  // Scaled acyclic critical path.
+  unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+  // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+  unsigned InFlightCount =
+    (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
+  unsigned BufferLimit =
+    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+
+  Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
+
+  DEBUG(dbgs() << "IssueCycles="
+        << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+        << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+        << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
+        << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+        << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
+        if (Rem.IsAcyclicLatencyLimited)
+          dbgs() << "  ACYCLIC LATENCY LIMIT\n");
+}
+
+void GenericScheduler::registerRoots() {
   Rem.CriticalPath = DAG->ExitSU.getDepth();
+
   // Some roots may not feed into ExitSU. Check all of them in case.
   for (std::vector<SUnit*>::const_iterator
          I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
@@ -1501,6 +1862,11 @@ void ConvergingScheduler::registerRoots() {
       Rem.CriticalPath = (*I)->getDepth();
   }
   DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+
+  if (EnableCyclicPath) {
+    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+    checkAcyclicLatency();
+  }
 }
 
 /// Does this SU have a hazard within the current instruction group.
@@ -1516,12 +1882,12 @@ void ConvergingScheduler::registerRoots() {
 /// can dispatch per cycle.
 ///
 /// TODO: Also check whether the SU must start a new group.
-bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+bool GenericScheduler::SchedBoundary::checkHazard(SUnit *SU) {
   if (HazardRec->isEnabled())
     return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
 
   unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
-  if ((IssueCount > 0) && (IssueCount + uops > SchedModel->getIssueWidth())) {
+  if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
     DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
           << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
     return true;
@@ -1529,45 +1895,125 @@ bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
   return false;
 }
 
-/// Compute the remaining latency to determine whether ILP should be increased.
-void ConvergingScheduler::SchedBoundary::setLatencyPolicy(CandPolicy &Policy) {
-  // FIXME: compile time. In all, we visit four queues here one we should only
-  // need to visit the one that was last popped if we cache the result.
+// Find the unscheduled node in ReadySUs with the highest latency.
+unsigned GenericScheduler::SchedBoundary::
+findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
+  SUnit *LateSU = 0;
   unsigned RemLatency = 0;
-  for (ReadyQueue::iterator I = Available.begin(), E = Available.end();
+  for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
        I != E; ++I) {
     unsigned L = getUnscheduledLatency(*I);
-    DEBUG(dbgs() << "  " << Available.getName()
-          << " RemLatency SU(" << (*I)->NodeNum << ") " << L << '\n');
-    if (L > RemLatency)
+    if (L > RemLatency) {
       RemLatency = L;
+      LateSU = *I;
+    }
   }
-  for (ReadyQueue::iterator I = Pending.begin(), E = Pending.end();
-       I != E; ++I) {
-    unsigned L = getUnscheduledLatency(*I);
-    if (L > RemLatency)
-      RemLatency = L;
+  if (LateSU) {
+    DEBUG(dbgs() << Available.getName() << " RemLatency SU("
+          << LateSU->NodeNum << ") " << RemLatency << "c\n");
   }
-  unsigned CriticalPathLimit = Rem->CriticalPath + SchedModel->getILPWindow();
-  DEBUG(dbgs() << "  " << Available.getName()
-        << " ExpectedLatency " << ExpectedLatency
-        << " CP Limit " << CriticalPathLimit << '\n');
-  if (RemLatency + ExpectedLatency >= CriticalPathLimit
-      && RemLatency > Rem->getMaxRemainingCount(SchedModel)) {
-    Policy.ReduceLatency = true;
-    DEBUG(dbgs() << "  Increase ILP: " << Available.getName() << '\n');
+  return RemLatency;
+}
+
+// Count resources in this zone and the remaining unscheduled
+// instruction. Return the max count, scaled. Set OtherCritIdx to the critical
+// resource index, or zero if the zone is issue limited.
+unsigned GenericScheduler::SchedBoundary::
+getOtherResourceCount(unsigned &OtherCritIdx) {
+  OtherCritIdx = 0;
+  if (!SchedModel->hasInstrSchedModel())
+    return 0;
+
+  unsigned OtherCritCount = Rem->RemIssueCount
+    + (RetiredMOps * SchedModel->getMicroOpFactor());
+  DEBUG(dbgs() << "  " << Available.getName() << " + Remain MOps: "
+        << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
+  for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
+       PIdx != PEnd; ++PIdx) {
+    unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
+    if (OtherCount > OtherCritCount) {
+      OtherCritCount = OtherCount;
+      OtherCritIdx = PIdx;
+    }
+  }
+  if (OtherCritIdx) {
+    DEBUG(dbgs() << "  " << Available.getName() << " + Remain CritRes: "
+          << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
+          << " " << getResourceName(OtherCritIdx) << "\n");
   }
+  return OtherCritCount;
 }
 
-void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
-                                                     unsigned ReadyCycle) {
+/// Set the CandPolicy for this zone given the current resources and latencies
+/// inside and outside the zone.
+void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
+                                                   SchedBoundary &OtherZone) {
+  // Now that potential stalls have been considered, apply preemptive heuristics
+  // based on the the total latency and resources inside and outside this
+  // zone.
+
+  // Compute remaining latency. We need this both to determine whether the
+  // overall schedule has become latency-limited and whether the instructions
+  // outside this zone are resource or latency limited.
+  //
+  // The "dependent" latency is updated incrementally during scheduling as the
+  // max height/depth of scheduled nodes minus the cycles since it was
+  // scheduled:
+  //   DLat = max (N.depth - (CurrCycle - N.ReadyCycle) for N in Zone
+  //
+  // The "independent" latency is the max ready queue depth:
+  //   ILat = max N.depth for N in Available|Pending
+  //
+  // RemainingLatency is the greater of independent and dependent latency.
+  unsigned RemLatency = DependentLatency;
+  RemLatency = std::max(RemLatency, findMaxLatency(Available.elements()));
+  RemLatency = std::max(RemLatency, findMaxLatency(Pending.elements()));
+
+  // Compute the critical resource outside the zone.
+  unsigned OtherCritIdx;
+  unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx);
+
+  bool OtherResLimited = false;
+  if (SchedModel->hasInstrSchedModel()) {
+    unsigned LFactor = SchedModel->getLatencyFactor();
+    OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor;
+  }
+  if (!OtherResLimited && (RemLatency + CurrCycle > Rem->CriticalPath)) {
+    Policy.ReduceLatency |= true;
+    DEBUG(dbgs() << "  " << Available.getName() << " RemainingLatency "
+          << RemLatency << " + " << CurrCycle << "c > CritPath "
+          << Rem->CriticalPath << "\n");
+  }
+  // If the same resource is limiting inside and outside the zone, do nothing.
+  if (ZoneCritResIdx == OtherCritIdx)
+    return;
 
+  DEBUG(
+    if (IsResourceLimited) {
+      dbgs() << "  " << Available.getName() << " ResourceLimited: "
+             << getResourceName(ZoneCritResIdx) << "\n";
+    }
+    if (OtherResLimited)
+      dbgs() << "  RemainingLimit: " << getResourceName(OtherCritIdx) << "\n";
+    if (!IsResourceLimited && !OtherResLimited)
+      dbgs() << "  Latency limited both directions.\n");
+
+  if (IsResourceLimited && !Policy.ReduceResIdx)
+    Policy.ReduceResIdx = ZoneCritResIdx;
+
+  if (OtherResLimited)
+    Policy.DemandResIdx = OtherCritIdx;
+}
+
+void GenericScheduler::SchedBoundary::releaseNode(SUnit *SU,
+                                                     unsigned ReadyCycle) {
   if (ReadyCycle < MinReadyCycle)
     MinReadyCycle = ReadyCycle;
 
   // Check for interlocks first. For the purpose of other heuristics, an
   // instruction that cannot issue appears as if it's not in the ReadyQueue.
-  if (ReadyCycle > CurrCycle || checkHazard(SU))
+  bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
+  if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU))
     Pending.push(SU);
   else
     Available.push(SU);
@@ -1577,16 +2023,21 @@ void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
 }
 
 /// Move the boundary of scheduled code by one cycle.
-void ConvergingScheduler::SchedBoundary::bumpCycle() {
-  unsigned Width = SchedModel->getIssueWidth();
-  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
-
-  unsigned NextCycle = CurrCycle + 1;
-  assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
-  if (MinReadyCycle > NextCycle) {
-    IssueCount = 0;
-    NextCycle = MinReadyCycle;
-  }
+void GenericScheduler::SchedBoundary::bumpCycle(unsigned NextCycle) {
+  if (SchedModel->getMicroOpBufferSize() == 0) {
+    assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+    if (MinReadyCycle > NextCycle)
+      NextCycle = MinReadyCycle;
+  }
+  // Update the current micro-ops, which will issue in the next cycle.
+  unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
+  CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
+
+  // Decrement DependentLatency based on the next cycle.
+  if ((NextCycle - CurrCycle) > DependentLatency)
+    DependentLatency = 0;
+  else
+    DependentLatency -= (NextCycle - CurrCycle);
 
   if (!HazardRec->isEnabled()) {
     // Bypass HazardRec virtual calls.
@@ -1602,38 +2053,54 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() {
     }
   }
   CheckPending = true;
-  IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
+  unsigned LFactor = SchedModel->getLatencyFactor();
+  IsResourceLimited =
+    (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
+    > (int)LFactor;
+
+  DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
+}
 
-  DEBUG(dbgs() << "  " << Available.getName()
-        << " Cycle: " << CurrCycle << '\n');
+void GenericScheduler::SchedBoundary::incExecutedResources(unsigned PIdx,
+                                                              unsigned Count) {
+  ExecutedResCounts[PIdx] += Count;
+  if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
+    MaxExecutedResCount = ExecutedResCounts[PIdx];
 }
 
 /// Add the given processor resource to this scheduled zone.
-void ConvergingScheduler::SchedBoundary::countResource(unsigned PIdx,
-                                                       unsigned Cycles) {
+///
+/// \param Cycles indicates the number of consecutive (non-pipelined) cycles
+/// during which this resource is consumed.
+///
+/// \return the next cycle at which the instruction may execute without
+/// oversubscribing resources.
+unsigned GenericScheduler::SchedBoundary::
+countResource(unsigned PIdx, unsigned Cycles, unsigned ReadyCycle) {
   unsigned Factor = SchedModel->getResourceFactor(PIdx);
-  DEBUG(dbgs() << "  " << SchedModel->getProcResource(PIdx)->Name
-        << " +(" << Cycles << "x" << Factor
-        << ") / " << SchedModel->getLatencyFactor() << '\n');
-
   unsigned Count = Factor * Cycles;
-  ResourceCounts[PIdx] += Count;
+  DEBUG(dbgs() << "  " << getResourceName(PIdx)
+        << " +" << Cycles << "x" << Factor << "u\n");
+
+  // Update Executed resources counts.
+  incExecutedResources(PIdx, Count);
   assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
   Rem->RemainingCounts[PIdx] -= Count;
 
-  // Check if this resource exceeds the current critical resource by a full
-  // cycle. If so, it becomes the critical resource.
-  if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx])
-      >= (int)SchedModel->getLatencyFactor()) {
-    CritResIdx = PIdx;
+  // Check if this resource exceeds the current critical resource. If so, it
+  // becomes the critical resource.
+  if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
+    ZoneCritResIdx = PIdx;
     DEBUG(dbgs() << "  *** Critical resource "
-          << SchedModel->getProcResource(PIdx)->Name << " x"
-          << ResourceCounts[PIdx] << '\n');
+          << getResourceName(PIdx) << ": "
+          << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n");
   }
+  // TODO: We don't yet model reserved resources. It's not hard though.
+  return CurrCycle;
 }
 
 /// Move the boundary of scheduled code by one SUnit.
-void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
   // Update the reservation table.
   if (HazardRec->isEnabled()) {
     if (!isTop() && SU->isCall) {
@@ -1643,51 +2110,108 @@ void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
     }
     HazardRec->EmitInstruction(SU);
   }
+  const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+  unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
+  CurrMOps += IncMOps;
+  // checkHazard prevents scheduling multiple instructions per cycle that exceed
+  // issue width. However, we commonly reach the maximum. In this case
+  // opportunistically bump the cycle to avoid uselessly checking everything in
+  // the readyQ. Furthermore, a single instruction may produce more than one
+  // cycle's worth of micro-ops.
+  //
+  // TODO: Also check if this SU must end a dispatch group.
+  unsigned NextCycle = CurrCycle;
+  if (CurrMOps >= SchedModel->getIssueWidth()) {
+    ++NextCycle;
+    DEBUG(dbgs() << "  *** Max MOps " << CurrMOps
+          << " at cycle " << CurrCycle << '\n');
+  }
+  unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+  DEBUG(dbgs() << "  Ready @" << ReadyCycle << "c\n");
+
+  switch (SchedModel->getMicroOpBufferSize()) {
+  case 0:
+    assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
+    break;
+  case 1:
+    if (ReadyCycle > NextCycle) {
+      NextCycle = ReadyCycle;
+      DEBUG(dbgs() << "  *** Stall until: " << ReadyCycle << "\n");
+    }
+    break;
+  default:
+    // We don't currently model the OOO reorder buffer, so consider all
+    // scheduled MOps to be "retired".
+    break;
+  }
+  RetiredMOps += IncMOps;
+
   // Update resource counts and critical resource.
   if (SchedModel->hasInstrSchedModel()) {
-    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
-    Rem->RemainingMicroOps -= SchedModel->getNumMicroOps(SU->getInstr(), SC);
+    unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
+    assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
+    Rem->RemIssueCount -= DecRemIssue;
+    if (ZoneCritResIdx) {
+      // Scale scheduled micro-ops for comparing with the critical resource.
+      unsigned ScaledMOps =
+        RetiredMOps * SchedModel->getMicroOpFactor();
+
+      // If scaled micro-ops are now more than the previous critical resource by
+      // a full cycle, then micro-ops issue becomes critical.
+      if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
+          >= (int)SchedModel->getLatencyFactor()) {
+        ZoneCritResIdx = 0;
+        DEBUG(dbgs() << "  *** Critical resource NumMicroOps: "
+              << ScaledMOps / SchedModel->getLatencyFactor() << "c\n");
+      }
+    }
     for (TargetSchedModel::ProcResIter
            PI = SchedModel->getWriteProcResBegin(SC),
            PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
-      countResource(PI->ProcResourceIdx, PI->Cycles);
+      unsigned RCycle =
+        countResource(PI->ProcResourceIdx, PI->Cycles, ReadyCycle);
+      if (RCycle > NextCycle)
+        NextCycle = RCycle;
     }
   }
-  if (isTop()) {
-    if (SU->getDepth() > ExpectedLatency)
-      ExpectedLatency = SU->getDepth();
+  // Update ExpectedLatency and DependentLatency.
+  unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
+  unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
+  if (SU->getDepth() > TopLatency) {
+    TopLatency = SU->getDepth();
+    DEBUG(dbgs() << "  " << Available.getName()
+          << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n");
   }
-  else {
-    if (SU->getHeight() > ExpectedLatency)
-      ExpectedLatency = SU->getHeight();
+  if (SU->getHeight() > BotLatency) {
+    BotLatency = SU->getHeight();
+    DEBUG(dbgs() << "  " << Available.getName()
+          << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n");
   }
-
-  IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
-
-  // Check the instruction group dispatch limit.
-  // TODO: Check if this SU must end a dispatch group.
-  IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
-
-  // checkHazard prevents scheduling multiple instructions per cycle that exceed
-  // issue width. However, we commonly reach the maximum. In this case
-  // opportunistically bump the cycle to avoid uselessly checking everything in
-  // the readyQ. Furthermore, a single instruction may produce more than one
-  // cycle's worth of micro-ops.
-  if (IssueCount >= SchedModel->getIssueWidth()) {
-    DEBUG(dbgs() << "  *** Max instrs at cycle " << CurrCycle << '\n');
-    bumpCycle();
+  // If we stall for any reason, bump the cycle.
+  if (NextCycle > CurrCycle) {
+    bumpCycle(NextCycle);
+  }
+  else {
+    // After updating ZoneCritResIdx and ExpectedLatency, check if we're
+    // resource limited. If a stall occured, bumpCycle does this.
+    unsigned LFactor = SchedModel->getLatencyFactor();
+    IsResourceLimited =
+      (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
+      > (int)LFactor;
   }
+  DEBUG(dumpScheduledState());
 }
 
 /// Release pending ready nodes in to the available queue. This makes them
 /// visible to heuristics.
-void ConvergingScheduler::SchedBoundary::releasePending() {
+void GenericScheduler::SchedBoundary::releasePending() {
   // If the available queue is empty, it is safe to reset MinReadyCycle.
   if (Available.empty())
     MinReadyCycle = UINT_MAX;
 
   // Check to see if any of the pending instructions are ready to issue.  If
   // so, add them to the available queue.
+  bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
   for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
     SUnit *SU = *(Pending.begin()+i);
     unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
@@ -1695,7 +2219,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() {
     if (ReadyCycle < MinReadyCycle)
       MinReadyCycle = ReadyCycle;
 
-    if (ReadyCycle > CurrCycle)
+    if (!IsBuffered && ReadyCycle > CurrCycle)
       continue;
 
     if (checkHazard(SU))
@@ -1710,7 +2234,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() {
 }
 
 /// Remove SU from the ready set for this boundary.
-void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
+void GenericScheduler::SchedBoundary::removeReady(SUnit *SU) {
   if (Available.isInQueue(SU))
     Available.remove(Available.find(SU));
   else {
@@ -1722,11 +2246,11 @@ void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
 /// If this queue only has one ready candidate, return it. As a side effect,
 /// defer any nodes that now hit a hazard, and advance the cycle until at least
 /// one node is ready. If multiple instructions are ready, return NULL.
-SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
+SUnit *GenericScheduler::SchedBoundary::pickOnlyChoice() {
   if (CheckPending)
     releasePending();
 
-  if (IssueCount > 0) {
+  if (CurrMOps > 0) {
     // Defer any ready instrs that now have a hazard.
     for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
       if (checkHazard(*I)) {
@@ -1738,9 +2262,9 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
     }
   }
   for (unsigned i = 0; Available.empty(); ++i) {
-    assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+    assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedLatency) &&
            "permanent hazard"); (void)i;
-    bumpCycle();
+    bumpCycle(CurrCycle + 1);
     releasePending();
   }
   if (Available.size() == 1)
@@ -1748,106 +2272,33 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
   return NULL;
 }
 
-/// Record the candidate policy for opposite zones with different critical
-/// resources.
-///
-/// If the CriticalZone is latency limited, don't force a policy for the
-/// candidates here. Instead, setLatencyPolicy sets ReduceLatency if needed.
-void ConvergingScheduler::balanceZones(
-  ConvergingScheduler::SchedBoundary &CriticalZone,
-  ConvergingScheduler::SchedCandidate &CriticalCand,
-  ConvergingScheduler::SchedBoundary &OppositeZone,
-  ConvergingScheduler::SchedCandidate &OppositeCand) {
-
-  if (!CriticalZone.IsResourceLimited)
-    return;
-  assert(SchedModel->hasInstrSchedModel() && "required schedmodel");
-
-  SchedRemainder *Rem = CriticalZone.Rem;
-
-  // If the critical zone is overconsuming a resource relative to the
-  // remainder, try to reduce it.
-  unsigned RemainingCritCount =
-    Rem->RemainingCounts[CriticalZone.CritResIdx];
-  if ((int)(Rem->getMaxRemainingCount(SchedModel) - RemainingCritCount)
-      > (int)SchedModel->getLatencyFactor()) {
-    CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx;
-    DEBUG(dbgs() << "  Balance " << CriticalZone.Available.getName()
-          << " reduce "
-          << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name
-          << '\n');
-  }
-  // If the other zone is underconsuming a resource relative to the full zone,
-  // try to increase it.
-  unsigned OppositeCount =
-    OppositeZone.ResourceCounts[CriticalZone.CritResIdx];
-  if ((int)(OppositeZone.ExpectedCount - OppositeCount)
-      > (int)SchedModel->getLatencyFactor()) {
-    OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx;
-    DEBUG(dbgs() << "  Balance " << OppositeZone.Available.getName()
-          << " demand "
-          << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name
-          << '\n');
-  }
-}
-
-/// Determine if the scheduled zones exceed resource limits or critical path and
-/// set each candidate's ReduceHeight policy accordingly.
-void ConvergingScheduler::checkResourceLimits(
-  ConvergingScheduler::SchedCandidate &TopCand,
-  ConvergingScheduler::SchedCandidate &BotCand) {
-
-  // Set ReduceLatency to true if needed.
-  Bot.setLatencyPolicy(BotCand.Policy);
-  Top.setLatencyPolicy(TopCand.Policy);
-
-  // Handle resource-limited regions.
-  if (Top.IsResourceLimited && Bot.IsResourceLimited
-      && Top.CritResIdx == Bot.CritResIdx) {
-    // If the scheduled critical resource in both zones is no longer the
-    // critical remaining resource, attempt to reduce resource height both ways.
-    if (Top.CritResIdx != Rem.CritResIdx) {
-      TopCand.Policy.ReduceResIdx = Top.CritResIdx;
-      BotCand.Policy.ReduceResIdx = Bot.CritResIdx;
-      DEBUG(dbgs() << "  Reduce scheduled "
-            << SchedModel->getProcResource(Top.CritResIdx)->Name << '\n');
-    }
-    return;
-  }
-  // Handle latency-limited regions.
-  if (!Top.IsResourceLimited && !Bot.IsResourceLimited) {
-    // If the total scheduled expected latency exceeds the region's critical
-    // path then reduce latency both ways.
-    //
-    // Just because a zone is not resource limited does not mean it is latency
-    // limited. Unbuffered resource, such as max micro-ops may cause CurrCycle
-    // to exceed expected latency.
-    if ((Top.ExpectedLatency + Bot.ExpectedLatency >= Rem.CriticalPath)
-        && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) {
-      TopCand.Policy.ReduceLatency = true;
-      BotCand.Policy.ReduceLatency = true;
-      DEBUG(dbgs() << "  Reduce scheduled latency " << Top.ExpectedLatency
-            << " + " << Bot.ExpectedLatency << '\n');
-    }
-    return;
+#ifndef NDEBUG
+// This is useful information to dump after bumpNode.
+// Note that the Queue contents are more useful before pickNodeFromQueue.
+void GenericScheduler::SchedBoundary::dumpScheduledState() {
+  unsigned ResFactor;
+  unsigned ResCount;
+  if (ZoneCritResIdx) {
+    ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
+    ResCount = getResourceCount(ZoneCritResIdx);
   }
-  // The critical resource is different in each zone, so request balancing.
-
-  // Compute the cost of each zone.
-  Top.ExpectedCount = std::max(Top.ExpectedLatency, Top.CurrCycle);
-  Top.ExpectedCount = std::max(
-    Top.getCriticalCount(),
-    Top.ExpectedCount * SchedModel->getLatencyFactor());
-  Bot.ExpectedCount = std::max(Bot.ExpectedLatency, Bot.CurrCycle);
-  Bot.ExpectedCount = std::max(
-    Bot.getCriticalCount(),
-    Bot.ExpectedCount * SchedModel->getLatencyFactor());
-
-  balanceZones(Top, TopCand, Bot, BotCand);
-  balanceZones(Bot, BotCand, Top, TopCand);
+  else {
+    ResFactor = SchedModel->getMicroOpFactor();
+    ResCount = RetiredMOps * SchedModel->getMicroOpFactor();
+  }
+  unsigned LFactor = SchedModel->getLatencyFactor();
+  dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
+         << "  Retired: " << RetiredMOps;
+  dbgs() << "\n  Executed: " << getExecutedCount() / LFactor << "c";
+  dbgs() << "\n  Critical: " << ResCount / LFactor << "c, "
+         << ResCount / ResFactor << " " << getResourceName(ZoneCritResIdx)
+         << "\n  ExpectedLatency: " << ExpectedLatency << "c\n"
+         << (IsResourceLimited ? "  - Resource" : "  - Latency")
+         << " limited.\n";
 }
+#endif
 
-void ConvergingScheduler::SchedCandidate::
+void GenericScheduler::SchedCandidate::
 initResourceDelta(const ScheduleDAGMI *DAG,
                   const TargetSchedModel *SchedModel) {
   if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
@@ -1864,11 +2315,12 @@ initResourceDelta(const ScheduleDAGMI *DAG,
   }
 }
 
+
 /// Return true if this heuristic determines order.
 static bool tryLess(int TryVal, int CandVal,
-                    ConvergingScheduler::SchedCandidate &TryCand,
-                    ConvergingScheduler::SchedCandidate &Cand,
-                    ConvergingScheduler::CandReason Reason) {
+                    GenericScheduler::SchedCandidate &TryCand,
+                    GenericScheduler::SchedCandidate &Cand,
+                    GenericScheduler::CandReason Reason) {
   if (TryVal < CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -1878,13 +2330,14 @@ static bool tryLess(int TryVal, int CandVal,
       Cand.Reason = Reason;
     return true;
   }
+  Cand.setRepeat(Reason);
   return false;
 }
 
 static bool tryGreater(int TryVal, int CandVal,
-                       ConvergingScheduler::SchedCandidate &TryCand,
-                       ConvergingScheduler::SchedCandidate &Cand,
-                       ConvergingScheduler::CandReason Reason) {
+                       GenericScheduler::SchedCandidate &TryCand,
+                       GenericScheduler::SchedCandidate &Cand,
+                       GenericScheduler::CandReason Reason) {
   if (TryVal > CandVal) {
     TryCand.Reason = Reason;
     return true;
@@ -1894,9 +2347,34 @@ static bool tryGreater(int TryVal, int CandVal,
       Cand.Reason = Reason;
     return true;
   }
+  Cand.setRepeat(Reason);
   return false;
 }
 
+static bool tryPressure(const PressureChange &TryP,
+                        const PressureChange &CandP,
+                        GenericScheduler::SchedCandidate &TryCand,
+                        GenericScheduler::SchedCandidate &Cand,
+                        GenericScheduler::CandReason Reason) {
+  int TryRank = TryP.getPSetOrMax();
+  int CandRank = CandP.getPSetOrMax();
+  // If both candidates affect the same set, go with the smallest increase.
+  if (TryRank == CandRank) {
+    return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
+                   Reason);
+  }
+  // If one candidate decreases and the other increases, go with it.
+  // Invalid candidates have UnitInc==0.
+  if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
+              Reason)) {
+    return true;
+  }
+  // If the candidates are decreasing pressure, reverse priority.
+  if (TryP.getUnitInc() < 0)
+    std::swap(TryRank, CandRank);
+  return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
+}
+
 static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
   return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
 }
@@ -1929,6 +2407,32 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
   return 0;
 }
 
+static bool tryLatency(GenericScheduler::SchedCandidate &TryCand,
+                       GenericScheduler::SchedCandidate &Cand,
+                       GenericScheduler::SchedBoundary &Zone) {
+  if (Zone.isTop()) {
+    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                  TryCand, Cand, GenericScheduler::TopDepthReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                   TryCand, Cand, GenericScheduler::TopPathReduce))
+      return true;
+  }
+  else {
+    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                  TryCand, Cand, GenericScheduler::BotHeightReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                   TryCand, Cand, GenericScheduler::BotPathReduce))
+      return true;
+  }
+  return false;
+}
+
 /// Apply a set of heursitics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
@@ -1940,16 +2444,44 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
 /// \param Zone describes the scheduled zone that we are extending.
 /// \param RPTracker describes reg pressure within the scheduled zone.
 /// \param TempTracker is a scratch pressure tracker to reuse in queries.
-void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
+void GenericScheduler::tryCandidate(SchedCandidate &Cand,
                                        SchedCandidate &TryCand,
                                        SchedBoundary &Zone,
                                        const RegPressureTracker &RPTracker,
                                        RegPressureTracker &TempTracker) {
 
-  // Always initialize TryCand's RPDelta.
-  TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta,
-                                  DAG->getRegionCriticalPSets(),
-                                  DAG->getRegPressure().MaxSetPressure);
+  if (DAG->isTrackingPressure()) {
+    // Always initialize TryCand's RPDelta.
+    if (Zone.isTop()) {
+      TempTracker.getMaxDownwardPressureDelta(
+        TryCand.SU->getInstr(),
+        TryCand.RPDelta,
+        DAG->getRegionCriticalPSets(),
+        DAG->getRegPressure().MaxSetPressure);
+    }
+    else {
+      if (VerifyScheduling) {
+        TempTracker.getMaxUpwardPressureDelta(
+          TryCand.SU->getInstr(),
+          &DAG->getPressureDiff(TryCand.SU),
+          TryCand.RPDelta,
+          DAG->getRegionCriticalPSets(),
+          DAG->getRegPressure().MaxSetPressure);
+      }
+      else {
+        RPTracker.getUpwardPressureDelta(
+          TryCand.SU->getInstr(),
+          DAG->getPressureDiff(TryCand.SU),
+          TryCand.RPDelta,
+          DAG->getRegionCriticalPSets(),
+          DAG->getRegPressure().MaxSetPressure);
+      }
+    }
+  }
+  DEBUG(if (TryCand.RPDelta.Excess.isValid())
+          dbgs() << "  SU(" << TryCand.SU->NodeNum << ") "
+                 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
+                 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
 
   // Initialize the candidate if needed.
   if (!Cand.isValid()) {
@@ -1962,20 +2494,25 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
                  TryCand, Cand, PhysRegCopy))
     return;
 
-  // Avoid exceeding the target's limit.
-  if (tryLess(TryCand.RPDelta.Excess.UnitIncrease,
-              Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess))
+  // Avoid exceeding the target's limit. If signed PSetID is negative, it is
+  // invalid; convert it to INT_MAX to give it lowest priority.
+  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
+                                               Cand.RPDelta.Excess,
+                                               TryCand, Cand, RegExcess))
     return;
-  if (Cand.Reason == SingleExcess)
-    Cand.Reason = MultiPressure;
 
   // Avoid increasing the max critical pressure in the scheduled region.
-  if (tryLess(TryCand.RPDelta.CriticalMax.UnitIncrease,
-              Cand.RPDelta.CriticalMax.UnitIncrease,
-              TryCand, Cand, SingleCritical))
+  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
+                                               Cand.RPDelta.CriticalMax,
+                                               TryCand, Cand, RegCritical))
+    return;
+
+  // For loops that are acyclic path limited, aggressively schedule for latency.
+  // This can result in very long dependence chains scheduled in sequence, so
+  // once every cycle (when CurrMOps == 0), switch to normal heuristics.
+  if (Rem.IsAcyclicLatencyLimited && !Zone.CurrMOps
+      && tryLatency(TryCand, Cand, Zone))
     return;
-  if (Cand.Reason == SingleCritical)
-    Cand.Reason = MultiPressure;
 
   // Keep clustered nodes together to encourage downstream peephole
   // optimizations which may reduce resource requirements.
@@ -1990,17 +2527,17 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     return;
 
   // Weak edges are for clustering and other constraints.
-  //
-  // Deferring TryCand here does not change Cand's reason. This is good in the
-  // sense that a bad candidate shouldn't affect a previous candidate's
-  // goodness, but bad in that it is assymetric and depends on queue order.
-  CandReason OrigReason = Cand.Reason;
   if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
               getWeakLeft(Cand.SU, Zone.isTop()),
               TryCand, Cand, Weak)) {
-    Cand.Reason = OrigReason;
     return;
   }
+  // Avoid increasing the max pressure of the entire region.
+  if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
+                                               Cand.RPDelta.CurrentMax,
+                                               TryCand, Cand, RegMax))
+    return;
+
   // Avoid critical resource consumption and balance the schedule.
   TryCand.initResourceDelta(DAG, SchedModel);
   if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
@@ -2012,41 +2549,15 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     return;
 
   // Avoid serializing long latency dependence chains.
-  if (Cand.Policy.ReduceLatency) {
-    if (Zone.isTop()) {
-      if (Cand.SU->getDepth() * SchedModel->getLatencyFactor()
-          > Zone.ExpectedCount) {
-        if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                    TryCand, Cand, TopDepthReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                     TryCand, Cand, TopPathReduce))
-        return;
-    }
-    else {
-      if (Cand.SU->getHeight() * SchedModel->getLatencyFactor()
-          > Zone.ExpectedCount) {
-        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                    TryCand, Cand, BotHeightReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                     TryCand, Cand, BotPathReduce))
-        return;
-    }
-  }
-
-  // Avoid increasing the max pressure of the entire region.
-  if (tryLess(TryCand.RPDelta.CurrentMax.UnitIncrease,
-              Cand.RPDelta.CurrentMax.UnitIncrease, TryCand, Cand, SingleMax))
+  // For acyclic path limited loops, latency was already checked above.
+  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
+      && tryLatency(TryCand, Cand, Zone)) {
     return;
-  if (Cand.Reason == SingleMax)
-    Cand.Reason = MultiPressure;
+  }
 
   // Prefer immediate defs/users of the last scheduled instruction. This is a
-  // nice pressure avoidance strategy that also conserves the processor's
-  // register renaming resources and keeps the machine code readable.
+  // local pressure avoidance strategy that also makes the machine code
+  // readable.
   if (tryGreater(Zone.NextSUs.count(TryCand.SU), Zone.NextSUs.count(Cand.SU),
                  TryCand, Cand, NextDefUse))
     return;
@@ -2058,49 +2569,17 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
   }
 }
 
-/// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
-/// more desirable than RHS from scheduling standpoint.
-static bool compareRPDelta(const RegPressureDelta &LHS,
-                           const RegPressureDelta &RHS) {
-  // Compare each component of pressure in decreasing order of importance
-  // without checking if any are valid. Invalid PressureElements are assumed to
-  // have UnitIncrease==0, so are neutral.
-
-  // Avoid increasing the max critical pressure in the scheduled region.
-  if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) {
-    DEBUG(dbgs() << "  RP excess top - bot: "
-          << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n');
-    return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
-  }
-  // Avoid increasing the max critical pressure in the scheduled region.
-  if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) {
-    DEBUG(dbgs() << "  RP critical top - bot: "
-          << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease)
-          << '\n');
-    return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
-  }
-  // Avoid increasing the max pressure of the entire region.
-  if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) {
-    DEBUG(dbgs() << "  RP current top - bot: "
-          << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease)
-          << '\n');
-    return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
-  }
-  return false;
-}
-
 #ifndef NDEBUG
-const char *ConvergingScheduler::getReasonStr(
-  ConvergingScheduler::CandReason Reason) {
+const char *GenericScheduler::getReasonStr(
+  GenericScheduler::CandReason Reason) {
   switch (Reason) {
   case NoCand:         return "NOCAND    ";
   case PhysRegCopy:    return "PREG-COPY";
-  case SingleExcess:   return "REG-EXCESS";
-  case SingleCritical: return "REG-CRIT  ";
+  case RegExcess:      return "REG-EXCESS";
+  case RegCritical:    return "REG-CRIT  ";
   case Cluster:        return "CLUSTER   ";
   case Weak:           return "WEAK      ";
-  case SingleMax:      return "REG-MAX   ";
-  case MultiPressure:  return "REG-MULTI ";
+  case RegMax:         return "REG-MAX   ";
   case ResourceReduce: return "RES-REDUCE";
   case ResourceDemand: return "RES-DEMAND";
   case TopDepthReduce: return "TOP-DEPTH ";
@@ -2113,20 +2592,20 @@ const char *ConvergingScheduler::getReasonStr(
   llvm_unreachable("Unknown reason!");
 }
 
-void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
-  PressureElement P;
+void GenericScheduler::traceCandidate(const SchedCandidate &Cand) {
+  PressureChange P;
   unsigned ResIdx = 0;
   unsigned Latency = 0;
   switch (Cand.Reason) {
   default:
     break;
-  case SingleExcess:
+  case RegExcess:
     P = Cand.RPDelta.Excess;
     break;
-  case SingleCritical:
+  case RegCritical:
     P = Cand.RPDelta.CriticalMax;
     break;
-  case SingleMax:
+  case RegMax:
     P = Cand.RPDelta.CurrentMax;
     break;
   case ResourceReduce:
@@ -2150,8 +2629,8 @@ void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
   }
   dbgs() << "  SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
   if (P.isValid())
-    dbgs() << " " << TRI->getRegPressureSetName(P.PSetID)
-           << ":" << P.UnitIncrease << " ";
+    dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
+           << ":" << P.getUnitInc() << " ";
   else
     dbgs() << "      ";
   if (ResIdx)
@@ -2166,12 +2645,12 @@ void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand) {
 }
 #endif
 
-/// Pick the best candidate from the top queue.
+/// Pick the best candidate from the queue.
 ///
 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
 /// DAG building. To adjust for the current scheduling location we need to
 /// maintain the number of vreg uses remaining to be top-scheduled.
-void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
                                             const RegPressureTracker &RPTracker,
                                             SchedCandidate &Cand) {
   ReadyQueue &Q = Zone.Available;
@@ -2196,30 +2675,31 @@ void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
   }
 }
 
-static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
+static void tracePick(const GenericScheduler::SchedCandidate &Cand,
                       bool IsTop) {
   DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
-        << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
+        << GenericScheduler::getReasonStr(Cand.Reason) << '\n');
 }
 
 /// Pick the best candidate node from either the top or bottom queue.
-SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
   // Schedule as far as possible in the direction of no choice. This is most
   // efficient, but also provides the best heuristics for CriticalPSets.
   if (SUnit *SU = Bot.pickOnlyChoice()) {
     IsTopNode = false;
-    DEBUG(dbgs() << "Pick Top NOCAND\n");
+    DEBUG(dbgs() << "Pick Bot NOCAND\n");
     return SU;
   }
   if (SUnit *SU = Top.pickOnlyChoice()) {
     IsTopNode = true;
-    DEBUG(dbgs() << "Pick Bot NOCAND\n");
+    DEBUG(dbgs() << "Pick Top NOCAND\n");
     return SU;
   }
   CandPolicy NoPolicy;
   SchedCandidate BotCand(NoPolicy);
   SchedCandidate TopCand(NoPolicy);
-  checkResourceLimits(TopCand, BotCand);
+  Bot.setPolicy(BotCand.Policy, Top);
+  Top.setPolicy(TopCand.Policy, Bot);
 
   // Prefer bottom scheduling when heuristics are silent.
   pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
@@ -2232,7 +2712,10 @@ SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
   // affects picking from either Q. If scheduling in one direction must
   // increase pressure for one of the excess PSets, then schedule in that
   // direction first to provide more freedom in the other direction.
-  if (BotCand.Reason == SingleExcess || BotCand.Reason == SingleCritical) {
+  if ((BotCand.Reason == RegExcess && !BotCand.isRepeat(RegExcess))
+      || (BotCand.Reason == RegCritical
+          && !BotCand.isRepeat(RegCritical)))
+  {
     IsTopNode = false;
     tracePick(BotCand, IsTopNode);
     return BotCand.SU;
@@ -2241,37 +2724,20 @@ SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
   pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
   assert(TopCand.Reason != NoCand && "failed to find the first candidate");
 
-  // If either Q has a single candidate that minimizes pressure above the
-  // original region's pressure pick it.
-  if (TopCand.Reason <= SingleMax || BotCand.Reason <= SingleMax) {
-    if (TopCand.Reason < BotCand.Reason) {
-      IsTopNode = true;
-      tracePick(TopCand, IsTopNode);
-      return TopCand.SU;
-    }
-    IsTopNode = false;
-    tracePick(BotCand, IsTopNode);
-    return BotCand.SU;
-  }
-  // Check for a salient pressure difference and pick the best from either side.
-  if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
-    IsTopNode = true;
-    tracePick(TopCand, IsTopNode);
-    return TopCand.SU;
-  }
-  // Otherwise prefer the bottom candidate, in node order if all else failed.
+  // Choose the queue with the most important (lowest enum) reason.
   if (TopCand.Reason < BotCand.Reason) {
     IsTopNode = true;
     tracePick(TopCand, IsTopNode);
     return TopCand.SU;
   }
+  // Otherwise prefer the bottom candidate, in node order if all else failed.
   IsTopNode = false;
   tracePick(BotCand, IsTopNode);
   return BotCand.SU;
 }
 
 /// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
-SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
+SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
   if (DAG->top() == DAG->bottom()) {
     assert(Top.Available.empty() && Top.Pending.empty() &&
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
@@ -2279,24 +2745,26 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
   }
   SUnit *SU;
   do {
-    if (ForceTopDown) {
+    if (RegionPolicy.OnlyTopDown) {
       SU = Top.pickOnlyChoice();
       if (!SU) {
         CandPolicy NoPolicy;
         SchedCandidate TopCand(NoPolicy);
         pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
-        assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+        assert(TopCand.Reason != NoCand && "failed to find a candidate");
+        tracePick(TopCand, true);
         SU = TopCand.SU;
       }
       IsTopNode = true;
     }
-    else if (ForceBottomUp) {
+    else if (RegionPolicy.OnlyBottomUp) {
       SU = Bot.pickOnlyChoice();
       if (!SU) {
         CandPolicy NoPolicy;
         SchedCandidate BotCand(NoPolicy);
         pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
-        assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+        assert(BotCand.Reason != NoCand && "failed to find a candidate");
+        tracePick(BotCand, false);
         SU = BotCand.SU;
       }
       IsTopNode = false;
@@ -2315,7 +2783,7 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
   return SU;
 }
 
-void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
+void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
 
   MachineBasicBlock::iterator InsertPos = SU->getInstr();
   if (!isTop)
@@ -2346,15 +2814,15 @@ void ConvergingScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
 ///
 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
 /// them here. See comments in biasPhysRegCopy.
-void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
   if (IsTopNode) {
-    SU->TopReadyCycle = Top.CurrCycle;
+    SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.CurrCycle);
     Top.bumpNode(SU);
     if (SU->hasPhysRegUses)
       reschedulePhysRegCopies(SU, true);
   }
   else {
-    SU->BotReadyCycle = Bot.CurrCycle;
+    SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.CurrCycle);
     Bot.bumpNode(SU);
     if (SU->hasPhysRegDefs)
       reschedulePhysRegCopies(SU, false);
@@ -2363,26 +2831,23 @@ void ConvergingScheduler::schedNode(SUnit *SU, bool IsTopNode) {
 
 /// Create the standard converging machine scheduler. This will be used as the
 /// default scheduler if the target does not set a default.
-static ScheduleDAGInstrs *createConvergingSched(MachineSchedContext *C) {
-  assert((!ForceTopDown || !ForceBottomUp) &&
-         "-misched-topdown incompatible with -misched-bottomup");
-  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new ConvergingScheduler());
+static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C) {
+  ScheduleDAGMI *DAG = new ScheduleDAGMI(C, new GenericScheduler(C));
   // Register DAG post-processors.
   //
   // FIXME: extend the mutation API to allow earlier mutations to instantiate
   // data and pass it to later mutations. Have a single mutation that gathers
   // the interesting nodes in one pass.
-  if (EnableCopyConstrain)
-    DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
-  if (EnableLoadCluster)
+  DAG->addMutation(new CopyConstrain(DAG->TII, DAG->TRI));
+  if (EnableLoadCluster && DAG->TII->enableClusterLoads())
     DAG->addMutation(new LoadClusterMutation(DAG->TII, DAG->TRI));
   if (EnableMacroFusion)
     DAG->addMutation(new MacroFusion(DAG->TII));
   return DAG;
 }
 static MachineSchedRegistry
-ConvergingSchedRegistry("converge", "Standard converging scheduler.",
-                        createConvergingSched);
+GenericSchedRegistry("converge", "Standard converging scheduler.",
+                     createGenericSched);
 
 //===----------------------------------------------------------------------===//
 // ILP Scheduler. Currently for experimental analysis of heuristics.
@@ -2424,15 +2889,6 @@ struct ILPOrder {
 
 /// \brief Schedule based on the ILP metric.
 class ILPScheduler : public MachineSchedStrategy {
-  /// In case all subtrees are eventually connected to a common root through
-  /// data dependence (e.g. reduction), place an upper limit on their size.
-  ///
-  /// FIXME: A subtree limit is generally good, but in the situation commented
-  /// above, where multiple similar subtrees feed a common root, we should
-  /// only split at a point where the resulting subtrees will be balanced.
-  /// (a motivating test case must be found).
-  static const unsigned SubtreeLimit = 16;
-
   ScheduleDAGMI *DAG;
   ILPOrder Cmp;
 
@@ -2616,7 +3072,7 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
   }
 
   static bool isNodeHidden(const SUnit *Node) {
-    return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+    return (Node->Preds.size() > 10 || Node->Succs.size() > 10);
   }
 
   static bool hasNodeAddressLabel(const SUnit *Node,
@@ -2639,7 +3095,11 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
   static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
     std::string Str;
     raw_string_ostream SS(Str);
-    SS << "SU(" << SU->NodeNum << ')';
+    const SchedDFSResult *DFS =
+      static_cast<const ScheduleDAGMI*>(G)->getDFSResult();
+    SS << "SU:" << SU->NodeNum;
+    if (DFS)
+      SS << " I:" << DFS->getNumInstrs(SU);
     return SS.str();
   }
   static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 4dafbe5..105d7c2 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -308,12 +308,29 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr *MI,
   // to be sunk then it's probably worth it.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg()) continue;
+    if (!MO.isReg() || !MO.isUse())
+      continue;
     unsigned Reg = MO.getReg();
-    if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg))
+    if (Reg == 0)
       continue;
-    if (MRI->hasOneNonDBGUse(Reg))
-      return true;
+
+    // We don't move live definitions of physical registers,
+    // so sinking their uses won't enable any opportunities.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+
+    // If this instruction is the only user of a virtual register,
+    // check if breaking the edge will enable sinking
+    // both this instruction and the defining instruction.
+    if (MRI->hasOneNonDBGUse(Reg)) {
+      // If the definition resides in same MBB,
+      // claim it's likely we can sink these together.
+      // If definition resides elsewhere, we aren't
+      // blocking it from being sunk so don't break the edge.
+      MachineInstr *DefMI = MRI->getVRegDef(Reg);
+      if (DefMI->getParent() == MI->getParent())
+        return true;
+    }
   }
 
   return false;
@@ -394,7 +411,7 @@ static bool AvoidsSinking(MachineInstr *MI, MachineRegisterInfo *MRI) {
 /// collectDebgValues - Scan instructions following MI and collect any
 /// matching DBG_VALUEs.
 static void collectDebugValues(MachineInstr *MI,
-                               SmallVector<MachineInstr *, 2> & DbgValues) {
+                               SmallVectorImpl<MachineInstr *> &DbgValues) {
   DbgValues.clear();
   if (!MI->getOperand(0).isReg())
     return;
@@ -537,8 +554,8 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI,
       // We give successors with smaller loop depth higher priority.
       SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(), MBB->succ_end());
       std::stable_sort(Succs.begin(), Succs.end(), SuccessorSorter(LI));
-      for (SmallVector<MachineBasicBlock*, 4>::iterator SI = Succs.begin(),
-           E = Succs.end(); SI != E; ++SI) {
+      for (SmallVectorImpl<MachineBasicBlock *>::iterator SI = Succs.begin(),
+             E = Succs.end(); SI != E; ++SI) {
         MachineBasicBlock *SuccBlock = *SI;
         bool LocalUse = false;
         if (AllUsesDominatedByBlock(Reg, SuccBlock, MBB,
@@ -615,9 +632,8 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
 
   DEBUG(dbgs() << "Sink instr " << *MI << "\tinto block " << *SuccToSinkTo);
 
-  // If the block has multiple predecessors, this would introduce computation on
-  // a path that it doesn't already exist.  We could split the critical edge,
-  // but for now we just punt.
+  // If the block has multiple predecessors, this is a critical edge.
+  // Decide if we can sink along it or need to break the edge.
   if (SuccToSinkTo->pred_size() > 1) {
     // We cannot sink a load across a critical edge - there may be stores in
     // other code paths.
@@ -697,7 +713,7 @@ bool MachineSinking::SinkInstruction(MachineInstr *MI, bool &SawStore) {
                        ++MachineBasicBlock::iterator(MI));
 
   // Move debug values.
-  for (SmallVector<MachineInstr *, 2>::iterator DBI = DbgValuesToSink.begin(),
+  for (SmallVectorImpl<MachineInstr *>::iterator DBI = DbgValuesToSink.begin(),
          DBE = DbgValuesToSink.end(); DBI != DBE; ++DBI) {
     MachineInstr *DbgMI = *DBI;
     SuccToSinkTo->splice(InsertPos, ParentBlock,  DbgMI,
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index 00f702c..6aa3f67 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -853,8 +853,7 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
         // Add latency if DefMI is a real instruction. Transients get latency 0.
         if (!Dep.DefMI->isTransient())
           DepCycle += MTM.SchedModel
-            .computeOperandLatency(Dep.DefMI, Dep.DefOp, UseMI, Dep.UseOp,
-                                   /* FindMin = */ false);
+            .computeOperandLatency(Dep.DefMI, Dep.DefOp, UseMI, Dep.UseOp);
         Cycle = std::max(Cycle, DepCycle);
       }
       // Remember the instruction depth.
@@ -902,8 +901,7 @@ static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
         // We may not know the UseMI of this dependency, if it came from the
         // live-in list. SchedModel can handle a NULL UseMI.
         DepHeight += SchedModel
-          .computeOperandLatency(MI, MO.getOperandNo(), I->MI, I->Op,
-                                 /* FindMin = */ false);
+          .computeOperandLatency(MI, MO.getOperandNo(), I->MI, I->Op);
       }
       Height = std::max(Height, DepHeight);
       // This regunit is dead above MI.
@@ -941,7 +939,7 @@ static bool pushDepHeight(const DataDep &Dep,
   // Adjust height by Dep.DefMI latency.
   if (!Dep.DefMI->isTransient())
     UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
-                                                  UseMI, Dep.UseOp, false);
+                                                  UseMI, Dep.UseOp);
 
   // Update Heights[DefMI] to be the maximum height seen.
   MIHeightMap::iterator I;
@@ -1171,7 +1169,7 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const {
   // Add latency if DefMI is a real instruction. Transients get latency 0.
   if (!Dep.DefMI->isTransient())
     DepCycle += TE.MTM.SchedModel
-      .computeOperandLatency(Dep.DefMI, Dep.DefOp, PHI, Dep.UseOp, false);
+      .computeOperandLatency(Dep.DefMI, Dep.DefOp, PHI, Dep.UseOp);
   return DepCycle;
 }
 
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 037043f..d61470c 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -25,6 +25,7 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -212,6 +213,10 @@ namespace {
                 const LiveInterval &LI);
     void report(const char *msg, const MachineBasicBlock *MBB,
                 const LiveInterval &LI);
+    void report(const char *msg, const MachineFunction *MF,
+                const LiveRange &LR);
+    void report(const char *msg, const MachineBasicBlock *MBB,
+                const LiveRange &LR);
 
     void verifyInlineAsm(const MachineInstr *MI);
 
@@ -224,9 +229,12 @@ namespace {
     void verifyLiveVariables();
     void verifyLiveIntervals();
     void verifyLiveInterval(const LiveInterval&);
-    void verifyLiveIntervalValue(const LiveInterval&, VNInfo*);
-    void verifyLiveIntervalSegment(const LiveInterval&,
-                                   LiveInterval::const_iterator);
+    void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned);
+    void verifyLiveRangeSegment(const LiveRange&,
+                                const LiveRange::const_iterator I, unsigned);
+    void verifyLiveRange(const LiveRange&, unsigned);
+
+    void verifyStackFrame();
   };
 
   struct MachineVerifierPass : public MachineFunctionPass {
@@ -268,8 +276,7 @@ bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
   raw_ostream *OutFile = 0;
   if (OutFileName) {
     std::string ErrorInfo;
-    OutFile = new raw_fd_ostream(OutFileName, ErrorInfo,
-                                 raw_fd_ostream::F_Append);
+    OutFile = new raw_fd_ostream(OutFileName, ErrorInfo, sys::fs::F_Append);
     if (!ErrorInfo.empty()) {
       errs() << "Error opening '" << OutFileName << "': " << ErrorInfo << '\n';
       exit(1);
@@ -412,23 +419,25 @@ void MachineVerifier::report(const char *msg,
 void MachineVerifier::report(const char *msg, const MachineFunction *MF,
                              const LiveInterval &LI) {
   report(msg, MF);
-  *OS << "- interval:    ";
-  if (TargetRegisterInfo::isVirtualRegister(LI.reg))
-    *OS << PrintReg(LI.reg, TRI);
-  else
-    *OS << PrintRegUnit(LI.reg, TRI);
-  *OS << ' ' << LI << '\n';
+  *OS << "- interval:    " << LI << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
                              const LiveInterval &LI) {
   report(msg, MBB);
-  *OS << "- interval:    ";
-  if (TargetRegisterInfo::isVirtualRegister(LI.reg))
-    *OS << PrintReg(LI.reg, TRI);
-  else
-    *OS << PrintRegUnit(LI.reg, TRI);
-  *OS << ' ' << LI << '\n';
+  *OS << "- interval:    " << LI << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
+                             const LiveRange &LR) {
+  report(msg, MBB);
+  *OS << "- liverange:    " << LR << "\n";
+}
+
+void MachineVerifier::report(const char *msg, const MachineFunction *MF,
+                             const LiveRange &LR) {
+  report(msg, MF);
+  *OS << "- liverange:    " << LR << "\n";
 }
 
 void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
@@ -475,6 +484,8 @@ void MachineVerifier::visitMachineFunctionBefore() {
 
   // Check that the register use lists are sane.
   MRI->verifyUseLists();
+
+  verifyStackFrame();
 }
 
 // Does iterator point to a and b as the first two elements?
@@ -669,8 +680,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
       report("MBB live-in list contains non-physical register", MBB);
       continue;
     }
-    regsLive.insert(*I);
-    for (MCSubRegIterator SubRegs(*I, TRI); SubRegs.isValid(); ++SubRegs)
+    for (MCSubRegIterator SubRegs(*I, TRI, /*IncludeSelf=*/true);
+         SubRegs.isValid(); ++SubRegs)
       regsLive.insert(*SubRegs);
   }
   regsLiveInButUnused = regsLive;
@@ -679,8 +690,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
   assert(MFI && "Function has no frame info");
   BitVector PR = MFI->getPristineRegs(MBB);
   for (int I = PR.find_first(); I>0; I = PR.find_next(I)) {
-    regsLive.insert(I);
-    for (MCSubRegIterator SubRegs(I, TRI); SubRegs.isValid(); ++SubRegs)
+    for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true);
+         SubRegs.isValid(); ++SubRegs)
       regsLive.insert(*SubRegs);
   }
 
@@ -764,7 +775,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
   if (MI->getNumOperands() < MCID.getNumOperands()) {
     report("Too few operands", MI);
     *OS << MCID.getNumOperands() << " operands expected, but "
-        << MI->getNumExplicitOperands() << " given.\n";
+        << MI->getNumOperands() << " given.\n";
   }
 
   // Check the tied operands.
@@ -822,7 +833,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
     if (MO->isReg() &&
         !(MI->isVariadic() && MONum == MCID.getNumOperands()-1)) {
       if (MO->isDef() && !MCOI.isOptionalDef())
-          report("Explicit operand marked as def", MO, MONum);
+        report("Explicit operand marked as def", MO, MONum);
       if (MO->isImplicit())
         report("Explicit operand marked as implicit", MO, MONum);
     }
@@ -997,16 +1008,16 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
       // Check the cached regunit intervals.
       if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
         for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
-          if (const LiveInterval *LI = LiveInts->getCachedRegUnit(*Units)) {
-            LiveRangeQuery LRQ(*LI, UseIdx);
+          if (const LiveRange *LR = LiveInts->getCachedRegUnit(*Units)) {
+            LiveQueryResult LRQ = LR->Query(UseIdx);
             if (!LRQ.valueIn()) {
-              report("No live range at use", MO, MONum);
+              report("No live segment at use", MO, MONum);
               *OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
-                  << ' ' << *LI << '\n';
+                  << ' ' << *LR << '\n';
             }
             if (MO->isKill() && !LRQ.isKill()) {
               report("Live range continues after kill flag", MO, MONum);
-              *OS << PrintRegUnit(*Units, TRI) << ' ' << *LI << '\n';
+              *OS << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
             }
           }
         }
@@ -1016,9 +1027,9 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
         if (LiveInts->hasInterval(Reg)) {
           // This is a virtual register interval.
           const LiveInterval &LI = LiveInts->getInterval(Reg);
-          LiveRangeQuery LRQ(LI, UseIdx);
+          LiveQueryResult LRQ = LI.Query(UseIdx);
           if (!LRQ.valueIn()) {
-            report("No live range at use", MO, MONum);
+            report("No live segment at use", MO, MONum);
             *OS << UseIdx << " is not live in " << LI << '\n';
           }
           // Check for extra kill flags.
@@ -1067,7 +1078,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
         llvm::next(MRI->def_begin(Reg)) != MRI->def_end())
       report("Multiple virtual register defs in SSA form", MO, MONum);
 
-    // Check LiveInts for a live range, but only for virtual registers.
+    // Check LiveInts for a live segment, but only for virtual registers.
     if (LiveInts && TargetRegisterInfo::isVirtualRegister(Reg) &&
         !LiveInts->isNotInMIMap(MI)) {
       SlotIndex DefIdx = LiveInts->getInstructionIndex(MI);
@@ -1082,9 +1093,17 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
               << DefIdx << " in " << LI << '\n';
           }
         } else {
-          report("No live range at def", MO, MONum);
+          report("No live segment at def", MO, MONum);
           *OS << DefIdx << " is not live in " << LI << '\n';
         }
+        // Check that, if the dead def flag is present, LiveInts agree.
+        if (MO->isDead()) {
+          LiveQueryResult LRQ = LI.Query(DefIdx);
+          if (!LRQ.isDeadDef()) {
+            report("Live range continues after dead def flag", MO, MONum);
+            *OS << "Live range: " << LI << '\n';
+          }
+        }
       } else {
         report("Virtual register has no Live interval", MO, MONum);
       }
@@ -1331,25 +1350,26 @@ void MachineVerifier::verifyLiveIntervals() {
 
   // Verify all the cached regunit intervals.
   for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
-    if (const LiveInterval *LI = LiveInts->getCachedRegUnit(i))
-      verifyLiveInterval(*LI);
+    if (const LiveRange *LR = LiveInts->getCachedRegUnit(i))
+      verifyLiveRange(*LR, i);
 }
 
-void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
-                                              VNInfo *VNI) {
+void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
+                                           const VNInfo *VNI,
+                                           unsigned Reg) {
   if (VNI->isUnused())
     return;
 
-  const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def);
+  const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
 
   if (!DefVNI) {
-    report("Valno not live at def and not marked unused", MF, LI);
+    report("Valno not live at def and not marked unused", MF, LR);
     *OS << "Valno #" << VNI->id << '\n';
     return;
   }
 
   if (DefVNI != VNI) {
-    report("Live range at def has different valno", MF, LI);
+    report("Live segment at def has different valno", MF, LR);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " where valno #" << DefVNI->id << " is live\n";
     return;
@@ -1357,15 +1377,15 @@ void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
   if (!MBB) {
-    report("Invalid definition index", MF, LI);
+    report("Invalid definition index", MF, LR);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
-        << " in " << LI << '\n';
+        << " in " << LR << '\n';
     return;
   }
 
   if (VNI->isPHIDef()) {
     if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
-      report("PHIDef value is not defined at MBB start", MBB, LI);
+      report("PHIDef value is not defined at MBB start", MBB, LR);
       *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
           << ", not at the beginning of BB#" << MBB->getNumber() << '\n';
     }
@@ -1375,161 +1395,154 @@ void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI,
   // Non-PHI def.
   const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
   if (!MI) {
-    report("No instruction at def index", MBB, LI);
+    report("No instruction at def index", MBB, LR);
     *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     return;
   }
 
-  bool hasDef = false;
-  bool isEarlyClobber = false;
-  for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
-    if (!MOI->isReg() || !MOI->isDef())
-      continue;
-    if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
-      if (MOI->getReg() != LI.reg)
-        continue;
-    } else {
-      if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
-          !TRI->hasRegUnit(MOI->getReg(), LI.reg))
+  if (Reg != 0) {
+    bool hasDef = false;
+    bool isEarlyClobber = false;
+    for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
+      if (!MOI->isReg() || !MOI->isDef())
         continue;
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        if (MOI->getReg() != Reg)
+          continue;
+      } else {
+        if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
+            !TRI->hasRegUnit(MOI->getReg(), Reg))
+          continue;
+      }
+      hasDef = true;
+      if (MOI->isEarlyClobber())
+        isEarlyClobber = true;
     }
-    hasDef = true;
-    if (MOI->isEarlyClobber())
-      isEarlyClobber = true;
-  }
 
-  if (!hasDef) {
-    report("Defining instruction does not modify register", MI);
-    *OS << "Valno #" << VNI->id << " in " << LI << '\n';
-  }
+    if (!hasDef) {
+      report("Defining instruction does not modify register", MI);
+      *OS << "Valno #" << VNI->id << " in " << LR << '\n';
+    }
 
-  // Early clobber defs begin at USE slots, but other defs must begin at
-  // DEF slots.
-  if (isEarlyClobber) {
-    if (!VNI->def.isEarlyClobber()) {
-      report("Early clobber def must be at an early-clobber slot", MBB, LI);
+    // Early clobber defs begin at USE slots, but other defs must begin at
+    // DEF slots.
+    if (isEarlyClobber) {
+      if (!VNI->def.isEarlyClobber()) {
+        report("Early clobber def must be at an early-clobber slot", MBB, LR);
+        *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+      }
+    } else if (!VNI->def.isRegister()) {
+      report("Non-PHI, non-early clobber def must be at a register slot",
+             MBB, LR);
       *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     }
-  } else if (!VNI->def.isRegister()) {
-    report("Non-PHI, non-early clobber def must be at a register slot",
-           MBB, LI);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
   }
 }
 
-void
-MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
-                                           LiveInterval::const_iterator I) {
-  const VNInfo *VNI = I->valno;
-  assert(VNI && "Live range has no valno");
-
-  if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) {
-    report("Foreign valno in live range", MF, LI);
-    *OS << *I << " has a bad valno\n";
+void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
+                                             const LiveRange::const_iterator I,
+                                             unsigned Reg) {
+  const LiveRange::Segment &S = *I;
+  const VNInfo *VNI = S.valno;
+  assert(VNI && "Live segment has no valno");
+
+  if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
+    report("Foreign valno in live segment", MF, LR);
+    *OS << S << " has a bad valno\n";
   }
 
   if (VNI->isUnused()) {
-    report("Live range valno is marked unused", MF, LI);
-    *OS << *I << '\n';
+    report("Live segment valno is marked unused", MF, LR);
+    *OS << S << '\n';
   }
 
-  const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start);
+  const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
   if (!MBB) {
-    report("Bad start of live segment, no basic block", MF, LI);
-    *OS << *I << '\n';
+    report("Bad start of live segment, no basic block", MF, LR);
+    *OS << S << '\n';
     return;
   }
   SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
-  if (I->start != MBBStartIdx && I->start != VNI->def) {
-    report("Live segment must begin at MBB entry or valno def", MBB, LI);
-    *OS << *I << '\n';
+  if (S.start != MBBStartIdx && S.start != VNI->def) {
+    report("Live segment must begin at MBB entry or valno def", MBB, LR);
+    *OS << S << '\n';
   }
 
   const MachineBasicBlock *EndMBB =
-    LiveInts->getMBBFromIndex(I->end.getPrevSlot());
+    LiveInts->getMBBFromIndex(S.end.getPrevSlot());
   if (!EndMBB) {
-    report("Bad end of live segment, no basic block", MF, LI);
-    *OS << *I << '\n';
+    report("Bad end of live segment, no basic block", MF, LR);
+    *OS << S << '\n';
     return;
   }
 
   // No more checks for live-out segments.
-  if (I->end == LiveInts->getMBBEndIdx(EndMBB))
+  if (S.end == LiveInts->getMBBEndIdx(EndMBB))
     return;
 
   // RegUnit intervals are allowed dead phis.
-  if (!TargetRegisterInfo::isVirtualRegister(LI.reg) && VNI->isPHIDef() &&
-      I->start == VNI->def && I->end == VNI->def.getDeadSlot())
+  if (!TargetRegisterInfo::isVirtualRegister(Reg) && VNI->isPHIDef() &&
+      S.start == VNI->def && S.end == VNI->def.getDeadSlot())
     return;
 
   // The live segment is ending inside EndMBB
   const MachineInstr *MI =
-    LiveInts->getInstructionFromIndex(I->end.getPrevSlot());
+    LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
   if (!MI) {
-    report("Live segment doesn't end at a valid instruction", EndMBB, LI);
-    *OS << *I << '\n';
+    report("Live segment doesn't end at a valid instruction", EndMBB, LR);
+    *OS << S << '\n';
     return;
   }
 
   // The block slot must refer to a basic block boundary.
-  if (I->end.isBlock()) {
-    report("Live segment ends at B slot of an instruction", EndMBB, LI);
-    *OS << *I << '\n';
+  if (S.end.isBlock()) {
+    report("Live segment ends at B slot of an instruction", EndMBB, LR);
+    *OS << S << '\n';
   }
 
-  if (I->end.isDead()) {
+  if (S.end.isDead()) {
     // Segment ends on the dead slot.
     // That means there must be a dead def.
-    if (!SlotIndex::isSameInstr(I->start, I->end)) {
-      report("Live segment ending at dead slot spans instructions", EndMBB, LI);
-      *OS << *I << '\n';
+    if (!SlotIndex::isSameInstr(S.start, S.end)) {
+      report("Live segment ending at dead slot spans instructions", EndMBB, LR);
+      *OS << S << '\n';
     }
   }
 
   // A live segment can only end at an early-clobber slot if it is being
   // redefined by an early-clobber def.
-  if (I->end.isEarlyClobber()) {
-    if (I+1 == LI.end() || (I+1)->start != I->end) {
+  if (S.end.isEarlyClobber()) {
+    if (I+1 == LR.end() || (I+1)->start != S.end) {
       report("Live segment ending at early clobber slot must be "
-             "redefined by an EC def in the same instruction", EndMBB, LI);
-      *OS << *I << '\n';
+             "redefined by an EC def in the same instruction", EndMBB, LR);
+      *OS << S << '\n';
     }
   }
 
   // The following checks only apply to virtual registers. Physreg liveness
   // is too weird to check.
-  if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
-    // A live range can end with either a redefinition, a kill flag on a
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    // A live segment can end with either a redefinition, a kill flag on a
     // use, or a dead flag on a def.
     bool hasRead = false;
-    bool hasDeadDef = false;
     for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
-      if (!MOI->isReg() || MOI->getReg() != LI.reg)
+      if (!MOI->isReg() || MOI->getReg() != Reg)
         continue;
       if (MOI->readsReg())
         hasRead = true;
-      if (MOI->isDef() && MOI->isDead())
-        hasDeadDef = true;
     }
-
-    if (I->end.isDead()) {
-      if (!hasDeadDef) {
-        report("Instruction doesn't have a dead def operand", MI);
-        I->print(*OS);
-        *OS << " in " << LI << '\n';
-      }
-    } else {
+    if (!S.end.isDead()) {
       if (!hasRead) {
-        report("Instruction ending live range doesn't read the register", MI);
-        *OS << *I << " in " << LI << '\n';
+        report("Instruction ending live segment doesn't read the register", MI);
+        *OS << S << " in " << LR << '\n';
       }
     }
   }
 
   // Now check all the basic blocks in this live segment.
   MachineFunction::const_iterator MFI = MBB;
-  // Is this live range the beginning of a non-PHIDef VN?
-  if (I->start == VNI->def && !VNI->isPHIDef()) {
+  // Is this live segment the beginning of a non-PHIDef VN?
+  if (S.start == VNI->def && !VNI->isPHIDef()) {
     // Not live-in to any blocks.
     if (MBB == EndMBB)
       return;
@@ -1537,9 +1550,9 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
     ++MFI;
   }
   for (;;) {
-    assert(LiveInts->isLiveInToMBB(LI, MFI));
+    assert(LiveInts->isLiveInToMBB(LR, MFI));
     // We don't know how to track physregs into a landing pad.
-    if (!TargetRegisterInfo::isVirtualRegister(LI.reg) &&
+    if (!TargetRegisterInfo::isVirtualRegister(Reg) &&
         MFI->isLandingPad()) {
       if (&*MFI == EndMBB)
         break;
@@ -1555,11 +1568,11 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
     for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
          PE = MFI->pred_end(); PI != PE; ++PI) {
       SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
-      const VNInfo *PVNI = LI.getVNInfoBefore(PEnd);
+      const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
 
       // All predecessors must have a live-out value.
       if (!PVNI) {
-        report("Register not marked live out of predecessor", *PI, LI);
+        report("Register not marked live out of predecessor", *PI, LR);
         *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
             << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
             << PEnd << '\n';
@@ -1568,7 +1581,7 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
 
       // Only PHI-defs can take different predecessor values.
       if (!IsPHI && PVNI != VNI) {
-        report("Different value live out of predecessor", *PI, LI);
+        report("Different value live out of predecessor", *PI, LR);
         *OS << "Valno #" << PVNI->id << " live out of BB#"
             << (*PI)->getNumber() << '@' << PEnd
             << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
@@ -1581,13 +1594,17 @@ MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI,
   }
 }
 
-void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
-  for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
-       I!=E; ++I)
-    verifyLiveIntervalValue(LI, *I);
+void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg) {
+  for (LiveRange::const_vni_iterator I = LR.vni_begin(), E = LR.vni_end();
+       I != E; ++I)
+    verifyLiveRangeValue(LR, *I, Reg);
+
+  for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I)
+    verifyLiveRangeSegment(LR, I, Reg);
+}
 
-  for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I)
-    verifyLiveIntervalSegment(LI, I);
+void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
+  verifyLiveRange(LI, LI.reg);
 
   // Check the LI only has one connected component.
   if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
@@ -1606,3 +1623,130 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
     }
   }
 }
+
+namespace {
+  // FrameSetup and FrameDestroy can have zero adjustment, so using a single
+  // integer, we can't tell whether it is a FrameSetup or FrameDestroy if the
+  // value is zero.
+  // We use a bool plus an integer to capture the stack state.
+  struct StackStateOfBB {
+    StackStateOfBB() : EntryValue(0), ExitValue(0), EntryIsSetup(false),
+      ExitIsSetup(false) { }
+    StackStateOfBB(int EntryVal, int ExitVal, bool EntrySetup, bool ExitSetup) :
+      EntryValue(EntryVal), ExitValue(ExitVal), EntryIsSetup(EntrySetup),
+      ExitIsSetup(ExitSetup) { }
+    // Can be negative, which means we are setting up a frame.
+    int EntryValue;
+    int ExitValue;
+    bool EntryIsSetup;
+    bool ExitIsSetup;
+  };
+}
+
+/// Make sure on every path through the CFG, a FrameSetup <n> is always followed
+/// by a FrameDestroy <n>, stack adjustments are identical on all
+/// CFG edges to a merge point, and frame is destroyed at end of a return block.
+void MachineVerifier::verifyStackFrame() {
+  int FrameSetupOpcode   = TII->getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+  SmallVector<StackStateOfBB, 8> SPState;
+  SPState.resize(MF->getNumBlockIDs());
+  SmallPtrSet<const MachineBasicBlock*, 8> Reachable;
+
+  // Visit the MBBs in DFS order.
+  for (df_ext_iterator<const MachineFunction*,
+                       SmallPtrSet<const MachineBasicBlock*, 8> >
+       DFI = df_ext_begin(MF, Reachable), DFE = df_ext_end(MF, Reachable);
+       DFI != DFE; ++DFI) {
+    const MachineBasicBlock *MBB = *DFI;
+
+    StackStateOfBB BBState;
+    // Check the exit state of the DFS stack predecessor.
+    if (DFI.getPathLength() >= 2) {
+      const MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
+      assert(Reachable.count(StackPred) &&
+             "DFS stack predecessor is already visited.\n");
+      BBState.EntryValue = SPState[StackPred->getNumber()].ExitValue;
+      BBState.EntryIsSetup = SPState[StackPred->getNumber()].ExitIsSetup;
+      BBState.ExitValue = BBState.EntryValue;
+      BBState.ExitIsSetup = BBState.EntryIsSetup;
+    }
+
+    // Update stack state by checking contents of MBB.
+    for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+         I != E; ++I) {
+      if (I->getOpcode() == FrameSetupOpcode) {
+        // The first operand of a FrameOpcode should be i32.
+        int Size = I->getOperand(0).getImm();
+        assert(Size >= 0 &&
+          "Value should be non-negative in FrameSetup and FrameDestroy.\n");
+
+        if (BBState.ExitIsSetup)
+          report("FrameSetup is after another FrameSetup", I); 
+        BBState.ExitValue -= Size;
+        BBState.ExitIsSetup = true;
+      }
+
+      if (I->getOpcode() == FrameDestroyOpcode) {
+        // The first operand of a FrameOpcode should be i32.
+        int Size = I->getOperand(0).getImm();
+        assert(Size >= 0 &&
+          "Value should be non-negative in FrameSetup and FrameDestroy.\n");
+
+        if (!BBState.ExitIsSetup)
+          report("FrameDestroy is not after a FrameSetup", I);
+        int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
+                                               BBState.ExitValue;
+        if (BBState.ExitIsSetup && AbsSPAdj != Size) {
+          report("FrameDestroy <n> is after FrameSetup <m>", I);
+          *OS << "FrameDestroy <" << Size << "> is after FrameSetup <"
+              << AbsSPAdj << ">.\n";
+        }
+        BBState.ExitValue += Size;
+        BBState.ExitIsSetup = false;
+      }
+    }
+    SPState[MBB->getNumber()] = BBState;
+
+    // Make sure the exit state of any predecessor is consistent with the entry
+    // state.
+    for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(),
+         E = MBB->pred_end(); I != E; ++I) {
+      if (Reachable.count(*I) &&
+          (SPState[(*I)->getNumber()].ExitValue != BBState.EntryValue ||
+           SPState[(*I)->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) {
+        report("The exit stack state of a predecessor is inconsistent.", MBB);
+        *OS << "Predecessor BB#" << (*I)->getNumber() << " has exit state ("
+            << SPState[(*I)->getNumber()].ExitValue << ", "
+            << SPState[(*I)->getNumber()].ExitIsSetup
+            << "), while BB#" << MBB->getNumber() << " has entry state ("
+            << BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n";
+      }
+    }
+
+    // Make sure the entry state of any successor is consistent with the exit
+    // state.
+    for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+         E = MBB->succ_end(); I != E; ++I) {
+      if (Reachable.count(*I) &&
+          (SPState[(*I)->getNumber()].EntryValue != BBState.ExitValue ||
+           SPState[(*I)->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) {
+        report("The entry stack state of a successor is inconsistent.", MBB);
+        *OS << "Successor BB#" << (*I)->getNumber() << " has entry state ("
+            << SPState[(*I)->getNumber()].EntryValue << ", "
+            << SPState[(*I)->getNumber()].EntryIsSetup
+            << "), while BB#" << MBB->getNumber() << " has exit state ("
+            << BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n";
+      }
+    }
+
+    // Make sure a basic block with return ends with zero stack adjustment.
+    if (!MBB->empty() && MBB->back().isReturn()) {
+      if (BBState.ExitIsSetup)
+        report("A return block ends with a FrameSetup.", MBB);
+      if (BBState.ExitValue)
+        report("A return block ends with a nonzero stack adjustment.", MBB);
+    }
+  }
+}
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 5584708..dcd9072 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -66,7 +66,7 @@ namespace {
     ///
     bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
     void LowerPHINode(MachineBasicBlock &MBB,
-                      MachineBasicBlock::iterator AfterPHIsIt);
+                      MachineBasicBlock::iterator LastPHIIt);
 
     /// analyzePHINodes - Gather information about the PHI nodes in
     /// here. In particular, we want to map the number of uses of a virtual
@@ -185,10 +185,11 @@ bool PHIElimination::EliminatePHINodes(MachineFunction &MF,
 
   // Get an iterator to the first instruction after the last PHI node (this may
   // also be the end of the basic block).
-  MachineBasicBlock::iterator AfterPHIsIt = MBB.SkipPHIsAndLabels(MBB.begin());
+  MachineBasicBlock::iterator LastPHIIt =
+    prior(MBB.SkipPHIsAndLabels(MBB.begin()));
 
   while (MBB.front().isPHI())
-    LowerPHINode(MBB, AfterPHIsIt);
+    LowerPHINode(MBB, LastPHIIt);
 
   return true;
 }
@@ -218,8 +219,11 @@ static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi,
 /// LowerPHINode - Lower the PHI node at the top of the specified block,
 ///
 void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator AfterPHIsIt) {
+                                  MachineBasicBlock::iterator LastPHIIt) {
   ++NumLowered;
+
+  MachineBasicBlock::iterator AfterPHIsIt = llvm::next(LastPHIIt);
+
   // Unlink the PHI node from the basic block, but don't delete the PHI yet.
   MachineInstr *MPhi = MBB.remove(MBB.begin());
 
@@ -309,14 +313,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
     if (IncomingReg) {
       // Add the region from the beginning of MBB to the copy instruction to
       // IncomingReg's live interval.
-      LiveInterval &IncomingLI = LIS->getOrCreateInterval(IncomingReg);
+      LiveInterval &IncomingLI = LIS->createEmptyInterval(IncomingReg);
       VNInfo *IncomingVNI = IncomingLI.getVNInfoAt(MBBStartIndex);
       if (!IncomingVNI)
         IncomingVNI = IncomingLI.getNextValue(MBBStartIndex,
                                               LIS->getVNInfoAllocator());
-      IncomingLI.addRange(LiveRange(MBBStartIndex,
-                                    DestCopyIndex.getRegSlot(),
-                                    IncomingVNI));
+      IncomingLI.addSegment(LiveInterval::Segment(MBBStartIndex,
+                                                  DestCopyIndex.getRegSlot(),
+                                                  IncomingVNI));
     }
 
     LiveInterval &DestLI = LIS->getInterval(DestReg);
@@ -328,14 +332,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
       // the copy instruction.
       VNInfo *OrigDestVNI = DestLI.getVNInfoAt(MBBStartIndex);
       assert(OrigDestVNI && "PHI destination should be live at block entry.");
-      DestLI.removeRange(MBBStartIndex, MBBStartIndex.getDeadSlot());
+      DestLI.removeSegment(MBBStartIndex, MBBStartIndex.getDeadSlot());
       DestLI.createDeadDef(DestCopyIndex.getRegSlot(),
                            LIS->getVNInfoAllocator());
       DestLI.removeValNo(OrigDestVNI);
     } else {
       // Otherwise, remove the region from the beginning of MBB to the copy
       // instruction from DestReg's live interval.
-      DestLI.removeRange(MBBStartIndex, DestCopyIndex.getRegSlot());
+      DestLI.removeSegment(MBBStartIndex, DestCopyIndex.getRegSlot());
       VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
       assert(DestVNI && "PHI destination should be live at its definition.");
       DestVNI->def = DestCopyIndex.getRegSlot();
@@ -456,7 +460,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
     if (LIS) {
       if (NewSrcInstr) {
         LIS->InsertMachineInstrInMaps(NewSrcInstr);
-        LIS->addLiveRangeToEndOfBlock(IncomingReg, NewSrcInstr);
+        LIS->addSegmentToEndOfBlock(IncomingReg, NewSrcInstr);
       }
 
       if (!SrcUndef &&
@@ -507,8 +511,8 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
                  "Cannot find kill instruction");
 
           SlotIndex LastUseIndex = LIS->getInstructionIndex(KillInst);
-          SrcLI.removeRange(LastUseIndex.getRegSlot(),
-                            LIS->getMBBEndIdx(&opBlock));
+          SrcLI.removeSegment(LastUseIndex.getRegSlot(),
+                              LIS->getMBBEndIdx(&opBlock));
         }
       }
     }
diff --git a/lib/CodeGen/PHIEliminationUtils.h b/lib/CodeGen/PHIEliminationUtils.h
index 9ac47fb4..48234ae 100644
--- a/lib/CodeGen/PHIEliminationUtils.h
+++ b/lib/CodeGen/PHIEliminationUtils.h
@@ -1,4 +1,4 @@
-//=- PHIEliminationUtils.h - Helper functions for PHI elimination *- C++ -*--=//
+//=- PHIEliminationUtils.h - Helper functions for PHI elimination -*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index bfbc062..f4ffd03 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -58,8 +58,6 @@ OptimizeRegAlloc("optimize-regalloc", cl::Hidden,
 static cl::opt<cl::boolOrDefault>
 EnableMachineSched("enable-misched", cl::Hidden,
     cl::desc("Enable the machine instruction scheduling pass."));
-static cl::opt<bool> EnableStrongPHIElim("strong-phi-elim", cl::Hidden,
-    cl::desc("Use strong PHI elimination."));
 static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
     cl::Hidden,
     cl::desc("Disable Machine LICM"));
@@ -236,7 +234,7 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
 
   // Temporarily disable experimental passes.
   const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
-  if (!ST.enableMachineScheduler())
+  if (!ST.useMachineScheduler())
     disablePass(&MachineSchedulerID);
 }
 
@@ -300,6 +298,8 @@ void TargetPassConfig::addPass(Pass *P) {
 
   if (Started && !Stopped)
     PM->add(P);
+  else
+    delete P;
   if (StopAfter == PassID)
     Stopped = true;
   if (StartAfter == PassID)
@@ -331,7 +331,7 @@ AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
   addPass(P); // Ends the lifetime of P.
 
   // Add the passes after the pass P if there is any.
-  for (SmallVector<std::pair<AnalysisID, IdentifyingPassPtr>, 4>::iterator
+  for (SmallVectorImpl<std::pair<AnalysisID, IdentifyingPassPtr> >::iterator
          I = Impl->InsertedPasses.begin(), E = Impl->InsertedPasses.end();
        I != E; ++I) {
     if ((*I).first == PassID) {
@@ -396,7 +396,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
     // removed from the parent invoke(s). This could happen when a landing
     // pad is shared by multiple invokes and is also a target of a normal
     // edge from elsewhere.
-    addPass(createSjLjEHPreparePass(TM->getTargetLowering()));
+    addPass(createSjLjEHPreparePass(TM));
     // FALLTHROUGH
   case ExceptionHandling::DwarfCFI:
   case ExceptionHandling::ARM:
@@ -404,7 +404,7 @@ void TargetPassConfig::addPassesToHandleExceptions() {
     addPass(createDwarfEHPass(TM));
     break;
   case ExceptionHandling::None:
-    addPass(createLowerInvokePass(TM->getTargetLowering()));
+    addPass(createLowerInvokePass(TM));
 
     // The lower invoke pass may create unreachable code. Remove it.
     addPass(createUnreachableBlockEliminationPass());
@@ -416,13 +416,13 @@ void TargetPassConfig::addPassesToHandleExceptions() {
 /// before exception handling preparation passes.
 void TargetPassConfig::addCodeGenPrepare() {
   if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
-    addPass(createCodeGenPreparePass(getTargetLowering()));
+    addPass(createCodeGenPreparePass(TM));
 }
 
 /// Add common passes that perform LLVM IR to IR transforms in preparation for
 /// instruction selection.
 void TargetPassConfig::addISelPrepare() {
-  addPass(createStackProtectorPass(getTargetLowering()));
+  addPass(createStackProtectorPass(TM));
 
   addPreISel();
 
@@ -673,24 +673,15 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
   // preferably fix the scavenger to not depend on them).
   addPass(&LiveVariablesID);
 
-  // Add passes that move from transformed SSA into conventional SSA. This is a
-  // "copy coalescing" problem.
-  //
-  if (!EnableStrongPHIElim) {
-    // Edge splitting is smarter with machine loop info.
-    addPass(&MachineLoopInfoID);
-    addPass(&PHIEliminationID);
-  }
+  // Edge splitting is smarter with machine loop info.
+  addPass(&MachineLoopInfoID);
+  addPass(&PHIEliminationID);
 
   // Eventually, we want to run LiveIntervals before PHI elimination.
   if (EarlyLiveIntervals)
     addPass(&LiveIntervalsID);
 
   addPass(&TwoAddressInstructionPassID);
-
-  if (EnableStrongPHIElim)
-    addPass(&StrongPHIEliminationID);
-
   addPass(&RegisterCoalescerID);
 
   // PreRA instruction scheduling.
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index a7439b5..28f2d2f 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -40,20 +40,30 @@
 //     If the branch instruction can use flag from "sub", then we can replace
 //     "sub" with "subs" and eliminate the "cmp" instruction.
 //
-// - Optimize Bitcast pairs:
-//
-//     v1 = bitcast v0
-//     v2 = bitcast v1
-//        = v2
-//   =>
-//     v1 = bitcast v0
-//        = v0
-//
 // - Optimize Loads:
 //
 //     Loads that can be folded into a later instruction. A load is foldable
 //     if it loads to virtual registers and the virtual register defined has 
 //     a single use.
+//
+// - Optimize Copies and Bitcast:
+//
+//     Rewrite copies and bitcasts to avoid cross register bank copies
+//     when possible.
+//     E.g., Consider the following example, where capital and lower
+//     letters denote different register file:
+//     b = copy A <-- cross-bank copy
+//     C = copy b <-- cross-bank copy
+//   =>
+//     b = copy A <-- cross-bank copy
+//     C = copy A <-- same-bank copy
+//
+//     E.g., for bitcast:
+//     b = bitcast A <-- cross-bank copy
+//     C = bitcast b <-- cross-bank copy
+//   =>
+//     b = bitcast A <-- cross-bank copy
+//     C = copy A    <-- same-bank copy
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "peephole-opt"
@@ -81,11 +91,11 @@ DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
                 cl::desc("Disable the peephole optimizer"));
 
 STATISTIC(NumReuse,      "Number of extension results reused");
-STATISTIC(NumBitcasts,   "Number of bitcasts eliminated");
 STATISTIC(NumCmps,       "Number of compares eliminated");
 STATISTIC(NumImmFold,    "Number of move immediate folded");
 STATISTIC(NumLoadFold,   "Number of loads folded");
 STATISTIC(NumSelects,    "Number of selects optimized");
+STATISTIC(NumCopiesBitcasts, "Number of copies/bitcasts optimized");
 
 namespace {
   class PeepholeOptimizer : public MachineFunctionPass {
@@ -112,11 +122,11 @@ namespace {
     }
 
   private:
-    bool optimizeBitcastInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
                           SmallPtrSet<MachineInstr*, 8> &LocalMIs);
     bool optimizeSelect(MachineInstr *MI);
+    bool optimizeCopyOrBitcast(MachineInstr *MI);
     bool isMoveImmediate(MachineInstr *MI,
                          SmallSet<unsigned, 4> &ImmDefRegs,
                          DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
@@ -298,78 +308,6 @@ optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
   return Changed;
 }
 
-/// optimizeBitcastInstr - If the instruction is a bitcast instruction A that
-/// cannot be optimized away during isel (e.g. ARM::VMOVSR, which bitcast
-/// a value cross register classes), and the source is defined by another
-/// bitcast instruction B. And if the register class of source of B matches
-/// the register class of instruction A, then it is legal to replace all uses
-/// of the def of A with source of B. e.g.
-///   %vreg0<def> = VMOVSR %vreg1
-///   %vreg3<def> = VMOVRS %vreg0
-///   Replace all uses of vreg3 with vreg1.
-
-bool PeepholeOptimizer::optimizeBitcastInstr(MachineInstr *MI,
-                                             MachineBasicBlock *MBB) {
-  unsigned NumDefs = MI->getDesc().getNumDefs();
-  unsigned NumSrcs = MI->getDesc().getNumOperands() - NumDefs;
-  if (NumDefs != 1)
-    return false;
-
-  unsigned Def = 0;
-  unsigned Src = 0;
-  for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
-      continue;
-    if (MO.isDef())
-      Def = Reg;
-    else if (Src)
-      // Multiple sources?
-      return false;
-    else
-      Src = Reg;
-  }
-
-  assert(Def && Src && "Malformed bitcast instruction!");
-
-  MachineInstr *DefMI = MRI->getVRegDef(Src);
-  if (!DefMI || !DefMI->isBitcast())
-    return false;
-
-  unsigned SrcSrc = 0;
-  NumDefs = DefMI->getDesc().getNumDefs();
-  NumSrcs = DefMI->getDesc().getNumOperands() - NumDefs;
-  if (NumDefs != 1)
-    return false;
-  for (unsigned i = 0, e = NumDefs + NumSrcs; i != e; ++i) {
-    const MachineOperand &MO = DefMI->getOperand(i);
-    if (!MO.isReg() || MO.isDef())
-      continue;
-    unsigned Reg = MO.getReg();
-    if (!Reg)
-      continue;
-    if (!MO.isDef()) {
-      if (SrcSrc)
-        // Multiple sources?
-        return false;
-      else
-        SrcSrc = Reg;
-    }
-  }
-
-  if (MRI->getRegClass(SrcSrc) != MRI->getRegClass(Def))
-    return false;
-
-  MRI->replaceRegWith(Def, SrcSrc);
-  MRI->clearKillFlags(SrcSrc);
-  MI->eraseFromParent();
-  ++NumBitcasts;
-  return true;
-}
-
 /// optimizeCmpInstr - If the instruction is a compare and the previous
 /// instruction it's comparing against all ready sets (or could be modified to
 /// set) the same flag as the compare, then we can remove the comparison and use
@@ -411,6 +349,150 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) {
   return true;
 }
 
+/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
+/// share the same register file.
+static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
+                                  const TargetRegisterClass *DefRC,
+                                  unsigned DefSubReg,
+                                  const TargetRegisterClass *SrcRC,
+                                  unsigned SrcSubReg) {
+  // Same register class.
+  if (DefRC == SrcRC)
+    return true;
+
+  // Both operands are sub registers. Check if they share a register class.
+  unsigned SrcIdx, DefIdx;
+  if (SrcSubReg && DefSubReg)
+    return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
+                                      SrcIdx, DefIdx) != NULL;
+  // At most one of the register is a sub register, make it Src to avoid
+  // duplicating the test.
+  if (!SrcSubReg) {
+    std::swap(DefSubReg, SrcSubReg);
+    std::swap(DefRC, SrcRC);
+  }
+
+  // One of the register is a sub register, check if we can get a superclass.
+  if (SrcSubReg)
+    return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != NULL;
+  // Plain copy.
+  return TRI.getCommonSubClass(DefRC, SrcRC) != NULL;
+}
+
+/// \brief Get the index of the definition and source for \p Copy
+/// instruction.
+/// \pre Copy.isCopy() or Copy.isBitcast().
+/// \return True if the Copy instruction has only one register source
+/// and one register definition. Otherwise, \p DefIdx and \p SrcIdx
+/// are invalid.
+static bool getCopyOrBitcastDefUseIdx(const MachineInstr &Copy,
+                                      unsigned &DefIdx, unsigned &SrcIdx) {
+  assert((Copy.isCopy() || Copy.isBitcast()) && "Wrong operation type.");
+  if (Copy.isCopy()) {
+    // Copy instruction are supposed to be: Def = Src.
+     if (Copy.getDesc().getNumOperands() != 2)
+       return false;
+     DefIdx = 0;
+     SrcIdx = 1;
+     assert(Copy.getOperand(DefIdx).isDef() && "Use comes before def!");
+     return true;
+  }
+  // Bitcast case.
+  // Bitcasts with more than one def are not supported.
+  if (Copy.getDesc().getNumDefs() != 1)
+    return false;
+  // Initialize SrcIdx to an undefined operand.
+  SrcIdx = Copy.getDesc().getNumOperands();
+  for (unsigned OpIdx = 0, EndOpIdx = SrcIdx; OpIdx != EndOpIdx; ++OpIdx) {
+    const MachineOperand &MO = Copy.getOperand(OpIdx);
+    if (!MO.isReg() || !MO.getReg())
+      continue;
+    if (MO.isDef())
+      DefIdx = OpIdx;
+    else if (SrcIdx != EndOpIdx)
+      // Multiple sources?
+      return false;
+    SrcIdx = OpIdx;
+  }
+  return true;
+}
+
+/// \brief Optimize a copy or bitcast instruction to avoid cross
+/// register bank copy. The optimization looks through a chain of
+/// copies and try to find a source that has a compatible register
+/// class.
+/// Two register classes are considered to be compatible if they share
+/// the same register bank.
+/// New copies issued by this optimization are register allocator
+/// friendly. This optimization does not remove any copy as it may
+/// overconstraint the register allocator, but replaces some when
+/// possible.
+/// \pre \p MI is a Copy (MI->isCopy() is true)
+/// \return True, when \p MI has been optimized. In that case, \p MI has
+/// been removed from its parent.
+bool PeepholeOptimizer::optimizeCopyOrBitcast(MachineInstr *MI) {
+  unsigned DefIdx, SrcIdx;
+  if (!MI || !getCopyOrBitcastDefUseIdx(*MI, DefIdx, SrcIdx))
+    return false;
+
+  const MachineOperand &MODef = MI->getOperand(DefIdx);
+  assert(MODef.isReg() && "Copies must be between registers.");
+  unsigned Def = MODef.getReg();
+
+  if (TargetRegisterInfo::isPhysicalRegister(Def))
+    return false;
+
+  const TargetRegisterClass *DefRC = MRI->getRegClass(Def);
+  unsigned DefSubReg = MODef.getSubReg();
+
+  unsigned Src;
+  unsigned SrcSubReg;
+  bool ShouldRewrite = false;
+  MachineInstr *Copy = MI;
+  const TargetRegisterInfo &TRI = *TM->getRegisterInfo();
+
+  // Follow the chain of copies until we reach the top or find a
+  // more suitable source.
+  do {
+    unsigned CopyDefIdx, CopySrcIdx;
+    if (!getCopyOrBitcastDefUseIdx(*Copy, CopyDefIdx, CopySrcIdx))
+      break;
+    const MachineOperand &MO = Copy->getOperand(CopySrcIdx);
+    assert(MO.isReg() && "Copies must be between registers.");
+    Src = MO.getReg();
+
+    if (TargetRegisterInfo::isPhysicalRegister(Src))
+      break;
+
+    const TargetRegisterClass *SrcRC = MRI->getRegClass(Src);
+    SrcSubReg = MO.getSubReg();
+
+    // If this source does not incur a cross register bank copy, use it.
+    ShouldRewrite = shareSameRegisterFile(TRI, DefRC, DefSubReg, SrcRC,
+                                          SrcSubReg);
+    // Follow the chain of copies: get the definition of Src.
+    Copy = MRI->getVRegDef(Src);
+  } while (!ShouldRewrite && Copy && (Copy->isCopy() || Copy->isBitcast()));
+
+  // If we did not find a more suitable source, there is nothing to optimize.
+  if (!ShouldRewrite || Src == MI->getOperand(SrcIdx).getReg())
+    return false;
+
+  // Rewrite the copy to avoid a cross register bank penalty. 
+  unsigned NewVR = TargetRegisterInfo::isPhysicalRegister(Def) ? Def :
+    MRI->createVirtualRegister(DefRC);
+  MachineInstr *NewCopy = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                                  TII->get(TargetOpcode::COPY), NewVR)
+    .addReg(Src, 0, SrcSubReg);
+  NewCopy->getOperand(0).setSubReg(DefSubReg);
+
+  MRI->replaceRegWith(Def, NewVR);
+  MRI->clearKillFlags(NewVR);
+  MI->eraseFromParent();
+  ++NumCopiesBitcasts;
+  return true;
+}
+
 /// isLoadFoldable - Check whether MI is a candidate for folding into a later
 /// instruction. We only fold loads to virtual registers and the virtual
 /// register defined has a single use.
@@ -523,7 +605,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
       if (MI->mayStore() || MI->isCall())
         FoldAsLoadDefReg = 0;
 
-      if ((MI->isBitcast() && optimizeBitcastInstr(MI, MBB)) ||
+      if (((MI->isBitcast() || MI->isCopy()) && optimizeCopyOrBitcast(MI)) ||
           (MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
           (MI->isSelect() && optimizeSelect(MI))) {
         // MI is deleted.
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 53fe273..1afc1ec 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -127,6 +127,12 @@ namespace {
     /// The schedule. Null SUnit*'s represent noop instructions.
     std::vector<SUnit*> Sequence;
 
+    /// The index in BB of RegionEnd.
+    ///
+    /// This is the instruction number from the top of the current block, not
+    /// the SlotIndex. It is only used by the AntiDepBreaker.
+    unsigned EndIndex;
+
   public:
     SchedulePostRATDList(
       MachineFunction &MF, MachineLoopInfo &MLI, MachineDominatorTree &MDT,
@@ -141,11 +147,14 @@ namespace {
     ///
     void startBlock(MachineBasicBlock *BB);
 
+    // Set the index of RegionEnd within the current BB.
+    void setEndIndex(unsigned EndIdx) { EndIndex = EndIdx; }
+
     /// Initialize the scheduler state for the next scheduling region.
     virtual void enterRegion(MachineBasicBlock *bb,
                              MachineBasicBlock::iterator begin,
                              MachineBasicBlock::iterator end,
-                             unsigned endcount);
+                             unsigned regioninstrs);
 
     /// Notify that the scheduler has finished scheduling the current region.
     virtual void exitRegion();
@@ -197,7 +206,7 @@ SchedulePostRATDList::SchedulePostRATDList(
   TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
   SmallVectorImpl<const TargetRegisterClass*> &CriticalPathRCs)
   : ScheduleDAGInstrs(MF, MLI, MDT, /*IsPostRA=*/true), AA(AA),
-    LiveRegs(TRI->getNumRegs())
+    LiveRegs(TRI->getNumRegs()), EndIndex(0)
 {
   const TargetMachine &TM = MF.getTarget();
   const InstrItineraryData *InstrItins = TM.getInstrItineraryData();
@@ -223,8 +232,8 @@ SchedulePostRATDList::~SchedulePostRATDList() {
 void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb,
                  MachineBasicBlock::iterator begin,
                  MachineBasicBlock::iterator end,
-                 unsigned endcount) {
-  ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+                 unsigned regioninstrs) {
+  ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
   Sequence.clear();
 }
 
@@ -312,20 +321,21 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
     unsigned Count = MBB->size(), CurrentCount = Count;
     for (MachineBasicBlock::iterator I = Current; I != MBB->begin(); ) {
       MachineInstr *MI = llvm::prior(I);
+      --Count;
       // Calls are not scheduling boundaries before register allocation, but
       // post-ra we don't gain anything by scheduling across calls since we
       // don't need to worry about register pressure.
       if (MI->isCall() || TII->isSchedulingBoundary(MI, MBB, Fn)) {
-        Scheduler.enterRegion(MBB, I, Current, CurrentCount);
+        Scheduler.enterRegion(MBB, I, Current, CurrentCount - Count);
+        Scheduler.setEndIndex(CurrentCount);
         Scheduler.schedule();
         Scheduler.exitRegion();
         Scheduler.EmitSchedule();
         Current = MI;
-        CurrentCount = Count - 1;
+        CurrentCount = Count;
         Scheduler.Observe(MI, CurrentCount);
       }
       I = MI;
-      --Count;
       if (MI->isBundle())
         Count -= MI->getBundleSize();
     }
@@ -333,6 +343,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
     assert((MBB->begin() == Current || CurrentCount != 0) &&
            "Instruction count mismatch!");
     Scheduler.enterRegion(MBB, MBB->begin(), Current, CurrentCount);
+    Scheduler.setEndIndex(CurrentCount);
     Scheduler.schedule();
     Scheduler.exitRegion();
     Scheduler.EmitSchedule();
@@ -424,9 +435,9 @@ void SchedulePostRATDList::StartBlockForKills(MachineBasicBlock *BB) {
     for (MachineBasicBlock::livein_iterator I = (*SI)->livein_begin(),
          E = (*SI)->livein_end(); I != E; ++I) {
       unsigned Reg = *I;
-      LiveRegs.set(Reg);
-      // Repeat, for all subregs.
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+      // Repeat, for reg and all subregs.
+      for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+           SubRegs.isValid(); ++SubRegs)
         LiveRegs.set(*SubRegs);
     }
   }
@@ -496,20 +507,19 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
       // Ignore two-addr defs.
       if (MI->isRegTiedToUseOperand(i)) continue;
 
-      LiveRegs.reset(Reg);
-
-      // Repeat for all subregs.
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+      // Repeat for reg and all subregs.
+      for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+           SubRegs.isValid(); ++SubRegs)
         LiveRegs.reset(*SubRegs);
     }
 
     // Examine all used registers and set/clear kill flag. When a
     // register is used multiple times we only set the kill flag on
-    // the first use.
+    // the first use. Don't set kill flags on undef operands.
     killedRegs.reset();
     for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
       MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg() || !MO.isUse()) continue;
+      if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
       unsigned Reg = MO.getReg();
       if ((Reg == 0) || MRI.isReserved(Reg)) continue;
 
@@ -548,9 +558,8 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
       unsigned Reg = MO.getReg();
       if ((Reg == 0) || MRI.isReserved(Reg)) continue;
 
-      LiveRegs.set(Reg);
-
-      for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+      for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+           SubRegs.isValid(); ++SubRegs)
         LiveRegs.set(*SubRegs);
     }
   }
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index e4e18c3..0c5173a 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -78,7 +78,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
   unsigned Reg = MI->getOperand(0).getReg();
 
   if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    // For virtual regiusters, mark all uses as <undef>, and convert users to
+    // For virtual registers, mark all uses as <undef>, and convert users to
     // implicit-def when possible.
     for (MachineRegisterInfo::use_nodbg_iterator UI =
          MRI->use_nodbg_begin(Reg),
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 959dd7d..b0e494f 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -14,9 +14,6 @@
 // This pass must be run after register allocation.  After this pass is
 // executed, it is illegal to construct MO_FrameIndex operands.
 //
-// This pass provides an optional shrink wrapping variant of prolog/epilog
-// insertion, enabled via --shrink-wrap. See ShrinkWrapping.cpp.
-//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "pei"
@@ -29,12 +26,14 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -46,6 +45,11 @@ using namespace llvm;
 char PEI::ID = 0;
 char &llvm::PrologEpilogCodeInserterID = PEI::ID;
 
+static cl::opt<unsigned>
+WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
+              cl::desc("Warn for stack size bigger than the given"
+                       " number"));
+
 INITIALIZE_PASS_BEGIN(PEI, "prologepilog",
                 "Prologue/Epilogue Insertion", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
@@ -59,6 +63,38 @@ STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
 STATISTIC(NumBytesStackSpace,
           "Number of bytes used for stack in all functions");
 
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addPreserved<MachineLoopInfo>();
+  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<TargetPassConfig>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
+  return (MBB && !MBB->empty() && MBB->back().isReturn());
+}
+
+/// Compute the set of return blocks
+void PEI::calculateSets(MachineFunction &Fn) {
+  // Sets used to compute spill, restore placement sets.
+  const std::vector<CalleeSavedInfo> &CSI =
+    Fn.getFrameInfo()->getCalleeSavedInfo();
+
+  // If no CSRs used, we are done.
+  if (CSI.empty())
+    return;
+
+  // Save refs to entry and return blocks.
+  EntryBlock = Fn.begin();
+  for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
+       MBB != E; ++MBB)
+    if (isReturnBlock(MBB))
+      ReturnBlocks.push_back(MBB);
+
+  return;
+}
+
 /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
 /// frame indexes with appropriate references.
 ///
@@ -86,16 +122,11 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   calculateCalleeSavedRegisters(Fn);
 
   // Determine placement of CSR spill/restore code:
-  //  - With shrink wrapping, place spills and restores to tightly
-  //    enclose regions in the Machine CFG of the function where
-  //    they are used.
-  //  - Without shink wrapping (default), place all spills in the
-  //    entry block, all restores in return blocks.
-  placeCSRSpillsAndRestores(Fn);
+  // place all spills in the entry block, all restores in return blocks.
+  calculateSets(Fn);
 
   // Add the code to save and restore the callee saved registers
-  if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::Naked))
+  if (!F->hasFnAttribute(Attribute::Naked))
     insertCSRSpillsAndRestores(Fn);
 
   // Allow the target machine to make final modifications to the function
@@ -110,8 +141,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   // called functions.  Because of this, calculateCalleeSavedRegisters()
   // must be called before this function in order to set the AdjustsStack
   // and MaxCallFrameSize variables.
-  if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::Naked))
+  if (!F->hasFnAttribute(Attribute::Naked))
     insertPrologEpilogCode(Fn);
 
   // Replace all MO_FrameIndex operands with physical register references
@@ -128,8 +158,15 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
   // Clear any vregs created by virtual scavenging.
   Fn.getRegInfo().clearVirtRegs();
 
+  // Warn on stack size when we exceeds the given limit.
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+  if (WarnStackSize.getNumOccurrences() > 0 &&
+      WarnStackSize < MFI->getStackSize())
+    errs() << "warning: Stack size limit exceeded (" << MFI->getStackSize()
+           << ") in " << Fn.getName()  << ".\n";
+
   delete RS;
-  clearAllSets();
+  ReturnBlocks.clear();
   return true;
 }
 
@@ -207,14 +244,14 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
     return;
 
   // In Naked functions we aren't going to save any registers.
-  if (F.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                    Attribute::Naked))
+  if (F.getFunction()->hasFnAttribute(Attribute::Naked))
     return;
 
   std::vector<CalleeSavedInfo> CSI;
   for (unsigned i = 0; CSRegs[i]; ++i) {
     unsigned Reg = CSRegs[i];
-    if (F.getRegInfo().isPhysRegUsed(Reg)) {
+    // Functions which call __builtin_unwind_init get all their registers saved.
+    if (F.getRegInfo().isPhysRegUsed(Reg) || F.getMMI().callsUnwindInit()) {
       // If the reg is modified, save it!
       CSI.push_back(CalleeSavedInfo(Reg));
     }
@@ -271,7 +308,7 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &F) {
 }
 
 /// insertCSRSpillsAndRestores - Insert spill and restore code for
-/// callee saved registers used in the function, handling shrink wrapping.
+/// callee saved registers used in the function.
 ///
 void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
   // Get callee saved register information.
@@ -289,133 +326,33 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
   const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
   MachineBasicBlock::iterator I;
 
-  if (!ShrinkWrapThisFunction) {
-    // Spill using target interface.
-    I = EntryBlock->begin();
-    if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
-      for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-        // Add the callee-saved register as live-in.
-        // It's killed at the spill.
-        EntryBlock->addLiveIn(CSI[i].getReg());
-
-        // Insert the spill to the stack frame.
-        unsigned Reg = CSI[i].getReg();
-        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-        TII.storeRegToStackSlot(*EntryBlock, I, Reg, true,
-                                CSI[i].getFrameIdx(), RC, TRI);
-      }
-    }
-
-    // Restore using target interface.
-    for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
-      MachineBasicBlock* MBB = ReturnBlocks[ri];
-      I = MBB->end(); --I;
-
-      // Skip over all terminator instructions, which are part of the return
-      // sequence.
-      MachineBasicBlock::iterator I2 = I;
-      while (I2 != MBB->begin() && (--I2)->isTerminator())
-        I = I2;
-
-      bool AtStart = I == MBB->begin();
-      MachineBasicBlock::iterator BeforeI = I;
-      if (!AtStart)
-        --BeforeI;
-
-      // Restore all registers immediately before the return and any
-      // terminators that precede it.
-      if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
-        for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-          unsigned Reg = CSI[i].getReg();
-          const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-          TII.loadRegFromStackSlot(*MBB, I, Reg,
-                                   CSI[i].getFrameIdx(),
-                                   RC, TRI);
-          assert(I != MBB->begin() &&
-                 "loadRegFromStackSlot didn't insert any code!");
-          // Insert in reverse order.  loadRegFromStackSlot can insert
-          // multiple instructions.
-          if (AtStart)
-            I = MBB->begin();
-          else {
-            I = BeforeI;
-            ++I;
-          }
-        }
-      }
-    }
-    return;
-  }
-
-  // Insert spills.
-  std::vector<CalleeSavedInfo> blockCSI;
-  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
-         BE = CSRSave.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet save = BI->second;
-
-    if (save.empty())
-      continue;
-
-    blockCSI.clear();
-    for (CSRegSet::iterator RI = save.begin(),
-           RE = save.end(); RI != RE; ++RI) {
-      blockCSI.push_back(CSI[*RI]);
-    }
-    assert(blockCSI.size() > 0 &&
-           "Could not collect callee saved register info");
-
-    I = MBB->begin();
-
-    // When shrink wrapping, use stack slot stores/loads.
-    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
+  // Spill using target interface.
+  I = EntryBlock->begin();
+  if (!TFI->spillCalleeSavedRegisters(*EntryBlock, I, CSI, TRI)) {
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
       // Add the callee-saved register as live-in.
       // It's killed at the spill.
-      MBB->addLiveIn(blockCSI[i].getReg());
+      EntryBlock->addLiveIn(CSI[i].getReg());
 
       // Insert the spill to the stack frame.
-      unsigned Reg = blockCSI[i].getReg();
+      unsigned Reg = CSI[i].getReg();
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.storeRegToStackSlot(*MBB, I, Reg,
-                              true,
-                              blockCSI[i].getFrameIdx(),
+      TII.storeRegToStackSlot(*EntryBlock, I, Reg, true, CSI[i].getFrameIdx(),
                               RC, TRI);
     }
   }
 
-  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
-         BE = CSRRestore.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet restore = BI->second;
-
-    if (restore.empty())
-      continue;
+  // Restore using target interface.
+  for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri) {
+    MachineBasicBlock *MBB = ReturnBlocks[ri];
+    I = MBB->end();
+    --I;
 
-    blockCSI.clear();
-    for (CSRegSet::iterator RI = restore.begin(),
-           RE = restore.end(); RI != RE; ++RI) {
-      blockCSI.push_back(CSI[*RI]);
-    }
-    assert(blockCSI.size() > 0 &&
-           "Could not find callee saved register info");
-
-    // If MBB is empty and needs restores, insert at the _beginning_.
-    if (MBB->empty()) {
-      I = MBB->begin();
-    } else {
-      I = MBB->end();
-      --I;
-
-      // Skip over all terminator instructions, which are part of the
-      // return sequence.
-      if (! I->isTerminator()) {
-        ++I;
-      } else {
-        MachineBasicBlock::iterator I2 = I;
-        while (I2 != MBB->begin() && (--I2)->isTerminator())
-          I = I2;
-      }
-    }
+    // Skip over all terminator instructions, which are part of the return
+    // sequence.
+    MachineBasicBlock::iterator I2 = I;
+    while (I2 != MBB->begin() && (--I2)->isTerminator())
+      I = I2;
 
     bool AtStart = I == MBB->begin();
     MachineBasicBlock::iterator BeforeI = I;
@@ -424,21 +361,21 @@ void PEI::insertCSRSpillsAndRestores(MachineFunction &Fn) {
 
     // Restore all registers immediately before the return and any
     // terminators that precede it.
-    for (unsigned i = 0, e = blockCSI.size(); i != e; ++i) {
-      unsigned Reg = blockCSI[i].getReg();
-      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.loadRegFromStackSlot(*MBB, I, Reg,
-                               blockCSI[i].getFrameIdx(),
-                               RC, TRI);
-      assert(I != MBB->begin() &&
-             "loadRegFromStackSlot didn't insert any code!");
-      // Insert in reverse order.  loadRegFromStackSlot can insert
-      // multiple instructions.
-      if (AtStart)
-        I = MBB->begin();
-      else {
-        I = BeforeI;
-        ++I;
+    if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
+      for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+        unsigned Reg = CSI[i].getReg();
+        const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+        TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+        assert(I != MBB->begin() &&
+               "loadRegFromStackSlot didn't insert any code!");
+        // Insert in reverse order.  loadRegFromStackSlot can insert
+        // multiple instructions.
+        if (AtStart)
+          I = MBB->begin();
+        else {
+          I = BeforeI;
+          ++I;
+        }
       }
     }
   }
@@ -543,14 +480,18 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
   unsigned MaxAlign = MFI->getMaxAlignment();
 
   // Make sure the special register scavenging spill slot is closest to the
-  // frame pointer if a frame pointer is required.
+  // incoming stack pointer if a frame pointer is required and is closer
+  // to the incoming rather than the final stack pointer.
   const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
-  if (RS && TFI.hasFP(Fn) && RegInfo->useFPForScavengingIndex(Fn) &&
-      !RegInfo->needsStackRealignment(Fn)) {
+  bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
+                               TFI.isFPCloseToIncomingSP() &&
+                               RegInfo->useFPForScavengingIndex(Fn) &&
+                               !RegInfo->needsStackRealignment(Fn));
+  if (RS && EarlyScavengingSlots) {
     SmallVector<int, 2> SFIs;
     RS->getScavengingFrameIndices(SFIs);
-    for (SmallVector<int, 2>::iterator I = SFIs.begin(),
-         IE = SFIs.end(); I != IE; ++I)
+    for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
+           IE = SFIs.end(); I != IE; ++I)
       AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign);
   }
 
@@ -630,12 +571,11 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
 
   // Make sure the special register scavenging spill slot is closest to the
   // stack pointer.
-  if (RS && (!TFI.hasFP(Fn) || RegInfo->needsStackRealignment(Fn) ||
-             !RegInfo->useFPForScavengingIndex(Fn))) {
+  if (RS && !EarlyScavengingSlots) {
     SmallVector<int, 2> SFIs;
     RS->getScavengingFrameIndices(SFIs);
-    for (SmallVector<int, 2>::iterator I = SFIs.begin(),
-         IE = SFIs.end(); I != IE; ++I)
+    for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
+           IE = SFIs.end(); I != IE; ++I)
       AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign);
   }
 
@@ -710,6 +650,40 @@ void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
 void PEI::replaceFrameIndices(MachineFunction &Fn) {
   if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
 
+  // Store SPAdj at exit of a basic block.
+  SmallVector<int, 8> SPState;
+  SPState.resize(Fn.getNumBlockIDs());
+  SmallPtrSet<MachineBasicBlock*, 8> Reachable;
+
+  // Iterate over the reachable blocks in DFS order.
+  for (df_ext_iterator<MachineFunction*, SmallPtrSet<MachineBasicBlock*, 8> >
+       DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
+       DFI != DFE; ++DFI) {
+    int SPAdj = 0;
+    // Check the exit state of the DFS stack predecessor.
+    if (DFI.getPathLength() >= 2) {
+      MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
+      assert(Reachable.count(StackPred) &&
+             "DFS stack predecessor is already visited.\n");
+      SPAdj = SPState[StackPred->getNumber()];
+    }
+    MachineBasicBlock *BB = *DFI;
+    replaceFrameIndices(BB, Fn, SPAdj);
+    SPState[BB->getNumber()] = SPAdj;
+  }
+
+  // Handle the unreachable blocks.
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    if (Reachable.count(BB))
+      // Already handled in DFS traversal.
+      continue;
+    int SPAdj = 0;
+    replaceFrameIndices(BB, Fn, SPAdj);
+  }
+}
+
+void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+                              int &SPAdj) {
   const TargetMachine &TM = Fn.getTarget();
   assert(TM.getRegisterInfo() && "TM::getRegisterInfo() must be implemented!");
   const TargetInstrInfo &TII = *Fn.getTarget().getInstrInfo();
@@ -720,89 +694,85 @@ void PEI::replaceFrameIndices(MachineFunction &Fn) {
   int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
   int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
 
-  for (MachineFunction::iterator BB = Fn.begin(),
-         E = Fn.end(); BB != E; ++BB) {
-#ifndef NDEBUG
-    int SPAdjCount = 0; // frame setup / destroy count.
-#endif
-    int SPAdj = 0;  // SP offset due to call frame setup / destroy.
-    if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
+  if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
 
-    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+  for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
 
-      if (I->getOpcode() == FrameSetupOpcode ||
-          I->getOpcode() == FrameDestroyOpcode) {
-#ifndef NDEBUG
-        // Track whether we see even pairs of them
-        SPAdjCount += I->getOpcode() == FrameSetupOpcode ? 1 : -1;
-#endif
-        // Remember how much SP has been adjusted to create the call
-        // frame.
-        int Size = I->getOperand(0).getImm();
-
-        if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
-            (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
-          Size = -Size;
-
-        SPAdj += Size;
-
-        MachineBasicBlock::iterator PrevI = BB->end();
-        if (I != BB->begin()) PrevI = prior(I);
-        TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
-
-        // Visit the instructions created by eliminateCallFramePseudoInstr().
-        if (PrevI == BB->end())
-          I = BB->begin();     // The replaced instr was the first in the block.
-        else
-          I = llvm::next(PrevI);
-        continue;
-      }
+    if (I->getOpcode() == FrameSetupOpcode ||
+        I->getOpcode() == FrameDestroyOpcode) {
+      // Remember how much SP has been adjusted to create the call
+      // frame.
+      int Size = I->getOperand(0).getImm();
 
-      MachineInstr *MI = I;
-      bool DoIncr = true;
-      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-        if (!MI->getOperand(i).isFI())
-            continue;
+      if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
+          (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
+        Size = -Size;
 
-        // Some instructions (e.g. inline asm instructions) can have
-        // multiple frame indices and/or cause eliminateFrameIndex
-        // to insert more than one instruction. We need the register
-        // scavenger to go through all of these instructions so that
-        // it can update its register information. We keep the
-        // iterator at the point before insertion so that we can
-        // revisit them in full.
-        bool AtBeginning = (I == BB->begin());
-        if (!AtBeginning) --I;
-
-        // If this instruction has a FrameIndex operand, we need to
-        // use that target machine register info object to eliminate
-        // it.
-        TRI.eliminateFrameIndex(MI, SPAdj, i,
-                                FrameIndexVirtualScavenging ?  NULL : RS);
-
-        // Reset the iterator if we were at the beginning of the BB.
-        if (AtBeginning) {
-          I = BB->begin();
-          DoIncr = false;
-        }
+      SPAdj += Size;
+
+      MachineBasicBlock::iterator PrevI = BB->end();
+      if (I != BB->begin()) PrevI = prior(I);
+      TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
 
-        MI = 0;
-        break;
+      // Visit the instructions created by eliminateCallFramePseudoInstr().
+      if (PrevI == BB->end())
+        I = BB->begin();     // The replaced instr was the first in the block.
+      else
+        I = llvm::next(PrevI);
+      continue;
+    }
+
+    MachineInstr *MI = I;
+    bool DoIncr = true;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      if (!MI->getOperand(i).isFI())
+        continue;
+
+      // Frame indicies in debug values are encoded in a target independent
+      // way with simply the frame index and offset rather than any
+      // target-specific addressing mode.
+      if (MI->isDebugValue()) {
+        assert(i == 0 && "Frame indicies can only appear as the first "
+                         "operand of a DBG_VALUE machine instruction");
+        unsigned Reg;
+        MachineOperand &Offset = MI->getOperand(1);
+        Offset.setImm(Offset.getImm() +
+                      TFI->getFrameIndexReference(
+                          Fn, MI->getOperand(0).getIndex(), Reg));
+        MI->getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
+        continue;
       }
 
-      if (DoIncr && I != BB->end()) ++I;
+      // Some instructions (e.g. inline asm instructions) can have
+      // multiple frame indices and/or cause eliminateFrameIndex
+      // to insert more than one instruction. We need the register
+      // scavenger to go through all of these instructions so that
+      // it can update its register information. We keep the
+      // iterator at the point before insertion so that we can
+      // revisit them in full.
+      bool AtBeginning = (I == BB->begin());
+      if (!AtBeginning) --I;
+
+      // If this instruction has a FrameIndex operand, we need to
+      // use that target machine register info object to eliminate
+      // it.
+      TRI.eliminateFrameIndex(MI, SPAdj, i,
+                              FrameIndexVirtualScavenging ?  NULL : RS);
+
+      // Reset the iterator if we were at the beginning of the BB.
+      if (AtBeginning) {
+        I = BB->begin();
+        DoIncr = false;
+      }
 
-      // Update register states.
-      if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
+      MI = 0;
+      break;
     }
 
-    // If we have evenly matched pairs of frame setup / destroy instructions,
-    // make sure the adjustments come out to zero. If we don't have matched
-    // pairs, we can't be sure the missing bit isn't in another basic block
-    // due to a custom inserter playing tricks, so just asserting SPAdj==0
-    // isn't sufficient. See tMOVCC on Thumb1, for example.
-    assert((SPAdjCount || SPAdj == 0) &&
-           "Unbalanced call frame setup / destroy pairs?");
+    if (DoIncr && I != BB->end()) ++I;
+
+    // Update register states.
+    if (RS && !FrameIndexVirtualScavenging && MI) RS->forward(MI);
   }
 }
 
diff --git a/lib/CodeGen/PrologEpilogInserter.h b/lib/CodeGen/PrologEpilogInserter.h
index 87fff9a..77cfa2b 100644
--- a/lib/CodeGen/PrologEpilogInserter.h
+++ b/lib/CodeGen/PrologEpilogInserter.h
@@ -1,4 +1,4 @@
-//===-- PrologEpilogInserter.h - Prolog/Epilog code insertion -*- C++ -* --===//
+//===-- PrologEpilogInserter.h - Prolog/Epilog code insertion -*- C++ -*---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,9 +14,6 @@
 // This pass must be run after register allocation.  After this pass is
 // executed, it is illegal to construct MO_FrameIndex operands.
 //
-// This pass also implements a shrink wrapping variant of prolog/epilog
-// insertion.
-//
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_CODEGEN_PEI_H
@@ -54,120 +51,28 @@ namespace llvm {
     // stack frame indexes.
     unsigned MinCSFrameIndex, MaxCSFrameIndex;
 
-    // Analysis info for spill/restore placement.
-    // "CSR": "callee saved register".
-
-    // CSRegSet contains indices into the Callee Saved Register Info
-    // vector built by calculateCalleeSavedRegisters() and accessed
-    // via MF.getFrameInfo()->getCalleeSavedInfo().
-    typedef SparseBitVector<> CSRegSet;
-
-    // CSRegBlockMap maps MachineBasicBlocks to sets of callee
-    // saved register indices.
-    typedef DenseMap<MachineBasicBlock*, CSRegSet> CSRegBlockMap;
-
-    // Set and maps for computing CSR spill/restore placement:
-    //  used in function (UsedCSRegs)
-    //  used in a basic block (CSRUsed)
-    //  anticipatable in a basic block (Antic{In,Out})
-    //  available in a basic block (Avail{In,Out})
-    //  to be spilled at the entry to a basic block (CSRSave)
-    //  to be restored at the end of a basic block (CSRRestore)
-    CSRegSet UsedCSRegs;
-    CSRegBlockMap CSRUsed;
-    CSRegBlockMap AnticIn, AnticOut;
-    CSRegBlockMap AvailIn, AvailOut;
-    CSRegBlockMap CSRSave;
-    CSRegBlockMap CSRRestore;
-
     // Entry and return blocks of the current function.
     MachineBasicBlock* EntryBlock;
     SmallVector<MachineBasicBlock*, 4> ReturnBlocks;
 
-    // Map of MBBs to top level MachineLoops.
-    DenseMap<MachineBasicBlock*, MachineLoop*> TLLoops;
-
-    // Flag to control shrink wrapping per-function:
-    // may choose to skip shrink wrapping for certain
-    // functions.
-    bool ShrinkWrapThisFunction;
-
     // Flag to control whether to use the register scavenger to resolve
     // frame index materialization registers. Set according to
     // TRI->requiresFrameIndexScavenging() for the curren function.
     bool FrameIndexVirtualScavenging;
 
-#ifndef NDEBUG
-    // Machine function handle.
-    MachineFunction* MF;
-
-    // Flag indicating that the current function
-    // has at least one "short" path in the machine
-    // CFG from the entry block to an exit block.
-    bool HasFastExitPath;
-#endif
-
-    bool calculateSets(MachineFunction &Fn);
-    bool calcAnticInOut(MachineBasicBlock* MBB);
-    bool calcAvailInOut(MachineBasicBlock* MBB);
-    void calculateAnticAvail(MachineFunction &Fn);
-    bool addUsesForMEMERegion(MachineBasicBlock* MBB,
-                              SmallVector<MachineBasicBlock*, 4>& blks);
-    bool addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks);
-    bool calcSpillPlacements(MachineBasicBlock* MBB,
-                             SmallVector<MachineBasicBlock*, 4> &blks,
-                             CSRegBlockMap &prevSpills);
-    bool calcRestorePlacements(MachineBasicBlock* MBB,
-                               SmallVector<MachineBasicBlock*, 4> &blks,
-                               CSRegBlockMap &prevRestores);
-    void placeSpillsAndRestores(MachineFunction &Fn);
-    void placeCSRSpillsAndRestores(MachineFunction &Fn);
+    void calculateSets(MachineFunction &Fn);
     void calculateCallsInformation(MachineFunction &Fn);
     void calculateCalleeSavedRegisters(MachineFunction &Fn);
     void insertCSRSpillsAndRestores(MachineFunction &Fn);
     void calculateFrameObjectOffsets(MachineFunction &Fn);
     void replaceFrameIndices(MachineFunction &Fn);
+    void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+                             int &SPAdj);
     void scavengeFrameVirtualRegs(MachineFunction &Fn);
     void insertPrologEpilogCode(MachineFunction &Fn);
 
-    // Initialize DFA sets, called before iterations.
-    void clearAnticAvailSets();
-    // Clear all sets constructed by shrink wrapping.
-    void clearAllSets();
-
-    // Initialize all shrink wrapping data.
-    void initShrinkWrappingInfo();
-
-    // Convienences for dealing with machine loops.
-    MachineBasicBlock* getTopLevelLoopPreheader(MachineLoop* LP);
-    MachineLoop* getTopLevelLoopParent(MachineLoop *LP);
-
-    // Propgate CSRs used in MBB to all MBBs of loop LP.
-    void propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP);
-
     // Convenience for recognizing return blocks.
     bool isReturnBlock(MachineBasicBlock* MBB);
-
-#ifndef NDEBUG
-    // Debugging methods.
-
-    // Mark this function as having fast exit paths.
-    void findFastExitPath();
-
-    // Verify placement of spills/restores.
-    void verifySpillRestorePlacement();
-
-    std::string getBasicBlockName(const MachineBasicBlock* MBB);
-    std::string stringifyCSRegSet(const CSRegSet& s);
-    void dumpSet(const CSRegSet& s);
-    void dumpUsed(MachineBasicBlock* MBB);
-    void dumpAllUsed();
-    void dumpSets(MachineBasicBlock* MBB);
-    void dumpSets1(MachineBasicBlock* MBB);
-    void dumpAllSets();
-    void dumpSRSets();
-#endif
-
   };
 } // End llvm namespace
 #endif
diff --git a/lib/CodeGen/README.txt b/lib/CodeGen/README.txt
index 7f75f65..8f19e43 100644
--- a/lib/CodeGen/README.txt
+++ b/lib/CodeGen/README.txt
@@ -21,7 +21,7 @@ can be:
 and then "merge" mul and mov:
 
         mul r4, r4, lr
-        str lr, [sp, #+52]
+        str r4, [sp, #+52]
         ldr lr, [r1, #+32]
         sxth r3, r3
         mla r4, r3, lr, r4
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index c035590..293e306 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -43,13 +43,16 @@ static cl::opt<bool, true>
 VerifyRegAlloc("verify-regalloc", cl::location(RegAllocBase::VerifyEnabled),
                cl::desc("Verify during register allocation"));
 
-const char *RegAllocBase::TimerGroupName = "Register Allocation";
+const char RegAllocBase::TimerGroupName[] = "Register Allocation";
 bool RegAllocBase::VerifyEnabled = false;
 
 //===----------------------------------------------------------------------===//
 //                         RegAllocBase Implementation
 //===----------------------------------------------------------------------===//
 
+// Pin the vtable to this file.
+void RegAllocBase::anchor() {}
+
 void RegAllocBase::init(VirtRegMap &vrm,
                         LiveIntervals &lis,
                         LiveRegMatrix &mat) {
@@ -99,14 +102,13 @@ void RegAllocBase::allocatePhysRegs() {
     // result from splitting.
     DEBUG(dbgs() << "\nselectOrSplit "
                  << MRI->getRegClass(VirtReg->reg)->getName()
-                 << ':' << PrintReg(VirtReg->reg) << ' ' << *VirtReg << '\n');
-    typedef SmallVector<LiveInterval*, 4> VirtRegVec;
+                 << ':' << *VirtReg << '\n');
+    typedef SmallVector<unsigned, 4> VirtRegVec;
     VirtRegVec SplitVRegs;
     unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
 
     if (AvailablePhysReg == ~0u) {
       // selectOrSplit failed to find a register!
-      const char *Msg = "ran out of registers during register allocation";
       // Probably caused by an inline asm.
       MachineInstr *MI;
       for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(VirtReg->reg);
@@ -114,9 +116,9 @@ void RegAllocBase::allocatePhysRegs() {
         if (MI->isInlineAsm())
           break;
       if (MI)
-        MI->emitError(Msg);
+        MI->emitError("inline assembly requires more registers than available");
       else
-        report_fatal_error(Msg);
+        report_fatal_error("ran out of registers during register allocation");
       // Keep going after reporting the error.
       VRM->assignVirt2Phys(VirtReg->reg,
                  RegClassInfo.getOrder(MRI->getRegClass(VirtReg->reg)).front());
@@ -128,7 +130,7 @@ void RegAllocBase::allocatePhysRegs() {
 
     for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end();
          I != E; ++I) {
-      LiveInterval *SplitVirtReg = *I;
+      LiveInterval *SplitVirtReg = &LIS->getInterval(*I);
       assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
       if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
         DEBUG(dbgs() << "not queueing unused  " << *SplitVirtReg << '\n');
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index 064e40f..c17a8d9 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -38,7 +38,7 @@
 #define LLVM_CODEGEN_REGALLOCBASE
 
 #include "llvm/ADT/OwningPtr.h"
-#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 
 namespace llvm {
@@ -57,6 +57,7 @@ class Spiller;
 /// live range splitting. They must also override enqueue/dequeue to provide an
 /// assignment order.
 class RegAllocBase {
+  virtual void anchor();
 protected:
   const TargetRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
@@ -90,10 +91,10 @@ protected:
   // or new set of split live virtual registers. It is up to the splitter to
   // converge quickly toward fully spilled live ranges.
   virtual unsigned selectOrSplit(LiveInterval &VirtReg,
-                                 SmallVectorImpl<LiveInterval*> &splitLVRs) = 0;
+                                 SmallVectorImpl<unsigned> &splitLVRs) = 0;
 
   // Use this group name for NamedRegionTimer.
-  static const char *TimerGroupName;
+  static const char TimerGroupName[];
 
 public:
   /// VerifyEnabled - True when -verify-regalloc is given.
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index 7fcfe9e..6768e45 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/LiveRangeEdit.h"
 #include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -101,7 +102,7 @@ public:
   }
 
   virtual unsigned selectOrSplit(LiveInterval &VirtReg,
-                                 SmallVectorImpl<LiveInterval*> &SplitVRegs);
+                                 SmallVectorImpl<unsigned> &SplitVRegs);
 
   /// Perform register allocation.
   virtual bool runOnMachineFunction(MachineFunction &mf);
@@ -110,7 +111,7 @@ public:
   // that interfere with the most recently queried lvr.  Return true if spilling
   // was successful, and append any new spilled/split intervals to splitLVRs.
   bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
-                          SmallVectorImpl<LiveInterval*> &SplitVRegs);
+                          SmallVectorImpl<unsigned> &SplitVRegs);
 
   static char ID;
 };
@@ -125,7 +126,6 @@ RABasic::RABasic(): MachineFunctionPass(ID) {
   initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
   initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
   initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
-  initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
   initializeLiveStacksPass(*PassRegistry::getPassRegistry());
   initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
   initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -142,9 +142,10 @@ void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<SlotIndexes>();
   AU.addRequired<LiveDebugVariables>();
   AU.addPreserved<LiveDebugVariables>();
-  AU.addRequired<CalculateSpillWeights>();
   AU.addRequired<LiveStacks>();
   AU.addPreserved<LiveStacks>();
+  AU.addRequired<MachineBlockFrequencyInfo>();
+  AU.addPreserved<MachineBlockFrequencyInfo>();
   AU.addRequiredID(MachineDominatorsID);
   AU.addPreservedID(MachineDominatorsID);
   AU.addRequired<MachineLoopInfo>();
@@ -165,7 +166,7 @@ void RABasic::releaseMemory() {
 // that interfere with VirtReg. The newly spilled or split live intervals are
 // returned by appending them to SplitVRegs.
 bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
-                                 SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+                                 SmallVectorImpl<unsigned> &SplitVRegs) {
   // Record each interference and determine if all are spillable before mutating
   // either the union or live intervals.
   SmallVector<LiveInterval*, 8> Intfs;
@@ -219,7 +220,7 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
 // minimal, there is no value in caching them outside the scope of
 // selectOrSplit().
 unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
-                                SmallVectorImpl<LiveInterval*> &SplitVRegs) {
+                                SmallVectorImpl<unsigned> &SplitVRegs) {
   // Populate a list of physical register spill candidates.
   SmallVector<unsigned, 8> PhysRegSpillCands;
 
@@ -276,6 +277,11 @@ bool RABasic::runOnMachineFunction(MachineFunction &mf) {
   RegAllocBase::init(getAnalysis<VirtRegMap>(),
                      getAnalysis<LiveIntervals>(),
                      getAnalysis<LiveRegMatrix>());
+
+  calculateSpillWeightsAndHints(*LIS, *MF,
+                                getAnalysis<MachineLoopInfo>(),
+                                getAnalysis<MachineBlockFrequencyInfo>());
+
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
 
   allocatePhysRegs();
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index bb9c05c..e92dbd2 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -144,7 +144,7 @@ namespace {
     // not be erased.
     bool isBulkSpilling;
 
-    enum {
+    enum LLVM_ENUM_INT_TYPE(unsigned) {
       spillClean = 1,
       spillDirty = 100,
       spillImpossible = ~0u
@@ -293,29 +293,26 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
     // If this register is used by DBG_VALUE then insert new DBG_VALUE to
     // identify spilled location as the place to find corresponding variable's
     // value.
-    SmallVector<MachineInstr *, 4> &LRIDbgValues =
+    SmallVectorImpl<MachineInstr *> &LRIDbgValues =
       LiveDbgValueMap[LRI->VirtReg];
     for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) {
       MachineInstr *DBG = LRIDbgValues[li];
-      const MDNode *MDPtr =
-        DBG->getOperand(DBG->getNumOperands()-1).getMetadata();
-      int64_t Offset = 0;
-      if (DBG->getOperand(1).isImm())
-        Offset = DBG->getOperand(1).getImm();
+      const MDNode *MDPtr = DBG->getOperand(2).getMetadata();
+      bool IsIndirect = DBG->isIndirectDebugValue();
+      uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0;
       DebugLoc DL;
       if (MI == MBB->end()) {
         // If MI is at basic block end then use last instruction's location.
         MachineBasicBlock::iterator EI = MI;
         DL = (--EI)->getDebugLoc();
-      }
-      else
+      } else
         DL = MI->getDebugLoc();
-      if (MachineInstr *NewDV =
-          TII->emitFrameIndexDebugValue(*MF, FI, Offset, MDPtr, DL)) {
-        MachineBasicBlock *MBB = DBG->getParent();
-        MBB->insert(MI, NewDV);
-        DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
-      }
+      MachineBasicBlock *MBB = DBG->getParent();
+      MachineInstr *NewDV =
+          BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::DBG_VALUE))
+              .addFrameIndex(FI).addImm(Offset).addMetadata(MDPtr);
+      (void)NewDV;
+      DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
     }
     // Now this register is spilled there is should not be any DBG_VALUE
     // pointing to this register because they are all pointing to spilled value
@@ -572,7 +569,10 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
   }
 
   // Nothing we can do. Report an error and keep going with a bad allocation.
-  MI->emitError("ran out of registers during register allocation");
+  if (MI->isInlineAsm())
+    MI->emitError("inline assembly requires more registers than available");
+  else
+    MI->emitError("ran out of registers during register allocation");
   definePhysReg(MI, *AO.begin(), regFree);
   return assignVirtToPhysReg(VirtReg, *AO.begin());
 }
@@ -859,25 +859,21 @@ void RAFast::AllocateBasicBlock() {
             }
             else {
               // Modify DBG_VALUE now that the value is in a spill slot.
-              int64_t Offset = MI->getOperand(1).getImm();
+              bool IsIndirect = MI->isIndirectDebugValue();
+              uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
               const MDNode *MDPtr =
                 MI->getOperand(MI->getNumOperands()-1).getMetadata();
               DebugLoc DL = MI->getDebugLoc();
-              if (MachineInstr *NewDV =
-                  TII->emitFrameIndexDebugValue(*MF, SS, Offset, MDPtr, DL)) {
-                DEBUG(dbgs() << "Modifying debug info due to spill:" <<
-                      "\t" << *MI);
-                MachineBasicBlock *MBB = MI->getParent();
-                MBB->insert(MBB->erase(MI), NewDV);
-                // Scan NewDV operands from the beginning.
-                MI = NewDV;
-                ScanDbgValue = true;
-                break;
-              } else {
-                // We can't allocate a physreg for a DebugValue; sorry!
-                DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
-                MO.setReg(0);
-              }
+              MachineBasicBlock *MBB = MI->getParent();
+              MachineInstr *NewDV = BuildMI(*MBB, MBB->erase(MI), DL,
+                                            TII->get(TargetOpcode::DBG_VALUE))
+                  .addFrameIndex(SS).addImm(Offset).addMetadata(MDPtr);
+              DEBUG(dbgs() << "Modifying debug info due to spill:"
+                           << "\t" << *NewDV);
+              // Scan NewDV operands from the beginning.
+              MI = NewDV;
+              ScanDbgValue = true;
+              break;
             }
           }
           LiveDbgValueMap[Reg].push_back(MI);
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 9eed1fc..c08d955 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/LiveRangeEdit.h"
 #include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -71,6 +72,7 @@ class RAGreedy : public MachineFunctionPass,
 
   // analyses
   SlotIndexes *Indexes;
+  MachineBlockFrequencyInfo *MBFI;
   MachineDominatorTree *DomTree;
   MachineLoopInfo *Loops;
   EdgeBundles *Bundles;
@@ -118,7 +120,9 @@ class RAGreedy : public MachineFunctionPass,
     RS_Done
   };
 
+#ifndef NDEBUG
   static const char *const StageName[];
+#endif
 
   // RegInfo - Keep additional information about each live range.
   struct RegInfo {
@@ -145,7 +149,7 @@ class RAGreedy : public MachineFunctionPass,
   void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
     ExtraRegInfo.resize(MRI->getNumVirtRegs());
     for (;Begin != End; ++Begin) {
-      unsigned Reg = (*Begin)->reg;
+      unsigned Reg = *Begin;
       if (ExtraRegInfo[Reg].Stage == RS_New)
         ExtraRegInfo[Reg].Stage = NewStage;
     }
@@ -158,6 +162,8 @@ class RAGreedy : public MachineFunctionPass,
 
     EvictionCost(unsigned B = 0) : BrokenHints(B), MaxWeight(0) {}
 
+    bool isMax() const { return BrokenHints == ~0u; }
+
     bool operator<(const EvictionCost &O) const {
       if (BrokenHints != O.BrokenHints)
         return BrokenHints < O.BrokenHints;
@@ -216,7 +222,7 @@ class RAGreedy : public MachineFunctionPass,
   /// class.
   SmallVector<GlobalSplitCandidate, 32> GlobalCand;
 
-  enum { NoCand = ~0u };
+  enum LLVM_ENUM_INT_TYPE(unsigned) { NoCand = ~0u };
 
   /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
   /// NoCand which indicates the stack interval.
@@ -237,7 +243,7 @@ public:
   virtual void enqueue(LiveInterval *LI);
   virtual LiveInterval *dequeue();
   virtual unsigned selectOrSplit(LiveInterval&,
-                                 SmallVectorImpl<LiveInterval*>&);
+                                 SmallVectorImpl<unsigned>&);
 
   /// Perform register allocation.
   virtual bool runOnMachineFunction(MachineFunction &mf);
@@ -249,33 +255,34 @@ private:
   void LRE_WillShrinkVirtReg(unsigned);
   void LRE_DidCloneVirtReg(unsigned, unsigned);
 
-  float calcSpillCost();
-  bool addSplitConstraints(InterferenceCache::Cursor, float&);
+  BlockFrequency calcSpillCost();
+  bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency&);
   void addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
   void growRegion(GlobalSplitCandidate &Cand);
-  float calcGlobalSplitCost(GlobalSplitCandidate&);
+  BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate&);
   bool calcCompactRegion(GlobalSplitCandidate&);
   void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>);
   void calcGapWeights(unsigned, SmallVectorImpl<float>&);
+  unsigned canReassign(LiveInterval &VirtReg, unsigned PhysReg);
   bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
   bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&);
   void evictInterference(LiveInterval&, unsigned,
-                         SmallVectorImpl<LiveInterval*>&);
+                         SmallVectorImpl<unsigned>&);
 
   unsigned tryAssign(LiveInterval&, AllocationOrder&,
-                     SmallVectorImpl<LiveInterval*>&);
+                     SmallVectorImpl<unsigned>&);
   unsigned tryEvict(LiveInterval&, AllocationOrder&,
-                    SmallVectorImpl<LiveInterval*>&, unsigned = ~0u);
+                    SmallVectorImpl<unsigned>&, unsigned = ~0u);
   unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
-                          SmallVectorImpl<LiveInterval*>&);
+                          SmallVectorImpl<unsigned>&);
   unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
-                         SmallVectorImpl<LiveInterval*>&);
+                         SmallVectorImpl<unsigned>&);
   unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
-                               SmallVectorImpl<LiveInterval*>&);
+                               SmallVectorImpl<unsigned>&);
   unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
-    SmallVectorImpl<LiveInterval*>&);
+    SmallVectorImpl<unsigned>&);
   unsigned trySplit(LiveInterval&, AllocationOrder&,
-                    SmallVectorImpl<LiveInterval*>&);
+                    SmallVectorImpl<unsigned>&);
 };
 } // end anonymous namespace
 
@@ -308,7 +315,6 @@ RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
   initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
   initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
   initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
-  initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
   initializeLiveStacksPass(*PassRegistry::getPassRegistry());
   initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
   initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
@@ -320,6 +326,8 @@ RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
 
 void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
+  AU.addRequired<MachineBlockFrequencyInfo>();
+  AU.addPreserved<MachineBlockFrequencyInfo>();
   AU.addRequired<AliasAnalysis>();
   AU.addPreserved<AliasAnalysis>();
   AU.addRequired<LiveIntervals>();
@@ -330,7 +338,6 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<LiveDebugVariables>();
   AU.addRequired<LiveStacks>();
   AU.addPreserved<LiveStacks>();
-  AU.addRequired<CalculateSpillWeights>();
   AU.addRequired<MachineDominatorTree>();
   AU.addPreserved<MachineDominatorTree>();
   AU.addRequired<MachineLoopInfo>();
@@ -407,15 +414,28 @@ void RAGreedy::enqueue(LiveInterval *LI) {
     // everything else has been allocated.
     Prio = Size;
   } else {
-    // Everything is allocated in long->short order. Long ranges that don't fit
-    // should be spilled (or split) ASAP so they don't create interference.
-    Prio = (1u << 31) + Size;
+    if (ExtraRegInfo[Reg].Stage == RS_Assign && !LI->empty() &&
+        LIS->intervalIsInOneMBB(*LI)) {
+      // Allocate original local ranges in linear instruction order. Since they
+      // are singly defined, this produces optimal coloring in the absence of
+      // global interference and other constraints.
+      Prio = LI->beginIndex().getInstrDistance(Indexes->getLastIndex());
+    }
+    else {
+      // Allocate global and split ranges in long->short order. Long ranges that
+      // don't fit should be spilled (or split) ASAP so they don't create
+      // interference.  Mark a bit to prioritize global above local ranges.
+      Prio = (1u << 29) + Size;
+    }
+    // Mark a higher bit to prioritize global and local above RS_Split.
+    Prio |= (1u << 31);
 
     // Boost ranges that have a physical register hint.
     if (VRM->hasKnownPreference(Reg))
       Prio |= (1u << 30);
   }
-
+  // The virtual register number is a tie breaker for same-sized ranges.
+  // Give lower vreg numbers higher priority to assign them first.
   Queue.push(std::make_pair(Prio, ~Reg));
 }
 
@@ -435,7 +455,7 @@ LiveInterval *RAGreedy::dequeue() {
 /// tryAssign - Try to assign VirtReg to an available register.
 unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
                              AllocationOrder &Order,
-                             SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                             SmallVectorImpl<unsigned> &NewVRegs) {
   Order.rewind();
   unsigned PhysReg;
   while ((PhysReg = Order.next()))
@@ -476,6 +496,31 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
 //                         Interference eviction
 //===----------------------------------------------------------------------===//
 
+unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
+  AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
+  unsigned PhysReg;
+  while ((PhysReg = Order.next())) {
+    if (PhysReg == PrevReg)
+      continue;
+
+    MCRegUnitIterator Units(PhysReg, TRI);
+    for (; Units.isValid(); ++Units) {
+      // Instantiate a "subquery", not to be confused with the Queries array.
+      LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]);
+      if (subQ.checkInterference())
+        break;
+    }
+    // If no units have interference, break out with the current PhysReg.
+    if (!Units.isValid())
+      break;
+  }
+  if (PhysReg)
+    DEBUG(dbgs() << "can reassign: " << VirtReg << " from "
+          << PrintReg(PrevReg, TRI) << " to " << PrintReg(PhysReg, TRI)
+          << '\n');
+  return PhysReg;
+}
+
 /// shouldEvict - determine if A should evict the assigned live range B. The
 /// eviction policy defined by this function together with the allocation order
 /// defined by enqueue() decides which registers ultimately end up being split
@@ -516,6 +561,8 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
   if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg)
     return false;
 
+  bool IsLocal = LIS->intervalIsInOneMBB(VirtReg);
+
   // Find VirtReg's cascade number. This will be unassigned if VirtReg was never
   // involved in an eviction before. If a cascade number was assigned, deny
   // evicting anything with the same or a newer cascade number. This prevents
@@ -569,8 +616,17 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
       // Abort if this would be too expensive.
       if (!(Cost < MaxCost))
         return false;
+      if (Urgent)
+        continue;
+      // If !MaxCost.isMax(), then we're just looking for a cheap register.
+      // Evicting another local live range in this case could lead to suboptimal
+      // coloring.
+      if (!MaxCost.isMax() && IsLocal && LIS->intervalIsInOneMBB(*Intf) &&
+          !canReassign(*Intf, PhysReg)) {
+        return false;
+      }
       // Finally, apply the eviction policy for non-urgent evictions.
-      if (!Urgent && !shouldEvict(VirtReg, IsHint, *Intf, BreaksHint))
+      if (!shouldEvict(VirtReg, IsHint, *Intf, BreaksHint))
         return false;
     }
   }
@@ -582,7 +638,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
 /// from being assigned to Physreg. This assumes that canEvictInterference
 /// returned true.
 void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   // Make sure that VirtReg has a cascade number, and assign that cascade
   // number to every evicted register. These live ranges than then only be
   // evicted by a newer cascade, preventing infinite loops.
@@ -614,7 +670,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
            "Cannot decrease cascade number, illegal eviction");
     ExtraRegInfo[Intf->reg].Cascade = Cascade;
     ++NumEvicted;
-    NewVRegs.push_back(Intf);
+    NewVRegs.push_back(Intf->reg);
   }
 }
 
@@ -624,7 +680,7 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
 /// @return         Physreg to assign VirtReg, or 0.
 unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
                             AllocationOrder &Order,
-                            SmallVectorImpl<LiveInterval*> &NewVRegs,
+                            SmallVectorImpl<unsigned> &NewVRegs,
                             unsigned CostPerUseLimit) {
   NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled);
 
@@ -699,12 +755,12 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
 /// that all preferences in SplitConstraints are met.
 /// Return false if there are no bundles with positive bias.
 bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
-                                   float &Cost) {
+                                   BlockFrequency &Cost) {
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
 
   // Reset interference dependent info.
   SplitConstraints.resize(UseBlocks.size());
-  float StaticCost = 0;
+  BlockFrequency StaticCost = 0;
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
     const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
     SpillPlacement::BlockConstraint &BC = SplitConstraints[i];
@@ -713,7 +769,7 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
     Intf.moveToBlock(BC.Number);
     BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
     BC.Exit = BI.LiveOut ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
-    BC.ChangesValue = BI.FirstDef;
+    BC.ChangesValue = BI.FirstDef.isValid();
 
     if (!Intf.hasInterference())
       continue;
@@ -742,8 +798,8 @@ bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
     }
 
     // Accumulate the total frequency of inserted spill code.
-    if (Ins)
-      StaticCost += Ins * SpillPlacer->getBlockFrequency(BC.Number);
+    while (Ins--)
+      StaticCost += SpillPlacer->getBlockFrequency(BC.Number);
   }
   Cost = StaticCost;
 
@@ -876,7 +932,7 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
   SpillPlacer->prepare(Cand.LiveBundles);
 
   // The static split cost will be zero since Cand.Intf reports no interference.
-  float Cost;
+  BlockFrequency Cost;
   if (!addSplitConstraints(Cand.Intf, Cost)) {
     DEBUG(dbgs() << ", none.\n");
     return false;
@@ -901,8 +957,8 @@ bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
 
 /// calcSpillCost - Compute how expensive it would be to split the live range in
 /// SA around all use blocks instead of forming bundle regions.
-float RAGreedy::calcSpillCost() {
-  float Cost = 0;
+BlockFrequency RAGreedy::calcSpillCost() {
+  BlockFrequency Cost = 0;
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
     const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
@@ -921,8 +977,8 @@ float RAGreedy::calcSpillCost() {
 /// pattern in LiveBundles. This cost should be added to the local cost of the
 /// interference pattern in SplitConstraints.
 ///
-float RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
-  float GlobalCost = 0;
+BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
+  BlockFrequency GlobalCost = 0;
   const BitVector &LiveBundles = Cand.LiveBundles;
   ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
   for (unsigned i = 0; i != UseBlocks.size(); ++i) {
@@ -936,8 +992,8 @@ float RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
       Ins += RegIn != (BC.Entry == SpillPlacement::PrefReg);
     if (BI.LiveOut)
       Ins += RegOut != (BC.Exit == SpillPlacement::PrefReg);
-    if (Ins)
-      GlobalCost += Ins * SpillPlacer->getBlockFrequency(BC.Number);
+    while (Ins--)
+      GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
   }
 
   for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) {
@@ -949,8 +1005,10 @@ float RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
     if (RegIn && RegOut) {
       // We need double spill code if this block has interference.
       Cand.Intf.moveToBlock(Number);
-      if (Cand.Intf.hasInterference())
-        GlobalCost += 2*SpillPlacer->getBlockFrequency(Number);
+      if (Cand.Intf.hasInterference()) {
+        GlobalCost += SpillPlacer->getBlockFrequency(Number);
+        GlobalCost += SpillPlacer->getBlockFrequency(Number);
+      }
       continue;
     }
     // live-in / stack-out or stack-in live-out.
@@ -1067,7 +1125,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
 
   SmallVector<unsigned, 8> IntvMap;
   SE->finish(&IntvMap);
-  DebugVars->splitRegister(Reg, LREdit.regs());
+  DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
 
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
   unsigned OrigBlocks = SA->getNumLiveBlocks();
@@ -1078,7 +1136,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
   // - Block-local splits are candidates for local splitting.
   // - DCE leftovers should go back on the queue.
   for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
-    LiveInterval &Reg = *LREdit.get(i);
+    LiveInterval &Reg = LIS->getInterval(LREdit.get(i));
 
     // Ignore old intervals from DCE.
     if (getStage(Reg) != RS_New)
@@ -1112,10 +1170,10 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
 }
 
 unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                                  SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                  SmallVectorImpl<unsigned> &NewVRegs) {
   unsigned NumCands = 0;
   unsigned BestCand = NoCand;
-  float BestCost;
+  BlockFrequency BestCost;
   SmallVector<unsigned, 8> UsedCands;
 
   // Check if we can split this live range around a compact region.
@@ -1123,11 +1181,11 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   if (HasCompact) {
     // Yes, keep GlobalCand[0] as the compact region candidate.
     NumCands = 1;
-    BestCost = HUGE_VALF;
+    BestCost = BlockFrequency::getMaxFrequency();
   } else {
     // No benefit from the compact region, our fallback will be per-block
     // splitting. Make sure we find a solution that is cheaper than spilling.
-    BestCost = Hysteresis * calcSpillCost();
+    BestCost = calcSpillCost();
     DEBUG(dbgs() << "Cost of isolating all blocks = " << BestCost << '\n');
   }
 
@@ -1157,7 +1215,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     Cand.reset(IntfCache, PhysReg);
 
     SpillPlacer->prepare(Cand.LiveBundles);
-    float Cost;
+    BlockFrequency Cost;
     if (!addSplitConstraints(Cand.Intf, Cost)) {
       DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tno positive bundles\n");
       continue;
@@ -1193,7 +1251,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     });
     if (Cost < BestCost) {
       BestCand = NumCands;
-      BestCost = Hysteresis * Cost; // Prevent rounding effects.
+      BestCost = Cost;
     }
     ++NumCands;
   }
@@ -1247,7 +1305,7 @@ unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 /// creates a lot of local live ranges, that will be split by tryLocalSplit if
 /// they don't allocate.
 unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
   unsigned Reg = VirtReg.reg;
   bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
@@ -1268,14 +1326,14 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   SE->finish(&IntvMap);
 
   // Tell LiveDebugVariables about the new ranges.
-  DebugVars->splitRegister(Reg, LREdit.regs());
+  DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
 
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
 
   // Sort out the new intervals created by splitting. The remainder interval
   // goes straight to spilling, the new local ranges get to stay RS_New.
   for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
-    LiveInterval &LI = *LREdit.get(i);
+    LiveInterval &LI = LIS->getInterval(LREdit.get(i));
     if (getStage(LI) == RS_New && IntvMap[i] == 0)
       setStage(LI, RS_Spill);
   }
@@ -1299,7 +1357,7 @@ unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 /// This is similar to spilling to a larger register class.
 unsigned
 RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                              SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                              SmallVectorImpl<unsigned> &NewVRegs) {
   // There is no point to this if there are no larger sub-classes.
   if (!RegClassInfo.isProperSubClass(MRI->getRegClass(VirtReg.reg)))
     return 0;
@@ -1335,7 +1393,7 @@ RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 
   SmallVector<unsigned, 8> IntvMap;
   SE->finish(&IntvMap);
-  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
 
   // Assign all new registers to RS_Spill. This was the last chance.
@@ -1406,9 +1464,9 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
 
   // Add fixed interference.
   for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    const LiveInterval &LI = LIS->getRegUnit(*Units);
-    LiveInterval::const_iterator I = LI.find(StartIdx);
-    LiveInterval::const_iterator E = LI.end();
+    const LiveRange &LR = LIS->getRegUnit(*Units);
+    LiveRange::const_iterator I = LR.find(StartIdx);
+    LiveRange::const_iterator E = LR.end();
 
     // Same loop as above. Mark any overlapped gaps as HUGE_VALF.
     for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
@@ -1419,7 +1477,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
         break;
 
       for (; Gap != NumGaps; ++Gap) {
-        GapWeight[Gap] = HUGE_VALF;
+        GapWeight[Gap] = llvm::huge_valf;
         if (Uses[Gap+1].getBaseIndex() >= I->end)
           break;
       }
@@ -1433,7 +1491,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
 /// basic block.
 ///
 unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
   const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
 
@@ -1511,7 +1569,9 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   unsigned BestAfter = 0;
   float BestDiff = 0;
 
-  const float blockFreq = SpillPlacer->getBlockFrequency(BI.MBB->getNumber());
+  const float blockFreq =
+    SpillPlacer->getBlockFrequency(BI.MBB->getNumber()).getFrequency() *
+    (1.0f / BlockFrequency::getEntryFrequency());
   SmallVector<float, 8> GapWeight;
 
   Order.rewind();
@@ -1523,7 +1583,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     // Remove any gaps with regmask clobbers.
     if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
       for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i)
-        GapWeight[RegMaskGaps[i]] = HUGE_VALF;
+        GapWeight[RegMaskGaps[i]] = llvm::huge_valf;
 
     // Try to find the best sequence of gaps to close.
     // The new spill weight must be larger than any gap interference.
@@ -1558,7 +1618,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
       // Legally, without causing looping?
       bool Legal = !ProgressRequired || NewGaps < NumGaps;
 
-      if (Legal && MaxGap < HUGE_VALF) {
+      if (Legal && MaxGap < llvm::huge_valf) {
         // Estimate the new spill weight. Each instruction reads or writes the
         // register. Conservatively assume there are no read-modify-write
         // instructions.
@@ -1625,7 +1685,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
   SE->useIntv(SegStart, SegStop);
   SmallVector<unsigned, 8> IntvMap;
   SE->finish(&IntvMap);
-  DebugVars->splitRegister(VirtReg.reg, LREdit.regs());
+  DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
 
   // If the new range has the same number of instructions as before, mark it as
   // RS_Split2 so the next split will be forced to make progress. Otherwise,
@@ -1638,8 +1698,8 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
     assert(!ProgressRequired && "Didn't make progress when it was required.");
     for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
       if (IntvMap[i] == 1) {
-        setStage(*LREdit.get(i), RS_Split2);
-        DEBUG(dbgs() << PrintReg(LREdit.get(i)->reg));
+        setStage(LIS->getInterval(LREdit.get(i)), RS_Split2);
+        DEBUG(dbgs() << PrintReg(LREdit.get(i)));
       }
     DEBUG(dbgs() << '\n');
   }
@@ -1656,7 +1716,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
 /// assignable.
 /// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
 unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
-                            SmallVectorImpl<LiveInterval*>&NewVRegs) {
+                            SmallVectorImpl<unsigned>&NewVRegs) {
   // Ranges must be Split2 or less.
   if (getStage(VirtReg) >= RS_Spill)
     return 0;
@@ -1705,7 +1765,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
 //===----------------------------------------------------------------------===//
 
 unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
-                                 SmallVectorImpl<LiveInterval*> &NewVRegs) {
+                                 SmallVectorImpl<unsigned> &NewVRegs) {
   // First try assigning a free register.
   AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo);
   if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
@@ -1730,7 +1790,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
   if (Stage < RS_Split) {
     setStage(VirtReg, RS_Split);
     DEBUG(dbgs() << "wait for second round\n");
-    NewVRegs.push_back(&VirtReg);
+    NewVRegs.push_back(VirtReg.reg);
     return 0;
   }
 
@@ -1770,6 +1830,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
                      getAnalysis<LiveIntervals>(),
                      getAnalysis<LiveRegMatrix>());
   Indexes = &getAnalysis<SlotIndexes>();
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   DomTree = &getAnalysis<MachineDominatorTree>();
   SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
   Loops = &getAnalysis<MachineLoopInfo>();
@@ -1777,8 +1838,12 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   SpillPlacer = &getAnalysis<SpillPlacement>();
   DebugVars = &getAnalysis<LiveDebugVariables>();
 
+  calculateSpillWeightsAndHints(*LIS, mf, *Loops, *MBFI);
+
+  DEBUG(LIS->dump());
+
   SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
-  SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree));
+  SE.reset(new SplitEditor(*SA, *LIS, *VRM, *DomTree, *MBFI));
   ExtraRegInfo.clear();
   ExtraRegInfo.resize(MRI->getNumVirtRegs());
   NextCascade = 1;
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 15a88e2..88c8201 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -40,6 +40,7 @@
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -94,9 +95,7 @@ public:
       : MachineFunctionPass(ID), builder(b.take()), customPassID(cPassID) {
     initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
     initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
-    initializeCalculateSpillWeightsPass(*PassRegistry::getPassRegistry());
     initializeLiveStacksPass(*PassRegistry::getPassRegistry());
-    initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
     initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
   }
 
@@ -130,8 +129,8 @@ private:
   const TargetMachine *tm;
   const TargetRegisterInfo *tri;
   const TargetInstrInfo *tii;
-  const MachineLoopInfo *loopInfo;
   MachineRegisterInfo *mri;
+  const MachineBlockFrequencyInfo *mbfi;
 
   OwningPtr<Spiller> spiller;
   LiveIntervals *lis;
@@ -158,13 +157,13 @@ char RegAllocPBQP::ID = 0;
 
 } // End anonymous namespace.
 
-unsigned PBQPRAProblem::getVRegForNode(PBQP::Graph::ConstNodeItr node) const {
+unsigned PBQPRAProblem::getVRegForNode(PBQP::Graph::NodeId node) const {
   Node2VReg::const_iterator vregItr = node2VReg.find(node);
   assert(vregItr != node2VReg.end() && "No vreg for node.");
   return vregItr->second;
 }
 
-PBQP::Graph::NodeItr PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
+PBQP::Graph::NodeId PBQPRAProblem::getNodeForVReg(unsigned vreg) const {
   VReg2Node::const_iterator nodeItr = vreg2Node.find(vreg);
   assert(nodeItr != vreg2Node.end() && "No node for vreg.");
   return nodeItr->second;
@@ -188,7 +187,7 @@ unsigned PBQPRAProblem::getPRegForOption(unsigned vreg, unsigned option) const {
 }
 
 PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
-                                  const MachineLoopInfo *loopInfo,
+                                  const MachineBlockFrequencyInfo *mbfi,
                                   const RegSet &vregs) {
 
   LiveIntervals *LIS = const_cast<LiveIntervals*>(lis);
@@ -247,7 +246,7 @@ PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
     }
 
     // Construct the node.
-    PBQP::Graph::NodeItr node =
+    PBQP::Graph::NodeId node =
       g.addNode(PBQP::Vector(vrAllowed.size() + 1, 0));
 
     // Record the mapping and allowed set in the problem.
@@ -273,7 +272,7 @@ PBQPRAProblem *PBQPBuilder::build(MachineFunction *mf, const LiveIntervals *lis,
 
       assert(!l2.empty() && "Empty interval in vreg set?");
       if (l1.overlaps(l2)) {
-        PBQP::Graph::EdgeItr edge =
+        PBQP::Graph::EdgeId edge =
           g.addEdge(p->getNodeForVReg(vr1), p->getNodeForVReg(vr2),
                     PBQP::Matrix(vr1Allowed.size()+1, vr2Allowed.size()+1, 0));
 
@@ -313,10 +312,10 @@ void PBQPBuilder::addInterferenceCosts(
 
 PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf,
                                                 const LiveIntervals *lis,
-                                                const MachineLoopInfo *loopInfo,
+                                                const MachineBlockFrequencyInfo *mbfi,
                                                 const RegSet &vregs) {
 
-  OwningPtr<PBQPRAProblem> p(PBQPBuilder::build(mf, lis, loopInfo, vregs));
+  OwningPtr<PBQPRAProblem> p(PBQPBuilder::build(mf, lis, mbfi, vregs));
   PBQP::Graph &g = p->getGraph();
 
   const TargetMachine &tm = mf->getTarget();
@@ -350,7 +349,7 @@ PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf,
 
       PBQP::PBQPNum cBenefit =
         copyFactor * LiveIntervals::getSpillWeight(false, true,
-                                                   loopInfo->getLoopDepth(mbb));
+                                                   mbfi->getBlockFreq(mbb));
 
       if (cp.isPhys()) {
         if (!mf->getRegInfo().isAllocatable(dst)) {
@@ -364,16 +363,16 @@ PBQPRAProblem *PBQPBuilderWithCoalescing::build(MachineFunction *mf,
         }
         if (pregOpt < allowed.size()) {
           ++pregOpt; // +1 to account for spill option.
-          PBQP::Graph::NodeItr node = p->getNodeForVReg(src);
+          PBQP::Graph::NodeId node = p->getNodeForVReg(src);
           addPhysRegCoalesce(g.getNodeCosts(node), pregOpt, cBenefit);
         }
       } else {
         const PBQPRAProblem::AllowedSet *allowed1 = &p->getAllowedSet(dst);
         const PBQPRAProblem::AllowedSet *allowed2 = &p->getAllowedSet(src);
-        PBQP::Graph::NodeItr node1 = p->getNodeForVReg(dst);
-        PBQP::Graph::NodeItr node2 = p->getNodeForVReg(src);
-        PBQP::Graph::EdgeItr edge = g.findEdge(node1, node2);
-        if (edge == g.edgesEnd()) {
+        PBQP::Graph::NodeId node1 = p->getNodeForVReg(dst);
+        PBQP::Graph::NodeId node2 = p->getNodeForVReg(src);
+        PBQP::Graph::EdgeId edge = g.findEdge(node1, node2);
+        if (edge == g.invalidEdgeId()) {
           edge = g.addEdge(node1, node2, PBQP::Matrix(allowed1->size() + 1,
                                                       allowed2->size() + 1,
                                                       0));
@@ -432,13 +431,14 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
   //au.addRequiredID(SplitCriticalEdgesID);
   if (customPassID)
     au.addRequiredID(*customPassID);
-  au.addRequired<CalculateSpillWeights>();
   au.addRequired<LiveStacks>();
   au.addPreserved<LiveStacks>();
-  au.addRequired<MachineDominatorTree>();
-  au.addPreserved<MachineDominatorTree>();
+  au.addRequired<MachineBlockFrequencyInfo>();
+  au.addPreserved<MachineBlockFrequencyInfo>();
   au.addRequired<MachineLoopInfo>();
   au.addPreserved<MachineLoopInfo>();
+  au.addRequired<MachineDominatorTree>();
+  au.addPreserved<MachineDominatorTree>();
   au.addRequired<VirtRegMap>();
   au.addPreserved<VirtRegMap>();
   MachineFunctionPass::getAnalysisUsage(au);
@@ -475,11 +475,11 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
   const PBQP::Graph &g = problem.getGraph();
   // Iterate over the nodes mapping the PBQP solution to a register
   // assignment.
-  for (PBQP::Graph::ConstNodeItr node = g.nodesBegin(),
-                                 nodeEnd = g.nodesEnd();
-       node != nodeEnd; ++node) {
-    unsigned vreg = problem.getVRegForNode(node);
-    unsigned alloc = solution.getSelection(node);
+  for (PBQP::Graph::NodeItr nodeItr = g.nodesBegin(),
+                            nodeEnd = g.nodesEnd();
+       nodeItr != nodeEnd; ++nodeItr) {
+    unsigned vreg = problem.getVRegForNode(*nodeItr);
+    unsigned alloc = solution.getSelection(*nodeItr);
 
     if (problem.isPRegOption(vreg, alloc)) {
       unsigned preg = problem.getPRegForOption(vreg, alloc);
@@ -489,7 +489,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
       vrm->assignVirt2Phys(vreg, preg);
     } else if (problem.isSpillOption(vreg, alloc)) {
       vregsToAlloc.erase(vreg);
-      SmallVector<LiveInterval*, 8> newSpills;
+      SmallVector<unsigned, 8> newSpills;
       LiveRangeEdit LRE(&lis->getInterval(vreg), newSpills, *mf, *lis, vrm);
       spiller->spill(LRE);
 
@@ -500,9 +500,10 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAProblem &problem,
       // allocate.
       for (LiveRangeEdit::iterator itr = LRE.begin(), end = LRE.end();
            itr != end; ++itr) {
-        assert(!(*itr)->empty() && "Empty spill range.");
-        DEBUG(dbgs() << PrintReg((*itr)->reg, tri) << " ");
-        vregsToAlloc.insert((*itr)->reg);
+        LiveInterval &li = lis->getInterval(*itr);
+        assert(!li.empty() && "Empty spill range.");
+        DEBUG(dbgs() << PrintReg(li.reg, tri) << " ");
+        vregsToAlloc.insert(li.reg);
       }
 
       DEBUG(dbgs() << ")\n");
@@ -546,7 +547,10 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
 
   lis = &getAnalysis<LiveIntervals>();
   lss = &getAnalysis<LiveStacks>();
-  loopInfo = &getAnalysis<MachineLoopInfo>();
+  mbfi = &getAnalysis<MachineBlockFrequencyInfo>();
+
+  calculateSpillWeightsAndHints(*lis, MF, getAnalysis<MachineLoopInfo>(),
+                                *mbfi);
 
   vrm = &getAnalysis<VirtRegMap>();
   spiller.reset(createInlineSpiller(*this, MF, *vrm));
@@ -584,7 +588,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
       DEBUG(dbgs() << "  PBQP Regalloc round " << round << ":\n");
 
       OwningPtr<PBQPRAProblem> problem(
-        builder->build(mf, lis, loopInfo, vregsToAlloc));
+        builder->build(mf, lis, mbfi, vregsToAlloc));
 
 #ifndef NDEBUG
       if (pbqpDumpGraphs) {
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 87382d8..cacd7de 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -40,6 +40,9 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
   if (MF->getTarget().getRegisterInfo() != TRI) {
     TRI = MF->getTarget().getRegisterInfo();
     RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
+    unsigned NumPSets = TRI->getNumRegPressureSets();
+    PSetLimits.reset(new unsigned[NumPSets]);
+    std::fill(&PSetLimits[0], &PSetLimits[NumPSets], 0);
     Update = true;
   }
 
@@ -144,3 +147,32 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
   RCI.Tag = Tag;
 }
 
+/// This is not accurate because two overlapping register sets may have some
+/// nonoverlapping reserved registers. However, computing the allocation order
+/// for all register classes would be too expensive.
+unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
+  const TargetRegisterClass *RC = 0;
+  unsigned NumRCUnits = 0;
+  for (TargetRegisterInfo::regclass_iterator
+         RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) {
+    const int *PSetID = TRI->getRegClassPressureSets(*RI);
+    for (; *PSetID != -1; ++PSetID) {
+      if ((unsigned)*PSetID == Idx)
+        break;
+    }
+    if (*PSetID == -1)
+      continue;
+
+    // Found a register class that counts against this pressure set.
+    // For efficiency, only compute the set order for the largest set.
+    unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit;
+    if (!RC || NUnits > NumRCUnits) {
+      RC = *RI;
+      NumRCUnits = NUnits;
+    }
+  }
+  compute(RC);
+  unsigned NReserved = RC->getNumRegs() - getNumAllocatableRegs(RC);
+  return TRI->getRegPressureSetLimit(Idx)
+    - TRI->getRegClassWeight(RC).RegWeight * NReserved;
+}
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index d85646d..dd86c1f 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -166,7 +166,8 @@ namespace {
 
     /// reMaterializeTrivialDef - If the source of a copy is defined by a
     /// trivial computation, replace the copy by rematerialize the definition.
-    bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI);
+    bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI,
+                                 bool &IsDefCopy);
 
     /// canJoinPhys - Return true if a physreg copy should be joined.
     bool canJoinPhys(const CoalescerPair &CP);
@@ -397,7 +398,7 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 void RegisterCoalescer::eliminateDeadDefs() {
-  SmallVector<LiveInterval*, 8> NewRegs;
+  SmallVector<unsigned, 8> NewRegs;
   LiveRangeEdit(0, NewRegs, *MF, *LIS, 0, this).eliminateDeadDefs(DeadDefs);
 }
 
@@ -433,11 +434,11 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
     LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
   SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
 
-  // BValNo is a value number in B that is defined by a copy from A.  'B3' in
+  // BValNo is a value number in B that is defined by a copy from A.  'B1' in
   // the example above.
-  LiveInterval::iterator BLR = IntB.FindLiveRangeContaining(CopyIdx);
-  if (BLR == IntB.end()) return false;
-  VNInfo *BValNo = BLR->valno;
+  LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
+  if (BS == IntB.end()) return false;
+  VNInfo *BValNo = BS->valno;
 
   // Get the location that B is defined at.  Two options: either this value has
   // an unknown definition point or it is defined at CopyIdx.  If unknown, we
@@ -446,10 +447,10 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
 
   // AValNo is the value number in A that defines the copy, A3 in the example.
   SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
-  LiveInterval::iterator ALR = IntA.FindLiveRangeContaining(CopyUseIdx);
-  // The live range might not exist after fun with physreg coalescing.
-  if (ALR == IntA.end()) return false;
-  VNInfo *AValNo = ALR->valno;
+  LiveInterval::iterator AS = IntA.FindSegmentContaining(CopyUseIdx);
+  // The live segment might not exist after fun with physreg coalescing.
+  if (AS == IntA.end()) return false;
+  VNInfo *AValNo = AS->valno;
 
   // If AValNo is defined as a copy from IntB, we can potentially process this.
   // Get the instruction that defines this value number.
@@ -458,54 +459,54 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
   if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
     return false;
 
-  // Get the LiveRange in IntB that this value number starts with.
-  LiveInterval::iterator ValLR =
-    IntB.FindLiveRangeContaining(AValNo->def.getPrevSlot());
-  if (ValLR == IntB.end())
+  // Get the Segment in IntB that this value number starts with.
+  LiveInterval::iterator ValS =
+    IntB.FindSegmentContaining(AValNo->def.getPrevSlot());
+  if (ValS == IntB.end())
     return false;
 
-  // Make sure that the end of the live range is inside the same block as
+  // Make sure that the end of the live segment is inside the same block as
   // CopyMI.
-  MachineInstr *ValLREndInst =
-    LIS->getInstructionFromIndex(ValLR->end.getPrevSlot());
-  if (!ValLREndInst || ValLREndInst->getParent() != CopyMI->getParent())
+  MachineInstr *ValSEndInst =
+    LIS->getInstructionFromIndex(ValS->end.getPrevSlot());
+  if (!ValSEndInst || ValSEndInst->getParent() != CopyMI->getParent())
     return false;
 
-  // Okay, we now know that ValLR ends in the same block that the CopyMI
-  // live-range starts.  If there are no intervening live ranges between them in
-  // IntB, we can merge them.
-  if (ValLR+1 != BLR) return false;
+  // Okay, we now know that ValS ends in the same block that the CopyMI
+  // live-range starts.  If there are no intervening live segments between them
+  // in IntB, we can merge them.
+  if (ValS+1 != BS) return false;
 
   DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));
 
-  SlotIndex FillerStart = ValLR->end, FillerEnd = BLR->start;
+  SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
   // We are about to delete CopyMI, so need to remove it as the 'instruction
   // that defines this value #'. Update the valnum with the new defining
   // instruction #.
   BValNo->def = FillerStart;
 
   // Okay, we can merge them.  We need to insert a new liverange:
-  // [ValLR.end, BLR.begin) of either value number, then we merge the
+  // [ValS.end, BS.begin) of either value number, then we merge the
   // two value numbers.
-  IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo));
+  IntB.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, BValNo));
 
   // Okay, merge "B1" into the same value number as "B0".
-  if (BValNo != ValLR->valno)
-    IntB.MergeValueNumberInto(BValNo, ValLR->valno);
+  if (BValNo != ValS->valno)
+    IntB.MergeValueNumberInto(BValNo, ValS->valno);
   DEBUG(dbgs() << "   result = " << IntB << '\n');
 
   // If the source instruction was killing the source register before the
   // merge, unset the isKill marker given the live range has been extended.
-  int UIdx = ValLREndInst->findRegisterUseOperandIdx(IntB.reg, true);
+  int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg, true);
   if (UIdx != -1) {
-    ValLREndInst->getOperand(UIdx).setIsKill(false);
+    ValSEndInst->getOperand(UIdx).setIsKill(false);
   }
 
   // Rewrite the copy. If the copy instruction was killing the destination
   // register before the merge, find the last use and trim the live range. That
   // will also add the isKill marker.
   CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
-  if (ALR->end == CopyIdx)
+  if (AS->end == CopyIdx)
     LIS->shrinkToUses(&IntA);
 
   ++numExtends;
@@ -526,11 +527,11 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
   for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
        AI != AE; ++AI) {
     if (AI->valno != AValNo) continue;
-    LiveInterval::Ranges::iterator BI =
-      std::upper_bound(IntB.ranges.begin(), IntB.ranges.end(), AI->start);
-    if (BI != IntB.ranges.begin())
+    LiveInterval::iterator BI =
+      std::upper_bound(IntB.begin(), IntB.end(), AI->start);
+    if (BI != IntB.begin())
       --BI;
-    for (; BI != IntB.ranges.end() && AI->end >= BI->start; ++BI) {
+    for (; BI != IntB.end() && AI->end >= BI->start; ++BI) {
       if (BI->valno == BValNo)
         continue;
       if (BI->start <= AI->start && BI->end > AI->start)
@@ -576,14 +577,12 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
   LiveInterval &IntB =
     LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
 
-  // BValNo is a value number in B that is defined by a copy from A. 'B3' in
+  // BValNo is a value number in B that is defined by a copy from A. 'B1' in
   // the example above.
   VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
   if (!BValNo || BValNo->def != CopyIdx)
     return false;
 
-  assert(BValNo->def == CopyIdx && "Copy doesn't define the value?");
-
   // AValNo is the value number in A that defines the copy, A3 in the example.
   VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
   assert(AValNo && "COPY source not live");
@@ -613,7 +612,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
 
   MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
   unsigned NewReg = NewDstMO.getReg();
-  if (NewReg != IntB.reg || !LiveRangeQuery(IntB, AValNo->def).isKill())
+  if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill())
     return false;
 
   // Make sure there are no other definitions of IntB that would reach the
@@ -628,8 +627,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
        UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
     MachineInstr *UseMI = &*UI;
     SlotIndex UseIdx = LIS->getInstructionIndex(UseMI);
-    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
-    if (ULR == IntA.end() || ULR->valno != AValNo)
+    LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+    if (US == IntA.end() || US->valno != AValNo)
       continue;
     // If this use is tied to a def, we can't rewrite the register.
     if (UseMI->isRegTiedToDefOperand(UI.getOperandNo()))
@@ -680,8 +679,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
       continue;
     }
     SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true);
-    LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx);
-    if (ULR == IntA.end() || ULR->valno != AValNo)
+    LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+    if (US == IntA.end() || US->valno != AValNo)
       continue;
     // Kill flags are no longer accurate. They are recomputed after RA.
     UseMO.setIsKill(false);
@@ -711,14 +710,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
     UseMI->eraseFromParent();
   }
 
-  // Extend BValNo by merging in IntA live ranges of AValNo. Val# definition
+  // Extend BValNo by merging in IntA live segments of AValNo. Val# definition
   // is updated.
   VNInfo *ValNo = BValNo;
   ValNo->def = AValNo->def;
   for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
        AI != AE; ++AI) {
     if (AI->valno != AValNo) continue;
-    IntB.addRange(LiveRange(AI->start, AI->end, ValNo));
+    IntB.addSegment(LiveInterval::Segment(AI->start, AI->end, ValNo));
   }
   DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
 
@@ -731,23 +730,29 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
 /// reMaterializeTrivialDef - If the source of a copy is defined by a trivial
 /// computation, replace the copy by rematerialize the definition.
 bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
-                                                MachineInstr *CopyMI) {
+                                                MachineInstr *CopyMI,
+                                                bool &IsDefCopy) {
+  IsDefCopy = false;
   unsigned SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg();
+  unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx();
   unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
+  unsigned DstIdx = CP.isFlipped() ? CP.getSrcIdx() : CP.getDstIdx();
   if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
     return false;
 
   LiveInterval &SrcInt = LIS->getInterval(SrcReg);
-  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot(true);
-  LiveInterval::iterator SrcLR = SrcInt.FindLiveRangeContaining(CopyIdx);
-  assert(SrcLR != SrcInt.end() && "Live range not found!");
-  VNInfo *ValNo = SrcLR->valno;
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI);
+  VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
+  assert(ValNo && "CopyMI input register not live");
   if (ValNo->isPHIDef() || ValNo->isUnused())
     return false;
   MachineInstr *DefMI = LIS->getInstructionFromIndex(ValNo->def);
   if (!DefMI)
     return false;
-  assert(DefMI && "Defining instruction disappeared");
+  if (DefMI->isCopyLike()) {
+    IsDefCopy = true;
+    return false;
+  }
   if (!DefMI->isAsCheapAsAMove())
     return false;
   if (!TII->isTriviallyReMaterializable(DefMI, AA))
@@ -760,31 +765,41 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
     return false;
   // Only support subregister destinations when the def is read-undef.
   MachineOperand &DstOperand = CopyMI->getOperand(0);
+  unsigned CopyDstReg = DstOperand.getReg();
   if (DstOperand.getSubReg() && !DstOperand.isUndef())
     return false;
+
+  const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
   if (!DefMI->isImplicitDef()) {
-    // Make sure the copy destination register class fits the instruction
-    // definition register class. The mismatch can happen as a result of earlier
-    // extract_subreg, insert_subreg, subreg_to_reg coalescing.
-    const TargetRegisterClass *RC = TII->getRegClass(MCID, 0, TRI, *MF);
-    if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
-      if (!MRI->constrainRegClass(DstReg, RC))
+    if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+      unsigned NewDstReg = DstReg;
+
+      unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
+                                              DefMI->getOperand(0).getSubReg());
+      if (NewDstIdx)
+        NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
+
+      // Finally, make sure that the physical subregister that will be
+      // constructed later is permitted for the instruction.
+      if (!DefRC->contains(NewDstReg))
         return false;
-    } else if (!RC->contains(DstReg))
-      return false;
+    } else {
+      // Theoretically, some stack frame reference could exist. Just make sure
+      // it hasn't actually happened.
+      assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
+             "Only expect to deal with virtual or physical registers");
+    }
   }
 
   MachineBasicBlock *MBB = CopyMI->getParent();
   MachineBasicBlock::iterator MII =
     llvm::next(MachineBasicBlock::iterator(CopyMI));
-  TII->reMaterialize(*MBB, MII, DstReg, 0, DefMI, *TRI);
+  TII->reMaterialize(*MBB, MII, DstReg, SrcIdx, DefMI, *TRI);
   MachineInstr *NewMI = prior(MII);
 
-  // The original DefMI may have been a subregister def, but the full register
-  // class of its destination matches the destination of CopyMI, and CopyMI is
-  // either a full register def or is read-undef. Therefore we can clear the
-  // subregister index on the rematerialized instruction.
-  NewMI->getOperand(0).setSubReg(0);
+  LIS->ReplaceMachineInstrInMaps(CopyMI, NewMI);
+  CopyMI->eraseFromParent();
+  ErasedInstrs.insert(CopyMI);
 
   // NewMI may have dead implicit defs (E.g. EFLAGS for MOV<bits>r0 on X86).
   // We need to remember these so we can add intervals once we insert
@@ -800,6 +815,47 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
     }
   }
 
+  if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+    unsigned NewIdx = NewMI->getOperand(0).getSubReg();
+    const TargetRegisterClass *RCForInst;
+    if (NewIdx)
+      RCForInst = TRI->getMatchingSuperRegClass(MRI->getRegClass(DstReg), DefRC,
+                                                NewIdx);
+
+    if (MRI->constrainRegClass(DstReg, DefRC)) {
+      // The materialized instruction is quite capable of setting DstReg
+      // directly, but it may still have a now-trivial subregister index which
+      // we should clear.
+      NewMI->getOperand(0).setSubReg(0);
+    } else if (NewIdx && RCForInst) {
+      // The subreg index on NewMI is essential; we still have to make sure
+      // DstReg:idx is in a class that NewMI can use.
+      MRI->constrainRegClass(DstReg, RCForInst);
+    } else {
+      // DstReg is actually incompatible with NewMI, we have to move to a
+      // super-reg's class. This could come from a sequence like:
+      //     GR32 = MOV32r0
+      //     GR8 = COPY GR32:sub_8
+      MRI->setRegClass(DstReg, CP.getNewRC());
+      updateRegDefsUses(DstReg, DstReg, DstIdx);
+      NewMI->getOperand(0).setSubReg(
+          TRI->composeSubRegIndices(SrcIdx, DefMI->getOperand(0).getSubReg()));
+    }
+  } else if (NewMI->getOperand(0).getReg() != CopyDstReg) {
+    // The New instruction may be defining a sub-register of what's actually
+    // been asked for. If so it must implicitly define the whole thing.
+    assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+           "Only expect virtual or physical registers in remat");
+    NewMI->getOperand(0).setIsDead(true);
+    NewMI->addOperand(MachineOperand::CreateReg(CopyDstReg,
+                                                true  /*IsDef*/,
+                                                true  /*IsImp*/,
+                                                false /*IsKill*/));
+  }
+
+  if (NewMI->getOperand(0).getSubReg())
+    NewMI->getOperand(0).setIsUndef();
+
   // CopyMI may have implicit operands, transfer them over to the newly
   // rematerialized instruction. And update implicit def interval valnos.
   for (unsigned i = CopyMI->getDesc().getNumOperands(),
@@ -814,18 +870,14 @@ bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
     }
   }
 
-  LIS->ReplaceMachineInstrInMaps(CopyMI, NewMI);
-
   SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
   for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
     unsigned Reg = NewMIImplDefs[i];
     for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
-      if (LiveInterval *LI = LIS->getCachedRegUnit(*Units))
-        LI->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+      if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
+        LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
   }
 
-  CopyMI->eraseFromParent();
-  ErasedInstrs.insert(CopyMI);
   DEBUG(dbgs() << "Remat: " << *NewMI);
   ++NumReMats;
 
@@ -994,7 +1046,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   if (CP.getSrcReg() == CP.getDstReg()) {
     LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
     DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
-    LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(CopyMI));
+    LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(CopyMI));
     if (VNInfo *DefVNI = LRQ.valueDefined()) {
       VNInfo *ReadVNI = LRQ.valueIn();
       assert(ReadVNI && "No value before copy and no <undef> flag.");
@@ -1015,8 +1067,11 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     if (!canJoinPhys(CP)) {
       // Before giving up coalescing, if definition of source is defined by
       // trivial computation, try rematerializing it.
-      if (reMaterializeTrivialDef(CP, CopyMI))
+      bool IsDefCopy;
+      if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
         return true;
+      if (IsDefCopy)
+        Again = true;  // May be possible to coalesce later.
       return false;
     }
   } else {
@@ -1034,8 +1089,8 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
     });
 
     // When possible, let DstReg be the larger interval.
-    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).ranges.size() >
-                           LIS->getInterval(CP.getDstReg()).ranges.size())
+    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
+                           LIS->getInterval(CP.getDstReg()).size())
       CP.flip();
   }
 
@@ -1048,10 +1103,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
 
     // If definition of source is defined by trivial computation, try
     // rematerializing it.
-    if (reMaterializeTrivialDef(CP, CopyMI))
+    bool IsDefCopy;
+    if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
       return true;
 
-    // If we can eliminate the copy without merging the live ranges, do so now.
+    // If we can eliminate the copy without merging the live segments, do so
+    // now.
     if (!CP.isPartial() && !CP.isPhys()) {
       if (adjustCopiesBackFrom(CP, CopyMI) ||
           removeCopyByCommutingDef(CP, CopyMI)) {
@@ -1099,10 +1156,12 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
   TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
 
   DEBUG({
-    dbgs() << "\tJoined. Result = " << PrintReg(CP.getDstReg(), TRI);
-    if (!CP.isPhys())
+    dbgs() << "\tJoined. Result = ";
+    if (CP.isPhys())
+      dbgs() << PrintReg(CP.getDstReg(), TRI);
+    else
       dbgs() << LIS->getInterval(CP.getDstReg());
-     dbgs() << '\n';
+    dbgs() << '\n';
   });
 
   ++numJoins;
@@ -1114,8 +1173,7 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
   assert(CP.isPhys() && "Must be a physreg copy");
   assert(MRI->isReserved(CP.getDstReg()) && "Not a reserved register");
   LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
-  DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
-               << '\n');
+  DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
 
   assert(CP.isFlipped() && RHS.containsOneValue() &&
          "Invalid join with reserved register");
@@ -1384,7 +1442,7 @@ VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
     unsigned Reg = MI->getOperand(1).getReg();
     if (!TargetRegisterInfo::isVirtualRegister(Reg))
       break;
-    LiveRangeQuery LRQ(LIS->getInterval(Reg), VNI->def);
+    LiveQueryResult LRQ = LIS->getInterval(Reg).Query(VNI->def);
     if (!LRQ.valueIn())
       break;
     VNI = LRQ.valueIn();
@@ -1435,7 +1493,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
     // The <read-undef> flag on the def operand means that old lane values are
     // not important.
     if (Redef) {
-      V.RedefVNI = LiveRangeQuery(LI, VNI->def).valueIn();
+      V.RedefVNI = LI.Query(VNI->def).valueIn();
       assert(V.RedefVNI && "Instruction is reading nonexistent value");
       computeAssignment(V.RedefVNI->id, Other);
       V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
@@ -1452,7 +1510,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   }
 
   // Find the value in Other that overlaps VNI->def, if any.
-  LiveRangeQuery OtherLRQ(Other.LI, VNI->def);
+  LiveQueryResult OtherLRQ = Other.LI.Query(VNI->def);
 
   // It is possible that both values are defined by the same instruction, or
   // the values are PHIs defined in the same block. When that happens, the two
@@ -1911,8 +1969,8 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
   JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
   JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
 
-  DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
-               << "\n\t\tLHS = " << PrintReg(CP.getDstReg()) << ' ' << LHS
+  DEBUG(dbgs() << "\t\tRHS = " << RHS
+               << "\n\t\tLHS = " << LHS
                << '\n');
 
   // First compute NewVNInfo and the simple value mappings.
@@ -1943,8 +2001,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
     LIS->shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
 
   // Join RHS into LHS.
-  LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo,
-           MRI);
+  LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo);
 
   // Kill flags are going to be wrong if the live ranges were overlapping.
   // Eventually, we should simply clear all kill flags when computing live
@@ -1959,7 +2016,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
   // CR_Replace conflicts.
   DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
                << " points: " << LHS << '\n');
-  LIS->extendToIndices(&LHS, EndPoints);
+  LIS->extendToIndices(LHS, EndPoints);
   return true;
 }
 
@@ -1985,9 +2042,8 @@ struct MBBPriorityInfo {
 // block (the unsigned), and then on the MBB number.
 //
 // EnableGlobalCopies assumes that the primary sort key is loop depth.
-static int compareMBBPriority(const void *L, const void *R) {
-  const MBBPriorityInfo *LHS = static_cast<const MBBPriorityInfo*>(L);
-  const MBBPriorityInfo *RHS = static_cast<const MBBPriorityInfo*>(R);
+static int compareMBBPriority(const MBBPriorityInfo *LHS,
+                              const MBBPriorityInfo *RHS) {
   // Deeper loops first
   if (LHS->Depth != RHS->Depth)
     return LHS->Depth > RHS->Depth ? -1 : 1;
@@ -2012,6 +2068,9 @@ static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
   if (!Copy->isCopy())
     return false;
 
+  if (Copy->getOperand(1).isUndef())
+    return false;
+
   unsigned SrcReg = Copy->getOperand(1).getReg();
   unsigned DstReg = Copy->getOperand(0).getReg();
   if (TargetRegisterInfo::isPhysicalRegister(SrcReg)
@@ -2057,8 +2116,8 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
     // are not inherently easier to resolve, but slightly preferable until we
     // have local live range splitting. In particular this is required by
     // cmp+jmp macro fusion.
-    for (MachineBasicBlock::reverse_iterator
-           MII = MBB->rbegin(), E = MBB->rend(); MII != E; ++MII) {
+    for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
+         MII != E; ++MII) {
       if (!MII->isCopyLike())
         continue;
       if (isLocalCopy(&(*MII), LIS))
@@ -2142,7 +2201,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
 
   const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
   if (EnableGlobalCopies == cl::BOU_UNSET)
-    JoinGlobalCopies = ST.enableMachineScheduler();
+    JoinGlobalCopies = ST.useMachineScheduler();
   else
     JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
 
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 97f22e1..092ecdd 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -25,68 +25,39 @@ using namespace llvm;
 
 /// Increase pressure for each pressure set provided by TargetRegisterInfo.
 static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
-                                std::vector<unsigned> &MaxSetPressure,
-                                const int *PSet, unsigned Weight) {
-  for (; *PSet != -1; ++PSet) {
-    CurrSetPressure[*PSet] += Weight;
-    if (&CurrSetPressure != &MaxSetPressure
-        && CurrSetPressure[*PSet] > MaxSetPressure[*PSet]) {
-      MaxSetPressure[*PSet] = CurrSetPressure[*PSet];
-    }
-  }
+                                PSetIterator PSetI) {
+  unsigned Weight = PSetI.getWeight();
+  for (; PSetI.isValid(); ++PSetI)
+    CurrSetPressure[*PSetI] += Weight;
 }
 
 /// Decrease pressure for each pressure set provided by TargetRegisterInfo.
 static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
-                                const int *PSet, unsigned Weight) {
-  for (; *PSet != -1; ++PSet) {
-    assert(CurrSetPressure[*PSet] >= Weight && "register pressure underflow");
-    CurrSetPressure[*PSet] -= Weight;
-  }
-}
-
-/// Directly increase pressure only within this RegisterPressure result.
-void RegisterPressure::increase(unsigned Reg, const TargetRegisterInfo *TRI,
-                                const MachineRegisterInfo *MRI) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
-    increaseSetPressure(MaxSetPressure, MaxSetPressure,
-                        TRI->getRegClassPressureSets(RC),
-                        TRI->getRegClassWeight(RC).RegWeight);
-  }
-  else {
-    increaseSetPressure(MaxSetPressure, MaxSetPressure,
-                        TRI->getRegUnitPressureSets(Reg),
-                        TRI->getRegUnitWeight(Reg));
-  }
-}
-
-/// Directly decrease pressure only within this RegisterPressure result.
-void RegisterPressure::decrease(unsigned Reg, const TargetRegisterInfo *TRI,
-                                const MachineRegisterInfo *MRI) {
-  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
-    decreaseSetPressure(MaxSetPressure, TRI->getRegClassPressureSets(RC),
-                        TRI->getRegClassWeight(RC).RegWeight);
-  }
-  else {
-    decreaseSetPressure(MaxSetPressure, TRI->getRegUnitPressureSets(Reg),
-                        TRI->getRegUnitWeight(Reg));
+                                PSetIterator PSetI) {
+  unsigned Weight = PSetI.getWeight();
+  for (; PSetI.isValid(); ++PSetI) {
+    assert(CurrSetPressure[*PSetI] >= Weight && "register pressure underflow");
+    CurrSetPressure[*PSetI] -= Weight;
   }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-static void dumpSetPressure(const std::vector<unsigned> &SetPressure,
-                            const TargetRegisterInfo *TRI) {
+void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
+                              const TargetRegisterInfo *TRI) {
+  bool Empty = true;
   for (unsigned i = 0, e = SetPressure.size(); i < e; ++i) {
-    if (SetPressure[i] != 0)
+    if (SetPressure[i] != 0) {
       dbgs() << TRI->getRegPressureSetName(i) << "=" << SetPressure[i] << '\n';
+      Empty = false;
+    }
   }
+  if (Empty)
+    dbgs() << "\n";
 }
 
 void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
   dbgs() << "Max Pressure: ";
-  dumpSetPressure(MaxSetPressure, TRI);
+  dumpRegSetPressure(MaxSetPressure, TRI);
   dbgs() << "Live In: ";
   for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
     dbgs() << PrintReg(LiveInRegs[i], TRI) << " ";
@@ -98,44 +69,33 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
 }
 
 void RegPressureTracker::dump() const {
-  dbgs() << "Curr Pressure: ";
-  dumpSetPressure(CurrSetPressure, TRI);
+  if (!isTopClosed() || !isBottomClosed()) {
+    dbgs() << "Curr Pressure: ";
+    dumpRegSetPressure(CurrSetPressure, TRI);
+  }
   P.dump(TRI);
 }
 #endif
 
 /// Increase the current pressure as impacted by these registers and bump
 /// the high water mark if needed.
-void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> Regs) {
-  for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
-    if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
-      const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
-      increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
-                          TRI->getRegClassPressureSets(RC),
-                          TRI->getRegClassWeight(RC).RegWeight);
-    }
-    else {
-      increaseSetPressure(CurrSetPressure, P.MaxSetPressure,
-                          TRI->getRegUnitPressureSets(Regs[I]),
-                          TRI->getRegUnitWeight(Regs[I]));
+void RegPressureTracker::increaseRegPressure(ArrayRef<unsigned> RegUnits) {
+  for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+    PSetIterator PSetI = MRI->getPressureSets(RegUnits[i]);
+    unsigned Weight = PSetI.getWeight();
+    for (; PSetI.isValid(); ++PSetI) {
+      CurrSetPressure[*PSetI] += Weight;
+      if (CurrSetPressure[*PSetI] > P.MaxSetPressure[*PSetI]) {
+        P.MaxSetPressure[*PSetI] = CurrSetPressure[*PSetI];
+      }
     }
   }
 }
 
 /// Simply decrease the current pressure as impacted by these registers.
-void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> Regs) {
-  for (unsigned I = 0, E = Regs.size(); I != E; ++I) {
-    if (TargetRegisterInfo::isVirtualRegister(Regs[I])) {
-      const TargetRegisterClass *RC = MRI->getRegClass(Regs[I]);
-      decreaseSetPressure(CurrSetPressure,
-                          TRI->getRegClassPressureSets(RC),
-                          TRI->getRegClassWeight(RC).RegWeight);
-    }
-    else {
-      decreaseSetPressure(CurrSetPressure, TRI->getRegUnitPressureSets(Regs[I]),
-                          TRI->getRegUnitWeight(Regs[I]));
-    }
-  }
+void RegPressureTracker::decreaseRegPressure(ArrayRef<unsigned> RegUnits) {
+  for (unsigned I = 0, E = RegUnits.size(); I != E; ++I)
+    decreaseSetPressure(CurrSetPressure, MRI->getPressureSets(RegUnits[I]));
 }
 
 /// Clear the result so it can be used for another round of pressure tracking.
@@ -187,12 +147,30 @@ void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) {
   LiveInRegs.clear();
 }
 
-const LiveInterval *RegPressureTracker::getInterval(unsigned Reg) const {
+const LiveRange *RegPressureTracker::getLiveRange(unsigned Reg) const {
   if (TargetRegisterInfo::isVirtualRegister(Reg))
     return &LIS->getInterval(Reg);
   return LIS->getCachedRegUnit(Reg);
 }
 
+void RegPressureTracker::reset() {
+  MBB = 0;
+  LIS = 0;
+
+  CurrSetPressure.clear();
+  LiveThruPressure.clear();
+  P.MaxSetPressure.clear();
+
+  if (RequireIntervals)
+    static_cast<IntervalPressure&>(P).reset();
+  else
+    static_cast<RegionPressure&>(P).reset();
+
+  LiveRegs.PhysRegs.clear();
+  LiveRegs.VirtRegs.clear();
+  UntiedDefs.clear();
+}
+
 /// Setup the RegPressureTracker.
 ///
 /// TODO: Add support for pressure without LiveIntervals.
@@ -200,13 +178,17 @@ void RegPressureTracker::init(const MachineFunction *mf,
                               const RegisterClassInfo *rci,
                               const LiveIntervals *lis,
                               const MachineBasicBlock *mbb,
-                              MachineBasicBlock::const_iterator pos)
+                              MachineBasicBlock::const_iterator pos,
+                              bool ShouldTrackUntiedDefs)
 {
+  reset();
+
   MF = mf;
   TRI = MF->getTarget().getRegisterInfo();
   RCI = rci;
   MRI = &MF->getRegInfo();
   MBB = mbb;
+  TrackUntiedDefs = ShouldTrackUntiedDefs;
 
   if (RequireIntervals) {
     assert(lis && "IntervalPressure requires LiveIntervals");
@@ -216,16 +198,12 @@ void RegPressureTracker::init(const MachineFunction *mf,
   CurrPos = pos;
   CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0);
 
-  if (RequireIntervals)
-    static_cast<IntervalPressure&>(P).reset();
-  else
-    static_cast<RegionPressure&>(P).reset();
   P.MaxSetPressure = CurrSetPressure;
 
-  LiveRegs.PhysRegs.clear();
   LiveRegs.PhysRegs.setUniverse(TRI->getNumRegs());
-  LiveRegs.VirtRegs.clear();
   LiveRegs.VirtRegs.setUniverse(MRI->getNumVirtRegs());
+  if (TrackUntiedDefs)
+    UntiedDefs.setUniverse(MRI->getNumVirtRegs());
 }
 
 /// Does this pressure result have a valid top position and live ins.
@@ -304,16 +282,36 @@ void RegPressureTracker::closeRegion() {
   // If both top and bottom are closed, do nothing.
 }
 
+/// The register tracker is unaware of global liveness so ignores normal
+/// live-thru ranges. However, two-address or coalesced chains can also lead
+/// to live ranges with no holes. Count these to inform heuristics that we
+/// can never drop below this pressure.
+void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
+  LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0);
+  assert(isBottomClosed() && "need bottom-up tracking to intialize.");
+  for (unsigned i = 0, e = P.LiveOutRegs.size(); i < e; ++i) {
+    unsigned Reg = P.LiveOutRegs[i];
+    if (TargetRegisterInfo::isVirtualRegister(Reg)
+        && !RPTracker.hasUntiedDef(Reg)) {
+      increaseSetPressure(LiveThruPressure, MRI->getPressureSets(Reg));
+    }
+  }
+}
+
 /// \brief Convenient wrapper for checking membership in RegisterOperands.
-static bool containsReg(ArrayRef<unsigned> Regs, unsigned Reg) {
-  return std::find(Regs.begin(), Regs.end(), Reg) != Regs.end();
+/// (std::count() doesn't have an early exit).
+static bool containsReg(ArrayRef<unsigned> RegUnits, unsigned RegUnit) {
+  return std::find(RegUnits.begin(), RegUnits.end(), RegUnit) != RegUnits.end();
 }
 
 /// Collect this instruction's unique uses and defs into SmallVectors for
 /// processing defs and uses in order.
+///
+/// FIXME: always ignore tied opers
 class RegisterOperands {
   const TargetRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
+  bool IgnoreDead;
 
 public:
   SmallVector<unsigned, 8> Uses;
@@ -321,7 +319,8 @@ public:
   SmallVector<unsigned, 8> DeadDefs;
 
   RegisterOperands(const TargetRegisterInfo *tri,
-                   const MachineRegisterInfo *mri): TRI(tri), MRI(mri) {}
+                   const MachineRegisterInfo *mri, bool ID = false):
+    TRI(tri), MRI(mri), IgnoreDead(ID) {}
 
   /// Push this operand's register onto the correct vector.
   void collect(const MachineOperand &MO) {
@@ -330,25 +329,27 @@ public:
     if (MO.readsReg())
       pushRegUnits(MO.getReg(), Uses);
     if (MO.isDef()) {
-      if (MO.isDead())
-        pushRegUnits(MO.getReg(), DeadDefs);
+      if (MO.isDead()) {
+        if (!IgnoreDead)
+          pushRegUnits(MO.getReg(), DeadDefs);
+      }
       else
         pushRegUnits(MO.getReg(), Defs);
     }
   }
 
 protected:
-  void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &Regs) {
+  void pushRegUnits(unsigned Reg, SmallVectorImpl<unsigned> &RegUnits) {
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-      if (containsReg(Regs, Reg))
+      if (containsReg(RegUnits, Reg))
         return;
-      Regs.push_back(Reg);
+      RegUnits.push_back(Reg);
     }
     else if (MRI->isAllocatable(Reg)) {
       for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
-        if (containsReg(Regs, *Units))
+        if (containsReg(RegUnits, *Units))
           continue;
-        Regs.push_back(*Units);
+        RegUnits.push_back(*Units);
       }
     }
   }
@@ -367,6 +368,56 @@ static void collectOperands(const MachineInstr *MI,
   RegOpers.DeadDefs.erase(I, RegOpers.DeadDefs.end());
 }
 
+/// Initialize an array of N PressureDiffs.
+void PressureDiffs::init(unsigned N) {
+  Size = N;
+  if (N <= Max) {
+    memset(PDiffArray, 0, N * sizeof(PressureDiff));
+    return;
+  }
+  Max = Size;
+  free(PDiffArray);
+  PDiffArray = reinterpret_cast<PressureDiff*>(calloc(N, sizeof(PressureDiff)));
+}
+
+/// Add a change in pressure to the pressure diff of a given instruction.
+void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec,
+                                     const MachineRegisterInfo *MRI) {
+  PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+  int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight();
+  for (; PSetI.isValid(); ++PSetI) {
+    // Find an existing entry in the pressure diff for this PSet.
+    PressureDiff::iterator I = begin(), E = end();
+    for (; I != E && I->isValid(); ++I) {
+      if (I->getPSet() >= *PSetI)
+        break;
+    }
+    // If all pressure sets are more constrained, skip the remaining PSets.
+    if (I == E)
+      break;
+    // Insert this PressureChange.
+    if (!I->isValid() || I->getPSet() != *PSetI) {
+      PressureChange PTmp = PressureChange(*PSetI);
+      for (PressureDiff::iterator J = I; J != E && PTmp.isValid(); ++J)
+        std::swap(*J,PTmp);
+    }
+    // Update the units for this pressure set.
+    I->setUnitInc(I->getUnitInc() + Weight);
+  }
+}
+
+/// Record the pressure difference induced by the given operand list.
+static void collectPDiff(PressureDiff &PDiff, RegisterOperands &RegOpers,
+                         const MachineRegisterInfo *MRI) {
+  assert(!PDiff.begin()->isValid() && "stale PDiff");
+
+  for (unsigned i = 0, e = RegOpers.Defs.size(); i != e; ++i)
+    PDiff.addPressureChange(RegOpers.Defs[i], true, MRI);
+
+  for (unsigned i = 0, e = RegOpers.Uses.size(); i != e; ++i)
+    PDiff.addPressureChange(RegOpers.Uses[i], false, MRI);
+}
+
 /// Force liveness of registers.
 void RegPressureTracker::addLiveRegs(ArrayRef<unsigned> Regs) {
   for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
@@ -383,7 +434,7 @@ void RegPressureTracker::discoverLiveIn(unsigned Reg) {
 
   // At live in discovery, unconditionally increase the high water mark.
   P.LiveInRegs.push_back(Reg);
-  P.increase(Reg, TRI, MRI);
+  increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
 }
 
 /// Add Reg to the live out set and increase max pressure.
@@ -394,11 +445,16 @@ void RegPressureTracker::discoverLiveOut(unsigned Reg) {
 
   // At live out discovery, unconditionally increase the high water mark.
   P.LiveOutRegs.push_back(Reg);
-  P.increase(Reg, TRI, MRI);
+  increaseSetPressure(P.MaxSetPressure, MRI->getPressureSets(Reg));
 }
 
-/// Recede across the previous instruction.
-bool RegPressureTracker::recede() {
+/// Recede across the previous instruction. If LiveUses is provided, record any
+/// RegUnits that are made live by the current instruction's uses. This includes
+/// registers that are both defined and used by the instruction.  If a pressure
+/// difference pointer is provided record the changes is pressure caused by this
+/// instruction independent of liveness.
+bool RegPressureTracker::recede(SmallVectorImpl<unsigned> *LiveUses,
+                                PressureDiff *PDiff) {
   // Check for the top of the analyzable region.
   if (CurrPos == MBB->begin()) {
     closeRegion();
@@ -431,6 +487,9 @@ bool RegPressureTracker::recede() {
   RegisterOperands RegOpers(TRI, MRI);
   collectOperands(CurrPos, RegOpers);
 
+  if (PDiff)
+    collectPDiff(*PDiff, RegOpers, MRI);
+
   // Boost pressure for all dead defs together.
   increaseRegPressure(RegOpers.DeadDefs);
   decreaseRegPressure(RegOpers.DeadDefs);
@@ -439,10 +498,20 @@ bool RegPressureTracker::recede() {
   // TODO: consider earlyclobbers?
   for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
     unsigned Reg = RegOpers.Defs[i];
-    if (LiveRegs.erase(Reg))
-      decreaseRegPressure(Reg);
-    else
-      discoverLiveOut(Reg);
+    bool DeadDef = false;
+    if (RequireIntervals) {
+      const LiveRange *LR = getLiveRange(Reg);
+      if (LR) {
+        LiveQueryResult LRQ = LR->Query(SlotIdx);
+        DeadDef = LRQ.isDeadDef();
+      }
+    }
+    if (!DeadDef) {
+      if (LiveRegs.erase(Reg))
+        decreaseRegPressure(Reg);
+      else
+        discoverLiveOut(Reg);
+    }
   }
 
   // Generate liveness for uses.
@@ -451,12 +520,24 @@ bool RegPressureTracker::recede() {
     if (!LiveRegs.contains(Reg)) {
       // Adjust liveouts if LiveIntervals are available.
       if (RequireIntervals) {
-        const LiveInterval *LI = getInterval(Reg);
-        if (LI && !LI->killedAt(SlotIdx))
-          discoverLiveOut(Reg);
+        const LiveRange *LR = getLiveRange(Reg);
+        if (LR) {
+          LiveQueryResult LRQ = LR->Query(SlotIdx);
+          if (!LRQ.isKill() && !LRQ.valueDefined())
+            discoverLiveOut(Reg);
+        }
       }
       increaseRegPressure(Reg);
       LiveRegs.insert(Reg);
+      if (LiveUses && !containsReg(*LiveUses, Reg))
+        LiveUses->push_back(Reg);
+    }
+  }
+  if (TrackUntiedDefs) {
+    for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
+      unsigned Reg = RegOpers.Defs[i];
+      if (TargetRegisterInfo::isVirtualRegister(Reg) && !LiveRegs.contains(Reg))
+        UntiedDefs.insert(Reg);
     }
   }
   return true;
@@ -464,6 +545,8 @@ bool RegPressureTracker::recede() {
 
 /// Advance across the current instruction.
 bool RegPressureTracker::advance() {
+  assert(!TrackUntiedDefs && "unsupported mode");
+
   // Check for the bottom of the analyzable region.
   if (CurrPos == MBB->end()) {
     closeRegion();
@@ -496,8 +579,8 @@ bool RegPressureTracker::advance() {
     // Kill liveness at last uses.
     bool lastUse = false;
     if (RequireIntervals) {
-      const LiveInterval *LI = getInterval(Reg);
-      lastUse = LI && LI->killedAt(SlotIdx);
+      const LiveRange *LR = getLiveRange(Reg);
+      lastUse = LR && LR->Query(SlotIdx).isKill();
     }
     else {
       // Allocatable physregs are always single-use before register rewriting.
@@ -533,9 +616,9 @@ bool RegPressureTracker::advance() {
 static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
                                        ArrayRef<unsigned> NewPressureVec,
                                        RegPressureDelta &Delta,
-                                       const TargetRegisterInfo *TRI) {
-  int ExcessUnits = 0;
-  unsigned PSetID = ~0U;
+                                       const RegisterClassInfo *RCI,
+                                       ArrayRef<unsigned> LiveThruPressureVec) {
+  Delta.Excess = PressureChange();
   for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) {
     unsigned POld = OldPressureVec[i];
     unsigned PNew = NewPressureVec[i];
@@ -543,7 +626,10 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
     if (!PDiff) // No change in this set in the common case.
       continue;
     // Only consider change beyond the limit.
-    unsigned Limit = TRI->getRegPressureSetLimit(i);
+    unsigned Limit = RCI->getRegPressureSetLimit(i);
+    if (!LiveThruPressureVec.empty())
+      Limit += LiveThruPressureVec[i];
+
     if (Limit > POld) {
       if (Limit > PNew)
         PDiff = 0;            // Under the limit
@@ -553,13 +639,12 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
     else if (Limit > PNew)
       PDiff = Limit - POld;   // Just obeyed limit.
 
-    if (std::abs(PDiff) > std::abs(ExcessUnits)) {
-      ExcessUnits = PDiff;
-      PSetID = i;
+    if (PDiff) {
+      Delta.Excess = PressureChange(i);
+      Delta.Excess.setUnitInc(PDiff);
+      break;
     }
   }
-  Delta.Excess.PSetID = PSetID;
-  Delta.Excess.UnitIncrease = ExcessUnits;
 }
 
 /// Find the max change in max pressure that either surpasses a critical PSet
@@ -570,11 +655,11 @@ static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
 /// RegPressureTracker API change to work with pressure differences.
 static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
                                     ArrayRef<unsigned> NewMaxPressureVec,
-                                    ArrayRef<PressureElement> CriticalPSets,
+                                    ArrayRef<PressureChange> CriticalPSets,
                                     ArrayRef<unsigned> MaxPressureLimit,
                                     RegPressureDelta &Delta) {
-  Delta.CriticalMax = PressureElement();
-  Delta.CurrentMax = PressureElement();
+  Delta.CriticalMax = PressureChange();
+  Delta.CurrentMax = PressureChange();
 
   unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
   for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) {
@@ -583,23 +668,25 @@ static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
     if (PNew == POld) // No change in this set in the common case.
       continue;
 
-    while (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID < i)
-      ++CritIdx;
+    if (!Delta.CriticalMax.isValid()) {
+      while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < i)
+        ++CritIdx;
 
-    if (CritIdx != CritEnd && CriticalPSets[CritIdx].PSetID == i) {
-      int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].UnitIncrease;
-      if (PDiff > Delta.CriticalMax.UnitIncrease) {
-        Delta.CriticalMax.PSetID = i;
-        Delta.CriticalMax.UnitIncrease = PDiff;
+      if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
+        int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
+        if (PDiff > 0) {
+          Delta.CriticalMax = PressureChange(i);
+          Delta.CriticalMax.setUnitInc(PDiff);
+        }
       }
     }
-
-    // Find the greatest increase above MaxPressureLimit.
+    // Find the first increase above MaxPressureLimit.
     // (Ignores negative MDiff).
-    int MDiff = (int)PNew - (int)MaxPressureLimit[i];
-    if (MDiff > Delta.CurrentMax.UnitIncrease) {
-      Delta.CurrentMax.PSetID = i;
-      Delta.CurrentMax.UnitIncrease = PNew;
+    if (!Delta.CurrentMax.isValid() && PNew > MaxPressureLimit[i]) {
+      Delta.CurrentMax = PressureChange(i);
+      Delta.CurrentMax.setUnitInc(PNew - POld);
+      if (CritIdx == CritEnd || Delta.CriticalMax.isValid())
+        break;
     }
   }
 }
@@ -614,7 +701,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
   assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
 
   // Account for register pressure similar to RegPressureTracker::recede().
-  RegisterOperands RegOpers(TRI, MRI);
+  RegisterOperands RegOpers(TRI, MRI, /*IgnoreDead=*/true);
   collectOperands(MI, RegOpers);
 
   // Boost max pressure for all dead defs together.
@@ -625,8 +712,19 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
   // Kill liveness at live defs.
   for (unsigned i = 0, e = RegOpers.Defs.size(); i < e; ++i) {
     unsigned Reg = RegOpers.Defs[i];
-    if (!containsReg(RegOpers.Uses, Reg))
-      decreaseRegPressure(Reg);
+    bool DeadDef = false;
+    if (RequireIntervals) {
+      const LiveRange *LR = getLiveRange(Reg);
+      if (LR) {
+        SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
+        LiveQueryResult LRQ = LR->Query(SlotIdx);
+        DeadDef = LRQ.isDeadDef();
+      }
+    }
+    if (!DeadDef) {
+      if (!containsReg(RegOpers.Uses, Reg))
+        decreaseRegPressure(Reg);
+    }
   }
   // Generate liveness for uses.
   for (unsigned i = 0, e = RegOpers.Uses.size(); i < e; ++i) {
@@ -648,8 +746,9 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
 /// result per-SUnit with enough information to adjust for the current
 /// scheduling position. But this works as a proof of concept.
 void RegPressureTracker::
-getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
-                          ArrayRef<PressureElement> CriticalPSets,
+getMaxUpwardPressureDelta(const MachineInstr *MI, PressureDiff *PDiff,
+                          RegPressureDelta &Delta,
+                          ArrayRef<PressureChange> CriticalPSets,
                           ArrayRef<unsigned> MaxPressureLimit) {
   // Snapshot Pressure.
   // FIXME: The snapshot heap space should persist. But I'm planning to
@@ -659,15 +758,117 @@ getMaxUpwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
 
   bumpUpwardPressure(MI);
 
-  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, RCI,
+                             LiveThruPressure);
   computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
                           MaxPressureLimit, Delta);
-  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
-         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+  assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+         Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
 
   // Restore the tracker's state.
   P.MaxSetPressure.swap(SavedMaxPressure);
   CurrSetPressure.swap(SavedPressure);
+
+#ifndef NDEBUG
+  if (!PDiff)
+    return;
+
+  // Check if the alternate algorithm yields the same result.
+  RegPressureDelta Delta2;
+  getUpwardPressureDelta(MI, *PDiff, Delta2, CriticalPSets, MaxPressureLimit);
+  if (Delta != Delta2) {
+    dbgs() << "DELTA: " << *MI;
+    if (Delta.Excess.isValid())
+      dbgs() << "Excess1 " << TRI->getRegPressureSetName(Delta.Excess.getPSet())
+             << " " << Delta.Excess.getUnitInc() << "\n";
+    if (Delta.CriticalMax.isValid())
+      dbgs() << "Critic1 " << TRI->getRegPressureSetName(Delta.CriticalMax.getPSet())
+             << " " << Delta.CriticalMax.getUnitInc() << "\n";
+    if (Delta.CurrentMax.isValid())
+      dbgs() << "CurrMx1 " << TRI->getRegPressureSetName(Delta.CurrentMax.getPSet())
+             << " " << Delta.CurrentMax.getUnitInc() << "\n";
+    if (Delta2.Excess.isValid())
+      dbgs() << "Excess2 " << TRI->getRegPressureSetName(Delta2.Excess.getPSet())
+             << " " << Delta2.Excess.getUnitInc() << "\n";
+    if (Delta2.CriticalMax.isValid())
+      dbgs() << "Critic2 " << TRI->getRegPressureSetName(Delta2.CriticalMax.getPSet())
+             << " " << Delta2.CriticalMax.getUnitInc() << "\n";
+    if (Delta2.CurrentMax.isValid())
+      dbgs() << "CurrMx2 " << TRI->getRegPressureSetName(Delta2.CurrentMax.getPSet())
+             << " " << Delta2.CurrentMax.getUnitInc() << "\n";
+    llvm_unreachable("RegP Delta Mismatch");
+  }
+#endif
+}
+
+/// This is a prototype of the fast version of querying register pressure that
+/// does not directly depend on current liveness. It's still slow because we
+/// recompute pressure change on-the-fly. This implementation only exists to
+/// prove correctness.
+///
+/// @param Delta captures information needed for heuristics.
+///
+/// @param CriticalPSets Are the pressure sets that are known to exceed some
+/// limit within the region, not necessarily at the current position.
+///
+/// @param MaxPressureLimit Is the max pressure within the region, not
+/// necessarily at the current position.
+void RegPressureTracker::
+getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
+                       RegPressureDelta &Delta,
+                       ArrayRef<PressureChange> CriticalPSets,
+                       ArrayRef<unsigned> MaxPressureLimit) const {
+  unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+  for (PressureDiff::const_iterator
+         PDiffI = PDiff.begin(), PDiffE = PDiff.end();
+       PDiffI != PDiffE && PDiffI->isValid(); ++PDiffI) {
+
+    unsigned PSetID = PDiffI->getPSet();
+    unsigned Limit = RCI->getRegPressureSetLimit(PSetID);
+    if (!LiveThruPressure.empty())
+      Limit += LiveThruPressure[PSetID];
+
+    unsigned POld = CurrSetPressure[PSetID];
+    unsigned MOld = P.MaxSetPressure[PSetID];
+    unsigned MNew = MOld;
+    // Ignore DeadDefs here because they aren't captured by PressureChange.
+    unsigned PNew = POld + PDiffI->getUnitInc();
+    assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld) && "PSet overflow");
+    if (PNew > MOld)
+      MNew = PNew;
+    // Check if current pressure has exceeded the limit.
+    if (!Delta.Excess.isValid()) {
+      unsigned ExcessInc = 0;
+      if (PNew > Limit)
+        ExcessInc = POld > Limit ? PNew - POld : PNew - Limit;
+      else if (POld > Limit)
+        ExcessInc = Limit - POld;
+      if (ExcessInc) {
+        Delta.Excess = PressureChange(PSetID);
+        Delta.Excess.setUnitInc(ExcessInc);
+      }
+    }
+    // Check if max pressure has exceeded a critical pressure set max.
+    if (MNew == MOld)
+      continue;
+    if (!Delta.CriticalMax.isValid()) {
+      while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < PSetID)
+        ++CritIdx;
+
+      if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
+        int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+        if (CritInc > 0 && CritInc <= INT16_MAX) {
+          Delta.CriticalMax = PressureChange(PSetID);
+          Delta.CriticalMax.setUnitInc(CritInc);
+        }
+      }
+    }
+    // Check if max pressure has exceeded the current max.
+    if (!Delta.CurrentMax.isValid() && MNew > MaxPressureLimit[PSetID]) {
+      Delta.CurrentMax = PressureChange(PSetID);
+      Delta.CurrentMax.setUnitInc(MNew - MOld);
+    }
+  }
 }
 
 /// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
@@ -713,10 +914,12 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
       // FIXME: allow the caller to pass in the list of vreg uses that remain
       // to be bottom-scheduled to avoid searching uses at each query.
       SlotIndex CurrIdx = getCurrSlot();
-      const LiveInterval *LI = getInterval(Reg);
-      if (LI && LI->killedAt(SlotIdx)
-          && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
-        decreaseRegPressure(Reg);
+      const LiveRange *LR = getLiveRange(Reg);
+      if (LR) {
+        LiveQueryResult LRQ = LR->Query(SlotIdx);
+        if (LRQ.isKill() && !findUseBetween(Reg, CurrIdx, SlotIdx, MRI, LIS)) {
+          decreaseRegPressure(Reg);
+        }
       }
     }
     else if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
@@ -741,7 +944,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
 /// This assumes that the current LiveIn set is sufficient.
 void RegPressureTracker::
 getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
-                            ArrayRef<PressureElement> CriticalPSets,
+                            ArrayRef<PressureChange> CriticalPSets,
                             ArrayRef<unsigned> MaxPressureLimit) {
   // Snapshot Pressure.
   std::vector<unsigned> SavedPressure = CurrSetPressure;
@@ -749,11 +952,12 @@ getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
 
   bumpDownwardPressure(MI);
 
-  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, TRI);
+  computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, RCI,
+                             LiveThruPressure);
   computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
                           MaxPressureLimit, Delta);
-  assert(Delta.CriticalMax.UnitIncrease >= 0 &&
-         Delta.CurrentMax.UnitIncrease >= 0 && "cannot decrease max pressure");
+  assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+         Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
 
   // Restore the tracker's state.
   P.MaxSetPressure.swap(SavedMaxPressure);
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index f82ccbe..75ebdaa 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -31,9 +31,8 @@ using namespace llvm;
 
 /// setUsed - Set the register and its sub-registers as being used.
 void RegScavenger::setUsed(unsigned Reg) {
-  RegsAvailable.reset(Reg);
-
-  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+  for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+       SubRegs.isValid(); ++SubRegs)
     RegsAvailable.reset(*SubRegs);
 }
 
@@ -45,8 +44,8 @@ bool RegScavenger::isAliasUsed(unsigned Reg) const {
 }
 
 void RegScavenger::initRegState() {
-  for (SmallVector<ScavengedInfo, 2>::iterator I = Scavenged.begin(),
-       IE = Scavenged.end(); I != IE; ++I) {
+  for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
+         IE = Scavenged.end(); I != IE; ++I) {
     I->Reg = 0;
     I->Restore = NULL;
   }
@@ -105,8 +104,8 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
 }
 
 void RegScavenger::addRegWithSubRegs(BitVector &BV, unsigned Reg) {
-  BV.set(Reg);
-  for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+  for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+       SubRegs.isValid(); ++SubRegs)
     BV.set(*SubRegs);
 }
 
@@ -182,8 +181,8 @@ void RegScavenger::forward() {
 
   MachineInstr *MI = MBBI;
 
-  for (SmallVector<ScavengedInfo, 2>::iterator I = Scavenged.begin(),
-       IE = Scavenged.end(); I != IE; ++I) {
+  for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
+         IE = Scavenged.end(); I != IE; ++I) {
     if (I->Restore != MI)
       continue;
 
@@ -369,7 +368,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
   // Exclude all the registers being used by the instruction.
   for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
     MachineOperand &MO = I->getOperand(i);
-    if (MO.isReg() && MO.getReg() != 0 &&
+    if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) &&
         !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
       Candidates.reset(MO.getReg());
   }
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 07e5b47..75e3790 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -64,8 +64,8 @@ const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
 /// specified node.
 bool SUnit::addPred(const SDep &D, bool Required) {
   // If this node already has this depenence, don't add a redundant one.
-  for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
-       I != E; ++I) {
+  for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
+         I != E; ++I) {
     // Zero-latency weak edges may be added purely for heuristic ordering. Don't
     // add them if another kind of edge already exists.
     if (!Required && I->getSUnit() == D.getSUnit())
@@ -77,7 +77,7 @@ bool SUnit::addPred(const SDep &D, bool Required) {
         // Find the corresponding successor in N.
         SDep ForwardD = *I;
         ForwardD.setSUnit(this);
-        for (SmallVector<SDep, 4>::iterator II = PredSU->Succs.begin(),
+        for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(),
                EE = PredSU->Succs.end(); II != EE; ++II) {
           if (*II == ForwardD) {
             II->setLatency(D.getLatency());
@@ -132,8 +132,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {
 /// the specified node.
 void SUnit::removePred(const SDep &D) {
   // Find the matching predecessor.
-  for (SmallVector<SDep, 4>::iterator I = Preds.begin(), E = Preds.end();
-       I != E; ++I)
+  for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
+         I != E; ++I)
     if (*I == D) {
       // Find the corresponding successor in N.
       SDep P = D;
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index e4da6a4..7f1f9c4 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -36,6 +36,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <queue>
+
 using namespace llvm;
 
 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
@@ -98,7 +100,7 @@ static void getUnderlyingObjects(const Value *V,
     SmallVector<Value *, 4> Objs;
     GetUnderlyingObjects(const_cast<Value *>(V), Objs);
 
-    for (SmallVector<Value *, 4>::iterator I = Objs.begin(), IE = Objs.end();
+    for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), IE = Objs.end();
          I != IE; ++I) {
       V = *I;
       if (!Visited.insert(V))
@@ -116,12 +118,15 @@ static void getUnderlyingObjects(const Value *V,
   } while (!Working.empty());
 }
 
+typedef SmallVector<PointerIntPair<const Value *, 1, bool>, 4>
+UnderlyingObjectsVector;
+
 /// getUnderlyingObjectsForInstr - If this machine instr has memory reference
 /// information and it can be tracked to a normal reference to a known
 /// object, return the Value for that object.
 static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
-              const MachineFrameInfo *MFI,
-              SmallVectorImpl<std::pair<const Value *, bool> > &Objects) {
+                                         const MachineFrameInfo *MFI,
+                                         UnderlyingObjectsVector &Objects) {
   if (!MI->hasOneMemOperand() ||
       !(*MI->memoperands_begin())->getValue() ||
       (*MI->memoperands_begin())->isVolatile())
@@ -134,8 +139,8 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
   SmallVector<Value *, 4> Objs;
   getUnderlyingObjects(V, Objs);
 
-  for (SmallVector<Value *, 4>::iterator I = Objs.begin(), IE = Objs.end();
-       I != IE; ++I) {
+  for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), IE = Objs.end();
+         I != IE; ++I) {
     bool MayAlias = true;
     V = *I;
 
@@ -155,7 +160,7 @@ static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
       return;
     }
 
-    Objects.push_back(std::make_pair(V, MayAlias));
+    Objects.push_back(UnderlyingObjectsVector::value_type(V, MayAlias));
   }
 }
 
@@ -175,14 +180,11 @@ void ScheduleDAGInstrs::finishBlock() {
 void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
                                     MachineBasicBlock::iterator begin,
                                     MachineBasicBlock::iterator end,
-                                    unsigned endcount) {
+                                    unsigned regioninstrs) {
   assert(bb == BB && "startBlock should set BB");
   RegionBegin = begin;
   RegionEnd = end;
-  EndIndex = endcount;
-  MISUnitMap.clear();
-
-  ScheduleDAG::clearDAG();
+  NumRegionInstrs = regioninstrs;
 }
 
 /// Close the current scheduling region. Don't clear any state in case the
@@ -267,13 +269,10 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
         SU->hasPhysRegDefs = true;
         Dep = SDep(SU, SDep::Data, *Alias);
         RegUse = UseSU->getInstr();
-        Dep.setMinLatency(
-          SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
-                                           RegUse, UseOp, /*FindMin=*/true));
       }
       Dep.setLatency(
-        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
-                                         RegUse, UseOp, /*FindMin=*/false));
+        SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse,
+                                         UseOp));
 
       ST.adjustSchedDependency(SU, UseSU, Dep);
       UseSU->addPred(Dep);
@@ -310,10 +309,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
           DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
         else {
           SDep Dep(SU, Kind, /*Reg=*/*Alias);
-          unsigned OutLatency =
-            SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
-          Dep.setMinLatency(OutLatency);
-          Dep.setLatency(OutLatency);
+          Dep.setLatency(
+            SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
           DefSU->addPred(Dep);
         }
       }
@@ -389,10 +386,8 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
     SUnit *DefSU = DefI->SU;
     if (DefSU != SU && DefSU != &ExitSU) {
       SDep Dep(SU, SDep::Output, Reg);
-      unsigned OutLatency =
-        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
-      Dep.setMinLatency(OutLatency);
-      Dep.setLatency(OutLatency);
+      Dep.setLatency(
+        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
       DefSU->addPred(Dep);
     }
     DefI->SU = SU;
@@ -409,9 +404,19 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
   MachineInstr *MI = SU->getInstr();
   unsigned Reg = MI->getOperand(OperIdx).getReg();
 
+  // Record this local VReg use.
+  VReg2UseMap::iterator UI = VRegUses.find(Reg);
+  for (; UI != VRegUses.end(); ++UI) {
+    if (UI->SU == SU)
+      break;
+  }
+  if (UI == VRegUses.end())
+    VRegUses.insert(VReg2SUnit(Reg, SU));
+
   // Lookup this operand's reaching definition.
   assert(LIS && "vreg dependencies requires LiveIntervals");
-  LiveRangeQuery LRQ(LIS->getInterval(Reg), LIS->getInstructionIndex(MI));
+  LiveQueryResult LRQ
+    = LIS->getInterval(Reg).Query(LIS->getInstructionIndex(MI));
   VNInfo *VNI = LRQ.valueIn();
 
   // VNI will be valid because MachineOperand::readsReg() is checked by caller.
@@ -427,10 +432,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
       // Adjust the dependence latency using operand def/use information, then
       // allow the target to perform its own adjustments.
       int DefOp = Def->findRegisterDefOperandIdx(Reg);
-      dep.setLatency(
-        SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, false));
-      dep.setMinLatency(
-        SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, true));
+      dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx));
 
       const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
       ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
@@ -472,8 +474,8 @@ static inline bool isUnsafeMemoryObject(MachineInstr *MI,
 
   SmallVector<Value *, 4> Objs;
   getUnderlyingObjects(V, Objs);
-  for (SmallVector<Value *, 4>::iterator I = Objs.begin(),
-       IE = Objs.end(); I != IE; ++I) {
+  for (SmallVectorImpl<Value *>::iterator I = Objs.begin(),
+         IE = Objs.end(); I != IE; ++I) {
     V = *I;
 
     if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V)) {
@@ -642,8 +644,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
                          bool isNormalMemory = false) {
   // If this is a false dependency,
   // do not add the edge, but rememeber the rejected node.
-  if (!EnableAASchedMI ||
-      MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+  if (!AA || MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
     SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
     Dep.setLatency(TrueMemOrderLatency);
     SUb->addPred(Dep);
@@ -671,7 +672,7 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
 void ScheduleDAGInstrs::initSUnits() {
   // We'll be allocating one SUnit for each real instruction in the region,
   // which is contained within a basic block.
-  SUnits.reserve(BB->size());
+  SUnits.reserve(NumRegionInstrs);
 
   for (MachineBasicBlock::iterator I = RegionBegin; I != RegionEnd; ++I) {
     MachineInstr *MI = I;
@@ -693,10 +694,22 @@ void ScheduleDAGInstrs::initSUnits() {
 /// DAG builder is an efficient place to do it because it already visits
 /// operands.
 void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
-                                        RegPressureTracker *RPTracker) {
+                                        RegPressureTracker *RPTracker,
+                                        PressureDiffs *PDiffs) {
+  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
+                                                       : ST.useAA();
+  AliasAnalysis *AAForDep = UseAA ? AA : 0;
+
+  MISUnitMap.clear();
+  ScheduleDAG::clearDAG();
+
   // Create an SUnit for each real instruction.
   initSUnits();
 
+  if (PDiffs)
+    PDiffs->init(SUnits.size());
+
   // We build scheduling units by walking a block's instruction list from bottom
   // to top.
 
@@ -722,10 +735,9 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   Uses.setUniverse(TRI->getNumRegs());
 
   assert(VRegDefs.empty() && "Only BuildSchedGraph may access VRegDefs");
-  // FIXME: Allow SparseSet to reserve space for the creation of virtual
-  // registers during scheduling. Don't artificially inflate the Universe
-  // because we want to assert that vregs are not created during DAG building.
+  VRegUses.clear();
   VRegDefs.setUniverse(MRI.getNumVirtRegs());
+  VRegUses.setUniverse(MRI.getNumVirtRegs());
 
   // Model data dependencies between instructions being scheduled and the
   // ExitSU.
@@ -745,17 +757,18 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       DbgMI = MI;
       continue;
     }
+    SUnit *SU = MISUnitMap[MI];
+    assert(SU && "No SUnit mapped to this MI");
+
     if (RPTracker) {
-      RPTracker->recede();
+      PressureDiff *PDiff = PDiffs ? &(*PDiffs)[SU->NodeNum] : 0;
+      RPTracker->recede(/*LiveUses=*/0, PDiff);
       assert(RPTracker->getPos() == prior(MII) && "RPTracker can't find MI");
     }
 
     assert((CanHandleTerminators || (!MI->isTerminator() && !MI->isLabel())) &&
            "Cannot schedule terminators or labels!");
 
-    SUnit *SU = MISUnitMap[MI];
-    assert(SU && "No SUnit mapped to this MI");
-
     // Add register-based dependencies (data, anti, and output).
     bool HasVRegDef = false;
     for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
@@ -833,20 +846,20 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
         unsigned ChainLatency = 0;
         if (AliasChain->getInstr()->mayLoad())
           ChainLatency = TrueMemOrderLatency;
-        addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes,
+        addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes,
                            ChainLatency);
       }
       AliasChain = SU;
       for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-        addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+        addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
                            TrueMemOrderLatency);
       for (MapVector<const Value *, SUnit *>::iterator I = AliasMemDefs.begin(),
            E = AliasMemDefs.end(); I != E; ++I)
-        addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+        addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
       for (MapVector<const Value *, std::vector<SUnit *> >::iterator I =
            AliasMemUses.begin(), E = AliasMemUses.end(); I != E; ++I) {
         for (unsigned i = 0, e = I->second.size(); i != e; ++i)
-          addChainDependency(AA, MFI, SU, I->second[i], RejectMemNodes,
+          addChainDependency(AAForDep, MFI, SU, I->second[i], RejectMemNodes,
                              TrueMemOrderLatency);
       }
       adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -855,7 +868,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       AliasMemDefs.clear();
       AliasMemUses.clear();
     } else if (MI->mayStore()) {
-      SmallVector<std::pair<const Value *, bool>, 4> Objs;
+      UnderlyingObjectsVector Objs;
       getUnderlyingObjectsForInstr(MI, MFI, Objs);
 
       if (Objs.empty()) {
@@ -864,10 +877,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       }
 
       bool MayAlias = false;
-      for (SmallVector<std::pair<const Value *, bool>, 4>::iterator
-           K = Objs.begin(), KE = Objs.end(); K != KE; ++K) {
-        const Value *V = K->first;
-        bool ThisMayAlias = K->second;
+      for (UnderlyingObjectsVector::iterator K = Objs.begin(), KE = Objs.end();
+           K != KE; ++K) {
+        const Value *V = K->getPointer();
+        bool ThisMayAlias = K->getInt();
         if (ThisMayAlias)
           MayAlias = true;
 
@@ -879,7 +892,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
         MapVector<const Value *, SUnit *>::iterator IE =
           ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
         if (I != IE) {
-          addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+          addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+                             0, true);
           I->second = SU;
         } else {
           if (ThisMayAlias)
@@ -894,7 +908,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           ((ThisMayAlias) ? AliasMemUses.end() : NonAliasMemUses.end());
         if (J != JE) {
           for (unsigned i = 0, e = J->second.size(); i != e; ++i)
-            addChainDependency(AA, MFI, SU, J->second[i], RejectMemNodes,
+            addChainDependency(AAForDep, MFI, SU, J->second[i], RejectMemNodes,
                                TrueMemOrderLatency, true);
           J->second.clear();
         }
@@ -903,11 +917,11 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
         // Add dependencies from all the PendingLoads, i.e. loads
         // with no underlying object.
         for (unsigned k = 0, m = PendingLoads.size(); k != m; ++k)
-          addChainDependency(AA, MFI, SU, PendingLoads[k], RejectMemNodes,
+          addChainDependency(AAForDep, MFI, SU, PendingLoads[k], RejectMemNodes,
                              TrueMemOrderLatency);
         // Add dependence on alias chain, if needed.
         if (AliasChain)
-          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
         // But we also should check dependent instructions for the
         // SU in question.
         adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
@@ -929,7 +943,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
       if (MI->isInvariantLoad(AA)) {
         // Invariant load, no chain dependencies needed!
       } else {
-        SmallVector<std::pair<const Value *, bool>, 4> Objs;
+        UnderlyingObjectsVector Objs;
         getUnderlyingObjectsForInstr(MI, MFI, Objs);
 
         if (Objs.empty()) {
@@ -937,7 +951,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           // potentially aliasing stores.
           for (MapVector<const Value *, SUnit *>::iterator I =
                  AliasMemDefs.begin(), E = AliasMemDefs.end(); I != E; ++I)
-            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes);
+            addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes);
 
           PendingLoads.push_back(SU);
           MayAlias = true;
@@ -945,10 +959,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           MayAlias = false;
         }
 
-        for (SmallVector<std::pair<const Value *, bool>, 4>::iterator
+        for (UnderlyingObjectsVector::iterator
              J = Objs.begin(), JE = Objs.end(); J != JE; ++J) {
-          const Value *V = J->first;
-          bool ThisMayAlias = J->second;
+          const Value *V = J->getPointer();
+          bool ThisMayAlias = J->getInt();
 
           if (ThisMayAlias)
             MayAlias = true;
@@ -959,7 +973,8 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           MapVector<const Value *, SUnit *>::iterator IE =
             ((ThisMayAlias) ? AliasMemDefs.end() : NonAliasMemDefs.end());
           if (I != IE)
-            addChainDependency(AA, MFI, SU, I->second, RejectMemNodes, 0, true);
+            addChainDependency(AAForDep, MFI, SU, I->second, RejectMemNodes,
+                               0, true);
           if (ThisMayAlias)
             AliasMemUses[V].push_back(SU);
           else
@@ -969,7 +984,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
           adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes, /*Latency=*/0);
         // Add dependencies on alias and barrier chains, if needed.
         if (MayAlias && AliasChain)
-          addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
+          addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
         if (BarrierChain)
           BarrierChain->addPred(SDep(SU, SDep::Barrier));
       }
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2e09ec0..43f72c5 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -35,6 +35,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -43,6 +45,7 @@ STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
 STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
 STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
+STATISTIC(SlicedLoads, "Number of load sliced");
 
 namespace {
   static cl::opt<bool>
@@ -53,6 +56,14 @@ namespace {
     CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                cl::desc("Include global information in alias analysis"));
 
+  /// Hidden option to stress test load slicing, i.e., when this option
+  /// is enabled, load slicing bypasses most of its profitability guards.
+  static cl::opt<bool>
+  StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
+                    cl::desc("Bypass the profitability model of load "
+                             "slicing"),
+                    cl::init(false));
+
 //------------------------------ DAGCombiner ---------------------------------//
 
   class DAGCombiner {
@@ -62,6 +73,7 @@ namespace {
     CodeGenOpt::Level OptLevel;
     bool LegalOperations;
     bool LegalTypes;
+    bool ForCodeSize;
 
     // Worklist of all of the nodes that need to be simplified.
     //
@@ -144,6 +156,7 @@ namespace {
 
     bool CombineToPreIndexedLoadStore(SDNode *N);
     bool CombineToPostIndexedLoadStore(SDNode *N);
+    bool SliceUpLoad(SDNode *N);
 
     void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
     SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
@@ -154,8 +167,8 @@ namespace {
     SDValue PromoteExtend(SDValue Op);
     bool PromoteLoad(SDValue Op);
 
-    void ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs,
-                         SDValue Trunc, SDValue ExtLoad, DebugLoc DL,
+    void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
+                         SDValue Trunc, SDValue ExtLoad, SDLoc DL,
                          ISD::NodeType ExtType);
 
     /// combine - call the node-specific routine that knows how to fold each
@@ -246,18 +259,18 @@ namespace {
     SDValue visitVECTOR_SHUFFLE(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
-    SDValue ReassociateOps(unsigned Opc, DebugLoc DL, SDValue LHS, SDValue RHS);
+    SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
 
     SDValue visitShiftByConstant(SDNode *N, unsigned Amt);
 
     bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
     SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
-    SDValue SimplifySelect(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2);
-    SDValue SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1, SDValue N2,
+    SDValue SimplifySelect(SDLoc DL, SDValue N0, SDValue N1, SDValue N2);
+    SDValue SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1, SDValue N2,
                              SDValue N3, ISD::CondCode CC,
                              bool NotExtCompare = false);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
-                          DebugLoc DL, bool foldBooleans = true);
+                          SDLoc DL, bool foldBooleans = true);
     SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                          unsigned HiOp);
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
@@ -267,7 +280,7 @@ namespace {
     SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                                bool DemandHighBits = true);
     SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
-    SDNode *MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL);
+    SDNode *MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue TransformFPLoadStorePair(SDNode *N);
@@ -279,15 +292,15 @@ namespace {
     /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
     /// looking for aliasing nodes and adding them to the Aliases vector.
     void GatherAllAliases(SDNode *N, SDValue OriginalChain,
-                          SmallVector<SDValue, 8> &Aliases);
+                          SmallVectorImpl<SDValue> &Aliases);
 
     /// isAlias - Return true if there is any possibility that the two addresses
     /// overlap.
-    bool isAlias(SDValue Ptr1, int64_t Size1,
+    bool isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1,
                  const Value *SrcValue1, int SrcValueOffset1,
                  unsigned SrcValueAlign1,
                  const MDNode *TBAAInfo1,
-                 SDValue Ptr2, int64_t Size2,
+                 SDValue Ptr2, int64_t Size2, bool IsVolatile2,
                  const Value *SrcValue2, int SrcValueOffset2,
                  unsigned SrcValueAlign2,
                  const MDNode *TBAAInfo2) const;
@@ -299,7 +312,7 @@ namespace {
     /// FindAliasInfo - Extracts the relevant alias information from the memory
     /// node.  Returns true if the operand was a load.
     bool FindAliasInfo(SDNode *N,
-                       SDValue &Ptr, int64_t &Size,
+                       SDValue &Ptr, int64_t &Size, bool &IsVolatile,
                        const Value *&SrcValue, int &SrcValueOffset,
                        unsigned &SrcValueAlignment,
                        const MDNode *&TBAAInfo) const;
@@ -315,8 +328,15 @@ namespace {
 
   public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
-      : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
-        OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {}
+        : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+          OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+      AttributeSet FnAttrs =
+          DAG.getMachineFunction().getFunction()->getAttributes();
+      ForCodeSize =
+          FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::OptimizeForSize) ||
+          FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+    }
 
     /// Run - runs the dag combiner on all nodes in the work list
     void Run(CombineLevel AtLevel);
@@ -326,7 +346,11 @@ namespace {
     /// getShiftAmountTy - Returns a type large enough to hold any valid
     /// shift amount - before type legalization these can be huge.
     EVT getShiftAmountTy(EVT LHSTy) {
-      return LegalTypes ? TLI.getShiftAmountTy(LHSTy) : TLI.getPointerTy();
+      assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
+      if (LHSTy.isVector())
+        return LHSTy;
+      return LegalTypes ? TLI.getScalarShiftAmountTy(LHSTy)
+                        : TLI.getPointerTy();
     }
 
     /// isTypeLegal - This method returns true if we are running before type
@@ -335,6 +359,12 @@ namespace {
       if (!LegalTypes) return true;
       return TLI.isTypeLegal(VT);
     }
+
+    /// getSetCCResultType - Convenience wrapper around
+    /// TargetLowering::getSetCCResultType
+    EVT getSetCCResultType(EVT VT) const {
+      return TLI.getSetCCResultType(*DAG.getContext(), VT);
+    }
   };
 }
 
@@ -482,12 +512,12 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
     if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
                            DAG.getTargetLoweringInfo(),
                            &DAG.getTarget().Options, Depth+1))
-      return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+      return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
                          GetNegatedExpression(Op.getOperand(0), DAG,
                                               LegalOperations, Depth+1),
                          Op.getOperand(1));
     // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
-    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+    return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
                        GetNegatedExpression(Op.getOperand(1), DAG,
                                             LegalOperations, Depth+1),
                        Op.getOperand(0));
@@ -501,7 +531,7 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
         return Op.getOperand(1);
 
     // fold (fneg (fsub A, B)) -> (fsub B, A)
-    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+    return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(0));
 
   case ISD::FMUL:
@@ -512,24 +542,24 @@ static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
     if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
                            DAG.getTargetLoweringInfo(),
                            &DAG.getTarget().Options, Depth+1))
-      return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+      return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                          GetNegatedExpression(Op.getOperand(0), DAG,
                                               LegalOperations, Depth+1),
                          Op.getOperand(1));
 
     // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
-    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+    return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                        Op.getOperand(0),
                        GetNegatedExpression(Op.getOperand(1), DAG,
                                             LegalOperations, Depth+1));
 
   case ISD::FP_EXTEND:
   case ISD::FSIN:
-    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), Op.getValueType(),
+    return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
                        GetNegatedExpression(Op.getOperand(0), DAG,
                                             LegalOperations, Depth+1));
   case ISD::FP_ROUND:
-      return DAG.getNode(ISD::FP_ROUND, Op.getDebugLoc(), Op.getValueType(),
+      return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(),
                          GetNegatedExpression(Op.getOperand(0), DAG,
                                               LegalOperations, Depth+1),
                          Op.getOperand(1));
@@ -573,7 +603,7 @@ static bool isOneUseSetCC(SDValue N) {
   return false;
 }
 
-SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
+SDValue DAGCombiner::ReassociateOps(unsigned Opc, SDLoc DL,
                                     SDValue N0, SDValue N1) {
   EVT VT = N0.getValueType();
   if (N0.getOpcode() == Opc && isa<ConstantSDNode>(N0.getOperand(1))) {
@@ -587,7 +617,7 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
     }
     if (N0.hasOneUse()) {
       // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one use
-      SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT,
                                    N0.getOperand(0), N1);
       AddToWorkList(OpNode.getNode());
       return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
@@ -605,7 +635,7 @@ SDValue DAGCombiner::ReassociateOps(unsigned Opc, DebugLoc DL,
     }
     if (N1.hasOneUse()) {
       // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one use
-      SDValue OpNode = DAG.getNode(Opc, N0.getDebugLoc(), VT,
+      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT,
                                    N1.getOperand(0), N0);
       AddToWorkList(OpNode.getNode());
       return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
@@ -706,7 +736,7 @@ bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
 }
 
 void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
-  DebugLoc dl = Load->getDebugLoc();
+  SDLoc dl(Load);
   EVT VT = Load->getValueType(0);
   SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, SDValue(ExtLoad, 0));
 
@@ -725,7 +755,7 @@ void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
 
 SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
   Replace = false;
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
     EVT MemVT = LD->getMemoryVT();
     ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
@@ -735,9 +765,7 @@ SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
     Replace = true;
     return DAG.getExtLoad(ExtType, dl, PVT,
                           LD->getChain(), LD->getBasePtr(),
-                          LD->getPointerInfo(),
-                          MemVT, LD->isVolatile(),
-                          LD->isNonTemporal(), LD->getAlignment());
+                          MemVT, LD->getMemOperand());
   }
 
   unsigned Opc = Op.getOpcode();
@@ -767,7 +795,7 @@ SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
   if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
     return SDValue();
   EVT OldVT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   bool Replace = false;
   SDValue NewOp = PromoteOperand(Op, PVT, Replace);
   if (NewOp.getNode() == 0)
@@ -782,7 +810,7 @@ SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
 
 SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
   EVT OldVT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   bool Replace = false;
   SDValue NewOp = PromoteOperand(Op, PVT, Replace);
   if (NewOp.getNode() == 0)
@@ -845,7 +873,7 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
 
     DEBUG(dbgs() << "\nPromoting ";
           Op.getNode()->dump(&DAG));
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
     return DAG.getNode(ISD::TRUNCATE, dl, VT,
                        DAG.getNode(Opc, dl, PVT, NN0, NN1));
   }
@@ -892,7 +920,7 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
 
     DEBUG(dbgs() << "\nPromoting ";
           Op.getNode()->dump(&DAG));
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
     return DAG.getNode(ISD::TRUNCATE, dl, VT,
                        DAG.getNode(Opc, dl, PVT, N0, Op.getOperand(1)));
   }
@@ -923,7 +951,7 @@ SDValue DAGCombiner::PromoteExtend(SDValue Op) {
     // fold (aext (sext x)) -> (sext x)
     DEBUG(dbgs() << "\nPromoting ";
           Op.getNode()->dump(&DAG));
-    return DAG.getNode(Op.getOpcode(), Op.getDebugLoc(), VT, Op.getOperand(0));
+    return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
   }
   return SDValue();
 }
@@ -948,7 +976,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
   if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
     assert(PVT != VT && "Don't know what type to promote to!");
 
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
     SDNode *N = Op.getNode();
     LoadSDNode *LD = cast<LoadSDNode>(N);
     EVT MemVT = LD->getMemoryVT();
@@ -958,9 +986,7 @@ bool DAGCombiner::PromoteLoad(SDValue Op) {
       : LD->getExtensionType();
     SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT,
                                    LD->getChain(), LD->getBasePtr(),
-                                   LD->getPointerInfo(),
-                                   MemVT, LD->isVolatile(),
-                                   LD->isNonTemporal(), LD->getAlignment());
+                                   MemVT, LD->getMemOperand());
     SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, VT, NewLD);
 
     DEBUG(dbgs() << "\nPromoting ";
@@ -1008,7 +1034,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
   // try and combine it.
   while (!WorkListContents.empty()) {
     SDNode *N;
-    // The WorkListOrder holds the SDNodes in order, but it may contain duplicates.
+    // The WorkListOrder holds the SDNodes in order, but it may contain
+    // duplicates.
     // In order to avoid a linear scan, we use a set (O(log N)) to hold what the
     // worklist *should* contain, and check the node we want to visit is should
     // actually be visited.
@@ -1245,7 +1272,7 @@ static SDValue getInputChainForNode(SDNode *N) {
   if (unsigned NumOps = N->getNumOperands()) {
     if (N->getOperand(0).getValueType() == MVT::Other)
       return N->getOperand(0);
-    else if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
+    if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
       return N->getOperand(NumOps-1);
     for (unsigned i = 1; i < NumOps-1; ++i)
       if (N->getOperand(i).getValueType() == MVT::Other)
@@ -1320,7 +1347,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
       Result = DAG.getEntryNode();
     } else {
       // New and improved token factor.
-      Result = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+      Result = DAG.getNode(ISD::TokenFactor, SDLoc(N),
                            MVT::Other, &Ops[0], Ops.size());
     }
 
@@ -1350,7 +1377,7 @@ SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
 }
 
 static
-SDValue combineShlAddConstant(DebugLoc DL, SDValue N0, SDValue N1,
+SDValue combineShlAddConstant(SDLoc DL, SDValue N0, SDValue N1,
                               SelectionDAG &DAG) {
   EVT VT = N0.getValueType();
   SDValue N00 = N0.getOperand(0);
@@ -1360,10 +1387,10 @@ SDValue combineShlAddConstant(DebugLoc DL, SDValue N0, SDValue N1,
   if (N01C && N00.getOpcode() == ISD::ADD && N00.getNode()->hasOneUse() &&
       isa<ConstantSDNode>(N00.getOperand(1))) {
     // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), )
-    N0 = DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT,
-                     DAG.getNode(ISD::SHL, N00.getDebugLoc(), VT,
+    N0 = DAG.getNode(ISD::ADD, SDLoc(N0), VT,
+                     DAG.getNode(ISD::SHL, SDLoc(N00), VT,
                                  N00.getOperand(0), N01),
-                     DAG.getNode(ISD::SHL, N01.getDebugLoc(), VT,
+                     DAG.getNode(ISD::SHL, SDLoc(N01), VT,
                                  N00.getOperand(1), N01));
     return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
   }
@@ -1400,7 +1427,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     return DAG.FoldConstantArithmetic(ISD::ADD, VT, N0C, N1C);
   // canonicalize constant to RHS
   if (N0C && !N1C)
-    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1, N0);
+    return DAG.getNode(ISD::ADD, SDLoc(N), VT, N1, N0);
   // fold (add x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
@@ -1408,28 +1435,28 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
     if (!LegalOperations && TLI.isOffsetFoldingLegal(GA) && N1C &&
         GA->getOpcode() == ISD::GlobalAddress)
-      return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT,
+      return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
                                   GA->getOffset() +
                                     (uint64_t)N1C->getSExtValue());
   // fold ((c1-A)+c2) -> (c1+c2)-A
   if (N1C && N0.getOpcode() == ISD::SUB)
     if (ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getOperand(0)))
-      return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::SUB, SDLoc(N), VT,
                          DAG.getConstant(N1C->getAPIntValue()+
                                          N0C->getAPIntValue(), VT),
                          N0.getOperand(1));
   // reassociate add
-  SDValue RADD = ReassociateOps(ISD::ADD, N->getDebugLoc(), N0, N1);
+  SDValue RADD = ReassociateOps(ISD::ADD, SDLoc(N), N0, N1);
   if (RADD.getNode() != 0)
     return RADD;
   // fold ((0-A) + B) -> B-A
   if (N0.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N0.getOperand(0)) &&
       cast<ConstantSDNode>(N0.getOperand(0))->isNullValue())
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1, N0.getOperand(1));
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1, N0.getOperand(1));
   // fold (A + (0-B)) -> A-B
   if (N1.getOpcode() == ISD::SUB && isa<ConstantSDNode>(N1.getOperand(0)) &&
       cast<ConstantSDNode>(N1.getOperand(0))->isNullValue())
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, N1.getOperand(1));
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, N1.getOperand(1));
   // fold (A+(B-A)) -> B
   if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
     return N1.getOperand(0);
@@ -1439,18 +1466,18 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   // fold (A+(B-(A+C))) to (B-C)
   if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
       N0 == N1.getOperand(1).getOperand(0))
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1.getOperand(0),
                        N1.getOperand(1).getOperand(1));
   // fold (A+(B-(C+A))) to (B-C)
   if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
       N0 == N1.getOperand(1).getOperand(1))
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1.getOperand(0),
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1.getOperand(0),
                        N1.getOperand(1).getOperand(0));
   // fold (A+((B-A)+or-C)) to (B+or-C)
   if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
       N1.getOperand(0).getOpcode() == ISD::SUB &&
       N0 == N1.getOperand(0).getOperand(1))
-    return DAG.getNode(N1.getOpcode(), N->getDebugLoc(), VT,
+    return DAG.getNode(N1.getOpcode(), SDLoc(N), VT,
                        N1.getOperand(0).getOperand(0), N1.getOperand(1));
 
   // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
@@ -1461,9 +1488,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     SDValue N11 = N1.getOperand(1);
 
     if (isa<ConstantSDNode>(N00) || isa<ConstantSDNode>(N10))
-      return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
-                         DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT, N00, N10),
-                         DAG.getNode(ISD::ADD, N1.getDebugLoc(), VT, N01, N11));
+      return DAG.getNode(ISD::SUB, SDLoc(N), VT,
+                         DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
+                         DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
   }
 
   if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0)))
@@ -1481,17 +1508,17 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
       // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
       // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
       if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
-        return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1);
+        return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1);
     }
   }
 
   // fold (add (shl (add x, c1), c2), ) -> (add (add (shl x, c2), c1<<c2), )
   if (N0.getOpcode() == ISD::SHL && N0.getNode()->hasOneUse()) {
-    SDValue Result = combineShlAddConstant(N->getDebugLoc(), N0, N1, DAG);
+    SDValue Result = combineShlAddConstant(SDLoc(N), N0, N1, DAG);
     if (Result.getNode()) return Result;
   }
   if (N1.getOpcode() == ISD::SHL && N1.getNode()->hasOneUse()) {
-    SDValue Result = combineShlAddConstant(N->getDebugLoc(), N1, N0, DAG);
+    SDValue Result = combineShlAddConstant(SDLoc(N), N1, N0, DAG);
     if (Result.getNode()) return Result;
   }
 
@@ -1501,8 +1528,8 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     if (ConstantSDNode *C =
           dyn_cast<ConstantSDNode>(N1.getOperand(0).getOperand(0)))
       if (C->getAPIntValue() == 0)
-        return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0,
-                           DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N0,
+                           DAG.getNode(ISD::SHL, SDLoc(N), VT,
                                        N1.getOperand(0).getOperand(1),
                                        N1.getOperand(1)));
   if (N0.getOpcode() == ISD::SHL &&
@@ -1510,8 +1537,8 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     if (ConstantSDNode *C =
           dyn_cast<ConstantSDNode>(N0.getOperand(0).getOperand(0)))
       if (C->getAPIntValue() == 0)
-        return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N1,
-                           DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1,
+                           DAG.getNode(ISD::SHL, SDLoc(N), VT,
                                        N0.getOperand(0).getOperand(1),
                                        N0.getOperand(1)));
 
@@ -1524,7 +1551,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
     // and similar xforms where the inner op is either ~0 or 0.
     if (NumSignBits == DestBits && AndOp1 && AndOp1->isOne()) {
-      DebugLoc DL = N->getDebugLoc();
+      SDLoc DL(N);
       return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0);
     }
   }
@@ -1533,7 +1560,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
   if (N0.getOpcode() == ISD::SIGN_EXTEND &&
       N0.getOperand(0).getValueType() == MVT::i1 &&
       !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) {
-    DebugLoc DL = N->getDebugLoc();
+    SDLoc DL(N);
     SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
     return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
   }
@@ -1550,18 +1577,18 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
 
   // If the flag result is dead, turn this into an ADD.
   if (!N->hasAnyUseOfValue(1))
-    return CombineTo(N, DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N1),
+    return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1),
                      DAG.getNode(ISD::CARRY_FALSE,
-                                 N->getDebugLoc(), MVT::Glue));
+                                 SDLoc(N), MVT::Glue));
 
   // canonicalize constant to RHS.
   if (N0C && !N1C)
-    return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N1, N0);
+    return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0);
 
   // fold (addc x, 0) -> x + no carry out
   if (N1C && N1C->isNullValue())
     return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
-                                        N->getDebugLoc(), MVT::Glue));
+                                        SDLoc(N), MVT::Glue));
 
   // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
   APInt LHSZero, LHSOne;
@@ -1574,9 +1601,9 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
     // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
     // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
     if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
-      return CombineTo(N, DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N1),
+      return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1),
                        DAG.getNode(ISD::CARRY_FALSE,
-                                   N->getDebugLoc(), MVT::Glue));
+                                   SDLoc(N), MVT::Glue));
   }
 
   return SDValue();
@@ -1591,30 +1618,25 @@ SDValue DAGCombiner::visitADDE(SDNode *N) {
 
   // canonicalize constant to RHS
   if (N0C && !N1C)
-    return DAG.getNode(ISD::ADDE, N->getDebugLoc(), N->getVTList(),
+    return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
                        N1, N0, CarryIn);
 
   // fold (adde x, y, false) -> (addc x, y)
   if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
-    return DAG.getNode(ISD::ADDC, N->getDebugLoc(), N->getVTList(), N0, N1);
+    return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);
 
   return SDValue();
 }
 
 // Since it may not be valid to emit a fold to zero for vector initializers
 // check if we can before folding.
-static SDValue tryFoldToZero(DebugLoc DL, const TargetLowering &TLI, EVT VT,
-                             SelectionDAG &DAG, bool LegalOperations) {
-  if (!VT.isVector()) {
+static SDValue tryFoldToZero(SDLoc DL, const TargetLowering &TLI, EVT VT,
+                             SelectionDAG &DAG,
+                             bool LegalOperations, bool LegalTypes) {
+  if (!VT.isVector())
+    return DAG.getConstant(0, VT);
+  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
     return DAG.getConstant(0, VT);
-  }
-  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
-    // Produce a vector of zeros.
-    SDValue El = DAG.getConstant(0, VT.getVectorElementType());
-    std::vector<SDValue> Ops(VT.getVectorNumElements(), El);
-    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
-      &Ops[0], Ops.size());
-  }
   return SDValue();
 }
 
@@ -1640,17 +1662,17 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   // fold (sub x, x) -> 0
   // FIXME: Refactor this and xor and other similar operations together.
   if (N0 == N1)
-    return tryFoldToZero(N->getDebugLoc(), TLI, VT, DAG, LegalOperations);
+    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
   // fold (sub c1, c2) -> c1-c2
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SUB, VT, N0C, N1C);
   // fold (sub x, c) -> (add x, -c)
   if (N1C)
-    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0,
+    return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0,
                        DAG.getConstant(-N1C->getAPIntValue(), VT));
   // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
   if (N0C && N0C->isAllOnesValue())
-    return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0);
+    return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
   // fold A-(A-B) -> B
   if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
     return N1.getOperand(1);
@@ -1664,7 +1686,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
     SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(),
                                    VT);
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC,
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT, NewC,
                        N1.getOperand(0));
   }
   // fold ((A+(B+or-C))-B) -> A+or-C
@@ -1672,19 +1694,19 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
       (N0.getOperand(1).getOpcode() == ISD::SUB ||
        N0.getOperand(1).getOpcode() == ISD::ADD) &&
       N0.getOperand(1).getOperand(0) == N1)
-    return DAG.getNode(N0.getOperand(1).getOpcode(), N->getDebugLoc(), VT,
+    return DAG.getNode(N0.getOperand(1).getOpcode(), SDLoc(N), VT,
                        N0.getOperand(0), N0.getOperand(1).getOperand(1));
   // fold ((A+(C+B))-B) -> A+C
   if (N0.getOpcode() == ISD::ADD &&
       N0.getOperand(1).getOpcode() == ISD::ADD &&
       N0.getOperand(1).getOperand(1) == N1)
-    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::ADD, SDLoc(N), VT,
                        N0.getOperand(0), N0.getOperand(1).getOperand(0));
   // fold ((A-(B-C))-C) -> A-B
   if (N0.getOpcode() == ISD::SUB &&
       N0.getOperand(1).getOpcode() == ISD::SUB &&
       N0.getOperand(1).getOperand(1) == N1)
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT,
                        N0.getOperand(0), N0.getOperand(1).getOperand(0));
 
   // If either operand of a sub is undef, the result is undef
@@ -1698,7 +1720,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
     if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
       // fold (sub Sym, c) -> Sym-c
       if (N1C && GA->getOpcode() == ISD::GlobalAddress)
-        return DAG.getGlobalAddress(GA->getGlobal(), N1C->getDebugLoc(), VT,
+        return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
                                     GA->getOffset() -
                                       (uint64_t)N1C->getSExtValue());
       // fold (sub Sym+c1, Sym+c2) -> c1-c2
@@ -1720,25 +1742,25 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) {
 
   // If the flag result is dead, turn this into an SUB.
   if (!N->hasAnyUseOfValue(1))
-    return CombineTo(N, DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, N1),
-                     DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+    return CombineTo(N, DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, N1),
+                     DAG.getNode(ISD::CARRY_FALSE, SDLoc(N),
                                  MVT::Glue));
 
   // fold (subc x, x) -> 0 + no borrow
   if (N0 == N1)
     return CombineTo(N, DAG.getConstant(0, VT),
-                     DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+                     DAG.getNode(ISD::CARRY_FALSE, SDLoc(N),
                                  MVT::Glue));
 
   // fold (subc x, 0) -> x + no borrow
   if (N1C && N1C->isNullValue())
-    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, SDLoc(N),
                                         MVT::Glue));
 
   // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
   if (N0C && N0C->isAllOnesValue())
-    return CombineTo(N, DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0),
-                     DAG.getNode(ISD::CARRY_FALSE, N->getDebugLoc(),
+    return CombineTo(N, DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0),
+                     DAG.getNode(ISD::CARRY_FALSE, SDLoc(N),
                                  MVT::Glue));
 
   return SDValue();
@@ -1751,63 +1773,102 @@ SDValue DAGCombiner::visitSUBE(SDNode *N) {
 
   // fold (sube x, y, false) -> (subc x, y)
   if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
-    return DAG.getNode(ISD::SUBC, N->getDebugLoc(), N->getVTList(), N0, N1);
+    return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1);
 
   return SDValue();
 }
 
+/// isConstantSplatVector - Returns true if N is a BUILD_VECTOR node whose
+/// elements are all the same constant or undefined.
+static bool isConstantSplatVector(SDNode *N, APInt& SplatValue) {
+  BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(N);
+  if (!C)
+    return false;
+
+  APInt SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  EVT EltVT = N->getValueType(0).getVectorElementType();
+  return (C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                             HasAnyUndefs) &&
+          EltVT.getSizeInBits() >= SplatBitSize);
+}
+
 SDValue DAGCombiner::visitMUL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
+  // fold (mul x, undef) -> 0
+  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
+    return DAG.getConstant(0, VT);
+
+  bool N0IsConst = false;
+  bool N1IsConst = false;
+  APInt ConstValue0, ConstValue1;
   // fold vector ops
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp.getNode()) return FoldedVOp;
+
+    N0IsConst = isConstantSplatVector(N0.getNode(), ConstValue0);
+    N1IsConst = isConstantSplatVector(N1.getNode(), ConstValue1);
+  } else {
+    N0IsConst = dyn_cast<ConstantSDNode>(N0) != 0;
+    ConstValue0 = N0IsConst ? (dyn_cast<ConstantSDNode>(N0))->getAPIntValue()
+                            : APInt();
+    N1IsConst = dyn_cast<ConstantSDNode>(N1) != 0;
+    ConstValue1 = N1IsConst ? (dyn_cast<ConstantSDNode>(N1))->getAPIntValue()
+                            : APInt();
   }
 
-  // fold (mul x, undef) -> 0
-  if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
-    return DAG.getConstant(0, VT);
   // fold (mul c1, c2) -> c1*c2
-  if (N0C && N1C)
-    return DAG.FoldConstantArithmetic(ISD::MUL, VT, N0C, N1C);
+  if (N0IsConst && N1IsConst)
+    return DAG.FoldConstantArithmetic(ISD::MUL, VT, N0.getNode(), N1.getNode());
+
   // canonicalize constant to RHS
-  if (N0C && !N1C)
-    return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT, N1, N0);
+  if (N0IsConst && !N1IsConst)
+    return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
   // fold (mul x, 0) -> 0
-  if (N1C && N1C->isNullValue())
+  if (N1IsConst && ConstValue1 == 0)
     return N1;
+  // We require a splat of the entire scalar bit width for non-contiguous
+  // bit patterns.
+  bool IsFullSplat =
+    ConstValue1.getBitWidth() == VT.getScalarType().getSizeInBits();
+  // fold (mul x, 1) -> x
+  if (N1IsConst && ConstValue1 == 1 && IsFullSplat)
+    return N0;
   // fold (mul x, -1) -> 0-x
-  if (N1C && N1C->isAllOnesValue())
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+  if (N1IsConst && ConstValue1.isAllOnesValue())
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT,
                        DAG.getConstant(0, VT), N0);
   // fold (mul x, (1 << c)) -> x << c
-  if (N1C && N1C->getAPIntValue().isPowerOf2())
-    return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
-                       DAG.getConstant(N1C->getAPIntValue().logBase2(),
+  if (N1IsConst && ConstValue1.isPowerOf2() && IsFullSplat)
+    return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
+                       DAG.getConstant(ConstValue1.logBase2(),
                                        getShiftAmountTy(N0.getValueType())));
   // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
-  if (N1C && (-N1C->getAPIntValue()).isPowerOf2()) {
-    unsigned Log2Val = (-N1C->getAPIntValue()).logBase2();
+  if (N1IsConst && (-ConstValue1).isPowerOf2() && IsFullSplat) {
+    unsigned Log2Val = (-ConstValue1).logBase2();
     // FIXME: If the input is something that is easily negated (e.g. a
     // single-use add), we should put the negate there.
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT,
                        DAG.getConstant(0, VT),
-                       DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
+                       DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
                             DAG.getConstant(Log2Val,
                                       getShiftAmountTy(N0.getValueType()))));
   }
+
+  APInt Val;
   // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
-  if (N1C && N0.getOpcode() == ISD::SHL &&
-      isa<ConstantSDNode>(N0.getOperand(1))) {
-    SDValue C3 = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+  if (N1IsConst && N0.getOpcode() == ISD::SHL &&
+      (isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
+                     isa<ConstantSDNode>(N0.getOperand(1)))) {
+    SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT,
                              N1, N0.getOperand(1));
     AddToWorkList(C3.getNode());
-    return DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::MUL, SDLoc(N), VT,
                        N0.getOperand(0), C3);
   }
 
@@ -1816,7 +1877,9 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   {
     SDValue Sh(0,0), Y(0,0);
     // Check for both (mul (shl X, C), Y)  and  (mul Y, (shl X, C)).
-    if (N0.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N0.getOperand(1)) &&
+    if (N0.getOpcode() == ISD::SHL &&
+        (isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
+                       isa<ConstantSDNode>(N0.getOperand(1))) &&
         N0.getNode()->hasOneUse()) {
       Sh = N0; Y = N1;
     } else if (N1.getOpcode() == ISD::SHL &&
@@ -1826,24 +1889,25 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
     }
 
     if (Sh.getNode()) {
-      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+      SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT,
                                 Sh.getOperand(0), Y);
-      return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::SHL, SDLoc(N), VT,
                          Mul, Sh.getOperand(1));
     }
   }
 
   // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
-  if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
-      isa<ConstantSDNode>(N0.getOperand(1)))
-    return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT,
-                       DAG.getNode(ISD::MUL, N0.getDebugLoc(), VT,
+  if (N1IsConst && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+      (isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
+                     isa<ConstantSDNode>(N0.getOperand(1))))
+    return DAG.getNode(ISD::ADD, SDLoc(N), VT,
+                       DAG.getNode(ISD::MUL, SDLoc(N0), VT,
                                    N0.getOperand(0), N1),
-                       DAG.getNode(ISD::MUL, N1.getDebugLoc(), VT,
+                       DAG.getNode(ISD::MUL, SDLoc(N1), VT,
                                    N0.getOperand(1), N1));
 
   // reassociate mul
-  SDValue RMUL = ReassociateOps(ISD::MUL, N->getDebugLoc(), N0, N1);
+  SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1);
   if (RMUL.getNode() != 0)
     return RMUL;
 
@@ -1871,13 +1935,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
     return N0;
   // fold (sdiv X, -1) -> 0-X
   if (N1C && N1C->isAllOnesValue())
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT,
                        DAG.getConstant(0, VT), N0);
   // If we know the sign bits of both operands are zero, strength reduce to a
   // udiv instead.  Handles (X&15) /s 4 -> X&15 >> 2
   if (!VT.isVector()) {
     if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
-      return DAG.getNode(ISD::UDIV, N->getDebugLoc(), N1.getValueType(),
+      return DAG.getNode(ISD::UDIV, SDLoc(N), N1.getValueType(),
                          N0, N1);
   }
   // fold (sdiv X, pow2) -> simple ops after legalize
@@ -1892,19 +1956,19 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
     unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
 
     // Splat the sign bit into the register
-    SDValue SGN = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
+    SDValue SGN = DAG.getNode(ISD::SRA, SDLoc(N), VT, N0,
                               DAG.getConstant(VT.getSizeInBits()-1,
                                        getShiftAmountTy(N0.getValueType())));
     AddToWorkList(SGN.getNode());
 
     // Add (N0 < 0) ? abs2 - 1 : 0;
-    SDValue SRL = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, SGN,
+    SDValue SRL = DAG.getNode(ISD::SRL, SDLoc(N), VT, SGN,
                               DAG.getConstant(VT.getSizeInBits() - lg2,
                                        getShiftAmountTy(SGN.getValueType())));
-    SDValue ADD = DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, SRL);
+    SDValue ADD = DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, SRL);
     AddToWorkList(SRL.getNode());
     AddToWorkList(ADD.getNode());    // Divide by pow2
-    SDValue SRA = DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, ADD,
+    SDValue SRA = DAG.getNode(ISD::SRA, SDLoc(N), VT, ADD,
                   DAG.getConstant(lg2, getShiftAmountTy(ADD.getValueType())));
 
     // If we're dividing by a positive value, we're done.  Otherwise, we must
@@ -1913,7 +1977,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
       return SRA;
 
     AddToWorkList(SRA.getNode());
-    return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::SUB, SDLoc(N), VT,
                        DAG.getConstant(0, VT), SRA);
   }
 
@@ -1952,7 +2016,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
     return DAG.FoldConstantArithmetic(ISD::UDIV, VT, N0C, N1C);
   // fold (udiv x, (1 << c)) -> x >>u c
   if (N1C && N1C->getAPIntValue().isPowerOf2())
-    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
+    return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0,
                        DAG.getConstant(N1C->getAPIntValue().logBase2(),
                                        getShiftAmountTy(N0.getValueType())));
   // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
@@ -1960,13 +2024,13 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
     if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
       if (SHC->getAPIntValue().isPowerOf2()) {
         EVT ADDVT = N1.getOperand(1).getValueType();
-        SDValue Add = DAG.getNode(ISD::ADD, N->getDebugLoc(), ADDVT,
+        SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N), ADDVT,
                                   N1.getOperand(1),
                                   DAG.getConstant(SHC->getAPIntValue()
                                                                   .logBase2(),
                                                   ADDVT));
         AddToWorkList(Add.getNode());
-        return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, Add);
+        return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, Add);
       }
     }
   }
@@ -2000,19 +2064,19 @@ SDValue DAGCombiner::visitSREM(SDNode *N) {
   // urem instead.  Handles (X & 0x0FFFFFFF) %s 16 -> X&15
   if (!VT.isVector()) {
     if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
-      return DAG.getNode(ISD::UREM, N->getDebugLoc(), VT, N0, N1);
+      return DAG.getNode(ISD::UREM, SDLoc(N), VT, N0, N1);
   }
 
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   if (N1C && !N1C->isNullValue()) {
-    SDValue Div = DAG.getNode(ISD::SDIV, N->getDebugLoc(), VT, N0, N1);
+    SDValue Div = DAG.getNode(ISD::SDIV, SDLoc(N), VT, N0, N1);
     AddToWorkList(Div.getNode());
     SDValue OptimizedDiv = combine(Div.getNode());
     if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
-      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+      SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT,
                                 OptimizedDiv, N1);
-      SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+      SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul);
       AddToWorkList(Mul.getNode());
       return Sub;
     }
@@ -2040,18 +2104,18 @@ SDValue DAGCombiner::visitUREM(SDNode *N) {
     return DAG.FoldConstantArithmetic(ISD::UREM, VT, N0C, N1C);
   // fold (urem x, pow2) -> (and x, pow2-1)
   if (N1C && !N1C->isNullValue() && N1C->getAPIntValue().isPowerOf2())
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0,
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N0,
                        DAG.getConstant(N1C->getAPIntValue()-1,VT));
   // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
   if (N1.getOpcode() == ISD::SHL) {
     if (ConstantSDNode *SHC = dyn_cast<ConstantSDNode>(N1.getOperand(0))) {
       if (SHC->getAPIntValue().isPowerOf2()) {
         SDValue Add =
-          DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N1,
+          DAG.getNode(ISD::ADD, SDLoc(N), VT, N1,
                  DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()),
                                  VT));
         AddToWorkList(Add.getNode());
-        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, Add);
+        return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, Add);
       }
     }
   }
@@ -2059,13 +2123,13 @@ SDValue DAGCombiner::visitUREM(SDNode *N) {
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
   if (N1C && !N1C->isNullValue()) {
-    SDValue Div = DAG.getNode(ISD::UDIV, N->getDebugLoc(), VT, N0, N1);
+    SDValue Div = DAG.getNode(ISD::UDIV, SDLoc(N), VT, N0, N1);
     AddToWorkList(Div.getNode());
     SDValue OptimizedDiv = combine(Div.getNode());
     if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
-      SDValue Mul = DAG.getNode(ISD::MUL, N->getDebugLoc(), VT,
+      SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT,
                                 OptimizedDiv, N1);
-      SDValue Sub = DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, N0, Mul);
+      SDValue Sub = DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, Mul);
       AddToWorkList(Mul.getNode());
       return Sub;
     }
@@ -2086,14 +2150,14 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
   SDValue N1 = N->getOperand(1);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // fold (mulhs x, 0) -> 0
   if (N1C && N1C->isNullValue())
     return N1;
   // fold (mulhs x, 1) -> (sra x, size(x)-1)
   if (N1C && N1C->getAPIntValue() == 1)
-    return DAG.getNode(ISD::SRA, N->getDebugLoc(), N0.getValueType(), N0,
+    return DAG.getNode(ISD::SRA, SDLoc(N), N0.getValueType(), N0,
                        DAG.getConstant(N0.getValueType().getSizeInBits() - 1,
                                        getShiftAmountTy(N0.getValueType())));
   // fold (mulhs x, undef) -> 0
@@ -2124,7 +2188,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
   SDValue N1 = N->getOperand(1);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // fold (mulhu x, 0) -> 0
   if (N1C && N1C->isNullValue())
@@ -2166,7 +2230,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
   if (!HiExists &&
       (!LegalOperations ||
        TLI.isOperationLegal(LoOp, N->getValueType(0)))) {
-    SDValue Res = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0),
                               N->op_begin(), N->getNumOperands());
     return CombineTo(N, Res, Res);
   }
@@ -2176,7 +2240,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
   if (!LoExists &&
       (!LegalOperations ||
        TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
-    SDValue Res = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1),
                               N->op_begin(), N->getNumOperands());
     return CombineTo(N, Res, Res);
   }
@@ -2187,7 +2251,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
 
   // If the two computed results can be simplified separately, separate them.
   if (LoExists) {
-    SDValue Lo = DAG.getNode(LoOp, N->getDebugLoc(), N->getValueType(0),
+    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0),
                              N->op_begin(), N->getNumOperands());
     AddToWorkList(Lo.getNode());
     SDValue LoOpt = combine(Lo.getNode());
@@ -2198,7 +2262,7 @@ SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
   }
 
   if (HiExists) {
-    SDValue Hi = DAG.getNode(HiOp, N->getDebugLoc(), N->getValueType(1),
+    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1),
                              N->op_begin(), N->getNumOperands());
     AddToWorkList(Hi.getNode());
     SDValue HiOpt = combine(Hi.getNode());
@@ -2216,7 +2280,7 @@ SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
   if (Res.getNode()) return Res;
 
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // If the type twice as wide is legal, transform the mulhu to a wider multiply
   // plus a shift.
@@ -2246,7 +2310,7 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
   if (Res.getNode()) return Res;
 
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // If the type twice as wide is legal, transform the mulhu to a wider multiply
   // plus a shift.
@@ -2275,7 +2339,7 @@ SDValue DAGCombiner::visitSMULO(SDNode *N) {
   // (smulo x, 2) -> (saddo x, x)
   if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
     if (C2->getAPIntValue() == 2)
-      return DAG.getNode(ISD::SADDO, N->getDebugLoc(), N->getVTList(),
+      return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(),
                          N->getOperand(0), N->getOperand(0));
 
   return SDValue();
@@ -2285,7 +2349,7 @@ SDValue DAGCombiner::visitUMULO(SDNode *N) {
   // (umulo x, 2) -> (uaddo x, x)
   if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
     if (C2->getAPIntValue() == 2)
-      return DAG.getNode(ISD::UADDO, N->getDebugLoc(), N->getVTList(),
+      return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(),
                          N->getOperand(0), N->getOperand(0));
 
   return SDValue();
@@ -2336,11 +2400,11 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
       !VT.isVector() &&
       Op0VT == N1.getOperand(0).getValueType() &&
       (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
-    SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
                                  N0.getOperand(0).getValueType(),
                                  N0.getOperand(0), N1.getOperand(0));
     AddToWorkList(ORNode.getNode());
-    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, ORNode);
+    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode);
   }
 
   // For each of OP in SHL/SRL/SRA/AND...
@@ -2350,11 +2414,11 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
        N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
       N0.getOperand(1) == N1.getOperand(1)) {
-    SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
+    SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
                                  N0.getOperand(0).getValueType(),
                                  N0.getOperand(0), N1.getOperand(0));
     AddToWorkList(ORNode.getNode());
-    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
                        ORNode, N0.getOperand(1));
   }
 
@@ -2372,7 +2436,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
     SDValue In1 = N1.getOperand(0);
     EVT In0Ty = In0.getValueType();
     EVT In1Ty = In1.getValueType();
-    DebugLoc DL = N->getDebugLoc();
+    SDLoc DL(N);
     // If both incoming values are integers, and the original types are the
     // same.
     if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
@@ -2414,10 +2478,10 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
     }
 
     if (SameMask) {
-      SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+      SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                N0.getOperand(0), N1.getOperand(0));
       AddToWorkList(Op.getNode());
-      return DAG.getVectorShuffle(VT, N->getDebugLoc(), Op,
+      return DAG.getVectorShuffle(VT, SDLoc(N), Op,
                                   DAG.getUNDEF(VT), &SVN0->getMask()[0]);
     }
   }
@@ -2460,7 +2524,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     return DAG.FoldConstantArithmetic(ISD::AND, VT, N0C, N1C);
   // canonicalize constant to RHS
   if (N0C && !N1C)
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N1, N0);
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
   // fold (and x, -1) -> x
   if (N1C && N1C->isAllOnesValue())
     return N0;
@@ -2469,7 +2533,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
                                    APInt::getAllOnesValue(BitWidth)))
     return DAG.getConstant(0, VT);
   // reassociate and
-  SDValue RAND = ReassociateOps(ISD::AND, N->getDebugLoc(), N0, N1);
+  SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1);
   if (RAND.getNode() != 0)
     return RAND;
   // fold (and (or x, C), D) -> D if (C & D) == D
@@ -2483,7 +2547,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     APInt Mask = ~N1C->getAPIntValue();
     Mask = Mask.trunc(N0Op0.getValueSizeInBits());
     if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
-      SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(),
+      SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                                  N0.getValueType(), N0Op0);
 
       // Replace uses of the AND with uses of the Zero extend node.
@@ -2496,7 +2560,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
-  // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) -> 
+  // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
   // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
   // already be zero by virtue of the width of the base type of the load.
   //
@@ -2573,7 +2637,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       SDValue NewLoad(Load, 0);
       if (Load->getExtensionType() == ISD::EXTLOAD) {
         NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
-                              Load->getValueType(0), Load->getDebugLoc(),
+                              Load->getValueType(0), SDLoc(Load),
                               Load->getChain(), Load->getBasePtr(),
                               Load->getOffset(), Load->getMemoryVT(),
                               Load->getMemOperand());
@@ -2604,26 +2668,39 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         LL.getValueType().isInteger()) {
       // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
       if (cast<ConstantSDNode>(LR)->isNullValue() && Op1 == ISD::SETEQ) {
-        SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+        SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
                                      LR.getValueType(), LL, RL);
         AddToWorkList(ORNode.getNode());
-        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+        return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
       }
       // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
       if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETEQ) {
-        SDValue ANDNode = DAG.getNode(ISD::AND, N0.getDebugLoc(),
+        SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0),
                                       LR.getValueType(), LL, RL);
         AddToWorkList(ANDNode.getNode());
-        return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+        return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1);
       }
       // fold (and (setgt X,  -1), (setgt Y,  -1)) -> (setgt (or X, Y), -1)
       if (cast<ConstantSDNode>(LR)->isAllOnesValue() && Op1 == ISD::SETGT) {
-        SDValue ORNode = DAG.getNode(ISD::OR, N0.getDebugLoc(),
+        SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
                                      LR.getValueType(), LL, RL);
         AddToWorkList(ORNode.getNode());
-        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+        return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
       }
     }
+    // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2)
+    if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) &&
+        Op0 == Op1 && LL.getValueType().isInteger() &&
+      Op0 == ISD::SETNE && ((cast<ConstantSDNode>(LR)->isNullValue() &&
+                                 cast<ConstantSDNode>(RR)->isAllOnesValue()) ||
+                                (cast<ConstantSDNode>(LR)->isAllOnesValue() &&
+                                 cast<ConstantSDNode>(RR)->isNullValue()))) {
+      SDValue ADDNode = DAG.getNode(ISD::ADD, SDLoc(N0), LL.getValueType(),
+                                    LL, DAG.getConstant(1, LL.getValueType()));
+      AddToWorkList(ADDNode.getNode());
+      return DAG.getSetCC(SDLoc(N), VT, ADDNode,
+                          DAG.getConstant(2, LL.getValueType()), ISD::SETUGE);
+    }
     // canonicalize equivalent to ll == rl
     if (LL == RR && LR == RL) {
       Op1 = ISD::getSetCCSwappedOperands(Op1);
@@ -2636,8 +2713,8 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
           (!LegalOperations ||
            (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
             TLI.isOperationLegal(ISD::SETCC,
-                            TLI.getSetCCResultType(N0.getSimpleValueType())))))
-        return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+                            getSetCCResultType(N0.getSimpleValueType())))))
+        return DAG.getSetCC(SDLoc(N), N0.getValueType(),
                             LL, LR, Result);
     }
   }
@@ -2665,11 +2742,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
                            BitWidth - MemVT.getScalarType().getSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
          TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
-      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT,
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
-                                       LN0->getPointerInfo(), MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       MemVT, LN0->getMemOperand());
       AddToWorkList(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2687,12 +2762,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
                            BitWidth - MemVT.getScalarType().getSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
          TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
-      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N0.getDebugLoc(), VT,
-                                       LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
+                                       LN0->getChain(), LN0->getBasePtr(),
+                                       MemVT, LN0->getMemOperand());
       AddToWorkList(N);
       CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2710,7 +2782,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       ? cast<LoadSDNode>(N0.getOperand(0))
       : cast<LoadSDNode>(N0);
     if (LN0->getExtensionType() != ISD::SEXTLOAD &&
-        LN0->isUnindexed() && N0.hasOneUse() && LN0->hasOneUse()) {
+        LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) {
       uint32_t ActiveBits = N1C->getAPIntValue().getActiveBits();
       if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){
         EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
@@ -2721,11 +2793,9 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
           EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
 
           SDValue NewLoad =
-            DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy,
-                           LN0->getChain(), LN0->getBasePtr(),
-                           LN0->getPointerInfo(),
-                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                           LN0->getAlignment());
+            DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
+                           LN0->getChain(), LN0->getBasePtr(), ExtVT,
+                           LN0->getMemOperand());
           AddToWorkList(N);
           CombineTo(LN0, NewLoad, NewLoad.getValue(1));
           return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2748,7 +2818,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
             unsigned LVTStoreBytes = LoadedVT.getStoreSize();
             unsigned EVTStoreBytes = ExtVT.getStoreSize();
             unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
-            NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(), PtrType,
+            NewPtr = DAG.getNode(ISD::ADD, SDLoc(LN0), PtrType,
                                  NewPtr, DAG.getConstant(PtrOff, PtrType));
             Alignment = MinAlign(Alignment, PtrOff);
           }
@@ -2757,11 +2827,11 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
 
           EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
           SDValue Load =
-            DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), LoadResultTy,
+            DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
                            LN0->getChain(), NewPtr,
                            LN0->getPointerInfo(),
                            ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                           Alignment);
+                           Alignment, LN0->getTBAAInfo());
           AddToWorkList(N);
           CombineTo(LN0, Load, Load.getValue(1));
           return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -2786,7 +2856,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
             ADDC |= Mask;
             if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
               SDValue NewAdd =
-                DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT,
+                DAG.getNode(ISD::ADD, SDLoc(N0), VT,
                             N0.getOperand(0), DAG.getConstant(ADDC, VT));
               CombineTo(N0.getNode(), NewAdd);
               return SDValue(N, 0); // Return N so it doesn't get rechecked!
@@ -2797,6 +2867,14 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
     }
   }
 
+  // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
+  if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
+    SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+                                       N0.getOperand(1), false);
+    if (BSwap.getNode())
+      return BSwap;
+  }
+
   return SDValue();
 }
 
@@ -2881,17 +2959,27 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
   if (N00 != N10)
     return SDValue();
 
-  // Make sure everything beyond the low halfword is zero since the SRL 16
-  // will clear the top bits.
+  // Make sure everything beyond the low halfword gets set to zero since the SRL
+  // 16 will clear the top bits.
   unsigned OpSizeInBits = VT.getSizeInBits();
-  if (DemandHighBits && OpSizeInBits > 16 &&
-      (!LookPassAnd0 || !LookPassAnd1) &&
-      !DAG.MaskedValueIsZero(N10, APInt::getHighBitsSet(OpSizeInBits, 16)))
-    return SDValue();
+  if (DemandHighBits && OpSizeInBits > 16) {
+    // If the left-shift isn't masked out then the only way this is a bswap is
+    // if all bits beyond the low 8 are 0. In that case the entire pattern
+    // reduces to a left shift anyway: leave it for other parts of the combiner.
+    if (!LookPassAnd0)
+      return SDValue();
 
-  SDValue Res = DAG.getNode(ISD::BSWAP, N->getDebugLoc(), VT, N00);
+    // However, if the right shift isn't masked out then it might be because
+    // it's not needed. See if we can spot that too.
+    if (!LookPassAnd1 &&
+        !DAG.MaskedValueIsZero(
+            N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
+      return SDValue();
+  }
+
+  SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
   if (OpSizeInBits > 16)
-    Res = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, Res,
+    Res = DAG.getNode(ISD::SRL, SDLoc(N), VT, Res,
                       DAG.getConstant(OpSizeInBits-16, getShiftAmountTy(VT)));
   return Res;
 }
@@ -2899,7 +2987,7 @@ SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
 /// isBSwapHWordElement - Return true if the specified node is an element
 /// that makes up a 32-bit packed halfword byteswap. i.e.
 /// ((x&0xff)<<8)|((x&0xff00)>>8)|((x&0x00ff0000)<<8)|((x&0xff000000)>>8)
-static bool isBSwapHWordElement(SDValue N, SmallVector<SDNode*,4> &Parts) {
+static bool isBSwapHWordElement(SDValue N, SmallVectorImpl<SDNode *> &Parts) {
   if (!N.getNode()->hasOneUse())
     return false;
 
@@ -3024,19 +3112,19 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
   if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
     return SDValue();
 
-  SDValue BSwap = DAG.getNode(ISD::BSWAP, N->getDebugLoc(), VT,
+  SDValue BSwap = DAG.getNode(ISD::BSWAP, SDLoc(N), VT,
                               SDValue(Parts[0],0));
 
-  // Result of the bswap should be rotated by 16. If it's not legal, than
+  // Result of the bswap should be rotated by 16. If it's not legal, then
   // do  (x << 16) | (x >> 16).
   SDValue ShAmt = DAG.getConstant(16, getShiftAmountTy(VT));
   if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
-    return DAG.getNode(ISD::ROTL, N->getDebugLoc(), VT, BSwap, ShAmt);
+    return DAG.getNode(ISD::ROTL, SDLoc(N), VT, BSwap, ShAmt);
   if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
-    return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, BSwap, ShAmt);
-  return DAG.getNode(ISD::OR, N->getDebugLoc(), VT,
-                     DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, BSwap, ShAmt),
-                     DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, BSwap, ShAmt));
+    return DAG.getNode(ISD::ROTR, SDLoc(N), VT, BSwap, ShAmt);
+  return DAG.getNode(ISD::OR, SDLoc(N), VT,
+                     DAG.getNode(ISD::SHL, SDLoc(N), VT, BSwap, ShAmt),
+                     DAG.getNode(ISD::SRL, SDLoc(N), VT, BSwap, ShAmt));
 }
 
 SDValue DAGCombiner::visitOR(SDNode *N) {
@@ -3076,7 +3164,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
   // canonicalize constant to RHS
   if (N0C && !N1C)
-    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N1, N0);
+    return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
   // fold (or x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
@@ -3096,7 +3184,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     return BSwap;
 
   // reassociate or
-  SDValue ROR = ReassociateOps(ISD::OR, N->getDebugLoc(), N0, N1);
+  SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1);
   if (ROR.getNode() != 0)
     return ROR;
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
@@ -3105,8 +3193,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
              isa<ConstantSDNode>(N0.getOperand(1))) {
     ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
     if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0)
-      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
-                         DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+      return DAG.getNode(ISD::AND, SDLoc(N), VT,
+                         DAG.getNode(ISD::OR, SDLoc(N0), VT,
                                      N0.getOperand(0), N1),
                          DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1));
   }
@@ -3121,19 +3209,19 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
       // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
       if (cast<ConstantSDNode>(LR)->isNullValue() &&
           (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
-        SDValue ORNode = DAG.getNode(ISD::OR, LR.getDebugLoc(),
+        SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR),
                                      LR.getValueType(), LL, RL);
         AddToWorkList(ORNode.getNode());
-        return DAG.getSetCC(N->getDebugLoc(), VT, ORNode, LR, Op1);
+        return DAG.getSetCC(SDLoc(N), VT, ORNode, LR, Op1);
       }
       // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
       // fold (or (setgt X, -1), (setgt Y  -1)) -> (setgt (and X, Y), -1)
       if (cast<ConstantSDNode>(LR)->isAllOnesValue() &&
           (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
-        SDValue ANDNode = DAG.getNode(ISD::AND, LR.getDebugLoc(),
+        SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR),
                                       LR.getValueType(), LL, RL);
         AddToWorkList(ANDNode.getNode());
-        return DAG.getSetCC(N->getDebugLoc(), VT, ANDNode, LR, Op1);
+        return DAG.getSetCC(SDLoc(N), VT, ANDNode, LR, Op1);
       }
     }
     // canonicalize equivalent to ll == rl
@@ -3148,8 +3236,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
           (!LegalOperations ||
            (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
             TLI.isOperationLegal(ISD::SETCC,
-              TLI.getSetCCResultType(N0.getValueType())))))
-        return DAG.getSetCC(N->getDebugLoc(), N0.getValueType(),
+              getSetCCResultType(N0.getValueType())))))
+        return DAG.getSetCC(SDLoc(N), N0.getValueType(),
                             LL, LR, Result);
     }
   }
@@ -3176,15 +3264,15 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
 
     if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
         DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
-      SDValue X = DAG.getNode(ISD::OR, N0.getDebugLoc(), VT,
+      SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
                               N0.getOperand(0), N1.getOperand(0));
-      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, X,
+      return DAG.getNode(ISD::AND, SDLoc(N), VT, X,
                          DAG.getConstant(LHSMask | RHSMask, VT));
     }
   }
 
   // See if this is some rotate idiom.
-  if (SDNode *Rot = MatchRotate(N0, N1, N->getDebugLoc()))
+  if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
     return SDValue(Rot, 0);
 
   // Simplify the operands using demanded-bits information.
@@ -3217,7 +3305,7 @@ static bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
 // MatchRotate - Handle an 'or' of two operands.  If this is one of the many
 // idioms for rotate, and if the target supports rotation instructions, generate
 // a rot[lr].
-SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
+SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, SDLoc DL) {
   // Must be a legal type.  Expanded 'n promoted things won't work with rotates.
   EVT VT = LHS.getValueType();
   if (!TLI.isTypeLegal(VT)) return 0;
@@ -3292,33 +3380,9 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
   if (LHSMask.getNode() || RHSMask.getNode())
     return 0;
 
-  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotl x, y)
-  // fold (or (shl x, y), (srl x, (sub 32, y))) -> (rotr x, (sub 32, y))
-  if (RHSShiftAmt.getOpcode() == ISD::SUB &&
-      LHSShiftAmt == RHSShiftAmt.getOperand(1)) {
-    if (ConstantSDNode *SUBC =
-          dyn_cast<ConstantSDNode>(RHSShiftAmt.getOperand(0))) {
-      if (SUBC->getAPIntValue() == OpSizeInBits) {
-        return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
-                           HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
-      }
-    }
-  }
-
-  // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotr x, y)
-  // fold (or (shl x, (sub 32, y)), (srl x, r)) -> (rotl x, (sub 32, y))
-  if (LHSShiftAmt.getOpcode() == ISD::SUB &&
-      RHSShiftAmt == LHSShiftAmt.getOperand(1)) {
-    if (ConstantSDNode *SUBC =
-          dyn_cast<ConstantSDNode>(LHSShiftAmt.getOperand(0))) {
-      if (SUBC->getAPIntValue() == OpSizeInBits) {
-        return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
-                           HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
-      }
-    }
-  }
-
-  // Look for sign/zext/any-extended or truncate cases:
+  // If the shift amount is sign/zext/any-extended just peel it off.
+  SDValue LExtOp0 = LHSShiftAmt;
+  SDValue RExtOp0 = RHSShiftAmt;
   if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
        LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
        LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
@@ -3327,37 +3391,31 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
        RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
        RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
        RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
-    SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
-    SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
-    if (RExtOp0.getOpcode() == ISD::SUB &&
-        RExtOp0.getOperand(1) == LExtOp0) {
-      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
-      //   (rotl x, y)
-      // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
-      //   (rotr x, (sub 32, y))
-      if (ConstantSDNode *SUBC =
-            dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0))) {
-        if (SUBC->getAPIntValue() == OpSizeInBits) {
-          return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
-                             LHSShiftArg,
-                             HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
-        }
-      }
-    } else if (LExtOp0.getOpcode() == ISD::SUB &&
-               RExtOp0 == LExtOp0.getOperand(1)) {
-      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
-      //   (rotr x, y)
-      // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
-      //   (rotl x, (sub 32, y))
-      if (ConstantSDNode *SUBC =
-            dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0))) {
-        if (SUBC->getAPIntValue() == OpSizeInBits) {
-          return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT,
-                             LHSShiftArg,
-                             HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
-        }
-      }
-    }
+    LExtOp0 = LHSShiftAmt.getOperand(0);
+    RExtOp0 = RHSShiftAmt.getOperand(0);
+  }
+
+  if (RExtOp0.getOpcode() == ISD::SUB && RExtOp0.getOperand(1) == LExtOp0) {
+    // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+    //   (rotl x, y)
+    // fold (or (shl x, (*ext y)), (srl x, (*ext (sub 32, y)))) ->
+    //   (rotr x, (sub 32, y))
+    if (ConstantSDNode *SUBC =
+            dyn_cast<ConstantSDNode>(RExtOp0.getOperand(0)))
+      if (SUBC->getAPIntValue() == OpSizeInBits)
+        return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+                           HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
+  } else if (LExtOp0.getOpcode() == ISD::SUB &&
+             RExtOp0 == LExtOp0.getOperand(1)) {
+    // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+    //   (rotr x, y)
+    // fold (or (shl x, (*ext (sub 32, y))), (srl x, (*ext y))) ->
+    //   (rotl x, (sub 32, y))
+    if (ConstantSDNode *SUBC =
+            dyn_cast<ConstantSDNode>(LExtOp0.getOperand(0)))
+      if (SUBC->getAPIntValue() == OpSizeInBits)
+        return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
+                           HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
   }
 
   return 0;
@@ -3396,12 +3454,12 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     return DAG.FoldConstantArithmetic(ISD::XOR, VT, N0C, N1C);
   // canonicalize constant to RHS
   if (N0C && !N1C)
-    return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N1, N0);
+    return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
   // fold (xor x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
   // reassociate xor
-  SDValue RXOR = ReassociateOps(ISD::XOR, N->getDebugLoc(), N0, N1);
+  SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1);
   if (RXOR.getNode() != 0)
     return RXOR;
 
@@ -3417,9 +3475,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
       default:
         llvm_unreachable("Unhandled SetCC Equivalent!");
       case ISD::SETCC:
-        return DAG.getSetCC(N->getDebugLoc(), VT, LHS, RHS, NotCC);
+        return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC);
       case ISD::SELECT_CC:
-        return DAG.getSelectCC(N->getDebugLoc(), LHS, RHS, N0.getOperand(2),
+        return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2),
                                N0.getOperand(3), NotCC);
       }
     }
@@ -3430,10 +3488,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
       N0.getNode()->hasOneUse() &&
       isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
     SDValue V = N0.getOperand(0);
-    V = DAG.getNode(ISD::XOR, N0.getDebugLoc(), V.getValueType(), V,
+    V = DAG.getNode(ISD::XOR, SDLoc(N0), V.getValueType(), V,
                     DAG.getConstant(1, V.getValueType()));
     AddToWorkList(V.getNode());
-    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, V);
+    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V);
   }
 
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
@@ -3442,10 +3500,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
     if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
       unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
-      LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
-      RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+      LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
+      RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
       AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
-      return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
     }
   }
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
@@ -3454,28 +3512,36 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
     SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
     if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
       unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
-      LHS = DAG.getNode(ISD::XOR, LHS.getDebugLoc(), VT, LHS, N1); // LHS = ~LHS
-      RHS = DAG.getNode(ISD::XOR, RHS.getDebugLoc(), VT, RHS, N1); // RHS = ~RHS
+      LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
+      RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
       AddToWorkList(LHS.getNode()); AddToWorkList(RHS.getNode());
-      return DAG.getNode(NewOpcode, N->getDebugLoc(), VT, LHS, RHS);
+      return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
     }
   }
+  // fold (xor (and x, y), y) -> (and (not x), y)
+  if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+      N0->getOperand(1) == N1) {
+    SDValue X = N0->getOperand(0);
+    SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
+    AddToWorkList(NotX.getNode());
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
+  }
   // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2))
   if (N1C && N0.getOpcode() == ISD::XOR) {
     ConstantSDNode *N00C = dyn_cast<ConstantSDNode>(N0.getOperand(0));
     ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
     if (N00C)
-      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(1),
+      return DAG.getNode(ISD::XOR, SDLoc(N), VT, N0.getOperand(1),
                          DAG.getConstant(N1C->getAPIntValue() ^
                                          N00C->getAPIntValue(), VT));
     if (N01C)
-      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT, N0.getOperand(0),
+      return DAG.getNode(ISD::XOR, SDLoc(N), VT, N0.getOperand(0),
                          DAG.getConstant(N1C->getAPIntValue() ^
                                          N01C->getAPIntValue(), VT));
   }
   // fold (xor x, x) -> 0
   if (N0 == N1)
-    return tryFoldToZero(N->getDebugLoc(), TLI, VT, DAG, LegalOperations);
+    return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
 
   // Simplify: xor (op x...), (op y...)  -> (op (xor x, y))
   if (N0.getOpcode() == N1.getOpcode()) {
@@ -3548,17 +3614,17 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N, unsigned Amt) {
   }
 
   // Fold the constants, shifting the binop RHS by the shift amount.
-  SDValue NewRHS = DAG.getNode(N->getOpcode(), LHS->getOperand(1).getDebugLoc(),
+  SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)),
                                N->getValueType(0),
                                LHS->getOperand(1), N->getOperand(1));
 
   // Create the new shift.
   SDValue NewShift = DAG.getNode(N->getOpcode(),
-                                 LHS->getOperand(0).getDebugLoc(),
+                                 SDLoc(LHS->getOperand(0)),
                                  VT, LHS->getOperand(0), N->getOperand(1));
 
   // Create the new binop.
-  return DAG.getNode(LHS->getOpcode(), N->getDebugLoc(), VT, NewShift, NewRHS);
+  return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS);
 }
 
 SDValue DAGCombiner::visitSHL(SDNode *N) {
@@ -3569,6 +3635,12 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
   // fold (shl c1, c2) -> c1<<c2
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
@@ -3598,10 +3670,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       SDValue N100 = N1.getOperand(0).getOperand(0);
       APInt TruncC = N101C->getAPIntValue();
       TruncC = TruncC.trunc(TruncVT.getSizeInBits());
-      return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0,
-                         DAG.getNode(ISD::AND, N->getDebugLoc(), TruncVT,
+      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
+                         DAG.getNode(ISD::AND, SDLoc(N), TruncVT,
                                      DAG.getNode(ISD::TRUNCATE,
-                                                 N->getDebugLoc(),
+                                                 SDLoc(N),
                                                  TruncVT, N100),
                                      DAG.getConstant(TruncC, TruncVT)));
     }
@@ -3617,7 +3689,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     uint64_t c2 = N1C->getZExtValue();
     if (c1 + c2 >= OpSizeInBits)
       return DAG.getConstant(0, VT);
-    return DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+    return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0.getOperand(0),
                        DAG.getConstant(c1 + c2, N1.getValueType()));
   }
 
@@ -3639,13 +3711,34 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
     if (c2 >= OpSizeInBits - InnerShiftSize) {
       if (c1 + c2 >= OpSizeInBits)
         return DAG.getConstant(0, VT);
-      return DAG.getNode(ISD::SHL, N0->getDebugLoc(), VT,
-                         DAG.getNode(N0.getOpcode(), N0->getDebugLoc(), VT,
+      return DAG.getNode(ISD::SHL, SDLoc(N0), VT,
+                         DAG.getNode(N0.getOpcode(), SDLoc(N0), VT,
                                      N0.getOperand(0)->getOperand(0)),
                          DAG.getConstant(c1 + c2, N1.getValueType()));
     }
   }
 
+  // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
+  // Only fold this if the inner zext has no other uses to avoid increasing
+  // the total number of instructions.
+  if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
+      N0.getOperand(0).getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) {
+    uint64_t c1 =
+      cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue();
+    if (c1 < VT.getSizeInBits()) {
+      uint64_t c2 = N1C->getZExtValue();
+      if (c1 == c2) {
+        SDValue NewOp0 = N0.getOperand(0);
+        EVT CountVT = NewOp0.getOperand(1).getValueType();
+        SDValue NewSHL = DAG.getNode(ISD::SHL, SDLoc(N), NewOp0.getValueType(),
+                                     NewOp0, DAG.getConstant(c2, CountVT));
+        AddToWorkList(NewSHL.getNode());
+        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
+      }
+    }
+  }
+
   // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
   //                               (and (srl x, (sub c1, c2), MASK)
   // Only fold this if the inner shift has no other uses -- if it does, folding
@@ -3660,14 +3753,14 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
       SDValue Shift;
       if (c2 > c1) {
         Mask = Mask.shl(c2-c1);
-        Shift = DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, N0.getOperand(0),
+        Shift = DAG.getNode(ISD::SHL, SDLoc(N), VT, N0.getOperand(0),
                             DAG.getConstant(c2-c1, N1.getValueType()));
       } else {
         Mask = Mask.lshr(c1-c2);
-        Shift = DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+        Shift = DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0),
                             DAG.getConstant(c1-c2, N1.getValueType()));
       }
-      return DAG.getNode(ISD::AND, N0.getDebugLoc(), VT, Shift,
+      return DAG.getNode(ISD::AND, SDLoc(N0), VT, Shift,
                          DAG.getConstant(Mask, VT));
     }
   }
@@ -3678,7 +3771,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
                                             VT.getSizeInBits() -
                                               N1C->getZExtValue()),
                       VT);
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0.getOperand(0),
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N0.getOperand(0),
                        HiBitsMask);
   }
 
@@ -3699,6 +3792,12 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
   // fold (sra c1, c2) -> (sra c1, c2)
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
@@ -3724,7 +3823,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
                                ExtVT, VT.getVectorNumElements());
     if ((!LegalOperations ||
          TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
-      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                          N0.getOperand(0), DAG.getValueType(ExtVT));
   }
 
@@ -3733,7 +3832,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
     if (ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
       unsigned Sum = N1C->getZExtValue() + C1->getZExtValue();
       if (Sum >= OpSizeInBits) Sum = OpSizeInBits-1;
-      return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0.getOperand(0),
+      return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
                          DAG.getConstant(Sum, N1C->getValueType(0)));
     }
   }
@@ -3765,11 +3864,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
 
           SDValue Amt = DAG.getConstant(ShiftAmt,
               getShiftAmountTy(N0.getOperand(0).getValueType()));
-          SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT,
+          SDValue Shift = DAG.getNode(ISD::SRL, SDLoc(N0), VT,
                                       N0.getOperand(0), Amt);
-          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), TruncVT,
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), TruncVT,
                                       Shift);
-          return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(),
+          return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N),
                              N->getValueType(0), Trunc);
       }
     }
@@ -3785,11 +3884,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
       SDValue N100 = N1.getOperand(0).getOperand(0);
       APInt TruncC = N101C->getAPIntValue();
       TruncC = TruncC.trunc(TruncVT.getScalarType().getSizeInBits());
-      return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT, N0,
-                         DAG.getNode(ISD::AND, N->getDebugLoc(),
+      return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0,
+                         DAG.getNode(ISD::AND, SDLoc(N),
                                      TruncVT,
                                      DAG.getNode(ISD::TRUNCATE,
-                                                 N->getDebugLoc(),
+                                                 SDLoc(N),
                                                  TruncVT, N100),
                                      DAG.getConstant(TruncC, TruncVT)));
     }
@@ -3812,9 +3911,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
       SDValue Amt =
         DAG.getConstant(LargeShiftAmt->getZExtValue() + N1C->getZExtValue(),
               getShiftAmountTy(N0.getOperand(0).getOperand(0).getValueType()));
-      SDValue SRA = DAG.getNode(ISD::SRA, N->getDebugLoc(), LargeVT,
+      SDValue SRA = DAG.getNode(ISD::SRA, SDLoc(N), LargeVT,
                                 N0.getOperand(0).getOperand(0), Amt);
-      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, SRA);
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, SRA);
     }
   }
 
@@ -3825,7 +3924,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
 
   // If the sign bit is known to be zero, switch this to a SRL.
   if (DAG.SignBitIsZero(N0))
-    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
 
   if (N1C) {
     SDValue NewSRA = visitShiftByConstant(N, N1C->getZExtValue());
@@ -3844,6 +3943,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
+  // fold vector ops
+  if (VT.isVector()) {
+    SDValue FoldedVOp = SimplifyVBinOp(N);
+    if (FoldedVOp.getNode()) return FoldedVOp;
+  }
+
   // fold (srl c1, c2) -> c1 >>u c2
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
@@ -3868,7 +3973,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     uint64_t c2 = N1C->getZExtValue();
     if (c1 + c2 >= OpSizeInBits)
       return DAG.getConstant(0, VT);
-    return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0),
+    return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0),
                        DAG.getConstant(c1 + c2, N1.getValueType()));
   }
 
@@ -3886,8 +3991,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
     if (c1 + OpSizeInBits == InnerShiftSize) {
       if (c1 + c2 >= InnerShiftSize)
         return DAG.getConstant(0, VT);
-      return DAG.getNode(ISD::TRUNCATE, N0->getDebugLoc(), VT,
-                         DAG.getNode(ISD::SRL, N0->getDebugLoc(), InnerShiftVT,
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT,
+                         DAG.getNode(ISD::SRL, SDLoc(N0), InnerShiftVT,
                                      N0.getOperand(0)->getOperand(0),
                                      DAG.getConstant(c1 + c2, ShiftCountVT)));
     }
@@ -3897,12 +4002,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   if (N1C && N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
       N0.getValueSizeInBits() <= 64) {
     uint64_t ShAmt = N1C->getZExtValue()+64-N0.getValueSizeInBits();
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0.getOperand(0),
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N0.getOperand(0),
                        DAG.getConstant(~0ULL >> ShAmt, VT));
   }
 
-
-  // fold (srl (anyextend x), c) -> (anyextend (srl x, c))
+  // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
   if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
     // Shifting in all undef bits?
     EVT SmallVT = N0.getOperand(0).getValueType();
@@ -3911,11 +4015,14 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
 
     if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
       uint64_t ShiftAmt = N1C->getZExtValue();
-      SDValue SmallShift = DAG.getNode(ISD::SRL, N0.getDebugLoc(), SmallVT,
+      SDValue SmallShift = DAG.getNode(ISD::SRL, SDLoc(N0), SmallVT,
                                        N0.getOperand(0),
                           DAG.getConstant(ShiftAmt, getShiftAmountTy(SmallVT)));
       AddToWorkList(SmallShift.getNode());
-      return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, SmallShift);
+      APInt Mask = APInt::getAllOnesValue(VT.getSizeInBits()).lshr(ShiftAmt);
+      return DAG.getNode(ISD::AND, SDLoc(N), VT,
+                         DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, SmallShift),
+                         DAG.getConstant(Mask, VT));
     }
   }
 
@@ -3923,7 +4030,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   // bit, which is unmodified by sra.
   if (N1C && N1C->getZExtValue() + 1 == VT.getSizeInBits()) {
     if (N0.getOpcode() == ISD::SRA)
-      return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0.getOperand(0), N1);
+      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
   }
 
   // fold (srl (ctlz x), "5") -> x  iff x has one bit set (the low bit).
@@ -3951,12 +4058,12 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
       SDValue Op = N0.getOperand(0);
 
       if (ShAmt) {
-        Op = DAG.getNode(ISD::SRL, N0.getDebugLoc(), VT, Op,
+        Op = DAG.getNode(ISD::SRL, SDLoc(N0), VT, Op,
                   DAG.getConstant(ShAmt, getShiftAmountTy(Op.getValueType())));
         AddToWorkList(Op.getNode());
       }
 
-      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::XOR, SDLoc(N), VT,
                          Op, DAG.getConstant(1, VT));
     }
   }
@@ -3971,11 +4078,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
       SDValue N100 = N1.getOperand(0).getOperand(0);
       APInt TruncC = N101C->getAPIntValue();
       TruncC = TruncC.trunc(TruncVT.getSizeInBits());
-      return DAG.getNode(ISD::SRL, N->getDebugLoc(), VT, N0,
-                         DAG.getNode(ISD::AND, N->getDebugLoc(),
+      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0,
+                         DAG.getNode(ISD::AND, SDLoc(N),
                                      TruncVT,
                                      DAG.getNode(ISD::TRUNCATE,
-                                                 N->getDebugLoc(),
+                                                 SDLoc(N),
                                                  TruncVT, N100),
                                      DAG.getConstant(TruncC, TruncVT)));
     }
@@ -4035,7 +4142,7 @@ SDValue DAGCombiner::visitCTLZ(SDNode *N) {
 
   // fold (ctlz c1) -> c2
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::CTLZ, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
   return SDValue();
 }
 
@@ -4045,7 +4152,7 @@ SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
 
   // fold (ctlz_zero_undef c1) -> c2
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
   return SDValue();
 }
 
@@ -4055,7 +4162,7 @@ SDValue DAGCombiner::visitCTTZ(SDNode *N) {
 
   // fold (cttz c1) -> c2
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::CTTZ, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
   return SDValue();
 }
 
@@ -4065,7 +4172,7 @@ SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
 
   // fold (cttz_zero_undef c1) -> c2
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
   return SDValue();
 }
 
@@ -4075,7 +4182,7 @@ SDValue DAGCombiner::visitCTPOP(SDNode *N) {
 
   // fold (ctpop c1) -> c2
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0);
   return SDValue();
 }
 
@@ -4100,7 +4207,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
     return N2;
   // fold (select C, 1, X) -> (or C, X)
   if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1)
-    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+    return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
   // fold (select C, 0, 1) -> (xor C, 1)
   if (VT.isInteger() &&
       (VT0 == MVT::i1 ||
@@ -4110,38 +4217,38 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
       N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
     SDValue XORNode;
     if (VT == VT0)
-      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT0,
+      return DAG.getNode(ISD::XOR, SDLoc(N), VT0,
                          N0, DAG.getConstant(1, VT0));
-    XORNode = DAG.getNode(ISD::XOR, N0.getDebugLoc(), VT0,
+    XORNode = DAG.getNode(ISD::XOR, SDLoc(N0), VT0,
                           N0, DAG.getConstant(1, VT0));
     AddToWorkList(XORNode.getNode());
     if (VT.bitsGT(VT0))
-      return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, XORNode);
-    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, XORNode);
+      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, XORNode);
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, XORNode);
   }
   // fold (select C, 0, X) -> (and (not C), X)
   if (VT == VT0 && VT == MVT::i1 && N1C && N1C->isNullValue()) {
-    SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
     AddToWorkList(NOTNode.getNode());
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, NOTNode, N2);
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, NOTNode, N2);
   }
   // fold (select C, X, 1) -> (or (not C), X)
   if (VT == VT0 && VT == MVT::i1 && N2C && N2C->getAPIntValue() == 1) {
-    SDValue NOTNode = DAG.getNOT(N0.getDebugLoc(), N0, VT);
+    SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
     AddToWorkList(NOTNode.getNode());
-    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, NOTNode, N1);
+    return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1);
   }
   // fold (select C, X, 0) -> (and C, X)
   if (VT == MVT::i1 && N2C && N2C->isNullValue())
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1);
   // fold (select X, X, Y) -> (or X, Y)
   // fold (select X, 1, Y) -> (or X, Y)
   if (VT == MVT::i1 && (N0 == N1 || (N1C && N1C->getAPIntValue() == 1)))
-    return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, N0, N2);
+    return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
   // fold (select X, Y, X) -> (and X, Y)
   // fold (select X, Y, 0) -> (and X, Y)
   if (VT == MVT::i1 && (N0 == N2 || (N2C && N2C->getAPIntValue() == 0)))
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1);
 
   // If we can fold this based on the true/false value, do so.
   if (SimplifySelectOps(N, N1, N2))
@@ -4155,20 +4262,37 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
     // about, since there is no way to mark an opcode illegal at all value types
     if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other) &&
         TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))
-      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT,
                          N0.getOperand(0), N0.getOperand(1),
                          N1, N2, N0.getOperand(2));
-    return SimplifySelect(N->getDebugLoc(), N0, N1, N2);
+    return SimplifySelect(SDLoc(N), N0, N1, N2);
   }
 
   return SDValue();
 }
 
+static
+std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
+  SDLoc DL(N);
+  EVT LoVT, HiVT;
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+  // Split the inputs.
+  SDValue Lo, Hi, LL, LH, RL, RH;
+  llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+  llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+
+  return std::make_pair(Lo, Hi);
+}
+
 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Canonicalize integer abs.
   // vselect (setg[te] X,  0),  X, -X ->
@@ -4201,6 +4325,34 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
     }
   }
 
+  // If the VSELECT result requires splitting and the mask is provided by a
+  // SETCC, then split both nodes and its operands before legalization. This
+  // prevents the type legalizer from unrolling SETCC into scalar comparisons
+  // and enables future optimizations (e.g. min/max pattern matching on X86).
+  if (N0.getOpcode() == ISD::SETCC) {
+    EVT VT = N->getValueType(0);
+
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
+    llvm::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
+    llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
+    llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);
+
+    Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
+    Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);
+
+    // Add the new VSELECT nodes to the work list in case they need to be split
+    // again.
+    AddToWorkList(Lo.getNode());
+    AddToWorkList(Hi.getNode());
+
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+  }
+
   return SDValue();
 }
 
@@ -4217,35 +4369,37 @@ SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
     return N2;
 
   // Determine if the condition we're dealing with is constant
-  SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
-                              N0, N1, CC, N->getDebugLoc(), false);
-  if (SCC.getNode()) AddToWorkList(SCC.getNode());
+  SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
+                              N0, N1, CC, SDLoc(N), false);
+  if (SCC.getNode()) {
+    AddToWorkList(SCC.getNode());
 
-  if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
-    if (!SCCC->isNullValue())
-      return N2;    // cond always true -> true val
-    else
-      return N3;    // cond always false -> false val
-  }
+    if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
+      if (!SCCC->isNullValue())
+        return N2;    // cond always true -> true val
+      else
+        return N3;    // cond always false -> false val
+    }
 
-  // Fold to a simpler select_cc
-  if (SCC.getNode() && SCC.getOpcode() == ISD::SETCC)
-    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N2.getValueType(),
-                       SCC.getOperand(0), SCC.getOperand(1), N2, N3,
-                       SCC.getOperand(2));
+    // Fold to a simpler select_cc
+    if (SCC.getOpcode() == ISD::SETCC)
+      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(),
+                         SCC.getOperand(0), SCC.getOperand(1), N2, N3,
+                         SCC.getOperand(2));
+  }
 
   // If we can fold this based on the true/false value, do so.
   if (SimplifySelectOps(N, N2, N3))
     return SDValue(N, 0);  // Don't revisit N.
 
   // fold select_cc into other things, such as min/max/abs
-  return SimplifySelectCC(N->getDebugLoc(), N0, N1, N2, N3, CC);
+  return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
 }
 
 SDValue DAGCombiner::visitSETCC(SDNode *N) {
   return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
                        cast<CondCodeSDNode>(N->getOperand(2))->get(),
-                       N->getDebugLoc());
+                       SDLoc(N));
 }
 
 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
@@ -4254,7 +4408,7 @@ SDValue DAGCombiner::visitSETCC(SDNode *N) {
 // mentioned transformation is profitable.
 static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
                                     unsigned ExtOpc,
-                                    SmallVector<SDNode*, 4> &ExtendNodes,
+                                    SmallVectorImpl<SDNode *> &ExtendNodes,
                                     const TargetLowering &TLI) {
   bool HasCopyToRegUses = false;
   bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
@@ -4312,8 +4466,8 @@ static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
   return true;
 }
 
-void DAGCombiner::ExtendSetCCUses(SmallVector<SDNode*, 4> SetCCs,
-                                  SDValue Trunc, SDValue ExtLoad, DebugLoc DL,
+void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
+                                  SDValue Trunc, SDValue ExtLoad, SDLoc DL,
                                   ISD::NodeType ExtType) {
   // Extend SetCC uses if necessary.
   for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
@@ -4340,12 +4494,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
 
   // fold (sext c1) -> c1
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N0);
 
   // fold (sext (sext x)) -> (sext x)
   // fold (sext (aext x)) -> (sext x)
   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
-    return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
                        N0.getOperand(0));
 
   if (N0.getOpcode() == ISD::TRUNCATE) {
@@ -4379,22 +4533,22 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
       // bits, just sext from i32.
       if (NumSignBits > OpBits-MidBits)
-        return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, Op);
+        return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op);
     } else {
       // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
       // bits, just truncate to i32.
       if (NumSignBits > OpBits-MidBits)
-        return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+        return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
     }
 
     // fold (sext (truncate x)) -> (sextinreg x).
     if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
                                                  N0.getValueType())) {
       if (OpBits < DestBits)
-        Op = DAG.getNode(ISD::ANY_EXTEND, N0.getDebugLoc(), VT, Op);
+        Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
       else if (OpBits > DestBits)
-        Op = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), VT, Op);
-      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, Op,
+        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op,
                          DAG.getValueType(N0.getValueType()));
     }
   }
@@ -4412,17 +4566,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       N0.getValueType(),
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), N0.getValueType(),
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
-      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
                       ISD::SIGN_EXTEND);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -4436,15 +4588,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     EVT MemVT = LN0->getMemoryVT();
     if ((!LegalOperations && !LN0->isVolatile()) ||
         TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) {
-      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), MemVT,
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       CombineTo(N0.getNode(),
-                DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+                DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                             N0.getValueType(), ExtLoad),
                 ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -4467,23 +4617,20 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
         DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND,
                                           SetCCs, TLI);
       if (DoXform) {
-        SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, LN0->getDebugLoc(), VT,
+        SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT,
                                          LN0->getChain(), LN0->getBasePtr(),
-                                         LN0->getPointerInfo(),
                                          LN0->getMemoryVT(),
-                                         LN0->isVolatile(),
-                                         LN0->isNonTemporal(),
-                                         LN0->getAlignment());
+                                         LN0->getMemOperand());
         APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
         Mask = Mask.sext(VT.getSizeInBits());
-        SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+        SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
                                   ExtLoad, DAG.getConstant(Mask, VT));
         SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
-                                    N0.getOperand(0).getDebugLoc(),
+                                    SDLoc(N0.getOperand(0)),
                                     N0.getOperand(0).getValueType(), ExtLoad);
         CombineTo(N, And);
         CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
-        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
                         ISD::SIGN_EXTEND);
         return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
@@ -4494,13 +4641,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
     // Only do this before legalize for now.
     if (VT.isVector() && !LegalOperations &&
-        TLI.getBooleanContents(true) == 
+        TLI.getBooleanContents(true) ==
           TargetLowering::ZeroOrNegativeOneBooleanContent) {
       EVT N0VT = N0.getOperand(0).getValueType();
       // On some architectures (such as SSE/NEON/etc) the SETCC result type is
       // of the same size as the compared operands. Only optimize sext(setcc())
       // if this is the case.
-      EVT SVT = TLI.getSetCCResultType(N0VT);
+      EVT SVT = getSetCCResultType(N0VT);
 
       // We know that the # elements of the results is the same as the
       // # elements of the compare (and the # elements of the compare result
@@ -4508,24 +4655,19 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       // we know that the element size of the sext'd result matches the
       // element size of the compare operands.
       if (VT.getSizeInBits() == SVT.getSizeInBits())
-        return DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+        return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                              N0.getOperand(1),
                              cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
       // If the desired elements are smaller or larger than the source
       // elements we can use a matching integer vector type and then
       // truncate/sign extend
-      EVT MatchingElementType =
-        EVT::getIntegerVT(*DAG.getContext(),
-                          N0VT.getScalarType().getSizeInBits());
-      EVT MatchingVectorType =
-        EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
-                         N0VT.getVectorNumElements());
-
+      EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
       if (SVT == MatchingVectorType) {
-        SDValue VsetCC = DAG.getSetCC(N->getDebugLoc(), MatchingVectorType,
+        SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType,
                                N0.getOperand(0), N0.getOperand(1),
                                cast<CondCodeSDNode>(N0.getOperand(2))->get());
-        return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
+        return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT);
       }
     }
 
@@ -4534,24 +4676,26 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     SDValue NegOne =
       DAG.getConstant(APInt::getAllOnesValue(ElementWidth), VT);
     SDValue SCC =
-      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+      SimplifySelectCC(SDLoc(N), N0.getOperand(0), N0.getOperand(1),
                        NegOne, DAG.getConstant(0, VT),
                        cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
     if (SCC.getNode()) return SCC;
-    if (!VT.isVector() && (!LegalOperations ||
-        TLI.isOperationLegal(ISD::SETCC, TLI.getSetCCResultType(VT))))
-      return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
-                         DAG.getSetCC(N->getDebugLoc(),
-                                      TLI.getSetCCResultType(VT),
-                                      N0.getOperand(0), N0.getOperand(1),
-                                 cast<CondCodeSDNode>(N0.getOperand(2))->get()),
-                         NegOne, DAG.getConstant(0, VT));
+    if (!VT.isVector() &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(VT)))) {
+      return DAG.getSelect(SDLoc(N), VT,
+                           DAG.getSetCC(SDLoc(N),
+                           getSetCCResultType(VT),
+                           N0.getOperand(0), N0.getOperand(1),
+                           cast<CondCodeSDNode>(N0.getOperand(2))->get()),
+                           NegOne, DAG.getConstant(0, VT));
+    }
   }
 
   // fold (sext x) -> (zext x) if the sign bit is known zero.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
       DAG.SignBitIsZero(N0))
-    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
 
   return SDValue();
 }
@@ -4600,11 +4744,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
 
   // fold (zext c1) -> c1
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
   // fold (zext (zext x)) -> (zext x)
   // fold (zext (aext x)) -> (zext x)
   if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
-    return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
                        N0.getOperand(0));
 
   // fold (zext (truncate x)) -> (zext x) or
@@ -4623,9 +4767,9 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                                  VT.getSizeInBits()));
     if (TruncatedBits == (KnownZero & TruncatedBits)) {
       if (VT.bitsGT(Op.getValueType()))
-        return DAG.getNode(ISD::ZERO_EXTEND, N->getDebugLoc(), VT, Op);
+        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op);
       if (VT.bitsLT(Op.getValueType()))
-        return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+        return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
 
       return Op;
     }
@@ -4665,13 +4809,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
 
     SDValue Op = N0.getOperand(0);
     if (Op.getValueType().bitsLT(VT)) {
-      Op = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, Op);
+      Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
       AddToWorkList(Op.getNode());
     } else if (Op.getValueType().bitsGT(VT)) {
-      Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Op);
+      Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
       AddToWorkList(Op.getNode());
     }
-    return DAG.getZeroExtendInReg(Op, N->getDebugLoc(),
+    return DAG.getZeroExtendInReg(Op, SDLoc(N),
                                   N0.getValueType().getScalarType());
   }
 
@@ -4685,13 +4829,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
        !TLI.isZExtFree(N0.getValueType(), VT))) {
     SDValue X = N0.getOperand(0).getOperand(0);
     if (X.getValueType().bitsLT(VT)) {
-      X = DAG.getNode(ISD::ANY_EXTEND, X.getDebugLoc(), VT, X);
+      X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X);
     } else if (X.getValueType().bitsGT(VT)) {
-      X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+      X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
     }
     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
     Mask = Mask.zext(VT.getSizeInBits());
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::AND, SDLoc(N), VT,
                        X, DAG.getConstant(Mask, VT));
   }
 
@@ -4708,18 +4852,16 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT,
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       N0.getValueType(),
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), N0.getValueType(),
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
 
-      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
                       ISD::ZERO_EXTEND);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -4741,23 +4883,20 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
         DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::ZERO_EXTEND,
                                           SetCCs, TLI);
       if (DoXform) {
-        SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, LN0->getDebugLoc(), VT,
+        SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
                                          LN0->getChain(), LN0->getBasePtr(),
-                                         LN0->getPointerInfo(),
                                          LN0->getMemoryVT(),
-                                         LN0->isVolatile(),
-                                         LN0->isNonTemporal(),
-                                         LN0->getAlignment());
+                                         LN0->getMemOperand());
         APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
         Mask = Mask.zext(VT.getSizeInBits());
-        SDValue And = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+        SDValue And = DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
                                   ExtLoad, DAG.getConstant(Mask, VT));
         SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
-                                    N0.getOperand(0).getDebugLoc(),
+                                    SDLoc(N0.getOperand(0)),
                                     N0.getOperand(0).getValueType(), ExtLoad);
         CombineTo(N, And);
         CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
-        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
                         ISD::ZERO_EXTEND);
         return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
@@ -4772,15 +4911,13 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     EVT MemVT = LN0->getMemoryVT();
     if ((!LegalOperations && !LN0->isVolatile()) ||
         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) {
-      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, N->getDebugLoc(), VT,
+      SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       MemVT,
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), MemVT,
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       CombineTo(N0.getNode(),
-                DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(), N0.getValueType(),
+                DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(),
                             ExtLoad),
                 ExtLoad.getValue(1));
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -4801,11 +4938,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
         // for that matter).  Check to see that they are the same size.  If so,
         // we know that the element size of the sext'd result matches the
         // element size of the compare operands.
-        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
-                           DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+        return DAG.getNode(ISD::AND, SDLoc(N), VT,
+                           DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                                          N0.getOperand(1),
                                  cast<CondCodeSDNode>(N0.getOperand(2))->get()),
-                           DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+                           DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT,
                                        &OneOps[0], OneOps.size()));
 
       // If the desired elements are smaller or larger than the source
@@ -4818,18 +4955,18 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
         EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
                          N0VT.getVectorNumElements());
       SDValue VsetCC =
-        DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+        DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
                       N0.getOperand(1),
                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
-      return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
-                         DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT),
-                         DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::AND, SDLoc(N), VT,
+                         DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT),
+                         DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT,
                                      &OneOps[0], OneOps.size()));
     }
 
     // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
     SDValue SCC =
-      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+      SimplifySelectCC(SDLoc(N), N0.getOperand(0), N0.getOperand(1),
                        DAG.getConstant(1, VT), DAG.getConstant(0, VT),
                        cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
     if (SCC.getNode()) return SCC;
@@ -4852,7 +4989,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
         return SDValue();
     }
 
-    DebugLoc DL = N->getDebugLoc();
+    SDLoc DL(N);
 
     // Ensure that the shift amount is wide enough for the shifted value.
     if (VT.getSizeInBits() >= 256)
@@ -4872,14 +5009,14 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
 
   // fold (aext c1) -> c1
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, N0);
   // fold (aext (aext x)) -> (aext x)
   // fold (aext (zext x)) -> (zext x)
   // fold (aext (sext x)) -> (sext x)
   if (N0.getOpcode() == ISD::ANY_EXTEND  ||
       N0.getOpcode() == ISD::ZERO_EXTEND ||
       N0.getOpcode() == ISD::SIGN_EXTEND)
-    return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, N0.getOperand(0));
+    return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
 
   // fold (aext (truncate (load x))) -> (aext (smaller load x))
   // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
@@ -4902,8 +5039,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
     if (TruncOp.getValueType() == VT)
       return TruncOp; // x iff x size == zext size.
     if (TruncOp.getValueType().bitsGT(VT))
-      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, TruncOp);
-    return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, TruncOp);
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp);
+    return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp);
   }
 
   // Fold (aext (and (trunc x), cst)) -> (and x, cst)
@@ -4915,13 +5052,13 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
                           N0.getValueType())) {
     SDValue X = N0.getOperand(0).getOperand(0);
     if (X.getValueType().bitsLT(VT)) {
-      X = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, X);
+      X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, X);
     } else if (X.getValueType().bitsGT(VT)) {
-      X = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, X);
+      X = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
     }
     APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
     Mask = Mask.zext(VT.getSizeInBits());
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::AND, SDLoc(N), VT,
                        X, DAG.getConstant(Mask, VT));
   }
 
@@ -4938,17 +5075,15 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-      SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT,
+      SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
-                                       LN0->getBasePtr(), LN0->getPointerInfo(),
-                                       N0.getValueType(),
-                                       LN0->isVolatile(), LN0->isNonTemporal(),
-                                       LN0->getAlignment());
+                                       LN0->getBasePtr(), N0.getValueType(),
+                                       LN0->getMemOperand());
       CombineTo(N, ExtLoad);
-      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
-      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, N->getDebugLoc(),
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
                       ISD::ANY_EXTEND);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
@@ -4962,14 +5097,12 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
       N0.hasOneUse()) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
-    SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), N->getDebugLoc(),
+    SDValue ExtLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(N),
                                      VT, LN0->getChain(), LN0->getBasePtr(),
-                                     LN0->getPointerInfo(), MemVT,
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     MemVT, LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(),
-              DAG.getNode(ISD::TRUNCATE, N0.getDebugLoc(),
+              DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                           N0.getValueType(), ExtLoad),
               ExtLoad.getValue(1));
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -4986,7 +5119,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
         // we know that the element size of the sext'd result matches the
         // element size of the compare operands.
       if (VT.getSizeInBits() == N0VT.getSizeInBits())
-        return DAG.getSetCC(N->getDebugLoc(), VT, N0.getOperand(0),
+        return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
                              N0.getOperand(1),
                              cast<CondCodeSDNode>(N0.getOperand(2))->get());
       // If the desired elements are smaller or larger than the source
@@ -5000,16 +5133,16 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
           EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
                            N0VT.getVectorNumElements());
         SDValue VsetCC =
-          DAG.getSetCC(N->getDebugLoc(), MatchingVectorType, N0.getOperand(0),
+          DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
                         N0.getOperand(1),
                         cast<CondCodeSDNode>(N0.getOperand(2))->get());
-        return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
+        return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT);
       }
     }
 
     // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
     SDValue SCC =
-      SimplifySelectCC(N->getDebugLoc(), N0.getOperand(0), N0.getOperand(1),
+      SimplifySelectCC(SDLoc(N), N0.getOperand(0), N0.getOperand(1),
                        DAG.getConstant(1, VT), DAG.getConstant(0, VT),
                        cast<CondCodeSDNode>(N0.getOperand(2))->get(), true);
     if (SCC.getNode())
@@ -5030,9 +5163,8 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
     assert(CV != 0 && "Const value should be ConstSDNode.");
     const APInt &CVal = CV->getAPIntValue();
     APInt NewVal = CVal & Mask;
-    if (NewVal != CVal) {
+    if (NewVal != CVal)
       return DAG.getConstant(NewVal, V.getValueType());
-    }
     break;
   }
   case ISD::OR:
@@ -5056,7 +5188,7 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
       APInt NewMask = Mask << Amt;
       SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask);
       if (SimplifyLHS.getNode())
-        return DAG.getNode(ISD::SRL, V.getDebugLoc(), V.getValueType(),
+        return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
                            SimplifyLHS, V.getOperand(1));
     }
   }
@@ -5160,12 +5292,19 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
 
   // For the transform to be legal, the load must produce only two values
   // (the value loaded and the chain).  Don't transform a pre-increment
-  // load, for example, which produces an extra value.  Otherwise the 
+  // load, for example, which produces an extra value.  Otherwise the
   // transformation is not equivalent, and the downstream logic to replace
   // uses gets things wrong.
   if (LN0->getNumValues() > 2)
     return SDValue();
 
+  // If the load that we're shrinking is an extload and we're not just
+  // discarding the extension we can't simply shrink the load. Bail.
+  // TODO: It would be possible to merge the extensions in some cases.
+  if (LN0->getExtensionType() != ISD::NON_EXTLOAD &&
+      LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
+    return SDValue();
+
   EVT PtrType = N0.getOperand(1).getValueType();
 
   if (PtrType == MVT::Untyped || PtrType.isExtended())
@@ -5182,22 +5321,22 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
 
   uint64_t PtrOff = ShAmt / 8;
   unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
-  SDValue NewPtr = DAG.getNode(ISD::ADD, LN0->getDebugLoc(),
+  SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LN0),
                                PtrType, LN0->getBasePtr(),
                                DAG.getConstant(PtrOff, PtrType));
   AddToWorkList(NewPtr.getNode());
 
   SDValue Load;
   if (ExtType == ISD::NON_EXTLOAD)
-    Load =  DAG.getLoad(VT, N0.getDebugLoc(), LN0->getChain(), NewPtr,
+    Load =  DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
                         LN0->getPointerInfo().getWithOffset(PtrOff),
                         LN0->isVolatile(), LN0->isNonTemporal(),
-                        LN0->isInvariant(), NewAlign);
+                        LN0->isInvariant(), NewAlign, LN0->getTBAAInfo());
   else
-    Load = DAG.getExtLoad(ExtType, N0.getDebugLoc(), VT, LN0->getChain(),NewPtr,
+    Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(),NewPtr,
                           LN0->getPointerInfo().getWithOffset(PtrOff),
                           ExtVT, LN0->isVolatile(), LN0->isNonTemporal(),
-                          NewAlign);
+                          NewAlign, LN0->getTBAAInfo());
 
   // Replace the old load's chain with the new load's chain.
   WorkListRemover DeadNodes(*this);
@@ -5216,7 +5355,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
     if (ShLeftAmt >= VT.getSizeInBits())
       Result = DAG.getConstant(0, VT);
     else
-      Result = DAG.getNode(ISD::SHL, N0.getDebugLoc(), VT,
+      Result = DAG.getNode(ISD::SHL, SDLoc(N0), VT,
                           Result, DAG.getConstant(ShLeftAmt, ShImmTy));
   }
 
@@ -5234,7 +5373,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
 
   // fold (sext_in_reg c1) -> c1
   if (isa<ConstantSDNode>(N0) || N0.getOpcode() == ISD::UNDEF)
-    return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
 
   // If the input is already sign extended, just drop the extension.
   if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
@@ -5242,10 +5381,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
 
   // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
   if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
-      EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT())) {
-    return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+      EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                        N0.getOperand(0), N1);
-  }
 
   // fold (sext_in_reg (sext x)) -> (sext x)
   // fold (sext_in_reg (aext x)) -> (sext x)
@@ -5254,12 +5392,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     SDValue N00 = N0.getOperand(0);
     if (N00.getValueType().getScalarType().getSizeInBits() <= EVTBits &&
         (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
-      return DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, N00, N1);
+      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
   }
 
   // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
   if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
-    return DAG.getZeroExtendInReg(N0, N->getDebugLoc(), EVT);
+    return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT);
 
   // fold operands of sext_in_reg based on knowledge that the top bits are not
   // demanded.
@@ -5282,7 +5420,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
         // extended enough.
         unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
         if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
-          return DAG.getNode(ISD::SRA, N->getDebugLoc(), VT,
+          return DAG.getNode(ISD::SRA, SDLoc(N), VT,
                              N0.getOperand(0), N0.getOperand(1));
       }
   }
@@ -5294,12 +5432,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
        TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
-                                     LN0->getBasePtr(), LN0->getPointerInfo(),
-                                     EVT,
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     LN0->getBasePtr(), EVT,
+                                     LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
     AddToWorkList(ExtLoad.getNode());
@@ -5312,12 +5448,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
        TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, N->getDebugLoc(), VT,
+    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
-                                     LN0->getBasePtr(), LN0->getPointerInfo(),
-                                     EVT,
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     LN0->getBasePtr(), EVT,
+                                     LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -5328,7 +5462,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
     SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
                                        N0.getOperand(1), false);
     if (BSwap.getNode() != 0)
-      return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
                          BSwap, N1);
   }
 
@@ -5345,21 +5479,21 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     return N0;
   // fold (truncate c1) -> c1
   if (isa<ConstantSDNode>(N0))
-    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
   // fold (truncate (truncate x)) -> (truncate x)
   if (N0.getOpcode() == ISD::TRUNCATE)
-    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
   // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
   if (N0.getOpcode() == ISD::ZERO_EXTEND ||
       N0.getOpcode() == ISD::SIGN_EXTEND ||
       N0.getOpcode() == ISD::ANY_EXTEND) {
     if (N0.getOperand(0).getValueType().bitsLT(VT))
       // if the source is smaller than the dest, we still need an extend
-      return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
+      return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
                          N0.getOperand(0));
     if (N0.getOperand(0).getValueType().bitsGT(VT))
       // if the source is larger than the dest, than we just need the truncate
-      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
     // if the source and dest are the same type, we can drop both the extend
     // and the truncate.
     return N0.getOperand(0);
@@ -5391,14 +5525,14 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     SDValue EltNo = N0->getOperand(1);
     if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
       int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
-      EVT IndexTy = N0->getOperand(1).getValueType();
+      EVT IndexTy = TLI.getVectorIdxTy();
       int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
 
-      SDValue V = DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+      SDValue V = DAG.getNode(ISD::BITCAST, SDLoc(N),
                               NVT, N0.getOperand(0));
 
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
-                         N->getDebugLoc(), TrTy, V,
+                         SDLoc(N), TrTy, V,
                          DAG.getConstant(Index, IndexTy));
     }
   }
@@ -5430,7 +5564,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
         Opnds.push_back(BuildVect.getOperand(i));
 
-      return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), VT, &Opnds[0],
+      return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, &Opnds[0],
                          Opnds.size());
     }
   }
@@ -5445,7 +5579,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
                                                VT.getSizeInBits()));
     if (Shorter.getNode())
-      return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, Shorter);
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
   }
   // fold (truncate (load x)) -> (smaller load x)
   // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
@@ -5488,11 +5622,11 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
           Opnds.push_back(DAG.getUNDEF(VTs[i]));
           continue;
         }
-        SDValue NV = DAG.getNode(ISD::TRUNCATE, V.getDebugLoc(), VTs[i], V);
+        SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
         AddToWorkList(NV.getNode());
         Opnds.push_back(NV);
       }
-      return DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                          &Opnds[0], Opnds.size());
     }
   }
@@ -5538,7 +5672,7 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
 
     if (NewAlign <= Align &&
         (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
-      return DAG.getLoad(VT, N->getDebugLoc(), LD1->getChain(),
+      return DAG.getLoad(VT, SDLoc(N), LD1->getChain(),
                          LD1->getBasePtr(), LD1->getPointerInfo(),
                          false, false, false, Align);
   }
@@ -5575,7 +5709,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
 
   // If the input is a constant, let getNode fold it.
   if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
-    SDValue Res = DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, N0);
+    SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0);
     if (Res.getNode() != N) {
       if (!LegalOperations ||
           TLI.isOperationLegal(Res.getNode()->getOpcode(), VT))
@@ -5592,7 +5726,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
 
   // (conv (conv x, t1), t2) -> (conv x, t2)
   if (N0.getOpcode() == ISD::BITCAST)
-    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT,
                        N0.getOperand(0));
 
   // fold (conv (load x)) -> (load (conv*)x)
@@ -5600,20 +5734,22 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       // Do not change the width of a volatile load.
       !cast<LoadSDNode>(N0)->isVolatile() &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     unsigned Align = TLI.getDataLayout()->
       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
     unsigned OrigAlign = LN0->getAlignment();
 
     if (Align <= OrigAlign) {
-      SDValue Load = DAG.getLoad(VT, N->getDebugLoc(), LN0->getChain(),
+      SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(),
                                  LN0->getBasePtr(), LN0->getPointerInfo(),
                                  LN0->isVolatile(), LN0->isNonTemporal(),
-                                 LN0->isInvariant(), OrigAlign);
+                                 LN0->isInvariant(), OrigAlign,
+                                 LN0->getTBAAInfo());
       AddToWorkList(N);
       CombineTo(N0.getNode(),
-                DAG.getNode(ISD::BITCAST, N0.getDebugLoc(),
+                DAG.getNode(ISD::BITCAST, SDLoc(N0),
                             N0.getValueType(), Load),
                 Load.getValue(1));
       return Load;
@@ -5623,20 +5759,20 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
   // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
   // This often reduces constant pool loads.
-  if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(VT)) ||
-       (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(VT))) &&
+  if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
+       (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
       N0.getNode()->hasOneUse() && VT.isInteger() &&
       !VT.isVector() && !N0.getValueType().isVector()) {
-    SDValue NewConv = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(), VT,
+    SDValue NewConv = DAG.getNode(ISD::BITCAST, SDLoc(N0), VT,
                                   N0.getOperand(0));
     AddToWorkList(NewConv.getNode());
 
     APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
     if (N0.getOpcode() == ISD::FNEG)
-      return DAG.getNode(ISD::XOR, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::XOR, SDLoc(N), VT,
                          NewConv, DAG.getConstant(SignBit, VT));
     assert(N0.getOpcode() == ISD::FABS);
-    return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::AND, SDLoc(N), VT,
                        NewConv, DAG.getConstant(~SignBit, VT));
   }
 
@@ -5650,38 +5786,38 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
     unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits();
     EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
     if (isTypeLegal(IntXVT)) {
-      SDValue X = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(),
+      SDValue X = DAG.getNode(ISD::BITCAST, SDLoc(N0),
                               IntXVT, N0.getOperand(1));
       AddToWorkList(X.getNode());
 
       // If X has a different width than the result/lhs, sext it or truncate it.
       unsigned VTWidth = VT.getSizeInBits();
       if (OrigXWidth < VTWidth) {
-        X = DAG.getNode(ISD::SIGN_EXTEND, N->getDebugLoc(), VT, X);
+        X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
         AddToWorkList(X.getNode());
       } else if (OrigXWidth > VTWidth) {
         // To get the sign bit in the right place, we have to shift it right
         // before truncating.
-        X = DAG.getNode(ISD::SRL, X.getDebugLoc(),
+        X = DAG.getNode(ISD::SRL, SDLoc(X),
                         X.getValueType(), X,
                         DAG.getConstant(OrigXWidth-VTWidth, X.getValueType()));
         AddToWorkList(X.getNode());
-        X = DAG.getNode(ISD::TRUNCATE, X.getDebugLoc(), VT, X);
+        X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
         AddToWorkList(X.getNode());
       }
 
       APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
-      X = DAG.getNode(ISD::AND, X.getDebugLoc(), VT,
+      X = DAG.getNode(ISD::AND, SDLoc(X), VT,
                       X, DAG.getConstant(SignBit, VT));
       AddToWorkList(X.getNode());
 
-      SDValue Cst = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(),
+      SDValue Cst = DAG.getNode(ISD::BITCAST, SDLoc(N0),
                                 VT, N0.getOperand(0));
-      Cst = DAG.getNode(ISD::AND, Cst.getDebugLoc(), VT,
+      Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
                         Cst, DAG.getConstant(~SignBit, VT));
       AddToWorkList(Cst.getNode());
 
-      return DAG.getNode(ISD::OR, N->getDebugLoc(), VT, X, Cst);
+      return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
     }
   }
 
@@ -5722,8 +5858,8 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
     // Due to the FP element handling below calling this routine recursively,
     // we can end up with a scalar-to-vector node here.
     if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
-      return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT,
-                         DAG.getNode(ISD::BITCAST, BV->getDebugLoc(),
+      return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT,
+                         DAG.getNode(ISD::BITCAST, SDLoc(BV),
                                      DstEltVT, BV->getOperand(0)));
 
     SmallVector<SDValue, 8> Ops;
@@ -5732,12 +5868,12 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
       // If the vector element type is not legal, the BUILD_VECTOR operands
       // are promoted and implicitly truncated.  Make that explicit here.
       if (Op.getValueType() != SrcEltVT)
-        Op = DAG.getNode(ISD::TRUNCATE, BV->getDebugLoc(), SrcEltVT, Op);
-      Ops.push_back(DAG.getNode(ISD::BITCAST, BV->getDebugLoc(),
+        Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
+      Ops.push_back(DAG.getNode(ISD::BITCAST, SDLoc(BV),
                                 DstEltVT, Op));
       AddToWorkList(Ops.back().getNode());
     }
-    return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+    return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT,
                        &Ops[0], Ops.size());
   }
 
@@ -5794,7 +5930,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
     }
 
     EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
-    return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+    return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT,
                        &Ops[0], Ops.size());
   }
 
@@ -5821,7 +5957,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
       Ops.push_back(DAG.getConstant(ThisVal, DstEltVT));
       if (isS2V && i == 0 && j == 0 && ThisVal.zext(SrcBitSize) == OpVal)
         // Simply turn this into a SCALAR_TO_VECTOR of the new type.
-        return DAG.getNode(ISD::SCALAR_TO_VECTOR, BV->getDebugLoc(), VT,
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT,
                            Ops[0]);
       OpVal = OpVal.lshr(DstBitSize);
     }
@@ -5831,7 +5967,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
       std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
   }
 
-  return DAG.getNode(ISD::BUILD_VECTOR, BV->getDebugLoc(), VT,
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT,
                      &Ops[0], Ops.size());
 }
 
@@ -5850,10 +5986,10 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
 
   // fold (fadd c1, c2) -> c1 + c2
   if (N0CFP && N1CFP)
-    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N1);
   // canonicalize constant to RHS
   if (N0CFP && !N1CFP)
-    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N0);
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N0);
   // fold (fadd A, 0) -> A
   if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
       N1CFP->getValueAPF().isZero())
@@ -5861,20 +5997,20 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
   // fold (fadd A, (fneg B)) -> (fsub A, B)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
     isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
-    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0,
+    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0,
                        GetNegatedExpression(N1, DAG, LegalOperations));
   // fold (fadd (fneg A), B) -> (fsub B, A)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
     isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
-    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N1,
+    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N1,
                        GetNegatedExpression(N0, DAG, LegalOperations));
 
   // If allowed, fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
   if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
       N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
       isa<ConstantFPSDNode>(N0.getOperand(1)))
-    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0.getOperand(0),
-                       DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0.getOperand(0),
+                       DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                    N0.getOperand(1), N1));
 
   // No FP constant should be created after legalization as Instruction
@@ -5883,22 +6019,20 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
   // We don't need test this condition for transformation like following, as
   // the DAG being transformed implies it is legal to take FP constant as
   // operand.
-  // 
+  //
   //  (fadd (fmul c, x), x) -> (fmul c+1, x)
-  // 
+  //
   bool AllowNewFpConst = (Level < AfterLegalizeDAG);
 
   // If allow, fold (fadd (fneg x), x) -> 0.0
   if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath &&
-      N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) {
+      N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
     return DAG.getConstantFP(0.0, VT);
-  }
 
     // If allow, fold (fadd x, (fneg x)) -> 0.0
   if (AllowNewFpConst && DAG.getTarget().Options.UnsafeFPMath &&
-      N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) {
+      N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
     return DAG.getConstantFP(0.0, VT);
-  }
 
   // In unsafe math mode, we can fold chains of FADD's of the same value
   // into multiplications.  This transform is not safe in general because
@@ -5910,43 +6044,43 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
       ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
       ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
 
-      // (fadd (fmul c, x), x) -> (fmul c+1, x)
+      // (fadd (fmul c, x), x) -> (fmul x, c+1)
       if (CFP00 && !CFP01 && N0.getOperand(1) == N1) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP00, 0),
                                      DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N1, NewCFP);
       }
 
-      // (fadd (fmul x, c), x) -> (fmul c+1, x)
+      // (fadd (fmul x, c), x) -> (fmul x, c+1)
       if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP01, 0),
                                      DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N1, NewCFP);
       }
 
-      // (fadd (fmul c, x), (fadd x, x)) -> (fmul c+2, x)
+      // (fadd (fmul c, x), (fadd x, x)) -> (fmul x, c+2)
       if (CFP00 && !CFP01 && N1.getOpcode() == ISD::FADD &&
           N1.getOperand(0) == N1.getOperand(1) &&
           N0.getOperand(1) == N1.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP00, 0),
                                      DAG.getConstantFP(2.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N0.getOperand(1), NewCFP);
       }
 
-      // (fadd (fmul x, c), (fadd x, x)) -> (fmul c+2, x)
+      // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
       if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
           N1.getOperand(0) == N1.getOperand(1) &&
           N0.getOperand(0) == N1.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP01, 0),
                                      DAG.getConstantFP(2.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N0.getOperand(0), NewCFP);
       }
     }
@@ -5955,98 +6089,93 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
       ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
       ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1));
 
-      // (fadd x, (fmul c, x)) -> (fmul c+1, x)
+      // (fadd x, (fmul c, x)) -> (fmul x, c+1)
       if (CFP10 && !CFP11 && N1.getOperand(1) == N0) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP10, 0),
                                      DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N0, NewCFP);
       }
 
-      // (fadd x, (fmul x, c)) -> (fmul c+1, x)
+      // (fadd x, (fmul x, c)) -> (fmul x, c+1)
       if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP11, 0),
                                      DAG.getConstantFP(1.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N0, NewCFP);
       }
 
 
-      // (fadd (fadd x, x), (fmul c, x)) -> (fmul c+2, x)
-      if (CFP10 && !CFP11 && N1.getOpcode() == ISD::FADD &&
-          N1.getOperand(0) == N1.getOperand(1) &&
-          N0.getOperand(1) == N1.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+      // (fadd (fadd x, x), (fmul c, x)) -> (fmul x, c+2)
+      if (CFP10 && !CFP11 && N0.getOpcode() == ISD::FADD &&
+          N0.getOperand(0) == N0.getOperand(1) &&
+          N1.getOperand(1) == N0.getOperand(0)) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP10, 0),
                                      DAG.getConstantFP(2.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
-                           N0.getOperand(1), NewCFP);
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
+                           N1.getOperand(1), NewCFP);
       }
 
-      // (fadd (fadd x, x), (fmul x, c)) -> (fmul c+2, x)
-      if (CFP11 && !CFP10 && N1.getOpcode() == ISD::FADD &&
-          N1.getOperand(0) == N1.getOperand(1) &&
-          N0.getOperand(0) == N1.getOperand(0)) {
-        SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+      // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
+      if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
+          N0.getOperand(0) == N0.getOperand(1) &&
+          N1.getOperand(0) == N0.getOperand(0)) {
+        SDValue NewCFP = DAG.getNode(ISD::FADD, SDLoc(N), VT,
                                      SDValue(CFP11, 0),
                                      DAG.getConstantFP(2.0, VT));
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
-                           N0.getOperand(0), NewCFP);
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
+                           N1.getOperand(0), NewCFP);
       }
     }
 
     if (N0.getOpcode() == ISD::FADD && AllowNewFpConst) {
       ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
-      // (fadd (fadd x, x), x) -> (fmul 3.0, x)
+      // (fadd (fadd x, x), x) -> (fmul x, 3.0)
       if (!CFP && N0.getOperand(0) == N0.getOperand(1) &&
-          (N0.getOperand(0) == N1)) {
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+          (N0.getOperand(0) == N1))
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N1, DAG.getConstantFP(3.0, VT));
-      }
     }
 
     if (N1.getOpcode() == ISD::FADD && AllowNewFpConst) {
       ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
-      // (fadd x, (fadd x, x)) -> (fmul 3.0, x)
+      // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
       if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
-          N1.getOperand(0) == N0) {
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+          N1.getOperand(0) == N0)
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            N0, DAG.getConstantFP(3.0, VT));
-      }
     }
 
-    // (fadd (fadd x, x), (fadd x, x)) -> (fmul 4.0, x)
+    // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
     if (AllowNewFpConst &&
         N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
         N0.getOperand(0) == N0.getOperand(1) &&
         N1.getOperand(0) == N1.getOperand(1) &&
-        N0.getOperand(0) == N1.getOperand(0)) {
-      return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        N0.getOperand(0) == N1.getOperand(0))
+      return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                          N0.getOperand(0),
                          DAG.getConstantFP(4.0, VT));
-    }
   }
 
   // FADD -> FMA combines:
   if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
        DAG.getTarget().Options.UnsafeFPMath) &&
-      DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
-      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+      DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
-      return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
+      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                          N0.getOperand(0), N0.getOperand(1), N1);
-    }
 
     // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
     // Note: Commutes FADD operands.
-    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
-      return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse())
+      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                          N1.getOperand(0), N1.getOperand(1), N0);
-    }
   }
 
   return SDValue();
@@ -6058,7 +6187,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // fold vector ops
   if (VT.isVector()) {
@@ -6068,7 +6197,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
 
   // fold (fsub c1, c2) -> c1-c2
   if (N0CFP && N1CFP)
-    return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0, N1);
   // fold (fsub A, 0) -> A
   if (DAG.getTarget().Options.UnsafeFPMath &&
       N1CFP && N1CFP->getValueAPF().isZero())
@@ -6101,8 +6230,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
       if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI,
                                           &DAG.getTarget().Options))
         return GetNegatedExpression(N11, DAG, LegalOperations);
-      else if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI,
-                                               &DAG.getTarget().Options))
+
+      if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI,
+                                          &DAG.getTarget().Options))
         return GetNegatedExpression(N10, DAG, LegalOperations);
     }
   }
@@ -6110,27 +6240,25 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
   // FSUB -> FMA combines:
   if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
        DAG.getTarget().Options.UnsafeFPMath) &&
-      DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
-      TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
+      DAG.getTarget().getTargetLowering()->isFMAFasterThanFMulAndFAdd(VT) &&
+      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
     // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
+    if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse())
       return DAG.getNode(ISD::FMA, dl, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(ISD::FNEG, dl, VT, N1));
-    }
 
     // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
     // Note: Commutes FSUB operands.
-    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
+    if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse())
       return DAG.getNode(ISD::FMA, dl, VT,
                          DAG.getNode(ISD::FNEG, dl, VT,
                          N1.getOperand(0)),
                          N1.getOperand(1), N0);
-    }
 
-    // fold (fsub (-(fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
-    if (N0.getOpcode() == ISD::FNEG && 
+    // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
+    if (N0.getOpcode() == ISD::FNEG &&
         N0.getOperand(0).getOpcode() == ISD::FMUL &&
         N0->hasOneUse() && N0.getOperand(0).hasOneUse()) {
       SDValue N00 = N0.getOperand(0).getOperand(0);
@@ -6160,10 +6288,10 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
 
   // fold (fmul c1, c2) -> c1*c2
   if (N0CFP && N1CFP)
-    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0, N1);
   // canonicalize constant to RHS
   if (N0CFP && !N1CFP)
-    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N1, N0);
+    return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N1, N0);
   // fold (fmul A, 0) -> 0
   if (DAG.getTarget().Options.UnsafeFPMath &&
       N1CFP && N1CFP->getValueAPF().isZero())
@@ -6177,21 +6305,21 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
     return N0;
   // fold (fmul X, 2.0) -> (fadd X, X)
   if (N1CFP && N1CFP->isExactlyValue(+2.0))
-    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N0);
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N0);
   // fold (fmul X, -1.0) -> (fneg X)
   if (N1CFP && N1CFP->isExactlyValue(-1.0))
     if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-      return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N0);
+      return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
 
   // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
   if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI,
                                        &DAG.getTarget().Options)) {
-    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, 
+    if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI,
                                          &DAG.getTarget().Options)) {
       // Both can be negated for free, check to see if at least one is cheaper
       // negated.
       if (LHSNeg == 2 || RHSNeg == 2)
-        return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                            GetNegatedExpression(N0, DAG, LegalOperations),
                            GetNegatedExpression(N1, DAG, LegalOperations));
     }
@@ -6201,8 +6329,8 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
   if (DAG.getTarget().Options.UnsafeFPMath &&
       N1CFP && N0.getOpcode() == ISD::FMUL &&
       N0.getNode()->hasOneUse() && isa<ConstantFPSDNode>(N0.getOperand(1)))
-    return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0.getOperand(0),
-                       DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
+                       DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                                    N0.getOperand(1), N1));
 
   return SDValue();
@@ -6215,7 +6343,7 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   if (DAG.getTarget().Options.UnsafeFPMath) {
     if (N0CFP && N0CFP->isZero())
@@ -6224,13 +6352,13 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
       return N2;
   }
   if (N0CFP && N0CFP->isExactlyValue(1.0))
-    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N2);
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
   if (N1CFP && N1CFP->isExactlyValue(1.0))
-    return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N2);
+    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
 
   // Canonicalize (fma c, x, y) -> (fma x, c, y)
   if (N0CFP && !N1CFP)
-    return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, N1, N0, N2);
+    return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
 
   // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
   if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
@@ -6267,21 +6395,17 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
   }
 
   // (fma x, c, x) -> (fmul x, (c+1))
-  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && N0 == N2) {
-    return DAG.getNode(ISD::FMUL, dl, VT,
-                       N0,
+  if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && N0 == N2)
+    return DAG.getNode(ISD::FMUL, dl, VT, N0,
                        DAG.getNode(ISD::FADD, dl, VT,
                                    N1, DAG.getConstantFP(1.0, VT)));
-  }
 
   // (fma x, c, (fneg x)) -> (fmul x, (c-1))
   if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
-      N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
-    return DAG.getNode(ISD::FMUL, dl, VT,
-                       N0,
+      N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0)
+    return DAG.getNode(ISD::FMUL, dl, VT, N0,
                        DAG.getNode(ISD::FADD, dl, VT,
                                    N1, DAG.getConstantFP(-1.0, VT)));
-  }
 
 
   return SDValue();
@@ -6303,7 +6427,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
 
   // fold (fdiv c1, c2) -> c1/c2
   if (N0CFP && N1CFP)
-    return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1);
 
   // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
   if (N1CFP && DAG.getTarget().Options.UnsafeFPMath) {
@@ -6320,7 +6444,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
          // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) ||
          TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) ||
          TLI.isFPImmLegal(Recip, VT)))
-      return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0,
+      return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0,
                          DAG.getConstantFP(Recip, VT));
   }
 
@@ -6332,7 +6456,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
       // Both can be negated for free, check to see if at least one is cheaper
       // negated.
       if (LHSNeg == 2 || RHSNeg == 2)
-        return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::FDIV, SDLoc(N), VT,
                            GetNegatedExpression(N0, DAG, LegalOperations),
                            GetNegatedExpression(N1, DAG, LegalOperations));
     }
@@ -6350,7 +6474,7 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
 
   // fold (frem c1, c2) -> fmod(c1,c2)
   if (N0CFP && N1CFP)
-    return DAG.getNode(ISD::FREM, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1);
 
   return SDValue();
 }
@@ -6363,7 +6487,7 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   EVT VT = N->getValueType(0);
 
   if (N0CFP && N1CFP)  // Constant fold
-    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);
 
   if (N1CFP) {
     const APFloat& V = N1CFP->getValueAPF();
@@ -6371,11 +6495,11 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
     // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
     if (!V.isNegative()) {
       if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
-        return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+        return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
     } else {
       if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
-        return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
-                           DAG.getNode(ISD::FABS, N0.getDebugLoc(), VT, N0));
+        return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+                           DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
     }
   }
 
@@ -6384,22 +6508,22 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
   // copysign(copysign(x,z), y) -> copysign(x, y)
   if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
       N0.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                        N0.getOperand(0), N1);
 
   // copysign(x, abs(y)) -> abs(x)
   if (N1.getOpcode() == ISD::FABS)
-    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
 
   // copysign(x, copysign(y,z)) -> copysign(x, z)
   if (N1.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                        N0, N1.getOperand(1));
 
   // copysign(x, fp_extend(y)) -> copysign(x, y)
   // copysign(x, fp_round(y)) -> copysign(x, y)
   if (N1.getOpcode() == ISD::FP_EXTEND || N1.getOpcode() == ISD::FP_ROUND)
-    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                        N0, N1.getOperand(0));
 
   return SDValue();
@@ -6416,7 +6540,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
        TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
-    return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
 
   // If the input is a legal type, and SINT_TO_FP is not legal on this target,
   // but UINT_TO_FP is legal on this target, try to convert.
@@ -6424,7 +6548,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
       TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) {
     // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
     if (DAG.SignBitIsZero(N0))
-      return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
   }
 
   // The next optimizations are desireable only if SELECT_CC can be lowered.
@@ -6442,7 +6566,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
         { N0.getOperand(0), N0.getOperand(1),
           DAG.getConstantFP(-1.0, VT) , DAG.getConstantFP(0.0, VT),
           N0.getOperand(2) };
-      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops, 5);
     }
 
     // fold (sint_to_fp (zext (setcc x, y, cc))) ->
@@ -6455,7 +6579,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
         { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
           DAG.getConstantFP(1.0, VT) , DAG.getConstantFP(0.0, VT),
           N0.getOperand(0).getOperand(2) };
-      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops, 5);
     }
   }
 
@@ -6473,7 +6597,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
        TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
-    return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
 
   // If the input is a legal type, and UINT_TO_FP is not legal on this target,
   // but SINT_TO_FP is legal on this target, try to convert.
@@ -6481,7 +6605,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
       TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) {
     // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
     if (DAG.SignBitIsZero(N0))
-      return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0);
+      return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
   }
 
   // The next optimizations are desireable only if SELECT_CC can be lowered.
@@ -6499,7 +6623,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
         { N0.getOperand(0), N0.getOperand(1),
           DAG.getConstantFP(1.0, VT),  DAG.getConstantFP(0.0, VT),
           N0.getOperand(2) };
-      return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5);
+      return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT, Ops, 5);
     }
   }
 
@@ -6513,7 +6637,7 @@ SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
 
   // fold (fp_to_sint c1fp) -> c1
   if (N0CFP)
-    return DAG.getNode(ISD::FP_TO_SINT, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
 
   return SDValue();
 }
@@ -6525,7 +6649,7 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
 
   // fold (fp_to_uint c1fp) -> c1
   if (N0CFP)
-    return DAG.getNode(ISD::FP_TO_UINT, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
 
   return SDValue();
 }
@@ -6538,7 +6662,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
 
   // fold (fp_round c1fp) -> c1fp
   if (N0CFP)
-    return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0, N1);
+    return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);
 
   // fold (fp_round (fp_extend x)) -> x
   if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
@@ -6549,16 +6673,16 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
     // This is a value preserving truncation if both round's are.
     bool IsTrunc = N->getConstantOperandVal(1) == 1 &&
                    N0.getNode()->getConstantOperandVal(1) == 1;
-    return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0.getOperand(0),
+    return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0),
                        DAG.getIntPtrConstant(IsTrunc));
   }
 
   // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
   if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
-    SDValue Tmp = DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(), VT,
+    SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
                               N0.getOperand(0), N1);
     AddToWorkList(Tmp.getNode());
-    return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT,
+    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                        Tmp, N0.getOperand(1));
   }
 
@@ -6574,7 +6698,7 @@ SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
   // fold (fp_round_inreg c1fp) -> c1fp
   if (N0CFP && isTypeLegal(EVT)) {
     SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), EVT);
-    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, Round);
+    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Round);
   }
 
   return SDValue();
@@ -6592,7 +6716,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
 
   // fold (fp_extend c1fp) -> c1fp
   if (N0CFP)
-    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
 
   // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
   // value of X.
@@ -6601,25 +6725,23 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
     SDValue In = N0.getOperand(0);
     if (In.getValueType() == VT) return In;
     if (VT.bitsLT(In.getValueType()))
-      return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT,
+      return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
                          In, N0.getOperand(1));
-    return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, In);
+    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
   }
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
-  if (ISD::isNON_EXTLoad(N0.getNode()) && N0.hasOneUse() &&
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
        TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType()))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, N->getDebugLoc(), VT,
+    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
-                                     LN0->getBasePtr(), LN0->getPointerInfo(),
-                                     N0.getValueType(),
-                                     LN0->isVolatile(), LN0->isNonTemporal(),
-                                     LN0->getAlignment());
+                                     LN0->getBasePtr(), N0.getValueType(),
+                                     LN0->getMemOperand());
     CombineTo(N, ExtLoad);
     CombineTo(N0.getNode(),
-              DAG.getNode(ISD::FP_ROUND, N0.getDebugLoc(),
+              DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
                           N0.getValueType(), ExtLoad, DAG.getIntPtrConstant(1)),
               ExtLoad.getValue(1));
     return SDValue(N, 0);   // Return N so it doesn't get rechecked!
@@ -6650,10 +6772,10 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
     SDValue Int = N0.getOperand(0);
     EVT IntVT = Int.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
-      Int = DAG.getNode(ISD::XOR, N0.getDebugLoc(), IntVT, Int,
+      Int = DAG.getNode(ISD::XOR, SDLoc(N0), IntVT, Int,
               DAG.getConstant(APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
       AddToWorkList(Int.getNode());
-      return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+      return DAG.getNode(ISD::BITCAST, SDLoc(N),
                          VT, Int);
     }
   }
@@ -6661,12 +6783,11 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
   // (fneg (fmul c, x)) -> (fmul -c, x)
   if (N0.getOpcode() == ISD::FMUL) {
     ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
-    if (CFP1) {
-      return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+    if (CFP1)
+      return DAG.getNode(ISD::FMUL, SDLoc(N), VT,
                          N0.getOperand(0),
-                         DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
+                         DAG.getNode(ISD::FNEG, SDLoc(N), VT,
                                      N0.getOperand(1)));
-    }
   }
 
   return SDValue();
@@ -6679,7 +6800,7 @@ SDValue DAGCombiner::visitFCEIL(SDNode *N) {
 
   // fold (fceil c1) -> fceil(c1)
   if (N0CFP)
-    return DAG.getNode(ISD::FCEIL, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);
 
   return SDValue();
 }
@@ -6691,7 +6812,7 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
 
   // fold (ftrunc c1) -> ftrunc(c1)
   if (N0CFP)
-    return DAG.getNode(ISD::FTRUNC, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);
 
   return SDValue();
 }
@@ -6703,7 +6824,7 @@ SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
 
   // fold (ffloor c1) -> ffloor(c1)
   if (N0CFP)
-    return DAG.getNode(ISD::FFLOOR, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);
 
   return SDValue();
 }
@@ -6720,28 +6841,28 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
 
   // fold (fabs c1) -> fabs(c1)
   if (N0CFP)
-    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
+    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
   // fold (fabs (fabs x)) -> (fabs x)
   if (N0.getOpcode() == ISD::FABS)
     return N->getOperand(0);
   // fold (fabs (fneg x)) -> (fabs x)
   // fold (fabs (fcopysign x, y)) -> (fabs x)
   if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
-    return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0.getOperand(0));
+    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
 
   // Transform fabs(bitconvert(x)) -> bitconvert(x&~sign) to avoid loading
   // constant pool values.
-  if (!TLI.isFAbsFree(VT) && 
+  if (!TLI.isFAbsFree(VT) &&
       N0.getOpcode() == ISD::BITCAST && N0.getNode()->hasOneUse() &&
       N0.getOperand(0).getValueType().isInteger() &&
       !N0.getOperand(0).getValueType().isVector()) {
     SDValue Int = N0.getOperand(0);
     EVT IntVT = Int.getValueType();
     if (IntVT.isInteger() && !IntVT.isVector()) {
-      Int = DAG.getNode(ISD::AND, N0.getDebugLoc(), IntVT, Int,
+      Int = DAG.getNode(ISD::AND, SDLoc(N0), IntVT, Int,
              DAG.getConstant(~APInt::getSignBit(IntVT.getSizeInBits()), IntVT));
       AddToWorkList(Int.getNode());
-      return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+      return DAG.getNode(ISD::BITCAST, SDLoc(N),
                          N->getValueType(0), Int);
     }
   }
@@ -6765,7 +6886,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
   if (N1.getOpcode() == ISD::SETCC &&
       TLI.isOperationLegalOrCustom(ISD::BR_CC,
                                    N1.getOperand(0).getValueType())) {
-    return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
                        Chain, N1.getOperand(2),
                        N1.getOperand(0), N1.getOperand(1), N2);
   }
@@ -6811,12 +6932,12 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
         if (AndConst.isPowerOf2() &&
             cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
           SDValue SetCC =
-            DAG.getSetCC(N->getDebugLoc(),
-                         TLI.getSetCCResultType(Op0.getValueType()),
+            DAG.getSetCC(SDLoc(N),
+                         getSetCCResultType(Op0.getValueType()),
                          Op0, DAG.getConstant(0, Op0.getValueType()),
                          ISD::SETNE);
 
-          SDValue NewBRCond = DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+          SDValue NewBRCond = DAG.getNode(ISD::BRCOND, SDLoc(N),
                                           MVT::Other, Chain, SetCC, N2);
           // Don't add the new BRCond into the worklist or else SimplifySelectCC
           // will convert it back to (X & C1) >> C2.
@@ -6861,7 +6982,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
           DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
           removeFromWorkList(TheXor);
           DAG.DeleteNode(TheXor);
-          return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+          return DAG.getNode(ISD::BRCOND, SDLoc(N),
                              MVT::Other, Chain, Tmp, N2);
         }
 
@@ -6882,8 +7003,8 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
 
       EVT SetCCVT = N1.getValueType();
       if (LegalTypes)
-        SetCCVT = TLI.getSetCCResultType(SetCCVT);
-      SDValue SetCC = DAG.getSetCC(TheXor->getDebugLoc(),
+        SetCCVT = getSetCCResultType(SetCCVT);
+      SDValue SetCC = DAG.getSetCC(SDLoc(TheXor),
                                    SetCCVT,
                                    Op0, Op1,
                                    Equal ? ISD::SETEQ : ISD::SETNE);
@@ -6892,7 +7013,7 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
       DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
       removeFromWorkList(N1.getNode());
       DAG.DeleteNode(N1.getNode());
-      return DAG.getNode(ISD::BRCOND, N->getDebugLoc(),
+      return DAG.getNode(ISD::BRCOND, SDLoc(N),
                          MVT::Other, Chain, SetCC, N2);
     }
   }
@@ -6913,14 +7034,14 @@ SDValue DAGCombiner::visitBR_CC(SDNode *N) {
   // MachineBasicBlock CFG, which is awkward.
 
   // Use SimplifySetCC to simplify SETCC's.
-  SDValue Simp = SimplifySetCC(TLI.getSetCCResultType(CondLHS.getValueType()),
-                               CondLHS, CondRHS, CC->get(), N->getDebugLoc(),
+  SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
+                               CondLHS, CondRHS, CC->get(), SDLoc(N),
                                false);
   if (Simp.getNode()) AddToWorkList(Simp.getNode());
 
   // fold to a simpler setcc
   if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
-    return DAG.getNode(ISD::BR_CC, N->getDebugLoc(), MVT::Other,
+    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
                        N->getOperand(0), Simp.getOperand(2),
                        Simp.getOperand(0), Simp.getOperand(1),
                        N->getOperand(4));
@@ -7118,10 +7239,10 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
 
   SDValue Result;
   if (isLoad)
-    Result = DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+    Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
                                 BasePtr, Offset, AM);
   else
-    Result = DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+    Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
                                  BasePtr, Offset, AM);
   ++PreIndexedNodes;
   ++NodesCombined;
@@ -7156,7 +7277,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
     //   x0 * offset0 + y0 * ptr0 = t0
     // knowing that
     //   x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
-    // 
+    //
     // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
     // indexed load/store and the expresion that needs to be re-written.
     //
@@ -7186,7 +7307,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
     SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0);
 
     SDValue NewUse = DAG.getNode(Opcode,
-                                 OtherUses[i]->getDebugLoc(),
+                                 SDLoc(OtherUses[i]),
                                  OtherUses[i]->getValueType(0), NewOp1, NewOp2);
     DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
     removeFromWorkList(OtherUses[i]);
@@ -7278,7 +7399,7 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
           for (SDNode::use_iterator III = Use->use_begin(),
                  EEE = Use->use_end(); III != EEE; ++III) {
             SDNode *UseUse = *III;
-            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) 
+            if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
               RealUse = true;
           }
 
@@ -7295,9 +7416,9 @@ bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
       // Check for #2
       if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
         SDValue Result = isLoad
-          ? DAG.getIndexedLoad(SDValue(N,0), N->getDebugLoc(),
+          ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
                                BasePtr, Offset, AM)
-          : DAG.getIndexedStore(SDValue(N,0), N->getDebugLoc(),
+          : DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
                                 BasePtr, Offset, AM);
         ++PostIndexedNodes;
         ++NodesCombined;
@@ -7403,17 +7524,20 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
     if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       if (Align > LD->getMemOperand()->getBaseAlignment()) {
         SDValue NewLoad =
-               DAG.getExtLoad(LD->getExtensionType(), N->getDebugLoc(),
+               DAG.getExtLoad(LD->getExtensionType(), SDLoc(N),
                               LD->getValueType(0),
                               Chain, Ptr, LD->getPointerInfo(),
                               LD->getMemoryVT(),
-                              LD->isVolatile(), LD->isNonTemporal(), Align);
+                              LD->isVolatile(), LD->isNonTemporal(), Align,
+                              LD->getTBAAInfo());
         return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
       }
     }
   }
 
-  if (CombinerAA) {
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
+    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  if (UseAA) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
 
@@ -7423,22 +7547,17 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
 
       // Replace the chain to void dependency.
       if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
-        ReplLoad = DAG.getLoad(N->getValueType(0), LD->getDebugLoc(),
-                               BetterChain, Ptr, LD->getPointerInfo(),
-                               LD->isVolatile(), LD->isNonTemporal(),
-                               LD->isInvariant(), LD->getAlignment());
+        ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
+                               BetterChain, Ptr, LD->getMemOperand());
       } else {
-        ReplLoad = DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(),
+        ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
                                   LD->getValueType(0),
-                                  BetterChain, Ptr, LD->getPointerInfo(),
-                                  LD->getMemoryVT(),
-                                  LD->isVolatile(),
-                                  LD->isNonTemporal(),
-                                  LD->getAlignment());
+                                  BetterChain, Ptr, LD->getMemoryVT(),
+                                  LD->getMemOperand());
       }
 
       // Create token factor to keep old chain connected.
-      SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+      SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
                                   MVT::Other, Chain, ReplLoad.getValue(1));
 
       // Make sure the new and old chains are cleaned up.
@@ -7454,9 +7573,562 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
   if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
     return SDValue(N, 0);
 
+  // Try to slice up N to more direct loads if the slices are mapped to
+  // different register banks or pairing can take place.
+  if (SliceUpLoad(N))
+    return SDValue(N, 0);
+
   return SDValue();
 }
 
+namespace {
+/// \brief Helper structure used to slice a load in smaller loads.
+/// Basically a slice is obtained from the following sequence:
+/// Origin = load Ty1, Base
+/// Shift = srl Ty1 Origin, CstTy Amount
+/// Inst = trunc Shift to Ty2
+///
+/// Then, it will be rewriten into:
+/// Slice = load SliceTy, Base + SliceOffset
+/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
+///
+/// SliceTy is deduced from the number of bits that are actually used to
+/// build Inst.
+struct LoadedSlice {
+  /// \brief Helper structure used to compute the cost of a slice.
+  struct Cost {
+    /// Are we optimizing for code size.
+    bool ForCodeSize;
+    /// Various cost.
+    unsigned Loads;
+    unsigned Truncates;
+    unsigned CrossRegisterBanksCopies;
+    unsigned ZExts;
+    unsigned Shift;
+
+    Cost(bool ForCodeSize = false)
+        : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
+          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
+
+    /// \brief Get the cost of one isolated slice.
+    Cost(const LoadedSlice &LS, bool ForCodeSize = false)
+        : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
+          CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
+      EVT TruncType = LS.Inst->getValueType(0);
+      EVT LoadedType = LS.getLoadedType();
+      if (TruncType != LoadedType &&
+          !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
+        ZExts = 1;
+    }
+
+    /// \brief Account for slicing gain in the current cost.
+    /// Slicing provide a few gains like removing a shift or a
+    /// truncate. This method allows to grow the cost of the original
+    /// load with the gain from this slice.
+    void addSliceGain(const LoadedSlice &LS) {
+      // Each slice saves a truncate.
+      const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
+      if (!TLI.isTruncateFree(LS.Inst->getValueType(0),
+                              LS.Inst->getOperand(0).getValueType()))
+        ++Truncates;
+      // If there is a shift amount, this slice gets rid of it.
+      if (LS.Shift)
+        ++Shift;
+      // If this slice can merge a cross register bank copy, account for it.
+      if (LS.canMergeExpensiveCrossRegisterBankCopy())
+        ++CrossRegisterBanksCopies;
+    }
+
+    Cost &operator+=(const Cost &RHS) {
+      Loads += RHS.Loads;
+      Truncates += RHS.Truncates;
+      CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
+      ZExts += RHS.ZExts;
+      Shift += RHS.Shift;
+      return *this;
+    }
+
+    bool operator==(const Cost &RHS) const {
+      return Loads == RHS.Loads && Truncates == RHS.Truncates &&
+             CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
+             ZExts == RHS.ZExts && Shift == RHS.Shift;
+    }
+
+    bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
+
+    bool operator<(const Cost &RHS) const {
+      // Assume cross register banks copies are as expensive as loads.
+      // FIXME: Do we want some more target hooks?
+      unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
+      unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
+      // Unless we are optimizing for code size, consider the
+      // expensive operation first.
+      if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
+        return ExpensiveOpsLHS < ExpensiveOpsRHS;
+      return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
+             (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
+    }
+
+    bool operator>(const Cost &RHS) const { return RHS < *this; }
+
+    bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
+
+    bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
+  };
+  // The last instruction that represent the slice. This should be a
+  // truncate instruction.
+  SDNode *Inst;
+  // The original load instruction.
+  LoadSDNode *Origin;
+  // The right shift amount in bits from the original load.
+  unsigned Shift;
+  // The DAG from which Origin came from.
+  // This is used to get some contextual information about legal types, etc.
+  SelectionDAG *DAG;
+
+  LoadedSlice(SDNode *Inst = NULL, LoadSDNode *Origin = NULL,
+              unsigned Shift = 0, SelectionDAG *DAG = NULL)
+      : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
+
+  LoadedSlice(const LoadedSlice &LS)
+      : Inst(LS.Inst), Origin(LS.Origin), Shift(LS.Shift), DAG(LS.DAG) {}
+
+  /// \brief Get the bits used in a chunk of bits \p BitWidth large.
+  /// \return Result is \p BitWidth and has used bits set to 1 and
+  ///         not used bits set to 0.
+  APInt getUsedBits() const {
+    // Reproduce the trunc(lshr) sequence:
+    // - Start from the truncated value.
+    // - Zero extend to the desired bit width.
+    // - Shift left.
+    assert(Origin && "No original load to compare against.");
+    unsigned BitWidth = Origin->getValueSizeInBits(0);
+    assert(Inst && "This slice is not bound to an instruction");
+    assert(Inst->getValueSizeInBits(0) <= BitWidth &&
+           "Extracted slice is bigger than the whole type!");
+    APInt UsedBits(Inst->getValueSizeInBits(0), 0);
+    UsedBits.setAllBits();
+    UsedBits = UsedBits.zext(BitWidth);
+    UsedBits <<= Shift;
+    return UsedBits;
+  }
+
+  /// \brief Get the size of the slice to be loaded in bytes.
+  unsigned getLoadedSize() const {
+    unsigned SliceSize = getUsedBits().countPopulation();
+    assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
+    return SliceSize / 8;
+  }
+
+  /// \brief Get the type that will be loaded for this slice.
+  /// Note: This may not be the final type for the slice.
+  EVT getLoadedType() const {
+    assert(DAG && "Missing context");
+    LLVMContext &Ctxt = *DAG->getContext();
+    return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
+  }
+
+  /// \brief Get the alignment of the load used for this slice.
+  unsigned getAlignment() const {
+    unsigned Alignment = Origin->getAlignment();
+    unsigned Offset = getOffsetFromBase();
+    if (Offset != 0)
+      Alignment = MinAlign(Alignment, Alignment + Offset);
+    return Alignment;
+  }
+
+  /// \brief Check if this slice can be rewritten with legal operations.
+  bool isLegal() const {
+    // An invalid slice is not legal.
+    if (!Origin || !Inst || !DAG)
+      return false;
+
+    // Offsets are for indexed load only, we do not handle that.
+    if (Origin->getOffset().getOpcode() != ISD::UNDEF)
+      return false;
+
+    const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+
+    // Check that the type is legal.
+    EVT SliceType = getLoadedType();
+    if (!TLI.isTypeLegal(SliceType))
+      return false;
+
+    // Check that the load is legal for this type.
+    if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
+      return false;
+
+    // Check that the offset can be computed.
+    // 1. Check its type.
+    EVT PtrType = Origin->getBasePtr().getValueType();
+    if (PtrType == MVT::Untyped || PtrType.isExtended())
+      return false;
+
+    // 2. Check that it fits in the immediate.
+    if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
+      return false;
+
+    // 3. Check that the computation is legal.
+    if (!TLI.isOperationLegal(ISD::ADD, PtrType))
+      return false;
+
+    // Check that the zext is legal if it needs one.
+    EVT TruncateType = Inst->getValueType(0);
+    if (TruncateType != SliceType &&
+        !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
+      return false;
+
+    return true;
+  }
+
+  /// \brief Get the offset in bytes of this slice in the original chunk of
+  /// bits.
+  /// \pre DAG != NULL.
+  uint64_t getOffsetFromBase() const {
+    assert(DAG && "Missing context.");
+    bool IsBigEndian =
+        DAG->getTargetLoweringInfo().getDataLayout()->isBigEndian();
+    assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
+    uint64_t Offset = Shift / 8;
+    unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
+    assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
+           "The size of the original loaded type is not a multiple of a"
+           " byte.");
+    // If Offset is bigger than TySizeInBytes, it means we are loading all
+    // zeros. This should have been optimized before in the process.
+    assert(TySizeInBytes > Offset &&
+           "Invalid shift amount for given loaded size");
+    if (IsBigEndian)
+      Offset = TySizeInBytes - Offset - getLoadedSize();
+    return Offset;
+  }
+
+  /// \brief Generate the sequence of instructions to load the slice
+  /// represented by this object and redirect the uses of this slice to
+  /// this new sequence of instructions.
+  /// \pre this->Inst && this->Origin are valid Instructions and this
+  /// object passed the legal check: LoadedSlice::isLegal returned true.
+  /// \return The last instruction of the sequence used to load the slice.
+  SDValue loadSlice() const {
+    assert(Inst && Origin && "Unable to replace a non-existing slice.");
+    const SDValue &OldBaseAddr = Origin->getBasePtr();
+    SDValue BaseAddr = OldBaseAddr;
+    // Get the offset in that chunk of bytes w.r.t. the endianess.
+    int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
+    assert(Offset >= 0 && "Offset too big to fit in int64_t!");
+    if (Offset) {
+      // BaseAddr = BaseAddr + Offset.
+      EVT ArithType = BaseAddr.getValueType();
+      BaseAddr = DAG->getNode(ISD::ADD, SDLoc(Origin), ArithType, BaseAddr,
+                              DAG->getConstant(Offset, ArithType));
+    }
+
+    // Create the type of the loaded slice according to its size.
+    EVT SliceType = getLoadedType();
+
+    // Create the load for the slice.
+    SDValue LastInst = DAG->getLoad(
+        SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
+        Origin->getPointerInfo().getWithOffset(Offset), Origin->isVolatile(),
+        Origin->isNonTemporal(), Origin->isInvariant(), getAlignment());
+    // If the final type is not the same as the loaded type, this means that
+    // we have to pad with zero. Create a zero extend for that.
+    EVT FinalType = Inst->getValueType(0);
+    if (SliceType != FinalType)
+      LastInst =
+          DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
+    return LastInst;
+  }
+
+  /// \brief Check if this slice can be merged with an expensive cross register
+  /// bank copy. E.g.,
+  /// i = load i32
+  /// f = bitcast i32 i to float
+  bool canMergeExpensiveCrossRegisterBankCopy() const {
+    if (!Inst || !Inst->hasOneUse())
+      return false;
+    SDNode *Use = *Inst->use_begin();
+    if (Use->getOpcode() != ISD::BITCAST)
+      return false;
+    assert(DAG && "Missing context");
+    const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+    EVT ResVT = Use->getValueType(0);
+    const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
+    const TargetRegisterClass *ArgRC =
+        TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
+    if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
+      return false;
+
+    // At this point, we know that we perform a cross-register-bank copy.
+    // Check if it is expensive.
+    const TargetRegisterInfo *TRI = TLI.getTargetMachine().getRegisterInfo();
+    // Assume bitcasts are cheap, unless both register classes do not
+    // explicitly share a common sub class.
+    if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
+      return false;
+
+    // Check if it will be merged with the load.
+    // 1. Check the alignment constraint.
+    unsigned RequiredAlignment = TLI.getDataLayout()->getABITypeAlignment(
+        ResVT.getTypeForEVT(*DAG->getContext()));
+
+    if (RequiredAlignment > getAlignment())
+      return false;
+
+    // 2. Check that the load is a legal operation for that type.
+    if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
+      return false;
+
+    // 3. Check that we do not have a zext in the way.
+    if (Inst->getValueType(0) != getLoadedType())
+      return false;
+
+    return true;
+  }
+};
+}
+
+/// \brief Sorts LoadedSlice according to their offset.
+struct LoadedSliceSorter {
+  bool operator()(const LoadedSlice &LHS, const LoadedSlice &RHS) {
+    assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
+    return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
+  }
+};
+
+/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
+/// \p UsedBits looks like 0..0 1..1 0..0.
+static bool areUsedBitsDense(const APInt &UsedBits) {
+  // If all the bits are one, this is dense!
+  if (UsedBits.isAllOnesValue())
+    return true;
+
+  // Get rid of the unused bits on the right.
+  APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
+  // Get rid of the unused bits on the left.
+  if (NarrowedUsedBits.countLeadingZeros())
+    NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
+  // Check that the chunk of bits is completely used.
+  return NarrowedUsedBits.isAllOnesValue();
+}
+
+/// \brief Check whether or not \p First and \p Second are next to each other
+/// in memory. This means that there is no hole between the bits loaded
+/// by \p First and the bits loaded by \p Second.
+static bool areSlicesNextToEachOther(const LoadedSlice &First,
+                                     const LoadedSlice &Second) {
+  assert(First.Origin == Second.Origin && First.Origin &&
+         "Unable to match different memory origins.");
+  APInt UsedBits = First.getUsedBits();
+  assert((UsedBits & Second.getUsedBits()) == 0 &&
+         "Slices are not supposed to overlap.");
+  UsedBits |= Second.getUsedBits();
+  return areUsedBitsDense(UsedBits);
+}
+
+/// \brief Adjust the \p GlobalLSCost according to the target
+/// paring capabilities and the layout of the slices.
+/// \pre \p GlobalLSCost should account for at least as many loads as
+/// there is in the slices in \p LoadedSlices.
+static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+                                 LoadedSlice::Cost &GlobalLSCost) {
+  unsigned NumberOfSlices = LoadedSlices.size();
+  // If there is less than 2 elements, no pairing is possible.
+  if (NumberOfSlices < 2)
+    return;
+
+  // Sort the slices so that elements that are likely to be next to each
+  // other in memory are next to each other in the list.
+  std::sort(LoadedSlices.begin(), LoadedSlices.end(), LoadedSliceSorter());
+  const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
+  // First (resp. Second) is the first (resp. Second) potentially candidate
+  // to be placed in a paired load.
+  const LoadedSlice *First = NULL;
+  const LoadedSlice *Second = NULL;
+  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
+                // Set the beginning of the pair.
+                                                           First = Second) {
+
+    Second = &LoadedSlices[CurrSlice];
+
+    // If First is NULL, it means we start a new pair.
+    // Get to the next slice.
+    if (!First)
+      continue;
+
+    EVT LoadedType = First->getLoadedType();
+
+    // If the types of the slices are different, we cannot pair them.
+    if (LoadedType != Second->getLoadedType())
+      continue;
+
+    // Check if the target supplies paired loads for this type.
+    unsigned RequiredAlignment = 0;
+    if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
+      // move to the next pair, this type is hopeless.
+      Second = NULL;
+      continue;
+    }
+    // Check if we meet the alignment requirement.
+    if (RequiredAlignment > First->getAlignment())
+      continue;
+
+    // Check that both loads are next to each other in memory.
+    if (!areSlicesNextToEachOther(*First, *Second))
+      continue;
+
+    assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
+    --GlobalLSCost.Loads;
+    // Move to the next pair.
+    Second = NULL;
+  }
+}
+
+/// \brief Check the profitability of all involved LoadedSlice.
+/// Currently, it is considered profitable if there is exactly two
+/// involved slices (1) which are (2) next to each other in memory, and
+/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
+///
+/// Note: The order of the elements in \p LoadedSlices may be modified, but not
+/// the elements themselves.
+///
+/// FIXME: When the cost model will be mature enough, we can relax
+/// constraints (1) and (2).
+static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+                                const APInt &UsedBits, bool ForCodeSize) {
+  unsigned NumberOfSlices = LoadedSlices.size();
+  if (StressLoadSlicing)
+    return NumberOfSlices > 1;
+
+  // Check (1).
+  if (NumberOfSlices != 2)
+    return false;
+
+  // Check (2).
+  if (!areUsedBitsDense(UsedBits))
+    return false;
+
+  // Check (3).
+  LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
+  // The original code has one big load.
+  OrigCost.Loads = 1;
+  for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
+    const LoadedSlice &LS = LoadedSlices[CurrSlice];
+    // Accumulate the cost of all the slices.
+    LoadedSlice::Cost SliceCost(LS, ForCodeSize);
+    GlobalSlicingCost += SliceCost;
+
+    // Account as cost in the original configuration the gain obtained
+    // with the current slices.
+    OrigCost.addSliceGain(LS);
+  }
+
+  // If the target supports paired load, adjust the cost accordingly.
+  adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
+  return OrigCost > GlobalSlicingCost;
+}
+
+/// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
+/// operations, split it in the various pieces being extracted.
+///
+/// This sort of thing is introduced by SROA.
+/// This slicing takes care not to insert overlapping loads.
+/// \pre LI is a simple load (i.e., not an atomic or volatile load).
+bool DAGCombiner::SliceUpLoad(SDNode *N) {
+  if (Level < AfterLegalizeDAG)
+    return false;
+
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
+      !LD->getValueType(0).isInteger())
+    return false;
+
+  // Keep track of already used bits to detect overlapping values.
+  // In that case, we will just abort the transformation.
+  APInt UsedBits(LD->getValueSizeInBits(0), 0);
+
+  SmallVector<LoadedSlice, 4> LoadedSlices;
+
+  // Check if this load is used as several smaller chunks of bits.
+  // Basically, look for uses in trunc or trunc(lshr) and record a new chain
+  // of computation for each trunc.
+  for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+       UI != UIEnd; ++UI) {
+    // Skip the uses of the chain.
+    if (UI.getUse().getResNo() != 0)
+      continue;
+
+    SDNode *User = *UI;
+    unsigned Shift = 0;
+
+    // Check if this is a trunc(lshr).
+    if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
+        isa<ConstantSDNode>(User->getOperand(1))) {
+      Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
+      User = *User->use_begin();
+    }
+
+    // At this point, User is a Truncate, iff we encountered, trunc or
+    // trunc(lshr).
+    if (User->getOpcode() != ISD::TRUNCATE)
+      return false;
+
+    // The width of the type must be a power of 2 and greater than 8-bits.
+    // Otherwise the load cannot be represented in LLVM IR.
+    // Moreover, if we shifted with a non 8-bits multiple, the slice
+    // will be accross several bytes. We do not support that.
+    unsigned Width = User->getValueSizeInBits(0);
+    if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
+      return 0;
+
+    // Build the slice for this chain of computations.
+    LoadedSlice LS(User, LD, Shift, &DAG);
+    APInt CurrentUsedBits = LS.getUsedBits();
+
+    // Check if this slice overlaps with another.
+    if ((CurrentUsedBits & UsedBits) != 0)
+      return false;
+    // Update the bits used globally.
+    UsedBits |= CurrentUsedBits;
+
+    // Check if the new slice would be legal.
+    if (!LS.isLegal())
+      return false;
+
+    // Record the slice.
+    LoadedSlices.push_back(LS);
+  }
+
+  // Abort slicing if it does not seem to be profitable.
+  if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
+    return false;
+
+  ++SlicedLoads;
+
+  // Rewrite each chain to use an independent load.
+  // By construction, each chain can be represented by a unique load.
+
+  // Prepare the argument for the new token factor for all the slices.
+  SmallVector<SDValue, 8> ArgChains;
+  for (SmallVectorImpl<LoadedSlice>::const_iterator
+           LSIt = LoadedSlices.begin(),
+           LSItEnd = LoadedSlices.end();
+       LSIt != LSItEnd; ++LSIt) {
+    SDValue SliceInst = LSIt->loadSlice();
+    CombineTo(LSIt->Inst, SliceInst, true);
+    if (SliceInst.getNode()->getOpcode() != ISD::LOAD)
+      SliceInst = SliceInst.getOperand(0);
+    assert(SliceInst->getOpcode() == ISD::LOAD &&
+           "It takes more than a zext to get to the loaded slice!!");
+    ArgChains.push_back(SliceInst.getValue(1));
+  }
+
+  SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
+                              &ArgChains[0], ArgChains.size());
+  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+  return true;
+}
+
 /// CheckForMaskedLoad - Check to see if V is (and load (ptr), imm), where the
 /// load is having specific bytes cleared out.  If so, return the byte size
 /// being masked out and the shift amount.
@@ -7500,9 +8172,9 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
   // 0 and the bits being kept are 1.  Use getSExtValue so that leading bits
   // follow the sign bit for uniformity.
   uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
-  unsigned NotMaskLZ = CountLeadingZeros_64(NotMask);
+  unsigned NotMaskLZ = countLeadingZeros(NotMask);
   if (NotMaskLZ & 7) return Result;  // Must be multiple of a byte.
-  unsigned NotMaskTZ = CountTrailingZeros_64(NotMask);
+  unsigned NotMaskTZ = countTrailingZeros(NotMask);
   if (NotMaskTZ & 7) return Result;  // Must be multiple of a byte.
   if (NotMaskLZ == 64) return Result;  // All zero mask.
 
@@ -7559,7 +8231,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
   // Okay, we can do this!  Replace the 'St' store with a store of IVal that is
   // shifted by ByteShift and truncated down to NumBytes.
   if (ByteShift)
-    IVal = DAG.getNode(ISD::SRL, IVal->getDebugLoc(), IVal.getValueType(), IVal,
+    IVal = DAG.getNode(ISD::SRL, SDLoc(IVal), IVal.getValueType(), IVal,
                        DAG.getConstant(ByteShift*8,
                                     DC->getShiftAmountTy(IVal.getValueType())));
 
@@ -7574,16 +8246,16 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
 
   SDValue Ptr = St->getBasePtr();
   if (StOffset) {
-    Ptr = DAG.getNode(ISD::ADD, IVal->getDebugLoc(), Ptr.getValueType(),
+    Ptr = DAG.getNode(ISD::ADD, SDLoc(IVal), Ptr.getValueType(),
                       Ptr, DAG.getConstant(StOffset, Ptr.getValueType()));
     NewAlign = MinAlign(NewAlign, StOffset);
   }
 
   // Truncate down to the new size.
-  IVal = DAG.getNode(ISD::TRUNCATE, IVal->getDebugLoc(), VT, IVal);
+  IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);
 
   ++OpsNarrowed;
-  return DAG.getStore(St->getChain(), St->getDebugLoc(), IVal, Ptr,
+  return DAG.getStore(St->getChain(), SDLoc(St), IVal, Ptr,
                       St->getPointerInfo().getWithOffset(StOffset),
                       false, false, NewAlign).getNode();
 }
@@ -7684,17 +8356,18 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
       if (NewAlign < TLI.getDataLayout()->getABITypeAlignment(NewVTTy))
         return SDValue();
 
-      SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
+      SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD),
                                    Ptr.getValueType(), Ptr,
                                    DAG.getConstant(PtrOff, Ptr.getValueType()));
-      SDValue NewLD = DAG.getLoad(NewVT, N0.getDebugLoc(),
+      SDValue NewLD = DAG.getLoad(NewVT, SDLoc(N0),
                                   LD->getChain(), NewPtr,
                                   LD->getPointerInfo().getWithOffset(PtrOff),
                                   LD->isVolatile(), LD->isNonTemporal(),
-                                  LD->isInvariant(), NewAlign);
-      SDValue NewVal = DAG.getNode(Opc, Value.getDebugLoc(), NewVT, NewLD,
+                                  LD->isInvariant(), NewAlign,
+                                  LD->getTBAAInfo());
+      SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
                                    DAG.getConstant(NewImm, NewVT));
-      SDValue NewST = DAG.getStore(Chain, N->getDebugLoc(),
+      SDValue NewST = DAG.getStore(Chain, SDLoc(N),
                                    NewVal, NewPtr,
                                    ST->getPointerInfo().getWithOffset(PtrOff),
                                    false, false, NewAlign);
@@ -7747,12 +8420,12 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
     if (LDAlign < ABIAlign || STAlign < ABIAlign)
       return SDValue();
 
-    SDValue NewLD = DAG.getLoad(IntVT, Value.getDebugLoc(),
+    SDValue NewLD = DAG.getLoad(IntVT, SDLoc(Value),
                                 LD->getChain(), LD->getBasePtr(),
                                 LD->getPointerInfo(),
                                 false, false, false, LDAlign);
 
-    SDValue NewST = DAG.getStore(NewLD.getValue(1), N->getDebugLoc(),
+    SDValue NewST = DAG.getStore(NewLD.getValue(1), SDLoc(N),
                                  NewLD, ST->getBasePtr(),
                                  ST->getPointerInfo(),
                                  false, false, STAlign);
@@ -7802,17 +8475,28 @@ struct BaseIndexOffset {
   static BaseIndexOffset match(SDValue Ptr) {
     bool IsIndexSignExt = false;
 
-    // Just Base or possibly anything else.
+    // We only can pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD
+    // instruction, then it could be just the BASE or everything else we don't
+    // know how to handle. Just use Ptr as BASE and give up.
     if (Ptr->getOpcode() != ISD::ADD)
       return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
 
-    // Base + offset.
+    // We know that we have at least an ADD instruction. Try to pattern match
+    // the simple case of BASE + OFFSET.
     if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
       int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
       return  BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset,
                               IsIndexSignExt);
     }
 
+    // Inside a loop the current BASE pointer is calculated using an ADD and a
+    // MUL instruction. In this case Ptr is the actual BASE pointer.
+    // (i64 add (i64 %array_ptr)
+    //          (i64 mul (i64 %induction_var)
+    //                   (i64 %element_size)))
+    if (Ptr->getOperand(1)->getOpcode() == ISD::MUL)
+      return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
     // Look at Base + Index + Offset cases.
     SDValue Base = Ptr->getOperand(0);
     SDValue IndexOffset = Ptr->getOperand(1);
@@ -7963,6 +8647,11 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
         Index = STn;
         break;
       } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
+        if (Ldn->isVolatile()) {
+          Index = NULL;
+          break;
+        }
+
         // Save the load node for later. Continue the scan.
         AliasLoadNodes.push_back(Ldn);
         NextInChain = Ldn->getChain().getNode();
@@ -8080,7 +8769,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
 
     // The earliest Node in the DAG.
     LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
-    DebugLoc DL = StoreNodes[0].MemNode->getDebugLoc();
+    SDLoc DL(StoreNodes[0].MemNode);
 
     SDValue StoredVal;
     if (UseVector) {
@@ -8276,8 +8965,8 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
     JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
   }
 
-  DebugLoc LoadDL = LoadNodes[0].MemNode->getDebugLoc();
-  DebugLoc StoreDL = StoreNodes[0].MemNode->getDebugLoc();
+  SDLoc LoadDL(LoadNodes[0].MemNode);
+  SDLoc StoreDL(StoreNodes[0].MemNode);
 
   LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
   SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
@@ -8338,9 +9027,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     if (Align <= OrigAlign &&
         ((!LegalOperations && !ST->isVolatile()) ||
          TLI.isOperationLegalOrCustom(ISD::STORE, SVT)))
-      return DAG.getStore(Chain, N->getDebugLoc(), Value.getOperand(0),
+      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0),
                           Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                          ST->isNonTemporal(), OrigAlign);
+                          ST->isNonTemporal(), OrigAlign,
+                          ST->getTBAAInfo());
   }
 
   // Turn 'store undef, Ptr' -> nothing.
@@ -8355,7 +9045,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // transform should not be done in this case.
     if (Value.getOpcode() != ISD::TargetConstantFP) {
       SDValue Tmp;
-      switch (CFP->getValueType(0).getSimpleVT().SimpleTy) {
+      switch (CFP->getSimpleValueType(0).SimpleTy) {
       default: llvm_unreachable("Unknown FP type");
       case MVT::f16:    // We don't do this for these yet.
       case MVT::f80:
@@ -8367,9 +9057,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
             TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
           Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
                               bitcastToAPInt().getZExtValue(), MVT::i32);
-          return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
-                              Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                              ST->isNonTemporal(), ST->getAlignment());
+          return DAG.getStore(Chain, SDLoc(N), Tmp,
+                              Ptr, ST->getMemOperand());
         }
         break;
       case MVT::f64:
@@ -8378,9 +9067,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
             TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
           Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
                                 getZExtValue(), MVT::i64);
-          return DAG.getStore(Chain, N->getDebugLoc(), Tmp,
-                              Ptr, ST->getPointerInfo(), ST->isVolatile(),
-                              ST->isNonTemporal(), ST->getAlignment());
+          return DAG.getStore(Chain, SDLoc(N), Tmp,
+                              Ptr, ST->getMemOperand());
         }
 
         if (!ST->isVolatile() &&
@@ -8396,19 +9084,20 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
           unsigned Alignment = ST->getAlignment();
           bool isVolatile = ST->isVolatile();
           bool isNonTemporal = ST->isNonTemporal();
+          const MDNode *TBAAInfo = ST->getTBAAInfo();
 
-          SDValue St0 = DAG.getStore(Chain, ST->getDebugLoc(), Lo,
+          SDValue St0 = DAG.getStore(Chain, SDLoc(ST), Lo,
                                      Ptr, ST->getPointerInfo(),
                                      isVolatile, isNonTemporal,
-                                     ST->getAlignment());
-          Ptr = DAG.getNode(ISD::ADD, N->getDebugLoc(), Ptr.getValueType(), Ptr,
+                                     ST->getAlignment(), TBAAInfo);
+          Ptr = DAG.getNode(ISD::ADD, SDLoc(N), Ptr.getValueType(), Ptr,
                             DAG.getConstant(4, Ptr.getValueType()));
           Alignment = MinAlign(Alignment, 4U);
-          SDValue St1 = DAG.getStore(Chain, ST->getDebugLoc(), Hi,
+          SDValue St1 = DAG.getStore(Chain, SDLoc(ST), Hi,
                                      Ptr, ST->getPointerInfo().getWithOffset(4),
                                      isVolatile, isNonTemporal,
-                                     Alignment);
-          return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
+                                     Alignment, TBAAInfo);
+          return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other,
                              St0, St1);
         }
 
@@ -8421,9 +9110,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
     if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
       if (Align > ST->getAlignment())
-        return DAG.getTruncStore(Chain, N->getDebugLoc(), Value,
+        return DAG.getTruncStore(Chain, SDLoc(N), Value,
                                  Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
-                                 ST->isVolatile(), ST->isNonTemporal(), Align);
+                                 ST->isVolatile(), ST->isNonTemporal(), Align,
+                                 ST->getTBAAInfo());
     }
   }
 
@@ -8433,7 +9123,9 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   if (NewST.getNode())
     return NewST;
 
-  if (CombinerAA) {
+  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA :
+    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  if (UseAA) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
 
@@ -8443,19 +9135,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
 
       // Replace the chain to avoid dependency.
       if (ST->isTruncatingStore()) {
-        ReplStore = DAG.getTruncStore(BetterChain, N->getDebugLoc(), Value, Ptr,
-                                      ST->getPointerInfo(),
-                                      ST->getMemoryVT(), ST->isVolatile(),
-                                      ST->isNonTemporal(), ST->getAlignment());
+        ReplStore = DAG.getTruncStore(BetterChain, SDLoc(N), Value, Ptr,
+                                      ST->getMemoryVT(), ST->getMemOperand());
       } else {
-        ReplStore = DAG.getStore(BetterChain, N->getDebugLoc(), Value, Ptr,
-                                 ST->getPointerInfo(),
-                                 ST->isVolatile(), ST->isNonTemporal(),
-                                 ST->getAlignment());
+        ReplStore = DAG.getStore(BetterChain, SDLoc(N), Value, Ptr,
+                                 ST->getMemOperand());
       }
 
       // Create token to keep both nodes around.
-      SDValue Token = DAG.getNode(ISD::TokenFactor, N->getDebugLoc(),
+      SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
                                   MVT::Other, Chain, ReplStore);
 
       // Make sure the new and old chains are cleaned up.
@@ -8483,10 +9171,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                         ST->getMemoryVT().getScalarType().getSizeInBits()));
     AddToWorkList(Value.getNode());
     if (Shorter.getNode())
-      return DAG.getTruncStore(Chain, N->getDebugLoc(), Shorter,
-                               Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
-                               ST->isVolatile(), ST->isNonTemporal(),
-                               ST->getAlignment());
+      return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
+                               Ptr, ST->getMemoryVT(), ST->getMemOperand());
 
     // Otherwise, see if we can simplify the operation with
     // SimplifyDemandedBits, which only works if the value has a single use.
@@ -8516,10 +9202,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
       TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
                             ST->getMemoryVT())) {
-    return DAG.getTruncStore(Chain, N->getDebugLoc(), Value.getOperand(0),
-                             Ptr, ST->getPointerInfo(), ST->getMemoryVT(),
-                             ST->isVolatile(), ST->isNonTemporal(),
-                             ST->getAlignment());
+    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
+                             Ptr, ST->getMemoryVT(), ST->getMemOperand());
   }
 
   // Only perform this optimization before the types are legal, because we
@@ -8547,7 +9231,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   SDValue InVec = N->getOperand(0);
   SDValue InVal = N->getOperand(1);
   SDValue EltNo = N->getOperand(2);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // If the inserted element is an UNDEF, just use the input vector.
   if (InVal.getOpcode() == ISD::UNDEF)
@@ -8568,7 +9252,9 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
   // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   // vector elements.
   SmallVector<SDValue, 8> Ops;
-  if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+  // Do not combine these two vectors if the output vector will not replace
+  // the input vector.
+  if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
     Ops.append(InVec.getNode()->op_begin(),
                InVec.getNode()->op_end());
   } else if (InVec.getOpcode() == ISD::UNDEF) {
@@ -8608,7 +9294,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
     SDValue InOp = InVec.getOperand(0);
     if (InOp.getValueType() != NVT) {
       assert(InOp.getValueType().isInteger() && NVT.isInteger());
-      return DAG.getSExtOrTrunc(InOp, InVec.getDebugLoc(), NVT);
+      return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT);
     }
     return InOp;
   }
@@ -8641,8 +9327,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
       OrigElt -= NumElem;
     }
 
-    EVT IndexTy = N->getOperand(1).getValueType();
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(), NVT,
+    EVT IndexTy = TLI.getVectorIdxTy();
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT,
                        InVec, DAG.getConstant(OrigElt, IndexTy));
   }
 
@@ -8756,7 +9442,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
       EVT PtrType = NewPtr.getValueType();
       if (TLI.isBigEndian())
         PtrOff = VT.getSizeInBits() / 8 - PtrOff;
-      NewPtr = DAG.getNode(ISD::ADD, N->getDebugLoc(), PtrType, NewPtr,
+      NewPtr = DAG.getNode(ISD::ADD, SDLoc(N), PtrType, NewPtr,
                            DAG.getConstant(PtrOff, PtrType));
     }
 
@@ -8773,20 +9459,21 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
       // extending load instead.
       ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, LVT)
         ? ISD::ZEXTLOAD : ISD::EXTLOAD;
-      Load = DAG.getExtLoad(ExtType, N->getDebugLoc(), NVT, LN0->getChain(),
+      Load = DAG.getExtLoad(ExtType, SDLoc(N), NVT, LN0->getChain(),
                             NewPtr, LN0->getPointerInfo().getWithOffset(PtrOff),
-                            LVT, LN0->isVolatile(), LN0->isNonTemporal(),Align);
+                            LVT, LN0->isVolatile(), LN0->isNonTemporal(),
+                            Align, LN0->getTBAAInfo());
       Chain = Load.getValue(1);
     } else {
-      Load = DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr,
+      Load = DAG.getLoad(LVT, SDLoc(N), LN0->getChain(), NewPtr,
                          LN0->getPointerInfo().getWithOffset(PtrOff),
-                         LN0->isVolatile(), LN0->isNonTemporal(), 
-                         LN0->isInvariant(), Align);
+                         LN0->isVolatile(), LN0->isNonTemporal(),
+                         LN0->isInvariant(), Align, LN0->getTBAAInfo());
       Chain = Load.getValue(1);
       if (NVT.bitsLT(LVT))
-        Load = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), NVT, Load);
+        Load = DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, Load);
       else
-        Load = DAG.getNode(ISD::BITCAST, N->getDebugLoc(), NVT, Load);
+        Load = DAG.getNode(ISD::BITCAST, SDLoc(N), NVT, Load);
     }
     WorkListRemover DeadNodes(*this);
     SDValue From[] = { SDValue(N, 0), SDValue(LN0,1) };
@@ -8816,7 +9503,7 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
     return SDValue();
 
   unsigned NumInScalars = N->getNumOperands();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
 
   // Check to see if this is a BUILD_VECTOR of a bunch of values
@@ -8918,7 +9605,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
   EVT VT = N->getValueType(0);
 
   unsigned NumInScalars = N->getNumOperands();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   EVT SrcVT = MVT::Other;
   unsigned Opcode = ISD::DELETED_NODE;
@@ -8983,7 +9670,7 @@ SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
 
 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   unsigned NumInScalars = N->getNumOperands();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
 
   // A vector built entirely of undefs is undef.
@@ -9119,8 +9806,35 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
     return N->getOperand(0);
 
   // Check if all of the operands are undefs.
+  EVT VT = N->getValueType(0);
   if (ISD::allOperandsUndef(N))
-    return DAG.getUNDEF(N->getValueType(0));
+    return DAG.getUNDEF(VT);
+
+  // Optimize concat_vectors where one of the vectors is undef.
+  if (N->getNumOperands() == 2 &&
+      N->getOperand(1)->getOpcode() == ISD::UNDEF) {
+    SDValue In = N->getOperand(0);
+    assert(In.getValueType().isVector() && "Must concat vectors");
+
+    // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(sclr).
+    if (In->getOpcode() == ISD::BITCAST &&
+        !In->getOperand(0)->getValueType(0).isVector()) {
+      SDValue Scalar = In->getOperand(0);
+      EVT SclTy = Scalar->getValueType(0);
+
+      if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
+        return SDValue();
+
+      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
+                                 VT.getSizeInBits() / SclTy.getSizeInBits());
+      if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
+        return SDValue();
+
+      SDLoc dl = SDLoc(N);
+      SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NVT, Scalar);
+      return DAG.getNode(ISD::BITCAST, dl, VT, Res);
+    }
+  }
 
   // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
   // nodes often generate nop CONCAT_VECTOR nodes.
@@ -9158,7 +9872,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
     // The extract index must be constant.
     if (!CS)
       return SDValue();
-    
+
     // Check that we are reading from the identity index.
     if (CS->getZExtValue() != IdentityIndex)
       return SDValue();
@@ -9166,7 +9880,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
 
   if (SingleSource.getNode())
     return SingleSource;
-  
+
   return SDValue();
 }
 
@@ -9179,7 +9893,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     //    (extract_subvec (concat V1, V2, ...), i)
     // Into:
     //    Vi if possible
-    // Only operand 0 is checked as 'concat' assumes all inputs of the same type.
+    // Only operand 0 is checked as 'concat' assumes all inputs of the same
+    // type.
     if (V->getOperand(0).getValueType() != NVT)
       return SDValue();
     unsigned Idx = dyn_cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
@@ -9194,7 +9909,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
     V = V.getOperand(0);
 
   if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
-    DebugLoc dl = N->getDebugLoc();
+    SDLoc dl(N);
     // Handle only simple case where vector being inserted and vector
     // being extracted are of same type, and are half size of larger vectors.
     EVT BigVT = V->getOperand(0).getValueType();
@@ -9246,22 +9961,36 @@ static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   for (unsigned I = 0; I != NumConcats; ++I) {
     // Make sure we're dealing with a copy.
     unsigned Begin = I * NumElemsPerConcat;
-    if (SVN->getMaskElt(Begin) % NumElemsPerConcat != 0)
-      return SDValue();
+    bool AllUndef = true, NoUndef = true;
+    for (unsigned J = Begin; J != Begin + NumElemsPerConcat; ++J) {
+      if (SVN->getMaskElt(J) >= 0)
+        AllUndef = false;
+      else
+        NoUndef = false;
+    }
 
-    for (unsigned J = 1; J != NumElemsPerConcat; ++J) {
-      if (SVN->getMaskElt(Begin + J - 1) + 1 != SVN->getMaskElt(Begin + J))
+    if (NoUndef) {
+      if (SVN->getMaskElt(Begin) % NumElemsPerConcat != 0)
         return SDValue();
-    }
 
-    unsigned FirstElt = SVN->getMaskElt(Begin) / NumElemsPerConcat;
-    if (FirstElt < N0.getNumOperands())
-      Ops.push_back(N0.getOperand(FirstElt));
-    else
-      Ops.push_back(N1.getOperand(FirstElt - N0.getNumOperands()));
+      for (unsigned J = 1; J != NumElemsPerConcat; ++J)
+        if (SVN->getMaskElt(Begin + J - 1) + 1 != SVN->getMaskElt(Begin + J))
+          return SDValue();
+
+      unsigned FirstElt = SVN->getMaskElt(Begin) / NumElemsPerConcat;
+      if (FirstElt < N0.getNumOperands())
+        Ops.push_back(N0.getOperand(FirstElt));
+      else
+        Ops.push_back(N1.getOperand(FirstElt - N0.getNumOperands()));
+
+    } else if (AllUndef) {
+      Ops.push_back(DAG.getUNDEF(N0.getOperand(0).getValueType()));
+    } else { // Mixed with general masks and undefs, can't do optimization.
+      return SDValue();
+    }
   }
 
-  return DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, Ops.data(),
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops.data(),
                      Ops.size());
 }
 
@@ -9288,7 +10017,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       if (Idx >= (int)NumElts) Idx -= NumElts;
       NewMask.push_back(Idx);
     }
-    return DAG.getVectorShuffle(VT, N->getDebugLoc(), N0, DAG.getUNDEF(VT),
+    return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT),
                                 &NewMask[0]);
   }
 
@@ -9298,14 +10027,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     for (unsigned i = 0; i != NumElts; ++i) {
       int Idx = SVN->getMaskElt(i);
       if (Idx >= 0) {
-        if (Idx < (int)NumElts)
-          Idx += NumElts;
-        else
+        if (Idx >= (int)NumElts)
           Idx -= NumElts;
+        else
+          Idx = -1; // remove reference to lhs
       }
       NewMask.push_back(Idx);
     }
-    return DAG.getVectorShuffle(VT, N->getDebugLoc(), N1, DAG.getUNDEF(VT),
+    return DAG.getVectorShuffle(VT, SDLoc(N), N1, DAG.getUNDEF(VT),
                                 &NewMask[0]);
   }
 
@@ -9322,7 +10051,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       NewMask.push_back(Idx);
     }
     if (Changed)
-      return DAG.getVectorShuffle(VT, N->getDebugLoc(), N0, N1, &NewMask[0]);
+      return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, &NewMask[0]);
   }
 
   // If it is a splat, check if the argument vector is another splat or a
@@ -9419,7 +10148,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
 ///      vector_shuffle V, Zero, <0, 4, 2, 4>
 SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
   if (N->getOpcode() == ISD::AND) {
@@ -9450,7 +10179,7 @@ SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
       EVT EltVT = RVT.getVectorElementType();
       SmallVector<SDValue,8> ZeroOps(RVT.getVectorNumElements(),
                                      DAG.getConstant(0, EltVT));
-      SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+      SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
                                  RVT, &ZeroOps[0], ZeroOps.size());
       LHS = DAG.getNode(ISD::BITCAST, dl, RVT, LHS);
       SDValue Shuf = DAG.getVectorShuffle(RVT, dl, LHS, Zero, &Indices[0]);
@@ -9506,13 +10235,13 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
         // legalization, the types may not match between the two BUILD_VECTORS.
         // Truncate one of the operands to make them match.
         if (RVT.getSizeInBits() > VT.getSizeInBits()) {
-          RHSOp = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, RHSOp);
+          RHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, RHSOp);
         } else {
-          LHSOp = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), RVT, LHSOp);
+          LHSOp = DAG.getNode(ISD::TRUNCATE, SDLoc(N), RVT, LHSOp);
           VT = RVT;
         }
       }
-      SDValue FoldOp = DAG.getNode(N->getOpcode(), LHS.getDebugLoc(), VT,
+      SDValue FoldOp = DAG.getNode(N->getOpcode(), SDLoc(LHS), VT,
                                    LHSOp, RHSOp);
       if (FoldOp.getOpcode() != ISD::UNDEF &&
           FoldOp.getOpcode() != ISD::Constant &&
@@ -9523,7 +10252,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
     }
 
     if (Ops.size() == LHS.getNumOperands())
-      return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+      return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
                          LHS.getValueType(), &Ops[0], Ops.size());
   }
 
@@ -9548,7 +10277,7 @@ SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) {
         Op.getOpcode() != ISD::ConstantFP)
       break;
     EVT EltVT = Op.getValueType();
-    SDValue FoldOp = DAG.getNode(N->getOpcode(), N0.getDebugLoc(), EltVT, Op);
+    SDValue FoldOp = DAG.getNode(N->getOpcode(), SDLoc(N0), EltVT, Op);
     if (FoldOp.getOpcode() != ISD::UNDEF &&
         FoldOp.getOpcode() != ISD::ConstantFP)
       break;
@@ -9559,11 +10288,11 @@ SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) {
   if (Ops.size() != N0.getNumOperands())
     return SDValue();
 
-  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
                      N0.getValueType(), &Ops[0], Ops.size());
 }
 
-SDValue DAGCombiner::SimplifySelect(DebugLoc DL, SDValue N0,
+SDValue DAGCombiner::SimplifySelect(SDLoc DL, SDValue N0,
                                     SDValue N1, SDValue N2){
   assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");
 
@@ -9577,13 +10306,13 @@ SDValue DAGCombiner::SimplifySelect(DebugLoc DL, SDValue N0,
     // Check to see if we got a select_cc back (to turn into setcc/select).
     // Otherwise, just return whatever node we got back, like fabs.
     if (SCC.getOpcode() == ISD::SELECT_CC) {
-      SDValue SETCC = DAG.getNode(ISD::SETCC, N0.getDebugLoc(),
+      SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
                                   N0.getValueType(),
                                   SCC.getOperand(0), SCC.getOperand(1),
                                   SCC.getOperand(4));
       AddToWorkList(SETCC.getNode());
-      return DAG.getNode(ISD::SELECT, SCC.getDebugLoc(), SCC.getValueType(),
-                         SCC.getOperand(2), SCC.getOperand(3), SETCC);
+      return DAG.getSelect(SDLoc(SCC), SCC.getValueType(),
+                           SCC.getOperand(2), SCC.getOperand(3), SETCC);
     }
 
     return SCC;
@@ -9652,10 +10381,10 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
       if (LLD->isPredecessorOf(RLD) ||
           RLD->isPredecessorOf(LLD))
         return false;
-      Addr = DAG.getNode(ISD::SELECT, TheSelect->getDebugLoc(),
-                         LLD->getBasePtr().getValueType(),
-                         TheSelect->getOperand(0), LLD->getBasePtr(),
-                         RLD->getBasePtr());
+      Addr = DAG.getSelect(SDLoc(TheSelect),
+                           LLD->getBasePtr().getValueType(),
+                           TheSelect->getOperand(0), LLD->getBasePtr(),
+                           RLD->getBasePtr());
     } else {  // Otherwise SELECT_CC
       SDNode *CondLHS = TheSelect->getOperand(0).getNode();
       SDNode *CondRHS = TheSelect->getOperand(1).getNode();
@@ -9666,7 +10395,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
            (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS))))
         return false;
 
-      Addr = DAG.getNode(ISD::SELECT_CC, TheSelect->getDebugLoc(),
+      Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
                          LLD->getBasePtr().getValueType(),
                          TheSelect->getOperand(0),
                          TheSelect->getOperand(1),
@@ -9677,17 +10406,17 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
     SDValue Load;
     if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
       Load = DAG.getLoad(TheSelect->getValueType(0),
-                         TheSelect->getDebugLoc(),
-                         // FIXME: Discards pointer info.
+                         SDLoc(TheSelect),
+                         // FIXME: Discards pointer and TBAA info.
                          LLD->getChain(), Addr, MachinePointerInfo(),
                          LLD->isVolatile(), LLD->isNonTemporal(),
                          LLD->isInvariant(), LLD->getAlignment());
     } else {
       Load = DAG.getExtLoad(LLD->getExtensionType() == ISD::EXTLOAD ?
                             RLD->getExtensionType() : LLD->getExtensionType(),
-                            TheSelect->getDebugLoc(),
+                            SDLoc(TheSelect),
                             TheSelect->getValueType(0),
-                            // FIXME: Discards pointer info.
+                            // FIXME: Discards pointer and TBAA info.
                             LLD->getChain(), Addr, MachinePointerInfo(),
                             LLD->getMemoryVT(), LLD->isVolatile(),
                             LLD->isNonTemporal(), LLD->getAlignment());
@@ -9708,7 +10437,7 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
 
 /// SimplifySelectCC - Simplify an expression of the form (N0 cond N1) ? N2 : N3
 /// where 'cond' is the comparison specified by CC.
-SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
+SDValue DAGCombiner::SimplifySelectCC(SDLoc DL, SDValue N0, SDValue N1,
                                       SDValue N2, SDValue N3,
                                       ISD::CondCode CC, bool NotExtCompare) {
   // (x ? y : y) -> y.
@@ -9720,7 +10449,7 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
   ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
 
   // Determine if the condition we're dealing with is constant
-  SDValue SCC = SimplifySetCC(TLI.getSetCCResultType(N0.getValueType()),
+  SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
                               N0, N1, CC, DL, false);
   if (SCC.getNode()) AddToWorkList(SCC.getNode());
   ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode());
@@ -9786,13 +10515,13 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
         SDValue One = DAG.getIntPtrConstant(EltSize);
 
         SDValue Cond = DAG.getSetCC(DL,
-                                    TLI.getSetCCResultType(N0.getValueType()),
+                                    getSetCCResultType(N0.getValueType()),
                                     N0, N1, CC);
         AddToWorkList(Cond.getNode());
-        SDValue CstOffset = DAG.getNode(ISD::SELECT, DL, Zero.getValueType(),
-                                        Cond, One, Zero);
+        SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
+                                          Cond, One, Zero);
         AddToWorkList(CstOffset.getNode());
-        CPIdx = DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(), CPIdx,
+        CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
                             CstOffset);
         AddToWorkList(CPIdx.getNode());
         return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
@@ -9817,7 +10546,7 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
         ShCtV = XType.getSizeInBits()-ShCtV-1;
         SDValue ShCt = DAG.getConstant(ShCtV,
                                        getShiftAmountTy(N0.getValueType()));
-        SDValue Shift = DAG.getNode(ISD::SRL, N0.getDebugLoc(),
+        SDValue Shift = DAG.getNode(ISD::SRL, SDLoc(N0),
                                     XType, N0, ShCt);
         AddToWorkList(Shift.getNode());
 
@@ -9829,7 +10558,7 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
         return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
       }
 
-      SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(),
+      SDValue Shift = DAG.getNode(ISD::SRA, SDLoc(N0),
                                   XType, N0,
                                   DAG.getConstant(XType.getSizeInBits()-1,
                                          getShiftAmountTy(N0.getValueType())));
@@ -9862,14 +10591,14 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
       SDValue ShlAmt =
         DAG.getConstant(AndMask.countLeadingZeros(),
                         getShiftAmountTy(AndLHS.getValueType()));
-      SDValue Shl = DAG.getNode(ISD::SHL, N0.getDebugLoc(), VT, AndLHS, ShlAmt);
+      SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
 
       // Now arithmetic right shift it all the way over, so the result is either
       // all-ones, or zero.
       SDValue ShrAmt =
         DAG.getConstant(AndMask.getBitWidth()-1,
                         getShiftAmountTy(Shl.getValueType()));
-      SDValue Shr = DAG.getNode(ISD::SRA, N0.getDebugLoc(), VT, Shl, ShrAmt);
+      SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
 
       return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
     }
@@ -9889,21 +10618,21 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
     // NOTE: Don't create a SETCC if it's not legal on this target.
     if (!LegalOperations ||
         TLI.isOperationLegal(ISD::SETCC,
-          LegalTypes ? TLI.getSetCCResultType(N0.getValueType()) : MVT::i1)) {
+          LegalTypes ? getSetCCResultType(N0.getValueType()) : MVT::i1)) {
       SDValue Temp, SCC;
       // cast from setcc result type to select result type
       if (LegalTypes) {
-        SCC  = DAG.getSetCC(DL, TLI.getSetCCResultType(N0.getValueType()),
+        SCC  = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()),
                             N0, N1, CC);
         if (N2.getValueType().bitsLT(SCC.getValueType()))
-          Temp = DAG.getZeroExtendInReg(SCC, N2.getDebugLoc(),
+          Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2),
                                         N2.getValueType());
         else
-          Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+          Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
                              N2.getValueType(), SCC);
       } else {
-        SCC  = DAG.getSetCC(N0.getDebugLoc(), MVT::i1, N0, N1, CC);
-        Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+        SCC  = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
+        Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
                            N2.getValueType(), SCC);
       }
 
@@ -9914,9 +10643,10 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
         return Temp;
 
       // shl setcc result by log2 n2c
-      return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
-                         DAG.getConstant(N2C->getAPIntValue().logBase2(),
-                                         getShiftAmountTy(Temp.getValueType())));
+      return DAG.getNode(
+          ISD::SHL, DL, N2.getValueType(), Temp,
+          DAG.getConstant(N2C->getAPIntValue().logBase2(),
+                          getShiftAmountTy(Temp.getValueType())));
     }
   }
 
@@ -9926,8 +10656,8 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
   if (0 && N3C && N3C->isNullValue() && N2C && (N2C->getAPIntValue() == 1ULL)) {
     EVT XType = N0.getValueType();
     if (!LegalOperations ||
-        TLI.isOperationLegal(ISD::SETCC, TLI.getSetCCResultType(XType))) {
-      SDValue Res = DAG.getSetCC(DL, TLI.getSetCCResultType(XType), N0, N1, CC);
+        TLI.isOperationLegal(ISD::SETCC, getSetCCResultType(XType))) {
+      SDValue Res = DAG.getSetCC(DL, getSetCCResultType(XType), N0, N1, CC);
       if (Res.getValueType() != VT)
         Res = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Res);
       return Res;
@@ -9937,16 +10667,16 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
     if (N1C && N1C->isNullValue() && CC == ISD::SETEQ &&
         (!LegalOperations ||
          TLI.isOperationLegal(ISD::CTLZ, XType))) {
-      SDValue Ctlz = DAG.getNode(ISD::CTLZ, N0.getDebugLoc(), XType, N0);
+      SDValue Ctlz = DAG.getNode(ISD::CTLZ, SDLoc(N0), XType, N0);
       return DAG.getNode(ISD::SRL, DL, XType, Ctlz,
                          DAG.getConstant(Log2_32(XType.getSizeInBits()),
                                        getShiftAmountTy(Ctlz.getValueType())));
     }
     // fold (setgt X, 0) -> (srl (and (-X, ~X), size(X)-1))
     if (N1C && N1C->isNullValue() && CC == ISD::SETGT) {
-      SDValue NegN0 = DAG.getNode(ISD::SUB, N0.getDebugLoc(),
+      SDValue NegN0 = DAG.getNode(ISD::SUB, SDLoc(N0),
                                   XType, DAG.getConstant(0, XType), N0);
-      SDValue NotN0 = DAG.getNOT(N0.getDebugLoc(), N0, XType);
+      SDValue NotN0 = DAG.getNOT(SDLoc(N0), N0, XType);
       return DAG.getNode(ISD::SRL, DL, XType,
                          DAG.getNode(ISD::AND, DL, XType, NegN0, NotN0),
                          DAG.getConstant(XType.getSizeInBits()-1,
@@ -9954,7 +10684,7 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
     }
     // fold (setgt X, -1) -> (xor (srl (X, size(X)-1), 1))
     if (N1C && N1C->isAllOnesValue() && CC == ISD::SETGT) {
-      SDValue Sign = DAG.getNode(ISD::SRL, N0.getDebugLoc(), XType, N0,
+      SDValue Sign = DAG.getNode(ISD::SRL, SDLoc(N0), XType, N0,
                                  DAG.getConstant(XType.getSizeInBits()-1,
                                          getShiftAmountTy(N0.getValueType())));
       return DAG.getNode(ISD::XOR, DL, XType, Sign, DAG.getConstant(1, XType));
@@ -9980,11 +10710,11 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
 
     EVT XType = N0.getValueType();
     if (SubC && SubC->isNullValue() && XType.isInteger()) {
-      SDValue Shift = DAG.getNode(ISD::SRA, N0.getDebugLoc(), XType,
+      SDValue Shift = DAG.getNode(ISD::SRA, SDLoc(N0), XType,
                                   N0,
                                   DAG.getConstant(XType.getSizeInBits()-1,
                                          getShiftAmountTy(N0.getValueType())));
-      SDValue Add = DAG.getNode(ISD::ADD, N0.getDebugLoc(),
+      SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0),
                                 XType, N0, Shift);
       AddToWorkList(Shift.getNode());
       AddToWorkList(Add.getNode());
@@ -9998,7 +10728,7 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
 /// SimplifySetCC - This is a stub for TargetLowering::SimplifySetCC.
 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0,
                                    SDValue N1, ISD::CondCode Cond,
-                                   DebugLoc DL, bool foldBooleans) {
+                                   SDLoc DL, bool foldBooleans) {
   TargetLowering::DAGCombinerInfo
     DagCombineInfo(DAG, Level, false, this);
   return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
@@ -10072,17 +10802,20 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
 
 /// isAlias - Return true if there is any possibility that the two addresses
 /// overlap.
-bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
+bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1, bool IsVolatile1,
                           const Value *SrcValue1, int SrcValueOffset1,
                           unsigned SrcValueAlign1,
                           const MDNode *TBAAInfo1,
-                          SDValue Ptr2, int64_t Size2,
+                          SDValue Ptr2, int64_t Size2, bool IsVolatile2,
                           const Value *SrcValue2, int SrcValueOffset2,
                           unsigned SrcValueAlign2,
                           const MDNode *TBAAInfo2) const {
   // If they are the same then they must be aliases.
   if (Ptr1 == Ptr2) return true;
 
+  // If they are both volatile then they cannot be reordered.
+  if (IsVolatile1 && IsVolatile2) return true;
+
   // Gather base node and offset information.
   SDValue Base1, Base2;
   int64_t Offset1, Offset2;
@@ -10127,7 +10860,9 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
       return false;
   }
 
-  if (CombinerGlobalAA) {
+  bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0 ? CombinerGlobalAA :
+    TLI.getTargetMachine().getSubtarget<TargetSubtargetInfo>().useAA();
+  if (UseAA && SrcValue1 && SrcValue2) {
     // Use alias analysis information.
     int64_t MinOffset = std::min(SrcValueOffset1, SrcValueOffset2);
     int64_t Overlap1 = Size1 + SrcValueOffset1 - MinOffset;
@@ -10146,24 +10881,25 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
 bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) {
   SDValue Ptr0, Ptr1;
   int64_t Size0, Size1;
+  bool IsVolatile0, IsVolatile1;
   const Value *SrcValue0, *SrcValue1;
   int SrcValueOffset0, SrcValueOffset1;
   unsigned SrcValueAlign0, SrcValueAlign1;
   const MDNode *SrcTBAAInfo0, *SrcTBAAInfo1;
-  FindAliasInfo(Op0, Ptr0, Size0, SrcValue0, SrcValueOffset0,
+  FindAliasInfo(Op0, Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0,
                 SrcValueAlign0, SrcTBAAInfo0);
-  FindAliasInfo(Op1, Ptr1, Size1, SrcValue1, SrcValueOffset1,
+  FindAliasInfo(Op1, Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1,
                 SrcValueAlign1, SrcTBAAInfo1);
-  return isAlias(Ptr0, Size0, SrcValue0, SrcValueOffset0,
+  return isAlias(Ptr0, Size0, IsVolatile0, SrcValue0, SrcValueOffset0,
                  SrcValueAlign0, SrcTBAAInfo0,
-                 Ptr1, Size1, SrcValue1, SrcValueOffset1,
+                 Ptr1, Size1, IsVolatile1, SrcValue1, SrcValueOffset1,
                  SrcValueAlign1, SrcTBAAInfo1);
 }
 
 /// FindAliasInfo - Extracts the relevant alias information from the memory
-/// node.  Returns true if the operand was a load.
+/// node.  Returns true if the operand was a nonvolatile load.
 bool DAGCombiner::FindAliasInfo(SDNode *N,
-                                SDValue &Ptr, int64_t &Size,
+                                SDValue &Ptr, int64_t &Size, bool &IsVolatile,
                                 const Value *&SrcValue,
                                 int &SrcValueOffset,
                                 unsigned &SrcValueAlign,
@@ -10172,29 +10908,31 @@ bool DAGCombiner::FindAliasInfo(SDNode *N,
 
   Ptr = LS->getBasePtr();
   Size = LS->getMemoryVT().getSizeInBits() >> 3;
+  IsVolatile = LS->isVolatile();
   SrcValue = LS->getSrcValue();
   SrcValueOffset = LS->getSrcValueOffset();
   SrcValueAlign = LS->getOriginalAlignment();
   TBAAInfo = LS->getTBAAInfo();
-  return isa<LoadSDNode>(LS);
+  return isa<LoadSDNode>(LS) && !IsVolatile;
 }
 
 /// GatherAllAliases - Walk up chain skipping non-aliasing memory nodes,
 /// looking for aliasing nodes and adding them to the Aliases vector.
 void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
-                                   SmallVector<SDValue, 8> &Aliases) {
+                                   SmallVectorImpl<SDValue> &Aliases) {
   SmallVector<SDValue, 8> Chains;     // List of chains to visit.
   SmallPtrSet<SDNode *, 16> Visited;  // Visited node set.
 
   // Get alias information for node.
   SDValue Ptr;
   int64_t Size;
+  bool IsVolatile;
   const Value *SrcValue;
   int SrcValueOffset;
   unsigned SrcValueAlign;
   const MDNode *SrcTBAAInfo;
-  bool IsLoad = FindAliasInfo(N, Ptr, Size, SrcValue, SrcValueOffset,
-                              SrcValueAlign, SrcTBAAInfo);
+  bool IsLoad = FindAliasInfo(N, Ptr, Size, IsVolatile, SrcValue,
+                              SrcValueOffset, SrcValueAlign, SrcTBAAInfo);
 
   // Starting off.
   Chains.push_back(OriginalChain);
@@ -10235,20 +10973,21 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
       // Get alias information for Chain.
       SDValue OpPtr;
       int64_t OpSize;
+      bool OpIsVolatile;
       const Value *OpSrcValue;
       int OpSrcValueOffset;
       unsigned OpSrcValueAlign;
       const MDNode *OpSrcTBAAInfo;
       bool IsOpLoad = FindAliasInfo(Chain.getNode(), OpPtr, OpSize,
-                                    OpSrcValue, OpSrcValueOffset,
+                                    OpIsVolatile, OpSrcValue, OpSrcValueOffset,
                                     OpSrcValueAlign,
                                     OpSrcTBAAInfo);
 
       // If chain is alias then stop here.
       if (!(IsLoad && IsOpLoad) &&
-          isAlias(Ptr, Size, SrcValue, SrcValueOffset, SrcValueAlign,
-                  SrcTBAAInfo,
-                  OpPtr, OpSize, OpSrcValue, OpSrcValueOffset,
+          isAlias(Ptr, Size, IsVolatile, SrcValue, SrcValueOffset,
+                  SrcValueAlign, SrcTBAAInfo,
+                  OpPtr, OpSize, OpIsVolatile, OpSrcValue, OpSrcValueOffset,
                   OpSrcValueAlign, OpSrcTBAAInfo)) {
         Aliases.push_back(Chain);
       } else {
@@ -10298,7 +11037,7 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
     return Aliases[0];
 
   // Construct a custom tailored token factor.
-  return DAG.getNode(ISD::TokenFactor, N->getDebugLoc(), MVT::Other,
+  return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other,
                      &Aliases[0], Aliases.size());
 }
 
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 288499a..a6f7461 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -41,6 +41,7 @@
 
 #define DEBUG_TYPE "isel"
 #include "llvm/CodeGen/FastISel.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/CodeGen/Analysis.h"
@@ -75,15 +76,12 @@ STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
 void FastISel::startNewBlock() {
   LocalValueMap.clear();
 
+  // Instructions are appended to FuncInfo.MBB. If the basic block already
+  // contains labels or copies, use the last instruction as the last local
+  // value.
   EmitStartPt = 0;
-
-  // Advance the emit start point past any EH_LABEL instructions.
-  MachineBasicBlock::iterator
-    I = FuncInfo.MBB->begin(), E = FuncInfo.MBB->end();
-  while (I != E && I->getOpcode() == TargetOpcode::EH_LABEL) {
-    EmitStartPt = I;
-    ++I;
-  }
+  if (!FuncInfo.MBB->empty())
+    EmitStartPt = &FuncInfo.MBB->back();
   LastLocalValue = EmitStartPt;
 }
 
@@ -92,18 +90,16 @@ bool FastISel::LowerArguments() {
     // Fallback to SDISel argument lowering code to deal with sret pointer
     // parameter.
     return false;
-  
+
   if (!FastLowerArguments())
     return false;
 
-  // Enter non-dead arguments into ValueMap for uses in non-entry BBs.
+  // Enter arguments into ValueMap for uses in non-entry BBs.
   for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(),
          E = FuncInfo.Fn->arg_end(); I != E; ++I) {
-    if (!I->use_empty()) {
-      DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I);
-      assert(VI != LocalValueMap.end() && "Missed an argument?");
-      FuncInfo.ValueMap[I] = VI->second;
-    }
+    DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(I);
+    assert(VI != LocalValueMap.end() && "Missed an argument?");
+    FuncInfo.ValueMap[I] = VI->second;
   }
   return true;
 }
@@ -601,7 +597,10 @@ bool FastISel::SelectCall(const User *I) {
 
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call);
-    if (!DIVariable(DI->getVariable()).Verify() ||
+    DIVariable DIVar(DI->getVariable());
+    assert((!DIVar || DIVar.isVariable()) &&
+      "Variable in DbgDeclareInst should be either null or a DIVariable.");
+    if (!DIVar ||
         !FuncInfo.MF->getMMI().hasDebugInfo()) {
       DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
       return true;
@@ -613,16 +612,16 @@ bool FastISel::SelectCall(const User *I) {
       return true;
     }
 
-    unsigned Reg = 0;
     unsigned Offset = 0;
-    if (const Argument *Arg = dyn_cast<Argument>(Address)) {
+    Optional<MachineOperand> Op;
+    if (const Argument *Arg = dyn_cast<Argument>(Address))
       // Some arguments' frame index is recorded during argument lowering.
       Offset = FuncInfo.getArgumentFrameIndex(Arg);
-      if (Offset)
-        Reg = TRI.getFrameRegister(*FuncInfo.MF);
-    }
-    if (!Reg)
-      Reg = lookUpRegForValue(Address);
+    if (Offset)
+        Op = MachineOperand::CreateFI(Offset);
+    if (!Op)
+      if (unsigned Reg = lookUpRegForValue(Address))
+        Op = MachineOperand::CreateReg(Reg, false);
 
     // If we have a VLA that has a "use" in a metadata node that's then used
     // here but it has no other uses, then we have a problem. E.g.,
@@ -635,20 +634,29 @@ bool FastISel::SelectCall(const User *I) {
     // If we assign 'a' a vreg and fast isel later on has to use the selection
     // DAG isel, it will want to copy the value to the vreg. However, there are
     // no uses, which goes counter to what selection DAG isel expects.
-    if (!Reg && !Address->use_empty() && isa<Instruction>(Address) &&
+    if (!Op && !Address->use_empty() && isa<Instruction>(Address) &&
         (!isa<AllocaInst>(Address) ||
          !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address))))
-      Reg = FuncInfo.InitializeRegForValue(Address);
-
-    if (Reg)
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-              TII.get(TargetOpcode::DBG_VALUE))
-        .addReg(Reg, RegState::Debug).addImm(Offset)
-        .addMetadata(DI->getVariable());
-    else
+      Op = MachineOperand::CreateReg(FuncInfo.InitializeRegForValue(Address),
+                                     false);
+
+    if (Op) {
+      if (Op->isReg()) {
+        Op->setIsDebug(true);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0,
+                DI->getVariable());
+      } else
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(TargetOpcode::DBG_VALUE))
+            .addOperand(*Op)
+            .addImm(0)
+            .addMetadata(DI->getVariable());
+    } else {
       // We can't yet handle anything else here because it would require
       // generating code, thus altering codegen because of debug info.
-      DEBUG(dbgs() << "Dropping debug info for " << DI);
+      DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+    }
     return true;
   }
   case Intrinsic::dbg_value: {
@@ -676,13 +684,14 @@ bool FastISel::SelectCall(const User *I) {
         .addFPImm(CF).addImm(DI->getOffset())
         .addMetadata(DI->getVariable());
     } else if (unsigned Reg = lookUpRegForValue(V)) {
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
-        .addReg(Reg, RegState::Debug).addImm(DI->getOffset())
-        .addMetadata(DI->getVariable());
+      // FIXME: This does not handle register-indirect values at offset 0.
+      bool IsIndirect = DI->getOffset() != 0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, IsIndirect,
+              Reg, DI->getOffset(), DI->getVariable());
     } else {
       // We can't yet handle anything else here because it would require
       // generating code, thus altering codegen because of debug info.
-      DEBUG(dbgs() << "Dropping debug info for " << DI);
+      DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
     }
     return true;
   }
@@ -1562,4 +1571,19 @@ bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) {
   return tryToFoldLoadIntoMI(User, RI.getOperandNo(), LI);
 }
 
+bool FastISel::canFoldAddIntoGEP(const User *GEP, const Value *Add) {
+  // Must be an add.
+  if (!isa<AddOperator>(Add))
+    return false;
+  // Type size needs to match.
+  if (TD.getTypeSizeInBits(GEP->getType()) !=
+      TD.getTypeSizeInBits(Add->getType()))
+    return false;
+  // Must be in the same basic block.
+  if (isa<Instruction>(Add) &&
+      FuncInfo.MBBMap[cast<Instruction>(Add)->getParent()] != FuncInfo.MBB)
+    return false;
+  // Must have a constant operand.
+  return isa<ConstantInt>(cast<AddOperator>(Add)->getOperand(1));
+}
 
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index b46edad..4309dc1 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -55,21 +55,19 @@ static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
   return false;
 }
 
-FunctionLoweringInfo::FunctionLoweringInfo(const TargetLowering &tli)
-  : TLI(tli) {
-}
-
 void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
+  const TargetLowering *TLI = TM.getTargetLowering();
+
   Fn = &fn;
   MF = &mf;
   RegInfo = &MF->getRegInfo();
 
   // Check whether the function can return without sret-demotion.
   SmallVector<ISD::OutputArg, 4> Outs;
-  GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, TLI);
-  CanLowerReturn = TLI.CanLowerReturn(Fn->getCallingConv(), *MF,
-                                      Fn->isVarArg(),
-                                      Outs, Fn->getContext());
+  GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI);
+  CanLowerReturn = TLI->CanLowerReturn(Fn->getCallingConv(), *MF,
+                                       Fn->isVarArg(),
+                                       Outs, Fn->getContext());
 
   // Initialize the mapping of values to registers.  This is only set up for
   // instruction values that are used outside of the block that defines
@@ -79,9 +77,9 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
     if (const AllocaInst *AI = dyn_cast<AllocaInst>(I))
       if (const ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) {
         Type *Ty = AI->getAllocatedType();
-        uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+        uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
         unsigned Align =
-          std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
+          std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty),
                    AI->getAlignment());
 
         TySize *= CUI->getZExtValue();   // Get total allocated size.
@@ -114,8 +112,11 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
       // in a predictable order.
       if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(I)) {
         MachineModuleInfo &MMI = MF->getMMI();
+        DIVariable DIVar(DI->getVariable());
+        assert((!DIVar || DIVar.isVariable()) &&
+          "Variable in DbgDeclareInst should be either null or a DIVariable.");
         if (MMI.hasDebugInfo() &&
-            DIVariable(DI->getVariable()).Verify() &&
+            DIVar &&
             !DI->getDebugLoc().isUnknown()) {
           // Don't handle byval struct arguments or VLAs, for example.
           // Non-byval arguments are handled here (they refer to the stack
@@ -167,10 +168,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
       assert(PHIReg && "PHI node does not have an assigned virtual register!");
 
       SmallVector<EVT, 4> ValueVTs;
-      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+      ComputeValueVTs(*TLI, PN->getType(), ValueVTs);
       for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
         EVT VT = ValueVTs[vti];
-        unsigned NumRegisters = TLI.getNumRegisters(Fn->getContext(), VT);
+        unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
         const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
         for (unsigned i = 0; i != NumRegisters; ++i)
           BuildMI(MBB, DL, TII->get(TargetOpcode::PHI), PHIReg + i);
@@ -208,7 +209,8 @@ void FunctionLoweringInfo::clear() {
 
 /// CreateReg - Allocate a single virtual register for the given type.
 unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
-  return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT));
+  return RegInfo->
+    createVirtualRegister(TM.getTargetLowering()->getRegClassFor(VT));
 }
 
 /// CreateRegs - Allocate the appropriate number of virtual registers of
@@ -219,15 +221,17 @@ unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
 /// will assign registers for each member or element.
 ///
 unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
+  const TargetLowering *TLI = TM.getTargetLowering();
+
   SmallVector<EVT, 4> ValueVTs;
-  ComputeValueVTs(TLI, Ty, ValueVTs);
+  ComputeValueVTs(*TLI, Ty, ValueVTs);
 
   unsigned FirstReg = 0;
   for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
     EVT ValueVT = ValueVTs[Value];
-    MVT RegisterVT = TLI.getRegisterType(Ty->getContext(), ValueVT);
+    MVT RegisterVT = TLI->getRegisterType(Ty->getContext(), ValueVT);
 
-    unsigned NumRegs = TLI.getNumRegisters(Ty->getContext(), ValueVT);
+    unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT);
     for (unsigned i = 0; i != NumRegs; ++i) {
       unsigned R = CreateReg(RegisterVT);
       if (!FirstReg) FirstReg = R;
@@ -266,15 +270,17 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
   if (!Ty->isIntegerTy() || Ty->isVectorTy())
     return;
 
+  const TargetLowering *TLI = TM.getTargetLowering();
+
   SmallVector<EVT, 1> ValueVTs;
-  ComputeValueVTs(TLI, Ty, ValueVTs);
+  ComputeValueVTs(*TLI, Ty, ValueVTs);
   assert(ValueVTs.size() == 1 &&
          "PHIs with non-vector integer types should have a single VT.");
   EVT IntVT = ValueVTs[0];
 
-  if (TLI.getNumRegisters(PN->getContext(), IntVT) != 1)
+  if (TLI->getNumRegisters(PN->getContext(), IntVT) != 1)
     return;
-  IntVT = TLI.getTypeToTransformTo(PN->getContext(), IntVT);
+  IntVT = TLI->getTypeToTransformTo(PN->getContext(), IntVT);
   unsigned BitWidth = IntVT.getSizeInBits();
 
   unsigned DestReg = ValueMap[PN];
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 3b1abd7..3a8fb85 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -211,6 +212,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
   assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
          "IMPLICIT_DEF should have been handled as a special case elsewhere!");
 
+  unsigned NumResults = CountResults(Node);
   for (unsigned i = 0; i < II.getNumDefs(); ++i) {
     // If the specific node value is only used by a CopyToReg and the dest reg
     // is a vreg in the same register class, use the CopyToReg'd destination
@@ -218,6 +220,10 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
     unsigned VRBase = 0;
     const TargetRegisterClass *RC =
       TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
+    // If the register class is unknown for the given definition, then try to
+    // infer one from the value type.
+    if (!RC && i < NumResults)
+      RC = TLI->getRegClassFor(Node->getSimpleValueType(i));
     if (II.OpInfo[i].isOptionalDef()) {
       // Optional def must be a physical register.
       unsigned NumResults = CountResults(Node);
@@ -639,8 +645,8 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
   if (SD->getKind() == SDDbgValue::FRAMEIX) {
     // Stack address; this needs to be lowered in target-dependent fashion.
     // EmitTargetCodeForFrameDebugValue is responsible for allocation.
-    unsigned FrameIx = SD->getFrameIx();
-    return TII->emitFrameIndexDebugValue(*MF, FrameIx, Offset, MDPtr, DL);
+    return BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE))
+        .addFrameIndex(SD->getFrameIx()).addImm(Offset).addMetadata(MDPtr);
   }
   // Otherwise, we're going to create an instruction here.
   const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
@@ -678,7 +684,13 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
     MIB.addReg(0U);
   }
 
-  MIB.addImm(Offset).addMetadata(MDPtr);
+  if (Offset != 0) // Indirect addressing.
+    MIB.addImm(Offset);
+  else
+    MIB.addReg(0U, RegState::Debug);
+
+  MIB.addMetadata(MDPtr);
+
   return &*MIB;
 }
 
@@ -716,10 +728,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 
   const MCInstrDesc &II = TII->get(Opc);
   unsigned NumResults = CountResults(Node);
+  unsigned NumDefs = II.getNumDefs();
+  const uint16_t *ScratchRegs = NULL;
+
+  // Handle PATCHPOINT specially and then use the generic code.
+  if (Opc == TargetOpcode::PATCHPOINT) {
+    unsigned CC = Node->getConstantOperandVal(PatchPointOpers::CCPos);
+    NumDefs = NumResults;
+    ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
+  }
+
   unsigned NumImpUses = 0;
   unsigned NodeOperands =
-    countOperands(Node, II.getNumOperands() - II.getNumDefs(), NumImpUses);
-  bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
+    countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses);
+  bool HasPhysRegOuts = NumResults > NumDefs && II.getImplicitDefs()!=0;
 #ifndef NDEBUG
   unsigned NumMIOperands = NodeOperands + NumResults;
   if (II.isVariadic())
@@ -742,14 +764,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 
   // Emit all of the actual operands of this instruction, adding them to the
   // instruction as appropriate.
-  bool HasOptPRefs = II.getNumDefs() > NumResults;
+  bool HasOptPRefs = NumDefs > NumResults;
   assert((!HasOptPRefs || !HasPhysRegOuts) &&
          "Unable to cope with optional defs and phys regs defs!");
-  unsigned NumSkip = HasOptPRefs ? II.getNumDefs() - NumResults : 0;
+  unsigned NumSkip = HasOptPRefs ? NumDefs - NumResults : 0;
   for (unsigned i = NumSkip; i != NodeOperands; ++i)
-    AddOperand(MIB, Node->getOperand(i), i-NumSkip+II.getNumDefs(), &II,
+    AddOperand(MIB, Node->getOperand(i), i-NumSkip+NumDefs, &II,
                VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);
 
+  // Add scratch registers as implicit def and early clobber
+  if (ScratchRegs)
+    for (unsigned i = 0; ScratchRegs[i]; ++i)
+      MIB.addReg(ScratchRegs[i], RegState::ImplicitDefine |
+                                 RegState::EarlyClobber);
+
   // Transfer all of the memory reference descriptions of this instruction.
   MIB.setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
                  cast<MachineSDNode>(Node)->memoperands_end());
@@ -778,8 +806,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
 
   // Additional results must be physical register defs.
   if (HasPhysRegOuts) {
-    for (unsigned i = II.getNumDefs(); i < NumResults; ++i) {
-      unsigned Reg = II.getImplicitDefs()[i - II.getNumDefs()];
+    for (unsigned i = NumDefs; i < NumResults; ++i) {
+      unsigned Reg = II.getImplicitDefs()[i - NumDefs];
       if (!Node->hasAnyUseOfValue(i))
         continue;
       // This implicitly defined physreg has a use.
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h
index a9c2203..920dda8 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -1,4 +1,4 @@
-//===---- InstrEmitter.h - Emit MachineInstrs for the SelectionDAG class ---==//
+//===- InstrEmitter.h - Emit MachineInstrs for the SelectionDAG -*- C++ -*--==//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 2a1d8c2..9061ae9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -58,6 +58,10 @@ class SelectionDAGLegalize : public SelectionDAG::DAGUpdateListener {
   /// LegalizedNodes - The set of nodes which have already been legalized.
   SmallPtrSet<SDNode *, 16> LegalizedNodes;
 
+  EVT getSetCCResultType(EVT VT) const {
+    return TLI.getSetCCResultType(*DAG.getContext(), VT);
+  }
+
   // Libcall insertion helpers.
 
 public:
@@ -79,24 +83,24 @@ private:
   /// is necessary to spill the vector being inserted into to memory, perform
   /// the insert there, and then read the result back.
   SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val,
-                                         SDValue Idx, DebugLoc dl);
+                                         SDValue Idx, SDLoc dl);
   SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
-                                  SDValue Idx, DebugLoc dl);
+                                  SDValue Idx, SDLoc dl);
 
   /// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
   /// performs the same shuffe in terms of order or result bytes, but on a type
   /// whose vector element type is narrower than the original shuffle type.
   /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
-  SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, DebugLoc dl,
+  SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, SDLoc dl,
                                      SDValue N1, SDValue N2,
                                      ArrayRef<int> Mask) const;
 
-  void LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
-                             DebugLoc dl);
+  bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
+                             bool &NeedInvert, SDLoc dl);
 
   SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
   SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
-                        unsigned NumOps, bool isSigned, DebugLoc dl);
+                        unsigned NumOps, bool isSigned, SDLoc dl);
 
   std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
                                                  SDNode *Node, bool isSigned);
@@ -113,21 +117,21 @@ private:
   void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
   void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
 
-  SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, DebugLoc dl);
+  SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT, SDLoc dl);
   SDValue ExpandBUILD_VECTOR(SDNode *Node);
   SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
   void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
                                 SmallVectorImpl<SDValue> &Results);
   SDValue ExpandFCOPYSIGN(SDNode *Node);
   SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
-                               DebugLoc dl);
+                               SDLoc dl);
   SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
-                                DebugLoc dl);
+                                SDLoc dl);
   SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned,
-                                DebugLoc dl);
+                                SDLoc dl);
 
-  SDValue ExpandBSWAP(SDValue Op, DebugLoc dl);
-  SDValue ExpandBitCount(unsigned Opc, SDValue Op, DebugLoc dl);
+  SDValue ExpandBSWAP(SDValue Op, SDLoc dl);
+  SDValue ExpandBitCount(unsigned Opc, SDValue Op, SDLoc dl);
 
   SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
   SDValue ExpandInsertToVectorThroughStack(SDValue Op);
@@ -181,7 +185,7 @@ public:
 /// whose vector element type is narrower than the original shuffle type.
 /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
 SDValue
-SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT,  DebugLoc dl,
+SelectionDAGLegalize::ShuffleWithNarrowerEltType(EVT NVT, EVT VT,  SDLoc dl,
                                                  SDValue N1, SDValue N2,
                                                  ArrayRef<int> Mask) const {
   unsigned NumMaskElts = VT.getVectorNumElements();
@@ -247,7 +251,7 @@ void SelectionDAGLegalize::LegalizeDAG() {
 SDValue
 SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
   bool Extend = false;
-  DebugLoc dl = CFP->getDebugLoc();
+  SDLoc dl(CFP);
 
   // If a FP immediate is precise when represented as a float and if the
   // target can do an extending load from float to double, we put it into
@@ -307,7 +311,9 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
   SDValue Val = ST->getValue();
   EVT VT = Val.getValueType();
   int Alignment = ST->getAlignment();
-  DebugLoc dl = ST->getDebugLoc();
+  unsigned AS = ST->getAddressSpace();
+
+  SDLoc dl(ST);
   if (ST->getMemoryVT().isFloatingPoint() ||
       ST->getMemoryVT().isVector()) {
     EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
@@ -339,7 +345,7 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
     SDValue Store = DAG.getTruncStore(Chain, dl,
                                       Val, StackPtr, MachinePointerInfo(),
                                       StoredVT, false, false, 0);
-    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy());
+    SDValue Increment = DAG.getConstant(RegBytes, TLI.getPointerTy(AS));
     SmallVector<SDValue, 8> Stores;
     unsigned Offset = 0;
 
@@ -377,7 +383,8 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
                                          .getWithOffset(Offset),
                                        MemVT, ST->isVolatile(),
                                        ST->isNonTemporal(),
-                                       MinAlign(ST->getAlignment(), Offset)));
+                                       MinAlign(ST->getAlignment(), Offset),
+                                       ST->getTBAAInfo()));
     // The order of the stores doesn't matter - say it with a TokenFactor.
     SDValue Result =
       DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Stores[0],
@@ -404,13 +411,14 @@ static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
   Store1 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Lo:Hi, Ptr,
                              ST->getPointerInfo(), NewStoredVT,
                              ST->isVolatile(), ST->isNonTemporal(), Alignment);
+
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+                    DAG.getConstant(IncrementSize, TLI.getPointerTy(AS)));
   Alignment = MinAlign(Alignment, IncrementSize);
   Store2 = DAG.getTruncStore(Chain, dl, TLI.isLittleEndian()?Hi:Lo, Ptr,
                              ST->getPointerInfo().getWithOffset(IncrementSize),
                              NewStoredVT, ST->isVolatile(), ST->isNonTemporal(),
-                             Alignment);
+                             Alignment, ST->getTBAAInfo());
 
   SDValue Result =
     DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
@@ -428,16 +436,14 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
   SDValue Ptr = LD->getBasePtr();
   EVT VT = LD->getValueType(0);
   EVT LoadedVT = LD->getMemoryVT();
-  DebugLoc dl = LD->getDebugLoc();
+  SDLoc dl(LD);
   if (VT.isFloatingPoint() || VT.isVector()) {
     EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
     if (TLI.isTypeLegal(intVT) && TLI.isTypeLegal(LoadedVT)) {
       // Expand to a (misaligned) integer load of the same size,
       // then bitconvert to floating point or vector.
-      SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getPointerInfo(),
-                                    LD->isVolatile(),
-                                    LD->isNonTemporal(),
-                                    LD->isInvariant(), LD->getAlignment());
+      SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
+                                    LD->getMemOperand());
       SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
       if (LoadedVT != VT)
         Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
@@ -470,7 +476,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
                                  LD->getPointerInfo().getWithOffset(Offset),
                                  LD->isVolatile(), LD->isNonTemporal(),
                                  LD->isInvariant(),
-                                 MinAlign(LD->getAlignment(), Offset));
+                                 MinAlign(LD->getAlignment(), Offset),
+                                 LD->getTBAAInfo());
       // Follow the load with a store to the stack slot.  Remember the store.
       Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
                                     MachinePointerInfo(), false, false, 0));
@@ -488,7 +495,8 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
                                   LD->getPointerInfo().getWithOffset(Offset),
                                   MemVT, LD->isVolatile(),
                                   LD->isNonTemporal(),
-                                  MinAlign(LD->getAlignment(), Offset));
+                                  MinAlign(LD->getAlignment(), Offset),
+                                  LD->getTBAAInfo());
     // Follow the load with a store to the stack slot.  Remember the store.
     // On big-endian machines this requires a truncating store to ensure
     // that the bits end up in the right place.
@@ -532,23 +540,25 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
   if (TLI.isLittleEndian()) {
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), Alignment);
+                        LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+                        LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
+                        LD->getTBAAInfo());
   } else {
     Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), Alignment);
+                        LD->isNonTemporal(), Alignment, LD->getTBAAInfo());
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getConstant(IncrementSize, TLI.getPointerTy()));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
                         LD->getPointerInfo().getWithOffset(IncrementSize),
                         NewLoadedVT, LD->isVolatile(),
-                        LD->isNonTemporal(), MinAlign(Alignment,IncrementSize));
+                        LD->isNonTemporal(), MinAlign(Alignment, IncrementSize),
+                        LD->getTBAAInfo());
   }
 
   // aggregate the two parts
@@ -570,7 +580,7 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
 /// the insert there, and then read the result back.
 SDValue SelectionDAGLegalize::
 PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
-                               DebugLoc dl) {
+                               SDLoc dl) {
   SDValue Tmp1 = Vec;
   SDValue Tmp2 = Val;
   SDValue Tmp3 = Idx;
@@ -606,13 +616,13 @@ PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
                          false, false, 0);
   // Load the updated vector.
   return DAG.getLoad(VT, dl, Ch, StackPtr,
-                     MachinePointerInfo::getFixedStack(SPFI), false, false, 
+                     MachinePointerInfo::getFixedStack(SPFI), false, false,
                      false, 0);
 }
 
 
 SDValue SelectionDAGLegalize::
-ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, DebugLoc dl) {
+ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx, SDLoc dl) {
   if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
     // SCALAR_TO_VECTOR requires that the type of the value being inserted
     // match the element type of the vector being created, except for
@@ -651,7 +661,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
   unsigned Alignment = ST->getAlignment();
   bool isVolatile = ST->isVolatile();
   bool isNonTemporal = ST->isNonTemporal();
-  DebugLoc dl = ST->getDebugLoc();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
+  SDLoc dl(ST);
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
     if (CFP->getValueType(0) == MVT::f32 &&
         TLI.isTypeLegal(MVT::i32)) {
@@ -659,7 +670,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
                                       bitcastToAPInt().zextOrTrunc(32),
                               MVT::i32);
       return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
-                          isVolatile, isNonTemporal, Alignment);
+                          isVolatile, isNonTemporal, Alignment, TBAAInfo);
     }
 
     if (CFP->getValueType(0) == MVT::f64) {
@@ -668,7 +679,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
         SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
                                   zextOrTrunc(64), MVT::i64);
         return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
-                            isVolatile, isNonTemporal, Alignment);
+                            isVolatile, isNonTemporal, Alignment, TBAAInfo);
       }
 
       if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
@@ -681,12 +692,13 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
         if (TLI.isBigEndian()) std::swap(Lo, Hi);
 
         Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile,
-                          isNonTemporal, Alignment);
+                          isNonTemporal, Alignment, TBAAInfo);
         Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                            DAG.getIntPtrConstant(4));
+                          DAG.getConstant(4, Ptr.getValueType()));
         Hi = DAG.getStore(Chain, dl, Hi, Ptr,
                           ST->getPointerInfo().getWithOffset(4),
-                          isVolatile, isNonTemporal, MinAlign(Alignment, 4U));
+                          isVolatile, isNonTemporal, MinAlign(Alignment, 4U),
+                          TBAAInfo);
 
         return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
       }
@@ -699,11 +711,12 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
     StoreSDNode *ST = cast<StoreSDNode>(Node);
     SDValue Chain = ST->getChain();
     SDValue Ptr = ST->getBasePtr();
-    DebugLoc dl = Node->getDebugLoc();
+    SDLoc dl(Node);
 
     unsigned Alignment = ST->getAlignment();
     bool isVolatile = ST->isVolatile();
     bool isNonTemporal = ST->isNonTemporal();
+    const MDNode *TBAAInfo = ST->getTBAAInfo();
 
     if (!ST->isTruncatingStore()) {
       if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
@@ -741,7 +754,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
           SDValue Result =
             DAG.getStore(Chain, dl, Value, Ptr,
                          ST->getPointerInfo(), isVolatile,
-                         isNonTemporal, Alignment);
+                         isNonTemporal, Alignment, TBAAInfo);
           ReplaceNode(SDValue(Node, 0), Result);
           break;
         }
@@ -763,7 +776,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
         Value = DAG.getZeroExtendInReg(Value, dl, StVT);
         SDValue Result =
           DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
-                            NVT, isVolatile, isNonTemporal, Alignment);
+                            NVT, isVolatile, isNonTemporal, Alignment,
+                            TBAAInfo);
         ReplaceNode(SDValue(Node, 0), Result);
       } else if (StWidth & (StWidth - 1)) {
         // If not storing a power-of-2 number of bits, expand as two stores.
@@ -784,19 +798,20 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
           // Store the bottom RoundWidth bits.
           Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
                                  RoundVT,
-                                 isVolatile, isNonTemporal, Alignment);
+                                 isVolatile, isNonTemporal, Alignment,
+                                 TBAAInfo);
 
           // Store the remaining ExtraWidth bits.
           IncrementSize = RoundWidth / 8;
           Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                             DAG.getIntPtrConstant(IncrementSize));
+                            DAG.getConstant(IncrementSize, Ptr.getValueType()));
           Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value,
                            DAG.getConstant(RoundWidth,
                                     TLI.getShiftAmountTy(Value.getValueType())));
           Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
                              ST->getPointerInfo().getWithOffset(IncrementSize),
                                  ExtraVT, isVolatile, isNonTemporal,
-                                 MinAlign(Alignment, IncrementSize));
+                                 MinAlign(Alignment, IncrementSize), TBAAInfo);
         } else {
           // Big endian - avoid unaligned stores.
           // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
@@ -805,16 +820,17 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
                            DAG.getConstant(ExtraWidth,
                                     TLI.getShiftAmountTy(Value.getValueType())));
           Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
-                                 RoundVT, isVolatile, isNonTemporal, Alignment);
+                                 RoundVT, isVolatile, isNonTemporal, Alignment,
+                                 TBAAInfo);
 
           // Store the remaining ExtraWidth bits.
           IncrementSize = RoundWidth / 8;
           Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                             DAG.getIntPtrConstant(IncrementSize));
+                             DAG.getConstant(IncrementSize, Ptr.getValueType()));
           Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
                               ST->getPointerInfo().getWithOffset(IncrementSize),
                                  ExtraVT, isVolatile, isNonTemporal,
-                                 MinAlign(Alignment, IncrementSize));
+                                 MinAlign(Alignment, IncrementSize), TBAAInfo);
         }
 
         // The order of the stores doesn't matter.
@@ -850,7 +866,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
           Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
           SDValue Result =
             DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
-                         isVolatile, isNonTemporal, Alignment);
+                         isVolatile, isNonTemporal, Alignment, TBAAInfo);
           ReplaceNode(SDValue(Node, 0), Result);
           break;
         }
@@ -863,7 +879,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
   SDValue Chain = LD->getChain();  // The chain.
   SDValue Ptr = LD->getBasePtr();  // The base pointer.
   SDValue Value;                   // The value returned by the load op.
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
 
   ISD::LoadExtType ExtType = LD->getExtensionType();
   if (ExtType == ISD::NON_EXTLOAD) {
@@ -898,9 +914,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
              "Can only promote loads to same size type");
 
-      SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
-                         LD->isVolatile(), LD->isNonTemporal(),
-                         LD->isInvariant(), LD->getAlignment());
+      SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand());
       RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
       RChain = Res.getValue(1);
       break;
@@ -920,6 +934,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
   unsigned Alignment = LD->getAlignment();
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   if (SrcWidth != SrcVT.getStoreSizeInBits() &&
       // Some targets pretend to have an i1 loading operation, and actually
@@ -946,7 +961,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
     SDValue Result =
       DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
                      Chain, Ptr, LD->getPointerInfo(),
-                     NVT, isVolatile, isNonTemporal, Alignment);
+                     NVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     Ch = Result.getValue(1); // The chain.
 
@@ -983,16 +998,16 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0),
                           Chain, Ptr,
                           LD->getPointerInfo(), RoundVT, isVolatile,
-                          isNonTemporal, Alignment);
+                          isNonTemporal, Alignment, TBAAInfo);
 
       // Load the remaining ExtraWidth bits.
       IncrementSize = RoundWidth / 8;
       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                         DAG.getConstant(IncrementSize, Ptr.getValueType()));
       Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(IncrementSize),
                           ExtraVT, isVolatile, isNonTemporal,
-                          MinAlign(Alignment, IncrementSize));
+                          MinAlign(Alignment, IncrementSize), TBAAInfo);
 
       // Build a factor node to remember that this load is independent of
       // the other one.
@@ -1012,17 +1027,17 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
       // Load the top RoundWidth bits.
       Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo(), RoundVT, isVolatile,
-                          isNonTemporal, Alignment);
+                          isNonTemporal, Alignment, TBAAInfo);
 
       // Load the remaining ExtraWidth bits.
       IncrementSize = RoundWidth / 8;
       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                         DAG.getConstant(IncrementSize, Ptr.getValueType()));
       Lo = DAG.getExtLoad(ISD::ZEXTLOAD,
                           dl, Node->getValueType(0), Chain, Ptr,
                           LD->getPointerInfo().getWithOffset(IncrementSize),
                           ExtraVT, isVolatile, isNonTemporal,
-                          MinAlign(Alignment, IncrementSize));
+                          MinAlign(Alignment, IncrementSize), TBAAInfo);
 
       // Build a factor node to remember that this load is independent of
       // the other one.
@@ -1075,9 +1090,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
     case TargetLowering::Expand:
              if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) {
                SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr,
-                                          LD->getPointerInfo(),
-                                          LD->isVolatile(), LD->isNonTemporal(),
-                                          LD->isInvariant(), LD->getAlignment());
+                                          LD->getMemOperand());
                unsigned ExtendOp;
                switch (ExtType) {
                case ISD::EXTLOAD:
@@ -1105,9 +1118,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
              // Turn the unsupported load into an EXTLOAD followed by an explicit
              // zero/sign extend inreg.
              SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0),
-                                             Chain, Ptr, LD->getPointerInfo(), SrcVT,
-                                             LD->isVolatile(), LD->isNonTemporal(),
-                                             LD->getAlignment());
+                                             Chain, Ptr, SrcVT,
+                                             LD->getMemOperand());
              SDValue ValRes;
              if (ExtType == ISD::SEXTLOAD)
                ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
@@ -1249,7 +1261,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     if (Action == TargetLowering::Expand) {
       // replace ISD::DEBUGTRAP with ISD::TRAP
       SDValue NewVal;
-      NewVal = DAG.getNode(ISD::TRAP, Node->getDebugLoc(), Node->getVTList(),
+      NewVal = DAG.getNode(ISD::TRAP, SDLoc(Node), Node->getVTList(),
                            Node->getOperand(0));
       ReplaceNode(Node, NewVal.getNode());
       LegalizeOp(NewVal.getNode());
@@ -1370,7 +1382,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
 SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
   SDValue Vec = Op.getOperand(0);
   SDValue Idx = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   // Store the value to a temporary stack slot, then LOAD the returned part.
   SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
   SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
@@ -1382,11 +1394,7 @@ SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
   Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
                     DAG.getConstant(EltSize, Idx.getValueType()));
 
-  if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
-    Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
-  else
-    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
-
+  Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
   StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
 
   if (Op.getValueType().isVector())
@@ -1404,7 +1412,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
   SDValue Vec  = Op.getOperand(0);
   SDValue Part = Op.getOperand(1);
   SDValue Idx  = Op.getOperand(2);
-  DebugLoc dl  = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Store the value to a temporary stack slot, then LOAD the returned part.
 
@@ -1424,11 +1432,7 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
 
   Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
                     DAG.getConstant(EltSize, Idx.getValueType()));
-
-  if (Idx.getValueType().bitsGT(TLI.getPointerTy()))
-    Idx = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Idx);
-  else
-    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
+  Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy());
 
   SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
                                     StackPtr);
@@ -1449,7 +1453,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
   // Create the stack frame object.
   EVT VT = Node->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   SDValue FIPtr = DAG.CreateStackTemporary(VT);
   int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(FI);
@@ -1489,12 +1493,12 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
     StoreChain = DAG.getEntryNode();
 
   // Result is a load from the stack slot.
-  return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo, 
+  return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo,
                      false, false, false, 0);
 }
 
 SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   SDValue Tmp1 = Node->getOperand(0);
   SDValue Tmp2 = Node->getOperand(1);
 
@@ -1527,7 +1531,8 @@ SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
       unsigned Strides = (FloatVT.getSizeInBits()-1)/LoadTy.getSizeInBits();
       unsigned ByteOffset = (Strides * LoadTy.getSizeInBits()) / 8;
       LoadPtr = DAG.getNode(ISD::ADD, dl, LoadPtr.getValueType(),
-                            LoadPtr, DAG.getIntPtrConstant(ByteOffset));
+                            LoadPtr,
+                            DAG.getConstant(ByteOffset, LoadPtr.getValueType()));
       // Load a legal integer containing the sign bit.
       SignBit = DAG.getLoad(LoadTy, dl, Ch, LoadPtr, MachinePointerInfo(),
                             false, false, false, 0);
@@ -1542,16 +1547,16 @@ SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode* Node) {
     }
   }
   // Now get the sign bit proper, by seeing whether the value is negative.
-  SignBit = DAG.getSetCC(dl, TLI.getSetCCResultType(SignBit.getValueType()),
+  SignBit = DAG.getSetCC(dl, getSetCCResultType(SignBit.getValueType()),
                          SignBit, DAG.getConstant(0, SignBit.getValueType()),
                          ISD::SETLT);
   // Get the absolute value of the result.
   SDValue AbsVal = DAG.getNode(ISD::FABS, dl, Tmp1.getValueType(), Tmp1);
   // Select between the nabs and abs value based on the sign bit of
   // the input.
-  return DAG.getNode(ISD::SELECT, dl, AbsVal.getValueType(), SignBit,
-                     DAG.getNode(ISD::FNEG, dl, AbsVal.getValueType(), AbsVal),
-                     AbsVal);
+  return DAG.getSelect(dl, AbsVal.getValueType(), SignBit,
+                       DAG.getNode(ISD::FNEG, dl, AbsVal.getValueType(), AbsVal),
+                       AbsVal);
 }
 
 void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
@@ -1559,7 +1564,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
   unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
   assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
           " not tell us which reg is the stack pointer!");
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   EVT VT = Node->getValueType(0);
   SDValue Tmp1 = SDValue(Node, 0);
   SDValue Tmp2 = SDValue(Node, 1);
@@ -1568,52 +1573,76 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
 
   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
+                               SDLoc(Node));
 
   SDValue Size  = Tmp2.getOperand(1);
   SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
   Chain = SP.getValue(1);
   unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
   unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
-  if (Align > StackAlign)
-    SP = DAG.getNode(ISD::AND, dl, VT, SP,
-                      DAG.getConstant(-(uint64_t)Align, VT));
   Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size);       // Value
+  if (Align > StackAlign)
+    Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+                       DAG.getConstant(-(uint64_t)Align, VT));
   Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1);     // Output chain
 
   Tmp2 = DAG.getCALLSEQ_END(Chain,  DAG.getIntPtrConstant(0, true),
-                            DAG.getIntPtrConstant(0, true), SDValue());
+                            DAG.getIntPtrConstant(0, true), SDValue(),
+                            SDLoc(Node));
 
   Results.push_back(Tmp1);
   Results.push_back(Tmp2);
 }
 
 /// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
-/// condition code CC on the current target. This routine expands SETCC with
-/// illegal condition code into AND / OR of multiple SETCC values.
-void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
+/// condition code CC on the current target.
+///
+/// If the SETCC has been legalized using AND / OR, then the legalized node
+/// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
+/// will be set to false.
+///
+/// If the SETCC has been legalized by using getSetCCSwappedOperands(),
+/// then the values of LHS and RHS will be swapped, CC will be set to the
+/// new condition, and NeedInvert will be set to false.
+///
+/// If the SETCC has been legalized using the inverse condcode, then LHS and
+/// RHS will be unchanged, CC will set to the inverted condcode, and NeedInvert
+/// will be set to true. The caller must invert the result of the SETCC with
+/// SelectionDAG::getNOT() or take equivalent action to swap the effect of a
+/// true/false result.
+///
+/// \returns true if the SetCC has been legalized, false if it hasn't.
+bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
                                                  SDValue &LHS, SDValue &RHS,
                                                  SDValue &CC,
-                                                 DebugLoc dl) {
+                                                 bool &NeedInvert,
+                                                 SDLoc dl) {
   MVT OpVT = LHS.getSimpleValueType();
   ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+  NeedInvert = false;
   switch (TLI.getCondCodeAction(CCCode, OpVT)) {
   default: llvm_unreachable("Unknown condition code action!");
   case TargetLowering::Legal:
     // Nothing to do.
     break;
   case TargetLowering::Expand: {
+    ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
+    if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(InvCC);
+      return true;
+    }
     ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
-    ISD::CondCode InvCC = ISD::SETCC_INVALID;
     unsigned Opc = 0;
     switch (CCCode) {
     default: llvm_unreachable("Don't know how to expand this condition!");
-    case ISD::SETO: 
+    case ISD::SETO:
         assert(TLI.getCondCodeAction(ISD::SETOEQ, OpVT)
             == TargetLowering::Legal
             && "If SETO is expanded, SETOEQ must be legal!");
         CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break;
-    case ISD::SETUO:  
+    case ISD::SETUO:
         assert(TLI.getCondCodeAction(ISD::SETUNE, OpVT)
             == TargetLowering::Legal
             && "If SETUO is expanded, SETUNE must be legal!");
@@ -1623,12 +1652,12 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
     case ISD::SETOGE:
     case ISD::SETOLT:
     case ISD::SETOLE:
-    case ISD::SETONE: 
-    case ISD::SETUEQ: 
-    case ISD::SETUNE: 
-    case ISD::SETUGT: 
-    case ISD::SETUGE: 
-    case ISD::SETULT: 
+    case ISD::SETONE:
+    case ISD::SETUEQ:
+    case ISD::SETUNE:
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+    case ISD::SETULT:
     case ISD::SETULE:
         // If we are floating point, assign and break, otherwise fall through.
         if (!OpVT.isInteger()) {
@@ -1644,20 +1673,23 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
     case ISD::SETGT:
     case ISD::SETGE:
     case ISD::SETLT:
+      // We only support using the inverted operation, which is computed above
+      // and not a different manner of supporting expanding these cases.
+      llvm_unreachable("Don't know how to expand this condition!");
     case ISD::SETNE:
     case ISD::SETEQ:
-      InvCC = ISD::getSetCCSwappedOperands(CCCode);
-      if (TLI.getCondCodeAction(InvCC, OpVT) == TargetLowering::Expand) {
-        // We only support using the inverted operation and not a
-        // different manner of supporting expanding these cases.
-        llvm_unreachable("Don't know how to expand this condition!");
+      // Try inverting the result of the inverse condition.
+      InvCC = CCCode == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
+      if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+        CC = DAG.getCondCode(InvCC);
+        NeedInvert = true;
+        return true;
       }
-      LHS = DAG.getSetCC(dl, VT, RHS, LHS, InvCC);
-      RHS = SDValue();
-      CC = SDValue();
-      return;
+      // If inverting the condition didn't work then we have no means to expand
+      // the condition.
+      llvm_unreachable("Don't know how to expand this condition!");
     }
-    
+
     SDValue SetCC1, SetCC2;
     if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
       // If we aren't the ordered or unorder operation,
@@ -1672,9 +1704,10 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
     LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
     RHS = SDValue();
     CC  = SDValue();
-    break;
+    return true;
   }
   }
+  return false;
 }
 
 /// EmitStackConvert - Emit a store/load combination to the stack.  This stores
@@ -1684,7 +1717,7 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
 SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
                                                EVT SlotVT,
                                                EVT DestVT,
-                                               DebugLoc dl) {
+                                               SDLoc dl) {
   // Create the stack frame object.
   unsigned SrcAlign =
     TLI.getDataLayout()->getPrefTypeAlignment(SrcOp.getValueType().
@@ -1725,7 +1758,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
 }
 
 SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   // Create a vector sized/aligned stack slot, store the value to element #0,
   // then load the whole vector back out.
   SDValue StackPtr = DAG.CreateStackTemporary(Node->getValueType(0));
@@ -1749,7 +1782,7 @@ SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
 SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
   unsigned NumElems = Node->getNumOperands();
   SDValue Value1, Value2;
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   EVT VT = Node->getValueType(0);
   EVT OpVT = Node->getOperand(0).getValueType();
   EVT EltVT = VT.getVectorElementType();
@@ -1881,7 +1914,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
   CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
                     0, TLI.getLibcallCallingConv(LC), isTailCall,
                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, Node->getDebugLoc());
+                    Callee, Args, DAG, SDLoc(Node));
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
 
@@ -1896,7 +1929,7 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
 /// and returning a result of type RetVT.
 SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
                                             const SDValue *Ops, unsigned NumOps,
-                                            bool isSigned, DebugLoc dl) {
+                                            bool isSigned, SDLoc dl) {
   TargetLowering::ArgListTy Args;
   Args.reserve(NumOps);
 
@@ -1950,7 +1983,7 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
   CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
                     0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, Node->getDebugLoc());
+                    Callee, Args, DAG, SDLoc(Node));
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
   return CallInfo;
@@ -1963,7 +1996,7 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
                                               RTLIB::Libcall Call_F128,
                                               RTLIB::Libcall Call_PPCF128) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::f32: LC = Call_F32; break;
   case MVT::f64: LC = Call_F64; break;
@@ -1981,7 +2014,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
                                                RTLIB::Libcall Call_I64,
                                                RTLIB::Libcall Call_I128) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:   LC = Call_I8; break;
   case MVT::i16:  LC = Call_I16; break;
@@ -1996,7 +2029,7 @@ SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
 static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                      const TargetLowering &TLI) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
   case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
@@ -2043,7 +2076,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
   bool isSigned = Opcode == ISD::SDIVREM;
 
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
   case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
@@ -2082,7 +2115,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                          TLI.getPointerTy());
 
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   TargetLowering::
   CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
                     0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
@@ -2100,7 +2133,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
 /// isSinCosLibcallAvailable - Return true if sincos libcall is available.
 static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
   case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
@@ -2130,7 +2163,7 @@ static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
 static bool useSinCos(SDNode *Node) {
   unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
     ? ISD::FCOS : ISD::FSIN;
-  
+
   SDValue Op0 = Node->getOperand(0);
   for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
        UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
@@ -2150,7 +2183,7 @@ void
 SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
                                           SmallVectorImpl<SDValue> &Results) {
   RTLIB::Libcall LC;
-  switch (Node->getValueType(0).getSimpleVT().SimpleTy) {
+  switch (Node->getSimpleValueType(0).SimpleTy) {
   default: llvm_unreachable("Unexpected request for libcall!");
   case MVT::f32:     LC = RTLIB::SINCOS_F32; break;
   case MVT::f64:     LC = RTLIB::SINCOS_F64; break;
@@ -2158,25 +2191,25 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
   case MVT::f128:    LC = RTLIB::SINCOS_F128; break;
   case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
   }
-  
+
   // The input chain to this libcall is the entry node of the function.
   // Legalizing the call will automatically add the previous call to the
   // dependence.
   SDValue InChain = DAG.getEntryNode();
-  
+
   EVT RetVT = Node->getValueType(0);
   Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
-  
+
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
-  
+
   // Pass the argument.
   Entry.Node = Node->getOperand(0);
   Entry.Ty = RetTy;
   Entry.isSExt = false;
   Entry.isZExt = false;
   Args.push_back(Entry);
-  
+
   // Pass the return address of sin.
   SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
   Entry.Node = SinPtr;
@@ -2184,7 +2217,7 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
   Entry.isSExt = false;
   Entry.isZExt = false;
   Args.push_back(Entry);
-  
+
   // Also pass the return address of the cos.
   SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
   Entry.Node = CosPtr;
@@ -2192,11 +2225,11 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
   Entry.isSExt = false;
   Entry.isZExt = false;
   Args.push_back(Entry);
-  
+
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
                                          TLI.getPointerTy());
-  
-  DebugLoc dl = Node->getDebugLoc();
+
+  SDLoc dl(Node);
   TargetLowering::
   CallLoweringInfo CLI(InChain, Type::getVoidTy(*DAG.getContext()),
                        false, false, false, false,
@@ -2218,7 +2251,7 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
 SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
                                                    SDValue Op0,
                                                    EVT DestVT,
-                                                   DebugLoc dl) {
+                                                   SDLoc dl) {
   if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
     // simple 32-bit [signed|unsigned] integer to float/double expansion
 
@@ -2226,11 +2259,11 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
     SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
 
     // word offset constant for Hi/Lo address computation
-    SDValue WordOff = DAG.getConstant(sizeof(int), TLI.getPointerTy());
+    SDValue WordOff = DAG.getConstant(sizeof(int), StackSlot.getValueType());
     // set up Hi and Lo (into buffer) address based on endian
     SDValue Hi = StackSlot;
-    SDValue Lo = DAG.getNode(ISD::ADD, dl,
-                             TLI.getPointerTy(), StackSlot, WordOff);
+    SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
+                             StackSlot, WordOff);
     if (TLI.isLittleEndian())
       std::swap(Hi, Lo);
 
@@ -2327,9 +2360,9 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
       // select.  We happen to get lucky and machinesink does the right
       // thing most of the time.  This would be a good candidate for a
       //pseudo-op, or, even better, for whole-function isel.
-      SDValue SignBitTest = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+      SDValue SignBitTest = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
         Op0, DAG.getConstant(0, MVT::i64), ISD::SETLT);
-      return DAG.getNode(ISD::SELECT, dl, MVT::f32, SignBitTest, Slow, Fast);
+      return DAG.getSelect(dl, MVT::f32, SignBitTest, Slow, Fast);
     }
 
     // Otherwise, implement the fully general conversion.
@@ -2340,13 +2373,13 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
          DAG.getConstant(UINT64_C(0x800), MVT::i64));
     SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
          DAG.getConstant(UINT64_C(0x7ff), MVT::i64));
-    SDValue Ne = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+    SDValue Ne = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
                    And2, DAG.getConstant(UINT64_C(0), MVT::i64), ISD::SETNE);
-    SDValue Sel = DAG.getNode(ISD::SELECT, dl, MVT::i64, Ne, Or, Op0);
-    SDValue Ge = DAG.getSetCC(dl, TLI.getSetCCResultType(MVT::i64),
+    SDValue Sel = DAG.getSelect(dl, MVT::i64, Ne, Or, Op0);
+    SDValue Ge = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
                    Op0, DAG.getConstant(UINT64_C(0x0020000000000000), MVT::i64),
                    ISD::SETUGE);
-    SDValue Sel2 = DAG.getNode(ISD::SELECT, dl, MVT::i64, Ge, Sel, Op0);
+    SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0);
     EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType());
 
     SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
@@ -2365,18 +2398,18 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
 
   SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
 
-  SDValue SignSet = DAG.getSetCC(dl, TLI.getSetCCResultType(Op0.getValueType()),
+  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Op0.getValueType()),
                                  Op0, DAG.getConstant(0, Op0.getValueType()),
                                  ISD::SETLT);
   SDValue Zero = DAG.getIntPtrConstant(0), Four = DAG.getIntPtrConstant(4);
-  SDValue CstOffset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(),
+  SDValue CstOffset = DAG.getSelect(dl, Zero.getValueType(),
                                     SignSet, Four, Zero);
 
   // If the sign bit of the integer is set, the large number will be treated
   // as a negative number.  To counteract this, the dynamic code adds an
   // offset depending on the data type.
   uint64_t FF;
-  switch (Op0.getValueType().getSimpleVT().SimpleTy) {
+  switch (Op0.getSimpleValueType().SimpleTy) {
   default: llvm_unreachable("Unsupported integer type!");
   case MVT::i8 : FF = 0x43800000ULL; break;  // 2^8  (as a float)
   case MVT::i16: FF = 0x47800000ULL; break;  // 2^16 (as a float)
@@ -2389,7 +2422,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
 
   SDValue CPIdx = DAG.getConstantPool(FudgeFactor, TLI.getPointerTy());
   unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
-  CPIdx = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), CPIdx, CstOffset);
+  CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
   Alignment = std::min(Alignment, 4u);
   SDValue FudgeInReg;
   if (DestVT == MVT::f32)
@@ -2417,7 +2450,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
 SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
                                                     EVT DestVT,
                                                     bool isSigned,
-                                                    DebugLoc dl) {
+                                                    SDLoc dl) {
   // First step, figure out the appropriate *INT_TO_FP operation to use.
   EVT NewInTy = LegalOp.getValueType();
 
@@ -2459,7 +2492,7 @@ SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp,
 SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
                                                     EVT DestVT,
                                                     bool isSigned,
-                                                    DebugLoc dl) {
+                                                    SDLoc dl) {
   // First step, figure out the appropriate FP_TO*INT operation to use.
   EVT NewOutTy = DestVT;
 
@@ -2494,7 +2527,7 @@ SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp,
 
 /// ExpandBSWAP - Open code the operations for BSWAP of the specified operation.
 ///
-SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) {
+SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) {
   EVT VT = Op.getValueType();
   EVT SHVT = TLI.getShiftAmountTy(VT);
   SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
@@ -2542,7 +2575,7 @@ SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, DebugLoc dl) {
 /// ExpandBitCount - Expand the specified bitcount instruction into operations.
 ///
 SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
-                                             DebugLoc dl) {
+                                             SDLoc dl) {
   switch (Opc) {
   default: llvm_unreachable("Cannot expand this yet!");
   case ISD::CTPOP: {
@@ -2650,6 +2683,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
     case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
     case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_LOCK_TEST_AND_SET_16;break;
     }
     break;
   case ISD::ATOMIC_CMP_SWAP:
@@ -2659,6 +2693,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
     case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
     case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_ADD:
@@ -2668,6 +2703,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_ADD_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_SUB:
@@ -2677,6 +2713,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_SUB_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_AND:
@@ -2686,6 +2723,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_AND_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_OR:
@@ -2695,6 +2733,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_OR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_XOR:
@@ -2704,6 +2743,7 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_XOR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_NAND:
@@ -2713,6 +2753,47 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_NAND_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_MAX:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_MAX_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_MAX_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_MAX_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_MAX_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_MAX_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_UMAX:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_UMAX_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_UMAX_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_UMAX_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_UMAX_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_UMAX_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_MIN:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_MIN_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_MIN_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_MIN_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_MIN_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_MIN_16;break;
+    }
+    break;
+  case ISD::ATOMIC_LOAD_UMIN:
+    switch (VT.SimpleTy) {
+    default: llvm_unreachable("Unexpected value type for atomic!");
+    case MVT::i8:  LC = RTLIB::SYNC_FETCH_AND_UMIN_1; break;
+    case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_UMIN_2; break;
+    case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_UMIN_4; break;
+    case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_UMIN_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_UMIN_16;break;
     }
     break;
   }
@@ -2722,8 +2803,9 @@ std::pair <SDValue, SDValue> SelectionDAGLegalize::ExpandAtomic(SDNode *Node) {
 
 void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
   SmallVector<SDValue, 8> Results;
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+  bool NeedInvert;
   switch (Node->getOpcode()) {
   case ISD::CTPOP:
   case ISD::CTLZ:
@@ -2913,7 +2995,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     APInt x = APInt::getSignBit(NVT.getSizeInBits());
     (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
     Tmp1 = DAG.getConstantFP(apf, VT);
-    Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
+    Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
                         Node->getOperand(0),
                         Tmp1, ISD::SETLT);
     True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
@@ -2922,7 +3004,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
                                     Node->getOperand(0), Tmp1));
     False = DAG.getNode(ISD::XOR, dl, NVT, False,
                         DAG.getConstant(x, NVT));
-    Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2, True, False);
+    Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
     Results.push_back(Tmp1);
     break;
   }
@@ -2934,27 +3016,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     unsigned Align = Node->getConstantOperandVal(3);
 
     SDValue VAListLoad = DAG.getLoad(TLI.getPointerTy(), dl, Tmp1, Tmp2,
-                                     MachinePointerInfo(V), 
+                                     MachinePointerInfo(V),
                                      false, false, false, 0);
     SDValue VAList = VAListLoad;
 
     if (Align > TLI.getMinStackArgumentAlignment()) {
       assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
 
-      VAList = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+      VAList = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
                            DAG.getConstant(Align - 1,
-                                           TLI.getPointerTy()));
+                                           VAList.getValueType()));
 
-      VAList = DAG.getNode(ISD::AND, dl, TLI.getPointerTy(), VAList,
+      VAList = DAG.getNode(ISD::AND, dl, VAList.getValueType(), VAList,
                            DAG.getConstant(-(int64_t)Align,
-                                           TLI.getPointerTy()));
+                                           VAList.getValueType()));
     }
 
     // Increment the pointer, VAList, to the next vaarg
-    Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
+    Tmp3 = DAG.getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
                        DAG.getConstant(TLI.getDataLayout()->
                           getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())),
-                                       TLI.getPointerTy()));
+                                       VAList.getValueType()));
     // Store the incremented VAList to the legalized pointer
     Tmp3 = DAG.getStore(VAListLoad.getValue(1), dl, Tmp3, Tmp2,
                         MachinePointerInfo(V), false, false, 0);
@@ -3025,7 +3107,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
         // cast operands to v8i32 and re-build the mask.
 
         // Calculate new VT, the size of the new VT should be equal to original.
-        EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT, 
+        EVT NewVT = EVT::getVectorVT(*DAG.getContext(), NewEltVT,
                                       VT.getSizeInBits()/NewEltVT.getSizeInBits());
         assert(NewVT.bitsEq(VT));
 
@@ -3065,11 +3147,12 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       if (Idx < NumElems)
         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                   Op0,
-                                  DAG.getIntPtrConstant(Idx)));
+                                  DAG.getConstant(Idx, TLI.getVectorIdxTy())));
       else
         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                   Op1,
-                                  DAG.getIntPtrConstant(Idx - NumElems)));
+                                  DAG.getConstant(Idx - NumElems,
+                                                  TLI.getVectorIdxTy())));
     }
 
     Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
@@ -3131,10 +3214,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     EVT VT = Node->getValueType(0);
     Tmp1 = Node->getOperand(0);
     Tmp2 = DAG.getConstantFP(0.0, VT);
-    Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(Tmp1.getValueType()),
+    Tmp2 = DAG.getSetCC(dl, getSetCCResultType(Tmp1.getValueType()),
                         Tmp1, Tmp2, ISD::SETUGT);
     Tmp3 = DAG.getNode(ISD::FNEG, dl, VT, Tmp1);
-    Tmp1 = DAG.getNode(ISD::SELECT, dl, VT, Tmp2, Tmp1, Tmp3);
+    Tmp1 = DAG.getSelect(dl, VT, Tmp2, Tmp1, Tmp3);
     Results.push_back(Tmp1);
     break;
   }
@@ -3224,6 +3307,13 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
                                       RTLIB::NEARBYINT_F128,
                                       RTLIB::NEARBYINT_PPCF128));
     break;
+  case ISD::FROUND:
+    Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32,
+                                      RTLIB::ROUND_F64,
+                                      RTLIB::ROUND_F80,
+                                      RTLIB::ROUND_F128,
+                                      RTLIB::ROUND_PPCF128));
+    break;
   case ISD::FPOWI:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
                                       RTLIB::POWI_F80, RTLIB::POWI_F128,
@@ -3263,22 +3353,6 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       Results.push_back(ExpandConstantFP(CFP, true));
     break;
   }
-  case ISD::EHSELECTION: {
-    unsigned Reg = TLI.getExceptionSelectorRegister();
-    assert(Reg && "Can't expand to unknown register!");
-    Results.push_back(DAG.getCopyFromReg(Node->getOperand(1), dl, Reg,
-                                         Node->getValueType(0)));
-    Results.push_back(Results[0].getValue(1));
-    break;
-  }
-  case ISD::EXCEPTIONADDR: {
-    unsigned Reg = TLI.getExceptionPointerRegister();
-    assert(Reg && "Can't expand to unknown register!");
-    Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, Reg,
-                                         Node->getValueType(0)));
-    Results.push_back(Results[0].getValue(1));
-    break;
-  }
   case ISD::FSUB: {
     EVT VT = Node->getValueType(0);
     assert(TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
@@ -3528,10 +3602,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
       Tmp1 = DAG.getConstant(VT.getSizeInBits() - 1,
                              TLI.getShiftAmountTy(BottomHalf.getValueType()));
       Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1);
-      TopHalf = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), TopHalf, Tmp1,
+      TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf, Tmp1,
                              ISD::SETNE);
     } else {
-      TopHalf = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), TopHalf,
+      TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf,
                              DAG.getConstant(0, VT), ISD::SETNE);
     }
     Results.push_back(BottomHalf);
@@ -3574,9 +3648,10 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     unsigned EntrySize =
       DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
 
-    Index = DAG.getNode(ISD::MUL, dl, PTy,
-                        Index, DAG.getConstant(EntrySize, PTy));
-    SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+    Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(),
+                       Index, DAG.getConstant(EntrySize, Index.getValueType()));
+    SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
+                               Index, Table);
 
     EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
     SDValue LD = DAG.getExtLoad(ISD::SEXTLOAD, dl, PTy, Chain, Addr,
@@ -3620,10 +3695,21 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp1 = Node->getOperand(0);
     Tmp2 = Node->getOperand(1);
     Tmp3 = Node->getOperand(2);
-    LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3, dl);
+    bool Legalized = LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2,
+                                           Tmp3, NeedInvert, dl);
+
+    if (Legalized) {
+      // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+      // condition code, create a new SETCC node.
+      if (Tmp3.getNode())
+        Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+                           Tmp1, Tmp2, Tmp3);
+
+      // If we expanded the SETCC by inverting the condition code, then wrap
+      // the existing SETCC in a NOT to restore the intended condition.
+      if (NeedInvert)
+        Tmp1 = DAG.getNOT(dl, Tmp1, Tmp1->getValueType(0));
 
-    // If we expanded the SETCC into an AND/OR, return the new node
-    if (Tmp2.getNode() == 0) {
       Results.push_back(Tmp1);
       break;
     }
@@ -3654,14 +3740,52 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp4 = Node->getOperand(3);   // False
     SDValue CC = Node->getOperand(4);
 
-    LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp1.getValueType()),
-                          Tmp1, Tmp2, CC, dl);
+    bool Legalized = false;
+    // Try to legalize by inverting the condition.  This is for targets that
+    // might support an ordered version of a condition, but not the unordered
+    // version (or vice versa).
+    ISD::CondCode InvCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                                               Tmp1.getValueType().isInteger());
+    if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
+      // Use the new condition code and swap true and false
+      Legalized = true;
+      Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
+    } else {
+      // If The inverse is not legal, then try to swap the arguments using
+      // the inverse condition code.
+      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
+      if (TLI.isCondCodeLegal(SwapInvCC, Tmp1.getSimpleValueType())) {
+        // The swapped inverse condition is legal, so swap true and false,
+        // lhs and rhs.
+        Legalized = true;
+        Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
+      }
+    }
+
+    if (!Legalized) {
+      Legalized = LegalizeSetCCCondCode(
+          getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, NeedInvert,
+          dl);
 
-    assert(!Tmp2.getNode() && "Can't legalize SELECT_CC with legal condition!");
-    Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
-    CC = DAG.getCondCode(ISD::SETNE);
-    Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
-                       Tmp3, Tmp4, CC);
+      assert(Legalized && "Can't legalize SELECT_CC with legal condition!");
+
+      // If we expanded the SETCC by inverting the condition code, then swap
+      // the True/False operands to match.
+      if (NeedInvert)
+        std::swap(Tmp3, Tmp4);
+
+      // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+      // condition code, create a new SELECT_CC node.
+      if (CC.getNode()) {
+        Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
+                           Tmp1, Tmp2, Tmp3, Tmp4, CC);
+      } else {
+        Tmp2 = DAG.getConstant(0, Tmp1.getValueType());
+        CC = DAG.getCondCode(ISD::SETNE);
+        Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1, Tmp2,
+                           Tmp3, Tmp4, CC);
+      }
+    }
     Results.push_back(Tmp1);
     break;
   }
@@ -3671,14 +3795,27 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Tmp3 = Node->getOperand(3);              // RHS
     Tmp4 = Node->getOperand(1);              // CC
 
-    LegalizeSetCCCondCode(TLI.getSetCCResultType(Tmp2.getValueType()),
-                          Tmp2, Tmp3, Tmp4, dl);
-
-    assert(!Tmp3.getNode() && "Can't legalize BR_CC with legal condition!");
-    Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
-    Tmp4 = DAG.getCondCode(ISD::SETNE);
-    Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
-                       Tmp3, Node->getOperand(4));
+    bool Legalized = LegalizeSetCCCondCode(getSetCCResultType(
+        Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, NeedInvert, dl);
+    (void)Legalized;
+    assert(Legalized && "Can't legalize BR_CC with legal condition!");
+
+    // If we expanded the SETCC by inverting the condition code, then wrap
+    // the existing SETCC in a NOT to restore the intended condition.
+    if (NeedInvert)
+      Tmp4 = DAG.getNOT(dl, Tmp4, Tmp4->getValueType(0));
+
+    // If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC
+    // node.
+    if (Tmp4.getNode()) {
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1,
+                         Tmp4, Tmp2, Tmp3, Node->getOperand(4));
+    } else {
+      Tmp3 = DAG.getConstant(0, Tmp2.getValueType());
+      Tmp4 = DAG.getCondCode(ISD::SETNE);
+      Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4, Tmp2,
+                         Tmp3, Node->getOperand(4));
+    }
     Results.push_back(Tmp1);
     break;
   }
@@ -3698,10 +3835,12 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     for (unsigned Idx = 0; Idx < NumElem; Idx++) {
       SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                VT.getScalarType(),
-                               Node->getOperand(0), DAG.getIntPtrConstant(Idx));
+                               Node->getOperand(0), DAG.getConstant(Idx,
+                                                    TLI.getVectorIdxTy()));
       SDValue Sh = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                                VT.getScalarType(),
-                               Node->getOperand(1), DAG.getIntPtrConstant(Idx));
+                               Node->getOperand(1), DAG.getConstant(Idx,
+                                                    TLI.getVectorIdxTy()));
       Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
                                     VT.getScalarType(), Ex, Sh));
     }
@@ -3738,7 +3877,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     OVT = Node->getOperand(0).getSimpleValueType();
   }
   MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   SDValue Tmp1, Tmp2, Tmp3;
   switch (Node->getOpcode()) {
   case ISD::CTTZ:
@@ -3753,11 +3892,11 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
     if (Node->getOpcode() == ISD::CTTZ) {
       // FIXME: This should set a bit in the zero extended value instead.
-      Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT),
+      Tmp2 = DAG.getSetCC(dl, getSetCCResultType(NVT),
                           Tmp1, DAG.getConstant(NVT.getSizeInBits(), NVT),
                           ISD::SETEQ);
-      Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp2,
-                          DAG.getConstant(OVT.getSizeInBits(), NVT), Tmp1);
+      Tmp1 = DAG.getSelect(dl, NVT, Tmp2,
+                           DAG.getConstant(OVT.getSizeInBits(), NVT), Tmp1);
     } else if (Node->getOpcode() == ISD::CTLZ ||
                Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
       // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
@@ -3852,7 +3991,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
     Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
     Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
     // Perform the larger operation, then round down.
-    Tmp1 = DAG.getNode(ISD::SELECT, dl, NVT, Tmp1, Tmp2, Tmp3);
+    Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3);
     if (TruncOp != ISD::FP_ROUND)
       Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
     else
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index de217d8..ecf4c5d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -88,6 +88,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
     case ISD::FPOWI:       R = SoftenFloatRes_FPOWI(N); break;
     case ISD::FREM:        R = SoftenFloatRes_FREM(N); break;
     case ISD::FRINT:       R = SoftenFloatRes_FRINT(N); break;
+    case ISD::FROUND:      R = SoftenFloatRes_FROUND(N); break;
     case ISD::FSIN:        R = SoftenFloatRes_FSIN(N); break;
     case ISD::FSQRT:       R = SoftenFloatRes_FSQRT(N); break;
     case ISD::FSUB:        R = SoftenFloatRes_FSUB(N); break;
@@ -118,7 +119,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N,
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
   // Convert the inputs to integers, and build a new pair out of them.
-  return DAG.getNode(ISD::BUILD_PAIR, N->getDebugLoc(),
+  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N),
                      TLI.getTypeToTransformTo(*DAG.getContext(),
                                               N->getValueType(0)),
                      BitConvertToInteger(N->getOperand(0)),
@@ -133,7 +134,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(ConstantFPSDNode *N) {
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
   SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
                      NewOp.getValueType().getVectorElementType(),
                      NewOp, N->getOperand(1));
 }
@@ -147,7 +148,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
   API.clearBit(Size-1);
   SDValue Mask = DAG.getConstant(API, NVT);
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return DAG.getNode(ISD::AND, N->getDebugLoc(), NVT, Op, Mask);
+  return DAG.getNode(ISD::AND, SDLoc(N), NVT, Op, Mask);
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
@@ -160,7 +161,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
                                            RTLIB::ADD_F80,
                                            RTLIB::ADD_F128,
                                            RTLIB::ADD_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
@@ -172,13 +173,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
                                            RTLIB::CEIL_F80,
                                            RTLIB::CEIL_F128,
                                            RTLIB::CEIL_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
   SDValue LHS = GetSoftenedFloat(N->getOperand(0));
   SDValue RHS = BitConvertToInteger(N->getOperand(1));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   EVT LVT = LHS.getValueType();
   EVT RVT = RHS.getValueType();
@@ -226,7 +227,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
                                            RTLIB::COS_F80,
                                            RTLIB::COS_F128,
                                            RTLIB::COS_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
@@ -239,7 +240,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
                                            RTLIB::DIV_F80,
                                            RTLIB::DIV_F128,
                                            RTLIB::DIV_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
@@ -251,7 +252,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
                                            RTLIB::EXP_F80,
                                            RTLIB::EXP_F128,
                                            RTLIB::EXP_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
@@ -263,7 +264,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
                                            RTLIB::EXP2_F80,
                                            RTLIB::EXP2_F128,
                                            RTLIB::EXP2_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
@@ -275,7 +276,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
                                            RTLIB::FLOOR_F80,
                                            RTLIB::FLOOR_F128,
                                            RTLIB::FLOOR_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
@@ -287,7 +288,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
                                            RTLIB::LOG_F80,
                                            RTLIB::LOG_F128,
                                            RTLIB::LOG_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
@@ -299,7 +300,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
                                            RTLIB::LOG2_F80,
                                            RTLIB::LOG2_F128,
                                            RTLIB::LOG2_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
@@ -311,7 +312,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
                                            RTLIB::LOG10_F80,
                                            RTLIB::LOG10_F128,
                                            RTLIB::LOG10_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
@@ -325,7 +326,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
                                            RTLIB::FMA_F80,
                                            RTLIB::FMA_F128,
                                            RTLIB::FMA_PPCF128),
-                         NVT, Ops, 3, false, N->getDebugLoc());
+                         NVT, Ops, 3, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
@@ -338,7 +339,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
                                            RTLIB::MUL_F80,
                                            RTLIB::MUL_F128,
                                            RTLIB::MUL_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
@@ -350,7 +351,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
                                            RTLIB::NEARBYINT_F80,
                                            RTLIB::NEARBYINT_F128,
                                            RTLIB::NEARBYINT_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
@@ -364,7 +365,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
                                            RTLIB::SUB_F80,
                                            RTLIB::SUB_F128,
                                            RTLIB::SUB_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
@@ -372,7 +373,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
-  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, N->getDebugLoc());
+  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 // FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
@@ -381,7 +382,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP32(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Op = N->getOperand(0);
   return TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, NVT, &Op, 1, false,
-                         N->getDebugLoc());
+                         SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
@@ -389,7 +390,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
-  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, N->getDebugLoc());
+  return TLI.makeLibCall(DAG, LC, NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
@@ -402,7 +403,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
                                            RTLIB::POW_F80,
                                            RTLIB::POW_F128,
                                            RTLIB::POW_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
@@ -416,7 +417,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
                                            RTLIB::POWI_F80,
                                            RTLIB::POWI_F128,
                                            RTLIB::POWI_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
@@ -429,7 +430,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
                                            RTLIB::REM_F80,
                                            RTLIB::REM_F128,
                                            RTLIB::REM_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
@@ -441,7 +442,19 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
                                            RTLIB::RINT_F80,
                                            RTLIB::RINT_F128,
                                            RTLIB::RINT_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue Op = GetSoftenedFloat(N->getOperand(0));
+  return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+                                           RTLIB::ROUND_F32,
+                                           RTLIB::ROUND_F64,
+                                           RTLIB::ROUND_F80,
+                                           RTLIB::ROUND_F128,
+                                           RTLIB::ROUND_PPCF128),
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
@@ -453,7 +466,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
                                            RTLIB::SIN_F80,
                                            RTLIB::SIN_F128,
                                            RTLIB::SIN_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
@@ -465,7 +478,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
                                            RTLIB::SQRT_F80,
                                            RTLIB::SQRT_F128,
                                            RTLIB::SQRT_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
@@ -478,7 +491,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
                                            RTLIB::SUB_F80,
                                            RTLIB::SUB_F128,
                                            RTLIB::SUB_PPCF128),
-                         NVT, Ops, 2, false, N->getDebugLoc());
+                         NVT, Ops, 2, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
@@ -490,21 +503,22 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
                                            RTLIB::TRUNC_F80,
                                            RTLIB::TRUNC_F128,
                                            RTLIB::TRUNC_PPCF128),
-                         NVT, &Op, 1, false, N->getDebugLoc());
+                         NVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
   LoadSDNode *L = cast<LoadSDNode>(N);
   EVT VT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue NewL;
   if (L->getExtensionType() == ISD::NON_EXTLOAD) {
     NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(),
                        NVT, dl, L->getChain(), L->getBasePtr(), L->getOffset(),
-                       L->getPointerInfo(), NVT, L->isVolatile(), 
-                       L->isNonTemporal(), false, L->getAlignment());
+                       L->getPointerInfo(), NVT, L->isVolatile(),
+                       L->isNonTemporal(), false, L->getAlignment(),
+                       L->getTBAAInfo());
     // Legalized the chain result - switch anything that used the old chain to
     // use the new one.
     ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -516,7 +530,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
                      L->getMemoryVT(), dl, L->getChain(),
                      L->getBasePtr(), L->getOffset(), L->getPointerInfo(),
                      L->getMemoryVT(), L->isVolatile(),
-                     L->isNonTemporal(), false, L->getAlignment());
+                     L->isNonTemporal(), false, L->getAlignment(),
+                     L->getTBAAInfo());
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
   ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -526,14 +541,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
 SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
   SDValue LHS = GetSoftenedFloat(N->getOperand(1));
   SDValue RHS = GetSoftenedFloat(N->getOperand(2));
-  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
-                     LHS.getValueType(), N->getOperand(0),LHS,RHS);
+  return DAG.getSelect(SDLoc(N),
+                       LHS.getValueType(), N->getOperand(0), LHS, RHS);
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
   SDValue LHS = GetSoftenedFloat(N->getOperand(2));
   SDValue RHS = GetSoftenedFloat(N->getOperand(3));
-  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+  return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
                      LHS.getValueType(), N->getOperand(0),
                      N->getOperand(1), LHS, RHS, N->getOperand(4));
 }
@@ -548,7 +563,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) {
   SDValue Ptr = N->getOperand(1); // Get the pointer.
   EVT VT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue NewVAARG;
   NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2),
@@ -565,7 +580,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
   EVT SVT = N->getOperand(0).getValueType();
   EVT RVT = N->getValueType(0);
   EVT NVT = EVT();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // If the input is not legal, eg: i1 -> fp, then it needs to be promoted to
   // a larger type, eg: i8 -> fp.  Even if it is legal, no libcall may exactly
@@ -585,7 +600,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
                            NVT, N->getOperand(0));
   return TLI.makeLibCall(DAG, LC,
                          TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
-                         &Op, 1, false, dl);
+                         &Op, 1, false, dl).first;
 }
 
 
@@ -633,7 +648,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
-  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getValueType(0),
+  return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
                      GetSoftenedFloat(N->getOperand(0)));
 }
 
@@ -645,7 +660,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
 
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
@@ -655,7 +670,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
   EVT VT = NewLHS.getValueType();
   NewLHS = GetSoftenedFloat(NewLHS);
   NewRHS = GetSoftenedFloat(NewRHS);
-  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If softenSetCCOperands returned a scalar, we need to compare the result
   // against zero to select between true and false values.
@@ -676,7 +691,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_SINT(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
@@ -684,14 +699,14 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_UINT(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_FP32_TO_FP16(SDNode *N) {
   EVT RVT = N->getValueType(0);
   RTLIB::Libcall LC = RTLIB::FPROUND_F32_F16;
   SDValue Op = GetSoftenedFloat(N->getOperand(0));
-  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, N->getDebugLoc());
+  return TLI.makeLibCall(DAG, LC, RVT, &Op, 1, false, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
@@ -701,7 +716,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
   EVT VT = NewLHS.getValueType();
   NewLHS = GetSoftenedFloat(NewLHS);
   NewRHS = GetSoftenedFloat(NewRHS);
-  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If softenSetCCOperands returned a scalar, we need to compare the result
   // against zero to select between true and false values.
@@ -724,7 +739,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) {
   EVT VT = NewLHS.getValueType();
   NewLHS = GetSoftenedFloat(NewLHS);
   NewRHS = GetSoftenedFloat(NewRHS);
-  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If softenSetCCOperands returned a scalar, use it.
   if (NewRHS.getNode() == 0) {
@@ -744,7 +759,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
   assert(OpNo == 1 && "Can only soften the stored value!");
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Val = ST->getValue();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   if (ST->isTruncatingStore())
     // Do an FP_ROUND followed by a non-truncating store.
@@ -754,9 +769,7 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
     Val = GetSoftenedFloat(Val);
 
   return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
-                      ST->getPointerInfo(),
-                      ST->isVolatile(), ST->isNonTemporal(),
-                      ST->getAlignment());
+                      ST->getMemOperand());
 }
 
 
@@ -817,6 +830,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
   case ISD::FPOW:       ExpandFloatRes_FPOW(N, Lo, Hi); break;
   case ISD::FPOWI:      ExpandFloatRes_FPOWI(N, Lo, Hi); break;
   case ISD::FRINT:      ExpandFloatRes_FRINT(N, Lo, Hi); break;
+  case ISD::FROUND:     ExpandFloatRes_FROUND(N, Lo, Hi); break;
   case ISD::FSIN:       ExpandFloatRes_FSIN(N, Lo, Hi); break;
   case ISD::FSQRT:      ExpandFloatRes_FSQRT(N, Lo, Hi); break;
   case ISD::FSUB:       ExpandFloatRes_FSUB(N, Lo, Hi); break;
@@ -850,14 +864,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   assert(N->getValueType(0) == MVT::ppcf128 &&
          "Logic only correct for ppcf128!");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Tmp;
   GetExpandedFloat(N->getOperand(0), Lo, Tmp);
   Hi = DAG.getNode(ISD::FABS, dl, Tmp.getValueType(), Tmp);
   // Lo = Hi==fabs(Hi) ? Lo : -Lo;
-  Lo = DAG.getNode(ISD::SELECT_CC, dl, Lo.getValueType(), Tmp, Hi, Lo,
+  Lo = DAG.getSelectCC(dl, Tmp, Hi, Lo,
                    DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo),
-                   DAG.getCondCode(ISD::SETEQ));
+                   ISD::SETEQ);
 }
 
 void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
@@ -912,7 +926,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
                                                    RTLIB::DIV_F128,
                                                    RTLIB::DIV_PPCF128),
                                  N->getValueType(0), Ops, 2, false,
-                                 N->getDebugLoc());
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -986,7 +1000,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
                                                    RTLIB::FMA_F128,
                                                    RTLIB::FMA_PPCF128),
                                  N->getValueType(0), Ops, 3, false,
-                                 N->getDebugLoc());
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -1000,7 +1014,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
                                                    RTLIB::MUL_F128,
                                                    RTLIB::MUL_PPCF128),
                                  N->getValueType(0), Ops, 2, false,
-                                 N->getDebugLoc());
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -1018,7 +1032,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N,
 
 void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetExpandedFloat(N->getOperand(0), Lo, Hi);
   Lo = DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo);
   Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
@@ -1027,7 +1041,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  Hi = DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), NVT, N->getOperand(0));
+  Hi = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), NVT, N->getOperand(0));
   Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
                                  APInt(NVT.getSizeInBits(), 0)), NVT);
 }
@@ -1072,6 +1086,18 @@ void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
   GetPairElements(Call, Lo, Hi);
 }
 
+void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N,
+                                             SDValue &Lo, SDValue &Hi) {
+  SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+                                         RTLIB::ROUND_F32,
+                                         RTLIB::ROUND_F64,
+                                         RTLIB::ROUND_F80,
+                                         RTLIB::ROUND_F128,
+                                         RTLIB::ROUND_PPCF128),
+                            N, false);
+  GetPairElements(Call, Lo, Hi);
+}
+
 void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
                                            SDValue &Lo, SDValue &Hi) {
   SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
@@ -1102,7 +1128,7 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
                                                    RTLIB::SUB_F128,
                                                    RTLIB::SUB_PPCF128),
                                  N->getValueType(0), Ops, 2, false,
-                                 N->getDebugLoc());
+                                 SDLoc(N)).first;
   GetPairElements(Call, Lo, Hi);
 }
 
@@ -1127,15 +1153,14 @@ void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
   LoadSDNode *LD = cast<LoadSDNode>(N);
   SDValue Chain = LD->getChain();
   SDValue Ptr = LD->getBasePtr();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
   assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
 
   Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr,
-                      LD->getPointerInfo(), LD->getMemoryVT(), LD->isVolatile(),
-                      LD->isNonTemporal(), LD->getAlignment());
+                      LD->getMemoryVT(), LD->getMemOperand());
 
   // Remember the chain.
   Chain = Hi.getValue(1);
@@ -1157,7 +1182,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
   SDValue Src = N->getOperand(0);
   EVT SrcVT = Src.getValueType();
   bool isSigned = N->getOpcode() == ISD::SINT_TO_FP;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // First do an SINT_TO_FP, whether the original was signed or unsigned.
   // When promoting partial word types to i32 we must honor the signedness,
@@ -1181,7 +1206,7 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
     }
     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
 
-    Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl);
+    Hi = TLI.makeLibCall(DAG, LC, VT, &Src, 1, true, dl).first;
     GetPairElements(Hi, Lo, Hi);
   }
 
@@ -1216,8 +1241,8 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
                    DAG.getConstantFP(APFloat(APFloat::PPCDoubleDouble,
                                              APInt(128, Parts)),
                                      MVT::ppcf128));
-  Lo = DAG.getNode(ISD::SELECT_CC, dl, VT, Src, DAG.getConstant(0, SrcVT),
-                   Lo, Hi, DAG.getCondCode(ISD::SETLT));
+  Lo = DAG.getSelectCC(dl, Src, DAG.getConstant(0, SrcVT),
+                       Lo, Hi, ISD::SETLT);
   GetPairElements(Lo, Lo, Hi);
 }
 
@@ -1251,6 +1276,7 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
   case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
 
   case ISD::BR_CC:      Res = ExpandFloatOp_BR_CC(N); break;
+  case ISD::FCOPYSIGN:  Res = ExpandFloatOp_FCOPYSIGN(N); break;
   case ISD::FP_ROUND:   Res = ExpandFloatOp_FP_ROUND(N); break;
   case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
   case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
@@ -1280,7 +1306,7 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
 void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
                                                 SDValue &NewRHS,
                                                 ISD::CondCode &CCCode,
-                                                DebugLoc dl) {
+                                                SDLoc dl) {
   SDValue LHSLo, LHSHi, RHSLo, RHSHi;
   GetExpandedFloat(NewLHS, LHSLo, LHSHi);
   GetExpandedFloat(NewRHS, RHSLo, RHSHi);
@@ -1293,14 +1319,14 @@ void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
   //         FCMPU crN, lo1, lo2
   // The following can be improved, but not that much.
   SDValue Tmp1, Tmp2, Tmp3;
-  Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+  Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
                       LHSHi, RHSHi, ISD::SETOEQ);
-  Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+  Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()),
                       LHSLo, RHSLo, CCCode);
   Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
-  Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+  Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
                       LHSHi, RHSHi, ISD::SETUNE);
-  Tmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+  Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
                       LHSHi, RHSHi, CCCode);
   Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
   NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3);
@@ -1310,7 +1336,7 @@ void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
 SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
   SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
-  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If ExpandSetCCOperands returned a scalar, we need to compare the result
   // against zero to select between true and false values.
@@ -1325,19 +1351,30 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
                                 N->getOperand(4)), 0);
 }
 
+SDValue DAGTypeLegalizer::ExpandFloatOp_FCOPYSIGN(SDNode *N) {
+  assert(N->getOperand(1).getValueType() == MVT::ppcf128 &&
+         "Logic only correct for ppcf128!");
+  SDValue Lo, Hi;
+  GetExpandedFloat(N->getOperand(1), Lo, Hi);
+  // The ppcf128 value is providing only the sign; take it from the
+  // higher-order double (which must have the larger magnitude).
+  return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N),
+                     N->getValueType(0), N->getOperand(0), Hi);
+}
+
 SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
   assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
          "Logic only correct for ppcf128!");
   SDValue Lo, Hi;
   GetExpandedFloat(N->getOperand(0), Lo, Hi);
   // Round it the rest of the way (e.g. to f32) if needed.
-  return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(),
+  return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
                      N->getValueType(0), Hi, N->getOperand(1));
 }
 
 SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
   EVT RVT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
   // PPC (the libcall is not available).  FIXME: Do this in a less hacky way.
@@ -1353,12 +1390,12 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
 
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
-  return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl);
+  return TLI.makeLibCall(DAG, LC, RVT, &N->getOperand(0), 1, false, dl).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
   EVT RVT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
   // PPC (the libcall is not available).  FIXME: Do this in a less hacky way.
@@ -1370,29 +1407,29 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
     SDValue Tmp = DAG.getConstantFP(APF, MVT::ppcf128);
     //  X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
     // FIXME: generated code sucks.
-    return DAG.getNode(ISD::SELECT_CC, dl, MVT::i32, N->getOperand(0), Tmp,
-                       DAG.getNode(ISD::ADD, dl, MVT::i32,
-                                   DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
-                                               DAG.getNode(ISD::FSUB, dl,
-                                                           MVT::ppcf128,
-                                                           N->getOperand(0),
-                                                           Tmp)),
-                                   DAG.getConstant(0x80000000, MVT::i32)),
-                       DAG.getNode(ISD::FP_TO_SINT, dl,
-                                   MVT::i32, N->getOperand(0)),
-                       DAG.getCondCode(ISD::SETGE));
+    return DAG.getSelectCC(dl, N->getOperand(0), Tmp,
+                           DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                       DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
+                                                   DAG.getNode(ISD::FSUB, dl,
+                                                               MVT::ppcf128,
+                                                               N->getOperand(0),
+                                                               Tmp)),
+                                       DAG.getConstant(0x80000000, MVT::i32)),
+                           DAG.getNode(ISD::FP_TO_SINT, dl,
+                                       MVT::i32, N->getOperand(0)),
+                           ISD::SETGE);
   }
 
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
   return TLI.makeLibCall(DAG, LC, N->getValueType(0), &N->getOperand(0), 1,
-                         false, dl);
+                         false, dl).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
   SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
-  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If ExpandSetCCOperands returned a scalar, we need to compare the result
   // against zero to select between true and false values.
@@ -1410,7 +1447,7 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
 SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) {
   SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
-  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If ExpandSetCCOperands returned a scalar, use it.
   if (NewRHS.getNode() == 0) {
@@ -1444,8 +1481,6 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
   SDValue Lo, Hi;
   GetExpandedOp(ST->getValue(), Lo, Hi);
 
-  return DAG.getTruncStore(Chain, N->getDebugLoc(), Hi, Ptr,
-                           ST->getPointerInfo(),
-                           ST->getMemoryVT(), ST->isVolatile(),
-                           ST->isNonTemporal(), ST->getAlignment());
+  return DAG.getTruncStore(Chain, SDLoc(N), Hi, Ptr,
+                           ST->getMemoryVT(), ST->getMemOperand());
 }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index cd2f060..4255948 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -153,20 +153,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
 SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
   // Sign-extend the new bits, and continue the assertion.
   SDValue Op = SExtPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::AssertSext, N->getDebugLoc(),
+  return DAG.getNode(ISD::AssertSext, SDLoc(N),
                      Op.getValueType(), Op, N->getOperand(1));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) {
   // Zero the new bits, and continue the assertion.
   SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::AssertZext, N->getDebugLoc(),
+  return DAG.getNode(ISD::AssertZext, SDLoc(N),
                      Op.getValueType(), Op, N->getOperand(1));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
   EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+  SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N),
                               N->getMemoryVT(), ResVT,
                               N->getChain(), N->getBasePtr(),
                               N->getMemOperand(), N->getOrdering(),
@@ -179,7 +179,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
 
 SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
   SDValue Op2 = GetPromotedInteger(N->getOperand(2));
-  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+  SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N),
                               N->getMemoryVT(),
                               N->getChain(), N->getBasePtr(),
                               Op2, N->getMemOperand(), N->getOrdering(),
@@ -193,7 +193,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_Atomic2(AtomicSDNode *N) {
   SDValue Op2 = GetPromotedInteger(N->getOperand(2));
   SDValue Op3 = GetPromotedInteger(N->getOperand(3));
-  SDValue Res = DAG.getAtomic(N->getOpcode(), N->getDebugLoc(),
+  SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N),
                               N->getMemoryVT(), N->getChain(), N->getBasePtr(),
                               Op2, Op3, N->getMemOperand(), N->getOrdering(),
                               N->getSynchScope());
@@ -209,7 +209,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
   EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
   EVT OutVT = N->getValueType(0);
   EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   switch (getTypeAction(InVT)) {
   case TargetLowering::TypeLegal:
@@ -264,7 +264,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   EVT OVT = N->getValueType(0);
   EVT NVT = Op.getValueType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
   return DAG.getNode(ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
@@ -274,7 +274,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
   // The pair element type may be legal, or may not promote to the same type as
   // the result, for example i14 = BUILD_PAIR (i7, i7).  Handle all cases.
-  return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(),
+  return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N),
                      TLI.getTypeToTransformTo(*DAG.getContext(),
                      N->getValueType(0)), JoinIntegers(N->getOperand(0),
                      N->getOperand(1)));
@@ -283,7 +283,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
   EVT VT = N->getValueType(0);
   // FIXME there is no actual debug info here
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   // Zero extend things like i1, sign extend everything else.  It shouldn't
   // matter in theory which one we pick, but this tends to give better code?
   unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
@@ -301,7 +301,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) {
            CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) &&
           "can only promote integers");
   EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  return DAG.getConvertRndSat(OutVT, N->getDebugLoc(), N->getOperand(0),
+  return DAG.getConvertRndSat(OutVT, SDLoc(N), N->getOperand(0),
                               N->getOperand(1), N->getOperand(2),
                               N->getOperand(3), N->getOperand(4), CvtCode);
 }
@@ -309,7 +309,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
   // Zero extend to the promoted type and do the count there.
   SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT OVT = N->getValueType(0);
   EVT NVT = Op.getValueType();
   Op = DAG.getNode(N->getOpcode(), dl, NVT, Op);
@@ -322,14 +322,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) {
   // Zero extend to the promoted type and do the count there.
   SDValue Op = ZExtPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::CTPOP, N->getDebugLoc(), Op.getValueType(), Op);
+  return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   EVT OVT = N->getValueType(0);
   EVT NVT = Op.getValueType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   if (N->getOpcode() == ISD::CTTZ) {
     // The count is the same in the promoted type except if the original
     // value was zero.  This can be handled by setting the bit just off
@@ -342,7 +342,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, N->getOperand(0),
                      N->getOperand(1));
@@ -351,7 +351,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   unsigned NewOpc = N->getOpcode();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
   // not Legal, check to see if we can use FP_TO_SINT instead.  (If both UINT
@@ -374,7 +374,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
 
 SDValue DAGTypeLegalizer::PromoteIntRes_FP32_TO_FP16(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
 
@@ -384,7 +384,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP32_TO_FP16(SDNode *N) {
 
 SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   if (getTypeAction(N->getOperand(0).getValueType())
       == TargetLowering::TypePromoteInteger) {
@@ -415,11 +415,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   ISD::LoadExtType ExtType =
     ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
-                               N->getPointerInfo(),
-                               N->getMemoryVT(), N->isVolatile(),
-                               N->isNonTemporal(), N->getAlignment());
+                               N->getMemoryVT(), N->getMemOperand());
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
@@ -433,7 +431,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
   EVT ValueVTs[] = { N->getValueType(0), NVT };
   SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
-  SDValue Res = DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
                             DAG.getVTList(ValueVTs, 2), Ops, 2);
 
   // Modified the sum result - switch anything that used the old sum to use
@@ -453,7 +451,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
   SDValue RHS = SExtPromotedInteger(N->getOperand(1));
   EVT OVT = N->getOperand(0).getValueType();
   EVT NVT = LHS.getValueType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Do the arithmetic in the larger type.
   unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB;
@@ -476,15 +474,15 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SDIV(SDNode *N) {
   // Sign extend the input.
   SDValue LHS = SExtPromotedInteger(N->getOperand(0));
   SDValue RHS = SExtPromotedInteger(N->getOperand(1));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
                      LHS.getValueType(), LHS, RHS);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) {
   SDValue LHS = GetPromotedInteger(N->getOperand(1));
   SDValue RHS = GetPromotedInteger(N->getOperand(2));
-  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
-                     LHS.getValueType(), N->getOperand(0),LHS,RHS);
+  return DAG.getSelect(SDLoc(N),
+                       LHS.getValueType(), N->getOperand(0), LHS, RHS);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
@@ -492,23 +490,23 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
   EVT OpTy = N->getOperand(1).getValueType();
 
   // Promote all the way up to the canonical SetCC type.
-  Mask = PromoteTargetBoolean(Mask, TLI.getSetCCResultType(OpTy));
+  Mask = PromoteTargetBoolean(Mask, getSetCCResultType(OpTy));
   SDValue LHS = GetPromotedInteger(N->getOperand(1));
   SDValue RHS = GetPromotedInteger(N->getOperand(2));
-  return DAG.getNode(ISD::VSELECT, N->getDebugLoc(),
+  return DAG.getNode(ISD::VSELECT, SDLoc(N),
                      LHS.getValueType(), Mask, LHS, RHS);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
   SDValue LHS = GetPromotedInteger(N->getOperand(2));
   SDValue RHS = GetPromotedInteger(N->getOperand(3));
-  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+  return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
                      LHS.getValueType(), N->getOperand(0),
                      N->getOperand(1), LHS, RHS, N->getOperand(4));
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
-  EVT SVT = TLI.getSetCCResultType(N->getOperand(0).getValueType());
+  EVT SVT = getSetCCResultType(N->getOperand(0).getValueType());
 
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
 
@@ -517,13 +515,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
   if (!TLI.isTypeLegal(SVT))
     SVT = NVT;
 
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() &&
          "Vector compare must return a vector result!");
 
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if (LHS.getValueType() != RHS.getValueType()) {
+    if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger &&
+        !LHS.getValueType().isVector())
+      LHS = GetPromotedInteger(LHS);
+    if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger &&
+        !RHS.getValueType().isVector())
+      RHS = GetPromotedInteger(RHS);
+  }
+
   // Get the SETCC result using the canonical SETCC type.
-  SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, N->getOperand(0),
-                              N->getOperand(1), N->getOperand(2));
+  SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS,
+                              N->getOperand(2));
 
   assert(NVT.bitsLE(SVT) && "Integer type overpromoted?");
   // Convert to the expected type.
@@ -534,12 +543,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
   SDValue Res = GetPromotedInteger(N->getOperand(0));
   SDValue Amt = N->getOperand(1);
   Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt;
-  return DAG.getNode(ISD::SHL, N->getDebugLoc(), Res.getValueType(), Res, Amt);
+  return DAG.getNode(ISD::SHL, SDLoc(N), Res.getValueType(), Res, Amt);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(),
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N),
                      Op.getValueType(), Op, N->getOperand(1));
 }
 
@@ -549,7 +558,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
   // that too is okay if they are integer operations.
   SDValue LHS = GetPromotedInteger(N->getOperand(0));
   SDValue RHS = GetPromotedInteger(N->getOperand(1));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
                      LHS.getValueType(), LHS, RHS);
 }
 
@@ -558,7 +567,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
   SDValue Res = SExtPromotedInteger(N->getOperand(0));
   SDValue Amt = N->getOperand(1);
   Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt;
-  return DAG.getNode(ISD::SRA, N->getDebugLoc(), Res.getValueType(), Res, Amt);
+  return DAG.getNode(ISD::SRA, SDLoc(N), Res.getValueType(), Res, Amt);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
@@ -566,14 +575,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
   SDValue Res = ZExtPromotedInteger(N->getOperand(0));
   SDValue Amt = N->getOperand(1);
   Amt = Amt.getValueType().isVector() ? ZExtPromotedInteger(Amt) : Amt;
-  return DAG.getNode(ISD::SRL, N->getDebugLoc(), Res.getValueType(), Res, Amt);
+  return DAG.getNode(ISD::SRL, SDLoc(N), Res.getValueType(), Res, Amt);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue Res;
   SDValue InOp = N->getOperand(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   switch (getTypeAction(InOp.getValueType())) {
   default: llvm_unreachable("Unknown type action!");
@@ -618,7 +627,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
   SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
   EVT OVT = N->getOperand(0).getValueType();
   EVT NVT = LHS.getValueType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Do the arithmetic in the larger type.
   unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB;
@@ -642,7 +651,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
     return PromoteIntRes_Overflow(N);
 
   SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT SmallVT = LHS.getValueType();
 
   // To determine if the result overflowed in a larger type, we extend the
@@ -690,7 +699,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UDIV(SDNode *N) {
   // Zero extend the input.
   SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
   SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
                      LHS.getValueType(), LHS, RHS);
 }
 
@@ -703,7 +712,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
   SDValue Chain = N->getOperand(0); // Get the chain.
   SDValue Ptr = N->getOperand(1); // Get the pointer.
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   MVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT);
   unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT);
@@ -847,12 +856,12 @@ void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
 
 SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0), Op);
+  return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
   SDValue Op2 = GetPromotedInteger(N->getOperand(2));
-  return DAG.getAtomic(N->getOpcode(), N->getDebugLoc(), N->getMemoryVT(),
+  return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(),
                        N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(),
                        N->getOrdering(), N->getSynchScope());
 }
@@ -881,7 +890,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
   assert(OpNo == 1 && "only know how to promote condition");
 
   // Promote all the way up to the canonical SetCC type.
-  EVT SVT = TLI.getSetCCResultType(MVT::Other);
+  EVT SVT = getSetCCResultType(MVT::Other);
   SDValue Cond = PromoteTargetBoolean(N->getOperand(1), SVT);
 
   // The chain (Op#0) and basic block destination (Op#2) are always legal types.
@@ -895,7 +904,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
   SDValue Lo = ZExtPromotedInteger(N->getOperand(0));
   SDValue Hi = GetPromotedInteger(N->getOperand(1));
   assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
                    DAG.getConstant(OVT.getSizeInBits(), TLI.getPointerTy()));
@@ -908,7 +917,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
   // type does not have a strange size (eg: it is not i1).
   EVT VecVT = N->getValueType(0);
   unsigned NumElts = VecVT.getVectorNumElements();
-  assert(!(NumElts & 1) && "Legal vector of one illegal element?");
+  assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) &&
+		 "Legal vector of one illegal element?");
 
   // Promote the inserted value.  The type does not need to match the
   // vector element type.  Check that any extra bits introduced will be
@@ -931,7 +941,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) {
            CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) &&
            "can only promote integer arguments");
   SDValue InOp = GetPromotedInteger(N->getOperand(0));
-  return DAG.getConvertRndSat(N->getValueType(0), N->getDebugLoc(), InOp,
+  return DAG.getConvertRndSat(N->getValueType(0), SDLoc(N), InOp,
                               N->getOperand(1), N->getOperand(2),
                               N->getOperand(3), N->getOperand(4), CvtCode);
 }
@@ -955,7 +965,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N,
   assert(OpNo == 2 && "Different operand and result vector types?");
 
   // Promote the index.
-  SDValue Idx = ZExtPromotedInteger(N->getOperand(2));
+  SDValue Idx = DAG.getZExtOrTrunc(N->getOperand(2), SDLoc(N),
+                                   TLI.getVectorIdxTy());
   return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
                                 N->getOperand(1), Idx), 0);
 }
@@ -973,7 +984,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
   EVT OpTy = N->getOperand(1).getValueType();
 
   // Promote all the way up to the canonical SetCC type.
-  EVT SVT = TLI.getSetCCResultType(N->getOpcode() == ISD::SELECT ?
+  EVT SVT = getSetCCResultType(N->getOpcode() == ISD::SELECT ?
                                    OpTy.getScalarType() : OpTy);
   Cond = PromoteTargetBoolean(Cond, SVT);
 
@@ -1011,7 +1022,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
 
 SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
   return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(),
                      Op, DAG.getValueType(N->getOperand(0).getValueType()));
@@ -1025,22 +1036,18 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
   assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
   SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
-  unsigned Alignment = N->getAlignment();
-  bool isVolatile = N->isVolatile();
-  bool isNonTemporal = N->isNonTemporal();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue Val = GetPromotedInteger(N->getValue());  // Get promoted value.
 
   // Truncate the value and store the result.
-  return DAG.getTruncStore(Ch, dl, Val, Ptr, N->getPointerInfo(),
-                           N->getMemoryVT(),
-                           isVolatile, isNonTemporal, Alignment);
+  return DAG.getTruncStore(Ch, dl, Val, Ptr,
+                           N->getMemoryVT(), N->getMemOperand());
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
-  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), Op);
+  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) {
@@ -1049,7 +1056,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
   return DAG.getZeroExtendInReg(Op, dl,
@@ -1127,7 +1134,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
   case ISD::ATOMIC_LOAD_MAX:
   case ISD::ATOMIC_LOAD_UMIN:
   case ISD::ATOMIC_LOAD_UMAX:
-  case ISD::ATOMIC_SWAP: {
+  case ISD::ATOMIC_SWAP:
+  case ISD::ATOMIC_CMP_SWAP: {
     std::pair<SDValue, SDValue> Tmp = ExpandAtomic(N);
     SplitInteger(Tmp.first, Lo, Hi);
     ReplaceValueWith(SDValue(N, 1), Tmp.second);
@@ -1180,6 +1188,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_2; break;
     case MVT::i32: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_4; break;
     case MVT::i64: LC = RTLIB::SYNC_LOCK_TEST_AND_SET_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_LOCK_TEST_AND_SET_16;break;
     }
     break;
   case ISD::ATOMIC_CMP_SWAP:
@@ -1189,6 +1198,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2; break;
     case MVT::i32: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4; break;
     case MVT::i64: LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_ADD:
@@ -1198,6 +1208,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_ADD_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_ADD_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_ADD_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_ADD_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_SUB:
@@ -1207,6 +1218,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_SUB_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_SUB_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_SUB_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_SUB_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_AND:
@@ -1216,6 +1228,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_AND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_AND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_AND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_AND_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_OR:
@@ -1225,6 +1238,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_OR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_OR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_OR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_OR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_XOR:
@@ -1234,6 +1248,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_XOR_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_XOR_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_XOR_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_XOR_16;break;
     }
     break;
   case ISD::ATOMIC_LOAD_NAND:
@@ -1243,6 +1258,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
     case MVT::i16: LC = RTLIB::SYNC_FETCH_AND_NAND_2; break;
     case MVT::i32: LC = RTLIB::SYNC_FETCH_AND_NAND_4; break;
     case MVT::i64: LC = RTLIB::SYNC_FETCH_AND_NAND_8; break;
+    case MVT::i128:LC = RTLIB::SYNC_FETCH_AND_NAND_16;break;
     }
     break;
   }
@@ -1254,7 +1270,7 @@ std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
 /// and the shift amount is a constant 'Amt'.  Expand the operation.
 void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, unsigned Amt,
                                              SDValue &Lo, SDValue &Hi) {
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   // Expand the incoming operand to be shifted, so that we have its parts
   SDValue InL, InH;
   GetExpandedInteger(N->getOperand(0), InL, InH);
@@ -1352,7 +1368,7 @@ ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
   unsigned NVTBits = NVT.getScalarType().getSizeInBits();
   assert(isPowerOf2_32(NVTBits) &&
          "Expanded integer type size not a power of two!");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits));
   APInt KnownZero, KnownOne;
@@ -1439,7 +1455,7 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
   unsigned NVTBits = NVT.getSizeInBits();
   assert(isPowerOf2_32(NVTBits) &&
          "Expanded integer type size not a power of two!");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Get the incoming operand to be shifted.
   SDValue InL, InH;
@@ -1448,7 +1464,7 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
   SDValue NVBitsNode = DAG.getConstant(NVTBits, ShTy);
   SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode);
   SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
-  SDValue isShort = DAG.getSetCC(dl, TLI.getSetCCResultType(ShTy),
+  SDValue isShort = DAG.getSetCC(dl, getSetCCResultType(ShTy),
                                  Amt, NVBitsNode, ISD::SETULT);
 
   SDValue LoS, HiS, LoL, HiL;
@@ -1467,8 +1483,8 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
     LoL = DAG.getConstant(0, NVT);                        // Lo part is zero.
     HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part.
 
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL);
     return true;
   case ISD::SRL:
     // Short: ShAmt < NVTBits
@@ -1483,8 +1499,8 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
     HiL = DAG.getConstant(0, NVT);                        // Hi part is zero.
     LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part.
 
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL);
     return true;
   case ISD::SRA:
     // Short: ShAmt < NVTBits
@@ -1500,15 +1516,15 @@ ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
                       DAG.getConstant(NVTBits-1, ShTy));
     LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part.
 
-    Lo = DAG.getNode(ISD::SELECT, dl, NVT, isShort, LoS, LoL);
-    Hi = DAG.getNode(ISD::SELECT, dl, NVT, isShort, HiS, HiL);
+    Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL);
+    Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL);
     return true;
   }
 }
 
 void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
                                            SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   // Expand the subcomponents.
   SDValue LHSL, LHSH, RHSL, RHSH;
   GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
@@ -1545,25 +1561,25 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
   if (N->getOpcode() == ISD::ADD) {
     Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps, 2);
     Hi = DAG.getNode(ISD::ADD, dl, NVT, HiOps, 2);
-    SDValue Cmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[0],
+    SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0],
                                 ISD::SETULT);
-    SDValue Carry1 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp1,
-                                 DAG.getConstant(1, NVT),
-                                 DAG.getConstant(0, NVT));
-    SDValue Cmp2 = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo, LoOps[1],
+    SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1,
+                                   DAG.getConstant(1, NVT),
+                                   DAG.getConstant(0, NVT));
+    SDValue Cmp2 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[1],
                                 ISD::SETULT);
-    SDValue Carry2 = DAG.getNode(ISD::SELECT, dl, NVT, Cmp2,
-                                 DAG.getConstant(1, NVT), Carry1);
+    SDValue Carry2 = DAG.getSelect(dl, NVT, Cmp2,
+                                   DAG.getConstant(1, NVT), Carry1);
     Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2);
   } else {
     Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps, 2);
     Hi = DAG.getNode(ISD::SUB, dl, NVT, HiOps, 2);
     SDValue Cmp =
-      DAG.getSetCC(dl, TLI.getSetCCResultType(LoOps[0].getValueType()),
+      DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()),
                    LoOps[0], LoOps[1], ISD::SETULT);
-    SDValue Borrow = DAG.getNode(ISD::SELECT, dl, NVT, Cmp,
-                                 DAG.getConstant(1, NVT),
-                                 DAG.getConstant(0, NVT));
+    SDValue Borrow = DAG.getSelect(dl, NVT, Cmp,
+                                   DAG.getConstant(1, NVT),
+                                   DAG.getConstant(0, NVT));
     Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow);
   }
 }
@@ -1572,7 +1588,7 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N,
                                             SDValue &Lo, SDValue &Hi) {
   // Expand the subcomponents.
   SDValue LHSL, LHSH, RHSL, RHSH;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
   GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
   SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
@@ -1598,7 +1614,7 @@ void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
                                             SDValue &Lo, SDValue &Hi) {
   // Expand the subcomponents.
   SDValue LHSL, LHSH, RHSL, RHSH;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
   GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
   SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
@@ -1623,7 +1639,7 @@ void DAGTypeLegalizer::ExpandIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
 void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
                                                SDValue &Lo, SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Op = N->getOperand(0);
   if (Op.getValueType().bitsLE(NVT)) {
     // The low part is any extension of the input (which degenerates to a copy).
@@ -1645,7 +1661,7 @@ void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
 
 void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
                                                SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetExpandedInteger(N->getOperand(0), Lo, Hi);
   EVT NVT = Lo.getValueType();
   EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
@@ -1666,7 +1682,7 @@ void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
 
 void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N,
                                                SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetExpandedInteger(N->getOperand(0), Lo, Hi);
   EVT NVT = Lo.getValueType();
   EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
@@ -1686,7 +1702,7 @@ void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N,
 
 void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
                                           SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetExpandedInteger(N->getOperand(0), Hi, Lo);  // Note swapped operands.
   Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo);
   Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
@@ -1703,26 +1719,26 @@ void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
 
 void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
                                          SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32)
   GetExpandedInteger(N->getOperand(0), Lo, Hi);
   EVT NVT = Lo.getValueType();
 
-  SDValue HiNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Hi,
+  SDValue HiNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi,
                                    DAG.getConstant(0, NVT), ISD::SETNE);
 
   SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo);
   SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi);
 
-  Lo = DAG.getNode(ISD::SELECT, dl, NVT, HiNotZero, HiLZ,
-                   DAG.getNode(ISD::ADD, dl, NVT, LoLZ,
-                               DAG.getConstant(NVT.getSizeInBits(), NVT)));
+  Lo = DAG.getSelect(dl, NVT, HiNotZero, HiLZ,
+                     DAG.getNode(ISD::ADD, dl, NVT, LoLZ,
+                                 DAG.getConstant(NVT.getSizeInBits(), NVT)));
   Hi = DAG.getConstant(0, NVT);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
                                           SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
   GetExpandedInteger(N->getOperand(0), Lo, Hi);
   EVT NVT = Lo.getValueType();
@@ -1733,42 +1749,44 @@ void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
 
 void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
                                          SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32)
   GetExpandedInteger(N->getOperand(0), Lo, Hi);
   EVT NVT = Lo.getValueType();
 
-  SDValue LoNotZero = DAG.getSetCC(dl, TLI.getSetCCResultType(NVT), Lo,
+  SDValue LoNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo,
                                    DAG.getConstant(0, NVT), ISD::SETNE);
 
   SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo);
   SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi);
 
-  Lo = DAG.getNode(ISD::SELECT, dl, NVT, LoNotZero, LoLZ,
-                   DAG.getNode(ISD::ADD, dl, NVT, HiLZ,
-                               DAG.getConstant(NVT.getSizeInBits(), NVT)));
+  Lo = DAG.getSelect(dl, NVT, LoNotZero, LoLZ,
+                     DAG.getNode(ISD::ADD, dl, NVT, HiLZ,
+                                 DAG.getConstant(NVT.getSizeInBits(), NVT)));
   Hi = DAG.getConstant(0, NVT);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
                                                SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/, dl),
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, true/*irrelevant*/,
+                               dl).first,
                Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
                                                SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SDValue Op = N->getOperand(0);
   RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/, dl),
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, &Op, 1, false/*irrelevant*/,
+                               dl).first,
                Lo, Hi);
 }
 
@@ -1790,7 +1808,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
   bool isVolatile = N->isVolatile();
   bool isNonTemporal = N->isNonTemporal();
   bool isInvariant = N->isInvariant();
-  DebugLoc dl = N->getDebugLoc();
+  const MDNode *TBAAInfo = N->getTBAAInfo();
+  SDLoc dl(N);
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
 
@@ -1798,7 +1817,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
     EVT MemVT = N->getMemoryVT();
 
     Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
-                        MemVT, isVolatile, isNonTemporal, Alignment);
+                        MemVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     // Remember the chain.
     Ch = Lo.getValue(1);
@@ -1820,7 +1839,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
   } else if (TLI.isLittleEndian()) {
     // Little-endian - low bits are at low addresses.
     Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
-                     isVolatile, isNonTemporal, isInvariant, Alignment);
+                     isVolatile, isNonTemporal, isInvariant, Alignment,
+                     TBAAInfo);
 
     unsigned ExcessBits =
       N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -1829,11 +1849,11 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
     // Increment the pointer to the other half.
     unsigned IncrementSize = NVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getIntPtrConstant(IncrementSize));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
                         N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
                         isVolatile, isNonTemporal,
-                        MinAlign(Alignment, IncrementSize));
+                        MinAlign(Alignment, IncrementSize), TBAAInfo);
 
     // Build a factor node to remember that this load is independent of the
     // other one.
@@ -1851,17 +1871,17 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
     Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
                         EVT::getIntegerVT(*DAG.getContext(),
                                           MemVT.getSizeInBits() - ExcessBits),
-                        isVolatile, isNonTemporal, Alignment);
+                        isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     // Increment the pointer to the other half.
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getIntPtrConstant(IncrementSize));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     // Load the rest of the low bits.
     Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
                         N->getPointerInfo().getWithOffset(IncrementSize),
                         EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
                         isVolatile, isNonTemporal,
-                        MinAlign(Alignment, IncrementSize));
+                        MinAlign(Alignment, IncrementSize), TBAAInfo);
 
     // Build a factor node to remember that this load is independent of the
     // other one.
@@ -1889,7 +1909,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
 
 void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N,
                                             SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue LL, LH, RL, RH;
   GetExpandedInteger(N->getOperand(0), LL, LH);
   GetExpandedInteger(N->getOperand(1), RL, RH);
@@ -1901,7 +1921,7 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
                                         SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, NVT);
   bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, NVT);
@@ -1984,7 +2004,8 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported MUL!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/, dl),
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true/*irrelevant*/,
+                               dl).first,
                Lo, Hi);
 }
 
@@ -1992,7 +2013,7 @@ void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
                                              SDValue &Lo, SDValue &Hi) {
   SDValue LHS = Node->getOperand(0);
   SDValue RHS = Node->getOperand(1);
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
 
   // Expand the result by simply replacing it with the equivalent
   // non-overflow-checking operation.
@@ -2033,7 +2054,7 @@ void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
 void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
                                          SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   if (VT == MVT::i16)
@@ -2047,13 +2068,13 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
                                           SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // If we can emit an efficient shift operation, do so now.  Check to see if
   // the RHS is a constant.
@@ -2142,7 +2163,8 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
 
   if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
     SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-    SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl), Lo, Hi);
+    SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, isSigned, dl).first, Lo,
+                 Hi);
     return;
   }
 
@@ -2153,7 +2175,7 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
 void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
                                                 SDValue &Lo, SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Op = N->getOperand(0);
   if (Op.getValueType().bitsLE(NVT)) {
     // The low part is sign extension of the input (degenerates to a copy).
@@ -2183,7 +2205,7 @@ void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
 
 void DAGTypeLegalizer::
 ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetExpandedInteger(N->getOperand(0), Lo, Hi);
   EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
 
@@ -2211,7 +2233,7 @@ ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
 void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
                                          SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   if (VT == MVT::i16)
@@ -2225,13 +2247,13 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, true, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
                                              SDValue &Lo, SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
   Hi = DAG.getNode(ISD::SRL, dl,
                    N->getOperand(0).getValueType(), N->getOperand(0),
@@ -2243,7 +2265,7 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
                                              SDValue &Lo, SDValue &Hi) {
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Expand the result by simply replacing it with the equivalent
   // non-overflow-checking operation.
@@ -2265,7 +2287,7 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
 void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
                                           SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // A divide for UMULO should be faster than a function call.
   if (N->getOpcode() == ISD::UMULO) {
@@ -2276,16 +2298,16 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
 
     // A divide for UMULO will be faster than a function call. Select to
     // make sure we aren't using 0.
-    SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
+    SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(VT),
                                   RHS, DAG.getConstant(0, VT), ISD::SETEQ);
-    SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero,
-                                  DAG.getConstant(1, VT), RHS);
+    SDValue NotZero = DAG.getSelect(dl, VT, isZero,
+                                    DAG.getConstant(1, VT), RHS);
     SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero);
     SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS,
                                     ISD::SETNE);
-    Overflow = DAG.getNode(ISD::SELECT, dl, N->getValueType(1), isZero,
-                           DAG.getConstant(0, N->getValueType(1)),
-                           Overflow);
+    Overflow = DAG.getSelect(dl, N->getValueType(1), isZero,
+                             DAG.getConstant(0, N->getValueType(1)),
+                             Overflow);
     ReplaceValueWith(SDValue(N, 1), Overflow);
     return;
   }
@@ -2293,7 +2315,7 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
   Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
   EVT PtrVT = TLI.getPointerTy();
   Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
-  
+
   // Replace this with a libcall that will check overflow.
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   if (VT == MVT::i32)
@@ -2351,7 +2373,7 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
 void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
                                          SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   if (VT == MVT::i16)
@@ -2365,13 +2387,13 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
                                          SDValue &Lo, SDValue &Hi) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
   if (VT == MVT::i16)
@@ -2385,13 +2407,13 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
 
   SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl), Lo, Hi);
+  SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, 2, false, dl).first, Lo, Hi);
 }
 
 void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
                                                 SDValue &Lo, SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Op = N->getOperand(0);
   if (Op.getValueType().bitsLE(NVT)) {
     // The low part is zero extension of the input (degenerates to a copy).
@@ -2418,7 +2440,7 @@ void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
 
 void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N,
                                                 SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = cast<AtomicSDNode>(N)->getMemoryVT();
   SDValue Zero = DAG.getConstant(0, VT);
   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl, VT,
@@ -2498,7 +2520,7 @@ bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
 void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
                                                   SDValue &NewRHS,
                                                   ISD::CondCode &CCCode,
-                                                  DebugLoc dl) {
+                                                  SDLoc dl) {
   SDValue LHSLo, LHSHi, RHSLo, RHSHi;
   GetExpandedInteger(NewLHS, LHSLo, LHSHi);
   GetExpandedInteger(NewRHS, RHSLo, RHSHi);
@@ -2555,16 +2577,16 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
   // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3)
   TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true, NULL);
   SDValue Tmp1, Tmp2;
-  Tmp1 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSLo.getValueType()),
+  Tmp1 = TLI.SimplifySetCC(getSetCCResultType(LHSLo.getValueType()),
                            LHSLo, RHSLo, LowCC, false, DagCombineInfo, dl);
   if (!Tmp1.getNode())
-    Tmp1 = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSLo.getValueType()),
+    Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()),
                         LHSLo, RHSLo, LowCC);
-  Tmp2 = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+  Tmp2 = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()),
                            LHSHi, RHSHi, CCCode, false, DagCombineInfo, dl);
   if (!Tmp2.getNode())
     Tmp2 = DAG.getNode(ISD::SETCC, dl,
-                       TLI.getSetCCResultType(LHSHi.getValueType()),
+                       getSetCCResultType(LHSHi.getValueType()),
                        LHSHi, RHSHi, DAG.getCondCode(CCCode));
 
   ConstantSDNode *Tmp1C = dyn_cast<ConstantSDNode>(Tmp1.getNode());
@@ -2584,21 +2606,21 @@ void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
     return;
   }
 
-  NewLHS = TLI.SimplifySetCC(TLI.getSetCCResultType(LHSHi.getValueType()),
+  NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()),
                              LHSHi, RHSHi, ISD::SETEQ, false,
                              DagCombineInfo, dl);
   if (!NewLHS.getNode())
-    NewLHS = DAG.getSetCC(dl, TLI.getSetCCResultType(LHSHi.getValueType()),
+    NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
                           LHSHi, RHSHi, ISD::SETEQ);
-  NewLHS = DAG.getNode(ISD::SELECT, dl, Tmp1.getValueType(),
-                       NewLHS, Tmp1, Tmp2);
+  NewLHS = DAG.getSelect(dl, Tmp1.getValueType(),
+                         NewLHS, Tmp1, Tmp2);
   NewRHS = SDValue();
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) {
   SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
-  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If ExpandSetCCOperands returned a scalar, we need to compare the result
   // against zero to select between true and false values.
@@ -2616,7 +2638,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) {
 SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) {
   SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
-  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If ExpandSetCCOperands returned a scalar, we need to compare the result
   // against zero to select between true and false values.
@@ -2634,7 +2656,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) {
 SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
   SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
   ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
-  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, N->getDebugLoc());
+  IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
 
   // If ExpandSetCCOperands returned a scalar, use it.
   if (NewRHS.getNode() == 0) {
@@ -2672,7 +2694,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Don't know how to expand this SINT_TO_FP!");
-  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, N->getDebugLoc());
+  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, SDLoc(N)).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -2689,7 +2711,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
   unsigned Alignment = N->getAlignment();
   bool isVolatile = N->isVolatile();
   bool isNonTemporal = N->isNonTemporal();
-  DebugLoc dl = N->getDebugLoc();
+  const MDNode *TBAAInfo = N->getTBAAInfo();
+  SDLoc dl(N);
   SDValue Lo, Hi;
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
@@ -2698,7 +2721,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
     GetExpandedInteger(N->getValue(), Lo, Hi);
     return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
                              N->getMemoryVT(), isVolatile, isNonTemporal,
-                             Alignment);
+                             Alignment, TBAAInfo);
   }
 
   if (TLI.isLittleEndian()) {
@@ -2706,7 +2729,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
     GetExpandedInteger(N->getValue(), Lo, Hi);
 
     Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
-                      isVolatile, isNonTemporal, Alignment);
+                      isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     unsigned ExcessBits =
       N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -2715,11 +2738,11 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
     // Increment the pointer to the other half.
     unsigned IncrementSize = NVT.getSizeInBits()/8;
     Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                      DAG.getIntPtrConstant(IncrementSize));
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
     Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
                            N->getPointerInfo().getWithOffset(IncrementSize),
                            NEVT, isVolatile, isNonTemporal,
-                           MinAlign(Alignment, IncrementSize));
+                           MinAlign(Alignment, IncrementSize), TBAAInfo);
     return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
   }
 
@@ -2747,17 +2770,17 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
 
   // Store both the high bits and maybe some of the low bits.
   Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(),
-                         HiVT, isVolatile, isNonTemporal, Alignment);
+                         HiVT, isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
   // Increment the pointer to the other half.
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   // Store the lowest ExcessBits bits in the second half.
   Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
                          N->getPointerInfo().getWithOffset(IncrementSize),
                          EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
                          isVolatile, isNonTemporal,
-                         MinAlign(Alignment, IncrementSize));
+                         MinAlign(Alignment, IncrementSize), TBAAInfo);
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
 }
 
@@ -2765,14 +2788,14 @@ SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) {
   SDValue InL, InH;
   GetExpandedInteger(N->getOperand(0), InL, InH);
   // Just truncate the low part of the source.
-  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), N->getValueType(0), InL);
+  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), InL);
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
   SDValue Op = N->getOperand(0);
   EVT SrcVT = Op.getValueType();
   EVT DstVT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // The following optimization is valid only if every value in SrcVT (when
   // treated as signed) is representable in DstVT.  Check that the mantissa
@@ -2806,7 +2829,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
     SDValue Lo, Hi;
     GetExpandedInteger(Op, Lo, Hi);
     SDValue SignSet = DAG.getSetCC(dl,
-                                   TLI.getSetCCResultType(Hi.getValueType()),
+                                   getSetCCResultType(Hi.getValueType()),
                                    Hi, DAG.getConstant(0, Hi.getValueType()),
                                    ISD::SETLT);
 
@@ -2819,10 +2842,11 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
     SDValue Zero = DAG.getIntPtrConstant(0);
     SDValue Four = DAG.getIntPtrConstant(4);
     if (TLI.isBigEndian()) std::swap(Zero, Four);
-    SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
-                                 Zero, Four);
+    SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet,
+                                   Zero, Four);
     unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
-    FudgePtr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), FudgePtr, Offset);
+    FudgePtr = DAG.getNode(ISD::ADD, dl, FudgePtr.getValueType(),
+                           FudgePtr, Offset);
     Alignment = std::min(Alignment, 4u);
 
     // Load the value out, extending it from f32 to the destination float type.
@@ -2839,11 +2863,11 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
   RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
   assert(LC != RTLIB::UNKNOWN_LIBCALL &&
          "Don't know how to expand this UINT_TO_FP!");
-  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl);
+  return TLI.makeLibCall(DAG, LC, DstVT, &Op, 1, true, dl).first;
 }
 
 SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                cast<AtomicSDNode>(N)->getMemoryVT(),
                                N->getOperand(0),
@@ -2865,7 +2889,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
   unsigned OutNumElems = OutVT.getVectorNumElements();
   EVT NOutVTElem = NOutVT.getVectorElementType();
 
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue BaseIdx = N->getOperand(1);
 
   SmallVector<SDValue, 8> Ops;
@@ -2874,7 +2898,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
 
     // Extract the element from the original vector.
     SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(),
-      BaseIdx, DAG.getIntPtrConstant(i));
+      BaseIdx, DAG.getConstant(i, BaseIdx.getValueType()));
     SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
       InVT.getVectorElementType(), N->getOperand(0), Index);
 
@@ -2890,7 +2914,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
 SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
   ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   unsigned NumElts = VT.getVectorNumElements();
   SmallVector<int, 8> NewMask;
@@ -2913,12 +2937,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
   unsigned NumElems = N->getNumOperands();
   EVT NOutVTElem = NOutVT.getVectorElementType();
 
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SmallVector<SDValue, 8> Ops;
   Ops.reserve(NumElems);
   for (unsigned i = 0; i != NumElems; ++i) {
-    SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i));
+    SDValue Op;
+    // BUILD_VECTOR integer operand types are allowed to be larger than the
+    // result's element type. This may still be true after the promotion. For
+    // example, we might be promoting (<v?i1> = BV <i32>, <i32>, ...) to
+    // (v?i16 = BV <i32>, <i32>, ...), and we can't any_extend <i32> to <i16>.
+    if (N->getOperand(i).getValueType().bitsLT(NOutVTElem))
+      Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i));
+    else
+      Op = N->getOperand(i);
     Ops.push_back(Op);
   }
 
@@ -2927,7 +2959,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
 
 SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
 
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   assert(!N->getOperand(0).getValueType().isVector() &&
          "Input must be a scalar");
@@ -2943,7 +2975,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   EVT OutVT = N->getValueType(0);
   EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
@@ -2964,7 +2996,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
     SDValue Op = N->getOperand(i);
     for (unsigned j = 0; j < NumElem; ++j) {
       SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                                InElemTy, Op, DAG.getIntPtrConstant(j));
+                                InElemTy, Op, DAG.getConstant(j,
+                                              TLI.getVectorIdxTy()));
       Ops[i * NumElem + j] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext);
     }
   }
@@ -2979,7 +3012,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
 
   EVT NOutVTElem = NOutVT.getVectorElementType();
 
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue V0 = GetPromotedInteger(N->getOperand(0));
 
   SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl,
@@ -2989,9 +3022,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue V0 = GetPromotedInteger(N->getOperand(0));
-  SDValue V1 = N->getOperand(1);
+  SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl, TLI.getVectorIdxTy());
   SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
     V0->getValueType(0).getScalarType(), V0, V1);
 
@@ -3002,7 +3035,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned NumElems = N->getNumOperands();
 
   EVT RetSclrTy = N->getValueType(0).getVectorElementType();
@@ -3019,7 +3052,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
     for (unsigned i=0; i<NumElem; ++i) {
       // Extract element from incoming vector
       SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy,
-      Incoming, DAG.getIntPtrConstant(i));
+      Incoming, DAG.getConstant(i, TLI.getVectorIdxTy()));
       SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
       NewOps.push_back(Tr);
     }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index b6436bf..eb13230 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -615,7 +615,10 @@ void DAGTypeLegalizer::RemapValue(SDValue &N) {
     // replaced with other values.
     RemapValue(I->second);
     N = I->second;
-    assert(N.getNode()->getNodeId() != NewNode && "Mapped to new node!");
+
+    // Note that it is possible to have N.getNode()->getNodeId() == NewNode at
+    // this point because it is possible for a node to be put in the map before
+    // being processed.
   }
 }
 
@@ -735,9 +738,6 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
   SDValue &OpEntry = PromotedIntegers[Op];
   assert(OpEntry.getNode() == 0 && "Node is already promoted!");
   OpEntry = Result;
-
-  // Propagate node ordering
-  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
 }
 
 void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
@@ -749,9 +749,6 @@ void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
   SDValue &OpEntry = SoftenedFloats[Op];
   assert(OpEntry.getNode() == 0 && "Node is already converted to integer!");
   OpEntry = Result;
-
-  // Propagate node ordering
-  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
 }
 
 void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
@@ -766,9 +763,6 @@ void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
   SDValue &OpEntry = ScalarizedVectors[Op];
   assert(OpEntry.getNode() == 0 && "Node is already scalarized!");
   OpEntry = Result;
-
-  // Propagate node ordering
-  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
 }
 
 void DAGTypeLegalizer::GetExpandedInteger(SDValue Op, SDValue &Lo,
@@ -796,10 +790,6 @@ void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
   assert(Entry.first.getNode() == 0 && "Node already expanded");
   Entry.first = Lo;
   Entry.second = Hi;
-
-  // Propagate ordering
-  DAG.AssignOrdering(Lo.getNode(), DAG.GetOrdering(Op.getNode()));
-  DAG.AssignOrdering(Hi.getNode(), DAG.GetOrdering(Op.getNode()));
 }
 
 void DAGTypeLegalizer::GetExpandedFloat(SDValue Op, SDValue &Lo,
@@ -827,10 +817,6 @@ void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
   assert(Entry.first.getNode() == 0 && "Node already expanded");
   Entry.first = Lo;
   Entry.second = Hi;
-
-  // Propagate ordering
-  DAG.AssignOrdering(Lo.getNode(), DAG.GetOrdering(Op.getNode()));
-  DAG.AssignOrdering(Hi.getNode(), DAG.GetOrdering(Op.getNode()));
 }
 
 void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
@@ -860,10 +846,6 @@ void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
   assert(Entry.first.getNode() == 0 && "Node already split");
   Entry.first = Lo;
   Entry.second = Hi;
-
-  // Propagate ordering
-  DAG.AssignOrdering(Lo.getNode(), DAG.GetOrdering(Op.getNode()));
-  DAG.AssignOrdering(Hi.getNode(), DAG.GetOrdering(Op.getNode()));
 }
 
 void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
@@ -875,9 +857,6 @@ void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
   SDValue &OpEntry = WidenedVectors[Op];
   assert(OpEntry.getNode() == 0 && "Node already widened!");
   OpEntry = Result;
-
-  // Propagate node ordering
-  DAG.AssignOrdering(Result.getNode(), DAG.GetOrdering(Op.getNode()));
 }
 
 
@@ -888,7 +867,7 @@ void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
 /// BitConvertToInteger - Convert to an integer of the same size.
 SDValue DAGTypeLegalizer::BitConvertToInteger(SDValue Op) {
   unsigned BitWidth = Op.getValueType().getSizeInBits();
-  return DAG.getNode(ISD::BITCAST, Op.getDebugLoc(),
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op),
                      EVT::getIntegerVT(*DAG.getContext(), BitWidth), Op);
 }
 
@@ -899,13 +878,13 @@ SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
   unsigned EltWidth = Op.getValueType().getVectorElementType().getSizeInBits();
   EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
   unsigned NumElts = Op.getValueType().getVectorNumElements();
-  return DAG.getNode(ISD::BITCAST, Op.getDebugLoc(),
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op),
                      EVT::getVectorVT(*DAG.getContext(), EltNVT, NumElts), Op);
 }
 
 SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
                                                EVT DestVT) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   // Create the stack frame object.  Make sure it is aligned for both
   // the source and destination types.
   SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
@@ -945,8 +924,6 @@ bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
          "Custom lowering returned the wrong number of results!");
   for (unsigned i = 0, e = Results.size(); i != e; ++i) {
     ReplaceValueWith(SDValue(N, i), Results[i]);
-    // Propagate node ordering
-    DAG.AssignOrdering(Results[i].getNode(), DAG.GetOrdering(N));
   }
   return true;
 }
@@ -981,25 +958,11 @@ SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
   return SDValue(N->getOperand(ResNo));
 }
 
-/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
-/// which is split into two not necessarily identical pieces.
-void DAGTypeLegalizer::GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT) {
-  // Currently all types are split in half.
-  if (!InVT.isVector()) {
-    LoVT = HiVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
-  } else {
-    unsigned NumElements = InVT.getVectorNumElements();
-    assert(!(NumElements & 1) && "Splitting vector, but not in half!");
-    LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
-                                   InVT.getVectorElementType(), NumElements/2);
-  }
-}
-
 /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
 /// high parts of the given value.
 void DAGTypeLegalizer::GetPairElements(SDValue Pair,
                                        SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = Pair.getDebugLoc();
+  SDLoc dl(Pair);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Pair.getValueType());
   Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
                    DAG.getIntPtrConstant(0));
@@ -1009,12 +972,9 @@ void DAGTypeLegalizer::GetPairElements(SDValue Pair,
 
 SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
                                                   SDValue Index) {
-  DebugLoc dl = Index.getDebugLoc();
+  SDLoc dl(Index);
   // Make sure the index type is big enough to compute in.
-  if (Index.getValueType().bitsGT(TLI.getPointerTy()))
-    Index = DAG.getNode(ISD::TRUNCATE, dl, TLI.getPointerTy(), Index);
-  else
-    Index = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Index);
+  Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy());
 
   // Calculate the element offset and add it to the pointer.
   unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
@@ -1026,9 +986,9 @@ SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
 
 /// JoinIntegers - Build an integer with low bits Lo and high bits Hi.
 SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
-  // Arbitrarily use dlHi for result DebugLoc
-  DebugLoc dlHi = Hi.getDebugLoc();
-  DebugLoc dlLo = Lo.getDebugLoc();
+  // Arbitrarily use dlHi for result SDLoc
+  SDLoc dlHi(Hi);
+  SDLoc dlLo(Lo);
   EVT LVT = Lo.getValueType();
   EVT HVT = Hi.getValueType();
   EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
@@ -1045,22 +1005,25 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
 SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
                                      bool isSigned) {
   unsigned NumOps = N->getNumOperands();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   if (NumOps == 0) {
-    return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned, dl);
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), 0, 0, isSigned,
+                           dl).first;
   } else if (NumOps == 1) {
     SDValue Op = N->getOperand(0);
-    return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned, dl);
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), &Op, 1, isSigned,
+                           dl).first;
   } else if (NumOps == 2) {
     SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
-    return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned, dl);
+    return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, 2, isSigned,
+                           dl).first;
   }
   SmallVector<SDValue, 8> Ops(NumOps);
   for (unsigned i = 0; i < NumOps; ++i)
     Ops[i] = N->getOperand(i);
 
   return TLI.makeLibCall(DAG, LC, N->getValueType(0),
-                         &Ops[0], NumOps, isSigned, dl);
+                         &Ops[0], NumOps, isSigned, dl).first;
 }
 
 // ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
@@ -1090,7 +1053,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
   CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, false,
                     0, TLI.getLibcallCallingConv(LC), /*isTailCall=*/false,
                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, Node->getDebugLoc());
+                    Callee, Args, DAG, SDLoc(Node));
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
   return CallInfo;
@@ -1100,7 +1063,7 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC,
 /// of the given type.  A target boolean is an integer value, not necessarily of
 /// type i1, the bits of which conform to getBooleanContents.
 SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT VT) {
-  DebugLoc dl = Bool.getDebugLoc();
+  SDLoc dl(Bool);
   ISD::NodeType ExtendCode =
     TargetLowering::getExtendForContent(TLI.getBooleanContents(VT.isVector()));
   return DAG.getNode(ExtendCode, dl, VT, Bool);
@@ -1111,7 +1074,7 @@ SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT VT) {
 void DAGTypeLegalizer::SplitInteger(SDValue Op,
                                     EVT LoVT, EVT HiVT,
                                     SDValue &Lo, SDValue &Hi) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
          Op.getValueType().getSizeInBits() && "Invalid integer splitting!");
   Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 1c4274a..13bb08f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1,4 +1,4 @@
-//===-- LegalizeTypes.h - Definition of the DAG Type Legalizer class ------===//
+//===-- LegalizeTypes.h - DAG Type Legalizer class definition ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -73,6 +73,10 @@ private:
     return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
   }
 
+  EVT getSetCCResultType(EVT VT) const {
+    return TLI.getSetCCResultType(*DAG.getContext(), VT);
+  }
+
   /// IgnoreNodeResults - Pretend all of this node's results are legal.
   bool IgnoreNodeResults(SDNode *N) const {
     return N->getOpcode() == ISD::TargetConstant;
@@ -195,7 +199,7 @@ private:
   /// final size.
   SDValue SExtPromotedInteger(SDValue Op) {
     EVT OldVT = Op.getValueType();
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
     Op = GetPromotedInteger(Op);
     return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op,
                        DAG.getValueType(OldVT));
@@ -205,7 +209,7 @@ private:
   /// final size.
   SDValue ZExtPromotedInteger(SDValue Op) {
     EVT OldVT = Op.getValueType();
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
     Op = GetPromotedInteger(Op);
     return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType());
   }
@@ -357,7 +361,7 @@ private:
   SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N);
 
   void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
-                                  ISD::CondCode &CCCode, DebugLoc dl);
+                                  ISD::CondCode &CCCode, SDLoc dl);
 
   //===--------------------------------------------------------------------===//
   // Float to Integer Conversion Support: LegalizeFloatTypes.cpp
@@ -406,6 +410,7 @@ private:
   SDValue SoftenFloatRes_FPOWI(SDNode *N);
   SDValue SoftenFloatRes_FREM(SDNode *N);
   SDValue SoftenFloatRes_FRINT(SDNode *N);
+  SDValue SoftenFloatRes_FROUND(SDNode *N);
   SDValue SoftenFloatRes_FSIN(SDNode *N);
   SDValue SoftenFloatRes_FSQRT(SDNode *N);
   SDValue SoftenFloatRes_FSUB(SDNode *N);
@@ -466,6 +471,7 @@ private:
   void ExpandFloatRes_FPOWI     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FREM      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FRINT     (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void ExpandFloatRes_FROUND    (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSIN      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSQRT     (SDNode *N, SDValue &Lo, SDValue &Hi);
   void ExpandFloatRes_FSUB      (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -476,6 +482,7 @@ private:
   // Float Operand Expansion.
   bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
   SDValue ExpandFloatOp_BR_CC(SDNode *N);
+  SDValue ExpandFloatOp_FCOPYSIGN(SDNode *N);
   SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
   SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N);
   SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N);
@@ -484,7 +491,7 @@ private:
   SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo);
 
   void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
-                                ISD::CondCode &CCCode, DebugLoc dl);
+                                ISD::CondCode &CCCode, SDLoc dl);
 
   //===--------------------------------------------------------------------===//
   // Scalarization Support: LegalizeVectorTypes.cpp
@@ -530,7 +537,7 @@ private:
   // Vector Operand Scalarization: <1 x ty> -> ty.
   bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
   SDValue ScalarizeVecOp_BITCAST(SDNode *N);
-  SDValue ScalarizeVecOp_EXTEND(SDNode *N);
+  SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
   SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
@@ -554,6 +561,7 @@ private:
   void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
 
   void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -624,6 +632,7 @@ private:
 
   SDValue WidenVecRes_Ternary(SDNode *N);
   SDValue WidenVecRes_Binary(SDNode *N);
+  SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
   SDValue WidenVecRes_Convert(SDNode *N);
   SDValue WidenVecRes_POWI(SDNode *N);
   SDValue WidenVecRes_Shift(SDNode *N);
@@ -649,7 +658,7 @@ private:
   /// loads to load a vector with a resulting wider type. It takes
   ///   LdChain: list of chains for the load to be generated.
   ///   Ld:      load to widen
-  SDValue GenWidenVectorLoads(SmallVector<SDValue, 16>& LdChain,
+  SDValue GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
                               LoadSDNode *LD);
 
   /// GenWidenVectorExtLoads - Helper function to generate a set of extension
@@ -657,20 +666,20 @@ private:
   ///   LdChain: list of chains for the load to be generated.
   ///   Ld:      load to widen
   ///   ExtType: extension element type
-  SDValue GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
+  SDValue GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
                                  LoadSDNode *LD, ISD::LoadExtType ExtType);
 
   /// Helper genWidenVectorStores - Helper function to generate a set of
   /// stores to store a widen vector into non widen memory
   ///   StChain: list of chains for the stores we have generated
   ///   ST:      store of a widen value
-  void GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, StoreSDNode *ST);
+  void GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST);
 
   /// Helper genWidenVectorTruncStores - Helper function to generate a set of
   /// stores to store a truncate widen vector into non widen memory
   ///   StChain: list of chains for the stores we have generated
   ///   ST:      store of a widen value
-  void GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+  void GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
                                  StoreSDNode *ST);
 
   /// Modifies a vector input (widen or narrows) to a vector of NVT.  The
@@ -695,10 +704,6 @@ private:
       GetExpandedFloat(Op, Lo, Hi);
   }
 
-  /// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
-  /// which is split (or expanded) into two not necessarily identical pieces.
-  void GetSplitDestVTs(EVT InVT, EVT &LoVT, EVT &HiVT);
-
   /// GetPairElements - Use ISD::EXTRACT_ELEMENT nodes to extract the low and
   /// high parts of the given value.
   void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
@@ -726,6 +731,12 @@ private:
       GetExpandedFloat(Op, Lo, Hi);
   }
 
+
+  /// This function will split the integer \p Op into \p NumElements
+  /// operations of type \p EltVT and store them in \p Ops.
+  void IntegerToVector(SDValue Op, unsigned NumElements,
+                       SmallVectorImpl<SDValue> &Ops, EVT EltVT);
+
   // Generic Result Expansion.
   void ExpandRes_MERGE_VALUES      (SDNode *N, unsigned ResNo,
                                     SDValue &Lo, SDValue &Hi);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 222d1c0..c749fde 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -41,7 +41,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
   EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
   SDValue InOp = N->getOperand(0);
   EVT InVT = InOp.getValueType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Handle some special cases efficiently.
   switch (getTypeAction(InVT)) {
@@ -77,12 +77,9 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
     case TargetLowering::TypeWidenVector: {
       assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
       InOp = GetWidenedVector(InOp);
-      EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
-                                   InVT.getVectorNumElements()/2);
-      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
-                       DAG.getIntPtrConstant(0));
-      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, InOp,
-                       DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+      EVT LoVT, HiVT;
+      llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
+      llvm::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
       if (TLI.isBigEndian())
         std::swap(Lo, Hi);
       Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
@@ -115,7 +112,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
       SmallVector<SDValue, 8> Vals;
       for (unsigned i = 0; i < NumElems; ++i)
         Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
-                                   CastInOp, DAG.getIntPtrConstant(i)));
+                                   CastInOp, DAG.getConstant(i,
+                                             TLI.getVectorIdxTy())));
 
       // Build Lo, Hi pair by pairing extracted elements if needed.
       unsigned Slot = 0;
@@ -161,13 +159,14 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
                                false, false, 0);
 
   // Load the first half from the stack slot.
-  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, 
+  Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo,
                    false, false, false, 0);
 
   // Increment the pointer to the other half.
   unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
   StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                         DAG.getConstant(IncrementSize,
+                                         StackPtr.getValueType()));
 
   // Load the second half from the stack slot.
   Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
@@ -203,7 +202,7 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   SDValue OldVec = N->getOperand(0);
   unsigned OldElts = OldVec.getValueType().getVectorNumElements();
   EVT OldEltVT = OldVec.getValueType().getVectorElementType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Convert to a vector of the expanded element type, for example
   // <3 x i64> -> <6 x i32>.
@@ -227,10 +226,6 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   // Extract the elements at 2 * Idx and 2 * Idx + 1 from the new vector.
   SDValue Idx = N->getOperand(1);
 
-  // Make sure the type of Idx is big enough to hold the new values.
-  if (Idx.getValueType().bitsLT(TLI.getPointerTy()))
-    Idx = DAG.getNode(ISD::ZERO_EXTEND, dl, TLI.getPointerTy(), Idx);
-
   Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
   Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
 
@@ -245,7 +240,7 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
                                             SDValue &Hi) {
   assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   LoadSDNode *LD = cast<LoadSDNode>(N);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
@@ -255,20 +250,22 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
   bool isInvariant = LD->isInvariant();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
 
   Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
-                   isVolatile, isNonTemporal, isInvariant, Alignment);
+                   isVolatile, isNonTemporal, isInvariant, Alignment,
+                   TBAAInfo);
 
   // Increment the pointer to the other half.
   unsigned IncrementSize = NVT.getSizeInBits() / 8;
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
                    LD->getPointerInfo().getWithOffset(IncrementSize),
                    isVolatile, isNonTemporal, isInvariant,
-                   MinAlign(Alignment, IncrementSize));
+                   MinAlign(Alignment, IncrementSize), TBAAInfo);
 
   // Build a factor node to remember that this load is independent of the
   // other one.
@@ -289,7 +286,7 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
   SDValue Chain = N->getOperand(0);
   SDValue Ptr = N->getOperand(1);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   const unsigned Align = N->getConstantOperandVal(3);
 
   Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align);
@@ -309,29 +306,54 @@ void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
 // Generic Operand Expansion.
 //===--------------------------------------------------------------------===//
 
+void DAGTypeLegalizer::IntegerToVector(SDValue Op, unsigned NumElements,
+                                       SmallVectorImpl<SDValue> &Ops,
+                                       EVT EltVT) {
+  assert(Op.getValueType().isInteger());
+  SDLoc DL(Op);
+  SDValue Parts[2];
+
+  if (NumElements > 1) {
+    NumElements >>= 1;
+    SplitInteger(Op, Parts[0], Parts[1]);
+      if (TLI.isBigEndian())
+        std::swap(Parts[0], Parts[1]);
+    IntegerToVector(Parts[0], NumElements, Ops, EltVT);
+    IntegerToVector(Parts[1], NumElements, Ops, EltVT);
+  } else {
+    Ops.push_back(DAG.getNode(ISD::BITCAST, DL, EltVT, Op));
+  }
+}
+
 SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   if (N->getValueType(0).isVector()) {
     // An illegal expanding type is being converted to a legal vector type.
     // Make a two element vector out of the expanded parts and convert that
     // instead, but only if the new vector type is legal (otherwise there
     // is no point, and it might create expansion loops).  For example, on
     // x86 this turns v1i64 = BITCAST i64 into v1i64 = BITCAST v2i32.
+    //
+    // FIXME: I'm not sure why we are first trying to split the input into
+    // a 2 element vector, so I'm leaving it here to maintain the current
+    // behavior.
+    unsigned NumElts = 2;
     EVT OVT = N->getOperand(0).getValueType();
     EVT NVT = EVT::getVectorVT(*DAG.getContext(),
                                TLI.getTypeToTransformTo(*DAG.getContext(), OVT),
-                               2);
+                               NumElts);
+    if (!isTypeLegal(NVT)) {
+      // If we can't find a legal type by splitting the integer in half,
+      // then we can use the node's value type.
+      NumElts = N->getValueType(0).getVectorNumElements();
+      NVT = N->getValueType(0);
+    }
 
-    if (isTypeLegal(NVT)) {
-      SDValue Parts[2];
-      GetExpandedOp(N->getOperand(0), Parts[0], Parts[1]);
+    SmallVector<SDValue, 8> Ops;
+    IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType());
 
-      if (TLI.isBigEndian())
-        std::swap(Parts[0], Parts[1]);
-
-      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Parts, 2);
-      return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
-    }
+    SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &Ops[0], NumElts);
+    return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
   }
 
   // Otherwise, store to a temporary and load out again as the new type.
@@ -344,7 +366,7 @@ SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
   unsigned NumElts = VecVT.getVectorNumElements();
   EVT OldVT = N->getOperand(0).getValueType();
   EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   assert(OldVT == VecVT.getVectorElementType() &&
          "BUILD_VECTOR operand type doesn't match vector element type!");
@@ -382,7 +404,7 @@ SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
   // The vector type is legal but the element type needs expansion.
   EVT VecVT = N->getValueType(0);
   unsigned NumElts = VecVT.getVectorNumElements();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue Val = N->getOperand(1);
   EVT OldEVT = Val.getValueType();
@@ -406,7 +428,8 @@ SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
   Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
   NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
   Idx = DAG.getNode(ISD::ADD, dl,
-                    Idx.getValueType(), Idx, DAG.getIntPtrConstant(1));
+                    Idx.getValueType(), Idx,
+                    DAG.getConstant(1, Idx.getValueType()));
   NewVec =  DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
 
   // Convert the new vector to the old vector type.
@@ -414,7 +437,7 @@ SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
   assert(VT.getVectorElementType() == N->getOperand(0).getValueType() &&
          "SCALAR_TO_VECTOR operand type doesn't match vector element type!");
@@ -430,7 +453,7 @@ SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
 SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
   assert(ISD::isNormalStore(N) && "This routine only for normal stores!");
   assert(OpNo == 1 && "Can only expand the stored value so far");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   StoreSDNode *St = cast<StoreSDNode>(N);
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
@@ -440,6 +463,7 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
   unsigned Alignment = St->getAlignment();
   bool isVolatile = St->isVolatile();
   bool isNonTemporal = St->isNonTemporal();
+  const MDNode *TBAAInfo = St->getTBAAInfo();
 
   assert(NVT.isByteSized() && "Expanded type not byte sized!");
   unsigned IncrementSize = NVT.getSizeInBits() / 8;
@@ -451,15 +475,14 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
     std::swap(Lo, Hi);
 
   Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
-                    isVolatile, isNonTemporal, Alignment);
+                    isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
-  assert(isTypeLegal(Ptr.getValueType()) && "Pointers must be legal!");
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   Hi = DAG.getStore(Chain, dl, Hi, Ptr,
                     St->getPointerInfo().getWithOffset(IncrementSize),
                     isVolatile, isNonTemporal,
-                    MinAlign(Alignment, IncrementSize));
+                    MinAlign(Alignment, IncrementSize), TBAAInfo);
 
   return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
 }
@@ -483,21 +506,19 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
 void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
                                        SDValue &Hi) {
   SDValue LL, LH, RL, RH, CL, CH;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetSplitOp(N->getOperand(1), LL, LH);
   GetSplitOp(N->getOperand(2), RL, RH);
 
   SDValue Cond = N->getOperand(0);
   CL = CH = Cond;
   if (Cond.getValueType().isVector()) {
-    assert(Cond.getValueType().getVectorElementType() == MVT::i1 &&
-           "Condition legalized before result?");
-    unsigned NumElements = Cond.getValueType().getVectorNumElements();
-    EVT VCondTy = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElements / 2);
-    CL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
-                     DAG.getIntPtrConstant(0));
-    CH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VCondTy, Cond,
-                     DAG.getIntPtrConstant(NumElements / 2));
+    // Check if there are already splitted versions of the vector available and
+    // use those instead of splitting the mask operand again.
+    if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
+      GetSplitVector(Cond, CL, CH);
+    else
+      llvm::tie(CL, CH) = DAG.SplitVector(Cond, dl);
   }
 
   Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
@@ -507,7 +528,7 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
                                           SDValue &Hi) {
   SDValue LL, LH, RL, RH;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetSplitOp(N->getOperand(2), LL, LH);
   GetSplitOp(N->getOperand(3), RL, RH);
 
@@ -519,7 +540,7 @@ void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
 
 void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   Lo = DAG.getUNDEF(LoVT);
   Hi = DAG.getUNDEF(HiVT);
 }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index c6e066e..2c3cdcc 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -171,7 +171,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
         return TranslateLegalizeResults(Op, Result);
       case TargetLowering::Custom:
         Changed = true;
-        return LegalizeOp(TLI.LowerOperation(Result, DAG));
+        return TranslateLegalizeResults(Op, TLI.LowerOperation(Result, DAG));
       case TargetLowering::Expand:
         Changed = true;
         return LegalizeOp(ExpandStore(Op));
@@ -227,6 +227,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FP_TO_UINT:
   case ISD::FNEG:
   case ISD::FABS:
+  case ISD::FCOPYSIGN:
   case ISD::FSQRT:
   case ISD::FSIN:
   case ISD::FCOS:
@@ -241,6 +242,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
   case ISD::FTRUNC:
   case ISD::FRINT:
   case ISD::FNEARBYINT:
+  case ISD::FROUND:
   case ISD::FFLOOR:
   case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
@@ -320,7 +322,7 @@ SDValue VectorLegalizer::PromoteVectorOp(SDValue Op) {
   assert(Op.getNode()->getNumValues() == 1 &&
          "Can't promote a vector with multiple results!");
   MVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SmallVector<SDValue, 4> Operands(Op.getNumOperands());
 
   for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
@@ -357,7 +359,7 @@ SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) {
   // Build a new vector type and check if it is legal.
   MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SmallVector<SDValue, 4> Operands(Op.getNumOperands());
 
   unsigned Opc = Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND :
@@ -375,7 +377,7 @@ SDValue VectorLegalizer::PromoteVectorOpINT_TO_FP(SDValue Op) {
 
 
 SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
   SDValue Chain = LD->getChain();
   SDValue BasePTR = LD->getBasePtr();
@@ -416,7 +418,8 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
         ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
                                  LD->getPointerInfo().getWithOffset(Offset),
                                  LD->isVolatile(), LD->isNonTemporal(),
-                                 LD->isInvariant(), LD->getAlignment());
+                                 LD->isInvariant(), LD->getAlignment(),
+                                 LD->getTBAAInfo());
       } else {
         EVT LoadVT = WideVT;
         while (RemainingBytes < LoadBytes) {
@@ -426,13 +429,14 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
         ScalarLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
                                     LD->getPointerInfo().getWithOffset(Offset),
                                     LoadVT, LD->isVolatile(),
-                                    LD->isNonTemporal(), LD->getAlignment());
+                                    LD->isNonTemporal(), LD->getAlignment(),
+                                    LD->getTBAAInfo());
       }
 
       RemainingBytes -= LoadBytes;
       Offset += LoadBytes;
       BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
-                            DAG.getIntPtrConstant(LoadBytes));
+                            DAG.getConstant(LoadBytes, BasePTR.getValueType()));
 
       LoadVals.push_back(ScalarLoad.getValue(0));
       LoadChains.push_back(ScalarLoad.getValue(1));
@@ -497,10 +501,10 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
                 Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
                 SrcVT.getScalarType(),
                 LD->isVolatile(), LD->isNonTemporal(),
-                LD->getAlignment());
+                LD->getAlignment(), LD->getTBAAInfo());
 
       BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
-                         DAG.getIntPtrConstant(Stride));
+                         DAG.getConstant(Stride, BasePTR.getValueType()));
 
       Vals.push_back(ScalarLoad.getValue(0));
       LoadChains.push_back(ScalarLoad.getValue(1));
@@ -519,7 +523,7 @@ SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
 }
 
 SDValue VectorLegalizer::ExpandStore(SDValue Op) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
   SDValue Chain = ST->getChain();
   SDValue BasePTR = ST->getBasePtr();
@@ -529,6 +533,7 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
   unsigned Alignment = ST->getAlignment();
   bool isVolatile = ST->isVolatile();
   bool isNonTemporal = ST->isNonTemporal();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
 
   unsigned NumElem = StVT.getVectorNumElements();
   // The type of the data we want to save
@@ -551,15 +556,15 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
   SmallVector<SDValue, 8> Stores;
   for (unsigned Idx = 0; Idx < NumElem; Idx++) {
     SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-               RegSclVT, Value, DAG.getIntPtrConstant(Idx));
+               RegSclVT, Value, DAG.getConstant(Idx, TLI.getVectorIdxTy()));
 
     // This scalar TruncStore may be illegal, but we legalize it later.
     SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
                ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
-               isVolatile, isNonTemporal, Alignment);
+               isVolatile, isNonTemporal, Alignment, TBAAInfo);
 
     BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
-                                DAG.getIntPtrConstant(Stride));
+                               DAG.getConstant(Stride, BasePTR.getValueType()));
 
     Stores.push_back(Store);
   }
@@ -572,9 +577,9 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
 SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
   // Lower a select instruction where the condition is a scalar and the
   // operands are vectors. Lower this select to VSELECT and implement it
-  // using XOR AND OR. The selector bit is broadcasted. 
+  // using XOR AND OR. The selector bit is broadcasted.
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   SDValue Mask = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
@@ -597,15 +602,12 @@ SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
     return DAG.UnrollVectorOp(Op.getNode());
 
   // Generate a mask operand.
-  EVT MaskTy = TLI.getSetCCResultType(VT);
-  assert(MaskTy.isVector() && "Invalid CC type");
-  assert(MaskTy.getSizeInBits() == Op1.getValueType().getSizeInBits()
-         && "Invalid mask size");
+  EVT MaskTy = VT.changeVectorElementTypeToInteger();
 
   // What is the size of each element in the vector mask.
   EVT BitTy = MaskTy.getScalarType();
 
-  Mask = DAG.getNode(ISD::SELECT, DL, BitTy, Mask,
+  Mask = DAG.getSelect(DL, BitTy, Mask,
           DAG.getConstant(APInt::getAllOnesValue(BitTy.getSizeInBits()), BitTy),
           DAG.getConstant(0, BitTy));
 
@@ -637,7 +639,7 @@ SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
       TLI.getOperationAction(ISD::SHL, VT) == TargetLowering::Expand)
     return DAG.UnrollVectorOp(Op.getNode());
 
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT OrigTy = cast<VTSDNode>(Op->getOperand(1))->getVT();
 
   unsigned BW = VT.getScalarType().getSizeInBits();
@@ -652,13 +654,14 @@ SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
 SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
   // Implement VSELECT in terms of XOR, AND, OR
   // on platforms which do not support blend natively.
-  EVT VT =  Op.getOperand(0).getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   SDValue Mask = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue Op2 = Op.getOperand(2);
 
+  EVT VT = Mask.getValueType();
+
   // If we can't even use the basic vector operations of
   // AND,OR,XOR, we will have to scalarize the op.
   // Notice that the operation may be 'promoted' which means that it is
@@ -673,8 +676,12 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
       TargetLowering::ZeroOrNegativeOneBooleanContent)
     return DAG.UnrollVectorOp(Op.getNode());
 
-  assert(VT.getSizeInBits() == Op1.getValueType().getSizeInBits()
-         && "Invalid mask size");
+  // If the mask and the type are different sizes, unroll the vector op. This
+  // can occur when getSetCCResultType returns something that is different in
+  // size from the operand types. For example, v4i8 = select v4i32, v4i8, v4i8.
+  if (VT.getSizeInBits() != Op1.getValueType().getSizeInBits())
+    return DAG.UnrollVectorOp(Op.getNode());
+
   // Bitcast the operands to be the same type as the mask.
   // This is needed when we select between FP types because
   // the mask is a vector of integers.
@@ -693,7 +700,7 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
 
 SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
   EVT VT = Op.getOperand(0).getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // Make sure that the SINT_TO_FP and SRL instructions are available.
   if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand ||
@@ -734,7 +741,7 @@ SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
 SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
   if (TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) {
     SDValue Zero = DAG.getConstantFP(-0.0, Op.getValueType());
-    return DAG.getNode(ISD::FSUB, Op.getDebugLoc(), Op.getValueType(),
+    return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
                        Zero, Op.getOperand(0));
   }
   return DAG.UnrollVectorOp(Op.getNode());
@@ -746,19 +753,20 @@ SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
   EVT EltVT = VT.getVectorElementType();
   SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1), CC = Op.getOperand(2);
   EVT TmpEltVT = LHS.getValueType().getVectorElementType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SmallVector<SDValue, 8> Ops(NumElems);
   for (unsigned i = 0; i < NumElems; ++i) {
     SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
-                                  DAG.getIntPtrConstant(i));
+                                  DAG.getConstant(i, TLI.getVectorIdxTy()));
     SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
-                                  DAG.getIntPtrConstant(i));
-    Ops[i] = DAG.getNode(ISD::SETCC, dl, TLI.getSetCCResultType(TmpEltVT),
+                                  DAG.getConstant(i, TLI.getVectorIdxTy()));
+    Ops[i] = DAG.getNode(ISD::SETCC, dl,
+                         TLI.getSetCCResultType(*DAG.getContext(), TmpEltVT),
                          LHSElem, RHSElem, CC);
-    Ops[i] = DAG.getNode(ISD::SELECT, dl, EltVT, Ops[i],
-                         DAG.getConstant(APInt::getAllOnesValue
-                                         (EltVT.getSizeInBits()), EltVT),
-                         DAG.getConstant(0, EltVT));
+    Ops[i] = DAG.getSelect(dl, EltVT, Ops[i],
+                           DAG.getConstant(APInt::getAllOnesValue
+                                           (EltVT.getSizeInBits()), EltVT),
+                           DAG.getConstant(0, EltVT));
   }
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElems);
 }
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 04c6bfd..f7a3e3d 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -83,6 +83,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::FRINT:
+  case ISD::FROUND:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
@@ -97,6 +98,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::ADD:
   case ISD::AND:
   case ISD::FADD:
+  case ISD::FCOPYSIGN:
   case ISD::FDIV:
   case ISD::FMUL:
   case ISD::FPOW:
@@ -128,7 +130,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
 SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(0));
   SDValue RHS = GetScalarizedVector(N->getOperand(1));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
                      LHS.getValueType(), LHS, RHS);
 }
 
@@ -136,7 +138,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
   SDValue Op0 = GetScalarizedVector(N->getOperand(0));
   SDValue Op1 = GetScalarizedVector(N->getOperand(1));
   SDValue Op2 = GetScalarizedVector(N->getOperand(2));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
                      Op0.getValueType(), Op0, Op1, Op2);
 }
 
@@ -148,7 +150,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
   EVT NewVT = N->getValueType(0).getVectorElementType();
-  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+  return DAG.getNode(ISD::BITCAST, SDLoc(N),
                      NewVT, N->getOperand(0));
 }
 
@@ -158,14 +160,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
   // The BUILD_VECTOR operands may be of wider element types and
   // we may need to truncate them back to the requested return type.
   if (EltVT.isInteger())
-    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp);
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
   return InOp;
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) {
   EVT NewVT = N->getValueType(0).getVectorElementType();
   SDValue Op0 = GetScalarizedVector(N->getOperand(0));
-  return DAG.getConvertRndSat(NewVT, N->getDebugLoc(),
+  return DAG.getConvertRndSat(NewVT, SDLoc(N),
                               Op0, DAG.getValueType(NewVT),
                               DAG.getValueType(Op0.getValueType()),
                               N->getOperand(3),
@@ -174,7 +176,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
                      N->getValueType(0).getVectorElementType(),
                      N->getOperand(0), N->getOperand(1));
 }
@@ -182,13 +184,13 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
 SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) {
   EVT NewVT = N->getValueType(0).getVectorElementType();
   SDValue Op = GetScalarizedVector(N->getOperand(0));
-  return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(),
+  return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
                      NewVT, Op, N->getOperand(1));
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
   SDValue Op = GetScalarizedVector(N->getOperand(0));
-  return DAG.getNode(ISD::FPOWI, N->getDebugLoc(),
+  return DAG.getNode(ISD::FPOWI, SDLoc(N),
                      Op.getValueType(), Op, N->getOperand(1));
 }
 
@@ -199,7 +201,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
   EVT EltVT = N->getValueType(0).getVectorElementType();
   if (Op.getValueType() != EltVT)
     // FIXME: Can this happen for floating point types?
-    Op = DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, Op);
+    Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op);
   return Op;
 }
 
@@ -209,13 +211,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
   SDValue Result = DAG.getLoad(ISD::UNINDEXED,
                                N->getExtensionType(),
                                N->getValueType(0).getVectorElementType(),
-                               N->getDebugLoc(),
+                               SDLoc(N),
                                N->getChain(), N->getBasePtr(),
                                DAG.getUNDEF(N->getBasePtr().getValueType()),
                                N->getPointerInfo(),
                                N->getMemoryVT().getVectorElementType(),
                                N->isVolatile(), N->isNonTemporal(),
-                               N->isInvariant(), N->getOriginalAlignment());
+                               N->isInvariant(), N->getOriginalAlignment(),
+                               N->getTBAAInfo());
 
   // Legalized the chain result - switch anything that used the old chain to
   // use the new one.
@@ -227,14 +230,14 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
   // Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
   EVT DestVT = N->getValueType(0).getVectorElementType();
   SDValue Op = GetScalarizedVector(N->getOperand(0));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), DestVT, Op);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op);
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
   EVT EltVT = N->getValueType(0).getVectorElementType();
   EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
   SDValue LHS = GetScalarizedVector(N->getOperand(0));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), EltVT,
+  return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT,
                      LHS, DAG.getValueType(ExtVT));
 }
 
@@ -244,7 +247,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   EVT EltVT = N->getValueType(0).getVectorElementType();
   SDValue InOp = N->getOperand(0);
   if (InOp.getValueType() != EltVT)
-    return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp);
+    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
   return InOp;
 }
 
@@ -262,33 +265,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
         assert(VecBool == TargetLowering::UndefinedBooleanContent ||
                VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent);
         // Vector read from all ones, scalar expects a single 1 so mask.
-        Cond = DAG.getNode(ISD::AND, N->getDebugLoc(), CondVT,
+        Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT,
                            Cond, DAG.getConstant(1, CondVT));
         break;
       case TargetLowering::ZeroOrNegativeOneBooleanContent:
         assert(VecBool == TargetLowering::UndefinedBooleanContent ||
                VecBool == TargetLowering::ZeroOrOneBooleanContent);
         // Vector reads from a one, scalar from all ones so sign extend.
-        Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, N->getDebugLoc(), CondVT,
+        Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT,
                            Cond, DAG.getValueType(MVT::i1));
         break;
     }
   }
-  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
-                     LHS.getValueType(), Cond, LHS,
-                     GetScalarizedVector(N->getOperand(2)));
+
+  return DAG.getSelect(SDLoc(N),
+                       LHS.getValueType(), Cond, LHS,
+                       GetScalarizedVector(N->getOperand(2)));
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(1));
-  return DAG.getNode(ISD::SELECT, N->getDebugLoc(),
-                     LHS.getValueType(), N->getOperand(0), LHS,
-                     GetScalarizedVector(N->getOperand(2)));
+  return DAG.getSelect(SDLoc(N),
+                       LHS.getValueType(), N->getOperand(0), LHS,
+                       GetScalarizedVector(N->getOperand(2)));
 }
 
 SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(2));
-  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), LHS.getValueType(),
+  return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(),
                      N->getOperand(0), N->getOperand(1),
                      LHS, GetScalarizedVector(N->getOperand(3)),
                      N->getOperand(4));
@@ -303,7 +307,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
 
   SDValue LHS = GetScalarizedVector(N->getOperand(0));
   SDValue RHS = GetScalarizedVector(N->getOperand(1));
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Turn it into a scalar SETCC.
   return DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2));
@@ -330,7 +334,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
   SDValue LHS = GetScalarizedVector(N->getOperand(0));
   SDValue RHS = GetScalarizedVector(N->getOperand(1));
   EVT NVT = N->getValueType(0).getVectorElementType();
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Turn it into a scalar SETCC.
   SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
@@ -368,7 +372,8 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
     case ISD::ANY_EXTEND:
     case ISD::ZERO_EXTEND:
     case ISD::SIGN_EXTEND:
-      Res = ScalarizeVecOp_EXTEND(N);
+    case ISD::TRUNCATE:
+      Res = ScalarizeVecOp_UnaryOp(N);
       break;
     case ISD::CONCAT_VECTORS:
       Res = ScalarizeVecOp_CONCAT_VECTORS(N);
@@ -401,22 +406,22 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
 /// to be scalarized, it must be <1 x ty>.  Convert the element instead.
 SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
   SDValue Elt = GetScalarizedVector(N->getOperand(0));
-  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+  return DAG.getNode(ISD::BITCAST, SDLoc(N),
                      N->getValueType(0), Elt);
 }
 
 /// ScalarizeVecOp_EXTEND - If the value to extend is a vector that needs
 /// to be scalarized, it must be <1 x ty>.  Extend the element instead.
-SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTEND(SDNode *N) {
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
   assert(N->getValueType(0).getVectorNumElements() == 1 &&
          "Unexected vector type!");
   SDValue Elt = GetScalarizedVector(N->getOperand(0));
   SmallVector<SDValue, 1> Ops(1);
-  Ops[0] = DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  Ops[0] = DAG.getNode(N->getOpcode(), SDLoc(N),
                        N->getValueType(0).getScalarType(), Elt);
   // Revectorize the result so the types line up with what the uses of this
   // expression expect.
-  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), N->getValueType(0),
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0),
                      &Ops[0], 1);
 }
 
@@ -426,7 +431,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
   SmallVector<SDValue, 8> Ops(N->getNumOperands());
   for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
     Ops[i] = GetScalarizedVector(N->getOperand(i));
-  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), N->getValueType(0),
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0),
                      &Ops[0], Ops.size());
 }
 
@@ -436,7 +441,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
 SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
   SDValue Res = GetScalarizedVector(N->getOperand(0));
   if (Res.getValueType() != N->getValueType(0))
-    Res = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), N->getValueType(0),
+    Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0),
                       Res);
   return Res;
 }
@@ -446,7 +451,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
 SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
   assert(N->isUnindexed() && "Indexed store of one-element vector?");
   assert(OpNo == 1 && "Do not know how to scalarize this operand!");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   if (N->isTruncatingStore())
     return DAG.getTruncStore(N->getChain(), dl,
@@ -454,12 +459,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
                              N->getBasePtr(), N->getPointerInfo(),
                              N->getMemoryVT().getVectorElementType(),
                              N->isVolatile(), N->isNonTemporal(),
-                             N->getAlignment());
+                             N->getAlignment(), N->getTBAAInfo());
 
   return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
                       N->getBasePtr(), N->getPointerInfo(),
                       N->isVolatile(), N->isNonTemporal(),
-                      N->getOriginalAlignment());
+                      N->getOriginalAlignment(), N->getTBAAInfo());
 }
 
 
@@ -516,7 +521,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
     SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
     break;
 
-  case ISD::ANY_EXTEND:
   case ISD::CONVERT_RNDSAT:
   case ISD::CTLZ:
   case ISD::CTTZ:
@@ -539,21 +543,27 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:
   case ISD::FRINT:
+  case ISD::FROUND:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
-  case ISD::SIGN_EXTEND:
   case ISD::SINT_TO_FP:
   case ISD::TRUNCATE:
   case ISD::UINT_TO_FP:
-  case ISD::ZERO_EXTEND:
     SplitVecRes_UnaryOp(N, Lo, Hi);
     break;
 
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+    SplitVecRes_ExtendOp(N, Lo, Hi);
+    break;
+
   case ISD::ADD:
   case ISD::SUB:
   case ISD::MUL:
   case ISD::FADD:
+  case ISD::FCOPYSIGN:
   case ISD::FSUB:
   case ISD::FMUL:
   case ISD::SDIV:
@@ -587,7 +597,7 @@ void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
   GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
   SDValue RHSLo, RHSHi;
   GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo, RHSLo);
   Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi, RHSHi);
@@ -601,7 +611,7 @@ void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
   GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
   SDValue Op2Lo, Op2Hi;
   GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(),
                    Op0Lo, Op1Lo, Op2Lo);
@@ -614,8 +624,8 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
   // We know the result is a vector.  The input may be either a vector or a
   // scalar value.
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
-  DebugLoc dl = N->getDebugLoc();
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+  SDLoc dl(N);
 
   SDValue InOp = N->getOperand(0);
   EVT InVT = InOp.getValueType();
@@ -668,8 +678,8 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
                                                 SDValue &Hi) {
   EVT LoVT, HiVT;
-  DebugLoc dl = N->getDebugLoc();
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  SDLoc dl(N);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   unsigned LoNumElts = LoVT.getVectorNumElements();
   SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
   Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, &LoOps[0], LoOps.size());
@@ -681,7 +691,7 @@ void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
                                                   SDValue &Hi) {
   assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned NumSubvectors = N->getNumOperands() / 2;
   if (NumSubvectors == 1) {
     Lo = N->getOperand(0);
@@ -690,7 +700,7 @@ void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
   }
 
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
   Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, &LoOps[0], LoOps.size());
@@ -703,20 +713,21 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
                                                      SDValue &Hi) {
   SDValue Vec = N->getOperand(0);
   SDValue Idx = N->getOperand(1);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   EVT LoVT, HiVT;
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
   uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
   Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
-                   DAG.getIntPtrConstant(IdxVal + LoVT.getVectorNumElements()));
+                   DAG.getConstant(IdxVal + LoVT.getVectorNumElements(),
+                                   TLI.getVectorIdxTy()));
 }
 
 void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
                                          SDValue &Hi) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetSplitVector(N->getOperand(0), Lo, Hi);
   Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
   Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
@@ -726,10 +737,11 @@ void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   SDValue LHSLo, LHSHi;
   GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   EVT LoVT, HiVT;
-  GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT(), LoVT, HiVT);
+  llvm::tie(LoVT, HiVT) =
+    DAG.GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT());
 
   Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
                    DAG.getValueType(LoVT));
@@ -742,7 +754,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   SDValue Vec = N->getOperand(0);
   SDValue Elt = N->getOperand(1);
   SDValue Idx = N->getOperand(2);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetSplitVector(Vec, Lo, Hi);
 
   if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
@@ -753,7 +765,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
                        Lo.getValueType(), Lo, Elt, Idx);
     else
       Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
-                       DAG.getIntPtrConstant(IdxVal - LoNumElts));
+                       DAG.getConstant(IdxVal - LoNumElts,
+                                       TLI.getVectorIdxTy()));
     return;
   }
 
@@ -780,7 +793,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
   // Increment the pointer to the other part.
   unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
   StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
-                         DAG.getIntPtrConstant(IncrementSize));
+                       DAG.getConstant(IncrementSize, StackPtr.getValueType()));
 
   // Load the Hi part from the stack slot.
   Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
@@ -790,8 +803,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
 void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
                                                     SDValue &Hi) {
   EVT LoVT, HiVT;
-  DebugLoc dl = N->getDebugLoc();
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  SDLoc dl(N);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
   Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
   Hi = DAG.getUNDEF(HiVT);
 }
@@ -800,8 +813,8 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
                                         SDValue &Hi) {
   assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
   EVT LoVT, HiVT;
-  DebugLoc dl = LD->getDebugLoc();
-  GetSplitDestVTs(LD->getValueType(0), LoVT, HiVT);
+  SDLoc dl(LD);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
 
   ISD::LoadExtType ExtType = LD->getExtensionType();
   SDValue Ch = LD->getChain();
@@ -812,20 +825,22 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
   bool isVolatile = LD->isVolatile();
   bool isNonTemporal = LD->isNonTemporal();
   bool isInvariant = LD->isInvariant();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   EVT LoMemVT, HiMemVT;
-  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+  llvm::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
                    LD->getPointerInfo(), LoMemVT, isVolatile, isNonTemporal,
-                   isInvariant, Alignment);
+                   isInvariant, Alignment, TBAAInfo);
 
   unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
   Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
   Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
                    LD->getPointerInfo().getWithOffset(IncrementSize),
-                   HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment);
+                   HiMemVT, isVolatile, isNonTemporal, isInvariant, Alignment,
+                   TBAAInfo);
 
   // Build a factor node to remember that this load is independent of the
   // other one.
@@ -843,23 +858,13 @@ void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
          "Operand types must be vectors");
 
   EVT LoVT, HiVT;
-  DebugLoc DL = N->getDebugLoc();
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  SDLoc DL(N);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   // Split the input.
-  EVT InVT = N->getOperand(0).getValueType();
   SDValue LL, LH, RL, RH;
-  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
-                               LoVT.getVectorNumElements());
-  LL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
-                   DAG.getIntPtrConstant(0));
-  LH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(0),
-                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
-
-  RL = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
-                   DAG.getIntPtrConstant(0));
-  RH = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, N->getOperand(1),
-                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+  llvm::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+  llvm::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
 
   Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
   Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
@@ -869,22 +874,16 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
                                            SDValue &Hi) {
   // Get the dest types - they may not match the input types, e.g. int_to_fp.
   EVT LoVT, HiVT;
-  DebugLoc dl = N->getDebugLoc();
-  GetSplitDestVTs(N->getValueType(0), LoVT, HiVT);
+  SDLoc dl(N);
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
 
   // If the input also splits, handle it directly for a compile time speedup.
   // Otherwise split it by hand.
   EVT InVT = N->getOperand(0).getValueType();
-  if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
+  if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
     GetSplitVector(N->getOperand(0), Lo, Hi);
-  } else {
-    EVT InNVT = EVT::getVectorVT(*DAG.getContext(), InVT.getVectorElementType(),
-                                 LoVT.getVectorNumElements());
-    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
-                     DAG.getIntPtrConstant(0));
-    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InNVT, N->getOperand(0),
-                     DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
-  }
+  else
+    llvm::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
 
   if (N->getOpcode() == ISD::FP_ROUND) {
     Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1));
@@ -907,11 +906,63 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
   }
 }
 
+void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
+                                            SDValue &Hi) {
+  SDLoc dl(N);
+  EVT SrcVT = N->getOperand(0).getValueType();
+  EVT DestVT = N->getValueType(0);
+  EVT LoVT, HiVT;
+  llvm::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
+
+  // We can do better than a generic split operation if the extend is doing
+  // more than just doubling the width of the elements and the following are
+  // true:
+  //   - The number of vector elements is even,
+  //   - the source type is legal,
+  //   - the type of a split source is illegal,
+  //   - the type of an extended (by doubling element size) source is legal, and
+  //   - the type of that extended source when split is legal.
+  //
+  // This won't necessarily completely legalize the operation, but it will
+  // more effectively move in the right direction and prevent falling down
+  // to scalarization in many cases due to the input vector being split too
+  // far.
+  unsigned NumElements = SrcVT.getVectorNumElements();
+  if ((NumElements & 1) == 0 &&
+      SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
+    LLVMContext &Ctx = *DAG.getContext();
+    EVT NewSrcVT = EVT::getVectorVT(
+        Ctx, EVT::getIntegerVT(
+                 Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2),
+        NumElements);
+    EVT SplitSrcVT =
+        EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2);
+    EVT SplitLoVT, SplitHiVT;
+    llvm::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
+    if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
+        TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
+      DEBUG(dbgs() << "Split vector extend via incremental extend:";
+            N->dump(&DAG); dbgs() << "\n");
+      // Extend the source vector by one step.
+      SDValue NewSrc =
+          DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
+      // Get the low and high halves of the new, extended one step, vector.
+      llvm::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
+      // Extend those vector halves the rest of the way.
+      Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+      Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+      return;
+    }
+  }
+  // Fall back to the generic unary operator splitting otherwise.
+  SplitVecRes_UnaryOp(N, Lo, Hi);
+}
+
 void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
                                                   SDValue &Lo, SDValue &Hi) {
   // The low and high parts of the original input give four input vectors.
   SDValue Inputs[4];
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
   GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
   EVT NewVT = Inputs[0].getValueType();
@@ -994,7 +1045,8 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
 
         // Extract the vector element by hand.
         SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
-                                    Inputs[Input], DAG.getIntPtrConstant(Idx)));
+                                    Inputs[Input], DAG.getConstant(Idx,
+                                                   TLI.getVectorIdxTy())));
       }
 
       // Construct the Lo/Hi output using a BUILD_VECTOR.
@@ -1030,6 +1082,10 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
         dbgs() << "\n");
   SDValue Res = SDValue();
 
+  // See if the target wants to custom split this node.
+  if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+    return false;
+
   if (Res.getNode() == 0) {
     switch (N->getOpcode()) {
     default:
@@ -1094,41 +1150,23 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
   SDValue Mask = N->getOperand(0);
   SDValue Src0 = N->getOperand(1);
   SDValue Src1 = N->getOperand(2);
-  DebugLoc DL = N->getDebugLoc();
-  EVT MaskVT = Mask.getValueType();
-  assert(MaskVT.isVector() && "VSELECT without a vector mask?");
+  EVT Src0VT = Src0.getValueType();
+  SDLoc DL(N);
+  assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
 
   SDValue Lo, Hi;
   GetSplitVector(N->getOperand(0), Lo, Hi);
   assert(Lo.getValueType() == Hi.getValueType() &&
-         "Lo and Hi have differing types");;
-
-  unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
-  unsigned HiNumElts = Hi.getValueType().getVectorNumElements();
-  assert(LoNumElts == HiNumElts && "Asymmetric vector split?");
-
-  LLVMContext &Ctx = *DAG.getContext();
-  SDValue Zero = DAG.getIntPtrConstant(0);
-  SDValue LoElts = DAG.getIntPtrConstant(LoNumElts);
-  EVT Src0VT = Src0.getValueType();
-  EVT Src0EltTy = Src0VT.getVectorElementType();
-  EVT MaskEltTy = MaskVT.getVectorElementType();
-
-  EVT LoOpVT = EVT::getVectorVT(Ctx, Src0EltTy, LoNumElts);
-  EVT LoMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, LoNumElts);
-  EVT HiOpVT = EVT::getVectorVT(Ctx, Src0EltTy, HiNumElts);
-  EVT HiMaskVT = EVT::getVectorVT(Ctx, MaskEltTy, HiNumElts);
-
-  SDValue LoOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src0, Zero);
-  SDValue LoOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoOpVT, Src1, Zero);
+         "Lo and Hi have differing types");
 
-  SDValue HiOp0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src0, LoElts);
-  SDValue HiOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiOpVT, Src1, LoElts);
+  EVT LoOpVT, HiOpVT;
+  llvm::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
+  assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
 
-  SDValue LoMask =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoMaskVT, Mask, Zero);
-  SDValue HiMask =
-    DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiMaskVT, Mask, LoElts);
+  SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
+  llvm::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
+  llvm::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
+  llvm::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
 
   SDValue LoSelect =
     DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
@@ -1142,7 +1180,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
   // The result has a legal vector type, but the input needs splitting.
   EVT ResVT = N->getValueType(0);
   SDValue Lo, Hi;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   GetSplitVector(N->getOperand(0), Lo, Hi);
   EVT InVT = Lo.getValueType();
 
@@ -1167,7 +1205,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
   if (TLI.isBigEndian())
     std::swap(Lo, Hi);
 
-  return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getValueType(0),
+  return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
                      JoinIntegers(Lo, Hi));
 }
 
@@ -1175,7 +1213,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
   // We know that the extracted result type is legal.
   EVT SubVT = N->getValueType(0);
   SDValue Idx = N->getOperand(1);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Lo, Hi;
   GetSplitVector(N->getOperand(0), Lo, Hi);
 
@@ -1215,7 +1253,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
 
   // Store the vector to the stack.
   EVT EltVT = VecVT.getVectorElementType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
   SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
                                MachinePointerInfo(), false, false, 0);
@@ -1229,7 +1267,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
   assert(N->isUnindexed() && "Indexed store of vector?");
   assert(OpNo == 1 && "Can only split the stored value");
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   bool isTruncating = N->isTruncatingStore();
   SDValue Ch  = N->getChain();
@@ -1238,39 +1276,40 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
   unsigned Alignment = N->getOriginalAlignment();
   bool isVol = N->isVolatile();
   bool isNT = N->isNonTemporal();
+  const MDNode *TBAAInfo = N->getTBAAInfo();
   SDValue Lo, Hi;
   GetSplitVector(N->getOperand(1), Lo, Hi);
 
   EVT LoMemVT, HiMemVT;
-  GetSplitDestVTs(MemoryVT, LoMemVT, HiMemVT);
+  llvm::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
 
   unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
 
   if (isTruncating)
     Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
-                           LoMemVT, isVol, isNT, Alignment);
+                           LoMemVT, isVol, isNT, Alignment, TBAAInfo);
   else
     Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(),
-                      isVol, isNT, Alignment);
+                      isVol, isNT, Alignment, TBAAInfo);
 
   // Increment the pointer to the other half.
   Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
-                    DAG.getIntPtrConstant(IncrementSize));
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
 
   if (isTruncating)
     Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
                            N->getPointerInfo().getWithOffset(IncrementSize),
-                           HiMemVT, isVol, isNT, Alignment);
+                           HiMemVT, isVol, isNT, Alignment, TBAAInfo);
   else
     Hi = DAG.getStore(Ch, DL, Hi, Ptr,
                       N->getPointerInfo().getWithOffset(IncrementSize),
-                      isVol, isNT, Alignment);
+                      isVol, isNT, Alignment, TBAAInfo);
 
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
 }
 
 SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // The input operands all must have the same type, and we know the result
   // type is valid.  Convert this to a buildvector which extracts all the
@@ -1284,7 +1323,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
     for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
          i != e; ++i) {
       Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
-                                 Op, DAG.getIntPtrConstant(i)));
+                                 Op, DAG.getConstant(i, TLI.getVectorIdxTy())));
 
     }
   }
@@ -1327,15 +1366,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_TRUNCATE(SDNode *N) {
   // to split more than once.
   if (InElementSize <= OutElementSize * 2)
     return SplitVecOp_UnaryOp(N);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Extract the halves of the input via extract_subvector.
-  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
-                                 InVT.getVectorElementType(), NumElements/2);
-  SDValue InLoVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
-                                DAG.getIntPtrConstant(0));
-  SDValue InHiVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, InVec,
-                                DAG.getIntPtrConstant(NumElements/2));
+  SDValue InLoVec, InHiVec;
+  llvm::tie(InLoVec, InHiVec) = DAG.SplitVector(InVec, DL);
   // Truncate them to 1/2 the element size.
   EVT HalfElementVT = EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
   EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
@@ -1359,7 +1394,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
          "Operand types must be vectors");
   // The result has a legal vector type, but the input needs splitting.
   SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   GetSplitVector(N->getOperand(0), Lo0, Hi0);
   GetSplitVector(N->getOperand(1), Lo1, Hi1);
   unsigned PartElements = Lo0.getValueType().getVectorNumElements();
@@ -1377,7 +1412,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
   // The result has a legal vector type, but the input needs splitting.
   EVT ResVT = N->getValueType(0);
   SDValue Lo, Hi;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   GetSplitVector(N->getOperand(0), Lo, Hi);
   EVT InVT = Lo.getValueType();
 
@@ -1434,27 +1469,31 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::VECTOR_SHUFFLE:
     Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
     break;
+
   case ISD::ADD:
   case ISD::AND:
   case ISD::BSWAP:
+  case ISD::MUL:
+  case ISD::MULHS:
+  case ISD::MULHU:
+  case ISD::OR:
+  case ISD::SUB:
+  case ISD::XOR:
+    Res = WidenVecRes_Binary(N);
+    break;
+
   case ISD::FADD:
   case ISD::FCOPYSIGN:
-  case ISD::FDIV:
   case ISD::FMUL:
   case ISD::FPOW:
-  case ISD::FREM:
   case ISD::FSUB:
-  case ISD::MUL:
-  case ISD::MULHS:
-  case ISD::MULHU:
-  case ISD::OR:
+  case ISD::FDIV:
+  case ISD::FREM:
   case ISD::SDIV:
-  case ISD::SREM:
   case ISD::UDIV:
+  case ISD::SREM:
   case ISD::UREM:
-  case ISD::SUB:
-  case ISD::XOR:
-    Res = WidenVecRes_Binary(N);
+    Res = WidenVecRes_BinaryCanTrap(N);
     break;
 
   case ISD::FPOWI:
@@ -1495,6 +1534,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::FNEARBYINT:
   case ISD::FNEG:
   case ISD::FRINT:
+  case ISD::FROUND:
   case ISD::FSIN:
   case ISD::FSQRT:
   case ISD::FTRUNC:
@@ -1512,7 +1552,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
   // Ternary op widening.
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue InOp1 = GetWidenedVector(N->getOperand(0));
   SDValue InOp2 = GetWidenedVector(N->getOperand(1));
@@ -1522,8 +1562,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
   // Binary op widening.
+  SDLoc dl(N);
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+  SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+  return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
+  // Binary op widening for operations that can trap.
   unsigned Opcode = N->getOpcode();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   EVT WidenEltVT = WidenVT.getVectorElementType();
   EVT VT = WidenVT;
@@ -1562,9 +1611,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
   while (CurNumElts != 0) {
     while (CurNumElts >= NumElts) {
       SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
-                                 DAG.getIntPtrConstant(Idx));
+                                 DAG.getConstant(Idx, TLI.getVectorIdxTy()));
       SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
-                                 DAG.getIntPtrConstant(Idx));
+                                 DAG.getConstant(Idx, TLI.getVectorIdxTy()));
       ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2);
       Idx += NumElts;
       CurNumElts -= NumElts;
@@ -1577,9 +1626,11 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
     if (NumElts == 1) {
       for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
         SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
-                                   InOp1, DAG.getIntPtrConstant(Idx));
+                                   InOp1, DAG.getConstant(Idx,
+                                                         TLI.getVectorIdxTy()));
         SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
-                                   InOp2, DAG.getIntPtrConstant(Idx));
+                                   InOp2, DAG.getConstant(Idx,
+                                                         TLI.getVectorIdxTy()));
         ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
                                              EOp1, EOp2);
       }
@@ -1617,7 +1668,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
       unsigned NumToInsert = ConcatEnd - Idx - 1;
       for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
         VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
-                            ConcatOps[OpIdx], DAG.getIntPtrConstant(i));
+                            ConcatOps[OpIdx], DAG.getConstant(i,
+                                                         TLI.getVectorIdxTy()));
       }
       ConcatOps[Idx+1] = VecOp;
       ConcatEnd = Idx + 2;
@@ -1659,7 +1711,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   SDValue InOp = N->getOperand(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -1705,7 +1757,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
 
     if (InVTNumElts % WidenNumElts == 0) {
       SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT,
-                                  InOp, DAG.getIntPtrConstant(0));
+                                  InOp, DAG.getConstant(0,
+                                                        TLI.getVectorIdxTy()));
       // Extract the input and convert the shorten input vector.
       if (N->getNumOperands() == 1)
         return DAG.getNode(Opcode, DL, WidenVT, InVal);
@@ -1720,7 +1773,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
   unsigned i;
   for (i=0; i < MinElts; ++i) {
     SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
-                              DAG.getIntPtrConstant(i));
+                              DAG.getConstant(i, TLI.getVectorIdxTy()));
     if (N->getNumOperands() == 1)
       Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
     else
@@ -1738,7 +1791,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue InOp = GetWidenedVector(N->getOperand(0));
   SDValue ShOp = N->getOperand(1);
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
@@ -1757,14 +1810,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
   if (ShVT != ShWidenVT)
     ShOp = ModifyToType(ShOp, ShWidenVT);
 
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp, ShOp);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
   // Unary op widening.
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue InOp = GetWidenedVector(N->getOperand(0));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(), WidenVT, InOp);
+  return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
@@ -1774,7 +1827,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
                                  .getVectorElementType(),
                                WidenVT.getVectorNumElements());
   SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
                      WidenVT, WidenLHS, DAG.getValueType(ExtVT));
 }
 
@@ -1788,7 +1841,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
   EVT InVT = InOp.getValueType();
   EVT VT = N->getValueType(0);
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   switch (getTypeAction(InVT)) {
   case TargetLowering::TypeLegal:
@@ -1868,19 +1921,21 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   // Build a vector with undefined for the new nodes.
   EVT VT = N->getValueType(0);
-  EVT EltVT = VT.getVectorElementType();
+
+  // Integer BUILD_VECTOR operands may be larger than the node's vector element
+  // type. The UNDEFs need to have the same type as the existing operands.
+  EVT EltVT = N->getOperand(0).getValueType();
   unsigned NumElts = VT.getVectorNumElements();
 
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
 
   SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
-  NewOps.reserve(WidenNumElts);
-  for (unsigned i = NumElts; i < WidenNumElts; ++i)
-    NewOps.push_back(DAG.getUNDEF(EltVT));
+  assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+  NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
 
   return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, &NewOps[0], NewOps.size());
 }
@@ -1888,7 +1943,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
 SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
   EVT InVT = N->getOperand(0).getValueType();
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
   unsigned NumInElts = InVT.getVectorNumElements();
   unsigned NumOperands = N->getNumOperands();
@@ -1946,7 +2001,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
       InOp = GetWidenedVector(InOp);
     for (unsigned j=0; j < NumInElts; ++j)
       Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
-                               DAG.getIntPtrConstant(j));
+                               DAG.getConstant(j, TLI.getVectorIdxTy()));
   }
   SDValue UndefVal = DAG.getUNDEF(EltVT);
   for (; Idx < WidenNumElts; ++Idx)
@@ -1955,7 +2010,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue InOp  = N->getOperand(0);
   SDValue RndOp = N->getOperand(3);
   SDValue SatOp = N->getOperand(4);
@@ -2004,7 +2059,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
     if (InVTNumElts % WidenNumElts == 0) {
       // Extract the input and convert the shorten input vector.
       InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
-                         DAG.getIntPtrConstant(0));
+                         DAG.getConstant(0, TLI.getVectorIdxTy()));
       return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
                                   SatOp, CvtCode);
     }
@@ -2020,7 +2075,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
   unsigned i;
   for (i=0; i < MinElts; ++i) {
     SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
-                                 DAG.getIntPtrConstant(i));
+                                 DAG.getConstant(i, TLI.getVectorIdxTy()));
     Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp,
                                   SatOp, CvtCode);
   }
@@ -2038,7 +2093,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
   SDValue  InOp = N->getOperand(0);
   SDValue  Idx  = N->getOperand(1);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
     InOp = GetWidenedVector(InOp);
@@ -2063,7 +2118,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
   unsigned i;
   for (i=0; i < NumElts; ++i)
     Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
-                         DAG.getIntPtrConstant(IdxVal+i));
+                         DAG.getConstant(IdxVal+i, TLI.getVectorIdxTy()));
 
   SDValue UndefVal = DAG.getUNDEF(EltVT);
   for (; i < WidenNumElts; ++i)
@@ -2073,7 +2128,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
   SDValue InOp = GetWidenedVector(N->getOperand(0));
-  return DAG.getNode(ISD::INSERT_VECTOR_ELT, N->getDebugLoc(),
+  return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N),
                      InOp.getValueType(), InOp,
                      N->getOperand(1), N->getOperand(2));
 }
@@ -2096,7 +2151,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   if (LdChain.size() == 1)
     NewChain = LdChain[0];
   else
-    NewChain = DAG.getNode(ISD::TokenFactor, LD->getDebugLoc(), MVT::Other,
+    NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
                            &LdChain[0], LdChain.size());
 
   // Modified the chain - switch anything that used the old chain to use
@@ -2108,7 +2163,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  return DAG.getNode(ISD::SCALAR_TO_VECTOR, N->getDebugLoc(),
+  return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
                      WidenVT, N->getOperand(0));
 }
 
@@ -2132,14 +2187,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
   SDValue InOp1 = GetWidenedVector(N->getOperand(1));
   SDValue InOp2 = GetWidenedVector(N->getOperand(2));
   assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
-  return DAG.getNode(N->getOpcode(), N->getDebugLoc(),
+  return DAG.getNode(N->getOpcode(), SDLoc(N),
                      WidenVT, Cond1, InOp1, InOp2);
 }
 
 SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
   SDValue InOp1 = GetWidenedVector(N->getOperand(2));
   SDValue InOp2 = GetWidenedVector(N->getOperand(3));
-  return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(),
+  return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
                      InOp1.getValueType(), N->getOperand(0),
                      N->getOperand(1), InOp1, InOp2, N->getOperand(4));
 }
@@ -2153,7 +2208,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   SDValue InOp1 = GetWidenedVector(N->getOperand(0));
   SDValue InOp2 = GetWidenedVector(N->getOperand(1));
-  return DAG.getNode(ISD::SETCC, N->getDebugLoc(), WidenVT,
+  return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT,
                      InOp1, InOp2, N->getOperand(2));
 }
 
@@ -2164,7 +2219,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
 
 SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
   unsigned NumElts = VT.getVectorNumElements();
@@ -2208,7 +2263,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
          InOp2.getValueType() == WidenInVT &&
          "Input not widened to expected type!");
   (void)WidenInVT;
-  return DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+  return DAG.getNode(ISD::SETCC, SDLoc(N),
                      WidenVT, InOp1, InOp2, N->getOperand(2));
 }
 
@@ -2277,7 +2332,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
   // into some scalar code and create a nasty build vector.
   EVT VT = N->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned NumElts = VT.getVectorNumElements();
   SDValue InOp = N->getOperand(0);
   if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
@@ -2290,7 +2345,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
   for (unsigned i=0; i < NumElts; ++i)
     Ops[i] = DAG.getNode(Opcode, dl, EltVT,
                          DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
-                                     DAG.getIntPtrConstant(i)));
+                                     DAG.getConstant(i, TLI.getVectorIdxTy())));
 
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
 }
@@ -2299,7 +2354,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
   EVT VT = N->getValueType(0);
   SDValue InOp = GetWidenedVector(N->getOperand(0));
   EVT InWidenVT = InOp.getValueType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Check if we can convert between two legal vector types and extract.
   unsigned InWidenSize = InWidenVT.getSizeInBits();
@@ -2311,7 +2366,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
     if (TLI.isTypeLegal(NewVT)) {
       SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
-                         DAG.getIntPtrConstant(0));
+                         DAG.getConstant(0, TLI.getVectorIdxTy()));
     }
   }
 
@@ -2324,7 +2379,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
   // nasty build vector.
   EVT VT = N->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned NumElts = VT.getVectorNumElements();
   SmallVector<SDValue, 16> Ops(NumElts);
 
@@ -2339,20 +2394,20 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
       InOp = GetWidenedVector(InOp);
     for (unsigned j=0; j < NumInElts; ++j)
       Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
-                               DAG.getIntPtrConstant(j));
+                               DAG.getConstant(j, TLI.getVectorIdxTy()));
   }
   return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], NumElts);
 }
 
 SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
   SDValue InOp = GetWidenedVector(N->getOperand(0));
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, N->getDebugLoc(),
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
                      N->getValueType(0), InOp, N->getOperand(1));
 }
 
 SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
   SDValue InOp = GetWidenedVector(N->getOperand(0));
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, N->getDebugLoc(),
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
                      N->getValueType(0), InOp, N->getOperand(1));
 }
 
@@ -2370,14 +2425,14 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
   if (StChain.size() == 1)
     return StChain[0];
   else
-    return DAG.getNode(ISD::TokenFactor, ST->getDebugLoc(),
+    return DAG.getNode(ISD::TokenFactor, SDLoc(ST),
                        MVT::Other,&StChain[0],StChain.size());
 }
 
 SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
   SDValue InOp0 = GetWidenedVector(N->getOperand(0));
   SDValue InOp1 = GetWidenedVector(N->getOperand(1));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // WARNING: In this code we widen the compare instruction with garbage.
   // This garbage may contain denormal floats which may be slow. Is this a real
@@ -2385,8 +2440,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
 
   // Get a new SETCC node to compare the newly widened operands.
   // Only some of the compared elements are legal.
-  EVT SVT = TLI.getSetCCResultType(InOp0.getValueType());
-  SDValue WideSETCC = DAG.getNode(ISD::SETCC, N->getDebugLoc(),
+  EVT SVT = TLI.getSetCCResultType(*DAG.getContext(), InOp0.getValueType());
+  SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
                      SVT, InOp0, InOp1, N->getOperand(2));
 
   // Extract the needed results from the result vector.
@@ -2394,7 +2449,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
                                SVT.getVectorElementType(),
                                N->getValueType(0).getVectorNumElements());
   SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
-                           ResVT, WideSETCC, DAG.getIntPtrConstant(0));
+                           ResVT, WideSETCC, DAG.getConstant(0,
+                                             TLI.getVectorIdxTy()));
 
   return PromoteTargetBoolean(CC, N->getValueType(0));
 }
@@ -2465,9 +2521,10 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
 //  LDOps: Load operators to build a vector type
 //  [Start,End) the list of loads to use.
 static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
-                                     SmallVector<SDValue, 16>& LdOps,
+                                     SmallVectorImpl<SDValue> &LdOps,
                                      unsigned Start, unsigned End) {
-  DebugLoc dl = LdOps[Start].getDebugLoc();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDLoc dl(LdOps[Start]);
   EVT LdTy = LdOps[Start].getValueType();
   unsigned Width = VecTy.getSizeInBits();
   unsigned NumElts = Width / LdTy.getSizeInBits();
@@ -2487,12 +2544,12 @@ static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
       LdTy = NewLdTy;
     }
     VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
-                        DAG.getIntPtrConstant(Idx++));
+                        DAG.getConstant(Idx++, TLI.getVectorIdxTy()));
   }
   return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
 }
 
-SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
+SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
                                               LoadSDNode *LD) {
   // The strategy assumes that we can efficiently load powers of two widths.
   // The routines chops the vector into the largest vector loads with the same
@@ -2501,7 +2558,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
   unsigned WidenWidth = WidenVT.getSizeInBits();
   EVT LdVT    = LD->getMemoryVT();
-  DebugLoc dl = LD->getDebugLoc();
+  SDLoc dl(LD);
   assert(LdVT.isVector() && WidenVT.isVector());
   assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
 
@@ -2512,6 +2569,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
   bool      isVolatile = LD->isVolatile();
   bool      isNonTemporal = LD->isNonTemporal();
   bool      isInvariant = LD->isInvariant();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   int LdWidth = LdVT.getSizeInBits();
   int WidthDiff = WidenWidth - LdWidth;          // Difference
@@ -2521,7 +2579,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
   EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
   int NewVTWidth = NewVT.getSizeInBits();
   SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
-                             isVolatile, isNonTemporal, isInvariant, Align);
+                             isVolatile, isNonTemporal, isInvariant, Align,
+                             TBAAInfo);
   LdChain.push_back(LdOp.getValue(1));
 
   // Check if we can load the element with one instruction
@@ -2557,7 +2616,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
     unsigned Increment = NewVTWidth / 8;
     Offset += Increment;
     BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                          DAG.getIntPtrConstant(Increment));
+                          DAG.getConstant(Increment, BasePtr.getValueType()));
 
     SDValue L;
     if (LdWidth < NewVTWidth) {
@@ -2566,7 +2625,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
       NewVTWidth = NewVT.getSizeInBits();
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset), isVolatile,
-                      isNonTemporal, isInvariant, MinAlign(Align, Increment));
+                      isNonTemporal, isInvariant, MinAlign(Align, Increment),
+                      TBAAInfo);
       LdChain.push_back(L.getValue(1));
       if (L->getValueType(0).isVector()) {
         SmallVector<SDValue, 16> Loads;
@@ -2582,7 +2642,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
     } else {
       L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
                       LD->getPointerInfo().getWithOffset(Offset), isVolatile,
-                      isNonTemporal, isInvariant, MinAlign(Align, Increment));
+                      isNonTemporal, isInvariant, MinAlign(Align, Increment),
+                      TBAAInfo);
       LdChain.push_back(L.getValue(1));
     }
 
@@ -2646,14 +2707,14 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain,
 }
 
 SDValue
-DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
-                                         LoadSDNode * LD,
+DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
+                                         LoadSDNode *LD,
                                          ISD::LoadExtType ExtType) {
   // For extension loads, it may not be more efficient to chop up the vector
   // and then extended it.  Instead, we unroll the load and build a new vector.
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
   EVT LdVT    = LD->getMemoryVT();
-  DebugLoc dl = LD->getDebugLoc();
+  SDLoc dl(LD);
   assert(LdVT.isVector() && WidenVT.isVector());
 
   // Load information
@@ -2662,6 +2723,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
   unsigned  Align    = LD->getAlignment();
   bool      isVolatile = LD->isVolatile();
   bool      isNonTemporal = LD->isNonTemporal();
+  const MDNode *TBAAInfo = LD->getTBAAInfo();
 
   EVT EltVT = WidenVT.getVectorElementType();
   EVT LdEltVT = LdVT.getVectorElementType();
@@ -2673,15 +2735,17 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
   unsigned Increment = LdEltVT.getSizeInBits() / 8;
   Ops[0] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr,
                           LD->getPointerInfo(),
-                          LdEltVT, isVolatile, isNonTemporal, Align);
+                          LdEltVT, isVolatile, isNonTemporal, Align, TBAAInfo);
   LdChain.push_back(Ops[0].getValue(1));
   unsigned i = 0, Offset = Increment;
   for (i=1; i < NumElts; ++i, Offset += Increment) {
     SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                     BasePtr, DAG.getIntPtrConstant(Offset));
+                                     BasePtr,
+                                     DAG.getConstant(Offset,
+                                                     BasePtr.getValueType()));
     Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
                             LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
-                            isVolatile, isNonTemporal, Align);
+                            isVolatile, isNonTemporal, Align, TBAAInfo);
     LdChain.push_back(Ops[i].getValue(1));
   }
 
@@ -2694,7 +2758,7 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVector<SDValue, 16>& LdChain,
 }
 
 
-void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
+void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
                                             StoreSDNode *ST) {
   // The strategy assumes that we can efficiently store powers of two widths.
   // The routines chops the vector into the largest vector stores with the same
@@ -2704,8 +2768,9 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
   unsigned Align = ST->getAlignment();
   bool     isVolatile = ST->isVolatile();
   bool     isNonTemporal = ST->isNonTemporal();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
   SDValue  ValOp = GetWidenedVector(ST->getValue());
-  DebugLoc dl = ST->getDebugLoc();
+  SDLoc dl(ST);
 
   EVT StVT = ST->getMemoryVT();
   unsigned StWidth = StVT.getSizeInBits();
@@ -2726,16 +2791,16 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
       unsigned NumVTElts = NewVT.getVectorNumElements();
       do {
         SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
-                                   DAG.getIntPtrConstant(Idx));
+                                   DAG.getConstant(Idx, TLI.getVectorIdxTy()));
         StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
                                     ST->getPointerInfo().getWithOffset(Offset),
                                        isVolatile, isNonTemporal,
-                                       MinAlign(Align, Offset)));
+                                       MinAlign(Align, Offset), TBAAInfo));
         StWidth -= NewVTWidth;
         Offset += Increment;
         Idx += NumVTElts;
         BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                              DAG.getIntPtrConstant(Increment));
+                              DAG.getConstant(Increment, BasePtr.getValueType()));
       } while (StWidth != 0 && StWidth >= NewVTWidth);
     } else {
       // Cast the vector to the scalar type we can store
@@ -2746,15 +2811,15 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
       Idx = Idx * ValEltWidth / NewVTWidth;
       do {
         SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
-                      DAG.getIntPtrConstant(Idx++));
+                      DAG.getConstant(Idx++, TLI.getVectorIdxTy()));
         StChain.push_back(DAG.getStore(Chain, dl, EOp, BasePtr,
                                     ST->getPointerInfo().getWithOffset(Offset),
                                        isVolatile, isNonTemporal,
-                                       MinAlign(Align, Offset)));
+                                       MinAlign(Align, Offset), TBAAInfo));
         StWidth -= NewVTWidth;
         Offset += Increment;
         BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
-                              DAG.getIntPtrConstant(Increment));
+                            DAG.getConstant(Increment, BasePtr.getValueType()));
       } while (StWidth != 0 && StWidth >= NewVTWidth);
       // Restore index back to be relative to the original widen element type
       Idx = Idx * NewVTWidth / ValEltWidth;
@@ -2763,7 +2828,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain,
 }
 
 void
-DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
+DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
                                             StoreSDNode *ST) {
   // For extension loads, it may not be more efficient to truncate the vector
   // and then store it.  Instead, we extract each element and then store it.
@@ -2772,8 +2837,9 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
   unsigned Align = ST->getAlignment();
   bool     isVolatile = ST->isVolatile();
   bool     isNonTemporal = ST->isNonTemporal();
+  const MDNode *TBAAInfo = ST->getTBAAInfo();
   SDValue  ValOp = GetWidenedVector(ST->getValue());
-  DebugLoc dl = ST->getDebugLoc();
+  SDLoc dl(ST);
 
   EVT StVT = ST->getMemoryVT();
   EVT ValVT = ValOp.getValueType();
@@ -2791,20 +2857,22 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVector<SDValue, 16>& StChain,
   unsigned Increment = ValEltVT.getSizeInBits() / 8;
   unsigned NumElts = StVT.getVectorNumElements();
   SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
-                            DAG.getIntPtrConstant(0));
+                            DAG.getConstant(0, TLI.getVectorIdxTy()));
   StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
                                       ST->getPointerInfo(), StEltVT,
-                                      isVolatile, isNonTemporal, Align));
+                                      isVolatile, isNonTemporal, Align,
+                                      TBAAInfo));
   unsigned Offset = Increment;
   for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
     SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
-                                     BasePtr, DAG.getIntPtrConstant(Offset));
+                                     BasePtr, DAG.getConstant(Offset,
+                                                       BasePtr.getValueType()));
     SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
-                            DAG.getIntPtrConstant(0));
+                            DAG.getConstant(0, TLI.getVectorIdxTy()));
     StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, NewBasePtr,
                                       ST->getPointerInfo().getWithOffset(Offset),
                                         StEltVT, isVolatile, isNonTemporal,
-                                        MinAlign(Align, Offset)));
+                                        MinAlign(Align, Offset), TBAAInfo));
   }
 }
 
@@ -2816,7 +2884,7 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
   EVT InVT = InOp.getValueType();
   assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
          "input and widen element type must match");
-  DebugLoc dl = InOp.getDebugLoc();
+  SDLoc dl(InOp);
 
   // Check if InOp already has the right width.
   if (InVT == NVT)
@@ -2837,7 +2905,7 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
 
   if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
-                       DAG.getIntPtrConstant(0));
+                       DAG.getConstant(0, TLI.getVectorIdxTy()));
 
   // Fall back to extract and build.
   SmallVector<SDValue, 16> Ops(WidenNumElts);
@@ -2846,7 +2914,7 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT) {
   unsigned Idx;
   for (Idx = 0; Idx < MinNumElts; ++Idx)
     Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
-                           DAG.getIntPtrConstant(Idx));
+                           DAG.getConstant(Idx, TLI.getVectorIdxTy()));
 
   SDValue UndefVal = DAG.getUNDEF(EltVT);
   for ( ; Idx < WidenNumElts; ++Idx)
diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 473e138..1dd2128 100644
--- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -42,11 +42,11 @@ static cl::opt<signed> RegPressureThreshold(
 
 ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS) :
   Picker(this),
-  InstrItins(IS->getTargetLowering().getTargetMachine().getInstrItineraryData())
+ InstrItins(IS->getTargetLowering()->getTargetMachine().getInstrItineraryData())
 {
-   TII = IS->getTargetLowering().getTargetMachine().getInstrInfo();
-   TRI = IS->getTargetLowering().getTargetMachine().getRegisterInfo();
-   TLI = &IS->getTargetLowering();
+   TII = IS->getTargetLowering()->getTargetMachine().getInstrInfo();
+   TRI = IS->getTargetLowering()->getTargetMachine().getRegisterInfo();
+   TLI = IS->getTargetLowering();
 
    const TargetMachine &tm = (*IS->MF).getTarget();
    ResourcesModel = tm.getInstrInfo()->CreateTargetScheduleState(&tm,NULL);
@@ -389,10 +389,9 @@ signed ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
 // Constants used to denote relative importance of
 // heuristic components for cost computation.
 static const unsigned PriorityOne = 200;
-static const unsigned PriorityTwo = 100;
-static const unsigned PriorityThree = 50;
-static const unsigned PriorityFour = 15;
-static const unsigned PriorityFive = 5;
+static const unsigned PriorityTwo = 50;
+static const unsigned PriorityThree = 15;
+static const unsigned PriorityFour = 5;
 static const unsigned ScaleOne = 20;
 static const unsigned ScaleTwo = 10;
 static const unsigned ScaleThree = 5;
@@ -449,7 +448,7 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
     if (N->isMachineOpcode()) {
       const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
       if (TID.isCall())
-        ResCount += (PriorityThree + (ScaleThree*N->getNumValues()));
+        ResCount += (PriorityTwo + (ScaleThree*N->getNumValues()));
     }
     else
       switch (N->getOpcode()) {
@@ -457,11 +456,11 @@ signed ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
       case ISD::TokenFactor:
       case ISD::CopyFromReg:
       case ISD::CopyToReg:
-        ResCount += PriorityFive;
+        ResCount += PriorityFour;
         break;
 
       case ISD::INLINEASM:
-        ResCount += PriorityFour;
+        ResCount += PriorityThree;
         break;
       }
   }
diff --git a/lib/CodeGen/SelectionDAG/SDNodeOrdering.h b/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
deleted file mode 100644
index 7e7b897..0000000
--- a/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//===-- llvm/CodeGen/SDNodeOrdering.h - SDNode Ordering ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the SDNodeOrdering class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CODEGEN_SDNODEORDERING_H
-#define LLVM_CODEGEN_SDNODEORDERING_H
-
-#include "llvm/ADT/DenseMap.h"
-
-namespace llvm {
-
-class SDNode;
-
-/// SDNodeOrdering - Maps a unique (monotonically increasing) value to each
-/// SDNode that roughly corresponds to the ordering of the original LLVM
-/// instruction. This is used for turning off scheduling, because we'll forgo
-/// the normal scheduling algorithms and output the instructions according to
-/// this ordering.
-class SDNodeOrdering {
-  DenseMap<const SDNode*, unsigned> OrderMap;
-
-  void operator=(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
-  SDNodeOrdering(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
-public:
-  SDNodeOrdering() {}
-
-  void add(const SDNode *Node, unsigned NewOrder) {
-    unsigned &OldOrder = OrderMap[Node];
-    if (OldOrder == 0 || (OldOrder > 0 && NewOrder < OldOrder))
-      OldOrder = NewOrder;
-  }
-  void remove(const SDNode *Node) {
-    DenseMap<const SDNode*, unsigned>::iterator Itr = OrderMap.find(Node);
-    if (Itr != OrderMap.end())
-      OrderMap.erase(Itr);
-  }
-  void clear() {
-    OrderMap.clear();
-  }
-  unsigned getOrder(const SDNode *Node) {
-    return OrderMap[Node];
-  }
-};
-
-} // end llvm namespace
-
-#endif
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index d1f36cb..6c5e0ab 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -102,8 +102,8 @@ private:
   void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
                                 const TargetRegisterClass*,
                                 const TargetRegisterClass*,
-                                SmallVector<SUnit*, 2>&);
-  bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+                                SmallVectorImpl<SUnit*>&);
+  bool DelayForLiveRegsBottomUp(SUnit*, SmallVectorImpl<unsigned>&);
   void ListScheduleBottomUp();
 
   /// forceUnitLatencies - The fast scheduler doesn't care about real latencies.
@@ -387,7 +387,7 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
 void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
                                               const TargetRegisterClass *DestRC,
                                               const TargetRegisterClass *SrcRC,
-                                               SmallVector<SUnit*, 2> &Copies) {
+                                              SmallVectorImpl<SUnit*> &Copies) {
   SUnit *CopyFromSU = newSUnit(static_cast<SDNode *>(NULL));
   CopyFromSU->CopySrcRC = SrcRC;
   CopyFromSU->CopyDstRC = DestRC;
@@ -448,7 +448,7 @@ static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
 static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
                                std::vector<SUnit*> &LiveRegDefs,
                                SmallSet<unsigned, 4> &RegAdded,
-                               SmallVector<unsigned, 4> &LRegs,
+                               SmallVectorImpl<unsigned> &LRegs,
                                const TargetRegisterInfo *TRI) {
   bool Added = false;
   for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
@@ -467,7 +467,7 @@ static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
 /// If the specific node is the last one that's available to schedule, do
 /// whatever is necessary (i.e. backtracking or cloning) to make it possible.
 bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
-                                               SmallVector<unsigned, 4> &LRegs){
+                                              SmallVectorImpl<unsigned> &LRegs){
   if (NumLiveRegs == 0)
     return false;
 
@@ -567,7 +567,7 @@ void ScheduleDAGFast::ListScheduleBottomUp() {
         // "expensive to copy" values to break the dependency. In case even
         // that doesn't work, insert cross class copies.
         SUnit *TrySU = NotReady[0];
-        SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+        SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
         assert(LRegs.size() == 1 && "Can't handle this yet!");
         unsigned Reg = LRegs[0];
         SUnit *LRDef = LiveRegDefs[Reg];
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index c009cfc..1a562d7 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -229,8 +229,8 @@ private:
   void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
                                 const TargetRegisterClass*,
                                 const TargetRegisterClass*,
-                                SmallVector<SUnit*, 2>&);
-  bool DelayForLiveRegsBottomUp(SUnit*, SmallVector<unsigned, 4>&);
+                                SmallVectorImpl<SUnit*>&);
+  bool DelayForLiveRegsBottomUp(SUnit*, SmallVectorImpl<unsigned>&);
 
   void releaseInterferences(unsigned Reg = 0);
 
@@ -718,7 +718,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
   // indicate the scheduled cycle.
   SU->setHeightToAtLeast(CurCycle);
 
-  // Reserve resources for the scheduled intruction.
+  // Reserve resources for the scheduled instruction.
   EmitNode(SU);
 
   Sequence.push_back(SU);
@@ -1133,9 +1133,9 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
 /// InsertCopiesAndMoveSuccs - Insert register copies and move all
 /// scheduled successors of the given SUnit to the last copy.
 void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
-                                               const TargetRegisterClass *DestRC,
-                                               const TargetRegisterClass *SrcRC,
-                                               SmallVector<SUnit*, 2> &Copies) {
+                                              const TargetRegisterClass *DestRC,
+                                              const TargetRegisterClass *SrcRC,
+                                              SmallVectorImpl<SUnit*> &Copies) {
   SUnit *CopyFromSU = CreateNewSUnit(NULL);
   CopyFromSU->CopySrcRC = SrcRC;
   CopyFromSU->CopyDstRC = DestRC;
@@ -1205,7 +1205,7 @@ static EVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
 static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
                                std::vector<SUnit*> &LiveRegDefs,
                                SmallSet<unsigned, 4> &RegAdded,
-                               SmallVector<unsigned, 4> &LRegs,
+                               SmallVectorImpl<unsigned> &LRegs,
                                const TargetRegisterInfo *TRI) {
   for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) {
 
@@ -1227,7 +1227,7 @@ static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
 static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask,
                                      std::vector<SUnit*> &LiveRegDefs,
                                      SmallSet<unsigned, 4> &RegAdded,
-                                     SmallVector<unsigned, 4> &LRegs) {
+                                     SmallVectorImpl<unsigned> &LRegs) {
   // Look at all live registers. Skip Reg0 and the special CallResource.
   for (unsigned i = 1, e = LiveRegDefs.size()-1; i != e; ++i) {
     if (!LiveRegDefs[i]) continue;
@@ -1252,7 +1252,7 @@ static const uint32_t *getNodeRegMask(const SDNode *N) {
 /// If the specific node is the last one that's available to schedule, do
 /// whatever is necessary (i.e. backtracking or cloning) to make it possible.
 bool ScheduleDAGRRList::
-DelayForLiveRegsBottomUp(SUnit *SU, SmallVector<unsigned, 4> &LRegs) {
+DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
   if (NumLiveRegs == 0)
     return false;
 
@@ -1331,7 +1331,7 @@ void ScheduleDAGRRList::releaseInterferences(unsigned Reg) {
     SUnit *SU = Interferences[i-1];
     LRegsMapT::iterator LRegsPos = LRegsMap.find(SU);
     if (Reg) {
-      SmallVector<unsigned, 4> &LRegs = LRegsPos->second;
+      SmallVectorImpl<unsigned> &LRegs = LRegsPos->second;
       if (std::find(LRegs.begin(), LRegs.end(), Reg) == LRegs.end())
         continue;
     }
@@ -1385,7 +1385,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
   // to resolve it.
   for (unsigned i = 0, e = Interferences.size(); i != e; ++i) {
     SUnit *TrySU = Interferences[i];
-    SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+    SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
 
     // Try unscheduling up to the point where it's safe to schedule
     // this node.
@@ -1433,7 +1433,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
     // insert cross class copies.
     // If it's not too expensive, i.e. cost != -1, issue copies.
     SUnit *TrySU = Interferences[0];
-    SmallVector<unsigned, 4> &LRegs = LRegsMap[TrySU];
+    SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
     assert(LRegs.size() == 1 && "Can't handle this yet!");
     unsigned Reg = LRegs[0];
     SUnit *LRDef = LiveRegDefs[Reg];
@@ -1692,7 +1692,7 @@ public:
   unsigned getNodeOrdering(const SUnit *SU) const {
     if (!SU->getNode()) return 0;
 
-    return scheduleDAG->DAG->GetOrdering(SU->getNode());
+    return SU->getNode()->getIROrder();
   }
 
   bool empty() const { return Queue.empty(); }
@@ -2401,7 +2401,8 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
     bool RHasPhysReg = right->hasPhysRegDefs;
     if (LHasPhysReg != RHasPhysReg) {
       #ifndef NDEBUG
-      const char *const PhysRegMsg[] = {" has no physreg"," defines a physreg"};
+      static const char *const PhysRegMsg[] = { " has no physreg",
+                                                " defines a physreg" };
       #endif
       DEBUG(dbgs() << "  SU (" << left->NodeNum << ") "
             << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
@@ -3013,7 +3014,7 @@ llvm::createHybridListDAGScheduler(SelectionDAGISel *IS,
   const TargetMachine &TM = IS->TM;
   const TargetInstrInfo *TII = TM.getInstrInfo();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
-  const TargetLowering *TLI = &IS->getTargetLowering();
+  const TargetLowering *TLI = IS->getTargetLowering();
 
   HybridBURRPriorityQueue *PQ =
     new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
@@ -3029,7 +3030,7 @@ llvm::createILPListDAGScheduler(SelectionDAGISel *IS,
   const TargetMachine &TM = IS->TM;
   const TargetInstrInfo *TII = TM.getInstrInfo();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
-  const TargetLowering *TLI = &IS->getTargetLowering();
+  const TargetLowering *TLI = IS->getTargetLowering();
 
   ILPBURRPriorityQueue *PQ =
     new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index b22440d..054e3dd 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -690,21 +690,11 @@ void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
 }
 #endif // NDEBUG
 
-namespace {
-  struct OrderSorter {
-    bool operator()(const std::pair<unsigned, MachineInstr*> &A,
-                    const std::pair<unsigned, MachineInstr*> &B) {
-      return A.first < B.first;
-    }
-  };
-}
-
 /// ProcessSDDbgValues - Process SDDbgValues associated with this node.
-static void ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG,
-                               InstrEmitter &Emitter,
-                    SmallVector<std::pair<unsigned, MachineInstr*>, 32> &Orders,
-                            DenseMap<SDValue, unsigned> &VRBaseMap,
-                            unsigned Order) {
+static void
+ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
+                   SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
+                   DenseMap<SDValue, unsigned> &VRBaseMap, unsigned Order) {
   if (!N->getHasDebugValue())
     return;
 
@@ -731,12 +721,12 @@ static void ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG,
 // ProcessSourceNode - Process nodes with source order numbers. These are added
 // to a vector which EmitSchedule uses to determine how to insert dbg_value
 // instructions in the right order.
-static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG,
-                           InstrEmitter &Emitter,
-                           DenseMap<SDValue, unsigned> &VRBaseMap,
-                    SmallVector<std::pair<unsigned, MachineInstr*>, 32> &Orders,
-                           SmallSet<unsigned, 8> &Seen) {
-  unsigned Order = DAG->GetOrdering(N);
+static void
+ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
+                  DenseMap<SDValue, unsigned> &VRBaseMap,
+                  SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
+                  SmallSet<unsigned, 8> &Seen) {
+  unsigned Order = N->getIROrder();
   if (!Order || !Seen.insert(Order)) {
     // Process any valid SDDbgValues even if node does not have any order
     // assigned.
@@ -745,7 +735,10 @@ static void ProcessSourceNode(SDNode *N, SelectionDAG *DAG,
   }
 
   MachineBasicBlock *BB = Emitter.getBlock();
-  if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI()) {
+  if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI() ||
+      // Fast-isel may have inserted some instructions, in which case the
+      // BB->back().isPHI() test will not fire when we want it to.
+      prior(Emitter.getInsertPos())->isPHI()) {
     // Did not insert any instruction.
     Orders.push_back(std::make_pair(Order, (MachineInstr*)0));
     return;
@@ -858,7 +851,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
 
     // Sort the source order instructions and use the order to insert debug
     // values.
-    std::sort(Orders.begin(), Orders.end(), OrderSorter());
+    std::sort(Orders.begin(), Orders.end(), less_first());
 
     SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
     SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
@@ -883,7 +876,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
             // Insert at the instruction, which may be in a different
             // block, if the block was split by a custom inserter.
             MachineBasicBlock::iterator Pos = MI;
-            MI->getParent()->insert(llvm::next(Pos), DbgMI);
+            MI->getParent()->insert(Pos, DbgMI);
           }
         }
       }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 15235c8..45d5a4f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -13,7 +13,6 @@
 
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "SDNodeDbgValue.h"
-#include "SDNodeOrdering.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
@@ -636,9 +635,6 @@ void SelectionDAG::DeallocateNode(SDNode *N) {
 
   NodeAllocator.Deallocate(AllNodes.remove(N));
 
-  // Remove the ordering of this node.
-  Ordering->remove(N);
-
   // If any of the SDDbgValue nodes refer to this SDNode, invalidate them.
   ArrayRef<SDDbgValue*> DbgVals = DbgInfo->getSDDbgValues(N);
   for (unsigned i = 0, e = DbgVals.size(); i != e; ++i)
@@ -868,30 +864,30 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
                    PointerType::get(Type::getInt8Ty(*getContext()), 0) :
                    VT.getTypeForEVT(*getContext());
 
-  return TLI.getDataLayout()->getABITypeAlignment(Ty);
+  return TM.getTargetLowering()->getDataLayout()->getABITypeAlignment(Ty);
 }
 
 // EntryNode could meaningfully have debug info if we can find it...
 SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
-  : TM(tm), TLI(*tm.getTargetLowering()), TSI(*tm.getSelectionDAGInfo()),
-    TTI(0), OptLevel(OL), EntryNode(ISD::EntryToken, DebugLoc(),
-                                    getVTList(MVT::Other)),
-    Root(getEntryNode()), Ordering(0), UpdateListeners(0) {
+  : TM(tm), TSI(*tm.getSelectionDAGInfo()), TTI(0), TLI(0), OptLevel(OL),
+    EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
+    Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
+    UpdateListeners(0) {
   AllNodes.push_back(&EntryNode);
-  Ordering = new SDNodeOrdering();
   DbgInfo = new SDDbgInfo();
 }
 
-void SelectionDAG::init(MachineFunction &mf, const TargetTransformInfo *tti) {
+void SelectionDAG::init(MachineFunction &mf, const TargetTransformInfo *tti,
+                        const TargetLowering *tli) {
   MF = &mf;
   TTI = tti;
+  TLI = tli;
   Context = &mf.getFunction()->getContext();
 }
 
 SelectionDAG::~SelectionDAG() {
   assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
   allnodes_clear();
-  delete Ordering;
   delete DbgInfo;
 }
 
@@ -918,29 +914,28 @@ void SelectionDAG::clear() {
   EntryNode.UseList = 0;
   AllNodes.push_back(&EntryNode);
   Root = getEntryNode();
-  Ordering->clear();
   DbgInfo->clear();
 }
 
-SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) {
   return VT.bitsGT(Op.getValueType()) ?
     getNode(ISD::ANY_EXTEND, DL, VT, Op) :
     getNode(ISD::TRUNCATE, DL, VT, Op);
 }
 
-SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) {
   return VT.bitsGT(Op.getValueType()) ?
     getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
     getNode(ISD::TRUNCATE, DL, VT, Op);
 }
 
-SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, DebugLoc DL, EVT VT) {
+SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, SDLoc DL, EVT VT) {
   return VT.bitsGT(Op.getValueType()) ?
     getNode(ISD::ZERO_EXTEND, DL, VT, Op) :
     getNode(ISD::TRUNCATE, DL, VT, Op);
 }
 
-SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, DebugLoc DL, EVT VT) {
+SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, SDLoc DL, EVT VT) {
   assert(!VT.isVector() &&
          "getZeroExtendInReg should use the vector element type instead of "
          "the vector type!");
@@ -954,7 +949,7 @@ SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, DebugLoc DL, EVT VT) {
 
 /// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
 ///
-SDValue SelectionDAG::getNOT(DebugLoc DL, SDValue Val, EVT VT) {
+SDValue SelectionDAG::getNOT(SDLoc DL, SDValue Val, EVT VT) {
   EVT EltVT = VT.getScalarType();
   SDValue NegOne =
     getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
@@ -979,16 +974,66 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
   EVT EltVT = VT.getScalarType();
   const ConstantInt *Elt = &Val;
 
+  const TargetLowering *TLI = TM.getTargetLowering();
+
   // In some cases the vector type is legal but the element type is illegal and
   // needs to be promoted, for example v8i8 on ARM.  In this case, promote the
   // inserted value (the type does not need to match the vector element type).
   // Any extra bits introduced will be truncated away.
-  if (VT.isVector() && TLI.getTypeAction(*getContext(), EltVT) ==
+  if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
       TargetLowering::TypePromoteInteger) {
-   EltVT = TLI.getTypeToTransformTo(*getContext(), EltVT);
+   EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
    APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits());
    Elt = ConstantInt::get(*getContext(), NewVal);
   }
+  // In other cases the element type is illegal and needs to be expanded, for
+  // example v2i64 on MIPS32. In this case, find the nearest legal type, split
+  // the value into n parts and use a vector type with n-times the elements.
+  // Then bitcast to the type requested.
+  // Legalizing constants too early makes the DAGCombiner's job harder so we
+  // only legalize if the DAG tells us we must produce legal types.
+  else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
+           TLI->getTypeAction(*getContext(), EltVT) ==
+           TargetLowering::TypeExpandInteger) {
+    APInt NewVal = Elt->getValue();
+    EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+    unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+    unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+    EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+
+    // Check the temporary vector is the correct size. If this fails then
+    // getTypeToTransformTo() probably returned a type whose size (in bits)
+    // isn't a power-of-2 factor of the requested type size.
+    assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+
+    SmallVector<SDValue, 2> EltParts;
+    for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
+      EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
+                                           .trunc(ViaEltSizeInBits),
+                                     ViaEltVT, isT));
+    }
+
+    // EltParts is currently in little endian order. If we actually want
+    // big-endian order then reverse it now.
+    if (TLI->isBigEndian())
+      std::reverse(EltParts.begin(), EltParts.end());
+
+    // The elements must be reversed when the element order is different
+    // to the endianness of the elements (because the BITCAST is itself a
+    // vector shuffle in this situation). However, we do not need any code to
+    // perform this reversal because getConstant() is producing a vector
+    // splat.
+    // This situation occurs in MIPS MSA.
+
+    SmallVector<SDValue, 8> Ops;
+    for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+      Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
+
+    SDValue Result = getNode(ISD::BITCAST, SDLoc(), VT,
+                             getNode(ISD::BUILD_VECTOR, SDLoc(), ViaVecVT,
+                                     &Ops[0], Ops.size()));
+    return Result;
+  }
 
   assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
          "APInt size does not match type size!");
@@ -1012,13 +1057,13 @@ SDValue SelectionDAG::getConstant(const ConstantInt &Val, EVT VT, bool isT) {
   if (VT.isVector()) {
     SmallVector<SDValue, 8> Ops;
     Ops.assign(VT.getVectorNumElements(), Result);
-    Result = getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, &Ops[0], Ops.size());
+    Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, &Ops[0], Ops.size());
   }
   return Result;
 }
 
 SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, bool isTarget) {
-  return getConstant(Val, TLI.getPointerTy(), isTarget);
+  return getConstant(Val, TM.getTargetLowering()->getPointerTy(), isTarget);
 }
 
 
@@ -1054,8 +1099,8 @@ SDValue SelectionDAG::getConstantFP(const ConstantFP& V, EVT VT, bool isTarget){
   if (VT.isVector()) {
     SmallVector<SDValue, 8> Ops;
     Ops.assign(VT.getVectorNumElements(), Result);
-    // FIXME DebugLoc info might be appropriate here
-    Result = getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, &Ops[0], Ops.size());
+    // FIXME SDLoc info might be appropriate here
+    Result = getNode(ISD::BUILD_VECTOR, SDLoc(), VT, &Ops[0], Ops.size());
   }
   return Result;
 }
@@ -1077,15 +1122,16 @@ SDValue SelectionDAG::getConstantFP(double Val, EVT VT, bool isTarget) {
     llvm_unreachable("Unsupported type in getConstantFP");
 }
 
-SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL,
+SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, SDLoc DL,
                                        EVT VT, int64_t Offset,
                                        bool isTargetGA,
                                        unsigned char TargetFlags) {
   assert((TargetFlags == 0 || isTargetGA) &&
          "Cannot set target flags on target-independent globals");
+  const TargetLowering *TLI = TM.getTargetLowering();
 
   // Truncate (with sign-extension) the offset value to the pointer size.
-  unsigned BitWidth = TLI.getPointerTy().getSizeInBits();
+  unsigned BitWidth = TLI->getPointerTypeSizeInBits(GV->getType());
   if (BitWidth < 64)
     Offset = SignExtend64(Offset, BitWidth);
 
@@ -1112,7 +1158,8 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL,
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL, GV, VT,
+  SDNode *N = new (NodeAllocator) GlobalAddressSDNode(Opc, DL.getIROrder(),
+                                                      DL.getDebugLoc(), GV, VT,
                                                       Offset, TargetFlags);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
@@ -1161,7 +1208,8 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
   assert((TargetFlags == 0 || isTarget) &&
          "Cannot set target flags on target-independent globals");
   if (Alignment == 0)
-    Alignment = TLI.getDataLayout()->getPrefTypeAlignment(C->getType());
+    Alignment =
+    TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType());
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
@@ -1188,7 +1236,8 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
   assert((TargetFlags == 0 || isTarget) &&
          "Cannot set target flags on target-independent globals");
   if (Alignment == 0)
-    Alignment = TLI.getDataLayout()->getPrefTypeAlignment(C->getType());
+    Alignment =
+    TM.getTargetLowering()->getDataLayout()->getPrefTypeAlignment(C->getType());
   unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
@@ -1299,13 +1348,10 @@ static void commuteShuffle(SDValue &N1, SDValue &N2, SmallVectorImpl<int> &M) {
   }
 }
 
-SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
+SDValue SelectionDAG::getVectorShuffle(EVT VT, SDLoc dl, SDValue N1,
                                        SDValue N2, const int *Mask) {
-  assert(N1.getValueType() == N2.getValueType() && "Invalid VECTOR_SHUFFLE");
-  assert(VT.isVector() && N1.getValueType().isVector() &&
-         "Vector Shuffle VTs must be a vectors");
-  assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType()
-         && "Vector Shuffle VTs must have same element type");
+  assert(VT == N1.getValueType() && VT == N2.getValueType() &&
+         "Invalid VECTOR_SHUFFLE");
 
   // Canonicalize shuffle undef, undef -> undef
   if (N1.getOpcode() == ISD::UNDEF && N2.getOpcode() == ISD::UNDEF)
@@ -1354,17 +1400,13 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
     commuteShuffle(N1, N2, MaskVec);
   }
 
-  // If Identity shuffle, or all shuffle in to undef, return that node.
-  bool AllUndef = true;
+  // If Identity shuffle return that node.
   bool Identity = true;
   for (unsigned i = 0; i != NElts; ++i) {
     if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
-    if (MaskVec[i] >= 0) AllUndef = false;
   }
-  if (Identity && NElts == N1.getValueType().getVectorNumElements())
+  if (Identity && NElts)
     return N1;
-  if (AllUndef)
-    return getUNDEF(VT);
 
   FoldingSetNodeID ID;
   SDValue Ops[2] = { N1, N2 };
@@ -1383,13 +1425,15 @@ SDValue SelectionDAG::getVectorShuffle(EVT VT, DebugLoc dl, SDValue N1,
   memcpy(MaskAlloc, &MaskVec[0], NElts * sizeof(int));
 
   ShuffleVectorSDNode *N =
-    new (NodeAllocator) ShuffleVectorSDNode(VT, dl, N1, N2, MaskAlloc);
+    new (NodeAllocator) ShuffleVectorSDNode(VT, dl.getIROrder(),
+                                            dl.getDebugLoc(), N1, N2,
+                                            MaskAlloc);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getConvertRndSat(EVT VT, DebugLoc dl,
+SDValue SelectionDAG::getConvertRndSat(EVT VT, SDLoc dl,
                                        SDValue Val, SDValue DTy,
                                        SDValue STy, SDValue Rnd, SDValue Sat,
                                        ISD::CvtCode Code) {
@@ -1406,8 +1450,9 @@ SDValue SelectionDAG::getConvertRndSat(EVT VT, DebugLoc dl,
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl, Ops, 5,
-                                                           Code);
+  CvtRndSatSDNode *N = new (NodeAllocator) CvtRndSatSDNode(VT, dl.getIROrder(),
+                                                           dl.getDebugLoc(),
+                                                           Ops, 5, Code);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -1441,7 +1486,7 @@ SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getEHLabel(DebugLoc dl, SDValue Root, MCSymbol *Label) {
+SDValue SelectionDAG::getEHLabel(SDLoc dl, SDValue Root, MCSymbol *Label) {
   FoldingSetNodeID ID;
   SDValue Ops[] = { Root };
   AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), &Ops[0], 1);
@@ -1450,7 +1495,8 @@ SDValue SelectionDAG::getEHLabel(DebugLoc dl, SDValue Root, MCSymbol *Label) {
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  SDNode *N = new (NodeAllocator) EHLabelSDNode(dl, Root, Label);
+  SDNode *N = new (NodeAllocator) EHLabelSDNode(dl.getIROrder(),
+                                                dl.getDebugLoc(), Root, Label);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -1513,16 +1559,36 @@ SDValue SelectionDAG::getMDNode(const MDNode *MD) {
   return SDValue(N, 0);
 }
 
+/// getAddrSpaceCast - Return an AddrSpaceCastSDNode.
+SDValue SelectionDAG::getAddrSpaceCast(SDLoc dl, EVT VT, SDValue Ptr,
+                                       unsigned SrcAS, unsigned DestAS) {
+  SDValue Ops[] = {Ptr};
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), &Ops[0], 1);
+  ID.AddInteger(SrcAS);
+  ID.AddInteger(DestAS);
+
+  void *IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
+    return SDValue(E, 0);
+
+  SDNode *N = new (NodeAllocator) AddrSpaceCastSDNode(dl.getIROrder(),
+                                                      dl.getDebugLoc(),
+                                                      VT, Ptr, SrcAS, DestAS);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
 
 /// getShiftAmountOperand - Return the specified value casted to
 /// the target's desired shift amount type.
 SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
   EVT OpTy = Op.getValueType();
-  EVT ShTy = TLI.getShiftAmountTy(LHSTy);
+  EVT ShTy = TM.getTargetLowering()->getShiftAmountTy(LHSTy);
   if (OpTy == ShTy || OpTy.isVector()) return Op;
 
   ISD::NodeType Opcode = OpTy.bitsGT(ShTy) ?  ISD::TRUNCATE : ISD::ZERO_EXTEND;
-  return getNode(Opcode, Op.getDebugLoc(), ShTy, Op);
+  return getNode(Opcode, SDLoc(Op), ShTy, Op);
 }
 
 /// CreateStackTemporary - Create a stack temporary, suitable for holding the
@@ -1531,11 +1597,12 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
   MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
   unsigned ByteSize = VT.getStoreSize();
   Type *Ty = VT.getTypeForEVT(*getContext());
+  const TargetLowering *TLI = TM.getTargetLowering();
   unsigned StackAlign =
-  std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty), minAlign);
+  std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty), minAlign);
 
   int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
-  return getFrameIndex(FrameIdx, TLI.getPointerTy());
+  return getFrameIndex(FrameIdx, TLI->getPointerTy());
 }
 
 /// CreateStackTemporary - Create a stack temporary suitable for holding
@@ -1545,24 +1612,30 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
                             VT2.getStoreSizeInBits())/8;
   Type *Ty1 = VT1.getTypeForEVT(*getContext());
   Type *Ty2 = VT2.getTypeForEVT(*getContext());
-  const DataLayout *TD = TLI.getDataLayout();
+  const TargetLowering *TLI = TM.getTargetLowering();
+  const DataLayout *TD = TLI->getDataLayout();
   unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
                             TD->getPrefTypeAlignment(Ty2));
 
   MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
   int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align, false);
-  return getFrameIndex(FrameIdx, TLI.getPointerTy());
+  return getFrameIndex(FrameIdx, TLI->getPointerTy());
 }
 
 SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
-                                SDValue N2, ISD::CondCode Cond, DebugLoc dl) {
+                                SDValue N2, ISD::CondCode Cond, SDLoc dl) {
   // These setcc operations always fold.
   switch (Cond) {
   default: break;
   case ISD::SETFALSE:
   case ISD::SETFALSE2: return getConstant(0, VT);
   case ISD::SETTRUE:
-  case ISD::SETTRUE2:  return getConstant(1, VT);
+  case ISD::SETTRUE2: {
+    const TargetLowering *TLI = TM.getTargetLowering();
+    TargetLowering::BooleanContent Cnt = TLI->getBooleanContents(VT.isVector());
+    return getConstant(
+        Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
+  }
 
   case ISD::SETOEQ:
   case ISD::SETOGT:
@@ -1644,7 +1717,12 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
       }
     } else {
       // Ensure that the constant occurs on the RHS.
-      return getSetCC(dl, VT, N2, N1, ISD::getSetCCSwappedOperands(Cond));
+      ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
+      MVT CompVT = N1.getValueType().getSimpleVT();
+      if (!TM.getTargetLowering()->isCondCodeLegal(SwappedCond, CompVT))
+        return SDValue();
+
+      return getSetCC(dl, VT, N2, N1, SwappedCond);
     }
   }
 
@@ -1680,6 +1758,7 @@ bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
 /// processing.
 void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
                                      APInt &KnownOne, unsigned Depth) const {
+  const TargetLowering *TLI = TM.getTargetLowering();
   unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
 
   KnownZero = KnownOne = APInt(BitWidth, 0);   // Don't know anything.
@@ -1802,7 +1881,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
     // The boolean result conforms to getBooleanContents.  Fall through.
   case ISD::SETCC:
     // If we know the result of a setcc has the top bits zero, use this info.
-    if (TLI.getBooleanContents(Op.getValueType().isVector()) ==
+    if (TLI->getBooleanContents(Op.getValueType().isVector()) ==
         TargetLowering::ZeroOrOneBooleanContent && BitWidth > 1)
       KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
     return;
@@ -1942,7 +2021,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
   case ISD::SIGN_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
     unsigned InBits = InVT.getScalarType().getSizeInBits();
-    APInt InSignBit = APInt::getSignBit(InBits);
     APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
 
     KnownZero = KnownZero.trunc(InBits);
@@ -2054,7 +2132,6 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
       const APInt &RA = Rem->getAPIntValue().abs();
       if (RA.isPowerOf2()) {
         APInt LowBits = RA - 1;
-        APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
         ComputeMaskedBits(Op.getOperand(0), KnownZero2,KnownOne2,Depth+1);
 
         // The low bits of the first operand are unchanged by the srem.
@@ -2114,7 +2191,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
   case ISD::INTRINSIC_W_CHAIN:
   case ISD::INTRINSIC_VOID:
     // Allow the target to implement this method for its nodes.
-    TLI.computeMaskedBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth);
+    TLI->computeMaskedBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth);
     return;
   }
 }
@@ -2125,6 +2202,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero,
 /// information.  For example, immediately after an "SRA X, 2", we know that
 /// the top 3 bits are all equal to each other, so we return 3.
 unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
+  const TargetLowering *TLI = TM.getTargetLowering();
   EVT VT = Op.getValueType();
   assert(VT.isInteger() && "Invalid VT!");
   unsigned VTBits = VT.getScalarType().getSizeInBits();
@@ -2149,7 +2227,8 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
   }
 
   case ISD::SIGN_EXTEND:
-    Tmp = VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+    Tmp =
+        VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
     return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
 
   case ISD::SIGN_EXTEND_INREG:
@@ -2209,7 +2288,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
     // The boolean result conforms to getBooleanContents.  Fall through.
   case ISD::SETCC:
     // If setcc returns 0/-1, all bits are sign bits.
-    if (TLI.getBooleanContents(Op.getValueType().isVector()) ==
+    if (TLI->getBooleanContents(Op.getValueType().isVector()) ==
         TargetLowering::ZeroOrNegativeOneBooleanContent)
       return VTBits;
     break;
@@ -2310,7 +2389,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const{
       Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
       Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
       Op.getOpcode() == ISD::INTRINSIC_VOID) {
-    unsigned NumBits = TLI.ComputeNumSignBitsForTargetNode(Op, Depth);
+    unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, Depth);
     if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
   }
 
@@ -2403,14 +2482,15 @@ bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
 
 /// getNode - Gets or creates the specified node.
 ///
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT) {
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, Opcode, getVTList(VT), 0, 0);
   void *IP = 0;
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  SDNode *N = new (NodeAllocator) SDNode(Opcode, DL, getVTList(VT));
+  SDNode *N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(),
+                                         DL.getDebugLoc(), getVTList(VT));
   CSEMap.InsertNode(N, IP);
 
   AllNodes.push_back(N);
@@ -2420,7 +2500,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT) {
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
                               EVT VT, SDValue Operand) {
   // Constant fold unary operations with an integer constant operand.
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand.getNode())) {
@@ -2671,10 +2751,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTs, Operand);
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                        DL.getDebugLoc(), VTs, Operand);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTs, Operand);
+    N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                        DL.getDebugLoc(), VTs, Operand);
   }
 
   AllNodes.push_back(N);
@@ -2789,11 +2871,11 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, EVT VT,
     return Outputs.back();
 
   // Otherwise build a big vector out of the scalar elements we generated.
-  return getNode(ISD::BUILD_VECTOR, DebugLoc(), VT, Outputs.data(),
+  return getNode(ISD::BUILD_VECTOR, SDLoc(), VT, Outputs.data(),
                  Outputs.size());
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT, SDValue N1,
                               SDValue N2) {
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
   ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
@@ -3072,9 +3154,10 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1,
     if (VT.isSimple() && N1.getValueType().isSimple()) {
       assert(VT.isVector() && N1.getValueType().isVector() &&
              "Extract subvector VTs must be a vectors!");
-      assert(VT.getVectorElementType() == N1.getValueType().getVectorElementType() &&
+      assert(VT.getVectorElementType() ==
+             N1.getValueType().getVectorElementType() &&
              "Extract subvector VTs must have the same element type!");
-      assert(VT.getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+      assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
              "Extract subvector must be from larger vector to smaller vector!");
 
       if (isa<ConstantSDNode>(Index.getNode())) {
@@ -3085,7 +3168,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1,
       }
 
       // Trivial extraction.
-      if (VT.getSimpleVT() == N1.getValueType().getSimpleVT())
+      if (VT.getSimpleVT() == N1.getSimpleValueType())
         return N1;
     }
     break;
@@ -3243,10 +3326,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTs, N1, N2);
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                         DL.getDebugLoc(), VTs, N1, N2);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTs, N1, N2);
+    N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                         DL.getDebugLoc(), VTs, N1, N2);
   }
 
   AllNodes.push_back(N);
@@ -3256,11 +3341,26 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, SDValue N1,
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
                               SDValue N1, SDValue N2, SDValue N3) {
   // Perform various simplifications.
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
   switch (Opcode) {
+  case ISD::FMA: {
+    ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+    ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+    ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
+    if (N1CFP && N2CFP && N3CFP) {
+      APFloat  V1 = N1CFP->getValueAPF();
+      const APFloat &V2 = N2CFP->getValueAPF();
+      const APFloat &V3 = N3CFP->getValueAPF();
+      APFloat::opStatus s =
+        V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
+      if (s != APFloat::opInvalidOp)
+        return getConstantFP(V1, VT);
+    }
+    break;
+  }
   case ISD::CONCAT_VECTORS:
     // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to
     // one big BUILD_VECTOR.
@@ -3300,7 +3400,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
              "Insert subvector VTs must be a vectors");
       assert(VT == N1.getValueType() &&
              "Dest and insert subvector source types must match!");
-      assert(N2.getValueType().getSimpleVT() <= N1.getValueType().getSimpleVT() &&
+      assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
              "Insert subvector must be from smaller vector to larger vector!");
       if (isa<ConstantSDNode>(Index.getNode())) {
         assert((N2.getValueType().getVectorNumElements() +
@@ -3310,7 +3410,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
       }
 
       // Trivial insertion.
-      if (VT.getSimpleVT() == N2.getValueType().getSimpleVT())
+      if (VT.getSimpleVT() == N2.getSimpleValueType())
         return N2;
     }
     break;
@@ -3333,10 +3433,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTs, N1, N2, N3);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTs, N1, N2, N3);
+    N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTs, N1, N2, N3);
   }
 
   AllNodes.push_back(N);
@@ -3346,14 +3448,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
                               SDValue N1, SDValue N2, SDValue N3,
                               SDValue N4) {
   SDValue Ops[] = { N1, N2, N3, N4 };
   return getNode(Opcode, DL, VT, Ops, 4);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
                               SDValue N1, SDValue N2, SDValue N3,
                               SDValue N4, SDValue N5) {
   SDValue Ops[] = { N1, N2, N3, N4, N5 };
@@ -3379,14 +3481,14 @@ SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
           ArgChains.push_back(SDValue(L, 1));
 
   // Build a tokenfactor for all the chains.
-  return getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other,
+  return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
                  &ArgChains[0], ArgChains.size());
 }
 
 /// getMemsetValue - Vectorized representation of the memset value
 /// operand.
 static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
-                              DebugLoc dl) {
+                              SDLoc dl) {
   assert(Value.getOpcode() != ISD::UNDEF);
 
   unsigned NumBits = VT.getScalarType().getSizeInBits();
@@ -3412,7 +3514,7 @@ static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
 /// getMemsetStringVal - Similar to getMemsetValue. Except this is only
 /// used when a memcpy is turned into a memset when the source is a constant
 /// string ptr.
-static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
+static SDValue getMemsetStringVal(EVT VT, SDLoc dl, SelectionDAG &DAG,
                                   const TargetLowering &TLI, StringRef Str) {
   // Handle vector with all elements zero.
   if (Str.empty()) {
@@ -3454,10 +3556,10 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG,
 
 /// getMemBasePlusOffset - Returns base and offset node for the
 ///
-static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset,
+static SDValue getMemBasePlusOffset(SDValue Base, unsigned Offset, SDLoc dl,
                                       SelectionDAG &DAG) {
   EVT VT = Base.getValueType();
-  return DAG.getNode(ISD::ADD, Base.getDebugLoc(),
+  return DAG.getNode(ISD::ADD, dl,
                      VT, Base, DAG.getConstant(Offset, VT));
 }
 
@@ -3585,7 +3687,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
   return true;
 }
 
-static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
                                        SDValue Chain, SDValue Dst,
                                        SDValue Src, uint64_t Size,
                                        unsigned Align, bool isVol,
@@ -3630,7 +3732,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
     unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
 
     // Don't promote to an alignment that would require dynamic stack
-    // realignment.  
+    // realignment.
     const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
     if (!TRI->needsStackRealignment(MF))
        while (NewAlign > Align &&
@@ -3671,7 +3773,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
       Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
       if (Value.getNode())
         Store = DAG.getStore(Chain, dl, Value,
-                             getMemBasePlusOffset(Dst, DstOff, DAG),
+                             getMemBasePlusOffset(Dst, DstOff, dl, DAG),
                              DstPtrInfo.getWithOffset(DstOff), isVol,
                              false, Align);
     }
@@ -3685,11 +3787,11 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
       EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
       assert(NVT.bitsGE(VT));
       Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
-                             getMemBasePlusOffset(Src, SrcOff, DAG),
+                             getMemBasePlusOffset(Src, SrcOff, dl, DAG),
                              SrcPtrInfo.getWithOffset(SrcOff), VT, isVol, false,
                              MinAlign(SrcAlign, SrcOff));
       Store = DAG.getTruncStore(Chain, dl, Value,
-                                getMemBasePlusOffset(Dst, DstOff, DAG),
+                                getMemBasePlusOffset(Dst, DstOff, dl, DAG),
                                 DstPtrInfo.getWithOffset(DstOff), VT, isVol,
                                 false, Align);
     }
@@ -3703,7 +3805,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
                      &OutChains[0], OutChains.size());
 }
 
-static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
+static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, SDLoc dl,
                                         SDValue Chain, SDValue Dst,
                                         SDValue Src, uint64_t Size,
                                         unsigned Align,  bool isVol,
@@ -3755,10 +3857,10 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
   for (unsigned i = 0; i < NumMemOps; i++) {
     EVT VT = MemOps[i];
     unsigned VTSize = VT.getSizeInBits() / 8;
-    SDValue Value, Store;
+    SDValue Value;
 
     Value = DAG.getLoad(VT, dl, Chain,
-                        getMemBasePlusOffset(Src, SrcOff, DAG),
+                        getMemBasePlusOffset(Src, SrcOff, dl, DAG),
                         SrcPtrInfo.getWithOffset(SrcOff), isVol,
                         false, false, SrcAlign);
     LoadValues.push_back(Value);
@@ -3771,10 +3873,10 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
   for (unsigned i = 0; i < NumMemOps; i++) {
     EVT VT = MemOps[i];
     unsigned VTSize = VT.getSizeInBits() / 8;
-    SDValue Value, Store;
+    SDValue Store;
 
     Store = DAG.getStore(Chain, dl, LoadValues[i],
-                         getMemBasePlusOffset(Dst, DstOff, DAG),
+                         getMemBasePlusOffset(Dst, DstOff, dl, DAG),
                          DstPtrInfo.getWithOffset(DstOff), isVol, false, Align);
     OutChains.push_back(Store);
     DstOff += VTSize;
@@ -3784,7 +3886,25 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
                      &OutChains[0], OutChains.size());
 }
 
-static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
+/// \brief Lower the call to 'memset' intrinsic function into a series of store
+/// operations.
+///
+/// \param DAG Selection DAG where lowered code is placed.
+/// \param dl Link to corresponding IR location.
+/// \param Chain Control flow dependency.
+/// \param Dst Pointer to destination memory location.
+/// \param Src Value of byte to write into the memory.
+/// \param Size Number of bytes to write.
+/// \param Align Alignment of the destination in bytes.
+/// \param isVol True if destination is volatile.
+/// \param DstPtrInfo IR information on the memory pointer.
+/// \returns New head in the control flow, if lowering was successful, empty
+/// SDValue otherwise.
+///
+/// The function tries to replace 'llvm.memset' intrinsic with several store
+/// operations and value calculation code. This is usually profitable for small
+/// memory size.
+static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl,
                                SDValue Chain, SDValue Dst,
                                SDValue Src, uint64_t Size,
                                unsigned Align, bool isVol,
@@ -3856,7 +3976,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
     }
     assert(Value.getValueType() == VT && "Value with wrong type.");
     SDValue Store = DAG.getStore(Chain, dl, Value,
-                                 getMemBasePlusOffset(Dst, DstOff, DAG),
+                                 getMemBasePlusOffset(Dst, DstOff, dl, DAG),
                                  DstPtrInfo.getWithOffset(DstOff),
                                  isVol, false, Align);
     OutChains.push_back(Store);
@@ -3868,7 +3988,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
                      &OutChains[0], OutChains.size());
 }
 
-SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
+SDValue SelectionDAG::getMemcpy(SDValue Chain, SDLoc dl, SDValue Dst,
                                 SDValue Src, SDValue Size,
                                 unsigned Align, bool isVol, bool AlwaysInline,
                                 MachinePointerInfo DstPtrInfo,
@@ -3914,29 +4034,31 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
   // beyond the given memory regions. But fixing this isn't easy, and most
   // people don't care.
 
+  const TargetLowering *TLI = TM.getTargetLowering();
+
   // Emit a library call.
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
-  Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext());
+  Entry.Ty = TLI->getDataLayout()->getIntPtrType(*getContext());
   Entry.Node = Dst; Args.push_back(Entry);
   Entry.Node = Src; Args.push_back(Entry);
   Entry.Node = Size; Args.push_back(Entry);
-  // FIXME: pass in DebugLoc
+  // FIXME: pass in SDLoc
   TargetLowering::
   CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
                     false, false, false, false, 0,
-                    TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                    TLI->getLibcallCallingConv(RTLIB::MEMCPY),
                     /*isTailCall=*/false,
                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/false,
-                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMCPY),
-                                      TLI.getPointerTy()),
+                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+                                      TLI->getPointerTy()),
                     Args, *this, dl);
-  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+  std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
 
   return CallResult.second;
 }
 
-SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
+SDValue SelectionDAG::getMemmove(SDValue Chain, SDLoc dl, SDValue Dst,
                                  SDValue Src, SDValue Size,
                                  unsigned Align, bool isVol,
                                  MachinePointerInfo DstPtrInfo,
@@ -3970,29 +4092,31 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
   // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
   // not be safe.  See memcpy above for more details.
 
+  const TargetLowering *TLI = TM.getTargetLowering();
+
   // Emit a library call.
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
-  Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext());
+  Entry.Ty = TLI->getDataLayout()->getIntPtrType(*getContext());
   Entry.Node = Dst; Args.push_back(Entry);
   Entry.Node = Src; Args.push_back(Entry);
   Entry.Node = Size; Args.push_back(Entry);
-  // FIXME:  pass in DebugLoc
+  // FIXME:  pass in SDLoc
   TargetLowering::
   CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
                     false, false, false, false, 0,
-                    TLI.getLibcallCallingConv(RTLIB::MEMMOVE),
+                    TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
                     /*isTailCall=*/false,
                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/false,
-                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMMOVE),
-                                      TLI.getPointerTy()),
+                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+                                      TLI->getPointerTy()),
                     Args, *this, dl);
-  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+  std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
 
   return CallResult.second;
 }
 
-SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
+SDValue SelectionDAG::getMemset(SDValue Chain, SDLoc dl, SDValue Dst,
                                 SDValue Src, SDValue Size,
                                 unsigned Align, bool isVol,
                                 MachinePointerInfo DstPtrInfo) {
@@ -4023,7 +4147,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
     return Result;
 
   // Emit a library call.
-  Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*getContext());
+  const TargetLowering *TLI = TM.getTargetLowering();
+  Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(*getContext());
   TargetLowering::ArgListTy Args;
   TargetLowering::ArgListEntry Entry;
   Entry.Node = Dst; Entry.Ty = IntPtrTy;
@@ -4041,22 +4166,53 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
   Entry.Ty = IntPtrTy;
   Entry.isSExt = false;
   Args.push_back(Entry);
-  // FIXME: pass in DebugLoc
+  // FIXME: pass in SDLoc
   TargetLowering::
   CallLoweringInfo CLI(Chain, Type::getVoidTy(*getContext()),
                     false, false, false, false, 0,
-                    TLI.getLibcallCallingConv(RTLIB::MEMSET),
+                    TLI->getLibcallCallingConv(RTLIB::MEMSET),
                     /*isTailCall=*/false,
                     /*doesNotReturn*/false, /*isReturnValueUsed=*/false,
-                    getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
-                                      TLI.getPointerTy()),
+                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+                                      TLI->getPointerTy()),
                     Args, *this, dl);
-  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
+  std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
 
   return CallResult.second;
 }
 
-SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
+                                SDVTList VTList, SDValue* Ops, unsigned NumOps,
+                                MachineMemOperand *MMO,
+                                AtomicOrdering Ordering,
+                                SynchronizationScope SynchScope) {
+  FoldingSetNodeID ID;
+  ID.AddInteger(MemVT.getRawBits());
+  AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps);
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void* IP = 0;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<AtomicSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+
+  // Allocate the operands array for the node out of the BumpPtrAllocator, since
+  // SDNode doesn't have access to it.  This memory will be "leaked" when
+  // the node is deallocated, but recovered when the allocator is released.
+  // If the number of operands is less than 5 we use AtomicSDNode's internal
+  // storage.
+  SDUse *DynOps = NumOps > 4 ? OperandAllocator.Allocate<SDUse>(NumOps) : 0;
+
+  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl.getIROrder(),
+                                               dl.getDebugLoc(), VTList, MemVT,
+                                               Ops, DynOps, NumOps, MMO,
+                                               Ordering, SynchScope);
+  CSEMap.InsertNode(N, IP);
+  AllNodes.push_back(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
                                 SDValue Chain, SDValue Ptr, SDValue Cmp,
                                 SDValue Swp, MachinePointerInfo PtrInfo,
                                 unsigned Alignment,
@@ -4084,7 +4240,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
                    Ordering, SynchScope);
 }
 
-SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
                                 SDValue Chain,
                                 SDValue Ptr, SDValue Cmp,
                                 SDValue Swp, MachineMemOperand *MMO,
@@ -4096,25 +4252,11 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
   EVT VT = Cmp.getValueType();
 
   SDVTList VTs = getVTList(VT, MVT::Other);
-  FoldingSetNodeID ID;
-  ID.AddInteger(MemVT.getRawBits());
   SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
-  AddNodeIDNode(ID, Opcode, VTs, Ops, 4);
-  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
-  void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
-    cast<AtomicSDNode>(E)->refineAlignment(MMO);
-    return SDValue(E, 0);
-  }
-  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
-                                               Ptr, Cmp, Swp, MMO, Ordering,
-                                               SynchScope);
-  CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
-  return SDValue(N, 0);
+  return getAtomic(Opcode, dl, MemVT, VTs, Ops, 4, MMO, Ordering, SynchScope);
 }
 
-SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
                                 SDValue Chain,
                                 SDValue Ptr, SDValue Val,
                                 const Value* PtrVal,
@@ -4145,7 +4287,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
                    Ordering, SynchScope);
 }
 
-SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
                                 SDValue Chain,
                                 SDValue Ptr, SDValue Val,
                                 MachineMemOperand *MMO,
@@ -4169,25 +4311,11 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
 
   SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
                                                getVTList(VT, MVT::Other);
-  FoldingSetNodeID ID;
-  ID.AddInteger(MemVT.getRawBits());
   SDValue Ops[] = {Chain, Ptr, Val};
-  AddNodeIDNode(ID, Opcode, VTs, Ops, 3);
-  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
-  void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
-    cast<AtomicSDNode>(E)->refineAlignment(MMO);
-    return SDValue(E, 0);
-  }
-  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
-                                               Ptr, Val, MMO,
-                                               Ordering, SynchScope);
-  CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
-  return SDValue(N, 0);
+  return getAtomic(Opcode, dl, MemVT, VTs, Ops, 3, MMO, Ordering, SynchScope);
 }
 
-SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
                                 EVT VT, SDValue Chain,
                                 SDValue Ptr,
                                 const Value* PtrVal,
@@ -4218,7 +4346,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
                    Ordering, SynchScope);
 }
 
-SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
+SDValue SelectionDAG::getAtomic(unsigned Opcode, SDLoc dl, EVT MemVT,
                                 EVT VT, SDValue Chain,
                                 SDValue Ptr,
                                 MachineMemOperand *MMO,
@@ -4227,26 +4355,13 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
   assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
 
   SDVTList VTs = getVTList(VT, MVT::Other);
-  FoldingSetNodeID ID;
-  ID.AddInteger(MemVT.getRawBits());
   SDValue Ops[] = {Chain, Ptr};
-  AddNodeIDNode(ID, Opcode, VTs, Ops, 2);
-  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
-  void* IP = 0;
-  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
-    cast<AtomicSDNode>(E)->refineAlignment(MMO);
-    return SDValue(E, 0);
-  }
-  SDNode *N = new (NodeAllocator) AtomicSDNode(Opcode, dl, VTs, MemVT, Chain,
-                                               Ptr, MMO, Ordering, SynchScope);
-  CSEMap.InsertNode(N, IP);
-  AllNodes.push_back(N);
-  return SDValue(N, 0);
+  return getAtomic(Opcode, dl, MemVT, VTs, Ops, 2, MMO, Ordering, SynchScope);
 }
 
 /// getMergeValues - Create a MERGE_VALUES node from the given operands.
 SDValue SelectionDAG::getMergeValues(const SDValue *Ops, unsigned NumOps,
-                                     DebugLoc dl) {
+                                     SDLoc dl) {
   if (NumOps == 1)
     return Ops[0];
 
@@ -4259,7 +4374,7 @@ SDValue SelectionDAG::getMergeValues(const SDValue *Ops, unsigned NumOps,
 }
 
 SDValue
-SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl,
                                   const EVT *VTs, unsigned NumVTs,
                                   const SDValue *Ops, unsigned NumOps,
                                   EVT MemVT, MachinePointerInfo PtrInfo,
@@ -4271,7 +4386,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl,
 }
 
 SDValue
-SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
                                   const SDValue *Ops, unsigned NumOps,
                                   EVT MemVT, MachinePointerInfo PtrInfo,
                                   unsigned Align, bool Vol,
@@ -4294,7 +4409,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
 }
 
 SDValue
-SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
+SelectionDAG::getMemIntrinsicNode(unsigned Opcode, SDLoc dl, SDVTList VTList,
                                   const SDValue *Ops, unsigned NumOps,
                                   EVT MemVT, MachineMemOperand *MMO) {
   assert((Opcode == ISD::INTRINSIC_VOID ||
@@ -4318,12 +4433,14 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
       return SDValue(E, 0);
     }
 
-    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps,
-                                               MemVT, MMO);
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
+                                               dl.getDebugLoc(), VTList, Ops,
+                                               NumOps, MemVT, MMO);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl, VTList, Ops, NumOps,
-                                               MemVT, MMO);
+    N = new (NodeAllocator) MemIntrinsicSDNode(Opcode, dl.getIROrder(),
+                                               dl.getDebugLoc(), VTList, Ops,
+                                               NumOps, MemVT, MMO);
   }
   AllNodes.push_back(N);
   return SDValue(N, 0);
@@ -4365,7 +4482,7 @@ static MachinePointerInfo InferPointerInfo(SDValue Ptr, SDValue OffsetOp) {
 
 SDValue
 SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
-                      EVT VT, DebugLoc dl, SDValue Chain,
+                      EVT VT, SDLoc dl, SDValue Chain,
                       SDValue Ptr, SDValue Offset,
                       MachinePointerInfo PtrInfo, EVT MemVT,
                       bool isVolatile, bool isNonTemporal, bool isInvariant,
@@ -4398,7 +4515,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
 
 SDValue
 SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
-                      EVT VT, DebugLoc dl, SDValue Chain,
+                      EVT VT, SDLoc dl, SDValue Chain,
                       SDValue Ptr, SDValue Offset, EVT MemVT,
                       MachineMemOperand *MMO) {
   if (VT == MemVT) {
@@ -4437,14 +4554,15 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
     cast<LoadSDNode>(E)->refineAlignment(MMO);
     return SDValue(E, 0);
   }
-  SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl, VTs, AM, ExtType,
+  SDNode *N = new (NodeAllocator) LoadSDNode(Ops, dl.getIROrder(),
+                                             dl.getDebugLoc(), VTs, AM, ExtType,
                                              MemVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl,
+SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
                               SDValue Chain, SDValue Ptr,
                               MachinePointerInfo PtrInfo,
                               bool isVolatile, bool isNonTemporal,
@@ -4457,7 +4575,15 @@ SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl,
                  TBAAInfo, Ranges);
 }
 
-SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, DebugLoc dl, EVT VT,
+SDValue SelectionDAG::getLoad(EVT VT, SDLoc dl,
+                              SDValue Chain, SDValue Ptr,
+                              MachineMemOperand *MMO) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+                 VT, MMO);
+}
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
                                  SDValue Chain, SDValue Ptr,
                                  MachinePointerInfo PtrInfo, EVT MemVT,
                                  bool isVolatile, bool isNonTemporal,
@@ -4469,8 +4595,16 @@ SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, DebugLoc dl, EVT VT,
 }
 
 
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, SDLoc dl, EVT VT,
+                                 SDValue Chain, SDValue Ptr, EVT MemVT,
+                                 MachineMemOperand *MMO) {
+  SDValue Undef = getUNDEF(Ptr.getValueType());
+  return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
+                 MemVT, MMO);
+}
+
 SDValue
-SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base,
+SelectionDAG::getIndexedLoad(SDValue OrigLoad, SDLoc dl, SDValue Base,
                              SDValue Offset, ISD::MemIndexedMode AM) {
   LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
   assert(LD->getOffset().getOpcode() == ISD::UNDEF &&
@@ -4481,7 +4615,7 @@ SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base,
                  false, LD->getAlignment());
 }
 
-SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
+SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
                                SDValue Ptr, MachinePointerInfo PtrInfo,
                                bool isVolatile, bool isNonTemporal,
                                unsigned Alignment, const MDNode *TBAAInfo) {
@@ -4508,7 +4642,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
   return getStore(Chain, dl, Val, Ptr, MMO);
 }
 
-SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
+SDValue SelectionDAG::getStore(SDValue Chain, SDLoc dl, SDValue Val,
                                SDValue Ptr, MachineMemOperand *MMO) {
   assert(Chain.getValueType() == MVT::Other &&
         "Invalid chain type");
@@ -4527,14 +4661,15 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
     cast<StoreSDNode>(E)->refineAlignment(MMO);
     return SDValue(E, 0);
   }
-  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED,
-                                              false, VT, MMO);
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+                                              dl.getDebugLoc(), VTs,
+                                              ISD::UNINDEXED, false, VT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
+SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
                                     SDValue Ptr, MachinePointerInfo PtrInfo,
                                     EVT SVT,bool isVolatile, bool isNonTemporal,
                                     unsigned Alignment,
@@ -4561,7 +4696,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
   return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
 }
 
-SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
+SDValue SelectionDAG::getTruncStore(SDValue Chain, SDLoc dl, SDValue Val,
                                     SDValue Ptr, EVT SVT,
                                     MachineMemOperand *MMO) {
   EVT VT = Val.getValueType();
@@ -4595,15 +4730,16 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
     cast<StoreSDNode>(E)->refineAlignment(MMO);
     return SDValue(E, 0);
   }
-  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, ISD::UNINDEXED,
-                                              true, SVT, MMO);
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+                                              dl.getDebugLoc(), VTs,
+                                              ISD::UNINDEXED, true, SVT, MMO);
   CSEMap.InsertNode(N, IP);
   AllNodes.push_back(N);
   return SDValue(N, 0);
 }
 
 SDValue
-SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
+SelectionDAG::getIndexedStore(SDValue OrigStore, SDLoc dl, SDValue Base,
                               SDValue Offset, ISD::MemIndexedMode AM) {
   StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
   assert(ST->getOffset().getOpcode() == ISD::UNDEF &&
@@ -4619,7 +4755,8 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
   if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
     return SDValue(E, 0);
 
-  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl, VTs, AM,
+  SDNode *N = new (NodeAllocator) StoreSDNode(Ops, dl.getIROrder(),
+                                              dl.getDebugLoc(), VTs, AM,
                                               ST->isTruncatingStore(),
                                               ST->getMemoryVT(),
                                               ST->getMemOperand());
@@ -4628,7 +4765,7 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base,
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getVAArg(EVT VT, DebugLoc dl,
+SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl,
                                SDValue Chain, SDValue Ptr,
                                SDValue SV,
                                unsigned Align) {
@@ -4636,7 +4773,7 @@ SDValue SelectionDAG::getVAArg(EVT VT, DebugLoc dl,
   return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops, 4);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
                               const SDUse *Ops, unsigned NumOps) {
   switch (NumOps) {
   case 0: return getNode(Opcode, DL, VT);
@@ -4652,7 +4789,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
   return getNode(Opcode, DL, VT, &NewOps[0], NumOps);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, EVT VT,
                               const SDValue *Ops, unsigned NumOps) {
   switch (NumOps) {
   case 0: return getNode(Opcode, DL, VT);
@@ -4694,10 +4831,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
       return SDValue(E, 0);
 
-    N = new (NodeAllocator) SDNode(Opcode, DL, VTs, Ops, NumOps);
+    N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                   VTs, Ops, NumOps);
     CSEMap.InsertNode(N, IP);
   } else {
-    N = new (NodeAllocator) SDNode(Opcode, DL, VTs, Ops, NumOps);
+    N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                   VTs, Ops, NumOps);
   }
 
   AllNodes.push_back(N);
@@ -4707,14 +4846,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
                               ArrayRef<EVT> ResultTys,
                               const SDValue *Ops, unsigned NumOps) {
   return getNode(Opcode, DL, getVTList(&ResultTys[0], ResultTys.size()),
                  Ops, NumOps);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL,
                               const EVT *VTs, unsigned NumVTs,
                               const SDValue *Ops, unsigned NumOps) {
   if (NumVTs == 1)
@@ -4722,7 +4861,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
   return getNode(Opcode, DL, makeVTList(VTs, NumVTs), Ops, NumOps);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
                               const SDValue *Ops, unsigned NumOps) {
   if (VTList.NumVTs == 1)
     return getNode(Opcode, DL, VTList.VTs[0], Ops, NumOps);
@@ -4760,26 +4899,36 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
       return SDValue(E, 0);
 
     if (NumOps == 1) {
-      N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTList, Ops[0]);
     } else if (NumOps == 2) {
-      N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                           DL.getDebugLoc(), VTList, Ops[0],
+                                           Ops[1]);
     } else if (NumOps == 3) {
-      N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1],
-                                            Ops[2]);
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                            DL.getDebugLoc(), VTList, Ops[0],
+                                            Ops[1], Ops[2]);
     } else {
-      N = new (NodeAllocator) SDNode(Opcode, DL, VTList, Ops, NumOps);
+      N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                     VTList, Ops, NumOps);
     }
     CSEMap.InsertNode(N, IP);
   } else {
     if (NumOps == 1) {
-      N = new (NodeAllocator) UnarySDNode(Opcode, DL, VTList, Ops[0]);
+      N = new (NodeAllocator) UnarySDNode(Opcode, DL.getIROrder(),
+                                          DL.getDebugLoc(), VTList, Ops[0]);
     } else if (NumOps == 2) {
-      N = new (NodeAllocator) BinarySDNode(Opcode, DL, VTList, Ops[0], Ops[1]);
+      N = new (NodeAllocator) BinarySDNode(Opcode, DL.getIROrder(),
+                                           DL.getDebugLoc(), VTList, Ops[0],
+                                           Ops[1]);
     } else if (NumOps == 3) {
-      N = new (NodeAllocator) TernarySDNode(Opcode, DL, VTList, Ops[0], Ops[1],
-                                            Ops[2]);
+      N = new (NodeAllocator) TernarySDNode(Opcode, DL.getIROrder(),
+                                            DL.getDebugLoc(), VTList, Ops[0],
+                                            Ops[1], Ops[2]);
     } else {
-      N = new (NodeAllocator) SDNode(Opcode, DL, VTList, Ops, NumOps);
+      N = new (NodeAllocator) SDNode(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+                                     VTList, Ops, NumOps);
     }
   }
   AllNodes.push_back(N);
@@ -4789,36 +4938,36 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
   return SDValue(N, 0);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList) {
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList) {
   return getNode(Opcode, DL, VTList, 0, 0);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
                               SDValue N1) {
   SDValue Ops[] = { N1 };
   return getNode(Opcode, DL, VTList, Ops, 1);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
                               SDValue N1, SDValue N2) {
   SDValue Ops[] = { N1, N2 };
   return getNode(Opcode, DL, VTList, Ops, 2);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
                               SDValue N1, SDValue N2, SDValue N3) {
   SDValue Ops[] = { N1, N2, N3 };
   return getNode(Opcode, DL, VTList, Ops, 3);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
                               SDValue N1, SDValue N2, SDValue N3,
                               SDValue N4) {
   SDValue Ops[] = { N1, N2, N3, N4 };
   return getNode(Opcode, DL, VTList, Ops, 4);
 }
 
-SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, SDVTList VTList,
+SDValue SelectionDAG::getNode(unsigned Opcode, SDLoc DL, SDVTList VTList,
                               SDValue N1, SDValue N2, SDValue N3,
                               SDValue N4, SDValue N5) {
   SDValue Ops[] = { N1, N2, N3, N4, N5 };
@@ -4830,76 +4979,81 @@ SDVTList SelectionDAG::getVTList(EVT VT) {
 }
 
 SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I)
-    if (I->NumVTs == 2 && I->VTs[0] == VT1 && I->VTs[1] == VT2)
-      return *I;
-
-  EVT *Array = Allocator.Allocate<EVT>(2);
-  Array[0] = VT1;
-  Array[1] = VT2;
-  SDVTList Result = makeVTList(Array, 2);
-  VTList.push_back(Result);
-  return Result;
+  FoldingSetNodeID ID;
+  ID.AddInteger(2U);
+  ID.AddInteger(VT1.getRawBits());
+  ID.AddInteger(VT2.getRawBits());
+
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(2);
+    Array[0] = VT1;
+    Array[1] = VT2;
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2);
+    VTListMap.InsertNode(Result, IP);
+  }
+  return Result->getSDVTList();
 }
 
 SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I)
-    if (I->NumVTs == 3 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
-                          I->VTs[2] == VT3)
-      return *I;
-
-  EVT *Array = Allocator.Allocate<EVT>(3);
-  Array[0] = VT1;
-  Array[1] = VT2;
-  Array[2] = VT3;
-  SDVTList Result = makeVTList(Array, 3);
-  VTList.push_back(Result);
-  return Result;
+  FoldingSetNodeID ID;
+  ID.AddInteger(3U);
+  ID.AddInteger(VT1.getRawBits());
+  ID.AddInteger(VT2.getRawBits());
+  ID.AddInteger(VT3.getRawBits());
+
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(3);
+    Array[0] = VT1;
+    Array[1] = VT2;
+    Array[2] = VT3;
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3);
+    VTListMap.InsertNode(Result, IP);
+  }
+  return Result->getSDVTList();
 }
 
 SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I)
-    if (I->NumVTs == 4 && I->VTs[0] == VT1 && I->VTs[1] == VT2 &&
-                          I->VTs[2] == VT3 && I->VTs[3] == VT4)
-      return *I;
-
-  EVT *Array = Allocator.Allocate<EVT>(4);
-  Array[0] = VT1;
-  Array[1] = VT2;
-  Array[2] = VT3;
-  Array[3] = VT4;
-  SDVTList Result = makeVTList(Array, 4);
-  VTList.push_back(Result);
-  return Result;
+  FoldingSetNodeID ID;
+  ID.AddInteger(4U);
+  ID.AddInteger(VT1.getRawBits());
+  ID.AddInteger(VT2.getRawBits());
+  ID.AddInteger(VT3.getRawBits());
+  ID.AddInteger(VT4.getRawBits());
+
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(4);
+    Array[0] = VT1;
+    Array[1] = VT2;
+    Array[2] = VT3;
+    Array[3] = VT4;
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4);
+    VTListMap.InsertNode(Result, IP);
+  }
+  return Result->getSDVTList();
 }
 
 SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) {
-  switch (NumVTs) {
-    case 0: llvm_unreachable("Cannot have nodes without results!");
-    case 1: return getVTList(VTs[0]);
-    case 2: return getVTList(VTs[0], VTs[1]);
-    case 3: return getVTList(VTs[0], VTs[1], VTs[2]);
-    case 4: return getVTList(VTs[0], VTs[1], VTs[2], VTs[3]);
-    default: break;
+  FoldingSetNodeID ID;
+  ID.AddInteger(NumVTs);
+  for (unsigned index = 0; index < NumVTs; index++) {
+    ID.AddInteger(VTs[index].getRawBits());
   }
 
-  for (std::vector<SDVTList>::reverse_iterator I = VTList.rbegin(),
-       E = VTList.rend(); I != E; ++I) {
-    if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1])
-      continue;
-
-    if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2]))
-      return *I;
+  void *IP = 0;
+  SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+  if (Result == NULL) {
+    EVT *Array = Allocator.Allocate<EVT>(NumVTs);
+    std::copy(VTs, VTs + NumVTs, Array);
+    Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs);
+    VTListMap.InsertNode(Result, IP);
   }
-
-  EVT *Array = Allocator.Allocate<EVT>(NumVTs);
-  std::copy(VTs, VTs+NumVTs, Array);
-  SDVTList Result = makeVTList(Array, NumVTs);
-  VTList.push_back(Result);
-  return Result;
+  return Result->getSDVTList();
 }
 
 
@@ -5138,17 +5292,21 @@ SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
   return N;
 }
 
-/// UpdadeDebugLocOnMergedSDNode - If the opt level is -O0 then it throws away
+/// UpdadeSDLocOnMergedSDNode - If the opt level is -O0 then it throws away
 /// the line number information on the merged node since it is not possible to
 /// preserve the information that operation is associated with multiple lines.
 /// This will make the debugger working better at -O0, were there is a higher
 /// probability having other instructions associated with that line.
 ///
-SDNode *SelectionDAG::UpdadeDebugLocOnMergedSDNode(SDNode *N, DebugLoc OLoc) {
+/// For IROrder, we keep the smaller of the two
+SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, SDLoc OLoc) {
   DebugLoc NLoc = N->getDebugLoc();
-  if (!(NLoc.isUnknown()) && (OptLevel == CodeGenOpt::None) && (OLoc != NLoc)) {
+  if (!(NLoc.isUnknown()) && (OptLevel == CodeGenOpt::None) &&
+    (OLoc.getDebugLoc() != NLoc)) {
     N->setDebugLoc(DebugLoc());
   }
+  unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder());
+  N->setIROrder(Order);
   return N;
 }
 
@@ -5157,7 +5315,7 @@ SDNode *SelectionDAG::UpdadeDebugLocOnMergedSDNode(SDNode *N, DebugLoc OLoc) {
 ///
 /// Note that MorphNodeTo returns the resultant node.  If there is already a
 /// node of the specified opcode and operands, it returns that node instead of
-/// the current one.  Note that the DebugLoc need not be the same.
+/// the current one.  Note that the SDLoc need not be the same.
 ///
 /// Using MorphNodeTo is faster than creating a new node and swapping it in
 /// with ReplaceAllUsesWith both because it often avoids allocating a new
@@ -5173,7 +5331,7 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
     FoldingSetNodeID ID;
     AddNodeIDNode(ID, Opc, VTs, Ops, NumOps);
     if (SDNode *ON = CSEMap.FindNodeOrInsertPos(ID, IP))
-      return UpdadeDebugLocOnMergedSDNode(ON, N->getDebugLoc());
+      return UpdadeSDLocOnMergedSDNode(ON, SDLoc(N));
   }
 
   if (!RemoveNodeFromCSEMaps(N))
@@ -5250,20 +5408,20 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
 /// node of the specified opcode and operands, it returns that node instead of
 /// the current one.
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT) {
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT) {
   SDVTList VTs = getVTList(VT);
   return getMachineNode(Opcode, dl, VTs, None);
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT, SDValue Op1) {
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT, SDValue Op1) {
   SDVTList VTs = getVTList(VT);
   SDValue Ops[] = { Op1 };
   return getMachineNode(Opcode, dl, VTs, Ops);
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT,
                              SDValue Op1, SDValue Op2) {
   SDVTList VTs = getVTList(VT);
   SDValue Ops[] = { Op1, Op2 };
@@ -5271,7 +5429,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT,
                              SDValue Op1, SDValue Op2, SDValue Op3) {
   SDVTList VTs = getVTList(VT);
   SDValue Ops[] = { Op1, Op2, Op3 };
@@ -5279,20 +5437,20 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT,
                              ArrayRef<SDValue> Ops) {
   SDVTList VTs = getVTList(VT);
   return getMachineNode(Opcode, dl, VTs, Ops);
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1, EVT VT2) {
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1, EVT VT2) {
   SDVTList VTs = getVTList(VT1, VT2);
   return getMachineNode(Opcode, dl, VTs, None);
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              EVT VT1, EVT VT2, SDValue Op1) {
   SDVTList VTs = getVTList(VT1, VT2);
   SDValue Ops[] = { Op1 };
@@ -5300,7 +5458,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              EVT VT1, EVT VT2, SDValue Op1, SDValue Op2) {
   SDVTList VTs = getVTList(VT1, VT2);
   SDValue Ops[] = { Op1, Op2 };
@@ -5308,7 +5466,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              EVT VT1, EVT VT2, SDValue Op1,
                              SDValue Op2, SDValue Op3) {
   SDVTList VTs = getVTList(VT1, VT2);
@@ -5317,7 +5475,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              EVT VT1, EVT VT2,
                              ArrayRef<SDValue> Ops) {
   SDVTList VTs = getVTList(VT1, VT2);
@@ -5325,7 +5483,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              EVT VT1, EVT VT2, EVT VT3,
                              SDValue Op1, SDValue Op2) {
   SDVTList VTs = getVTList(VT1, VT2, VT3);
@@ -5334,7 +5492,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              EVT VT1, EVT VT2, EVT VT3,
                              SDValue Op1, SDValue Op2, SDValue Op3) {
   SDVTList VTs = getVTList(VT1, VT2, VT3);
@@ -5343,7 +5501,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              EVT VT1, EVT VT2, EVT VT3,
                              ArrayRef<SDValue> Ops) {
   SDVTList VTs = getVTList(VT1, VT2, VT3);
@@ -5351,7 +5509,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl, EVT VT1,
                              EVT VT2, EVT VT3, EVT VT4,
                              ArrayRef<SDValue> Ops) {
   SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
@@ -5359,7 +5517,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc dl,
                              ArrayRef<EVT> ResultTys,
                              ArrayRef<SDValue> Ops) {
   SDVTList VTs = getVTList(&ResultTys[0], ResultTys.size());
@@ -5367,7 +5525,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl,
 }
 
 MachineSDNode *
-SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
+SelectionDAG::getMachineNode(unsigned Opcode, SDLoc DL, SDVTList VTs,
                              ArrayRef<SDValue> OpsArray) {
   bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
   MachineSDNode *N;
@@ -5380,12 +5538,13 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
     AddNodeIDNode(ID, ~Opcode, VTs, Ops, NumOps);
     IP = 0;
     if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
-      return cast<MachineSDNode>(UpdadeDebugLocOnMergedSDNode(E, DL));
+      return cast<MachineSDNode>(UpdadeSDLocOnMergedSDNode(E, DL));
     }
   }
 
   // Allocate a new MachineSDNode.
-  N = new (NodeAllocator) MachineSDNode(~Opcode, DL, VTs);
+  N = new (NodeAllocator) MachineSDNode(~Opcode, DL.getIROrder(),
+                                        DL.getDebugLoc(), VTs);
 
   // Initialize the operands list.
   if (NumOps > array_lengthof(N->LocalOperands))
@@ -5411,7 +5570,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc DL, SDVTList VTs,
 /// getTargetExtractSubreg - A convenience function for creating
 /// TargetOpcode::EXTRACT_SUBREG nodes.
 SDValue
-SelectionDAG::getTargetExtractSubreg(int SRIdx, DebugLoc DL, EVT VT,
+SelectionDAG::getTargetExtractSubreg(int SRIdx, SDLoc DL, EVT VT,
                                      SDValue Operand) {
   SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32);
   SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
@@ -5422,7 +5581,7 @@ SelectionDAG::getTargetExtractSubreg(int SRIdx, DebugLoc DL, EVT VT,
 /// getTargetInsertSubreg - A convenience function for creating
 /// TargetOpcode::INSERT_SUBREG nodes.
 SDValue
-SelectionDAG::getTargetInsertSubreg(int SRIdx, DebugLoc DL, EVT VT,
+SelectionDAG::getTargetInsertSubreg(int SRIdx, SDLoc DL, EVT VT,
                                     SDValue Operand, SDValue Subreg) {
   SDValue SRIdxVal = getTargetConstant(SRIdx, MVT::i32);
   SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
@@ -5845,18 +6004,6 @@ unsigned SelectionDAG::AssignTopologicalOrder() {
   return DAGSize;
 }
 
-/// AssignOrdering - Assign an order to the SDNode.
-void SelectionDAG::AssignOrdering(const SDNode *SD, unsigned Order) {
-  assert(SD && "Trying to assign an order to a null node!");
-  Ordering->add(SD, Order);
-}
-
-/// GetOrdering - Get the order for the SDNode.
-unsigned SelectionDAG::GetOrdering(const SDNode *SD) const {
-  assert(SD && "Trying to get the order of a null node!");
-  return Ordering->getOrder(SD);
-}
-
 /// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
 /// value is produced by SD.
 void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
@@ -5883,7 +6030,7 @@ void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
       ClonedDVs.push_back(Clone);
     }
   }
-  for (SmallVector<SDDbgValue *, 2>::iterator I = ClonedDVs.begin(),
+  for (SmallVectorImpl<SDDbgValue *>::iterator I = ClonedDVs.begin(),
          E = ClonedDVs.end(); I != E; ++I)
     AddDbgValue(*I, ToNode, false);
 }
@@ -5896,16 +6043,22 @@ HandleSDNode::~HandleSDNode() {
   DropOperands();
 }
 
-GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, DebugLoc DL,
-                                         const GlobalValue *GA,
+GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
+                                         DebugLoc DL, const GlobalValue *GA,
                                          EVT VT, int64_t o, unsigned char TF)
-  : SDNode(Opc, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
+  : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
   TheGlobal = GA;
 }
 
-MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs, EVT memvt,
-                     MachineMemOperand *mmo)
- : SDNode(Opc, dl, VTs), MemoryVT(memvt), MMO(mmo) {
+AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, DebugLoc dl, EVT VT,
+                                         SDValue X, unsigned SrcAS,
+                                         unsigned DestAS)
+ : UnarySDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT), X),
+   SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
+
+MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
+                     EVT memvt, MachineMemOperand *mmo)
+ : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
   SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
                                       MMO->isNonTemporal(), MMO->isInvariant());
   assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
@@ -5914,10 +6067,10 @@ MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs, EVT memvt,
   assert(memvt.getStoreSize() == MMO->getSize() && "Size mismatch!");
 }
 
-MemSDNode::MemSDNode(unsigned Opc, DebugLoc dl, SDVTList VTs,
+MemSDNode::MemSDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs,
                      const SDValue *Ops, unsigned NumOps, EVT memvt,
                      MachineMemOperand *mmo)
-   : SDNode(Opc, dl, VTs, Ops, NumOps),
+   : SDNode(Opc, Order, dl, VTs, Ops, NumOps),
      MemoryVT(memvt), MMO(mmo) {
   SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
                                       MMO->isNonTemporal(), MMO->isInvariant());
@@ -6064,9 +6217,10 @@ bool SDNode::hasPredecessor(const SDNode *N) const {
   return hasPredecessorHelper(N, Visited, Worklist);
 }
 
-bool SDNode::hasPredecessorHelper(const SDNode *N,
-                                  SmallPtrSet<const SDNode *, 32> &Visited,
-                                  SmallVector<const SDNode *, 16> &Worklist) const {
+bool
+SDNode::hasPredecessorHelper(const SDNode *N,
+                             SmallPtrSet<const SDNode *, 32> &Visited,
+                             SmallVectorImpl<const SDNode *> &Worklist) const {
   if (Visited.empty()) {
     Worklist.push_back(this);
   } else {
@@ -6103,7 +6257,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
   EVT VT = N->getValueType(0);
   unsigned NE = VT.getVectorNumElements();
   EVT EltVT = VT.getVectorElementType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SmallVector<SDValue, 8> Scalars;
   SmallVector<SDValue, 4> Operands(N->getNumOperands());
@@ -6121,11 +6275,12 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
       EVT OperandVT = Operand.getValueType();
       if (OperandVT.isVector()) {
         // A vector operand; extract a single element.
+        const TargetLowering *TLI = TM.getTargetLowering();
         EVT OperandEltVT = OperandVT.getVectorElementType();
         Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               OperandEltVT,
                               Operand,
-                              getConstant(i, TLI.getPointerTy()));
+                              getConstant(i, TLI->getVectorIdxTy()));
       } else {
         // A scalar operand; just use it as is.
         Operands[j] = Operand;
@@ -6147,8 +6302,8 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
     case ISD::ROTL:
     case ISD::ROTR:
       Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
-                                getShiftAmountOperand(Operands[0].getValueType(),
-                                                      Operands[1])));
+                               getShiftAmountOperand(Operands[0].getValueType(),
+                                                     Operands[1])));
       break;
     case ISD::SIGN_EXTEND_INREG:
     case ISD::FP_ROUND_INREG: {
@@ -6203,8 +6358,9 @@ bool SelectionDAG::isConsecutiveLoad(LoadSDNode *LD, LoadSDNode *Base,
   const GlobalValue *GV2 = NULL;
   int64_t Offset1 = 0;
   int64_t Offset2 = 0;
-  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
-  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  const TargetLowering *TLI = TM.getTargetLowering();
+  bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
   if (isGA1 && isGA2 && GV1 == GV2)
     return Offset1 == (Offset2 + Dist*Bytes);
   return false;
@@ -6217,11 +6373,12 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
   // If this is a GlobalAddress + cst, return the alignment.
   const GlobalValue *GV;
   int64_t GVOffset = 0;
-  if (TLI.isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
-    unsigned PtrWidth = TLI.getPointerTy().getSizeInBits();
+  const TargetLowering *TLI = TM.getTargetLowering();
+  if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
+    unsigned PtrWidth = TLI->getPointerTypeSizeInBits(GV->getType());
     APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
     llvm::ComputeMaskedBits(const_cast<GlobalValue*>(GV), KnownZero, KnownOne,
-                            TLI.getDataLayout());
+                            TLI->getDataLayout());
     unsigned AlignBits = KnownZero.countTrailingOnes();
     unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
     if (Align)
@@ -6251,6 +6408,38 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
   return 0;
 }
 
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split (or expanded) into two not necessarily identical pieces.
+std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
+  // Currently all types are split in half.
+  EVT LoVT, HiVT;
+  if (!VT.isVector()) {
+    LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
+  } else {
+    unsigned NumElements = VT.getVectorNumElements();
+    assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+    LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(),
+                                   NumElements/2);
+  }
+  return std::make_pair(LoVT, HiVT);
+}
+
+/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
+/// low/high part.
+std::pair<SDValue, SDValue>
+SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
+                          const EVT &HiVT) {
+  assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
+         N.getValueType().getVectorNumElements() &&
+         "More vector elements requested than available!");
+  SDValue Lo, Hi;
+  Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
+               getConstant(0, TLI->getVectorIdxTy()));
+  Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
+               getConstant(LoVT.getVectorNumElements(), TLI->getVectorIdxTy()));
+  return std::make_pair(Lo, Hi);
+}
+
 // getAddressSpace - Return the address space this GlobalAddress belongs to.
 unsigned GlobalAddressSDNode::getAddressSpace() const {
   return getGlobal()->getType()->getAddressSpace();
@@ -6372,7 +6561,7 @@ static void checkForCyclesHelper(const SDNode *N,
 
 void llvm::checkForCycles(const llvm::SDNode *N) {
 #ifdef XDEBUG
-  assert(N && "Checking nonexistant SDNode");
+  assert(N && "Checking nonexistent SDNode");
   SmallPtrSet<const SDNode*, 32> visited;
   SmallPtrSet<const SDNode*, 32> checked;
   checkForCyclesHelper(N, visited, checked);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 67db211..2b2713d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -15,6 +15,7 @@
 #include "SelectionDAGBuilder.h"
 #include "SDNodeDbgValue.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
@@ -32,6 +33,7 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
@@ -48,7 +50,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/IntegersSubsetMapping.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -57,6 +58,7 @@
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSelectionDAGInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -87,7 +89,7 @@ LimitFPPrecision("limit-float-precision",
 // store [4096 x i8] %data, [4096 x i8]* %buffer
 static const unsigned MaxParallelChains = 64;
 
-static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL,
                                       const SDValue *Parts, unsigned NumParts,
                                       MVT PartVT, EVT ValueVT, const Value *V);
 
@@ -96,7 +98,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
 /// larger then ValueVT then AssertOp can be used to specify whether the extra
 /// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
 /// (ISD::AssertSext).
-static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
+static SDValue getCopyFromParts(SelectionDAG &DAG, SDLoc DL,
                                 const SDValue *Parts,
                                 unsigned NumParts, MVT PartVT, EVT ValueVT,
                                 const Value *V,
@@ -217,7 +219,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
 /// type larger then ValueVT then AssertOp can be used to specify whether the
 /// extra bits are known to be zero (ISD::AssertZext) or sign extended from
 /// ValueVT (ISD::AssertSext).
-static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, SDLoc DL,
                                       const SDValue *Parts, unsigned NumParts,
                                       MVT PartVT, EVT ValueVT, const Value *V) {
   assert(ValueVT.isVector() && "Not a vector value");
@@ -280,7 +282,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
       assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
              "Cannot narrow, it would be a lossy transformation");
       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
-                         DAG.getIntPtrConstant(0));
+                         DAG.getConstant(0, TLI.getVectorIdxTy()));
     }
 
     // Vector/Vector bitcast.
@@ -327,14 +329,14 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
   return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
 }
 
-static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc dl,
+static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc dl,
                                  SDValue Val, SDValue *Parts, unsigned NumParts,
                                  MVT PartVT, const Value *V);
 
 /// getCopyToParts - Create a series of nodes that contain the specified value
 /// split into legal parts.  If the parts contain more bits than Val, then, for
 /// integers, ExtendKind can be used to specify how to generate the extra bits.
-static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL,
+static void getCopyToParts(SelectionDAG &DAG, SDLoc DL,
                            SDValue Val, SDValue *Parts, unsigned NumParts,
                            MVT PartVT, const Value *V,
                            ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
@@ -466,7 +468,7 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL,
 
 /// getCopyToPartsVector - Create a series of nodes that contain the specified
 /// value split into legal parts.
-static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
+static void getCopyToPartsVector(SelectionDAG &DAG, SDLoc DL,
                                  SDValue Val, SDValue *Parts, unsigned NumParts,
                                  MVT PartVT, const Value *V) {
   EVT ValueVT = Val.getValueType();
@@ -489,7 +491,8 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
       SmallVector<SDValue, 16> Ops;
       for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i)
         Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                                  ElementVT, Val, DAG.getIntPtrConstant(i)));
+                                  ElementVT, Val, DAG.getConstant(i,
+                                                  TLI.getVectorIdxTy())));
 
       for (unsigned i = ValueVT.getVectorNumElements(),
            e = PartVT.getVectorNumElements(); i != e; ++i)
@@ -515,7 +518,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
       assert(ValueVT.getVectorNumElements() == 1 &&
              "Only trivial vector-to-scalar conversions should get here!");
       Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                        PartVT, Val, DAG.getIntPtrConstant(0));
+                        PartVT, Val, DAG.getConstant(0, TLI.getVectorIdxTy()));
 
       bool Smaller = ValueVT.bitsLE(PartVT);
       Val = DAG.getNode((Smaller ? ISD::TRUNCATE : ISD::ANY_EXTEND),
@@ -545,10 +548,12 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
     if (IntermediateVT.isVector())
       Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
                            IntermediateVT, Val,
-                   DAG.getIntPtrConstant(i * (NumElements / NumIntermediates)));
+                   DAG.getConstant(i * (NumElements / NumIntermediates),
+                                   TLI.getVectorIdxTy()));
     else
       Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                           IntermediateVT, Val, DAG.getIntPtrConstant(i));
+                           IntermediateVT, Val,
+                           DAG.getConstant(i, TLI.getVectorIdxTy()));
   }
 
   // Split the intermediate operands into legal parts.
@@ -644,7 +649,7 @@ namespace {
     /// Chain/Flag as the input and updates them for the output Chain/Flag.
     /// If the Flag pointer is NULL, no flag is used.
     SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
-                            DebugLoc dl,
+                            SDLoc dl,
                             SDValue &Chain, SDValue *Flag,
                             const Value *V = 0) const;
 
@@ -652,7 +657,7 @@ namespace {
     /// specified value into the registers specified by this object.  This uses
     /// Chain/Flag as the input and updates them for the output Chain/Flag.
     /// If the Flag pointer is NULL, no flag is used.
-    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+    void getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl,
                        SDValue &Chain, SDValue *Flag, const Value *V) const;
 
     /// AddInlineAsmOperands - Add this value to the specified inlineasm node
@@ -671,7 +676,7 @@ namespace {
 /// If the Flag pointer is NULL, no flag is used.
 SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
                                       FunctionLoweringInfo &FuncInfo,
-                                      DebugLoc dl,
+                                      SDLoc dl,
                                       SDValue &Chain, SDValue *Flag,
                                       const Value *V) const {
   // A Value with type {} or [0 x %t] needs no registers.
@@ -717,6 +722,14 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
       unsigned NumSignBits = LOI->NumSignBits;
       unsigned NumZeroBits = LOI->KnownZero.countLeadingOnes();
 
+      if (NumZeroBits == RegSize) {
+        // The current value is a zero.
+        // Explicitly express that as it would be easier for
+        // optimizations to kick in.
+        Parts[i] = DAG.getConstant(0, RegisterVT);
+        continue;
+      }
+
       // FIXME: We capture more information than the dag can represent.  For
       // now, just use the tightest assertzext/assertsext possible.
       bool isSExt = true;
@@ -761,7 +774,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
 /// specified value into the registers specified by this object.  This uses
 /// Chain/Flag as the input and updates them for the output Chain/Flag.
 /// If the Flag pointer is NULL, no flag is used.
-void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, SDLoc dl,
                                  SDValue &Chain, SDValue *Flag,
                                  const Value *V) const {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -869,7 +882,7 @@ void SelectionDAGBuilder::clear() {
   UnusedArgNodeMap.clear();
   PendingLoads.clear();
   PendingExports.clear();
-  CurDebugLoc = DebugLoc();
+  CurInst = NULL;
   HasTailCall = false;
 }
 
@@ -900,7 +913,7 @@ SDValue SelectionDAGBuilder::getRoot() {
   }
 
   // Otherwise, we have to make a token factor node.
-  SDValue Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+  SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
                                &PendingLoads[0], PendingLoads.size());
   PendingLoads.clear();
   DAG.setRoot(Root);
@@ -930,7 +943,7 @@ SDValue SelectionDAGBuilder::getControlRoot() {
       PendingExports.push_back(Root);
   }
 
-  Root = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+  Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
                      &PendingExports[0],
                      PendingExports.size());
   PendingExports.clear();
@@ -938,27 +951,21 @@ SDValue SelectionDAGBuilder::getControlRoot() {
   return Root;
 }
 
-void SelectionDAGBuilder::AssignOrderingToNode(const SDNode *Node) {
-  if (DAG.GetOrdering(Node) != 0) return; // Already has ordering.
-  DAG.AssignOrdering(Node, SDNodeOrder);
-
-  for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I)
-    AssignOrderingToNode(Node->getOperand(I).getNode());
-}
-
 void SelectionDAGBuilder::visit(const Instruction &I) {
   // Set up outgoing PHI node register values before emitting the terminator.
   if (isa<TerminatorInst>(&I))
     HandlePHINodesInSuccessorBlocks(I.getParent());
 
-  CurDebugLoc = I.getDebugLoc();
+  ++SDNodeOrder;
+
+  CurInst = &I;
 
   visit(I.getOpcode(), I);
 
   if (!isa<TerminatorInst>(&I) && !HasTailCall)
     CopyToExportRegsIfNeeded(&I);
 
-  CurDebugLoc = DebugLoc();
+  CurInst = NULL;
 }
 
 void SelectionDAGBuilder::visitPHI(const PHINode &) {
@@ -975,12 +982,6 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
     case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
 #include "llvm/IR/Instruction.def"
   }
-
-  // Assign the ordering to the freshly created DAG nodes.
-  if (NodeMap.count(&I)) {
-    ++SDNodeOrder;
-    AssignOrderingToNode(getValue(&I).getNode());
-  }
 }
 
 // resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
@@ -1002,7 +1003,7 @@ void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
         DAG.AddDbgValue(SDV, Val.getNode(), false);
       }
     } else
-      DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+      DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
     DanglingDebugInfoMap[V] = DanglingDebugInfo();
   }
 }
@@ -1020,9 +1021,10 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) {
   DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
   if (It != FuncInfo.ValueMap.end()) {
     unsigned InReg = It->second;
-    RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType());
+    RegsForValue RFV(*DAG.getContext(), *TM.getTargetLowering(),
+                     InReg, V->getType());
     SDValue Chain = DAG.getEntryNode();
-    N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL, V);
+    N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, NULL, V);
     resolveDanglingDebugInfo(V, N);
     return N;
   }
@@ -1051,17 +1053,21 @@ SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
 /// getValueImpl - Helper function for getValue and getNonRegisterValue.
 /// Create an SDValue for the given value.
 SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
+  const TargetLowering *TLI = TM.getTargetLowering();
+
   if (const Constant *C = dyn_cast<Constant>(V)) {
-    EVT VT = TLI.getValueType(V->getType(), true);
+    EVT VT = TLI->getValueType(V->getType(), true);
 
     if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
       return DAG.getConstant(*CI, VT);
 
     if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
-      return DAG.getGlobalAddress(GV, getCurDebugLoc(), VT);
+      return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
 
-    if (isa<ConstantPointerNull>(C))
-      return DAG.getConstant(0, TLI.getPointerTy());
+    if (isa<ConstantPointerNull>(C)) {
+      unsigned AS = V->getType()->getPointerAddressSpace();
+      return DAG.getConstant(0, TLI->getPointerTy(AS));
+    }
 
     if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
       return DAG.getConstantFP(*CFP, VT);
@@ -1090,9 +1096,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
       }
 
       return DAG.getMergeValues(&Constants[0], Constants.size(),
-                                getCurDebugLoc());
+                                getCurSDLoc());
     }
-    
+
     if (const ConstantDataSequential *CDS =
           dyn_cast<ConstantDataSequential>(C)) {
       SmallVector<SDValue, 4> Ops;
@@ -1105,8 +1111,8 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
       }
 
       if (isa<ArrayType>(CDS->getType()))
-        return DAG.getMergeValues(&Ops[0], Ops.size(), getCurDebugLoc());
-      return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+        return DAG.getMergeValues(&Ops[0], Ops.size(), getCurSDLoc());
+      return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(),
                                       VT, &Ops[0], Ops.size());
     }
 
@@ -1115,7 +1121,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
              "Unknown struct or array constant!");
 
       SmallVector<EVT, 4> ValueVTs;
-      ComputeValueVTs(TLI, C->getType(), ValueVTs);
+      ComputeValueVTs(*TLI, C->getType(), ValueVTs);
       unsigned NumElts = ValueVTs.size();
       if (NumElts == 0)
         return SDValue(); // empty struct
@@ -1131,7 +1137,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
       }
 
       return DAG.getMergeValues(&Constants[0], NumElts,
-                                getCurDebugLoc());
+                                getCurSDLoc());
     }
 
     if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
@@ -1148,7 +1154,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
         Ops.push_back(getValue(CV->getOperand(i)));
     } else {
       assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
-      EVT EltVT = TLI.getValueType(VecTy->getElementType());
+      EVT EltVT = TLI->getValueType(VecTy->getElementType());
 
       SDValue Op;
       if (EltVT.isFloatingPoint())
@@ -1159,7 +1165,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
     }
 
     // Create a BUILD_VECTOR node.
-    return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+    return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(),
                                     VT, &Ops[0], Ops.size());
   }
 
@@ -1169,21 +1175,22 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
     DenseMap<const AllocaInst*, int>::iterator SI =
       FuncInfo.StaticAllocaMap.find(AI);
     if (SI != FuncInfo.StaticAllocaMap.end())
-      return DAG.getFrameIndex(SI->second, TLI.getPointerTy());
+      return DAG.getFrameIndex(SI->second, TLI->getPointerTy());
   }
 
   // If this is an instruction which fast-isel has deferred, select it now.
   if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
     unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
-    RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType());
+    RegsForValue RFV(*DAG.getContext(), *TLI, InReg, Inst->getType());
     SDValue Chain = DAG.getEntryNode();
-    return RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL, V);
+    return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, NULL, V);
   }
 
   llvm_unreachable("Can't get register for value!");
 }
 
 void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
+  const TargetLowering *TLI = TM.getTargetLowering();
   SDValue Chain = getControlRoot();
   SmallVector<ISD::OutputArg, 8> Outs;
   SmallVector<SDValue, 8> OutVals;
@@ -1196,7 +1203,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
     // Leave Outs empty so that LowerReturn won't try to load return
     // registers the usual way.
     SmallVector<EVT, 1> PtrValueVTs;
-    ComputeValueVTs(TLI, PointerType::getUnqual(F->getReturnType()),
+    ComputeValueVTs(*TLI, PointerType::getUnqual(F->getReturnType()),
                     PtrValueVTs);
 
     SDValue RetPtr = DAG.getRegister(DemoteReg, PtrValueVTs[0]);
@@ -1204,26 +1211,26 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
 
     SmallVector<EVT, 4> ValueVTs;
     SmallVector<uint64_t, 4> Offsets;
-    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets);
+    ComputeValueVTs(*TLI, I.getOperand(0)->getType(), ValueVTs, &Offsets);
     unsigned NumValues = ValueVTs.size();
 
     SmallVector<SDValue, 4> Chains(NumValues);
     for (unsigned i = 0; i != NumValues; ++i) {
-      SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+      SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(),
                                 RetPtr.getValueType(), RetPtr,
                                 DAG.getIntPtrConstant(Offsets[i]));
       Chains[i] =
-        DAG.getStore(Chain, getCurDebugLoc(),
+        DAG.getStore(Chain, getCurSDLoc(),
                      SDValue(RetOp.getNode(), RetOp.getResNo() + i),
                      // FIXME: better loc info would be nice.
                      Add, MachinePointerInfo(), false, false, 0);
     }
 
-    Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+    Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
                         MVT::Other, &Chains[0], NumValues);
   } else if (I.getNumOperands() != 0) {
     SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(TLI, I.getOperand(0)->getType(), ValueVTs);
+    ComputeValueVTs(*TLI, I.getOperand(0)->getType(), ValueVTs);
     unsigned NumValues = ValueVTs.size();
     if (NumValues) {
       SDValue RetOp = getValue(I.getOperand(0));
@@ -1241,12 +1248,12 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
           ExtendKind = ISD::ZERO_EXTEND;
 
         if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
-          VT = TLI.getTypeForExtArgOrReturn(VT.getSimpleVT(), ExtendKind);
+          VT = TLI->getTypeForExtArgOrReturn(VT.getSimpleVT(), ExtendKind);
 
-        unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), VT);
-        MVT PartVT = TLI.getRegisterType(*DAG.getContext(), VT);
+        unsigned NumParts = TLI->getNumRegisters(*DAG.getContext(), VT);
+        MVT PartVT = TLI->getRegisterType(*DAG.getContext(), VT);
         SmallVector<SDValue, 4> Parts(NumParts);
-        getCopyToParts(DAG, getCurDebugLoc(),
+        getCopyToParts(DAG, getCurSDLoc(),
                        SDValue(RetOp.getNode(), RetOp.getResNo() + j),
                        &Parts[0], NumParts, PartVT, &I, ExtendKind);
 
@@ -1264,7 +1271,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
 
         for (unsigned i = 0; i < NumParts; ++i) {
           Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
-                                        /*isfixed=*/true, 0, 0));
+                                        VT, /*isfixed=*/true, 0, 0));
           OutVals.push_back(Parts[i]);
         }
       }
@@ -1274,8 +1281,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
   bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
   CallingConv::ID CallConv =
     DAG.getMachineFunction().getFunction()->getCallingConv();
-  Chain = TLI.LowerReturn(Chain, CallConv, isVarArg,
-                          Outs, OutVals, getCurDebugLoc(), DAG);
+  Chain = TM.getTargetLowering()->LowerReturn(Chain, CallConv, isVarArg,
+                                              Outs, OutVals, getCurSDLoc(),
+                                              DAG);
 
   // Verify that the target's LowerReturn behaved as expected.
   assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
@@ -1474,7 +1482,7 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
 /// If we should emit this as a bunch of and/or'd together conditions, return
 /// false.
 bool
-SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases){
+SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases) {
   if (Cases.size() != 2) return true;
 
   // If this is two comparisons of the same values or'd or and'd together, they
@@ -1519,7 +1527,7 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
 
     // If this is not a fall-through branch, emit the branch.
     if (Succ0MBB != NextBlock)
-      DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
                               MVT::Other, getControlRoot(),
                               DAG.getBasicBlock(Succ0MBB)));
 
@@ -1548,7 +1556,7 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
   //     jle foo
   //
   if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
-    if (!TLI.isJumpExpensive() &&
+    if (!TM.getTargetLowering()->isJumpExpensive() &&
         BOp->hasOneUse() &&
         (BOp->getOpcode() == Instruction::And ||
          BOp->getOpcode() == Instruction::Or)) {
@@ -1596,7 +1604,7 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
                                           MachineBasicBlock *SwitchBB) {
   SDValue Cond;
   SDValue CondLHS = getValue(CB.CmpLHS);
-  DebugLoc dl = getCurDebugLoc();
+  SDLoc dl = getCurSDLoc();
 
   // Build the setcc now.
   if (CB.CmpMHS == NULL) {
@@ -1612,18 +1620,17 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
     } else
       Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
   } else {
-    assert(CB.CC == ISD::SETCC_INVALID &&
-           "Condition is undefined for to-the-range belonging check.");
+    assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
 
     const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
     const APInt& High  = cast<ConstantInt>(CB.CmpRHS)->getValue();
 
     SDValue CmpOp = getValue(CB.CmpMHS);
     EVT VT = CmpOp.getValueType();
-    
-    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(false)) {
+
+    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
       Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, VT),
-                          ISD::SETULE);
+                          ISD::SETLE);
     } else {
       SDValue SUB = DAG.getNode(ISD::SUB, dl,
                                 VT, CmpOp, DAG.getConstant(Low, VT));
@@ -1671,11 +1678,11 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
 void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) {
   // Emit the code for the jump table
   assert(JT.Reg != -1U && "Should lower JT Header first!");
-  EVT PTy = TLI.getPointerTy();
-  SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(),
+  EVT PTy = TM.getTargetLowering()->getPointerTy();
+  SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
                                      JT.Reg, PTy);
   SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
-  SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurDebugLoc(),
+  SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurSDLoc(),
                                     MVT::Other, Index.getValue(1),
                                     Table, Index);
   DAG.setRoot(BrJumpTable);
@@ -1691,7 +1698,7 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
   // difference between smallest and largest cases.
   SDValue SwitchOp = getValue(JTH.SValue);
   EVT VT = SwitchOp.getValueType();
-  SDValue Sub = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, SwitchOp,
                             DAG.getConstant(JTH.First, VT));
 
   // The SDNode we just created, which holds the value being switched on minus
@@ -1699,19 +1706,22 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
   // can be used as an index into the jump table in a subsequent basic block.
   // This value may be smaller or larger than the target's pointer type, and
   // therefore require extension or truncating.
-  SwitchOp = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), TLI.getPointerTy());
+  const TargetLowering *TLI = TM.getTargetLowering();
+  SwitchOp = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), TLI->getPointerTy());
 
-  unsigned JumpTableReg = FuncInfo.CreateReg(TLI.getPointerTy());
-  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+  unsigned JumpTableReg = FuncInfo.CreateReg(TLI->getPointerTy());
+  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurSDLoc(),
                                     JumpTableReg, SwitchOp);
   JT.Reg = JumpTableReg;
 
   // Emit the range check for the jump table, and branch to the default block
   // for the switch statement if the value being switched on exceeds the largest
   // case in the switch.
-  SDValue CMP = DAG.getSetCC(getCurDebugLoc(),
-                             TLI.getSetCCResultType(Sub.getValueType()), Sub,
-                             DAG.getConstant(JTH.Last-JTH.First,VT),
+  SDValue CMP = DAG.getSetCC(getCurSDLoc(),
+                             TLI->getSetCCResultType(*DAG.getContext(),
+                                                     Sub.getValueType()),
+                             Sub,
+                             DAG.getConstant(JTH.Last - JTH.First,VT),
                              ISD::SETUGT);
 
   // Set NextBlock to be the MBB immediately after the current one, if any.
@@ -1722,17 +1732,88 @@ void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
   if (++BBI != FuncInfo.MF->end())
     NextBlock = BBI;
 
-  SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+  SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurSDLoc(),
                                MVT::Other, CopyTo, CMP,
                                DAG.getBasicBlock(JT.Default));
 
   if (JT.MBB != NextBlock)
-    BrCond = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrCond,
+    BrCond = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, BrCond,
                          DAG.getBasicBlock(JT.MBB));
 
   DAG.setRoot(BrCond);
 }
 
+/// Codegen a new tail for a stack protector check ParentMBB which has had its
+/// tail spliced into a stack protector check success bb.
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+                                                  MachineBasicBlock *ParentBB) {
+
+  // First create the loads to the guard/stack slot for the comparison.
+  const TargetLowering *TLI = TM.getTargetLowering();
+  EVT PtrTy = TLI->getPointerTy();
+
+  MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo();
+  int FI = MFI->getStackProtectorIndex();
+
+  const Value *IRGuard = SPD.getGuard();
+  SDValue GuardPtr = getValue(IRGuard);
+  SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+
+  unsigned Align =
+    TLI->getDataLayout()->getPrefTypeAlignment(IRGuard->getType());
+  SDValue Guard = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
+                              GuardPtr, MachinePointerInfo(IRGuard, 0),
+                              true, false, false, Align);
+
+  SDValue StackSlot = DAG.getLoad(PtrTy, getCurSDLoc(), DAG.getEntryNode(),
+                                  StackSlotPtr,
+                                  MachinePointerInfo::getFixedStack(FI),
+                                  true, false, false, Align);
+
+  // Perform the comparison via a subtract/getsetcc.
+  EVT VT = Guard.getValueType();
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, Guard, StackSlot);
+
+  SDValue Cmp = DAG.getSetCC(getCurSDLoc(),
+                             TLI->getSetCCResultType(*DAG.getContext(),
+                                                     Sub.getValueType()),
+                             Sub, DAG.getConstant(0, VT),
+                             ISD::SETNE);
+
+  // If the sub is not 0, then we know the guard/stackslot do not equal, so
+  // branch to failure MBB.
+  SDValue BrCond = DAG.getNode(ISD::BRCOND, getCurSDLoc(),
+                               MVT::Other, StackSlot.getOperand(0),
+                               Cmp, DAG.getBasicBlock(SPD.getFailureMBB()));
+  // Otherwise branch to success MBB.
+  SDValue Br = DAG.getNode(ISD::BR, getCurSDLoc(),
+                           MVT::Other, BrCond,
+                           DAG.getBasicBlock(SPD.getSuccessMBB()));
+
+  DAG.setRoot(Br);
+}
+
+/// Codegen the failure basic block for a stack protector check.
+///
+/// A failure stack protector machine basic block consists simply of a call to
+/// __stack_chk_fail().
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void
+SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
+  const TargetLowering *TLI = TM.getTargetLowering();
+  SDValue Chain = TLI->makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL,
+                                   MVT::isVoid, 0, 0, false, getCurSDLoc(),
+                                   false, false).second;
+  DAG.setRoot(Chain);
+}
+
 /// visitBitTestHeader - This function emits necessary code to produce value
 /// suitable for "bit tests"
 void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
@@ -1740,18 +1821,20 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
   // Subtract the minimum value
   SDValue SwitchOp = getValue(B.SValue);
   EVT VT = SwitchOp.getValueType();
-  SDValue Sub = DAG.getNode(ISD::SUB, getCurDebugLoc(), VT, SwitchOp,
+  SDValue Sub = DAG.getNode(ISD::SUB, getCurSDLoc(), VT, SwitchOp,
                             DAG.getConstant(B.First, VT));
 
   // Check range
-  SDValue RangeCmp = DAG.getSetCC(getCurDebugLoc(),
-                                  TLI.getSetCCResultType(Sub.getValueType()),
+  const TargetLowering *TLI = TM.getTargetLowering();
+  SDValue RangeCmp = DAG.getSetCC(getCurSDLoc(),
+                                  TLI->getSetCCResultType(*DAG.getContext(),
+                                                         Sub.getValueType()),
                                   Sub, DAG.getConstant(B.Range, VT),
                                   ISD::SETUGT);
 
   // Determine the type of the test operands.
   bool UsePtrType = false;
-  if (!TLI.isTypeLegal(VT))
+  if (!TLI->isTypeLegal(VT))
     UsePtrType = true;
   else {
     for (unsigned i = 0, e = B.Cases.size(); i != e; ++i)
@@ -1763,13 +1846,13 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
       }
   }
   if (UsePtrType) {
-    VT = TLI.getPointerTy();
-    Sub = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), VT);
+    VT = TLI->getPointerTy();
+    Sub = DAG.getZExtOrTrunc(Sub, getCurSDLoc(), VT);
   }
 
   B.RegVT = VT.getSimpleVT();
   B.Reg = FuncInfo.CreateReg(B.RegVT);
-  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(),
+  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurSDLoc(),
                                     B.Reg, Sub);
 
   // Set NextBlock to be the MBB immediately after the current one, if any.
@@ -1784,12 +1867,12 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
   addSuccessorWithWeight(SwitchBB, B.Default);
   addSuccessorWithWeight(SwitchBB, MBB);
 
-  SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+  SDValue BrRange = DAG.getNode(ISD::BRCOND, getCurSDLoc(),
                                 MVT::Other, CopyTo, RangeCmp,
                                 DAG.getBasicBlock(B.Default));
 
   if (MBB != NextBlock)
-    BrRange = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, CopyTo,
+    BrRange = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, CopyTo,
                           DAG.getBasicBlock(MBB));
 
   DAG.setRoot(BrRange);
@@ -1803,35 +1886,36 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
                                            BitTestCase &B,
                                            MachineBasicBlock *SwitchBB) {
   MVT VT = BB.RegVT;
-  SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurDebugLoc(),
+  SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
                                        Reg, VT);
   SDValue Cmp;
   unsigned PopCount = CountPopulation_64(B.Mask);
+  const TargetLowering *TLI = TM.getTargetLowering();
   if (PopCount == 1) {
     // Testing for a single bit; just compare the shift count with what it
     // would need to be to shift a 1 bit in that position.
-    Cmp = DAG.getSetCC(getCurDebugLoc(),
-                       TLI.getSetCCResultType(VT),
+    Cmp = DAG.getSetCC(getCurSDLoc(),
+                       TLI->getSetCCResultType(*DAG.getContext(), VT),
                        ShiftOp,
-                       DAG.getConstant(CountTrailingZeros_64(B.Mask), VT),
+                       DAG.getConstant(countTrailingZeros(B.Mask), VT),
                        ISD::SETEQ);
   } else if (PopCount == BB.Range) {
     // There is only one zero bit in the range, test for it directly.
-    Cmp = DAG.getSetCC(getCurDebugLoc(),
-                       TLI.getSetCCResultType(VT),
+    Cmp = DAG.getSetCC(getCurSDLoc(),
+                       TLI->getSetCCResultType(*DAG.getContext(), VT),
                        ShiftOp,
                        DAG.getConstant(CountTrailingOnes_64(B.Mask), VT),
                        ISD::SETNE);
   } else {
     // Make desired shift
-    SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurDebugLoc(), VT,
+    SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurSDLoc(), VT,
                                     DAG.getConstant(1, VT), ShiftOp);
 
     // Emit bit tests and jumps
-    SDValue AndOp = DAG.getNode(ISD::AND, getCurDebugLoc(),
+    SDValue AndOp = DAG.getNode(ISD::AND, getCurSDLoc(),
                                 VT, SwitchVal, DAG.getConstant(B.Mask, VT));
-    Cmp = DAG.getSetCC(getCurDebugLoc(),
-                       TLI.getSetCCResultType(VT),
+    Cmp = DAG.getSetCC(getCurSDLoc(),
+                       TLI->getSetCCResultType(*DAG.getContext(), VT),
                        AndOp, DAG.getConstant(0, VT),
                        ISD::SETNE);
   }
@@ -1841,7 +1925,7 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
   // The branch weight from SwitchBB to NextMBB is BranchWeightToNext.
   addSuccessorWithWeight(SwitchBB, NextMBB, BranchWeightToNext);
 
-  SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
+  SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurSDLoc(),
                               MVT::Other, getControlRoot(),
                               Cmp, DAG.getBasicBlock(B.TargetBB));
 
@@ -1853,7 +1937,7 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
     NextBlock = BBI;
 
   if (NextMBB != NextBlock)
-    BrAnd = DAG.getNode(ISD::BR, getCurDebugLoc(), MVT::Other, BrAnd,
+    BrAnd = DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, BrAnd,
                         DAG.getBasicBlock(NextMBB));
 
   DAG.setRoot(BrAnd);
@@ -1885,7 +1969,7 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
   addSuccessorWithWeight(InvokeMBB, LandingPad);
 
   // Drop into normal successor.
-  DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+  DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
                           MVT::Other, getControlRoot(),
                           DAG.getBasicBlock(Return)));
 }
@@ -1904,39 +1988,32 @@ void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
 
   // If there aren't registers to copy the values into (e.g., during SjLj
   // exceptions), then don't bother to create these DAG nodes.
-  if (TLI.getExceptionPointerRegister() == 0 &&
-      TLI.getExceptionSelectorRegister() == 0)
+  const TargetLowering *TLI = TM.getTargetLowering();
+  if (TLI->getExceptionPointerRegister() == 0 &&
+      TLI->getExceptionSelectorRegister() == 0)
     return;
 
   SmallVector<EVT, 2> ValueVTs;
-  ComputeValueVTs(TLI, LP.getType(), ValueVTs);
+  ComputeValueVTs(*TLI, LP.getType(), ValueVTs);
+  assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");
 
-  // Insert the EXCEPTIONADDR instruction.
-  assert(FuncInfo.MBB->isLandingPad() &&
-         "Call to eh.exception not in landing pad!");
-  SDVTList VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other);
+  // Get the two live-in registers as SDValues. The physregs have already been
+  // copied into virtual registers.
   SDValue Ops[2];
-  Ops[0] = DAG.getRoot();
-  SDValue Op1 = DAG.getNode(ISD::EXCEPTIONADDR, getCurDebugLoc(), VTs, Ops, 1);
-  SDValue Chain = Op1.getValue(1);
-
-  // Insert the EHSELECTION instruction.
-  VTs = DAG.getVTList(TLI.getPointerTy(), MVT::Other);
-  Ops[0] = Op1;
-  Ops[1] = Chain;
-  SDValue Op2 = DAG.getNode(ISD::EHSELECTION, getCurDebugLoc(), VTs, Ops, 2);
-  Chain = Op2.getValue(1);
-  Op2 = DAG.getSExtOrTrunc(Op2, getCurDebugLoc(), MVT::i32);
-
-  Ops[0] = Op1;
-  Ops[1] = Op2;
-  SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+  Ops[0] = DAG.getZExtOrTrunc(
+    DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
+                       FuncInfo.ExceptionPointerVirtReg, TLI->getPointerTy()),
+    getCurSDLoc(), ValueVTs[0]);
+  Ops[1] = DAG.getZExtOrTrunc(
+    DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
+                       FuncInfo.ExceptionSelectorVirtReg, TLI->getPointerTy()),
+    getCurSDLoc(), ValueVTs[1]);
+
+  // Merge into one.
+  SDValue Res = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                             DAG.getVTList(&ValueVTs[0], ValueVTs.size()),
                             &Ops[0], 2);
-
-  std::pair<SDValue, SDValue> RetPair = std::make_pair(Res, Chain);
-  setValue(&LP, RetPair.first);
-  DAG.setRoot(RetPair.second);
+  setValue(&LP, Res);
 }
 
 /// handleSmallSwitchCaseRange - Emit a series of specific tests (suitable for
@@ -1987,7 +2064,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
 
         SDValue CondLHS = getValue(SV);
         EVT VT = CondLHS.getValueType();
-        DebugLoc DL = getCurDebugLoc();
+        SDLoc DL = getCurSDLoc();
 
         SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS,
                                  DAG.getConstant(CommonBit, VT));
@@ -2038,12 +2115,11 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
     // The last case block won't fall through into 'NextBlock' if we emit the
     // branches in this order.  See if rearranging a case value would help.
     // We start at the bottom as it's the case with the least weight.
-    for (Case *I = &*(CR.Range.second-2), *E = &*CR.Range.first-1; I != E; --I){
+    for (Case *I = &*(CR.Range.second-2), *E = &*CR.Range.first-1; I != E; --I)
       if (I->BB == NextBlock) {
         std::swap(*I, BackCase);
         break;
       }
-    }
   }
 
   // Create a CaseBlock record representing a conditional branch to
@@ -2070,7 +2146,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
       CC = ISD::SETEQ;
       LHS = SV; RHS = I->High; MHS = NULL;
     } else {
-      CC = ISD::SETCC_INVALID; 
+      CC = ISD::SETLE;
       LHS = I->Low; MHS = SV; RHS = I->High;
     }
 
@@ -2104,7 +2180,7 @@ static inline bool areJTsAllowed(const TargetLowering &TLI) {
 
 static APInt ComputeRange(const APInt &First, const APInt &Last) {
   uint32_t BitWidth = std::max(Last.getBitWidth(), First.getBitWidth()) + 1;
-  APInt LastExt = Last.zext(BitWidth), FirstExt = First.zext(BitWidth);
+  APInt LastExt = Last.sext(BitWidth), FirstExt = First.sext(BitWidth);
   return (LastExt - FirstExt + 1ULL);
 }
 
@@ -2124,7 +2200,8 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
   for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I)
     TSize += I->size();
 
-  if (!areJTsAllowed(TLI) || TSize.ult(TLI.getMinimumJumpTableEntries()))
+  const TargetLowering *TLI = TM.getTargetLowering();
+  if (!areJTsAllowed(*TLI) || TSize.ult(TLI->getMinimumJumpTableEntries()))
     return false;
 
   APInt Range = ComputeRange(First, Last);
@@ -2170,7 +2247,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
     const APInt &Low = cast<ConstantInt>(I->Low)->getValue();
     const APInt &High = cast<ConstantInt>(I->High)->getValue();
 
-    if (Low.ule(TEI) && TEI.ule(High)) {
+    if (Low.sle(TEI) && TEI.sle(High)) {
       DestBBs.push_back(I->BB);
       if (TEI==High)
         ++I;
@@ -2185,7 +2262,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
     for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
       DenseMap<MachineBasicBlock*, uint32_t>::iterator Itr =
           DestWeights.find(I->BB);
-      if (Itr != DestWeights.end()) 
+      if (Itr != DestWeights.end())
         Itr->second += I->ExtraWeight;
       else
         DestWeights[I->BB] = I->ExtraWeight;
@@ -2205,7 +2282,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
   }
 
   // Create a jump table index for this jump table.
-  unsigned JTEncoding = TLI.getJumpTableEncoding();
+  unsigned JTEncoding = TLI->getJumpTableEncoding();
   unsigned JTI = CurMF->getOrCreateJumpTableInfo(JTEncoding)
                        ->createJumpTableIndex(DestBBs);
 
@@ -2225,8 +2302,8 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
 bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
                                                   CaseRecVector& WorkList,
                                                   const Value* SV,
-                                                  MachineBasicBlock *Default,
-                                                  MachineBasicBlock *SwitchBB) {
+                                                  MachineBasicBlock* Default,
+                                                  MachineBasicBlock* SwitchBB) {
   // Get the MachineFunction which holds the current MBB.  This is used when
   // inserting any additional MBBs necessary to represent the switch.
   MachineFunction *CurMF = FuncInfo.MF;
@@ -2290,7 +2367,9 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
     LSize += J->size();
     RSize -= J->size();
   }
-  if (areJTsAllowed(TLI)) {
+
+  const TargetLowering *TLI = TM.getTargetLowering();
+  if (areJTsAllowed(*TLI)) {
     // If our case is dense we *really* should handle it earlier!
     assert((FMetric > 0) && "Should handle dense range earlier!");
   } else {
@@ -2342,7 +2421,7 @@ bool SelectionDAGBuilder::handleBTSplitSwitchCase(CaseRec& CR,
   // Create a CaseBlock record representing a conditional branch to
   // the LHS node if the value being switched on SV is less than C.
   // Otherwise, branch to LHS.
-  CaseBlock CB(ISD::SETULT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
+  CaseBlock CB(ISD::SETLT, SV, C, NULL, TrueBB, FalseBB, CR.CaseBB);
 
   if (CR.CaseBB == SwitchBB)
     visitSwitchCase(CB, SwitchBB);
@@ -2359,8 +2438,9 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
                                                    CaseRecVector& WorkList,
                                                    const Value* SV,
                                                    MachineBasicBlock* Default,
-                                                   MachineBasicBlock *SwitchBB){
-  EVT PTy = TLI.getPointerTy();
+                                                   MachineBasicBlock* SwitchBB) {
+  const TargetLowering *TLI = TM.getTargetLowering();
+  EVT PTy = TLI->getPointerTy();
   unsigned IntPtrBits = PTy.getSizeInBits();
 
   Case& FrontCase = *CR.Range.first;
@@ -2371,7 +2451,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
   MachineFunction *CurMF = FuncInfo.MF;
 
   // If target does not have legal shift left, do not emit bit tests at all.
-  if (!TLI.isOperationLegal(ISD::SHL, TLI.getPointerTy()))
+  if (!TLI->isOperationLegal(ISD::SHL, PTy))
     return false;
 
   size_t numCmps = 0;
@@ -2414,7 +2494,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
   // Optimize the case where all the case values fit in a
   // word without having to subtract minValue. In this case,
   // we can optimize away the subtraction.
-  if (maxValue.ult(IntPtrBits)) {
+  if (minValue.isNonNegative() && maxValue.slt(IntPtrBits)) {
     cmpRange = maxValue;
   } else {
     lowBound = minValue;
@@ -2489,12 +2569,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
 /// Clusterify - Transform simple list of Cases into list of CaseRange's
 size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
                                        const SwitchInst& SI) {
-  
-  /// Use a shorter form of declaration, and also
-  /// show the we want to use CRSBuilder as Clusterifier.
-  typedef IntegersSubsetMapping<MachineBasicBlock> Clusterifier;
-  
-  Clusterifier TheClusterifier;
+  size_t numCmps = 0;
 
   BranchProbabilityInfo *BPI = FuncInfo.BPI;
   // Start with "simple" cases
@@ -2503,27 +2578,40 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
     const BasicBlock *SuccBB = i.getCaseSuccessor();
     MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
 
-    TheClusterifier.add(i.getCaseValueEx(), SMBB, 
-        BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0);
-  }
-  
-  TheClusterifier.optimize();
-  
-  size_t numCmps = 0;
-  for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
-       e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
-    Clusterifier::Cluster &C = *i;
-    // Update edge weight for the cluster.
-    unsigned W = C.first.Weight;
-
-    // FIXME: Currently work with ConstantInt based numbers.
-    // Changing it to APInt based is a pretty heavy for this commit.
-    Cases.push_back(Case(C.first.getLow().toConstantInt(),
-                         C.first.getHigh().toConstantInt(), C.second, W));
-    
-    if (C.first.getLow() != C.first.getHigh())
-    // A range counts double, since it requires two compares.
-    ++numCmps;
+    uint32_t ExtraWeight =
+      BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0;
+
+    Cases.push_back(Case(i.getCaseValue(), i.getCaseValue(),
+                         SMBB, ExtraWeight));
+  }
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge case into clusters
+  if (Cases.size() >= 2)
+    // Must recompute end() each iteration because it may be
+    // invalidated by erase if we hold on to it
+    for (CaseItr I = Cases.begin(), J = llvm::next(Cases.begin());
+         J != Cases.end(); ) {
+      const APInt& nextValue = cast<ConstantInt>(J->Low)->getValue();
+      const APInt& currentValue = cast<ConstantInt>(I->High)->getValue();
+      MachineBasicBlock* nextBB = J->BB;
+      MachineBasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue - currentValue == 1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        I->ExtraWeight += J->ExtraWeight;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
+
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
+      // A range counts double, since it requires two compares.
+      ++numCmps;
   }
 
   return numCmps;
@@ -2557,7 +2645,7 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
     // If this is not a fall-through branch, emit the branch.
     SwitchMBB->addSuccessor(Default);
     if (Default != NextBlock)
-      DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(),
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
                               MVT::Other, getControlRoot(),
                               DAG.getBasicBlock(Default)));
 
@@ -2624,7 +2712,7 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
     addSuccessorWithWeight(IndirectBrMBB, Succ);
   }
 
-  DAG.setRoot(DAG.getNode(ISD::BRIND, getCurDebugLoc(),
+  DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(),
                           MVT::Other, getControlRoot(),
                           getValue(I.getAddress())));
 }
@@ -2635,7 +2723,7 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
   if (isa<Constant>(I.getOperand(0)) &&
       I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
     SDValue Op2 = getValue(I.getOperand(1));
-    setValue(&I, DAG.getNode(ISD::FNEG, getCurDebugLoc(),
+    setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(),
                              Op2.getValueType(), Op2));
     return;
   }
@@ -2646,7 +2734,7 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
 void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
   SDValue Op1 = getValue(I.getOperand(0));
   SDValue Op2 = getValue(I.getOperand(1));
-  setValue(&I, DAG.getNode(OpCode, getCurDebugLoc(),
+  setValue(&I, DAG.getNode(OpCode, getCurSDLoc(),
                            Op1.getValueType(), Op1, Op2));
 }
 
@@ -2654,13 +2742,13 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
   SDValue Op1 = getValue(I.getOperand(0));
   SDValue Op2 = getValue(I.getOperand(1));
 
-  EVT ShiftTy = TLI.getShiftAmountTy(Op2.getValueType());
+  EVT ShiftTy = TM.getTargetLowering()->getShiftAmountTy(Op2.getValueType());
 
   // Coerce the shift amount to the right type if we can.
   if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
     unsigned ShiftSize = ShiftTy.getSizeInBits();
     unsigned Op2Size = Op2.getValueType().getSizeInBits();
-    DebugLoc DL = getCurDebugLoc();
+    SDLoc DL = getCurSDLoc();
 
     // If the operand is smaller than the shift count type, promote it.
     if (ShiftSize > Op2Size)
@@ -2678,7 +2766,7 @@ void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
       Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
   }
 
-  setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(),
+  setValue(&I, DAG.getNode(Opcode, getCurSDLoc(),
                            Op1.getValueType(), Op1, Op2));
 }
 
@@ -2692,9 +2780,10 @@ void SelectionDAGBuilder::visitSDiv(const User &I) {
   if (isa<BinaryOperator>(&I) && cast<BinaryOperator>(&I)->isExact() &&
       !isa<ConstantSDNode>(Op1) &&
       isa<ConstantSDNode>(Op2) && !cast<ConstantSDNode>(Op2)->isNullValue())
-    setValue(&I, TLI.BuildExactSDIV(Op1, Op2, getCurDebugLoc(), DAG));
+    setValue(&I, TM.getTargetLowering()->BuildExactSDIV(Op1, Op2,
+                                                        getCurSDLoc(), DAG));
   else
-    setValue(&I, DAG.getNode(ISD::SDIV, getCurDebugLoc(), Op1.getValueType(),
+    setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(),
                              Op1, Op2));
 }
 
@@ -2708,8 +2797,8 @@ void SelectionDAGBuilder::visitICmp(const User &I) {
   SDValue Op2 = getValue(I.getOperand(1));
   ISD::CondCode Opcode = getICmpCondCode(predicate);
 
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getSetCC(getCurDebugLoc(), DestVT, Op1, Op2, Opcode));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
 }
 
 void SelectionDAGBuilder::visitFCmp(const User &I) {
@@ -2723,13 +2812,13 @@ void SelectionDAGBuilder::visitFCmp(const User &I) {
   ISD::CondCode Condition = getFCmpCondCode(predicate);
   if (TM.Options.NoNaNsFPMath)
     Condition = getFCmpCodeWithoutNaN(Condition);
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getSetCC(getCurDebugLoc(), DestVT, Op1, Op2, Condition));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
 }
 
 void SelectionDAGBuilder::visitSelect(const User &I) {
   SmallVector<EVT, 4> ValueVTs;
-  ComputeValueVTs(TLI, I.getType(), ValueVTs);
+  ComputeValueVTs(*TM.getTargetLowering(), I.getType(), ValueVTs);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0) return;
 
@@ -2741,7 +2830,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
     ISD::VSELECT : ISD::SELECT;
 
   for (unsigned i = 0; i != NumValues; ++i)
-    Values[i] = DAG.getNode(OpCode, getCurDebugLoc(),
+    Values[i] = DAG.getNode(OpCode, getCurSDLoc(),
                             TrueVal.getNode()->getValueType(TrueVal.getResNo()+i),
                             Cond,
                             SDValue(TrueVal.getNode(),
@@ -2749,7 +2838,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
                             SDValue(FalseVal.getNode(),
                                     FalseVal.getResNo() + i));
 
-  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                            DAG.getVTList(&ValueVTs[0], NumValues),
                            &Values[0], NumValues));
 }
@@ -2757,117 +2846,134 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
 void SelectionDAGBuilder::visitTrunc(const User &I) {
   // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitZExt(const User &I) {
   // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
   // ZExt also can't be a cast to bool for same reason. So, nothing much to do
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitSExt(const User &I) {
   // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
   // SExt also can't be a cast to bool for same reason. So, nothing much to do
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitFPTrunc(const User &I) {
   // FPTrunc is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurDebugLoc(),
+  const TargetLowering *TLI = TM.getTargetLowering();
+  EVT DestVT = TLI->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_ROUND, getCurSDLoc(),
                            DestVT, N,
-                           DAG.getTargetConstant(0, TLI.getPointerTy())));
+                           DAG.getTargetConstant(0, TLI->getPointerTy())));
 }
 
-void SelectionDAGBuilder::visitFPExt(const User &I){
+void SelectionDAGBuilder::visitFPExt(const User &I) {
   // FPExt is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitFPToUI(const User &I) {
   // FPToUI is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitFPToSI(const User &I) {
   // FPToSI is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitUIToFP(const User &I) {
   // UIToFP is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
 }
 
-void SelectionDAGBuilder::visitSIToFP(const User &I){
+void SelectionDAGBuilder::visitSIToFP(const User &I) {
   // SIToFP is never a no-op cast, no need to check
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurDebugLoc(), DestVT, N));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
 }
 
 void SelectionDAGBuilder::visitPtrToInt(const User &I) {
   // What to do depends on the size of the integer and the size of the pointer.
   // We can either truncate, zero extend, or no-op, accordingly.
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
 }
 
 void SelectionDAGBuilder::visitIntToPtr(const User &I) {
   // What to do depends on the size of the integer and the size of the pointer.
   // We can either truncate, zero extend, or no-op, accordingly.
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
-  setValue(&I, DAG.getZExtOrTrunc(N, getCurDebugLoc(), DestVT));
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+  setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
 }
 
 void SelectionDAGBuilder::visitBitCast(const User &I) {
   SDValue N = getValue(I.getOperand(0));
-  EVT DestVT = TLI.getValueType(I.getType());
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
 
   // BitCast assures us that source and destination are the same size so this is
   // either a BITCAST or a no-op.
   if (DestVT != N.getValueType())
-    setValue(&I, DAG.getNode(ISD::BITCAST, getCurDebugLoc(),
+    setValue(&I, DAG.getNode(ISD::BITCAST, getCurSDLoc(),
                              DestVT, N)); // convert types.
   else
     setValue(&I, N);            // noop cast.
 }
 
+void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const Value *SV = I.getOperand(0);
+  SDValue N = getValue(SV);
+  EVT DestVT = TM.getTargetLowering()->getValueType(I.getType());
+
+  unsigned SrcAS = SV->getType()->getPointerAddressSpace();
+  unsigned DestAS = I.getType()->getPointerAddressSpace();
+
+  if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
+    N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);
+
+  setValue(&I, N);
+}
+
 void SelectionDAGBuilder::visitInsertElement(const User &I) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue InVec = getValue(I.getOperand(0));
   SDValue InVal = getValue(I.getOperand(1));
-  SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
-                              TLI.getPointerTy(),
-                              getValue(I.getOperand(2)));
-  setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurDebugLoc(),
-                           TLI.getValueType(I.getType()),
+  SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)),
+                                     getCurSDLoc(), TLI.getVectorIdxTy());
+  setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
+                           TM.getTargetLowering()->getValueType(I.getType()),
                            InVec, InVal, InIdx));
 }
 
 void SelectionDAGBuilder::visitExtractElement(const User &I) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue InVec = getValue(I.getOperand(0));
-  SDValue InIdx = DAG.getNode(ISD::ZERO_EXTEND, getCurDebugLoc(),
-                              TLI.getPointerTy(),
-                              getValue(I.getOperand(1)));
-  setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
-                           TLI.getValueType(I.getType()), InVec, InIdx));
+  SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)),
+                                     getCurSDLoc(), TLI.getVectorIdxTy());
+  setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
+                           TM.getTargetLowering()->getValueType(I.getType()),
+                           InVec, InIdx));
 }
 
 // Utility for visitShuffleVector - Return true if every element in Mask,
@@ -2888,13 +2994,14 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
   SmallVector<int, 8> Mask;
   ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
   unsigned MaskNumElts = Mask.size();
-  
-  EVT VT = TLI.getValueType(I.getType());
+
+  const TargetLowering *TLI = TM.getTargetLowering();
+  EVT VT = TLI->getValueType(I.getType());
   EVT SrcVT = Src1.getValueType();
   unsigned SrcNumElts = SrcVT.getVectorNumElements();
 
   if (SrcNumElts == MaskNumElts) {
-    setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+    setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
                                       &Mask[0]));
     return;
   }
@@ -2909,7 +3016,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
       if (isSequentialInRange(Mask, 0, SrcNumElts, 0) &&
           isSequentialInRange(Mask, SrcNumElts, SrcNumElts, SrcNumElts)) {
         // The shuffle is concatenating two vectors together.
-        setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurDebugLoc(),
+        setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(),
                                  VT, Src1, Src2));
         return;
       }
@@ -2917,7 +3024,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
       if (isSequentialInRange(Mask, 0, SrcNumElts, SrcNumElts) &&
           isSequentialInRange(Mask, SrcNumElts, SrcNumElts, 0)) {
         // The shuffle is concatenating two vectors together.
-        setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurDebugLoc(),
+        setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(),
                                  VT, Src2, Src1));
         return;
       }
@@ -2935,10 +3042,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
     MOps2[0] = Src2;
 
     Src1 = Src1U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
-                                                  getCurDebugLoc(), VT,
+                                                  getCurSDLoc(), VT,
                                                   &MOps1[0], NumConcat);
     Src2 = Src2U ? DAG.getUNDEF(VT) : DAG.getNode(ISD::CONCAT_VECTORS,
-                                                  getCurDebugLoc(), VT,
+                                                  getCurSDLoc(), VT,
                                                   &MOps2[0], NumConcat);
 
     // Readjust mask for new input vector length.
@@ -2950,7 +3057,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
       MappedOps.push_back(Idx);
     }
 
-    setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+    setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
                                       &MappedOps[0]));
     return;
   }
@@ -3010,8 +3117,9 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
         if (RangeUse[Input] == 0)
           Src = DAG.getUNDEF(VT);
         else
-          Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, getCurDebugLoc(), VT,
-                            Src, DAG.getIntPtrConstant(StartIdx[Input]));
+          Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, getCurSDLoc(), VT,
+                            Src, DAG.getConstant(StartIdx[Input],
+                                                 TLI->getVectorIdxTy()));
       }
 
       // Calculate new mask.
@@ -3027,7 +3135,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
         MappedOps.push_back(Idx);
       }
 
-      setValue(&I, DAG.getVectorShuffle(VT, getCurDebugLoc(), Src1, Src2,
+      setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
                                         &MappedOps[0]));
       return;
     }
@@ -3037,7 +3145,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
   // replacing the shuffle with extract and build vector.
   // to insert and build vector.
   EVT EltVT = VT.getVectorElementType();
-  EVT PtrVT = TLI.getPointerTy();
+  EVT IdxVT = TLI->getVectorIdxTy();
   SmallVector<SDValue,8> Ops;
   for (unsigned i = 0; i != MaskNumElts; ++i) {
     int Idx = Mask[i];
@@ -3049,14 +3157,14 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
       SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
       if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;
 
-      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurDebugLoc(),
-                        EltVT, Src, DAG.getConstant(Idx, PtrVT));
+      Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
+                        EltVT, Src, DAG.getConstant(Idx, IdxVT));
     }
 
     Ops.push_back(Res);
   }
 
-  setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurDebugLoc(),
+  setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(),
                            VT, &Ops[0], Ops.size()));
 }
 
@@ -3070,10 +3178,11 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
 
   unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
 
+  const TargetLowering *TLI = TM.getTargetLowering();
   SmallVector<EVT, 4> AggValueVTs;
-  ComputeValueVTs(TLI, AggTy, AggValueVTs);
+  ComputeValueVTs(*TLI, AggTy, AggValueVTs);
   SmallVector<EVT, 4> ValValueVTs;
-  ComputeValueVTs(TLI, ValTy, ValValueVTs);
+  ComputeValueVTs(*TLI, ValTy, ValValueVTs);
 
   unsigned NumAggValues = AggValueVTs.size();
   unsigned NumValValues = ValValueVTs.size();
@@ -3097,7 +3206,7 @@ void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
     Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
                 SDValue(Agg.getNode(), Agg.getResNo() + i);
 
-  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                            DAG.getVTList(&AggValueVTs[0], NumAggValues),
                            &Values[0], NumAggValues));
 }
@@ -3110,8 +3219,9 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
 
   unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
 
+  const TargetLowering *TLI = TM.getTargetLowering();
   SmallVector<EVT, 4> ValValueVTs;
-  ComputeValueVTs(TLI, ValTy, ValValueVTs);
+  ComputeValueVTs(*TLI, ValTy, ValValueVTs);
 
   unsigned NumValValues = ValValueVTs.size();
 
@@ -3131,16 +3241,18 @@ void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
         DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
         SDValue(Agg.getNode(), Agg.getResNo() + i);
 
-  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                            DAG.getVTList(&ValValueVTs[0], NumValValues),
                            &Values[0], NumValValues));
 }
 
 void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
-  SDValue N = getValue(I.getOperand(0));
+  Value *Op0 = I.getOperand(0);
   // Note that the pointer operand may be a vector of pointers. Take the scalar
   // element which holds a pointer.
-  Type *Ty = I.getOperand(0)->getType()->getScalarType();
+  Type *Ty = Op0->getType()->getScalarType();
+  unsigned AS = Ty->getPointerAddressSpace();
+  SDValue N = getValue(Op0);
 
   for (GetElementPtrInst::const_op_iterator OI = I.op_begin()+1, E = I.op_end();
        OI != E; ++OI) {
@@ -3150,7 +3262,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
       if (Field) {
         // N = N + Offset
         uint64_t Offset = TD->getStructLayout(StTy)->getElementOffset(Field);
-        N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+        N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N,
                         DAG.getConstant(Offset, N.getValueType()));
       }
 
@@ -3159,50 +3271,50 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
       Ty = cast<SequentialType>(Ty)->getElementType();
 
       // If this is a constant subscript, handle it quickly.
+      const TargetLowering *TLI = TM.getTargetLowering();
       if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) {
         if (CI->isZero()) continue;
         uint64_t Offs =
             TD->getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue();
         SDValue OffsVal;
-        EVT PTy = TLI.getPointerTy();
+        EVT PTy = TLI->getPointerTy(AS);
         unsigned PtrBits = PTy.getSizeInBits();
         if (PtrBits < 64)
-          OffsVal = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(),
-                                TLI.getPointerTy(),
+          OffsVal = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), PTy,
                                 DAG.getConstant(Offs, MVT::i64));
         else
-          OffsVal = DAG.getIntPtrConstant(Offs);
+          OffsVal = DAG.getConstant(Offs, PTy);
 
-        N = DAG.getNode(ISD::ADD, getCurDebugLoc(), N.getValueType(), N,
+        N = DAG.getNode(ISD::ADD, getCurSDLoc(), N.getValueType(), N,
                         OffsVal);
         continue;
       }
 
       // N = N + Idx * ElementSize;
-      APInt ElementSize = APInt(TLI.getPointerTy().getSizeInBits(),
+      APInt ElementSize = APInt(TLI->getPointerSizeInBits(AS),
                                 TD->getTypeAllocSize(Ty));
       SDValue IdxN = getValue(Idx);
 
       // If the index is smaller or larger than intptr_t, truncate or extend
       // it.
-      IdxN = DAG.getSExtOrTrunc(IdxN, getCurDebugLoc(), N.getValueType());
+      IdxN = DAG.getSExtOrTrunc(IdxN, getCurSDLoc(), N.getValueType());
 
       // If this is a multiply by a power of two, turn it into a shl
       // immediately.  This is a very common case.
       if (ElementSize != 1) {
         if (ElementSize.isPowerOf2()) {
           unsigned Amt = ElementSize.logBase2();
-          IdxN = DAG.getNode(ISD::SHL, getCurDebugLoc(),
+          IdxN = DAG.getNode(ISD::SHL, getCurSDLoc(),
                              N.getValueType(), IdxN,
                              DAG.getConstant(Amt, IdxN.getValueType()));
         } else {
           SDValue Scale = DAG.getConstant(ElementSize, IdxN.getValueType());
-          IdxN = DAG.getNode(ISD::MUL, getCurDebugLoc(),
+          IdxN = DAG.getNode(ISD::MUL, getCurSDLoc(),
                              N.getValueType(), IdxN, Scale);
         }
       }
 
-      N = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+      N = DAG.getNode(ISD::ADD, getCurSDLoc(),
                       N.getValueType(), N, IdxN);
     }
   }
@@ -3217,18 +3329,19 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
     return;   // getValue will auto-populate this.
 
   Type *Ty = I.getAllocatedType();
-  uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+  const TargetLowering *TLI = TM.getTargetLowering();
+  uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
   unsigned Align =
-    std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
+    std::max((unsigned)TLI->getDataLayout()->getPrefTypeAlignment(Ty),
              I.getAlignment());
 
   SDValue AllocSize = getValue(I.getArraySize());
 
-  EVT IntPtr = TLI.getPointerTy();
+  EVT IntPtr = TLI->getPointerTy();
   if (AllocSize.getValueType() != IntPtr)
-    AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurDebugLoc(), IntPtr);
+    AllocSize = DAG.getZExtOrTrunc(AllocSize, getCurSDLoc(), IntPtr);
 
-  AllocSize = DAG.getNode(ISD::MUL, getCurDebugLoc(), IntPtr,
+  AllocSize = DAG.getNode(ISD::MUL, getCurSDLoc(), IntPtr,
                           AllocSize,
                           DAG.getConstant(TySize, IntPtr));
 
@@ -3241,18 +3354,18 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
 
   // Round the size of the allocation up to the stack alignment size
   // by add SA-1 to the size.
-  AllocSize = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+  AllocSize = DAG.getNode(ISD::ADD, getCurSDLoc(),
                           AllocSize.getValueType(), AllocSize,
                           DAG.getIntPtrConstant(StackAlign-1));
 
   // Mask out the low bits for alignment purposes.
-  AllocSize = DAG.getNode(ISD::AND, getCurDebugLoc(),
+  AllocSize = DAG.getNode(ISD::AND, getCurSDLoc(),
                           AllocSize.getValueType(), AllocSize,
                           DAG.getIntPtrConstant(~(uint64_t)(StackAlign-1)));
 
   SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align) };
   SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
-  SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurDebugLoc(),
+  SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, getCurSDLoc(),
                             VTs, Ops, 3);
   setValue(&I, DSA);
   DAG.setRoot(DSA.getValue(1));
@@ -3280,7 +3393,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
 
   SmallVector<EVT, 4> ValueVTs;
   SmallVector<uint64_t, 4> Offsets;
-  ComputeValueVTs(TLI, Ty, ValueVTs, &Offsets);
+  ComputeValueVTs(*TM.getTargetLowering(), Ty, ValueVTs, &Offsets);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0)
     return;
@@ -3314,15 +3427,15 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
     // (MaxParallelChains should always remain as failsafe).
     if (ChainI == MaxParallelChains) {
       assert(PendingLoads.empty() && "PendingLoads must be serialized first");
-      SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+      SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
                                   MVT::Other, &Chains[0], ChainI);
       Root = Chain;
       ChainI = 0;
     }
-    SDValue A = DAG.getNode(ISD::ADD, getCurDebugLoc(),
+    SDValue A = DAG.getNode(ISD::ADD, getCurSDLoc(),
                             PtrVT, Ptr,
                             DAG.getConstant(Offsets[i], PtrVT));
-    SDValue L = DAG.getLoad(ValueVTs[i], getCurDebugLoc(), Root,
+    SDValue L = DAG.getLoad(ValueVTs[i], getCurSDLoc(), Root,
                             A, MachinePointerInfo(SV, Offsets[i]), isVolatile,
                             isNonTemporal, isInvariant, Alignment, TBAAInfo,
                             Ranges);
@@ -3332,7 +3445,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
   }
 
   if (!ConstantMemory) {
-    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
                                 MVT::Other, &Chains[0], ChainI);
     if (isVolatile)
       DAG.setRoot(Chain);
@@ -3340,7 +3453,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
       PendingLoads.push_back(Chain);
   }
 
-  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                            DAG.getVTList(&ValueVTs[0], NumValues),
                            &Values[0], NumValues));
 }
@@ -3354,7 +3467,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
 
   SmallVector<EVT, 4> ValueVTs;
   SmallVector<uint64_t, 4> Offsets;
-  ComputeValueVTs(TLI, SrcV->getType(), ValueVTs, &Offsets);
+  ComputeValueVTs(*TM.getTargetLowering(), SrcV->getType(), ValueVTs, &Offsets);
   unsigned NumValues = ValueVTs.size();
   if (NumValues == 0)
     return;
@@ -3378,30 +3491,28 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
   for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
     // See visitLoad comments.
     if (ChainI == MaxParallelChains) {
-      SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+      SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
                                   MVT::Other, &Chains[0], ChainI);
       Root = Chain;
       ChainI = 0;
     }
-    SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT, Ptr,
+    SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), PtrVT, Ptr,
                               DAG.getConstant(Offsets[i], PtrVT));
-    SDValue St = DAG.getStore(Root, getCurDebugLoc(),
+    SDValue St = DAG.getStore(Root, getCurSDLoc(),
                               SDValue(Src.getNode(), Src.getResNo() + i),
                               Add, MachinePointerInfo(PtrV, Offsets[i]),
                               isVolatile, isNonTemporal, Alignment, TBAAInfo);
     Chains[ChainI] = St;
   }
 
-  SDValue StoreNode = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+  SDValue StoreNode = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
                                   MVT::Other, &Chains[0], ChainI);
-  ++SDNodeOrder;
-  AssignOrderingToNode(StoreNode.getNode());
   DAG.setRoot(StoreNode);
 }
 
 static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order,
                                     SynchronizationScope Scope,
-                                    bool Before, DebugLoc dl,
+                                    bool Before, SDLoc dl,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
   // Fence, if necessary
@@ -3424,39 +3535,40 @@ static SDValue InsertFenceForAtomic(SDValue Chain, AtomicOrdering Order,
 }
 
 void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
-  DebugLoc dl = getCurDebugLoc();
+  SDLoc dl = getCurSDLoc();
   AtomicOrdering Order = I.getOrdering();
   SynchronizationScope Scope = I.getSynchScope();
 
   SDValue InChain = getRoot();
 
-  if (TLI.getInsertFencesForAtomic())
+  const TargetLowering *TLI = TM.getTargetLowering();
+  if (TLI->getInsertFencesForAtomic())
     InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
-                                   DAG, TLI);
+                                   DAG, *TLI);
 
   SDValue L =
     DAG.getAtomic(ISD::ATOMIC_CMP_SWAP, dl,
-                  getValue(I.getCompareOperand()).getValueType().getSimpleVT(),
+                  getValue(I.getCompareOperand()).getSimpleValueType(),
                   InChain,
                   getValue(I.getPointerOperand()),
                   getValue(I.getCompareOperand()),
                   getValue(I.getNewValOperand()),
                   MachinePointerInfo(I.getPointerOperand()), 0 /* Alignment */,
-                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  TLI->getInsertFencesForAtomic() ? Monotonic : Order,
                   Scope);
 
   SDValue OutChain = L.getValue(1);
 
-  if (TLI.getInsertFencesForAtomic())
+  if (TLI->getInsertFencesForAtomic())
     OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, TLI);
+                                    DAG, *TLI);
 
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
 
 void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
-  DebugLoc dl = getCurDebugLoc();
+  SDLoc dl = getCurSDLoc();
   ISD::NodeType NT;
   switch (I.getOperation()) {
   default: llvm_unreachable("Unknown atomicrmw operation");
@@ -3477,47 +3589,50 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
 
   SDValue InChain = getRoot();
 
-  if (TLI.getInsertFencesForAtomic())
+  const TargetLowering *TLI = TM.getTargetLowering();
+  if (TLI->getInsertFencesForAtomic())
     InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
-                                   DAG, TLI);
+                                   DAG, *TLI);
 
   SDValue L =
     DAG.getAtomic(NT, dl,
-                  getValue(I.getValOperand()).getValueType().getSimpleVT(),
+                  getValue(I.getValOperand()).getSimpleValueType(),
                   InChain,
                   getValue(I.getPointerOperand()),
                   getValue(I.getValOperand()),
                   I.getPointerOperand(), 0 /* Alignment */,
-                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  TLI->getInsertFencesForAtomic() ? Monotonic : Order,
                   Scope);
 
   SDValue OutChain = L.getValue(1);
 
-  if (TLI.getInsertFencesForAtomic())
+  if (TLI->getInsertFencesForAtomic())
     OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, TLI);
+                                    DAG, *TLI);
 
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
 
 void SelectionDAGBuilder::visitFence(const FenceInst &I) {
-  DebugLoc dl = getCurDebugLoc();
+  SDLoc dl = getCurSDLoc();
+  const TargetLowering *TLI = TM.getTargetLowering();
   SDValue Ops[3];
   Ops[0] = getRoot();
-  Ops[1] = DAG.getConstant(I.getOrdering(), TLI.getPointerTy());
-  Ops[2] = DAG.getConstant(I.getSynchScope(), TLI.getPointerTy());
+  Ops[1] = DAG.getConstant(I.getOrdering(), TLI->getPointerTy());
+  Ops[2] = DAG.getConstant(I.getSynchScope(), TLI->getPointerTy());
   DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops, 3));
 }
 
 void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
-  DebugLoc dl = getCurDebugLoc();
+  SDLoc dl = getCurSDLoc();
   AtomicOrdering Order = I.getOrdering();
   SynchronizationScope Scope = I.getSynchScope();
 
   SDValue InChain = getRoot();
 
-  EVT VT = TLI.getValueType(I.getType());
+  const TargetLowering *TLI = TM.getTargetLowering();
+  EVT VT = TLI->getValueType(I.getType());
 
   if (I.getAlignment() < VT.getSizeInBits() / 8)
     report_fatal_error("Cannot generate unaligned atomic load");
@@ -3526,35 +3641,36 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
     DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
                   getValue(I.getPointerOperand()),
                   I.getPointerOperand(), I.getAlignment(),
-                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  TLI->getInsertFencesForAtomic() ? Monotonic : Order,
                   Scope);
 
   SDValue OutChain = L.getValue(1);
 
-  if (TLI.getInsertFencesForAtomic())
+  if (TLI->getInsertFencesForAtomic())
     OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, TLI);
+                                    DAG, *TLI);
 
   setValue(&I, L);
   DAG.setRoot(OutChain);
 }
 
 void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
-  DebugLoc dl = getCurDebugLoc();
+  SDLoc dl = getCurSDLoc();
 
   AtomicOrdering Order = I.getOrdering();
   SynchronizationScope Scope = I.getSynchScope();
 
   SDValue InChain = getRoot();
 
-  EVT VT = TLI.getValueType(I.getValueOperand()->getType());
+  const TargetLowering *TLI = TM.getTargetLowering();
+  EVT VT = TLI->getValueType(I.getValueOperand()->getType());
 
   if (I.getAlignment() < VT.getSizeInBits() / 8)
     report_fatal_error("Cannot generate unaligned atomic store");
 
-  if (TLI.getInsertFencesForAtomic())
+  if (TLI->getInsertFencesForAtomic())
     InChain = InsertFenceForAtomic(InChain, Order, Scope, true, dl,
-                                   DAG, TLI);
+                                   DAG, *TLI);
 
   SDValue OutChain =
     DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT,
@@ -3562,12 +3678,12 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
                   getValue(I.getPointerOperand()),
                   getValue(I.getValueOperand()),
                   I.getPointerOperand(), I.getAlignment(),
-                  TLI.getInsertFencesForAtomic() ? Monotonic : Order,
+                  TLI->getInsertFencesForAtomic() ? Monotonic : Order,
                   Scope);
 
-  if (TLI.getInsertFencesForAtomic())
+  if (TLI->getInsertFencesForAtomic())
     OutChain = InsertFenceForAtomic(OutChain, Order, Scope, false, dl,
-                                    DAG, TLI);
+                                    DAG, *TLI);
 
   DAG.setRoot(OutChain);
 }
@@ -3592,12 +3708,13 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
 
   // Info is set by getTgtMemInstrinsic
   TargetLowering::IntrinsicInfo Info;
-  bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic);
+  const TargetLowering *TLI = TM.getTargetLowering();
+  bool IsTgtIntrinsic = TLI->getTgtMemIntrinsic(Info, I, Intrinsic);
 
   // Add the intrinsic ID as an integer operand if it's not a target intrinsic.
   if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
       Info.opc == ISD::INTRINSIC_W_CHAIN)
-    Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI.getPointerTy()));
+    Ops.push_back(DAG.getTargetConstant(Intrinsic, TLI->getPointerTy()));
 
   // Add all operands of the call to the operand list.
   for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
@@ -3606,7 +3723,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
   }
 
   SmallVector<EVT, 4> ValueVTs;
-  ComputeValueVTs(TLI, I.getType(), ValueVTs);
+  ComputeValueVTs(*TLI, I.getType(), ValueVTs);
 
   if (HasChain)
     ValueVTs.push_back(MVT::Other);
@@ -3617,20 +3734,20 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
   SDValue Result;
   if (IsTgtIntrinsic) {
     // This is target intrinsic that touches memory
-    Result = DAG.getMemIntrinsicNode(Info.opc, getCurDebugLoc(),
+    Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(),
                                      VTs, &Ops[0], Ops.size(),
                                      Info.memVT,
                                    MachinePointerInfo(Info.ptrVal, Info.offset),
                                      Info.align, Info.vol,
                                      Info.readMem, Info.writeMem);
   } else if (!HasChain) {
-    Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurDebugLoc(),
+    Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(),
                          VTs, &Ops[0], Ops.size());
   } else if (!I.getType()->isVoidTy()) {
-    Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurDebugLoc(),
+    Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(),
                          VTs, &Ops[0], Ops.size());
   } else {
-    Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurDebugLoc(),
+    Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(),
                          VTs, &Ops[0], Ops.size());
   }
 
@@ -3644,17 +3761,11 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
 
   if (!I.getType()->isVoidTy()) {
     if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
-      EVT VT = TLI.getValueType(PTy);
-      Result = DAG.getNode(ISD::BITCAST, getCurDebugLoc(), VT, Result);
+      EVT VT = TLI->getValueType(PTy);
+      Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
     }
 
     setValue(&I, Result);
-  } else {
-    // Assign order to result here. If the intrinsic does not produce a result,
-    // it won't be mapped to a SDNode and visit() will not assign it an order
-    // number.
-    ++SDNodeOrder;
-    AssignOrderingToNode(Result.getNode());
   }
 }
 
@@ -3665,7 +3776,7 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
 ///
 /// where Op is the hexadecimal representation of floating point value.
 static SDValue
-GetSignificand(SelectionDAG &DAG, SDValue Op, DebugLoc dl) {
+GetSignificand(SelectionDAG &DAG, SDValue Op, SDLoc dl) {
   SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
                            DAG.getConstant(0x007fffff, MVT::i32));
   SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1,
@@ -3680,7 +3791,7 @@ GetSignificand(SelectionDAG &DAG, SDValue Op, DebugLoc dl) {
 /// where Op is the hexadecimal representation of floating point value.
 static SDValue
 GetExponent(SelectionDAG &DAG, SDValue Op, const TargetLowering &TLI,
-            DebugLoc dl) {
+            SDLoc dl) {
   SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
                            DAG.getConstant(0x7f800000, MVT::i32));
   SDValue t1 = DAG.getNode(ISD::SRL, dl, MVT::i32, t0,
@@ -3699,7 +3810,7 @@ getF32Constant(SelectionDAG &DAG, unsigned Flt) {
 
 /// expandExp - Lower an exp intrinsic. Handles the special sequences for
 /// limited-precision mode.
-static SDValue expandExp(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandExp(SDLoc dl, SDValue Op, SelectionDAG &DAG,
                          const TargetLowering &TLI) {
   if (Op.getValueType() == MVT::f32 &&
       LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -3802,7 +3913,7 @@ static SDValue expandExp(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
 
 /// expandLog - Lower a log intrinsic. Handles the special sequences for
 /// limited-precision mode.
-static SDValue expandLog(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandLog(SDLoc dl, SDValue Op, SelectionDAG &DAG,
                          const TargetLowering &TLI) {
   if (Op.getValueType() == MVT::f32 &&
       LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -3898,7 +4009,7 @@ static SDValue expandLog(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
 
 /// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
 /// limited-precision mode.
-static SDValue expandLog2(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandLog2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
                           const TargetLowering &TLI) {
   if (Op.getValueType() == MVT::f32 &&
       LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -3993,7 +4104,7 @@ static SDValue expandLog2(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
 
 /// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
 /// limited-precision mode.
-static SDValue expandLog10(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandLog10(SDLoc dl, SDValue Op, SelectionDAG &DAG,
                            const TargetLowering &TLI) {
   if (Op.getValueType() == MVT::f32 &&
       LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -4081,7 +4192,7 @@ static SDValue expandLog10(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
 
 /// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for
 /// limited-precision mode.
-static SDValue expandExp2(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
+static SDValue expandExp2(SDLoc dl, SDValue Op, SelectionDAG &DAG,
                           const TargetLowering &TLI) {
   if (Op.getValueType() == MVT::f32 &&
       LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
@@ -4176,10 +4287,10 @@ static SDValue expandExp2(DebugLoc dl, SDValue Op, SelectionDAG &DAG,
 
 /// visitPow - Lower a pow intrinsic. Handles the special sequences for
 /// limited-precision mode with x == 10.0f.
-static SDValue expandPow(DebugLoc dl, SDValue LHS, SDValue RHS,
+static SDValue expandPow(SDLoc dl, SDValue LHS, SDValue RHS,
                          SelectionDAG &DAG, const TargetLowering &TLI) {
   bool IsExp10 = false;
-  if (LHS.getValueType() == MVT::f32 && LHS.getValueType() == MVT::f32 &&
+  if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
       LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
     if (ConstantFPSDNode *LHSC = dyn_cast<ConstantFPSDNode>(LHS)) {
       APFloat Ten(10.0f);
@@ -4284,7 +4395,7 @@ static SDValue expandPow(DebugLoc dl, SDValue LHS, SDValue RHS,
 
 
 /// ExpandPowI - Expand a llvm.powi intrinsic.
-static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
+static SDValue ExpandPowI(SDLoc DL, SDValue LHS, SDValue RHS,
                           SelectionDAG &DAG) {
   // If RHS is a constant, we can expand this out to a multiplication tree,
   // otherwise we end up lowering to a call to __powidf2 (for example).  When
@@ -4343,7 +4454,8 @@ static unsigned getTruncatedArgReg(const SDValue &N) {
     return 0;
 
   const SDValue &Ext = N.getOperand(0);
-  if (Ext.getOpcode() == ISD::AssertZext || Ext.getOpcode() == ISD::AssertSext){
+  if (Ext.getOpcode() == ISD::AssertZext ||
+      Ext.getOpcode() == ISD::AssertSext) {
     const SDValue &CFR = Ext.getOperand(0);
     if (CFR.getOpcode() == ISD::CopyFromReg)
       return cast<RegisterSDNode>(CFR.getOperand(1))->getReg();
@@ -4366,20 +4478,19 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
 
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetInstrInfo *TII = DAG.getTarget().getInstrInfo();
-  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
 
   // Ignore inlined function arguments here.
   DIVariable DV(Variable);
   if (DV.isInlinedFnArgument(MF.getFunction()))
     return false;
 
-  unsigned Reg = 0;
+  Optional<MachineOperand> Op;
   // Some arguments' frame index is recorded during argument lowering.
-  Offset = FuncInfo.getArgumentFrameIndex(Arg);
-  if (Offset)
-    Reg = TRI->getFrameRegister(MF);
+  if (int FI = FuncInfo.getArgumentFrameIndex(Arg))
+    Op = MachineOperand::CreateFI(FI);
 
-  if (!Reg && N.getNode()) {
+  if (!Op && N.getNode()) {
+    unsigned Reg;
     if (N.getOpcode() == ISD::CopyFromReg)
       Reg = cast<RegisterSDNode>(N.getOperand(1))->getReg();
     else
@@ -4390,32 +4501,39 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
       if (PR)
         Reg = PR;
     }
+    if (Reg)
+      Op = MachineOperand::CreateReg(Reg, false);
   }
 
-  if (!Reg) {
+  if (!Op) {
     // Check if ValueMap has reg number.
     DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
     if (VMI != FuncInfo.ValueMap.end())
-      Reg = VMI->second;
+      Op = MachineOperand::CreateReg(VMI->second, false);
   }
 
-  if (!Reg && N.getNode()) {
+  if (!Op && N.getNode())
     // Check if frame index is available.
     if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(N.getNode()))
       if (FrameIndexSDNode *FINode =
-          dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode())) {
-        Reg = TRI->getFrameRegister(MF);
-        Offset = FINode->getIndex();
-      }
-  }
+          dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
+        Op = MachineOperand::CreateFI(FINode->getIndex());
 
-  if (!Reg)
+  if (!Op)
     return false;
 
-  MachineInstrBuilder MIB = BuildMI(MF, getCurDebugLoc(),
-                                    TII->get(TargetOpcode::DBG_VALUE))
-    .addReg(Reg, RegState::Debug).addImm(Offset).addMetadata(Variable);
-  FuncInfo.ArgDbgValues.push_back(&*MIB);
+  // FIXME: This does not handle register-indirect values at offset 0.
+  bool IsIndirect = Offset != 0;
+  if (Op->isReg())
+    FuncInfo.ArgDbgValues.push_back(BuildMI(MF, getCurDebugLoc(),
+                                            TII->get(TargetOpcode::DBG_VALUE),
+                                            IsIndirect,
+                                            Op->getReg(), Offset, Variable));
+  else
+    FuncInfo.ArgDbgValues.push_back(
+      BuildMI(MF, getCurDebugLoc(), TII->get(TargetOpcode::DBG_VALUE))
+          .addOperand(*Op).addImm(Offset).addMetadata(Variable));
+
   return true;
 }
 
@@ -4432,6 +4550,8 @@ SelectionDAGBuilder::EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
 /// otherwise lower it and return null.
 const char *
 SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
+  const TargetLowering *TLI = TM.getTargetLowering();
+  SDLoc sdl = getCurSDLoc();
   DebugLoc dl = getCurDebugLoc();
   SDValue Res;
 
@@ -4444,17 +4564,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::vaend:    visitVAEnd(I); return 0;
   case Intrinsic::vacopy:   visitVACopy(I); return 0;
   case Intrinsic::returnaddress:
-    setValue(&I, DAG.getNode(ISD::RETURNADDR, dl, TLI.getPointerTy(),
+    setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl, TLI->getPointerTy(),
                              getValue(I.getArgOperand(0))));
     return 0;
   case Intrinsic::frameaddress:
-    setValue(&I, DAG.getNode(ISD::FRAMEADDR, dl, TLI.getPointerTy(),
+    setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl, TLI->getPointerTy(),
                              getValue(I.getArgOperand(0))));
     return 0;
   case Intrinsic::setjmp:
-    return &"_setjmp"[!TLI.usesUnderscoreSetJmp()];
+    return &"_setjmp"[!TLI->usesUnderscoreSetJmp()];
   case Intrinsic::longjmp:
-    return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
+    return &"_longjmp"[!TLI->usesUnderscoreLongJmp()];
   case Intrinsic::memcpy: {
     // Assert for address < 256 since we support only user defined address
     // spaces.
@@ -4470,7 +4590,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     if (!Align)
       Align = 1; // @llvm.memcpy defines 0 and 1 to both mean no alignment.
     bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
-    DAG.setRoot(DAG.getMemcpy(getRoot(), dl, Op1, Op2, Op3, Align, isVol, false,
+    DAG.setRoot(DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol, false,
                               MachinePointerInfo(I.getArgOperand(0)),
                               MachinePointerInfo(I.getArgOperand(1))));
     return 0;
@@ -4488,7 +4608,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     if (!Align)
       Align = 1; // @llvm.memset defines 0 and 1 to both mean no alignment.
     bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
-    DAG.setRoot(DAG.getMemset(getRoot(), dl, Op1, Op2, Op3, Align, isVol,
+    DAG.setRoot(DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
                               MachinePointerInfo(I.getArgOperand(0))));
     return 0;
   }
@@ -4507,7 +4627,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     if (!Align)
       Align = 1; // @llvm.memmove defines 0 and 1 to both mean no alignment.
     bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
-    DAG.setRoot(DAG.getMemmove(getRoot(), dl, Op1, Op2, Op3, Align, isVol,
+    DAG.setRoot(DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
                                MachinePointerInfo(I.getArgOperand(0)),
                                MachinePointerInfo(I.getArgOperand(1))));
     return 0;
@@ -4516,17 +4636,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
     MDNode *Variable = DI.getVariable();
     const Value *Address = DI.getAddress();
-    if (!Address || !DIVariable(Variable).Verify()) {
+    DIVariable DIVar(Variable);
+    assert((!DIVar || DIVar.isVariable()) &&
+      "Variable in DbgDeclareInst should be either null or a DIVariable.");
+    if (!Address || !DIVar) {
       DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
       return 0;
     }
 
-    // Build an entry in DbgOrdering.  Debug info input nodes get an SDNodeOrder
-    // but do not always have a corresponding SDNode built.  The SDNodeOrder
-    // absolute, but not relative, values are different depending on whether
-    // debug info exists.
-    ++SDNodeOrder;
-
     // Check if address has undef value.
     if (isa<UndefValue>(Address) ||
         (Address->use_empty() && !isa<Argument>(Address))) {
@@ -4597,7 +4714,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   }
   case Intrinsic::dbg_value: {
     const DbgValueInst &DI = cast<DbgValueInst>(I);
-    if (!DIVariable(DI.getVariable()).Verify())
+    DIVariable DIVar(DI.getVariable());
+    assert((!DIVar || DIVar.isVariable()) &&
+      "Variable in DbgValueInst should be either null or a DIVariable.");
+    if (!DIVar)
       return 0;
 
     MDNode *Variable = DI.getVariable();
@@ -4606,11 +4726,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     if (!V)
       return 0;
 
-    // Build an entry in DbgOrdering.  Debug info input nodes get an SDNodeOrder
-    // but do not always have a corresponding SDNode built.  The SDNodeOrder
-    // absolute, but not relative, values are different depending on whether
-    // debug info exists.
-    ++SDNodeOrder;
     SDDbgValue *SDV;
     if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V)) {
       SDV = DAG.getDbgValue(Variable, V, Offset, dl, SDNodeOrder);
@@ -4674,7 +4789,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::eh_return_i32:
   case Intrinsic::eh_return_i64:
     DAG.getMachineFunction().getMMI().setCallsEHReturn(true);
-    DAG.setRoot(DAG.getNode(ISD::EH_RETURN, dl,
+    DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl,
                             MVT::Other,
                             getControlRoot(),
                             getValue(I.getArgOperand(0)),
@@ -4684,17 +4799,17 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     DAG.getMachineFunction().getMMI().setCallsUnwindInit(true);
     return 0;
   case Intrinsic::eh_dwarf_cfa: {
-    SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), dl,
-                                        TLI.getPointerTy());
-    SDValue Offset = DAG.getNode(ISD::ADD, dl,
-                                 TLI.getPointerTy(),
-                                 DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, dl,
-                                             TLI.getPointerTy()),
+    SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl,
+                                        TLI->getPointerTy());
+    SDValue Offset = DAG.getNode(ISD::ADD, sdl,
+                                 CfaArg.getValueType(),
+                                 DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl,
+                                             CfaArg.getValueType()),
                                  CfaArg);
-    SDValue FA = DAG.getNode(ISD::FRAMEADDR, dl,
-                             TLI.getPointerTy(),
-                             DAG.getConstant(0, TLI.getPointerTy()));
-    setValue(&I, DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
+    SDValue FA = DAG.getNode(ISD::FRAMEADDR, sdl,
+                             TLI->getPointerTy(),
+                             DAG.getConstant(0, TLI->getPointerTy()));
+    setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(),
                              FA, Offset));
     return 0;
   }
@@ -4720,7 +4835,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SDValue Ops[2];
     Ops[0] = getRoot();
     Ops[1] = getValue(I.getArgOperand(0));
-    SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, dl,
+    SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl,
                              DAG.getVTList(MVT::i32, MVT::Other),
                              Ops, 2);
     setValue(&I, Op.getValue(0));
@@ -4728,7 +4843,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return 0;
   }
   case Intrinsic::eh_sjlj_longjmp: {
-    DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, dl, MVT::Other,
+    DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other,
                             getRoot(), getValue(I.getArgOperand(0))));
     return 0;
   }
@@ -4783,10 +4898,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SDValue ShOps[2];
     ShOps[0] = ShAmt;
     ShOps[1] = DAG.getConstant(0, MVT::i32);
-    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
-    EVT DestVT = TLI.getValueType(I.getType());
-    ShAmt = DAG.getNode(ISD::BITCAST, dl, DestVT, ShAmt);
-    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+    ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, &ShOps[0], 2);
+    EVT DestVT = TLI->getValueType(I.getType());
+    ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
+    Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
                        DAG.getConstant(NewIntrinsic, MVT::i32),
                        getValue(I.getArgOperand(0)), ShAmt);
     setValue(&I, Res);
@@ -4796,14 +4911,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::x86_avx_vinsertf128_ps_256:
   case Intrinsic::x86_avx_vinsertf128_si_256:
   case Intrinsic::x86_avx2_vinserti128: {
-    EVT DestVT = TLI.getValueType(I.getType());
-    EVT ElVT = TLI.getValueType(I.getArgOperand(1)->getType());
+    EVT DestVT = TLI->getValueType(I.getType());
+    EVT ElVT = TLI->getValueType(I.getArgOperand(1)->getType());
     uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(2))->getZExtValue() & 1) *
                    ElVT.getVectorNumElements();
-    Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, DestVT,
+    Res = DAG.getNode(ISD::INSERT_SUBVECTOR, sdl, DestVT,
                       getValue(I.getArgOperand(0)),
                       getValue(I.getArgOperand(1)),
-                      DAG.getIntPtrConstant(Idx));
+                      DAG.getConstant(Idx, TLI->getVectorIdxTy()));
     setValue(&I, Res);
     return 0;
   }
@@ -4811,12 +4926,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::x86_avx_vextractf128_ps_256:
   case Intrinsic::x86_avx_vextractf128_si_256:
   case Intrinsic::x86_avx2_vextracti128: {
-    EVT DestVT = TLI.getValueType(I.getType());
+    EVT DestVT = TLI->getValueType(I.getType());
     uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) *
                    DestVT.getVectorNumElements();
-    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, sdl, DestVT,
                       getValue(I.getArgOperand(0)),
-                      DAG.getIntPtrConstant(Idx));
+                      DAG.getConstant(Idx, TLI->getVectorIdxTy()));
     setValue(&I, Res);
     return 0;
   }
@@ -4842,9 +4957,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     case Intrinsic::convertus:  Code = ISD::CVT_US; break;
     case Intrinsic::convertuu:  Code = ISD::CVT_UU; break;
     }
-    EVT DestVT = TLI.getValueType(I.getType());
+    EVT DestVT = TLI->getValueType(I.getType());
     const Value *Op1 = I.getArgOperand(0);
-    Res = DAG.getConvertRndSat(DestVT, dl, getValue(Op1),
+    Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1),
                                DAG.getValueType(DestVT),
                                DAG.getValueType(getValue(Op1).getValueType()),
                                getValue(I.getArgOperand(1)),
@@ -4854,27 +4969,27 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return 0;
   }
   case Intrinsic::powi:
-    setValue(&I, ExpandPowI(dl, getValue(I.getArgOperand(0)),
+    setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
                             getValue(I.getArgOperand(1)), DAG));
     return 0;
   case Intrinsic::log:
-    setValue(&I, expandLog(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
     return 0;
   case Intrinsic::log2:
-    setValue(&I, expandLog2(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
     return 0;
   case Intrinsic::log10:
-    setValue(&I, expandLog10(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
     return 0;
   case Intrinsic::exp:
-    setValue(&I, expandExp(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
     return 0;
   case Intrinsic::exp2:
-    setValue(&I, expandExp2(dl, getValue(I.getArgOperand(0)), DAG, TLI));
+    setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, *TLI));
     return 0;
   case Intrinsic::pow:
-    setValue(&I, expandPow(dl, getValue(I.getArgOperand(0)),
-                           getValue(I.getArgOperand(1)), DAG, TLI));
+    setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)),
+                           getValue(I.getArgOperand(1)), DAG, *TLI));
     return 0;
   case Intrinsic::sqrt:
   case Intrinsic::fabs:
@@ -4884,7 +4999,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::ceil:
   case Intrinsic::trunc:
   case Intrinsic::rint:
-  case Intrinsic::nearbyint: {
+  case Intrinsic::nearbyint:
+  case Intrinsic::round: {
     unsigned Opcode;
     switch (Intrinsic) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -4897,35 +5013,42 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
     case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
     case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+    case Intrinsic::round:     Opcode = ISD::FROUND;     break;
     }
 
-    setValue(&I, DAG.getNode(Opcode, dl,
+    setValue(&I, DAG.getNode(Opcode, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
                              getValue(I.getArgOperand(0))));
     return 0;
   }
+  case Intrinsic::copysign:
+    setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
+                             getValue(I.getArgOperand(0)).getValueType(),
+                             getValue(I.getArgOperand(0)),
+                             getValue(I.getArgOperand(1))));
+    return 0;
   case Intrinsic::fma:
-    setValue(&I, DAG.getNode(ISD::FMA, dl,
+    setValue(&I, DAG.getNode(ISD::FMA, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
                              getValue(I.getArgOperand(0)),
                              getValue(I.getArgOperand(1)),
                              getValue(I.getArgOperand(2))));
     return 0;
   case Intrinsic::fmuladd: {
-    EVT VT = TLI.getValueType(I.getType());
+    EVT VT = TLI->getValueType(I.getType());
     if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
-        TLI.isFMAFasterThanMulAndAdd(VT)){
-      setValue(&I, DAG.getNode(ISD::FMA, dl,
+        TLI->isFMAFasterThanFMulAndFAdd(VT)) {
+      setValue(&I, DAG.getNode(ISD::FMA, sdl,
                                getValue(I.getArgOperand(0)).getValueType(),
                                getValue(I.getArgOperand(0)),
                                getValue(I.getArgOperand(1)),
                                getValue(I.getArgOperand(2))));
     } else {
-      SDValue Mul = DAG.getNode(ISD::FMUL, dl,
+      SDValue Mul = DAG.getNode(ISD::FMUL, sdl,
                                 getValue(I.getArgOperand(0)).getValueType(),
                                 getValue(I.getArgOperand(0)),
                                 getValue(I.getArgOperand(1)));
-      SDValue Add = DAG.getNode(ISD::FADD, dl,
+      SDValue Add = DAG.getNode(ISD::FADD, sdl,
                                 getValue(I.getArgOperand(0)).getValueType(),
                                 Mul,
                                 getValue(I.getArgOperand(2)));
@@ -4934,21 +5057,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return 0;
   }
   case Intrinsic::convert_to_fp16:
-    setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, dl,
+    setValue(&I, DAG.getNode(ISD::FP32_TO_FP16, sdl,
                              MVT::i16, getValue(I.getArgOperand(0))));
     return 0;
   case Intrinsic::convert_from_fp16:
-    setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, dl,
+    setValue(&I, DAG.getNode(ISD::FP16_TO_FP32, sdl,
                              MVT::f32, getValue(I.getArgOperand(0))));
     return 0;
   case Intrinsic::pcmarker: {
     SDValue Tmp = getValue(I.getArgOperand(0));
-    DAG.setRoot(DAG.getNode(ISD::PCMARKER, dl, MVT::Other, getRoot(), Tmp));
+    DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp));
     return 0;
   }
   case Intrinsic::readcyclecounter: {
     SDValue Op = getRoot();
-    Res = DAG.getNode(ISD::READCYCLECOUNTER, dl,
+    Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl,
                       DAG.getVTList(MVT::i64, MVT::Other),
                       &Op, 1);
     setValue(&I, Res);
@@ -4956,7 +5079,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     return 0;
   }
   case Intrinsic::bswap:
-    setValue(&I, DAG.getNode(ISD::BSWAP, dl,
+    setValue(&I, DAG.getNode(ISD::BSWAP, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
                              getValue(I.getArgOperand(0))));
     return 0;
@@ -4965,7 +5088,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
     EVT Ty = Arg.getValueType();
     setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF,
-                             dl, Ty, Arg));
+                             sdl, Ty, Arg));
     return 0;
   }
   case Intrinsic::ctlz: {
@@ -4973,33 +5096,33 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
     EVT Ty = Arg.getValueType();
     setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF,
-                             dl, Ty, Arg));
+                             sdl, Ty, Arg));
     return 0;
   }
   case Intrinsic::ctpop: {
     SDValue Arg = getValue(I.getArgOperand(0));
     EVT Ty = Arg.getValueType();
-    setValue(&I, DAG.getNode(ISD::CTPOP, dl, Ty, Arg));
+    setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
     return 0;
   }
   case Intrinsic::stacksave: {
     SDValue Op = getRoot();
-    Res = DAG.getNode(ISD::STACKSAVE, dl,
-                      DAG.getVTList(TLI.getPointerTy(), MVT::Other), &Op, 1);
+    Res = DAG.getNode(ISD::STACKSAVE, sdl,
+                      DAG.getVTList(TLI->getPointerTy(), MVT::Other), &Op, 1);
     setValue(&I, Res);
     DAG.setRoot(Res.getValue(1));
     return 0;
   }
   case Intrinsic::stackrestore: {
     Res = getValue(I.getArgOperand(0));
-    DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, dl, MVT::Other, getRoot(), Res));
+    DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res));
     return 0;
   }
   case Intrinsic::stackprotector: {
     // Emit code into the DAG to store the stack guard onto the stack.
     MachineFunction &MF = DAG.getMachineFunction();
     MachineFrameInfo *MFI = MF.getFrameInfo();
-    EVT PtrTy = TLI.getPointerTy();
+    EVT PtrTy = TLI->getPointerTy();
 
     SDValue Src = getValue(I.getArgOperand(0));   // The guard's value.
     AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
@@ -5010,7 +5133,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
 
     // Store the stack protector onto the stack.
-    Res = DAG.getStore(getRoot(), dl, Src, FIN,
+    Res = DAG.getStore(getRoot(), sdl, Src, FIN,
                        MachinePointerInfo::getFixedStack(FI),
                        true, false, 0);
     setValue(&I, Res);
@@ -5054,14 +5177,14 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
     Ops[5] = DAG.getSrcValue(F);
 
-    Res = DAG.getNode(ISD::INIT_TRAMPOLINE, dl, MVT::Other, Ops, 6);
+    Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops, 6);
 
     DAG.setRoot(Res);
     return 0;
   }
   case Intrinsic::adjust_trampoline: {
-    setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, dl,
-                             TLI.getPointerTy(),
+    setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
+                             TLI->getPointerTy(),
                              getValue(I.getArgOperand(0))));
     return 0;
   }
@@ -5078,7 +5201,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::gcwrite:
     llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
   case Intrinsic::flt_rounds:
-    setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, dl, MVT::i32));
+    setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32));
     return 0;
 
   case Intrinsic::expect: {
@@ -5091,9 +5214,9 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::trap: {
     StringRef TrapFuncName = TM.Options.getTrapFunctionName();
     if (TrapFuncName.empty()) {
-      ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ? 
+      ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
         ISD::TRAP : ISD::DEBUGTRAP;
-      DAG.setRoot(DAG.getNode(Op, dl,MVT::Other, getRoot()));
+      DAG.setRoot(DAG.getNode(Op, sdl,MVT::Other, getRoot()));
       return 0;
     }
     TargetLowering::ArgListTy Args;
@@ -5102,9 +5225,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
                  false, false, false, false, 0, CallingConv::C,
                  /*isTailCall=*/false,
                  /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
-                 DAG.getExternalSymbol(TrapFuncName.data(), TLI.getPointerTy()),
-                 Args, DAG, dl);
-    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+                 DAG.getExternalSymbol(TrapFuncName.data(),
+                                       TLI->getPointerTy()),
+                 Args, DAG, sdl);
+    std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
     DAG.setRoot(Result.second);
     return 0;
   }
@@ -5129,7 +5253,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SDValue Op2 = getValue(I.getArgOperand(1));
 
     SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
-    setValue(&I, DAG.getNode(Op, dl, VTs, Op1, Op2));
+    setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
     return 0;
   }
   case Intrinsic::prefetch: {
@@ -5140,7 +5264,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     Ops[2] = getValue(I.getArgOperand(1));
     Ops[3] = getValue(I.getArgOperand(2));
     Ops[4] = getValue(I.getArgOperand(3));
-    DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, dl,
+    DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
                                         DAG.getVTList(MVT::Other),
                                         &Ops[0], 5,
                                         EVT::getIntegerVT(*Context, 8),
@@ -5161,8 +5285,8 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     SmallVector<Value *, 4> Allocas;
     GetUnderlyingObjects(I.getArgOperand(1), Allocas, TD);
 
-    for (SmallVector<Value*, 4>::iterator Object = Allocas.begin(),
-         E = Allocas.end(); Object != E; ++Object) {
+    for (SmallVectorImpl<Value*>::iterator Object = Allocas.begin(),
+           E = Allocas.end(); Object != E; ++Object) {
       AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(*Object);
 
       // Could not find an Alloca.
@@ -5173,24 +5297,45 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
 
       SDValue Ops[2];
       Ops[0] = getRoot();
-      Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true);
+      Ops[1] = DAG.getFrameIndex(FI, TLI->getPointerTy(), true);
       unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
 
-      Res = DAG.getNode(Opcode, dl, MVT::Other, Ops, 2);
+      Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops, 2);
       DAG.setRoot(Res);
     }
     return 0;
   }
   case Intrinsic::invariant_start:
     // Discard region information.
-    setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
+    setValue(&I, DAG.getUNDEF(TLI->getPointerTy()));
     return 0;
   case Intrinsic::invariant_end:
     // Discard region information.
     return 0;
+  case Intrinsic::stackprotectorcheck: {
+    // Do not actually emit anything for this basic block. Instead we initialize
+    // the stack protector descriptor and export the guard variable so we can
+    // access it in FinishBasicBlock.
+    const BasicBlock *BB = I.getParent();
+    SPDescriptor.initialize(BB, FuncInfo.MBBMap[BB], I);
+    ExportFromCurrentBlock(SPDescriptor.getGuard());
+
+    // Flush our exports since we are going to process a terminator.
+    (void)getControlRoot();
+    return 0;
+  }
   case Intrinsic::donothing:
     // ignore
     return 0;
+  case Intrinsic::experimental_stackmap: {
+    visitStackmap(I);
+    return 0;
+  }
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64: {
+    visitPatchpoint(I);
+    return 0;
+  }
   }
 }
 
@@ -5209,26 +5354,27 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
 
   // Check whether the function can return without sret-demotion.
   SmallVector<ISD::OutputArg, 4> Outs;
-  GetReturnInfo(RetTy, CS.getAttributes(), Outs, TLI);
+  const TargetLowering *TLI = TM.getTargetLowering();
+  GetReturnInfo(RetTy, CS.getAttributes(), Outs, *TLI);
 
-  bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
-                                           DAG.getMachineFunction(),
-                                           FTy->isVarArg(), Outs,
-                                           FTy->getContext());
+  bool CanLowerReturn = TLI->CanLowerReturn(CS.getCallingConv(),
+                                            DAG.getMachineFunction(),
+                                            FTy->isVarArg(), Outs,
+                                            FTy->getContext());
 
   SDValue DemoteStackSlot;
   int DemoteStackIdx = -100;
 
   if (!CanLowerReturn) {
-    uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(
+    uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(
                       FTy->getReturnType());
-    unsigned Align  = TLI.getDataLayout()->getPrefTypeAlignment(
+    unsigned Align  = TLI->getDataLayout()->getPrefTypeAlignment(
                       FTy->getReturnType());
     MachineFunction &MF = DAG.getMachineFunction();
     DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
     Type *StackSlotPtrType = PointerType::getUnqual(FTy->getReturnType());
 
-    DemoteStackSlot = DAG.getFrameIndex(DemoteStackIdx, TLI.getPointerTy());
+    DemoteStackSlot = DAG.getFrameIndex(DemoteStackIdx, TLI->getPointerTy());
     Entry.Node = DemoteStackSlot;
     Entry.Ty = StackSlotPtrType;
     Entry.isSExt = false;
@@ -5254,15 +5400,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
     SDValue ArgNode = getValue(V);
     Entry.Node = ArgNode; Entry.Ty = V->getType();
 
-    unsigned attrInd = i - CS.arg_begin() + 1;
-    Entry.isSExt     = CS.paramHasAttr(attrInd, Attribute::SExt);
-    Entry.isZExt     = CS.paramHasAttr(attrInd, Attribute::ZExt);
-    Entry.isInReg    = CS.paramHasAttr(attrInd, Attribute::InReg);
-    Entry.isSRet     = CS.paramHasAttr(attrInd, Attribute::StructRet);
-    Entry.isNest     = CS.paramHasAttr(attrInd, Attribute::Nest);
-    Entry.isByVal    = CS.paramHasAttr(attrInd, Attribute::ByVal);
-    Entry.isReturned = CS.paramHasAttr(attrInd, Attribute::Returned);
-    Entry.Alignment  = CS.getParamAlignment(attrInd);
+    // Skip the first return-type Attribute to get to params.
+    Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
     Args.push_back(Entry);
   }
 
@@ -5285,18 +5424,18 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
     // Both PendingLoads and PendingExports must be flushed here;
     // this call might not return.
     (void)getRoot();
-    DAG.setRoot(DAG.getEHLabel(getCurDebugLoc(), getControlRoot(), BeginLabel));
+    DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel));
   }
 
   // Check if target-independent constraints permit a tail call here.
-  // Target-dependent constraints are checked within TLI.LowerCallTo.
-  if (isTailCall && !isInTailCallPosition(CS, TLI))
+  // Target-dependent constraints are checked within TLI->LowerCallTo.
+  if (isTailCall && !isInTailCallPosition(CS, *TLI))
     isTailCall = false;
 
   TargetLowering::
   CallLoweringInfo CLI(getRoot(), RetTy, FTy, isTailCall, Callee, Args, DAG,
-                       getCurDebugLoc(), CS);
-  std::pair<SDValue,SDValue> Result = TLI.LowerCallTo(CLI);
+                       getCurSDLoc(), CS);
+  std::pair<SDValue,SDValue> Result = TLI->LowerCallTo(CLI);
   assert((isTailCall || Result.second.getNode()) &&
          "Non-null chain expected with non-tail call!");
   assert((Result.second.getNode() || !Result.first.getNode()) &&
@@ -5309,59 +5448,57 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
     SmallVector<EVT, 1> PVTs;
     Type *PtrRetTy = PointerType::getUnqual(FTy->getReturnType());
 
-    ComputeValueVTs(TLI, PtrRetTy, PVTs);
+    ComputeValueVTs(*TLI, PtrRetTy, PVTs);
     assert(PVTs.size() == 1 && "Pointers should fit in one register");
     EVT PtrVT = PVTs[0];
 
     SmallVector<EVT, 4> RetTys;
     SmallVector<uint64_t, 4> Offsets;
     RetTy = FTy->getReturnType();
-    ComputeValueVTs(TLI, RetTy, RetTys, &Offsets);
+    ComputeValueVTs(*TLI, RetTy, RetTys, &Offsets);
 
     unsigned NumValues = RetTys.size();
     SmallVector<SDValue, 4> Values(NumValues);
     SmallVector<SDValue, 4> Chains(NumValues);
 
     for (unsigned i = 0; i < NumValues; ++i) {
-      SDValue Add = DAG.getNode(ISD::ADD, getCurDebugLoc(), PtrVT,
+      SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(), PtrVT,
                                 DemoteStackSlot,
                                 DAG.getConstant(Offsets[i], PtrVT));
-      SDValue L = DAG.getLoad(RetTys[i], getCurDebugLoc(), Result.second, Add,
+      SDValue L = DAG.getLoad(RetTys[i], getCurSDLoc(), Result.second, Add,
                   MachinePointerInfo::getFixedStack(DemoteStackIdx, Offsets[i]),
                               false, false, false, 1);
       Values[i] = L;
       Chains[i] = L.getValue(1);
     }
 
-    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(),
+    SDValue Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
                                 MVT::Other, &Chains[0], NumValues);
     PendingLoads.push_back(Chain);
 
     setValue(CS.getInstruction(),
-             DAG.getNode(ISD::MERGE_VALUES, getCurDebugLoc(),
+             DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                          DAG.getVTList(&RetTys[0], RetTys.size()),
                          &Values[0], Values.size()));
   }
 
-  // Assign order to nodes here. If the call does not produce a result, it won't
-  // be mapped to a SDNode and visit() will not assign it an order number.
   if (!Result.second.getNode()) {
-    // As a special case, a null chain means that a tail call has been emitted and
-    // the DAG root is already updated.
+    // As a special case, a null chain means that a tail call has been emitted
+    // and the DAG root is already updated.
     HasTailCall = true;
-    ++SDNodeOrder;
-    AssignOrderingToNode(DAG.getRoot().getNode());
+
+    // Since there's no actual continuation from this block, nothing can be
+    // relying on us setting vregs for them.
+    PendingExports.clear();
   } else {
     DAG.setRoot(Result.second);
-    ++SDNodeOrder;
-    AssignOrderingToNode(Result.second.getNode());
   }
 
   if (LandingPad) {
     // Insert a label at the end of the invoke call to mark the try range.  This
     // can be used to detect deletion of the invoke via the MachineModuleInfo.
     MCSymbol *EndLabel = MMI.getContext().CreateTempSymbol();
-    DAG.setRoot(DAG.getEHLabel(getCurDebugLoc(), getRoot(), EndLabel));
+    DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel));
 
     // Inform MachineModuleInfo of range.
     MMI.addInvoke(LandingPad, BeginLabel, EndLabel);
@@ -5416,10 +5553,10 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
   }
 
   SDValue Ptr = Builder.getValue(PtrVal);
-  SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurDebugLoc(), Root,
+  SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root,
                                         Ptr, MachinePointerInfo(PtrVal),
                                         false /*volatile*/,
-                                        false /*nontemporal*/, 
+                                        false /*nontemporal*/,
                                         false /*isinvariant*/, 1 /* align=1 */);
 
   if (!ConstantMemory)
@@ -5427,6 +5564,18 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
   return LoadVal;
 }
 
+/// processIntegerCallValue - Record the value for an instruction that
+/// produces an integer result, converting the type where necessary.
+void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
+                                                  SDValue Value,
+                                                  bool IsSigned) {
+  EVT VT = TM.getTargetLowering()->getValueType(I.getType(), true);
+  if (IsSigned)
+    Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
+  else
+    Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
+  setValue(&I, Value);
+}
 
 /// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
 /// If so, return true and lower it, otherwise return false and it will be
@@ -5442,15 +5591,33 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
       !I.getType()->isIntegerTy())
     return false;
 
-  const ConstantInt *Size = dyn_cast<ConstantInt>(I.getArgOperand(2));
+  const Value *Size = I.getArgOperand(2);
+  const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
+  if (CSize && CSize->getZExtValue() == 0) {
+    EVT CallVT = TM.getTargetLowering()->getValueType(I.getType(), true);
+    setValue(&I, DAG.getConstant(0, CallVT));
+    return true;
+  }
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(LHS), getValue(RHS), getValue(Size),
+                                MachinePointerInfo(LHS),
+                                MachinePointerInfo(RHS));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, true);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
 
   // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
   // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
-  if (Size && IsOnlyUsedInZeroEqualityComparison(&I)) {
+  if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
     bool ActuallyDoIt = true;
     MVT LoadVT;
     Type *LoadTy;
-    switch (Size->getZExtValue()) {
+    switch (CSize->getZExtValue()) {
     default:
       LoadVT = MVT::Other;
       LoadTy = 0;
@@ -5458,20 +5625,20 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
       break;
     case 2:
       LoadVT = MVT::i16;
-      LoadTy = Type::getInt16Ty(Size->getContext());
+      LoadTy = Type::getInt16Ty(CSize->getContext());
       break;
     case 4:
       LoadVT = MVT::i32;
-      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = Type::getInt32Ty(CSize->getContext());
       break;
     case 8:
       LoadVT = MVT::i64;
-      LoadTy = Type::getInt64Ty(Size->getContext());
+      LoadTy = Type::getInt64Ty(CSize->getContext());
       break;
         /*
     case 16:
       LoadVT = MVT::v4i32;
-      LoadTy = Type::getInt32Ty(Size->getContext());
+      LoadTy = Type::getInt32Ty(CSize->getContext());
       LoadTy = VectorType::get(LoadTy, 4);
       break;
          */
@@ -5484,10 +5651,11 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
     // Require that we can find a legal MVT, and only do this if the target
     // supports unaligned loads of that type.  Expanding into byte loads would
     // bloat the code.
-    if (ActuallyDoIt && Size->getZExtValue() > 4) {
+    const TargetLowering *TLI = TM.getTargetLowering();
+    if (ActuallyDoIt && CSize->getZExtValue() > 4) {
       // TODO: Handle 5 byte compare as 4-byte + 1 byte.
       // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
-      if (!TLI.isTypeLegal(LoadVT) ||!TLI.allowsUnalignedMemoryAccesses(LoadVT))
+      if (!TLI->isTypeLegal(LoadVT) ||!TLI->allowsUnalignedMemoryAccesses(LoadVT))
         ActuallyDoIt = false;
     }
 
@@ -5495,10 +5663,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
       SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
       SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
 
-      SDValue Res = DAG.getSetCC(getCurDebugLoc(), MVT::i1, LHSVal, RHSVal,
+      SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
                                  ISD::SETNE);
-      EVT CallVT = TLI.getValueType(I.getType(), true);
-      setValue(&I, DAG.getZExtOrTrunc(Res, getCurDebugLoc(), CallVT));
+      processIntegerCallValue(I, Res, false);
       return true;
     }
   }
@@ -5507,6 +5674,148 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   return false;
 }
 
+/// visitMemChrCall -- See if we can lower a memchr call into an optimized
+/// form.  If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  void *memchr(void *, int, size_t)
+  if (I.getNumArgOperands() != 3)
+    return false;
+
+  const Value *Src = I.getArgOperand(0);
+  const Value *Char = I.getArgOperand(1);
+  const Value *Length = I.getArgOperand(2);
+  if (!Src->getType()->isPointerTy() ||
+      !Char->getType()->isIntegerTy() ||
+      !Length->getType()->isIntegerTy() ||
+      !I.getType()->isPointerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(Src), getValue(Char), getValue(Length),
+                                MachinePointerInfo(Src));
+  if (Res.first.getNode()) {
+    setValue(&I, Res.first);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an
+/// optimized form.  If so, return true and lower it, otherwise return false
+/// and it will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
+  // Verify that the prototype makes sense.  char *strcpy(char *, char *)
+  if (I.getNumArgOperands() != 2)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+  if (!Arg0->getType()->isPointerTy() ||
+      !Arg1->getType()->isPointerTy() ||
+      !I.getType()->isPointerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
+                                getValue(Arg0), getValue(Arg1),
+                                MachinePointerInfo(Arg0),
+                                MachinePointerInfo(Arg1), isStpcpy);
+  if (Res.first.getNode()) {
+    setValue(&I, Res.first);
+    DAG.setRoot(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form.
+/// If so, return true and lower it, otherwise return false and it will be
+/// lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  int strcmp(void*,void*)
+  if (I.getNumArgOperands() != 2)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+  if (!Arg0->getType()->isPointerTy() ||
+      !Arg1->getType()->isPointerTy() ||
+      !I.getType()->isIntegerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(Arg0), getValue(Arg1),
+                                MachinePointerInfo(Arg0),
+                                MachinePointerInfo(Arg1));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, true);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrLenCall -- See if we can lower a strlen call into an optimized
+/// form.  If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  size_t strlen(char *)
+  if (I.getNumArgOperands() != 1)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0);
+  if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
+                                getValue(Arg0), MachinePointerInfo(Arg0));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, false);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
+/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized
+/// form.  If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
+  // Verify that the prototype makes sense.  size_t strnlen(char *, size_t)
+  if (I.getNumArgOperands() != 2)
+    return false;
+
+  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+  if (!Arg0->getType()->isPointerTy() ||
+      !Arg1->getType()->isIntegerTy() ||
+      !I.getType()->isIntegerTy())
+    return false;
+
+  const TargetSelectionDAGInfo &TSI = DAG.getSelectionDAGInfo();
+  std::pair<SDValue, SDValue> Res =
+    TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
+                                 getValue(Arg0), getValue(Arg1),
+                                 MachinePointerInfo(Arg0));
+  if (Res.first.getNode()) {
+    processIntegerCallValue(I, Res.first, false);
+    PendingLoads.push_back(Res.second);
+    return true;
+  }
+
+  return false;
+}
+
 /// visitUnaryFloatCall - If a call instruction is a unary floating-point
 /// operation (as expected), translate it to an SDNode with the specified opcode
 /// and return true.
@@ -5520,7 +5829,7 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
     return false;
 
   SDValue Tmp = getValue(I.getArgOperand(0));
-  setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(), Tmp.getValueType(), Tmp));
+  setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp));
   return true;
 }
 
@@ -5569,7 +5878,7 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
             I.onlyReadsMemory()) {
           SDValue LHS = getValue(I.getArgOperand(0));
           SDValue RHS = getValue(I.getArgOperand(1));
-          setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(),
+          setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
                                    LHS.getValueType(), LHS, RHS));
           return;
         }
@@ -5595,6 +5904,9 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
       case LibFunc::sqrt:
       case LibFunc::sqrtf:
       case LibFunc::sqrtl:
+      case LibFunc::sqrt_finite:
+      case LibFunc::sqrtf_finite:
+      case LibFunc::sqrtl_finite:
         if (visitUnaryFloatCall(I, ISD::FSQRT))
           return;
         break;
@@ -5622,6 +5934,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
         if (visitUnaryFloatCall(I, ISD::FRINT))
           return;
         break;
+      case LibFunc::round:
+      case LibFunc::roundf:
+      case LibFunc::roundl:
+        if (visitUnaryFloatCall(I, ISD::FROUND))
+          return;
+        break;
       case LibFunc::trunc:
       case LibFunc::truncf:
       case LibFunc::truncl:
@@ -5644,6 +5962,30 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
         if (visitMemCmpCall(I))
           return;
         break;
+      case LibFunc::memchr:
+        if (visitMemChrCall(I))
+          return;
+        break;
+      case LibFunc::strcpy:
+        if (visitStrCpyCall(I, false))
+          return;
+        break;
+      case LibFunc::stpcpy:
+        if (visitStrCpyCall(I, true))
+          return;
+        break;
+      case LibFunc::strcmp:
+        if (visitStrCmpCall(I))
+          return;
+        break;
+      case LibFunc::strlen:
+        if (visitStrLenCall(I))
+          return;
+        break;
+      case LibFunc::strnlen:
+        if (visitStrNLenCall(I))
+          return;
+        break;
       }
     }
   }
@@ -5652,7 +5994,8 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
   if (!RenameFn)
     Callee = getValue(I.getCalledValue());
   else
-    Callee = DAG.getExternalSymbol(RenameFn, TLI.getPointerTy());
+    Callee = DAG.getExternalSymbol(RenameFn,
+                                   TM.getTargetLowering()->getPointerTy());
 
   // Check if we can potentially perform a tail call. More detailed checking is
   // be done within LowerCallTo, after more information about the call is known.
@@ -5741,7 +6084,7 @@ typedef SmallVector<SDISelAsmOperandInfo,16> SDISelAsmOperandInfoVector;
 ///
 static void GetRegistersForValue(SelectionDAG &DAG,
                                  const TargetLowering &TLI,
-                                 DebugLoc DL,
+                                 SDLoc DL,
                                  SDISelAsmOperandInfo &OpInfo) {
   LLVMContext &Context = *DAG.getContext();
 
@@ -5847,8 +6190,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   /// ConstraintOperands - Information about all of the constraints.
   SDISelAsmOperandInfoVector ConstraintOperands;
 
+  const TargetLowering *TLI = TM.getTargetLowering();
   TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI.ParseConstraints(CS);
+    TargetConstraints = TLI->ParseConstraints(CS);
 
   bool hasMemory = false;
 
@@ -5873,10 +6217,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
       // corresponding argument.
       assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
       if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
-        OpVT = TLI.getSimpleValueType(STy->getElementType(ResNo));
+        OpVT = TLI->getSimpleValueType(STy->getElementType(ResNo));
       } else {
         assert(ResNo == 0 && "Asm only has one result!");
-        OpVT = TLI.getSimpleValueType(CS.getType());
+        OpVT = TLI->getSimpleValueType(CS.getType());
       }
       ++ResNo;
       break;
@@ -5897,7 +6241,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
         OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
       }
 
-      OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI, TD).
+      OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), *TLI, TD).
         getSimpleVT();
     }
 
@@ -5909,7 +6253,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
     else {
       for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) {
         TargetLowering::ConstraintType
-          CType = TLI.getConstraintType(OpInfo.Codes[j]);
+          CType = TLI->getConstraintType(OpInfo.Codes[j]);
         if (CType == TargetLowering::C_Memory) {
           hasMemory = true;
           break;
@@ -5941,11 +6285,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
       if (OpInfo.ConstraintVT != Input.ConstraintVT) {
         std::pair<unsigned, const TargetRegisterClass*> MatchRC =
-          TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
-                                           OpInfo.ConstraintVT);
+          TLI->getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
+                                            OpInfo.ConstraintVT);
         std::pair<unsigned, const TargetRegisterClass*> InputRC =
-          TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
-                                           Input.ConstraintVT);
+          TLI->getRegForInlineAsmConstraint(Input.ConstraintCode,
+                                            Input.ConstraintVT);
         if ((OpInfo.ConstraintVT.isInteger() !=
              Input.ConstraintVT.isInteger()) ||
             (MatchRC.second != InputRC.second)) {
@@ -5958,7 +6302,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
     }
 
     // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
+    TLI->ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
 
     if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
         OpInfo.Type == InlineAsm::isClobber)
@@ -5986,17 +6330,17 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
       if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
           isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
         OpInfo.CallOperand = DAG.getConstantPool(cast<Constant>(OpVal),
-                                                 TLI.getPointerTy());
+                                                 TLI->getPointerTy());
       } else {
         // Otherwise, create a stack slot and emit a store to it before the
         // asm.
         Type *Ty = OpVal->getType();
-        uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
-        unsigned Align  = TLI.getDataLayout()->getPrefTypeAlignment(Ty);
+        uint64_t TySize = TLI->getDataLayout()->getTypeAllocSize(Ty);
+        unsigned Align  = TLI->getDataLayout()->getPrefTypeAlignment(Ty);
         MachineFunction &MF = DAG.getMachineFunction();
         int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
-        SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy());
-        Chain = DAG.getStore(Chain, getCurDebugLoc(),
+        SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI->getPointerTy());
+        Chain = DAG.getStore(Chain, getCurSDLoc(),
                              OpInfo.CallOperand, StackSlot,
                              MachinePointerInfo::getFixedStack(SSFI),
                              false, false, 0);
@@ -6013,7 +6357,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
     // If this constraint is for a specific register, allocate it before
     // anything else.
     if (OpInfo.ConstraintType == TargetLowering::C_Register)
-      GetRegistersForValue(DAG, TLI, getCurDebugLoc(), OpInfo);
+      GetRegistersForValue(DAG, *TLI, getCurSDLoc(), OpInfo);
   }
 
   // Second pass - Loop over all of the operands, assigning virtual or physregs
@@ -6024,7 +6368,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
     // C_Register operands have already been allocated, Other/Memory don't need
     // to be.
     if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass)
-      GetRegistersForValue(DAG, TLI, getCurDebugLoc(), OpInfo);
+      GetRegistersForValue(DAG, *TLI, getCurSDLoc(), OpInfo);
   }
 
   // AsmNodeOperands - The operands for the ISD::INLINEASM node.
@@ -6032,7 +6376,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   AsmNodeOperands.push_back(SDValue());  // reserve space for input chain
   AsmNodeOperands.push_back(
           DAG.getTargetExternalSymbol(IA->getAsmString().c_str(),
-                                      TLI.getPointerTy()));
+                                      TLI->getPointerTy()));
 
   // If we have a !srcloc metadata node associated with it, we want to attach
   // this to the ultimately generated inline asm machineinstr.  To do this, we
@@ -6055,7 +6399,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
 
     // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, SDValue());
+    TLI->ComputeConstraintToUse(OpInfo, SDValue());
 
     // Ideally, we would only check against memory constraints.  However, the
     // meaning of an other constraint can be target-specific and we can't easily
@@ -6073,7 +6417,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   }
 
   AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo,
-                                                  TLI.getPointerTy()));
+                                                  TLI->getPointerTy()));
 
   // Loop over all of the inputs, copying the operand values into the
   // appropriate registers and processing the output regs.
@@ -6095,7 +6439,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
         // Add information to the INLINEASM node to know about this output.
         unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
         AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags,
-                                                        TLI.getPointerTy()));
+                                                        TLI->getPointerTy()));
         AsmNodeOperands.push_back(OpInfo.CallOperand);
         break;
       }
@@ -6106,10 +6450,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
       // we can use.
       if (OpInfo.AssignedRegs.Regs.empty()) {
         LLVMContext &Ctx = *DAG.getContext();
-        Ctx.emitError(CS.getInstruction(),  
+        Ctx.emitError(CS.getInstruction(),
                       "couldn't allocate output register for constraint '" +
-                           Twine(OpInfo.ConstraintCode) + "'");
-        break;
+                          Twine(OpInfo.ConstraintCode) + "'");
+        return;
       }
 
       // If this is an indirect operand, store through the pointer after the
@@ -6126,13 +6470,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
       // Add information to the INLINEASM node to know that this register is
       // set.
-      OpInfo.AssignedRegs.AddInlineAsmOperands(OpInfo.isEarlyClobber ?
-                                           InlineAsm::Kind_RegDefEarlyClobber :
-                                               InlineAsm::Kind_RegDef,
-                                               false,
-                                               0,
-                                               DAG,
-                                               AsmNodeOperands);
+      OpInfo.AssignedRegs
+          .AddInlineAsmOperands(OpInfo.isEarlyClobber
+                                    ? InlineAsm::Kind_RegDefEarlyClobber
+                                    : InlineAsm::Kind_RegDef,
+                                false, 0, DAG, AsmNodeOperands);
       break;
     }
     case InlineAsm::isInput: {
@@ -6164,10 +6506,10 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
           if (OpInfo.isIndirect) {
             // This happens on gcc/testsuite/gcc.dg/pr8788-1.c
             LLVMContext &Ctx = *DAG.getContext();
-            Ctx.emitError(CS.getInstruction(),  "inline asm not supported yet:"
-                          " don't know how to handle tied "
-                          "indirect register inputs");
-            report_fatal_error("Cannot handle indirect register inputs!");
+            Ctx.emitError(CS.getInstruction(), "inline asm not supported yet:"
+                                               " don't know how to handle tied "
+                                               "indirect register inputs");
+            return;
           }
 
           RegsForValue MatchedRegs;
@@ -6177,18 +6519,18 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
           MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
           for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag);
                i != e; ++i) {
-            if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT))
+            if (const TargetRegisterClass *RC = TLI->getRegClassFor(RegVT))
               MatchedRegs.Regs.push_back(RegInfo.createVirtualRegister(RC));
             else {
               LLVMContext &Ctx = *DAG.getContext();
-              Ctx.emitError(CS.getInstruction(), "inline asm error: This value"
+              Ctx.emitError(CS.getInstruction(),
+                            "inline asm error: This value"
                             " type register class is not natively supported!");
-              report_fatal_error("inline asm error: This value type register "
-                                 "class is not natively supported!");
+              return;
             }
           }
           // Use the produced MatchedRegs object to
-          MatchedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
+          MatchedRegs.getCopyToRegs(InOperandVal, DAG, getCurSDLoc(),
                                     Chain, &Flag, CS.getInstruction());
           MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
                                            true, OpInfo.getMatchedOperand(),
@@ -6204,7 +6546,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
         OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
                                                     OpInfo.getMatchedOperand());
         AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlag,
-                                                        TLI.getPointerTy()));
+                                                        TLI->getPointerTy()));
         AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
         break;
       }
@@ -6216,34 +6558,34 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
 
       if (OpInfo.ConstraintType == TargetLowering::C_Other) {
         std::vector<SDValue> Ops;
-        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
-                                         Ops, DAG);
+        TLI->LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
+                                          Ops, DAG);
         if (Ops.empty()) {
           LLVMContext &Ctx = *DAG.getContext();
           Ctx.emitError(CS.getInstruction(),
                         "invalid operand for inline asm constraint '" +
-                        Twine(OpInfo.ConstraintCode) + "'");
-          break;
+                            Twine(OpInfo.ConstraintCode) + "'");
+          return;
         }
 
         // Add information to the INLINEASM node to know about this input.
         unsigned ResOpType =
           InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
         AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
-                                                        TLI.getPointerTy()));
+                                                        TLI->getPointerTy()));
         AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
         break;
       }
 
       if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
         assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
-        assert(InOperandVal.getValueType() == TLI.getPointerTy() &&
+        assert(InOperandVal.getValueType() == TLI->getPointerTy() &&
                "Memory operands expect pointer values");
 
         // Add information to the INLINEASM node to know about this input.
         unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
         AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
-                                                        TLI.getPointerTy()));
+                                                        TLI->getPointerTy()));
         AsmNodeOperands.push_back(InOperandVal);
         break;
       }
@@ -6257,20 +6599,21 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
         LLVMContext &Ctx = *DAG.getContext();
         Ctx.emitError(CS.getInstruction(),
                       "Don't know how to handle indirect register inputs yet "
-                      "for constraint '" + Twine(OpInfo.ConstraintCode) + "'");
-        break;
+                      "for constraint '" +
+                          Twine(OpInfo.ConstraintCode) + "'");
+        return;
       }
 
       // Copy the input into the appropriate registers.
       if (OpInfo.AssignedRegs.Regs.empty()) {
         LLVMContext &Ctx = *DAG.getContext();
-        Ctx.emitError(CS.getInstruction(), 
+        Ctx.emitError(CS.getInstruction(),
                       "couldn't allocate input reg for constraint '" +
-                           Twine(OpInfo.ConstraintCode) + "'");
-        break;
+                          Twine(OpInfo.ConstraintCode) + "'");
+        return;
       }
 
-      OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
+      OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, getCurSDLoc(),
                                         Chain, &Flag, CS.getInstruction());
 
       OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
@@ -6293,7 +6636,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
   if (Flag.getNode()) AsmNodeOperands.push_back(Flag);
 
-  Chain = DAG.getNode(ISD::INLINEASM, getCurDebugLoc(),
+  Chain = DAG.getNode(ISD::INLINEASM, getCurSDLoc(),
                       DAG.getVTList(MVT::Other, MVT::Glue),
                       &AsmNodeOperands[0], AsmNodeOperands.size());
   Flag = Chain.getValue(1);
@@ -6301,12 +6644,12 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   // If this asm returns a register value, copy the result from that register
   // and set it as the value of the call.
   if (!RetValRegs.Regs.empty()) {
-    SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(),
+    SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
                                              Chain, &Flag, CS.getInstruction());
 
     // FIXME: Why don't we do this for inline asms with MRVs?
     if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
-      EVT ResultType = TLI.getValueType(CS.getType());
+      EVT ResultType = TLI->getValueType(CS.getType());
 
       // If any of the results of the inline asm is a vector, it may have the
       // wrong width/num elts.  This can happen for register classes that can
@@ -6314,7 +6657,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
       // not have the same VT as was expected.  Convert it to the right type
       // with bit_convert.
       if (ResultType != Val.getValueType() && Val.getValueType().isVector()) {
-        Val = DAG.getNode(ISD::BITCAST, getCurDebugLoc(),
+        Val = DAG.getNode(ISD::BITCAST, getCurSDLoc(),
                           ResultType, Val);
 
       } else if (ResultType != Val.getValueType() &&
@@ -6322,7 +6665,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
         // If a result value was tied to an input value, the computed result may
         // have a wider width than the expected result.  Extract the relevant
         // portion.
-        Val = DAG.getNode(ISD::TRUNCATE, getCurDebugLoc(), ResultType, Val);
+        Val = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultType, Val);
       }
 
       assert(ResultType == Val.getValueType() && "Asm result value mismatch!");
@@ -6341,7 +6684,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) {
     RegsForValue &OutRegs = IndirectStoresToEmit[i].first;
     const Value *Ptr = IndirectStoresToEmit[i].second;
-    SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(),
+    SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
                                              Chain, &Flag, IA);
     StoresToEmit.push_back(std::make_pair(OutVal, Ptr));
   }
@@ -6349,7 +6692,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   // Emit the non-flagged stores from the physregs.
   SmallVector<SDValue, 8> OutChains;
   for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i) {
-    SDValue Val = DAG.getStore(Chain, getCurDebugLoc(),
+    SDValue Val = DAG.getStore(Chain, getCurSDLoc(),
                                StoresToEmit[i].first,
                                getValue(StoresToEmit[i].second),
                                MachinePointerInfo(StoresToEmit[i].second),
@@ -6358,22 +6701,23 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
   }
 
   if (!OutChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, getCurDebugLoc(), MVT::Other,
+    Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
                         &OutChains[0], OutChains.size());
 
   DAG.setRoot(Chain);
 }
 
 void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
-  DAG.setRoot(DAG.getNode(ISD::VASTART, getCurDebugLoc(),
+  DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(),
                           MVT::Other, getRoot(),
                           getValue(I.getArgOperand(0)),
                           DAG.getSrcValue(I.getArgOperand(0))));
 }
 
 void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
-  const DataLayout &TD = *TLI.getDataLayout();
-  SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(),
+  const TargetLowering *TLI = TM.getTargetLowering();
+  const DataLayout &TD = *TLI->getDataLayout();
+  SDValue V = DAG.getVAArg(TLI->getValueType(I.getType()), getCurSDLoc(),
                            getRoot(), getValue(I.getOperand(0)),
                            DAG.getSrcValue(I.getOperand(0)),
                            TD.getABITypeAlignment(I.getType()));
@@ -6382,14 +6726,14 @@ void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
 }
 
 void SelectionDAGBuilder::visitVAEnd(const CallInst &I) {
-  DAG.setRoot(DAG.getNode(ISD::VAEND, getCurDebugLoc(),
+  DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(),
                           MVT::Other, getRoot(),
                           getValue(I.getArgOperand(0)),
                           DAG.getSrcValue(I.getArgOperand(0))));
 }
 
 void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
-  DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurDebugLoc(),
+  DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(),
                           MVT::Other, getRoot(),
                           getValue(I.getArgOperand(0)),
                           getValue(I.getArgOperand(1)),
@@ -6397,6 +6741,248 @@ void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
                           DAG.getSrcValue(I.getArgOperand(1))));
 }
 
+/// \brief Lower an argument list according to the target calling convention.
+///
+/// \return A tuple of <return-value, token-chain>
+///
+/// This is a helper for lowering intrinsics that follow a target calling
+/// convention or require stack pointer adjustment. Only a subset of the
+/// intrinsic's operands need to participate in the calling convention.
+std::pair<SDValue, SDValue>
+SelectionDAGBuilder::LowerCallOperands(const CallInst &CI, unsigned ArgIdx,
+                                       unsigned NumArgs, SDValue Callee,
+                                       bool useVoidTy) {
+  TargetLowering::ArgListTy Args;
+  Args.reserve(NumArgs);
+
+  // Populate the argument list.
+  // Attributes for args start at offset 1, after the return attribute.
+  ImmutableCallSite CS(&CI);
+  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+       ArgI != ArgE; ++ArgI) {
+    const Value *V = CI.getOperand(ArgI);
+
+    assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = getValue(V);
+    Entry.Ty = V->getType();
+    Entry.setAttributes(&CS, AttrI);
+    Args.push_back(Entry);
+  }
+
+  Type *retTy = useVoidTy ? Type::getVoidTy(*DAG.getContext()) : CI.getType();
+  TargetLowering::CallLoweringInfo CLI(getRoot(), retTy, /*retSExt*/ false,
+    /*retZExt*/ false, /*isVarArg*/ false, /*isInReg*/ false, NumArgs,
+    CI.getCallingConv(), /*isTailCall*/ false, /*doesNotReturn*/ false,
+    /*isReturnValueUsed*/ CI.use_empty(), Callee, Args, DAG, getCurSDLoc());
+
+  const TargetLowering *TLI = TM.getTargetLowering();
+  return TLI->LowerCallTo(CLI);
+}
+
+/// \brief Lower llvm.experimental.stackmap directly to its target opcode.
+void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
+  // void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
+  //                                  [live variables...])
+
+  assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");
+
+  SDValue Callee = getValue(CI.getCalledValue());
+
+  // Lower into a call sequence with no args and no return value.
+  std::pair<SDValue, SDValue> Result = LowerCallOperands(CI, 0, 0, Callee);
+  // Set the root to the target-lowered call chain.
+  SDValue Chain = Result.second;
+  DAG.setRoot(Chain);
+
+  /// Get a call instruction from the call sequence chain.
+  /// Tail calls are not allowed.
+  SDNode *CallEnd = Chain.getNode();
+  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+         "Expected a callseq node.");
+  SDNode *Call = CallEnd->getOperand(0).getNode();
+  bool hasGlue = Call->getGluedNode();
+
+  // Replace the target specific call node with the stackmap intrinsic.
+  SmallVector<SDValue, 8> Ops;
+
+  // Add the <id> and <numShadowBytes> constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    SDValue tmp = getValue(CI.getOperand(i));
+    Ops.push_back(DAG.getTargetConstant(
+        cast<ConstantSDNode>(tmp)->getZExtValue(), MVT::i32));
+  }
+  // Push live variables for the stack map.
+  for (unsigned i = 2, e = CI.getNumArgOperands(); i != e; ++i)
+    Ops.push_back(getValue(CI.getArgOperand(i)));
+
+  // Push the chain (this is originally the first operand of the call, but
+  // becomes now the last or second to last operand).
+  Ops.push_back(*(Call->op_begin()));
+
+    // Push the glue flag (last operand).
+  if (hasGlue)
+    Ops.push_back(*(Call->op_end()-1));
+
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // Replace the target specific call node with a STACKMAP node.
+  MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::STACKMAP, getCurSDLoc(),
+                                         NodeTys, Ops);
+
+  // StackMap generates no value, so nothing goes in the NodeMap.
+
+  // Fixup the consumers of the intrinsic. The chain and glue may be used in the
+  // call sequence.
+  DAG.ReplaceAllUsesWith(Call, MN);
+
+  DAG.DeleteNode(Call);
+}
+
+/// \brief Lower llvm.experimental.patchpoint directly to its target opcode.
+void SelectionDAGBuilder::visitPatchpoint(const CallInst &CI) {
+  // void|i64 @llvm.experimental.patchpoint.void|i64(i32 <id>,
+  //                                                 i32 <numBytes>,
+  //                                                 i8* <target>,
+  //                                                 i32 <numArgs>,
+  //                                                 [Args...],
+  //                                                 [live variables...])
+
+  CallingConv::ID CC = CI.getCallingConv();
+  bool isAnyRegCC = CC == CallingConv::AnyReg;
+  bool hasDef = !CI.getType()->isVoidTy();
+  SDValue Callee = getValue(CI.getOperand(2)); // <target>
+
+  // Get the real number of arguments participating in the call <numArgs>
+  unsigned NumArgs =
+    cast<ConstantSDNode>(getValue(CI.getArgOperand(3)))->getZExtValue();
+
+  // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
+  assert(CI.getNumArgOperands() >= NumArgs + 4 &&
+         "Not enough arguments provided to the patchpoint intrinsic");
+
+  // For AnyRegCC the arguments are lowered later on manually.
+  unsigned NumCallArgs = isAnyRegCC ? 0 : NumArgs;
+  std::pair<SDValue, SDValue> Result =
+    LowerCallOperands(CI, 4, NumCallArgs, Callee, isAnyRegCC);
+
+  // Set the root to the target-lowered call chain.
+  SDValue Chain = Result.second;
+  DAG.setRoot(Chain);
+
+  SDNode *CallEnd = Chain.getNode();
+  if (hasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
+    CallEnd = CallEnd->getOperand(0).getNode();
+
+  /// Get a call instruction from the call sequence chain.
+  /// Tail calls are not allowed.
+  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+         "Expected a callseq node.");
+  SDNode *Call = CallEnd->getOperand(0).getNode();
+  bool hasGlue = Call->getGluedNode();
+
+  // Replace the target specific call node with the patchable intrinsic.
+  SmallVector<SDValue, 8> Ops;
+
+  // Add the <id> and <numNopBytes> constants.
+  for (unsigned i = 0; i < 2; ++i) {
+    SDValue tmp = getValue(CI.getOperand(i));
+    Ops.push_back(DAG.getTargetConstant(
+        cast<ConstantSDNode>(tmp)->getZExtValue(), MVT::i32));
+  }
+  // Assume that the Callee is a constant address.
+  Ops.push_back(
+    DAG.getIntPtrConstant(cast<ConstantSDNode>(Callee)->getZExtValue(),
+                          /*isTarget=*/true));
+
+  // Adjust <numArgs> to account for any arguments that have been passed on the
+  // stack instead.
+  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+  unsigned NumCallRegArgs = Call->getNumOperands() - (hasGlue ? 4 : 3);
+  NumCallRegArgs = isAnyRegCC ? NumArgs : NumCallRegArgs;
+  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32));
+
+  // Add the calling convention
+  Ops.push_back(DAG.getTargetConstant((unsigned)CC, MVT::i32));
+
+  // Add the arguments we omitted previously. The register allocator should
+  // place these in any free register.
+  if (isAnyRegCC)
+    for (unsigned i = 4, e = NumArgs + 4; i != e; ++i)
+      Ops.push_back(getValue(CI.getArgOperand(i)));
+
+  // Push the arguments from the call instruction.
+  SDNode::op_iterator e = hasGlue ? Call->op_end()-2 : Call->op_end()-1;
+  for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i)
+    Ops.push_back(*i);
+
+  // Push live variables for the stack map.
+  for (unsigned i = NumArgs + 4, e = CI.getNumArgOperands(); i != e; ++i) {
+    SDValue OpVal = getValue(CI.getArgOperand(i));
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) {
+      Ops.push_back(
+        DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+      Ops.push_back(
+        DAG.getTargetConstant(C->getSExtValue(), MVT::i64));
+    } else
+      Ops.push_back(OpVal);
+  }
+
+  // Push the register mask info.
+  if (hasGlue)
+    Ops.push_back(*(Call->op_end()-2));
+  else
+    Ops.push_back(*(Call->op_end()-1));
+
+  // Push the chain (this is originally the first operand of the call, but
+  // becomes now the last or second to last operand).
+  Ops.push_back(*(Call->op_begin()));
+
+  // Push the glue flag (last operand).
+  if (hasGlue)
+    Ops.push_back(*(Call->op_end()-1));
+
+  SDVTList NodeTys;
+  if (isAnyRegCC && hasDef) {
+    // Create the return types based on the intrinsic definition
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SmallVector<EVT, 3> ValueVTs;
+    ComputeValueVTs(TLI, CI.getType(), ValueVTs);
+    assert(ValueVTs.size() == 1 && "Expected only one return value type.");
+
+    // There is always a chain and a glue type at the end
+    ValueVTs.push_back(MVT::Other);
+    ValueVTs.push_back(MVT::Glue);
+    NodeTys = DAG.getVTList(ValueVTs.data(), ValueVTs.size());
+  } else
+    NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  // Replace the target specific call node with a PATCHPOINT node.
+  MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
+                                         getCurSDLoc(), NodeTys, Ops);
+
+  // Update the NodeMap.
+  if (hasDef) {
+    if (isAnyRegCC)
+      setValue(&CI, SDValue(MN, 0));
+    else
+      setValue(&CI, Result.first);
+  }
+
+  // Fixup the consumers of the intrinsic. The chain and glue may be used in the
+  // call sequence. Furthermore the location of the chain and glue can change
+  // when the AnyReg calling convention is used and the intrinsic returns a
+  // value.
+  if (isAnyRegCC && hasDef) {
+    SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
+    SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
+    DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+  } else
+    DAG.ReplaceAllUsesWith(Call, MN);
+  DAG.DeleteNode(Call);
+}
+
 /// TargetLowering::LowerCallTo - This is the default LowerCallTo
 /// implementation, which just calls LowerCall.
 /// FIXME: When all targets are
@@ -6414,6 +7000,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
     for (unsigned i = 0; i != NumRegs; ++i) {
       ISD::InputArg MyFlags;
       MyFlags.VT = RegisterVT;
+      MyFlags.ArgVT = VT;
       MyFlags.Used = CLI.IsReturnValueUsed;
       if (CLI.RetSExt)
         MyFlags.Flags.setSExt();
@@ -6503,7 +7090,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
 
       for (unsigned j = 0; j != NumParts; ++j) {
         // if it isn't first piece, alignment must be 1
-        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(),
+        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
                                i < CLI.NumFixedArgs,
                                i, j*Parts[j].getValueType().getStoreSize());
         if (NumParts > 1 && j == 0)
@@ -6596,9 +7183,10 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
          "Copy from a reg to the same reg!");
   assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
 
-  RegsForValue RFV(V->getContext(), TLI, Reg, V->getType());
+  const TargetLowering *TLI = TM.getTargetLowering();
+  RegsForValue RFV(V->getContext(), *TLI, Reg, V->getType());
   SDValue Chain = DAG.getEntryNode();
-  RFV.getCopyToRegs(Op, DAG, getCurDebugLoc(), Chain, 0, V);
+  RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, 0, V);
   PendingExports.push_back(Chain);
 }
 
@@ -6625,21 +7213,23 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
 
 void SelectionDAGISel::LowerArguments(const Function &F) {
   SelectionDAG &DAG = SDB->DAG;
-  DebugLoc dl = SDB->getCurDebugLoc();
-  const DataLayout *TD = TLI.getDataLayout();
+  SDLoc dl = SDB->getCurSDLoc();
+  const TargetLowering *TLI = getTargetLowering();
+  const DataLayout *TD = TLI->getDataLayout();
   SmallVector<ISD::InputArg, 16> Ins;
 
   if (!FuncInfo->CanLowerReturn) {
     // Put in an sret pointer parameter before all the other parameters.
     SmallVector<EVT, 1> ValueVTs;
-    ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+    ComputeValueVTs(*getTargetLowering(),
+                    PointerType::getUnqual(F.getReturnType()), ValueVTs);
 
     // NOTE: Assuming that a pointer will never break down to more than one VT
     // or one register.
     ISD::ArgFlagsTy Flags;
     Flags.setSRet();
-    MVT RegisterVT = TLI.getRegisterType(*DAG.getContext(), ValueVTs[0]);
-    ISD::InputArg RetArg(Flags, RegisterVT, true, 0, 0);
+    MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
+    ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true, 0, 0);
     Ins.push_back(RetArg);
   }
 
@@ -6648,8 +7238,9 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
   for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
        I != E; ++I, ++Idx) {
     SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(TLI, I->getType(), ValueVTs);
+    ComputeValueVTs(*TLI, I->getType(), ValueVTs);
     bool isArgValueUsed = !I->use_empty();
+    unsigned PartBase = 0;
     for (unsigned Value = 0, NumValues = ValueVTs.size();
          Value != NumValues; ++Value) {
       EVT VT = ValueVTs[Value];
@@ -6677,18 +7268,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
         if (F.getParamAlignment(Idx))
           FrameAlign = F.getParamAlignment(Idx);
         else
-          FrameAlign = TLI.getByValTypeAlignment(ElementTy);
+          FrameAlign = TLI->getByValTypeAlignment(ElementTy);
         Flags.setByValAlign(FrameAlign);
       }
       if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
         Flags.setNest();
       Flags.setOrigAlign(OriginalAlignment);
 
-      MVT RegisterVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
-      unsigned NumRegs = TLI.getNumRegisters(*CurDAG->getContext(), VT);
+      MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
+      unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
       for (unsigned i = 0; i != NumRegs; ++i) {
-        ISD::InputArg MyFlags(Flags, RegisterVT, isArgValueUsed,
-                              Idx-1, i*RegisterVT.getStoreSize());
+        ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
+                              Idx-1, PartBase+i*RegisterVT.getStoreSize());
         if (NumRegs > 1 && i == 0)
           MyFlags.Flags.setSplit();
         // if it isn't first piece, alignment must be 1
@@ -6696,14 +7287,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
           MyFlags.Flags.setOrigAlign(1);
         Ins.push_back(MyFlags);
       }
+      PartBase += VT.getStoreSize();
     }
   }
 
   // Call the target to set up the argument values.
   SmallVector<SDValue, 8> InVals;
-  SDValue NewRoot = TLI.LowerFormalArguments(DAG.getRoot(), F.getCallingConv(),
-                                             F.isVarArg(), Ins,
-                                             dl, DAG, InVals);
+  SDValue NewRoot = TLI->LowerFormalArguments(DAG.getRoot(), F.getCallingConv(),
+                                              F.isVarArg(), Ins,
+                                              dl, DAG, InVals);
 
   // Verify that the target's LowerFormalArguments behaved as expected.
   assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
@@ -6729,18 +7321,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     // Create a virtual register for the sret pointer, and put in a copy
     // from the sret argument into it.
     SmallVector<EVT, 1> ValueVTs;
-    ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
+    ComputeValueVTs(*TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs);
     MVT VT = ValueVTs[0].getSimpleVT();
-    MVT RegVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
+    MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
     ISD::NodeType AssertOp = ISD::DELETED_NODE;
     SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1,
                                         RegVT, VT, NULL, AssertOp);
 
     MachineFunction& MF = SDB->DAG.getMachineFunction();
     MachineRegisterInfo& RegInfo = MF.getRegInfo();
-    unsigned SRetReg = RegInfo.createVirtualRegister(TLI.getRegClassFor(RegVT));
+    unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
     FuncInfo->DemoteRegister = SRetReg;
-    NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurDebugLoc(),
+    NewRoot = SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(),
                                     SRetReg, ArgValue);
     DAG.setRoot(NewRoot);
 
@@ -6753,18 +7345,24 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       ++I, ++Idx) {
     SmallVector<SDValue, 4> ArgValues;
     SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(TLI, I->getType(), ValueVTs);
+    ComputeValueVTs(*TLI, I->getType(), ValueVTs);
     unsigned NumValues = ValueVTs.size();
 
     // If this argument is unused then remember its value. It is used to generate
     // debugging information.
-    if (I->use_empty() && NumValues)
+    if (I->use_empty() && NumValues) {
       SDB->setUnusedArgValue(I, InVals[i]);
 
+      // Also remember any frame index for use in FastISel.
+      if (FrameIndexSDNode *FI =
+          dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
+        FuncInfo->setArgumentFrameIndex(I, FI->getIndex());
+    }
+
     for (unsigned Val = 0; Val != NumValues; ++Val) {
       EVT VT = ValueVTs[Val];
-      MVT PartVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
-      unsigned NumParts = TLI.getNumRegisters(*CurDAG->getContext(), VT);
+      MVT PartVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
+      unsigned NumParts = TLI->getNumRegisters(*CurDAG->getContext(), VT);
 
       if (!I->use_empty()) {
         ISD::NodeType AssertOp = ISD::DELETED_NODE;
@@ -6791,11 +7389,11 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       FuncInfo->setArgumentFrameIndex(I, FI->getIndex());
 
     SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues,
-                                     SDB->getCurDebugLoc());
+                                     SDB->getCurSDLoc());
 
     SDB->setValue(I, Res);
     if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
-      if (LoadSDNode *LNode = 
+      if (LoadSDNode *LNode =
           dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
         if (FrameIndexSDNode *FI =
             dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
@@ -6893,15 +7491,36 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
       // Remember that this register needs to added to the machine PHI node as
       // the input for this MBB.
       SmallVector<EVT, 4> ValueVTs;
-      ComputeValueVTs(TLI, PN->getType(), ValueVTs);
+      const TargetLowering *TLI = TM.getTargetLowering();
+      ComputeValueVTs(*TLI, PN->getType(), ValueVTs);
       for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
         EVT VT = ValueVTs[vti];
-        unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
+        unsigned NumRegisters = TLI->getNumRegisters(*DAG.getContext(), VT);
         for (unsigned i = 0, e = NumRegisters; i != e; ++i)
           FuncInfo.PHINodesToUpdate.push_back(std::make_pair(MBBI++, Reg+i));
         Reg += NumRegisters;
       }
     }
   }
+
   ConstantsOut.clear();
 }
+
+/// Add a successor MBB to ParentMBB< creating a new MachineBB for BB if SuccMBB
+/// is 0.
+MachineBasicBlock *
+SelectionDAGBuilder::StackProtectorDescriptor::
+AddSuccessorMBB(const BasicBlock *BB,
+                MachineBasicBlock *ParentMBB,
+                MachineBasicBlock *SuccMBB) {
+  // If SuccBB has not been created yet, create it.
+  if (!SuccMBB) {
+    MachineFunction *MF = ParentMBB->getParent();
+    MachineFunction::iterator BBI = ParentMBB;
+    SuccMBB = MF->CreateMachineBasicBlock(BB);
+    MF->insert(++BBI, SuccMBB);
+  }
+  // Add it as a successor of ParentMBB.
+  ParentMBB->addSuccessor(SuccMBB);
+  return SuccMBB;
+}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 9188945..835f643 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -1,4 +1,4 @@
-//===-- SelectionDAGBuilder.h - Selection-DAG building --------------------===//
+//===-- SelectionDAGBuilder.h - Selection-DAG building --------*- C++ -*---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -26,6 +26,7 @@
 
 namespace llvm {
 
+class AddrSpaceCastInst;
 class AliasAnalysis;
 class AllocaInst;
 class BasicBlock;
@@ -80,11 +81,11 @@ class ZExtInst;
 /// implementation that is parameterized by a TargetLowering object.
 ///
 class SelectionDAGBuilder {
-  /// CurDebugLoc - current file + line number.  Changes as we build the DAG.
-  DebugLoc CurDebugLoc;
+  /// CurInst - The current instruction being visited
+  const Instruction *CurInst;
 
   DenseMap<const Value*, SDValue> NodeMap;
-  
+
   /// UnusedArgNodeMap - Maps argument value for unused arguments. This is used
   /// to preserve debug information for incoming arguments.
   DenseMap<const Value*, SDValue> UnusedArgNodeMap;
@@ -182,6 +183,17 @@ private:
 
   typedef std::vector<CaseRec> CaseRecVector;
 
+  /// The comparison function for sorting the switch case values in the vector.
+  /// WARNING: Case ranges should be disjoint!
+  struct CaseCmp {
+    bool operator()(const Case &C1, const Case &C2) {
+      assert(isa<ConstantInt>(C1.Low) && isa<ConstantInt>(C2.High));
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
+
   struct CaseBitsCmp {
     bool operator()(const CaseBits &C1, const CaseBits &C2) {
       return C1.Bits > C2.Bits;
@@ -224,7 +236,7 @@ private:
   struct JumpTable {
     JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
               MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
-  
+
     /// Reg - the virtual register containing the index of the jump table entry
     //. to jump to.
     unsigned Reg;
@@ -278,12 +290,204 @@ private:
     BitTestInfo Cases;
   };
 
-public:
-  // TLI - This is information that describes the available target features we
-  // need for lowering.  This indicates when operations are unavailable,
-  // implemented with a libcall, etc.
+  /// A class which encapsulates all of the information needed to generate a
+  /// stack protector check and signals to isel via its state being initialized
+  /// that a stack protector needs to be generated.
+  ///
+  /// *NOTE* The following is a high level documentation of SelectionDAG Stack
+  /// Protector Generation. The reason that it is placed here is for a lack of
+  /// other good places to stick it.
+  ///
+  /// High Level Overview of SelectionDAG Stack Protector Generation:
+  ///
+  /// Previously, generation of stack protectors was done exclusively in the
+  /// pre-SelectionDAG Codegen LLVM IR Pass "Stack Protector". This necessitated
+  /// splitting basic blocks at the IR level to create the success/failure basic
+  /// blocks in the tail of the basic block in question. As a result of this,
+  /// calls that would have qualified for the sibling call optimization were no
+  /// longer eligible for optimization since said calls were no longer right in
+  /// the "tail position" (i.e. the immediate predecessor of a ReturnInst
+  /// instruction).
+  ///
+  /// Then it was noticed that since the sibling call optimization causes the
+  /// callee to reuse the caller's stack, if we could delay the generation of
+  /// the stack protector check until later in CodeGen after the sibling call
+  /// decision was made, we get both the tail call optimization and the stack
+  /// protector check!
+  ///
+  /// A few goals in solving this problem were:
+  ///
+  ///   1. Preserve the architecture independence of stack protector generation.
+  ///
+  ///   2. Preserve the normal IR level stack protector check for platforms like
+  ///      OpenBSD for which we support platform specific stack protector
+  ///      generation.
+  ///
+  /// The main problem that guided the present solution is that one can not
+  /// solve this problem in an architecture independent manner at the IR level
+  /// only. This is because:
+  ///
+  ///   1. The decision on whether or not to perform a sibling call on certain
+  ///      platforms (for instance i386) requires lower level information
+  ///      related to available registers that can not be known at the IR level.
+  ///
+  ///   2. Even if the previous point were not true, the decision on whether to
+  ///      perform a tail call is done in LowerCallTo in SelectionDAG which
+  ///      occurs after the Stack Protector Pass. As a result, one would need to
+  ///      put the relevant callinst into the stack protector check success
+  ///      basic block (where the return inst is placed) and then move it back
+  ///      later at SelectionDAG/MI time before the stack protector check if the
+  ///      tail call optimization failed. The MI level option was nixed
+  ///      immediately since it would require platform specific pattern
+  ///      matching. The SelectionDAG level option was nixed because
+  ///      SelectionDAG only processes one IR level basic block at a time
+  ///      implying one could not create a DAG Combine to move the callinst.
+  ///
+  /// To get around this problem a few things were realized:
+  ///
+  ///   1. While one can not handle multiple IR level basic blocks at the
+  ///      SelectionDAG Level, one can generate multiple machine basic blocks
+  ///      for one IR level basic block. This is how we handle bit tests and
+  ///      switches.
+  ///
+  ///   2. At the MI level, tail calls are represented via a special return
+  ///      MIInst called "tcreturn". Thus if we know the basic block in which we
+  ///      wish to insert the stack protector check, we get the correct behavior
+  ///      by always inserting the stack protector check right before the return
+  ///      statement. This is a "magical transformation" since no matter where
+  ///      the stack protector check intrinsic is, we always insert the stack
+  ///      protector check code at the end of the BB.
+  ///
+  /// Given the aforementioned constraints, the following solution was devised:
+  ///
+  ///   1. On platforms that do not support SelectionDAG stack protector check
+  ///      generation, allow for the normal IR level stack protector check
+  ///      generation to continue.
+  ///
+  ///   2. On platforms that do support SelectionDAG stack protector check
+  ///      generation:
+  ///
+  ///     a. Use the IR level stack protector pass to decide if a stack
+  ///        protector is required/which BB we insert the stack protector check
+  ///        in by reusing the logic already therein. If we wish to generate a
+  ///        stack protector check in a basic block, we place a special IR
+  ///        intrinsic called llvm.stackprotectorcheck right before the BB's
+  ///        returninst or if there is a callinst that could potentially be
+  ///        sibling call optimized, before the call inst.
+  ///
+  ///     b. Then when a BB with said intrinsic is processed, we codegen the BB
+  ///        normally via SelectBasicBlock. In said process, when we visit the
+  ///        stack protector check, we do not actually emit anything into the
+  ///        BB. Instead, we just initialize the stack protector descriptor
+  ///        class (which involves stashing information/creating the success
+  ///        mbbb and the failure mbb if we have not created one for this
+  ///        function yet) and export the guard variable that we are going to
+  ///        compare.
+  ///
+  ///     c. After we finish selecting the basic block, in FinishBasicBlock if
+  ///        the StackProtectorDescriptor attached to the SelectionDAGBuilder is
+  ///        initialized, we first find a splice point in the parent basic block
+  ///        before the terminator and then splice the terminator of said basic
+  ///        block into the success basic block. Then we code-gen a new tail for
+  ///        the parent basic block consisting of the two loads, the comparison,
+  ///        and finally two branches to the success/failure basic blocks. We
+  ///        conclude by code-gening the failure basic block if we have not
+  ///        code-gened it already (all stack protector checks we generate in
+  ///        the same function, use the same failure basic block).
+  class StackProtectorDescriptor {
+  public:
+    StackProtectorDescriptor() : ParentMBB(0), SuccessMBB(0), FailureMBB(0),
+                                 Guard(0) { }
+    ~StackProtectorDescriptor() { }
+
+    /// Returns true if all fields of the stack protector descriptor are
+    /// initialized implying that we should/are ready to emit a stack protector.
+    bool shouldEmitStackProtector() const {
+      return ParentMBB && SuccessMBB && FailureMBB && Guard;
+    }
+
+    /// Initialize the stack protector descriptor structure for a new basic
+    /// block.
+    void initialize(const BasicBlock *BB,
+                    MachineBasicBlock *MBB,
+                    const CallInst &StackProtCheckCall) {
+      // Make sure we are not initialized yet.
+      assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
+             "already initialized!");
+      ParentMBB = MBB;
+      SuccessMBB = AddSuccessorMBB(BB, MBB);
+      FailureMBB = AddSuccessorMBB(BB, MBB, FailureMBB);
+      if (!Guard)
+        Guard = StackProtCheckCall.getArgOperand(0);
+    }
+
+    /// Reset state that changes when we handle different basic blocks.
+    ///
+    /// This currently includes:
+    ///
+    /// 1. The specific basic block we are generating a
+    /// stack protector for (ParentMBB).
+    ///
+    /// 2. The successor machine basic block that will contain the tail of
+    /// parent mbb after we create the stack protector check (SuccessMBB). This
+    /// BB is visited only on stack protector check success.
+    void resetPerBBState() {
+      ParentMBB = 0;
+      SuccessMBB = 0;
+    }
+
+    /// Reset state that only changes when we switch functions.
+    ///
+    /// This currently includes:
+    ///
+    /// 1. FailureMBB since we reuse the failure code path for all stack
+    /// protector checks created in an individual function.
+    ///
+    /// 2.The guard variable since the guard variable we are checking against is
+    /// always the same.
+    void resetPerFunctionState() {
+      FailureMBB = 0;
+      Guard = 0;
+    }
+
+    MachineBasicBlock *getParentMBB() { return ParentMBB; }
+    MachineBasicBlock *getSuccessMBB() { return SuccessMBB; }
+    MachineBasicBlock *getFailureMBB() { return FailureMBB; }
+    const Value *getGuard() { return Guard; }
+
+  private:
+    /// The basic block for which we are generating the stack protector.
+    ///
+    /// As a result of stack protector generation, we will splice the
+    /// terminators of this basic block into the successor mbb SuccessMBB and
+    /// replace it with a compare/branch to the successor mbbs
+    /// SuccessMBB/FailureMBB depending on whether or not the stack protector
+    /// was violated.
+    MachineBasicBlock *ParentMBB;
+
+    /// A basic block visited on stack protector check success that contains the
+    /// terminators of ParentMBB.
+    MachineBasicBlock *SuccessMBB;
+
+    /// This basic block visited on stack protector check failure that will
+    /// contain a call to __stack_chk_fail().
+    MachineBasicBlock *FailureMBB;
+
+    /// The guard variable which we will compare against the stored value in the
+    /// stack protector stack slot.
+    const Value *Guard;
+
+    /// Add a successor machine basic block to ParentMBB. If the successor mbb
+    /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
+    /// block will be created.
+    MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
+                                       MachineBasicBlock *ParentMBB,
+                                       MachineBasicBlock *SuccMBB = 0);
+  };
+
+private:
   const TargetMachine &TM;
-  const TargetLowering &TLI;
+public:
   SelectionDAG &DAG;
   const DataLayout *TD;
   AliasAnalysis *AA;
@@ -298,6 +502,9 @@ public:
   /// BitTestCases - Vector of BitTestBlock structures used to communicate
   /// SwitchInst code generation information.
   std::vector<BitTestBlock> BitTestCases;
+  /// A StackProtectorDescriptor structure used to communicate stack protector
+  /// information in between SelectBasicBlock and FinishBasicBlock.
+  StackProtectorDescriptor SPDescriptor;
 
   // Emit PHI-node-operand constants only once even if used by multiple
   // PHI nodes.
@@ -308,9 +515,9 @@ public:
   FunctionLoweringInfo &FuncInfo;
 
   /// OptLevel - What optimization level we're generating code for.
-  /// 
+  ///
   CodeGenOpt::Level OptLevel;
-  
+
   /// GFI - Garbage collection metadata for the function.
   GCFunctionInfo *GFI;
 
@@ -327,7 +534,7 @@ public:
 
   SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
                       CodeGenOpt::Level ol)
-    : SDNodeOrder(0), TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
+    : CurInst(NULL), SDNodeOrder(0), TM(dag.getTarget()),
       DAG(dag), FuncInfo(funcinfo), OptLevel(ol),
       HasTailCall(false) {
   }
@@ -364,17 +571,18 @@ public:
   ///
   SDValue getControlRoot();
 
-  DebugLoc getCurDebugLoc() const { return CurDebugLoc; }
+  SDLoc getCurSDLoc() const {
+    return SDLoc(CurInst, SDNodeOrder);
+  }
+
+  DebugLoc getCurDebugLoc() const {
+    return CurInst ? CurInst->getDebugLoc() : DebugLoc();
+  }
 
   unsigned getSDNodeOrder() const { return SDNodeOrder; }
 
   void CopyValueToVirtualRegister(const Value *V, unsigned Reg);
 
-  /// AssignOrderingToNode - Assign an ordering to the node. The order is gotten
-  /// from how the code appeared in the source. The ordering is used by the
-  /// scheduler to effectively turn off scheduling.
-  void AssignOrderingToNode(const SDNode *Node);
-
   void visit(const Instruction &I);
 
   void visit(unsigned Opcode, const User &I);
@@ -391,7 +599,7 @@ public:
     assert(N.getNode() == 0 && "Already set a value for this node!");
     N = NewN;
   }
-  
+
   void setUnusedArgValue(const Value *V, SDValue NewN) {
     SDValue &N = UnusedArgNodeMap[V];
     assert(N.getNode() == 0 && "Already set a value for this node!");
@@ -412,6 +620,12 @@ public:
   void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
                    MachineBasicBlock *LandingPad = NULL);
 
+  std::pair<SDValue, SDValue> LowerCallOperands(const CallInst &CI,
+                                                unsigned ArgIdx,
+                                                unsigned NumArgs,
+                                                SDValue Callee,
+                                                bool useVoidTy = false);
+
   /// UpdateSplitBlock - When an MBB was split during scheduling, update the
   /// references that ned to refer to the last resulting block.
   void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
@@ -453,6 +667,9 @@ private:
 public:
   void visitSwitchCase(CaseBlock &CB,
                        MachineBasicBlock *SwitchBB);
+  void visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+                               MachineBasicBlock *ParentBB);
+  void visitSPDescriptorFailure(StackProtectorDescriptor &SPD);
   void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
   void visitBitTestCase(BitTestBlock &BB,
                         MachineBasicBlock* NextMBB,
@@ -463,7 +680,7 @@ public:
   void visitJumpTable(JumpTable &JT);
   void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
                             MachineBasicBlock *SwitchBB);
-  
+
 private:
   // These all get lowered before this pass.
   void visitInvoke(const InvokeInst &I);
@@ -504,6 +721,7 @@ private:
   void visitPtrToInt(const User &I);
   void visitIntToPtr(const User &I);
   void visitBitCast(const User &I);
+  void visitAddrSpaceCast(const User &I);
 
   void visitExtractElement(const User &I);
   void visitInsertElement(const User &I);
@@ -525,6 +743,11 @@ private:
   void visitPHI(const PHINode &I);
   void visitCall(const CallInst &I);
   bool visitMemCmpCall(const CallInst &I);
+  bool visitMemChrCall(const CallInst &I);
+  bool visitStrCpyCall(const CallInst &I, bool isStpcpy);
+  bool visitStrCmpCall(const CallInst &I);
+  bool visitStrLenCall(const CallInst &I);
+  bool visitStrNLenCall(const CallInst &I);
   bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
   void visitAtomicLoad(const LoadInst &I);
   void visitAtomicStore(const StoreInst &I);
@@ -537,6 +760,8 @@ private:
   void visitVAArg(const VAArgInst &I);
   void visitVAEnd(const CallInst &I);
   void visitVACopy(const CallInst &I);
+  void visitStackmap(const CallInst &I);
+  void visitPatchpoint(const CallInst &I);
 
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
@@ -545,10 +770,13 @@ private:
     llvm_unreachable("UserOp2 should not exist at instruction selection time!");
   }
 
+  void processIntegerCallValue(const Instruction &I,
+                               SDValue Value, bool IsSigned);
+
   void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
 
   /// EmitFuncArgumentDbgValue - If V is an function argument then create
-  /// corresponding DBG_VALUE machine instruction for it now. At the end of 
+  /// corresponding DBG_VALUE machine instruction for it now. At the end of
   /// instruction selection, they will be inserted to the entry BB.
   bool EmitFuncArgumentDbgValue(const Value *V, MDNode *Variable,
                                 int64_t Offset, const SDValue &N);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 47b0391..c04a08d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -92,9 +92,6 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::RETURNADDR:                 return "RETURNADDR";
   case ISD::FRAMEADDR:                  return "FRAMEADDR";
   case ISD::FRAME_TO_ARGS_OFFSET:       return "FRAME_TO_ARGS_OFFSET";
-  case ISD::EXCEPTIONADDR:              return "EXCEPTIONADDR";
-  case ISD::LSDAADDR:                   return "LSDAADDR";
-  case ISD::EHSELECTION:                return "EHSELECTION";
   case ISD::EH_RETURN:                  return "EH_RETURN";
   case ISD::EH_SJLJ_SETJMP:             return "EH_SJLJ_SETJMP";
   case ISD::EH_SJLJ_LONGJMP:            return "EH_SJLJ_LONGJMP";
@@ -145,6 +142,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FCEIL:                      return "fceil";
   case ISD::FRINT:                      return "frint";
   case ISD::FNEARBYINT:                 return "fnearbyint";
+  case ISD::FROUND:                     return "fround";
   case ISD::FEXP:                       return "fexp";
   case ISD::FEXP2:                      return "fexp2";
   case ISD::FLOG:                       return "flog";
@@ -226,6 +224,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::FP_TO_SINT:                 return "fp_to_sint";
   case ISD::FP_TO_UINT:                 return "fp_to_uint";
   case ISD::BITCAST:                    return "bitcast";
+  case ISD::ADDRSPACECAST:              return "addrspacecast";
   case ISD::FP16_TO_FP32:               return "fp16_to_fp32";
   case ISD::FP32_TO_FP16:               return "fp32_to_fp16";
 
@@ -487,10 +486,16 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
       OS << " " << offset;
     if (unsigned int TF = BA->getTargetFlags())
       OS << " [TF=" << TF << ']';
+  } else if (const AddrSpaceCastSDNode *ASC =
+               dyn_cast<AddrSpaceCastSDNode>(this)) {
+    OS << '['
+       << ASC->getSrcAddressSpace()
+       << " -> "
+       << ASC->getDestAddressSpace()
+       << ']';
   }
 
-  if (G)
-    if (unsigned Order = G->GetOrdering(this))
+  if (unsigned Order = getIROrder())
       OS << " [ORD=" << Order << ']';
 
   if (getNodeId() != -1)
@@ -501,8 +506,10 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
     DIScope
       Scope(dl.getScope(G->getMachineFunction().getFunction()->getContext()));
     OS << " dbg:";
+    assert((!Scope || Scope.isScope()) &&
+      "Scope of a DebugLoc should be null or a DIScope.");
     // Omit the directory, since it's usually long and uninteresting.
-    if (Scope.Verify())
+    if (Scope)
       OS << Scope.getFilename();
     else
       OS << "<unknown>";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index e21f26e..3a0cfa1 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -222,23 +223,61 @@ defaultListDAGScheduler("default", "Best scheduler for the target",
 
 namespace llvm {
   //===--------------------------------------------------------------------===//
+  /// \brief This class is used by SelectionDAGISel to temporarily override
+  /// the optimization level on a per-function basis.
+  class OptLevelChanger {
+    SelectionDAGISel &IS;
+    CodeGenOpt::Level SavedOptLevel;
+    bool SavedFastISel;
+
+  public:
+    OptLevelChanger(SelectionDAGISel &ISel,
+                    CodeGenOpt::Level NewOptLevel) : IS(ISel) {
+      SavedOptLevel = IS.OptLevel;
+      if (NewOptLevel == SavedOptLevel)
+        return;
+      IS.OptLevel = NewOptLevel;
+      IS.TM.setOptLevel(NewOptLevel);
+      SavedFastISel = IS.TM.Options.EnableFastISel;
+      if (NewOptLevel == CodeGenOpt::None)
+        IS.TM.setFastISel(true);
+      DEBUG(dbgs() << "\nChanging optimization level for Function "
+            << IS.MF->getFunction()->getName() << "\n");
+      DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel
+            << " ; After: -O" << NewOptLevel << "\n");
+    }
+
+    ~OptLevelChanger() {
+      if (IS.OptLevel == SavedOptLevel)
+        return;
+      DEBUG(dbgs() << "\nRestoring optimization level for Function "
+            << IS.MF->getFunction()->getName() << "\n");
+      DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel
+            << " ; After: -O" << SavedOptLevel << "\n");
+      IS.OptLevel = SavedOptLevel;
+      IS.TM.setOptLevel(SavedOptLevel);
+      IS.TM.setFastISel(SavedFastISel);
+    }
+  };
+
+  //===--------------------------------------------------------------------===//
   /// createDefaultScheduler - This creates an instruction scheduler appropriate
   /// for the target.
   ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
                                              CodeGenOpt::Level OptLevel) {
-    const TargetLowering &TLI = IS->getTargetLowering();
+    const TargetLowering *TLI = IS->getTargetLowering();
     const TargetSubtargetInfo &ST = IS->TM.getSubtarget<TargetSubtargetInfo>();
 
-    if (OptLevel == CodeGenOpt::None || ST.enableMachineScheduler() ||
-        TLI.getSchedulingPreference() == Sched::Source)
+    if (OptLevel == CodeGenOpt::None || ST.useMachineScheduler() ||
+        TLI->getSchedulingPreference() == Sched::Source)
       return createSourceListDAGScheduler(IS, OptLevel);
-    if (TLI.getSchedulingPreference() == Sched::RegPressure)
+    if (TLI->getSchedulingPreference() == Sched::RegPressure)
       return createBURRListDAGScheduler(IS, OptLevel);
-    if (TLI.getSchedulingPreference() == Sched::Hybrid)
+    if (TLI->getSchedulingPreference() == Sched::Hybrid)
       return createHybridListDAGScheduler(IS, OptLevel);
-    if (TLI.getSchedulingPreference() == Sched::VLIW)
+    if (TLI->getSchedulingPreference() == Sched::VLIW)
       return createVLIWDAGScheduler(IS, OptLevel);
-    assert(TLI.getSchedulingPreference() == Sched::ILP &&
+    assert(TLI->getSchedulingPreference() == Sched::ILP &&
            "Unknown sched type!");
     return createILPListDAGScheduler(IS, OptLevel);
   }
@@ -275,10 +314,10 @@ void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
 // SelectionDAGISel code
 //===----------------------------------------------------------------------===//
 
-SelectionDAGISel::SelectionDAGISel(const TargetMachine &tm,
+SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
                                    CodeGenOpt::Level OL) :
-  MachineFunctionPass(ID), TM(tm), TLI(*tm.getTargetLowering()),
-  FuncInfo(new FunctionLoweringInfo(TLI)),
+  MachineFunctionPass(ID), TM(tm),
+  FuncInfo(new FunctionLoweringInfo(TM)),
   CurDAG(new SelectionDAG(tm, OL)),
   SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
   GFI(),
@@ -355,6 +394,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   const Function &Fn = *mf.getFunction();
   const TargetInstrInfo &TII = *TM.getInstrInfo();
   const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  const TargetLowering *TLI = TM.getTargetLowering();
 
   MF = &mf;
   RegInfo = &MF->getRegInfo();
@@ -368,11 +408,17 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   ST.resetSubtargetFeatures(MF);
   TM.resetTargetOptions(MF);
 
+  // Reset OptLevel to None for optnone functions.
+  CodeGenOpt::Level NewOptLevel = OptLevel;
+  if (Fn.hasFnAttribute(Attribute::OptimizeNone))
+    NewOptLevel = CodeGenOpt::None;
+  OptLevelChanger OLC(*this, NewOptLevel);
+
   DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
   SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this);
 
-  CurDAG->init(*MF, TTI);
+  CurDAG->init(*MF, TTI, TLI);
   FuncInfo->set(Fn, *MF);
 
   if (UseMBPI && OptLevel != CodeGenOpt::None)
@@ -401,29 +447,37 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   // Insert DBG_VALUE instructions for function arguments to the entry block.
   for (unsigned i = 0, e = FuncInfo->ArgDbgValues.size(); i != e; ++i) {
     MachineInstr *MI = FuncInfo->ArgDbgValues[e-i-1];
-    unsigned Reg = MI->getOperand(0).getReg();
+    bool hasFI = MI->getOperand(0).isFI();
+    unsigned Reg = hasFI ? TRI.getFrameRegister(*MF) : MI->getOperand(0).getReg();
     if (TargetRegisterInfo::isPhysicalRegister(Reg))
       EntryMBB->insert(EntryMBB->begin(), MI);
     else {
       MachineInstr *Def = RegInfo->getVRegDef(Reg);
-      MachineBasicBlock::iterator InsertPos = Def;
-      // FIXME: VR def may not be in entry block.
-      Def->getParent()->insert(llvm::next(InsertPos), MI);
+      if (Def) {
+        MachineBasicBlock::iterator InsertPos = Def;
+        // FIXME: VR def may not be in entry block.
+        Def->getParent()->insert(llvm::next(InsertPos), MI);
+      } else
+        DEBUG(dbgs() << "Dropping debug info for dead vreg"
+              << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
     }
 
     // If Reg is live-in then update debug info to track its copy in a vreg.
     DenseMap<unsigned, unsigned>::iterator LDI = LiveInMap.find(Reg);
     if (LDI != LiveInMap.end()) {
+      assert(!hasFI && "There's no handling of frame pointer updating here yet "
+                       "- add if needed");
       MachineInstr *Def = RegInfo->getVRegDef(LDI->second);
       MachineBasicBlock::iterator InsertPos = Def;
       const MDNode *Variable =
         MI->getOperand(MI->getNumOperands()-1).getMetadata();
-      unsigned Offset = MI->getOperand(1).getImm();
+      bool IsIndirect = MI->isIndirectDebugValue();
+      unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
       // Def is never a terminator here, so it is ok to increment InsertPos.
       BuildMI(*EntryMBB, ++InsertPos, MI->getDebugLoc(),
-              TII.get(TargetOpcode::DBG_VALUE))
-        .addReg(LDI->second, RegState::Debug)
-        .addImm(Offset).addMetadata(Variable);
+              TII.get(TargetOpcode::DBG_VALUE),
+              IsIndirect,
+              LDI->second, Offset, Variable);
 
       // If this vreg is directly copied into an exported register then
       // that COPY instructions also need DBG_VALUE, if it is the only
@@ -442,9 +496,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       if (CopyUseMI) {
         MachineInstr *NewMI =
           BuildMI(*MF, CopyUseMI->getDebugLoc(),
-                  TII.get(TargetOpcode::DBG_VALUE))
-          .addReg(CopyUseMI->getOperand(0).getReg(), RegState::Debug)
-          .addImm(Offset).addMetadata(Variable);
+                  TII.get(TargetOpcode::DBG_VALUE),
+                  IsIndirect,
+                  CopyUseMI->getOperand(0).getReg(),
+                  Offset, Variable);
         MachineBasicBlock::iterator Pos = CopyUseMI;
         EntryMBB->insertAfter(Pos, NewMI);
       }
@@ -491,6 +546,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
       if (J == E) break;
       To = J->second;
     }
+    // Make sure the new register has a sufficiently constrained register class.
+    if (TargetRegisterInfo::isVirtualRegister(From) &&
+        TargetRegisterInfo::isVirtualRegister(To))
+      MRI.constrainRegClass(To, MRI.getRegClass(From));
     // Replace it.
     MRI.replaceRegWith(From, To);
   }
@@ -611,6 +670,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
+  CurDAG->NewNodesMustHaveLegalTypes = true;
+
   if (Changed) {
     if (ViewDAGCombineLT)
       CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
@@ -624,6 +685,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 
     DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
           << " '" << BlockName << "'\n"; CurDAG->dump());
+
   }
 
   {
@@ -790,9 +852,6 @@ void SelectionDAGISel::DoInstructionSelection() {
         continue;
       // Replace node.
       if (ResNode) {
-        // Propagate ordering
-        CurDAG->AssignOrdering(ResNode, CurDAG->GetOrdering(Node));
-
         ReplaceUses(Node, ResNode);
       }
 
@@ -827,12 +886,14 @@ void SelectionDAGISel::PrepareEHLandingPad() {
     .addSym(Label);
 
   // Mark exception register as live in.
-  unsigned Reg = TLI.getExceptionPointerRegister();
-  if (Reg) MBB->addLiveIn(Reg);
+  const TargetLowering *TLI = getTargetLowering();
+  const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
+  if (unsigned Reg = TLI->getExceptionPointerRegister())
+    FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
 
   // Mark exception selector register as live in.
-  Reg = TLI.getExceptionSelectorRegister();
-  if (Reg) MBB->addLiveIn(Reg);
+  if (unsigned Reg = TLI->getExceptionSelectorRegister())
+    FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
 }
 
 /// isFoldedOrDeadInstruction - Return true if the specified instruction is
@@ -931,7 +992,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
   // Initialize the Fast-ISel state, if needed.
   FastISel *FastIS = 0;
   if (TM.Options.EnableFastISel)
-    FastIS = TLI.createFastISel(*FuncInfo, LibInfo);
+    FastIS = getTargetLowering()->createFastISel(*FuncInfo, LibInfo);
 
   // Iterate over all basic blocks in the function.
   ReversePostOrderTraversal<const Function*> RPOT(&Fn);
@@ -970,6 +1031,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
     FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
 
     // Setup an EH landing-pad block.
+    FuncInfo->ExceptionPointerVirtReg = 0;
+    FuncInfo->ExceptionSelectorVirtReg = 0;
     if (FuncInfo->MBB->isLandingPad())
       PrepareEHLandingPad();
 
@@ -1132,6 +1195,91 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
   delete FastIS;
   SDB->clearDanglingDebugInfo();
+  SDB->SPDescriptor.resetPerFunctionState();
+}
+
+/// Given that the input MI is before a partial terminator sequence TSeq, return
+/// true if M + TSeq also a partial terminator sequence.
+///
+/// A Terminator sequence is a sequence of MachineInstrs which at this point in
+/// lowering copy vregs into physical registers, which are then passed into
+/// terminator instructors so we can satisfy ABI constraints. A partial
+/// terminator sequence is an improper subset of a terminator sequence (i.e. it
+/// may be the whole terminator sequence).
+static bool MIIsInTerminatorSequence(const MachineInstr *MI) {
+  // If we do not have a copy or an implicit def, we return true if and only if
+  // MI is a debug value.
+  if (!MI->isCopy() && !MI->isImplicitDef())
+    // Sometimes DBG_VALUE MI sneak in between the copies from the vregs to the
+    // physical registers if there is debug info associated with the terminator
+    // of our mbb. We want to include said debug info in our terminator
+    // sequence, so we return true in that case.
+    return MI->isDebugValue();
+
+  // We have left the terminator sequence if we are not doing one of the
+  // following:
+  //
+  // 1. Copying a vreg into a physical register.
+  // 2. Copying a vreg into a vreg.
+  // 3. Defining a register via an implicit def.
+
+  // OPI should always be a register definition...
+  MachineInstr::const_mop_iterator OPI = MI->operands_begin();
+  if (!OPI->isReg() || !OPI->isDef())
+    return false;
+
+  // Defining any register via an implicit def is always ok.
+  if (MI->isImplicitDef())
+    return true;
+
+  // Grab the copy source...
+  MachineInstr::const_mop_iterator OPI2 = OPI;
+  ++OPI2;
+  assert(OPI2 != MI->operands_end()
+         && "Should have a copy implying we should have 2 arguments.");
+
+  // Make sure that the copy dest is not a vreg when the copy source is a
+  // physical register.
+  if (!OPI2->isReg() ||
+      (!TargetRegisterInfo::isPhysicalRegister(OPI->getReg()) &&
+       TargetRegisterInfo::isPhysicalRegister(OPI2->getReg())))
+    return false;
+
+  return true;
+}
+
+/// Find the split point at which to splice the end of BB into its success stack
+/// protector check machine basic block.
+///
+/// On many platforms, due to ABI constraints, terminators, even before register
+/// allocation, use physical registers. This creates an issue for us since
+/// physical registers at this point can not travel across basic
+/// blocks. Luckily, selectiondag always moves physical registers into vregs
+/// when they enter functions and moves them through a sequence of copies back
+/// into the physical registers right before the terminator creating a
+/// ``Terminator Sequence''. This function is searching for the beginning of the
+/// terminator sequence so that we can ensure that we splice off not just the
+/// terminator, but additionally the copies that move the vregs into the
+/// physical registers.
+static MachineBasicBlock::iterator
+FindSplitPointForStackProtector(MachineBasicBlock *BB, DebugLoc DL) {
+  MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
+  //
+  if (SplitPoint == BB->begin())
+    return SplitPoint;
+
+  MachineBasicBlock::iterator Start = BB->begin();
+  MachineBasicBlock::iterator Previous = SplitPoint;
+  --Previous;
+
+  while (MIIsInTerminatorSequence(Previous)) {
+    SplitPoint = Previous;
+    if (Previous == Start)
+      break;
+    --Previous;
+  }
+
+  return SplitPoint;
 }
 
 void
@@ -1144,11 +1292,13 @@ SelectionDAGISel::FinishBasicBlock() {
                  << FuncInfo->PHINodesToUpdate[i].first
                  << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
 
+  const bool MustUpdatePHINodes = SDB->SwitchCases.empty() &&
+                                  SDB->JTCases.empty() &&
+                                  SDB->BitTestCases.empty();
+
   // Next, now that we know what the last MBB the LLVM BB expanded is, update
   // PHI nodes in successors.
-  if (SDB->SwitchCases.empty() &&
-      SDB->JTCases.empty() &&
-      SDB->BitTestCases.empty()) {
+  if (MustUpdatePHINodes) {
     for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
       MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
       assert(PHI->isPHI() &&
@@ -1157,9 +1307,54 @@ SelectionDAGISel::FinishBasicBlock() {
         continue;
       PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
     }
-    return;
   }
 
+  // Handle stack protector.
+  if (SDB->SPDescriptor.shouldEmitStackProtector()) {
+    MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+    MachineBasicBlock *SuccessMBB = SDB->SPDescriptor.getSuccessMBB();
+
+    // Find the split point to split the parent mbb. At the same time copy all
+    // physical registers used in the tail of parent mbb into virtual registers
+    // before the split point and back into physical registers after the split
+    // point. This prevents us needing to deal with Live-ins and many other
+    // register allocation issues caused by us splitting the parent mbb. The
+    // register allocator will clean up said virtual copies later on.
+    MachineBasicBlock::iterator SplitPoint =
+      FindSplitPointForStackProtector(ParentMBB, SDB->getCurDebugLoc());
+
+    // Splice the terminator of ParentMBB into SuccessMBB.
+    SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
+                       SplitPoint,
+                       ParentMBB->end());
+
+    // Add compare/jump on neq/jump to the parent BB.
+    FuncInfo->MBB = ParentMBB;
+    FuncInfo->InsertPt = ParentMBB->end();
+    SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+    CurDAG->setRoot(SDB->getRoot());
+    SDB->clear();
+    CodeGenAndEmitDAG();
+
+    // CodeGen Failure MBB if we have not codegened it yet.
+    MachineBasicBlock *FailureMBB = SDB->SPDescriptor.getFailureMBB();
+    if (!FailureMBB->size()) {
+      FuncInfo->MBB = FailureMBB;
+      FuncInfo->InsertPt = FailureMBB->end();
+      SDB->visitSPDescriptorFailure(SDB->SPDescriptor);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    // Clear the Per-BB State.
+    SDB->SPDescriptor.resetPerBBState();
+  }
+
+  // If we updated PHI Nodes, return early.
+  if (MustUpdatePHINodes)
+    return;
+
   for (unsigned i = 0, e = SDB->BitTestCases.size(); i != e; ++i) {
     // Lower header first, if it wasn't already lowered
     if (!SDB->BitTestCases[i].Emitted) {
@@ -1606,7 +1801,7 @@ SDNode *SelectionDAGISel::Select_INLINEASM(SDNode *N) {
   SelectInlineAsmMemoryOperands(Ops);
 
   EVT VTs[] = { MVT::Other, MVT::Glue };
-  SDValue New = CurDAG->getNode(ISD::INLINEASM, N->getDebugLoc(),
+  SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
                                 VTs, &Ops[0], Ops.size());
   New->setNodeId(-1);
   return New.getNode();
@@ -1733,15 +1928,15 @@ WalkChainUsers(const SDNode *ChainedNode,
 
     SDNode *User = *UI;
 
+    if (User->getOpcode() == ISD::HANDLENODE)  // Root of the graph.
+      continue;
+
     // If we see an already-selected machine node, then we've gone beyond the
     // pattern that we're selecting down into the already selected chunk of the
     // DAG.
-    if (User->isMachineOpcode() ||
-        User->getOpcode() == ISD::HANDLENODE)  // Root of the graph.
-      continue;
-
     unsigned UserOpcode = User->getOpcode();
-    if (UserOpcode == ISD::CopyToReg ||
+    if (User->isMachineOpcode() ||
+        UserOpcode == ISD::CopyToReg ||
         UserOpcode == ISD::CopyFromReg ||
         UserOpcode == ISD::INLINEASM ||
         UserOpcode == ISD::EH_LABEL ||
@@ -1878,10 +2073,9 @@ HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
     }
   }
 
-  SDValue Res;
   if (InputChains.size() == 1)
     return InputChains[0];
-  return CurDAG->getNode(ISD::TokenFactor, ChainNodesMatched[0]->getDebugLoc(),
+  return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
                          MVT::Other, &InputChains[0], InputChains.size());
 }
 
@@ -1954,6 +2148,18 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
   return N == RecordedNodes[RecNo].first;
 }
 
+/// CheckChildSame - Implements OP_CheckChildXSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
+CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+             SDValue N,
+             const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes,
+             unsigned ChildNo) {
+  if (ChildNo >= N.getNumOperands())
+    return false;  // Match fails if out of range child #.
+  return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
+                     RecordedNodes);
+}
+
 /// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
 CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
@@ -1978,24 +2184,23 @@ CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
 
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
 CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
-          SDValue N, const TargetLowering &TLI) {
+          SDValue N, const TargetLowering *TLI) {
   MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
   if (N.getValueType() == VT) return true;
 
   // Handle the case when VT is iPTR.
-  return VT == MVT::iPTR && N.getValueType() == TLI.getPointerTy();
+  return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy();
 }
 
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
 CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
-               SDValue N, const TargetLowering &TLI,
+               SDValue N, const TargetLowering *TLI,
                unsigned ChildNo) {
   if (ChildNo >= N.getNumOperands())
     return false;  // Match fails if out of range child #.
   return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI);
 }
 
-
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
 CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
               SDValue N) {
@@ -2005,13 +2210,13 @@ CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
 
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
 CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
-               SDValue N, const TargetLowering &TLI) {
+               SDValue N, const TargetLowering *TLI) {
   MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
   if (cast<VTSDNode>(N)->getVT() == VT)
     return true;
 
   // Handle the case when VT is iPTR.
-  return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI.getPointerTy();
+  return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI->getPointerTy();
 }
 
 LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
@@ -2069,6 +2274,13 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
   case SelectionDAGISel::OPC_CheckSame:
     Result = !::CheckSame(Table, Index, N, RecordedNodes);
     return Index;
+  case SelectionDAGISel::OPC_CheckChild0Same:
+  case SelectionDAGISel::OPC_CheckChild1Same:
+  case SelectionDAGISel::OPC_CheckChild2Same:
+  case SelectionDAGISel::OPC_CheckChild3Same:
+    Result = !::CheckChildSame(Table, Index, N, RecordedNodes,
+                        Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Same);
+    return Index;
   case SelectionDAGISel::OPC_CheckPatternPredicate:
     Result = !::CheckPatternPredicate(Table, Index, SDISel);
     return Index;
@@ -2079,7 +2291,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
     Result = !::CheckOpcode(Table, Index, N.getNode());
     return Index;
   case SelectionDAGISel::OPC_CheckType:
-    Result = !::CheckType(Table, Index, N, SDISel.TLI);
+    Result = !::CheckType(Table, Index, N, SDISel.getTargetLowering());
     return Index;
   case SelectionDAGISel::OPC_CheckChild0Type:
   case SelectionDAGISel::OPC_CheckChild1Type:
@@ -2089,14 +2301,14 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
   case SelectionDAGISel::OPC_CheckChild5Type:
   case SelectionDAGISel::OPC_CheckChild6Type:
   case SelectionDAGISel::OPC_CheckChild7Type:
-    Result = !::CheckChildType(Table, Index, N, SDISel.TLI,
+    Result = !::CheckChildType(Table, Index, N, SDISel.getTargetLowering(),
                         Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Type);
     return Index;
   case SelectionDAGISel::OPC_CheckCondCode:
     Result = !::CheckCondCode(Table, Index, N);
     return Index;
   case SelectionDAGISel::OPC_CheckValueType:
-    Result = !::CheckValueType(Table, Index, N, SDISel.TLI);
+    Result = !::CheckValueType(Table, Index, N, SDISel.getTargetLowering());
     return Index;
   case SelectionDAGISel::OPC_CheckInteger:
     Result = !::CheckInteger(Table, Index, N);
@@ -2366,6 +2578,14 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_CheckSame:
       if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break;
       continue;
+
+    case OPC_CheckChild0Same: case OPC_CheckChild1Same:
+    case OPC_CheckChild2Same: case OPC_CheckChild3Same:
+      if (!::CheckChildSame(MatcherTable, MatcherIndex, N, RecordedNodes,
+                            Opcode-OPC_CheckChild0Same))
+        break;
+      continue;
+
     case OPC_CheckPatternPredicate:
       if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break;
       continue;
@@ -2389,7 +2609,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       continue;
 
     case OPC_CheckType:
-      if (!::CheckType(MatcherTable, MatcherIndex, N, TLI)) break;
+      if (!::CheckType(MatcherTable, MatcherIndex, N, getTargetLowering()))
+        break;
       continue;
 
     case OPC_SwitchOpcode: {
@@ -2424,7 +2645,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     }
 
     case OPC_SwitchType: {
-      MVT CurNodeVT = N.getValueType().getSimpleVT();
+      MVT CurNodeVT = N.getSimpleValueType();
       unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
       unsigned CaseSize;
       while (1) {
@@ -2436,7 +2657,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
 
         MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
         if (CaseVT == MVT::iPTR)
-          CaseVT = TLI.getPointerTy();
+          CaseVT = getTargetLowering()->getPointerTy();
 
         // If the VT matches, then we will execute this case.
         if (CurNodeVT == CaseVT)
@@ -2458,7 +2679,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_CheckChild2Type: case OPC_CheckChild3Type:
     case OPC_CheckChild4Type: case OPC_CheckChild5Type:
     case OPC_CheckChild6Type: case OPC_CheckChild7Type:
-      if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
+      if (!::CheckChildType(MatcherTable, MatcherIndex, N, getTargetLowering(),
                             Opcode-OPC_CheckChild0Type))
         break;
       continue;
@@ -2466,7 +2687,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
       continue;
     case OPC_CheckValueType:
-      if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI)) break;
+      if (!::CheckValueType(MatcherTable, MatcherIndex, N, getTargetLowering()))
+        break;
       continue;
     case OPC_CheckInteger:
       if (!::CheckInteger(MatcherTable, MatcherIndex, N)) break;
@@ -2535,7 +2757,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_EmitConvertToTarget:  {
       // Convert from IMM/FPIMM to target version.
       unsigned RecNo = MatcherTable[MatcherIndex++];
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitConvertToTarget");
       SDValue Imm = RecordedNodes[RecNo].first;
 
       if (Imm->getOpcode() == ISD::Constant) {
@@ -2560,7 +2782,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
 
       // Read all of the chained nodes.
       unsigned RecNo = Opcode == OPC_EmitMergeInputChains1_1;
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
       ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
 
       // FIXME: What if other value results of the node have uses not matched
@@ -2597,7 +2819,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       // Read all of the chained nodes.
       for (unsigned i = 0; i != NumChains; ++i) {
         unsigned RecNo = MatcherTable[MatcherIndex++];
-        assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+        assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
         ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
 
         // FIXME: What if other value results of the node have uses not matched
@@ -2624,13 +2846,13 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
 
     case OPC_EmitCopyToReg: {
       unsigned RecNo = MatcherTable[MatcherIndex++];
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg");
       unsigned DestPhysReg = MatcherTable[MatcherIndex++];
 
       if (InputChain.getNode() == 0)
         InputChain = CurDAG->getEntryNode();
 
-      InputChain = CurDAG->getCopyToReg(InputChain, NodeToMatch->getDebugLoc(),
+      InputChain = CurDAG->getCopyToReg(InputChain, SDLoc(NodeToMatch),
                                         DestPhysReg, RecordedNodes[RecNo].first,
                                         InputGlue);
 
@@ -2641,7 +2863,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
     case OPC_EmitNodeXForm: {
       unsigned XFormNo = MatcherTable[MatcherIndex++];
       unsigned RecNo = MatcherTable[MatcherIndex++];
-      assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+      assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm");
       SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo);
       RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, (SDNode*) 0));
       continue;
@@ -2658,7 +2880,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       for (unsigned i = 0; i != NumVTs; ++i) {
         MVT::SimpleValueType VT =
           (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
-        if (VT == MVT::iPTR) VT = TLI.getPointerTy().SimpleTy;
+        if (VT == MVT::iPTR) VT = getTargetLowering()->getPointerTy().SimpleTy;
         VTs.push_back(VT);
       }
 
@@ -2717,7 +2939,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
       if (Opcode != OPC_MorphNodeTo) {
         // If this is a normal EmitNode command, just create the new node and
         // add the results to the RecordedNodes list.
-        Res = CurDAG->getMachineNode(TargetOpc, NodeToMatch->getDebugLoc(),
+        Res = CurDAG->getMachineNode(TargetOpc, SDLoc(NodeToMatch),
                                      VTList, Ops);
 
         // Add all the non-glue/non-chain results to the RecordedNodes list.
@@ -2760,8 +2982,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
         bool mayStore = MCID.mayStore();
 
         unsigned NumMemRefs = 0;
-        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
-             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+        for (SmallVectorImpl<MachineMemOperand *>::const_iterator I =
+               MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
           if ((*I)->isLoad()) {
             if (mayLoad)
               ++NumMemRefs;
@@ -2777,8 +2999,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
           MF->allocateMemRefsArray(NumMemRefs);
 
         MachineSDNode::mmo_iterator MemRefsPos = MemRefs;
-        for (SmallVector<MachineMemOperand*, 2>::const_iterator I =
-             MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+        for (SmallVectorImpl<MachineMemOperand *>::const_iterator I =
+               MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
           if ((*I)->isLoad()) {
             if (mayLoad)
               *MemRefsPos++ = *I;
@@ -2818,7 +3040,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
         if (RecNo & 128)
           RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
 
-        assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+        assert(RecNo < RecordedNodes.size() && "Invalid MarkGlueResults");
         GlueResultNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
       }
       continue;
@@ -2835,7 +3057,7 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
         if (ResSlot & 128)
           ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex);
 
-        assert(ResSlot < RecordedNodes.size() && "Invalid CheckSame");
+        assert(ResSlot < RecordedNodes.size() && "Invalid CompleteMatch");
         SDValue Res = RecordedNodes[ResSlot].first;
 
         assert(i < NodeToMatch->getNumValues() &&
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f5fc66c..82b068d 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -64,13 +64,29 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
   return isUsedByReturnOnly(Node, Chain);
 }
 
+/// \brief Set CallLoweringInfo attribute flags based on a call instruction
+/// and called function attributes.
+void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+                                                 unsigned AttrIdx) {
+  isSExt     = CS->paramHasAttr(AttrIdx, Attribute::SExt);
+  isZExt     = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
+  isInReg    = CS->paramHasAttr(AttrIdx, Attribute::InReg);
+  isSRet     = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
+  isNest     = CS->paramHasAttr(AttrIdx, Attribute::Nest);
+  isByVal    = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
+  isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+  Alignment  = CS->getParamAlignment(AttrIdx);
+}
 
 /// Generate a libcall taking the given operands as arguments and returning a
 /// result of type RetVT.
-SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
-                                    RTLIB::Libcall LC, EVT RetVT,
-                                    const SDValue *Ops, unsigned NumOps,
-                                    bool isSigned, DebugLoc dl) const {
+std::pair<SDValue, SDValue>
+TargetLowering::makeLibCall(SelectionDAG &DAG,
+                            RTLIB::Libcall LC, EVT RetVT,
+                            const SDValue *Ops, unsigned NumOps,
+                            bool isSigned, SDLoc dl,
+                            bool doesNotReturn,
+                            bool isReturnValueUsed) const {
   TargetLowering::ArgListTy Args;
   Args.reserve(NumOps);
 
@@ -89,11 +105,9 @@ SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
   CallLoweringInfo CLI(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
                     false, 0, getLibcallCallingConv(LC),
                     /*isTailCall=*/false,
-                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, dl);
-  std::pair<SDValue,SDValue> CallInfo = LowerCallTo(CLI);
-
-  return CallInfo.first;
+                    doesNotReturn, isReturnValueUsed, Callee, Args,
+                    DAG, dl);
+  return LowerCallTo(CLI);
 }
 
 
@@ -102,7 +116,7 @@ SDValue TargetLowering::makeLibCall(SelectionDAG &DAG,
 void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
                                          SDValue &NewLHS, SDValue &NewRHS,
                                          ISD::CondCode &CCCode,
-                                         DebugLoc dl) const {
+                                         SDLoc dl) const {
   assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
          && "Unsupported setcc type!");
 
@@ -183,14 +197,18 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
   // Use the target specific return value for comparions lib calls.
   EVT RetVT = getCmpLibcallReturnType();
   SDValue Ops[2] = { NewLHS, NewRHS };
-  NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
+  NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, 2, false/*sign irrelevant*/,
+                       dl).first;
   NewRHS = DAG.getConstant(0, RetVT);
   CCCode = getCmpLibcallCC(LC1);
   if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
-    SDValue Tmp = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(RetVT),
+    SDValue Tmp = DAG.getNode(ISD::SETCC, dl,
+                              getSetCCResultType(*DAG.getContext(), RetVT),
                               NewLHS, NewRHS, DAG.getCondCode(CCCode));
-    NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/, dl);
-    NewLHS = DAG.getNode(ISD::SETCC, dl, getSetCCResultType(RetVT), NewLHS,
+    NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, 2, false/*sign irrelevant*/,
+                         dl).first;
+    NewLHS = DAG.getNode(ISD::SETCC, dl,
+                         getSetCCResultType(*DAG.getContext(), RetVT), NewLHS,
                          NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
     NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS);
     NewRHS = SDValue();
@@ -262,7 +280,7 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
 /// constant and return true.
 bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
                                                         const APInt &Demanded) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // FIXME: ISD::SELECT, ISD::SELECT_CC
   switch (Op.getOpcode()) {
@@ -302,7 +320,7 @@ bool
 TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
                                                     unsigned BitWidth,
                                                     const APInt &Demanded,
-                                                    DebugLoc dl) {
+                                                    SDLoc dl) {
   assert(Op.getNumOperands() == 2 &&
          "ShrinkDemandedOp only supports binary operators!");
   assert(Op.getNode()->getNumValues() == 1 &&
@@ -356,7 +374,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
   assert(Op.getValueType().getScalarType().getSizeInBits() == BitWidth &&
          "Mask size mismatches value type size!");
   APInt NewMask = DemandedMask;
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Don't know anything.
   KnownZero = KnownOne = APInt(BitWidth, 0);
@@ -508,7 +526,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     // into an AND, as we know the bits will be cleared.
     //    e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
     // NB: it is okay if more bits are known than are requested
-    if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side 
+    if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side
       if (KnownOne == KnownOne2) { // set bits are the same on both sides
         EVT VT = Op.getValueType();
         SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, VT);
@@ -630,6 +648,31 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
                           TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(),
                                           NarrowShl));
         }
+        // Repeat the SHL optimization above in cases where an extension
+        // intervenes: (shl (anyext (shr x, c1)), c2) to
+        // (shl (anyext x), c2-c1).  This requires that the bottom c1 bits
+        // aren't demanded (as above) and that the shifted upper c1 bits of
+        // x aren't demanded.
+        if (InOp.hasOneUse() &&
+            InnerOp.getOpcode() == ISD::SRL &&
+            InnerOp.hasOneUse() &&
+            isa<ConstantSDNode>(InnerOp.getOperand(1))) {
+          uint64_t InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
+            ->getZExtValue();
+          if (InnerShAmt < ShAmt &&
+              InnerShAmt < InnerBits &&
+              NewMask.lshr(InnerBits - InnerShAmt + ShAmt) == 0 &&
+              NewMask.trunc(ShAmt) == 0) {
+            SDValue NewSA =
+              TLO.DAG.getConstant(ShAmt - InnerShAmt,
+                                  Op.getOperand(1).getValueType());
+            EVT VT = Op.getValueType();
+            SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
+                                             InnerOp.getOperand(0));
+            return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
+                                                     NewExt, NewSA));
+          }
+        }
       }
 
       KnownZero <<= SA->getZExtValue();
@@ -720,13 +763,24 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
 
       // If the input sign bit is known to be zero, or if none of the top bits
       // are demanded, turn this into an unsigned shift right.
-      if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+      if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits)
         return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
                                                  Op.getOperand(0),
                                                  Op.getOperand(1)));
-      } else if (KnownOne.intersects(SignBit)) { // New bits are known one.
-        KnownOne |= HighBits;
+
+      int Log2 = NewMask.exactLogBase2();
+      if (Log2 >= 0) {
+        // The bit must come from the sign.
+        SDValue NewSA =
+          TLO.DAG.getConstant(BitWidth - 1 - Log2,
+                              Op.getOperand(1).getValueType());
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
+                                                 Op.getOperand(0), NewSA));
       }
+
+      if (KnownOne.intersects(SignBit))
+        // New bits are known one.
+        KnownOne |= HighBits;
     }
     break;
   case ISD::SIGN_EXTEND_INREG: {
@@ -1066,7 +1120,7 @@ static bool ValueHasExactlyOneBitSet(SDValue Val, const SelectionDAG &DAG) {
 SDValue
 TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
                               ISD::CondCode Cond, bool foldBooleans,
-                              DAGCombinerInfo &DCI, DebugLoc dl) const {
+                              DAGCombinerInfo &DCI, SDLoc dl) const {
   SelectionDAG &DAG = DCI.DAG;
 
   // These setcc operations always fold.
@@ -1075,13 +1129,20 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
   case ISD::SETFALSE:
   case ISD::SETFALSE2: return DAG.getConstant(0, VT);
   case ISD::SETTRUE:
-  case ISD::SETTRUE2:  return DAG.getConstant(1, VT);
+  case ISD::SETTRUE2: {
+    TargetLowering::BooleanContent Cnt = getBooleanContents(VT.isVector());
+    return DAG.getConstant(
+        Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, VT);
+  }
   }
 
   // Ensure that the constant occurs on the RHS, and fold constant
   // comparisons.
-  if (isa<ConstantSDNode>(N0.getNode()))
-    return DAG.getSetCC(dl, VT, N1, N0, ISD::getSetCCSwappedOperands(Cond));
+  ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
+  if (isa<ConstantSDNode>(N0.getNode()) &&
+      (DCI.isBeforeLegalizeOps() ||
+       isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
+    return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
 
   if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
     const APInt &C1 = N1C->getAPIntValue();
@@ -1160,7 +1221,8 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       }
 
       // Make sure we're not losing bits from the constant.
-      if (MinBits < C1.getBitWidth() && MinBits > C1.getActiveBits()) {
+      if (MinBits > 0 &&
+          MinBits < C1.getBitWidth() && MinBits >= C1.getActiveBits()) {
         EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
         if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
           // Will get folded away.
@@ -1175,6 +1237,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
     // the test is for equality or unsigned, and all 1 bits of the const are
     // in the same partial word, see if we can shorten the load.
     if (DCI.isBeforeLegalize() &&
+        !ISD::isSignedIntSetCC(Cond) &&
         N0.getOpcode() == ISD::AND && C1 == 0 &&
         N0.getNode()->hasOneUse() &&
         isa<LoadSDNode>(N0.getOperand(0)) &&
@@ -1319,7 +1382,9 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
         ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
         CC = ISD::getSetCCInverse(CC,
                                   N0.getOperand(0).getValueType().isInteger());
-        return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+        if (DCI.isBeforeLegalizeOps() ||
+            isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType()))
+          return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
       }
 
       if ((N0.getOpcode() == ISD::XOR ||
@@ -1756,16 +1821,22 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
       if (N0.getOperand(0) == N1 || N0.getOperand(1) == N1) {
         if (ValueHasExactlyOneBitSet(N1, DAG)) {
           Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
-          SDValue Zero = DAG.getConstant(0, N1.getValueType());
-          return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+          if (DCI.isBeforeLegalizeOps() ||
+              isCondCodeLegal(Cond, N0.getSimpleValueType())) {
+            SDValue Zero = DAG.getConstant(0, N1.getValueType());
+            return DAG.getSetCC(dl, VT, N0, Zero, Cond);
+          }
         }
       }
     if (N1.getOpcode() == ISD::AND)
       if (N1.getOperand(0) == N0 || N1.getOperand(1) == N0) {
         if (ValueHasExactlyOneBitSet(N0, DAG)) {
           Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
-          SDValue Zero = DAG.getConstant(0, N0.getValueType());
-          return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+          if (DCI.isBeforeLegalizeOps() ||
+              isCondCodeLegal(Cond, N1.getSimpleValueType())) {
+            SDValue Zero = DAG.getConstant(0, N0.getValueType());
+            return DAG.getSetCC(dl, VT, N1, Zero, Cond);
+          }
         }
       }
   }
@@ -1966,7 +2037,7 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
         int64_t Offs = GA->getOffset();
         if (C) Offs += C->getZExtValue();
         Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(),
-                                                 C ? C->getDebugLoc() : DebugLoc(),
+                                                 C ? SDLoc(C) : SDLoc(),
                                                  Op.getValueType(), Offs));
         return;
       }
@@ -1989,8 +2060,8 @@ void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 
 std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
 getRegForInlineAsmConstraint(const std::string &Constraint,
-                             EVT VT) const {
-  if (Constraint[0] != '{')
+                             MVT VT) const {
+  if (Constraint.empty() || Constraint[0] != '{')
     return std::make_pair(0u, static_cast<TargetRegisterClass*>(0));
   assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
 
@@ -2139,8 +2210,9 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
           break;
         }
       } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
-        OpInfo.ConstraintVT = MVT::getIntegerVT(
-            8*getDataLayout()->getPointerSize(PT->getAddressSpace()));
+        unsigned PtrSize
+          = getDataLayout()->getPointerSizeInBits(PT->getAddressSpace());
+        OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize);
       } else {
         OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
       }
@@ -2435,9 +2507,9 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
   }
 }
 
-/// BuildExactDiv - Given an exact SDIV by a constant, create a multiplication
+/// \brief Given an exact SDIV by a constant, create a multiplication
 /// with the multiplicative inverse of the constant.
-SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, DebugLoc dl,
+SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, SDLoc dl,
                                        SelectionDAG &DAG) const {
   ConstantSDNode *C = cast<ConstantSDNode>(Op2);
   APInt d = C->getAPIntValue();
@@ -2461,7 +2533,7 @@ SDValue TargetLowering::BuildExactSDIV(SDValue Op1, SDValue Op2, DebugLoc dl,
   return DAG.getNode(ISD::MUL, dl, Op1.getValueType(), Op1, Op2);
 }
 
-/// BuildSDIVSequence - Given an ISD::SDIV node expressing a divide by constant,
+/// \brief Given an ISD::SDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.  See:
 /// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
@@ -2469,7 +2541,7 @@ SDValue TargetLowering::
 BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
           std::vector<SDNode*> *Created) const {
   EVT VT = N->getValueType(0);
-  DebugLoc dl= N->getDebugLoc();
+  SDLoc dl(N);
 
   // Check to see if we can do this.
   // FIXME: We should be more aggressive here.
@@ -2521,7 +2593,7 @@ BuildSDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
   return DAG.getNode(ISD::ADD, dl, VT, Q, T);
 }
 
-/// BuildUDIVSequence - Given an ISD::UDIV node expressing a divide by constant,
+/// \brief Given an ISD::UDIV node expressing a divide by constant,
 /// return a DAG expression to select that will generate the same value by
 /// multiplying by a magic number.  See:
 /// <http://the.wall.riscom.net/books/proc/ppc/cwg/code2.html>
@@ -2529,7 +2601,7 @@ SDValue TargetLowering::
 BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
           std::vector<SDNode*> *Created) const {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Check to see if we can do this.
   // FIXME: We should be more aggressive here.
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
deleted file mode 100644
index 2feea59..0000000
--- a/lib/CodeGen/ShrinkWrapping.cpp
+++ /dev/null
@@ -1,1152 +0,0 @@
-//===-- ShrinkWrapping.cpp - Reduce spills/restores of callee-saved regs --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a shrink wrapping variant of prolog/epilog insertion:
-// - Spills and restores of callee-saved registers (CSRs) are placed in the
-//   machine CFG to tightly surround their uses so that execution paths that
-//   do not use CSRs do not pay the spill/restore penalty.
-//
-// - Avoiding placment of spills/restores in loops: if a CSR is used inside a
-//   loop the spills are placed in the loop preheader, and restores are
-//   placed in the loop exit nodes (the successors of loop _exiting_ nodes).
-//
-// - Covering paths without CSR uses:
-//   If a region in a CFG uses CSRs and has multiple entry and/or exit points,
-//   the use info for the CSRs inside the region is propagated outward in the
-//   CFG to ensure validity of the spill/restore placements. This decreases
-//   the effectiveness of shrink wrapping but does not require edge splitting
-//   in the machine CFG.
-//
-// This shrink wrapping implementation uses an iterative analysis to determine
-// which basic blocks require spills and restores for CSRs.
-//
-// This pass uses MachineDominators and MachineLoopInfo. Loop information
-// is used to prevent placement of callee-saved register spills/restores
-// in the bodies of loops.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "shrink-wrap"
-
-#include "PrologEpilogInserter.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include <sstream>
-
-using namespace llvm;
-
-STATISTIC(numSRReduced, "Number of CSR spills+restores reduced.");
-
-// Shrink Wrapping:
-static cl::opt<bool>
-ShrinkWrapping("shrink-wrap",
-               cl::desc("Shrink wrap callee-saved register spills/restores"));
-
-// Shrink wrap only the specified function, a debugging aid.
-static cl::opt<std::string>
-ShrinkWrapFunc("shrink-wrap-func", cl::Hidden,
-               cl::desc("Shrink wrap the specified function"),
-               cl::value_desc("funcname"),
-               cl::init(""));
-
-// Debugging level for shrink wrapping.
-enum ShrinkWrapDebugLevel {
-  Disabled, BasicInfo, Iterations, Details
-};
-
-static cl::opt<enum ShrinkWrapDebugLevel>
-ShrinkWrapDebugging("shrink-wrap-dbg", cl::Hidden,
-  cl::desc("Print shrink wrapping debugging information"),
-  cl::values(
-    clEnumVal(Disabled  , "disable debug output"),
-    clEnumVal(BasicInfo , "print basic DF sets"),
-    clEnumVal(Iterations, "print SR sets for each iteration"),
-    clEnumVal(Details   , "print all DF sets"),
-    clEnumValEnd));
-
-
-void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesCFG();
-  if (ShrinkWrapping || ShrinkWrapFunc != "") {
-    AU.addRequired<MachineLoopInfo>();
-    AU.addRequired<MachineDominatorTree>();
-  }
-  AU.addPreserved<MachineLoopInfo>();
-  AU.addPreserved<MachineDominatorTree>();
-  AU.addRequired<TargetPassConfig>();
-  MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-//===----------------------------------------------------------------------===//
-//  ShrinkWrapping implementation
-//===----------------------------------------------------------------------===//
-
-// Convienences for dealing with machine loops.
-MachineBasicBlock* PEI::getTopLevelLoopPreheader(MachineLoop* LP) {
-  assert(LP && "Machine loop is NULL.");
-  MachineBasicBlock* PHDR = LP->getLoopPreheader();
-  MachineLoop* PLP = LP->getParentLoop();
-  while (PLP) {
-    PHDR = PLP->getLoopPreheader();
-    PLP = PLP->getParentLoop();
-  }
-  return PHDR;
-}
-
-MachineLoop* PEI::getTopLevelLoopParent(MachineLoop *LP) {
-  if (LP == 0)
-    return 0;
-  MachineLoop* PLP = LP->getParentLoop();
-  while (PLP) {
-    LP = PLP;
-    PLP = PLP->getParentLoop();
-  }
-  return LP;
-}
-
-bool PEI::isReturnBlock(MachineBasicBlock* MBB) {
-  return (MBB && !MBB->empty() && MBB->back().isReturn());
-}
-
-// Initialize shrink wrapping DFA sets, called before iterations.
-void PEI::clearAnticAvailSets() {
-  AnticIn.clear();
-  AnticOut.clear();
-  AvailIn.clear();
-  AvailOut.clear();
-}
-
-// Clear all sets constructed by shrink wrapping.
-void PEI::clearAllSets() {
-  ReturnBlocks.clear();
-  clearAnticAvailSets();
-  UsedCSRegs.clear();
-  CSRUsed.clear();
-  TLLoops.clear();
-  CSRSave.clear();
-  CSRRestore.clear();
-}
-
-// Initialize all shrink wrapping data.
-void PEI::initShrinkWrappingInfo() {
-  clearAllSets();
-  EntryBlock = 0;
-#ifndef NDEBUG
-  HasFastExitPath = false;
-#endif
-  ShrinkWrapThisFunction = ShrinkWrapping;
-  // DEBUG: enable or disable shrink wrapping for the current function
-  // via --shrink-wrap-func=<funcname>.
-#ifndef NDEBUG
-  if (ShrinkWrapFunc != "") {
-    std::string MFName = MF->getName().str();
-    ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
-  }
-#endif
-}
-
-
-/// placeCSRSpillsAndRestores - determine which MBBs of the function
-/// need save, restore code for callee-saved registers by doing a DF analysis
-/// similar to the one used in code motion (GVNPRE). This produces maps of MBBs
-/// to sets of registers (CSRs) for saves and restores. MachineLoopInfo
-/// is used to ensure that CSR save/restore code is not placed inside loops.
-/// This function computes the maps of MBBs -> CSRs to spill and restore
-/// in CSRSave, CSRRestore.
-///
-/// If shrink wrapping is not being performed, place all spills in
-/// the entry block, all restores in return blocks. In this case,
-/// CSRSave has a single mapping, CSRRestore has mappings for each
-/// return block.
-///
-void PEI::placeCSRSpillsAndRestores(MachineFunction &Fn) {
-
-  DEBUG(MF = &Fn);
-
-  initShrinkWrappingInfo();
-
-  DEBUG(if (ShrinkWrapThisFunction) {
-      dbgs() << "Place CSR spills/restores for "
-             << MF->getName() << "\n";
-    });
-
-  if (calculateSets(Fn))
-    placeSpillsAndRestores(Fn);
-}
-
-/// calcAnticInOut - calculate the anticipated in/out reg sets
-/// for the given MBB by looking forward in the MCFG at MBB's
-/// successors.
-///
-bool PEI::calcAnticInOut(MachineBasicBlock* MBB) {
-  bool changed = false;
-
-  // AnticOut[MBB] = INTERSECT(AnticIn[S] for S in SUCCESSORS(MBB))
-  SmallVector<MachineBasicBlock*, 4> successors;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-    if (SUCC != MBB)
-      successors.push_back(SUCC);
-  }
-
-  unsigned i = 0, e = successors.size();
-  if (i != e) {
-    CSRegSet prevAnticOut = AnticOut[MBB];
-    MachineBasicBlock* SUCC = successors[i];
-
-    AnticOut[MBB] = AnticIn[SUCC];
-    for (++i; i != e; ++i) {
-      SUCC = successors[i];
-      AnticOut[MBB] &= AnticIn[SUCC];
-    }
-    if (prevAnticOut != AnticOut[MBB])
-      changed = true;
-  }
-
-  // AnticIn[MBB] = UNION(CSRUsed[MBB], AnticOut[MBB]);
-  CSRegSet prevAnticIn = AnticIn[MBB];
-  AnticIn[MBB] = CSRUsed[MBB] | AnticOut[MBB];
-  if (prevAnticIn != AnticIn[MBB])
-    changed = true;
-  return changed;
-}
-
-/// calcAvailInOut - calculate the available in/out reg sets
-/// for the given MBB by looking backward in the MCFG at MBB's
-/// predecessors.
-///
-bool PEI::calcAvailInOut(MachineBasicBlock* MBB) {
-  bool changed = false;
-
-  // AvailIn[MBB] = INTERSECT(AvailOut[P] for P in PREDECESSORS(MBB))
-  SmallVector<MachineBasicBlock*, 4> predecessors;
-  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-    MachineBasicBlock* PRED = *PI;
-    if (PRED != MBB)
-      predecessors.push_back(PRED);
-  }
-
-  unsigned i = 0, e = predecessors.size();
-  if (i != e) {
-    CSRegSet prevAvailIn = AvailIn[MBB];
-    MachineBasicBlock* PRED = predecessors[i];
-
-    AvailIn[MBB] = AvailOut[PRED];
-    for (++i; i != e; ++i) {
-      PRED = predecessors[i];
-      AvailIn[MBB] &= AvailOut[PRED];
-    }
-    if (prevAvailIn != AvailIn[MBB])
-      changed = true;
-  }
-
-  // AvailOut[MBB] = UNION(CSRUsed[MBB], AvailIn[MBB]);
-  CSRegSet prevAvailOut = AvailOut[MBB];
-  AvailOut[MBB] = CSRUsed[MBB] | AvailIn[MBB];
-  if (prevAvailOut != AvailOut[MBB])
-    changed = true;
-  return changed;
-}
-
-/// calculateAnticAvail - build the sets anticipated and available
-/// registers in the MCFG of the current function iteratively,
-/// doing a combined forward and backward analysis.
-///
-void PEI::calculateAnticAvail(MachineFunction &Fn) {
-  // Initialize data flow sets.
-  clearAnticAvailSets();
-
-  // Calculate Antic{In,Out} and Avail{In,Out} iteratively on the MCFG.
-  bool changed = true;
-  unsigned iterations = 0;
-  while (changed) {
-    changed = false;
-    ++iterations;
-    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-
-      // Calculate anticipated in, out regs at MBB from
-      // anticipated at successors of MBB.
-      changed |= calcAnticInOut(MBB);
-
-      // Calculate available in, out regs at MBB from
-      // available at predecessors of MBB.
-      changed |= calcAvailInOut(MBB);
-    }
-  }
-
-  DEBUG({
-      if (ShrinkWrapDebugging >= Details) {
-        dbgs()
-          << "-----------------------------------------------------------\n"
-          << " Antic/Avail Sets:\n"
-          << "-----------------------------------------------------------\n"
-          << "iterations = " << iterations << "\n"
-          << "-----------------------------------------------------------\n"
-          << "MBB | USED | ANTIC_IN | ANTIC_OUT | AVAIL_IN | AVAIL_OUT\n"
-          << "-----------------------------------------------------------\n";
-
-        for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-             MBBI != MBBE; ++MBBI) {
-          MachineBasicBlock* MBB = MBBI;
-          dumpSets(MBB);
-        }
-
-        dbgs()
-          << "-----------------------------------------------------------\n";
-      }
-    });
-}
-
-/// propagateUsesAroundLoop - copy used register info from MBB to all blocks
-/// of the loop given by LP and its parent loops. This prevents spills/restores
-/// from being placed in the bodies of loops.
-///
-void PEI::propagateUsesAroundLoop(MachineBasicBlock* MBB, MachineLoop* LP) {
-  if (! MBB || !LP)
-    return;
-
-  std::vector<MachineBasicBlock*> loopBlocks = LP->getBlocks();
-  for (unsigned i = 0, e = loopBlocks.size(); i != e; ++i) {
-    MachineBasicBlock* LBB = loopBlocks[i];
-    if (LBB == MBB)
-      continue;
-    if (CSRUsed[LBB].contains(CSRUsed[MBB]))
-      continue;
-    CSRUsed[LBB] |= CSRUsed[MBB];
-  }
-}
-
-/// calculateSets - collect the CSRs used in this function, compute
-/// the DF sets that describe the initial minimal regions in the
-/// Machine CFG around which CSR spills and restores must be placed.
-///
-/// Additionally, this function decides if shrink wrapping should
-/// be disabled for the current function, checking the following:
-///  1. the current function has more than 500 MBBs: heuristic limit
-///     on function size to reduce compile time impact of the current
-///     iterative algorithm.
-///  2. all CSRs are used in the entry block.
-///  3. all CSRs are used in all immediate successors of the entry block.
-///  4. all CSRs are used in a subset of blocks, each of which dominates
-///     all return blocks. These blocks, taken as a subgraph of the MCFG,
-///     are equivalent to the entry block since all execution paths pass
-///     through them.
-///
-bool PEI::calculateSets(MachineFunction &Fn) {
-  // Sets used to compute spill, restore placement sets.
-  const std::vector<CalleeSavedInfo> CSI =
-    Fn.getFrameInfo()->getCalleeSavedInfo();
-
-  // If no CSRs used, we are done.
-  if (CSI.empty()) {
-    DEBUG(if (ShrinkWrapThisFunction)
-            dbgs() << "DISABLED: " << Fn.getName()
-                   << ": uses no callee-saved registers\n");
-    return false;
-  }
-
-  // Save refs to entry and return blocks.
-  EntryBlock = Fn.begin();
-  for (MachineFunction::iterator MBB = Fn.begin(), E = Fn.end();
-       MBB != E; ++MBB)
-    if (isReturnBlock(MBB))
-      ReturnBlocks.push_back(MBB);
-
-  // Determine if this function has fast exit paths.
-  DEBUG(if (ShrinkWrapThisFunction)
-          findFastExitPath());
-
-  // Limit shrink wrapping via the current iterative bit vector
-  // implementation to functions with <= 500 MBBs.
-  if (Fn.size() > 500) {
-    DEBUG(if (ShrinkWrapThisFunction)
-            dbgs() << "DISABLED: " << Fn.getName()
-                   << ": too large (" << Fn.size() << " MBBs)\n");
-    ShrinkWrapThisFunction = false;
-  }
-
-  // Return now if not shrink wrapping.
-  if (! ShrinkWrapThisFunction)
-    return false;
-
-  // Collect set of used CSRs.
-  for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
-    UsedCSRegs.set(inx);
-  }
-
-  // Walk instructions in all MBBs, create CSRUsed[] sets, choose
-  // whether or not to shrink wrap this function.
-  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
-  MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
-  const TargetRegisterInfo *TRI = Fn.getTarget().getRegisterInfo();
-
-  bool allCSRUsesInEntryBlock = true;
-  for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-       MBBI != MBBE; ++MBBI) {
-    MachineBasicBlock* MBB = MBBI;
-    for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end(); ++I) {
-      for (unsigned inx = 0, e = CSI.size(); inx != e; ++inx) {
-        unsigned Reg = CSI[inx].getReg();
-        // If instruction I reads or modifies Reg, add it to UsedCSRegs,
-        // CSRUsed map for the current block.
-        for (unsigned opInx = 0, opEnd = I->getNumOperands();
-             opInx != opEnd; ++opInx) {
-          const MachineOperand &MO = I->getOperand(opInx);
-          if (! (MO.isReg() && (MO.isUse() || MO.isDef())))
-            continue;
-          unsigned MOReg = MO.getReg();
-          if (!MOReg)
-            continue;
-          if (MOReg == Reg ||
-              (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
-               TargetRegisterInfo::isPhysicalRegister(Reg) &&
-               TRI->isSubRegister(Reg, MOReg))) {
-            // CSR Reg is defined/used in block MBB.
-            CSRUsed[MBB].set(inx);
-            // Check for uses in EntryBlock.
-            if (MBB != EntryBlock)
-              allCSRUsesInEntryBlock = false;
-          }
-        }
-      }
-    }
-
-    if (CSRUsed[MBB].empty())
-      continue;
-
-    // Propagate CSRUsed[MBB] in loops
-    if (MachineLoop* LP = LI.getLoopFor(MBB)) {
-      // Add top level loop to work list.
-      MachineBasicBlock* HDR = getTopLevelLoopPreheader(LP);
-      MachineLoop* PLP = getTopLevelLoopParent(LP);
-
-      if (! HDR) {
-        HDR = PLP->getHeader();
-        assert(HDR->pred_size() > 0 && "Loop header has no predecessors?");
-        MachineBasicBlock::pred_iterator PI = HDR->pred_begin();
-        HDR = *PI;
-      }
-      TLLoops[HDR] = PLP;
-
-      // Push uses from inside loop to its parent loops,
-      // or to all other MBBs in its loop.
-      if (LP->getLoopDepth() > 1) {
-        for (MachineLoop* PLP = LP->getParentLoop(); PLP;
-             PLP = PLP->getParentLoop()) {
-          propagateUsesAroundLoop(MBB, PLP);
-        }
-      } else {
-        propagateUsesAroundLoop(MBB, LP);
-      }
-    }
-  }
-
-  if (allCSRUsesInEntryBlock) {
-    DEBUG(dbgs() << "DISABLED: " << Fn.getName()
-                 << ": all CSRs used in EntryBlock\n");
-    ShrinkWrapThisFunction = false;
-  } else {
-    bool allCSRsUsedInEntryFanout = true;
-    for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
-           SE = EntryBlock->succ_end(); SI != SE; ++SI) {
-      MachineBasicBlock* SUCC = *SI;
-      if (CSRUsed[SUCC] != UsedCSRegs)
-        allCSRsUsedInEntryFanout = false;
-    }
-    if (allCSRsUsedInEntryFanout) {
-      DEBUG(dbgs() << "DISABLED: " << Fn.getName()
-                   << ": all CSRs used in imm successors of EntryBlock\n");
-      ShrinkWrapThisFunction = false;
-    }
-  }
-
-  if (ShrinkWrapThisFunction) {
-    // Check if MBB uses CSRs and dominates all exit nodes.
-    // Such nodes are equiv. to the entry node w.r.t.
-    // CSR uses: every path through the function must
-    // pass through this node. If each CSR is used at least
-    // once by these nodes, shrink wrapping is disabled.
-    CSRegSet CSRUsedInChokePoints;
-    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-      if (MBB == EntryBlock || CSRUsed[MBB].empty() || MBB->succ_size() < 1)
-        continue;
-      bool dominatesExitNodes = true;
-      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
-        if (! DT.dominates(MBB, ReturnBlocks[ri])) {
-          dominatesExitNodes = false;
-          break;
-        }
-      if (dominatesExitNodes) {
-        CSRUsedInChokePoints |= CSRUsed[MBB];
-        if (CSRUsedInChokePoints == UsedCSRegs) {
-          DEBUG(dbgs() << "DISABLED: " << Fn.getName()
-                       << ": all CSRs used in choke point(s) at "
-                       << getBasicBlockName(MBB) << "\n");
-          ShrinkWrapThisFunction = false;
-          break;
-        }
-      }
-    }
-  }
-
-  // Return now if we have decided not to apply shrink wrapping
-  // to the current function.
-  if (! ShrinkWrapThisFunction)
-    return false;
-
-  DEBUG({
-      dbgs() << "ENABLED: " << Fn.getName();
-      if (HasFastExitPath)
-        dbgs() << " (fast exit path)";
-      dbgs() << "\n";
-      if (ShrinkWrapDebugging >= BasicInfo) {
-        dbgs() << "------------------------------"
-             << "-----------------------------\n";
-        dbgs() << "UsedCSRegs = " << stringifyCSRegSet(UsedCSRegs) << "\n";
-        if (ShrinkWrapDebugging >= Details) {
-          dbgs() << "------------------------------"
-               << "-----------------------------\n";
-          dumpAllUsed();
-        }
-      }
-    });
-
-  // Build initial DF sets to determine minimal regions in the
-  // Machine CFG around which CSRs must be spilled and restored.
-  calculateAnticAvail(Fn);
-
-  return true;
-}
-
-/// addUsesForMEMERegion - add uses of CSRs spilled or restored in
-/// multi-entry, multi-exit (MEME) regions so spill and restore
-/// placement will not break code that enters or leaves a
-/// shrink-wrapped region by inducing spills with no matching
-/// restores or restores with no matching spills. A MEME region
-/// is a subgraph of the MCFG with multiple entry edges, multiple
-/// exit edges, or both. This code propagates use information
-/// through the MCFG until all paths requiring spills and restores
-/// _outside_ the computed minimal placement regions have been covered.
-///
-bool PEI::addUsesForMEMERegion(MachineBasicBlock* MBB,
-                               SmallVector<MachineBasicBlock*, 4>& blks) {
-  if (MBB->succ_size() < 2 && MBB->pred_size() < 2) {
-    bool processThisBlock = false;
-    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-           SE = MBB->succ_end(); SI != SE; ++SI) {
-      MachineBasicBlock* SUCC = *SI;
-      if (SUCC->pred_size() > 1) {
-        processThisBlock = true;
-        break;
-      }
-    }
-    if (!CSRRestore[MBB].empty() && MBB->succ_size() > 0) {
-      for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-             PE = MBB->pred_end(); PI != PE; ++PI) {
-        MachineBasicBlock* PRED = *PI;
-        if (PRED->succ_size() > 1) {
-          processThisBlock = true;
-          break;
-        }
-      }
-    }
-    if (! processThisBlock)
-      return false;
-  }
-
-  CSRegSet prop;
-  if (!CSRSave[MBB].empty())
-    prop = CSRSave[MBB];
-  else if (!CSRRestore[MBB].empty())
-    prop = CSRRestore[MBB];
-  else
-    prop = CSRUsed[MBB];
-  if (prop.empty())
-    return false;
-
-  // Propagate selected bits to successors, predecessors of MBB.
-  bool addedUses = false;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-    // Self-loop
-    if (SUCC == MBB)
-      continue;
-    if (! CSRUsed[SUCC].contains(prop)) {
-      CSRUsed[SUCC] |= prop;
-      addedUses = true;
-      blks.push_back(SUCC);
-      DEBUG(if (ShrinkWrapDebugging >= Iterations)
-              dbgs() << getBasicBlockName(MBB)
-                   << "(" << stringifyCSRegSet(prop) << ")->"
-                   << "successor " << getBasicBlockName(SUCC) << "\n");
-    }
-  }
-  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-    MachineBasicBlock* PRED = *PI;
-    // Self-loop
-    if (PRED == MBB)
-      continue;
-    if (! CSRUsed[PRED].contains(prop)) {
-      CSRUsed[PRED] |= prop;
-      addedUses = true;
-      blks.push_back(PRED);
-      DEBUG(if (ShrinkWrapDebugging >= Iterations)
-              dbgs() << getBasicBlockName(MBB)
-                   << "(" << stringifyCSRegSet(prop) << ")->"
-                   << "predecessor " << getBasicBlockName(PRED) << "\n");
-    }
-  }
-  return addedUses;
-}
-
-/// addUsesForTopLevelLoops - add uses for CSRs used inside top
-/// level loops to the exit blocks of those loops.
-///
-bool PEI::addUsesForTopLevelLoops(SmallVector<MachineBasicBlock*, 4>& blks) {
-  bool addedUses = false;
-
-  // Place restores for top level loops where needed.
-  for (DenseMap<MachineBasicBlock*, MachineLoop*>::iterator
-         I = TLLoops.begin(), E = TLLoops.end(); I != E; ++I) {
-    MachineBasicBlock* MBB = I->first;
-    MachineLoop* LP = I->second;
-    MachineBasicBlock* HDR = LP->getHeader();
-    SmallVector<MachineBasicBlock*, 4> exitBlocks;
-    CSRegSet loopSpills;
-
-    loopSpills = CSRSave[MBB];
-    if (CSRSave[MBB].empty()) {
-      loopSpills = CSRUsed[HDR];
-      assert(!loopSpills.empty() && "No CSRs used in loop?");
-    } else if (CSRRestore[MBB].contains(CSRSave[MBB]))
-      continue;
-
-    LP->getExitBlocks(exitBlocks);
-    assert(exitBlocks.size() > 0 && "Loop has no top level exit blocks?");
-    for (unsigned i = 0, e = exitBlocks.size(); i != e; ++i) {
-      MachineBasicBlock* EXB = exitBlocks[i];
-      if (! CSRUsed[EXB].contains(loopSpills)) {
-        CSRUsed[EXB] |= loopSpills;
-        addedUses = true;
-        DEBUG(if (ShrinkWrapDebugging >= Iterations)
-                dbgs() << "LOOP " << getBasicBlockName(MBB)
-                     << "(" << stringifyCSRegSet(loopSpills) << ")->"
-                     << getBasicBlockName(EXB) << "\n");
-        if (EXB->succ_size() > 1 || EXB->pred_size() > 1)
-          blks.push_back(EXB);
-      }
-    }
-  }
-  return addedUses;
-}
-
-/// calcSpillPlacements - determine which CSRs should be spilled
-/// in MBB using AnticIn sets of MBB's predecessors, keeping track
-/// of changes to spilled reg sets. Add MBB to the set of blocks
-/// that need to be processed for propagating use info to cover
-/// multi-entry/exit regions.
-///
-bool PEI::calcSpillPlacements(MachineBasicBlock* MBB,
-                              SmallVector<MachineBasicBlock*, 4> &blks,
-                              CSRegBlockMap &prevSpills) {
-  bool placedSpills = false;
-  // Intersect (CSRegs - AnticIn[P]) for P in Predecessors(MBB)
-  CSRegSet anticInPreds;
-  SmallVector<MachineBasicBlock*, 4> predecessors;
-  for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-    MachineBasicBlock* PRED = *PI;
-    if (PRED != MBB)
-      predecessors.push_back(PRED);
-  }
-  unsigned i = 0, e = predecessors.size();
-  if (i != e) {
-    MachineBasicBlock* PRED = predecessors[i];
-    anticInPreds = UsedCSRegs - AnticIn[PRED];
-    for (++i; i != e; ++i) {
-      PRED = predecessors[i];
-      anticInPreds &= (UsedCSRegs - AnticIn[PRED]);
-    }
-  } else {
-    // Handle uses in entry blocks (which have no predecessors).
-    // This is necessary because the DFA formulation assumes the
-    // entry and (multiple) exit nodes cannot have CSR uses, which
-    // is not the case in the real world.
-    anticInPreds = UsedCSRegs;
-  }
-  // Compute spills required at MBB:
-  CSRSave[MBB] |= (AnticIn[MBB] - AvailIn[MBB]) & anticInPreds;
-
-  if (! CSRSave[MBB].empty()) {
-    if (MBB == EntryBlock) {
-      for (unsigned ri = 0, re = ReturnBlocks.size(); ri != re; ++ri)
-        CSRRestore[ReturnBlocks[ri]] |= CSRSave[MBB];
-    } else {
-      // Reset all regs spilled in MBB that are also spilled in EntryBlock.
-      if (CSRSave[EntryBlock].intersects(CSRSave[MBB])) {
-        CSRSave[MBB] = CSRSave[MBB] - CSRSave[EntryBlock];
-      }
-    }
-  }
-  placedSpills = (CSRSave[MBB] != prevSpills[MBB]);
-  prevSpills[MBB] = CSRSave[MBB];
-  // Remember this block for adding restores to successor
-  // blocks for multi-entry region.
-  if (placedSpills)
-    blks.push_back(MBB);
-
-  DEBUG(if (! CSRSave[MBB].empty() && ShrinkWrapDebugging >= Iterations)
-          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-               << stringifyCSRegSet(CSRSave[MBB]) << "\n");
-
-  return placedSpills;
-}
-
-/// calcRestorePlacements - determine which CSRs should be restored
-/// in MBB using AvailOut sets of MBB's succcessors, keeping track
-/// of changes to restored reg sets. Add MBB to the set of blocks
-/// that need to be processed for propagating use info to cover
-/// multi-entry/exit regions.
-///
-bool PEI::calcRestorePlacements(MachineBasicBlock* MBB,
-                                SmallVector<MachineBasicBlock*, 4> &blks,
-                                CSRegBlockMap &prevRestores) {
-  bool placedRestores = false;
-  // Intersect (CSRegs - AvailOut[S]) for S in Successors(MBB)
-  CSRegSet availOutSucc;
-  SmallVector<MachineBasicBlock*, 4> successors;
-  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-    if (SUCC != MBB)
-      successors.push_back(SUCC);
-  }
-  unsigned i = 0, e = successors.size();
-  if (i != e) {
-    MachineBasicBlock* SUCC = successors[i];
-    availOutSucc = UsedCSRegs - AvailOut[SUCC];
-    for (++i; i != e; ++i) {
-      SUCC = successors[i];
-      availOutSucc &= (UsedCSRegs - AvailOut[SUCC]);
-    }
-  } else {
-    if (! CSRUsed[MBB].empty() || ! AvailOut[MBB].empty()) {
-      // Handle uses in return blocks (which have no successors).
-      // This is necessary because the DFA formulation assumes the
-      // entry and (multiple) exit nodes cannot have CSR uses, which
-      // is not the case in the real world.
-      availOutSucc = UsedCSRegs;
-    }
-  }
-  // Compute restores required at MBB:
-  CSRRestore[MBB] |= (AvailOut[MBB] - AnticOut[MBB]) & availOutSucc;
-
-  // Postprocess restore placements at MBB.
-  // Remove the CSRs that are restored in the return blocks.
-  // Lest this be confusing, note that:
-  // CSRSave[EntryBlock] == CSRRestore[B] for all B in ReturnBlocks.
-  if (MBB->succ_size() && ! CSRRestore[MBB].empty()) {
-    if (! CSRSave[EntryBlock].empty())
-      CSRRestore[MBB] = CSRRestore[MBB] - CSRSave[EntryBlock];
-  }
-  placedRestores = (CSRRestore[MBB] != prevRestores[MBB]);
-  prevRestores[MBB] = CSRRestore[MBB];
-  // Remember this block for adding saves to predecessor
-  // blocks for multi-entry region.
-  if (placedRestores)
-    blks.push_back(MBB);
-
-  DEBUG(if (! CSRRestore[MBB].empty() && ShrinkWrapDebugging >= Iterations)
-          dbgs() << "RESTORE[" << getBasicBlockName(MBB) << "] = "
-               << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
-
-  return placedRestores;
-}
-
-/// placeSpillsAndRestores - place spills and restores of CSRs
-/// used in MBBs in minimal regions that contain the uses.
-///
-void PEI::placeSpillsAndRestores(MachineFunction &Fn) {
-  CSRegBlockMap prevCSRSave;
-  CSRegBlockMap prevCSRRestore;
-  SmallVector<MachineBasicBlock*, 4> cvBlocks, ncvBlocks;
-  bool changed = true;
-  unsigned iterations = 0;
-
-  // Iterate computation of spill and restore placements in the MCFG until:
-  //   1. CSR use info has been fully propagated around the MCFG, and
-  //   2. computation of CSRSave[], CSRRestore[] reach fixed points.
-  while (changed) {
-    changed = false;
-    ++iterations;
-
-    DEBUG(if (ShrinkWrapDebugging >= Iterations)
-            dbgs() << "iter " << iterations
-                 << " --------------------------------------------------\n");
-
-    // Calculate CSR{Save,Restore} sets using Antic, Avail on the MCFG,
-    // which determines the placements of spills and restores.
-    // Keep track of changes to spills, restores in each iteration to
-    // minimize the total iterations.
-    bool SRChanged = false;
-    for (MachineFunction::iterator MBBI = Fn.begin(), MBBE = Fn.end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-
-      // Place spills for CSRs in MBB.
-      SRChanged |= calcSpillPlacements(MBB, cvBlocks, prevCSRSave);
-
-      // Place restores for CSRs in MBB.
-      SRChanged |= calcRestorePlacements(MBB, cvBlocks, prevCSRRestore);
-    }
-
-    // Add uses of CSRs used inside loops where needed.
-    changed |= addUsesForTopLevelLoops(cvBlocks);
-
-    // Add uses for CSRs spilled or restored at branch, join points.
-    if (changed || SRChanged) {
-      while (! cvBlocks.empty()) {
-        MachineBasicBlock* MBB = cvBlocks.pop_back_val();
-        changed |= addUsesForMEMERegion(MBB, ncvBlocks);
-      }
-      if (! ncvBlocks.empty()) {
-        cvBlocks = ncvBlocks;
-        ncvBlocks.clear();
-      }
-    }
-
-    if (changed) {
-      calculateAnticAvail(Fn);
-      CSRSave.clear();
-      CSRRestore.clear();
-    }
-  }
-
-  // Check for effectiveness:
-  //  SR0 = {r | r in CSRSave[EntryBlock], CSRRestore[RB], RB in ReturnBlocks}
-  //  numSRReduced = |(UsedCSRegs - SR0)|, approx. SR0 by CSRSave[EntryBlock]
-  // Gives a measure of how many CSR spills have been moved from EntryBlock
-  // to minimal regions enclosing their uses.
-  CSRegSet notSpilledInEntryBlock = (UsedCSRegs - CSRSave[EntryBlock]);
-  unsigned numSRReducedThisFunc = notSpilledInEntryBlock.count();
-  numSRReduced += numSRReducedThisFunc;
-  DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
-      dbgs() << "-----------------------------------------------------------\n";
-      dbgs() << "total iterations = " << iterations << " ( "
-           << Fn.getName()
-           << " " << numSRReducedThisFunc
-           << " " << Fn.size()
-           << " )\n";
-      dbgs() << "-----------------------------------------------------------\n";
-      dumpSRSets();
-      dbgs() << "-----------------------------------------------------------\n";
-      if (numSRReducedThisFunc)
-        verifySpillRestorePlacement();
-    });
-}
-
-// Debugging methods.
-#ifndef NDEBUG
-/// findFastExitPath - debugging method used to detect functions
-/// with at least one path from the entry block to a return block
-/// directly or which has a very small number of edges.
-///
-void PEI::findFastExitPath() {
-  if (! EntryBlock)
-    return;
-  // Fina a path from EntryBlock to any return block that does not branch:
-  //        Entry
-  //          |     ...
-  //          v      |
-  //         B1<-----+
-  //          |
-  //          v
-  //       Return
-  for (MachineBasicBlock::succ_iterator SI = EntryBlock->succ_begin(),
-         SE = EntryBlock->succ_end(); SI != SE; ++SI) {
-    MachineBasicBlock* SUCC = *SI;
-
-    // Assume positive, disprove existence of fast path.
-    HasFastExitPath = true;
-
-    // Check the immediate successors.
-    if (isReturnBlock(SUCC)) {
-      if (ShrinkWrapDebugging >= BasicInfo)
-        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
-             << "->" << getBasicBlockName(SUCC) << "\n";
-      break;
-    }
-    // Traverse df from SUCC, look for a branch block.
-    std::string exitPath = getBasicBlockName(SUCC);
-    for (df_iterator<MachineBasicBlock*> BI = df_begin(SUCC),
-           BE = df_end(SUCC); BI != BE; ++BI) {
-      MachineBasicBlock* SBB = *BI;
-      // Reject paths with branch nodes.
-      if (SBB->succ_size() > 1) {
-        HasFastExitPath = false;
-        break;
-      }
-      exitPath += "->" + getBasicBlockName(SBB);
-    }
-    if (HasFastExitPath) {
-      if (ShrinkWrapDebugging >= BasicInfo)
-        dbgs() << "Fast exit path: " << getBasicBlockName(EntryBlock)
-             << "->" << exitPath << "\n";
-      break;
-    }
-  }
-}
-
-/// verifySpillRestorePlacement - check the current spill/restore
-/// sets for safety. Attempt to find spills without restores or
-/// restores without spills.
-/// Spills: walk df from each MBB in spill set ensuring that
-///         all CSRs spilled at MMBB are restored on all paths
-///         from MBB to all exit blocks.
-/// Restores: walk idf from each MBB in restore set ensuring that
-///           all CSRs restored at MBB are spilled on all paths
-///           reaching MBB.
-///
-void PEI::verifySpillRestorePlacement() {
-  unsigned numReturnBlocks = 0;
-  for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
-       MBBI != MBBE; ++MBBI) {
-    MachineBasicBlock* MBB = MBBI;
-    if (isReturnBlock(MBB) || MBB->succ_size() == 0)
-      ++numReturnBlocks;
-  }
-  for (CSRegBlockMap::iterator BI = CSRSave.begin(),
-         BE = CSRSave.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet spilled = BI->second;
-    CSRegSet restored;
-
-    if (spilled.empty())
-      continue;
-
-    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(spilled)
-                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n");
-
-    if (CSRRestore[MBB].intersects(spilled)) {
-      restored |= (CSRRestore[MBB] & spilled);
-    }
-
-    // Walk depth first from MBB to find restores of all CSRs spilled at MBB:
-    // we must find restores for all spills w/no intervening spills on all
-    // paths from MBB to all return blocks.
-    for (df_iterator<MachineBasicBlock*> BI = df_begin(MBB),
-           BE = df_end(MBB); BI != BE; ++BI) {
-      MachineBasicBlock* SBB = *BI;
-      if (SBB == MBB)
-        continue;
-      // Stop when we encounter spills of any CSRs spilled at MBB that
-      // have not yet been seen to be restored.
-      if (CSRSave[SBB].intersects(spilled) &&
-          !restored.contains(CSRSave[SBB] & spilled))
-        break;
-      // Collect the CSRs spilled at MBB that are restored
-      // at this DF successor of MBB.
-      if (CSRRestore[SBB].intersects(spilled))
-        restored |= (CSRRestore[SBB] & spilled);
-      // If we are at a retun block, check that the restores
-      // we have seen so far exhaust the spills at MBB, then
-      // reset the restores.
-      if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
-        if (restored != spilled) {
-          CSRegSet notRestored = (spilled - restored);
-          DEBUG(dbgs() << MF->getName() << ": "
-                       << stringifyCSRegSet(notRestored)
-                       << " spilled at " << getBasicBlockName(MBB)
-                       << " are never restored on path to return "
-                       << getBasicBlockName(SBB) << "\n");
-        }
-        restored.clear();
-      }
-    }
-  }
-
-  // Check restore placements.
-  for (CSRegBlockMap::iterator BI = CSRRestore.begin(),
-         BE = CSRRestore.end(); BI != BE; ++BI) {
-    MachineBasicBlock* MBB = BI->first;
-    CSRegSet restored = BI->second;
-    CSRegSet spilled;
-
-    if (restored.empty())
-      continue;
-
-    DEBUG(dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRSave[MBB])
-                 << "  RESTORE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(restored) << "\n");
-
-    if (CSRSave[MBB].intersects(restored)) {
-      spilled |= (CSRSave[MBB] & restored);
-    }
-    // Walk inverse depth first from MBB to find spills of all
-    // CSRs restored at MBB:
-    for (idf_iterator<MachineBasicBlock*> BI = idf_begin(MBB),
-           BE = idf_end(MBB); BI != BE; ++BI) {
-      MachineBasicBlock* PBB = *BI;
-      if (PBB == MBB)
-        continue;
-      // Stop when we encounter restores of any CSRs restored at MBB that
-      // have not yet been seen to be spilled.
-      if (CSRRestore[PBB].intersects(restored) &&
-          !spilled.contains(CSRRestore[PBB] & restored))
-        break;
-      // Collect the CSRs restored at MBB that are spilled
-      // at this DF predecessor of MBB.
-      if (CSRSave[PBB].intersects(restored))
-        spilled |= (CSRSave[PBB] & restored);
-    }
-    if (spilled != restored) {
-      CSRegSet notSpilled = (restored - spilled);
-      DEBUG(dbgs() << MF->getName() << ": "
-                   << stringifyCSRegSet(notSpilled)
-                   << " restored at " << getBasicBlockName(MBB)
-                   << " are never spilled\n");
-    }
-  }
-}
-
-// Debugging print methods.
-std::string PEI::getBasicBlockName(const MachineBasicBlock* MBB) {
-  if (!MBB)
-    return "";
-
-  if (MBB->getBasicBlock())
-    return MBB->getBasicBlock()->getName().str();
-
-  std::ostringstream name;
-  name << "_MBB_" << MBB->getNumber();
-  return name.str();
-}
-
-std::string PEI::stringifyCSRegSet(const CSRegSet& s) {
-  const TargetRegisterInfo* TRI = MF->getTarget().getRegisterInfo();
-  const std::vector<CalleeSavedInfo> CSI =
-    MF->getFrameInfo()->getCalleeSavedInfo();
-
-  std::ostringstream srep;
-  if (CSI.size() == 0) {
-    srep << "[]";
-    return srep.str();
-  }
-  srep << "[";
-  CSRegSet::iterator I = s.begin(), E = s.end();
-  if (I != E) {
-    unsigned reg = CSI[*I].getReg();
-    srep << TRI->getName(reg);
-    for (++I; I != E; ++I) {
-      reg = CSI[*I].getReg();
-      srep << ",";
-      srep << TRI->getName(reg);
-    }
-  }
-  srep << "]";
-  return srep.str();
-}
-
-void PEI::dumpSet(const CSRegSet& s) {
-  DEBUG(dbgs() << stringifyCSRegSet(s) << "\n");
-}
-
-void PEI::dumpUsed(MachineBasicBlock* MBB) {
-  DEBUG({
-      if (MBB)
-        dbgs() << "CSRUsed[" << getBasicBlockName(MBB) << "] = "
-               << stringifyCSRegSet(CSRUsed[MBB])  << "\n";
-    });
-}
-
-void PEI::dumpAllUsed() {
-    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-      dumpUsed(MBB);
-    }
-}
-
-void PEI::dumpSets(MachineBasicBlock* MBB) {
-  DEBUG({
-      if (MBB)
-        dbgs() << getBasicBlockName(MBB)           << " | "
-               << stringifyCSRegSet(CSRUsed[MBB])  << " | "
-               << stringifyCSRegSet(AnticIn[MBB])  << " | "
-               << stringifyCSRegSet(AnticOut[MBB]) << " | "
-               << stringifyCSRegSet(AvailIn[MBB])  << " | "
-               << stringifyCSRegSet(AvailOut[MBB]) << "\n";
-    });
-}
-
-void PEI::dumpSets1(MachineBasicBlock* MBB) {
-  DEBUG({
-      if (MBB)
-        dbgs() << getBasicBlockName(MBB)             << " | "
-               << stringifyCSRegSet(CSRUsed[MBB])    << " | "
-               << stringifyCSRegSet(AnticIn[MBB])    << " | "
-               << stringifyCSRegSet(AnticOut[MBB])   << " | "
-               << stringifyCSRegSet(AvailIn[MBB])    << " | "
-               << stringifyCSRegSet(AvailOut[MBB])   << " | "
-               << stringifyCSRegSet(CSRSave[MBB])    << " | "
-               << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
-    });
-}
-
-void PEI::dumpAllSets() {
-    for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
-         MBBI != MBBE; ++MBBI) {
-      MachineBasicBlock* MBB = MBBI;
-      dumpSets1(MBB);
-    }
-}
-
-void PEI::dumpSRSets() {
-  DEBUG({
-      for (MachineFunction::iterator MBB = MF->begin(), E = MF->end();
-           MBB != E; ++MBB) {
-        if (!CSRSave[MBB].empty()) {
-          dbgs() << "SAVE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRSave[MBB]);
-          if (CSRRestore[MBB].empty())
-            dbgs() << '\n';
-        }
-
-        if (!CSRRestore[MBB].empty() && !CSRSave[MBB].empty())
-          dbgs() << "    "
-                 << "RESTORE[" << getBasicBlockName(MBB) << "] = "
-                 << stringifyCSRegSet(CSRRestore[MBB]) << "\n";
-      }
-    });
-}
-#endif
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 3903743..da2e710 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -42,48 +42,47 @@ STATISTIC(NumInvokes, "Number of invokes replaced");
 STATISTIC(NumSpilled, "Number of registers live across unwind edges");
 
 namespace {
-  class SjLjEHPrepare : public FunctionPass {
-    const TargetLoweringBase *TLI;
-    Type *FunctionContextTy;
-    Constant *RegisterFn;
-    Constant *UnregisterFn;
-    Constant *BuiltinSetjmpFn;
-    Constant *FrameAddrFn;
-    Constant *StackAddrFn;
-    Constant *StackRestoreFn;
-    Constant *LSDAAddrFn;
-    Value *PersonalityFn;
-    Constant *CallSiteFn;
-    Constant *FuncCtxFn;
-    AllocaInst *FuncCtx;
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit SjLjEHPrepare(const TargetLoweringBase *tli = NULL)
-      : FunctionPass(ID), TLI(tli) { }
-    bool doInitialization(Module &M);
-    bool runOnFunction(Function &F);
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
-    const char *getPassName() const {
-      return "SJLJ Exception Handling preparation";
-    }
+class SjLjEHPrepare : public FunctionPass {
+  const TargetMachine *TM;
+  Type *FunctionContextTy;
+  Constant *RegisterFn;
+  Constant *UnregisterFn;
+  Constant *BuiltinSetjmpFn;
+  Constant *FrameAddrFn;
+  Constant *StackAddrFn;
+  Constant *StackRestoreFn;
+  Constant *LSDAAddrFn;
+  Value *PersonalityFn;
+  Constant *CallSiteFn;
+  Constant *FuncCtxFn;
+  AllocaInst *FuncCtx;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit SjLjEHPrepare(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {}
+  bool doInitialization(Module &M);
+  bool runOnFunction(Function &F);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
+  const char *getPassName() const {
+    return "SJLJ Exception Handling preparation";
+  }
 
-  private:
-    bool setupEntryBlockAndCallSites(Function &F);
-    void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
-                              Value *SelVal);
-    Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads);
-    void lowerIncomingArguments(Function &F);
-    void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst*> Invokes);
-    void insertCallSiteStore(Instruction *I, int Number);
-  };
+private:
+  bool setupEntryBlockAndCallSites(Function &F);
+  void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
+  Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
+  void lowerIncomingArguments(Function &F);
+  void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst *> Invokes);
+  void insertCallSiteStore(Instruction *I, int Number);
+};
 } // end anonymous namespace
 
 char SjLjEHPrepare::ID = 0;
 
 // Public Interface To the SjLjEHPrepare pass.
-FunctionPass *llvm::createSjLjEHPreparePass(const TargetLoweringBase *TLI) {
-  return new SjLjEHPrepare(TLI);
+FunctionPass *llvm::createSjLjEHPreparePass(const TargetMachine *TM) {
+  return new SjLjEHPrepare(TM);
 }
 // doInitialization - Set up decalarations and types needed to process
 // exceptions.
@@ -92,23 +91,19 @@ bool SjLjEHPrepare::doInitialization(Module &M) {
   // builtin_setjmp uses a five word jbuf
   Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
   Type *Int32Ty = Type::getInt32Ty(M.getContext());
-  FunctionContextTy =
-    StructType::get(VoidPtrTy,                        // __prev
-                    Int32Ty,                          // call_site
-                    ArrayType::get(Int32Ty, 4),       // __data
-                    VoidPtrTy,                        // __personality
-                    VoidPtrTy,                        // __lsda
-                    ArrayType::get(VoidPtrTy, 5),     // __jbuf
-                    NULL);
-  RegisterFn = M.getOrInsertFunction("_Unwind_SjLj_Register",
-                                     Type::getVoidTy(M.getContext()),
-                                     PointerType::getUnqual(FunctionContextTy),
-                                     (Type *)0);
-  UnregisterFn =
-    M.getOrInsertFunction("_Unwind_SjLj_Unregister",
-                          Type::getVoidTy(M.getContext()),
-                          PointerType::getUnqual(FunctionContextTy),
-                          (Type *)0);
+  FunctionContextTy = StructType::get(VoidPtrTy,                  // __prev
+                                      Int32Ty,                    // call_site
+                                      ArrayType::get(Int32Ty, 4), // __data
+                                      VoidPtrTy, // __personality
+                                      VoidPtrTy, // __lsda
+                                      ArrayType::get(VoidPtrTy, 5), // __jbuf
+                                      NULL);
+  RegisterFn = M.getOrInsertFunction(
+      "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
+      PointerType::getUnqual(FunctionContextTy), (Type *)0);
+  UnregisterFn = M.getOrInsertFunction(
+      "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
+      PointerType::getUnqual(FunctionContextTy), (Type *)0);
   FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
   StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
   StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
@@ -134,16 +129,17 @@ void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
   Value *CallSite = Builder.CreateGEP(FuncCtx, Idxs, "call_site");
 
   // Insert a store of the call-site number
-  ConstantInt *CallSiteNoC = ConstantInt::get(Type::getInt32Ty(I->getContext()),
-                                              Number);
-  Builder.CreateStore(CallSiteNoC, CallSite, true/*volatile*/);
+  ConstantInt *CallSiteNoC =
+      ConstantInt::get(Type::getInt32Ty(I->getContext()), Number);
+  Builder.CreateStore(CallSiteNoC, CallSite, true /*volatile*/);
 }
 
 /// MarkBlocksLiveIn - Insert BB and all of its predescessors into LiveBBs until
 /// we reach blocks we've already seen.
 static void MarkBlocksLiveIn(BasicBlock *BB,
-                             SmallPtrSet<BasicBlock*, 64> &LiveBBs) {
-  if (!LiveBBs.insert(BB)) return; // already been here.
+                             SmallPtrSet<BasicBlock *, 64> &LiveBBs) {
+  if (!LiveBBs.insert(BB))
+    return; // already been here.
 
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
     MarkBlocksLiveIn(*PI, LiveBBs);
@@ -153,12 +149,14 @@ static void MarkBlocksLiveIn(BasicBlock *BB,
 /// instruction with those returned by the personality function.
 void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
                                          Value *SelVal) {
-  SmallVector<Value*, 8> UseWorkList(LPI->use_begin(), LPI->use_end());
+  SmallVector<Value *, 8> UseWorkList(LPI->use_begin(), LPI->use_end());
   while (!UseWorkList.empty()) {
     Value *Val = UseWorkList.pop_back_val();
     ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Val);
-    if (!EVI) continue;
-    if (EVI->getNumIndices() != 1) continue;
+    if (!EVI)
+      continue;
+    if (EVI->getNumIndices() != 1)
+      continue;
     if (*EVI->idx_begin() == 0)
       EVI->replaceAllUsesWith(ExnVal);
     else if (*EVI->idx_begin() == 1)
@@ -167,14 +165,15 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
       EVI->eraseFromParent();
   }
 
-  if (LPI->getNumUses() == 0)  return;
+  if (LPI->getNumUses() == 0)
+    return;
 
   // There are still some uses of LPI. Construct an aggregate with the exception
   // values and replace the LPI with that aggregate.
   Type *LPadType = LPI->getType();
   Value *LPadVal = UndefValue::get(LPadType);
-  IRBuilder<>
-    Builder(llvm::next(BasicBlock::iterator(cast<Instruction>(SelVal))));
+  IRBuilder<> Builder(
+      llvm::next(BasicBlock::iterator(cast<Instruction>(SelVal))));
   LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val");
   LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val");
 
@@ -183,17 +182,18 @@ void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
 
 /// setupFunctionContext - Allocate the function context on the stack and fill
 /// it with all of the data that we know at this point.
-Value *SjLjEHPrepare::
-setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
+Value *SjLjEHPrepare::setupFunctionContext(Function &F,
+                                           ArrayRef<LandingPadInst *> LPads) {
   BasicBlock *EntryBB = F.begin();
 
   // Create an alloca for the incoming jump buffer ptr and the new jump buffer
   // that needs to be restored on all exits from the function. This is an alloca
   // because the value needs to be added to the global context list.
+  const TargetLowering *TLI = TM->getTargetLowering();
   unsigned Align =
-    TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
-  FuncCtx =
-    new AllocaInst(FunctionContextTy, 0, Align, "fn_context", EntryBB->begin());
+      TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
+  FuncCtx = new AllocaInst(FunctionContextTy, 0, Align, "fn_context",
+                           EntryBB->begin());
 
   // Fill in the function context structure.
   for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
@@ -204,13 +204,13 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
     Value *FCData = Builder.CreateConstGEP2_32(FuncCtx, 0, 2, "__data");
 
     // The exception values come back in context->__data[0].
-    Value *ExceptionAddr = Builder.CreateConstGEP2_32(FCData, 0, 0,
-                                                      "exception_gep");
+    Value *ExceptionAddr =
+        Builder.CreateConstGEP2_32(FCData, 0, 0, "exception_gep");
     Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
     ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
 
-    Value *SelectorAddr = Builder.CreateConstGEP2_32(FCData, 0, 1,
-                                                     "exn_selector_gep");
+    Value *SelectorAddr =
+        Builder.CreateConstGEP2_32(FCData, 0, 1, "exn_selector_gep");
     Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
 
     substituteLPadValues(LPI, ExnVal, SelVal);
@@ -220,9 +220,11 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
   IRBuilder<> Builder(EntryBB->getTerminator());
   if (!PersonalityFn)
     PersonalityFn = LPads[0]->getPersonalityFn();
-  Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 3,
-                                                          "pers_fn_gep");
-  Builder.CreateStore(PersonalityFn, PersonalityFieldPtr, /*isVolatile=*/true);
+  Value *PersonalityFieldPtr =
+      Builder.CreateConstGEP2_32(FuncCtx, 0, 3, "pers_fn_gep");
+  Builder.CreateStore(
+      Builder.CreateBitCast(PersonalityFn, Builder.getInt8PtrTy()),
+      PersonalityFieldPtr, /*isVolatile=*/true);
 
   // LSDA address
   Value *LSDA = Builder.CreateCall(LSDAAddrFn, "lsda_addr");
@@ -242,8 +244,8 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
          isa<ConstantInt>(cast<AllocaInst>(AfterAllocaInsPt)->getArraySize()))
     ++AfterAllocaInsPt;
 
-  for (Function::arg_iterator
-         AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI) {
+  for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
+       ++AI) {
     Type *Ty = AI->getType();
 
     // Aggregate types can't be cast, but are legal argument types, so we have
@@ -262,9 +264,8 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
       // This is always a no-op cast because we're casting AI to AI->getType()
       // so src and destination types are identical. BitCast is the only
       // possibility.
-      CastInst *NC =
-        new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
-                        AfterAllocaInsPt);
+      CastInst *NC = new BitCastInst(AI, AI->getType(), AI->getName() + ".tmp",
+                                     AfterAllocaInsPt);
       AI->replaceAllUsesWith(NC);
 
       // Set the operand of the cast instruction back to the AllocaInst.
@@ -281,20 +282,21 @@ void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
 /// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind
 /// edge and spill them.
 void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
-                                           ArrayRef<InvokeInst*> Invokes) {
+                                           ArrayRef<InvokeInst *> Invokes) {
   // Finally, scan the code looking for instructions with bad live ranges.
-  for (Function::iterator
-         BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
-    for (BasicBlock::iterator
-           II = BB->begin(), IIE = BB->end(); II != IIE; ++II) {
+  for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) {
+    for (BasicBlock::iterator II = BB->begin(), IIE = BB->end(); II != IIE;
+         ++II) {
       // Ignore obvious cases we don't have to handle. In particular, most
       // instructions either have no uses or only have a single use inside the
       // current block. Ignore them quickly.
       Instruction *Inst = II;
-      if (Inst->use_empty()) continue;
+      if (Inst->use_empty())
+        continue;
       if (Inst->hasOneUse() &&
           cast<Instruction>(Inst->use_back())->getParent() == BB &&
-          !isa<PHINode>(Inst->use_back())) continue;
+          !isa<PHINode>(Inst->use_back()))
+        continue;
 
       // If this is an alloca in the entry block, it's not a real register
       // value.
@@ -303,16 +305,16 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
           continue;
 
       // Avoid iterator invalidation by copying users to a temporary vector.
-      SmallVector<Instruction*, 16> Users;
-      for (Value::use_iterator
-             UI = Inst->use_begin(), E = Inst->use_end(); UI != E; ++UI) {
+      SmallVector<Instruction *, 16> Users;
+      for (Value::use_iterator UI = Inst->use_begin(), E = Inst->use_end();
+           UI != E; ++UI) {
         Instruction *User = cast<Instruction>(*UI);
         if (User->getParent() != BB || isa<PHINode>(User))
           Users.push_back(User);
       }
 
       // Find all of the blocks that this value is live in.
-      SmallPtrSet<BasicBlock*, 64> LiveBBs;
+      SmallPtrSet<BasicBlock *, 64> LiveBBs;
       LiveBBs.insert(Inst->getParent());
       while (!Users.empty()) {
         Instruction *U = Users.back();
@@ -336,7 +338,7 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
         BasicBlock *UnwindBlock = Invokes[i]->getUnwindDest();
         if (UnwindBlock != BB && LiveBBs.count(UnwindBlock)) {
           DEBUG(dbgs() << "SJLJ Spill: " << *Inst << " around "
-                << UnwindBlock->getName() << "\n");
+                       << UnwindBlock->getName() << "\n");
           NeedsSpill = true;
           break;
         }
@@ -359,15 +361,16 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
     LandingPadInst *LPI = UnwindBlock->getLandingPadInst();
 
     // Place PHIs into a set to avoid invalidating the iterator.
-    SmallPtrSet<PHINode*, 8> PHIsToDemote;
-    for (BasicBlock::iterator
-           PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
+    SmallPtrSet<PHINode *, 8> PHIsToDemote;
+    for (BasicBlock::iterator PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
       PHIsToDemote.insert(cast<PHINode>(PN));
-    if (PHIsToDemote.empty()) continue;
+    if (PHIsToDemote.empty())
+      continue;
 
     // Demote the PHIs to the stack.
-    for (SmallPtrSet<PHINode*, 8>::iterator
-           I = PHIsToDemote.begin(), E = PHIsToDemote.end(); I != E; ++I)
+    for (SmallPtrSet<PHINode *, 8>::iterator I = PHIsToDemote.begin(),
+                                             E = PHIsToDemote.end();
+         I != E; ++I)
       DemotePHIToStack(*I);
 
     // Move the landingpad instruction back to the top of the landing pad block.
@@ -379,9 +382,9 @@ void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
 /// the function context and marking the call sites with the appropriate
 /// values. These values are used by the DWARF EH emitter.
 bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
-  SmallVector<ReturnInst*, 16> Returns;
-  SmallVector<InvokeInst*, 16> Invokes;
-  SmallSetVector<LandingPadInst*, 16> LPads;
+  SmallVector<ReturnInst *, 16> Returns;
+  SmallVector<InvokeInst *, 16> Invokes;
+  SmallSetVector<LandingPadInst *, 16> LPads;
 
   // Look through the terminators of the basic blocks to find invokes.
   for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
@@ -401,7 +404,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
       Returns.push_back(RI);
     }
 
-  if (Invokes.empty()) return false;
+  if (Invokes.empty())
+    return false;
 
   NumInvokes += Invokes.size();
 
@@ -409,7 +413,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
   lowerAcrossUnwindEdges(F, Invokes);
 
   Value *FuncCtx =
-    setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
+      setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
   BasicBlock *EntryBB = F.begin();
   IRBuilder<> Builder(EntryBB->getTerminator());
 
@@ -443,7 +447,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
     insertCallSiteStore(Invokes[I], I + 1);
 
     ConstantInt *CallSiteNum =
-      ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
+        ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
 
     // Record the call site value for the back end so it stays associated with
     // the invoke.
@@ -465,8 +469,8 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
       }
 
   // Register the function context and make sure it's known to not throw
-  CallInst *Register = CallInst::Create(RegisterFn, FuncCtx, "",
-                                        EntryBB->getTerminator());
+  CallInst *Register =
+      CallInst::Create(RegisterFn, FuncCtx, "", EntryBB->getTerminator());
   Register->setDoesNotThrow();
 
   // Following any allocas not in the entry block, update the saved SP in the
diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp
index c5bbba3..10a93b7 100644
--- a/lib/CodeGen/SpillPlacement.cpp
+++ b/lib/CodeGen/SpillPlacement.cpp
@@ -31,8 +31,8 @@
 #include "SpillPlacement.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/EdgeBundles.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -53,11 +53,16 @@ char &llvm::SpillPlacementID = SpillPlacement::ID;
 
 void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
+  AU.addRequired<MachineBlockFrequencyInfo>();
   AU.addRequiredTransitive<EdgeBundles>();
   AU.addRequiredTransitive<MachineLoopInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
+/// Decision threshold. A node gets the output value 0 if the weighted sum of
+/// its inputs falls in the open interval (-Threshold;Threshold).
+static const BlockFrequency Threshold = 2;
+
 /// Node - Each edge bundle corresponds to a Hopfield node.
 ///
 /// The node contains precomputed frequency data that only depends on the CFG,
@@ -68,31 +73,25 @@ void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
 /// because all weights are positive.
 ///
 struct SpillPlacement::Node {
-  /// Scale - Inverse block frequency feeding into[0] or out of[1] the bundle.
-  /// Ideally, these two numbers should be identical, but inaccuracies in the
-  /// block frequency estimates means that we need to normalize ingoing and
-  /// outgoing frequencies separately so they are commensurate.
-  float Scale[2];
-
-  /// Bias - Normalized contributions from non-transparent blocks.
-  /// A bundle connected to a MustSpill block has a huge negative bias,
-  /// otherwise it is a number in the range [-2;2].
-  float Bias;
+  /// BiasN - Sum of blocks that prefer a spill.
+  BlockFrequency BiasN;
+  /// BiasP - Sum of blocks that prefer a register.
+  BlockFrequency BiasP;
 
   /// Value - Output value of this node computed from the Bias and links.
-  /// This is always in the range [-1;1]. A positive number means the variable
-  /// should go in a register through this bundle.
-  float Value;
+  /// This is always on of the values {-1, 0, 1}. A positive number means the
+  /// variable should go in a register through this bundle.
+  int Value;
 
-  typedef SmallVector<std::pair<float, unsigned>, 4> LinkVector;
+  typedef SmallVector<std::pair<BlockFrequency, unsigned>, 4> LinkVector;
 
   /// Links - (Weight, BundleNo) for all transparent blocks connecting to other
-  /// bundles. The weights are all positive and add up to at most 2, weights
-  /// from ingoing and outgoing nodes separately add up to a most 1. The weight
-  /// sum can be less than 2 when the variable is not live into / out of some
-  /// connected basic blocks.
+  /// bundles. The weights are all positive block frequencies.
   LinkVector Links;
 
+  /// SumLinkWeights - Cached sum of the weights of all links + ThresHold.
+  BlockFrequency SumLinkWeights;
+
   /// preferReg - Return true when this node prefers to be in a register.
   bool preferReg() const {
     // Undecided nodes (Value==0) go on the stack.
@@ -101,28 +100,24 @@ struct SpillPlacement::Node {
 
   /// mustSpill - Return True if this node is so biased that it must spill.
   bool mustSpill() const {
-    // Actually, we must spill if Bias < sum(weights).
-    // It may be worth it to compute the weight sum here?
-    return Bias < -2.0f;
-  }
-
-  /// Node - Create a blank Node.
-  Node() {
-    Scale[0] = Scale[1] = 0;
+    // We must spill if Bias < -sum(weights) or the MustSpill flag was set.
+    // BiasN is saturated when MustSpill is set, make sure this still returns
+    // true when the RHS saturates. Note that SumLinkWeights includes Threshold.
+    return BiasN >= BiasP + SumLinkWeights;
   }
 
   /// clear - Reset per-query data, but preserve frequencies that only depend on
   // the CFG.
   void clear() {
-    Bias = Value = 0;
+    BiasN = BiasP = Value = 0;
+    SumLinkWeights = Threshold;
     Links.clear();
   }
 
   /// addLink - Add a link to bundle b with weight w.
-  /// out=0 for an ingoing link, and 1 for an outgoing link.
-  void addLink(unsigned b, float w, bool out) {
-    // Normalize w relative to all connected blocks from that direction.
-    w *= Scale[out];
+  void addLink(unsigned b, BlockFrequency w) {
+    // Update cached sum.
+    SumLinkWeights += w;
 
     // There can be multiple links to the same bundle, add them up.
     for (LinkVector::iterator I = Links.begin(), E = Links.end(); I != E; ++I)
@@ -134,33 +129,48 @@ struct SpillPlacement::Node {
     Links.push_back(std::make_pair(w, b));
   }
 
-  /// addBias - Bias this node from an ingoing[0] or outgoing[1] link.
-  /// Return the change to the total number of positive biases.
-  void addBias(float w, bool out) {
-    // Normalize w relative to all connected blocks from that direction.
-    w *= Scale[out];
-    Bias += w;
+  /// addBias - Bias this node.
+  void addBias(BlockFrequency freq, BorderConstraint direction) {
+    switch (direction) {
+    default:
+      break;
+    case PrefReg:
+      BiasP += freq;
+      break;
+    case PrefSpill:
+      BiasN += freq;
+      break;
+    case MustSpill:
+      BiasN = BlockFrequency::getMaxFrequency();
+      break;
+    }
   }
 
   /// update - Recompute Value from Bias and Links. Return true when node
   /// preference changes.
   bool update(const Node nodes[]) {
     // Compute the weighted sum of inputs.
-    float Sum = Bias;
-    for (LinkVector::iterator I = Links.begin(), E = Links.end(); I != E; ++I)
-      Sum += I->first * nodes[I->second].Value;
+    BlockFrequency SumN = BiasN;
+    BlockFrequency SumP = BiasP;
+    for (LinkVector::iterator I = Links.begin(), E = Links.end(); I != E; ++I) {
+      if (nodes[I->second].Value == -1)
+        SumN += I->first;
+      else if (nodes[I->second].Value == 1)
+        SumP += I->first;
+    }
 
-    // The weighted sum is going to be in the range [-2;2]. Ideally, we should
-    // simply set Value = sign(Sum), but we will add a dead zone around 0 for
-    // two reasons:
+    // Each weighted sum is going to be less than the total frequency of the
+    // bundle. Ideally, we should simply set Value = sign(SumP - SumN), but we
+    // will add a dead zone around 0 for two reasons:
+    //
     //  1. It avoids arbitrary bias when all links are 0 as is possible during
     //     initial iterations.
     //  2. It helps tame rounding errors when the links nominally sum to 0.
-    const float Thres = 1e-4f;
+    //
     bool Before = preferReg();
-    if (Sum < -Thres)
+    if (SumN >= SumP + Threshold)
       Value = -1;
-    else if (Sum > Thres)
+    else if (SumP >= SumN + Threshold)
       Value = 1;
     else
       Value = 0;
@@ -177,22 +187,13 @@ bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
   nodes = new Node[bundles->getNumBundles()];
 
   // Compute total ingoing and outgoing block frequencies for all bundles.
-  BlockFrequency.resize(mf.getNumBlockIDs());
+  BlockFrequencies.resize(mf.getNumBlockIDs());
+  MachineBlockFrequencyInfo &MBFI = getAnalysis<MachineBlockFrequencyInfo>();
   for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I) {
-    float Freq = LiveIntervals::getSpillWeight(true, false,
-                                               loops->getLoopDepth(I));
     unsigned Num = I->getNumber();
-    BlockFrequency[Num] = Freq;
-    nodes[bundles->getBundle(Num, 1)].Scale[0] += Freq;
-    nodes[bundles->getBundle(Num, 0)].Scale[1] += Freq;
+    BlockFrequencies[Num] = MBFI.getBlockFreq(I);
   }
 
-  // Scales are reciprocal frequencies.
-  for (unsigned i = 0, e = bundles->getNumBundles(); i != e; ++i)
-    for (unsigned d = 0; d != 2; ++d)
-      if (nodes[i].Scale[d] > 0)
-        nodes[i].Scale[d] = 1 / nodes[i].Scale[d];
-
   // We never change the function.
   return false;
 }
@@ -213,12 +214,15 @@ void SpillPlacement::activate(unsigned n) {
   // landing pads, or loops with many 'continue' statements. It is difficult to
   // allocate registers when so many different blocks are involved.
   //
-  // Give a small negative bias to large bundles such that 1/32 of the
-  // connected blocks need to be interested before we consider expanding the
-  // region through the bundle. This helps compile time by limiting the number
-  // of blocks visited and the number of links in the Hopfield network.
-  if (bundles->getBlocks(n).size() > 100)
-    nodes[n].Bias = -0.0625f;
+  // Give a small negative bias to large bundles such that a substantial
+  // fraction of the connected blocks need to be interested before we consider
+  // expanding the region through the bundle. This helps compile time by
+  // limiting the number of blocks visited and the number of links in the
+  // Hopfield network.
+  if (bundles->getBlocks(n).size() > 100) {
+    nodes[n].BiasP = 0;
+    nodes[n].BiasN = (BlockFrequency::getEntryFrequency() / 16);
+  }
 }
 
 
@@ -227,27 +231,20 @@ void SpillPlacement::activate(unsigned n) {
 void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
   for (ArrayRef<BlockConstraint>::iterator I = LiveBlocks.begin(),
        E = LiveBlocks.end(); I != E; ++I) {
-    float Freq = getBlockFrequency(I->Number);
-    const float Bias[] = {
-      0,           // DontCare,
-      1,           // PrefReg,
-      -1,          // PrefSpill
-      0,           // PrefBoth
-      -HUGE_VALF   // MustSpill
-    };
+    BlockFrequency Freq = BlockFrequencies[I->Number];
 
     // Live-in to block?
     if (I->Entry != DontCare) {
       unsigned ib = bundles->getBundle(I->Number, 0);
       activate(ib);
-      nodes[ib].addBias(Freq * Bias[I->Entry], 1);
+      nodes[ib].addBias(Freq, I->Entry);
     }
 
     // Live-out from block?
     if (I->Exit != DontCare) {
       unsigned ob = bundles->getBundle(I->Number, 1);
       activate(ob);
-      nodes[ob].addBias(Freq * Bias[I->Exit], 0);
+      nodes[ob].addBias(Freq, I->Exit);
     }
   }
 }
@@ -256,15 +253,15 @@ void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
 void SpillPlacement::addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong) {
   for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end();
        I != E; ++I) {
-    float Freq = getBlockFrequency(*I);
+    BlockFrequency Freq = BlockFrequencies[*I];
     if (Strong)
       Freq += Freq;
     unsigned ib = bundles->getBundle(*I, 0);
     unsigned ob = bundles->getBundle(*I, 1);
     activate(ib);
     activate(ob);
-    nodes[ib].addBias(-Freq, 1);
-    nodes[ob].addBias(-Freq, 0);
+    nodes[ib].addBias(Freq, PrefSpill);
+    nodes[ob].addBias(Freq, PrefSpill);
   }
 }
 
@@ -284,9 +281,9 @@ void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
       Linked.push_back(ib);
     if (nodes[ob].Links.empty() && !nodes[ob].mustSpill())
       Linked.push_back(ob);
-    float Freq = getBlockFrequency(Number);
-    nodes[ib].addLink(ob, Freq, 1);
-    nodes[ob].addLink(ib, Freq, 0);
+    BlockFrequency Freq = BlockFrequencies[Number];
+    nodes[ib].addLink(ob, Freq);
+    nodes[ob].addLink(ib, Freq);
   }
 }
 
diff --git a/lib/CodeGen/SpillPlacement.h b/lib/CodeGen/SpillPlacement.h
index fc412f8..105516b 100644
--- a/lib/CodeGen/SpillPlacement.h
+++ b/lib/CodeGen/SpillPlacement.h
@@ -30,6 +30,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/BlockFrequency.h"
 
 namespace llvm {
 
@@ -57,7 +58,7 @@ class SpillPlacement  : public MachineFunctionPass {
   SmallVector<unsigned, 8> RecentPositive;
 
   // Block frequencies are computed once. Indexed by block number.
-  SmallVector<float, 4> BlockFrequency;
+  SmallVector<BlockFrequency, 4> BlockFrequencies;
 
 public:
   static char ID; // Pass identification, replacement for typeid.
@@ -139,8 +140,8 @@ public:
 
   /// getBlockFrequency - Return the estimated block execution frequency per
   /// function invocation.
-  float getBlockFrequency(unsigned Number) const {
-    return BlockFrequency[Number];
+  BlockFrequency getBlockFrequency(unsigned Number) const {
+    return BlockFrequencies[Number];
   }
 
 private:
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index 209792f..d5b3a4a 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -77,7 +77,7 @@ protected:
 
     DEBUG(dbgs() << "Spilling everywhere " << *li << "\n");
 
-    assert(li->weight != HUGE_VALF &&
+    assert(li->weight != llvm::huge_valf &&
            "Attempting to spill already spilled value.");
 
     assert(!TargetRegisterInfo::isStackSlot(li->reg) &&
@@ -115,15 +115,14 @@ protected:
         indices.push_back(i);
       }
 
-      // Create a new vreg & interval for this instr.
-      LiveInterval *newLI = &LRE.create();
-      newLI->weight = HUGE_VALF;
+      // Create a new virtual register for the load and/or store.
+      unsigned NewVReg = LRE.create();
 
       // Update the reg operands & kill flags.
       for (unsigned i = 0; i < indices.size(); ++i) {
         unsigned mopIdx = indices[i];
         MachineOperand &mop = mi->getOperand(mopIdx);
-        mop.setReg(newLI->reg);
+        mop.setReg(NewVReg);
         if (mop.isUse() && !mi->isRegTiedToDefOperand(mopIdx)) {
           mop.setIsKill(true);
         }
@@ -133,28 +132,20 @@ protected:
       // Insert reload if necessary.
       MachineBasicBlock::iterator miItr(mi);
       if (hasUse) {
-        tii->loadRegFromStackSlot(*mi->getParent(), miItr, newLI->reg, ss, trc,
+        MachineInstrSpan MIS(miItr);
+
+        tii->loadRegFromStackSlot(*mi->getParent(), miItr, NewVReg, ss, trc,
                                   tri);
-        MachineInstr *loadInstr(prior(miItr));
-        SlotIndex loadIndex =
-          lis->InsertMachineInstrInMaps(loadInstr).getRegSlot();
-        SlotIndex endIndex = loadIndex.getNextIndex();
-        VNInfo *loadVNI =
-          newLI->getNextValue(loadIndex, lis->getVNInfoAllocator());
-        newLI->addRange(LiveRange(loadIndex, endIndex, loadVNI));
+        lis->InsertMachineInstrRangeInMaps(MIS.begin(), miItr);
       }
 
       // Insert store if necessary.
       if (hasDef) {
-        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr),newLI->reg,
+        MachineInstrSpan MIS(miItr);
+
+        tii->storeRegToStackSlot(*mi->getParent(), llvm::next(miItr), NewVReg,
                                  true, ss, trc, tri);
-        MachineInstr *storeInstr(llvm::next(miItr));
-        SlotIndex storeIndex =
-          lis->InsertMachineInstrInMaps(storeInstr).getRegSlot();
-        SlotIndex beginIndex = storeIndex.getPrevIndex();
-        VNInfo *storeVNI =
-          newLI->getNextValue(beginIndex, lis->getVNInfoAllocator());
-        newLI->addRange(LiveRange(beginIndex, storeIndex, storeVNI));
+        lis->InsertMachineInstrRangeInMaps(llvm::next(miItr), MIS.end());
       }
     }
   }
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 0a3818e..68a15f7 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -214,7 +214,7 @@ bool SplitAnalysis::calcLiveBlockInfo() {
 
       // When not live in, the first use should be a def.
       if (!BI.LiveIn) {
-        assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+        assert(LVI->start == LVI->valno->def && "Dangling Segment start");
         assert(LVI->start == BI.FirstInstr && "First instr should be a def");
         BI.FirstDef = BI.FirstInstr;
       }
@@ -245,8 +245,8 @@ bool SplitAnalysis::calcLiveBlockInfo() {
           BI.FirstInstr = BI.FirstDef = LVI->start;
         }
 
-        // A LiveRange that starts in the middle of the block must be a def.
-        assert(LVI->start == LVI->valno->def && "Dangling LiveRange start");
+        // A Segment that starts in the middle of the block must be a def.
+        assert(LVI->start == LVI->valno->def && "Dangling Segment start");
         if (!BI.FirstDef)
           BI.FirstDef = LVI->start;
       }
@@ -325,12 +325,14 @@ void SplitAnalysis::analyze(const LiveInterval *li) {
 SplitEditor::SplitEditor(SplitAnalysis &sa,
                          LiveIntervals &lis,
                          VirtRegMap &vrm,
-                         MachineDominatorTree &mdt)
+                         MachineDominatorTree &mdt,
+                         MachineBlockFrequencyInfo &mbfi)
   : SA(sa), LIS(lis), VRM(vrm),
     MRI(vrm.getMachineFunction().getRegInfo()),
     MDT(mdt),
     TII(*vrm.getMachineFunction().getTarget().getInstrInfo()),
     TRI(*vrm.getMachineFunction().getTarget().getRegisterInfo()),
+    MBFI(mbfi),
     Edit(0),
     OpenIdx(0),
     SpillMode(SM_Partition),
@@ -375,7 +377,7 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
   assert(ParentVNI && "Mapping  NULL value");
   assert(Idx.isValid() && "Invalid SlotIndex");
   assert(Edit->getParent().getVNInfoAt(Idx) == ParentVNI && "Bad Parent VNI");
-  LiveInterval *LI = Edit->get(RegIdx);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
 
   // Create a new value.
   VNInfo *VNI = LI->getNextValue(Idx, LIS.getVNInfoAllocator());
@@ -393,14 +395,14 @@ VNInfo *SplitEditor::defValue(unsigned RegIdx,
   // If the previous value was a simple mapping, add liveness for it now.
   if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
     SlotIndex Def = OldVNI->def;
-    LI->addRange(LiveRange(Def, Def.getDeadSlot(), OldVNI));
+    LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), OldVNI));
     // No longer a simple mapping.  Switch to a complex, non-forced mapping.
     InsP.first->second = ValueForcePair();
   }
 
   // This is a complex mapping, add liveness for VNI
   SlotIndex Def = VNI->def;
-  LI->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+  LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
 
   return VNI;
 }
@@ -420,7 +422,8 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
   // This was previously a single mapping. Make sure the old def is represented
   // by a trivial live range.
   SlotIndex Def = VNI->def;
-  Edit->get(RegIdx)->addRange(LiveRange(Def, Def.getDeadSlot(), VNI));
+  LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+  LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
   // Mark as complex mapped, forced.
   VFP = ValueForcePair(0, true);
 }
@@ -432,7 +435,7 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
                                    MachineBasicBlock::iterator I) {
   MachineInstr *CopyMI = 0;
   SlotIndex Def;
-  LiveInterval *LI = Edit->get(RegIdx);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
 
   // We may be trying to avoid interference that ends at a deleted instruction,
   // so always begin RegIdx 0 early and all others late.
@@ -460,11 +463,11 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
 unsigned SplitEditor::openIntv() {
   // Create the complement as index 0.
   if (Edit->empty())
-    Edit->create();
+    Edit->createEmptyInterval();
 
   // Create the open interval.
   OpenIdx = Edit->size();
-  Edit->create();
+  Edit->createEmptyInterval();
   return OpenIdx;
 }
 
@@ -629,7 +632,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
 //===----------------------------------------------------------------------===//
 
 void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
-  LiveInterval *LI = Edit->get(0);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
   DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
   RegAssignMap::iterator AssignI;
   AssignI.setMap(RegAssign);
@@ -728,7 +731,7 @@ SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
 
 void SplitEditor::hoistCopiesForSize() {
   // Get the complement interval, always RegIdx 0.
-  LiveInterval *LI = Edit->get(0);
+  LiveInterval *LI = &LIS.getInterval(Edit->get(0));
   LiveInterval *Parent = &Edit->getParent();
 
   // Track the nearest common dominator for all back-copies for each ParentVNI,
@@ -859,13 +862,13 @@ bool SplitEditor::transferValues() {
 
       // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
       DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx);
-      LiveInterval *LI = Edit->get(RegIdx);
+      LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
 
       // Check for a simply defined value that can be blitted directly.
       ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
       if (VNInfo *VNI = VFP.getPointer()) {
         DEBUG(dbgs() << ':' << VNI->id);
-        LI->addRange(LiveRange(Start, End, VNI));
+        LR.addSegment(LiveInterval::Segment(Start, End, VNI));
         Start = End;
         continue;
       }
@@ -889,7 +892,7 @@ bool SplitEditor::transferValues() {
 
       // The first block may be live-in, or it may have its own def.
       if (Start != BlockStart) {
-        VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+        VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
         assert(VNI && "Missing def for complex mapped value");
         DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber());
         // MBB has its own def. Is it also live-out?
@@ -909,7 +912,7 @@ bool SplitEditor::transferValues() {
         if (BlockStart == ParentVNI->def) {
           // This block has the def of a parent PHI, so it isn't live-in.
           assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
-          VNInfo *VNI = LI->extendInBlock(BlockStart, std::min(BlockEnd, End));
+          VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
           assert(VNI && "Missing def for complex mapped parent PHI");
           if (End >= BlockEnd)
             LRC.setLiveOutValue(MBB, VNI); // Live-out as well.
@@ -917,10 +920,10 @@ bool SplitEditor::transferValues() {
           // This block needs a live-in value.  The last block covered may not
           // be live-out.
           if (End < BlockEnd)
-            LRC.addLiveInBlock(LI, MDT[MBB], End);
+            LRC.addLiveInBlock(LR, MDT[MBB], End);
           else {
             // Live-through, and we don't know the value.
-            LRC.addLiveInBlock(LI, MDT[MBB]);
+            LRC.addLiveInBlock(LR, MDT[MBB]);
             LRC.setLiveOutValue(MBB, 0);
           }
         }
@@ -947,7 +950,7 @@ void SplitEditor::extendPHIKillRanges() {
     if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
       continue;
     unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
-    LiveInterval *LI = Edit->get(RegIdx);
+    LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
     LiveRangeCalc &LRC = getLRCalc(RegIdx);
     MachineBasicBlock *MBB = LIS.getMBBFromIndex(PHIVNI->def);
     for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
@@ -959,7 +962,7 @@ void SplitEditor::extendPHIKillRanges() {
       if (Edit->getParent().liveAt(LastUse)) {
         assert(RegAssign.lookup(LastUse) == RegIdx &&
                "Different register assignment in phi predecessor");
-        LRC.extend(LI, End);
+        LRC.extend(LR, End);
       }
     }
   }
@@ -988,7 +991,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
 
     // Rewrite to the mapped register at Idx.
     unsigned RegIdx = RegAssign.lookup(Idx);
-    LiveInterval *LI = Edit->get(RegIdx);
+    LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
     MO.setReg(LI->reg);
     DEBUG(dbgs() << "  rewr BB#" << MI->getParent()->getNumber() << '\t'
                  << Idx << ':' << RegIdx << '\t' << *MI);
@@ -1009,14 +1012,14 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
     } else
       Idx = Idx.getRegSlot(true);
 
-    getLRCalc(RegIdx).extend(LI, Idx.getNextSlot());
+    getLRCalc(RegIdx).extend(*LI, Idx.getNextSlot());
   }
 }
 
 void SplitEditor::deleteRematVictims() {
   SmallVector<MachineInstr*, 8> Dead;
   for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
-    LiveInterval *LI = *I;
+    LiveInterval *LI = &LIS.getInterval(*I);
     for (LiveInterval::const_iterator LII = LI->begin(), LIE = LI->end();
            LII != LIE; ++LII) {
       // Dead defs end at the dead slot.
@@ -1089,8 +1092,10 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
     deleteRematVictims();
 
   // Get rid of unused values and set phi-kill flags.
-  for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I)
-    (*I)->RenumberValues(LIS);
+  for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I) {
+    LiveInterval &LI = LIS.getInterval(*I);
+    LI.RenumberValues();
+  }
 
   // Provide a reverse mapping from original indices to Edit ranges.
   if (LRMap) {
@@ -1103,7 +1108,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
   ConnectedVNInfoEqClasses ConEQ(LIS);
   for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
     // Don't use iterators, they are invalidated by create() below.
-    LiveInterval *li = Edit->get(i);
+    LiveInterval *li = &LIS.getInterval(Edit->get(i));
     unsigned NumComp = ConEQ.Classify(li);
     if (NumComp <= 1)
       continue;
@@ -1111,7 +1116,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
     SmallVector<LiveInterval*, 8> dups;
     dups.push_back(li);
     for (unsigned j = 1; j != NumComp; ++j)
-      dups.push_back(&Edit->create());
+      dups.push_back(&Edit->createEmptyInterval());
     ConEQ.Distribute(&dups[0], MRI);
     // The new intervals all map back to i.
     if (LRMap)
@@ -1119,7 +1124,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
   }
 
   // Calculate spill weight and allocation hints for new intervals.
-  Edit->calculateRegClassAndHint(VRM.getMachineFunction(), SA.Loops);
+  Edit->calculateRegClassAndHint(VRM.getMachineFunction(), SA.Loops, MBFI);
 
   assert(!LRMap || LRMap->size() == Edit->size());
 }
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index 4005a3d..f029c73 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -27,6 +27,7 @@ class ConnectedVNInfoEqClasses;
 class LiveInterval;
 class LiveIntervals;
 class LiveRangeEdit;
+class MachineBlockFrequencyInfo;
 class MachineInstr;
 class MachineLoopInfo;
 class MachineRegisterInfo;
@@ -215,6 +216,7 @@ class SplitEditor {
   MachineDominatorTree &MDT;
   const TargetInstrInfo &TII;
   const TargetRegisterInfo &TRI;
+  const MachineBlockFrequencyInfo &MBFI;
 
 public:
 
@@ -349,7 +351,7 @@ public:
   /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
   /// Newly created intervals will be appended to newIntervals.
   SplitEditor(SplitAnalysis &SA, LiveIntervals&, VirtRegMap&,
-              MachineDominatorTree&);
+              MachineDominatorTree&, MachineBlockFrequencyInfo &);
 
   /// reset - Prepare for a new split.
   void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index a789a25..3dbc050 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -42,6 +42,7 @@
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/IR/Function.h"
@@ -169,7 +170,7 @@ private:
   /// slots to use the joint slots.
   void remapInstructions(DenseMap<int, int> &SlotRemap);
 
-  /// The input program may contain intructions which are not inside lifetime
+  /// The input program may contain instructions which are not inside lifetime
   /// markers. This can happen due to a bug in the compiler or due to a bug in
   /// user code (for example, returning a reference to a local variable).
   /// This procedure checks all of the instructions in the function and
@@ -309,9 +310,9 @@ void StackColoring::calculateLocalLiveness() {
 
     SmallPtrSet<const MachineBasicBlock*, 8> NextBBSet;
 
-    for (SmallVector<const MachineBasicBlock*, 8>::iterator
-         PI = BasicBlockNumbering.begin(), PE = BasicBlockNumbering.end();
-         PI != PE; ++PI) {
+    for (SmallVectorImpl<const MachineBasicBlock *>::iterator
+           PI = BasicBlockNumbering.begin(), PE = BasicBlockNumbering.end();
+           PI != PE; ++PI) {
 
       const MachineBasicBlock *BB = *PI;
       if (!BBSet.count(BB)) continue;
@@ -428,17 +429,14 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
     }
 
     // Create the interval of the blocks that we previously found to be 'alive'.
-    BitVector Alive = BlockLiveness[MBB].LiveIn;
-    Alive |= BlockLiveness[MBB].LiveOut;
-
-    if (Alive.any()) {
-      for (int pos = Alive.find_first(); pos != -1;
-           pos = Alive.find_next(pos)) {
-        if (!Starts[pos].isValid())
-          Starts[pos] = Indexes->getMBBStartIdx(MBB);
-        if (!Finishes[pos].isValid())
-          Finishes[pos] = Indexes->getMBBEndIdx(MBB);
-      }
+    BlockLifetimeInfo &MBBLiveness = BlockLiveness[MBB];
+    for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
+         pos = MBBLiveness.LiveIn.find_next(pos)) {
+      Starts[pos] = Indexes->getMBBStartIdx(MBB);
+    }
+    for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1;
+         pos = MBBLiveness.LiveOut.find_next(pos)) {
+      Finishes[pos] = Indexes->getMBBEndIdx(MBB);
     }
 
     for (unsigned i = 0; i < NumSlots; ++i) {
@@ -452,14 +450,14 @@ void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
       SlotIndex F = Finishes[i];
       if (S < F) {
         // We have a single consecutive region.
-        Intervals[i]->addRange(LiveRange(S, F, ValNum));
+        Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum));
       } else {
         // We have two non consecutive regions. This happens when
         // LIFETIME_START appears after the LIFETIME_END marker.
         SlotIndex NewStart = Indexes->getMBBStartIdx(MBB);
         SlotIndex NewFin = Indexes->getMBBEndIdx(MBB);
-        Intervals[i]->addRange(LiveRange(NewStart, F, ValNum));
-        Intervals[i]->addRange(LiveRange(S, NewFin, ValNum));
+        Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum));
+        Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum));
       }
     }
   }
@@ -528,6 +526,10 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
         if (!V)
           continue;
 
+        const PseudoSourceValue *PSV = dyn_cast<const PseudoSourceValue>(V);
+        if (PSV && PSV->isConstant(MFI))
+          continue;
+
         // Climb up and find the original alloca.
         V = GetUnderlyingObject(V);
         // If we did not find one, or if the one that we found is not in our
@@ -761,7 +763,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
         // Merge disjoint slots.
         if (!First->overlaps(*Second)) {
           Changed = true;
-          First->MergeRangesInAsValue(*Second, First->getValNumInfo(0));
+          First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
           SlotRemap[SecondSlot] = FirstSlot;
           SortedSlots[J] = -1;
           DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
new file mode 100644
index 0000000..40893ea
--- /dev/null
+++ b/lib/CodeGen/StackMaps.cpp
@@ -0,0 +1,314 @@
+//===---------------------------- StackMaps.cpp ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackmaps"
+
+#include "llvm/CodeGen/StackMaps.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#include <iterator>
+
+using namespace llvm;
+
+PatchPointOpers::PatchPointOpers(const MachineInstr *MI):
+  MI(MI),
+  HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+         !MI->getOperand(0).isImplicit()),
+  IsAnyReg(MI->getOperand(getMetaIdx(CCPos)).getImm() == CallingConv::AnyReg) {
+
+#ifndef NDEBUG
+  {
+  unsigned CheckStartIdx = 0, e = MI->getNumOperands();
+  while (CheckStartIdx < e && MI->getOperand(CheckStartIdx).isReg() &&
+         MI->getOperand(CheckStartIdx).isDef() &&
+         !MI->getOperand(CheckStartIdx).isImplicit())
+    ++CheckStartIdx;
+
+  assert(getMetaIdx() == CheckStartIdx &&
+         "Unexpected additonal definition in Patchpoint intrinsic.");
+  }
+#endif
+}
+
+unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const {
+  if (!StartIdx)
+    StartIdx = getVarIdx();
+
+  // Find the next scratch register (implicit def and early clobber)
+  unsigned ScratchIdx = StartIdx, e = MI->getNumOperands();
+  while (ScratchIdx < e &&
+         !(MI->getOperand(ScratchIdx).isReg() &&
+           MI->getOperand(ScratchIdx).isDef() &&
+           MI->getOperand(ScratchIdx).isImplicit() &&
+           MI->getOperand(ScratchIdx).isEarlyClobber()))
+    ++ScratchIdx;
+
+  assert(ScratchIdx != e && "No scratch register available");
+  return ScratchIdx;
+}
+
+void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint32_t ID,
+                                    MachineInstr::const_mop_iterator MOI,
+                                    MachineInstr::const_mop_iterator MOE,
+                                    bool recordResult) {
+
+  MCContext &OutContext = AP.OutStreamer.getContext();
+  MCSymbol *MILabel = OutContext.CreateTempSymbol();
+  AP.OutStreamer.EmitLabel(MILabel);
+
+  LocationVec CallsiteLocs;
+
+  if (recordResult) {
+    std::pair<Location, MachineInstr::const_mop_iterator> ParseResult =
+      OpParser(MI.operands_begin(), llvm::next(MI.operands_begin()), AP.TM);
+
+    Location &Loc = ParseResult.first;
+    assert(Loc.LocType == Location::Register &&
+           "Stackmap return location must be a register.");
+    CallsiteLocs.push_back(Loc);
+  }
+
+  while (MOI != MOE) {
+    std::pair<Location, MachineInstr::const_mop_iterator> ParseResult =
+      OpParser(MOI, MOE, AP.TM);
+
+    Location &Loc = ParseResult.first;
+
+    // Move large constants into the constant pool.
+    if (Loc.LocType == Location::Constant && (Loc.Offset & ~0xFFFFFFFFULL)) {
+      Loc.LocType = Location::ConstantIndex;
+      Loc.Offset = ConstPool.getConstantIndex(Loc.Offset);
+    }
+
+    CallsiteLocs.push_back(Loc);
+    MOI = ParseResult.second;
+  }
+
+  const MCExpr *CSOffsetExpr = MCBinaryExpr::CreateSub(
+    MCSymbolRefExpr::Create(MILabel, OutContext),
+    MCSymbolRefExpr::Create(AP.CurrentFnSym, OutContext),
+    OutContext);
+
+  CSInfos.push_back(CallsiteInfo(CSOffsetExpr, ID, CallsiteLocs));
+}
+
+static MachineInstr::const_mop_iterator
+getStackMapEndMOP(MachineInstr::const_mop_iterator MOI,
+                  MachineInstr::const_mop_iterator MOE) {
+  for (; MOI != MOE; ++MOI)
+    if (MOI->isRegMask() || (MOI->isReg() && MOI->isImplicit()))
+      break;
+
+  return MOI;
+}
+
+void StackMaps::recordStackMap(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::STACKMAP && "exected stackmap");
+
+  int64_t ID = MI.getOperand(0).getImm();
+  assert((int32_t)ID == ID && "Stack maps hold 32-bit IDs");
+  recordStackMapOpers(MI, ID, llvm::next(MI.operands_begin(), 2),
+                      getStackMapEndMOP(MI.operands_begin(),
+                                        MI.operands_end()));
+}
+
+void StackMaps::recordPatchPoint(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::PATCHPOINT && "exected stackmap");
+
+  PatchPointOpers opers(&MI);
+  int64_t ID = opers.getMetaOper(PatchPointOpers::IDPos).getImm();
+  assert((int32_t)ID == ID && "Stack maps hold 32-bit IDs");
+  MachineInstr::const_mop_iterator MOI =
+    llvm::next(MI.operands_begin(), opers.getStackMapStartIdx());
+  recordStackMapOpers(MI, ID, MOI, getStackMapEndMOP(MOI, MI.operands_end()),
+                      opers.isAnyReg() && opers.hasDef());
+
+#ifndef NDEBUG
+  // verify anyregcc
+  LocationVec &Locations = CSInfos.back().Locations;
+  if (opers.isAnyReg()) {
+    unsigned NArgs = opers.getMetaOper(PatchPointOpers::NArgPos).getImm();
+    for (unsigned i = 0, e = (opers.hasDef() ? NArgs+1 : NArgs); i != e; ++i)
+      assert(Locations[i].LocType == Location::Register &&
+             "anyreg arg must be in reg.");
+  }
+#endif
+}
+
+/// serializeToStackMapSection conceptually populates the following fields:
+///
+/// uint32 : Reserved (header)
+/// uint32 : NumConstants
+/// int64  : Constants[NumConstants]
+/// uint32 : NumRecords
+/// StkMapRecord[NumRecords] {
+///   uint32 : PatchPoint ID
+///   uint32 : Instruction Offset
+///   uint16 : Reserved (record flags)
+///   uint16 : NumLocations
+///   Location[NumLocations] {
+///     uint8  : Register | Direct | Indirect | Constant | ConstantIndex
+///     uint8  : Size in Bytes
+///     uint16 : Dwarf RegNum
+///     int32  : Offset
+///   }
+/// }
+///
+/// Location Encoding, Type, Value:
+///   0x1, Register, Reg                 (value in register)
+///   0x2, Direct, Reg + Offset          (frame index)
+///   0x3, Indirect, [Reg + Offset]      (spilled value)
+///   0x4, Constant, Offset              (small constant)
+///   0x5, ConstIndex, Constants[Offset] (large constant)
+///
+void StackMaps::serializeToStackMapSection() {
+  // Bail out if there's no stack map data.
+  if (CSInfos.empty())
+    return;
+
+  MCContext &OutContext = AP.OutStreamer.getContext();
+  const TargetRegisterInfo *TRI = AP.TM.getRegisterInfo();
+
+  // Create the section.
+  const MCSection *StackMapSection =
+    OutContext.getObjectFileInfo()->getStackMapSection();
+  AP.OutStreamer.SwitchSection(StackMapSection);
+
+  // Emit a dummy symbol to force section inclusion.
+  AP.OutStreamer.EmitLabel(
+    OutContext.GetOrCreateSymbol(Twine("__LLVM_StackMaps")));
+
+  // Serialize data.
+  const char *WSMP = "Stack Maps: ";
+  (void)WSMP;
+  const MCRegisterInfo &MCRI = *OutContext.getRegisterInfo();
+
+  DEBUG(dbgs() << "********** Stack Map Output **********\n");
+
+  // Header.
+  AP.OutStreamer.EmitIntValue(0, 4);
+
+  // Num constants.
+  AP.OutStreamer.EmitIntValue(ConstPool.getNumConstants(), 4);
+
+  // Constant pool entries.
+  for (unsigned i = 0; i < ConstPool.getNumConstants(); ++i)
+    AP.OutStreamer.EmitIntValue(ConstPool.getConstant(i), 8);
+
+  DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << "\n");
+  AP.OutStreamer.EmitIntValue(CSInfos.size(), 4);
+
+  for (CallsiteInfoList::const_iterator CSII = CSInfos.begin(),
+                                        CSIE = CSInfos.end();
+       CSII != CSIE; ++CSII) {
+
+    unsigned CallsiteID = CSII->ID;
+    const LocationVec &CSLocs = CSII->Locations;
+
+    DEBUG(dbgs() << WSMP << "callsite " << CallsiteID << "\n");
+
+    // Verify stack map entry. It's better to communicate a problem to the
+    // runtime than crash in case of in-process compilation. Currently, we do
+    // simple overflow checks, but we may eventually communicate other
+    // compilation errors this way.
+    if (CSLocs.size() > UINT16_MAX) {
+      AP.OutStreamer.EmitIntValue(UINT32_MAX, 4); // Invalid ID.
+      AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
+      AP.OutStreamer.EmitIntValue(0, 2); // Reserved.
+      AP.OutStreamer.EmitIntValue(0, 2); // 0 locations.
+      continue;
+    }
+
+    AP.OutStreamer.EmitIntValue(CallsiteID, 4);
+    AP.OutStreamer.EmitValue(CSII->CSOffsetExpr, 4);
+
+    // Reserved for flags.
+    AP.OutStreamer.EmitIntValue(0, 2);
+
+    DEBUG(dbgs() << WSMP << "  has " << CSLocs.size() << " locations\n");
+
+    AP.OutStreamer.EmitIntValue(CSLocs.size(), 2);
+
+    unsigned operIdx = 0;
+    for (LocationVec::const_iterator LocI = CSLocs.begin(), LocE = CSLocs.end();
+         LocI != LocE; ++LocI, ++operIdx) {
+      const Location &Loc = *LocI;
+      DEBUG(
+        dbgs() << WSMP << "  Loc " << operIdx << ": ";
+        switch (Loc.LocType) {
+        case Location::Unprocessed:
+          dbgs() << "<Unprocessed operand>";
+          break;
+        case Location::Register:
+          dbgs() << "Register " << MCRI.getName(Loc.Reg);
+          break;
+        case Location::Direct:
+          dbgs() << "Direct " << MCRI.getName(Loc.Reg);
+          if (Loc.Offset)
+            dbgs() << " + " << Loc.Offset;
+          break;
+        case Location::Indirect:
+          dbgs() << "Indirect " << MCRI.getName(Loc.Reg)
+                 << " + " << Loc.Offset;
+          break;
+        case Location::Constant:
+          dbgs() << "Constant " << Loc.Offset;
+          break;
+        case Location::ConstantIndex:
+          dbgs() << "Constant Index " << Loc.Offset;
+          break;
+        }
+        dbgs() << "\n";
+      );
+
+      unsigned RegNo = 0;
+      int Offset = Loc.Offset;
+      if(Loc.Reg) {
+        RegNo = MCRI.getDwarfRegNum(Loc.Reg, false);
+        for (MCSuperRegIterator SR(Loc.Reg, TRI);
+             SR.isValid() && (int)RegNo < 0; ++SR) {
+          RegNo = TRI->getDwarfRegNum(*SR, false);
+        }
+        // If this is a register location, put the subregister byte offset in
+        // the location offset.
+        if (Loc.LocType == Location::Register) {
+          assert(!Loc.Offset && "Register location should have zero offset");
+          unsigned LLVMRegNo = MCRI.getLLVMRegNum(RegNo, false);
+          unsigned SubRegIdx = MCRI.getSubRegIndex(LLVMRegNo, Loc.Reg);
+          if (SubRegIdx)
+            Offset = MCRI.getSubRegIdxOffset(SubRegIdx);
+        }
+      }
+      else {
+        assert(Loc.LocType != Location::Register &&
+               "Missing location register");
+      }
+      AP.OutStreamer.EmitIntValue(Loc.LocType, 1);
+      AP.OutStreamer.EmitIntValue(Loc.Size, 1);
+      AP.OutStreamer.EmitIntValue(RegNo, 2);
+      AP.OutStreamer.EmitIntValue(Offset, 4);
+    }
+  }
+
+  AP.OutStreamer.AddBlankLine();
+
+  CSInfos.clear();
+}
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index fbef347..9020449 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -15,147 +15,120 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "stack-protector"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLowering.h"
+#include <cstdlib>
 using namespace llvm;
 
 STATISTIC(NumFunProtected, "Number of functions protected");
 STATISTIC(NumAddrTaken, "Number of local variables that have their address"
                         " taken.");
 
-namespace {
-  class StackProtector : public FunctionPass {
-    /// TLI - Keep a pointer of a TargetLowering to consult for determining
-    /// target type sizes.
-    const TargetLoweringBase *TLI;
-
-    Function *F;
-    Module *M;
-
-    DominatorTree *DT;
-
-    /// VisitedPHIs - The set of PHI nodes visited when determining
-    /// if a variable's reference has been taken.  This set 
-    /// is maintained to ensure we don't visit the same PHI node multiple
-    /// times.
-    SmallPtrSet<const PHINode*, 16> VisitedPHIs;
-
-    /// InsertStackProtectors - Insert code into the prologue and epilogue of
-    /// the function.
-    ///
-    ///  - The prologue code loads and stores the stack guard onto the stack.
-    ///  - The epilogue checks the value stored in the prologue against the
-    ///    original value. It calls __stack_chk_fail if they differ.
-    bool InsertStackProtectors();
-
-    /// CreateFailBB - Create a basic block to jump to when the stack protector
-    /// check fails.
-    BasicBlock *CreateFailBB();
-
-    /// ContainsProtectableArray - Check whether the type either is an array or
-    /// contains an array of sufficient size so that we need stack protectors
-    /// for it.
-    bool ContainsProtectableArray(Type *Ty, bool Strong = false,
-                                  bool InStruct = false) const;
-
-    /// \brief Check whether a stack allocation has its address taken.
-    bool HasAddressTaken(const Instruction *AI);
-
-    /// RequiresStackProtector - Check whether or not this function needs a
-    /// stack protector based upon the stack protector level.
-    bool RequiresStackProtector();
-  public:
-    static char ID;             // Pass identification, replacement for typeid.
-    StackProtector() : FunctionPass(ID), TLI(0) {
-      initializeStackProtectorPass(*PassRegistry::getPassRegistry());
-    }
-    StackProtector(const TargetLoweringBase *tli)
-      : FunctionPass(ID), TLI(tli) {
-      initializeStackProtectorPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addPreserved<DominatorTree>();
-    }
-
-    virtual bool runOnFunction(Function &Fn);
-  };
-} // end anonymous namespace
+static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
+                                          cl::init(true), cl::Hidden);
 
 char StackProtector::ID = 0;
-INITIALIZE_PASS(StackProtector, "stack-protector",
-                "Insert stack protectors", false, false)
+INITIALIZE_PASS(StackProtector, "stack-protector", "Insert stack protectors",
+                false, true)
 
-FunctionPass *llvm::createStackProtectorPass(const TargetLoweringBase *tli) {
-  return new StackProtector(tli);
+FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
+  return new StackProtector(TM);
+}
+
+StackProtector::SSPLayoutKind
+StackProtector::getSSPLayout(const AllocaInst *AI) const {
+  return AI ? Layout.lookup(AI) : SSPLK_None;
 }
 
 bool StackProtector::runOnFunction(Function &Fn) {
   F = &Fn;
   M = F->getParent();
   DT = getAnalysisIfAvailable<DominatorTree>();
+  TLI = TM->getTargetLowering();
 
-  if (!RequiresStackProtector()) return false;
+  if (!RequiresStackProtector())
+    return false;
+
+  Attribute Attr = Fn.getAttributes().getAttribute(
+      AttributeSet::FunctionIndex, "stack-protector-buffer-size");
+  if (Attr.isStringAttribute())
+    Attr.getValueAsString().getAsInteger(10, SSPBufferSize);
 
   ++NumFunProtected;
   return InsertStackProtectors();
 }
 
-/// ContainsProtectableArray - Check whether the type either is an array or
-/// contains a char array of sufficient size so that we need stack protectors
-/// for it.
-bool StackProtector::ContainsProtectableArray(Type *Ty, bool Strong,
+/// \param [out] IsLarge is set to true if a protectable array is found and
+/// it is "large" ( >= ssp-buffer-size).  In the case of a structure with
+/// multiple arrays, this gets set if any of them is large.
+bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
+                                              bool Strong,
                                               bool InStruct) const {
-  if (!Ty) return false;
+  if (!Ty)
+    return false;
   if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
-    // In strong mode any array, regardless of type and size, triggers a
-    // protector
-    if (Strong)
-      return true;
-    const TargetMachine &TM = TLI->getTargetMachine();
     if (!AT->getElementType()->isIntegerTy(8)) {
-      Triple Trip(TM.getTargetTriple());
-
       // If we're on a non-Darwin platform or we're inside of a structure, don't
       // add stack protectors unless the array is a character array.
-      if (InStruct || !Trip.isOSDarwin())
-          return false;
+      // However, in strong mode any array, regardless of type and size,
+      // triggers a protector.
+      if (!Strong && (InStruct || !Trip.isOSDarwin()))
+        return false;
     }
 
     // If an array has more than SSPBufferSize bytes of allocated space, then we
     // emit stack protectors.
-    if (TM.Options.SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT))
+    if (SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT)) {
+      IsLarge = true;
+      return true;
+    }
+
+    if (Strong)
+      // Require a protector for all arrays in strong mode
       return true;
   }
 
   const StructType *ST = dyn_cast<StructType>(Ty);
-  if (!ST) return false;
+  if (!ST)
+    return false;
 
+  bool NeedsProtector = false;
   for (StructType::element_iterator I = ST->element_begin(),
-         E = ST->element_end(); I != E; ++I)
-    if (ContainsProtectableArray(*I, Strong, true))
-      return true;
+                                    E = ST->element_end();
+       I != E; ++I)
+    if (ContainsProtectableArray(*I, IsLarge, Strong, true)) {
+      // If the element is a protectable array and is large (>= SSPBufferSize)
+      // then we are done.  If the protectable array is not large, then
+      // keep looking in case a subsequent element is a large array.
+      if (IsLarge)
+        return true;
+      NeedsProtector = true;
+    }
 
-  return false;
+  return NeedsProtector;
 }
 
 bool StackProtector::HasAddressTaken(const Instruction *AI) {
   for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
-        UI != UE; ++UI) {
+       UI != UE; ++UI) {
     const User *U = *UI;
     if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
       if (AI == SI->getValueOperand())
@@ -202,11 +175,13 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
 /// address taken.
 bool StackProtector::RequiresStackProtector() {
   bool Strong = false;
+  bool NeedsProtector = false;
   if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::StackProtectReq))
-    return true;
-  else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::StackProtectStrong))
+                                      Attribute::StackProtectReq)) {
+    NeedsProtector = true;
+    Strong = true; // Use the same heuristic as strong to determine SSPLayout
+  } else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                             Attribute::StackProtectStrong))
     Strong = true;
   else if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                             Attribute::StackProtect))
@@ -215,38 +190,156 @@ bool StackProtector::RequiresStackProtector() {
   for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
     BasicBlock *BB = I;
 
-    for (BasicBlock::iterator
-           II = BB->begin(), IE = BB->end(); II != IE; ++II) {
+    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
+         ++II) {
       if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
         if (AI->isArrayAllocation()) {
           // SSP-Strong: Enable protectors for any call to alloca, regardless
           // of size.
           if (Strong)
             return true;
-  
+
           if (const ConstantInt *CI =
-               dyn_cast<ConstantInt>(AI->getArraySize())) {
-            unsigned BufferSize = TLI->getTargetMachine().Options.SSPBufferSize;
-            if (CI->getLimitedValue(BufferSize) >= BufferSize)
+                  dyn_cast<ConstantInt>(AI->getArraySize())) {
+            if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
               // A call to alloca with size >= SSPBufferSize requires
               // stack protectors.
-              return true;
-          } else // A call to alloca with a variable size requires protectors.
-            return true;
+              Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+              NeedsProtector = true;
+            } else if (Strong) {
+              // Require protectors for all alloca calls in strong mode.
+              Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+              NeedsProtector = true;
+            }
+          } else {
+            // A call to alloca with a variable size requires protectors.
+            Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+            NeedsProtector = true;
+          }
+          continue;
         }
 
-        if (ContainsProtectableArray(AI->getAllocatedType(), Strong))
-          return true;
+        bool IsLarge = false;
+        if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
+          Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
+                                                   : SSPLK_SmallArray));
+          NeedsProtector = true;
+          continue;
+        }
 
         if (Strong && HasAddressTaken(AI)) {
-          ++NumAddrTaken; 
-          return true;
+          ++NumAddrTaken;
+          Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+          NeedsProtector = true;
         }
       }
     }
   }
 
-  return false;
+  return NeedsProtector;
+}
+
+static bool InstructionWillNotHaveChain(const Instruction *I) {
+  return !I->mayHaveSideEffects() && !I->mayReadFromMemory() &&
+         isSafeToSpeculativelyExecute(I);
+}
+
+/// Identify if RI has a previous instruction in the "Tail Position" and return
+/// it. Otherwise return 0.
+///
+/// This is based off of the code in llvm::isInTailCallPosition. The difference
+/// is that it inverts the first part of llvm::isInTailCallPosition since
+/// isInTailCallPosition is checking if a call is in a tail call position, and
+/// we are searching for an unknown tail call that might be in the tail call
+/// position. Once we find the call though, the code uses the same refactored
+/// code, returnTypeIsEligibleForTailCall.
+static CallInst *FindPotentialTailCall(BasicBlock *BB, ReturnInst *RI,
+                                       const TargetLoweringBase *TLI) {
+  // Establish a reasonable upper bound on the maximum amount of instructions we
+  // will look through to find a tail call.
+  unsigned SearchCounter = 0;
+  const unsigned MaxSearch = 4;
+  bool NoInterposingChain = true;
+
+  for (BasicBlock::reverse_iterator I = llvm::next(BB->rbegin()),
+                                    E = BB->rend();
+       I != E && SearchCounter < MaxSearch; ++I) {
+    Instruction *Inst = &*I;
+
+    // Skip over debug intrinsics and do not allow them to affect our MaxSearch
+    // counter.
+    if (isa<DbgInfoIntrinsic>(Inst))
+      continue;
+
+    // If we find a call and the following conditions are satisifed, then we
+    // have found a tail call that satisfies at least the target independent
+    // requirements of a tail call:
+    //
+    // 1. The call site has the tail marker.
+    //
+    // 2. The call site either will not cause the creation of a chain or if a
+    // chain is necessary there are no instructions in between the callsite and
+    // the call which would create an interposing chain.
+    //
+    // 3. The return type of the function does not impede tail call
+    // optimization.
+    if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+      if (CI->isTailCall() &&
+          (InstructionWillNotHaveChain(CI) || NoInterposingChain) &&
+          returnTypeIsEligibleForTailCall(BB->getParent(), CI, RI, *TLI))
+        return CI;
+    }
+
+    // If we did not find a call see if we have an instruction that may create
+    // an interposing chain.
+    NoInterposingChain =
+        NoInterposingChain && InstructionWillNotHaveChain(Inst);
+
+    // Increment max search.
+    SearchCounter++;
+  }
+
+  return 0;
+}
+
+/// Insert code into the entry block that stores the __stack_chk_guard
+/// variable onto the stack:
+///
+///   entry:
+///     StackGuardSlot = alloca i8*
+///     StackGuard = load __stack_chk_guard
+///     call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
+///
+/// Returns true if the platform/triple supports the stackprotectorcreate pseudo
+/// node.
+static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
+                           const TargetLoweringBase *TLI, const Triple &Trip,
+                           AllocaInst *&AI, Value *&StackGuardVar) {
+  bool SupportsSelectionDAGSP = false;
+  PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
+  unsigned AddressSpace, Offset;
+  if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
+    Constant *OffsetVal =
+        ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
+
+    StackGuardVar = ConstantExpr::getIntToPtr(
+        OffsetVal, PointerType::get(PtrTy, AddressSpace));
+  } else if (Trip.getOS() == llvm::Triple::OpenBSD) {
+    StackGuardVar = M->getOrInsertGlobal("__guard_local", PtrTy);
+    cast<GlobalValue>(StackGuardVar)
+        ->setVisibility(GlobalValue::HiddenVisibility);
+  } else {
+    SupportsSelectionDAGSP = true;
+    StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
+  }
+
+  IRBuilder<> B(&F->getEntryBlock().front());
+  AI = B.CreateAlloca(PtrTy, 0, "StackGuardSlot");
+  LoadInst *LI = B.CreateLoad(StackGuardVar, "StackGuard");
+  B.CreateCall2(Intrinsic::getDeclaration(M, Intrinsic::stackprotector), LI,
+                AI);
+
+  return SupportsSelectionDAGSP;
 }
 
 /// InsertStackProtectors - Insert code into the prologue and epilogue of the
@@ -256,102 +349,102 @@ bool StackProtector::RequiresStackProtector() {
 ///  - The epilogue checks the value stored in the prologue against the original
 ///    value. It calls __stack_chk_fail if they differ.
 bool StackProtector::InsertStackProtectors() {
-  BasicBlock *FailBB = 0;       // The basic block to jump to if check fails.
-  BasicBlock *FailBBDom = 0;    // FailBB's dominator.
-  AllocaInst *AI = 0;           // Place on stack that stores the stack guard.
-  Value *StackGuardVar = 0;  // The stack guard variable.
+  bool HasPrologue = false;
+  bool SupportsSelectionDAGSP =
+      EnableSelectionDAGSP && !TM->Options.EnableFastISel;
+  AllocaInst *AI = 0;       // Place on stack that stores the stack guard.
+  Value *StackGuardVar = 0; // The stack guard variable.
 
-  for (Function::iterator I = F->begin(), E = F->end(); I != E; ) {
+  for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
     BasicBlock *BB = I++;
     ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
-    if (!RI) continue;
+    if (!RI)
+      continue;
 
-    if (!FailBB) {
-      // Insert code into the entry block that stores the __stack_chk_guard
-      // variable onto the stack:
-      //
-      //   entry:
-      //     StackGuardSlot = alloca i8*
-      //     StackGuard = load __stack_chk_guard
-      //     call void @llvm.stackprotect.create(StackGuard, StackGuardSlot)
-      //
-      PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
-      unsigned AddressSpace, Offset;
-      if (TLI->getStackCookieLocation(AddressSpace, Offset)) {
-        Constant *OffsetVal =
-          ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
-
-        StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal,
-                                      PointerType::get(PtrTy, AddressSpace));
+    if (!HasPrologue) {
+      HasPrologue = true;
+      SupportsSelectionDAGSP &=
+          CreatePrologue(F, M, RI, TLI, Trip, AI, StackGuardVar);
+    }
+
+    if (SupportsSelectionDAGSP) {
+      // Since we have a potential tail call, insert the special stack check
+      // intrinsic.
+      Instruction *InsertionPt = 0;
+      if (CallInst *CI = FindPotentialTailCall(BB, RI, TLI)) {
+        InsertionPt = CI;
       } else {
-        StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy);
+        InsertionPt = RI;
+        // At this point we know that BB has a return statement so it *DOES*
+        // have a terminator.
+        assert(InsertionPt != 0 && "BB must have a terminator instruction at "
+                                   "this point.");
       }
 
-      BasicBlock &Entry = F->getEntryBlock();
-      Instruction *InsPt = &Entry.front();
-
-      AI = new AllocaInst(PtrTy, "StackGuardSlot", InsPt);
-      LoadInst *LI = new LoadInst(StackGuardVar, "StackGuard", false, InsPt);
+      Function *Intrinsic =
+          Intrinsic::getDeclaration(M, Intrinsic::stackprotectorcheck);
+      CallInst::Create(Intrinsic, StackGuardVar, "", InsertionPt);
 
-      Value *Args[] = { LI, AI };
-      CallInst::
-        Create(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
-               Args, "", InsPt);
-
-      // Create the basic block to jump to when the guard check fails.
-      FailBB = CreateFailBB();
-    }
+    } else {
+      // If we do not support SelectionDAG based tail calls, generate IR level
+      // tail calls.
+      //
+      // For each block with a return instruction, convert this:
+      //
+      //   return:
+      //     ...
+      //     ret ...
+      //
+      // into this:
+      //
+      //   return:
+      //     ...
+      //     %1 = load __stack_chk_guard
+      //     %2 = load StackGuardSlot
+      //     %3 = cmp i1 %1, %2
+      //     br i1 %3, label %SP_return, label %CallStackCheckFailBlk
+      //
+      //   SP_return:
+      //     ret ...
+      //
+      //   CallStackCheckFailBlk:
+      //     call void @__stack_chk_fail()
+      //     unreachable
+
+      // Create the FailBB. We duplicate the BB every time since the MI tail
+      // merge pass will merge together all of the various BB into one including
+      // fail BB generated by the stack protector pseudo instruction.
+      BasicBlock *FailBB = CreateFailBB();
+
+      // Split the basic block before the return instruction.
+      BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+
+      // Update the dominator tree if we need to.
+      if (DT && DT->isReachableFromEntry(BB)) {
+        DT->addNewBlock(NewBB, BB);
+        DT->addNewBlock(FailBB, BB);
+      }
 
-    // For each block with a return instruction, convert this:
-    //
-    //   return:
-    //     ...
-    //     ret ...
-    //
-    // into this:
-    //
-    //   return:
-    //     ...
-    //     %1 = load __stack_chk_guard
-    //     %2 = load StackGuardSlot
-    //     %3 = cmp i1 %1, %2
-    //     br i1 %3, label %SP_return, label %CallStackCheckFailBlk
-    //
-    //   SP_return:
-    //     ret ...
-    //
-    //   CallStackCheckFailBlk:
-    //     call void @__stack_chk_fail()
-    //     unreachable
+      // Remove default branch instruction to the new BB.
+      BB->getTerminator()->eraseFromParent();
 
-    // Split the basic block before the return instruction.
-    BasicBlock *NewBB = BB->splitBasicBlock(RI, "SP_return");
+      // Move the newly created basic block to the point right after the old
+      // basic block so that it's in the "fall through" position.
+      NewBB->moveAfter(BB);
 
-    if (DT && DT->isReachableFromEntry(BB)) {
-      DT->addNewBlock(NewBB, BB);
-      FailBBDom = FailBBDom ? DT->findNearestCommonDominator(FailBBDom, BB) :BB;
+      // Generate the stack protector instructions in the old basic block.
+      IRBuilder<> B(BB);
+      LoadInst *LI1 = B.CreateLoad(StackGuardVar);
+      LoadInst *LI2 = B.CreateLoad(AI);
+      Value *Cmp = B.CreateICmpEQ(LI1, LI2);
+      B.CreateCondBr(Cmp, NewBB, FailBB);
     }
-
-    // Remove default branch instruction to the new BB.
-    BB->getTerminator()->eraseFromParent();
-
-    // Move the newly created basic block to the point right after the old basic
-    // block so that it's in the "fall through" position.
-    NewBB->moveAfter(BB);
-
-    // Generate the stack protector instructions in the old basic block.
-    LoadInst *LI1 = new LoadInst(StackGuardVar, "", false, BB);
-    LoadInst *LI2 = new LoadInst(AI, "", true, BB);
-    ICmpInst *Cmp = new ICmpInst(*BB, CmpInst::ICMP_EQ, LI1, LI2, "");
-    BranchInst::Create(NewBB, FailBB, Cmp, BB);
   }
 
   // Return if we didn't modify any basic blocks. I.e., there are no return
   // statements in the function.
-  if (!FailBB) return false;
-
-  if (DT && FailBBDom)
-    DT->addNewBlock(FailBB, FailBBDom);
+  if (!HasPrologue)
+    return false;
 
   return true;
 }
@@ -359,12 +452,20 @@ bool StackProtector::InsertStackProtectors() {
 /// CreateFailBB - Create a basic block to jump to when the stack protector
 /// check fails.
 BasicBlock *StackProtector::CreateFailBB() {
-  BasicBlock *FailBB = BasicBlock::Create(F->getContext(),
-                                          "CallStackCheckFailBlk", F);
-  Constant *StackChkFail =
-    M->getOrInsertFunction("__stack_chk_fail",
-                           Type::getVoidTy(F->getContext()), NULL);
-  CallInst::Create(StackChkFail, "", FailBB);
-  new UnreachableInst(F->getContext(), FailBB);
+  LLVMContext &Context = F->getContext();
+  BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
+  IRBuilder<> B(FailBB);
+  if (Trip.getOS() == llvm::Triple::OpenBSD) {
+    Constant *StackChkFail = M->getOrInsertFunction(
+        "__stack_smash_handler", Type::getVoidTy(Context),
+        Type::getInt8PtrTy(Context), NULL);
+
+    B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
+  } else {
+    Constant *StackChkFail = M->getOrInsertFunction(
+        "__stack_chk_fail", Type::getVoidTy(Context), NULL);
+    B.CreateCall(StackChkFail);
+  }
+  B.CreateUnreachable();
   return FailBB;
 }
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index f951561..9f44df8 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -14,20 +14,20 @@
 #define DEBUG_TYPE "stackslotcoloring"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include <vector>
@@ -48,13 +48,16 @@ namespace {
     LiveStacks* LS;
     MachineFrameInfo *MFI;
     const TargetInstrInfo  *TII;
-    const MachineLoopInfo *loopInfo;
+    const MachineBlockFrequencyInfo *MBFI;
 
     // SSIntervals - Spill slot intervals.
     std::vector<LiveInterval*> SSIntervals;
 
-    // SSRefs - Keep a list of frame index references for each spill slot.
-    SmallVector<SmallVector<MachineInstr*, 8>, 16> SSRefs;
+    // SSRefs - Keep a list of MachineMemOperands for each spill slot.
+    // MachineMemOperands can be shared between instructions, so we need
+    // to be careful that renames like [FI0, FI1] -> [FI1, FI2] do not
+    // become FI0 -> FI1 -> FI2.
+    SmallVector<SmallVector<MachineMemOperand *, 8>, 16> SSRefs;
 
     // OrigAlignments - Alignments of stack objects before coloring.
     SmallVector<unsigned, 16> OrigAlignments;
@@ -89,8 +92,8 @@ namespace {
       AU.addRequired<SlotIndexes>();
       AU.addPreserved<SlotIndexes>();
       AU.addRequired<LiveStacks>();
-      AU.addRequired<MachineLoopInfo>();
-      AU.addPreserved<MachineLoopInfo>();
+      AU.addRequired<MachineBlockFrequencyInfo>();
+      AU.addPreserved<MachineBlockFrequencyInfo>();
       AU.addPreservedID(MachineDominatorsID);
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -103,7 +106,7 @@ namespace {
     bool OverlapWithAssignments(LiveInterval *li, int Color) const;
     int ColorSlot(LiveInterval *li);
     bool ColorSlots(MachineFunction &MF);
-    void RewriteInstruction(MachineInstr *MI, int OldFI, int NewFI,
+    void RewriteInstruction(MachineInstr *MI, SmallVectorImpl<int> &SlotMapping,
                             MachineFunction &MF);
     bool RemoveDeadStores(MachineBasicBlock* MBB);
   };
@@ -139,7 +142,7 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
   for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
        MBBI != E; ++MBBI) {
     MachineBasicBlock *MBB = &*MBBI;
-    unsigned loopDepth = loopInfo->getLoopDepth(MBB);
+    BlockFrequency Freq = MBFI->getBlockFreq(MBB);
     for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
          MII != EE; ++MII) {
       MachineInstr *MI = &*MII;
@@ -154,8 +157,19 @@ void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
           continue;
         LiveInterval &li = LS->getInterval(FI);
         if (!MI->isDebugValue())
-          li.weight += LiveIntervals::getSpillWeight(false, true, loopDepth);
-        SSRefs[FI].push_back(MI);
+          li.weight += LiveIntervals::getSpillWeight(false, true, Freq);
+      }
+      for (MachineInstr::mmo_iterator MMOI = MI->memoperands_begin(),
+           EE = MI->memoperands_end(); MMOI != EE; ++MMOI) {
+        MachineMemOperand *MMO = *MMOI;
+        if (const Value *V = MMO->getValue()) {
+          if (const FixedStackPseudoSourceValue *FSV =
+              dyn_cast<FixedStackPseudoSourceValue>(V)) {
+            int FI = FSV->getFrameIndex();
+            if (FI >= 0)
+              SSRefs[FI].push_back(MMO);
+          }
+        }
       }
     }
   }
@@ -197,7 +211,7 @@ void StackSlotColoring::InitializeSlots() {
 /// LiveIntervals that have already been assigned to the specified color.
 bool
 StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const {
-  const SmallVector<LiveInterval*,4> &OtherLIs = Assignments[Color];
+  const SmallVectorImpl<LiveInterval *> &OtherLIs = Assignments[Color];
   for (unsigned i = 0, e = OtherLIs.size(); i != e; ++i) {
     LiveInterval *OtherLI = OtherLIs[i];
     if (OtherLI->overlaps(*li))
@@ -291,16 +305,26 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
   if (!Changed)
     return false;
 
-  // Rewrite all MO_FrameIndex operands.
-  SmallVector<SmallSet<unsigned, 4>, 4> NewDefs(MF.getNumBlockIDs());
+  // Rewrite all MachineMemOperands.
   for (unsigned SS = 0, SE = SSRefs.size(); SS != SE; ++SS) {
     int NewFI = SlotMapping[SS];
     if (NewFI == -1 || (NewFI == (int)SS))
       continue;
 
-    SmallVector<MachineInstr*, 8> &RefMIs = SSRefs[SS];
-    for (unsigned i = 0, e = RefMIs.size(); i != e; ++i)
-      RewriteInstruction(RefMIs[i], SS, NewFI, MF);
+    const Value *NewSV = PseudoSourceValue::getFixedStack(NewFI);
+    SmallVectorImpl<MachineMemOperand *> &RefMMOs = SSRefs[SS];
+    for (unsigned i = 0, e = RefMMOs.size(); i != e; ++i)
+      RefMMOs[i]->setValue(NewSV);
+  }
+
+  // Rewrite all MO_FrameIndex operands.  Look for dead stores.
+  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = &*MBBI;
+    for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
+         MII != EE; ++MII)
+      RewriteInstruction(MII, SlotMapping, MF);
+    RemoveDeadStores(MBB);
   }
 
   // Delete unused stack slots.
@@ -315,28 +339,24 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
 
 /// RewriteInstruction - Rewrite specified instruction by replacing references
 /// to old frame index with new one.
-void StackSlotColoring::RewriteInstruction(MachineInstr *MI, int OldFI,
-                                           int NewFI, MachineFunction &MF) {
+void StackSlotColoring::RewriteInstruction(MachineInstr *MI,
+                                           SmallVectorImpl<int> &SlotMapping,
+                                           MachineFunction &MF) {
   // Update the operands.
   for (unsigned i = 0, ee = MI->getNumOperands(); i != ee; ++i) {
     MachineOperand &MO = MI->getOperand(i);
     if (!MO.isFI())
       continue;
-    int FI = MO.getIndex();
-    if (FI != OldFI)
+    int OldFI = MO.getIndex();
+    if (OldFI < 0)
+      continue;
+    int NewFI = SlotMapping[OldFI];
+    if (NewFI == -1 || NewFI == OldFI)
       continue;
     MO.setIndex(NewFI);
   }
 
-  // Update the memory references. This changes the MachineMemOperands
-  // directly. They may be in use by multiple instructions, however all
-  // instructions using OldFI are being rewritten to use NewFI.
-  const Value *OldSV = PseudoSourceValue::getFixedStack(OldFI);
-  const Value *NewSV = PseudoSourceValue::getFixedStack(NewFI);
-  for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
-       E = MI->memoperands_end(); I != E; ++I)
-    if ((*I)->getValue() == OldSV)
-      (*I)->setValue(NewSV);
+  // The MachineMemOperands have already been updated.
 }
 
 
@@ -357,10 +377,19 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
     if (DCELimit != -1 && (int)NumDead >= DCELimit)
       break;
 
+    int FirstSS, SecondSS;
+    if (TII->isStackSlotCopy(I, FirstSS, SecondSS) &&
+        FirstSS == SecondSS &&
+        FirstSS != -1) {
+      ++NumDead;
+      changed = true;
+      toErase.push_back(I);
+      continue;
+    }
+        
     MachineBasicBlock::iterator NextMI = llvm::next(I);
     if (NextMI == MBB->end()) continue;
 
-    int FirstSS, SecondSS;
     unsigned LoadReg = 0;
     unsigned StoreReg = 0;
     if (!(LoadReg = TII->isLoadFromStackSlot(I, FirstSS))) continue;
@@ -379,7 +408,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
     ++I;
   }
 
-  for (SmallVector<MachineInstr*, 4>::iterator I = toErase.begin(),
+  for (SmallVectorImpl<MachineInstr *>::iterator I = toErase.begin(),
        E = toErase.end(); I != E; ++I)
     (*I)->eraseFromParent();
 
@@ -396,7 +425,7 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
   MFI = MF.getFrameInfo();
   TII = MF.getTarget().getInstrInfo();
   LS = &getAnalysis<LiveStacks>();
-  loopInfo = &getAnalysis<MachineLoopInfo>();
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
 
   bool Changed = false;
 
@@ -430,10 +459,5 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
     Assignments[i].clear();
   Assignments.clear();
 
-  if (Changed) {
-    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
-      Changed |= RemoveDeadStores(I);
-  }
-
   return Changed;
 }
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
deleted file mode 100644
index b337c53..0000000
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ /dev/null
@@ -1,825 +0,0 @@
-//===- StrongPHIElimination.cpp - Eliminate PHI nodes by inserting copies -===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass eliminates PHI instructions by aggressively coalescing the copies
-// that would be inserted by a naive algorithm and only inserting the copies
-// that are necessary. The coalescing technique initially assumes that all
-// registers appearing in a PHI instruction do not interfere. It then eliminates
-// proven interferences, using dominators to only perform a linear number of
-// interference tests instead of the quadratic number of interference tests
-// that this would naively require. This is a technique derived from:
-// 
-//    Budimlic, et al. Fast copy coalescing and live-range identification.
-//    In Proceedings of the ACM SIGPLAN 2002 Conference on Programming Language
-//    Design and Implementation (Berlin, Germany, June 17 - 19, 2002).
-//    PLDI '02. ACM, New York, NY, 25-32.
-//
-// The original implementation constructs a data structure they call a dominance
-// forest for this purpose. The dominance forest was shown to be unnecessary,
-// as it is possible to emulate the creation and traversal of a dominance forest
-// by directly using the dominator tree, rather than actually constructing the
-// dominance forest.  This technique is explained in:
-//
-//   Boissinot, et al. Revisiting Out-of-SSA Translation for Correctness, Code
-//     Quality and Efficiency,
-//   In Proceedings of the 7th annual IEEE/ACM International Symposium on Code
-//   Generation and Optimization (Seattle, Washington, March 22 - 25, 2009).
-//   CGO '09. IEEE, Washington, DC, 114-125.
-//
-// Careful implementation allows for all of the dominator forest interference
-// checks to be performed at once in a single depth-first traversal of the
-// dominator tree, which is what is implemented here.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "strongphielim"
-#include "llvm/CodeGen/Passes.h"
-#include "PHIEliminationUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveIntervalAnalysis.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetInstrInfo.h"
-using namespace llvm;
-
-namespace {
-  class StrongPHIElimination : public MachineFunctionPass {
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    StrongPHIElimination() : MachineFunctionPass(ID) {
-      initializeStrongPHIEliminationPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual void getAnalysisUsage(AnalysisUsage&) const;
-    bool runOnMachineFunction(MachineFunction&);
-
-  private:
-    /// This struct represents a single node in the union-find data structure
-    /// representing the variable congruence classes. There is one difference
-    /// from a normal union-find data structure. We steal two bits from the parent
-    /// pointer . One of these bits is used to represent whether the register
-    /// itself has been isolated, and the other is used to represent whether the
-    /// PHI with that register as its destination has been isolated.
-    ///
-    /// Note that this leads to the strange situation where the leader of a
-    /// congruence class may no longer logically be a member, due to being
-    /// isolated.
-    struct Node {
-      enum Flags {
-        kRegisterIsolatedFlag = 1,
-        kPHIIsolatedFlag = 2
-      };
-      Node(unsigned v) : value(v), rank(0) { parent.setPointer(this); }
-
-      Node *getLeader();
-
-      PointerIntPair<Node*, 2> parent;
-      unsigned value;
-      unsigned rank;
-    };
-
-    /// Add a register in a new congruence class containing only itself.
-    void addReg(unsigned);
-
-    /// Join the congruence classes of two registers. This function is biased
-    /// towards the left argument, i.e. after
-    ///
-    /// addReg(r2);
-    /// unionRegs(r1, r2);
-    ///
-    /// the leader of the unioned congruence class is the same as the leader of
-    /// r1's congruence class prior to the union. This is actually relied upon
-    /// in the copy insertion code.
-    void unionRegs(unsigned, unsigned);
-
-    /// Get the color of a register. The color is 0 if the register has been
-    /// isolated.
-    unsigned getRegColor(unsigned);
-
-    // Isolate a register.
-    void isolateReg(unsigned);
-
-    /// Get the color of a PHI. The color of a PHI is 0 if the PHI has been
-    /// isolated. Otherwise, it is the original color of its destination and
-    /// all of its operands (before they were isolated, if they were).
-    unsigned getPHIColor(MachineInstr*);
-
-    /// Isolate a PHI.
-    void isolatePHI(MachineInstr*);
-
-    /// Traverses a basic block, splitting any interferences found between
-    /// registers in the same congruence class. It takes two DenseMaps as
-    /// arguments that it also updates: CurrentDominatingParent, which maps
-    /// a color to the register in that congruence class whose definition was
-    /// most recently seen, and ImmediateDominatingParent, which maps a register
-    /// to the register in the same congruence class that most immediately
-    /// dominates it.
-    ///
-    /// This function assumes that it is being called in a depth-first traversal
-    /// of the dominator tree.
-    void SplitInterferencesForBasicBlock(
-      MachineBasicBlock&,
-      DenseMap<unsigned, unsigned> &CurrentDominatingParent,
-      DenseMap<unsigned, unsigned> &ImmediateDominatingParent);
-
-    // Lowers a PHI instruction, inserting copies of the source and destination
-    // registers as necessary.
-    void InsertCopiesForPHI(MachineInstr*, MachineBasicBlock*);
-
-    // Merges the live interval of Reg into NewReg and renames Reg to NewReg
-    // everywhere that Reg appears. Requires Reg and NewReg to have non-
-    // overlapping lifetimes.
-    void MergeLIsAndRename(unsigned Reg, unsigned NewReg);
-
-    MachineRegisterInfo *MRI;
-    const TargetInstrInfo *TII;
-    MachineDominatorTree *DT;
-    LiveIntervals *LI;
-
-    BumpPtrAllocator Allocator;
-
-    DenseMap<unsigned, Node*> RegNodeMap;
-
-    // Maps a basic block to a list of its defs of registers that appear as PHI
-    // sources.
-    DenseMap<MachineBasicBlock*, std::vector<MachineInstr*> > PHISrcDefs;
-
-    // Maps a color to a pair of a MachineInstr* and a virtual register, which
-    // is the operand of that PHI corresponding to the current basic block.
-    DenseMap<unsigned, std::pair<MachineInstr*, unsigned> > CurrentPHIForColor;
-
-    // FIXME: Can these two data structures be combined? Would a std::multimap
-    // be any better?
-
-    // Stores pairs of predecessor basic blocks and the source registers of
-    // inserted copy instructions.
-    typedef DenseSet<std::pair<MachineBasicBlock*, unsigned> > SrcCopySet;
-    SrcCopySet InsertedSrcCopySet;
-
-    // Maps pairs of predecessor basic blocks and colors to their defining copy
-    // instructions.
-    typedef DenseMap<std::pair<MachineBasicBlock*, unsigned>, MachineInstr*>
-      SrcCopyMap;
-    SrcCopyMap InsertedSrcCopyMap;
-
-    // Maps inserted destination copy registers to their defining copy
-    // instructions.
-    typedef DenseMap<unsigned, MachineInstr*> DestCopyMap;
-    DestCopyMap InsertedDestCopies;
-  };
-
-  struct MIIndexCompare {
-    MIIndexCompare(LiveIntervals *LiveIntervals) : LI(LiveIntervals) { }
-
-    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
-      return LI->getInstructionIndex(LHS) < LI->getInstructionIndex(RHS);
-    }
-
-    LiveIntervals *LI;
-  };
-} // namespace
-
-STATISTIC(NumPHIsLowered, "Number of PHIs lowered");
-STATISTIC(NumDestCopiesInserted, "Number of destination copies inserted");
-STATISTIC(NumSrcCopiesInserted, "Number of source copies inserted");
-
-char StrongPHIElimination::ID = 0;
-INITIALIZE_PASS_BEGIN(StrongPHIElimination, "strong-phi-node-elimination",
-  "Eliminate PHI nodes for register allocation, intelligently", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(StrongPHIElimination, "strong-phi-node-elimination",
-  "Eliminate PHI nodes for register allocation, intelligently", false, false)
-
-char &llvm::StrongPHIEliminationID = StrongPHIElimination::ID;
-
-void StrongPHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesCFG();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<SlotIndexes>();
-  AU.addPreserved<SlotIndexes>();
-  AU.addRequired<LiveIntervals>();
-  AU.addPreserved<LiveIntervals>();
-  MachineFunctionPass::getAnalysisUsage(AU);
-}
-
-static MachineOperand *findLastUse(MachineBasicBlock *MBB, unsigned Reg) {
-  // FIXME: This only needs to check from the first terminator, as only the
-  // first terminator can use a virtual register.
-  for (MachineBasicBlock::reverse_iterator RI = MBB->rbegin(); ; ++RI) {
-    assert (RI != MBB->rend());
-    MachineInstr *MI = &*RI;
-
-    for (MachineInstr::mop_iterator OI = MI->operands_begin(),
-         OE = MI->operands_end(); OI != OE; ++OI) {
-      MachineOperand &MO = *OI;
-      if (MO.isReg() && MO.isUse() && MO.getReg() == Reg)
-        return &MO;
-    }
-  }
-}
-
-bool StrongPHIElimination::runOnMachineFunction(MachineFunction &MF) {
-  MRI = &MF.getRegInfo();
-  TII = MF.getTarget().getInstrInfo();
-  DT = &getAnalysis<MachineDominatorTree>();
-  LI = &getAnalysis<LiveIntervals>();
-
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      unsigned DestReg = BBI->getOperand(0).getReg();
-      addReg(DestReg);
-      PHISrcDefs[I].push_back(BBI);
-
-      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
-        MachineOperand &SrcMO = BBI->getOperand(i);
-        unsigned SrcReg = SrcMO.getReg();
-        addReg(SrcReg);
-        unionRegs(DestReg, SrcReg);
-
-        MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
-        if (DefMI)
-          PHISrcDefs[DefMI->getParent()].push_back(DefMI);
-      }
-    }
-  }
-
-  // Perform a depth-first traversal of the dominator tree, splitting
-  // interferences amongst PHI-congruence classes.
-  DenseMap<unsigned, unsigned> CurrentDominatingParent;
-  DenseMap<unsigned, unsigned> ImmediateDominatingParent;
-  for (df_iterator<MachineDomTreeNode*> DI = df_begin(DT->getRootNode()),
-       DE = df_end(DT->getRootNode()); DI != DE; ++DI) {
-    SplitInterferencesForBasicBlock(*DI->getBlock(),
-                                    CurrentDominatingParent,
-                                    ImmediateDominatingParent);
-  }
-
-  // Insert copies for all PHI source and destination registers.
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      InsertCopiesForPHI(BBI, I);
-    }
-  }
-
-  // FIXME: Preserve the equivalence classes during copy insertion and use
-  // the preversed equivalence classes instead of recomputing them.
-  RegNodeMap.clear();
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    for (MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      unsigned DestReg = BBI->getOperand(0).getReg();
-      addReg(DestReg);
-
-      for (unsigned i = 1; i < BBI->getNumOperands(); i += 2) {
-        unsigned SrcReg = BBI->getOperand(i).getReg();
-        addReg(SrcReg);
-        unionRegs(DestReg, SrcReg);
-      }
-    }
-  }
-
-  DenseMap<unsigned, unsigned> RegRenamingMap;
-  bool Changed = false;
-  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
-       I != E; ++I) {
-    MachineBasicBlock::iterator BBI = I->begin(), BBE = I->end();
-    while (BBI != BBE && BBI->isPHI()) {
-      MachineInstr *PHI = BBI;
-
-      assert(PHI->getNumOperands() > 0);
-
-      unsigned SrcReg = PHI->getOperand(1).getReg();
-      unsigned SrcColor = getRegColor(SrcReg);
-      unsigned NewReg = RegRenamingMap[SrcColor];
-      if (!NewReg) {
-        NewReg = SrcReg;
-        RegRenamingMap[SrcColor] = SrcReg;
-      }
-      MergeLIsAndRename(SrcReg, NewReg);
-
-      unsigned DestReg = PHI->getOperand(0).getReg();
-      if (!InsertedDestCopies.count(DestReg))
-        MergeLIsAndRename(DestReg, NewReg);
-
-      for (unsigned i = 3; i < PHI->getNumOperands(); i += 2) {
-        unsigned SrcReg = PHI->getOperand(i).getReg();
-        MergeLIsAndRename(SrcReg, NewReg);
-      }
-
-      ++BBI;
-      LI->RemoveMachineInstrFromMaps(PHI);
-      PHI->eraseFromParent();
-      Changed = true;
-    }
-  }
-
-  // Due to the insertion of copies to split live ranges, the live intervals are
-  // guaranteed to not overlap, except in one case: an original PHI source and a
-  // PHI destination copy. In this case, they have the same value and thus don't
-  // truly intersect, so we merge them into the value live at that point.
-  // FIXME: Is there some better way we can handle this?
-  for (DestCopyMap::iterator I = InsertedDestCopies.begin(),
-       E = InsertedDestCopies.end(); I != E; ++I) {
-    unsigned DestReg = I->first;
-    unsigned DestColor = getRegColor(DestReg);
-    unsigned NewReg = RegRenamingMap[DestColor];
-
-    LiveInterval &DestLI = LI->getInterval(DestReg);
-    LiveInterval &NewLI = LI->getInterval(NewReg);
-
-    assert(DestLI.ranges.size() == 1
-           && "PHI destination copy's live interval should be a single live "
-               "range from the beginning of the BB to the copy instruction.");
-    LiveRange *DestLR = DestLI.begin();
-    VNInfo *NewVNI = NewLI.getVNInfoAt(DestLR->start);
-    if (!NewVNI) {
-      NewVNI = NewLI.createValueCopy(DestLR->valno, LI->getVNInfoAllocator());
-      MachineInstr *CopyInstr = I->second;
-      CopyInstr->getOperand(1).setIsKill(true);
-    }
-
-    LiveRange NewLR(DestLR->start, DestLR->end, NewVNI);
-    NewLI.addRange(NewLR);
-
-    LI->removeInterval(DestReg);
-    MRI->replaceRegWith(DestReg, NewReg);
-  }
-
-  // Adjust the live intervals of all PHI source registers to handle the case
-  // where the PHIs in successor blocks were the only later uses of the source
-  // register.
-  for (SrcCopySet::iterator I = InsertedSrcCopySet.begin(),
-       E = InsertedSrcCopySet.end(); I != E; ++I) {
-    MachineBasicBlock *MBB = I->first;
-    unsigned SrcReg = I->second;
-    if (unsigned RenamedRegister = RegRenamingMap[getRegColor(SrcReg)])
-      SrcReg = RenamedRegister;
-
-    LiveInterval &SrcLI = LI->getInterval(SrcReg);
-
-    bool isLiveOut = false;
-    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-         SE = MBB->succ_end(); SI != SE; ++SI) {
-      if (SrcLI.liveAt(LI->getMBBStartIdx(*SI))) {
-        isLiveOut = true;
-        break;
-      }
-    }
-
-    if (isLiveOut)
-      continue;
-
-    MachineOperand *LastUse = findLastUse(MBB, SrcReg);
-    assert(LastUse);
-    SlotIndex LastUseIndex = LI->getInstructionIndex(LastUse->getParent());
-    SrcLI.removeRange(LastUseIndex.getRegSlot(), LI->getMBBEndIdx(MBB));
-    LastUse->setIsKill(true);
-  }
-
-  Allocator.Reset();
-  RegNodeMap.clear();
-  PHISrcDefs.clear();
-  InsertedSrcCopySet.clear();
-  InsertedSrcCopyMap.clear();
-  InsertedDestCopies.clear();
-
-  return Changed;
-}
-
-void StrongPHIElimination::addReg(unsigned Reg) {
-  Node *&N = RegNodeMap[Reg];
-  if (!N)
-    N = new (Allocator) Node(Reg);
-}
-
-StrongPHIElimination::Node*
-StrongPHIElimination::Node::getLeader() {
-  Node *N = this;
-  Node *Parent = parent.getPointer();
-  Node *Grandparent = Parent->parent.getPointer();
-
-  while (Parent != Grandparent) {
-    N->parent.setPointer(Grandparent);
-    N = Grandparent;
-    Parent = Parent->parent.getPointer();
-    Grandparent = Parent->parent.getPointer();
-  }
-
-  return Parent;
-}
-
-unsigned StrongPHIElimination::getRegColor(unsigned Reg) {
-  DenseMap<unsigned, Node*>::iterator RI = RegNodeMap.find(Reg);
-  if (RI == RegNodeMap.end())
-    return 0;
-  Node *Node = RI->second;
-  if (Node->parent.getInt() & Node::kRegisterIsolatedFlag)
-    return 0;
-  return Node->getLeader()->value;
-}
-
-void StrongPHIElimination::unionRegs(unsigned Reg1, unsigned Reg2) {
-  Node *Node1 = RegNodeMap[Reg1]->getLeader();
-  Node *Node2 = RegNodeMap[Reg2]->getLeader();
-
-  if (Node1->rank > Node2->rank) {
-    Node2->parent.setPointer(Node1->getLeader());
-  } else if (Node1->rank < Node2->rank) {
-    Node1->parent.setPointer(Node2->getLeader());
-  } else if (Node1 != Node2) {
-    Node2->parent.setPointer(Node1->getLeader());
-    Node1->rank++;
-  }
-}
-
-void StrongPHIElimination::isolateReg(unsigned Reg) {
-  Node *Node = RegNodeMap[Reg];
-  Node->parent.setInt(Node->parent.getInt() | Node::kRegisterIsolatedFlag);
-}
-
-unsigned StrongPHIElimination::getPHIColor(MachineInstr *PHI) {
-  assert(PHI->isPHI());
-
-  unsigned DestReg = PHI->getOperand(0).getReg();
-  Node *DestNode = RegNodeMap[DestReg];
-  if (DestNode->parent.getInt() & Node::kPHIIsolatedFlag)
-    return 0;
-
-  for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
-    unsigned SrcColor = getRegColor(PHI->getOperand(i).getReg());
-    if (SrcColor)
-      return SrcColor;
-  }
-  return 0;
-}
-
-void StrongPHIElimination::isolatePHI(MachineInstr *PHI) {
-  assert(PHI->isPHI());
-  Node *Node = RegNodeMap[PHI->getOperand(0).getReg()];
-  Node->parent.setInt(Node->parent.getInt() | Node::kPHIIsolatedFlag);
-}
-
-/// SplitInterferencesForBasicBlock - traverses a basic block, splitting any
-/// interferences found between registers in the same congruence class. It
-/// takes two DenseMaps as arguments that it also updates:
-///
-/// 1) CurrentDominatingParent, which maps a color to the register in that
-///    congruence class whose definition was most recently seen.
-///
-/// 2) ImmediateDominatingParent, which maps a register to the register in the
-///    same congruence class that most immediately dominates it.
-///
-/// This function assumes that it is being called in a depth-first traversal
-/// of the dominator tree.
-///
-/// The algorithm used here is a generalization of the dominance-based SSA test
-/// for two variables. If there are variables a_1, ..., a_n such that
-///
-///   def(a_1) dom ... dom def(a_n),
-///
-/// then we can test for an interference between any two a_i by only using O(n)
-/// interference tests between pairs of variables. If i < j and a_i and a_j
-/// interfere, then a_i is alive at def(a_j), so it is also alive at def(a_i+1).
-/// Thus, in order to test for an interference involving a_i, we need only check
-/// for a potential interference with a_i+1.
-///
-/// This method can be generalized to arbitrary sets of variables by performing
-/// a depth-first traversal of the dominator tree. As we traverse down a branch
-/// of the dominator tree, we keep track of the current dominating variable and
-/// only perform an interference test with that variable. However, when we go to
-/// another branch of the dominator tree, the definition of the current dominating
-/// variable may no longer dominate the current block. In order to correct this,
-/// we need to use a stack of past choices of the current dominating variable
-/// and pop from this stack until we find a variable whose definition actually
-/// dominates the current block.
-/// 
-/// There will be one push on this stack for each variable that has become the
-/// current dominating variable, so instead of using an explicit stack we can
-/// simply associate the previous choice for a current dominating variable with
-/// the new choice. This works better in our implementation, where we test for
-/// interference in multiple distinct sets at once.
-void
-StrongPHIElimination::SplitInterferencesForBasicBlock(
-    MachineBasicBlock &MBB,
-    DenseMap<unsigned, unsigned> &CurrentDominatingParent,
-    DenseMap<unsigned, unsigned> &ImmediateDominatingParent) {
-  // Sort defs by their order in the original basic block, as the code below
-  // assumes that it is processing definitions in dominance order.
-  std::vector<MachineInstr*> &DefInstrs = PHISrcDefs[&MBB];
-  std::sort(DefInstrs.begin(), DefInstrs.end(), MIIndexCompare(LI));
-
-  for (std::vector<MachineInstr*>::const_iterator BBI = DefInstrs.begin(),
-       BBE = DefInstrs.end(); BBI != BBE; ++BBI) {
-    for (MachineInstr::const_mop_iterator I = (*BBI)->operands_begin(),
-         E = (*BBI)->operands_end(); I != E; ++I) {
-      const MachineOperand &MO = *I;
-
-      // FIXME: This would be faster if it were possible to bail out of checking
-      // an instruction's operands after the explicit defs, but this is incorrect
-      // for variadic instructions, which may appear before register allocation
-      // in the future.
-      if (!MO.isReg() || !MO.isDef())
-        continue;
-
-      unsigned DestReg = MO.getReg();
-      if (!DestReg || !TargetRegisterInfo::isVirtualRegister(DestReg))
-        continue;
-
-      // If the virtual register being defined is not used in any PHI or has
-      // already been isolated, then there are no more interferences to check.
-      unsigned DestColor = getRegColor(DestReg);
-      if (!DestColor)
-        continue;
-
-      // The input to this pass sometimes is not in SSA form in every basic
-      // block, as some virtual registers have redefinitions. We could eliminate
-      // this by fixing the passes that generate the non-SSA code, or we could
-      // handle it here by tracking defining machine instructions rather than
-      // virtual registers. For now, we just handle the situation conservatively
-      // in a way that will possibly lead to false interferences.
-      unsigned &CurrentParent = CurrentDominatingParent[DestColor];
-      unsigned NewParent = CurrentParent;
-      if (NewParent == DestReg)
-        continue;
-
-      // Pop registers from the stack represented by ImmediateDominatingParent
-      // until we find a parent that dominates the current instruction.
-      while (NewParent && (!DT->dominates(MRI->getVRegDef(NewParent), *BBI)
-                           || !getRegColor(NewParent)))
-        NewParent = ImmediateDominatingParent[NewParent];
-
-      // If NewParent is nonzero, then its definition dominates the current
-      // instruction, so it is only necessary to check for the liveness of
-      // NewParent in order to check for an interference.
-      if (NewParent
-          && LI->getInterval(NewParent).liveAt(LI->getInstructionIndex(*BBI))) {
-        // If there is an interference, always isolate the new register. This
-        // could be improved by using a heuristic that decides which of the two
-        // registers to isolate.
-        isolateReg(DestReg);
-        CurrentParent = NewParent;
-      } else {
-        // If there is no interference, update ImmediateDominatingParent and set
-        // the CurrentDominatingParent for this color to the current register.
-        ImmediateDominatingParent[DestReg] = NewParent;
-        CurrentParent = DestReg;
-      }
-    }
-  }
-
-  // We now walk the PHIs in successor blocks and check for interferences. This
-  // is necessary because the use of a PHI's operands are logically contained in
-  // the predecessor block. The def of a PHI's destination register is processed
-  // along with the other defs in a basic block.
-
-  CurrentPHIForColor.clear();
-
-  for (MachineBasicBlock::succ_iterator SI = MBB.succ_begin(),
-       SE = MBB.succ_end(); SI != SE; ++SI) {
-    for (MachineBasicBlock::iterator BBI = (*SI)->begin(), BBE = (*SI)->end();
-         BBI != BBE && BBI->isPHI(); ++BBI) {
-      MachineInstr *PHI = BBI;
-
-      // If a PHI is already isolated, either by being isolated directly or
-      // having all of its operands isolated, ignore it.
-      unsigned Color = getPHIColor(PHI);
-      if (!Color)
-        continue;
-
-      // Find the index of the PHI operand that corresponds to this basic block.
-      unsigned PredIndex;
-      for (PredIndex = 1; PredIndex < PHI->getNumOperands(); PredIndex += 2) {
-        if (PHI->getOperand(PredIndex + 1).getMBB() == &MBB)
-          break;
-      }
-      assert(PredIndex < PHI->getNumOperands());
-      unsigned PredOperandReg = PHI->getOperand(PredIndex).getReg();
-
-      // Pop registers from the stack represented by ImmediateDominatingParent
-      // until we find a parent that dominates the current instruction.
-      unsigned &CurrentParent = CurrentDominatingParent[Color];
-      unsigned NewParent = CurrentParent;
-      while (NewParent
-             && (!DT->dominates(MRI->getVRegDef(NewParent)->getParent(), &MBB)
-                 || !getRegColor(NewParent)))
-        NewParent = ImmediateDominatingParent[NewParent];
-      CurrentParent = NewParent;
-
-      // If there is an interference with a register, always isolate the
-      // register rather than the PHI. It is also possible to isolate the
-      // PHI, but that introduces copies for all of the registers involved
-      // in that PHI.
-      if (NewParent && LI->isLiveOutOfMBB(LI->getInterval(NewParent), &MBB)
-                    && NewParent != PredOperandReg)
-        isolateReg(NewParent);
-
-      std::pair<MachineInstr*, unsigned>
-        &CurrentPHI = CurrentPHIForColor[Color];
-
-      // If two PHIs have the same operand from every shared predecessor, then
-      // they don't actually interfere. Otherwise, isolate the current PHI. This
-      // could possibly be improved, e.g. we could isolate the PHI with the
-      // fewest operands.
-      if (CurrentPHI.first && CurrentPHI.second != PredOperandReg)
-        isolatePHI(PHI);
-      else
-        CurrentPHI = std::make_pair(PHI, PredOperandReg);
-    }
-  }
-}
-
-void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
-                                              MachineBasicBlock *MBB) {
-  assert(PHI->isPHI());
-  ++NumPHIsLowered;
-  unsigned PHIColor = getPHIColor(PHI);
-
-  for (unsigned i = 1; i < PHI->getNumOperands(); i += 2) {
-    MachineOperand &SrcMO = PHI->getOperand(i);
-
-    // If a source is defined by an implicit def, there is no need to insert a
-    // copy in the predecessor.
-    if (SrcMO.isUndef())
-      continue;
-
-    unsigned SrcReg = SrcMO.getReg();
-    assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
-           "Machine PHI Operands must all be virtual registers!");
-
-    MachineBasicBlock *PredBB = PHI->getOperand(i + 1).getMBB();
-    unsigned SrcColor = getRegColor(SrcReg);
-
-    // If neither the PHI nor the operand were isolated, then we only need to
-    // set the phi-kill flag on the VNInfo at this PHI.
-    if (PHIColor && SrcColor == PHIColor) {
-      LiveInterval &SrcInterval = LI->getInterval(SrcReg);
-      SlotIndex PredIndex = LI->getMBBEndIdx(PredBB);
-      VNInfo *SrcVNI = SrcInterval.getVNInfoBefore(PredIndex);
-      (void)SrcVNI;
-      assert(SrcVNI);
-      continue;
-    }
-
-    unsigned CopyReg = 0;
-    if (PHIColor) {
-      SrcCopyMap::const_iterator I
-        = InsertedSrcCopyMap.find(std::make_pair(PredBB, PHIColor));
-      CopyReg
-        = I != InsertedSrcCopyMap.end() ? I->second->getOperand(0).getReg() : 0;
-    }
-
-    if (!CopyReg) {
-      const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
-      CopyReg = MRI->createVirtualRegister(RC);
-
-      MachineBasicBlock::iterator
-        CopyInsertPoint = findPHICopyInsertPoint(PredBB, MBB, SrcReg);
-      unsigned SrcSubReg = SrcMO.getSubReg();
-      MachineInstr *CopyInstr = BuildMI(*PredBB,
-                                        CopyInsertPoint,
-                                        PHI->getDebugLoc(),
-                                        TII->get(TargetOpcode::COPY),
-                                        CopyReg).addReg(SrcReg, 0, SrcSubReg);
-      LI->InsertMachineInstrInMaps(CopyInstr);
-      ++NumSrcCopiesInserted;
-
-      // addLiveRangeToEndOfBlock() also adds the phikill flag to the VNInfo for
-      // the newly added range.
-      LI->addLiveRangeToEndOfBlock(CopyReg, CopyInstr);
-      InsertedSrcCopySet.insert(std::make_pair(PredBB, SrcReg));
-
-      addReg(CopyReg);
-      if (PHIColor) {
-        unionRegs(PHIColor, CopyReg);
-        assert(getRegColor(CopyReg) != CopyReg);
-      } else {
-        PHIColor = CopyReg;
-        assert(getRegColor(CopyReg) == CopyReg);
-      }
-
-      // Insert into map if not already there.
-      InsertedSrcCopyMap.insert(std::make_pair(std::make_pair(PredBB, PHIColor),
-                                               CopyInstr));
-    }
-
-    SrcMO.setReg(CopyReg);
-
-    // If SrcReg is not live beyond the PHI, trim its interval so that it is no
-    // longer live-in to MBB. Note that SrcReg may appear in other PHIs that are
-    // processed later, but this is still correct to do at this point because we
-    // never rely on LiveIntervals being correct while inserting copies.
-    // FIXME: Should this just count uses at PHIs like the normal PHIElimination
-    // pass does?
-    LiveInterval &SrcLI = LI->getInterval(SrcReg);
-    SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
-    SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
-    SlotIndex NextInstrIndex = PHIIndex.getNextIndex();
-    if (SrcLI.liveAt(MBBStartIndex) && SrcLI.expiredAt(NextInstrIndex))
-      SrcLI.removeRange(MBBStartIndex, PHIIndex, true);
-  }
-
-  unsigned DestReg = PHI->getOperand(0).getReg();
-  unsigned DestColor = getRegColor(DestReg);
-
-  if (PHIColor && DestColor == PHIColor) {
-    LiveInterval &DestLI = LI->getInterval(DestReg);
-
-    // Set the phi-def flag for the VN at this PHI.
-    SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
-    VNInfo *DestVNI = DestLI.getVNInfoAt(PHIIndex.getRegSlot());
-    assert(DestVNI);
-  
-    // Prior to PHI elimination, the live ranges of PHIs begin at their defining
-    // instruction. After PHI elimination, PHI instructions are replaced by VNs
-    // with the phi-def flag set, and the live ranges of these VNs start at the
-    // beginning of the basic block.
-    SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
-    DestVNI->def = MBBStartIndex;
-    DestLI.addRange(LiveRange(MBBStartIndex,
-                              PHIIndex.getRegSlot(),
-                              DestVNI));
-    return;
-  }
-
-  const TargetRegisterClass *RC = MRI->getRegClass(DestReg);
-  unsigned CopyReg = MRI->createVirtualRegister(RC);
-
-  MachineInstr *CopyInstr = BuildMI(*MBB,
-                                    MBB->SkipPHIsAndLabels(MBB->begin()),
-                                    PHI->getDebugLoc(),
-                                    TII->get(TargetOpcode::COPY),
-                                    DestReg).addReg(CopyReg);
-  LI->InsertMachineInstrInMaps(CopyInstr);
-  PHI->getOperand(0).setReg(CopyReg);
-  ++NumDestCopiesInserted;
-
-  // Add the region from the beginning of MBB to the copy instruction to
-  // CopyReg's live interval, and give the VNInfo the phidef flag.
-  LiveInterval &CopyLI = LI->getOrCreateInterval(CopyReg);
-  SlotIndex MBBStartIndex = LI->getMBBStartIdx(MBB);
-  SlotIndex DestCopyIndex = LI->getInstructionIndex(CopyInstr);
-  VNInfo *CopyVNI = CopyLI.getNextValue(MBBStartIndex,
-                                        LI->getVNInfoAllocator());
-  CopyLI.addRange(LiveRange(MBBStartIndex,
-                            DestCopyIndex.getRegSlot(),
-                            CopyVNI));
-
-  // Adjust DestReg's live interval to adjust for its new definition at
-  // CopyInstr.
-  LiveInterval &DestLI = LI->getOrCreateInterval(DestReg);
-  SlotIndex PHIIndex = LI->getInstructionIndex(PHI);
-  DestLI.removeRange(PHIIndex.getRegSlot(), DestCopyIndex.getRegSlot());
-
-  VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
-  assert(DestVNI);
-  DestVNI->def = DestCopyIndex.getRegSlot();
-
-  InsertedDestCopies[CopyReg] = CopyInstr;
-}
-
-void StrongPHIElimination::MergeLIsAndRename(unsigned Reg, unsigned NewReg) {
-  if (Reg == NewReg)
-    return;
-
-  LiveInterval &OldLI = LI->getInterval(Reg);
-  LiveInterval &NewLI = LI->getInterval(NewReg);
-
-  // Merge the live ranges of the two registers.
-  DenseMap<VNInfo*, VNInfo*> VNMap;
-  for (LiveInterval::iterator LRI = OldLI.begin(), LRE = OldLI.end();
-       LRI != LRE; ++LRI) {
-    LiveRange OldLR = *LRI;
-    VNInfo *OldVN = OldLR.valno;
-
-    VNInfo *&NewVN = VNMap[OldVN];
-    if (!NewVN) {
-      NewVN = NewLI.createValueCopy(OldVN, LI->getVNInfoAllocator());
-      VNMap[OldVN] = NewVN;
-    }
-
-    LiveRange LR(OldLR.start, OldLR.end, NewVN);
-    NewLI.addRange(LR);
-  }
-
-  // Remove the LiveInterval for the register being renamed and replace all
-  // of its defs and uses with the new register.
-  LI->removeInterval(Reg);
-  MRI->replaceRegWith(Reg, NewReg);
-}
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 1ec8817..ff0181e 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -86,7 +86,7 @@ namespace {
     void ProcessPHI(MachineInstr *MI, MachineBasicBlock *TailBB,
                     MachineBasicBlock *PredBB,
                     DenseMap<unsigned, unsigned> &LocalVRMap,
-                    SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
+                    SmallVectorImpl<std::pair<unsigned,unsigned> > &Copies,
                     const DenseSet<unsigned> &UsedByPhi,
                     bool Remove);
     void DuplicateInstruction(MachineInstr *MI,
@@ -96,7 +96,7 @@ namespace {
                               DenseMap<unsigned, unsigned> &LocalVRMap,
                               const DenseSet<unsigned> &UsedByPhi);
     void UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
-                              SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                              SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                               SmallSetVector<MachineBasicBlock*, 8> &Succs);
     bool TailDuplicateBlocks(MachineFunction &MF);
     bool shouldTailDuplicate(const MachineFunction &MF,
@@ -104,14 +104,14 @@ namespace {
     bool isSimpleBB(MachineBasicBlock *TailBB);
     bool canCompletelyDuplicateBB(MachineBasicBlock &BB);
     bool duplicateSimpleBB(MachineBasicBlock *TailBB,
-                           SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                           SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                            const DenseSet<unsigned> &RegsUsedByPhi,
-                           SmallVector<MachineInstr*, 16> &Copies);
+                           SmallVectorImpl<MachineInstr *> &Copies);
     bool TailDuplicate(MachineBasicBlock *TailBB,
                        bool IsSimple,
                        MachineFunction &MF,
-                       SmallVector<MachineBasicBlock*, 8> &TDBBs,
-                       SmallVector<MachineInstr*, 16> &Copies);
+                       SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+                       SmallVectorImpl<MachineInstr *> &Copies);
     bool TailDuplicateAndUpdate(MachineBasicBlock *MBB,
                                 bool IsSimple,
                                 MachineFunction &MF);
@@ -382,13 +382,11 @@ void TailDuplicatePass::AddSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
 /// ProcessPHI - Process PHI node in TailBB by turning it into a copy in PredBB.
 /// Remember the source register that's contributed by PredBB and update SSA
 /// update map.
-void TailDuplicatePass::ProcessPHI(MachineInstr *MI,
-                                   MachineBasicBlock *TailBB,
-                                   MachineBasicBlock *PredBB,
-                                   DenseMap<unsigned, unsigned> &LocalVRMap,
-                           SmallVector<std::pair<unsigned,unsigned>, 4> &Copies,
-                                   const DenseSet<unsigned> &RegsUsedByPhi,
-                                   bool Remove) {
+void TailDuplicatePass::ProcessPHI(
+    MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
+    DenseMap<unsigned, unsigned> &LocalVRMap,
+    SmallVectorImpl<std::pair<unsigned, unsigned> > &Copies,
+    const DenseSet<unsigned> &RegsUsedByPhi, bool Remove) {
   unsigned DefReg = MI->getOperand(0).getReg();
   unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
   assert(SrcOpIdx && "Unable to find matching PHI source?");
@@ -452,7 +450,7 @@ void TailDuplicatePass::DuplicateInstruction(MachineInstr *MI,
 /// instructions in them accordingly.
 void
 TailDuplicatePass::UpdateSuccessorsPHIs(MachineBasicBlock *FromBB, bool isDead,
-                                  SmallVector<MachineBasicBlock*, 8> &TDBBs,
+                                  SmallVectorImpl<MachineBasicBlock *> &TDBBs,
                                   SmallSetVector<MachineBasicBlock*,8> &Succs) {
   for (SmallSetVector<MachineBasicBlock*, 8>::iterator SI = Succs.begin(),
          SE = Succs.end(); SI != SE; ++SI) {
@@ -640,8 +638,6 @@ bothUsedInPHI(const MachineBasicBlock &A,
 
 bool
 TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
-  SmallPtrSet<MachineBasicBlock*, 8> Succs(BB.succ_begin(), BB.succ_end());
-
   for (MachineBasicBlock::pred_iterator PI = BB.pred_begin(),
        PE = BB.pred_end(); PI != PE; ++PI) {
     MachineBasicBlock *PredBB = *PI;
@@ -662,9 +658,9 @@ TailDuplicatePass::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
 
 bool
 TailDuplicatePass::duplicateSimpleBB(MachineBasicBlock *TailBB,
-                                     SmallVector<MachineBasicBlock*, 8> &TDBBs,
-                                     const DenseSet<unsigned> &UsedByPhi,
-                                     SmallVector<MachineInstr*, 16> &Copies) {
+                                    SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+                                    const DenseSet<unsigned> &UsedByPhi,
+                                    SmallVectorImpl<MachineInstr *> &Copies) {
   SmallPtrSet<MachineBasicBlock*, 8> Succs(TailBB->succ_begin(),
                                            TailBB->succ_end());
   SmallVector<MachineBasicBlock*, 8> Preds(TailBB->pred_begin(),
@@ -742,8 +738,8 @@ bool
 TailDuplicatePass::TailDuplicate(MachineBasicBlock *TailBB,
                                  bool IsSimple,
                                  MachineFunction &MF,
-                                 SmallVector<MachineBasicBlock*, 8> &TDBBs,
-                                 SmallVector<MachineInstr*, 16> &Copies) {
+                                 SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+                                 SmallVectorImpl<MachineInstr *> &Copies) {
   DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
 
   DenseSet<unsigned> UsedByPhi;
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index 20eb918..bf4fd65 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/Support/CommandLine.h"
@@ -276,6 +277,36 @@ bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
   return false;
 }
 
+bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
+                                        unsigned SubIdx, unsigned &Size,
+                                        unsigned &Offset,
+                                        const TargetMachine *TM) const {
+  if (!SubIdx) {
+    Size = RC->getSize();
+    Offset = 0;
+    return true;
+  }
+  unsigned BitSize = TM->getRegisterInfo()->getSubRegIdxSize(SubIdx);
+  // Convert bit size to byte size to be consistent with
+  // MCRegisterClass::getSize().
+  if (BitSize % 8)
+    return false;
+
+  int BitOffset = TM->getRegisterInfo()->getSubRegIdxOffset(SubIdx);
+  if (BitOffset < 0 || BitOffset % 8)
+    return false;
+
+  Size = BitSize /= 8;
+  Offset = (unsigned)BitOffset / 8;
+
+  assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
+
+  if (!TM->getDataLayout()->isLittleEndian()) {
+    Offset = RC->getSize() - (Offset + Size);
+  }
+  return true;
+}
+
 void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I,
                                     unsigned DestReg,
@@ -364,6 +395,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
 
   // Ask the target to do the actual folding.
   if (MachineInstr *NewMI = foldMemoryOperandImpl(MF, MI, Ops, FI)) {
+    NewMI->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
     // Add a memory operand, foldMemoryOperandImpl doesn't do that.
     assert((!(Flags & MachineMemOperand::MOStore) ||
             NewMI->mayStore()) &&
@@ -424,9 +456,19 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI,
   NewMI = MBB.insert(MI, NewMI);
 
   // Copy the memoperands from the load to the folded instruction.
-  NewMI->setMemRefs(LoadMI->memoperands_begin(),
-                    LoadMI->memoperands_end());
-
+  if (MI->memoperands_empty()) {
+    NewMI->setMemRefs(LoadMI->memoperands_begin(),
+                      LoadMI->memoperands_end());
+  }
+  else {
+    // Handle the rare case of folding multiple loads.
+    NewMI->setMemRefs(MI->memoperands_begin(),
+                      MI->memoperands_end());
+    for (MachineInstr::mmo_iterator I = LoadMI->memoperands_begin(),
+           E = LoadMI->memoperands_end(); I != E; ++I) {
+      NewMI->addMemOperand(MF, *I);
+    }
+  }
   return NewMI;
 }
 
@@ -630,6 +672,10 @@ unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel,
   return 1;
 }
 
+unsigned TargetInstrInfo::getPredicationCost(const MachineInstr *) const {
+  return 0;
+}
+
 unsigned TargetInstrInfo::
 getInstrLatency(const InstrItineraryData *ItinData,
                 const MachineInstr *MI,
@@ -668,27 +714,13 @@ getOperandLatency(const InstrItineraryData *ItinData,
 /// lookup, do so. Otherwise return -1.
 int TargetInstrInfo::computeDefOperandLatency(
   const InstrItineraryData *ItinData,
-  const MachineInstr *DefMI, bool FindMin) const {
+  const MachineInstr *DefMI) const {
 
   // Let the target hook getInstrLatency handle missing itineraries.
   if (!ItinData)
     return getInstrLatency(ItinData, DefMI);
 
-  // Return a latency based on the itinerary properties and defining instruction
-  // if possible. Some common subtargets don't require per-operand latency,
-  // especially for minimum latencies.
-  if (FindMin) {
-    // If MinLatency is valid, call getInstrLatency. This uses Stage latency if
-    // it exists before defaulting to MinLatency.
-    if (ItinData->SchedModel->MinLatency >= 0)
-      return getInstrLatency(ItinData, DefMI);
-
-    // If MinLatency is invalid, OperandLatency is interpreted as MinLatency.
-    // For empty itineraries, short-cirtuit the check and default to one cycle.
-    if (ItinData->isEmpty())
-      return 1;
-  }
-  else if(ItinData->isEmpty())
+  if(ItinData->isEmpty())
     return defaultDefLatency(ItinData->SchedModel, DefMI);
 
   // ...operand lookup required
@@ -709,10 +741,9 @@ int TargetInstrInfo::computeDefOperandLatency(
 unsigned TargetInstrInfo::
 computeOperandLatency(const InstrItineraryData *ItinData,
                       const MachineInstr *DefMI, unsigned DefIdx,
-                      const MachineInstr *UseMI, unsigned UseIdx,
-                      bool FindMin) const {
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
 
-  int DefLatency = computeDefOperandLatency(ItinData, DefMI, FindMin);
+  int DefLatency = computeDefOperandLatency(ItinData, DefMI);
   if (DefLatency >= 0)
     return DefLatency;
 
@@ -732,8 +763,7 @@ computeOperandLatency(const InstrItineraryData *ItinData,
   unsigned InstrLatency = getInstrLatency(ItinData, DefMI);
 
   // Expected latency is the max of the stage latency and itinerary props.
-  if (!FindMin)
-    InstrLatency = std::max(InstrLatency,
-                            defaultDefLatency(ItinData->SchedModel, DefMI));
+  InstrLatency = std::max(InstrLatency,
+                          defaultDefLatency(ItinData->SchedModel, DefMI));
   return InstrLatency;
 }
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 8074d16..30305af 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -191,6 +191,11 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
   Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
   Names[RTLIB::NEARBYINT_F128] = "nearbyintl";
   Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+  Names[RTLIB::ROUND_F32] = "roundf";
+  Names[RTLIB::ROUND_F64] = "round";
+  Names[RTLIB::ROUND_F80] = "roundl";
+  Names[RTLIB::ROUND_F128] = "roundl";
+  Names[RTLIB::ROUND_PPCF128] = "roundl";
   Names[RTLIB::FLOOR_F32] = "floorf";
   Names[RTLIB::FLOOR_F64] = "floor";
   Names[RTLIB::FLOOR_F80] = "floorl";
@@ -313,34 +318,62 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+  Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = "__sync_val_compare_and_swap_16";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
   Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+  Names[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = "__sync_lock_test_and_set_16";
   Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
   Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
   Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
   Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+  Names[RTLIB::SYNC_FETCH_AND_ADD_16] = "__sync_fetch_and_add_16";
   Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
   Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
   Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
   Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+  Names[RTLIB::SYNC_FETCH_AND_SUB_16] = "__sync_fetch_and_sub_16";
   Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
   Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
   Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
   Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+  Names[RTLIB::SYNC_FETCH_AND_AND_16] = "__sync_fetch_and_and_16";
   Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
   Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
   Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
   Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+  Names[RTLIB::SYNC_FETCH_AND_OR_16] = "__sync_fetch_and_or_16";
   Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
   Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
   Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
   Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+  Names[RTLIB::SYNC_FETCH_AND_XOR_16] = "__sync_fetch_and_xor_16";
   Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
   Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
   Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
   Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
+  Names[RTLIB::SYNC_FETCH_AND_NAND_16] = "__sync_fetch_and_nand_16";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_1] = "__sync_fetch_and_max_1";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_2] = "__sync_fetch_and_max_2";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_4] = "__sync_fetch_and_max_4";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_8] = "__sync_fetch_and_max_8";
+  Names[RTLIB::SYNC_FETCH_AND_MAX_16] = "__sync_fetch_and_max_16";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_1] = "__sync_fetch_and_umax_1";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_2] = "__sync_fetch_and_umax_2";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_4] = "__sync_fetch_and_umax_4";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_8] = "__sync_fetch_and_umax_8";
+  Names[RTLIB::SYNC_FETCH_AND_UMAX_16] = "__sync_fetch_and_umax_16";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_1] = "__sync_fetch_and_min_1";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_2] = "__sync_fetch_and_min_2";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_4] = "__sync_fetch_and_min_4";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_8] = "__sync_fetch_and_min_8";
+  Names[RTLIB::SYNC_FETCH_AND_MIN_16] = "__sync_fetch_and_min_16";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_1] = "__sync_fetch_and_umin_1";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_2] = "__sync_fetch_and_umin_2";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_4] = "__sync_fetch_and_umin_4";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8";
+  Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16";
   
   if (Triple(TM.getTargetTriple()).getEnvironment() == Triple::GNU) {
     Names[RTLIB::SINCOS_F32] = "sincosf";
@@ -356,6 +389,13 @@ static void InitLibcallNames(const char **Names, const TargetMachine &TM) {
     Names[RTLIB::SINCOS_F128] = 0;
     Names[RTLIB::SINCOS_PPCF128] = 0;
   }
+
+  if (Triple(TM.getTargetTriple()).getOS() != Triple::OpenBSD) {
+    Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
+  } else {
+    // These are generally not available.
+    Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = 0;
+  }
 }
 
 /// InitLibcallCallingConvs - Set default libcall CallingConvs.
@@ -624,7 +664,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm,
 
   // Perform these initializations only once.
   IsLittleEndian = TD->isLittleEndian();
-  PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0));
   MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
   MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
     = MaxStoresPerMemmoveOptSize = 4;
@@ -682,6 +721,14 @@ void TargetLoweringBase::initActions() {
     // These operations default to expand.
     setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
     setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
+
+    // These library functions default to expand.
+    setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand);
+
+    // These operations default to expand for vector types.
+    if (VT >= MVT::FIRST_VECTOR_VALUETYPE &&
+        VT <= MVT::LAST_VECTOR_VALUETYPE)
+      setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
   }
 
   // Most targets ignore the @llvm.prefetch intrinsic.
@@ -747,6 +794,19 @@ void TargetLoweringBase::initActions() {
   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
 }
 
+MVT TargetLoweringBase::getPointerTy(uint32_t AS) const {
+  return MVT::getIntegerVT(getPointerSizeInBits(AS));
+}
+
+unsigned TargetLoweringBase::getPointerSizeInBits(uint32_t AS) const {
+  return TD->getPointerSizeInBits(AS);
+}
+
+unsigned TargetLoweringBase::getPointerTypeSizeInBits(Type *Ty) const {
+  assert(Ty->isPointerTy());
+  return getPointerSizeInBits(Ty->getPointerAddressSpace());
+}
+
 MVT TargetLoweringBase::getScalarShiftAmountTy(EVT LHSTy) const {
   return MVT::getIntegerVT(8*TD->getPointerSize(0));
 }
@@ -1033,7 +1093,7 @@ void TargetLoweringBase::computeRegisterProperties() {
   }
 }
 
-EVT TargetLoweringBase::getSetCCResultType(EVT VT) const {
+EVT TargetLoweringBase::getSetCCResultType(LLVMContext &, EVT VT) const {
   assert(!VT.isVector() && "No default SetCC type for vectors!");
   return getPointerTy(0).SimpleTy;
 }
@@ -1162,7 +1222,7 @@ void llvm::GetReturnInfo(Type* ReturnType, AttributeSet attr,
       Flags.setZExt();
 
     for (unsigned i = 0; i < NumParts; ++i)
-      Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true, 0, 0));
+      Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isFixed=*/true, 0, 0));
   }
 }
 
@@ -1228,6 +1288,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
   case PtrToInt:       return ISD::BITCAST;
   case IntToPtr:       return ISD::BITCAST;
   case BitCast:        return ISD::BITCAST;
+  case AddrSpaceCast:  return ISD::ADDRSPACECAST;
   case ICmp:           return ISD::SETCC;
   case FCmp:           return ISD::SETCC;
   case PHI:            return 0;
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 7e7359a..59d7b57 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -52,10 +52,10 @@ TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV,
   default:
     report_fatal_error("We do not support this DWARF encoding yet!");
   case dwarf::DW_EH_PE_absptr:
-    return  Mang->getSymbol(GV);
+    return  getSymbol(*Mang, GV);
   case dwarf::DW_EH_PE_pcrel: {
     return getContext().GetOrCreateSymbol(StringRef("DW.ref.") +
-                                          Mang->getSymbol(GV)->getName());
+                                          getSymbol(*Mang, GV)->getName());
   }
   }
 }
@@ -104,7 +104,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
     MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
     MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
     if (StubSym.getPointer() == 0) {
-      MCSymbol *Sym = Mang->getSymbol(GV);
+      MCSymbol *Sym = getSymbol(*Mang, GV);
       StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
     }
 
@@ -252,7 +252,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
     Prefix = getSectionPrefixForGlobal(Kind);
 
     SmallString<128> Name(Prefix, Prefix+strlen(Prefix));
-    MCSymbol *Sym = Mang->getSymbol(GV);
+    MCSymbol *Sym = getSymbol(*Mang, GV);
     Name.append(Sym->getName().begin(), Sym->getName().end());
     StringRef Group = "";
     unsigned Flags = getELFSectionFlags(Kind);
@@ -523,6 +523,11 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
 const MCSection *TargetLoweringObjectFileMachO::
 SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
                        Mangler *Mang, const TargetMachine &TM) const {
+
+  // Handle thread local data.
+  if (Kind.isThreadBSS()) return TLSBSSSection;
+  if (Kind.isThreadData()) return TLSDataSection;
+
   if (Kind.isText())
     return GV->isWeakForLinker() ? TextCoalSection : TextSection;
 
@@ -575,10 +580,6 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
   if (Kind.isBSSLocal())
     return DataBSSSection;
 
-  // Handle thread local data.
-  if (Kind.isThreadBSS()) return TLSBSSSection;
-  if (Kind.isThreadData()) return TLSDataSection;
-
   // Otherwise, just drop the variable in the normal data section.
   return DataSection;
 }
@@ -613,7 +614,7 @@ shouldEmitUsedDirectiveFor(const GlobalValue *GV, Mangler *Mang) const {
     // FIXME: ObjC metadata is currently emitted as internal symbols that have
     // \1L and \0l prefixes on them.  Fix them to be Private/LinkerPrivate and
     // this horrible hack can go away.
-    MCSymbol *Sym = Mang->getSymbol(GV);
+    MCSymbol *Sym = getSymbol(*Mang, GV);
     if (Sym->getName()[0] == 'L' || Sym->getName()[0] == 'l')
       return false;
   }
@@ -642,7 +643,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
       GV->hasHiddenVisibility() ? MachOMMI.getHiddenGVStubEntry(SSym) :
                                   MachOMMI.getGVStubEntry(SSym);
     if (StubSym.getPointer() == 0) {
-      MCSymbol *Sym = Mang->getSymbol(GV);
+      MCSymbol *Sym = getSymbol(*Mang, GV);
       StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
     }
 
@@ -671,7 +672,7 @@ getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
   MCSymbol *SSym = getContext().GetOrCreateSymbol(Name.str());
   MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
   if (StubSym.getPointer() == 0) {
-    MCSymbol *Sym = Mang->getSymbol(GV);
+    MCSymbol *Sym = getSymbol(*Mang, GV);
     StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
   }
 
@@ -726,14 +727,14 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
   if (GV->isWeakForLinker()) {
     Selection = COFF::IMAGE_COMDAT_SELECT_ANY;
     Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
-    MCSymbol *Sym = Mang->getSymbol(GV);
     Name.append("$");
-    Name.append(Sym->getName().begin() + 1, Sym->getName().end());
+    Mang->getNameWithPrefix(Name, GV, false, false);
   }
   return getContext().getCOFFSection(Name,
                                      Characteristics,
-                                     Selection,
-                                     Kind);
+                                     Kind,
+                                     "",
+                                     Selection);
 }
 
 static const char *getCOFFSectionPrefixForUniqueGlobal(SectionKind Kind) {
@@ -761,24 +762,29 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
   if (GV->isWeakForLinker()) {
     const char *Prefix = getCOFFSectionPrefixForUniqueGlobal(Kind);
     SmallString<128> Name(Prefix, Prefix+strlen(Prefix));
-    MCSymbol *Sym = Mang->getSymbol(GV);
-    Name.append(Sym->getName().begin() + 1, Sym->getName().end());
+    Mang->getNameWithPrefix(Name, GV, false, false);
 
     unsigned Characteristics = getCOFFSectionFlags(Kind);
 
     Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
 
     return getContext().getCOFFSection(Name.str(), Characteristics,
-                          COFF::IMAGE_COMDAT_SELECT_ANY, Kind);
+                                       Kind, "", COFF::IMAGE_COMDAT_SELECT_ANY);
   }
 
   if (Kind.isText())
-    return getTextSection();
+    return TextSection;
 
   if (Kind.isThreadLocal())
-    return getTLSDataSection();
+    return TLSDataSection;
 
-  return getDataSection();
+  if (Kind.isReadOnly())
+    return ReadOnlySection;
+
+  if (Kind.isBSS())
+    return BSSSection;
+
+  return DataSection;
 }
 
 void TargetLoweringObjectFileCOFF::
diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index 435a5e7..f7bf86b 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/IR/Function.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/Target/TargetOptions.h"
@@ -21,7 +22,8 @@ using namespace llvm;
 bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
   // Check to see if we should eliminate non-leaf frame pointers and then
   // check to see if we should eliminate all frame pointers.
-  if (NoFramePointerElimNonLeaf && !NoFramePointerElim) {
+  if (MF.getFunction()->hasFnAttribute("no-frame-pointer-elim-non-leaf") &&
+      !NoFramePointerElim) {
     const MachineFrameInfo *MFI = MF.getFrameInfo();
     return MFI->hasCalls();
   }
@@ -49,30 +51,3 @@ bool TargetOptions::HonorSignDependentRoundingFPMath() const {
 StringRef TargetOptions::getTrapFunctionName() const {
   return TrapFuncName;
 }
-
-bool TargetOptions::operator==(const TargetOptions &TO) {
-#define ARE_EQUAL(X) X == TO.X
-  return
-    ARE_EQUAL(UnsafeFPMath) &&
-    ARE_EQUAL(NoInfsFPMath) &&
-    ARE_EQUAL(NoNaNsFPMath) &&
-    ARE_EQUAL(HonorSignDependentRoundingFPMathOption) &&
-    ARE_EQUAL(UseSoftFloat) &&
-    ARE_EQUAL(NoZerosInBSS) &&
-    ARE_EQUAL(JITExceptionHandling) &&
-    ARE_EQUAL(JITEmitDebugInfo) &&
-    ARE_EQUAL(JITEmitDebugInfoToDisk) &&
-    ARE_EQUAL(GuaranteedTailCallOpt) &&
-    ARE_EQUAL(DisableTailCalls) &&
-    ARE_EQUAL(StackAlignmentOverride) &&
-    ARE_EQUAL(RealignStack) &&
-    ARE_EQUAL(SSPBufferSize) &&
-    ARE_EQUAL(EnableFastISel) &&
-    ARE_EQUAL(PositionIndependentExecutable) &&
-    ARE_EQUAL(EnableSegmentedStacks) &&
-    ARE_EQUAL(UseInitArray) &&
-    ARE_EQUAL(TrapFuncName) &&
-    ARE_EQUAL(FloatABIType) &&
-    ARE_EQUAL(AllowFPOpFusion);
-#undef ARE_EQUAL
-}
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index 84b4bfc..5a15243 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -23,10 +23,12 @@ using namespace llvm;
 TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
                              regclass_iterator RCB, regclass_iterator RCE,
                              const char *const *SRINames,
-                             const unsigned *SRILaneMasks)
+                             const unsigned *SRILaneMasks,
+                             unsigned SRICoveringLanes)
   : InfoDesc(ID), SubRegIndexNames(SRINames),
     SubRegIndexLaneMasks(SRILaneMasks),
-    RegClassBegin(RCB), RegClassEnd(RCE) {
+    RegClassBegin(RCB), RegClassEnd(RCE),
+    CoveringLanes(SRICoveringLanes) {
 }
 
 TargetRegisterInfo::~TargetRegisterInfo() {}
@@ -71,6 +73,14 @@ void PrintRegUnit::print(raw_ostream &OS) const {
     OS << '~' << TRI->getName(*Roots);
 }
 
+void PrintVRegOrUnit::print(raw_ostream &OS) const {
+  if (TRI && TRI->isVirtualRegister(Unit)) {
+    OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit);
+    return;
+  }
+  PrintRegUnit::print(OS);
+}
+
 /// getAllocatableClass - Return the maximal subclass of the given register
 /// class that is alloctable, or NULL.
 const TargetRegisterClass *
@@ -83,7 +93,7 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
        Base < BaseE; Base += 32) {
     unsigned Idx = Base;
     for (unsigned Mask = *SubClass++; Mask; Mask >>= 1) {
-      unsigned Offset = CountTrailingZeros_32(Mask);
+      unsigned Offset = countTrailingZeros(Mask);
       const TargetRegisterClass *SubRC = getRegClass(Idx + Offset);
       if (SubRC->isAllocatable())
         return SubRC;
@@ -153,7 +163,7 @@ const TargetRegisterClass *firstCommonClass(const uint32_t *A,
                                             const TargetRegisterInfo *TRI) {
   for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
     if (unsigned Common = *A++ & *B++)
-      return TRI->getRegClass(I + CountTrailingZeros_32(Common));
+      return TRI->getRegClass(I + countTrailingZeros(Common));
   return 0;
 }
 
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 1bf14db..b0f2ca6 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -93,33 +93,10 @@ unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
 // effectively means infinite latency. Since users of the TargetSchedule API
 // don't know how to handle this, we convert it to a very large latency that is
 // easy to distinguish when debugging the DAG but won't induce overflow.
-static unsigned convertLatency(int Cycles) {
+static unsigned capLatency(int Cycles) {
   return Cycles >= 0 ? Cycles : 1000;
 }
 
-/// If we can determine the operand latency from the def only, without machine
-/// model or itinerary lookup, do so. Otherwise return -1.
-int TargetSchedModel::getDefLatency(const MachineInstr *DefMI,
-                                    bool FindMin) const {
-
-  // Return a latency based on the itinerary properties and defining instruction
-  // if possible. Some common subtargets don't require per-operand latency,
-  // especially for minimum latencies.
-  if (FindMin) {
-    // If MinLatency is invalid, then use the itinerary for MinLatency. If no
-    // itinerary exists either, then use single cycle latency.
-    if (SchedModel.MinLatency < 0 && !hasInstrItineraries()) {
-      return 1;
-    }
-    return SchedModel.MinLatency;
-  }
-  else if (!hasInstrSchedModel() && !hasInstrItineraries()) {
-    return TII->defaultDefLatency(&SchedModel, DefMI);
-  }
-  // ...operand lookup required
-  return -1;
-}
-
 /// Return the MCSchedClassDesc for this instruction. Some SchedClasses require
 /// evaluation of predicates that depend on instruction operands or flags.
 const MCSchedClassDesc *TargetSchedModel::
@@ -177,18 +154,16 @@ static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) {
 // Top-level API for clients that know the operand indices.
 unsigned TargetSchedModel::computeOperandLatency(
   const MachineInstr *DefMI, unsigned DefOperIdx,
-  const MachineInstr *UseMI, unsigned UseOperIdx,
-  bool FindMin) const {
+  const MachineInstr *UseMI, unsigned UseOperIdx) const {
 
-  int DefLatency = getDefLatency(DefMI, FindMin);
-  if (DefLatency >= 0)
-    return DefLatency;
+  if (!hasInstrSchedModel() && !hasInstrItineraries())
+    return TII->defaultDefLatency(&SchedModel, DefMI);
 
   if (hasInstrItineraries()) {
     int OperLatency = 0;
     if (UseMI) {
-      OperLatency =
-        TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx, UseMI, UseOperIdx);
+      OperLatency = TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx,
+                                           UseMI, UseOperIdx);
     }
     else {
       unsigned DefClass = DefMI->getDesc().getSchedClass();
@@ -205,13 +180,11 @@ unsigned TargetSchedModel::computeOperandLatency(
     // hook to allow subtargets to specialize latency. This hook is only
     // applicable to the InstrItins model. InstrSchedModel should model all
     // special cases without TII hooks.
-    if (!FindMin)
-      InstrLatency = std::max(InstrLatency,
-                              TII->defaultDefLatency(&SchedModel, DefMI));
+    InstrLatency = std::max(InstrLatency,
+                            TII->defaultDefLatency(&SchedModel, DefMI));
     return InstrLatency;
   }
-  assert(!FindMin && hasInstrSchedModel() &&
-         "Expected a SchedModel for this cpu");
+  // hasInstrSchedModel()
   const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
   unsigned DefIdx = findDefIdx(DefMI, DefOperIdx);
   if (DefIdx < SCDesc->NumWriteLatencyEntries) {
@@ -219,7 +192,7 @@ unsigned TargetSchedModel::computeOperandLatency(
     const MCWriteLatencyEntry *WLEntry =
       STI->getWriteLatencyEntry(SCDesc, DefIdx);
     unsigned WriteID = WLEntry->WriteResourceID;
-    unsigned Latency = convertLatency(WLEntry->Cycles);
+    unsigned Latency = capLatency(WLEntry->Cycles);
     if (!UseMI)
       return Latency;
 
@@ -228,13 +201,17 @@ unsigned TargetSchedModel::computeOperandLatency(
     if (UseDesc->NumReadAdvanceEntries == 0)
       return Latency;
     unsigned UseIdx = findUseIdx(UseMI, UseOperIdx);
-    return Latency - STI->getReadAdvanceCycles(UseDesc, UseIdx, WriteID);
+    int Advance = STI->getReadAdvanceCycles(UseDesc, UseIdx, WriteID);
+    if (Advance > 0 && (unsigned)Advance > Latency) // unsigned wrap
+      return 0;
+    return Latency - Advance;
   }
   // If DefIdx does not exist in the model (e.g. implicit defs), then return
   // unit latency (defaultDefLatency may be too conservative).
 #ifndef NDEBUG
   if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit()
-      && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()) {
+      && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()
+      && SchedModel.isComplete()) {
     std::string Err;
     raw_string_ostream ss(Err);
     ss << "DefIdx " << DefIdx << " exceeds machine model writes for "
@@ -248,10 +225,13 @@ unsigned TargetSchedModel::computeOperandLatency(
   return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI);
 }
 
-unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const {
+unsigned
+TargetSchedModel::computeInstrLatency(const MachineInstr *MI,
+                                      bool UseDefaultDefLatency) const {
   // For the itinerary model, fall back to the old subtarget hook.
   // Allow subtargets to compute Bundle latencies outside the machine model.
-  if (hasInstrItineraries() || MI->isBundle())
+  if (hasInstrItineraries() || MI->isBundle() ||
+      (!hasInstrSchedModel() && !UseDefaultDefLatency))
     return TII->getInstrLatency(&InstrItins, MI);
 
   if (hasInstrSchedModel()) {
@@ -263,7 +243,7 @@ unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const {
         // Lookup the definition's write latency in SubtargetInfo.
         const MCWriteLatencyEntry *WLEntry =
           STI->getWriteLatencyEntry(SCDesc, DefIdx);
-        Latency = std::max(Latency, convertLatency(WLEntry->Cycles));
+        Latency = std::max(Latency, capLatency(WLEntry->Cycles));
       }
       return Latency;
     }
@@ -274,13 +254,10 @@ unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const {
 unsigned TargetSchedModel::
 computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
                      const MachineInstr *DepMI) const {
-  // MinLatency == -1 is for in-order processors that always have unit
-  // MinLatency. MinLatency > 0 is for in-order processors with varying min
-  // latencies, but since this is not a RAW dep, we always use unit latency.
-  if (SchedModel.MinLatency != 0)
+  if (SchedModel.MicroOpBufferSize <= 1)
     return 1;
 
-  // MinLatency == 0 indicates an out-of-order processor that can dispatch
+  // MicroOpBufferSize > 1 indicates an out-of-order processor that can dispatch
   // WAW dependencies in the same cycle.
 
   // Treat predication as a data dependency for out-of-order cpus. In-order
@@ -302,7 +279,7 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
     if (SCDesc->isValid()) {
       for (const MCWriteProcResEntry *PRI = STI->getWriteProcResBegin(SCDesc),
              *PRE = STI->getWriteProcResEnd(SCDesc); PRI != PRE; ++PRI) {
-        if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->IsBuffered)
+        if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->BufferSize)
           return 1;
       }
     }
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 7ca2bee..b9a6b47 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1400,7 +1400,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
         VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
         SlotIndex endIdx =
           LIS->getInstructionIndex(MI).getRegSlot(IsEarlyClobber);
-        LI.addRange(LiveRange(LastCopyIdx, endIdx, VNI));
+        LI.addSegment(LiveInterval::Segment(LastCopyIdx, endIdx, VNI));
       }
     }
 
@@ -1457,7 +1457,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
 
       SlotIndex UseIdx = MIIdx.getRegSlot(IsEarlyClobber);
       if (I->end == UseIdx)
-        LI.removeRange(LastCopyIdx, UseIdx);
+        LI.removeSegment(LastCopyIdx, UseIdx);
     }
 
   } else if (RemovedKillFlag) {
@@ -1539,7 +1539,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
       // transformations that may either eliminate the tied operands or
       // improve the opportunities for coalescing away the register copy.
       if (TiedOperands.size() == 1) {
-        SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs
+        SmallVectorImpl<std::pair<unsigned, unsigned> > &TiedPairs
           = TiedOperands.begin()->second;
         if (TiedPairs.size() == 1) {
           unsigned SrcIdx = TiedPairs[0].first;
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index a95ebcd..f735ef2 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -24,7 +24,6 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
@@ -50,7 +49,6 @@ namespace {
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addPreserved<DominatorTree>();
-      AU.addPreserved<ProfileInfo>();
     }
   };
 }
@@ -87,9 +85,7 @@ bool UnreachableBlockElim::runOnFunction(Function &F) {
     }
 
   // Actually remove the blocks now.
-  ProfileInfo *PI = getAnalysisIfAvailable<ProfileInfo>();
   for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
-    if (PI) PI->removeBlock(DeadBlocks[i]);
     DeadBlocks[i]->eraseFromParent();
   }
 
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index cd012d2..e0aa405 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -28,6 +28,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
@@ -264,15 +265,36 @@ void VirtRegRewriter::rewrite() {
   SmallVector<unsigned, 8> SuperDeads;
   SmallVector<unsigned, 8> SuperDefs;
   SmallVector<unsigned, 8> SuperKills;
+  SmallPtrSet<const MachineInstr *, 4> NoReturnInsts;
 
   for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
        MBBI != MBBE; ++MBBI) {
     DEBUG(MBBI->print(dbgs(), Indexes));
+    bool IsExitBB = MBBI->succ_empty();
     for (MachineBasicBlock::instr_iterator
            MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
       MachineInstr *MI = MII;
       ++MII;
 
+      // Check if this instruction is a call to a noreturn function.
+      // If so, all the definitions set by this instruction can be ignored.
+      if (IsExitBB && MI->isCall())
+        for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+               MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+          MachineOperand &MO = *MOI;
+          if (!MO.isGlobal())
+            continue;
+          const Function *Func = dyn_cast<Function>(MO.getGlobal());
+          if (!Func || !Func->hasFnAttribute(Attribute::NoReturn) ||
+              // We need to keep correct unwind information
+              // even if the function will not return, since the
+              // runtime may need it.
+              !Func->hasFnAttribute(Attribute::NoUnwind))
+            continue;
+          NoReturnInsts.insert(MI);
+          break;
+        }
+
       for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
            MOE = MI->operands_end(); MOI != MOE; ++MOI) {
         MachineOperand &MO = *MOI;
@@ -353,7 +375,25 @@ void VirtRegRewriter::rewrite() {
   }
 
   // Tell MRI about physical registers in use.
-  for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg)
-    if (!MRI->reg_nodbg_empty(Reg))
-      MRI->setPhysRegUsed(Reg);
+  if (NoReturnInsts.empty()) {
+    for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg)
+      if (!MRI->reg_nodbg_empty(Reg))
+        MRI->setPhysRegUsed(Reg);
+  } else {
+    for (unsigned Reg = 1, RegE = TRI->getNumRegs(); Reg != RegE; ++Reg) {
+      if (MRI->reg_nodbg_empty(Reg))
+        continue;
+      // Check if this register has a use that will impact the rest of the
+      // code. Uses in debug and noreturn instructions do not impact the
+      // generated code.
+      for (MachineRegisterInfo::reg_nodbg_iterator It =
+             MRI->reg_nodbg_begin(Reg),
+             EndIt = MRI->reg_nodbg_end(); It != EndIt; ++It) {
+        if (!NoReturnInsts.count(&(*It))) {
+          MRI->setPhysRegUsed(Reg);
+          break;
+        }
+      }
+    }
+  }
 }
diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
index e97455a..61a3fb0 100644
--- a/lib/DebugInfo/CMakeLists.txt
+++ b/lib/DebugInfo/CMakeLists.txt
@@ -9,6 +9,9 @@ add_llvm_library(LLVMDebugInfo
   DWARFDebugFrame.cpp
   DWARFDebugInfoEntry.cpp
   DWARFDebugLine.cpp
+  DWARFDebugLoc.cpp
   DWARFDebugRangeList.cpp
   DWARFFormValue.cpp
+  DWARFTypeUnit.cpp
+  DWARFUnit.cpp
   )
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
index 2de62ab..f46fd58 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.cpp
@@ -14,37 +14,51 @@
 using namespace llvm;
 using namespace dwarf;
 
-bool
-DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr){
-  return extract(data, offset_ptr, data.getULEB128(offset_ptr));
+void DWARFAbbreviationDeclaration::clear() {
+  Code = 0;
+  Tag = 0;
+  HasChildren = false;
+  Attributes.clear();
 }
 
-bool
-DWARFAbbreviationDeclaration::extract(DataExtractor data, uint32_t* offset_ptr,
-                                      uint32_t code) {
-  Code = code;
-  Attribute.clear();
-  if (Code) {
-    Tag = data.getULEB128(offset_ptr);
-    HasChildren = data.getU8(offset_ptr);
+DWARFAbbreviationDeclaration::DWARFAbbreviationDeclaration() {
+  clear();
+}
 
-    while (data.isValidOffset(*offset_ptr)) {
-      uint16_t attr = data.getULEB128(offset_ptr);
-      uint16_t form = data.getULEB128(offset_ptr);
+bool
+DWARFAbbreviationDeclaration::extract(DataExtractor Data, uint32_t* OffsetPtr) {
+  clear();
+  Code = Data.getULEB128(OffsetPtr);
+  if (Code == 0) {
+    return false;
+  }
+  Tag = Data.getULEB128(OffsetPtr);
+  uint8_t ChildrenByte = Data.getU8(OffsetPtr);
+  HasChildren = (ChildrenByte == DW_CHILDREN_yes);
 
-      if (attr && form)
-        Attribute.push_back(DWARFAttribute(attr, form));
-      else
-        break;
+  while (true) {
+    uint32_t CurOffset = *OffsetPtr;
+    uint16_t Attr = Data.getULEB128(OffsetPtr);
+    if (CurOffset == *OffsetPtr) {
+      clear();
+      return false;
     }
-
-    return Tag != 0;
-  } else {
-    Tag = 0;
-    HasChildren = false;
+    CurOffset = *OffsetPtr;
+    uint16_t Form = Data.getULEB128(OffsetPtr);
+    if (CurOffset == *OffsetPtr) {
+      clear();
+      return false;
+    }
+    if (Attr == 0 && Form == 0)
+      break;
+    Attributes.push_back(AttributeSpec(Attr, Form));
   }
 
-  return false;
+  if (Tag == 0) {
+    clear();
+    return false;
+  }
+  return true;
 }
 
 void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
@@ -55,19 +69,19 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
   else
     OS << format("DW_TAG_Unknown_%x", getTag());
   OS << "\tDW_CHILDREN_" << (hasChildren() ? "yes" : "no") << '\n';
-  for (unsigned i = 0, e = Attribute.size(); i != e; ++i) {
+  for (unsigned i = 0, e = Attributes.size(); i != e; ++i) {
     OS << '\t';
-    const char *attrString = AttributeString(Attribute[i].getAttribute());
+    const char *attrString = AttributeString(Attributes[i].Attr);
     if (attrString)
       OS << attrString;
     else
-      OS << format("DW_AT_Unknown_%x", Attribute[i].getAttribute());
+      OS << format("DW_AT_Unknown_%x", Attributes[i].Attr);
     OS << '\t';
-    const char *formString = FormEncodingString(Attribute[i].getForm());
+    const char *formString = FormEncodingString(Attributes[i].Form);
     if (formString)
       OS << formString;
     else
-      OS << format("DW_FORM_Unknown_%x", Attribute[i].getForm());
+      OS << format("DW_FORM_Unknown_%x", Attributes[i].Form);
     OS << '\n';
   }
   OS << '\n';
@@ -75,8 +89,8 @@ void DWARFAbbreviationDeclaration::dump(raw_ostream &OS) const {
 
 uint32_t
 DWARFAbbreviationDeclaration::findAttributeIndex(uint16_t attr) const {
-  for (uint32_t i = 0, e = Attribute.size(); i != e; ++i) {
-    if (Attribute[i].getAttribute() == attr)
+  for (uint32_t i = 0, e = Attributes.size(); i != e; ++i) {
+    if (Attributes[i].Attr == attr)
       return i;
   }
   return -1U;
diff --git a/lib/DebugInfo/DWARFAbbreviationDeclaration.h b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
index 9a3fcd8..e9b072e 100644
--- a/lib/DebugInfo/DWARFAbbreviationDeclaration.h
+++ b/lib/DebugInfo/DWARFAbbreviationDeclaration.h
@@ -10,7 +10,6 @@
 #ifndef LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
 #define LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
 
-#include "DWARFAttribute.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/DataExtractor.h"
 
@@ -22,31 +21,33 @@ class DWARFAbbreviationDeclaration {
   uint32_t Code;
   uint32_t Tag;
   bool HasChildren;
-  SmallVector<DWARFAttribute, 8> Attribute;
+
+  struct AttributeSpec {
+    AttributeSpec(uint16_t Attr, uint16_t Form) : Attr(Attr), Form(Form) {}
+    uint16_t Attr;
+    uint16_t Form;
+  };
+  SmallVector<AttributeSpec, 8> Attributes;
 public:
-  enum { InvalidCode = 0 };
-  DWARFAbbreviationDeclaration()
-    : Code(InvalidCode), Tag(0), HasChildren(0) {}
+  DWARFAbbreviationDeclaration();
 
   uint32_t getCode() const { return Code; }
   uint32_t getTag() const { return Tag; }
   bool hasChildren() const { return HasChildren; }
-  uint32_t getNumAttributes() const { return Attribute.size(); }
+  uint32_t getNumAttributes() const { return Attributes.size(); }
   uint16_t getAttrByIndex(uint32_t idx) const {
-    return Attribute.size() > idx ? Attribute[idx].getAttribute() : 0;
+    return idx < Attributes.size() ? Attributes[idx].Attr : 0;
   }
   uint16_t getFormByIndex(uint32_t idx) const {
-    return Attribute.size() > idx ? Attribute[idx].getForm() : 0;
+    return idx < Attributes.size() ? Attributes[idx].Form : 0;
   }
 
   uint32_t findAttributeIndex(uint16_t attr) const;
-  bool extract(DataExtractor data, uint32_t* offset_ptr);
-  bool extract(DataExtractor data, uint32_t* offset_ptr, uint32_t code);
-  bool isValid() const { return Code != 0 && Tag != 0; }
+  bool extract(DataExtractor Data, uint32_t* OffsetPtr);
   void dump(raw_ostream &OS) const;
-  const SmallVectorImpl<DWARFAttribute> &getAttributes() const {
-    return Attribute;
-  }
+
+private:
+  void clear();
 };
 
 }
diff --git a/lib/DebugInfo/DWARFAttribute.h b/lib/DebugInfo/DWARFAttribute.h
deleted file mode 100644
index 6f49b63..0000000
--- a/lib/DebugInfo/DWARFAttribute.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===-- DWARFAttribute.h ----------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_DWARFATTRIBUTE_H
-#define LLVM_DEBUGINFO_DWARFATTRIBUTE_H
-
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-
-class DWARFAttribute {
-  uint16_t Attribute;
-  uint16_t Form;
-  public:
-  DWARFAttribute(uint16_t attr, uint16_t form)
-    : Attribute(attr), Form(form) {}
-
-  uint16_t getAttribute() const { return Attribute; }
-  uint16_t getForm() const { return Form; }
-};
-
-}
-
-#endif
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
index 4f0eed4..33869d8 100644
--- a/lib/DebugInfo/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -8,96 +8,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "DWARFCompileUnit.h"
-#include "DWARFContext.h"
-#include "llvm/DebugInfo/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-using namespace dwarf;
-
-DataExtractor DWARFCompileUnit::getDebugInfoExtractor() const {
-  return DataExtractor(InfoSection, isLittleEndian, AddrSize);
-}
-
-bool DWARFCompileUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
-  clear();
-
-  Offset = *offset_ptr;
-
-  if (debug_info.isValidOffset(*offset_ptr)) {
-    uint64_t abbrOffset;
-    Length = debug_info.getU32(offset_ptr);
-    Version = debug_info.getU16(offset_ptr);
-    abbrOffset = debug_info.getU32(offset_ptr);
-    AddrSize = debug_info.getU8(offset_ptr);
-
-    bool lengthOK = debug_info.isValidOffset(getNextCompileUnitOffset()-1);
-    bool versionOK = DWARFContext::isSupportedVersion(Version);
-    bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
-    bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
-
-    if (lengthOK && versionOK && addrSizeOK && abbrOffsetOK && Abbrev != NULL) {
-      Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
-      return true;
-    }
-
-    // reset the offset to where we tried to parse from if anything went wrong
-    *offset_ptr = Offset;
-  }
-
-  return false;
-}
-
-uint32_t
-DWARFCompileUnit::extract(uint32_t offset, DataExtractor debug_info_data,
-                          const DWARFAbbreviationDeclarationSet *abbrevs) {
-  clear();
 
-  Offset = offset;
-
-  if (debug_info_data.isValidOffset(offset)) {
-    Length = debug_info_data.getU32(&offset);
-    Version = debug_info_data.getU16(&offset);
-    bool abbrevsOK = debug_info_data.getU32(&offset) == abbrevs->getOffset();
-    Abbrevs = abbrevs;
-    AddrSize = debug_info_data.getU8(&offset);
-
-    bool versionOK = DWARFContext::isSupportedVersion(Version);
-    bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
-
-    if (versionOK && addrSizeOK && abbrevsOK &&
-        debug_info_data.isValidOffset(offset))
-      return offset;
-  }
-  return 0;
-}
-
-bool DWARFCompileUnit::extractRangeList(uint32_t RangeListOffset,
-                                        DWARFDebugRangeList &RangeList) const {
-  // Require that compile unit is extracted.
-  assert(DieArray.size() > 0);
-  DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
-  return RangeList.extract(RangesData, &RangeListOffset);
-}
-
-void DWARFCompileUnit::clear() {
-  Offset = 0;
-  Length = 0;
-  Version = 0;
-  Abbrevs = 0;
-  AddrSize = 0;
-  BaseAddr = 0;
-  clearDIEs(false);
-}
+using namespace llvm;
 
 void DWARFCompileUnit::dump(raw_ostream &OS) {
-  OS << format("0x%08x", Offset) << ": Compile Unit:"
-     << " length = " << format("0x%08x", Length)
-     << " version = " << format("0x%04x", Version)
-     << " abbr_offset = " << format("0x%04x", Abbrevs->getOffset())
-     << " addr_size = " << format("0x%02x", AddrSize)
-     << " (next CU at " << format("0x%08x", getNextCompileUnitOffset())
+  OS << format("0x%08x", getOffset()) << ": Compile Unit:"
+     << " length = " << format("0x%08x", getLength())
+     << " version = " << format("0x%04x", getVersion())
+     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " addr_size = " << format("0x%02x", getAddressByteSize())
+     << " (next unit at " << format("0x%08x", getNextUnitOffset())
      << ")\n";
 
   const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
@@ -105,168 +27,6 @@ void DWARFCompileUnit::dump(raw_ostream &OS) {
   CU->dump(OS, this, -1U);
 }
 
-const char *DWARFCompileUnit::getCompilationDir() {
-  extractDIEsIfNeeded(true);
-  if (DieArray.empty())
-    return 0;
-  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
-}
-
-void DWARFCompileUnit::setDIERelations() {
-  if (DieArray.empty())
-    return;
-  DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
-  DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
-  DWARFDebugInfoEntryMinimal *curr_die;
-  // We purposely are skipping the last element in the array in the loop below
-  // so that we can always have a valid next item
-  for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
-    // Since our loop doesn't include the last element, we can always
-    // safely access the next die in the array.
-    DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
-
-    const DWARFAbbreviationDeclaration *curr_die_abbrev =
-      curr_die->getAbbreviationDeclarationPtr();
-
-    if (curr_die_abbrev) {
-      // Normal DIE
-      if (curr_die_abbrev->hasChildren())
-        next_die->setParent(curr_die);
-      else
-        curr_die->setSibling(next_die);
-    } else {
-      // NULL DIE that terminates a sibling chain
-      DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
-      if (parent)
-        parent->setSibling(next_die);
-    }
-  }
-
-  // Since we skipped the last element, we need to fix it up!
-  if (die_array_begin < die_array_end)
-    curr_die->setParent(die_array_begin);
-}
-
-size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) {
-  const size_t initial_die_array_size = DieArray.size();
-  if ((cu_die_only && initial_die_array_size > 0) ||
-      initial_die_array_size > 1)
-    return 0; // Already parsed
-
-  // Set the offset to that of the first DIE and calculate the start of the
-  // next compilation unit header.
-  uint32_t offset = getFirstDIEOffset();
-  uint32_t next_cu_offset = getNextCompileUnitOffset();
-
-  DWARFDebugInfoEntryMinimal die;
-  // Keep a flat array of the DIE for binary lookup by DIE offset
-  uint32_t depth = 0;
-  // We are in our compile unit, parse starting at the offset
-  // we were told to parse
-
-  const uint8_t *fixed_form_sizes =
-    DWARFFormValue::getFixedFormSizes(getAddressByteSize(), getVersion());
-
-  while (offset < next_cu_offset &&
-         die.extractFast(this, fixed_form_sizes, &offset)) {
-
-    if (depth == 0) {
-      uint64_t base_addr =
-        die.getAttributeValueAsUnsigned(this, DW_AT_low_pc, -1U);
-      if (base_addr == -1U)
-        base_addr = die.getAttributeValueAsUnsigned(this, DW_AT_entry_pc, 0);
-      setBaseAddress(base_addr);
-    }
-
-    if (cu_die_only) {
-      addDIE(die);
-      return 1;
-    }
-    else if (depth == 0 && initial_die_array_size == 1)
-      // Don't append the CU die as we already did that
-      ;
-    else
-      addDIE(die);
-
-    const DWARFAbbreviationDeclaration *abbrDecl =
-      die.getAbbreviationDeclarationPtr();
-    if (abbrDecl) {
-      // Normal DIE
-      if (abbrDecl->hasChildren())
-        ++depth;
-    } else {
-      // NULL DIE.
-      if (depth > 0)
-        --depth;
-      if (depth == 0)
-        break;  // We are done with this compile unit!
-    }
-
-  }
-
-  // Give a little bit of info if we encounter corrupt DWARF (our offset
-  // should always terminate at or before the start of the next compilation
-  // unit header).
-  if (offset > next_cu_offset)
-    fprintf(stderr, "warning: DWARF compile unit extends beyond its "
-                    "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), offset);
-
-  setDIERelations();
-  return DieArray.size();
-}
-
-void DWARFCompileUnit::clearDIEs(bool keep_compile_unit_die) {
-  if (DieArray.size() > (unsigned)keep_compile_unit_die) {
-    // std::vectors never get any smaller when resized to a smaller size,
-    // or when clear() or erase() are called, the size will report that it
-    // is smaller, but the memory allocated remains intact (call capacity()
-    // to see this). So we need to create a temporary vector and swap the
-    // contents which will cause just the internal pointers to be swapped
-    // so that when "tmp_array" goes out of scope, it will destroy the
-    // contents.
-
-    // Save at least the compile unit DIE
-    std::vector<DWARFDebugInfoEntryMinimal> tmpArray;
-    DieArray.swap(tmpArray);
-    if (keep_compile_unit_die)
-      DieArray.push_back(tmpArray.front());
-  }
-}
-
-void
-DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
-                                         bool clear_dies_if_already_not_parsed){
-  // This function is usually called if there in no .debug_aranges section
-  // in order to produce a compile unit level set of address ranges that
-  // is accurate. If the DIEs weren't parsed, then we don't want all dies for
-  // all compile units to stay loaded when they weren't needed. So we can end
-  // up parsing the DWARF and then throwing them all away to keep memory usage
-  // down.
-  const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
-                          clear_dies_if_already_not_parsed;
-  DieArray[0].buildAddressRangeTable(this, debug_aranges);
-
-  // Keep memory down by clearing DIEs if this generate function
-  // caused them to be parsed.
-  if (clear_dies)
-    clearDIEs(true);
-}
-
-DWARFDebugInfoEntryMinimal::InlinedChain
-DWARFCompileUnit::getInlinedChainForAddress(uint64_t Address) {
-  // First, find a subprogram that contains the given address (the root
-  // of inlined chain).
-  extractDIEsIfNeeded(false);
-  const DWARFDebugInfoEntryMinimal *SubprogramDIE = 0;
-  for (size_t i = 0, n = DieArray.size(); i != n; i++) {
-    if (DieArray[i].isSubprogramDIE() &&
-        DieArray[i].addressRangeContainsAddress(this, Address)) {
-      SubprogramDIE = &DieArray[i];
-      break;
-    }
-  }
-  // Get inlined chain rooted at this subprogram DIE.
-  if (!SubprogramDIE)
-    return DWARFDebugInfoEntryMinimal::InlinedChain();
-  return SubprogramDIE->getInlinedChainForAddress(this, Address);
+// VTable anchor.
+DWARFCompileUnit::~DWARFCompileUnit() {
 }
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index 2a74605..1c9573b 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -10,132 +10,19 @@
 #ifndef LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
 #define LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
 
-#include "DWARFDebugAbbrev.h"
-#include "DWARFDebugInfoEntry.h"
-#include "DWARFDebugRangeList.h"
-#include "DWARFRelocMap.h"
-#include <vector>
+#include "DWARFUnit.h"
 
 namespace llvm {
 
-class DWARFDebugAbbrev;
-class StringRef;
-class raw_ostream;
-
-class DWARFCompileUnit {
-  const DWARFDebugAbbrev *Abbrev;
-  StringRef InfoSection;
-  StringRef AbbrevSection;
-  StringRef RangeSection;
-  StringRef StringSection;
-  StringRef StringOffsetSection;
-  StringRef AddrOffsetSection;
-  const RelocAddrMap *RelocMap;
-  bool isLittleEndian;
-
-  uint32_t Offset;
-  uint32_t Length;
-  uint16_t Version;
-  const DWARFAbbreviationDeclarationSet *Abbrevs;
-  uint8_t AddrSize;
-  uint64_t BaseAddr;
-  // The compile unit debug information entry item.
-  std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+class DWARFCompileUnit : public DWARFUnit {
 public:
-
   DWARFCompileUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
                    StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
-                   const RelocAddrMap *M, bool LE) :
-    Abbrev(DA), InfoSection(IS), AbbrevSection(AS),
-    RangeSection(RS), StringSection(SS), StringOffsetSection(SOS),
-    AddrOffsetSection(AOS), RelocMap(M), isLittleEndian(LE) {
-    clear();
-  }
-
-  StringRef getStringSection() const { return StringSection; }
-  StringRef getStringOffsetSection() const { return StringOffsetSection; }
-  StringRef getAddrOffsetSection() const { return AddrOffsetSection; }
-  const RelocAddrMap *getRelocMap() const { return RelocMap; }
-  DataExtractor getDebugInfoExtractor() const;
-
-  bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
-  uint32_t extract(uint32_t offset, DataExtractor debug_info_data,
-                   const DWARFAbbreviationDeclarationSet *abbrevs);
-
-  /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
-  /// hasn't already been done. Returns the number of DIEs parsed at this call.
-  size_t extractDIEsIfNeeded(bool cu_die_only);
-  /// extractRangeList - extracts the range list referenced by this compile
-  /// unit from .debug_ranges section. Returns true on success.
-  /// Requires that compile unit is already extracted.
-  bool extractRangeList(uint32_t RangeListOffset,
-                        DWARFDebugRangeList &RangeList) const;
-  void clear();
+                   const RelocAddrMap *M, bool LE)
+      : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
   void dump(raw_ostream &OS);
-  uint32_t getOffset() const { return Offset; }
-  /// Size in bytes of the compile unit header.
-  uint32_t getSize() const { return 11; }
-  bool containsDIEOffset(uint32_t die_offset) const {
-    return die_offset >= getFirstDIEOffset() &&
-      die_offset < getNextCompileUnitOffset();
-  }
-  uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
-  uint32_t getNextCompileUnitOffset() const { return Offset + Length + 4; }
-  /// Size in bytes of the .debug_info data associated with this compile unit.
-  size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
-  uint32_t getLength() const { return Length; }
-  uint16_t getVersion() const { return Version; }
-  const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
-    return Abbrevs;
-  }
-  uint8_t getAddressByteSize() const { return AddrSize; }
-  uint64_t getBaseAddress() const { return BaseAddr; }
-
-  void setBaseAddress(uint64_t base_addr) {
-    BaseAddr = base_addr;
-  }
-
-  const DWARFDebugInfoEntryMinimal *
-  getCompileUnitDIE(bool extract_cu_die_only = true) {
-    extractDIEsIfNeeded(extract_cu_die_only);
-    if (DieArray.empty())
-      return NULL;
-    return &DieArray[0];
-  }
-
-  const char *getCompilationDir();
-
-  /// setDIERelations - We read in all of the DIE entries into our flat list
-  /// of DIE entries and now we need to go back through all of them and set the
-  /// parent, sibling and child pointers for quick DIE navigation.
-  void setDIERelations();
-
-  void addDIE(DWARFDebugInfoEntryMinimal &die) {
-    // The average bytes per DIE entry has been seen to be
-    // around 14-20 so lets pre-reserve the needed memory for
-    // our DIE entries accordingly. Search forward for "Compute
-    // average bytes per DIE" to see #if'ed out code that does
-    // that determination.
-
-    // Only reserve the memory if we are adding children of
-    // the main compile unit DIE. The compile unit DIE is always
-    // the first entry, so if our size is 1, then we are adding
-    // the first compile unit child DIE and should reserve
-    // the memory.
-    if (DieArray.empty())
-      DieArray.reserve(getDebugInfoSize() / 14);
-    DieArray.push_back(die);
-  }
-
-  void clearDIEs(bool keep_compile_unit_die);
-
-  void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
-                              bool clear_dies_if_already_not_parsed);
-
-  /// getInlinedChainForAddress - fetches inlined chain for a given address.
-  /// Returns empty chain if there is no subprogram containing address.
-  DWARFDebugInfoEntryMinimal::InlinedChain getInlinedChainForAddress(
-      uint64_t Address);
+  // VTable anchor.
+  ~DWARFCompileUnit() LLVM_OVERRIDE;
 };
 
 }
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index 9f52133..e477190 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -19,9 +19,45 @@
 #include <algorithm>
 using namespace llvm;
 using namespace dwarf;
+using namespace object;
 
 typedef DWARFDebugLine::LineTable DWARFLineTable;
 
+DWARFContext::~DWARFContext() {
+  DeleteContainerPointers(CUs);
+  DeleteContainerPointers(TUs);
+  DeleteContainerPointers(DWOCUs);
+}
+
+static void dumpPubSection(raw_ostream &OS, StringRef Name, StringRef Data,
+                           bool LittleEndian, bool GnuStyle) {
+  OS << "\n." << Name << " contents:\n";
+  DataExtractor pubNames(Data, LittleEndian, 0);
+  uint32_t offset = 0;
+  OS << "length = " << format("0x%08x", pubNames.getU32(&offset));
+  OS << " version = " << format("0x%04x", pubNames.getU16(&offset));
+  OS << " unit_offset = " << format("0x%08x", pubNames.getU32(&offset));
+  OS << " unit_size = " << format("0x%08x", pubNames.getU32(&offset)) << '\n';
+  if (GnuStyle)
+    OS << "Offset     Linkage  Kind     Name\n";
+  else
+    OS << "Offset     Name\n";
+
+  while (offset < Data.size()) {
+    uint32_t dieRef = pubNames.getU32(&offset);
+    if (dieRef == 0)
+      break;
+    OS << format("0x%8.8x ", dieRef);
+    if (GnuStyle) {
+      PubIndexEntryDescriptor desc(pubNames.getU8(&offset));
+      OS << format("%-8s", dwarf::GDBIndexEntryLinkageString(desc.Linkage))
+         << ' ' << format("%-8s", dwarf::GDBIndexEntryKindString(desc.Kind))
+         << ' ';
+    }
+    OS << '\"' << pubNames.getCStr(&offset) << "\"\n";
+  }
+}
+
 void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
   if (DumpType == DIDT_All || DumpType == DIDT_Abbrev) {
     OS << ".debug_abbrev contents:\n";
@@ -34,6 +70,17 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
       getCompileUnitAtIndex(i)->dump(OS);
   }
 
+  if (DumpType == DIDT_All || DumpType == DIDT_Types) {
+    OS << "\n.debug_types contents:\n";
+    for (unsigned i = 0, e = getNumTypeUnits(); i != e; ++i)
+      getTypeUnitAtIndex(i)->dump(OS);
+  }
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Loc) {
+    OS << "\n.debug_loc contents:\n";
+    getDebugLoc()->dump(OS);
+  }
+
   if (DumpType == DIDT_All || DumpType == DIDT_Frames) {
     OS << "\n.debug_frame contents:\n";
     getDebugFrame()->dump(OS);
@@ -55,13 +102,13 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
       DWARFCompileUnit *cu = getCompileUnitAtIndex(i);
       savedAddressByteSize = cu->getAddressByteSize();
       unsigned stmtOffset =
-        cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
-                                                             -1U);
+          cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+              cu, DW_AT_stmt_list, -1U);
       if (stmtOffset != -1U) {
-        DataExtractor lineData(getLineSection(), isLittleEndian(),
+        DataExtractor lineData(getLineSection().Data, isLittleEndian(),
                                savedAddressByteSize);
         DWARFDebugLine::DumpingState state(OS);
-        DWARFDebugLine::parseStatementTable(lineData, &lineRelocMap(), &stmtOffset, state);
+        DWARFDebugLine::parseStatementTable(lineData, &getLineSection().Relocs, &stmtOffset, state);
       }
     }
   }
@@ -91,23 +138,21 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType) {
       rangeList.dump(OS);
   }
 
-  if (DumpType == DIDT_All || DumpType == DIDT_Pubnames) {
-    OS << "\n.debug_pubnames contents:\n";
-    DataExtractor pubNames(getPubNamesSection(), isLittleEndian(), 0);
-    offset = 0;
-    OS << "Length:                " << pubNames.getU32(&offset) << "\n";
-    OS << "Version:               " << pubNames.getU16(&offset) << "\n";
-    OS << "Offset in .debug_info: " << pubNames.getU32(&offset) << "\n";
-    OS << "Size:                  " << pubNames.getU32(&offset) << "\n";
-    OS << "\n  Offset    Name\n";
-    while (offset < getPubNamesSection().size()) {
-      uint32_t n = pubNames.getU32(&offset);
-      if (n == 0)
-        break;
-      OS << format("%8x    ", n);
-      OS << pubNames.getCStr(&offset) << "\n";
-    }
-  }
+  if (DumpType == DIDT_All || DumpType == DIDT_Pubnames)
+    dumpPubSection(OS, "debug_pubnames", getPubNamesSection(),
+                   isLittleEndian(), false);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_Pubtypes)
+    dumpPubSection(OS, "debug_pubtypes", getPubTypesSection(),
+                   isLittleEndian(), false);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubnames)
+    dumpPubSection(OS, "debug_gnu_pubnames", getGnuPubNamesSection(),
+                   isLittleEndian(), true /* GnuStyle */);
+
+  if (DumpType == DIDT_All || DumpType == DIDT_GnuPubtypes)
+    dumpPubSection(OS, "debug_gnu_pubtypes", getGnuPubTypesSection(),
+                   isLittleEndian(), true /* GnuStyle */);
 
   if (DumpType == DIDT_All || DumpType == DIDT_AbbrevDwo) {
     const DWARFDebugAbbrev *D = getDebugAbbrevDWO();
@@ -170,17 +215,23 @@ const DWARFDebugAbbrev *DWARFContext::getDebugAbbrevDWO() {
   return AbbrevDWO.get();
 }
 
+const DWARFDebugLoc *DWARFContext::getDebugLoc() {
+  if (Loc)
+    return Loc.get();
+
+  DataExtractor LocData(getLocSection().Data, isLittleEndian(), 0);
+  Loc.reset(new DWARFDebugLoc(getLocSection().Relocs));
+  // assume all compile units have the same address byte size
+  if (getNumCompileUnits())
+    Loc->parse(LocData, getCompileUnitAtIndex(0)->getAddressByteSize());
+  return Loc.get();
+}
+
 const DWARFDebugAranges *DWARFContext::getDebugAranges() {
   if (Aranges)
     return Aranges.get();
 
-  DataExtractor arangesData(getARangeSection(), isLittleEndian(), 0);
-
   Aranges.reset(new DWARFDebugAranges());
-  Aranges->extract(arangesData);
-  // Generate aranges from DIEs: even if .debug_aranges section is present,
-  // it may describe only a small subset of compilation units, so we need to
-  // manually build aranges for the rest of them.
   Aranges->generate(this);
   return Aranges.get();
 }
@@ -208,11 +259,11 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() {
 const DWARFLineTable *
 DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
   if (!Line)
-    Line.reset(new DWARFDebugLine(&lineRelocMap()));
+    Line.reset(new DWARFDebugLine(&getLineSection().Relocs));
 
   unsigned stmtOffset =
-    cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
-                                                         -1U);
+      cu->getCompileUnitDIE()->getAttributeValueAsSectionOffset(
+          cu, DW_AT_stmt_list, -1U);
   if (stmtOffset == -1U)
     return 0; // No line table for this compile unit.
 
@@ -221,64 +272,79 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
     return lt;
 
   // We have to parse it first.
-  DataExtractor lineData(getLineSection(), isLittleEndian(),
+  DataExtractor lineData(getLineSection().Data, isLittleEndian(),
                          cu->getAddressByteSize());
   return Line->getOrParseLineTable(lineData, stmtOffset);
 }
 
 void DWARFContext::parseCompileUnits() {
   uint32_t offset = 0;
-  const DataExtractor &DIData = DataExtractor(getInfoSection(),
+  const DataExtractor &DIData = DataExtractor(getInfoSection().Data,
                                               isLittleEndian(), 0);
   while (DIData.isValidOffset(offset)) {
-    CUs.push_back(DWARFCompileUnit(getDebugAbbrev(), getInfoSection(),
-                                   getAbbrevSection(), getRangeSection(),
-                                   getStringSection(), StringRef(),
-                                   getAddrSection(),
-                                   &infoRelocMap(),
-                                   isLittleEndian()));
-    if (!CUs.back().extract(DIData, &offset)) {
-      CUs.pop_back();
+    OwningPtr<DWARFCompileUnit> CU(new DWARFCompileUnit(
+        getDebugAbbrev(), getInfoSection().Data, getAbbrevSection(),
+        getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
+        &getInfoSection().Relocs, isLittleEndian()));
+    if (!CU->extract(DIData, &offset)) {
       break;
     }
+    CUs.push_back(CU.take());
+    offset = CUs.back()->getNextUnitOffset();
+  }
+}
 
-    offset = CUs.back().getNextCompileUnitOffset();
+void DWARFContext::parseTypeUnits() {
+  const std::map<object::SectionRef, Section> &Sections = getTypesSections();
+  for (std::map<object::SectionRef, Section>::const_iterator
+           I = Sections.begin(),
+           E = Sections.end();
+       I != E; ++I) {
+    uint32_t offset = 0;
+    const DataExtractor &DIData =
+        DataExtractor(I->second.Data, isLittleEndian(), 0);
+    while (DIData.isValidOffset(offset)) {
+      OwningPtr<DWARFTypeUnit> TU(new DWARFTypeUnit(
+          getDebugAbbrev(), I->second.Data, getAbbrevSection(),
+          getRangeSection(), getStringSection(), StringRef(), getAddrSection(),
+          &I->second.Relocs, isLittleEndian()));
+      if (!TU->extract(DIData, &offset))
+        break;
+      TUs.push_back(TU.take());
+      offset = TUs.back()->getNextUnitOffset();
+    }
   }
 }
 
 void DWARFContext::parseDWOCompileUnits() {
   uint32_t offset = 0;
-  const DataExtractor &DIData = DataExtractor(getInfoDWOSection(),
-                                              isLittleEndian(), 0);
+  const DataExtractor &DIData =
+      DataExtractor(getInfoDWOSection().Data, isLittleEndian(), 0);
   while (DIData.isValidOffset(offset)) {
-    DWOCUs.push_back(DWARFCompileUnit(getDebugAbbrevDWO(), getInfoDWOSection(),
-                                      getAbbrevDWOSection(),
-                                      getRangeDWOSection(),
-                                      getStringDWOSection(),
-                                      getStringOffsetDWOSection(),
-                                      getAddrSection(),
-                                      &infoDWORelocMap(),
-                                      isLittleEndian()));
-    if (!DWOCUs.back().extract(DIData, &offset)) {
-      DWOCUs.pop_back();
+    OwningPtr<DWARFCompileUnit> DWOCU(new DWARFCompileUnit(
+        getDebugAbbrevDWO(), getInfoDWOSection().Data, getAbbrevDWOSection(),
+        getRangeDWOSection(), getStringDWOSection(),
+        getStringOffsetDWOSection(), getAddrSection(),
+        &getInfoDWOSection().Relocs, isLittleEndian()));
+    if (!DWOCU->extract(DIData, &offset)) {
       break;
     }
-
-    offset = DWOCUs.back().getNextCompileUnitOffset();
+    DWOCUs.push_back(DWOCU.take());
+    offset = DWOCUs.back()->getNextUnitOffset();
   }
 }
 
 namespace {
   struct OffsetComparator {
-    bool operator()(const DWARFCompileUnit &LHS,
-                    const DWARFCompileUnit &RHS) const {
-      return LHS.getOffset() < RHS.getOffset();
+    bool operator()(const DWARFCompileUnit *LHS,
+                    const DWARFCompileUnit *RHS) const {
+      return LHS->getOffset() < RHS->getOffset();
     }
-    bool operator()(const DWARFCompileUnit &LHS, uint32_t RHS) const {
-      return LHS.getOffset() < RHS;
+    bool operator()(const DWARFCompileUnit *LHS, uint32_t RHS) const {
+      return LHS->getOffset() < RHS;
     }
-    bool operator()(uint32_t LHS, const DWARFCompileUnit &RHS) const {
-      return LHS < RHS.getOffset();
+    bool operator()(uint32_t LHS, const DWARFCompileUnit *RHS) const {
+      return LHS < RHS->getOffset();
     }
   };
 }
@@ -287,10 +353,11 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
   if (CUs.empty())
     parseCompileUnits();
 
-  DWARFCompileUnit *CU = std::lower_bound(CUs.begin(), CUs.end(), Offset,
-                                          OffsetComparator());
-  if (CU != CUs.end())
-    return &*CU;
+  DWARFCompileUnit **CU =
+      std::lower_bound(CUs.begin(), CUs.end(), Offset, OffsetComparator());
+  if (CU != CUs.end()) {
+    return *CU;
+  }
   return 0;
 }
 
@@ -358,11 +425,11 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
     // The address may correspond to instruction in some inlined function,
     // so we have to build the chain of inlined functions and take the
     // name of the topmost function in it.
-    const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+    const DWARFDebugInfoEntryInlinedChain &InlinedChain =
         CU->getInlinedChainForAddress(Address);
-    if (InlinedChain.size() > 0) {
-      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain[0];
-      if (const char *Name = TopFunctionDIE.getSubroutineName(CU))
+    if (InlinedChain.DIEs.size() > 0) {
+      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
+      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
         FunctionName = Name;
     }
   }
@@ -391,23 +458,20 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
     // The address may correspond to instruction in some inlined function,
     // so we have to build the chain of inlined functions and take the
     // name of the topmost function in it.
-    const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+    const DWARFDebugInfoEntryInlinedChain &InlinedChain =
         CU->getInlinedChainForAddress(Address);
-    if (InlinedChain.size() > 0) {
-      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain[0];
-      if (const char *Name = TopFunctionDIE.getSubroutineName(CU))
+    if (InlinedChain.DIEs.size() > 0) {
+      const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain.DIEs[0];
+      if (const char *Name = TopFunctionDIE.getSubroutineName(InlinedChain.U))
         FunctionName = Name;
     }
   }
 
-  StringRef  FuncNameRef = StringRef(FunctionName);
-
   // If the Specifier says we don't need FileLineInfo, just
   // return the top-most function at the starting address.
   if (!Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
-    Lines.push_back(std::make_pair(Address, 
-                                   DILineInfo(StringRef("<invalid>"), 
-                                              FuncNameRef, 0, 0)));
+    Lines.push_back(
+        std::make_pair(Address, DILineInfo("<invalid>", FunctionName, 0, 0)));
     return Lines;
   }
 
@@ -428,9 +492,8 @@ DILineInfoTable DWARFContext::getLineInfoForAddressRange(uint64_t Address,
     std::string FileName = "<invalid>";
     getFileNameForCompileUnit(CU, LineTable, Row.File,
                               NeedsAbsoluteFilePath, FileName);
-    Lines.push_back(std::make_pair(Row.Address, 
-                                   DILineInfo(StringRef(FileName),
-                                         FuncNameRef, Row.Line, Row.Column)));
+    Lines.push_back(std::make_pair(
+        Row.Address, DILineInfo(FileName, FunctionName, Row.Line, Row.Column)));
   }
 
   return Lines;
@@ -442,23 +505,23 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
   if (!CU)
     return DIInliningInfo();
 
-  const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+  const DWARFDebugInfoEntryInlinedChain &InlinedChain =
       CU->getInlinedChainForAddress(Address);
-  if (InlinedChain.size() == 0)
+  if (InlinedChain.DIEs.size() == 0)
     return DIInliningInfo();
 
   DIInliningInfo InliningInfo;
   uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
   const DWARFLineTable *LineTable = 0;
-  for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) {
-    const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain[i];
+  for (uint32_t i = 0, n = InlinedChain.DIEs.size(); i != n; i++) {
+    const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain.DIEs[i];
     std::string FileName = "<invalid>";
     std::string FunctionName = "<invalid>";
     uint32_t Line = 0;
     uint32_t Column = 0;
     // Get function name if necessary.
     if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
-      if (const char *Name = FunctionDIE.getSubroutineName(CU))
+      if (const char *Name = FunctionDIE.getSubroutineName(InlinedChain.U))
         FunctionName = Name;
     }
     if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
@@ -482,7 +545,8 @@ DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
       }
       // Get call file/line/column of a current DIE.
       if (i + 1 < n) {
-        FunctionDIE.getCallerFrame(CU, CallFile, CallLine, CallColumn);
+        FunctionDIE.getCallerFrame(InlinedChain.U, CallFile, CallLine,
+                                   CallColumn);
       }
     }
     DILineInfo Frame(StringRef(FileName), StringRef(FunctionName),
@@ -538,43 +602,67 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) :
       UncompressedSections.push_back(UncompressedSection.take());
     }
 
-    StringRef *Section = StringSwitch<StringRef*>(name)
-        .Case("debug_info", &InfoSection)
-        .Case("debug_abbrev", &AbbrevSection)
-        .Case("debug_line", &LineSection)
-        .Case("debug_aranges", &ARangeSection)
-        .Case("debug_frame", &DebugFrameSection)
-        .Case("debug_str", &StringSection)
-        .Case("debug_ranges", &RangeSection)
-        .Case("debug_pubnames", &PubNamesSection)
-        .Case("debug_info.dwo", &InfoDWOSection)
-        .Case("debug_abbrev.dwo", &AbbrevDWOSection)
-        .Case("debug_str.dwo", &StringDWOSection)
-        .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
-        .Case("debug_addr", &AddrSection)
-        // Any more debug info sections go here.
-        .Default(0);
-    if (!Section)
-      continue;
-    *Section = data;
-    if (name == "debug_ranges") {
-      // FIXME: Use the other dwo range section when we emit it.
-      RangeDWOSection = data;
+    StringRef *Section =
+        StringSwitch<StringRef *>(name)
+            .Case("debug_info", &InfoSection.Data)
+            .Case("debug_abbrev", &AbbrevSection)
+            .Case("debug_loc", &LocSection.Data)
+            .Case("debug_line", &LineSection.Data)
+            .Case("debug_aranges", &ARangeSection)
+            .Case("debug_frame", &DebugFrameSection)
+            .Case("debug_str", &StringSection)
+            .Case("debug_ranges", &RangeSection)
+            .Case("debug_pubnames", &PubNamesSection)
+            .Case("debug_pubtypes", &PubTypesSection)
+            .Case("debug_gnu_pubnames", &GnuPubNamesSection)
+            .Case("debug_gnu_pubtypes", &GnuPubTypesSection)
+            .Case("debug_info.dwo", &InfoDWOSection.Data)
+            .Case("debug_abbrev.dwo", &AbbrevDWOSection)
+            .Case("debug_str.dwo", &StringDWOSection)
+            .Case("debug_str_offsets.dwo", &StringOffsetDWOSection)
+            .Case("debug_addr", &AddrSection)
+            // Any more debug info sections go here.
+            .Default(0);
+    if (Section) {
+      *Section = data;
+      if (name == "debug_ranges") {
+        // FIXME: Use the other dwo range section when we emit it.
+        RangeDWOSection = data;
+      }
+    } else if (name == "debug_types") {
+      // Find debug_types data by section rather than name as there are
+      // multiple, comdat grouped, debug_types sections.
+      TypesSections[*i].Data = data;
     }
 
+    section_iterator RelocatedSection = i->getRelocatedSection();
+    if (RelocatedSection == Obj->end_sections())
+      continue;
+
+    StringRef RelSecName;
+    RelocatedSection->getName(RelSecName);
+    RelSecName = RelSecName.substr(
+        RelSecName.find_first_not_of("._")); // Skip . and _ prefixes.
+
     // TODO: Add support for relocations in other sections as needed.
     // Record relocations for the debug_info and debug_line sections.
-    RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(name)
-        .Case("debug_info", &InfoRelocMap)
-        .Case("debug_info.dwo", &InfoDWORelocMap)
-        .Case("debug_line", &LineRelocMap)
+    RelocAddrMap *Map = StringSwitch<RelocAddrMap*>(RelSecName)
+        .Case("debug_info", &InfoSection.Relocs)
+        .Case("debug_loc", &LocSection.Relocs)
+        .Case("debug_info.dwo", &InfoDWOSection.Relocs)
+        .Case("debug_line", &LineSection.Relocs)
         .Default(0);
-    if (!Map)
-      continue;
+    if (!Map) {
+      if (RelSecName != "debug_types")
+        continue;
+      // Find debug_types relocs by section rather than name as there are
+      // multiple, comdat grouped, debug_types sections.
+      Map = &TypesSections[*RelocatedSection].Relocs;
+    }
 
     if (i->begin_relocations() != i->end_relocations()) {
       uint64_t SectionSize;
-      i->getSize(SectionSize);
+      RelocatedSection->getSize(SectionSize);
       for (object::relocation_iterator reloc_i = i->begin_relocations(),
              reloc_e = i->end_relocations();
            reloc_i != reloc_e; reloc_i.increment(ec)) {
@@ -585,9 +673,8 @@ DWARFContextInMemory::DWARFContextInMemory(object::ObjectFile *Obj) :
         uint64_t SymAddr = 0;
         // ELF relocations may need the symbol address
         if (Obj->isELF()) {
-          object::SymbolRef Sym;
-          reloc_i->getSymbol(Sym);
-          Sym.getAddress(SymAddr);
+          object::symbol_iterator Sym = reloc_i->getSymbol();
+          Sym->getAddress(SymAddr);
         }
 
         object::RelocVisitor V(Obj->getFileFormatName());
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index 78c18e6..03863ab 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -14,7 +14,9 @@
 #include "DWARFDebugAranges.h"
 #include "DWARFDebugFrame.h"
 #include "DWARFDebugLine.h"
+#include "DWARFDebugLoc.h"
 #include "DWARFDebugRangeList.h"
+#include "DWARFTypeUnit.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/DIContext.h"
@@ -26,13 +28,15 @@ namespace llvm {
 /// information parsing. The actual data is supplied through pure virtual
 /// methods that a concrete implementation provides.
 class DWARFContext : public DIContext {
-  SmallVector<DWARFCompileUnit, 1> CUs;
+  SmallVector<DWARFCompileUnit *, 1> CUs;
+  SmallVector<DWARFTypeUnit *, 1> TUs;
   OwningPtr<DWARFDebugAbbrev> Abbrev;
+  OwningPtr<DWARFDebugLoc> Loc;
   OwningPtr<DWARFDebugAranges> Aranges;
   OwningPtr<DWARFDebugLine> Line;
   OwningPtr<DWARFDebugFrame> DebugFrame;
 
-  SmallVector<DWARFCompileUnit, 1> DWOCUs;
+  SmallVector<DWARFCompileUnit *, 1> DWOCUs;
   OwningPtr<DWARFDebugAbbrev> AbbrevDWO;
 
   DWARFContext(DWARFContext &) LLVM_DELETED_FUNCTION;
@@ -41,12 +45,26 @@ class DWARFContext : public DIContext {
   /// Read compile units from the debug_info section and store them in CUs.
   void parseCompileUnits();
 
+  /// Read type units from the debug_types sections and store them in CUs.
+  void parseTypeUnits();
+
   /// Read compile units from the debug_info.dwo section and store them in
   /// DWOCUs.
   void parseDWOCompileUnits();
 
 public:
-  DWARFContext() {}
+  struct Section {
+    StringRef Data;
+    RelocAddrMap Relocs;
+  };
+
+  DWARFContext() : DIContext(CK_DWARF) {}
+  virtual ~DWARFContext();
+
+  static bool classof(const DIContext *DICtx) {
+    return DICtx->getKind() == CK_DWARF;
+  }
+
   virtual void dump(raw_ostream &OS, DIDumpType DumpType = DIDT_All);
 
   /// Get the number of compile units in this context.
@@ -56,6 +74,13 @@ public:
     return CUs.size();
   }
 
+  /// Get the number of compile units in this context.
+  unsigned getNumTypeUnits() {
+    if (TUs.empty())
+      parseTypeUnits();
+    return TUs.size();
+  }
+
   /// Get the number of compile units in the DWO context.
   unsigned getNumDWOCompileUnits() {
     if (DWOCUs.empty())
@@ -67,19 +92,29 @@ public:
   DWARFCompileUnit *getCompileUnitAtIndex(unsigned index) {
     if (CUs.empty())
       parseCompileUnits();
-    return &CUs[index];
+    return CUs[index];
+  }
+
+  /// Get the type unit at the specified index for this compile unit.
+  DWARFTypeUnit *getTypeUnitAtIndex(unsigned index) {
+    if (TUs.empty())
+      parseTypeUnits();
+    return TUs[index];
   }
 
   /// Get the compile unit at the specified index for the DWO compile units.
   DWARFCompileUnit *getDWOCompileUnitAtIndex(unsigned index) {
     if (DWOCUs.empty())
       parseDWOCompileUnits();
-    return &DWOCUs[index];
+    return DWOCUs[index];
   }
 
   /// Get a pointer to the parsed DebugAbbrev object.
   const DWARFDebugAbbrev *getDebugAbbrev();
 
+  /// Get a pointer to the parsed DebugLoc object.
+  const DWARFDebugLoc *getDebugLoc();
+
   /// Get a pointer to the parsed dwo abbreviations object.
   const DWARFDebugAbbrev *getDebugAbbrevDWO();
 
@@ -102,28 +137,30 @@ public:
 
   virtual bool isLittleEndian() const = 0;
   virtual uint8_t getAddressSize() const = 0;
-  virtual const RelocAddrMap &infoRelocMap() const = 0;
-  virtual const RelocAddrMap &lineRelocMap() const = 0;
-  virtual StringRef getInfoSection() = 0;
+  virtual const Section &getInfoSection() = 0;
+  virtual const std::map<object::SectionRef, Section> &getTypesSections() = 0;
   virtual StringRef getAbbrevSection() = 0;
+  virtual const Section &getLocSection() = 0;
   virtual StringRef getARangeSection() = 0;
   virtual StringRef getDebugFrameSection() = 0;
-  virtual StringRef getLineSection() = 0;
+  virtual const Section &getLineSection() = 0;
   virtual StringRef getStringSection() = 0;
   virtual StringRef getRangeSection() = 0;
   virtual StringRef getPubNamesSection() = 0;
+  virtual StringRef getPubTypesSection() = 0;
+  virtual StringRef getGnuPubNamesSection() = 0;
+  virtual StringRef getGnuPubTypesSection() = 0;
 
   // Sections for DWARF5 split dwarf proposal.
-  virtual StringRef getInfoDWOSection() = 0;
+  virtual const Section &getInfoDWOSection() = 0;
   virtual StringRef getAbbrevDWOSection() = 0;
   virtual StringRef getStringDWOSection() = 0;
   virtual StringRef getStringOffsetDWOSection() = 0;
   virtual StringRef getRangeDWOSection() = 0;
   virtual StringRef getAddrSection() = 0;
-  virtual const RelocAddrMap &infoDWORelocMap() const = 0;
 
   static bool isSupportedVersion(unsigned version) {
-    return version == 2 || version == 3;
+    return version == 2 || version == 3 || version == 4;
   }
 private:
   /// Return the compile unit that includes an offset (relative to .debug_info).
@@ -141,20 +178,22 @@ class DWARFContextInMemory : public DWARFContext {
   virtual void anchor();
   bool IsLittleEndian;
   uint8_t AddressSize;
-  RelocAddrMap InfoRelocMap;
-  RelocAddrMap LineRelocMap;
-  StringRef InfoSection;
+  Section InfoSection;
+  std::map<object::SectionRef, Section> TypesSections;
   StringRef AbbrevSection;
+  Section LocSection;
   StringRef ARangeSection;
   StringRef DebugFrameSection;
-  StringRef LineSection;
+  Section LineSection;
   StringRef StringSection;
   StringRef RangeSection;
   StringRef PubNamesSection;
+  StringRef PubTypesSection;
+  StringRef GnuPubNamesSection;
+  StringRef GnuPubTypesSection;
 
   // Sections for DWARF5 split dwarf proposal.
-  RelocAddrMap InfoDWORelocMap;
-  StringRef InfoDWOSection;
+  Section InfoDWOSection;
   StringRef AbbrevDWOSection;
   StringRef StringDWOSection;
   StringRef StringOffsetDWOSection;
@@ -168,19 +207,24 @@ public:
   ~DWARFContextInMemory();
   virtual bool isLittleEndian() const { return IsLittleEndian; }
   virtual uint8_t getAddressSize() const { return AddressSize; }
-  virtual const RelocAddrMap &infoRelocMap() const { return InfoRelocMap; }
-  virtual const RelocAddrMap &lineRelocMap() const { return LineRelocMap; }
-  virtual StringRef getInfoSection() { return InfoSection; }
+  virtual const Section &getInfoSection() { return InfoSection; }
+  virtual const std::map<object::SectionRef, Section> &getTypesSections() {
+    return TypesSections;
+  }
   virtual StringRef getAbbrevSection() { return AbbrevSection; }
+  virtual const Section &getLocSection() { return LocSection; }
   virtual StringRef getARangeSection() { return ARangeSection; }
   virtual StringRef getDebugFrameSection() { return DebugFrameSection; }
-  virtual StringRef getLineSection() { return LineSection; }
+  virtual const Section &getLineSection() { return LineSection; }
   virtual StringRef getStringSection() { return StringSection; }
   virtual StringRef getRangeSection() { return RangeSection; }
   virtual StringRef getPubNamesSection() { return PubNamesSection; }
+  virtual StringRef getPubTypesSection() { return PubTypesSection; }
+  virtual StringRef getGnuPubNamesSection() { return GnuPubNamesSection; }
+  virtual StringRef getGnuPubTypesSection() { return GnuPubTypesSection; }
 
   // Sections for DWARF5 split dwarf proposal.
-  virtual StringRef getInfoDWOSection() { return InfoDWOSection; }
+  virtual const Section &getInfoDWOSection() { return InfoDWOSection; }
   virtual StringRef getAbbrevDWOSection() { return AbbrevDWOSection; }
   virtual StringRef getStringDWOSection() { return StringDWOSection; }
   virtual StringRef getStringOffsetDWOSection() {
@@ -190,9 +234,6 @@ public:
   virtual StringRef getAddrSection() {
     return AddrSection;
   }
-  virtual const RelocAddrMap &infoDWORelocMap() const {
-    return InfoDWORelocMap;
-  }
 };
 
 }
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARFDebugArangeSet.cpp
index 7dff9ff..229376e 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.cpp
+++ b/lib/DebugInfo/DWARFDebugArangeSet.cpp
@@ -20,32 +20,6 @@ void DWARFDebugArangeSet::clear() {
   ArangeDescriptors.clear();
 }
 
-void DWARFDebugArangeSet::compact() {
-  if (ArangeDescriptors.empty())
-    return;
-
-  // Iterate through all arange descriptors and combine any ranges that
-  // overlap or have matching boundaries. The ArangeDescriptors are assumed
-  // to be in ascending order.
-  uint32_t i = 0;
-  while (i + 1 < ArangeDescriptors.size()) {
-    if (ArangeDescriptors[i].getEndAddress() >= ArangeDescriptors[i+1].Address){
-      // The current range ends at or exceeds the start of the next address
-      // range. Compute the max end address between the two and use that to
-      // make the new length.
-      const uint64_t max_end_addr =
-        std::max(ArangeDescriptors[i].getEndAddress(),
-                 ArangeDescriptors[i+1].getEndAddress());
-      ArangeDescriptors[i].Length = max_end_addr - ArangeDescriptors[i].Address;
-      // Now remove the next entry as it was just combined with the previous one
-      ArangeDescriptors.erase(ArangeDescriptors.begin()+i+1);
-    } else {
-      // Discontiguous address range, just proceed to the next one.
-      ++i;
-    }
-  }
-}
-
 bool
 DWARFDebugArangeSet::extract(DataExtractor data, uint32_t *offset_ptr) {
   if (data.isValidOffset(*offset_ptr)) {
@@ -126,26 +100,3 @@ void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
        << format(" 0x%*.*" PRIx64 ")\n",
                  hex_width, hex_width, pos->getEndAddress());
 }
-
-
-namespace {
-  class DescriptorContainsAddress {
-    const uint64_t Address;
-  public:
-    DescriptorContainsAddress(uint64_t address) : Address(address) {}
-    bool operator()(const DWARFDebugArangeSet::Descriptor &desc) const {
-      return Address >= desc.Address && Address < (desc.Address + desc.Length);
-    }
-  };
-}
-
-uint32_t DWARFDebugArangeSet::findAddress(uint64_t address) const {
-  DescriptorConstIter end = ArangeDescriptors.end();
-  DescriptorConstIter pos =
-    std::find_if(ArangeDescriptors.begin(), end, // Range
-                 DescriptorContainsAddress(address)); // Predicate
-  if (pos != end)
-    return HeaderData.CuOffset;
-
-  return -1U;
-}
diff --git a/lib/DebugInfo/DWARFDebugArangeSet.h b/lib/DebugInfo/DWARFDebugArangeSet.h
index d768676..49a7132 100644
--- a/lib/DebugInfo/DWARFDebugArangeSet.h
+++ b/lib/DebugInfo/DWARFDebugArangeSet.h
@@ -44,7 +44,6 @@ public:
 
 private:
   typedef std::vector<Descriptor> DescriptorColl;
-  typedef DescriptorColl::iterator DescriptorIter;
   typedef DescriptorColl::const_iterator DescriptorConstIter;
 
   uint32_t Offset;
@@ -54,15 +53,11 @@ private:
 public:
   DWARFDebugArangeSet() { clear(); }
   void clear();
-  void compact();
   bool extract(DataExtractor data, uint32_t *offset_ptr);
   void dump(raw_ostream &OS) const;
 
   uint32_t getCompileUnitDIEOffset() const { return HeaderData.CuOffset; }
-  uint32_t getOffsetOfNextEntry() const { return Offset + HeaderData.Length + 4; }
-  uint32_t findAddress(uint64_t address) const;
   uint32_t getNumDescriptors() const { return ArangeDescriptors.size(); }
-  const struct Header &getHeader() const { return HeaderData; }
   const Descriptor *getDescriptor(uint32_t i) const {
     if (i < ArangeDescriptors.size())
       return &ArangeDescriptors[i];
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
index f79862d..591d4bd 100644
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -16,128 +16,79 @@
 #include <cassert>
 using namespace llvm;
 
-// Compare function DWARFDebugAranges::Range structures
-static bool RangeLessThan(const DWARFDebugAranges::Range &range1,
-                          const DWARFDebugAranges::Range &range2) {
-  return range1.LoPC < range2.LoPC;
-}
-
-namespace {
-  class CountArangeDescriptors {
-  public:
-    CountArangeDescriptors(uint32_t &count_ref) : Count(count_ref) {}
-    void operator()(const DWARFDebugArangeSet &Set) {
-      Count += Set.getNumDescriptors();
-    }
-    uint32_t &Count;
-  };
-
-  class AddArangeDescriptors {
-  public:
-    AddArangeDescriptors(DWARFDebugAranges::RangeColl &Ranges,
-                         DWARFDebugAranges::ParsedCUOffsetColl &CUOffsets)
-      : RangeCollection(Ranges),
-        CUOffsetCollection(CUOffsets) {}
-    void operator()(const DWARFDebugArangeSet &Set) {
-      DWARFDebugAranges::Range Range;
-      Range.Offset = Set.getCompileUnitDIEOffset();
-      CUOffsetCollection.insert(Range.Offset);
-
-      for (uint32_t i = 0, n = Set.getNumDescriptors(); i < n; ++i) {
-        const DWARFDebugArangeSet::Descriptor *ArangeDescPtr =
-            Set.getDescriptor(i);
-        Range.LoPC = ArangeDescPtr->Address;
-        Range.Length = ArangeDescPtr->Length;
-
-        // Insert each item in increasing address order so binary searching
-        // can later be done!
-        DWARFDebugAranges::RangeColl::iterator InsertPos =
-          std::lower_bound(RangeCollection.begin(), RangeCollection.end(),
-                           Range, RangeLessThan);
-        RangeCollection.insert(InsertPos, Range);
-      }
-
-    }
-    DWARFDebugAranges::RangeColl &RangeCollection;
-    DWARFDebugAranges::ParsedCUOffsetColl &CUOffsetCollection;
-  };
-}
-
-bool DWARFDebugAranges::extract(DataExtractor debug_aranges_data) {
-  if (debug_aranges_data.isValidOffset(0)) {
-    uint32_t offset = 0;
-
-    typedef std::vector<DWARFDebugArangeSet> SetCollection;
-    SetCollection sets;
-
-    DWARFDebugArangeSet set;
-    Range range;
-    while (set.extract(debug_aranges_data, &offset))
-      sets.push_back(set);
-
-    uint32_t count = 0;
-
-    std::for_each(sets.begin(), sets.end(), CountArangeDescriptors(count));
-
-    if (count > 0) {
-      Aranges.reserve(count);
-      AddArangeDescriptors range_adder(Aranges, ParsedCUOffsets);
-      std::for_each(sets.begin(), sets.end(), range_adder);
-    }
+void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
+  if (!DebugArangesData.isValidOffset(0))
+    return;
+  uint32_t Offset = 0;
+  typedef std::vector<DWARFDebugArangeSet> RangeSetColl;
+  RangeSetColl Sets;
+  DWARFDebugArangeSet Set;
+  uint32_t TotalRanges = 0;
+
+  while (Set.extract(DebugArangesData, &Offset)) {
+    Sets.push_back(Set);
+    TotalRanges += Set.getNumDescriptors();
   }
-  return false;
-}
+  if (TotalRanges == 0)
+    return;
 
-bool DWARFDebugAranges::generate(DWARFContext *ctx) {
-  if (ctx) {
-    const uint32_t num_compile_units = ctx->getNumCompileUnits();
-    for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) {
-      if (DWARFCompileUnit *cu = ctx->getCompileUnitAtIndex(cu_idx)) {
-        uint32_t CUOffset = cu->getOffset();
-        if (ParsedCUOffsets.insert(CUOffset).second)
-          cu->buildAddressRangeTable(this, true);
-      }
+  Aranges.reserve(TotalRanges);
+  for (RangeSetColl::const_iterator I = Sets.begin(), E = Sets.end(); I != E;
+       ++I) {
+    uint32_t CUOffset = I->getCompileUnitDIEOffset();
+
+    for (uint32_t i = 0, n = I->getNumDescriptors(); i < n; ++i) {
+      const DWARFDebugArangeSet::Descriptor *ArangeDescPtr =
+          I->getDescriptor(i);
+      uint64_t LowPC = ArangeDescPtr->Address;
+      uint64_t HighPC = LowPC + ArangeDescPtr->Length;
+      appendRange(CUOffset, LowPC, HighPC);
     }
   }
-  sort(true, /* overlap size */ 0);
-  return !isEmpty();
 }
 
-void DWARFDebugAranges::dump(raw_ostream &OS) const {
-  const uint32_t num_ranges = getNumRanges();
-  for (uint32_t i = 0; i < num_ranges; ++i) {
-    const Range &range = Aranges[i];
-    OS << format("0x%8.8x: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
-                 range.Offset, (uint64_t)range.LoPC, (uint64_t)range.HiPC());
+void DWARFDebugAranges::generate(DWARFContext *CTX) {
+  clear();
+  if (!CTX)
+    return;
+
+  // Extract aranges from .debug_aranges section.
+  DataExtractor ArangesData(CTX->getARangeSection(), CTX->isLittleEndian(), 0);
+  extract(ArangesData);
+
+  // Generate aranges from DIEs: even if .debug_aranges section is present,
+  // it may describe only a small subset of compilation units, so we need to
+  // manually build aranges for the rest of them.
+  for (uint32_t i = 0, n = CTX->getNumCompileUnits(); i < n; ++i) {
+    if (DWARFCompileUnit *CU = CTX->getCompileUnitAtIndex(i)) {
+      uint32_t CUOffset = CU->getOffset();
+      if (ParsedCUOffsets.insert(CUOffset).second)
+        CU->buildAddressRangeTable(this, true, CUOffset);
+    }
   }
-}
 
-void DWARFDebugAranges::Range::dump(raw_ostream &OS) const {
-  OS << format("{0x%8.8x}: [0x%8.8" PRIx64 " - 0x%8.8" PRIx64 ")\n",
-               Offset, LoPC, HiPC());
+  sortAndMinimize();
 }
 
-void DWARFDebugAranges::appendRange(uint32_t offset, uint64_t low_pc,
-                                    uint64_t high_pc) {
+void DWARFDebugAranges::appendRange(uint32_t CUOffset, uint64_t LowPC,
+                                    uint64_t HighPC) {
   if (!Aranges.empty()) {
-    if (Aranges.back().Offset == offset && Aranges.back().HiPC() == low_pc) {
-      Aranges.back().setHiPC(high_pc);
+    if (Aranges.back().CUOffset == CUOffset &&
+        Aranges.back().HighPC() == LowPC) {
+      Aranges.back().setHighPC(HighPC);
       return;
     }
   }
-  Aranges.push_back(Range(low_pc, high_pc, offset));
+  Aranges.push_back(Range(LowPC, HighPC, CUOffset));
 }
 
-void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
+void DWARFDebugAranges::sortAndMinimize() {
   const size_t orig_arange_size = Aranges.size();
   // Size of one? If so, no sorting is needed
   if (orig_arange_size <= 1)
     return;
   // Sort our address range entries
-  std::stable_sort(Aranges.begin(), Aranges.end(), RangeLessThan);
-
-  if (!minimize)
-    return;
+  std::stable_sort(Aranges.begin(), Aranges.end());
 
   // Most address ranges are contiguous from function to function
   // so our new ranges will likely be smaller. We calculate the size
@@ -151,7 +102,7 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
   // copy the new minimal stuff over to the new collection.
   size_t minimal_size = 1;
   for (size_t i = 1; i < orig_arange_size; ++i) {
-    if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i], n))
+    if (!Range::SortedOverlapCheck(Aranges[i-1], Aranges[i]))
       ++minimal_size;
   }
 
@@ -166,14 +117,14 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
   uint32_t j = 0;
   minimal_aranges[j] = Aranges[0];
   for (size_t i = 1; i < orig_arange_size; ++i) {
-    if(Range::SortedOverlapCheck (minimal_aranges[j], Aranges[i], n)) {
-      minimal_aranges[j].setHiPC (Aranges[i].HiPC());
+    if (Range::SortedOverlapCheck(minimal_aranges[j], Aranges[i])) {
+      minimal_aranges[j].setHighPC(Aranges[i].HighPC());
     } else {
       // Only increment j if we aren't merging
       minimal_aranges[++j] = Aranges[i];
     }
   }
-  assert (j+1 == minimal_size);
+  assert(j+1 == minimal_size);
 
   // Now swap our new minimal aranges into place. The local
   // minimal_aranges will then contian the old big collection
@@ -181,50 +132,21 @@ void DWARFDebugAranges::sort(bool minimize, uint32_t n) {
   minimal_aranges.swap(Aranges);
 }
 
-uint32_t DWARFDebugAranges::findAddress(uint64_t address) const {
+uint32_t DWARFDebugAranges::findAddress(uint64_t Address) const {
   if (!Aranges.empty()) {
-    Range range(address);
+    Range range(Address);
     RangeCollIterator begin = Aranges.begin();
     RangeCollIterator end = Aranges.end();
-    RangeCollIterator pos = std::lower_bound(begin, end, range, RangeLessThan);
+    RangeCollIterator pos =
+        std::lower_bound(begin, end, range);
 
-    if (pos != end && pos->LoPC <= address && address < pos->HiPC()) {
-      return pos->Offset;
+    if (pos != end && pos->containsAddress(Address)) {
+      return pos->CUOffset;
     } else if (pos != begin) {
       --pos;
-      if (pos->LoPC <= address && address < pos->HiPC())
-        return (*pos).Offset;
+      if (pos->containsAddress(Address))
+        return pos->CUOffset;
     }
   }
   return -1U;
 }
-
-bool
-DWARFDebugAranges::allRangesAreContiguous(uint64_t &LoPC, uint64_t &HiPC) const{
-  if (Aranges.empty())
-    return false;
-
-  uint64_t next_addr = 0;
-  RangeCollIterator begin = Aranges.begin();
-  for (RangeCollIterator pos = begin, end = Aranges.end(); pos != end;
-       ++pos) {
-    if (pos != begin && pos->LoPC != next_addr)
-      return false;
-    next_addr = pos->HiPC();
-  }
-  // We checked for empty at the start of function so front() will be valid.
-  LoPC = Aranges.front().LoPC;
-  // We checked for empty at the start of function so back() will be valid.
-  HiPC = Aranges.back().HiPC();
-  return true;
-}
-
-bool DWARFDebugAranges::getMaxRange(uint64_t &LoPC, uint64_t &HiPC) const {
-  if (Aranges.empty())
-    return false;
-  // We checked for empty at the start of function so front() will be valid.
-  LoPC = Aranges.front().LoPC;
-  // We checked for empty at the start of function so back() will be valid.
-  HiPC = Aranges.back().HiPC();
-  return true;
-}
diff --git a/lib/DebugInfo/DWARFDebugAranges.h b/lib/DebugInfo/DWARFDebugAranges.h
index 1509ffa..35ad8e5 100644
--- a/lib/DebugInfo/DWARFDebugAranges.h
+++ b/lib/DebugInfo/DWARFDebugAranges.h
@@ -20,81 +20,61 @@ class DWARFContext;
 
 class DWARFDebugAranges {
 public:
+  void clear() {
+    Aranges.clear();
+    ParsedCUOffsets.clear();
+  }
+
+  void generate(DWARFContext *CTX);
+
+  // Use appendRange multiple times and then call sortAndMinimize.
+  void appendRange(uint32_t CUOffset, uint64_t LowPC, uint64_t HighPC);
+
+  uint32_t findAddress(uint64_t Address) const;
+
+private:
+  void extract(DataExtractor DebugArangesData);
+  void sortAndMinimize();
+
   struct Range {
-    explicit Range(uint64_t lo = -1ULL, uint64_t hi = -1ULL,
-                   uint32_t off = -1U)
-      : LoPC(lo), Length(hi-lo), Offset(off) {}
-
-    void clear() {
-      LoPC = -1ULL;
-      Length = 0;
-      Offset = -1U;
-    }
+    explicit Range(uint64_t LowPC = -1ULL, uint64_t HighPC = -1ULL,
+                   uint32_t CUOffset = -1U)
+      : LowPC(LowPC), Length(HighPC - LowPC), CUOffset(CUOffset) {}
 
-    void setHiPC(uint64_t HiPC) {
-      if (HiPC == -1ULL || HiPC <= LoPC)
+    void setHighPC(uint64_t HighPC) {
+      if (HighPC == -1ULL || HighPC <= LowPC)
         Length = 0;
       else
-        Length = HiPC - LoPC;
+        Length = HighPC - LowPC;
     }
-    uint64_t HiPC() const {
+    uint64_t HighPC() const {
       if (Length)
-        return LoPC + Length;
+        return LowPC + Length;
       return -1ULL;
     }
-    bool isValidRange() const { return Length > 0; }
+    bool containsAddress(uint64_t Address) const {
+      return LowPC <= Address && Address < HighPC();
+    }
 
-    static bool SortedOverlapCheck(const Range &curr_range,
-                                   const Range &next_range, uint32_t n) {
-      if (curr_range.Offset != next_range.Offset)
-        return false;
-      return curr_range.HiPC() + n >= next_range.LoPC;
+    bool operator <(const Range &other) const {
+      return LowPC < other.LowPC;
     }
 
-    bool contains(const Range &range) const {
-      return LoPC <= range.LoPC && range.HiPC() <= HiPC();
+    static bool SortedOverlapCheck(const Range &Left, const Range &Right) {
+      if (Left.CUOffset != Right.CUOffset)
+        return false;
+      return Left.HighPC() >= Right.LowPC;
     }
 
-    void dump(raw_ostream &OS) const;
-    uint64_t LoPC; // Start of address range
-    uint32_t Length; // End of address range (not including this address)
-    uint32_t Offset; // Offset of the compile unit or die
+    uint64_t LowPC; // Start of address range.
+    uint32_t Length; // End of address range (not including this address).
+    uint32_t CUOffset; // Offset of the compile unit or die.
   };
 
-  void clear() {
-    Aranges.clear();
-    ParsedCUOffsets.clear();
-  }
-  bool allRangesAreContiguous(uint64_t& LoPC, uint64_t& HiPC) const;
-  bool getMaxRange(uint64_t& LoPC, uint64_t& HiPC) const;
-  bool extract(DataExtractor debug_aranges_data);
-  bool generate(DWARFContext *ctx);
-
-  // Use append range multiple times and then call sort
-  void appendRange(uint32_t cu_offset, uint64_t low_pc, uint64_t high_pc);
-  void sort(bool minimize, uint32_t n);
-
-  const Range *rangeAtIndex(uint32_t idx) const {
-    if (idx < Aranges.size())
-      return &Aranges[idx];
-    return NULL;
-  }
-  void dump(raw_ostream &OS) const;
-  uint32_t findAddress(uint64_t address) const;
-  bool isEmpty() const { return Aranges.empty(); }
-  uint32_t getNumRanges() const { return Aranges.size(); }
-
-  uint32_t offsetAtIndex(uint32_t idx) const {
-    if (idx < Aranges.size())
-      return Aranges[idx].Offset;
-    return -1U;
-  }
-
   typedef std::vector<Range>              RangeColl;
   typedef RangeColl::const_iterator       RangeCollIterator;
   typedef DenseSet<uint32_t>              ParsedCUOffsetColl;
 
-private:
   RangeColl Aranges;
   ParsedCUOffsetColl ParsedCUOffsets;
 };
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index 10be7b4..babfd2e 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -19,11 +19,10 @@
 using namespace llvm;
 using namespace dwarf;
 
-void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
-                                      const DWARFCompileUnit *cu,
+void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS, const DWARFUnit *u,
                                       unsigned recurseDepth,
                                       unsigned indent) const {
-  DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+  DataExtractor debug_info_data = u->getDebugInfoExtractor();
   uint32_t offset = Offset;
 
   if (debug_info_data.isValidOffset(offset)) {
@@ -45,13 +44,13 @@ void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
         for (uint32_t i = 0; i != numAttributes; ++i) {
           uint16_t attr = AbbrevDecl->getAttrByIndex(i);
           uint16_t form = AbbrevDecl->getFormByIndex(i);
-          dumpAttribute(OS, cu, &offset, attr, form, indent);
+          dumpAttribute(OS, u, &offset, attr, form, indent);
         }
 
         const DWARFDebugInfoEntryMinimal *child = getFirstChild();
         if (recurseDepth > 0 && child) {
           while (child) {
-            child->dump(OS, cu, recurseDepth-1, indent+2);
+            child->dump(OS, u, recurseDepth-1, indent+2);
             child = child->getSibling();
           }
         }
@@ -66,12 +65,11 @@ void DWARFDebugInfoEntryMinimal::dump(raw_ostream &OS,
 }
 
 void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
-                                               const DWARFCompileUnit *cu,
-                                               uint32_t* offset_ptr,
-                                               uint16_t attr,
-                                               uint16_t form,
+                                               const DWARFUnit *u,
+                                               uint32_t *offset_ptr,
+                                               uint16_t attr, uint16_t form,
                                                unsigned indent) const {
-  OS << format("0x%8.8x: ", *offset_ptr);
+  OS << "            ";
   OS.indent(indent+2);
   const char *attrString = AttributeString(attr);
   if (attrString)
@@ -86,57 +84,20 @@ void DWARFDebugInfoEntryMinimal::dumpAttribute(raw_ostream &OS,
 
   DWARFFormValue formValue(form);
 
-  if (!formValue.extractValue(cu->getDebugInfoExtractor(), offset_ptr, cu))
+  if (!formValue.extractValue(u->getDebugInfoExtractor(), offset_ptr, u))
     return;
 
   OS << "\t(";
-  formValue.dump(OS, cu);
+  formValue.dump(OS, u);
   OS << ")\n";
 }
 
-bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *CU,
-                                             const uint8_t *FixedFormSizes,
+bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFUnit *U,
                                              uint32_t *OffsetPtr) {
   Offset = *OffsetPtr;
-  DataExtractor DebugInfoData = CU->getDebugInfoExtractor();
-  uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
-  if (0 == AbbrCode) {
-    // NULL debug tag entry.
-    AbbrevDecl = NULL;
-    return true;
-  }
-  AbbrevDecl = CU->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
-  assert(AbbrevDecl);
-  assert(FixedFormSizes); // For best performance this should be specified!
-
-  // Skip all data in the .debug_info for the attributes
-  for (uint32_t i = 0, n = AbbrevDecl->getNumAttributes(); i < n; ++i) {
-    uint16_t Form = AbbrevDecl->getFormByIndex(i);
-
-    // FIXME: Currently we're checking if this is less than the last
-    // entry in the fixed_form_sizes table, but this should be changed
-    // to use dynamic dispatch.
-    uint8_t FixedFormSize =
-        (Form < DW_FORM_ref_sig8) ? FixedFormSizes[Form] : 0;
-    if (FixedFormSize)
-      *OffsetPtr += FixedFormSize;
-    else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr,
-                                        CU)) {
-      // Restore the original offset.
-      *OffsetPtr = Offset;
-      return false;
-    }
-  }
-  return true;
-}
-
-bool
-DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *CU,
-                                    uint32_t *OffsetPtr) {
-  DataExtractor DebugInfoData = CU->getDebugInfoExtractor();
-  const uint32_t CUEndOffset = CU->getNextCompileUnitOffset();
-  Offset = *OffsetPtr;
-  if ((Offset >= CUEndOffset) || !DebugInfoData.isValidOffset(Offset))
+  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+  uint32_t UEndOffset = U->getNextUnitOffset();
+  if (Offset >= UEndOffset || !DebugInfoData.isValidOffset(Offset))
     return false;
   uint64_t AbbrCode = DebugInfoData.getULEB128(OffsetPtr);
   if (0 == AbbrCode) {
@@ -144,31 +105,25 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *CU,
     AbbrevDecl = NULL;
     return true;
   }
-  AbbrevDecl = CU->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
+  AbbrevDecl = U->getAbbreviations()->getAbbreviationDeclaration(AbbrCode);
   if (0 == AbbrevDecl) {
     // Restore the original offset.
     *OffsetPtr = Offset;
     return false;
   }
-  bool IsCompileUnitTag = (AbbrevDecl->getTag() == DW_TAG_compile_unit);
-  if (IsCompileUnitTag)
-    const_cast<DWARFCompileUnit*>(CU)->setBaseAddress(0);
+  ArrayRef<uint8_t> FixedFormSizes = DWARFFormValue::getFixedFormSizes(
+      U->getAddressByteSize(), U->getVersion());
+  assert(FixedFormSizes.size() > 0);
 
   // Skip all data in the .debug_info for the attributes
   for (uint32_t i = 0, n = AbbrevDecl->getNumAttributes(); i < n; ++i) {
-    uint16_t Attr = AbbrevDecl->getAttrByIndex(i);
     uint16_t Form = AbbrevDecl->getFormByIndex(i);
 
-    if (IsCompileUnitTag &&
-        ((Attr == DW_AT_entry_pc) || (Attr == DW_AT_low_pc))) {
-      DWARFFormValue FormValue(Form);
-      if (FormValue.extractValue(DebugInfoData, OffsetPtr, CU)) {
-        if (Attr == DW_AT_low_pc || Attr == DW_AT_entry_pc)
-          const_cast<DWARFCompileUnit*>(CU)
-            ->setBaseAddress(FormValue.getUnsigned());
-      }
-    } else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr,
-                                          CU)) {
+    uint8_t FixedFormSize =
+        (Form < FixedFormSizes.size()) ? FixedFormSizes[Form] : 0;
+    if (FixedFormSize)
+      *OffsetPtr += FixedFormSize;
+    else if (!DWARFFormValue::skipValue(Form, DebugInfoData, OffsetPtr, U)) {
       // Restore the original offset.
       *OffsetPtr = Offset;
       return false;
@@ -187,203 +142,191 @@ bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
          Tag == DW_TAG_inlined_subroutine;
 }
 
-uint32_t
-DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
-                                              const uint16_t attr,
-                                              DWARFFormValue &form_value,
-                                              uint32_t *end_attr_offset_ptr)
-                                              const {
-  if (AbbrevDecl) {
-    uint32_t attr_idx = AbbrevDecl->findAttributeIndex(attr);
-
-    if (attr_idx != -1U) {
-      uint32_t offset = getOffset();
+bool DWARFDebugInfoEntryMinimal::getAttributeValue(
+    const DWARFUnit *U, const uint16_t Attr, DWARFFormValue &FormValue) const {
+  if (!AbbrevDecl)
+    return false;
 
-      DataExtractor debug_info_data = cu->getDebugInfoExtractor();
+  uint32_t AttrIdx = AbbrevDecl->findAttributeIndex(Attr);
+  if (AttrIdx == -1U)
+    return false;
 
-      // Skip the abbreviation code so we are at the data for the attributes
-      debug_info_data.getULEB128(&offset);
+  DataExtractor DebugInfoData = U->getDebugInfoExtractor();
+  uint32_t DebugInfoOffset = getOffset();
 
-      uint32_t idx = 0;
-      while (idx < attr_idx)
-        DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(idx++),
-                                  debug_info_data, &offset, cu);
+  // Skip the abbreviation code so we are at the data for the attributes
+  DebugInfoData.getULEB128(&DebugInfoOffset);
 
-      const uint32_t attr_offset = offset;
-      form_value = DWARFFormValue(AbbrevDecl->getFormByIndex(idx));
-      if (form_value.extractValue(debug_info_data, &offset, cu)) {
-        if (end_attr_offset_ptr)
-          *end_attr_offset_ptr = offset;
-        return attr_offset;
-      }
-    }
+  // Skip preceding attribute values.
+  for (uint32_t i = 0; i < AttrIdx; ++i) {
+    DWARFFormValue::skipValue(AbbrevDecl->getFormByIndex(i),
+                              DebugInfoData, &DebugInfoOffset, U);
   }
 
-  return 0;
+  FormValue = DWARFFormValue(AbbrevDecl->getFormByIndex(AttrIdx));
+  return FormValue.extractValue(DebugInfoData, &DebugInfoOffset, U);
 }
 
-const char*
-DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
-                                                     const DWARFCompileUnit* cu,
-                                                     const uint16_t attr,
-                                                     const char* fail_value)
-                                                     const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value)) {
-    DataExtractor stringExtractor(cu->getStringSection(), false, 0);
-    return form_value.getAsCString(&stringExtractor);
-  }
-  return fail_value;
+const char *DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
+    const DWARFUnit *U, const uint16_t Attr, const char *FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<const char *> Result = FormValue.getAsCString(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
+}
+
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsAddress(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsAddress(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-uint64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsigned(
-                                                    const DWARFCompileUnit* cu,
-                                                    const uint16_t attr,
-                                                    uint64_t fail_value) const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value))
-      return form_value.getUnsigned();
-  return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsignedConstant(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsUnsignedConstant();
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-int64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsSigned(
-                                                     const DWARFCompileUnit* cu,
-                                                     const uint16_t attr,
-                                                     int64_t fail_value) const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value))
-      return form_value.getSigned();
-  return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsReference(U);
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-uint64_t
-DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
-                                                     const DWARFCompileUnit* cu,
-                                                     const uint16_t attr,
-                                                     uint64_t fail_value)
-                                                     const {
-  DWARFFormValue form_value;
-  if (getAttributeValue(cu, attr, form_value))
-      return form_value.getReference(cu);
-  return fail_value;
+uint64_t DWARFDebugInfoEntryMinimal::getAttributeValueAsSectionOffset(
+    const DWARFUnit *U, const uint16_t Attr, uint64_t FailValue) const {
+  DWARFFormValue FormValue;
+  if (!getAttributeValue(U, Attr, FormValue))
+    return FailValue;
+  Optional<uint64_t> Result = FormValue.getAsSectionOffset();
+  return Result.hasValue() ? Result.getValue() : FailValue;
 }
 
-bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFCompileUnit *CU,
+bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFUnit *U,
                                                  uint64_t &LowPC,
                                                  uint64_t &HighPC) const {
-  HighPC = -1ULL;
-  LowPC = getAttributeValueAsUnsigned(CU, DW_AT_low_pc, -1ULL);
-  if (LowPC != -1ULL)
-    HighPC = getAttributeValueAsUnsigned(CU, DW_AT_high_pc, -1ULL);
+  LowPC = getAttributeValueAsAddress(U, DW_AT_low_pc, -1ULL);
+  if (LowPC == -1ULL)
+    return false;
+  HighPC = getAttributeValueAsAddress(U, DW_AT_high_pc, -1ULL);
+  if (HighPC == -1ULL) {
+    // Since DWARF4, DW_AT_high_pc may also be of class constant, in which case
+    // it represents function size.
+    HighPC = getAttributeValueAsUnsignedConstant(U, DW_AT_high_pc, -1ULL);
+    if (HighPC != -1ULL)
+      HighPC += LowPC;
+  }
   return (HighPC != -1ULL);
 }
 
-void
-DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *CU,
-                                               DWARFDebugAranges *DebugAranges)
-                                                   const {
+void DWARFDebugInfoEntryMinimal::buildAddressRangeTable(
+    const DWARFUnit *U, DWARFDebugAranges *DebugAranges,
+    uint32_t UOffsetInAranges) const {
   if (AbbrevDecl) {
     if (isSubprogramDIE()) {
       uint64_t LowPC, HighPC;
-      if (getLowAndHighPC(CU, LowPC, HighPC)) {
-        DebugAranges->appendRange(CU->getOffset(), LowPC, HighPC);
-      }
+      if (getLowAndHighPC(U, LowPC, HighPC))
+        DebugAranges->appendRange(UOffsetInAranges, LowPC, HighPC);
       // FIXME: try to append ranges from .debug_ranges section.
     }
 
-    const DWARFDebugInfoEntryMinimal *child = getFirstChild();
-    while (child) {
-      child->buildAddressRangeTable(CU, DebugAranges);
-      child = child->getSibling();
+    const DWARFDebugInfoEntryMinimal *Child = getFirstChild();
+    while (Child) {
+      Child->buildAddressRangeTable(U, DebugAranges, UOffsetInAranges);
+      Child = Child->getSibling();
     }
   }
 }
 
-bool
-DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
-                                                     const DWARFCompileUnit *CU,
-                                                     const uint64_t Address)
-                                                     const {
+bool DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
+    const DWARFUnit *U, const uint64_t Address) const {
   if (isNULL())
     return false;
   uint64_t LowPC, HighPC;
-  if (getLowAndHighPC(CU, LowPC, HighPC))
+  if (getLowAndHighPC(U, LowPC, HighPC))
     return (LowPC <= Address && Address <= HighPC);
   // Try to get address ranges from .debug_ranges section.
-  uint32_t RangesOffset = getAttributeValueAsReference(CU, DW_AT_ranges, -1U);
+  uint32_t RangesOffset =
+      getAttributeValueAsSectionOffset(U, DW_AT_ranges, -1U);
   if (RangesOffset != -1U) {
     DWARFDebugRangeList RangeList;
-    if (CU->extractRangeList(RangesOffset, RangeList))
-      return RangeList.containsAddress(CU->getBaseAddress(), Address);
+    if (U->extractRangeList(RangesOffset, RangeList))
+      return RangeList.containsAddress(U->getBaseAddress(), Address);
   }
   return false;
 }
 
-const char*
-DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFCompileUnit *CU)
-                                                                         const {
+const char *
+DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFUnit *U) const {
   if (!isSubroutineDIE())
     return 0;
   // Try to get mangled name if possible.
   if (const char *name =
-      getAttributeValueAsString(CU, DW_AT_MIPS_linkage_name, 0))
+      getAttributeValueAsString(U, DW_AT_MIPS_linkage_name, 0))
     return name;
-  if (const char *name = getAttributeValueAsString(CU, DW_AT_linkage_name, 0))
+  if (const char *name = getAttributeValueAsString(U, DW_AT_linkage_name, 0))
     return name;
-  if (const char *name = getAttributeValueAsString(CU, DW_AT_name, 0))
+  if (const char *name = getAttributeValueAsString(U, DW_AT_name, 0))
     return name;
   // Try to get name from specification DIE.
   uint32_t spec_ref =
-      getAttributeValueAsReference(CU, DW_AT_specification, -1U);
+      getAttributeValueAsReference(U, DW_AT_specification, -1U);
   if (spec_ref != -1U) {
     DWARFDebugInfoEntryMinimal spec_die;
-    if (spec_die.extract(CU, &spec_ref)) {
-      if (const char *name = spec_die.getSubroutineName(CU))
+    if (spec_die.extractFast(U, &spec_ref)) {
+      if (const char *name = spec_die.getSubroutineName(U))
         return name;
     }
   }
   // Try to get name from abstract origin DIE.
   uint32_t abs_origin_ref =
-      getAttributeValueAsReference(CU, DW_AT_abstract_origin, -1U);
+      getAttributeValueAsReference(U, DW_AT_abstract_origin, -1U);
   if (abs_origin_ref != -1U) {
     DWARFDebugInfoEntryMinimal abs_origin_die;
-    if (abs_origin_die.extract(CU, &abs_origin_ref)) {
-      if (const char *name = abs_origin_die.getSubroutineName(CU))
+    if (abs_origin_die.extractFast(U, &abs_origin_ref)) {
+      if (const char *name = abs_origin_die.getSubroutineName(U))
         return name;
     }
   }
   return 0;
 }
 
-void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFCompileUnit *CU,
+void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFUnit *U,
                                                 uint32_t &CallFile,
                                                 uint32_t &CallLine,
                                                 uint32_t &CallColumn) const {
-  CallFile = getAttributeValueAsUnsigned(CU, DW_AT_call_file, 0);
-  CallLine = getAttributeValueAsUnsigned(CU, DW_AT_call_line, 0);
-  CallColumn = getAttributeValueAsUnsigned(CU, DW_AT_call_column, 0);
+  CallFile = getAttributeValueAsUnsignedConstant(U, DW_AT_call_file, 0);
+  CallLine = getAttributeValueAsUnsignedConstant(U, DW_AT_call_line, 0);
+  CallColumn = getAttributeValueAsUnsignedConstant(U, DW_AT_call_column, 0);
 }
 
-DWARFDebugInfoEntryMinimal::InlinedChain
+DWARFDebugInfoEntryInlinedChain
 DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
-                                                     const DWARFCompileUnit *CU,
-                                                     const uint64_t Address)
-                                                     const {
-  DWARFDebugInfoEntryMinimal::InlinedChain InlinedChain;
+    const DWARFUnit *U, const uint64_t Address) const {
+  DWARFDebugInfoEntryInlinedChain InlinedChain;
+  InlinedChain.U = U;
   if (isNULL())
     return InlinedChain;
   for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
     // Append current DIE to inlined chain only if it has correct tag
     // (e.g. it is not a lexical block).
     if (DIE->isSubroutineDIE()) {
-      InlinedChain.push_back(*DIE);
+      InlinedChain.DIEs.push_back(*DIE);
     }
     // Try to get child which also contains provided address.
     const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
     while (Child) {
-      if (Child->addressRangeContainsAddress(CU, Address)) {
+      if (Child->addressRangeContainsAddress(U, Address)) {
         // Assume there is only one such child.
         break;
       }
@@ -392,6 +335,6 @@ DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
     DIE = Child;
   }
   // Reverse the obtained chain to make the root of inlined chain last.
-  std::reverse(InlinedChain.begin(), InlinedChain.end());
+  std::reverse(InlinedChain.DIEs.begin(), InlinedChain.DIEs.end());
   return InlinedChain;
 }
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index 9003591..aa61056 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -18,9 +18,10 @@ namespace llvm {
 
 class DWARFDebugAranges;
 class DWARFCompileUnit;
+class DWARFUnit;
 class DWARFContext;
 class DWARFFormValue;
-class DWARFInlinedSubroutineChain;
+struct DWARFDebugInfoEntryInlinedChain;
 
 /// DWARFDebugInfoEntryMinimal - A DIE with only the minimum required data.
 class DWARFDebugInfoEntryMinimal {
@@ -39,23 +40,15 @@ public:
   DWARFDebugInfoEntryMinimal()
     : Offset(0), ParentIdx(0), SiblingIdx(0), AbbrevDecl(0) {}
 
-  void dump(raw_ostream &OS, const DWARFCompileUnit *cu,
-            unsigned recurseDepth, unsigned indent = 0) const;
-  void dumpAttribute(raw_ostream &OS, const DWARFCompileUnit *cu,
-                     uint32_t *offset_ptr, uint16_t attr, uint16_t form,
-                     unsigned indent = 0) const;
+  void dump(raw_ostream &OS, const DWARFUnit *u, unsigned recurseDepth,
+            unsigned indent = 0) const;
+  void dumpAttribute(raw_ostream &OS, const DWARFUnit *u, uint32_t *offset_ptr,
+                     uint16_t attr, uint16_t form, unsigned indent = 0) const;
 
-  /// Extracts a debug info entry, which is a child of a given compile unit,
+  /// Extracts a debug info entry, which is a child of a given unit,
   /// starting at a given offset. If DIE can't be extracted, returns false and
   /// doesn't change OffsetPtr.
-  bool extractFast(const DWARFCompileUnit *CU, const uint8_t *FixedFormSizes,
-                   uint32_t *OffsetPtr);
-
-  /// Extract a debug info entry for a given compile unit from the
-  /// .debug_info and .debug_abbrev data starting at the given offset.
-  /// If compile unit can't be parsed, returns false and doesn't change
-  /// OffsetPtr.
-  bool extract(const DWARFCompileUnit *CU, uint32_t *OffsetPtr);
+  bool extractFast(const DWARFUnit *U, uint32_t *OffsetPtr);
 
   uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
   bool isNULL() const { return AbbrevDecl == 0; }
@@ -120,60 +113,65 @@ public:
     return AbbrevDecl;
   }
 
-  uint32_t getAttributeValue(const DWARFCompileUnit *cu,
-                             const uint16_t attr, DWARFFormValue &formValue,
-                             uint32_t *end_attr_offset_ptr = 0) const;
+  bool getAttributeValue(const DWARFUnit *U, const uint16_t Attr,
+                         DWARFFormValue &FormValue) const;
+
+  const char *getAttributeValueAsString(const DWARFUnit *U, const uint16_t Attr,
+                                        const char *FailValue) const;
 
-  const char* getAttributeValueAsString(const DWARFCompileUnit* cu,
-                                        const uint16_t attr,
-                                        const char *fail_value) const;
+  uint64_t getAttributeValueAsAddress(const DWARFUnit *U, const uint16_t Attr,
+                                      uint64_t FailValue) const;
 
-  uint64_t getAttributeValueAsUnsigned(const DWARFCompileUnit *cu,
-                                       const uint16_t attr,
-                                       uint64_t fail_value) const;
+  uint64_t getAttributeValueAsUnsignedConstant(const DWARFUnit *U,
+                                               const uint16_t Attr,
+                                               uint64_t FailValue) const;
 
-  uint64_t getAttributeValueAsReference(const DWARFCompileUnit *cu,
-                                        const uint16_t attr,
-                                        uint64_t fail_value) const;
+  uint64_t getAttributeValueAsReference(const DWARFUnit *U, const uint16_t Attr,
+                                        uint64_t FailValue) const;
 
-  int64_t getAttributeValueAsSigned(const DWARFCompileUnit* cu,
-                                    const uint16_t attr,
-                                    int64_t fail_value) const;
+  uint64_t getAttributeValueAsSectionOffset(const DWARFUnit *U,
+                                            const uint16_t Attr,
+                                            uint64_t FailValue) const;
 
   /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
   /// Returns true if both attributes are present.
-  bool getLowAndHighPC(const DWARFCompileUnit *CU,
-                       uint64_t &LowPC, uint64_t &HighPC) const;
+  bool getLowAndHighPC(const DWARFUnit *U, uint64_t &LowPC,
+                       uint64_t &HighPC) const;
 
-  void buildAddressRangeTable(const DWARFCompileUnit *CU,
-                              DWARFDebugAranges *DebugAranges) const;
+  void buildAddressRangeTable(const DWARFUnit *U,
+                              DWARFDebugAranges *DebugAranges,
+                              uint32_t CUOffsetInAranges) const;
 
-  bool addressRangeContainsAddress(const DWARFCompileUnit *CU,
+  bool addressRangeContainsAddress(const DWARFUnit *U,
                                    const uint64_t Address) const;
 
   /// If a DIE represents a subprogram (or inlined subroutine),
   /// returns its mangled name (or short name, if mangled is missing).
   /// This name may be fetched from specification or abstract origin
   /// for this subprogram. Returns null if no name is found.
-  const char* getSubroutineName(const DWARFCompileUnit *CU) const;
+  const char *getSubroutineName(const DWARFUnit *U) const;
 
   /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
   /// DW_AT_call_column from DIE (or zeroes if they are missing).
-  void getCallerFrame(const DWARFCompileUnit *CU, uint32_t &CallFile,
+  void getCallerFrame(const DWARFUnit *U, uint32_t &CallFile,
                       uint32_t &CallLine, uint32_t &CallColumn) const;
 
-  /// InlinedChain - represents a chain of inlined_subroutine
-  /// DIEs, (possibly ending with subprogram DIE), all of which are contained
-  /// in some concrete inlined instance tree. Address range for each DIE
-  /// (except the last DIE) in this chain is contained in address
-  /// range for next DIE in the chain.
-  typedef SmallVector<DWARFDebugInfoEntryMinimal, 4> InlinedChain;
-
   /// Get inlined chain for a given address, rooted at the current DIE.
   /// Returns empty chain if address is not contained in address range
   /// of current DIE.
-  InlinedChain getInlinedChainForAddress(const DWARFCompileUnit *CU,
-                                         const uint64_t Address) const;
+  DWARFDebugInfoEntryInlinedChain
+  getInlinedChainForAddress(const DWARFUnit *U, const uint64_t Address) const;
+};
+
+/// DWARFDebugInfoEntryInlinedChain - represents a chain of inlined_subroutine
+/// DIEs, (possibly ending with subprogram DIE), all of which are contained
+/// in some concrete inlined instance tree. Address range for each DIE
+/// (except the last DIE) in this chain is contained in address
+/// range for next DIE in the chain.
+struct DWARFDebugInfoEntryInlinedChain {
+  DWARFDebugInfoEntryInlinedChain() : U(0) {}
+  SmallVector<DWARFDebugInfoEntryMinimal, 4> DIEs;
+  const DWARFUnit *U;
 };
 
 }
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index 192381c..13d09dd 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -211,7 +211,7 @@ DWARFDebugLine::parsePrologue(DataExtractor debug_line_data,
 
   if (*offset_ptr != end_prologue_offset) {
     fprintf(stderr, "warning: parsing line table prologue at 0x%8.8x should"
-                    " have ended at 0x%8.8x but it ended ad 0x%8.8x\n",
+                    " have ended at 0x%8.8x but it ended at 0x%8.8x\n",
             prologue_offset, end_prologue_offset, *offset_ptr);
     return false;
   }
diff --git a/lib/DebugInfo/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARFDebugLoc.cpp
new file mode 100644
index 0000000..3895ffa
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugLoc.cpp
@@ -0,0 +1,74 @@
+//===-- DWARFDebugLoc.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugLoc.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFDebugLoc::dump(raw_ostream &OS) const {
+  for (LocationLists::const_iterator I = Locations.begin(), E = Locations.end(); I != E; ++I) {
+    OS << format("0x%8.8x: ", I->Offset);
+    const unsigned Indent = 12;
+    for (SmallVectorImpl<Entry>::const_iterator I2 = I->Entries.begin(), E2 = I->Entries.end(); I2 != E2; ++I2) {
+      if (I2 != I->Entries.begin())
+        OS.indent(Indent);
+      OS << "Beginning address offset: " << format("0x%016" PRIx64, I2->Begin)
+         << '\n';
+      OS.indent(Indent) << "   Ending address offset: "
+                        << format("0x%016" PRIx64, I2->End) << '\n';
+      OS.indent(Indent) << "    Location description: ";
+      for (SmallVectorImpl<unsigned char>::const_iterator I3 = I2->Loc.begin(), E3 = I2->Loc.end(); I3 != E3; ++I3) {
+        OS << format("%2.2x ", *I3);
+      }
+      OS << "\n\n";
+    }
+  }
+}
+
+void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) {
+  uint32_t Offset = 0;
+  while (data.isValidOffset(Offset)) {
+    Locations.resize(Locations.size() + 1);
+    LocationList &Loc = Locations.back();
+    Loc.Offset = Offset;
+    // 2.6.2 Location Lists
+    // A location list entry consists of:
+    while (true) {
+      Entry E;
+      RelocAddrMap::const_iterator AI = RelocMap.find(Offset);
+      // 1. A beginning address offset. ...
+      E.Begin = data.getUnsigned(&Offset, AddressSize);
+      if (AI != RelocMap.end())
+        E.Begin += AI->second.second;
+
+      AI = RelocMap.find(Offset);
+      // 2. An ending address offset. ...
+      E.End = data.getUnsigned(&Offset, AddressSize);
+      if (AI != RelocMap.end())
+        E.End += AI->second.second;
+
+      // The end of any given location list is marked by an end of list entry,
+      // which consists of a 0 for the beginning address offset and a 0 for the
+      // ending address offset.
+      if (E.Begin == 0 && E.End == 0)
+        break;
+
+      unsigned Bytes = data.getU16(&Offset);
+      // A single location description describing the location of the object...
+      StringRef str = data.getData().substr(Offset, Bytes);
+      Offset += Bytes;
+      E.Loc.reserve(str.size());
+      std::copy(str.begin(), str.end(), std::back_inserter(E.Loc));
+      Loc.Entries.push_back(llvm_move(E));
+    }
+  }
+}
diff --git a/lib/DebugInfo/DWARFDebugLoc.h b/lib/DebugInfo/DWARFDebugLoc.h
new file mode 100644
index 0000000..d31aaaa
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugLoc.h
@@ -0,0 +1,60 @@
+//===-- DWARFDebugLoc.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGLOC_H
+#define LLVM_DEBUGINFO_DWARFDEBUGLOC_H
+
+#include "DWARFRelocMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataExtractor.h"
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugLoc {
+  /// A single location within a location list.
+  struct Entry {
+    /// The beginning address of the instruction range.
+    uint64_t Begin;
+    /// The ending address of the instruction range.
+    uint64_t End;
+    /// The location of the variable within the specified range.
+    SmallVector<unsigned char, 4> Loc;
+  };
+
+  /// A list of locations that contain one variable.
+  struct LocationList {
+    /// The beginning offset where this location list is stored in the debug_loc
+    /// section.
+    unsigned Offset;
+    /// All the locations in which the variable is stored.
+    SmallVector<Entry, 2> Entries;
+  };
+
+  typedef SmallVector<LocationList, 4> LocationLists;
+
+  /// A list of all the variables in the debug_loc section, each one describing
+  /// the locations in which the variable is stored.
+  LocationLists Locations;
+
+  /// A map used to resolve binary relocations.
+  const RelocAddrMap &RelocMap;
+
+public:
+  DWARFDebugLoc(const RelocAddrMap &LocRelocMap) : RelocMap(LocRelocMap) {}
+  /// Print the location lists found within the debug_loc section.
+  void dump(raw_ostream &OS) const;
+  /// Parse the debug_loc section accessible via the 'data' parameter using the
+  /// specified address size to interpret the address ranges.
+  void parse(DataExtractor data, unsigned AddressSize);
+};
+}
+
+#endif
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index c5583f9..da71fb3 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -10,6 +10,8 @@
 #include "llvm/DebugInfo/DWARFFormValue.h"
 #include "DWARFCompileUnit.h"
 #include "DWARFContext.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
@@ -19,64 +21,114 @@ using namespace llvm;
 using namespace dwarf;
 
 namespace {
-template <uint8_t AddrSize, uint8_t RefAddrSize> struct FixedFormSizes {
-  static const uint8_t sizes[];
-};
+uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
+  // FIXME: Support DWARF64.
+  return (Version == 2) ? AddrSize : 4;
 }
 
 template <uint8_t AddrSize, uint8_t RefAddrSize>
-const uint8_t FixedFormSizes<AddrSize, RefAddrSize>::sizes[] = {
-  0, // 0x00 unused
-  AddrSize, // 0x01 DW_FORM_addr
-  0, // 0x02 unused
-  0, // 0x03 DW_FORM_block2
-  0, // 0x04 DW_FORM_block4
-  2, // 0x05 DW_FORM_data2
-  4, // 0x06 DW_FORM_data4
-  8, // 0x07 DW_FORM_data8
-  0, // 0x08 DW_FORM_string
-  0, // 0x09 DW_FORM_block
-  0, // 0x0a DW_FORM_block1
-  1, // 0x0b DW_FORM_data1
-  1, // 0x0c DW_FORM_flag
-  0, // 0x0d DW_FORM_sdata
-  4, // 0x0e DW_FORM_strp
-  0, // 0x0f DW_FORM_udata
-  RefAddrSize, // 0x10 DW_FORM_ref_addr
-  1, // 0x11 DW_FORM_ref1
-  2, // 0x12 DW_FORM_ref2
-  4, // 0x13 DW_FORM_ref4
-  8, // 0x14 DW_FORM_ref8
-  0, // 0x15 DW_FORM_ref_udata
-  0, // 0x16 DW_FORM_indirect
-  4, // 0x17 DW_FORM_sec_offset
-  0, // 0x18 DW_FORM_exprloc
-  0, // 0x19 DW_FORM_flag_present
-  8, // 0x20 DW_FORM_ref_sig8
-};
-
-static uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
-  // FIXME: Support DWARF64.
-  return (Version == 2) ? AddrSize : 4;
+ArrayRef<uint8_t> makeFixedFormSizesArrayRef() {
+  static const uint8_t sizes[] = {
+    0,           // 0x00 unused
+    AddrSize,    // 0x01 DW_FORM_addr
+    0,           // 0x02 unused
+    0,           // 0x03 DW_FORM_block2
+    0,           // 0x04 DW_FORM_block4
+    2,           // 0x05 DW_FORM_data2
+    4,           // 0x06 DW_FORM_data4
+    8,           // 0x07 DW_FORM_data8
+    0,           // 0x08 DW_FORM_string
+    0,           // 0x09 DW_FORM_block
+    0,           // 0x0a DW_FORM_block1
+    1,           // 0x0b DW_FORM_data1
+    1,           // 0x0c DW_FORM_flag
+    0,           // 0x0d DW_FORM_sdata
+    4,           // 0x0e DW_FORM_strp
+    0,           // 0x0f DW_FORM_udata
+    RefAddrSize, // 0x10 DW_FORM_ref_addr
+    1,           // 0x11 DW_FORM_ref1
+    2,           // 0x12 DW_FORM_ref2
+    4,           // 0x13 DW_FORM_ref4
+    8,           // 0x14 DW_FORM_ref8
+    0,           // 0x15 DW_FORM_ref_udata
+    0,           // 0x16 DW_FORM_indirect
+    4,           // 0x17 DW_FORM_sec_offset
+    0,           // 0x18 DW_FORM_exprloc
+    0,           // 0x19 DW_FORM_flag_present
+  };
+  return makeArrayRef(sizes);
+}
 }
 
-const uint8_t *
-DWARFFormValue::getFixedFormSizes(uint8_t AddrSize, uint16_t Version) {
+ArrayRef<uint8_t> DWARFFormValue::getFixedFormSizes(uint8_t AddrSize,
+                                                    uint16_t Version) {
   uint8_t RefAddrSize = getRefAddrSize(AddrSize, Version);
   if (AddrSize == 4 && RefAddrSize == 4)
-    return FixedFormSizes<4, 4>::sizes;
+    return makeFixedFormSizesArrayRef<4, 4>();
   if (AddrSize == 4 && RefAddrSize == 8)
-    return FixedFormSizes<4, 8>::sizes;
+    return makeFixedFormSizesArrayRef<4, 8>();
   if (AddrSize == 8 && RefAddrSize == 4)
-    return FixedFormSizes<8, 4>::sizes;
+    return makeFixedFormSizesArrayRef<8, 4>();
   if (AddrSize == 8 && RefAddrSize == 8)
-    return FixedFormSizes<8, 8>::sizes;
-  return 0;
+    return makeFixedFormSizesArrayRef<8, 8>();
+  return None;
 }
 
-bool
-DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
-                             const DWARFCompileUnit *cu) {
+static const DWARFFormValue::FormClass DWARF4FormClasses[] = {
+  DWARFFormValue::FC_Unknown,       // 0x0
+  DWARFFormValue::FC_Address,       // 0x01 DW_FORM_addr
+  DWARFFormValue::FC_Unknown,       // 0x02 unused
+  DWARFFormValue::FC_Block,         // 0x03 DW_FORM_block2
+  DWARFFormValue::FC_Block,         // 0x04 DW_FORM_block4
+  DWARFFormValue::FC_Constant,      // 0x05 DW_FORM_data2
+  // --- These can be FC_SectionOffset in DWARF3 and below:
+  DWARFFormValue::FC_Constant,      // 0x06 DW_FORM_data4
+  DWARFFormValue::FC_Constant,      // 0x07 DW_FORM_data8
+  // ---
+  DWARFFormValue::FC_String,        // 0x08 DW_FORM_string
+  DWARFFormValue::FC_Block,         // 0x09 DW_FORM_block
+  DWARFFormValue::FC_Block,         // 0x0a DW_FORM_block1
+  DWARFFormValue::FC_Constant,      // 0x0b DW_FORM_data1
+  DWARFFormValue::FC_Flag,          // 0x0c DW_FORM_flag
+  DWARFFormValue::FC_Constant,      // 0x0d DW_FORM_sdata
+  DWARFFormValue::FC_String,        // 0x0e DW_FORM_strp
+  DWARFFormValue::FC_Constant,      // 0x0f DW_FORM_udata
+  DWARFFormValue::FC_Reference,     // 0x10 DW_FORM_ref_addr
+  DWARFFormValue::FC_Reference,     // 0x11 DW_FORM_ref1
+  DWARFFormValue::FC_Reference,     // 0x12 DW_FORM_ref2
+  DWARFFormValue::FC_Reference,     // 0x13 DW_FORM_ref4
+  DWARFFormValue::FC_Reference,     // 0x14 DW_FORM_ref8
+  DWARFFormValue::FC_Reference,     // 0x15 DW_FORM_ref_udata
+  DWARFFormValue::FC_Indirect,      // 0x16 DW_FORM_indirect
+  DWARFFormValue::FC_SectionOffset, // 0x17 DW_FORM_sec_offset
+  DWARFFormValue::FC_Exprloc,       // 0x18 DW_FORM_exprloc
+  DWARFFormValue::FC_Flag,          // 0x19 DW_FORM_flag_present
+};
+
+bool DWARFFormValue::isFormClass(DWARFFormValue::FormClass FC) const {
+  // First, check DWARF4 form classes.
+  if (Form < ArrayRef<FormClass>(DWARF4FormClasses).size() &&
+      DWARF4FormClasses[Form] == FC)
+    return true;
+  // Check DW_FORM_ref_sig8 from DWARF4.
+  if (Form == DW_FORM_ref_sig8)
+    return (FC == FC_Reference);
+  // Check for some DWARF5 forms.
+  if (Form == DW_FORM_GNU_addr_index)
+    return (FC == FC_Address);
+  if (Form == DW_FORM_GNU_str_index)
+    return (FC == FC_String);
+  // In DWARF3 DW_FORM_data4 and DW_FORM_data8 served also as a section offset.
+  // Don't check for DWARF version here, as some producers may still do this
+  // by mistake.
+  if ((Form == DW_FORM_data4 || Form == DW_FORM_data8) &&
+      FC == FC_SectionOffset)
+    return true;
+  return false;
+}
+
+bool DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
+                                  const DWARFUnit *cu) {
   bool indirect = false;
   bool is_block = false;
   Value.data = NULL;
@@ -126,9 +178,13 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
       Value.uval = data.getU16(offset_ptr);
       break;
     case DW_FORM_data4:
-    case DW_FORM_ref4:
+    case DW_FORM_ref4: {
+      RelocAddrMap::const_iterator AI = cu->getRelocMap()->find(*offset_ptr);
       Value.uval = data.getU32(offset_ptr);
+      if (AI != cu->getRelocMap()->end())
+        Value.uval += AI->second.second;
       break;
+    }
     case DW_FORM_data8:
     case DW_FORM_ref8:
       Value.uval = data.getU64(offset_ptr);
@@ -152,10 +208,6 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
       break;
     case DW_FORM_string:
       Value.cstr = data.getCStr(offset_ptr);
-      // Set the string value to also be the data for inlined cstr form
-      // values only so we can tell the differnence between DW_FORM_string
-      // and DW_FORM_strp form values
-      Value.data = (const uint8_t*)Value.cstr;
       break;
     case DW_FORM_indirect:
       Form = data.getULEB128(offset_ptr);
@@ -179,8 +231,6 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
       Value.uval = data.getU64(offset_ptr);
       break;
     case DW_FORM_GNU_addr_index:
-      Value.uval = data.getULEB128(offset_ptr);
-      break;
     case DW_FORM_GNU_str_index:
       Value.uval = data.getULEB128(offset_ptr);
       break;
@@ -203,13 +253,13 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
 
 bool
 DWARFFormValue::skipValue(DataExtractor debug_info_data, uint32_t* offset_ptr,
-                          const DWARFCompileUnit *cu) const {
+                          const DWARFUnit *cu) const {
   return DWARFFormValue::skipValue(Form, debug_info_data, offset_ptr, cu);
 }
 
 bool
 DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
-                          uint32_t *offset_ptr, const DWARFCompileUnit *cu) {
+                          uint32_t *offset_ptr, const DWARFUnit *cu) {
   bool indirect = false;
   do {
     switch (form) {
@@ -309,21 +359,20 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
 }
 
 void
-DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
+DWARFFormValue::dump(raw_ostream &OS, const DWARFUnit *cu) const {
   DataExtractor debug_str_data(cu->getStringSection(), true, 0);
   DataExtractor debug_str_offset_data(cu->getStringOffsetSection(), true, 0);
-  uint64_t uvalue = getUnsigned();
+  uint64_t uvalue = Value.uval;
   bool cu_relative_offset = false;
 
   switch (Form) {
   case DW_FORM_addr:      OS << format("0x%016" PRIx64, uvalue); break;
   case DW_FORM_GNU_addr_index: {
-    StringRef AddrOffsetSec = cu->getAddrOffsetSection();
     OS << format(" indexed (%8.8x) address = ", (uint32_t)uvalue);
-    if (AddrOffsetSec.size() != 0) {
-      DataExtractor DA(AddrOffsetSec, true, cu->getAddressByteSize());
-      OS << format("0x%016" PRIx64, getIndirectAddress(&DA, cu));
-    } else
+    uint64_t Address;
+    if (cu->getAddrOffsetSectionItem(uvalue, Address))
+      OS << format("0x%016" PRIx64, Address);
+    else
       OS << "<no .debug_addr section>";
     break;
   }
@@ -336,7 +385,7 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
   case DW_FORM_data8:     OS << format("0x%016" PRIx64, uvalue); break;
   case DW_FORM_string:
     OS << '"';
-    OS.write_escaped(getAsCString(NULL));
+    OS.write_escaped(Value.cstr);
     OS << '"';
     break;
   case DW_FORM_exprloc:
@@ -368,25 +417,24 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
     }
     break;
 
-  case DW_FORM_sdata:     OS << getSigned();   break;
-  case DW_FORM_udata:     OS << getUnsigned(); break;
+  case DW_FORM_sdata:     OS << Value.sval; break;
+  case DW_FORM_udata:     OS << Value.uval; break;
   case DW_FORM_strp: {
     OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
-    const char* dbg_str = getAsCString(&debug_str_data);
-    if (dbg_str) {
+    Optional<const char *> DbgStr = getAsCString(cu);
+    if (DbgStr.hasValue()) {
       OS << '"';
-      OS.write_escaped(dbg_str);
+      OS.write_escaped(DbgStr.getValue());
       OS << '"';
     }
     break;
   }
   case DW_FORM_GNU_str_index: {
     OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
-    const char *dbg_str = getIndirectCString(&debug_str_data,
-                                             &debug_str_offset_data);
-    if (dbg_str) {
+    Optional<const char *> DbgStr = getAsCString(cu);
+    if (DbgStr.hasValue()) {
       OS << '"';
-      OS.write_escaped(dbg_str);
+      OS.write_escaped(DbgStr.getValue());
       OS << '"';
     }
     break;
@@ -435,97 +483,67 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
     OS << format(" => {0x%8.8" PRIx64 "}", uvalue + (cu ? cu->getOffset() : 0));
 }
 
-const char*
-DWARFFormValue::getAsCString(const DataExtractor *debug_str_data_ptr) const {
-  if (isInlinedCStr()) {
+Optional<const char *> DWARFFormValue::getAsCString(const DWARFUnit *U) const {
+  if (!isFormClass(FC_String))
+    return None;
+  if (Form == DW_FORM_string)
     return Value.cstr;
-  } else if (debug_str_data_ptr) {
-    uint32_t offset = Value.uval;
-    return debug_str_data_ptr->getCStr(&offset);
+  if (U == 0)
+    return None;
+  uint32_t Offset = Value.uval;
+  if (Form == DW_FORM_GNU_str_index) {
+    uint32_t StrOffset;
+    if (!U->getStringOffsetSectionItem(Offset, StrOffset))
+      return None;
+    Offset = StrOffset;
   }
-  return NULL;
-}
-
-const char*
-DWARFFormValue::getIndirectCString(const DataExtractor *DS,
-                                   const DataExtractor *DSO) const {
-  if (!DS || !DSO) return NULL;
-
-  uint32_t offset = Value.uval * 4;
-  uint32_t soffset = DSO->getU32(&offset);
-  return DS->getCStr(&soffset);
-}
-
-uint64_t
-DWARFFormValue::getIndirectAddress(const DataExtractor *DA,
-                                   const DWARFCompileUnit *cu) const {
-  if (!DA) return 0;
-
-  uint32_t offset = Value.uval * cu->getAddressByteSize();
-  return DA->getAddress(&offset);
+  if (const char *Str = U->getStringExtractor().getCStr(&Offset)) {
+    return Str;
+  }
+  return None;
 }
 
-uint64_t DWARFFormValue::getReference(const DWARFCompileUnit *cu) const {
-  uint64_t die_offset = Value.uval;
-  switch (Form) {
-  case DW_FORM_ref1:
-  case DW_FORM_ref2:
-  case DW_FORM_ref4:
-  case DW_FORM_ref8:
-  case DW_FORM_ref_udata:
-      die_offset += (cu ? cu->getOffset() : 0);
-      break;
-  default:
-      break;
+Optional<uint64_t> DWARFFormValue::getAsAddress(const DWARFUnit *U) const {
+  if (!isFormClass(FC_Address))
+    return None;
+  if (Form == DW_FORM_GNU_addr_index) {
+    uint32_t Index = Value.uval;
+    uint64_t Result;
+    if (U == 0 || !U->getAddrOffsetSectionItem(Index, Result))
+      return None;
+    return Result;
   }
-
-  return die_offset;
+  return Value.uval;
 }
 
-bool
-DWARFFormValue::resolveCompileUnitReferences(const DWARFCompileUnit *cu) {
+Optional<uint64_t> DWARFFormValue::getAsReference(const DWARFUnit *U) const {
+  if (!isFormClass(FC_Reference))
+    return None;
   switch (Form) {
   case DW_FORM_ref1:
   case DW_FORM_ref2:
   case DW_FORM_ref4:
   case DW_FORM_ref8:
   case DW_FORM_ref_udata:
-    Value.uval += cu->getOffset();
-    Form = DW_FORM_ref_addr;
-    return true;
+    if (U == 0)
+      return None;
+    return Value.uval + U->getOffset();
+  case DW_FORM_ref_addr:
+    return Value.uval;
+  // FIXME: Add proper support for DW_FORM_ref_sig8
   default:
-    break;
+    return Value.uval;
   }
-  return false;
-}
-
-const uint8_t *DWARFFormValue::BlockData() const {
-  if (!isInlinedCStr())
-    return Value.data;
-  return NULL;
 }
 
-bool DWARFFormValue::isBlockForm(uint16_t form) {
-  switch (form) {
-  case DW_FORM_exprloc:
-  case DW_FORM_block:
-  case DW_FORM_block1:
-  case DW_FORM_block2:
-  case DW_FORM_block4:
-    return true;
-  }
-  return false;
+Optional<uint64_t> DWARFFormValue::getAsSectionOffset() const {
+  if (!isFormClass(FC_SectionOffset))
+    return None;
+  return Value.uval;
 }
 
-bool DWARFFormValue::isDataForm(uint16_t form) {
-  switch (form) {
-  case DW_FORM_sdata:
-  case DW_FORM_udata:
-  case DW_FORM_data1:
-  case DW_FORM_data2:
-  case DW_FORM_data4:
-  case DW_FORM_data8:
-    return true;
-  }
-  return false;
+Optional<uint64_t> DWARFFormValue::getAsUnsignedConstant() const {
+  if (!isFormClass(FC_Constant) || Form == DW_FORM_sdata)
+    return None;
+  return Value.uval;
 }
diff --git a/lib/DebugInfo/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARFTypeUnit.cpp
new file mode 100644
index 0000000..303bf70
--- /dev/null
+++ b/lib/DebugInfo/DWARFTypeUnit.cpp
@@ -0,0 +1,39 @@
+//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFTypeUnit.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+bool DWARFTypeUnit::extractImpl(DataExtractor debug_info,
+                                uint32_t *offset_ptr) {
+  if (!DWARFUnit::extractImpl(debug_info, offset_ptr))
+    return false;
+  TypeHash = debug_info.getU64(offset_ptr);
+  TypeOffset = debug_info.getU32(offset_ptr);
+  return TypeOffset < getLength();
+}
+
+void DWARFTypeUnit::dump(raw_ostream &OS) {
+  OS << format("0x%08x", getOffset()) << ": Type Unit:"
+     << " length = " << format("0x%08x", getLength())
+     << " version = " << format("0x%04x", getVersion())
+     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " addr_size = " << format("0x%02x", getAddressByteSize())
+     << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+     << " type_offset = " << format("0x%04x", TypeOffset)
+     << " (next unit at " << format("0x%08x", getNextUnitOffset())
+     << ")\n";
+
+  const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+  assert(CU && "Null Compile Unit?");
+  CU->dump(OS, this, -1U);
+}
diff --git a/lib/DebugInfo/DWARFTypeUnit.h b/lib/DebugInfo/DWARFTypeUnit.h
new file mode 100644
index 0000000..7a0dab2
--- /dev/null
+++ b/lib/DebugInfo/DWARFTypeUnit.h
@@ -0,0 +1,35 @@
+//===-- DWARFTypeUnit.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFTYPEUNIT_H
+#define LLVM_DEBUGINFO_DWARFTYPEUNIT_H
+
+#include "DWARFUnit.h"
+
+namespace llvm {
+
+class DWARFTypeUnit : public DWARFUnit {
+private:
+  uint64_t TypeHash;
+  uint32_t TypeOffset;
+public:
+  DWARFTypeUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+                StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+                const RelocAddrMap *M, bool LE)
+      : DWARFUnit(DA, IS, AS, RS, SS, SOS, AOS, M, LE) {}
+  uint32_t getSize() const LLVM_OVERRIDE { return DWARFUnit::getSize() + 12; }
+  void dump(raw_ostream &OS);
+protected:
+  bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) LLVM_OVERRIDE;
+};
+
+}
+
+#endif
+
diff --git a/lib/DebugInfo/DWARFUnit.cpp b/lib/DebugInfo/DWARFUnit.cpp
new file mode 100644
index 0000000..5167eb9
--- /dev/null
+++ b/lib/DebugInfo/DWARFUnit.cpp
@@ -0,0 +1,365 @@
+//===-- DWARFUnit.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFUnit.h"
+#include "DWARFContext.h"
+#include "llvm/DebugInfo/DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Path.h"
+#include <cstdio>
+
+using namespace llvm;
+using namespace dwarf;
+
+DWARFUnit::DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+                     StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+                     const RelocAddrMap *M, bool LE)
+    : Abbrev(DA), InfoSection(IS), AbbrevSection(AS), RangeSection(RS),
+      StringSection(SS), StringOffsetSection(SOS), AddrOffsetSection(AOS),
+      RelocMap(M), isLittleEndian(LE) {
+  clear();
+}
+
+DWARFUnit::~DWARFUnit() {
+}
+
+bool DWARFUnit::getAddrOffsetSectionItem(uint32_t Index,
+                                                uint64_t &Result) const {
+  uint32_t Offset = AddrOffsetSectionBase + Index * AddrSize;
+  if (AddrOffsetSection.size() < Offset + AddrSize)
+    return false;
+  DataExtractor DA(AddrOffsetSection, isLittleEndian, AddrSize);
+  Result = DA.getAddress(&Offset);
+  return true;
+}
+
+bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
+                                                  uint32_t &Result) const {
+  // FIXME: string offset section entries are 8-byte for DWARF64.
+  const uint32_t ItemSize = 4;
+  uint32_t Offset = Index * ItemSize;
+  if (StringOffsetSection.size() < Offset + ItemSize)
+    return false;
+  DataExtractor DA(StringOffsetSection, isLittleEndian, 0);
+  Result = DA.getU32(&Offset);
+  return true;
+}
+
+bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
+  Length = debug_info.getU32(offset_ptr);
+  Version = debug_info.getU16(offset_ptr);
+  uint64_t abbrOffset = debug_info.getU32(offset_ptr);
+  AddrSize = debug_info.getU8(offset_ptr);
+
+  bool lengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
+  bool versionOK = DWARFContext::isSupportedVersion(Version);
+  bool abbrOffsetOK = AbbrevSection.size() > abbrOffset;
+  bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
+
+  if (!lengthOK || !versionOK || !addrSizeOK || !abbrOffsetOK)
+    return false;
+
+  Abbrevs = Abbrev->getAbbreviationDeclarationSet(abbrOffset);
+  return true;
+}
+
+bool DWARFUnit::extract(DataExtractor debug_info, uint32_t *offset_ptr) {
+  clear();
+
+  Offset = *offset_ptr;
+
+  if (debug_info.isValidOffset(*offset_ptr)) {
+    if (extractImpl(debug_info, offset_ptr))
+      return true;
+
+    // reset the offset to where we tried to parse from if anything went wrong
+    *offset_ptr = Offset;
+  }
+
+  return false;
+}
+
+bool DWARFUnit::extractRangeList(uint32_t RangeListOffset,
+                                        DWARFDebugRangeList &RangeList) const {
+  // Require that compile unit is extracted.
+  assert(DieArray.size() > 0);
+  DataExtractor RangesData(RangeSection, isLittleEndian, AddrSize);
+  uint32_t ActualRangeListOffset = RangeSectionBase + RangeListOffset;
+  return RangeList.extract(RangesData, &ActualRangeListOffset);
+}
+
+void DWARFUnit::clear() {
+  Offset = 0;
+  Length = 0;
+  Version = 0;
+  Abbrevs = 0;
+  AddrSize = 0;
+  BaseAddr = 0;
+  RangeSectionBase = 0;
+  AddrOffsetSectionBase = 0;
+  clearDIEs(false);
+  DWO.reset();
+}
+
+const char *DWARFUnit::getCompilationDir() {
+  extractDIEsIfNeeded(true);
+  if (DieArray.empty())
+    return 0;
+  return DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+}
+
+uint64_t DWARFUnit::getDWOId() {
+  extractDIEsIfNeeded(true);
+  const uint64_t FailValue = -1ULL;
+  if (DieArray.empty())
+    return FailValue;
+  return DieArray[0]
+      .getAttributeValueAsUnsignedConstant(this, DW_AT_GNU_dwo_id, FailValue);
+}
+
+void DWARFUnit::setDIERelations() {
+  if (DieArray.empty())
+    return;
+  DWARFDebugInfoEntryMinimal *die_array_begin = &DieArray.front();
+  DWARFDebugInfoEntryMinimal *die_array_end = &DieArray.back();
+  DWARFDebugInfoEntryMinimal *curr_die;
+  // We purposely are skipping the last element in the array in the loop below
+  // so that we can always have a valid next item
+  for (curr_die = die_array_begin; curr_die < die_array_end; ++curr_die) {
+    // Since our loop doesn't include the last element, we can always
+    // safely access the next die in the array.
+    DWARFDebugInfoEntryMinimal *next_die = curr_die + 1;
+
+    const DWARFAbbreviationDeclaration *curr_die_abbrev =
+      curr_die->getAbbreviationDeclarationPtr();
+
+    if (curr_die_abbrev) {
+      // Normal DIE
+      if (curr_die_abbrev->hasChildren())
+        next_die->setParent(curr_die);
+      else
+        curr_die->setSibling(next_die);
+    } else {
+      // NULL DIE that terminates a sibling chain
+      DWARFDebugInfoEntryMinimal *parent = curr_die->getParent();
+      if (parent)
+        parent->setSibling(next_die);
+    }
+  }
+
+  // Since we skipped the last element, we need to fix it up!
+  if (die_array_begin < die_array_end)
+    curr_die->setParent(die_array_begin);
+}
+
+void DWARFUnit::extractDIEsToVector(
+    bool AppendCUDie, bool AppendNonCUDies,
+    std::vector<DWARFDebugInfoEntryMinimal> &Dies) const {
+  if (!AppendCUDie && !AppendNonCUDies)
+    return;
+
+  // Set the offset to that of the first DIE and calculate the start of the
+  // next compilation unit header.
+  uint32_t Offset = getFirstDIEOffset();
+  uint32_t NextCUOffset = getNextUnitOffset();
+  DWARFDebugInfoEntryMinimal DIE;
+  uint32_t Depth = 0;
+  bool IsCUDie = true;
+
+  while (Offset < NextCUOffset && DIE.extractFast(this, &Offset)) {
+    if (IsCUDie) {
+      if (AppendCUDie)
+        Dies.push_back(DIE);
+      if (!AppendNonCUDies)
+        break;
+      // The average bytes per DIE entry has been seen to be
+      // around 14-20 so let's pre-reserve the needed memory for
+      // our DIE entries accordingly.
+      Dies.reserve(Dies.size() + getDebugInfoSize() / 14);
+      IsCUDie = false;
+    } else {
+      Dies.push_back(DIE);
+    }
+
+    const DWARFAbbreviationDeclaration *AbbrDecl =
+      DIE.getAbbreviationDeclarationPtr();
+    if (AbbrDecl) {
+      // Normal DIE
+      if (AbbrDecl->hasChildren())
+        ++Depth;
+    } else {
+      // NULL DIE.
+      if (Depth > 0)
+        --Depth;
+      if (Depth == 0)
+        break;  // We are done with this compile unit!
+    }
+  }
+
+  // Give a little bit of info if we encounter corrupt DWARF (our offset
+  // should always terminate at or before the start of the next compilation
+  // unit header).
+  if (Offset > NextCUOffset)
+    fprintf(stderr, "warning: DWARF compile unit extends beyond its "
+                    "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), Offset);
+}
+
+size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
+  if ((CUDieOnly && DieArray.size() > 0) ||
+      DieArray.size() > 1)
+    return 0; // Already parsed.
+
+  bool HasCUDie = DieArray.size() > 0;
+  extractDIEsToVector(!HasCUDie, !CUDieOnly, DieArray);
+
+  if (DieArray.empty())
+    return 0;
+
+  // If CU DIE was just parsed, copy several attribute values from it.
+  if (!HasCUDie) {
+    uint64_t BaseAddr =
+        DieArray[0].getAttributeValueAsAddress(this, DW_AT_low_pc, -1ULL);
+    if (BaseAddr == -1ULL)
+      BaseAddr = DieArray[0].getAttributeValueAsAddress(this, DW_AT_entry_pc, 0);
+    setBaseAddress(BaseAddr);
+    AddrOffsetSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+        this, DW_AT_GNU_addr_base, 0);
+    RangeSectionBase = DieArray[0].getAttributeValueAsSectionOffset(
+        this, DW_AT_GNU_ranges_base, 0);
+  }
+
+  setDIERelations();
+  return DieArray.size();
+}
+
+DWARFUnit::DWOHolder::DWOHolder(object::ObjectFile *DWOFile)
+    : DWOFile(DWOFile),
+      DWOContext(cast<DWARFContext>(DIContext::getDWARFContext(DWOFile))),
+      DWOU(0) {
+  if (DWOContext->getNumDWOCompileUnits() > 0)
+    DWOU = DWOContext->getDWOCompileUnitAtIndex(0);
+}
+
+bool DWARFUnit::parseDWO() {
+  if (DWO.get() != 0)
+    return false;
+  extractDIEsIfNeeded(true);
+  if (DieArray.empty())
+    return false;
+  const char *DWOFileName =
+      DieArray[0].getAttributeValueAsString(this, DW_AT_GNU_dwo_name, 0);
+  if (DWOFileName == 0)
+    return false;
+  const char *CompilationDir =
+      DieArray[0].getAttributeValueAsString(this, DW_AT_comp_dir, 0);
+  SmallString<16> AbsolutePath;
+  if (sys::path::is_relative(DWOFileName) && CompilationDir != 0) {
+    sys::path::append(AbsolutePath, CompilationDir);
+  }
+  sys::path::append(AbsolutePath, DWOFileName);
+  object::ObjectFile *DWOFile =
+      object::ObjectFile::createObjectFile(AbsolutePath);
+  if (!DWOFile)
+    return false;
+  // Reset DWOHolder.
+  DWO.reset(new DWOHolder(DWOFile));
+  DWARFUnit *DWOCU = DWO->getUnit();
+  // Verify that compile unit in .dwo file is valid.
+  if (DWOCU == 0 || DWOCU->getDWOId() != getDWOId()) {
+    DWO.reset();
+    return false;
+  }
+  // Share .debug_addr and .debug_ranges section with compile unit in .dwo
+  DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase);
+  DWOCU->setRangesSection(RangeSection, RangeSectionBase);
+  return true;
+}
+
+void DWARFUnit::clearDIEs(bool KeepCUDie) {
+  if (DieArray.size() > (unsigned)KeepCUDie) {
+    // std::vectors never get any smaller when resized to a smaller size,
+    // or when clear() or erase() are called, the size will report that it
+    // is smaller, but the memory allocated remains intact (call capacity()
+    // to see this). So we need to create a temporary vector and swap the
+    // contents which will cause just the internal pointers to be swapped
+    // so that when temporary vector goes out of scope, it will destroy the
+    // contents.
+    std::vector<DWARFDebugInfoEntryMinimal> TmpArray;
+    DieArray.swap(TmpArray);
+    // Save at least the compile unit DIE
+    if (KeepCUDie)
+      DieArray.push_back(TmpArray.front());
+  }
+}
+
+void
+DWARFUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+                                         bool clear_dies_if_already_not_parsed,
+                                         uint32_t CUOffsetInAranges) {
+  // This function is usually called if there in no .debug_aranges section
+  // in order to produce a compile unit level set of address ranges that
+  // is accurate. If the DIEs weren't parsed, then we don't want all dies for
+  // all compile units to stay loaded when they weren't needed. So we can end
+  // up parsing the DWARF and then throwing them all away to keep memory usage
+  // down.
+  const bool clear_dies = extractDIEsIfNeeded(false) > 1 &&
+                          clear_dies_if_already_not_parsed;
+  DieArray[0].buildAddressRangeTable(this, debug_aranges, CUOffsetInAranges);
+  bool DWOCreated = parseDWO();
+  if (DWO.get()) {
+    // If there is a .dwo file for this compile unit, then skeleton CU DIE
+    // doesn't have children, and we should instead build address range table
+    // from DIEs in the .debug_info.dwo section of .dwo file.
+    DWO->getUnit()->buildAddressRangeTable(
+        debug_aranges, clear_dies_if_already_not_parsed, CUOffsetInAranges);
+  }
+  if (DWOCreated && clear_dies_if_already_not_parsed)
+    DWO.reset();
+
+  // Keep memory down by clearing DIEs if this generate function
+  // caused them to be parsed.
+  if (clear_dies)
+    clearDIEs(true);
+}
+
+const DWARFDebugInfoEntryMinimal *
+DWARFUnit::getSubprogramForAddress(uint64_t Address) {
+  extractDIEsIfNeeded(false);
+  for (size_t i = 0, n = DieArray.size(); i != n; i++)
+    if (DieArray[i].isSubprogramDIE() &&
+        DieArray[i].addressRangeContainsAddress(this, Address)) {
+      return &DieArray[i];
+    }
+  return 0;
+}
+
+DWARFDebugInfoEntryInlinedChain
+DWARFUnit::getInlinedChainForAddress(uint64_t Address) {
+  // First, find a subprogram that contains the given address (the root
+  // of inlined chain).
+  const DWARFUnit *ChainCU = 0;
+  const DWARFDebugInfoEntryMinimal *SubprogramDIE =
+      getSubprogramForAddress(Address);
+  if (SubprogramDIE) {
+    ChainCU = this;
+  } else {
+    // Try to look for subprogram DIEs in the DWO file.
+    parseDWO();
+    if (DWO.get()) {
+      SubprogramDIE = DWO->getUnit()->getSubprogramForAddress(Address);
+      if (SubprogramDIE)
+        ChainCU = DWO->getUnit();
+    }
+  }
+
+  // Get inlined chain rooted at this subprogram DIE.
+  if (!SubprogramDIE)
+    return DWARFDebugInfoEntryInlinedChain();
+  return SubprogramDIE->getInlinedChainForAddress(ChainCU, Address);
+}
diff --git a/lib/DebugInfo/DWARFUnit.h b/lib/DebugInfo/DWARFUnit.h
new file mode 100644
index 0000000..bd768a6
--- /dev/null
+++ b/lib/DebugInfo/DWARFUnit.h
@@ -0,0 +1,168 @@
+//===-- DWARFUnit.h ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFUNIT_H
+#define LLVM_DEBUGINFO_DWARFUNIT_H
+
+#include "llvm/ADT/OwningPtr.h"
+#include "DWARFDebugAbbrev.h"
+#include "DWARFDebugInfoEntry.h"
+#include "DWARFDebugRangeList.h"
+#include "DWARFRelocMap.h"
+#include <vector>
+
+namespace llvm {
+
+namespace object {
+class ObjectFile;
+}
+
+class DWARFDebugAbbrev;
+class StringRef;
+class raw_ostream;
+
+class DWARFUnit {
+  const DWARFDebugAbbrev *Abbrev;
+  StringRef InfoSection;
+  StringRef AbbrevSection;
+  StringRef RangeSection;
+  uint32_t RangeSectionBase;
+  StringRef StringSection;
+  StringRef StringOffsetSection;
+  StringRef AddrOffsetSection;
+  uint32_t AddrOffsetSectionBase;
+  const RelocAddrMap *RelocMap;
+  bool isLittleEndian;
+
+  uint32_t Offset;
+  uint32_t Length;
+  uint16_t Version;
+  const DWARFAbbreviationDeclarationSet *Abbrevs;
+  uint8_t AddrSize;
+  uint64_t BaseAddr;
+  // The compile unit debug information entry items.
+  std::vector<DWARFDebugInfoEntryMinimal> DieArray;
+
+  class DWOHolder {
+    OwningPtr<object::ObjectFile> DWOFile;
+    OwningPtr<DWARFContext> DWOContext;
+    DWARFUnit *DWOU;
+  public:
+    DWOHolder(object::ObjectFile *DWOFile);
+    DWARFUnit *getUnit() const { return DWOU; }
+  };
+  OwningPtr<DWOHolder> DWO;
+
+protected:
+  virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr);
+
+public:
+
+  DWARFUnit(const DWARFDebugAbbrev *DA, StringRef IS, StringRef AS,
+            StringRef RS, StringRef SS, StringRef SOS, StringRef AOS,
+            const RelocAddrMap *M, bool LE);
+
+  virtual ~DWARFUnit();
+
+  StringRef getStringSection() const { return StringSection; }
+  StringRef getStringOffsetSection() const { return StringOffsetSection; }
+  void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
+    AddrOffsetSection = AOS;
+    AddrOffsetSectionBase = Base;
+  }
+  void setRangesSection(StringRef RS, uint32_t Base) {
+    RangeSection = RS;
+    RangeSectionBase = Base;
+  }
+
+  bool getAddrOffsetSectionItem(uint32_t Index, uint64_t &Result) const;
+  // FIXME: Result should be uint64_t in DWARF64.
+  bool getStringOffsetSectionItem(uint32_t Index, uint32_t &Result) const;
+
+  DataExtractor getDebugInfoExtractor() const {
+    return DataExtractor(InfoSection, isLittleEndian, AddrSize);
+  }
+  DataExtractor getStringExtractor() const {
+    return DataExtractor(StringSection, false, 0);
+  }
+
+  const RelocAddrMap *getRelocMap() const { return RelocMap; }
+
+  bool extract(DataExtractor debug_info, uint32_t* offset_ptr);
+
+  /// extractRangeList - extracts the range list referenced by this compile
+  /// unit from .debug_ranges section. Returns true on success.
+  /// Requires that compile unit is already extracted.
+  bool extractRangeList(uint32_t RangeListOffset,
+                        DWARFDebugRangeList &RangeList) const;
+  void clear();
+  uint32_t getOffset() const { return Offset; }
+  /// Size in bytes of the compile unit header.
+  virtual uint32_t getSize() const { return 11; }
+  uint32_t getFirstDIEOffset() const { return Offset + getSize(); }
+  uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
+  /// Size in bytes of the .debug_info data associated with this compile unit.
+  size_t getDebugInfoSize() const { return Length + 4 - getSize(); }
+  uint32_t getLength() const { return Length; }
+  uint16_t getVersion() const { return Version; }
+  const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
+    return Abbrevs;
+  }
+  uint8_t getAddressByteSize() const { return AddrSize; }
+  uint64_t getBaseAddress() const { return BaseAddr; }
+
+  void setBaseAddress(uint64_t base_addr) {
+    BaseAddr = base_addr;
+  }
+
+  const DWARFDebugInfoEntryMinimal *
+  getCompileUnitDIE(bool extract_cu_die_only = true) {
+    extractDIEsIfNeeded(extract_cu_die_only);
+    return DieArray.empty() ? NULL : &DieArray[0];
+  }
+
+  const char *getCompilationDir();
+  uint64_t getDWOId();
+
+  void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
+                              bool clear_dies_if_already_not_parsed,
+                              uint32_t CUOffsetInAranges);
+
+  /// getInlinedChainForAddress - fetches inlined chain for a given address.
+  /// Returns empty chain if there is no subprogram containing address. The
+  /// chain is valid as long as parsed compile unit DIEs are not cleared.
+  DWARFDebugInfoEntryInlinedChain getInlinedChainForAddress(uint64_t Address);
+
+private:
+  /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
+  /// hasn't already been done. Returns the number of DIEs parsed at this call.
+  size_t extractDIEsIfNeeded(bool CUDieOnly);
+  /// extractDIEsToVector - Appends all parsed DIEs to a vector.
+  void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
+                           std::vector<DWARFDebugInfoEntryMinimal> &DIEs) const;
+  /// setDIERelations - We read in all of the DIE entries into our flat list
+  /// of DIE entries and now we need to go back through all of them and set the
+  /// parent, sibling and child pointers for quick DIE navigation.
+  void setDIERelations();
+  /// clearDIEs - Clear parsed DIEs to keep memory usage low.
+  void clearDIEs(bool KeepCUDie);
+
+  /// parseDWO - Parses .dwo file for current compile unit. Returns true if
+  /// it was actually constructed.
+  bool parseDWO();
+
+  /// getSubprogramForAddress - Returns subprogram DIE with address range
+  /// encompassing the provided address. The pointer is alive as long as parsed
+  /// compile unit DIEs are not cleared.
+  const DWARFDebugInfoEntryMinimal *getSubprogramForAddress(uint64_t Address);
+};
+
+}
+
+#endif
diff --git a/lib/DebugInfo/LLVMBuild.txt b/lib/DebugInfo/LLVMBuild.txt
index 210b9f9..f347d5e 100644
--- a/lib/DebugInfo/LLVMBuild.txt
+++ b/lib/DebugInfo/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Library
 name = DebugInfo
 parent = Libraries
-required_libraries = Support
+required_libraries = Object Support
diff --git a/lib/ExecutionEngine/CMakeLists.txt b/lib/ExecutionEngine/CMakeLists.txt
index cb11bfe..3102c7b 100644
--- a/lib/ExecutionEngine/CMakeLists.txt
+++ b/lib/ExecutionEngine/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_llvm_library(LLVMExecutionEngine
   ExecutionEngine.cpp
   ExecutionEngineBindings.cpp
+  RTDyldMemoryManager.cpp
   TargetSelect.cpp
   )
 
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index e43ba4f..2a610d5 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -14,6 +14,8 @@
 
 #define DEBUG_TYPE "jit"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/ObjectCache.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
@@ -38,6 +40,11 @@ using namespace llvm;
 STATISTIC(NumInitBytes, "Number of bytes of global vars initialized");
 STATISTIC(NumGlobals  , "Number of global vars initialized");
 
+// Pin the vtable to this file.
+void ObjectCache::anchor() {}
+void ObjectBuffer::anchor() {}
+void ObjectBufferStream::anchor() {}
+
 ExecutionEngine *(*ExecutionEngine::JITCtor)(
   Module *M,
   std::string *ErrorStr,
@@ -47,7 +54,7 @@ ExecutionEngine *(*ExecutionEngine::JITCtor)(
 ExecutionEngine *(*ExecutionEngine::MCJITCtor)(
   Module *M,
   std::string *ErrorStr,
-  JITMemoryManager *JMM,
+  RTDyldMemoryManager *MCJMM,
   bool GVsWithCode,
   TargetMachine *TM) = 0;
 ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
@@ -55,9 +62,7 @@ ExecutionEngine *(*ExecutionEngine::InterpCtor)(Module *M,
 
 ExecutionEngine::ExecutionEngine(Module *M)
   : EEState(*this),
-    LazyFunctionCreator(0),
-    ExceptionTableRegister(0),
-    ExceptionTableDeregister(0) {
+    LazyFunctionCreator(0) {
   CompilingLazily         = false;
   GVCompilationDisabled   = false;
   SymbolSearchingDisabled = false;
@@ -71,16 +76,6 @@ ExecutionEngine::~ExecutionEngine() {
     delete Modules[i];
 }
 
-void ExecutionEngine::DeregisterAllTables() {
-  if (ExceptionTableDeregister) {
-    DenseMap<const Function*, void*>::iterator it = AllExceptionTables.begin();
-    DenseMap<const Function*, void*>::iterator ite = AllExceptionTables.end();
-    for (; it != ite; ++it)
-      ExceptionTableDeregister(it->second);
-    AllExceptionTables.clear();
-  }
-}
-
 namespace {
 /// \brief Helper class which uses a value handler to automatically deletes the
 /// memory block when the GlobalVariable is destroyed.
@@ -117,7 +112,7 @@ char *ExecutionEngine::getMemoryForGV(const GlobalVariable *GV) {
 }
 
 bool ExecutionEngine::removeModule(Module *M) {
-  for(SmallVector<Module *, 1>::iterator I = Modules.begin(),
+  for(SmallVectorImpl<Module *>::iterator I = Modules.begin(),
         E = Modules.end(); I != E; ++I) {
     Module *Found = *I;
     if (Found == M) {
@@ -455,10 +450,12 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
   if (sys::DynamicLibrary::LoadLibraryPermanently(0, ErrorStr))
     return 0;
 
+  assert(!(JMM && MCJMM));
+  
   // If the user specified a memory manager but didn't specify which engine to
   // create, we assume they only want the JIT, and we fail if they only want
   // the interpreter.
-  if (JMM) {
+  if (JMM || MCJMM) {
     if (WhichEngine & EngineKind::JIT)
       WhichEngine = EngineKind::JIT;
     else {
@@ -467,6 +464,14 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
       return 0;
     }
   }
+  
+  if (MCJMM && ! UseMCJIT) {
+    if (ErrorStr)
+      *ErrorStr =
+        "Cannot create a legacy JIT with a runtime dyld memory "
+        "manager.";
+    return 0;
+  }
 
   // Unless the interpreter was explicitly selected or the JIT is not linked,
   // try making a JIT.
@@ -480,7 +485,7 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
 
     if (UseMCJIT && ExecutionEngine::MCJITCtor) {
       ExecutionEngine *EE =
-        ExecutionEngine::MCJITCtor(M, ErrorStr, JMM,
+        ExecutionEngine::MCJITCtor(M, ErrorStr, MCJMM ? MCJMM : JMM,
                                    AllocateGVsWithCode, TheTM.take());
       if (EE) return EE;
     } else if (ExecutionEngine::JITCtor) {
@@ -545,6 +550,24 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
       // with the correct bit width.
       Result.IntVal = APInt(C->getType()->getPrimitiveSizeInBits(), 0);
       break;
+    case Type::StructTyID: {
+      // if the whole struct is 'undef' just reserve memory for the value.
+      if(StructType *STy = dyn_cast<StructType>(C->getType())) {
+        unsigned int elemNum = STy->getNumElements();
+        Result.AggregateVal.resize(elemNum);
+        for (unsigned int i = 0; i < elemNum; ++i) {
+          Type *ElemTy = STy->getElementType(i);
+          if (ElemTy->isIntegerTy())
+            Result.AggregateVal[i].IntVal = 
+              APInt(ElemTy->getPrimitiveSizeInBits(), 0);
+          else if (ElemTy->isAggregateType()) {
+              const Constant *ElemUndef = UndefValue::get(ElemTy);
+              Result.AggregateVal[i] = getConstantValue(ElemUndef);
+            }
+          }
+        }
+      }
+      break;
     case Type::VectorTyID:
       // if the whole vector is 'undef' just reserve memory for the value.
       const VectorType* VTy = dyn_cast<VectorType>(C->getType());
@@ -553,7 +576,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
       Result.AggregateVal.resize(elemNum);
       if (ElemTy->isIntegerTy())
         for (unsigned int i = 0; i < elemNum; ++i)
-          Result.AggregateVal[i].IntVal = 
+          Result.AggregateVal[i].IntVal =
             APInt(ElemTy->getPrimitiveSizeInBits(), 0);
       break;
     }
@@ -1272,6 +1295,10 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
   if (GA == 0) {
     // If it's not already specified, allocate memory for the global.
     GA = getMemoryForGV(GV);
+
+    // If we failed to allocate memory for this global, return.
+    if (GA == 0) return;
+
     addGlobalMapping(GV, GA);
   }
 
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index f9b08a0..2d34eea 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -15,6 +15,7 @@
 #include "llvm-c/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -157,10 +158,8 @@ LLVMBool LLVMCreateJITCompilerForModule(LLVMExecutionEngineRef *OutJIT,
 void LLVMInitializeMCJITCompilerOptions(LLVMMCJITCompilerOptions *PassedOptions,
                                         size_t SizeOfPassedOptions) {
   LLVMMCJITCompilerOptions options;
-  options.OptLevel = 0;
+  memset(&options, 0, sizeof(options)); // Most fields are zero by default.
   options.CodeModel = LLVMCodeModelJITDefault;
-  options.NoFramePointerElim = false;
-  options.EnableFastISel = false;
   
   memcpy(PassedOptions, &options,
          std::min(sizeof(options), SizeOfPassedOptions));
@@ -199,6 +198,8 @@ LLVMBool LLVMCreateMCJITCompilerForModule(
          .setOptLevel((CodeGenOpt::Level)options.OptLevel)
          .setCodeModel(unwrap(options.CodeModel))
          .setTargetOptions(targetOptions);
+  if (options.MCJMM)
+    builder.setMCJITMemoryManager(unwrap(options.MCJMM));
   if (ExecutionEngine *JIT = builder.create()) {
     *OutJIT = wrap(JIT);
     return 0;
@@ -332,3 +333,107 @@ void *LLVMGetPointerToGlobal(LLVMExecutionEngineRef EE, LLVMValueRef Global) {
   
   return unwrap(EE)->getPointerToGlobal(unwrap<GlobalValue>(Global));
 }
+
+/*===-- Operations on memory managers -------------------------------------===*/
+
+namespace {
+
+struct SimpleBindingMMFunctions {
+  LLVMMemoryManagerAllocateCodeSectionCallback AllocateCodeSection;
+  LLVMMemoryManagerAllocateDataSectionCallback AllocateDataSection;
+  LLVMMemoryManagerFinalizeMemoryCallback FinalizeMemory;
+  LLVMMemoryManagerDestroyCallback Destroy;
+};
+
+class SimpleBindingMemoryManager : public RTDyldMemoryManager {
+public:
+  SimpleBindingMemoryManager(const SimpleBindingMMFunctions& Functions,
+                             void *Opaque);
+  virtual ~SimpleBindingMemoryManager();
+  
+  virtual uint8_t *allocateCodeSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName);
+
+  virtual uint8_t *allocateDataSection(
+    uintptr_t Size, unsigned Alignment, unsigned SectionID,
+    StringRef SectionName, bool isReadOnly);
+
+  virtual bool finalizeMemory(std::string *ErrMsg);
+  
+private:
+  SimpleBindingMMFunctions Functions;
+  void *Opaque;
+};
+
+SimpleBindingMemoryManager::SimpleBindingMemoryManager(
+  const SimpleBindingMMFunctions& Functions,
+  void *Opaque)
+  : Functions(Functions), Opaque(Opaque) {
+  assert(Functions.AllocateCodeSection &&
+         "No AllocateCodeSection function provided!");
+  assert(Functions.AllocateDataSection &&
+         "No AllocateDataSection function provided!");
+  assert(Functions.FinalizeMemory &&
+         "No FinalizeMemory function provided!");
+  assert(Functions.Destroy &&
+         "No Destroy function provided!");
+}
+
+SimpleBindingMemoryManager::~SimpleBindingMemoryManager() {
+  Functions.Destroy(Opaque);
+}
+
+uint8_t *SimpleBindingMemoryManager::allocateCodeSection(
+  uintptr_t Size, unsigned Alignment, unsigned SectionID,
+  StringRef SectionName) {
+  return Functions.AllocateCodeSection(Opaque, Size, Alignment, SectionID,
+                                       SectionName.str().c_str());
+}
+
+uint8_t *SimpleBindingMemoryManager::allocateDataSection(
+  uintptr_t Size, unsigned Alignment, unsigned SectionID,
+  StringRef SectionName, bool isReadOnly) {
+  return Functions.AllocateDataSection(Opaque, Size, Alignment, SectionID,
+                                       SectionName.str().c_str(),
+                                       isReadOnly);
+}
+
+bool SimpleBindingMemoryManager::finalizeMemory(std::string *ErrMsg) {
+  char *errMsgCString = 0;
+  bool result = Functions.FinalizeMemory(Opaque, &errMsgCString);
+  assert((result || !errMsgCString) &&
+         "Did not expect an error message if FinalizeMemory succeeded");
+  if (errMsgCString) {
+    if (ErrMsg)
+      *ErrMsg = errMsgCString;
+    free(errMsgCString);
+  }
+  return result;
+}
+
+} // anonymous namespace
+
+LLVMMCJITMemoryManagerRef LLVMCreateSimpleMCJITMemoryManager(
+  void *Opaque,
+  LLVMMemoryManagerAllocateCodeSectionCallback AllocateCodeSection,
+  LLVMMemoryManagerAllocateDataSectionCallback AllocateDataSection,
+  LLVMMemoryManagerFinalizeMemoryCallback FinalizeMemory,
+  LLVMMemoryManagerDestroyCallback Destroy) {
+  
+  if (!AllocateCodeSection || !AllocateDataSection || !FinalizeMemory ||
+      !Destroy)
+    return NULL;
+  
+  SimpleBindingMMFunctions functions;
+  functions.AllocateCodeSection = AllocateCodeSection;
+  functions.AllocateDataSection = AllocateDataSection;
+  functions.FinalizeMemory = FinalizeMemory;
+  functions.Destroy = Destroy;
+  return wrap(new SimpleBindingMemoryManager(functions, Opaque));
+}
+
+void LLVMDisposeMCJITMemoryManager(LLVMMCJITMemoryManagerRef MM) {
+  delete unwrap(MM);
+}
+
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
index 3d9ff535..777d0f1 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
@@ -61,7 +61,7 @@ public:
     GetNewMethodIDFunc(GetNewMethodIDImpl) {
   }
 
-  // Sends an event anncouncing that a function has been emitted
+  // Sends an event announcing that a function has been emitted
   //   return values are event-specific.  See Intel documentation for details.
   int  iJIT_NotifyEvent(iJIT_JVM_EVENT EventType, void *EventSpecificData) {
     if (!NotifyEventFunc)
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
index b95a9e8..5de0659 100644
--- a/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -786,20 +786,31 @@ void Interpreter::visitBinaryOperator(BinaryOperator &I) {
 }
 
 static GenericValue executeSelectInst(GenericValue Src1, GenericValue Src2,
-                                      GenericValue Src3) {
-  return Src1.IntVal == 0 ? Src3 : Src2;
+                                      GenericValue Src3, const Type *Ty) {
+    GenericValue Dest;
+    if(Ty->isVectorTy()) {
+      assert(Src1.AggregateVal.size() == Src2.AggregateVal.size());
+      assert(Src2.AggregateVal.size() == Src3.AggregateVal.size());
+      Dest.AggregateVal.resize( Src1.AggregateVal.size() );
+      for (size_t i = 0; i < Src1.AggregateVal.size(); ++i)
+        Dest.AggregateVal[i] = (Src1.AggregateVal[i].IntVal == 0) ?
+          Src3.AggregateVal[i] : Src2.AggregateVal[i];
+    } else {
+      Dest = (Src1.IntVal == 0) ? Src3 : Src2;
+    }
+    return Dest;
 }
 
 void Interpreter::visitSelectInst(SelectInst &I) {
   ExecutionContext &SF = ECStack.back();
+  const Type * Ty = I.getOperand(0)->getType();
   GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
   GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
   GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
-  GenericValue R = executeSelectInst(Src1, Src2, Src3);
+  GenericValue R = executeSelectInst(Src1, Src2, Src3, Ty);
   SetValue(&I, R, SF);
 }
 
-
 //===----------------------------------------------------------------------===//
 //                     Terminator Instruction Implementations
 //===----------------------------------------------------------------------===//
@@ -887,40 +898,11 @@ void Interpreter::visitSwitchInst(SwitchInst &I) {
   // Check to see if any of the cases match...
   BasicBlock *Dest = 0;
   for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
-    IntegersSubset& Case = i.getCaseValueEx();
-    if (Case.isSingleNumber()) {
-      // FIXME: Currently work with ConstantInt based numbers.
-      const ConstantInt *CI = Case.getSingleNumber(0).toConstantInt();
-      GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
-      if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
-        Dest = cast<BasicBlock>(i.getCaseSuccessor());
-        break;        
-      }
+    GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF);
+    if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
+      Dest = cast<BasicBlock>(i.getCaseSuccessor());
+      break;
     }
-    if (Case.isSingleNumbersOnly()) {
-      for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
-        // FIXME: Currently work with ConstantInt based numbers.
-        const ConstantInt *CI = Case.getSingleNumber(n).toConstantInt();
-        GenericValue Val = getOperandValue(const_cast<ConstantInt*>(CI), SF);
-        if (executeICMP_EQ(Val, CondVal, ElTy).IntVal != 0) {
-          Dest = cast<BasicBlock>(i.getCaseSuccessor());
-          break;        
-        }
-      }      
-    } else
-      for (unsigned n = 0, en = Case.getNumItems(); n != en; ++n) {
-        IntegersSubset::Range r = Case.getItem(n);
-        // FIXME: Currently work with ConstantInt based numbers.
-        const ConstantInt *LowCI = r.getLow().toConstantInt();
-        const ConstantInt *HighCI = r.getHigh().toConstantInt();
-        GenericValue Low = getOperandValue(const_cast<ConstantInt*>(LowCI), SF);
-        GenericValue High = getOperandValue(const_cast<ConstantInt*>(HighCI), SF);
-        if (executeICMP_ULE(Low, CondVal, ElTy).IntVal != 0 &&
-            executeICMP_ULE(CondVal, High, ElTy).IntVal != 0) {
-          Dest = cast<BasicBlock>(i.getCaseSuccessor());
-          break;        
-        }
-      }
   }
   if (!Dest) Dest = I.getDefaultDest();   // No cases matched: use default
   SwitchToNewBasicBlock(Dest, SF);
@@ -1138,16 +1120,42 @@ void Interpreter::visitCallSite(CallSite CS) {
   callFunction((Function*)GVTOP(SRC), ArgVals);
 }
 
+// auxilary function for shift operations
+static unsigned getShiftAmount(uint64_t orgShiftAmount,
+                               llvm::APInt valueToShift) {
+  unsigned valueWidth = valueToShift.getBitWidth();
+  if (orgShiftAmount < (uint64_t)valueWidth)
+    return orgShiftAmount;
+  // according to the llvm documentation, if orgShiftAmount > valueWidth,
+  // the result is undfeined. but we do shift by this rule:
+  return (NextPowerOf2(valueWidth-1) - 1) & orgShiftAmount;
+}
+
+
 void Interpreter::visitShl(BinaryOperator &I) {
   ExecutionContext &SF = ECStack.back();
   GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
   GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
   GenericValue Dest;
-  if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
-    Dest.IntVal = Src1.IntVal.shl(Src2.IntVal.getZExtValue());
-  else
-    Dest.IntVal = Src1.IntVal;
-  
+  const Type *Ty = I.getType();
+
+  if (Ty->isVectorTy()) {
+    uint32_t src1Size = uint32_t(Src1.AggregateVal.size());
+    assert(src1Size == Src2.AggregateVal.size());
+    for (unsigned i = 0; i < src1Size; i++) {
+      GenericValue Result;
+      uint64_t shiftAmount = Src2.AggregateVal[i].IntVal.getZExtValue();
+      llvm::APInt valueToShift = Src1.AggregateVal[i].IntVal;
+      Result.IntVal = valueToShift.shl(getShiftAmount(shiftAmount, valueToShift));
+      Dest.AggregateVal.push_back(Result);
+    }
+  } else {
+    // scalar
+    uint64_t shiftAmount = Src2.IntVal.getZExtValue();
+    llvm::APInt valueToShift = Src1.IntVal;
+    Dest.IntVal = valueToShift.shl(getShiftAmount(shiftAmount, valueToShift));
+  }
+
   SetValue(&I, Dest, SF);
 }
 
@@ -1156,11 +1164,25 @@ void Interpreter::visitLShr(BinaryOperator &I) {
   GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
   GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
   GenericValue Dest;
-  if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
-    Dest.IntVal = Src1.IntVal.lshr(Src2.IntVal.getZExtValue());
-  else
-    Dest.IntVal = Src1.IntVal;
-  
+  const Type *Ty = I.getType();
+
+  if (Ty->isVectorTy()) {
+    uint32_t src1Size = uint32_t(Src1.AggregateVal.size());
+    assert(src1Size == Src2.AggregateVal.size());
+    for (unsigned i = 0; i < src1Size; i++) {
+      GenericValue Result;
+      uint64_t shiftAmount = Src2.AggregateVal[i].IntVal.getZExtValue();
+      llvm::APInt valueToShift = Src1.AggregateVal[i].IntVal;
+      Result.IntVal = valueToShift.lshr(getShiftAmount(shiftAmount, valueToShift));
+      Dest.AggregateVal.push_back(Result);
+    }
+  } else {
+    // scalar
+    uint64_t shiftAmount = Src2.IntVal.getZExtValue();
+    llvm::APInt valueToShift = Src1.IntVal;
+    Dest.IntVal = valueToShift.lshr(getShiftAmount(shiftAmount, valueToShift));
+  }
+
   SetValue(&I, Dest, SF);
 }
 
@@ -1169,110 +1191,273 @@ void Interpreter::visitAShr(BinaryOperator &I) {
   GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
   GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
   GenericValue Dest;
-  if (Src2.IntVal.getZExtValue() < Src1.IntVal.getBitWidth())
-    Dest.IntVal = Src1.IntVal.ashr(Src2.IntVal.getZExtValue());
-  else
-    Dest.IntVal = Src1.IntVal;
-  
+  const Type *Ty = I.getType();
+
+  if (Ty->isVectorTy()) {
+    size_t src1Size = Src1.AggregateVal.size();
+    assert(src1Size == Src2.AggregateVal.size());
+    for (unsigned i = 0; i < src1Size; i++) {
+      GenericValue Result;
+      uint64_t shiftAmount = Src2.AggregateVal[i].IntVal.getZExtValue();
+      llvm::APInt valueToShift = Src1.AggregateVal[i].IntVal;
+      Result.IntVal = valueToShift.ashr(getShiftAmount(shiftAmount, valueToShift));
+      Dest.AggregateVal.push_back(Result);
+    }
+  } else {
+    // scalar
+    uint64_t shiftAmount = Src2.IntVal.getZExtValue();
+    llvm::APInt valueToShift = Src1.IntVal;
+    Dest.IntVal = valueToShift.ashr(getShiftAmount(shiftAmount, valueToShift));
+  }
+
   SetValue(&I, Dest, SF);
 }
 
 GenericValue Interpreter::executeTruncInst(Value *SrcVal, Type *DstTy,
                                            ExecutionContext &SF) {
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  IntegerType *DITy = cast<IntegerType>(DstTy);
-  unsigned DBitWidth = DITy->getBitWidth();
-  Dest.IntVal = Src.IntVal.trunc(DBitWidth);
+  Type *SrcTy = SrcVal->getType();
+  if (SrcTy->isVectorTy()) {
+    Type *DstVecTy = DstTy->getScalarType();
+    unsigned DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth();
+    unsigned NumElts = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal
+    Dest.AggregateVal.resize(NumElts);
+    for (unsigned i = 0; i < NumElts; i++)
+      Dest.AggregateVal[i].IntVal = Src.AggregateVal[i].IntVal.trunc(DBitWidth);
+  } else {
+    IntegerType *DITy = cast<IntegerType>(DstTy);
+    unsigned DBitWidth = DITy->getBitWidth();
+    Dest.IntVal = Src.IntVal.trunc(DBitWidth);
+  }
   return Dest;
 }
 
 GenericValue Interpreter::executeSExtInst(Value *SrcVal, Type *DstTy,
                                           ExecutionContext &SF) {
+  const Type *SrcTy = SrcVal->getType();
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  IntegerType *DITy = cast<IntegerType>(DstTy);
-  unsigned DBitWidth = DITy->getBitWidth();
-  Dest.IntVal = Src.IntVal.sext(DBitWidth);
+  if (SrcTy->isVectorTy()) {
+    const Type *DstVecTy = DstTy->getScalarType();
+    unsigned DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth();
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal.
+    Dest.AggregateVal.resize(size);
+    for (unsigned i = 0; i < size; i++)
+      Dest.AggregateVal[i].IntVal = Src.AggregateVal[i].IntVal.sext(DBitWidth);
+  } else {
+    const IntegerType *DITy = cast<IntegerType>(DstTy);
+    unsigned DBitWidth = DITy->getBitWidth();
+    Dest.IntVal = Src.IntVal.sext(DBitWidth);
+  }
   return Dest;
 }
 
 GenericValue Interpreter::executeZExtInst(Value *SrcVal, Type *DstTy,
                                           ExecutionContext &SF) {
+  const Type *SrcTy = SrcVal->getType();
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  IntegerType *DITy = cast<IntegerType>(DstTy);
-  unsigned DBitWidth = DITy->getBitWidth();
-  Dest.IntVal = Src.IntVal.zext(DBitWidth);
+  if (SrcTy->isVectorTy()) {
+    const Type *DstVecTy = DstTy->getScalarType();
+    unsigned DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth();
+
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal.
+    Dest.AggregateVal.resize(size);
+    for (unsigned i = 0; i < size; i++)
+      Dest.AggregateVal[i].IntVal = Src.AggregateVal[i].IntVal.zext(DBitWidth);
+  } else {
+    const IntegerType *DITy = cast<IntegerType>(DstTy);
+    unsigned DBitWidth = DITy->getBitWidth();
+    Dest.IntVal = Src.IntVal.zext(DBitWidth);
+  }
   return Dest;
 }
 
 GenericValue Interpreter::executeFPTruncInst(Value *SrcVal, Type *DstTy,
                                              ExecutionContext &SF) {
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  assert(SrcVal->getType()->isDoubleTy() && DstTy->isFloatTy() &&
-         "Invalid FPTrunc instruction");
-  Dest.FloatVal = (float) Src.DoubleVal;
+
+  if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+    assert(SrcVal->getType()->getScalarType()->isDoubleTy() &&
+           DstTy->getScalarType()->isFloatTy() &&
+           "Invalid FPTrunc instruction");
+
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal.
+    Dest.AggregateVal.resize(size);
+    for (unsigned i = 0; i < size; i++)
+      Dest.AggregateVal[i].FloatVal = (float)Src.AggregateVal[i].DoubleVal;
+  } else {
+    assert(SrcVal->getType()->isDoubleTy() && DstTy->isFloatTy() &&
+           "Invalid FPTrunc instruction");
+    Dest.FloatVal = (float)Src.DoubleVal;
+  }
+
   return Dest;
 }
 
 GenericValue Interpreter::executeFPExtInst(Value *SrcVal, Type *DstTy,
                                            ExecutionContext &SF) {
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  assert(SrcVal->getType()->isFloatTy() && DstTy->isDoubleTy() &&
-         "Invalid FPTrunc instruction");
-  Dest.DoubleVal = (double) Src.FloatVal;
+
+  if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+    assert(SrcVal->getType()->getScalarType()->isFloatTy() &&
+           DstTy->getScalarType()->isDoubleTy() && "Invalid FPExt instruction");
+
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal.
+    Dest.AggregateVal.resize(size);
+    for (unsigned i = 0; i < size; i++)
+      Dest.AggregateVal[i].DoubleVal = (double)Src.AggregateVal[i].FloatVal;
+  } else {
+    assert(SrcVal->getType()->isFloatTy() && DstTy->isDoubleTy() &&
+           "Invalid FPExt instruction");
+    Dest.DoubleVal = (double)Src.FloatVal;
+  }
+
   return Dest;
 }
 
 GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, Type *DstTy,
                                             ExecutionContext &SF) {
   Type *SrcTy = SrcVal->getType();
-  uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  assert(SrcTy->isFloatingPointTy() && "Invalid FPToUI instruction");
 
-  if (SrcTy->getTypeID() == Type::FloatTyID)
-    Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
-  else
-    Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+  if (SrcTy->getTypeID() == Type::VectorTyID) {
+    const Type *DstVecTy = DstTy->getScalarType();
+    const Type *SrcVecTy = SrcTy->getScalarType();
+    uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth();
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal.
+    Dest.AggregateVal.resize(size);
+
+    if (SrcVecTy->getTypeID() == Type::FloatTyID) {
+      assert(SrcVecTy->isFloatingPointTy() && "Invalid FPToUI instruction");
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].IntVal = APIntOps::RoundFloatToAPInt(
+            Src.AggregateVal[i].FloatVal, DBitWidth);
+    } else {
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].IntVal = APIntOps::RoundDoubleToAPInt(
+            Src.AggregateVal[i].DoubleVal, DBitWidth);
+    }
+  } else {
+    // scalar
+    uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+    assert(SrcTy->isFloatingPointTy() && "Invalid FPToUI instruction");
+
+    if (SrcTy->getTypeID() == Type::FloatTyID)
+      Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
+    else {
+      Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+    }
+  }
+
   return Dest;
 }
 
 GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, Type *DstTy,
                                             ExecutionContext &SF) {
   Type *SrcTy = SrcVal->getType();
-  uint32_t DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  assert(SrcTy->isFloatingPointTy() && "Invalid FPToSI instruction");
 
-  if (SrcTy->getTypeID() == Type::FloatTyID)
-    Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
-  else
-    Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+  if (SrcTy->getTypeID() == Type::VectorTyID) {
+    const Type *DstVecTy = DstTy->getScalarType();
+    const Type *SrcVecTy = SrcTy->getScalarType();
+    uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth();
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal
+    Dest.AggregateVal.resize(size);
+
+    if (SrcVecTy->getTypeID() == Type::FloatTyID) {
+      assert(SrcVecTy->isFloatingPointTy() && "Invalid FPToSI instruction");
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].IntVal = APIntOps::RoundFloatToAPInt(
+            Src.AggregateVal[i].FloatVal, DBitWidth);
+    } else {
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].IntVal = APIntOps::RoundDoubleToAPInt(
+            Src.AggregateVal[i].DoubleVal, DBitWidth);
+    }
+  } else {
+    // scalar
+    unsigned DBitWidth = cast<IntegerType>(DstTy)->getBitWidth();
+    assert(SrcTy->isFloatingPointTy() && "Invalid FPToSI instruction");
+
+    if (SrcTy->getTypeID() == Type::FloatTyID)
+      Dest.IntVal = APIntOps::RoundFloatToAPInt(Src.FloatVal, DBitWidth);
+    else {
+      Dest.IntVal = APIntOps::RoundDoubleToAPInt(Src.DoubleVal, DBitWidth);
+    }
+  }
   return Dest;
 }
 
 GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, Type *DstTy,
                                             ExecutionContext &SF) {
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  assert(DstTy->isFloatingPointTy() && "Invalid UIToFP instruction");
 
-  if (DstTy->getTypeID() == Type::FloatTyID)
-    Dest.FloatVal = APIntOps::RoundAPIntToFloat(Src.IntVal);
-  else
-    Dest.DoubleVal = APIntOps::RoundAPIntToDouble(Src.IntVal);
+  if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+    const Type *DstVecTy = DstTy->getScalarType();
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal
+    Dest.AggregateVal.resize(size);
+
+    if (DstVecTy->getTypeID() == Type::FloatTyID) {
+      assert(DstVecTy->isFloatingPointTy() && "Invalid UIToFP instruction");
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].FloatVal =
+            APIntOps::RoundAPIntToFloat(Src.AggregateVal[i].IntVal);
+    } else {
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].DoubleVal =
+            APIntOps::RoundAPIntToDouble(Src.AggregateVal[i].IntVal);
+    }
+  } else {
+    // scalar
+    assert(DstTy->isFloatingPointTy() && "Invalid UIToFP instruction");
+    if (DstTy->getTypeID() == Type::FloatTyID)
+      Dest.FloatVal = APIntOps::RoundAPIntToFloat(Src.IntVal);
+    else {
+      Dest.DoubleVal = APIntOps::RoundAPIntToDouble(Src.IntVal);
+    }
+  }
   return Dest;
 }
 
 GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, Type *DstTy,
                                             ExecutionContext &SF) {
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  assert(DstTy->isFloatingPointTy() && "Invalid SIToFP instruction");
 
-  if (DstTy->getTypeID() == Type::FloatTyID)
-    Dest.FloatVal = APIntOps::RoundSignedAPIntToFloat(Src.IntVal);
-  else
-    Dest.DoubleVal = APIntOps::RoundSignedAPIntToDouble(Src.IntVal);
-  return Dest;
+  if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+    const Type *DstVecTy = DstTy->getScalarType();
+    unsigned size = Src.AggregateVal.size();
+    // the sizes of src and dst vectors must be equal
+    Dest.AggregateVal.resize(size);
+
+    if (DstVecTy->getTypeID() == Type::FloatTyID) {
+      assert(DstVecTy->isFloatingPointTy() && "Invalid SIToFP instruction");
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].FloatVal =
+            APIntOps::RoundSignedAPIntToFloat(Src.AggregateVal[i].IntVal);
+    } else {
+      for (unsigned i = 0; i < size; i++)
+        Dest.AggregateVal[i].DoubleVal =
+            APIntOps::RoundSignedAPIntToDouble(Src.AggregateVal[i].IntVal);
+    }
+  } else {
+    // scalar
+    assert(DstTy->isFloatingPointTy() && "Invalid SIToFP instruction");
+
+    if (DstTy->getTypeID() == Type::FloatTyID)
+      Dest.FloatVal = APIntOps::RoundSignedAPIntToFloat(Src.IntVal);
+    else {
+      Dest.DoubleVal = APIntOps::RoundSignedAPIntToDouble(Src.IntVal);
+    }
+  }
 
+  return Dest;
 }
 
 GenericValue Interpreter::executePtrToIntInst(Value *SrcVal, Type *DstTy,
@@ -1300,33 +1485,167 @@ GenericValue Interpreter::executeIntToPtrInst(Value *SrcVal, Type *DstTy,
 
 GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
                                              ExecutionContext &SF) {
-  
+
+  // This instruction supports bitwise conversion of vectors to integers and
+  // to vectors of other types (as long as they have the same size)
   Type *SrcTy = SrcVal->getType();
   GenericValue Dest, Src = getOperandValue(SrcVal, SF);
-  if (DstTy->isPointerTy()) {
-    assert(SrcTy->isPointerTy() && "Invalid BitCast");
-    Dest.PointerVal = Src.PointerVal;
-  } else if (DstTy->isIntegerTy()) {
-    if (SrcTy->isFloatTy()) {
-      Dest.IntVal = APInt::floatToBits(Src.FloatVal);
-    } else if (SrcTy->isDoubleTy()) {
-      Dest.IntVal = APInt::doubleToBits(Src.DoubleVal);
-    } else if (SrcTy->isIntegerTy()) {
-      Dest.IntVal = Src.IntVal;
-    } else 
+
+  if ((SrcTy->getTypeID() == Type::VectorTyID) ||
+      (DstTy->getTypeID() == Type::VectorTyID)) {
+    // vector src bitcast to vector dst or vector src bitcast to scalar dst or
+    // scalar src bitcast to vector dst
+    bool isLittleEndian = TD.isLittleEndian();
+    GenericValue TempDst, TempSrc, SrcVec;
+    const Type *SrcElemTy;
+    const Type *DstElemTy;
+    unsigned SrcBitSize;
+    unsigned DstBitSize;
+    unsigned SrcNum;
+    unsigned DstNum;
+
+    if (SrcTy->getTypeID() == Type::VectorTyID) {
+      SrcElemTy = SrcTy->getScalarType();
+      SrcBitSize = SrcTy->getScalarSizeInBits();
+      SrcNum = Src.AggregateVal.size();
+      SrcVec = Src;
+    } else {
+      // if src is scalar value, make it vector <1 x type>
+      SrcElemTy = SrcTy;
+      SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+      SrcNum = 1;
+      SrcVec.AggregateVal.push_back(Src);
+    }
+
+    if (DstTy->getTypeID() == Type::VectorTyID) {
+      DstElemTy = DstTy->getScalarType();
+      DstBitSize = DstTy->getScalarSizeInBits();
+      DstNum = (SrcNum * SrcBitSize) / DstBitSize;
+    } else {
+      DstElemTy = DstTy;
+      DstBitSize = DstTy->getPrimitiveSizeInBits();
+      DstNum = 1;
+    }
+
+    if (SrcNum * SrcBitSize != DstNum * DstBitSize)
       llvm_unreachable("Invalid BitCast");
-  } else if (DstTy->isFloatTy()) {
-    if (SrcTy->isIntegerTy())
-      Dest.FloatVal = Src.IntVal.bitsToFloat();
-    else
-      Dest.FloatVal = Src.FloatVal;
-  } else if (DstTy->isDoubleTy()) {
-    if (SrcTy->isIntegerTy())
-      Dest.DoubleVal = Src.IntVal.bitsToDouble();
-    else
-      Dest.DoubleVal = Src.DoubleVal;
-  } else
-    llvm_unreachable("Invalid Bitcast");
+
+    // If src is floating point, cast to integer first.
+    TempSrc.AggregateVal.resize(SrcNum);
+    if (SrcElemTy->isFloatTy()) {
+      for (unsigned i = 0; i < SrcNum; i++)
+        TempSrc.AggregateVal[i].IntVal =
+            APInt::floatToBits(SrcVec.AggregateVal[i].FloatVal);
+
+    } else if (SrcElemTy->isDoubleTy()) {
+      for (unsigned i = 0; i < SrcNum; i++)
+        TempSrc.AggregateVal[i].IntVal =
+            APInt::doubleToBits(SrcVec.AggregateVal[i].DoubleVal);
+    } else if (SrcElemTy->isIntegerTy()) {
+      for (unsigned i = 0; i < SrcNum; i++)
+        TempSrc.AggregateVal[i].IntVal = SrcVec.AggregateVal[i].IntVal;
+    } else {
+      // Pointers are not allowed as the element type of vector.
+      llvm_unreachable("Invalid Bitcast");
+    }
+
+    // now TempSrc is integer type vector
+    if (DstNum < SrcNum) {
+      // Example: bitcast <4 x i32> <i32 0, i32 1, i32 2, i32 3> to <2 x i64>
+      unsigned Ratio = SrcNum / DstNum;
+      unsigned SrcElt = 0;
+      for (unsigned i = 0; i < DstNum; i++) {
+        GenericValue Elt;
+        Elt.IntVal = 0;
+        Elt.IntVal = Elt.IntVal.zext(DstBitSize);
+        unsigned ShiftAmt = isLittleEndian ? 0 : SrcBitSize * (Ratio - 1);
+        for (unsigned j = 0; j < Ratio; j++) {
+          APInt Tmp;
+          Tmp = Tmp.zext(SrcBitSize);
+          Tmp = TempSrc.AggregateVal[SrcElt++].IntVal;
+          Tmp = Tmp.zext(DstBitSize);
+          Tmp = Tmp.shl(ShiftAmt);
+          ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize;
+          Elt.IntVal |= Tmp;
+        }
+        TempDst.AggregateVal.push_back(Elt);
+      }
+    } else {
+      // Example: bitcast <2 x i64> <i64 0, i64 1> to <4 x i32>
+      unsigned Ratio = DstNum / SrcNum;
+      for (unsigned i = 0; i < SrcNum; i++) {
+        unsigned ShiftAmt = isLittleEndian ? 0 : DstBitSize * (Ratio - 1);
+        for (unsigned j = 0; j < Ratio; j++) {
+          GenericValue Elt;
+          Elt.IntVal = Elt.IntVal.zext(SrcBitSize);
+          Elt.IntVal = TempSrc.AggregateVal[i].IntVal;
+          Elt.IntVal = Elt.IntVal.lshr(ShiftAmt);
+          // it could be DstBitSize == SrcBitSize, so check it
+          if (DstBitSize < SrcBitSize)
+            Elt.IntVal = Elt.IntVal.trunc(DstBitSize);
+          ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize;
+          TempDst.AggregateVal.push_back(Elt);
+        }
+      }
+    }
+
+    // convert result from integer to specified type
+    if (DstTy->getTypeID() == Type::VectorTyID) {
+      if (DstElemTy->isDoubleTy()) {
+        Dest.AggregateVal.resize(DstNum);
+        for (unsigned i = 0; i < DstNum; i++)
+          Dest.AggregateVal[i].DoubleVal =
+              TempDst.AggregateVal[i].IntVal.bitsToDouble();
+      } else if (DstElemTy->isFloatTy()) {
+        Dest.AggregateVal.resize(DstNum);
+        for (unsigned i = 0; i < DstNum; i++)
+          Dest.AggregateVal[i].FloatVal =
+              TempDst.AggregateVal[i].IntVal.bitsToFloat();
+      } else {
+        Dest = TempDst;
+      }
+    } else {
+      if (DstElemTy->isDoubleTy())
+        Dest.DoubleVal = TempDst.AggregateVal[0].IntVal.bitsToDouble();
+      else if (DstElemTy->isFloatTy()) {
+        Dest.FloatVal = TempDst.AggregateVal[0].IntVal.bitsToFloat();
+      } else {
+        Dest.IntVal = TempDst.AggregateVal[0].IntVal;
+      }
+    }
+  } else { //  if ((SrcTy->getTypeID() == Type::VectorTyID) ||
+           //     (DstTy->getTypeID() == Type::VectorTyID))
+
+    // scalar src bitcast to scalar dst
+    if (DstTy->isPointerTy()) {
+      assert(SrcTy->isPointerTy() && "Invalid BitCast");
+      Dest.PointerVal = Src.PointerVal;
+    } else if (DstTy->isIntegerTy()) {
+      if (SrcTy->isFloatTy())
+        Dest.IntVal = APInt::floatToBits(Src.FloatVal);
+      else if (SrcTy->isDoubleTy()) {
+        Dest.IntVal = APInt::doubleToBits(Src.DoubleVal);
+      } else if (SrcTy->isIntegerTy()) {
+        Dest.IntVal = Src.IntVal;
+      } else {
+        llvm_unreachable("Invalid BitCast");
+      }
+    } else if (DstTy->isFloatTy()) {
+      if (SrcTy->isIntegerTy())
+        Dest.FloatVal = Src.IntVal.bitsToFloat();
+      else {
+        Dest.FloatVal = Src.FloatVal;
+      }
+    } else if (DstTy->isDoubleTy()) {
+      if (SrcTy->isIntegerTy())
+        Dest.DoubleVal = Src.IntVal.bitsToDouble();
+      else {
+        Dest.DoubleVal = Src.DoubleVal;
+      }
+    } else {
+      llvm_unreachable("Invalid Bitcast");
+    }
+  }
 
   return Dest;
 }
@@ -1456,10 +1775,204 @@ void Interpreter::visitExtractElementInst(ExtractElementInst &I) {
   SetValue(&I, Dest, SF);
 }
 
+void Interpreter::visitInsertElementInst(InsertElementInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Type *Ty = I.getType();
+
+  if(!(Ty->isVectorTy()) )
+    llvm_unreachable("Unhandled dest type for insertelement instruction");
+
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+  GenericValue Dest;
+
+  Type *TyContained = Ty->getContainedType(0);
+
+  const unsigned indx = unsigned(Src3.IntVal.getZExtValue());
+  Dest.AggregateVal = Src1.AggregateVal;
+
+  if(Src1.AggregateVal.size() <= indx)
+      llvm_unreachable("Invalid index in insertelement instruction");
+  switch (TyContained->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for insertelement instruction");
+    case Type::IntegerTyID:
+      Dest.AggregateVal[indx].IntVal = Src2.IntVal;
+      break;
+    case Type::FloatTyID:
+      Dest.AggregateVal[indx].FloatVal = Src2.FloatVal;
+      break;
+    case Type::DoubleTyID:
+      Dest.AggregateVal[indx].DoubleVal = Src2.DoubleVal;
+      break;
+  }
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
+  ExecutionContext &SF = ECStack.back();
+
+  Type *Ty = I.getType();
+  if(!(Ty->isVectorTy()))
+    llvm_unreachable("Unhandled dest type for shufflevector instruction");
+
+  GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
+  GenericValue Dest;
+
+  // There is no need to check types of src1 and src2, because the compiled
+  // bytecode can't contain different types for src1 and src2 for a
+  // shufflevector instruction.
+
+  Type *TyContained = Ty->getContainedType(0);
+  unsigned src1Size = (unsigned)Src1.AggregateVal.size();
+  unsigned src2Size = (unsigned)Src2.AggregateVal.size();
+  unsigned src3Size = (unsigned)Src3.AggregateVal.size();
+
+  Dest.AggregateVal.resize(src3Size);
+
+  switch (TyContained->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for insertelement instruction");
+      break;
+    case Type::IntegerTyID:
+      for( unsigned i=0; i<src3Size; i++) {
+        unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+        if(j < src1Size)
+          Dest.AggregateVal[i].IntVal = Src1.AggregateVal[j].IntVal;
+        else if(j < src1Size + src2Size)
+          Dest.AggregateVal[i].IntVal = Src2.AggregateVal[j-src1Size].IntVal;
+        else
+          // The selector may not be greater than sum of lengths of first and
+          // second operands and llasm should not allow situation like
+          // %tmp = shufflevector <2 x i32> <i32 3, i32 4>, <2 x i32> undef,
+          //                      <2 x i32> < i32 0, i32 5 >,
+          // where i32 5 is invalid, but let it be additional check here:
+          llvm_unreachable("Invalid mask in shufflevector instruction");
+      }
+      break;
+    case Type::FloatTyID:
+      for( unsigned i=0; i<src3Size; i++) {
+        unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+        if(j < src1Size)
+          Dest.AggregateVal[i].FloatVal = Src1.AggregateVal[j].FloatVal;
+        else if(j < src1Size + src2Size)
+          Dest.AggregateVal[i].FloatVal = Src2.AggregateVal[j-src1Size].FloatVal;
+        else
+          llvm_unreachable("Invalid mask in shufflevector instruction");
+        }
+      break;
+    case Type::DoubleTyID:
+      for( unsigned i=0; i<src3Size; i++) {
+        unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+        if(j < src1Size)
+          Dest.AggregateVal[i].DoubleVal = Src1.AggregateVal[j].DoubleVal;
+        else if(j < src1Size + src2Size)
+          Dest.AggregateVal[i].DoubleVal =
+            Src2.AggregateVal[j-src1Size].DoubleVal;
+        else
+          llvm_unreachable("Invalid mask in shufflevector instruction");
+      }
+      break;
+  }
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitExtractValueInst(ExtractValueInst &I) {
+  ExecutionContext &SF = ECStack.back();
+  Value *Agg = I.getAggregateOperand();
+  GenericValue Dest;
+  GenericValue Src = getOperandValue(Agg, SF);
+
+  ExtractValueInst::idx_iterator IdxBegin = I.idx_begin();
+  unsigned Num = I.getNumIndices();
+  GenericValue *pSrc = &Src;
+
+  for (unsigned i = 0 ; i < Num; ++i) {
+    pSrc = &pSrc->AggregateVal[*IdxBegin];
+    ++IdxBegin;
+  }
+
+  Type *IndexedType = ExtractValueInst::getIndexedType(Agg->getType(), I.getIndices());
+  switch (IndexedType->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for extractelement instruction");
+    break;
+    case Type::IntegerTyID:
+      Dest.IntVal = pSrc->IntVal;
+    break;
+    case Type::FloatTyID:
+      Dest.FloatVal = pSrc->FloatVal;
+    break;
+    case Type::DoubleTyID:
+      Dest.DoubleVal = pSrc->DoubleVal;
+    break;
+    case Type::ArrayTyID:
+    case Type::StructTyID:
+    case Type::VectorTyID:
+      Dest.AggregateVal = pSrc->AggregateVal;
+    break;
+    case Type::PointerTyID:
+      Dest.PointerVal = pSrc->PointerVal;
+    break;
+  }
+
+  SetValue(&I, Dest, SF);
+}
+
+void Interpreter::visitInsertValueInst(InsertValueInst &I) {
+
+  ExecutionContext &SF = ECStack.back();
+  Value *Agg = I.getAggregateOperand();
+
+  GenericValue Src1 = getOperandValue(Agg, SF);
+  GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
+  GenericValue Dest = Src1; // Dest is a slightly changed Src1
+
+  ExtractValueInst::idx_iterator IdxBegin = I.idx_begin();
+  unsigned Num = I.getNumIndices();
+
+  GenericValue *pDest = &Dest;
+  for (unsigned i = 0 ; i < Num; ++i) {
+    pDest = &pDest->AggregateVal[*IdxBegin];
+    ++IdxBegin;
+  }
+  // pDest points to the target value in the Dest now
+
+  Type *IndexedType = ExtractValueInst::getIndexedType(Agg->getType(), I.getIndices());
+
+  switch (IndexedType->getTypeID()) {
+    default:
+      llvm_unreachable("Unhandled dest type for insertelement instruction");
+    break;
+    case Type::IntegerTyID:
+      pDest->IntVal = Src2.IntVal;
+    break;
+    case Type::FloatTyID:
+      pDest->FloatVal = Src2.FloatVal;
+    break;
+    case Type::DoubleTyID:
+      pDest->DoubleVal = Src2.DoubleVal;
+    break;
+    case Type::ArrayTyID:
+    case Type::StructTyID:
+    case Type::VectorTyID:
+      pDest->AggregateVal = Src2.AggregateVal;
+    break;
+    case Type::PointerTyID:
+      pDest->PointerVal = Src2.PointerVal;
+    break;
+  }
+
+  SetValue(&I, Dest, SF);
+}
+
 GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
                                                 ExecutionContext &SF) {
   switch (CE->getOpcode()) {
-  case Instruction::Trunc:   
+  case Instruction::Trunc:
       return executeTruncInst(CE->getOperand(0), CE->getType(), SF);
   case Instruction::ZExt:
       return executeZExtInst(CE->getOperand(0), CE->getType(), SF);
@@ -1495,7 +2008,8 @@ GenericValue Interpreter::getConstantExprValue (ConstantExpr *CE,
   case Instruction::Select:
     return executeSelectInst(getOperandValue(CE->getOperand(0), SF),
                              getOperandValue(CE->getOperand(1), SF),
-                             getOperandValue(CE->getOperand(2), SF));
+                             getOperandValue(CE->getOperand(2), SF),
+                             CE->getOperand(0)->getType());
   default :
     break;
   }
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index bef4bbf..a03c7f5 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -406,6 +406,7 @@ GenericValue lle_X_sprintf(FunctionType *FT,
       break;
     }
   }
+  return GV;
 }
 
 // int printf(const char *, ...) - a very rough implementation to make output
@@ -434,7 +435,7 @@ GenericValue lle_X_sscanf(FunctionType *FT,
 
   GenericValue GV;
   GV.IntVal = APInt(32, sscanf(Args[0], Args[1], Args[2], Args[3], Args[4],
-                        Args[5], Args[6], Args[7], Args[8], Args[9]));
+                    Args[5], Args[6], Args[7], Args[8], Args[9]));
   return GV;
 }
 
@@ -450,7 +451,7 @@ GenericValue lle_X_scanf(FunctionType *FT,
 
   GenericValue GV;
   GV.IntVal = APInt(32, scanf( Args[0], Args[1], Args[2], Args[3], Args[4],
-                        Args[5], Args[6], Args[7], Args[8], Args[9]));
+                    Args[5], Args[6], Args[7], Args[8], Args[9]));
   return GV;
 }
 
@@ -470,6 +471,30 @@ GenericValue lle_X_fprintf(FunctionType *FT,
   return GV;
 }
 
+static GenericValue lle_X_memset(FunctionType *FT,
+                                 const std::vector<GenericValue> &Args) {
+  int val = (int)Args[1].IntVal.getSExtValue();
+  size_t len = (size_t)Args[2].IntVal.getZExtValue();
+  memset((void *)GVTOP(Args[0]), val, len);
+  // llvm.memset.* returns void, lle_X_* returns GenericValue,
+  // so here we return GenericValue with IntVal set to zero
+  GenericValue GV;
+  GV.IntVal = 0;
+  return GV;
+}
+
+static GenericValue lle_X_memcpy(FunctionType *FT,
+                                 const std::vector<GenericValue> &Args) {
+  memcpy(GVTOP(Args[0]), GVTOP(Args[1]),
+         (size_t)(Args[2].IntVal.getLimitedValue()));
+
+  // llvm.memcpy* returns void, lle_X_* returns GenericValue,
+  // so here we return GenericValue with IntVal set to zero
+  GenericValue GV;
+  GV.IntVal = 0;
+  return GV;
+}
+
 void Interpreter::initializeExternalFunctions() {
   sys::ScopedLock Writer(*FunctionsLock);
   FuncNames["lle_X_atexit"]       = lle_X_atexit;
@@ -481,4 +506,6 @@ void Interpreter::initializeExternalFunctions() {
   FuncNames["lle_X_sscanf"]       = lle_X_sscanf;
   FuncNames["lle_X_scanf"]        = lle_X_scanf;
   FuncNames["lle_X_fprintf"]      = lle_X_fprintf;
+  FuncNames["lle_X_memset"]       = lle_X_memset;
+  FuncNames["lle_X_memcpy"]       = lle_X_memcpy;
 }
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h
index 2952d7e..98269ef 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -179,6 +179,12 @@ public:
 
   void visitVAArgInst(VAArgInst &I);
   void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+
+  void visitExtractValueInst(ExtractValueInst &I);
+  void visitInsertValueInst(InsertValueInst &I);
+
   void visitInstruction(Instruction &I) {
     errs() << I << "\n";
     llvm_unreachable("Instruction not interpretable yet!");
diff --git a/lib/ExecutionEngine/JIT/CMakeLists.txt b/lib/ExecutionEngine/JIT/CMakeLists.txt
index 52bb389..e16baed 100644
--- a/lib/ExecutionEngine/JIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/JIT/CMakeLists.txt
@@ -3,7 +3,6 @@ add_definitions(-DENABLE_X86_JIT)
 
 add_llvm_library(LLVMJIT
   JIT.cpp
-  JITDwarfEmitter.cpp
   JITEmitter.cpp
   JITMemoryManager.cpp
   )
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 53ea0a2..246a675 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -67,140 +67,6 @@ static struct RegisterJIT {
 extern "C" void LLVMLinkInJIT() {
 }
 
-// Determine whether we can register EH tables.
-#if (defined(__GNUC__) && !defined(__ARM_EABI__) && \
-     !defined(__USING_SJLJ_EXCEPTIONS__))
-#define HAVE_EHTABLE_SUPPORT 1
-#else
-#define HAVE_EHTABLE_SUPPORT 0
-#endif
-
-#if HAVE_EHTABLE_SUPPORT
-
-// libgcc defines the __register_frame function to dynamically register new
-// dwarf frames for exception handling. This functionality is not portable
-// across compilers and is only provided by GCC. We use the __register_frame
-// function here so that code generated by the JIT cooperates with the unwinding
-// runtime of libgcc. When JITting with exception handling enable, LLVM
-// generates dwarf frames and registers it to libgcc with __register_frame.
-//
-// The __register_frame function works with Linux.
-//
-// Unfortunately, this functionality seems to be in libgcc after the unwinding
-// library of libgcc for darwin was written. The code for darwin overwrites the
-// value updated by __register_frame with a value fetched with "keymgr".
-// "keymgr" is an obsolete functionality, which should be rewritten some day.
-// In the meantime, since "keymgr" is on all libgccs shipped with apple-gcc, we
-// need a workaround in LLVM which uses the "keymgr" to dynamically modify the
-// values of an opaque key, used by libgcc to find dwarf tables.
-
-extern "C" void __register_frame(void*);
-extern "C" void __deregister_frame(void*);
-
-#if defined(__APPLE__) && MAC_OS_X_VERSION_MAX_ALLOWED <= 1050
-# define USE_KEYMGR 1
-#else
-# define USE_KEYMGR 0
-#endif
-
-#if USE_KEYMGR
-
-namespace {
-
-// LibgccObject - This is the structure defined in libgcc. There is no #include
-// provided for this structure, so we also define it here. libgcc calls it
-// "struct object". The structure is undocumented in libgcc.
-struct LibgccObject {
-  void *unused1;
-  void *unused2;
-  void *unused3;
-
-  /// frame - Pointer to the exception table.
-  void *frame;
-
-  /// encoding -  The encoding of the object?
-  union {
-    struct {
-      unsigned long sorted : 1;
-      unsigned long from_array : 1;
-      unsigned long mixed_encoding : 1;
-      unsigned long encoding : 8;
-      unsigned long count : 21;
-    } b;
-    size_t i;
-  } encoding;
-
-  /// fde_end - libgcc defines this field only if some macro is defined. We
-  /// include this field even if it may not there, to make libgcc happy.
-  char *fde_end;
-
-  /// next - At least we know it's a chained list!
-  struct LibgccObject *next;
-};
-
-// "kemgr" stuff. Apparently, all frame tables are stored there.
-extern "C" void _keymgr_set_and_unlock_processwide_ptr(int, void *);
-extern "C" void *_keymgr_get_and_lock_processwide_ptr(int);
-#define KEYMGR_GCC3_DW2_OBJ_LIST        302     /* Dwarf2 object list  */
-
-/// LibgccObjectInfo - libgcc defines this struct as km_object_info. It
-/// probably contains all dwarf tables that are loaded.
-struct LibgccObjectInfo {
-
-  /// seenObjects - LibgccObjects already parsed by the unwinding runtime.
-  ///
-  struct LibgccObject* seenObjects;
-
-  /// unseenObjects - LibgccObjects not parsed yet by the unwinding runtime.
-  ///
-  struct LibgccObject* unseenObjects;
-
-  unsigned unused[2];
-};
-
-/// darwin_register_frame - Since __register_frame does not work with darwin's
-/// libgcc,we provide our own function, which "tricks" libgcc by modifying the
-/// "Dwarf2 object list" key.
-void DarwinRegisterFrame(void* FrameBegin) {
-  // Get the key.
-  LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
-    _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
-  assert(LOI && "This should be preallocated by the runtime");
-
-  // Allocate a new LibgccObject to represent this frame. Deallocation of this
-  // object may be impossible: since darwin code in libgcc was written after
-  // the ability to dynamically register frames, things may crash if we
-  // deallocate it.
-  struct LibgccObject* ob = (struct LibgccObject*)
-    malloc(sizeof(struct LibgccObject));
-
-  // Do like libgcc for the values of the field.
-  ob->unused1 = (void *)-1;
-  ob->unused2 = 0;
-  ob->unused3 = 0;
-  ob->frame = FrameBegin;
-  ob->encoding.i = 0;
-  ob->encoding.b.encoding = llvm::dwarf::DW_EH_PE_omit;
-
-  // Put the info on both places, as libgcc uses the first or the second
-  // field. Note that we rely on having two pointers here. If fde_end was a
-  // char, things would get complicated.
-  ob->fde_end = (char*)LOI->unseenObjects;
-  ob->next = LOI->unseenObjects;
-
-  // Update the key's unseenObjects list.
-  LOI->unseenObjects = ob;
-
-  // Finally update the "key". Apparently, libgcc requires it.
-  _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST,
-                                         LOI);
-
-}
-
-}
-#endif // __APPLE__
-#endif // HAVE_EHTABLE_SUPPORT
-
 /// createJIT - This is the factory method for creating a JIT for the current
 /// machine, it does not fall back to the interpreter.  This takes ownership
 /// of the module.
@@ -293,33 +159,11 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
     report_fatal_error("Target does not support machine code emission!");
   }
 
-  // Register routine for informing unwinding runtime about new EH frames
-#if HAVE_EHTABLE_SUPPORT
-#if USE_KEYMGR
-  struct LibgccObjectInfo* LOI = (struct LibgccObjectInfo*)
-    _keymgr_get_and_lock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST);
-
-  // The key is created on demand, and libgcc creates it the first time an
-  // exception occurs. Since we need the key to register frames, we create
-  // it now.
-  if (!LOI)
-    LOI = (LibgccObjectInfo*)calloc(sizeof(struct LibgccObjectInfo), 1);
-  _keymgr_set_and_unlock_processwide_ptr(KEYMGR_GCC3_DW2_OBJ_LIST, LOI);
-  InstallExceptionTableRegister(DarwinRegisterFrame);
-  // Not sure about how to deregister on Darwin.
-#else
-  InstallExceptionTableRegister(__register_frame);
-  InstallExceptionTableDeregister(__deregister_frame);
-#endif // __APPLE__
-#endif // HAVE_EHTABLE_SUPPORT
-
   // Initialize passes.
   PM.doInitialization();
 }
 
 JIT::~JIT() {
-  // Unregister all exception tables registered by this JIT.
-  DeregisterAllTables();
   // Cleanup.
   AllJits->Remove(this);
   delete jitstate;
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
deleted file mode 100644
index 35d2b8b..0000000
--- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
+++ /dev/null
@@ -1,596 +0,0 @@
-//===----- JITDwarfEmitter.cpp - Write dwarf tables into memory -----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a JITDwarfEmitter object that is used by the JIT to
-// write dwarf tables to memory.
-//
-//===----------------------------------------------------------------------===//
-
-#include "JITDwarfEmitter.h"
-#include "JIT.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MachineLocation.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-using namespace llvm;
-
-JITDwarfEmitter::JITDwarfEmitter(JIT& theJit) : MMI(0), Jit(theJit) {}
-
-
-unsigned char* JITDwarfEmitter::EmitDwarfTable(MachineFunction& F,
-                                               JITCodeEmitter& jce,
-                                               unsigned char* StartFunction,
-                                               unsigned char* EndFunction,
-                                               unsigned char* &EHFramePtr) {
-  assert(MMI && "MachineModuleInfo not registered!");
-
-  const TargetMachine& TM = F.getTarget();
-  TD = TM.getDataLayout();
-  stackGrowthDirection = TM.getFrameLowering()->getStackGrowthDirection();
-  RI = TM.getRegisterInfo();
-  MAI = TM.getMCAsmInfo();
-  JCE = &jce;
-
-  unsigned char* ExceptionTable = EmitExceptionTable(&F, StartFunction,
-                                                     EndFunction);
-
-  unsigned char* Result = 0;
-
-  const std::vector<const Function *> Personalities = MMI->getPersonalities();
-  EHFramePtr = EmitCommonEHFrame(Personalities[MMI->getPersonalityIndex()]);
-
-  Result = EmitEHFrame(Personalities[MMI->getPersonalityIndex()], EHFramePtr,
-                       StartFunction, EndFunction, ExceptionTable);
-
-  return Result;
-}
-
-
-void
-JITDwarfEmitter::EmitFrameMoves(intptr_t BaseLabelPtr,
-                                const std::vector<MachineMove> &Moves) const {
-  unsigned PointerSize = TD->getPointerSize();
-  int stackGrowth = stackGrowthDirection == TargetFrameLowering::StackGrowsUp ?
-          PointerSize : -PointerSize;
-  MCSymbol *BaseLabel = 0;
-
-  for (unsigned i = 0, N = Moves.size(); i < N; ++i) {
-    const MachineMove &Move = Moves[i];
-    MCSymbol *Label = Move.getLabel();
-
-    // Throw out move if the label is invalid.
-    if (Label && (*JCE->getLabelLocations())[Label] == 0)
-      continue;
-
-    intptr_t LabelPtr = 0;
-    if (Label) LabelPtr = JCE->getLabelAddress(Label);
-
-    const MachineLocation &Dst = Move.getDestination();
-    const MachineLocation &Src = Move.getSource();
-
-    // Advance row if new location.
-    if (BaseLabelPtr && Label && BaseLabel != Label) {
-      JCE->emitByte(dwarf::DW_CFA_advance_loc4);
-      JCE->emitInt32(LabelPtr - BaseLabelPtr);
-
-      BaseLabel = Label;
-      BaseLabelPtr = LabelPtr;
-    }
-
-    // If advancing cfa.
-    if (Dst.isReg() && Dst.getReg() == MachineLocation::VirtualFP) {
-      if (!Src.isReg()) {
-        if (Src.getReg() == MachineLocation::VirtualFP) {
-          JCE->emitByte(dwarf::DW_CFA_def_cfa_offset);
-        } else {
-          JCE->emitByte(dwarf::DW_CFA_def_cfa);
-          JCE->emitULEB128Bytes(RI->getDwarfRegNum(Src.getReg(), true));
-        }
-
-        JCE->emitULEB128Bytes(-Src.getOffset());
-      } else {
-        llvm_unreachable("Machine move not supported yet.");
-      }
-    } else if (Src.isReg() &&
-      Src.getReg() == MachineLocation::VirtualFP) {
-      if (Dst.isReg()) {
-        JCE->emitByte(dwarf::DW_CFA_def_cfa_register);
-        JCE->emitULEB128Bytes(RI->getDwarfRegNum(Dst.getReg(), true));
-      } else {
-        llvm_unreachable("Machine move not supported yet.");
-      }
-    } else {
-      unsigned Reg = RI->getDwarfRegNum(Src.getReg(), true);
-      int Offset = Dst.getOffset() / stackGrowth;
-
-      if (Offset < 0) {
-        JCE->emitByte(dwarf::DW_CFA_offset_extended_sf);
-        JCE->emitULEB128Bytes(Reg);
-        JCE->emitSLEB128Bytes(Offset);
-      } else if (Reg < 64) {
-        JCE->emitByte(dwarf::DW_CFA_offset + Reg);
-        JCE->emitULEB128Bytes(Offset);
-      } else {
-        JCE->emitByte(dwarf::DW_CFA_offset_extended);
-        JCE->emitULEB128Bytes(Reg);
-        JCE->emitULEB128Bytes(Offset);
-      }
-    }
-  }
-}
-
-/// SharedTypeIds - How many leading type ids two landing pads have in common.
-static unsigned SharedTypeIds(const LandingPadInfo *L,
-                              const LandingPadInfo *R) {
-  const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
-  unsigned LSize = LIds.size(), RSize = RIds.size();
-  unsigned MinSize = LSize < RSize ? LSize : RSize;
-  unsigned Count = 0;
-
-  for (; Count != MinSize; ++Count)
-    if (LIds[Count] != RIds[Count])
-      return Count;
-
-  return Count;
-}
-
-
-/// PadLT - Order landing pads lexicographically by type id.
-static bool PadLT(const LandingPadInfo *L, const LandingPadInfo *R) {
-  const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
-  unsigned LSize = LIds.size(), RSize = RIds.size();
-  unsigned MinSize = LSize < RSize ? LSize : RSize;
-
-  for (unsigned i = 0; i != MinSize; ++i)
-    if (LIds[i] != RIds[i])
-      return LIds[i] < RIds[i];
-
-  return LSize < RSize;
-}
-
-namespace {
-
-/// ActionEntry - Structure describing an entry in the actions table.
-struct ActionEntry {
-  int ValueForTypeID; // The value to write - may not be equal to the type id.
-  int NextAction;
-  struct ActionEntry *Previous;
-};
-
-/// PadRange - Structure holding a try-range and the associated landing pad.
-struct PadRange {
-  // The index of the landing pad.
-  unsigned PadIndex;
-  // The index of the begin and end labels in the landing pad's label lists.
-  unsigned RangeIndex;
-};
-
-typedef DenseMap<MCSymbol*, PadRange> RangeMapType;
-
-/// CallSiteEntry - Structure describing an entry in the call-site table.
-struct CallSiteEntry {
-  MCSymbol *BeginLabel; // zero indicates the start of the function.
-  MCSymbol *EndLabel;   // zero indicates the end of the function.
-  MCSymbol *PadLabel;   // zero indicates that there is no landing pad.
-  unsigned Action;
-};
-
-}
-
-unsigned char* JITDwarfEmitter::EmitExceptionTable(MachineFunction* MF,
-                                         unsigned char* StartFunction,
-                                         unsigned char* EndFunction) const {
-  assert(MMI && "MachineModuleInfo not registered!");
-
-  // Map all labels and get rid of any dead landing pads.
-  MMI->TidyLandingPads(JCE->getLabelLocations());
-
-  const std::vector<const GlobalVariable *> &TypeInfos = MMI->getTypeInfos();
-  const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
-  const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
-  if (PadInfos.empty()) return 0;
-
-  // Sort the landing pads in order of their type ids.  This is used to fold
-  // duplicate actions.
-  SmallVector<const LandingPadInfo *, 64> LandingPads;
-  LandingPads.reserve(PadInfos.size());
-  for (unsigned i = 0, N = PadInfos.size(); i != N; ++i)
-    LandingPads.push_back(&PadInfos[i]);
-  std::sort(LandingPads.begin(), LandingPads.end(), PadLT);
-
-  // Negative type ids index into FilterIds, positive type ids index into
-  // TypeInfos.  The value written for a positive type id is just the type
-  // id itself.  For a negative type id, however, the value written is the
-  // (negative) byte offset of the corresponding FilterIds entry.  The byte
-  // offset is usually equal to the type id, because the FilterIds entries
-  // are written using a variable width encoding which outputs one byte per
-  // entry as long as the value written is not too large, but can differ.
-  // This kind of complication does not occur for positive type ids because
-  // type infos are output using a fixed width encoding.
-  // FilterOffsets[i] holds the byte offset corresponding to FilterIds[i].
-  SmallVector<int, 16> FilterOffsets;
-  FilterOffsets.reserve(FilterIds.size());
-  int Offset = -1;
-  for(std::vector<unsigned>::const_iterator I = FilterIds.begin(),
-    E = FilterIds.end(); I != E; ++I) {
-    FilterOffsets.push_back(Offset);
-    Offset -= MCAsmInfo::getULEB128Size(*I);
-  }
-
-  // Compute the actions table and gather the first action index for each
-  // landing pad site.
-  SmallVector<ActionEntry, 32> Actions;
-  SmallVector<unsigned, 64> FirstActions;
-  FirstActions.reserve(LandingPads.size());
-
-  int FirstAction = 0;
-  unsigned SizeActions = 0;
-  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
-    const LandingPadInfo *LP = LandingPads[i];
-    const std::vector<int> &TypeIds = LP->TypeIds;
-    const unsigned NumShared = i ? SharedTypeIds(LP, LandingPads[i-1]) : 0;
-    unsigned SizeSiteActions = 0;
-
-    if (NumShared < TypeIds.size()) {
-      unsigned SizeAction = 0;
-      ActionEntry *PrevAction = 0;
-
-      if (NumShared) {
-        const unsigned SizePrevIds = LandingPads[i-1]->TypeIds.size();
-        assert(Actions.size());
-        PrevAction = &Actions.back();
-        SizeAction = MCAsmInfo::getSLEB128Size(PrevAction->NextAction) +
-          MCAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
-        for (unsigned j = NumShared; j != SizePrevIds; ++j) {
-          SizeAction -= MCAsmInfo::getSLEB128Size(PrevAction->ValueForTypeID);
-          SizeAction += -PrevAction->NextAction;
-          PrevAction = PrevAction->Previous;
-        }
-      }
-
-      // Compute the actions.
-      for (unsigned I = NumShared, M = TypeIds.size(); I != M; ++I) {
-        int TypeID = TypeIds[I];
-        assert(-1-TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
-        int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
-        unsigned SizeTypeID = MCAsmInfo::getSLEB128Size(ValueForTypeID);
-
-        int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
-        SizeAction = SizeTypeID + MCAsmInfo::getSLEB128Size(NextAction);
-        SizeSiteActions += SizeAction;
-
-        ActionEntry Action = {ValueForTypeID, NextAction, PrevAction};
-        Actions.push_back(Action);
-
-        PrevAction = &Actions.back();
-      }
-
-      // Record the first action of the landing pad site.
-      FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
-    } // else identical - re-use previous FirstAction
-
-    FirstActions.push_back(FirstAction);
-
-    // Compute this sites contribution to size.
-    SizeActions += SizeSiteActions;
-  }
-
-  // Compute the call-site table.  Entries must be ordered by address.
-  SmallVector<CallSiteEntry, 64> CallSites;
-
-  RangeMapType PadMap;
-  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
-    const LandingPadInfo *LandingPad = LandingPads[i];
-    for (unsigned j=0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
-      MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
-      assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
-      PadRange P = { i, j };
-      PadMap[BeginLabel] = P;
-    }
-  }
-
-  bool MayThrow = false;
-  MCSymbol *LastLabel = 0;
-  for (MachineFunction::const_iterator I = MF->begin(), E = MF->end();
-        I != E; ++I) {
-    for (MachineBasicBlock::const_iterator MI = I->begin(), E = I->end();
-          MI != E; ++MI) {
-      if (!MI->isLabel()) {
-        MayThrow |= MI->isCall();
-        continue;
-      }
-
-      MCSymbol *BeginLabel = MI->getOperand(0).getMCSymbol();
-      assert(BeginLabel && "Invalid label!");
-
-      if (BeginLabel == LastLabel)
-        MayThrow = false;
-
-      RangeMapType::iterator L = PadMap.find(BeginLabel);
-
-      if (L == PadMap.end())
-        continue;
-
-      PadRange P = L->second;
-      const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
-
-      assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
-              "Inconsistent landing pad map!");
-
-      // If some instruction between the previous try-range and this one may
-      // throw, create a call-site entry with no landing pad for the region
-      // between the try-ranges.
-      if (MayThrow) {
-        CallSiteEntry Site = {LastLabel, BeginLabel, 0, 0};
-        CallSites.push_back(Site);
-      }
-
-      LastLabel = LandingPad->EndLabels[P.RangeIndex];
-      CallSiteEntry Site = {BeginLabel, LastLabel,
-        LandingPad->LandingPadLabel, FirstActions[P.PadIndex]};
-
-      assert(Site.BeginLabel && Site.EndLabel && Site.PadLabel &&
-              "Invalid landing pad!");
-
-      // Try to merge with the previous call-site.
-      if (CallSites.size()) {
-        CallSiteEntry &Prev = CallSites.back();
-        if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
-          // Extend the range of the previous entry.
-          Prev.EndLabel = Site.EndLabel;
-          continue;
-        }
-      }
-
-      // Otherwise, create a new call-site.
-      CallSites.push_back(Site);
-    }
-  }
-  // If some instruction between the previous try-range and the end of the
-  // function may throw, create a call-site entry with no landing pad for the
-  // region following the try-range.
-  if (MayThrow) {
-    CallSiteEntry Site = {LastLabel, 0, 0, 0};
-    CallSites.push_back(Site);
-  }
-
-  // Final tallies.
-  unsigned SizeSites = CallSites.size() * (sizeof(int32_t) + // Site start.
-                                            sizeof(int32_t) + // Site length.
-                                            sizeof(int32_t)); // Landing pad.
-  for (unsigned i = 0, e = CallSites.size(); i < e; ++i)
-    SizeSites += MCAsmInfo::getULEB128Size(CallSites[i].Action);
-
-  unsigned SizeTypes = TypeInfos.size() * TD->getPointerSize();
-
-  unsigned TypeOffset = sizeof(int8_t) + // Call site format
-                        // Call-site table length
-                        MCAsmInfo::getULEB128Size(SizeSites) +
-                        SizeSites + SizeActions + SizeTypes;
-
-  // Begin the exception table.
-  JCE->emitAlignmentWithFill(4, 0);
-  // Asm->EOL("Padding");
-
-  unsigned char* DwarfExceptionTable = (unsigned char*)JCE->getCurrentPCValue();
-
-  // Emit the header.
-  JCE->emitByte(dwarf::DW_EH_PE_omit);
-  // Asm->EOL("LPStart format (DW_EH_PE_omit)");
-  JCE->emitByte(dwarf::DW_EH_PE_absptr);
-  // Asm->EOL("TType format (DW_EH_PE_absptr)");
-  JCE->emitULEB128Bytes(TypeOffset);
-  // Asm->EOL("TType base offset");
-  JCE->emitByte(dwarf::DW_EH_PE_udata4);
-  // Asm->EOL("Call site format (DW_EH_PE_udata4)");
-  JCE->emitULEB128Bytes(SizeSites);
-  // Asm->EOL("Call-site table length");
-
-  // Emit the landing pad site information.
-  for (unsigned i = 0; i < CallSites.size(); ++i) {
-    CallSiteEntry &S = CallSites[i];
-    intptr_t BeginLabelPtr = 0;
-    intptr_t EndLabelPtr = 0;
-
-    if (!S.BeginLabel) {
-      BeginLabelPtr = (intptr_t)StartFunction;
-      JCE->emitInt32(0);
-    } else {
-      BeginLabelPtr = JCE->getLabelAddress(S.BeginLabel);
-      JCE->emitInt32(BeginLabelPtr - (intptr_t)StartFunction);
-    }
-
-    // Asm->EOL("Region start");
-
-    if (!S.EndLabel)
-      EndLabelPtr = (intptr_t)EndFunction;
-    else
-      EndLabelPtr = JCE->getLabelAddress(S.EndLabel);
-
-    JCE->emitInt32(EndLabelPtr - BeginLabelPtr);
-    //Asm->EOL("Region length");
-
-    if (!S.PadLabel) {
-      JCE->emitInt32(0);
-    } else {
-      unsigned PadLabelPtr = JCE->getLabelAddress(S.PadLabel);
-      JCE->emitInt32(PadLabelPtr - (intptr_t)StartFunction);
-    }
-    // Asm->EOL("Landing pad");
-
-    JCE->emitULEB128Bytes(S.Action);
-    // Asm->EOL("Action");
-  }
-
-  // Emit the actions.
-  for (unsigned I = 0, N = Actions.size(); I != N; ++I) {
-    ActionEntry &Action = Actions[I];
-
-    JCE->emitSLEB128Bytes(Action.ValueForTypeID);
-    //Asm->EOL("TypeInfo index");
-    JCE->emitSLEB128Bytes(Action.NextAction);
-    //Asm->EOL("Next action");
-  }
-
-  // Emit the type ids.
-  for (unsigned M = TypeInfos.size(); M; --M) {
-    const GlobalVariable *GV = TypeInfos[M - 1];
-
-    if (GV) {
-      if (TD->getPointerSize() == sizeof(int32_t))
-        JCE->emitInt32((intptr_t)Jit.getOrEmitGlobalVariable(GV));
-      else
-        JCE->emitInt64((intptr_t)Jit.getOrEmitGlobalVariable(GV));
-    } else {
-      if (TD->getPointerSize() == sizeof(int32_t))
-        JCE->emitInt32(0);
-      else
-        JCE->emitInt64(0);
-    }
-    // Asm->EOL("TypeInfo");
-  }
-
-  // Emit the filter typeids.
-  for (unsigned j = 0, M = FilterIds.size(); j < M; ++j) {
-    unsigned TypeID = FilterIds[j];
-    JCE->emitULEB128Bytes(TypeID);
-    //Asm->EOL("Filter TypeInfo index");
-  }
-
-  JCE->emitAlignmentWithFill(4, 0);
-
-  return DwarfExceptionTable;
-}
-
-unsigned char*
-JITDwarfEmitter::EmitCommonEHFrame(const Function* Personality) const {
-  unsigned PointerSize = TD->getPointerSize();
-  int stackGrowth = stackGrowthDirection == TargetFrameLowering::StackGrowsUp ?
-          PointerSize : -PointerSize;
-
-  unsigned char* StartCommonPtr = (unsigned char*)JCE->getCurrentPCValue();
-  // EH Common Frame header
-  JCE->allocateSpace(4, 0);
-  unsigned char* FrameCommonBeginPtr = (unsigned char*)JCE->getCurrentPCValue();
-  JCE->emitInt32((int)0);
-  JCE->emitByte(dwarf::DW_CIE_VERSION);
-  JCE->emitString(Personality ? "zPLR" : "zR");
-  JCE->emitULEB128Bytes(1);
-  JCE->emitSLEB128Bytes(stackGrowth);
-  JCE->emitByte(RI->getDwarfRegNum(RI->getRARegister(), true));
-
-  if (Personality) {
-    // Augmentation Size: 3 small ULEBs of one byte each, and the personality
-    // function which size is PointerSize.
-    JCE->emitULEB128Bytes(3 + PointerSize);
-
-    // We set the encoding of the personality as direct encoding because we use
-    // the function pointer. The encoding is not relative because the current
-    // PC value may be bigger than the personality function pointer.
-    if (PointerSize == 4) {
-      JCE->emitByte(dwarf::DW_EH_PE_sdata4);
-      JCE->emitInt32(((intptr_t)Jit.getPointerToGlobal(Personality)));
-    } else {
-      JCE->emitByte(dwarf::DW_EH_PE_sdata8);
-      JCE->emitInt64(((intptr_t)Jit.getPointerToGlobal(Personality)));
-    }
-
-    // LSDA encoding: This must match the encoding used in EmitEHFrame ()
-    if (PointerSize == 4)
-      JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
-    else
-      JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8);
-    JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
-  } else {
-    JCE->emitULEB128Bytes(1);
-    JCE->emitULEB128Bytes(dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4);
-  }
-
-  EmitFrameMoves(0, MAI->getInitialFrameState());
-
-  JCE->emitAlignmentWithFill(PointerSize, dwarf::DW_CFA_nop);
-
-  JCE->emitInt32At((uintptr_t*)StartCommonPtr,
-                   (uintptr_t)((unsigned char*)JCE->getCurrentPCValue() -
-                               FrameCommonBeginPtr));
-
-  return StartCommonPtr;
-}
-
-
-unsigned char*
-JITDwarfEmitter::EmitEHFrame(const Function* Personality,
-                             unsigned char* StartCommonPtr,
-                             unsigned char* StartFunction,
-                             unsigned char* EndFunction,
-                             unsigned char* ExceptionTable) const {
-  unsigned PointerSize = TD->getPointerSize();
-
-  // EH frame header.
-  unsigned char* StartEHPtr = (unsigned char*)JCE->getCurrentPCValue();
-  JCE->allocateSpace(4, 0);
-  unsigned char* FrameBeginPtr = (unsigned char*)JCE->getCurrentPCValue();
-  // FDE CIE Offset
-  JCE->emitInt32(FrameBeginPtr - StartCommonPtr);
-  JCE->emitInt32(StartFunction - (unsigned char*)JCE->getCurrentPCValue());
-  JCE->emitInt32(EndFunction - StartFunction);
-
-  // If there is a personality and landing pads then point to the language
-  // specific data area in the exception table.
-  if (Personality) {
-    JCE->emitULEB128Bytes(PointerSize == 4 ? 4 : 8);
-
-    if (PointerSize == 4) {
-      if (!MMI->getLandingPads().empty())
-        JCE->emitInt32(ExceptionTable-(unsigned char*)JCE->getCurrentPCValue());
-      else
-        JCE->emitInt32((int)0);
-    } else {
-      if (!MMI->getLandingPads().empty())
-        JCE->emitInt64(ExceptionTable-(unsigned char*)JCE->getCurrentPCValue());
-      else
-        JCE->emitInt64((int)0);
-    }
-  } else {
-    JCE->emitULEB128Bytes(0);
-  }
-
-  // Indicate locations of function specific  callee saved registers in
-  // frame.
-  EmitFrameMoves((intptr_t)StartFunction, MMI->getFrameMoves());
-
-  JCE->emitAlignmentWithFill(PointerSize, dwarf::DW_CFA_nop);
-
-  // Indicate the size of the table
-  JCE->emitInt32At((uintptr_t*)StartEHPtr,
-                   (uintptr_t)((unsigned char*)JCE->getCurrentPCValue() -
-                               StartEHPtr));
-
-  // Double zeroes for the unwind runtime
-  if (PointerSize == 8) {
-    JCE->emitInt64(0);
-    JCE->emitInt64(0);
-  } else {
-    JCE->emitInt32(0);
-    JCE->emitInt32(0);
-  }
-
-  return StartEHPtr;
-}
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
deleted file mode 100644
index 98ac340..0000000
--- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
+++ /dev/null
@@ -1,77 +0,0 @@
-//===------ JITDwarfEmitter.h - Write dwarf tables into memory ------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a JITDwarfEmitter object that is used by the JIT to
-// write dwarf tables to memory.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
-#define LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
-
-#include "llvm/Support/DataTypes.h"
-#include <vector>
-
-namespace llvm {
-
-class Function;
-class JIT;
-class JITCodeEmitter;
-class MachineFunction;
-class MachineModuleInfo;
-class MachineMove;
-class MCAsmInfo;
-class DataLayout;
-class TargetMachine;
-class TargetRegisterInfo;
-
-class JITDwarfEmitter {
-  const DataLayout* TD;
-  JITCodeEmitter* JCE;
-  const TargetRegisterInfo* RI;
-  const MCAsmInfo *MAI;
-  MachineModuleInfo* MMI;
-  JIT& Jit;
-  bool stackGrowthDirection;
-
-  unsigned char* EmitExceptionTable(MachineFunction* MF,
-                                    unsigned char* StartFunction,
-                                    unsigned char* EndFunction) const;
-
-  void EmitFrameMoves(intptr_t BaseLabelPtr,
-                      const std::vector<MachineMove> &Moves) const;
-
-  unsigned char* EmitCommonEHFrame(const Function* Personality) const;
-
-  unsigned char* EmitEHFrame(const Function* Personality,
-                             unsigned char* StartBufferPtr,
-                             unsigned char* StartFunction,
-                             unsigned char* EndFunction,
-                             unsigned char* ExceptionTable) const;
-
-public:
-
-  JITDwarfEmitter(JIT& jit);
-
-  unsigned char* EmitDwarfTable(MachineFunction& F,
-                                JITCodeEmitter& JCE,
-                                unsigned char* StartFunction,
-                                unsigned char* EndFunction,
-                                unsigned char* &EHFramePtr);
-
-
-  void setModuleInfo(MachineModuleInfo* Info) {
-    MMI = Info;
-  }
-};
-
-
-} // end namespace llvm
-
-#endif // LLVM_EXECUTION_ENGINE_JIT_DWARFEMITTER_H
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index c273876..acbbfa1 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -14,7 +14,6 @@
 
 #define DEBUG_TYPE "jit"
 #include "JIT.h"
-#include "JITDwarfEmitter.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -325,9 +324,6 @@ namespace {
     /// Resolver - This contains info about the currently resolved functions.
     JITResolver Resolver;
 
-    /// DE - The dwarf emitter for the jit.
-    OwningPtr<JITDwarfEmitter> DE;
-
     /// LabelLocations - This vector is a mapping from Label ID's to their
     /// address.
     DenseMap<MCSymbol*, uintptr_t> LabelLocations;
@@ -363,22 +359,16 @@ namespace {
     /// Instance of the JIT
     JIT *TheJIT;
 
-    bool JITExceptionHandling;
-
   public:
     JITEmitter(JIT &jit, JITMemoryManager *JMM, TargetMachine &TM)
       : SizeEstimate(0), Resolver(jit, *this), MMI(0), CurFn(0),
-        EmittedFunctions(this), TheJIT(&jit),
-        JITExceptionHandling(TM.Options.JITExceptionHandling) {
+        EmittedFunctions(this), TheJIT(&jit) {
       MemMgr = JMM ? JMM : JITMemoryManager::CreateDefaultMemManager();
       if (jit.getJITInfo().needsGOT()) {
         MemMgr->AllocateGOT();
         DEBUG(dbgs() << "JIT is managing a GOT\n");
       }
 
-      if (JITExceptionHandling) {
-        DE.reset(new JITDwarfEmitter(jit));
-      }
     }
     ~JITEmitter() {
       delete MemMgr;
@@ -460,7 +450,6 @@ namespace {
 
     virtual void setModuleInfo(MachineModuleInfo* Info) {
       MMI = Info;
-      if (DE.get()) DE->setModuleInfo(Info);
     }
 
   private:
@@ -964,40 +953,6 @@ bool JITEmitter::finishFunction(MachineFunction &F) {
       }
     });
 
-  if (JITExceptionHandling) {
-    uintptr_t ActualSize = 0;
-    SavedBufferBegin = BufferBegin;
-    SavedBufferEnd = BufferEnd;
-    SavedCurBufferPtr = CurBufferPtr;
-    uint8_t *FrameRegister;
-
-    while (true) {
-      BufferBegin = CurBufferPtr = MemMgr->startExceptionTable(F.getFunction(),
-                                                               ActualSize);
-      BufferEnd = BufferBegin+ActualSize;
-      EmittedFunctions[F.getFunction()].ExceptionTable = BufferBegin;
-      uint8_t *EhStart;
-      FrameRegister = DE->EmitDwarfTable(F, *this, FnStart, FnEnd, EhStart);
-
-      // If the buffer was large enough to hold the table then we are done.
-      if (CurBufferPtr != BufferEnd)
-        break;
-
-      // Try again with twice as much space.
-      ActualSize = (CurBufferPtr - BufferBegin) * 2;
-      MemMgr->deallocateExceptionTable(BufferBegin);
-    }
-    MemMgr->endExceptionTable(F.getFunction(), BufferBegin, CurBufferPtr,
-                              FrameRegister);
-    BufferBegin = SavedBufferBegin;
-    BufferEnd = SavedBufferEnd;
-    CurBufferPtr = SavedCurBufferPtr;
-
-    if (JITExceptionHandling) {
-      TheJIT->RegisterTable(F.getFunction(), FrameRegister);
-    }
-  }
-
   if (MMI)
     MMI->EndFunction();
 
@@ -1027,15 +982,10 @@ void JITEmitter::deallocateMemForFunction(const Function *F) {
     Emitted = EmittedFunctions.find(F);
   if (Emitted != EmittedFunctions.end()) {
     MemMgr->deallocateFunctionBody(Emitted->second.FunctionBody);
-    MemMgr->deallocateExceptionTable(Emitted->second.ExceptionTable);
     TheJIT->NotifyFreeingMachineCode(Emitted->second.Code);
 
     EmittedFunctions.erase(Emitted);
   }
-
-  if (JITExceptionHandling) {
-    TheJIT->DeregisterTable(F);
-  }
 }
 
 
diff --git a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
index 66aeb77..f58d31b 100644
--- a/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
+++ b/lib/ExecutionEngine/JIT/JITMemoryManager.cpp
@@ -464,11 +464,15 @@ namespace {
 
     /// allocateCodeSection - Allocate memory for a code section.
     uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID) {
+                                 unsigned SectionID, StringRef SectionName) {
       // Grow the required block size to account for the block header
       Size += sizeof(*CurBlock);
 
-      // FIXME: Alignement handling.
+      // Alignment handling.
+      if (!Alignment)
+        Alignment = 16;
+      Size += Alignment - 1;
+
       FreeRangeHeader* candidateBlock = FreeMemoryList;
       FreeRangeHeader* head = FreeMemoryList;
       FreeRangeHeader* iter = head->Next;
@@ -500,39 +504,21 @@ namespace {
       FreeMemoryList = candidateBlock->AllocateBlock();
       // Release the memory at the end of this block that isn't needed.
       FreeMemoryList = CurBlock->TrimAllocationToSize(FreeMemoryList, Size);
-      return (uint8_t *)(CurBlock + 1);
+      uintptr_t unalignedAddr = (uintptr_t)CurBlock + sizeof(*CurBlock);
+      return (uint8_t*)RoundUpToAlignment((uint64_t)unalignedAddr, Alignment);
     }
 
     /// allocateDataSection - Allocate memory for a data section.
     uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
-                                 unsigned SectionID, bool IsReadOnly) {
+                                 unsigned SectionID, StringRef SectionName,
+                                 bool IsReadOnly) {
       return (uint8_t*)DataAllocator.Allocate(Size, Alignment);
     }
 
-    bool applyPermissions(std::string *ErrMsg) {
+    bool finalizeMemory(std::string *ErrMsg) {
       return false;
     }
 
-    /// startExceptionTable - Use startFunctionBody to allocate memory for the
-    /// function's exception table.
-    uint8_t* startExceptionTable(const Function* F, uintptr_t &ActualSize) {
-      return startFunctionBody(F, ActualSize);
-    }
-
-    /// endExceptionTable - The exception table of F is now allocated,
-    /// and takes the memory in the range [TableStart,TableEnd).
-    void endExceptionTable(const Function *F, uint8_t *TableStart,
-                           uint8_t *TableEnd, uint8_t* FrameRegister) {
-      assert(TableEnd > TableStart);
-      assert(TableStart == (uint8_t *)(CurBlock+1) &&
-             "Mismatched table start/end!");
-
-      uintptr_t BlockSize = TableEnd - (uint8_t *)CurBlock;
-
-      // Release the memory at the end of this block that isn't needed.
-      FreeMemoryList =CurBlock->TrimAllocationToSize(FreeMemoryList, BlockSize);
-    }
-
     uint8_t *getGOTBase() const {
       return GOTBase;
     }
@@ -557,12 +543,6 @@ namespace {
       if (Body) deallocateBlock(Body);
     }
 
-    /// deallocateExceptionTable - Deallocate memory for the specified
-    /// exception table.
-    void deallocateExceptionTable(void *ET) {
-      if (ET) deallocateBlock(ET);
-    }
-
     /// setMemoryWritable - When code generation is in progress,
     /// the code pages may need permissions changed.
     void setMemoryWritable()
@@ -814,7 +794,7 @@ static void runAtExitHandlers() {
 // not inlined, and hiding their real definitions in a separate archive file
 // that the dynamic linker can't see. For more info, search for
 // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
-#if defined(__linux__)
+#if defined(__linux__) && defined(__GLIBC__)
 /* stat functions are redirecting to __xstat with a version number.  On x86-64
  * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat'
  * available as an exported symbol, so we have to add it explicitly.
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 38aa547..195c458 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -14,10 +14,12 @@
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/PassManager.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -39,7 +41,7 @@ extern "C" void LLVMLinkInMCJIT() {
 
 ExecutionEngine *MCJIT::createJIT(Module *M,
                                   std::string *ErrorStr,
-                                  JITMemoryManager *JMM,
+                                  RTDyldMemoryManager *MemMgr,
                                   bool GVsWithCode,
                                   TargetMachine *TM) {
   // Try to register the program as a source of symbols to resolve against.
@@ -47,43 +49,69 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
   // FIXME: Don't do this here.
   sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
 
-  return new MCJIT(M, TM, JMM ? JMM : new SectionMemoryManager(), GVsWithCode);
+  return new MCJIT(M, TM, MemMgr ? MemMgr : new SectionMemoryManager(),
+                   GVsWithCode);
 }
 
 MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM,
              bool AllocateGVsWithCode)
-  : ExecutionEngine(m), TM(tm), Ctx(0),
-    MemMgr(MM ? MM : new SectionMemoryManager()), Dyld(MemMgr),
-    IsLoaded(false), M(m), ObjCache(0)  {
+  : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(this, MM), Dyld(&MemMgr),
+    ObjCache(0) {
 
+  OwnedModules.addModule(m);
   setDataLayout(TM->getDataLayout());
 }
 
 MCJIT::~MCJIT() {
-  if (LoadedObject)
-    NotifyFreeingObject(*LoadedObject.get());
-  delete MemMgr;
+  MutexGuard locked(lock);
+  // FIXME: We are managing our modules, so we do not want the base class
+  // ExecutionEngine to manage them as well. To avoid double destruction
+  // of the first (and only) module added in ExecutionEngine constructor
+  // we remove it from EE and will destruct it ourselves.
+  //
+  // It may make sense to move our module manager (based on SmallStPtr) back
+  // into EE if the JIT and Interpreter can live with it.
+  // If so, additional functions: addModule, removeModule, FindFunctionNamed,
+  // runStaticConstructorsDestructors could be moved back to EE as well.
+  //
+  Modules.clear();
+  Dyld.deregisterEHFrames();
+
+  LoadedObjectMap::iterator it, end = LoadedObjects.end();
+  for (it = LoadedObjects.begin(); it != end; ++it) {
+    ObjectImage *Obj = it->second;
+    if (Obj) {
+      NotifyFreeingObject(*Obj);
+      delete Obj;
+    }
+  }
+  LoadedObjects.clear();
   delete TM;
 }
 
+void MCJIT::addModule(Module *M) {
+  MutexGuard locked(lock);
+  OwnedModules.addModule(M);
+}
+
+bool MCJIT::removeModule(Module *M) {
+  MutexGuard locked(lock);
+  return OwnedModules.removeModule(M);
+}
+
+
+
 void MCJIT::setObjectCache(ObjectCache* NewCache) {
+  MutexGuard locked(lock);
   ObjCache = NewCache;
 }
 
-ObjectBufferStream* MCJIT::emitObject(Module *m) {
-  /// Currently, MCJIT only supports a single module and the module passed to
-  /// this function call is expected to be the contained module.  The module
-  /// is passed as a parameter here to prepare for multiple module support in
-  /// the future.
-  assert(M == m);
-
-  // Get a thread lock to make sure we aren't trying to compile multiple times
+ObjectBufferStream* MCJIT::emitObject(Module *M) {
   MutexGuard locked(lock);
 
-  // FIXME: Track compilation state on a per-module basis when multiple modules
-  //        are supported.
-  // Re-compilation is not supported
-  assert(!IsLoaded);
+  // This must be a module which has already been added but not loaded to this
+  // MCJIT instance, since these conditions are tested by our caller,
+  // generateCodeForModule.
 
   PassManager PM;
 
@@ -99,7 +127,7 @@ ObjectBufferStream* MCJIT::emitObject(Module *m) {
   }
 
   // Initialize passes.
-  PM.run(*m);
+  PM.run(*M);
   // Flush the output buffer to get the generated code into memory
   CompiledObject->flush();
 
@@ -109,27 +137,28 @@ ObjectBufferStream* MCJIT::emitObject(Module *m) {
     // MemoryBuffer is a thin wrapper around the actual memory, so it's OK
     // to create a temporary object here and delete it after the call.
     OwningPtr<MemoryBuffer> MB(CompiledObject->getMemBuffer());
-    ObjCache->notifyObjectCompiled(m, MB.get());
+    ObjCache->notifyObjectCompiled(M, MB.get());
   }
 
   return CompiledObject.take();
 }
 
-void MCJIT::loadObject(Module *M) {
-
+void MCJIT::generateCodeForModule(Module *M) {
   // Get a thread lock to make sure we aren't trying to load multiple times
   MutexGuard locked(lock);
 
-  // FIXME: Track compilation state on a per-module basis when multiple modules
-  //        are supported.
+  // This must be a module which has already been added to this MCJIT instance.
+  assert(OwnedModules.ownsModule(M) &&
+         "MCJIT::generateCodeForModule: Unknown module.");
+
   // Re-compilation is not supported
-  if (IsLoaded)
+  if (OwnedModules.hasModuleBeenLoaded(M))
     return;
 
   OwningPtr<ObjectBuffer> ObjectToLoad;
   // Try to load the pre-compiled object from cache if possible
   if (0 != ObjCache) {
-    OwningPtr<MemoryBuffer> PreCompiledObject(ObjCache->getObjectCopy(M));
+    OwningPtr<MemoryBuffer> PreCompiledObject(ObjCache->getObject(M));
     if (0 != PreCompiledObject.get())
       ObjectToLoad.reset(new ObjectBuffer(PreCompiledObject.take()));
   }
@@ -141,59 +170,137 @@ void MCJIT::loadObject(Module *M) {
   }
 
   // Load the object into the dynamic linker.
-  // handing off ownership of the buffer
-  LoadedObject.reset(Dyld.loadObject(ObjectToLoad.take()));
+  // MCJIT now owns the ObjectImage pointer (via its LoadedObjects map).
+  ObjectImage *LoadedObject = Dyld.loadObject(ObjectToLoad.take());
+  LoadedObjects[M] = LoadedObject;
   if (!LoadedObject)
     report_fatal_error(Dyld.getErrorString());
 
-  // Resolve any relocations.
-  Dyld.resolveRelocations();
-
   // FIXME: Make this optional, maybe even move it to a JIT event listener
   LoadedObject->registerWithDebugger();
 
   NotifyObjectEmitted(*LoadedObject);
 
-  // FIXME: Add support for per-module compilation state
-  IsLoaded = true;
+  OwnedModules.markModuleAsLoaded(M);
 }
 
-// FIXME: Add a parameter to identify which object is being finalized when
-// MCJIT supports multiple modules.
-// FIXME: Provide a way to separate code emission, relocations and page 
-// protection in the interface.
+void MCJIT::finalizeLoadedModules() {
+  MutexGuard locked(lock);
+
+  // Resolve any outstanding relocations.
+  Dyld.resolveRelocations();
+
+  OwnedModules.markAllLoadedModulesAsFinalized();
+
+  // Register EH frame data for any module we own which has been loaded
+  Dyld.registerEHFrames();
+
+  // Set page permissions.
+  MemMgr.finalizeMemory();
+}
+
+// FIXME: Rename this.
 void MCJIT::finalizeObject() {
-  // If the module hasn't been compiled, just do that.
-  if (!IsLoaded) {
-    // If the call to Dyld.resolveRelocations() is removed from loadObject()
-    // we'll need to do that here.
-    loadObject(M);
-  } else {
-    // Resolve any relocations.
-    Dyld.resolveRelocations();
+  MutexGuard locked(lock);
+
+  for (ModulePtrSet::iterator I = OwnedModules.begin_added(),
+                              E = OwnedModules.end_added();
+       I != E; ++I) {
+    Module *M = *I;
+    generateCodeForModule(M);
   }
 
-  StringRef EHData = Dyld.getEHFrameSection();
-  if (!EHData.empty())
-    MemMgr->registerEHFrames(EHData);
+  finalizeLoadedModules();
+}
 
-  // Set page permissions.
-  MemMgr->applyPermissions();
+void MCJIT::finalizeModule(Module *M) {
+  MutexGuard locked(lock);
+
+  // This must be a module which has already been added to this MCJIT instance.
+  assert(OwnedModules.ownsModule(M) && "MCJIT::finalizeModule: Unknown module.");
+
+  // If the module hasn't been compiled, just do that.
+  if (!OwnedModules.hasModuleBeenLoaded(M))
+    generateCodeForModule(M);
+
+  finalizeLoadedModules();
 }
 
 void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
   report_fatal_error("not yet implemented");
 }
 
-void *MCJIT::getPointerToFunction(Function *F) {
-  // FIXME: This should really return a uint64_t since it's a pointer in the
-  // target address space, not our local address space. That's part of the
-  // ExecutionEngine interface, though. Fix that when the old JIT finally
-  // dies.
+uint64_t MCJIT::getExistingSymbolAddress(const std::string &Name) {
+  // Check with the RuntimeDyld to see if we already have this symbol.
+  if (Name[0] == '\1')
+    return Dyld.getSymbolLoadAddress(Name.substr(1));
+  return Dyld.getSymbolLoadAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+                                       + Name));
+}
+
+Module *MCJIT::findModuleForSymbol(const std::string &Name,
+                                   bool CheckFunctionsOnly) {
+  MutexGuard locked(lock);
+
+  // If it hasn't already been generated, see if it's in one of our modules.
+  for (ModulePtrSet::iterator I = OwnedModules.begin_added(),
+                              E = OwnedModules.end_added();
+       I != E; ++I) {
+    Module *M = *I;
+    Function *F = M->getFunction(Name);
+    if (F && !F->isDeclaration())
+      return M;
+    if (!CheckFunctionsOnly) {
+      GlobalVariable *G = M->getGlobalVariable(Name);
+      if (G && !G->isDeclaration())
+        return M;
+      // FIXME: Do we need to worry about global aliases?
+    }
+  }
+  // We didn't find the symbol in any of our modules.
+  return NULL;
+}
+
+uint64_t MCJIT::getSymbolAddress(const std::string &Name,
+                                 bool CheckFunctionsOnly)
+{
+  MutexGuard locked(lock);
+
+  // First, check to see if we already have this symbol.
+  uint64_t Addr = getExistingSymbolAddress(Name);
+  if (Addr)
+    return Addr;
+
+  // If it hasn't already been generated, see if it's in one of our modules.
+  Module *M = findModuleForSymbol(Name, CheckFunctionsOnly);
+  if (!M)
+    return 0;
+
+  generateCodeForModule(M);
+
+  // Check the RuntimeDyld table again, it should be there now.
+  return getExistingSymbolAddress(Name);
+}
 
-  // FIXME: Add support for per-module compilation state
-  if (!IsLoaded)
-    loadObject(M);
+uint64_t MCJIT::getGlobalValueAddress(const std::string &Name) {
+  MutexGuard locked(lock);
+  uint64_t Result = getSymbolAddress(Name, false);
+  if (Result != 0)
+    finalizeLoadedModules();
+  return Result;
+}
+
+uint64_t MCJIT::getFunctionAddress(const std::string &Name) {
+  MutexGuard locked(lock);
+  uint64_t Result = getSymbolAddress(Name, true);
+  if (Result != 0)
+    finalizeLoadedModules();
+  return Result;
+}
+
+// Deprecated.  Use getFunctionAddress instead.
+void *MCJIT::getPointerToFunction(Function *F) {
+  MutexGuard locked(lock);
 
   if (F->isDeclaration() || F->hasAvailableExternallyLinkage()) {
     bool AbortOnFailure = !F->hasExternalWeakLinkage();
@@ -202,6 +309,16 @@ void *MCJIT::getPointerToFunction(Function *F) {
     return Addr;
   }
 
+  Module *M = F->getParent();
+  bool HasBeenAddedButNotLoaded = OwnedModules.hasModuleBeenAddedButNotLoaded(M);
+
+  // Make sure the relevant module has been compiled and loaded.
+  if (HasBeenAddedButNotLoaded)
+    generateCodeForModule(M);
+  else if (!OwnedModules.hasModuleBeenLoaded(M))
+    // If this function doesn't belong to one of our modules, we're done.
+    return NULL;
+
   // FIXME: Should the Dyld be retaining module information? Probably not.
   // FIXME: Should we be using the mangler for this? Probably.
   //
@@ -222,6 +339,45 @@ void MCJIT::freeMachineCodeForFunction(Function *F) {
   report_fatal_error("not yet implemented");
 }
 
+void MCJIT::runStaticConstructorsDestructorsInModulePtrSet(
+    bool isDtors, ModulePtrSet::iterator I, ModulePtrSet::iterator E) {
+  for (; I != E; ++I) {
+    ExecutionEngine::runStaticConstructorsDestructors(*I, isDtors);
+  }
+}
+
+void MCJIT::runStaticConstructorsDestructors(bool isDtors) {
+  // Execute global ctors/dtors for each module in the program.
+  runStaticConstructorsDestructorsInModulePtrSet(
+      isDtors, OwnedModules.begin_added(), OwnedModules.end_added());
+  runStaticConstructorsDestructorsInModulePtrSet(
+      isDtors, OwnedModules.begin_loaded(), OwnedModules.end_loaded());
+  runStaticConstructorsDestructorsInModulePtrSet(
+      isDtors, OwnedModules.begin_finalized(), OwnedModules.end_finalized());
+}
+
+Function *MCJIT::FindFunctionNamedInModulePtrSet(const char *FnName,
+                                                 ModulePtrSet::iterator I,
+                                                 ModulePtrSet::iterator E) {
+  for (; I != E; ++I) {
+    if (Function *F = (*I)->getFunction(FnName))
+      return F;
+  }
+  return 0;
+}
+
+Function *MCJIT::FindFunctionNamed(const char *FnName) {
+  Function *F = FindFunctionNamedInModulePtrSet(
+      FnName, OwnedModules.begin_added(), OwnedModules.end_added());
+  if (!F)
+    F = FindFunctionNamedInModulePtrSet(FnName, OwnedModules.begin_loaded(),
+                                        OwnedModules.end_loaded());
+  if (!F)
+    F = FindFunctionNamedInModulePtrSet(FnName, OwnedModules.begin_finalized(),
+                                        OwnedModules.end_finalized());
+  return F;
+}
+
 GenericValue MCJIT::runFunction(Function *F,
                                 const std::vector<GenericValue> &ArgValues) {
   assert(F && "Function *F was null at entry to run()");
@@ -324,12 +480,8 @@ GenericValue MCJIT::runFunction(Function *F,
 
 void *MCJIT::getPointerToNamedFunction(const std::string &Name,
                                        bool AbortOnFailure) {
-  // FIXME: Add support for per-module compilation state
-  if (!IsLoaded)
-    loadObject(M);
-
-  if (!isSymbolSearchingDisabled() && MemMgr) {
-    void *ptr = MemMgr->getPointerToNamedFunction(Name, false);
+  if (!isSymbolSearchingDisabled()) {
+    void *ptr = MemMgr.getPointerToNamedFunction(Name, false);
     if (ptr)
       return ptr;
   }
@@ -365,6 +517,7 @@ void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
 }
 void MCJIT::NotifyObjectEmitted(const ObjectImage& Obj) {
   MutexGuard locked(lock);
+  MemMgr.notifyObjectLoaded(this, &Obj);
   for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
     EventListeners[I]->NotifyObjectEmitted(Obj);
   }
@@ -375,3 +528,14 @@ void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
     EventListeners[I]->NotifyFreeingObject(Obj);
   }
 }
+
+uint64_t LinkingMemoryManager::getSymbolAddress(const std::string &Name) {
+  uint64_t Result = ParentEngine->getSymbolAddress(Name, false);
+  // If the symbols wasn't found and it begins with an underscore, try again
+  // without the underscore.
+  if (!Result && Name[0] == '_')
+    Result = ParentEngine->getSymbolAddress(Name.substr(1), false);
+  if (Result)
+    return Result;
+  return ClientMM->getSymbolAddress(Name);
+}
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 8c4bf6e..86b478b 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -10,49 +10,253 @@
 #ifndef LLVM_LIB_EXECUTIONENGINE_MCJIT_H
 #define LLVM_LIB_EXECUTIONENGINE_MCJIT_H
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
 #include "llvm/ExecutionEngine/ObjectCache.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/PassManager.h"
+#include "llvm/IR/Module.h"
 
 namespace llvm {
+class MCJIT;
 
-class ObjectImage;
+// This is a helper class that the MCJIT execution engine uses for linking
+// functions across modules that it owns.  It aggregates the memory manager
+// that is passed in to the MCJIT constructor and defers most functionality
+// to that object.
+class LinkingMemoryManager : public RTDyldMemoryManager {
+public:
+  LinkingMemoryManager(MCJIT *Parent, RTDyldMemoryManager *MM)
+    : ParentEngine(Parent), ClientMM(MM) {}
+
+  virtual uint64_t getSymbolAddress(const std::string &Name);
+
+  // Functions deferred to client memory manager
+  virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+                                       unsigned SectionID, StringRef SectionName) {
+    return ClientMM->allocateCodeSection(Size, Alignment, SectionID, SectionName);
+  }
+
+  virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+                                       unsigned SectionID, StringRef SectionName,
+                                       bool IsReadOnly) {
+    return ClientMM->allocateDataSection(Size, Alignment,
+                                         SectionID, SectionName, IsReadOnly);
+  }
+
+  virtual void notifyObjectLoaded(ExecutionEngine *EE,
+                                  const ObjectImage *Obj) {
+    ClientMM->notifyObjectLoaded(EE, Obj);
+  }
+
+  virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) {
+    ClientMM->registerEHFrames(Addr, LoadAddr, Size);
+  }
+
+  virtual void deregisterEHFrames(uint8_t *Addr,
+                                  uint64_t LoadAddr,
+                                  size_t Size) {
+    ClientMM->deregisterEHFrames(Addr, LoadAddr, Size);
+  }
+
+  virtual bool finalizeMemory(std::string *ErrMsg = 0) {
+    return ClientMM->finalizeMemory(ErrMsg);
+  }
 
-// FIXME: This makes all kinds of horrible assumptions for the time being,
-// like only having one module, not needing to worry about multi-threading,
-// blah blah. Purely in get-it-up-and-limping mode for now.
+private:
+  MCJIT *ParentEngine;
+  OwningPtr<RTDyldMemoryManager> ClientMM;
+};
+
+// About Module states: added->loaded->finalized.
+//
+// The purpose of the "added" state is having modules in standby. (added=known
+// but not compiled). The idea is that you can add a module to provide function
+// definitions but if nothing in that module is referenced by a module in which
+// a function is executed (note the wording here because it's not exactly the
+// ideal case) then the module never gets compiled. This is sort of lazy
+// compilation.
+//
+// The purpose of the "loaded" state (loaded=compiled and required sections
+// copied into local memory but not yet ready for execution) is to have an
+// intermediate state wherein clients can remap the addresses of sections, using
+// MCJIT::mapSectionAddress, (in preparation for later copying to a new location
+// or an external process) before relocations and page permissions are applied.
+//
+// It might not be obvious at first glance, but the "remote-mcjit" case in the
+// lli tool does this.  In that case, the intermediate action is taken by the
+// RemoteMemoryManager in response to the notifyObjectLoaded function being
+// called.
 
 class MCJIT : public ExecutionEngine {
   MCJIT(Module *M, TargetMachine *tm, RTDyldMemoryManager *MemMgr,
         bool AllocateGVsWithCode);
 
+  typedef llvm::SmallPtrSet<Module *, 4> ModulePtrSet;
+
+  class OwningModuleContainer {
+  public:
+    OwningModuleContainer() {
+    }
+    ~OwningModuleContainer() {
+      freeModulePtrSet(AddedModules);
+      freeModulePtrSet(LoadedModules);
+      freeModulePtrSet(FinalizedModules);
+    }
+
+    ModulePtrSet::iterator begin_added() { return AddedModules.begin(); }
+    ModulePtrSet::iterator end_added() { return AddedModules.end(); }
+
+    ModulePtrSet::iterator begin_loaded() { return LoadedModules.begin(); }
+    ModulePtrSet::iterator end_loaded() { return LoadedModules.end(); }
+
+    ModulePtrSet::iterator begin_finalized() { return FinalizedModules.begin(); }
+    ModulePtrSet::iterator end_finalized() { return FinalizedModules.end(); }
+
+    void addModule(Module *M) {
+      AddedModules.insert(M);
+    }
+
+    bool removeModule(Module *M) {
+      return AddedModules.erase(M) || LoadedModules.erase(M) ||
+             FinalizedModules.erase(M);
+    }
+
+    bool hasModuleBeenAddedButNotLoaded(Module *M) {
+      return AddedModules.count(M) != 0;
+    }
+
+    bool hasModuleBeenLoaded(Module *M) {
+      // If the module is in either the "loaded" or "finalized" sections it
+      // has been loaded.
+      return (LoadedModules.count(M) != 0 ) || (FinalizedModules.count(M) != 0);
+    }
+
+    bool hasModuleBeenFinalized(Module *M) {
+      return FinalizedModules.count(M) != 0;
+    }
+
+    bool ownsModule(Module* M) {
+      return (AddedModules.count(M) != 0) || (LoadedModules.count(M) != 0) ||
+             (FinalizedModules.count(M) != 0);
+    }
+
+    void markModuleAsLoaded(Module *M) {
+      // This checks against logic errors in the MCJIT implementation.
+      // This function should never be called with either a Module that MCJIT
+      // does not own or a Module that has already been loaded and/or finalized.
+      assert(AddedModules.count(M) &&
+             "markModuleAsLoaded: Module not found in AddedModules");
+
+      // Remove the module from the "Added" set.
+      AddedModules.erase(M);
+
+      // Add the Module to the "Loaded" set.
+      LoadedModules.insert(M);
+    }
+
+    void markModuleAsFinalized(Module *M) {
+      // This checks against logic errors in the MCJIT implementation.
+      // This function should never be called with either a Module that MCJIT
+      // does not own, a Module that has not been loaded or a Module that has
+      // already been finalized.
+      assert(LoadedModules.count(M) &&
+             "markModuleAsFinalized: Module not found in LoadedModules");
+
+      // Remove the module from the "Loaded" section of the list.
+      LoadedModules.erase(M);
+
+      // Add the Module to the "Finalized" section of the list by inserting it
+      // before the 'end' iterator.
+      FinalizedModules.insert(M);
+    }
+
+    void markAllLoadedModulesAsFinalized() {
+      for (ModulePtrSet::iterator I = LoadedModules.begin(),
+                                  E = LoadedModules.end();
+           I != E; ++I) {
+        Module *M = *I;
+        FinalizedModules.insert(M);
+      }
+      LoadedModules.clear();
+    }
+
+  private:
+    ModulePtrSet AddedModules;
+    ModulePtrSet LoadedModules;
+    ModulePtrSet FinalizedModules;
+
+    void freeModulePtrSet(ModulePtrSet& MPS) {
+      // Go through the module set and delete everything.
+      for (ModulePtrSet::iterator I = MPS.begin(), E = MPS.end(); I != E; ++I) {
+        Module *M = *I;
+        delete M;
+      }
+      MPS.clear();
+    }
+  };
+
   TargetMachine *TM;
   MCContext *Ctx;
-  RTDyldMemoryManager *MemMgr;
+  LinkingMemoryManager MemMgr;
   RuntimeDyld Dyld;
   SmallVector<JITEventListener*, 2> EventListeners;
 
-  // FIXME: Add support for multiple modules
-  bool IsLoaded;
-  Module *M;
-  OwningPtr<ObjectImage> LoadedObject;
+  OwningModuleContainer OwnedModules;
+
+  typedef DenseMap<Module *, ObjectImage *> LoadedObjectMap;
+  LoadedObjectMap  LoadedObjects;
 
   // An optional ObjectCache to be notified of compiled objects and used to
   // perform lookup of pre-compiled code to avoid re-compilation.
   ObjectCache *ObjCache;
 
+  Function *FindFunctionNamedInModulePtrSet(const char *FnName,
+                                            ModulePtrSet::iterator I,
+                                            ModulePtrSet::iterator E);
+
+  void runStaticConstructorsDestructorsInModulePtrSet(bool isDtors,
+                                                      ModulePtrSet::iterator I,
+                                                      ModulePtrSet::iterator E);
+
 public:
   ~MCJIT();
 
   /// @name ExecutionEngine interface implementation
   /// @{
+  virtual void addModule(Module *M);
+  virtual bool removeModule(Module *M);
+
+  /// FindFunctionNamed - Search all of the active modules to find the one that
+  /// defines FnName.  This is very slow operation and shouldn't be used for
+  /// general code.
+  virtual Function *FindFunctionNamed(const char *FnName);
 
   /// Sets the object manager that MCJIT should use to avoid compilation.
   virtual void setObjectCache(ObjectCache *manager);
 
+  virtual void generateCodeForModule(Module *M);
+
+  /// finalizeObject - ensure the module is fully processed and is usable.
+  ///
+  /// It is the user-level function for completing the process of making the
+  /// object usable for execution. It should be called after sections within an
+  /// object have been relocated using mapSectionAddress.  When this method is
+  /// called the MCJIT execution engine will reapply relocations for a loaded
+  /// object.
+  /// Is it OK to finalize a set of modules, add modules and finalize again.
+  // FIXME: Do we really need both of these?
   virtual void finalizeObject();
+  virtual void finalizeModule(Module *);
+  void finalizeLoadedModules();
+
+  /// runStaticConstructorsDestructors - This method is used to execute all of
+  /// the static constructors or destructors for a program.
+  ///
+  /// \param isDtors - Run the destructors instead of constructors.
+  void runStaticConstructorsDestructors(bool isDtors);
 
   virtual void *getPointerToBasicBlock(BasicBlock *BB);
 
@@ -84,10 +288,15 @@ public:
                                  uint64_t TargetAddress) {
     Dyld.mapSectionAddress(LocalAddress, TargetAddress);
   }
-
   virtual void RegisterJITEventListener(JITEventListener *L);
   virtual void UnregisterJITEventListener(JITEventListener *L);
 
+  // If successful, these function will implicitly finalize all loaded objects.
+  // To get a function address within MCJIT without causing a finalize, use
+  // getSymbolAddress.
+  virtual uint64_t getGlobalValueAddress(const std::string &Name);
+  virtual uint64_t getFunctionAddress(const std::string &Name);
+
   /// @}
   /// @name (Private) Registration Interfaces
   /// @{
@@ -98,12 +307,17 @@ public:
 
   static ExecutionEngine *createJIT(Module *M,
                                     std::string *ErrorStr,
-                                    JITMemoryManager *JMM,
+                                    RTDyldMemoryManager *MemMgr,
                                     bool GVsWithCode,
                                     TargetMachine *TM);
 
   // @}
 
+  // This is not directly exposed via the ExecutionEngine API, but it is
+  // used by the LinkingMemoryManager.
+  uint64_t getSymbolAddress(const std::string &Name,
+                          bool CheckFunctionsOnly);
+
 protected:
   /// emitObject -- Generate a JITed object in memory from the specified module
   /// Currently, MCJIT only supports a single module and the module passed to
@@ -112,10 +326,12 @@ protected:
   /// the future.
   ObjectBufferStream* emitObject(Module *M);
 
-  void loadObject(Module *M);
-
   void NotifyObjectEmitted(const ObjectImage& Obj);
   void NotifyFreeingObject(const ObjectImage& Obj);
+
+  uint64_t getExistingSymbolAddress(const std::string &Name);
+  Module *findModuleForSymbol(const std::string &Name,
+                              bool CheckFunctionsOnly);
 };
 
 } // End llvm namespace
diff --git a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
index bac77ce..cf90e77 100644
--- a/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
+++ b/lib/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
@@ -14,25 +14,15 @@
 
 #include "llvm/Config/config.h"
 #include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/MathExtras.h"
 
-#ifdef __linux__
-  // These includes used by SectionMemoryManager::getPointerToNamedFunction()
-  // for Glibc trickery. See comments in this function for more information.
-  #ifdef HAVE_SYS_STAT_H
-    #include <sys/stat.h>
-  #endif
-  #include <fcntl.h>
-  #include <unistd.h>
-#endif
-
 namespace llvm {
 
 uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
-                                                    unsigned Alignment,
-                                                    unsigned SectionID,
-                                                    bool IsReadOnly) {
+                                                   unsigned Alignment,
+                                                   unsigned SectionID,
+                                                   StringRef SectionName,
+                                                   bool IsReadOnly) {
   if (IsReadOnly)
     return allocateSection(RODataMem, Size, Alignment);
   return allocateSection(RWDataMem, Size, Alignment);
@@ -40,7 +30,8 @@ uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
 
 uint8_t *SectionMemoryManager::allocateCodeSection(uintptr_t Size,
                                                    unsigned Alignment,
-                                                   unsigned SectionID) {
+                                                   unsigned SectionID,
+                                                   StringRef SectionName) {
   return allocateSection(CodeMem, Size, Alignment);
 }
 
@@ -111,11 +102,14 @@ uint8_t *SectionMemoryManager::allocateSection(MemoryGroup &MemGroup,
   return (uint8_t*)Addr;
 }
 
-bool SectionMemoryManager::applyPermissions(std::string *ErrMsg)
+bool SectionMemoryManager::finalizeMemory(std::string *ErrMsg)
 {
   // FIXME: Should in-progress permissions be reverted if an error occurs?
   error_code ec;
 
+  // Don't allow free memory blocks to be used after setting protection flags.
+  CodeMem.FreeMem.clear();
+
   // Make code memory executable.
   ec = applyMemoryGroupPermissions(CodeMem,
                                    sys::Memory::MF_READ | sys::Memory::MF_EXEC);
@@ -126,6 +120,9 @@ bool SectionMemoryManager::applyPermissions(std::string *ErrMsg)
     return true;
   }
 
+  // Don't allow free memory blocks to be used after setting protection flags.
+  RODataMem.FreeMem.clear();
+
   // Make read-only data memory read-only.
   ec = applyMemoryGroupPermissions(RODataMem,
                                    sys::Memory::MF_READ | sys::Memory::MF_EXEC);
@@ -146,38 +143,6 @@ bool SectionMemoryManager::applyPermissions(std::string *ErrMsg)
   return false;
 }
 
-// Determine whether we can register EH tables.
-#if (defined(__GNUC__) && !defined(__ARM_EABI__) && \
-     !defined(__USING_SJLJ_EXCEPTIONS__))
-#define HAVE_EHTABLE_SUPPORT 1
-#else
-#define HAVE_EHTABLE_SUPPORT 0
-#endif
-
-#if HAVE_EHTABLE_SUPPORT
-extern "C" void __register_frame(void*);
-
-static const char *processFDE(const char *Entry) {
-  const char *P = Entry;
-  uint32_t Length = *((uint32_t*)P);
-  P += 4;
-  uint32_t Offset = *((uint32_t*)P);
-  if (Offset != 0)
-    __register_frame((void*)Entry);
-  return P + Length;
-}
-#endif
-
-void SectionMemoryManager::registerEHFrames(StringRef SectionData) {
-#if HAVE_EHTABLE_SUPPORT
-  const char *P = SectionData.data();
-  const char *End = SectionData.data() + SectionData.size();
-  do  {
-    P = processFDE(P);
-  } while(P != End);
-#endif
-}
-
 error_code SectionMemoryManager::applyMemoryGroupPermissions(MemoryGroup &MemGroup,
                                                              unsigned Permissions) {
 
@@ -199,57 +164,6 @@ void SectionMemoryManager::invalidateInstructionCache() {
                                             CodeMem.AllocatedMem[i].size());
 }
 
-static int jit_noop() {
-  return 0;
-}
-
-void *SectionMemoryManager::getPointerToNamedFunction(const std::string &Name,
-                                                       bool AbortOnFailure) {
-#if defined(__linux__)
-  //===--------------------------------------------------------------------===//
-  // Function stubs that are invoked instead of certain library calls
-  //
-  // Force the following functions to be linked in to anything that uses the
-  // JIT. This is a hack designed to work around the all-too-clever Glibc
-  // strategy of making these functions work differently when inlined vs. when
-  // not inlined, and hiding their real definitions in a separate archive file
-  // that the dynamic linker can't see. For more info, search for
-  // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
-  if (Name == "stat") return (void*)(intptr_t)&stat;
-  if (Name == "fstat") return (void*)(intptr_t)&fstat;
-  if (Name == "lstat") return (void*)(intptr_t)&lstat;
-  if (Name == "stat64") return (void*)(intptr_t)&stat64;
-  if (Name == "fstat64") return (void*)(intptr_t)&fstat64;
-  if (Name == "lstat64") return (void*)(intptr_t)&lstat64;
-  if (Name == "atexit") return (void*)(intptr_t)&atexit;
-  if (Name == "mknod") return (void*)(intptr_t)&mknod;
-#endif // __linux__
-
-  // We should not invoke parent's ctors/dtors from generated main()!
-  // On Mingw and Cygwin, the symbol __main is resolved to
-  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
-  // (and register wrong callee's dtors with atexit(3)).
-  // We expect ExecutionEngine::runStaticConstructorsDestructors()
-  // is called before ExecutionEngine::runFunctionAsMain() is called.
-  if (Name == "__main") return (void*)(intptr_t)&jit_noop;
-
-  const char *NameStr = Name.c_str();
-  void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
-  if (Ptr) return Ptr;
-
-  // If it wasn't found and if it starts with an underscore ('_') character,
-  // try again without the underscore.
-  if (NameStr[0] == '_') {
-    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
-    if (Ptr) return Ptr;
-  }
-
-  if (AbortOnFailure)
-    report_fatal_error("Program used external function '" + Name +
-                      "' which could not be resolved!");
-  return 0;
-}
-
 SectionMemoryManager::~SectionMemoryManager() {
   for (unsigned i = 0, e = CodeMem.AllocatedMem.size(); i != e; ++i)
     sys::Memory::releaseMappedMemory(CodeMem.AllocatedMem[i]);
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
index 38867ec..f11df82 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileJITEventListener.cpp
@@ -20,7 +20,9 @@
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
 #include "llvm/ExecutionEngine/OProfileWrapper.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Errno.h"
@@ -52,6 +54,10 @@ public:
                                 const JITEvent_EmittedFunctionDetails &Details);
 
   virtual void NotifyFreeingMachineCode(void *OldPtr);
+
+  virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+
+  virtual void NotifyFreeingObject(const ObjectImage &Obj);
 };
 
 void OProfileJITEventListener::initialize() {
@@ -159,6 +165,66 @@ void OProfileJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
   }
 }
 
+void OProfileJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+  if (!Wrapper.isAgentAvailable()) {
+    return;
+  }
+
+  // Use symbol info to iterate functions in the object.
+  error_code ec;
+  for (object::symbol_iterator I = Obj.begin_symbols(),
+                               E = Obj.end_symbols();
+                        I != E && !ec;
+                        I.increment(ec)) {
+    object::SymbolRef::Type SymType;
+    if (I->getType(SymType)) continue;
+    if (SymType == object::SymbolRef::ST_Function) {
+      StringRef  Name;
+      uint64_t   Addr;
+      uint64_t   Size;
+      if (I->getName(Name)) continue;
+      if (I->getAddress(Addr)) continue;
+      if (I->getSize(Size)) continue;
+
+      if (Wrapper.op_write_native_code(Name.data(), Addr, (void*)Addr, Size)
+                        == -1) {
+        DEBUG(dbgs() << "Failed to tell OProfile about native function "
+          << Name << " at ["
+          << (void*)Addr << "-" << ((char*)Addr + Size) << "]\n");
+        continue;
+      }
+      // TODO: support line number info (similar to IntelJITEventListener.cpp)
+    }
+  }
+}
+
+void OProfileJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
+  if (!Wrapper.isAgentAvailable()) {
+    return;
+  }
+
+  // Use symbol info to iterate functions in the object.
+  error_code ec;
+  for (object::symbol_iterator I = Obj.begin_symbols(),
+                               E = Obj.end_symbols();
+                        I != E && !ec;
+                        I.increment(ec)) {
+    object::SymbolRef::Type SymType;
+    if (I->getType(SymType)) continue;
+    if (SymType == object::SymbolRef::ST_Function) {
+      uint64_t   Addr;
+      if (I->getAddress(Addr)) continue;
+
+      if (Wrapper.op_unload_native_code(Addr) == -1) {
+        DEBUG(dbgs()
+          << "Failed to tell OProfile about unload of native function at "
+          << (void*)Addr << "\n");
+        continue;
+      }
+    }
+  }
+}
+
 }  // anonymous namespace.
 
 namespace llvm {
diff --git a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
index 7c0d395..61d8dc2 100644
--- a/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
+++ b/lib/ExecutionEngine/OProfileJIT/OProfileWrapper.cpp
@@ -13,22 +13,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ExecutionEngine/OProfileWrapper.h"
-
 #define DEBUG_TYPE "oprofile-wrapper"
+#include "llvm/ExecutionEngine/OProfileWrapper.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/DynamicLibrary.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/MutexGuard.h"
 #include "llvm/ADT/SmallString.h"
-
-#include <sstream>
 #include <cstring>
-#include <stddef.h>
 #include <dirent.h>
-#include <sys/stat.h>
 #include <fcntl.h>
+#include <sstream>
+#include <stddef.h>
+#include <sys/stat.h>
 #include <unistd.h>
 
 namespace {
@@ -143,6 +141,10 @@ bool OProfileWrapper::checkForOProfileProcEntry() {
         close(CmdLineFD);
         ssize_t Idx = 0;
 
+        if (ExeName[0] != '/') {
+          BaseName = ExeName;
+        }
+
         // Find the terminator for the first string
         while (Idx < NumRead-1 && ExeName[Idx] != 0) {
           Idx++;
@@ -161,7 +163,8 @@ bool OProfileWrapper::checkForOProfileProcEntry() {
         }
 
         // Test this to see if it is the oprofile daemon
-        if (BaseName != 0 && !strcmp("oprofiled", BaseName)) {
+        if (BaseName != 0 && (!strcmp("oprofiled", BaseName) ||
+                              !strcmp("operf", BaseName))) {
           // If it is, we're done
           closedir(ProcDir);
           return true;
diff --git a/lib/ExecutionEngine/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
new file mode 100644
index 0000000..58a6460
--- /dev/null
+++ b/lib/ExecutionEngine/RTDyldMemoryManager.cpp
@@ -0,0 +1,282 @@
+//===-- RTDyldMemoryManager.cpp - Memory manager for MC-JIT -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the runtime dynamic memory manager base class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#include <cstdlib>
+
+#ifdef __linux__
+  // These includes used by RTDyldMemoryManager::getPointerToNamedFunction()
+  // for Glibc trickery. See comments in this function for more information.
+  #ifdef HAVE_SYS_STAT_H
+    #include <sys/stat.h>
+  #endif
+  #include <fcntl.h>
+  #include <unistd.h>
+#endif
+
+namespace llvm {
+
+RTDyldMemoryManager::~RTDyldMemoryManager() {}
+
+// Determine whether we can register EH tables.
+#if (defined(__GNUC__) && !defined(__ARM_EABI__) && !defined(__ia64__) && \
+     !defined(__USING_SJLJ_EXCEPTIONS__))
+#define HAVE_EHTABLE_SUPPORT 1
+#else
+#define HAVE_EHTABLE_SUPPORT 0
+#endif
+
+#if HAVE_EHTABLE_SUPPORT
+extern "C" void __register_frame(void*);
+extern "C" void __deregister_frame(void*);
+#else
+// The building compiler does not have __(de)register_frame but
+// it may be found at runtime in a dynamically-loaded library.
+// For example, this happens when building LLVM with Visual C++
+// but using the MingW runtime.
+void __register_frame(void *p) {
+  static bool Searched = false;
+  static void *rf = 0;
+
+  if (!Searched) {
+    Searched = true;
+    rf = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+                                                      "__register_frame");
+  }
+  if (rf)
+    ((void (*)(void *))rf)(p);
+}
+
+void __deregister_frame(void *p) {
+  static bool Searched = false;
+  static void *df = 0;
+
+  if (!Searched) {
+    Searched = true;
+    df = llvm::sys::DynamicLibrary::SearchForAddressOfSymbol(
+                                                      "__deregister_frame");
+  }
+  if (df)
+    ((void (*)(void *))df)(p);
+}
+#endif
+
+#ifdef __APPLE__
+
+static const char *processFDE(const char *Entry, bool isDeregister) {
+  const char *P = Entry;
+  uint32_t Length = *((const uint32_t *)P);
+  P += 4;
+  uint32_t Offset = *((const uint32_t *)P);
+  if (Offset != 0) {
+    if (isDeregister)
+      __deregister_frame(const_cast<char *>(Entry));
+    else
+      __register_frame(const_cast<char *>(Entry));
+  }
+  return P + Length;
+}
+
+// This implementation handles frame registration for local targets.
+// Memory managers for remote targets should re-implement this function
+// and use the LoadAddr parameter.
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  // On OS X OS X __register_frame takes a single FDE as an argument.
+  // See http://lists.cs.uiuc.edu/pipermail/llvmdev/2013-April/061768.html
+  const char *P = (const char *)Addr;
+  const char *End = P + Size;
+  do  {
+    P = processFDE(P, false);
+  } while(P != End);
+}
+
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  const char *P = (const char *)Addr;
+  const char *End = P + Size;
+  do  {
+    P = processFDE(P, true);
+  } while(P != End);
+}
+
+#else
+
+void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  // On Linux __register_frame takes a single argument: 
+  // a pointer to the start of the .eh_frame section.
+
+  // How can it find the end? Because crtendS.o is linked 
+  // in and it has an .eh_frame section with four zero chars.
+  __register_frame(Addr);
+}
+
+void RTDyldMemoryManager::deregisterEHFrames(uint8_t *Addr,
+                                           uint64_t LoadAddr,
+                                           size_t Size) {
+  __deregister_frame(Addr);
+}
+
+#endif
+
+static int jit_noop() {
+  return 0;
+}
+
+// ARM math functions are statically linked on Android from libgcc.a, but not
+// available at runtime for dynamic linking. On Linux these are usually placed
+// in libgcc_s.so so can be found by normal dynamic lookup.
+#if defined(__BIONIC__) && defined(__arm__)
+// List of functions which are statically linked on Android and can be generated
+// by LLVM. This is done as a nested macro which is used once to declare the
+// imported functions with ARM_MATH_DECL and once to compare them to the
+// user-requested symbol in getSymbolAddress with ARM_MATH_CHECK. The test
+// assumes that all functions start with __aeabi_ and getSymbolAddress must be
+// modified if that changes.
+#define ARM_MATH_IMPORTS(PP) \
+  PP(__aeabi_d2f) \
+  PP(__aeabi_d2iz) \
+  PP(__aeabi_d2lz) \
+  PP(__aeabi_d2uiz) \
+  PP(__aeabi_d2ulz) \
+  PP(__aeabi_dadd) \
+  PP(__aeabi_dcmpeq) \
+  PP(__aeabi_dcmpge) \
+  PP(__aeabi_dcmpgt) \
+  PP(__aeabi_dcmple) \
+  PP(__aeabi_dcmplt) \
+  PP(__aeabi_dcmpun) \
+  PP(__aeabi_ddiv) \
+  PP(__aeabi_dmul) \
+  PP(__aeabi_dsub) \
+  PP(__aeabi_f2d) \
+  PP(__aeabi_f2iz) \
+  PP(__aeabi_f2lz) \
+  PP(__aeabi_f2uiz) \
+  PP(__aeabi_f2ulz) \
+  PP(__aeabi_fadd) \
+  PP(__aeabi_fcmpeq) \
+  PP(__aeabi_fcmpge) \
+  PP(__aeabi_fcmpgt) \
+  PP(__aeabi_fcmple) \
+  PP(__aeabi_fcmplt) \
+  PP(__aeabi_fcmpun) \
+  PP(__aeabi_fdiv) \
+  PP(__aeabi_fmul) \
+  PP(__aeabi_fsub) \
+  PP(__aeabi_i2d) \
+  PP(__aeabi_i2f) \
+  PP(__aeabi_idiv) \
+  PP(__aeabi_idivmod) \
+  PP(__aeabi_l2d) \
+  PP(__aeabi_l2f) \
+  PP(__aeabi_lasr) \
+  PP(__aeabi_ldivmod) \
+  PP(__aeabi_llsl) \
+  PP(__aeabi_llsr) \
+  PP(__aeabi_lmul) \
+  PP(__aeabi_ui2d) \
+  PP(__aeabi_ui2f) \
+  PP(__aeabi_uidiv) \
+  PP(__aeabi_uidivmod) \
+  PP(__aeabi_ul2d) \
+  PP(__aeabi_ul2f) \
+  PP(__aeabi_uldivmod)
+
+// Declare statically linked math functions on ARM. The function declarations
+// here do not have the correct prototypes for each function in
+// ARM_MATH_IMPORTS, but it doesn't matter because only the symbol addresses are
+// needed. In particular the __aeabi_*divmod functions do not have calling
+// conventions which match any C prototype.
+#define ARM_MATH_DECL(name) extern "C" void name();
+ARM_MATH_IMPORTS(ARM_MATH_DECL)
+#undef ARM_MATH_DECL
+#endif
+
+uint64_t RTDyldMemoryManager::getSymbolAddress(const std::string &Name) {
+  // This implementation assumes that the host program is the target.
+  // Clients generating code for a remote target should implement their own
+  // memory manager.
+#if defined(__linux__) && defined(__GLIBC__)
+  //===--------------------------------------------------------------------===//
+  // Function stubs that are invoked instead of certain library calls
+  //
+  // Force the following functions to be linked in to anything that uses the
+  // JIT. This is a hack designed to work around the all-too-clever Glibc
+  // strategy of making these functions work differently when inlined vs. when
+  // not inlined, and hiding their real definitions in a separate archive file
+  // that the dynamic linker can't see. For more info, search for
+  // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+  if (Name == "stat") return (uint64_t)&stat;
+  if (Name == "fstat") return (uint64_t)&fstat;
+  if (Name == "lstat") return (uint64_t)&lstat;
+  if (Name == "stat64") return (uint64_t)&stat64;
+  if (Name == "fstat64") return (uint64_t)&fstat64;
+  if (Name == "lstat64") return (uint64_t)&lstat64;
+  if (Name == "atexit") return (uint64_t)&atexit;
+  if (Name == "mknod") return (uint64_t)&mknod;
+#endif // __linux__ && __GLIBC__
+  
+  // See ARM_MATH_IMPORTS definition for explanation
+#if defined(__BIONIC__) && defined(__arm__)
+  if (Name.compare(0, 8, "__aeabi_") == 0) {
+    // Check if the user has requested any of the functions listed in
+    // ARM_MATH_IMPORTS, and if so redirect to the statically linked symbol.
+#define ARM_MATH_CHECK(fn) if (Name == #fn) return (uint64_t)&fn;
+    ARM_MATH_IMPORTS(ARM_MATH_CHECK)
+#undef ARM_MATH_CHECK
+  }
+#endif
+
+  // We should not invoke parent's ctors/dtors from generated main()!
+  // On Mingw and Cygwin, the symbol __main is resolved to
+  // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors
+  // (and register wrong callee's dtors with atexit(3)).
+  // We expect ExecutionEngine::runStaticConstructorsDestructors()
+  // is called before ExecutionEngine::runFunctionAsMain() is called.
+  if (Name == "__main") return (uint64_t)&jit_noop;
+
+  const char *NameStr = Name.c_str();
+  void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+  if (Ptr)
+    return (uint64_t)Ptr;
+
+  // If it wasn't found and if it starts with an underscore ('_') character,
+  // try again without the underscore.
+  if (NameStr[0] == '_') {
+    Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
+    if (Ptr)
+      return (uint64_t)Ptr;
+  }
+  return 0;
+}
+
+void *RTDyldMemoryManager::getPointerToNamedFunction(const std::string &Name,
+                                                     bool AbortOnFailure) {
+  uint64_t Addr = getSymbolAddress(Name);
+
+  if (!Addr && AbortOnFailure)
+    report_fatal_error("Program used external function '" + Name +
+                       "' which could not be resolved!");
+  return (void*)Addr;
+}
+
+} // namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
index 69e9dbe..6a514ea 100644
--- a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
+++ b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
@@ -16,6 +16,7 @@ namespace llvm {
 
 /// Global access point for the JIT debugging interface.
 class JITRegistrar {
+  virtual void anchor();
 public:
   /// Instantiates the JIT service.
   JITRegistrar() {}
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
index 89350cc..9cbde5d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
@@ -23,6 +23,7 @@ namespace llvm {
 class ObjectImageCommon : public ObjectImage {
   ObjectImageCommon(); // = delete
   ObjectImageCommon(const ObjectImageCommon &other); // = delete
+  virtual void anchor();
 
 protected:
   object::ObjectFile *ObjFile;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index a08b508..161135a 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -13,45 +13,60 @@
 
 #define DEBUG_TYPE "dyld"
 #include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "JITRegistrar.h"
 #include "ObjectImageCommon.h"
 #include "RuntimeDyldELF.h"
 #include "RuntimeDyldImpl.h"
 #include "RuntimeDyldMachO.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MutexGuard.h"
+#include "llvm/Object/ELF.h"
 
 using namespace llvm;
 using namespace llvm::object;
 
 // Empty out-of-line virtual destructor as the key function.
-RTDyldMemoryManager::~RTDyldMemoryManager() {}
-void RTDyldMemoryManager::registerEHFrames(StringRef SectionData) {}
 RuntimeDyldImpl::~RuntimeDyldImpl() {}
 
+// Pin the JITRegistrar's and ObjectImage*'s vtables to this file.
+void JITRegistrar::anchor() {}
+void ObjectImage::anchor() {}
+void ObjectImageCommon::anchor() {}
+
 namespace llvm {
 
-StringRef RuntimeDyldImpl::getEHFrameSection() {
-  return StringRef();
+void RuntimeDyldImpl::registerEHFrames() {
+}
+
+void RuntimeDyldImpl::deregisterEHFrames() {
 }
 
 // Resolve the relocations for all symbols we currently know about.
 void RuntimeDyldImpl::resolveRelocations() {
+  MutexGuard locked(lock);
+
   // First, resolve relocations associated with external symbols.
   resolveExternalSymbols();
 
   // Just iterate over the sections we have and resolve all the relocations
   // in them. Gross overkill, but it gets the job done.
   for (int i = 0, e = Sections.size(); i != e; ++i) {
+    // The Section here (Sections[i]) refers to the section in which the
+    // symbol for the relocation is located.  The SectionID in the relocation
+    // entry provides the section to which the relocation will be applied.
     uint64_t Addr = Sections[i].LoadAddress;
     DEBUG(dbgs() << "Resolving relocations Section #" << i
             << "\t" << format("%p", (uint8_t *)Addr)
             << "\n");
     resolveRelocationList(Relocations[i], Addr);
+    Relocations.erase(i);
   }
 }
 
 void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
                                         uint64_t TargetAddress) {
+  MutexGuard locked(lock);
   for (unsigned i = 0, e = Sections.size(); i != e; ++i) {
     if (Sections[i].Address == LocalAddress) {
       reassignSectionAddress(i, TargetAddress);
@@ -68,11 +83,15 @@ ObjectImage *RuntimeDyldImpl::createObjectImage(ObjectBuffer *InputBuffer) {
 }
 
 ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
+  MutexGuard locked(lock);
+
   OwningPtr<ObjectImage> obj(createObjectImage(InputBuffer));
   if (!obj)
     report_fatal_error("Unable to create object image from memory buffer!");
 
+  // Save information about our target
   Arch = (Triple::ArchType)obj->getArch();
+  IsTargetLittleEndian = obj->getObjectFile()->isLittleEndian();
 
   // Symbols found in this object
   StringMap<SymbolLoc> LocalSymbols;
@@ -148,6 +167,7 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
     bool isFirstRelocation = true;
     unsigned SectionID = 0;
     StubMap Stubs;
+    section_iterator RelocatedSection = si->getRelocatedSection();
 
     for (relocation_iterator i = si->begin_relocations(),
          e = si->end_relocations(); i != e; i.increment(err)) {
@@ -155,7 +175,8 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
 
       // If it's the first relocation in this section, find its SectionID
       if (isFirstRelocation) {
-        SectionID = findOrEmitSection(*obj, *si, true, LocalSections);
+        SectionID =
+            findOrEmitSection(*obj, *RelocatedSection, true, LocalSections);
         DEBUG(dbgs() << "\tSectionID: " << SectionID << "\n");
         isFirstRelocation = false;
       }
@@ -165,6 +186,9 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
     }
   }
 
+  // Give the subclasses a chance to tie-up any loose ends.
+  finalizeLoad(LocalSections);
+
   return obj.take();
 }
 
@@ -174,8 +198,8 @@ void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
                                         SymbolTableMap &SymbolTable) {
   // Allocate memory for the section
   unsigned SectionID = Sections.size();
-  uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, sizeof(void*),
-                                              SectionID, false);
+  uint8_t *Addr = MemMgr->allocateDataSection(
+    TotalSize, sizeof(void*), SectionID, StringRef(), false);
   if (!Addr)
     report_fatal_error("Unable to allocate memory for common symbols!");
   uint64_t Offset = 0;
@@ -216,11 +240,25 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
   unsigned StubBufSize = 0,
            StubSize = getMaxStubSize();
   error_code err;
+  const ObjectFile *ObjFile = Obj.getObjectFile();
+  // FIXME: this is an inefficient way to handle this. We should computed the
+  // necessary section allocation size in loadObject by walking all the sections
+  // once.
   if (StubSize > 0) {
-    for (relocation_iterator i = Section.begin_relocations(),
-         e = Section.end_relocations(); i != e; i.increment(err), Check(err))
-      StubBufSize += StubSize;
+    for (section_iterator SI = ObjFile->begin_sections(),
+           SE = ObjFile->end_sections();
+         SI != SE; SI.increment(err), Check(err)) {
+      section_iterator RelSecI = SI->getRelocatedSection();
+      if (!(RelSecI == Section))
+        continue;
+
+      for (relocation_iterator I = SI->begin_relocations(),
+             E = SI->end_relocations(); I != E; I.increment(err), Check(err)) {
+        StubBufSize += StubSize;
+      }
+    }
   }
+
   StringRef data;
   uint64_t Alignment64;
   Check(Section.getContents(data));
@@ -232,6 +270,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
   bool IsZeroInit;
   bool IsReadOnly;
   uint64_t DataSize;
+  unsigned PaddingSize = 0;
   StringRef Name;
   Check(Section.isRequiredForExecution(IsRequired));
   Check(Section.isVirtual(IsVirtual));
@@ -246,6 +285,12 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
       StubBufSize += StubAlignment - EndAlignment;
   }
 
+  // The .eh_frame section (at least on Linux) needs an extra four bytes padded
+  // with zeroes added at the end.  For MachO objects, this section has a
+  // slightly different name, so this won't have any effect for MachO objects.
+  if (Name == ".eh_frame")
+    PaddingSize = 4;
+
   unsigned Allocate;
   unsigned SectionID = Sections.size();
   uint8_t *Addr;
@@ -254,10 +299,11 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
   // Some sections, such as debug info, don't need to be loaded for execution.
   // Leave those where they are.
   if (IsRequired) {
-    Allocate = DataSize + StubBufSize;
+    Allocate = DataSize + PaddingSize + StubBufSize;
     Addr = IsCode
-      ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID)
-      : MemMgr->allocateDataSection(Allocate, Alignment, SectionID, IsReadOnly);
+      ? MemMgr->allocateCodeSection(Allocate, Alignment, SectionID, Name)
+      : MemMgr->allocateDataSection(Allocate, Alignment, SectionID, Name,
+                                    IsReadOnly);
     if (!Addr)
       report_fatal_error("Unable to allocate section memory!");
 
@@ -271,6 +317,13 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
     else
       memcpy(Addr, pData, DataSize);
 
+    // Fill in any extra bytes we allocated for padding
+    if (PaddingSize != 0) {
+      memset(Addr + DataSize, 0, PaddingSize);
+      // Update the DataSize variable so that the stub offset is set correctly.
+      DataSize += PaddingSize;
+    }
+
     DEBUG(dbgs() << "emitSection SectionID: " << SectionID
                  << " Name: " << Name
                  << " obj addr: " << format("%p", pData)
@@ -381,7 +434,7 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
     StubAddr++;
     *StubAddr = NopInstr;
     return Addr;
-  } else if (Arch == Triple::ppc64) {
+  } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
     // PowerPC64 stub: the address points to a function descriptor
     // instead of the function itself. Load the function address
     // on r11 and sets it to control register. Also loads the function
@@ -406,6 +459,10 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
     writeInt16BE(Addr+6,  0x07F1);     // brc 15,%r1
     // 8-byte address stored at Addr + 8
     return Addr;
+  } else if (Arch == Triple::x86_64) {
+    *Addr      = 0xFF; // jmp
+    *(Addr+1)  = 0x25; // rip
+    // 32-bit PC-relative address of the GOT entry will be stored at Addr+2
   }
   return Addr;
 }
@@ -439,30 +496,52 @@ void RuntimeDyldImpl::resolveRelocationList(const RelocationList &Relocs,
 }
 
 void RuntimeDyldImpl::resolveExternalSymbols() {
-  StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin(),
-                                      e = ExternalSymbolRelocations.end();
-  for (; i != e; i++) {
+  while(!ExternalSymbolRelocations.empty()) {
+    StringMap<RelocationList>::iterator i = ExternalSymbolRelocations.begin();
+
     StringRef Name = i->first();
-    RelocationList &Relocs = i->second;
-    SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
-    if (Loc == GlobalSymbolTable.end()) {
-      if (Name.size() == 0) {
-        // This is an absolute symbol, use an address of zero.
-        DEBUG(dbgs() << "Resolving absolute relocations." << "\n");
-        resolveRelocationList(Relocs, 0);
+    if (Name.size() == 0) {
+      // This is an absolute symbol, use an address of zero.
+      DEBUG(dbgs() << "Resolving absolute relocations." << "\n");
+      RelocationList &Relocs = i->second;
+      resolveRelocationList(Relocs, 0);
+    } else {
+      uint64_t Addr = 0;
+      SymbolTableMap::const_iterator Loc = GlobalSymbolTable.find(Name);
+      if (Loc == GlobalSymbolTable.end()) {
+          // This is an external symbol, try to get its address from
+          // MemoryManager.
+          Addr = MemMgr->getSymbolAddress(Name.data());
+          // The call to getSymbolAddress may have caused additional modules to
+          // be loaded, which may have added new entries to the
+          // ExternalSymbolRelocations map.  Consquently, we need to update our
+          // iterator.  This is also why retrieval of the relocation list
+          // associated with this symbol is deferred until below this point.
+          // New entries may have been added to the relocation list.
+          i = ExternalSymbolRelocations.find(Name);
       } else {
-        // This is an external symbol, try to get its address from
-        // MemoryManager.
-        uint8_t *Addr = (uint8_t*) MemMgr->getPointerToNamedFunction(Name.data(),
-                                                                   true);
-        DEBUG(dbgs() << "Resolving relocations Name: " << Name
-                << "\t" << format("%p", Addr)
-                << "\n");
-        resolveRelocationList(Relocs, (uintptr_t)Addr);
+        // We found the symbol in our global table.  It was probably in a
+        // Module that we loaded previously.
+        SymbolLoc SymLoc = Loc->second;
+        Addr = getSectionLoadAddress(SymLoc.first) + SymLoc.second;
       }
-    } else {
-      report_fatal_error("Expected external symbol");
+
+      // FIXME: Implement error handling that doesn't kill the host program!
+      if (!Addr)
+        report_fatal_error("Program used external function '" + Name +
+                          "' which could not be resolved!");
+
+      updateGOTEntries(Name, Addr);
+      DEBUG(dbgs() << "Resolving relocations Name: " << Name
+              << "\t" << format("0x%lx", Addr)
+              << "\n");
+      // This list may have been updated when we called getSymbolAddress, so
+      // don't change this code to get the list earlier.
+      RelocationList &Relocs = i->second;
+      resolveRelocationList(Relocs, Addr);
     }
+
+    ExternalSymbolRelocations.erase(i);
   }
 }
 
@@ -486,33 +565,36 @@ RuntimeDyld::~RuntimeDyld() {
 
 ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
   if (!Dyld) {
-    sys::LLVMFileType type = sys::IdentifyFileType(
-            InputBuffer->getBufferStart(),
-            static_cast<unsigned>(InputBuffer->getBufferSize()));
-    switch (type) {
-      case sys::ELF_Relocatable_FileType:
-      case sys::ELF_Executable_FileType:
-      case sys::ELF_SharedObject_FileType:
-      case sys::ELF_Core_FileType:
-        Dyld = new RuntimeDyldELF(MM);
-        break;
-      case sys::Mach_O_Object_FileType:
-      case sys::Mach_O_Executable_FileType:
-      case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
-      case sys::Mach_O_Core_FileType:
-      case sys::Mach_O_PreloadExecutable_FileType:
-      case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
-      case sys::Mach_O_DynamicLinker_FileType:
-      case sys::Mach_O_Bundle_FileType:
-      case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
-      case sys::Mach_O_DSYMCompanion_FileType:
-        Dyld = new RuntimeDyldMachO(MM);
-        break;
-      case sys::Unknown_FileType:
-      case sys::Bitcode_FileType:
-      case sys::Archive_FileType:
-      case sys::COFF_FileType:
-        report_fatal_error("Incompatible object format!");
+    sys::fs::file_magic Type =
+        sys::fs::identify_magic(InputBuffer->getBuffer());
+    switch (Type) {
+    case sys::fs::file_magic::elf_relocatable:
+    case sys::fs::file_magic::elf_executable:
+    case sys::fs::file_magic::elf_shared_object:
+    case sys::fs::file_magic::elf_core:
+      Dyld = new RuntimeDyldELF(MM);
+      break;
+    case sys::fs::file_magic::macho_object:
+    case sys::fs::file_magic::macho_executable:
+    case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
+    case sys::fs::file_magic::macho_core:
+    case sys::fs::file_magic::macho_preload_executable:
+    case sys::fs::file_magic::macho_dynamically_linked_shared_lib:
+    case sys::fs::file_magic::macho_dynamic_linker:
+    case sys::fs::file_magic::macho_bundle:
+    case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
+    case sys::fs::file_magic::macho_dsym_companion:
+      Dyld = new RuntimeDyldMachO(MM);
+      break;
+    case sys::fs::file_magic::unknown:
+    case sys::fs::file_magic::bitcode:
+    case sys::fs::file_magic::archive:
+    case sys::fs::file_magic::coff_object:
+    case sys::fs::file_magic::coff_import_library:
+    case sys::fs::file_magic::pecoff_executable:
+    case sys::fs::file_magic::macho_universal_binary:
+    case sys::fs::file_magic::windows_resource:
+      report_fatal_error("Incompatible object format!");
     }
   } else {
     if (!Dyld->isCompatibleFormat(InputBuffer))
@@ -523,10 +605,14 @@ ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
 }
 
 void *RuntimeDyld::getSymbolAddress(StringRef Name) {
+  if (!Dyld)
+    return NULL;
   return Dyld->getSymbolAddress(Name);
 }
 
 uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) {
+  if (!Dyld)
+    return 0;
   return Dyld->getSymbolLoadAddress(Name);
 }
 
@@ -548,8 +634,14 @@ StringRef RuntimeDyld::getErrorString() {
   return Dyld->getErrorString();
 }
 
-StringRef RuntimeDyld::getEHFrameSection() {
-  return Dyld->getEHFrameSection();
+void RuntimeDyld::registerEHFrames() {
+  if (Dyld)
+    Dyld->registerEHFrames();
+}
+
+void RuntimeDyld::deregisterEHFrames() {
+  if (Dyld)
+    Dyld->deregisterEHFrames();
 }
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index d4d84d3..f2c69fc 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -22,7 +22,7 @@
 #include "llvm/ADT/Triple.h"
 #include "llvm/ExecutionEngine/ObjectBuffer.h"
 #include "llvm/ExecutionEngine/ObjectImage.h"
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ELF.h"
 using namespace llvm;
@@ -151,12 +151,31 @@ void DyldELFObject<ELFT>::updateSymbolAddress(const SymbolRef &SymRef,
 
 namespace llvm {
 
-StringRef RuntimeDyldELF::getEHFrameSection() {
-  for (int i = 0, e = Sections.size(); i != e; ++i) {
-    if (Sections[i].Name == ".eh_frame")
-      return StringRef((const char*)Sections[i].Address, Sections[i].Size);
+void RuntimeDyldELF::registerEHFrames() {
+  if (!MemMgr)
+    return;
+  for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
+    SID EHFrameSID = UnregisteredEHFrameSections[i];
+    uint8_t *EHFrameAddr = Sections[EHFrameSID].Address;
+    uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress;
+    size_t EHFrameSize = Sections[EHFrameSID].Size;
+    MemMgr->registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
+    RegisteredEHFrameSections.push_back(EHFrameSID);
   }
-  return StringRef();
+  UnregisteredEHFrameSections.clear();
+}
+
+void RuntimeDyldELF::deregisterEHFrames() {
+  if (!MemMgr)
+    return;
+  for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) {
+    SID EHFrameSID = RegisteredEHFrameSections[i];
+    uint8_t *EHFrameAddr = Sections[EHFrameSID].Address;
+    uint64_t EHFrameLoadAddr = Sections[EHFrameSID].LoadAddress;
+    size_t EHFrameSize = Sections[EHFrameSID].Size;
+    MemMgr->deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize);
+  }
+  RegisteredEHFrameSections.clear();
 }
 
 ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) {
@@ -202,7 +221,8 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
                                              uint64_t Offset,
                                              uint64_t Value,
                                              uint32_t Type,
-                                             int64_t Addend) {
+                                             int64_t  Addend,
+                                             uint64_t SymOffset) {
   switch (Type) {
   default:
     llvm_unreachable("Relocation type not implemented yet!");
@@ -227,6 +247,21 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
                  << " at " << format("%p\n",Target));
     break;
   }
+  case ELF::R_X86_64_GOTPCREL: {
+    // findGOTEntry returns the 'G + GOT' part of the relocation calculation
+    // based on the load/target address of the GOT (not the current/local addr).
+    uint64_t GOTAddr = findGOTEntry(Value, SymOffset);
+    uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+    uint64_t  FinalAddress = Section.LoadAddress + Offset;
+    // The processRelocationRef method combines the symbol offset and the addend
+    // and in most cases that's what we want.  For this relocation type, we need
+    // the raw addend, so we subtract the symbol offset to get it.
+    int64_t RealOffset = GOTAddr + Addend - SymOffset - FinalAddress;
+    assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
+    int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
+    *Target = TruncOffset;
+    break;
+  }
   case ELF::R_X86_64_PC32: {
     // Get the placeholder value from the generated object since
     // a previous relocation attempt may have overwritten the loaded version
@@ -240,6 +275,16 @@ void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
     *Target = TruncOffset;
     break;
   }
+  case ELF::R_X86_64_PC64: {
+    // Get the placeholder value from the generated object since
+    // a previous relocation attempt may have overwritten the loaded version
+    uint64_t *Placeholder = reinterpret_cast<uint64_t*>(Section.ObjAddress
+                                                                   + Offset);
+    uint64_t *Target = reinterpret_cast<uint64_t*>(Section.Address + Offset);
+    uint64_t  FinalAddress = Section.LoadAddress + Offset;
+    *Target = *Placeholder + Value + Addend - FinalAddress;
+    break;
+  }
   }
 }
 
@@ -302,9 +347,9 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
     *TargetPtr = Value + Addend;
     break;
   }
-  case ELF::R_AARCH64_PREL32: { // test-shift.ll (.eh_frame)
+  case ELF::R_AARCH64_PREL32: {
     uint64_t Result = Value + Addend - FinalAddress;
-    assert(static_cast<int64_t>(Result) >= INT32_MIN && 
+    assert(static_cast<int64_t>(Result) >= INT32_MIN &&
            static_cast<int64_t>(Result) <= UINT32_MAX);
     *TargetPtr = static_cast<uint32_t>(Result & 0xffffffffU);
     break;
@@ -316,41 +361,62 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
     uint64_t BranchImm = Value + Addend - FinalAddress;
 
     // "Check that -2^27 <= result < 2^27".
-    assert(-(1LL << 27) <= static_cast<int64_t>(BranchImm) && 
+    assert(-(1LL << 27) <= static_cast<int64_t>(BranchImm) &&
            static_cast<int64_t>(BranchImm) < (1LL << 27));
+
+    // AArch64 code is emitted with .rela relocations. The data already in any
+    // bits affected by the relocation on entry is garbage.
+    *TargetPtr &= 0xfc000000U;
     // Immediate goes in bits 25:0 of B and BL.
     *TargetPtr |= static_cast<uint32_t>(BranchImm & 0xffffffcU) >> 2;
     break;
   }
   case ELF::R_AARCH64_MOVW_UABS_G3: {
     uint64_t Result = Value + Addend;
+
+    // AArch64 code is emitted with .rela relocations. The data already in any
+    // bits affected by the relocation on entry is garbage.
+    *TargetPtr &= 0xffe0001fU;
     // Immediate goes in bits 20:5 of MOVZ/MOVK instruction
     *TargetPtr |= Result >> (48 - 5);
-    // Shift is "lsl #48", in bits 22:21
-    *TargetPtr |= 3 << 21;
+    // Shift must be "lsl #48", in bits 22:21
+    assert((*TargetPtr >> 21 & 0x3) == 3 && "invalid shift for relocation");
     break;
   }
   case ELF::R_AARCH64_MOVW_UABS_G2_NC: {
     uint64_t Result = Value + Addend;
+
+    // AArch64 code is emitted with .rela relocations. The data already in any
+    // bits affected by the relocation on entry is garbage.
+    *TargetPtr &= 0xffe0001fU;
     // Immediate goes in bits 20:5 of MOVZ/MOVK instruction
     *TargetPtr |= ((Result & 0xffff00000000ULL) >> (32 - 5));
-    // Shift is "lsl #32", in bits 22:21
-    *TargetPtr |= 2 << 21;
+    // Shift must be "lsl #32", in bits 22:21
+    assert((*TargetPtr >> 21 & 0x3) == 2 && "invalid shift for relocation");
     break;
   }
   case ELF::R_AARCH64_MOVW_UABS_G1_NC: {
     uint64_t Result = Value + Addend;
+
+    // AArch64 code is emitted with .rela relocations. The data already in any
+    // bits affected by the relocation on entry is garbage.
+    *TargetPtr &= 0xffe0001fU;
     // Immediate goes in bits 20:5 of MOVZ/MOVK instruction
     *TargetPtr |= ((Result & 0xffff0000U) >> (16 - 5));
-    // Shift is "lsl #16", in bits 22:21
-    *TargetPtr |= 1 << 21;
+    // Shift must be "lsl #16", in bits 22:2
+    assert((*TargetPtr >> 21 & 0x3) == 1 && "invalid shift for relocation");
     break;
   }
   case ELF::R_AARCH64_MOVW_UABS_G0_NC: {
     uint64_t Result = Value + Addend;
+
+    // AArch64 code is emitted with .rela relocations. The data already in any
+    // bits affected by the relocation on entry is garbage.
+    *TargetPtr &= 0xffe0001fU;
     // Immediate goes in bits 20:5 of MOVZ/MOVK instruction
     *TargetPtr |= ((Result & 0xffffU) << 5);
-    // Shift is "lsl #0", in bits 22:21. No action needed.
+    // Shift must be "lsl #0", in bits 22:21.
+    assert((*TargetPtr >> 21 & 0x3) == 0 && "invalid shift for relocation");
     break;
   }
   }
@@ -362,6 +428,8 @@ void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section,
                                           uint32_t Type,
                                           int32_t Addend) {
   // TODO: Add Thumb relocations.
+  uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress +
+                                                      Offset);
   uint32_t* TargetPtr = (uint32_t*)(Section.Address + Offset);
   uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF);
   Value += Addend;
@@ -380,44 +448,51 @@ void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section,
 
   // Write a 32bit value to relocation address, taking into account the
   // implicit addend encoded in the target.
-  case ELF::R_ARM_TARGET1 :
-  case ELF::R_ARM_ABS32 :
-    *TargetPtr += Value;
+  case ELF::R_ARM_TARGET1:
+  case ELF::R_ARM_ABS32:
+    *TargetPtr = *Placeholder + Value;
     break;
-
   // Write first 16 bit of 32 bit value to the mov instruction.
   // Last 4 bit should be shifted.
-  case ELF::R_ARM_MOVW_ABS_NC :
+  case ELF::R_ARM_MOVW_ABS_NC:
     // We are not expecting any other addend in the relocation address.
     // Using 0x000F0FFF because MOVW has its 16 bit immediate split into 2
     // non-contiguous fields.
-    assert((*TargetPtr & 0x000F0FFF) == 0);
+    assert((*Placeholder & 0x000F0FFF) == 0);
     Value = Value & 0xFFFF;
-    *TargetPtr |= Value & 0xFFF;
+    *TargetPtr = *Placeholder | (Value & 0xFFF);
     *TargetPtr |= ((Value >> 12) & 0xF) << 16;
     break;
-
   // Write last 16 bit of 32 bit value to the mov instruction.
   // Last 4 bit should be shifted.
-  case ELF::R_ARM_MOVT_ABS :
+  case ELF::R_ARM_MOVT_ABS:
     // We are not expecting any other addend in the relocation address.
     // Use 0x000F0FFF for the same reason as R_ARM_MOVW_ABS_NC.
-    assert((*TargetPtr & 0x000F0FFF) == 0);
+    assert((*Placeholder & 0x000F0FFF) == 0);
+
     Value = (Value >> 16) & 0xFFFF;
-    *TargetPtr |= Value & 0xFFF;
+    *TargetPtr = *Placeholder | (Value & 0xFFF);
     *TargetPtr |= ((Value >> 12) & 0xF) << 16;
     break;
-
   // Write 24 bit relative value to the branch instruction.
   case ELF::R_ARM_PC24 :    // Fall through.
   case ELF::R_ARM_CALL :    // Fall through.
-  case ELF::R_ARM_JUMP24 :
+  case ELF::R_ARM_JUMP24: {
     int32_t RelValue = static_cast<int32_t>(Value - FinalAddress - 8);
     RelValue = (RelValue & 0x03FFFFFC) >> 2;
+    assert((*TargetPtr & 0xFFFFFF) == 0xFFFFFE);
     *TargetPtr &= 0xFF000000;
     *TargetPtr |= RelValue;
     break;
   }
+  case ELF::R_ARM_PRIVATE_0:
+    // This relocation is reserved by the ARM ELF ABI for internal use. We
+    // appropriate it here to act as an R_ARM_ABS32 without any addend for use
+    // in the stubs created during JIT (which can't put an addend into the
+    // original object file).
+    *TargetPtr = Value;
+    break;
+  }
 }
 
 void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
@@ -425,6 +500,8 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
                                            uint32_t Value,
                                            uint32_t Type,
                                            int32_t Addend) {
+  uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress +
+                                                      Offset);
   uint32_t* TargetPtr = (uint32_t*)(Section.Address + Offset);
   Value += Addend;
 
@@ -442,19 +519,30 @@ void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
     llvm_unreachable("Not implemented relocation type!");
     break;
   case ELF::R_MIPS_32:
-    *TargetPtr = Value + (*TargetPtr);
+    *TargetPtr = Value + (*Placeholder);
     break;
   case ELF::R_MIPS_26:
-    *TargetPtr = ((*TargetPtr) & 0xfc000000) | (( Value & 0x0fffffff) >> 2);
+    *TargetPtr = ((*Placeholder) & 0xfc000000) | (( Value & 0x0fffffff) >> 2);
     break;
   case ELF::R_MIPS_HI16:
     // Get the higher 16-bits. Also add 1 if bit 15 is 1.
-    Value += ((*TargetPtr) & 0x0000ffff) << 16;
+    Value += ((*Placeholder) & 0x0000ffff) << 16;
+    *TargetPtr = ((*Placeholder) & 0xffff0000) |
+                 (((Value + 0x8000) >> 16) & 0xffff);
+    break;
+  case ELF::R_MIPS_LO16:
+    Value += ((*Placeholder) & 0x0000ffff);
+    *TargetPtr = ((*Placeholder) & 0xffff0000) | (Value & 0xffff);
+    break;
+  case ELF::R_MIPS_UNUSED1:
+    // Similar to ELF::R_ARM_PRIVATE_0, R_MIPS_UNUSED1 and R_MIPS_UNUSED2
+    // are used for internal JIT purpose. These relocations are similar to
+    // R_MIPS_HI16 and R_MIPS_LO16, but they do not take any addend into
+    // account.
     *TargetPtr = ((*TargetPtr) & 0xffff0000) |
                  (((Value + 0x8000) >> 16) & 0xffff);
     break;
-   case ELF::R_MIPS_LO16:
-    Value += ((*TargetPtr) & 0x0000ffff);
+  case ELF::R_MIPS_UNUSED2:
     *TargetPtr = ((*TargetPtr) & 0xffff0000) | (Value & 0xffff);
     break;
    }
@@ -499,9 +587,13 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
   error_code err;
   for (section_iterator si = Obj.begin_sections(),
      se = Obj.end_sections(); si != se; si.increment(err)) {
-    StringRef SectionName;
-    check(si->getName(SectionName));
-    if (SectionName != ".opd")
+    section_iterator RelSecI = si->getRelocatedSection();
+    if (RelSecI == Obj.end_sections())
+      continue;
+
+    StringRef RelSectionName;
+    check(RelSecI->getName(RelSectionName));
+    if (RelSectionName != ".opd")
       continue;
 
     for (relocation_iterator i = si->begin_relocations(),
@@ -517,12 +609,11 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
         continue;
       }
 
-      SymbolRef TargetSymbol;
       uint64_t TargetSymbolOffset;
-      int64_t TargetAdditionalInfo;
-      check(i->getSymbol(TargetSymbol));
+      symbol_iterator TargetSymbol = i->getSymbol();
       check(i->getOffset(TargetSymbolOffset));
-      check(i->getAdditionalInfo(TargetAdditionalInfo));
+      int64_t Addend;
+      check(getELFRelocationAddend(*i, Addend));
 
       i = i.increment(err);
       if (i == e)
@@ -538,13 +629,13 @@ void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
       // Finally compares the Symbol value and the target symbol offset
       // to check if this .opd entry refers to the symbol the relocation
       // points to.
-      if (Rel.Addend != (intptr_t)TargetSymbolOffset)
+      if (Rel.Addend != (int64_t)TargetSymbolOffset)
         continue;
 
       section_iterator tsi(Obj.end_sections());
-      check(TargetSymbol.getSection(tsi));
+      check(TargetSymbol->getSection(tsi));
       Rel.SectionID = findOrEmitSection(Obj, (*tsi), true, LocalSections);
-      Rel.Addend = (intptr_t)TargetAdditionalInfo;
+      Rel.Addend = (intptr_t)Addend;
       return;
     }
   }
@@ -688,20 +779,42 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section,
   }
 }
 
+// The target location for the relocation is described by RE.SectionID and
+// RE.Offset.  RE.SectionID can be used to find the SectionEntry.  Each
+// SectionEntry has three members describing its location.
+// SectionEntry::Address is the address at which the section has been loaded
+// into memory in the current (host) process.  SectionEntry::LoadAddress is the
+// address that the section will have in the target process.
+// SectionEntry::ObjAddress is the address of the bits for this section in the
+// original emitted object image (also in the current address space).
+//
+// Relocations will be applied as if the section were loaded at
+// SectionEntry::LoadAddress, but they will be applied at an address based
+// on SectionEntry::Address.  SectionEntry::ObjAddress will be used to refer to
+// Target memory contents if they are required for value calculations.
+//
+// The Value parameter here is the load address of the symbol for the
+// relocation to be applied.  For relocations which refer to symbols in the
+// current object Value will be the LoadAddress of the section in which
+// the symbol resides (RE.Addend provides additional information about the
+// symbol location).  For external symbols, Value will be the address of the
+// symbol in the target address space.
 void RuntimeDyldELF::resolveRelocation(const RelocationEntry &RE,
-				       uint64_t Value) {
+                                       uint64_t Value) {
   const SectionEntry &Section = Sections[RE.SectionID];
-  return resolveRelocation(Section, RE.Offset, Value, RE.RelType, RE.Addend);
+  return resolveRelocation(Section, RE.Offset, Value, RE.RelType, RE.Addend,
+                           RE.SymOffset);
 }
 
 void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
                                        uint64_t Offset,
                                        uint64_t Value,
                                        uint32_t Type,
-                                       int64_t Addend) {
+                                       int64_t  Addend,
+                                       uint64_t SymOffset) {
   switch (Arch) {
   case Triple::x86_64:
-    resolveX86_64Relocation(Section, Offset, Value, Type, Addend);
+    resolveX86_64Relocation(Section, Offset, Value, Type, Addend, SymOffset);
     break;
   case Triple::x86:
     resolveX86Relocation(Section, Offset,
@@ -723,7 +836,8 @@ void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
                           (uint32_t)(Value & 0xffffffffL), Type,
                           (uint32_t)(Addend & 0xffffffffL));
     break;
-  case Triple::ppc64:
+  case Triple::ppc64:   // Fall through.
+  case Triple::ppc64le:
     resolvePPC64Relocation(Section, Offset, Value, Type, Addend);
     break;
   case Triple::systemz:
@@ -742,31 +856,37 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
   uint64_t RelType;
   Check(RelI.getType(RelType));
   int64_t Addend;
-  Check(RelI.getAdditionalInfo(Addend));
-  SymbolRef Symbol;
-  Check(RelI.getSymbol(Symbol));
+  Check(getELFRelocationAddend(RelI, Addend));
+  symbol_iterator Symbol = RelI.getSymbol();
 
   // Obtain the symbol name which is referenced in the relocation
   StringRef TargetName;
-  Symbol.getName(TargetName);
+  if (Symbol != Obj.end_symbols())
+    Symbol->getName(TargetName);
   DEBUG(dbgs() << "\t\tRelType: " << RelType
                << " Addend: " << Addend
                << " TargetName: " << TargetName
                << "\n");
   RelocationValueRef Value;
   // First search for the symbol in the local symbol table
-  SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
-  SymbolRef::Type SymType;
-  Symbol.getType(SymType);
+  SymbolTableMap::const_iterator lsi = Symbols.end();
+  SymbolRef::Type SymType = SymbolRef::ST_Unknown;
+  if (Symbol != Obj.end_symbols()) {
+    lsi = Symbols.find(TargetName.data());
+    Symbol->getType(SymType);
+  }
   if (lsi != Symbols.end()) {
     Value.SectionID = lsi->second.first;
+    Value.Offset = lsi->second.second;
     Value.Addend = lsi->second.second + Addend;
   } else {
     // Search for the symbol in the global symbol table
-    SymbolTableMap::const_iterator gsi =
-        GlobalSymbolTable.find(TargetName.data());
+    SymbolTableMap::const_iterator gsi = GlobalSymbolTable.end();
+    if (Symbol != Obj.end_symbols())
+      gsi = GlobalSymbolTable.find(TargetName.data());
     if (gsi != GlobalSymbolTable.end()) {
       Value.SectionID = gsi->second.first;
+      Value.Offset = gsi->second.second;
       Value.Addend = gsi->second.second + Addend;
     } else {
       switch (SymType) {
@@ -775,7 +895,7 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
           // and can be changed by another developers. Maybe best way is add
           // a new symbol type ST_Section to SymbolRef and use it.
           section_iterator si(Obj.end_sections());
-          Symbol.getSection(si);
+          Symbol->getSection(si);
           if (si == Obj.end_sections())
             llvm_unreachable("Symbol section not found, bad object file format!");
           DEBUG(dbgs() << "\t\tThis is section symbol\n");
@@ -789,9 +909,17 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
           Value.Addend = Addend;
           break;
         }
+        case SymbolRef::ST_Data:
         case SymbolRef::ST_Unknown: {
           Value.SymbolName = TargetName.data();
           Value.Addend = Addend;
+
+          // Absolute relocations will have a zero symbol ID (STN_UNDEF), which
+          // will manifest here as a NULL symbol name.
+          // We can set this as a valid (but empty) symbol name, and rely
+          // on addRelocationForSymbol to handle this.
+          if (!Value.SymbolName)
+              Value.SymbolName = "";
           break;
         }
         default:
@@ -876,7 +1004,7 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
       uint8_t *StubTargetAddr = createStubFunction(Section.Address +
                                                    Section.StubOffset);
       RelocationEntry RE(SectionID, StubTargetAddr - Section.Address,
-                         ELF::R_ARM_ABS32, Value.Addend);
+                         ELF::R_ARM_PRIVATE_0, Value.Addend);
       if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
@@ -903,8 +1031,8 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
     //  Look up for existing stub.
     StubMap::const_iterator i = Stubs.find(Value);
     if (i != Stubs.end()) {
-      resolveRelocation(Section, Offset,
-                        (uint64_t)Section.Address + i->second, RelType, 0);
+      RelocationEntry RE(SectionID, Offset, RelType, i->second);
+      addRelocationForSection(RE, SectionID);
       DEBUG(dbgs() << " Stub function found\n");
     } else {
       // Create a new stub function.
@@ -916,10 +1044,10 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
       // Creating Hi and Lo relocations for the filled stub instructions.
       RelocationEntry REHi(SectionID,
                            StubTargetAddr - Section.Address,
-                           ELF::R_MIPS_HI16, Value.Addend);
+                           ELF::R_MIPS_UNUSED1, Value.Addend);
       RelocationEntry RELo(SectionID,
                            StubTargetAddr - Section.Address + 4,
-                           ELF::R_MIPS_LO16, Value.Addend);
+                           ELF::R_MIPS_UNUSED2, Value.Addend);
 
       if (Value.SymbolName) {
         addRelocationForSymbol(REHi, Value.SymbolName);
@@ -929,12 +1057,11 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
         addRelocationForSection(RELo, Value.SectionID);
       }
 
-      resolveRelocation(Section, Offset,
-                        (uint64_t)Section.Address + Section.StubOffset,
-                        RelType, 0);
+      RelocationEntry RE(SectionID, Offset, RelType, Section.StubOffset);
+      addRelocationForSection(RE, SectionID);
       Section.StubOffset += getMaxStubSize();
     }
-  } else if (Arch == Triple::ppc64) {
+  } else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le) {
     if (RelType == ELF::R_PPC64_REL24) {
       // A PPC branch relocation will need a stub function if the target is
       // an external symbol (Symbol::ST_Unknown) or if the target address
@@ -1017,7 +1144,10 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
       RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
       // Extra check to avoid relocation againt empty symbols (usually
       // the R_PPC64_TOC).
-      if (Value.SymbolName && !TargetName.empty())
+      if (SymType != SymbolRef::ST_Unknown && TargetName.empty())
+        Value.SymbolName = NULL;
+
+      if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
         addRelocationForSection(RE, Value.SectionID);
@@ -1069,8 +1199,67 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
                         ELF::R_390_PC32DBL, Addend);
     else
       resolveRelocation(Section, Offset, StubAddress, RelType, Addend);
+  } else if (Arch == Triple::x86_64 && RelType == ELF::R_X86_64_PLT32) {
+    // The way the PLT relocations normally work is that the linker allocates the
+    // PLT and this relocation makes a PC-relative call into the PLT.  The PLT
+    // entry will then jump to an address provided by the GOT.  On first call, the
+    // GOT address will point back into PLT code that resolves the symbol.  After
+    // the first call, the GOT entry points to the actual function.
+    //
+    // For local functions we're ignoring all of that here and just replacing
+    // the PLT32 relocation type with PC32, which will translate the relocation
+    // into a PC-relative call directly to the function. For external symbols we
+    // can't be sure the function will be within 2^32 bytes of the call site, so
+    // we need to create a stub, which calls into the GOT.  This case is
+    // equivalent to the usual PLT implementation except that we use the stub
+    // mechanism in RuntimeDyld (which puts stubs at the end of the section)
+    // rather than allocating a PLT section.
+    if (Value.SymbolName) {
+      // This is a call to an external function.
+      // Look for an existing stub.
+      SectionEntry &Section = Sections[SectionID];
+      StubMap::const_iterator i = Stubs.find(Value);
+      uintptr_t StubAddress;
+      if (i != Stubs.end()) {
+        StubAddress = uintptr_t(Section.Address) + i->second;
+        DEBUG(dbgs() << " Stub function found\n");
+      } else {
+        // Create a new stub function (equivalent to a PLT entry).
+        DEBUG(dbgs() << " Create a new stub function\n");
+
+        uintptr_t BaseAddress = uintptr_t(Section.Address);
+        uintptr_t StubAlignment = getStubAlignment();
+        StubAddress = (BaseAddress + Section.StubOffset +
+                      StubAlignment - 1) & -StubAlignment;
+        unsigned StubOffset = StubAddress - BaseAddress;
+        Stubs[Value] = StubOffset;
+        createStubFunction((uint8_t *)StubAddress);
+
+        // Create a GOT entry for the external function.
+        GOTEntries.push_back(Value);
+
+        // Make our stub function a relative call to the GOT entry.
+        RelocationEntry RE(SectionID, StubOffset + 2,
+                           ELF::R_X86_64_GOTPCREL, -4);
+        addRelocationForSymbol(RE, Value.SymbolName);
+
+        // Bump our stub offset counter
+        Section.StubOffset = StubOffset + getMaxStubSize();
+      }
+
+      // Make the target call a call into the stub table.
+      resolveRelocation(Section, Offset, StubAddress,
+                      ELF::R_X86_64_PC32, Addend);
+    } else {
+      RelocationEntry RE(SectionID, Offset, ELF::R_X86_64_PC32, Value.Addend,
+                         Value.Offset);
+      addRelocationForSection(RE, Value.SectionID);
+    }
   } else {
-    RelocationEntry RE(SectionID, Offset, RelType, Value.Addend);
+    if (Arch == Triple::x86_64 && RelType == ELF::R_X86_64_GOTPCREL) {
+      GOTEntries.push_back(Value);
+    }
+    RelocationEntry RE(SectionID, Offset, RelType, Value.Addend, Value.Offset);
     if (Value.SymbolName)
       addRelocationForSymbol(RE, Value.SymbolName);
     else
@@ -1078,6 +1267,137 @@ void RuntimeDyldELF::processRelocationRef(unsigned SectionID,
   }
 }
 
+void RuntimeDyldELF::updateGOTEntries(StringRef Name, uint64_t Addr) {
+
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::iterator it;
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::iterator end = GOTs.end();
+
+  for (it = GOTs.begin(); it != end; ++it) {
+    GOTRelocations &GOTEntries = it->second;
+    for (int i = 0, e = GOTEntries.size(); i != e; ++i) {
+      if (GOTEntries[i].SymbolName != 0 && GOTEntries[i].SymbolName == Name) {
+        GOTEntries[i].Offset = Addr;
+      }
+    }
+  }
+}
+
+size_t RuntimeDyldELF::getGOTEntrySize() {
+  // We don't use the GOT in all of these cases, but it's essentially free
+  // to put them all here.
+  size_t Result = 0;
+  switch (Arch) {
+  case Triple::x86_64:
+  case Triple::aarch64:
+  case Triple::ppc64:
+  case Triple::ppc64le:
+  case Triple::systemz:
+    Result = sizeof(uint64_t);
+    break;
+  case Triple::x86:
+  case Triple::arm:
+  case Triple::thumb:
+  case Triple::mips:
+  case Triple::mipsel:
+    Result = sizeof(uint32_t);
+    break;
+  default: llvm_unreachable("Unsupported CPU type!");
+  }
+  return Result;
+}
+
+uint64_t RuntimeDyldELF::findGOTEntry(uint64_t LoadAddress,
+                                      uint64_t Offset) {
+
+  const size_t GOTEntrySize = getGOTEntrySize();
+
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::const_iterator it;
+  SmallVectorImpl<std::pair<SID, GOTRelocations> >::const_iterator end = GOTs.end();
+
+  int GOTIndex = -1;
+  for (it = GOTs.begin(); it != end; ++it) {
+    SID GOTSectionID = it->first;
+    const GOTRelocations &GOTEntries = it->second;
+
+    // Find the matching entry in our vector.
+    uint64_t SymbolOffset = 0;
+    for (int i = 0, e = GOTEntries.size(); i != e; ++i) {
+      if (GOTEntries[i].SymbolName == 0) {
+        if (getSectionLoadAddress(GOTEntries[i].SectionID) == LoadAddress &&
+            GOTEntries[i].Offset == Offset) {
+          GOTIndex = i;
+          SymbolOffset = GOTEntries[i].Offset;
+          break;
+        }
+      } else {
+        // GOT entries for external symbols use the addend as the address when
+        // the external symbol has been resolved.
+        if (GOTEntries[i].Offset == LoadAddress) {
+          GOTIndex = i;
+          // Don't use the Addend here.  The relocation handler will use it.
+          break;
+        }
+      }
+    }
+
+    if (GOTIndex != -1) {
+      if (GOTEntrySize == sizeof(uint64_t)) {
+        uint64_t *LocalGOTAddr = (uint64_t*)getSectionAddress(GOTSectionID);
+        // Fill in this entry with the address of the symbol being referenced.
+        LocalGOTAddr[GOTIndex] = LoadAddress + SymbolOffset;
+      } else {
+        uint32_t *LocalGOTAddr = (uint32_t*)getSectionAddress(GOTSectionID);
+        // Fill in this entry with the address of the symbol being referenced.
+        LocalGOTAddr[GOTIndex] = (uint32_t)(LoadAddress + SymbolOffset);
+      }
+
+      // Calculate the load address of this entry
+      return getSectionLoadAddress(GOTSectionID) + (GOTIndex * GOTEntrySize);
+    }
+  }
+
+  assert(GOTIndex != -1 && "Unable to find requested GOT entry.");
+  return 0;
+}
+
+void RuntimeDyldELF::finalizeLoad(ObjSectionToIDMap &SectionMap) {
+  // If necessary, allocate the global offset table
+  if (MemMgr) {
+    // Allocate the GOT if necessary
+    size_t numGOTEntries = GOTEntries.size();
+    if (numGOTEntries != 0) {
+      // Allocate memory for the section
+      unsigned SectionID = Sections.size();
+      size_t TotalSize = numGOTEntries * getGOTEntrySize();
+      uint8_t *Addr = MemMgr->allocateDataSection(TotalSize, getGOTEntrySize(),
+                                                  SectionID, ".got", false);
+      if (!Addr)
+        report_fatal_error("Unable to allocate memory for GOT!");
+
+      GOTs.push_back(std::make_pair(SectionID, GOTEntries));
+      Sections.push_back(SectionEntry(".got", Addr, TotalSize, 0));
+      // For now, initialize all GOT entries to zero.  We'll fill them in as
+      // needed when GOT-based relocations are applied.
+      memset(Addr, 0, TotalSize);
+    }
+  }
+  else {
+    report_fatal_error("Unable to allocate memory for GOT!");
+  }
+
+  // Look for and record the EH frame section.
+  ObjSectionToIDMap::iterator i, e;
+  for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
+    const SectionRef &Section = i->first;
+    StringRef Name;
+    Section.getName(Name);
+    if (Name == ".eh_frame") {
+      UnregisteredEHFrameSections.push_back(i->second);
+      break;
+    }
+  }
+}
+
 bool RuntimeDyldELF::isCompatibleFormat(const ObjectBuffer *Buffer) const {
   if (Buffer->getBufferSize() < strlen(ELF::ElfMagic))
     return false;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index 794c7ec..3adf827 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -15,6 +15,7 @@
 #define LLVM_RUNTIME_DYLD_ELF_H
 
 #include "RuntimeDyldImpl.h"
+#include "llvm/ADT/DenseMap.h"
 
 using namespace llvm;
 
@@ -35,13 +36,15 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
                          uint64_t Offset,
                          uint64_t Value,
                          uint32_t Type,
-                         int64_t Addend);
+                         int64_t Addend,
+                         uint64_t SymOffset=0);
 
   void resolveX86_64Relocation(const SectionEntry &Section,
                                uint64_t Offset,
                                uint64_t Value,
                                uint32_t Type,
-                               int64_t Addend);
+                               int64_t  Addend,
+                               uint64_t SymOffset);
 
   void resolveX86Relocation(const SectionEntry &Section,
                             uint64_t Offset,
@@ -79,13 +82,55 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
                                 uint32_t Type,
                                 int64_t Addend);
 
+  unsigned getMaxStubSize() {
+    if (Arch == Triple::aarch64)
+      return 20; // movz; movk; movk; movk; br
+    if (Arch == Triple::arm || Arch == Triple::thumb)
+      return 8; // 32-bit instruction and 32-bit address
+    else if (Arch == Triple::mipsel || Arch == Triple::mips)
+      return 16;
+    else if (Arch == Triple::ppc64 || Arch == Triple::ppc64le)
+      return 44;
+    else if (Arch == Triple::x86_64)
+      return 6; // 2-byte jmp instruction + 32-bit relative address
+    else if (Arch == Triple::systemz)
+      return 16;
+    else
+      return 0;
+  }
+
+  unsigned getStubAlignment() {
+    if (Arch == Triple::systemz)
+      return 8;
+    else
+      return 1;
+  }
+
   uint64_t findPPC64TOC() const;
   void findOPDEntrySection(ObjectImage &Obj,
                            ObjSectionToIDMap &LocalSections,
                            RelocationValueRef &Rel);
 
+  uint64_t findGOTEntry(uint64_t LoadAddr, uint64_t Offset);
+  size_t getGOTEntrySize();
+
+  virtual void updateGOTEntries(StringRef Name, uint64_t Addr);
+
+  // Relocation entries for symbols whose position-independant offset is
+  // updated in a global offset table.
+  typedef SmallVector<RelocationValueRef, 2> GOTRelocations;
+  GOTRelocations GOTEntries; // List of entries requiring finalization.
+  SmallVector<std::pair<SID, GOTRelocations>, 8> GOTs; // Allocated tables.
+
+  // When a module is loaded we save the SectionID of the EH frame section
+  // in a table until we receive a request to register all unregistered
+  // EH frame sections with the memory manager.
+  SmallVector<SID, 2> UnregisteredEHFrameSections;
+  SmallVector<SID, 2> RegisteredEHFrameSections;
+
 public:
-  RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
+  RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm)
+                                          {}
 
   virtual void resolveRelocation(const RelocationEntry &RE, uint64_t Value);
   virtual void processRelocationRef(unsigned SectionID,
@@ -96,7 +141,9 @@ public:
                                     StubMap &Stubs);
   virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
   virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
-  virtual StringRef getEHFrameSection();
+  virtual void registerEHFrames();
+  virtual void deregisterEHFrames();
+  virtual void finalizeLoad(ObjSectionToIDMap &SectionMap);
   virtual ~RuntimeDyldELF();
 };
 
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 383ffab..3014b30 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -25,6 +25,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Host.h"
+#include "llvm/Support/Mutex.h"
 #include "llvm/Support/SwapByteOrder.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
@@ -80,14 +81,18 @@ public:
   unsigned SectionID;
 
   /// Offset - offset into the section.
-  uintptr_t Offset;
+  uint64_t Offset;
 
   /// RelType - relocation type.
   uint32_t RelType;
 
   /// Addend - the relocation addend encoded in the instruction itself.  Also
   /// used to make a relocation section relative instead of symbol relative.
-  intptr_t Addend;
+  int64_t Addend;
+
+  /// SymOffset - Section offset of the relocation entry's symbol (used for GOT
+  /// lookup).
+  uint64_t SymOffset;
 
   /// True if this is a PCRel relocation (MachO specific).
   bool IsPCRel;
@@ -97,26 +102,39 @@ public:
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend)
     : SectionID(id), Offset(offset), RelType(type), Addend(addend),
-      IsPCRel(false), Size(0) {}
+      SymOffset(0), IsPCRel(false), Size(0) {}
+
+  RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
+                  uint64_t symoffset)
+    : SectionID(id), Offset(offset), RelType(type), Addend(addend),
+      SymOffset(symoffset), IsPCRel(false), Size(0) {}
 
   RelocationEntry(unsigned id, uint64_t offset, uint32_t type, int64_t addend,
                   bool IsPCRel, unsigned Size)
     : SectionID(id), Offset(offset), RelType(type), Addend(addend),
-      IsPCRel(IsPCRel), Size(Size) {}
+      SymOffset(0), IsPCRel(IsPCRel), Size(Size) {}
 };
 
 class RelocationValueRef {
 public:
   unsigned  SectionID;
-  intptr_t  Addend;
+  uint64_t  Offset;
+  int64_t   Addend;
   const char *SymbolName;
-  RelocationValueRef(): SectionID(0), Addend(0), SymbolName(0) {}
+  RelocationValueRef(): SectionID(0), Offset(0), Addend(0), SymbolName(0) {}
 
   inline bool operator==(const RelocationValueRef &Other) const {
-    return std::memcmp(this, &Other, sizeof(RelocationValueRef)) == 0;
+    return SectionID == Other.SectionID && Offset == Other.Offset &&
+           Addend == Other.Addend && SymbolName == Other.SymbolName;
   }
   inline bool operator <(const RelocationValueRef &Other) const {
-    return std::memcmp(this, &Other, sizeof(RelocationValueRef)) < 0;
+    if (SectionID != Other.SectionID)
+      return SectionID < Other.SectionID;
+    if (Offset != Other.Offset)
+      return Offset < Other.Offset;
+    if (Addend != Other.Addend)
+      return Addend < Other.Addend;
+    return SymbolName < Other.SymbolName;
   }
 };
 
@@ -130,6 +148,9 @@ protected:
   typedef SmallVector<SectionEntry, 64> SectionList;
   SectionList Sections;
 
+  typedef unsigned SID; // Type for SectionIDs
+  #define RTDYLD_INVALID_SECTION_ID ((SID)(-1)) 
+
   // Keep a map of sections from object file to the SectionID which
   // references it.
   typedef std::map<SectionRef, unsigned> ObjSectionToIDMap;
@@ -164,30 +185,22 @@ protected:
   typedef std::map<RelocationValueRef, uintptr_t> StubMap;
 
   Triple::ArchType Arch;
-
-  inline unsigned getMaxStubSize() {
-    if (Arch == Triple::aarch64)
-      return 20; // movz; movk; movk; movk; br
-    if (Arch == Triple::arm || Arch == Triple::thumb)
-      return 8; // 32-bit instruction and 32-bit address
-    else if (Arch == Triple::mipsel || Arch == Triple::mips)
-      return 16;
-    else if (Arch == Triple::ppc64)
-      return 44;
-    else if (Arch == Triple::x86_64)
-      return 8; // GOT
-    else if (Arch == Triple::systemz)
-      return 16;
-    else
-      return 0;
-  }
-
-  inline unsigned getStubAlignment() {
-    if (Arch == Triple::systemz)
-      return 8;
-    else
-      return 1;
-  }
+  bool IsTargetLittleEndian;
+
+  // This mutex prevents simultaneously loading objects from two different
+  // threads.  This keeps us from having to protect individual data structures
+  // and guarantees that section allocation requests to the memory manager
+  // won't be interleaved between modules.  It is also used in mapSectionAddress
+  // and resolveRelocations to protect write access to internal data structures.
+  //
+  // loadObject may be called on the same thread during the handling of of
+  // processRelocations, and that's OK.  The handling of the relocation lists
+  // is written in such a way as to work correctly if new elements are added to
+  // the end of the list while the list is being processed.
+  sys::Mutex lock;
+
+  virtual unsigned getMaxStubSize() = 0;
+  virtual unsigned getStubAlignment() = 0;
 
   bool HasError;
   std::string ErrorStr;
@@ -208,14 +221,14 @@ protected:
   }
 
   void writeInt16BE(uint8_t *Addr, uint16_t Value) {
-    if (sys::IsLittleEndianHost)
+    if (IsTargetLittleEndian)
       Value = sys::SwapByteOrder(Value);
     *Addr     = (Value >> 8) & 0xFF;
     *(Addr+1) = Value & 0xFF;
   }
 
   void writeInt32BE(uint8_t *Addr, uint32_t Value) {
-    if (sys::IsLittleEndianHost)
+    if (IsTargetLittleEndian)
       Value = sys::SwapByteOrder(Value);
     *Addr     = (Value >> 24) & 0xFF;
     *(Addr+1) = (Value >> 16) & 0xFF;
@@ -224,7 +237,7 @@ protected:
   }
 
   void writeInt64BE(uint8_t *Addr, uint64_t Value) {
-    if (sys::IsLittleEndianHost)
+    if (IsTargetLittleEndian)
       Value = sys::SwapByteOrder(Value);
     *Addr     = (Value >> 56) & 0xFF;
     *(Addr+1) = (Value >> 48) & 0xFF;
@@ -292,6 +305,11 @@ protected:
 
   /// \brief Resolve relocations to external symbols.
   void resolveExternalSymbols();
+
+  /// \brief Update GOT entries for external symbols.
+  // The base class does nothing.  ELF overrides this.
+  virtual void updateGOTEntries(StringRef Name, uint64_t Addr) {}
+
   virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
 public:
   RuntimeDyldImpl(RTDyldMemoryManager *mm) : MemMgr(mm), HasError(false) {}
@@ -303,18 +321,20 @@ public:
   void *getSymbolAddress(StringRef Name) {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
-    if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+    SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+    if (pos == GlobalSymbolTable.end())
       return 0;
-    SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+    SymbolLoc Loc = pos->second;
     return getSectionAddress(Loc.first) + Loc.second;
   }
 
   uint64_t getSymbolLoadAddress(StringRef Name) {
     // FIXME: Just look up as a function for now. Overly simple of course.
     // Work in progress.
-    if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+    SymbolTableMap::const_iterator pos = GlobalSymbolTable.find(Name);
+    if (pos == GlobalSymbolTable.end())
       return 0;
-    SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+    SymbolLoc Loc = pos->second;
     return getSectionLoadAddress(Loc.first) + Loc.second;
   }
 
@@ -335,7 +355,11 @@ public:
 
   virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const = 0;
 
-  virtual StringRef getEHFrameSection();
+  virtual void registerEHFrames();
+
+  virtual void deregisterEHFrames();
+
+  virtual void finalizeLoad(ObjSectionToIDMap &SectionMap) {}
 };
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 01a3fd9..5b92867 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -55,35 +55,80 @@ static intptr_t computeDelta(SectionEntry *A, SectionEntry *B) {
   return ObjDistance - MemDistance;
 }
 
-StringRef RuntimeDyldMachO::getEHFrameSection() {
-  SectionEntry *Text = NULL;
-  SectionEntry *EHFrame = NULL;
-  SectionEntry *ExceptTab = NULL;
-  for (int i = 0, e = Sections.size(); i != e; ++i) {
-    if (Sections[i].Name == "__eh_frame")
-      EHFrame = &Sections[i];
-    else if (Sections[i].Name == "__text")
-      Text = &Sections[i];
-    else if (Sections[i].Name == "__gcc_except_tab")
-      ExceptTab = &Sections[i];
+void RuntimeDyldMachO::registerEHFrames() {
+
+  if (!MemMgr)
+    return;
+  for (int i = 0, e = UnregisteredEHFrameSections.size(); i != e; ++i) {
+    EHFrameRelatedSections &SectionInfo = UnregisteredEHFrameSections[i];
+    if (SectionInfo.EHFrameSID == RTDYLD_INVALID_SECTION_ID ||
+        SectionInfo.TextSID == RTDYLD_INVALID_SECTION_ID)
+      continue;
+    SectionEntry *Text = &Sections[SectionInfo.TextSID];
+    SectionEntry *EHFrame = &Sections[SectionInfo.EHFrameSID];
+    SectionEntry *ExceptTab = NULL;
+    if (SectionInfo.ExceptTabSID != RTDYLD_INVALID_SECTION_ID)
+      ExceptTab = &Sections[SectionInfo.ExceptTabSID];
+
+    intptr_t DeltaForText = computeDelta(Text, EHFrame);
+    intptr_t DeltaForEH = 0;
+    if (ExceptTab)
+      DeltaForEH = computeDelta(ExceptTab, EHFrame);
+
+    unsigned char *P = EHFrame->Address;
+    unsigned char *End = P + EHFrame->Size;
+    do  {
+      P = processFDE(P, DeltaForText, DeltaForEH);
+    } while(P != End);
+
+    MemMgr->registerEHFrames(EHFrame->Address,
+                             EHFrame->LoadAddress,
+                             EHFrame->Size);
   }
-  if (Text == NULL || EHFrame == NULL)
-    return StringRef();
-
-  intptr_t DeltaForText = computeDelta(Text, EHFrame);
-  intptr_t DeltaForEH = 0;
-  if (ExceptTab)
-    DeltaForEH = computeDelta(ExceptTab, EHFrame);
-
-  unsigned char *P = EHFrame->Address;
-  unsigned char *End = P + EHFrame->Size;
-  do  {
-    P = processFDE(P, DeltaForText, DeltaForEH);
-  } while(P != End);
+  UnregisteredEHFrameSections.clear();
+}
 
-  return StringRef((char*)EHFrame->Address, EHFrame->Size);
+void RuntimeDyldMachO::finalizeLoad(ObjSectionToIDMap &SectionMap) {
+  unsigned EHFrameSID = RTDYLD_INVALID_SECTION_ID;
+  unsigned TextSID = RTDYLD_INVALID_SECTION_ID;
+  unsigned ExceptTabSID = RTDYLD_INVALID_SECTION_ID;
+  ObjSectionToIDMap::iterator i, e;
+  for (i = SectionMap.begin(), e = SectionMap.end(); i != e; ++i) {
+    const SectionRef &Section = i->first;
+    StringRef Name;
+    Section.getName(Name);
+    if (Name == "__eh_frame")
+      EHFrameSID = i->second;
+    else if (Name == "__text")
+      TextSID = i->second;
+    else if (Name == "__gcc_except_tab")
+      ExceptTabSID = i->second;
+  }
+  UnregisteredEHFrameSections.push_back(EHFrameRelatedSections(EHFrameSID,
+                                                               TextSID,
+                                                               ExceptTabSID));
 }
 
+// The target location for the relocation is described by RE.SectionID and
+// RE.Offset.  RE.SectionID can be used to find the SectionEntry.  Each
+// SectionEntry has three members describing its location.
+// SectionEntry::Address is the address at which the section has been loaded
+// into memory in the current (host) process.  SectionEntry::LoadAddress is the
+// address that the section will have in the target process.
+// SectionEntry::ObjAddress is the address of the bits for this section in the
+// original emitted object image (also in the current address space).
+//
+// Relocations will be applied as if the section were loaded at
+// SectionEntry::LoadAddress, but they will be applied at an address based
+// on SectionEntry::Address.  SectionEntry::ObjAddress will be used to refer to
+// Target memory contents if they are required for value calculations.
+//
+// The Value parameter here is the load address of the symbol for the
+// relocation to be applied.  For relocations which refer to symbols in the
+// current object Value will be the LoadAddress of the section in which
+// the symbol resides (RE.Addend provides additional information about the
+// symbol location).  For external symbols, Value will be the address of the
+// symbol in the target address space.
 void RuntimeDyldMachO::resolveRelocation(const RelocationEntry &RE,
                                          uint64_t Value) {
   const SectionEntry &Section = Sections[RE.SectionID];
@@ -160,7 +205,7 @@ bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
   switch (Type) {
   default:
     llvm_unreachable("Invalid relocation type!");
-  case macho::RIT_Vanilla: {
+  case MachO::GENERIC_RELOC_VANILLA: {
     uint8_t *p = LocalAddress;
     uint64_t ValueToWrite = Value + Addend;
     for (unsigned i = 0; i < Size; ++i) {
@@ -169,9 +214,9 @@ bool RuntimeDyldMachO::resolveI386Relocation(uint8_t *LocalAddress,
     }
     return false;
   }
-  case macho::RIT_Difference:
-  case macho::RIT_Generic_LocalDifference:
-  case macho::RIT_Generic_PreboundLazyPointer:
+  case MachO::GENERIC_RELOC_SECTDIFF:
+  case MachO::GENERIC_RELOC_LOCAL_SECTDIFF:
+  case MachO::GENERIC_RELOC_PB_LA_PTR:
     return Error("Relocation type not implemented yet!");
   }
 }
@@ -193,12 +238,12 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
   switch(Type) {
   default:
     llvm_unreachable("Invalid relocation type!");
-  case macho::RIT_X86_64_Signed1:
-  case macho::RIT_X86_64_Signed2:
-  case macho::RIT_X86_64_Signed4:
-  case macho::RIT_X86_64_Signed:
-  case macho::RIT_X86_64_Unsigned:
-  case macho::RIT_X86_64_Branch: {
+  case MachO::X86_64_RELOC_SIGNED_1:
+  case MachO::X86_64_RELOC_SIGNED_2:
+  case MachO::X86_64_RELOC_SIGNED_4:
+  case MachO::X86_64_RELOC_SIGNED:
+  case MachO::X86_64_RELOC_UNSIGNED:
+  case MachO::X86_64_RELOC_BRANCH: {
     Value += Addend;
     // Mask in the target value a byte at a time (we don't have an alignment
     // guarantee for the target address, so this is safest).
@@ -209,10 +254,10 @@ bool RuntimeDyldMachO::resolveX86_64Relocation(uint8_t *LocalAddress,
     }
     return false;
   }
-  case macho::RIT_X86_64_GOTLoad:
-  case macho::RIT_X86_64_GOT:
-  case macho::RIT_X86_64_Subtractor:
-  case macho::RIT_X86_64_TLV:
+  case MachO::X86_64_RELOC_GOT_LOAD:
+  case MachO::X86_64_RELOC_GOT:
+  case MachO::X86_64_RELOC_SUBTRACTOR:
+  case MachO::X86_64_RELOC_TLV:
     return Error("Relocation type not implemented yet!");
   }
 }
@@ -237,7 +282,7 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
   switch(Type) {
   default:
     llvm_unreachable("Invalid relocation type!");
-  case macho::RIT_Vanilla: {
+  case MachO::ARM_RELOC_VANILLA: {
     // Mask in the target value a byte at a time (we don't have an alignment
     // guarantee for the target address, so this is safest).
     uint8_t *p = (uint8_t*)LocalAddress;
@@ -247,7 +292,7 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
     }
     break;
   }
-  case macho::RIT_ARM_Branch24Bit: {
+  case MachO::ARM_RELOC_BR24: {
     // Mask the value into the target address. We know instructions are
     // 32-bit aligned, so we can do it all at once.
     uint32_t *p = (uint32_t*)LocalAddress;
@@ -263,14 +308,14 @@ bool RuntimeDyldMachO::resolveARMRelocation(uint8_t *LocalAddress,
     *p = (*p & ~0xffffff) | Value;
     break;
   }
-  case macho::RIT_ARM_ThumbBranch22Bit:
-  case macho::RIT_ARM_ThumbBranch32Bit:
-  case macho::RIT_ARM_Half:
-  case macho::RIT_ARM_HalfDifference:
-  case macho::RIT_Pair:
-  case macho::RIT_Difference:
-  case macho::RIT_ARM_LocalDifference:
-  case macho::RIT_ARM_PreboundLazyPointer:
+  case MachO::ARM_THUMB_RELOC_BR22:
+  case MachO::ARM_THUMB_32BIT_BRANCH:
+  case MachO::ARM_RELOC_HALF:
+  case MachO::ARM_RELOC_HALF_SECTDIFF:
+  case MachO::ARM_RELOC_PAIR:
+  case MachO::ARM_RELOC_SECTDIFF:
+  case MachO::ARM_RELOC_LOCAL_SECTDIFF:
+  case MachO::ARM_RELOC_PB_LA_PTR:
     return Error("Relocation type not implemented yet!");
   }
   return false;
@@ -284,9 +329,19 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
                                             StubMap &Stubs) {
   const ObjectFile *OF = Obj.getObjectFile();
   const MachOObjectFile *MachO = static_cast<const MachOObjectFile*>(OF);
-  macho::RelocationEntry RE = MachO->getRelocation(RelI.getRawDataRefImpl());
+  MachO::any_relocation_info RE= MachO->getRelocation(RelI.getRawDataRefImpl());
 
   uint32_t RelType = MachO->getAnyRelocationType(RE);
+
+  // FIXME: Properly handle scattered relocations.
+  //        For now, optimistically skip these: they can often be ignored, as
+  //        the static linker will already have applied the relocation, and it
+  //        only needs to be reapplied if symbols move relative to one another.
+  //        Note: This will fail horribly where the relocations *do* need to be
+  //        applied, but that was already the case.
+  if (MachO->isRelocationScattered(RE))
+    return;
+
   RelocationValueRef Value;
   SectionEntry &Section = Sections[SectionID];
 
@@ -302,10 +357,9 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
 
   if (isExtern) {
     // Obtain the symbol name which is referenced in the relocation
-    SymbolRef Symbol;
-    RelI.getSymbol(Symbol);
+    symbol_iterator Symbol = RelI.getSymbol();
     StringRef TargetName;
-    Symbol.getName(TargetName);
+    Symbol->getName(TargetName);
     // First search for the symbol in the local symbol table
     SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
     if (lsi != Symbols.end()) {
@@ -330,7 +384,8 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
     Value.Addend = Addend - Addr;
   }
 
-  if (Arch == Triple::x86_64 && RelType == macho::RIT_X86_64_GOT) {
+  if (Arch == Triple::x86_64 && (RelType == MachO::X86_64_RELOC_GOT ||
+                                 RelType == MachO::X86_64_RELOC_GOT_LOAD)) {
     assert(IsPCRel);
     assert(Size == 2);
     StubMap::const_iterator i = Stubs.find(Value);
@@ -341,8 +396,7 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
       Stubs[Value] = Section.StubOffset;
       uint8_t *GOTEntry = Section.Address + Section.StubOffset;
       RelocationEntry RE(SectionID, Section.StubOffset,
-                         macho::RIT_X86_64_Unsigned, Value.Addend - 4, false,
-                         3);
+                         MachO::X86_64_RELOC_UNSIGNED, 0, false, 3);
       if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
@@ -351,9 +405,9 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
       Addr = GOTEntry;
     }
     resolveRelocation(Section, Offset, (uint64_t)Addr,
-                      macho::RIT_X86_64_Unsigned, 4, true, 2);
+                      MachO::X86_64_RELOC_UNSIGNED, Value.Addend, true, 2);
   } else if (Arch == Triple::arm &&
-             (RelType & 0xf) == macho::RIT_ARM_Branch24Bit) {
+             (RelType & 0xf) == MachO::ARM_RELOC_BR24) {
     // This is an ARM branch relocation, need to use a stub function.
 
     //  Look up for existing stub.
@@ -368,7 +422,7 @@ void RuntimeDyldMachO::processRelocationRef(unsigned SectionID,
       uint8_t *StubTargetAddr = createStubFunction(Section.Address +
                                                    Section.StubOffset);
       RelocationEntry RE(SectionID, StubTargetAddr - Section.Address,
-                         macho::RIT_Vanilla, Value.Addend);
+                         MachO::GENERIC_RELOC_VANILLA, Value.Addend);
       if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index df8d3bb..bbf6aa9 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -54,6 +54,35 @@ class RuntimeDyldMachO : public RuntimeDyldImpl {
                          int64_t Addend,
                          bool isPCRel,
                          unsigned Size);
+
+  unsigned getMaxStubSize() {
+    if (Arch == Triple::arm || Arch == Triple::thumb)
+      return 8; // 32-bit instruction and 32-bit address
+    else if (Arch == Triple::x86_64)
+      return 8; // GOT entry
+    else
+      return 0;
+  }
+
+  unsigned getStubAlignment() {
+    return 1;
+  }
+
+  struct EHFrameRelatedSections {
+    EHFrameRelatedSections() : EHFrameSID(RTDYLD_INVALID_SECTION_ID),
+                               TextSID(RTDYLD_INVALID_SECTION_ID),
+                               ExceptTabSID(RTDYLD_INVALID_SECTION_ID) {}
+    EHFrameRelatedSections(SID EH, SID T, SID Ex) 
+      : EHFrameSID(EH), TextSID(T), ExceptTabSID(Ex) {}
+    SID EHFrameSID;
+    SID TextSID;
+    SID ExceptTabSID;
+  };
+
+  // When a module is loaded we save the SectionID of the EH frame section
+  // in a table until we receive a request to register all unregistered
+  // EH frame sections with the memory manager.
+  SmallVector<EHFrameRelatedSections, 2> UnregisteredEHFrameSections;
 public:
   RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
 
@@ -65,7 +94,8 @@ public:
                                     const SymbolTableMap &Symbols,
                                     StubMap &Stubs);
   virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
-  virtual StringRef getEHFrameSection();
+  virtual void registerEHFrames();
+  virtual void finalizeLoad(ObjSectionToIDMap &SectionMap);
 };
 
 } // end namespace llvm
diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index ca4330f..9b7d348 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -88,6 +88,14 @@ TargetMachine *EngineBuilder::selectTarget(const Triple &TargetTriple,
     FeaturesStr = Features.getString();
   }
 
+  // FIXME: non-iOS ARM FastISel is broken with MCJIT.
+  if (UseMCJIT &&
+      TheTriple.getArch() == Triple::arm &&
+      !TheTriple.isiOS() &&
+      OptLevel == CodeGenOpt::None) {
+    OptLevel = CodeGenOpt::Less;
+  }
+
   // Allocate a target...
   TargetMachine *Target = TheTarget->createTargetMachine(TheTriple.getTriple(),
                                                          MCPU, FeaturesStr,
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 7761127d..7decffd 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -14,6 +14,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AsmWriter.h"
+
 #include "llvm/Assembly/Writer.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
@@ -38,6 +40,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MathExtras.h"
+
 #include <algorithm>
 #include <cctype>
 using namespace llvm;
@@ -71,6 +74,8 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   default:                         Out << "cc" << cc; break;
   case CallingConv::Fast:          Out << "fastcc"; break;
   case CallingConv::Cold:          Out << "coldcc"; break;
+  case CallingConv::WebKit_JS:     Out << "webkit_jscc"; break;
+  case CallingConv::AnyReg:        Out << "anyregcc"; break;
   case CallingConv::X86_StdCall:   Out << "x86_stdcallcc"; break;
   case CallingConv::X86_FastCall:  Out << "x86_fastcallcc"; break;
   case CallingConv::X86_ThisCall:  Out << "x86_thiscallcc"; break;
@@ -81,6 +86,8 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) {
   case CallingConv::MSP430_INTR:   Out << "msp430_intrcc"; break;
   case CallingConv::PTX_Kernel:    Out << "ptx_kernel"; break;
   case CallingConv::PTX_Device:    Out << "ptx_device"; break;
+  case CallingConv::X86_64_SysV:   Out << "x86_64_sysvcc"; break;
+  case CallingConv::X86_64_Win64:  Out << "x86_64_win64cc"; break;
   }
 }
 
@@ -153,35 +160,8 @@ static void PrintLLVMName(raw_ostream &OS, const Value *V) {
                 isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
 }
 
-//===----------------------------------------------------------------------===//
-// TypePrinting Class: Type printing machinery
-//===----------------------------------------------------------------------===//
-
-/// TypePrinting - Type printing machinery.
-namespace {
-class TypePrinting {
-  TypePrinting(const TypePrinting &) LLVM_DELETED_FUNCTION;
-  void operator=(const TypePrinting&) LLVM_DELETED_FUNCTION;
-public:
-
-  /// NamedTypes - The named types that are used by the current module.
-  TypeFinder NamedTypes;
-
-  /// NumberedTypes - The numbered types, along with their value.
-  DenseMap<StructType*, unsigned> NumberedTypes;
-
-
-  TypePrinting() {}
-  ~TypePrinting() {}
-
-  void incorporateTypes(const Module &M);
-
-  void print(Type *Ty, raw_ostream &OS);
-
-  void printStructBody(StructType *Ty, raw_ostream &OS);
-};
-} // end anonymous namespace.
 
+namespace llvm {
 
 void TypePrinting::incorporateTypes(const Module &M) {
   NamedTypes.run(M, false);
@@ -313,14 +293,9 @@ void TypePrinting::printStructBody(StructType *STy, raw_ostream &OS) {
     OS << '>';
 }
 
-
-
 //===----------------------------------------------------------------------===//
 // SlotTracker Class: Enumerate slot numbers for unnamed values
 //===----------------------------------------------------------------------===//
-
-namespace {
-
 /// This class provides computation of slot numbers for LLVM Assembly writing.
 ///
 class SlotTracker {
@@ -418,8 +393,9 @@ private:
   void operator=(const SlotTracker &) LLVM_DELETED_FUNCTION;
 };
 
-}  // end anonymous namespace
-
+SlotTracker *createSlotTracker(const Module *M) {
+  return new SlotTracker(M);
+}
 
 static SlotTracker *createSlotTracker(const Value *V) {
   if (const Argument *FA = dyn_cast<Argument>(V))
@@ -1200,8 +1176,8 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
     Out << "<badref>";
 }
 
-void llvm::WriteAsOperand(raw_ostream &Out, const Value *V,
-                          bool PrintType, const Module *Context) {
+void WriteAsOperand(raw_ostream &Out, const Value *V,
+                    bool PrintType, const Module *Context) {
 
   // Fast path: Don't construct and populate a TypePrinting object if we
   // won't be needing any types printed.
@@ -1225,50 +1201,27 @@ void llvm::WriteAsOperand(raw_ostream &Out, const Value *V,
   WriteAsOperandInternal(Out, V, &TypePrinter, 0, Context);
 }
 
-namespace {
-
-class AssemblyWriter {
-  formatted_raw_ostream &Out;
-  SlotTracker &Machine;
-  const Module *TheModule;
-  TypePrinting TypePrinter;
-  AssemblyAnnotationWriter *AnnotationWriter;
-
-public:
-  inline AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
-                        const Module *M,
-                        AssemblyAnnotationWriter *AAW)
-    : Out(o), Machine(Mac), TheModule(M), AnnotationWriter(AAW) {
-    if (M)
-      TypePrinter.incorporateTypes(*M);
-  }
-
-  void printMDNodeBody(const MDNode *MD);
-  void printNamedMDNode(const NamedMDNode *NMD);
-
-  void printModule(const Module *M);
+void AssemblyWriter::init() {
+  if (TheModule)
+    TypePrinter.incorporateTypes(*TheModule);
+}
 
-  void writeOperand(const Value *Op, bool PrintType);
-  void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
-  void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
 
-  void writeAllMDNodes();
-  void writeAllAttributeGroups();
+AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
+                               const Module *M,
+                               AssemblyAnnotationWriter *AAW)
+  : Out(o), TheModule(M), Machine(Mac), AnnotationWriter(AAW) {
+  init();
+}
 
-  void printTypeIdentities();
-  void printGlobal(const GlobalVariable *GV);
-  void printAlias(const GlobalAlias *GV);
-  void printFunction(const Function *F);
-  void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
-  void printBasicBlock(const BasicBlock *BB);
-  void printInstruction(const Instruction &I);
+AssemblyWriter::AssemblyWriter(formatted_raw_ostream &o, const Module *M,
+                               AssemblyAnnotationWriter *AAW)
+  : Out(o), TheModule(M), ModuleSlotTracker(createSlotTracker(M)),
+    Machine(*ModuleSlotTracker), AnnotationWriter(AAW) {
+  init();
+}
 
-private:
-  // printInfoComment - Print a little comment after the instruction indicating
-  // which slot it occupies.
-  void printInfoComment(const Value &V);
-};
-}  // end of anonymous namespace
+AssemblyWriter::~AssemblyWriter() { }
 
 void AssemblyWriter::writeOperand(const Value *Operand, bool PrintType) {
   if (Operand == 0) {
@@ -1443,9 +1396,6 @@ static void PrintLinkage(GlobalValue::LinkageTypes LT,
   case GlobalValue::InternalLinkage:      Out << "internal ";       break;
   case GlobalValue::LinkOnceAnyLinkage:   Out << "linkonce ";       break;
   case GlobalValue::LinkOnceODRLinkage:   Out << "linkonce_odr ";   break;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
-    Out << "linkonce_odr_auto_hide ";
-    break;
   case GlobalValue::WeakAnyLinkage:       Out << "weak ";           break;
   case GlobalValue::WeakODRLinkage:       Out << "weak_odr ";       break;
   case GlobalValue::CommonLinkage:        Out << "common ";         break;
@@ -1696,6 +1646,10 @@ void AssemblyWriter::printFunction(const Function *F) {
     Out << " align " << F->getAlignment();
   if (F->hasGC())
     Out << " gc \"" << F->getGC() << '"';
+  if (F->hasPrefixData()) {
+    Out << " prefix ";
+    writeOperand(F->getPrefixData(), true);
+  }
   if (F->isDeclaration()) {
     Out << '\n';
   } else {
@@ -1772,13 +1726,18 @@ void AssemblyWriter::printBasicBlock(const BasicBlock *BB) {
 
   // Output all of the instructions in the basic block...
   for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-    printInstruction(*I);
-    Out << '\n';
+    printInstructionLine(*I);
   }
 
   if (AnnotationWriter) AnnotationWriter->emitBasicBlockEndAnnot(BB, Out);
 }
 
+/// printInstructionLine - Print an instruction and a newline character.
+void AssemblyWriter::printInstructionLine(const Instruction &I) {
+  printInstruction(I);
+  Out << '\n';
+}
+
 /// printInfoComment - Print a little comment after the instruction indicating
 /// which slot it occupies.
 ///
@@ -2093,9 +2052,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       unsigned Kind = InstMD[i].first;
        if (Kind < MDNames.size()) {
          Out << ", !" << MDNames[Kind];
-      } else {
-        Out << ", !<unknown kind #" << Kind << ">";
-      }
+       } else {
+         Out << ", !<unknown kind #" << Kind << ">";
+       }
       Out << ' ';
       WriteAsOperandInternal(Out, InstMD[i].second, &TypePrinter, &Machine,
                              TheModule);
@@ -2127,6 +2086,11 @@ static void WriteMDNodeComment(const MDNode *Node,
   }
 }
 
+void AssemblyWriter::writeMDNode(unsigned Slot, const MDNode *Node) {
+  Out << '!' << Slot << " = metadata ";
+  printMDNodeBody(Node);
+}
+
 void AssemblyWriter::writeAllMDNodes() {
   SmallVector<const MDNode *, 16> Nodes;
   Nodes.resize(Machine.mdn_size());
@@ -2135,8 +2099,7 @@ void AssemblyWriter::writeAllMDNodes() {
     Nodes[I->second] = cast<MDNode>(I->first);
 
   for (unsigned i = 0, e = Nodes.size(); i != e; ++i) {
-    Out << '!' << i << " = metadata ";
-    printMDNodeBody(Nodes[i]);
+    writeMDNode(i, Nodes[i]);
   }
 }
 
@@ -2160,6 +2123,8 @@ void AssemblyWriter::writeAllAttributeGroups() {
         << I->first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
 }
 
+} // namespace llvm
+
 //===----------------------------------------------------------------------===//
 //                       External Interface declarations
 //===----------------------------------------------------------------------===//
diff --git a/lib/IR/AsmWriter.h b/lib/IR/AsmWriter.h
new file mode 100644
index 0000000..8f4a377
--- /dev/null
+++ b/lib/IR/AsmWriter.h
@@ -0,0 +1,118 @@
+//===-- llvm/IR/AsmWriter.h - Printing LLVM IR as an assembly file - C++ --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This files defines the interface for the AssemblyWriter class used to print
+// LLVM IR and various helper classes that are used in printing.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_IR_ASSEMBLYWRITER_H
+#define LLVM_IR_ASSEMBLYWRITER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/Support/FormattedStream.h"
+
+namespace llvm {
+
+class BasicBlock;
+class Function;
+class GlobalValue;
+class Module;
+class NamedMDNode;
+class Value;
+class SlotTracker;
+
+/// Create a new SlotTracker for a Module 
+SlotTracker *createSlotTracker(const Module *M);
+
+//===----------------------------------------------------------------------===//
+// TypePrinting Class: Type printing machinery
+//===----------------------------------------------------------------------===//
+
+class TypePrinting {
+  TypePrinting(const TypePrinting &) LLVM_DELETED_FUNCTION;
+  void operator=(const TypePrinting&) LLVM_DELETED_FUNCTION;
+public:
+
+  /// NamedTypes - The named types that are used by the current module.
+  TypeFinder NamedTypes;
+
+  /// NumberedTypes - The numbered types, along with their value.
+  DenseMap<StructType*, unsigned> NumberedTypes;
+
+
+  TypePrinting() {}
+  ~TypePrinting() {}
+
+  void incorporateTypes(const Module &M);
+
+  void print(Type *Ty, raw_ostream &OS);
+
+  void printStructBody(StructType *Ty, raw_ostream &OS);
+};
+
+class AssemblyWriter {
+protected:
+  formatted_raw_ostream &Out;
+  const Module *TheModule;
+
+private:
+  OwningPtr<SlotTracker> ModuleSlotTracker;
+  SlotTracker &Machine;
+  TypePrinting TypePrinter;
+  AssemblyAnnotationWriter *AnnotationWriter;
+
+public:
+  /// Construct an AssemblyWriter with an external SlotTracker
+  AssemblyWriter(formatted_raw_ostream &o, SlotTracker &Mac,
+                 const Module *M, AssemblyAnnotationWriter *AAW);
+
+  /// Construct an AssemblyWriter with an internally allocated SlotTracker
+  AssemblyWriter(formatted_raw_ostream &o, const Module *M,
+                 AssemblyAnnotationWriter *AAW);
+
+  virtual ~AssemblyWriter();
+
+  void printMDNodeBody(const MDNode *MD);
+  void printNamedMDNode(const NamedMDNode *NMD);
+
+  void printModule(const Module *M);
+
+  void writeOperand(const Value *Op, bool PrintType);
+  void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
+  void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
+
+  void writeAllMDNodes();
+  void writeMDNode(unsigned Slot, const MDNode *Node);
+  void writeAllAttributeGroups();
+
+  void printTypeIdentities();
+  void printGlobal(const GlobalVariable *GV);
+  void printAlias(const GlobalAlias *GV);
+  void printFunction(const Function *F);
+  void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
+  void printBasicBlock(const BasicBlock *BB);
+  void printInstructionLine(const Instruction &I);
+  void printInstruction(const Instruction &I);
+
+private:
+  void init();
+
+  // printInfoComment - Print a little comment after the instruction indicating
+  // which slot it occupies.
+  void printInfoComment(const Value &V);
+};
+
+} // namespace llvm
+
+#endif //LLVM_IR_ASMWRITER_H
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index 0b6228b..ea954ac 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -27,97 +27,30 @@ class LLVMContext;
 
 //===----------------------------------------------------------------------===//
 /// \class
-/// \brief A set of classes that contain the kind and (optional) value of the
-/// attribute object. There are three main categories: enum attribute entries,
-/// represented by Attribute::AttrKind; alignment attribute entries; and string
-/// attribute enties, which are for target-dependent attributes.
-class AttributeEntry {
-  unsigned char KindID;
+/// \brief This class represents a single, uniqued attribute. That attribute
+/// could be a single enum, a tuple, or a string.
+class AttributeImpl : public FoldingSetNode {
+  unsigned char KindID; ///< Holds the AttrEntryKind of the attribute
+
+  // AttributesImpl is uniqued, these should not be publicly available.
+  void operator=(const AttributeImpl &) LLVM_DELETED_FUNCTION;
+  AttributeImpl(const AttributeImpl &) LLVM_DELETED_FUNCTION;
+
 protected:
   enum AttrEntryKind {
     EnumAttrEntry,
     AlignAttrEntry,
     StringAttrEntry
   };
-public:
-  AttributeEntry(AttrEntryKind Kind)
-    : KindID(Kind) {}
-  virtual ~AttributeEntry() {}
 
-  unsigned getKindID() const { return KindID; }
+  AttributeImpl(AttrEntryKind KindID) : KindID(KindID) {}
 
-  static inline bool classof(const AttributeEntry *) { return true; }
-};
-
-class EnumAttributeEntry : public AttributeEntry {
-  Attribute::AttrKind Kind;
 public:
-  EnumAttributeEntry(Attribute::AttrKind Kind)
-    : AttributeEntry(EnumAttrEntry), Kind(Kind) {}
-
-  Attribute::AttrKind getEnumKind() const { return Kind; }
-
-  static inline bool classof(const AttributeEntry *AE) {
-    return AE->getKindID() == EnumAttrEntry;
-  }
-  static inline bool classof(const EnumAttributeEntry *) { return true; }
-};
+  virtual ~AttributeImpl();
 
-class AlignAttributeEntry : public AttributeEntry {
-  Attribute::AttrKind Kind;
-  unsigned Align;
-public:
-  AlignAttributeEntry(Attribute::AttrKind Kind, unsigned Align)
-    : AttributeEntry(AlignAttrEntry), Kind(Kind), Align(Align) {}
-
-  Attribute::AttrKind getEnumKind() const { return Kind; }
-  unsigned getAlignment() const { return Align; }
-
-  static inline bool classof(const AttributeEntry *AE) {
-    return AE->getKindID() == AlignAttrEntry;
-  }
-  static inline bool classof(const AlignAttributeEntry *) { return true; }
-};
-
-class StringAttributeEntry : public AttributeEntry {
-  std::string Kind;
-  std::string Val;
-public:
-  StringAttributeEntry(StringRef Kind, StringRef Val = StringRef())
-    : AttributeEntry(StringAttrEntry), Kind(Kind), Val(Val) {}
-
-  StringRef getStringKind() const { return Kind; }
-  StringRef getStringValue() const { return Val; }
-
-  static inline bool classof(const AttributeEntry *AE) {
-    return AE->getKindID() == StringAttrEntry;
-  }
-  static inline bool classof(const StringAttributeEntry *) { return true; }
-};
-
-//===----------------------------------------------------------------------===//
-/// \class
-/// \brief This class represents a single, uniqued attribute. That attribute
-/// could be a single enum, a tuple, or a string.
-class AttributeImpl : public FoldingSetNode {
-  LLVMContext &Context;  ///< Global context for uniquing objects
-
-  AttributeEntry *Entry; ///< Holds the kind and value of the attribute
-
-  // AttributesImpl is uniqued, these should not be publicly available.
-  void operator=(const AttributeImpl &) LLVM_DELETED_FUNCTION;
-  AttributeImpl(const AttributeImpl &) LLVM_DELETED_FUNCTION;
-public:
-  AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind);
-  AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind, unsigned Align);
-  AttributeImpl(LLVMContext &C, StringRef Kind, StringRef Val = StringRef());
-  ~AttributeImpl();
-
-  LLVMContext &getContext() { return Context; }
-
-  bool isEnumAttribute() const;
-  bool isAlignAttribute() const;
-  bool isStringAttribute() const;
+  bool isEnumAttribute() const { return KindID == EnumAttrEntry; }
+  bool isAlignAttribute() const { return KindID == AlignAttrEntry; }
+  bool isStringAttribute() const { return KindID == StringAttrEntry; }
 
   bool hasAttribute(Attribute::AttrKind A) const;
   bool hasAttribute(StringRef Kind) const;
@@ -155,13 +88,66 @@ public:
 
 //===----------------------------------------------------------------------===//
 /// \class
+/// \brief A set of classes that contain the value of the
+/// attribute object. There are three main categories: enum attribute entries,
+/// represented by Attribute::AttrKind; alignment attribute entries; and string
+/// attribute enties, which are for target-dependent attributes.
+
+class EnumAttributeImpl : public AttributeImpl {
+  virtual void anchor();
+  Attribute::AttrKind Kind;
+
+protected:
+  EnumAttributeImpl(AttrEntryKind ID, Attribute::AttrKind Kind)
+      : AttributeImpl(ID), Kind(Kind) {}
+
+public:
+  EnumAttributeImpl(Attribute::AttrKind Kind)
+      : AttributeImpl(EnumAttrEntry), Kind(Kind) {}
+
+  Attribute::AttrKind getEnumKind() const { return Kind; }
+};
+
+class AlignAttributeImpl : public EnumAttributeImpl {
+  virtual void anchor();
+  unsigned Align;
+
+public:
+  AlignAttributeImpl(Attribute::AttrKind Kind, unsigned Align)
+      : EnumAttributeImpl(AlignAttrEntry, Kind), Align(Align) {
+    assert(
+        (Kind == Attribute::Alignment || Kind == Attribute::StackAlignment) &&
+        "Wrong kind for alignment attribute!");
+  }
+
+  unsigned getAlignment() const { return Align; }
+};
+
+class StringAttributeImpl : public AttributeImpl {
+  virtual void anchor();
+  std::string Kind;
+  std::string Val;
+
+public:
+  StringAttributeImpl(StringRef Kind, StringRef Val = StringRef())
+      : AttributeImpl(StringAttrEntry), Kind(Kind), Val(Val) {}
+
+  StringRef getStringKind() const { return Kind; }
+  StringRef getStringValue() const { return Val; }
+};
+
+//===----------------------------------------------------------------------===//
+/// \class
 /// \brief This class represents a group of attributes that apply to one
 /// element: function, return type, or parameter.
 class AttributeSetNode : public FoldingSetNode {
-  SmallVector<Attribute, 4> AttrList;
+  unsigned NumAttrs; ///< Number of attributes in this node.
 
-  AttributeSetNode(ArrayRef<Attribute> Attrs)
-    : AttrList(Attrs.begin(), Attrs.end()) {}
+  AttributeSetNode(ArrayRef<Attribute> Attrs) : NumAttrs(Attrs.size()) {
+    // There's memory after the node where we can store the entries in.
+    std::copy(Attrs.begin(), Attrs.end(),
+              reinterpret_cast<Attribute *>(this + 1));
+  }
 
   // AttributesSetNode is uniqued, these should not be publicly available.
   void operator=(const AttributeSetNode &) LLVM_DELETED_FUNCTION;
@@ -171,7 +157,7 @@ public:
 
   bool hasAttribute(Attribute::AttrKind Kind) const;
   bool hasAttribute(StringRef Kind) const;
-  bool hasAttributes() const { return !AttrList.empty(); }
+  bool hasAttributes() const { return NumAttrs != 0; }
 
   Attribute getAttribute(Attribute::AttrKind Kind) const;
   Attribute getAttribute(StringRef Kind) const;
@@ -180,17 +166,12 @@ public:
   unsigned getStackAlignment() const;
   std::string getAsString(bool InAttrGrp) const;
 
-  typedef SmallVectorImpl<Attribute>::iterator       iterator;
-  typedef SmallVectorImpl<Attribute>::const_iterator const_iterator;
-
-  iterator begin() { return AttrList.begin(); }
-  iterator end()   { return AttrList.end(); }
-
-  const_iterator begin() const { return AttrList.begin(); }
-  const_iterator end() const   { return AttrList.end(); }
+  typedef const Attribute *iterator;
+  iterator begin() const { return reinterpret_cast<iterator>(this + 1); }
+  iterator end() const { return begin() + NumAttrs; }
 
   void Profile(FoldingSetNodeID &ID) const {
-    Profile(ID, AttrList);
+    Profile(ID, makeArrayRef(begin(), end()));
   }
   static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
     for (unsigned I = 0, E = AttrList.size(); I != E; ++I)
@@ -208,58 +189,67 @@ class AttributeSetImpl : public FoldingSetNode {
   LLVMContext &Context;
 
   typedef std::pair<unsigned, AttributeSetNode*> IndexAttrPair;
-  SmallVector<IndexAttrPair, 4> AttrNodes;
+  unsigned NumAttrs; ///< Number of entries in this set.
+
+  /// \brief Return a pointer to the IndexAttrPair for the specified slot.
+  const IndexAttrPair *getNode(unsigned Slot) const {
+    return reinterpret_cast<const IndexAttrPair *>(this + 1) + Slot;
+  }
 
   // AttributesSet is uniqued, these should not be publicly available.
   void operator=(const AttributeSetImpl &) LLVM_DELETED_FUNCTION;
   AttributeSetImpl(const AttributeSetImpl &) LLVM_DELETED_FUNCTION;
 public:
   AttributeSetImpl(LLVMContext &C,
-                   ArrayRef<std::pair<unsigned, AttributeSetNode*> > attrs)
-    : Context(C), AttrNodes(attrs.begin(), attrs.end()) {}
+                   ArrayRef<std::pair<unsigned, AttributeSetNode *> > Attrs)
+      : Context(C), NumAttrs(Attrs.size()) {
+#ifndef NDEBUG
+    if (Attrs.size() >= 2) {
+      for (const std::pair<unsigned, AttributeSetNode *> *i = Attrs.begin() + 1,
+                                                         *e = Attrs.end();
+           i != e; ++i) {
+        assert((i-1)->first <= i->first && "Attribute set not ordered!");
+      }
+    }
+#endif
+    // There's memory after the node where we can store the entries in.
+    std::copy(Attrs.begin(), Attrs.end(),
+              reinterpret_cast<IndexAttrPair *>(this + 1));
+  }
 
   /// \brief Get the context that created this AttributeSetImpl.
   LLVMContext &getContext() { return Context; }
 
   /// \brief Return the number of attributes this AttributeSet contains.
-  unsigned getNumAttributes() const { return AttrNodes.size(); }
+  unsigned getNumAttributes() const { return NumAttrs; }
 
   /// \brief Get the index of the given "slot" in the AttrNodes list. This index
   /// is the index of the return, parameter, or function object that the
   /// attributes are applied to, not the index into the AttrNodes list where the
   /// attributes reside.
   unsigned getSlotIndex(unsigned Slot) const {
-    return AttrNodes[Slot].first;
+    return getNode(Slot)->first;
   }
 
   /// \brief Retrieve the attributes for the given "slot" in the AttrNode list.
   /// \p Slot is an index into the AttrNodes list, not the index of the return /
   /// parameter/ function which the attributes apply to.
   AttributeSet getSlotAttributes(unsigned Slot) const {
-    return AttributeSet::get(Context, AttrNodes[Slot]);
+    return AttributeSet::get(Context, *getNode(Slot));
   }
 
   /// \brief Retrieve the attribute set node for the given "slot" in the
   /// AttrNode list.
   AttributeSetNode *getSlotNode(unsigned Slot) const {
-    return AttrNodes[Slot].second;
+    return getNode(Slot)->second;
   }
 
-  typedef AttributeSetNode::iterator       iterator;
-  typedef AttributeSetNode::const_iterator const_iterator;
-
-  iterator begin(unsigned Slot)
-    { return AttrNodes[Slot].second->begin(); }
-  iterator end(unsigned Slot)
-    { return AttrNodes[Slot].second->end(); }
-
-  const_iterator begin(unsigned Slot) const
-    { return AttrNodes[Slot].second->begin(); }
-  const_iterator end(unsigned Slot) const
-    { return AttrNodes[Slot].second->end(); }
+  typedef AttributeSetNode::iterator iterator;
+  iterator begin(unsigned Slot) const { return getSlotNode(Slot)->begin(); }
+  iterator end(unsigned Slot) const { return getSlotNode(Slot)->end(); }
 
   void Profile(FoldingSetNodeID &ID) const {
-    Profile(ID, AttrNodes);
+    Profile(ID, makeArrayRef(getNode(0), getNumAttributes()));
   }
   static void Profile(FoldingSetNodeID &ID,
                       ArrayRef<std::pair<unsigned, AttributeSetNode*> > Nodes) {
@@ -271,6 +261,8 @@ public:
 
   // FIXME: This atrocity is temporary.
   uint64_t Raw(unsigned Index) const;
+
+  void dump() const;
 };
 
 } // end llvm namespace
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 4fe6f9d..0f2b7a0 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -43,9 +43,10 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
   if (!PA) {
     // If we didn't find any existing attributes of the same shape then create a
     // new one and insert it.
-    PA = !Val ?
-      new AttributeImpl(Context, Kind) :
-      new AttributeImpl(Context, Kind, Val);
+    if (!Val)
+      PA = new EnumAttributeImpl(Kind);
+    else
+      PA = new AlignAttributeImpl(Kind, Val);
     pImpl->AttrsSet.InsertNode(PA, InsertPoint);
   }
 
@@ -65,7 +66,7 @@ Attribute Attribute::get(LLVMContext &Context, StringRef Kind, StringRef Val) {
   if (!PA) {
     // If we didn't find any existing attributes of the same shape then create a
     // new one and insert it.
-    PA = new AttributeImpl(Context, Kind, Val);
+    PA = new StringAttributeImpl(Kind, Val);
     pImpl->AttrsSet.InsertNode(PA, InsertPoint);
   }
 
@@ -103,24 +104,28 @@ bool Attribute::isStringAttribute() const {
 }
 
 Attribute::AttrKind Attribute::getKindAsEnum() const {
+  if (!pImpl) return None;
   assert((isEnumAttribute() || isAlignAttribute()) &&
          "Invalid attribute type to get the kind as an enum!");
   return pImpl ? pImpl->getKindAsEnum() : None;
 }
 
 uint64_t Attribute::getValueAsInt() const {
+  if (!pImpl) return 0;
   assert(isAlignAttribute() &&
          "Expected the attribute to be an alignment attribute!");
   return pImpl ? pImpl->getValueAsInt() : 0;
 }
 
 StringRef Attribute::getKindAsString() const {
+  if (!pImpl) return StringRef();
   assert(isStringAttribute() &&
          "Invalid attribute type to get the kind as a string!");
   return pImpl ? pImpl->getKindAsString() : StringRef();
 }
 
 StringRef Attribute::getValueAsString() const {
+  if (!pImpl) return StringRef();
   assert(isStringAttribute() &&
          "Invalid attribute type to get the value as a string!");
   return pImpl ? pImpl->getValueAsString() : StringRef();
@@ -157,6 +162,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "sanitize_address";
   if (hasAttribute(Attribute::AlwaysInline))
     return "alwaysinline";
+  if (hasAttribute(Attribute::Builtin))
+    return "builtin";
   if (hasAttribute(Attribute::ByVal))
     return "byval";
   if (hasAttribute(Attribute::InlineHint))
@@ -189,6 +196,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "noreturn";
   if (hasAttribute(Attribute::NoUnwind))
     return "nounwind";
+  if (hasAttribute(Attribute::OptimizeNone))
+    return "optnone";
   if (hasAttribute(Attribute::OptimizeForSize))
     return "optsize";
   if (hasAttribute(Attribute::ReadNone))
@@ -217,6 +226,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
     return "uwtable";
   if (hasAttribute(Attribute::ZExt))
     return "zeroext";
+  if (hasAttribute(Attribute::Cold))
+    return "cold";
 
   // FIXME: These should be output like this:
   //
@@ -275,35 +286,11 @@ bool Attribute::operator<(Attribute A) const {
 // AttributeImpl Definition
 //===----------------------------------------------------------------------===//
 
-AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind)
-  : Context(C), Entry(new EnumAttributeEntry(Kind)) {}
-
-AttributeImpl::AttributeImpl(LLVMContext &C, Attribute::AttrKind Kind,
-                             unsigned Align)
-  : Context(C) {
-  assert((Kind == Attribute::Alignment || Kind == Attribute::StackAlignment) &&
-         "Wrong kind for alignment attribute!");
-  Entry = new AlignAttributeEntry(Kind, Align);
-}
-
-AttributeImpl::AttributeImpl(LLVMContext &C, StringRef Kind, StringRef Val)
-  : Context(C), Entry(new StringAttributeEntry(Kind, Val)) {}
-
-AttributeImpl::~AttributeImpl() {
-  delete Entry;
-}
-
-bool AttributeImpl::isEnumAttribute() const {
-  return isa<EnumAttributeEntry>(Entry);
-}
-
-bool AttributeImpl::isAlignAttribute() const {
-  return isa<AlignAttributeEntry>(Entry);
-}
-
-bool AttributeImpl::isStringAttribute() const {
-  return isa<StringAttributeEntry>(Entry);
-}
+// Pin the vtabels to this file.
+AttributeImpl::~AttributeImpl() {}
+void EnumAttributeImpl::anchor() {}
+void AlignAttributeImpl::anchor() {}
+void StringAttributeImpl::anchor() {}
 
 bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
   if (isStringAttribute()) return false;
@@ -316,21 +303,23 @@ bool AttributeImpl::hasAttribute(StringRef Kind) const {
 }
 
 Attribute::AttrKind AttributeImpl::getKindAsEnum() const {
-  if (EnumAttributeEntry *E = dyn_cast<EnumAttributeEntry>(Entry))
-    return E->getEnumKind();
-  return cast<AlignAttributeEntry>(Entry)->getEnumKind();
+  assert(isEnumAttribute() || isAlignAttribute());
+  return static_cast<const EnumAttributeImpl *>(this)->getEnumKind();
 }
 
 uint64_t AttributeImpl::getValueAsInt() const {
-  return cast<AlignAttributeEntry>(Entry)->getAlignment();
+  assert(isAlignAttribute());
+  return static_cast<const AlignAttributeImpl *>(this)->getAlignment();
 }
 
 StringRef AttributeImpl::getKindAsString() const {
-  return cast<StringAttributeEntry>(Entry)->getStringKind();
+  assert(isStringAttribute());
+  return static_cast<const StringAttributeImpl *>(this)->getStringKind();
 }
 
 StringRef AttributeImpl::getValueAsString() const {
-  return cast<StringAttributeEntry>(Entry)->getStringValue();
+  assert(isStringAttribute());
+  return static_cast<const StringAttributeImpl *>(this)->getStringValue();
 }
 
 bool AttributeImpl::operator<(const AttributeImpl &AI) const {
@@ -396,6 +385,9 @@ uint64_t AttributeImpl::getAttrMask(Attribute::AttrKind Val) {
   case Attribute::SanitizeMemory:  return 1ULL << 37;
   case Attribute::NoBuiltin:       return 1ULL << 38;
   case Attribute::Returned:        return 1ULL << 39;
+  case Attribute::Cold:            return 1ULL << 40;
+  case Attribute::Builtin:         return 1ULL << 41;
+  case Attribute::OptimizeNone:    return 1ULL << 42;
   }
   llvm_unreachable("Unsupported attribute type");
 }
@@ -427,7 +419,10 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
   // If we didn't find any existing attributes of the same shape then create a
   // new one and insert it.
   if (!PA) {
-    PA = new AttributeSetNode(SortedAttrs);
+    // Coallocate entries after the AttributeSetNode itself.
+    void *Mem = ::operator new(sizeof(AttributeSetNode) +
+                               sizeof(Attribute) * SortedAttrs.size());
+    PA = new (Mem) AttributeSetNode(SortedAttrs);
     pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint);
   }
 
@@ -436,48 +431,42 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
 }
 
 bool AttributeSetNode::hasAttribute(Attribute::AttrKind Kind) const {
-  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
-         E = AttrList.end(); I != E; ++I)
+  for (iterator I = begin(), E = end(); I != E; ++I)
     if (I->hasAttribute(Kind))
       return true;
   return false;
 }
 
 bool AttributeSetNode::hasAttribute(StringRef Kind) const {
-  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
-         E = AttrList.end(); I != E; ++I)
+  for (iterator I = begin(), E = end(); I != E; ++I)
     if (I->hasAttribute(Kind))
       return true;
   return false;
 }
 
 Attribute AttributeSetNode::getAttribute(Attribute::AttrKind Kind) const {
-  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
-         E = AttrList.end(); I != E; ++I)
+  for (iterator I = begin(), E = end(); I != E; ++I)
     if (I->hasAttribute(Kind))
       return *I;
   return Attribute();
 }
 
 Attribute AttributeSetNode::getAttribute(StringRef Kind) const {
-  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
-         E = AttrList.end(); I != E; ++I)
+  for (iterator I = begin(), E = end(); I != E; ++I)
     if (I->hasAttribute(Kind))
       return *I;
   return Attribute();
 }
 
 unsigned AttributeSetNode::getAlignment() const {
-  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
-         E = AttrList.end(); I != E; ++I)
+  for (iterator I = begin(), E = end(); I != E; ++I)
     if (I->hasAttribute(Attribute::Alignment))
       return I->getAlignment();
   return 0;
 }
 
 unsigned AttributeSetNode::getStackAlignment() const {
-  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
-         E = AttrList.end(); I != E; ++I)
+  for (iterator I = begin(), E = end(); I != E; ++I)
     if (I->hasAttribute(Attribute::StackAlignment))
       return I->getStackAlignment();
   return 0;
@@ -485,9 +474,8 @@ unsigned AttributeSetNode::getStackAlignment() const {
 
 std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
   std::string Str;
-  for (SmallVectorImpl<Attribute>::const_iterator I = AttrList.begin(),
-         E = AttrList.end(); I != E; ++I) {
-    if (I != AttrList.begin())
+  for (iterator I = begin(), E = end(); I != E; ++I) {
+    if (I != begin())
       Str += ' ';
     Str += I->getAsString(InAttrGrp);
   }
@@ -501,10 +489,10 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
 uint64_t AttributeSetImpl::Raw(unsigned Index) const {
   for (unsigned I = 0, E = getNumAttributes(); I != E; ++I) {
     if (getSlotIndex(I) != Index) continue;
-    const AttributeSetNode *ASN = AttrNodes[I].second;
+    const AttributeSetNode *ASN = getSlotNode(I);
     uint64_t Mask = 0;
 
-    for (AttributeSetNode::const_iterator II = ASN->begin(),
+    for (AttributeSetNode::iterator II = ASN->begin(),
            IE = ASN->end(); II != IE; ++II) {
       Attribute Attr = *II;
 
@@ -527,6 +515,10 @@ uint64_t AttributeSetImpl::Raw(unsigned Index) const {
   return 0;
 }
 
+void AttributeSetImpl::dump() const {
+  AttributeSet(const_cast<AttributeSetImpl *>(this)).dump();
+}
+
 //===----------------------------------------------------------------------===//
 // AttributeSet Construction and Mutation Methods
 //===----------------------------------------------------------------------===//
@@ -544,7 +536,11 @@ AttributeSet::getImpl(LLVMContext &C,
   // If we didn't find any existing attributes of the same shape then
   // create a new one and insert it.
   if (!PA) {
-    PA = new AttributeSetImpl(C, Attrs);
+    // Coallocate entries after the AttributeSetImpl itself.
+    void *Mem = ::operator new(sizeof(AttributeSetImpl) +
+                               sizeof(std::pair<unsigned, AttributeSetNode *>) *
+                                   Attrs.size());
+    PA = new (Mem) AttributeSetImpl(C, Attrs);
     pImpl->AttrsLists.InsertNode(PA, InsertPoint);
   }
 
@@ -636,12 +632,30 @@ AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
 
 AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
   if (Attrs.empty()) return AttributeSet();
+  if (Attrs.size() == 1) return Attrs[0];
 
   SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrNodeVec;
-  for (unsigned I = 0, E = Attrs.size(); I != E; ++I) {
-    AttributeSet AS = Attrs[I];
-    if (!AS.pImpl) continue;
-    AttrNodeVec.append(AS.pImpl->AttrNodes.begin(), AS.pImpl->AttrNodes.end());
+  AttributeSetImpl *A0 = Attrs[0].pImpl;
+  if (A0)
+    AttrNodeVec.append(A0->getNode(0), A0->getNode(A0->getNumAttributes()));
+  // Copy all attributes from Attrs into AttrNodeVec while keeping AttrNodeVec
+  // ordered by index.  Because we know that each list in Attrs is ordered by
+  // index we only need to merge each successive list in rather than doing a
+  // full sort.
+  for (unsigned I = 1, E = Attrs.size(); I != E; ++I) {
+    AttributeSetImpl *AS = Attrs[I].pImpl;
+    if (!AS) continue;
+    SmallVector<std::pair<unsigned, AttributeSetNode *>, 8>::iterator
+      ANVI = AttrNodeVec.begin(), ANVE;
+    for (const AttributeSetImpl::IndexAttrPair
+             *AI = AS->getNode(0),
+             *AE = AS->getNode(AS->getNumAttributes());
+         AI != AE; ++AI) {
+      ANVE = AttrNodeVec.end();
+      while (ANVI != ANVE && ANVI->first <= AI->first)
+        ++ANVI;
+      ANVI = AttrNodeVec.insert(ANVI, *AI) + 1;
+    }
   }
 
   return getImpl(C, AttrNodeVec);
@@ -660,6 +674,13 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
   return addAttributes(C, Index, AttributeSet::get(C, Index, B));
 }
 
+AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
+                                        StringRef Kind, StringRef Value) const {
+  llvm::AttrBuilder B;
+  B.addAttribute(Kind, Value);
+  return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+}
+
 AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
                                          AttributeSet Attrs) const {
   if (!pImpl) return Attrs;
@@ -694,7 +715,7 @@ AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
 
   for (unsigned I = 0, E = Attrs.pImpl->getNumAttributes(); I != E; ++I)
     if (Attrs.getSlotIndex(I) == Index) {
-      for (AttributeSetImpl::const_iterator II = Attrs.pImpl->begin(I),
+      for (AttributeSetImpl::iterator II = Attrs.pImpl->begin(I),
              IE = Attrs.pImpl->end(I); II != IE; ++II)
         B.addAttribute(*II);
       break;
@@ -815,7 +836,7 @@ bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr) const {
   if (pImpl == 0) return false;
 
   for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I)
-    for (AttributeSetImpl::const_iterator II = pImpl->begin(I),
+    for (AttributeSetImpl::iterator II = pImpl->begin(I),
            IE = pImpl->end(I); II != IE; ++II)
       if (II->hasAttribute(Attr))
         return true;
@@ -931,7 +952,7 @@ AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
   for (unsigned I = 0, E = pImpl->getNumAttributes(); I != E; ++I) {
     if (pImpl->getSlotIndex(I) != Index) continue;
 
-    for (AttributeSetImpl::const_iterator II = pImpl->begin(I),
+    for (AttributeSetImpl::iterator II = pImpl->begin(I),
            IE = pImpl->end(I); II != IE; ++II)
       addAttribute(*II);
 
@@ -1151,6 +1172,8 @@ AttributeSet AttributeFuncs::typeIncompatible(Type *Ty, uint64_t Index) {
       .addAttribute(Attribute::Nest)
       .addAttribute(Attribute::NoAlias)
       .addAttribute(Attribute::NoCapture)
+      .addAttribute(Attribute::ReadNone)
+      .addAttribute(Attribute::ReadOnly)
       .addAttribute(Attribute::StructRet);
 
   return AttributeSet::get(Ty->getContext(), Index, Incompatible);
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index f237537..d12bf7b 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -7,11 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the auto-upgrade helper functions 
+// This file implements the auto-upgrade helper functions
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/AutoUpgrade.h"
+#include "llvm/DebugInfo.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -55,14 +56,14 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   case 'a': {
     if (Name.startswith("arm.neon.vclz")) {
       Type* args[2] = {
-        F->arg_begin()->getType(), 
+        F->arg_begin()->getType(),
         Type::getInt1Ty(F->getContext())
       };
       // Can't use Intrinsic::getDeclaration here as it adds a ".i1" to
       // the end of the name. Change name from llvm.arm.neon.vclz.* to
       //  llvm.ctlz.*
       FunctionType* fType = FunctionType::get(F->getReturnType(), args, false);
-      NewFn = Function::Create(fType, F->getLinkage(), 
+      NewFn = Function::Create(fType, F->getLinkage(),
                                "llvm.ctlz." + Name.substr(14), F->getParent());
       return true;
     }
@@ -88,6 +89,20 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     }
     break;
   }
+  case 'o':
+    // We only need to change the name to match the mangling including the
+    // address space.
+    if (F->arg_size() == 2 && Name.startswith("objectsize.")) {
+      Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
+      if (F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
+        F->setName(Name + ".old");
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          Intrinsic::objectsize, Tys);
+        return true;
+      }
+    }
+    break;
+
   case 'x': {
     if (Name.startswith("x86.sse2.pcmpeq.") ||
         Name.startswith("x86.sse2.pcmpgt.") ||
@@ -97,6 +112,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name == "x86.avx.movnt.dq.256" ||
         Name == "x86.avx.movnt.pd.256" ||
         Name == "x86.avx.movnt.ps.256" ||
+        Name == "x86.sse42.crc32.64.8" ||
         (Name.startswith("x86.xop.vpcom") && F->arg_size() == 2)) {
       NewFn = 0;
       return true;
@@ -257,6 +273,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Function *VPCOM = Intrinsic::getDeclaration(F->getParent(), intID);
       Rep = Builder.CreateCall3(VPCOM, CI->getArgOperand(0),
                                 CI->getArgOperand(1), Builder.getInt8(Imm));
+    } else if (Name == "llvm.x86.sse42.crc32.64.8") {
+      Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
+                                               Intrinsic::x86_sse42_crc32_32_8);
+      Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C));
+      Rep = Builder.CreateCall2(CRC32, Trunc0, CI->getArgOperand(1));
+      Rep = Builder.CreateZExt(Rep, CI->getType(), "");
     } else {
       bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
       if (Name == "llvm.x86.avx.vpermil.pd.256")
@@ -317,6 +339,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     CI->eraseFromParent();
     return;
 
+  case Intrinsic::objectsize:
+    CI->replaceAllUsesWith(Builder.CreateCall2(NewFn,
+                                               CI->getArgOperand(0),
+                                               CI->getArgOperand(1),
+                                               Name));
+    CI->eraseFromParent();
+    return;
+
   case Intrinsic::arm_neon_vclz: {
     // Change name from llvm.arm.neon.vclz.* to llvm.ctlz.*
     CI->replaceAllUsesWith(Builder.CreateCall2(NewFn, CI->getArgOperand(0),
@@ -369,8 +399,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   }
 }
 
-// This tests each Function to determine if it needs upgrading. When we find 
-// one we are interested in, we then upgrade all calls to reflect the new 
+// This tests each Function to determine if it needs upgrading. When we find
+// one we are interested in, we then upgrade all calls to reflect the new
 // function.
 void llvm::UpgradeCallsToIntrinsic(Function* F) {
   assert(F && "Illegal attempt to upgrade a non-existent intrinsic.");
@@ -391,3 +421,81 @@ void llvm::UpgradeCallsToIntrinsic(Function* F) {
   }
 }
 
+void llvm::UpgradeInstWithTBAATag(Instruction *I) {
+  MDNode *MD = I->getMetadata(LLVMContext::MD_tbaa);
+  assert(MD && "UpgradeInstWithTBAATag should have a TBAA tag");
+  // Check if the tag uses struct-path aware TBAA format.
+  if (isa<MDNode>(MD->getOperand(0)) && MD->getNumOperands() >= 3)
+    return;
+
+  if (MD->getNumOperands() == 3) {
+    Value *Elts[] = {
+      MD->getOperand(0),
+      MD->getOperand(1)
+    };
+    MDNode *ScalarType = MDNode::get(I->getContext(), Elts);
+    // Create a MDNode <ScalarType, ScalarType, offset 0, const>
+    Value *Elts2[] = {
+      ScalarType, ScalarType,
+      Constant::getNullValue(Type::getInt64Ty(I->getContext())),
+      MD->getOperand(2)
+    };
+    I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts2));
+  } else {
+    // Create a MDNode <MD, MD, offset 0>
+    Value *Elts[] = {MD, MD,
+      Constant::getNullValue(Type::getInt64Ty(I->getContext()))};
+    I->setMetadata(LLVMContext::MD_tbaa, MDNode::get(I->getContext(), Elts));
+  }
+}
+
+Instruction *llvm::UpgradeBitCastInst(unsigned Opc, Value *V, Type *DestTy,
+                                      Instruction *&Temp) {
+  if (Opc != Instruction::BitCast)
+    return 0;
+
+  Temp = 0;
+  Type *SrcTy = V->getType();
+  if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
+      SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
+    LLVMContext &Context = V->getContext();
+
+    // We have no information about target data layout, so we assume that
+    // the maximum pointer size is 64bit.
+    Type *MidTy = Type::getInt64Ty(Context);
+    Temp = CastInst::Create(Instruction::PtrToInt, V, MidTy);
+
+    return CastInst::Create(Instruction::IntToPtr, Temp, DestTy);
+  }
+
+  return 0;
+}
+
+Value *llvm::UpgradeBitCastExpr(unsigned Opc, Constant *C, Type *DestTy) {
+  if (Opc != Instruction::BitCast)
+    return 0;
+
+  Type *SrcTy = C->getType();
+  if (SrcTy->isPtrOrPtrVectorTy() && DestTy->isPtrOrPtrVectorTy() &&
+      SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace()) {
+    LLVMContext &Context = C->getContext();
+
+    // We have no information about target data layout, so we assume that
+    // the maximum pointer size is 64bit.
+    Type *MidTy = Type::getInt64Ty(Context);
+
+    return ConstantExpr::getIntToPtr(ConstantExpr::getPtrToInt(C, MidTy),
+                                     DestTy);
+  }
+
+  return 0;
+}
+
+/// Check the debug info version number, if it is out-dated, drop the debug
+/// info. Return true if module is modified.
+bool llvm::UpgradeDebugInfo(Module &M) {
+  if (getDebugMetadataVersionFromModule(M) == DEBUG_METADATA_VERSION)
+    return false;
+
+  return StripDebugInfo(M);
+}
diff --git a/lib/IR/CMakeLists.txt b/lib/IR/CMakeLists.txt
index c2a4ee3..581946c 100644
--- a/lib/IR/CMakeLists.txt
+++ b/lib/IR/CMakeLists.txt
@@ -6,10 +6,10 @@ add_llvm_library(LLVMCore
   ConstantFold.cpp
   Constants.cpp
   Core.cpp
+  DIBuilder.cpp
   DataLayout.cpp
   DebugInfo.cpp
   DebugLoc.cpp
-  DIBuilder.cpp
   Dominators.cpp
   Function.cpp
   GCOV.cpp
@@ -23,6 +23,7 @@ add_llvm_library(LLVMCore
   LLVMContext.cpp
   LLVMContextImpl.cpp
   LeakDetector.cpp
+  LegacyPassManager.cpp
   Metadata.cpp
   Module.cpp
   Pass.cpp
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index bf93d4f..f5e225c 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -75,7 +75,7 @@ static unsigned
 foldConstantCastPair(
   unsigned opc,          ///< opcode of the second cast constant expression
   ConstantExpr *Op,      ///< the first cast constant expression
-  Type *DstTy      ///< desintation type of the first cast
+  Type *DstTy            ///< destination type of the first cast
 ) {
   assert(Op && Op->isCast() && "Can't fold cast of cast without a cast!");
   assert(DstTy && DstTy->isFirstClassType() && "Invalid cast destination type");
@@ -87,13 +87,14 @@ foldConstantCastPair(
   Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode());
   Instruction::CastOps secondOp = Instruction::CastOps(opc);
 
-  // Assume that pointers are never more than 64 bits wide.
+  // Assume that pointers are never more than 64 bits wide, and only use this
+  // for the middle type. Otherwise we could end up folding away illegal
+  // bitcasts between address spaces with different sizes.
   IntegerType *FakeIntPtrTy = Type::getInt64Ty(DstTy->getContext());
 
   // Let CastInst::isEliminableCastPair do the heavy lifting.
   return CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy,
-                                        FakeIntPtrTy, FakeIntPtrTy,
-                                        FakeIntPtrTy);
+                                        0, FakeIntPtrTy, 0);
 }
 
 static Constant *FoldBitCast(Constant *V, Type *DestTy) {
@@ -688,6 +689,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
   }
   case Instruction::BitCast:
     return FoldBitCast(V, DestTy);
+  case Instruction::AddrSpaceCast:
+    return 0;
   }
 }
 
@@ -1857,9 +1860,9 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
         if (CE1Inverse == CE1Op0) {
           // Check whether we can safely truncate the right hand side.
           Constant *C2Inverse = ConstantExpr::getTrunc(C2, CE1Op0->getType());
-          if (ConstantExpr::getZExt(C2Inverse, C2->getType()) == C2) {
+          if (ConstantExpr::getCast(CE1->getOpcode(), C2Inverse,
+                                    C2->getType()) == C2)
             return ConstantExpr::getICmp(pred, CE1Inverse, C2Inverse);
-          }
         }
       }
     }
@@ -1896,6 +1899,37 @@ static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
   return true;
 }
 
+/// \brief Test whether a given ConstantInt is in-range for a SequentialType.
+static bool isIndexInRangeOfSequentialType(const SequentialType *STy,
+                                           const ConstantInt *CI) {
+  if (const PointerType *PTy = dyn_cast<PointerType>(STy))
+    // Only handle pointers to sized types, not pointers to functions.
+    return PTy->getElementType()->isSized();
+
+  uint64_t NumElements = 0;
+  // Determine the number of elements in our sequential type.
+  if (const ArrayType *ATy = dyn_cast<ArrayType>(STy))
+    NumElements = ATy->getNumElements();
+  else if (const VectorType *VTy = dyn_cast<VectorType>(STy))
+    NumElements = VTy->getNumElements();
+
+  assert((isa<ArrayType>(STy) || NumElements > 0) &&
+         "didn't expect non-array type to have zero elements!");
+
+  // We cannot bounds check the index if it doesn't fit in an int64_t.
+  if (CI->getValue().getActiveBits() > 64)
+    return false;
+
+  // A negative index or an index past the end of our sequential type is
+  // considered out-of-range.
+  int64_t IndexVal = CI->getSExtValue();
+  if (IndexVal < 0 || (NumElements > 0 && (uint64_t)IndexVal >= NumElements))
+    return false;
+
+  // Otherwise, it is in-range.
+  return true;
+}
+
 template<typename IndexTy>
 static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
                                                bool inBounds,
@@ -1939,7 +1973,32 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
            I != E; ++I)
         LastTy = *I;
 
-      if ((LastTy && isa<SequentialType>(LastTy)) || Idx0->isNullValue()) {
+      // We cannot combine indices if doing so would take us outside of an
+      // array or vector.  Doing otherwise could trick us if we evaluated such a
+      // GEP as part of a load.
+      //
+      // e.g. Consider if the original GEP was:
+      // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
+      //                    i32 0, i32 0, i64 0)
+      //
+      // If we then tried to offset it by '8' to get to the third element,
+      // an i8, we should *not* get:
+      // i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
+      //                    i32 0, i32 0, i64 8)
+      //
+      // This GEP tries to index array element '8  which runs out-of-bounds.
+      // Subsequent evaluation would get confused and produce erroneous results.
+      //
+      // The following prohibits such a GEP from being formed by checking to see
+      // if the index is in-range with respect to an array or vector.
+      bool PerformFold = false;
+      if (Idx0->isNullValue())
+        PerformFold = true;
+      else if (SequentialType *STy = dyn_cast_or_null<SequentialType>(LastTy))
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx0))
+          PerformFold = isIndexInRangeOfSequentialType(STy, CI);
+
+      if (PerformFold) {
         SmallVector<Value*, 16> NewIndices;
         NewIndices.reserve(Idxs.size() + CE->getNumOperands());
         for (unsigned i = 1, e = CE->getNumOperands()-1; i != e; ++i)
@@ -1999,8 +2058,8 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
   }
 
   // Check to see if any array indices are not within the corresponding
-  // notional array bounds. If so, try to determine if they can be factored
-  // out into preceding dimensions.
+  // notional array or vector bounds. If so, try to determine if they can be
+  // factored out into preceding dimensions.
   bool Unknown = false;
   SmallVector<Constant *, 8> NewIdxs;
   Type *Ty = C->getType();
@@ -2008,16 +2067,20 @@ static Constant *ConstantFoldGetElementPtrImpl(Constant *C,
   for (unsigned i = 0, e = Idxs.size(); i != e;
        Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
     if (ConstantInt *CI = dyn_cast<ConstantInt>(Idxs[i])) {
-      if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
-        if (ATy->getNumElements() <= INT64_MAX &&
-            ATy->getNumElements() != 0 &&
-            CI->getSExtValue() >= (int64_t)ATy->getNumElements()) {
+      if (isa<ArrayType>(Ty) || isa<VectorType>(Ty))
+        if (CI->getSExtValue() > 0 &&
+            !isIndexInRangeOfSequentialType(cast<SequentialType>(Ty), CI)) {
           if (isa<SequentialType>(Prev)) {
             // It's out of range, but we can factor it into the prior
             // dimension.
             NewIdxs.resize(Idxs.size());
-            ConstantInt *Factor = ConstantInt::get(CI->getType(),
-                                                   ATy->getNumElements());
+            uint64_t NumElements = 0;
+            if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+              NumElements = ATy->getNumElements();
+            else
+              NumElements = cast<VectorType>(Ty)->getNumElements();
+
+            ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
             NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
 
             Constant *PrevIdx = cast<Constant>(Idxs[i-1]);
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 2c6971c..690ac59 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -483,8 +483,8 @@ ConstantInt *ConstantInt::get(LLVMContext &Context, const APInt &V) {
   // Get the corresponding integer type for the bit width of the value.
   IntegerType *ITy = IntegerType::get(Context, V.getBitWidth());
   // get an existing value or the insertion position
-  DenseMapAPIntKeyInfo::KeyTy Key(V, ITy);
-  ConstantInt *&Slot = Context.pImpl->IntConstants[Key]; 
+  LLVMContextImpl *pImpl = Context.pImpl;
+  ConstantInt *&Slot = pImpl->IntConstants[DenseMapAPIntKeyInfo::KeyTy(V, ITy)];
   if (!Slot) Slot = new ConstantInt(ITy, V);
   return Slot;
 }
@@ -608,11 +608,9 @@ Constant *ConstantFP::getZeroValueForNegation(Type *Ty) {
 
 // ConstantFP accessors.
 ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
-  DenseMapAPFloatKeyInfo::KeyTy Key(V);
-
   LLVMContextImpl* pImpl = Context.pImpl;
 
-  ConstantFP *&Slot = pImpl->FPConstants[Key];
+  ConstantFP *&Slot = pImpl->FPConstants[DenseMapAPFloatKeyInfo::KeyTy(V)];
 
   if (!Slot) {
     Type *Ty;
@@ -1128,6 +1126,7 @@ getWithOperands(ArrayRef<Constant*> Ops, Type *Ty) const {
   case Instruction::PtrToInt:
   case Instruction::IntToPtr:
   case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
     return ConstantExpr::getCast(getOpcode(), Ops[0], Ty);
   case Instruction::Select:
     return ConstantExpr::getSelect(Ops[0], Ops[1], Ops[2]);
@@ -1391,7 +1390,7 @@ void BlockAddress::replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U) {
   BasicBlock *NewBB = getBasicBlock();
 
   if (U == &Op<0>())
-    NewF = cast<Function>(To);
+    NewF = cast<Function>(To->stripPointerCasts());
   else
     NewBB = cast<BasicBlock>(To);
 
@@ -1463,6 +1462,7 @@ Constant *ConstantExpr::getCast(unsigned oc, Constant *C, Type *Ty) {
   case Instruction::PtrToInt: return getPtrToInt(C, Ty);
   case Instruction::IntToPtr: return getIntToPtr(C, Ty);
   case Instruction::BitCast:  return getBitCast(C, Ty);
+  case Instruction::AddrSpaceCast:  return getAddrSpaceCast(C, Ty);
   }
 }
 
@@ -1491,10 +1491,26 @@ Constant *ConstantExpr::getPointerCast(Constant *S, Type *Ty) {
 
   if (Ty->isIntOrIntVectorTy())
     return getPtrToInt(S, Ty);
+
+  unsigned SrcAS = S->getType()->getPointerAddressSpace();
+  if (Ty->isPtrOrPtrVectorTy() && SrcAS != Ty->getPointerAddressSpace())
+    return getAddrSpaceCast(S, Ty);
+
+  return getBitCast(S, Ty);
+}
+
+Constant *ConstantExpr::getPointerBitCastOrAddrSpaceCast(Constant *S,
+                                                         Type *Ty) {
+  assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+  assert(Ty->isPtrOrPtrVectorTy() && "Invalid cast");
+
+  if (S->getType()->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+    return getAddrSpaceCast(S, Ty);
+
   return getBitCast(S, Ty);
 }
 
-Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty, 
+Constant *ConstantExpr::getIntegerCast(Constant *C, Type *Ty,
                                        bool isSigned) {
   assert(C->getType()->isIntOrIntVectorTy() &&
          Ty->isIntOrIntVectorTy() && "Invalid cast");
@@ -1664,6 +1680,13 @@ Constant *ConstantExpr::getBitCast(Constant *C, Type *DstTy) {
   return getFoldedCast(Instruction::BitCast, C, DstTy);
 }
 
+Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy) {
+  assert(CastInst::castIsValid(Instruction::AddrSpaceCast, C, DstTy) &&
+         "Invalid constantexpr addrspacecast!");
+
+  return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy);
+}
+
 Constant *ConstantExpr::get(unsigned Opcode, Constant *C1, Constant *C2,
                             unsigned Flags) {
   // Check the operands for consistency first.
@@ -1956,14 +1979,22 @@ Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2,
 
 Constant *ConstantExpr::getInsertValue(Constant *Agg, Constant *Val,
                                        ArrayRef<unsigned> Idxs) {
+  assert(Agg->getType()->isFirstClassType() &&
+         "Non-first-class type for constant insertvalue expression");
+
   assert(ExtractValueInst::getIndexedType(Agg->getType(),
                                           Idxs) == Val->getType() &&
          "insertvalue indices invalid!");
-  assert(Agg->getType()->isFirstClassType() &&
-         "Non-first-class type for constant insertvalue expression");
-  Constant *FC = ConstantFoldInsertValueInstruction(Agg, Val, Idxs);
-  assert(FC && "insertvalue constant expr couldn't be folded!");
-  return FC;
+  Type *ReqTy = Val->getType();
+
+  if (Constant *FC = ConstantFoldInsertValueInstruction(Agg, Val, Idxs))
+    return FC;
+
+  Constant *ArgVec[] = { Agg, Val };
+  const ExprMapKeyType Key(Instruction::InsertValue, ArgVec, 0, 0, Idxs);
+
+  LLVMContextImpl *pImpl = Agg->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
 }
 
 Constant *ConstantExpr::getExtractValue(Constant *Agg,
@@ -1977,9 +2008,14 @@ Constant *ConstantExpr::getExtractValue(Constant *Agg,
 
   assert(Agg->getType()->isFirstClassType() &&
          "Non-first-class type for constant extractvalue expression");
-  Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs);
-  assert(FC && "ExtractValue constant expr couldn't be folded!");
-  return FC;
+  if (Constant *FC = ConstantFoldExtractValueInstruction(Agg, Idxs))
+    return FC;
+
+  Constant *ArgVec[] = { Agg };
+  const ExprMapKeyType Key(Instruction::ExtractValue, ArgVec, 0, 0, Idxs);
+
+  LLVMContextImpl *pImpl = Agg->getContext().pImpl;
+  return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
 }
 
 Constant *ConstantExpr::getNeg(Constant *C, bool HasNUW, bool HasNSW) {
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 889d574..c70f459 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -58,6 +58,10 @@ void LLVMShutdown() {
 
 /*===-- Error handling ----------------------------------------------------===*/
 
+char *LLVMCreateMessage(const char *Message) {
+  return strdup(Message);
+}
+
 void LLVMDisposeMessage(char *Message) {
   free(Message);
 }
@@ -93,7 +97,7 @@ LLVMModuleRef LLVMModuleCreateWithName(const char *ModuleID) {
   return wrap(new Module(ModuleID, getGlobalContext()));
 }
 
-LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID, 
+LLVMModuleRef LLVMModuleCreateWithNameInContext(const char *ModuleID,
                                                 LLVMContextRef C) {
   return wrap(new Module(ModuleID, *unwrap(C)));
 }
@@ -143,6 +147,16 @@ LLVMBool LLVMPrintModuleToFile(LLVMModuleRef M, const char *Filename,
   return false;
 }
 
+char *LLVMPrintModuleToString(LLVMModuleRef M) {
+  std::string buf;
+  raw_string_ostream os(buf);
+
+  unwrap(M)->print(os, NULL);
+  os.flush();
+
+  return strdup(buf.c_str());
+}
+
 /*--.. Operations on inline assembler ......................................--*/
 void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm) {
   unwrap(M)->setModuleInlineAsm(StringRef(Asm));
@@ -206,6 +220,20 @@ LLVMContextRef LLVMGetTypeContext(LLVMTypeRef Ty) {
   return wrap(&unwrap(Ty)->getContext());
 }
 
+void LLVMDumpType(LLVMTypeRef Ty) {
+  return unwrap(Ty)->dump();
+}
+
+char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
+  std::string buf;
+  raw_string_ostream os(buf);
+
+  unwrap(Ty)->print(os);
+  os.flush();
+
+  return strdup(buf.c_str());
+}
+
 /*--.. Operations on integer types .........................................--*/
 
 LLVMTypeRef LLVMInt1TypeInContext(LLVMContextRef C)  {
@@ -446,6 +474,16 @@ void LLVMDumpValue(LLVMValueRef Val) {
   unwrap(Val)->dump();
 }
 
+char* LLVMPrintValueToString(LLVMValueRef Val) {
+  std::string buf;
+  raw_string_ostream os(buf);
+
+  unwrap(Val)->print(os);
+  os.flush();
+
+  return strdup(buf.c_str());
+}
+
 void LLVMReplaceAllUsesWith(LLVMValueRef OldVal, LLVMValueRef NewVal) {
   unwrap(OldVal)->replaceAllUsesWith(unwrap(NewVal));
 }
@@ -677,7 +715,7 @@ LLVMValueRef LLVMConstStringInContext(LLVMContextRef C, const char *Str,
   return wrap(ConstantDataArray::getString(*unwrap(C), StringRef(Str, Length),
                                            DontNullTerminate == 0));
 }
-LLVMValueRef LLVMConstStructInContext(LLVMContextRef C, 
+LLVMValueRef LLVMConstStructInContext(LLVMContextRef C,
                                       LLVMValueRef *ConstantVals,
                                       unsigned Count, LLVMBool Packed) {
   Constant **Elements = unwrap<Constant>(ConstantVals, Count);
@@ -995,6 +1033,12 @@ LLVMValueRef LLVMConstBitCast(LLVMValueRef ConstantVal, LLVMTypeRef ToType) {
                                        unwrap(ToType)));
 }
 
+LLVMValueRef LLVMConstAddrSpaceCast(LLVMValueRef ConstantVal,
+                                    LLVMTypeRef ToType) {
+  return wrap(ConstantExpr::getAddrSpaceCast(unwrap<Constant>(ConstantVal),
+                                             unwrap(ToType)));
+}
+
 LLVMValueRef LLVMConstZExtOrBitCast(LLVMValueRef ConstantVal,
                                     LLVMTypeRef ToType) {
   return wrap(ConstantExpr::getZExtOrBitCast(unwrap<Constant>(ConstantVal),
@@ -1106,8 +1150,6 @@ LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
     return LLVMLinkOnceAnyLinkage;
   case GlobalValue::LinkOnceODRLinkage:
     return LLVMLinkOnceODRLinkage;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
-    return LLVMLinkOnceODRAutoHideLinkage;
   case GlobalValue::WeakAnyLinkage:
     return LLVMWeakAnyLinkage;
   case GlobalValue::WeakODRLinkage:
@@ -1152,7 +1194,8 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
     GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
     break;
   case LLVMLinkOnceODRAutoHideLinkage:
-    GV->setLinkage(GlobalValue::LinkOnceODRAutoHideLinkage);
+    DEBUG(errs() << "LLVMSetLinkage(): LLVMLinkOnceODRAutoHideLinkage is no "
+                    "longer supported.");
     break;
   case LLVMWeakAnyLinkage:
     GV->setLinkage(GlobalValue::WeakAnyLinkage);
@@ -1212,12 +1255,30 @@ void LLVMSetVisibility(LLVMValueRef Global, LLVMVisibility Viz) {
     ->setVisibility(static_cast<GlobalValue::VisibilityTypes>(Viz));
 }
 
-unsigned LLVMGetAlignment(LLVMValueRef Global) {
-  return unwrap<GlobalValue>(Global)->getAlignment();
+/*--.. Operations on global variables, load and store instructions .........--*/
+
+unsigned LLVMGetAlignment(LLVMValueRef V) {
+  Value *P = unwrap<Value>(V);
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
+    return GV->getAlignment();
+  if (LoadInst *LI = dyn_cast<LoadInst>(P))
+    return LI->getAlignment();
+  if (StoreInst *SI = dyn_cast<StoreInst>(P))
+    return SI->getAlignment();
+
+  llvm_unreachable("only GlobalValue, LoadInst and StoreInst have alignment");
 }
 
-void LLVMSetAlignment(LLVMValueRef Global, unsigned Bytes) {
-  unwrap<GlobalValue>(Global)->setAlignment(Bytes);
+void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
+  Value *P = unwrap<Value>(V);
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
+    GV->setAlignment(Bytes);
+  else if (LoadInst *LI = dyn_cast<LoadInst>(P))
+    LI->setAlignment(Bytes);
+  else if (StoreInst *SI = dyn_cast<StoreInst>(P))
+    SI->setAlignment(Bytes);
+  else
+    llvm_unreachable("only GlobalValue, LoadInst and StoreInst have alignment");
 }
 
 /*--.. Operations on global variables ......................................--*/
@@ -1549,7 +1610,7 @@ LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
   return (LLVMAttribute)A->getParent()->getAttributes().
     Raw(A->getArgNo()+1);
 }
-  
+
 
 void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
   Argument *A = unwrap<Argument>(Arg);
@@ -1741,7 +1802,7 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
   llvm_unreachable("LLVMSetInstructionCallConv applies only to call and invoke!");
 }
 
-void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index, 
+void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
                            LLVMAttribute PA) {
   CallSite Call = CallSite(unwrap<Instruction>(Instr));
   AttrBuilder B(PA);
@@ -1751,7 +1812,7 @@ void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
                                                          index, B)));
 }
 
-void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index, 
+void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
                               LLVMAttribute PA) {
   CallSite Call = CallSite(unwrap<Instruction>(Instr));
   AttrBuilder B(PA);
@@ -1761,7 +1822,7 @@ void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
                                                            index, B)));
 }
 
-void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index, 
+void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
                                 unsigned align) {
   CallSite Call = CallSite(unwrap<Instruction>(Instr));
   AttrBuilder B;
@@ -2115,8 +2176,8 @@ LLVMValueRef LLVMBuildMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
   Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
   Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
   AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
-  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), 
-                                               ITy, unwrap(Ty), AllocSize, 
+  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
+                                               ITy, unwrap(Ty), AllocSize,
                                                0, 0, "");
   return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
 }
@@ -2126,8 +2187,8 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
   Type* ITy = Type::getInt32Ty(unwrap(B)->GetInsertBlock()->getContext());
   Constant* AllocSize = ConstantExpr::getSizeOf(unwrap(Ty));
   AllocSize = ConstantExpr::getTruncOrBitCast(AllocSize, ITy);
-  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(), 
-                                               ITy, unwrap(Ty), AllocSize, 
+  Instruction* Malloc = CallInst::CreateMalloc(unwrap(B)->GetInsertBlock(),
+                                               ITy, unwrap(Ty), AllocSize,
                                                unwrap(Val), 0, "");
   return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
 }
@@ -2153,7 +2214,7 @@ LLVMValueRef LLVMBuildLoad(LLVMBuilderRef B, LLVMValueRef PointerVal,
   return wrap(unwrap(B)->CreateLoad(unwrap(PointerVal), Name));
 }
 
-LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val, 
+LLVMValueRef LLVMBuildStore(LLVMBuilderRef B, LLVMValueRef Val,
                             LLVMValueRef PointerVal) {
   return wrap(unwrap(B)->CreateStore(unwrap(Val), unwrap(PointerVal)));
 }
@@ -2263,6 +2324,11 @@ LLVMValueRef LLVMBuildBitCast(LLVMBuilderRef B, LLVMValueRef Val,
   return wrap(unwrap(B)->CreateBitCast(unwrap(Val), unwrap(DestTy), Name));
 }
 
+LLVMValueRef LLVMBuildAddrSpaceCast(LLVMBuilderRef B, LLVMValueRef Val,
+                                    LLVMTypeRef DestTy, const char *Name) {
+  return wrap(unwrap(B)->CreateAddrSpaceCast(unwrap(Val), unwrap(DestTy), Name));
+}
+
 LLVMValueRef LLVMBuildZExtOrBitCast(LLVMBuilderRef B, LLVMValueRef Val,
                                     LLVMTypeRef DestTy, const char *Name) {
   return wrap(unwrap(B)->CreateZExtOrBitCast(unwrap(Val), unwrap(DestTy),
@@ -2392,9 +2458,9 @@ LLVMValueRef LLVMBuildPtrDiff(LLVMBuilderRef B, LLVMValueRef LHS,
   return wrap(unwrap(B)->CreatePtrDiff(unwrap(LHS), unwrap(RHS), Name));
 }
 
-LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op, 
-                               LLVMValueRef PTR, LLVMValueRef Val, 
-                               LLVMAtomicOrdering ordering, 
+LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
+                               LLVMValueRef PTR, LLVMValueRef Val,
+                               LLVMAtomicOrdering ordering,
                                LLVMBool singleThread) {
   AtomicRMWInst::BinOp intop;
   switch (op) {
@@ -2417,14 +2483,14 @@ LLVMValueRef LLVMBuildAtomicRMW(LLVMBuilderRef B,LLVMAtomicRMWBinOp op,
     case LLVMAtomicOrderingMonotonic: intordering = Monotonic; break;
     case LLVMAtomicOrderingAcquire: intordering = Acquire; break;
     case LLVMAtomicOrderingRelease: intordering = Release; break;
-    case LLVMAtomicOrderingAcquireRelease: 
-      intordering = AcquireRelease; 
+    case LLVMAtomicOrderingAcquireRelease:
+      intordering = AcquireRelease;
       break;
-    case LLVMAtomicOrderingSequentiallyConsistent: 
-      intordering = SequentiallyConsistent; 
+    case LLVMAtomicOrderingSequentiallyConsistent:
+      intordering = SequentiallyConsistent;
       break;
   }
-  return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val), 
+  return wrap(unwrap(B)->CreateAtomicRMW(intop, unwrap(PTR), unwrap(Val),
     intordering, singleThread ? SingleThread : CrossThread));
 }
 
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index 0980e80..c4a9f41 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -30,17 +30,24 @@ static Constant *GetTagConstant(LLVMContext &VMContext, unsigned Tag) {
 }
 
 DIBuilder::DIBuilder(Module &m)
-  : M(m), VMContext(M.getContext()), TheCU(0), TempEnumTypes(0),
-    TempRetainTypes(0), TempSubprograms(0), TempGVs(0), DeclareFn(0),
-    ValueFn(0)
-{}
+    : M(m), VMContext(M.getContext()), TempEnumTypes(0), TempRetainTypes(0),
+      TempSubprograms(0), TempGVs(0), DeclareFn(0), ValueFn(0) {}
 
 /// finalize - Construct any deferred debug info descriptors.
 void DIBuilder::finalize() {
   DIArray Enums = getOrCreateArray(AllEnumTypes);
   DIType(TempEnumTypes).replaceAllUsesWith(Enums);
 
-  DIArray RetainTypes = getOrCreateArray(AllRetainTypes);
+  SmallVector<Value *, 16> RetainValues;
+  // Declarations and definitions of the same type may be retained. Some
+  // clients RAUW these pairs, leaving duplicates in the retained types
+  // list. Use a set to remove the duplicates while we transform the
+  // TrackingVHs back into Values.
+  SmallPtrSet<Value *, 16> RetainSet;
+  for (unsigned I = 0, E = AllRetainTypes.size(); I < E; I++)
+    if (RetainSet.insert(AllRetainTypes[I]))
+      RetainValues.push_back(AllRetainTypes[I]);
+  DIArray RetainTypes = getOrCreateArray(RetainValues);
   DIType(TempRetainTypes).replaceAllUsesWith(RetainTypes);
 
   DIArray SPs = getOrCreateArray(AllSubprograms);
@@ -79,17 +86,18 @@ static MDNode *createFilePathPair(LLVMContext &VMContext, StringRef Filename,
   assert(!Filename.empty() && "Unable to create file without name");
   Value *Pair[] = {
     MDString::get(VMContext, Filename),
-    MDString::get(VMContext, Directory),
+    MDString::get(VMContext, Directory)
   };
   return MDNode::get(VMContext, Pair);
 }
 
 /// createCompileUnit - A CompileUnit provides an anchor for all debugging
 /// information generated during this instance of compilation.
-void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
-                                  StringRef Directory, StringRef Producer,
-                                  bool isOptimized, StringRef Flags,
-                                  unsigned RunTimeVer, StringRef SplitName) {
+DICompileUnit DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
+                                           StringRef Directory,
+                                           StringRef Producer, bool isOptimized,
+                                           StringRef Flags, unsigned RunTimeVer,
+                                           StringRef SplitName) {
   assert(((Lang <= dwarf::DW_LANG_Python && Lang >= dwarf::DW_LANG_C89) ||
           (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
          "Invalid Language tag");
@@ -121,23 +129,70 @@ void DIBuilder::createCompileUnit(unsigned Lang, StringRef Filename,
     TempImportedModules,
     MDString::get(VMContext, SplitName)
   };
-  TheCU = DICompileUnit(MDNode::get(VMContext, Elts));
+
+  MDNode *CUNode = MDNode::get(VMContext, Elts);
 
   // Create a named metadata so that it is easier to find cu in a module.
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
-  NMD->addOperand(TheCU);
+  NMD->addOperand(CUNode);
+
+  return DICompileUnit(CUNode);
+}
+
+static DIImportedEntity
+createImportedModule(LLVMContext &C, DIScope Context, DIDescriptor NS,
+                     unsigned Line, StringRef Name,
+                     SmallVectorImpl<Value *> &AllImportedModules) {
+  const MDNode *R;
+  if (Name.empty()) {
+    Value *Elts[] = {
+      GetTagConstant(C, dwarf::DW_TAG_imported_module),
+      Context,
+      NS,
+      ConstantInt::get(Type::getInt32Ty(C), Line),
+    };
+    R = MDNode::get(C, Elts);
+  } else {
+    Value *Elts[] = {
+      GetTagConstant(C, dwarf::DW_TAG_imported_module),
+      Context,
+      NS,
+      ConstantInt::get(Type::getInt32Ty(C), Line),
+      MDString::get(C, Name)
+    };
+    R = MDNode::get(C, Elts);
+  }
+  DIImportedEntity M(R);
+  assert(M.Verify() && "Imported module should be valid");
+  AllImportedModules.push_back(M);
+  return M;
 }
 
-DIImportedModule DIBuilder::createImportedModule(DIScope Context,
-                                                 DINameSpace NS,
-                                                 unsigned Line) {
+DIImportedEntity DIBuilder::createImportedModule(DIScope Context,
+                                                 DINameSpace NS, unsigned Line,
+                                                 StringRef Name) {
+  return ::createImportedModule(VMContext, Context, NS, Line, Name,
+                                AllImportedModules);
+}
+
+DIImportedEntity DIBuilder::createImportedModule(DIScope Context,
+                                                 DIImportedEntity NS,
+                                                 unsigned Line,
+                                                 StringRef Name) {
+  return ::createImportedModule(VMContext, Context, NS, Line, Name,
+                                AllImportedModules);
+}
+
+DIImportedEntity DIBuilder::createImportedDeclaration(DIScope Context,
+                                                      DIDescriptor Decl,
+                                                      unsigned Line) {
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_imported_module),
+    GetTagConstant(VMContext, dwarf::DW_TAG_imported_declaration),
     Context,
-    NS,
+    Decl,
     ConstantInt::get(Type::getInt32Ty(VMContext), Line),
   };
-  DIImportedModule M(MDNode::get(VMContext, Elts));
+  DIImportedEntity M(MDNode::get(VMContext, Elts));
   assert(M.Verify() && "Imported module should be valid");
   AllImportedModules.push_back(M);
   return M;
@@ -154,7 +209,7 @@ DIFile DIBuilder::createFile(StringRef Filename, StringRef Directory) {
 }
 
 /// createEnumerator - Create a single enumerator value.
-DIEnumerator DIBuilder::createEnumerator(StringRef Name, uint64_t Val) {
+DIEnumerator DIBuilder::createEnumerator(StringRef Name, int64_t Val) {
   assert(!Name.empty() && "Unable to create enumerator without name");
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_enumerator),
@@ -164,15 +219,15 @@ DIEnumerator DIBuilder::createEnumerator(StringRef Name, uint64_t Val) {
   return DIEnumerator(MDNode::get(VMContext, Elts));
 }
 
-/// createNullPtrType - Create C++0x nullptr type.
-DIType DIBuilder::createNullPtrType(StringRef Name) {
+/// \brief Create a DWARF unspecified type.
+DIBasicType DIBuilder::createUnspecifiedType(StringRef Name) {
   assert(!Name.empty() && "Unable to create type without name");
-  // nullptr is encoded in DIBasicType format. Line number, filename,
-  // ,size, alignment, offset and flags are always empty here.
+  // Unspecified types are encoded in DIBasicType format. Line number, filename,
+  // size, alignment, offset and flags are always empty here.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_unspecified_type),
     NULL, // Filename
-    NULL, //TheCU,
+    NULL, // Unused
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
@@ -181,7 +236,12 @@ DIType DIBuilder::createNullPtrType(StringRef Name) {
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags;
     ConstantInt::get(Type::getInt32Ty(VMContext), 0)  // Encoding
   };
-  return DIType(MDNode::get(VMContext, Elts));
+  return DIBasicType(MDNode::get(VMContext, Elts));
+}
+
+/// \brief Create C++11 nullptr type.
+DIBasicType DIBuilder::createNullPtrType() {
+  return createUnspecifiedType("decltype(nullptr)");
 }
 
 /// createBasicType - Create debugging information entry for a basic
@@ -195,7 +255,7 @@ DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_base_type),
     NULL, // File/directory name
-    NULL, //TheCU,
+    NULL, // Unused
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
@@ -214,14 +274,14 @@ DIDerivedType DIBuilder::createQualifiedType(unsigned Tag, DIType FromTy) {
   Value *Elts[] = {
     GetTagConstant(VMContext, Tag),
     NULL, // Filename
-    NULL, //TheCU,
+    NULL, // Unused
     MDString::get(VMContext, StringRef()), // Empty name.
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    FromTy
+    FromTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -234,14 +294,14 @@ DIBuilder::createPointerType(DIType PointeeTy, uint64_t SizeInBits,
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_pointer_type),
     NULL, // Filename
-    NULL, //TheCU,
+    NULL, // Unused
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    PointeeTy
+    PointeeTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -252,15 +312,15 @@ DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy,
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_ptr_to_member_type),
     NULL, // Filename
-    NULL, //TheCU,
+    NULL, // Unused
     NULL,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    PointeeTy,
-    Base
+    PointeeTy.getRef(),
+    Base.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -268,7 +328,7 @@ DIDerivedType DIBuilder::createMemberPointerType(DIType PointeeTy,
 /// createReferenceType - Create debugging information entry for a reference
 /// type.
 DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
-  assert(RTy.Verify() && "Unable to create reference type");
+  assert(RTy.isType() && "Unable to create reference type");
   // References are encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, Tag),
@@ -280,7 +340,7 @@ DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    RTy
+    RTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
@@ -289,117 +349,120 @@ DIDerivedType DIBuilder::createReferenceType(unsigned Tag, DIType RTy) {
 DIDerivedType DIBuilder::createTypedef(DIType Ty, StringRef Name, DIFile File,
                                        unsigned LineNo, DIDescriptor Context) {
   // typedefs are encoded in DIDerivedType format.
-  assert(Ty.Verify() && "Invalid typedef type!");
+  assert(Ty.isType() && "Invalid typedef type!");
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_typedef),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    Ty
+    Ty.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createFriend - Create debugging information entry for a 'friend'.
-DIType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
+DIDerivedType DIBuilder::createFriend(DIType Ty, DIType FriendTy) {
   // typedefs are encoded in DIDerivedType format.
-  assert(Ty.Verify() && "Invalid type!");
-  assert(FriendTy.Verify() && "Invalid friend type!");
+  assert(Ty.isType() && "Invalid type!");
+  assert(FriendTy.isType() && "Invalid friend type!");
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_friend),
     NULL,
-    Ty,
+    Ty.getRef(),
     NULL, // Name
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
-    FriendTy
+    FriendTy.getRef()
   };
-  return DIType(MDNode::get(VMContext, Elts));
+  return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createInheritance - Create debugging information entry to establish
 /// inheritance relationship between two types.
-DIDerivedType DIBuilder::createInheritance(
-    DIType Ty, DIType BaseTy, uint64_t BaseOffset, unsigned Flags) {
-  assert(Ty.Verify() && "Unable to create inheritance");
+DIDerivedType DIBuilder::createInheritance(DIType Ty, DIType BaseTy,
+                                           uint64_t BaseOffset,
+                                           unsigned Flags) {
+  assert(Ty.isType() && "Unable to create inheritance");
   // TAG_inheritance is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_inheritance),
     NULL,
-    Ty,
+    Ty.getRef(),
     NULL, // Name
     ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
     ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
     ConstantInt::get(Type::getInt64Ty(VMContext), BaseOffset),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    BaseTy
+    BaseTy.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createMemberType - Create debugging information entry for a member.
-DIDerivedType DIBuilder::createMemberType(
-    DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
-    uint64_t SizeInBits, uint64_t AlignInBits, uint64_t OffsetInBits,
-    unsigned Flags, DIType Ty) {
+DIDerivedType DIBuilder::createMemberType(DIDescriptor Scope, StringRef Name,
+                                          DIFile File, unsigned LineNumber,
+                                          uint64_t SizeInBits,
+                                          uint64_t AlignInBits,
+                                          uint64_t OffsetInBits, unsigned Flags,
+                                          DIType Ty) {
   // TAG_member is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), OffsetInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty
+    Ty.getRef()
   };
   return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createStaticMemberType - Create debugging information entry for a
 /// C++ static data member.
-DIType DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
-                                         DIFile File, unsigned LineNumber,
-                                         DIType Ty, unsigned Flags,
-                                         llvm::Value *Val) {
+DIDerivedType
+DIBuilder::createStaticMemberType(DIDescriptor Scope, StringRef Name,
+                                  DIFile File, unsigned LineNumber,
+                                  DIType Ty, unsigned Flags,
+                                  llvm::Value *Val) {
   // TAG_member is encoded in DIDerivedType format.
   Flags |= DIDescriptor::FlagStaticMember;
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*SizeInBits*/),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*AlignInBits*/),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0/*OffsetInBits*/),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    Ty,
+    Ty.getRef(),
     Val
   };
-  return DIType(MDNode::get(VMContext, Elts));
+  return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createObjCIVar - Create debugging information entry for Objective-C
 /// instance variable.
-DIType DIBuilder::createObjCIVar(StringRef Name,
-                                 DIFile File, unsigned LineNumber,
-                                 uint64_t SizeInBits, uint64_t AlignInBits,
-                                 uint64_t OffsetInBits, unsigned Flags,
-                                 DIType Ty, StringRef PropertyName,
-                                 StringRef GetterName, StringRef SetterName,
-                                 unsigned PropertyAttributes) {
+DIDerivedType
+DIBuilder::createObjCIVar(StringRef Name, DIFile File, unsigned LineNumber,
+                          uint64_t SizeInBits, uint64_t AlignInBits,
+                          uint64_t OffsetInBits, unsigned Flags, DIType Ty,
+                          StringRef PropertyName, StringRef GetterName,
+                          StringRef SetterName, unsigned PropertyAttributes) {
   // TAG_member is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
@@ -417,16 +480,17 @@ DIType DIBuilder::createObjCIVar(StringRef Name,
     MDString::get(VMContext, SetterName),
     ConstantInt::get(Type::getInt32Ty(VMContext), PropertyAttributes)
   };
-  return DIType(MDNode::get(VMContext, Elts));
+  return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createObjCIVar - Create debugging information entry for Objective-C
 /// instance variable.
-DIType DIBuilder::createObjCIVar(StringRef Name,
-                                 DIFile File, unsigned LineNumber,
-                                 uint64_t SizeInBits, uint64_t AlignInBits,
-                                 uint64_t OffsetInBits, unsigned Flags,
-                                 DIType Ty, MDNode *PropertyNode) {
+DIDerivedType DIBuilder::createObjCIVar(StringRef Name, DIFile File,
+                                        unsigned LineNumber,
+                                        uint64_t SizeInBits,
+                                        uint64_t AlignInBits,
+                                        uint64_t OffsetInBits, unsigned Flags,
+                                        DIType Ty, MDNode *PropertyNode) {
   // TAG_member is encoded in DIDerivedType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_member),
@@ -441,17 +505,15 @@ DIType DIBuilder::createObjCIVar(StringRef Name,
     Ty,
     PropertyNode
   };
-  return DIType(MDNode::get(VMContext, Elts));
+  return DIDerivedType(MDNode::get(VMContext, Elts));
 }
 
 /// createObjCProperty - Create debugging information entry for Objective-C
 /// property.
-DIObjCProperty DIBuilder::createObjCProperty(StringRef Name,
-                                             DIFile File, unsigned LineNumber,
-                                             StringRef GetterName,
-                                             StringRef SetterName,
-                                             unsigned PropertyAttributes,
-                                             DIType Ty) {
+DIObjCProperty
+DIBuilder::createObjCProperty(StringRef Name, DIFile File, unsigned LineNumber,
+                              StringRef GetterName, StringRef SetterName,
+                              unsigned PropertyAttributes, DIType Ty) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_APPLE_property),
     MDString::get(VMContext, Name),
@@ -473,9 +535,9 @@ DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
                                        unsigned ColumnNo) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_template_type_parameter),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
-    Ty,
+    Ty.getRef(),
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
@@ -483,19 +545,18 @@ DIBuilder::createTemplateTypeParameter(DIDescriptor Context, StringRef Name,
   return DITemplateTypeParameter(MDNode::get(VMContext, Elts));
 }
 
-/// createTemplateValueParameter - Create debugging information for template
-/// value parameter.
 DITemplateValueParameter
-DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
-                                        DIType Ty, uint64_t Val,
-                                        MDNode *File, unsigned LineNo,
+DIBuilder::createTemplateValueParameter(unsigned Tag, DIDescriptor Context,
+                                        StringRef Name, DIType Ty,
+                                        Value *Val, MDNode *File,
+                                        unsigned LineNo,
                                         unsigned ColumnNo) {
   Value *Elts[] = {
-    GetTagConstant(VMContext, dwarf::DW_TAG_template_value_parameter),
-    getNonCompileUnitScope(Context),
+    GetTagConstant(VMContext, Tag),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
-    Ty,
-    ConstantInt::get(Type::getInt64Ty(VMContext), Val),
+    Ty.getRef(),
+    Val,
     File,
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNo),
     ConstantInt::get(Type::getInt32Ty(VMContext), ColumnNo)
@@ -503,6 +564,38 @@ DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
   return DITemplateValueParameter(MDNode::get(VMContext, Elts));
 }
 
+/// createTemplateValueParameter - Create debugging information for template
+/// value parameter.
+DITemplateValueParameter
+DIBuilder::createTemplateValueParameter(DIDescriptor Context, StringRef Name,
+                                        DIType Ty, Value *Val,
+                                        MDNode *File, unsigned LineNo,
+                                        unsigned ColumnNo) {
+  return createTemplateValueParameter(dwarf::DW_TAG_template_value_parameter,
+                                      Context, Name, Ty, Val, File, LineNo,
+                                      ColumnNo);
+}
+
+DITemplateValueParameter
+DIBuilder::createTemplateTemplateParameter(DIDescriptor Context, StringRef Name,
+                                           DIType Ty, StringRef Val,
+                                           MDNode *File, unsigned LineNo,
+                                           unsigned ColumnNo) {
+  return createTemplateValueParameter(
+      dwarf::DW_TAG_GNU_template_template_param, Context, Name, Ty,
+      MDString::get(VMContext, Val), File, LineNo, ColumnNo);
+}
+
+DITemplateValueParameter
+DIBuilder::createTemplateParameterPack(DIDescriptor Context, StringRef Name,
+                                       DIType Ty, DIArray Val,
+                                       MDNode *File, unsigned LineNo,
+                                       unsigned ColumnNo) {
+  return createTemplateValueParameter(dwarf::DW_TAG_GNU_template_parameter_pack,
+                                      Context, Name, Ty, Val, File, LineNo,
+                                      ColumnNo);
+}
+
 /// createClassType - Create debugging information entry for a class.
 DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
                                            DIFile File, unsigned LineNumber,
@@ -511,29 +604,34 @@ DICompositeType DIBuilder::createClassType(DIDescriptor Context, StringRef Name,
                                            uint64_t OffsetInBits,
                                            unsigned Flags, DIType DerivedFrom,
                                            DIArray Elements,
-                                           MDNode *VTableHolder,
-                                           MDNode *TemplateParams) {
-  assert((!Context || Context.Verify()) &&
+                                           DIType VTableHolder,
+                                           MDNode *TemplateParams,
+                                           StringRef UniqueIdentifier) {
+  assert((!Context || Context.isScope() || Context.isType()) &&
          "createClassType should be called with a valid Context");
   // TAG_class_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_class_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), OffsetInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    DerivedFrom,
+    DerivedFrom.getRef(),
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    VTableHolder,
-    TemplateParams
+    VTableHolder.getRef(),
+    TemplateParams,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
   DICompositeType R(MDNode::get(VMContext, Elts));
-  assert(R.Verify() && "createClassType should return a verifiable DIType");
+  assert(R.isCompositeType() &&
+         "createClassType should return a DICompositeType");
+  if (!UniqueIdentifier.empty())
+    retainType(R);
   return R;
 }
 
@@ -546,26 +644,31 @@ DICompositeType DIBuilder::createStructType(DIDescriptor Context,
                                             unsigned Flags, DIType DerivedFrom,
                                             DIArray Elements,
                                             unsigned RunTimeLang,
-                                            MDNode *VTableHolder) {
+                                            DIType VTableHolder,
+                                            StringRef UniqueIdentifier) {
  // TAG_structure_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_structure_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
-    DerivedFrom,
+    DerivedFrom.getRef(),
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
-    VTableHolder,
+    VTableHolder.getRef(),
     NULL,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
   DICompositeType R(MDNode::get(VMContext, Elts));
-  assert(R.Verify() && "createStructType should return a verifiable DIType");
+  assert(R.isCompositeType() &&
+         "createStructType should return a DICompositeType");
+  if (!UniqueIdentifier.empty())
+    retainType(R);
   return R;
 }
 
@@ -575,45 +678,52 @@ DICompositeType DIBuilder::createUnionType(DIDescriptor Scope, StringRef Name,
                                            uint64_t SizeInBits,
                                            uint64_t AlignInBits, unsigned Flags,
                                            DIArray Elements,
-                                           unsigned RunTimeLang) {
+                                           unsigned RunTimeLang,
+                                           StringRef UniqueIdentifier) {
   // TAG_union_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_union_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
     NULL,
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    NULL
+    NULL,
+    NULL,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
-  return DICompositeType(MDNode::get(VMContext, Elts));
+  DICompositeType R(MDNode::get(VMContext, Elts));
+  if (!UniqueIdentifier.empty())
+    retainType(R);
+  return R;
 }
 
 /// createSubroutineType - Create subroutine type.
-DICompositeType
-DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
+DICompositeType DIBuilder::createSubroutineType(DIFile File,
+                                                DIArray ParameterTypes) {
   // TAG_subroutine_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_subroutine_type),
     Constant::getNullValue(Type::getInt32Ty(VMContext)),
-    Constant::getNullValue(Type::getInt32Ty(VMContext)),
+    NULL,
     MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt64Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Size
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Align
+    ConstantInt::get(Type::getInt64Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
     NULL,
     ParameterTypes,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    NULL  // Type Identifer
   };
   return DICompositeType(MDNode::get(VMContext, Elts));
 }
@@ -623,26 +733,30 @@ DIBuilder::createSubroutineType(DIFile File, DIArray ParameterTypes) {
 DICompositeType DIBuilder::createEnumerationType(
     DIDescriptor Scope, StringRef Name, DIFile File, unsigned LineNumber,
     uint64_t SizeInBits, uint64_t AlignInBits, DIArray Elements,
-    DIType UnderlyingType) {
+    DIType UnderlyingType, StringRef UniqueIdentifier) {
   // TAG_enumeration_type is encoded in DICompositeType format.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
     File.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), LineNumber),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    UnderlyingType,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    UnderlyingType.getRef(),
     Elements,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
-  MDNode *Node = MDNode::get(VMContext, Elts);
-  AllEnumTypes.push_back(Node);
-  return DICompositeType(Node);
+  DICompositeType CTy(MDNode::get(VMContext, Elts));
+  AllEnumTypes.push_back(CTy);
+  if (!UniqueIdentifier.empty())
+    retainType(CTy);
+  return CTy;
 }
 
 /// createArrayType - Create debugging information entry for an array.
@@ -652,42 +766,45 @@ DICompositeType DIBuilder::createArrayType(uint64_t Size, uint64_t AlignInBits,
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
     NULL, // Filename/Directory,
-    NULL, //TheCU,
+    NULL, // Unused
     MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), Size),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Ty,
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Flags
+    Ty.getRef(),
     Subscripts,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    NULL  // Type Identifer
   };
   return DICompositeType(MDNode::get(VMContext, Elts));
 }
 
 /// createVectorType - Create debugging information entry for a vector.
-DIType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
-                                   DIType Ty, DIArray Subscripts) {
-
+DICompositeType DIBuilder::createVectorType(uint64_t Size, uint64_t AlignInBits,
+                                            DIType Ty, DIArray Subscripts) {
   // A vector is an array type with the FlagVector flag applied.
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_array_type),
     NULL, // Filename/Directory,
-    NULL, //TheCU,
+    NULL, // Unused
     MDString::get(VMContext, ""),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Line
     ConstantInt::get(Type::getInt64Ty(VMContext), Size),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
     ConstantInt::get(Type::getInt32Ty(VMContext), DIType::FlagVector),
-    Ty,
+    Ty.getRef(),
     Subscripts,
     ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    Constant::getNullValue(Type::getInt32Ty(VMContext))
+    NULL,
+    NULL,
+    NULL  // Type Identifer
   };
-  return DIType(MDNode::get(VMContext, Elts));
+  return DICompositeType(MDNode::get(VMContext, Elts));
 }
 
 /// createArtificialType - Create a new DIType with "artificial" flag set.
@@ -698,17 +815,14 @@ DIType DIBuilder::createArtificialType(DIType Ty) {
   SmallVector<Value *, 9> Elts;
   MDNode *N = Ty;
   assert (N && "Unexpected input DIType!");
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    if (Value *V = N->getOperand(i))
-      Elts.push_back(V);
-    else
-      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Elts.push_back(N->getOperand(i));
 
   unsigned CurFlags = Ty.getFlags();
   CurFlags = CurFlags | DIType::FlagArtificial;
 
   // Flags are stored at this slot.
+  // FIXME: Add an enum for this magic value.
   Elts[8] =  ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
 
   return DIType(MDNode::get(VMContext, Elts));
@@ -723,17 +837,14 @@ DIType DIBuilder::createObjectPointerType(DIType Ty) {
   SmallVector<Value *, 9> Elts;
   MDNode *N = Ty;
   assert (N && "Unexpected input DIType!");
-  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
-    if (Value *V = N->getOperand(i))
-      Elts.push_back(V);
-    else
-      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
-  }
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+    Elts.push_back(N->getOperand(i));
 
   unsigned CurFlags = Ty.getFlags();
   CurFlags = CurFlags | (DIType::FlagObjectPointer | DIType::FlagArtificial);
 
   // Flags are stored at this slot.
+  // FIXME: Add an enum for this magic value.
   Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
 
   return DIType(MDNode::get(VMContext, Elts));
@@ -742,7 +853,7 @@ DIType DIBuilder::createObjectPointerType(DIType Ty) {
 /// retainType - Retain DIType in a module even if it is not referenced
 /// through debug info anchors.
 void DIBuilder::retainType(DIType T) {
-  AllRetainTypes.push_back(T);
+  AllRetainTypes.push_back(TrackingVH<MDNode>(T));
 }
 
 /// createUnspecifiedParameter - Create unspeicified type descriptor
@@ -756,31 +867,36 @@ DIDescriptor DIBuilder::createUnspecifiedParameter() {
 
 /// createForwardDecl - Create a temporary forward-declared type that
 /// can be RAUW'd if the full type is seen.
-DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name,
-                                    DIDescriptor Scope, DIFile F,
-                                    unsigned Line, unsigned RuntimeLang,
-                                    uint64_t SizeInBits,
-                                    uint64_t AlignInBits) {
+DICompositeType
+DIBuilder::createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
+                             DIFile F, unsigned Line, unsigned RuntimeLang,
+                             uint64_t SizeInBits, uint64_t AlignInBits,
+                             StringRef UniqueIdentifier) {
   // Create a temporary MDNode.
   Value *Elts[] = {
     GetTagConstant(VMContext, Tag),
     F.getFileNode(),
-    getNonCompileUnitScope(Scope),
+    DIScope(getNonCompileUnitScope(Scope)).getRef(),
     MDString::get(VMContext, Name),
     ConstantInt::get(Type::getInt32Ty(VMContext), Line),
     ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
     ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
-    ConstantInt::get(Type::getInt32Ty(VMContext), 0),
-    ConstantInt::get(Type::getInt32Ty(VMContext),
-                     DIDescriptor::FlagFwdDecl),
+    ConstantInt::get(Type::getInt32Ty(VMContext), 0), // Offset
+    ConstantInt::get(Type::getInt32Ty(VMContext), DIDescriptor::FlagFwdDecl),
     NULL,
     DIArray(),
-    ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang)
+    ConstantInt::get(Type::getInt32Ty(VMContext), RuntimeLang),
+    NULL,
+    NULL, //TemplateParams
+    UniqueIdentifier.empty() ? NULL : MDString::get(VMContext, UniqueIdentifier)
   };
   MDNode *Node = MDNode::getTemporary(VMContext, Elts);
-  assert(DIType(Node).Verify() &&
-         "createForwardDecl result should be verifiable");
-  return DIType(Node);
+  DICompositeType RetTy(Node);
+  assert(RetTy.isCompositeType() &&
+         "createForwardDecl result should be a DIType");
+  if (!UniqueIdentifier.empty())
+    retainType(RetTy);
+  return RetTy;
 }
 
 /// getOrCreateArray - Get a DIArray, create one if required.
@@ -805,10 +921,11 @@ DISubrange DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
 }
 
 /// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::
-createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile F,
-                     unsigned LineNumber, DIType Ty, bool isLocalToUnit,
-                     Value *Val) {
+DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name,
+                                                 StringRef LinkageName,
+                                                 DIFile F, unsigned LineNumber,
+                                                 DIType Ty, bool isLocalToUnit,
+                                                 Value *Val) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_variable),
     Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -830,19 +947,22 @@ createGlobalVariable(StringRef Name, StringRef LinkageName, DIFile F,
 }
 
 /// \brief Create a new descriptor for the specified global.
-DIGlobalVariable DIBuilder::
-createGlobalVariable(StringRef Name, DIFile F, unsigned LineNumber,
-                     DIType Ty, bool isLocalToUnit, Value *Val) {
+DIGlobalVariable DIBuilder::createGlobalVariable(StringRef Name, DIFile F,
+                                                 unsigned LineNumber, DIType Ty,
+                                                 bool isLocalToUnit,
+                                                 Value *Val) {
   return createGlobalVariable(Name, Name, F, LineNumber, Ty, isLocalToUnit,
                               Val);
 }
 
 /// createStaticVariable - Create a new descriptor for the specified static
 /// variable.
-DIGlobalVariable DIBuilder::
-createStaticVariable(DIDescriptor Context, StringRef Name,
-                     StringRef LinkageName, DIFile F, unsigned LineNumber,
-                     DIType Ty, bool isLocalToUnit, Value *Val, MDNode *Decl) {
+DIGlobalVariable DIBuilder::createStaticVariable(DIDescriptor Context,
+                                                 StringRef Name,
+                                                 StringRef LinkageName,
+                                                 DIFile F, unsigned LineNumber,
+                                                 DIType Ty, bool isLocalToUnit,
+                                                 Value *Val, MDNode *Decl) {
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_variable),
     Constant::getNullValue(Type::getInt32Ty(VMContext)),
@@ -870,9 +990,9 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
                                           bool AlwaysPreserve, unsigned Flags,
                                           unsigned ArgNo) {
   DIDescriptor Context(getNonCompileUnitScope(Scope));
-  assert((!Context || Context.Verify()) &&
+  assert((!Context || Context.isScope()) &&
          "createLocalVariable should be called with a valid Context");
-  assert(Ty.Verify() &&
+  assert(Ty.isType() &&
          "createLocalVariable should be called with a valid type");
   Value *Elts[] = {
     GetTagConstant(VMContext, Tag),
@@ -893,9 +1013,10 @@ DIVariable DIBuilder::createLocalVariable(unsigned Tag, DIDescriptor Scope,
     NamedMDNode *FnLocals = getOrInsertFnSpecificMDNode(M, Fn);
     FnLocals->addOperand(Node);
   }
-  assert(DIVariable(Node).Verify() &&
-         "createLocalVariable should return a verifiable DIVariable");
-  return DIVariable(Node);
+  DIVariable RetVar(Node);
+  assert(RetVar.isVariable() &&
+         "createLocalVariable should return a valid DIVariable");
+  return RetVar;
 }
 
 /// createComplexVariable - Create a new descriptor for the specified variable
@@ -921,22 +1042,38 @@ DIVariable DIBuilder::createComplexVariable(unsigned Tag, DIDescriptor Scope,
 }
 
 /// createFunction - Create a new descriptor for the specified function.
-DISubprogram DIBuilder::createFunction(DIDescriptor Context,
-                                       StringRef Name,
-                                       StringRef LinkageName,
-                                       DIFile File, unsigned LineNo,
-                                       DIType Ty,
+/// FIXME: this is added for dragonegg. Once we update dragonegg
+/// to call resolve function, this will be removed.
+DISubprogram DIBuilder::createFunction(DIScopeRef Context, StringRef Name,
+                                       StringRef LinkageName, DIFile File,
+                                       unsigned LineNo, DICompositeType Ty,
+                                       bool isLocalToUnit, bool isDefinition,
+                                       unsigned ScopeLine, unsigned Flags,
+                                       bool isOptimized, Function *Fn,
+                                       MDNode *TParams, MDNode *Decl) {
+  // dragonegg does not generate identifier for types, so using an empty map
+  // to resolve the context should be fine.
+  DITypeIdentifierMap EmptyMap;
+  return createFunction(Context.resolve(EmptyMap), Name, LinkageName, File,
+                        LineNo, Ty, isLocalToUnit, isDefinition, ScopeLine,
+                        Flags, isOptimized, Fn, TParams, Decl);
+}
+
+/// createFunction - Create a new descriptor for the specified function.
+DISubprogram DIBuilder::createFunction(DIDescriptor Context, StringRef Name,
+                                       StringRef LinkageName, DIFile File,
+                                       unsigned LineNo, DICompositeType Ty,
                                        bool isLocalToUnit, bool isDefinition,
-                                       unsigned ScopeLine,
-                                       unsigned Flags, bool isOptimized,
-                                       Function *Fn,
-                                       MDNode *TParams,
-                                       MDNode *Decl) {
+                                       unsigned ScopeLine, unsigned Flags,
+                                       bool isOptimized, Function *Fn,
+                                       MDNode *TParams, MDNode *Decl) {
+  assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
+         "function types should be subroutines");
   Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
     File.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(getNonCompileUnitScope(Context)).getRef(),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, LinkageName),
@@ -961,29 +1098,29 @@ DISubprogram DIBuilder::createFunction(DIDescriptor Context,
   if (isDefinition)
     AllSubprograms.push_back(Node);
   DISubprogram S(Node);
-  assert(S.Verify() && "createFunction should return a valid DISubprogram");
+  assert(S.isSubprogram() && "createFunction should return a valid DISubprogram");
   return S;
 }
 
 /// createMethod - Create a new descriptor for the specified C++ method.
-DISubprogram DIBuilder::createMethod(DIDescriptor Context,
-                                     StringRef Name,
-                                     StringRef LinkageName,
-                                     DIFile F,
-                                     unsigned LineNo, DIType Ty,
-                                     bool isLocalToUnit,
-                                     bool isDefinition,
+DISubprogram DIBuilder::createMethod(DIDescriptor Context, StringRef Name,
+                                     StringRef LinkageName, DIFile F,
+                                     unsigned LineNo, DICompositeType Ty,
+                                     bool isLocalToUnit, bool isDefinition,
                                      unsigned VK, unsigned VIndex,
-                                     MDNode *VTableHolder,
-                                     unsigned Flags,
-                                     bool isOptimized,
-                                     Function *Fn,
+                                     DIType VTableHolder, unsigned Flags,
+                                     bool isOptimized, Function *Fn,
                                      MDNode *TParam) {
+  assert(Ty.getTag() == dwarf::DW_TAG_subroutine_type &&
+         "function types should be subroutines");
+  assert(getNonCompileUnitScope(Context) &&
+         "Methods should have both a Context and a context that isn't "
+         "the compile unit.");
   Value *TElts[] = { GetTagConstant(VMContext, DW_TAG_base_type) };
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_subprogram),
     F.getFileNode(),
-    getNonCompileUnitScope(Context),
+    DIScope(Context).getRef(),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, Name),
     MDString::get(VMContext, LinkageName),
@@ -991,9 +1128,9 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
     Ty,
     ConstantInt::get(Type::getInt1Ty(VMContext), isLocalToUnit),
     ConstantInt::get(Type::getInt1Ty(VMContext), isDefinition),
-    ConstantInt::get(Type::getInt32Ty(VMContext), (unsigned)VK),
+    ConstantInt::get(Type::getInt32Ty(VMContext), VK),
     ConstantInt::get(Type::getInt32Ty(VMContext), VIndex),
-    VTableHolder,
+    VTableHolder.getRef(),
     ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
     ConstantInt::get(Type::getInt1Ty(VMContext), isOptimized),
     Fn,
@@ -1007,7 +1144,7 @@ DISubprogram DIBuilder::createMethod(DIDescriptor Context,
   if (isDefinition)
     AllSubprograms.push_back(Node);
   DISubprogram S(Node);
-  assert(S.Verify() && "createMethod should return a valid DISubprogram");
+  assert(S.isSubprogram() && "createMethod should return a valid DISubprogram");
   return S;
 }
 
@@ -1046,7 +1183,7 @@ DILexicalBlockFile DIBuilder::createLexicalBlockFile(DIDescriptor Scope,
 
 DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
                                              unsigned Line, unsigned Col) {
-  // Defeat MDNode uniqing for lexical blocks by using unique id.
+  // Defeat MDNode uniquing for lexical blocks by using unique id.
   static unsigned int unique_id = 0;
   Value *Elts[] = {
     GetTagConstant(VMContext, dwarf::DW_TAG_lexical_block),
@@ -1066,7 +1203,8 @@ DILexicalBlock DIBuilder::createLexicalBlock(DIDescriptor Scope, DIFile File,
 Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
                                       Instruction *InsertBefore) {
   assert(Storage && "no storage passed to dbg.declare");
-  assert(VarInfo.Verify() && "empty DIVariable passed to dbg.declare");
+  assert(VarInfo.isVariable() &&
+         "empty or invalid DIVariable passed to dbg.declare");
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
@@ -1078,7 +1216,8 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
 Instruction *DIBuilder::insertDeclare(Value *Storage, DIVariable VarInfo,
                                       BasicBlock *InsertAtEnd) {
   assert(Storage && "no storage passed to dbg.declare");
-  assert(VarInfo.Verify() && "invalid DIVariable passed to dbg.declare");
+  assert(VarInfo.isVariable() &&
+         "empty or invalid DIVariable passed to dbg.declare");
   if (!DeclareFn)
     DeclareFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare);
 
@@ -1097,7 +1236,8 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
                                                 DIVariable VarInfo,
                                                 Instruction *InsertBefore) {
   assert(V && "no value passed to dbg.value");
-  assert(VarInfo.Verify() && "invalid DIVariable passed to dbg.value");
+  assert(VarInfo.isVariable() &&
+         "empty or invalid DIVariable passed to dbg.value");
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
@@ -1112,7 +1252,8 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V, uint64_t Offset,
                                                 DIVariable VarInfo,
                                                 BasicBlock *InsertAtEnd) {
   assert(V && "no value passed to dbg.value");
-  assert(VarInfo.Verify() && "invalid DIVariable passed to dbg.value");
+  assert(VarInfo.isVariable() &&
+         "empty or invalid DIVariable passed to dbg.value");
   if (!ValueFn)
     ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value);
 
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index 5658f56..6bdc09e 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -200,9 +200,7 @@ static unsigned inBytes(unsigned Bits) {
 }
 
 void DataLayout::parseSpecifier(StringRef Desc) {
-
   while (!Desc.empty()) {
-
     // Split at '-'.
     std::pair<StringRef, StringRef> Split = split(Desc, '-');
     Desc = Split.second;
@@ -482,7 +480,7 @@ std::string DataLayout::getStringRepresentation() const {
     addrSpaces.push_back(pib->first);
   }
   std::sort(addrSpaces.begin(), addrSpaces.end());
-  for (SmallVector<unsigned, 8>::iterator asb = addrSpaces.begin(),
+  for (SmallVectorImpl<unsigned>::iterator asb = addrSpaces.begin(),
       ase = addrSpaces.end(); asb != ase; ++asb) {
     const PointerAlignElem &PI = Pointers.find(*asb)->second;
     OS << "-p";
@@ -509,6 +507,15 @@ std::string DataLayout::getStringRepresentation() const {
   return OS.str();
 }
 
+unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const {
+  assert(Ty->isPtrOrPtrVectorTy() &&
+         "This should only be called with a pointer or pointer vector type");
+
+  if (Ty->isPointerTy())
+    return getTypeSizeInBits(Ty);
+
+  return getTypeSizeInBits(Ty->getScalarType());
+}
 
 /*!
   \param abi_or_pref Flag that determines which alignment is returned. true
@@ -582,7 +589,6 @@ unsigned DataLayout::getABIIntegerTypeAlignment(unsigned BitWidth) const {
   return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, 0);
 }
 
-
 unsigned DataLayout::getCallFrameTypeAlignment(Type *Ty) const {
   for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
     if (Alignments[i].AlignType == STACK_ALIGN)
@@ -601,16 +607,11 @@ unsigned DataLayout::getPreferredTypeAlignmentShift(Type *Ty) const {
   return Log2_32(Align);
 }
 
-/// getIntPtrType - Return an integer type with size at least as big as that
-/// of a pointer in the given address space.
 IntegerType *DataLayout::getIntPtrType(LLVMContext &C,
                                        unsigned AddressSpace) const {
   return IntegerType::get(C, getPointerSizeInBits(AddressSpace));
 }
 
-/// getIntPtrType - Return an integer (vector of integer) type with size at
-/// least as big as that of a pointer of the given pointer (vector of pointer)
-/// type.
 Type *DataLayout::getIntPtrType(Type *Ty) const {
   assert(Ty->isPtrOrPtrVectorTy() &&
          "Expected a pointer or pointer vector type.");
@@ -628,6 +629,13 @@ Type *DataLayout::getSmallestLegalIntType(LLVMContext &C, unsigned Width) const
   return 0;
 }
 
+unsigned DataLayout::getLargestLegalIntTypeSize() const {
+  unsigned MaxWidth = 0;
+  for (unsigned i = 0, e = (unsigned)LegalIntWidths.size(); i != e; ++i)
+    MaxWidth = std::max<unsigned>(MaxWidth, LegalIntWidths[i]);
+  return MaxWidth;
+}
+
 uint64_t DataLayout::getIndexedOffset(Type *ptrTy,
                                       ArrayRef<Value *> Indices) const {
   Type *Ty = ptrTy;
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index ec83dca..70a756f 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -34,24 +34,6 @@ using namespace llvm::dwarf;
 // DIDescriptor
 //===----------------------------------------------------------------------===//
 
-DIDescriptor::DIDescriptor(const DIFile F) : DbgNode(F.DbgNode) {
-}
-
-DIDescriptor::DIDescriptor(const DISubprogram F) : DbgNode(F.DbgNode) {
-}
-
-DIDescriptor::DIDescriptor(const DILexicalBlockFile F) : DbgNode(F.DbgNode) {
-}
-
-DIDescriptor::DIDescriptor(const DILexicalBlock F) : DbgNode(F.DbgNode) {
-}
-
-DIDescriptor::DIDescriptor(const DIVariable F) : DbgNode(F.DbgNode) {
-}
-
-DIDescriptor::DIDescriptor(const DIType F) : DbgNode(F.DbgNode) {
-}
-
 bool DIDescriptor::Verify() const {
   return DbgNode &&
          (DIDerivedType(DbgNode).Verify() ||
@@ -65,7 +47,7 @@ bool DIDescriptor::Verify() const {
           DIObjCProperty(DbgNode).Verify() ||
           DITemplateTypeParameter(DbgNode).Verify() ||
           DITemplateValueParameter(DbgNode).Verify() ||
-          DIImportedModule(DbgNode).Verify());
+          DIImportedEntity(DbgNode).Verify());
 }
 
 static Value *getField(const MDNode *DbgNode, unsigned Elt) {
@@ -74,10 +56,8 @@ static Value *getField(const MDNode *DbgNode, unsigned Elt) {
   return DbgNode->getOperand(Elt);
 }
 
-static const MDNode *getNodeField(const MDNode *DbgNode, unsigned Elt) {
-  if (const MDNode *R = dyn_cast_or_null<MDNode>(getField(DbgNode, Elt)))
-    return R;
-  return 0;
+static MDNode *getNodeField(const MDNode *DbgNode, unsigned Elt) {
+  return dyn_cast_or_null<MDNode>(getField(DbgNode, Elt));
 }
 
 static StringRef getStringField(const MDNode *DbgNode, unsigned Elt) {
@@ -95,8 +75,8 @@ uint64_t DIDescriptor::getUInt64Field(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-    if (ConstantInt *CI
-        = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+    if (ConstantInt *CI =
+            dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
       return CI->getZExtValue();
 
   return 0;
@@ -107,21 +87,16 @@ int64_t DIDescriptor::getInt64Field(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-    if (ConstantInt *CI
-        = dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
+    if (ConstantInt *CI =
+            dyn_cast_or_null<ConstantInt>(DbgNode->getOperand(Elt)))
       return CI->getSExtValue();
 
   return 0;
 }
 
 DIDescriptor DIDescriptor::getDescriptorField(unsigned Elt) const {
-  if (DbgNode == 0)
-    return DIDescriptor();
-
-  if (Elt < DbgNode->getNumOperands())
-    return
-      DIDescriptor(dyn_cast_or_null<const MDNode>(DbgNode->getOperand(Elt)));
-  return DIDescriptor();
+  MDNode *Field = getNodeField(DbgNode, Elt);
+  return DIDescriptor(Field);
 }
 
 GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
@@ -129,7 +104,7 @@ GlobalVariable *DIDescriptor::getGlobalVariableField(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-      return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
+    return dyn_cast_or_null<GlobalVariable>(DbgNode->getOperand(Elt));
   return 0;
 }
 
@@ -138,7 +113,7 @@ Constant *DIDescriptor::getConstantField(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-      return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
+    return dyn_cast_or_null<Constant>(DbgNode->getOperand(Elt));
   return 0;
 }
 
@@ -147,7 +122,7 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const {
     return 0;
 
   if (Elt < DbgNode->getNumOperands())
-      return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
+    return dyn_cast_or_null<Function>(DbgNode->getOperand(Elt));
   return 0;
 }
 
@@ -156,19 +131,17 @@ void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
     return;
 
   if (Elt < DbgNode->getNumOperands()) {
-    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    MDNode *Node = const_cast<MDNode *>(DbgNode);
     Node->replaceOperandWith(Elt, F);
   }
 }
 
 unsigned DIVariable::getNumAddrElements() const {
-  return DbgNode->getNumOperands()-8;
+  return DbgNode->getNumOperands() - 8;
 }
 
 /// getInlinedAt - If this variable is inlined then return inline location.
-MDNode *DIVariable::getInlinedAt() const {
-  return dyn_cast_or_null<MDNode>(DbgNode->getOperand(7));
-}
+MDNode *DIVariable::getInlinedAt() const { return getNodeField(DbgNode, 7); }
 
 //===----------------------------------------------------------------------===//
 // Predicates
@@ -177,7 +150,8 @@ MDNode *DIVariable::getInlinedAt() const {
 /// isBasicType - Return true if the specified tag is legal for
 /// DIBasicType.
 bool DIDescriptor::isBasicType() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_base_type:
   case dwarf::DW_TAG_unspecified_type:
@@ -189,7 +163,8 @@ bool DIDescriptor::isBasicType() const {
 
 /// isDerivedType - Return true if the specified tag is legal for DIDerivedType.
 bool DIDescriptor::isDerivedType() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_typedef:
   case dwarf::DW_TAG_pointer_type:
@@ -212,7 +187,8 @@ bool DIDescriptor::isDerivedType() const {
 /// isCompositeType - Return true if the specified tag is legal for
 /// DICompositeType.
 bool DIDescriptor::isCompositeType() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_array_type:
   case dwarf::DW_TAG_structure_type:
@@ -228,7 +204,8 @@ bool DIDescriptor::isCompositeType() const {
 
 /// isVariable - Return true if the specified tag is legal for DIVariable.
 bool DIDescriptor::isVariable() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_auto_variable:
   case dwarf::DW_TAG_arg_variable:
@@ -256,11 +233,6 @@ bool DIDescriptor::isGlobalVariable() const {
                      getTag() == dwarf::DW_TAG_constant);
 }
 
-/// isGlobal - Return true if the specified tag is legal for DIGlobal.
-bool DIDescriptor::isGlobal() const {
-  return isGlobalVariable();
-}
-
 /// isUnspecifiedParmeter - Return true if the specified tag is
 /// DW_TAG_unspecified_parameters.
 bool DIDescriptor::isUnspecifiedParameter() const {
@@ -270,17 +242,19 @@ bool DIDescriptor::isUnspecifiedParameter() const {
 /// isScope - Return true if the specified tag is one of the scope
 /// related tag.
 bool DIDescriptor::isScope() const {
-  if (!DbgNode) return false;
+  if (!DbgNode)
+    return false;
   switch (getTag()) {
   case dwarf::DW_TAG_compile_unit:
   case dwarf::DW_TAG_lexical_block:
   case dwarf::DW_TAG_subprogram:
   case dwarf::DW_TAG_namespace:
+  case dwarf::DW_TAG_file_type:
     return true;
   default:
     break;
   }
-  return false;
+  return isType();
 }
 
 /// isTemplateTypeParameter - Return true if the specified tag is
@@ -292,7 +266,9 @@ bool DIDescriptor::isTemplateTypeParameter() const {
 /// isTemplateValueParameter - Return true if the specified tag is
 /// DW_TAG_template_value_parameter.
 bool DIDescriptor::isTemplateValueParameter() const {
-  return DbgNode && getTag() == dwarf::DW_TAG_template_value_parameter;
+  return DbgNode && (getTag() == dwarf::DW_TAG_template_value_parameter ||
+                     getTag() == dwarf::DW_TAG_GNU_template_template_param ||
+                     getTag() == dwarf::DW_TAG_GNU_template_parameter_pack);
 }
 
 /// isCompileUnit - Return true if the specified tag is DW_TAG_compile_unit.
@@ -314,13 +290,13 @@ bool DIDescriptor::isNameSpace() const {
 /// lexical block with an extra file.
 bool DIDescriptor::isLexicalBlockFile() const {
   return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
-    (DbgNode->getNumOperands() == 3);
+         (DbgNode->getNumOperands() == 3);
 }
 
 /// isLexicalBlock - Return true if the specified tag is DW_TAG_lexical_block.
 bool DIDescriptor::isLexicalBlock() const {
   return DbgNode && getTag() == dwarf::DW_TAG_lexical_block &&
-    (DbgNode->getNumOperands() > 3);
+         (DbgNode->getNumOperands() > 3);
 }
 
 /// isSubrange - Return true if the specified tag is DW_TAG_subrange_type.
@@ -338,33 +314,28 @@ bool DIDescriptor::isObjCProperty() const {
   return DbgNode && getTag() == dwarf::DW_TAG_APPLE_property;
 }
 
-/// \brief Return true if the specified tag is DW_TAG_imported_module.
-bool DIDescriptor::isImportedModule() const {
-  return DbgNode && getTag() == dwarf::DW_TAG_imported_module;
+/// \brief Return true if the specified tag is DW_TAG_imported_module or
+/// DW_TAG_imported_declaration.
+bool DIDescriptor::isImportedEntity() const {
+  return DbgNode && (getTag() == dwarf::DW_TAG_imported_module ||
+                     getTag() == dwarf::DW_TAG_imported_declaration);
 }
 
 //===----------------------------------------------------------------------===//
 // Simple Descriptor Constructors and other Methods
 //===----------------------------------------------------------------------===//
 
-DIType::DIType(const MDNode *N) : DIScope(N) {
-  if (!N) return;
-  if (!isBasicType() && !isDerivedType() && !isCompositeType()) {
-    DbgNode = 0;
-  }
-}
-
 unsigned DIArray::getNumElements() const {
   if (!DbgNode)
     return 0;
   return DbgNode->getNumOperands();
 }
 
-/// replaceAllUsesWith - Replace all uses of debug info referenced by
-/// this descriptor.
+/// replaceAllUsesWith - Replace all uses of the MDNode used by this
+/// type with the one in the passed descriptor.
 void DIType::replaceAllUsesWith(DIDescriptor &D) {
-  if (!DbgNode)
-    return;
+
+  assert(DbgNode && "Trying to replace an unverified type!");
 
   // Since we use a TrackingVH for the node, its easy for clients to manufacture
   // legitimate situations where they want to replaceAllUsesWith() on something
@@ -372,19 +343,19 @@ void DIType::replaceAllUsesWith(DIDescriptor &D) {
   // this detail by allowing a value to be replaced with replaceAllUsesWith()
   // itself.
   if (DbgNode != D) {
-    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    MDNode *Node = const_cast<MDNode *>(DbgNode);
     const MDNode *DN = D;
     const Value *V = cast_or_null<Value>(DN);
-    Node->replaceAllUsesWith(const_cast<Value*>(V));
+    Node->replaceAllUsesWith(const_cast<Value *>(V));
     MDNode::deleteTemporary(Node);
   }
 }
 
-/// replaceAllUsesWith - Replace all uses of debug info referenced by
-/// this descriptor.
+/// replaceAllUsesWith - Replace all uses of the MDNode used by this
+/// type with the one in D.
 void DIType::replaceAllUsesWith(MDNode *D) {
-  if (!DbgNode)
-    return;
+
+  assert(DbgNode && "Trying to replace an unverified type!");
 
   // Since we use a TrackingVH for the node, its easy for clients to manufacture
   // legitimate situations where they want to replaceAllUsesWith() on something
@@ -392,39 +363,24 @@ void DIType::replaceAllUsesWith(MDNode *D) {
   // this detail by allowing a value to be replaced with replaceAllUsesWith()
   // itself.
   if (DbgNode != D) {
-    MDNode *Node = const_cast<MDNode*>(DbgNode);
+    MDNode *Node = const_cast<MDNode *>(DbgNode);
     const MDNode *DN = D;
     const Value *V = cast_or_null<Value>(DN);
-    Node->replaceAllUsesWith(const_cast<Value*>(V));
+    Node->replaceAllUsesWith(const_cast<Value *>(V));
     MDNode::deleteTemporary(Node);
   }
 }
 
-/// isUnsignedDIType - Return true if type encoding is unsigned.
-bool DIType::isUnsignedDIType() {
-  DIDerivedType DTy(DbgNode);
-  if (DTy.Verify())
-    return DTy.getTypeDerivedFrom().isUnsignedDIType();
-
-  DIBasicType BTy(DbgNode);
-  if (BTy.Verify()) {
-    unsigned Encoding = BTy.getEncoding();
-    if (Encoding == dwarf::DW_ATE_unsigned ||
-        Encoding == dwarf::DW_ATE_unsigned_char ||
-        Encoding == dwarf::DW_ATE_boolean)
-      return true;
-  }
-  return false;
-}
-
 /// Verify - Verify that a compile unit is well formed.
 bool DICompileUnit::Verify() const {
   if (!isCompileUnit())
     return false;
-  StringRef N = getFilename();
-  if (N.empty())
+
+  // Don't bother verifying the compilation directory or producer string
+  // as those could be empty.
+  if (getFilename().empty())
     return false;
-  // It is possible that directory and produce string is empty.
+
   return DbgNode->getNumOperands() == 13;
 }
 
@@ -433,31 +389,85 @@ bool DIObjCProperty::Verify() const {
   if (!isObjCProperty())
     return false;
 
-  DIType Ty = getType();
-  if (!Ty.Verify()) return false;
-
   // Don't worry about the rest of the strings for now.
   return DbgNode->getNumOperands() == 8;
 }
 
+/// Check if a field at position Elt of a MDNode is a MDNode.
+/// We currently allow an empty string and an integer.
+/// But we don't allow a non-empty string in a MDNode field.
+static bool fieldIsMDNode(const MDNode *DbgNode, unsigned Elt) {
+  // FIXME: This function should return true, if the field is null or the field
+  // is indeed a MDNode: return !Fld || isa<MDNode>(Fld).
+  Value *Fld = getField(DbgNode, Elt);
+  if (Fld && isa<MDString>(Fld) && !cast<MDString>(Fld)->getString().empty())
+    return false;
+  return true;
+}
+
+/// Check if a field at position Elt of a MDNode is a MDString.
+static bool fieldIsMDString(const MDNode *DbgNode, unsigned Elt) {
+  Value *Fld = getField(DbgNode, Elt);
+  return !Fld || isa<MDString>(Fld);
+}
+
+/// Check if a value can be a reference to a type.
+static bool isTypeRef(const Value *Val) {
+  return !Val ||
+         (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
+         (isa<MDNode>(Val) && DIType(cast<MDNode>(Val)).isType());
+}
+
+/// Check if a field at position Elt of a MDNode can be a reference to a type.
+static bool fieldIsTypeRef(const MDNode *DbgNode, unsigned Elt) {
+  Value *Fld = getField(DbgNode, Elt);
+  return isTypeRef(Fld);
+}
+
+/// Check if a value can be a ScopeRef.
+static bool isScopeRef(const Value *Val) {
+  return !Val ||
+         (isa<MDString>(Val) && !cast<MDString>(Val)->getString().empty()) ||
+         (isa<MDNode>(Val) && DIScope(cast<MDNode>(Val)).isScope());
+}
+
+/// Check if a field at position Elt of a MDNode can be a ScopeRef.
+static bool fieldIsScopeRef(const MDNode *DbgNode, unsigned Elt) {
+  Value *Fld = getField(DbgNode, Elt);
+  return isScopeRef(Fld);
+}
+
 /// Verify - Verify that a type descriptor is well formed.
 bool DIType::Verify() const {
   if (!isType())
     return false;
-  if (getContext() && !getContext().Verify())
+  // Make sure Context @ field 2 is MDNode.
+  if (!fieldIsScopeRef(DbgNode, 2))
     return false;
-  unsigned Tag = getTag();
+
+  // FIXME: Sink this into the various subclass verifies.
+  uint16_t Tag = getTag();
   if (!isBasicType() && Tag != dwarf::DW_TAG_const_type &&
       Tag != dwarf::DW_TAG_volatile_type && Tag != dwarf::DW_TAG_pointer_type &&
       Tag != dwarf::DW_TAG_ptr_to_member_type &&
       Tag != dwarf::DW_TAG_reference_type &&
       Tag != dwarf::DW_TAG_rvalue_reference_type &&
-      Tag != dwarf::DW_TAG_restrict_type &&
-      Tag != dwarf::DW_TAG_array_type &&
+      Tag != dwarf::DW_TAG_restrict_type && Tag != dwarf::DW_TAG_array_type &&
       Tag != dwarf::DW_TAG_enumeration_type &&
       Tag != dwarf::DW_TAG_subroutine_type &&
+      Tag != dwarf::DW_TAG_inheritance && Tag != dwarf::DW_TAG_friend &&
       getFilename().empty())
     return false;
+  // DIType is abstract, it should be a BasicType, a DerivedType or
+  // a CompositeType.
+  if (isBasicType())
+    DIBasicType(DbgNode).Verify();
+  else if (isCompositeType())
+    DICompositeType(DbgNode).Verify();
+  else if (isDerivedType())
+    DIDerivedType(DbgNode).Verify();
+  else
+    return false;
   return true;
 }
 
@@ -468,6 +478,14 @@ bool DIBasicType::Verify() const {
 
 /// Verify - Verify that a derived type descriptor is well formed.
 bool DIDerivedType::Verify() const {
+  // Make sure DerivedFrom @ field 9 is TypeRef.
+  if (!fieldIsTypeRef(DbgNode, 9))
+    return false;
+  if (getTag() == dwarf::DW_TAG_ptr_to_member_type)
+    // Make sure ClassType @ field 10 is a TypeRef.
+    if (!fieldIsTypeRef(DbgNode, 10))
+      return false;
+
   return isDerivedType() && DbgNode->getNumOperands() >= 10 &&
          DbgNode->getNumOperands() <= 14;
 }
@@ -476,10 +494,18 @@ bool DIDerivedType::Verify() const {
 bool DICompositeType::Verify() const {
   if (!isCompositeType())
     return false;
-  if (getContext() && !getContext().Verify())
+
+  // Make sure DerivedFrom @ field 9 and ContainingType @ field 12 are TypeRef.
+  if (!fieldIsTypeRef(DbgNode, 9))
+    return false;
+  if (!fieldIsTypeRef(DbgNode, 12))
+    return false;
+
+  // Make sure the type identifier at field 14 is MDString, it can be null.
+  if (!fieldIsMDString(DbgNode, 14))
     return false;
 
-  return DbgNode->getNumOperands() >= 10 && DbgNode->getNumOperands() <= 14;
+  return DbgNode->getNumOperands() == 15;
 }
 
 /// Verify - Verify that a subprogram descriptor is well formed.
@@ -487,11 +513,13 @@ bool DISubprogram::Verify() const {
   if (!isSubprogram())
     return false;
 
-  if (getContext() && !getContext().Verify())
+  // Make sure context @ field 2 is a ScopeRef and type @ field 7 is a MDNode.
+  if (!fieldIsScopeRef(DbgNode, 2))
     return false;
-
-  DICompositeType Ty = getType();
-  if (!Ty.Verify())
+  if (!fieldIsMDNode(DbgNode, 7))
+    return false;
+  // Containing type @ field 12.
+  if (!fieldIsTypeRef(DbgNode, 12))
     return false;
   return DbgNode->getNumOperands() == 20;
 }
@@ -503,15 +531,13 @@ bool DIGlobalVariable::Verify() const {
 
   if (getDisplayName().empty())
     return false;
-
-  if (getContext() && !getContext().Verify())
+  // Make sure context @ field 2 and type @ field 8 are MDNodes.
+  if (!fieldIsMDNode(DbgNode, 2))
     return false;
-
-  DIType Ty = getType();
-  if (!Ty.Verify())
+  if (!fieldIsMDNode(DbgNode, 8))
     return false;
-
-  if (!getGlobal() && !getConstant())
+  // Make sure StaticDataMemberDeclaration @ field 12 is MDNode.
+  if (!fieldIsMDNode(DbgNode, 12))
     return false;
 
   return DbgNode->getNumOperands() == 13;
@@ -522,13 +548,11 @@ bool DIVariable::Verify() const {
   if (!isVariable())
     return false;
 
-  if (getContext() && !getContext().Verify())
+  // Make sure context @ field 1 and type @ field 5 are MDNodes.
+  if (!fieldIsMDNode(DbgNode, 1))
     return false;
-
-  DIType Ty = getType();
-  if (!Ty.Verify())
+  if (!fieldIsMDNode(DbgNode, 5))
     return false;
-
   return DbgNode->getNumOperands() >= 8;
 }
 
@@ -548,9 +572,7 @@ bool DINameSpace::Verify() const {
 }
 
 /// \brief Retrieve the MDNode for the directory/file pair.
-MDNode *DIFile::getFileNode() const {
-  return const_cast<MDNode*>(getNodeField(DbgNode, 1));
-}
+MDNode *DIFile::getFileNode() const { return getNodeField(DbgNode, 1); }
 
 /// \brief Verify that the file descriptor is well formed.
 bool DIFile::Verify() const {
@@ -588,62 +610,82 @@ bool DITemplateValueParameter::Verify() const {
 }
 
 /// \brief Verify that the imported module descriptor is well formed.
-bool DIImportedModule::Verify() const {
-  return isImportedModule() && DbgNode->getNumOperands() == 4;
+bool DIImportedEntity::Verify() const {
+  return isImportedEntity() &&
+         (DbgNode->getNumOperands() == 4 || DbgNode->getNumOperands() == 5);
 }
 
-/// getOriginalTypeSize - If this type is derived from a base type then
-/// return base type size.
-uint64_t DIDerivedType::getOriginalTypeSize() const {
-  unsigned Tag = getTag();
-
-  if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
-      Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
-      Tag != dwarf::DW_TAG_restrict_type)
-    return getSizeInBits();
-
-  DIType BaseType = getTypeDerivedFrom();
-
-  // If this type is not derived from any type then take conservative approach.
-  if (!BaseType.isValid())
-    return getSizeInBits();
-
-  // If this is a derived type, go ahead and get the base type, unless it's a
-  // reference then it's just the size of the field. Pointer types have no need
-  // of this since they're a different type of qualification on the type.
-  if (BaseType.getTag() == dwarf::DW_TAG_reference_type ||
-      BaseType.getTag() == dwarf::DW_TAG_rvalue_reference_type)
-    return getSizeInBits();
-
-  if (BaseType.isDerivedType())
-    return DIDerivedType(BaseType).getOriginalTypeSize();
+/// getObjCProperty - Return property node, if this ivar is associated with one.
+MDNode *DIDerivedType::getObjCProperty() const {
+  return getNodeField(DbgNode, 10);
+}
 
-  return BaseType.getSizeInBits();
+MDString *DICompositeType::getIdentifier() const {
+  return cast_or_null<MDString>(getField(DbgNode, 14));
 }
 
-/// getObjCProperty - Return property node, if this ivar is associated with one.
-MDNode *DIDerivedType::getObjCProperty() const {
-  if (DbgNode->getNumOperands() <= 10)
-    return NULL;
-  return dyn_cast_or_null<MDNode>(DbgNode->getOperand(10));
+#ifndef NDEBUG
+static void VerifySubsetOf(const MDNode *LHS, const MDNode *RHS) {
+  for (unsigned i = 0; i != LHS->getNumOperands(); ++i) {
+    // Skip the 'empty' list (that's a single i32 0, rather than truly empty).
+    if (i == 0 && isa<ConstantInt>(LHS->getOperand(i)))
+      continue;
+    const MDNode *E = cast<MDNode>(LHS->getOperand(i));
+    bool found = false;
+    for (unsigned j = 0; !found && j != RHS->getNumOperands(); ++j)
+      found = E == RHS->getOperand(j);
+    assert(found && "Losing a member during member list replacement");
+  }
 }
+#endif
 
 /// \brief Set the array of member DITypes.
 void DICompositeType::setTypeArray(DIArray Elements, DIArray TParams) {
-  assert((!TParams || DbgNode->getNumOperands() == 14) &&
+  assert((!TParams || DbgNode->getNumOperands() == 15) &&
          "If you're setting the template parameters this should include a slot "
          "for that!");
   TrackingVH<MDNode> N(*this);
-  N->replaceOperandWith(10, Elements);
+  if (Elements) {
+#ifndef NDEBUG
+    // Check that the new list of members contains all the old members as well.
+    if (const MDNode *El = cast_or_null<MDNode>(N->getOperand(10)))
+      VerifySubsetOf(El, Elements);
+#endif
+    N->replaceOperandWith(10, Elements);
+  }
   if (TParams)
     N->replaceOperandWith(13, TParams);
   DbgNode = N;
 }
 
+void DICompositeType::addMember(DIDescriptor D) {
+  SmallVector<llvm::Value *, 16> M;
+  DIArray OrigM = getTypeArray();
+  unsigned Elements = OrigM.getNumElements();
+  if (Elements == 1 && !OrigM.getElement(0))
+    Elements = 0;
+  M.reserve(Elements + 1);
+  for (unsigned i = 0; i != Elements; ++i)
+    M.push_back(OrigM.getElement(i));
+  M.push_back(D);
+  setTypeArray(DIArray(MDNode::get(DbgNode->getContext(), M)));
+}
+
+/// Generate a reference to this DIType. Uses the type identifier instead
+/// of the actual MDNode if possible, to help type uniquing.
+DIScopeRef DIScope::getRef() const {
+  if (!isCompositeType())
+    return DIScopeRef(*this);
+  DICompositeType DTy(DbgNode);
+  if (!DTy.getIdentifier())
+    return DIScopeRef(*this);
+  return DIScopeRef(DTy.getIdentifier());
+}
+
 /// \brief Set the containing type.
 void DICompositeType::setContainingType(DICompositeType ContainingType) {
   TrackingVH<MDNode> N(*this);
-  N->replaceOperandWith(12, ContainingType);
+  N->replaceOperandWith(12, ContainingType.getRef());
   DbgNode = N;
 }
 
@@ -673,24 +715,59 @@ bool DISubprogram::describes(const Function *F) {
 }
 
 unsigned DISubprogram::isOptimized() const {
-  assert (DbgNode && "Invalid subprogram descriptor!");
+  assert(DbgNode && "Invalid subprogram descriptor!");
   if (DbgNode->getNumOperands() == 15)
     return getUnsignedField(14);
   return 0;
 }
 
 MDNode *DISubprogram::getVariablesNodes() const {
-  if (!DbgNode || DbgNode->getNumOperands() <= 18)
-    return NULL;
-  return dyn_cast_or_null<MDNode>(DbgNode->getOperand(18));
+  return getNodeField(DbgNode, 18);
 }
 
 DIArray DISubprogram::getVariables() const {
-  if (!DbgNode || DbgNode->getNumOperands() <= 18)
-    return DIArray();
-  if (MDNode *T = dyn_cast_or_null<MDNode>(DbgNode->getOperand(18)))
-    return DIArray(T);
-  return DIArray();
+  return DIArray(getNodeField(DbgNode, 18));
+}
+
+Value *DITemplateValueParameter::getValue() const {
+  return getField(DbgNode, 4);
+}
+
+// If the current node has a parent scope then return that,
+// else return an empty scope.
+DIScopeRef DIScope::getContext() const {
+
+  if (isType())
+    return DIType(DbgNode).getContext();
+
+  if (isSubprogram())
+    return DIScopeRef(DISubprogram(DbgNode).getContext());
+
+  if (isLexicalBlock())
+    return DIScopeRef(DILexicalBlock(DbgNode).getContext());
+
+  if (isLexicalBlockFile())
+    return DIScopeRef(DILexicalBlockFile(DbgNode).getContext());
+
+  if (isNameSpace())
+    return DIScopeRef(DINameSpace(DbgNode).getContext());
+
+  assert((isFile() || isCompileUnit()) && "Unhandled type of scope.");
+  return DIScopeRef(NULL);
+}
+
+// If the scope node has a name, return that, else return an empty string.
+StringRef DIScope::getName() const {
+  if (isType())
+    return DIType(DbgNode).getName();
+  if (isSubprogram())
+    return DISubprogram(DbgNode).getName();
+  if (isNameSpace())
+    return DINameSpace(DbgNode).getName();
+  assert((isLexicalBlock() || isLexicalBlockFile() || isFile() ||
+          isCompileUnit()) &&
+         "Unhandled type of scope.");
+  return StringRef();
 }
 
 StringRef DIScope::getFilename() const {
@@ -709,54 +786,51 @@ DIArray DICompileUnit::getEnumTypes() const {
   if (!DbgNode || DbgNode->getNumOperands() < 13)
     return DIArray();
 
-  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(7)))
-    return DIArray(N);
-  return DIArray();
+  return DIArray(getNodeField(DbgNode, 7));
 }
 
 DIArray DICompileUnit::getRetainedTypes() const {
   if (!DbgNode || DbgNode->getNumOperands() < 13)
     return DIArray();
 
-  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(8)))
-    return DIArray(N);
-  return DIArray();
+  return DIArray(getNodeField(DbgNode, 8));
 }
 
 DIArray DICompileUnit::getSubprograms() const {
   if (!DbgNode || DbgNode->getNumOperands() < 13)
     return DIArray();
 
-  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(9)))
-    return DIArray(N);
-  return DIArray();
+  return DIArray(getNodeField(DbgNode, 9));
 }
 
-
 DIArray DICompileUnit::getGlobalVariables() const {
   if (!DbgNode || DbgNode->getNumOperands() < 13)
     return DIArray();
 
-  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(10)))
-    return DIArray(N);
-  return DIArray();
+  return DIArray(getNodeField(DbgNode, 10));
 }
 
-DIArray DICompileUnit::getImportedModules() const {
+DIArray DICompileUnit::getImportedEntities() const {
   if (!DbgNode || DbgNode->getNumOperands() < 13)
     return DIArray();
 
-  if (MDNode *N = dyn_cast_or_null<MDNode>(DbgNode->getOperand(11)))
-    return DIArray(N);
-  return DIArray();
+  return DIArray(getNodeField(DbgNode, 11));
 }
 
-/// fixupObjcLikeName - Replace contains special characters used
+/// fixupSubprogramName - Replace contains special characters used
 /// in a typical Objective-C names with '.' in a given string.
-static void fixupObjcLikeName(StringRef Str, SmallVectorImpl<char> &Out) {
+static void fixupSubprogramName(DISubprogram Fn, SmallVectorImpl<char> &Out) {
+  StringRef FName =
+      Fn.getFunction() ? Fn.getFunction()->getName() : Fn.getName();
+  FName = Function::getRealLinkageName(FName);
+
+  StringRef Prefix("llvm.dbg.lv.");
+  Out.reserve(FName.size() + Prefix.size());
+  Out.append(Prefix.begin(), Prefix.end());
+
   bool isObjCLike = false;
-  for (size_t i = 0, e = Str.size(); i < e; ++i) {
-    char C = Str[i];
+  for (size_t i = 0, e = FName.size(); i < e; ++i) {
+    char C = FName[i];
     if (C == '[')
       isObjCLike = true;
 
@@ -771,33 +845,16 @@ static void fixupObjcLikeName(StringRef Str, SmallVectorImpl<char> &Out) {
 /// getFnSpecificMDNode - Return a NameMDNode, if available, that is
 /// suitable to hold function specific information.
 NamedMDNode *llvm::getFnSpecificMDNode(const Module &M, DISubprogram Fn) {
-  SmallString<32> Name = StringRef("llvm.dbg.lv.");
-  StringRef FName = "fn";
-  if (Fn.getFunction())
-    FName = Fn.getFunction()->getName();
-  else
-    FName = Fn.getName();
-  char One = '\1';
-  if (FName.startswith(StringRef(&One, 1)))
-    FName = FName.substr(1);
-  fixupObjcLikeName(FName, Name);
+  SmallString<32> Name;
+  fixupSubprogramName(Fn, Name);
   return M.getNamedMetadata(Name.str());
 }
 
 /// getOrInsertFnSpecificMDNode - Return a NameMDNode that is suitable
 /// to hold function specific information.
 NamedMDNode *llvm::getOrInsertFnSpecificMDNode(Module &M, DISubprogram Fn) {
-  SmallString<32> Name = StringRef("llvm.dbg.lv.");
-  StringRef FName = "fn";
-  if (Fn.getFunction())
-    FName = Fn.getFunction()->getName();
-  else
-    FName = Fn.getName();
-  char One = '\1';
-  if (FName.startswith(StringRef(&One, 1)))
-    FName = FName.substr(1);
-  fixupObjcLikeName(FName, Name);
-
+  SmallString<32> Name;
+  fixupSubprogramName(Fn, Name);
   return M.getOrInsertNamedMetadata(Name.str());
 }
 
@@ -810,8 +867,7 @@ DIVariable llvm::createInlinedVariable(MDNode *DV, MDNode *InlinedScope,
   SmallVector<Value *, 16> Elts;
   // Insert inlined scope as 7th element.
   for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
-    i == 7 ? Elts.push_back(InlinedScope) :
-             Elts.push_back(DV->getOperand(i));
+    i == 7 ? Elts.push_back(InlinedScope) : Elts.push_back(DV->getOperand(i));
   return DIVariable(MDNode::get(VMContext, Elts));
 }
 
@@ -820,9 +876,8 @@ DIVariable llvm::cleanseInlinedVariable(MDNode *DV, LLVMContext &VMContext) {
   SmallVector<Value *, 16> Elts;
   // Insert inlined scope as 7th element.
   for (unsigned i = 0, e = DV->getNumOperands(); i != e; ++i)
-    i == 7 ?
-      Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext))):
-      Elts.push_back(DV->getOperand(i));
+    i == 7 ? Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)))
+           : Elts.push_back(DV->getOperand(i));
   return DIVariable(MDNode::get(VMContext, Elts));
 }
 
@@ -846,31 +901,70 @@ DICompositeType llvm::getDICompositeType(DIType T) {
   if (T.isCompositeType())
     return DICompositeType(T);
 
-  if (T.isDerivedType())
-    return getDICompositeType(DIDerivedType(T).getTypeDerivedFrom());
+  if (T.isDerivedType()) {
+    // This function is currently used by dragonegg and dragonegg does
+    // not generate identifier for types, so using an empty map to resolve
+    // DerivedFrom should be fine.
+    DITypeIdentifierMap EmptyMap;
+    return getDICompositeType(
+        DIDerivedType(T).getTypeDerivedFrom().resolve(EmptyMap));
+  }
 
   return DICompositeType();
 }
 
-/// isSubprogramContext - Return true if Context is either a subprogram
-/// or another context nested inside a subprogram.
-bool llvm::isSubprogramContext(const MDNode *Context) {
-  if (!Context)
-    return false;
-  DIDescriptor D(Context);
-  if (D.isSubprogram())
-    return true;
-  if (D.isType())
-    return isSubprogramContext(DIType(Context).getContext());
-  return false;
+/// Update DITypeIdentifierMap by going through retained types of each CU.
+DITypeIdentifierMap
+llvm::generateDITypeIdentifierMap(const NamedMDNode *CU_Nodes) {
+  DITypeIdentifierMap Map;
+  for (unsigned CUi = 0, CUe = CU_Nodes->getNumOperands(); CUi != CUe; ++CUi) {
+    DICompileUnit CU(CU_Nodes->getOperand(CUi));
+    DIArray Retain = CU.getRetainedTypes();
+    for (unsigned Ti = 0, Te = Retain.getNumElements(); Ti != Te; ++Ti) {
+      if (!Retain.getElement(Ti).isCompositeType())
+        continue;
+      DICompositeType Ty(Retain.getElement(Ti));
+      if (MDString *TypeId = Ty.getIdentifier()) {
+        // Definition has priority over declaration.
+        // Try to insert (TypeId, Ty) to Map.
+        std::pair<DITypeIdentifierMap::iterator, bool> P =
+            Map.insert(std::make_pair(TypeId, Ty));
+        // If TypeId already exists in Map and this is a definition, replace
+        // whatever we had (declaration or definition) with the definition.
+        if (!P.second && !Ty.isForwardDecl())
+          P.first->second = Ty;
+      }
+    }
+  }
+  return Map;
 }
 
 //===----------------------------------------------------------------------===//
 // DebugInfoFinder implementations.
 //===----------------------------------------------------------------------===//
 
+void DebugInfoFinder::reset() {
+  CUs.clear();
+  SPs.clear();
+  GVs.clear();
+  TYs.clear();
+  Scopes.clear();
+  NodesSeen.clear();
+  TypeIdentifierMap.clear();
+  TypeMapInitialized = false;
+}
+
+void DebugInfoFinder::InitializeTypeMap(const Module &M) {
+  if (!TypeMapInitialized)
+    if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
+      TypeIdentifierMap = generateDITypeIdentifierMap(CU_Nodes);
+      TypeMapInitialized = true;
+    }
+}
+
 /// processModule - Process entire module and collect debug info.
 void DebugInfoFinder::processModule(const Module &M) {
+  InitializeTypeMap(M);
   if (NamedMDNode *CU_Nodes = M.getNamedMetadata("llvm.dbg.cu")) {
     for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
       DICompileUnit CU(CU_Nodes->getOperand(i));
@@ -878,8 +972,10 @@ void DebugInfoFinder::processModule(const Module &M) {
       DIArray GVs = CU.getGlobalVariables();
       for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) {
         DIGlobalVariable DIG(GVs.getElement(i));
-        if (addGlobalVariable(DIG))
+        if (addGlobalVariable(DIG)) {
+          processScope(DIG.getContext());
           processType(DIG.getType());
+        }
       }
       DIArray SPs = CU.getSubprograms();
       for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i)
@@ -890,36 +986,38 @@ void DebugInfoFinder::processModule(const Module &M) {
       DIArray RetainedTypes = CU.getRetainedTypes();
       for (unsigned i = 0, e = RetainedTypes.getNumElements(); i != e; ++i)
         processType(DIType(RetainedTypes.getElement(i)));
-      // FIXME: We really shouldn't be bailing out after visiting just one CU
-      return;
+      DIArray Imports = CU.getImportedEntities();
+      for (unsigned i = 0, e = Imports.getNumElements(); i != e; ++i) {
+        DIImportedEntity Import = DIImportedEntity(Imports.getElement(i));
+        DIDescriptor Entity = Import.getEntity();
+        if (Entity.isType())
+          processType(DIType(Entity));
+        else if (Entity.isSubprogram())
+          processSubprogram(DISubprogram(Entity));
+        else if (Entity.isNameSpace())
+          processScope(DINameSpace(Entity).getContext());
+      }
     }
   }
 }
 
 /// processLocation - Process DILocation.
-void DebugInfoFinder::processLocation(DILocation Loc) {
-  if (!Loc.Verify()) return;
-  DIDescriptor S(Loc.getScope());
-  if (S.isCompileUnit())
-    addCompileUnit(DICompileUnit(S));
-  else if (S.isSubprogram())
-    processSubprogram(DISubprogram(S));
-  else if (S.isLexicalBlock())
-    processLexicalBlock(DILexicalBlock(S));
-  else if (S.isLexicalBlockFile()) {
-    DILexicalBlockFile DBF = DILexicalBlockFile(S);
-    processLexicalBlock(DILexicalBlock(DBF.getScope()));
-  }
-  processLocation(Loc.getOrigLocation());
+void DebugInfoFinder::processLocation(const Module &M, DILocation Loc) {
+  if (!Loc)
+    return;
+  InitializeTypeMap(M);
+  processScope(Loc.getScope());
+  processLocation(M, Loc.getOrigLocation());
 }
 
 /// processType - Process DIType.
 void DebugInfoFinder::processType(DIType DT) {
   if (!addType(DT))
     return;
+  processScope(DT.getContext().resolve(TypeIdentifierMap));
   if (DT.isCompositeType()) {
     DICompositeType DCT(DT);
-    processType(DCT.getTypeDerivedFrom());
+    processType(DCT.getTypeDerivedFrom().resolve(TypeIdentifierMap));
     DIArray DA = DCT.getTypeArray();
     for (unsigned i = 0, e = DA.getNumElements(); i != e; ++i) {
       DIDescriptor D = DA.getElement(i);
@@ -930,7 +1028,35 @@ void DebugInfoFinder::processType(DIType DT) {
     }
   } else if (DT.isDerivedType()) {
     DIDerivedType DDT(DT);
-    processType(DDT.getTypeDerivedFrom());
+    processType(DDT.getTypeDerivedFrom().resolve(TypeIdentifierMap));
+  }
+}
+
+void DebugInfoFinder::processScope(DIScope Scope) {
+  if (Scope.isType()) {
+    DIType Ty(Scope);
+    processType(Ty);
+    return;
+  }
+  if (Scope.isCompileUnit()) {
+    addCompileUnit(DICompileUnit(Scope));
+    return;
+  }
+  if (Scope.isSubprogram()) {
+    processSubprogram(DISubprogram(Scope));
+    return;
+  }
+  if (!addScope(Scope))
+    return;
+  if (Scope.isLexicalBlock()) {
+    DILexicalBlock LB(Scope);
+    processScope(LB.getContext());
+  } else if (Scope.isLexicalBlockFile()) {
+    DILexicalBlockFile LBF = DILexicalBlockFile(Scope);
+    processScope(LBF.getScope());
+  } else if (Scope.isNameSpace()) {
+    DINameSpace NS(Scope);
+    processScope(NS.getContext());
   }
 }
 
@@ -942,8 +1068,7 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
   else if (Context.isLexicalBlockFile()) {
     DILexicalBlockFile DBF = DILexicalBlockFile(Context);
     return processLexicalBlock(DILexicalBlock(DBF.getScope()));
-  }
-  else
+  } else
     return processSubprogram(DISubprogram(Context));
 }
 
@@ -951,13 +1076,30 @@ void DebugInfoFinder::processLexicalBlock(DILexicalBlock LB) {
 void DebugInfoFinder::processSubprogram(DISubprogram SP) {
   if (!addSubprogram(SP))
     return;
+  processScope(SP.getContext().resolve(TypeIdentifierMap));
   processType(SP.getType());
+  DIArray TParams = SP.getTemplateParams();
+  for (unsigned I = 0, E = TParams.getNumElements(); I != E; ++I) {
+    DIDescriptor Element = TParams.getElement(I);
+    if (Element.isTemplateTypeParameter()) {
+      DITemplateTypeParameter TType(Element);
+      processScope(TType.getContext().resolve(TypeIdentifierMap));
+      processType(TType.getType().resolve(TypeIdentifierMap));
+    } else if (Element.isTemplateValueParameter()) {
+      DITemplateValueParameter TVal(Element);
+      processScope(TVal.getContext().resolve(TypeIdentifierMap));
+      processType(TVal.getType().resolve(TypeIdentifierMap));
+    }
+  }
 }
 
 /// processDeclare - Process DbgDeclareInst.
-void DebugInfoFinder::processDeclare(const DbgDeclareInst *DDI) {
+void DebugInfoFinder::processDeclare(const Module &M,
+                                     const DbgDeclareInst *DDI) {
   MDNode *N = dyn_cast<MDNode>(DDI->getVariable());
-  if (!N) return;
+  if (!N)
+    return;
+  InitializeTypeMap(M);
 
   DIDescriptor DV(N);
   if (!DV.isVariable())
@@ -965,12 +1107,29 @@ void DebugInfoFinder::processDeclare(const DbgDeclareInst *DDI) {
 
   if (!NodesSeen.insert(DV))
     return;
+  processScope(DIVariable(N).getContext());
+  processType(DIVariable(N).getType());
+}
+
+void DebugInfoFinder::processValue(const Module &M, const DbgValueInst *DVI) {
+  MDNode *N = dyn_cast<MDNode>(DVI->getVariable());
+  if (!N)
+    return;
+  InitializeTypeMap(M);
+
+  DIDescriptor DV(N);
+  if (!DV.isVariable())
+    return;
+
+  if (!NodesSeen.insert(DV))
+    return;
+  processScope(DIVariable(N).getContext());
   processType(DIVariable(N).getType());
 }
 
 /// addType - Add type into Tys.
 bool DebugInfoFinder::addType(DIType DT) {
-  if (!DT.isValid())
+  if (!DT)
     return false;
 
   if (!NodesSeen.insert(DT))
@@ -982,9 +1141,8 @@ bool DebugInfoFinder::addType(DIType DT) {
 
 /// addCompileUnit - Add compile unit into CUs.
 bool DebugInfoFinder::addCompileUnit(DICompileUnit CU) {
-  if (!CU.Verify())
+  if (!CU)
     return false;
-
   if (!NodesSeen.insert(CU))
     return false;
 
@@ -994,7 +1152,7 @@ bool DebugInfoFinder::addCompileUnit(DICompileUnit CU) {
 
 /// addGlobalVariable - Add global variable into GVs.
 bool DebugInfoFinder::addGlobalVariable(DIGlobalVariable DIG) {
-  if (!DIDescriptor(DIG).isGlobalVariable())
+  if (!DIG)
     return false;
 
   if (!NodesSeen.insert(DIG))
@@ -1006,7 +1164,7 @@ bool DebugInfoFinder::addGlobalVariable(DIGlobalVariable DIG) {
 
 // addSubprogram - Add subprgoram into SPs.
 bool DebugInfoFinder::addSubprogram(DISubprogram SP) {
-  if (!DIDescriptor(SP).isSubprogram())
+  if (!SP)
     return false;
 
   if (!NodesSeen.insert(SP))
@@ -1016,18 +1174,33 @@ bool DebugInfoFinder::addSubprogram(DISubprogram SP) {
   return true;
 }
 
+bool DebugInfoFinder::addScope(DIScope Scope) {
+  if (!Scope)
+    return false;
+  // FIXME: Ocaml binding generates a scope with no content, we treat it
+  // as null for now.
+  if (Scope->getNumOperands() == 0)
+    return false;
+  if (!NodesSeen.insert(Scope))
+    return false;
+  Scopes.push_back(Scope);
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // DIDescriptor: dump routines for all descriptors.
 //===----------------------------------------------------------------------===//
 
 /// dump - Print descriptor to dbgs() with a newline.
 void DIDescriptor::dump() const {
-  print(dbgs()); dbgs() << '\n';
+  print(dbgs());
+  dbgs() << '\n';
 }
 
 /// print - Print descriptor.
 void DIDescriptor::print(raw_ostream &OS) const {
-  if (!DbgNode) return;
+  if (!DbgNode)
+    return;
 
   if (const char *Tag = dwarf::TagString(getTag()))
     OS << "[ " << Tag << " ]";
@@ -1089,7 +1262,8 @@ void DIEnumerator::printInternal(raw_ostream &OS) const {
 }
 
 void DIType::printInternal(raw_ostream &OS) const {
-  if (!DbgNode) return;
+  if (!DbgNode)
+    return;
 
   StringRef Res = getName();
   if (!Res.empty())
@@ -1097,13 +1271,11 @@ void DIType::printInternal(raw_ostream &OS) const {
 
   // TODO: Print context?
 
-  OS << " [line " << getLineNumber()
-     << ", size " << getSizeInBits()
-     << ", align " << getAlignInBits()
-     << ", offset " << getOffsetInBits();
+  OS << " [line " << getLineNumber() << ", size " << getSizeInBits()
+     << ", align " << getAlignInBits() << ", offset " << getOffsetInBits();
   if (isBasicType())
     if (const char *Enc =
-        dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
+            dwarf::AttributeEncodingString(DIBasicType(DbgNode).getEncoding()))
       OS << ", enc " << Enc;
   OS << "]";
 
@@ -1116,7 +1288,12 @@ void DIType::printInternal(raw_ostream &OS) const {
     OS << " [artificial]";
 
   if (isForwardDecl())
-    OS << " [fwd]";
+    OS << " [decl]";
+  else if (getTag() == dwarf::DW_TAG_structure_type ||
+           getTag() == dwarf::DW_TAG_union_type ||
+           getTag() == dwarf::DW_TAG_enumeration_type ||
+           getTag() == dwarf::DW_TAG_class_type)
+    OS << " [def]";
   if (isVector())
     OS << " [vector]";
   if (isStaticMember())
@@ -1194,19 +1371,17 @@ void DIObjCProperty::printInternal(raw_ostream &OS) const {
   if (!Name.empty())
     OS << " [" << Name << ']';
 
-  OS << " [line " << getLineNumber()
-     << ", properties " << getUnsignedField(6) << ']';
+  OS << " [line " << getLineNumber() << ", properties " << getUnsignedField(6)
+     << ']';
 }
 
 static void printDebugLoc(DebugLoc DL, raw_ostream &CommentOS,
                           const LLVMContext &Ctx) {
-  if (!DL.isUnknown()) {          // Print source line info.
+  if (!DL.isUnknown()) { // Print source line info.
     DIScope Scope(DL.getScope(Ctx));
+    assert(Scope.isScope() && "Scope of a DebugLoc should be a DIScope.");
     // Omit the directory, because it's likely to be long and uninteresting.
-    if (Scope.Verify())
-      CommentOS << Scope.getFilename();
-    else
-      CommentOS << "<unknown>";
+    CommentOS << Scope.getFilename();
     CommentOS << ':' << DL.getLine();
     if (DL.getCol() != 0)
       CommentOS << ':' << DL.getCol();
@@ -1233,3 +1408,81 @@ void DIVariable::printExtendedName(raw_ostream &OS) const {
     }
   }
 }
+
+/// Specialize constructor to make sure it has the correct type.
+template <> DIRef<DIScope>::DIRef(const Value *V) : Val(V) {
+  assert(isScopeRef(V) && "DIScopeRef should be a MDString or MDNode");
+}
+template <> DIRef<DIType>::DIRef(const Value *V) : Val(V) {
+  assert(isTypeRef(V) && "DITypeRef should be a MDString or MDNode");
+}
+
+/// Specialize getFieldAs to handle fields that are references to DIScopes.
+template <>
+DIScopeRef DIDescriptor::getFieldAs<DIScopeRef>(unsigned Elt) const {
+  return DIScopeRef(getField(DbgNode, Elt));
+}
+/// Specialize getFieldAs to handle fields that are references to DITypes.
+template <> DITypeRef DIDescriptor::getFieldAs<DITypeRef>(unsigned Elt) const {
+  return DITypeRef(getField(DbgNode, Elt));
+}
+
+/// Strip debug info in the module if it exists.
+/// To do this, we remove all calls to the debugger intrinsics and any named
+/// metadata for debugging. We also remove debug locations for instructions.
+/// Return true if module is modified.
+bool llvm::StripDebugInfo(Module &M) {
+
+  bool Changed = false;
+
+  // Remove all of the calls to the debugger intrinsics, and remove them from
+  // the module.
+  if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
+    while (!Declare->use_empty()) {
+      CallInst *CI = cast<CallInst>(Declare->use_back());
+      CI->eraseFromParent();
+    }
+    Declare->eraseFromParent();
+    Changed = true;
+  }
+
+  if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
+    while (!DbgVal->use_empty()) {
+      CallInst *CI = cast<CallInst>(DbgVal->use_back());
+      CI->eraseFromParent();
+    }
+    DbgVal->eraseFromParent();
+    Changed = true;
+  }
+
+  for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
+         NME = M.named_metadata_end(); NMI != NME;) {
+    NamedMDNode *NMD = NMI;
+    ++NMI;
+    if (NMD->getName().startswith("llvm.dbg.")) {
+      NMD->eraseFromParent();
+      Changed = true;
+    }
+  }
+
+  for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
+    for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
+         ++FI)
+      for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
+           ++BI) {
+        if (!BI->getDebugLoc().isUnknown()) {
+          Changed = true;
+          BI->setDebugLoc(DebugLoc());
+        }
+      }
+
+  return Changed;
+}
+
+/// Return Debug Info Metadata Version by checking module flags.
+unsigned llvm::getDebugMetadataVersionFromModule(const Module &M) {
+  Value *Val = M.getModuleFlag("Debug Info Version");
+  if (!Val)
+    return 0;
+  return cast<ConstantInt>(Val)->getZExtValue();
+}
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 7f7efab..e8a2402 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -131,6 +131,15 @@ bool Argument::hasReturnedAttr() const {
     hasAttribute(getArgNo()+1, Attribute::Returned);
 }
 
+/// Return true if this argument has the readonly or readnone attribute on it
+/// in its containing function.
+bool Argument::onlyReadsMemory() const {
+  return getParent()->getAttributes().
+      hasAttribute(getArgNo()+1, Attribute::ReadOnly) ||
+      getParent()->getAttributes().
+      hasAttribute(getArgNo()+1, Attribute::ReadNone);
+}
+
 /// addAttr - Add attributes to an argument.
 void Argument::addAttr(AttributeSet AS) {
   assert(AS.getNumSlots() <= 1 &&
@@ -267,6 +276,9 @@ void Function::dropAllReferences() {
   // blockaddresses, but BasicBlock's destructor takes care of those.
   while (!BasicBlocks.empty())
     BasicBlocks.begin()->eraseFromParent();
+
+  // Prefix data is stored in a side table.
+  setPrefixData(0);
 }
 
 void Function::addAttribute(unsigned i, Attribute::AttrKind attr) {
@@ -342,6 +354,10 @@ void Function::copyAttributesFrom(const GlobalValue *Src) {
     setGC(SrcF->getGC());
   else
     clearGC();
+  if (SrcF->hasPrefixData())
+    setPrefixData(SrcF->getPrefixData());
+  else
+    setPrefixData(0);
 }
 
 /// getIntrinsicID - This method returns the ID number of the specified
@@ -437,7 +453,9 @@ enum IIT_Info {
   IIT_STRUCT5 = 22,
   IIT_EXTEND_VEC_ARG = 23,
   IIT_TRUNC_VEC_ARG = 24,
-  IIT_ANYPTR = 25
+  IIT_ANYPTR = 25,
+  IIT_V1   = 26,
+  IIT_VARARG = 27
 };
 
 
@@ -451,6 +469,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
   case IIT_Done:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0));
     return;
+  case IIT_VARARG:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::VarArg, 0));
+    return;
   case IIT_MMX:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::MMX, 0));
     return;
@@ -481,6 +502,10 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
   case IIT_I64:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 64));
     return;
+  case IIT_V1:
+    OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 1));
+    DecodeIITType(NextElt, Infos, OutputTable);
+    return;
   case IIT_V2:
     OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2));
     DecodeIITType(NextElt, Infos, OutputTable);
@@ -592,6 +617,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
 
   switch (D.Kind) {
   case IITDescriptor::Void: return Type::getVoidTy(Context);
+  case IITDescriptor::VarArg: return Type::getVoidTy(Context);
   case IITDescriptor::MMX: return Type::getX86_MMXTy(Context);
   case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
   case IITDescriptor::Half: return Type::getHalfTy(Context);
@@ -712,3 +738,31 @@ bool Function::callsFunctionThatReturnsTwice() const {
   return false;
 }
 
+Constant *Function::getPrefixData() const {
+  assert(hasPrefixData());
+  const LLVMContextImpl::PrefixDataMapTy &PDMap =
+      getContext().pImpl->PrefixDataMap;
+  assert(PDMap.find(this) != PDMap.end());
+  return cast<Constant>(PDMap.find(this)->second->getReturnValue());
+}
+
+void Function::setPrefixData(Constant *PrefixData) {
+  if (!PrefixData && !hasPrefixData())
+    return;
+
+  unsigned SCData = getSubclassDataFromValue();
+  LLVMContextImpl::PrefixDataMapTy &PDMap = getContext().pImpl->PrefixDataMap;
+  ReturnInst *&PDHolder = PDMap[this];
+  if (PrefixData) {
+    if (PDHolder)
+      PDHolder->setOperand(0, PrefixData);
+    else
+      PDHolder = ReturnInst::Create(getContext(), PrefixData);
+    SCData |= 2;
+  } else {
+    delete PDHolder;
+    PDMap.erase(this);
+    SCData &= ~2;
+  }
+  setValueSubclassData(SCData);
+}
diff --git a/lib/IR/GCOV.cpp b/lib/IR/GCOV.cpp
index ea2f0a6..f0f8c7d 100644
--- a/lib/IR/GCOV.cpp
+++ b/lib/IR/GCOV.cpp
@@ -7,14 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// GCOV implements the interface to read and write coverage files that use 
+// GCOV implements the interface to read and write coverage files that use
 // 'gcov' format.
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/GCOV.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/system_error.h"
 using namespace llvm;
@@ -43,27 +45,47 @@ bool GCOVFile::read(GCOVBuffer &Buffer) {
   if (Format == GCOV::InvalidGCOV)
     return false;
 
-  unsigned i = 0;
-  while (1) {
-    GCOVFunction *GFun = NULL;
-    if (isGCDAFile(Format)) {
-      // Use existing function while reading .gcda file.
-      assert(i < Functions.size() && ".gcda data does not match .gcno data");
-      GFun = Functions[i];
-    } else if (isGCNOFile(Format)){
-      GFun = new GCOVFunction();
+  if (isGCNOFile(Format)) {
+    while (true) {
+      if (!Buffer.readFunctionTag()) break;
+      GCOVFunction *GFun = new GCOVFunction();
+      if (!GFun->read(Buffer, Format))
+        return false;
       Functions.push_back(GFun);
     }
-    if (!GFun || !GFun->read(Buffer, Format))
-      break;
-    ++i;
   }
+  else if (isGCDAFile(Format)) {
+    for (size_t i = 0, e = Functions.size(); i < e; ++i) {
+      if (!Buffer.readFunctionTag()) {
+        errs() << "Unexpected number of functions.\n";
+        return false;
+      }
+      if (!Functions[i]->read(Buffer, Format))
+        return false;
+    }
+    if (Buffer.readObjectTag()) {
+      uint32_t Length;
+      uint32_t Dummy;
+      if (!Buffer.readInt(Length)) return false;
+      if (!Buffer.readInt(Dummy)) return false; // checksum
+      if (!Buffer.readInt(Dummy)) return false; // num
+      if (!Buffer.readInt(RunCount)) return false;;
+      Buffer.advanceCursor(Length-3);
+    }
+    while (Buffer.readProgramTag()) {
+      uint32_t Length;
+      if (!Buffer.readInt(Length)) return false;
+      Buffer.advanceCursor(Length);
+      ++ProgramCount;
+    }
+  }
+
   return true;
 }
 
-/// dump - Dump GCOVFile content on standard out for debugging purposes.
+/// dump - Dump GCOVFile content to dbgs() for debugging purposes.
 void GCOVFile::dump() {
-  for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
+  for (SmallVectorImpl<GCOVFunction *>::iterator I = Functions.begin(),
          E = Functions.end(); I != E; ++I)
     (*I)->dump();
 }
@@ -71,10 +93,11 @@ void GCOVFile::dump() {
 /// collectLineCounts - Collect line counts. This must be used after
 /// reading .gcno and .gcda files.
 void GCOVFile::collectLineCounts(FileInfo &FI) {
-  for (SmallVector<GCOVFunction *, 16>::iterator I = Functions.begin(),
-         E = Functions.end(); I != E; ++I) 
+  for (SmallVectorImpl<GCOVFunction *>::iterator I = Functions.begin(),
+         E = Functions.end(); I != E; ++I)
     (*I)->collectLineCounts(FI);
-  FI.print();
+  FI.setRunCount(RunCount);
+  FI.setProgramCount(ProgramCount);
 }
 
 //===----------------------------------------------------------------------===//
@@ -85,77 +108,122 @@ GCOVFunction::~GCOVFunction() {
   DeleteContainerPointers(Blocks);
 }
 
-/// read - Read a aunction from the buffer. Return false if buffer cursor
+/// read - Read a function from the buffer. Return false if buffer cursor
 /// does not point to a function tag.
 bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
-  if (!Buff.readFunctionTag())
-    return false;
+  uint32_t Dummy;
+  if (!Buff.readInt(Dummy)) return false; // Function header length
+  if (!Buff.readInt(Ident)) return false;
+  if (!Buff.readInt(Dummy)) return false; // Checksum #1
+  if (Format != GCOV::GCNO_402 && Format != GCOV::GCDA_402)
+    if (!Buff.readInt(Dummy)) return false; // Checksum #2
 
-  Buff.readInt(); // Function header length
-  Ident = Buff.readInt(); 
-  Buff.readInt(); // Checksum #1
-  if (Format != GCOV::GCNO_402)
-    Buff.readInt(); // Checksum #2
+  if (!Buff.readString(Name)) return false;
 
-  Name = Buff.readString();
   if (Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404)
-    Filename = Buff.readString();
+    if (!Buff.readString(Filename)) return false;
 
   if (Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404) {
-    Buff.readArcTag();
-    uint32_t Count = Buff.readInt() / 2;
-    for (unsigned i = 0, e = Count; i != e; ++i) {
-      Blocks[i]->addCount(Buff.readInt64());
+    if (!Buff.readArcTag()) {
+      errs() << "Arc tag not found.\n";
+      return false;
+    }
+    uint32_t Count;
+    if (!Buff.readInt(Count)) return false;
+    Count /= 2;
+
+    // This for loop adds the counts for each block. A second nested loop is
+    // required to combine the edge counts that are contained in the GCDA file.
+    for (uint32_t Line = 0; Count > 0; ++Line) {
+      if (Line >= Blocks.size()) {
+        errs() << "Unexpected number of edges.\n";
+        return false;
+      }
+      GCOVBlock &Block = *Blocks[Line];
+      for (size_t Edge = 0, End = Block.getNumEdges(); Edge < End; ++Edge) {
+        if (Count == 0) {
+          errs() << "Unexpected number of edges.\n";
+          return false;
+        }
+        uint64_t ArcCount;
+        if (!Buff.readInt64(ArcCount)) return false;
+        Block.addCount(ArcCount);
+        --Count;
+      }
     }
     return true;
   }
 
-  LineNumber = Buff.readInt();
+  if (!Buff.readInt(LineNumber)) return false;
 
   // read blocks.
-  bool BlockTagFound = Buff.readBlockTag();
-  (void)BlockTagFound;
-  assert(BlockTagFound && "Block Tag not found!");
-  uint32_t BlockCount = Buff.readInt();
-  for (int i = 0, e = BlockCount; i != e; ++i) {
-    Buff.readInt(); // Block flags;
-    Blocks.push_back(new GCOVBlock(i));
+  if (!Buff.readBlockTag()) {
+    errs() << "Block tag not found.\n";
+    return false;
+  }
+  uint32_t BlockCount;
+  if (!Buff.readInt(BlockCount)) return false;
+  for (uint32_t i = 0, e = BlockCount; i != e; ++i) {
+    if (!Buff.readInt(Dummy)) return false; // Block flags;
+    Blocks.push_back(new GCOVBlock(*this, i));
   }
 
   // read edges.
   while (Buff.readEdgeTag()) {
-    uint32_t EdgeCount = (Buff.readInt() - 1) / 2;
-    uint32_t BlockNo = Buff.readInt();
-    assert(BlockNo < BlockCount && "Unexpected Block number!");
-    for (int i = 0, e = EdgeCount; i != e; ++i) {
-      Blocks[BlockNo]->addEdge(Buff.readInt());
-      Buff.readInt(); // Edge flag
+    uint32_t EdgeCount;
+    if (!Buff.readInt(EdgeCount)) return false;
+    EdgeCount = (EdgeCount - 1) / 2;
+    uint32_t BlockNo;
+    if (!Buff.readInt(BlockNo)) return false;
+    if (BlockNo >= BlockCount) {
+      errs() << "Unexpected block number.\n";
+      return false;
+    }
+    for (uint32_t i = 0, e = EdgeCount; i != e; ++i) {
+      uint32_t Dst;
+      if (!Buff.readInt(Dst)) return false;
+      Blocks[BlockNo]->addEdge(Dst);
+      if (!Buff.readInt(Dummy)) return false; // Edge flag
     }
   }
 
   // read line table.
   while (Buff.readLineTag()) {
-    uint32_t LineTableLength = Buff.readInt();
-    uint32_t Size = Buff.getCursor() + LineTableLength*4;
-    uint32_t BlockNo = Buff.readInt();
-    assert(BlockNo < BlockCount && "Unexpected Block number!");
+    uint32_t LineTableLength;
+    if (!Buff.readInt(LineTableLength)) return false;
+    uint32_t EndPos = Buff.getCursor() + LineTableLength*4;
+    uint32_t BlockNo;
+    if (!Buff.readInt(BlockNo)) return false;
+    if (BlockNo >= BlockCount) {
+      errs() << "Unexpected block number.\n";
+      return false;
+    }
     GCOVBlock *Block = Blocks[BlockNo];
-    Buff.readInt(); // flag
-    while (Buff.getCursor() != (Size - 4)) {
-      StringRef Filename = Buff.readString();
-      if (Buff.getCursor() == (Size - 4)) break;
-      while (uint32_t L = Buff.readInt())
-        Block->addLine(Filename, L);
+    if (!Buff.readInt(Dummy)) return false; // flag
+    while (Buff.getCursor() != (EndPos - 4)) {
+      StringRef F;
+      if (!Buff.readString(F)) return false;
+      if (F != Filename) {
+        errs() << "Multiple sources for a single basic block.\n";
+        return false;
+      }
+      if (Buff.getCursor() == (EndPos - 4)) break;
+      while (true) {
+        uint32_t Line;
+        if (!Buff.readInt(Line)) return false;
+        if (!Line) break;
+        Block->addLine(Line);
+      }
     }
-    Buff.readInt(); // flag
+    if (!Buff.readInt(Dummy)) return false; // flag
   }
   return true;
 }
 
-/// dump - Dump GCOVFunction content on standard out for debugging purposes.
+/// dump - Dump GCOVFunction content to dbgs() for debugging purposes.
 void GCOVFunction::dump() {
-  outs() <<  "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
-  for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
+  dbgs() <<  "===== " << Name << " @ " << Filename << ":" << LineNumber << "\n";
+  for (SmallVectorImpl<GCOVBlock *>::iterator I = Blocks.begin(),
          E = Blocks.end(); I != E; ++I)
     (*I)->dump();
 }
@@ -163,7 +231,7 @@ void GCOVFunction::dump() {
 /// collectLineCounts - Collect line counts. This must be used after
 /// reading .gcno and .gcda files.
 void GCOVFunction::collectLineCounts(FileInfo &FI) {
-  for (SmallVector<GCOVBlock *, 16>::iterator I = Blocks.begin(),
+  for (SmallVectorImpl<GCOVBlock *>::iterator I = Blocks.begin(),
          E = Blocks.end(); I != E; ++I)
     (*I)->collectLineCounts(FI);
 }
@@ -174,110 +242,73 @@ void GCOVFunction::collectLineCounts(FileInfo &FI) {
 /// ~GCOVBlock - Delete GCOVBlock and its content.
 GCOVBlock::~GCOVBlock() {
   Edges.clear();
-  DeleteContainerSeconds(Lines);
-}
-
-void GCOVBlock::addLine(StringRef Filename, uint32_t LineNo) {
-  GCOVLines *&LinesForFile = Lines[Filename];
-  if (!LinesForFile)
-    LinesForFile = new GCOVLines();
-  LinesForFile->add(LineNo);
+  Lines.clear();
 }
 
 /// collectLineCounts - Collect line counts. This must be used after
 /// reading .gcno and .gcda files.
 void GCOVBlock::collectLineCounts(FileInfo &FI) {
-  for (StringMap<GCOVLines *>::iterator I = Lines.begin(),
+  for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
          E = Lines.end(); I != E; ++I)
-    I->second->collectLineCounts(FI, I->first(), Counter);
+    FI.addLineCount(Parent.getFilename(), *I, Counter);
 }
 
-/// dump - Dump GCOVBlock content on standard out for debugging purposes.
+/// dump - Dump GCOVBlock content to dbgs() for debugging purposes.
 void GCOVBlock::dump() {
-  outs() << "Block : " << Number << " Counter : " << Counter << "\n";
+  dbgs() << "Block : " << Number << " Counter : " << Counter << "\n";
   if (!Edges.empty()) {
-    outs() << "\tEdges : ";
-    for (SmallVector<uint32_t, 16>::iterator I = Edges.begin(), E = Edges.end();
+    dbgs() << "\tEdges : ";
+    for (SmallVectorImpl<uint32_t>::iterator I = Edges.begin(), E = Edges.end();
          I != E; ++I)
-      outs() << (*I) << ",";
-    outs() << "\n";
+      dbgs() << (*I) << ",";
+    dbgs() << "\n";
   }
   if (!Lines.empty()) {
-    outs() << "\tLines : ";
-    for (StringMap<GCOVLines *>::iterator LI = Lines.begin(),
-           LE = Lines.end(); LI != LE; ++LI) {
-      outs() << LI->first() << " -> ";
-      LI->second->dump();
-      outs() << "\n";
-    }
+    dbgs() << "\tLines : ";
+    for (SmallVectorImpl<uint32_t>::iterator I = Lines.begin(),
+           E = Lines.end(); I != E; ++I)
+      dbgs() << (*I) << ",";
+    dbgs() << "\n";
   }
 }
 
 //===----------------------------------------------------------------------===//
-// GCOVLines implementation.
-
-/// collectLineCounts - Collect line counts. This must be used after
-/// reading .gcno and .gcda files.
-void GCOVLines::collectLineCounts(FileInfo &FI, StringRef Filename, 
-                                  uint32_t Count) {
-  for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
-         E = Lines.end(); I != E; ++I)
-    FI.addLineCount(Filename, *I, Count);
-}
-
-/// dump - Dump GCOVLines content on standard out for debugging purposes.
-void GCOVLines::dump() {
-  for (SmallVector<uint32_t, 16>::iterator I = Lines.begin(),
-         E = Lines.end(); I != E; ++I)
-    outs() << (*I) << ",";
-}
-
-//===----------------------------------------------------------------------===//
 // FileInfo implementation.
 
-/// addLineCount - Add line count for the given line number in a file.
-void FileInfo::addLineCount(StringRef Filename, uint32_t Line, uint32_t Count) {
-  if (LineInfo.find(Filename) == LineInfo.end()) {
-    OwningPtr<MemoryBuffer> Buff;
-    if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
-      errs() << Filename << ": " << ec.message() << "\n";
-      return;
-    }
-    StringRef AllLines = Buff.take()->getBuffer();
-    LineCounts L(AllLines.count('\n')+2);
-    L[Line-1] = Count;
-    LineInfo[Filename] = L;
-    return;
-  }
-  LineCounts &L = LineInfo[Filename];
-  L[Line-1] = Count;
-}
-
 /// print -  Print source files with collected line count information.
-void FileInfo::print() {
+void FileInfo::print(raw_fd_ostream &OS, StringRef gcnoFile,
+                     StringRef gcdaFile) {
   for (StringMap<LineCounts>::iterator I = LineInfo.begin(), E = LineInfo.end();
        I != E; ++I) {
     StringRef Filename = I->first();
-    outs() << Filename << "\n";
+    OS << "        -:    0:Source:" << Filename << "\n";
+    OS << "        -:    0:Graph:" << gcnoFile << "\n";
+    OS << "        -:    0:Data:" << gcdaFile << "\n";
+    OS << "        -:    0:Runs:" << RunCount << "\n";
+    OS << "        -:    0:Programs:" << ProgramCount << "\n";
     LineCounts &L = LineInfo[Filename];
     OwningPtr<MemoryBuffer> Buff;
     if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, Buff)) {
       errs() << Filename << ": " << ec.message() << "\n";
       return;
     }
-    StringRef AllLines = Buff.take()->getBuffer();
-    for (unsigned i = 0, e = L.size(); i != e; ++i) {
-      if (L[i])
-        outs() << L[i] << ":\t";
-      else
-        outs() << " :\t";
+    StringRef AllLines = Buff->getBuffer();
+    uint32_t i = 0;
+    while (!AllLines.empty()) {
+      if (L.find(i) != L.end()) {
+        if (L[i] == 0)
+          OS << "    #####:";
+        else
+          OS << format("%9" PRIu64 ":", L[i]);
+      } else {
+        OS << "        -:";
+      }
       std::pair<StringRef, StringRef> P = AllLines.split('\n');
       if (AllLines != P.first)
-        outs() << P.first;
-      outs() << "\n";
+        OS << format("%5u:", i+1) << P.first;
+      OS << "\n";
       AllLines = P.second;
+      ++i;
     }
   }
 }
-
-
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 6d547f3..da3b02a 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -229,14 +229,14 @@ void GlobalAlias::setAliasee(Constant *Aliasee) {
   setOperand(0, Aliasee);
 }
 
-const GlobalValue *GlobalAlias::getAliasedGlobal() const {
-  const Constant *C = getAliasee();
+GlobalValue *GlobalAlias::getAliasedGlobal() {
+  Constant *C = getAliasee();
   if (C == 0) return 0;
   
-  if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(C))
     return GV;
 
-  const ConstantExpr *CE = cast<ConstantExpr>(C);
+  ConstantExpr *CE = cast<ConstantExpr>(C);
   assert((CE->getOpcode() == Instruction::BitCast || 
           CE->getOpcode() == Instruction::GetElementPtr) &&
          "Unsupported aliasee");
@@ -244,18 +244,18 @@ const GlobalValue *GlobalAlias::getAliasedGlobal() const {
   return cast<GlobalValue>(CE->getOperand(0));
 }
 
-const GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) const {
-  SmallPtrSet<const GlobalValue*, 3> Visited;
+GlobalValue *GlobalAlias::resolveAliasedGlobal(bool stopOnWeak) {
+  SmallPtrSet<GlobalValue*, 3> Visited;
 
   // Check if we need to stop early.
   if (stopOnWeak && mayBeOverridden())
     return this;
 
-  const GlobalValue *GV = getAliasedGlobal();
+  GlobalValue *GV = getAliasedGlobal();
   Visited.insert(GV);
 
   // Iterate over aliasing chain, stopping on weak alias if necessary.
-  while (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
+  while (GlobalAlias *GA = dyn_cast<GlobalAlias>(GV)) {
     if (stopOnWeak && GA->mayBeOverridden())
       break;
 
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 2b5a0b3..a7773c4 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -223,18 +223,19 @@ const char *Instruction::getOpcodeName(unsigned OpCode) {
   case GetElementPtr: return "getelementptr";
 
   // Convert instructions...
-  case Trunc:     return "trunc";
-  case ZExt:      return "zext";
-  case SExt:      return "sext";
-  case FPTrunc:   return "fptrunc";
-  case FPExt:     return "fpext";
-  case FPToUI:    return "fptoui";
-  case FPToSI:    return "fptosi";
-  case UIToFP:    return "uitofp";
-  case SIToFP:    return "sitofp";
-  case IntToPtr:  return "inttoptr";
-  case PtrToInt:  return "ptrtoint";
-  case BitCast:   return "bitcast";
+  case Trunc:         return "trunc";
+  case ZExt:          return "zext";
+  case SExt:          return "sext";
+  case FPTrunc:       return "fptrunc";
+  case FPExt:         return "fpext";
+  case FPToUI:        return "fptoui";
+  case FPToSI:        return "fptosi";
+  case UIToFP:        return "uitofp";
+  case SIToFP:        return "sitofp";
+  case IntToPtr:      return "inttoptr";
+  case PtrToInt:      return "ptrtoint";
+  case BitCast:       return "bitcast";
+  case AddrSpaceCast: return "addrspacecast";
 
   // Other instructions...
   case ICmp:           return "icmp";
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index d58877e..8a6b77b 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -346,7 +346,7 @@ void CallInst::removeAttribute(unsigned i, Attribute attr) {
   setAttributes(PAL);
 }
 
-bool CallInst::hasFnAttr(Attribute::AttrKind A) const {
+bool CallInst::hasFnAttrImpl(Attribute::AttrKind A) const {
   if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
     return true;
   if (const Function *F = getCalledFunction())
@@ -574,7 +574,7 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
   return setSuccessor(idx, B);
 }
 
-bool InvokeInst::hasFnAttr(Attribute::AttrKind A) const {
+bool InvokeInst::hasFnAttrImpl(Attribute::AttrKind A) const {
   if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, A))
     return true;
   if (const Function *F = getCalledFunction())
@@ -2095,7 +2095,9 @@ bool CastInst::isNoopCast(Instruction::CastOps Opcode,
     case Instruction::SIToFP:
     case Instruction::FPToUI:
     case Instruction::FPToSI:
-      return false; // These always modify bits
+    case Instruction::AddrSpaceCast:
+      // TODO: Target informations may give a more accurate answer here.
+      return false;
     case Instruction::BitCast:
       return true;  // BitCast never modifies bits.
     case Instruction::PtrToInt:
@@ -2137,44 +2139,46 @@ unsigned CastInst::isEliminableCastPair(
   // ZEXT          <       Integral   Unsigned     Integer      Any
   // SEXT          <       Integral    Signed      Integer      Any
   // FPTOUI       n/a      FloatPt      n/a        Integral   Unsigned
-  // FPTOSI       n/a      FloatPt      n/a        Integral    Signed 
-  // UITOFP       n/a      Integral   Unsigned     FloatPt      n/a   
-  // SITOFP       n/a      Integral    Signed      FloatPt      n/a   
-  // FPTRUNC       >       FloatPt      n/a        FloatPt      n/a   
-  // FPEXT         <       FloatPt      n/a        FloatPt      n/a   
+  // FPTOSI       n/a      FloatPt      n/a        Integral    Signed
+  // UITOFP       n/a      Integral   Unsigned     FloatPt      n/a
+  // SITOFP       n/a      Integral    Signed      FloatPt      n/a
+  // FPTRUNC       >       FloatPt      n/a        FloatPt      n/a
+  // FPEXT         <       FloatPt      n/a        FloatPt      n/a
   // PTRTOINT     n/a      Pointer      n/a        Integral   Unsigned
   // INTTOPTR     n/a      Integral   Unsigned     Pointer      n/a
-  // BITCAST       =       FirstClass   n/a       FirstClass    n/a   
+  // BITCAST       =       FirstClass   n/a       FirstClass    n/a
+  // ADDRSPCST    n/a      Pointer      n/a        Pointer      n/a
   //
   // NOTE: some transforms are safe, but we consider them to be non-profitable.
   // For example, we could merge "fptoui double to i32" + "zext i32 to i64",
   // into "fptoui double to i64", but this loses information about the range
-  // of the produced value (we no longer know the top-part is all zeros). 
+  // of the produced value (we no longer know the top-part is all zeros).
   // Further this conversion is often much more expensive for typical hardware,
-  // and causes issues when building libgcc.  We disallow fptosi+sext for the 
+  // and causes issues when building libgcc.  We disallow fptosi+sext for the
   // same reason.
-  const unsigned numCastOps = 
+  const unsigned numCastOps =
     Instruction::CastOpsEnd - Instruction::CastOpsBegin;
   static const uint8_t CastResults[numCastOps][numCastOps] = {
-    // T        F  F  U  S  F  F  P  I  B   -+
-    // R  Z  S  P  P  I  I  T  P  2  N  T    |
-    // U  E  E  2  2  2  2  R  E  I  T  C    +- secondOp
-    // N  X  X  U  S  F  F  N  X  N  2  V    |
-    // C  T  T  I  I  P  P  C  T  T  P  T   -+
-    {  1, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // Trunc      -+
-    {  8, 1, 9,99,99, 2, 0,99,99,99, 2, 3 }, // ZExt        |
-    {  8, 0, 1,99,99, 0, 2,99,99,99, 0, 3 }, // SExt        |
-    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToUI      |
-    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3 }, // FPToSI      |
-    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // UIToFP      +- firstOp
-    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4 }, // SIToFP      |
-    { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4 }, // FPTrunc     |
-    { 99,99,99, 2, 2,99,99,10, 2,99,99, 4 }, // FPExt       |
-    {  1, 0, 0,99,99, 0, 0,99,99,99, 7, 3 }, // PtrToInt    |
-    { 99,99,99,99,99,99,99,99,99,13,99,12 }, // IntToPtr    |
-    {  5, 5, 5, 6, 6, 5, 5, 6, 6,11, 5, 1 }, // BitCast    -+
+    // T        F  F  U  S  F  F  P  I  B  A  -+
+    // R  Z  S  P  P  I  I  T  P  2  N  T  S   |
+    // U  E  E  2  2  2  2  R  E  I  T  C  C   +- secondOp
+    // N  X  X  U  S  F  F  N  X  N  2  V  V   |
+    // C  T  T  I  I  P  P  C  T  T  P  T  T  -+
+    {  1, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // Trunc         -+
+    {  8, 1, 9,99,99, 2, 0,99,99,99, 2, 3, 0}, // ZExt           |
+    {  8, 0, 1,99,99, 0, 2,99,99,99, 0, 3, 0}, // SExt           |
+    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToUI         |
+    {  0, 0, 0,99,99, 0, 0,99,99,99, 0, 3, 0}, // FPToSI         |
+    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // UIToFP         +- firstOp
+    { 99,99,99, 0, 0,99,99, 0, 0,99,99, 4, 0}, // SIToFP         |
+    { 99,99,99, 0, 0,99,99, 1, 0,99,99, 4, 0}, // FPTrunc        |
+    { 99,99,99, 2, 2,99,99,10, 2,99,99, 4, 0}, // FPExt          |
+    {  1, 0, 0,99,99, 0, 0,99,99,99, 7, 3, 0}, // PtrToInt       |
+    { 99,99,99,99,99,99,99,99,99,11,99,15, 0}, // IntToPtr       |
+    {  5, 5, 5, 6, 6, 5, 5, 6, 6,16, 5, 1,14}, // BitCast        |
+    {  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,13,12}, // AddrSpaceCast -+
   };
-  
+
   // If either of the casts are a bitcast from scalar to vector, disallow the
   // merging. However, bitcast of A->B->A are allowed.
   bool isFirstBitcast  = (firstOp == Instruction::BitCast);
@@ -2191,45 +2195,56 @@ unsigned CastInst::isEliminableCastPair(
                             [secondOp-Instruction::CastOpsBegin];
   switch (ElimCase) {
     case 0: 
-      // categorically disallowed
+      // Categorically disallowed.
       return 0;
     case 1: 
-      // allowed, use first cast's opcode
+      // Allowed, use first cast's opcode.
       return firstOp;
     case 2: 
-      // allowed, use second cast's opcode
+      // Allowed, use second cast's opcode.
       return secondOp;
     case 3: 
-      // no-op cast in second op implies firstOp as long as the DestTy 
+      // No-op cast in second op implies firstOp as long as the DestTy
       // is integer and we are not converting between a vector and a
       // non vector type.
       if (!SrcTy->isVectorTy() && DstTy->isIntegerTy())
         return firstOp;
       return 0;
     case 4:
-      // no-op cast in second op implies firstOp as long as the DestTy
+      // No-op cast in second op implies firstOp as long as the DestTy
       // is floating point.
       if (DstTy->isFloatingPointTy())
         return firstOp;
       return 0;
     case 5: 
-      // no-op cast in first op implies secondOp as long as the SrcTy
+      // No-op cast in first op implies secondOp as long as the SrcTy
       // is an integer.
       if (SrcTy->isIntegerTy())
         return secondOp;
       return 0;
     case 6:
-      // no-op cast in first op implies secondOp as long as the SrcTy
+      // No-op cast in first op implies secondOp as long as the SrcTy
       // is a floating point.
       if (SrcTy->isFloatingPointTy())
         return secondOp;
       return 0;
-    case 7: { 
-      // ptrtoint, inttoptr -> bitcast (ptr -> ptr) if int size is >= ptr size
+    case 7: {
+      // Cannot simplify if address spaces are different!
+      if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
+        return 0;
+
+      unsigned MidSize = MidTy->getScalarSizeInBits();
+      // We can still fold this without knowing the actual sizes as long we
+      // know that the intermediate pointer is the largest possible
+      // pointer size.
+      // FIXME: Is this always true?
+      if (MidSize == 64)
+        return Instruction::BitCast;
+
+      // ptrtoint, inttoptr -> bitcast (ptr -> ptr) if int size is >= ptr size.
       if (!SrcIntPtrTy || DstIntPtrTy != SrcIntPtrTy)
         return 0;
       unsigned PtrSize = SrcIntPtrTy->getScalarSizeInBits();
-      unsigned MidSize = MidTy->getScalarSizeInBits();
       if (MidSize >= PtrSize)
         return Instruction::BitCast;
       return 0;
@@ -2246,7 +2261,8 @@ unsigned CastInst::isEliminableCastPair(
         return firstOp;
       return secondOp;
     }
-    case 9: // zext, sext -> zext, because sext can't sign extend after zext
+    case 9:
+      // zext, sext -> zext, because sext can't sign extend after zext
       return Instruction::ZExt;
     case 10:
       // fpext followed by ftrunc is allowed if the bit size returned to is
@@ -2254,18 +2270,7 @@ unsigned CastInst::isEliminableCastPair(
       if (SrcTy == DstTy)
         return Instruction::BitCast;
       return 0; // If the types are not the same we can't eliminate it.
-    case 11:
-      // bitcast followed by ptrtoint is allowed as long as the bitcast
-      // is a pointer to pointer cast.
-      if (SrcTy->isPointerTy() && MidTy->isPointerTy())
-        return secondOp;
-      return 0;
-    case 12:
-      // inttoptr, bitcast -> intptr  if bitcast is a ptr to ptr cast
-      if (MidTy->isPointerTy() && DstTy->isPointerTy())
-        return firstOp;
-      return 0;
-    case 13: {
+    case 11: {
       // inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
       if (!MidIntPtrTy)
         return 0;
@@ -2276,8 +2281,65 @@ unsigned CastInst::isEliminableCastPair(
         return Instruction::BitCast;
       return 0;
     }
+    case 12: {
+      // addrspacecast, addrspacecast -> bitcast,       if SrcAS == DstAS
+      // addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS
+      if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace())
+        return Instruction::AddrSpaceCast;
+      return Instruction::BitCast;
+    }
+    case 13:
+      // FIXME: this state can be merged with (1), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isPtrOrPtrVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isPtrOrPtrVectorTy() &&
+        SrcTy->getPointerAddressSpace() != MidTy->getPointerAddressSpace() &&
+        MidTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+        "Illegal addrspacecast, bitcast sequence!");
+      // Allowed, use first cast's opcode
+      return firstOp;
+    case 14:
+      // FIXME: this state can be merged with (2), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isPtrOrPtrVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isPtrOrPtrVectorTy() &&
+        SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
+        MidTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
+        "Illegal bitcast, addrspacecast sequence!");
+      // Allowed, use second cast's opcode
+      return secondOp;
+    case 15:
+      // FIXME: this state can be merged with (1), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isIntOrIntVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isPtrOrPtrVectorTy() &&
+        MidTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+        "Illegal inttoptr, bitcast sequence!");
+      // Allowed, use first cast's opcode
+      return firstOp;
+    case 16:
+      // FIXME: this state can be merged with (2), but the following assert
+      // is useful to check the correcteness of the sequence due to semantic
+      // change of bitcast.
+      assert(
+        SrcTy->isPtrOrPtrVectorTy() &&
+        MidTy->isPtrOrPtrVectorTy() &&
+        DstTy->isIntOrIntVectorTy() &&
+        SrcTy->getPointerAddressSpace() == MidTy->getPointerAddressSpace() &&
+        "Illegal bitcast, ptrtoint sequence!");
+      // Allowed, use second cast's opcode
+      return secondOp;
     case 99: 
-      // cast combination can't happen (error in input). This is for all cases
+      // Cast combination can't happen (error in input). This is for all cases
       // where the MidTy is not the same for the two cast instructions.
       llvm_unreachable("Invalid Cast Combination");
     default:
@@ -2290,19 +2352,20 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
   assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
-    case Trunc:    return new TruncInst    (S, Ty, Name, InsertBefore);
-    case ZExt:     return new ZExtInst     (S, Ty, Name, InsertBefore);
-    case SExt:     return new SExtInst     (S, Ty, Name, InsertBefore);
-    case FPTrunc:  return new FPTruncInst  (S, Ty, Name, InsertBefore);
-    case FPExt:    return new FPExtInst    (S, Ty, Name, InsertBefore);
-    case UIToFP:   return new UIToFPInst   (S, Ty, Name, InsertBefore);
-    case SIToFP:   return new SIToFPInst   (S, Ty, Name, InsertBefore);
-    case FPToUI:   return new FPToUIInst   (S, Ty, Name, InsertBefore);
-    case FPToSI:   return new FPToSIInst   (S, Ty, Name, InsertBefore);
-    case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertBefore);
-    case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertBefore);
-    case BitCast:  return new BitCastInst  (S, Ty, Name, InsertBefore);
-    default: llvm_unreachable("Invalid opcode provided");
+  case Trunc:         return new TruncInst         (S, Ty, Name, InsertBefore);
+  case ZExt:          return new ZExtInst          (S, Ty, Name, InsertBefore);
+  case SExt:          return new SExtInst          (S, Ty, Name, InsertBefore);
+  case FPTrunc:       return new FPTruncInst       (S, Ty, Name, InsertBefore);
+  case FPExt:         return new FPExtInst         (S, Ty, Name, InsertBefore);
+  case UIToFP:        return new UIToFPInst        (S, Ty, Name, InsertBefore);
+  case SIToFP:        return new SIToFPInst        (S, Ty, Name, InsertBefore);
+  case FPToUI:        return new FPToUIInst        (S, Ty, Name, InsertBefore);
+  case FPToSI:        return new FPToSIInst        (S, Ty, Name, InsertBefore);
+  case PtrToInt:      return new PtrToIntInst      (S, Ty, Name, InsertBefore);
+  case IntToPtr:      return new IntToPtrInst      (S, Ty, Name, InsertBefore);
+  case BitCast:       return new BitCastInst       (S, Ty, Name, InsertBefore);
+  case AddrSpaceCast: return new AddrSpaceCastInst (S, Ty, Name, InsertBefore);
+  default: llvm_unreachable("Invalid opcode provided");
   }
 }
 
@@ -2311,19 +2374,20 @@ CastInst *CastInst::Create(Instruction::CastOps op, Value *S, Type *Ty,
   assert(castIsValid(op, S, Ty) && "Invalid cast!");
   // Construct and return the appropriate CastInst subclass
   switch (op) {
-    case Trunc:    return new TruncInst    (S, Ty, Name, InsertAtEnd);
-    case ZExt:     return new ZExtInst     (S, Ty, Name, InsertAtEnd);
-    case SExt:     return new SExtInst     (S, Ty, Name, InsertAtEnd);
-    case FPTrunc:  return new FPTruncInst  (S, Ty, Name, InsertAtEnd);
-    case FPExt:    return new FPExtInst    (S, Ty, Name, InsertAtEnd);
-    case UIToFP:   return new UIToFPInst   (S, Ty, Name, InsertAtEnd);
-    case SIToFP:   return new SIToFPInst   (S, Ty, Name, InsertAtEnd);
-    case FPToUI:   return new FPToUIInst   (S, Ty, Name, InsertAtEnd);
-    case FPToSI:   return new FPToSIInst   (S, Ty, Name, InsertAtEnd);
-    case PtrToInt: return new PtrToIntInst (S, Ty, Name, InsertAtEnd);
-    case IntToPtr: return new IntToPtrInst (S, Ty, Name, InsertAtEnd);
-    case BitCast:  return new BitCastInst  (S, Ty, Name, InsertAtEnd);
-    default: llvm_unreachable("Invalid opcode provided");
+  case Trunc:         return new TruncInst         (S, Ty, Name, InsertAtEnd);
+  case ZExt:          return new ZExtInst          (S, Ty, Name, InsertAtEnd);
+  case SExt:          return new SExtInst          (S, Ty, Name, InsertAtEnd);
+  case FPTrunc:       return new FPTruncInst       (S, Ty, Name, InsertAtEnd);
+  case FPExt:         return new FPExtInst         (S, Ty, Name, InsertAtEnd);
+  case UIToFP:        return new UIToFPInst        (S, Ty, Name, InsertAtEnd);
+  case SIToFP:        return new SIToFPInst        (S, Ty, Name, InsertAtEnd);
+  case FPToUI:        return new FPToUIInst        (S, Ty, Name, InsertAtEnd);
+  case FPToSI:        return new FPToSIInst        (S, Ty, Name, InsertAtEnd);
+  case PtrToInt:      return new PtrToIntInst      (S, Ty, Name, InsertAtEnd);
+  case IntToPtr:      return new IntToPtrInst      (S, Ty, Name, InsertAtEnd);
+  case BitCast:       return new BitCastInst       (S, Ty, Name, InsertAtEnd);
+  case AddrSpaceCast: return new AddrSpaceCastInst (S, Ty, Name, InsertAtEnd);
+  default: llvm_unreachable("Invalid opcode provided");
   }
 }
 
@@ -2378,25 +2442,43 @@ CastInst *CastInst::CreateTruncOrBitCast(Value *S, Type *Ty,
 CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
                                       const Twine &Name,
                                       BasicBlock *InsertAtEnd) {
-  assert(S->getType()->isPointerTy() && "Invalid cast");
-  assert((Ty->isIntegerTy() || Ty->isPointerTy()) &&
+  assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
+  assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) &&
+         "Invalid cast");
+  assert(Ty->isVectorTy() == S->getType()->isVectorTy() && "Invalid cast");
+  assert((!Ty->isVectorTy() ||
+          Ty->getVectorNumElements() == S->getType()->getVectorNumElements()) &&
          "Invalid cast");
 
-  if (Ty->isIntegerTy())
+  if (Ty->isIntOrIntVectorTy())
     return Create(Instruction::PtrToInt, S, Ty, Name, InsertAtEnd);
+
+  Type *STy = S->getType();
+  if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+    return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertAtEnd);
+
   return Create(Instruction::BitCast, S, Ty, Name, InsertAtEnd);
 }
 
 /// @brief Create a BitCast or a PtrToInt cast instruction
-CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty, 
-                                      const Twine &Name, 
+CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
+                                      const Twine &Name,
                                       Instruction *InsertBefore) {
   assert(S->getType()->isPtrOrPtrVectorTy() && "Invalid cast");
   assert((Ty->isIntOrIntVectorTy() || Ty->isPtrOrPtrVectorTy()) &&
          "Invalid cast");
+  assert(Ty->isVectorTy() == S->getType()->isVectorTy() && "Invalid cast");
+  assert((!Ty->isVectorTy() ||
+          Ty->getVectorNumElements() == S->getType()->getVectorNumElements()) &&
+         "Invalid cast");
 
   if (Ty->isIntOrIntVectorTy())
     return Create(Instruction::PtrToInt, S, Ty, Name, InsertBefore);
+
+  Type *STy = S->getType();
+  if (STy->getPointerAddressSpace() != Ty->getPointerAddressSpace())
+    return Create(Instruction::AddrSpaceCast, S, Ty, Name, InsertBefore);
+
   return Create(Instruction::BitCast, S, Ty, Name, InsertBefore);
 }
 
@@ -2517,8 +2599,48 @@ bool CastInst::isCastable(Type *SrcTy, Type *DestTy) {
   }
 }
 
-// Provide a way to get a "cast" where the cast opcode is inferred from the 
-// types and size of the operand. This, basically, is a parallel of the 
+bool CastInst::isBitCastable(Type *SrcTy, Type *DestTy) {
+  if (!SrcTy->isFirstClassType() || !DestTy->isFirstClassType())
+    return false;
+
+  if (SrcTy == DestTy)
+    return true;
+
+  if (VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy)) {
+    if (VectorType *DestVecTy = dyn_cast<VectorType>(DestTy)) {
+      if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
+        // An element by element cast. Valid if casting the elements is valid.
+        SrcTy = SrcVecTy->getElementType();
+        DestTy = DestVecTy->getElementType();
+      }
+    }
+  }
+
+  if (PointerType *DestPtrTy = dyn_cast<PointerType>(DestTy)) {
+    if (PointerType *SrcPtrTy = dyn_cast<PointerType>(SrcTy)) {
+      return SrcPtrTy->getAddressSpace() == DestPtrTy->getAddressSpace();
+    }
+  }
+
+  unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();   // 0 for ptr
+  unsigned DestBits = DestTy->getPrimitiveSizeInBits(); // 0 for ptr
+
+  // Could still have vectors of pointers if the number of elements doesn't
+  // match
+  if (SrcBits == 0 || DestBits == 0)
+    return false;
+
+  if (SrcBits != DestBits)
+    return false;
+
+  if (DestTy->isX86_MMXTy() || SrcTy->isX86_MMXTy())
+    return false;
+
+  return true;
+}
+
+// Provide a way to get a "cast" where the cast opcode is inferred from the
+// types and size of the operand. This, basically, is a parallel of the
 // logic in the castIsValid function below.  This axiom should hold:
 //   castIsValid( getCastOpcode(Val, Ty), Val, Ty)
 // should not assert in castIsValid. In other words, this produces a "correct"
@@ -2535,6 +2657,7 @@ CastInst::getCastOpcode(
   if (SrcTy == DestTy)
     return BitCast;
 
+  // FIXME: Check address space sizes here
   if (VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy))
     if (VectorType *DestVecTy = dyn_cast<VectorType>(DestTy))
       if (SrcVecTy->getNumElements() == DestVecTy->getNumElements()) {
@@ -2601,6 +2724,8 @@ CastInst::getCastOpcode(
     return BitCast;
   } else if (DestTy->isPointerTy()) {
     if (SrcTy->isPointerTy()) {
+      if (DestTy->getPointerAddressSpace() != SrcTy->getPointerAddressSpace())
+        return AddrSpaceCast;
       return BitCast;                               // ptr -> ptr
     } else if (SrcTy->isIntegerTy()) {
       return IntToPtr;                              // int -> ptr
@@ -2695,13 +2820,27 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
   case Instruction::BitCast:
     // BitCast implies a no-op cast of type only. No bits change.
     // However, you can't cast pointers to anything but pointers.
-    if (SrcTy->isPointerTy() != DstTy->isPointerTy())
+    if (SrcTy->isPtrOrPtrVectorTy() != DstTy->isPtrOrPtrVectorTy())
       return false;
 
-    // Now we know we're not dealing with a pointer/non-pointer mismatch. In all
-    // these cases, the cast is okay if the source and destination bit widths
-    // are identical.
-    return SrcTy->getPrimitiveSizeInBits() == DstTy->getPrimitiveSizeInBits();
+    // For non pointer cases, the cast is okay if the source and destination bit
+    // widths are identical.
+    if (!SrcTy->isPtrOrPtrVectorTy())
+      return SrcTy->getPrimitiveSizeInBits() == DstTy->getPrimitiveSizeInBits();
+
+    // If both are pointers then the address spaces must match and vector of
+    // pointers must have the same number of elements.
+    return SrcTy->getPointerAddressSpace() == DstTy->getPointerAddressSpace() &&
+           SrcTy->isVectorTy() == DstTy->isVectorTy() &&
+           (!SrcTy->isVectorTy() ||
+            SrcTy->getVectorNumElements() == SrcTy->getVectorNumElements());
+
+  case Instruction::AddrSpaceCast:
+    return SrcTy->isPtrOrPtrVectorTy() && DstTy->isPtrOrPtrVectorTy() &&
+           SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace() &&
+           SrcTy->isVectorTy() == DstTy->isVectorTy() &&
+           (!SrcTy->isVectorTy() ||
+            SrcTy->getVectorNumElements() == SrcTy->getVectorNumElements());
   }
 }
 
@@ -2848,6 +2987,18 @@ BitCastInst::BitCastInst(
   assert(castIsValid(getOpcode(), S, Ty) && "Illegal BitCast");
 }
 
+AddrSpaceCastInst::AddrSpaceCastInst(
+  Value *S, Type *Ty, const Twine &Name, Instruction *InsertBefore
+) : CastInst(Ty, AddrSpaceCast, S, Name, InsertBefore) {
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal AddrSpaceCast");
+}
+
+AddrSpaceCastInst::AddrSpaceCastInst(
+  Value *S, Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd
+) : CastInst(Ty, AddrSpaceCast, S, Name, InsertAtEnd) {
+  assert(castIsValid(getOpcode(), S, Ty) && "Illegal AddrSpaceCast");
+}
+
 //===----------------------------------------------------------------------===//
 //                               CmpInst Classes
 //===----------------------------------------------------------------------===//
@@ -3180,7 +3331,6 @@ SwitchInst::SwitchInst(const SwitchInst &SI)
     OL[i] = InOL[i];
     OL[i+1] = InOL[i+1];
   }
-  TheSubsets = SI.TheSubsets;
   SubclassOptionalData = SI.SubclassOptionalData;
 }
 
@@ -3192,16 +3342,6 @@ SwitchInst::~SwitchInst() {
 /// addCase - Add an entry to the switch instruction...
 ///
 void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
-  IntegersSubsetToBB Mapping;
-  
-  // FIXME: Currently we work with ConstantInt based cases.
-  // So inititalize IntItem container directly from ConstantInt.
-  Mapping.add(IntItem::fromConstantInt(OnVal));
-  IntegersSubset CaseRanges = Mapping.getCase();
-  addCase(CaseRanges, Dest);
-}
-
-void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
   unsigned NewCaseIdx = getNumCases(); 
   unsigned OpNo = NumOperands;
   if (OpNo+2 > ReservedSpace)
@@ -3209,17 +3349,14 @@ void SwitchInst::addCase(IntegersSubset& OnVal, BasicBlock *Dest) {
   // Initialize some new operands.
   assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
   NumOperands = OpNo+2;
-
-  SubsetsIt TheSubsetsIt = TheSubsets.insert(TheSubsets.end(), OnVal);
-  
-  CaseIt Case(this, NewCaseIdx, TheSubsetsIt);
-  Case.updateCaseValueOperand(OnVal);
+  CaseIt Case(this, NewCaseIdx);
+  Case.setValue(OnVal);
   Case.setSuccessor(Dest);
 }
 
 /// removeCase - This method removes the specified case and its successor
 /// from the switch instruction.
-void SwitchInst::removeCase(CaseIt& i) {
+void SwitchInst::removeCase(CaseIt i) {
   unsigned idx = i.getCaseIndex();
   
   assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
@@ -3236,16 +3373,6 @@ void SwitchInst::removeCase(CaseIt& i) {
   // Nuke the last value.
   OL[NumOps-2].set(0);
   OL[NumOps-2+1].set(0);
-
-  // Do the same with TheCases collection:
-  if (i.SubsetIt != --TheSubsets.end()) {
-    *i.SubsetIt = TheSubsets.back();
-    TheSubsets.pop_back();
-  } else {
-    TheSubsets.pop_back();
-    i.SubsetIt = TheSubsets.end();
-  }
-  
   NumOperands = NumOps-2;
 }
 
@@ -3490,6 +3617,10 @@ BitCastInst *BitCastInst::clone_impl() const {
   return new BitCastInst(getOperand(0), getType());
 }
 
+AddrSpaceCastInst *AddrSpaceCastInst::clone_impl() const {
+  return new AddrSpaceCastInst(getOperand(0), getType());
+}
+
 CallInst *CallInst::clone_impl() const {
   return  new(getNumOperands()) CallInst(*this);
 }
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 0c659b8..407b985 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -355,6 +355,11 @@ public:
   typedef DenseMap<const Function*, unsigned> IntrinsicIDCacheTy;
   IntrinsicIDCacheTy IntrinsicIDCache;
 
+  /// \brief Mapping from a function to its prefix data, which is stored as the
+  /// operand of an unparented ReturnInst so that the prefix data has a Use.
+  typedef DenseMap<const Function *, ReturnInst *> PrefixDataMapTy;
+  PrefixDataMapTy PrefixDataMap;
+
   int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx);
   int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx);
   
diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp
new file mode 100644
index 0000000..a431d82
--- /dev/null
+++ b/lib/IR/LegacyPassManager.cpp
@@ -0,0 +1,1920 @@
+//===- LegacyPassManager.cpp - LLVM Pass Infrastructure Implementation ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the legacy LLVM Pass Manager infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Assembly/PrintModulePass.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/LegacyPassManagers.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/PassNameParser.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+using namespace llvm::legacy;
+
+// See PassManagers.h for Pass Manager infrastructure overview.
+
+//===----------------------------------------------------------------------===//
+// Pass debugging information.  Often it is useful to find out what pass is
+// running when a crash occurs in a utility.  When this library is compiled with
+// debugging on, a command line option (--debug-pass) is enabled that causes the
+// pass name to be printed before it executes.
+//
+
+namespace {
+// Different debug levels that can be enabled...
+enum PassDebugLevel {
+  Disabled, Arguments, Structure, Executions, Details
+};
+}
+
+static cl::opt<enum PassDebugLevel>
+PassDebugging("debug-pass", cl::Hidden,
+                  cl::desc("Print PassManager debugging information"),
+                  cl::values(
+  clEnumVal(Disabled  , "disable debug output"),
+  clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
+  clEnumVal(Structure , "print pass structure before run()"),
+  clEnumVal(Executions, "print pass name before it is executed"),
+  clEnumVal(Details   , "print pass details when it is executed"),
+                             clEnumValEnd));
+
+namespace {
+typedef llvm::cl::list<const llvm::PassInfo *, bool, PassNameParser>
+PassOptionList;
+}
+
+// Print IR out before/after specified passes.
+static PassOptionList
+PrintBefore("print-before",
+            llvm::cl::desc("Print IR before specified passes"),
+            cl::Hidden);
+
+static PassOptionList
+PrintAfter("print-after",
+           llvm::cl::desc("Print IR after specified passes"),
+           cl::Hidden);
+
+static cl::opt<bool>
+PrintBeforeAll("print-before-all",
+               llvm::cl::desc("Print IR before each pass"),
+               cl::init(false));
+static cl::opt<bool>
+PrintAfterAll("print-after-all",
+              llvm::cl::desc("Print IR after each pass"),
+              cl::init(false));
+
+/// This is a helper to determine whether to print IR before or
+/// after a pass.
+
+static bool ShouldPrintBeforeOrAfterPass(const PassInfo *PI,
+                                         PassOptionList &PassesToPrint) {
+  for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) {
+    const llvm::PassInfo *PassInf = PassesToPrint[i];
+    if (PassInf)
+      if (PassInf->getPassArgument() == PI->getPassArgument()) {
+        return true;
+      }
+  }
+  return false;
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// before it.
+static bool ShouldPrintBeforePass(const PassInfo *PI) {
+  return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PI, PrintBefore);
+}
+
+/// This is a utility to check whether a pass should have IR dumped
+/// after it.
+static bool ShouldPrintAfterPass(const PassInfo *PI) {
+  return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter);
+}
+
+/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
+/// or higher is specified.
+bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
+  return PassDebugging >= Executions;
+}
+
+
+
+
+void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
+  if (V == 0 && M == 0)
+    OS << "Releasing pass '";
+  else
+    OS << "Running pass '";
+
+  OS << P->getPassName() << "'";
+
+  if (M) {
+    OS << " on module '" << M->getModuleIdentifier() << "'.\n";
+    return;
+  }
+  if (V == 0) {
+    OS << '\n';
+    return;
+  }
+
+  OS << " on ";
+  if (isa<Function>(V))
+    OS << "function";
+  else if (isa<BasicBlock>(V))
+    OS << "basic block";
+  else
+    OS << "value";
+
+  OS << " '";
+  WriteAsOperand(OS, V, /*PrintTy=*/false, M);
+  OS << "'\n";
+}
+
+
+namespace {
+//===----------------------------------------------------------------------===//
+// BBPassManager
+//
+/// BBPassManager manages BasicBlockPass. It batches all the
+/// pass together and sequence them to process one basic block before
+/// processing next basic block.
+class BBPassManager : public PMDataManager, public FunctionPass {
+
+public:
+  static char ID;
+  explicit BBPassManager()
+    : PMDataManager(), FunctionPass(ID) {}
+
+  /// Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the function, and if so, return true.
+  bool runOnFunction(Function &F);
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  bool doInitialization(Module &M);
+  bool doInitialization(Function &F);
+  bool doFinalization(Module &M);
+  bool doFinalization(Function &F);
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  virtual const char *getPassName() const {
+    return "BasicBlock Pass Manager";
+  }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset) {
+    llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      BasicBlockPass *BP = getContainedPass(Index);
+      BP->dumpPassStructure(Offset + 1);
+      dumpLastUses(BP, Offset+1);
+    }
+  }
+
+  BasicBlockPass *getContainedPass(unsigned N) {
+    assert(N < PassVector.size() && "Pass number out of range!");
+    BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
+    return BP;
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_BasicBlockPassManager;
+  }
+};
+
+char BBPassManager::ID = 0;
+} // End anonymous namespace
+
+namespace llvm {
+namespace legacy {
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl
+//
+/// FunctionPassManagerImpl manages FPPassManagers
+class FunctionPassManagerImpl : public Pass,
+                                public PMDataManager,
+                                public PMTopLevelManager {
+  virtual void anchor();
+private:
+  bool wasRun;
+public:
+  static char ID;
+  explicit FunctionPassManagerImpl() :
+    Pass(PT_PassManager, ID), PMDataManager(),
+    PMTopLevelManager(new FPPassManager()), wasRun(false) {}
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P) {
+    schedulePass(P);
+  }
+
+  /// createPrinterPass - Get a function printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintFunctionPass(Banner, &O);
+  }
+
+  // Prepare for running an on the fly pass, freeing memory if needed
+  // from a previous run.
+  void releaseMemoryOnTheFly();
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool run(Function &F);
+
+  /// doInitialization - Run all of the initializers for the function passes.
+  ///
+  bool doInitialization(Module &M);
+
+  /// doFinalization - Run all of the finalizers for the function passes.
+  ///
+  bool doFinalization(Module &M);
+
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+  virtual PassManagerType getTopLevelPassManagerType() {
+    return PMT_FunctionPassManager;
+  }
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  FPPassManager *getContainedManager(unsigned N) {
+    assert(N < PassManagers.size() && "Pass number out of range!");
+    FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
+    return FP;
+  }
+};
+
+void FunctionPassManagerImpl::anchor() {}
+
+char FunctionPassManagerImpl::ID = 0;
+} // End of legacy namespace
+} // End of llvm namespace
+
+namespace {
+//===----------------------------------------------------------------------===//
+// MPPassManager
+//
+/// MPPassManager manages ModulePasses and function pass managers.
+/// It batches all Module passes and function pass managers together and
+/// sequences them to process one module.
+class MPPassManager : public Pass, public PMDataManager {
+public:
+  static char ID;
+  explicit MPPassManager() :
+    Pass(PT_PassManager, ID), PMDataManager() { }
+
+  // Delete on the fly managers.
+  virtual ~MPPassManager() {
+    for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+           I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+         I != E; ++I) {
+      FunctionPassManagerImpl *FPP = I->second;
+      delete FPP;
+    }
+  }
+
+  /// createPrinterPass - Get a module printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintModulePass(&O, false, Banner);
+  }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool runOnModule(Module &M);
+
+  using llvm::Pass::doInitialization;
+  using llvm::Pass::doFinalization;
+
+  /// doInitialization - Run all of the initializers for the module passes.
+  ///
+  bool doInitialization();
+
+  /// doFinalization - Run all of the finalizers for the module passes.
+  ///
+  bool doFinalization();
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  /// Add RequiredPass into list of lower level passes required by pass P.
+  /// RequiredPass is run on the fly by Pass Manager when P requests it
+  /// through getAnalysis interface.
+  virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
+
+  /// Return function pass corresponding to PassInfo PI, that is
+  /// required by module pass MP. Instantiate analysis pass, by using
+  /// its runOnFunction() for function F.
+  virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
+
+  virtual const char *getPassName() const {
+    return "Module Pass Manager";
+  }
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+
+  // Print passes managed by this manager
+  void dumpPassStructure(unsigned Offset) {
+    llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      ModulePass *MP = getContainedPass(Index);
+      MP->dumpPassStructure(Offset + 1);
+      std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
+        OnTheFlyManagers.find(MP);
+      if (I != OnTheFlyManagers.end())
+        I->second->dumpPassStructure(Offset + 2);
+      dumpLastUses(MP, Offset+1);
+    }
+  }
+
+  ModulePass *getContainedPass(unsigned N) {
+    assert(N < PassVector.size() && "Pass number out of range!");
+    return static_cast<ModulePass *>(PassVector[N]);
+  }
+
+  virtual PassManagerType getPassManagerType() const {
+    return PMT_ModulePassManager;
+  }
+
+ private:
+  /// Collection of on the fly FPPassManagers. These managers manage
+  /// function passes that are required by module passes.
+  std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
+};
+
+char MPPassManager::ID = 0;
+} // End anonymous namespace
+
+namespace llvm {
+namespace legacy {
+//===----------------------------------------------------------------------===//
+// PassManagerImpl
+//
+
+/// PassManagerImpl manages MPPassManagers
+class PassManagerImpl : public Pass,
+                        public PMDataManager,
+                        public PMTopLevelManager {
+  virtual void anchor();
+
+public:
+  static char ID;
+  explicit PassManagerImpl() :
+    Pass(PT_PassManager, ID), PMDataManager(),
+                              PMTopLevelManager(new MPPassManager()) {}
+
+  /// add - Add a pass to the queue of passes to run.  This passes ownership of
+  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+  /// will be destroyed as well, so there is no need to delete the pass.  This
+  /// implies that all passes MUST be allocated with 'new'.
+  void add(Pass *P) {
+    schedulePass(P);
+  }
+
+  /// createPrinterPass - Get a module printer pass.
+  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
+    return createPrintModulePass(&O, false, Banner);
+  }
+
+  /// run - Execute all of the passes scheduled for execution.  Keep track of
+  /// whether any of the passes modifies the module, and if so, return true.
+  bool run(Module &M);
+
+  using llvm::Pass::doInitialization;
+  using llvm::Pass::doFinalization;
+
+  /// doInitialization - Run all of the initializers for the module passes.
+  ///
+  bool doInitialization();
+
+  /// doFinalization - Run all of the finalizers for the module passes.
+  ///
+  bool doFinalization();
+
+  /// Pass Manager itself does not invalidate any analysis info.
+  void getAnalysisUsage(AnalysisUsage &Info) const {
+    Info.setPreservesAll();
+  }
+
+  virtual PMDataManager *getAsPMDataManager() { return this; }
+  virtual Pass *getAsPass() { return this; }
+  virtual PassManagerType getTopLevelPassManagerType() {
+    return PMT_ModulePassManager;
+  }
+
+  MPPassManager *getContainedManager(unsigned N) {
+    assert(N < PassManagers.size() && "Pass number out of range!");
+    MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
+    return MP;
+  }
+};
+
+void PassManagerImpl::anchor() {}
+
+char PassManagerImpl::ID = 0;
+} // End of legacy namespace
+} // End of llvm namespace
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+/// TimingInfo Class - This class is used to calculate information about the
+/// amount of time each pass takes to execute.  This only happens when
+/// -time-passes is enabled on the command line.
+///
+
+static ManagedStatic<sys::SmartMutex<true> > TimingInfoMutex;
+
+class TimingInfo {
+  DenseMap<Pass*, Timer*> TimingData;
+  TimerGroup TG;
+public:
+  // Use 'create' member to get this.
+  TimingInfo() : TG("... Pass execution timing report ...") {}
+
+  // TimingDtor - Print out information about timing information
+  ~TimingInfo() {
+    // Delete all of the timers, which accumulate their info into the
+    // TimerGroup.
+    for (DenseMap<Pass*, Timer*>::iterator I = TimingData.begin(),
+         E = TimingData.end(); I != E; ++I)
+      delete I->second;
+    // TimerGroup is deleted next, printing the report.
+  }
+
+  // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
+  // to a non null value (if the -time-passes option is enabled) or it leaves it
+  // null.  It may be called multiple times.
+  static void createTheTimeInfo();
+
+  /// getPassTimer - Return the timer for the specified pass if it exists.
+  Timer *getPassTimer(Pass *P) {
+    if (P->getAsPMDataManager())
+      return 0;
+
+    sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
+    Timer *&T = TimingData[P];
+    if (T == 0)
+      T = new Timer(P->getPassName(), TG);
+    return T;
+  }
+};
+
+} // End of anon namespace
+
+static TimingInfo *TheTimeInfo;
+
+//===----------------------------------------------------------------------===//
+// PMTopLevelManager implementation
+
+/// Initialize top level manager. Create first pass manager.
+PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
+  PMDM->setTopLevelManager(this);
+  addPassManager(PMDM);
+  activeStack.push(PMDM);
+}
+
+/// Set pass P as the last user of the given analysis passes.
+void
+PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
+  unsigned PDepth = 0;
+  if (P->getResolver())
+    PDepth = P->getResolver()->getPMDataManager().getDepth();
+
+  for (SmallVectorImpl<Pass *>::const_iterator I = AnalysisPasses.begin(),
+         E = AnalysisPasses.end(); I != E; ++I) {
+    Pass *AP = *I;
+    LastUser[AP] = P;
+
+    if (P == AP)
+      continue;
+
+    // Update the last users of passes that are required transitive by AP.
+    AnalysisUsage *AnUsage = findAnalysisUsage(AP);
+    const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+    SmallVector<Pass *, 12> LastUses;
+    SmallVector<Pass *, 12> LastPMUses;
+    for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+         E = IDs.end(); I != E; ++I) {
+      Pass *AnalysisPass = findAnalysisPass(*I);
+      assert(AnalysisPass && "Expected analysis pass to exist.");
+      AnalysisResolver *AR = AnalysisPass->getResolver();
+      assert(AR && "Expected analysis resolver to exist.");
+      unsigned APDepth = AR->getPMDataManager().getDepth();
+
+      if (PDepth == APDepth)
+        LastUses.push_back(AnalysisPass);
+      else if (PDepth > APDepth)
+        LastPMUses.push_back(AnalysisPass);
+    }
+
+    setLastUser(LastUses, P);
+
+    // If this pass has a corresponding pass manager, push higher level
+    // analysis to this pass manager.
+    if (P->getResolver())
+      setLastUser(LastPMUses, P->getResolver()->getPMDataManager().getAsPass());
+
+
+    // If AP is the last user of other passes then make P last user of
+    // such passes.
+    for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
+           LUE = LastUser.end(); LUI != LUE; ++LUI) {
+      if (LUI->second == AP)
+        // DenseMap iterator is not invalidated here because
+        // this is just updating existing entries.
+        LastUser[LUI->first] = P;
+    }
+  }
+}
+
+/// Collect passes whose last user is P
+void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
+                                        Pass *P) {
+  DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
+    InversedLastUser.find(P);
+  if (DMI == InversedLastUser.end())
+    return;
+
+  SmallPtrSet<Pass *, 8> &LU = DMI->second;
+  for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
+         E = LU.end(); I != E; ++I) {
+    LastUses.push_back(*I);
+  }
+
+}
+
+AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
+  AnalysisUsage *AnUsage = NULL;
+  DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
+  if (DMI != AnUsageMap.end())
+    AnUsage = DMI->second;
+  else {
+    AnUsage = new AnalysisUsage();
+    P->getAnalysisUsage(*AnUsage);
+    AnUsageMap[P] = AnUsage;
+  }
+  return AnUsage;
+}
+
+/// Schedule pass P for execution. Make sure that passes required by
+/// P are run before P is run. Update analysis info maintained by
+/// the manager. Remove dead passes. This is a recursive function.
+void PMTopLevelManager::schedulePass(Pass *P) {
+
+  // TODO : Allocate function manager for this pass, other wise required set
+  // may be inserted into previous function manager
+
+  // Give pass a chance to prepare the stage.
+  P->preparePassManager(activeStack);
+
+  // If P is an analysis pass and it is available then do not
+  // generate the analysis again. Stale analysis info should not be
+  // available at this point.
+  const PassInfo *PI =
+    PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
+  if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
+    delete P;
+    return;
+  }
+
+  AnalysisUsage *AnUsage = findAnalysisUsage(P);
+
+  bool checkAnalysis = true;
+  while (checkAnalysis) {
+    checkAnalysis = false;
+
+    const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+    for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
+           E = RequiredSet.end(); I != E; ++I) {
+
+      Pass *AnalysisPass = findAnalysisPass(*I);
+      if (!AnalysisPass) {
+        const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+
+        if (PI == NULL) {
+          // Pass P is not in the global PassRegistry
+          dbgs() << "Pass '"  << P->getPassName() << "' is not initialized." << "\n";
+          dbgs() << "Verify if there is a pass dependency cycle." << "\n";
+          dbgs() << "Required Passes:" << "\n";
+          for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
+                 E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
+            Pass *AnalysisPass2 = findAnalysisPass(*I2);
+            if (AnalysisPass2) {
+              dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
+            } else {
+              dbgs() << "\t"   << "Error: Required pass not found! Possible causes:"  << "\n";
+              dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)"    << "\n";
+              dbgs() << "\t\t" << "- Corruption of the global PassRegistry"           << "\n";
+            }
+          }
+        }
+
+        assert(PI && "Expected required passes to be initialized");
+        AnalysisPass = PI->createPass();
+        if (P->getPotentialPassManagerType () ==
+            AnalysisPass->getPotentialPassManagerType())
+          // Schedule analysis pass that is managed by the same pass manager.
+          schedulePass(AnalysisPass);
+        else if (P->getPotentialPassManagerType () >
+                 AnalysisPass->getPotentialPassManagerType()) {
+          // Schedule analysis pass that is managed by a new manager.
+          schedulePass(AnalysisPass);
+          // Recheck analysis passes to ensure that required analyses that
+          // are already checked are still available.
+          checkAnalysis = true;
+        } else
+          // Do not schedule this analysis. Lower level analsyis
+          // passes are run on the fly.
+          delete AnalysisPass;
+      }
+    }
+  }
+
+  // Now all required passes are available.
+  if (ImmutablePass *IP = P->getAsImmutablePass()) {
+    // P is a immutable pass and it will be managed by this
+    // top level manager. Set up analysis resolver to connect them.
+    PMDataManager *DM = getAsPMDataManager();
+    AnalysisResolver *AR = new AnalysisResolver(*DM);
+    P->setResolver(AR);
+    DM->initializeAnalysisImpl(P);
+    addImmutablePass(IP);
+    DM->recordAvailableAnalysis(IP);
+    return;
+  }
+
+  if (PI && !PI->isAnalysis() && ShouldPrintBeforePass(PI)) {
+    Pass *PP = P->createPrinterPass(
+      dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***");
+    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+  }
+
+  // Add the requested pass to the best available pass manager.
+  P->assignPassManager(activeStack, getTopLevelPassManagerType());
+
+  if (PI && !PI->isAnalysis() && ShouldPrintAfterPass(PI)) {
+    Pass *PP = P->createPrinterPass(
+      dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***");
+    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
+  }
+}
+
+/// Find the pass that implements Analysis AID. Search immutable
+/// passes and all pass managers. If desired pass is not found
+/// then return NULL.
+Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
+
+  // Check pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    if (Pass *P = (*I)->findAnalysisPass(AID, false))
+      return P;
+
+  // Check other pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator
+         I = IndirectPassManagers.begin(),
+         E = IndirectPassManagers.end(); I != E; ++I)
+    if (Pass *P = (*I)->findAnalysisPass(AID, false))
+      return P;
+
+  // Check the immutable passes. Iterate in reverse order so that we find
+  // the most recently registered passes first.
+  for (SmallVectorImpl<ImmutablePass *>::reverse_iterator I =
+       ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; ++I) {
+    AnalysisID PI = (*I)->getPassID();
+    if (PI == AID)
+      return *I;
+
+    // If Pass not found then check the interfaces implemented by Immutable Pass
+    const PassInfo *PassInf =
+      PassRegistry::getPassRegistry()->getPassInfo(PI);
+    assert(PassInf && "Expected all immutable passes to be initialized");
+    const std::vector<const PassInfo*> &ImmPI =
+      PassInf->getInterfacesImplemented();
+    for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
+         EE = ImmPI.end(); II != EE; ++II) {
+      if ((*II)->getTypeInfo() == AID)
+        return *I;
+    }
+  }
+
+  return 0;
+}
+
+// Print passes managed by this top level manager.
+void PMTopLevelManager::dumpPasses() const {
+
+  if (PassDebugging < Structure)
+    return;
+
+  // Print out the immutable passes
+  for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
+    ImmutablePasses[i]->dumpPassStructure(0);
+  }
+
+  // Every class that derives from PMDataManager also derives from Pass
+  // (sometimes indirectly), but there's no inheritance relationship
+  // between PMDataManager and Pass, so we have to getAsPass to get
+  // from a PMDataManager* to a Pass*.
+  for (SmallVectorImpl<PMDataManager *>::const_iterator I =
+       PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
+    (*I)->getAsPass()->dumpPassStructure(1);
+}
+
+void PMTopLevelManager::dumpArguments() const {
+
+  if (PassDebugging < Arguments)
+    return;
+
+  dbgs() << "Pass Arguments: ";
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I =
+       ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+    if (const PassInfo *PI =
+        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
+      assert(PI && "Expected all immutable passes to be initialized");
+      if (!PI->isAnalysisGroup())
+        dbgs() << " -" << PI->getPassArgument();
+    }
+  for (SmallVectorImpl<PMDataManager *>::const_iterator I =
+       PassManagers.begin(), E = PassManagers.end(); I != E; ++I)
+    (*I)->dumpPassArguments();
+  dbgs() << "\n";
+}
+
+void PMTopLevelManager::initializeAllAnalysisInfo() {
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    (*I)->initializeAnalysisInfo();
+
+  // Initailize other pass managers
+  for (SmallVectorImpl<PMDataManager *>::iterator
+       I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+       I != E; ++I)
+    (*I)->initializeAnalysisInfo();
+
+  for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
+        DME = LastUser.end(); DMI != DME; ++DMI) {
+    DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
+      InversedLastUser.find(DMI->second);
+    if (InvDMI != InversedLastUser.end()) {
+      SmallPtrSet<Pass *, 8> &L = InvDMI->second;
+      L.insert(DMI->first);
+    } else {
+      SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
+      InversedLastUser[DMI->second] = L;
+    }
+  }
+}
+
+/// Destructor
+PMTopLevelManager::~PMTopLevelManager() {
+  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
+         E = PassManagers.end(); I != E; ++I)
+    delete *I;
+
+  for (SmallVectorImpl<ImmutablePass *>::iterator
+         I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
+    delete *I;
+
+  for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
+         DME = AnUsageMap.end(); DMI != DME; ++DMI)
+    delete DMI->second;
+}
+
+//===----------------------------------------------------------------------===//
+// PMDataManager implementation
+
+/// Augement AvailableAnalysis by adding analysis made available by pass P.
+void PMDataManager::recordAvailableAnalysis(Pass *P) {
+  AnalysisID PI = P->getPassID();
+
+  AvailableAnalysis[PI] = P;
+
+  assert(!AvailableAnalysis.empty());
+
+  // This pass is the current implementation of all of the interfaces it
+  // implements as well.
+  const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
+  if (PInf == 0) return;
+  const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+  for (unsigned i = 0, e = II.size(); i != e; ++i)
+    AvailableAnalysis[II[i]->getTypeInfo()] = P;
+}
+
+// Return true if P preserves high level analysis used by other
+// passes managed by this manager
+bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  if (AnUsage->getPreservesAll())
+    return true;
+
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+  for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
+         E = HigherLevelAnalysis.end(); I  != E; ++I) {
+    Pass *P1 = *I;
+    if (P1->getAsImmutablePass() == 0 &&
+        std::find(PreservedSet.begin(), PreservedSet.end(),
+                  P1->getPassID()) ==
+           PreservedSet.end())
+      return false;
+  }
+
+  return true;
+}
+
+/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
+void PMDataManager::verifyPreservedAnalysis(Pass *P) {
+  // Don't do this unless assertions are enabled.
+#ifdef NDEBUG
+  return;
+#endif
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+
+  // Verify preserved analysis
+  for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
+         E = PreservedSet.end(); I != E; ++I) {
+    AnalysisID AID = *I;
+    if (Pass *AP = findAnalysisPass(AID, true)) {
+      TimeRegion PassTimer(getPassTimer(AP));
+      AP->verifyAnalysis();
+    }
+  }
+}
+
+/// Remove Analysis not preserved by Pass P
+void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  if (AnUsage->getPreservesAll())
+    return;
+
+  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
+  for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
+         E = AvailableAnalysis.end(); I != E; ) {
+    DenseMap<AnalysisID, Pass*>::iterator Info = I++;
+    if (Info->second->getAsImmutablePass() == 0 &&
+        std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+        PreservedSet.end()) {
+      // Remove this analysis
+      if (PassDebugging >= Details) {
+        Pass *S = Info->second;
+        dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
+        dbgs() << S->getPassName() << "'\n";
+      }
+      AvailableAnalysis.erase(Info);
+    }
+  }
+
+  // Check inherited analysis also. If P is not preserving analysis
+  // provided by parent manager then remove it here.
+  for (unsigned Index = 0; Index < PMT_Last; ++Index) {
+
+    if (!InheritedAnalysis[Index])
+      continue;
+
+    for (DenseMap<AnalysisID, Pass*>::iterator
+           I = InheritedAnalysis[Index]->begin(),
+           E = InheritedAnalysis[Index]->end(); I != E; ) {
+      DenseMap<AnalysisID, Pass *>::iterator Info = I++;
+      if (Info->second->getAsImmutablePass() == 0 &&
+          std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
+             PreservedSet.end()) {
+        // Remove this analysis
+        if (PassDebugging >= Details) {
+          Pass *S = Info->second;
+          dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
+          dbgs() << S->getPassName() << "'\n";
+        }
+        InheritedAnalysis[Index]->erase(Info);
+      }
+    }
+  }
+}
+
+/// Remove analysis passes that are not used any longer
+void PMDataManager::removeDeadPasses(Pass *P, StringRef Msg,
+                                     enum PassDebuggingString DBG_STR) {
+
+  SmallVector<Pass *, 12> DeadPasses;
+
+  // If this is a on the fly manager then it does not have TPM.
+  if (!TPM)
+    return;
+
+  TPM->collectLastUses(DeadPasses, P);
+
+  if (PassDebugging >= Details && !DeadPasses.empty()) {
+    dbgs() << " -*- '" <<  P->getPassName();
+    dbgs() << "' is the last user of following pass instances.";
+    dbgs() << " Free these instances\n";
+  }
+
+  for (SmallVectorImpl<Pass *>::iterator I = DeadPasses.begin(),
+         E = DeadPasses.end(); I != E; ++I)
+    freePass(*I, Msg, DBG_STR);
+}
+
+void PMDataManager::freePass(Pass *P, StringRef Msg,
+                             enum PassDebuggingString DBG_STR) {
+  dumpPassInfo(P, FREEING_MSG, DBG_STR, Msg);
+
+  {
+    // If the pass crashes releasing memory, remember this.
+    PassManagerPrettyStackEntry X(P);
+    TimeRegion PassTimer(getPassTimer(P));
+
+    P->releaseMemory();
+  }
+
+  AnalysisID PI = P->getPassID();
+  if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
+    // Remove the pass itself (if it is not already removed).
+    AvailableAnalysis.erase(PI);
+
+    // Remove all interfaces this pass implements, for which it is also
+    // listed as the available implementation.
+    const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
+    for (unsigned i = 0, e = II.size(); i != e; ++i) {
+      DenseMap<AnalysisID, Pass*>::iterator Pos =
+        AvailableAnalysis.find(II[i]->getTypeInfo());
+      if (Pos != AvailableAnalysis.end() && Pos->second == P)
+        AvailableAnalysis.erase(Pos);
+    }
+  }
+}
+
+/// Add pass P into the PassVector. Update
+/// AvailableAnalysis appropriately if ProcessAnalysis is true.
+void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
+  // This manager is going to manage pass P. Set up analysis resolver
+  // to connect them.
+  AnalysisResolver *AR = new AnalysisResolver(*this);
+  P->setResolver(AR);
+
+  // If a FunctionPass F is the last user of ModulePass info M
+  // then the F's manager, not F, records itself as a last user of M.
+  SmallVector<Pass *, 12> TransferLastUses;
+
+  if (!ProcessAnalysis) {
+    // Add pass
+    PassVector.push_back(P);
+    return;
+  }
+
+  // At the moment, this pass is the last user of all required passes.
+  SmallVector<Pass *, 12> LastUses;
+  SmallVector<Pass *, 8> RequiredPasses;
+  SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
+
+  unsigned PDepth = this->getDepth();
+
+  collectRequiredAnalysis(RequiredPasses,
+                          ReqAnalysisNotAvailable, P);
+  for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(),
+         E = RequiredPasses.end(); I != E; ++I) {
+    Pass *PRequired = *I;
+    unsigned RDepth = 0;
+
+    assert(PRequired->getResolver() && "Analysis Resolver is not set");
+    PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
+    RDepth = DM.getDepth();
+
+    if (PDepth == RDepth)
+      LastUses.push_back(PRequired);
+    else if (PDepth > RDepth) {
+      // Let the parent claim responsibility of last use
+      TransferLastUses.push_back(PRequired);
+      // Keep track of higher level analysis used by this manager.
+      HigherLevelAnalysis.push_back(PRequired);
+    } else
+      llvm_unreachable("Unable to accommodate Required Pass");
+  }
+
+  // Set P as P's last user until someone starts using P.
+  // However, if P is a Pass Manager then it does not need
+  // to record its last user.
+  if (P->getAsPMDataManager() == 0)
+    LastUses.push_back(P);
+  TPM->setLastUser(LastUses, P);
+
+  if (!TransferLastUses.empty()) {
+    Pass *My_PM = getAsPass();
+    TPM->setLastUser(TransferLastUses, My_PM);
+    TransferLastUses.clear();
+  }
+
+  // Now, take care of required analyses that are not available.
+  for (SmallVectorImpl<AnalysisID>::iterator
+         I = ReqAnalysisNotAvailable.begin(),
+         E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
+    const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
+    Pass *AnalysisPass = PI->createPass();
+    this->addLowerLevelRequiredPass(P, AnalysisPass);
+  }
+
+  // Take a note of analysis required and made available by this pass.
+  // Remove the analysis not preserved by this pass
+  removeNotPreservedAnalysis(P);
+  recordAvailableAnalysis(P);
+
+  // Add pass
+  PassVector.push_back(P);
+}
+
+
+/// Populate RP with analysis pass that are required by
+/// pass P and are available. Populate RP_NotAvail with analysis
+/// pass that are required by pass P but are not available.
+void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP,
+                                       SmallVectorImpl<AnalysisID> &RP_NotAvail,
+                                            Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+  const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
+  for (AnalysisUsage::VectorType::const_iterator
+         I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
+    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+      RP.push_back(AnalysisPass);
+    else
+      RP_NotAvail.push_back(*I);
+  }
+
+  const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
+  for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
+         E = IDs.end(); I != E; ++I) {
+    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
+      RP.push_back(AnalysisPass);
+    else
+      RP_NotAvail.push_back(*I);
+  }
+}
+
+// All Required analyses should be available to the pass as it runs!  Here
+// we fill in the AnalysisImpls member of the pass so that it can
+// successfully use the getAnalysis() method to retrieve the
+// implementations it needs.
+//
+void PMDataManager::initializeAnalysisImpl(Pass *P) {
+  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
+
+  for (AnalysisUsage::VectorType::const_iterator
+         I = AnUsage->getRequiredSet().begin(),
+         E = AnUsage->getRequiredSet().end(); I != E; ++I) {
+    Pass *Impl = findAnalysisPass(*I, true);
+    if (Impl == 0)
+      // This may be analysis pass that is initialized on the fly.
+      // If that is not the case then it will raise an assert when it is used.
+      continue;
+    AnalysisResolver *AR = P->getResolver();
+    assert(AR && "Analysis Resolver is not set");
+    AR->addAnalysisImplsPair(*I, Impl);
+  }
+}
+
+/// Find the pass that implements Analysis AID. If desired pass is not found
+/// then return NULL.
+Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
+
+  // Check if AvailableAnalysis map has one entry.
+  DenseMap<AnalysisID, Pass*>::const_iterator I =  AvailableAnalysis.find(AID);
+
+  if (I != AvailableAnalysis.end())
+    return I->second;
+
+  // Search Parents through TopLevelManager
+  if (SearchParent)
+    return TPM->findAnalysisPass(AID);
+
+  return NULL;
+}
+
+// Print list of passes that are last used by P.
+void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
+
+  SmallVector<Pass *, 12> LUses;
+
+  // If this is a on the fly manager then it does not have TPM.
+  if (!TPM)
+    return;
+
+  TPM->collectLastUses(LUses, P);
+
+  for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
+         E = LUses.end(); I != E; ++I) {
+    llvm::dbgs() << "--" << std::string(Offset*2, ' ');
+    (*I)->dumpPassStructure(0);
+  }
+}
+
+void PMDataManager::dumpPassArguments() const {
+  for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
+        E = PassVector.end(); I != E; ++I) {
+    if (PMDataManager *PMD = (*I)->getAsPMDataManager())
+      PMD->dumpPassArguments();
+    else
+      if (const PassInfo *PI =
+            PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
+        if (!PI->isAnalysisGroup())
+          dbgs() << " -" << PI->getPassArgument();
+  }
+}
+
+void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
+                                 enum PassDebuggingString S2,
+                                 StringRef Msg) {
+  if (PassDebugging < Executions)
+    return;
+  dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
+  switch (S1) {
+  case EXECUTION_MSG:
+    dbgs() << "Executing Pass '" << P->getPassName();
+    break;
+  case MODIFICATION_MSG:
+    dbgs() << "Made Modification '" << P->getPassName();
+    break;
+  case FREEING_MSG:
+    dbgs() << " Freeing Pass '" << P->getPassName();
+    break;
+  default:
+    break;
+  }
+  switch (S2) {
+  case ON_BASICBLOCK_MSG:
+    dbgs() << "' on BasicBlock '" << Msg << "'...\n";
+    break;
+  case ON_FUNCTION_MSG:
+    dbgs() << "' on Function '" << Msg << "'...\n";
+    break;
+  case ON_MODULE_MSG:
+    dbgs() << "' on Module '"  << Msg << "'...\n";
+    break;
+  case ON_REGION_MSG:
+    dbgs() << "' on Region '"  << Msg << "'...\n";
+    break;
+  case ON_LOOP_MSG:
+    dbgs() << "' on Loop '" << Msg << "'...\n";
+    break;
+  case ON_CG_MSG:
+    dbgs() << "' on Call Graph Nodes '" << Msg << "'...\n";
+    break;
+  default:
+    break;
+  }
+}
+
+void PMDataManager::dumpRequiredSet(const Pass *P) const {
+  if (PassDebugging < Details)
+    return;
+
+  AnalysisUsage analysisUsage;
+  P->getAnalysisUsage(analysisUsage);
+  dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
+}
+
+void PMDataManager::dumpPreservedSet(const Pass *P) const {
+  if (PassDebugging < Details)
+    return;
+
+  AnalysisUsage analysisUsage;
+  P->getAnalysisUsage(analysisUsage);
+  dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
+}
+
+void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
+                                   const AnalysisUsage::VectorType &Set) const {
+  assert(PassDebugging >= Details);
+  if (Set.empty())
+    return;
+  dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
+  for (unsigned i = 0; i != Set.size(); ++i) {
+    if (i) dbgs() << ',';
+    const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
+    if (!PInf) {
+      // Some preserved passes, such as AliasAnalysis, may not be initialized by
+      // all drivers.
+      dbgs() << " Uninitialized Pass";
+      continue;
+    }
+    dbgs() << ' ' << PInf->getPassName();
+  }
+  dbgs() << '\n';
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+/// This should be handled by specific pass manager.
+void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+  if (TPM) {
+    TPM->dumpArguments();
+    TPM->dumpPasses();
+  }
+
+  // Module Level pass may required Function Level analysis info
+  // (e.g. dominator info). Pass manager uses on the fly function pass manager
+  // to provide this on demand. In that case, in Pass manager terminology,
+  // module level pass is requiring lower level analysis info managed by
+  // lower level pass manager.
+
+  // When Pass manager is not able to order required analysis info, Pass manager
+  // checks whether any lower level manager will be able to provide this
+  // analysis info on demand or not.
+#ifndef NDEBUG
+  dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
+  dbgs() << "' required by '" << P->getPassName() << "'\n";
+#endif
+  llvm_unreachable("Unable to schedule pass");
+}
+
+Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
+  llvm_unreachable("Unable to find on the fly pass");
+}
+
+// Destructor
+PMDataManager::~PMDataManager() {
+  for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
+         E = PassVector.end(); I != E; ++I)
+    delete *I;
+}
+
+//===----------------------------------------------------------------------===//
+// NOTE: Is this the right place to define this method ?
+// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
+Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
+  return PM.findAnalysisPass(ID, dir);
+}
+
+Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
+                                     Function &F) {
+  return PM.getOnTheFlyPass(P, AnalysisPI, F);
+}
+
+//===----------------------------------------------------------------------===//
+// BBPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnBasicBlock method.  Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool BBPassManager::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  bool Changed = doInitialization(F);
+
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+      BasicBlockPass *BP = getContainedPass(Index);
+      bool LocalChanged = false;
+
+      dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
+      dumpRequiredSet(BP);
+
+      initializeAnalysisImpl(BP);
+
+      {
+        // If the pass crashes, remember this.
+        PassManagerPrettyStackEntry X(BP, *I);
+        TimeRegion PassTimer(getPassTimer(BP));
+
+        LocalChanged |= BP->runOnBasicBlock(*I);
+      }
+
+      Changed |= LocalChanged;
+      if (LocalChanged)
+        dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
+                     I->getName());
+      dumpPreservedSet(BP);
+
+      verifyPreservedAnalysis(BP);
+      removeNotPreservedAnalysis(BP);
+      recordAvailableAnalysis(BP);
+      removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
+    }
+
+  return doFinalization(F) || Changed;
+}
+
+// Implement doInitialization and doFinalization
+bool BBPassManager::doInitialization(Module &M) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool BBPassManager::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  return Changed;
+}
+
+bool BBPassManager::doInitialization(Function &F) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    BasicBlockPass *BP = getContainedPass(Index);
+    Changed |= BP->doInitialization(F);
+  }
+
+  return Changed;
+}
+
+bool BBPassManager::doFinalization(Function &F) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    BasicBlockPass *BP = getContainedPass(Index);
+    Changed |= BP->doFinalization(F);
+  }
+
+  return Changed;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManager implementation
+
+/// Create new Function pass manager
+FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
+  FPM = new FunctionPassManagerImpl();
+  // FPM is the top level manager.
+  FPM->setTopLevelManager(FPM);
+
+  AnalysisResolver *AR = new AnalysisResolver(*FPM);
+  FPM->setResolver(AR);
+}
+
+FunctionPassManager::~FunctionPassManager() {
+  delete FPM;
+}
+
+/// add - Add a pass to the queue of passes to run.  This passes
+/// ownership of the Pass to the PassManager.  When the
+/// PassManager_X is destroyed, the pass will be destroyed as well, so
+/// there is no need to delete the pass. (TODO delete passes.)
+/// This implies that all passes MUST be allocated with 'new'.
+void FunctionPassManager::add(Pass *P) {
+  FPM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep
+/// track of whether any of the passes modifies the function, and if
+/// so, return true.
+///
+bool FunctionPassManager::run(Function &F) {
+  if (F.isMaterializable()) {
+    std::string errstr;
+    if (F.Materialize(&errstr))
+      report_fatal_error("Error reading bitcode file: " + Twine(errstr));
+  }
+  return FPM->run(F);
+}
+
+
+/// doInitialization - Run all of the initializers for the function passes.
+///
+bool FunctionPassManager::doInitialization() {
+  return FPM->doInitialization(*M);
+}
+
+/// doFinalization - Run all of the finalizers for the function passes.
+///
+bool FunctionPassManager::doFinalization() {
+  return FPM->doFinalization(*M);
+}
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl implementation
+//
+bool FunctionPassManagerImpl::doInitialization(Module &M) {
+  bool Changed = false;
+
+  dumpArguments();
+  dumpPasses();
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doInitialization(M);
+  }
+
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool FunctionPassManagerImpl::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
+    Changed |= getContainedManager(Index)->doFinalization(M);
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+/// cleanup - After running all passes, clean up pass manager cache.
+void FPPassManager::cleanup() {
+ for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    AnalysisResolver *AR = FP->getResolver();
+    assert(AR && "Analysis Resolver is not set");
+    AR->clearAnalysisImpls();
+ }
+}
+
+void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
+  if (!wasRun)
+    return;
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
+    FPPassManager *FPPM = getContainedManager(Index);
+    for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
+      FPPM->getContainedPass(Index)->releaseMemory();
+    }
+  }
+  wasRun = false;
+}
+
+// Execute all the passes managed by this top level manager.
+// Return true if any function is modified by a pass.
+bool FunctionPassManagerImpl::run(Function &F) {
+  bool Changed = false;
+  TimingInfo::createTheTimeInfo();
+
+  initializeAllAnalysisInfo();
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->runOnFunction(F);
+
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    getContainedManager(Index)->cleanup();
+
+  wasRun = true;
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// FPPassManager implementation
+
+char FPPassManager::ID = 0;
+/// Print passes managed by this manager
+void FPPassManager::dumpPassStructure(unsigned Offset) {
+  dbgs().indent(Offset*2) << "FunctionPass Manager\n";
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    FP->dumpPassStructure(Offset + 1);
+    dumpLastUses(FP, Offset+1);
+  }
+}
+
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnFunction method.  Keep track of whether any of the passes modifies
+/// the function, and if so, return true.
+bool FPPassManager::runOnFunction(Function &F) {
+  if (F.isDeclaration())
+    return false;
+
+  bool Changed = false;
+
+  // Collect inherited analysis from Module level pass manager.
+  populateInheritedAnalysis(TPM->activeStack);
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    FunctionPass *FP = getContainedPass(Index);
+    bool LocalChanged = false;
+
+    dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getName());
+    dumpRequiredSet(FP);
+
+    initializeAnalysisImpl(FP);
+
+    {
+      PassManagerPrettyStackEntry X(FP, F);
+      TimeRegion PassTimer(getPassTimer(FP));
+
+      LocalChanged |= FP->runOnFunction(F);
+    }
+
+    Changed |= LocalChanged;
+    if (LocalChanged)
+      dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName());
+    dumpPreservedSet(FP);
+
+    verifyPreservedAnalysis(FP);
+    removeNotPreservedAnalysis(FP);
+    recordAvailableAnalysis(FP);
+    removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
+  }
+  return Changed;
+}
+
+bool FPPassManager::runOnModule(Module &M) {
+  bool Changed = false;
+
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    Changed |= runOnFunction(*I);
+
+  return Changed;
+}
+
+bool FPPassManager::doInitialization(Module &M) {
+  bool Changed = false;
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  return Changed;
+}
+
+bool FPPassManager::doFinalization(Module &M) {
+  bool Changed = false;
+
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// MPPassManager implementation
+
+/// Execute all of the passes scheduled for execution by invoking
+/// runOnModule method.  Keep track of whether any of the passes modifies
+/// the module, and if so, return true.
+bool
+MPPassManager::runOnModule(Module &M) {
+  bool Changed = false;
+
+  // Initialize on-the-fly passes
+  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+       I != E; ++I) {
+    FunctionPassManagerImpl *FPP = I->second;
+    Changed |= FPP->doInitialization(M);
+  }
+
+  // Initialize module passes
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
+    Changed |= getContainedPass(Index)->doInitialization(M);
+
+  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
+    ModulePass *MP = getContainedPass(Index);
+    bool LocalChanged = false;
+
+    dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
+    dumpRequiredSet(MP);
+
+    initializeAnalysisImpl(MP);
+
+    {
+      PassManagerPrettyStackEntry X(MP, M);
+      TimeRegion PassTimer(getPassTimer(MP));
+
+      LocalChanged |= MP->runOnModule(M);
+    }
+
+    Changed |= LocalChanged;
+    if (LocalChanged)
+      dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
+                   M.getModuleIdentifier());
+    dumpPreservedSet(MP);
+
+    verifyPreservedAnalysis(MP);
+    removeNotPreservedAnalysis(MP);
+    recordAvailableAnalysis(MP);
+    removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
+  }
+
+  // Finalize module passes
+  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
+    Changed |= getContainedPass(Index)->doFinalization(M);
+
+  // Finalize on-the-fly passes
+  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
+       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
+       I != E; ++I) {
+    FunctionPassManagerImpl *FPP = I->second;
+    // We don't know when is the last time an on-the-fly pass is run,
+    // so we need to releaseMemory / finalize here
+    FPP->releaseMemoryOnTheFly();
+    Changed |= FPP->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+/// Add RequiredPass into list of lower level passes required by pass P.
+/// RequiredPass is run on the fly by Pass Manager when P requests it
+/// through getAnalysis interface.
+void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
+  assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
+         "Unable to handle Pass that requires lower level Analysis pass");
+  assert((P->getPotentialPassManagerType() <
+          RequiredPass->getPotentialPassManagerType()) &&
+         "Unable to handle Pass that requires lower level Analysis pass");
+
+  FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
+  if (!FPP) {
+    FPP = new FunctionPassManagerImpl();
+    // FPP is the top level manager.
+    FPP->setTopLevelManager(FPP);
+
+    OnTheFlyManagers[P] = FPP;
+  }
+  FPP->add(RequiredPass);
+
+  // Register P as the last user of RequiredPass.
+  if (RequiredPass) {
+    SmallVector<Pass *, 1> LU;
+    LU.push_back(RequiredPass);
+    FPP->setLastUser(LU,  P);
+  }
+}
+
+/// Return function pass corresponding to PassInfo PI, that is
+/// required by module pass MP. Instantiate analysis pass, by using
+/// its runOnFunction() for function F.
+Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
+  FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
+  assert(FPP && "Unable to find on the fly pass");
+
+  FPP->releaseMemoryOnTheFly();
+  FPP->run(F);
+  return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
+}
+
+
+//===----------------------------------------------------------------------===//
+// PassManagerImpl implementation
+
+//
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManagerImpl::run(Module &M) {
+  bool Changed = false;
+  TimingInfo::createTheTimeInfo();
+
+  dumpArguments();
+  dumpPasses();
+
+  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doInitialization(M);
+  }
+
+  initializeAllAnalysisInfo();
+  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+    Changed |= getContainedManager(Index)->runOnModule(M);
+
+  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
+       E = IPV.end(); I != E; ++I) {
+    Changed |= (*I)->doFinalization(M);
+  }
+
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// PassManager implementation
+
+/// Create new pass manager
+PassManager::PassManager() {
+  PM = new PassManagerImpl();
+  // PM is the top level manager
+  PM->setTopLevelManager(PM);
+}
+
+PassManager::~PassManager() {
+  delete PM;
+}
+
+/// add - Add a pass to the queue of passes to run.  This passes ownership of
+/// the Pass to the PassManager.  When the PassManager is destroyed, the pass
+/// will be destroyed as well, so there is no need to delete the pass.  This
+/// implies that all passes MUST be allocated with 'new'.
+void PassManager::add(Pass *P) {
+  PM->add(P);
+}
+
+/// run - Execute all of the passes scheduled for execution.  Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManager::run(Module &M) {
+  return PM->run(M);
+}
+
+//===----------------------------------------------------------------------===//
+// TimingInfo implementation
+
+bool llvm::TimePassesIsEnabled = false;
+static cl::opt<bool,true>
+EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
+            cl::desc("Time each pass, printing elapsed time for each on exit"));
+
+// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
+// a non null value (if the -time-passes option is enabled) or it leaves it
+// null.  It may be called multiple times.
+void TimingInfo::createTheTimeInfo() {
+  if (!TimePassesIsEnabled || TheTimeInfo) return;
+
+  // Constructed the first time this is called, iff -time-passes is enabled.
+  // This guarantees that the object will be constructed before static globals,
+  // thus it will be destroyed before them.
+  static ManagedStatic<TimingInfo> TTI;
+  TheTimeInfo = &*TTI;
+}
+
+/// If TimingInfo is enabled then start pass timer.
+Timer *llvm::getPassTimer(Pass *P) {
+  if (TheTimeInfo)
+    return TheTimeInfo->getPassTimer(P);
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// PMStack implementation
+//
+
+// Pop Pass Manager from the stack and clear its analysis info.
+void PMStack::pop() {
+
+  PMDataManager *Top = this->top();
+  Top->initializeAnalysisInfo();
+
+  S.pop_back();
+}
+
+// Push PM on the stack and set its top level manager.
+void PMStack::push(PMDataManager *PM) {
+  assert(PM && "Unable to push. Pass Manager expected");
+  assert(PM->getDepth()==0 && "Pass Manager depth set too early");
+
+  if (!this->empty()) {
+    assert(PM->getPassManagerType() > this->top()->getPassManagerType()
+           && "pushing bad pass manager to PMStack");
+    PMTopLevelManager *TPM = this->top()->getTopLevelManager();
+
+    assert(TPM && "Unable to find top level manager");
+    TPM->addIndirectPassManager(PM);
+    PM->setTopLevelManager(TPM);
+    PM->setDepth(this->top()->getDepth()+1);
+  } else {
+    assert((PM->getPassManagerType() == PMT_ModulePassManager
+           || PM->getPassManagerType() == PMT_FunctionPassManager)
+           && "pushing bad pass manager to PMStack");
+    PM->setDepth(1);
+  }
+
+  S.push_back(PM);
+}
+
+// Dump content of the pass manager stack.
+void PMStack::dump() const {
+  for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
+         E = S.end(); I != E; ++I)
+    dbgs() << (*I)->getAsPass()->getPassName() << ' ';
+
+  if (!S.empty())
+    dbgs() << '\n';
+}
+
+/// Find appropriate Module Pass Manager in the PM Stack and
+/// add self into that manager.
+void ModulePass::assignPassManager(PMStack &PMS,
+                                   PassManagerType PreferredType) {
+  // Find Module Pass Manager
+  while (!PMS.empty()) {
+    PassManagerType TopPMType = PMS.top()->getPassManagerType();
+    if (TopPMType == PreferredType)
+      break; // We found desired pass manager
+    else if (TopPMType > PMT_ModulePassManager)
+      PMS.pop();    // Pop children pass managers
+    else
+      break;
+  }
+  assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
+  PMS.top()->add(this);
+}
+
+/// Find appropriate Function Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void FunctionPass::assignPassManager(PMStack &PMS,
+                                     PassManagerType PreferredType) {
+
+  // Find Function Pass Manager
+  while (!PMS.empty()) {
+    if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
+      PMS.pop();
+    else
+      break;
+  }
+
+  // Create new Function Pass Manager if needed.
+  FPPassManager *FPP;
+  if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) {
+    FPP = (FPPassManager *)PMS.top();
+  } else {
+    assert(!PMS.empty() && "Unable to create Function Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Function Pass Manager
+    FPP = new FPPassManager();
+    FPP->populateInheritedAnalysis(PMS);
+
+    // [2] Set up new manager's top level manager
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(FPP);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    FPP->assignPassManager(PMS, PMD->getPassManagerType());
+
+    // [4] Push new manager into PMS
+    PMS.push(FPP);
+  }
+
+  // Assign FPP as the manager of this pass.
+  FPP->add(this);
+}
+
+/// Find appropriate Basic Pass Manager or Call Graph Pass Manager
+/// in the PM Stack and add self into that manager.
+void BasicBlockPass::assignPassManager(PMStack &PMS,
+                                       PassManagerType PreferredType) {
+  BBPassManager *BBP;
+
+  // Basic Pass Manager is a leaf pass manager. It does not handle
+  // any other pass manager.
+  if (!PMS.empty() &&
+      PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
+    BBP = (BBPassManager *)PMS.top();
+  } else {
+    // If leaf manager is not Basic Block Pass manager then create new
+    // basic Block Pass manager.
+    assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
+    PMDataManager *PMD = PMS.top();
+
+    // [1] Create new Basic Block Manager
+    BBP = new BBPassManager();
+
+    // [2] Set up new manager's top level manager
+    // Basic Block Pass Manager does not live by itself
+    PMTopLevelManager *TPM = PMD->getTopLevelManager();
+    TPM->addIndirectPassManager(BBP);
+
+    // [3] Assign manager to manage this new manager. This may create
+    // and push new managers into PMS
+    BBP->assignPassManager(PMS, PreferredType);
+
+    // [4] Push new manager into PMS
+    PMS.push(BBP);
+  }
+
+  // Assign BBP as the manager of this pass.
+  BBP->add(this);
+}
+
+PassManagerBase::~PassManagerBase() {}
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 6a6b7af..a32d25c 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -65,7 +65,7 @@ class MDNodeOperand : public CallbackVH {
 
 public:
   MDNodeOperand(Value *V) : CallbackVH(V) {}
-  ~MDNodeOperand() {}
+  virtual ~MDNodeOperand();
 
   void set(Value *V) {
     unsigned IsFirst = this->getValPtrInt();
@@ -82,6 +82,8 @@ public:
 };
 } // end namespace llvm.
 
+// Provide out-of-line definition to prevent weak vtable.
+MDNodeOperand::~MDNodeOperand() {}
 
 void MDNodeOperand::deleted() {
   getParent()->replaceOperand(this, 0);
@@ -422,7 +424,7 @@ static bool canBeMerged(const ConstantRange &A, const ConstantRange &B) {
   return !A.intersectWith(B).isEmptySet() || isContiguous(A, B);
 }
 
-static bool tryMergeRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+static bool tryMergeRange(SmallVectorImpl<Value *> &EndPoints, ConstantInt *Low,
                           ConstantInt *High) {
   ConstantRange NewRange(Low->getValue(), High->getValue());
   unsigned Size = EndPoints.size();
@@ -439,7 +441,7 @@ static bool tryMergeRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
   return false;
 }
 
-static void addRange(SmallVector<Value*, 4> &EndPoints, ConstantInt *Low,
+static void addRange(SmallVectorImpl<Value *> &EndPoints, ConstantInt *Low,
                      ConstantInt *High) {
   if (!EndPoints.empty())
     if (tryMergeRange(EndPoints, Low, High))
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 8affcc9..4f240c7 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -168,23 +168,6 @@ Constant *Module::getOrInsertFunction(StringRef Name,
   return F;
 }
 
-Constant *Module::getOrInsertTargetIntrinsic(StringRef Name,
-                                             FunctionType *Ty,
-                                             AttributeSet AttributeList) {
-  // See if we have a definition for the specified function already.
-  GlobalValue *F = getNamedValue(Name);
-  if (F == 0) {
-    // Nope, add it
-    Function *New = Function::Create(Ty, GlobalVariable::ExternalLinkage, Name);
-    New->setAttributes(AttributeList);
-    FunctionList.push_back(New);
-    return New; // Return the new prototype.
-  }
-
-  // Otherwise, we just found the existing function or a prototype.
-  return F;
-}
-
 Constant *Module::getOrInsertFunction(StringRef Name,
                                       FunctionType *Ty) {
   return getOrInsertFunction(Name, Ty, AttributeSet());
@@ -250,8 +233,7 @@ Function *Module::getFunction(StringRef Name) const {
 /// If AllowLocal is set to true, this function will return types that
 /// have an local. By default, these types are not returned.
 ///
-GlobalVariable *Module::getGlobalVariable(StringRef Name,
-                                          bool AllowLocal) const {
+GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
   if (GlobalVariable *Result =
       dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
     if (AllowLocal || !Result->hasLocalLinkage())
@@ -263,7 +245,7 @@ GlobalVariable *Module::getGlobalVariable(StringRef Name,
 ///   1. If it does not exist, add a declaration of the global and return it.
 ///   2. Else, the global exists but has the wrong type: return the function
 ///      with a constantexpr cast to the right type.
-///   3. Finally, if the existing global is the correct delclaration, return the
+///   3. Finally, if the existing global is the correct declaration, return the
 ///      existing global.
 Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
   // See if we have a definition for the specified global already.
@@ -278,8 +260,10 @@ Constant *Module::getOrInsertGlobal(StringRef Name, Type *Ty) {
 
   // If the variable exists but has the wrong type, return a bitcast to the
   // right type.
-  if (GV->getType() != PointerType::getUnqual(Ty))
-    return ConstantExpr::getBitCast(GV, PointerType::getUnqual(Ty));
+  Type *GVTy = GV->getType();
+  PointerType *PTy = PointerType::get(Ty, GVTy->getPointerAddressSpace());
+  if (GVTy != PTy)
+    return ConstantExpr::getBitCast(GV, PTy);
 
   // Otherwise, we just found the existing function or a prototype.
   return GV;
@@ -334,12 +318,30 @@ getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
 
   for (unsigned i = 0, e = ModFlags->getNumOperands(); i != e; ++i) {
     MDNode *Flag = ModFlags->getOperand(i);
-    ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
-    MDString *Key = cast<MDString>(Flag->getOperand(1));
-    Value *Val = Flag->getOperand(2);
-    Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
-                                    Key, Val));
+    if (Flag->getNumOperands() >= 3 && isa<ConstantInt>(Flag->getOperand(0)) &&
+        isa<MDString>(Flag->getOperand(1))) {
+      // Check the operands of the MDNode before accessing the operands.
+      // The verifier will actually catch these failures.
+      ConstantInt *Behavior = cast<ConstantInt>(Flag->getOperand(0));
+      MDString *Key = cast<MDString>(Flag->getOperand(1));
+      Value *Val = Flag->getOperand(2);
+      Flags.push_back(ModuleFlagEntry(ModFlagBehavior(Behavior->getZExtValue()),
+                                      Key, Val));
+    }
+  }
+}
+
+/// Return the corresponding value if Key appears in module flags, otherwise
+/// return null.
+Value *Module::getModuleFlag(StringRef Key) const {
+  SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+  getModuleFlagsMetadata(ModuleFlags);
+  for (unsigned I = 0, E = ModuleFlags.size(); I < E; ++I) {
+    const ModuleFlagEntry &MFE = ModuleFlags[I];
+    if (Key == MFE.Key->getString())
+      return MFE.Val;
   }
+  return 0;
 }
 
 /// getModuleFlagsMetadata - Returns the NamedMDNode in the module that
@@ -404,9 +406,15 @@ bool Module::isDematerializable(const GlobalValue *GV) const {
 }
 
 bool Module::Materialize(GlobalValue *GV, std::string *ErrInfo) {
-  if (Materializer)
-    return Materializer->Materialize(GV, ErrInfo);
-  return false;
+  if (!Materializer)
+    return false;
+
+  error_code EC = Materializer->Materialize(GV);
+  if (!EC)
+    return false;
+  if (ErrInfo)
+    *ErrInfo = EC.message();
+  return true;
 }
 
 void Module::Dematerialize(GlobalValue *GV) {
@@ -417,7 +425,12 @@ void Module::Dematerialize(GlobalValue *GV) {
 bool Module::MaterializeAll(std::string *ErrInfo) {
   if (!Materializer)
     return false;
-  return Materializer->MaterializeModule(this, ErrInfo);
+  error_code EC = Materializer->MaterializeModule(this);
+  if (!EC)
+    return false;
+  if (ErrInfo)
+    *ErrInfo = EC.message();
+  return true;
 }
 
 bool Module::MaterializeAllPermanently(std::string *ErrInfo) {
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index 387094a..966af7d 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -1,4 +1,4 @@
-//===- PassManager.cpp - LLVM Pass Infrastructure Implementation ----------===//
+//===- PassManager.h - Infrastructure for managing & running IR passes ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,1907 +6,152 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file implements the LLVM Pass Manager infrastructure.
-//
-//===----------------------------------------------------------------------===//
 
+#include "llvm/IR/PassManager.h"
+#include "llvm/ADT/STLExtras.h"
 
-#include "llvm/PassManagers.h"
-#include "llvm/Assembly/PrintModulePass.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/IR/Module.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/PassNameParser.h"
-#include "llvm/Support/Timer.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <map>
 using namespace llvm;
 
-// See PassManagers.h for Pass Manager infrastructure overview.
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-// Pass debugging information.  Often it is useful to find out what pass is
-// running when a crash occurs in a utility.  When this library is compiled with
-// debugging on, a command line option (--debug-pass) is enabled that causes the
-// pass name to be printed before it executes.
-//
-
-// Different debug levels that can be enabled...
-enum PassDebugLevel {
-  Disabled, Arguments, Structure, Executions, Details
-};
-
-static cl::opt<enum PassDebugLevel>
-PassDebugging("debug-pass", cl::Hidden,
-                  cl::desc("Print PassManager debugging information"),
-                  cl::values(
-  clEnumVal(Disabled  , "disable debug output"),
-  clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
-  clEnumVal(Structure , "print pass structure before run()"),
-  clEnumVal(Executions, "print pass name before it is executed"),
-  clEnumVal(Details   , "print pass details when it is executed"),
-                             clEnumValEnd));
-
-typedef llvm::cl::list<const llvm::PassInfo *, bool, PassNameParser>
-PassOptionList;
-
-// Print IR out before/after specified passes.
-static PassOptionList
-PrintBefore("print-before",
-            llvm::cl::desc("Print IR before specified passes"),
-            cl::Hidden);
-
-static PassOptionList
-PrintAfter("print-after",
-           llvm::cl::desc("Print IR after specified passes"),
-           cl::Hidden);
-
-static cl::opt<bool>
-PrintBeforeAll("print-before-all",
-               llvm::cl::desc("Print IR before each pass"),
-               cl::init(false));
-static cl::opt<bool>
-PrintAfterAll("print-after-all",
-              llvm::cl::desc("Print IR after each pass"),
-              cl::init(false));
-
-/// This is a helper to determine whether to print IR before or
-/// after a pass.
-
-static bool ShouldPrintBeforeOrAfterPass(const PassInfo *PI,
-                                         PassOptionList &PassesToPrint) {
-  for (unsigned i = 0, ie = PassesToPrint.size(); i < ie; ++i) {
-    const llvm::PassInfo *PassInf = PassesToPrint[i];
-    if (PassInf)
-      if (PassInf->getPassArgument() == PI->getPassArgument()) {
-        return true;
-      }
-  }
-  return false;
-}
-
-/// This is a utility to check whether a pass should have IR dumped
-/// before it.
-static bool ShouldPrintBeforePass(const PassInfo *PI) {
-  return PrintBeforeAll || ShouldPrintBeforeOrAfterPass(PI, PrintBefore);
-}
-
-/// This is a utility to check whether a pass should have IR dumped
-/// after it.
-static bool ShouldPrintAfterPass(const PassInfo *PI) {
-  return PrintAfterAll || ShouldPrintBeforeOrAfterPass(PI, PrintAfter);
-}
-
-} // End of llvm namespace
-
-/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
-/// or higher is specified.
-bool PMDataManager::isPassDebuggingExecutionsOrMore() const {
-  return PassDebugging >= Executions;
-}
-
-
-
-
-void PassManagerPrettyStackEntry::print(raw_ostream &OS) const {
-  if (V == 0 && M == 0)
-    OS << "Releasing pass '";
-  else
-    OS << "Running pass '";
-
-  OS << P->getPassName() << "'";
-
-  if (M) {
-    OS << " on module '" << M->getModuleIdentifier() << "'.\n";
-    return;
-  }
-  if (V == 0) {
-    OS << '\n';
-    return;
-  }
-
-  OS << " on ";
-  if (isa<Function>(V))
-    OS << "function";
-  else if (isa<BasicBlock>(V))
-    OS << "basic block";
-  else
-    OS << "value";
-
-  OS << " '";
-  WriteAsOperand(OS, V, /*PrintTy=*/false, M);
-  OS << "'\n";
+void ModulePassManager::run() {
+  for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx)
+    if (Passes[Idx]->run(M))
+      if (AM) AM->invalidateAll(M);
 }
 
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-// BBPassManager
-//
-/// BBPassManager manages BasicBlockPass. It batches all the
-/// pass together and sequence them to process one basic block before
-/// processing next basic block.
-class BBPassManager : public PMDataManager, public FunctionPass {
-
-public:
-  static char ID;
-  explicit BBPassManager()
-    : PMDataManager(), FunctionPass(ID) {}
-
-  /// Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the function, and if so, return true.
-  bool runOnFunction(Function &F);
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  bool doInitialization(Module &M);
-  bool doInitialization(Function &F);
-  bool doFinalization(Module &M);
-  bool doFinalization(Function &F);
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-
-  virtual const char *getPassName() const {
-    return "BasicBlock Pass Manager";
-  }
-
-  // Print passes managed by this manager
-  void dumpPassStructure(unsigned Offset) {
-    llvm::dbgs().indent(Offset*2) << "BasicBlockPass Manager\n";
-    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-      BasicBlockPass *BP = getContainedPass(Index);
-      BP->dumpPassStructure(Offset + 1);
-      dumpLastUses(BP, Offset+1);
-    }
-  }
-
-  BasicBlockPass *getContainedPass(unsigned N) {
-    assert(N < PassVector.size() && "Pass number out of range!");
-    BasicBlockPass *BP = static_cast<BasicBlockPass *>(PassVector[N]);
-    return BP;
-  }
-
-  virtual PassManagerType getPassManagerType() const {
-    return PMT_BasicBlockPassManager;
-  }
-};
-
-char BBPassManager::ID = 0;
-}
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-// FunctionPassManagerImpl
-//
-/// FunctionPassManagerImpl manages FPPassManagers
-class FunctionPassManagerImpl : public Pass,
-                                public PMDataManager,
-                                public PMTopLevelManager {
-  virtual void anchor();
-private:
-  bool wasRun;
-public:
-  static char ID;
-  explicit FunctionPassManagerImpl() :
-    Pass(PT_PassManager, ID), PMDataManager(),
-    PMTopLevelManager(new FPPassManager()), wasRun(false) {}
-
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
-  void add(Pass *P) {
-    schedulePass(P);
-  }
-
-  /// createPrinterPass - Get a function printer pass.
-  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
-    return createPrintFunctionPass(Banner, &O);
-  }
-
-  // Prepare for running an on the fly pass, freeing memory if needed
-  // from a previous run.
-  void releaseMemoryOnTheFly();
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool run(Function &F);
-
-  /// doInitialization - Run all of the initializers for the function passes.
-  ///
-  bool doInitialization(Module &M);
-
-  /// doFinalization - Run all of the finalizers for the function passes.
-  ///
-  bool doFinalization(Module &M);
-
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-  virtual PassManagerType getTopLevelPassManagerType() {
-    return PMT_FunctionPassManager;
-  }
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  FPPassManager *getContainedManager(unsigned N) {
-    assert(N < PassManagers.size() && "Pass number out of range!");
-    FPPassManager *FP = static_cast<FPPassManager *>(PassManagers[N]);
-    return FP;
-  }
-};
-
-void FunctionPassManagerImpl::anchor() {}
-
-char FunctionPassManagerImpl::ID = 0;
-
-//===----------------------------------------------------------------------===//
-// MPPassManager
-//
-/// MPPassManager manages ModulePasses and function pass managers.
-/// It batches all Module passes and function pass managers together and
-/// sequences them to process one module.
-class MPPassManager : public Pass, public PMDataManager {
-public:
-  static char ID;
-  explicit MPPassManager() :
-    Pass(PT_PassManager, ID), PMDataManager() { }
-
-  // Delete on the fly managers.
-  virtual ~MPPassManager() {
-    for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
-           I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
-         I != E; ++I) {
-      FunctionPassManagerImpl *FPP = I->second;
-      delete FPP;
-    }
-  }
-
-  /// createPrinterPass - Get a module printer pass.
-  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
-    return createPrintModulePass(&O, false, Banner);
-  }
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool runOnModule(Module &M);
-
-  using llvm::Pass::doInitialization;
-  using llvm::Pass::doFinalization;
-
-  /// doInitialization - Run all of the initializers for the module passes.
-  ///
-  bool doInitialization();
-
-  /// doFinalization - Run all of the finalizers for the module passes.
-  ///
-  bool doFinalization();
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  /// Add RequiredPass into list of lower level passes required by pass P.
-  /// RequiredPass is run on the fly by Pass Manager when P requests it
-  /// through getAnalysis interface.
-  virtual void addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass);
-
-  /// Return function pass corresponding to PassInfo PI, that is
-  /// required by module pass MP. Instantiate analysis pass, by using
-  /// its runOnFunction() for function F.
-  virtual Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F);
-
-  virtual const char *getPassName() const {
-    return "Module Pass Manager";
-  }
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-
-  // Print passes managed by this manager
-  void dumpPassStructure(unsigned Offset) {
-    llvm::dbgs().indent(Offset*2) << "ModulePass Manager\n";
-    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-      ModulePass *MP = getContainedPass(Index);
-      MP->dumpPassStructure(Offset + 1);
-      std::map<Pass *, FunctionPassManagerImpl *>::const_iterator I =
-        OnTheFlyManagers.find(MP);
-      if (I != OnTheFlyManagers.end())
-        I->second->dumpPassStructure(Offset + 2);
-      dumpLastUses(MP, Offset+1);
-    }
-  }
-
-  ModulePass *getContainedPass(unsigned N) {
-    assert(N < PassVector.size() && "Pass number out of range!");
-    return static_cast<ModulePass *>(PassVector[N]);
-  }
-
-  virtual PassManagerType getPassManagerType() const {
-    return PMT_ModulePassManager;
-  }
-
- private:
-  /// Collection of on the fly FPPassManagers. These managers manage
-  /// function passes that are required by module passes.
-  std::map<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
-};
-
-char MPPassManager::ID = 0;
-//===----------------------------------------------------------------------===//
-// PassManagerImpl
-//
-
-/// PassManagerImpl manages MPPassManagers
-class PassManagerImpl : public Pass,
-                        public PMDataManager,
-                        public PMTopLevelManager {
-  virtual void anchor();
-
-public:
-  static char ID;
-  explicit PassManagerImpl() :
-    Pass(PT_PassManager, ID), PMDataManager(),
-                              PMTopLevelManager(new MPPassManager()) {}
-
-  /// add - Add a pass to the queue of passes to run.  This passes ownership of
-  /// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-  /// will be destroyed as well, so there is no need to delete the pass.  This
-  /// implies that all passes MUST be allocated with 'new'.
-  void add(Pass *P) {
-    schedulePass(P);
-  }
-
-  /// createPrinterPass - Get a module printer pass.
-  Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const {
-    return createPrintModulePass(&O, false, Banner);
-  }
-
-  /// run - Execute all of the passes scheduled for execution.  Keep track of
-  /// whether any of the passes modifies the module, and if so, return true.
-  bool run(Module &M);
-
-  using llvm::Pass::doInitialization;
-  using llvm::Pass::doFinalization;
-
-  /// doInitialization - Run all of the initializers for the module passes.
-  ///
-  bool doInitialization();
-
-  /// doFinalization - Run all of the finalizers for the module passes.
-  ///
-  bool doFinalization();
-
-  /// Pass Manager itself does not invalidate any analysis info.
-  void getAnalysisUsage(AnalysisUsage &Info) const {
-    Info.setPreservesAll();
-  }
-
-  virtual PMDataManager *getAsPMDataManager() { return this; }
-  virtual Pass *getAsPass() { return this; }
-  virtual PassManagerType getTopLevelPassManagerType() {
-    return PMT_ModulePassManager;
-  }
-
-  MPPassManager *getContainedManager(unsigned N) {
-    assert(N < PassManagers.size() && "Pass number out of range!");
-    MPPassManager *MP = static_cast<MPPassManager *>(PassManagers[N]);
-    return MP;
-  }
-};
-
-void PassManagerImpl::anchor() {}
-
-char PassManagerImpl::ID = 0;
-} // End of llvm namespace
-
-namespace {
-
-//===----------------------------------------------------------------------===//
-/// TimingInfo Class - This class is used to calculate information about the
-/// amount of time each pass takes to execute.  This only happens when
-/// -time-passes is enabled on the command line.
-///
-
-static ManagedStatic<sys::SmartMutex<true> > TimingInfoMutex;
-
-class TimingInfo {
-  DenseMap<Pass*, Timer*> TimingData;
-  TimerGroup TG;
-public:
-  // Use 'create' member to get this.
-  TimingInfo() : TG("... Pass execution timing report ...") {}
-
-  // TimingDtor - Print out information about timing information
-  ~TimingInfo() {
-    // Delete all of the timers, which accumulate their info into the
-    // TimerGroup.
-    for (DenseMap<Pass*, Timer*>::iterator I = TimingData.begin(),
-         E = TimingData.end(); I != E; ++I)
-      delete I->second;
-    // TimerGroup is deleted next, printing the report.
-  }
-
-  // createTheTimeInfo - This method either initializes the TheTimeInfo pointer
-  // to a non null value (if the -time-passes option is enabled) or it leaves it
-  // null.  It may be called multiple times.
-  static void createTheTimeInfo();
-
-  /// getPassTimer - Return the timer for the specified pass if it exists.
-  Timer *getPassTimer(Pass *P) {
-    if (P->getAsPMDataManager())
-      return 0;
-
-    sys::SmartScopedLock<true> Lock(*TimingInfoMutex);
-    Timer *&T = TimingData[P];
-    if (T == 0)
-      T = new Timer(P->getPassName(), TG);
-    return T;
-  }
-};
-
-} // End of anon namespace
-
-static TimingInfo *TheTimeInfo;
-
-//===----------------------------------------------------------------------===//
-// PMTopLevelManager implementation
-
-/// Initialize top level manager. Create first pass manager.
-PMTopLevelManager::PMTopLevelManager(PMDataManager *PMDM) {
-  PMDM->setTopLevelManager(this);
-  addPassManager(PMDM);
-  activeStack.push(PMDM);
-}
-
-/// Set pass P as the last user of the given analysis passes.
-void
-PMTopLevelManager::setLastUser(ArrayRef<Pass*> AnalysisPasses, Pass *P) {
-  unsigned PDepth = 0;
-  if (P->getResolver())
-    PDepth = P->getResolver()->getPMDataManager().getDepth();
-
-  for (SmallVectorImpl<Pass *>::const_iterator I = AnalysisPasses.begin(),
-         E = AnalysisPasses.end(); I != E; ++I) {
-    Pass *AP = *I;
-    LastUser[AP] = P;
-
-    if (P == AP)
-      continue;
-
-    // Update the last users of passes that are required transitive by AP.
-    AnalysisUsage *AnUsage = findAnalysisUsage(AP);
-    const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
-    SmallVector<Pass *, 12> LastUses;
-    SmallVector<Pass *, 12> LastPMUses;
-    for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
-         E = IDs.end(); I != E; ++I) {
-      Pass *AnalysisPass = findAnalysisPass(*I);
-      assert(AnalysisPass && "Expected analysis pass to exist.");
-      AnalysisResolver *AR = AnalysisPass->getResolver();
-      assert(AR && "Expected analysis resolver to exist.");
-      unsigned APDepth = AR->getPMDataManager().getDepth();
-
-      if (PDepth == APDepth)
-        LastUses.push_back(AnalysisPass);
-      else if (PDepth > APDepth)
-        LastPMUses.push_back(AnalysisPass);
-    }
-
-    setLastUser(LastUses, P);
-
-    // If this pass has a corresponding pass manager, push higher level
-    // analysis to this pass manager.
-    if (P->getResolver())
-      setLastUser(LastPMUses, P->getResolver()->getPMDataManager().getAsPass());
-
-
-    // If AP is the last user of other passes then make P last user of
-    // such passes.
-    for (DenseMap<Pass *, Pass *>::iterator LUI = LastUser.begin(),
-           LUE = LastUser.end(); LUI != LUE; ++LUI) {
-      if (LUI->second == AP)
-        // DenseMap iterator is not invalidated here because
-        // this is just updating existing entries.
-        LastUser[LUI->first] = P;
-    }
-  }
-}
-
-/// Collect passes whose last user is P
-void PMTopLevelManager::collectLastUses(SmallVectorImpl<Pass *> &LastUses,
-                                        Pass *P) {
-  DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator DMI =
-    InversedLastUser.find(P);
-  if (DMI == InversedLastUser.end())
-    return;
-
-  SmallPtrSet<Pass *, 8> &LU = DMI->second;
-  for (SmallPtrSet<Pass *, 8>::iterator I = LU.begin(),
-         E = LU.end(); I != E; ++I) {
-    LastUses.push_back(*I);
-  }
-
-}
-
-AnalysisUsage *PMTopLevelManager::findAnalysisUsage(Pass *P) {
-  AnalysisUsage *AnUsage = NULL;
-  DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.find(P);
-  if (DMI != AnUsageMap.end())
-    AnUsage = DMI->second;
-  else {
-    AnUsage = new AnalysisUsage();
-    P->getAnalysisUsage(*AnUsage);
-    AnUsageMap[P] = AnUsage;
-  }
-  return AnUsage;
-}
-
-/// Schedule pass P for execution. Make sure that passes required by
-/// P are run before P is run. Update analysis info maintained by
-/// the manager. Remove dead passes. This is a recursive function.
-void PMTopLevelManager::schedulePass(Pass *P) {
-
-  // TODO : Allocate function manager for this pass, other wise required set
-  // may be inserted into previous function manager
-
-  // Give pass a chance to prepare the stage.
-  P->preparePassManager(activeStack);
-
-  // If P is an analysis pass and it is available then do not
-  // generate the analysis again. Stale analysis info should not be
-  // available at this point.
-  const PassInfo *PI =
-    PassRegistry::getPassRegistry()->getPassInfo(P->getPassID());
-  if (PI && PI->isAnalysis() && findAnalysisPass(P->getPassID())) {
-    delete P;
-    return;
-  }
-
-  AnalysisUsage *AnUsage = findAnalysisUsage(P);
-
-  bool checkAnalysis = true;
-  while (checkAnalysis) {
-    checkAnalysis = false;
-
-    const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
-    for (AnalysisUsage::VectorType::const_iterator I = RequiredSet.begin(),
-           E = RequiredSet.end(); I != E; ++I) {
-
-      Pass *AnalysisPass = findAnalysisPass(*I);
-      if (!AnalysisPass) {
-        const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
-
-        if (PI == NULL) {
-          // Pass P is not in the global PassRegistry
-          dbgs() << "Pass '"  << P->getPassName() << "' is not initialized." << "\n";
-          dbgs() << "Verify if there is a pass dependency cycle." << "\n";
-          dbgs() << "Required Passes:" << "\n";
-          for (AnalysisUsage::VectorType::const_iterator I2 = RequiredSet.begin(),
-                 E = RequiredSet.end(); I2 != E && I2 != I; ++I2) {
-            Pass *AnalysisPass2 = findAnalysisPass(*I2);
-            if (AnalysisPass2) {
-              dbgs() << "\t" << AnalysisPass2->getPassName() << "\n";
-            } else {
-              dbgs() << "\t"   << "Error: Required pass not found! Possible causes:"  << "\n";
-              dbgs() << "\t\t" << "- Pass misconfiguration (e.g.: missing macros)"    << "\n";
-              dbgs() << "\t\t" << "- Corruption of the global PassRegistry"           << "\n";
-            }
-          }
-        }
-
-        assert(PI && "Expected required passes to be initialized");
-        AnalysisPass = PI->createPass();
-        if (P->getPotentialPassManagerType () ==
-            AnalysisPass->getPotentialPassManagerType())
-          // Schedule analysis pass that is managed by the same pass manager.
-          schedulePass(AnalysisPass);
-        else if (P->getPotentialPassManagerType () >
-                 AnalysisPass->getPotentialPassManagerType()) {
-          // Schedule analysis pass that is managed by a new manager.
-          schedulePass(AnalysisPass);
-          // Recheck analysis passes to ensure that required analyses that
-          // are already checked are still available.
-          checkAnalysis = true;
-        } else
-          // Do not schedule this analysis. Lower level analsyis
-          // passes are run on the fly.
-          delete AnalysisPass;
+bool FunctionPassManager::run(Module *M) {
+  bool Changed = false;
+  for (Module::iterator I = M->begin(), E = M->end(); I != E; ++I)
+    for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx)
+      if (Passes[Idx]->run(I)) {
+        Changed = true;
+        if (AM) AM->invalidateAll(I);
       }
-    }
-  }
-
-  // Now all required passes are available.
-  if (ImmutablePass *IP = P->getAsImmutablePass()) {
-    // P is a immutable pass and it will be managed by this
-    // top level manager. Set up analysis resolver to connect them.
-    PMDataManager *DM = getAsPMDataManager();
-    AnalysisResolver *AR = new AnalysisResolver(*DM);
-    P->setResolver(AR);
-    DM->initializeAnalysisImpl(P);
-    addImmutablePass(IP);
-    DM->recordAvailableAnalysis(IP);
-    return;
-  }
-
-  if (PI && !PI->isAnalysis() && ShouldPrintBeforePass(PI)) {
-    Pass *PP = P->createPrinterPass(
-      dbgs(), std::string("*** IR Dump Before ") + P->getPassName() + " ***");
-    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
-  }
-
-  // Add the requested pass to the best available pass manager.
-  P->assignPassManager(activeStack, getTopLevelPassManagerType());
-
-  if (PI && !PI->isAnalysis() && ShouldPrintAfterPass(PI)) {
-    Pass *PP = P->createPrinterPass(
-      dbgs(), std::string("*** IR Dump After ") + P->getPassName() + " ***");
-    PP->assignPassManager(activeStack, getTopLevelPassManagerType());
-  }
-}
-
-/// Find the pass that implements Analysis AID. Search immutable
-/// passes and all pass managers. If desired pass is not found
-/// then return NULL.
-Pass *PMTopLevelManager::findAnalysisPass(AnalysisID AID) {
-
-  // Check pass managers
-  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    if (Pass *P = (*I)->findAnalysisPass(AID, false))
-      return P;
-
-  // Check other pass managers
-  for (SmallVectorImpl<PMDataManager *>::iterator
-         I = IndirectPassManagers.begin(),
-         E = IndirectPassManagers.end(); I != E; ++I)
-    if (Pass *P = (*I)->findAnalysisPass(AID, false))
-      return P;
-
-  // Check the immutable passes. Iterate in reverse order so that we find
-  // the most recently registered passes first.
-  for (SmallVector<ImmutablePass *, 8>::reverse_iterator I =
-       ImmutablePasses.rbegin(), E = ImmutablePasses.rend(); I != E; ++I) {
-    AnalysisID PI = (*I)->getPassID();
-    if (PI == AID)
-      return *I;
-
-    // If Pass not found then check the interfaces implemented by Immutable Pass
-    const PassInfo *PassInf =
-      PassRegistry::getPassRegistry()->getPassInfo(PI);
-    assert(PassInf && "Expected all immutable passes to be initialized");
-    const std::vector<const PassInfo*> &ImmPI =
-      PassInf->getInterfacesImplemented();
-    for (std::vector<const PassInfo*>::const_iterator II = ImmPI.begin(),
-         EE = ImmPI.end(); II != EE; ++II) {
-      if ((*II)->getTypeInfo() == AID)
-        return *I;
-    }
-  }
-
-  return 0;
-}
-
-// Print passes managed by this top level manager.
-void PMTopLevelManager::dumpPasses() const {
-
-  if (PassDebugging < Structure)
-    return;
-
-  // Print out the immutable passes
-  for (unsigned i = 0, e = ImmutablePasses.size(); i != e; ++i) {
-    ImmutablePasses[i]->dumpPassStructure(0);
-  }
-
-  // Every class that derives from PMDataManager also derives from Pass
-  // (sometimes indirectly), but there's no inheritance relationship
-  // between PMDataManager and Pass, so we have to getAsPass to get
-  // from a PMDataManager* to a Pass*.
-  for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    (*I)->getAsPass()->dumpPassStructure(1);
-}
-
-void PMTopLevelManager::dumpArguments() const {
-
-  if (PassDebugging < Arguments)
-    return;
-
-  dbgs() << "Pass Arguments: ";
-  for (SmallVector<ImmutablePass *, 8>::const_iterator I =
-       ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
-    if (const PassInfo *PI =
-        PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID())) {
-      assert(PI && "Expected all immutable passes to be initialized");
-      if (!PI->isAnalysisGroup())
-        dbgs() << " -" << PI->getPassArgument();
-    }
-  for (SmallVector<PMDataManager *, 8>::const_iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    (*I)->dumpPassArguments();
-  dbgs() << "\n";
+  return Changed;
 }
 
-void PMTopLevelManager::initializeAllAnalysisInfo() {
-  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    (*I)->initializeAnalysisInfo();
+void AnalysisManager::invalidateAll(Function *F) {
+  assert(F->getParent() == M && "Invalidating a function from another module!");
 
-  // Initailize other pass managers
-  for (SmallVectorImpl<PMDataManager *>::iterator
-       I = IndirectPassManagers.begin(), E = IndirectPassManagers.end();
+  // First invalidate any module results we still have laying about.
+  // FIXME: This is a total hack based on the fact that erasure doesn't
+  // invalidate iteration for DenseMap.
+  for (ModuleAnalysisResultMapT::iterator I = ModuleAnalysisResults.begin(),
+                                          E = ModuleAnalysisResults.end();
        I != E; ++I)
-    (*I)->initializeAnalysisInfo();
-
-  for (DenseMap<Pass *, Pass *>::iterator DMI = LastUser.begin(),
-        DME = LastUser.end(); DMI != DME; ++DMI) {
-    DenseMap<Pass *, SmallPtrSet<Pass *, 8> >::iterator InvDMI =
-      InversedLastUser.find(DMI->second);
-    if (InvDMI != InversedLastUser.end()) {
-      SmallPtrSet<Pass *, 8> &L = InvDMI->second;
-      L.insert(DMI->first);
+    if (I->second->invalidate(M))
+      ModuleAnalysisResults.erase(I);
+
+  // Now clear all the invalidated results associated specifically with this
+  // function.
+  SmallVector<void *, 8> InvalidatedPassIDs;
+  FunctionAnalysisResultListT &ResultsList = FunctionAnalysisResultLists[F];
+  for (FunctionAnalysisResultListT::iterator I = ResultsList.begin(),
+                                             E = ResultsList.end();
+       I != E;)
+    if (I->second->invalidate(F)) {
+      InvalidatedPassIDs.push_back(I->first);
+      I = ResultsList.erase(I);
     } else {
-      SmallPtrSet<Pass *, 8> L; L.insert(DMI->first);
-      InversedLastUser[DMI->second] = L;
+      ++I;
     }
-  }
-}
-
-/// Destructor
-PMTopLevelManager::~PMTopLevelManager() {
-  for (SmallVectorImpl<PMDataManager *>::iterator I = PassManagers.begin(),
-         E = PassManagers.end(); I != E; ++I)
-    delete *I;
-
-  for (SmallVectorImpl<ImmutablePass *>::iterator
-         I = ImmutablePasses.begin(), E = ImmutablePasses.end(); I != E; ++I)
-    delete *I;
-
-  for (DenseMap<Pass *, AnalysisUsage *>::iterator DMI = AnUsageMap.begin(),
-         DME = AnUsageMap.end(); DMI != DME; ++DMI)
-    delete DMI->second;
-}
-
-//===----------------------------------------------------------------------===//
-// PMDataManager implementation
-
-/// Augement AvailableAnalysis by adding analysis made available by pass P.
-void PMDataManager::recordAvailableAnalysis(Pass *P) {
-  AnalysisID PI = P->getPassID();
-
-  AvailableAnalysis[PI] = P;
-
-  assert(!AvailableAnalysis.empty());
-
-  // This pass is the current implementation of all of the interfaces it
-  // implements as well.
-  const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI);
-  if (PInf == 0) return;
-  const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
-  for (unsigned i = 0, e = II.size(); i != e; ++i)
-    AvailableAnalysis[II[i]->getTypeInfo()] = P;
+  while (!InvalidatedPassIDs.empty())
+    FunctionAnalysisResults.erase(
+        std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
 }
 
-// Return true if P preserves high level analysis used by other
-// passes managed by this manager
-bool PMDataManager::preserveHigherLevelAnalysis(Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  if (AnUsage->getPreservesAll())
-    return true;
-
-  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
-  for (SmallVectorImpl<Pass *>::iterator I = HigherLevelAnalysis.begin(),
-         E = HigherLevelAnalysis.end(); I  != E; ++I) {
-    Pass *P1 = *I;
-    if (P1->getAsImmutablePass() == 0 &&
-        std::find(PreservedSet.begin(), PreservedSet.end(),
-                  P1->getPassID()) ==
-           PreservedSet.end())
-      return false;
-  }
-
-  return true;
-}
-
-/// verifyPreservedAnalysis -- Verify analysis preserved by pass P.
-void PMDataManager::verifyPreservedAnalysis(Pass *P) {
-  // Don't do this unless assertions are enabled.
-#ifdef NDEBUG
-  return;
-#endif
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
-
-  // Verify preserved analysis
-  for (AnalysisUsage::VectorType::const_iterator I = PreservedSet.begin(),
-         E = PreservedSet.end(); I != E; ++I) {
-    AnalysisID AID = *I;
-    if (Pass *AP = findAnalysisPass(AID, true)) {
-      TimeRegion PassTimer(getPassTimer(AP));
-      AP->verifyAnalysis();
-    }
-  }
-}
-
-/// Remove Analysis not preserved by Pass P
-void PMDataManager::removeNotPreservedAnalysis(Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  if (AnUsage->getPreservesAll())
-    return;
-
-  const AnalysisUsage::VectorType &PreservedSet = AnUsage->getPreservedSet();
-  for (DenseMap<AnalysisID, Pass*>::iterator I = AvailableAnalysis.begin(),
-         E = AvailableAnalysis.end(); I != E; ) {
-    DenseMap<AnalysisID, Pass*>::iterator Info = I++;
-    if (Info->second->getAsImmutablePass() == 0 &&
-        std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
-        PreservedSet.end()) {
-      // Remove this analysis
-      if (PassDebugging >= Details) {
-        Pass *S = Info->second;
-        dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
-        dbgs() << S->getPassName() << "'\n";
-      }
-      AvailableAnalysis.erase(Info);
-    }
-  }
-
-  // Check inherited analysis also. If P is not preserving analysis
-  // provided by parent manager then remove it here.
-  for (unsigned Index = 0; Index < PMT_Last; ++Index) {
-
-    if (!InheritedAnalysis[Index])
-      continue;
-
-    for (DenseMap<AnalysisID, Pass*>::iterator
-           I = InheritedAnalysis[Index]->begin(),
-           E = InheritedAnalysis[Index]->end(); I != E; ) {
-      DenseMap<AnalysisID, Pass *>::iterator Info = I++;
-      if (Info->second->getAsImmutablePass() == 0 &&
-          std::find(PreservedSet.begin(), PreservedSet.end(), Info->first) ==
-             PreservedSet.end()) {
-        // Remove this analysis
-        if (PassDebugging >= Details) {
-          Pass *S = Info->second;
-          dbgs() << " -- '" <<  P->getPassName() << "' is not preserving '";
-          dbgs() << S->getPassName() << "'\n";
-        }
-        InheritedAnalysis[Index]->erase(Info);
-      }
-    }
-  }
-}
-
-/// Remove analysis passes that are not used any longer
-void PMDataManager::removeDeadPasses(Pass *P, StringRef Msg,
-                                     enum PassDebuggingString DBG_STR) {
-
-  SmallVector<Pass *, 12> DeadPasses;
-
-  // If this is a on the fly manager then it does not have TPM.
-  if (!TPM)
-    return;
-
-  TPM->collectLastUses(DeadPasses, P);
-
-  if (PassDebugging >= Details && !DeadPasses.empty()) {
-    dbgs() << " -*- '" <<  P->getPassName();
-    dbgs() << "' is the last user of following pass instances.";
-    dbgs() << " Free these instances\n";
-  }
-
-  for (SmallVectorImpl<Pass *>::iterator I = DeadPasses.begin(),
-         E = DeadPasses.end(); I != E; ++I)
-    freePass(*I, Msg, DBG_STR);
-}
-
-void PMDataManager::freePass(Pass *P, StringRef Msg,
-                             enum PassDebuggingString DBG_STR) {
-  dumpPassInfo(P, FREEING_MSG, DBG_STR, Msg);
-
-  {
-    // If the pass crashes releasing memory, remember this.
-    PassManagerPrettyStackEntry X(P);
-    TimeRegion PassTimer(getPassTimer(P));
-
-    P->releaseMemory();
-  }
-
-  AnalysisID PI = P->getPassID();
-  if (const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(PI)) {
-    // Remove the pass itself (if it is not already removed).
-    AvailableAnalysis.erase(PI);
-
-    // Remove all interfaces this pass implements, for which it is also
-    // listed as the available implementation.
-    const std::vector<const PassInfo*> &II = PInf->getInterfacesImplemented();
-    for (unsigned i = 0, e = II.size(); i != e; ++i) {
-      DenseMap<AnalysisID, Pass*>::iterator Pos =
-        AvailableAnalysis.find(II[i]->getTypeInfo());
-      if (Pos != AvailableAnalysis.end() && Pos->second == P)
-        AvailableAnalysis.erase(Pos);
-    }
-  }
-}
-
-/// Add pass P into the PassVector. Update
-/// AvailableAnalysis appropriately if ProcessAnalysis is true.
-void PMDataManager::add(Pass *P, bool ProcessAnalysis) {
-  // This manager is going to manage pass P. Set up analysis resolver
-  // to connect them.
-  AnalysisResolver *AR = new AnalysisResolver(*this);
-  P->setResolver(AR);
-
-  // If a FunctionPass F is the last user of ModulePass info M
-  // then the F's manager, not F, records itself as a last user of M.
-  SmallVector<Pass *, 12> TransferLastUses;
-
-  if (!ProcessAnalysis) {
-    // Add pass
-    PassVector.push_back(P);
-    return;
-  }
-
-  // At the moment, this pass is the last user of all required passes.
-  SmallVector<Pass *, 12> LastUses;
-  SmallVector<Pass *, 8> RequiredPasses;
-  SmallVector<AnalysisID, 8> ReqAnalysisNotAvailable;
-
-  unsigned PDepth = this->getDepth();
-
-  collectRequiredAnalysis(RequiredPasses,
-                          ReqAnalysisNotAvailable, P);
-  for (SmallVectorImpl<Pass *>::iterator I = RequiredPasses.begin(),
-         E = RequiredPasses.end(); I != E; ++I) {
-    Pass *PRequired = *I;
-    unsigned RDepth = 0;
-
-    assert(PRequired->getResolver() && "Analysis Resolver is not set");
-    PMDataManager &DM = PRequired->getResolver()->getPMDataManager();
-    RDepth = DM.getDepth();
-
-    if (PDepth == RDepth)
-      LastUses.push_back(PRequired);
-    else if (PDepth > RDepth) {
-      // Let the parent claim responsibility of last use
-      TransferLastUses.push_back(PRequired);
-      // Keep track of higher level analysis used by this manager.
-      HigherLevelAnalysis.push_back(PRequired);
-    } else
-      llvm_unreachable("Unable to accommodate Required Pass");
-  }
-
-  // Set P as P's last user until someone starts using P.
-  // However, if P is a Pass Manager then it does not need
-  // to record its last user.
-  if (P->getAsPMDataManager() == 0)
-    LastUses.push_back(P);
-  TPM->setLastUser(LastUses, P);
-
-  if (!TransferLastUses.empty()) {
-    Pass *My_PM = getAsPass();
-    TPM->setLastUser(TransferLastUses, My_PM);
-    TransferLastUses.clear();
-  }
-
-  // Now, take care of required analyses that are not available.
-  for (SmallVectorImpl<AnalysisID>::iterator
-         I = ReqAnalysisNotAvailable.begin(),
-         E = ReqAnalysisNotAvailable.end() ;I != E; ++I) {
-    const PassInfo *PI = PassRegistry::getPassRegistry()->getPassInfo(*I);
-    Pass *AnalysisPass = PI->createPass();
-    this->addLowerLevelRequiredPass(P, AnalysisPass);
-  }
-
-  // Take a note of analysis required and made available by this pass.
-  // Remove the analysis not preserved by this pass
-  removeNotPreservedAnalysis(P);
-  recordAvailableAnalysis(P);
-
-  // Add pass
-  PassVector.push_back(P);
-}
-
-
-/// Populate RP with analysis pass that are required by
-/// pass P and are available. Populate RP_NotAvail with analysis
-/// pass that are required by pass P but are not available.
-void PMDataManager::collectRequiredAnalysis(SmallVectorImpl<Pass *> &RP,
-                                       SmallVectorImpl<AnalysisID> &RP_NotAvail,
-                                            Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-  const AnalysisUsage::VectorType &RequiredSet = AnUsage->getRequiredSet();
-  for (AnalysisUsage::VectorType::const_iterator
-         I = RequiredSet.begin(), E = RequiredSet.end(); I != E; ++I) {
-    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
-      RP.push_back(AnalysisPass);
-    else
-      RP_NotAvail.push_back(*I);
-  }
-
-  const AnalysisUsage::VectorType &IDs = AnUsage->getRequiredTransitiveSet();
-  for (AnalysisUsage::VectorType::const_iterator I = IDs.begin(),
-         E = IDs.end(); I != E; ++I) {
-    if (Pass *AnalysisPass = findAnalysisPass(*I, true))
-      RP.push_back(AnalysisPass);
-    else
-      RP_NotAvail.push_back(*I);
-  }
-}
-
-// All Required analyses should be available to the pass as it runs!  Here
-// we fill in the AnalysisImpls member of the pass so that it can
-// successfully use the getAnalysis() method to retrieve the
-// implementations it needs.
-//
-void PMDataManager::initializeAnalysisImpl(Pass *P) {
-  AnalysisUsage *AnUsage = TPM->findAnalysisUsage(P);
-
-  for (AnalysisUsage::VectorType::const_iterator
-         I = AnUsage->getRequiredSet().begin(),
-         E = AnUsage->getRequiredSet().end(); I != E; ++I) {
-    Pass *Impl = findAnalysisPass(*I, true);
-    if (Impl == 0)
-      // This may be analysis pass that is initialized on the fly.
-      // If that is not the case then it will raise an assert when it is used.
-      continue;
-    AnalysisResolver *AR = P->getResolver();
-    assert(AR && "Analysis Resolver is not set");
-    AR->addAnalysisImplsPair(*I, Impl);
-  }
-}
-
-/// Find the pass that implements Analysis AID. If desired pass is not found
-/// then return NULL.
-Pass *PMDataManager::findAnalysisPass(AnalysisID AID, bool SearchParent) {
-
-  // Check if AvailableAnalysis map has one entry.
-  DenseMap<AnalysisID, Pass*>::const_iterator I =  AvailableAnalysis.find(AID);
-
-  if (I != AvailableAnalysis.end())
-    return I->second;
-
-  // Search Parents through TopLevelManager
-  if (SearchParent)
-    return TPM->findAnalysisPass(AID);
-
-  return NULL;
-}
-
-// Print list of passes that are last used by P.
-void PMDataManager::dumpLastUses(Pass *P, unsigned Offset) const{
-
-  SmallVector<Pass *, 12> LUses;
-
-  // If this is a on the fly manager then it does not have TPM.
-  if (!TPM)
-    return;
-
-  TPM->collectLastUses(LUses, P);
-
-  for (SmallVectorImpl<Pass *>::iterator I = LUses.begin(),
-         E = LUses.end(); I != E; ++I) {
-    llvm::dbgs() << "--" << std::string(Offset*2, ' ');
-    (*I)->dumpPassStructure(0);
-  }
-}
-
-void PMDataManager::dumpPassArguments() const {
-  for (SmallVectorImpl<Pass *>::const_iterator I = PassVector.begin(),
-        E = PassVector.end(); I != E; ++I) {
-    if (PMDataManager *PMD = (*I)->getAsPMDataManager())
-      PMD->dumpPassArguments();
-    else
-      if (const PassInfo *PI =
-            PassRegistry::getPassRegistry()->getPassInfo((*I)->getPassID()))
-        if (!PI->isAnalysisGroup())
-          dbgs() << " -" << PI->getPassArgument();
-  }
-}
-
-void PMDataManager::dumpPassInfo(Pass *P, enum PassDebuggingString S1,
-                                 enum PassDebuggingString S2,
-                                 StringRef Msg) {
-  if (PassDebugging < Executions)
-    return;
-  dbgs() << (void*)this << std::string(getDepth()*2+1, ' ');
-  switch (S1) {
-  case EXECUTION_MSG:
-    dbgs() << "Executing Pass '" << P->getPassName();
-    break;
-  case MODIFICATION_MSG:
-    dbgs() << "Made Modification '" << P->getPassName();
-    break;
-  case FREEING_MSG:
-    dbgs() << " Freeing Pass '" << P->getPassName();
-    break;
-  default:
-    break;
-  }
-  switch (S2) {
-  case ON_BASICBLOCK_MSG:
-    dbgs() << "' on BasicBlock '" << Msg << "'...\n";
-    break;
-  case ON_FUNCTION_MSG:
-    dbgs() << "' on Function '" << Msg << "'...\n";
-    break;
-  case ON_MODULE_MSG:
-    dbgs() << "' on Module '"  << Msg << "'...\n";
-    break;
-  case ON_REGION_MSG:
-    dbgs() << "' on Region '"  << Msg << "'...\n";
-    break;
-  case ON_LOOP_MSG:
-    dbgs() << "' on Loop '" << Msg << "'...\n";
-    break;
-  case ON_CG_MSG:
-    dbgs() << "' on Call Graph Nodes '" << Msg << "'...\n";
-    break;
-  default:
-    break;
-  }
-}
-
-void PMDataManager::dumpRequiredSet(const Pass *P) const {
-  if (PassDebugging < Details)
-    return;
-
-  AnalysisUsage analysisUsage;
-  P->getAnalysisUsage(analysisUsage);
-  dumpAnalysisUsage("Required", P, analysisUsage.getRequiredSet());
-}
-
-void PMDataManager::dumpPreservedSet(const Pass *P) const {
-  if (PassDebugging < Details)
-    return;
-
-  AnalysisUsage analysisUsage;
-  P->getAnalysisUsage(analysisUsage);
-  dumpAnalysisUsage("Preserved", P, analysisUsage.getPreservedSet());
-}
-
-void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
-                                   const AnalysisUsage::VectorType &Set) const {
-  assert(PassDebugging >= Details);
-  if (Set.empty())
-    return;
-  dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
-  for (unsigned i = 0; i != Set.size(); ++i) {
-    if (i) dbgs() << ',';
-    const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
-    if (!PInf) {
-      // Some preserved passes, such as AliasAnalysis, may not be initialized by
-      // all drivers.
-      dbgs() << " Uninitialized Pass";
-      continue;
-    }
-    dbgs() << ' ' << PInf->getPassName();
-  }
-  dbgs() << '\n';
-}
-
-/// Add RequiredPass into list of lower level passes required by pass P.
-/// RequiredPass is run on the fly by Pass Manager when P requests it
-/// through getAnalysis interface.
-/// This should be handled by specific pass manager.
-void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
-  if (TPM) {
-    TPM->dumpArguments();
-    TPM->dumpPasses();
-  }
-
-  // Module Level pass may required Function Level analysis info
-  // (e.g. dominator info). Pass manager uses on the fly function pass manager
-  // to provide this on demand. In that case, in Pass manager terminology,
-  // module level pass is requiring lower level analysis info managed by
-  // lower level pass manager.
-
-  // When Pass manager is not able to order required analysis info, Pass manager
-  // checks whether any lower level manager will be able to provide this
-  // analysis info on demand or not.
-#ifndef NDEBUG
-  dbgs() << "Unable to schedule '" << RequiredPass->getPassName();
-  dbgs() << "' required by '" << P->getPassName() << "'\n";
-#endif
-  llvm_unreachable("Unable to schedule pass");
-}
-
-Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
-  llvm_unreachable("Unable to find on the fly pass");
-}
-
-// Destructor
-PMDataManager::~PMDataManager() {
-  for (SmallVectorImpl<Pass *>::iterator I = PassVector.begin(),
-         E = PassVector.end(); I != E; ++I)
-    delete *I;
-}
-
-//===----------------------------------------------------------------------===//
-// NOTE: Is this the right place to define this method ?
-// getAnalysisIfAvailable - Return analysis result or null if it doesn't exist.
-Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
-  return PM.findAnalysisPass(ID, dir);
-}
-
-Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
-                                     Function &F) {
-  return PM.getOnTheFlyPass(P, AnalysisPI, F);
-}
-
-//===----------------------------------------------------------------------===//
-// BBPassManager implementation
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnBasicBlock method.  Keep track of whether any of the passes modifies
-/// the function, and if so, return true.
-bool BBPassManager::runOnFunction(Function &F) {
-  if (F.isDeclaration())
-    return false;
-
-  bool Changed = doInitialization(F);
-
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
-    for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-      BasicBlockPass *BP = getContainedPass(Index);
-      bool LocalChanged = false;
-
-      dumpPassInfo(BP, EXECUTION_MSG, ON_BASICBLOCK_MSG, I->getName());
-      dumpRequiredSet(BP);
-
-      initializeAnalysisImpl(BP);
-
-      {
-        // If the pass crashes, remember this.
-        PassManagerPrettyStackEntry X(BP, *I);
-        TimeRegion PassTimer(getPassTimer(BP));
-
-        LocalChanged |= BP->runOnBasicBlock(*I);
+void AnalysisManager::invalidateAll(Module *M) {
+  // First invalidate any module results we still have laying about.
+  // FIXME: This is a total hack based on the fact that erasure doesn't
+  // invalidate iteration for DenseMap.
+  for (ModuleAnalysisResultMapT::iterator I = ModuleAnalysisResults.begin(),
+                                          E = ModuleAnalysisResults.end();
+       I != E; ++I)
+    if (I->second->invalidate(M))
+      ModuleAnalysisResults.erase(I);
+
+  // Now walk all of the functions for which there are cached results, and
+  // attempt to invalidate each of those as the entire module may have changed.
+  // FIXME: How do we handle functions which have been deleted or RAUWed?
+  SmallVector<void *, 8> InvalidatedPassIDs;
+  for (FunctionAnalysisResultListMapT::iterator
+           FI = FunctionAnalysisResultLists.begin(),
+           FE = FunctionAnalysisResultLists.end();
+       FI != FE; ++FI) {
+    Function *F = FI->first;
+    FunctionAnalysisResultListT &ResultsList = FI->second;
+    for (FunctionAnalysisResultListT::iterator I = ResultsList.begin(),
+                                               E = ResultsList.end();
+         I != E;)
+      if (I->second->invalidate(F)) {
+        InvalidatedPassIDs.push_back(I->first);
+        I = ResultsList.erase(I);
+      } else {
+        ++I;
       }
-
-      Changed |= LocalChanged;
-      if (LocalChanged)
-        dumpPassInfo(BP, MODIFICATION_MSG, ON_BASICBLOCK_MSG,
-                     I->getName());
-      dumpPreservedSet(BP);
-
-      verifyPreservedAnalysis(BP);
-      removeNotPreservedAnalysis(BP);
-      recordAvailableAnalysis(BP);
-      removeDeadPasses(BP, I->getName(), ON_BASICBLOCK_MSG);
-    }
-
-  return doFinalization(F) || Changed;
-}
-
-// Implement doInitialization and doFinalization
-bool BBPassManager::doInitialization(Module &M) {
-  bool Changed = false;
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
-    Changed |= getContainedPass(Index)->doInitialization(M);
-
-  return Changed;
-}
-
-bool BBPassManager::doFinalization(Module &M) {
-  bool Changed = false;
-
-  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
-    Changed |= getContainedPass(Index)->doFinalization(M);
-
-  return Changed;
-}
-
-bool BBPassManager::doInitialization(Function &F) {
-  bool Changed = false;
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    BasicBlockPass *BP = getContainedPass(Index);
-    Changed |= BP->doInitialization(F);
+    while (!InvalidatedPassIDs.empty())
+      FunctionAnalysisResults.erase(
+          std::make_pair(InvalidatedPassIDs.pop_back_val(), F));
   }
-
-  return Changed;
 }
 
-bool BBPassManager::doFinalization(Function &F) {
-  bool Changed = false;
+const AnalysisManager::AnalysisResultConcept<Module> &
+AnalysisManager::getResultImpl(void *PassID, Module *M) {
+  assert(M == this->M && "Wrong module used when querying the AnalysisManager");
+  ModuleAnalysisResultMapT::iterator RI;
+  bool Inserted;
+  llvm::tie(RI, Inserted) = ModuleAnalysisResults.insert(std::make_pair(
+      PassID, polymorphic_ptr<AnalysisResultConcept<Module> >()));
 
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    BasicBlockPass *BP = getContainedPass(Index);
-    Changed |= BP->doFinalization(F);
+  if (Inserted) {
+    // We don't have a cached result for this result. Look up the pass and run
+    // it to produce a result, which we then add to the cache.
+    ModuleAnalysisPassMapT::const_iterator PI =
+        ModuleAnalysisPasses.find(PassID);
+    assert(PI != ModuleAnalysisPasses.end() &&
+           "Analysis passes must be registered prior to being queried!");
+    RI->second = PI->second->run(M);
   }
 
-  return Changed;
+  return *RI->second;
 }
 
+const AnalysisManager::AnalysisResultConcept<Function> &
+AnalysisManager::getResultImpl(void *PassID, Function *F) {
+  assert(F->getParent() == M && "Analyzing a function from another module!");
 
-//===----------------------------------------------------------------------===//
-// FunctionPassManager implementation
-
-/// Create new Function pass manager
-FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
-  FPM = new FunctionPassManagerImpl();
-  // FPM is the top level manager.
-  FPM->setTopLevelManager(FPM);
-
-  AnalysisResolver *AR = new AnalysisResolver(*FPM);
-  FPM->setResolver(AR);
-}
-
-FunctionPassManager::~FunctionPassManager() {
-  delete FPM;
-}
+  FunctionAnalysisResultMapT::iterator RI;
+  bool Inserted;
+  llvm::tie(RI, Inserted) = FunctionAnalysisResults.insert(std::make_pair(
+      std::make_pair(PassID, F), FunctionAnalysisResultListT::iterator()));
 
-/// add - Add a pass to the queue of passes to run.  This passes
-/// ownership of the Pass to the PassManager.  When the
-/// PassManager_X is destroyed, the pass will be destroyed as well, so
-/// there is no need to delete the pass. (TODO delete passes.)
-/// This implies that all passes MUST be allocated with 'new'.
-void FunctionPassManager::add(Pass *P) {
-  FPM->add(P);
-}
-
-/// run - Execute all of the passes scheduled for execution.  Keep
-/// track of whether any of the passes modifies the function, and if
-/// so, return true.
-///
-bool FunctionPassManager::run(Function &F) {
-  if (F.isMaterializable()) {
-    std::string errstr;
-    if (F.Materialize(&errstr))
-      report_fatal_error("Error reading bitcode file: " + Twine(errstr));
+  if (Inserted) {
+    // We don't have a cached result for this result. Look up the pass and run
+    // it to produce a result, which we then add to the cache.
+    FunctionAnalysisPassMapT::const_iterator PI =
+        FunctionAnalysisPasses.find(PassID);
+    assert(PI != FunctionAnalysisPasses.end() &&
+           "Analysis passes must be registered prior to being queried!");
+    FunctionAnalysisResultListT &ResultList = FunctionAnalysisResultLists[F];
+    ResultList.push_back(std::make_pair(PassID, PI->second->run(F)));
+    RI->second = llvm::prior(ResultList.end());
   }
-  return FPM->run(F);
-}
 
-
-/// doInitialization - Run all of the initializers for the function passes.
-///
-bool FunctionPassManager::doInitialization() {
-  return FPM->doInitialization(*M);
-}
-
-/// doFinalization - Run all of the finalizers for the function passes.
-///
-bool FunctionPassManager::doFinalization() {
-  return FPM->doFinalization(*M);
-}
-
-//===----------------------------------------------------------------------===//
-// FunctionPassManagerImpl implementation
-//
-bool FunctionPassManagerImpl::doInitialization(Module &M) {
-  bool Changed = false;
-
-  dumpArguments();
-  dumpPasses();
-
-  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doInitialization(M);
-  }
-
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    Changed |= getContainedManager(Index)->doInitialization(M);
-
-  return Changed;
+  return *RI->second->second;
 }
 
-bool FunctionPassManagerImpl::doFinalization(Module &M) {
-  bool Changed = false;
-
-  for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
-    Changed |= getContainedManager(Index)->doFinalization(M);
-
-  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doFinalization(M);
-  }
-
-  return Changed;
+void AnalysisManager::invalidateImpl(void *PassID, Module *M) {
+  assert(M == this->M && "Invalidating a pass over a different module!");
+  ModuleAnalysisResults.erase(PassID);
 }
 
-/// cleanup - After running all passes, clean up pass manager cache.
-void FPPassManager::cleanup() {
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    FunctionPass *FP = getContainedPass(Index);
-    AnalysisResolver *AR = FP->getResolver();
-    assert(AR && "Analysis Resolver is not set");
-    AR->clearAnalysisImpls();
- }
-}
+void AnalysisManager::invalidateImpl(void *PassID, Function *F) {
+  assert(F->getParent() == M &&
+         "Invalidating a pass over a function from another module!");
 
-void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
-  if (!wasRun)
+  FunctionAnalysisResultMapT::iterator RI =
+      FunctionAnalysisResults.find(std::make_pair(PassID, F));
+  if (RI == FunctionAnalysisResults.end())
     return;
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
-    FPPassManager *FPPM = getContainedManager(Index);
-    for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
-      FPPM->getContainedPass(Index)->releaseMemory();
-    }
-  }
-  wasRun = false;
-}
-
-// Execute all the passes managed by this top level manager.
-// Return true if any function is modified by a pass.
-bool FunctionPassManagerImpl::run(Function &F) {
-  bool Changed = false;
-  TimingInfo::createTheTimeInfo();
-
-  initializeAllAnalysisInfo();
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    Changed |= getContainedManager(Index)->runOnFunction(F);
-
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    getContainedManager(Index)->cleanup();
-
-  wasRun = true;
-  return Changed;
-}
 
-//===----------------------------------------------------------------------===//
-// FPPassManager implementation
-
-char FPPassManager::ID = 0;
-/// Print passes managed by this manager
-void FPPassManager::dumpPassStructure(unsigned Offset) {
-  dbgs().indent(Offset*2) << "FunctionPass Manager\n";
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    FunctionPass *FP = getContainedPass(Index);
-    FP->dumpPassStructure(Offset + 1);
-    dumpLastUses(FP, Offset+1);
-  }
-}
-
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnFunction method.  Keep track of whether any of the passes modifies
-/// the function, and if so, return true.
-bool FPPassManager::runOnFunction(Function &F) {
-  if (F.isDeclaration())
-    return false;
-
-  bool Changed = false;
-
-  // Collect inherited analysis from Module level pass manager.
-  populateInheritedAnalysis(TPM->activeStack);
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    FunctionPass *FP = getContainedPass(Index);
-    bool LocalChanged = false;
-
-    dumpPassInfo(FP, EXECUTION_MSG, ON_FUNCTION_MSG, F.getName());
-    dumpRequiredSet(FP);
-
-    initializeAnalysisImpl(FP);
-
-    {
-      PassManagerPrettyStackEntry X(FP, F);
-      TimeRegion PassTimer(getPassTimer(FP));
-
-      LocalChanged |= FP->runOnFunction(F);
-    }
-
-    Changed |= LocalChanged;
-    if (LocalChanged)
-      dumpPassInfo(FP, MODIFICATION_MSG, ON_FUNCTION_MSG, F.getName());
-    dumpPreservedSet(FP);
-
-    verifyPreservedAnalysis(FP);
-    removeNotPreservedAnalysis(FP);
-    recordAvailableAnalysis(FP);
-    removeDeadPasses(FP, F.getName(), ON_FUNCTION_MSG);
-  }
-  return Changed;
-}
-
-bool FPPassManager::runOnModule(Module &M) {
-  bool Changed = false;
-
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-    Changed |= runOnFunction(*I);
-
-  return Changed;
-}
-
-bool FPPassManager::doInitialization(Module &M) {
-  bool Changed = false;
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
-    Changed |= getContainedPass(Index)->doInitialization(M);
-  
-  return Changed;
-}
-
-bool FPPassManager::doFinalization(Module &M) {
-  bool Changed = false;
-
-  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
-    Changed |= getContainedPass(Index)->doFinalization(M);
-  
-  return Changed;
+  FunctionAnalysisResultLists[F].erase(RI->second);
 }
-
-//===----------------------------------------------------------------------===//
-// MPPassManager implementation
-
-/// Execute all of the passes scheduled for execution by invoking
-/// runOnModule method.  Keep track of whether any of the passes modifies
-/// the module, and if so, return true.
-bool
-MPPassManager::runOnModule(Module &M) {
-  bool Changed = false;
-
-  // Initialize on-the-fly passes
-  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
-       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
-       I != E; ++I) {
-    FunctionPassManagerImpl *FPP = I->second;
-    Changed |= FPP->doInitialization(M);
-  }
-
-  // Initialize module passes
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index)
-    Changed |= getContainedPass(Index)->doInitialization(M);
-
-  for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
-    ModulePass *MP = getContainedPass(Index);
-    bool LocalChanged = false;
-
-    dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier());
-    dumpRequiredSet(MP);
-
-    initializeAnalysisImpl(MP);
-
-    {
-      PassManagerPrettyStackEntry X(MP, M);
-      TimeRegion PassTimer(getPassTimer(MP));
-
-      LocalChanged |= MP->runOnModule(M);
-    }
-
-    Changed |= LocalChanged;
-    if (LocalChanged)
-      dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG,
-                   M.getModuleIdentifier());
-    dumpPreservedSet(MP);
-
-    verifyPreservedAnalysis(MP);
-    removeNotPreservedAnalysis(MP);
-    recordAvailableAnalysis(MP);
-    removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG);
-  }
-
-  // Finalize module passes
-  for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index)
-    Changed |= getContainedPass(Index)->doFinalization(M);
-
-  // Finalize on-the-fly passes
-  for (std::map<Pass *, FunctionPassManagerImpl *>::iterator
-       I = OnTheFlyManagers.begin(), E = OnTheFlyManagers.end();
-       I != E; ++I) {
-    FunctionPassManagerImpl *FPP = I->second;
-    // We don't know when is the last time an on-the-fly pass is run,
-    // so we need to releaseMemory / finalize here
-    FPP->releaseMemoryOnTheFly();
-    Changed |= FPP->doFinalization(M);
-  }
-  
-  return Changed;
-}
-
-/// Add RequiredPass into list of lower level passes required by pass P.
-/// RequiredPass is run on the fly by Pass Manager when P requests it
-/// through getAnalysis interface.
-void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
-  assert(P->getPotentialPassManagerType() == PMT_ModulePassManager &&
-         "Unable to handle Pass that requires lower level Analysis pass");
-  assert((P->getPotentialPassManagerType() <
-          RequiredPass->getPotentialPassManagerType()) &&
-         "Unable to handle Pass that requires lower level Analysis pass");
-
-  FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
-  if (!FPP) {
-    FPP = new FunctionPassManagerImpl();
-    // FPP is the top level manager.
-    FPP->setTopLevelManager(FPP);
-
-    OnTheFlyManagers[P] = FPP;
-  }
-  FPP->add(RequiredPass);
-
-  // Register P as the last user of RequiredPass.
-  if (RequiredPass) {
-    SmallVector<Pass *, 1> LU;
-    LU.push_back(RequiredPass);
-    FPP->setLastUser(LU,  P);
-  }
-}
-
-/// Return function pass corresponding to PassInfo PI, that is
-/// required by module pass MP. Instantiate analysis pass, by using
-/// its runOnFunction() for function F.
-Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
-  FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
-  assert(FPP && "Unable to find on the fly pass");
-
-  FPP->releaseMemoryOnTheFly();
-  FPP->run(F);
-  return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
-}
-
-
-//===----------------------------------------------------------------------===//
-// PassManagerImpl implementation
-
-//
-/// run - Execute all of the passes scheduled for execution.  Keep track of
-/// whether any of the passes modifies the module, and if so, return true.
-bool PassManagerImpl::run(Module &M) {
-  bool Changed = false;
-  TimingInfo::createTheTimeInfo();
-
-  dumpArguments();
-  dumpPasses();
-
-  SmallVectorImpl<ImmutablePass *>& IPV = getImmutablePasses();
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doInitialization(M);
-  }
-
-  initializeAllAnalysisInfo();
-  for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
-    Changed |= getContainedManager(Index)->runOnModule(M);
-
-  for (SmallVectorImpl<ImmutablePass *>::const_iterator I = IPV.begin(),
-       E = IPV.end(); I != E; ++I) {
-    Changed |= (*I)->doFinalization(M);
-  }
-
-  return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// PassManager implementation
-
-/// Create new pass manager
-PassManager::PassManager() {
-  PM = new PassManagerImpl();
-  // PM is the top level manager
-  PM->setTopLevelManager(PM);
-}
-
-PassManager::~PassManager() {
-  delete PM;
-}
-
-/// add - Add a pass to the queue of passes to run.  This passes ownership of
-/// the Pass to the PassManager.  When the PassManager is destroyed, the pass
-/// will be destroyed as well, so there is no need to delete the pass.  This
-/// implies that all passes MUST be allocated with 'new'.
-void PassManager::add(Pass *P) {
-  PM->add(P);
-}
-
-/// run - Execute all of the passes scheduled for execution.  Keep track of
-/// whether any of the passes modifies the module, and if so, return true.
-bool PassManager::run(Module &M) {
-  return PM->run(M);
-}
-
-//===----------------------------------------------------------------------===//
-// TimingInfo implementation
-
-bool llvm::TimePassesIsEnabled = false;
-static cl::opt<bool,true>
-EnableTiming("time-passes", cl::location(TimePassesIsEnabled),
-            cl::desc("Time each pass, printing elapsed time for each on exit"));
-
-// createTheTimeInfo - This method either initializes the TheTimeInfo pointer to
-// a non null value (if the -time-passes option is enabled) or it leaves it
-// null.  It may be called multiple times.
-void TimingInfo::createTheTimeInfo() {
-  if (!TimePassesIsEnabled || TheTimeInfo) return;
-
-  // Constructed the first time this is called, iff -time-passes is enabled.
-  // This guarantees that the object will be constructed before static globals,
-  // thus it will be destroyed before them.
-  static ManagedStatic<TimingInfo> TTI;
-  TheTimeInfo = &*TTI;
-}
-
-/// If TimingInfo is enabled then start pass timer.
-Timer *llvm::getPassTimer(Pass *P) {
-  if (TheTimeInfo)
-    return TheTimeInfo->getPassTimer(P);
-  return 0;
-}
-
-//===----------------------------------------------------------------------===//
-// PMStack implementation
-//
-
-// Pop Pass Manager from the stack and clear its analysis info.
-void PMStack::pop() {
-
-  PMDataManager *Top = this->top();
-  Top->initializeAnalysisInfo();
-
-  S.pop_back();
-}
-
-// Push PM on the stack and set its top level manager.
-void PMStack::push(PMDataManager *PM) {
-  assert(PM && "Unable to push. Pass Manager expected");
-  assert(PM->getDepth()==0 && "Pass Manager depth set too early");
-
-  if (!this->empty()) {
-    assert(PM->getPassManagerType() > this->top()->getPassManagerType()
-           && "pushing bad pass manager to PMStack");
-    PMTopLevelManager *TPM = this->top()->getTopLevelManager();
-
-    assert(TPM && "Unable to find top level manager");
-    TPM->addIndirectPassManager(PM);
-    PM->setTopLevelManager(TPM);
-    PM->setDepth(this->top()->getDepth()+1);
-  } else {
-    assert((PM->getPassManagerType() == PMT_ModulePassManager
-           || PM->getPassManagerType() == PMT_FunctionPassManager)
-           && "pushing bad pass manager to PMStack");
-    PM->setDepth(1);
-  }
-
-  S.push_back(PM);
-}
-
-// Dump content of the pass manager stack.
-void PMStack::dump() const {
-  for (std::vector<PMDataManager *>::const_iterator I = S.begin(),
-         E = S.end(); I != E; ++I)
-    dbgs() << (*I)->getAsPass()->getPassName() << ' ';
-
-  if (!S.empty())
-    dbgs() << '\n';
-}
-
-/// Find appropriate Module Pass Manager in the PM Stack and
-/// add self into that manager.
-void ModulePass::assignPassManager(PMStack &PMS,
-                                   PassManagerType PreferredType) {
-  // Find Module Pass Manager
-  while (!PMS.empty()) {
-    PassManagerType TopPMType = PMS.top()->getPassManagerType();
-    if (TopPMType == PreferredType)
-      break; // We found desired pass manager
-    else if (TopPMType > PMT_ModulePassManager)
-      PMS.pop();    // Pop children pass managers
-    else
-      break;
-  }
-  assert(!PMS.empty() && "Unable to find appropriate Pass Manager");
-  PMS.top()->add(this);
-}
-
-/// Find appropriate Function Pass Manager or Call Graph Pass Manager
-/// in the PM Stack and add self into that manager.
-void FunctionPass::assignPassManager(PMStack &PMS,
-                                     PassManagerType PreferredType) {
-
-  // Find Function Pass Manager
-  while (!PMS.empty()) {
-    if (PMS.top()->getPassManagerType() > PMT_FunctionPassManager)
-      PMS.pop();
-    else
-      break;
-  }
-
-  // Create new Function Pass Manager if needed.
-  FPPassManager *FPP;
-  if (PMS.top()->getPassManagerType() == PMT_FunctionPassManager) {
-    FPP = (FPPassManager *)PMS.top();
-  } else {
-    assert(!PMS.empty() && "Unable to create Function Pass Manager");
-    PMDataManager *PMD = PMS.top();
-
-    // [1] Create new Function Pass Manager
-    FPP = new FPPassManager();
-    FPP->populateInheritedAnalysis(PMS);
-
-    // [2] Set up new manager's top level manager
-    PMTopLevelManager *TPM = PMD->getTopLevelManager();
-    TPM->addIndirectPassManager(FPP);
-
-    // [3] Assign manager to manage this new manager. This may create
-    // and push new managers into PMS
-    FPP->assignPassManager(PMS, PMD->getPassManagerType());
-
-    // [4] Push new manager into PMS
-    PMS.push(FPP);
-  }
-
-  // Assign FPP as the manager of this pass.
-  FPP->add(this);
-}
-
-/// Find appropriate Basic Pass Manager or Call Graph Pass Manager
-/// in the PM Stack and add self into that manager.
-void BasicBlockPass::assignPassManager(PMStack &PMS,
-                                       PassManagerType PreferredType) {
-  BBPassManager *BBP;
-
-  // Basic Pass Manager is a leaf pass manager. It does not handle
-  // any other pass manager.
-  if (!PMS.empty() &&
-      PMS.top()->getPassManagerType() == PMT_BasicBlockPassManager) {
-    BBP = (BBPassManager *)PMS.top();
-  } else {
-    // If leaf manager is not Basic Block Pass manager then create new
-    // basic Block Pass manager.
-    assert(!PMS.empty() && "Unable to create BasicBlock Pass Manager");
-    PMDataManager *PMD = PMS.top();
-
-    // [1] Create new Basic Block Manager
-    BBP = new BBPassManager();
-
-    // [2] Set up new manager's top level manager
-    // Basic Block Pass Manager does not live by itself
-    PMTopLevelManager *TPM = PMD->getTopLevelManager();
-    TPM->addIndirectPassManager(BBP);
-
-    // [3] Assign manager to manage this new manager. This may create
-    // and push new managers into PMS
-    BBP->assignPassManager(PMS, PreferredType);
-
-    // [4] Push new manager into PMS
-    PMS.push(BBP);
-  }
-
-  // Assign BBP as the manager of this pass.
-  BBP->add(this);
-}
-
-PassManagerBase::~PassManagerBase() {}
diff --git a/lib/IR/PassRegistry.cpp b/lib/IR/PassRegistry.cpp
index a0b64ed..d3b2f1f 100644
--- a/lib/IR/PassRegistry.cpp
+++ b/lib/IR/PassRegistry.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
+#include "llvm/Support/RWMutex.h"
 #include <vector>
 
 using namespace llvm;
@@ -35,7 +36,7 @@ PassRegistry *PassRegistry::getPassRegistry() {
   return &*PassRegistryObj;
 }
 
-static ManagedStatic<sys::SmartMutex<true> > Lock;
+static ManagedStatic<sys::SmartRWMutex<true> > Lock;
 
 //===----------------------------------------------------------------------===//
 // PassRegistryImpl
@@ -72,7 +73,7 @@ void *PassRegistry::getImpl() const {
 //
 
 PassRegistry::~PassRegistry() {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedWriter<true> Guard(*Lock);
   PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(pImpl);
   
   for (std::vector<const PassInfo*>::iterator I = Impl->ToFree.begin(),
@@ -84,14 +85,14 @@ PassRegistry::~PassRegistry() {
 }
 
 const PassInfo *PassRegistry::getPassInfo(const void *TI) const {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedReader<true> Guard(*Lock);
   PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
   PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.find(TI);
   return I != Impl->PassInfoMap.end() ? I->second : 0;
 }
 
 const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedReader<true> Guard(*Lock);
   PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
   PassRegistryImpl::StringMapType::const_iterator
     I = Impl->PassInfoStringMap.find(Arg);
@@ -103,7 +104,7 @@ const PassInfo *PassRegistry::getPassInfo(StringRef Arg) const {
 //
 
 void PassRegistry::registerPass(const PassInfo &PI, bool ShouldFree) {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedWriter<true> Guard(*Lock);
   PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
   bool Inserted =
     Impl->PassInfoMap.insert(std::make_pair(PI.getTypeInfo(),&PI)).second;
@@ -120,7 +121,7 @@ void PassRegistry::registerPass(const PassInfo &PI, bool ShouldFree) {
 }
 
 void PassRegistry::unregisterPass(const PassInfo &PI) {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedWriter<true> Guard(*Lock);
   PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
   PassRegistryImpl::MapType::iterator I = 
     Impl->PassInfoMap.find(PI.getTypeInfo());
@@ -132,7 +133,7 @@ void PassRegistry::unregisterPass(const PassInfo &PI) {
 }
 
 void PassRegistry::enumerateWith(PassRegistrationListener *L) {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedReader<true> Guard(*Lock);
   PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
   for (PassRegistryImpl::MapType::const_iterator I = Impl->PassInfoMap.begin(),
        E = Impl->PassInfoMap.end(); I != E; ++I)
@@ -160,7 +161,7 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
     assert(ImplementationInfo &&
            "Must register pass before adding to AnalysisGroup!");
 
-    sys::SmartScopedLock<true> Guard(*Lock);
+    sys::SmartScopedWriter<true> Guard(*Lock);
     
     // Make sure we keep track of the fact that the implementation implements
     // the interface.
@@ -186,13 +187,13 @@ void PassRegistry::registerAnalysisGroup(const void *InterfaceID,
 }
 
 void PassRegistry::addRegistrationListener(PassRegistrationListener *L) {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedWriter<true> Guard(*Lock);
   PassRegistryImpl *Impl = static_cast<PassRegistryImpl*>(getImpl());
   Impl->Listeners.push_back(L);
 }
 
 void PassRegistry::removeRegistrationListener(PassRegistrationListener *L) {
-  sys::SmartScopedLock<true> Guard(*Lock);
+  sys::SmartScopedWriter<true> Guard(*Lock);
   
   // NOTE: This is necessary, because removeRegistrationListener() can be called
   // as part of the llvm_shutdown sequence.  Since we have no control over the
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 46c61fc..432cbc9 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -616,11 +616,7 @@ bool StructType::isLayoutIdentical(StructType *Other) const {
 /// getTypeByName - Return the type with the specified name, or null if there
 /// is none by that name.
 StructType *Module::getTypeByName(StringRef Name) const {
-  StringMap<StructType*>::iterator I =
-    getContext().pImpl->NamedStructTypes.find(Name);
-  if (I != getContext().pImpl->NamedStructTypes.end())
-    return I->second;
-  return 0;
+  return getContext().pImpl->NamedStructTypes.lookup(Name);
 }
 
 
diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index d5e6203..689b903 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp
@@ -44,6 +44,9 @@ void TypeFinder::run(const Module &M, bool onlyNamed) {
   for (Module::const_iterator FI = M.begin(), E = M.end(); FI != E; ++FI) {
     incorporateType(FI->getType());
 
+    if (FI->hasPrefixData())
+      incorporateValue(FI->getPrefixData());
+
     // First incorporate the arguments.
     for (Function::const_arg_iterator AI = FI->arg_begin(),
            AE = FI->arg_end(); AI != AE; ++AI)
@@ -91,19 +94,27 @@ void TypeFinder::clear() {
 /// incorporateType - This method adds the type to the list of used structures
 /// if it's not in there already.
 void TypeFinder::incorporateType(Type *Ty) {
-  // Check to see if we're already visited this type.
+  // Check to see if we've already visited this type.
   if (!VisitedTypes.insert(Ty).second)
     return;
 
-  // If this is a structure or opaque type, add a name for the type.
-  if (StructType *STy = dyn_cast<StructType>(Ty))
-    if (!OnlyNamed || STy->hasName())
-      StructTypes.push_back(STy);
-
-  // Recursively walk all contained types.
-  for (Type::subtype_iterator I = Ty->subtype_begin(),
-         E = Ty->subtype_end(); I != E; ++I)
-    incorporateType(*I);
+  SmallVector<Type *, 4> TypeWorklist;
+  TypeWorklist.push_back(Ty);
+  do {
+    Ty = TypeWorklist.pop_back_val();
+
+    // If this is a structure or opaque type, add a name for the type.
+    if (StructType *STy = dyn_cast<StructType>(Ty))
+      if (!OnlyNamed || STy->hasName())
+        StructTypes.push_back(STy);
+
+    // Add all unvisited subtypes to worklist for processing
+    for (Type::subtype_reverse_iterator I = Ty->subtype_rbegin(),
+                                        E = Ty->subtype_rend();
+         I != E; ++I)
+      if (VisitedTypes.insert(*I).second)
+        TypeWorklist.push_back(*I);
+  } while (!TypeWorklist.empty());
 }
 
 /// incorporateValue - This method is used to walk operand lists finding types
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index 89a3c05..62a3b31 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -112,21 +112,20 @@ bool Value::hasNUsesOrMore(unsigned N) const {
 /// isUsedInBasicBlock - Return true if this value is used in the specified
 /// basic block.
 bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
-  // Start by scanning over the instructions looking for a use before we start
-  // the expensive use iteration.
-  unsigned MaxBlockSize = 3;
-  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-    if (std::find(I->op_begin(), I->op_end(), this) != I->op_end())
+  // This can be computed either by scanning the instructions in BB, or by
+  // scanning the use list of this Value. Both lists can be very long, but
+  // usually one is quite short.
+  //
+  // Scan both lists simultaneously until one is exhausted. This limits the
+  // search to the shorter list.
+  BasicBlock::const_iterator BI = BB->begin(), BE = BB->end();
+  const_use_iterator UI = use_begin(), UE = use_end();
+  for (; BI != BE && UI != UE; ++BI, ++UI) {
+    // Scan basic block: Check if this Value is used by the instruction at BI.
+    if (std::find(BI->op_begin(), BI->op_end(), this) != BI->op_end())
       return true;
-    if (--MaxBlockSize == 0) // If the block is larger fall back to use_iterator
-      break;
-  }
-
-  if (MaxBlockSize != 0) // We scanned the entire block and found no use.
-    return false;
-
-  for (const_use_iterator I = use_begin(), E = use_end(); I != E; ++I) {
-    const Instruction *User = dyn_cast<Instruction>(*I);
+    // Scan use list: Check if the use at UI is in BB.
+    const Instruction *User = dyn_cast<Instruction>(*UI);
     if (User && User->getParent() == BB)
       return true;
   }
@@ -366,7 +365,8 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
         break;
       }
       V = GEP->getPointerOperand();
-    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+    } else if (Operator::getOpcode(V) == Instruction::BitCast ||
+               Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
       V = cast<Operator>(V)->getOperand(0);
     } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
       if (StripKind == PSK_ZeroIndices || GA->mayBeOverridden())
@@ -394,6 +394,42 @@ Value *Value::stripInBoundsConstantOffsets() {
   return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
 }
 
+Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+                                                        APInt &Offset) {
+  if (!getType()->isPointerTy())
+    return this;
+
+  assert(Offset.getBitWidth() == DL.getPointerSizeInBits(cast<PointerType>(
+                                     getType())->getAddressSpace()) &&
+         "The offset must have exactly as many bits as our pointer.");
+
+  // Even though we don't look through PHI nodes, we could be called on an
+  // instruction in an unreachable block, which may be on a cycle.
+  SmallPtrSet<Value *, 4> Visited;
+  Visited.insert(this);
+  Value *V = this;
+  do {
+    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+      if (!GEP->isInBounds())
+        return V;
+      APInt GEPOffset(Offset);
+      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+        return V;
+      Offset = GEPOffset;
+      V = GEP->getPointerOperand();
+    } else if (Operator::getOpcode(V) == Instruction::BitCast) {
+      V = cast<Operator>(V)->getOperand(0);
+    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+      V = GA->getAliasee();
+    } else {
+      return V;
+    }
+    assert(V->getType()->isPointerTy() && "Unexpected operand type!");
+  } while (Visited.insert(V));
+
+  return V;
+}
+
 Value *Value::stripInBoundsOffsets() {
   return stripPointerCastsAndOffsets<PSK_InBounds>(this);
 }
@@ -699,9 +735,5 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) {
 #endif
 }
 
-// Default implementation for CallbackVH.
-void CallbackVH::allUsesReplacedWith(Value *) {}
-
-void CallbackVH::deleted() {
-  setValPtr(NULL);
-}
+// Pin the vtable to this file.
+void CallbackVH::anchor() {}
diff --git a/lib/IR/ValueTypes.cpp b/lib/IR/ValueTypes.cpp
index ba04d60..2d4da95 100644
--- a/lib/IR/ValueTypes.cpp
+++ b/lib/IR/ValueTypes.cpp
@@ -134,6 +134,7 @@ std::string EVT::getEVTString() const {
   case MVT::v16i1:   return "v16i1";
   case MVT::v32i1:   return "v32i1";
   case MVT::v64i1:   return "v64i1";
+  case MVT::v1i8:    return "v1i8";
   case MVT::v2i8:    return "v2i8";
   case MVT::v4i8:    return "v4i8";
   case MVT::v8i8:    return "v8i8";
@@ -156,11 +157,15 @@ std::string EVT::getEVTString() const {
   case MVT::v4i64:   return "v4i64";
   case MVT::v8i64:   return "v8i64";
   case MVT::v16i64:  return "v16i64";
+  case MVT::v1f32:   return "v1f32";
   case MVT::v2f32:   return "v2f32";
   case MVT::v2f16:   return "v2f16";
+  case MVT::v4f16:   return "v4f16";
+  case MVT::v8f16:   return "v8f16";
   case MVT::v4f32:   return "v4f32";
   case MVT::v8f32:   return "v8f32";
   case MVT::v16f32:  return "v16f32";
+  case MVT::v1f64:   return "v1f64";
   case MVT::v2f64:   return "v2f64";
   case MVT::v4f64:   return "v4f64";
   case MVT::v8f64:   return "v8f64";
@@ -197,6 +202,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
   case MVT::v16i1:   return VectorType::get(Type::getInt1Ty(Context), 16);
   case MVT::v32i1:   return VectorType::get(Type::getInt1Ty(Context), 32);
   case MVT::v64i1:   return VectorType::get(Type::getInt1Ty(Context), 64);
+  case MVT::v1i8:    return VectorType::get(Type::getInt8Ty(Context), 1);
   case MVT::v2i8:    return VectorType::get(Type::getInt8Ty(Context), 2);
   case MVT::v4i8:    return VectorType::get(Type::getInt8Ty(Context), 4);
   case MVT::v8i8:    return VectorType::get(Type::getInt8Ty(Context), 8);
@@ -220,10 +226,14 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
   case MVT::v8i64:   return VectorType::get(Type::getInt64Ty(Context), 8);
   case MVT::v16i64:  return VectorType::get(Type::getInt64Ty(Context), 16);
   case MVT::v2f16:   return VectorType::get(Type::getHalfTy(Context), 2);
+  case MVT::v4f16:   return VectorType::get(Type::getHalfTy(Context), 4);
+  case MVT::v8f16:   return VectorType::get(Type::getHalfTy(Context), 8);
+  case MVT::v1f32:   return VectorType::get(Type::getFloatTy(Context), 1);
   case MVT::v2f32:   return VectorType::get(Type::getFloatTy(Context), 2);
   case MVT::v4f32:   return VectorType::get(Type::getFloatTy(Context), 4);
   case MVT::v8f32:   return VectorType::get(Type::getFloatTy(Context), 8);
   case MVT::v16f32:   return VectorType::get(Type::getFloatTy(Context), 16);
+  case MVT::v1f64:   return VectorType::get(Type::getDoubleTy(Context), 1);
   case MVT::v2f64:   return VectorType::get(Type::getDoubleTy(Context), 2);
   case MVT::v4f64:   return VectorType::get(Type::getDoubleTy(Context), 4); 
   case MVT::v8f64:   return VectorType::get(Type::getDoubleTy(Context), 8); 
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index d106173..da6b573 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -53,8 +53,10 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Assembly/Writer.h"
+#include "llvm/DebugInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -66,6 +68,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConstantRange.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -74,6 +77,9 @@
 #include <cstdarg>
 using namespace llvm;
 
+static cl::opt<bool> DisableDebugInfoVerifier("disable-debug-info-verifier",
+                                              cl::init(true));
+
 namespace {  // Anonymous namespace for class
   struct PreVerifier : public FunctionPass {
     static char ID; // Pass ID, replacement for typeid
@@ -93,7 +99,7 @@ namespace {  // Anonymous namespace for class
 
       for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
         if (I->empty() || !I->back().isTerminator()) {
-          dbgs() << "Basic Block in function '" << F.getName() 
+          dbgs() << "Basic Block in function '" << F.getName()
                  << "' does not have terminator!\n";
           WriteAsOperand(dbgs(), I, true);
           dbgs() << "\n";
@@ -110,7 +116,7 @@ namespace {  // Anonymous namespace for class
 }
 
 char PreVerifier::ID = 0;
-INITIALIZE_PASS(PreVerifier, "preverify", "Preliminary module verification", 
+INITIALIZE_PASS(PreVerifier, "preverify", "Preliminary module verification",
                 false, false)
 static char &PreVerifyID = PreVerifier::ID;
 
@@ -123,6 +129,7 @@ namespace {
     Module *Mod;          // Module we are verifying right now
     LLVMContext *Context; // Context within which we are verifying
     DominatorTree *DT;    // Dominator Tree, caution can be null!
+    const DataLayout *DL;
 
     std::string Messages;
     raw_string_ostream MessagesStr;
@@ -142,15 +149,18 @@ namespace {
     /// the same personality function.
     const Value *PersonalityFn;
 
+    /// Finder keeps track of all debug info MDNodes in a Module.
+    DebugInfoFinder Finder;
+
     Verifier()
       : FunctionPass(ID), Broken(false),
-        action(AbortProcessAction), Mod(0), Context(0), DT(0),
+        action(AbortProcessAction), Mod(0), Context(0), DT(0), DL(0),
         MessagesStr(Messages), PersonalityFn(0) {
       initializeVerifierPass(*PassRegistry::getPassRegistry());
     }
     explicit Verifier(VerifierFailureAction ctn)
       : FunctionPass(ID), Broken(false), action(ctn), Mod(0),
-        Context(0), DT(0), MessagesStr(Messages), PersonalityFn(0) {
+        Context(0), DT(0), DL(0), MessagesStr(Messages), PersonalityFn(0) {
       initializeVerifierPass(*PassRegistry::getPassRegistry());
     }
 
@@ -158,6 +168,8 @@ namespace {
       Mod = &M;
       Context = &M.getContext();
 
+      DL = getAnalysisIfAvailable<DataLayout>();
+
       // We must abort before returning back to the pass manager, or else the
       // pass manager may try to run other passes on the broken module.
       return abortIfBroken();
@@ -170,10 +182,15 @@ namespace {
       Mod = F.getParent();
       if (!Context) Context = &F.getContext();
 
+      Finder.reset();
       visit(F);
       InstsInThisBlock.clear();
       PersonalityFn = 0;
 
+      if (!DisableDebugInfoVerifier)
+        // Verify Debug Info.
+        verifyDebugInfo();
+
       // We must abort before returning back to the pass manager, or else the
       // pass manager may try to run other passes on the broken module.
       return abortIfBroken();
@@ -188,11 +205,11 @@ namespace {
         if (I->isDeclaration()) visitFunction(*I);
       }
 
-      for (Module::global_iterator I = M.global_begin(), E = M.global_end(); 
+      for (Module::global_iterator I = M.global_begin(), E = M.global_end();
            I != E; ++I)
         visitGlobalVariable(*I);
 
-      for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end(); 
+      for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
            I != E; ++I)
         visitGlobalAlias(*I);
 
@@ -201,6 +218,14 @@ namespace {
         visitNamedMDNode(*I);
 
       visitModuleFlags(M);
+      visitModuleIdents(M);
+
+      if (!DisableDebugInfoVerifier) {
+        Finder.reset();
+        Finder.processModule(M);
+        // Verify Debug Info.
+        verifyDebugInfo();
+      }
 
       // If the module is broken, abort at this time.
       return abortIfBroken();
@@ -242,6 +267,7 @@ namespace {
     void visitGlobalAlias(GlobalAlias &GA);
     void visitNamedMDNode(NamedMDNode &NMD);
     void visitMDNode(MDNode &MD, Function *F);
+    void visitModuleIdents(Module &M);
     void visitModuleFlags(Module &M);
     void visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*> &SeenIDs,
                          SmallVectorImpl<MDNode*> &Requirements);
@@ -263,6 +289,7 @@ namespace {
     void visitIntToPtrInst(IntToPtrInst &I);
     void visitPtrToIntInst(PtrToIntInst &I);
     void visitBitCastInst(BitCastInst &I);
+    void visitAddrSpaceCastInst(AddrSpaceCastInst &I);
     void visitPHINode(PHINode &PN);
     void visitBinaryOperator(BinaryOperator &B);
     void visitICmpInst(ICmpInst &IC);
@@ -301,6 +328,8 @@ namespace {
     bool VerifyIntrinsicType(Type *Ty,
                              ArrayRef<Intrinsic::IITDescriptor> &Infos,
                              SmallVectorImpl<Type*> &ArgTys);
+    bool VerifyIntrinsicIsVarArg(bool isVarArg,
+                                 ArrayRef<Intrinsic::IITDescriptor> &Infos);
     bool VerifyAttributeCount(AttributeSet Attrs, unsigned Params);
     void VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
                               bool isFunction, const Value *V);
@@ -309,6 +338,11 @@ namespace {
     void VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
                              const Value *V);
 
+    void VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy);
+    void VerifyConstantExprBitcastType(const ConstantExpr *CE);
+
+    void verifyDebugInfo();
+
     void WriteValue(const Value *V) {
       if (!V) return;
       if (isa<Instruction>(V)) {
@@ -406,10 +440,6 @@ void Verifier::visitGlobalValue(GlobalValue &GV) {
     Assert1(GVar && GVar->getType()->getElementType()->isArrayTy(),
             "Only global arrays can have appending linkage!", GVar);
   }
-
-  Assert1(!GV.hasLinkOnceODRAutoHideLinkage() || GV.hasDefaultVisibility(),
-          "linkonce_odr_auto_hide can only have default visibility!",
-          &GV);
 }
 
 void Verifier::visitGlobalVariable(GlobalVariable &GV) {
@@ -450,7 +480,7 @@ void Verifier::visitGlobalVariable(GlobalVariable &GV) {
   }
 
   if (GV.hasName() && (GV.getName() == "llvm.used" ||
-                       GV.getName() == "llvm.compiler_used")) {
+                       GV.getName() == "llvm.compiler.used")) {
     Assert1(!GV.hasInitializer() || GV.hasAppendingLinkage(),
             "invalid linkage for intrinsic global variable", &GV);
     Type *GVType = GV.getType()->getElementType();
@@ -463,24 +493,50 @@ void Verifier::visitGlobalVariable(GlobalVariable &GV) {
         Assert1(InitArray, "wrong initalizer for intrinsic global variable",
                 Init);
         for (unsigned i = 0, e = InitArray->getNumOperands(); i != e; ++i) {
-          Value *V = Init->getOperand(i)->stripPointerCasts();
-          // stripPointerCasts strips aliases, so we only need to check for
-          // variables and functions.
-          Assert1(isa<GlobalVariable>(V) || isa<Function>(V),
-                  "invalid llvm.used member", V);
+          Value *V = Init->getOperand(i)->stripPointerCastsNoFollowAliases();
+          Assert1(
+              isa<GlobalVariable>(V) || isa<Function>(V) || isa<GlobalAlias>(V),
+              "invalid llvm.used member", V);
+          Assert1(V->hasName(), "members of llvm.used must be named", V);
         }
       }
     }
   }
 
+  if (!GV.hasInitializer()) {
+    visitGlobalValue(GV);
+    return;
+  }
+
+  // Walk any aggregate initializers looking for bitcasts between address spaces
+  SmallPtrSet<const Value *, 4> Visited;
+  SmallVector<const Value *, 4> WorkStack;
+  WorkStack.push_back(cast<Value>(GV.getInitializer()));
+
+  while (!WorkStack.empty()) {
+    const Value *V = WorkStack.pop_back_val();
+    if (!Visited.insert(V))
+      continue;
+
+    if (const User *U = dyn_cast<User>(V)) {
+      for (unsigned I = 0, N = U->getNumOperands(); I != N; ++I)
+        WorkStack.push_back(U->getOperand(I));
+    }
+
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+      VerifyConstantExprBitcastType(CE);
+      if (Broken)
+        return;
+    }
+  }
+
   visitGlobalValue(GV);
 }
 
 void Verifier::visitGlobalAlias(GlobalAlias &GA) {
   Assert1(!GA.getName().empty(),
           "Alias name cannot be empty!", &GA);
-  Assert1(GA.hasExternalLinkage() || GA.hasLocalLinkage() ||
-          GA.hasWeakLinkage(),
+  Assert1(GlobalAlias::isValidLinkage(GA.getLinkage()),
           "Alias should have external or external weak linkage!", &GA);
   Assert1(GA.getAliasee(),
           "Aliasee cannot be NULL!", &GA);
@@ -488,18 +544,29 @@ void Verifier::visitGlobalAlias(GlobalAlias &GA) {
           "Alias and aliasee types should match!", &GA);
   Assert1(!GA.hasUnnamedAddr(), "Alias cannot have unnamed_addr!", &GA);
 
-  if (!isa<GlobalValue>(GA.getAliasee())) {
-    const ConstantExpr *CE = dyn_cast<ConstantExpr>(GA.getAliasee());
-    Assert1(CE && 
+  Constant *Aliasee = GA.getAliasee();
+
+  if (!isa<GlobalValue>(Aliasee)) {
+    ConstantExpr *CE = dyn_cast<ConstantExpr>(Aliasee);
+    Assert1(CE &&
             (CE->getOpcode() == Instruction::BitCast ||
              CE->getOpcode() == Instruction::GetElementPtr) &&
             isa<GlobalValue>(CE->getOperand(0)),
             "Aliasee should be either GlobalValue or bitcast of GlobalValue",
             &GA);
+
+    if (CE->getOpcode() == Instruction::BitCast) {
+      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
+      unsigned DstAS = CE->getType()->getPointerAddressSpace();
+
+      Assert1(SrcAS == DstAS,
+              "Alias bitcasts cannot be between different address spaces",
+              &GA);
+    }
   }
 
-  const GlobalValue* Aliasee = GA.resolveAliasedGlobal(/*stopOnWeak*/ false);
-  Assert1(Aliasee,
+  const GlobalValue* Resolved = GA.resolveAliasedGlobal(/*stopOnWeak*/ false);
+  Assert1(Resolved,
           "Aliasing chain should end with function or global variable", &GA);
 
   visitGlobalValue(GA);
@@ -553,6 +620,24 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) {
   }
 }
 
+void Verifier::visitModuleIdents(Module &M) {
+  const NamedMDNode *Idents = M.getNamedMetadata("llvm.ident");
+  if (!Idents) 
+    return;
+  
+  // llvm.ident takes a list of metadata entry. Each entry has only one string.
+  // Scan each llvm.ident entry and make sure that this requirement is met.
+  for (unsigned i = 0, e = Idents->getNumOperands(); i != e; ++i) {
+    const MDNode *N = Idents->getOperand(i);
+    Assert1(N->getNumOperands() == 1,
+            "incorrect number of operands in llvm.ident metadata", N);
+    Assert1(isa<MDString>(N->getOperand(0)),
+            ("invalid value for llvm.ident metadata entry operand"
+             "(the operand should be a string)"),
+            N->getOperand(0));
+  } 
+}
+
 void Verifier::visitModuleFlags(Module &M) {
   const NamedMDNode *Flags = M.getModuleFlagsMetadata();
   if (!Flags) return;
@@ -654,7 +739,7 @@ void Verifier::visitModuleFlag(MDNode *Op, DenseMap<MDString*, MDNode*>&SeenIDs,
 }
 
 void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
-                                    bool isFunction, const Value* V) {
+                                    bool isFunction, const Value *V) {
   unsigned Slot = ~0U;
   for (unsigned I = 0, E = Attrs.getNumSlots(); I != E; ++I)
     if (Attrs.getSlotIndex(I) == Idx) {
@@ -671,8 +756,6 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
 
     if (I->getKindAsEnum() == Attribute::NoReturn ||
         I->getKindAsEnum() == Attribute::NoUnwind ||
-        I->getKindAsEnum() == Attribute::ReadNone ||
-        I->getKindAsEnum() == Attribute::ReadOnly ||
         I->getKindAsEnum() == Attribute::NoInline ||
         I->getKindAsEnum() == Attribute::AlwaysInline ||
         I->getKindAsEnum() == Attribute::OptimizeForSize ||
@@ -692,15 +775,26 @@ void Verifier::VerifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
         I->getKindAsEnum() == Attribute::SanitizeMemory ||
         I->getKindAsEnum() == Attribute::MinSize ||
         I->getKindAsEnum() == Attribute::NoDuplicate ||
-        I->getKindAsEnum() == Attribute::NoBuiltin) {
-      if (!isFunction)
-          CheckFailed("Attribute '" + I->getKindAsString() +
-                      "' only applies to functions!", V);
-          return;
-    } else if (isFunction) {
-        CheckFailed("Attribute '" + I->getKindAsString() +
-                    "' does not apply to functions!", V);
+        I->getKindAsEnum() == Attribute::Builtin ||
+        I->getKindAsEnum() == Attribute::NoBuiltin ||
+        I->getKindAsEnum() == Attribute::Cold ||
+        I->getKindAsEnum() == Attribute::OptimizeNone) {
+      if (!isFunction) {
+        CheckFailed("Attribute '" + I->getAsString() +
+                    "' only applies to functions!", V);
         return;
+      }
+    } else if (I->getKindAsEnum() == Attribute::ReadOnly ||
+               I->getKindAsEnum() == Attribute::ReadNone) {
+      if (Idx == 0) {
+        CheckFailed("Attribute '" + I->getAsString() +
+                    "' does not apply to function returns");
+        return;
+      }
+    } else if (isFunction) {
+      CheckFailed("Attribute '" + I->getAsString() +
+                  "' does not apply to functions!", V);
+      return;
     }
   }
 }
@@ -830,6 +924,65 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
             Attrs.hasAttribute(AttributeSet::FunctionIndex,
                                Attribute::AlwaysInline)),
           "Attributes 'noinline and alwaysinline' are incompatible!", V);
+
+  if (Attrs.hasAttribute(AttributeSet::FunctionIndex, 
+                         Attribute::OptimizeNone)) {
+    Assert1(Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                               Attribute::NoInline),
+            "Attribute 'optnone' requires 'noinline'!", V);
+
+    Assert1(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::OptimizeForSize),
+            "Attributes 'optsize and optnone' are incompatible!", V);
+
+    Assert1(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                                Attribute::MinSize),
+            "Attributes 'minsize and optnone' are incompatible!", V);
+  }
+}
+
+void Verifier::VerifyBitcastType(const Value *V, Type *DestTy, Type *SrcTy) {
+  // Get the size of the types in bits, we'll need this later
+  unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
+  unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
+
+  // BitCast implies a no-op cast of type only. No bits change.
+  // However, you can't cast pointers to anything but pointers.
+  Assert1(SrcTy->isPointerTy() == DestTy->isPointerTy(),
+          "Bitcast requires both operands to be pointer or neither", V);
+  Assert1(SrcBitSize == DestBitSize,
+          "Bitcast requires types of same width", V);
+
+  // Disallow aggregates.
+  Assert1(!SrcTy->isAggregateType(),
+          "Bitcast operand must not be aggregate", V);
+  Assert1(!DestTy->isAggregateType(),
+          "Bitcast type must not be aggregate", V);
+
+  // Without datalayout, assume all address spaces are the same size.
+  // Don't check if both types are not pointers.
+  // Skip casts between scalars and vectors.
+  if (!DL ||
+      !SrcTy->isPtrOrPtrVectorTy() ||
+      !DestTy->isPtrOrPtrVectorTy() ||
+      SrcTy->isVectorTy() != DestTy->isVectorTy()) {
+    return;
+  }
+
+  unsigned SrcAS = SrcTy->getPointerAddressSpace();
+  unsigned DstAS = DestTy->getPointerAddressSpace();
+
+  Assert1(SrcAS == DstAS,
+          "Bitcasts between pointers of different address spaces is not legal."
+          "Use AddrSpaceCast instead.", V);
+}
+
+void Verifier::VerifyConstantExprBitcastType(const ConstantExpr *CE) {
+  if (CE->getOpcode() == Instruction::BitCast) {
+    Type *SrcTy = CE->getOperand(0)->getType();
+    Type *DstTy = CE->getType();
+    VerifyBitcastType(CE, DstTy, SrcTy);
+  }
 }
 
 bool Verifier::VerifyAttributeCount(AttributeSet Attrs, unsigned Params) {
@@ -842,7 +995,7 @@ bool Verifier::VerifyAttributeCount(AttributeSet Attrs, unsigned Params) {
       || (LastIndex == AttributeSet::FunctionIndex
           && (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params)))
     return true;
- 
+
   return false;
 }
 
@@ -861,7 +1014,7 @@ void Verifier::visitFunction(Function &F) {
           "# formal arguments must match # of arguments for function type!",
           &F, FT);
   Assert1(F.getReturnType()->isFirstClassType() ||
-          F.getReturnType()->isVoidTy() || 
+          F.getReturnType()->isVoidTy() ||
           F.getReturnType()->isStructTy(),
           "Functions cannot return aggregate values!", &F);
 
@@ -876,6 +1029,13 @@ void Verifier::visitFunction(Function &F) {
   // Check function attributes.
   VerifyFunctionAttrs(FT, Attrs, &F);
 
+  // On function declarations/definitions, we do not support the builtin
+  // attribute. We do not check this in VerifyFunctionAttrs since that is
+  // checking for Attributes that can/can not ever be on functions.
+  Assert1(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+                              Attribute::Builtin),
+          "Attribute 'builtin' can only be applied to a callsite.", &F);
+
   // Check that this function meets the restrictions on this calling convention.
   switch (F.getCallingConv()) {
   default:
@@ -921,25 +1081,25 @@ void Verifier::visitFunction(Function &F) {
     // Verify that this function (which has a body) is not named "llvm.*".  It
     // is not legal to define intrinsics.
     Assert1(!isLLVMdotName, "llvm intrinsics cannot be defined!", &F);
-    
+
     // Check the entry node
     BasicBlock *Entry = &F.getEntryBlock();
     Assert1(pred_begin(Entry) == pred_end(Entry),
             "Entry block to function must not have predecessors!", Entry);
-    
+
     // The address of the entry block cannot be taken, unless it is dead.
     if (Entry->hasAddressTaken()) {
       Assert1(!BlockAddress::get(Entry)->isConstantUsed(),
               "blockaddress may not be used with the entry block!", Entry);
     }
   }
- 
+
   // If this function is actually an intrinsic, verify that it is only used in
   // direct call/invokes, never having its "address taken".
   if (F.getIntrinsicID()) {
     const User *U;
     if (F.hasAddressTaken(&U))
-      Assert1(0, "Invalid user of intrinsic instruction!", U); 
+      Assert1(0, "Invalid user of intrinsic instruction!", U);
   }
 }
 
@@ -1014,7 +1174,7 @@ void Verifier::visitBranchInst(BranchInst &BI) {
 void Verifier::visitReturnInst(ReturnInst &RI) {
   Function *F = RI.getParent()->getParent();
   unsigned N = RI.getNumOperands();
-  if (F->getReturnType()->isVoidTy()) 
+  if (F->getReturnType()->isVoidTy())
     Assert2(N == 0,
             "Found return instr that returns non-void in Function of void "
             "return type!", &RI, F->getReturnType());
@@ -1032,29 +1192,14 @@ void Verifier::visitSwitchInst(SwitchInst &SI) {
   // Check to make sure that all of the constants in the switch instruction
   // have the same type as the switched-on value.
   Type *SwitchTy = SI.getCondition()->getType();
-  IntegerType *IntTy = cast<IntegerType>(SwitchTy);
-  IntegersSubsetToBB Mapping;
-  std::map<IntegersSubset::Range, unsigned> RangeSetMap;
+  SmallPtrSet<ConstantInt*, 32> Constants;
   for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end(); i != e; ++i) {
-    IntegersSubset CaseRanges = i.getCaseValueEx();
-    for (unsigned ri = 0, rie = CaseRanges.getNumItems(); ri < rie; ++ri) {
-      IntegersSubset::Range r = CaseRanges.getItem(ri);
-      Assert1(((const APInt&)r.getLow()).getBitWidth() == IntTy->getBitWidth(),
-              "Switch constants must all be same type as switch value!", &SI);
-      Assert1(((const APInt&)r.getHigh()).getBitWidth() == IntTy->getBitWidth(),
-              "Switch constants must all be same type as switch value!", &SI);
-      Mapping.add(r);
-      RangeSetMap[r] = i.getCaseIndex();
-    }
-  }
-  
-  IntegersSubsetToBB::RangeIterator errItem;
-  if (!Mapping.verify(errItem)) {
-    unsigned CaseIndex = RangeSetMap[errItem->first];
-    SwitchInst::CaseIt i(&SI, CaseIndex);
-    Assert2(false, "Duplicate integer as switch case", &SI, i.getCaseValueEx());
+    Assert1(i.getCaseValue()->getType() == SwitchTy,
+            "Switch constants must all be same type as switch value!", &SI);
+    Assert2(Constants.insert(i.getCaseValue()),
+            "Duplicate integer as switch case", &SI, i.getCaseValue());
   }
-  
+
   visitTerminatorInst(SI);
 }
 
@@ -1309,26 +1454,25 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
 }
 
 void Verifier::visitBitCastInst(BitCastInst &I) {
-  // Get the source and destination types
   Type *SrcTy = I.getOperand(0)->getType();
   Type *DestTy = I.getType();
+  VerifyBitcastType(&I, DestTy, SrcTy);
+  visitInstruction(I);
+}
 
-  // Get the size of the types in bits, we'll need this later
-  unsigned SrcBitSize = SrcTy->getPrimitiveSizeInBits();
-  unsigned DestBitSize = DestTy->getPrimitiveSizeInBits();
-
-  // BitCast implies a no-op cast of type only. No bits change.
-  // However, you can't cast pointers to anything but pointers.
-  Assert1(SrcTy->isPointerTy() == DestTy->isPointerTy(),
-          "Bitcast requires both operands to be pointer or neither", &I);
-  Assert1(SrcBitSize == DestBitSize, "Bitcast requires types of same width",&I);
-
-  // Disallow aggregates.
-  Assert1(!SrcTy->isAggregateType(),
-          "Bitcast operand must not be aggregate", &I);
-  Assert1(!DestTy->isAggregateType(),
-          "Bitcast type must not be aggregate", &I);
+void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
+  Type *SrcTy = I.getOperand(0)->getType();
+  Type *DestTy = I.getType();
 
+  Assert1(SrcTy->isPtrOrPtrVectorTy(),
+          "AddrSpaceCast source must be a pointer", &I);
+  Assert1(DestTy->isPtrOrPtrVectorTy(),
+          "AddrSpaceCast result must be a pointer", &I);
+  Assert1(SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace(),
+          "AddrSpaceCast must be between different address spaces", &I);
+  if (SrcTy->isVectorTy())
+    Assert1(SrcTy->getVectorNumElements() == DestTy->getVectorNumElements(),
+            "AddrSpaceCast vector pointer number of elements mismatch", &I);
   visitInstruction(I);
 }
 
@@ -1339,7 +1483,7 @@ void Verifier::visitPHINode(PHINode &PN) {
   // This can be tested by checking whether the instruction before this is
   // either nonexistent (because this is begin()) or is a PHI node.  If not,
   // then there is some other instruction before a PHI.
-  Assert2(&PN == &PN.getParent()->front() || 
+  Assert2(&PN == &PN.getParent()->front() ||
           isa<PHINode>(--BasicBlock::iterator(&PN)),
           "PHI nodes not grouped at top of basic block!",
           &PN, PN.getParent());
@@ -1403,9 +1547,9 @@ void Verifier::VerifyCallSite(CallSite CS) {
 
     // Check attributes on the varargs part.
     for (unsigned Idx = 1 + FTy->getNumParams(); Idx <= CS.arg_size(); ++Idx) {
-      Type *Ty = CS.getArgument(Idx-1)->getType(); 
+      Type *Ty = CS.getArgument(Idx-1)->getType();
       VerifyParameterAttrs(Attrs, Idx, Ty, false, I);
-      
+
       if (Attrs.hasAttribute(Idx, Attribute::Nest)) {
         Assert1(!SawNest, "More than one parameter has attribute nest!", I);
         SawNest = true;
@@ -1718,7 +1862,7 @@ void Verifier::visitStoreInst(StoreInst &SI) {
 
 void Verifier::visitAllocaInst(AllocaInst &AI) {
   PointerType *PTy = AI.getType();
-  Assert1(PTy->getAddressSpace() == 0, 
+  Assert1(PTy->getAddressSpace() == 0,
           "Allocation instruction pointer not in the generic address space!",
           &AI);
   Assert1(PTy->getElementType()->isSized(), "Cannot allocate unsized type",
@@ -1790,7 +1934,7 @@ void Verifier::visitExtractValueInst(ExtractValueInst &EVI) {
                                            EVI.getIndices()) ==
           EVI.getType(),
           "Invalid ExtractValueInst operands!", &EVI);
-  
+
   visitInstruction(EVI);
 }
 
@@ -1799,7 +1943,7 @@ void Verifier::visitInsertValueInst(InsertValueInst &IVI) {
                                            IVI.getIndices()) ==
           IVI.getOperand(1)->getType(),
           "Invalid InsertValueInst operands!", &IVI);
-  
+
   visitInstruction(IVI);
 }
 
@@ -1886,7 +2030,7 @@ void Verifier::visitInstruction(Instruction &I) {
 
   // Check that the return value of the instruction is either void or a legal
   // value type.
-  Assert1(I.getType()->isVoidTy() || 
+  Assert1(I.getType()->isVoidTy() ||
           I.getType()->isFirstClassType(),
           "Instruction returns a non-scalar type!", &I);
 
@@ -1944,6 +2088,27 @@ void Verifier::visitInstruction(Instruction &I) {
       Assert1((i + 1 == e && isa<CallInst>(I)) ||
               (i + 3 == e && isa<InvokeInst>(I)),
               "Cannot take the address of an inline asm!", &I);
+    } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(I.getOperand(i))) {
+      if (CE->getType()->isPtrOrPtrVectorTy()) {
+        // If we have a ConstantExpr pointer, we need to see if it came from an
+        // illegal bitcast (inttoptr <constant int> )
+        SmallVector<const ConstantExpr *, 4> Stack;
+        SmallPtrSet<const ConstantExpr *, 4> Visited;
+        Stack.push_back(CE);
+
+        while (!Stack.empty()) {
+          const ConstantExpr *V = Stack.pop_back_val();
+          if (!Visited.insert(V))
+            continue;
+
+          VerifyConstantExprBitcastType(V);
+
+          for (unsigned I = 0, N = V->getNumOperands(); I != N; ++I) {
+            if (ConstantExpr *Op = dyn_cast<ConstantExpr>(V->getOperand(I)))
+              Stack.push_back(Op);
+          }
+        }
+      }
     }
   }
 
@@ -1954,7 +2119,7 @@ void Verifier::visitInstruction(Instruction &I) {
     Value *Op0 = MD->getOperand(0);
     if (ConstantFP *CFP0 = dyn_cast_or_null<ConstantFP>(Op0)) {
       APFloat Accuracy = CFP0->getValueAPF();
-      Assert1(Accuracy.isNormal() && !Accuracy.isNegative(),
+      Assert1(Accuracy.isFiniteNonZero() && !Accuracy.isNegative(),
               "fpmath accuracy not a positive number!", &I);
     } else {
       Assert1(false, "invalid fpmath accuracy!", &I);
@@ -1964,6 +2129,11 @@ void Verifier::visitInstruction(Instruction &I) {
   MDNode *MD = I.getMetadata(LLVMContext::MD_range);
   Assert1(!MD || isa<LoadInst>(I), "Ranges are only for loads!", &I);
 
+  if (!DisableDebugInfoVerifier) {
+    MD = I.getMetadata(LLVMContext::MD_dbg);
+    Finder.processLocation(*Mod, DILocation(MD));
+  }
+
   InstsInThisBlock.insert(&I);
 }
 
@@ -1978,12 +2148,13 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
   using namespace Intrinsic;
 
   // If we ran out of descriptors, there are too many arguments.
-  if (Infos.empty()) return true; 
+  if (Infos.empty()) return true;
   IITDescriptor D = Infos.front();
   Infos = Infos.slice(1);
-  
+
   switch (D.Kind) {
   case IITDescriptor::Void: return !Ty->isVoidTy();
+  case IITDescriptor::VarArg: return true;
   case IITDescriptor::MMX:  return !Ty->isX86_MMXTy();
   case IITDescriptor::Metadata: return !Ty->isMetadataTy();
   case IITDescriptor::Half: return !Ty->isHalfTy();
@@ -2000,29 +2171,29 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
     return PT == 0 || PT->getAddressSpace() != D.Pointer_AddressSpace ||
            VerifyIntrinsicType(PT->getElementType(), Infos, ArgTys);
   }
-      
+
   case IITDescriptor::Struct: {
     StructType *ST = dyn_cast<StructType>(Ty);
     if (ST == 0 || ST->getNumElements() != D.Struct_NumElements)
       return true;
-    
+
     for (unsigned i = 0, e = D.Struct_NumElements; i != e; ++i)
       if (VerifyIntrinsicType(ST->getElementType(i), Infos, ArgTys))
         return true;
     return false;
   }
-      
+
   case IITDescriptor::Argument:
     // Two cases here - If this is the second occurrence of an argument, verify
-    // that the later instance matches the previous instance. 
+    // that the later instance matches the previous instance.
     if (D.getArgumentNumber() < ArgTys.size())
-      return Ty != ArgTys[D.getArgumentNumber()];  
-      
+      return Ty != ArgTys[D.getArgumentNumber()];
+
     // Otherwise, if this is the first instance of an argument, record it and
     // verify the "Any" kind.
     assert(D.getArgumentNumber() == ArgTys.size() && "Table consistency error");
     ArgTys.push_back(Ty);
-      
+
     switch (D.getArgumentKind()) {
     case IITDescriptor::AK_AnyInteger: return !Ty->isIntOrIntVectorTy();
     case IITDescriptor::AK_AnyFloat:   return !Ty->isFPOrFPVectorTy();
@@ -2030,7 +2201,7 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
     case IITDescriptor::AK_AnyPointer: return !isa<PointerType>(Ty);
     }
     llvm_unreachable("all argument kinds not covered");
-      
+
   case IITDescriptor::ExtendVecArgument:
     // This may only be used when referring to a previous vector argument.
     return D.getArgumentNumber() >= ArgTys.size() ||
@@ -2048,6 +2219,33 @@ bool Verifier::VerifyIntrinsicType(Type *Ty,
   llvm_unreachable("unhandled");
 }
 
+/// \brief Verify if the intrinsic has variable arguments.
+/// This method is intended to be called after all the fixed arguments have been
+/// verified first.
+///
+/// This method returns true on error and does not print an error message.
+bool
+Verifier::VerifyIntrinsicIsVarArg(bool isVarArg,
+                                  ArrayRef<Intrinsic::IITDescriptor> &Infos) {
+  using namespace Intrinsic;
+
+  // If there are no descriptors left, then it can't be a vararg.
+  if (Infos.empty())
+    return isVarArg ? true : false;
+
+  // There should be only one descriptor remaining at this point.
+  if (Infos.size() != 1)
+    return true;
+
+  // Check and verify the descriptor.
+  IITDescriptor D = Infos.front();
+  Infos = Infos.slice(1);
+  if (D.Kind == IITDescriptor::VarArg)
+    return isVarArg ? false : true;
+
+  return true;
+}
+
 /// visitIntrinsicFunction - Allow intrinsics to be verified in different ways.
 ///
 void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
@@ -2058,8 +2256,8 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   // Verify that the intrinsic prototype lines up with what the .td files
   // describe.
   FunctionType *IFTy = IF->getFunctionType();
-  Assert1(!IFTy->isVarArg(), "Intrinsic prototypes are not varargs", IF);
-  
+  bool IsVarArg = IFTy->isVarArg();
+
   SmallVector<Intrinsic::IITDescriptor, 8> Table;
   getIntrinsicInfoTableEntries(ID, Table);
   ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
@@ -2070,6 +2268,16 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   for (unsigned i = 0, e = IFTy->getNumParams(); i != e; ++i)
     Assert1(!VerifyIntrinsicType(IFTy->getParamType(i), TableRef, ArgTys),
             "Intrinsic has incorrect argument type!", IF);
+
+  // Verify if the intrinsic call matches the vararg property.
+  if (IsVarArg)
+    Assert1(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+            "Intrinsic was not defined with variable arguments!", IF);
+  else
+    Assert1(!VerifyIntrinsicIsVarArg(IsVarArg, TableRef),
+            "Callsite was not defined with variable arguments!", IF);
+
+  // All descriptors should be absorbed by now.
   Assert1(TableRef.empty(), "Intrinsic has too few arguments!", IF);
 
   // Now that we have the intrinsic ID and the actual argument types (and we
@@ -2078,7 +2286,7 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   // the name.
   Assert1(Intrinsic::getName(ID, ArgTys) == IF->getName(),
           "Intrinsic name not mangled correctly for type arguments!", IF);
-  
+
   // If the intrinsic takes MDNode arguments, verify that they are either global
   // or are local to *this* function.
   for (unsigned i = 0, e = CI.getNumArgOperands(); i != e; ++i)
@@ -2100,7 +2308,17 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
     MDNode *MD = cast<MDNode>(CI.getArgOperand(0));
     Assert1(MD->getNumOperands() == 1,
                 "invalid llvm.dbg.declare intrinsic call 2", &CI);
+    if (!DisableDebugInfoVerifier)
+      Finder.processDeclare(*Mod, cast<DbgDeclareInst>(&CI));
   } break;
+  case Intrinsic::dbg_value: { //llvm.dbg.value
+    if (!DisableDebugInfoVerifier) {
+      Assert1(CI.getArgOperand(0) && isa<MDNode>(CI.getArgOperand(0)),
+              "invalid llvm.dbg.value intrinsic call 1", &CI);
+      Finder.processValue(*Mod, cast<DbgValueInst>(&CI));
+    }
+    break;
+  }
   case Intrinsic::memcpy:
   case Intrinsic::memmove:
   case Intrinsic::memset:
@@ -2162,6 +2380,28 @@ void Verifier::visitIntrinsicFunctionCall(Intrinsic::ID ID, CallInst &CI) {
   }
 }
 
+void Verifier::verifyDebugInfo() {
+  // Verify Debug Info.
+  if (!DisableDebugInfoVerifier) {
+    for (DebugInfoFinder::iterator I = Finder.compile_unit_begin(),
+         E = Finder.compile_unit_end(); I != E; ++I)
+      Assert1(DICompileUnit(*I).Verify(), "DICompileUnit does not Verify!", *I);
+    for (DebugInfoFinder::iterator I = Finder.subprogram_begin(),
+         E = Finder.subprogram_end(); I != E; ++I)
+      Assert1(DISubprogram(*I).Verify(), "DISubprogram does not Verify!", *I);
+    for (DebugInfoFinder::iterator I = Finder.global_variable_begin(),
+         E = Finder.global_variable_end(); I != E; ++I)
+      Assert1(DIGlobalVariable(*I).Verify(),
+              "DIGlobalVariable does not Verify!", *I);
+    for (DebugInfoFinder::iterator I = Finder.type_begin(),
+         E = Finder.type_end(); I != E; ++I)
+      Assert1(DIType(*I).Verify(), "DIType does not Verify!", *I);
+    for (DebugInfoFinder::iterator I = Finder.scope_begin(),
+         E = Finder.scope_end(); I != E; ++I)
+      Assert1(DIScope(*I).Verify(), "DIScope does not Verify!", *I);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 //  Implement the public interfaces to this file...
 //===----------------------------------------------------------------------===//
@@ -2181,6 +2421,7 @@ bool llvm::verifyFunction(const Function &f, VerifierFailureAction action) {
   FunctionPassManager FPM(F.getParent());
   Verifier *V = new Verifier(action);
   FPM.add(V);
+  FPM.doInitialization();
   FPM.run(F);
   return V->Broken;
 }
diff --git a/lib/IRReader/IRReader.cpp b/lib/IRReader/IRReader.cpp
index eeec14e..935e81d 100644
--- a/lib/IRReader/IRReader.cpp
+++ b/lib/IRReader/IRReader.cpp
@@ -11,10 +11,15 @@
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Assembly/Parser.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/system_error.h"
 #include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
+#include "llvm-c/IRReader.h"
 
 using namespace llvm;
 
@@ -22,8 +27,8 @@ namespace llvm {
   extern bool TimePassesIsEnabled;
 }
 
-static const char *TimeIRParsingGroupName = "LLVM IR Parsing";
-static const char *TimeIRParsingName = "Parse IR";
+static const char *const TimeIRParsingGroupName = "LLVM IR Parsing";
+static const char *const TimeIRParsingName = "Parse IR";
 
 
 Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
@@ -48,7 +53,7 @@ Module *llvm::getLazyIRModule(MemoryBuffer *Buffer, SMDiagnostic &Err,
 Module *llvm::getLazyIRFileModule(const std::string &Filename, SMDiagnostic &Err,
                                   LLVMContext &Context) {
   OwningPtr<MemoryBuffer> File;
-  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) {
     Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
                        "Could not open input file: " + ec.message());
     return 0;
@@ -79,7 +84,7 @@ Module *llvm::ParseIR(MemoryBuffer *Buffer, SMDiagnostic &Err,
 Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
                           LLVMContext &Context) {
   OwningPtr<MemoryBuffer> File;
-  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename.c_str(), File)) {
+  if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, File)) {
     Err = SMDiagnostic(Filename, SourceMgr::DK_Error,
                        "Could not open input file: " + ec.message());
     return 0;
@@ -87,3 +92,30 @@ Module *llvm::ParseIRFile(const std::string &Filename, SMDiagnostic &Err,
 
   return ParseIR(File.take(), Err, Context);
 }
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMParseIRInContext(LLVMContextRef ContextRef,
+                              LLVMMemoryBufferRef MemBuf, LLVMModuleRef *OutM,
+                              char **OutMessage) {
+  SMDiagnostic Diag;
+
+  *OutM = wrap(ParseIR(unwrap(MemBuf), Diag, *unwrap(ContextRef)));
+
+  if(!*OutM) {
+    if (OutMessage) {
+      std::string buf;
+      raw_string_ostream os(buf);
+
+      Diag.print(NULL, os, false);
+      os.flush();
+
+      *OutMessage = strdup(buf.c_str());
+    }
+    return 1;
+  }
+
+  return 0;
+}
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 0565443..00280c8 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = Analysis Archive AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker IR IRReader MC Object Option Support TableGen Target Transforms
+subdirectories = Analysis AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker IR IRReader LTO MC Object Option Support TableGen Target Transforms
 
 [component_0]
 type = Group
diff --git a/lib/LTO/CMakeLists.txt b/lib/LTO/CMakeLists.txt
new file mode 100644
index 0000000..8e00bcb
--- /dev/null
+++ b/lib/LTO/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_llvm_library(LLVMLTO
+  LTOModule.cpp
+  LTOCodeGenerator.cpp
+  )
diff --git a/lib/Archive/LLVMBuild.txt b/lib/LTO/LLVMBuild.txt
index d68550b..38c1170 100644
--- a/lib/Archive/LLVMBuild.txt
+++ b/lib/LTO/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Archive/LLVMBuild.txt ------------------------------*- Conf -*--===;
+;===- ./lib/LTO/LLVMBuild.txt ----------------------------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -17,6 +17,6 @@
 
 [component_0]
 type = Library
-name = Archive
+name = LTO
 parent = Libraries
-required_libraries = BitReader Core Support
+required_libraries = Analysis BitReader BitWriter Core IPO Linker MC MCParser Scalar Support Target Vectorize
+\ No newline at end of file
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
new file mode 100644
index 0000000..2b3648e
--- /dev/null
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -0,0 +1,521 @@
+//===-LTOCodeGenerator.cpp - LLVM Link Time Optimizer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Link Time Optimization library. This library is
+// intended to be used by linker to optimize code at link time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/LTOCodeGenerator.h"
+#include "llvm/LTO/LTOModule.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/Config/config.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Linker.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/ObjCARC.h"
+using namespace llvm;
+
+const char* LTOCodeGenerator::getVersionString() {
+#ifdef LLVM_VERSION_INFO
+  return PACKAGE_NAME " version " PACKAGE_VERSION ", " LLVM_VERSION_INFO;
+#else
+  return PACKAGE_NAME " version " PACKAGE_VERSION;
+#endif
+}
+
+LTOCodeGenerator::LTOCodeGenerator()
+    : Context(getGlobalContext()), Linker(new Module("ld-temp.o", Context)),
+      TargetMach(NULL), EmitDwarfDebugInfo(false), ScopeRestrictionsDone(false),
+      CodeModel(LTO_CODEGEN_PIC_MODEL_DYNAMIC), NativeObjectFile(NULL) {
+  initializeLTOPasses();
+}
+
+LTOCodeGenerator::~LTOCodeGenerator() {
+  delete TargetMach;
+  delete NativeObjectFile;
+  TargetMach = NULL;
+  NativeObjectFile = NULL;
+
+  Linker.deleteModule();
+
+  for (std::vector<char *>::iterator I = CodegenOptions.begin(),
+                                     E = CodegenOptions.end();
+       I != E; ++I)
+    free(*I);
+}
+
+// Initialize LTO passes. Please keep this funciton in sync with
+// PassManagerBuilder::populateLTOPassManager(), and make sure all LTO
+// passes are initialized. 
+//
+void LTOCodeGenerator::initializeLTOPasses() {
+  PassRegistry &R = *PassRegistry::getPassRegistry();
+
+  initializeInternalizePassPass(R);
+  initializeIPSCCPPass(R);
+  initializeGlobalOptPass(R);
+  initializeConstantMergePass(R);
+  initializeDAHPass(R);
+  initializeInstCombinerPass(R);
+  initializeSimpleInlinerPass(R);
+  initializePruneEHPass(R);
+  initializeGlobalDCEPass(R);
+  initializeArgPromotionPass(R);
+  initializeJumpThreadingPass(R);
+  initializeSROAPass(R);
+  initializeSROA_DTPass(R);
+  initializeSROA_SSAUpPass(R);
+  initializeFunctionAttrsPass(R);
+  initializeGlobalsModRefPass(R);
+  initializeLICMPass(R);
+  initializeGVNPass(R);
+  initializeMemCpyOptPass(R);
+  initializeDCEPass(R);
+  initializeCFGSimplifyPassPass(R);
+}
+
+bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) {
+  bool ret = Linker.linkInModule(mod->getLLVVMModule(), &errMsg);
+
+  const std::vector<const char*> &undefs = mod->getAsmUndefinedRefs();
+  for (int i = 0, e = undefs.size(); i != e; ++i)
+    AsmUndefinedRefs[undefs[i]] = 1;
+
+  return !ret;
+}
+
+void LTOCodeGenerator::setTargetOptions(TargetOptions options) {
+  Options.LessPreciseFPMADOption = options.LessPreciseFPMADOption;
+  Options.NoFramePointerElim = options.NoFramePointerElim;
+  Options.AllowFPOpFusion = options.AllowFPOpFusion;
+  Options.UnsafeFPMath = options.UnsafeFPMath;
+  Options.NoInfsFPMath = options.NoInfsFPMath;
+  Options.NoNaNsFPMath = options.NoNaNsFPMath;
+  Options.HonorSignDependentRoundingFPMathOption =
+    options.HonorSignDependentRoundingFPMathOption;
+  Options.UseSoftFloat = options.UseSoftFloat;
+  Options.FloatABIType = options.FloatABIType;
+  Options.NoZerosInBSS = options.NoZerosInBSS;
+  Options.GuaranteedTailCallOpt = options.GuaranteedTailCallOpt;
+  Options.DisableTailCalls = options.DisableTailCalls;
+  Options.StackAlignmentOverride = options.StackAlignmentOverride;
+  Options.TrapFuncName = options.TrapFuncName;
+  Options.PositionIndependentExecutable = options.PositionIndependentExecutable;
+  Options.EnableSegmentedStacks = options.EnableSegmentedStacks;
+  Options.UseInitArray = options.UseInitArray;
+}
+
+void LTOCodeGenerator::setDebugInfo(lto_debug_model debug) {
+  switch (debug) {
+  case LTO_DEBUG_MODEL_NONE:
+    EmitDwarfDebugInfo = false;
+    return;
+
+  case LTO_DEBUG_MODEL_DWARF:
+    EmitDwarfDebugInfo = true;
+    return;
+  }
+  llvm_unreachable("Unknown debug format!");
+}
+
+void LTOCodeGenerator::setCodePICModel(lto_codegen_model model) {
+  switch (model) {
+  case LTO_CODEGEN_PIC_MODEL_STATIC:
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
+    CodeModel = model;
+    return;
+  }
+  llvm_unreachable("Unknown PIC model!");
+}
+
+bool LTOCodeGenerator::writeMergedModules(const char *path,
+                                          std::string &errMsg) {
+  if (!determineTarget(errMsg))
+    return false;
+
+  // mark which symbols can not be internalized
+  applyScopeRestrictions();
+
+  // create output file
+  std::string ErrInfo;
+  tool_output_file Out(path, ErrInfo, sys::fs::F_Binary);
+  if (!ErrInfo.empty()) {
+    errMsg = "could not open bitcode file for writing: ";
+    errMsg += path;
+    return false;
+  }
+
+  // write bitcode to it
+  WriteBitcodeToFile(Linker.getModule(), Out.os());
+  Out.os().close();
+
+  if (Out.os().has_error()) {
+    errMsg = "could not write bitcode file: ";
+    errMsg += path;
+    Out.os().clear_error();
+    return false;
+  }
+
+  Out.keep();
+  return true;
+}
+
+bool LTOCodeGenerator::compile_to_file(const char** name, 
+                                       bool disableOpt,
+                                       bool disableInline,
+                                       bool disableGVNLoadPRE,
+                                       std::string& errMsg) {
+  // make unique temp .o file to put generated object file
+  SmallString<128> Filename;
+  int FD;
+  error_code EC = sys::fs::createTemporaryFile("lto-llvm", "o", FD, Filename);
+  if (EC) {
+    errMsg = EC.message();
+    return false;
+  }
+
+  // generate object file
+  tool_output_file objFile(Filename.c_str(), FD);
+
+  bool genResult = generateObjectFile(objFile.os(), disableOpt, disableInline,
+                                      disableGVNLoadPRE, errMsg);
+  objFile.os().close();
+  if (objFile.os().has_error()) {
+    objFile.os().clear_error();
+    sys::fs::remove(Twine(Filename));
+    return false;
+  }
+
+  objFile.keep();
+  if (!genResult) {
+    sys::fs::remove(Twine(Filename));
+    return false;
+  }
+
+  NativeObjectPath = Filename.c_str();
+  *name = NativeObjectPath.c_str();
+  return true;
+}
+
+const void* LTOCodeGenerator::compile(size_t* length,
+                                      bool disableOpt,
+                                      bool disableInline,
+                                      bool disableGVNLoadPRE,
+                                      std::string& errMsg) {
+  const char *name;
+  if (!compile_to_file(&name, disableOpt, disableInline, disableGVNLoadPRE,
+                       errMsg))
+    return NULL;
+
+  // remove old buffer if compile() called twice
+  delete NativeObjectFile;
+
+  // read .o file into memory buffer
+  OwningPtr<MemoryBuffer> BuffPtr;
+  if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) {
+    errMsg = ec.message();
+    sys::fs::remove(NativeObjectPath);
+    return NULL;
+  }
+  NativeObjectFile = BuffPtr.take();
+
+  // remove temp files
+  sys::fs::remove(NativeObjectPath);
+
+  // return buffer, unless error
+  if (NativeObjectFile == NULL)
+    return NULL;
+  *length = NativeObjectFile->getBufferSize();
+  return NativeObjectFile->getBufferStart();
+}
+
+bool LTOCodeGenerator::determineTarget(std::string &errMsg) {
+  if (TargetMach != NULL)
+    return true;
+
+  std::string TripleStr = Linker.getModule()->getTargetTriple();
+  if (TripleStr.empty())
+    TripleStr = sys::getDefaultTargetTriple();
+  llvm::Triple Triple(TripleStr);
+
+  // create target machine from info for merged modules
+  const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
+  if (march == NULL)
+    return false;
+
+  // The relocation model is actually a static member of TargetMachine and
+  // needs to be set before the TargetMachine is instantiated.
+  Reloc::Model RelocModel = Reloc::Default;
+  switch (CodeModel) {
+  case LTO_CODEGEN_PIC_MODEL_STATIC:
+    RelocModel = Reloc::Static;
+    break;
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC:
+    RelocModel = Reloc::PIC_;
+    break;
+  case LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC:
+    RelocModel = Reloc::DynamicNoPIC;
+    break;
+  }
+
+  // construct LTOModule, hand over ownership of module and target
+  SubtargetFeatures Features;
+  Features.getDefaultSubtargetFeatures(Triple);
+  std::string FeatureStr = Features.getString();
+  // Set a default CPU for Darwin triples.
+  if (MCpu.empty() && Triple.isOSDarwin()) {
+    if (Triple.getArch() == llvm::Triple::x86_64)
+      MCpu = "core2";
+    else if (Triple.getArch() == llvm::Triple::x86)
+      MCpu = "yonah";
+  }
+
+  TargetMach = march->createTargetMachine(TripleStr, MCpu, FeatureStr, Options,
+                                          RelocModel, CodeModel::Default,
+                                          CodeGenOpt::Aggressive);
+  return true;
+}
+
+void LTOCodeGenerator::
+applyRestriction(GlobalValue &GV,
+                 const ArrayRef<StringRef> &Libcalls,
+                 std::vector<const char*> &MustPreserveList,
+                 SmallPtrSet<GlobalValue*, 8> &AsmUsed,
+                 Mangler &Mangler) {
+  SmallString<64> Buffer;
+  Mangler.getNameWithPrefix(Buffer, &GV, false);
+
+  if (GV.isDeclaration())
+    return;
+  if (MustPreserveSymbols.count(Buffer))
+    MustPreserveList.push_back(GV.getName().data());
+  if (AsmUndefinedRefs.count(Buffer))
+    AsmUsed.insert(&GV);
+
+  // Conservatively append user-supplied runtime library functions to
+  // llvm.compiler.used.  These could be internalized and deleted by
+  // optimizations like -globalopt, causing problems when later optimizations
+  // add new library calls (e.g., llvm.memset => memset and printf => puts).
+  // Leave it to the linker to remove any dead code (e.g. with -dead_strip).
+  if (isa<Function>(GV) &&
+      std::binary_search(Libcalls.begin(), Libcalls.end(), GV.getName()))
+    AsmUsed.insert(&GV);
+}
+
+static void findUsedValues(GlobalVariable *LLVMUsed,
+                           SmallPtrSet<GlobalValue*, 8> &UsedValues) {
+  if (LLVMUsed == 0) return;
+
+  ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+  for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+    if (GlobalValue *GV =
+        dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+      UsedValues.insert(GV);
+}
+
+static void accumulateAndSortLibcalls(std::vector<StringRef> &Libcalls,
+                                      const TargetLibraryInfo& TLI,
+                                      const TargetLowering *Lowering)
+{
+  // TargetLibraryInfo has info on C runtime library calls on the current
+  // target.
+  for (unsigned I = 0, E = static_cast<unsigned>(LibFunc::NumLibFuncs);
+       I != E; ++I) {
+    LibFunc::Func F = static_cast<LibFunc::Func>(I);
+    if (TLI.has(F))
+      Libcalls.push_back(TLI.getName(F));
+  }
+
+  // TargetLowering has info on library calls that CodeGen expects to be
+  // available, both from the C runtime and compiler-rt.
+  if (Lowering)
+    for (unsigned I = 0, E = static_cast<unsigned>(RTLIB::UNKNOWN_LIBCALL);
+         I != E; ++I)
+      if (const char *Name
+          = Lowering->getLibcallName(static_cast<RTLIB::Libcall>(I)))
+        Libcalls.push_back(Name);
+
+  array_pod_sort(Libcalls.begin(), Libcalls.end());
+  Libcalls.erase(std::unique(Libcalls.begin(), Libcalls.end()),
+                 Libcalls.end());
+}
+
+void LTOCodeGenerator::applyScopeRestrictions() {
+  if (ScopeRestrictionsDone)
+    return;
+  Module *mergedModule = Linker.getModule();
+
+  // Start off with a verification pass.
+  PassManager passes;
+  passes.add(createVerifierPass());
+
+  // mark which symbols can not be internalized
+  Mangler Mangler(TargetMach);
+  std::vector<const char*> MustPreserveList;
+  SmallPtrSet<GlobalValue*, 8> AsmUsed;
+  std::vector<StringRef> Libcalls;
+  TargetLibraryInfo TLI(Triple(TargetMach->getTargetTriple()));
+  accumulateAndSortLibcalls(Libcalls, TLI, TargetMach->getTargetLowering());
+
+  for (Module::iterator f = mergedModule->begin(),
+         e = mergedModule->end(); f != e; ++f)
+    applyRestriction(*f, Libcalls, MustPreserveList, AsmUsed, Mangler);
+  for (Module::global_iterator v = mergedModule->global_begin(),
+         e = mergedModule->global_end(); v !=  e; ++v)
+    applyRestriction(*v, Libcalls, MustPreserveList, AsmUsed, Mangler);
+  for (Module::alias_iterator a = mergedModule->alias_begin(),
+         e = mergedModule->alias_end(); a != e; ++a)
+    applyRestriction(*a, Libcalls, MustPreserveList, AsmUsed, Mangler);
+
+  GlobalVariable *LLVMCompilerUsed =
+    mergedModule->getGlobalVariable("llvm.compiler.used");
+  findUsedValues(LLVMCompilerUsed, AsmUsed);
+  if (LLVMCompilerUsed)
+    LLVMCompilerUsed->eraseFromParent();
+
+  if (!AsmUsed.empty()) {
+    llvm::Type *i8PTy = llvm::Type::getInt8PtrTy(Context);
+    std::vector<Constant*> asmUsed2;
+    for (SmallPtrSet<GlobalValue*, 16>::const_iterator i = AsmUsed.begin(),
+           e = AsmUsed.end(); i !=e; ++i) {
+      GlobalValue *GV = *i;
+      Constant *c = ConstantExpr::getBitCast(GV, i8PTy);
+      asmUsed2.push_back(c);
+    }
+
+    llvm::ArrayType *ATy = llvm::ArrayType::get(i8PTy, asmUsed2.size());
+    LLVMCompilerUsed =
+      new llvm::GlobalVariable(*mergedModule, ATy, false,
+                               llvm::GlobalValue::AppendingLinkage,
+                               llvm::ConstantArray::get(ATy, asmUsed2),
+                               "llvm.compiler.used");
+
+    LLVMCompilerUsed->setSection("llvm.metadata");
+  }
+
+  passes.add(createInternalizePass(MustPreserveList));
+
+  // apply scope restrictions
+  passes.run(*mergedModule);
+
+  ScopeRestrictionsDone = true;
+}
+
+/// Optimize merged modules using various IPO passes
+bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
+                                          bool DisableOpt,
+                                          bool DisableInline,
+                                          bool DisableGVNLoadPRE,
+                                          std::string &errMsg) {
+  if (!this->determineTarget(errMsg))
+    return false;
+
+  Module *mergedModule = Linker.getModule();
+
+  // Mark which symbols can not be internalized
+  this->applyScopeRestrictions();
+
+  // Instantiate the pass manager to organize the passes.
+  PassManager passes;
+
+  // Start off with a verification pass.
+  passes.add(createVerifierPass());
+
+  // Add an appropriate DataLayout instance for this module...
+  passes.add(new DataLayout(*TargetMach->getDataLayout()));
+  TargetMach->addAnalysisPasses(passes);
+
+  // Enabling internalize here would use its AllButMain variant. It
+  // keeps only main if it exists and does nothing for libraries. Instead
+  // we create the pass ourselves with the symbol list provided by the linker.
+  if (!DisableOpt)
+    PassManagerBuilder().populateLTOPassManager(passes,
+                                              /*Internalize=*/false,
+                                              !DisableInline,
+                                              DisableGVNLoadPRE);
+
+  // Make sure everything is still good.
+  passes.add(createVerifierPass());
+
+  PassManager codeGenPasses;
+
+  codeGenPasses.add(new DataLayout(*TargetMach->getDataLayout()));
+  TargetMach->addAnalysisPasses(codeGenPasses);
+
+  formatted_raw_ostream Out(out);
+
+  // If the bitcode files contain ARC code and were compiled with optimization,
+  // the ObjCARCContractPass must be run, so do it unconditionally here.
+  codeGenPasses.add(createObjCARCContractPass());
+
+  if (TargetMach->addPassesToEmitFile(codeGenPasses, Out,
+                                      TargetMachine::CGFT_ObjectFile)) {
+    errMsg = "target file type not supported";
+    return false;
+  }
+
+  // Run our queue of passes all at once now, efficiently.
+  passes.run(*mergedModule);
+
+  // Run the code generator, and write assembly file
+  codeGenPasses.run(*mergedModule);
+
+  return true;
+}
+
+/// setCodeGenDebugOptions - Set codegen debugging options to aid in debugging
+/// LTO problems.
+void LTOCodeGenerator::setCodeGenDebugOptions(const char *options) {
+  for (std::pair<StringRef, StringRef> o = getToken(options);
+       !o.first.empty(); o = getToken(o.second)) {
+    // ParseCommandLineOptions() expects argv[0] to be program name. Lazily add
+    // that.
+    if (CodegenOptions.empty())
+      CodegenOptions.push_back(strdup("libLLVMLTO"));
+    CodegenOptions.push_back(strdup(o.first.str().c_str()));
+  }
+}
+
+void LTOCodeGenerator::parseCodeGenDebugOptions() {
+  // if options were requested, set them
+  if (!CodegenOptions.empty())
+    cl::ParseCommandLineOptions(CodegenOptions.size(),
+                                const_cast<char **>(&CodegenOptions[0]));
+}
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
new file mode 100644
index 0000000..65416be
--- /dev/null
+++ b/lib/LTO/LTOModule.cpp
@@ -0,0 +1,794 @@
+//===-- LTOModule.cpp - LLVM Link Time Optimizer --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Link Time Optimization library. This library is
+// intended to be used by linker to optimize code at link time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/LTO/LTOModule.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetAsmParser.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+using namespace llvm;
+
+LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t)
+  : _module(m), _target(t),
+    _context(_target->getMCAsmInfo(), _target->getRegisterInfo(), NULL),
+    _mangler(t) {}
+
+/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
+/// bitcode.
+bool LTOModule::isBitcodeFile(const void *mem, size_t length) {
+  return sys::fs::identify_magic(StringRef((const char *)mem, length)) ==
+         sys::fs::file_magic::bitcode;
+}
+
+bool LTOModule::isBitcodeFile(const char *path) {
+  sys::fs::file_magic type;
+  if (sys::fs::identify_magic(path, type))
+    return false;
+  return type == sys::fs::file_magic::bitcode;
+}
+
+/// isBitcodeFileForTarget - Returns 'true' if the file (or memory contents) is
+/// LLVM bitcode for the specified triple.
+bool LTOModule::isBitcodeFileForTarget(const void *mem, size_t length,
+                                       const char *triplePrefix) {
+  MemoryBuffer *buffer = makeBuffer(mem, length);
+  if (!buffer)
+    return false;
+  return isTargetMatch(buffer, triplePrefix);
+}
+
+bool LTOModule::isBitcodeFileForTarget(const char *path,
+                                       const char *triplePrefix) {
+  OwningPtr<MemoryBuffer> buffer;
+  if (MemoryBuffer::getFile(path, buffer))
+    return false;
+  return isTargetMatch(buffer.take(), triplePrefix);
+}
+
+/// isTargetMatch - Returns 'true' if the memory buffer is for the specified
+/// target triple.
+bool LTOModule::isTargetMatch(MemoryBuffer *buffer, const char *triplePrefix) {
+  std::string Triple = getBitcodeTargetTriple(buffer, getGlobalContext());
+  delete buffer;
+  return strncmp(Triple.c_str(), triplePrefix, strlen(triplePrefix)) == 0;
+}
+
+/// makeLTOModule - Create an LTOModule. N.B. These methods take ownership of
+/// the buffer.
+LTOModule *LTOModule::makeLTOModule(const char *path, TargetOptions options,
+                                    std::string &errMsg) {
+  OwningPtr<MemoryBuffer> buffer;
+  if (error_code ec = MemoryBuffer::getFile(path, buffer)) {
+    errMsg = ec.message();
+    return NULL;
+  }
+  return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
+                                    size_t size, TargetOptions options,
+                                    std::string &errMsg) {
+  return makeLTOModule(fd, path, size, 0, options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(int fd, const char *path,
+                                    size_t map_size,
+                                    off_t offset,
+                                    TargetOptions options,
+                                    std::string &errMsg) {
+  OwningPtr<MemoryBuffer> buffer;
+  if (error_code ec =
+          MemoryBuffer::getOpenFileSlice(fd, path, buffer, map_size, offset)) {
+    errMsg = ec.message();
+    return NULL;
+  }
+  return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(const void *mem, size_t length,
+                                    TargetOptions options,
+                                    std::string &errMsg) {
+  OwningPtr<MemoryBuffer> buffer(makeBuffer(mem, length));
+  if (!buffer)
+    return NULL;
+  return makeLTOModule(buffer.take(), options, errMsg);
+}
+
+LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
+                                    TargetOptions options,
+                                    std::string &errMsg) {
+  // parse bitcode buffer
+  OwningPtr<Module> m(getLazyBitcodeModule(buffer, getGlobalContext(),
+                                           &errMsg));
+  if (!m) {
+    delete buffer;
+    return NULL;
+  }
+
+  std::string TripleStr = m->getTargetTriple();
+  if (TripleStr.empty())
+    TripleStr = sys::getDefaultTargetTriple();
+  llvm::Triple Triple(TripleStr);
+
+  // find machine architecture for this module
+  const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
+  if (!march)
+    return NULL;
+
+  // construct LTOModule, hand over ownership of module and target
+  SubtargetFeatures Features;
+  Features.getDefaultSubtargetFeatures(Triple);
+  std::string FeatureStr = Features.getString();
+  // Set a default CPU for Darwin triples.
+  std::string CPU;
+  if (Triple.isOSDarwin()) {
+    if (Triple.getArch() == llvm::Triple::x86_64)
+      CPU = "core2";
+    else if (Triple.getArch() == llvm::Triple::x86)
+      CPU = "yonah";
+  }
+
+  TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
+                                                     options);
+  m->MaterializeAllPermanently();
+
+  LTOModule *Ret = new LTOModule(m.take(), target);
+  if (Ret->parseSymbols(errMsg)) {
+    delete Ret;
+    return NULL;
+  }
+
+  return Ret;
+}
+
+/// makeBuffer - Create a MemoryBuffer from a memory range.
+MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) {
+  const char *startPtr = (const char*)mem;
+  return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), "", false);
+}
+
+/// objcClassNameFromExpression - Get string that the data pointer points to.
+bool
+LTOModule::objcClassNameFromExpression(const Constant *c, std::string &name) {
+  if (const ConstantExpr *ce = dyn_cast<ConstantExpr>(c)) {
+    Constant *op = ce->getOperand(0);
+    if (GlobalVariable *gvn = dyn_cast<GlobalVariable>(op)) {
+      Constant *cn = gvn->getInitializer();
+      if (ConstantDataArray *ca = dyn_cast<ConstantDataArray>(cn)) {
+        if (ca->isCString()) {
+          name = ".objc_class_name_" + ca->getAsCString().str();
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
+
+/// addObjCClass - Parse i386/ppc ObjC class data structure.
+void LTOModule::addObjCClass(const GlobalVariable *clgv) {
+  const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
+  if (!c) return;
+
+  // second slot in __OBJC,__class is pointer to superclass name
+  std::string superclassName;
+  if (objcClassNameFromExpression(c->getOperand(1), superclassName)) {
+    NameAndAttributes info;
+    StringMap<NameAndAttributes>::value_type &entry =
+      _undefines.GetOrCreateValue(superclassName);
+    if (!entry.getValue().name) {
+      const char *symbolName = entry.getKey().data();
+      info.name = symbolName;
+      info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+      info.isFunction = false;
+      info.symbol = clgv;
+      entry.setValue(info);
+    }
+  }
+
+  // third slot in __OBJC,__class is pointer to class name
+  std::string className;
+  if (objcClassNameFromExpression(c->getOperand(2), className)) {
+    StringSet::value_type &entry = _defines.GetOrCreateValue(className);
+    entry.setValue(1);
+
+    NameAndAttributes info;
+    info.name = entry.getKey().data();
+    info.attributes = LTO_SYMBOL_PERMISSIONS_DATA |
+      LTO_SYMBOL_DEFINITION_REGULAR | LTO_SYMBOL_SCOPE_DEFAULT;
+    info.isFunction = false;
+    info.symbol = clgv;
+    _symbols.push_back(info);
+  }
+}
+
+/// addObjCCategory - Parse i386/ppc ObjC category data structure.
+void LTOModule::addObjCCategory(const GlobalVariable *clgv) {
+  const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer());
+  if (!c) return;
+
+  // second slot in __OBJC,__category is pointer to target class name
+  std::string targetclassName;
+  if (!objcClassNameFromExpression(c->getOperand(1), targetclassName))
+    return;
+
+  NameAndAttributes info;
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(targetclassName);
+
+  if (entry.getValue().name)
+    return;
+
+  const char *symbolName = entry.getKey().data();
+  info.name = symbolName;
+  info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+  info.isFunction = false;
+  info.symbol = clgv;
+  entry.setValue(info);
+}
+
+/// addObjCClassRef - Parse i386/ppc ObjC class list data structure.
+void LTOModule::addObjCClassRef(const GlobalVariable *clgv) {
+  std::string targetclassName;
+  if (!objcClassNameFromExpression(clgv->getInitializer(), targetclassName))
+    return;
+
+  NameAndAttributes info;
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(targetclassName);
+  if (entry.getValue().name)
+    return;
+
+  const char *symbolName = entry.getKey().data();
+  info.name = symbolName;
+  info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+  info.isFunction = false;
+  info.symbol = clgv;
+  entry.setValue(info);
+}
+
+/// addDefinedDataSymbol - Add a data symbol as defined to the list.
+void LTOModule::addDefinedDataSymbol(const GlobalValue *v) {
+  // Add to list of defined symbols.
+  addDefinedSymbol(v, false);
+
+  if (!v->hasSection() /* || !isTargetDarwin */)
+    return;
+
+  // Special case i386/ppc ObjC data structures in magic sections:
+  // The issue is that the old ObjC object format did some strange
+  // contortions to avoid real linker symbols.  For instance, the
+  // ObjC class data structure is allocated statically in the executable
+  // that defines that class.  That data structures contains a pointer to
+  // its superclass.  But instead of just initializing that part of the
+  // struct to the address of its superclass, and letting the static and
+  // dynamic linkers do the rest, the runtime works by having that field
+  // instead point to a C-string that is the name of the superclass.
+  // At runtime the objc initialization updates that pointer and sets
+  // it to point to the actual super class.  As far as the linker
+  // knows it is just a pointer to a string.  But then someone wanted the
+  // linker to issue errors at build time if the superclass was not found.
+  // So they figured out a way in mach-o object format to use an absolute
+  // symbols (.objc_class_name_Foo = 0) and a floating reference
+  // (.reference .objc_class_name_Bar) to cause the linker into erroring when
+  // a class was missing.
+  // The following synthesizes the implicit .objc_* symbols for the linker
+  // from the ObjC data structures generated by the front end.
+
+  // special case if this data blob is an ObjC class definition
+  if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) {
+    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+      addObjCClass(gv);
+    }
+  }
+
+  // special case if this data blob is an ObjC category definition
+  else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) {
+    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+      addObjCCategory(gv);
+    }
+  }
+
+  // special case if this data blob is the list of referenced classes
+  else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) {
+    if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) {
+      addObjCClassRef(gv);
+    }
+  }
+}
+
+/// addDefinedFunctionSymbol - Add a function symbol as defined to the list.
+void LTOModule::addDefinedFunctionSymbol(const Function *f) {
+  // add to list of defined symbols
+  addDefinedSymbol(f, true);
+}
+
+static bool canBeHidden(const GlobalValue *GV) {
+  GlobalValue::LinkageTypes L = GV->getLinkage();
+
+  if (L != GlobalValue::LinkOnceODRLinkage)
+    return false;
+
+  if (GV->hasUnnamedAddr())
+    return true;
+
+  GlobalStatus GS;
+  if (GlobalStatus::analyzeGlobal(GV, GS))
+    return false;
+
+  return !GS.IsCompared;
+}
+
+/// addDefinedSymbol - Add a defined symbol to the list.
+void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) {
+  // ignore all llvm.* symbols
+  if (def->getName().startswith("llvm."))
+    return;
+
+  // string is owned by _defines
+  SmallString<64> Buffer;
+  _mangler.getNameWithPrefix(Buffer, def, false);
+
+  // set alignment part log2() can have rounding errors
+  uint32_t align = def->getAlignment();
+  uint32_t attr = align ? countTrailingZeros(def->getAlignment()) : 0;
+
+  // set permissions part
+  if (isFunction) {
+    attr |= LTO_SYMBOL_PERMISSIONS_CODE;
+  } else {
+    const GlobalVariable *gv = dyn_cast<GlobalVariable>(def);
+    if (gv && gv->isConstant())
+      attr |= LTO_SYMBOL_PERMISSIONS_RODATA;
+    else
+      attr |= LTO_SYMBOL_PERMISSIONS_DATA;
+  }
+
+  // set definition part
+  if (def->hasWeakLinkage() || def->hasLinkOnceLinkage() ||
+      def->hasLinkerPrivateWeakLinkage())
+    attr |= LTO_SYMBOL_DEFINITION_WEAK;
+  else if (def->hasCommonLinkage())
+    attr |= LTO_SYMBOL_DEFINITION_TENTATIVE;
+  else
+    attr |= LTO_SYMBOL_DEFINITION_REGULAR;
+
+  // set scope part
+  if (def->hasHiddenVisibility())
+    attr |= LTO_SYMBOL_SCOPE_HIDDEN;
+  else if (def->hasProtectedVisibility())
+    attr |= LTO_SYMBOL_SCOPE_PROTECTED;
+  else if (canBeHidden(def))
+    attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
+  else if (def->hasExternalLinkage() || def->hasWeakLinkage() ||
+           def->hasLinkOnceLinkage() || def->hasCommonLinkage() ||
+           def->hasLinkerPrivateWeakLinkage())
+    attr |= LTO_SYMBOL_SCOPE_DEFAULT;
+  else
+    attr |= LTO_SYMBOL_SCOPE_INTERNAL;
+
+  StringSet::value_type &entry = _defines.GetOrCreateValue(Buffer);
+  entry.setValue(1);
+
+  // fill information structure
+  NameAndAttributes info;
+  StringRef Name = entry.getKey();
+  info.name = Name.data();
+  assert(info.name[Name.size()] == '\0');
+  info.attributes = attr;
+  info.isFunction = isFunction;
+  info.symbol = def;
+
+  // add to table of symbols
+  _symbols.push_back(info);
+}
+
+/// addAsmGlobalSymbol - Add a global symbol from module-level ASM to the
+/// defined list.
+void LTOModule::addAsmGlobalSymbol(const char *name,
+                                   lto_symbol_attributes scope) {
+  StringSet::value_type &entry = _defines.GetOrCreateValue(name);
+
+  // only add new define if not already defined
+  if (entry.getValue())
+    return;
+
+  entry.setValue(1);
+
+  NameAndAttributes &info = _undefines[entry.getKey().data()];
+
+  if (info.symbol == 0) {
+    // FIXME: This is trying to take care of module ASM like this:
+    //
+    //   module asm ".zerofill __FOO, __foo, _bar_baz_qux, 0"
+    //
+    // but is gross and its mother dresses it funny. Have the ASM parser give us
+    // more details for this type of situation so that we're not guessing so
+    // much.
+
+    // fill information structure
+    info.name = entry.getKey().data();
+    info.attributes =
+      LTO_SYMBOL_PERMISSIONS_DATA | LTO_SYMBOL_DEFINITION_REGULAR | scope;
+    info.isFunction = false;
+    info.symbol = 0;
+
+    // add to table of symbols
+    _symbols.push_back(info);
+    return;
+  }
+
+  if (info.isFunction)
+    addDefinedFunctionSymbol(cast<Function>(info.symbol));
+  else
+    addDefinedDataSymbol(info.symbol);
+
+  _symbols.back().attributes &= ~LTO_SYMBOL_SCOPE_MASK;
+  _symbols.back().attributes |= scope;
+}
+
+/// addAsmGlobalSymbolUndef - Add a global symbol from module-level ASM to the
+/// undefined list.
+void LTOModule::addAsmGlobalSymbolUndef(const char *name) {
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(name);
+
+  _asm_undefines.push_back(entry.getKey().data());
+
+  // we already have the symbol
+  if (entry.getValue().name)
+    return;
+
+  uint32_t attr = LTO_SYMBOL_DEFINITION_UNDEFINED;;
+  attr |= LTO_SYMBOL_SCOPE_DEFAULT;
+  NameAndAttributes info;
+  info.name = entry.getKey().data();
+  info.attributes = attr;
+  info.isFunction = false;
+  info.symbol = 0;
+
+  entry.setValue(info);
+}
+
+/// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet to a
+/// list to be resolved later.
+void
+LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) {
+  // ignore all llvm.* symbols
+  if (decl->getName().startswith("llvm."))
+    return;
+
+  // ignore all aliases
+  if (isa<GlobalAlias>(decl))
+    return;
+
+  SmallString<64> name;
+  _mangler.getNameWithPrefix(name, decl, false);
+
+  StringMap<NameAndAttributes>::value_type &entry =
+    _undefines.GetOrCreateValue(name);
+
+  // we already have the symbol
+  if (entry.getValue().name)
+    return;
+
+  NameAndAttributes info;
+
+  info.name = entry.getKey().data();
+
+  if (decl->hasExternalWeakLinkage())
+    info.attributes = LTO_SYMBOL_DEFINITION_WEAKUNDEF;
+  else
+    info.attributes = LTO_SYMBOL_DEFINITION_UNDEFINED;
+
+  info.isFunction = isFunc;
+  info.symbol = decl;
+
+  entry.setValue(info);
+}
+
+namespace {
+  class RecordStreamer : public MCStreamer {
+  public:
+    enum State { NeverSeen, Global, Defined, DefinedGlobal, Used };
+
+  private:
+    StringMap<State> Symbols;
+
+    void markDefined(const MCSymbol &Symbol) {
+      State &S = Symbols[Symbol.getName()];
+      switch (S) {
+      case DefinedGlobal:
+      case Global:
+        S = DefinedGlobal;
+        break;
+      case NeverSeen:
+      case Defined:
+      case Used:
+        S = Defined;
+        break;
+      }
+    }
+    void markGlobal(const MCSymbol &Symbol) {
+      State &S = Symbols[Symbol.getName()];
+      switch (S) {
+      case DefinedGlobal:
+      case Defined:
+        S = DefinedGlobal;
+        break;
+
+      case NeverSeen:
+      case Global:
+      case Used:
+        S = Global;
+        break;
+      }
+    }
+    void markUsed(const MCSymbol &Symbol) {
+      State &S = Symbols[Symbol.getName()];
+      switch (S) {
+      case DefinedGlobal:
+      case Defined:
+      case Global:
+        break;
+
+      case NeverSeen:
+      case Used:
+        S = Used;
+        break;
+      }
+    }
+
+    // FIXME: mostly copied for the obj streamer.
+    void AddValueSymbols(const MCExpr *Value) {
+      switch (Value->getKind()) {
+      case MCExpr::Target:
+        // FIXME: What should we do in here?
+        break;
+
+      case MCExpr::Constant:
+        break;
+
+      case MCExpr::Binary: {
+        const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+        AddValueSymbols(BE->getLHS());
+        AddValueSymbols(BE->getRHS());
+        break;
+      }
+
+      case MCExpr::SymbolRef:
+        markUsed(cast<MCSymbolRefExpr>(Value)->getSymbol());
+        break;
+
+      case MCExpr::Unary:
+        AddValueSymbols(cast<MCUnaryExpr>(Value)->getSubExpr());
+        break;
+      }
+    }
+
+  public:
+    typedef StringMap<State>::const_iterator const_iterator;
+
+    const_iterator begin() {
+      return Symbols.begin();
+    }
+
+    const_iterator end() {
+      return Symbols.end();
+    }
+
+    RecordStreamer(MCContext &Context) : MCStreamer(Context, 0) {}
+
+    virtual void EmitInstruction(const MCInst &Inst) {
+      // Scan for values.
+      for (unsigned i = Inst.getNumOperands(); i--; )
+        if (Inst.getOperand(i).isExpr())
+          AddValueSymbols(Inst.getOperand(i).getExpr());
+    }
+    virtual void EmitLabel(MCSymbol *Symbol) {
+      Symbol->setSection(*getCurrentSection().first);
+      markDefined(*Symbol);
+    }
+    virtual void EmitDebugLabel(MCSymbol *Symbol) {
+      EmitLabel(Symbol);
+    }
+    virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+      // FIXME: should we handle aliases?
+      markDefined(*Symbol);
+    }
+    virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
+      if (Attribute == MCSA_Global)
+        markGlobal(*Symbol);
+      return true;
+    }
+    virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
+                              uint64_t Size , unsigned ByteAlignment) {
+      markDefined(*Symbol);
+    }
+    virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                  unsigned ByteAlignment) {
+      markDefined(*Symbol);
+    }
+
+    virtual void EmitBundleAlignMode(unsigned AlignPow2) {}
+    virtual void EmitBundleLock(bool AlignToEnd) {}
+    virtual void EmitBundleUnlock() {}
+
+    // Noop calls.
+    virtual void ChangeSection(const MCSection *Section,
+                               const MCExpr *Subsection) {}
+    virtual void InitToTextSection() {}
+    virtual void InitSections() {}
+    virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {}
+    virtual void EmitThumbFunc(MCSymbol *Func) {}
+    virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
+    virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
+    virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
+    virtual void EmitCOFFSymbolStorageClass(int StorageClass) {}
+    virtual void EmitCOFFSymbolType(int Type) {}
+    virtual void EndCOFFSymbolDef() {}
+    virtual void EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+    virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                                       unsigned ByteAlignment) {}
+    virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
+                                uint64_t Size, unsigned ByteAlignment) {}
+    virtual void EmitBytes(StringRef Data) {}
+    virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {}
+    virtual void EmitULEB128Value(const MCExpr *Value) {}
+    virtual void EmitSLEB128Value(const MCExpr *Value) {}
+    virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+                                      unsigned ValueSize,
+                                      unsigned MaxBytesToEmit) {}
+    virtual void EmitCodeAlignment(unsigned ByteAlignment,
+                                   unsigned MaxBytesToEmit) {}
+    virtual bool EmitValueToOffset(const MCExpr *Offset,
+                                   unsigned char Value ) { return false; }
+    virtual void EmitFileDirective(StringRef Filename) {}
+    virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+                                          const MCSymbol *LastLabel,
+                                          const MCSymbol *Label,
+                                          unsigned PointerSize) {}
+    virtual void FinishImpl() {}
+    virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+      RecordProcEnd(Frame);
+    }
+  };
+} // end anonymous namespace
+
+/// addAsmGlobalSymbols - Add global symbols from module-level ASM to the
+/// defined or undefined lists.
+bool LTOModule::addAsmGlobalSymbols(std::string &errMsg) {
+  const std::string &inlineAsm = _module->getModuleInlineAsm();
+  if (inlineAsm.empty())
+    return false;
+
+  OwningPtr<RecordStreamer> Streamer(new RecordStreamer(_context));
+  MemoryBuffer *Buffer = MemoryBuffer::getMemBuffer(inlineAsm);
+  SourceMgr SrcMgr;
+  SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
+  OwningPtr<MCAsmParser> Parser(createMCAsmParser(SrcMgr,
+                                                  _context, *Streamer,
+                                                  *_target->getMCAsmInfo()));
+  const Target &T = _target->getTarget();
+  OwningPtr<MCInstrInfo> MCII(T.createMCInstrInfo());
+  OwningPtr<MCSubtargetInfo>
+    STI(T.createMCSubtargetInfo(_target->getTargetTriple(),
+                                _target->getTargetCPU(),
+                                _target->getTargetFeatureString()));
+  OwningPtr<MCTargetAsmParser> TAP(T.createMCAsmParser(*STI, *Parser.get(), *MCII));
+  if (!TAP) {
+    errMsg = "target " + std::string(T.getName()) +
+      " does not define AsmParser.";
+    return true;
+  }
+
+  Parser->setTargetParser(*TAP);
+  if (Parser->Run(false))
+    return true;
+
+  for (RecordStreamer::const_iterator i = Streamer->begin(),
+         e = Streamer->end(); i != e; ++i) {
+    StringRef Key = i->first();
+    RecordStreamer::State Value = i->second;
+    if (Value == RecordStreamer::DefinedGlobal)
+      addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_DEFAULT);
+    else if (Value == RecordStreamer::Defined)
+      addAsmGlobalSymbol(Key.data(), LTO_SYMBOL_SCOPE_INTERNAL);
+    else if (Value == RecordStreamer::Global ||
+             Value == RecordStreamer::Used)
+      addAsmGlobalSymbolUndef(Key.data());
+  }
+
+  return false;
+}
+
+/// isDeclaration - Return 'true' if the global value is a declaration.
+static bool isDeclaration(const GlobalValue &V) {
+  if (V.hasAvailableExternallyLinkage())
+    return true;
+
+  if (V.isMaterializable())
+    return false;
+
+  return V.isDeclaration();
+}
+
+/// parseSymbols - Parse the symbols from the module and model-level ASM and add
+/// them to either the defined or undefined lists.
+bool LTOModule::parseSymbols(std::string &errMsg) {
+  // add functions
+  for (Module::iterator f = _module->begin(), e = _module->end(); f != e; ++f) {
+    if (isDeclaration(*f))
+      addPotentialUndefinedSymbol(f, true);
+    else
+      addDefinedFunctionSymbol(f);
+  }
+
+  // add data
+  for (Module::global_iterator v = _module->global_begin(),
+         e = _module->global_end(); v !=  e; ++v) {
+    if (isDeclaration(*v))
+      addPotentialUndefinedSymbol(v, false);
+    else
+      addDefinedDataSymbol(v);
+  }
+
+  // add asm globals
+  if (addAsmGlobalSymbols(errMsg))
+    return true;
+
+  // add aliases
+  for (Module::alias_iterator a = _module->alias_begin(),
+         e = _module->alias_end(); a != e; ++a) {
+    if (isDeclaration(*a->getAliasedGlobal()))
+      // Is an alias to a declaration.
+      addPotentialUndefinedSymbol(a, false);
+    else
+      addDefinedDataSymbol(a);
+  }
+
+  // make symbols for all undefines
+  for (StringMap<NameAndAttributes>::iterator u =_undefines.begin(),
+         e = _undefines.end(); u != e; ++u) {
+    // If this symbol also has a definition, then don't make an undefine because
+    // it is a tentative definition.
+    if (_defines.count(u->getKey())) continue;
+    NameAndAttributes info = u->getValue();
+    _symbols.push_back(info);
+  }
+
+  return false;
+}
diff --git a/lib/Archive/Makefile b/lib/LTO/Makefile
index da97804..55e2a5e 100644
--- a/lib/Archive/Makefile
+++ b/lib/LTO/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Archive/Makefile --------------------------------*- Makefile -*-===##
+##===- lib/LTO/Makefile ------------------------------------*- Makefile -*-===##
 #
 #                     The LLVM Compiler Infrastructure
 #
@@ -8,10 +8,8 @@
 ##===----------------------------------------------------------------------===##
 
 LEVEL = ../..
-LIBRARYNAME = LLVMArchive
-
-# We only want an archive so only those modules actually used by a tool are
-# included.
+LIBRARYNAME = LLVMLTO
 BUILD_ARCHIVE := 1
 
 include $(LEVEL)/Makefile.common
+
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index d2e13c9..8f2200e 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include <cctype>
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -353,12 +354,32 @@ Type *TypeMapTy::getImpl(Type *Ty) {
 //===----------------------------------------------------------------------===//
 
 namespace {
+  class ModuleLinker;
+
+  /// ValueMaterializerTy - Creates prototypes for functions that are lazily
+  /// linked on the fly. This speeds up linking for modules with many
+  /// lazily linked functions of which few get used.
+  class ValueMaterializerTy : public ValueMaterializer {
+    TypeMapTy &TypeMap;
+    Module *DstM;
+    std::vector<Function*> &LazilyLinkFunctions;
+  public:
+    ValueMaterializerTy(TypeMapTy &TypeMap, Module *DstM,
+                        std::vector<Function*> &LazilyLinkFunctions) :
+      ValueMaterializer(), TypeMap(TypeMap), DstM(DstM),
+      LazilyLinkFunctions(LazilyLinkFunctions) {
+    }
+
+    virtual Value *materializeValueFor(Value *V);
+  };
+
   /// ModuleLinker - This is an implementation class for the LinkModules
   /// function, which is the entrypoint for this file.
   class ModuleLinker {
     Module *DstM, *SrcM;
     
     TypeMapTy TypeMap; 
+    ValueMaterializerTy ValMaterializer;
 
     /// ValueMap - Mapping of values from what they used to be in Src, to what
     /// they are now in DstM.  ValueToValueMapTy is a ValueMap, which involves
@@ -386,7 +407,9 @@ namespace {
     std::string ErrorMsg;
     
     ModuleLinker(Module *dstM, TypeSet &Set, Module *srcM, unsigned mode)
-      : DstM(dstM), SrcM(srcM), TypeMap(Set), Mode(mode) { }
+      : DstM(dstM), SrcM(srcM), TypeMap(Set),
+        ValMaterializer(TypeMap, DstM, LazilyLinkFunctions),
+        Mode(mode) { }
     
     bool run();
     
@@ -487,6 +510,20 @@ static bool isLessConstraining(GlobalValue::VisibilityTypes a,
   return false;
 }
 
+Value *ValueMaterializerTy::materializeValueFor(Value *V) {
+  Function *SF = dyn_cast<Function>(V);
+  if (!SF)
+    return NULL;
+
+  Function *DF = Function::Create(TypeMap.get(SF->getFunctionType()),
+                                  SF->getLinkage(), SF->getName(), DstM);
+  copyGVAttributes(DF, SF);
+
+  LazilyLinkFunctions.push_back(SF);
+  return DF;
+}
+
+
 /// getLinkageResult - This analyzes the two global values and determines what
 /// the result will look like in the destination module.  In particular, it
 /// computes the resultant linkage type and visibility, computes whether the
@@ -668,7 +705,11 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
   if (DstGV->getVisibility() != SrcGV->getVisibility())
     return emitError(
             "Appending variables with different visibility need to be linked!");
-  
+
+  if (DstGV->hasUnnamedAddr() != SrcGV->hasUnnamedAddr())
+    return emitError(
+        "Appending variables with different unnamed_addr need to be linked!");
+
   if (DstGV->getSection() != SrcGV->getSection())
     return emitError(
           "Appending variables with different section name need to be linked!");
@@ -710,6 +751,7 @@ bool ModuleLinker::linkAppendingVarProto(GlobalVariable *DstGV,
 bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
   GlobalValue *DGV = getLinkedToGlobal(SGV);
   llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+  bool HasUnnamedAddr = SGV->hasUnnamedAddr();
 
   if (DGV) {
     // Concatenation of appending linkage variables is magic and handled later.
@@ -724,6 +766,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
     if (getLinkageResult(DGV, SGV, NewLinkage, NV, LinkFromSrc))
       return true;
     NewVisibility = NV;
+    HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
 
     // If we're not linking from the source, then keep the definition that we
     // have.
@@ -732,10 +775,11 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
       if (GlobalVariable *DGVar = dyn_cast<GlobalVariable>(DGV))
         if (DGVar->isDeclaration() && SGV->isConstant() && !DGVar->isConstant())
           DGVar->setConstant(true);
-      
-      // Set calculated linkage and visibility.
+
+      // Set calculated linkage, visibility and unnamed_addr.
       DGV->setLinkage(NewLinkage);
       DGV->setVisibility(*NewVisibility);
+      DGV->setUnnamedAddr(HasUnnamedAddr);
 
       // Make sure to remember this mapping.
       ValueMap[SGV] = ConstantExpr::getBitCast(DGV,TypeMap.get(SGV->getType()));
@@ -761,6 +805,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
   copyGVAttributes(NewDGV, SGV);
   if (NewVisibility)
     NewDGV->setVisibility(*NewVisibility);
+  NewDGV->setUnnamedAddr(HasUnnamedAddr);
 
   if (DGV) {
     DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDGV, DGV->getType()));
@@ -777,6 +822,7 @@ bool ModuleLinker::linkGlobalProto(GlobalVariable *SGV) {
 bool ModuleLinker::linkFunctionProto(Function *SF) {
   GlobalValue *DGV = getLinkedToGlobal(SF);
   llvm::Optional<GlobalValue::VisibilityTypes> NewVisibility;
+  bool HasUnnamedAddr = SF->hasUnnamedAddr();
 
   if (DGV) {
     GlobalValue::LinkageTypes NewLinkage = GlobalValue::InternalLinkage;
@@ -785,11 +831,13 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
     if (getLinkageResult(DGV, SF, NewLinkage, NV, LinkFromSrc))
       return true;
     NewVisibility = NV;
+    HasUnnamedAddr = HasUnnamedAddr && DGV->hasUnnamedAddr();
 
     if (!LinkFromSrc) {
       // Set calculated linkage
       DGV->setLinkage(NewLinkage);
       DGV->setVisibility(*NewVisibility);
+      DGV->setUnnamedAddr(HasUnnamedAddr);
 
       // Make sure to remember this mapping.
       ValueMap[SF] = ConstantExpr::getBitCast(DGV, TypeMap.get(SF->getType()));
@@ -802,6 +850,14 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
     }
   }
   
+  // If the function is to be lazily linked, don't create it just yet.
+  // The ValueMaterializerTy will deal with creating it if it's used.
+  if (!DGV && (SF->hasLocalLinkage() || SF->hasLinkOnceLinkage() ||
+               SF->hasAvailableExternallyLinkage())) {
+    DoNotLinkFromSource.insert(SF);
+    return false;
+  }
+
   // If there is no linkage to be performed or we are linking from the source,
   // bring SF over.
   Function *NewDF = Function::Create(TypeMap.get(SF->getFunctionType()),
@@ -809,18 +865,12 @@ bool ModuleLinker::linkFunctionProto(Function *SF) {
   copyGVAttributes(NewDF, SF);
   if (NewVisibility)
     NewDF->setVisibility(*NewVisibility);
+  NewDF->setUnnamedAddr(HasUnnamedAddr);
 
   if (DGV) {
     // Any uses of DF need to change to NewDF, with cast.
     DGV->replaceAllUsesWith(ConstantExpr::getBitCast(NewDF, DGV->getType()));
     DGV->eraseFromParent();
-  } else {
-    // Internal, LO_ODR, or LO linkage - stick in set to ignore and lazily link.
-    if (SF->hasLocalLinkage() || SF->hasLinkOnceLinkage() ||
-        SF->hasAvailableExternallyLinkage()) {
-      DoNotLinkFromSource.insert(SF);
-      LazilyLinkFunctions.push_back(SF);
-    }
   }
   
   ValueMap[SF] = NewDF;
@@ -887,7 +937,7 @@ void ModuleLinker::linkAppendingVarInit(const AppendingVarInfo &AVI) {
   SmallVector<Constant*, 16> Elements;
   getArrayElements(AVI.DstInit, Elements);
   
-  Constant *SrcInit = MapValue(AVI.SrcInit, ValueMap, RF_None, &TypeMap);
+  Constant *SrcInit = MapValue(AVI.SrcInit, ValueMap, RF_None, &TypeMap, &ValMaterializer);
   getArrayElements(SrcInit, Elements);
   
   ArrayType *NewType = cast<ArrayType>(AVI.NewGV->getType()->getElementType());
@@ -908,7 +958,7 @@ void ModuleLinker::linkGlobalInits() {
     GlobalVariable *DGV = cast<GlobalVariable>(ValueMap[I]);
     // Figure out what the initializer looks like in the dest module.
     DGV->setInitializer(MapValue(I->getInitializer(), ValueMap,
-                                 RF_None, &TypeMap));
+                                 RF_None, &TypeMap, &ValMaterializer));
   }
 }
 
@@ -938,12 +988,14 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) {
     // functions and patch them up to point to the local versions.
     for (Function::iterator BB = Dst->begin(), BE = Dst->end(); BB != BE; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
-        RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries, &TypeMap);
+        RemapInstruction(I, ValueMap, RF_IgnoreMissingEntries,
+                         &TypeMap, &ValMaterializer);
     
   } else {
     // Clone the body of the function into the dest function.
     SmallVector<ReturnInst*, 8> Returns; // Ignore returns.
-    CloneFunctionInto(Dst, Src, ValueMap, false, Returns, "", NULL, &TypeMap);
+    CloneFunctionInto(Dst, Src, ValueMap, false, Returns, "", NULL,
+                      &TypeMap, &ValMaterializer);
   }
   
   // There is no need to map the arguments anymore.
@@ -961,7 +1013,8 @@ void ModuleLinker::linkAliasBodies() {
       continue;
     if (Constant *Aliasee = I->getAliasee()) {
       GlobalAlias *DA = cast<GlobalAlias>(ValueMap[I]);
-      DA->setAliasee(MapValue(Aliasee, ValueMap, RF_None, &TypeMap));
+      DA->setAliasee(MapValue(Aliasee, ValueMap, RF_None,
+                              &TypeMap, &ValMaterializer));
     }
   }
 }
@@ -978,7 +1031,7 @@ void ModuleLinker::linkNamedMDNodes() {
     // Add Src elements into Dest node.
     for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
       DestNMD->addOperand(MapValue(I->getOperand(i), ValueMap,
-                                   RF_None, &TypeMap));
+                                   RF_None, &TypeMap, &ValMaterializer));
   }
 }
 
@@ -1208,6 +1261,13 @@ bool ModuleLinker::run() {
     // Skip if not linking from source.
     if (DoNotLinkFromSource.count(SF)) continue;
     
+    Function *DF = cast<Function>(ValueMap[SF]);
+    if (SF->hasPrefixData()) {
+      // Link in the prefix data.
+      DF->setPrefixData(MapValue(
+          SF->getPrefixData(), ValueMap, RF_None, &TypeMap, &ValMaterializer));
+    }
+
     // Skip if no body (function is external) or materialize.
     if (SF->isDeclaration()) {
       if (!SF->isMaterializable())
@@ -1216,7 +1276,7 @@ bool ModuleLinker::run() {
         return true;
     }
     
-    linkFunctionBody(cast<Function>(ValueMap[SF]), SF);
+    linkFunctionBody(DF, SF);
     SF->Dematerialize();
   }
 
@@ -1238,49 +1298,44 @@ bool ModuleLinker::run() {
     LinkedInAnyFunctions = false;
     
     for(std::vector<Function*>::iterator I = LazilyLinkFunctions.begin(),
-        E = LazilyLinkFunctions.end(); I != E; ++I) {
-      if (!*I)
-        continue;
-      
+        E = LazilyLinkFunctions.end(); I != E; ++I) {      
       Function *SF = *I;
+      if (!SF)
+        continue;
+
       Function *DF = cast<Function>(ValueMap[SF]);
-      
-      if (!DF->use_empty()) {
-        
-        // Materialize if necessary.
-        if (SF->isDeclaration()) {
-          if (!SF->isMaterializable())
-            continue;
-          if (SF->Materialize(&ErrorMsg))
-            return true;
-        }
-        
-        // Link in function body.
-        linkFunctionBody(DF, SF);
-        SF->Dematerialize();
+      if (SF->hasPrefixData()) {
+        // Link in the prefix data.
+        DF->setPrefixData(MapValue(SF->getPrefixData(),
+                                   ValueMap,
+                                   RF_None,
+                                   &TypeMap,
+                                   &ValMaterializer));
+      }
 
-        // "Remove" from vector by setting the element to 0.
-        *I = 0;
-        
-        // Set flag to indicate we may have more functions to lazily link in
-        // since we linked in a function.
-        LinkedInAnyFunctions = true;
+      // Materialize if necessary.
+      if (SF->isDeclaration()) {
+        if (!SF->isMaterializable())
+          continue;
+        if (SF->Materialize(&ErrorMsg))
+          return true;
       }
+      
+      // Erase from vector *before* the function body is linked - linkFunctionBody could
+      // invalidate I.
+      LazilyLinkFunctions.erase(I);
+
+      // Link in function body.
+      linkFunctionBody(DF, SF);
+      SF->Dematerialize();
+
+      // Set flag to indicate we may have more functions to lazily link in
+      // since we linked in a function.
+      LinkedInAnyFunctions = true;
+      break;
     }
   } while (LinkedInAnyFunctions);
   
-  // Remove any prototypes of functions that were not actually linked in.
-  for(std::vector<Function*>::iterator I = LazilyLinkFunctions.begin(),
-      E = LazilyLinkFunctions.end(); I != E; ++I) {
-    if (!*I)
-      continue;
-    
-    Function *SF = *I;
-    Function *DF = cast<Function>(ValueMap[SF]);
-    if (DF->use_empty())
-      DF->eraseFromParent();
-  }
-  
   // Now that all of the types from the source are used, resolve any structs
   // copied over to the dest that didn't exist there.
   TypeMap.linkDefinedTypeBodies();
@@ -1297,6 +1352,11 @@ Linker::Linker(Module *M) : Composite(M) {
 Linker::~Linker() {
 }
 
+void Linker::deleteModule() {
+  delete Composite;
+  Composite = NULL;
+}
+
 bool Linker::linkInModule(Module *Src, unsigned Mode, std::string *ErrorMsg) {
   ModuleLinker TheLinker(Composite, IdentifiedStructTypes, Src, Mode);
   if (TheLinker.run()) {
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index db882c0..fa844ef 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMMC
   MCAsmInfo.cpp
   MCAsmInfoCOFF.cpp
   MCAsmInfoDarwin.cpp
+  MCAsmInfoELF.cpp
   MCAsmStreamer.cpp
   MCAssembler.cpp
   MCAtom.cpp
@@ -15,7 +16,9 @@ add_llvm_library(LLVMMC
   MCELF.cpp
   MCELFObjectTargetWriter.cpp
   MCELFStreamer.cpp
+  MCFunction.cpp
   MCExpr.cpp
+  MCExternalSymbolizer.cpp
   MCInst.cpp
   MCInstPrinter.cpp
   MCInstrAnalysis.cpp
@@ -23,12 +26,16 @@ add_llvm_library(LLVMMC
   MCMachOStreamer.cpp
   MCMachObjectTargetWriter.cpp
   MCModule.cpp
+  MCModuleYAML.cpp
   MCNullStreamer.cpp
   MCObjectFileInfo.cpp
+  MCObjectDisassembler.cpp
   MCObjectStreamer.cpp
+  MCObjectSymbolizer.cpp
   MCObjectWriter.cpp
   MCPureStreamer.cpp
   MCRegisterInfo.cpp
+  MCRelocationInfo.cpp
   MCSection.cpp
   MCSectionCOFF.cpp
   MCSectionELF.cpp
@@ -36,6 +43,7 @@ add_llvm_library(LLVMMC
   MCStreamer.cpp
   MCSubtargetInfo.cpp
   MCSymbol.cpp
+  MCSymbolizer.cpp
   MCValue.cpp
   MCWin64EH.cpp
   MachObjectWriter.cpp
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 3d99548..9899bb2 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -73,10 +73,6 @@ class ELFObjectWriter : public MCObjectWriter {
 
       // Support lexicographic sorting.
       bool operator<(const ELFSymbolData &RHS) const {
-        if (MCELF::GetType(*SymbolData) == ELF::STT_FILE)
-          return true;
-        if (MCELF::GetType(*RHS.SymbolData) == ELF::STT_FILE)
-          return false;
         return SymbolData->getSymbol().getName() <
                RHS.SymbolData->getSymbol().getName();
       }
@@ -98,6 +94,7 @@ class ELFObjectWriter : public MCObjectWriter {
     /// @{
 
     SmallString<256> StringTable;
+    std::vector<uint64_t> FileSymbolData;
     std::vector<ELFSymbolData> LocalSymbolData;
     std::vector<ELFSymbolData> ExternalSymbolData;
     std::vector<ELFSymbolData> UndefinedSymbolData;
@@ -551,7 +548,7 @@ void ELFObjectWriter::WriteSymbol(MCDataFragment *SymtabF,
   uint8_t Type = MCELF::GetType(Data);
   uint8_t Info = (Binding << ELF_STB_Shift) | (Type << ELF_STT_Shift);
 
-  // Other and Visibility share the same byte with Visability using the lower
+  // Other and Visibility share the same byte with Visibility using the lower
   // 2 bits
   uint8_t Visibility = MCELF::GetVisibility(OrigData);
   uint8_t Other = MCELF::getOther(OrigData) <<
@@ -590,8 +587,15 @@ void ELFObjectWriter::WriteSymbolTable(MCDataFragment *SymtabF,
   // The first entry is the undefined symbol entry.
   WriteSymbolEntry(SymtabF, ShndxF, 0, 0, 0, 0, 0, 0, false);
 
+  for (unsigned i = 0, e = FileSymbolData.size(); i != e; ++i) {
+    WriteSymbolEntry(SymtabF, ShndxF, FileSymbolData[i],
+                     ELF::STT_FILE | ELF::STB_LOCAL, 0, 0,
+                     ELF::STV_DEFAULT, ELF::SHN_ABS, true);
+  }
+
   // Write the symbol table entries.
-  LastLocalSymbolIndex = LocalSymbolData.size() + 1;
+  LastLocalSymbolIndex = FileSymbolData.size() + LocalSymbolData.size() + 1;
+
   for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i) {
     ELFSymbolData &MSD = LocalSymbolData[i];
     WriteSymbol(SymtabF, ShndxF, MSD, Layout);
@@ -759,9 +763,6 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
   uint64_t RelocOffset = Layout.getFragmentOffset(Fragment) +
     Fixup.getOffset();
 
-  // FIXME: no tests cover this. Is adjustFixupOffset dead code?
-  TargetObjectWriter->adjustFixupOffset(Fixup, RelocOffset);
-
   if (!hasRelocationAddend())
     Addend = 0;
 
@@ -883,6 +884,20 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
   // FIXME: We could optimize suffixes in strtab in the same way we
   // optimize them in shstrtab.
 
+  for (MCAssembler::const_file_name_iterator it = Asm.file_names_begin(),
+                                            ie = Asm.file_names_end();
+                                            it != ie;
+                                            ++it) {
+    StringRef Name = *it;
+    uint64_t &Entry = StringIndexMap[Name];
+    if (!Entry) {
+      Entry = StringTable.size();
+      StringTable += Name;
+      StringTable += '\x00';
+    }
+    FileSymbolData.push_back(Entry);
+  }
+
   // Add the data for the symbols.
   for (MCAssembler::symbol_iterator it = Asm.symbol_begin(),
          ie = Asm.symbol_end(); it != ie; ++it) {
@@ -967,7 +982,7 @@ void ELFObjectWriter::ComputeSymbolTable(MCAssembler &Asm,
 
   // Set the symbol indices. Local symbols must come before all other
   // symbols with non-local bindings.
-  unsigned Index = 1;
+  unsigned Index = FileSymbolData.size() + 1;
   for (unsigned i = 0, e = LocalSymbolData.size(); i != e; ++i)
     LocalSymbolData[i].SymbolData->setIndex(Index++);
 
@@ -1005,11 +1020,18 @@ void ELFObjectWriter::CreateRelocationSections(MCAssembler &Asm,
     else
       EntrySize = is64Bit() ? sizeof(ELF::Elf64_Rel) : sizeof(ELF::Elf32_Rel);
 
+    unsigned Flags = 0;
+    StringRef Group = "";
+    if (Section.getFlags() & ELF::SHF_GROUP) {
+      Flags = ELF::SHF_GROUP;
+      Group = Section.getGroup()->getName();
+    }
+
     const MCSectionELF *RelaSection =
       Ctx.getELFSection(RelaSectionName, hasRelocationAddend() ?
-                        ELF::SHT_RELA : ELF::SHT_REL, 0,
+                        ELF::SHT_RELA : ELF::SHT_REL, Flags,
                         SectionKind::getReadOnly(),
-                        EntrySize, "");
+                        EntrySize, Group);
     RelMap[&Section] = RelaSection;
     Asm.getOrCreateSectionData(*RelaSection);
   }
@@ -1069,7 +1091,7 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
     else if (entry.Index < 0)
       entry.Index = getSymbolIndexInSymbolTable(Asm, entry.Symbol);
     else
-      entry.Index += LocalSymbolData.size();
+      entry.Index += FileSymbolData.size() + LocalSymbolData.size();
     if (is64Bit()) {
       String64(*F, entry.r_offset);
       if (TargetObjectWriter->isN64()) {
@@ -1100,11 +1122,10 @@ void ELFObjectWriter::WriteRelocationsFragment(const MCAssembler &Asm,
   }
 }
 
-static int compareBySuffix(const void *a, const void *b) {
-  const MCSectionELF *secA = *static_cast<const MCSectionELF* const *>(a);
-  const MCSectionELF *secB = *static_cast<const MCSectionELF* const *>(b);
-  const StringRef &NameA = secA->getSectionName();
-  const StringRef &NameB = secB->getSectionName();
+static int compareBySuffix(const MCSectionELF *const *a,
+                           const MCSectionELF *const *b) {
+  const StringRef &NameA = (*a)->getSectionName();
+  const StringRef &NameB = (*b)->getSectionName();
   const unsigned sizeA = NameA.size();
   const unsigned sizeB = NameB.size();
   const unsigned len = std::min(sizeA, sizeB);
@@ -1295,10 +1316,12 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
     // Remove ".rel" and ".rela" prefixes.
     unsigned SecNameLen = (Section.getType() == ELF::SHT_REL) ? 4 : 5;
     StringRef SectionName = Section.getSectionName().substr(SecNameLen);
+    StringRef GroupName =
+        Section.getGroup() ? Section.getGroup()->getName() : "";
 
-    InfoSection = Asm.getContext().getELFSection(SectionName,
-                                                 ELF::SHT_PROGBITS, 0,
-                                                 SectionKind::getReadOnly());
+    InfoSection = Asm.getContext().getELFSection(SectionName, ELF::SHT_PROGBITS,
+                                                 0, SectionKind::getReadOnly(),
+                                                 0, GroupName);
     sh_info = SectionIndexMap.lookup(InfoSection);
     break;
   }
@@ -1348,11 +1371,12 @@ void ELFObjectWriter::WriteSection(MCAssembler &Asm,
                                        ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
                                        SectionKind::getText()));
     } else if (SecName.startswith(".ARM.exidx")) {
-      sh_link = SectionIndexMap.lookup(
-        Asm.getContext().getELFSection(SecName.substr(sizeof(".ARM.exidx") - 1),
-                                       ELF::SHT_PROGBITS,
-                                       ELF::SHF_EXECINSTR | ELF::SHF_ALLOC,
-                                       SectionKind::getText()));
+      StringRef GroupName =
+          Section.getGroup() ? Section.getGroup()->getName() : "";
+      sh_link = SectionIndexMap.lookup(Asm.getContext().getELFSection(
+          SecName.substr(sizeof(".ARM.exidx") - 1), ELF::SHT_PROGBITS,
+          ELF::SHF_EXECINSTR | ELF::SHF_ALLOC, SectionKind::getText(), 0,
+          GroupName));
     }
   }
 
diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp
index 53960e7..c4c98cc 100644
--- a/lib/MC/MCAsmBackend.cpp
+++ b/lib/MC/MCAsmBackend.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCFixupKindInfo.h"
 using namespace llvm;
 
@@ -37,7 +38,6 @@ MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
     { "FK_SecRel_8", 0, 64, 0 }
   };
 
-  assert((size_t)Kind <= sizeof(Builtins) / sizeof(Builtins[0]) &&
-         "Unknown fixup kind");
+  assert((size_t)Kind <= array_lengthof(Builtins) && "Unknown fixup kind");
   return Builtins[Kind];
 }
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 9e60884..28f1c95 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -34,7 +34,8 @@ MCAsmInfo::MCAsmInfo() {
   HasStaticCtorDtorReferenceInStaticMode = false;
   LinkerRequiresNonEmptyDwarfLines = false;
   MaxInstLength = 4;
-  PCSymbol = "$";
+  MinInstAlignment = 1;
+  DollarIsPC = false;
   SeparatorString = ";";
   CommentColumn = 40;
   CommentString = "#";
@@ -49,10 +50,7 @@ MCAsmInfo::MCAsmInfo() {
   Code32Directive = ".code32";
   Code64Directive = ".code64";
   AssemblerDialect = 0;
-  AllowQuotesInName = false;
-  AllowNameToStartWithDigit = false;
-  AllowPeriodsInName = true;
-  AllowUTF8 = true;
+  AllowAtInName = false;
   UseDataRegionDirectives = false;
   ZeroDirective = "\t.zero\t";
   AsciiDirective = "\t.ascii\t";
@@ -75,8 +73,8 @@ MCAsmInfo::MCAsmInfo() {
   LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
   HasDotTypeDotSizeDirective = true;
   HasSingleParameterDotFile = true;
+  HasIdentDirective = false;
   HasNoDeadStrip = false;
-  HasSymbolResolver = false;
   WeakRefDirective = 0;
   WeakDefDirective = 0;
   LinkOnceDirective = 0;
@@ -86,7 +84,6 @@ MCAsmInfo::MCAsmInfo() {
   HasLEB128 = false;
   SupportsDebugInformation = false;
   ExceptionsType = ExceptionHandling::None;
-  DwarfUsesInlineInfoSection = false;
   DwarfUsesRelocationsAcrossSections = true;
   DwarfRegNumForCFI = false;
   HasMicrosoftFastStdCallMangling = false;
@@ -97,7 +94,7 @@ MCAsmInfo::~MCAsmInfo() {
 }
 
 
-unsigned MCAsmInfo::getULEB128Size(unsigned Value) {
+unsigned MCAsmInfo::getULEB128Size(uint64_t Value) {
   unsigned Size = 0;
   do {
     Value >>= 7;
@@ -106,7 +103,7 @@ unsigned MCAsmInfo::getULEB128Size(unsigned Value) {
   return Size;
 }
 
-unsigned MCAsmInfo::getSLEB128Size(int Value) {
+unsigned MCAsmInfo::getSLEB128Size(int64_t Value) {
   unsigned Size = 0;
   int Sign = Value >> (8 * sizeof(Value) - 1);
   bool IsMore;
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 33350d9..9d9f98e 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -43,7 +43,6 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
 void MCAsmInfoMicrosoft::anchor() { }
 
 MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() {
-  AllowQuotesInName = true;
 }
 
 void MCAsmInfoGNUCOFF::anchor() { }
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index a0e3eba..704c816 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -26,7 +26,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
   GlobalPrefix = "_";
   PrivateGlobalPrefix = "L";
   LinkerPrivateGlobalPrefix = "l";
-  AllowQuotesInName = true;
   HasSingleParameterDotFile = false;
   HasSubsectionsViaSymbols = true;
 
@@ -58,7 +57,6 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
 
   HasDotTypeDotSizeDirective = false;
   HasNoDeadStrip = true;
-  HasSymbolResolver = true;
 
   DwarfUsesRelocationsAcrossSections = false;
 }
diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
new file mode 100644
index 0000000..8cf4e4f
--- /dev/null
+++ b/lib/MC/MCAsmInfoELF.cpp
@@ -0,0 +1,23 @@
+//===-- MCAsmInfoELF.cpp - ELF asm properties -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related what form asm statements
+// should take in general on ELF-based targets
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoELF.h"
+using namespace llvm;
+
+void MCAsmInfoELF::anchor() { }
+
+MCAsmInfoELF::MCAsmInfoELF() {
+  HasIdentDirective = true;
+  WeakRefDirective = "\t.weak\t";
+}
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 9e86785..ca49f8f 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -25,11 +25,12 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/PathV2.h"
+#include "llvm/Support/Path.h"
 #include <cctype>
 using namespace llvm;
 
@@ -38,7 +39,7 @@ namespace {
 class MCAsmStreamer : public MCStreamer {
 protected:
   formatted_raw_ostream &OS;
-  const MCAsmInfo &MAI;
+  const MCAsmInfo *MAI;
 private:
   OwningPtr<MCInstPrinter> InstPrinter;
   OwningPtr<MCCodeEmitter> Emitter;
@@ -65,17 +66,15 @@ private:
   virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame);
 
 public:
-  MCAsmStreamer(MCContext &Context, formatted_raw_ostream &os,
-                bool isVerboseAsm, bool useLoc, bool useCFI,
-                bool useDwarfDirectory,
-                MCInstPrinter *printer, MCCodeEmitter *emitter,
-                MCAsmBackend *asmbackend,
-                bool showInst)
-    : MCStreamer(SK_AsmStreamer, Context), OS(os), MAI(Context.getAsmInfo()),
-      InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
-      CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
-      ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI),
-      UseDwarfDirectory(useDwarfDirectory) {
+  MCAsmStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                formatted_raw_ostream &os, bool isVerboseAsm, bool useLoc,
+                bool useCFI, bool useDwarfDirectory, MCInstPrinter *printer,
+                MCCodeEmitter *emitter, MCAsmBackend *asmbackend, bool showInst)
+      : MCStreamer(Context, TargetStreamer), OS(os), MAI(Context.getAsmInfo()),
+        InstPrinter(printer), Emitter(emitter), AsmBackend(asmbackend),
+        CommentStream(CommentToEmit), IsVerboseAsm(isVerboseAsm),
+        ShowInst(showInst), UseLoc(useLoc), UseCFI(useCFI),
+        UseDwarfDirectory(useDwarfDirectory) {
     if (InstPrinter && IsVerboseAsm)
       InstPrinter->setCommentStream(CommentStream);
   }
@@ -154,7 +153,7 @@ public:
   virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
                                          const MCSymbol *Label);
 
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
 
   virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
   virtual void BeginCOFFSymbolDef(const MCSymbol *Symbol);
@@ -180,12 +179,10 @@ public:
   virtual void EmitTBSSSymbol (const MCSection *Section, MCSymbol *Symbol,
                                uint64_t Size, unsigned ByteAlignment = 0);
 
-  virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
+  virtual void EmitBytes(StringRef Data);
 
-  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             unsigned AddrSpace);
-  virtual void EmitIntValue(uint64_t Value, unsigned Size,
-                            unsigned AddrSpace = 0);
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size);
+  virtual void EmitIntValue(uint64_t Value, unsigned Size);
 
   virtual void EmitULEB128Value(const MCExpr *Value);
 
@@ -196,8 +193,7 @@ public:
   virtual void EmitGPRel32Value(const MCExpr *Value);
 
 
-  virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue,
-                        unsigned AddrSpace);
+  virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue);
 
   virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
                                     unsigned ValueSize = 1,
@@ -217,6 +213,7 @@ public:
                                      unsigned Isa, unsigned Discriminator,
                                      StringRef FileName);
 
+  virtual void EmitIdent(StringRef IdentString);
   virtual void EmitCFISections(bool EH, bool Debug);
   virtual void EmitCFIDefCfa(int64_t Register, int64_t Offset);
   virtual void EmitCFIDefCfaOffset(int64_t Offset);
@@ -232,6 +229,7 @@ public:
   virtual void EmitCFISignalFrame();
   virtual void EmitCFIUndefined(int64_t Register);
   virtual void EmitCFIRegister(int64_t Register1, int64_t Register2);
+  virtual void EmitCFIWindowSave();
 
   virtual void EmitWin64EHStartProc(const MCSymbol *Symbol);
   virtual void EmitWin64EHEndProc();
@@ -248,17 +246,6 @@ public:
   virtual void EmitWin64EHPushFrame(bool Code);
   virtual void EmitWin64EHEndProlog();
 
-  virtual void EmitFnStart();
-  virtual void EmitFnEnd();
-  virtual void EmitCantUnwind();
-  virtual void EmitPersonality(const MCSymbol *Personality);
-  virtual void EmitHandlerData();
-  virtual void EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
-  virtual void EmitPad(int64_t Offset);
-  virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool);
-
-  virtual void EmitTCEntry(const MCSymbol &S);
-
   virtual void EmitInstruction(const MCInst &Inst);
 
   virtual void EmitBundleAlignMode(unsigned AlignPow2);
@@ -268,15 +255,9 @@ public:
   /// EmitRawText - If this file is backed by an assembly streamer, this dumps
   /// the specified string in the output .s file.  This capability is
   /// indicated by the hasRawTextSupport() predicate.
-  virtual void EmitRawText(StringRef String);
+  virtual void EmitRawTextImpl(StringRef String);
 
   virtual void FinishImpl();
-
-  /// @}
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_AsmStreamer;
-  }
 };
 
 } // end anonymous namespace.
@@ -312,9 +293,9 @@ void MCAsmStreamer::EmitCommentsAndEOL() {
          "Comment array not newline terminated");
   do {
     // Emit a line of comments.
-    OS.PadToColumn(MAI.getCommentColumn());
+    OS.PadToColumn(MAI->getCommentColumn());
     size_t Position = Comments.find('\n');
-    OS << MAI.getCommentString() << ' ' << Comments.substr(0, Position) << '\n';
+    OS << MAI->getCommentString() << ' ' << Comments.substr(0, Position) <<'\n';
 
     Comments = Comments.substr(Position+1);
   } while (!Comments.empty());
@@ -332,7 +313,7 @@ static inline int64_t truncateToSize(int64_t Value, unsigned Bytes) {
 void MCAsmStreamer::ChangeSection(const MCSection *Section,
                                   const MCExpr *Subsection) {
   assert(Section && "Cannot switch to a null section!");
-  Section->PrintSwitchToSection(MAI, OS, Subsection);
+  Section->PrintSwitchToSection(*MAI, OS, Subsection);
 }
 
 void MCAsmStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
@@ -354,7 +335,7 @@ void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
   MCStreamer::EmitLabel(Symbol);
 
-  OS << *Symbol << MAI.getLabelSuffix();
+  OS << *Symbol << MAI->getLabelSuffix();
   EmitEOL();
 }
 
@@ -362,7 +343,7 @@ void MCAsmStreamer::EmitDebugLabel(MCSymbol *Symbol) {
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
   MCStreamer::EmitDebugLabel(Symbol);
 
-  OS << *Symbol << MAI.getDebugLabelSuffix();
+  OS << *Symbol << MAI->getDebugLabelSuffix();
   EmitEOL();
 }
 
@@ -370,9 +351,9 @@ void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
   switch (Flag) {
   case MCAF_SyntaxUnified:         OS << "\t.syntax unified"; break;
   case MCAF_SubsectionsViaSymbols: OS << ".subsections_via_symbols"; break;
-  case MCAF_Code16:                OS << '\t'<< MAI.getCode16Directive(); break;
-  case MCAF_Code32:                OS << '\t'<< MAI.getCode32Directive(); break;
-  case MCAF_Code64:                OS << '\t'<< MAI.getCode64Directive(); break;
+  case MCAF_Code16:                OS << '\t'<< MAI->getCode16Directive();break;
+  case MCAF_Code32:                OS << '\t'<< MAI->getCode32Directive();break;
+  case MCAF_Code64:                OS << '\t'<< MAI->getCode64Directive();break;
   }
   EmitEOL();
 }
@@ -388,9 +369,7 @@ void MCAsmStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) {
 }
 
 void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) {
-  MCContext &Ctx = getContext();
-  const MCAsmInfo &MAI = Ctx.getAsmInfo();
-  if (!MAI.doesSupportDataRegionDirectives())
+  if (!MAI->doesSupportDataRegionDirectives())
     return;
   switch (Kind) {
   case MCDR_DataRegion:            OS << "\t.data_region"; break;
@@ -407,7 +386,7 @@ void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
   // MCSymbols when they have spaces in them.
   OS << "\t.thumb_func";
   // Only Mach-O hasSubsectionsViaSymbols()
-  if (MAI.hasSubsectionsViaSymbols())
+  if (MAI->hasSubsectionsViaSymbols())
     OS << '\t' << *Func;
   EmitEOL();
 }
@@ -441,7 +420,7 @@ void MCAsmStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
 }
 
 
-void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                         MCSymbolAttr Attribute) {
   switch (Attribute) {
   case MCSA_Invalid: llvm_unreachable("Invalid symbol attribute");
@@ -452,11 +431,12 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_ELF_TypeCommon:      /// .type _foo, STT_COMMON  # aka @common
   case MCSA_ELF_TypeNoType:      /// .type _foo, STT_NOTYPE  # aka @notype
   case MCSA_ELF_TypeGnuUniqueObject:  /// .type _foo, @gnu_unique_object
-    assert(MAI.hasDotTypeDotSizeDirective() && "Symbol Attr not supported");
+    if (!MAI->hasDotTypeDotSizeDirective())
+      return false; // Symbol attribute not supported
     OS << "\t.type\t" << *Symbol << ','
-       << ((MAI.getCommentString()[0] != '@') ? '@' : '%');
+       << ((MAI->getCommentString()[0] != '@') ? '@' : '%');
     switch (Attribute) {
-    default: llvm_unreachable("Unknown ELF .type");
+    default: return false;
     case MCSA_ELF_TypeFunction:    OS << "function"; break;
     case MCSA_ELF_TypeIndFunction: OS << "gnu_indirect_function"; break;
     case MCSA_ELF_TypeObject:      OS << "object"; break;
@@ -466,9 +446,9 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     case MCSA_ELF_TypeGnuUniqueObject: OS << "gnu_unique_object"; break;
     }
     EmitEOL();
-    return;
+    return true;
   case MCSA_Global: // .globl/.global
-    OS << MAI.getGlobalDirective();
+    OS << MAI->getGlobalDirective();
     FlagMap[Symbol] |= EHGlobal;
     break;
   case MCSA_Hidden:         OS << "\t.hidden\t";          break;
@@ -490,12 +470,14 @@ void MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     FlagMap[Symbol] |= EHWeakDefinition;
     break;
       // .weak_reference
-  case MCSA_WeakReference:  OS << MAI.getWeakRefDirective(); break;
+  case MCSA_WeakReference:  OS << MAI->getWeakRefDirective(); break;
   case MCSA_WeakDefAutoPrivate: OS << "\t.weak_def_can_be_hidden\t"; break;
   }
 
   OS << *Symbol;
   EmitEOL();
+
+  return true;
 }
 
 void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -529,15 +511,18 @@ void MCAsmStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
 }
 
 void MCAsmStreamer::EmitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
-  assert(MAI.hasDotTypeDotSizeDirective());
+  assert(MAI->hasDotTypeDotSizeDirective());
   OS << "\t.size\t" << *Symbol << ", " << *Value << '\n';
 }
 
 void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                      unsigned ByteAlignment) {
+  // Common symbols do not belong to any actual section.
+  AssignSection(Symbol, NULL);
+
   OS << "\t.comm\t" << *Symbol << ',' << Size;
   if (ByteAlignment != 0) {
-    if (MAI.getCOMMDirectiveAlignmentIsInBytes())
+    if (MAI->getCOMMDirectiveAlignmentIsInBytes())
       OS << ',' << ByteAlignment;
     else
       OS << ',' << Log2_32(ByteAlignment);
@@ -551,9 +536,12 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
 /// @param Size - The size of the common symbol.
 void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                           unsigned ByteAlign) {
+  // Common symbols do not belong to any actual section.
+  AssignSection(Symbol, NULL);
+
   OS << "\t.lcomm\t" << *Symbol << ',' << Size;
   if (ByteAlign > 1) {
-    switch (MAI.getLCOMMDirectiveAlignmentType()) {
+    switch (MAI->getLCOMMDirectiveAlignmentType()) {
     case LCOMM::NoAlignment:
       llvm_unreachable("alignment not supported on .lcomm!");
     case LCOMM::ByteAlignment:
@@ -570,6 +558,9 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
 
 void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
                                  uint64_t Size, unsigned ByteAlignment) {
+  if (Symbol)
+    AssignSection(Symbol, Section);
+
   // Note: a .zerofill directive does not switch sections.
   OS << ".zerofill ";
 
@@ -590,6 +581,8 @@ void MCAsmStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
 // e.g. _a.
 void MCAsmStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
                                    uint64_t Size, unsigned ByteAlignment) {
+  AssignSection(Symbol, Section);
+
   assert(Symbol != NULL && "Symbol shouldn't be NULL!");
   // Instead of using the Section we'll just use the shortcut.
   // This is a mach-o specific directive and section.
@@ -638,13 +631,13 @@ static void PrintQuotedString(StringRef Data, raw_ostream &OS) {
 }
 
 
-void MCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+void MCAsmStreamer::EmitBytes(StringRef Data) {
   assert(getCurrentSection().first &&
          "Cannot emit contents before setting section!");
   if (Data.empty()) return;
 
   if (Data.size() == 1) {
-    OS << MAI.getData8bitsDirective(AddrSpace);
+    OS << MAI->getData8bitsDirective();
     OS << (unsigned)(unsigned char)Data[0];
     EmitEOL();
     return;
@@ -652,46 +645,43 @@ void MCAsmStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
 
   // If the data ends with 0 and the target supports .asciz, use it, otherwise
   // use .ascii
-  if (MAI.getAscizDirective() && Data.back() == 0) {
-    OS << MAI.getAscizDirective();
+  if (MAI->getAscizDirective() && Data.back() == 0) {
+    OS << MAI->getAscizDirective();
     Data = Data.substr(0, Data.size()-1);
   } else {
-    OS << MAI.getAsciiDirective();
+    OS << MAI->getAsciiDirective();
   }
 
-  OS << ' ';
   PrintQuotedString(Data, OS);
   EmitEOL();
 }
 
-void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size,
-                                 unsigned AddrSpace) {
-  EmitValue(MCConstantExpr::Create(Value, getContext()), Size, AddrSpace);
+void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
+  EmitValue(MCConstantExpr::Create(Value, getContext()), Size);
 }
 
-void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
-                                  unsigned AddrSpace) {
+void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size) {
   assert(getCurrentSection().first &&
          "Cannot emit contents before setting section!");
   const char *Directive = 0;
   switch (Size) {
   default: break;
-  case 1: Directive = MAI.getData8bitsDirective(AddrSpace); break;
-  case 2: Directive = MAI.getData16bitsDirective(AddrSpace); break;
-  case 4: Directive = MAI.getData32bitsDirective(AddrSpace); break;
+  case 1: Directive = MAI->getData8bitsDirective();  break;
+  case 2: Directive = MAI->getData16bitsDirective(); break;
+  case 4: Directive = MAI->getData32bitsDirective(); break;
   case 8:
-    Directive = MAI.getData64bitsDirective(AddrSpace);
+    Directive = MAI->getData64bitsDirective();
     // If the target doesn't support 64-bit data, emit as two 32-bit halves.
     if (Directive) break;
     int64_t IntValue;
     if (!Value->EvaluateAsAbsolute(IntValue))
       report_fatal_error("Don't know how to emit this value.");
-    if (getContext().getAsmInfo().isLittleEndian()) {
-      EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace);
-      EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace);
+    if (MAI->isLittleEndian()) {
+      EmitIntValue((uint32_t)(IntValue >> 0 ), 4);
+      EmitIntValue((uint32_t)(IntValue >> 32), 4);
     } else {
-      EmitIntValue((uint32_t)(IntValue >> 32), 4, AddrSpace);
-      EmitIntValue((uint32_t)(IntValue >> 0 ), 4, AddrSpace);
+      EmitIntValue((uint32_t)(IntValue >> 32), 4);
+      EmitIntValue((uint32_t)(IntValue >> 0 ), 4);
     }
     return;
   }
@@ -707,7 +697,7 @@ void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
     EmitULEB128IntValue(IntValue);
     return;
   }
-  assert(MAI.hasLEB128() && "Cannot print a .uleb");
+  assert(MAI->hasLEB128() && "Cannot print a .uleb");
   OS << ".uleb128 " << *Value;
   EmitEOL();
 }
@@ -718,41 +708,39 @@ void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
     EmitSLEB128IntValue(IntValue);
     return;
   }
-  assert(MAI.hasLEB128() && "Cannot print a .sleb");
+  assert(MAI->hasLEB128() && "Cannot print a .sleb");
   OS << ".sleb128 " << *Value;
   EmitEOL();
 }
 
 void MCAsmStreamer::EmitGPRel64Value(const MCExpr *Value) {
-  assert(MAI.getGPRel64Directive() != 0);
-  OS << MAI.getGPRel64Directive() << *Value;
+  assert(MAI->getGPRel64Directive() != 0);
+  OS << MAI->getGPRel64Directive() << *Value;
   EmitEOL();
 }
 
 void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
-  assert(MAI.getGPRel32Directive() != 0);
-  OS << MAI.getGPRel32Directive() << *Value;
+  assert(MAI->getGPRel32Directive() != 0);
+  OS << MAI->getGPRel32Directive() << *Value;
   EmitEOL();
 }
 
 
 /// EmitFill - Emit NumBytes bytes worth of the value specified by
 /// FillValue.  This implements directives such as '.space'.
-void MCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
-                             unsigned AddrSpace) {
+void MCAsmStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) {
   if (NumBytes == 0) return;
 
-  if (AddrSpace == 0)
-    if (const char *ZeroDirective = MAI.getZeroDirective()) {
-      OS << ZeroDirective << NumBytes;
-      if (FillValue != 0)
-        OS << ',' << (int)FillValue;
-      EmitEOL();
-      return;
-    }
+  if (const char *ZeroDirective = MAI->getZeroDirective()) {
+    OS << ZeroDirective << NumBytes;
+    if (FillValue != 0)
+      OS << ',' << (int)FillValue;
+    EmitEOL();
+    return;
+  }
 
   // Emit a byte at a time.
-  MCStreamer::EmitFill(NumBytes, FillValue, AddrSpace);
+  MCStreamer::EmitFill(NumBytes, FillValue);
 }
 
 void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
@@ -763,14 +751,14 @@ void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
   if (isPowerOf2_32(ByteAlignment)) {
     switch (ValueSize) {
     default: llvm_unreachable("Invalid size for machine code value!");
-    case 1: OS << MAI.getAlignDirective(); break;
+    case 1: OS << MAI->getAlignDirective(); break;
     // FIXME: use MAI for this!
     case 2: OS << ".p2alignw "; break;
     case 4: OS << ".p2alignl "; break;
     case 8: llvm_unreachable("Unsupported alignment size!");
     }
 
-    if (MAI.getAlignmentIsInBytes())
+    if (MAI->getAlignmentIsInBytes())
       OS << ByteAlignment;
     else
       OS << Log2_32(ByteAlignment);
@@ -806,7 +794,7 @@ void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
 void MCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment,
                                       unsigned MaxBytesToEmit) {
   // Emit with a text fill value.
-  EmitValueToAlignment(ByteAlignment, MAI.getTextAlignFillValue(),
+  EmitValueToAlignment(ByteAlignment, MAI->getTextAlignFillValue(),
                        1, MaxBytesToEmit);
 }
 
@@ -820,7 +808,7 @@ bool MCAsmStreamer::EmitValueToOffset(const MCExpr *Offset,
 
 
 void MCAsmStreamer::EmitFileDirective(StringRef Filename) {
-  assert(MAI.hasSingleParameterDotFile());
+  assert(MAI->hasSingleParameterDotFile());
   OS << "\t.file\t";
   PrintQuotedString(Filename, OS);
   EmitEOL();
@@ -886,13 +874,20 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
     OS << "discriminator " << Discriminator;
 
   if (IsVerboseAsm) {
-    OS.PadToColumn(MAI.getCommentColumn());
-    OS << MAI.getCommentString() << ' ' << FileName << ':'
+    OS.PadToColumn(MAI->getCommentColumn());
+    OS << MAI->getCommentString() << ' ' << FileName << ':'
        << Line << ':' << Column;
   }
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitIdent(StringRef IdentString) {
+  assert(MAI->hasIdentDirective() && ".ident directive not supported");
+  OS << "\t.ident\t";
+  PrintQuotedString(IdentString, OS);
+  EmitEOL();
+}
+
 void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
   MCStreamer::EmitCFISections(EH, Debug);
 
@@ -936,9 +931,9 @@ void MCAsmStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
 }
 
 void MCAsmStreamer::EmitRegisterName(int64_t Register) {
-  if (InstPrinter && !MAI.useDwarfRegNumForCFI()) {
-    const MCRegisterInfo &MRI = getContext().getRegisterInfo();
-    unsigned LLVMRegister = MRI.getLLVMRegNum(Register, true);
+  if (InstPrinter && !MAI->useDwarfRegNumForCFI()) {
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+    unsigned LLVMRegister = MRI->getLLVMRegNum(Register, true);
     InstPrinter->printRegName(OS, LLVMRegister);
   } else {
     OS << Register;
@@ -1094,6 +1089,16 @@ void MCAsmStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
   EmitEOL();
 }
 
+void MCAsmStreamer::EmitCFIWindowSave() {
+  MCStreamer::EmitCFIWindowSave();
+
+  if (!UseCFI)
+    return;
+
+  OS << "\t.cfi_window_save";
+  EmitEOL();
+}
+
 void MCAsmStreamer::EmitWin64EHStartProc(const MCSymbol *Symbol) {
   MCStreamer::EmitWin64EHStartProc(Symbol);
 
@@ -1276,7 +1281,7 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) {
         unsigned Bit = (Code[i] >> j) & 1;
 
         unsigned FixupBit;
-        if (getContext().getAsmInfo().isLittleEndian())
+        if (MAI->isLittleEndian())
           FixupBit = i * 8 + j;
         else
           FixupBit = i * 8 + (7-j);
@@ -1299,73 +1304,6 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst) {
   }
 }
 
-void MCAsmStreamer::EmitFnStart() {
-  OS << "\t.fnstart";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitFnEnd() {
-  OS << "\t.fnend";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitCantUnwind() {
-  OS << "\t.cantunwind";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitHandlerData() {
-  OS << "\t.handlerdata";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitPersonality(const MCSymbol *Personality) {
-  OS << "\t.personality " << Personality->getName();
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
-  OS << "\t.setfp\t";
-  InstPrinter->printRegName(OS, FpReg);
-  OS << ", ";
-  InstPrinter->printRegName(OS, SpReg);
-  if (Offset)
-    OS << ", #" << Offset;
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitPad(int64_t Offset) {
-  OS << "\t.pad\t#" << Offset;
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
-                                bool isVector) {
-  assert(RegList.size() && "RegList should not be empty");
-  if (isVector)
-    OS << "\t.vsave\t{";
-  else
-    OS << "\t.save\t{";
-
-  InstPrinter->printRegName(OS, RegList[0]);
-
-  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
-    OS << ", ";
-    InstPrinter->printRegName(OS, RegList[i]);
-  }
-
-  OS << "}";
-  EmitEOL();
-}
-
-void MCAsmStreamer::EmitTCEntry(const MCSymbol &S) {
-  OS << "\t.tc ";
-  OS << S.getName();
-  OS << "[TC],";
-  OS << S.getName();
-  EmitEOL();
-}
-
 void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
   assert(getCurrentSection().first &&
          "Cannot emit contents before setting section!");
@@ -1376,7 +1314,7 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
 
   // Show the MCInst if enabled.
   if (ShowInst) {
-    Inst.dump_pretty(GetCommentOS(), &MAI, InstPrinter.get(), "\n ");
+    Inst.dump_pretty(GetCommentOS(), MAI, InstPrinter.get(), "\n ");
     GetCommentOS() << "\n";
   }
 
@@ -1384,7 +1322,7 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
   if (InstPrinter)
     InstPrinter->printInst(&Inst, OS, "");
   else
-    Inst.print(OS, &MAI);
+    Inst.print(OS, MAI);
   EmitEOL();
 }
 
@@ -1408,7 +1346,7 @@ void MCAsmStreamer::EmitBundleUnlock() {
 /// EmitRawText - If this file is backed by an assembly streamer, this dumps
 /// the specified string in the output .s file.  This capability is
 /// indicated by the hasRawTextSupport() predicate.
-void MCAsmStreamer::EmitRawText(StringRef String) {
+void MCAsmStreamer::EmitRawTextImpl(StringRef String) {
   if (!String.empty() && String.back() == '\n')
     String = String.substr(0, String.size()-1);
   OS << String;
@@ -1427,14 +1365,16 @@ void MCAsmStreamer::FinishImpl() {
     MCGenDwarfInfo::Emit(this, LineSectionSymbol);
 
   if (!UseCFI)
-    EmitFrames(false);
+    EmitFrames(AsmBackend.get(), false);
 }
+
 MCStreamer *llvm::createAsmStreamer(MCContext &Context,
+                                    MCTargetStreamer *TargetStreamer,
                                     formatted_raw_ostream &OS,
-                                    bool isVerboseAsm, bool useLoc,
-                                    bool useCFI, bool useDwarfDirectory,
-                                    MCInstPrinter *IP, MCCodeEmitter *CE,
-                                    MCAsmBackend *MAB, bool ShowInst) {
-  return new MCAsmStreamer(Context, OS, isVerboseAsm, useLoc, useCFI,
-                           useDwarfDirectory, IP, CE, MAB, ShowInst);
+                                    bool isVerboseAsm, bool useLoc, bool useCFI,
+                                    bool useDwarfDirectory, MCInstPrinter *IP,
+                                    MCCodeEmitter *CE, MCAsmBackend *MAB,
+                                    bool ShowInst) {
+  return new MCAsmStreamer(Context, TargetStreamer, OS, isVerboseAsm, useLoc,
+                           useCFI, useDwarfDirectory, IP, CE, MAB, ShowInst);
 }
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index fb5ab28..68111f1 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -580,10 +580,10 @@ static void writeFragment(const MCAssembler &Asm, const MCAsmLayout &Layout,
   case MCFragment::FT_Align: {
     ++stats::EmittedAlignFragments;
     const MCAlignFragment &AF = cast<MCAlignFragment>(F);
-    uint64_t Count = FragmentSize / AF.getValueSize();
-
     assert(AF.getValueSize() && "Invalid virtual align in concrete fragment!");
 
+    uint64_t Count = FragmentSize / AF.getValueSize();
+
     // FIXME: This error shouldn't actually occur (the front end should emit
     // multiple .align directives to enforce the semantics it wants), but is
     // severe enough that we want to report it. How to handle this?
@@ -708,12 +708,13 @@ void MCAssembler::writeSectionData(const MCSectionData *SD,
       case MCFragment::FT_Align:
         // Check that we aren't trying to write a non-zero value into a virtual
         // section.
-        assert((!cast<MCAlignFragment>(it)->getValueSize() ||
-                !cast<MCAlignFragment>(it)->getValue()) &&
+        assert((cast<MCAlignFragment>(it)->getValueSize() == 0 ||
+                cast<MCAlignFragment>(it)->getValue() == 0) &&
                "Invalid align in virtual section!");
         break;
       case MCFragment::FT_Fill:
-        assert(!cast<MCFillFragment>(it)->getValueSize() &&
+        assert((cast<MCFillFragment>(it)->getValueSize() == 0 ||
+                cast<MCFillFragment>(it)->getValue() == 0) &&
                "Invalid fill in virtual section!");
         break;
       }
@@ -904,6 +905,7 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
 
 bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout,
                                      MCDwarfLineAddrFragment &DF) {
+  MCContext &Context = Layout.getAssembler().getContext();
   int64_t AddrDelta = 0;
   uint64_t OldSize = DF.getContents().size();
   bool IsAbs = DF.getAddrDelta().EvaluateAsAbsolute(AddrDelta, Layout);
@@ -914,13 +916,14 @@ bool MCAssembler::relaxDwarfLineAddr(MCAsmLayout &Layout,
   SmallString<8> &Data = DF.getContents();
   Data.clear();
   raw_svector_ostream OSE(Data);
-  MCDwarfLineAddr::Encode(LineDelta, AddrDelta, OSE);
+  MCDwarfLineAddr::Encode(Context, LineDelta, AddrDelta, OSE);
   OSE.flush();
   return OldSize != Data.size();
 }
 
 bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout,
                                               MCDwarfCallFrameFragment &DF) {
+  MCContext &Context = Layout.getAssembler().getContext();
   int64_t AddrDelta = 0;
   uint64_t OldSize = DF.getContents().size();
   bool IsAbs = DF.getAddrDelta().EvaluateAsAbsolute(AddrDelta, Layout);
@@ -929,7 +932,7 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout,
   SmallString<8> &Data = DF.getContents();
   Data.clear();
   raw_svector_ostream OSE(Data);
-  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OSE);
+  MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OSE);
   OSE.flush();
   return OldSize != Data.size();
 }
diff --git a/lib/MC/MCAtom.cpp b/lib/MC/MCAtom.cpp
index d714443..bc353cd 100644
--- a/lib/MC/MCAtom.cpp
+++ b/lib/MC/MCAtom.cpp
@@ -10,88 +10,105 @@
 #include "llvm/MC/MCAtom.h"
 #include "llvm/MC/MCModule.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <iterator>
 
 using namespace llvm;
 
-void MCAtom::addInst(const MCInst &I, uint64_t Address, unsigned Size) {
-  assert(Type == TextAtom && "Trying to add MCInst to a non-text atom!");
+// Pin the vtable to this file.
+void MCAtom::anchor() {}
 
-  assert(Address < End+Size &&
-         "Instruction not contiguous with end of atom!");
-  if (Address > End)
-    Parent->remap(this, Begin, End+Size);
-
-  Text.push_back(std::make_pair(Address, I));
+void MCAtom::remap(uint64_t NewBegin, uint64_t NewEnd) {
+  Parent->remap(this, NewBegin, NewEnd);
 }
 
-void MCAtom::addData(const MCData &D) {
-  assert(Type == DataAtom && "Trying to add MCData to a non-data atom!");
-  Parent->remap(this, Begin, End+1);
-
-  Data.push_back(D);
+void MCAtom::remapForTruncate(uint64_t TruncPt) {
+  assert((TruncPt >= Begin && TruncPt < End) &&
+         "Truncation point not contained in atom!");
+  remap(Begin, TruncPt);
 }
 
-MCAtom *MCAtom::split(uint64_t SplitPt) {
+void MCAtom::remapForSplit(uint64_t SplitPt,
+                           uint64_t &LBegin, uint64_t &LEnd,
+                           uint64_t &RBegin, uint64_t &REnd) {
   assert((SplitPt > Begin && SplitPt <= End) &&
          "Splitting at point not contained in atom!");
 
   // Compute the new begin/end points.
-  uint64_t LeftBegin = Begin;
-  uint64_t LeftEnd = SplitPt - 1;
-  uint64_t RightBegin = SplitPt;
-  uint64_t RightEnd = End;
+  LBegin = Begin;
+  LEnd = SplitPt - 1;
+  RBegin = SplitPt;
+  REnd = End;
 
   // Remap this atom to become the lower of the two new ones.
-  Parent->remap(this, LeftBegin, LeftEnd);
+  remap(LBegin, LEnd);
+}
 
-  // Create a new atom for the higher atom.
-  MCAtom *RightAtom = Parent->createAtom(Type, RightBegin, RightEnd);
+// MCDataAtom
 
-  // Split the contents of the original atom between it and the new one.  The
-  // precise method depends on whether this is a data or a text atom.
-  if (isDataAtom()) {
-    std::vector<MCData>::iterator I = Data.begin() + (RightBegin - LeftBegin);
+void MCDataAtom::addData(const MCData &D) {
+  Data.push_back(D);
+  if (Data.size() > End + 1 - Begin)
+    remap(Begin, End + 1);
+}
 
-    assert(I != Data.end() && "Split point not found in range!");
+void MCDataAtom::truncate(uint64_t TruncPt) {
+  remapForTruncate(TruncPt);
 
-    std::copy(I, Data.end(), RightAtom->Data.end());
-    Data.erase(I, Data.end());
-  } else if (isTextAtom()) {
-    std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
+  Data.resize(TruncPt - Begin + 1);
+}
 
-    while (I != Text.end() && I->first < SplitPt) ++I;
+MCDataAtom *MCDataAtom::split(uint64_t SplitPt) {
+  uint64_t LBegin, LEnd, RBegin, REnd;
+  remapForSplit(SplitPt, LBegin, LEnd, RBegin, REnd);
 
-    assert(I != Text.end() && "Split point not found in disassembly!");
-    assert(I->first == SplitPt &&
-           "Split point does not fall on instruction boundary!");
+  MCDataAtom *RightAtom = Parent->createDataAtom(RBegin, REnd);
+  RightAtom->setName(getName());
 
-    std::copy(I, Text.end(), RightAtom->Text.end());
-    Text.erase(I, Text.end());
-  } else
-    llvm_unreachable("Unknown atom type!");
+  std::vector<MCData>::iterator I = Data.begin() + (RBegin - LBegin);
+  assert(I != Data.end() && "Split point not found in range!");
 
+  std::copy(I, Data.end(), std::back_inserter(RightAtom->Data));
+  Data.erase(I, Data.end());
   return RightAtom;
 }
 
-void MCAtom::truncate(uint64_t TruncPt) {
-  assert((TruncPt >= Begin && TruncPt < End) &&
-         "Truncation point not contained in atom!");
+// MCTextAtom
 
-  Parent->remap(this, Begin, TruncPt);
+void MCTextAtom::addInst(const MCInst &I, uint64_t Size) {
+  if (NextInstAddress + Size - 1 > End)
+    remap(Begin, NextInstAddress + Size - 1);
+  Insts.push_back(MCDecodedInst(I, NextInstAddress, Size));
+  NextInstAddress += Size;
+}
 
-  if (isDataAtom()) {
-    Data.resize(TruncPt - Begin + 1);
-  } else if (isTextAtom()) {
-    std::vector<std::pair<uint64_t, MCInst> >::iterator I = Text.begin();
+void MCTextAtom::truncate(uint64_t TruncPt) {
+  remapForTruncate(TruncPt);
 
-    while (I != Text.end() && I->first <= TruncPt) ++I;
+  InstListTy::iterator I = Insts.begin();
+  while (I != Insts.end() && I->Address <= TruncPt) ++I;
 
-    assert(I != Text.end() && "Truncation point not found in disassembly!");
-    assert(I->first == TruncPt+1 &&
-           "Truncation point does not fall on instruction boundary");
+  assert(I != Insts.end() && "Truncation point not found in disassembly!");
+  assert(I->Address == TruncPt + 1 &&
+         "Truncation point does not fall on instruction boundary");
 
-    Text.erase(I, Text.end());
-  } else
-    llvm_unreachable("Unknown atom type!");
+  Insts.erase(I, Insts.end());
 }
 
+MCTextAtom *MCTextAtom::split(uint64_t SplitPt) {
+  uint64_t LBegin, LEnd, RBegin, REnd;
+  remapForSplit(SplitPt, LBegin, LEnd, RBegin, REnd);
+
+  MCTextAtom *RightAtom = Parent->createTextAtom(RBegin, REnd);
+  RightAtom->setName(getName());
+
+  InstListTy::iterator I = Insts.begin();
+  while (I != Insts.end() && I->Address < SplitPt) ++I;
+  assert(I != Insts.end() && "Split point not found in disassembly!");
+  assert(I->Address == SplitPt &&
+         "Split point does not fall on instruction boundary!");
+
+  std::copy(I, Insts.end(), std::back_inserter(RightAtom->Insts));
+  Insts.erase(I, Insts.end());
+  Parent->splitBasicBlocksForAtom(this, RightAtom);
+  return RightAtom;
+}
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 9adcc02..3b45d16 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -21,27 +21,35 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
+
+#include <map>
+
 using namespace llvm;
 
-typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
-typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
-typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
+typedef std::pair<std::string, std::string> SectionGroupPair;
 
+typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
+typedef std::map<SectionGroupPair, const MCSectionELF *> ELFUniqueMapTy;
+typedef std::map<SectionGroupPair, const MCSectionCOFF *> COFFUniqueMapTy;
 
-MCContext::MCContext(const MCAsmInfo &mai, const MCRegisterInfo &mri,
+MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
                      const MCObjectFileInfo *mofi, const SourceMgr *mgr,
                      bool DoAutoReset) :
   SrcMgr(mgr), MAI(mai), MRI(mri), MOFI(mofi),
   Allocator(), Symbols(Allocator), UsedNames(Allocator),
   NextUniqueID(0),
-  CompilationDir(llvm::sys::Path::GetCurrentDirectory().str()),
   CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0), 
   DwarfLocSeen(false), GenDwarfForAssembly(false), GenDwarfFileNumber(0),
   AllowTemporaryLabels(true), DwarfCompileUnitID(0), AutoReset(DoAutoReset) {
 
+  error_code EC = llvm::sys::fs::current_path(CompilationDir);
+  assert(!EC && "Could not determine the current directory");
+  (void)EC;
+
   MachOUniquingMap = 0;
   ELFUniquingMap = 0;
   COFFUniquingMap = 0;
@@ -126,7 +134,7 @@ MCSymbol *MCContext::CreateSymbol(StringRef Name) {
   // Determine whether this is an assembler temporary or normal label, if used.
   bool isTemporary = false;
   if (AllowTemporaryLabels)
-    isTemporary = Name.startswith(MAI.getPrivateGlobalPrefix());
+    isTemporary = Name.startswith(MAI->getPrivateGlobalPrefix());
 
   StringMapEntry<bool> *NameEntry = &UsedNames.GetOrCreateValue(Name);
   if (NameEntry->getValue()) {
@@ -156,7 +164,7 @@ MCSymbol *MCContext::GetOrCreateSymbol(const Twine &Name) {
 MCSymbol *MCContext::CreateTempSymbol() {
   SmallString<128> NameSV;
   raw_svector_ostream(NameSV)
-    << MAI.getPrivateGlobalPrefix() << "tmp" << NextUniqueID++;
+    << MAI->getPrivateGlobalPrefix() << "tmp" << NextUniqueID++;
   return CreateSymbol(NameSV);
 }
 
@@ -175,14 +183,14 @@ unsigned MCContext::GetInstance(int64_t LocalLabelVal) {
 }
 
 MCSymbol *MCContext::CreateDirectionalLocalSymbol(int64_t LocalLabelVal) {
-  return GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
+  return GetOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
                            Twine(LocalLabelVal) +
                            "\2" +
                            Twine(NextInstance(LocalLabelVal)));
 }
 MCSymbol *MCContext::GetDirectionalLocalSymbol(int64_t LocalLabelVal,
                                                int bORf) {
-  return GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix()) +
+  return GetOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
                            Twine(LocalLabelVal) +
                            "\2" +
                            Twine(GetInstance(LocalLabelVal) + bORf));
@@ -245,8 +253,9 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags,
   ELFUniqueMapTy &Map = *(ELFUniqueMapTy*)ELFUniquingMap;
 
   // Do the lookup, if we have a hit, return it.
-  StringMapEntry<const MCSectionELF*> &Entry = Map.GetOrCreateValue(Section);
-  if (Entry.getValue()) return Entry.getValue();
+  std::pair<ELFUniqueMapTy::iterator, bool> Entry = Map.insert(
+      std::make_pair(SectionGroupPair(Section, Group), (MCSectionELF *)0));
+  if (!Entry.second) return Entry.first->second;
 
   // Possibly refine the entry size first.
   if (!EntrySize) {
@@ -257,9 +266,9 @@ getELFSection(StringRef Section, unsigned Type, unsigned Flags,
   if (!Group.empty())
     GroupSym = GetOrCreateSymbol(Group);
 
-  MCSectionELF *Result = new (*this) MCSectionELF(Entry.getKey(), Type, Flags,
-                                                  Kind, EntrySize, GroupSym);
-  Entry.setValue(Result);
+  MCSectionELF *Result = new (*this) MCSectionELF(
+      Entry.first->first.first, Type, Flags, Kind, EntrySize, GroupSym);
+  Entry.first->second = Result;
   return Result;
 }
 
@@ -270,26 +279,53 @@ const MCSectionELF *MCContext::CreateELFGroupSection() {
   return Result;
 }
 
-const MCSection *MCContext::getCOFFSection(StringRef Section,
-                                           unsigned Characteristics,
-                                           int Selection,
-                                           SectionKind Kind) {
+const MCSectionCOFF *
+MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
+                          SectionKind Kind, StringRef COMDATSymName,
+                          int Selection, const MCSectionCOFF *Assoc) {
   if (COFFUniquingMap == 0)
     COFFUniquingMap = new COFFUniqueMapTy();
   COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap;
 
   // Do the lookup, if we have a hit, return it.
-  StringMapEntry<const MCSectionCOFF*> &Entry = Map.GetOrCreateValue(Section);
-  if (Entry.getValue()) return Entry.getValue();
 
-  MCSectionCOFF *Result = new (*this) MCSectionCOFF(Entry.getKey(),
-                                                    Characteristics,
-                                                    Selection, Kind);
+  SectionGroupPair P(Section, COMDATSymName);
+  std::pair<COFFUniqueMapTy::iterator, bool> Entry =
+      Map.insert(std::make_pair(P, (MCSectionCOFF *)0));
+  COFFUniqueMapTy::iterator Iter = Entry.first;
+  if (!Entry.second)
+    return Iter->second;
+
+  const MCSymbol *COMDATSymbol = NULL;
+  if (!COMDATSymName.empty())
+    COMDATSymbol = GetOrCreateSymbol(COMDATSymName);
 
-  Entry.setValue(Result);
+  MCSectionCOFF *Result =
+      new (*this) MCSectionCOFF(Iter->first.first, Characteristics,
+                                COMDATSymbol, Selection, Assoc, Kind);
+
+  Iter->second = Result;
   return Result;
 }
 
+const MCSectionCOFF *
+MCContext::getCOFFSection(StringRef Section, unsigned Characteristics,
+                          SectionKind Kind) {
+  return getCOFFSection(Section, Characteristics, Kind, "", 0);
+}
+
+const MCSectionCOFF *MCContext::getCOFFSection(StringRef Section) {
+  if (COFFUniquingMap == 0)
+    COFFUniquingMap = new COFFUniqueMapTy();
+  COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)COFFUniquingMap;
+
+  SectionGroupPair P(Section, "");
+  COFFUniqueMapTy::iterator Iter = Map.find(P);
+  if (Iter == Map.end())
+    return 0;
+  return Iter->second;
+}
+
 //===----------------------------------------------------------------------===//
 // Dwarf Management
 //===----------------------------------------------------------------------===//
diff --git a/lib/MC/MCDisassembler.cpp b/lib/MC/MCDisassembler.cpp
index 0809690..bfd51ab 100644
--- a/lib/MC/MCDisassembler.cpp
+++ b/lib/MC/MCDisassembler.cpp
@@ -8,7 +8,49 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/Support/raw_ostream.h"
+
 using namespace llvm;
 
 MCDisassembler::~MCDisassembler() {
 }
+
+void
+MCDisassembler::setupForSymbolicDisassembly(
+    LLVMOpInfoCallback GetOpInfo,
+    LLVMSymbolLookupCallback SymbolLookUp,
+    void *DisInfo,
+    MCContext *Ctx,
+    OwningPtr<MCRelocationInfo> &RelInfo) {
+  this->GetOpInfo = GetOpInfo;
+  this->SymbolLookUp = SymbolLookUp;
+  this->DisInfo = DisInfo;
+  this->Ctx = Ctx;
+  assert(Ctx != 0 && "No MCContext given for symbolic disassembly");
+  if (!Symbolizer)
+    Symbolizer.reset(new MCExternalSymbolizer(*Ctx, RelInfo, GetOpInfo,
+                                              SymbolLookUp, DisInfo));
+}
+
+bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
+                                              uint64_t Address, bool IsBranch,
+                                              uint64_t Offset,
+                                              uint64_t InstSize) const {
+  raw_ostream &cStream = CommentStream ? *CommentStream : nulls();
+  if (Symbolizer)
+    return Symbolizer->tryAddingSymbolicOperand(Inst, cStream, Value, Address,
+                                                IsBranch, Offset, InstSize);
+  return false;
+}
+
+void MCDisassembler::tryAddingPcLoadReferenceComment(int64_t Value,
+                                                     uint64_t Address) const {
+  raw_ostream &cStream = CommentStream ? *CommentStream : nulls();
+  if (Symbolizer)
+    Symbolizer->tryAddingPcLoadReferenceComment(cStream, Value, Address);
+}
+
+void MCDisassembler::setSymbolizer(OwningPtr<MCSymbolizer> &Symzer) {
+  Symbolizer.reset(Symzer.take());
+}
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index 4766b37..a0066c8 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -16,8 +16,11 @@
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCRelocationInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbolizer.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
 
@@ -40,10 +43,15 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
   // Get the target.
   std::string Error;
   const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
-  assert(TheTarget && "Unable to create target!");
+  if (!TheTarget)
+    return 0;
+
+  const MCRegisterInfo *MRI = TheTarget->createMCRegInfo(Triple);
+  if (!MRI)
+    return 0;
 
   // Get the assembler info needed to setup the MCContext.
-  const MCAsmInfo *MAI = TheTarget->createMCAsmInfo(Triple);
+  const MCAsmInfo *MAI = TheTarget->createMCAsmInfo(*MRI, Triple);
   if (!MAI)
     return 0;
 
@@ -51,10 +59,6 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
   if (!MII)
     return 0;
 
-  const MCRegisterInfo *MRI = TheTarget->createMCRegInfo(Triple);
-  if (!MRI)
-    return 0;
-
   // Package up features to be passed to target/subtarget
   std::string FeaturesStr;
 
@@ -64,7 +68,7 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
     return 0;
 
   // Set up the MCContext for creating symbols and MCExpr's.
-  MCContext *Ctx = new MCContext(*MAI, *MRI, 0);
+  MCContext *Ctx = new MCContext(MAI, MRI, 0);
   if (!Ctx)
     return 0;
 
@@ -72,8 +76,18 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
   MCDisassembler *DisAsm = TheTarget->createMCDisassembler(*STI);
   if (!DisAsm)
     return 0;
-  DisAsm->setupForSymbolicDisassembly(GetOpInfo, SymbolLookUp, DisInfo, Ctx);
 
+  OwningPtr<MCRelocationInfo> RelInfo(
+    TheTarget->createMCRelocationInfo(Triple, *Ctx));
+  if (!RelInfo)
+    return 0;
+
+  OwningPtr<MCSymbolizer> Symbolizer(
+    TheTarget->createMCSymbolizer(Triple, GetOpInfo, SymbolLookUp, DisInfo,
+                                  Ctx, RelInfo.take()));
+  DisAsm->setSymbolizer(Symbolizer);
+  DisAsm->setupForSymbolicDisassembly(GetOpInfo, SymbolLookUp, DisInfo,
+                                      Ctx, RelInfo);
   // Set up the instruction printer.
   int AsmPrinterVariant = MAI->getAssemblerDialect();
   MCInstPrinter *IP = TheTarget->createMCInstPrinter(AsmPrinterVariant,
@@ -88,6 +102,7 @@ LLVMDisasmContextRef LLVMCreateDisasmCPU(const char *Triple, const char *CPU,
   if (!DC)
     return 0;
 
+  DC->setCPU(CPU);
   return DC;
 }
 
@@ -130,6 +145,112 @@ public:
 };
 } // end anonymous namespace
 
+/// \brief Emits the comments that are stored in \p DC comment stream.
+/// Each comment in the comment stream must end with a newline.
+static void emitComments(LLVMDisasmContext *DC,
+                         formatted_raw_ostream &FormattedOS) {
+  // Flush the stream before taking its content.
+  DC->CommentStream.flush();
+  StringRef Comments = DC->CommentsToEmit.str();
+  // Get the default information for printing a comment.
+  const MCAsmInfo *MAI = DC->getAsmInfo();
+  const char *CommentBegin = MAI->getCommentString();
+  unsigned CommentColumn = MAI->getCommentColumn();
+  bool IsFirst = true;
+  while (!Comments.empty()) {
+    if (!IsFirst)
+      FormattedOS << '\n';
+    // Emit a line of comments.
+    FormattedOS.PadToColumn(CommentColumn);
+    size_t Position = Comments.find('\n');
+    FormattedOS << CommentBegin << ' ' << Comments.substr(0, Position);
+    // Move after the newline character.
+    Comments = Comments.substr(Position+1);
+    IsFirst = false;
+  }
+  FormattedOS.flush();
+
+  // Tell the comment stream that the vector changed underneath it.
+  DC->CommentsToEmit.clear();
+  DC->CommentStream.resync();
+}
+
+/// \brief Gets latency information for \p Inst form the itinerary
+/// scheduling model, based on \p DC information.
+/// \return The maximum expected latency over all the operands or -1
+/// if no information are available.
+static int getItineraryLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+  const int NoInformationAvailable = -1;
+
+  // Check if we have a CPU to get the itinerary information.
+  if (DC->getCPU().empty())
+    return NoInformationAvailable;
+
+  // Get itinerary information.
+  const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+  InstrItineraryData IID = STI->getInstrItineraryForCPU(DC->getCPU());
+  // Get the scheduling class of the requested instruction.
+  const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+
+  int Latency = 0;
+  for (unsigned OpIdx = 0, OpIdxEnd = Inst.getNumOperands(); OpIdx != OpIdxEnd;
+       ++OpIdx)
+    Latency = std::max(Latency, IID.getOperandCycle(SCClass, OpIdx));
+
+  return Latency;
+}
+
+/// \brief Gets latency information for \p Inst, based on \p DC information.
+/// \return The maximum expected latency over all the definitions or -1
+/// if no information are available.
+static int getLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+  // Try to compute scheduling information.
+  const MCSubtargetInfo *STI = DC->getSubtargetInfo();
+  const MCSchedModel *SCModel = STI->getSchedModel();
+  const int NoInformationAvailable = -1;
+
+  // Check if we have a scheduling model for instructions.
+  if (!SCModel || !SCModel->hasInstrSchedModel())
+    // Try to fall back to the itinerary model if we do not have a
+    // scheduling model.
+    return getItineraryLatency(DC, Inst);
+
+  // Get the scheduling class of the requested instruction.
+  const MCInstrDesc& Desc = DC->getInstrInfo()->get(Inst.getOpcode());
+  unsigned SCClass = Desc.getSchedClass();
+  const MCSchedClassDesc *SCDesc = SCModel->getSchedClassDesc(SCClass);
+  // Resolving the variant SchedClass requires an MI to pass to
+  // SubTargetInfo::resolveSchedClass.
+  if (!SCDesc || !SCDesc->isValid() || SCDesc->isVariant())
+    return NoInformationAvailable;
+
+  // Compute output latency.
+  int Latency = 0;
+  for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
+       DefIdx != DefEnd; ++DefIdx) {
+    // Lookup the definition's write latency in SubtargetInfo.
+    const MCWriteLatencyEntry *WLEntry = STI->getWriteLatencyEntry(SCDesc,
+                                                                   DefIdx);
+    Latency = std::max(Latency, WLEntry->Cycles);
+  }
+
+  return Latency;
+}
+
+
+/// \brief Emits latency information in DC->CommentStream for \p Inst, based
+/// on the information available in \p DC.
+static void emitLatency(LLVMDisasmContext *DC, const MCInst &Inst) {
+  int Latency = getLatency(DC, Inst);
+
+  // Report only interesting latency.
+  if (Latency < 2)
+    return;
+
+  DC->CommentStream << "Latency: " << Latency << '\n';
+}
+
 //
 // LLVMDisasmInstruction() disassembles a single instruction using the
 // disassembler context specified in the parameter DC.  The bytes of the
@@ -154,8 +275,10 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
   const MCDisassembler *DisAsm = DC->getDisAsm();
   MCInstPrinter *IP = DC->getIP();
   MCDisassembler::DecodeStatus S;
+  SmallVector<char, 64> InsnStr;
+  raw_svector_ostream Annotations(InsnStr);
   S = DisAsm->getInstruction(Inst, Size, MemoryObject, PC,
-                             /*REMOVE*/ nulls(), DC->CommentStream);
+                             /*REMOVE*/ nulls(), Annotations);
   switch (S) {
   case MCDisassembler::Fail:
   case MCDisassembler::SoftFail:
@@ -163,17 +286,18 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
     return 0;
 
   case MCDisassembler::Success: {
-    DC->CommentStream.flush();
-    StringRef Comments = DC->CommentsToEmit.str();
+    Annotations.flush();
+    StringRef AnnotationsStr = Annotations.str();
 
     SmallVector<char, 64> InsnStr;
     raw_svector_ostream OS(InsnStr);
-    IP->printInst(&Inst, OS, Comments);
-    OS.flush();
+    formatted_raw_ostream FormattedOS(OS);
+    IP->printInst(&Inst, FormattedOS, AnnotationsStr);
+
+    if (DC->getOptions() & LLVMDisassembler_Option_PrintLatency)
+      emitLatency(DC, Inst);
 
-    // Tell the comment stream that the vector changed underneath it.
-    DC->CommentsToEmit.clear();
-    DC->CommentStream.resync();
+    emitComments(DC, FormattedOS);
 
     assert(OutStringSize != 0 && "Output buffer cannot be zero size");
     size_t OutputSize = std::min(OutStringSize-1, InsnStr.size());
@@ -195,12 +319,14 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
       LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
       MCInstPrinter *IP = DC->getIP();
       IP->setUseMarkup(1);
+      DC->addOptions(LLVMDisassembler_Option_UseMarkup);
       Options &= ~LLVMDisassembler_Option_UseMarkup;
   }
   if (Options & LLVMDisassembler_Option_PrintImmHex){
       LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
       MCInstPrinter *IP = DC->getIP();
       IP->setPrintImmHex(1);
+      DC->addOptions(LLVMDisassembler_Option_PrintImmHex);
       Options &= ~LLVMDisassembler_Option_PrintImmHex;
   }
   if (Options & LLVMDisassembler_Option_AsmPrinterVariant){
@@ -216,8 +342,21 @@ int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
           AsmPrinterVariant, *MAI, *MII, *MRI, *STI);
       if (IP) {
         DC->setIP(IP);
+        DC->addOptions(LLVMDisassembler_Option_AsmPrinterVariant);
         Options &= ~LLVMDisassembler_Option_AsmPrinterVariant;
       }
   }
+  if (Options & LLVMDisassembler_Option_SetInstrComments) {
+    LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+    MCInstPrinter *IP = DC->getIP();
+    IP->setCommentStream(DC->CommentStream);
+    DC->addOptions(LLVMDisassembler_Option_SetInstrComments);
+    Options &= ~LLVMDisassembler_Option_SetInstrComments;
+  }
+  if (Options & LLVMDisassembler_Option_PrintLatency) {
+    LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+    DC->addOptions(LLVMDisassembler_Option_PrintLatency);
+    Options &= ~LLVMDisassembler_Option_PrintLatency;
+  }
   return (Options == 0);
 }
diff --git a/lib/MC/MCDisassembler/Disassembler.h b/lib/MC/MCDisassembler/Disassembler.h
index 6eb59d0..4855af2 100644
--- a/lib/MC/MCDisassembler/Disassembler.h
+++ b/lib/MC/MCDisassembler/Disassembler.h
@@ -73,6 +73,10 @@ private:
   llvm::OwningPtr<const llvm::MCDisassembler> DisAsm;
   // The instruction printer for the target architecture.
   llvm::OwningPtr<llvm::MCInstPrinter> IP;
+  // The options used to set up the disassembler.
+  uint64_t Options;
+  // The CPU string.
+  std::string CPU;
 
 public:
   // Comment stream and backing vector.
@@ -90,6 +94,7 @@ public:
                     MCInstPrinter *iP) : TripleName(tripleName),
                     DisInfo(disInfo), TagType(tagType), GetOpInfo(getOpInfo),
                     SymbolLookUp(symbolLookUp), TheTarget(theTarget),
+                    Options(0),
                     CommentStream(CommentsToEmit) {
     MAI.reset(mAI);
     MRI.reset(mRI);
@@ -114,6 +119,10 @@ public:
   const MCSubtargetInfo *getSubtargetInfo() const { return MSI.get(); }
   MCInstPrinter *getIP() { return IP.get(); }
   void setIP(MCInstPrinter *NewIP) { IP.reset(NewIP); }
+  uint64_t getOptions() const { return Options; }
+  void addOptions(uint64_t Options) { this->Options |= Options; }
+  StringRef getCPU() const { return CPU; }
+  void setCPU(const char *CPU) { this->CPU = CPU; }
 };
 
 } // namespace llvm
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 18982e9..1e5c2e3 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -16,7 +16,6 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -47,20 +46,15 @@ using namespace llvm;
 // Range of line offsets in a special line info. opcode.
 #define DWARF2_LINE_RANGE               14
 
-// Define the architecture-dependent minimum instruction length (in bytes).
-// This value should be rather too small than too big.
-#define DWARF2_LINE_MIN_INSN_LENGTH     1
-
-// Note: when DWARF2_LINE_MIN_INSN_LENGTH == 1 which is the current setting,
-// this routine is a nop and will be optimized away.
-static inline uint64_t ScaleAddrDelta(uint64_t AddrDelta) {
-  if (DWARF2_LINE_MIN_INSN_LENGTH == 1)
+static inline uint64_t ScaleAddrDelta(MCContext &Context, uint64_t AddrDelta) {
+  unsigned MinInsnLength = Context.getAsmInfo()->getMinInstAlignment();
+  if (MinInsnLength == 1)
     return AddrDelta;
-  if (AddrDelta % DWARF2_LINE_MIN_INSN_LENGTH != 0) {
+  if (AddrDelta % MinInsnLength != 0) {
     // TODO: report this error, but really only once.
     ;
   }
-  return AddrDelta / DWARF2_LINE_MIN_INSN_LENGTH;
+  return AddrDelta / MinInsnLength;
 }
 
 //
@@ -182,9 +176,9 @@ static inline void EmitDwarfLineTable(MCStreamer *MCOS,
     // At this point we want to emit/create the sequence to encode the delta in
     // line numbers and the increment of the address from the previous Label
     // and the current Label.
-    const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo();
+    const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo();
     MCOS->EmitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label,
-                                   asmInfo.getPointerSize());
+                                   asmInfo->getPointerSize());
 
     LastLine = it->getLine();
     LastLabel = Label;
@@ -210,9 +204,9 @@ static inline void EmitDwarfLineTable(MCStreamer *MCOS,
   // Switch back the dwarf line section.
   MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfLineSection());
 
-  const MCAsmInfo &asmInfo = MCOS->getContext().getAsmInfo();
+  const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo();
   MCOS->EmitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, SectionEnd,
-                                 asmInfo.getPointerSize());
+                                 asmInfo->getPointerSize());
 }
 
 //
@@ -274,10 +268,10 @@ const MCSymbol *MCDwarfFileTable::EmitCU(MCStreamer *MCOS, unsigned CUID) {
   // total length, the 2 bytes for the version, and these 4 bytes for the
   // length of the prologue.
   MCOS->EmitAbsValue(MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym,
-                                           (4 + 2 + 4)), 4, 0);
+                                           (4 + 2 + 4)), 4);
 
   // Parameters of the state machine, are next.
-  MCOS->EmitIntValue(DWARF2_LINE_MIN_INSN_LENGTH, 1);
+  MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1);
   MCOS->EmitIntValue(DWARF2_LINE_DEFAULT_IS_STMT, 1);
   MCOS->EmitIntValue(DWARF2_LINE_BASE, 1);
   MCOS->EmitIntValue(DWARF2_LINE_RANGE, 1);
@@ -338,7 +332,7 @@ const MCSymbol *MCDwarfFileTable::EmitCU(MCStreamer *MCOS, unsigned CUID) {
     EmitDwarfLineTable(MCOS, Sec, Line, CUID);
   }
 
-  if (MCOS->getContext().getAsmInfo().getLinkerRequiresNonEmptyDwarfLines()
+  if (MCOS->getContext().getAsmInfo()->getLinkerRequiresNonEmptyDwarfLines()
       && MCLineSectionOrder.begin() == MCLineSectionOrder.end()) {
     // The darwin9 linker has a bug (see PR8715). For for 32-bit architectures
     // it requires:
@@ -357,32 +351,24 @@ const MCSymbol *MCDwarfFileTable::EmitCU(MCStreamer *MCOS, unsigned CUID) {
   return LineStartSym;
 }
 
-/// Utility function to write the encoding to an object writer.
-void MCDwarfLineAddr::Write(MCObjectWriter *OW, int64_t LineDelta,
-                            uint64_t AddrDelta) {
-  SmallString<256> Tmp;
-  raw_svector_ostream OS(Tmp);
-  MCDwarfLineAddr::Encode(LineDelta, AddrDelta, OS);
-  OW->WriteBytes(OS.str());
-}
-
 /// Utility function to emit the encoding to a streamer.
 void MCDwarfLineAddr::Emit(MCStreamer *MCOS, int64_t LineDelta,
                            uint64_t AddrDelta) {
+  MCContext &Context = MCOS->getContext();
   SmallString<256> Tmp;
   raw_svector_ostream OS(Tmp);
-  MCDwarfLineAddr::Encode(LineDelta, AddrDelta, OS);
+  MCDwarfLineAddr::Encode(Context, LineDelta, AddrDelta, OS);
   MCOS->EmitBytes(OS.str());
 }
 
 /// Utility function to encode a Dwarf pair of LineDelta and AddrDeltas.
-void MCDwarfLineAddr::Encode(int64_t LineDelta, uint64_t AddrDelta,
-                             raw_ostream &OS) {
+void MCDwarfLineAddr::Encode(MCContext &Context, int64_t LineDelta,
+                             uint64_t AddrDelta, raw_ostream &OS) {
   uint64_t Temp, Opcode;
   bool NeedCopy = false;
 
   // Scale the address delta by the minimum instruction length.
-  AddrDelta = ScaleAddrDelta(AddrDelta);
+  AddrDelta = ScaleAddrDelta(Context, AddrDelta);
 
   // A LineDelta of INT64_MAX is a signal that this is actually a
   // DW_LNE_end_sequence. We cannot use special opcodes here, since we want the
@@ -534,8 +520,8 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
 
   // Figure the padding after the header before the table of address and size
   // pairs who's values are PointerSize'ed.
-  const MCAsmInfo &asmInfo = context.getAsmInfo();
-  int AddrSize = asmInfo.getPointerSize();
+  const MCAsmInfo *asmInfo = context.getAsmInfo();
+  int AddrSize = asmInfo->getPointerSize();
   int Pad = 2 * AddrSize - (Length & (2 * AddrSize - 1));
   if (Pad == 2 * AddrSize)
     Pad = 0;
@@ -615,8 +601,8 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
     MCOS->EmitIntValue(0, 4);
   }
 
-  const MCAsmInfo &asmInfo = context.getAsmInfo();
-  int AddrSize = asmInfo.getPointerSize();
+  const MCAsmInfo *asmInfo = context.getAsmInfo();
+  int AddrSize = asmInfo->getPointerSize();
   // The 1 byte size of an address.
   MCOS->EmitIntValue(AddrSize, 1);
 
@@ -743,9 +729,9 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
 void MCGenDwarfInfo::Emit(MCStreamer *MCOS, const MCSymbol *LineSectionSymbol) {
   // Create the dwarf sections in this order (.debug_line already created).
   MCContext &context = MCOS->getContext();
-  const MCAsmInfo &AsmInfo = context.getAsmInfo();
+  const MCAsmInfo *AsmInfo = context.getAsmInfo();
   bool CreateDwarfSectionSymbols =
-      AsmInfo.doesDwarfUseRelocationsAcrossSections();
+      AsmInfo->doesDwarfUseRelocationsAcrossSections();
   if (!CreateDwarfSectionSymbols)
     LineSectionSymbol = NULL;
   MCSymbol *AbbrevSectionSymbol = NULL;
@@ -821,9 +807,9 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS,
 
 static int getDataAlignmentFactor(MCStreamer &streamer) {
   MCContext &context = streamer.getContext();
-  const MCAsmInfo &asmInfo = context.getAsmInfo();
-  int size = asmInfo.getCalleeSaveStackSlotSize();
-  if (asmInfo.isStackGrowthDirectionUp())
+  const MCAsmInfo *asmInfo = context.getAsmInfo();
+  int size = asmInfo->getCalleeSaveStackSlotSize();
+  if (asmInfo->isStackGrowthDirectionUp())
     return size;
   else
     return -size;
@@ -837,7 +823,7 @@ static unsigned getSizeForEncoding(MCStreamer &streamer,
   default: llvm_unreachable("Unknown Encoding");
   case dwarf::DW_EH_PE_absptr:
   case dwarf::DW_EH_PE_signed:
-    return context.getAsmInfo().getPointerSize();
+    return context.getAsmInfo()->getPointerSize();
   case dwarf::DW_EH_PE_udata2:
   case dwarf::DW_EH_PE_sdata2:
     return 2;
@@ -853,10 +839,10 @@ static unsigned getSizeForEncoding(MCStreamer &streamer,
 static void EmitSymbol(MCStreamer &streamer, const MCSymbol &symbol,
                        unsigned symbolEncoding, const char *comment = 0) {
   MCContext &context = streamer.getContext();
-  const MCAsmInfo &asmInfo = context.getAsmInfo();
-  const MCExpr *v = asmInfo.getExprForFDESymbol(&symbol,
-                                                symbolEncoding,
-                                                streamer);
+  const MCAsmInfo *asmInfo = context.getAsmInfo();
+  const MCExpr *v = asmInfo->getExprForFDESymbol(&symbol,
+                                                 symbolEncoding,
+                                                 streamer);
   unsigned size = getSizeForEncoding(streamer, symbolEncoding);
   if (streamer.isVerboseAsm() && comment) streamer.AddComment(comment);
   streamer.EmitAbsValue(v, size);
@@ -865,25 +851,14 @@ static void EmitSymbol(MCStreamer &streamer, const MCSymbol &symbol,
 static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
                             unsigned symbolEncoding) {
   MCContext &context = streamer.getContext();
-  const MCAsmInfo &asmInfo = context.getAsmInfo();
-  const MCExpr *v = asmInfo.getExprForPersonalitySymbol(&symbol,
-                                                        symbolEncoding,
-                                                        streamer);
+  const MCAsmInfo *asmInfo = context.getAsmInfo();
+  const MCExpr *v = asmInfo->getExprForPersonalitySymbol(&symbol,
+                                                         symbolEncoding,
+                                                         streamer);
   unsigned size = getSizeForEncoding(streamer, symbolEncoding);
   streamer.EmitValue(v, size);
 }
 
-static const MachineLocation TranslateMachineLocation(
-                                                  const MCRegisterInfo &MRI,
-                                                  const MachineLocation &Loc) {
-  unsigned Reg = Loc.getReg() == MachineLocation::VirtualFP ?
-    MachineLocation::VirtualFP :
-    unsigned(MRI.getDwarfRegNum(Loc.getReg(), true));
-  const MachineLocation &NewLoc = Loc.isReg() ?
-    MachineLocation(Reg) : MachineLocation(Reg, Loc.getOffset());
-  return NewLoc;
-}
-
 namespace {
   class FrameEmitterImpl {
     int CFAOffset;
@@ -898,9 +873,7 @@ namespace {
 
     void setSectionStart(const MCSymbol *Label) { SectionStart = Label; }
 
-    /// EmitCompactUnwind - Emit the unwind information in a compact way. If
-    /// we're successful, return 'true'. Otherwise, return 'false' and it will
-    /// emit the normal CIE and FDE.
+    /// EmitCompactUnwind - Emit the unwind information in a compact way.
     void EmitCompactUnwind(MCStreamer &streamer,
                            const MCDwarfFrameInfo &frame);
 
@@ -914,7 +887,7 @@ namespace {
                       const MCSymbol &cieStart,
                       const MCDwarfFrameInfo &frame);
     void EmitCFIInstructions(MCStreamer &streamer,
-                             const std::vector<MCCFIInstruction> &Instrs,
+                             ArrayRef<MCCFIInstruction> Instrs,
                              MCSymbol *BaseLabel);
     void EmitCFIInstruction(MCStreamer &Streamer,
                             const MCCFIInstruction &Instr);
@@ -986,6 +959,10 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
     Streamer.EmitULEB128IntValue(Reg2);
     return;
   }
+  case MCCFIInstruction::OpWindowSave: {
+    Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
+    return;
+  }
   case MCCFIInstruction::OpUndefined: {
     unsigned Reg = Instr.getRegister();
     if (VerboseAsm) {
@@ -1116,7 +1093,7 @@ void FrameEmitterImpl::EmitCFIInstruction(MCStreamer &Streamer,
 /// EmitFrameMoves - Emit frame instructions to describe the layout of the
 /// frame.
 void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
-                                    const std::vector<MCCFIInstruction> &Instrs,
+                                           ArrayRef<MCCFIInstruction> Instrs,
                                            MCSymbol *BaseLabel) {
   for (unsigned i = 0, N = Instrs.size(); i < N; ++i) {
     const MCCFIInstruction &Instr = Instrs[i];
@@ -1138,9 +1115,7 @@ void FrameEmitterImpl::EmitCFIInstructions(MCStreamer &streamer,
   }
 }
 
-/// EmitCompactUnwind - Emit the unwind information in a compact way. If we're
-/// successful, return 'true'. Otherwise, return 'false' and it will emit the
-/// normal CIE and FDE.
+/// EmitCompactUnwind - Emit the unwind information in a compact way.
 void FrameEmitterImpl::EmitCompactUnwind(MCStreamer &Streamer,
                                          const MCDwarfFrameInfo &Frame) {
   MCContext &Context = Streamer.getContext();
@@ -1219,7 +1194,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
                                           bool IsSignalFrame,
                                           unsigned lsdaEncoding) {
   MCContext &context = streamer.getContext();
-  const MCRegisterInfo &MRI = context.getRegisterInfo();
+  const MCRegisterInfo *MRI = context.getRegisterInfo();
   const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
   bool verboseAsm = streamer.isVerboseAsm();
 
@@ -1267,7 +1242,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
 
   // Code Alignment Factor
   if (verboseAsm) streamer.AddComment("CIE Code Alignment Factor");
-  streamer.EmitULEB128IntValue(1);
+  streamer.EmitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment());
 
   // Data Alignment Factor
   if (verboseAsm) streamer.AddComment("CIE Data Alignment Factor");
@@ -1275,7 +1250,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
 
   // Return Address Register
   if (verboseAsm) streamer.AddComment("CIE Return Address Column");
-  streamer.EmitULEB128IntValue(MRI.getDwarfRegNum(MRI.getRARegister(), true));
+  streamer.EmitULEB128IntValue(MRI->getDwarfRegNum(MRI->getRARegister(), true));
 
   // Augmentation Data Length (optional)
 
@@ -1315,38 +1290,13 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(MCStreamer &streamer,
 
   // Initial Instructions
 
-  const MCAsmInfo &MAI = context.getAsmInfo();
-  const std::vector<MachineMove> &Moves = MAI.getInitialFrameState();
-  std::vector<MCCFIInstruction> Instructions;
-
-  for (int i = 0, n = Moves.size(); i != n; ++i) {
-    MCSymbol *Label = Moves[i].getLabel();
-    const MachineLocation &Dst =
-      TranslateMachineLocation(MRI, Moves[i].getDestination());
-    const MachineLocation &Src =
-      TranslateMachineLocation(MRI, Moves[i].getSource());
-
-    if (Dst.isReg()) {
-      assert(Dst.getReg() == MachineLocation::VirtualFP);
-      assert(!Src.isReg());
-      MCCFIInstruction Inst =
-        MCCFIInstruction::createDefCfa(Label, Src.getReg(), -Src.getOffset());
-      Instructions.push_back(Inst);
-    } else {
-      assert(Src.isReg());
-      unsigned Reg = Src.getReg();
-      int Offset = Dst.getOffset();
-      MCCFIInstruction Inst =
-        MCCFIInstruction::createOffset(Label, Reg, Offset);
-      Instructions.push_back(Inst);
-    }
-  }
-
+  const MCAsmInfo *MAI = context.getAsmInfo();
+  const std::vector<MCCFIInstruction> &Instructions =
+      MAI->getInitialFrameState();
   EmitCFIInstructions(streamer, Instructions, NULL);
 
   // Padding
-  streamer.EmitValueToAlignment(IsEH
-                                ? 4 : context.getAsmInfo().getPointerSize());
+  streamer.EmitValueToAlignment(IsEH ? 4 : MAI->getPointerSize());
 
   streamer.EmitLabel(sectionEnd);
   return *sectionStart;
@@ -1376,13 +1326,13 @@ MCSymbol *FrameEmitterImpl::EmitFDE(MCStreamer &streamer,
   streamer.EmitLabel(fdeStart);
 
   // CIE Pointer
-  const MCAsmInfo &asmInfo = context.getAsmInfo();
+  const MCAsmInfo *asmInfo = context.getAsmInfo();
   if (IsEH) {
     const MCExpr *offset = MakeStartMinusEndExpr(streamer, cieStart, *fdeStart,
                                                  0);
     if (verboseAsm) streamer.AddComment("FDE CIE Offset");
     streamer.EmitAbsValue(offset, 4);
-  } else if (!asmInfo.doesDwarfUseRelocationsAcrossSections()) {
+  } else if (!asmInfo->doesDwarfUseRelocationsAcrossSections()) {
     const MCExpr *offset = MakeStartMinusEndExpr(streamer, *SectionStart,
                                                  cieStart, 0);
     streamer.EmitAbsValue(offset, 4);
@@ -1469,36 +1419,33 @@ namespace llvm {
   };
 }
 
-void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer,
-                               bool UsingCFI,
-                               bool IsEH) {
+void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer, MCAsmBackend *MAB,
+                               bool UsingCFI, bool IsEH) {
+  Streamer.generateCompactUnwindEncodings(MAB);
+
   MCContext &Context = Streamer.getContext();
-  MCObjectFileInfo *MOFI =
-    const_cast<MCObjectFileInfo*>(Context.getObjectFileInfo());
+  const MCObjectFileInfo *MOFI = Context.getObjectFileInfo();
   FrameEmitterImpl Emitter(UsingCFI, IsEH);
   ArrayRef<MCDwarfFrameInfo> FrameArray = Streamer.getFrameInfos();
 
   // Emit the compact unwind info if available.
   if (IsEH && MOFI->getCompactUnwindSection()) {
-    unsigned NumFrameInfos = Streamer.getNumFrameInfos();
     bool SectionEmitted = false;
-
-    if (NumFrameInfos) {
-      for (unsigned i = 0; i < NumFrameInfos; ++i) {
-        const MCDwarfFrameInfo &Frame = Streamer.getFrameInfo(i);
-        if (Frame.CompactUnwindEncoding == 0) continue;
-        if (!SectionEmitted) {
-          Streamer.SwitchSection(MOFI->getCompactUnwindSection());
-          Streamer.EmitValueToAlignment(Context.getAsmInfo().getPointerSize());
-          SectionEmitted = true;
-        }
-        Emitter.EmitCompactUnwind(Streamer, Frame);
+    for (unsigned i = 0, n = FrameArray.size(); i < n; ++i) {
+      const MCDwarfFrameInfo &Frame = FrameArray[i];
+      if (Frame.CompactUnwindEncoding == 0) continue;
+      if (!SectionEmitted) {
+        Streamer.SwitchSection(MOFI->getCompactUnwindSection());
+        Streamer.EmitValueToAlignment(Context.getAsmInfo()->getPointerSize());
+        SectionEmitted = true;
       }
+      Emitter.EmitCompactUnwind(Streamer, Frame);
     }
   }
 
-  const MCSection &Section = IsEH ? *MOFI->getEHFrameSection() :
-                                    *MOFI->getDwarfFrameSection();
+  const MCSection &Section =
+    IsEH ? *const_cast<MCObjectFileInfo*>(MOFI)->getEHFrameSection() :
+           *MOFI->getDwarfFrameSection();
   Streamer.SwitchSection(&Section);
   MCSymbol *SectionStart = Context.CreateTempSymbol();
   Streamer.EmitLabel(SectionStart);
@@ -1525,22 +1472,26 @@ void MCDwarfFrameEmitter::Emit(MCStreamer &Streamer,
       Streamer.EmitLabel(FDEEnd);
   }
 
-  Streamer.EmitValueToAlignment(Context.getAsmInfo().getPointerSize());
+  Streamer.EmitValueToAlignment(Context.getAsmInfo()->getPointerSize());
   if (FDEEnd)
     Streamer.EmitLabel(FDEEnd);
 }
 
 void MCDwarfFrameEmitter::EmitAdvanceLoc(MCStreamer &Streamer,
                                          uint64_t AddrDelta) {
+  MCContext &Context = Streamer.getContext();
   SmallString<256> Tmp;
   raw_svector_ostream OS(Tmp);
-  MCDwarfFrameEmitter::EncodeAdvanceLoc(AddrDelta, OS);
+  MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OS);
   Streamer.EmitBytes(OS.str());
 }
 
-void MCDwarfFrameEmitter::EncodeAdvanceLoc(uint64_t AddrDelta,
+void MCDwarfFrameEmitter::EncodeAdvanceLoc(MCContext &Context,
+                                           uint64_t AddrDelta,
                                            raw_ostream &OS) {
-  // FIXME: Assumes the code alignment factor is 1.
+  // Scale the address delta by the minimum instruction length.
+  AddrDelta = ScaleAddrDelta(Context, AddrDelta);
+
   if (AddrDelta == 0) {
   } else if (isUIntN(6, AddrDelta)) {
     uint8_t Opcode = dwarf::DW_CFA_advance_loc | AddrDelta;
diff --git a/lib/MC/MCELF.cpp b/lib/MC/MCELF.cpp
index 560cdbc..ebb189e 100644
--- a/lib/MC/MCELF.cpp
+++ b/lib/MC/MCELF.cpp
@@ -36,8 +36,8 @@ unsigned MCELF::GetBinding(const MCSymbolData &SD) {
 void MCELF::SetType(MCSymbolData &SD, unsigned Type) {
   assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
          Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
-         Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
-         Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+         Type == ELF::STT_COMMON || Type == ELF::STT_TLS ||
+         Type == ELF::STT_GNU_IFUNC);
 
   uint32_t OtherFlags = SD.getFlags() & ~(0xf << ELF_STT_Shift);
   SD.setFlags(OtherFlags | (Type << ELF_STT_Shift));
@@ -47,8 +47,7 @@ unsigned MCELF::GetType(const MCSymbolData &SD) {
   uint32_t Type = (SD.getFlags() & (0xf << ELF_STT_Shift)) >> ELF_STT_Shift;
   assert(Type == ELF::STT_NOTYPE || Type == ELF::STT_OBJECT ||
          Type == ELF::STT_FUNC || Type == ELF::STT_SECTION ||
-         Type == ELF::STT_FILE || Type == ELF::STT_COMMON ||
-         Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
+         Type == ELF::STT_COMMON || Type == ELF::STT_TLS || Type == ELF::STT_GNU_IFUNC);
   return Type;
 }
 
diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index 4cac84d..0c39e4a 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp
@@ -39,13 +39,23 @@ const MCSymbol *MCELFObjectTargetWriter::undefinedExplicitRelSym(const MCValue &
   return &Symbol.AliasedSymbol();
 }
 
-void MCELFObjectTargetWriter::adjustFixupOffset(const MCFixup &Fixup,
-                                                uint64_t &RelocOffset) {
+// ELF doesn't require relocations to be in any order. We sort by the r_offset,
+// just to match gnu as for easier comparison. The use type and index is an
+// arbitrary way of making the sort deterministic.
+static int cmpRel(const ELFRelocationEntry *AP, const ELFRelocationEntry *BP) {
+  const ELFRelocationEntry &A = *AP;
+  const ELFRelocationEntry &B = *BP;
+  if (A.r_offset != B.r_offset)
+    return B.r_offset - A.r_offset;
+  if (B.Type != A.Type)
+    return A.Type - B.Type;
+  if (B.Index != A.Index)
+    return B.Index - A.Index;
+  llvm_unreachable("ELFRelocs might be unstable!");
 }
 
 void
 MCELFObjectTargetWriter::sortRelocs(const MCAssembler &Asm,
                                     std::vector<ELFRelocationEntry> &Relocs) {
-  // Sort by the r_offset, just like gnu as does.
-  array_pod_sort(Relocs.begin(), Relocs.end());
+  array_pod_sort(Relocs.begin(), Relocs.end(), cmpRel);
 }
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 116f86f..e806cb9 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELF.h"
@@ -96,6 +97,9 @@ void MCELFStreamer::EmitDebugLabel(MCSymbol *Symbol) {
 }
 
 void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  // Let the target do whatever target specific stuff it needs to do.
+  getAssembler().getBackend().handleAssemblerFlag(Flag);
+  // Do any generic stuff we need to do.
   switch (Flag) {
   case MCAF_SyntaxUnified: return; // no-op here.
   case MCAF_Code16: return; // Change parsing mode; no-op here.
@@ -148,8 +152,8 @@ static unsigned CombineSymbolTypes(unsigned T1, unsigned T2) {
   return T2;
 }
 
-void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
-                                          MCSymbolAttr Attribute) {
+bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+                                        MCSymbolAttr Attribute) {
   // Indirect symbols are handled differently, to match how 'as' handles
   // them. This makes writing matching .o files easier.
   if (Attribute == MCSA_IndirectSymbol) {
@@ -159,7 +163,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     ISD.Symbol = Symbol;
     ISD.SectionData = getCurrentSectionData();
     getAssembler().getIndirectSymbols().push_back(ISD);
-    return;
+    return true;
   }
 
   // Adding a symbol attribute always introduces the symbol, note that an
@@ -182,7 +186,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_WeakDefAutoPrivate:
   case MCSA_Invalid:
   case MCSA_IndirectSymbol:
-    llvm_unreachable("Invalid symbol attribute for ELF!");
+    return false;
 
   case MCSA_NoDeadStrip:
   case MCSA_ELF_TypeGnuUniqueObject:
@@ -251,6 +255,8 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     MCELF::SetVisibility(SD, ELF::STV_INTERNAL);
     break;
   }
+
+  return true;
 }
 
 void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -270,7 +276,8 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                                          ELF::SHF_WRITE |
                                                          ELF::SHF_ALLOC,
                                                          SectionKind::getBSS());
-    Symbol->setSection(*Section);
+
+    AssignSection(Symbol, Section);
 
     struct LocalCommon L = {&SD, Size, ByteAlignment};
     LocalCommons.push_back(L);
@@ -296,12 +303,11 @@ void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
   EmitCommonSymbol(Symbol, Size, ByteAlignment);
 }
 
-void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
-                                  unsigned AddrSpace) {
+void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size) {
   if (getCurrentSectionData()->isBundleLocked())
     report_fatal_error("Emitting values inside a locked bundle is forbidden");
   fixSymbolsInTLSFixups(Value);
-  MCObjectStreamer::EmitValueImpl(Value, Size, AddrSpace);
+  MCObjectStreamer::EmitValueImpl(Value, Size);
 }
 
 void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
@@ -314,20 +320,29 @@ void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
                                          ValueSize, MaxBytesToEmit);
 }
 
-
-// Add a symbol for the file name of this module. This is the second
-// entry in the module's symbol table (the first being the null symbol).
+// Add a symbol for the file name of this module. They start after the
+// null symbol and don't count as normal symbol, i.e. a non-STT_FILE symbol
+// with the same name may appear.
 void MCELFStreamer::EmitFileDirective(StringRef Filename) {
-  MCSymbol *Symbol = getAssembler().getContext().GetOrCreateSymbol(Filename);
-  Symbol->setSection(*getCurrentSection().first);
-  Symbol->setAbsolute();
-
-  MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
-
-  SD.setFlags(ELF_STT_File | ELF_STB_Local | ELF_STV_Default);
+  getAssembler().addFileName(Filename);
+}
+
+void MCELFStreamer::EmitIdent(StringRef IdentString) {
+  const MCSection *Comment = getAssembler().getContext().getELFSection(
+      ".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS,
+      SectionKind::getReadOnly(), 1, "");
+  PushSection();
+  SwitchSection(Comment);
+  if (!SeenIdent) {
+    EmitIntValue(0, 1);
+    SeenIdent = true;
+  }
+  EmitBytes(IdentString);
+  EmitIntValue(0, 1);
+  PopSection();
 }
 
-void  MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
+void MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
   switch (expr->getKind()) {
   case MCExpr::Target:
     cast<MCTargetExpr>(expr)->fixELFSymbolsInTLSFixups(getAssembler());
@@ -363,18 +378,41 @@ void  MCELFStreamer::fixSymbolsInTLSFixups(const MCExpr *expr) {
     case MCSymbolRefExpr::VK_Mips_GOTTPREL:
     case MCSymbolRefExpr::VK_Mips_TPREL_HI:
     case MCSymbolRefExpr::VK_Mips_TPREL_LO:
-    case MCSymbolRefExpr::VK_PPC_TPREL16_HA:
-    case MCSymbolRefExpr::VK_PPC_TPREL16_LO:
-    case MCSymbolRefExpr::VK_PPC_DTPREL16_HA:
-    case MCSymbolRefExpr::VK_PPC_DTPREL16_LO:
-    case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA:
-    case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO:
+    case MCSymbolRefExpr::VK_PPC_DTPMOD:
+    case MCSymbolRefExpr::VK_PPC_TPREL:
+    case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+    case MCSymbolRefExpr::VK_PPC_TPREL_HI:
+    case MCSymbolRefExpr::VK_PPC_TPREL_HA:
+    case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
+    case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA:
+    case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST:
+    case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA:
+    case MCSymbolRefExpr::VK_PPC_DTPREL:
+    case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+    case MCSymbolRefExpr::VK_PPC_DTPREL_HI:
+    case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
+    case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
+    case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA:
+    case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST:
+    case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHESTA:
+    case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+    case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+    case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HI:
+    case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA:
+    case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+    case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+    case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HI:
+    case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HA:
     case MCSymbolRefExpr::VK_PPC_TLS:
-    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA:
-    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HI:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA:
     case MCSymbolRefExpr::VK_PPC_TLSGD:
-    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA:
-    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HI:
+    case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA:
     case MCSymbolRefExpr::VK_PPC_TLSLD:
       break;
     }
@@ -503,9 +541,7 @@ void MCELFStreamer::EmitBundleUnlock() {
   SD->setBundleLockState(MCSectionData::NotBundleLocked);
 }
 
-void MCELFStreamer::FinishImpl() {
-  EmitFrames(true);
-
+void MCELFStreamer::Flush() {
   for (std::vector<LocalCommon>::const_iterator i = LocalCommons.begin(),
                                                 e = LocalCommons.end();
        i != e; ++i) {
@@ -526,17 +562,23 @@ void MCELFStreamer::FinishImpl() {
       SectData.setAlignment(ByteAlignment);
   }
 
-  this->MCObjectStreamer::FinishImpl();
+  LocalCommons.clear();
 }
-void MCELFStreamer::EmitTCEntry(const MCSymbol &S) {
-  // Creates a R_PPC64_TOC relocation
-  MCObjectStreamer::EmitSymbolValue(&S, 8);
+
+void MCELFStreamer::FinishImpl() {
+  EmitFrames(NULL, true);
+
+  Flush();
+
+  this->MCObjectStreamer::FinishImpl();
 }
 
-MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
-                                    raw_ostream &OS, MCCodeEmitter *CE,
-                                    bool RelaxAll, bool NoExecStack) {
-  MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
+MCStreamer *llvm::createELFStreamer(MCContext &Context,
+                                    MCTargetStreamer *Streamer,
+                                    MCAsmBackend &MAB, raw_ostream &OS,
+                                    MCCodeEmitter *CE, bool RelaxAll,
+                                    bool NoExecStack) {
+  MCELFStreamer *S = new MCELFStreamer(Context, Streamer, MAB, OS, CE);
   if (RelaxAll)
     S->getAssembler().setRelaxAll(true);
   if (NoExecStack)
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 06bc72f..c777e64 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -42,13 +42,6 @@ void MCExpr::print(raw_ostream &OS) const {
     // Parenthesize names that start with $ so that they don't look like
     // absolute names.
     bool UseParens = Sym.getName()[0] == '$';
-
-    if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_HA16 ||
-        SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_LO16) {
-      OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
-      UseParens = true;
-    }
-
     if (UseParens)
       OS << '(' << Sym << ')';
     else
@@ -65,9 +58,7 @@ void MCExpr::print(raw_ostream &OS) const {
         SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2 ||
         SRE.getKind() == MCSymbolRefExpr::VK_ARM_PREL31)
       OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
-    else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
-             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
-             SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_LO16)
+    else if (SRE.getKind() != MCSymbolRefExpr::VK_None)
       OS << '@' << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
 
     return;
@@ -205,26 +196,56 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_ARM_TARGET1: return "(target1)";
   case VK_ARM_TARGET2: return "(target2)";
   case VK_ARM_PREL31: return "(prel31)";
-  case VK_PPC_TOC: return "tocbase";
-  case VK_PPC_TOC_ENTRY: return "toc";
-  case VK_PPC_DARWIN_HA16: return "ha16";
-  case VK_PPC_DARWIN_LO16: return "lo16";
-  case VK_PPC_GAS_HA16: return "ha";
-  case VK_PPC_GAS_LO16: return "l";
-  case VK_PPC_TPREL16_HA: return "tprel@ha";
-  case VK_PPC_TPREL16_LO: return "tprel@l";
-  case VK_PPC_DTPREL16_HA: return "dtprel@ha";
-  case VK_PPC_DTPREL16_LO: return "dtprel@l";
-  case VK_PPC_TOC16_HA: return "toc@ha";
-  case VK_PPC_TOC16_LO: return "toc@l";
-  case VK_PPC_GOT_TPREL16_HA: return "got@tprel@ha";
-  case VK_PPC_GOT_TPREL16_LO: return "got@tprel@l";
+  case VK_PPC_LO: return "l";
+  case VK_PPC_HI: return "h";
+  case VK_PPC_HA: return "ha";
+  case VK_PPC_HIGHER: return "higher";
+  case VK_PPC_HIGHERA: return "highera";
+  case VK_PPC_HIGHEST: return "highest";
+  case VK_PPC_HIGHESTA: return "highesta";
+  case VK_PPC_GOT_LO: return "got@l";
+  case VK_PPC_GOT_HI: return "got@h";
+  case VK_PPC_GOT_HA: return "got@ha";
+  case VK_PPC_TOCBASE: return "tocbase";
+  case VK_PPC_TOC: return "toc";
+  case VK_PPC_TOC_LO: return "toc@l";
+  case VK_PPC_TOC_HI: return "toc@h";
+  case VK_PPC_TOC_HA: return "toc@ha";
+  case VK_PPC_DTPMOD: return "dtpmod";
+  case VK_PPC_TPREL: return "tprel";
+  case VK_PPC_TPREL_LO: return "tprel@l";
+  case VK_PPC_TPREL_HI: return "tprel@h";
+  case VK_PPC_TPREL_HA: return "tprel@ha";
+  case VK_PPC_TPREL_HIGHER: return "tprel@higher";
+  case VK_PPC_TPREL_HIGHERA: return "tprel@highera";
+  case VK_PPC_TPREL_HIGHEST: return "tprel@highest";
+  case VK_PPC_TPREL_HIGHESTA: return "tprel@highesta";
+  case VK_PPC_DTPREL: return "dtprel";
+  case VK_PPC_DTPREL_LO: return "dtprel@l";
+  case VK_PPC_DTPREL_HI: return "dtprel@h";
+  case VK_PPC_DTPREL_HA: return "dtprel@ha";
+  case VK_PPC_DTPREL_HIGHER: return "dtprel@higher";
+  case VK_PPC_DTPREL_HIGHERA: return "dtprel@highera";
+  case VK_PPC_DTPREL_HIGHEST: return "dtprel@highest";
+  case VK_PPC_DTPREL_HIGHESTA: return "dtprel@highesta";
+  case VK_PPC_GOT_TPREL: return "got@tprel";
+  case VK_PPC_GOT_TPREL_LO: return "got@tprel@l";
+  case VK_PPC_GOT_TPREL_HI: return "got@tprel@h";
+  case VK_PPC_GOT_TPREL_HA: return "got@tprel@ha";
+  case VK_PPC_GOT_DTPREL: return "got@dtprel";
+  case VK_PPC_GOT_DTPREL_LO: return "got@dtprel@l";
+  case VK_PPC_GOT_DTPREL_HI: return "got@dtprel@h";
+  case VK_PPC_GOT_DTPREL_HA: return "got@dtprel@ha";
   case VK_PPC_TLS: return "tls";
-  case VK_PPC_GOT_TLSGD16_HA: return "got@tlsgd@ha";
-  case VK_PPC_GOT_TLSGD16_LO: return "got@tlsgd@l";
-  case VK_PPC_GOT_TLSLD16_HA: return "got@tlsld@ha";
-  case VK_PPC_GOT_TLSLD16_LO: return "got@tlsld@l";
+  case VK_PPC_GOT_TLSGD: return "got@tlsgd";
+  case VK_PPC_GOT_TLSGD_LO: return "got@tlsgd@l";
+  case VK_PPC_GOT_TLSGD_HI: return "got@tlsgd@h";
+  case VK_PPC_GOT_TLSGD_HA: return "got@tlsgd@ha";
   case VK_PPC_TLSGD: return "tlsgd";
+  case VK_PPC_GOT_TLSLD: return "got@tlsld";
+  case VK_PPC_GOT_TLSLD_LO: return "got@tlsld@l";
+  case VK_PPC_GOT_TLSLD_HI: return "got@tlsld@h";
+  case VK_PPC_GOT_TLSLD_HA: return "got@tlsld@ha";
   case VK_PPC_TLSLD: return "tlsld";
   case VK_Mips_GPREL: return "GPREL";
   case VK_Mips_GOT_CALL: return "GOT_CALL";
@@ -290,40 +311,104 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
     .Case("imgrel", VK_COFF_IMGREL32)
     .Case("SECREL32", VK_SECREL)
     .Case("secrel32", VK_SECREL)
-    .Case("HA", VK_PPC_GAS_HA16)
-    .Case("ha", VK_PPC_GAS_HA16)
-    .Case("L", VK_PPC_GAS_LO16)
-    .Case("l", VK_PPC_GAS_LO16)
-    .Case("TOCBASE", VK_PPC_TOC)
-    .Case("tocbase", VK_PPC_TOC)
-    .Case("TOC", VK_PPC_TOC_ENTRY)
-    .Case("toc", VK_PPC_TOC_ENTRY)
-    .Case("TOC@HA", VK_PPC_TOC16_HA)
-    .Case("toc@ha", VK_PPC_TOC16_HA)
-    .Case("TOC@L", VK_PPC_TOC16_LO)
-    .Case("toc@l", VK_PPC_TOC16_LO)
+    .Case("L", VK_PPC_LO)
+    .Case("l", VK_PPC_LO)
+    .Case("H", VK_PPC_HI)
+    .Case("h", VK_PPC_HI)
+    .Case("HA", VK_PPC_HA)
+    .Case("ha", VK_PPC_HA)
+    .Case("HIGHER", VK_PPC_HIGHER)
+    .Case("higher", VK_PPC_HIGHER)
+    .Case("HIGHERA", VK_PPC_HIGHERA)
+    .Case("highera", VK_PPC_HIGHERA)
+    .Case("HIGHEST", VK_PPC_HIGHEST)
+    .Case("highest", VK_PPC_HIGHEST)
+    .Case("HIGHESTA", VK_PPC_HIGHESTA)
+    .Case("highesta", VK_PPC_HIGHESTA)
+    .Case("GOT@L", VK_PPC_GOT_LO)
+    .Case("got@l", VK_PPC_GOT_LO)
+    .Case("GOT@H", VK_PPC_GOT_HI)
+    .Case("got@h", VK_PPC_GOT_HI)
+    .Case("GOT@HA", VK_PPC_GOT_HA)
+    .Case("got@ha", VK_PPC_GOT_HA)
+    .Case("TOCBASE", VK_PPC_TOCBASE)
+    .Case("tocbase", VK_PPC_TOCBASE)
+    .Case("TOC", VK_PPC_TOC)
+    .Case("toc", VK_PPC_TOC)
+    .Case("TOC@L", VK_PPC_TOC_LO)
+    .Case("toc@l", VK_PPC_TOC_LO)
+    .Case("TOC@H", VK_PPC_TOC_HI)
+    .Case("toc@h", VK_PPC_TOC_HI)
+    .Case("TOC@HA", VK_PPC_TOC_HA)
+    .Case("toc@ha", VK_PPC_TOC_HA)
     .Case("TLS", VK_PPC_TLS)
     .Case("tls", VK_PPC_TLS)
-    .Case("TPREL@HA", VK_PPC_TPREL16_HA)
-    .Case("tprel@ha", VK_PPC_TPREL16_HA)
-    .Case("TPREL@L", VK_PPC_TPREL16_LO)
-    .Case("tprel@l", VK_PPC_TPREL16_LO)
-    .Case("DTPREL@HA", VK_PPC_DTPREL16_HA)
-    .Case("dtprel@ha", VK_PPC_DTPREL16_HA)
-    .Case("DTPREL@L", VK_PPC_DTPREL16_LO)
-    .Case("dtprel@l", VK_PPC_DTPREL16_LO)
-    .Case("GOT@TPREL@HA", VK_PPC_GOT_TPREL16_HA)
-    .Case("got@tprel@ha", VK_PPC_GOT_TPREL16_HA)
-    .Case("GOT@TPREL@L", VK_PPC_GOT_TPREL16_LO)
-    .Case("got@tprel@l", VK_PPC_GOT_TPREL16_LO)
-    .Case("GOT@TLSGD@HA", VK_PPC_GOT_TLSGD16_HA)
-    .Case("got@tlsgd@ha", VK_PPC_GOT_TLSGD16_HA)
-    .Case("GOT@TLSGD@L", VK_PPC_GOT_TLSGD16_LO)
-    .Case("got@tlsgd@l", VK_PPC_GOT_TLSGD16_LO)
-    .Case("GOT@TLSLD@HA", VK_PPC_GOT_TLSLD16_HA)
-    .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD16_HA)
-    .Case("GOT@TLSLD@L", VK_PPC_GOT_TLSLD16_LO)
-    .Case("got@tlsld@l", VK_PPC_GOT_TLSLD16_LO)
+    .Case("DTPMOD", VK_PPC_DTPMOD)
+    .Case("dtpmod", VK_PPC_DTPMOD)
+    .Case("TPREL", VK_PPC_TPREL)
+    .Case("tprel", VK_PPC_TPREL)
+    .Case("TPREL@L", VK_PPC_TPREL_LO)
+    .Case("tprel@l", VK_PPC_TPREL_LO)
+    .Case("TPREL@H", VK_PPC_TPREL_HI)
+    .Case("tprel@h", VK_PPC_TPREL_HI)
+    .Case("TPREL@HA", VK_PPC_TPREL_HA)
+    .Case("tprel@ha", VK_PPC_TPREL_HA)
+    .Case("TPREL@HIGHER", VK_PPC_TPREL_HIGHER)
+    .Case("tprel@higher", VK_PPC_TPREL_HIGHER)
+    .Case("TPREL@HIGHERA", VK_PPC_TPREL_HIGHERA)
+    .Case("tprel@highera", VK_PPC_TPREL_HIGHERA)
+    .Case("TPREL@HIGHEST", VK_PPC_TPREL_HIGHEST)
+    .Case("tprel@highest", VK_PPC_TPREL_HIGHEST)
+    .Case("TPREL@HIGHESTA", VK_PPC_TPREL_HIGHESTA)
+    .Case("tprel@highesta", VK_PPC_TPREL_HIGHESTA)
+    .Case("DTPREL", VK_PPC_DTPREL)
+    .Case("dtprel", VK_PPC_DTPREL)
+    .Case("DTPREL@L", VK_PPC_DTPREL_LO)
+    .Case("dtprel@l", VK_PPC_DTPREL_LO)
+    .Case("DTPREL@H", VK_PPC_DTPREL_HI)
+    .Case("dtprel@h", VK_PPC_DTPREL_HI)
+    .Case("DTPREL@HA", VK_PPC_DTPREL_HA)
+    .Case("dtprel@ha", VK_PPC_DTPREL_HA)
+    .Case("DTPREL@HIGHER", VK_PPC_DTPREL_HIGHER)
+    .Case("dtprel@higher", VK_PPC_DTPREL_HIGHER)
+    .Case("DTPREL@HIGHERA", VK_PPC_DTPREL_HIGHERA)
+    .Case("dtprel@highera", VK_PPC_DTPREL_HIGHERA)
+    .Case("DTPREL@HIGHEST", VK_PPC_DTPREL_HIGHEST)
+    .Case("dtprel@highest", VK_PPC_DTPREL_HIGHEST)
+    .Case("DTPREL@HIGHESTA", VK_PPC_DTPREL_HIGHESTA)
+    .Case("dtprel@highesta", VK_PPC_DTPREL_HIGHESTA)
+    .Case("GOT@TPREL", VK_PPC_GOT_TPREL)
+    .Case("got@tprel", VK_PPC_GOT_TPREL)
+    .Case("GOT@TPREL@L", VK_PPC_GOT_TPREL_LO)
+    .Case("got@tprel@l", VK_PPC_GOT_TPREL_LO)
+    .Case("GOT@TPREL@H", VK_PPC_GOT_TPREL_HI)
+    .Case("got@tprel@h", VK_PPC_GOT_TPREL_HI)
+    .Case("GOT@TPREL@HA", VK_PPC_GOT_TPREL_HA)
+    .Case("got@tprel@ha", VK_PPC_GOT_TPREL_HA)
+    .Case("GOT@DTPREL", VK_PPC_GOT_DTPREL)
+    .Case("got@dtprel", VK_PPC_GOT_DTPREL)
+    .Case("GOT@DTPREL@L", VK_PPC_GOT_DTPREL_LO)
+    .Case("got@dtprel@l", VK_PPC_GOT_DTPREL_LO)
+    .Case("GOT@DTPREL@H", VK_PPC_GOT_DTPREL_HI)
+    .Case("got@dtprel@h", VK_PPC_GOT_DTPREL_HI)
+    .Case("GOT@DTPREL@HA", VK_PPC_GOT_DTPREL_HA)
+    .Case("got@dtprel@ha", VK_PPC_GOT_DTPREL_HA)
+    .Case("GOT@TLSGD", VK_PPC_GOT_TLSGD)
+    .Case("got@tlsgd", VK_PPC_GOT_TLSGD)
+    .Case("GOT@TLSGD@L", VK_PPC_GOT_TLSGD_LO)
+    .Case("got@tlsgd@l", VK_PPC_GOT_TLSGD_LO)
+    .Case("GOT@TLSGD@H", VK_PPC_GOT_TLSGD_HI)
+    .Case("got@tlsgd@h", VK_PPC_GOT_TLSGD_HI)
+    .Case("GOT@TLSGD@HA", VK_PPC_GOT_TLSGD_HA)
+    .Case("got@tlsgd@ha", VK_PPC_GOT_TLSGD_HA)
+    .Case("GOT@TLSLD", VK_PPC_GOT_TLSLD)
+    .Case("got@tlsld", VK_PPC_GOT_TLSLD)
+    .Case("GOT@TLSLD@L", VK_PPC_GOT_TLSLD_LO)
+    .Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO)
+    .Case("GOT@TLSLD@H", VK_PPC_GOT_TLSLD_HI)
+    .Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI)
+    .Case("GOT@TLSLD@HA", VK_PPC_GOT_TLSLD_HA)
+    .Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA)
     .Default(VK_Invalid);
 }
 
diff --git a/lib/MC/MCExternalSymbolizer.cpp b/lib/MC/MCExternalSymbolizer.cpp
new file mode 100644
index 0000000..ca368b2
--- /dev/null
+++ b/lib/MC/MCExternalSymbolizer.cpp
@@ -0,0 +1,181 @@
+//===-- lib/MC/MCExternalSymbolizer.cpp - External symbolizer ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCExternalSymbolizer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+
+using namespace llvm;
+
+// This function tries to add a symbolic operand in place of the immediate
+// Value in the MCInst. The immediate Value has had any PC adjustment made by
+// the caller. If the instruction is a branch instruction then IsBranch is true,
+// else false. If the getOpInfo() function was set as part of the
+// setupForSymbolicDisassembly() call then that function is called to get any
+// symbolic information at the Address for this instruction. If that returns
+// non-zero then the symbolic information it returns is used to create an MCExpr
+// and that is added as an operand to the MCInst. If getOpInfo() returns zero
+// and IsBranch is true then a symbol look up for Value is done and if a symbol
+// is found an MCExpr is created with that, else an MCExpr with Value is
+// created. This function returns true if it adds an operand to the MCInst and
+// false otherwise.
+bool MCExternalSymbolizer::tryAddingSymbolicOperand(MCInst &MI,
+                                                    raw_ostream &cStream,
+                                                    int64_t Value,
+                                                    uint64_t Address,
+                                                    bool IsBranch,
+                                                    uint64_t Offset,
+                                                    uint64_t InstSize) {
+  struct LLVMOpInfo1 SymbolicOp;
+  std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+
+  if (!GetOpInfo ||
+      !GetOpInfo(DisInfo, Address, Offset, InstSize, 1, &SymbolicOp)) {
+    // Clear SymbolicOp.Value from above and also all other fields.
+    std::memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+    if (!SymbolLookUp)
+      return false;
+    uint64_t ReferenceType;
+    if (IsBranch)
+       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+    else
+       ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
+    const char *ReferenceName;
+    const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
+                                    &ReferenceName);
+    if (Name) {
+      SymbolicOp.AddSymbol.Name = Name;
+      SymbolicOp.AddSymbol.Present = true;
+    }
+    // For branches always create an MCExpr so it gets printed as hex address.
+    else if (IsBranch) {
+      SymbolicOp.Value = Value;
+    }
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+      cStream << "symbol stub for: " << ReferenceName;
+    else if(ReferenceType == LLVMDisassembler_ReferenceType_Out_Objc_Message)
+      cStream << "Objc message: " << ReferenceName;
+    if (!Name && !IsBranch)
+      return false;
+  }
+
+  const MCExpr *Add = NULL;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      Add = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else {
+      Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Sub = NULL;
+  if (SymbolicOp.SubtractSymbol.Present) {
+      if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else {
+      Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Off = NULL;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, Ctx);
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off != 0)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off != 0)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, Ctx);
+  }
+
+  Expr = RelInfo->createExprForCAPIVariantKind(Expr, SymbolicOp.VariantKind);
+  if (!Expr)
+    return false;
+
+  MI.addOperand(MCOperand::CreateExpr(Expr));
+  return true;
+}
+
+// This function tries to add a comment as to what is being referenced by a load
+// instruction with the base register that is the Pc.  These can often be values
+// in a literal pool near the Address of the instruction. The Address of the
+// instruction and its immediate Value are used as a possible literal pool entry.
+// The SymbolLookUp call back will return the name of a symbol referenced by the
+// literal pool's entry if the referenced address is that of a symbol. Or it
+// will return a pointer to a literal 'C' string if the referenced address of
+// the literal pool's entry is an address into a section with C string literals.
+// Or if the reference is to an Objective-C data structure it will return a
+// specific reference type for it and a string.
+void MCExternalSymbolizer::tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+                                                           int64_t Value,
+                                                           uint64_t Address) {
+  if (SymbolLookUp) {
+    uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
+    const char *ReferenceName;
+    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
+    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+      cStream << "literal pool symbol address: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+      cStream << "literal pool for: \"" << ReferenceName << "\"";
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+      cStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Message)
+      cStream << "Objc message: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+      cStream << "Objc message ref: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+      cStream << "Objc selector ref: " << ReferenceName;
+    else if(ReferenceType ==
+            LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+      cStream << "Objc class ref: " << ReferenceName;
+  }
+}
+
+namespace llvm {
+MCSymbolizer *createMCSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+                                 LLVMSymbolLookupCallback SymbolLookUp,
+                                 void *DisInfo,
+                                 MCContext *Ctx,
+                                 MCRelocationInfo *RelInfo) {
+  assert(Ctx != 0 && "No MCContext given for symbolic disassembly");
+
+  OwningPtr<MCRelocationInfo> RelInfoOwingPtr(RelInfo);
+  return new MCExternalSymbolizer(*Ctx, RelInfoOwingPtr, GetOpInfo,
+                                  SymbolLookUp, DisInfo);
+}
+}
diff --git a/lib/MC/MCFunction.cpp b/lib/MC/MCFunction.cpp
new file mode 100644
index 0000000..767e1e0
--- /dev/null
+++ b/lib/MC/MCFunction.cpp
@@ -0,0 +1,81 @@
+//===-- lib/MC/MCFunction.cpp -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCFunction.h"
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCModule.h"
+#include <algorithm>
+
+using namespace llvm;
+
+// MCFunction
+
+MCFunction::MCFunction(StringRef Name, MCModule *Parent)
+  : Name(Name), ParentModule(Parent)
+{}
+
+MCFunction::~MCFunction() {
+  for (iterator I = begin(), E = end(); I != E; ++I)
+    delete *I;
+}
+
+MCBasicBlock &MCFunction::createBlock(const MCTextAtom &TA) {
+  MCBasicBlock *MCBB = new MCBasicBlock(TA, this);
+  Blocks.push_back(MCBB);
+  return *MCBB;
+}
+
+MCBasicBlock *MCFunction::find(uint64_t StartAddr) {
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
+    if ((*I)->getInsts()->getBeginAddr() == StartAddr)
+      return *I;
+  return 0;
+}
+
+const MCBasicBlock *MCFunction::find(uint64_t StartAddr) const {
+  return const_cast<MCFunction *>(this)->find(StartAddr);
+}
+
+// MCBasicBlock
+
+MCBasicBlock::MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent)
+  : Insts(&Insts), Parent(Parent) {
+  getParent()->getParent()->trackBBForAtom(&Insts, this);
+}
+
+void MCBasicBlock::addSuccessor(const MCBasicBlock *MCBB) {
+  if (!isSuccessor(MCBB))
+    Successors.push_back(MCBB);
+}
+
+bool MCBasicBlock::isSuccessor(const MCBasicBlock *MCBB) const {
+  return std::find(Successors.begin(), Successors.end(),
+                   MCBB) != Successors.end();
+}
+
+void MCBasicBlock::addPredecessor(const MCBasicBlock *MCBB) {
+  if (!isPredecessor(MCBB))
+    Predecessors.push_back(MCBB);
+}
+
+bool MCBasicBlock::isPredecessor(const MCBasicBlock *MCBB) const {
+  return std::find(Predecessors.begin(), Predecessors.end(),
+                   MCBB) != Predecessors.end();
+}
+
+void MCBasicBlock::splitBasicBlock(MCBasicBlock *SplitBB) {
+  assert(Insts->getEndAddr() + 1 == SplitBB->Insts->getBeginAddr() &&
+         "Splitting unrelated basic blocks!");
+  SplitBB->addPredecessor(this);
+  assert(SplitBB->Successors.empty() &&
+         "Split basic block shouldn't already have successors!");
+  SplitBB->Successors = Successors;
+  Successors.clear();
+  addSuccessor(SplitBB);
+}
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 73f30ff..ba71245 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -31,9 +31,13 @@ void MCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
 
 void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) {
   if (!Annot.empty()) {
-    if (CommentStream)
+    if (CommentStream) {
       (*CommentStream) << Annot;
-    else
+      // By definition (see MCInstPrinter.h), CommentStream must end with
+      // a newline after each comment.
+      if (Annot.back() != '\n')
+        (*CommentStream) << '\n';
+    } else
       OS << " " << MAI.getCommentString() << " " << Annot;
   }
 }
@@ -52,10 +56,55 @@ StringRef MCInstPrinter::markup(StringRef a, StringRef b) const {
     return b;
 }
 
-/// Utility function to print immediates in decimal or hex.
-format_object1<int64_t> MCInstPrinter::formatImm(const int64_t Value) const {
-  if (getPrintImmHex())
-    return format("0x%" PRIx64, Value);
-  else
-    return format("%" PRId64, Value);
+// For asm-style hex (e.g. 0ffh) the first digit always has to be a number.
+static bool needsLeadingZero(uint64_t Value)
+{
+  while(Value)
+  {
+    uint64_t digit = (Value >> 60) & 0xf;
+    if (digit != 0)
+      return (digit >= 0xa);
+    Value <<= 4;
+  }
+  return false;
+}
+
+format_object1<int64_t> MCInstPrinter::formatDec(const int64_t Value) const {
+  return format("%" PRId64, Value);
+}
+
+format_object1<int64_t> MCInstPrinter::formatHex(const int64_t Value) const {
+  switch(PrintHexStyle) {
+  case HexStyle::C:
+    if (Value < 0)
+      return format("-0x%" PRIx64, -Value);
+    else
+      return format("0x%" PRIx64, Value);
+  case HexStyle::Asm:
+    if (Value < 0) {
+      if (needsLeadingZero((uint64_t)(-Value)))
+        return format("-0%" PRIx64 "h", -Value);
+      else
+        return format("-%" PRIx64 "h", -Value);
+    } else {
+      if (needsLeadingZero((uint64_t)(Value)))
+        return format("0%" PRIx64 "h", Value);
+      else
+        return format("%" PRIx64 "h", Value);
+    }
+  }
+  llvm_unreachable("unsupported print style");
+}
+
+format_object1<uint64_t> MCInstPrinter::formatHex(const uint64_t Value) const {
+  switch(PrintHexStyle) {
+  case HexStyle::C:
+     return format("0x%" PRIx64, Value);
+  case HexStyle::Asm:
+    if (needsLeadingZero(Value))
+      return format("0%" PRIx64 "h", Value);
+    else
+      return format("%" PRIx64 "h", Value);
+  }
+  llvm_unreachable("unsupported print style");
 }
diff --git a/lib/MC/MCInstrAnalysis.cpp b/lib/MC/MCInstrAnalysis.cpp
index 7736702..2d8336d 100644
--- a/lib/MC/MCInstrAnalysis.cpp
+++ b/lib/MC/MCInstrAnalysis.cpp
@@ -10,12 +10,13 @@
 #include "llvm/MC/MCInstrAnalysis.h"
 using namespace llvm;
 
-uint64_t MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
-                                         uint64_t Size) const {
+bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                                     uint64_t Size, uint64_t &Target) const {
   if (Inst.getNumOperands() == 0 ||
       Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
-    return -1ULL;
+    return false;
 
   int64_t Imm = Inst.getOperand(0).getImm();
-  return Addr+Size+Imm;
+  Target = Addr+Size+Imm;
+  return true;
 }
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index e08b01b..2924dcd 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -1,3 +1,4 @@
+//===-- MCMachOStreamer.cpp - MachO Streamer ------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -36,7 +37,7 @@ private:
 public:
   MCMachOStreamer(MCContext &Context, MCAsmBackend &MAB, raw_ostream &OS,
                   MCCodeEmitter *Emitter)
-      : MCObjectStreamer(SK_MachOStreamer, Context, MAB, OS, Emitter) {}
+      : MCObjectStreamer(Context, 0, MAB, OS, Emitter) {}
 
   /// @name MCStreamer Interface
   /// @{
@@ -51,7 +52,7 @@ public:
   virtual void EmitLinkerOptions(ArrayRef<std::string> Options);
   virtual void EmitDataRegion(MCDataRegionType Kind);
   virtual void EmitThumbFunc(MCSymbol *Func);
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
   virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
   virtual void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                 unsigned ByteAlignment);
@@ -81,16 +82,14 @@ public:
     // FIXME: Just ignore the .file; it isn't important enough to fail the
     // entire assembly.
 
-    //report_fatal_error("unsupported directive: '.file'");
+    // report_fatal_error("unsupported directive: '.file'");
   }
 
-  virtual void FinishImpl();
-
-  /// @}
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_MachOStreamer;
+  virtual void EmitIdent(StringRef IdentString) {
+    llvm_unreachable("macho doesn't support this directive");
   }
+
+  virtual void FinishImpl();
 };
 
 } // end anonymous namespace.
@@ -122,7 +121,7 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
 
   // isSymbolLinkerVisible uses the section.
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
   // We have to create a new fragment if this is an atom defining symbol,
   // fragments cannot span atoms.
   if (getAssembler().isSymbolLinkerVisible(*Symbol))
@@ -217,7 +216,7 @@ void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) {
   SD.setFlags(SD.getFlags() | SF_ThumbFunc);
 }
 
-void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                           MCSymbolAttr Attribute) {
   // Indirect symbols are handled differently, to match how 'as' handles
   // them. This makes writing matching .o files easier.
@@ -228,7 +227,7 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     ISD.Symbol = Symbol;
     ISD.SectionData = getCurrentSectionData();
     getAssembler().getIndirectSymbols().push_back(ISD);
-    return;
+    return true;
   }
 
   // Adding a symbol attribute always introduces the symbol, note that an
@@ -257,7 +256,7 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
   case MCSA_Protected:
   case MCSA_Weak:
   case MCSA_Local:
-    llvm_unreachable("Invalid symbol attribute for Mach-O!");
+    return false;
 
   case MCSA_Global:
     SD.setExternal(true);
@@ -309,6 +308,8 @@ void MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     SD.setFlags(SD.getFlags() | SF_WeakDefinition | SF_WeakReference);
     break;
   }
+
+  return true;
 }
 
 void MCMachOStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -324,6 +325,8 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
   // FIXME: Darwin 'as' does appear to allow redef of a .comm by itself.
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
 
+  AssignSection(Symbol, NULL);
+
   MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
   SD.setExternal(true);
   SD.setCommon(Size, ByteAlignment);
@@ -346,7 +349,8 @@ void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
   if (!Symbol)
     return;
 
-  // FIXME: Assert that this section has the zerofill type.
+  // On darwin all virtual sections have zerofill type.
+  assert(Section->isVirtualSection() && "Section does not have zerofill type!");
 
   assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
 
@@ -359,7 +363,7 @@ void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
   MCFragment *F = new MCFillFragment(0, 0, Size, &SectData);
   SD.setFragment(F);
 
-  Symbol->setSection(*Section);
+  AssignSection(Symbol, Section);
 
   // Update the maximum alignment on the zero fill section if necessary.
   if (ByteAlignment > SectData.getAlignment())
@@ -392,7 +396,7 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst) {
 }
 
 void MCMachOStreamer::FinishImpl() {
-  EmitFrames(true);
+  EmitFrames(&getAssembler().getBackend(), true);
 
   // We have to set the fragment atom associations so we can relax properly for
   // Mach-O.
diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCModule.cpp
index f563160..7e9e18a 100644
--- a/lib/MC/MCModule.cpp
+++ b/lib/MC/MCModule.cpp
@@ -7,39 +7,136 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCAtom.h"
 #include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCFunction.h"
+#include <algorithm>
 
 using namespace llvm;
 
-MCAtom *MCModule::createAtom(MCAtom::AtomType Type,
-                             uint64_t Begin, uint64_t End) {
-  assert(Begin < End && "Creating MCAtom with endpoints reversed?");
+static bool AtomComp(const MCAtom *L, uint64_t Addr) {
+  return L->getEndAddr() < Addr;
+}
+
+static bool AtomCompInv(uint64_t Addr, const MCAtom *R) {
+  return Addr < R->getEndAddr();
+}
+
+void MCModule::map(MCAtom *NewAtom) {
+  uint64_t Begin = NewAtom->Begin;
+
+  assert(Begin <= NewAtom->End && "Creating MCAtom with endpoints reversed?");
 
   // Check for atoms already covering this range.
-  IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Begin);
-  assert((!I.valid() || I.start() < End) && "Offset range already occupied!");
+  AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
+                                            Begin, AtomComp);
+  assert((I == atom_end() || (*I)->getBeginAddr() > NewAtom->End)
+         && "Offset range already occupied!");
 
-  // Create the new atom and add it to our maps.
-  MCAtom *NewAtom = new MCAtom(Type, this, Begin, End);
-  AtomAllocationTracker.insert(NewAtom);
-  OffsetMap.insert(Begin, End, NewAtom);
+  // Insert the new atom to the list.
+  Atoms.insert(I, NewAtom);
+}
+
+MCTextAtom *MCModule::createTextAtom(uint64_t Begin, uint64_t End) {
+  MCTextAtom *NewAtom = new MCTextAtom(this, Begin, End);
+  map(NewAtom);
+  return NewAtom;
+}
+
+MCDataAtom *MCModule::createDataAtom(uint64_t Begin, uint64_t End) {
+  MCDataAtom *NewAtom = new MCDataAtom(this, Begin, End);
+  map(NewAtom);
   return NewAtom;
 }
 
 // remap - Update the interval mapping for an atom.
 void MCModule::remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd) {
   // Find and erase the old mapping.
-  IntervalMap<uint64_t, MCAtom*>::iterator I = OffsetMap.find(Atom->Begin);
-  assert(I.valid() && "Atom offset not found in module!");
+  AtomListTy::iterator I = std::lower_bound(atom_begin(), atom_end(),
+                                            Atom->Begin, AtomComp);
+  assert(I != atom_end() && "Atom offset not found in module!");
   assert(*I == Atom && "Previous atom mapping was invalid!");
-  I.erase();
+  Atoms.erase(I);
+
+  // FIXME: special case NewBegin == Atom->Begin
 
   // Insert the new mapping.
-  OffsetMap.insert(NewBegin, NewEnd, Atom);
+  AtomListTy::iterator NewI = std::lower_bound(atom_begin(), atom_end(),
+                                               NewBegin, AtomComp);
+  assert((NewI == atom_end() || (*NewI)->getBeginAddr() > Atom->End)
+         && "Offset range already occupied!");
+  Atoms.insert(NewI, Atom);
 
   // Update the atom internal bounds.
   Atom->Begin = NewBegin;
   Atom->End = NewEnd;
 }
 
+const MCAtom *MCModule::findAtomContaining(uint64_t Addr) const {
+  AtomListTy::const_iterator I = std::lower_bound(atom_begin(), atom_end(),
+                                                  Addr, AtomComp);
+  if (I != atom_end() && (*I)->getBeginAddr() <= Addr)
+    return *I;
+  return 0;
+}
+
+MCAtom *MCModule::findAtomContaining(uint64_t Addr) {
+  return const_cast<MCAtom*>(
+    const_cast<const MCModule *>(this)->findAtomContaining(Addr));
+}
+
+const MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) const {
+  AtomListTy::const_iterator I = std::upper_bound(atom_begin(), atom_end(),
+                                                  Addr, AtomCompInv);
+  if (I != atom_end())
+    return *I;
+  return 0;
+}
+
+MCAtom *MCModule::findFirstAtomAfter(uint64_t Addr) {
+  return const_cast<MCAtom*>(
+    const_cast<const MCModule *>(this)->findFirstAtomAfter(Addr));
+}
+
+MCFunction *MCModule::createFunction(StringRef Name) {
+  Functions.push_back(new MCFunction(Name, this));
+  return Functions.back();
+}
+
+static bool CompBBToAtom(MCBasicBlock *BB, const MCTextAtom *Atom) {
+  return BB->getInsts() < Atom;
+}
+
+void MCModule::splitBasicBlocksForAtom(const MCTextAtom *TA,
+                                       const MCTextAtom *NewTA) {
+  BBsByAtomTy::iterator
+    I = std::lower_bound(BBsByAtom.begin(), BBsByAtom.end(),
+                         TA, CompBBToAtom);
+  for (; I != BBsByAtom.end() && (*I)->getInsts() == TA; ++I) {
+    MCBasicBlock *BB = *I;
+    MCBasicBlock *NewBB = &BB->getParent()->createBlock(*NewTA);
+    BB->splitBasicBlock(NewBB);
+  }
+}
+
+void MCModule::trackBBForAtom(const MCTextAtom *Atom, MCBasicBlock *BB) {
+  assert(Atom == BB->getInsts() && "Text atom doesn't back the basic block!");
+  BBsByAtomTy::iterator I = std::lower_bound(BBsByAtom.begin(),
+                                             BBsByAtom.end(),
+                                             Atom, CompBBToAtom);
+  for (; I != BBsByAtom.end() && (*I)->getInsts() == Atom; ++I)
+    if (*I == BB)
+      return;
+  BBsByAtom.insert(I, BB);
+}
+
+MCModule::~MCModule() {
+  for (AtomListTy::iterator AI = atom_begin(),
+                            AE = atom_end();
+                            AI != AE; ++AI)
+    delete *AI;
+  for (FunctionListTy::iterator FI = func_begin(),
+                                FE = func_end();
+                                FI != FE; ++FI)
+    delete *FI;
+}
diff --git a/lib/MC/MCModuleYAML.cpp b/lib/MC/MCModuleYAML.cpp
new file mode 100644
index 0000000..e2de578
--- /dev/null
+++ b/lib/MC/MCModuleYAML.cpp
@@ -0,0 +1,461 @@
+//===- MCModuleYAML.cpp - MCModule YAMLIO implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of MCModule.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCModuleYAML.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCFunction.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Object/YAML.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <vector>
+
+namespace llvm {
+
+namespace {
+
+// This class is used to map opcode and register names to enum values.
+//
+// There are at least 3 obvious ways to do this:
+// 1- Generate an MII/MRI method using a tablegen StringMatcher
+// 2- Write an MII/MRI method using std::lower_bound and the assumption that
+//    the enums are sorted (starting at a fixed value).
+// 3- Do the matching manually as is done here.
+//
+// Why 3?
+// 1- A StringMatcher function for thousands of entries would incur
+//    a non-negligible binary size overhead.
+// 2- The lower_bound comparators would be somewhat involved and aren't
+//    obviously reusable (see LessRecordRegister in llvm/TableGen/Record.h)
+// 3- This isn't actually something useful outside tests (but the same argument
+//    can be made against having {MII,MRI}::getName).
+//
+// If this becomes useful outside this specific situation, feel free to do
+// the Right Thing (tm) and move the functionality to MII/MRI.
+//
+class InstrRegInfoHolder {
+  typedef StringMap<unsigned, BumpPtrAllocator> EnumValByNameTy;
+  EnumValByNameTy InstEnumValueByName;
+  EnumValByNameTy RegEnumValueByName;
+
+public:
+  const MCInstrInfo &MII;
+  const MCRegisterInfo &MRI;
+  InstrRegInfoHolder(const MCInstrInfo &MII, const MCRegisterInfo &MRI)
+      : InstEnumValueByName(NextPowerOf2(MII.getNumOpcodes())),
+        RegEnumValueByName(NextPowerOf2(MRI.getNumRegs())), MII(MII), MRI(MRI) {
+    for (int i = 0, e = MII.getNumOpcodes(); i != e; ++i)
+      InstEnumValueByName[MII.getName(i)] = i;
+    for (int i = 0, e = MRI.getNumRegs(); i != e; ++i)
+      RegEnumValueByName[MRI.getName(i)] = i;
+  }
+
+  bool matchRegister(StringRef Name, unsigned &Reg) {
+    EnumValByNameTy::const_iterator It = RegEnumValueByName.find(Name);
+    if (It == RegEnumValueByName.end())
+      return false;
+    Reg = It->getValue();
+    return true;
+  }
+  bool matchOpcode(StringRef Name, unsigned &Opc) {
+    EnumValByNameTy::const_iterator It = InstEnumValueByName.find(Name);
+    if (It == InstEnumValueByName.end())
+      return false;
+    Opc = It->getValue();
+    return true;
+  }
+};
+
+} // end unnamed namespace
+
+namespace MCModuleYAML {
+
+LLVM_YAML_STRONG_TYPEDEF(unsigned, OpcodeEnum)
+
+struct Operand {
+  MCOperand MCOp;
+};
+
+struct Inst {
+  OpcodeEnum Opcode;
+  std::vector<Operand> Operands;
+  uint64_t Size;
+};
+
+struct Atom {
+  MCAtom::AtomKind Type;
+  yaml::Hex64 StartAddress;
+  uint64_t Size;
+
+  std::vector<Inst> Insts;
+  object::yaml::BinaryRef Data;
+};
+
+struct BasicBlock {
+  yaml::Hex64 Address;
+  std::vector<yaml::Hex64> Preds;
+  std::vector<yaml::Hex64> Succs;
+};
+
+struct Function {
+  StringRef Name;
+  std::vector<BasicBlock> BasicBlocks;
+};
+
+struct Module {
+  std::vector<Atom> Atoms;
+  std::vector<Function> Functions;
+};
+
+} // end namespace MCModuleYAML
+} // end namespace llvm
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::yaml::Hex64)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::MCModuleYAML::Operand)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Inst)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Atom)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::BasicBlock)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MCModuleYAML::Function)
+
+namespace llvm {
+
+namespace yaml {
+
+template <> struct ScalarEnumerationTraits<MCAtom::AtomKind> {
+  static void enumeration(IO &IO, MCAtom::AtomKind &Kind);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Atom> {
+  static void mapping(IO &IO, MCModuleYAML::Atom &A);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Inst> {
+  static void mapping(IO &IO, MCModuleYAML::Inst &I);
+};
+
+template <> struct MappingTraits<MCModuleYAML::BasicBlock> {
+  static void mapping(IO &IO, MCModuleYAML::BasicBlock &BB);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Function> {
+  static void mapping(IO &IO, MCModuleYAML::Function &Fn);
+};
+
+template <> struct MappingTraits<MCModuleYAML::Module> {
+  static void mapping(IO &IO, MCModuleYAML::Module &M);
+};
+
+template <> struct ScalarTraits<MCModuleYAML::Operand> {
+  static void output(const MCModuleYAML::Operand &, void *,
+                     llvm::raw_ostream &);
+  static StringRef input(StringRef, void *, MCModuleYAML::Operand &);
+};
+
+template <> struct ScalarTraits<MCModuleYAML::OpcodeEnum> {
+  static void output(const MCModuleYAML::OpcodeEnum &, void *,
+                     llvm::raw_ostream &);
+  static StringRef input(StringRef, void *, MCModuleYAML::OpcodeEnum &);
+};
+
+void ScalarEnumerationTraits<MCAtom::AtomKind>::enumeration(
+    IO &IO, MCAtom::AtomKind &Value) {
+  IO.enumCase(Value, "Text", MCAtom::TextAtom);
+  IO.enumCase(Value, "Data", MCAtom::DataAtom);
+}
+
+void MappingTraits<MCModuleYAML::Atom>::mapping(IO &IO, MCModuleYAML::Atom &A) {
+  IO.mapRequired("StartAddress", A.StartAddress);
+  IO.mapRequired("Size", A.Size);
+  IO.mapRequired("Type", A.Type);
+  if (A.Type == MCAtom::TextAtom)
+    IO.mapRequired("Content", A.Insts);
+  else if (A.Type == MCAtom::DataAtom)
+    IO.mapRequired("Content", A.Data);
+}
+
+void MappingTraits<MCModuleYAML::Inst>::mapping(IO &IO, MCModuleYAML::Inst &I) {
+  IO.mapRequired("Inst", I.Opcode);
+  IO.mapRequired("Size", I.Size);
+  IO.mapRequired("Ops", I.Operands);
+}
+
+void
+MappingTraits<MCModuleYAML::BasicBlock>::mapping(IO &IO,
+                                                 MCModuleYAML::BasicBlock &BB) {
+  IO.mapRequired("Address", BB.Address);
+  IO.mapRequired("Preds", BB.Preds);
+  IO.mapRequired("Succs", BB.Succs);
+}
+
+void MappingTraits<MCModuleYAML::Function>::mapping(IO &IO,
+                                                    MCModuleYAML::Function &F) {
+  IO.mapRequired("Name", F.Name);
+  IO.mapRequired("BasicBlocks", F.BasicBlocks);
+}
+
+void MappingTraits<MCModuleYAML::Module>::mapping(IO &IO,
+                                                  MCModuleYAML::Module &M) {
+  IO.mapRequired("Atoms", M.Atoms);
+  IO.mapOptional("Functions", M.Functions);
+}
+
+void
+ScalarTraits<MCModuleYAML::Operand>::output(const MCModuleYAML::Operand &Val,
+                                            void *Ctx, raw_ostream &Out) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+
+  // FIXME: Doesn't support FPImm and expr/inst, but do these make sense?
+  if (Val.MCOp.isImm())
+    Out << "I" << Val.MCOp.getImm();
+  else if (Val.MCOp.isReg())
+    Out << "R" << IRI->MRI.getName(Val.MCOp.getReg());
+  else
+    llvm_unreachable("Trying to output invalid MCOperand!");
+}
+
+StringRef
+ScalarTraits<MCModuleYAML::Operand>::input(StringRef Scalar, void *Ctx,
+                                           MCModuleYAML::Operand &Val) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+  char Type = 0;
+  if (Scalar.size() >= 1)
+    Type = Scalar.front();
+  if (Type != 'R' && Type != 'I')
+    return "Operand must start with 'R' (register) or 'I' (immediate).";
+  if (Type == 'R') {
+    unsigned Reg;
+    if (!IRI->matchRegister(Scalar.substr(1), Reg))
+      return "Invalid register name.";
+    Val.MCOp = MCOperand::CreateReg(Reg);
+  } else if (Type == 'I') {
+    int64_t RIVal;
+    if (Scalar.substr(1).getAsInteger(10, RIVal))
+      return "Invalid immediate value.";
+    Val.MCOp = MCOperand::CreateImm(RIVal);
+  } else {
+    Val.MCOp = MCOperand();
+  }
+  return StringRef();
+}
+
+void ScalarTraits<MCModuleYAML::OpcodeEnum>::output(
+    const MCModuleYAML::OpcodeEnum &Val, void *Ctx, raw_ostream &Out) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+  Out << IRI->MII.getName(Val);
+}
+
+StringRef
+ScalarTraits<MCModuleYAML::OpcodeEnum>::input(StringRef Scalar, void *Ctx,
+                                              MCModuleYAML::OpcodeEnum &Val) {
+  InstrRegInfoHolder *IRI = (InstrRegInfoHolder *)Ctx;
+  unsigned Opc;
+  if (!IRI->matchOpcode(Scalar, Opc))
+    return "Invalid instruction opcode.";
+  Val = Opc;
+  return "";
+}
+
+} // end namespace yaml
+
+namespace {
+
+class MCModule2YAML {
+  const MCModule &MCM;
+  MCModuleYAML::Module YAMLModule;
+  void dumpAtom(const MCAtom *MCA);
+  void dumpFunction(const MCFunction *MCF);
+  void dumpBasicBlock(const MCBasicBlock *MCBB);
+
+public:
+  MCModule2YAML(const MCModule &MCM);
+  MCModuleYAML::Module &getYAMLModule();
+};
+
+class YAML2MCModule {
+  MCModule &MCM;
+
+public:
+  YAML2MCModule(MCModule &MCM);
+  StringRef parse(const MCModuleYAML::Module &YAMLModule);
+};
+
+} // end unnamed namespace
+
+MCModule2YAML::MCModule2YAML(const MCModule &MCM) : MCM(MCM), YAMLModule() {
+  for (MCModule::const_atom_iterator AI = MCM.atom_begin(), AE = MCM.atom_end();
+       AI != AE; ++AI)
+    dumpAtom(*AI);
+  for (MCModule::const_func_iterator FI = MCM.func_begin(), FE = MCM.func_end();
+       FI != FE; ++FI)
+    dumpFunction(*FI);
+}
+
+void MCModule2YAML::dumpAtom(const MCAtom *MCA) {
+  YAMLModule.Atoms.resize(YAMLModule.Atoms.size() + 1);
+  MCModuleYAML::Atom &A = YAMLModule.Atoms.back();
+  A.Type = MCA->getKind();
+  A.StartAddress = MCA->getBeginAddr();
+  A.Size = MCA->getEndAddr() - MCA->getBeginAddr() + 1;
+  if (const MCTextAtom *TA = dyn_cast<MCTextAtom>(MCA)) {
+    const size_t InstCount = TA->size();
+    A.Insts.resize(InstCount);
+    for (size_t i = 0; i != InstCount; ++i) {
+      const MCDecodedInst &MCDI = TA->at(i);
+      A.Insts[i].Opcode = MCDI.Inst.getOpcode();
+      A.Insts[i].Size = MCDI.Size;
+      const unsigned OpCount = MCDI.Inst.getNumOperands();
+      A.Insts[i].Operands.resize(OpCount);
+      for (unsigned oi = 0; oi != OpCount; ++oi)
+        A.Insts[i].Operands[oi].MCOp = MCDI.Inst.getOperand(oi);
+    }
+  } else if (const MCDataAtom *DA = dyn_cast<MCDataAtom>(MCA)) {
+    A.Data = DA->getData();
+  } else {
+    llvm_unreachable("Unknown atom type.");
+  }
+}
+
+void MCModule2YAML::dumpFunction(const MCFunction *MCF) {
+  YAMLModule.Functions.resize(YAMLModule.Functions.size() + 1);
+  MCModuleYAML::Function &F = YAMLModule.Functions.back();
+  F.Name = MCF->getName();
+  for (MCFunction::const_iterator BBI = MCF->begin(), BBE = MCF->end();
+       BBI != BBE; ++BBI) {
+    const MCBasicBlock *MCBB = *BBI;
+    F.BasicBlocks.resize(F.BasicBlocks.size() + 1);
+    MCModuleYAML::BasicBlock &BB = F.BasicBlocks.back();
+    BB.Address = MCBB->getInsts()->getBeginAddr();
+    for (MCBasicBlock::pred_const_iterator PI = MCBB->pred_begin(),
+                                           PE = MCBB->pred_end();
+         PI != PE; ++PI)
+      BB.Preds.push_back((*PI)->getInsts()->getBeginAddr());
+    for (MCBasicBlock::succ_const_iterator SI = MCBB->succ_begin(),
+                                           SE = MCBB->succ_end();
+         SI != SE; ++SI)
+      BB.Succs.push_back((*SI)->getInsts()->getBeginAddr());
+  }
+}
+
+MCModuleYAML::Module &MCModule2YAML::getYAMLModule() { return YAMLModule; }
+
+YAML2MCModule::YAML2MCModule(MCModule &MCM) : MCM(MCM) {}
+
+StringRef YAML2MCModule::parse(const MCModuleYAML::Module &YAMLModule) {
+  typedef std::vector<MCModuleYAML::Atom>::const_iterator AtomIt;
+  typedef std::vector<MCModuleYAML::Inst>::const_iterator InstIt;
+  typedef std::vector<MCModuleYAML::Operand>::const_iterator OpIt;
+
+  typedef DenseMap<uint64_t, MCTextAtom *> AddrToTextAtomTy;
+  AddrToTextAtomTy TAByAddr;
+
+  for (AtomIt AI = YAMLModule.Atoms.begin(), AE = YAMLModule.Atoms.end();
+       AI != AE; ++AI) {
+    uint64_t StartAddress = AI->StartAddress;
+    if (AI->Size == 0)
+      return "Atoms can't be empty!";
+    uint64_t EndAddress = StartAddress + AI->Size - 1;
+    switch (AI->Type) {
+    case MCAtom::TextAtom: {
+      MCTextAtom *TA = MCM.createTextAtom(StartAddress, EndAddress);
+      TAByAddr[StartAddress] = TA;
+      for (InstIt II = AI->Insts.begin(), IE = AI->Insts.end(); II != IE;
+           ++II) {
+        MCInst MI;
+        MI.setOpcode(II->Opcode);
+        for (OpIt OI = II->Operands.begin(), OE = II->Operands.end(); OI != OE;
+             ++OI)
+          MI.addOperand(OI->MCOp);
+        TA->addInst(MI, II->Size);
+      }
+      break;
+    }
+    case MCAtom::DataAtom: {
+      MCDataAtom *DA = MCM.createDataAtom(StartAddress, EndAddress);
+      SmallVector<char, 64> Data;
+      raw_svector_ostream OS(Data);
+      AI->Data.writeAsBinary(OS);
+      OS.flush();
+      for (size_t i = 0, e = Data.size(); i != e; ++i)
+        DA->addData((uint8_t)Data[i]);
+      break;
+    }
+    }
+  }
+
+  typedef std::vector<MCModuleYAML::Function>::const_iterator FuncIt;
+  typedef std::vector<MCModuleYAML::BasicBlock>::const_iterator BBIt;
+  typedef std::vector<yaml::Hex64>::const_iterator AddrIt;
+  for (FuncIt FI = YAMLModule.Functions.begin(),
+              FE = YAMLModule.Functions.end();
+       FI != FE; ++FI) {
+    MCFunction *MCFN = MCM.createFunction(FI->Name);
+    for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
+         BBI != BBE; ++BBI) {
+      AddrToTextAtomTy::const_iterator It = TAByAddr.find(BBI->Address);
+      if (It == TAByAddr.end())
+        return "Basic block start address doesn't match any text atom!";
+      MCFN->createBlock(*It->second);
+    }
+    for (BBIt BBI = FI->BasicBlocks.begin(), BBE = FI->BasicBlocks.end();
+         BBI != BBE; ++BBI) {
+      MCBasicBlock *MCBB = MCFN->find(BBI->Address);
+      if (!MCBB)
+        return "Couldn't find matching basic block in function.";
+      for (AddrIt PI = BBI->Preds.begin(), PE = BBI->Preds.end(); PI != PE;
+           ++PI) {
+        MCBasicBlock *Pred = MCFN->find(*PI);
+        if (!Pred)
+          return "Couldn't find predecessor basic block.";
+        MCBB->addPredecessor(Pred);
+      }
+      for (AddrIt SI = BBI->Succs.begin(), SE = BBI->Succs.end(); SI != SE;
+           ++SI) {
+        MCBasicBlock *Succ = MCFN->find(*SI);
+        if (!Succ)
+          return "Couldn't find predecessor basic block.";
+        MCBB->addSuccessor(Succ);
+      }
+    }
+  }
+  return "";
+}
+
+StringRef mcmodule2yaml(raw_ostream &OS, const MCModule &MCM,
+                        const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
+  MCModule2YAML Dumper(MCM);
+  InstrRegInfoHolder IRI(MII, MRI);
+  yaml::Output YOut(OS, (void *)&IRI);
+  YOut << Dumper.getYAMLModule();
+  return "";
+}
+
+StringRef yaml2mcmodule(OwningPtr<MCModule> &MCM, StringRef YamlContent,
+                        const MCInstrInfo &MII, const MCRegisterInfo &MRI) {
+  MCM.reset(new MCModule);
+  YAML2MCModule Parser(*MCM);
+  MCModuleYAML::Module YAMLModule;
+  InstrRegInfoHolder IRI(MII, MRI);
+  yaml::Input YIn(YamlContent, (void *)&IRI);
+  YIn >> YAMLModule;
+  if (error_code ec = YIn.error())
+    return ec.message();
+  StringRef err = Parser.parse(YAMLModule);
+  if (!err.empty())
+    return err;
+  return "";
+}
+
+} // end namespace llvm
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index 659706a..9b9c4aa 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -19,7 +19,7 @@ namespace {
 
   class MCNullStreamer : public MCStreamer {
   public:
-    MCNullStreamer(MCContext &Context) : MCStreamer(SK_NullStreamer, Context) {}
+    MCNullStreamer(MCContext &Context) : MCStreamer(Context, 0) {}
 
     /// @name MCStreamer Interface
     /// @{
@@ -37,7 +37,7 @@ namespace {
     virtual void EmitLabel(MCSymbol *Symbol) {
       assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
       assert(getCurrentSection().first &&"Cannot emit before setting section!");
-      Symbol->setSection(*getCurrentSection().first);
+      AssignSection(Symbol, getCurrentSection().first);
     }
     virtual void EmitDebugLabel(MCSymbol *Symbol) {
       EmitLabel(Symbol);
@@ -52,7 +52,9 @@ namespace {
                                           const MCSymbol *Label,
                                           unsigned PointerSize) {}
 
-    virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute){}
+    virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute){
+      return true;
+    }
 
     virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
 
@@ -71,10 +73,9 @@ namespace {
                               uint64_t Size = 0, unsigned ByteAlignment = 0) {}
     virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
                                 uint64_t Size, unsigned ByteAlignment) {}
-    virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {}
+    virtual void EmitBytes(StringRef Data) {}
 
-    virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                               unsigned AddrSpace) {}
+    virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {}
     virtual void EmitULEB128Value(const MCExpr *Value) {}
     virtual void EmitSLEB128Value(const MCExpr *Value) {}
     virtual void EmitGPRel32Value(const MCExpr *Value) {}
@@ -108,13 +109,6 @@ namespace {
     virtual void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
       RecordProcEnd(Frame);
     }
-
-    /// @}
-
-    static bool classof(const MCStreamer *S) {
-      return S->getKind() == SK_NullStreamer;
-    }
-
   };
 
 }
diff --git a/lib/MC/MCObjectDisassembler.cpp b/lib/MC/MCObjectDisassembler.cpp
new file mode 100644
index 0000000..16a110f0
--- /dev/null
+++ b/lib/MC/MCObjectDisassembler.cpp
@@ -0,0 +1,584 @@
+//===- lib/MC/MCObjectDisassembler.cpp ------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCObjectDisassembler.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAtom.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFunction.h"
+#include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCModule.h"
+#include "llvm/MC/MCObjectSymbolizer.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/StringRefMemoryObject.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+using namespace llvm;
+using namespace object;
+
+MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,
+                                           const MCDisassembler &Dis,
+                                           const MCInstrAnalysis &MIA)
+    : Obj(Obj), Dis(Dis), MIA(MIA), MOS(0) {}
+
+uint64_t MCObjectDisassembler::getEntrypoint() {
+  error_code ec;
+  for (symbol_iterator SI = Obj.begin_symbols(), SE = Obj.end_symbols();
+       SI != SE; SI.increment(ec)) {
+    if (ec)
+      break;
+    StringRef Name;
+    SI->getName(Name);
+    if (Name == "main" || Name == "_main") {
+      uint64_t Entrypoint;
+      SI->getAddress(Entrypoint);
+      return getEffectiveLoadAddr(Entrypoint);
+    }
+  }
+  return 0;
+}
+
+ArrayRef<uint64_t> MCObjectDisassembler::getStaticInitFunctions() {
+  return ArrayRef<uint64_t>();
+}
+
+ArrayRef<uint64_t> MCObjectDisassembler::getStaticExitFunctions() {
+  return ArrayRef<uint64_t>();
+}
+
+MemoryObject *MCObjectDisassembler::getRegionFor(uint64_t Addr) {
+  // FIXME: Keep track of object sections.
+  return FallbackRegion.get();
+}
+
+uint64_t MCObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
+  return Addr;
+}
+
+uint64_t MCObjectDisassembler::getOriginalLoadAddr(uint64_t Addr) {
+  return Addr;
+}
+
+MCModule *MCObjectDisassembler::buildEmptyModule() {
+  MCModule *Module = new MCModule;
+  Module->Entrypoint = getEntrypoint();
+  return Module;
+}
+
+MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
+  MCModule *Module = buildEmptyModule();
+
+  buildSectionAtoms(Module);
+  if (withCFG)
+    buildCFG(Module);
+  return Module;
+}
+
+void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
+  error_code ec;
+  for (section_iterator SI = Obj.begin_sections(),
+                        SE = Obj.end_sections();
+                        SI != SE;
+                        SI.increment(ec)) {
+    if (ec) break;
+
+    bool isText; SI->isText(isText);
+    bool isData; SI->isData(isData);
+    if (!isData && !isText)
+      continue;
+
+    uint64_t StartAddr; SI->getAddress(StartAddr);
+    uint64_t SecSize; SI->getSize(SecSize);
+    if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize)
+      continue;
+    StartAddr = getEffectiveLoadAddr(StartAddr);
+
+    StringRef Contents; SI->getContents(Contents);
+    StringRefMemoryObject memoryObject(Contents, StartAddr);
+
+    // We don't care about things like non-file-backed sections yet.
+    if (Contents.size() != SecSize || !SecSize)
+      continue;
+    uint64_t EndAddr = StartAddr + SecSize - 1;
+
+    StringRef SecName; SI->getName(SecName);
+
+    if (isText) {
+      MCTextAtom *Text = 0;
+      MCDataAtom *InvalidData = 0;
+
+      uint64_t InstSize;
+      for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {
+        const uint64_t CurAddr = StartAddr + Index;
+        MCInst Inst;
+        if (Dis.getInstruction(Inst, InstSize, memoryObject, CurAddr, nulls(),
+                               nulls())) {
+          if (!Text) {
+            Text = Module->createTextAtom(CurAddr, CurAddr);
+            Text->setName(SecName);
+          }
+          Text->addInst(Inst, InstSize);
+          InvalidData = 0;
+        } else {
+          assert(InstSize && "getInstruction() consumed no bytes");
+          if (!InvalidData) {
+            Text = 0;
+            InvalidData = Module->createDataAtom(CurAddr, CurAddr+InstSize - 1);
+          }
+          for (uint64_t I = 0; I < InstSize; ++I)
+            InvalidData->addData(Contents[Index+I]);
+        }
+      }
+    } else {
+      MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);
+      Data->setName(SecName);
+      for (uint64_t Index = 0; Index < SecSize; ++Index)
+        Data->addData(Contents[Index]);
+    }
+  }
+}
+
+namespace {
+  struct BBInfo;
+  typedef SmallPtrSet<BBInfo*, 2> BBInfoSetTy;
+
+  struct BBInfo {
+    MCTextAtom *Atom;
+    MCBasicBlock *BB;
+    BBInfoSetTy Succs;
+    BBInfoSetTy Preds;
+    MCObjectDisassembler::AddressSetTy SuccAddrs;
+
+    BBInfo() : Atom(0), BB(0) {}
+
+    void addSucc(BBInfo &Succ) {
+      Succs.insert(&Succ);
+      Succ.Preds.insert(this);
+    }
+  };
+}
+
+static void RemoveDupsFromAddressVector(MCObjectDisassembler::AddressSetTy &V) {
+  std::sort(V.begin(), V.end());
+  V.erase(std::unique(V.begin(), V.end()), V.end());
+}
+
+void MCObjectDisassembler::buildCFG(MCModule *Module) {
+  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
+  BBInfoByAddrTy BBInfos;
+  AddressSetTy Splits;
+  AddressSetTy Calls;
+
+  error_code ec;
+  for (symbol_iterator SI = Obj.begin_symbols(), SE = Obj.end_symbols();
+       SI != SE; SI.increment(ec)) {
+    if (ec)
+      break;
+    SymbolRef::Type SymType;
+    SI->getType(SymType);
+    if (SymType == SymbolRef::ST_Function) {
+      uint64_t SymAddr;
+      SI->getAddress(SymAddr);
+      SymAddr = getEffectiveLoadAddr(SymAddr);
+      Calls.push_back(SymAddr);
+      Splits.push_back(SymAddr);
+    }
+  }
+
+  assert(Module->func_begin() == Module->func_end()
+         && "Module already has a CFG!");
+
+  // First, determine the basic block boundaries and call targets.
+  for (MCModule::atom_iterator AI = Module->atom_begin(),
+                               AE = Module->atom_end();
+       AI != AE; ++AI) {
+    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
+    if (!TA) continue;
+    Calls.push_back(TA->getBeginAddr());
+    BBInfos[TA->getBeginAddr()].Atom = TA;
+    for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
+         II != IE; ++II) {
+      if (MIA.isTerminator(II->Inst))
+        Splits.push_back(II->Address + II->Size);
+      uint64_t Target;
+      if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
+        if (MIA.isCall(II->Inst))
+          Calls.push_back(Target);
+        Splits.push_back(Target);
+      }
+    }
+  }
+
+  RemoveDupsFromAddressVector(Splits);
+  RemoveDupsFromAddressVector(Calls);
+
+  // Split text atoms into basic block atoms.
+  for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
+       SI != SE; ++SI) {
+    MCAtom *A = Module->findAtomContaining(*SI);
+    if (!A) continue;
+    MCTextAtom *TA = cast<MCTextAtom>(A);
+    if (TA->getBeginAddr() == *SI)
+      continue;
+    MCTextAtom *NewAtom = TA->split(*SI);
+    BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom;
+    StringRef BBName = TA->getName();
+    BBName = BBName.substr(0, BBName.find_last_of(':'));
+    NewAtom->setName((BBName + ":" + utohexstr(*SI)).str());
+  }
+
+  // Compute succs/preds.
+  for (MCModule::atom_iterator AI = Module->atom_begin(),
+                               AE = Module->atom_end();
+                               AI != AE; ++AI) {
+    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
+    if (!TA) continue;
+    BBInfo &CurBB = BBInfos[TA->getBeginAddr()];
+    const MCDecodedInst &LI = TA->back();
+    if (MIA.isBranch(LI.Inst)) {
+      uint64_t Target;
+      if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target))
+        CurBB.addSucc(BBInfos[Target]);
+      if (MIA.isConditionalBranch(LI.Inst))
+        CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
+    } else if (!MIA.isTerminator(LI.Inst))
+      CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
+  }
+
+
+  // Create functions and basic blocks.
+  for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end();
+       CI != CE; ++CI) {
+    BBInfo &BBI = BBInfos[*CI];
+    if (!BBI.Atom) continue;
+
+    MCFunction &MCFN = *Module->createFunction(BBI.Atom->getName());
+
+    // Create MCBBs.
+    SmallSetVector<BBInfo*, 16> Worklist;
+    Worklist.insert(&BBI);
+    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+      BBInfo *BBI = Worklist[wi];
+      if (!BBI->Atom)
+        continue;
+      BBI->BB = &MCFN.createBlock(*BBI->Atom);
+      // Add all predecessors and successors to the worklist.
+      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
+                                 SI != SE; ++SI)
+        Worklist.insert(*SI);
+      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
+                                 PI != PE; ++PI)
+        Worklist.insert(*PI);
+    }
+
+    // Set preds/succs.
+    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+      BBInfo *BBI = Worklist[wi];
+      MCBasicBlock *MCBB = BBI->BB;
+      if (!MCBB)
+        continue;
+      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
+           SI != SE; ++SI)
+        if ((*SI)->BB)
+          MCBB->addSuccessor((*SI)->BB);
+      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
+           PI != PE; ++PI)
+        if ((*PI)->BB)
+          MCBB->addPredecessor((*PI)->BB);
+    }
+  }
+}
+
+// Basic idea of the disassembly + discovery:
+//
+// start with the wanted address, insert it in the worklist
+// while worklist not empty, take next address in the worklist:
+// - check if atom exists there
+//   - if middle of atom:
+//     - split basic blocks referencing the atom
+//     - look for an already encountered BBInfo (using a map<atom, bbinfo>)
+//       - if there is, split it (new one, fallthrough, move succs, etc..)
+//   - if start of atom: nothing else to do
+//   - if no atom: create new atom and new bbinfo
+// - look at the last instruction in the atom, add succs to worklist
+// for all elements in the worklist:
+// - create basic block, update preds/succs, etc..
+//
+MCBasicBlock *MCObjectDisassembler::getBBAt(MCModule *Module, MCFunction *MCFN,
+                                            uint64_t BBBeginAddr,
+                                            AddressSetTy &CallTargets,
+                                            AddressSetTy &TailCallTargets) {
+  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
+  typedef SmallSetVector<uint64_t, 16> AddrWorklistTy;
+  BBInfoByAddrTy BBInfos;
+  AddrWorklistTy Worklist;
+
+  Worklist.insert(BBBeginAddr);
+  for (size_t wi = 0; wi < Worklist.size(); ++wi) {
+    const uint64_t BeginAddr = Worklist[wi];
+    BBInfo *BBI = &BBInfos[BeginAddr];
+
+    MCTextAtom *&TA = BBI->Atom;
+    assert(!TA && "Discovered basic block already has an associated atom!");
+
+    // Look for an atom at BeginAddr.
+    if (MCAtom *A = Module->findAtomContaining(BeginAddr)) {
+      // FIXME: We don't care about mixed atoms, see above.
+      TA = cast<MCTextAtom>(A);
+
+      // The found atom doesn't begin at BeginAddr, we have to split it.
+      if (TA->getBeginAddr() != BeginAddr) {
+        // FIXME: Handle overlapping atoms: middle-starting instructions, etc..
+        MCTextAtom *NewTA = TA->split(BeginAddr);
+
+        // Look for an already encountered basic block that needs splitting
+        BBInfoByAddrTy::iterator It = BBInfos.find(TA->getBeginAddr());
+        if (It != BBInfos.end() && It->second.Atom) {
+          BBI->SuccAddrs = It->second.SuccAddrs;
+          It->second.SuccAddrs.clear();
+          It->second.SuccAddrs.push_back(BeginAddr);
+        }
+        TA = NewTA;
+      }
+      BBI->Atom = TA;
+    } else {
+      // If we didn't find an atom, then we have to disassemble to create one!
+
+      MemoryObject *Region = getRegionFor(BeginAddr);
+      if (!Region)
+        llvm_unreachable(("Couldn't find suitable region for disassembly at " +
+                          utostr(BeginAddr)).c_str());
+
+      uint64_t InstSize;
+      uint64_t EndAddr = Region->getBase() + Region->getExtent();
+
+      // We want to stop before the next atom and have a fallthrough to it.
+      if (MCTextAtom *NextAtom =
+              cast_or_null<MCTextAtom>(Module->findFirstAtomAfter(BeginAddr)))
+        EndAddr = std::min(EndAddr, NextAtom->getBeginAddr());
+
+      for (uint64_t Addr = BeginAddr; Addr < EndAddr; Addr += InstSize) {
+        MCInst Inst;
+        if (Dis.getInstruction(Inst, InstSize, *Region, Addr, nulls(),
+                               nulls())) {
+          if (!TA)
+            TA = Module->createTextAtom(Addr, Addr);
+          TA->addInst(Inst, InstSize);
+        } else {
+          // We don't care about splitting mixed atoms either.
+          llvm_unreachable("Couldn't disassemble instruction in atom.");
+        }
+
+        uint64_t BranchTarget;
+        if (MIA.evaluateBranch(Inst, Addr, InstSize, BranchTarget)) {
+          if (MIA.isCall(Inst))
+            CallTargets.push_back(BranchTarget);
+        }
+
+        if (MIA.isTerminator(Inst))
+          break;
+      }
+      BBI->Atom = TA;
+    }
+
+    assert(TA && "Couldn't disassemble atom, none was created!");
+    assert(TA->begin() != TA->end() && "Empty atom!");
+
+    MemoryObject *Region = getRegionFor(TA->getBeginAddr());
+    assert(Region && "Couldn't find region for already disassembled code!");
+    uint64_t EndRegion = Region->getBase() + Region->getExtent();
+
+    // Now we have a basic block atom, add successors.
+    // Add the fallthrough block.
+    if ((MIA.isConditionalBranch(TA->back().Inst) ||
+         !MIA.isTerminator(TA->back().Inst)) &&
+        (TA->getEndAddr() + 1 < EndRegion)) {
+      BBI->SuccAddrs.push_back(TA->getEndAddr() + 1);
+      Worklist.insert(TA->getEndAddr() + 1);
+    }
+
+    // If the terminator is a branch, add the target block.
+    if (MIA.isBranch(TA->back().Inst)) {
+      uint64_t BranchTarget;
+      if (MIA.evaluateBranch(TA->back().Inst, TA->back().Address,
+                             TA->back().Size, BranchTarget)) {
+        StringRef ExtFnName;
+        if (MOS)
+          ExtFnName =
+              MOS->findExternalFunctionAt(getOriginalLoadAddr(BranchTarget));
+        if (!ExtFnName.empty()) {
+          TailCallTargets.push_back(BranchTarget);
+          CallTargets.push_back(BranchTarget);
+        } else {
+          BBI->SuccAddrs.push_back(BranchTarget);
+          Worklist.insert(BranchTarget);
+        }
+      }
+    }
+  }
+
+  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
+    const uint64_t BeginAddr = Worklist[wi];
+    BBInfo *BBI = &BBInfos[BeginAddr];
+
+    assert(BBI->Atom && "Found a basic block without an associated atom!");
+
+    // Look for a basic block at BeginAddr.
+    BBI->BB = MCFN->find(BeginAddr);
+    if (BBI->BB) {
+      // FIXME: check that the succs/preds are the same
+      continue;
+    }
+    // If there was none, we have to create one from the atom.
+    BBI->BB = &MCFN->createBlock(*BBI->Atom);
+  }
+
+  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
+    const uint64_t BeginAddr = Worklist[wi];
+    BBInfo *BBI = &BBInfos[BeginAddr];
+    MCBasicBlock *BB = BBI->BB;
+
+    RemoveDupsFromAddressVector(BBI->SuccAddrs);
+    for (AddressSetTy::const_iterator SI = BBI->SuccAddrs.begin(),
+         SE = BBI->SuccAddrs.end();
+         SE != SE; ++SI) {
+      MCBasicBlock *Succ = BBInfos[*SI].BB;
+      BB->addSuccessor(Succ);
+      Succ->addPredecessor(BB);
+    }
+  }
+
+  assert(BBInfos[Worklist[0]].BB &&
+         "No basic block created at requested address?");
+
+  return BBInfos[Worklist[0]].BB;
+}
+
+MCFunction *
+MCObjectDisassembler::createFunction(MCModule *Module, uint64_t BeginAddr,
+                                     AddressSetTy &CallTargets,
+                                     AddressSetTy &TailCallTargets) {
+  // First, check if this is an external function.
+  StringRef ExtFnName;
+  if (MOS)
+    ExtFnName = MOS->findExternalFunctionAt(getOriginalLoadAddr(BeginAddr));
+  if (!ExtFnName.empty())
+    return Module->createFunction(ExtFnName);
+
+  // If it's not, look for an existing function.
+  for (MCModule::func_iterator FI = Module->func_begin(),
+                               FE = Module->func_end();
+       FI != FE; ++FI) {
+    if ((*FI)->empty())
+      continue;
+    // FIXME: MCModule should provide a findFunctionByAddr()
+    if ((*FI)->getEntryBlock()->getInsts()->getBeginAddr() == BeginAddr)
+      return *FI;
+  }
+
+  // Finally, just create a new one.
+  MCFunction *MCFN = Module->createFunction("");
+  getBBAt(Module, MCFN, BeginAddr, CallTargets, TailCallTargets);
+  return MCFN;
+}
+
+// MachO MCObjectDisassembler implementation.
+
+MCMachOObjectDisassembler::MCMachOObjectDisassembler(
+    const MachOObjectFile &MOOF, const MCDisassembler &Dis,
+    const MCInstrAnalysis &MIA, uint64_t VMAddrSlide,
+    uint64_t HeaderLoadAddress)
+    : MCObjectDisassembler(MOOF, Dis, MIA), MOOF(MOOF),
+      VMAddrSlide(VMAddrSlide), HeaderLoadAddress(HeaderLoadAddress) {
+
+  error_code ec;
+  for (section_iterator SI = MOOF.begin_sections(), SE = MOOF.end_sections();
+       SI != SE; SI.increment(ec)) {
+    if (ec)
+      break;
+    StringRef Name;
+    SI->getName(Name);
+    // FIXME: We should use the S_ section type instead of the name.
+    if (Name == "__mod_init_func") {
+      DEBUG(dbgs() << "Found __mod_init_func section!\n");
+      SI->getContents(ModInitContents);
+    } else if (Name == "__mod_exit_func") {
+      DEBUG(dbgs() << "Found __mod_exit_func section!\n");
+      SI->getContents(ModExitContents);
+    }
+  }
+}
+
+// FIXME: Only do the translations for addresses actually inside the object.
+uint64_t MCMachOObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
+  return Addr + VMAddrSlide;
+}
+
+uint64_t
+MCMachOObjectDisassembler::getOriginalLoadAddr(uint64_t EffectiveAddr) {
+  return EffectiveAddr - VMAddrSlide;
+}
+
+uint64_t MCMachOObjectDisassembler::getEntrypoint() {
+  uint64_t EntryFileOffset = 0;
+
+  // Look for LC_MAIN.
+  {
+    uint32_t LoadCommandCount = MOOF.getHeader().ncmds;
+    MachOObjectFile::LoadCommandInfo Load = MOOF.getFirstLoadCommandInfo();
+    for (unsigned I = 0;; ++I) {
+      if (Load.C.cmd == MachO::LC_MAIN) {
+        EntryFileOffset =
+            ((const MachO::entry_point_command *)Load.Ptr)->entryoff;
+        break;
+      }
+
+      if (I == LoadCommandCount - 1)
+        break;
+      else
+        Load = MOOF.getNextLoadCommandInfo(Load);
+    }
+  }
+
+  // If we didn't find anything, default to the common implementation.
+  // FIXME: Maybe we could also look at LC_UNIXTHREAD and friends?
+  if (EntryFileOffset)
+    return MCObjectDisassembler::getEntrypoint();
+
+  return EntryFileOffset + HeaderLoadAddress;
+}
+
+ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticInitFunctions() {
+  // FIXME: We only handle 64bit mach-o
+  assert(MOOF.is64Bit());
+
+  size_t EntrySize = 8;
+  size_t EntryCount = ModInitContents.size() / EntrySize;
+  return ArrayRef<uint64_t>(
+      reinterpret_cast<const uint64_t *>(ModInitContents.data()), EntryCount);
+}
+
+ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticExitFunctions() {
+  // FIXME: We only handle 64bit mach-o
+  assert(MOOF.is64Bit());
+
+  size_t EntrySize = 8;
+  size_t EntryCount = ModExitContents.size() / EntrySize;
+  return ArrayRef<uint64_t>(
+      reinterpret_cast<const uint64_t *>(ModExitContents.data()), EntryCount);
+}
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 96b62f1..8ef4a0a 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -39,6 +39,9 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
     = Ctx->getMachOSection("__DATA", "__data", 0,
                            SectionKind::getDataRel());
 
+  // BSSSection might not be expected initialized on msvc.
+  BSSSection = 0;
+
   TLSDataSection // .tdata
     = Ctx->getMachOSection("__DATA", "__thread_data",
                            MCSectionMachO::S_THREAD_LOCAL_REGULAR,
@@ -79,7 +82,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
   // to using it in -static mode.
   SixteenByteConstantSection = 0;
   if (RelocM != Reloc::Static &&
-      T.getArch() != Triple::x86_64 && T.getArch() != Triple::ppc64)
+      T.getArch() != Triple::x86_64 && T.getArch() != Triple::ppc64 &&
+      T.getArch() != Triple::ppc64le)
     SixteenByteConstantSection =   // .literal16
       Ctx->getMachOSection("__TEXT", "__literal16",
                            MCSectionMachO::S_16BYTE_LITERALS,
@@ -198,6 +202,14 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
     Ctx->getMachOSection("__DWARF", "__debug_pubtypes",
                          MCSectionMachO::S_ATTR_DEBUG,
                          SectionKind::getMetadata());
+  DwarfGnuPubNamesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_gnu_pubn",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
+  DwarfGnuPubTypesSection =
+    Ctx->getMachOSection("__DWARF", "__debug_gnu_pubt",
+                         MCSectionMachO::S_ATTR_DEBUG,
+                         SectionKind::getMetadata());
   DwarfStrSection =
     Ctx->getMachOSection("__DWARF", "__debug_str",
                          MCSectionMachO::S_ATTR_DEBUG,
@@ -222,6 +234,9 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) {
     Ctx->getMachOSection("__DWARF", "__debug_inlined",
                          MCSectionMachO::S_ATTR_DEBUG,
                          SectionKind::getMetadata());
+  StackMapSection =
+    Ctx->getMachOSection("__LLVM_STACKMAPS", "__llvm_stackmaps", 0,
+                         SectionKind::getMetadata());
 
   TLSExtraDataSection = TLSTLVSection;
 }
@@ -288,7 +303,7 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
       FDEEncoding = dwarf::DW_EH_PE_udata4;
       TTypeEncoding = dwarf::DW_EH_PE_absptr;
     }
-  } else if (T.getArch() == Triple::ppc64) {
+  } else if (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le) {
     PersonalityEncoding = dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
       dwarf::DW_EH_PE_udata8;
     LSDAEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8;
@@ -434,6 +449,12 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
   DwarfPubTypesSection =
     Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0,
                        SectionKind::getMetadata());
+  DwarfGnuPubNamesSection =
+    Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
+  DwarfGnuPubTypesSection =
+    Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0,
+                       SectionKind::getMetadata());
   DwarfStrSection =
     Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
                        ELF::SHF_MERGE | ELF::SHF_STRINGS,
@@ -495,6 +516,12 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
 
 void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
   // COFF
+  BSSSection =
+    Ctx->getCOFFSection(".bss",
+                        COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+                        COFF::IMAGE_SCN_MEM_READ |
+                        COFF::IMAGE_SCN_MEM_WRITE,
+                        SectionKind::getBSS());
   TextSection =
     Ctx->getCOFFSection(".text",
                         COFF::IMAGE_SCN_CNT_CODE |
@@ -584,6 +611,16 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
                         COFF::IMAGE_SCN_MEM_READ,
                         SectionKind::getMetadata());
+  DwarfGnuPubNamesSection =
+    Ctx->getCOFFSection(".debug_gnu_pubnames",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
+  DwarfGnuPubTypesSection =
+    Ctx->getCOFFSection(".debug_gnu_pubtypes",
+                        COFF::IMAGE_SCN_MEM_DISCARDABLE |
+                        COFF::IMAGE_SCN_MEM_READ,
+                        SectionKind::getMetadata());
   DwarfStrSection =
     Ctx->getCOFFSection(".debug_str",
                         COFF::IMAGE_SCN_MEM_DISCARDABLE |
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index d21ce8d..bc14c2a 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -18,22 +18,26 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
-MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context,
+                                   MCTargetStreamer *TargetStreamer,
                                    MCAsmBackend &TAB, raw_ostream &OS,
                                    MCCodeEmitter *Emitter_)
-    : MCStreamer(Kind, Context),
+    : MCStreamer(Context, TargetStreamer),
       Assembler(new MCAssembler(Context, TAB, *Emitter_,
                                 *TAB.createObjectWriter(OS), OS)),
       CurSectionData(0) {}
 
-MCObjectStreamer::MCObjectStreamer(StreamerKind Kind, MCContext &Context,
+MCObjectStreamer::MCObjectStreamer(MCContext &Context,
+                                   MCTargetStreamer *TargetStreamer,
                                    MCAsmBackend &TAB, raw_ostream &OS,
                                    MCCodeEmitter *Emitter_,
                                    MCAssembler *_Assembler)
-    : MCStreamer(Kind, Context), Assembler(_Assembler), CurSectionData(0) {}
+    : MCStreamer(Context, TargetStreamer), Assembler(_Assembler),
+      CurSectionData(0) {}
 
 MCObjectStreamer::~MCObjectStreamer() {
   delete &Assembler->getBackend();
@@ -98,15 +102,15 @@ const MCExpr *MCObjectStreamer::AddValueSymbols(const MCExpr *Value) {
   return Value;
 }
 
-void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
-                                     unsigned AddrSpace) {
-  assert(AddrSpace == 0 && "Address space must be 0!");
+void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size) {
   MCDataFragment *DF = getOrCreateDataFragment();
 
+  MCLineEntry::Make(this, getCurrentSection().first);
+
   // Avoid fixups when possible.
   int64_t AbsValue;
   if (AddValueSymbols(Value)->EvaluateAsAbsolute(AbsValue, getAssembler())) {
-    EmitIntValue(AbsValue, Size, AddrSpace);
+    EmitIntValue(AbsValue, Size);
     return;
   }
   DF->getFixups().push_back(
@@ -241,7 +245,7 @@ void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst) {
 }
 
 #ifndef NDEBUG
-static const char *BundlingNotImplementedMsg =
+static const char *const BundlingNotImplementedMsg =
   "Aligned bundling is not implemented for this object format";
 #endif
 
@@ -257,6 +261,19 @@ void MCObjectStreamer::EmitBundleUnlock() {
   llvm_unreachable(BundlingNotImplementedMsg);
 }
 
+void MCObjectStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+                                             unsigned Column, unsigned Flags,
+                                             unsigned Isa,
+                                             unsigned Discriminator,
+                                             StringRef FileName) {
+  // In case we see two .loc directives in a row, make sure the
+  // first one gets a line entry.
+  MCLineEntry::Make(this, getCurrentSection().first);
+
+  this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
+                                          Isa, Discriminator, FileName);
+}
+
 void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
                                                 const MCSymbol *LastLabel,
                                                 const MCSymbol *Label,
@@ -287,8 +304,8 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
   insert(new MCDwarfCallFrameFragment(*AddrDelta));
 }
 
-void MCObjectStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
-  assert(AddrSpace == 0 && "Address space must be 0!");
+void MCObjectStreamer::EmitBytes(StringRef Data) {
+  MCLineEntry::Make(this, getCurrentSection().first);
   getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
 }
 
@@ -351,14 +368,17 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
   DF->getContents().resize(DF->getContents().size() + 8, 0);
 }
 
-void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
-                                unsigned AddrSpace) {
-  assert(AddrSpace == 0 && "Address space must be 0!");
+void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) {
   // FIXME: A MCFillFragment would be more memory efficient but MCExpr has
   //        problems evaluating expressions across multiple fragments.
   getOrCreateDataFragment()->getContents().append(NumBytes, FillValue);
 }
 
+void MCObjectStreamer::EmitZeros(uint64_t NumBytes) {
+  unsigned ItemSize = getCurrentSection().first->isVirtualSection() ? 0 : 1;
+  insert(new MCFillFragment(0, ItemSize, NumBytes));
+}
+
 void MCObjectStreamer::FinishImpl() {
   // Dump out the dwarf file & directory tables and line tables.
   const MCSymbol *LineSectionSymbol = NULL;
diff --git a/lib/MC/MCObjectSymbolizer.cpp b/lib/MC/MCObjectSymbolizer.cpp
new file mode 100644
index 0000000..b9131d1
--- /dev/null
+++ b/lib/MC/MCObjectSymbolizer.cpp
@@ -0,0 +1,310 @@
+//===-- lib/MC/MCObjectSymbolizer.cpp -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCObjectSymbolizer.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+using namespace llvm;
+using namespace object;
+
+//===- MCMachObjectSymbolizer ---------------------------------------------===//
+
+namespace {
+class MCMachObjectSymbolizer : public MCObjectSymbolizer {
+  const MachOObjectFile *MOOF;
+  // __TEXT;__stubs support.
+  uint64_t StubsStart;
+  uint64_t StubsCount;
+  uint64_t StubSize;
+  uint64_t StubsIndSymIndex;
+
+public:
+  MCMachObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
+                         const MachOObjectFile *MOOF);
+
+  StringRef findExternalFunctionAt(uint64_t Addr) LLVM_OVERRIDE;
+
+  void tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+                                       int64_t Value,
+                                       uint64_t Address) LLVM_OVERRIDE;
+};
+} // End unnamed namespace
+
+
+MCMachObjectSymbolizer::
+MCMachObjectSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo,
+                       const MachOObjectFile *MOOF)
+    : MCObjectSymbolizer(Ctx, RelInfo, MOOF), MOOF(MOOF),
+      StubsStart(0), StubsCount(0), StubSize(0), StubsIndSymIndex(0) {
+
+  error_code ec;
+  for (section_iterator SI = MOOF->begin_sections(), SE = MOOF->end_sections();
+       SI != SE; SI.increment(ec)) {
+    if (ec) break;
+    StringRef Name; SI->getName(Name);
+    if (Name == "__stubs") {
+      SectionRef StubsSec = *SI;
+      if (MOOF->is64Bit()) {
+        MachO::section_64 S = MOOF->getSection64(StubsSec.getRawDataRefImpl());
+        StubsIndSymIndex = S.reserved1;
+        StubSize = S.reserved2;
+      } else {
+        MachO::section S = MOOF->getSection(StubsSec.getRawDataRefImpl());
+        StubsIndSymIndex = S.reserved1;
+        StubSize = S.reserved2;
+      }
+      assert(StubSize && "Mach-O stub entry size can't be zero!");
+      StubsSec.getAddress(StubsStart);
+      StubsSec.getSize(StubsCount);
+      StubsCount /= StubSize;
+    }
+  }
+}
+
+StringRef MCMachObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
+  // FIXME: also, this can all be done at the very beginning, by iterating over
+  // all stubs and creating the calls to outside functions. Is it worth it
+  // though?
+  if (!StubSize)
+    return StringRef();
+  uint64_t StubIdx = (Addr - StubsStart) / StubSize;
+  if (StubIdx >= StubsCount)
+    return StringRef();
+
+  uint32_t SymtabIdx =
+    MOOF->getIndirectSymbolTableEntry(MOOF->getDysymtabLoadCommand(), StubIdx);
+
+  StringRef SymName;
+  symbol_iterator SI = MOOF->begin_symbols();
+  error_code ec;
+  for (uint32_t i = 0; i != SymtabIdx; ++i) {
+    SI.increment(ec);
+  }
+  SI->getName(SymName);
+  assert(SI != MOOF->end_symbols() && "Stub wasn't found in the symbol table!");
+  assert(SymName.front() == '_' && "Mach-O symbol doesn't start with '_'!");
+  return SymName.substr(1);
+}
+
+void MCMachObjectSymbolizer::
+tryAddingPcLoadReferenceComment(raw_ostream &cStream, int64_t Value,
+                                uint64_t Address) {
+  if (const RelocationRef *R = findRelocationAt(Address)) {
+    const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R);
+    if (!RelExpr || RelExpr->EvaluateAsAbsolute(Value) == false)
+      return;
+  }
+  uint64_t Addr = Value;
+  if (const SectionRef *S = findSectionContaining(Addr)) {
+    StringRef Name; S->getName(Name);
+    uint64_t SAddr; S->getAddress(SAddr);
+    if (Name == "__cstring") {
+      StringRef Contents;
+      S->getContents(Contents);
+      Contents = Contents.substr(Addr - SAddr);
+      cStream << " ## literal pool for: "
+              << Contents.substr(0, Contents.find_first_of(0));
+    }
+  }
+}
+
+//===- MCObjectSymbolizer -------------------------------------------------===//
+
+MCObjectSymbolizer::MCObjectSymbolizer(MCContext &Ctx,
+                                       OwningPtr<MCRelocationInfo> &RelInfo,
+                                       const ObjectFile *Obj)
+    : MCSymbolizer(Ctx, RelInfo), Obj(Obj), SortedSections(), AddrToReloc() {
+}
+
+bool MCObjectSymbolizer::
+tryAddingSymbolicOperand(MCInst &MI, raw_ostream &cStream,
+                         int64_t Value, uint64_t Address, bool IsBranch,
+                         uint64_t Offset, uint64_t InstSize) {
+  if (IsBranch) {
+    StringRef ExtFnName = findExternalFunctionAt((uint64_t)Value);
+    if (!ExtFnName.empty()) {
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(ExtFnName);
+      const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      MI.addOperand(MCOperand::CreateExpr(Expr));
+      return true;
+    }
+  }
+
+  if (const RelocationRef *R = findRelocationAt(Address + Offset)) {
+    if (const MCExpr *RelExpr = RelInfo->createExprForRelocation(*R)) {
+      MI.addOperand(MCOperand::CreateExpr(RelExpr));
+      return true;
+    }
+    // Only try to create a symbol+offset expression if there is no relocation.
+    return false;
+  }
+
+  // Interpret Value as a branch target.
+  if (IsBranch == false)
+    return false;
+  uint64_t UValue = Value;
+  // FIXME: map instead of looping each time?
+  error_code ec;
+  for (symbol_iterator SI = Obj->begin_symbols(), SE = Obj->end_symbols();
+       SI != SE; SI.increment(ec)) {
+    if (ec) break;
+    uint64_t SymAddr; SI->getAddress(SymAddr);
+    uint64_t SymSize; SI->getSize(SymSize);
+    StringRef SymName; SI->getName(SymName);
+    SymbolRef::Type SymType; SI->getType(SymType);
+    if (SymAddr == UnknownAddressOrSize || SymSize == UnknownAddressOrSize
+        || SymName.empty() || SymType != SymbolRef::ST_Function)
+      continue;
+
+    if ( SymAddr == UValue ||
+        (SymAddr <= UValue && SymAddr + SymSize > UValue)) {
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+      const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      if (SymAddr != UValue) {
+        const MCExpr *Off = MCConstantExpr::Create(UValue - SymAddr, Ctx);
+        Expr = MCBinaryExpr::CreateAdd(Expr, Off, Ctx);
+      }
+      MI.addOperand(MCOperand::CreateExpr(Expr));
+      return true;
+    }
+  }
+  return false;
+}
+
+void MCObjectSymbolizer::
+tryAddingPcLoadReferenceComment(raw_ostream &cStream,
+                                int64_t Value, uint64_t Address) {
+}
+
+StringRef MCObjectSymbolizer::findExternalFunctionAt(uint64_t Addr) {
+  return StringRef();
+}
+
+MCObjectSymbolizer *
+MCObjectSymbolizer::createObjectSymbolizer(MCContext &Ctx,
+                                           OwningPtr<MCRelocationInfo> &RelInfo,
+                                           const ObjectFile *Obj) {
+  if (const MachOObjectFile *MOOF = dyn_cast<MachOObjectFile>(Obj))
+    return new MCMachObjectSymbolizer(Ctx, RelInfo, MOOF);
+  return new MCObjectSymbolizer(Ctx, RelInfo, Obj);
+}
+
+// SortedSections implementation.
+
+static bool SectionStartsBefore(const SectionRef &S, uint64_t Addr) {
+  uint64_t SAddr; S.getAddress(SAddr);
+  return SAddr < Addr;
+}
+
+const SectionRef *MCObjectSymbolizer::findSectionContaining(uint64_t Addr) {
+  if (SortedSections.empty())
+    buildSectionList();
+
+  SortedSectionList::iterator
+    EndIt = SortedSections.end(),
+    It = std::lower_bound(SortedSections.begin(), EndIt,
+                          Addr, SectionStartsBefore);
+  if (It == EndIt)
+    return 0;
+  uint64_t SAddr; It->getAddress(SAddr);
+  uint64_t SSize; It->getSize(SSize);
+  if (Addr >= SAddr + SSize)
+    return 0;
+  return &*It;
+}
+
+const RelocationRef *MCObjectSymbolizer::findRelocationAt(uint64_t Addr) {
+  if (AddrToReloc.empty())
+    buildRelocationByAddrMap();
+
+  AddrToRelocMap::const_iterator RI = AddrToReloc.find(Addr);
+  if (RI == AddrToReloc.end())
+    return 0;
+  return &RI->second;
+}
+
+void MCObjectSymbolizer::buildSectionList() {
+  error_code ec;
+  for (section_iterator SI = Obj->begin_sections(), SE = Obj->end_sections();
+                        SI != SE; SI.increment(ec)) {
+    if (ec) break;
+
+    bool RequiredForExec; SI->isRequiredForExecution(RequiredForExec);
+    if (RequiredForExec == false)
+      continue;
+    uint64_t SAddr; SI->getAddress(SAddr);
+    uint64_t SSize; SI->getSize(SSize);
+    SortedSectionList::iterator It = std::lower_bound(SortedSections.begin(),
+                                                      SortedSections.end(),
+                                                      SAddr,
+                                                      SectionStartsBefore);
+    if (It != SortedSections.end()) {
+      uint64_t FoundSAddr; It->getAddress(FoundSAddr);
+      if (FoundSAddr < SAddr + SSize)
+        llvm_unreachable("Inserting overlapping sections");
+    }
+    SortedSections.insert(It, *SI);
+  }
+}
+
+void MCObjectSymbolizer::buildRelocationByAddrMap() {
+  error_code ec;
+  for (section_iterator SI = Obj->begin_sections(), SE = Obj->end_sections();
+                        SI != SE; SI.increment(ec)) {
+    if (ec) break;
+
+    section_iterator RelSecI = SI->getRelocatedSection();
+    if (RelSecI == Obj->end_sections())
+      continue;
+
+    uint64_t StartAddr; RelSecI->getAddress(StartAddr);
+    uint64_t Size; RelSecI->getSize(Size);
+    bool RequiredForExec; RelSecI->isRequiredForExecution(RequiredForExec);
+    if (RequiredForExec == false || Size == 0)
+      continue;
+    for (relocation_iterator RI = SI->begin_relocations(),
+                             RE = SI->end_relocations();
+                             RI != RE;
+                             RI.increment(ec)) {
+      if (ec) break;
+      // FIXME: libObject is inconsistent regarding error handling. The
+      // overwhelming majority of methods always return object_error::success,
+      // and assert for simple errors.. Here, ELFObjectFile::getRelocationOffset
+      // asserts when the file type isn't ET_REL.
+      // This workaround handles x86-64 elf, the only one that has a relocinfo.
+      uint64_t Offset;
+      if (Obj->isELF()) {
+        const ELF64LEObjectFile *ELFObj = dyn_cast<ELF64LEObjectFile>(Obj);
+        if (ELFObj == 0)
+          break;
+        if (ELFObj->getELFFile()->getHeader()->e_type == ELF::ET_REL) {
+          RI->getOffset(Offset);
+          Offset += StartAddr;
+        } else {
+          RI->getAddress(Offset);
+        }
+      } else {
+        RI->getOffset(Offset);
+        Offset += StartAddr;
+      }
+      // At a specific address, only keep the first relocation.
+      if (AddrToReloc.find(Offset) == AddrToReloc.end())
+        AddrToReloc[Offset] = *RI;
+    }
+  }
+}
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index c1c594a..b49dd01 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -91,9 +91,56 @@ AsmToken AsmLexer::LexFloatLiteral() {
                   StringRef(TokStart, CurPtr - TokStart));
 }
 
-/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
+/// while making sure there are enough actual digits around for the constant to
+/// be valid.
+///
+/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
+/// before we get here.
+AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
+  assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
+         "unexpected parse state in floating hex");
+  bool NoFracDigits = true;
+
+  // Skip the fractional part if there is one
+  if (*CurPtr == '.') {
+    ++CurPtr;
+
+    const char *FracStart = CurPtr;
+    while (isxdigit(*CurPtr))
+      ++CurPtr;
+
+    NoFracDigits = CurPtr == FracStart;
+  }
+
+  if (NoIntDigits && NoFracDigits)
+    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+                                 "expected at least one significand digit");
+
+  // Make sure we do have some kind of proper exponent part
+  if (*CurPtr != 'p' && *CurPtr != 'P')
+    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+                                 "expected exponent part 'p'");
+  ++CurPtr;
+
+  if (*CurPtr == '+' || *CurPtr == '-')
+    ++CurPtr;
+
+  // N.b. exponent digits are *not* hex
+  const char *ExpStart = CurPtr;
+  while (isdigit(*CurPtr))
+    ++CurPtr;
+
+  if (CurPtr == ExpStart)
+    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
+                                 "expected at least one exponent digit");
+
+  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
+}
+
+/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
 static bool IsIdentifierChar(char c) {
-  return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@';
+  return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@' || c == '?';
 }
 AsmToken AsmLexer::LexIdentifier() {
   // Check for floating point literals.
@@ -265,7 +312,12 @@ AsmToken AsmLexer::LexDigit() {
     while (isxdigit(CurPtr[0]))
       ++CurPtr;
 
-    // Requires at least one hex digit.
+    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
+    // diagnosed by LexHexFloatLiteral).
+    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
+      return LexHexFloatLiteral(NumStart == CurPtr);
+
+    // Otherwise requires at least one hex digit.
     if (CurPtr == NumStart)
       return ReturnError(CurPtr-2, "invalid hexadecimal number");
 
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index edefdb4..a91bd93 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -94,13 +94,13 @@ public:
 };
 
 struct ParseStatementInfo {
-  /// ParsedOperands - The parsed operands from the last parsed statement.
+  /// \brief The parsed operands from the last parsed statement.
   SmallVector<MCParsedAsmOperand*, 8> ParsedOperands;
 
-  /// Opcode - The opcode from the last parsed instruction.
+  /// \brief The opcode from the last parsed instruction.
   unsigned Opcode;
 
-  /// Error - Was there an error parsing the inline assembly?
+  /// \brief Was there an error parsing the inline assembly?
   bool ParseError;
 
   SmallVectorImpl<AsmRewrite> *AsmRewrites;
@@ -138,17 +138,20 @@ private:
   AsmCond TheCondState;
   std::vector<AsmCond> TheCondStack;
 
-  /// ExtensionDirectiveMap - maps directive names to handler methods in parser
+  /// \brief maps directive names to handler methods in parser
   /// extensions. Extensions register themselves in this map by calling
   /// addDirectiveHandler.
   StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
 
-  /// MacroMap - Map of currently defined macros.
+  /// \brief Map of currently defined macros.
   StringMap<MCAsmMacro*> MacroMap;
 
-  /// ActiveMacros - Stack of active macro instantiations.
+  /// \brief Stack of active macro instantiations.
   std::vector<MacroInstantiation*> ActiveMacros;
 
+  /// \brief List of bodies of anonymous macros.
+  std::deque<MCAsmMacro> MacroLikeBodies;
+
   /// Boolean tracking whether macro substitution is enabled.
   unsigned MacrosEnabledFlag : 1;
 
@@ -160,14 +163,21 @@ private:
   int64_t CppHashLineNumber;
   SMLoc CppHashLoc;
   int CppHashBuf;
+  /// When generating dwarf for assembly source files we need to calculate the
+  /// logical line number based on the last parsed cpp hash file line comment
+  /// and current line. Since this is slow and messes up the SourceMgr's
+  /// cache we save the last info we queried with SrcMgr.FindLineNumber().
+  SMLoc LastQueryIDLoc;
+  int LastQueryBuffer;
+  unsigned LastQueryLine;
 
   /// AssemblerDialect. ~OU means unset value and use value provided by MAI.
   unsigned AssemblerDialect;
 
-  /// IsDarwin - is Darwin compatibility enabled?
+  /// \brief is Darwin compatibility enabled?
   bool IsDarwin;
 
-  /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+  /// \brief Are we parsing ms-style inline assembly?
   bool ParsingInlineAsm;
 
 public:
@@ -225,7 +235,7 @@ public:
   virtual bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc);
   virtual bool parseAbsoluteExpression(int64_t &Res);
 
-  /// parseIdentifier - Parse an identifier or string (as a quoted identifier)
+  /// \brief Parse an identifier or string (as a quoted identifier)
   /// and set \p Res to the identifier contents.
   virtual bool parseIdentifier(StringRef &Res);
   virtual void eatToEndOfStatement();
@@ -235,11 +245,11 @@ public:
 
 private:
 
-  bool ParseStatement(ParseStatementInfo &Info);
-  void EatToEndOfLine();
-  bool ParseCppHashLineFilenameComment(const SMLoc &L);
+  bool parseStatement(ParseStatementInfo &Info);
+  void eatToEndOfLine();
+  bool parseCppHashLineFilenameComment(const SMLoc &L);
 
-  void CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
+  void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
                         MCAsmMacroParameters Parameters);
   bool expandMacro(raw_svector_ostream &OS, StringRef Body,
                    const MCAsmMacroParameters &Parameters,
@@ -247,55 +257,56 @@ private:
                    const SMLoc &L);
 
   /// \brief Are macros enabled in the parser?
-  bool MacrosEnabled() {return MacrosEnabledFlag;}
+  bool areMacrosEnabled() {return MacrosEnabledFlag;}
 
   /// \brief Control a flag in the parser that enables or disables macros.
-  void SetMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
+  void setMacrosEnabled(bool Flag) {MacrosEnabledFlag = Flag;}
 
   /// \brief Lookup a previously defined macro.
   /// \param Name Macro name.
   /// \returns Pointer to macro. NULL if no such macro was defined.
-  const MCAsmMacro* LookupMacro(StringRef Name);
+  const MCAsmMacro* lookupMacro(StringRef Name);
 
   /// \brief Define a new macro with the given name and information.
-  void DefineMacro(StringRef Name, const MCAsmMacro& Macro);
+  void defineMacro(StringRef Name, const MCAsmMacro& Macro);
 
   /// \brief Undefine a macro. If no such macro was defined, it's a no-op.
-  void UndefineMacro(StringRef Name);
+  void undefineMacro(StringRef Name);
 
   /// \brief Are we inside a macro instantiation?
-  bool InsideMacroInstantiation() {return !ActiveMacros.empty();}
+  bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
 
-  /// \brief Handle entry to macro instantiation. 
+  /// \brief Handle entry to macro instantiation.
   ///
   /// \param M The macro.
   /// \param NameLoc Instantiation location.
-  bool HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
+  bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
 
   /// \brief Handle exit from macro instantiation.
-  void HandleMacroExit();
+  void handleMacroExit();
 
   /// \brief Extract AsmTokens for a macro argument. If the argument delimiter
   /// is initially unknown, set it to AsmToken::Eof. It will be set to the
   /// correct delimiter by the method.
-  bool ParseMacroArgument(MCAsmMacroArgument &MA,
+  bool parseMacroArgument(MCAsmMacroArgument &MA,
                           AsmToken::TokenKind &ArgumentDelimiter);
 
   /// \brief Parse all macro arguments for a given macro.
-  bool ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
+  bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
 
-  void PrintMacroInstantiations();
-  void PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
+  void printMacroInstantiations();
+  void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
                     ArrayRef<SMRange> Ranges = None) const {
     SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
   }
   static void DiagHandler(const SMDiagnostic &Diag, void *Context);
 
-  /// EnterIncludeFile - Enter the specified file. This returns true on failure.
-  bool EnterIncludeFile(const std::string &Filename);
-  /// ProcessIncbinFile - Process the specified file for the .incbin directive.
+  /// \brief Enter the specified file. This returns true on failure.
+  bool enterIncludeFile(const std::string &Filename);
+
+  /// \brief Process the specified file for the .incbin directive.
   /// This returns true on failure.
-  bool ProcessIncbinFile(const std::string &Filename);
+  bool processIncbinFile(const std::string &Filename);
 
   /// \brief Reset the current lexer position to that given by \p Loc. The
   /// current token is not set; clients should ensure Lex() is called
@@ -303,7 +314,7 @@ private:
   ///
   /// \param InBuffer If not -1, should be the known buffer id that contains the
   /// location.
-  void JumpToLoc(SMLoc Loc, int InBuffer=-1);
+  void jumpToLoc(SMLoc Loc, int InBuffer=-1);
 
   /// \brief Parse up to the end of statement and a return the contents from the
   /// current token until the end of the statement; the current token on exit
@@ -312,17 +323,16 @@ private:
 
   /// \brief Parse until the end of a statement or a comma is encountered,
   /// return the contents from the current token up to the end or comma.
-  StringRef ParseStringToComma();
+  StringRef parseStringToComma();
 
-  bool ParseAssignment(StringRef Name, bool allow_redef,
+  bool parseAssignment(StringRef Name, bool allow_redef,
                        bool NoDeadStrip = false);
 
-  bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
-  bool ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
-  bool ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
-  bool ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
+  bool parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
+  bool parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
+  bool parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
 
-  bool ParseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
+  bool parseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
 
   // Generic (target and platform independent) directive parsing.
   enum DirectiveKind {
@@ -332,7 +342,7 @@ private:
     DK_FLOAT, DK_DOUBLE, DK_ALIGN, DK_ALIGN32, DK_BALIGN, DK_BALIGNW,
     DK_BALIGNL, DK_P2ALIGN, DK_P2ALIGNW, DK_P2ALIGNL, DK_ORG, DK_FILL, DK_ENDR,
     DK_BUNDLE_ALIGN_MODE, DK_BUNDLE_LOCK, DK_BUNDLE_UNLOCK,
-    DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL, DK_INDIRECT_SYMBOL,
+    DK_ZERO, DK_EXTERN, DK_GLOBL, DK_GLOBAL,
     DK_LAZY_REFERENCE, DK_NO_DEAD_STRIP, DK_SYMBOL_RESOLVER, DK_PRIVATE_EXTERN,
     DK_REFERENCE, DK_WEAK_DEFINITION, DK_WEAK_REFERENCE,
     DK_WEAK_DEF_CAN_BE_HIDDEN, DK_COMM, DK_COMMON, DK_LCOMM, DK_ABORT,
@@ -345,112 +355,113 @@ private:
     DK_CFI_OFFSET, DK_CFI_REL_OFFSET, DK_CFI_PERSONALITY, DK_CFI_LSDA,
     DK_CFI_REMEMBER_STATE, DK_CFI_RESTORE_STATE, DK_CFI_SAME_VALUE,
     DK_CFI_RESTORE, DK_CFI_ESCAPE, DK_CFI_SIGNAL_FRAME, DK_CFI_UNDEFINED,
-    DK_CFI_REGISTER,
+    DK_CFI_REGISTER, DK_CFI_WINDOW_SAVE,
     DK_MACROS_ON, DK_MACROS_OFF, DK_MACRO, DK_ENDM, DK_ENDMACRO, DK_PURGEM,
     DK_SLEB128, DK_ULEB128
   };
 
-  /// DirectiveKindMap - Maps directive name --> DirectiveKind enum, for
+  /// \brief Maps directive name --> DirectiveKind enum, for
   /// directives parsed by this class.
   StringMap<DirectiveKind> DirectiveKindMap;
 
   // ".ascii", ".asciz", ".string"
-  bool ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
-  bool ParseDirectiveValue(unsigned Size); // ".byte", ".long", ...
-  bool ParseDirectiveRealValue(const fltSemantics &); // ".single", ...
-  bool ParseDirectiveFill(); // ".fill"
-  bool ParseDirectiveZero(); // ".zero"
+  bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
+  bool parseDirectiveValue(unsigned Size); // ".byte", ".long", ...
+  bool parseDirectiveRealValue(const fltSemantics &); // ".single", ...
+  bool parseDirectiveFill(); // ".fill"
+  bool parseDirectiveZero(); // ".zero"
   // ".set", ".equ", ".equiv"
-  bool ParseDirectiveSet(StringRef IDVal, bool allow_redef);
-  bool ParseDirectiveOrg(); // ".org"
+  bool parseDirectiveSet(StringRef IDVal, bool allow_redef);
+  bool parseDirectiveOrg(); // ".org"
   // ".align{,32}", ".p2align{,w,l}"
-  bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize);
+  bool parseDirectiveAlign(bool IsPow2, unsigned ValueSize);
 
   // ".file", ".line", ".loc", ".stabs"
-  bool ParseDirectiveFile(SMLoc DirectiveLoc);
-  bool ParseDirectiveLine();
-  bool ParseDirectiveLoc();
-  bool ParseDirectiveStabs();
+  bool parseDirectiveFile(SMLoc DirectiveLoc);
+  bool parseDirectiveLine();
+  bool parseDirectiveLoc();
+  bool parseDirectiveStabs();
 
   // .cfi directives
-  bool ParseDirectiveCFIRegister(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFISections();
-  bool ParseDirectiveCFIStartProc();
-  bool ParseDirectiveCFIEndProc();
-  bool ParseDirectiveCFIDefCfaOffset();
-  bool ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIAdjustCfaOffset();
-  bool ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIOffset(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
-  bool ParseDirectiveCFIRememberState();
-  bool ParseDirectiveCFIRestoreState();
-  bool ParseDirectiveCFISameValue(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIRestore(SMLoc DirectiveLoc);
-  bool ParseDirectiveCFIEscape();
-  bool ParseDirectiveCFISignalFrame();
-  bool ParseDirectiveCFIUndefined(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIWindowSave();
+  bool parseDirectiveCFISections();
+  bool parseDirectiveCFIStartProc();
+  bool parseDirectiveCFIEndProc();
+  bool parseDirectiveCFIDefCfaOffset();
+  bool parseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIAdjustCfaOffset();
+  bool parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIOffset(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
+  bool parseDirectiveCFIRememberState();
+  bool parseDirectiveCFIRestoreState();
+  bool parseDirectiveCFISameValue(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIRestore(SMLoc DirectiveLoc);
+  bool parseDirectiveCFIEscape();
+  bool parseDirectiveCFISignalFrame();
+  bool parseDirectiveCFIUndefined(SMLoc DirectiveLoc);
 
   // macro directives
-  bool ParseDirectivePurgeMacro(SMLoc DirectiveLoc);
-  bool ParseDirectiveEndMacro(StringRef Directive);
-  bool ParseDirectiveMacro(SMLoc DirectiveLoc);
-  bool ParseDirectiveMacrosOnOff(StringRef Directive);
+  bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
+  bool parseDirectiveEndMacro(StringRef Directive);
+  bool parseDirectiveMacro(SMLoc DirectiveLoc);
+  bool parseDirectiveMacrosOnOff(StringRef Directive);
 
   // ".bundle_align_mode"
-  bool ParseDirectiveBundleAlignMode();
+  bool parseDirectiveBundleAlignMode();
   // ".bundle_lock"
-  bool ParseDirectiveBundleLock();
+  bool parseDirectiveBundleLock();
   // ".bundle_unlock"
-  bool ParseDirectiveBundleUnlock();
+  bool parseDirectiveBundleUnlock();
 
   // ".space", ".skip"
-  bool ParseDirectiveSpace(StringRef IDVal);
+  bool parseDirectiveSpace(StringRef IDVal);
 
   // .sleb128 (Signed=true) and .uleb128 (Signed=false)
-  bool ParseDirectiveLEB128(bool Signed);
+  bool parseDirectiveLEB128(bool Signed);
 
-  /// ParseDirectiveSymbolAttribute - Parse a directive like ".globl" which
+  /// \brief Parse a directive like ".globl" which
   /// accepts a single symbol (which should be a label or an external).
-  bool ParseDirectiveSymbolAttribute(MCSymbolAttr Attr);
+  bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);
 
-  bool ParseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
+  bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
 
-  bool ParseDirectiveAbort(); // ".abort"
-  bool ParseDirectiveInclude(); // ".include"
-  bool ParseDirectiveIncbin(); // ".incbin"
+  bool parseDirectiveAbort(); // ".abort"
+  bool parseDirectiveInclude(); // ".include"
+  bool parseDirectiveIncbin(); // ".incbin"
 
-  bool ParseDirectiveIf(SMLoc DirectiveLoc); // ".if"
+  bool parseDirectiveIf(SMLoc DirectiveLoc); // ".if"
   // ".ifb" or ".ifnb", depending on ExpectBlank.
-  bool ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+  bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
   // ".ifc" or ".ifnc", depending on ExpectEqual.
-  bool ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
+  bool parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual);
   // ".ifdef" or ".ifndef", depending on expect_defined
-  bool ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
-  bool ParseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
-  bool ParseDirectiveElse(SMLoc DirectiveLoc); // ".else"
-  bool ParseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
+  bool parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
+  bool parseDirectiveElseIf(SMLoc DirectiveLoc); // ".elseif"
+  bool parseDirectiveElse(SMLoc DirectiveLoc); // ".else"
+  bool parseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
   virtual bool parseEscapedString(std::string &Data);
 
-  const MCExpr *ApplyModifierToExpr(const MCExpr *E,
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
                                     MCSymbolRefExpr::VariantKind Variant);
 
   // Macro-like directives
-  MCAsmMacro *ParseMacroLikeBody(SMLoc DirectiveLoc);
-  void InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+  MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc);
+  void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
                                 raw_svector_ostream &OS);
-  bool ParseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
-  bool ParseDirectiveIrp(SMLoc DirectiveLoc);  // ".irp"
-  bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
-  bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
+  bool parseDirectiveRept(SMLoc DirectiveLoc); // ".rept"
+  bool parseDirectiveIrp(SMLoc DirectiveLoc);  // ".irp"
+  bool parseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
+  bool parseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
 
   // "_emit" or "__emit"
-  bool ParseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
+  bool parseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
                             size_t Len);
 
   // "align"
-  bool ParseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
+  bool parseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
 
   void initializeDirectiveKindMap();
 };
@@ -466,12 +477,12 @@ extern MCAsmParserExtension *createCOFFAsmParser();
 
 enum { DEFAULT_ADDRSPACE = 0 };
 
-AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
-                     MCStreamer &_Out, const MCAsmInfo &_MAI)
-  : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
-    PlatformParser(0),
-    CurBuffer(0), MacrosEnabledFlag(true), CppHashLineNumber(0),
-    AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) {
+AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx, MCStreamer &_Out,
+                     const MCAsmInfo &_MAI)
+    : Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
+      PlatformParser(0), CurBuffer(0), MacrosEnabledFlag(true),
+      CppHashLineNumber(0), AssemblerDialect(~0U), IsDarwin(false),
+      ParsingInlineAsm(false) {
   // Save the old handler.
   SavedDiagHandler = SrcMgr.getDiagHandler();
   SavedDiagContext = SrcMgr.getDiagContext();
@@ -502,37 +513,40 @@ AsmParser::~AsmParser() {
   assert(ActiveMacros.empty() && "Unexpected active macro instantiation!");
 
   // Destroy any macros.
-  for (StringMap<MCAsmMacro*>::iterator it = MacroMap.begin(),
-         ie = MacroMap.end(); it != ie; ++it)
+  for (StringMap<MCAsmMacro *>::iterator it = MacroMap.begin(),
+                                         ie = MacroMap.end();
+       it != ie; ++it)
     delete it->getValue();
 
   delete PlatformParser;
 }
 
-void AsmParser::PrintMacroInstantiations() {
+void AsmParser::printMacroInstantiations() {
   // Print the active macro instantiation stack.
-  for (std::vector<MacroInstantiation*>::const_reverse_iterator
-         it = ActiveMacros.rbegin(), ie = ActiveMacros.rend(); it != ie; ++it)
-    PrintMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
+  for (std::vector<MacroInstantiation *>::const_reverse_iterator
+           it = ActiveMacros.rbegin(),
+           ie = ActiveMacros.rend();
+       it != ie; ++it)
+    printMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
                  "while in macro instantiation");
 }
 
 bool AsmParser::Warning(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
   if (FatalAssemblerWarnings)
     return Error(L, Msg, Ranges);
-  PrintMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
-  PrintMacroInstantiations();
+  printMessage(L, SourceMgr::DK_Warning, Msg, Ranges);
+  printMacroInstantiations();
   return false;
 }
 
 bool AsmParser::Error(SMLoc L, const Twine &Msg, ArrayRef<SMRange> Ranges) {
   HadError = true;
-  PrintMessage(L, SourceMgr::DK_Error, Msg, Ranges);
-  PrintMacroInstantiations();
+  printMessage(L, SourceMgr::DK_Error, Msg, Ranges);
+  printMacroInstantiations();
   return true;
 }
 
-bool AsmParser::EnterIncludeFile(const std::string &Filename) {
+bool AsmParser::enterIncludeFile(const std::string &Filename) {
   std::string IncludedFile;
   int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
   if (NewBuf == -1)
@@ -545,22 +559,21 @@ bool AsmParser::EnterIncludeFile(const std::string &Filename) {
   return false;
 }
 
-/// Process the specified .incbin file by seaching for it in the include paths
+/// Process the specified .incbin file by searching for it in the include paths
 /// then just emitting the byte contents of the file to the streamer. This
 /// returns true on failure.
-bool AsmParser::ProcessIncbinFile(const std::string &Filename) {
+bool AsmParser::processIncbinFile(const std::string &Filename) {
   std::string IncludedFile;
   int NewBuf = SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
   if (NewBuf == -1)
     return true;
 
   // Pick up the bytes from the file and emit them.
-  getStreamer().EmitBytes(SrcMgr.getMemoryBuffer(NewBuf)->getBuffer(),
-                          DEFAULT_ADDRSPACE);
+  getStreamer().EmitBytes(SrcMgr.getMemoryBuffer(NewBuf)->getBuffer());
   return false;
 }
 
-void AsmParser::JumpToLoc(SMLoc Loc, int InBuffer) {
+void AsmParser::jumpToLoc(SMLoc Loc, int InBuffer) {
   if (InBuffer != -1) {
     CurBuffer = InBuffer;
   } else {
@@ -577,7 +590,7 @@ const AsmToken &AsmParser::Lex() {
     // include stack.
     SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
     if (ParentIncludeLoc != SMLoc()) {
-      JumpToLoc(ParentIncludeLoc);
+      jumpToLoc(ParentIncludeLoc);
       tok = &Lexer.Lex();
     }
   }
@@ -614,7 +627,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
   // While we have input, parse each statement.
   while (Lexer.isNot(AsmToken::Eof)) {
     ParseStatementInfo Info;
-    if (!ParseStatement(Info)) continue;
+    if (!parseStatement(Info))
+      continue;
 
     // We had an error, validate that one was emitted and recover by skipping to
     // the next line.
@@ -628,7 +642,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
 
   // Check to see there are no empty DwarfFile slots.
   const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
-    getContext().getMCDwarfFiles();
+      getContext().getMCDwarfFiles();
   for (unsigned i = 1; i < MCDwarfFiles.size(); i++) {
     if (!MCDwarfFiles[i])
       TokError("unassigned file number: " + Twine(i) + " for .file directives");
@@ -641,7 +655,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
   if (!NoFinalize && MAI.hasSubsectionsViaSymbols()) {
     const MCContext::SymbolTable &Symbols = getContext().getSymbols();
     for (MCContext::SymbolTable::const_iterator i = Symbols.begin(),
-         e = Symbols.end();
+                                                e = Symbols.end();
          i != e; ++i) {
       MCSymbol *Sym = i->getValue();
       // Variable symbols may not be marked as defined, so check those
@@ -651,13 +665,12 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
         // FIXME: We would really like to refer back to where the symbol was
         // first referenced for a source location. We need to add something
         // to track that. Currently, we just point to the end of the file.
-        PrintMessage(getLexer().getLoc(), SourceMgr::DK_Error,
-                     "assembler local symbol '" + Sym->getName() +
-                     "' not defined");
+        printMessage(
+            getLexer().getLoc(), SourceMgr::DK_Error,
+            "assembler local symbol '" + Sym->getName() + "' not defined");
     }
   }
 
-
   // Finalize the output stream if there are no errors and if the client wants
   // us to.
   if (!HadError && !NoFinalize)
@@ -673,10 +686,9 @@ void AsmParser::checkForValidSection() {
   }
 }
 
-/// eatToEndOfStatement - Throw away the rest of the line for testing purposes.
+/// \brief Throw away the rest of the line for testing purposes.
 void AsmParser::eatToEndOfStatement() {
-  while (Lexer.isNot(AsmToken::EndOfStatement) &&
-         Lexer.isNot(AsmToken::Eof))
+  while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
     Lex();
 
   // Eat EOL.
@@ -687,33 +699,32 @@ void AsmParser::eatToEndOfStatement() {
 StringRef AsmParser::parseStringToEndOfStatement() {
   const char *Start = getTok().getLoc().getPointer();
 
-  while (Lexer.isNot(AsmToken::EndOfStatement) &&
-         Lexer.isNot(AsmToken::Eof))
+  while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
     Lex();
 
   const char *End = getTok().getLoc().getPointer();
   return StringRef(Start, End - Start);
 }
 
-StringRef AsmParser::ParseStringToComma() {
+StringRef AsmParser::parseStringToComma() {
   const char *Start = getTok().getLoc().getPointer();
 
   while (Lexer.isNot(AsmToken::EndOfStatement) &&
-         Lexer.isNot(AsmToken::Comma) &&
-         Lexer.isNot(AsmToken::Eof))
+         Lexer.isNot(AsmToken::Comma) && Lexer.isNot(AsmToken::Eof))
     Lex();
 
   const char *End = getTok().getLoc().getPointer();
   return StringRef(Start, End - Start);
 }
 
-/// ParseParenExpr - Parse a paren expression and return it.
+/// \brief Parse a paren expression and return it.
 /// NOTE: This assumes the leading '(' has already been consumed.
 ///
 /// parenexpr ::= expr)
 ///
-bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
-  if (parseExpression(Res)) return true;
+bool AsmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  if (parseExpression(Res))
+    return true;
   if (Lexer.isNot(AsmToken::RParen))
     return TokError("expected ')' in parentheses expression");
   EndLoc = Lexer.getTok().getEndLoc();
@@ -721,13 +732,14 @@ bool AsmParser::ParseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   return false;
 }
 
-/// ParseBracketExpr - Parse a bracket expression and return it.
+/// \brief Parse a bracket expression and return it.
 /// NOTE: This assumes the leading '[' has already been consumed.
 ///
 /// bracketexpr ::= expr]
 ///
-bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
-  if (parseExpression(Res)) return true;
+bool AsmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+  if (parseExpression(Res))
+    return true;
   if (Lexer.isNot(AsmToken::RBrac))
     return TokError("expected ']' in brackets expression");
   EndLoc = Lexer.getTok().getEndLoc();
@@ -735,13 +747,13 @@ bool AsmParser::ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   return false;
 }
 
-/// ParsePrimaryExpr - Parse a primary expression and return it.
+/// \brief Parse a primary expression and return it.
 ///  primaryexpr ::= (parenexpr
 ///  primaryexpr ::= symbol
 ///  primaryexpr ::= number
 ///  primaryexpr ::= '.'
 ///  primaryexpr ::= ~,+,- primaryexpr
-bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   SMLoc FirstTokenLoc = getLexer().getLoc();
   AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
   switch (FirstTokenKind) {
@@ -752,36 +764,54 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     return true;
   case AsmToken::Exclaim:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreateLNot(Res, getContext());
     return false;
   case AsmToken::Dollar:
+  case AsmToken::At:
   case AsmToken::String:
   case AsmToken::Identifier: {
     StringRef Identifier;
     if (parseIdentifier(Identifier)) {
-      if (FirstTokenKind == AsmToken::Dollar)
-        return Error(FirstTokenLoc, "invalid token in expression");
-      return true;
+      if (FirstTokenKind == AsmToken::Dollar) {
+        if (Lexer.getMAI().getDollarIsPC()) {
+          // This is a '$' reference, which references the current PC.  Emit a
+          // temporary label to the streamer and refer to it.
+          MCSymbol *Sym = Ctx.CreateTempSymbol();
+          Out.EmitLabel(Sym);
+          Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                        getContext());
+          EndLoc = FirstTokenLoc;
+          return false;
+        } else
+          return Error(FirstTokenLoc, "invalid token in expression");
+        return true;
+      }
     }
 
     EndLoc = SMLoc::getFromPointer(Identifier.end());
 
     // This is a symbol reference.
+    StringRef SymbolName = Identifier;
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
     std::pair<StringRef, StringRef> Split = Identifier.split('@');
-    MCSymbol *Sym = getContext().GetOrCreateSymbol(Split.first);
 
     // Lookup the symbol variant if used.
-    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
     if (Split.first.size() != Identifier.size()) {
       Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
-      if (Variant == MCSymbolRefExpr::VK_Invalid) {
+      if (Variant != MCSymbolRefExpr::VK_Invalid) {
+        SymbolName = Split.first;
+      } else if (MAI.doesAllowAtInName()) {
+        Variant = MCSymbolRefExpr::VK_None;
+      } else {
         Variant = MCSymbolRefExpr::VK_None;
         return TokError("invalid variant '" + Split.second + "'");
       }
     }
 
+    MCSymbol *Sym = getContext().GetOrCreateSymbol(SymbolName);
+
     // If this is an absolute variable reference, substitute it now to preserve
     // semantics in the face of reassignment.
     if (Sym->isVariable() && isa<MCConstantExpr>(Sym->getVariableValue())) {
@@ -805,11 +835,21 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     // Look for 'b' or 'f' following an Integer as a directional label
     if (Lexer.getKind() == AsmToken::Identifier) {
       StringRef IDVal = getTok().getString();
-      if (IDVal == "f" || IDVal == "b"){
-        MCSymbol *Sym = Ctx.GetDirectionalLocalSymbol(IntVal,
-                                                      IDVal == "f" ? 1 : 0);
-        Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
-                                      getContext());
+      // Lookup the symbol variant if used.
+      std::pair<StringRef, StringRef> Split = IDVal.split('@');
+      MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+      if (Split.first.size() != IDVal.size()) {
+        Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
+        if (Variant == MCSymbolRefExpr::VK_Invalid) {
+          Variant = MCSymbolRefExpr::VK_None;
+          return TokError("invalid variant '" + Split.second + "'");
+        }
+        IDVal = Split.first;
+      }
+      if (IDVal == "f" || IDVal == "b") {
+        MCSymbol *Sym =
+            Ctx.GetDirectionalLocalSymbol(IntVal, IDVal == "f" ? 1 : 0);
+        Res = MCSymbolRefExpr::Create(Sym, Variant, getContext());
         if (IDVal == "b" && Sym->isUndefined())
           return Error(Loc, "invalid reference to undefined symbol");
         EndLoc = Lexer.getTok().getEndLoc();
@@ -838,27 +878,27 @@ bool AsmParser::ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
   }
   case AsmToken::LParen:
     Lex(); // Eat the '('.
-    return ParseParenExpr(Res, EndLoc);
+    return parseParenExpr(Res, EndLoc);
   case AsmToken::LBrac:
     if (!PlatformParser->HasBracketExpressions())
       return TokError("brackets expression not supported on this target");
     Lex(); // Eat the '['.
-    return ParseBracketExpr(Res, EndLoc);
+    return parseBracketExpr(Res, EndLoc);
   case AsmToken::Minus:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreateMinus(Res, getContext());
     return false;
   case AsmToken::Plus:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreatePlus(Res, getContext());
     return false;
   case AsmToken::Tilde:
     Lex(); // Eat the operator.
-    if (ParsePrimaryExpr(Res, EndLoc))
+    if (parsePrimaryExpr(Res, EndLoc))
       return true;
     Res = MCUnaryExpr::CreateNot(Res, getContext());
     return false;
@@ -870,13 +910,13 @@ bool AsmParser::parseExpression(const MCExpr *&Res) {
   return parseExpression(Res, EndLoc);
 }
 
-bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
-  return ParsePrimaryExpr(Res, EndLoc);
-}
-
 const MCExpr *
-AsmParser::ApplyModifierToExpr(const MCExpr *E,
+AsmParser::applyModifierToExpr(const MCExpr *E,
                                MCSymbolRefExpr::VariantKind Variant) {
+  // Ask the target implementation about this expression first.
+  const MCExpr *NewE = getTargetParser().applyModifierToExpr(E, Variant, Ctx);
+  if (NewE)
+    return NewE;
   // Recurse over the given expression, rebuilding it to apply the given variant
   // if there is exactly one symbol.
   switch (E->getKind()) {
@@ -888,8 +928,8 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
 
     if (SRE->getKind() != MCSymbolRefExpr::VK_None) {
-      TokError("invalid variant on expression '" +
-               getTok().getIdentifier() + "' (already modified)");
+      TokError("invalid variant on expression '" + getTok().getIdentifier() +
+               "' (already modified)");
       return E;
     }
 
@@ -898,7 +938,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
 
   case MCExpr::Unary: {
     const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
-    const MCExpr *Sub = ApplyModifierToExpr(UE->getSubExpr(), Variant);
+    const MCExpr *Sub = applyModifierToExpr(UE->getSubExpr(), Variant);
     if (!Sub)
       return 0;
     return MCUnaryExpr::Create(UE->getOpcode(), Sub, getContext());
@@ -906,14 +946,16 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
 
   case MCExpr::Binary: {
     const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
-    const MCExpr *LHS = ApplyModifierToExpr(BE->getLHS(), Variant);
-    const MCExpr *RHS = ApplyModifierToExpr(BE->getRHS(), Variant);
+    const MCExpr *LHS = applyModifierToExpr(BE->getLHS(), Variant);
+    const MCExpr *RHS = applyModifierToExpr(BE->getRHS(), Variant);
 
     if (!LHS && !RHS)
       return 0;
 
-    if (!LHS) LHS = BE->getLHS();
-    if (!RHS) RHS = BE->getRHS();
+    if (!LHS)
+      LHS = BE->getLHS();
+    if (!RHS)
+      RHS = BE->getRHS();
 
     return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, getContext());
   }
@@ -922,7 +964,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
   llvm_unreachable("Invalid expression kind!");
 }
 
-/// parseExpression - Parse an expression and return it.
+/// \brief Parse an expression and return it.
 ///
 ///  expr ::= expr &&,|| expr               -> lowest.
 ///  expr ::= expr |,^,&,! expr
@@ -935,7 +977,7 @@ AsmParser::ApplyModifierToExpr(const MCExpr *E,
 bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
   // Parse the expression.
   Res = 0;
-  if (ParsePrimaryExpr(Res, EndLoc) || ParseBinOpRHS(1, Res, EndLoc))
+  if (parsePrimaryExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc))
     return true;
 
   // As a special case, we support 'a op b @ modifier' by rewriting the
@@ -948,11 +990,11 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
       return TokError("unexpected symbol modifier following '@'");
 
     MCSymbolRefExpr::VariantKind Variant =
-      MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
+        MCSymbolRefExpr::getVariantKindForName(getTok().getIdentifier());
     if (Variant == MCSymbolRefExpr::VK_Invalid)
       return TokError("invalid variant '" + getTok().getIdentifier() + "'");
 
-    const MCExpr *ModifiedRes = ApplyModifierToExpr(Res, Variant);
+    const MCExpr *ModifiedRes = applyModifierToExpr(Res, Variant);
     if (!ModifiedRes) {
       return TokError("invalid modifier '" + getTok().getIdentifier() +
                       "' (no symbols present)");
@@ -972,8 +1014,7 @@ bool AsmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
 
 bool AsmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
   Res = 0;
-  return ParseParenExpr(Res, EndLoc) ||
-         ParseBinOpRHS(1, Res, EndLoc);
+  return parseParenExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc);
 }
 
 bool AsmParser::parseAbsoluteExpression(int64_t &Res) {
@@ -993,9 +1034,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
                                    MCBinaryExpr::Opcode &Kind) {
   switch (K) {
   default:
-    return 0;    // not a binop.
+    return 0; // not a binop.
 
-    // Lowest Precedence: &&, ||
+  // Lowest Precedence: &&, ||
   case AsmToken::AmpAmp:
     Kind = MCBinaryExpr::LAnd;
     return 1;
@@ -1003,10 +1044,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::LOr;
     return 1;
 
-
-    // Low Precedence: |, &, ^
-    //
-    // FIXME: gas seems to support '!' as an infix operator?
+  // Low Precedence: |, &, ^
+  //
+  // FIXME: gas seems to support '!' as an infix operator?
   case AsmToken::Pipe:
     Kind = MCBinaryExpr::Or;
     return 2;
@@ -1017,7 +1057,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::And;
     return 2;
 
-    // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
+  // Low Intermediate Precedence: ==, !=, <>, <, <=, >, >=
   case AsmToken::EqualEqual:
     Kind = MCBinaryExpr::EQ;
     return 3;
@@ -1038,7 +1078,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::GTE;
     return 3;
 
-    // Intermediate Precedence: <<, >>
+  // Intermediate Precedence: <<, >>
   case AsmToken::LessLess:
     Kind = MCBinaryExpr::Shl;
     return 4;
@@ -1046,7 +1086,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::Shr;
     return 4;
 
-    // High Intermediate Precedence: +, -
+  // High Intermediate Precedence: +, -
   case AsmToken::Plus:
     Kind = MCBinaryExpr::Add;
     return 5;
@@ -1054,7 +1094,7 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
     Kind = MCBinaryExpr::Sub;
     return 5;
 
-    // Highest Precedence: *, /, %
+  // Highest Precedence: *, /, %
   case AsmToken::Star:
     Kind = MCBinaryExpr::Mul;
     return 6;
@@ -1067,10 +1107,9 @@ static unsigned getBinOpPrecedence(AsmToken::TokenKind K,
   }
 }
 
-
-/// ParseBinOpRHS - Parse all binary operators with precedence >= 'Precedence'.
+/// \brief Parse all binary operators with precedence >= 'Precedence'.
 /// Res contains the LHS of the expression on input.
-bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
+bool AsmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
                               SMLoc &EndLoc) {
   while (1) {
     MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
@@ -1085,15 +1124,15 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
 
     // Eat the next primary expression.
     const MCExpr *RHS;
-    if (ParsePrimaryExpr(RHS, EndLoc)) return true;
+    if (parsePrimaryExpr(RHS, EndLoc))
+      return true;
 
     // If BinOp binds less tightly with RHS than the operator after RHS, let
     // the pending operator take RHS as its LHS.
     MCBinaryExpr::Opcode Dummy;
     unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
-    if (TokPrec < NextTokPrec) {
-      if (ParseBinOpRHS(TokPrec+1, RHS, EndLoc)) return true;
-    }
+    if (TokPrec < NextTokPrec && parseBinOpRHS(TokPrec + 1, RHS, EndLoc))
+      return true;
 
     // Merge LHS and RHS according to operator.
     Res = MCBinaryExpr::Create(Kind, Res, RHS, getContext());
@@ -1104,7 +1143,7 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
 ///   ::= EndOfStatement
 ///   ::= Label* Directive ...Operands... EndOfStatement
 ///   ::= Label* Identifier OperandList* EndOfStatement
-bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
+bool AsmParser::parseStatement(ParseStatementInfo &Info) {
   if (Lexer.is(AsmToken::EndOfStatement)) {
     Out.AddBlankLine();
     Lex();
@@ -1118,7 +1157,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   int64_t LocalLabelVal = -1;
   // A full line comment is a '#' as the first token.
   if (Lexer.is(AsmToken::Hash))
-    return ParseCppHashLineFilenameComment(IDLoc);
+    return parseCppHashLineFilenameComment(IDLoc);
 
   // Allow an integer followed by a ':' as a directional local label.
   if (Lexer.is(AsmToken::Integer)) {
@@ -1149,34 +1188,34 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   // have to do this so that .endif isn't skipped in a ".if 0" block for
   // example.
   StringMap<DirectiveKind>::const_iterator DirKindIt =
-    DirectiveKindMap.find(IDVal);
-  DirectiveKind DirKind =
-    (DirKindIt == DirectiveKindMap.end()) ? DK_NO_DIRECTIVE :
-                                            DirKindIt->getValue();
+      DirectiveKindMap.find(IDVal);
+  DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
+                              ? DK_NO_DIRECTIVE
+                              : DirKindIt->getValue();
   switch (DirKind) {
-    default:
-      break;
-    case DK_IF:
-      return ParseDirectiveIf(IDLoc);
-    case DK_IFB:
-      return ParseDirectiveIfb(IDLoc, true);
-    case DK_IFNB:
-      return ParseDirectiveIfb(IDLoc, false);
-    case DK_IFC:
-      return ParseDirectiveIfc(IDLoc, true);
-    case DK_IFNC:
-      return ParseDirectiveIfc(IDLoc, false);
-    case DK_IFDEF:
-      return ParseDirectiveIfdef(IDLoc, true);
-    case DK_IFNDEF:
-    case DK_IFNOTDEF:
-      return ParseDirectiveIfdef(IDLoc, false);
-    case DK_ELSEIF:
-      return ParseDirectiveElseIf(IDLoc);
-    case DK_ELSE:
-      return ParseDirectiveElse(IDLoc);
-    case DK_ENDIF:
-      return ParseDirectiveEndIf(IDLoc);
+  default:
+    break;
+  case DK_IF:
+    return parseDirectiveIf(IDLoc);
+  case DK_IFB:
+    return parseDirectiveIfb(IDLoc, true);
+  case DK_IFNB:
+    return parseDirectiveIfb(IDLoc, false);
+  case DK_IFC:
+    return parseDirectiveIfc(IDLoc, true);
+  case DK_IFNC:
+    return parseDirectiveIfc(IDLoc, false);
+  case DK_IFDEF:
+    return parseDirectiveIfdef(IDLoc, true);
+  case DK_IFNDEF:
+  case DK_IFNOTDEF:
+    return parseDirectiveIfdef(IDLoc, false);
+  case DK_ELSEIF:
+    return parseDirectiveElseIf(IDLoc);
+  case DK_ELSE:
+    return parseDirectiveElse(IDLoc);
+  case DK_ENDIF:
+    return parseDirectiveEndIf(IDLoc);
   }
 
   // Ignore the statement if in the middle of inactive conditional
@@ -1223,6 +1262,8 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
       MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
                                  IDLoc);
 
+    getTargetParser().onLabelParsed(Sym);
+
     // Consume any end of statement token, if present, to avoid spurious
     // AddBlankLine calls().
     if (Lexer.is(AsmToken::EndOfStatement)) {
@@ -1238,24 +1279,24 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
     // identifier '=' ... -> assignment statement
     Lex();
 
-    return ParseAssignment(IDVal, true);
+    return parseAssignment(IDVal, true);
 
   default: // Normal instruction or directive.
     break;
   }
 
   // If macros are enabled, check to see if this is a macro instantiation.
-  if (MacrosEnabled())
-    if (const MCAsmMacro *M = LookupMacro(IDVal)) {
-      return HandleMacroEntry(M, IDLoc);
+  if (areMacrosEnabled())
+    if (const MCAsmMacro *M = lookupMacro(IDVal)) {
+      return handleMacroEntry(M, IDLoc);
     }
 
   // Otherwise, we have a normal instruction or directive.
-  
+
   // Directives start with "."
   if (IDVal[0] == '.' && IDVal != ".") {
     // There are several entities interested in parsing directives:
-    // 
+    //
     // 1. The target-specific assembly parser. Some directives are target
     //    specific or may potentially behave differently on certain targets.
     // 2. Asm parser extensions. For example, platform-specific parsers
@@ -1272,185 +1313,185 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
 
     // Next, check the extention directive map to see if any extension has
     // registered itself to parse this directive.
-    std::pair<MCAsmParserExtension*, DirectiveHandler> Handler =
-      ExtensionDirectiveMap.lookup(IDVal);
+    std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
+        ExtensionDirectiveMap.lookup(IDVal);
     if (Handler.first)
       return (*Handler.second)(Handler.first, IDVal, IDLoc);
 
     // Finally, if no one else is interested in this directive, it must be
     // generic and familiar to this class.
     switch (DirKind) {
-      default:
-        break;
-      case DK_SET:
-      case DK_EQU:
-        return ParseDirectiveSet(IDVal, true);
-      case DK_EQUIV:
-        return ParseDirectiveSet(IDVal, false);
-      case DK_ASCII:
-        return ParseDirectiveAscii(IDVal, false);
-      case DK_ASCIZ:
-      case DK_STRING:
-        return ParseDirectiveAscii(IDVal, true);
-      case DK_BYTE:
-        return ParseDirectiveValue(1);
-      case DK_SHORT:
-      case DK_VALUE:
-      case DK_2BYTE:
-        return ParseDirectiveValue(2);
-      case DK_LONG:
-      case DK_INT:
-      case DK_4BYTE:
-        return ParseDirectiveValue(4);
-      case DK_QUAD:
-      case DK_8BYTE:
-        return ParseDirectiveValue(8);
-      case DK_SINGLE:
-      case DK_FLOAT:
-        return ParseDirectiveRealValue(APFloat::IEEEsingle);
-      case DK_DOUBLE:
-        return ParseDirectiveRealValue(APFloat::IEEEdouble);
-      case DK_ALIGN: {
-        bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes();
-        return ParseDirectiveAlign(IsPow2, /*ExprSize=*/1);
-      }
-      case DK_ALIGN32: {
-        bool IsPow2 = !getContext().getAsmInfo().getAlignmentIsInBytes();
-        return ParseDirectiveAlign(IsPow2, /*ExprSize=*/4);
-      }
-      case DK_BALIGN:
-        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
-      case DK_BALIGNW:
-        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
-      case DK_BALIGNL:
-        return ParseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
-      case DK_P2ALIGN:
-        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
-      case DK_P2ALIGNW:
-        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
-      case DK_P2ALIGNL:
-        return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
-      case DK_ORG:
-        return ParseDirectiveOrg();
-      case DK_FILL:
-        return ParseDirectiveFill();
-      case DK_ZERO:
-        return ParseDirectiveZero();
-      case DK_EXTERN:
-        eatToEndOfStatement(); // .extern is the default, ignore it.
-        return false;
-      case DK_GLOBL:
-      case DK_GLOBAL:
-        return ParseDirectiveSymbolAttribute(MCSA_Global);
-      case DK_INDIRECT_SYMBOL:
-        return ParseDirectiveSymbolAttribute(MCSA_IndirectSymbol);
-      case DK_LAZY_REFERENCE:
-        return ParseDirectiveSymbolAttribute(MCSA_LazyReference);
-      case DK_NO_DEAD_STRIP:
-        return ParseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
-      case DK_SYMBOL_RESOLVER:
-        return ParseDirectiveSymbolAttribute(MCSA_SymbolResolver);
-      case DK_PRIVATE_EXTERN:
-        return ParseDirectiveSymbolAttribute(MCSA_PrivateExtern);
-      case DK_REFERENCE:
-        return ParseDirectiveSymbolAttribute(MCSA_Reference);
-      case DK_WEAK_DEFINITION:
-        return ParseDirectiveSymbolAttribute(MCSA_WeakDefinition);
-      case DK_WEAK_REFERENCE:
-        return ParseDirectiveSymbolAttribute(MCSA_WeakReference);
-      case DK_WEAK_DEF_CAN_BE_HIDDEN:
-        return ParseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
-      case DK_COMM:
-      case DK_COMMON:
-        return ParseDirectiveComm(/*IsLocal=*/false);
-      case DK_LCOMM:
-        return ParseDirectiveComm(/*IsLocal=*/true);
-      case DK_ABORT:
-        return ParseDirectiveAbort();
-      case DK_INCLUDE:
-        return ParseDirectiveInclude();
-      case DK_INCBIN:
-        return ParseDirectiveIncbin();
-      case DK_CODE16:
-      case DK_CODE16GCC:
-        return TokError(Twine(IDVal) + " not supported yet");
-      case DK_REPT:
-        return ParseDirectiveRept(IDLoc);
-      case DK_IRP:
-        return ParseDirectiveIrp(IDLoc);
-      case DK_IRPC:
-        return ParseDirectiveIrpc(IDLoc);
-      case DK_ENDR:
-        return ParseDirectiveEndr(IDLoc);
-      case DK_BUNDLE_ALIGN_MODE:
-        return ParseDirectiveBundleAlignMode();
-      case DK_BUNDLE_LOCK:
-        return ParseDirectiveBundleLock();
-      case DK_BUNDLE_UNLOCK:
-        return ParseDirectiveBundleUnlock();
-      case DK_SLEB128:
-        return ParseDirectiveLEB128(true);
-      case DK_ULEB128:
-        return ParseDirectiveLEB128(false);
-      case DK_SPACE:
-      case DK_SKIP:
-        return ParseDirectiveSpace(IDVal);
-      case DK_FILE:
-        return ParseDirectiveFile(IDLoc);
-      case DK_LINE:
-        return ParseDirectiveLine();
-      case DK_LOC:
-        return ParseDirectiveLoc();
-      case DK_STABS:
-        return ParseDirectiveStabs();
-      case DK_CFI_SECTIONS:
-        return ParseDirectiveCFISections();
-      case DK_CFI_STARTPROC:
-        return ParseDirectiveCFIStartProc();
-      case DK_CFI_ENDPROC:
-        return ParseDirectiveCFIEndProc();
-      case DK_CFI_DEF_CFA:
-        return ParseDirectiveCFIDefCfa(IDLoc);
-      case DK_CFI_DEF_CFA_OFFSET:
-        return ParseDirectiveCFIDefCfaOffset();
-      case DK_CFI_ADJUST_CFA_OFFSET:
-        return ParseDirectiveCFIAdjustCfaOffset();
-      case DK_CFI_DEF_CFA_REGISTER:
-        return ParseDirectiveCFIDefCfaRegister(IDLoc);
-      case DK_CFI_OFFSET:
-        return ParseDirectiveCFIOffset(IDLoc);
-      case DK_CFI_REL_OFFSET:
-        return ParseDirectiveCFIRelOffset(IDLoc);
-      case DK_CFI_PERSONALITY:
-        return ParseDirectiveCFIPersonalityOrLsda(true);
-      case DK_CFI_LSDA:
-        return ParseDirectiveCFIPersonalityOrLsda(false);
-      case DK_CFI_REMEMBER_STATE:
-        return ParseDirectiveCFIRememberState();
-      case DK_CFI_RESTORE_STATE:
-        return ParseDirectiveCFIRestoreState();
-      case DK_CFI_SAME_VALUE:
-        return ParseDirectiveCFISameValue(IDLoc);
-      case DK_CFI_RESTORE:
-        return ParseDirectiveCFIRestore(IDLoc);
-      case DK_CFI_ESCAPE:
-        return ParseDirectiveCFIEscape();
-      case DK_CFI_SIGNAL_FRAME:
-        return ParseDirectiveCFISignalFrame();
-      case DK_CFI_UNDEFINED:
-        return ParseDirectiveCFIUndefined(IDLoc);
-      case DK_CFI_REGISTER:
-        return ParseDirectiveCFIRegister(IDLoc);
-      case DK_MACROS_ON:
-      case DK_MACROS_OFF:
-        return ParseDirectiveMacrosOnOff(IDVal);
-      case DK_MACRO:
-        return ParseDirectiveMacro(IDLoc);
-      case DK_ENDM:
-      case DK_ENDMACRO:
-        return ParseDirectiveEndMacro(IDVal);
-      case DK_PURGEM:
-        return ParseDirectivePurgeMacro(IDLoc);
+    default:
+      break;
+    case DK_SET:
+    case DK_EQU:
+      return parseDirectiveSet(IDVal, true);
+    case DK_EQUIV:
+      return parseDirectiveSet(IDVal, false);
+    case DK_ASCII:
+      return parseDirectiveAscii(IDVal, false);
+    case DK_ASCIZ:
+    case DK_STRING:
+      return parseDirectiveAscii(IDVal, true);
+    case DK_BYTE:
+      return parseDirectiveValue(1);
+    case DK_SHORT:
+    case DK_VALUE:
+    case DK_2BYTE:
+      return parseDirectiveValue(2);
+    case DK_LONG:
+    case DK_INT:
+    case DK_4BYTE:
+      return parseDirectiveValue(4);
+    case DK_QUAD:
+    case DK_8BYTE:
+      return parseDirectiveValue(8);
+    case DK_SINGLE:
+    case DK_FLOAT:
+      return parseDirectiveRealValue(APFloat::IEEEsingle);
+    case DK_DOUBLE:
+      return parseDirectiveRealValue(APFloat::IEEEdouble);
+    case DK_ALIGN: {
+      bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
+      return parseDirectiveAlign(IsPow2, /*ExprSize=*/1);
+    }
+    case DK_ALIGN32: {
+      bool IsPow2 = !getContext().getAsmInfo()->getAlignmentIsInBytes();
+      return parseDirectiveAlign(IsPow2, /*ExprSize=*/4);
+    }
+    case DK_BALIGN:
+      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/1);
+    case DK_BALIGNW:
+      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/2);
+    case DK_BALIGNL:
+      return parseDirectiveAlign(/*IsPow2=*/false, /*ExprSize=*/4);
+    case DK_P2ALIGN:
+      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/1);
+    case DK_P2ALIGNW:
+      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/2);
+    case DK_P2ALIGNL:
+      return parseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4);
+    case DK_ORG:
+      return parseDirectiveOrg();
+    case DK_FILL:
+      return parseDirectiveFill();
+    case DK_ZERO:
+      return parseDirectiveZero();
+    case DK_EXTERN:
+      eatToEndOfStatement(); // .extern is the default, ignore it.
+      return false;
+    case DK_GLOBL:
+    case DK_GLOBAL:
+      return parseDirectiveSymbolAttribute(MCSA_Global);
+    case DK_LAZY_REFERENCE:
+      return parseDirectiveSymbolAttribute(MCSA_LazyReference);
+    case DK_NO_DEAD_STRIP:
+      return parseDirectiveSymbolAttribute(MCSA_NoDeadStrip);
+    case DK_SYMBOL_RESOLVER:
+      return parseDirectiveSymbolAttribute(MCSA_SymbolResolver);
+    case DK_PRIVATE_EXTERN:
+      return parseDirectiveSymbolAttribute(MCSA_PrivateExtern);
+    case DK_REFERENCE:
+      return parseDirectiveSymbolAttribute(MCSA_Reference);
+    case DK_WEAK_DEFINITION:
+      return parseDirectiveSymbolAttribute(MCSA_WeakDefinition);
+    case DK_WEAK_REFERENCE:
+      return parseDirectiveSymbolAttribute(MCSA_WeakReference);
+    case DK_WEAK_DEF_CAN_BE_HIDDEN:
+      return parseDirectiveSymbolAttribute(MCSA_WeakDefAutoPrivate);
+    case DK_COMM:
+    case DK_COMMON:
+      return parseDirectiveComm(/*IsLocal=*/false);
+    case DK_LCOMM:
+      return parseDirectiveComm(/*IsLocal=*/true);
+    case DK_ABORT:
+      return parseDirectiveAbort();
+    case DK_INCLUDE:
+      return parseDirectiveInclude();
+    case DK_INCBIN:
+      return parseDirectiveIncbin();
+    case DK_CODE16:
+    case DK_CODE16GCC:
+      return TokError(Twine(IDVal) + " not supported yet");
+    case DK_REPT:
+      return parseDirectiveRept(IDLoc);
+    case DK_IRP:
+      return parseDirectiveIrp(IDLoc);
+    case DK_IRPC:
+      return parseDirectiveIrpc(IDLoc);
+    case DK_ENDR:
+      return parseDirectiveEndr(IDLoc);
+    case DK_BUNDLE_ALIGN_MODE:
+      return parseDirectiveBundleAlignMode();
+    case DK_BUNDLE_LOCK:
+      return parseDirectiveBundleLock();
+    case DK_BUNDLE_UNLOCK:
+      return parseDirectiveBundleUnlock();
+    case DK_SLEB128:
+      return parseDirectiveLEB128(true);
+    case DK_ULEB128:
+      return parseDirectiveLEB128(false);
+    case DK_SPACE:
+    case DK_SKIP:
+      return parseDirectiveSpace(IDVal);
+    case DK_FILE:
+      return parseDirectiveFile(IDLoc);
+    case DK_LINE:
+      return parseDirectiveLine();
+    case DK_LOC:
+      return parseDirectiveLoc();
+    case DK_STABS:
+      return parseDirectiveStabs();
+    case DK_CFI_SECTIONS:
+      return parseDirectiveCFISections();
+    case DK_CFI_STARTPROC:
+      return parseDirectiveCFIStartProc();
+    case DK_CFI_ENDPROC:
+      return parseDirectiveCFIEndProc();
+    case DK_CFI_DEF_CFA:
+      return parseDirectiveCFIDefCfa(IDLoc);
+    case DK_CFI_DEF_CFA_OFFSET:
+      return parseDirectiveCFIDefCfaOffset();
+    case DK_CFI_ADJUST_CFA_OFFSET:
+      return parseDirectiveCFIAdjustCfaOffset();
+    case DK_CFI_DEF_CFA_REGISTER:
+      return parseDirectiveCFIDefCfaRegister(IDLoc);
+    case DK_CFI_OFFSET:
+      return parseDirectiveCFIOffset(IDLoc);
+    case DK_CFI_REL_OFFSET:
+      return parseDirectiveCFIRelOffset(IDLoc);
+    case DK_CFI_PERSONALITY:
+      return parseDirectiveCFIPersonalityOrLsda(true);
+    case DK_CFI_LSDA:
+      return parseDirectiveCFIPersonalityOrLsda(false);
+    case DK_CFI_REMEMBER_STATE:
+      return parseDirectiveCFIRememberState();
+    case DK_CFI_RESTORE_STATE:
+      return parseDirectiveCFIRestoreState();
+    case DK_CFI_SAME_VALUE:
+      return parseDirectiveCFISameValue(IDLoc);
+    case DK_CFI_RESTORE:
+      return parseDirectiveCFIRestore(IDLoc);
+    case DK_CFI_ESCAPE:
+      return parseDirectiveCFIEscape();
+    case DK_CFI_SIGNAL_FRAME:
+      return parseDirectiveCFISignalFrame();
+    case DK_CFI_UNDEFINED:
+      return parseDirectiveCFIUndefined(IDLoc);
+    case DK_CFI_REGISTER:
+      return parseDirectiveCFIRegister(IDLoc);
+    case DK_CFI_WINDOW_SAVE:
+      return parseDirectiveCFIWindowSave();
+    case DK_MACROS_ON:
+    case DK_MACROS_OFF:
+      return parseDirectiveMacrosOnOff(IDVal);
+    case DK_MACRO:
+      return parseDirectiveMacro(IDLoc);
+    case DK_ENDM:
+    case DK_ENDMACRO:
+      return parseDirectiveEndMacro(IDVal);
+    case DK_PURGEM:
+      return parseDirectivePurgeMacro(IDLoc);
     }
 
     return Error(IDLoc, "unknown directive");
@@ -1459,19 +1500,19 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   // __asm _emit or __asm __emit
   if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
                            IDVal == "_EMIT" || IDVal == "__EMIT"))
-    return ParseDirectiveMSEmit(IDLoc, Info, IDVal.size());
+    return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());
 
   // __asm align
   if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
-    return ParseDirectiveMSAlign(IDLoc, Info);
+    return parseDirectiveMSAlign(IDLoc, Info);
 
   checkForValidSection();
 
   // Canonicalize the opcode to lower case.
   std::string OpcodeStr = IDVal.lower();
   ParseInstructionInfo IInfo(Info.AsmRewrites);
-  bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr,
-                                                     IDLoc, Info.ParsedOperands);
+  bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, IDLoc,
+                                                     Info.ParsedOperands);
   Info.ParseError = HadError;
 
   // Dump the parsed representation, if requested.
@@ -1486,7 +1527,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
     }
     OS << "]";
 
-    PrintMessage(IDLoc, SourceMgr::DK_Note, OS.str());
+    printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
   }
 
   // If we are generating dwarf for assembly source files and the current
@@ -1494,38 +1535,49 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   // the instruction.
   if (!HadError && getContext().getGenDwarfForAssembly() &&
       getContext().getGenDwarfSection() ==
-      getStreamer().getCurrentSection().first) {
+          getStreamer().getCurrentSection().first) {
 
     unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
 
     // If we previously parsed a cpp hash file line comment then make sure the
     // current Dwarf File is for the CppHashFilename if not then emit the
     // Dwarf File table for it and adjust the line number for the .loc.
-    const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles = 
-      getContext().getMCDwarfFiles();
+    const SmallVectorImpl<MCDwarfFile *> &MCDwarfFiles =
+        getContext().getMCDwarfFiles();
     if (CppHashFilename.size() != 0) {
       if (MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() !=
           CppHashFilename)
         getStreamer().EmitDwarfFileDirective(
-          getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename);
-
-       unsigned CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc,CppHashBuf);
-       Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
+            getContext().nextGenDwarfFileNumber(), StringRef(),
+            CppHashFilename);
+
+      // Since SrcMgr.FindLineNumber() is slow and messes up the SourceMgr's
+      // cache with the different Loc from the call above we save the last
+      // info we queried here with SrcMgr.FindLineNumber().
+      unsigned CppHashLocLineNo;
+      if (LastQueryIDLoc == CppHashLoc && LastQueryBuffer == CppHashBuf)
+        CppHashLocLineNo = LastQueryLine;
+      else {
+        CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc, CppHashBuf);
+        LastQueryLine = CppHashLocLineNo;
+        LastQueryIDLoc = CppHashLoc;
+        LastQueryBuffer = CppHashBuf;
+      }
+      Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
     }
 
-    getStreamer().EmitDwarfLocDirective(getContext().getGenDwarfFileNumber(),
-                                        Line, 0, DWARF2_LINE_DEFAULT_IS_STMT ?
-                                        DWARF2_FLAG_IS_STMT : 0, 0, 0,
-                                        StringRef());
+    getStreamer().EmitDwarfLocDirective(
+        getContext().getGenDwarfFileNumber(), Line, 0,
+        DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
+        StringRef());
   }
 
   // If parsing succeeded, match the instruction.
   if (!HadError) {
     unsigned ErrorInfo;
-    HadError = getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode,
-                                                         Info.ParsedOperands,
-                                                         Out, ErrorInfo,
-                                                         ParsingInlineAsm);
+    HadError = getTargetParser().MatchAndEmitInstruction(
+        IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
+        ParsingInlineAsm);
   }
 
   // Don't skip the rest of the line, the instruction parser is responsible for
@@ -1533,25 +1585,25 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
   return false;
 }
 
-/// EatToEndOfLine uses the Lexer to eat the characters to the end of the line
+/// eatToEndOfLine uses the Lexer to eat the characters to the end of the line
 /// since they may not be able to be tokenized to get to the end of line token.
-void AsmParser::EatToEndOfLine() {
+void AsmParser::eatToEndOfLine() {
   if (!Lexer.is(AsmToken::EndOfStatement))
     Lexer.LexUntilEndOfLine();
- // Eat EOL.
- Lex();
+  // Eat EOL.
+  Lex();
 }
 
-/// ParseCppHashLineFilenameComment as this:
+/// parseCppHashLineFilenameComment as this:
 ///   ::= # number "filename"
 /// or just as a full line comment if it doesn't have a number and a string.
-bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
+bool AsmParser::parseCppHashLineFilenameComment(const SMLoc &L) {
   Lex(); // Eat the hash token.
 
   if (getLexer().isNot(AsmToken::Integer)) {
     // Consume the line since in cases it is not a well-formed line directive,
     // as if were simply a full line comment.
-    EatToEndOfLine();
+    eatToEndOfLine();
     return false;
   }
 
@@ -1559,13 +1611,13 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
   Lex();
 
   if (getLexer().isNot(AsmToken::String)) {
-    EatToEndOfLine();
+    eatToEndOfLine();
     return false;
   }
 
   StringRef Filename = getTok().getString();
   // Get rid of the enclosing quotes.
-  Filename = Filename.substr(1, Filename.size()-2);
+  Filename = Filename.substr(1, Filename.size() - 2);
 
   // Save the SMLoc, Filename and LineNumber for later use by diagnostics.
   CppHashLoc = L;
@@ -1574,14 +1626,14 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
   CppHashBuf = CurBuffer;
 
   // Ignore any trailing characters, they're just comment.
-  EatToEndOfLine();
+  eatToEndOfLine();
   return false;
 }
 
-/// DiagHandler - will use the last parsed cpp hash line filename comment
+/// \brief will use the last parsed cpp hash line filename comment
 /// for the Filename and LineNo if any in the diagnostic.
 void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
-  const AsmParser *Parser = static_cast<const AsmParser*>(Context);
+  const AsmParser *Parser = static_cast<const AsmParser *>(Context);
   raw_ostream &OS = errs();
 
   const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
@@ -1589,19 +1641,18 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
   int DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
   int CppHashBuf = Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashLoc);
 
-  // Like SourceMgr::PrintMessage() we need to print the include stack if any
+  // Like SourceMgr::printMessage() we need to print the include stack if any
   // before printing the message.
   int DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
   if (!Parser->SavedDiagHandler && DiagCurBuffer > 0) {
-     SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
-     DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
+    SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
+    DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
   }
 
   // If we have not parsed a cpp hash line filename comment or the source
   // manager changed or buffer changed (like in a nested include) then just
   // print the normal diagnostic using its Filename and LineNo.
-  if (!Parser->CppHashLineNumber ||
-      &DiagSrcMgr != &Parser->SrcMgr ||
+  if (!Parser->CppHashLineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
       DiagBuf != CppHashBuf) {
     if (Parser->SavedDiagHandler)
       Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
@@ -1613,17 +1664,16 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
   // Use the CppHashFilename and calculate a line number based on the
   // CppHashLoc and CppHashLineNumber relative to this Diag's SMLoc for
   // the diagnostic.
-  const std::string Filename = Parser->CppHashFilename;
+  const std::string &Filename = Parser->CppHashFilename;
 
   int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
   int CppHashLocLineNo =
       Parser->SrcMgr.FindLineNumber(Parser->CppHashLoc, CppHashBuf);
-  int LineNo = Parser->CppHashLineNumber - 1 +
-               (DiagLocLineNo - CppHashLocLineNo);
+  int LineNo =
+      Parser->CppHashLineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);
 
-  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(),
-                       Filename, LineNo, Diag.getColumnNo(),
-                       Diag.getKind(), Diag.getMessage(),
+  SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
+                       Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
                        Diag.getLineContents(), Diag.getRanges());
 
   if (Parser->SavedDiagHandler)
@@ -1643,8 +1693,7 @@ static bool isIdentifierChar(char c) {
 
 bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
                             const MCAsmMacroParameters &Parameters,
-                            const MCAsmMacroArguments &A,
-                            const SMLoc &L) {
+                            const MCAsmMacroArguments &A, const SMLoc &L) {
   unsigned NParameters = Parameters.size();
   if (NParameters != 0 && NParameters != A.size())
     return Error(L, "Wrong number of arguments");
@@ -1680,27 +1729,28 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
       break;
 
     if (!NParameters) {
-      switch (Body[Pos+1]) {
-        // $$ => $
+      switch (Body[Pos + 1]) {
+      // $$ => $
       case '$':
         OS << '$';
         break;
 
-        // $n => number of arguments
+      // $n => number of arguments
       case 'n':
         OS << A.size();
         break;
 
-        // $[0-9] => argument
+      // $[0-9] => argument
       default: {
         // Missing arguments are ignored.
-        unsigned Index = Body[Pos+1] - '0';
+        unsigned Index = Body[Pos + 1] - '0';
         if (Index >= A.size())
           break;
 
         // Otherwise substitute with the token values, with spaces eliminated.
         for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
-               ie = A[Index].end(); it != ie; ++it)
+                                                ie = A[Index].end();
+             it != ie; ++it)
           OS << it->getString();
         break;
       }
@@ -1711,23 +1761,24 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
       while (isIdentifierChar(Body[I]) && I + 1 != End)
         ++I;
 
-      const char *Begin = Body.data() + Pos +1;
-      StringRef Argument(Begin, I - (Pos +1));
+      const char *Begin = Body.data() + Pos + 1;
+      StringRef Argument(Begin, I - (Pos + 1));
       unsigned Index = 0;
       for (; Index < NParameters; ++Index)
         if (Parameters[Index].first == Argument)
           break;
 
       if (Index == NParameters) {
-          if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
-            Pos += 3;
-          else {
-            OS << '\\' << Argument;
-            Pos = I;
-          }
+        if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+          Pos += 3;
+        else {
+          OS << '\\' << Argument;
+          Pos = I;
+        }
       } else {
         for (MCAsmMacroArgument::const_iterator it = A[Index].begin(),
-               ie = A[Index].end(); it != ie; ++it)
+                                                ie = A[Index].end();
+             it != ie; ++it)
           if (it->getKind() == AsmToken::String)
             OS << it->getStringContents();
           else
@@ -1743,48 +1794,43 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
   return false;
 }
 
-MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL,
-                                       int EB, SMLoc EL,
-                                       MemoryBuffer *I)
-  : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
-    ExitLoc(EL)
-{
-}
+MacroInstantiation::MacroInstantiation(const MCAsmMacro *M, SMLoc IL, int EB,
+                                       SMLoc EL, MemoryBuffer *I)
+    : TheMacro(M), Instantiation(I), InstantiationLoc(IL), ExitBuffer(EB),
+      ExitLoc(EL) {}
 
-static bool IsOperator(AsmToken::TokenKind kind)
-{
-  switch (kind)
-  {
-    default:
-      return false;
-    case AsmToken::Plus:
-    case AsmToken::Minus:
-    case AsmToken::Tilde:
-    case AsmToken::Slash:
-    case AsmToken::Star:
-    case AsmToken::Dot:
-    case AsmToken::Equal:
-    case AsmToken::EqualEqual:
-    case AsmToken::Pipe:
-    case AsmToken::PipePipe:
-    case AsmToken::Caret:
-    case AsmToken::Amp:
-    case AsmToken::AmpAmp:
-    case AsmToken::Exclaim:
-    case AsmToken::ExclaimEqual:
-    case AsmToken::Percent:
-    case AsmToken::Less:
-    case AsmToken::LessEqual:
-    case AsmToken::LessLess:
-    case AsmToken::LessGreater:
-    case AsmToken::Greater:
-    case AsmToken::GreaterEqual:
-    case AsmToken::GreaterGreater:
-      return true;
+static bool isOperator(AsmToken::TokenKind kind) {
+  switch (kind) {
+  default:
+    return false;
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Tilde:
+  case AsmToken::Slash:
+  case AsmToken::Star:
+  case AsmToken::Dot:
+  case AsmToken::Equal:
+  case AsmToken::EqualEqual:
+  case AsmToken::Pipe:
+  case AsmToken::PipePipe:
+  case AsmToken::Caret:
+  case AsmToken::Amp:
+  case AsmToken::AmpAmp:
+  case AsmToken::Exclaim:
+  case AsmToken::ExclaimEqual:
+  case AsmToken::Percent:
+  case AsmToken::Less:
+  case AsmToken::LessEqual:
+  case AsmToken::LessLess:
+  case AsmToken::LessGreater:
+  case AsmToken::Greater:
+  case AsmToken::GreaterEqual:
+  case AsmToken::GreaterGreater:
+    return true;
   }
 }
 
-bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
+bool AsmParser::parseMacroArgument(MCAsmMacroArgument &MA,
                                    AsmToken::TokenKind &ArgumentDelimiter) {
   unsigned ParenLevel = 0;
   unsigned AddTokens = 0;
@@ -1818,7 +1864,7 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
       // one into this argument
       if (ArgumentDelimiter == AsmToken::Space ||
           ArgumentDelimiter == AsmToken::Eof) {
-        if (IsOperator(Lexer.getKind())) {
+        if (isOperator(Lexer.getKind())) {
           // Check to see whether the token is used as an operator,
           // or part of an identifier
           const char *NextChar = getTok().getEndLoc().getPointer();
@@ -1828,14 +1874,14 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
 
         if (!AddTokens && ParenLevel == 0) {
           if (ArgumentDelimiter == AsmToken::Eof &&
-              !IsOperator(Lexer.getKind()))
+              !isOperator(Lexer.getKind()))
             ArgumentDelimiter = AsmToken::Space;
           break;
         }
       }
     }
 
-    // HandleMacroEntry relies on not advancing the lexer here
+    // handleMacroEntry relies on not advancing the lexer here
     // to be able to fill in the remaining default parameter values
     if (Lexer.is(AsmToken::EndOfStatement))
       break;
@@ -1860,10 +1906,11 @@ bool AsmParser::ParseMacroArgument(MCAsmMacroArgument &MA,
 }
 
 // Parse the macro instantiation arguments.
-bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A) {
+bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
+                                    MCAsmMacroArguments &A) {
   const unsigned NParameters = M ? M->Parameters.size() : 0;
   // Argument delimiter is initially unknown. It will be set by
-  // ParseMacroArgument()
+  // parseMacroArgument()
   AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
 
   // Parse two kinds of macro invocations:
@@ -1873,7 +1920,7 @@ bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A)
        ++Parameter) {
     MCAsmMacroArgument MA;
 
-    if (ParseMacroArgument(MA, ArgumentDelimiter))
+    if (parseMacroArgument(MA, ArgumentDelimiter))
       return true;
 
     if (!MA.empty() || !NParameters)
@@ -1904,31 +1951,31 @@ bool AsmParser::ParseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A)
   return TokError("Too many arguments");
 }
 
-const MCAsmMacro* AsmParser::LookupMacro(StringRef Name) {
-  StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+const MCAsmMacro *AsmParser::lookupMacro(StringRef Name) {
+  StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
   return (I == MacroMap.end()) ? NULL : I->getValue();
 }
 
-void AsmParser::DefineMacro(StringRef Name, const MCAsmMacro& Macro) {
+void AsmParser::defineMacro(StringRef Name, const MCAsmMacro &Macro) {
   MacroMap[Name] = new MCAsmMacro(Macro);
 }
 
-void AsmParser::UndefineMacro(StringRef Name) {
-  StringMap<MCAsmMacro*>::iterator I = MacroMap.find(Name);
+void AsmParser::undefineMacro(StringRef Name) {
+  StringMap<MCAsmMacro *>::iterator I = MacroMap.find(Name);
   if (I != MacroMap.end()) {
     delete I->getValue();
     MacroMap.erase(I);
   }
 }
 
-bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
+bool AsmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
   // Arbitrarily limit macro nesting depth, to match 'as'. We can eliminate
   // this, although we should protect against infinite loops.
   if (ActiveMacros.size() == 20)
     return TokError("macros cannot be nested more than 20 levels deep");
 
   MCAsmMacroArguments A;
-  if (ParseMacroArguments(M, A))
+  if (parseMacroArguments(M, A))
     return true;
 
   // Remove any trailing empty arguments. Do this after-the-fact as we have
@@ -1951,14 +1998,12 @@ bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
   OS << ".endmacro\n";
 
   MemoryBuffer *Instantiation =
-    MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+      MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
 
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
-  MacroInstantiation *MI = new MacroInstantiation(M, NameLoc,
-                                                  CurBuffer,
-                                                  getTok().getLoc(),
-                                                  Instantiation);
+  MacroInstantiation *MI = new MacroInstantiation(
+      M, NameLoc, CurBuffer, getTok().getLoc(), Instantiation);
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
@@ -1969,9 +2014,9 @@ bool AsmParser::HandleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
   return false;
 }
 
-void AsmParser::HandleMacroExit() {
+void AsmParser::handleMacroExit() {
   // Jump to the EndOfStatement we should return to, and consume it.
-  JumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
+  jumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
   Lex();
 
   // Pop the instantiation entry.
@@ -1979,29 +2024,30 @@ void AsmParser::HandleMacroExit() {
   ActiveMacros.pop_back();
 }
 
-static bool IsUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
+static bool isUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
   switch (Value->getKind()) {
   case MCExpr::Binary: {
-    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Value);
-    return IsUsedIn(Sym, BE->getLHS()) || IsUsedIn(Sym, BE->getRHS());
+    const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Value);
+    return isUsedIn(Sym, BE->getLHS()) || isUsedIn(Sym, BE->getRHS());
   }
   case MCExpr::Target:
   case MCExpr::Constant:
     return false;
   case MCExpr::SymbolRef: {
-    const MCSymbol &S = static_cast<const MCSymbolRefExpr*>(Value)->getSymbol();
+    const MCSymbol &S =
+        static_cast<const MCSymbolRefExpr *>(Value)->getSymbol();
     if (S.isVariable())
-      return IsUsedIn(Sym, S.getVariableValue());
+      return isUsedIn(Sym, S.getVariableValue());
     return &S == Sym;
   }
   case MCExpr::Unary:
-    return IsUsedIn(Sym, static_cast<const MCUnaryExpr*>(Value)->getSubExpr());
+    return isUsedIn(Sym, static_cast<const MCUnaryExpr *>(Value)->getSubExpr());
   }
 
   llvm_unreachable("Unknown expr kind!");
 }
 
-bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
+bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
                                 bool NoDeadStrip) {
   // FIXME: Use better location, we should use proper tokens.
   SMLoc EqualLoc = Lexer.getLoc();
@@ -2034,7 +2080,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
     //
     // FIXME: Diagnostics. Note the location of the definition as a label.
     // FIXME: Diagnose assignment to protected identifier (e.g., register name).
-    if (IsUsedIn(Sym, Value))
+    if (isUsedIn(Sym, Value))
       return Error(EqualLoc, "Recursive use of '" + Name + "'");
     else if (Sym->isUndefined() && !Sym->isUsed() && !Sym->isVariable())
       ; // Allow redefinitions of undefined symbols only used in directives.
@@ -2046,7 +2092,7 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
       return Error(EqualLoc, "invalid assignment to '" + Name + "'");
     else if (!isa<MCConstantExpr>(Sym->getVariableValue()))
       return Error(EqualLoc, "invalid reassignment of non-absolute variable '" +
-                   Name + "'");
+                                 Name + "'");
 
     // Don't count these checks as uses.
     Sym->setUsed(false);
@@ -2060,7 +2106,6 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
   if (NoDeadStrip)
     Out.EmitSymbolAttribute(Sym, MCSA_NoDeadStrip);
 
-
   return false;
 }
 
@@ -2069,31 +2114,30 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
 ///   ::= string
 bool AsmParser::parseIdentifier(StringRef &Res) {
   // The assembler has relaxed rules for accepting identifiers, in particular we
-  // allow things like '.globl $foo', which would normally be separate
-  // tokens. At this level, we have already lexed so we cannot (currently)
+  // allow things like '.globl $foo' and '.def @feat.00', which would normally be
+  // separate tokens. At this level, we have already lexed so we cannot (currently)
   // handle this as a context dependent token, instead we detect adjacent tokens
   // and return the combined identifier.
-  if (Lexer.is(AsmToken::Dollar)) {
-    SMLoc DollarLoc = getLexer().getLoc();
+  if (Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At)) {
+    SMLoc PrefixLoc = getLexer().getLoc();
 
-    // Consume the dollar sign, and check for a following identifier.
+    // Consume the prefix character, and check for a following identifier.
     Lex();
     if (Lexer.isNot(AsmToken::Identifier))
       return true;
 
-    // We have a '$' followed by an identifier, make sure they are adjacent.
-    if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+    // We have a '$' or '@' followed by an identifier, make sure they are adjacent.
+    if (PrefixLoc.getPointer() + 1 != getTok().getLoc().getPointer())
       return true;
 
     // Construct the joined identifier and consume the token.
-    Res = StringRef(DollarLoc.getPointer(),
-                    getTok().getIdentifier().size() + 1);
+    Res =
+        StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
     Lex();
     return false;
   }
 
-  if (Lexer.isNot(AsmToken::Identifier) &&
-      Lexer.isNot(AsmToken::String))
+  if (Lexer.isNot(AsmToken::Identifier) && Lexer.isNot(AsmToken::String))
     return true;
 
   Res = getTok().getIdentifier();
@@ -2103,11 +2147,11 @@ bool AsmParser::parseIdentifier(StringRef &Res) {
   return false;
 }
 
-/// ParseDirectiveSet:
+/// parseDirectiveSet:
 ///   ::= .equ identifier ',' expression
 ///   ::= .equiv identifier ',' expression
 ///   ::= .set identifier ',' expression
-bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
+bool AsmParser::parseDirectiveSet(StringRef IDVal, bool allow_redef) {
   StringRef Name;
 
   if (parseIdentifier(Name))
@@ -2117,7 +2161,7 @@ bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
     return TokError("unexpected token in '" + Twine(IDVal) + "'");
   Lex();
 
-  return ParseAssignment(Name, allow_redef, true);
+  return parseAssignment(Name, allow_redef, true);
 }
 
 bool AsmParser::parseEscapedString(std::string &Data) {
@@ -2138,15 +2182,15 @@ bool AsmParser::parseEscapedString(std::string &Data) {
       return TokError("unexpected backslash at end of string");
 
     // Recognize octal sequences.
-    if ((unsigned) (Str[i] - '0') <= 7) {
+    if ((unsigned)(Str[i] - '0') <= 7) {
       // Consume up to three octal characters.
       unsigned Value = Str[i] - '0';
 
-      if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+      if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
         ++i;
         Value = Value * 8 + (Str[i] - '0');
 
-        if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+        if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
           ++i;
           Value = Value * 8 + (Str[i] - '0');
         }
@@ -2155,7 +2199,7 @@ bool AsmParser::parseEscapedString(std::string &Data) {
       if (Value > 255)
         return TokError("invalid octal escape sequence (out of range)");
 
-      Data += (unsigned char) Value;
+      Data += (unsigned char)Value;
       continue;
     }
 
@@ -2178,9 +2222,9 @@ bool AsmParser::parseEscapedString(std::string &Data) {
   return false;
 }
 
-/// ParseDirectiveAscii:
+/// parseDirectiveAscii:
 ///   ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
-bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
+bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     checkForValidSection();
 
@@ -2192,9 +2236,9 @@ bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
       if (parseEscapedString(Data))
         return true;
 
-      getStreamer().EmitBytes(Data, DEFAULT_ADDRSPACE);
+      getStreamer().EmitBytes(Data);
       if (ZeroTerminated)
-        getStreamer().EmitBytes(StringRef("\0", 1), DEFAULT_ADDRSPACE);
+        getStreamer().EmitBytes(StringRef("\0", 1));
 
       Lex();
 
@@ -2211,9 +2255,9 @@ bool AsmParser::ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
   return false;
 }
 
-/// ParseDirectiveValue
+/// parseDirectiveValue
 ///  ::= (.byte | .short | ... ) [ expression (, expression)* ]
-bool AsmParser::ParseDirectiveValue(unsigned Size) {
+bool AsmParser::parseDirectiveValue(unsigned Size) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     checkForValidSection();
 
@@ -2229,9 +2273,9 @@ bool AsmParser::ParseDirectiveValue(unsigned Size) {
         uint64_t IntValue = MCE->getValue();
         if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
           return Error(ExprLoc, "literal value out of range for directive");
-        getStreamer().EmitIntValue(IntValue, Size, DEFAULT_ADDRSPACE);
+        getStreamer().EmitIntValue(IntValue, Size);
       } else
-        getStreamer().EmitValue(Value, Size, DEFAULT_ADDRSPACE);
+        getStreamer().EmitValue(Value, Size);
 
       if (getLexer().is(AsmToken::EndOfStatement))
         break;
@@ -2247,9 +2291,9 @@ bool AsmParser::ParseDirectiveValue(unsigned Size) {
   return false;
 }
 
-/// ParseDirectiveRealValue
+/// parseDirectiveRealValue
 ///  ::= (.single | .double) [ expression (, expression)* ]
-bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
+bool AsmParser::parseDirectiveRealValue(const fltSemantics &Semantics) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     checkForValidSection();
 
@@ -2279,7 +2323,7 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
         else
           return TokError("invalid floating point literal");
       } else if (Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) ==
-          APFloat::opInvalidOp)
+                 APFloat::opInvalidOp)
         return TokError("invalid floating point literal");
       if (IsNeg)
         Value.changeSign();
@@ -2290,7 +2334,7 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
       // Emit the value as an integer.
       APInt AsInt = Value.bitcastToAPInt();
       getStreamer().EmitIntValue(AsInt.getLimitedValue(),
-                                 AsInt.getBitWidth() / 8, DEFAULT_ADDRSPACE);
+                                 AsInt.getBitWidth() / 8);
 
       if (getLexer().is(AsmToken::EndOfStatement))
         break;
@@ -2305,9 +2349,9 @@ bool AsmParser::ParseDirectiveRealValue(const fltSemantics &Semantics) {
   return false;
 }
 
-/// ParseDirectiveZero
+/// parseDirectiveZero
 ///  ::= .zero expression
-bool AsmParser::ParseDirectiveZero() {
+bool AsmParser::parseDirectiveZero() {
   checkForValidSection();
 
   int64_t NumBytes;
@@ -2326,53 +2370,58 @@ bool AsmParser::ParseDirectiveZero() {
 
   Lex();
 
-  getStreamer().EmitFill(NumBytes, Val, DEFAULT_ADDRSPACE);
+  getStreamer().EmitFill(NumBytes, Val);
 
   return false;
 }
 
-/// ParseDirectiveFill
-///  ::= .fill expression , expression , expression
-bool AsmParser::ParseDirectiveFill() {
+/// parseDirectiveFill
+///  ::= .fill expression [ , expression [ , expression ] ]
+bool AsmParser::parseDirectiveFill() {
   checkForValidSection();
 
   int64_t NumValues;
   if (parseAbsoluteExpression(NumValues))
     return true;
 
-  if (getLexer().isNot(AsmToken::Comma))
-    return TokError("unexpected token in '.fill' directive");
-  Lex();
+  int64_t FillSize = 1;
+  int64_t FillExpr = 0;
 
-  int64_t FillSize;
-  if (parseAbsoluteExpression(FillSize))
-    return true;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '.fill' directive");
+    Lex();
 
-  if (getLexer().isNot(AsmToken::Comma))
-    return TokError("unexpected token in '.fill' directive");
-  Lex();
+    if (parseAbsoluteExpression(FillSize))
+      return true;
 
-  int64_t FillExpr;
-  if (parseAbsoluteExpression(FillExpr))
-    return true;
+    if (getLexer().isNot(AsmToken::EndOfStatement)) {
+      if (getLexer().isNot(AsmToken::Comma))
+        return TokError("unexpected token in '.fill' directive");
+      Lex();
 
-  if (getLexer().isNot(AsmToken::EndOfStatement))
-    return TokError("unexpected token in '.fill' directive");
+      if (parseAbsoluteExpression(FillExpr))
+        return true;
 
-  Lex();
+      if (getLexer().isNot(AsmToken::EndOfStatement))
+        return TokError("unexpected token in '.fill' directive");
+
+      Lex();
+    }
+  }
 
   if (FillSize != 1 && FillSize != 2 && FillSize != 4 && FillSize != 8)
     return TokError("invalid '.fill' size, expected 1, 2, 4, or 8");
 
   for (uint64_t i = 0, e = NumValues; i != e; ++i)
-    getStreamer().EmitIntValue(FillExpr, FillSize, DEFAULT_ADDRSPACE);
+    getStreamer().EmitIntValue(FillExpr, FillSize);
 
   return false;
 }
 
-/// ParseDirectiveOrg
+/// parseDirectiveOrg
 ///  ::= .org expression [ , expression ]
-bool AsmParser::ParseDirectiveOrg() {
+bool AsmParser::parseDirectiveOrg() {
   checkForValidSection();
 
   const MCExpr *Offset;
@@ -2405,9 +2454,9 @@ bool AsmParser::ParseDirectiveOrg() {
   return false;
 }
 
-/// ParseDirectiveAlign
+/// parseDirectiveAlign
 ///  ::= {.align, ...} expression [ , expression [ , expression ]]
-bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
+bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   checkForValidSection();
 
   SMLoc AlignmentLoc = getLexer().getLoc();
@@ -2471,13 +2520,13 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   if (MaxBytesLoc.isValid()) {
     if (MaxBytesToFill < 1) {
       Error(MaxBytesLoc, "alignment directive can never be satisfied in this "
-            "many bytes, ignoring maximum bytes expression");
+                         "many bytes, ignoring maximum bytes expression");
       MaxBytesToFill = 0;
     }
 
     if (MaxBytesToFill >= Alignment) {
       Warning(MaxBytesLoc, "maximum bytes expression exceeds alignment and "
-              "has no effect");
+                           "has no effect");
       MaxBytesToFill = 0;
     }
   }
@@ -2497,10 +2546,10 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
   return false;
 }
 
-/// ParseDirectiveFile
+/// parseDirectiveFile
 /// ::= .file [number] filename
 /// ::= .file number directory filename
-bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
   // FIXME: I'm not sure what this is.
   int64_t FileNumber = -1;
   SMLoc FileNumberLoc = getLexer().getLoc();
@@ -2516,17 +2565,21 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
     return TokError("unexpected token in '.file' directive");
 
   // Usually the directory and filename together, otherwise just the directory.
-  StringRef Path = getTok().getString();
-  Path = Path.substr(1, Path.size()-2);
+  // Allow the strings to have escaped octal character sequence.
+  std::string Path = getTok().getString();
+  if (parseEscapedString(Path))
+    return true;
   Lex();
 
   StringRef Directory;
   StringRef Filename;
+  std::string FilenameData;
   if (getLexer().is(AsmToken::String)) {
     if (FileNumber == -1)
       return TokError("explicit path specified, but no file number");
-    Filename = getTok().getString();
-    Filename = Filename.substr(1, Filename.size()-2);
+    if (parseEscapedString(FilenameData))
+      return true;
+    Filename = FilenameData;
     Directory = Path;
     Lex();
   } else {
@@ -2540,8 +2593,9 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
     getStreamer().EmitFileDirective(Filename);
   else {
     if (getContext().getGenDwarfForAssembly() == true)
-      Error(DirectiveLoc, "input can't have .file dwarf directives when -g is "
-                        "used to generate dwarf debug info for assembly code");
+      Error(DirectiveLoc,
+            "input can't have .file dwarf directives when -g is "
+            "used to generate dwarf debug info for assembly code");
 
     if (getStreamer().EmitDwarfFileDirective(FileNumber, Directory, Filename))
       Error(FileNumberLoc, "file number already allocated");
@@ -2550,15 +2604,15 @@ bool AsmParser::ParseDirectiveFile(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveLine
+/// parseDirectiveLine
 /// ::= .line [number]
-bool AsmParser::ParseDirectiveLine() {
+bool AsmParser::parseDirectiveLine() {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     if (getLexer().isNot(AsmToken::Integer))
       return TokError("unexpected token in '.line' directive");
 
     int64_t LineNumber = getTok().getIntVal();
-    (void) LineNumber;
+    (void)LineNumber;
     Lex();
 
     // FIXME: Do something with the .line.
@@ -2570,14 +2624,14 @@ bool AsmParser::ParseDirectiveLine() {
   return false;
 }
 
-/// ParseDirectiveLoc
+/// parseDirectiveLoc
 /// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
 ///                                [epilogue_begin] [is_stmt VALUE] [isa VALUE]
 /// The first number is a file number, must have been previously assigned with
 /// a .file directive, the second number is the line number and optionally the
 /// third number is a column position (zero if not specified).  The remaining
 /// optional items are .loc sub-directives.
-bool AsmParser::ParseDirectiveLoc() {
+bool AsmParser::parseDirectiveLoc() {
   if (getLexer().isNot(AsmToken::Integer))
     return TokError("unexpected token in '.loc' directive");
   int64_t FileNumber = getTok().getIntVal();
@@ -2590,8 +2644,8 @@ bool AsmParser::ParseDirectiveLoc() {
   int64_t LineNumber = 0;
   if (getLexer().is(AsmToken::Integer)) {
     LineNumber = getTok().getIntVal();
-    if (LineNumber < 1)
-      return TokError("line number less than one in '.loc' directive");
+    if (LineNumber < 0)
+      return TokError("line number less than zero in '.loc' directive");
     Lex();
   }
 
@@ -2671,15 +2725,15 @@ bool AsmParser::ParseDirectiveLoc() {
   return false;
 }
 
-/// ParseDirectiveStabs
+/// parseDirectiveStabs
 /// ::= .stabs string, number, number, number
-bool AsmParser::ParseDirectiveStabs() {
+bool AsmParser::parseDirectiveStabs() {
   return TokError("unsupported directive '.stabs'");
 }
 
-/// ParseDirectiveCFISections
+/// parseDirectiveCFISections
 /// ::= .cfi_sections section [, section]
-bool AsmParser::ParseDirectiveCFISections() {
+bool AsmParser::parseDirectiveCFISections() {
   StringRef Name;
   bool EH = false;
   bool Debug = false;
@@ -2708,40 +2762,40 @@ bool AsmParser::ParseDirectiveCFISections() {
   return false;
 }
 
-/// ParseDirectiveCFIStartProc
+/// parseDirectiveCFIStartProc
 /// ::= .cfi_startproc
-bool AsmParser::ParseDirectiveCFIStartProc() {
+bool AsmParser::parseDirectiveCFIStartProc() {
   getStreamer().EmitCFIStartProc();
   return false;
 }
 
-/// ParseDirectiveCFIEndProc
+/// parseDirectiveCFIEndProc
 /// ::= .cfi_endproc
-bool AsmParser::ParseDirectiveCFIEndProc() {
+bool AsmParser::parseDirectiveCFIEndProc() {
   getStreamer().EmitCFIEndProc();
   return false;
 }
 
-/// ParseRegisterOrRegisterNumber - parse register name or number.
-bool AsmParser::ParseRegisterOrRegisterNumber(int64_t &Register,
+/// \brief parse register name or number.
+bool AsmParser::parseRegisterOrRegisterNumber(int64_t &Register,
                                               SMLoc DirectiveLoc) {
   unsigned RegNo;
 
   if (getLexer().isNot(AsmToken::Integer)) {
     if (getTargetParser().ParseRegister(RegNo, DirectiveLoc, DirectiveLoc))
       return true;
-    Register = getContext().getRegisterInfo().getDwarfRegNum(RegNo, true);
+    Register = getContext().getRegisterInfo()->getDwarfRegNum(RegNo, true);
   } else
     return parseAbsoluteExpression(Register);
 
   return false;
 }
 
-/// ParseDirectiveCFIDefCfa
+/// parseDirectiveCFIDefCfa
 /// ::= .cfi_def_cfa register,  offset
-bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
   int64_t Register = 0;
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2756,9 +2810,9 @@ bool AsmParser::ParseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveCFIDefCfaOffset
+/// parseDirectiveCFIDefCfaOffset
 /// ::= .cfi_def_cfa_offset offset
-bool AsmParser::ParseDirectiveCFIDefCfaOffset() {
+bool AsmParser::parseDirectiveCFIDefCfaOffset() {
   int64_t Offset = 0;
   if (parseAbsoluteExpression(Offset))
     return true;
@@ -2767,11 +2821,11 @@ bool AsmParser::ParseDirectiveCFIDefCfaOffset() {
   return false;
 }
 
-/// ParseDirectiveCFIRegister
+/// parseDirectiveCFIRegister
 /// ::= .cfi_register register, register
-bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
   int64_t Register1 = 0;
-  if (ParseRegisterOrRegisterNumber(Register1, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2779,16 +2833,23 @@ bool AsmParser::ParseDirectiveCFIRegister(SMLoc DirectiveLoc) {
   Lex();
 
   int64_t Register2 = 0;
-  if (ParseRegisterOrRegisterNumber(Register2, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIRegister(Register1, Register2);
   return false;
 }
 
-/// ParseDirectiveCFIAdjustCfaOffset
+/// parseDirectiveCFIWindowSave
+/// ::= .cfi_window_save
+bool AsmParser::parseDirectiveCFIWindowSave() {
+  getStreamer().EmitCFIWindowSave();
+  return false;
+}
+
+/// parseDirectiveCFIAdjustCfaOffset
 /// ::= .cfi_adjust_cfa_offset adjustment
-bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() {
+bool AsmParser::parseDirectiveCFIAdjustCfaOffset() {
   int64_t Adjustment = 0;
   if (parseAbsoluteExpression(Adjustment))
     return true;
@@ -2797,24 +2858,24 @@ bool AsmParser::ParseDirectiveCFIAdjustCfaOffset() {
   return false;
 }
 
-/// ParseDirectiveCFIDefCfaRegister
+/// parseDirectiveCFIDefCfaRegister
 /// ::= .cfi_def_cfa_register register
-bool AsmParser::ParseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
   int64_t Register = 0;
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIDefCfaRegister(Register);
   return false;
 }
 
-/// ParseDirectiveCFIOffset
+/// parseDirectiveCFIOffset
 /// ::= .cfi_offset register, offset
-bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
   int64_t Register = 0;
   int64_t Offset = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2828,12 +2889,12 @@ bool AsmParser::ParseDirectiveCFIOffset(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveCFIRelOffset
+/// parseDirectiveCFIRelOffset
 /// ::= .cfi_rel_offset register, offset
-bool AsmParser::ParseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
   int64_t Register = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   if (getLexer().isNot(AsmToken::Comma))
@@ -2870,11 +2931,11 @@ static bool isValidEncoding(int64_t Encoding) {
   return true;
 }
 
-/// ParseDirectiveCFIPersonalityOrLsda
+/// parseDirectiveCFIPersonalityOrLsda
 /// IsPersonality true for cfi_personality, false for cfi_lsda
 /// ::= .cfi_personality encoding, [symbol_name]
 /// ::= .cfi_lsda encoding, [symbol_name]
-bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
+bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
   int64_t Encoding = 0;
   if (parseAbsoluteExpression(Encoding))
     return true;
@@ -2901,46 +2962,46 @@ bool AsmParser::ParseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
   return false;
 }
 
-/// ParseDirectiveCFIRememberState
+/// parseDirectiveCFIRememberState
 /// ::= .cfi_remember_state
-bool AsmParser::ParseDirectiveCFIRememberState() {
+bool AsmParser::parseDirectiveCFIRememberState() {
   getStreamer().EmitCFIRememberState();
   return false;
 }
 
-/// ParseDirectiveCFIRestoreState
+/// parseDirectiveCFIRestoreState
 /// ::= .cfi_remember_state
-bool AsmParser::ParseDirectiveCFIRestoreState() {
+bool AsmParser::parseDirectiveCFIRestoreState() {
   getStreamer().EmitCFIRestoreState();
   return false;
 }
 
-/// ParseDirectiveCFISameValue
+/// parseDirectiveCFISameValue
 /// ::= .cfi_same_value register
-bool AsmParser::ParseDirectiveCFISameValue(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
   int64_t Register = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFISameValue(Register);
   return false;
 }
 
-/// ParseDirectiveCFIRestore
+/// parseDirectiveCFIRestore
 /// ::= .cfi_restore register
-bool AsmParser::ParseDirectiveCFIRestore(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
   int64_t Register = 0;
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIRestore(Register);
   return false;
 }
 
-/// ParseDirectiveCFIEscape
+/// parseDirectiveCFIEscape
 /// ::= .cfi_escape expression[,...]
-bool AsmParser::ParseDirectiveCFIEscape() {
+bool AsmParser::parseDirectiveCFIEscape() {
   std::string Values;
   int64_t CurrValue;
   if (parseAbsoluteExpression(CurrValue))
@@ -2961,9 +3022,9 @@ bool AsmParser::ParseDirectiveCFIEscape() {
   return false;
 }
 
-/// ParseDirectiveCFISignalFrame
+/// parseDirectiveCFISignalFrame
 /// ::= .cfi_signal_frame
-bool AsmParser::ParseDirectiveCFISignalFrame() {
+bool AsmParser::parseDirectiveCFISignalFrame() {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(getLexer().getLoc(),
                  "unexpected token in '.cfi_signal_frame'");
@@ -2972,40 +3033,40 @@ bool AsmParser::ParseDirectiveCFISignalFrame() {
   return false;
 }
 
-/// ParseDirectiveCFIUndefined
+/// parseDirectiveCFIUndefined
 /// ::= .cfi_undefined register
-bool AsmParser::ParseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
   int64_t Register = 0;
 
-  if (ParseRegisterOrRegisterNumber(Register, DirectiveLoc))
+  if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
     return true;
 
   getStreamer().EmitCFIUndefined(Register);
   return false;
 }
 
-/// ParseDirectiveMacrosOnOff
+/// parseDirectiveMacrosOnOff
 /// ::= .macros_on
 /// ::= .macros_off
-bool AsmParser::ParseDirectiveMacrosOnOff(StringRef Directive) {
+bool AsmParser::parseDirectiveMacrosOnOff(StringRef Directive) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(getLexer().getLoc(),
                  "unexpected token in '" + Directive + "' directive");
 
-  SetMacrosEnabled(Directive == ".macros_on");
+  setMacrosEnabled(Directive == ".macros_on");
   return false;
 }
 
-/// ParseDirectiveMacro
+/// parseDirectiveMacro
 /// ::= .macro name [parameters]
-bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
   StringRef Name;
   if (parseIdentifier(Name))
     return TokError("expected identifier in '.macro' directive");
 
   MCAsmMacroParameters Parameters;
   // Argument delimiter is initially unknown. It will be set by
-  // ParseMacroArgument()
+  // parseMacroArgument()
   AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
@@ -3015,7 +3076,7 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
 
       if (getLexer().is(AsmToken::Equal)) {
         Lex();
-        if (ParseMacroArgument(Parameter.second, ArgumentDelimiter))
+        if (parseMacroArgument(Parameter.second, ArgumentDelimiter))
           return true;
       }
 
@@ -3055,19 +3116,19 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
     eatToEndOfStatement();
   }
 
-  if (LookupMacro(Name)) {
+  if (lookupMacro(Name)) {
     return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
   }
 
   const char *BodyStart = StartToken.getLoc().getPointer();
   const char *BodyEnd = EndToken.getLoc().getPointer();
   StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
-  CheckForBadMacro(DirectiveLoc, Name, Body, Parameters);
-  DefineMacro(Name, MCAsmMacro(Name, Body, Parameters));
+  checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
+  defineMacro(Name, MCAsmMacro(Name, Body, Parameters));
   return false;
 }
 
-/// CheckForBadMacro
+/// checkForBadMacro
 ///
 /// With the support added for named parameters there may be code out there that
 /// is transitioning from positional parameters.  In versions of gas that did
@@ -3081,7 +3142,7 @@ bool AsmParser::ParseDirectiveMacro(SMLoc DirectiveLoc) {
 /// intended or change the macro to use the named parameters.  It is possible
 /// this warning will trigger when the none of the named parameters are used
 /// and the strings like $1 are infact to simply to be passed trough unchanged.
-void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
+void AsmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
                                  StringRef Body,
                                  MCAsmMacroParameters Parameters) {
   // If this macro is not defined with named parameters the warning we are
@@ -3119,21 +3180,21 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
       break;
 
     if (Body[Pos] == '$') {
-      switch (Body[Pos+1]) {
-        // $$ => $
+      switch (Body[Pos + 1]) {
+      // $$ => $
       case '$':
         break;
 
-        // $n => number of arguments
+      // $n => number of arguments
       case 'n':
         PositionalParametersFound = true;
         break;
 
-        // $[0-9] => argument
+      // $[0-9] => argument
       default: {
         PositionalParametersFound = true;
         break;
-        }
+      }
       }
       Pos += 2;
     } else {
@@ -3141,19 +3202,19 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
       while (isIdentifierChar(Body[I]) && I + 1 != End)
         ++I;
 
-      const char *Begin = Body.data() + Pos +1;
-      StringRef Argument(Begin, I - (Pos +1));
+      const char *Begin = Body.data() + Pos + 1;
+      StringRef Argument(Begin, I - (Pos + 1));
       unsigned Index = 0;
       for (; Index < NParameters; ++Index)
         if (Parameters[Index].first == Argument)
           break;
 
       if (Index == NParameters) {
-          if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
-            Pos += 3;
-          else {
-            Pos = I;
-          }
+        if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+          Pos += 3;
+        else {
+          Pos = I;
+        }
       } else {
         NamedParametersFound = true;
         Pos += 1 + Argument.size();
@@ -3169,29 +3230,29 @@ void AsmParser::CheckForBadMacro(SMLoc DirectiveLoc, StringRef Name,
                           "found in body which will have no effect");
 }
 
-/// ParseDirectiveEndMacro
+/// parseDirectiveEndMacro
 /// ::= .endm
 /// ::= .endmacro
-bool AsmParser::ParseDirectiveEndMacro(StringRef Directive) {
+bool AsmParser::parseDirectiveEndMacro(StringRef Directive) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '" + Directive + "' directive");
 
   // If we are inside a macro instantiation, terminate the current
   // instantiation.
-  if (InsideMacroInstantiation()) {
-    HandleMacroExit();
+  if (isInsideMacroInstantiation()) {
+    handleMacroExit();
     return false;
   }
 
   // Otherwise, this .endmacro is a stray entry in the file; well formed
   // .endmacro directives are handled during the macro definition parsing.
   return TokError("unexpected '" + Directive + "' in file, "
-                  "no current macro definition");
+                                               "no current macro definition");
 }
 
-/// ParseDirectivePurgeMacro
+/// parseDirectivePurgeMacro
 /// ::= .purgem
-bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
   StringRef Name;
   if (parseIdentifier(Name))
     return TokError("expected identifier in '.purgem' directive");
@@ -3199,16 +3260,16 @@ bool AsmParser::ParseDirectivePurgeMacro(SMLoc DirectiveLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.purgem' directive");
 
-  if (!LookupMacro(Name))
+  if (!lookupMacro(Name))
     return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
 
-  UndefineMacro(Name);
+  undefineMacro(Name);
   return false;
 }
 
-/// ParseDirectiveBundleAlignMode
+/// parseDirectiveBundleAlignMode
 /// ::= {.bundle_align_mode} expression
-bool AsmParser::ParseDirectiveBundleAlignMode() {
+bool AsmParser::parseDirectiveBundleAlignMode() {
   checkForValidSection();
 
   // Expect a single argument: an expression that evaluates to a constant
@@ -3232,9 +3293,9 @@ bool AsmParser::ParseDirectiveBundleAlignMode() {
   return false;
 }
 
-/// ParseDirectiveBundleLock
+/// parseDirectiveBundleLock
 /// ::= {.bundle_lock} [align_to_end]
-bool AsmParser::ParseDirectiveBundleLock() {
+bool AsmParser::parseDirectiveBundleLock() {
   checkForValidSection();
   bool AlignToEnd = false;
 
@@ -3242,7 +3303,7 @@ bool AsmParser::ParseDirectiveBundleLock() {
     StringRef Option;
     SMLoc Loc = getTok().getLoc();
     const char *kInvalidOptionError =
-      "invalid option for '.bundle_lock' directive";
+        "invalid option for '.bundle_lock' directive";
 
     if (parseIdentifier(Option))
       return Error(Loc, kInvalidOptionError);
@@ -3261,9 +3322,9 @@ bool AsmParser::ParseDirectiveBundleLock() {
   return false;
 }
 
-/// ParseDirectiveBundleLock
+/// parseDirectiveBundleLock
 /// ::= {.bundle_lock}
-bool AsmParser::ParseDirectiveBundleUnlock() {
+bool AsmParser::parseDirectiveBundleUnlock() {
   checkForValidSection();
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
@@ -3274,9 +3335,9 @@ bool AsmParser::ParseDirectiveBundleUnlock() {
   return false;
 }
 
-/// ParseDirectiveSpace
+/// parseDirectiveSpace
 /// ::= (.skip | .space) expression [ , expression ]
-bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
+bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
   checkForValidSection();
 
   int64_t NumBytes;
@@ -3299,18 +3360,18 @@ bool AsmParser::ParseDirectiveSpace(StringRef IDVal) {
   Lex();
 
   if (NumBytes <= 0)
-    return TokError("invalid number of bytes in '" +
-                    Twine(IDVal) + "' directive");
+    return TokError("invalid number of bytes in '" + Twine(IDVal) +
+                    "' directive");
 
   // FIXME: Sometimes the fill expr is 'nop' if it isn't supplied, instead of 0.
-  getStreamer().EmitFill(NumBytes, FillExpr, DEFAULT_ADDRSPACE);
+  getStreamer().EmitFill(NumBytes, FillExpr);
 
   return false;
 }
 
-/// ParseDirectiveLEB128
+/// parseDirectiveLEB128
 /// ::= (.sleb128 | .uleb128) expression
-bool AsmParser::ParseDirectiveLEB128(bool Signed) {
+bool AsmParser::parseDirectiveLEB128(bool Signed) {
   checkForValidSection();
   const MCExpr *Value;
 
@@ -3328,9 +3389,9 @@ bool AsmParser::ParseDirectiveLEB128(bool Signed) {
   return false;
 }
 
-/// ParseDirectiveSymbolAttribute
+/// parseDirectiveSymbolAttribute
 ///  ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
-bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
+bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       StringRef Name;
@@ -3345,7 +3406,8 @@ bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
       if (Sym->isTemporary())
         return Error(Loc, "non-local symbol required in directive");
 
-      getStreamer().EmitSymbolAttribute(Sym, Attr);
+      if (!getStreamer().EmitSymbolAttribute(Sym, Attr))
+        return Error(Loc, "unable to emit symbol attribute");
 
       if (getLexer().is(AsmToken::EndOfStatement))
         break;
@@ -3360,9 +3422,9 @@ bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
   return false;
 }
 
-/// ParseDirectiveComm
+/// parseDirectiveComm
 ///  ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
-bool AsmParser::ParseDirectiveComm(bool IsLocal) {
+bool AsmParser::parseDirectiveComm(bool IsLocal) {
   checkForValidSection();
 
   SMLoc IDLoc = getLexer().getLoc();
@@ -3412,14 +3474,14 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
   // but a size of .lcomm creates a bss symbol of size zero.
   if (Size < 0)
     return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
-                 "be less than zero");
+                          "be less than zero");
 
   // NOTE: The alignment in the directive is a power of 2 value, the assembler
   // may internally end up wanting an alignment in bytes.
   // FIXME: Diagnose overflow.
   if (Pow2Alignment < 0)
     return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
-                 "alignment, can't be less than zero");
+                                   "alignment, can't be less than zero");
 
   if (!Sym->isUndefined())
     return Error(IDLoc, "invalid symbol redefinition");
@@ -3434,9 +3496,9 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
   return false;
 }
 
-/// ParseDirectiveAbort
+/// parseDirectiveAbort
 ///  ::= .abort [... message ...]
-bool AsmParser::ParseDirectiveAbort() {
+bool AsmParser::parseDirectiveAbort() {
   // FIXME: Use loc from directive.
   SMLoc Loc = getLexer().getLoc();
 
@@ -3455,25 +3517,25 @@ bool AsmParser::ParseDirectiveAbort() {
   return false;
 }
 
-/// ParseDirectiveInclude
+/// parseDirectiveInclude
 ///  ::= .include "filename"
-bool AsmParser::ParseDirectiveInclude() {
+bool AsmParser::parseDirectiveInclude() {
   if (getLexer().isNot(AsmToken::String))
     return TokError("expected string in '.include' directive");
 
-  std::string Filename = getTok().getString();
+  // Allow the strings to have escaped octal character sequence.
+  std::string Filename;
+  if (parseEscapedString(Filename))
+    return true;
   SMLoc IncludeLoc = getLexer().getLoc();
   Lex();
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.include' directive");
 
-  // Strip the quotes.
-  Filename = Filename.substr(1, Filename.size()-2);
-
   // Attempt to switch the lexer to the included file before consuming the end
   // of statement to avoid losing it when we switch.
-  if (EnterIncludeFile(Filename)) {
+  if (enterIncludeFile(Filename)) {
     Error(IncludeLoc, "Could not find include file '" + Filename + "'");
     return true;
   }
@@ -3481,24 +3543,24 @@ bool AsmParser::ParseDirectiveInclude() {
   return false;
 }
 
-/// ParseDirectiveIncbin
+/// parseDirectiveIncbin
 ///  ::= .incbin "filename"
-bool AsmParser::ParseDirectiveIncbin() {
+bool AsmParser::parseDirectiveIncbin() {
   if (getLexer().isNot(AsmToken::String))
     return TokError("expected string in '.incbin' directive");
 
-  std::string Filename = getTok().getString();
+  // Allow the strings to have escaped octal character sequence.
+  std::string Filename;
+  if (parseEscapedString(Filename))
+    return true;
   SMLoc IncbinLoc = getLexer().getLoc();
   Lex();
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.incbin' directive");
 
-  // Strip the quotes.
-  Filename = Filename.substr(1, Filename.size()-2);
-
   // Attempt to process the included file.
-  if (ProcessIncbinFile(Filename)) {
+  if (processIncbinFile(Filename)) {
     Error(IncbinLoc, "Could not find incbin file '" + Filename + "'");
     return true;
   }
@@ -3506,9 +3568,9 @@ bool AsmParser::ParseDirectiveIncbin() {
   return false;
 }
 
-/// ParseDirectiveIf
+/// parseDirectiveIf
 /// ::= .if expression
-bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIf(SMLoc DirectiveLoc) {
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
   if (TheCondState.Ignore) {
@@ -3530,9 +3592,9 @@ bool AsmParser::ParseDirectiveIf(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveIfb
+/// parseDirectiveIfb
 /// ::= .ifb string
-bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+bool AsmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
 
@@ -3553,16 +3615,16 @@ bool AsmParser::ParseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
   return false;
 }
 
-/// ParseDirectiveIfc
+/// parseDirectiveIfc
 /// ::= .ifc string1, string2
-bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
+bool AsmParser::parseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
 
   if (TheCondState.Ignore) {
     eatToEndOfStatement();
   } else {
-    StringRef Str1 = ParseStringToComma();
+    StringRef Str1 = parseStringToComma();
 
     if (getLexer().isNot(AsmToken::Comma))
       return TokError("unexpected token in '.ifc' directive");
@@ -3583,9 +3645,9 @@ bool AsmParser::ParseDirectiveIfc(SMLoc DirectiveLoc, bool ExpectEqual) {
   return false;
 }
 
-/// ParseDirectiveIfdef
+/// parseDirectiveIfdef
 /// ::= .ifdef symbol
-bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
+bool AsmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
   StringRef Name;
   TheCondStack.push_back(TheCondState);
   TheCondState.TheCond = AsmCond::IfCond;
@@ -3610,9 +3672,9 @@ bool AsmParser::ParseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
   return false;
 }
 
-/// ParseDirectiveElseIf
+/// parseDirectiveElseIf
 /// ::= .elseif expression
-bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveElseIf(SMLoc DirectiveLoc) {
   if (TheCondState.TheCond != AsmCond::IfCond &&
       TheCondState.TheCond != AsmCond::ElseIfCond)
     Error(DirectiveLoc, "Encountered a .elseif that doesn't follow a .if or "
@@ -3641,9 +3703,9 @@ bool AsmParser::ParseDirectiveElseIf(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveElse
+/// parseDirectiveElse
 /// ::= .else
-bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.else' directive");
 
@@ -3665,16 +3727,15 @@ bool AsmParser::ParseDirectiveElse(SMLoc DirectiveLoc) {
   return false;
 }
 
-/// ParseDirectiveEndIf
+/// parseDirectiveEndIf
 /// ::= .endif
-bool AsmParser::ParseDirectiveEndIf(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in '.endif' directive");
 
   Lex();
 
-  if ((TheCondState.TheCond == AsmCond::NoCond) ||
-      TheCondStack.empty())
+  if ((TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty())
     Error(DirectiveLoc, "Encountered a .endif that doesn't follow a .if or "
                         ".else");
   if (!TheCondStack.empty()) {
@@ -3718,7 +3779,6 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".extern"] = DK_EXTERN;
   DirectiveKindMap[".globl"] = DK_GLOBL;
   DirectiveKindMap[".global"] = DK_GLOBAL;
-  DirectiveKindMap[".indirect_symbol"] = DK_INDIRECT_SYMBOL;
   DirectiveKindMap[".lazy_reference"] = DK_LAZY_REFERENCE;
   DirectiveKindMap[".no_dead_strip"] = DK_NO_DEAD_STRIP;
   DirectiveKindMap[".symbol_resolver"] = DK_SYMBOL_RESOLVER;
@@ -3780,6 +3840,7 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
   DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
   DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
+  DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
   DirectiveKindMap[".macros_on"] = DK_MACROS_ON;
   DirectiveKindMap[".macros_off"] = DK_MACROS_OFF;
   DirectiveKindMap[".macro"] = DK_MACRO;
@@ -3788,8 +3849,7 @@ void AsmParser::initializeDirectiveKindMap() {
   DirectiveKindMap[".purgem"] = DK_PURGEM;
 }
 
-
-MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
+MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
   AsmToken EndToken, StartToken = getTok();
 
   unsigned NestLevel = 0;
@@ -3806,8 +3866,7 @@ MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
     }
 
     // Otherwise, check whether we have reached the .endr.
-    if (Lexer.is(AsmToken::Identifier) &&
-        getTok().getIdentifier() == ".endr") {
+    if (Lexer.is(AsmToken::Identifier) && getTok().getIdentifier() == ".endr") {
       if (NestLevel == 0) {
         EndToken = getTok();
         Lex();
@@ -3831,22 +3890,21 @@ MCAsmMacro *AsmParser::ParseMacroLikeBody(SMLoc DirectiveLoc) {
   // We Are Anonymous.
   StringRef Name;
   MCAsmMacroParameters Parameters;
-  return new MCAsmMacro(Name, Body, Parameters);
+  MacroLikeBodies.push_back(MCAsmMacro(Name, Body, Parameters));
+  return &MacroLikeBodies.back();
 }
 
-void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+void AsmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
                                          raw_svector_ostream &OS) {
   OS << ".endr\n";
 
   MemoryBuffer *Instantiation =
-    MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+      MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
 
   // Create the macro instantiation object and add to the current macro
   // instantiation stack.
-  MacroInstantiation *MI = new MacroInstantiation(M, DirectiveLoc,
-                                                  CurBuffer,
-                                                  getTok().getLoc(),
-                                                  Instantiation);
+  MacroInstantiation *MI = new MacroInstantiation(
+      M, DirectiveLoc, CurBuffer, getTok().getLoc(), Instantiation);
   ActiveMacros.push_back(MI);
 
   // Jump to the macro instantiation and prime the lexer.
@@ -3855,7 +3913,7 @@ void AsmParser::InstantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
   Lex();
 }
 
-bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveRept(SMLoc DirectiveLoc) {
   int64_t Count;
   if (parseAbsoluteExpression(Count))
     return TokError("unexpected token in '.rept' directive");
@@ -3870,7 +3928,7 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
   Lex();
 
   // Lex the rept definition.
-  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
   if (!M)
     return true;
 
@@ -3884,14 +3942,14 @@ bool AsmParser::ParseDirectiveRept(SMLoc DirectiveLoc) {
     if (expandMacro(OS, M->Body, Parameters, A, getTok().getLoc()))
       return true;
   }
-  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+  instantiateMacroLikeBody(M, DirectiveLoc, OS);
 
   return false;
 }
 
-/// ParseDirectiveIrp
+/// parseDirectiveIrp
 /// ::= .irp symbol,values
-bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
   MCAsmMacroParameters Parameters;
   MCAsmMacroParameter Parameter;
 
@@ -3906,14 +3964,14 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
   Lex();
 
   MCAsmMacroArguments A;
-  if (ParseMacroArguments(0, A))
+  if (parseMacroArguments(0, A))
     return true;
 
   // Eat the end of statement.
   Lex();
 
   // Lex the irp definition.
-  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
   if (!M)
     return true;
 
@@ -3930,14 +3988,14 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
       return true;
   }
 
-  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+  instantiateMacroLikeBody(M, DirectiveLoc, OS);
 
   return false;
 }
 
-/// ParseDirectiveIrpc
+/// parseDirectiveIrpc
 /// ::= .irpc symbol,values
-bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
   MCAsmMacroParameters Parameters;
   MCAsmMacroParameter Parameter;
 
@@ -3952,7 +4010,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
   Lex();
 
   MCAsmMacroArguments A;
-  if (ParseMacroArguments(0, A))
+  if (parseMacroArguments(0, A))
     return true;
 
   if (A.size() != 1 || A.front().size() != 1)
@@ -3962,7 +4020,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
   Lex();
 
   // Lex the irpc definition.
-  MCAsmMacro *M = ParseMacroLikeBody(DirectiveLoc);
+  MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
   if (!M)
     return true;
 
@@ -3975,7 +4033,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
   std::size_t I, End = Values.size();
   for (I = 0; I < End; ++I) {
     MCAsmMacroArgument Arg;
-    Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I+1)));
+    Arg.push_back(AsmToken(AsmToken::Identifier, Values.slice(I, I + 1)));
 
     MCAsmMacroArguments Args;
     Args.push_back(Arg);
@@ -3984,24 +4042,24 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
       return true;
   }
 
-  InstantiateMacroLikeBody(M, DirectiveLoc, OS);
+  instantiateMacroLikeBody(M, DirectiveLoc, OS);
 
   return false;
 }
 
-bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
+bool AsmParser::parseDirectiveEndr(SMLoc DirectiveLoc) {
   if (ActiveMacros.empty())
     return TokError("unmatched '.endr' directive");
 
   // The only .repl that should get here are the ones created by
-  // InstantiateMacroLikeBody.
+  // instantiateMacroLikeBody.
   assert(getLexer().is(AsmToken::EndOfStatement));
 
-  HandleMacroExit();
+  handleMacroExit();
   return false;
 }
 
-bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
+bool AsmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
                                      size_t Len) {
   const MCExpr *Value;
   SMLoc ExprLoc = getLexer().getLoc();
@@ -4018,7 +4076,7 @@ bool AsmParser::ParseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
   return false;
 }
 
-bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
+bool AsmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
   const MCExpr *Value;
   SMLoc ExprLoc = getLexer().getLoc();
   if (parseExpression(Value))
@@ -4030,16 +4088,15 @@ bool AsmParser::ParseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
   if (!isPowerOf2_64(IntValue))
     return Error(ExprLoc, "literal value not a power of two greater then zero");
 
-  Info.AsmRewrites->push_back(AsmRewrite(AOK_Align, IDLoc, 5,
-                                         Log2_64(IntValue)));
+  Info.AsmRewrites->push_back(
+      AsmRewrite(AOK_Align, IDLoc, 5, Log2_64(IntValue)));
   return false;
 }
 
 // We are comparing pointers, but the pointers are relative to a single string.
 // Thus, this should always be deterministic.
-static int RewritesSort(const void *A, const void *B) {
-  const AsmRewrite *AsmRewriteA = static_cast<const AsmRewrite *>(A);
-  const AsmRewrite *AsmRewriteB = static_cast<const AsmRewrite *>(B);
+static int rewritesSort(const AsmRewrite *AsmRewriteA,
+                        const AsmRewrite *AsmRewriteB) {
   if (AsmRewriteA->Loc.getPointer() < AsmRewriteB->Loc.getPointer())
     return -1;
   if (AsmRewriteB->Loc.getPointer() < AsmRewriteA->Loc.getPointer())
@@ -4049,25 +4106,22 @@ static int RewritesSort(const void *A, const void *B) {
   // rewrite to the same location.  Make sure the SizeDirective rewrite is
   // performed first, then the Imm/ImmPrefix and finally the Input/Output.  This
   // ensures the sort algorithm is stable.
-  if (AsmRewritePrecedence [AsmRewriteA->Kind] >
-      AsmRewritePrecedence [AsmRewriteB->Kind])
+  if (AsmRewritePrecedence[AsmRewriteA->Kind] >
+      AsmRewritePrecedence[AsmRewriteB->Kind])
     return -1;
 
-  if (AsmRewritePrecedence [AsmRewriteA->Kind] <
-      AsmRewritePrecedence [AsmRewriteB->Kind])
+  if (AsmRewritePrecedence[AsmRewriteA->Kind] <
+      AsmRewritePrecedence[AsmRewriteB->Kind])
     return 1;
-  llvm_unreachable ("Unstable rewrite sort.");
+  llvm_unreachable("Unstable rewrite sort.");
 }
 
-bool
-AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
-                            unsigned &NumOutputs, unsigned &NumInputs,
-                            SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
-                            SmallVectorImpl<std::string> &Constraints,
-                            SmallVectorImpl<std::string> &Clobbers,
-                            const MCInstrInfo *MII,
-                            const MCInstPrinter *IP,
-                            MCAsmParserSemaCallback &SI) {
+bool AsmParser::parseMSInlineAsm(
+    void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
+    unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+    SmallVectorImpl<std::string> &Constraints,
+    SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
+    const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
   SmallVector<void *, 4> InputDecls;
   SmallVector<void *, 4> OutputDecls;
   SmallVector<bool, 4> InputDeclsAddressOf;
@@ -4086,7 +4140,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
   unsigned OutputIdx = 0;
   while (getLexer().isNot(AsmToken::Eof)) {
     ParseStatementInfo Info(&AsmStrRewrites);
-    if (ParseStatement(Info))
+    if (parseStatement(Info))
       return true;
 
     if (Info.ParseError)
@@ -4174,7 +4228,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
   raw_string_ostream OS(AsmStringIR);
   const char *AsmStart = SrcMgr.getMemoryBuffer(0)->getBufferStart();
   const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd();
-  array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), RewritesSort);
+  array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
   for (SmallVectorImpl<AsmRewrite>::iterator I = AsmStrRewrites.begin(),
                                              E = AsmStrRewrites.end();
        I != E; ++I) {
@@ -4199,7 +4253,8 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
     unsigned AdditionalSkip = 0;
     // Rewrite expressions in $N notation.
     switch (Kind) {
-    default: break;
+    default:
+      break;
     case AOK_Imm:
       OS << "$$" << (*I).Val;
       break;
@@ -4254,8 +4309,7 @@ AsmParser::parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
 }
 
 /// \brief Create an MCAsmParser instance.
-MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM,
-                                     MCContext &C, MCStreamer &Out,
-                                     const MCAsmInfo &MAI) {
+MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
+                                     MCStreamer &Out, const MCAsmInfo &MAI) {
   return new AsmParser(SM, C, Out, MAI);
 }
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index a50eab2..d8343a3 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -35,6 +35,13 @@ class COFFAsmParser : public MCAsmParserExtension {
                           unsigned Characteristics,
                           SectionKind Kind);
 
+  bool ParseSectionSwitch(StringRef Section, unsigned Characteristics,
+                          SectionKind Kind, StringRef COMDATSymName,
+                          COFF::COMDATType Type, const MCSectionCOFF *Assoc);
+
+  bool ParseSectionName(StringRef &SectionName);
+  bool ParseSectionFlags(StringRef FlagsString, unsigned* Flags);
+
   virtual void Initialize(MCAsmParser &Parser) {
     // Call the base implementation.
     MCAsmParserExtension::Initialize(Parser);
@@ -42,11 +49,13 @@ class COFFAsmParser : public MCAsmParserExtension {
     addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveText>(".text");
     addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveData>(".data");
     addDirectiveHandler<&COFFAsmParser::ParseSectionDirectiveBSS>(".bss");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveSection>(".section");
     addDirectiveHandler<&COFFAsmParser::ParseDirectiveDef>(".def");
     addDirectiveHandler<&COFFAsmParser::ParseDirectiveScl>(".scl");
     addDirectiveHandler<&COFFAsmParser::ParseDirectiveType>(".type");
     addDirectiveHandler<&COFFAsmParser::ParseDirectiveEndef>(".endef");
     addDirectiveHandler<&COFFAsmParser::ParseDirectiveSecRel32>(".secrel32");
+    addDirectiveHandler<&COFFAsmParser::ParseDirectiveLinkOnce>(".linkonce");
 
     // Win64 EH directives.
     addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>(
@@ -100,11 +109,15 @@ class COFFAsmParser : public MCAsmParserExtension {
                               SectionKind::getBSS());
   }
 
+  bool ParseDirectiveSection(StringRef, SMLoc);
   bool ParseDirectiveDef(StringRef, SMLoc);
   bool ParseDirectiveScl(StringRef, SMLoc);
   bool ParseDirectiveType(StringRef, SMLoc);
   bool ParseDirectiveEndef(StringRef, SMLoc);
   bool ParseDirectiveSecRel32(StringRef, SMLoc);
+  bool parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
+                               const MCSectionCOFF *&Assoc);
+  bool ParseDirectiveLinkOnce(StringRef, SMLoc);
 
   // Win64 EH directives.
   bool ParseSEHDirectiveStartProc(StringRef, SMLoc);
@@ -130,6 +143,119 @@ public:
 
 } // end annonomous namespace.
 
+static SectionKind computeSectionKind(unsigned Flags) {
+  if (Flags & COFF::IMAGE_SCN_MEM_EXECUTE)
+    return SectionKind::getText();
+  if (Flags & COFF::IMAGE_SCN_MEM_READ &&
+      (Flags & COFF::IMAGE_SCN_MEM_WRITE) == 0)
+    return SectionKind::getReadOnly();
+  return SectionKind::getDataRel();
+}
+
+bool COFFAsmParser::ParseSectionFlags(StringRef FlagsString, unsigned* Flags) {
+  enum {
+    None      = 0,
+    Alloc     = 1 << 0,
+    Code      = 1 << 1,
+    Load      = 1 << 2,
+    InitData  = 1 << 3,
+    Shared    = 1 << 4,
+    NoLoad    = 1 << 5,
+    NoRead    = 1 << 6,
+    NoWrite  =  1 << 7
+  };
+
+  bool ReadOnlyRemoved = false;
+  unsigned SecFlags = None;
+
+  for (unsigned i = 0; i < FlagsString.size(); ++i) {
+    switch (FlagsString[i]) {
+    case 'a':
+      // Ignored.
+      break;
+
+    case 'b': // bss section
+      SecFlags |= Alloc;
+      if (SecFlags & InitData)
+        return TokError("conflicting section flags 'b' and 'd'.");
+      SecFlags &= ~Load;
+      break;
+
+    case 'd': // data section
+      SecFlags |= InitData;
+      if (SecFlags & Alloc)
+        return TokError("conflicting section flags 'b' and 'd'.");
+      SecFlags &= ~NoWrite;
+      if ((SecFlags & NoLoad) == 0)
+        SecFlags |= Load;
+      break;
+
+    case 'n': // section is not loaded
+      SecFlags |= NoLoad;
+      SecFlags &= ~Load;
+      break;
+
+    case 'r': // read-only
+      ReadOnlyRemoved = false;
+      SecFlags |= NoWrite;
+      if ((SecFlags & Code) == 0)
+        SecFlags |= InitData;
+      if ((SecFlags & NoLoad) == 0)
+        SecFlags |= Load;
+      break;
+
+    case 's': // shared section
+      SecFlags |= Shared | InitData;
+      SecFlags &= ~NoWrite;
+      if ((SecFlags & NoLoad) == 0)
+        SecFlags |= Load;
+      break;
+
+    case 'w': // writable
+      SecFlags &= ~NoWrite;
+      ReadOnlyRemoved = true;
+      break;
+
+    case 'x': // executable section
+      SecFlags |= Code;
+      if ((SecFlags & NoLoad) == 0)
+        SecFlags |= Load;
+      if (!ReadOnlyRemoved)
+        SecFlags |= NoWrite;
+      break;
+
+    case 'y': // not readable
+      SecFlags |= NoRead | NoWrite;
+      break;
+
+    default:
+      return TokError("unknown flag");
+    }
+  }
+
+  *Flags = 0;
+
+  if (SecFlags == None)
+    SecFlags = InitData;
+
+  if (SecFlags & Code)
+    *Flags |= COFF::IMAGE_SCN_CNT_CODE | COFF::IMAGE_SCN_MEM_EXECUTE;
+  if (SecFlags & InitData)
+    *Flags |= COFF::IMAGE_SCN_CNT_INITIALIZED_DATA;
+  if ((SecFlags & Alloc) && (SecFlags & Load) == 0)
+    *Flags |= COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
+  if (SecFlags & NoLoad)
+    *Flags |= COFF::IMAGE_SCN_LNK_REMOVE;
+  if ((SecFlags & NoRead) == 0)
+    *Flags |= COFF::IMAGE_SCN_MEM_READ;
+  if ((SecFlags & NoWrite) == 0)
+    *Flags |= COFF::IMAGE_SCN_MEM_WRITE;
+  if (SecFlags & Shared)
+    *Flags |= COFF::IMAGE_SCN_MEM_SHARED;
+
+  return false;
+}
+
 /// ParseDirectiveSymbolAttribute
 ///  ::= { ".weak", ... } [ identifier ( , identifier )* ]
 bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
@@ -164,13 +290,96 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
 bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
                                        unsigned Characteristics,
                                        SectionKind Kind) {
+  return ParseSectionSwitch(Section, Characteristics, Kind, "",
+                            COFF::IMAGE_COMDAT_SELECT_ANY, 0);
+}
+
+bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
+                                       unsigned Characteristics,
+                                       SectionKind Kind,
+                                       StringRef COMDATSymName,
+                                       COFF::COMDATType Type,
+                                       const MCSectionCOFF *Assoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in section switching directive");
   Lex();
 
   getStreamer().SwitchSection(getContext().getCOFFSection(
-                                Section, Characteristics, Kind));
+      Section, Characteristics, Kind, COMDATSymName, Type, Assoc));
+
+  return false;
+}
+
+bool COFFAsmParser::ParseSectionName(StringRef &SectionName) {
+  if (!getLexer().is(AsmToken::Identifier))
+    return true;
+
+  SectionName = getTok().getIdentifier();
+  Lex();
+  return false;
+}
+
+// .section name [, "flags"] [, identifier [ identifier ], identifier]
+//
+// Supported flags:
+//   a: Ignored.
+//   b: BSS section (uninitialized data)
+//   d: data section (initialized data)
+//   n: Discardable section
+//   r: Readable section
+//   s: Shared section
+//   w: Writable section
+//   x: Executable section
+//   y: Not-readable section (clears 'r')
+//
+// Subsections are not supported.
+bool COFFAsmParser::ParseDirectiveSection(StringRef, SMLoc) {
+  StringRef SectionName;
+
+  if (ParseSectionName(SectionName))
+    return TokError("expected identifier in directive");
+
+  unsigned Flags = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+                   COFF::IMAGE_SCN_MEM_READ |
+                   COFF::IMAGE_SCN_MEM_WRITE;
+
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    if (getLexer().isNot(AsmToken::String))
+      return TokError("expected string in directive");
+
+    StringRef FlagsStr = getTok().getStringContents();
+    Lex();
+
+    if (ParseSectionFlags(FlagsStr, &Flags))
+      return true;
+  }
+
+  COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
+  const MCSectionCOFF *Assoc = 0;
+  StringRef COMDATSymName;
+  if (getLexer().is(AsmToken::Comma)) {
+    Lex();
+
+    Flags |= COFF::IMAGE_SCN_LNK_COMDAT;
 
+    if (parseCOMDATTypeAndAssoc(Type, Assoc))
+      return true;
+
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("expected comma in directive");
+    Lex();
+
+    if (getParser().parseIdentifier(COMDATSymName))
+      return TokError("expected identifier in directive");
+  }
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  SectionKind Kind = computeSectionKind(Flags);
+  ParseSectionSwitch(SectionName, Flags, Kind, COMDATSymName, Type, Assoc);
   return false;
 }
 
@@ -235,6 +444,75 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
   return false;
 }
 
+/// ::= [ identifier [ identifier ] ]
+bool COFFAsmParser::parseCOMDATTypeAndAssoc(COFF::COMDATType &Type,
+                                            const MCSectionCOFF *&Assoc) {
+  StringRef TypeId = getTok().getIdentifier();
+
+  Type = StringSwitch<COFF::COMDATType>(TypeId)
+    .Case("one_only", COFF::IMAGE_COMDAT_SELECT_NODUPLICATES)
+    .Case("discard", COFF::IMAGE_COMDAT_SELECT_ANY)
+    .Case("same_size", COFF::IMAGE_COMDAT_SELECT_SAME_SIZE)
+    .Case("same_contents", COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH)
+    .Case("associative", COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+    .Case("largest", COFF::IMAGE_COMDAT_SELECT_LARGEST)
+    .Case("newest", COFF::IMAGE_COMDAT_SELECT_NEWEST)
+    .Default((COFF::COMDATType)0);
+
+  if (Type == 0)
+    return TokError(Twine("unrecognized COMDAT type '" + TypeId + "'"));
+
+  Lex();
+
+  if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+    SMLoc Loc = getTok().getLoc();
+    StringRef AssocName;
+    if (ParseSectionName(AssocName))
+      return TokError("expected associated section name");
+
+    Assoc = static_cast<const MCSectionCOFF*>(
+                                        getContext().getCOFFSection(AssocName));
+    if (!Assoc)
+      return Error(Loc, "cannot associate unknown section '" + AssocName + "'");
+    if (!(Assoc->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT))
+      return Error(Loc, "associated section must be a COMDAT section");
+    if (Assoc->getSelection() == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+      return Error(Loc, "associated section cannot be itself associative");
+  }
+
+  return false;
+}
+
+/// ParseDirectiveLinkOnce
+///  ::= .linkonce [ identifier [ identifier ] ]
+bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
+  COFF::COMDATType Type = COFF::IMAGE_COMDAT_SELECT_ANY;
+  const MCSectionCOFF *Assoc = 0;
+  if (getLexer().is(AsmToken::Identifier))
+    if (parseCOMDATTypeAndAssoc(Type, Assoc))
+      return true;
+
+  const MCSectionCOFF *Current = static_cast<const MCSectionCOFF*>(
+                                       getStreamer().getCurrentSection().first);
+
+
+  if (Type == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+    if (Assoc == Current)
+      return Error(Loc, "cannot associate a section with itself");
+  }
+
+  if (Current->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)
+    return Error(Loc, Twine("section '") + Current->getSectionName() +
+                                                       "' is already linkonce");
+
+  Current->setSelection(Type, Assoc);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in directive");
+
+  return false;
+}
+
 bool COFFAsmParser::ParseSEHDirectiveStartProc(StringRef, SMLoc) {
   StringRef SymbolID;
   if (getParser().parseIdentifier(SymbolID))
@@ -453,7 +731,7 @@ bool COFFAsmParser::ParseAtUnwindOrAtExcept(bool &unwind, bool &except) {
 bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) {
   SMLoc startLoc = getLexer().getLoc();
   if (getLexer().is(AsmToken::Percent)) {
-    const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
     SMLoc endLoc;
     unsigned LLVMRegNo;
     if (getParser().getTargetParser().ParseRegister(LLVMRegNo,startLoc,endLoc))
@@ -473,7 +751,7 @@ bool COFFAsmParser::ParseSEHRegisterNumber(unsigned &RegNo) {
       return Error(startLoc, "expected non-volatile register");
 #endif
 
-    int SEHRegNo = MRI.getSEHRegNum(LLVMRegNo);
+    int SEHRegNo = MRI->getSEHRegNum(LLVMRegNo);
     if (SEHRegNo < 0)
       return Error(startLoc,"register can't be represented in SEH unwind info");
     RegNo = SEHRegNo;
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 7eb8b74..4c9bafa 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -45,6 +45,8 @@ public:
     this->MCAsmParserExtension::Initialize(Parser);
 
     addDirectiveHandler<&DarwinAsmParser::ParseDirectiveDesc>(".desc");
+    addDirectiveHandler<&DarwinAsmParser::ParseDirectiveIndirectSymbol>(
+      ".indirect_symbol");
     addDirectiveHandler<&DarwinAsmParser::ParseDirectiveLsym>(".lsym");
     addDirectiveHandler<&DarwinAsmParser::ParseDirectiveSubsectionsViaSymbols>(
       ".subsections_via_symbols");
@@ -69,6 +71,7 @@ public:
       ".end_data_region");
 
     // Special section directives.
+    addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveBss>(".bss");
     addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConst>(".const");
     addDirectiveHandler<&DarwinAsmParser::ParseSectionDirectiveConstData>(
       ".const_data");
@@ -163,6 +166,7 @@ public:
   }
 
   bool ParseDirectiveDesc(StringRef, SMLoc);
+  bool ParseDirectiveIndirectSymbol(StringRef, SMLoc);
   bool ParseDirectiveDumpOrLoad(StringRef, SMLoc);
   bool ParseDirectiveLsym(StringRef, SMLoc);
   bool ParseDirectiveLinkerOption(StringRef, SMLoc);
@@ -179,6 +183,10 @@ public:
   bool ParseDirectiveDataRegionEnd(StringRef, SMLoc);
 
   // Named Section Directive
+  bool ParseSectionDirectiveBss(StringRef, SMLoc) {
+    return ParseSectionSwitch("__DATA", "__bss");
+  }
+
   bool ParseSectionDirectiveConst(StringRef, SMLoc) {
     return ParseSectionSwitch("__TEXT", "__const");
   }
@@ -415,6 +423,39 @@ bool DarwinAsmParser::ParseDirectiveDesc(StringRef, SMLoc) {
   return false;
 }
 
+/// ParseDirectiveIndirectSymbol
+///  ::= .indirect_symbol identifier
+bool DarwinAsmParser::ParseDirectiveIndirectSymbol(StringRef, SMLoc Loc) {
+  const MCSectionMachO *Current = static_cast<const MCSectionMachO*>(
+                                       getStreamer().getCurrentSection().first);
+  unsigned SectionType = Current->getType();
+  if (SectionType != MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS &&
+      SectionType != MCSectionMachO::S_LAZY_SYMBOL_POINTERS &&
+      SectionType != MCSectionMachO::S_SYMBOL_STUBS)
+    return Error(Loc, "indirect symbol not in a symbol pointer or stub "
+                      "section");
+
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return TokError("expected identifier in .indirect_symbol directive");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+
+  // Assembler local symbols don't make any sense here. Complain loudly.
+  if (Sym->isTemporary())
+    return TokError("non-local symbol required in directive");
+
+  if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_IndirectSymbol))
+    return TokError("unable to emit indirect symbol attribute for: " + Name);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '.indirect_symbol' directive");
+
+  Lex();
+
+  return false;
+}
+
 /// ParseDirectiveDumpOrLoad
 ///  ::= ( .dump | .load ) "filename"
 bool DarwinAsmParser::ParseDirectiveDumpOrLoad(StringRef Directive,
@@ -593,7 +634,7 @@ bool DarwinAsmParser::ParseDirectiveSecureLogUnique(StringRef, SMLoc IDLoc) {
   raw_ostream *OS = getContext().getSecureLog();
   if (OS == NULL) {
     std::string Err;
-    OS = new raw_fd_ostream(SecureLogFile, Err, raw_fd_ostream::F_Append);
+    OS = new raw_fd_ostream(SecureLogFile, Err, sys::fs::F_Append);
     if (!Err.empty()) {
        delete OS;
        return Error(IDLoc, Twine("can't open secure log file: ") +
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 3134fc3..8807975 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -16,6 +16,7 @@
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ELF.h"
 using namespace llvm;
 
@@ -30,14 +31,11 @@ class ELFAsmParser : public MCAsmParserExtension {
     getParser().addDirectiveHandler(Directive, Handler);
   }
 
-  bool ParseSectionSwitch(StringRef Section, unsigned Type,
-                          unsigned Flags, SectionKind Kind);
-  bool SeenIdent;
+  bool ParseSectionSwitch(StringRef Section, unsigned Type, unsigned Flags,
+                          SectionKind Kind);
 
 public:
-  ELFAsmParser() : SeenIdent(false) {
-    BracketExpressionsSupported = true;
-  }
+  ELFAsmParser() { BracketExpressionsSupported = true; }
 
   virtual void Initialize(MCAsmParser &Parser) {
     // Call the base implementation.
@@ -241,7 +239,6 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
   }
 
   for (;;) {
-    StringRef Tmp;
     unsigned CurSize;
 
     SMLoc PrevLoc = getLexer().getLoc();
@@ -279,14 +276,17 @@ static SectionKind computeSectionKind(unsigned Flags) {
   return SectionKind::getDataRel();
 }
 
-static int parseSectionFlags(StringRef flagsStr) {
-  int flags = 0;
+static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
+  unsigned flags = 0;
 
   for (unsigned i = 0; i < flagsStr.size(); i++) {
     switch (flagsStr[i]) {
     case 'a':
       flags |= ELF::SHF_ALLOC;
       break;
+    case 'e':
+      flags |= ELF::SHF_EXCLUDE;
+      break;
     case 'x':
       flags |= ELF::SHF_EXECINSTR;
       break;
@@ -311,8 +311,11 @@ static int parseSectionFlags(StringRef flagsStr) {
     case 'G':
       flags |= ELF::SHF_GROUP;
       break;
+    case '?':
+      *UseLastGroup = true;
+      break;
     default:
-      return -1;
+      return -1U;
     }
   }
 
@@ -352,6 +355,7 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
   StringRef GroupName;
   unsigned Flags = 0;
   const MCExpr *Subsection = 0;
+  bool UseLastGroup = false;
 
   // Set the defaults first.
   if (SectionName == ".fini" || SectionName == ".init" ||
@@ -377,13 +381,16 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
     StringRef FlagsStr = getTok().getStringContents();
     Lex();
 
-    int extraFlags = parseSectionFlags(FlagsStr);
-    if (extraFlags < 0)
+    unsigned extraFlags = parseSectionFlags(FlagsStr, &UseLastGroup);
+    if (extraFlags == -1U)
       return TokError("unknown flag");
     Flags |= extraFlags;
 
     bool Mergeable = Flags & ELF::SHF_MERGE;
     bool Group = Flags & ELF::SHF_GROUP;
+    if (Group && UseLastGroup)
+      return TokError("Section cannot specifiy a group name while also acting "
+                      "as a member of the last group");
 
     if (getLexer().isNot(AsmToken::Comma)) {
       if (Mergeable)
@@ -392,10 +399,13 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush) {
         return TokError("Group section must specify the type");
     } else {
       Lex();
-      if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
-        return TokError("expected '@' or '%' before type");
+      if (getLexer().is(AsmToken::At) || getLexer().is(AsmToken::Percent) ||
+          getLexer().is(AsmToken::String)) {
+        if (!getLexer().is(AsmToken::String))
+          Lex();
+      } else
+        return TokError("expected '@<type>', '%<type>' or \"<type>\"");
 
-      Lex();
       if (getParser().parseIdentifier(TypeName))
         return TokError("expected identifier in directive");
 
@@ -461,6 +471,16 @@ EndStmt:
       return TokError("unknown section type");
   }
 
+  if (UseLastGroup) {
+    MCSectionSubPair CurrentSection = getStreamer().getCurrentSection();
+    if (const MCSectionELF *Section =
+            cast_or_null<MCSectionELF>(CurrentSection.first))
+      if (const MCSymbol *Group = Section->getGroup()) {
+        GroupName = Group->getName();
+        Flags |= ELF::SHF_GROUP;
+      }
+  }
+
   SectionKind Kind = computeSectionKind(Flags);
   getStreamer().SwitchSection(getContext().getELFSection(SectionName, Type,
                                                          Flags, Kind, Size,
@@ -479,7 +499,11 @@ bool ELFAsmParser::ParseDirectivePrevious(StringRef DirName, SMLoc) {
 }
 
 /// ParseDirectiveELFType
+///  ::= .type identifier , STT_<TYPE_IN_UPPER_CASE>
+///  ::= .type identifier , #attribute
 ///  ::= .type identifier , @attribute
+///  ::= .type identifier , %attribute
+///  ::= .type identifier , "attribute"
 bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
   StringRef Name;
   if (getParser().parseIdentifier(Name))
@@ -492,26 +516,42 @@ bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
     return TokError("unexpected token in '.type' directive");
   Lex();
 
-  if (getLexer().isNot(AsmToken::Percent) && getLexer().isNot(AsmToken::At))
-    return TokError("expected '@' or '%' before type");
-  Lex();
-
   StringRef Type;
   SMLoc TypeLoc;
+  MCSymbolAttr Attr;
+  if (getLexer().is(AsmToken::Identifier)) {
+    TypeLoc = getLexer().getLoc();
+    if (getParser().parseIdentifier(Type))
+      return TokError("expected symbol type in directive");
+    Attr = StringSwitch<MCSymbolAttr>(Type)
+               .Case("STT_FUNC", MCSA_ELF_TypeFunction)
+               .Case("STT_OBJECT", MCSA_ELF_TypeObject)
+               .Case("STT_TLS", MCSA_ELF_TypeTLS)
+               .Case("STT_COMMON", MCSA_ELF_TypeCommon)
+               .Case("STT_NOTYPE", MCSA_ELF_TypeNoType)
+               .Case("STT_GNU_IFUNC", MCSA_ELF_TypeIndFunction)
+               .Default(MCSA_Invalid);
+  } else if (getLexer().is(AsmToken::Hash) || getLexer().is(AsmToken::At) ||
+             getLexer().is(AsmToken::Percent) ||
+             getLexer().is(AsmToken::String)) {
+    if (!getLexer().is(AsmToken::String))
+      Lex();
 
-  TypeLoc = getLexer().getLoc();
-  if (getParser().parseIdentifier(Type))
-    return TokError("expected symbol type in directive");
-
-  MCSymbolAttr Attr = StringSwitch<MCSymbolAttr>(Type)
-    .Case("function", MCSA_ELF_TypeFunction)
-    .Case("object", MCSA_ELF_TypeObject)
-    .Case("tls_object", MCSA_ELF_TypeTLS)
-    .Case("common", MCSA_ELF_TypeCommon)
-    .Case("notype", MCSA_ELF_TypeNoType)
-    .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
-    .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
-    .Default(MCSA_Invalid);
+    TypeLoc = getLexer().getLoc();
+    if (getParser().parseIdentifier(Type))
+      return TokError("expected symbol type in directive");
+    Attr = StringSwitch<MCSymbolAttr>(Type)
+               .Case("function", MCSA_ELF_TypeFunction)
+               .Case("object", MCSA_ELF_TypeObject)
+               .Case("tls_object", MCSA_ELF_TypeTLS)
+               .Case("common", MCSA_ELF_TypeCommon)
+               .Case("notype", MCSA_ELF_TypeNoType)
+               .Case("gnu_unique_object", MCSA_ELF_TypeGnuUniqueObject)
+               .Case("gnu_indirect_function", MCSA_ELF_TypeIndFunction)
+               .Default(MCSA_Invalid);
+  } else
+    return TokError("expected STT_<TYPE_IN_UPPER_CASE>, '#<type>', '@<type>', "
+                    "'%<type>' or \"<type>\"");
 
   if (Attr == MCSA_Invalid)
     return Error(TypeLoc, "unsupported attribute in '.type' directive");
@@ -536,22 +576,7 @@ bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) {
 
   Lex();
 
-  const MCSection *Comment =
-    getContext().getELFSection(".comment", ELF::SHT_PROGBITS,
-                               ELF::SHF_MERGE |
-                               ELF::SHF_STRINGS,
-                               SectionKind::getReadOnly(),
-                               1, "");
-
-  getStreamer().PushSection();
-  getStreamer().SwitchSection(Comment);
-  if (!SeenIdent) {
-    getStreamer().EmitIntValue(0, 1);
-    SeenIdent = true;
-  }
-  getStreamer().EmitBytes(Data);
-  getStreamer().EmitIntValue(0, 1);
-  getStreamer().PopSection();
+  getStreamer().EmitIdent(Data);
   return false;
 }
 
diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp
index 8ae724f..f7bf002 100644
--- a/lib/MC/MCPureStreamer.cpp
+++ b/lib/MC/MCPureStreamer.cpp
@@ -29,7 +29,7 @@ private:
 public:
   MCPureStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
                  MCCodeEmitter *Emitter)
-      : MCObjectStreamer(SK_PureStreamer, Context, TAB, OS, Emitter) {}
+      : MCObjectStreamer(Context, 0, TAB, OS, Emitter) {}
 
   /// @name MCStreamer Interface
   /// @{
@@ -40,7 +40,7 @@ public:
   virtual void EmitDebugLabel(MCSymbol *Symbol);
   virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
                             uint64_t Size = 0, unsigned ByteAlignment = 0);
-  virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
+  virtual void EmitBytes(StringRef Data);
   virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
                                     unsigned ValueSize = 1,
                                     unsigned MaxBytesToEmit = 0);
@@ -51,8 +51,9 @@ public:
   virtual void FinishImpl();
 
 
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) {
     report_fatal_error("unsupported directive in pure streamer");
+    return false;
   }
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
     report_fatal_error("unsupported directive in pure streamer");
@@ -93,16 +94,13 @@ public:
   virtual void EmitFileDirective(StringRef Filename) {
     report_fatal_error("unsupported directive in pure streamer");
   }
+  virtual void EmitIdent(StringRef IdentString) {
+    report_fatal_error("unsupported directive in pure streamer");
+  }
   virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
                                       StringRef Filename, unsigned CUID = 0) {
     report_fatal_error("unsupported directive in pure streamer");
   }
-
-  /// @}
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_PureStreamer;
-  }
 };
 
 } // end anonymous namespace.
@@ -120,7 +118,7 @@ void MCPureStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(getCurrentSection().first && "Cannot emit before setting section!");
 
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
 
   MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
 
@@ -149,7 +147,7 @@ void MCPureStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
   report_fatal_error("not yet implemented in pure streamer");
 }
 
-void MCPureStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+void MCPureStreamer::EmitBytes(StringRef Data) {
   // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
   // MCObjectStreamer.
   getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
diff --git a/lib/MC/MCRegisterInfo.cpp b/lib/MC/MCRegisterInfo.cpp
index 5c71106..ce79cd5 100644
--- a/lib/MC/MCRegisterInfo.cpp
+++ b/lib/MC/MCRegisterInfo.cpp
@@ -46,6 +46,18 @@ unsigned MCRegisterInfo::getSubRegIndex(unsigned Reg, unsigned SubReg) const {
   return 0;
 }
 
+unsigned MCRegisterInfo::getSubRegIdxSize(unsigned Idx) const {
+  assert(Idx && Idx < getNumSubRegIndices() &&
+         "This is not a subregister index");
+  return SubRegIdxRanges[Idx].Size;
+}
+
+unsigned MCRegisterInfo::getSubRegIdxOffset(unsigned Idx) const {
+  assert(Idx && Idx < getNumSubRegIndices() &&
+         "This is not a subregister index");
+  return SubRegIdxRanges[Idx].Offset;
+}
+
 int MCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
   const DwarfLLVMRegPair *M = isEH ? EHL2DwarfRegs : L2DwarfRegs;
   unsigned Size = isEH ? EHL2DwarfRegsSize : L2DwarfRegsSize;
diff --git a/lib/MC/MCRelocationInfo.cpp b/lib/MC/MCRelocationInfo.cpp
new file mode 100644
index 0000000..53c48ded
--- /dev/null
+++ b/lib/MC/MCRelocationInfo.cpp
@@ -0,0 +1,39 @@
+//==-- lib/MC/MCRelocationInfo.cpp -------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm-c/Disassembler.h"
+
+using namespace llvm;
+
+MCRelocationInfo::MCRelocationInfo(MCContext &Ctx)
+  : Ctx(Ctx) {
+}
+
+MCRelocationInfo::~MCRelocationInfo() {
+}
+
+const MCExpr *
+MCRelocationInfo::createExprForRelocation(object::RelocationRef Rel) {
+  return 0;
+}
+
+const MCExpr *
+MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr,
+                                               unsigned VariantKind) {
+  if (VariantKind != LLVMDisassembler_VariantKind_None)
+    return 0;
+  return SubExpr;
+}
+
+MCRelocationInfo *llvm::createMCRelocationInfo(StringRef TT, MCContext &Ctx) {
+  return new MCRelocationInfo(Ctx);
+}
diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp
index 6cedf06..64aa2c5 100644
--- a/lib/MC/MCSectionCOFF.cpp
+++ b/lib/MC/MCSectionCOFF.cpp
@@ -28,6 +28,17 @@ bool MCSectionCOFF::ShouldOmitSectionDirective(StringRef Name,
   return false;
 }
 
+void MCSectionCOFF::setSelection(int Selection,
+                                 const MCSectionCOFF *Assoc) const {
+  assert(Selection != 0 && "invalid COMDAT selection type");
+  assert((Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) ==
+         (Assoc != 0) &&
+    "associative COMDAT section must have an associated section");
+  this->Selection = Selection;
+  this->Assoc = Assoc;
+  Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+}
+
 void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
                                          raw_ostream &OS,
                                          const MCExpr *Subsection) const {
@@ -63,12 +74,15 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI,
       case COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH:
         OS << "\t.linkonce same_contents\n";
         break;
-    //NOTE: as of binutils 2.20, there is no way to specifiy select largest
-    //      with the .linkonce directive. For now, we treat it as an invalid
-    //      comdat selection value.
+      case COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE:
+        OS << "\t.linkonce associative " << Assoc->getSectionName() << "\n";
+        break;
       case COFF::IMAGE_COMDAT_SELECT_LARGEST:
-    //  OS << "\t.linkonce largest\n";
-    //  break;
+        OS << "\t.linkonce largest\n";
+        break;
+      case COFF::IMAGE_COMDAT_SELECT_NEWEST:
+        OS << "\t.linkonce newest\n";
+        break;
       default:
         assert (0 && "unsupported COFF selection type");
         break;
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index bf1a984..09eb3e7 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -32,6 +32,29 @@ bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
   return false;
 }
 
+static void printName(raw_ostream &OS, StringRef Name) {
+  if (Name.find_first_not_of("0123456789_."
+                             "abcdefghijklmnopqrstuvwxyz"
+                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+    OS << Name;
+    return;
+  }
+  OS << '"';
+  for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+    if (*B == '"') // Unquoted "
+      OS << "\\\"";
+    else if (*B != '\\') // Neither " or backslash
+      OS << *B;
+    else if (B + 1 == E) // Trailing backslash
+      OS << "\\\\";
+    else {
+      OS << B[0] << B[1]; // Quoted character
+      ++B;
+    }
+  }
+  OS << '"';
+}
+
 void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
                                         raw_ostream &OS,
                                         const MCExpr *Subsection) const {
@@ -44,27 +67,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
     return;
   }
 
-  StringRef name = getSectionName();
-  if (name.find_first_not_of("0123456789_."
-                             "abcdefghijklmnopqrstuvwxyz"
-                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == name.npos) {
-    OS << "\t.section\t" << name;
-  } else {
-    OS << "\t.section\t\"";
-    for (const char *b = name.begin(), *e = name.end(); b < e; ++b) {
-      if (*b == '"') // Unquoted "
-        OS << "\\\"";
-      else if (*b != '\\') // Neither " or backslash
-        OS << *b;
-      else if (b + 1 == e) // Trailing backslash
-        OS << "\\\\";
-      else {
-        OS << b[0] << b[1]; // Quoted character
-        ++b;
-      }
-    }
-    OS << '"';
-  }
+  OS << "\t.section\t";
+  printName(OS, getSectionName());
 
   // Handle the weird solaris syntax if desired.
   if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
@@ -75,6 +79,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
       OS << ",#execinstr";
     if (Flags & ELF::SHF_WRITE)
       OS << ",#write";
+    if (Flags & ELF::SHF_EXCLUDE)
+      OS << ",#exclude";
     if (Flags & ELF::SHF_TLS)
       OS << ",#tls";
     OS << '\n';
@@ -84,6 +90,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
   OS << ",\"";
   if (Flags & ELF::SHF_ALLOC)
     OS << 'a';
+  if (Flags & ELF::SHF_EXCLUDE)
+    OS << 'e';
   if (Flags & ELF::SHF_EXECINSTR)
     OS << 'x';
   if (Flags & ELF::SHF_GROUP)
@@ -131,8 +139,11 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI,
     OS << "," << EntrySize;
   }
 
-  if (Flags & ELF::SHF_GROUP)
-    OS << "," << Group->getName() << ",comdat";
+  if (Flags & ELF::SHF_GROUP) {
+    OS << ",";
+    printName(OS, Group->getName());
+    OS << ",comdat";
+  }
   OS << '\n';
 
   if (Subsection)
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 8f1895e..2e1d69b 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -10,6 +10,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -21,10 +22,17 @@
 #include <cstdlib>
 using namespace llvm;
 
-MCStreamer::MCStreamer(StreamerKind Kind, MCContext &Ctx)
-    : Kind(Kind), Context(Ctx), EmitEHFrame(true), EmitDebugFrame(false),
-      CurrentW64UnwindInfo(0), LastSymbol(0), AutoInitSections(false) {
+// Pin the vtables to this file.
+MCTargetStreamer::~MCTargetStreamer() {}
+void ARMTargetStreamer::anchor() {}
+
+MCStreamer::MCStreamer(MCContext &Ctx, MCTargetStreamer *TargetStreamer)
+    : Context(Ctx), TargetStreamer(TargetStreamer), EmitEHFrame(true),
+      EmitDebugFrame(false), CurrentW64UnwindInfo(0), LastSymbol(0),
+      AutoInitSections(false) {
   SectionStack.push_back(std::pair<MCSectionSubPair, MCSectionSubPair>());
+  if (TargetStreamer)
+    TargetStreamer->setStreamer(this);
 }
 
 MCStreamer::~MCStreamer() {
@@ -58,7 +66,7 @@ const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context,
 }
 
 const MCExpr *MCStreamer::ForceExpAbs(const MCExpr* Expr) {
-  if (Context.getAsmInfo().hasAggressiveSymbolFolding() ||
+  if (Context.getAsmInfo()->hasAggressiveSymbolFolding() ||
       isa<MCSymbolRefExpr>(Expr))
     return Expr;
 
@@ -72,6 +80,13 @@ raw_ostream &MCStreamer::GetCommentOS() {
   return nulls();
 }
 
+void MCStreamer::generateCompactUnwindEncodings(MCAsmBackend *MAB) {
+  for (std::vector<MCDwarfFrameInfo>::iterator I = FrameInfos.begin(),
+         E = FrameInfos.end(); I != E; ++I)
+    I->CompactUnwindEncoding =
+      (MAB ? MAB->generateCompactUnwindEncoding(I->Instructions) : 0);
+}
+
 void MCStreamer::EmitDwarfSetLineAddr(int64_t LineDelta,
                                       const MCSymbol *Label, int PointerSize) {
   // emit the sequence to set the address
@@ -86,55 +101,49 @@ void MCStreamer::EmitDwarfSetLineAddr(int64_t LineDelta,
 
 /// EmitIntValue - Special case of EmitValue that avoids the client having to
 /// pass in a MCExpr for constant integers.
-void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size,
-                              unsigned AddrSpace) {
+void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
   assert(Size <= 8 && "Invalid size");
   assert((isUIntN(8 * Size, Value) || isIntN(8 * Size, Value)) &&
          "Invalid size");
   char buf[8];
-  const bool isLittleEndian = Context.getAsmInfo().isLittleEndian();
+  const bool isLittleEndian = Context.getAsmInfo()->isLittleEndian();
   for (unsigned i = 0; i != Size; ++i) {
     unsigned index = isLittleEndian ? i : (Size - i - 1);
     buf[i] = uint8_t(Value >> (index * 8));
   }
-  EmitBytes(StringRef(buf, Size), AddrSpace);
+  EmitBytes(StringRef(buf, Size));
 }
 
 /// EmitULEB128Value - Special case of EmitULEB128Value that avoids the
 /// client having to pass in a MCExpr for constant integers.
-void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned Padding,
-                                     unsigned AddrSpace) {
+void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned Padding) {
   SmallString<128> Tmp;
   raw_svector_ostream OSE(Tmp);
   encodeULEB128(Value, OSE, Padding);
-  EmitBytes(OSE.str(), AddrSpace);
+  EmitBytes(OSE.str());
 }
 
 /// EmitSLEB128Value - Special case of EmitSLEB128Value that avoids the
 /// client having to pass in a MCExpr for constant integers.
-void MCStreamer::EmitSLEB128IntValue(int64_t Value, unsigned AddrSpace) {
+void MCStreamer::EmitSLEB128IntValue(int64_t Value) {
   SmallString<128> Tmp;
   raw_svector_ostream OSE(Tmp);
   encodeSLEB128(Value, OSE);
-  EmitBytes(OSE.str(), AddrSpace);
+  EmitBytes(OSE.str());
 }
 
-void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size,
-                              unsigned AddrSpace) {
+void MCStreamer::EmitAbsValue(const MCExpr *Value, unsigned Size) {
   const MCExpr *ABS = ForceExpAbs(Value);
-  EmitValue(ABS, Size, AddrSpace);
+  EmitValue(ABS, Size);
 }
 
 
-void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size,
-                           unsigned AddrSpace) {
-  EmitValueImpl(Value, Size, AddrSpace);
+void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size) {
+  EmitValueImpl(Value, Size);
 }
 
-void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
-                                  unsigned AddrSpace) {
-  EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size,
-                AddrSpace);
+void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size) {
+  EmitValueImpl(MCSymbolRefExpr::Create(Sym, getContext()), Size);
 }
 
 void MCStreamer::EmitGPRel64Value(const MCExpr *Value) {
@@ -147,11 +156,15 @@ void MCStreamer::EmitGPRel32Value(const MCExpr *Value) {
 
 /// EmitFill - Emit NumBytes bytes worth of the value specified by
 /// FillValue.  This implements directives such as '.space'.
-void MCStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
-                          unsigned AddrSpace) {
+void MCStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue) {
   const MCExpr *E = MCConstantExpr::Create(FillValue, getContext());
   for (uint64_t i = 0, e = NumBytes; i != e; ++i)
-    EmitValue(E, 1, AddrSpace);
+    EmitValue(E, 1);
+}
+
+/// The implementation in this class just redirects to EmitFill.
+void MCStreamer::EmitZeros(uint64_t NumBytes) {
+  EmitFill(NumBytes, 0);
 }
 
 bool MCStreamer::EmitDwarfFileDirective(unsigned FileNo,
@@ -185,17 +198,28 @@ void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
                                      MCSymbol *EHSymbol) {
 }
 
+void MCStreamer::AssignSection(MCSymbol *Symbol, const MCSection *Section) {
+  if (Section)
+    Symbol->setSection(*Section);
+  else
+    Symbol->setUndefined();
+
+  // As we emit symbols into a section, track the order so that they can
+  // be sorted upon later. Zero is reserved to mean 'unemitted'.
+  SymbolOrdering[Symbol] = 1 + SymbolOrdering.size();
+}
+
 void MCStreamer::EmitLabel(MCSymbol *Symbol) {
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(getCurrentSection().first && "Cannot emit before setting section!");
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
   LastSymbol = Symbol;
 }
 
 void MCStreamer::EmitDebugLabel(MCSymbol *Symbol) {
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(getCurrentSection().first && "Cannot emit before setting section!");
-  Symbol->setSection(*getCurrentSection().first);
+  AssignSection(Symbol, getCurrentSection().first);
   LastSymbol = Symbol;
 }
 
@@ -229,7 +253,7 @@ void MCStreamer::RecordProcStart(MCDwarfFrameInfo &Frame) {
   Frame.Function = LastSymbol;
   // If the function is externally visible, we need to create a local
   // symbol to avoid relocations.
-  StringRef Prefix = getContext().getAsmInfo().getPrivateGlobalPrefix();
+  StringRef Prefix = getContext().getAsmInfo()->getPrivateGlobalPrefix();
   if (LastSymbol && LastSymbol->getName().startswith(Prefix)) {
     Frame.Begin = LastSymbol;
   } else {
@@ -382,6 +406,14 @@ void MCStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
   CurFrame->Instructions.push_back(Instruction);
 }
 
+void MCStreamer::EmitCFIWindowSave() {
+  MCSymbol *Label = EmitCFICommon();
+  MCCFIInstruction Instruction =
+    MCCFIInstruction::createWindowSave(Label);
+  MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo();
+  CurFrame->Instructions.push_back(Instruction);
+}
+
 void MCStreamer::setCurrentW64UnwindInfo(MCWin64EHUnwindInfo *Frame) {
   W64UnwindInfos.push_back(Frame);
   CurrentW64UnwindInfo = W64UnwindInfos.back();
@@ -472,7 +504,9 @@ void MCStreamer::EmitWin64EHSetFrame(unsigned Register, unsigned Offset) {
     report_fatal_error("Frame register and offset already specified!");
   if (Offset & 0x0F)
     report_fatal_error("Misaligned frame pointer offset!");
-  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, 0, Register, Offset);
+  MCSymbol *Label = getContext().CreateTempSymbol();
+  MCWin64EHInstruction Inst(Win64EH::UOP_SetFPReg, Label, Register, Offset);
+  EmitLabel(Label);
   CurFrame->LastFrameInst = CurFrame->Instructions.size();
   CurFrame->Instructions.push_back(Inst);
 }
@@ -536,54 +570,10 @@ void MCStreamer::EmitCOFFSecRel32(MCSymbol const *Symbol) {
   llvm_unreachable("This file format doesn't support this directive");
 }
 
-void MCStreamer::EmitFnStart() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitFnEnd() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitCantUnwind() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitHandlerData() {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitPersonality(const MCSymbol *Personality) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitPad(int64_t Offset) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool) {
-  errs() << "Not implemented yet\n";
-  abort();
-}
-
-void MCStreamer::EmitTCEntry(const MCSymbol &S) {
-  llvm_unreachable("Unsupported method");
-}
-
 /// EmitRawText - If this file is backed by an assembly streamer, this dumps
 /// the specified string in the output .s file.  This capability is
 /// indicated by the hasRawTextSupport() predicate.
-void MCStreamer::EmitRawText(StringRef String) {
+void MCStreamer::EmitRawTextImpl(StringRef String) {
   errs() << "EmitRawText called on an MCStreamer that doesn't support it, "
   " something must not be fully mc'ized\n";
   abort();
@@ -591,19 +581,18 @@ void MCStreamer::EmitRawText(StringRef String) {
 
 void MCStreamer::EmitRawText(const Twine &T) {
   SmallString<128> Str;
-  T.toVector(Str);
-  EmitRawText(Str.str());
+  EmitRawTextImpl(T.toStringRef(Str));
 }
 
-void MCStreamer::EmitFrames(bool usingCFI) {
+void MCStreamer::EmitFrames(MCAsmBackend *MAB, bool usingCFI) {
   if (!getNumFrameInfos())
     return;
 
   if (EmitEHFrame)
-    MCDwarfFrameEmitter::Emit(*this, usingCFI, true);
+    MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, true);
 
   if (EmitDebugFrame)
-    MCDwarfFrameEmitter::Emit(*this, usingCFI, false);
+    MCDwarfFrameEmitter::Emit(*this, MAB, usingCFI, false);
 }
 
 void MCStreamer::EmitW64Tables() {
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index f18828d..8d8e290 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -27,6 +27,11 @@ MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
   FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
                                         ProcFeatures, NumFeatures);
 
+  InitCPUSchedModel(CPU);
+}
+
+void
+MCSubtargetInfo::InitCPUSchedModel(StringRef CPU) {
   if (!CPU.empty())
     CPUSchedModel = getSchedModelForCPU(CPU);
   else
@@ -91,10 +96,8 @@ MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
 #endif
 
   // Find entry
-  SubtargetInfoKV KV;
-  KV.Key = CPU.data();
   const SubtargetInfoKV *Found =
-    std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, KV);
+    std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, CPU);
   if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
     errs() << "'" << CPU
            << "' is not a recognized processor for this target"
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index b973c57..2416525 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -68,12 +68,23 @@ void MCSymbol::print(raw_ostream &OS) const {
   // The name for this MCSymbol is required to be a valid target name.  However,
   // some targets support quoting names with funny characters.  If the name
   // contains a funny character, then print it quoted.
-  if (!NameNeedsQuoting(getName())) {
-    OS << getName();
+  StringRef Name = getName();
+  if (!NameNeedsQuoting(Name)) {
+    OS << Name;
     return;
   }
 
-  OS << '"' << getName() << '"';
+  OS << '"';
+  for (unsigned I = 0, E = Name.size(); I != E; ++I) {
+    char C = Name[I];
+    if (C == '\n')
+      OS << "\\n";
+    else if (C == '"')
+      OS << "\\\"";
+    else
+      OS << C;
+  }
+  OS << '"';
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/lib/MC/MCSymbolizer.cpp b/lib/MC/MCSymbolizer.cpp
new file mode 100644
index 0000000..1020b74
--- /dev/null
+++ b/lib/MC/MCSymbolizer.cpp
@@ -0,0 +1,20 @@
+//===-- llvm/MC/MCSymbolizer.cpp - MCSymbolizer class -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSymbolizer.h"
+#include "llvm/MC/MCRelocationInfo.h"
+
+using namespace llvm;
+
+MCSymbolizer::MCSymbolizer(MCContext &Ctx, OwningPtr<MCRelocationInfo> &RelInfo)
+  : Ctx(Ctx), RelInfo(RelInfo.take()) {
+}
+
+MCSymbolizer::~MCSymbolizer() {
+}
diff --git a/lib/MC/MCWin64EH.cpp b/lib/MC/MCWin64EH.cpp
index c5b637c..b8b07d3 100644
--- a/lib/MC/MCWin64EH.cpp
+++ b/lib/MC/MCWin64EH.cpp
@@ -64,7 +64,7 @@ static void EmitAbsDifference(MCStreamer &streamer, MCSymbol *lhs,
 
 static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
                            MCWin64EHInstruction &inst) {
-  uint8_t b1, b2;
+  uint8_t b2;
   uint16_t w;
   b2 = (inst.getOperation() & 0x0F);
   switch (inst.getOperation()) {
@@ -93,8 +93,7 @@ static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SetFPReg:
-    b1 = inst.getOffset() & 0xF0;
-    streamer.EmitIntValue(b1, 1);
+    EmitAbsDifference(streamer, inst.getLabel(), begin);
     streamer.EmitIntValue(b2, 1);
     break;
   case Win64EH::UOP_SaveNonVol:
@@ -129,14 +128,29 @@ static void EmitUnwindCode(MCStreamer &streamer, MCSymbol *begin,
   }
 }
 
+static void EmitSymbolRefWithOfs(MCStreamer &streamer,
+                                 const MCSymbol *Base,
+                                 const MCSymbol *Other) {
+  MCContext &Context = streamer.getContext();
+  const MCSymbolRefExpr *BaseRef = MCSymbolRefExpr::Create(Base, Context);
+  const MCSymbolRefExpr *OtherRef = MCSymbolRefExpr::Create(Other, Context);
+  const MCExpr *Ofs = MCBinaryExpr::CreateSub(OtherRef, BaseRef, Context);
+  const MCSymbolRefExpr *BaseRefRel = MCSymbolRefExpr::Create(Base,
+                                              MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                              Context);
+  streamer.EmitValue(MCBinaryExpr::CreateAdd(BaseRefRel, Ofs, Context), 4);
+}
+
 static void EmitRuntimeFunction(MCStreamer &streamer,
                                 const MCWin64EHUnwindInfo *info) {
   MCContext &context = streamer.getContext();
 
   streamer.EmitValueToAlignment(4);
-  streamer.EmitValue(MCSymbolRefExpr::Create(info->Begin, context), 4);
-  streamer.EmitValue(MCSymbolRefExpr::Create(info->End, context), 4);
-  streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol, context), 4);
+  EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
+  EmitSymbolRefWithOfs(streamer, info->Function, info->End);
+  streamer.EmitValue(MCSymbolRefExpr::Create(info->Symbol,
+                                             MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                             context), 4);
 }
 
 static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
@@ -145,11 +159,11 @@ static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
 
   MCContext &context = streamer.getContext();
   streamer.EmitValueToAlignment(4);
-  // Upper 3 bits are the version number (currently 1).
-  uint8_t flags = 0x01;
   info->Symbol = context.CreateTempSymbol();
   streamer.EmitLabel(info->Symbol);
 
+  // Upper 3 bits are the version number (currently 1).
+  uint8_t flags = 0x01;
   if (info->ChainedParent)
     flags |= Win64EH::UNW_ChainInfo << 3;
   else {
@@ -185,20 +199,26 @@ static void EmitUnwindInfo(MCStreamer &streamer, MCWin64EHUnwindInfo *info) {
     EmitUnwindCode(streamer, info->Begin, inst);
   }
 
+  // For alignment purposes, the instruction array will always have an even
+  // number of entries, with the final entry potentially unused (in which case
+  // the array will be one longer than indicated by the count of unwind codes
+  // field).
+  if (numCodes & 1) {
+    streamer.EmitIntValue(0, 2);
+  }
+
   if (flags & (Win64EH::UNW_ChainInfo << 3))
     EmitRuntimeFunction(streamer, info->ChainedParent);
   else if (flags &
            ((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3))
-    streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler, context),
-                       4);
-  else if (numCodes < 2) {
+    streamer.EmitValue(MCSymbolRefExpr::Create(info->ExceptionHandler,
+                                              MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                              context), 4);
+  else if (numCodes == 0) {
     // The minimum size of an UNWIND_INFO struct is 8 bytes. If we're not
     // a chained unwind info, if there is no handler, and if there are fewer
     // than 2 slots used in the unwind code array, we have to pad to 8 bytes.
-    if (numCodes == 1)
-      streamer.EmitIntValue(0, 2);
-    else
-      streamer.EmitIntValue(0, 4);
+    streamer.EmitIntValue(0, 4);
   }
 }
 
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index a5ba3c3..8234aff 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -20,12 +20,11 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include <vector>
 using namespace llvm;
-using namespace llvm::object;
 
 void MachObjectWriter::reset() {
   Relocations.clear();
@@ -128,7 +127,7 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
   uint32_t Flags = 0;
 
   if (SubsectionsViaSymbols)
-    Flags |= macho::HF_SubsectionsViaSymbols;
+    Flags |= MachO::MH_SUBSECTIONS_VIA_SYMBOLS;
 
   // struct mach_header (28 bytes) or
   // struct mach_header_64 (32 bytes)
@@ -136,12 +135,12 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(is64Bit() ? macho::HM_Object64 : macho::HM_Object32);
+  Write32(is64Bit() ? MachO::MH_MAGIC_64 : MachO::MH_MAGIC);
 
   Write32(TargetObjectWriter->getCPUType());
   Write32(TargetObjectWriter->getCPUSubtype());
 
-  Write32(macho::HFT_Object);
+  Write32(MachO::MH_OBJECT);
   Write32(NumLoadCommands);
   Write32(LoadCommandsSize);
   Write32(Flags);
@@ -149,7 +148,7 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
     Write32(0); // reserved
 
   assert(OS.tell() - Start ==
-         (is64Bit() ? macho::Header64Size : macho::Header32Size));
+         (is64Bit()?sizeof(MachO::mach_header_64): sizeof(MachO::mach_header)));
 }
 
 /// WriteSegmentLoadCommand - Write a segment load command.
@@ -167,12 +166,12 @@ void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
   (void) Start;
 
   unsigned SegmentLoadCommandSize =
-    is64Bit() ? macho::SegmentLoadCommand64Size:
-    macho::SegmentLoadCommand32Size;
-  Write32(is64Bit() ? macho::LCT_Segment64 : macho::LCT_Segment);
+    is64Bit() ? sizeof(MachO::segment_command_64):
+    sizeof(MachO::segment_command);
+  Write32(is64Bit() ? MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT);
   Write32(SegmentLoadCommandSize +
-          NumSections * (is64Bit() ? macho::Section64Size :
-                         macho::Section32Size));
+          NumSections * (is64Bit() ? sizeof(MachO::section_64) :
+                         sizeof(MachO::section)));
 
   WriteBytes("", 16);
   if (is64Bit()) {
@@ -186,8 +185,10 @@ void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
     Write32(SectionDataStartOffset); // file offset
     Write32(SectionDataSize); // file size
   }
-  Write32(0x7); // maxprot
-  Write32(0x7); // initprot
+  // maxprot
+  Write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); 
+  // initprot
+  Write32(MachO::VM_PROT_READ | MachO::VM_PROT_WRITE | MachO::VM_PROT_EXECUTE); 
   Write32(NumSections);
   Write32(0); // flags
 
@@ -240,8 +241,8 @@ void MachObjectWriter::WriteSection(const MCAssembler &Asm,
   if (is64Bit())
     Write32(0); // reserved3
 
-  assert(OS.tell() - Start == (is64Bit() ? macho::Section64Size :
-                               macho::Section32Size));
+  assert(OS.tell() - Start == (is64Bit() ? sizeof(MachO::section_64) :
+                               sizeof(MachO::section)));
 }
 
 void MachObjectWriter::WriteSymtabLoadCommand(uint32_t SymbolOffset,
@@ -253,14 +254,14 @@ void MachObjectWriter::WriteSymtabLoadCommand(uint32_t SymbolOffset,
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(macho::LCT_Symtab);
-  Write32(macho::SymtabLoadCommandSize);
+  Write32(MachO::LC_SYMTAB);
+  Write32(sizeof(MachO::symtab_command));
   Write32(SymbolOffset);
   Write32(NumSymbols);
   Write32(StringTableOffset);
   Write32(StringTableSize);
 
-  assert(OS.tell() - Start == macho::SymtabLoadCommandSize);
+  assert(OS.tell() - Start == sizeof(MachO::symtab_command));
 }
 
 void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
@@ -276,8 +277,8 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(macho::LCT_Dysymtab);
-  Write32(macho::DysymtabLoadCommandSize);
+  Write32(MachO::LC_DYSYMTAB);
+  Write32(sizeof(MachO::dysymtab_command));
   Write32(FirstLocalSymbol);
   Write32(NumLocalSymbols);
   Write32(FirstExternalSymbol);
@@ -297,7 +298,7 @@ void MachObjectWriter::WriteDysymtabLoadCommand(uint32_t FirstLocalSymbol,
   Write32(0); // locreloff
   Write32(0); // nlocrel
 
-  assert(OS.tell() - Start == macho::DysymtabLoadCommandSize);
+  assert(OS.tell() - Start == sizeof(MachO::dysymtab_command));
 }
 
 void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
@@ -312,20 +313,20 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
   //
   // FIXME: Are the prebound or indirect fields possible here?
   if (Symbol.isUndefined())
-    Type = macho::STT_Undefined;
+    Type = MachO::N_UNDF;
   else if (Symbol.isAbsolute())
-    Type = macho::STT_Absolute;
+    Type = MachO::N_ABS;
   else
-    Type = macho::STT_Section;
+    Type = MachO::N_SECT;
 
   // FIXME: Set STAB bits.
 
   if (Data.isPrivateExtern())
-    Type |= macho::STF_PrivateExtern;
+    Type |= MachO::N_PEXT;
 
   // Set external bit.
   if (Data.isExternal() || Symbol.isUndefined())
-    Type |= macho::STF_External;
+    Type |= MachO::N_EXT;
 
   // Compute the symbol address.
   if (Symbol.isDefined()) {
@@ -341,7 +342,8 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
       assert((1U << Log2Size) == Align && "Invalid 'common' alignment!");
       if (Log2Size > 15)
         report_fatal_error("invalid 'common' alignment '" +
-                           Twine(Align) + "'");
+                           Twine(Align) + "' for '" + Symbol.getName() + "'",
+                           false);
       // FIXME: Keep this mask with the SymbolFlags enumeration.
       Flags = (Flags & 0xF0FF) | (Log2Size << 8);
     }
@@ -369,17 +371,17 @@ void MachObjectWriter::WriteLinkeditLoadCommand(uint32_t Type,
   (void) Start;
 
   Write32(Type);
-  Write32(macho::LinkeditLoadCommandSize);
+  Write32(sizeof(MachO::linkedit_data_command));
   Write32(DataOffset);
   Write32(DataSize);
 
-  assert(OS.tell() - Start == macho::LinkeditLoadCommandSize);
+  assert(OS.tell() - Start == sizeof(MachO::linkedit_data_command));
 }
 
 static unsigned ComputeLinkerOptionsLoadCommandSize(
   const std::vector<std::string> &Options, bool is64Bit)
 {
-  unsigned Size = sizeof(macho::LinkerOptionsLoadCommand);
+  unsigned Size = sizeof(MachO::linker_options_command);
   for (unsigned i = 0, e = Options.size(); i != e; ++i)
     Size += Options[i].size() + 1;
   return RoundUpToAlignment(Size, is64Bit ? 8 : 4);
@@ -392,10 +394,10 @@ void MachObjectWriter::WriteLinkerOptionsLoadCommand(
   uint64_t Start = OS.tell();
   (void) Start;
 
-  Write32(macho::LCT_LinkerOptions);
+  Write32(MachO::LC_LINKER_OPTIONS);
   Write32(Size);
   Write32(Options.size());
-  uint64_t BytesWritten = sizeof(macho::LinkerOptionsLoadCommand);
+  uint64_t BytesWritten = sizeof(MachO::linker_options_command);
   for (unsigned i = 0, e = Options.size(); i != e; ++i) {
     // Write each string, including the null byte.
     const std::string &Option = Options[i];
@@ -428,6 +430,22 @@ void MachObjectWriter::BindIndirectSymbols(MCAssembler &Asm) {
   //
   // FIXME: Revisit this when the dust settles.
 
+  // Report errors for use of .indirect_symbol not in a symbol pointer section
+  // or stub section.
+  for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
+         ie = Asm.indirect_symbol_end(); it != ie; ++it) {
+    const MCSectionMachO &Section =
+      cast<MCSectionMachO>(it->SectionData->getSection());
+
+    if (Section.getType() != MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS &&
+        Section.getType() != MCSectionMachO::S_LAZY_SYMBOL_POINTERS &&
+        Section.getType() != MCSectionMachO::S_SYMBOL_STUBS) {
+	MCSymbol &Symbol = *it->Symbol;
+	report_fatal_error("indirect symbol '" + Symbol.getName() +
+                           "' not in a symbol pointer or stub section");
+    }
+  }
+
   // Bind non lazy symbol pointers first.
   unsigned IndirectIndex = 0;
   for (MCAssembler::indirect_symbol_iterator it = Asm.indirect_symbol_begin(),
@@ -723,14 +741,14 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   // section headers) and the symbol table.
   unsigned NumLoadCommands = 1;
   uint64_t LoadCommandsSize = is64Bit() ?
-    macho::SegmentLoadCommand64Size + NumSections * macho::Section64Size :
-    macho::SegmentLoadCommand32Size + NumSections * macho::Section32Size;
+    sizeof(MachO::segment_command_64) + NumSections * sizeof(MachO::section_64):
+    sizeof(MachO::segment_command) + NumSections * sizeof(MachO::section);
 
   // Add the data-in-code load command size, if used.
   unsigned NumDataRegions = Asm.getDataRegions().size();
   if (NumDataRegions) {
     ++NumLoadCommands;
-    LoadCommandsSize += macho::LinkeditLoadCommandSize;
+    LoadCommandsSize += sizeof(MachO::linkedit_data_command);
   }
 
   // Add the symbol table load command sizes, if used.
@@ -738,8 +756,8 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
     UndefinedSymbolData.size();
   if (NumSymbols) {
     NumLoadCommands += 2;
-    LoadCommandsSize += (macho::SymtabLoadCommandSize +
-                         macho::DysymtabLoadCommandSize);
+    LoadCommandsSize += (sizeof(MachO::symtab_command) +
+                         sizeof(MachO::dysymtab_command));
   }
 
   // Add the linker option load commands sizes.
@@ -753,8 +771,8 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   
   // Compute the total size of the section data, as well as its file size and vm
   // size.
-  uint64_t SectionDataStart = (is64Bit() ? macho::Header64Size :
-                               macho::Header32Size) + LoadCommandsSize;
+  uint64_t SectionDataStart = (is64Bit() ? sizeof(MachO::mach_header_64) :
+                               sizeof(MachO::mach_header)) + LoadCommandsSize;
   uint64_t SectionDataSize = 0;
   uint64_t SectionDataFileSize = 0;
   uint64_t VMSize = 0;
@@ -791,11 +809,11 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   uint64_t RelocTableEnd = SectionDataStart + SectionDataFileSize;
   for (MCAssembler::const_iterator it = Asm.begin(),
          ie = Asm.end(); it != ie; ++it) {
-    std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+    std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
     unsigned NumRelocs = Relocs.size();
     uint64_t SectionStart = SectionDataStart + getSectionAddress(it);
     WriteSection(Asm, Layout, *it, SectionStart, RelocTableEnd, NumRelocs);
-    RelocTableEnd += NumRelocs * macho::RelocationInfoSize;
+    RelocTableEnd += NumRelocs * sizeof(MachO::any_relocation_info);
   }
 
   // Write the data-in-code load command, if used.
@@ -803,7 +821,7 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
   if (NumDataRegions) {
     uint64_t DataRegionsOffset = RelocTableEnd;
     uint64_t DataRegionsSize = NumDataRegions * 8;
-    WriteLinkeditLoadCommand(macho::LCT_DataInCode, DataRegionsOffset,
+    WriteLinkeditLoadCommand(MachO::LC_DATA_IN_CODE, DataRegionsOffset,
                              DataRegionsSize);
   }
 
@@ -830,8 +848,9 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
 
     // The string table is written after symbol table.
     uint64_t StringTableOffset =
-      SymbolTableOffset + NumSymTabSymbols * (is64Bit() ? macho::Nlist64Size :
-                                              macho::Nlist32Size);
+      SymbolTableOffset + NumSymTabSymbols * (is64Bit() ?
+                                              sizeof(MachO::nlist_64) :
+                                              sizeof(MachO::nlist));
     WriteSymtabLoadCommand(SymbolTableOffset, NumSymTabSymbols,
                            StringTableOffset, StringTable.size());
 
@@ -864,10 +883,10 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
          ie = Asm.end(); it != ie; ++it) {
     // Write the section relocation entries, in reverse order to match 'as'
     // (approximately, the exact algorithm is more complicated than this).
-    std::vector<macho::RelocationEntry> &Relocs = Relocations[it];
+    std::vector<MachO::any_relocation_info> &Relocs = Relocations[it];
     for (unsigned i = 0, e = Relocs.size(); i != e; ++i) {
-      Write32(Relocs[e - i - 1].Word0);
-      Write32(Relocs[e - i - 1].Word1);
+      Write32(Relocs[e - i - 1].r_word0);
+      Write32(Relocs[e - i - 1].r_word1);
     }
   }
 
@@ -906,9 +925,9 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
         // If this symbol is defined and internal, mark it as such.
         if (it->Symbol->isDefined() &&
             !Asm.getSymbolData(*it->Symbol).isExternal()) {
-          uint32_t Flags = macho::ISF_Local;
+          uint32_t Flags = MachO::INDIRECT_SYMBOL_LOCAL;
           if (it->Symbol->isAbsolute())
-            Flags |= macho::ISF_Absolute;
+            Flags |= MachO::INDIRECT_SYMBOL_ABS;
           Write32(Flags);
           continue;
         }
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 7625abd..2fb91f2 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -121,13 +121,10 @@ void SubtargetFeatures::AddFeature(const StringRef String,
 /// Find KV in array using binary search.
 static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A,
                                       size_t L) {
-  // Make the lower bound element we're looking for
-  SubtargetFeatureKV KV;
-  KV.Key = S.data();
   // Determine the end of the array
   const SubtargetFeatureKV *Hi = A + L;
   // Binary search the array
-  const SubtargetFeatureKV *F = std::lower_bound(A, Hi, KV);
+  const SubtargetFeatureKV *F = std::lower_bound(A, Hi, S);
   // If not found then return NULL
   if (F == Hi || StringRef(F->Key) != S) return NULL;
   // Return the found array item
@@ -353,8 +350,7 @@ void SubtargetFeatures::dump() const {
 }
 #endif
 
-/// getDefaultSubtargetFeatures - Return a string listing the features
-/// associated with the target triple.
+/// Adds the default features for the specified target triple.
 ///
 /// FIXME: This is an inelegant way of specifying the features of a
 /// subtarget. It would be better if we could encode this information
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index 518b59e..d9ca86d 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
@@ -137,7 +138,7 @@ public:
   symbol_map  SymbolMap;
 
   WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW, raw_ostream &OS);
-  ~WinCOFFObjectWriter();
+  virtual ~WinCOFFObjectWriter();
 
   COFFSymbol *createSymbol(StringRef Name);
   COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol * Symbol);
@@ -147,13 +148,12 @@ public:
   object_t *createCOFFEntity(StringRef Name, list_t &List);
 
   void DefineSection(MCSectionData const &SectionData);
-  void DefineSymbol(MCSymbolData const &SymbolData,
-                    MCAssembler &Assembler);
+  void DefineSymbol(MCSymbolData const &SymbolData, MCAssembler &Assembler,
+                    const MCAsmLayout &Layout);
 
   void MakeSymbolReal(COFFSymbol &S, size_t Index);
   void MakeSectionReal(COFFSection &S, size_t Number);
 
-  bool ExportSection(COFFSection const *S);
   bool ExportSymbol(MCSymbolData const &SymbolData, MCAssembler &Asm);
 
   bool IsPhysicalSection(COFFSection *S);
@@ -189,17 +189,6 @@ static inline void write_uint32_le(void *Data, uint32_t const &Value) {
   Ptr[3] = (Value & 0xFF000000) >> 24;
 }
 
-static inline void write_uint16_le(void *Data, uint16_t const &Value) {
-  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
-  Ptr[0] = (Value & 0x00FF) >> 0;
-  Ptr[1] = (Value & 0xFF00) >> 8;
-}
-
-static inline void write_uint8_le(void *Data, uint8_t const &Value) {
-  uint8_t *Ptr = reinterpret_cast<uint8_t *>(Data);
-  Ptr[0] = (Value & 0xFF) >> 0;
-}
-
 //------------------------------------------------------------------------------
 // Symbol class implementation
 
@@ -410,7 +399,8 @@ void WinCOFFObjectWriter::DefineSection(MCSectionData const &SectionData) {
 /// This function takes a section data object from the assembler
 /// and creates the associated COFF symbol staging object.
 void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
-                                       MCAssembler &Assembler) {
+                                       MCAssembler &Assembler,
+                                       const MCAsmLayout &Layout) {
   MCSymbol const &Symbol = SymbolData.getSymbol();
   COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
   SymbolMap[&Symbol] = coff_symbol;
@@ -451,6 +441,12 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
     const MCSymbolData &ResSymData =
       Assembler.getSymbolData(Symbol.AliasedSymbol());
 
+    if (Symbol.isVariable()) {
+      int64_t Addr;
+      if (Symbol.getVariableValue()->EvaluateAsAbsolute(Addr, Layout))
+        coff_symbol->Data.Value = Addr;
+    }
+
     coff_symbol->Data.Type         = (ResSymData.getFlags() & 0x0000FFFF) >>  0;
     coff_symbol->Data.StorageClass = (ResSymData.getFlags() & 0x00FF0000) >> 16;
 
@@ -462,7 +458,9 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
        external ? COFF::IMAGE_SYM_CLASS_EXTERNAL : COFF::IMAGE_SYM_CLASS_STATIC;
     }
 
-    if (ResSymData.Fragment != NULL)
+    if (Symbol.isAbsolute() || Symbol.AliasedSymbol().isVariable())
+      coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+    else if (ResSymData.Fragment != NULL)
       coff_symbol->Section =
         SectionMap[&ResSymData.Fragment->getParent()->getSection()];
 
@@ -474,18 +472,21 @@ void WinCOFFObjectWriter::DefineSymbol(MCSymbolData const &SymbolData,
 /// name into the string table if needed
 void WinCOFFObjectWriter::MakeSectionReal(COFFSection &S, size_t Number) {
   if (S.Name.size() > COFF::NameSize) {
-    size_t StringTableEntry = Strings.insert(S.Name.c_str());
-
-    // FIXME: Why is this number 999999? This number is never mentioned in the
-    // spec. I'm assuming this is due to the printed value needing to fit into
-    // the S.Header.Name field. In which case why not 9999999 (7 9's instead of
-    // 6)? The spec does not state if this entry should be null terminated in
-    // this case, and thus this seems to be the best way to do it. I think I
-    // just solved my own FIXME...
-    if (StringTableEntry > 999999)
-      report_fatal_error("COFF string table is greater than 999999 bytes.");
-
-    std::sprintf(S.Header.Name, "/%d", unsigned(StringTableEntry));
+    const unsigned Max6DecimalSize = 999999;
+    const unsigned Max7DecimalSize = 9999999;
+    uint64_t StringTableEntry = Strings.insert(S.Name.c_str());
+
+    if (StringTableEntry <= Max6DecimalSize) {
+      std::sprintf(S.Header.Name, "/%d", unsigned(StringTableEntry));
+    } else if (StringTableEntry <= Max7DecimalSize) {
+      // With seven digits, we have to skip the terminating null. Because
+      // sprintf always appends it, we use a larger temporary buffer.
+      char buffer[9] = { };
+      std::sprintf(buffer, "/%d", unsigned(StringTableEntry));
+      std::memcpy(S.Header.Name, buffer, 8);
+    } else {
+      report_fatal_error("COFF string table is greater than 9,999,999 bytes.");
+    }
   } else
     std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
 
@@ -504,10 +505,6 @@ void WinCOFFObjectWriter::MakeSymbolReal(COFFSymbol &S, size_t Index) {
   S.Index = Index;
 }
 
-bool WinCOFFObjectWriter::ExportSection(COFFSection const *S) {
-  return !S->MCData->getFragmentList().empty();
-}
-
 bool WinCOFFObjectWriter::ExportSymbol(MCSymbolData const &SymbolData,
                                        MCAssembler &Asm) {
   // This doesn't seem to be right. Strings referred to from the .data section
@@ -621,9 +618,10 @@ void WinCOFFObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
     DefineSection(*i);
 
   for (MCAssembler::const_symbol_iterator i = Asm.symbol_begin(),
-                                          e = Asm.symbol_end(); i != e; i++) {
+                                          e = Asm.symbol_end();
+       i != e; i++) {
     if (ExportSymbol(*i, Asm)) {
-      DefineSymbol(*i, Asm);
+      DefineSymbol(*i, Asm, Layout);
     }
   }
 }
@@ -636,8 +634,9 @@ void WinCOFFObjectWriter::RecordRelocation(const MCAssembler &Asm,
                                            uint64_t &FixedValue) {
   assert(Target.getSymA() != NULL && "Relocation must reference a symbol!");
 
-  const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData &A_SD = Asm.getSymbolData(*A);
+  const MCSymbol &Symbol = Target.getSymA()->getSymbol();
+  const MCSymbol &A = Symbol.AliasedSymbol();
+  MCSymbolData &A_SD = Asm.getSymbolData(A);
 
   MCSectionData const *SectionData = Fragment->getParent();
 
@@ -707,10 +706,13 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
   // Assign symbol and section indexes and offsets.
   Header.NumberOfSections = 0;
 
+  DenseMap<COFFSection *, uint16_t> SectionIndices;
   for (sections::iterator i = Sections.begin(),
                           e = Sections.end(); i != e; i++) {
     if (Layout.getSectionAddressSize((*i)->MCData) > 0) {
-      MakeSectionReal(**i, ++Header.NumberOfSections);
+      size_t Number = ++Header.NumberOfSections;
+      SectionIndices[*i] = Number;
+      MakeSectionReal(**i, Number);
     } else {
       (*i)->Number = -1;
     }
@@ -754,6 +756,31 @@ void WinCOFFObjectWriter::WriteObject(MCAssembler &Asm,
     }
   }
 
+  // Fixup associative COMDAT sections.
+  for (sections::iterator i = Sections.begin(),
+                          e = Sections.end(); i != e; i++) {
+    if ((*i)->Symbol->Aux[0].Aux.SectionDefinition.Selection !=
+        COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+      continue;
+
+    const MCSectionCOFF &MCSec = static_cast<const MCSectionCOFF &>(
+                                                    (*i)->MCData->getSection());
+
+    COFFSection *Assoc = SectionMap.lookup(MCSec.getAssocSection());
+    if (!Assoc) {
+      report_fatal_error(Twine("Missing associated COMDAT section ") +
+                         MCSec.getAssocSection()->getSectionName() +
+                         " for section " + MCSec.getSectionName());
+    }
+
+    // Skip this section if the associated section is unused.
+    if (Assoc->Number == -1)
+      continue;
+
+    (*i)->Symbol->Aux[0].Aux.SectionDefinition.Number = SectionIndices[Assoc];
+  }
+
+
   // Assign file offsets to COFF object file structures.
 
   unsigned offset = 0;
@@ -888,6 +915,9 @@ MCWinCOFFObjectTargetWriter::MCWinCOFFObjectTargetWriter(unsigned Machine_) :
   Machine(Machine_) {
 }
 
+// Pin the vtable to this file.
+void MCWinCOFFObjectTargetWriter::anchor() {}
+
 //------------------------------------------------------------------------------
 // WinCOFFObjectWriter factory function
 
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 75f343c..5b5aad7 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -55,7 +55,7 @@ public:
   virtual void EmitDebugLabel(MCSymbol *Symbol);
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag);
   virtual void EmitThumbFunc(MCSymbol *Func);
-  virtual void EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
+  virtual bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute);
   virtual void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue);
   virtual void BeginCOFFSymbolDef(MCSymbol const *Symbol);
   virtual void EmitCOFFSymbolStorageClass(int StorageClass);
@@ -72,13 +72,10 @@ public:
   virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
                               uint64_t Size, unsigned ByteAlignment);
   virtual void EmitFileDirective(StringRef Filename);
+  virtual void EmitIdent(StringRef IdentString);
   virtual void EmitWin64EHHandlerData();
   virtual void FinishImpl();
 
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_WinCOFFStreamer;
-  }
-
 private:
   virtual void EmitInstToData(const MCInst &Inst) {
     MCDataFragment *DF = getOrCreateDataFragment();
@@ -134,8 +131,7 @@ private:
 
 WinCOFFStreamer::WinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
                                  MCCodeEmitter &CE, raw_ostream &OS)
-    : MCObjectStreamer(SK_WinCOFFStreamer, Context, MAB, OS, &CE),
-      CurSymbol(NULL) {}
+    : MCObjectStreamer(Context, 0, MAB, OS, &CE), CurSymbol(NULL) {}
 
 void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                       unsigned ByteAlignment, bool External) {
@@ -155,7 +151,8 @@ void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
   int Selection = COFF::IMAGE_COMDAT_SELECT_LARGEST;
 
   const MCSection *Section = MCStreamer::getContext().getCOFFSection(
-    SectionName, Characteristics, Selection, SectionKind::getBSS());
+      SectionName, Characteristics, SectionKind::getBSS(), Symbol->getName(),
+      Selection);
 
   MCSectionData &SectionData = getAssembler().getOrCreateSectionData(*Section);
 
@@ -164,7 +161,7 @@ void WinCOFFStreamer::AddCommonSymbol(MCSymbol *Symbol, uint64_t Size,
 
   SymbolData.setExternal(External);
 
-  Symbol->setSection(*Section);
+  AssignSection(Symbol, Section);
 
   if (ByteAlignment != 1)
       new MCAlignFragment(ByteAlignment, 0, 0, ByteAlignment, &SectionData);
@@ -201,7 +198,7 @@ void WinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) {
   llvm_unreachable("not implemented");
 }
 
-void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
                                           MCSymbolAttr Attribute) {
   assert(Symbol && "Symbol must be non-null!");
   assert((Symbol->isInSection()
@@ -221,8 +218,10 @@ void WinCOFFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
     break;
 
   default:
-    llvm_unreachable("unsupported attribute");
+    return false;
   }
+
+  return true;
 }
 
 void WinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
@@ -309,6 +308,11 @@ void WinCOFFStreamer::EmitFileDirective(StringRef Filename) {
   // info will be a much large effort.
 }
 
+// TODO: Implement this if you want to emit .comment section in COFF obj files.
+void WinCOFFStreamer::EmitIdent(StringRef IdentString) {
+  llvm_unreachable("unsupported directive");
+}
+
 void WinCOFFStreamer::EmitWin64EHHandlerData() {
   MCStreamer::EmitWin64EHHandlerData();
 
@@ -318,6 +322,7 @@ void WinCOFFStreamer::EmitWin64EHHandlerData() {
 }
 
 void WinCOFFStreamer::FinishImpl() {
+  EmitFrames(NULL, true);
   EmitW64Tables();
   MCObjectStreamer::FinishImpl();
 }
diff --git a/lib/Makefile b/lib/Makefile
index 57f016b..2ed0636 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,9 +10,8 @@ LEVEL = ..
 
 include $(LEVEL)/Makefile.config
 
-PARALLEL_DIRS := IR AsmParser Bitcode Archive Analysis Transforms CodeGen \
-                 Target ExecutionEngine Linker MC Object Option DebugInfo \
-								 IRReader
+PARALLEL_DIRS := IR AsmParser Bitcode Analysis Transforms CodeGen Target \
+                 ExecutionEngine Linker LTO MC Object Option DebugInfo   \
+                 IRReader
 
 include $(LEVEL)/Makefile.common
-
diff --git a/lib/Object/Archive.cpp b/lib/Object/Archive.cpp
index 0e13d05..71efca2 100644
--- a/lib/Object/Archive.cpp
+++ b/lib/Object/Archive.cpp
@@ -13,33 +13,110 @@
 
 #include "llvm/Object/Archive.h"
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/MemoryBuffer.h"
 
 using namespace llvm;
 using namespace object;
 
-static const char *Magic = "!<arch>\n";
+static const char *const Magic = "!<arch>\n";
 
-static bool isInternalMember(const ArchiveMemberHeader &amh) {
-  static const char *const internals[] = {
-    "/",
-    "//",
-    "#_LLVM_SYM_TAB_#"
-  };
+void Archive::anchor() { }
+
+StringRef ArchiveMemberHeader::getName() const {
+  char EndCond;
+  if (Name[0] == '/' || Name[0] == '#')
+    EndCond = ' ';
+  else
+    EndCond = '/';
+  llvm::StringRef::size_type end =
+      llvm::StringRef(Name, sizeof(Name)).find(EndCond);
+  if (end == llvm::StringRef::npos)
+    end = sizeof(Name);
+  assert(end <= sizeof(Name) && end > 0);
+  // Don't include the EndCond if there is one.
+  return llvm::StringRef(Name, end);
+}
+
+uint32_t ArchiveMemberHeader::getSize() const {
+  uint32_t Ret;
+  if (llvm::StringRef(Size, sizeof(Size)).rtrim(" ").getAsInteger(10, Ret))
+    llvm_unreachable("Size is not a decimal number.");
+  return Ret;
+}
+
+sys::fs::perms ArchiveMemberHeader::getAccessMode() const {
+  unsigned Ret;
+  if (StringRef(AccessMode, sizeof(AccessMode)).rtrim(" ").getAsInteger(8, Ret))
+    llvm_unreachable("Access mode is not an octal number.");
+  return static_cast<sys::fs::perms>(Ret);
+}
+
+sys::TimeValue ArchiveMemberHeader::getLastModified() const {
+  unsigned Seconds;
+  if (StringRef(LastModified, sizeof(LastModified)).rtrim(" ")
+          .getAsInteger(10, Seconds))
+    llvm_unreachable("Last modified time not a decimal number.");
+
+  sys::TimeValue Ret;
+  Ret.fromEpochTime(Seconds);
+  return Ret;
+}
+
+unsigned ArchiveMemberHeader::getUID() const {
+  unsigned Ret;
+  if (StringRef(UID, sizeof(UID)).rtrim(" ").getAsInteger(10, Ret))
+    llvm_unreachable("UID time not a decimal number.");
+  return Ret;
+}
+
+unsigned ArchiveMemberHeader::getGID() const {
+  unsigned Ret;
+  if (StringRef(GID, sizeof(GID)).rtrim(" ").getAsInteger(10, Ret))
+    llvm_unreachable("GID time not a decimal number.");
+  return Ret;
+}
+
+Archive::Child::Child(const Archive *Parent, const char *Start)
+    : Parent(Parent) {
+  if (!Start)
+    return;
+
+  const ArchiveMemberHeader *Header =
+      reinterpret_cast<const ArchiveMemberHeader *>(Start);
+  Data = StringRef(Start, sizeof(ArchiveMemberHeader) + Header->getSize());
 
-  StringRef name = amh.getName();
-  for (std::size_t i = 0; i < sizeof(internals) / sizeof(*internals); ++i) {
-    if (name == internals[i])
-      return true;
+  // Setup StartOfFile and PaddingBytes.
+  StartOfFile = sizeof(ArchiveMemberHeader);
+  // Don't include attached name.
+  StringRef Name = Header->getName();
+  if (Name.startswith("#1/")) {
+    uint64_t NameSize;
+    if (Name.substr(3).rtrim(" ").getAsInteger(10, NameSize))
+      llvm_unreachable("Long name length is not an integer");
+    StartOfFile += NameSize;
   }
-  return false;
 }
 
-void Archive::anchor() { }
+Archive::Child Archive::Child::getNext() const {
+  size_t SpaceToSkip = Data.size();
+  // If it's odd, add 1 to make it even.
+  if (SpaceToSkip & 1)
+    ++SpaceToSkip;
+
+  const char *NextLoc = Data.data() + SpaceToSkip;
+
+  // Check to see if this is past the end of the archive.
+  if (NextLoc >= Parent->Data->getBufferEnd())
+    return Child(Parent, NULL);
+
+  return Child(Parent, NextLoc);
+}
 
 error_code Archive::Child::getName(StringRef &Result) const {
-  StringRef name = ToHeader(Data.data())->getName();
+  StringRef name = getRawName();
   // Check if it's a special name.
   if (name[0] == '/') {
     if (name.size() == 1) { // Linker member.
@@ -79,7 +156,8 @@ error_code Archive::Child::getName(StringRef &Result) const {
     uint64_t name_size;
     if (name.substr(3).rtrim(" ").getAsInteger(10, name_size))
       llvm_unreachable("Long name length is not an ingeter");
-    Result = Data.substr(sizeof(ArchiveMemberHeader), name_size);
+    Result = Data.substr(sizeof(ArchiveMemberHeader), name_size)
+        .rtrim(StringRef("\0", 1));
     return object_error::success;
   }
   // It's a simple name.
@@ -90,6 +168,20 @@ error_code Archive::Child::getName(StringRef &Result) const {
   return object_error::success;
 }
 
+error_code Archive::Child::getMemoryBuffer(OwningPtr<MemoryBuffer> &Result,
+                                           bool FullPath) const {
+  StringRef Name;
+  if (error_code ec = getName(Name))
+    return ec;
+  SmallString<128> Path;
+  Result.reset(MemoryBuffer::getMemBuffer(
+      getBuffer(), FullPath ? (Twine(Parent->getFileName()) + "(" + Name + ")")
+                                  .toStringRef(Path)
+                            : Name,
+      false));
+  return error_code::success();
+}
+
 error_code Archive::Child::getAsBinary(OwningPtr<Binary> &Result) const {
   OwningPtr<Binary> ret;
   OwningPtr<MemoryBuffer> Buff;
@@ -102,11 +194,11 @@ error_code Archive::Child::getAsBinary(OwningPtr<Binary> &Result) const {
 }
 
 Archive::Archive(MemoryBuffer *source, error_code &ec)
-  : Binary(Binary::ID_Archive, source) {
+  : Binary(Binary::ID_Archive, source), SymbolTable(end_children()) {
   // Check for sufficient magic.
-  if (!source || source->getBufferSize()
-                 < (8 + sizeof(ArchiveMemberHeader) + 2) // Smallest archive.
-              || StringRef(source->getBufferStart(), 8) != Magic) {
+  assert(source);
+  if (source->getBufferSize() < 8 ||
+      StringRef(source->getBufferStart(), 8) != Magic) {
     ec = object_error::invalid_file_type;
     return;
   }
@@ -115,72 +207,122 @@ Archive::Archive(MemoryBuffer *source, error_code &ec)
   child_iterator i = begin_children(false);
   child_iterator e = end_children();
 
-  StringRef name;
-  if ((ec = i->getName(name)))
+  if (i == e) {
+    ec = object_error::success;
     return;
+  }
+
+  StringRef Name = i->getRawName();
 
   // Below is the pattern that is used to figure out the archive format
   // GNU archive format
-  //  First member : / (points to the symbol table )
+  //  First member : / (may exist, if it exists, points to the symbol table )
   //  Second member : // (may exist, if it exists, points to the string table)
   //  Note : The string table is used if the filename exceeds 15 characters
   // BSD archive format
-  //  First member : __.SYMDEF (points to the symbol table)
-  //  There is no string table, if the filename exceeds 15 characters or has a 
-  //  embedded space, the filename has #1/<size>, The size represents the size 
+  //  First member : __.SYMDEF or "__.SYMDEF SORTED" (the symbol table)
+  //  There is no string table, if the filename exceeds 15 characters or has a
+  //  embedded space, the filename has #1/<size>, The size represents the size
   //  of the filename that needs to be read after the archive header
   // COFF archive format
   //  First member : /
   //  Second member : / (provides a directory of symbols)
-  //  Third member : // contains the string table, this is present even if the
-  //                    string table is empty
-  if (name == "/") {
+  //  Third member : // (may exist, if it exists, contains the string table)
+  //  Note: Microsoft PE/COFF Spec 8.3 says that the third member is present
+  //  even if the string table is empty. However, lib.exe does not in fact
+  //  seem to create the third member if there's no member whose filename
+  //  exceeds 15 characters. So the third member is optional.
+
+  if (Name == "__.SYMDEF") {
+    Format = K_BSD;
     SymbolTable = i;
-    StringTable = e;
-    if (i != e) ++i;
-    if (i == e) {
-      ec = object_error::parse_failed;
-      return;
-    }
-    if ((ec = i->getName(name)))
+    ++i;
+    FirstRegular = i;
+    ec = object_error::success;
+    return;
+  }
+
+  if (Name.startswith("#1/")) {
+    Format = K_BSD;
+    // We know this is BSD, so getName will work since there is no string table.
+    ec = i->getName(Name);
+    if (ec)
       return;
-    if (name[0] != '/') {
-      Format = K_GNU;
-    } else if ((name.size() > 1) && (name == "//")) { 
-      Format = K_GNU;
-      StringTable = i;
+    if (Name == "__.SYMDEF SORTED") {
+      SymbolTable = i;
       ++i;
-    } else  { 
-      Format = K_COFF;
-      if (i != e) {
-        SymbolTable = i;
-        ++i;
-      }
-      if (i != e) {
-        StringTable = i;
-      }
     }
-  } else if (name == "__.SYMDEF") {
-    Format = K_BSD;
+    FirstRegular = i;
+    return;
+  }
+
+  if (Name == "/") {
     SymbolTable = i;
-    StringTable = e;
-  } 
+
+    ++i;
+    if (i == e) {
+      ec = object_error::parse_failed;
+      return;
+    }
+    Name = i->getRawName();
+  }
+
+  if (Name == "//") {
+    Format = K_GNU;
+    StringTable = i;
+    ++i;
+    FirstRegular = i;
+    ec = object_error::success;
+    return;
+  }
+
+  if (Name[0] != '/') {
+    Format = K_GNU;
+    FirstRegular = i;
+    ec = object_error::success;
+    return;
+  }
+
+  if (Name != "/") {
+    ec = object_error::parse_failed;
+    return;
+  }
+
+  Format = K_COFF;
+  SymbolTable = i;
+
+  ++i;
+  if (i == e) {
+    FirstRegular = i;
+    ec = object_error::success;
+    return;
+  }
+
+  Name = i->getRawName();
+
+  if (Name == "//") {
+    StringTable = i;
+    ++i;
+  }
+
+  FirstRegular = i;
   ec = object_error::success;
 }
 
-Archive::child_iterator Archive::begin_children(bool skip_internal) const {
+Archive::child_iterator Archive::begin_children(bool SkipInternal) const {
+  if (Data->getBufferSize() == 8) // empty archive.
+    return end_children();
+
+  if (SkipInternal)
+    return FirstRegular;
+
   const char *Loc = Data->getBufferStart() + strlen(Magic);
-  size_t Size = sizeof(ArchiveMemberHeader) +
-    ToHeader(Loc)->getSize();
-  Child c(this, StringRef(Loc, Size));
-  // Skip internals at the beginning of an archive.
-  if (skip_internal && isInternalMember(*ToHeader(Loc)))
-    return c.getNext();
+  Child c(this, Loc);
   return c;
 }
 
 Archive::child_iterator Archive::end_children() const {
-  return Child(this, StringRef(0, 0));
+  return Child(this, NULL);
 }
 
 error_code Archive::Symbol::getName(StringRef &Result) const {
@@ -228,9 +370,7 @@ error_code Archive::Symbol::getMember(child_iterator &Result) const {
   }
 
   const char *Loc = Parent->getData().begin() + Offset;
-  size_t Size = sizeof(ArchiveMemberHeader) +
-    ToHeader(Loc)->getSize();
-  Result = Child(Parent, StringRef(Loc, Size));
+  Result = Child(Parent, Loc);
 
   return object_error::success;
 }
@@ -245,6 +385,9 @@ Archive::Symbol Archive::Symbol::getNext() const {
 }
 
 Archive::symbol_iterator Archive::begin_symbols() const {
+  if (!hasSymbolTable())
+    return symbol_iterator(Symbol(this, 0, 0));
+
   const char *buf = SymbolTable->getBuffer().begin();
   if (kind() == K_GNU) {
     uint32_t symbol_count = 0;
@@ -265,11 +408,13 @@ Archive::symbol_iterator Archive::begin_symbols() const {
 }
 
 Archive::symbol_iterator Archive::end_symbols() const {
+  if (!hasSymbolTable())
+    return symbol_iterator(Symbol(this, 0, 0));
+
   const char *buf = SymbolTable->getBuffer().begin();
   uint32_t symbol_count = 0;
   if (kind() == K_GNU) {
     symbol_count = *reinterpret_cast<const support::ubig32_t*>(buf);
-    buf += sizeof(uint32_t) + (symbol_count * (sizeof(uint32_t)));
   } else if (kind() == K_BSD) {
     llvm_unreachable("BSD archive format is not supported");
   } else {
@@ -299,3 +444,7 @@ Archive::child_iterator Archive::findSym(StringRef name) const {
   }
   return end_children();
 }
+
+bool Archive::hasSymbolTable() const {
+  return SymbolTable != end_children();
+}
diff --git a/lib/Object/Binary.cpp b/lib/Object/Binary.cpp
index 4e528d8..de57b4c 100644
--- a/lib/Object/Binary.cpp
+++ b/lib/Object/Binary.cpp
@@ -14,11 +14,13 @@
 #include "llvm/Object/Binary.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 
 // Include headers for createBinary.
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Object/MachOUniversal.h"
 #include "llvm/Object/ObjectFile.h"
 
 using namespace llvm;
@@ -45,22 +47,19 @@ error_code object::createBinary(MemoryBuffer *Source,
   OwningPtr<MemoryBuffer> scopedSource(Source);
   if (!Source)
     return make_error_code(errc::invalid_argument);
-  if (Source->getBufferSize() < 64)
-    return object_error::invalid_file_type;
-  sys::LLVMFileType type = sys::IdentifyFileType(Source->getBufferStart(),
-                                static_cast<unsigned>(Source->getBufferSize()));
+  sys::fs::file_magic type = sys::fs::identify_magic(Source->getBuffer());
   error_code ec;
   switch (type) {
-    case sys::Archive_FileType: {
+    case sys::fs::file_magic::archive: {
       OwningPtr<Binary> ret(new Archive(scopedSource.take(), ec));
       if (ec) return ec;
       Result.swap(ret);
       return object_error::success;
     }
-    case sys::ELF_Relocatable_FileType:
-    case sys::ELF_Executable_FileType:
-    case sys::ELF_SharedObject_FileType:
-    case sys::ELF_Core_FileType: {
+    case sys::fs::file_magic::elf_relocatable:
+    case sys::fs::file_magic::elf_executable:
+    case sys::fs::file_magic::elf_shared_object:
+    case sys::fs::file_magic::elf_core: {
       OwningPtr<Binary> ret(
         ObjectFile::createELFObjectFile(scopedSource.take()));
       if (!ret)
@@ -68,15 +67,16 @@ error_code object::createBinary(MemoryBuffer *Source,
       Result.swap(ret);
       return object_error::success;
     }
-    case sys::Mach_O_Object_FileType:
-    case sys::Mach_O_Executable_FileType:
-    case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
-    case sys::Mach_O_Core_FileType:
-    case sys::Mach_O_PreloadExecutable_FileType:
-    case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
-    case sys::Mach_O_DynamicLinker_FileType:
-    case sys::Mach_O_Bundle_FileType:
-    case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType: {
+    case sys::fs::file_magic::macho_object:
+    case sys::fs::file_magic::macho_executable:
+    case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
+    case sys::fs::file_magic::macho_core:
+    case sys::fs::file_magic::macho_preload_executable:
+    case sys::fs::file_magic::macho_dynamically_linked_shared_lib:
+    case sys::fs::file_magic::macho_dynamic_linker:
+    case sys::fs::file_magic::macho_bundle:
+    case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
+    case sys::fs::file_magic::macho_dsym_companion: {
       OwningPtr<Binary> ret(
         ObjectFile::createMachOObjectFile(scopedSource.take()));
       if (!ret)
@@ -84,15 +84,30 @@ error_code object::createBinary(MemoryBuffer *Source,
       Result.swap(ret);
       return object_error::success;
     }
-    case sys::COFF_FileType: {
-      OwningPtr<Binary> ret(new COFFObjectFile(scopedSource.take(), ec));
+    case sys::fs::file_magic::macho_universal_binary: {
+      OwningPtr<Binary> ret(new MachOUniversalBinary(scopedSource.take(), ec));
       if (ec) return ec;
       Result.swap(ret);
       return object_error::success;
     }
-    default: // Unrecognized object file format.
+    case sys::fs::file_magic::coff_object:
+    case sys::fs::file_magic::coff_import_library:
+    case sys::fs::file_magic::pecoff_executable: {
+      OwningPtr<Binary> ret(
+          ObjectFile::createCOFFObjectFile(scopedSource.take()));
+      if (!ret)
+        return object_error::invalid_file_type;
+      Result.swap(ret);
+      return object_error::success;
+    }
+    case sys::fs::file_magic::unknown:
+    case sys::fs::file_magic::bitcode:
+    case sys::fs::file_magic::windows_resource: {
+      // Unrecognized object file format.
       return object_error::invalid_file_type;
+    }
   }
+  llvm_unreachable("Unexpected Binary File Type");
 }
 
 error_code object::createBinary(StringRef Path, OwningPtr<Binary> &Result) {
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index 4ed129f..1f07cbb 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -2,9 +2,14 @@ add_llvm_library(LLVMObject
   Archive.cpp
   Binary.cpp
   COFFObjectFile.cpp
+  COFFYAML.cpp
+  ELF.cpp
   ELFObjectFile.cpp
+  ELFYAML.cpp
   Error.cpp
   MachOObjectFile.cpp
+  MachOUniversal.cpp
   Object.cpp
   ObjectFile.cpp
+  YAML.cpp
   )
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 70fec32..42066c3 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -16,6 +16,9 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
 
 using namespace llvm;
 using namespace object;
@@ -37,18 +40,19 @@ bool checkSize(const MemoryBuffer *m, error_code &ec, uint64_t size) {
   return true;
 }
 
-// Returns false if any bytes in [addr, addr + size) fall outsize of m.
-bool checkAddr(const MemoryBuffer *m,
-               error_code &ec,
-               uintptr_t addr,
-               uint64_t size) {
-  if (addr + size < addr ||
-      addr + size < size ||
-      addr + size > uintptr_t(m->getBufferEnd())) {
-    ec = object_error::unexpected_eof;
-    return false;
+// Sets Obj unless any bytes in [addr, addr + size) fall outsize of m.
+// Returns unexpected_eof if error.
+template<typename T>
+error_code getObject(const T *&Obj, const MemoryBuffer *M, const uint8_t *Ptr,
+                     const size_t Size = sizeof(T)) {
+  uintptr_t Addr = uintptr_t(Ptr);
+  if (Addr + Size < Addr ||
+      Addr + Size < Size ||
+      Addr + Size > uintptr_t(M->getBufferEnd())) {
+    return object_error::unexpected_eof;
   }
-  return true;
+  Obj = reinterpret_cast<const T *>(Addr);
+  return object_error::success;
 }
 }
 
@@ -58,12 +62,12 @@ const coff_symbol *COFFObjectFile::toSymb(DataRefImpl Symb) const {
 # ifndef NDEBUG
   // Verify that the symbol points to a valid entry in the symbol table.
   uintptr_t offset = uintptr_t(addr) - uintptr_t(base());
-  if (offset < Header->PointerToSymbolTable
-      || offset >= Header->PointerToSymbolTable
-         + (Header->NumberOfSymbols * sizeof(coff_symbol)))
+  if (offset < COFFHeader->PointerToSymbolTable
+      || offset >= COFFHeader->PointerToSymbolTable
+         + (COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
     report_fatal_error("Symbol was outside of symbol table.");
 
-  assert((offset - Header->PointerToSymbolTable) % sizeof(coff_symbol)
+  assert((offset - COFFHeader->PointerToSymbolTable) % sizeof(coff_symbol)
          == 0 && "Symbol did not point to the beginning of a symbol");
 # endif
 
@@ -76,7 +80,7 @@ const coff_section *COFFObjectFile::toSec(DataRefImpl Sec) const {
 # ifndef NDEBUG
   // Verify that the section points to a valid entry in the section table.
   if (addr < SectionTable
-      || addr >= (SectionTable + Header->NumberOfSections))
+      || addr >= (SectionTable + COFFHeader->NumberOfSections))
     report_fatal_error("Section was outside of section table.");
 
   uintptr_t offset = uintptr_t(addr) - uintptr_t(SectionTable);
@@ -108,10 +112,8 @@ error_code COFFObjectFile::getSymbolFileOffset(DataRefImpl Symb,
   const coff_section *Section = NULL;
   if (error_code ec = getSection(symb->SectionNumber, Section))
     return ec;
-  char Type;
-  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-    return ec;
-  if (Type == 'U' || Type == 'w')
+
+  if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
     Result = UnknownAddressOrSize;
   else if (Section)
     Result = Section->PointerToRawData + symb->Value;
@@ -126,10 +128,8 @@ error_code COFFObjectFile::getSymbolAddress(DataRefImpl Symb,
   const coff_section *Section = NULL;
   if (error_code ec = getSection(symb->SectionNumber, Section))
     return ec;
-  char Type;
-  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-    return ec;
-  if (Type == 'U' || Type == 'w')
+
+  if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
     Result = UnknownAddressOrSize;
   else if (Section)
     Result = Section->VirtualAddress + symb->Value;
@@ -149,12 +149,16 @@ error_code COFFObjectFile::getSymbolType(DataRefImpl Symb,
     if (symb->getComplexType() == COFF::IMAGE_SYM_DTYPE_FUNCTION) {
       Result = SymbolRef::ST_Function;
     } else {
-      char Type;
-      if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-        return ec;
-      if (Type == 'r' || Type == 'R') {
-        Result = SymbolRef::ST_Data;
+      uint32_t Characteristics = 0;
+      if (symb->SectionNumber > 0) {
+        const coff_section *Section = NULL;
+        if (error_code ec = getSection(symb->SectionNumber, Section))
+          return ec;
+        Characteristics = Section->Characteristics;
       }
+      if (Characteristics & COFF::IMAGE_SCN_MEM_READ &&
+          ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
+        Result = SymbolRef::ST_Data;
     }
   }
   return object_error::success;
@@ -193,10 +197,8 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
   const coff_section *Section = NULL;
   if (error_code ec = getSection(symb->SectionNumber, Section))
     return ec;
-  char Type;
-  if (error_code ec = getSymbolNMTypeChar(Symb, Type))
-    return ec;
-  if (Type == 'U' || Type == 'w')
+
+  if (symb->SectionNumber == COFF::IMAGE_SYM_UNDEFINED)
     Result = UnknownAddressOrSize;
   else if (Section)
     Result = Section->SizeOfRawData - symb->Value;
@@ -205,74 +207,6 @@ error_code COFFObjectFile::getSymbolSize(DataRefImpl Symb,
   return object_error::success;
 }
 
-error_code COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb,
-                                               char &Result) const {
-  const coff_symbol *symb = toSymb(Symb);
-  StringRef name;
-  if (error_code ec = getSymbolName(Symb, name))
-    return ec;
-  char ret = StringSwitch<char>(name)
-    .StartsWith(".debug", 'N')
-    .StartsWith(".sxdata", 'N')
-    .Default('?');
-
-  if (ret != '?') {
-    Result = ret;
-    return object_error::success;
-  }
-
-  uint32_t Characteristics = 0;
-  if (symb->SectionNumber > 0) {
-    const coff_section *Section = NULL;
-    if (error_code ec = getSection(symb->SectionNumber, Section))
-      return ec;
-    Characteristics = Section->Characteristics;
-  }
-
-  switch (symb->SectionNumber) {
-  case COFF::IMAGE_SYM_UNDEFINED:
-    // Check storage classes.
-    if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL) {
-      Result = 'w';
-      return object_error::success; // Don't do ::toupper.
-    } else if (symb->Value != 0) // Check for common symbols.
-      ret = 'c';
-    else
-      ret = 'u';
-    break;
-  case COFF::IMAGE_SYM_ABSOLUTE:
-    ret = 'a';
-    break;
-  case COFF::IMAGE_SYM_DEBUG:
-    ret = 'n';
-    break;
-  default:
-    // Check section type.
-    if (Characteristics & COFF::IMAGE_SCN_CNT_CODE)
-      ret = 't';
-    else if (  Characteristics & COFF::IMAGE_SCN_MEM_READ
-            && ~Characteristics & COFF::IMAGE_SCN_MEM_WRITE) // Read only.
-      ret = 'r';
-    else if (Characteristics & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
-      ret = 'd';
-    else if (Characteristics & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
-      ret = 'b';
-    else if (Characteristics & COFF::IMAGE_SCN_LNK_INFO)
-      ret = 'i';
-
-    // Check for section symbol.
-    else if (  symb->StorageClass == COFF::IMAGE_SYM_CLASS_STATIC
-            && symb->Value == 0)
-       ret = 's';
-  }
-
-  if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_EXTERNAL)
-    ret = ::toupper(static_cast<unsigned char>(ret));
-
-  Result = ret;
-  return object_error::success;
-}
-
 error_code COFFObjectFile::getSymbolSection(DataRefImpl Symb,
                                             section_iterator &Result) const {
   const coff_symbol *symb = toSymb(Symb);
@@ -403,7 +337,7 @@ error_code COFFObjectFile::sectionContainsSymbol(DataRefImpl Sec,
   return object_error::success;
 }
 
-relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+relocation_iterator COFFObjectFile::section_rel_begin(DataRefImpl Sec) const {
   const coff_section *sec = toSec(Sec);
   DataRefImpl ret;
   if (sec->NumberOfRelocations == 0)
@@ -414,7 +348,7 @@ relocation_iterator COFFObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
   return relocation_iterator(RelocationRef(ret, this));
 }
 
-relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Sec) const {
   const coff_section *sec = toSec(Sec);
   DataRefImpl ret;
   if (sec->NumberOfRelocations == 0)
@@ -428,86 +362,178 @@ relocation_iterator COFFObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
   return relocation_iterator(RelocationRef(ret, this));
 }
 
+// Initialize the pointer to the symbol table.
+error_code COFFObjectFile::initSymbolTablePtr() {
+  if (error_code ec = getObject(
+          SymbolTable, Data, base() + COFFHeader->PointerToSymbolTable,
+          COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
+    return ec;
+
+  // Find string table. The first four byte of the string table contains the
+  // total size of the string table, including the size field itself. If the
+  // string table is empty, the value of the first four byte would be 4.
+  const uint8_t *StringTableAddr =
+      base() + COFFHeader->PointerToSymbolTable +
+      COFFHeader->NumberOfSymbols * sizeof(coff_symbol);
+  const ulittle32_t *StringTableSizePtr;
+  if (error_code ec = getObject(StringTableSizePtr, Data, StringTableAddr))
+    return ec;
+  StringTableSize = *StringTableSizePtr;
+  if (error_code ec =
+      getObject(StringTable, Data, StringTableAddr, StringTableSize))
+    return ec;
+
+  // Check that the string table is null terminated if has any in it.
+  if (StringTableSize < 4 ||
+      (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0))
+    return  object_error::parse_failed;
+  return object_error::success;
+}
+
+// Returns the file offset for the given RVA.
+error_code COFFObjectFile::getRvaPtr(uint32_t Rva, uintptr_t &Res) const {
+  error_code ec;
+  for (section_iterator i = begin_sections(), e = end_sections(); i != e;
+       i.increment(ec)) {
+    if (ec)
+      return ec;
+    const coff_section *Section = getCOFFSection(i);
+    uint32_t SectionStart = Section->VirtualAddress;
+    uint32_t SectionEnd = Section->VirtualAddress + Section->VirtualSize;
+    if (SectionStart <= Rva && Rva < SectionEnd) {
+      uint32_t Offset = Rva - SectionStart;
+      Res = uintptr_t(base()) + Section->PointerToRawData + Offset;
+      return object_error::success;
+    }
+  }
+  return object_error::parse_failed;
+}
+
+// Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name
+// table entry.
+error_code COFFObjectFile::
+getHintName(uint32_t Rva, uint16_t &Hint, StringRef &Name) const {
+  uintptr_t IntPtr = 0;
+  if (error_code ec = getRvaPtr(Rva, IntPtr))
+    return ec;
+  const uint8_t *Ptr = reinterpret_cast<const uint8_t *>(IntPtr);
+  Hint = *reinterpret_cast<const ulittle16_t *>(Ptr);
+  Name = StringRef(reinterpret_cast<const char *>(Ptr + 2));
+  return object_error::success;
+}
+
+// Find the import table.
+error_code COFFObjectFile::initImportTablePtr() {
+  // First, we get the RVA of the import table. If the file lacks a pointer to
+  // the import table, do nothing.
+  const data_directory *DataEntry;
+  if (getDataDirectory(COFF::IMPORT_TABLE, DataEntry))
+    return object_error::success;
+
+  // Do nothing if the pointer to import table is NULL.
+  if (DataEntry->RelativeVirtualAddress == 0)
+    return object_error::success;
+
+  uint32_t ImportTableRva = DataEntry->RelativeVirtualAddress;
+  NumberOfImportDirectory = DataEntry->Size /
+      sizeof(import_directory_table_entry);
+
+  // Find the section that contains the RVA. This is needed because the RVA is
+  // the import table's memory address which is different from its file offset.
+  uintptr_t IntPtr = 0;
+  if (error_code ec = getRvaPtr(ImportTableRva, IntPtr))
+    return ec;
+  ImportDirectory = reinterpret_cast<
+      const import_directory_table_entry *>(IntPtr);
+
+  // It's an error if there's no section containing the Import Table RVA.
+  return object_error::parse_failed;
+}
+
 COFFObjectFile::COFFObjectFile(MemoryBuffer *Object, error_code &ec)
   : ObjectFile(Binary::ID_COFF, Object)
-  , Header(0)
+  , COFFHeader(0)
+  , PE32Header(0)
+  , DataDirectory(0)
   , SectionTable(0)
   , SymbolTable(0)
   , StringTable(0)
-  , StringTableSize(0) {
+  , StringTableSize(0)
+  , ImportDirectory(0)
+  , NumberOfImportDirectory(0) {
   // Check that we at least have enough room for a header.
   if (!checkSize(Data, ec, sizeof(coff_file_header))) return;
 
-  // The actual starting location of the COFF header in the file. This can be
-  // non-zero in PE/COFF files.
-  uint64_t HeaderStart = 0;
+  // The current location in the file where we are looking at.
+  uint64_t CurPtr = 0;
+
+  // PE header is optional and is present only in executables. If it exists,
+  // it is placed right after COFF header.
+  bool hasPEHeader = false;
 
   // Check if this is a PE/COFF file.
   if (base()[0] == 0x4d && base()[1] == 0x5a) {
     // PE/COFF, seek through MS-DOS compatibility stub and 4-byte
     // PE signature to find 'normal' COFF header.
     if (!checkSize(Data, ec, 0x3c + 8)) return;
-    HeaderStart = *reinterpret_cast<const ulittle16_t *>(base() + 0x3c);
-    // Check the PE header. ("PE\0\0")
-    if (std::memcmp(base() + HeaderStart, "PE\0\0", 4) != 0) {
+    CurPtr = *reinterpret_cast<const ulittle16_t *>(base() + 0x3c);
+    // Check the PE magic bytes. ("PE\0\0")
+    if (std::memcmp(base() + CurPtr, "PE\0\0", 4) != 0) {
       ec = object_error::parse_failed;
       return;
     }
-    HeaderStart += 4; // Skip the PE Header.
+    CurPtr += 4; // Skip the PE magic bytes.
+    hasPEHeader = true;
   }
 
-  Header = reinterpret_cast<const coff_file_header *>(base() + HeaderStart);
-  if (!checkAddr(Data, ec, uintptr_t(Header), sizeof(coff_file_header)))
-    return;
-
-  SectionTable =
-    reinterpret_cast<const coff_section *>( base()
-                                          + HeaderStart
-                                          + sizeof(coff_file_header)
-                                          + Header->SizeOfOptionalHeader);
-  if (!checkAddr(Data, ec, uintptr_t(SectionTable),
-                 Header->NumberOfSections * sizeof(coff_section)))
+  if ((ec = getObject(COFFHeader, Data, base() + CurPtr)))
     return;
+  CurPtr += sizeof(coff_file_header);
 
-  if (Header->PointerToSymbolTable != 0) {
-    SymbolTable =
-      reinterpret_cast<const coff_symbol *>(base()
-                                            + Header->PointerToSymbolTable);
-    if (!checkAddr(Data, ec, uintptr_t(SymbolTable),
-                   Header->NumberOfSymbols * sizeof(coff_symbol)))
+  if (hasPEHeader) {
+    if ((ec = getObject(PE32Header, Data, base() + CurPtr)))
       return;
+    if (PE32Header->Magic != 0x10b) {
+      // We only support PE32. If this is PE32 (not PE32+), the magic byte
+      // should be 0x10b. If this is not PE32, continue as if there's no PE
+      // header in this file.
+      PE32Header = 0;
+    } else if (PE32Header->NumberOfRvaAndSize > 0) {
+      const uint8_t *addr = base() + CurPtr + sizeof(pe32_header);
+      uint64_t size = sizeof(data_directory) * PE32Header->NumberOfRvaAndSize;
+      if ((ec = getObject(DataDirectory, Data, addr, size)))
+        return;
+    }
+    CurPtr += COFFHeader->SizeOfOptionalHeader;
+  }
 
-    // Find string table.
-    StringTable = reinterpret_cast<const char *>(base())
-                  + Header->PointerToSymbolTable
-                  + Header->NumberOfSymbols * sizeof(coff_symbol);
-    if (!checkAddr(Data, ec, uintptr_t(StringTable), sizeof(ulittle32_t)))
+  if (!COFFHeader->isImportLibrary())
+    if ((ec = getObject(SectionTable, Data, base() + CurPtr,
+                        COFFHeader->NumberOfSections * sizeof(coff_section))))
       return;
 
-    StringTableSize = *reinterpret_cast<const ulittle32_t *>(StringTable);
-    if (!checkAddr(Data, ec, uintptr_t(StringTable), StringTableSize))
+  // Initialize the pointer to the symbol table.
+  if (COFFHeader->PointerToSymbolTable != 0)
+    if ((ec = initSymbolTablePtr()))
       return;
-    // Check that the string table is null terminated if has any in it.
-    if (StringTableSize < 4
-        || (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0)) {
-      ec = object_error::parse_failed;
-      return;
-    }
-  }
+
+  // Initialize the pointer to the beginning of the import table.
+  if ((ec = initImportTablePtr()))
+    return;
 
   ec = object_error::success;
 }
 
 symbol_iterator COFFObjectFile::begin_symbols() const {
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(SymbolTable);
+  ret.p = reinterpret_cast<uintptr_t>(SymbolTable);
   return symbol_iterator(SymbolRef(ret, this));
 }
 
 symbol_iterator COFFObjectFile::end_symbols() const {
   // The symbol table ends where the string table begins.
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(StringTable);
+  ret.p = reinterpret_cast<uintptr_t>(StringTable);
   return symbol_iterator(SymbolRef(ret, this));
 }
 
@@ -536,16 +562,34 @@ StringRef COFFObjectFile::getLoadName() const {
   return "";
 }
 
+import_directory_iterator COFFObjectFile::import_directory_begin() const {
+  DataRefImpl Imp;
+  Imp.p = reinterpret_cast<uintptr_t>(ImportDirectory);
+  return import_directory_iterator(ImportDirectoryEntryRef(Imp, this));
+}
+
+import_directory_iterator COFFObjectFile::import_directory_end() const {
+  DataRefImpl Imp;
+  if (ImportDirectory) {
+    Imp.p = reinterpret_cast<uintptr_t>(
+        ImportDirectory + (NumberOfImportDirectory - 1));
+  } else {
+    Imp.p = 0;
+  }
+  return import_directory_iterator(ImportDirectoryEntryRef(Imp, this));
+}
 
 section_iterator COFFObjectFile::begin_sections() const {
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(SectionTable);
+  ret.p = reinterpret_cast<uintptr_t>(SectionTable);
   return section_iterator(SectionRef(ret, this));
 }
 
 section_iterator COFFObjectFile::end_sections() const {
   DataRefImpl ret;
-  ret.p = reinterpret_cast<intptr_t>(SectionTable + Header->NumberOfSections);
+  int numSections = COFFHeader->isImportLibrary()
+      ? 0 : COFFHeader->NumberOfSections;
+  ret.p = reinterpret_cast<uintptr_t>(SectionTable + numSections);
   return section_iterator(SectionRef(ret, this));
 }
 
@@ -554,7 +598,7 @@ uint8_t COFFObjectFile::getBytesInAddress() const {
 }
 
 StringRef COFFObjectFile::getFileFormatName() const {
-  switch(Header->Machine) {
+  switch(COFFHeader->Machine) {
   case COFF::IMAGE_FILE_MACHINE_I386:
     return "COFF-i386";
   case COFF::IMAGE_FILE_MACHINE_AMD64:
@@ -565,7 +609,7 @@ StringRef COFFObjectFile::getFileFormatName() const {
 }
 
 unsigned COFFObjectFile::getArch() const {
-  switch(Header->Machine) {
+  switch(COFFHeader->Machine) {
   case COFF::IMAGE_FILE_MACHINE_I386:
     return Triple::x86;
   case COFF::IMAGE_FILE_MACHINE_AMD64:
@@ -575,8 +619,28 @@ unsigned COFFObjectFile::getArch() const {
   }
 }
 
+// This method is kept here because lld uses this. As soon as we make
+// lld to use getCOFFHeader, this method will be removed.
 error_code COFFObjectFile::getHeader(const coff_file_header *&Res) const {
-  Res = Header;
+  return getCOFFHeader(Res);
+}
+
+error_code COFFObjectFile::getCOFFHeader(const coff_file_header *&Res) const {
+  Res = COFFHeader;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getPE32Header(const pe32_header *&Res) const {
+  Res = PE32Header;
+  return object_error::success;
+}
+
+error_code COFFObjectFile::getDataDirectory(uint32_t index,
+                                            const data_directory *&Res) const {
+  // Error if if there's no data directory or the index is out of range.
+  if (!DataDirectory || index > PE32Header->NumberOfRvaAndSize)
+    return object_error::parse_failed;
+  Res = &DataDirectory[index];
   return object_error::success;
 }
 
@@ -587,7 +651,7 @@ error_code COFFObjectFile::getSection(int32_t index,
       index == COFF::IMAGE_SYM_ABSOLUTE ||
       index == COFF::IMAGE_SYM_DEBUG)
     Result = NULL;
-  else if (index > 0 && index <= Header->NumberOfSections)
+  else if (index > 0 && index <= COFFHeader->NumberOfSections)
     // We already verified the section table data, so no need to check again.
     Result = SectionTable + (index - 1);
   else
@@ -608,7 +672,7 @@ error_code COFFObjectFile::getString(uint32_t offset,
 
 error_code COFFObjectFile::getSymbol(uint32_t index,
                                      const coff_symbol *&Result) const {
-  if (index < Header->NumberOfSymbols)
+  if (index < COFFHeader->NumberOfSymbols)
     Result = SymbolTable + index;
   else
     return object_error::parse_failed;
@@ -637,19 +701,19 @@ error_code COFFObjectFile::getSymbolName(const coff_symbol *symbol,
 ArrayRef<uint8_t> COFFObjectFile::getSymbolAuxData(
                                   const coff_symbol *symbol) const {
   const uint8_t *aux = NULL;
-  
+
   if ( symbol->NumberOfAuxSymbols > 0 ) {
   // AUX data comes immediately after the symbol in COFF
     aux = reinterpret_cast<const uint8_t *>(symbol + 1);
 # ifndef NDEBUG
     // Verify that the aux symbol points to a valid entry in the symbol table.
     uintptr_t offset = uintptr_t(aux) - uintptr_t(base());
-    if (offset < Header->PointerToSymbolTable
-        || offset >= Header->PointerToSymbolTable
-           + (Header->NumberOfSymbols * sizeof(coff_symbol)))
+    if (offset < COFFHeader->PointerToSymbolTable
+        || offset >= COFFHeader->PointerToSymbolTable
+           + (COFFHeader->NumberOfSymbols * sizeof(coff_symbol)))
       report_fatal_error("Aux Symbol data was outside of symbol table.");
 
-    assert((offset - Header->PointerToSymbolTable) % sizeof(coff_symbol)
+    assert((offset - COFFHeader->PointerToSymbolTable) % sizeof(coff_symbol)
          == 0 && "Aux Symbol data did not point to the beginning of a symbol");
 # endif
   }
@@ -712,13 +776,11 @@ error_code COFFObjectFile::getRelocationOffset(DataRefImpl Rel,
   Res = toRel(Rel)->VirtualAddress;
   return object_error::success;
 }
-error_code COFFObjectFile::getRelocationSymbol(DataRefImpl Rel,
-                                               SymbolRef &Res) const {
+symbol_iterator COFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
   const coff_relocation* R = toRel(Rel);
   DataRefImpl Symb;
   Symb.p = reinterpret_cast<uintptr_t>(SymbolTable + R->SymbolTableIndex);
-  Res = SymbolRef(Symb, this);
-  return object_error::success;
+  return symbol_iterator(SymbolRef(Symb, this));
 }
 error_code COFFObjectFile::getRelocationType(DataRefImpl Rel,
                                              uint64_t &Res) const {
@@ -740,7 +802,6 @@ const coff_relocation *COFFObjectFile::getCOFFRelocation(
   return toRel(It->getRawDataRefImpl());
 }
 
-
 #define LLVM_COFF_SWITCH_RELOC_TYPE_NAME(enum) \
   case COFF::enum: res = #enum; break;
 
@@ -748,7 +809,7 @@ error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
                                           SmallVectorImpl<char> &Result) const {
   const coff_relocation *reloc = toRel(Rel);
   StringRef res;
-  switch (Header->Machine) {
+  switch (COFFHeader->Machine) {
   case COFF::IMAGE_FILE_MACHINE_AMD64:
     switch (reloc->Type) {
     LLVM_COFF_SWITCH_RELOC_TYPE_NAME(IMAGE_REL_AMD64_ABSOLUTE);
@@ -798,11 +859,6 @@ error_code COFFObjectFile::getRelocationTypeName(DataRefImpl Rel,
 
 #undef LLVM_COFF_SWITCH_RELOC_TYPE_NAME
 
-error_code COFFObjectFile::getRelocationAdditionalInfo(DataRefImpl Rel,
-                                                       int64_t &Res) const {
-  Res = 0;
-  return object_error::success;
-}
 error_code COFFObjectFile::getRelocationValueString(DataRefImpl Rel,
                                           SmallVectorImpl<char> &Result) const {
   const coff_relocation *reloc = toRel(Rel);
@@ -826,6 +882,52 @@ error_code COFFObjectFile::getLibraryPath(DataRefImpl LibData,
   report_fatal_error("getLibraryPath not implemented in COFFObjectFile");
 }
 
+bool ImportDirectoryEntryRef::
+operator==(const ImportDirectoryEntryRef &Other) const {
+  return ImportDirectoryPimpl == Other.ImportDirectoryPimpl;
+}
+
+static const import_directory_table_entry *toImportEntry(DataRefImpl Imp) {
+  return reinterpret_cast<const import_directory_table_entry *>(Imp.p);
+}
+
+error_code
+ImportDirectoryEntryRef::getNext(ImportDirectoryEntryRef &Result) const {
+  const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+  Dir += 1;
+  DataRefImpl Next;
+  Next.p = reinterpret_cast<uintptr_t>(Dir);
+  Result = ImportDirectoryEntryRef(Next, OwningObject);
+  return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::
+getImportTableEntry(const import_directory_table_entry *&Result) const {
+  Result = toImportEntry(ImportDirectoryPimpl);
+  return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::getName(StringRef &Result) const {
+  const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+  uintptr_t IntPtr = 0;
+  if (error_code ec = OwningObject->getRvaPtr(Dir->NameRVA, IntPtr))
+    return ec;
+  const char *Ptr = reinterpret_cast<const char *>(IntPtr);
+  Result = StringRef(Ptr);
+  return object_error::success;
+}
+
+error_code ImportDirectoryEntryRef::getImportLookupEntry(
+    const import_lookup_table_entry32 *&Result) const {
+  const import_directory_table_entry *Dir = toImportEntry(ImportDirectoryPimpl);
+  uintptr_t IntPtr = 0;
+  if (error_code ec = OwningObject->getRvaPtr(
+          Dir->ImportLookupTableRVA, IntPtr))
+    return ec;
+  Result = reinterpret_cast<const import_lookup_table_entry32 *>(IntPtr);
+  return object_error::success;
+}
+
 namespace llvm {
 
   ObjectFile *ObjectFile::createCOFFObjectFile(MemoryBuffer *Object) {
diff --git a/lib/Object/COFFYAML.cpp b/lib/Object/COFFYAML.cpp
new file mode 100644
index 0000000..e549b4e
--- /dev/null
+++ b/lib/Object/COFFYAML.cpp
@@ -0,0 +1,281 @@
+//===- COFFYAML.cpp - COFF YAMLIO implementation --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of COFF.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/COFFYAML.h"
+
+#define ECase(X) IO.enumCase(Value, #X, COFF::X);
+namespace llvm {
+
+namespace COFFYAML {
+Section::Section() { memset(&Header, 0, sizeof(COFF::section)); }
+Symbol::Symbol() { memset(&Header, 0, sizeof(COFF::symbol)); }
+Object::Object() { memset(&Header, 0, sizeof(COFF::header)); }
+}
+
+namespace yaml {
+void ScalarEnumerationTraits<COFF::MachineTypes>::enumeration(
+    IO &IO, COFF::MachineTypes &Value) {
+  ECase(IMAGE_FILE_MACHINE_UNKNOWN);
+  ECase(IMAGE_FILE_MACHINE_AM33);
+  ECase(IMAGE_FILE_MACHINE_AMD64);
+  ECase(IMAGE_FILE_MACHINE_ARM);
+  ECase(IMAGE_FILE_MACHINE_ARMV7);
+  ECase(IMAGE_FILE_MACHINE_EBC);
+  ECase(IMAGE_FILE_MACHINE_I386);
+  ECase(IMAGE_FILE_MACHINE_IA64);
+  ECase(IMAGE_FILE_MACHINE_M32R);
+  ECase(IMAGE_FILE_MACHINE_MIPS16);
+  ECase(IMAGE_FILE_MACHINE_MIPSFPU);
+  ECase(IMAGE_FILE_MACHINE_MIPSFPU16);
+  ECase(IMAGE_FILE_MACHINE_POWERPC);
+  ECase(IMAGE_FILE_MACHINE_POWERPCFP);
+  ECase(IMAGE_FILE_MACHINE_R4000);
+  ECase(IMAGE_FILE_MACHINE_SH3);
+  ECase(IMAGE_FILE_MACHINE_SH3DSP);
+  ECase(IMAGE_FILE_MACHINE_SH4);
+  ECase(IMAGE_FILE_MACHINE_SH5);
+  ECase(IMAGE_FILE_MACHINE_THUMB);
+  ECase(IMAGE_FILE_MACHINE_WCEMIPSV2);
+}
+
+void ScalarEnumerationTraits<COFF::SymbolBaseType>::enumeration(
+    IO &IO, COFF::SymbolBaseType &Value) {
+  ECase(IMAGE_SYM_TYPE_NULL);
+  ECase(IMAGE_SYM_TYPE_VOID);
+  ECase(IMAGE_SYM_TYPE_CHAR);
+  ECase(IMAGE_SYM_TYPE_SHORT);
+  ECase(IMAGE_SYM_TYPE_INT);
+  ECase(IMAGE_SYM_TYPE_LONG);
+  ECase(IMAGE_SYM_TYPE_FLOAT);
+  ECase(IMAGE_SYM_TYPE_DOUBLE);
+  ECase(IMAGE_SYM_TYPE_STRUCT);
+  ECase(IMAGE_SYM_TYPE_UNION);
+  ECase(IMAGE_SYM_TYPE_ENUM);
+  ECase(IMAGE_SYM_TYPE_MOE);
+  ECase(IMAGE_SYM_TYPE_BYTE);
+  ECase(IMAGE_SYM_TYPE_WORD);
+  ECase(IMAGE_SYM_TYPE_UINT);
+  ECase(IMAGE_SYM_TYPE_DWORD);
+}
+
+void ScalarEnumerationTraits<COFF::SymbolStorageClass>::enumeration(
+    IO &IO, COFF::SymbolStorageClass &Value) {
+  ECase(IMAGE_SYM_CLASS_END_OF_FUNCTION);
+  ECase(IMAGE_SYM_CLASS_NULL);
+  ECase(IMAGE_SYM_CLASS_AUTOMATIC);
+  ECase(IMAGE_SYM_CLASS_EXTERNAL);
+  ECase(IMAGE_SYM_CLASS_STATIC);
+  ECase(IMAGE_SYM_CLASS_REGISTER);
+  ECase(IMAGE_SYM_CLASS_EXTERNAL_DEF);
+  ECase(IMAGE_SYM_CLASS_LABEL);
+  ECase(IMAGE_SYM_CLASS_UNDEFINED_LABEL);
+  ECase(IMAGE_SYM_CLASS_MEMBER_OF_STRUCT);
+  ECase(IMAGE_SYM_CLASS_ARGUMENT);
+  ECase(IMAGE_SYM_CLASS_STRUCT_TAG);
+  ECase(IMAGE_SYM_CLASS_MEMBER_OF_UNION);
+  ECase(IMAGE_SYM_CLASS_UNION_TAG);
+  ECase(IMAGE_SYM_CLASS_TYPE_DEFINITION);
+  ECase(IMAGE_SYM_CLASS_UNDEFINED_STATIC);
+  ECase(IMAGE_SYM_CLASS_ENUM_TAG);
+  ECase(IMAGE_SYM_CLASS_MEMBER_OF_ENUM);
+  ECase(IMAGE_SYM_CLASS_REGISTER_PARAM);
+  ECase(IMAGE_SYM_CLASS_BIT_FIELD);
+  ECase(IMAGE_SYM_CLASS_BLOCK);
+  ECase(IMAGE_SYM_CLASS_FUNCTION);
+  ECase(IMAGE_SYM_CLASS_END_OF_STRUCT);
+  ECase(IMAGE_SYM_CLASS_FILE);
+  ECase(IMAGE_SYM_CLASS_SECTION);
+  ECase(IMAGE_SYM_CLASS_WEAK_EXTERNAL);
+  ECase(IMAGE_SYM_CLASS_CLR_TOKEN);
+}
+
+void ScalarEnumerationTraits<COFF::SymbolComplexType>::enumeration(
+    IO &IO, COFF::SymbolComplexType &Value) {
+  ECase(IMAGE_SYM_DTYPE_NULL);
+  ECase(IMAGE_SYM_DTYPE_POINTER);
+  ECase(IMAGE_SYM_DTYPE_FUNCTION);
+  ECase(IMAGE_SYM_DTYPE_ARRAY);
+}
+
+void ScalarEnumerationTraits<COFF::RelocationTypeX86>::enumeration(
+    IO &IO, COFF::RelocationTypeX86 &Value) {
+  ECase(IMAGE_REL_I386_ABSOLUTE);
+  ECase(IMAGE_REL_I386_DIR16);
+  ECase(IMAGE_REL_I386_REL16);
+  ECase(IMAGE_REL_I386_DIR32);
+  ECase(IMAGE_REL_I386_DIR32NB);
+  ECase(IMAGE_REL_I386_SEG12);
+  ECase(IMAGE_REL_I386_SECTION);
+  ECase(IMAGE_REL_I386_SECREL);
+  ECase(IMAGE_REL_I386_TOKEN);
+  ECase(IMAGE_REL_I386_SECREL7);
+  ECase(IMAGE_REL_I386_REL32);
+  ECase(IMAGE_REL_AMD64_ABSOLUTE);
+  ECase(IMAGE_REL_AMD64_ADDR64);
+  ECase(IMAGE_REL_AMD64_ADDR32);
+  ECase(IMAGE_REL_AMD64_ADDR32NB);
+  ECase(IMAGE_REL_AMD64_REL32);
+  ECase(IMAGE_REL_AMD64_REL32_1);
+  ECase(IMAGE_REL_AMD64_REL32_2);
+  ECase(IMAGE_REL_AMD64_REL32_3);
+  ECase(IMAGE_REL_AMD64_REL32_4);
+  ECase(IMAGE_REL_AMD64_REL32_5);
+  ECase(IMAGE_REL_AMD64_SECTION);
+  ECase(IMAGE_REL_AMD64_SECREL);
+  ECase(IMAGE_REL_AMD64_SECREL7);
+  ECase(IMAGE_REL_AMD64_TOKEN);
+  ECase(IMAGE_REL_AMD64_SREL32);
+  ECase(IMAGE_REL_AMD64_PAIR);
+  ECase(IMAGE_REL_AMD64_SSPAN32);
+}
+#undef ECase
+
+#define BCase(X) IO.bitSetCase(Value, #X, COFF::X);
+void ScalarBitSetTraits<COFF::Characteristics>::bitset(
+    IO &IO, COFF::Characteristics &Value) {
+  BCase(IMAGE_FILE_RELOCS_STRIPPED);
+  BCase(IMAGE_FILE_EXECUTABLE_IMAGE);
+  BCase(IMAGE_FILE_LINE_NUMS_STRIPPED);
+  BCase(IMAGE_FILE_LOCAL_SYMS_STRIPPED);
+  BCase(IMAGE_FILE_AGGRESSIVE_WS_TRIM);
+  BCase(IMAGE_FILE_LARGE_ADDRESS_AWARE);
+  BCase(IMAGE_FILE_BYTES_REVERSED_LO);
+  BCase(IMAGE_FILE_32BIT_MACHINE);
+  BCase(IMAGE_FILE_DEBUG_STRIPPED);
+  BCase(IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP);
+  BCase(IMAGE_FILE_NET_RUN_FROM_SWAP);
+  BCase(IMAGE_FILE_SYSTEM);
+  BCase(IMAGE_FILE_DLL);
+  BCase(IMAGE_FILE_UP_SYSTEM_ONLY);
+  BCase(IMAGE_FILE_BYTES_REVERSED_HI);
+}
+
+void ScalarBitSetTraits<COFF::SectionCharacteristics>::bitset(
+    IO &IO, COFF::SectionCharacteristics &Value) {
+  BCase(IMAGE_SCN_TYPE_NO_PAD);
+  BCase(IMAGE_SCN_CNT_CODE);
+  BCase(IMAGE_SCN_CNT_INITIALIZED_DATA);
+  BCase(IMAGE_SCN_CNT_UNINITIALIZED_DATA);
+  BCase(IMAGE_SCN_LNK_OTHER);
+  BCase(IMAGE_SCN_LNK_INFO);
+  BCase(IMAGE_SCN_LNK_REMOVE);
+  BCase(IMAGE_SCN_LNK_COMDAT);
+  BCase(IMAGE_SCN_GPREL);
+  BCase(IMAGE_SCN_MEM_PURGEABLE);
+  BCase(IMAGE_SCN_MEM_16BIT);
+  BCase(IMAGE_SCN_MEM_LOCKED);
+  BCase(IMAGE_SCN_MEM_PRELOAD);
+  BCase(IMAGE_SCN_LNK_NRELOC_OVFL);
+  BCase(IMAGE_SCN_MEM_DISCARDABLE);
+  BCase(IMAGE_SCN_MEM_NOT_CACHED);
+  BCase(IMAGE_SCN_MEM_NOT_PAGED);
+  BCase(IMAGE_SCN_MEM_SHARED);
+  BCase(IMAGE_SCN_MEM_EXECUTE);
+  BCase(IMAGE_SCN_MEM_READ);
+  BCase(IMAGE_SCN_MEM_WRITE);
+}
+#undef BCase
+
+namespace {
+struct NSectionCharacteristics {
+  NSectionCharacteristics(IO &)
+      : Characteristics(COFF::SectionCharacteristics(0)) {}
+  NSectionCharacteristics(IO &, uint32_t C)
+      : Characteristics(COFF::SectionCharacteristics(C)) {}
+  uint32_t denormalize(IO &) { return Characteristics; }
+  COFF::SectionCharacteristics Characteristics;
+};
+
+struct NStorageClass {
+  NStorageClass(IO &) : StorageClass(COFF::SymbolStorageClass(0)) {}
+  NStorageClass(IO &, uint8_t S) : StorageClass(COFF::SymbolStorageClass(S)) {}
+  uint8_t denormalize(IO &) { return StorageClass; }
+
+  COFF::SymbolStorageClass StorageClass;
+};
+
+struct NMachine {
+  NMachine(IO &) : Machine(COFF::MachineTypes(0)) {}
+  NMachine(IO &, uint16_t M) : Machine(COFF::MachineTypes(M)) {}
+  uint16_t denormalize(IO &) { return Machine; }
+  COFF::MachineTypes Machine;
+};
+
+struct NHeaderCharacteristics {
+  NHeaderCharacteristics(IO &) : Characteristics(COFF::Characteristics(0)) {}
+  NHeaderCharacteristics(IO &, uint16_t C)
+      : Characteristics(COFF::Characteristics(C)) {}
+  uint16_t denormalize(IO &) { return Characteristics; }
+
+  COFF::Characteristics Characteristics;
+};
+
+struct NType {
+  NType(IO &) : Type(COFF::RelocationTypeX86(0)) {}
+  NType(IO &, uint16_t T) : Type(COFF::RelocationTypeX86(T)) {}
+  uint16_t denormalize(IO &) { return Type; }
+  COFF::RelocationTypeX86 Type;
+};
+
+}
+
+void MappingTraits<COFFYAML::Relocation>::mapping(IO &IO,
+                                                  COFFYAML::Relocation &Rel) {
+  MappingNormalization<NType, uint16_t> NT(IO, Rel.Type);
+
+  IO.mapRequired("VirtualAddress", Rel.VirtualAddress);
+  IO.mapRequired("SymbolName", Rel.SymbolName);
+  IO.mapRequired("Type", NT->Type);
+}
+
+void MappingTraits<COFF::header>::mapping(IO &IO, COFF::header &H) {
+  MappingNormalization<NMachine, uint16_t> NM(IO, H.Machine);
+  MappingNormalization<NHeaderCharacteristics, uint16_t> NC(IO,
+                                                            H.Characteristics);
+
+  IO.mapRequired("Machine", NM->Machine);
+  IO.mapOptional("Characteristics", NC->Characteristics);
+}
+
+void MappingTraits<COFFYAML::Symbol>::mapping(IO &IO, COFFYAML::Symbol &S) {
+  MappingNormalization<NStorageClass, uint8_t> NS(IO, S.Header.StorageClass);
+
+  IO.mapRequired("Name", S.Name);
+  IO.mapRequired("Value", S.Header.Value);
+  IO.mapRequired("SectionNumber", S.Header.SectionNumber);
+  IO.mapRequired("SimpleType", S.SimpleType);
+  IO.mapRequired("ComplexType", S.ComplexType);
+  IO.mapRequired("StorageClass", NS->StorageClass);
+  IO.mapOptional("NumberOfAuxSymbols", S.Header.NumberOfAuxSymbols,
+                 (uint8_t) 0);
+  IO.mapOptional("AuxiliaryData", S.AuxiliaryData, object::yaml::BinaryRef());
+}
+
+void MappingTraits<COFFYAML::Section>::mapping(IO &IO, COFFYAML::Section &Sec) {
+  MappingNormalization<NSectionCharacteristics, uint32_t> NC(
+      IO, Sec.Header.Characteristics);
+  IO.mapRequired("Name", Sec.Name);
+  IO.mapRequired("Characteristics", NC->Characteristics);
+  IO.mapOptional("Alignment", Sec.Alignment);
+  IO.mapRequired("SectionData", Sec.SectionData);
+  IO.mapOptional("Relocations", Sec.Relocations);
+}
+
+void MappingTraits<COFFYAML::Object>::mapping(IO &IO, COFFYAML::Object &Obj) {
+  IO.mapRequired("header", Obj.Header);
+  IO.mapRequired("sections", Obj.Sections);
+  IO.mapRequired("symbols", Obj.Symbols);
+}
+
+}
+}
diff --git a/lib/Object/ELF.cpp b/lib/Object/ELF.cpp
new file mode 100644
index 0000000..7c80d41
--- /dev/null
+++ b/lib/Object/ELF.cpp
@@ -0,0 +1,714 @@
+//===- ELF.cpp - ELF object file implementation -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELF.h"
+
+namespace llvm {
+namespace object {
+
+#define LLVM_ELF_SWITCH_RELOC_TYPE_NAME(enum)                                  \
+  case ELF::enum:                                                              \
+    return #enum;                                                              \
+
+StringRef getELFRelocationTypeName(uint32_t Machine, uint32_t Type) {
+  switch (Machine) {
+  case ELF::EM_X86_64:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_32S);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPMOD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSGD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSLD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_DTPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTTPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PC64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPCREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPLT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_PLTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_SIZE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_GOTPC32_TLSDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_TLSDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_X86_64_IRELATIVE);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_386:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_GOTPC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_32PLT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTIE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_PC8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_PUSH);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GD_POP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_PUSH);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDM_POP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LDO_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_IE_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_LE_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DTPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_TPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_GOTDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_TLS_DESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_386_IRELATIVE);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_MIPS:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_26);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_LITERAL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT5);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SHIFT6);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_DISP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_OFST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SUB);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_A);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_INSERT_B);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_DELETE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_CALL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_SCN_DISP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_REL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_ADD_IMMEDIATE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_PJUMP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_RELGOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JALR);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPMOD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_LDM);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_DTPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_GOTTPREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_TLS_TPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_26_S1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_PC16_S1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_CALL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_DISP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_GOT_OFST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_DTPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MICROMIPS_TLS_TPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_MIPS_NUM);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_AARCH64:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ABS16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_PREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G2_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_UABS_G3);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_MOVW_SABS_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD_PREL_LO19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_LO21);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_PREL_PG_HI21);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADD_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST8_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TSTBR14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CONDBR19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_JUMP26);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_CALL26);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST16_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST32_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST64_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LDST128_ABS_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_ADR_GOT_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_LD64_GOT_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_HI12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSIE_LD_GOTTPREL_PREL19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_MOVW_TPREL_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_HI12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_ADD_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADR_PAGE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_LD64_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_ADD_LO12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_AARCH64_TLSDESC_CALL);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_ARM:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PC24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ABS5);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BREL_ADJ);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_SWI8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_XPC25);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_XPC22);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_TPOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_7_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_15_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_23_15);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SBREL_11_0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_19_12_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_27_20_CK);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL31);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_V4BX);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PREL31);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_ABS_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_PREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_ABS_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_PREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP19);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP6);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ALU_PREL_11_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32_NOI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32_NOI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL_NC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GOTDESC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESCSEQ);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_CALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_ABS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_PREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTRELAX);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTENTRY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTINHERIT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP11);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDM32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE12GP);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_3);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_4);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_5);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_6);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_7);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_9);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_10);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_11);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_13);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_15);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ME_TOO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ32);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_HEXAGON:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_0);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_1);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_2);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GPREL16_3);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_HL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B32_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B22_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B15_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B13_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B9_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_B7_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_12_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_10_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_9_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_8_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_7_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_32_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_JMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_PLT_B22_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPMOD_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_PLT_B22_PCREL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_LO16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_HI16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_6_PCREL_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOTREL_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GOT_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_DTPREL_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_GD_GOT_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_IE_GOT_11_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_32_6_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_16_X);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_TPREL_11_X);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_PPC:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_ADDR14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPMOD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_DTPREL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSGD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TLSLD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_GOT_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSGD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_TLSLD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC_REL16_HA);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_PPC64:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL24);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL14_BRNTAKEN);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHERA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_HIGHESTA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_ADDR16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TOC16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPMOD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSGD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TLSLD16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_TPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_GOT_DTPREL16_HA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHERA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TPREL16_HIGHESTA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_LO_DS);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHER);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHERA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHEST);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_DTPREL16_HIGHESTA);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSGD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_TLSLD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_LO);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HI);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_PPC64_REL16_HA);
+    default:
+      break;
+    }
+    break;
+  case ELF::EM_S390:
+    switch (Type) {
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_NONE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_8);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_COPY);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GLOB_DAT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_JMP_SLOT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_RELATIVE);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPC);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC16DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT16DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC32DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT32DBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPCDBL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PC64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTENT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLTENT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF16);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_PLTOFF64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LOAD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GDCALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDCALL);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GD64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE12);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDM64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_IEENT);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LE64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO32);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_LDO64);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPMOD);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_DTPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_TPOFF);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOT20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_GOTPLT20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_TLS_GOTIE20);
+      LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_390_IRELATIVE);
+    default:
+      break;
+    }
+    break;
+  default:
+    break;
+  }
+  return "Unknown";
+}
+
+#undef LLVM_ELF_SWITCH_RELOC_TYPE_NAME
+
+} // end namespace object
+} // end namespace llvm
diff --git a/lib/Object/ELFObjectFile.cpp b/lib/Object/ELFObjectFile.cpp
index cfe0eb4..15bc6be 100644
--- a/lib/Object/ELFObjectFile.cpp
+++ b/lib/Object/ELFObjectFile.cpp
@@ -11,11 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
 #include "llvm/Support/MathExtras.h"
 
 namespace llvm {
-
 using namespace object;
 
 // Creates an in-memory object-file by default: createELFObjectFile(Buffer)
@@ -24,7 +23,7 @@ ObjectFile *ObjectFile::createELFObjectFile(MemoryBuffer *Object) {
   error_code ec;
 
   std::size_t MaxAlignment =
-    1ULL << CountTrailingZeros_64(uintptr_t(Object->getBufferStart()));
+    1ULL << countTrailingZeros(uintptr_t(Object->getBufferStart()));
 
   if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB)
 #if !LLVM_IS_UNALIGNED_ACCESS_FAST
diff --git a/lib/Object/ELFYAML.cpp b/lib/Object/ELFYAML.cpp
new file mode 100644
index 0000000..2f35cf9
--- /dev/null
+++ b/lib/Object/ELFYAML.cpp
@@ -0,0 +1,338 @@
+//===- ELFYAML.cpp - ELF YAMLIO implementation ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of ELF.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/ELFYAML.h"
+
+namespace llvm {
+namespace yaml {
+
+void
+ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(IO &IO,
+                                                      ELFYAML::ELF_ET &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  ECase(ET_NONE)
+  ECase(ET_REL)
+  ECase(ET_EXEC)
+  ECase(ET_DYN)
+  ECase(ET_CORE)
+#undef ECase
+}
+
+void
+ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(IO &IO,
+                                                      ELFYAML::ELF_EM &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  ECase(EM_NONE)
+  ECase(EM_M32)
+  ECase(EM_SPARC)
+  ECase(EM_386)
+  ECase(EM_68K)
+  ECase(EM_88K)
+  ECase(EM_486)
+  ECase(EM_860)
+  ECase(EM_MIPS)
+  ECase(EM_S370)
+  ECase(EM_MIPS_RS3_LE)
+  ECase(EM_PARISC)
+  ECase(EM_VPP500)
+  ECase(EM_SPARC32PLUS)
+  ECase(EM_960)
+  ECase(EM_PPC)
+  ECase(EM_PPC64)
+  ECase(EM_S390)
+  ECase(EM_SPU)
+  ECase(EM_V800)
+  ECase(EM_FR20)
+  ECase(EM_RH32)
+  ECase(EM_RCE)
+  ECase(EM_ARM)
+  ECase(EM_ALPHA)
+  ECase(EM_SH)
+  ECase(EM_SPARCV9)
+  ECase(EM_TRICORE)
+  ECase(EM_ARC)
+  ECase(EM_H8_300)
+  ECase(EM_H8_300H)
+  ECase(EM_H8S)
+  ECase(EM_H8_500)
+  ECase(EM_IA_64)
+  ECase(EM_MIPS_X)
+  ECase(EM_COLDFIRE)
+  ECase(EM_68HC12)
+  ECase(EM_MMA)
+  ECase(EM_PCP)
+  ECase(EM_NCPU)
+  ECase(EM_NDR1)
+  ECase(EM_STARCORE)
+  ECase(EM_ME16)
+  ECase(EM_ST100)
+  ECase(EM_TINYJ)
+  ECase(EM_X86_64)
+  ECase(EM_PDSP)
+  ECase(EM_PDP10)
+  ECase(EM_PDP11)
+  ECase(EM_FX66)
+  ECase(EM_ST9PLUS)
+  ECase(EM_ST7)
+  ECase(EM_68HC16)
+  ECase(EM_68HC11)
+  ECase(EM_68HC08)
+  ECase(EM_68HC05)
+  ECase(EM_SVX)
+  ECase(EM_ST19)
+  ECase(EM_VAX)
+  ECase(EM_CRIS)
+  ECase(EM_JAVELIN)
+  ECase(EM_FIREPATH)
+  ECase(EM_ZSP)
+  ECase(EM_MMIX)
+  ECase(EM_HUANY)
+  ECase(EM_PRISM)
+  ECase(EM_AVR)
+  ECase(EM_FR30)
+  ECase(EM_D10V)
+  ECase(EM_D30V)
+  ECase(EM_V850)
+  ECase(EM_M32R)
+  ECase(EM_MN10300)
+  ECase(EM_MN10200)
+  ECase(EM_PJ)
+  ECase(EM_OPENRISC)
+  ECase(EM_ARC_COMPACT)
+  ECase(EM_XTENSA)
+  ECase(EM_VIDEOCORE)
+  ECase(EM_TMM_GPP)
+  ECase(EM_NS32K)
+  ECase(EM_TPC)
+  ECase(EM_SNP1K)
+  ECase(EM_ST200)
+  ECase(EM_IP2K)
+  ECase(EM_MAX)
+  ECase(EM_CR)
+  ECase(EM_F2MC16)
+  ECase(EM_MSP430)
+  ECase(EM_BLACKFIN)
+  ECase(EM_SE_C33)
+  ECase(EM_SEP)
+  ECase(EM_ARCA)
+  ECase(EM_UNICORE)
+  ECase(EM_EXCESS)
+  ECase(EM_DXP)
+  ECase(EM_ALTERA_NIOS2)
+  ECase(EM_CRX)
+  ECase(EM_XGATE)
+  ECase(EM_C166)
+  ECase(EM_M16C)
+  ECase(EM_DSPIC30F)
+  ECase(EM_CE)
+  ECase(EM_M32C)
+  ECase(EM_TSK3000)
+  ECase(EM_RS08)
+  ECase(EM_SHARC)
+  ECase(EM_ECOG2)
+  ECase(EM_SCORE7)
+  ECase(EM_DSP24)
+  ECase(EM_VIDEOCORE3)
+  ECase(EM_LATTICEMICO32)
+  ECase(EM_SE_C17)
+  ECase(EM_TI_C6000)
+  ECase(EM_TI_C2000)
+  ECase(EM_TI_C5500)
+  ECase(EM_MMDSP_PLUS)
+  ECase(EM_CYPRESS_M8C)
+  ECase(EM_R32C)
+  ECase(EM_TRIMEDIA)
+  ECase(EM_HEXAGON)
+  ECase(EM_8051)
+  ECase(EM_STXP7X)
+  ECase(EM_NDS32)
+  ECase(EM_ECOG1)
+  ECase(EM_ECOG1X)
+  ECase(EM_MAXQ30)
+  ECase(EM_XIMO16)
+  ECase(EM_MANIK)
+  ECase(EM_CRAYNV2)
+  ECase(EM_RX)
+  ECase(EM_METAG)
+  ECase(EM_MCST_ELBRUS)
+  ECase(EM_ECOG16)
+  ECase(EM_CR16)
+  ECase(EM_ETPU)
+  ECase(EM_SLE9X)
+  ECase(EM_L10M)
+  ECase(EM_K10M)
+  ECase(EM_AARCH64)
+  ECase(EM_AVR32)
+  ECase(EM_STM8)
+  ECase(EM_TILE64)
+  ECase(EM_TILEPRO)
+  ECase(EM_CUDA)
+  ECase(EM_TILEGX)
+  ECase(EM_CLOUDSHIELD)
+  ECase(EM_COREA_1ST)
+  ECase(EM_COREA_2ND)
+  ECase(EM_ARC_COMPACT2)
+  ECase(EM_OPEN8)
+  ECase(EM_RL78)
+  ECase(EM_VIDEOCORE5)
+  ECase(EM_78KOR)
+  ECase(EM_56800EX)
+#undef ECase
+}
+
+void ScalarEnumerationTraits<ELFYAML::ELF_ELFCLASS>::enumeration(
+    IO &IO, ELFYAML::ELF_ELFCLASS &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  // Since the semantics of ELFCLASSNONE is "invalid", just don't accept it
+  // here.
+  ECase(ELFCLASS32)
+  ECase(ELFCLASS64)
+#undef ECase
+}
+
+void ScalarEnumerationTraits<ELFYAML::ELF_ELFDATA>::enumeration(
+    IO &IO, ELFYAML::ELF_ELFDATA &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  // Since the semantics of ELFDATANONE is "invalid", just don't accept it
+  // here.
+  ECase(ELFDATA2LSB)
+  ECase(ELFDATA2MSB)
+#undef ECase
+}
+
+void ScalarEnumerationTraits<ELFYAML::ELF_ELFOSABI>::enumeration(
+    IO &IO, ELFYAML::ELF_ELFOSABI &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  ECase(ELFOSABI_NONE)
+  ECase(ELFOSABI_HPUX)
+  ECase(ELFOSABI_NETBSD)
+  ECase(ELFOSABI_GNU)
+  ECase(ELFOSABI_GNU)
+  ECase(ELFOSABI_HURD)
+  ECase(ELFOSABI_SOLARIS)
+  ECase(ELFOSABI_AIX)
+  ECase(ELFOSABI_IRIX)
+  ECase(ELFOSABI_FREEBSD)
+  ECase(ELFOSABI_TRU64)
+  ECase(ELFOSABI_MODESTO)
+  ECase(ELFOSABI_OPENBSD)
+  ECase(ELFOSABI_OPENVMS)
+  ECase(ELFOSABI_NSK)
+  ECase(ELFOSABI_AROS)
+  ECase(ELFOSABI_FENIXOS)
+  ECase(ELFOSABI_C6000_ELFABI)
+  ECase(ELFOSABI_C6000_LINUX)
+  ECase(ELFOSABI_ARM)
+  ECase(ELFOSABI_STANDALONE)
+#undef ECase
+}
+
+void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
+    IO &IO, ELFYAML::ELF_SHT &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  ECase(SHT_NULL)
+  ECase(SHT_PROGBITS)
+  // No SHT_SYMTAB. Use the top-level `Symbols` key instead.
+  // FIXME: Issue a diagnostic with this information.
+  ECase(SHT_STRTAB)
+  ECase(SHT_RELA)
+  ECase(SHT_HASH)
+  ECase(SHT_DYNAMIC)
+  ECase(SHT_NOTE)
+  ECase(SHT_NOBITS)
+  ECase(SHT_REL)
+  ECase(SHT_SHLIB)
+  ECase(SHT_DYNSYM)
+  ECase(SHT_INIT_ARRAY)
+  ECase(SHT_FINI_ARRAY)
+  ECase(SHT_PREINIT_ARRAY)
+  ECase(SHT_GROUP)
+  ECase(SHT_SYMTAB_SHNDX)
+#undef ECase
+}
+
+void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
+                                                  ELFYAML::ELF_SHF &Value) {
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
+  BCase(SHF_WRITE)
+  BCase(SHF_ALLOC)
+  BCase(SHF_EXCLUDE)
+  BCase(SHF_EXECINSTR)
+  BCase(SHF_MERGE)
+  BCase(SHF_STRINGS)
+  BCase(SHF_INFO_LINK)
+  BCase(SHF_LINK_ORDER)
+  BCase(SHF_OS_NONCONFORMING)
+  BCase(SHF_GROUP)
+  BCase(SHF_TLS)
+#undef BCase
+}
+
+void ScalarEnumerationTraits<ELFYAML::ELF_STT>::enumeration(
+    IO &IO, ELFYAML::ELF_STT &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+  ECase(STT_NOTYPE)
+  ECase(STT_OBJECT)
+  ECase(STT_FUNC)
+  ECase(STT_SECTION)
+  ECase(STT_FILE)
+  ECase(STT_COMMON)
+  ECase(STT_TLS)
+  ECase(STT_GNU_IFUNC)
+#undef ECase
+}
+
+void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO,
+                                                 ELFYAML::FileHeader &FileHdr) {
+  IO.mapRequired("Class", FileHdr.Class);
+  IO.mapRequired("Data", FileHdr.Data);
+  IO.mapOptional("OSABI", FileHdr.OSABI, ELFYAML::ELF_ELFOSABI(0));
+  IO.mapRequired("Type", FileHdr.Type);
+  IO.mapRequired("Machine", FileHdr.Machine);
+  IO.mapOptional("Entry", FileHdr.Entry, Hex64(0));
+}
+
+void MappingTraits<ELFYAML::Symbol>::mapping(IO &IO, ELFYAML::Symbol &Symbol) {
+  IO.mapOptional("Name", Symbol.Name, StringRef());
+  IO.mapOptional("Type", Symbol.Type, ELFYAML::ELF_STT(0));
+  IO.mapOptional("Section", Symbol.Section, StringRef());
+  IO.mapOptional("Value", Symbol.Value, Hex64(0));
+  IO.mapOptional("Size", Symbol.Size, Hex64(0));
+}
+
+void MappingTraits<ELFYAML::LocalGlobalWeakSymbols>::mapping(
+    IO &IO, ELFYAML::LocalGlobalWeakSymbols &Symbols) {
+  IO.mapOptional("Local", Symbols.Local);
+  IO.mapOptional("Global", Symbols.Global);
+  IO.mapOptional("Weak", Symbols.Weak);
+}
+
+void MappingTraits<ELFYAML::Section>::mapping(IO &IO,
+                                              ELFYAML::Section &Section) {
+  IO.mapOptional("Name", Section.Name, StringRef());
+  IO.mapRequired("Type", Section.Type);
+  IO.mapOptional("Flags", Section.Flags, ELFYAML::ELF_SHF(0));
+  IO.mapOptional("Address", Section.Address, Hex64(0));
+  IO.mapOptional("Content", Section.Content);
+  IO.mapOptional("Link", Section.Link);
+  IO.mapOptional("AddressAlign", Section.AddressAlign, Hex64(0));
+}
+
+void MappingTraits<ELFYAML::Object>::mapping(IO &IO, ELFYAML::Object &Object) {
+  IO.mapRequired("FileHeader", Object.Header);
+  IO.mapOptional("Sections", Object.Sections);
+  IO.mapOptional("Symbols", Object.Symbols);
+}
+
+} // end namespace yaml
+} // end namespace llvm
diff --git a/lib/Object/Error.cpp b/lib/Object/Error.cpp
index 2594625..47ce38c 100644
--- a/lib/Object/Error.cpp
+++ b/lib/Object/Error.cpp
@@ -31,18 +31,20 @@ const char *_object_error_category::name() const {
 }
 
 std::string _object_error_category::message(int ev) const {
-  switch (ev) {
+  object_error::Impl E = static_cast<object_error::Impl>(ev);
+  switch (E) {
   case object_error::success: return "Success";
+  case object_error::arch_not_found:
+    return "No object file for requested architecture";
   case object_error::invalid_file_type:
     return "The file was not recognized as a valid object file";
   case object_error::parse_failed:
     return "Invalid data was encountered while parsing the file";
   case object_error::unexpected_eof:
     return "The end of the file was unexpectedly encountered";
-  default:
-    llvm_unreachable("An enumerator of object_error does not have a message "
-                     "defined.");
   }
+  llvm_unreachable("An enumerator of object_error does not have a message "
+                   "defined.");
 }
 
 error_condition _object_error_category::default_error_condition(int ev) const {
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index dfd8d3d..d2cb8bd 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -14,11 +14,11 @@
 
 #include "llvm/Object/MachO.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
 #include <cctype>
 #include <cstring>
 #include <limits>
@@ -29,16 +29,16 @@ using namespace object;
 namespace llvm {
 namespace object {
 
-struct SymbolTableEntryBase {
-  uint32_t StringIndex;
-  uint8_t Type;
-  uint8_t SectionIndex;
-  uint16_t Flags;
+struct nlist_base {
+  uint32_t n_strx;
+  uint8_t n_type;
+  uint8_t n_sect;
+  uint16_t n_desc;
 };
 
-struct SectionBase {
-  char Name[16];
-  char SegmentName[16];
+struct section_base {
+  char sectname[16];
+  char segname[16];
 };
 
 template<typename T>
@@ -50,167 +50,174 @@ template<typename T>
 static void SwapStruct(T &Value);
 
 template<>
-void SwapStruct(macho::RelocationEntry &H) {
-  SwapValue(H.Word0);
-  SwapValue(H.Word1);
+void SwapStruct(MachO::any_relocation_info &H) {
+  SwapValue(H.r_word0);
+  SwapValue(H.r_word1);
 }
 
 template<>
-void SwapStruct(macho::LoadCommand &L) {
-  SwapValue(L.Type);
-  SwapValue(L.Size);
+void SwapStruct(MachO::load_command &L) {
+  SwapValue(L.cmd);
+  SwapValue(L.cmdsize);
 }
 
 template<>
-void SwapStruct(SymbolTableEntryBase &S) {
-  SwapValue(S.StringIndex);
-  SwapValue(S.Flags);
+void SwapStruct(nlist_base &S) {
+  SwapValue(S.n_strx);
+  SwapValue(S.n_desc);
 }
 
 template<>
-void SwapStruct(macho::Section &S) {
-  SwapValue(S.Address);
-  SwapValue(S.Size);
-  SwapValue(S.Offset);
-  SwapValue(S.Align);
-  SwapValue(S.RelocationTableOffset);
-  SwapValue(S.NumRelocationTableEntries);
-  SwapValue(S.Flags);
-  SwapValue(S.Reserved1);
-  SwapValue(S.Reserved2);
+void SwapStruct(MachO::section &S) {
+  SwapValue(S.addr);
+  SwapValue(S.size);
+  SwapValue(S.offset);
+  SwapValue(S.align);
+  SwapValue(S.reloff);
+  SwapValue(S.nreloc);
+  SwapValue(S.flags);
+  SwapValue(S.reserved1);
+  SwapValue(S.reserved2);
 }
 
 template<>
-void SwapStruct(macho::Section64 &S) {
-  SwapValue(S.Address);
-  SwapValue(S.Size);
-  SwapValue(S.Offset);
-  SwapValue(S.Align);
-  SwapValue(S.RelocationTableOffset);
-  SwapValue(S.NumRelocationTableEntries);
-  SwapValue(S.Flags);
-  SwapValue(S.Reserved1);
-  SwapValue(S.Reserved2);
-  SwapValue(S.Reserved3);
+void SwapStruct(MachO::section_64 &S) {
+  SwapValue(S.addr);
+  SwapValue(S.size);
+  SwapValue(S.offset);
+  SwapValue(S.align);
+  SwapValue(S.reloff);
+  SwapValue(S.nreloc);
+  SwapValue(S.flags);
+  SwapValue(S.reserved1);
+  SwapValue(S.reserved2);
+  SwapValue(S.reserved3);
 }
 
 template<>
-void SwapStruct(macho::SymbolTableEntry &S) {
-  SwapValue(S.StringIndex);
-  SwapValue(S.Flags);
-  SwapValue(S.Value);
+void SwapStruct(MachO::nlist &S) {
+  SwapValue(S.n_strx);
+  SwapValue(S.n_desc);
+  SwapValue(S.n_value);
 }
 
 template<>
-void SwapStruct(macho::Symbol64TableEntry &S) {
-  SwapValue(S.StringIndex);
-  SwapValue(S.Flags);
-  SwapValue(S.Value);
+void SwapStruct(MachO::nlist_64 &S) {
+  SwapValue(S.n_strx);
+  SwapValue(S.n_desc);
+  SwapValue(S.n_value);
 }
 
 template<>
-void SwapStruct(macho::Header &H) {
-  SwapValue(H.Magic);
-  SwapValue(H.CPUType);
-  SwapValue(H.CPUSubtype);
-  SwapValue(H.FileType);
-  SwapValue(H.NumLoadCommands);
-  SwapValue(H.SizeOfLoadCommands);
-  SwapValue(H.Flags);
+void SwapStruct(MachO::mach_header &H) {
+  SwapValue(H.magic);
+  SwapValue(H.cputype);
+  SwapValue(H.cpusubtype);
+  SwapValue(H.filetype);
+  SwapValue(H.ncmds);
+  SwapValue(H.sizeofcmds);
+  SwapValue(H.flags);
 }
 
 template<>
-void SwapStruct(macho::Header64Ext &E) {
-  SwapValue(E.Reserved);
+void SwapStruct(MachO::mach_header_64 &H) {
+  SwapValue(H.magic);
+  SwapValue(H.cputype);
+  SwapValue(H.cpusubtype);
+  SwapValue(H.filetype);
+  SwapValue(H.ncmds);
+  SwapValue(H.sizeofcmds);
+  SwapValue(H.flags);
+  SwapValue(H.reserved);
 }
 
 template<>
-void SwapStruct(macho::SymtabLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.SymbolTableOffset);
-  SwapValue(C.NumSymbolTableEntries);
-  SwapValue(C.StringTableOffset);
-  SwapValue(C.StringTableSize);
+void SwapStruct(MachO::symtab_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.symoff);
+  SwapValue(C.nsyms);
+  SwapValue(C.stroff);
+  SwapValue(C.strsize);
 }
 
 template<>
-void SwapStruct(macho::DysymtabLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.LocalSymbolsIndex);
-  SwapValue(C.NumLocalSymbols);
-  SwapValue(C.ExternalSymbolsIndex);
-  SwapValue(C.NumExternalSymbols);
-  SwapValue(C.UndefinedSymbolsIndex);
-  SwapValue(C.NumUndefinedSymbols);
-  SwapValue(C.TOCOffset);
-  SwapValue(C.NumTOCEntries);
-  SwapValue(C.ModuleTableOffset);
-  SwapValue(C.NumModuleTableEntries);
-  SwapValue(C.ReferenceSymbolTableOffset);
-  SwapValue(C.NumReferencedSymbolTableEntries);
-  SwapValue(C.IndirectSymbolTableOffset);
-  SwapValue(C.NumIndirectSymbolTableEntries);
-  SwapValue(C.ExternalRelocationTableOffset);
-  SwapValue(C.NumExternalRelocationTableEntries);
-  SwapValue(C.LocalRelocationTableOffset);
-  SwapValue(C.NumLocalRelocationTableEntries);
+void SwapStruct(MachO::dysymtab_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.ilocalsym);
+  SwapValue(C.nlocalsym);
+  SwapValue(C.iextdefsym);
+  SwapValue(C.nextdefsym);
+  SwapValue(C.iundefsym);
+  SwapValue(C.nundefsym);
+  SwapValue(C.tocoff);
+  SwapValue(C.ntoc);
+  SwapValue(C.modtaboff);
+  SwapValue(C.nmodtab);
+  SwapValue(C.extrefsymoff);
+  SwapValue(C.nextrefsyms);
+  SwapValue(C.indirectsymoff);
+  SwapValue(C.nindirectsyms);
+  SwapValue(C.extreloff);
+  SwapValue(C.nextrel);
+  SwapValue(C.locreloff);
+  SwapValue(C.nlocrel);
 }
 
 template<>
-void SwapStruct(macho::LinkeditDataLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.DataOffset);
-  SwapValue(C.DataSize);
+void SwapStruct(MachO::linkedit_data_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.dataoff);
+  SwapValue(C.datasize);
 }
 
 template<>
-void SwapStruct(macho::SegmentLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.VMAddress);
-  SwapValue(C.VMSize);
-  SwapValue(C.FileOffset);
-  SwapValue(C.FileSize);
-  SwapValue(C.MaxVMProtection);
-  SwapValue(C.InitialVMProtection);
-  SwapValue(C.NumSections);
-  SwapValue(C.Flags);
+void SwapStruct(MachO::segment_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.vmaddr);
+  SwapValue(C.vmsize);
+  SwapValue(C.fileoff);
+  SwapValue(C.filesize);
+  SwapValue(C.maxprot);
+  SwapValue(C.initprot);
+  SwapValue(C.nsects);
+  SwapValue(C.flags);
 }
 
 template<>
-void SwapStruct(macho::Segment64LoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.VMAddress);
-  SwapValue(C.VMSize);
-  SwapValue(C.FileOffset);
-  SwapValue(C.FileSize);
-  SwapValue(C.MaxVMProtection);
-  SwapValue(C.InitialVMProtection);
-  SwapValue(C.NumSections);
-  SwapValue(C.Flags);
+void SwapStruct(MachO::segment_command_64 &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.vmaddr);
+  SwapValue(C.vmsize);
+  SwapValue(C.fileoff);
+  SwapValue(C.filesize);
+  SwapValue(C.maxprot);
+  SwapValue(C.initprot);
+  SwapValue(C.nsects);
+  SwapValue(C.flags);
 }
 
 template<>
-void SwapStruct(macho::IndirectSymbolTableEntry &C) {
-  SwapValue(C.Index);
+void SwapStruct(uint32_t &C) {
+  SwapValue(C);
 }
 
 template<>
-void SwapStruct(macho::LinkerOptionsLoadCommand &C) {
-  SwapValue(C.Type);
-  SwapValue(C.Size);
-  SwapValue(C.Count);
+void SwapStruct(MachO::linker_options_command &C) {
+  SwapValue(C.cmd);
+  SwapValue(C.cmdsize);
+  SwapValue(C.count);
 }
 
 template<>
-void SwapStruct(macho::DataInCodeTableEntry &C) {
-  SwapValue(C.Offset);
-  SwapValue(C.Length);
-  SwapValue(C.Kind);
+void SwapStruct(MachO::data_in_code_entry &C) {
+  SwapValue(C.offset);
+  SwapValue(C.length);
+  SwapValue(C.kind);
 }
 
 template<typename T>
@@ -226,11 +233,11 @@ static uint32_t
 getSegmentLoadCommandNumSections(const MachOObjectFile *O,
                                  const MachOObjectFile::LoadCommandInfo &L) {
   if (O->is64Bit()) {
-    macho::Segment64LoadCommand S = O->getSegment64LoadCommand(L);
-    return S.NumSections;
+    MachO::segment_command_64 S = O->getSegment64LoadCommand(L);
+    return S.nsects;
   }
-  macho::SegmentLoadCommand S = O->getSegmentLoadCommand(L);
-  return S.NumSections;
+  MachO::segment_command S = O->getSegmentLoadCommand(L);
+  return S.nsects;
 }
 
 static const char *
@@ -239,10 +246,10 @@ getSectionPtr(const MachOObjectFile *O, MachOObjectFile::LoadCommandInfo L,
   uintptr_t CommandAddr = reinterpret_cast<uintptr_t>(L.Ptr);
 
   bool Is64 = O->is64Bit();
-  unsigned SegmentLoadSize = Is64 ? sizeof(macho::Segment64LoadCommand) :
-                                    sizeof(macho::SegmentLoadCommand);
-  unsigned SectionSize = Is64 ? sizeof(macho::Section64) :
-                                sizeof(macho::Section);
+  unsigned SegmentLoadSize = Is64 ? sizeof(MachO::segment_command_64) :
+                                    sizeof(MachO::segment_command);
+  unsigned SectionSize = Is64 ? sizeof(MachO::section_64) :
+                                sizeof(MachO::section);
 
   uintptr_t SectionAddr = CommandAddr + SegmentLoadSize + Sec * SectionSize;
   return reinterpret_cast<const char*>(SectionAddr);
@@ -252,10 +259,10 @@ static const char *getPtr(const MachOObjectFile *O, size_t Offset) {
   return O->getData().substr(Offset, 1).data();
 }
 
-static SymbolTableEntryBase
+static nlist_base
 getSymbolTableEntryBase(const MachOObjectFile *O, DataRefImpl DRI) {
   const char *P = reinterpret_cast<const char *>(DRI.p);
-  return getStruct<SymbolTableEntryBase>(O, P);
+  return getStruct<nlist_base>(O, P);
 }
 
 static StringRef parseSegmentOrSectionName(const char *P) {
@@ -283,11 +290,11 @@ static void advanceTo(T &it, size_t Val) {
 }
 
 static unsigned getCPUType(const MachOObjectFile *O) {
-  return O->getHeader().CPUType;
+  return O->getHeader().cputype;
 }
 
 static void printRelocationTargetName(const MachOObjectFile *O,
-                                      const macho::RelocationEntry &RE,
+                                      const MachO::any_relocation_info &RE,
                                       raw_string_ostream &fmt) {
   bool IsScattered = O->isRelocationScattered(RE);
 
@@ -339,7 +346,7 @@ static void printRelocationTargetName(const MachOObjectFile *O,
 
   StringRef S;
   bool isExtern = O->getPlainRelocationExternal(RE);
-  uint64_t Val = O->getAnyRelocationAddress(RE);
+  uint64_t Val = O->getPlainRelocationSymbolNum(RE);
 
   if (isExtern) {
     symbol_iterator SI = O->begin_symbols();
@@ -347,86 +354,92 @@ static void printRelocationTargetName(const MachOObjectFile *O,
     SI->getName(S);
   } else {
     section_iterator SI = O->begin_sections();
-    advanceTo(SI, Val);
+    // Adjust for the fact that sections are 1-indexed.
+    advanceTo(SI, Val - 1);
     SI->getName(S);
   }
 
   fmt << S;
 }
 
-static uint32_t getPlainRelocationAddress(const macho::RelocationEntry &RE) {
-  return RE.Word0;
+static uint32_t
+getPlainRelocationAddress(const MachO::any_relocation_info &RE) {
+  return RE.r_word0;
 }
 
 static unsigned
-getScatteredRelocationAddress(const macho::RelocationEntry &RE) {
-  return RE.Word0 & 0xffffff;
+getScatteredRelocationAddress(const MachO::any_relocation_info &RE) {
+  return RE.r_word0 & 0xffffff;
 }
 
 static bool getPlainRelocationPCRel(const MachOObjectFile *O,
-                                    const macho::RelocationEntry &RE) {
+                                    const MachO::any_relocation_info &RE) {
   if (O->isLittleEndian())
-    return (RE.Word1 >> 24) & 1;
-  return (RE.Word1 >> 7) & 1;
+    return (RE.r_word1 >> 24) & 1;
+  return (RE.r_word1 >> 7) & 1;
 }
 
 static bool
 getScatteredRelocationPCRel(const MachOObjectFile *O,
-                            const macho::RelocationEntry &RE) {
-  return (RE.Word0 >> 30) & 1;
+                            const MachO::any_relocation_info &RE) {
+  return (RE.r_word0 >> 30) & 1;
 }
 
 static unsigned getPlainRelocationLength(const MachOObjectFile *O,
-                                         const macho::RelocationEntry &RE) {
+                                         const MachO::any_relocation_info &RE) {
   if (O->isLittleEndian())
-    return (RE.Word1 >> 25) & 3;
-  return (RE.Word1 >> 5) & 3;
+    return (RE.r_word1 >> 25) & 3;
+  return (RE.r_word1 >> 5) & 3;
 }
 
 static unsigned
-getScatteredRelocationLength(const macho::RelocationEntry &RE) {
-  return (RE.Word0 >> 28) & 3;
+getScatteredRelocationLength(const MachO::any_relocation_info &RE) {
+  return (RE.r_word0 >> 28) & 3;
 }
 
 static unsigned getPlainRelocationType(const MachOObjectFile *O,
-                                       const macho::RelocationEntry &RE) {
+                                       const MachO::any_relocation_info &RE) {
   if (O->isLittleEndian())
-    return RE.Word1 >> 28;
-  return RE.Word1 & 0xf;
+    return RE.r_word1 >> 28;
+  return RE.r_word1 & 0xf;
 }
 
-static unsigned getScatteredRelocationType(const macho::RelocationEntry &RE) {
-  return (RE.Word0 >> 24) & 0xf;
+static unsigned
+getScatteredRelocationType(const MachO::any_relocation_info &RE) {
+  return (RE.r_word0 >> 24) & 0xf;
 }
 
 static uint32_t getSectionFlags(const MachOObjectFile *O,
                                 DataRefImpl Sec) {
   if (O->is64Bit()) {
-    macho::Section64 Sect = O->getSection64(Sec);
-    return Sect.Flags;
+    MachO::section_64 Sect = O->getSection64(Sec);
+    return Sect.flags;
   }
-  macho::Section Sect = O->getSection(Sec);
-  return Sect.Flags;
+  MachO::section Sect = O->getSection(Sec);
+  return Sect.flags;
 }
 
 MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
                                  bool IsLittleEndian, bool Is64bits,
                                  error_code &ec)
     : ObjectFile(getMachOType(IsLittleEndian, Is64bits), Object),
-      SymtabLoadCmd(NULL), DysymtabLoadCmd(NULL) {
-  uint32_t LoadCommandCount = this->getHeader().NumLoadCommands;
-  macho::LoadCommandType SegmentLoadType = is64Bit() ?
-    macho::LCT_Segment64 : macho::LCT_Segment;
+      SymtabLoadCmd(NULL), DysymtabLoadCmd(NULL), DataInCodeLoadCmd(NULL) {
+  uint32_t LoadCommandCount = this->getHeader().ncmds;
+  MachO::LoadCommandType SegmentLoadType = is64Bit() ?
+    MachO::LC_SEGMENT_64 : MachO::LC_SEGMENT;
 
   MachOObjectFile::LoadCommandInfo Load = getFirstLoadCommandInfo();
   for (unsigned I = 0; ; ++I) {
-    if (Load.C.Type == macho::LCT_Symtab) {
+    if (Load.C.cmd == MachO::LC_SYMTAB) {
       assert(!SymtabLoadCmd && "Multiple symbol tables");
       SymtabLoadCmd = Load.Ptr;
-    } else if (Load.C.Type == macho::LCT_Dysymtab) {
+    } else if (Load.C.cmd == MachO::LC_DYSYMTAB) {
       assert(!DysymtabLoadCmd && "Multiple dynamic symbol tables");
       DysymtabLoadCmd = Load.Ptr;
-    } else if (Load.C.Type == SegmentLoadType) {
+    } else if (Load.C.cmd == MachO::LC_DATA_IN_CODE) {
+      assert(!DataInCodeLoadCmd && "Multiple data in code tables");
+      DataInCodeLoadCmd = Load.Ptr;
+    } else if (Load.C.cmd == SegmentLoadType) {
       uint32_t NumSections = getSegmentLoadCommandNumSections(this, Load);
       for (unsigned J = 0; J < NumSections; ++J) {
         const char *Sec = getSectionPtr(this, Load, J);
@@ -444,8 +457,8 @@ MachOObjectFile::MachOObjectFile(MemoryBuffer *Object,
 error_code MachOObjectFile::getSymbolNext(DataRefImpl Symb,
                                           SymbolRef &Res) const {
   unsigned SymbolTableEntrySize = is64Bit() ?
-    sizeof(macho::Symbol64TableEntry) :
-    sizeof(macho::SymbolTableEntry);
+    sizeof(MachO::nlist_64) :
+    sizeof(MachO::nlist);
   Symb.p += SymbolTableEntrySize;
   Res = SymbolRef(Symb, this);
   return object_error::success;
@@ -454,8 +467,8 @@ error_code MachOObjectFile::getSymbolNext(DataRefImpl Symb,
 error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
                                           StringRef &Res) const {
   StringRef StringTable = getStringTableData();
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  const char *Start = &StringTable.data()[Entry.StringIndex];
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  const char *Start = &StringTable.data()[Entry.n_strx];
   Res = StringRef(Start);
   return object_error::success;
 }
@@ -463,11 +476,11 @@ error_code MachOObjectFile::getSymbolName(DataRefImpl Symb,
 error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
                                              uint64_t &Res) const {
   if (is64Bit()) {
-    macho::Symbol64TableEntry Entry = getSymbol64TableEntry(Symb);
-    Res = Entry.Value;
+    MachO::nlist_64 Entry = getSymbol64TableEntry(Symb);
+    Res = Entry.n_value;
   } else {
-    macho::SymbolTableEntry Entry = getSymbolTableEntry(Symb);
-    Res = Entry.Value;
+    MachO::nlist Entry = getSymbolTableEntry(Symb);
+    Res = Entry.n_value;
   }
   return object_error::success;
 }
@@ -475,18 +488,18 @@ error_code MachOObjectFile::getSymbolAddress(DataRefImpl Symb,
 error_code
 MachOObjectFile::getSymbolFileOffset(DataRefImpl Symb,
                                      uint64_t &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
   getSymbolAddress(Symb, Res);
-  if (Entry.SectionIndex) {
+  if (Entry.n_sect) {
     uint64_t Delta;
     DataRefImpl SecRel;
-    SecRel.d.a = Entry.SectionIndex-1;
+    SecRel.d.a = Entry.n_sect-1;
     if (is64Bit()) {
-      macho::Section64 Sec = getSection64(SecRel);
-      Delta = Sec.Offset - Sec.Address;
+      MachO::section_64 Sec = getSection64(SecRel);
+      Delta = Sec.offset - Sec.addr;
     } else {
-      macho::Section Sec = getSection(SecRel);
-      Delta = Sec.Offset - Sec.Address;
+      MachO::section Sec = getSection(SecRel);
+      Delta = Sec.offset - Sec.addr;
     }
 
     Res += Delta;
@@ -500,8 +513,8 @@ error_code MachOObjectFile::getSymbolAlignment(DataRefImpl DRI,
   uint32_t flags;
   this->getSymbolFlags(DRI, flags);
   if (flags & SymbolRef::SF_Common) {
-    SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
-    Result = 1 << MachO::GET_COMM_ALIGN(Entry.Flags);
+    nlist_base Entry = getSymbolTableEntryBase(this, DRI);
+    Result = 1 << MachO::GET_COMM_ALIGN(Entry.n_desc);
   } else {
     Result = 0;
   }
@@ -514,13 +527,13 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
   uint64_t EndOffset = 0;
   uint8_t SectionIndex;
 
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
+  nlist_base Entry = getSymbolTableEntryBase(this, DRI);
   uint64_t Value;
   getSymbolAddress(DRI, Value);
 
   BeginOffset = Value;
 
-  SectionIndex = Entry.SectionIndex;
+  SectionIndex = Entry.n_sect;
   if (!SectionIndex) {
     uint32_t flags = SymbolRef::SF_None;
     this->getSymbolFlags(DRI, flags);
@@ -538,7 +551,7 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
     DataRefImpl DRI = I->getRawDataRefImpl();
     Entry = getSymbolTableEntryBase(this, DRI);
     getSymbolAddress(DRI, Value);
-    if (Entry.SectionIndex == SectionIndex && Value > BeginOffset)
+    if (Entry.n_sect == SectionIndex && Value > BeginOffset)
       if (!EndOffset || Value < EndOffset)
         EndOffset = Value;
   }
@@ -556,73 +569,47 @@ error_code MachOObjectFile::getSymbolSize(DataRefImpl DRI,
 
 error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
                                           SymbolRef::Type &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  uint8_t n_type = Entry.Type;
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  uint8_t n_type = Entry.n_type;
 
   Res = SymbolRef::ST_Other;
 
   // If this is a STAB debugging symbol, we can do nothing more.
-  if (n_type & MachO::NlistMaskStab) {
+  if (n_type & MachO::N_STAB) {
     Res = SymbolRef::ST_Debug;
     return object_error::success;
   }
 
-  switch (n_type & MachO::NlistMaskType) {
-    case MachO::NListTypeUndefined :
+  switch (n_type & MachO::N_TYPE) {
+    case MachO::N_UNDF :
       Res = SymbolRef::ST_Unknown;
       break;
-    case MachO::NListTypeSection :
+    case MachO::N_SECT :
       Res = SymbolRef::ST_Function;
       break;
   }
   return object_error::success;
 }
 
-error_code MachOObjectFile::getSymbolNMTypeChar(DataRefImpl Symb,
-                                                char &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  uint8_t Type = Entry.Type;
-  uint16_t Flags = Entry.Flags;
-
-  char Char;
-  switch (Type & macho::STF_TypeMask) {
-    case macho::STT_Undefined:
-      Char = 'u';
-      break;
-    case macho::STT_Absolute:
-    case macho::STT_Section:
-      Char = 's';
-      break;
-    default:
-      Char = '?';
-      break;
-  }
-
-  if (Flags & (macho::STF_External | macho::STF_PrivateExtern))
-    Char = toupper(static_cast<unsigned char>(Char));
-  Res = Char;
-  return object_error::success;
-}
-
 error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
                                            uint32_t &Result) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, DRI);
+  nlist_base Entry = getSymbolTableEntryBase(this, DRI);
 
-  uint8_t MachOType = Entry.Type;
-  uint16_t MachOFlags = Entry.Flags;
+  uint8_t MachOType = Entry.n_type;
+  uint16_t MachOFlags = Entry.n_desc;
 
   // TODO: Correctly set SF_ThreadLocal
   Result = SymbolRef::SF_None;
 
-  if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined)
+  if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF)
     Result |= SymbolRef::SF_Undefined;
 
-  if (MachOFlags & macho::STF_StabsEntryMask)
+  if (MachOType & MachO::N_STAB)
     Result |= SymbolRef::SF_FormatSpecific;
 
-  if (MachOType & MachO::NlistMaskExternal) {
+  if (MachOType & MachO::N_EXT) {
     Result |= SymbolRef::SF_Global;
-    if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeUndefined) {
+    if ((MachOType & MachO::N_TYPE) == MachO::N_UNDF) {
       uint64_t Value;
       getSymbolAddress(DRI, Value);
       if (Value)
@@ -630,10 +617,10 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
     }
   }
 
-  if (MachOFlags & (MachO::NListDescWeakRef | MachO::NListDescWeakDef))
+  if (MachOFlags & (MachO::N_WEAK_REF | MachO::N_WEAK_DEF))
     Result |= SymbolRef::SF_Weak;
 
-  if ((MachOType & MachO::NlistMaskType) == MachO::NListTypeAbsolute)
+  if ((MachOType & MachO::N_TYPE) == MachO::N_ABS)
     Result |= SymbolRef::SF_Absolute;
 
   return object_error::success;
@@ -642,8 +629,8 @@ error_code MachOObjectFile::getSymbolFlags(DataRefImpl DRI,
 error_code
 MachOObjectFile::getSymbolSection(DataRefImpl Symb,
                                   section_iterator &Res) const {
-  SymbolTableEntryBase Entry = getSymbolTableEntryBase(this, Symb);
-  uint8_t index = Entry.SectionIndex;
+  nlist_base Entry = getSymbolTableEntryBase(this, Symb);
+  uint8_t index = Entry.n_sect;
 
   if (index == 0) {
     Res = end_sections();
@@ -678,11 +665,11 @@ MachOObjectFile::getSectionName(DataRefImpl Sec, StringRef &Result) const {
 error_code
 MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Res = Sect.Address;
+    MachO::section_64 Sect = getSection64(Sec);
+    Res = Sect.addr;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Res = Sect.Address;
+    MachO::section Sect = getSection(Sec);
+    Res = Sect.addr;
   }
   return object_error::success;
 }
@@ -690,11 +677,11 @@ MachOObjectFile::getSectionAddress(DataRefImpl Sec, uint64_t &Res) const {
 error_code
 MachOObjectFile::getSectionSize(DataRefImpl Sec, uint64_t &Res) const {
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Res = Sect.Size;
+    MachO::section_64 Sect = getSection64(Sec);
+    Res = Sect.size;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Res = Sect.Size;
+    MachO::section Sect = getSection(Sec);
+    Res = Sect.size;
   }
 
   return object_error::success;
@@ -706,13 +693,13 @@ MachOObjectFile::getSectionContents(DataRefImpl Sec, StringRef &Res) const {
   uint64_t Size;
 
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Offset = Sect.Offset;
-    Size = Sect.Size;
+    MachO::section_64 Sect = getSection64(Sec);
+    Offset = Sect.offset;
+    Size = Sect.size;
   } else {
-    macho::Section Sect =getSection(Sec);
-    Offset = Sect.Offset;
-    Size = Sect.Size;
+    MachO::section Sect = getSection(Sec);
+    Offset = Sect.offset;
+    Size = Sect.size;
   }
 
   Res = this->getData().substr(Offset, Size);
@@ -723,11 +710,11 @@ error_code
 MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
   uint32_t Align;
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Align = Sect.Align;
+    MachO::section_64 Sect = getSection64(Sec);
+    Align = Sect.align;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Align = Sect.Align;
+    MachO::section Sect = getSection(Sec);
+    Align = Sect.align;
   }
 
   Res = uint64_t(1) << Align;
@@ -737,7 +724,7 @@ MachOObjectFile::getSectionAlignment(DataRefImpl Sec, uint64_t &Res) const {
 error_code
 MachOObjectFile::isSectionText(DataRefImpl Sec, bool &Res) const {
   uint32_t Flags = getSectionFlags(this, Sec);
-  Res = Flags & macho::SF_PureInstructions;
+  Res = Flags & MachO::S_ATTR_PURE_INSTRUCTIONS;
   return object_error::success;
 }
 
@@ -771,9 +758,9 @@ error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
 error_code
 MachOObjectFile::isSectionZeroInit(DataRefImpl Sec, bool &Res) const {
   uint32_t Flags = getSectionFlags(this, Sec);
-  unsigned SectionType = Flags & MachO::SectionFlagMaskSectionType;
-  Res = SectionType == MachO::SectionTypeZeroFill ||
-    SectionType == MachO::SectionTypeZeroFillLarge;
+  unsigned SectionType = Flags & MachO::SECTION_TYPE;
+  Res = SectionType == MachO::S_ZEROFILL ||
+    SectionType == MachO::S_GB_ZEROFILL;
   return object_error::success;
 }
 
@@ -810,14 +797,14 @@ MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
   return object_error::success;
 }
 
-relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
+relocation_iterator MachOObjectFile::section_rel_begin(DataRefImpl Sec) const {
   uint32_t Offset;
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Offset = Sect.RelocationTableOffset;
+    MachO::section_64 Sect = getSection64(Sec);
+    Offset = Sect.reloff;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Offset = Sect.RelocationTableOffset;
+    MachO::section Sect = getSection(Sec);
+    Offset = Sect.reloff;
   }
 
   DataRefImpl Ret;
@@ -826,21 +813,21 @@ relocation_iterator MachOObjectFile::getSectionRelBegin(DataRefImpl Sec) const {
 }
 
 relocation_iterator
-MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
+MachOObjectFile::section_rel_end(DataRefImpl Sec) const {
   uint32_t Offset;
   uint32_t Num;
   if (is64Bit()) {
-    macho::Section64 Sect = getSection64(Sec);
-    Offset = Sect.RelocationTableOffset;
-    Num = Sect.NumRelocationTableEntries;
+    MachO::section_64 Sect = getSection64(Sec);
+    Offset = Sect.reloff;
+    Num = Sect.nreloc;
   } else {
-    macho::Section Sect = getSection(Sec);
-    Offset = Sect.RelocationTableOffset;
-    Num = Sect.NumRelocationTableEntries;
+    MachO::section Sect = getSection(Sec);
+    Offset = Sect.reloff;
+    Num = Sect.nreloc;
   }
 
-  const macho::RelocationEntry *P =
-    reinterpret_cast<const macho::RelocationEntry*>(getPtr(this, Offset));
+  const MachO::any_relocation_info *P =
+    reinterpret_cast<const MachO::any_relocation_info *>(getPtr(this, Offset));
 
   DataRefImpl Ret;
   Ret.p = reinterpret_cast<uintptr_t>(P + Num);
@@ -849,8 +836,8 @@ MachOObjectFile::getSectionRelEnd(DataRefImpl Sec) const {
 
 error_code MachOObjectFile::getRelocationNext(DataRefImpl Rel,
                                               RelocationRef &Res) const {
-  const macho::RelocationEntry *P =
-    reinterpret_cast<const macho::RelocationEntry *>(Rel.p);
+  const MachO::any_relocation_info *P =
+    reinterpret_cast<const MachO::any_relocation_info *>(Rel.p);
   Rel.p = reinterpret_cast<uintptr_t>(P + 1);
   Res = RelocationRef(Rel, this);
   return object_error::success;
@@ -863,35 +850,32 @@ MachOObjectFile::getRelocationAddress(DataRefImpl Rel, uint64_t &Res) const {
 
 error_code MachOObjectFile::getRelocationOffset(DataRefImpl Rel,
                                                 uint64_t &Res) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+  MachO::any_relocation_info RE = getRelocation(Rel);
   Res = getAnyRelocationAddress(RE);
   return object_error::success;
 }
 
-error_code
-MachOObjectFile::getRelocationSymbol(DataRefImpl Rel, SymbolRef &Res) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+symbol_iterator
+MachOObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
+  MachO::any_relocation_info RE = getRelocation(Rel);
   uint32_t SymbolIdx = getPlainRelocationSymbolNum(RE);
   bool isExtern = getPlainRelocationExternal(RE);
-  if (!isExtern) {
-    Res = *end_symbols();
-    return object_error::success;
-  }
+  if (!isExtern)
+    return end_symbols();
 
-  macho::SymtabLoadCommand S = getSymtabLoadCommand();
+  MachO::symtab_command S = getSymtabLoadCommand();
   unsigned SymbolTableEntrySize = is64Bit() ?
-    sizeof(macho::Symbol64TableEntry) :
-    sizeof(macho::SymbolTableEntry);
-  uint64_t Offset = S.SymbolTableOffset + SymbolIdx * SymbolTableEntrySize;
+    sizeof(MachO::nlist_64) :
+    sizeof(MachO::nlist);
+  uint64_t Offset = S.symoff + SymbolIdx * SymbolTableEntrySize;
   DataRefImpl Sym;
   Sym.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
-  Res = SymbolRef(Sym, this);
-  return object_error::success;
+  return symbol_iterator(SymbolRef(Sym, this));
 }
 
 error_code MachOObjectFile::getRelocationType(DataRefImpl Rel,
                                               uint64_t &Res) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+  MachO::any_relocation_info RE = getRelocation(Rel);
   Res = getAnyRelocationType(RE);
   return object_error::success;
 }
@@ -989,16 +973,10 @@ MachOObjectFile::getRelocationTypeName(DataRefImpl Rel,
   return object_error::success;
 }
 
-error_code MachOObjectFile::getRelocationAdditionalInfo(DataRefImpl Rel,
-                                                        int64_t &Res) const {
-  Res = 0;
-  return object_error::success;
-}
-
 error_code
 MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
                                           SmallVectorImpl<char> &Result) const {
-  macho::RelocationEntry RE = getRelocation(Rel);
+  MachO::any_relocation_info RE = getRelocation(Rel);
 
   unsigned Arch = this->getArch();
 
@@ -1015,47 +993,47 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
     bool isPCRel = getAnyRelocationPCRel(RE);
 
     switch (Type) {
-      case macho::RIT_X86_64_GOTLoad:   // X86_64_RELOC_GOT_LOAD
-      case macho::RIT_X86_64_GOT: {     // X86_64_RELOC_GOT
+      case MachO::X86_64_RELOC_GOT_LOAD:
+      case MachO::X86_64_RELOC_GOT: {
         printRelocationTargetName(this, RE, fmt);
         fmt << "@GOT";
         if (isPCRel) fmt << "PCREL";
         break;
       }
-      case macho::RIT_X86_64_Subtractor: { // X86_64_RELOC_SUBTRACTOR
+      case MachO::X86_64_RELOC_SUBTRACTOR: {
         DataRefImpl RelNext = Rel;
         RelNext.d.a++;
-        macho::RelocationEntry RENext = getRelocation(RelNext);
+        MachO::any_relocation_info RENext = getRelocation(RelNext);
 
-        // X86_64_SUBTRACTOR must be followed by a relocation of type
+        // X86_64_RELOC_SUBTRACTOR must be followed by a relocation of type
         // X86_64_RELOC_UNSIGNED.
         // NOTE: Scattered relocations don't exist on x86_64.
         unsigned RType = getAnyRelocationType(RENext);
-        if (RType != 0)
+        if (RType != MachO::X86_64_RELOC_UNSIGNED)
           report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
                              "X86_64_RELOC_SUBTRACTOR.");
 
-        // The X86_64_RELOC_UNSIGNED contains the minuend symbol,
-        // X86_64_SUBTRACTOR contains to the subtrahend.
+        // The X86_64_RELOC_UNSIGNED contains the minuend symbol;
+        // X86_64_RELOC_SUBTRACTOR contains the subtrahend.
         printRelocationTargetName(this, RENext, fmt);
         fmt << "-";
         printRelocationTargetName(this, RE, fmt);
         break;
       }
-      case macho::RIT_X86_64_TLV:
+      case MachO::X86_64_RELOC_TLV:
         printRelocationTargetName(this, RE, fmt);
         fmt << "@TLV";
         if (isPCRel) fmt << "P";
         break;
-      case macho::RIT_X86_64_Signed1: // X86_64_RELOC_SIGNED1
+      case MachO::X86_64_RELOC_SIGNED_1:
         printRelocationTargetName(this, RE, fmt);
         fmt << "-1";
         break;
-      case macho::RIT_X86_64_Signed2: // X86_64_RELOC_SIGNED2
+      case MachO::X86_64_RELOC_SIGNED_2:
         printRelocationTargetName(this, RE, fmt);
         fmt << "-2";
         break;
-      case macho::RIT_X86_64_Signed4: // X86_64_RELOC_SIGNED4
+      case MachO::X86_64_RELOC_SIGNED_4:
         printRelocationTargetName(this, RE, fmt);
         fmt << "-4";
         break;
@@ -1064,21 +1042,22 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
         break;
     }
   // X86 and ARM share some relocation types in common.
-  } else if (Arch == Triple::x86 || Arch == Triple::arm) {
+  } else if (Arch == Triple::x86 || Arch == Triple::arm ||
+             Arch == Triple::ppc) {
     // Generic relocation types...
     switch (Type) {
-      case macho::RIT_Pair: // GENERIC_RELOC_PAIR - prints no info
+      case MachO::GENERIC_RELOC_PAIR: // prints no info
         return object_error::success;
-      case macho::RIT_Difference: { // GENERIC_RELOC_SECTDIFF
+      case MachO::GENERIC_RELOC_SECTDIFF: {
         DataRefImpl RelNext = Rel;
         RelNext.d.a++;
-        macho::RelocationEntry RENext = getRelocation(RelNext);
+        MachO::any_relocation_info RENext = getRelocation(RelNext);
 
         // X86 sect diff's must be followed by a relocation of type
         // GENERIC_RELOC_PAIR.
         unsigned RType = getAnyRelocationType(RENext);
 
-        if (RType != 1)
+        if (RType != MachO::GENERIC_RELOC_PAIR)
           report_fatal_error("Expected GENERIC_RELOC_PAIR after "
                              "GENERIC_RELOC_SECTDIFF.");
 
@@ -1089,19 +1068,17 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
       }
     }
 
-    if (Arch == Triple::x86) {
-      // All X86 relocations that need special printing were already
-      // handled in the generic code.
+    if (Arch == Triple::x86 || Arch == Triple::ppc) {
       switch (Type) {
-        case macho::RIT_Generic_LocalDifference:{// GENERIC_RELOC_LOCAL_SECTDIFF
+        case MachO::GENERIC_RELOC_LOCAL_SECTDIFF: {
           DataRefImpl RelNext = Rel;
           RelNext.d.a++;
-          macho::RelocationEntry RENext = getRelocation(RelNext);
+          MachO::any_relocation_info RENext = getRelocation(RelNext);
 
           // X86 sect diff's must be followed by a relocation of type
           // GENERIC_RELOC_PAIR.
           unsigned RType = getAnyRelocationType(RENext);
-          if (RType != 1)
+          if (RType != MachO::GENERIC_RELOC_PAIR)
             report_fatal_error("Expected GENERIC_RELOC_PAIR after "
                                "GENERIC_RELOC_LOCAL_SECTDIFF.");
 
@@ -1110,7 +1087,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
           printRelocationTargetName(this, RENext, fmt);
           break;
         }
-        case macho::RIT_Generic_TLV: {
+        case MachO::GENERIC_RELOC_TLV: {
           printRelocationTargetName(this, RE, fmt);
           fmt << "@TLV";
           if (IsPCRel) fmt << "P";
@@ -1121,8 +1098,8 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
       }
     } else { // ARM-specific relocations
       switch (Type) {
-        case macho::RIT_ARM_Half:             // ARM_RELOC_HALF
-        case macho::RIT_ARM_HalfDifference: { // ARM_RELOC_HALF_SECTDIFF
+        case MachO::ARM_RELOC_HALF:
+        case MachO::ARM_RELOC_HALF_SECTDIFF: {
           // Half relocations steal a bit from the length field to encode
           // whether this is an upper16 or a lower16 relocation.
           bool isUpper = getAnyRelocationLength(RE) >> 1;
@@ -1135,14 +1112,14 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
 
           DataRefImpl RelNext = Rel;
           RelNext.d.a++;
-          macho::RelocationEntry RENext = getRelocation(RelNext);
+          MachO::any_relocation_info RENext = getRelocation(RelNext);
 
           // ARM half relocs must be followed by a relocation of type
           // ARM_RELOC_PAIR.
           unsigned RType = getAnyRelocationType(RENext);
-          if (RType != 1)
+          if (RType != MachO::ARM_RELOC_PAIR)
             report_fatal_error("Expected ARM_RELOC_PAIR after "
-                               "GENERIC_RELOC_HALF");
+                               "ARM_RELOC_HALF");
 
           // NOTE: The half of the target virtual address is stashed in the
           // address field of the secondary relocation, but we can't reverse
@@ -1151,7 +1128,7 @@ MachOObjectFile::getRelocationValueString(DataRefImpl Rel,
 
           // ARM_RELOC_HALF_SECTDIFF encodes the second section in the
           // symbol/section pointer of the follow-on relocation.
-          if (Type == macho::RIT_ARM_HalfDifference) {
+          if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
             fmt << "-";
             printRelocationTargetName(this, RENext, fmt);
           }
@@ -1182,17 +1159,17 @@ MachOObjectFile::getRelocationHidden(DataRefImpl Rel, bool &Result) const {
 
   // On arches that use the generic relocations, GENERIC_RELOC_PAIR
   // is always hidden.
-  if (Arch == Triple::x86 || Arch == Triple::arm) {
-    if (Type == macho::RIT_Pair) Result = true;
+  if (Arch == Triple::x86 || Arch == Triple::arm || Arch == Triple::ppc) {
+    if (Type == MachO::GENERIC_RELOC_PAIR) Result = true;
   } else if (Arch == Triple::x86_64) {
     // On x86_64, X86_64_RELOC_UNSIGNED is hidden only when it follows
-    // an X864_64_RELOC_SUBTRACTOR.
-    if (Type == macho::RIT_X86_64_Unsigned && Rel.d.a > 0) {
+    // an X86_64_RELOC_SUBTRACTOR.
+    if (Type == MachO::X86_64_RELOC_UNSIGNED && Rel.d.a > 0) {
       DataRefImpl RelPrev = Rel;
       RelPrev.d.a--;
       uint64_t PrevType;
       getRelocationType(RelPrev, PrevType);
-      if (PrevType == macho::RIT_X86_64_Subtractor)
+      if (PrevType == MachO::X86_64_RELOC_SUBTRACTOR)
         Result = true;
     }
   }
@@ -1215,8 +1192,8 @@ symbol_iterator MachOObjectFile::begin_symbols() const {
   if (!SymtabLoadCmd)
     return symbol_iterator(SymbolRef(DRI, this));
 
-  macho::SymtabLoadCommand Symtab = getSymtabLoadCommand();
-  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.SymbolTableOffset));
+  MachO::symtab_command Symtab = getSymtabLoadCommand();
+  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Symtab.symoff));
   return symbol_iterator(SymbolRef(DRI, this));
 }
 
@@ -1225,12 +1202,12 @@ symbol_iterator MachOObjectFile::end_symbols() const {
   if (!SymtabLoadCmd)
     return symbol_iterator(SymbolRef(DRI, this));
 
-  macho::SymtabLoadCommand Symtab = getSymtabLoadCommand();
+  MachO::symtab_command Symtab = getSymtabLoadCommand();
   unsigned SymbolTableEntrySize = is64Bit() ?
-    sizeof(macho::Symbol64TableEntry) :
-    sizeof(macho::SymbolTableEntry);
-  unsigned Offset = Symtab.SymbolTableOffset +
-    Symtab.NumSymbolTableEntries * SymbolTableEntrySize;
+    sizeof(MachO::nlist_64) :
+    sizeof(MachO::nlist);
+  unsigned Offset = Symtab.symoff +
+    Symtab.nsyms * SymbolTableEntrySize;
   DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
   return symbol_iterator(SymbolRef(DRI, this));
 }
@@ -1274,66 +1251,91 @@ StringRef MachOObjectFile::getFileFormatName() const {
   unsigned CPUType = getCPUType(this);
   if (!is64Bit()) {
     switch (CPUType) {
-    case llvm::MachO::CPUTypeI386:
+    case llvm::MachO::CPU_TYPE_I386:
       return "Mach-O 32-bit i386";
-    case llvm::MachO::CPUTypeARM:
+    case llvm::MachO::CPU_TYPE_ARM:
       return "Mach-O arm";
-    case llvm::MachO::CPUTypePowerPC:
+    case llvm::MachO::CPU_TYPE_POWERPC:
       return "Mach-O 32-bit ppc";
     default:
-      assert((CPUType & llvm::MachO::CPUArchABI64) == 0 &&
+      assert((CPUType & llvm::MachO::CPU_ARCH_ABI64) == 0 &&
              "64-bit object file when we're not 64-bit?");
       return "Mach-O 32-bit unknown";
     }
   }
 
   // Make sure the cpu type has the correct mask.
-  assert((CPUType & llvm::MachO::CPUArchABI64)
-	 == llvm::MachO::CPUArchABI64 &&
-	 "32-bit object file when we're 64-bit?");
+  assert((CPUType & llvm::MachO::CPU_ARCH_ABI64)
+         == llvm::MachO::CPU_ARCH_ABI64 &&
+         "32-bit object file when we're 64-bit?");
 
   switch (CPUType) {
-  case llvm::MachO::CPUTypeX86_64:
+  case llvm::MachO::CPU_TYPE_X86_64:
     return "Mach-O 64-bit x86-64";
-  case llvm::MachO::CPUTypePowerPC64:
+  case llvm::MachO::CPU_TYPE_POWERPC64:
     return "Mach-O 64-bit ppc64";
   default:
     return "Mach-O 64-bit unknown";
   }
 }
 
-unsigned MachOObjectFile::getArch() const {
-  switch (getCPUType(this)) {
-  case llvm::MachO::CPUTypeI386:
+Triple::ArchType MachOObjectFile::getArch(uint32_t CPUType) {
+  switch (CPUType) {
+  case llvm::MachO::CPU_TYPE_I386:
     return Triple::x86;
-  case llvm::MachO::CPUTypeX86_64:
+  case llvm::MachO::CPU_TYPE_X86_64:
     return Triple::x86_64;
-  case llvm::MachO::CPUTypeARM:
+  case llvm::MachO::CPU_TYPE_ARM:
     return Triple::arm;
-  case llvm::MachO::CPUTypePowerPC:
+  case llvm::MachO::CPU_TYPE_POWERPC:
     return Triple::ppc;
-  case llvm::MachO::CPUTypePowerPC64:
+  case llvm::MachO::CPU_TYPE_POWERPC64:
     return Triple::ppc64;
   default:
     return Triple::UnknownArch;
   }
 }
 
+unsigned MachOObjectFile::getArch() const {
+  return getArch(getCPUType(this));
+}
+
 StringRef MachOObjectFile::getLoadName() const {
   // TODO: Implement
   report_fatal_error("get_load_name() unimplemented in MachOObjectFile");
 }
 
-relocation_iterator MachOObjectFile::getSectionRelBegin(unsigned Index) const {
+relocation_iterator MachOObjectFile::section_rel_begin(unsigned Index) const {
   DataRefImpl DRI;
   DRI.d.a = Index;
-  return getSectionRelBegin(DRI);
+  return section_rel_begin(DRI);
 }
 
-relocation_iterator MachOObjectFile::getSectionRelEnd(unsigned Index) const {
+relocation_iterator MachOObjectFile::section_rel_end(unsigned Index) const {
   DataRefImpl DRI;
   DRI.d.a = Index;
-  return getSectionRelEnd(DRI);
+  return section_rel_end(DRI);
+}
+
+dice_iterator MachOObjectFile::begin_dices() const {
+  DataRefImpl DRI;
+  if (!DataInCodeLoadCmd)
+    return dice_iterator(DiceRef(DRI, this));
+
+  MachO::linkedit_data_command DicLC = getDataInCodeLoadCommand();
+  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, DicLC.dataoff));
+  return dice_iterator(DiceRef(DRI, this));
+}
+
+dice_iterator MachOObjectFile::end_dices() const {
+  DataRefImpl DRI;
+  if (!DataInCodeLoadCmd)
+    return dice_iterator(DiceRef(DRI, this));
+
+  MachO::linkedit_data_command DicLC = getDataInCodeLoadCommand();
+  unsigned Offset = DicLC.dataoff + DicLC.datasize;
+  DRI.p = reinterpret_cast<uintptr_t>(getPtr(this, Offset));
+  return dice_iterator(DiceRef(DRI, this));
 }
 
 StringRef
@@ -1344,78 +1346,82 @@ MachOObjectFile::getSectionFinalSegmentName(DataRefImpl Sec) const {
 
 ArrayRef<char>
 MachOObjectFile::getSectionRawName(DataRefImpl Sec) const {
-  const SectionBase *Base =
-    reinterpret_cast<const SectionBase*>(Sections[Sec.d.a]);
-  return ArrayRef<char>(Base->Name);
+  const section_base *Base =
+    reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
+  return ArrayRef<char>(Base->sectname);
 }
 
 ArrayRef<char>
 MachOObjectFile::getSectionRawFinalSegmentName(DataRefImpl Sec) const {
-  const SectionBase *Base =
-    reinterpret_cast<const SectionBase*>(Sections[Sec.d.a]);
-  return ArrayRef<char>(Base->SegmentName);
+  const section_base *Base =
+    reinterpret_cast<const section_base *>(Sections[Sec.d.a]);
+  return ArrayRef<char>(Base->segname);
 }
 
 bool
-MachOObjectFile::isRelocationScattered(const macho::RelocationEntry &RE)
+MachOObjectFile::isRelocationScattered(const MachO::any_relocation_info &RE)
   const {
-  if (getCPUType(this) == llvm::MachO::CPUTypeX86_64)
+  if (getCPUType(this) == MachO::CPU_TYPE_X86_64)
     return false;
-  return getPlainRelocationAddress(RE) & macho::RF_Scattered;
+  return getPlainRelocationAddress(RE) & MachO::R_SCATTERED;
 }
 
-unsigned MachOObjectFile::getPlainRelocationSymbolNum(const macho::RelocationEntry &RE) const {
+unsigned MachOObjectFile::getPlainRelocationSymbolNum(
+    const MachO::any_relocation_info &RE) const {
   if (isLittleEndian())
-    return RE.Word1 & 0xffffff;
-  return RE.Word1 >> 8;
+    return RE.r_word1 & 0xffffff;
+  return RE.r_word1 >> 8;
 }
 
-bool MachOObjectFile::getPlainRelocationExternal(const macho::RelocationEntry &RE) const {
+bool MachOObjectFile::getPlainRelocationExternal(
+    const MachO::any_relocation_info &RE) const {
   if (isLittleEndian())
-    return (RE.Word1 >> 27) & 1;
-  return (RE.Word1 >> 4) & 1;
+    return (RE.r_word1 >> 27) & 1;
+  return (RE.r_word1 >> 4) & 1;
 }
 
-bool
-MachOObjectFile::getScatteredRelocationScattered(const macho::RelocationEntry &RE) const {
-  return RE.Word0 >> 31;
+bool MachOObjectFile::getScatteredRelocationScattered(
+    const MachO::any_relocation_info &RE) const {
+  return RE.r_word0 >> 31;
 }
 
-uint32_t
-MachOObjectFile::getScatteredRelocationValue(const macho::RelocationEntry &RE) const {
-  return RE.Word1;
+uint32_t MachOObjectFile::getScatteredRelocationValue(
+    const MachO::any_relocation_info &RE) const {
+  return RE.r_word1;
 }
 
-unsigned
-MachOObjectFile::getAnyRelocationAddress(const macho::RelocationEntry &RE) const {
+unsigned MachOObjectFile::getAnyRelocationAddress(
+    const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationAddress(RE);
   return getPlainRelocationAddress(RE);
 }
 
-unsigned
-MachOObjectFile::getAnyRelocationPCRel(const macho::RelocationEntry &RE) const {
+unsigned MachOObjectFile::getAnyRelocationPCRel(
+    const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationPCRel(this, RE);
   return getPlainRelocationPCRel(this, RE);
 }
 
-unsigned
-MachOObjectFile::getAnyRelocationLength(const macho::RelocationEntry &RE) const {
+unsigned MachOObjectFile::getAnyRelocationLength(
+    const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationLength(RE);
   return getPlainRelocationLength(this, RE);
 }
 
 unsigned
-MachOObjectFile::getAnyRelocationType(const macho::RelocationEntry &RE) const {
+MachOObjectFile::getAnyRelocationType(
+                                   const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE))
     return getScatteredRelocationType(RE);
   return getPlainRelocationType(this, RE);
 }
 
 SectionRef
-MachOObjectFile::getRelocationSection(const macho::RelocationEntry &RE) const {
+MachOObjectFile::getRelocationSection(
+                                   const MachO::any_relocation_info &RE) const {
   if (isRelocationScattered(RE) || getPlainRelocationExternal(RE))
     return *end_sections();
   unsigned SecNum = getPlainRelocationSymbolNum(RE) - 1;
@@ -1428,113 +1434,132 @@ MachOObjectFile::LoadCommandInfo
 MachOObjectFile::getFirstLoadCommandInfo() const {
   MachOObjectFile::LoadCommandInfo Load;
 
-  unsigned HeaderSize = is64Bit() ? macho::Header64Size : macho::Header32Size;
+  unsigned HeaderSize = is64Bit() ? sizeof(MachO::mach_header_64) :
+                                    sizeof(MachO::mach_header);
   Load.Ptr = getPtr(this, HeaderSize);
-  Load.C = getStruct<macho::LoadCommand>(this, Load.Ptr);
+  Load.C = getStruct<MachO::load_command>(this, Load.Ptr);
   return Load;
 }
 
 MachOObjectFile::LoadCommandInfo
 MachOObjectFile::getNextLoadCommandInfo(const LoadCommandInfo &L) const {
   MachOObjectFile::LoadCommandInfo Next;
-  Next.Ptr = L.Ptr + L.C.Size;
-  Next.C = getStruct<macho::LoadCommand>(this, Next.Ptr);
+  Next.Ptr = L.Ptr + L.C.cmdsize;
+  Next.C = getStruct<MachO::load_command>(this, Next.Ptr);
   return Next;
 }
 
-macho::Section MachOObjectFile::getSection(DataRefImpl DRI) const {
-  return getStruct<macho::Section>(this, Sections[DRI.d.a]);
+MachO::section MachOObjectFile::getSection(DataRefImpl DRI) const {
+  return getStruct<MachO::section>(this, Sections[DRI.d.a]);
 }
 
-macho::Section64 MachOObjectFile::getSection64(DataRefImpl DRI) const {
-  return getStruct<macho::Section64>(this, Sections[DRI.d.a]);
+MachO::section_64 MachOObjectFile::getSection64(DataRefImpl DRI) const {
+  return getStruct<MachO::section_64>(this, Sections[DRI.d.a]);
 }
 
-macho::Section MachOObjectFile::getSection(const LoadCommandInfo &L,
+MachO::section MachOObjectFile::getSection(const LoadCommandInfo &L,
                                            unsigned Index) const {
   const char *Sec = getSectionPtr(this, L, Index);
-  return getStruct<macho::Section>(this, Sec);
+  return getStruct<MachO::section>(this, Sec);
 }
 
-macho::Section64 MachOObjectFile::getSection64(const LoadCommandInfo &L,
-                                               unsigned Index) const {
+MachO::section_64 MachOObjectFile::getSection64(const LoadCommandInfo &L,
+                                                unsigned Index) const {
   const char *Sec = getSectionPtr(this, L, Index);
-  return getStruct<macho::Section64>(this, Sec);
+  return getStruct<MachO::section_64>(this, Sec);
 }
 
-macho::SymbolTableEntry
+MachO::nlist
 MachOObjectFile::getSymbolTableEntry(DataRefImpl DRI) const {
   const char *P = reinterpret_cast<const char *>(DRI.p);
-  return getStruct<macho::SymbolTableEntry>(this, P);
+  return getStruct<MachO::nlist>(this, P);
 }
 
-macho::Symbol64TableEntry
+MachO::nlist_64
 MachOObjectFile::getSymbol64TableEntry(DataRefImpl DRI) const {
   const char *P = reinterpret_cast<const char *>(DRI.p);
-  return getStruct<macho::Symbol64TableEntry>(this, P);
+  return getStruct<MachO::nlist_64>(this, P);
 }
 
-macho::LinkeditDataLoadCommand
-MachOObjectFile::getLinkeditDataLoadCommand(const MachOObjectFile::LoadCommandInfo &L) const {
-  return getStruct<macho::LinkeditDataLoadCommand>(this, L.Ptr);
+MachO::linkedit_data_command
+MachOObjectFile::getLinkeditDataLoadCommand(const LoadCommandInfo &L) const {
+  return getStruct<MachO::linkedit_data_command>(this, L.Ptr);
 }
 
-macho::SegmentLoadCommand
+MachO::segment_command
 MachOObjectFile::getSegmentLoadCommand(const LoadCommandInfo &L) const {
-  return getStruct<macho::SegmentLoadCommand>(this, L.Ptr);
+  return getStruct<MachO::segment_command>(this, L.Ptr);
 }
 
-macho::Segment64LoadCommand
+MachO::segment_command_64
 MachOObjectFile::getSegment64LoadCommand(const LoadCommandInfo &L) const {
-  return getStruct<macho::Segment64LoadCommand>(this, L.Ptr);
+  return getStruct<MachO::segment_command_64>(this, L.Ptr);
 }
 
-macho::LinkerOptionsLoadCommand
+MachO::linker_options_command
 MachOObjectFile::getLinkerOptionsLoadCommand(const LoadCommandInfo &L) const {
-  return getStruct<macho::LinkerOptionsLoadCommand>(this, L.Ptr);
+  return getStruct<MachO::linker_options_command>(this, L.Ptr);
 }
 
-macho::RelocationEntry
+MachO::any_relocation_info
 MachOObjectFile::getRelocation(DataRefImpl Rel) const {
   const char *P = reinterpret_cast<const char *>(Rel.p);
-  return getStruct<macho::RelocationEntry>(this, P);
+  return getStruct<MachO::any_relocation_info>(this, P);
+}
+
+MachO::data_in_code_entry
+MachOObjectFile::getDice(DataRefImpl Rel) const {
+  const char *P = reinterpret_cast<const char *>(Rel.p);
+  return getStruct<MachO::data_in_code_entry>(this, P);
 }
 
-macho::Header MachOObjectFile::getHeader() const {
-  return getStruct<macho::Header>(this, getPtr(this, 0));
+MachO::mach_header MachOObjectFile::getHeader() const {
+  return getStruct<MachO::mach_header>(this, getPtr(this, 0));
 }
 
-macho::Header64Ext MachOObjectFile::getHeader64Ext() const {
-  return
-    getStruct<macho::Header64Ext>(this, getPtr(this, sizeof(macho::Header)));
+MachO::mach_header_64 MachOObjectFile::getHeader64() const {
+  return getStruct<MachO::mach_header_64>(this, getPtr(this, 0));
 }
 
-macho::IndirectSymbolTableEntry MachOObjectFile::getIndirectSymbolTableEntry(
-                                          const macho::DysymtabLoadCommand &DLC,
-                                          unsigned Index) const {
-  uint64_t Offset = DLC.IndirectSymbolTableOffset +
-    Index * sizeof(macho::IndirectSymbolTableEntry);
-  return getStruct<macho::IndirectSymbolTableEntry>(this, getPtr(this, Offset));
+uint32_t MachOObjectFile::getIndirectSymbolTableEntry(
+                                             const MachO::dysymtab_command &DLC,
+                                             unsigned Index) const {
+  uint64_t Offset = DLC.indirectsymoff + Index * sizeof(uint32_t);
+  return getStruct<uint32_t>(this, getPtr(this, Offset));
 }
 
-macho::DataInCodeTableEntry
+MachO::data_in_code_entry
 MachOObjectFile::getDataInCodeTableEntry(uint32_t DataOffset,
                                          unsigned Index) const {
-  uint64_t Offset = DataOffset + Index * sizeof(macho::DataInCodeTableEntry);
-  return getStruct<macho::DataInCodeTableEntry>(this, getPtr(this, Offset));
+  uint64_t Offset = DataOffset + Index * sizeof(MachO::data_in_code_entry);
+  return getStruct<MachO::data_in_code_entry>(this, getPtr(this, Offset));
 }
 
-macho::SymtabLoadCommand MachOObjectFile::getSymtabLoadCommand() const {
-  return getStruct<macho::SymtabLoadCommand>(this, SymtabLoadCmd);
+MachO::symtab_command MachOObjectFile::getSymtabLoadCommand() const {
+  return getStruct<MachO::symtab_command>(this, SymtabLoadCmd);
 }
 
-macho::DysymtabLoadCommand MachOObjectFile::getDysymtabLoadCommand() const {
-  return getStruct<macho::DysymtabLoadCommand>(this, DysymtabLoadCmd);
+MachO::dysymtab_command MachOObjectFile::getDysymtabLoadCommand() const {
+  return getStruct<MachO::dysymtab_command>(this, DysymtabLoadCmd);
+}
+
+MachO::linkedit_data_command
+MachOObjectFile::getDataInCodeLoadCommand() const {
+  if (DataInCodeLoadCmd)
+    return getStruct<MachO::linkedit_data_command>(this, DataInCodeLoadCmd);
+
+  // If there is no DataInCodeLoadCmd return a load command with zero'ed fields.
+  MachO::linkedit_data_command Cmd;
+  Cmd.cmd = MachO::LC_DATA_IN_CODE;
+  Cmd.cmdsize = sizeof(MachO::linkedit_data_command);
+  Cmd.dataoff = 0;
+  Cmd.datasize = 0;
+  return Cmd;
 }
 
 StringRef MachOObjectFile::getStringTableData() const {
-  macho::SymtabLoadCommand S = getSymtabLoadCommand();
-  return getData().substr(S.StringTableOffset, S.StringTableSize);
+  MachO::symtab_command S = getSymtabLoadCommand();
+  return getData().substr(S.stroff, S.strsize);
 }
 
 bool MachOObjectFile::is64Bit() const {
@@ -1557,21 +1582,23 @@ void MachOObjectFile::ReadULEB128s(uint64_t Index,
 ObjectFile *ObjectFile::createMachOObjectFile(MemoryBuffer *Buffer) {
   StringRef Magic = Buffer->getBuffer().slice(0, 4);
   error_code ec;
-  ObjectFile *Ret;
+  OwningPtr<ObjectFile> Ret;
   if (Magic == "\xFE\xED\xFA\xCE")
-    Ret = new MachOObjectFile(Buffer, false, false, ec);
+    Ret.reset(new MachOObjectFile(Buffer, false, false, ec));
   else if (Magic == "\xCE\xFA\xED\xFE")
-    Ret = new MachOObjectFile(Buffer, true, false, ec);
+    Ret.reset(new MachOObjectFile(Buffer, true, false, ec));
   else if (Magic == "\xFE\xED\xFA\xCF")
-    Ret = new MachOObjectFile(Buffer, false, true, ec);
+    Ret.reset(new MachOObjectFile(Buffer, false, true, ec));
   else if (Magic == "\xCF\xFA\xED\xFE")
-    Ret = new MachOObjectFile(Buffer, true, true, ec);
-  else
+    Ret.reset(new MachOObjectFile(Buffer, true, true, ec));
+  else {
+    delete Buffer;
     return NULL;
+  }
 
   if (ec)
     return NULL;
-  return Ret;
+  return Ret.take();
 }
 
 } // end namespace object
diff --git a/lib/Object/MachOUniversal.cpp b/lib/Object/MachOUniversal.cpp
new file mode 100644
index 0000000..75160af
--- /dev/null
+++ b/lib/Object/MachOUniversal.cpp
@@ -0,0 +1,139 @@
+//===- MachOUniversal.cpp - Mach-O universal binary -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachOUniversalBinary class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/MachOUniversal.h"
+
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+using namespace llvm;
+using namespace object;
+
+template<typename T>
+static void SwapValue(T &Value) {
+  Value = sys::SwapByteOrder(Value);
+}
+
+template<typename T>
+static void SwapStruct(T &Value);
+
+template<>
+void SwapStruct(MachO::fat_header &H) {
+  SwapValue(H.magic);
+  SwapValue(H.nfat_arch);
+}
+
+template<>
+void SwapStruct(MachO::fat_arch &H) {
+  SwapValue(H.cputype);
+  SwapValue(H.cpusubtype);
+  SwapValue(H.offset);
+  SwapValue(H.size);
+  SwapValue(H.align);
+}
+
+template<typename T>
+static T getUniversalBinaryStruct(const char *Ptr) {
+  T Res;
+  memcpy(&Res, Ptr, sizeof(T));
+  // Universal binary headers have big-endian byte order.
+  if (sys::IsLittleEndianHost)
+    SwapStruct(Res);
+  return Res;
+}
+
+MachOUniversalBinary::ObjectForArch::ObjectForArch(
+    const MachOUniversalBinary *Parent, uint32_t Index)
+    : Parent(Parent), Index(Index) {
+  if (Parent == 0 || Index > Parent->getNumberOfObjects()) {
+    clear();
+  } else {
+    // Parse object header.
+    StringRef ParentData = Parent->getData();
+    const char *HeaderPos = ParentData.begin() + sizeof(MachO::fat_header) +
+                            Index * sizeof(MachO::fat_arch);
+    Header = getUniversalBinaryStruct<MachO::fat_arch>(HeaderPos);
+    if (ParentData.size() < Header.offset + Header.size) {
+      clear();
+    }
+  }
+}
+
+error_code MachOUniversalBinary::ObjectForArch::getAsObjectFile(
+    OwningPtr<ObjectFile> &Result) const {
+  if (Parent) {
+    StringRef ParentData = Parent->getData();
+    StringRef ObjectData = ParentData.substr(Header.offset, Header.size);
+    std::string ObjectName =
+        Parent->getFileName().str() + ":" +
+        Triple::getArchTypeName(MachOObjectFile::getArch(Header.cputype));
+    MemoryBuffer *ObjBuffer = MemoryBuffer::getMemBuffer(
+        ObjectData, ObjectName, false);
+    if (ObjectFile *Obj = ObjectFile::createMachOObjectFile(ObjBuffer)) {
+      Result.reset(Obj);
+      return object_error::success;
+    }
+  }
+  return object_error::parse_failed;
+}
+
+void MachOUniversalBinary::anchor() { }
+
+MachOUniversalBinary::MachOUniversalBinary(MemoryBuffer *Source,
+                                           error_code &ec)
+  : Binary(Binary::ID_MachOUniversalBinary, Source),
+    NumberOfObjects(0) {
+  if (Source->getBufferSize() < sizeof(MachO::fat_header)) {
+    ec = object_error::invalid_file_type;
+    return;
+  }
+  // Check for magic value and sufficient header size.
+  StringRef Buf = getData();
+  MachO::fat_header H= getUniversalBinaryStruct<MachO::fat_header>(Buf.begin());
+  NumberOfObjects = H.nfat_arch;
+  uint32_t MinSize = sizeof(MachO::fat_header) +
+                     sizeof(MachO::fat_arch) * NumberOfObjects;
+  if (H.magic != MachO::FAT_MAGIC || Buf.size() < MinSize) {
+    ec = object_error::parse_failed;
+    return;
+  }
+  ec = object_error::success;
+}
+
+static bool getCTMForArch(Triple::ArchType Arch, MachO::CPUType &CTM) {
+  switch (Arch) {
+    case Triple::x86:    CTM = MachO::CPU_TYPE_I386; return true;
+    case Triple::x86_64: CTM = MachO::CPU_TYPE_X86_64; return true;
+    case Triple::arm:    CTM = MachO::CPU_TYPE_ARM; return true;
+    case Triple::sparc:  CTM = MachO::CPU_TYPE_SPARC; return true;
+    case Triple::ppc:    CTM = MachO::CPU_TYPE_POWERPC; return true;
+    case Triple::ppc64:  CTM = MachO::CPU_TYPE_POWERPC64; return true;
+    default: return false;
+  }
+}
+
+error_code
+MachOUniversalBinary::getObjectForArch(Triple::ArchType Arch,
+                                       OwningPtr<ObjectFile> &Result) const {
+  MachO::CPUType CTM;
+  if (!getCTMForArch(Arch, CTM))
+    return object_error::arch_not_found;
+  for (object_iterator I = begin_objects(), E = end_objects(); I != E; ++I) {
+    if (I->getCPUType() == static_cast<uint32_t>(CTM))
+      return I->getAsObjectFile(Result);
+  }
+  return object_error::arch_not_found;
+}
diff --git a/lib/Object/Object.cpp b/lib/Object/Object.cpp
index 3e2c78e..6941708 100644
--- a/lib/Object/Object.cpp
+++ b/lib/Object/Object.cpp
@@ -219,10 +219,7 @@ uint64_t LLVMGetRelocationOffset(LLVMRelocationIteratorRef RI) {
 }
 
 LLVMSymbolIteratorRef LLVMGetRelocationSymbol(LLVMRelocationIteratorRef RI) {
-  SymbolRef ret;
-  if (error_code ec = (*unwrap(RI))->getSymbol(ret))
-    report_fatal_error(ec.message());
-
+  symbol_iterator ret = (*unwrap(RI))->getSymbol();
   return wrap(new symbol_iterator(ret));
 }
 
diff --git a/lib/Object/ObjectFile.cpp b/lib/Object/ObjectFile.cpp
index 77fd995..0e626d6 100644
--- a/lib/Object/ObjectFile.cpp
+++ b/lib/Object/ObjectFile.cpp
@@ -14,8 +14,8 @@
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Path.h"
 #include "llvm/Support/system_error.h"
 
 using namespace llvm;
@@ -33,35 +33,47 @@ error_code ObjectFile::getSymbolAlignment(DataRefImpl DRI,
   return object_error::success;
 }
 
+section_iterator ObjectFile::getRelocatedSection(DataRefImpl Sec) const {
+  return section_iterator(SectionRef(Sec, this));
+}
+
 ObjectFile *ObjectFile::createObjectFile(MemoryBuffer *Object) {
-  if (!Object || Object->getBufferSize() < 64)
+  if (Object->getBufferSize() < 64) {
+    delete Object;
+    return 0;
+  }
+
+  sys::fs::file_magic Type = sys::fs::identify_magic(Object->getBuffer());
+  switch (Type) {
+  case sys::fs::file_magic::unknown:
+  case sys::fs::file_magic::bitcode:
+  case sys::fs::file_magic::archive:
+  case sys::fs::file_magic::macho_universal_binary:
+  case sys::fs::file_magic::windows_resource:
+    delete Object;
     return 0;
-  sys::LLVMFileType type = sys::IdentifyFileType(Object->getBufferStart(),
-                                static_cast<unsigned>(Object->getBufferSize()));
-  switch (type) {
-    case sys::Unknown_FileType:
-      return 0;
-    case sys::ELF_Relocatable_FileType:
-    case sys::ELF_Executable_FileType:
-    case sys::ELF_SharedObject_FileType:
-    case sys::ELF_Core_FileType:
-      return createELFObjectFile(Object);
-    case sys::Mach_O_Object_FileType:
-    case sys::Mach_O_Executable_FileType:
-    case sys::Mach_O_FixedVirtualMemorySharedLib_FileType:
-    case sys::Mach_O_Core_FileType:
-    case sys::Mach_O_PreloadExecutable_FileType:
-    case sys::Mach_O_DynamicallyLinkedSharedLib_FileType:
-    case sys::Mach_O_DynamicLinker_FileType:
-    case sys::Mach_O_Bundle_FileType:
-    case sys::Mach_O_DynamicallyLinkedSharedLibStub_FileType:
-    case sys::Mach_O_DSYMCompanion_FileType:
-      return createMachOObjectFile(Object);
-    case sys::COFF_FileType:
-      return createCOFFObjectFile(Object);
-    default:
-      llvm_unreachable("Unexpected Object File Type");
+  case sys::fs::file_magic::elf_relocatable:
+  case sys::fs::file_magic::elf_executable:
+  case sys::fs::file_magic::elf_shared_object:
+  case sys::fs::file_magic::elf_core:
+    return createELFObjectFile(Object);
+  case sys::fs::file_magic::macho_object:
+  case sys::fs::file_magic::macho_executable:
+  case sys::fs::file_magic::macho_fixed_virtual_memory_shared_lib:
+  case sys::fs::file_magic::macho_core:
+  case sys::fs::file_magic::macho_preload_executable:
+  case sys::fs::file_magic::macho_dynamically_linked_shared_lib:
+  case sys::fs::file_magic::macho_dynamic_linker:
+  case sys::fs::file_magic::macho_bundle:
+  case sys::fs::file_magic::macho_dynamically_linked_shared_lib_stub:
+  case sys::fs::file_magic::macho_dsym_companion:
+    return createMachOObjectFile(Object);
+  case sys::fs::file_magic::coff_object:
+  case sys::fs::file_magic::coff_import_library:
+  case sys::fs::file_magic::pecoff_executable:
+    return createCOFFObjectFile(Object);
   }
+  llvm_unreachable("Unexpected Object File Type");
 }
 
 ObjectFile *ObjectFile::createObjectFile(StringRef ObjectPath) {
diff --git a/lib/Object/YAML.cpp b/lib/Object/YAML.cpp
new file mode 100644
index 0000000..c527bde
--- /dev/null
+++ b/lib/Object/YAML.cpp
@@ -0,0 +1,68 @@
+//===- YAML.cpp - YAMLIO utilities for object files -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utility classes for handling the YAML representation of
+// object files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/YAML.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
+using namespace llvm;
+using namespace object::yaml;
+
+void yaml::ScalarTraits<object::yaml::BinaryRef>::output(
+    const object::yaml::BinaryRef &Val, void *, llvm::raw_ostream &Out) {
+  Val.writeAsHex(Out);
+}
+
+StringRef yaml::ScalarTraits<object::yaml::BinaryRef>::input(
+    StringRef Scalar, void *, object::yaml::BinaryRef &Val) {
+  if (Scalar.size() % 2 != 0)
+    return "BinaryRef hex string must contain an even number of nybbles.";
+  // TODO: Can we improve YAMLIO to permit a more accurate diagnostic here?
+  // (e.g. a caret pointing to the offending character).
+  for (unsigned I = 0, N = Scalar.size(); I != N; ++I)
+    if (!isxdigit(Scalar[I]))
+      return "BinaryRef hex string must contain only hex digits.";
+  Val = object::yaml::BinaryRef(Scalar);
+  return StringRef();
+}
+
+void BinaryRef::writeAsBinary(raw_ostream &OS) const {
+  if (!DataIsHexString) {
+    OS.write((const char *)Data.data(), Data.size());
+    return;
+  }
+  for (unsigned I = 0, N = Data.size(); I != N; I += 2) {
+    uint8_t Byte;
+    StringRef((const char *)&Data[I],  2).getAsInteger(16, Byte);
+    OS.write(Byte);
+  }
+}
+
+void BinaryRef::writeAsHex(raw_ostream &OS) const {
+  if (binary_size() == 0) {
+    OS << "\"\"";
+    return;
+  }
+  if (DataIsHexString) {
+    OS.write((const char *)Data.data(), Data.size());
+    return;
+  }
+  for (ArrayRef<uint8_t>::iterator I = Data.begin(), E = Data.end(); I != E;
+       ++I) {
+    uint8_t Byte = *I;
+    OS << hexdigit(Byte >> 4);
+    OS << hexdigit(Byte & 0xf);
+  }
+}
diff --git a/lib/Option/ArgList.cpp b/lib/Option/ArgList.cpp
index 39b22d7..15f7e8b 100644
--- a/lib/Option/ArgList.cpp
+++ b/lib/Option/ArgList.cpp
@@ -206,6 +206,13 @@ bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier Neg, bool Default) const {
   return Default;
 }
 
+bool ArgList::hasFlag(OptSpecifier Pos, OptSpecifier PosAlias, OptSpecifier Neg,
+                      bool Default) const {
+  if (Arg *A = getLastArg(Pos, PosAlias, Neg))
+    return A->getOption().matches(Pos) || A->getOption().matches(PosAlias);
+  return Default;
+}
+
 StringRef ArgList::getLastArgValue(OptSpecifier Id,
                                          StringRef Default) const {
   if (Arg *A = getLastArg(Id))
@@ -226,6 +233,14 @@ void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id) const {
   }
 }
 
+void ArgList::AddLastArg(ArgStringList &Output, OptSpecifier Id0,
+                         OptSpecifier Id1) const {
+  if (Arg *A = getLastArg(Id0, Id1)) {
+    A->claim();
+    A->render(*this, Output);
+  }
+}
+
 void ArgList::AddAllArgs(ArgStringList &Output, OptSpecifier Id0,
                          OptSpecifier Id1, OptSpecifier Id2) const {
   for (arg_iterator it = filtered_begin(Id0, Id1, Id2),
diff --git a/lib/Option/CMakeLists.txt b/lib/Option/CMakeLists.txt
index 2e7acc2..1cd7d3a 100644
--- a/lib/Option/CMakeLists.txt
+++ b/lib/Option/CMakeLists.txt
@@ -4,5 +4,3 @@ add_llvm_library(LLVMOption
   Option.cpp
   OptTable.cpp
   )
-
-target_link_libraries(LLVMOption LLVMSupport)
diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp
index 5c8a0ea..6fa459a 100644
--- a/lib/Option/OptTable.cpp
+++ b/lib/Option/OptTable.cpp
@@ -14,26 +14,27 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cctype>
 #include <map>
 
 using namespace llvm;
 using namespace llvm::opt;
 
-// Ordering on Info. The ordering is *almost* lexicographic, with two
-// exceptions. First, '\0' comes at the end of the alphabet instead of
-// the beginning (thus options precede any other options which prefix
-// them). Second, for options with the same name, the less permissive
-// version should come first; a Flag option should precede a Joined
-// option, for example.
+namespace llvm {
+namespace opt {
 
-static int StrCmpOptionName(const char *A, const char *B) {
-  char a = *A, b = *B;
+// Ordering on Info. The ordering is *almost* case-insensitive lexicographic,
+// with an exceptions. '\0' comes at the end of the alphabet instead of the
+// beginning (thus options precede any other options which prefix them).
+static int StrCmpOptionNameIgnoreCase(const char *A, const char *B) {
+  const char *X = A, *Y = B;
+  char a = tolower(*A), b = tolower(*B);
   while (a == b) {
     if (a == '\0')
       return 0;
 
-    a = *++A;
-    b = *++B;
+    a = tolower(*++X);
+    b = tolower(*++Y);
   }
 
   if (a == '\0') // A is a prefix of B.
@@ -45,21 +46,25 @@ static int StrCmpOptionName(const char *A, const char *B) {
   return (a < b) ? -1 : 1;
 }
 
-namespace llvm {
-namespace opt {
+#ifndef NDEBUG
+static int StrCmpOptionName(const char *A, const char *B) {
+  if (int N = StrCmpOptionNameIgnoreCase(A, B))
+    return N;
+  return strcmp(A, B);
+}
 
 static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
   if (&A == &B)
     return false;
 
   if (int N = StrCmpOptionName(A.Name, B.Name))
-    return N == -1;
+    return N < 0;
 
   for (const char * const *APre = A.Prefixes,
                   * const *BPre = B.Prefixes;
                           *APre != 0 && *BPre != 0; ++APre, ++BPre) {
     if (int N = StrCmpOptionName(*APre, *BPre))
-      return N == -1;
+      return N < 0;
   }
 
   // Names are the same, check that classes are in order; exactly one
@@ -68,22 +73,22 @@ static inline bool operator<(const OptTable::Info &A, const OptTable::Info &B) {
          "Unexpected classes for options with same name.");
   return B.Kind == Option::JoinedClass;
 }
+#endif
 
 // Support lower_bound between info and an option name.
 static inline bool operator<(const OptTable::Info &I, const char *Name) {
-  return StrCmpOptionName(I.Name, Name) == -1;
-}
-static inline bool operator<(const char *Name, const OptTable::Info &I) {
-  return StrCmpOptionName(Name, I.Name) == -1;
+  return StrCmpOptionNameIgnoreCase(I.Name, Name) < 0;
 }
 }
 }
 
 OptSpecifier::OptSpecifier(const Option *Opt) : ID(Opt->getID()) {}
 
-OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos)
+OptTable::OptTable(const Info *_OptionInfos, unsigned _NumOptionInfos,
+                   bool _IgnoreCase)
   : OptionInfos(_OptionInfos),
     NumOptionInfos(_NumOptionInfos),
+    IgnoreCase(_IgnoreCase),
     TheInputOptionID(0),
     TheUnknownOptionID(0),
     FirstSearchableIndex(0)
@@ -160,10 +165,6 @@ const Option OptTable::getOption(OptSpecifier Opt) const {
   return Option(&getInfo(id), this);
 }
 
-bool OptTable::isOptionHelpHidden(OptSpecifier id) const {
-  return getInfo(id).Flags & HelpHidden;
-}
-
 static bool isInput(const llvm::StringSet<> &Prefixes, StringRef Arg) {
   if (Arg == "-")
     return true;
@@ -175,16 +176,25 @@ static bool isInput(const llvm::StringSet<> &Prefixes, StringRef Arg) {
 }
 
 /// \returns Matched size. 0 means no match.
-static unsigned matchOption(const OptTable::Info *I, StringRef Str) {
+static unsigned matchOption(const OptTable::Info *I, StringRef Str,
+                            bool IgnoreCase) {
   for (const char * const *Pre = I->Prefixes; *Pre != 0; ++Pre) {
     StringRef Prefix(*Pre);
-    if (Str.startswith(Prefix) && Str.substr(Prefix.size()).startswith(I->Name))
-      return Prefix.size() + StringRef(I->Name).size();
+    if (Str.startswith(Prefix)) {
+      StringRef Rest = Str.substr(Prefix.size());
+      bool Matched = IgnoreCase
+          ? Rest.startswith_lower(I->Name)
+          : Rest.startswith(I->Name);
+      if (Matched)
+        return Prefix.size() + StringRef(I->Name).size();
+    }
   }
   return 0;
 }
 
-Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index) const {
+Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index,
+                           unsigned FlagsToInclude,
+                           unsigned FlagsToExclude) const {
   unsigned Prev = Index;
   const char *Str = Args.getArgString(Index);
 
@@ -212,13 +222,20 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index) const {
     unsigned ArgSize = 0;
     // Scan for first option which is a proper prefix.
     for (; Start != End; ++Start)
-      if ((ArgSize = matchOption(Start, Str)))
+      if ((ArgSize = matchOption(Start, Str, IgnoreCase)))
         break;
     if (Start == End)
       break;
 
+    Option Opt(Start, this);
+
+    if (FlagsToInclude && !Opt.hasFlag(FlagsToInclude))
+      continue;
+    if (Opt.hasFlag(FlagsToExclude))
+      continue;
+
     // See if this option matches.
-    if (Arg *A = Option(Start, this).accept(Args, Index, ArgSize))
+    if (Arg *A = Opt.accept(Args, Index, ArgSize))
       return A;
 
     // Otherwise, see if this argument was missing values.
@@ -226,13 +243,20 @@ Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index) const {
       return 0;
   }
 
+  // If we failed to find an option and this arg started with /, then it's
+  // probably an input path.
+  if (Str[0] == '/')
+    return new Arg(getOption(TheInputOptionID), Str, Index++, Str);
+
   return new Arg(getOption(TheUnknownOptionID), Str, Index++, Str);
 }
 
-InputArgList *OptTable::ParseArgs(const char* const *ArgBegin,
-                                  const char* const *ArgEnd,
+InputArgList *OptTable::ParseArgs(const char *const *ArgBegin,
+                                  const char *const *ArgEnd,
                                   unsigned &MissingArgIndex,
-                                  unsigned &MissingArgCount) const {
+                                  unsigned &MissingArgCount,
+                                  unsigned FlagsToInclude,
+                                  unsigned FlagsToExclude) const {
   InputArgList *Args = new InputArgList(ArgBegin, ArgEnd);
 
   // FIXME: Handle '@' args (or at least error on them).
@@ -241,13 +265,14 @@ InputArgList *OptTable::ParseArgs(const char* const *ArgBegin,
   unsigned Index = 0, End = ArgEnd - ArgBegin;
   while (Index < End) {
     // Ignore empty arguments (other things may still take them as arguments).
-    if (Args->getArgString(Index)[0] == '\0') {
+    StringRef Str = Args->getArgString(Index);
+    if (Str == "") {
       ++Index;
       continue;
     }
 
     unsigned Prev = Index;
-    Arg *A = ParseOneArg(*Args, Index);
+    Arg *A = ParseOneArg(*Args, Index, FlagsToInclude, FlagsToExclude);
     assert(Index > Prev && "Parser failed to consume argument.");
 
     // Check for missing argument error.
@@ -281,6 +306,7 @@ static std::string getOptionHelpName(const OptTable &Opts, OptSpecifier Id) {
     break;
 
   case Option::SeparateClass: case Option::JoinedOrSeparateClass:
+  case Option::RemainingArgsClass:
     Name += ' ';
     // FALLTHROUGH
   case Option::JoinedClass: case Option::CommaJoinedClass:
@@ -346,8 +372,16 @@ static const char *getOptionHelpGroup(const OptTable &Opts, OptSpecifier Id) {
   return getOptionHelpGroup(Opts, GroupID);
 }
 
-void OptTable::PrintHelp(raw_ostream &OS, const char *Name,
-                         const char *Title, bool ShowHidden) const {
+void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title,
+                         bool ShowHidden) const {
+  PrintHelp(OS, Name, Title, /*Include*/ 0, /*Exclude*/
+            (ShowHidden ? 0 : HelpHidden));
+}
+
+
+void OptTable::PrintHelp(raw_ostream &OS, const char *Name, const char *Title,
+                         unsigned FlagsToInclude,
+                         unsigned FlagsToExclude) const {
   OS << "OVERVIEW: " << Title << "\n";
   OS << '\n';
   OS << "USAGE: " << Name << " [options] <inputs>\n";
@@ -366,7 +400,10 @@ void OptTable::PrintHelp(raw_ostream &OS, const char *Name,
     if (getOptionKind(Id) == Option::GroupClass)
       continue;
 
-    if (!ShowHidden && isOptionHelpHidden(Id))
+    unsigned Flags = getInfo(Id).Flags;
+    if (FlagsToInclude && !(Flags & FlagsToInclude))
+      continue;
+    if (Flags & FlagsToExclude)
       continue;
 
     if (const char *Text = getOptionHelpText(Id)) {
diff --git a/lib/Option/Option.cpp b/lib/Option/Option.cpp
index 0e22634..7b5ff2b 100644
--- a/lib/Option/Option.cpp
+++ b/lib/Option/Option.cpp
@@ -22,12 +22,17 @@ using namespace llvm::opt;
 Option::Option(const OptTable::Info *info, const OptTable *owner)
   : Info(info), Owner(owner) {
 
-  // Multi-level aliases are not supported, and alias options cannot
-  // have groups. This just simplifies option tracking, it is not an
-  // inherent limitation.
-  assert((!Info || !getAlias().isValid() || (!getAlias().getAlias().isValid() &&
-         !getGroup().isValid())) &&
-         "Multi-level aliases and aliases with groups are unsupported.");
+  // Multi-level aliases are not supported. This just simplifies option
+  // tracking, it is not an inherent limitation.
+  assert((!Info || !getAlias().isValid() || !getAlias().getAlias().isValid()) &&
+         "Multi-level aliases are not supported.");
+
+  if (Info && getAliasArgs()) {
+    assert(getAlias().isValid() && "Only alias options can have alias args.");
+    assert(getKind() == FlagClass && "Only Flag aliases can have alias args.");
+    assert(getAlias().getKind() != FlagClass &&
+           "Cannot provide alias args to a flag option.");
+  }
 }
 
 Option::~Option() {
@@ -47,14 +52,17 @@ void Option::dump() const {
     P(MultiArgClass);
     P(JoinedOrSeparateClass);
     P(JoinedAndSeparateClass);
+    P(RemainingArgsClass);
 #undef P
   }
 
-  llvm::errs() << " Prefixes:[";
-  for (const char * const *Pre = Info->Prefixes; *Pre != 0; ++Pre) {
-    llvm::errs() << '"' << *Pre << (*(Pre + 1) == 0 ? "\"" : "\", ");
+  if (Info->Prefixes) {
+    llvm::errs() << " Prefixes:[";
+    for (const char * const *Pre = Info->Prefixes; *Pre != 0; ++Pre) {
+      llvm::errs() << '"' << *Pre << (*(Pre + 1) == 0 ? "\"" : "\", ");
+    }
+    llvm::errs() << ']';
   }
-  llvm::errs() << ']';
 
   llvm::errs() << " Name:\"" << getName() << '"';
 
@@ -106,11 +114,22 @@ Arg *Option::accept(const ArgList &Args,
   }
 
   switch (getKind()) {
-  case FlagClass:
+  case FlagClass: {
     if (ArgSize != strlen(Args.getArgString(Index)))
       return 0;
 
-    return new Arg(UnaliasedOption, Spelling, Index++);
+    Arg *A = new Arg(UnaliasedOption, Spelling, Index++);
+    if (getAliasArgs()) {
+      const char *Val = getAliasArgs();
+      while (*Val != '\0') {
+        A->getValues().push_back(Val);
+
+        // Move past the '\0' to the next argument.
+        Val += strlen(Val) + 1;
+      }
+    }
+    return A;
+  }
   case JoinedClass: {
     const char *Value = Args.getArgString(Index) + ArgSize;
     return new Arg(UnaliasedOption, Spelling, Index++, Value);
@@ -196,6 +215,16 @@ Arg *Option::accept(const ArgList &Args,
     return new Arg(UnaliasedOption, Spelling, Index - 2,
                    Args.getArgString(Index - 2) + ArgSize,
                    Args.getArgString(Index - 1));
+  case RemainingArgsClass: {
+    // Matches iff this is an exact match.
+    // FIXME: Avoid strlen.
+    if (ArgSize != strlen(Args.getArgString(Index)))
+      return 0;
+    Arg *A = new Arg(UnaliasedOption, Spelling, Index++);
+    while (Index < Args.getNumInputArgStrings())
+      A->getValues().push_back(Args.getArgString(Index++));
+    return A;
+  }
   default:
     llvm_unreachable("Invalid option kind!");
   }
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 6182e34..676e2d4 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -25,7 +25,13 @@
 
 using namespace llvm;
 
-#define convolve(lhs, rhs) ((lhs) * 4 + (rhs))
+/// A macro used to combine two fcCategory enums into one key which can be used
+/// in a switch statement to classify how the interaction of two APFloat's
+/// categories affects an operation.
+///
+/// TODO: If clang source code is ever allowed to use constexpr in its own
+/// codebase, change this into a static inline function.
+#define PackCategoriesIntoKey(_lhs, _rhs) ((_lhs) * 4 + (_rhs))
 
 /* Assumed in hexadecimal significand parsing, and conversion to
    hexadecimal strings.  */
@@ -38,11 +44,11 @@ namespace llvm {
   struct fltSemantics {
     /* The largest E such that 2^E is representable; this matches the
        definition of IEEE 754.  */
-    exponent_t maxExponent;
+    APFloat::ExponentType maxExponent;
 
     /* The smallest E such that 2^E is a normalized number; this
        matches the definition of IEEE 754.  */
-    exponent_t minExponent;
+    APFloat::ExponentType minExponent;
 
     /* Number of bits in the significand.  This includes the integer
        bit.  */
@@ -288,9 +294,9 @@ interpretDecimal(StringRef::iterator begin, StringRef::iterator end,
     }
 
     /* Adjust the exponents for any decimal point.  */
-    D->exponent += static_cast<exponent_t>((dot - p) - (dot > p));
+    D->exponent += static_cast<APFloat::ExponentType>((dot - p) - (dot > p));
     D->normalizedExponent = (D->exponent +
-              static_cast<exponent_t>((p - D->firstSigDigit)
+              static_cast<APFloat::ExponentType>((p - D->firstSigDigit)
                                       - (dot > D->firstSigDigit && dot < p)));
   }
 
@@ -313,8 +319,8 @@ trailingHexadecimalFraction(StringRef::iterator p, StringRef::iterator end,
   else if (digitValue < 8 && digitValue > 0)
     return lfLessThanHalf;
 
-  /* Otherwise we need to find the first non-zero digit.  */
-  while (*p == '0')
+  // Otherwise we need to find the first non-zero digit.
+  while (p != end && (*p == '0' || *p == '.'))
     p++;
 
   assert(p != end && "Invalid trailing hexadecimal fraction!");
@@ -580,7 +586,7 @@ APFloat::initialize(const fltSemantics *ourSemantics)
 void
 APFloat::freeSignificand()
 {
-  if (partCount() > 1)
+  if (needsCleanup())
     delete [] significand.parts;
 }
 
@@ -592,14 +598,14 @@ APFloat::assign(const APFloat &rhs)
   sign = rhs.sign;
   category = rhs.category;
   exponent = rhs.exponent;
-  if (category == fcNormal || category == fcNaN)
+  if (isFiniteNonZero() || category == fcNaN)
     copySignificand(rhs);
 }
 
 void
 APFloat::copySignificand(const APFloat &rhs)
 {
-  assert(category == fcNormal || category == fcNaN);
+  assert(isFiniteNonZero() || category == fcNaN);
   assert(rhs.partCount() >= partCount());
 
   APInt::tcAssign(significandParts(), rhs.significandParts(),
@@ -679,12 +685,73 @@ APFloat::operator=(const APFloat &rhs)
 
 bool
 APFloat::isDenormal() const {
-  return isNormal() && (exponent == semantics->minExponent) &&
+  return isFiniteNonZero() && (exponent == semantics->minExponent) &&
          (APInt::tcExtractBit(significandParts(), 
                               semantics->precision - 1) == 0);
 }
 
 bool
+APFloat::isSmallest() const {
+  // The smallest number by magnitude in our format will be the smallest
+  // denormal, i.e. the floating point number with exponent being minimum
+  // exponent and significand bitwise equal to 1 (i.e. with MSB equal to 0).
+  return isFiniteNonZero() && exponent == semantics->minExponent &&
+    significandMSB() == 0;
+}
+
+bool APFloat::isSignificandAllOnes() const {
+  // Test if the significand excluding the integral bit is all ones. This allows
+  // us to test for binade boundaries.
+  const integerPart *Parts = significandParts();
+  const unsigned PartCount = partCount();
+  for (unsigned i = 0; i < PartCount - 1; i++)
+    if (~Parts[i])
+      return false;
+
+  // Set the unused high bits to all ones when we compare.
+  const unsigned NumHighBits =
+    PartCount*integerPartWidth - semantics->precision + 1;
+  assert(NumHighBits <= integerPartWidth && "Can not have more high bits to "
+         "fill than integerPartWidth");
+  const integerPart HighBitFill =
+    ~integerPart(0) << (integerPartWidth - NumHighBits);
+  if (~(Parts[PartCount - 1] | HighBitFill))
+    return false;
+
+  return true;
+}
+
+bool APFloat::isSignificandAllZeros() const {
+  // Test if the significand excluding the integral bit is all zeros. This
+  // allows us to test for binade boundaries.
+  const integerPart *Parts = significandParts();
+  const unsigned PartCount = partCount();
+
+  for (unsigned i = 0; i < PartCount - 1; i++)
+    if (Parts[i])
+      return false;
+
+  const unsigned NumHighBits =
+    PartCount*integerPartWidth - semantics->precision + 1;
+  assert(NumHighBits <= integerPartWidth && "Can not have more high bits to "
+         "clear than integerPartWidth");
+  const integerPart HighBitMask = ~integerPart(0) >> NumHighBits;
+
+  if (Parts[PartCount - 1] & HighBitMask)
+    return false;
+
+  return true;
+}
+
+bool
+APFloat::isLargest() const {
+  // The largest number by magnitude in our format will be the floating point
+  // number with maximum exponent and with significand that is all ones.
+  return isFiniteNonZero() && exponent == semantics->maxExponent
+    && isSignificandAllOnes();
+}
+
+bool
 APFloat::bitwiseIsEqual(const APFloat &rhs) const {
   if (this == &rhs)
     return true;
@@ -694,7 +761,7 @@ APFloat::bitwiseIsEqual(const APFloat &rhs) const {
     return false;
   if (category==fcZero || category==fcInfinity)
     return true;
-  else if (category==fcNormal && exponent!=rhs.exponent)
+  else if (isFiniteNonZero() && exponent!=rhs.exponent)
     return false;
   else {
     int i= partCount();
@@ -711,6 +778,7 @@ APFloat::bitwiseIsEqual(const APFloat &rhs) const {
 APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value) {
   initialize(&ourSemantics);
   sign = 0;
+  category = fcNormal;
   zeroSignificand();
   exponent = ourSemantics.precision - 1;
   significandParts()[0] = value;
@@ -728,17 +796,6 @@ APFloat::APFloat(const fltSemantics &ourSemantics, uninitializedTag tag) {
   initialize(&ourSemantics);
 }
 
-APFloat::APFloat(const fltSemantics &ourSemantics,
-                 fltCategory ourCategory, bool negative) {
-  initialize(&ourSemantics);
-  category = ourCategory;
-  sign = negative;
-  if (category == fcNormal)
-    category = fcZero;
-  else if (ourCategory == fcNaN)
-    makeNaN();
-}
-
 APFloat::APFloat(const fltSemantics &ourSemantics, StringRef text) {
   initialize(&ourSemantics);
   convertFromString(text, rmNearestTiesToEven);
@@ -780,8 +837,6 @@ APFloat::significandParts() const
 integerPart *
 APFloat::significandParts()
 {
-  assert(category == fcNormal || category == fcNaN);
-
   if (partCount() > 1)
     return significand.parts;
   else
@@ -791,7 +846,6 @@ APFloat::significandParts()
 void
 APFloat::zeroSignificand()
 {
-  category = fcNormal;
   APInt::tcSet(significandParts(), 0, partCount());
 }
 
@@ -872,7 +926,21 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend)
   omsb = APInt::tcMSB(fullSignificand, newPartsCount) + 1;
   exponent += rhs.exponent;
 
+  // Assume the operands involved in the multiplication are single-precision
+  // FP, and the two multiplicants are:
+  //   *this = a23 . a22 ... a0 * 2^e1
+  //     rhs = b23 . b22 ... b0 * 2^e2
+  // the result of multiplication is:
+  //   *this = c47 c46 . c45 ... c0 * 2^(e1+e2)
+  // Note that there are two significant bits at the left-hand side of the 
+  // radix point. Move the radix point toward left by one bit, and adjust
+  // exponent accordingly.
+  exponent += 1;
+
   if (addend) {
+    // The intermediate result of the multiplication has "2 * precision" 
+    // signicant bit; adjust the addend to be consistent with mul result.
+    //
     Significand savedSignificand = significand;
     const fltSemantics *savedSemantics = semantics;
     fltSemantics extendedSemantics;
@@ -880,8 +948,9 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend)
     unsigned int extendedPrecision;
 
     /* Normalize our MSB.  */
-    extendedPrecision = precision + precision - 1;
+    extendedPrecision = 2 * precision;
     if (omsb != extendedPrecision) {
+      assert(extendedPrecision > omsb);
       APInt::tcShiftLeft(fullSignificand, newPartsCount,
                          extendedPrecision - omsb);
       exponent -= extendedPrecision - omsb;
@@ -912,8 +981,18 @@ APFloat::multiplySignificand(const APFloat &rhs, const APFloat *addend)
     omsb = APInt::tcMSB(fullSignificand, newPartsCount) + 1;
   }
 
-  exponent -= (precision - 1);
+  // Convert the result having "2 * precision" significant-bits back to the one
+  // having "precision" significant-bits. First, move the radix point from 
+  // poision "2*precision - 1" to "precision - 1". The exponent need to be
+  // adjusted by "2*precision - 1" - "precision - 1" = "precision".
+  exponent -= precision;
 
+  // In case MSB resides at the left-hand side of radix point, shift the
+  // mantissa right by some amount to make sure the MSB reside right before
+  // the radix point (i.e. "MSB . rest-significant-bits").
+  //
+  // Note that the result is not normalized when "omsb < precision". So, the
+  // caller needs to call APFloat::normalize() if normalized value is expected.
   if (omsb > precision) {
     unsigned int bits, significantParts;
     lostFraction lf;
@@ -1035,7 +1114,7 @@ lostFraction
 APFloat::shiftSignificandRight(unsigned int bits)
 {
   /* Our exponent should not overflow.  */
-  assert((exponent_t) (exponent + bits) >= exponent);
+  assert((ExponentType) (exponent + bits) >= exponent);
 
   exponent += bits;
 
@@ -1064,8 +1143,8 @@ APFloat::compareAbsoluteValue(const APFloat &rhs) const
   int compare;
 
   assert(semantics == rhs.semantics);
-  assert(category == fcNormal);
-  assert(rhs.category == fcNormal);
+  assert(isFiniteNonZero());
+  assert(rhs.isFiniteNonZero());
 
   compare = exponent - rhs.exponent;
 
@@ -1117,7 +1196,7 @@ APFloat::roundAwayFromZero(roundingMode rounding_mode,
                            unsigned int bit) const
 {
   /* NaNs and infinities should not have lost fractions.  */
-  assert(category == fcNormal || category == fcZero);
+  assert(isFiniteNonZero() || category == fcZero);
 
   /* Current callers never pass this so we don't handle it.  */
   assert(lost_fraction != lfExactlyZero);
@@ -1155,7 +1234,7 @@ APFloat::normalize(roundingMode rounding_mode,
   unsigned int omsb;                /* One, not zero, based MSB.  */
   int exponentChange;
 
-  if (category != fcNormal)
+  if (!isFiniteNonZero())
     return opOK;
 
   /* Before rounding normalize the exponent of fcNormal numbers.  */
@@ -1259,42 +1338,43 @@ APFloat::normalize(roundingMode rounding_mode,
 APFloat::opStatus
 APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract)
 {
-  switch (convolve(category, rhs.category)) {
+  switch (PackCategoriesIntoKey(category, rhs.category)) {
   default:
     llvm_unreachable(0);
 
-  case convolve(fcNaN, fcZero):
-  case convolve(fcNaN, fcNormal):
-  case convolve(fcNaN, fcInfinity):
-  case convolve(fcNaN, fcNaN):
-  case convolve(fcNormal, fcZero):
-  case convolve(fcInfinity, fcNormal):
-  case convolve(fcInfinity, fcZero):
+  case PackCategoriesIntoKey(fcNaN, fcZero):
+  case PackCategoriesIntoKey(fcNaN, fcNormal):
+  case PackCategoriesIntoKey(fcNaN, fcInfinity):
+  case PackCategoriesIntoKey(fcNaN, fcNaN):
+  case PackCategoriesIntoKey(fcNormal, fcZero):
+  case PackCategoriesIntoKey(fcInfinity, fcNormal):
+  case PackCategoriesIntoKey(fcInfinity, fcZero):
     return opOK;
 
-  case convolve(fcZero, fcNaN):
-  case convolve(fcNormal, fcNaN):
-  case convolve(fcInfinity, fcNaN):
+  case PackCategoriesIntoKey(fcZero, fcNaN):
+  case PackCategoriesIntoKey(fcNormal, fcNaN):
+  case PackCategoriesIntoKey(fcInfinity, fcNaN):
+    sign = false;
     category = fcNaN;
     copySignificand(rhs);
     return opOK;
 
-  case convolve(fcNormal, fcInfinity):
-  case convolve(fcZero, fcInfinity):
+  case PackCategoriesIntoKey(fcNormal, fcInfinity):
+  case PackCategoriesIntoKey(fcZero, fcInfinity):
     category = fcInfinity;
     sign = rhs.sign ^ subtract;
     return opOK;
 
-  case convolve(fcZero, fcNormal):
+  case PackCategoriesIntoKey(fcZero, fcNormal):
     assign(rhs);
     sign = rhs.sign ^ subtract;
     return opOK;
 
-  case convolve(fcZero, fcZero):
+  case PackCategoriesIntoKey(fcZero, fcZero):
     /* Sign depends on rounding mode; handled by caller.  */
     return opOK;
 
-  case convolve(fcInfinity, fcInfinity):
+  case PackCategoriesIntoKey(fcInfinity, fcInfinity):
     /* Differently signed infinities can only be validly
        subtracted.  */
     if (((sign ^ rhs.sign)!=0) != subtract) {
@@ -1304,7 +1384,7 @@ APFloat::addOrSubtractSpecials(const APFloat &rhs, bool subtract)
 
     return opOK;
 
-  case convolve(fcNormal, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcNormal):
     return opDivByZero;
   }
 }
@@ -1385,41 +1465,43 @@ APFloat::addOrSubtractSignificand(const APFloat &rhs, bool subtract)
 APFloat::opStatus
 APFloat::multiplySpecials(const APFloat &rhs)
 {
-  switch (convolve(category, rhs.category)) {
+  switch (PackCategoriesIntoKey(category, rhs.category)) {
   default:
     llvm_unreachable(0);
 
-  case convolve(fcNaN, fcZero):
-  case convolve(fcNaN, fcNormal):
-  case convolve(fcNaN, fcInfinity):
-  case convolve(fcNaN, fcNaN):
+  case PackCategoriesIntoKey(fcNaN, fcZero):
+  case PackCategoriesIntoKey(fcNaN, fcNormal):
+  case PackCategoriesIntoKey(fcNaN, fcInfinity):
+  case PackCategoriesIntoKey(fcNaN, fcNaN):
+    sign = false;
     return opOK;
 
-  case convolve(fcZero, fcNaN):
-  case convolve(fcNormal, fcNaN):
-  case convolve(fcInfinity, fcNaN):
+  case PackCategoriesIntoKey(fcZero, fcNaN):
+  case PackCategoriesIntoKey(fcNormal, fcNaN):
+  case PackCategoriesIntoKey(fcInfinity, fcNaN):
+    sign = false;
     category = fcNaN;
     copySignificand(rhs);
     return opOK;
 
-  case convolve(fcNormal, fcInfinity):
-  case convolve(fcInfinity, fcNormal):
-  case convolve(fcInfinity, fcInfinity):
+  case PackCategoriesIntoKey(fcNormal, fcInfinity):
+  case PackCategoriesIntoKey(fcInfinity, fcNormal):
+  case PackCategoriesIntoKey(fcInfinity, fcInfinity):
     category = fcInfinity;
     return opOK;
 
-  case convolve(fcZero, fcNormal):
-  case convolve(fcNormal, fcZero):
-  case convolve(fcZero, fcZero):
+  case PackCategoriesIntoKey(fcZero, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcZero):
+  case PackCategoriesIntoKey(fcZero, fcZero):
     category = fcZero;
     return opOK;
 
-  case convolve(fcZero, fcInfinity):
-  case convolve(fcInfinity, fcZero):
+  case PackCategoriesIntoKey(fcZero, fcInfinity):
+  case PackCategoriesIntoKey(fcInfinity, fcZero):
     makeNaN();
     return opInvalidOp;
 
-  case convolve(fcNormal, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcNormal):
     return opOK;
   }
 }
@@ -1427,41 +1509,40 @@ APFloat::multiplySpecials(const APFloat &rhs)
 APFloat::opStatus
 APFloat::divideSpecials(const APFloat &rhs)
 {
-  switch (convolve(category, rhs.category)) {
+  switch (PackCategoriesIntoKey(category, rhs.category)) {
   default:
     llvm_unreachable(0);
 
-  case convolve(fcNaN, fcZero):
-  case convolve(fcNaN, fcNormal):
-  case convolve(fcNaN, fcInfinity):
-  case convolve(fcNaN, fcNaN):
-  case convolve(fcInfinity, fcZero):
-  case convolve(fcInfinity, fcNormal):
-  case convolve(fcZero, fcInfinity):
-  case convolve(fcZero, fcNormal):
-    return opOK;
-
-  case convolve(fcZero, fcNaN):
-  case convolve(fcNormal, fcNaN):
-  case convolve(fcInfinity, fcNaN):
+  case PackCategoriesIntoKey(fcZero, fcNaN):
+  case PackCategoriesIntoKey(fcNormal, fcNaN):
+  case PackCategoriesIntoKey(fcInfinity, fcNaN):
     category = fcNaN;
     copySignificand(rhs);
+  case PackCategoriesIntoKey(fcNaN, fcZero):
+  case PackCategoriesIntoKey(fcNaN, fcNormal):
+  case PackCategoriesIntoKey(fcNaN, fcInfinity):
+  case PackCategoriesIntoKey(fcNaN, fcNaN):
+    sign = false;
+  case PackCategoriesIntoKey(fcInfinity, fcZero):
+  case PackCategoriesIntoKey(fcInfinity, fcNormal):
+  case PackCategoriesIntoKey(fcZero, fcInfinity):
+  case PackCategoriesIntoKey(fcZero, fcNormal):
     return opOK;
 
-  case convolve(fcNormal, fcInfinity):
+  case PackCategoriesIntoKey(fcNormal, fcInfinity):
     category = fcZero;
     return opOK;
 
-  case convolve(fcNormal, fcZero):
+  case PackCategoriesIntoKey(fcNormal, fcZero):
     category = fcInfinity;
     return opDivByZero;
 
-  case convolve(fcInfinity, fcInfinity):
-  case convolve(fcZero, fcZero):
+  case PackCategoriesIntoKey(fcInfinity, fcInfinity):
+  case PackCategoriesIntoKey(fcZero, fcZero):
     makeNaN();
     return opInvalidOp;
 
-  case convolve(fcNormal, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcNormal):
     return opOK;
   }
 }
@@ -1469,35 +1550,36 @@ APFloat::divideSpecials(const APFloat &rhs)
 APFloat::opStatus
 APFloat::modSpecials(const APFloat &rhs)
 {
-  switch (convolve(category, rhs.category)) {
+  switch (PackCategoriesIntoKey(category, rhs.category)) {
   default:
     llvm_unreachable(0);
 
-  case convolve(fcNaN, fcZero):
-  case convolve(fcNaN, fcNormal):
-  case convolve(fcNaN, fcInfinity):
-  case convolve(fcNaN, fcNaN):
-  case convolve(fcZero, fcInfinity):
-  case convolve(fcZero, fcNormal):
-  case convolve(fcNormal, fcInfinity):
+  case PackCategoriesIntoKey(fcNaN, fcZero):
+  case PackCategoriesIntoKey(fcNaN, fcNormal):
+  case PackCategoriesIntoKey(fcNaN, fcInfinity):
+  case PackCategoriesIntoKey(fcNaN, fcNaN):
+  case PackCategoriesIntoKey(fcZero, fcInfinity):
+  case PackCategoriesIntoKey(fcZero, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcInfinity):
     return opOK;
 
-  case convolve(fcZero, fcNaN):
-  case convolve(fcNormal, fcNaN):
-  case convolve(fcInfinity, fcNaN):
+  case PackCategoriesIntoKey(fcZero, fcNaN):
+  case PackCategoriesIntoKey(fcNormal, fcNaN):
+  case PackCategoriesIntoKey(fcInfinity, fcNaN):
+    sign = false;
     category = fcNaN;
     copySignificand(rhs);
     return opOK;
 
-  case convolve(fcNormal, fcZero):
-  case convolve(fcInfinity, fcZero):
-  case convolve(fcInfinity, fcNormal):
-  case convolve(fcInfinity, fcInfinity):
-  case convolve(fcZero, fcZero):
+  case PackCategoriesIntoKey(fcNormal, fcZero):
+  case PackCategoriesIntoKey(fcInfinity, fcZero):
+  case PackCategoriesIntoKey(fcInfinity, fcNormal):
+  case PackCategoriesIntoKey(fcInfinity, fcInfinity):
+  case PackCategoriesIntoKey(fcZero, fcZero):
     makeNaN();
     return opInvalidOp;
 
-  case convolve(fcNormal, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcNormal):
     return opOK;
   }
 }
@@ -1578,7 +1660,7 @@ APFloat::multiply(const APFloat &rhs, roundingMode rounding_mode)
   sign ^= rhs.sign;
   fs = multiplySpecials(rhs);
 
-  if (category == fcNormal) {
+  if (isFiniteNonZero()) {
     lostFraction lost_fraction = multiplySignificand(rhs, 0);
     fs = normalize(rounding_mode, lost_fraction);
     if (lost_fraction != lfExactlyZero)
@@ -1597,7 +1679,7 @@ APFloat::divide(const APFloat &rhs, roundingMode rounding_mode)
   sign ^= rhs.sign;
   fs = divideSpecials(rhs);
 
-  if (category == fcNormal) {
+  if (isFiniteNonZero()) {
     lostFraction lost_fraction = divideSignificand(rhs);
     fs = normalize(rounding_mode, lost_fraction);
     if (lost_fraction != lfExactlyZero)
@@ -1651,7 +1733,7 @@ APFloat::mod(const APFloat &rhs, roundingMode rounding_mode)
   opStatus fs;
   fs = modSpecials(rhs);
 
-  if (category == fcNormal && rhs.category == fcNormal) {
+  if (isFiniteNonZero() && rhs.isFiniteNonZero()) {
     APFloat V = *this;
     unsigned int origSign = sign;
 
@@ -1697,9 +1779,9 @@ APFloat::fusedMultiplyAdd(const APFloat &multiplicand,
 
   /* If and only if all arguments are normal do we need to do an
      extended-precision calculation.  */
-  if (category == fcNormal &&
-      multiplicand.category == fcNormal &&
-      addend.category == fcNormal) {
+  if (isFiniteNonZero() &&
+      multiplicand.isFiniteNonZero() &&
+      addend.isFiniteNonZero()) {
     lostFraction lost_fraction;
 
     lost_fraction = multiplySignificand(multiplicand, &addend);
@@ -1736,7 +1818,7 @@ APFloat::opStatus APFloat::roundToIntegral(roundingMode rounding_mode) {
   // If the exponent is large enough, we know that this value is already
   // integral, and the arithmetic below would potentially cause it to saturate
   // to +/-Inf.  Bail out early instead.
-  if (category == fcNormal && exponent+1 >= (int)semanticsPrecision(*semantics))
+  if (isFiniteNonZero() && exponent+1 >= (int)semanticsPrecision(*semantics))
     return opOK;
 
   // The algorithm here is quite simple: we add 2^(p-1), where p is the
@@ -1780,36 +1862,36 @@ APFloat::compare(const APFloat &rhs) const
 
   assert(semantics == rhs.semantics);
 
-  switch (convolve(category, rhs.category)) {
+  switch (PackCategoriesIntoKey(category, rhs.category)) {
   default:
     llvm_unreachable(0);
 
-  case convolve(fcNaN, fcZero):
-  case convolve(fcNaN, fcNormal):
-  case convolve(fcNaN, fcInfinity):
-  case convolve(fcNaN, fcNaN):
-  case convolve(fcZero, fcNaN):
-  case convolve(fcNormal, fcNaN):
-  case convolve(fcInfinity, fcNaN):
+  case PackCategoriesIntoKey(fcNaN, fcZero):
+  case PackCategoriesIntoKey(fcNaN, fcNormal):
+  case PackCategoriesIntoKey(fcNaN, fcInfinity):
+  case PackCategoriesIntoKey(fcNaN, fcNaN):
+  case PackCategoriesIntoKey(fcZero, fcNaN):
+  case PackCategoriesIntoKey(fcNormal, fcNaN):
+  case PackCategoriesIntoKey(fcInfinity, fcNaN):
     return cmpUnordered;
 
-  case convolve(fcInfinity, fcNormal):
-  case convolve(fcInfinity, fcZero):
-  case convolve(fcNormal, fcZero):
+  case PackCategoriesIntoKey(fcInfinity, fcNormal):
+  case PackCategoriesIntoKey(fcInfinity, fcZero):
+  case PackCategoriesIntoKey(fcNormal, fcZero):
     if (sign)
       return cmpLessThan;
     else
       return cmpGreaterThan;
 
-  case convolve(fcNormal, fcInfinity):
-  case convolve(fcZero, fcInfinity):
-  case convolve(fcZero, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcInfinity):
+  case PackCategoriesIntoKey(fcZero, fcInfinity):
+  case PackCategoriesIntoKey(fcZero, fcNormal):
     if (rhs.sign)
       return cmpGreaterThan;
     else
       return cmpLessThan;
 
-  case convolve(fcInfinity, fcInfinity):
+  case PackCategoriesIntoKey(fcInfinity, fcInfinity):
     if (sign == rhs.sign)
       return cmpEqual;
     else if (sign)
@@ -1817,10 +1899,10 @@ APFloat::compare(const APFloat &rhs) const
     else
       return cmpGreaterThan;
 
-  case convolve(fcZero, fcZero):
+  case PackCategoriesIntoKey(fcZero, fcZero):
     return cmpEqual;
 
-  case convolve(fcNormal, fcNormal):
+  case PackCategoriesIntoKey(fcNormal, fcNormal):
     break;
   }
 
@@ -1877,8 +1959,25 @@ APFloat::convert(const fltSemantics &toSemantics,
     X86SpecialNan = true;
   }
 
+  // If this is a truncation of a denormal number, and the target semantics
+  // has larger exponent range than the source semantics (this can happen
+  // when truncating from PowerPC double-double to double format), the
+  // right shift could lose result mantissa bits.  Adjust exponent instead
+  // of performing excessive shift.
+  if (shift < 0 && isFiniteNonZero()) {
+    int exponentChange = significandMSB() + 1 - fromSemantics.precision;
+    if (exponent + exponentChange < toSemantics.minExponent)
+      exponentChange = toSemantics.minExponent - exponent;
+    if (exponentChange < shift)
+      exponentChange = shift;
+    if (exponentChange < 0) {
+      shift -= exponentChange;
+      exponent += exponentChange;
+    }
+  }
+
   // If this is a truncation, perform the shift before we narrow the storage.
-  if (shift < 0 && (category==fcNormal || category==fcNaN))
+  if (shift < 0 && (isFiniteNonZero() || category==fcNaN))
     lostFraction = shiftRight(significandParts(), oldPartCount, -shift);
 
   // Fix the storage so it can hold to new value.
@@ -1887,14 +1986,14 @@ APFloat::convert(const fltSemantics &toSemantics,
     integerPart *newParts;
     newParts = new integerPart[newPartCount];
     APInt::tcSet(newParts, 0, newPartCount);
-    if (category==fcNormal || category==fcNaN)
+    if (isFiniteNonZero() || category==fcNaN)
       APInt::tcAssign(newParts, significandParts(), oldPartCount);
     freeSignificand();
     significand.parts = newParts;
   } else if (newPartCount == 1 && oldPartCount != 1) {
     // Switch to built-in storage for a single part.
     integerPart newPart = 0;
-    if (category==fcNormal || category==fcNaN)
+    if (isFiniteNonZero() || category==fcNaN)
       newPart = significandParts()[0];
     freeSignificand();
     significand.part = newPart;
@@ -1905,10 +2004,10 @@ APFloat::convert(const fltSemantics &toSemantics,
 
   // If this is an extension, perform the shift now that the storage is
   // available.
-  if (shift > 0 && (category==fcNormal || category==fcNaN))
+  if (shift > 0 && (isFiniteNonZero() || category==fcNaN))
     APInt::tcShiftLeft(significandParts(), newPartCount, shift);
 
-  if (category == fcNormal) {
+  if (isFiniteNonZero()) {
     fs = normalize(rounding_mode, lostFraction);
     *losesInfo = (fs != opOK);
   } else if (category == fcNaN) {
@@ -2204,56 +2303,46 @@ APFloat::opStatus
 APFloat::convertFromHexadecimalString(StringRef s, roundingMode rounding_mode)
 {
   lostFraction lost_fraction = lfExactlyZero;
-  integerPart *significand;
-  unsigned int bitPos, partsCount;
-  StringRef::iterator dot, firstSignificantDigit;
 
+  category = fcNormal;
   zeroSignificand();
   exponent = 0;
-  category = fcNormal;
 
-  significand = significandParts();
-  partsCount = partCount();
-  bitPos = partsCount * integerPartWidth;
+  integerPart *significand = significandParts();
+  unsigned partsCount = partCount();
+  unsigned bitPos = partsCount * integerPartWidth;
+  bool computedTrailingFraction = false;
 
-  /* Skip leading zeroes and any (hexa)decimal point.  */
+  // Skip leading zeroes and any (hexa)decimal point.
   StringRef::iterator begin = s.begin();
   StringRef::iterator end = s.end();
+  StringRef::iterator dot;
   StringRef::iterator p = skipLeadingZeroesAndAnyDot(begin, end, &dot);
-  firstSignificantDigit = p;
+  StringRef::iterator firstSignificantDigit = p;
 
-  for (; p != end;) {
+  while (p != end) {
     integerPart hex_value;
 
     if (*p == '.') {
       assert(dot == end && "String contains multiple dots");
       dot = p++;
-      if (p == end) {
-        break;
-      }
+      continue;
     }
 
     hex_value = hexDigitValue(*p);
-    if (hex_value == -1U) {
+    if (hex_value == -1U)
       break;
-    }
 
     p++;
 
-    if (p == end) {
-      break;
-    } else {
-      /* Store the number whilst 4-bit nibbles remain.  */
-      if (bitPos) {
-        bitPos -= 4;
-        hex_value <<= bitPos % integerPartWidth;
-        significand[bitPos / integerPartWidth] |= hex_value;
-      } else {
-        lost_fraction = trailingHexadecimalFraction(p, end, hex_value);
-        while (p != end && hexDigitValue(*p) != -1U)
-          p++;
-        break;
-      }
+    // Store the number while we have space.
+    if (bitPos) {
+      bitPos -= 4;
+      hex_value <<= bitPos % integerPartWidth;
+      significand[bitPos / integerPartWidth] |= hex_value;
+    } else if (!computedTrailingFraction) {
+      lost_fraction = trailingHexadecimalFraction(p, end, hex_value);
+      computedTrailingFraction = true;
     }
   }
 
@@ -2316,8 +2405,8 @@ APFloat::roundSignificandWithExponent(const integerPart *decSigParts,
     excessPrecision = calcSemantics.precision - semantics->precision;
     truncatedBits = excessPrecision;
 
-    APFloat decSig(calcSemantics, fcZero, sign);
-    APFloat pow5(calcSemantics, fcZero, false);
+    APFloat decSig = APFloat::getZero(calcSemantics, sign);
+    APFloat pow5(calcSemantics);
 
     sigStatus = decSig.convertFromUnsignedParts(decSigParts, sigPartCount,
                                                 rmNearestTiesToEven);
@@ -2402,7 +2491,14 @@ APFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode)
            42039/12655 < L < 28738/8651  [ numerator <= 65536 ]
   */
 
-  if (decDigitValue(*D.firstSigDigit) >= 10U) {
+  // Test if we have a zero number allowing for strings with no null terminators
+  // and zero decimals with non-zero exponents.
+  // 
+  // We computed firstSigDigit by ignoring all zeros and dots. Thus if
+  // D->firstSigDigit equals str.end(), every digit must be a zero and there can
+  // be at most one dot. On the other hand, if we have a zero with a non-zero
+  // exponent, then we know that D.firstSigDigit will be non-numeric.
+  if (D.firstSigDigit == str.end() || decDigitValue(*D.firstSigDigit) >= 10U) {
     category = fcZero;
     fs = opOK;
 
@@ -2419,6 +2515,7 @@ APFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode)
              (D.normalizedExponent + 1) * 28738 <=
                8651 * (semantics->minExponent - (int) semantics->precision)) {
     /* Underflow to zero and round.  */
+    category = fcNormal;
     zeroSignificand();
     fs = normalize(rounding_mode, lfLessThanHalf);
 
@@ -2485,11 +2582,40 @@ APFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode)
   return fs;
 }
 
+bool
+APFloat::convertFromStringSpecials(StringRef str) {
+  if (str.equals("inf") || str.equals("INFINITY")) {
+    makeInf(false);
+    return true;
+  }
+
+  if (str.equals("-inf") || str.equals("-INFINITY")) {
+    makeInf(true);
+    return true;
+  }
+
+  if (str.equals("nan") || str.equals("NaN")) {
+    makeNaN(false, false);
+    return true;
+  }
+
+  if (str.equals("-nan") || str.equals("-NaN")) {
+    makeNaN(false, true);
+    return true;
+  }
+
+  return false;
+}
+
 APFloat::opStatus
 APFloat::convertFromString(StringRef str, roundingMode rounding_mode)
 {
   assert(!str.empty() && "Invalid string length");
 
+  // Handle special cases.
+  if (convertFromStringSpecials(str))
+    return opOK;
+
   /* Handle a leading minus sign.  */
   StringRef::iterator p = str.begin();
   size_t slen = str.size();
@@ -2686,7 +2812,7 @@ APFloat::convertNormalToHexString(char *dst, unsigned int hexDigits,
 }
 
 hash_code llvm::hash_value(const APFloat &Arg) {
-  if (Arg.category != APFloat::fcNormal)
+  if (!Arg.isFiniteNonZero())
     return hash_combine((uint8_t)Arg.category,
                         // NaN has no sign, fix it at zero.
                         Arg.isNaN() ? (uint8_t)0 : (uint8_t)Arg.sign,
@@ -2717,7 +2843,7 @@ APFloat::convertF80LongDoubleAPFloatToAPInt() const
 
   uint64_t myexponent, mysignificand;
 
-  if (category==fcNormal) {
+  if (isFiniteNonZero()) {
     myexponent = exponent+16383; //bias
     mysignificand = significandParts()[0];
     if (myexponent==1 && !(mysignificand & 0x8000000000000000ULL))
@@ -2774,7 +2900,7 @@ APFloat::convertPPCDoubleDoubleAPFloatToAPInt() const
   // just set the second double to zero.  Otherwise, re-convert back to
   // the extended format and compute the difference.  This now should
   // convert exactly to double.
-  if (u.category == fcNormal && losesInfo) {
+  if (u.isFiniteNonZero() && losesInfo) {
     fs = u.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
     assert(fs == opOK && !losesInfo);
     (void)fs;
@@ -2800,7 +2926,7 @@ APFloat::convertQuadrupleAPFloatToAPInt() const
 
   uint64_t myexponent, mysignificand, mysignificand2;
 
-  if (category==fcNormal) {
+  if (isFiniteNonZero()) {
     myexponent = exponent+16383; //bias
     mysignificand = significandParts()[0];
     mysignificand2 = significandParts()[1];
@@ -2836,7 +2962,7 @@ APFloat::convertDoubleAPFloatToAPInt() const
 
   uint64_t myexponent, mysignificand;
 
-  if (category==fcNormal) {
+  if (isFiniteNonZero()) {
     myexponent = exponent+1023; //bias
     mysignificand = *significandParts();
     if (myexponent==1 && !(mysignificand & 0x10000000000000LL))
@@ -2866,7 +2992,7 @@ APFloat::convertFloatAPFloatToAPInt() const
 
   uint32_t myexponent, mysignificand;
 
-  if (category==fcNormal) {
+  if (isFiniteNonZero()) {
     myexponent = exponent+127; //bias
     mysignificand = (uint32_t)*significandParts();
     if (myexponent == 1 && !(mysignificand & 0x800000))
@@ -2895,7 +3021,7 @@ APFloat::convertHalfAPFloatToAPInt() const
 
   uint32_t myexponent, mysignificand;
 
-  if (category==fcNormal) {
+  if (isFiniteNonZero()) {
     myexponent = exponent+15; //bias
     mysignificand = (uint32_t)*significandParts();
     if (myexponent == 1 && !(mysignificand & 0x400))
@@ -3018,7 +3144,7 @@ APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api)
   (void)fs;
 
   // Unless we have a special case, add in second double.
-  if (category == fcNormal) {
+  if (isFiniteNonZero()) {
     APFloat v(IEEEdouble, APInt(64, i2));
     fs = v.convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo);
     assert(fs == opOK && !losesInfo);
@@ -3211,55 +3337,75 @@ APFloat::getAllOnesValue(unsigned BitWidth, bool isIEEE)
   }
 }
 
-APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) {
-  APFloat Val(Sem, fcNormal, Negative);
-
+/// Make this number the largest magnitude normal number in the given
+/// semantics.
+void APFloat::makeLargest(bool Negative) {
   // We want (in interchange format):
   //   sign = {Negative}
   //   exponent = 1..10
   //   significand = 1..1
+  category = fcNormal;
+  sign = Negative;
+  exponent = semantics->maxExponent;
 
-  Val.exponent = Sem.maxExponent; // unbiased
+  // Use memset to set all but the highest integerPart to all ones.
+  integerPart *significand = significandParts();
+  unsigned PartCount = partCount();
+  memset(significand, 0xFF, sizeof(integerPart)*(PartCount - 1));
 
-  // 1-initialize all bits....
-  Val.zeroSignificand();
-  integerPart *significand = Val.significandParts();
-  unsigned N = partCountForBits(Sem.precision);
-  for (unsigned i = 0; i != N; ++i)
-    significand[i] = ~((integerPart) 0);
+  // Set the high integerPart especially setting all unused top bits for
+  // internal consistency.
+  const unsigned NumUnusedHighBits =
+    PartCount*integerPartWidth - semantics->precision;
+  significand[PartCount - 1] = ~integerPart(0) >> NumUnusedHighBits;
+}
+
+/// Make this number the smallest magnitude denormal number in the given
+/// semantics.
+void APFloat::makeSmallest(bool Negative) {
+  // We want (in interchange format):
+  //   sign = {Negative}
+  //   exponent = 0..0
+  //   significand = 0..01
+  category = fcNormal;
+  sign = Negative;
+  exponent = semantics->minExponent;
+  APInt::tcSet(significandParts(), 1, partCount());
+}
 
-  // ...and then clear the top bits for internal consistency.
-  if (Sem.precision % integerPartWidth != 0)
-    significand[N-1] &=
-      (((integerPart) 1) << (Sem.precision % integerPartWidth)) - 1;
 
+APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) {
+  // We want (in interchange format):
+  //   sign = {Negative}
+  //   exponent = 1..10
+  //   significand = 1..1
+  APFloat Val(Sem, uninitialized);
+  Val.makeLargest(Negative);
   return Val;
 }
 
 APFloat APFloat::getSmallest(const fltSemantics &Sem, bool Negative) {
-  APFloat Val(Sem, fcNormal, Negative);
-
   // We want (in interchange format):
   //   sign = {Negative}
   //   exponent = 0..0
   //   significand = 0..01
-
-  Val.exponent = Sem.minExponent; // unbiased
-  Val.zeroSignificand();
-  Val.significandParts()[0] = 1;
+  APFloat Val(Sem, uninitialized);
+  Val.makeSmallest(Negative);
   return Val;
 }
 
 APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) {
-  APFloat Val(Sem, fcNormal, Negative);
+  APFloat Val(Sem, uninitialized);
 
   // We want (in interchange format):
   //   sign = {Negative}
   //   exponent = 0..0
   //   significand = 10..0
 
-  Val.exponent = Sem.minExponent;
+  Val.category = fcNormal;
   Val.zeroSignificand();
+  Val.sign = Negative;
+  Val.exponent = Sem.minExponent;
   Val.significandParts()[partCountForBits(Sem.precision)-1] |=
     (((integerPart) 1) << ((Sem.precision - 1) % integerPartWidth));
 
@@ -3400,11 +3546,14 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
   // Set FormatPrecision if zero.  We want to do this before we
   // truncate trailing zeros, as those are part of the precision.
   if (!FormatPrecision) {
-    // It's an interesting question whether to use the nominal
-    // precision or the active precision here for denormals.
+    // We use enough digits so the number can be round-tripped back to an
+    // APFloat. The formula comes from "How to Print Floating-Point Numbers
+    // Accurately" by Steele and White.
+    // FIXME: Using a formula based purely on the precision is conservative;
+    // we can print fewer digits depending on the actual value being printed.
 
-    // FormatPrecision = ceil(significandBits / lg_2(10))
-    FormatPrecision = (semantics->precision * 59 + 195) / 196;
+    // FormatPrecision = 2 + floor(significandBits / lg_2(10))
+    FormatPrecision = 2 + semantics->precision * 59 / 196;
   }
 
   // Ignore trailing binary zeros.
@@ -3564,7 +3713,7 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
 
 bool APFloat::getExactInverse(APFloat *inv) const {
   // Special floats and denormals have no exact inverse.
-  if (category != fcNormal)
+  if (!isFiniteNonZero())
     return false;
 
   // Check that the number is a power of two by making sure that only the
@@ -3579,10 +3728,10 @@ bool APFloat::getExactInverse(APFloat *inv) const {
 
   // Avoid multiplication with a denormal, it is not safe on all platforms and
   // may be slower than a normal division.
-  if (reciprocal.significandMSB() + 1 < reciprocal.semantics->precision)
+  if (reciprocal.isDenormal())
     return false;
 
-  assert(reciprocal.category == fcNormal &&
+  assert(reciprocal.isFiniteNonZero() &&
          reciprocal.significandLSB() == reciprocal.semantics->precision - 1);
 
   if (inv)
@@ -3590,3 +3739,148 @@ bool APFloat::getExactInverse(APFloat *inv) const {
 
   return true;
 }
+
+bool APFloat::isSignaling() const {
+  if (!isNaN())
+    return false;
+
+  // IEEE-754R 2008 6.2.1: A signaling NaN bit string should be encoded with the
+  // first bit of the trailing significand being 0.
+  return !APInt::tcExtractBit(significandParts(), semantics->precision - 2);
+}
+
+/// IEEE-754R 2008 5.3.1: nextUp/nextDown.
+///
+/// *NOTE* since nextDown(x) = -nextUp(-x), we only implement nextUp with
+/// appropriate sign switching before/after the computation.
+APFloat::opStatus APFloat::next(bool nextDown) {
+  // If we are performing nextDown, swap sign so we have -x.
+  if (nextDown)
+    changeSign();
+
+  // Compute nextUp(x)
+  opStatus result = opOK;
+
+  // Handle each float category separately.
+  switch (category) {
+  case fcInfinity:
+    // nextUp(+inf) = +inf
+    if (!isNegative())
+      break;
+    // nextUp(-inf) = -getLargest()
+    makeLargest(true);
+    break;
+  case fcNaN:
+    // IEEE-754R 2008 6.2 Par 2: nextUp(sNaN) = qNaN. Set Invalid flag.
+    // IEEE-754R 2008 6.2: nextUp(qNaN) = qNaN. Must be identity so we do not
+    //                     change the payload.
+    if (isSignaling()) {
+      result = opInvalidOp;
+      // For consistency, propogate the sign of the sNaN to the qNaN.
+      makeNaN(false, isNegative(), 0);
+    }
+    break;
+  case fcZero:
+    // nextUp(pm 0) = +getSmallest()
+    makeSmallest(false);
+    break;
+  case fcNormal:
+    // nextUp(-getSmallest()) = -0
+    if (isSmallest() && isNegative()) {
+      APInt::tcSet(significandParts(), 0, partCount());
+      category = fcZero;
+      exponent = 0;
+      break;
+    }
+
+    // nextUp(getLargest()) == INFINITY
+    if (isLargest() && !isNegative()) {
+      APInt::tcSet(significandParts(), 0, partCount());
+      category = fcInfinity;
+      exponent = semantics->maxExponent + 1;
+      break;
+    }
+
+    // nextUp(normal) == normal + inc.
+    if (isNegative()) {
+      // If we are negative, we need to decrement the significand.
+
+      // We only cross a binade boundary that requires adjusting the exponent
+      // if:
+      //   1. exponent != semantics->minExponent. This implies we are not in the
+      //   smallest binade or are dealing with denormals.
+      //   2. Our significand excluding the integral bit is all zeros.
+      bool WillCrossBinadeBoundary =
+        exponent != semantics->minExponent && isSignificandAllZeros();
+
+      // Decrement the significand.
+      //
+      // We always do this since:
+      //   1. If we are dealing with a non binade decrement, by definition we
+      //   just decrement the significand.
+      //   2. If we are dealing with a normal -> normal binade decrement, since
+      //   we have an explicit integral bit the fact that all bits but the
+      //   integral bit are zero implies that subtracting one will yield a
+      //   significand with 0 integral bit and 1 in all other spots. Thus we
+      //   must just adjust the exponent and set the integral bit to 1.
+      //   3. If we are dealing with a normal -> denormal binade decrement,
+      //   since we set the integral bit to 0 when we represent denormals, we
+      //   just decrement the significand.
+      integerPart *Parts = significandParts();
+      APInt::tcDecrement(Parts, partCount());
+
+      if (WillCrossBinadeBoundary) {
+        // Our result is a normal number. Do the following:
+        // 1. Set the integral bit to 1.
+        // 2. Decrement the exponent.
+        APInt::tcSetBit(Parts, semantics->precision - 1);
+        exponent--;
+      }
+    } else {
+      // If we are positive, we need to increment the significand.
+
+      // We only cross a binade boundary that requires adjusting the exponent if
+      // the input is not a denormal and all of said input's significand bits
+      // are set. If all of said conditions are true: clear the significand, set
+      // the integral bit to 1, and increment the exponent. If we have a
+      // denormal always increment since moving denormals and the numbers in the
+      // smallest normal binade have the same exponent in our representation.
+      bool WillCrossBinadeBoundary = !isDenormal() && isSignificandAllOnes();
+
+      if (WillCrossBinadeBoundary) {
+        integerPart *Parts = significandParts();
+        APInt::tcSet(Parts, 0, partCount());
+        APInt::tcSetBit(Parts, semantics->precision - 1);
+        assert(exponent != semantics->maxExponent &&
+               "We can not increment an exponent beyond the maxExponent allowed"
+               " by the given floating point semantics.");
+        exponent++;
+      } else {
+        incrementSignificand();
+      }
+    }
+    break;
+  }
+
+  // If we are performing nextDown, swap sign so we have -nextUp(-x)
+  if (nextDown)
+    changeSign();
+
+  return result;
+}
+
+void
+APFloat::makeInf(bool Negative) {
+  category = fcInfinity;
+  sign = Negative;
+  exponent = semantics->maxExponent + 1;
+  APInt::tcSet(significandParts(), 0, partCount());
+}
+
+void
+APFloat::makeZero(bool Negative) {
+  category = fcZero;
+  sign = Negative;
+  exponent = semantics->minExponent-1;
+  APInt::tcSet(significandParts(), 0, partCount());  
+}
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index e853475..89f96bd 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -692,14 +692,14 @@ unsigned APInt::countLeadingZerosSlowCase() const {
   unsigned i = getNumWords();
   integerPart MSW = pVal[i-1] & MSWMask;
   if (MSW)
-    return CountLeadingZeros_64(MSW) - (APINT_BITS_PER_WORD - BitsInMSW);
+    return llvm::countLeadingZeros(MSW) - (APINT_BITS_PER_WORD - BitsInMSW);
 
   unsigned Count = BitsInMSW;
   for (--i; i > 0u; --i) {
     if (pVal[i-1] == 0)
       Count += APINT_BITS_PER_WORD;
     else {
-      Count += CountLeadingZeros_64(pVal[i-1]);
+      Count += llvm::countLeadingZeros(pVal[i-1]);
       break;
     }
   }
@@ -735,13 +735,13 @@ unsigned APInt::countLeadingOnes() const {
 
 unsigned APInt::countTrailingZeros() const {
   if (isSingleWord())
-    return std::min(unsigned(CountTrailingZeros_64(VAL)), BitWidth);
+    return std::min(unsigned(llvm::countTrailingZeros(VAL)), BitWidth);
   unsigned Count = 0;
   unsigned i = 0;
   for (; i < getNumWords() && pVal[i] == 0; ++i)
     Count += APINT_BITS_PER_WORD;
   if (i < getNumWords())
-    Count += CountTrailingZeros_64(pVal[i]);
+    Count += llvm::countTrailingZeros(pVal[i]);
   return std::min(Count, BitWidth);
 }
 
@@ -1512,7 +1512,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
   // and v so that its high bits are shifted to the top of v's range without
   // overflow. Note that this can require an extra word in u so that u must
   // be of length m+n+1.
-  unsigned shift = CountLeadingZeros_32(v[n-1]);
+  unsigned shift = countLeadingZeros(v[n-1]);
   unsigned v_carry = 0;
   unsigned u_carry = 0;
   if (shift) {
@@ -2304,24 +2304,7 @@ namespace {
   static unsigned int
   partMSB(integerPart value)
   {
-    unsigned int n, msb;
-
-    if (value == 0)
-      return -1U;
-
-    n = integerPartWidth / 2;
-
-    msb = 0;
-    do {
-      if (value >> n) {
-        value >>= n;
-        msb += n;
-      }
-
-      n >>= 1;
-    } while (n);
-
-    return msb;
+    return findLastSet(value, ZB_Max);
   }
 
   /* Returns the bit number of the least significant set bit of a
@@ -2329,24 +2312,7 @@ namespace {
   static unsigned int
   partLSB(integerPart value)
   {
-    unsigned int n, lsb;
-
-    if (value == 0)
-      return -1U;
-
-    lsb = integerPartWidth - 1;
-    n = integerPartWidth / 2;
-
-    do {
-      if (value << n) {
-        value <<= n;
-        lsb -= n;
-      }
-
-      n >>= 1;
-    } while (n);
-
-    return lsb;
+    return findFirstSet(value, ZB_Max);
   }
 }
 
@@ -2888,6 +2854,20 @@ APInt::tcIncrement(integerPart *dst, unsigned int parts)
   return i == parts;
 }
 
+/* Decrement a bignum in-place, return the borrow flag.  */
+integerPart
+APInt::tcDecrement(integerPart *dst, unsigned int parts) {
+  for (unsigned int i = 0; i < parts; i++) {
+    // If the current word is non-zero, then the decrement has no effect on the
+    // higher-order words of the integer and no borrow can occur. Exit early.
+    if (dst[i]--)
+      return 0;
+  }
+  // If every word was zero, then there is a borrow.
+  return 1;
+}
+
+
 /* Set the least significant BITS bits of a bignum, clear the
    rest.  */
 void
diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp
index 3c4191b..6e7a541 100644
--- a/lib/Support/Allocator.cpp
+++ b/lib/Support/Allocator.cpp
@@ -26,6 +26,10 @@ BumpPtrAllocator::BumpPtrAllocator(size_t size, size_t threshold,
     : SlabSize(size), SizeThreshold(std::min(size, threshold)),
       Allocator(allocator), CurSlab(0), BytesAllocated(0) { }
 
+BumpPtrAllocator::BumpPtrAllocator(size_t size, size_t threshold)
+    : SlabSize(size), SizeThreshold(std::min(size, threshold)),
+      Allocator(DefaultSlabAllocator), CurSlab(0), BytesAllocated(0) { }
+
 BumpPtrAllocator::~BumpPtrAllocator() {
   DeallocateSlabs(CurSlab);
 }
@@ -167,9 +171,6 @@ void BumpPtrAllocator::PrintStats() const {
          << " (includes alignment, etc)\n";
 }
 
-MallocSlabAllocator BumpPtrAllocator::DefaultSlabAllocator =
-  MallocSlabAllocator();
-
 SlabAllocator::~SlabAllocator() { }
 
 MallocSlabAllocator::~MallocSlabAllocator() { }
diff --git a/lib/Support/BlockFrequency.cpp b/lib/Support/BlockFrequency.cpp
index 84a993e..00efe90 100644
--- a/lib/Support/BlockFrequency.cpp
+++ b/lib/Support/BlockFrequency.cpp
@@ -18,76 +18,94 @@
 
 using namespace llvm;
 
-namespace {
-
-/// mult96bit - Multiply FREQ by N and store result in W array.
-void mult96bit(uint64_t freq, uint32_t N, uint64_t W[2]) {
+/// Multiply FREQ by N and store result in W array.
+static void mult96bit(uint64_t freq, uint32_t N, uint32_t W[3]) {
   uint64_t u0 = freq & UINT32_MAX;
   uint64_t u1 = freq >> 32;
 
-  // Represent 96-bit value as w[2]:w[1]:w[0];
-  uint32_t w[3] = { 0, 0, 0 };
-
+  // Represent 96-bit value as W[2]:W[1]:W[0];
   uint64_t t = u0 * N;
   uint64_t k = t >> 32;
-  w[0] = t;
+  W[0] = t;
   t = u1 * N + k;
-  w[1] = t;
-  w[2] = t >> 32;
-
-  // W[1] - higher bits.
-  // W[0] - lower bits.
-  W[0] = w[0] + ((uint64_t) w[1] << 32);
-  W[1] = w[2];
+  W[1] = t;
+  W[2] = t >> 32;
 }
 
-
-/// div96bit - Divide 96-bit value stored in W array by D. Return 64-bit frequency.
-uint64_t div96bit(uint64_t W[2], uint32_t D) {
-  uint64_t y = W[0];
-  uint64_t x = W[1];
-  int i;
-
-  for (i = 1; i <= 64 && x; ++i) {
-    uint32_t t = (int)x >> 31;
-    x = (x << 1) | (y >> 63);
-    y = y << 1;
-    if ((x | t) >= D) {
-      x -= D;
-      ++y;
+/// Divide 96-bit value stored in W[2]:W[1]:W[0] by D. Since our word size is a
+/// 32 bit unsigned integer, we can use a short division algorithm.
+static uint64_t divrem96bit(uint32_t W[3], uint32_t D, uint32_t *Rout) {
+  // We assume that W[2] is non-zero since if W[2] is not then the user should
+  // just use hardware division.
+  assert(W[2] && "This routine assumes that W[2] is non-zero since if W[2] is "
+         "zero, the caller should just use 64/32 hardware.");
+  uint32_t Q[3] = { 0, 0, 0 };
+
+  // The generalized short division algorithm sets i to m + n - 1, where n is
+  // the number of words in the divisior and m is the number of words by which
+  // the divident exceeds the divisor (i.e. m + n == the length of the dividend
+  // in words). Due to our assumption that W[2] is non-zero, we know that the
+  // dividend is of length 3 implying since n is 1 that m = 2. Thus we set i to
+  // m + n - 1 = 2 + 1 - 1 = 2.
+  uint32_t R = 0;
+  for (int i = 2; i >= 0; --i) {
+    uint64_t PartialD = uint64_t(R) << 32 | W[i];
+    if (PartialD == 0) {
+      Q[i] = 0;
+      R = 0;
+    } else if (PartialD < D) {
+      Q[i] = 0;
+      R = uint32_t(PartialD);
+    } else if (PartialD == D) {
+      Q[i] = 1;
+      R = 0;
+    } else {
+      Q[i] = uint32_t(PartialD / D);
+      R = uint32_t(PartialD - (Q[i] * D));
     }
   }
 
-  return y << (64 - i + 1);
-}
+  // If Q[2] is non-zero, then we overflowed.
+  uint64_t Result;
+  if (Q[2]) {
+    Result = UINT64_MAX;
+    R = D;
+  } else {
+    // Form the final uint64_t result, avoiding endianness issues.
+    Result = uint64_t(Q[0]) | (uint64_t(Q[1]) << 32);
+  }
+
+  if (Rout)
+    *Rout = R;
 
+  return Result;
 }
 
+uint32_t BlockFrequency::scale(uint32_t N, uint32_t D) {
+  assert(D != 0 && "Division by zero");
 
-BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
-  uint32_t n = Prob.getNumerator();
-  uint32_t d = Prob.getDenominator();
-
-  assert(n <= d && "Probability must be less or equal to 1.");
-
-  // Calculate Frequency * n.
-  uint64_t mulLo = (Frequency & UINT32_MAX) * n;
-  uint64_t mulHi = (Frequency >> 32) * n;
-  uint64_t mulRes = (mulHi << 32) + mulLo;
-
-  // If there was overflow use 96-bit operations.
-  if (mulHi > UINT32_MAX || mulRes < mulLo) {
-    // 96-bit value represented as W[1]:W[0].
-    uint64_t W[2];
-
-    // Probability is less or equal to 1 which means that results must fit
-    // 64-bit.
-    mult96bit(Frequency, n, W);
-    Frequency = div96bit(W, d);
-    return *this;
+  // Calculate Frequency * N.
+  uint64_t MulLo = (Frequency & UINT32_MAX) * N;
+  uint64_t MulHi = (Frequency >> 32) * N;
+  uint64_t MulRes = (MulHi << 32) + MulLo;
+
+  // If the product fits in 64 bits, just use built-in division.
+  if (MulHi <= UINT32_MAX && MulRes >= MulLo) {
+    Frequency = MulRes / D;
+    return MulRes % D;
   }
 
-  Frequency = mulRes / d;
+  // Product overflowed, use 96-bit operations.
+  // 96-bit value represented as W[2]:W[1]:W[0].
+  uint32_t W[3];
+  uint32_t R;
+  mult96bit(Frequency, N, W);
+  Frequency = divrem96bit(W, D, &R);
+  return R;
+}
+
+BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
+  scale(Prob.getNumerator(), Prob.getDenominator());
   return *this;
 }
 
@@ -98,6 +116,17 @@ BlockFrequency::operator*(const BranchProbability &Prob) const {
   return Freq;
 }
 
+BlockFrequency &BlockFrequency::operator/=(const BranchProbability &Prob) {
+  scale(Prob.getDenominator(), Prob.getNumerator());
+  return *this;
+}
+
+BlockFrequency BlockFrequency::operator/(const BranchProbability &Prob) const {
+  BlockFrequency Freq(Frequency);
+  Freq /= Prob;
+  return Freq;
+}
+
 BlockFrequency &BlockFrequency::operator+=(const BlockFrequency &Freq) {
   uint64_t Before = Freq.Frequency;
   Frequency += Freq.Frequency;
@@ -116,8 +145,21 @@ BlockFrequency::operator+(const BlockFrequency &Prob) const {
   return Freq;
 }
 
+uint32_t BlockFrequency::scale(const BranchProbability &Prob) {
+  return scale(Prob.getNumerator(), Prob.getDenominator());
+}
+
 void BlockFrequency::print(raw_ostream &OS) const {
-  OS << Frequency;
+  // Convert fixed-point number to decimal.
+  OS << Frequency / getEntryFrequency() << ".";
+  uint64_t Rem = Frequency % getEntryFrequency();
+  uint64_t Eps = 1;
+  do {
+    Rem *= 10;
+    Eps *= 10;
+    OS << Rem / getEntryFrequency();
+    Rem = Rem % getEntryFrequency();
+  } while (Rem >= Eps/2);
 }
 
 namespace llvm {
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 01565c5..3aecf3f 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -35,6 +35,7 @@ add_llvm_library(LLVMSupport
   ManagedStatic.cpp
   MemoryBuffer.cpp
   MemoryObject.cpp
+  MD5.cpp
   PluginLoader.cpp
   PrettyStackTrace.cpp
   Regex.cpp
@@ -47,11 +48,13 @@ add_llvm_library(LLVMSupport
   StringMap.cpp
   StringPool.cpp
   StringRef.cpp
+  StringRefMemoryObject.cpp
   SystemUtils.cpp
   Timer.cpp
   ToolOutputFile.cpp
   Triple.cpp
   Twine.cpp
+  Unicode.cpp
   YAMLParser.cpp
   YAMLTraits.cpp
   raw_os_ostream.cpp
@@ -72,7 +75,6 @@ add_llvm_library(LLVMSupport
   Memory.cpp
   Mutex.cpp
   Path.cpp
-  PathV2.cpp
   Process.cpp
   Program.cpp
   RWMutex.cpp
@@ -89,7 +91,6 @@ add_llvm_library(LLVMSupport
   Unix/Memory.inc
   Unix/Mutex.inc
   Unix/Path.inc
-  Unix/PathV2.inc
   Unix/Process.inc
   Unix/Program.inc
   Unix/RWMutex.inc
@@ -103,7 +104,6 @@ add_llvm_library(LLVMSupport
   Windows/Memory.inc
   Windows/Mutex.inc
   Windows/Path.inc
-  Windows/PathV2.inc
   Windows/Process.inc
   Windows/Program.inc
   Windows/RWMutex.inc
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 18d3db5..44a88d8 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -17,12 +17,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Config/config.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Host.h"
@@ -58,6 +60,7 @@ TEMPLATE_INSTANTIATION(class opt<char>);
 TEMPLATE_INSTANTIATION(class opt<bool>);
 } } // end namespace llvm::cl
 
+// Pin the vtables to this file.
 void GenericOptionValue::anchor() {}
 void OptionValue<boolOrDefault>::anchor() {}
 void OptionValue<std::string>::anchor() {}
@@ -72,6 +75,7 @@ void parser<double>::anchor() {}
 void parser<float>::anchor() {}
 void parser<std::string>::anchor() {}
 void parser<char>::anchor() {}
+void StringSaver::anchor() {}
 
 //===----------------------------------------------------------------------===//
 
@@ -435,39 +439,248 @@ static bool EatsUnboundedNumberOfValues(const Option *O) {
          O->getNumOccurrencesFlag() == cl::OneOrMore;
 }
 
-/// ParseCStringVector - Break INPUT up wherever one or more
-/// whitespace characters are found, and store the resulting tokens in
-/// OUTPUT. The tokens stored in OUTPUT are dynamically allocated
-/// using strdup(), so it is the caller's responsibility to free()
-/// them later.
+static bool isWhitespace(char C) {
+  return strchr(" \t\n\r\f\v", C);
+}
+
+static bool isQuote(char C) {
+  return C == '\"' || C == '\'';
+}
+
+static bool isGNUSpecial(char C) {
+  return strchr("\\\"\' ", C);
+}
+
+void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver,
+                                SmallVectorImpl<const char *> &NewArgv) {
+  SmallString<128> Token;
+  for (size_t I = 0, E = Src.size(); I != E; ++I) {
+    // Consume runs of whitespace.
+    if (Token.empty()) {
+      while (I != E && isWhitespace(Src[I]))
+        ++I;
+      if (I == E) break;
+    }
+
+    // Backslashes can escape backslashes, spaces, and other quotes.  Otherwise
+    // they are literal.  This makes it much easier to read Windows file paths.
+    if (I + 1 < E && Src[I] == '\\' && isGNUSpecial(Src[I + 1])) {
+      ++I;  // Skip the escape.
+      Token.push_back(Src[I]);
+      continue;
+    }
+
+    // Consume a quoted string.
+    if (isQuote(Src[I])) {
+      char Quote = Src[I++];
+      while (I != E && Src[I] != Quote) {
+        // Backslashes are literal, unless they escape a special character.
+        if (Src[I] == '\\' && I + 1 != E && isGNUSpecial(Src[I + 1]))
+          ++I;
+        Token.push_back(Src[I]);
+        ++I;
+      }
+      if (I == E) break;
+      continue;
+    }
+
+    // End the token if this is whitespace.
+    if (isWhitespace(Src[I])) {
+      if (!Token.empty())
+        NewArgv.push_back(Saver.SaveString(Token.c_str()));
+      Token.clear();
+      continue;
+    }
+
+    // This is a normal character.  Append it.
+    Token.push_back(Src[I]);
+  }
+
+  // Append the last token after hitting EOF with no whitespace.
+  if (!Token.empty())
+    NewArgv.push_back(Saver.SaveString(Token.c_str()));
+}
+
+/// Backslashes are interpreted in a rather complicated way in the Windows-style
+/// command line, because backslashes are used both to separate path and to
+/// escape double quote. This method consumes runs of backslashes as well as the
+/// following double quote if it's escaped.
+///
+///  * If an even number of backslashes is followed by a double quote, one
+///    backslash is output for every pair of backslashes, and the last double
+///    quote remains unconsumed. The double quote will later be interpreted as
+///    the start or end of a quoted string in the main loop outside of this
+///    function.
+///
+///  * If an odd number of backslashes is followed by a double quote, one
+///    backslash is output for every pair of backslashes, and a double quote is
+///    output for the last pair of backslash-double quote. The double quote is
+///    consumed in this case.
 ///
-static void ParseCStringVector(std::vector<char *> &OutputVector,
-                               const char *Input) {
-  // Characters which will be treated as token separators:
-  StringRef Delims = " \v\f\t\r\n";
-
-  StringRef WorkStr(Input);
-  while (!WorkStr.empty()) {
-    // If the first character is a delimiter, strip them off.
-    if (Delims.find(WorkStr[0]) != StringRef::npos) {
-      size_t Pos = WorkStr.find_first_not_of(Delims);
-      if (Pos == StringRef::npos) Pos = WorkStr.size();
-      WorkStr = WorkStr.substr(Pos);
+///  * Otherwise, backslashes are interpreted literally.
+static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
+  size_t E = Src.size();
+  int BackslashCount = 0;
+  // Skip the backslashes.
+  do {
+    ++I;
+    ++BackslashCount;
+  } while (I != E && Src[I] == '\\');
+
+  bool FollowedByDoubleQuote = (I != E && Src[I] == '"');
+  if (FollowedByDoubleQuote) {
+    Token.append(BackslashCount / 2, '\\');
+    if (BackslashCount % 2 == 0)
+      return I - 1;
+    Token.push_back('"');
+    return I;
+  }
+  Token.append(BackslashCount, '\\');
+  return I - 1;
+}
+
+void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
+                                    SmallVectorImpl<const char *> &NewArgv) {
+  SmallString<128> Token;
+
+  // This is a small state machine to consume characters until it reaches the
+  // end of the source string.
+  enum { INIT, UNQUOTED, QUOTED } State = INIT;
+  for (size_t I = 0, E = Src.size(); I != E; ++I) {
+    // INIT state indicates that the current input index is at the start of
+    // the string or between tokens.
+    if (State == INIT) {
+      if (isWhitespace(Src[I]))
+        continue;
+      if (Src[I] == '"') {
+        State = QUOTED;
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+        State = UNQUOTED;
+        continue;
+      }
+      Token.push_back(Src[I]);
+      State = UNQUOTED;
       continue;
     }
 
-    // Find position of first delimiter.
-    size_t Pos = WorkStr.find_first_of(Delims);
-    if (Pos == StringRef::npos) Pos = WorkStr.size();
+    // UNQUOTED state means that it's reading a token not quoted by double
+    // quotes.
+    if (State == UNQUOTED) {
+      // Whitespace means the end of the token.
+      if (isWhitespace(Src[I])) {
+        NewArgv.push_back(Saver.SaveString(Token.c_str()));
+        Token.clear();
+        State = INIT;
+        continue;
+      }
+      if (Src[I] == '"') {
+        State = QUOTED;
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+        continue;
+      }
+      Token.push_back(Src[I]);
+      continue;
+    }
 
-    // Everything from 0 to Pos is the next word to copy.
-    char *NewStr = (char*)malloc(Pos+1);
-    memcpy(NewStr, WorkStr.data(), Pos);
-    NewStr[Pos] = 0;
-    OutputVector.push_back(NewStr);
+    // QUOTED state means that it's reading a token quoted by double quotes.
+    if (State == QUOTED) {
+      if (Src[I] == '"') {
+        State = UNQUOTED;
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+        continue;
+      }
+      Token.push_back(Src[I]);
+    }
+  }
+  // Append the last token after hitting EOF with no whitespace.
+  if (!Token.empty())
+    NewArgv.push_back(Saver.SaveString(Token.c_str()));
+}
 
-    WorkStr = WorkStr.substr(Pos);
+static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
+                               TokenizerCallback Tokenizer,
+                               SmallVectorImpl<const char *> &NewArgv) {
+  OwningPtr<MemoryBuffer> MemBuf;
+  if (MemoryBuffer::getFile(FName, MemBuf))
+    return false;
+  StringRef Str(MemBuf->getBufferStart(), MemBuf->getBufferSize());
+
+  // If we have a UTF-16 byte order mark, convert to UTF-8 for parsing.
+  ArrayRef<char> BufRef(MemBuf->getBufferStart(), MemBuf->getBufferEnd());
+  std::string UTF8Buf;
+  if (hasUTF16ByteOrderMark(BufRef)) {
+    if (!convertUTF16ToUTF8String(BufRef, UTF8Buf))
+      return false;
+    Str = StringRef(UTF8Buf);
   }
+
+  // Tokenize the contents into NewArgv.
+  Tokenizer(Str, Saver, NewArgv);
+
+  return true;
+}
+
+/// \brief Expand response files on a command line recursively using the given
+/// StringSaver and tokenization strategy.
+bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer,
+                             SmallVectorImpl<const char *> &Argv) {
+  unsigned RspFiles = 0;
+  bool AllExpanded = false;
+
+  // Don't cache Argv.size() because it can change.
+  for (unsigned I = 0; I != Argv.size(); ) {
+    const char *Arg = Argv[I];
+    if (Arg[0] != '@') {
+      ++I;
+      continue;
+    }
+
+    // If we have too many response files, leave some unexpanded.  This avoids
+    // crashing on self-referential response files.
+    if (RspFiles++ > 20)
+      return false;
+
+    // Replace this response file argument with the tokenization of its
+    // contents.  Nested response files are expanded in subsequent iterations.
+    // FIXME: If a nested response file uses a relative path, is it relative to
+    // the cwd of the process or the response file?
+    SmallVector<const char *, 0> ExpandedArgv;
+    if (!ExpandResponseFile(Arg + 1, Saver, Tokenizer, ExpandedArgv)) {
+      AllExpanded = false;
+      continue;
+    }
+    Argv.erase(Argv.begin() + I);
+    Argv.insert(Argv.begin() + I, ExpandedArgv.begin(), ExpandedArgv.end());
+  }
+  return AllExpanded;
+}
+
+namespace {
+  class StrDupSaver : public StringSaver {
+    std::vector<char*> Dups;
+  public:
+    ~StrDupSaver() {
+      for (std::vector<char *>::iterator I = Dups.begin(), E = Dups.end();
+           I != E; ++I) {
+        char *Dup = *I;
+        free(Dup);
+      }
+    }
+    const char *SaveString(const char *Str) LLVM_OVERRIDE {
+      char *Dup = strdup(Str);
+      Dups.push_back(Dup);
+      return Dup;
+    }
+  };
 }
 
 /// ParseEnvironmentOptions - An alternative entry point to the
@@ -488,56 +701,15 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
 
   // Get program's "name", which we wouldn't know without the caller
   // telling us.
-  std::vector<char*> newArgv;
-  newArgv.push_back(strdup(progName));
+  SmallVector<const char *, 20> newArgv;
+  StrDupSaver Saver;
+  newArgv.push_back(Saver.SaveString(progName));
 
   // Parse the value of the environment variable into a "command line"
   // and hand it off to ParseCommandLineOptions().
-  ParseCStringVector(newArgv, envValue);
+  TokenizeGNUCommandLine(envValue, Saver, newArgv);
   int newArgc = static_cast<int>(newArgv.size());
   ParseCommandLineOptions(newArgc, &newArgv[0], Overview);
-
-  // Free all the strdup()ed strings.
-  for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
-       i != e; ++i)
-    free(*i);
-}
-
-
-/// ExpandResponseFiles - Copy the contents of argv into newArgv,
-/// substituting the contents of the response files for the arguments
-/// of type @file.
-static void ExpandResponseFiles(unsigned argc, const char*const* argv,
-                                std::vector<char*>& newArgv) {
-  for (unsigned i = 1; i != argc; ++i) {
-    const char *arg = argv[i];
-
-    if (arg[0] == '@') {
-      sys::PathWithStatus respFile(++arg);
-
-      // Check that the response file is not empty (mmap'ing empty
-      // files can be problematic).
-      const sys::FileStatus *FileStat = respFile.getFileStatus();
-      if (FileStat && FileStat->getSize() != 0) {
-
-        // If we could open the file, parse its contents, otherwise
-        // pass the @file option verbatim.
-
-        // TODO: we should also support recursive loading of response files,
-        // since this is how gcc behaves. (From their man page: "The file may
-        // itself contain additional @file options; any such options will be
-        // processed recursively.")
-
-        // Mmap the response file into memory.
-        OwningPtr<MemoryBuffer> respFilePtr;
-        if (!MemoryBuffer::getFile(respFile.c_str(), respFilePtr)) {
-          ParseCStringVector(newArgv, respFilePtr->getBufferStart());
-          continue;
-        }
-      }
-    }
-    newArgv.push_back(strdup(arg));
-  }
 }
 
 void cl::ParseCommandLineOptions(int argc, const char * const *argv,
@@ -552,9 +724,11 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
          "No options specified!");
 
   // Expand response files.
-  std::vector<char*> newArgv;
-  newArgv.push_back(strdup(argv[0]));
-  ExpandResponseFiles(argc, argv, newArgv);
+  SmallVector<const char *, 20> newArgv;
+  for (int i = 0; i != argc; ++i)
+    newArgv.push_back(argv[i]);
+  StrDupSaver Saver;
+  ExpandResponseFiles(Saver, TokenizeGNUCommandLine, newArgv);
   argv = &newArgv[0];
   argc = static_cast<int>(newArgv.size());
 
@@ -848,12 +1022,6 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
   PositionalOpts.clear();
   MoreHelp->clear();
 
-  // Free the memory allocated by ExpandResponseFiles.
-  // Free all the strdup()ed strings.
-  for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
-       i != e; ++i)
-    free(*i);
-
   // If we had an error processing our arguments, don't let the program execute
   if (ErrorParsing) exit(1);
 }
@@ -913,11 +1081,20 @@ size_t alias::getOptionWidth() const {
   return std::strlen(ArgStr)+6;
 }
 
+static void printHelpStr(StringRef HelpStr, size_t Indent,
+                         size_t FirstLineIndentedBy) {
+  std::pair<StringRef, StringRef> Split = HelpStr.split('\n');
+  outs().indent(Indent - FirstLineIndentedBy) << " - " << Split.first << "\n";
+  while (!Split.second.empty()) {
+    Split = Split.second.split('\n');
+    outs().indent(Indent) << Split.first << "\n";
+  }
+}
+
 // Print out the option for the alias.
 void alias::printOptionInfo(size_t GlobalWidth) const {
-  size_t L = std::strlen(ArgStr);
   outs() << "  -" << ArgStr;
-  outs().indent(GlobalWidth-L-6) << " - " << HelpStr << "\n";
+  printHelpStr(HelpStr, GlobalWidth, std::strlen(ArgStr) + 6);
 }
 
 //===----------------------------------------------------------------------===//
@@ -946,7 +1123,7 @@ void basic_parser_impl::printOptionInfo(const Option &O,
   if (const char *ValName = getValueName())
     outs() << "=<" << getValueStr(O, ValName) << '>';
 
-  outs().indent(GlobalWidth-getOptionWidth(O)) << " - " << O.HelpStr << '\n';
+  printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
 }
 
 void basic_parser_impl::printOptionName(const Option &O,
@@ -1087,9 +1264,8 @@ size_t generic_parser_base::getOptionWidth(const Option &O) const {
 void generic_parser_base::printOptionInfo(const Option &O,
                                           size_t GlobalWidth) const {
   if (O.hasArgStr()) {
-    size_t L = std::strlen(O.ArgStr);
     outs() << "  -" << O.ArgStr;
-    outs().indent(GlobalWidth-L-6) << " - " << O.HelpStr << '\n';
+    printHelpStr(O.HelpStr, GlobalWidth, std::strlen(O.ArgStr) + 6);
 
     for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
       size_t NumSpaces = GlobalWidth-strlen(getOption(i))-8;
@@ -1100,9 +1276,9 @@ void generic_parser_base::printOptionInfo(const Option &O,
     if (O.HelpStr[0])
       outs() << "  " << O.HelpStr << '\n';
     for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
-      size_t L = std::strlen(getOption(i));
-      outs() << "    -" << getOption(i);
-      outs().indent(GlobalWidth-L-8) << " - " << getDescription(i) << '\n';
+      const char *Option = getOption(i);
+      outs() << "    -" << Option;
+      printHelpStr(getDescription(i), GlobalWidth, std::strlen(Option) + 8);
     }
   }
 }
diff --git a/lib/Support/Compression.cpp b/lib/Support/Compression.cpp
index fd8a874..b5ddb70 100644
--- a/lib/Support/Compression.cpp
+++ b/lib/Support/Compression.cpp
@@ -81,6 +81,10 @@ zlib::Status zlib::uncompress(StringRef InputBuffer,
   return Res;
 }
 
+uint32_t zlib::crc32(StringRef Buffer) {
+  return ::crc32(0, (const Bytef *)Buffer.data(), Buffer.size());
+}
+
 #else
 bool zlib::isAvailable() { return false; }
 zlib::Status zlib::compress(StringRef InputBuffer,
@@ -93,5 +97,8 @@ zlib::Status zlib::uncompress(StringRef InputBuffer,
                               size_t UncompressedSize) {
   return zlib::StatusUnsupported;
 }
+uint32_t zlib::crc32(StringRef Buffer) {
+  llvm_unreachable("zlib::crc32 is unavailable");
+}
 #endif
 
diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp
index 5c58950..265b6e9 100644
--- a/lib/Support/ConstantRange.cpp
+++ b/lib/Support/ConstantRange.cpp
@@ -38,13 +38,14 @@ ConstantRange::ConstantRange(uint32_t BitWidth, bool Full) {
 
 /// Initialize a range to hold the single specified value.
 ///
-ConstantRange::ConstantRange(const APInt &V) : Lower(V), Upper(V + 1) {}
+ConstantRange::ConstantRange(APIntMoveTy V)
+    : Lower(llvm_move(V)), Upper(Lower + 1) {}
 
-ConstantRange::ConstantRange(const APInt &L, const APInt &U) :
-  Lower(L), Upper(U) {
-  assert(L.getBitWidth() == U.getBitWidth() &&
+ConstantRange::ConstantRange(APIntMoveTy L, APIntMoveTy U)
+    : Lower(llvm_move(L)), Upper(llvm_move(U)) {
+  assert(Lower.getBitWidth() == Upper.getBitWidth() &&
          "ConstantRange with unequal bit widths");
-  assert((L != U || (L.isMaxValue() || L.isMinValue())) &&
+  assert((Lower != Upper || (Lower.isMaxValue() || Lower.isMinValue())) &&
          "Lower == Upper, but they aren't min or max value!");
 }
 
@@ -143,9 +144,6 @@ bool ConstantRange::isSignWrappedSet() const {
 /// getSetSize - Return the number of elements in this set.
 ///
 APInt ConstantRange::getSetSize() const {
-  if (isEmptySet())
-    return APInt(getBitWidth()+1, 0);
-
   if (isFullSet()) {
     APInt Size(getBitWidth()+1, 0);
     Size.setBit(getBitWidth());
@@ -432,7 +430,7 @@ ConstantRange ConstantRange::zeroExtend(uint32_t DstTySize) const {
     APInt LowerExt(DstTySize, 0);
     if (!Upper) // special case: [X, 0) -- not really wrapping around
       LowerExt = Lower.zext(DstTySize);
-    return ConstantRange(LowerExt, APInt(DstTySize, 1).shl(SrcTySize));
+    return ConstantRange(LowerExt, APInt::getOneBitSet(DstTySize, SrcTySize));
   }
 
   return ConstantRange(Lower.zext(DstTySize), Upper.zext(DstTySize));
@@ -447,6 +445,11 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const {
 
   unsigned SrcTySize = getBitWidth();
   assert(SrcTySize < DstTySize && "Not a value extension");
+
+  // special case: [X, INT_MIN) -- not really wrapping around
+  if (Upper.isMinSignedValue())
+    return ConstantRange(Lower.sext(DstTySize), Upper.zext(DstTySize));
+
   if (isFullSet() || isSignWrappedSet()) {
     return ConstantRange(APInt::getHighBitsSet(DstTySize,DstTySize-SrcTySize+1),
                          APInt::getLowBitsSet(DstTySize, SrcTySize-1) + 1);
diff --git a/lib/Support/ConvertUTFWrapper.cpp b/lib/Support/ConvertUTFWrapper.cpp
index 458fbb0..e45335d 100644
--- a/lib/Support/ConvertUTFWrapper.cpp
+++ b/lib/Support/ConvertUTFWrapper.cpp
@@ -8,6 +8,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <string>
+#include <vector>
 
 namespace llvm {
 
@@ -72,5 +75,57 @@ bool ConvertCodePointToUTF8(unsigned Source, char *&ResultPtr) {
   return true;
 }
 
+bool hasUTF16ByteOrderMark(ArrayRef<char> S) {
+  return (S.size() >= 2 &&
+          ((S[0] == '\xff' && S[1] == '\xfe') ||
+           (S[0] == '\xfe' && S[1] == '\xff')));
+}
+
+bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
+  assert(Out.empty());
+
+  // Error out on an uneven byte count.
+  if (SrcBytes.size() % 2)
+    return false;
+
+  // Avoid OOB by returning early on empty input.
+  if (SrcBytes.empty())
+    return true;
+
+  const UTF16 *Src = reinterpret_cast<const UTF16 *>(SrcBytes.begin());
+  const UTF16 *SrcEnd = reinterpret_cast<const UTF16 *>(SrcBytes.end());
+
+  // Byteswap if necessary.
+  std::vector<UTF16> ByteSwapped;
+  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
+    ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);
+    for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I)
+      ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]);
+    Src = &ByteSwapped[0];
+    SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;
+  }
+
+  // Skip the BOM for conversion.
+  if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_NATIVE)
+    Src++;
+
+  // Just allocate enough space up front.  We'll shrink it later.
+  Out.resize(SrcBytes.size() * UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  UTF8 *Dst = reinterpret_cast<UTF8 *>(&Out[0]);
+  UTF8 *DstEnd = Dst + Out.size();
+
+  ConversionResult CR =
+      ConvertUTF16toUTF8(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
+  assert(CR != targetExhausted);
+
+  if (CR != conversionOK) {
+    Out.clear();
+    return false;
+  }
+
+  Out.resize(reinterpret_cast<char *>(Dst) - &Out[0]);
+  return true;
+}
+
 } // end namespace llvm
 
diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp
index 182c362..92c370d 100644
--- a/lib/Support/CrashRecoveryContext.cpp
+++ b/lib/Support/CrashRecoveryContext.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Mutex.h"
 #include "llvm/Support/ThreadLocal.h"
 #include <cstdio>
@@ -21,27 +22,34 @@ namespace {
 
 struct CrashRecoveryContextImpl;
 
-static sys::ThreadLocal<const CrashRecoveryContextImpl> CurrentContext;
+static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextImpl> > CurrentContext;
 
 struct CrashRecoveryContextImpl {
   CrashRecoveryContext *CRC;
   std::string Backtrace;
   ::jmp_buf JumpBuffer;
   volatile unsigned Failed : 1;
+  unsigned SwitchedThread : 1;
 
 public:
   CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC),
-                                                        Failed(false) {
-    CurrentContext.set(this);
+                                                        Failed(false),
+                                                        SwitchedThread(false) {
+    CurrentContext->set(this);
   }
   ~CrashRecoveryContextImpl() {
-    CurrentContext.erase();
+    if (!SwitchedThread)
+      CurrentContext->erase();
   }
 
+  /// \brief Called when the separate crash-recovery thread was finished, to
+  /// indicate that we don't need to clear the thread-local CurrentContext.
+  void setSwitchedThread() { SwitchedThread = true; }
+
   void HandleCrash() {
     // Eliminate the current context entry, to avoid re-entering in case the
     // cleanup code crashes.
-    CurrentContext.erase();
+    CurrentContext->erase();
 
     assert(!Failed && "Crash recovery context already failed!");
     Failed = true;
@@ -55,10 +63,10 @@ public:
 
 }
 
-static sys::Mutex gCrashRecoveryContexMutex;
+static ManagedStatic<sys::Mutex> gCrashRecoveryContextMutex;
 static bool gCrashRecoveryEnabled = false;
 
-static sys::ThreadLocal<const CrashRecoveryContextCleanup> 
+static ManagedStatic<sys::ThreadLocal<const CrashRecoveryContextCleanup> >
        tlIsRecoveringFromCrash;
 
 CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
@@ -66,7 +74,7 @@ CrashRecoveryContextCleanup::~CrashRecoveryContextCleanup() {}
 CrashRecoveryContext::~CrashRecoveryContext() {
   // Reclaim registered resources.
   CrashRecoveryContextCleanup *i = head;
-  tlIsRecoveringFromCrash.set(head);
+  tlIsRecoveringFromCrash->set(head);
   while (i) {
     CrashRecoveryContextCleanup *tmp = i;
     i = tmp->next;
@@ -74,21 +82,21 @@ CrashRecoveryContext::~CrashRecoveryContext() {
     tmp->recoverResources();
     delete tmp;
   }
-  tlIsRecoveringFromCrash.erase();
+  tlIsRecoveringFromCrash->erase();
   
   CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
   delete CRCI;
 }
 
 bool CrashRecoveryContext::isRecoveringFromCrash() {
-  return tlIsRecoveringFromCrash.get() != 0;
+  return tlIsRecoveringFromCrash->get() != 0;
 }
 
 CrashRecoveryContext *CrashRecoveryContext::GetCurrent() {
   if (!gCrashRecoveryEnabled)
     return 0;
 
-  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
   if (!CRCI)
     return 0;
 
@@ -147,7 +155,7 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
 static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
 {
   // Lookup the current thread local recovery object.
-  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
 
   if (!CRCI) {
     // Something has gone horribly wrong, so let's just tell everyone
@@ -175,7 +183,7 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
 static sys::ThreadLocal<const void> sCurrentExceptionHandle;
 
 void CrashRecoveryContext::Enable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (gCrashRecoveryEnabled)
     return;
@@ -191,7 +199,7 @@ void CrashRecoveryContext::Enable() {
 }
 
 void CrashRecoveryContext::Disable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (!gCrashRecoveryEnabled)
     return;
@@ -229,7 +237,7 @@ static struct sigaction PrevActions[NumSignals];
 
 static void CrashRecoverySignalHandler(int Signal) {
   // Lookup the current thread local recovery object.
-  const CrashRecoveryContextImpl *CRCI = CurrentContext.get();
+  const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
 
   if (!CRCI) {
     // We didn't find a crash recovery context -- this means either we got a
@@ -260,7 +268,7 @@ static void CrashRecoverySignalHandler(int Signal) {
 }
 
 void CrashRecoveryContext::Enable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (gCrashRecoveryEnabled)
     return;
@@ -279,7 +287,7 @@ void CrashRecoveryContext::Enable() {
 }
 
 void CrashRecoveryContext::Disable() {
-  sys::ScopedLock L(gCrashRecoveryContexMutex);
+  sys::ScopedLock L(*gCrashRecoveryContextMutex);
 
   if (!gCrashRecoveryEnabled)
     return;
@@ -342,5 +350,7 @@ bool CrashRecoveryContext::RunSafelyOnThread(void (*Fn)(void*), void *UserData,
                                              unsigned RequestedStackSize) {
   RunSafelyOnThreadInfo Info = { Fn, UserData, this, false };
   llvm_execute_on_thread(RunSafelyOnThread_Dispatch, &Info, RequestedStackSize);
+  if (CrashRecoveryContextImpl *CRC = (CrashRecoveryContextImpl *)Impl)
+    CRC->setSwitchedThread();
   return Info.Result;
 }
diff --git a/lib/Support/DataStream.cpp b/lib/Support/DataStream.cpp
index 0a02281..0bd0c68 100644
--- a/lib/Support/DataStream.cpp
+++ b/lib/Support/DataStream.cpp
@@ -17,6 +17,7 @@
 #define DEBUG_TYPE "Data-stream"
 #include "llvm/Support/DataStream.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/system_error.h"
 #include <cerrno>
@@ -27,7 +28,6 @@
 #else
 #include <io.h>
 #endif
-#include <fcntl.h>
 using namespace llvm;
 
 // Interface goals:
@@ -66,18 +66,11 @@ public:
   error_code OpenFile(const std::string &Filename) {
     if (Filename == "-") {
       Fd = 0;
-      sys::Program::ChangeStdinToBinary();
+      sys::ChangeStdinToBinary();
       return error_code::success();
     }
-  
-    int OpenFlags = O_RDONLY;
-#ifdef O_BINARY
-    OpenFlags |= O_BINARY;  // Open input file in binary mode on win32.
-#endif
-    Fd = ::open(Filename.c_str(), OpenFlags);
-    if (Fd == -1)
-      return error_code(errno, posix_category());
-    return error_code::success();
+
+    return sys::fs::openFileForRead(Filename, Fd);
   }
 };
 
diff --git a/lib/Support/Disassembler.cpp b/lib/Support/Disassembler.cpp
index b3244fa..27df3a9 100644
--- a/lib/Support/Disassembler.cpp
+++ b/lib/Support/Disassembler.cpp
@@ -41,10 +41,10 @@ bool llvm::sys::hasDisassembler()
 
 std::string llvm::sys::disassembleBuffer(uint8_t* start, size_t length,
                                          uint64_t pc) {
-  std::stringstream res;
-
 #if (defined (__i386__) || defined (__amd64__) || defined (__x86_64__)) \
   && USE_UDIS86
+  std::stringstream res;
+
   unsigned bits;
 # if defined(__i386__)
   bits = 32;
@@ -66,9 +66,9 @@ std::string llvm::sys::disassembleBuffer(uint8_t* start, size_t length,
   while (ud_disassemble(&ud_obj)) {
     res << ud_insn_off(&ud_obj) << ":\t" << ud_insn_asm(&ud_obj) << "\n";
   }
-#else
-  res << "No disassembler available. See configure help for options.\n";
-#endif
 
   return res.str();
+#else
+  return "No disassembler available. See configure help for options.\n";
+#endif
 }
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 0f91c11..c000b63 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+
 using namespace llvm;
 using namespace dwarf;
 
@@ -59,8 +61,8 @@ const char *llvm::dwarf::TagString(unsigned Tag) {
   case DW_TAG_namelist_item:             return "DW_TAG_namelist_item";
   case DW_TAG_packed_type:               return "DW_TAG_packed_type";
   case DW_TAG_subprogram:                return "DW_TAG_subprogram";
-  case DW_TAG_template_type_parameter:  return "DW_TAG_template_type_parameter";
-  case DW_TAG_template_value_parameter:return "DW_TAG_template_value_parameter";
+  case DW_TAG_template_type_parameter:   return "DW_TAG_template_type_parameter";
+  case DW_TAG_template_value_parameter:  return "DW_TAG_template_value_parameter";
   case DW_TAG_thrown_type:               return "DW_TAG_thrown_type";
   case DW_TAG_try_block:                 return "DW_TAG_try_block";
   case DW_TAG_variant_part:              return "DW_TAG_variant_part";
@@ -230,6 +232,7 @@ const char *llvm::dwarf::AttributeString(unsigned Attribute) {
   case DW_AT_body_end:                   return "DW_AT_body_end";
   case DW_AT_GNU_vector:                 return "DW_AT_GNU_vector";
   case DW_AT_GNU_template_name:          return "DW_AT_GNU_template_name";
+  case DW_AT_GNU_odr_signature:          return "DW_AT_GNU_odr_signature";
   case DW_AT_MIPS_assumed_size:          return "DW_AT_MIPS_assumed_size";
   case DW_AT_lo_user:                    return "DW_AT_lo_user";
   case DW_AT_hi_user:                    return "DW_AT_hi_user";
@@ -453,10 +456,11 @@ const char *llvm::dwarf::OperationEncodingString(unsigned Encoding) {
   case DW_OP_bit_piece:                  return "DW_OP_bit_piece";
   case DW_OP_implicit_value:             return "DW_OP_implicit_value";
   case DW_OP_stack_value:                return "DW_OP_stack_value";
-  case DW_OP_lo_user:                    return "DW_OP_lo_user";
-  case DW_OP_hi_user:                    return "DW_OP_hi_user";
 
-    // DWARF5 Fission Proposal Op Extensions
+  // GNU thread-local storage
+  case DW_OP_GNU_push_tls_address:       return "DW_OP_GNU_push_tls_address";
+
+  // DWARF5 Fission Proposal Op Extensions
   case DW_OP_GNU_addr_index:             return "DW_OP_GNU_addr_index";
   case DW_OP_GNU_const_index:            return "DW_OP_GNU_const_index";
   }
@@ -722,3 +726,51 @@ const char *llvm::dwarf::CallFrameString(unsigned Encoding) {
   }
   return 0;
 }
+
+const char *llvm::dwarf::AtomTypeString(unsigned AT) {
+  switch (AT) {
+  case dwarf::DW_ATOM_null:
+    return "DW_ATOM_null";
+  case dwarf::DW_ATOM_die_offset:
+    return "DW_ATOM_die_offset";
+  case DW_ATOM_cu_offset:
+    return "DW_ATOM_cu_offset";
+  case DW_ATOM_die_tag:
+    return "DW_ATOM_die_tag";
+  case DW_ATOM_type_flags:
+    return "DW_ATOM_type_flags";
+  }
+  return 0;
+}
+
+const char *llvm::dwarf::GDBIndexEntryKindString(GDBIndexEntryKind Kind) {
+  switch (Kind) {
+  case GIEK_NONE:
+    return "NONE";
+  case GIEK_TYPE:
+    return "TYPE";
+  case GIEK_VARIABLE:
+    return "VARIABLE";
+  case GIEK_FUNCTION:
+    return "FUNCTION";
+  case GIEK_OTHER:
+    return "OTHER";
+  case GIEK_UNUSED5:
+    return "UNUSED5";
+  case GIEK_UNUSED6:
+    return "UNUSED6";
+  case GIEK_UNUSED7:
+    return "UNUSED7";
+  }
+  llvm_unreachable("Unknown GDBIndexEntryKind value");
+}
+
+const char *llvm::dwarf::GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage) {
+  switch (Linkage) {
+  case GIEL_EXTERNAL:
+    return "EXTERNAL";
+  case GIEL_STATIC:
+    return "STATIC";
+  }
+  llvm_unreachable("Unknown GDBIndexEntryLinkage value");
+}
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index f14cb45..a825c68 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -14,39 +14,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/Mutex.h"
+#include "llvm-c/Support.h"
 #include <cstdio>
 #include <cstring>
 
 // Collection of symbol name/value pairs to be searched prior to any libraries.
-static llvm::StringMap<void *> *ExplicitSymbols = 0;
-
-namespace {
-
-struct ExplicitSymbolsDeleter {
-  ~ExplicitSymbolsDeleter() {
-    delete ExplicitSymbols;
-  }
-};
-
-}
-
-static ExplicitSymbolsDeleter Dummy;
-
-
-static llvm::sys::SmartMutex<true>& getMutex() {
-  static llvm::sys::SmartMutex<true> HandlesMutex;
-  return HandlesMutex;
-}
+static llvm::ManagedStatic<llvm::StringMap<void *> > ExplicitSymbols;
+static llvm::ManagedStatic<llvm::sys::SmartMutex<true> > SymbolsMutex;
 
 void llvm::sys::DynamicLibrary::AddSymbol(StringRef symbolName,
                                           void *symbolValue) {
-  SmartScopedLock<true> lock(getMutex());
-  if (ExplicitSymbols == 0)
-    ExplicitSymbols = new StringMap<void*>();
+  SmartScopedLock<true> lock(*SymbolsMutex);
   (*ExplicitSymbols)[symbolName] = symbolValue;
 }
 
@@ -72,7 +55,7 @@ static DenseSet<void *> *OpenedHandles = 0;
 
 DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
                                                    std::string *errMsg) {
-  SmartScopedLock<true> lock(getMutex());
+  SmartScopedLock<true> lock(*SymbolsMutex);
 
   void *handle = dlopen(filename, RTLD_LAZY|RTLD_GLOBAL);
   if (handle == 0) {
@@ -126,10 +109,10 @@ void *SearchForAddressOfSpecialSymbol(const char* symbolName);
 }
 
 void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
-  SmartScopedLock<true> Lock(getMutex());
+  SmartScopedLock<true> Lock(*SymbolsMutex);
 
   // First check symbols added via AddSymbol().
-  if (ExplicitSymbols) {
+  if (ExplicitSymbols.isConstructed()) {
     StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
 
     if (i != ExplicitSymbols->end())
@@ -187,3 +170,11 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
 }
 
 #endif // LLVM_ON_WIN32
+
+//===----------------------------------------------------------------------===//
+// C API.
+//===----------------------------------------------------------------------===//
+
+LLVMBool LLVMLoadLibraryPermanently(const char* Filename) {
+  return llvm::sys::DynamicLibrary::LoadLibraryPermanently(Filename);
+}
diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp
index 730220f..1eefa3e 100644
--- a/lib/Support/Errno.cpp
+++ b/lib/Support/Errno.cpp
@@ -14,8 +14,6 @@
 #include "llvm/Support/Errno.h"
 #include "llvm/Config/config.h"     // Get autoconf configuration settings
 #include "llvm/Support/raw_ostream.h"
-
-#if HAVE_STRING_H
 #include <string.h>
 
 #if HAVE_ERRNO_H
@@ -41,28 +39,27 @@ std::string StrError(int errnum) {
   char buffer[MaxErrStrLen];
   buffer[0] = '\0';
   std::string str;
+  if (errnum == 0)
+    return str;
+
 #ifdef HAVE_STRERROR_R
   // strerror_r is thread-safe.
-  if (errnum)
-# if defined(__GLIBC__) && defined(_GNU_SOURCE)
-    // glibc defines its own incompatible version of strerror_r
-    // which may not use the buffer supplied.
-    str = strerror_r(errnum,buffer,MaxErrStrLen-1);
-# else
-    strerror_r(errnum,buffer,MaxErrStrLen-1);
-    str = buffer;
-# endif
+#if defined(__GLIBC__) && defined(_GNU_SOURCE)
+  // glibc defines its own incompatible version of strerror_r
+  // which may not use the buffer supplied.
+  str = strerror_r(errnum, buffer, MaxErrStrLen - 1);
+#else
+  strerror_r(errnum, buffer, MaxErrStrLen - 1);
+  str = buffer;
+#endif
 #elif HAVE_DECL_STRERROR_S // "Windows Secure API"
-    if (errnum) {
-      strerror_s(buffer, MaxErrStrLen - 1, errnum);
-      str = buffer;
-    }
+  strerror_s(buffer, MaxErrStrLen - 1, errnum);
+  str = buffer;
 #elif defined(HAVE_STRERROR)
   // Copy the thread un-safe result of strerror into
   // the buffer as fast as possible to minimize impact
   // of collision of strerror in multiple threads.
-  if (errnum)
-    str = strerror(errnum);
+  str = strerror(errnum);
 #else
   // Strange that this system doesn't even have strerror
   // but, oh well, just use a generic message
@@ -75,5 +72,3 @@ std::string StrError(int errnum) {
 
 }  // namespace sys
 }  // namespace llvm
-
-#endif  // HAVE_STRING_H
diff --git a/lib/Support/ErrorHandling.cpp b/lib/Support/ErrorHandling.cpp
index f4b591e..1eafb96 100644
--- a/lib/Support/ErrorHandling.cpp
+++ b/lib/Support/ErrorHandling.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
 #include <cassert>
 #include <cstdlib>
 
@@ -96,4 +97,25 @@ void llvm::llvm_unreachable_internal(const char *msg, const char *file,
     dbgs() << " at " << file << ":" << line;
   dbgs() << "!\n";
   abort();
+#ifdef LLVM_BUILTIN_UNREACHABLE
+  // Windows systems and possibly others don't declare abort() to be noreturn,
+  // so use the unreachable builtin to avoid a Clang self-host warning.
+  LLVM_BUILTIN_UNREACHABLE;
+#endif
+}
+
+static void bindingsErrorHandler(void *user_data, const std::string& reason,
+                                 bool gen_crash_diag) {
+  LLVMFatalErrorHandler handler =
+      LLVM_EXTENSION reinterpret_cast<LLVMFatalErrorHandler>(user_data);
+  handler(reason.c_str());
+}
+
+void LLVMInstallFatalErrorHandler(LLVMFatalErrorHandler Handler) {
+  install_fatal_error_handler(bindingsErrorHandler,
+                              LLVM_EXTENSION reinterpret_cast<void *>(Handler));
+}
+
+void LLVMResetFatalErrorHandler() {
+  remove_fatal_error_handler();
 }
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index 1ee69b6..ed084fa 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -62,11 +62,16 @@ error_code FileOutputBuffer::create(StringRef FilePath,
   if (EC)
     return EC;
 
+  unsigned Mode = sys::fs::all_read | sys::fs::all_write;
+  // If requested, make the output file executable.
+  if (Flags & F_executable)
+    Mode |= sys::fs::all_exe;
+
   // Create new file in same directory but with random name.
   SmallString<128> TempFilePath;
   int FD;
-  EC = sys::fs::unique_file(Twine(FilePath) + ".tmp%%%%%%%",
-                            FD, TempFilePath, false, 0644);
+  EC = sys::fs::createUniqueFile(Twine(FilePath) + ".tmp%%%%%%%", FD,
+                                 TempFilePath, Mode);
   if (EC)
     return EC;
 
@@ -75,26 +80,6 @@ error_code FileOutputBuffer::create(StringRef FilePath,
   if (EC)
     return EC;
 
-  // If requested, make the output file executable.
-  if ( Flags & F_executable ) {
-    sys::fs::file_status Stat2;
-    EC = sys::fs::status(Twine(TempFilePath), Stat2);
-    if (EC)
-      return EC;
-
-    sys::fs::perms new_perms = Stat2.permissions();
-    if ( new_perms & sys::fs::owner_read )
-      new_perms |= sys::fs::owner_exe;
-    if ( new_perms & sys::fs::group_read )
-      new_perms |= sys::fs::group_exe;
-    if ( new_perms & sys::fs::others_read )
-      new_perms |= sys::fs::others_exe;
-    new_perms |= sys::fs::add_perms;
-    EC = sys::fs::permissions(Twine(TempFilePath), new_perms);
-    if (EC)
-      return EC;
-  }
-
   Result.reset(new FileOutputBuffer(MappedFile.get(), FilePath, TempFilePath));
   if (Result)
     MappedFile.take();
diff --git a/lib/Support/FileUtilities.cpp b/lib/Support/FileUtilities.cpp
index 4d7b239..7f5d540 100644
--- a/lib/Support/FileUtilities.cpp
+++ b/lib/Support/FileUtilities.cpp
@@ -171,43 +171,20 @@ static bool CompareNumbers(const char *&F1P, const char *&F2P,
 /// error occurs, allowing the caller to distinguish between a failed diff and a
 /// file system error.
 ///
-int llvm::DiffFilesWithTolerance(const sys::PathWithStatus &FileA,
-                                 const sys::PathWithStatus &FileB,
+int llvm::DiffFilesWithTolerance(StringRef NameA,
+                                 StringRef NameB,
                                  double AbsTol, double RelTol,
                                  std::string *Error) {
-  const sys::FileStatus *FileAStat = FileA.getFileStatus(false, Error);
-  if (!FileAStat)
-    return 2;
-  const sys::FileStatus *FileBStat = FileB.getFileStatus(false, Error);
-  if (!FileBStat)
-    return 2;
-
-  // Check for zero length files because some systems croak when you try to
-  // mmap an empty file.
-  size_t A_size = FileAStat->getSize();
-  size_t B_size = FileBStat->getSize();
-
-  // If they are both zero sized then they're the same
-  if (A_size == 0 && B_size == 0)
-    return 0;
-
-  // If only one of them is zero sized then they can't be the same
-  if ((A_size == 0 || B_size == 0)) {
-    if (Error)
-      *Error = "Files differ: one is zero-sized, the other isn't";
-    return 1;
-  }
-
   // Now its safe to mmap the files into memory because both files
   // have a non-zero size.
   OwningPtr<MemoryBuffer> F1;
-  if (error_code ec = MemoryBuffer::getFile(FileA.c_str(), F1)) {
+  if (error_code ec = MemoryBuffer::getFile(NameA, F1)) {
     if (Error)
       *Error = ec.message();
     return 2;
   }
   OwningPtr<MemoryBuffer> F2;
-  if (error_code ec = MemoryBuffer::getFile(FileB.c_str(), F2)) {
+  if (error_code ec = MemoryBuffer::getFile(NameB, F2)) {
     if (Error)
       *Error = ec.message();
     return 2;
@@ -220,6 +197,8 @@ int llvm::DiffFilesWithTolerance(const sys::PathWithStatus &FileA,
   const char *File2End = F2->getBufferEnd();
   const char *F1P = File1Start;
   const char *F2P = File2Start;
+  uint64_t A_size = F1->getBufferSize();
+  uint64_t B_size = F2->getBufferSize();
 
   // Are the buffers identical?  Common case: Handle this efficiently.
   if (A_size == B_size &&
diff --git a/lib/Support/FormattedStream.cpp b/lib/Support/FormattedStream.cpp
index 231ae48..9febf66 100644
--- a/lib/Support/FormattedStream.cpp
+++ b/lib/Support/FormattedStream.cpp
@@ -17,38 +17,43 @@
 
 using namespace llvm;
 
-/// CountColumns - Examine the given char sequence and figure out which
-/// column we end up in after output.
+/// UpdatePosition - Examine the given char sequence and figure out which
+/// column we end up in after output, and how many line breaks are contained.
 ///
-static unsigned CountColumns(unsigned Column, const char *Ptr, size_t Size) {
-  // Keep track of the current column by scanning the string for
-  // special characters
+static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+  unsigned &Column = Position.first;
+  unsigned &Line = Position.second;
 
+  // Keep track of the current column and line by scanning the string for
+  // special characters
   for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
     ++Column;
-    if (*Ptr == '\n' || *Ptr == '\r')
+    switch (*Ptr) {
+    case '\n':
+      Line += 1;
+    case '\r':
       Column = 0;
-    else if (*Ptr == '\t')
+      break;
+    case '\t':
       // Assumes tab stop = 8 characters.
       Column += (8 - (Column & 0x7)) & 0x7;
+      break;
+    }
   }
-
-  return Column;
 }
 
-/// ComputeColumn - Examine the current output and figure out which
-/// column we end up in after output.
-void formatted_raw_ostream::ComputeColumn(const char *Ptr, size_t Size) {
+/// ComputePosition - Examine the current output and update line and column
+/// counts.
+void formatted_raw_ostream::ComputePosition(const char *Ptr, size_t Size) {
   // If our previous scan pointer is inside the buffer, assume we already
   // scanned those bytes. This depends on raw_ostream to not change our buffer
   // in unexpected ways.
-  if (Ptr <= Scanned && Scanned <= Ptr + Size) {
+  if (Ptr <= Scanned && Scanned <= Ptr + Size)
     // Scan all characters added since our last scan to determine the new
     // column.
-    ColumnScanned = CountColumns(ColumnScanned, Scanned, 
-                                 Size - (Scanned - Ptr));
-  } else
-    ColumnScanned = CountColumns(ColumnScanned, Ptr, Size);
+    UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+  else
+    UpdatePosition(Position, Ptr, Size);
 
   // Update the scanning pointer.
   Scanned = Ptr + Size;
@@ -60,16 +65,16 @@ void formatted_raw_ostream::ComputeColumn(const char *Ptr, size_t Size) {
 ///
 formatted_raw_ostream &formatted_raw_ostream::PadToColumn(unsigned NewCol) { 
   // Figure out what's in the buffer and add it to the column count.
-  ComputeColumn(getBufferStart(), GetNumBytesInBuffer());
+  ComputePosition(getBufferStart(), GetNumBytesInBuffer());
 
   // Output spaces until we reach the desired column.
-  indent(std::max(int(NewCol - ColumnScanned), 1));
+  indent(std::max(int(NewCol - getColumn()), 1));
   return *this;
 }
 
 void formatted_raw_ostream::write_impl(const char *Ptr, size_t Size) {
   // Figure out what's in the buffer and add it to the column count.
-  ComputeColumn(Ptr, Size);
+  ComputePosition(Ptr, Size);
 
   // Write the data to the underlying stream (which is unbuffered, so
   // the data will be immediately written out).
diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp
index bff182f..85be415 100644
--- a/lib/Support/GraphWriter.cpp
+++ b/lib/Support/GraphWriter.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Config/config.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
 using namespace llvm;
@@ -64,31 +65,46 @@ StringRef llvm::DOT::getColorString(unsigned ColorNumber) {
   return Colors[ColorNumber % NumColors];
 }
 
+std::string llvm::createGraphFilename(const Twine &Name, int &FD) {
+  FD = -1;
+  SmallString<128> Filename;
+  error_code EC = sys::fs::createTemporaryFile(Name, "dot", FD, Filename);
+  if (EC) {
+    errs() << "Error: " << EC.message() << "\n";
+    return "";
+  }
+
+  errs() << "Writing '" << Filename << "'... ";
+  return Filename.str();
+}
+
 // Execute the graph viewer. Return true if successful.
 static bool LLVM_ATTRIBUTE_UNUSED
-ExecGraphViewer(const sys::Path &ExecPath, std::vector<const char*> &args,
-                const sys::Path &Filename, bool wait, std::string &ErrMsg) {
+ExecGraphViewer(StringRef ExecPath, std::vector<const char*> &args,
+                StringRef Filename, bool wait, std::string &ErrMsg) {
   if (wait) {
-    if (sys::Program::ExecuteAndWait(ExecPath, &args[0],0,0,0,0,&ErrMsg)) {
+    if (sys::ExecuteAndWait(ExecPath, &args[0],0,0,0,0,&ErrMsg)) {
       errs() << "Error: " << ErrMsg << "\n";
       return false;
     }
-    Filename.eraseFromDisk();
+    bool Existed;
+    sys::fs::remove(Filename, Existed);
     errs() << " done. \n";
   }
   else {
-    sys::Program::ExecuteNoWait(ExecPath, &args[0],0,0,0,&ErrMsg);
+    sys::ExecuteNoWait(ExecPath, &args[0],0,0,0,&ErrMsg);
     errs() << "Remember to erase graph file: " << Filename.str() << "\n";
   }
   return true;
 }
 
-void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
+void llvm::DisplayGraph(StringRef FilenameRef, bool wait,
                         GraphProgram::Name program) {
+  std::string Filename = FilenameRef;
   wait &= !ViewBackground;
   std::string ErrMsg;
 #if HAVE_GRAPHVIZ
-  sys::Path Graphviz(LLVM_PATH_GRAPHVIZ);
+  std::string Graphviz(LLVM_PATH_GRAPHVIZ);
 
   std::vector<const char*> args;
   args.push_back(Graphviz.c_str());
@@ -99,9 +115,9 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
   if (!ExecGraphViewer(Graphviz, args, Filename, wait, ErrMsg))
     return;
 
-#elif HAVE_XDOT_PY
+#elif HAVE_XDOT
   std::vector<const char*> args;
-  args.push_back(LLVM_PATH_XDOT_PY);
+  args.push_back(LLVM_PATH_XDOT);
   args.push_back(Filename.c_str());
 
   switch (program) {
@@ -115,53 +131,51 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
   args.push_back(0);
 
   errs() << "Running 'xdot.py' program... ";
-  if (!ExecGraphViewer(sys::Path(LLVM_PATH_XDOT_PY), args, Filename, wait, ErrMsg))
+  if (!ExecGraphViewer(LLVM_PATH_XDOT, args, Filename, wait, ErrMsg))
     return;
 
 #elif (HAVE_GV && (HAVE_DOT || HAVE_FDP || HAVE_NEATO || \
                    HAVE_TWOPI || HAVE_CIRCO))
-  sys::Path PSFilename = Filename;
-  PSFilename.appendSuffix("ps");
-
-  sys::Path prog;
+  std::string PSFilename = Filename + ".ps";
+  std::string prog;
 
   // Set default grapher
 #if HAVE_CIRCO
-  prog = sys::Path(LLVM_PATH_CIRCO);
+  prog = LLVM_PATH_CIRCO;
 #endif
 #if HAVE_TWOPI
-  prog = sys::Path(LLVM_PATH_TWOPI);
+  prog = LLVM_PATH_TWOPI;
 #endif
 #if HAVE_NEATO
-  prog = sys::Path(LLVM_PATH_NEATO);
+  prog = LLVM_PATH_NEATO;
 #endif
 #if HAVE_FDP
-  prog = sys::Path(LLVM_PATH_FDP);
+  prog = LLVM_PATH_FDP;
 #endif
 #if HAVE_DOT
-  prog = sys::Path(LLVM_PATH_DOT);
+  prog = LLVM_PATH_DOT;
 #endif
 
   // Find which program the user wants
 #if HAVE_DOT
   if (program == GraphProgram::DOT)
-    prog = sys::Path(LLVM_PATH_DOT);
+    prog = LLVM_PATH_DOT;
 #endif
 #if (HAVE_FDP)
   if (program == GraphProgram::FDP)
-    prog = sys::Path(LLVM_PATH_FDP);
+    prog = LLVM_PATH_FDP;
 #endif
 #if (HAVE_NEATO)
   if (program == GraphProgram::NEATO)
-    prog = sys::Path(LLVM_PATH_NEATO);
+    prog = LLVM_PATH_NEATO;
 #endif
 #if (HAVE_TWOPI)
   if (program == GraphProgram::TWOPI)
-    prog = sys::Path(LLVM_PATH_TWOPI);
+    prog = LLVM_PATH_TWOPI;
 #endif
 #if (HAVE_CIRCO)
   if (program == GraphProgram::CIRCO)
-    prog = sys::Path(LLVM_PATH_CIRCO);
+    prog = LLVM_PATH_CIRCO;
 #endif
 
   std::vector<const char*> args;
@@ -174,12 +188,12 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
   args.push_back(PSFilename.c_str());
   args.push_back(0);
 
-  errs() << "Running '" << prog.str() << "' program... ";
+  errs() << "Running '" << prog << "' program... ";
 
   if (!ExecGraphViewer(prog, args, Filename, wait, ErrMsg))
     return;
 
-  sys::Path gv(LLVM_PATH_GV);
+  std::string gv(LLVM_PATH_GV);
   args.clear();
   args.push_back(gv.c_str());
   args.push_back(PSFilename.c_str());
@@ -191,7 +205,7 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
     return;
 
 #elif HAVE_DOTTY
-  sys::Path dotty(LLVM_PATH_DOTTY);
+  std::string dotty(LLVM_PATH_DOTTY);
 
   std::vector<const char*> args;
   args.push_back(dotty.c_str());
@@ -205,5 +219,8 @@ void llvm::DisplayGraph(const sys::Path &Filename, bool wait,
   errs() << "Running 'dotty' program... ";
   if (!ExecGraphViewer(dotty, args, Filename, wait, ErrMsg))
     return;
+#else
+  (void)Filename;
+  (void)ErrMsg;
 #endif
 }
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index a7c7a95..6e9a5c9 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -52,8 +52,54 @@ using namespace llvm;
 
 /// GetX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
 /// specified arguments.  If we can't run cpuid on the host, return true.
-static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
-                            unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX, unsigned *rEBX,
+                               unsigned *rECX, unsigned *rEDX) {
+#if defined(__GNUC__) || defined(__clang__)
+  #if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+    // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+// pedantic #else returns to appease -Wunreachable-code (so we don't generate
+// postprocessed code that looks like "return true; return false;")
+  #else
+    return true;
+  #endif
+#elif defined(_MSC_VER)
+  // The MSVC intrinsic is portable across x86 and x64.
+  int registers[4];
+  __cpuid(registers, value);
+  *rEAX = registers[0];
+  *rEBX = registers[1];
+  *rECX = registers[2];
+  *rEDX = registers[3];
+  return false;
+#else
+  return true;
+#endif
+}
+
+/// GetX86CpuIDAndInfoEx - Execute the specified cpuid with subleaf and return the
+/// 4 values in the specified arguments.  If we can't run cpuid on the host,
+/// return true.
+bool GetX86CpuIDAndInfoEx(unsigned value, unsigned subleaf, unsigned *rEAX,
+                          unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
 #if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
   #if defined(__GNUC__)
     // gcc doesn't know cpuid would clobber ebx/rbx. Preseve it manually.
@@ -64,16 +110,22 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
            "=S" (*rEBX),
            "=c" (*rECX),
            "=d" (*rEDX)
-         :  "a" (value));
+         :  "a" (value),
+            "c" (subleaf));
     return false;
   #elif defined(_MSC_VER)
-    int registers[4];
-    __cpuid(registers, value);
-    *rEAX = registers[0];
-    *rEBX = registers[1];
-    *rECX = registers[2];
-    *rEDX = registers[3];
-    return false;
+    // __cpuidex was added in MSVC++ 9.0 SP1
+    #if (_MSC_VER > 1500) || (_MSC_VER == 1500 && _MSC_FULL_VER >= 150030729)
+      int registers[4];
+      __cpuidex(registers, value, subleaf);
+      *rEAX = registers[0];
+      *rEBX = registers[1];
+      *rECX = registers[2];
+      *rEDX = registers[3];
+      return false;
+    #else
+      return true;
+    #endif
   #else
     return true;
   #endif
@@ -86,11 +138,13 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
            "=S" (*rEBX),
            "=c" (*rECX),
            "=d" (*rEDX)
-         :  "a" (value));
+         :  "a" (value),
+            "c" (subleaf));
     return false;
   #elif defined(_MSC_VER)
     __asm {
       mov   eax,value
+      mov   ecx,subleaf
       cpuid
       mov   esi,rEAX
       mov   dword ptr [esi],eax
@@ -102,8 +156,6 @@ static bool GetX86CpuIDAndInfo(unsigned value, unsigned *rEAX,
       mov   dword ptr [esi],edx
     }
     return false;
-// pedantic #else returns to appease -Wunreachable-code (so we don't generate
-// postprocessed code that looks like "return true; return false;")
   #else
     return true;
   #endif
@@ -148,21 +200,27 @@ std::string sys::getHostCPUName() {
   unsigned Model  = 0;
   DetectX86FamilyModel(EAX, Family, Model);
 
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+
+  GetX86CpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
+
+  unsigned MaxLeaf = EAX;
   bool HasSSE3 = (ECX & 0x1);
+  bool HasSSE41 = (ECX & 0x80000);
   // If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV 
   // indicates that the AVX registers will be saved and restored on context
   // switch, then we have full AVX support.
   const unsigned AVXBits = (1 << 27) | (1 << 28);
   bool HasAVX = ((ECX & AVXBits) == AVXBits) && OSHasAVXSupport();
+  bool HasAVX2 = HasAVX && MaxLeaf >= 0x7 &&
+                 !GetX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX) &&
+                 (EBX & 0x20);
   GetX86CpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
   bool Em64T = (EDX >> 29) & 0x1;
 
-  union {
-    unsigned u[3];
-    char     c[12];
-  } text;
-
-  GetX86CpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1);
   if (memcmp(text.c, "GenuineIntel", 12) == 0) {
     switch (Family) {
     case 3:
@@ -244,7 +302,8 @@ std::string sys::getHostCPUName() {
                // 17h. All processors are manufactured using the 45 nm process.
                //
                // 45nm: Penryn , Wolfdale, Yorkfield (XE)
-        return "penryn";
+        // Not all Penryn processors support SSE 4.1 (such as the Pentium brand)
+        return HasSSE41 ? "penryn" : "core2";
 
       case 26: // Intel Core i7 processor and Intel Xeon processor. All
                // processors are manufactured using the 45 nm process.
@@ -269,10 +328,20 @@ std::string sys::getHostCPUName() {
 
       // Ivy Bridge:
       case 58:
+      case 62: // Ivy Bridge EP
         // Not all Ivy Bridge processors support AVX (such as the Pentium
         // versions instead of the i7 versions).
         return HasAVX ? "core-avx-i" : "corei7";
 
+      // Haswell:
+      case 60:
+      case 63:
+      case 69:
+      case 70:
+        // Not all Haswell processors support AVX too (such as the Pentium
+        // versions instead of the i7 versions).
+        return HasAVX2 ? "core-avx2" : "corei7";
+
       case 28: // Most 45 nm Intel Atom processors
       case 38: // 45 nm Atom Lincroft
       case 39: // 32 nm Atom Medfield
@@ -280,6 +349,12 @@ std::string sys::getHostCPUName() {
       case 54: // 32 nm Atom Midview
         return "atom";
 
+      // Atom Silvermont codes from the Intel software optimization guide.
+      case 55:
+      case 74:
+      case 77:
+        return "slm";
+
       default: return (Em64T) ? "x86-64" : "i686";
       }
     case 15: {
@@ -357,9 +432,11 @@ std::string sys::getHostCPUName() {
       case 21:
         if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
           return "btver1";
-        if (Model > 15 && Model <= 31)
-          return "bdver2";
-        return "bdver1";
+        if (Model >= 0x30)
+          return "bdver3"; // 30h-3Fh: Steamroller
+        if (Model >= 0x10)
+          return "bdver2"; // 10h-1Fh: Piledriver
+        return "bdver1";   // 00h-0Fh: Bulldozer
       case 22:
         if (!HasAVX) // If the OS doesn't support AVX provide a sane fallback.
           return "btver1";
@@ -544,6 +621,48 @@ std::string sys::getHostCPUName() {
 
   return "generic";
 }
+#elif defined(__linux__) && defined(__s390x__)
+std::string sys::getHostCPUName() {
+  // STIDP is a privileged operation, so use /proc/cpuinfo instead.
+  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
+  // memory buffer because the 'file' has 0 size (it can be read from only
+  // as a stream).
+
+  std::string Err;
+  DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err);
+  if (!DS) {
+    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n");
+    return "generic";
+  }
+
+  // The "processor 0:" line comes after a fair amount of other information,
+  // including a cache breakdown, but this should be plenty.
+  char buffer[2048];
+  size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer));
+  delete DS;
+
+  StringRef Str(buffer, CPUInfoSize);
+  SmallVector<StringRef, 32> Lines;
+  Str.split(Lines, "\n");
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+    if (Lines[I].startswith("processor ")) {
+      size_t Pos = Lines[I].find("machine = ");
+      if (Pos != StringRef::npos) {
+        Pos += sizeof("machine = ") - 1;
+        unsigned int Id;
+        if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
+          if (Id >= 2827)
+            return "zEC12";
+          if (Id >= 2817)
+            return "z196";
+        }
+      }
+      break;
+    }
+  }
+  
+  return "generic";
+}
 #else
 std::string sys::getHostCPUName() {
   return "generic";
@@ -570,41 +689,31 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   SmallVector<StringRef, 32> Lines;
   Str.split(Lines, "\n");
 
-  // Look for the CPU implementer line.
-  StringRef Implementer;
-  for (unsigned I = 0, E = Lines.size(); I != E; ++I)
-    if (Lines[I].startswith("CPU implementer"))
-      Implementer = Lines[I].substr(15).ltrim("\t :");
-
-  if (Implementer == "0x41") { // ARM Ltd.
-    SmallVector<StringRef, 32> CPUFeatures;
+  SmallVector<StringRef, 32> CPUFeatures;
 
-    // Look for the CPU features.
-    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
-      if (Lines[I].startswith("Features")) {
-        Lines[I].split(CPUFeatures, " ");
-        break;
-      }
-
-    for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
-      StringRef LLVMFeatureStr = StringSwitch<StringRef>(CPUFeatures[I])
-        .Case("half", "fp16")
-        .Case("neon", "neon")
-        .Case("vfpv3", "vfp3")
-        .Case("vfpv3d16", "d16")
-        .Case("vfpv4", "vfp4")
-        .Case("idiva", "hwdiv-arm")
-        .Case("idivt", "hwdiv")
-        .Default("");
-
-      if (LLVMFeatureStr != "")
-        Features.GetOrCreateValue(LLVMFeatureStr).setValue(true);
+  // Look for the CPU features.
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+    if (Lines[I].startswith("Features")) {
+      Lines[I].split(CPUFeatures, " ");
+      break;
     }
 
-    return true;
+  for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
+    StringRef LLVMFeatureStr = StringSwitch<StringRef>(CPUFeatures[I])
+      .Case("half", "fp16")
+      .Case("neon", "neon")
+      .Case("vfpv3", "vfp3")
+      .Case("vfpv3d16", "d16")
+      .Case("vfpv4", "vfp4")
+      .Case("idiva", "hwdiv-arm")
+      .Case("idivt", "hwdiv")
+      .Default("");
+
+    if (LLVMFeatureStr != "")
+      Features.GetOrCreateValue(LLVMFeatureStr).setValue(true);
   }
 
-  return false;
+  return true;
 }
 #else
 bool sys::getHostCPUFeatures(StringMap<bool> &Features){
@@ -613,7 +722,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features){
 #endif
 
 std::string sys::getProcessTriple() {
-  Triple PT(LLVM_HOST_TRIPLE);
+  Triple PT(Triple::normalize(LLVM_HOST_TRIPLE));
 
   if (sizeof(void *) == 8 && PT.isArch32Bit())
     PT = PT.get64BitArchVariant();
diff --git a/lib/Support/Locale.cpp b/lib/Support/Locale.cpp
index 17b9b6c..35ddf7f 100644
--- a/lib/Support/Locale.cpp
+++ b/lib/Support/Locale.cpp
@@ -1,10 +1,31 @@
 #include "llvm/Support/Locale.h"
-#include "llvm/Config/config.h"
+#include "llvm/Support/Unicode.h"
 
-#ifdef __APPLE__
-#include "LocaleXlocale.inc"
-#elif LLVM_ON_WIN32
-#include "LocaleWindows.inc"
+namespace llvm {
+namespace sys {
+namespace locale {
+
+int columnWidth(StringRef Text) {
+#if LLVM_ON_WIN32
+  return Text.size();
 #else
-#include "LocaleGeneric.inc"
+  return llvm::sys::unicode::columnWidthUTF8(Text);
 #endif
+}
+
+bool isPrint(int UCS) {
+#if LLVM_ON_WIN32
+  // Restrict characters that we'll try to print to the the lower part of ASCII
+  // except for the control characters (0x20 - 0x7E). In general one can not
+  // reliably output code points U+0080 and higher using narrow character C/C++
+  // output functions in Windows, because the meaning of the upper 128 codes is
+  // determined by the active code page in the console.
+  return ' ' <= UCS && UCS <= '~';
+#else
+  return llvm::sys::unicode::isPrintable(UCS);
+#endif
+}
+
+} // namespace locale
+} // namespace sys
+} // namespace llvm
diff --git a/lib/Support/LocaleGeneric.inc b/lib/Support/LocaleGeneric.inc
deleted file mode 100644
index 278deee..0000000
--- a/lib/Support/LocaleGeneric.inc
+++ /dev/null
@@ -1,17 +0,0 @@
-#include <cwctype>
-
-namespace llvm {
-namespace sys {
-namespace locale {
-
-int columnWidth(StringRef s) {
-    return s.size();
-}
-
-bool isPrint(int c) {
-    return iswprint(c);
-}
-
-}
-}
-}
diff --git a/lib/Support/LocaleWindows.inc b/lib/Support/LocaleWindows.inc
deleted file mode 100644
index 28e429c..0000000
--- a/lib/Support/LocaleWindows.inc
+++ /dev/null
@@ -1,15 +0,0 @@
-namespace llvm {
-namespace sys {
-namespace locale {
-
-int columnWidth(StringRef s) {
-    return s.size();
-}
-
-bool isPrint(int c) {
-    return ' ' <= c && c <= '~';
-}
-
-}
-}
-}
diff --git a/lib/Support/LocaleXlocale.inc b/lib/Support/LocaleXlocale.inc
deleted file mode 100644
index 389fe3d..0000000
--- a/lib/Support/LocaleXlocale.inc
+++ /dev/null
@@ -1,61 +0,0 @@
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ManagedStatic.h"
-#include <cassert>
-#include <xlocale.h>
-
-
-namespace {
-  struct locale_holder {
-    locale_holder()
-    : l(newlocale(LC_CTYPE_MASK,"en_US.UTF-8",LC_GLOBAL_LOCALE))
-    {
-      assert(NULL!=l);
-    }
-    ~locale_holder() {
-      freelocale(l);
-    }
-
-    int mbswidth(llvm::SmallString<16> s) const {
-       // this implementation assumes no '\0' in s
-      assert(s.size()==strlen(s.c_str()));
-
-      size_t size = mbstowcs_l(NULL,s.c_str(),0,l);
-      assert(size!=(size_t)-1);
-      if (size==0)
-        return 0;
-      llvm::SmallVector<wchar_t,200> ws(size);
-      size = mbstowcs_l(&ws[0],s.c_str(),ws.size(),l);
-      assert(ws.size()==size);
-      return wcswidth_l(&ws[0],ws.size(),l);
-    }
-
-    int isprint(int c) const {
-      return iswprint_l(c,l);
-    }
-
-  private:
-
-    locale_t l;
-  };
-
-  llvm::ManagedStatic<locale_holder> l;
-}
-
-namespace llvm {
-namespace sys {
-namespace locale {
-
-int columnWidth(StringRef s) {
-  int width = l->mbswidth(s);
-  assert(width>=0);
-  return width;
-}
-
-bool isPrint(int c) {
-  return l->isprint(c);
-}
-
-}
-}
-}
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 2917e27..eeec274 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -7,9 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/Support/LockFileManager.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
-#include <fstream>
 #include <sys/stat.h>
 #include <sys/types.h>
 #if LLVM_ON_WIN32
@@ -35,16 +37,20 @@ LockFileManager::readLockFile(StringRef LockFileName) {
 
   // Read the owning host and PID out of the lock file. If it appears that the
   // owning process is dead, the lock file is invalid.
-  int PID = 0;
-  std::string Hostname;
-  std::ifstream Input(LockFileName.str().c_str());
-  if (Input >> Hostname >> PID && PID > 0 &&
-      processStillExecuting(Hostname, PID))
-    return std::make_pair(Hostname, PID);
+  OwningPtr<MemoryBuffer> MB;
+  if (MemoryBuffer::getFile(LockFileName, MB))
+    return None;
+
+  StringRef Hostname;
+  StringRef PIDStr;
+  tie(Hostname, PIDStr) = getToken(MB->getBuffer(), " ");
+  PIDStr = PIDStr.substr(PIDStr.find_first_not_of(" "));
+  int PID;
+  if (!PIDStr.getAsInteger(10, PID))
+    return std::make_pair(std::string(Hostname), PID);
 
   // Delete the lock file. It's invalid anyway.
-  bool Existed;
-  sys::fs::remove(LockFileName, Existed);
+  sys::fs::remove(LockFileName);
   return None;
 }
 
@@ -78,10 +84,9 @@ LockFileManager::LockFileManager(StringRef FileName)
   UniqueLockFileName += "-%%%%%%%%";
   int UniqueLockFileID;
   if (error_code EC
-        = sys::fs::unique_file(UniqueLockFileName.str(),
-                                     UniqueLockFileID,
-                                     UniqueLockFileName,
-                                     /*makeAbsolute=*/false)) {
+        = sys::fs::createUniqueFile(UniqueLockFileName.str(),
+                                    UniqueLockFileID,
+                                    UniqueLockFileName)) {
     Error = EC;
     return;
   }
diff --git a/lib/Support/MD5.cpp b/lib/Support/MD5.cpp
new file mode 100644
index 0000000..514466c
--- /dev/null
+++ b/lib/Support/MD5.cpp
@@ -0,0 +1,286 @@
+/*
+ * This code is derived from (original license follows):
+ *
+ * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
+ * MD5 Message-Digest Algorithm (RFC 1321).
+ *
+ * Homepage:
+ * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5
+ *
+ * Author:
+ * Alexander Peslyak, better known as Solar Designer <solar at openwall.com>
+ *
+ * This software was written by Alexander Peslyak in 2001.  No copyright is
+ * claimed, and the software is hereby placed in the public domain.
+ * In case this attempt to disclaim copyright and place the software in the
+ * public domain is deemed null and void, then the software is
+ * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the
+ * general public under the following terms:
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted.
+ *
+ * There's ABSOLUTELY NO WARRANTY, express or implied.
+ *
+ * (This is a heavily cut-down "BSD license".)
+ *
+ * This differs from Colin Plumb's older public domain implementation in that
+ * no exactly 32-bit integer data type is required (any 32-bit or wider
+ * unsigned integer data type will do), there's no compile-time endianness
+ * configuration, and the function prototypes match OpenSSL's.  No code from
+ * Colin Plumb's implementation has been reused; this comment merely compares
+ * the properties of the two independent implementations.
+ *
+ * The primary goals of this implementation are portability and ease of use.
+ * It is meant to be fast, but not as fast as possible.  Some known
+ * optimizations are not included to reduce source code size and avoid
+ * compile-time configuration.
+ */
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+
+// The basic MD5 functions.
+
+// F and G are optimized compared to their RFC 1321 definitions for
+// architectures that lack an AND-NOT instruction, just like in Colin Plumb's
+// implementation.
+#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+#define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y))))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | ~(z)))
+
+// The MD5 transformation for all four rounds.
+#define STEP(f, a, b, c, d, x, t, s)                                           \
+  (a) += f((b), (c), (d)) + (x) + (t);                                         \
+  (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s))));                   \
+  (a) += (b);
+
+// SET reads 4 input bytes in little-endian byte order and stores them
+// in a properly aligned word in host byte order.
+#define SET(n)                                                                 \
+  (block[(n)] =                                                                \
+       (MD5_u32plus) ptr[(n) * 4] | ((MD5_u32plus) ptr[(n) * 4 + 1] << 8) |    \
+       ((MD5_u32plus) ptr[(n) * 4 + 2] << 16) |                                \
+       ((MD5_u32plus) ptr[(n) * 4 + 3] << 24))
+#define GET(n) (block[(n)])
+
+namespace llvm {
+
+/// \brief This processes one or more 64-byte data blocks, but does NOT update
+///the bit counters.  There are no alignment requirements.
+const uint8_t *MD5::body(ArrayRef<uint8_t> Data) {
+  const uint8_t *ptr;
+  MD5_u32plus a, b, c, d;
+  MD5_u32plus saved_a, saved_b, saved_c, saved_d;
+  unsigned long Size = Data.size();
+
+  ptr = Data.data();
+
+  a = this->a;
+  b = this->b;
+  c = this->c;
+  d = this->d;
+
+  do {
+    saved_a = a;
+    saved_b = b;
+    saved_c = c;
+    saved_d = d;
+
+    // Round 1
+    STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7)
+    STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12)
+    STEP(F, c, d, a, b, SET(2), 0x242070db, 17)
+    STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22)
+    STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7)
+    STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12)
+    STEP(F, c, d, a, b, SET(6), 0xa8304613, 17)
+    STEP(F, b, c, d, a, SET(7), 0xfd469501, 22)
+    STEP(F, a, b, c, d, SET(8), 0x698098d8, 7)
+    STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12)
+    STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17)
+    STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22)
+    STEP(F, a, b, c, d, SET(12), 0x6b901122, 7)
+    STEP(F, d, a, b, c, SET(13), 0xfd987193, 12)
+    STEP(F, c, d, a, b, SET(14), 0xa679438e, 17)
+    STEP(F, b, c, d, a, SET(15), 0x49b40821, 22)
+
+    // Round 2
+    STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5)
+    STEP(G, d, a, b, c, GET(6), 0xc040b340, 9)
+    STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14)
+    STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20)
+    STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5)
+    STEP(G, d, a, b, c, GET(10), 0x02441453, 9)
+    STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14)
+    STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20)
+    STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5)
+    STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9)
+    STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14)
+    STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20)
+    STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5)
+    STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9)
+    STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14)
+    STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20)
+
+    // Round 3
+    STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4)
+    STEP(H, d, a, b, c, GET(8), 0x8771f681, 11)
+    STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16)
+    STEP(H, b, c, d, a, GET(14), 0xfde5380c, 23)
+    STEP(H, a, b, c, d, GET(1), 0xa4beea44, 4)
+    STEP(H, d, a, b, c, GET(4), 0x4bdecfa9, 11)
+    STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16)
+    STEP(H, b, c, d, a, GET(10), 0xbebfbc70, 23)
+    STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4)
+    STEP(H, d, a, b, c, GET(0), 0xeaa127fa, 11)
+    STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16)
+    STEP(H, b, c, d, a, GET(6), 0x04881d05, 23)
+    STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4)
+    STEP(H, d, a, b, c, GET(12), 0xe6db99e5, 11)
+    STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16)
+    STEP(H, b, c, d, a, GET(2), 0xc4ac5665, 23)
+
+    // Round 4
+    STEP(I, a, b, c, d, GET(0), 0xf4292244, 6)
+    STEP(I, d, a, b, c, GET(7), 0x432aff97, 10)
+    STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15)
+    STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21)
+    STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6)
+    STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10)
+    STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15)
+    STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21)
+    STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6)
+    STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10)
+    STEP(I, c, d, a, b, GET(6), 0xa3014314, 15)
+    STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21)
+    STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6)
+    STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10)
+    STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15)
+    STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21)
+
+    a += saved_a;
+    b += saved_b;
+    c += saved_c;
+    d += saved_d;
+
+    ptr += 64;
+  } while (Size -= 64);
+
+  this->a = a;
+  this->b = b;
+  this->c = c;
+  this->d = d;
+
+  return ptr;
+}
+
+MD5::MD5()
+    : a(0x67452301), b(0xefcdab89), c(0x98badcfe), d(0x10325476), hi(0), lo(0) {
+}
+
+/// Incrementally add the bytes in \p Data to the hash.
+void MD5::update(ArrayRef<uint8_t> Data) {
+  MD5_u32plus saved_lo;
+  unsigned long used, free;
+  const uint8_t *Ptr = Data.data();
+  unsigned long Size = Data.size();
+
+  saved_lo = lo;
+  if ((lo = (saved_lo + Size) & 0x1fffffff) < saved_lo)
+    hi++;
+  hi += Size >> 29;
+
+  used = saved_lo & 0x3f;
+
+  if (used) {
+    free = 64 - used;
+
+    if (Size < free) {
+      memcpy(&buffer[used], Ptr, Size);
+      return;
+    }
+
+    memcpy(&buffer[used], Ptr, free);
+    Ptr = Ptr + free;
+    Size -= free;
+    body(ArrayRef<uint8_t>(buffer, 64));
+  }
+
+  if (Size >= 64) {
+    Ptr = body(ArrayRef<uint8_t>(Ptr, Size & ~(unsigned long) 0x3f));
+    Size &= 0x3f;
+  }
+
+  memcpy(buffer, Ptr, Size);
+}
+
+/// Add the bytes in the StringRef \p Str to the hash.
+// Note that this isn't a string and so this won't include any trailing NULL
+// bytes.
+void MD5::update(StringRef Str) {
+  ArrayRef<uint8_t> SVal((const uint8_t *)Str.data(), Str.size());
+  update(SVal);
+}
+
+/// \brief Finish the hash and place the resulting hash into \p result.
+/// \param result is assumed to be a minimum of 16-bytes in size.
+void MD5::final(MD5Result &result) {
+  unsigned long used, free;
+
+  used = lo & 0x3f;
+
+  buffer[used++] = 0x80;
+
+  free = 64 - used;
+
+  if (free < 8) {
+    memset(&buffer[used], 0, free);
+    body(ArrayRef<uint8_t>(buffer, 64));
+    used = 0;
+    free = 64;
+  }
+
+  memset(&buffer[used], 0, free - 8);
+
+  lo <<= 3;
+  buffer[56] = lo;
+  buffer[57] = lo >> 8;
+  buffer[58] = lo >> 16;
+  buffer[59] = lo >> 24;
+  buffer[60] = hi;
+  buffer[61] = hi >> 8;
+  buffer[62] = hi >> 16;
+  buffer[63] = hi >> 24;
+
+  body(ArrayRef<uint8_t>(buffer, 64));
+
+  result[0] = a;
+  result[1] = a >> 8;
+  result[2] = a >> 16;
+  result[3] = a >> 24;
+  result[4] = b;
+  result[5] = b >> 8;
+  result[6] = b >> 16;
+  result[7] = b >> 24;
+  result[8] = c;
+  result[9] = c >> 8;
+  result[10] = c >> 16;
+  result[11] = c >> 24;
+  result[12] = d;
+  result[13] = d >> 8;
+  result[14] = d >> 16;
+  result[15] = d >> 24;
+}
+
+void MD5::stringifyResult(MD5Result &result, SmallString<32> &Str) {
+  raw_svector_ostream Res(Str);
+  for (int i = 0; i < 16; ++i)
+    Res << format("%.2x", result[i]);
+}
+
+}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 7c5ab96..dcd5529 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -33,8 +33,7 @@
 #include <unistd.h>
 #else
 #include <io.h>
-// Simplistic definitinos of these macros to allow files to be read with
-// MapInFilePages.
+// Simplistic definitinos of these macros for use in getOpenFile.
 #ifndef S_ISREG
 #define S_ISREG(x) (1)
 #endif
@@ -42,7 +41,6 @@
 #define S_ISBLK(x) (0)
 #endif
 #endif
-#include <fcntl.h>
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -174,20 +172,12 @@ error_code MemoryBuffer::getFileOrSTDIN(StringRef Filename,
   return getFile(Filename, result, FileSize);
 }
 
-error_code MemoryBuffer::getFileOrSTDIN(const char *Filename,
-                                        OwningPtr<MemoryBuffer> &result,
-                                        int64_t FileSize) {
-  if (strcmp(Filename, "-") == 0)
-    return getSTDIN(result);
-  return getFile(Filename, result, FileSize);
-}
-
 //===----------------------------------------------------------------------===//
 // MemoryBuffer::getFile implementation.
 //===----------------------------------------------------------------------===//
 
 namespace {
-/// \brief Memorry maps a file descriptor using sys::fs::mapped_file_region.
+/// \brief Memory maps a file descriptor using sys::fs::mapped_file_region.
 ///
 /// This handles converting the offset into a legal offset on the platform.
 class MemoryBufferMMapFile : public MemoryBuffer {
@@ -227,7 +217,7 @@ public:
 };
 }
 
-static error_code getMemoryBufferForStream(int FD, 
+static error_code getMemoryBufferForStream(int FD,
                                            StringRef BufferName,
                                            OwningPtr<MemoryBuffer> &result) {
   const ssize_t ChunkSize = 4096*4;
@@ -248,41 +238,36 @@ static error_code getMemoryBufferForStream(int FD,
   return error_code::success();
 }
 
-error_code MemoryBuffer::getFile(StringRef Filename,
+static error_code getFileAux(const char *Filename,
+                             OwningPtr<MemoryBuffer> &result, int64_t FileSize,
+                             bool RequiresNullTerminator);
+
+error_code MemoryBuffer::getFile(Twine Filename,
                                  OwningPtr<MemoryBuffer> &result,
                                  int64_t FileSize,
                                  bool RequiresNullTerminator) {
   // Ensure the path is null terminated.
-  SmallString<256> PathBuf(Filename.begin(), Filename.end());
-  return MemoryBuffer::getFile(PathBuf.c_str(), result, FileSize,
-                               RequiresNullTerminator);
+  SmallString<256> PathBuf;
+  StringRef NullTerminatedName = Filename.toNullTerminatedStringRef(PathBuf);
+  return getFileAux(NullTerminatedName.data(), result, FileSize,
+                    RequiresNullTerminator);
 }
 
-error_code MemoryBuffer::getFile(const char *Filename,
-                                 OwningPtr<MemoryBuffer> &result,
-                                 int64_t FileSize,
-                                 bool RequiresNullTerminator) {
-  // FIXME: Review if this check is unnecessary on windows as well.
-#ifdef LLVM_ON_WIN32
-  // First check that the "file" is not a directory
-  bool is_dir = false;
-  error_code err = sys::fs::is_directory(Filename, is_dir);
-  if (err)
-    return err;
-  if (is_dir)
-    return make_error_code(errc::is_a_directory);
-#endif
-
-  int OpenFlags = O_RDONLY;
-#ifdef O_BINARY
-  OpenFlags |= O_BINARY;  // Open input file in binary mode on win32.
-#endif
-  int FD = ::open(Filename, OpenFlags);
-  if (FD == -1)
-    return error_code(errno, posix_category());
-
-  error_code ret = getOpenFile(FD, Filename, result, FileSize, FileSize,
-                               0, RequiresNullTerminator);
+static error_code getOpenFileImpl(int FD, const char *Filename,
+                                  OwningPtr<MemoryBuffer> &Result,
+                                  uint64_t FileSize, uint64_t MapSize,
+                                  int64_t Offset, bool RequiresNullTerminator);
+
+static error_code getFileAux(const char *Filename,
+                             OwningPtr<MemoryBuffer> &result, int64_t FileSize,
+                             bool RequiresNullTerminator) {
+  int FD;
+  error_code EC = sys::fs::openFileForRead(Filename, FD);
+  if (EC)
+    return EC;
+
+  error_code ret = getOpenFileImpl(FD, Filename, result, FileSize, FileSize, 0,
+                                   RequiresNullTerminator);
   close(FD);
   return ret;
 }
@@ -295,7 +280,7 @@ static bool shouldUseMmap(int FD,
                           int PageSize) {
   // We don't use mmap for small files because this can severely fragment our
   // address space.
-  if (MapSize < 4096*4)
+  if (MapSize < 4 * 4096 || MapSize < (unsigned)PageSize)
     return false;
 
   if (!RequiresNullTerminator)
@@ -307,12 +292,11 @@ static bool shouldUseMmap(int FD,
   // FIXME: this chunk of code is duplicated, but it avoids a fstat when
   // RequiresNullTerminator = false and MapSize != -1.
   if (FileSize == size_t(-1)) {
-    struct stat FileInfo;
-    // TODO: This should use fstat64 when available.
-    if (fstat(FD, &FileInfo) == -1) {
-      return error_code(errno, posix_category());
-    }
-    FileSize = FileInfo.st_size;
+    sys::fs::file_status Status;
+    error_code EC = sys::fs::status(FD, Status);
+    if (EC)
+      return EC;
+    FileSize = Status.getSize();
   }
 
   // If we need a null terminator and the end of the map is inside the file,
@@ -322,6 +306,15 @@ static bool shouldUseMmap(int FD,
   if (End != FileSize)
     return false;
 
+#if defined(_WIN32) || defined(__CYGWIN__)
+  // Don't peek the next page if file is multiple of *physical* pagesize(4k)
+  // but is not multiple of AllocationGranularity(64k),
+  // when a null terminator is required.
+  // FIXME: It's not good to hardcode 4096 here. dwPageSize shows 4096.
+  if ((FileSize & (4096 - 1)) == 0)
+    return false;
+#endif
+
   // Don't try to map files that are exactly a multiple of the system page size
   // if we need a null terminator.
   if ((FileSize & (PageSize -1)) == 0)
@@ -330,11 +323,10 @@ static bool shouldUseMmap(int FD,
   return true;
 }
 
-error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
-                                     OwningPtr<MemoryBuffer> &result,
-                                     uint64_t FileSize, uint64_t MapSize,
-                                     int64_t Offset,
-                                     bool RequiresNullTerminator) {
+static error_code getOpenFileImpl(int FD, const char *Filename,
+                                  OwningPtr<MemoryBuffer> &result,
+                                  uint64_t FileSize, uint64_t MapSize,
+                                  int64_t Offset, bool RequiresNullTerminator) {
   static int PageSize = sys::process::get_self()->page_size();
 
   // Default is to map the full file.
@@ -342,20 +334,20 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
     // If we don't know the file size, use fstat to find out.  fstat on an open
     // file descriptor is cheaper than stat on a random path.
     if (FileSize == uint64_t(-1)) {
-      struct stat FileInfo;
-      // TODO: This should use fstat64 when available.
-      if (fstat(FD, &FileInfo) == -1) {
-        return error_code(errno, posix_category());
-      }
+      sys::fs::file_status Status;
+      error_code EC = sys::fs::status(FD, Status);
+      if (EC)
+        return EC;
 
       // If this not a file or a block device (e.g. it's a named pipe
       // or character device), we can't trust the size. Create the memory
       // buffer by copying off the stream.
-      if (!S_ISREG(FileInfo.st_mode) && !S_ISBLK(FileInfo.st_mode)) {
+      sys::fs::file_type Type = Status.type();
+      if (Type != sys::fs::file_type::regular_file &&
+          Type != sys::fs::file_type::block_file)
         return getMemoryBufferForStream(FD, Filename, result);
-      }
 
-      FileSize = FileInfo.st_size;
+      FileSize = Status.getSize();
     }
     MapSize = FileSize;
   }
@@ -411,6 +403,20 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
   return error_code::success();
 }
 
+error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
+                                     OwningPtr<MemoryBuffer> &Result,
+                                     uint64_t FileSize,
+                                     bool RequiresNullTerminator) {
+  return getOpenFileImpl(FD, Filename, Result, FileSize, FileSize, 0,
+                         RequiresNullTerminator);
+}
+
+error_code MemoryBuffer::getOpenFileSlice(int FD, const char *Filename,
+                                          OwningPtr<MemoryBuffer> &Result,
+                                          uint64_t MapSize, int64_t Offset) {
+  return getOpenFileImpl(FD, Filename, Result, -1, MapSize, Offset, false);
+}
+
 //===----------------------------------------------------------------------===//
 // MemoryBuffer::getSTDIN implementation.
 //===----------------------------------------------------------------------===//
@@ -420,7 +426,7 @@ error_code MemoryBuffer::getSTDIN(OwningPtr<MemoryBuffer> &result) {
   //
   // FIXME: That isn't necessarily true, we should try to mmap stdin and
   // fallback if it fails.
-  sys::Program::ChangeStdinToBinary();
+  sys::ChangeStdinToBinary();
 
   return getMemoryBufferForStream(0, "<stdin>", result);
 }
diff --git a/lib/Support/MemoryObject.cpp b/lib/Support/MemoryObject.cpp
index b20ab89..02b5b50 100644
--- a/lib/Support/MemoryObject.cpp
+++ b/lib/Support/MemoryObject.cpp
@@ -15,8 +15,7 @@ MemoryObject::~MemoryObject() {
 
 int MemoryObject::readBytes(uint64_t address,
                             uint64_t size,
-                            uint8_t* buf,
-                            uint64_t* copied) const {
+                            uint8_t* buf) const {
   uint64_t current = address;
   uint64_t limit = getBase() + getExtent();
 
@@ -30,8 +29,5 @@ int MemoryObject::readBytes(uint64_t address,
     current++;
   }
   
-  if (copied)
-    *copied = current - address;
-  
   return 0;
 }
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index d070375..c869b30 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -1,4 +1,4 @@
-//===-- Path.cpp - Implement OS Path Concept --------------------*- C++ -*-===//
+//===-- Path.cpp - Implement OS Path Concept ------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,80 +7,900 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This header file implements the operating system Path concept.
+//  This file implements the operating system Path API.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Path.h"
-#include "llvm/Config/config.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
-#include <cassert>
+#include <cctype>
+#include <cstdio>
 #include <cstring>
-#include <ostream>
-using namespace llvm;
-using namespace sys;
+#include <fcntl.h>
+
+#if !defined(_MSC_VER) && !defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <io.h>
+#endif
+
 namespace {
-using support::ulittle32_t;
+  using llvm::StringRef;
+  using llvm::sys::path::is_separator;
+
+#ifdef LLVM_ON_WIN32
+  const char *separators = "\\/";
+  const char  prefered_separator = '\\';
+#else
+  const char  separators = '/';
+  const char  prefered_separator = '/';
+#endif
+
+  StringRef find_first_component(StringRef path) {
+    // Look for this first component in the following order.
+    // * empty (in this case we return an empty string)
+    // * either C: or {//,\\}net.
+    // * {/,\}
+    // * {.,..}
+    // * {file,directory}name
+
+    if (path.empty())
+      return path;
+
+#ifdef LLVM_ON_WIN32
+    // C:
+    if (path.size() >= 2 && std::isalpha(static_cast<unsigned char>(path[0])) &&
+        path[1] == ':')
+      return path.substr(0, 2);
+#endif
+
+    // //net
+    if ((path.size() > 2) &&
+        is_separator(path[0]) &&
+        path[0] == path[1] &&
+        !is_separator(path[2])) {
+      // Find the next directory separator.
+      size_t end = path.find_first_of(separators, 2);
+      return path.substr(0, end);
+    }
+
+    // {/,\}
+    if (is_separator(path[0]))
+      return path.substr(0, 1);
+
+    if (path.startswith(".."))
+      return path.substr(0, 2);
+
+    if (path[0] == '.')
+      return path.substr(0, 1);
+
+    // * {file,directory}name
+    size_t end = path.find_first_of(separators);
+    return path.substr(0, end);
+  }
+
+  size_t filename_pos(StringRef str) {
+    if (str.size() == 2 &&
+        is_separator(str[0]) &&
+        str[0] == str[1])
+      return 0;
+
+    if (str.size() > 0 && is_separator(str[str.size() - 1]))
+      return str.size() - 1;
+
+    size_t pos = str.find_last_of(separators, str.size() - 1);
+
+#ifdef LLVM_ON_WIN32
+    if (pos == StringRef::npos)
+      pos = str.find_last_of(':', str.size() - 2);
+#endif
+
+    if (pos == StringRef::npos ||
+        (pos == 1 && is_separator(str[0])))
+      return 0;
+
+    return pos + 1;
+  }
+
+  size_t root_dir_start(StringRef str) {
+    // case "c:/"
+#ifdef LLVM_ON_WIN32
+    if (str.size() > 2 &&
+        str[1] == ':' &&
+        is_separator(str[2]))
+      return 2;
+#endif
+
+    // case "//"
+    if (str.size() == 2 &&
+        is_separator(str[0]) &&
+        str[0] == str[1])
+      return StringRef::npos;
+
+    // case "//net"
+    if (str.size() > 3 &&
+        is_separator(str[0]) &&
+        str[0] == str[1] &&
+        !is_separator(str[2])) {
+      return str.find_first_of(separators, 2);
+    }
+
+    // case "/"
+    if (str.size() > 0 && is_separator(str[0]))
+      return 0;
+
+    return StringRef::npos;
+  }
+
+  size_t parent_path_end(StringRef path) {
+    size_t end_pos = filename_pos(path);
+
+    bool filename_was_sep = path.size() > 0 && is_separator(path[end_pos]);
+
+    // Skip separators except for root dir.
+    size_t root_dir_pos = root_dir_start(path.substr(0, end_pos));
+
+    while(end_pos > 0 &&
+          (end_pos - 1) != root_dir_pos &&
+          is_separator(path[end_pos - 1]))
+      --end_pos;
+
+    if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
+      return StringRef::npos;
+
+    return end_pos;
+  }
+} // end unnamed namespace
+
+enum FSEntity {
+  FS_Dir,
+  FS_File,
+  FS_Name
+};
+
+// Implemented in Unix/Path.inc and Windows/Path.inc.
+static llvm::error_code
+createUniqueEntity(const llvm::Twine &Model, int &ResultFD,
+                   llvm::SmallVectorImpl<char> &ResultPath,
+                   bool MakeAbsolute, unsigned Mode, FSEntity Type);
+
+namespace llvm {
+namespace sys  {
+namespace path {
+
+const_iterator begin(StringRef path) {
+  const_iterator i;
+  i.Path      = path;
+  i.Component = find_first_component(path);
+  i.Position  = 0;
+  return i;
 }
 
-//===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only TRULY operating system
-//===          independent code.
-//===----------------------------------------------------------------------===//
+const_iterator end(StringRef path) {
+  const_iterator i;
+  i.Path      = path;
+  i.Position  = path.size();
+  return i;
+}
+
+const_iterator &const_iterator::operator++() {
+  assert(Position < Path.size() && "Tried to increment past end!");
+
+  // Increment Position to past the current component
+  Position += Component.size();
+
+  // Check for end.
+  if (Position == Path.size()) {
+    Component = StringRef();
+    return *this;
+  }
+
+  // Both POSIX and Windows treat paths that begin with exactly two separators
+  // specially.
+  bool was_net = Component.size() > 2 &&
+    is_separator(Component[0]) &&
+    Component[1] == Component[0] &&
+    !is_separator(Component[2]);
+
+  // Handle separators.
+  if (is_separator(Path[Position])) {
+    // Root dir.
+    if (was_net
+#ifdef LLVM_ON_WIN32
+        // c:/
+        || Component.endswith(":")
+#endif
+        ) {
+      Component = Path.substr(Position, 1);
+      return *this;
+    }
+
+    // Skip extra separators.
+    while (Position != Path.size() &&
+           is_separator(Path[Position])) {
+      ++Position;
+    }
+
+    // Treat trailing '/' as a '.'.
+    if (Position == Path.size()) {
+      --Position;
+      Component = ".";
+      return *this;
+    }
+  }
+
+  // Find next component.
+  size_t end_pos = Path.find_first_of(separators, Position);
+  Component = Path.slice(Position, end_pos);
+
+  return *this;
+}
+
+const_iterator &const_iterator::operator--() {
+  // If we're at the end and the previous char was a '/', return '.'.
+  if (Position == Path.size() &&
+      Path.size() > 1 &&
+      is_separator(Path[Position - 1])
+#ifdef LLVM_ON_WIN32
+      && Path[Position - 2] != ':'
+#endif
+      ) {
+    --Position;
+    Component = ".";
+    return *this;
+  }
+
+  // Skip separators unless it's the root directory.
+  size_t root_dir_pos = root_dir_start(Path);
+  size_t end_pos = Position;
+
+  while(end_pos > 0 &&
+        (end_pos - 1) != root_dir_pos &&
+        is_separator(Path[end_pos - 1]))
+    --end_pos;
+
+  // Find next separator.
+  size_t start_pos = filename_pos(Path.substr(0, end_pos));
+  Component = Path.slice(start_pos, end_pos);
+  Position = start_pos;
+  return *this;
+}
+
+bool const_iterator::operator==(const const_iterator &RHS) const {
+  return Path.begin() == RHS.Path.begin() &&
+         Position == RHS.Position;
+}
+
+bool const_iterator::operator!=(const const_iterator &RHS) const {
+  return !(*this == RHS);
+}
+
+ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
+  return Position - RHS.Position;
+}
+
+const StringRef root_path(StringRef path) {
+  const_iterator b = begin(path),
+                 pos = b,
+                 e = end(path);
+  if (b != e) {
+    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+    bool has_drive =
+#ifdef LLVM_ON_WIN32
+      b->endswith(":");
+#else
+      false;
+#endif
+
+    if (has_net || has_drive) {
+      if ((++pos != e) && is_separator((*pos)[0])) {
+        // {C:/,//net/}, so get the first two components.
+        return path.substr(0, b->size() + pos->size());
+      } else {
+        // just {C:,//net}, return the first component.
+        return *b;
+      }
+    }
+
+    // POSIX style root directory.
+    if (is_separator((*b)[0])) {
+      return *b;
+    }
+  }
+
+  return StringRef();
+}
+
+const StringRef root_name(StringRef path) {
+  const_iterator b = begin(path),
+                 e = end(path);
+  if (b != e) {
+    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+    bool has_drive =
+#ifdef LLVM_ON_WIN32
+      b->endswith(":");
+#else
+      false;
+#endif
+
+    if (has_net || has_drive) {
+      // just {C:,//net}, return the first component.
+      return *b;
+    }
+  }
+
+  // No path or no name.
+  return StringRef();
+}
+
+const StringRef root_directory(StringRef path) {
+  const_iterator b = begin(path),
+                 pos = b,
+                 e = end(path);
+  if (b != e) {
+    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
+    bool has_drive =
+#ifdef LLVM_ON_WIN32
+      b->endswith(":");
+#else
+      false;
+#endif
+
+    if ((has_net || has_drive) &&
+        // {C:,//net}, skip to the next component.
+        (++pos != e) && is_separator((*pos)[0])) {
+      return *pos;
+    }
+
+    // POSIX style root directory.
+    if (!has_net && is_separator((*b)[0])) {
+      return *b;
+    }
+  }
+
+  // No path or no root.
+  return StringRef();
+}
+
+const StringRef relative_path(StringRef path) {
+  StringRef root = root_path(path);
+  return path.substr(root.size());
+}
+
+void append(SmallVectorImpl<char> &path, const Twine &a,
+                                         const Twine &b,
+                                         const Twine &c,
+                                         const Twine &d) {
+  SmallString<32> a_storage;
+  SmallString<32> b_storage;
+  SmallString<32> c_storage;
+  SmallString<32> d_storage;
+
+  SmallVector<StringRef, 4> components;
+  if (!a.isTriviallyEmpty()) components.push_back(a.toStringRef(a_storage));
+  if (!b.isTriviallyEmpty()) components.push_back(b.toStringRef(b_storage));
+  if (!c.isTriviallyEmpty()) components.push_back(c.toStringRef(c_storage));
+  if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage));
+
+  for (SmallVectorImpl<StringRef>::const_iterator i = components.begin(),
+                                                  e = components.end();
+                                                  i != e; ++i) {
+    bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]);
+    bool component_has_sep = !i->empty() && is_separator((*i)[0]);
+    bool is_root_name = has_root_name(*i);
+
+    if (path_has_sep) {
+      // Strip separators from beginning of component.
+      size_t loc = i->find_first_not_of(separators);
+      StringRef c = i->substr(loc);
+
+      // Append it.
+      path.append(c.begin(), c.end());
+      continue;
+    }
+
+    if (!component_has_sep && !(path.empty() || is_root_name)) {
+      // Add a separator.
+      path.push_back(prefered_separator);
+    }
+
+    path.append(i->begin(), i->end());
+  }
+}
+
+void append(SmallVectorImpl<char> &path,
+            const_iterator begin, const_iterator end) {
+  for (; begin != end; ++begin)
+    path::append(path, *begin);
+}
+
+const StringRef parent_path(StringRef path) {
+  size_t end_pos = parent_path_end(path);
+  if (end_pos == StringRef::npos)
+    return StringRef();
+  else
+    return path.substr(0, end_pos);
+}
+
+void remove_filename(SmallVectorImpl<char> &path) {
+  size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()));
+  if (end_pos != StringRef::npos)
+    path.set_size(end_pos);
+}
+
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
+  StringRef p(path.begin(), path.size());
+  SmallString<32> ext_storage;
+  StringRef ext = extension.toStringRef(ext_storage);
+
+  // Erase existing extension.
+  size_t pos = p.find_last_of('.');
+  if (pos != StringRef::npos && pos >= filename_pos(p))
+    path.set_size(pos);
 
-bool Path::operator==(const Path &that) const {
-  return path == that.path;
+  // Append '.' if needed.
+  if (ext.size() > 0 && ext[0] != '.')
+    path.push_back('.');
+
+  // Append extension.
+  path.append(ext.begin(), ext.end());
+}
+
+void native(const Twine &path, SmallVectorImpl<char> &result) {
+  assert((!path.isSingleStringRef() ||
+          path.getSingleStringRef().data() != result.data()) &&
+         "path and result are not allowed to overlap!");
+  // Clear result.
+  result.clear();
+  path.toVector(result);
+  native(result);
+}
+
+void native(SmallVectorImpl<char> &path) {
+#ifdef LLVM_ON_WIN32
+  std::replace(path.begin(), path.end(), '/', '\\');
+#endif
 }
 
-bool Path::operator<(const Path& that) const {
-  return path < that.path;
+const StringRef filename(StringRef path) {
+  return *(--end(path));
 }
 
-LLVMFileType
-sys::IdentifyFileType(const char *magic, unsigned length) {
-  assert(magic && "Invalid magic number string");
-  assert(length >=4 && "Invalid magic number length");
-  switch ((unsigned char)magic[0]) {
+const StringRef stem(StringRef path) {
+  StringRef fname = filename(path);
+  size_t pos = fname.find_last_of('.');
+  if (pos == StringRef::npos)
+    return fname;
+  else
+    if ((fname.size() == 1 && fname == ".") ||
+        (fname.size() == 2 && fname == ".."))
+      return fname;
+    else
+      return fname.substr(0, pos);
+}
+
+const StringRef extension(StringRef path) {
+  StringRef fname = filename(path);
+  size_t pos = fname.find_last_of('.');
+  if (pos == StringRef::npos)
+    return StringRef();
+  else
+    if ((fname.size() == 1 && fname == ".") ||
+        (fname.size() == 2 && fname == ".."))
+      return StringRef();
+    else
+      return fname.substr(pos);
+}
+
+bool is_separator(char value) {
+  switch(value) {
+#ifdef LLVM_ON_WIN32
+    case '\\': // fall through
+#endif
+    case '/': return true;
+    default: return false;
+  }
+}
+
+void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result) {
+  result.clear();
+
+#ifdef __APPLE__
+  // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
+  int ConfName = erasedOnReboot? _CS_DARWIN_USER_TEMP_DIR
+                               : _CS_DARWIN_USER_CACHE_DIR;
+  size_t ConfLen = confstr(ConfName, 0, 0);
+  if (ConfLen > 0) {
+    do {
+      result.resize(ConfLen);
+      ConfLen = confstr(ConfName, result.data(), result.size());
+    } while (ConfLen > 0 && ConfLen != result.size());
+
+    if (ConfLen > 0) {
+      assert(result.back() == 0);
+      result.pop_back();
+      return;
+    }
+
+    result.clear();
+  }
+#endif
+
+  // Check whether the temporary directory is specified by an environment
+  // variable.
+  const char *EnvironmentVariable;
+#ifdef LLVM_ON_WIN32
+  EnvironmentVariable = "TEMP";
+#else
+  EnvironmentVariable = "TMPDIR";
+#endif
+  if (char *RequestedDir = getenv(EnvironmentVariable)) {
+    result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+    return;
+  }
+
+  // Fall back to a system default.
+  const char *DefaultResult;
+#ifdef LLVM_ON_WIN32
+  (void)erasedOnReboot;
+  DefaultResult = "C:\\TEMP";
+#else
+  if (erasedOnReboot)
+    DefaultResult = "/tmp";
+  else
+    DefaultResult = "/var/tmp";
+#endif
+  result.append(DefaultResult, DefaultResult + strlen(DefaultResult));
+}
+
+bool has_root_name(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !root_name(p).empty();
+}
+
+bool has_root_directory(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !root_directory(p).empty();
+}
+
+bool has_root_path(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !root_path(p).empty();
+}
+
+bool has_relative_path(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !relative_path(p).empty();
+}
+
+bool has_filename(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !filename(p).empty();
+}
+
+bool has_parent_path(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !parent_path(p).empty();
+}
+
+bool has_stem(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !stem(p).empty();
+}
+
+bool has_extension(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  return !extension(p).empty();
+}
+
+bool is_absolute(const Twine &path) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  bool rootDir = has_root_directory(p),
+#ifdef LLVM_ON_WIN32
+       rootName = has_root_name(p);
+#else
+       rootName = true;
+#endif
+
+  return rootDir && rootName;
+}
+
+bool is_relative(const Twine &path) {
+  return !is_absolute(path);
+}
+
+} // end namespace path
+
+namespace fs {
+
+error_code getUniqueID(const Twine Path, UniqueID &Result) {
+  file_status Status;
+  error_code EC = status(Path, Status);
+  if (EC)
+    return EC;
+  Result = Status.getUniqueID();
+  return error_code::success();
+}
+
+error_code createUniqueFile(const Twine &Model, int &ResultFd,
+                            SmallVectorImpl<char> &ResultPath, unsigned Mode) {
+  return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File);
+}
+
+error_code createUniqueFile(const Twine &Model,
+                            SmallVectorImpl<char> &ResultPath) {
+  int Dummy;
+  return createUniqueEntity(Model, Dummy, ResultPath, false, 0, FS_Name);
+}
+
+static error_code createTemporaryFile(const Twine &Model, int &ResultFD,
+                                      llvm::SmallVectorImpl<char> &ResultPath,
+                                      FSEntity Type) {
+  SmallString<128> Storage;
+  StringRef P = Model.toNullTerminatedStringRef(Storage);
+  assert(P.find_first_of(separators) == StringRef::npos &&
+         "Model must be a simple filename.");
+  // Use P.begin() so that createUniqueEntity doesn't need to recreate Storage.
+  return createUniqueEntity(P.begin(), ResultFD, ResultPath,
+                            true, owner_read | owner_write, Type);
+}
+
+static error_code
+createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD,
+                    llvm::SmallVectorImpl<char> &ResultPath,
+                    FSEntity Type) {
+  const char *Middle = Suffix.empty() ? "-%%%%%%" : "-%%%%%%.";
+  return createTemporaryFile(Prefix + Middle + Suffix, ResultFD, ResultPath,
+                             Type);
+}
+
+
+error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+                               int &ResultFD,
+                               SmallVectorImpl<char> &ResultPath) {
+  return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File);
+}
+
+error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix,
+                               SmallVectorImpl<char> &ResultPath) {
+  int Dummy;
+  return createTemporaryFile(Prefix, Suffix, Dummy, ResultPath, FS_Name);
+}
+
+
+// This is a mkdtemp with a different pattern. We use createUniqueEntity mostly
+// for consistency. We should try using mkdtemp.
+error_code createUniqueDirectory(const Twine &Prefix,
+                                 SmallVectorImpl<char> &ResultPath) {
+  int Dummy;
+  return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath,
+                            true, 0, FS_Dir);
+}
+
+error_code make_absolute(SmallVectorImpl<char> &path) {
+  StringRef p(path.data(), path.size());
+
+  bool rootDirectory = path::has_root_directory(p),
+#ifdef LLVM_ON_WIN32
+       rootName = path::has_root_name(p);
+#else
+       rootName = true;
+#endif
+
+  // Already absolute.
+  if (rootName && rootDirectory)
+    return error_code::success();
+
+  // All of the following conditions will need the current directory.
+  SmallString<128> current_dir;
+  if (error_code ec = current_path(current_dir)) return ec;
+
+  // Relative path. Prepend the current directory.
+  if (!rootName && !rootDirectory) {
+    // Append path to the current directory.
+    path::append(current_dir, p);
+    // Set path to the result.
+    path.swap(current_dir);
+    return error_code::success();
+  }
+
+  if (!rootName && rootDirectory) {
+    StringRef cdrn = path::root_name(current_dir);
+    SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
+    path::append(curDirRootName, p);
+    // Set path to the result.
+    path.swap(curDirRootName);
+    return error_code::success();
+  }
+
+  if (rootName && !rootDirectory) {
+    StringRef pRootName      = path::root_name(p);
+    StringRef bRootDirectory = path::root_directory(current_dir);
+    StringRef bRelativePath  = path::relative_path(current_dir);
+    StringRef pRelativePath  = path::relative_path(p);
+
+    SmallString<128> res;
+    path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
+    path.swap(res);
+    return error_code::success();
+  }
+
+  llvm_unreachable("All rootName and rootDirectory combinations should have "
+                   "occurred above!");
+}
+
+error_code create_directories(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
+
+  StringRef parent = path::parent_path(p);
+  if (!parent.empty()) {
+    bool parent_exists;
+    if (error_code ec = fs::exists(parent, parent_exists)) return ec;
+
+    if (!parent_exists)
+      if (error_code ec = create_directories(parent, existed)) return ec;
+  }
+
+  return create_directory(p, existed);
+}
+
+bool exists(file_status status) {
+  return status_known(status) && status.type() != file_type::file_not_found;
+}
+
+bool status_known(file_status s) {
+  return s.type() != file_type::status_error;
+}
+
+bool is_directory(file_status status) {
+  return status.type() == file_type::directory_file;
+}
+
+error_code is_directory(const Twine &path, bool &result) {
+  file_status st;
+  if (error_code ec = status(path, st))
+    return ec;
+  result = is_directory(st);
+  return error_code::success();
+}
+
+bool is_regular_file(file_status status) {
+  return status.type() == file_type::regular_file;
+}
+
+error_code is_regular_file(const Twine &path, bool &result) {
+  file_status st;
+  if (error_code ec = status(path, st))
+    return ec;
+  result = is_regular_file(st);
+  return error_code::success();
+}
+
+bool is_symlink(file_status status) {
+  return status.type() == file_type::symlink_file;
+}
+
+error_code is_symlink(const Twine &path, bool &result) {
+  file_status st;
+  if (error_code ec = status(path, st))
+    return ec;
+  result = is_symlink(st);
+  return error_code::success();
+}
+
+bool is_other(file_status status) {
+  return exists(status) &&
+         !is_regular_file(status) &&
+         !is_directory(status) &&
+         !is_symlink(status);
+}
+
+void directory_entry::replace_filename(const Twine &filename, file_status st) {
+  SmallString<128> path(Path.begin(), Path.end());
+  path::remove_filename(path);
+  path::append(path, filename);
+  Path = path.str();
+  Status = st;
+}
+
+error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
+  SmallString<32>  MagicStorage;
+  StringRef Magic = magic.toStringRef(MagicStorage);
+  SmallString<32> Buffer;
+
+  if (error_code ec = get_magic(path, Magic.size(), Buffer)) {
+    if (ec == errc::value_too_large) {
+      // Magic.size() > file_size(Path).
+      result = false;
+      return error_code::success();
+    }
+    return ec;
+  }
+
+  result = Magic == Buffer;
+  return error_code::success();
+}
+
+/// @brief Identify the magic in magic.
+  file_magic identify_magic(StringRef Magic) {
+  if (Magic.size() < 4)
+    return file_magic::unknown;
+  switch ((unsigned char)Magic[0]) {
+    case 0x00: {
+      // COFF short import library file
+      if (Magic[1] == (char)0x00 && Magic[2] == (char)0xff &&
+          Magic[3] == (char)0xff)
+        return file_magic::coff_import_library;
+      // Windows resource file
+      const char Expected[] = { 0, 0, 0, 0, '\x20', 0, 0, 0, '\xff' };
+      if (Magic.size() >= sizeof(Expected) &&
+          memcmp(Magic.data(), Expected, sizeof(Expected)) == 0)
+        return file_magic::windows_resource;
+      // 0x0000 = COFF unknown machine type
+      if (Magic[1] == 0)
+        return file_magic::coff_object;
+      break;
+    }
     case 0xDE:  // 0x0B17C0DE = BC wraper
-      if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 &&
-          magic[3] == (char)0x0B)
-        return Bitcode_FileType;
+      if (Magic[1] == (char)0xC0 && Magic[2] == (char)0x17 &&
+          Magic[3] == (char)0x0B)
+        return file_magic::bitcode;
       break;
     case 'B':
-      if (magic[1] == 'C' && magic[2] == (char)0xC0 && magic[3] == (char)0xDE)
-        return Bitcode_FileType;
+      if (Magic[1] == 'C' && Magic[2] == (char)0xC0 && Magic[3] == (char)0xDE)
+        return file_magic::bitcode;
       break;
     case '!':
-      if (length >= 8)
-        if (memcmp(magic,"!<arch>\n",8) == 0)
-          return Archive_FileType;
+      if (Magic.size() >= 8)
+        if (memcmp(Magic.data(),"!<arch>\n",8) == 0)
+          return file_magic::archive;
       break;
 
     case '\177':
-      if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') {
-        bool Data2MSB = magic[5] == 2;
+      if (Magic.size() >= 18 && Magic[1] == 'E' && Magic[2] == 'L' &&
+          Magic[3] == 'F') {
+        bool Data2MSB = Magic[5] == 2;
         unsigned high = Data2MSB ? 16 : 17;
         unsigned low  = Data2MSB ? 17 : 16;
-        if (length >= 18 && magic[high] == 0)
-          switch (magic[low]) {
+        if (Magic[high] == 0)
+          switch (Magic[low]) {
             default: break;
-            case 1: return ELF_Relocatable_FileType;
-            case 2: return ELF_Executable_FileType;
-            case 3: return ELF_SharedObject_FileType;
-            case 4: return ELF_Core_FileType;
+            case 1: return file_magic::elf_relocatable;
+            case 2: return file_magic::elf_executable;
+            case 3: return file_magic::elf_shared_object;
+            case 4: return file_magic::elf_core;
           }
       }
       break;
 
     case 0xCA:
-      if (magic[1] == char(0xFE) && magic[2] == char(0xBA) &&
-          magic[3] == char(0xBE)) {
+      if (Magic[1] == char(0xFE) && Magic[2] == char(0xBA) &&
+          Magic[3] == char(0xBE)) {
         // This is complicated by an overlap with Java class files.
         // See the Mach-O section in /usr/share/file/magic for details.
-        if (length >= 8 && magic[7] < 43)
-          // FIXME: Universal Binary of any type.
-          return Mach_O_DynamicallyLinkedSharedLib_FileType;
+        if (Magic.size() >= 8 && Magic[7] < 43)
+          return file_magic::macho_universal_binary;
       }
       break;
 
@@ -91,29 +911,29 @@ sys::IdentifyFileType(const char *magic, unsigned length) {
     case 0xCE:
     case 0xCF: {
       uint16_t type = 0;
-      if (magic[0] == char(0xFE) && magic[1] == char(0xED) &&
-          magic[2] == char(0xFA) &&
-          (magic[3] == char(0xCE) || magic[3] == char(0xCF))) {
+      if (Magic[0] == char(0xFE) && Magic[1] == char(0xED) &&
+          Magic[2] == char(0xFA) &&
+          (Magic[3] == char(0xCE) || Magic[3] == char(0xCF))) {
         /* Native endian */
-        if (length >= 16) type = magic[14] << 8 | magic[15];
-      } else if ((magic[0] == char(0xCE) || magic[0] == char(0xCF)) &&
-                 magic[1] == char(0xFA) && magic[2] == char(0xED) &&
-                 magic[3] == char(0xFE)) {
+        if (Magic.size() >= 16) type = Magic[14] << 8 | Magic[15];
+      } else if ((Magic[0] == char(0xCE) || Magic[0] == char(0xCF)) &&
+                 Magic[1] == char(0xFA) && Magic[2] == char(0xED) &&
+                 Magic[3] == char(0xFE)) {
         /* Reverse endian */
-        if (length >= 14) type = magic[13] << 8 | magic[12];
+        if (Magic.size() >= 14) type = Magic[13] << 8 | Magic[12];
       }
       switch (type) {
         default: break;
-        case 1: return Mach_O_Object_FileType;
-        case 2: return Mach_O_Executable_FileType;
-        case 3: return Mach_O_FixedVirtualMemorySharedLib_FileType;
-        case 4: return Mach_O_Core_FileType;
-        case 5: return Mach_O_PreloadExecutable_FileType;
-        case 6: return Mach_O_DynamicallyLinkedSharedLib_FileType;
-        case 7: return Mach_O_DynamicLinker_FileType;
-        case 8: return Mach_O_Bundle_FileType;
-        case 9: return Mach_O_DynamicallyLinkedSharedLibStub_FileType;
-        case 10: return Mach_O_DSYMCompanion_FileType;
+        case 1: return file_magic::macho_object;
+        case 2: return file_magic::macho_executable;
+        case 3: return file_magic::macho_fixed_virtual_memory_shared_lib;
+        case 4: return file_magic::macho_core;
+        case 5: return file_magic::macho_preload_executable;
+        case 6: return file_magic::macho_dynamically_linked_shared_lib;
+        case 7: return file_magic::macho_dynamic_linker;
+        case 8: return file_magic::macho_bundle;
+        case 9: return file_magic::macho_dynamic_linker;
+        case 10: return file_magic::macho_dsym_companion;
       }
       break;
     }
@@ -123,170 +943,94 @@ sys::IdentifyFileType(const char *magic, unsigned length) {
     case 0x66: // MPS R4000 Windows
     case 0x50: // mc68K
     case 0x4c: // 80386 Windows
-      if (magic[1] == 0x01)
-        return COFF_FileType;
+      if (Magic[1] == 0x01)
+        return file_magic::coff_object;
 
     case 0x90: // PA-RISC Windows
     case 0x68: // mc68K Windows
-      if (magic[1] == 0x02)
-        return COFF_FileType;
+      if (Magic[1] == 0x02)
+        return file_magic::coff_object;
       break;
 
     case 0x4d: // Possible MS-DOS stub on Windows PE file
-      if (magic[1] == 0x5a) {
-        uint32_t off = *reinterpret_cast<const ulittle32_t *>(magic + 0x3c);
+      if (Magic[1] == 0x5a) {
+        uint32_t off =
+          *reinterpret_cast<const support::ulittle32_t*>(Magic.data() + 0x3c);
         // PE/COFF file, either EXE or DLL.
-        if (off < length && memcmp(magic + off, "PE\0\0",4) == 0)
-          return COFF_FileType;
+        if (off < Magic.size() && memcmp(Magic.data() + off, "PE\0\0",4) == 0)
+          return file_magic::pecoff_executable;
       }
       break;
 
     case 0x64: // x86-64 Windows.
-      if (magic[1] == char(0x86))
-        return COFF_FileType;
+      if (Magic[1] == char(0x86))
+        return file_magic::coff_object;
       break;
 
     default:
       break;
   }
-  return Unknown_FileType;
+  return file_magic::unknown;
 }
 
-bool
-Path::isArchive() const {
-  fs::file_magic type;
-  if (fs::identify_magic(str(), type))
-    return false;
-  return type == fs::file_magic::archive;
-}
+error_code identify_magic(const Twine &path, file_magic &result) {
+  SmallString<32> Magic;
+  error_code ec = get_magic(path, Magic.capacity(), Magic);
+  if (ec && ec != errc::value_too_large)
+    return ec;
 
-bool
-Path::isDynamicLibrary() const {
-  fs::file_magic type;
-  if (fs::identify_magic(str(), type))
-    return false;
-  switch (type) {
-    default: return false;
-    case fs::file_magic::macho_fixed_virtual_memory_shared_lib:
-    case fs::file_magic::macho_dynamically_linked_shared_lib:
-    case fs::file_magic::macho_dynamically_linked_shared_lib_stub:
-    case fs::file_magic::elf_shared_object:
-    case fs::file_magic::pecoff_executable:  return true;
-  }
+  result = identify_magic(Magic);
+  return error_code::success();
 }
 
-bool
-Path::isObjectFile() const {
-  fs::file_magic type;
-  if (fs::identify_magic(str(), type) || type == fs::file_magic::unknown)
-    return false;
-  return true;
-}
-
-Path
-Path::FindLibrary(std::string& name) {
-  std::vector<sys::Path> LibPaths;
-  GetSystemLibraryPaths(LibPaths);
-  for (unsigned i = 0; i < LibPaths.size(); ++i) {
-    sys::Path FullPath(LibPaths[i]);
-    FullPath.appendComponent("lib" + name + LTDL_SHLIB_EXT);
-    if (FullPath.isDynamicLibrary())
-      return FullPath;
-    FullPath.eraseSuffix();
-    FullPath.appendSuffix("a");
-    if (FullPath.isArchive())
-      return FullPath;
+namespace {
+error_code remove_all_r(StringRef path, file_type ft, uint32_t &count) {
+  if (ft == file_type::directory_file) {
+    // This code would be a lot better with exceptions ;/.
+    error_code ec;
+    directory_iterator i(path, ec);
+    if (ec) return ec;
+    for (directory_iterator e; i != e; i.increment(ec)) {
+      if (ec) return ec;
+      file_status st;
+      if (error_code ec = i->status(st)) return ec;
+      if (error_code ec = remove_all_r(i->path(), st.type(), count)) return ec;
+    }
+    bool obviously_this_exists;
+    if (error_code ec = remove(path, obviously_this_exists)) return ec;
+    assert(obviously_this_exists);
+    ++count; // Include the directory itself in the items removed.
+  } else {
+    bool obviously_this_exists;
+    if (error_code ec = remove(path, obviously_this_exists)) return ec;
+    assert(obviously_this_exists);
+    ++count;
   }
-  return sys::Path();
-}
 
-StringRef Path::GetDLLSuffix() {
-  return &(LTDL_SHLIB_EXT[1]);
+  return error_code::success();
 }
+} // end unnamed namespace
 
-void
-Path::appendSuffix(StringRef suffix) {
-  if (!suffix.empty()) {
-    path.append(".");
-    path.append(suffix);
-  }
-}
-
-bool
-Path::isBitcodeFile() const {
-  fs::file_magic type;
-  if (fs::identify_magic(str(), type))
-    return false;
-  return type == fs::file_magic::bitcode;
-}
-
-bool Path::hasMagicNumber(StringRef Magic) const {
-  std::string actualMagic;
-  if (getMagicNumber(actualMagic, static_cast<unsigned>(Magic.size())))
-    return Magic == actualMagic;
-  return false;
-}
-
-static void getPathList(const char*path, std::vector<Path>& Paths) {
-  const char* at = path;
-  const char* delim = strchr(at, PathSeparator);
-  Path tmpPath;
-  while (delim != 0) {
-    std::string tmp(at, size_t(delim-at));
-    if (tmpPath.set(tmp))
-      if (tmpPath.canRead())
-        Paths.push_back(tmpPath);
-    at = delim + 1;
-    delim = strchr(at, PathSeparator);
-  }
+error_code remove_all(const Twine &path, uint32_t &num_removed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toStringRef(path_storage);
 
-  if (*at != 0)
-    if (tmpPath.set(std::string(at)))
-      if (tmpPath.canRead())
-        Paths.push_back(tmpPath);
+  file_status fs;
+  if (error_code ec = status(path, fs))
+    return ec;
+  num_removed = 0;
+  return remove_all_r(p, fs.type(), num_removed);
 }
 
-static StringRef getDirnameCharSep(StringRef path, const char *Sep) {
-  assert(Sep[0] != '\0' && Sep[1] == '\0' &&
-         "Sep must be a 1-character string literal.");
-  if (path.empty())
-    return ".";
-
-  // If the path is all slashes, return a single slash.
-  // Otherwise, remove all trailing slashes.
-
-  signed pos = static_cast<signed>(path.size()) - 1;
-
-  while (pos >= 0 && path[pos] == Sep[0])
-    --pos;
-
-  if (pos < 0)
-    return path[0] == Sep[0] ? Sep : ".";
-
-  // Any slashes left?
-  signed i = 0;
-
-  while (i < pos && path[i] != Sep[0])
-    ++i;
-
-  if (i == pos) // No slashes?  Return "."
-    return ".";
-
-  // There is at least one slash left.  Remove all trailing non-slashes.
-  while (pos >= 0 && path[pos] != Sep[0])
-    --pos;
-
-  // Remove any trailing slashes.
-  while (pos >= 0 && path[pos] == Sep[0])
-    --pos;
-
-  if (pos < 0)
-    return path[0] == Sep[0] ? Sep : ".";
-
-  return path.substr(0, pos+1);
+error_code directory_entry::status(file_status &result) const {
+  return fs::status(Path, result);
 }
 
-// Include the truly platform-specific parts of this class.
+} // end namespace fs
+} // end namespace sys
+} // end namespace llvm
+
+// Include the truly platform-specific parts.
 #if defined(LLVM_ON_UNIX)
 #include "Unix/Path.inc"
 #endif
diff --git a/lib/Support/PathV2.cpp b/lib/Support/PathV2.cpp
deleted file mode 100644
index ac53a9e9..0000000
--- a/lib/Support/PathV2.cpp
+++ /dev/null
@@ -1,949 +0,0 @@
-//===-- PathV2.cpp - Implement OS Path Concept ------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//  This file implements the operating system PathV2 API.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/PathV2.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include <cctype>
-#include <cstdio>
-#include <cstring>
-#ifdef __APPLE__
-#include <unistd.h>
-#endif
-
-namespace {
-  using llvm::StringRef;
-  using llvm::sys::path::is_separator;
-
-#ifdef LLVM_ON_WIN32
-  const char *separators = "\\/";
-  const char  prefered_separator = '\\';
-#else
-  const char  separators = '/';
-  const char  prefered_separator = '/';
-#endif
-
-  StringRef find_first_component(StringRef path) {
-    // Look for this first component in the following order.
-    // * empty (in this case we return an empty string)
-    // * either C: or {//,\\}net.
-    // * {/,\}
-    // * {.,..}
-    // * {file,directory}name
-
-    if (path.empty())
-      return path;
-
-#ifdef LLVM_ON_WIN32
-    // C:
-    if (path.size() >= 2 && std::isalpha(static_cast<unsigned char>(path[0])) &&
-        path[1] == ':')
-      return path.substr(0, 2);
-#endif
-
-    // //net
-    if ((path.size() > 2) &&
-        is_separator(path[0]) &&
-        path[0] == path[1] &&
-        !is_separator(path[2])) {
-      // Find the next directory separator.
-      size_t end = path.find_first_of(separators, 2);
-      return path.substr(0, end);
-    }
-
-    // {/,\}
-    if (is_separator(path[0]))
-      return path.substr(0, 1);
-
-    if (path.startswith(".."))
-      return path.substr(0, 2);
-
-    if (path[0] == '.')
-      return path.substr(0, 1);
-
-    // * {file,directory}name
-    size_t end = path.find_first_of(separators, 2);
-    return path.substr(0, end);
-  }
-
-  size_t filename_pos(StringRef str) {
-    if (str.size() == 2 &&
-        is_separator(str[0]) &&
-        str[0] == str[1])
-      return 0;
-
-    if (str.size() > 0 && is_separator(str[str.size() - 1]))
-      return str.size() - 1;
-
-    size_t pos = str.find_last_of(separators, str.size() - 1);
-
-#ifdef LLVM_ON_WIN32
-    if (pos == StringRef::npos)
-      pos = str.find_last_of(':', str.size() - 2);
-#endif
-
-    if (pos == StringRef::npos ||
-        (pos == 1 && is_separator(str[0])))
-      return 0;
-
-    return pos + 1;
-  }
-
-  size_t root_dir_start(StringRef str) {
-    // case "c:/"
-#ifdef LLVM_ON_WIN32
-    if (str.size() > 2 &&
-        str[1] == ':' &&
-        is_separator(str[2]))
-      return 2;
-#endif
-
-    // case "//"
-    if (str.size() == 2 &&
-        is_separator(str[0]) &&
-        str[0] == str[1])
-      return StringRef::npos;
-
-    // case "//net"
-    if (str.size() > 3 &&
-        is_separator(str[0]) &&
-        str[0] == str[1] &&
-        !is_separator(str[2])) {
-      return str.find_first_of(separators, 2);
-    }
-
-    // case "/"
-    if (str.size() > 0 && is_separator(str[0]))
-      return 0;
-
-    return StringRef::npos;
-  }
-
-  size_t parent_path_end(StringRef path) {
-    size_t end_pos = filename_pos(path);
-
-    bool filename_was_sep = path.size() > 0 && is_separator(path[end_pos]);
-
-    // Skip separators except for root dir.
-    size_t root_dir_pos = root_dir_start(path.substr(0, end_pos));
-
-    while(end_pos > 0 &&
-          (end_pos - 1) != root_dir_pos &&
-          is_separator(path[end_pos - 1]))
-      --end_pos;
-
-    if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
-      return StringRef::npos;
-
-    return end_pos;
-  }
-} // end unnamed namespace
-
-namespace llvm {
-namespace sys  {
-namespace path {
-
-const_iterator begin(StringRef path) {
-  const_iterator i;
-  i.Path      = path;
-  i.Component = find_first_component(path);
-  i.Position  = 0;
-  return i;
-}
-
-const_iterator end(StringRef path) {
-  const_iterator i;
-  i.Path      = path;
-  i.Position  = path.size();
-  return i;
-}
-
-const_iterator &const_iterator::operator++() {
-  assert(Position < Path.size() && "Tried to increment past end!");
-
-  // Increment Position to past the current component
-  Position += Component.size();
-
-  // Check for end.
-  if (Position == Path.size()) {
-    Component = StringRef();
-    return *this;
-  }
-
-  // Both POSIX and Windows treat paths that begin with exactly two separators
-  // specially.
-  bool was_net = Component.size() > 2 &&
-    is_separator(Component[0]) &&
-    Component[1] == Component[0] &&
-    !is_separator(Component[2]);
-
-  // Handle separators.
-  if (is_separator(Path[Position])) {
-    // Root dir.
-    if (was_net
-#ifdef LLVM_ON_WIN32
-        // c:/
-        || Component.endswith(":")
-#endif
-        ) {
-      Component = Path.substr(Position, 1);
-      return *this;
-    }
-
-    // Skip extra separators.
-    while (Position != Path.size() &&
-           is_separator(Path[Position])) {
-      ++Position;
-    }
-
-    // Treat trailing '/' as a '.'.
-    if (Position == Path.size()) {
-      --Position;
-      Component = ".";
-      return *this;
-    }
-  }
-
-  // Find next component.
-  size_t end_pos = Path.find_first_of(separators, Position);
-  Component = Path.slice(Position, end_pos);
-
-  return *this;
-}
-
-const_iterator &const_iterator::operator--() {
-  // If we're at the end and the previous char was a '/', return '.'.
-  if (Position == Path.size() &&
-      Path.size() > 1 &&
-      is_separator(Path[Position - 1])
-#ifdef LLVM_ON_WIN32
-      && Path[Position - 2] != ':'
-#endif
-      ) {
-    --Position;
-    Component = ".";
-    return *this;
-  }
-
-  // Skip separators unless it's the root directory.
-  size_t root_dir_pos = root_dir_start(Path);
-  size_t end_pos = Position;
-
-  while(end_pos > 0 &&
-        (end_pos - 1) != root_dir_pos &&
-        is_separator(Path[end_pos - 1]))
-    --end_pos;
-
-  // Find next separator.
-  size_t start_pos = filename_pos(Path.substr(0, end_pos));
-  Component = Path.slice(start_pos, end_pos);
-  Position = start_pos;
-  return *this;
-}
-
-bool const_iterator::operator==(const const_iterator &RHS) const {
-  return Path.begin() == RHS.Path.begin() &&
-         Position == RHS.Position;
-}
-
-bool const_iterator::operator!=(const const_iterator &RHS) const {
-  return !(*this == RHS);
-}
-
-ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
-  return Position - RHS.Position;
-}
-
-const StringRef root_path(StringRef path) {
-  const_iterator b = begin(path),
-                 pos = b,
-                 e = end(path);
-  if (b != e) {
-    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
-    bool has_drive =
-#ifdef LLVM_ON_WIN32
-      b->endswith(":");
-#else
-      false;
-#endif
-
-    if (has_net || has_drive) {
-      if ((++pos != e) && is_separator((*pos)[0])) {
-        // {C:/,//net/}, so get the first two components.
-        return path.substr(0, b->size() + pos->size());
-      } else {
-        // just {C:,//net}, return the first component.
-        return *b;
-      }
-    }
-
-    // POSIX style root directory.
-    if (is_separator((*b)[0])) {
-      return *b;
-    }
-  }
-
-  return StringRef();
-}
-
-const StringRef root_name(StringRef path) {
-  const_iterator b = begin(path),
-                 e = end(path);
-  if (b != e) {
-    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
-    bool has_drive =
-#ifdef LLVM_ON_WIN32
-      b->endswith(":");
-#else
-      false;
-#endif
-
-    if (has_net || has_drive) {
-      // just {C:,//net}, return the first component.
-      return *b;
-    }
-  }
-
-  // No path or no name.
-  return StringRef();
-}
-
-const StringRef root_directory(StringRef path) {
-  const_iterator b = begin(path),
-                 pos = b,
-                 e = end(path);
-  if (b != e) {
-    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
-    bool has_drive =
-#ifdef LLVM_ON_WIN32
-      b->endswith(":");
-#else
-      false;
-#endif
-
-    if ((has_net || has_drive) &&
-        // {C:,//net}, skip to the next component.
-        (++pos != e) && is_separator((*pos)[0])) {
-      return *pos;
-    }
-
-    // POSIX style root directory.
-    if (!has_net && is_separator((*b)[0])) {
-      return *b;
-    }
-  }
-
-  // No path or no root.
-  return StringRef();
-}
-
-const StringRef relative_path(StringRef path) {
-  StringRef root = root_path(path);
-  return path.substr(root.size());
-}
-
-void append(SmallVectorImpl<char> &path, const Twine &a,
-                                         const Twine &b,
-                                         const Twine &c,
-                                         const Twine &d) {
-  SmallString<32> a_storage;
-  SmallString<32> b_storage;
-  SmallString<32> c_storage;
-  SmallString<32> d_storage;
-
-  SmallVector<StringRef, 4> components;
-  if (!a.isTriviallyEmpty()) components.push_back(a.toStringRef(a_storage));
-  if (!b.isTriviallyEmpty()) components.push_back(b.toStringRef(b_storage));
-  if (!c.isTriviallyEmpty()) components.push_back(c.toStringRef(c_storage));
-  if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage));
-
-  for (SmallVectorImpl<StringRef>::const_iterator i = components.begin(),
-                                                  e = components.end();
-                                                  i != e; ++i) {
-    bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]);
-    bool component_has_sep = !i->empty() && is_separator((*i)[0]);
-    bool is_root_name = has_root_name(*i);
-
-    if (path_has_sep) {
-      // Strip separators from beginning of component.
-      size_t loc = i->find_first_not_of(separators);
-      StringRef c = i->substr(loc);
-
-      // Append it.
-      path.append(c.begin(), c.end());
-      continue;
-    }
-
-    if (!component_has_sep && !(path.empty() || is_root_name)) {
-      // Add a separator.
-      path.push_back(prefered_separator);
-    }
-
-    path.append(i->begin(), i->end());
-  }
-}
-
-void append(SmallVectorImpl<char> &path,
-            const_iterator begin, const_iterator end) {
-  for (; begin != end; ++begin)
-    path::append(path, *begin);
-}
-
-const StringRef parent_path(StringRef path) {
-  size_t end_pos = parent_path_end(path);
-  if (end_pos == StringRef::npos)
-    return StringRef();
-  else
-    return path.substr(0, end_pos);
-}
-
-void remove_filename(SmallVectorImpl<char> &path) {
-  size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()));
-  if (end_pos != StringRef::npos)
-    path.set_size(end_pos);
-}
-
-void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
-  StringRef p(path.begin(), path.size());
-  SmallString<32> ext_storage;
-  StringRef ext = extension.toStringRef(ext_storage);
-
-  // Erase existing extension.
-  size_t pos = p.find_last_of('.');
-  if (pos != StringRef::npos && pos >= filename_pos(p))
-    path.set_size(pos);
-
-  // Append '.' if needed.
-  if (ext.size() > 0 && ext[0] != '.')
-    path.push_back('.');
-
-  // Append extension.
-  path.append(ext.begin(), ext.end());
-}
-
-void native(const Twine &path, SmallVectorImpl<char> &result) {
-  // Clear result.
-  result.clear();
-#ifdef LLVM_ON_WIN32
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-  result.reserve(p.size());
-  for (StringRef::const_iterator i = p.begin(),
-                                 e = p.end();
-                                 i != e;
-                                 ++i) {
-    if (*i == '/')
-      result.push_back('\\');
-    else
-      result.push_back(*i);
-  }
-#else
-  path.toVector(result);
-#endif
-}
-
-const StringRef filename(StringRef path) {
-  return *(--end(path));
-}
-
-const StringRef stem(StringRef path) {
-  StringRef fname = filename(path);
-  size_t pos = fname.find_last_of('.');
-  if (pos == StringRef::npos)
-    return fname;
-  else
-    if ((fname.size() == 1 && fname == ".") ||
-        (fname.size() == 2 && fname == ".."))
-      return fname;
-    else
-      return fname.substr(0, pos);
-}
-
-const StringRef extension(StringRef path) {
-  StringRef fname = filename(path);
-  size_t pos = fname.find_last_of('.');
-  if (pos == StringRef::npos)
-    return StringRef();
-  else
-    if ((fname.size() == 1 && fname == ".") ||
-        (fname.size() == 2 && fname == ".."))
-      return StringRef();
-    else
-      return fname.substr(pos);
-}
-
-bool is_separator(char value) {
-  switch(value) {
-#ifdef LLVM_ON_WIN32
-    case '\\': // fall through
-#endif
-    case '/': return true;
-    default: return false;
-  }
-}
-
-void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result) {
-  result.clear();
-
-#ifdef __APPLE__
-  // On Darwin, use DARWIN_USER_TEMP_DIR or DARWIN_USER_CACHE_DIR.
-  int ConfName = erasedOnReboot? _CS_DARWIN_USER_TEMP_DIR
-                               : _CS_DARWIN_USER_CACHE_DIR;
-  size_t ConfLen = confstr(ConfName, 0, 0);
-  if (ConfLen > 0) {
-    do {
-      result.resize(ConfLen);
-      ConfLen = confstr(ConfName, result.data(), result.size());
-    } while (ConfLen > 0 && ConfLen != result.size());
-
-    if (ConfLen > 0) {
-      assert(result.back() == 0);
-      result.pop_back();
-      return;
-    }
-
-    result.clear();
-  }
-#endif
-
-  // Check whether the temporary directory is specified by an environment
-  // variable.
-  const char *EnvironmentVariable;
-#ifdef LLVM_ON_WIN32
-  EnvironmentVariable = "TEMP";
-#else
-  EnvironmentVariable = "TMPDIR";
-#endif
-  if (char *RequestedDir = getenv(EnvironmentVariable)) {
-    result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
-    return;
-  }
-
-  // Fall back to a system default.
-  const char *DefaultResult;
-#ifdef LLVM_ON_WIN32
-  (void)erasedOnReboot;
-  DefaultResult = "C:\\TEMP";
-#else
-  if (erasedOnReboot)
-    DefaultResult = "/tmp";
-  else
-    DefaultResult = "/var/tmp";
-#endif
-  result.append(DefaultResult, DefaultResult + strlen(DefaultResult));
-}
-
-bool has_root_name(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !root_name(p).empty();
-}
-
-bool has_root_directory(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !root_directory(p).empty();
-}
-
-bool has_root_path(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !root_path(p).empty();
-}
-
-bool has_relative_path(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !relative_path(p).empty();
-}
-
-bool has_filename(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !filename(p).empty();
-}
-
-bool has_parent_path(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !parent_path(p).empty();
-}
-
-bool has_stem(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !stem(p).empty();
-}
-
-bool has_extension(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  return !extension(p).empty();
-}
-
-bool is_absolute(const Twine &path) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  bool rootDir = has_root_directory(p),
-#ifdef LLVM_ON_WIN32
-       rootName = has_root_name(p);
-#else
-       rootName = true;
-#endif
-
-  return rootDir && rootName;
-}
-
-bool is_relative(const Twine &path) {
-  return !is_absolute(path);
-}
-
-} // end namespace path
-
-namespace fs {
-
-error_code make_absolute(SmallVectorImpl<char> &path) {
-  StringRef p(path.data(), path.size());
-
-  bool rootDirectory = path::has_root_directory(p),
-#ifdef LLVM_ON_WIN32
-       rootName = path::has_root_name(p);
-#else
-       rootName = true;
-#endif
-
-  // Already absolute.
-  if (rootName && rootDirectory)
-    return error_code::success();
-
-  // All of the following conditions will need the current directory.
-  SmallString<128> current_dir;
-  if (error_code ec = current_path(current_dir)) return ec;
-
-  // Relative path. Prepend the current directory.
-  if (!rootName && !rootDirectory) {
-    // Append path to the current directory.
-    path::append(current_dir, p);
-    // Set path to the result.
-    path.swap(current_dir);
-    return error_code::success();
-  }
-
-  if (!rootName && rootDirectory) {
-    StringRef cdrn = path::root_name(current_dir);
-    SmallString<128> curDirRootName(cdrn.begin(), cdrn.end());
-    path::append(curDirRootName, p);
-    // Set path to the result.
-    path.swap(curDirRootName);
-    return error_code::success();
-  }
-
-  if (rootName && !rootDirectory) {
-    StringRef pRootName      = path::root_name(p);
-    StringRef bRootDirectory = path::root_directory(current_dir);
-    StringRef bRelativePath  = path::relative_path(current_dir);
-    StringRef pRelativePath  = path::relative_path(p);
-
-    SmallString<128> res;
-    path::append(res, pRootName, bRootDirectory, bRelativePath, pRelativePath);
-    path.swap(res);
-    return error_code::success();
-  }
-
-  llvm_unreachable("All rootName and rootDirectory combinations should have "
-                   "occurred above!");
-}
-
-error_code create_directories(const Twine &path, bool &existed) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  StringRef parent = path::parent_path(p);
-  if (!parent.empty()) {
-    bool parent_exists;
-    if (error_code ec = fs::exists(parent, parent_exists)) return ec;
-
-    if (!parent_exists)
-      if (error_code ec = create_directories(parent, existed)) return ec;
-  }
-
-  return create_directory(p, existed);
-}
-
-bool exists(file_status status) {
-  return status_known(status) && status.type() != file_type::file_not_found;
-}
-
-bool status_known(file_status s) {
-  return s.type() != file_type::status_error;
-}
-
-bool is_directory(file_status status) {
-  return status.type() == file_type::directory_file;
-}
-
-error_code is_directory(const Twine &path, bool &result) {
-  file_status st;
-  if (error_code ec = status(path, st))
-    return ec;
-  result = is_directory(st);
-  return error_code::success();
-}
-
-bool is_regular_file(file_status status) {
-  return status.type() == file_type::regular_file;
-}
-
-error_code is_regular_file(const Twine &path, bool &result) {
-  file_status st;
-  if (error_code ec = status(path, st))
-    return ec;
-  result = is_regular_file(st);
-  return error_code::success();
-}
-
-bool is_symlink(file_status status) {
-  return status.type() == file_type::symlink_file;
-}
-
-error_code is_symlink(const Twine &path, bool &result) {
-  file_status st;
-  if (error_code ec = status(path, st))
-    return ec;
-  result = is_symlink(st);
-  return error_code::success();
-}
-
-bool is_other(file_status status) {
-  return exists(status) &&
-         !is_regular_file(status) &&
-         !is_directory(status) &&
-         !is_symlink(status);
-}
-
-void directory_entry::replace_filename(const Twine &filename, file_status st) {
-  SmallString<128> path(Path.begin(), Path.end());
-  path::remove_filename(path);
-  path::append(path, filename);
-  Path = path.str();
-  Status = st;
-}
-
-error_code has_magic(const Twine &path, const Twine &magic, bool &result) {
-  SmallString<32>  MagicStorage;
-  StringRef Magic = magic.toStringRef(MagicStorage);
-  SmallString<32> Buffer;
-
-  if (error_code ec = get_magic(path, Magic.size(), Buffer)) {
-    if (ec == errc::value_too_large) {
-      // Magic.size() > file_size(Path).
-      result = false;
-      return error_code::success();
-    }
-    return ec;
-  }
-
-  result = Magic == Buffer;
-  return error_code::success();
-}
-
-/// @brief Identify the magic in magic.
-file_magic identify_magic(StringRef magic) {
-  if (magic.size() < 4)
-    return file_magic::unknown;
-  switch ((unsigned char)magic[0]) {
-    case 0xDE:  // 0x0B17C0DE = BC wraper
-      if (magic[1] == (char)0xC0 && magic[2] == (char)0x17 &&
-          magic[3] == (char)0x0B)
-        return file_magic::bitcode;
-      break;
-    case 'B':
-      if (magic[1] == 'C' && magic[2] == (char)0xC0 && magic[3] == (char)0xDE)
-        return file_magic::bitcode;
-      break;
-    case '!':
-      if (magic.size() >= 8)
-        if (memcmp(magic.data(),"!<arch>\n",8) == 0)
-          return file_magic::archive;
-      break;
-
-    case '\177':
-      if (magic[1] == 'E' && magic[2] == 'L' && magic[3] == 'F') {
-        bool Data2MSB = magic[5] == 2;
-        unsigned high = Data2MSB ? 16 : 17;
-        unsigned low  = Data2MSB ? 17 : 16;
-        if (magic.size() >= 18 && magic[high] == 0)
-          switch (magic[low]) {
-            default: break;
-            case 1: return file_magic::elf_relocatable;
-            case 2: return file_magic::elf_executable;
-            case 3: return file_magic::elf_shared_object;
-            case 4: return file_magic::elf_core;
-          }
-      }
-      break;
-
-    case 0xCA:
-      if (magic[1] == char(0xFE) && magic[2] == char(0xBA) &&
-          magic[3] == char(0xBE)) {
-        // This is complicated by an overlap with Java class files.
-        // See the Mach-O section in /usr/share/file/magic for details.
-        if (magic.size() >= 8 && magic[7] < 43)
-          // FIXME: Universal Binary of any type.
-          return file_magic::macho_dynamically_linked_shared_lib;
-      }
-      break;
-
-      // The two magic numbers for mach-o are:
-      // 0xfeedface - 32-bit mach-o
-      // 0xfeedfacf - 64-bit mach-o
-    case 0xFE:
-    case 0xCE:
-    case 0xCF: {
-      uint16_t type = 0;
-      if (magic[0] == char(0xFE) && magic[1] == char(0xED) &&
-          magic[2] == char(0xFA) &&
-          (magic[3] == char(0xCE) || magic[3] == char(0xCF))) {
-        /* Native endian */
-        if (magic.size() >= 16) type = magic[14] << 8 | magic[15];
-      } else if ((magic[0] == char(0xCE) || magic[0] == char(0xCF)) &&
-                 magic[1] == char(0xFA) && magic[2] == char(0xED) &&
-                 magic[3] == char(0xFE)) {
-        /* Reverse endian */
-        if (magic.size() >= 14) type = magic[13] << 8 | magic[12];
-      }
-      switch (type) {
-        default: break;
-        case 1: return file_magic::macho_object;
-        case 2: return file_magic::macho_executable;
-        case 3: return file_magic::macho_fixed_virtual_memory_shared_lib;
-        case 4: return file_magic::macho_core;
-        case 5: return file_magic::macho_preload_executabl;
-        case 6: return file_magic::macho_dynamically_linked_shared_lib;
-        case 7: return file_magic::macho_dynamic_linker;
-        case 8: return file_magic::macho_bundle;
-        case 9: return file_magic::macho_dynamic_linker;
-        case 10: return file_magic::macho_dsym_companion;
-      }
-      break;
-    }
-    case 0xF0: // PowerPC Windows
-    case 0x83: // Alpha 32-bit
-    case 0x84: // Alpha 64-bit
-    case 0x66: // MPS R4000 Windows
-    case 0x50: // mc68K
-    case 0x4c: // 80386 Windows
-      if (magic[1] == 0x01)
-        return file_magic::coff_object;
-
-    case 0x90: // PA-RISC Windows
-    case 0x68: // mc68K Windows
-      if (magic[1] == 0x02)
-        return file_magic::coff_object;
-      break;
-
-    case 0x4d: // Possible MS-DOS stub on Windows PE file
-      if (magic[1] == 0x5a) {
-        uint32_t off =
-          *reinterpret_cast<const support::ulittle32_t*>(magic.data() + 0x3c);
-        // PE/COFF file, either EXE or DLL.
-        if (off < magic.size() && memcmp(magic.data() + off, "PE\0\0",4) == 0)
-          return file_magic::pecoff_executable;
-      }
-      break;
-
-    case 0x64: // x86-64 Windows.
-      if (magic[1] == char(0x86))
-        return file_magic::coff_object;
-      break;
-
-    default:
-      break;
-  }
-  return file_magic::unknown;
-}
-
-error_code identify_magic(const Twine &path, file_magic &result) {
-  SmallString<32> Magic;
-  error_code ec = get_magic(path, Magic.capacity(), Magic);
-  if (ec && ec != errc::value_too_large)
-    return ec;
-
-  result = identify_magic(Magic);
-  return error_code::success();
-}
-
-namespace {
-error_code remove_all_r(StringRef path, file_type ft, uint32_t &count) {
-  if (ft == file_type::directory_file) {
-    // This code would be a lot better with exceptions ;/.
-    error_code ec;
-    directory_iterator i(path, ec);
-    if (ec) return ec;
-    for (directory_iterator e; i != e; i.increment(ec)) {
-      if (ec) return ec;
-      file_status st;
-      if (error_code ec = i->status(st)) return ec;
-      if (error_code ec = remove_all_r(i->path(), st.type(), count)) return ec;
-    }
-    bool obviously_this_exists;
-    if (error_code ec = remove(path, obviously_this_exists)) return ec;
-    assert(obviously_this_exists);
-    ++count; // Include the directory itself in the items removed.
-  } else {
-    bool obviously_this_exists;
-    if (error_code ec = remove(path, obviously_this_exists)) return ec;
-    assert(obviously_this_exists);
-    ++count;
-  }
-
-  return error_code::success();
-}
-} // end unnamed namespace
-
-error_code remove_all(const Twine &path, uint32_t &num_removed) {
-  SmallString<128> path_storage;
-  StringRef p = path.toStringRef(path_storage);
-
-  file_status fs;
-  if (error_code ec = status(path, fs))
-    return ec;
-  num_removed = 0;
-  return remove_all_r(p, fs.type(), num_removed);
-}
-
-error_code directory_entry::status(file_status &result) const {
-  return fs::status(Path, result);
-}
-
-} // end namespace fs
-} // end namespace sys
-} // end namespace llvm
-
-// Include the truly platform-specific parts.
-#if defined(LLVM_ON_UNIX)
-#include "Unix/PathV2.inc"
-#endif
-#if defined(LLVM_ON_WIN32)
-#include "Windows/PathV2.inc"
-#endif
diff --git a/lib/Support/PrettyStackTrace.cpp b/lib/Support/PrettyStackTrace.cpp
index 23ee5ab..722f4ca 100644
--- a/lib/Support/PrettyStackTrace.cpp
+++ b/lib/Support/PrettyStackTrace.cpp
@@ -15,10 +15,12 @@
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/Config/config.h"     // Get autoconf configuration settings
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/ThreadLocal.h"
 #include "llvm/Support/Watchdog.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm-c/Core.h"
 
 #ifdef HAVE_CRASHREPORTERCLIENT_H
 #include <CrashReporterClient.h>
@@ -26,12 +28,7 @@
 
 using namespace llvm;
 
-namespace llvm {
-  bool DisablePrettyStackTrace = false;
-}
-
-// FIXME: This should be thread local when llvm supports threads.
-static sys::ThreadLocal<const PrettyStackTraceEntry> PrettyStackTraceHead;
+static ManagedStatic<sys::ThreadLocal<const PrettyStackTraceEntry> > PrettyStackTraceHead;
 
 static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
   unsigned NextID = 0;
@@ -49,12 +46,12 @@ static unsigned PrintStack(const PrettyStackTraceEntry *Entry, raw_ostream &OS){
 /// PrintCurStackTrace - Print the current stack trace to the specified stream.
 static void PrintCurStackTrace(raw_ostream &OS) {
   // Don't print an empty trace.
-  if (PrettyStackTraceHead.get() == 0) return;
+  if (PrettyStackTraceHead->get() == 0) return;
   
   // If there are pretty stack frames registered, walk and emit them.
   OS << "Stack dump:\n";
   
-  PrintStack(PrettyStackTraceHead.get(), OS);
+  PrintStack(PrettyStackTraceHead->get(), OS);
   OS.flush();
 }
 
@@ -102,26 +99,28 @@ static void CrashHandler(void *) {
 #endif
 }
 
-static bool RegisterCrashPrinter() {
-  if (!DisablePrettyStackTrace)
-    sys::AddSignalHandler(CrashHandler, 0);
-  return false;
-}
-
 PrettyStackTraceEntry::PrettyStackTraceEntry() {
-  // The first time this is called, we register the crash printer.
-  static bool HandlerRegistered = RegisterCrashPrinter();
-  (void)HandlerRegistered;
-    
   // Link ourselves.
-  NextEntry = PrettyStackTraceHead.get();
-  PrettyStackTraceHead.set(this);
+  NextEntry = PrettyStackTraceHead->get();
+  PrettyStackTraceHead->set(this);
 }
 
 PrettyStackTraceEntry::~PrettyStackTraceEntry() {
-  assert(PrettyStackTraceHead.get() == this &&
+  // Do nothing if PrettyStackTraceHead is uninitialized. This can only happen
+  // if a shutdown occurred after we created the PrettyStackTraceEntry. That
+  // does occur in the following idiom:
+  //
+  // PrettyStackTraceProgram X(...);
+  // llvm_shutdown_obj Y;
+  //
+  // Without this check, we may end up removing ourselves from the stack trace
+  // after PrettyStackTraceHead has already been destroyed.
+  if (!PrettyStackTraceHead.isConstructed())
+    return;
+  
+  assert(PrettyStackTraceHead->get() == this &&
          "Pretty stack trace entry destruction is out of order");
-  PrettyStackTraceHead.set(getNextEntry());
+  PrettyStackTraceHead->set(getNextEntry());
 }
 
 void PrettyStackTraceString::print(raw_ostream &OS) const {
@@ -135,3 +134,18 @@ void PrettyStackTraceProgram::print(raw_ostream &OS) const {
     OS << ArgV[i] << ' ';
   OS << '\n';
 }
+
+static bool RegisterCrashPrinter() {
+  sys::AddSignalHandler(CrashHandler, 0);
+  return false;
+}
+
+void llvm::EnablePrettyStackTrace() {
+  // The first time this is called, we register the crash printer.
+  static bool HandlerRegistered = RegisterCrashPrinter();
+  (void)HandlerRegistered;
+}
+
+void LLVMEnablePrettyStackTrace() {
+  EnablePrettyStackTrace();
+}
diff --git a/lib/Support/Process.cpp b/lib/Support/Process.cpp
index 2c0d37b..d5168f0 100644
--- a/lib/Support/Process.cpp
+++ b/lib/Support/Process.cpp
@@ -80,6 +80,24 @@ TimeValue self_process::get_wall_time() const {
 #endif
 
 
+#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
+
+#define ALLCOLORS(FGBG,BOLD) {\
+    COLOR(FGBG, "0", BOLD),\
+    COLOR(FGBG, "1", BOLD),\
+    COLOR(FGBG, "2", BOLD),\
+    COLOR(FGBG, "3", BOLD),\
+    COLOR(FGBG, "4", BOLD),\
+    COLOR(FGBG, "5", BOLD),\
+    COLOR(FGBG, "6", BOLD),\
+    COLOR(FGBG, "7", BOLD)\
+  }
+
+static const char colorcodes[2][2][8][10] = {
+ { ALLCOLORS("3",""), ALLCOLORS("3","1;") },
+ { ALLCOLORS("4",""), ALLCOLORS("4","1;") }
+};
+
 // Include the platform-specific parts of this class.
 #ifdef LLVM_ON_UNIX
 #include "Unix/Process.inc"
diff --git a/lib/Support/Program.cpp b/lib/Support/Program.cpp
index 201d5c0..83f2ec4 100644
--- a/lib/Support/Program.cpp
+++ b/lib/Support/Program.cpp
@@ -22,33 +22,40 @@ using namespace sys;
 //===          independent code.
 //===----------------------------------------------------------------------===//
 
-int
-Program::ExecuteAndWait(const Path& path,
-                        const char** args,
-                        const char** envp,
-                        const Path** redirects,
-                        unsigned secondsToWait,
-                        unsigned memoryLimit,
-                        std::string* ErrMsg,
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
+                    const char **env, const StringRef **Redirects,
+                    unsigned memoryLimit, std::string *ErrMsg);
+
+int sys::ExecuteAndWait(StringRef Program, const char **args, const char **envp,
+                        const StringRef **redirects, unsigned secondsToWait,
+                        unsigned memoryLimit, std::string *ErrMsg,
                         bool *ExecutionFailed) {
-  Program prg;
-  if (prg.Execute(path, args, envp, redirects, memoryLimit, ErrMsg)) {
-    if (ExecutionFailed) *ExecutionFailed = false;
-    return prg.Wait(path, secondsToWait, ErrMsg);
+  ProcessInfo PI;
+  if (Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg)) {
+    if (ExecutionFailed)
+      *ExecutionFailed = false;
+    ProcessInfo Result = Wait(PI, secondsToWait, true, ErrMsg);
+    return Result.ReturnCode;
   }
-  if (ExecutionFailed) *ExecutionFailed = true;
+
+  if (ExecutionFailed)
+    *ExecutionFailed = true;
+
   return -1;
 }
 
-void
-Program::ExecuteNoWait(const Path& path,
-                       const char** args,
-                       const char** envp,
-                       const Path** redirects,
-                       unsigned memoryLimit,
-                       std::string* ErrMsg) {
-  Program prg;
-  prg.Execute(path, args, envp, redirects, memoryLimit, ErrMsg);
+ProcessInfo sys::ExecuteNoWait(StringRef Program, const char **args,
+                               const char **envp, const StringRef **redirects,
+                               unsigned memoryLimit, std::string *ErrMsg,
+                               bool *ExecutionFailed) {
+  ProcessInfo PI;
+  if (ExecutionFailed)
+    *ExecutionFailed = false;
+  if (!Execute(PI, Program, args, envp, redirects, memoryLimit, ErrMsg))
+    if (ExecutionFailed)
+      *ExecutionFailed = true;
+
+  return PI;
 }
 
 // Include the platform-specific parts of this class.
diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp
index efc8b90..5413641 100644
--- a/lib/Support/Regex.cpp
+++ b/lib/Support/Regex.cpp
@@ -43,7 +43,7 @@ bool Regex::isValid(std::string &Error) {
   
   size_t len = llvm_regerror(error, preg, NULL, 0);
   
-  Error.resize(len);
+  Error.resize(len - 1);
   llvm_regerror(error, preg, &Error[0], len);
   return false;
 }
@@ -168,3 +168,10 @@ std::string Regex::sub(StringRef Repl, StringRef String,
 
   return Res;
 }
+
+bool Regex::isLiteralERE(StringRef Str) {
+  // Check for regex metacharacters.  This list was derived from our regex
+  // implementation in regcomp.c and double checked against the POSIX extended
+  // regular expression specification.
+  return Str.find_first_of("()^$|*+?.[]\\{}") == StringRef::npos;
+}
diff --git a/lib/Support/SmallPtrSet.cpp b/lib/Support/SmallPtrSet.cpp
index f0fed77..dd417b4 100644
--- a/lib/Support/SmallPtrSet.cpp
+++ b/lib/Support/SmallPtrSet.cpp
@@ -202,8 +202,13 @@ void SmallPtrSetImpl::CopyFrom(const SmallPtrSetImpl &RHS) {
   } else if (CurArraySize != RHS.CurArraySize) {
     if (isSmall())
       CurArray = (const void**)malloc(sizeof(void*) * RHS.CurArraySize);
-    else
-      CurArray = (const void**)realloc(CurArray, sizeof(void*)*RHS.CurArraySize);
+    else {
+      const void **T = (const void**)realloc(CurArray,
+                                             sizeof(void*) * RHS.CurArraySize);
+      if (!T)
+        free(CurArray);
+      CurArray = T;
+    }
     assert(CurArray && "Failed to allocate memory?");
   }
   
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index fac3cad..d4b94f8 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -52,9 +52,9 @@ SourceMgr::~SourceMgr() {
 /// AddIncludeFile - Search for a file with the specified name in the current
 /// directory or in one of the IncludeDirs.  If no file is found, this returns
 /// ~0, otherwise it returns the buffer ID of the stacked file.
-unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
-                                   SMLoc IncludeLoc,
-                                   std::string &IncludedFile) {
+size_t SourceMgr::AddIncludeFile(const std::string &Filename,
+                                 SMLoc IncludeLoc,
+                                 std::string &IncludedFile) {
   OwningPtr<MemoryBuffer> NewBuf;
   IncludedFile = Filename;
   MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
@@ -65,7 +65,7 @@ unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
     MemoryBuffer::getFile(IncludedFile.c_str(), NewBuf);
   }
 
-  if (NewBuf == 0) return ~0U;
+  if (!NewBuf) return ~0U;
 
   return AddNewSourceBuffer(NewBuf.take(), IncludeLoc);
 }
@@ -211,7 +211,8 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
                       LineStr, ColRanges, FixIts);
 }
 
-void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
+                             SourceMgr::DiagKind Kind,
                              const Twine &Msg, ArrayRef<SMRange> Ranges,
                              ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
   SMDiagnostic Diagnostic = GetMessage(Loc, Kind, Msg, Ranges, FixIts);
@@ -222,8 +223,6 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
     return;
   }
 
-  raw_ostream &OS = errs();
-
   if (Loc != SMLoc()) {
     int CurBuf = FindBufferContainingLoc(Loc);
     assert(CurBuf != -1 && "Invalid or unspecified location!");
@@ -233,6 +232,12 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
   Diagnostic.print(0, OS, ShowColors);
 }
 
+void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
+                             const Twine &Msg, ArrayRef<SMRange> Ranges,
+                             ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+  PrintMessage(llvm::errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
+}
+
 //===----------------------------------------------------------------------===//
 // SMDiagnostic Implementation
 //===----------------------------------------------------------------------===//
@@ -465,7 +470,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &S,
   if (FixItInsertionLine.empty())
     return;
   
-  for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i != e; ++i) {
+  for (size_t i = 0, e = FixItInsertionLine.size(), OutCol = 0; i < e; ++i) {
     if (i >= LineContents.size() || LineContents[i] != '\t') {
       S << FixItInsertionLine[i];
       ++OutCol;
diff --git a/lib/Support/StreamableMemoryObject.cpp b/lib/Support/StreamableMemoryObject.cpp
index 59e27a2..2ed7c5c 100644
--- a/lib/Support/StreamableMemoryObject.cpp
+++ b/lib/Support/StreamableMemoryObject.cpp
@@ -31,8 +31,7 @@ public:
   virtual int readByte(uint64_t address, uint8_t* ptr) const LLVM_OVERRIDE;
   virtual int readBytes(uint64_t address,
                         uint64_t size,
-                        uint8_t* buf,
-                        uint64_t* copied) const LLVM_OVERRIDE;
+                        uint8_t *buf) const LLVM_OVERRIDE;
   virtual const uint8_t *getPointer(uint64_t address,
                                     uint64_t size) const LLVM_OVERRIDE;
   virtual bool isValidAddress(uint64_t address) const LLVM_OVERRIDE {
@@ -67,11 +66,9 @@ int RawMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
 
 int RawMemoryObject::readBytes(uint64_t address,
                                uint64_t size,
-                               uint8_t* buf,
-                               uint64_t* copied) const {
+                               uint8_t *buf) const {
   if (!validAddress(address) || !validAddress(address + size - 1)) return -1;
   memcpy(buf, (uint8_t *)(uintptr_t)(address + FirstChar), size);
-  if (copied) *copied = size;
   return size;
 }
 
@@ -111,11 +108,9 @@ int StreamingMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
 
 int StreamingMemoryObject::readBytes(uint64_t address,
                                      uint64_t size,
-                                     uint8_t* buf,
-                                     uint64_t* copied) const {
+                                     uint8_t *buf) const {
   if (!fetchToPos(address + size - 1)) return -1;
   memcpy(buf, &Bytes[address + BytesSkipped], size);
-  if (copied) *copied = size;
   return 0;
 }
 
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index d7a0bfa..bfae754 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -37,20 +37,39 @@ static bool ascii_isdigit(char x) {
   return x >= '0' && x <= '9';
 }
 
-/// compare_lower - Compare strings, ignoring case.
-int StringRef::compare_lower(StringRef RHS) const {
-  for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
-    unsigned char LHC = ascii_tolower(Data[I]);
-    unsigned char RHC = ascii_tolower(RHS.Data[I]);
+// strncasecmp() is not available on non-POSIX systems, so define an
+// alternative function here.
+static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
+  for (size_t I = 0; I < Length; ++I) {
+    unsigned char LHC = ascii_tolower(LHS[I]);
+    unsigned char RHC = ascii_tolower(RHS[I]);
     if (LHC != RHC)
       return LHC < RHC ? -1 : 1;
   }
+  return 0;
+}
 
+/// compare_lower - Compare strings, ignoring case.
+int StringRef::compare_lower(StringRef RHS) const {
+  if (int Res = ascii_strncasecmp(Data, RHS.Data, min(Length, RHS.Length)))
+    return Res;
   if (Length == RHS.Length)
     return 0;
   return Length < RHS.Length ? -1 : 1;
 }
 
+/// Check if this string starts with the given \p Prefix, ignoring case.
+bool StringRef::startswith_lower(StringRef Prefix) const {
+  return Length >= Prefix.Length &&
+      ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
+}
+
+/// Check if this string ends with the given \p Suffix, ignoring case.
+bool StringRef::endswith_lower(StringRef Suffix) const {
+  return Length >= Suffix.Length &&
+      ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
+}
+
 /// compare_numeric - Compare strings, handle embedded numbers.
 int StringRef::compare_numeric(StringRef RHS) const {
   for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
@@ -85,7 +104,7 @@ int StringRef::compare_numeric(StringRef RHS) const {
 // Compute the edit distance between the two given strings.
 unsigned StringRef::edit_distance(llvm::StringRef Other,
                                   bool AllowReplacements,
-                                  unsigned MaxEditDistance) {
+                                  unsigned MaxEditDistance) const {
   return llvm::ComputeEditDistance(
       llvm::ArrayRef<char>(data(), size()),
       llvm::ArrayRef<char>(Other.data(), Other.size()),
diff --git a/lib/Support/StringRefMemoryObject.cpp b/lib/Support/StringRefMemoryObject.cpp
new file mode 100644
index 0000000..e035ed1
--- /dev/null
+++ b/lib/Support/StringRefMemoryObject.cpp
@@ -0,0 +1,29 @@
+//===- lib/Support/StringRefMemoryObject.cpp --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/StringRefMemoryObject.h"
+
+using namespace llvm;
+
+int StringRefMemoryObject::readByte(uint64_t Addr, uint8_t *Byte) const {
+  if (Addr >= Base + getExtent() || Addr < Base)
+    return -1;
+  *Byte = Bytes[Addr - Base];
+  return 0;
+}
+
+int StringRefMemoryObject::readBytes(uint64_t Addr,
+                                     uint64_t Size,
+                                     uint8_t *Buf) const {
+  uint64_t Offset = Addr - Base;
+  if (Addr >= Base + getExtent() || Offset + Size > getExtent() || Addr < Base)
+    return -1;
+  memcpy(Buf, Bytes.data() + Offset, Size);
+  return 0;
+}
diff --git a/lib/Support/SystemUtils.cpp b/lib/Support/SystemUtils.cpp
index 54b5e97..2036364 100644
--- a/lib/Support/SystemUtils.cpp
+++ b/lib/Support/SystemUtils.cpp
@@ -31,25 +31,3 @@ bool llvm::CheckBitcodeOutputToConsole(raw_ostream &stream_to_check,
   }
   return false;
 }
-
-/// PrependMainExecutablePath - Prepend the path to the program being executed
-/// to \p ExeName, given the value of argv[0] and the address of main()
-/// itself. This allows us to find another LLVM tool if it is built in the same
-/// directory. An empty string is returned on error; note that this function
-/// just mainpulates the path and doesn't check for executability.
-/// @brief Find a named executable.
-sys::Path llvm::PrependMainExecutablePath(const std::string &ExeName,
-                                          const char *Argv0, void *MainAddr) {
-  // Check the directory that the calling program is in.  We can do
-  // this if ProgramPath contains at least one / character, indicating that it
-  // is a relative path to the executable itself.
-  sys::Path Result = sys::Path::GetMainExecutable(Argv0, MainAddr);
-  Result.eraseComponent();
-
-  if (!Result.isEmpty()) {
-    Result.appendComponent(ExeName);
-    Result.appendSuffix(sys::Path::GetEXESuffix());
-  }
-
-  return Result;
-}
diff --git a/lib/Support/TargetRegistry.cpp b/lib/Support/TargetRegistry.cpp
index 9c81327..0c90c17 100644
--- a/lib/Support/TargetRegistry.cpp
+++ b/lib/Support/TargetRegistry.cpp
@@ -135,9 +135,9 @@ const Target *TargetRegistry::getClosestTargetForJIT(std::string &Error) {
   return TheTarget;
 }
 
-static int TargetArraySortFn(const void *LHS, const void *RHS) {
-  typedef std::pair<StringRef, const Target*> pair_ty;
-  return ((const pair_ty*)LHS)->first.compare(((const pair_ty*)RHS)->first);
+static int TargetArraySortFn(const std::pair<StringRef, const Target *> *LHS,
+                             const std::pair<StringRef, const Target *> *RHS) {
+  return LHS->first.compare(RHS->first);
 }
 
 void TargetRegistry::printRegisteredTargetsForVersion() {
diff --git a/lib/Support/ThreadLocal.cpp b/lib/Support/ThreadLocal.cpp
index 0587aae..868b6ea 100644
--- a/lib/Support/ThreadLocal.cpp
+++ b/lib/Support/ThreadLocal.cpp
@@ -23,7 +23,7 @@
 // Define all methods as no-ops if threading is explicitly disabled
 namespace llvm {
 using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::ThreadLocalImpl() : data() { }
 ThreadLocalImpl::~ThreadLocalImpl() { }
 void ThreadLocalImpl::setInstance(const void* d) {
   typedef int SIZE_TOO_BIG[sizeof(d) <= sizeof(data) ? 1 : -1];
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index 896d869..100b21e 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp
@@ -66,8 +66,8 @@ raw_ostream *llvm::CreateInfoOutputFile() {
   // compensate for this, the test-suite Makefiles have code to delete the
   // info output file before running commands which write to it.
   std::string Error;
-  raw_ostream *Result = new raw_fd_ostream(OutputFilename.c_str(),
-                                           Error, raw_fd_ostream::F_Append);
+  raw_ostream *Result =
+      new raw_fd_ostream(OutputFilename.c_str(), Error, sys::fs::F_Append);
   if (Error.empty())
     return Result;
   
diff --git a/lib/Support/ToolOutputFile.cpp b/lib/Support/ToolOutputFile.cpp
index e7ca927..5c1268a 100644
--- a/lib/Support/ToolOutputFile.cpp
+++ b/lib/Support/ToolOutputFile.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Signals.h"
 using namespace llvm;
 
@@ -19,25 +20,30 @@ tool_output_file::CleanupInstaller::CleanupInstaller(const char *filename)
   : Filename(filename), Keep(false) {
   // Arrange for the file to be deleted if the process is killed.
   if (Filename != "-")
-    sys::RemoveFileOnSignal(sys::Path(Filename));
+    sys::RemoveFileOnSignal(Filename);
 }
 
 tool_output_file::CleanupInstaller::~CleanupInstaller() {
   // Delete the file if the client hasn't told us not to.
-  if (!Keep && Filename != "-")
-    sys::Path(Filename).eraseFromDisk();
+  if (!Keep && Filename != "-") {
+    bool Existed;
+    sys::fs::remove(Filename, Existed);
+  }
 
   // Ok, the file is successfully written and closed, or deleted. There's no
   // further need to clean it up on signals.
   if (Filename != "-")
-    sys::DontRemoveFileOnSignal(sys::Path(Filename));
+    sys::DontRemoveFileOnSignal(Filename);
 }
 
 tool_output_file::tool_output_file(const char *filename, std::string &ErrorInfo,
-                                   unsigned Flags)
-  : Installer(filename),
-    OS(filename, ErrorInfo, Flags) {
+                                   sys::fs::OpenFlags Flags)
+    : Installer(filename), OS(filename, ErrorInfo, Flags) {
   // If open fails, no cleanup is needed.
   if (!ErrorInfo.empty())
     Installer.Keep = true;
 }
+
+tool_output_file::tool_output_file(const char *Filename, int FD)
+    : Installer(Filename), OS(FD, true) {
+}
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 412e34c..6c978a0 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -28,6 +28,7 @@ const char *Triple::getArchTypeName(ArchType Kind) {
   case mips64el:return "mips64el";
   case msp430:  return "msp430";
   case ppc64:   return "powerpc64";
+  case ppc64le: return "powerpc64le";
   case ppc:     return "powerpc";
   case r600:    return "r600";
   case sparc:   return "sparc";
@@ -38,7 +39,6 @@ const char *Triple::getArchTypeName(ArchType Kind) {
   case x86:     return "i386";
   case x86_64:  return "x86_64";
   case xcore:   return "xcore";
-  case mblaze:  return "mblaze";
   case nvptx:   return "nvptx";
   case nvptx64: return "nvptx64";
   case le32:    return "le32";
@@ -61,10 +61,9 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
   case thumb:   return "arm";
 
   case ppc64:
+  case ppc64le:
   case ppc:     return "ppc";
 
-  case mblaze:  return "mblaze";
-
   case mips:
   case mipsel:
   case mips64:
@@ -104,6 +103,7 @@ const char *Triple::getVendorTypeName(VendorType Kind) {
   case BGQ: return "bgq";
   case Freescale: return "fsl";
   case IBM: return "ibm";
+  case NVIDIA: return "nvidia";
   }
 
   llvm_unreachable("Invalid VendorType!");
@@ -135,6 +135,8 @@ const char *Triple::getOSTypeName(OSType Kind) {
   case CNK: return "cnk";
   case Bitrig: return "bitrig";
   case AIX: return "aix";
+  case CUDA: return "cuda";
+  case NVCL: return "nvcl";
   }
 
   llvm_unreachable("Invalid OSType");
@@ -168,7 +170,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
     .Case("ppc64", ppc64)
     .Case("ppc32", ppc)
     .Case("ppc", ppc)
-    .Case("mblaze", mblaze)
+    .Case("ppc64le", ppc64le)
     .Case("r600", r600)
     .Case("hexagon", hexagon)
     .Case("sparc", sparc)
@@ -198,7 +200,7 @@ const char *Triple::getArchNameForAssembler() {
     .Case("x86_64", "x86_64")
     .Case("powerpc", "ppc")
     .Case("powerpc64", "ppc64")
-    .Cases("mblaze", "microblaze", "mblaze")
+    .Case("powerpc64le", "ppc64le")
     .Case("arm", "arm")
     .Cases("armv4t", "thumbv4t", "armv4t")
     .Cases("armv5", "armv5e", "thumbv5", "thumbv5e", "armv5")
@@ -219,10 +221,10 @@ static Triple::ArchType parseArch(StringRef ArchName) {
     .Cases("i386", "i486", "i586", "i686", Triple::x86)
     // FIXME: Do we need to support these?
     .Cases("i786", "i886", "i986", Triple::x86)
-    .Cases("amd64", "x86_64", Triple::x86_64)
+    .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64)
     .Case("powerpc", Triple::ppc)
     .Cases("powerpc64", "ppu", Triple::ppc64)
-    .Case("mblaze", Triple::mblaze)
+    .Case("powerpc64le", Triple::ppc64le)
     .Case("aarch64", Triple::aarch64)
     .Cases("arm", "xscale", Triple::arm)
     // FIXME: It would be good to replace these with explicit names for all the
@@ -239,7 +241,7 @@ static Triple::ArchType parseArch(StringRef ArchName) {
     .Case("hexagon", Triple::hexagon)
     .Case("s390x", Triple::systemz)
     .Case("sparc", Triple::sparc)
-    .Case("sparcv9", Triple::sparcv9)
+    .Cases("sparcv9", "sparc64", Triple::sparcv9)
     .Case("tce", Triple::tce)
     .Case("xcore", Triple::xcore)
     .Case("nvptx", Triple::nvptx)
@@ -260,6 +262,7 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
     .Case("bgq", Triple::BGQ)
     .Case("fsl", Triple::Freescale)
     .Case("ibm", Triple::IBM)
+    .Case("nvidia", Triple::NVIDIA)
     .Default(Triple::UnknownVendor);
 }
 
@@ -287,6 +290,8 @@ static Triple::OSType parseOS(StringRef OSName) {
     .StartsWith("cnk", Triple::CNK)
     .StartsWith("bitrig", Triple::Bitrig)
     .StartsWith("aix", Triple::AIX)
+    .StartsWith("cuda", Triple::CUDA)
+    .StartsWith("nvcl", Triple::NVCL)
     .Default(Triple::UnknownOS);
 }
 
@@ -672,7 +677,6 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
   case llvm::Triple::arm:
   case llvm::Triple::hexagon:
   case llvm::Triple::le32:
-  case llvm::Triple::mblaze:
   case llvm::Triple::mips:
   case llvm::Triple::mipsel:
   case llvm::Triple::nvptx:
@@ -691,6 +695,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
   case llvm::Triple::mips64el:
   case llvm::Triple::nvptx64:
   case llvm::Triple::ppc64:
+  case llvm::Triple::ppc64le:
   case llvm::Triple::sparcv9:
   case llvm::Triple::systemz:
   case llvm::Triple::x86_64:
@@ -719,6 +724,7 @@ Triple Triple::get32BitArchVariant() const {
   case Triple::aarch64:
   case Triple::msp430:
   case Triple::systemz:
+  case Triple::ppc64le:
     T.setArch(UnknownArch);
     break;
 
@@ -727,7 +733,6 @@ Triple Triple::get32BitArchVariant() const {
   case Triple::arm:
   case Triple::hexagon:
   case Triple::le32:
-  case Triple::mblaze:
   case Triple::mips:
   case Triple::mipsel:
   case Triple::nvptx:
@@ -760,7 +765,6 @@ Triple Triple::get64BitArchVariant() const {
   case Triple::arm:
   case Triple::hexagon:
   case Triple::le32:
-  case Triple::mblaze:
   case Triple::msp430:
   case Triple::r600:
   case Triple::tce:
@@ -775,6 +779,7 @@ Triple Triple::get64BitArchVariant() const {
   case Triple::mips64el:
   case Triple::nvptx64:
   case Triple::ppc64:
+  case Triple::ppc64le:
   case Triple::sparcv9:
   case Triple::systemz:
   case Triple::x86_64:
diff --git a/lib/Support/Unicode.cpp b/lib/Support/Unicode.cpp
new file mode 100644
index 0000000..b719bd8
--- /dev/null
+++ b/lib/Support/Unicode.cpp
@@ -0,0 +1,367 @@
+//===- llvm/Support/Unicode.cpp - Unicode character properties  -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements functions that allow querying certain properties of
+// Unicode characters.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Unicode.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/UnicodeCharRanges.h"
+
+namespace llvm {
+namespace sys {
+namespace unicode {
+
+bool isPrintable(int UCS) {
+  // Sorted list of non-overlapping intervals of code points that are not
+  // supposed to be printable.
+  static const UnicodeCharRange NonPrintableRanges[] = {
+    { 0x0000, 0x001F }, { 0x007F, 0x009F }, { 0x034F, 0x034F },
+    { 0x0378, 0x0379 }, { 0x037F, 0x0383 }, { 0x038B, 0x038B },
+    { 0x038D, 0x038D }, { 0x03A2, 0x03A2 }, { 0x0528, 0x0530 },
+    { 0x0557, 0x0558 }, { 0x0560, 0x0560 }, { 0x0588, 0x0588 },
+    { 0x058B, 0x058E }, { 0x0590, 0x0590 }, { 0x05C8, 0x05CF },
+    { 0x05EB, 0x05EF }, { 0x05F5, 0x0605 }, { 0x061C, 0x061D },
+    { 0x06DD, 0x06DD }, { 0x070E, 0x070F }, { 0x074B, 0x074C },
+    { 0x07B2, 0x07BF }, { 0x07FB, 0x07FF }, { 0x082E, 0x082F },
+    { 0x083F, 0x083F }, { 0x085C, 0x085D }, { 0x085F, 0x089F },
+    { 0x08A1, 0x08A1 }, { 0x08AD, 0x08E3 }, { 0x08FF, 0x08FF },
+    { 0x0978, 0x0978 }, { 0x0980, 0x0980 }, { 0x0984, 0x0984 },
+    { 0x098D, 0x098E }, { 0x0991, 0x0992 }, { 0x09A9, 0x09A9 },
+    { 0x09B1, 0x09B1 }, { 0x09B3, 0x09B5 }, { 0x09BA, 0x09BB },
+    { 0x09C5, 0x09C6 }, { 0x09C9, 0x09CA }, { 0x09CF, 0x09D6 },
+    { 0x09D8, 0x09DB }, { 0x09DE, 0x09DE }, { 0x09E4, 0x09E5 },
+    { 0x09FC, 0x0A00 }, { 0x0A04, 0x0A04 }, { 0x0A0B, 0x0A0E },
+    { 0x0A11, 0x0A12 }, { 0x0A29, 0x0A29 }, { 0x0A31, 0x0A31 },
+    { 0x0A34, 0x0A34 }, { 0x0A37, 0x0A37 }, { 0x0A3A, 0x0A3B },
+    { 0x0A3D, 0x0A3D }, { 0x0A43, 0x0A46 }, { 0x0A49, 0x0A4A },
+    { 0x0A4E, 0x0A50 }, { 0x0A52, 0x0A58 }, { 0x0A5D, 0x0A5D },
+    { 0x0A5F, 0x0A65 }, { 0x0A76, 0x0A80 }, { 0x0A84, 0x0A84 },
+    { 0x0A8E, 0x0A8E }, { 0x0A92, 0x0A92 }, { 0x0AA9, 0x0AA9 },
+    { 0x0AB1, 0x0AB1 }, { 0x0AB4, 0x0AB4 }, { 0x0ABA, 0x0ABB },
+    { 0x0AC6, 0x0AC6 }, { 0x0ACA, 0x0ACA }, { 0x0ACE, 0x0ACF },
+    { 0x0AD1, 0x0ADF }, { 0x0AE4, 0x0AE5 }, { 0x0AF2, 0x0B00 },
+    { 0x0B04, 0x0B04 }, { 0x0B0D, 0x0B0E }, { 0x0B11, 0x0B12 },
+    { 0x0B29, 0x0B29 }, { 0x0B31, 0x0B31 }, { 0x0B34, 0x0B34 },
+    { 0x0B3A, 0x0B3B }, { 0x0B45, 0x0B46 }, { 0x0B49, 0x0B4A },
+    { 0x0B4E, 0x0B55 }, { 0x0B58, 0x0B5B }, { 0x0B5E, 0x0B5E },
+    { 0x0B64, 0x0B65 }, { 0x0B78, 0x0B81 }, { 0x0B84, 0x0B84 },
+    { 0x0B8B, 0x0B8D }, { 0x0B91, 0x0B91 }, { 0x0B96, 0x0B98 },
+    { 0x0B9B, 0x0B9B }, { 0x0B9D, 0x0B9D }, { 0x0BA0, 0x0BA2 },
+    { 0x0BA5, 0x0BA7 }, { 0x0BAB, 0x0BAD }, { 0x0BBA, 0x0BBD },
+    { 0x0BC3, 0x0BC5 }, { 0x0BC9, 0x0BC9 }, { 0x0BCE, 0x0BCF },
+    { 0x0BD1, 0x0BD6 }, { 0x0BD8, 0x0BE5 }, { 0x0BFB, 0x0C00 },
+    { 0x0C04, 0x0C04 }, { 0x0C0D, 0x0C0D }, { 0x0C11, 0x0C11 },
+    { 0x0C29, 0x0C29 }, { 0x0C34, 0x0C34 }, { 0x0C3A, 0x0C3C },
+    { 0x0C45, 0x0C45 }, { 0x0C49, 0x0C49 }, { 0x0C4E, 0x0C54 },
+    { 0x0C57, 0x0C57 }, { 0x0C5A, 0x0C5F }, { 0x0C64, 0x0C65 },
+    { 0x0C70, 0x0C77 }, { 0x0C80, 0x0C81 }, { 0x0C84, 0x0C84 },
+    { 0x0C8D, 0x0C8D }, { 0x0C91, 0x0C91 }, { 0x0CA9, 0x0CA9 },
+    { 0x0CB4, 0x0CB4 }, { 0x0CBA, 0x0CBB }, { 0x0CC5, 0x0CC5 },
+    { 0x0CC9, 0x0CC9 }, { 0x0CCE, 0x0CD4 }, { 0x0CD7, 0x0CDD },
+    { 0x0CDF, 0x0CDF }, { 0x0CE4, 0x0CE5 }, { 0x0CF0, 0x0CF0 },
+    { 0x0CF3, 0x0D01 }, { 0x0D04, 0x0D04 }, { 0x0D0D, 0x0D0D },
+    { 0x0D11, 0x0D11 }, { 0x0D3B, 0x0D3C }, { 0x0D45, 0x0D45 },
+    { 0x0D49, 0x0D49 }, { 0x0D4F, 0x0D56 }, { 0x0D58, 0x0D5F },
+    { 0x0D64, 0x0D65 }, { 0x0D76, 0x0D78 }, { 0x0D80, 0x0D81 },
+    { 0x0D84, 0x0D84 }, { 0x0D97, 0x0D99 }, { 0x0DB2, 0x0DB2 },
+    { 0x0DBC, 0x0DBC }, { 0x0DBE, 0x0DBF }, { 0x0DC7, 0x0DC9 },
+    { 0x0DCB, 0x0DCE }, { 0x0DD5, 0x0DD5 }, { 0x0DD7, 0x0DD7 },
+    { 0x0DE0, 0x0DF1 }, { 0x0DF5, 0x0E00 }, { 0x0E3B, 0x0E3E },
+    { 0x0E5C, 0x0E80 }, { 0x0E83, 0x0E83 }, { 0x0E85, 0x0E86 },
+    { 0x0E89, 0x0E89 }, { 0x0E8B, 0x0E8C }, { 0x0E8E, 0x0E93 },
+    { 0x0E98, 0x0E98 }, { 0x0EA0, 0x0EA0 }, { 0x0EA4, 0x0EA4 },
+    { 0x0EA6, 0x0EA6 }, { 0x0EA8, 0x0EA9 }, { 0x0EAC, 0x0EAC },
+    { 0x0EBA, 0x0EBA }, { 0x0EBE, 0x0EBF }, { 0x0EC5, 0x0EC5 },
+    { 0x0EC7, 0x0EC7 }, { 0x0ECE, 0x0ECF }, { 0x0EDA, 0x0EDB },
+    { 0x0EE0, 0x0EFF }, { 0x0F48, 0x0F48 }, { 0x0F6D, 0x0F70 },
+    { 0x0F98, 0x0F98 }, { 0x0FBD, 0x0FBD }, { 0x0FCD, 0x0FCD },
+    { 0x0FDB, 0x0FFF }, { 0x10C6, 0x10C6 }, { 0x10C8, 0x10CC },
+    { 0x10CE, 0x10CF }, { 0x115F, 0x1160 }, { 0x1249, 0x1249 },
+    { 0x124E, 0x124F }, { 0x1257, 0x1257 }, { 0x1259, 0x1259 },
+    { 0x125E, 0x125F }, { 0x1289, 0x1289 }, { 0x128E, 0x128F },
+    { 0x12B1, 0x12B1 }, { 0x12B6, 0x12B7 }, { 0x12BF, 0x12BF },
+    { 0x12C1, 0x12C1 }, { 0x12C6, 0x12C7 }, { 0x12D7, 0x12D7 },
+    { 0x1311, 0x1311 }, { 0x1316, 0x1317 }, { 0x135B, 0x135C },
+    { 0x137D, 0x137F }, { 0x139A, 0x139F }, { 0x13F5, 0x13FF },
+    { 0x169D, 0x169F }, { 0x16F1, 0x16FF }, { 0x170D, 0x170D },
+    { 0x1715, 0x171F }, { 0x1737, 0x173F }, { 0x1754, 0x175F },
+    { 0x176D, 0x176D }, { 0x1771, 0x1771 }, { 0x1774, 0x177F },
+    { 0x17B4, 0x17B5 }, { 0x17DE, 0x17DF }, { 0x17EA, 0x17EF },
+    { 0x17FA, 0x17FF }, { 0x180B, 0x180D }, { 0x180F, 0x180F },
+    { 0x181A, 0x181F }, { 0x1878, 0x187F }, { 0x18AB, 0x18AF },
+    { 0x18F6, 0x18FF }, { 0x191D, 0x191F }, { 0x192C, 0x192F },
+    { 0x193C, 0x193F }, { 0x1941, 0x1943 }, { 0x196E, 0x196F },
+    { 0x1975, 0x197F }, { 0x19AC, 0x19AF }, { 0x19CA, 0x19CF },
+    { 0x19DB, 0x19DD }, { 0x1A1C, 0x1A1D }, { 0x1A5F, 0x1A5F },
+    { 0x1A7D, 0x1A7E }, { 0x1A8A, 0x1A8F }, { 0x1A9A, 0x1A9F },
+    { 0x1AAE, 0x1AFF }, { 0x1B4C, 0x1B4F }, { 0x1B7D, 0x1B7F },
+    { 0x1BF4, 0x1BFB }, { 0x1C38, 0x1C3A }, { 0x1C4A, 0x1C4C },
+    { 0x1C80, 0x1CBF }, { 0x1CC8, 0x1CCF }, { 0x1CF7, 0x1CFF },
+    { 0x1DE7, 0x1DFB }, { 0x1F16, 0x1F17 }, { 0x1F1E, 0x1F1F },
+    { 0x1F46, 0x1F47 }, { 0x1F4E, 0x1F4F }, { 0x1F58, 0x1F58 },
+    { 0x1F5A, 0x1F5A }, { 0x1F5C, 0x1F5C }, { 0x1F5E, 0x1F5E },
+    { 0x1F7E, 0x1F7F }, { 0x1FB5, 0x1FB5 }, { 0x1FC5, 0x1FC5 },
+    { 0x1FD4, 0x1FD5 }, { 0x1FDC, 0x1FDC }, { 0x1FF0, 0x1FF1 },
+    { 0x1FF5, 0x1FF5 }, { 0x1FFF, 0x1FFF }, { 0x200B, 0x200F },
+    { 0x202A, 0x202E }, { 0x2060, 0x206F }, { 0x2072, 0x2073 },
+    { 0x208F, 0x208F }, { 0x209D, 0x209F }, { 0x20BB, 0x20CF },
+    { 0x20F1, 0x20FF }, { 0x218A, 0x218F }, { 0x23F4, 0x23FF },
+    { 0x2427, 0x243F }, { 0x244B, 0x245F }, { 0x2700, 0x2700 },
+    { 0x2B4D, 0x2B4F }, { 0x2B5A, 0x2BFF }, { 0x2C2F, 0x2C2F },
+    { 0x2C5F, 0x2C5F }, { 0x2CF4, 0x2CF8 }, { 0x2D26, 0x2D26 },
+    { 0x2D28, 0x2D2C }, { 0x2D2E, 0x2D2F }, { 0x2D68, 0x2D6E },
+    { 0x2D71, 0x2D7E }, { 0x2D97, 0x2D9F }, { 0x2DA7, 0x2DA7 },
+    { 0x2DAF, 0x2DAF }, { 0x2DB7, 0x2DB7 }, { 0x2DBF, 0x2DBF },
+    { 0x2DC7, 0x2DC7 }, { 0x2DCF, 0x2DCF }, { 0x2DD7, 0x2DD7 },
+    { 0x2DDF, 0x2DDF }, { 0x2E3C, 0x2E7F }, { 0x2E9A, 0x2E9A },
+    { 0x2EF4, 0x2EFF }, { 0x2FD6, 0x2FEF }, { 0x2FFC, 0x2FFF },
+    { 0x3040, 0x3040 }, { 0x3097, 0x3098 }, { 0x3100, 0x3104 },
+    { 0x312E, 0x3130 }, { 0x3164, 0x3164 }, { 0x318F, 0x318F },
+    { 0x31BB, 0x31BF }, { 0x31E4, 0x31EF }, { 0x321F, 0x321F },
+    { 0x32FF, 0x32FF }, { 0x4DB6, 0x4DBF }, { 0x9FCD, 0x9FFF },
+    { 0xA48D, 0xA48F }, { 0xA4C7, 0xA4CF }, { 0xA62C, 0xA63F },
+    { 0xA698, 0xA69E }, { 0xA6F8, 0xA6FF }, { 0xA78F, 0xA78F },
+    { 0xA794, 0xA79F }, { 0xA7AB, 0xA7F7 }, { 0xA82C, 0xA82F },
+    { 0xA83A, 0xA83F }, { 0xA878, 0xA87F }, { 0xA8C5, 0xA8CD },
+    { 0xA8DA, 0xA8DF }, { 0xA8FC, 0xA8FF }, { 0xA954, 0xA95E },
+    { 0xA97D, 0xA97F }, { 0xA9CE, 0xA9CE }, { 0xA9DA, 0xA9DD },
+    { 0xA9E0, 0xA9FF }, { 0xAA37, 0xAA3F }, { 0xAA4E, 0xAA4F },
+    { 0xAA5A, 0xAA5B }, { 0xAA7C, 0xAA7F }, { 0xAAC3, 0xAADA },
+    { 0xAAF7, 0xAB00 }, { 0xAB07, 0xAB08 }, { 0xAB0F, 0xAB10 },
+    { 0xAB17, 0xAB1F }, { 0xAB27, 0xAB27 }, { 0xAB2F, 0xABBF },
+    { 0xABEE, 0xABEF }, { 0xABFA, 0xABFF }, { 0xD7A4, 0xD7AF },
+    { 0xD7C7, 0xD7CA }, { 0xD7FC, 0xDFFF }, { 0xFA6E, 0xFA6F },
+    { 0xFADA, 0xFAFF }, { 0xFB07, 0xFB12 }, { 0xFB18, 0xFB1C },
+    { 0xFB37, 0xFB37 }, { 0xFB3D, 0xFB3D }, { 0xFB3F, 0xFB3F },
+    { 0xFB42, 0xFB42 }, { 0xFB45, 0xFB45 }, { 0xFBC2, 0xFBD2 },
+    { 0xFD40, 0xFD4F }, { 0xFD90, 0xFD91 }, { 0xFDC8, 0xFDEF },
+    { 0xFDFE, 0xFE0F }, { 0xFE1A, 0xFE1F }, { 0xFE27, 0xFE2F },
+    { 0xFE53, 0xFE53 }, { 0xFE67, 0xFE67 }, { 0xFE6C, 0xFE6F },
+    { 0xFE75, 0xFE75 }, { 0xFEFD, 0xFEFF }, { 0xFF00, 0xFF00 },
+    { 0xFFA0, 0xFFA0 }, { 0xFFBF, 0xFFC1 }, { 0xFFC8, 0xFFC9 },
+    { 0xFFD0, 0xFFD1 }, { 0xFFD8, 0xFFD9 }, { 0xFFDD, 0xFFDF },
+    { 0xFFE7, 0xFFE7 }, { 0xFFEF, 0xFFFB }, { 0xFFFE, 0xFFFF },
+    { 0x1000C, 0x1000C }, { 0x10027, 0x10027 }, { 0x1003B, 0x1003B },
+    { 0x1003E, 0x1003E }, { 0x1004E, 0x1004F }, { 0x1005E, 0x1007F },
+    { 0x100FB, 0x100FF }, { 0x10103, 0x10106 }, { 0x10134, 0x10136 },
+    { 0x1018B, 0x1018F }, { 0x1019C, 0x101CF }, { 0x101FE, 0x1027F },
+    { 0x1029D, 0x1029F }, { 0x102D1, 0x102FF }, { 0x1031F, 0x1031F },
+    { 0x10324, 0x1032F }, { 0x1034B, 0x1037F }, { 0x1039E, 0x1039E },
+    { 0x103C4, 0x103C7 }, { 0x103D6, 0x103FF }, { 0x1049E, 0x1049F },
+    { 0x104AA, 0x107FF }, { 0x10806, 0x10807 }, { 0x10809, 0x10809 },
+    { 0x10836, 0x10836 }, { 0x10839, 0x1083B }, { 0x1083D, 0x1083E },
+    { 0x10856, 0x10856 }, { 0x10860, 0x108FF }, { 0x1091C, 0x1091E },
+    { 0x1093A, 0x1093E }, { 0x10940, 0x1097F }, { 0x109B8, 0x109BD },
+    { 0x109C0, 0x109FF }, { 0x10A04, 0x10A04 }, { 0x10A07, 0x10A0B },
+    { 0x10A14, 0x10A14 }, { 0x10A18, 0x10A18 }, { 0x10A34, 0x10A37 },
+    { 0x10A3B, 0x10A3E }, { 0x10A48, 0x10A4F }, { 0x10A59, 0x10A5F },
+    { 0x10A80, 0x10AFF }, { 0x10B36, 0x10B38 }, { 0x10B56, 0x10B57 },
+    { 0x10B73, 0x10B77 }, { 0x10B80, 0x10BFF }, { 0x10C49, 0x10E5F },
+    { 0x10E7F, 0x10FFF }, { 0x1104E, 0x11051 }, { 0x11070, 0x1107F },
+    { 0x110BD, 0x110BD }, { 0x110C2, 0x110CF }, { 0x110E9, 0x110EF },
+    { 0x110FA, 0x110FF }, { 0x11135, 0x11135 }, { 0x11144, 0x1117F },
+    { 0x111C9, 0x111CF }, { 0x111DA, 0x1167F }, { 0x116B8, 0x116BF },
+    { 0x116CA, 0x11FFF }, { 0x1236F, 0x123FF }, { 0x12463, 0x1246F },
+    { 0x12474, 0x12FFF }, { 0x1342F, 0x167FF }, { 0x16A39, 0x16EFF },
+    { 0x16F45, 0x16F4F }, { 0x16F7F, 0x16F8E }, { 0x16FA0, 0x1AFFF },
+    { 0x1B002, 0x1CFFF }, { 0x1D0F6, 0x1D0FF }, { 0x1D127, 0x1D128 },
+    { 0x1D173, 0x1D17A }, { 0x1D1DE, 0x1D1FF }, { 0x1D246, 0x1D2FF },
+    { 0x1D357, 0x1D35F }, { 0x1D372, 0x1D3FF }, { 0x1D455, 0x1D455 },
+    { 0x1D49D, 0x1D49D }, { 0x1D4A0, 0x1D4A1 }, { 0x1D4A3, 0x1D4A4 },
+    { 0x1D4A7, 0x1D4A8 }, { 0x1D4AD, 0x1D4AD }, { 0x1D4BA, 0x1D4BA },
+    { 0x1D4BC, 0x1D4BC }, { 0x1D4C4, 0x1D4C4 }, { 0x1D506, 0x1D506 },
+    { 0x1D50B, 0x1D50C }, { 0x1D515, 0x1D515 }, { 0x1D51D, 0x1D51D },
+    { 0x1D53A, 0x1D53A }, { 0x1D53F, 0x1D53F }, { 0x1D545, 0x1D545 },
+    { 0x1D547, 0x1D549 }, { 0x1D551, 0x1D551 }, { 0x1D6A6, 0x1D6A7 },
+    { 0x1D7CC, 0x1D7CD }, { 0x1D800, 0x1EDFF }, { 0x1EE04, 0x1EE04 },
+    { 0x1EE20, 0x1EE20 }, { 0x1EE23, 0x1EE23 }, { 0x1EE25, 0x1EE26 },
+    { 0x1EE28, 0x1EE28 }, { 0x1EE33, 0x1EE33 }, { 0x1EE38, 0x1EE38 },
+    { 0x1EE3A, 0x1EE3A }, { 0x1EE3C, 0x1EE41 }, { 0x1EE43, 0x1EE46 },
+    { 0x1EE48, 0x1EE48 }, { 0x1EE4A, 0x1EE4A }, { 0x1EE4C, 0x1EE4C },
+    { 0x1EE50, 0x1EE50 }, { 0x1EE53, 0x1EE53 }, { 0x1EE55, 0x1EE56 },
+    { 0x1EE58, 0x1EE58 }, { 0x1EE5A, 0x1EE5A }, { 0x1EE5C, 0x1EE5C },
+    { 0x1EE5E, 0x1EE5E }, { 0x1EE60, 0x1EE60 }, { 0x1EE63, 0x1EE63 },
+    { 0x1EE65, 0x1EE66 }, { 0x1EE6B, 0x1EE6B }, { 0x1EE73, 0x1EE73 },
+    { 0x1EE78, 0x1EE78 }, { 0x1EE7D, 0x1EE7D }, { 0x1EE7F, 0x1EE7F },
+    { 0x1EE8A, 0x1EE8A }, { 0x1EE9C, 0x1EEA0 }, { 0x1EEA4, 0x1EEA4 },
+    { 0x1EEAA, 0x1EEAA }, { 0x1EEBC, 0x1EEEF }, { 0x1EEF2, 0x1EFFF },
+    { 0x1F02C, 0x1F02F }, { 0x1F094, 0x1F09F }, { 0x1F0AF, 0x1F0B0 },
+    { 0x1F0BF, 0x1F0C0 }, { 0x1F0D0, 0x1F0D0 }, { 0x1F0E0, 0x1F0FF },
+    { 0x1F10B, 0x1F10F }, { 0x1F12F, 0x1F12F }, { 0x1F16C, 0x1F16F },
+    { 0x1F19B, 0x1F1E5 }, { 0x1F203, 0x1F20F }, { 0x1F23B, 0x1F23F },
+    { 0x1F249, 0x1F24F }, { 0x1F252, 0x1F2FF }, { 0x1F321, 0x1F32F },
+    { 0x1F336, 0x1F336 }, { 0x1F37D, 0x1F37F }, { 0x1F394, 0x1F39F },
+    { 0x1F3C5, 0x1F3C5 }, { 0x1F3CB, 0x1F3DF }, { 0x1F3F1, 0x1F3FF },
+    { 0x1F43F, 0x1F43F }, { 0x1F441, 0x1F441 }, { 0x1F4F8, 0x1F4F8 },
+    { 0x1F4FD, 0x1F4FF }, { 0x1F53E, 0x1F53F }, { 0x1F544, 0x1F54F },
+    { 0x1F568, 0x1F5FA }, { 0x1F641, 0x1F644 }, { 0x1F650, 0x1F67F },
+    { 0x1F6C6, 0x1F6FF }, { 0x1F774, 0x1FFFF }, { 0x2A6D7, 0x2A6FF },
+    { 0x2B735, 0x2B73F }, { 0x2B81E, 0x2F7FF }, { 0x2FA1E, 0xF0000 },
+    { 0xFFFFE, 0xFFFFF }, { 0x10FFFE, 0x10FFFF }
+  };
+  static const UnicodeCharSet NonPrintables(NonPrintableRanges);
+
+  return UCS >= 0 && UCS <= 0x10FFFF && !NonPrintables.contains(UCS);
+}
+
+/// Gets the number of positions a character is likely to occupy when output
+/// on a terminal ("character width"). This depends on the implementation of the
+/// terminal, and there's no standard definition of character width.
+/// The implementation defines it in a way that is expected to be compatible
+/// with a generic Unicode-capable terminal.
+/// \return Character width:
+///   * ErrorNonPrintableCharacter (-1) for non-printable characters (as
+///     identified by isPrintable);
+///   * 0 for non-spacing and enclosing combining marks;
+///   * 2 for CJK characters excluding halfwidth forms;
+///   * 1 for all remaining characters.
+static inline int charWidth(int UCS)
+{
+  if (!isPrintable(UCS))
+    return ErrorNonPrintableCharacter;
+
+  // Sorted list of non-spacing and enclosing combining mark intervals as
+  // defined in "3.6 Combination" of
+  // http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
+  static const UnicodeCharRange CombiningCharacterRanges[] = {
+    { 0x0300, 0x036F }, { 0x0483, 0x0489 }, { 0x0591, 0x05BD },
+    { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 }, { 0x05C4, 0x05C5 },
+    { 0x05C7, 0x05C7 }, { 0x0610, 0x061A }, { 0x064B, 0x065F },
+    { 0x0670, 0x0670 }, { 0x06D6, 0x06DC }, { 0x06DF, 0x06E4 },
+    { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED }, { 0x0711, 0x0711 },
+    { 0x0730, 0x074A }, { 0x07A6, 0x07B0 }, { 0x07EB, 0x07F3 },
+    { 0x0816, 0x0819 }, { 0x081B, 0x0823 }, { 0x0825, 0x0827 },
+    { 0x0829, 0x082D }, { 0x0859, 0x085B }, { 0x08E4, 0x08FE },
+    { 0x0900, 0x0902 }, { 0x093A, 0x093A }, { 0x093C, 0x093C },
+    { 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0957 },
+    { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC },
+    { 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 },
+    { 0x0A01, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 },
+    { 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A51, 0x0A51 },
+    { 0x0A70, 0x0A71 }, { 0x0A75, 0x0A75 }, { 0x0A81, 0x0A82 },
+    { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 }, { 0x0AC7, 0x0AC8 },
+    { 0x0ACD, 0x0ACD }, { 0x0AE2, 0x0AE3 }, { 0x0B01, 0x0B01 },
+    { 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B44 },
+    { 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B62, 0x0B63 },
+    { 0x0B82, 0x0B82 }, { 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD },
+    { 0x0C3E, 0x0C40 }, { 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D },
+    { 0x0C55, 0x0C56 }, { 0x0C62, 0x0C63 }, { 0x0CBC, 0x0CBC },
+    { 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },
+    { 0x0CE2, 0x0CE3 }, { 0x0D41, 0x0D44 }, { 0x0D4D, 0x0D4D },
+    { 0x0D62, 0x0D63 }, { 0x0DCA, 0x0DCA }, { 0x0DD2, 0x0DD4 },
+    { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 }, { 0x0E34, 0x0E3A },
+    { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 }, { 0x0EB4, 0x0EB9 },
+    { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD }, { 0x0F18, 0x0F19 },
+    { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 }, { 0x0F39, 0x0F39 },
+    { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 }, { 0x0F86, 0x0F87 },
+    { 0x0F8D, 0x0F97 }, { 0x0F99, 0x0FBC }, { 0x0FC6, 0x0FC6 },
+    { 0x102D, 0x1030 }, { 0x1032, 0x1037 }, { 0x1039, 0x103A },
+    { 0x103D, 0x103E }, { 0x1058, 0x1059 }, { 0x105E, 0x1060 },
+    { 0x1071, 0x1074 }, { 0x1082, 0x1082 }, { 0x1085, 0x1086 },
+    { 0x108D, 0x108D }, { 0x109D, 0x109D }, { 0x135D, 0x135F },
+    { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 },
+    { 0x1772, 0x1773 }, { 0x17B4, 0x17B5 }, { 0x17B7, 0x17BD },
+    { 0x17C6, 0x17C6 }, { 0x17C9, 0x17D3 }, { 0x17DD, 0x17DD },
+    { 0x180B, 0x180D }, { 0x18A9, 0x18A9 }, { 0x1920, 0x1922 },
+    { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193B },
+    { 0x1A17, 0x1A18 }, { 0x1A56, 0x1A56 }, { 0x1A58, 0x1A5E },
+    { 0x1A60, 0x1A60 }, { 0x1A62, 0x1A62 }, { 0x1A65, 0x1A6C },
+    { 0x1A73, 0x1A7C }, { 0x1A7F, 0x1A7F }, { 0x1B00, 0x1B03 },
+    { 0x1B34, 0x1B34 }, { 0x1B36, 0x1B3A }, { 0x1B3C, 0x1B3C },
+    { 0x1B42, 0x1B42 }, { 0x1B6B, 0x1B73 }, { 0x1B80, 0x1B81 },
+    { 0x1BA2, 0x1BA5 }, { 0x1BA8, 0x1BA9 }, { 0x1BAB, 0x1BAB },
+    { 0x1BE6, 0x1BE6 }, { 0x1BE8, 0x1BE9 }, { 0x1BED, 0x1BED },
+    { 0x1BEF, 0x1BF1 }, { 0x1C2C, 0x1C33 }, { 0x1C36, 0x1C37 },
+    { 0x1CD0, 0x1CD2 }, { 0x1CD4, 0x1CE0 }, { 0x1CE2, 0x1CE8 },
+    { 0x1CED, 0x1CED }, { 0x1CF4, 0x1CF4 }, { 0x1DC0, 0x1DE6 },
+    { 0x1DFC, 0x1DFF }, { 0x20D0, 0x20F0 }, { 0x2CEF, 0x2CF1 },
+    { 0x2D7F, 0x2D7F }, { 0x2DE0, 0x2DFF }, { 0x302A, 0x302D },
+    { 0x3099, 0x309A }, { 0xA66F, 0xA672 }, { 0xA674, 0xA67D },
+    { 0xA69F, 0xA69F }, { 0xA6F0, 0xA6F1 }, { 0xA802, 0xA802 },
+    { 0xA806, 0xA806 }, { 0xA80B, 0xA80B }, { 0xA825, 0xA826 },
+    { 0xA8C4, 0xA8C4 }, { 0xA8E0, 0xA8F1 }, { 0xA926, 0xA92D },
+    { 0xA947, 0xA951 }, { 0xA980, 0xA982 }, { 0xA9B3, 0xA9B3 },
+    { 0xA9B6, 0xA9B9 }, { 0xA9BC, 0xA9BC }, { 0xAA29, 0xAA2E },
+    { 0xAA31, 0xAA32 }, { 0xAA35, 0xAA36 }, { 0xAA43, 0xAA43 },
+    { 0xAA4C, 0xAA4C }, { 0xAAB0, 0xAAB0 }, { 0xAAB2, 0xAAB4 },
+    { 0xAAB7, 0xAAB8 }, { 0xAABE, 0xAABF }, { 0xAAC1, 0xAAC1 },
+    { 0xAAEC, 0xAAED }, { 0xAAF6, 0xAAF6 }, { 0xABE5, 0xABE5 },
+    { 0xABE8, 0xABE8 }, { 0xABED, 0xABED }, { 0xFB1E, 0xFB1E },
+    { 0xFE00, 0xFE0F }, { 0xFE20, 0xFE26 }, { 0x101FD, 0x101FD },
+    { 0x10A01, 0x10A03 }, { 0x10A05, 0x10A06 }, { 0x10A0C, 0x10A0F },
+    { 0x10A38, 0x10A3A }, { 0x10A3F, 0x10A3F }, { 0x11001, 0x11001 },
+    { 0x11038, 0x11046 }, { 0x11080, 0x11081 }, { 0x110B3, 0x110B6 },
+    { 0x110B9, 0x110BA }, { 0x11100, 0x11102 }, { 0x11127, 0x1112B },
+    { 0x1112D, 0x11134 }, { 0x11180, 0x11181 }, { 0x111B6, 0x111BE },
+    { 0x116AB, 0x116AB }, { 0x116AD, 0x116AD }, { 0x116B0, 0x116B5 },
+    { 0x116B7, 0x116B7 }, { 0x16F8F, 0x16F92 }, { 0x1D167, 0x1D169 },
+    { 0x1D17B, 0x1D182 }, { 0x1D185, 0x1D18B }, { 0x1D1AA, 0x1D1AD },
+    { 0x1D242, 0x1D244 }, { 0xE0100, 0xE01EF },
+  };
+  static const UnicodeCharSet CombiningCharacters(CombiningCharacterRanges);
+
+  if (CombiningCharacters.contains(UCS))
+    return 0;
+
+  static const UnicodeCharRange DoubleWidthCharacterRanges[] = {
+    // Hangul Jamo
+    { 0x1100, 0x11FF },
+    // Deprecated fullwidth angle brackets
+    { 0x2329, 0x232A },
+    // CJK Misc, CJK Unified Ideographs, Yijing Hexagrams, Yi
+    // excluding U+303F (IDEOGRAPHIC HALF FILL SPACE)
+    { 0x2E80, 0x303E }, { 0x3040, 0xA4CF },
+    // Hangul
+    { 0xAC00, 0xD7A3 }, { 0xD7B0, 0xD7C6 }, { 0xD7CB, 0xD7FB },
+    // CJK Unified Ideographs
+    { 0xF900, 0xFAFF },
+    // Vertical forms
+    { 0xFE10, 0xFE19 },
+    // CJK Compatibility Forms + Small Form Variants
+    { 0xFE30, 0xFE6F },
+    // Fullwidth forms
+    { 0xFF01, 0xFF60 }, { 0xFFE0, 0xFFE6 },
+    // CJK Unified Ideographs
+    { 0x20000, 0x2A6DF }, { 0x2A700, 0x2B81F }, { 0x2F800, 0x2FA1F }
+  };
+  static const UnicodeCharSet DoubleWidthCharacters(DoubleWidthCharacterRanges);
+
+  if (DoubleWidthCharacters.contains(UCS))
+    return 2;
+  return 1;
+}
+
+int columnWidthUTF8(StringRef Text) {
+  unsigned ColumnWidth = 0;
+  unsigned Length;
+  for (size_t i = 0, e = Text.size(); i < e; i += Length) {
+    Length = getNumBytesForUTF8(Text[i]);
+    if (Length <= 0 || i + Length > Text.size())
+      return ErrorInvalidUTF8;
+    UTF32 buf[1];
+    const UTF8 *Start = reinterpret_cast<const UTF8 *>(Text.data() + i);
+    UTF32 *Target = &buf[0];
+    if (conversionOK != ConvertUTF8toUTF32(&Start, Start + Length, &Target,
+                                           Target + 1, strictConversion))
+      return ErrorInvalidUTF8;
+    int Width = charWidth(buf[0]);
+    if (Width < 0)
+      return ErrorNonPrintableCharacter;
+    ColumnWidth += Width;
+  }
+  return ColumnWidth;
+}
+
+} // namespace unicode
+} // namespace sys
+} // namespace llvm
+
diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc
index 72a8af6..58fda42 100644
--- a/lib/Support/Unix/Memory.inc
+++ b/lib/Support/Unix/Memory.inc
@@ -32,7 +32,11 @@
 #  endif
 #endif
 
+#ifdef __APPLE__
 extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+#else
+extern "C" void __clear_cache(void *, void*);
+#endif
 
 namespace {
 
@@ -267,6 +271,9 @@ bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
   kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
     (vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
   return KERN_SUCCESS == kr;
+#elif defined(__arm__) || defined(__aarch64__)
+  Memory::InvalidateInstructionCache(M.Address, M.Size);
+  return true;
 #else
   return true;
 #endif
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 6a5ebb8..c9dc871 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -1,4 +1,4 @@
-//===- llvm/Support/Unix/Path.cpp - Unix Path Implementation -----*- C++ -*-===//
+//===- llvm/Support/Unix/Path.inc - Unix Path Implementation ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the Unix specific portion of the Path class.
+// This file implements the Unix specific implementation of the Path API.
 //
 //===----------------------------------------------------------------------===//
 
@@ -17,6 +17,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "Unix.h"
+#include "llvm/Support/Process.h"
+#include <limits.h>
+#include <stdio.h>
 #if HAVE_SYS_STAT_H
 #include <sys/stat.h>
 #endif
@@ -26,15 +29,6 @@
 #ifdef HAVE_SYS_MMAN_H
 #include <sys/mman.h>
 #endif
-#ifdef HAVE_SYS_STAT_H
-#include <sys/stat.h>
-#endif
-#if HAVE_UTIME_H
-#include <utime.h>
-#endif
-#if HAVE_TIME_H
-#include <time.h>
-#endif
 #if HAVE_DIRENT_H
 # include <dirent.h>
 # define NAMLEN(dirent) strlen((dirent)->d_name)
@@ -52,217 +46,143 @@
 # endif
 #endif
 
-#if HAVE_DLFCN_H
-#include <dlfcn.h>
-#endif
-
 #ifdef __APPLE__
 #include <mach-o/dyld.h>
 #endif
 
+// Both stdio.h and cstdio are included via different pathes and
+// stdcxx's cstdio doesn't include stdio.h, so it doesn't #undef the macros
+// either.
+#undef ferror
+#undef feof
+
 // For GNU Hurd
-#if defined(__GNU__) && !defined(MAXPATHLEN)
-# define MAXPATHLEN 4096
+#if defined(__GNU__) && !defined(PATH_MAX)
+# define PATH_MAX 4096
 #endif
 
-// Put in a hack for Cygwin which falsely reports that the mkdtemp function
-// is available when it is not.
-#ifdef __CYGWIN__
-# undef HAVE_MKDTEMP
-#endif
+using namespace llvm;
 
 namespace {
-inline bool lastIsSlash(const std::string& path) {
-  return !path.empty() && path[path.length() - 1] == '/';
-}
-
-}
+  /// This class automatically closes the given file descriptor when it goes out
+  /// of scope. You can take back explicit ownership of the file descriptor by
+  /// calling take(). The destructor does not verify that close was successful.
+  /// Therefore, never allow this class to call close on a file descriptor that
+  /// has been read from or written to.
+  struct AutoFD {
+    int FileDescriptor;
+
+    AutoFD(int fd) : FileDescriptor(fd) {}
+    ~AutoFD() {
+      if (FileDescriptor >= 0)
+        ::close(FileDescriptor);
+    }
 
-namespace llvm {
-using namespace sys;
+    int take() {
+      int ret = FileDescriptor;
+      FileDescriptor = -1;
+      return ret;
+    }
 
-const char sys::PathSeparator = ':';
+    operator int() const {return FileDescriptor;}
+  };
+
+  error_code TempDir(SmallVectorImpl<char> &result) {
+    // FIXME: Don't use TMPDIR if program is SUID or SGID enabled.
+    const char *dir = 0;
+    (dir = std::getenv("TMPDIR" )) ||
+    (dir = std::getenv("TMP"    )) ||
+    (dir = std::getenv("TEMP"   )) ||
+    (dir = std::getenv("TEMPDIR")) ||
+#ifdef P_tmpdir
+    (dir = P_tmpdir) ||
+#endif
+    (dir = "/tmp");
 
-StringRef Path::GetEXESuffix() {
-  return StringRef();
+    result.clear();
+    StringRef d(dir);
+    result.append(d.begin(), d.end());
+    return error_code::success();
+  }
 }
 
-Path::Path(StringRef p)
-  : path(p) {}
-
-Path::Path(const char *StrStart, unsigned StrLen)
-  : path(StrStart, StrLen) {}
-
-Path&
-Path::operator=(StringRef that) {
-  path.assign(that.data(), that.size());
-  return *this;
-}
+static error_code createUniqueEntity(const Twine &Model, int &ResultFD,
+                                     SmallVectorImpl<char> &ResultPath,
+                                     bool MakeAbsolute, unsigned Mode,
+                                     FSEntity Type) {
+  SmallString<128> ModelStorage;
+  Model.toVector(ModelStorage);
+
+  if (MakeAbsolute) {
+    // Make model absolute by prepending a temp directory if it's not already.
+    bool absolute = sys::path::is_absolute(Twine(ModelStorage));
+    if (!absolute) {
+      SmallString<128> TDir;
+      if (error_code ec = TempDir(TDir)) return ec;
+      sys::path::append(TDir, Twine(ModelStorage));
+      ModelStorage.swap(TDir);
+    }
+  }
 
-bool
-Path::isValid() const {
-  // Empty paths are considered invalid here.
-  // This code doesn't check MAXPATHLEN because there's no need. Nothing in
-  // LLVM manipulates Paths with fixed-sizes arrays, and if the OS can't
-  // handle names longer than some limit, it'll report this on demand using
-  // ENAMETOLONG.
-  return !path.empty();
-}
+  // From here on, DO NOT modify model. It may be needed if the randomly chosen
+  // path already exists.
+  ResultPath = ModelStorage;
+  // Null terminate.
+  ResultPath.push_back(0);
+  ResultPath.pop_back();
+
+retry_random_path:
+  // Replace '%' with random chars.
+  for (unsigned i = 0, e = ModelStorage.size(); i != e; ++i) {
+    if (ModelStorage[i] == '%')
+      ResultPath[i] = "0123456789abcdef"[sys::Process::GetRandomNumber() & 15];
+  }
 
-bool
-Path::isAbsolute(const char *NameStart, unsigned NameLen) {
-  assert(NameStart);
-  if (NameLen == 0)
-    return false;
-  return NameStart[0] == '/';
-}
+  // Try to open + create the file.
+  switch (Type) {
+  case FS_File: {
+    int RandomFD = ::open(ResultPath.begin(), O_RDWR | O_CREAT | O_EXCL, Mode);
+    if (RandomFD == -1) {
+      int SavedErrno = errno;
+      // If the file existed, try again, otherwise, error.
+      if (SavedErrno == errc::file_exists)
+        goto retry_random_path;
+      return error_code(SavedErrno, system_category());
+    }
 
-bool
-Path::isAbsolute() const {
-  if (path.empty())
-    return false;
-  return path[0] == '/';
-}
-
-Path
-Path::GetRootDirectory() {
-  Path result;
-  result.set("/");
-  return result;
-}
-
-Path
-Path::GetTemporaryDirectory(std::string *ErrMsg) {
-#if defined(HAVE_MKDTEMP)
-  // The best way is with mkdtemp but that's not available on many systems,
-  // Linux and FreeBSD have it. Others probably won't.
-  char pathname[] = "/tmp/llvm_XXXXXX";
-  if (0 == mkdtemp(pathname)) {
-    MakeErrMsg(ErrMsg,
-               std::string(pathname) + ": can't create temporary directory");
-    return Path();
-  }
-  return Path(pathname);
-#elif defined(HAVE_MKSTEMP)
-  // If no mkdtemp is available, mkstemp can be used to create a temporary file
-  // which is then removed and created as a directory. We prefer this over
-  // mktemp because of mktemp's inherent security and threading risks. We still
-  // have a slight race condition from the time the temporary file is created to
-  // the time it is re-created as a directoy.
-  char pathname[] = "/tmp/llvm_XXXXXX";
-  int fd = 0;
-  if (-1 == (fd = mkstemp(pathname))) {
-    MakeErrMsg(ErrMsg,
-      std::string(pathname) + ": can't create temporary directory");
-    return Path();
-  }
-  ::close(fd);
-  ::unlink(pathname); // start race condition, ignore errors
-  if (-1 == ::mkdir(pathname, S_IRWXU)) { // end race condition
-    MakeErrMsg(ErrMsg,
-      std::string(pathname) + ": can't create temporary directory");
-    return Path();
-  }
-  return Path(pathname);
-#elif defined(HAVE_MKTEMP)
-  // If a system doesn't have mkdtemp(3) or mkstemp(3) but it does have
-  // mktemp(3) then we'll assume that system (e.g. AIX) has a reasonable
-  // implementation of mktemp(3) and doesn't follow BSD 4.3's lead of replacing
-  // the XXXXXX with the pid of the process and a letter. That leads to only
-  // twenty six temporary files that can be generated.
-  char pathname[] = "/tmp/llvm_XXXXXX";
-  char *TmpName = ::mktemp(pathname);
-  if (TmpName == 0) {
-    MakeErrMsg(ErrMsg,
-      std::string(TmpName) + ": can't create unique directory name");
-    return Path();
-  }
-  if (-1 == ::mkdir(TmpName, S_IRWXU)) {
-    MakeErrMsg(ErrMsg,
-        std::string(TmpName) + ": can't create temporary directory");
-    return Path();
+    ResultFD = RandomFD;
+    return error_code::success();
   }
-  return Path(TmpName);
-#else
-  // This is the worst case implementation. tempnam(3) leaks memory unless its
-  // on an SVID2 (or later) system. On BSD 4.3 it leaks. tmpnam(3) has thread
-  // issues. The mktemp(3) function doesn't have enough variability in the
-  // temporary name generated. So, we provide our own implementation that
-  // increments an integer from a random number seeded by the current time. This
-  // should be sufficiently unique that we don't have many collisions between
-  // processes. Generally LLVM processes don't run very long and don't use very
-  // many temporary files so this shouldn't be a big issue for LLVM.
-  static time_t num = ::time(0);
-  char pathname[MAXPATHLEN];
-  do {
-    num++;
-    sprintf(pathname, "/tmp/llvm_%010u", unsigned(num));
-  } while ( 0 == access(pathname, F_OK ) );
-  if (-1 == ::mkdir(pathname, S_IRWXU)) {
-    MakeErrMsg(ErrMsg,
-      std::string(pathname) + ": can't create temporary directory");
-    return Path();
-  }
-  return Path(pathname);
-#endif
-}
 
-void
-Path::GetSystemLibraryPaths(std::vector<sys::Path>& Paths) {
-#ifdef LTDL_SHLIBPATH_VAR
-  char* env_var = getenv(LTDL_SHLIBPATH_VAR);
-  if (env_var != 0) {
-    getPathList(env_var,Paths);
-  }
-#endif
-  // FIXME: Should this look at LD_LIBRARY_PATH too?
-  Paths.push_back(sys::Path("/usr/local/lib/"));
-  Paths.push_back(sys::Path("/usr/X11R6/lib/"));
-  Paths.push_back(sys::Path("/usr/lib/"));
-  Paths.push_back(sys::Path("/lib/"));
-}
-
-void
-Path::GetBitcodeLibraryPaths(std::vector<sys::Path>& Paths) {
-  char * env_var = getenv("LLVM_LIB_SEARCH_PATH");
-  if (env_var != 0) {
-    getPathList(env_var,Paths);
-  }
-#ifdef LLVM_LIBDIR
-  {
-    Path tmpPath;
-    if (tmpPath.set(LLVM_LIBDIR))
-      if (tmpPath.canRead())
-        Paths.push_back(tmpPath);
+  case FS_Name: {
+    bool Exists;
+    error_code EC = sys::fs::exists(ResultPath.begin(), Exists);
+    if (EC)
+      return EC;
+    if (Exists)
+      goto retry_random_path;
+    return error_code::success();
   }
-#endif
-  GetSystemLibraryPaths(Paths);
-}
 
-Path
-Path::GetUserHomeDirectory() {
-  const char* home = getenv("HOME");
-  Path result;
-  if (home && result.set(home))
-    return result;
-  result.set("/");
-  return result;
-}
-
-Path
-Path::GetCurrentDirectory() {
-  char pathname[MAXPATHLEN];
-  if (!getcwd(pathname, MAXPATHLEN)) {
-    assert(false && "Could not query current working directory.");
-    return Path();
+  case FS_Dir: {
+    bool Existed;
+    error_code EC = sys::fs::create_directory(ResultPath.begin(), Existed);
+    if (EC)
+      return EC;
+    if (Existed)
+      goto retry_random_path;
+    return error_code::success();
   }
-
-  return Path(pathname);
+  }
+  llvm_unreachable("Invalid Type");
 }
 
+namespace llvm {
+namespace sys  {
+namespace fs {
 #if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
     defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
-    defined(__linux__) || defined(__CYGWIN__)
+    defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__)
 static int
 test_dir(char buf[PATH_MAX], char ret[PATH_MAX],
     const char *dir, const char *bin)
@@ -318,7 +238,7 @@ getprogpath(char ret[PATH_MAX], const char *bin)
 
 /// GetMainExecutable - Return the path to the main executable, given the
 /// value of argv[0] from program startup.
-Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
+std::string getMainExecutable(const char *argv0, void *MainAddr) {
 #if defined(__APPLE__)
   // On OS X the executable path is saved to the stack by dyld. Reading it
   // from there is much faster than calling dladdr, especially for large
@@ -328,14 +248,15 @@ Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
   if (_NSGetExecutablePath(exe_path, &size) == 0) {
     char link_path[MAXPATHLEN];
     if (realpath(exe_path, link_path))
-      return Path(link_path);
+      return link_path;
   }
 #elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
-      defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__)
+      defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \
+      defined(__FreeBSD_kernel__)
   char exe_path[PATH_MAX];
 
   if (getprogpath(exe_path, argv0) != NULL)
-    return Path(exe_path);
+    return exe_path;
 #elif defined(__linux__) || defined(__CYGWIN__)
   char exe_path[MAXPATHLEN];
   StringRef aPath("/proc/self/exe");
@@ -343,558 +264,534 @@ Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
       // /proc is not always mounted under Linux (chroot for example).
       ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
       if (len >= 0)
-          return Path(StringRef(exe_path, len));
+          return StringRef(exe_path, len);
   } else {
       // Fall back to the classical detection.
       if (getprogpath(exe_path, argv0) != NULL)
-          return Path(exe_path);
+          return exe_path;
   }
 #elif defined(HAVE_DLFCN_H)
   // Use dladdr to get executable path if available.
   Dl_info DLInfo;
   int err = dladdr(MainAddr, &DLInfo);
   if (err == 0)
-    return Path();
+    return "";
 
   // If the filename is a symlink, we need to resolve and return the location of
   // the actual executable.
   char link_path[MAXPATHLEN];
   if (realpath(DLInfo.dli_fname, link_path))
-    return Path(link_path);
+    return link_path;
 #else
 #error GetMainExecutable is not implemented on this host yet.
 #endif
-  return Path();
+  return "";
 }
 
+TimeValue file_status::getLastModificationTime() const {
+  TimeValue Ret;
+  Ret.fromEpochTime(fs_st_mtime);
+  return Ret;
+}
 
-StringRef Path::getDirname() const {
-  return getDirnameCharSep(path, "/");
+UniqueID file_status::getUniqueID() const {
+  return UniqueID(fs_st_dev, fs_st_ino);
 }
 
-StringRef
-Path::getBasename() const {
-  // Find the last slash
-  std::string::size_type slash = path.rfind('/');
-  if (slash == std::string::npos)
-    slash = 0;
-  else
-    slash++;
+error_code current_path(SmallVectorImpl<char> &result) {
+  result.clear();
 
-  std::string::size_type dot = path.rfind('.');
-  if (dot == std::string::npos || dot < slash)
-    return StringRef(path).substr(slash);
-  else
-    return StringRef(path).substr(slash, dot - slash);
-}
+  const char *pwd = ::getenv("PWD");
+  llvm::sys::fs::file_status PWDStatus, DotStatus;
+  if (pwd && llvm::sys::path::is_absolute(pwd) &&
+      !llvm::sys::fs::status(pwd, PWDStatus) &&
+      !llvm::sys::fs::status(".", DotStatus) &&
+      PWDStatus.getUniqueID() == DotStatus.getUniqueID()) {
+    result.append(pwd, pwd + strlen(pwd));
+    return error_code::success();
+  }
 
-StringRef
-Path::getSuffix() const {
-  // Find the last slash
-  std::string::size_type slash = path.rfind('/');
-  if (slash == std::string::npos)
-    slash = 0;
-  else
-    slash++;
+#ifdef MAXPATHLEN
+  result.reserve(MAXPATHLEN);
+#else
+// For GNU Hurd
+  result.reserve(1024);
+#endif
 
-  std::string::size_type dot = path.rfind('.');
-  if (dot == std::string::npos || dot < slash)
-    return StringRef();
-  else
-    return StringRef(path).substr(dot + 1);
-}
+  while (true) {
+    if (::getcwd(result.data(), result.capacity()) == 0) {
+      // See if there was a real error.
+      if (errno != errc::not_enough_memory)
+        return error_code(errno, system_category());
+      // Otherwise there just wasn't enough space.
+      result.reserve(result.capacity() * 2);
+    } else
+      break;
+  }
 
-bool Path::getMagicNumber(std::string &Magic, unsigned len) const {
-  assert(len < 1024 && "Request for magic string too long");
-  char Buf[1025];
-  int fd = ::open(path.c_str(), O_RDONLY);
-  if (fd < 0)
-    return false;
-  ssize_t bytes_read = ::read(fd, Buf, len);
-  ::close(fd);
-  if (ssize_t(len) != bytes_read)
-    return false;
-  Magic.assign(Buf, len);
-  return true;
+  result.set_size(strlen(result.data()));
+  return error_code::success();
 }
 
-bool
-Path::exists() const {
-  return 0 == access(path.c_str(), F_OK );
-}
+error_code create_directory(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
 
-bool
-Path::isDirectory() const {
-  struct stat buf;
-  if (0 != stat(path.c_str(), &buf))
-    return false;
-  return ((buf.st_mode & S_IFMT) == S_IFDIR) ? true : false;
-}
+  if (::mkdir(p.begin(), S_IRWXU | S_IRWXG) == -1) {
+    if (errno != errc::file_exists)
+      return error_code(errno, system_category());
+    existed = true;
+  } else
+    existed = false;
 
-bool
-Path::isSymLink() const {
-  struct stat buf;
-  if (0 != lstat(path.c_str(), &buf))
-    return false;
-  return S_ISLNK(buf.st_mode);
+  return error_code::success();
 }
 
+error_code create_hard_link(const Twine &to, const Twine &from) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toNullTerminatedStringRef(from_storage);
+  StringRef t = to.toNullTerminatedStringRef(to_storage);
 
-bool
-Path::canRead() const {
-  return 0 == access(path.c_str(), R_OK);
-}
+  if (::link(t.begin(), f.begin()) == -1)
+    return error_code(errno, system_category());
 
-bool
-Path::canWrite() const {
-  return 0 == access(path.c_str(), W_OK);
+  return error_code::success();
 }
 
-bool
-Path::isRegularFile() const {
-  // Get the status so we can determine if it's a file or directory
-  struct stat buf;
-
-  if (0 != stat(path.c_str(), &buf))
-    return false;
+error_code create_symlink(const Twine &to, const Twine &from) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toNullTerminatedStringRef(from_storage);
+  StringRef t = to.toNullTerminatedStringRef(to_storage);
 
-  if (S_ISREG(buf.st_mode))
-    return true;
+  if (::symlink(t.begin(), f.begin()) == -1)
+    return error_code(errno, system_category());
 
-  return false;
+  return error_code::success();
 }
 
-bool
-Path::canExecute() const {
-  if (0 != access(path.c_str(), R_OK | X_OK ))
-    return false;
+error_code remove(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
+
   struct stat buf;
-  if (0 != stat(path.c_str(), &buf))
-    return false;
-  if (!S_ISREG(buf.st_mode))
-    return false;
-  return true;
-}
+  if (stat(p.begin(), &buf) != 0) {
+    if (errno != errc::no_such_file_or_directory)
+      return error_code(errno, system_category());
+    existed = false;
+    return error_code::success();
+  }
 
-StringRef
-Path::getLast() const {
-  // Find the last slash
-  size_t pos = path.rfind('/');
+  // Note: this check catches strange situations. In all cases, LLVM should
+  // only be involved in the creation and deletion of regular files.  This
+  // check ensures that what we're trying to erase is a regular file. It
+  // effectively prevents LLVM from erasing things like /dev/null, any block
+  // special file, or other things that aren't "regular" files.
+  if (!S_ISREG(buf.st_mode) && !S_ISDIR(buf.st_mode))
+    return make_error_code(errc::operation_not_permitted);
 
-  // Handle the corner cases
-  if (pos == std::string::npos)
-    return path;
+  if (::remove(p.begin()) == -1) {
+    if (errno != errc::no_such_file_or_directory)
+      return error_code(errno, system_category());
+    existed = false;
+  } else
+    existed = true;
 
-  // If the last character is a slash
-  if (pos == path.length()-1) {
-    // Find the second to last slash
-    size_t pos2 = path.rfind('/', pos-1);
-    if (pos2 == std::string::npos)
-      return StringRef(path).substr(0,pos);
-    else
-      return StringRef(path).substr(pos2+1,pos-pos2-1);
-  }
-  // Return everything after the last slash
-  return StringRef(path).substr(pos+1);
+  return error_code::success();
 }
 
-const FileStatus *
-PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const {
-  if (!fsIsValid || update) {
-    struct stat buf;
-    if (0 != stat(path.c_str(), &buf)) {
-      MakeErrMsg(ErrStr, path + ": can't get status of file");
-      return 0;
-    }
-    status.fileSize = buf.st_size;
-    status.modTime.fromEpochTime(buf.st_mtime);
-    status.mode = buf.st_mode;
-    status.user = buf.st_uid;
-    status.group = buf.st_gid;
-    status.uniqueID = uint64_t(buf.st_ino);
-    status.isDir  = S_ISDIR(buf.st_mode);
-    status.isFile = S_ISREG(buf.st_mode);
-    fsIsValid = true;
-  }
-  return &status;
-}
+error_code rename(const Twine &from, const Twine &to) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toNullTerminatedStringRef(from_storage);
+  StringRef t = to.toNullTerminatedStringRef(to_storage);
 
-static bool AddPermissionBits(const Path &File, int bits) {
-  // Get the umask value from the operating system.  We want to use it
-  // when changing the file's permissions. Since calling umask() sets
-  // the umask and returns its old value, we must call it a second
-  // time to reset it to the user's preference.
-  int mask = umask(0777); // The arg. to umask is arbitrary.
-  umask(mask);            // Restore the umask.
+  if (::rename(f.begin(), t.begin()) == -1)
+    return error_code(errno, system_category());
 
-  // Get the file's current mode.
-  struct stat buf;
-  if (0 != stat(File.c_str(), &buf))
-    return false;
-  // Change the file to have whichever permissions bits from 'bits'
-  // that the umask would not disable.
-  if ((chmod(File.c_str(), (buf.st_mode | (bits & ~mask)))) == -1)
-      return false;
-  return true;
+  return error_code::success();
 }
 
-bool Path::makeReadableOnDisk(std::string* ErrMsg) {
-  if (!AddPermissionBits(*this, 0444))
-    return MakeErrMsg(ErrMsg, path + ": can't make file readable");
-  return false;
-}
+error_code resize_file(const Twine &path, uint64_t size) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
 
-bool Path::makeWriteableOnDisk(std::string* ErrMsg) {
-  if (!AddPermissionBits(*this, 0222))
-    return MakeErrMsg(ErrMsg, path + ": can't make file writable");
-  return false;
-}
+  if (::truncate(p.begin(), size) == -1)
+    return error_code(errno, system_category());
 
-bool Path::makeExecutableOnDisk(std::string* ErrMsg) {
-  if (!AddPermissionBits(*this, 0111))
-    return MakeErrMsg(ErrMsg, path + ": can't make file executable");
-  return false;
+  return error_code::success();
 }
 
-bool
-Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const {
-  DIR* direntries = ::opendir(path.c_str());
-  if (direntries == 0)
-    return MakeErrMsg(ErrMsg, path + ": can't open directory");
-
-  std::string dirPath = path;
-  if (!lastIsSlash(dirPath))
-    dirPath += '/';
+error_code exists(const Twine &path, bool &result) {
+  SmallString<128> path_storage;
+  StringRef p = path.toNullTerminatedStringRef(path_storage);
 
-  result.clear();
-  struct dirent* de = ::readdir(direntries);
-  for ( ; de != 0; de = ::readdir(direntries)) {
-    if (de->d_name[0] != '.') {
-      Path aPath(dirPath + (const char*)de->d_name);
-      struct stat st;
-      if (0 != lstat(aPath.path.c_str(), &st)) {
-        if (S_ISLNK(st.st_mode))
-          continue; // dangling symlink -- ignore
-        return MakeErrMsg(ErrMsg,
-                          aPath.path +  ": can't determine file object type");
-      }
-      result.insert(aPath);
-    }
-  }
+  if (::access(p.begin(), F_OK) == -1) {
+    if (errno != errc::no_such_file_or_directory)
+      return error_code(errno, system_category());
+    result = false;
+  } else
+    result = true;
 
-  closedir(direntries);
-  return false;
+  return error_code::success();
 }
 
-bool
-Path::set(StringRef a_path) {
-  if (a_path.empty())
-    return false;
-  path = a_path;
-  return true;
+bool can_write(const Twine &Path) {
+  SmallString<128> PathStorage;
+  StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+  return 0 == access(P.begin(), W_OK);
 }
 
-bool
-Path::appendComponent(StringRef name) {
-  if (name.empty())
+bool can_execute(const Twine &Path) {
+  SmallString<128> PathStorage;
+  StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+
+  if (0 != access(P.begin(), R_OK | X_OK))
+    return false;
+  struct stat buf;
+  if (0 != stat(P.begin(), &buf))
+    return false;
+  if (!S_ISREG(buf.st_mode))
     return false;
-  if (!lastIsSlash(path))
-    path += '/';
-  path += name;
   return true;
 }
 
-bool
-Path::eraseComponent() {
-  size_t slashpos = path.rfind('/',path.size());
-  if (slashpos == 0 || slashpos == std::string::npos) {
-    path.erase();
-    return true;
-  }
-  if (slashpos == path.size() - 1)
-    slashpos = path.rfind('/',slashpos-1);
-  if (slashpos == std::string::npos) {
-    path.erase();
-    return true;
-  }
-  path.erase(slashpos);
-  return true;
+bool equivalent(file_status A, file_status B) {
+  assert(status_known(A) && status_known(B));
+  return A.fs_st_dev == B.fs_st_dev &&
+         A.fs_st_ino == B.fs_st_ino;
 }
 
-bool
-Path::eraseSuffix() {
-  size_t dotpos = path.rfind('.',path.size());
-  size_t slashpos = path.rfind('/',path.size());
-  if (dotpos != std::string::npos) {
-    if (slashpos == std::string::npos || dotpos > slashpos+1) {
-      path.erase(dotpos, path.size()-dotpos);
-      return true;
-    }
-  }
-  return false;
+error_code equivalent(const Twine &A, const Twine &B, bool &result) {
+  file_status fsA, fsB;
+  if (error_code ec = status(A, fsA)) return ec;
+  if (error_code ec = status(B, fsB)) return ec;
+  result = equivalent(fsA, fsB);
+  return error_code::success();
 }
 
-static bool createDirectoryHelper(char* beg, char* end, bool create_parents) {
+static error_code fillStatus(int StatRet, const struct stat &Status,
+                             file_status &Result) {
+  if (StatRet != 0) {
+    error_code ec(errno, system_category());
+    if (ec == errc::no_such_file_or_directory)
+      Result = file_status(file_type::file_not_found);
+    else
+      Result = file_status(file_type::status_error);
+    return ec;
+  }
 
-  if (access(beg, R_OK | W_OK) == 0)
-    return false;
+  file_type Type = file_type::type_unknown;
+
+  if (S_ISDIR(Status.st_mode))
+    Type = file_type::directory_file;
+  else if (S_ISREG(Status.st_mode))
+    Type = file_type::regular_file;
+  else if (S_ISBLK(Status.st_mode))
+    Type = file_type::block_file;
+  else if (S_ISCHR(Status.st_mode))
+    Type = file_type::character_file;
+  else if (S_ISFIFO(Status.st_mode))
+    Type = file_type::fifo_file;
+  else if (S_ISSOCK(Status.st_mode))
+    Type = file_type::socket_file;
+
+  perms Perms = static_cast<perms>(Status.st_mode);
+  Result =
+      file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_mtime,
+                  Status.st_uid, Status.st_gid, Status.st_size);
+
+  return error_code::success();
+}
+
+error_code status(const Twine &Path, file_status &Result) {
+  SmallString<128> PathStorage;
+  StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+
+  struct stat Status;
+  int StatRet = ::stat(P.begin(), &Status);
+  return fillStatus(StatRet, Status, Result);
+}
+
+error_code status(int FD, file_status &Result) {
+  struct stat Status;
+  int StatRet = ::fstat(FD, &Status);
+  return fillStatus(StatRet, Status, Result);
+}
+
+error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
+#if defined(HAVE_FUTIMENS)
+  timespec Times[2];
+  Times[0].tv_sec = Time.toPosixTime();
+  Times[0].tv_nsec = 0;
+  Times[1] = Times[0];
+  if (::futimens(FD, Times))
+#elif defined(HAVE_FUTIMES)
+  timeval Times[2];
+  Times[0].tv_sec = Time.toPosixTime();
+  Times[0].tv_usec = 0;
+  Times[1] = Times[0];
+  if (::futimes(FD, Times))
+#else
+#error Missing futimes() and futimens()
+#endif
+    return error_code(errno, system_category());
+  return error_code::success();
+}
+
+error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
+  AutoFD ScopedFD(FD);
+  if (!CloseFD)
+    ScopedFD.take();
+
+  // Figure out how large the file is.
+  struct stat FileInfo;
+  if (fstat(FD, &FileInfo) == -1)
+    return error_code(errno, system_category());
+  uint64_t FileSize = FileInfo.st_size;
+
+  if (Size == 0)
+    Size = FileSize;
+  else if (FileSize < Size) {
+    // We need to grow the file.
+    if (ftruncate(FD, Size) == -1)
+      return error_code(errno, system_category());
+  }
 
-  if (create_parents) {
+  int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
+  int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
+#ifdef MAP_FILE
+  flags |= MAP_FILE;
+#endif
+  Mapping = ::mmap(0, Size, prot, flags, FD, Offset);
+  if (Mapping == MAP_FAILED)
+    return error_code(errno, system_category());
+  return error_code::success();
+}
+
+mapped_file_region::mapped_file_region(const Twine &path,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec)
+  : Mode(mode)
+  , Size(length)
+  , Mapping() {
+  // Make sure that the requested size fits within SIZE_T.
+  if (length > std::numeric_limits<size_t>::max()) {
+    ec = make_error_code(errc::invalid_argument);
+    return;
+  }
 
-    char* c = end;
+  SmallString<128> path_storage;
+  StringRef name = path.toNullTerminatedStringRef(path_storage);
+  int oflags = (mode == readonly) ? O_RDONLY : O_RDWR;
+  int ofd = ::open(name.begin(), oflags);
+  if (ofd == -1) {
+    ec = error_code(errno, system_category());
+    return;
+  }
 
-    for (; c != beg; --c)
-      if (*c == '/') {
+  ec = init(ofd, true, offset);
+  if (ec)
+    Mapping = 0;
+}
+
+mapped_file_region::mapped_file_region(int fd,
+                                       bool closefd,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec)
+  : Mode(mode)
+  , Size(length)
+  , Mapping() {
+  // Make sure that the requested size fits within SIZE_T.
+  if (length > std::numeric_limits<size_t>::max()) {
+    ec = make_error_code(errc::invalid_argument);
+    return;
+  }
 
-        // Recurse to handling the parent directory.
-        *c = '\0';
-        bool x = createDirectoryHelper(beg, c, create_parents);
-        *c = '/';
+  ec = init(fd, closefd, offset);
+  if (ec)
+    Mapping = 0;
+}
 
-        // Return if we encountered an error.
-        if (x)
-          return true;
+mapped_file_region::~mapped_file_region() {
+  if (Mapping)
+    ::munmap(Mapping, Size);
+}
 
-        break;
-      }
-  }
+#if LLVM_HAS_RVALUE_REFERENCES
+mapped_file_region::mapped_file_region(mapped_file_region &&other)
+  : Mode(other.Mode), Size(other.Size), Mapping(other.Mapping) {
+  other.Mapping = 0;
+}
+#endif
 
-  return mkdir(beg, S_IRWXU | S_IRWXG) != 0;
+mapped_file_region::mapmode mapped_file_region::flags() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Mode;
 }
 
-bool
-Path::createDirectoryOnDisk( bool create_parents, std::string* ErrMsg ) {
-  // Get a writeable copy of the path name
-  std::string pathname(path);
+uint64_t mapped_file_region::size() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Size;
+}
 
-  // Null-terminate the last component
-  size_t lastchar = path.length() - 1 ;
+char *mapped_file_region::data() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  assert(Mode != readonly && "Cannot get non const data for readonly mapping!");
+  return reinterpret_cast<char*>(Mapping);
+}
 
-  if (pathname[lastchar] != '/')
-    ++lastchar;
+const char *mapped_file_region::const_data() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return reinterpret_cast<const char*>(Mapping);
+}
 
-  pathname[lastchar] = '\0';
+int mapped_file_region::alignment() {
+  return process::get_self()->page_size();
+}
 
-  if (createDirectoryHelper(&pathname[0], &pathname[lastchar], create_parents))
-    return MakeErrMsg(ErrMsg, pathname + ": can't create directory");
+error_code detail::directory_iterator_construct(detail::DirIterState &it,
+                                                StringRef path){
+  SmallString<128> path_null(path);
+  DIR *directory = ::opendir(path_null.c_str());
+  if (directory == 0)
+    return error_code(errno, system_category());
 
-  return false;
+  it.IterationHandle = reinterpret_cast<intptr_t>(directory);
+  // Add something for replace_filename to replace.
+  path::append(path_null, ".");
+  it.CurrentEntry = directory_entry(path_null.str());
+  return directory_iterator_increment(it);
 }
 
-bool
-Path::createFileOnDisk(std::string* ErrMsg) {
-  // Create the file
-  int fd = ::creat(path.c_str(), S_IRUSR | S_IWUSR);
-  if (fd < 0)
-    return MakeErrMsg(ErrMsg, path + ": can't create file");
-  ::close(fd);
-  return false;
+error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
+  if (it.IterationHandle)
+    ::closedir(reinterpret_cast<DIR *>(it.IterationHandle));
+  it.IterationHandle = 0;
+  it.CurrentEntry = directory_entry();
+  return error_code::success();
 }
 
-bool
-Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) {
-  // Make this into a unique file name
-  if (makeUnique( reuse_current, ErrMsg ))
-    return true;
+error_code detail::directory_iterator_increment(detail::DirIterState &it) {
+  errno = 0;
+  dirent *cur_dir = ::readdir(reinterpret_cast<DIR *>(it.IterationHandle));
+  if (cur_dir == 0 && errno != 0) {
+    return error_code(errno, system_category());
+  } else if (cur_dir != 0) {
+    StringRef name(cur_dir->d_name, NAMLEN(cur_dir));
+    if ((name.size() == 1 && name[0] == '.') ||
+        (name.size() == 2 && name[0] == '.' && name[1] == '.'))
+      return directory_iterator_increment(it);
+    it.CurrentEntry.replace_filename(name);
+  } else
+    return directory_iterator_destruct(it);
 
-  // create the file
-  int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_TRUNC, 0666);
-  if (fd < 0)
-    return MakeErrMsg(ErrMsg, path + ": can't create temporary file");
-  ::close(fd);
-  return false;
+  return error_code::success();
 }
 
-bool
-Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const {
-  // Get the status so we can determine if it's a file or directory.
-  struct stat buf;
-  if (0 != stat(path.c_str(), &buf)) {
-    MakeErrMsg(ErrStr, path + ": can't get status of file");
-    return true;
-  }
+error_code get_magic(const Twine &path, uint32_t len,
+                     SmallVectorImpl<char> &result) {
+  SmallString<128> PathStorage;
+  StringRef Path = path.toNullTerminatedStringRef(PathStorage);
+  result.set_size(0);
 
-  // Note: this check catches strange situations. In all cases, LLVM should
-  // only be involved in the creation and deletion of regular files.  This
-  // check ensures that what we're trying to erase is a regular file. It
-  // effectively prevents LLVM from erasing things like /dev/null, any block
-  // special file, or other things that aren't "regular" files.
-  if (S_ISREG(buf.st_mode)) {
-    if (unlink(path.c_str()) != 0)
-      return MakeErrMsg(ErrStr, path + ": can't destroy file");
-    return false;
-  }
+  // Open path.
+  std::FILE *file = std::fopen(Path.data(), "rb");
+  if (file == 0)
+    return error_code(errno, system_category());
 
-  if (!S_ISDIR(buf.st_mode)) {
-    if (ErrStr) *ErrStr = "not a file or directory";
-    return true;
-  }
+  // Reserve storage.
+  result.reserve(len);
 
-  if (remove_contents) {
-    // Recursively descend the directory to remove its contents.
-    std::string cmd = "/bin/rm -rf " + path;
-    if (system(cmd.c_str()) != 0) {
-      MakeErrMsg(ErrStr, path + ": failed to recursively remove directory.");
-      return true;
+  // Read magic!
+  size_t size = std::fread(result.data(), 1, len, file);
+  if (std::ferror(file) != 0) {
+    std::fclose(file);
+    return error_code(errno, system_category());
+  } else if (size != len) {
+    if (std::feof(file) != 0) {
+      std::fclose(file);
+      result.set_size(size);
+      return make_error_code(errc::value_too_large);
     }
-    return false;
-  }
-
-  // Otherwise, try to just remove the one directory.
-  std::string pathname(path);
-  size_t lastchar = path.length() - 1;
-  if (pathname[lastchar] == '/')
-    pathname[lastchar] = '\0';
-  else
-    pathname[lastchar+1] = '\0';
-
-  if (rmdir(pathname.c_str()) != 0)
-    return MakeErrMsg(ErrStr, pathname + ": can't erase directory");
-  return false;
-}
-
-bool
-Path::renamePathOnDisk(const Path& newName, std::string* ErrMsg) {
-  if (0 != ::rename(path.c_str(), newName.c_str()))
-    return MakeErrMsg(ErrMsg, std::string("can't rename '") + path + "' as '" +
-               newName.str() + "'");
-  return false;
-}
-
-bool
-Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrStr) const {
-  struct utimbuf utb;
-  utb.actime = si.modTime.toPosixTime();
-  utb.modtime = utb.actime;
-  if (0 != ::utime(path.c_str(),&utb))
-    return MakeErrMsg(ErrStr, path + ": can't set file modification time");
-  if (0 != ::chmod(path.c_str(),si.mode))
-    return MakeErrMsg(ErrStr, path + ": can't set mode");
-  return false;
-}
-
-bool
-sys::CopyFile(const sys::Path &Dest, const sys::Path &Src, std::string* ErrMsg){
-  int inFile = -1;
-  int outFile = -1;
-  inFile = ::open(Src.c_str(), O_RDONLY);
-  if (inFile == -1)
-    return MakeErrMsg(ErrMsg, Src.str() +
-      ": can't open source file to copy");
-
-  outFile = ::open(Dest.c_str(), O_WRONLY|O_CREAT, 0666);
-  if (outFile == -1) {
-    ::close(inFile);
-    return MakeErrMsg(ErrMsg, Dest.str() +
-      ": can't create destination file for copy");
   }
-
-  char Buffer[16*1024];
-  while (ssize_t Amt = ::read(inFile, Buffer, 16*1024)) {
-    if (Amt == -1) {
-      if (errno != EINTR && errno != EAGAIN) {
-        ::close(inFile);
-        ::close(outFile);
-        return MakeErrMsg(ErrMsg, Src.str()+": can't read source file");
-      }
-    } else {
-      char *BufPtr = Buffer;
-      while (Amt) {
-        ssize_t AmtWritten = ::write(outFile, BufPtr, Amt);
-        if (AmtWritten == -1) {
-          if (errno != EINTR && errno != EAGAIN) {
-            ::close(inFile);
-            ::close(outFile);
-            return MakeErrMsg(ErrMsg, Dest.str() +
-              ": can't write destination file");
-          }
-        } else {
-          Amt -= AmtWritten;
-          BufPtr += AmtWritten;
-        }
-      }
-    }
+  std::fclose(file);
+  result.set_size(size);
+  return error_code::success();
+}
+
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,  
+                                            bool map_writable, void *&result) {
+  SmallString<128> path_storage;
+  StringRef name = path.toNullTerminatedStringRef(path_storage);
+  int oflags = map_writable ? O_RDWR : O_RDONLY;
+  int ofd = ::open(name.begin(), oflags);
+  if ( ofd == -1 )
+    return error_code(errno, system_category());
+  AutoFD fd(ofd);
+  int flags = map_writable ? MAP_SHARED : MAP_PRIVATE;
+  int prot = map_writable ? (PROT_READ|PROT_WRITE) : PROT_READ;
+#ifdef MAP_FILE
+  flags |= MAP_FILE;
+#endif
+  result = ::mmap(0, size, prot, flags, fd, file_offset);
+  if (result == MAP_FAILED) {
+    return error_code(errno, system_category());
   }
-  ::close(inFile);
-  ::close(outFile);
-  return false;
-}
-
-bool
-Path::makeUnique(bool reuse_current, std::string* ErrMsg) {
-  bool Exists;
-  if (reuse_current && (fs::exists(path, Exists) || !Exists))
-    return false; // File doesn't exist already, just use it!
-
-  // Append an XXXXXX pattern to the end of the file for use with mkstemp,
-  // mktemp or our own implementation.
-  // This uses std::vector instead of SmallVector to avoid a dependence on
-  // libSupport. And performance isn't critical here.
-  std::vector<char> Buf;
-  Buf.resize(path.size()+8);
-  char *FNBuffer = &Buf[0];
-    path.copy(FNBuffer,path.size());
-  bool isdir;
-  if (!fs::is_directory(path, isdir) && isdir)
-    strcpy(FNBuffer+path.size(), "/XXXXXX");
-  else
-    strcpy(FNBuffer+path.size(), "-XXXXXX");
+  
+  return error_code::success();
+}
 
-#if defined(HAVE_MKSTEMP)
-  int TempFD;
-  if ((TempFD = mkstemp(FNBuffer)) == -1)
-    return MakeErrMsg(ErrMsg, path + ": can't make unique filename");
+error_code unmap_file_pages(void *base, size_t size) {
+  if ( ::munmap(base, size) == -1 )
+    return error_code(errno, system_category());
+   
+  return error_code::success();
+}
 
-  // We don't need to hold the temp file descriptor... we will trust that no one
-  // will overwrite/delete the file before we can open it again.
-  close(TempFD);
+error_code openFileForRead(const Twine &Name, int &ResultFD) {
+  SmallString<128> Storage;
+  StringRef P = Name.toNullTerminatedStringRef(Storage);
+  while ((ResultFD = open(P.begin(), O_RDONLY)) < 0) {
+    if (errno != EINTR)
+      return error_code(errno, system_category());
+  }
+  return error_code::success();
+}
 
-  // Save the name
-  path = FNBuffer;
+error_code openFileForWrite(const Twine &Name, int &ResultFD,
+                            sys::fs::OpenFlags Flags, unsigned Mode) {
+  // Verify that we don't have both "append" and "excl".
+  assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
+         "Cannot specify both 'excl' and 'append' file creation flags!");
 
-  // By default mkstemp sets the mode to 0600, so update mode bits now.
-  AddPermissionBits (*this, 0666);
-#elif defined(HAVE_MKTEMP)
-  // If we don't have mkstemp, use the old and obsolete mktemp function.
-  if (mktemp(FNBuffer) == 0)
-    return MakeErrMsg(ErrMsg, path + ": can't make unique filename");
+  int OpenFlags = O_WRONLY | O_CREAT;
 
-  // Save the name
-  path = FNBuffer;
-#else
-  // Okay, looks like we have to do it all by our lonesome.
-  static unsigned FCounter = 0;
-  // Try to initialize with unique value.
-  if (FCounter == 0) FCounter = ((unsigned)getpid() & 0xFFFF) << 8;
-  char* pos = strstr(FNBuffer, "XXXXXX");
-  do {
-    if (++FCounter > 0xFFFFFF) {
-      return MakeErrMsg(ErrMsg,
-        path + ": can't make unique filename: too many files");
-    }
-    sprintf(pos, "%06X", FCounter);
-    path = FNBuffer;
-  } while (exists());
-  // POSSIBLE SECURITY BUG: An attacker can easily guess the name and exploit
-  // LLVM.
-#endif
-  return false;
-}
+  if (Flags & F_Append)
+    OpenFlags |= O_APPEND;
+  else
+    OpenFlags |= O_TRUNC;
 
-const char *Path::MapInFilePages(int FD, size_t FileSize, off_t Offset) {
-  int Flags = MAP_PRIVATE;
-#ifdef MAP_FILE
-  Flags |= MAP_FILE;
-#endif
-  void *BasePtr = ::mmap(0, FileSize, PROT_READ, Flags, FD, Offset);
-  if (BasePtr == MAP_FAILED)
-    return 0;
-  return (const char*)BasePtr;
-}
+  if (Flags & F_Excl)
+    OpenFlags |= O_EXCL;
 
-void Path::UnMapFilePages(const char *BasePtr, size_t FileSize) {
-  const void *Addr = static_cast<const void *>(BasePtr);
-  ::munmap(const_cast<void *>(Addr), FileSize);
+  SmallString<128> Storage;
+  StringRef P = Name.toNullTerminatedStringRef(Storage);
+  while ((ResultFD = open(P.begin(), OpenFlags, Mode)) < 0) {
+    if (errno != EINTR)
+      return error_code(errno, system_category());
+  }
+  return error_code::success();
 }
 
-} // end llvm namespace
+} // end namespace fs
+} // end namespace sys
+} // end namespace llvm
diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc
deleted file mode 100644
index 7e0aead..0000000
--- a/lib/Support/Unix/PathV2.inc
+++ /dev/null
@@ -1,693 +0,0 @@
-//===- llvm/Support/Unix/PathV2.cpp - Unix Path Implementation --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Unix specific implementation of the PathV2 API.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only generic UNIX code that
-//===          is guaranteed to work on *all* UNIX variants.
-//===----------------------------------------------------------------------===//
-
-#include "Unix.h"
-#include "llvm/Support/Process.h"
-#if HAVE_SYS_STAT_H
-#include <sys/stat.h>
-#endif
-#if HAVE_FCNTL_H
-#include <fcntl.h>
-#endif
-#ifdef HAVE_SYS_MMAN_H
-#include <sys/mman.h>
-#endif
-#if HAVE_DIRENT_H
-# include <dirent.h>
-# define NAMLEN(dirent) strlen((dirent)->d_name)
-#else
-# define dirent direct
-# define NAMLEN(dirent) (dirent)->d_namlen
-# if HAVE_SYS_NDIR_H
-#  include <sys/ndir.h>
-# endif
-# if HAVE_SYS_DIR_H
-#  include <sys/dir.h>
-# endif
-# if HAVE_NDIR_H
-#  include <ndir.h>
-# endif
-#endif
-#if HAVE_STDIO_H
-#include <stdio.h>
-#endif
-#if HAVE_LIMITS_H
-#include <limits.h>
-#endif
-
-// Both stdio.h and cstdio are included via different pathes and
-// stdcxx's cstdio doesn't include stdio.h, so it doesn't #undef the macros
-// either.
-#undef ferror
-#undef feof
-
-// For GNU Hurd
-#if defined(__GNU__) && !defined(PATH_MAX)
-# define PATH_MAX 4096
-#endif
-
-using namespace llvm;
-
-namespace {
-  /// This class automatically closes the given file descriptor when it goes out
-  /// of scope. You can take back explicit ownership of the file descriptor by
-  /// calling take(). The destructor does not verify that close was successful.
-  /// Therefore, never allow this class to call close on a file descriptor that
-  /// has been read from or written to.
-  struct AutoFD {
-    int FileDescriptor;
-
-    AutoFD(int fd) : FileDescriptor(fd) {}
-    ~AutoFD() {
-      if (FileDescriptor >= 0)
-        ::close(FileDescriptor);
-    }
-
-    int take() {
-      int ret = FileDescriptor;
-      FileDescriptor = -1;
-      return ret;
-    }
-
-    operator int() const {return FileDescriptor;}
-  };
-
-  error_code TempDir(SmallVectorImpl<char> &result) {
-    // FIXME: Don't use TMPDIR if program is SUID or SGID enabled.
-    const char *dir = 0;
-    (dir = std::getenv("TMPDIR" )) ||
-    (dir = std::getenv("TMP"    )) ||
-    (dir = std::getenv("TEMP"   )) ||
-    (dir = std::getenv("TEMPDIR")) ||
-#ifdef P_tmpdir
-    (dir = P_tmpdir) ||
-#endif
-    (dir = "/tmp");
-
-    result.clear();
-    StringRef d(dir);
-    result.append(d.begin(), d.end());
-    return error_code::success();
-  }
-}
-
-namespace llvm {
-namespace sys  {
-namespace fs {
-
-error_code current_path(SmallVectorImpl<char> &result) {
-#ifdef MAXPATHLEN
-  result.reserve(MAXPATHLEN);
-#else
-// For GNU Hurd
-  result.reserve(1024);
-#endif
-
-  while (true) {
-    if (::getcwd(result.data(), result.capacity()) == 0) {
-      // See if there was a real error.
-      if (errno != errc::not_enough_memory)
-        return error_code(errno, system_category());
-      // Otherwise there just wasn't enough space.
-      result.reserve(result.capacity() * 2);
-    } else
-      break;
-  }
-
-  result.set_size(strlen(result.data()));
-  return error_code::success();
-}
-
-error_code copy_file(const Twine &from, const Twine &to, copy_option copt) {
- // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toNullTerminatedStringRef(from_storage);
-  StringRef t = to.toNullTerminatedStringRef(to_storage);
-
-  const size_t buf_sz = 32768;
-  char buffer[buf_sz];
-  int from_file = -1, to_file = -1;
-
-  // Open from.
-  if ((from_file = ::open(f.begin(), O_RDONLY)) < 0)
-    return error_code(errno, system_category());
-  AutoFD from_fd(from_file);
-
-  // Stat from.
-  struct stat from_stat;
-  if (::stat(f.begin(), &from_stat) != 0)
-    return error_code(errno, system_category());
-
-  // Setup to flags.
-  int to_flags = O_CREAT | O_WRONLY;
-  if (copt == copy_option::fail_if_exists)
-    to_flags |= O_EXCL;
-
-  // Open to.
-  if ((to_file = ::open(t.begin(), to_flags, from_stat.st_mode)) < 0)
-    return error_code(errno, system_category());
-  AutoFD to_fd(to_file);
-
-  // Copy!
-  ssize_t sz, sz_read = 1, sz_write;
-  while (sz_read > 0 &&
-         (sz_read = ::read(from_fd, buffer, buf_sz)) > 0) {
-    // Allow for partial writes - see Advanced Unix Programming (2nd Ed.),
-    // Marc Rochkind, Addison-Wesley, 2004, page 94
-    sz_write = 0;
-    do {
-      if ((sz = ::write(to_fd, buffer + sz_write, sz_read - sz_write)) < 0) {
-        sz_read = sz;  // cause read loop termination.
-        break;         // error.
-      }
-      sz_write += sz;
-    } while (sz_write < sz_read);
-  }
-
-  // After all the file operations above the return value of close actually
-  // matters.
-  if (::close(from_fd.take()) < 0) sz_read = -1;
-  if (::close(to_fd.take()) < 0) sz_read = -1;
-
-  // Check for errors.
-  if (sz_read < 0)
-    return error_code(errno, system_category());
-
-  return error_code::success();
-}
-
-error_code create_directory(const Twine &path, bool &existed) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-
-  if (::mkdir(p.begin(), S_IRWXU | S_IRWXG) == -1) {
-    if (errno != errc::file_exists)
-      return error_code(errno, system_category());
-    existed = true;
-  } else
-    existed = false;
-
-  return error_code::success();
-}
-
-error_code create_hard_link(const Twine &to, const Twine &from) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toNullTerminatedStringRef(from_storage);
-  StringRef t = to.toNullTerminatedStringRef(to_storage);
-
-  if (::link(t.begin(), f.begin()) == -1)
-    return error_code(errno, system_category());
-
-  return error_code::success();
-}
-
-error_code create_symlink(const Twine &to, const Twine &from) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toNullTerminatedStringRef(from_storage);
-  StringRef t = to.toNullTerminatedStringRef(to_storage);
-
-  if (::symlink(t.begin(), f.begin()) == -1)
-    return error_code(errno, system_category());
-
-  return error_code::success();
-}
-
-error_code remove(const Twine &path, bool &existed) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-
-  if (::remove(p.begin()) == -1) {
-    if (errno != errc::no_such_file_or_directory)
-      return error_code(errno, system_category());
-    existed = false;
-  } else
-    existed = true;
-
-  return error_code::success();
-}
-
-error_code rename(const Twine &from, const Twine &to) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toNullTerminatedStringRef(from_storage);
-  StringRef t = to.toNullTerminatedStringRef(to_storage);
-
-  if (::rename(f.begin(), t.begin()) == -1) {
-    // If it's a cross device link, copy then delete, otherwise return the error
-    if (errno == EXDEV) {
-      if (error_code ec = copy_file(from, to, copy_option::overwrite_if_exists))
-        return ec;
-      bool Existed;
-      if (error_code ec = remove(from, Existed))
-        return ec;
-    } else
-      return error_code(errno, system_category());
-  }
-
-  return error_code::success();
-}
-
-error_code resize_file(const Twine &path, uint64_t size) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-
-  if (::truncate(p.begin(), size) == -1)
-    return error_code(errno, system_category());
-
-  return error_code::success();
-}
-
-error_code exists(const Twine &path, bool &result) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-
-  if (::access(p.begin(), F_OK) == -1) {
-    if (errno != errc::no_such_file_or_directory)
-      return error_code(errno, system_category());
-    result = false;
-  } else
-    result = true;
-
-  return error_code::success();
-}
-
-bool equivalent(file_status A, file_status B) {
-  assert(status_known(A) && status_known(B));
-  return A.fs_st_dev == B.fs_st_dev &&
-         A.fs_st_ino == B.fs_st_ino;
-}
-
-error_code equivalent(const Twine &A, const Twine &B, bool &result) {
-  file_status fsA, fsB;
-  if (error_code ec = status(A, fsA)) return ec;
-  if (error_code ec = status(B, fsB)) return ec;
-  result = equivalent(fsA, fsB);
-  return error_code::success();
-}
-
-error_code file_size(const Twine &path, uint64_t &result) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-
-  struct stat status;
-  if (::stat(p.begin(), &status) == -1)
-    return error_code(errno, system_category());
-  if (!S_ISREG(status.st_mode))
-    return make_error_code(errc::operation_not_permitted);
-
-  result = status.st_size;
-  return error_code::success();
-}
-
-error_code status(const Twine &path, file_status &result) {
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-
-  struct stat status;
-  if (::stat(p.begin(), &status) != 0) {
-    error_code ec(errno, system_category());
-    if (ec == errc::no_such_file_or_directory)
-      result = file_status(file_type::file_not_found);
-    else
-      result = file_status(file_type::status_error);
-    return ec;
-  }
-
-  perms prms = static_cast<perms>(status.st_mode & perms_mask);
-  
-  if (S_ISDIR(status.st_mode))
-    result = file_status(file_type::directory_file, prms);
-  else if (S_ISREG(status.st_mode))
-    result = file_status(file_type::regular_file, prms);
-  else if (S_ISBLK(status.st_mode))
-    result = file_status(file_type::block_file, prms);
-  else if (S_ISCHR(status.st_mode))
-    result = file_status(file_type::character_file, prms);
-  else if (S_ISFIFO(status.st_mode))
-    result = file_status(file_type::fifo_file, prms);
-  else if (S_ISSOCK(status.st_mode))
-    result = file_status(file_type::socket_file, prms);
-  else
-    result = file_status(file_type::type_unknown, prms);
-
-  result.fs_st_dev = status.st_dev;
-  result.fs_st_ino = status.st_ino;
-
-  return error_code::success();
-}
-
-// Modifies permissions on a file.
-error_code permissions(const Twine &path, perms prms) {
-  if ((prms & add_perms) && (prms & remove_perms))
-    llvm_unreachable("add_perms and remove_perms are mutually exclusive");
-
-  // Get current permissions
-  file_status info;
-  if (error_code ec = status(path, info)) {
-    return ec;
-  }
-  
-  // Set updated permissions.
-  SmallString<128> path_storage;
-  StringRef p = path.toNullTerminatedStringRef(path_storage);
-  perms permsToSet;
-  if (prms & add_perms) {
-    permsToSet = (info.permissions() | prms) & perms_mask;
-  } else if (prms & remove_perms) {
-    permsToSet = (info.permissions() & ~prms) & perms_mask;
-  } else {
-    permsToSet = prms & perms_mask;
-  }
-  if (::chmod(p.begin(), static_cast<mode_t>(permsToSet))) {
-    return error_code(errno, system_category()); 
-  }
-
-  return error_code::success();
-}
-
-// Since this is most often used for temporary files, mode defaults to 0600.
-error_code unique_file(const Twine &model, int &result_fd,
-                       SmallVectorImpl<char> &result_path,
-                       bool makeAbsolute, unsigned mode) {
-  SmallString<128> Model;
-  model.toVector(Model);
-  // Null terminate.
-  Model.c_str();
-
-  if (makeAbsolute) {
-    // Make model absolute by prepending a temp directory if it's not already.
-    bool absolute = path::is_absolute(Twine(Model));
-    if (!absolute) {
-      SmallString<128> TDir;
-      if (error_code ec = TempDir(TDir)) return ec;
-      path::append(TDir, Twine(Model));
-      Model.swap(TDir);
-    }
-  }
-
-  // From here on, DO NOT modify model. It may be needed if the randomly chosen
-  // path already exists.
-  SmallString<128> RandomPath = Model;
-
-retry_random_path:
-  // Replace '%' with random chars.
-  for (unsigned i = 0, e = Model.size(); i != e; ++i) {
-    if (Model[i] == '%')
-      RandomPath[i] = "0123456789abcdef"[sys::Process::GetRandomNumber() & 15];
-  }
-
-  // Make sure we don't fall into an infinite loop by constantly trying
-  // to create the parent path.
-  bool TriedToCreateParent = false;
-
-  // Try to open + create the file.
-rety_open_create:
-  int RandomFD = ::open(RandomPath.c_str(), O_RDWR | O_CREAT | O_EXCL, mode);
-  if (RandomFD == -1) {
-    int SavedErrno = errno;
-    // If the file existed, try again, otherwise, error.
-    if (SavedErrno == errc::file_exists)
-      goto retry_random_path;
-    // If path prefix doesn't exist, try to create it.
-    if (SavedErrno == errc::no_such_file_or_directory && !TriedToCreateParent) {
-      TriedToCreateParent = true;
-      StringRef p(RandomPath);
-      SmallString<64> dir_to_create;
-      for (path::const_iterator i = path::begin(p),
-                                e = --path::end(p); i != e; ++i) {
-        path::append(dir_to_create, *i);
-        bool Exists;
-        if (error_code ec = exists(Twine(dir_to_create), Exists)) return ec;
-        if (!Exists) {
-          // Don't try to create network paths.
-          if (i->size() > 2 && (*i)[0] == '/' &&
-                               (*i)[1] == '/' &&
-                               (*i)[2] != '/')
-            return make_error_code(errc::no_such_file_or_directory);
-          if (::mkdir(dir_to_create.c_str(), 0700) == -1 &&
-              errno != errc::file_exists)
-            return error_code(errno, system_category());
-        }
-      }
-      goto rety_open_create;
-    }
-
-    return error_code(SavedErrno, system_category());
-  }
-
-   // Make the path absolute.
-  char real_path_buff[PATH_MAX + 1];
-  if (realpath(RandomPath.c_str(), real_path_buff) == NULL) {
-    int error = errno;
-    ::close(RandomFD);
-    ::unlink(RandomPath.c_str());
-    return error_code(error, system_category());
-  }
-
-  result_path.clear();
-  StringRef d(real_path_buff);
-  result_path.append(d.begin(), d.end());
-
-  result_fd = RandomFD;
-  return error_code::success();
-}
-
-error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
-  AutoFD ScopedFD(FD);
-  if (!CloseFD)
-    ScopedFD.take();
-
-  // Figure out how large the file is.
-  struct stat FileInfo;
-  if (fstat(FD, &FileInfo) == -1)
-    return error_code(errno, system_category());
-  uint64_t FileSize = FileInfo.st_size;
-
-  if (Size == 0)
-    Size = FileSize;
-  else if (FileSize < Size) {
-    // We need to grow the file.
-    if (ftruncate(FD, Size) == -1)
-      return error_code(errno, system_category());
-  }
-
-  int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
-  int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
-#ifdef MAP_FILE
-  flags |= MAP_FILE;
-#endif
-  Mapping = ::mmap(0, Size, prot, flags, FD, Offset);
-  if (Mapping == MAP_FAILED)
-    return error_code(errno, system_category());
-  return error_code::success();
-}
-
-mapped_file_region::mapped_file_region(const Twine &path,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       error_code &ec)
-  : Mode(mode)
-  , Size(length)
-  , Mapping() {
-  // Make sure that the requested size fits within SIZE_T.
-  if (length > std::numeric_limits<size_t>::max()) {
-    ec = make_error_code(errc::invalid_argument);
-    return;
-  }
-
-  SmallString<128> path_storage;
-  StringRef name = path.toNullTerminatedStringRef(path_storage);
-  int oflags = (mode == readonly) ? O_RDONLY : O_RDWR;
-  int ofd = ::open(name.begin(), oflags);
-  if (ofd == -1) {
-    ec = error_code(errno, system_category());
-    return;
-  }
-
-  ec = init(ofd, true, offset);
-  if (ec)
-    Mapping = 0;
-}
-
-mapped_file_region::mapped_file_region(int fd,
-                                       bool closefd,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       error_code &ec)
-  : Mode(mode)
-  , Size(length)
-  , Mapping() {
-  // Make sure that the requested size fits within SIZE_T.
-  if (length > std::numeric_limits<size_t>::max()) {
-    ec = make_error_code(errc::invalid_argument);
-    return;
-  }
-
-  ec = init(fd, closefd, offset);
-  if (ec)
-    Mapping = 0;
-}
-
-mapped_file_region::~mapped_file_region() {
-  if (Mapping)
-    ::munmap(Mapping, Size);
-}
-
-#if LLVM_HAS_RVALUE_REFERENCES
-mapped_file_region::mapped_file_region(mapped_file_region &&other)
-  : Mode(other.Mode), Size(other.Size), Mapping(other.Mapping) {
-  other.Mapping = 0;
-}
-#endif
-
-mapped_file_region::mapmode mapped_file_region::flags() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return Mode;
-}
-
-uint64_t mapped_file_region::size() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return Size;
-}
-
-char *mapped_file_region::data() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  assert(Mode != readonly && "Cannot get non const data for readonly mapping!");
-  return reinterpret_cast<char*>(Mapping);
-}
-
-const char *mapped_file_region::const_data() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return reinterpret_cast<const char*>(Mapping);
-}
-
-int mapped_file_region::alignment() {
-  return process::get_self()->page_size();
-}
-
-error_code detail::directory_iterator_construct(detail::DirIterState &it,
-                                                StringRef path){
-  SmallString<128> path_null(path);
-  DIR *directory = ::opendir(path_null.c_str());
-  if (directory == 0)
-    return error_code(errno, system_category());
-
-  it.IterationHandle = reinterpret_cast<intptr_t>(directory);
-  // Add something for replace_filename to replace.
-  path::append(path_null, ".");
-  it.CurrentEntry = directory_entry(path_null.str());
-  return directory_iterator_increment(it);
-}
-
-error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
-  if (it.IterationHandle)
-    ::closedir(reinterpret_cast<DIR *>(it.IterationHandle));
-  it.IterationHandle = 0;
-  it.CurrentEntry = directory_entry();
-  return error_code::success();
-}
-
-error_code detail::directory_iterator_increment(detail::DirIterState &it) {
-  errno = 0;
-  dirent *cur_dir = ::readdir(reinterpret_cast<DIR *>(it.IterationHandle));
-  if (cur_dir == 0 && errno != 0) {
-    return error_code(errno, system_category());
-  } else if (cur_dir != 0) {
-    StringRef name(cur_dir->d_name, NAMLEN(cur_dir));
-    if ((name.size() == 1 && name[0] == '.') ||
-        (name.size() == 2 && name[0] == '.' && name[1] == '.'))
-      return directory_iterator_increment(it);
-    it.CurrentEntry.replace_filename(name);
-  } else
-    return directory_iterator_destruct(it);
-
-  return error_code::success();
-}
-
-error_code get_magic(const Twine &path, uint32_t len,
-                     SmallVectorImpl<char> &result) {
-  SmallString<128> PathStorage;
-  StringRef Path = path.toNullTerminatedStringRef(PathStorage);
-  result.set_size(0);
-
-  // Open path.
-  std::FILE *file = std::fopen(Path.data(), "rb");
-  if (file == 0)
-    return error_code(errno, system_category());
-
-  // Reserve storage.
-  result.reserve(len);
-
-  // Read magic!
-  size_t size = std::fread(result.data(), 1, len, file);
-  if (std::ferror(file) != 0) {
-    std::fclose(file);
-    return error_code(errno, system_category());
-  } else if (size != result.size()) {
-    if (std::feof(file) != 0) {
-      std::fclose(file);
-      result.set_size(size);
-      return make_error_code(errc::value_too_large);
-    }
-  }
-  std::fclose(file);
-  result.set_size(len);
-  return error_code::success();
-}
-
-error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,  
-                                            bool map_writable, void *&result) {
-  SmallString<128> path_storage;
-  StringRef name = path.toNullTerminatedStringRef(path_storage);
-  int oflags = map_writable ? O_RDWR : O_RDONLY;
-  int ofd = ::open(name.begin(), oflags);
-  if ( ofd == -1 )
-    return error_code(errno, system_category());
-  AutoFD fd(ofd);
-  int flags = map_writable ? MAP_SHARED : MAP_PRIVATE;
-  int prot = map_writable ? (PROT_READ|PROT_WRITE) : PROT_READ;
-#ifdef MAP_FILE
-  flags |= MAP_FILE;
-#endif
-  result = ::mmap(0, size, prot, flags, fd, file_offset);
-  if (result == MAP_FAILED) {
-    return error_code(errno, system_category());
-  }
-  
-  return error_code::success();
-}
-
-error_code unmap_file_pages(void *base, size_t size) {
-  if ( ::munmap(base, size) == -1 )
-    return error_code(errno, system_category());
-   
-  return error_code::success();
-}
-
-
-} // end namespace fs
-} // end namespace sys
-} // end namespace llvm
diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc
index 9a4454f..c5778e7 100644
--- a/lib/Support/Unix/Process.inc
+++ b/lib/Support/Unix/Process.inc
@@ -13,6 +13,9 @@
 
 #include "Unix.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/MutexGuard.h"
 #include "llvm/Support/TimeValue.h"
 #ifdef HAVE_SYS_TIME_H
 #include <sys/time.h>
@@ -86,13 +89,10 @@ TimeValue self_process::get_system_time() const {
   return getRUsageTimes().second;
 }
 
+// On Cygwin, getpagesize() returns 64k(AllocationGranularity) and
+// offset in mmap(3) should be aligned to the AllocationGranularity.
 static unsigned getPageSize() {
-#if defined(__CYGWIN__)
-  // On Cygwin, getpagesize() returns 64k but the page size for the purposes of
-  // memory protection and mmap() is 4k.
-  // See http://www.cygwin.com/ml/cygwin/2009-01/threads.html#00492
-  const int page_size = 0x1000;
-#elif defined(HAVE_GETPAGESIZE)
+#if defined(HAVE_GETPAGESIZE)
   const int page_size = ::getpagesize();
 #elif defined(HAVE_SYSCONF)
   long page_size = ::sysconf(_SC_PAGE_SIZE);
@@ -138,14 +138,6 @@ void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
   llvm::tie(user_time, sys_time) = getRUsageTimes();
 }
 
-int Process::GetCurrentUserId() {
-  return getuid();
-}
-
-int Process::GetCurrentGroupId() {
-  return getgid();
-}
-
 #if defined(HAVE_MACH_MACH_H) && !defined(__GNU__)
 #include <mach/mach.h>
 #endif
@@ -190,6 +182,22 @@ void Process::PreventCoreFiles() {
 #endif
 }
 
+Optional<std::string> Process::GetEnv(StringRef Name) {
+  std::string NameStr = Name.str();
+  const char *Val = ::getenv(NameStr.c_str());
+  if (!Val)
+    return None;
+  return std::string(Val);
+}
+
+error_code Process::GetArgumentVector(SmallVectorImpl<const char *> &ArgsOut,
+                                      ArrayRef<const char *> ArgsIn,
+                                      SpecificBumpPtrAllocator<char> &) {
+  ArgsOut.append(ArgsIn.begin(), ArgsIn.end());
+
+  return error_code::success();
+}
+
 bool Process::StandardInIsUserInput() {
   return FileDescriptorIsDisplayed(STDIN_FILENO);
 }
@@ -224,8 +232,6 @@ static unsigned getColumns(int FileID) {
 #if defined(HAVE_SYS_IOCTL_H) && defined(HAVE_TERMIOS_H)
   // Try to determine the width of the terminal.
   struct winsize ws;
-  // Zero-fill ws to avoid a false positive from MemorySanitizer.
-  memset(&ws, 0, sizeof(ws));
   if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
     Columns = ws.ws_col;
 #endif
@@ -247,22 +253,61 @@ unsigned Process::StandardErrColumns() {
   return getColumns(2);
 }
 
-static bool terminalHasColors() {
-  if (const char *term = std::getenv("TERM")) {
-    // Most modern terminals support ANSI escape sequences for colors.
-    // We could check terminfo, or have a list of known terms that support
-    // colors, but that would be overkill.
-    // The user can always ask for no colors by setting TERM to dumb, or
-    // using a commandline flag.
-    return strcmp(term, "dumb") != 0;
-  }
+#ifdef HAVE_TERMINFO
+// We manually declare these extern functions because finding the correct
+// headers from various terminfo, curses, or other sources is harder than
+// writing their specs down.
+extern "C" int setupterm(char *term, int filedes, int *errret);
+extern "C" struct term *set_curterm(struct term *termp);
+extern "C" int del_curterm(struct term *termp);
+extern "C" int tigetnum(char *capname);
+#endif
+
+static bool terminalHasColors(int fd) {
+#ifdef HAVE_TERMINFO
+  // First, acquire a global lock because these C routines are thread hostile.
+  static sys::Mutex M;
+  MutexGuard G(M);
+
+  int errret = 0;
+  if (setupterm((char *)0, fd, &errret) != 0)
+    // Regardless of why, if we can't get terminfo, we shouldn't try to print
+    // colors.
+    return false;
+
+  // Test whether the terminal as set up supports color output. How to do this
+  // isn't entirely obvious. We can use the curses routine 'has_colors' but it
+  // would be nice to avoid a dependency on curses proper when we can make do
+  // with a minimal terminfo parsing library. Also, we don't really care whether
+  // the terminal supports the curses-specific color changing routines, merely
+  // if it will interpret ANSI color escape codes in a reasonable way. Thus, the
+  // strategy here is just to query the baseline colors capability and if it
+  // supports colors at all to assume it will translate the escape codes into
+  // whatever range of colors it does support. We can add more detailed tests
+  // here if users report them as necessary.
+  //
+  // The 'tigetnum' routine returns -2 or -1 on errors, and might return 0 if
+  // the terminfo says that no colors are supported.
+  bool HasColors = tigetnum(const_cast<char *>("colors")) > 0;
+
+  // Now extract the structure allocated by setupterm and free its memory
+  // through a really silly dance.
+  struct term *termp = set_curterm((struct term *)0);
+  (void)del_curterm(termp); // Drop any errors here.
+
+  // Return true if we found a color capabilities for the current terminal.
+  if (HasColors)
+    return true;
+#endif
+
+  // Otherwise, be conservative.
   return false;
 }
 
 bool Process::FileDescriptorHasColors(int fd) {
   // A file descriptor has colors if it is displayed and the terminal has
   // colors.
-  return FileDescriptorIsDisplayed(fd) && terminalHasColors();
+  return FileDescriptorIsDisplayed(fd) && terminalHasColors(fd);
 }
 
 bool Process::StandardOutHasColors() {
@@ -273,29 +318,15 @@ bool Process::StandardErrHasColors() {
   return FileDescriptorHasColors(STDERR_FILENO);
 }
 
+void Process::UseANSIEscapeCodes(bool /*enable*/) {
+  // No effect.
+}
+
 bool Process::ColorNeedsFlush() {
   // No, we use ANSI escape sequences.
   return false;
 }
 
-#define COLOR(FGBG, CODE, BOLD) "\033[0;" BOLD FGBG CODE "m"
-
-#define ALLCOLORS(FGBG,BOLD) {\
-    COLOR(FGBG, "0", BOLD),\
-    COLOR(FGBG, "1", BOLD),\
-    COLOR(FGBG, "2", BOLD),\
-    COLOR(FGBG, "3", BOLD),\
-    COLOR(FGBG, "4", BOLD),\
-    COLOR(FGBG, "5", BOLD),\
-    COLOR(FGBG, "6", BOLD),\
-    COLOR(FGBG, "7", BOLD)\
-  }
-
-static const char colorcodes[2][2][8][10] = {
- { ALLCOLORS("3",""), ALLCOLORS("3","1;") },
- { ALLCOLORS("4",""), ALLCOLORS("4","1;") }
-};
-
 const char *Process::OutputColor(char code, bool bold, bool bg) {
   return colorcodes[bg?1:0][bold?1:0][code&7];
 }
diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc
index aa03d48..78b2971 100644
--- a/lib/Support/Unix/Program.inc
+++ b/lib/Support/Unix/Program.inc
@@ -36,6 +36,9 @@
 #include <unistd.h>
 #endif
 #ifdef HAVE_POSIX_SPAWN
+#ifdef __sun__
+#define  _RESTRICT_KYWD
+#endif
 #include <spawn.h>
 #if !defined(__APPLE__)
   extern char **environ;
@@ -47,20 +50,16 @@
 namespace llvm {
 using namespace sys;
 
-Program::Program() : Data_(0) {}
-
-Program::~Program() {}
+ProcessInfo::ProcessInfo() : Pid(0), ReturnCode(0) {}
 
 // This function just uses the PATH environment variable to find the program.
-Path
-Program::FindProgramByName(const std::string& progName) {
+std::string
+sys::FindProgramByName(const std::string& progName) {
 
   // Check some degenerate cases
   if (progName.length() == 0) // no program
-    return Path();
-  Path temp;
-  if (!temp.set(progName)) // invalid name
-    return Path();
+    return "";
+  std::string temp = progName;
   // Use the given path verbatim if it contains any slashes; this matches
   // the behavior of sh(1) and friends.
   if (progName.find('/') != std::string::npos)
@@ -72,7 +71,7 @@ Program::FindProgramByName(const std::string& progName) {
   // Get the path. If its empty, we can't do anything to find it.
   const char *PathStr = getenv("PATH");
   if (PathStr == 0)
-    return Path();
+    return "";
 
   // Now we have a colon separated list of directories to search; try them.
   size_t PathLen = strlen(PathStr);
@@ -81,12 +80,10 @@ Program::FindProgramByName(const std::string& progName) {
     const char *Colon = std::find(PathStr, PathStr+PathLen, ':');
 
     // Check to see if this first directory contains the executable...
-    Path FilePath;
-    if (FilePath.set(std::string(PathStr,Colon))) {
-      FilePath.appendComponent(progName);
-      if (FilePath.canExecute())
-        return FilePath;                    // Found the executable!
-    }
+    SmallString<128> FilePath(PathStr,Colon);
+    sys::path::append(FilePath, progName);
+    if (sys::fs::can_execute(Twine(FilePath)))
+      return FilePath.str();                    // Found the executable!
 
     // Nope it wasn't in this directory, check the next path in the list!
     PathLen -= Colon-PathStr;
@@ -98,23 +95,23 @@ Program::FindProgramByName(const std::string& progName) {
       PathLen--;
     }
   }
-  return Path();
+  return "";
 }
 
-static bool RedirectIO(const Path *Path, int FD, std::string* ErrMsg) {
+static bool RedirectIO(const StringRef *Path, int FD, std::string* ErrMsg) {
   if (Path == 0) // Noop
     return false;
-  const char *File;
-  if (Path->isEmpty())
+  std::string File;
+  if (Path->empty())
     // Redirect empty paths to /dev/null
     File = "/dev/null";
   else
-    File = Path->c_str();
+    File = *Path;
 
   // Open the file
-  int InFD = open(File, FD == 0 ? O_RDONLY : O_WRONLY|O_CREAT, 0666);
+  int InFD = open(File.c_str(), FD == 0 ? O_RDONLY : O_WRONLY|O_CREAT, 0666);
   if (InFD == -1) {
-    MakeErrMsg(ErrMsg, "Cannot open file '" + std::string(File) + "' for "
+    MakeErrMsg(ErrMsg, "Cannot open file '" + File + "' for "
               + (FD == 0 ? "input" : "output"));
     return true;
   }
@@ -130,19 +127,20 @@ static bool RedirectIO(const Path *Path, int FD, std::string* ErrMsg) {
 }
 
 #ifdef HAVE_POSIX_SPAWN
-static bool RedirectIO_PS(const Path *Path, int FD, std::string *ErrMsg,
+static bool RedirectIO_PS(const std::string *Path, int FD, std::string *ErrMsg,
                           posix_spawn_file_actions_t *FileActions) {
   if (Path == 0) // Noop
     return false;
   const char *File;
-  if (Path->isEmpty())
+  if (Path->empty())
     // Redirect empty paths to /dev/null
     File = "/dev/null";
   else
     File = Path->c_str();
 
-  if (int Err = posix_spawn_file_actions_addopen(FileActions, FD,
-                            File, FD == 0 ? O_RDONLY : O_WRONLY|O_CREAT, 0666))
+  if (int Err = posix_spawn_file_actions_addopen(
+          FileActions, FD, File,
+          FD == 0 ? O_RDONLY : O_WRONLY | O_CREAT, 0666))
     return MakeErrMsg(ErrMsg, "Cannot dup2", Err);
   return false;
 }
@@ -180,10 +178,18 @@ static void SetMemoryLimits (unsigned size)
 #endif
 }
 
-bool
-Program::Execute(const Path &path, const char **args, const char **envp,
-                 const Path **redirects, unsigned memoryLimit,
-                  std::string *ErrMsg) {
+}
+
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
+                    const char **envp, const StringRef **redirects,
+                    unsigned memoryLimit, std::string *ErrMsg) {
+  if (!llvm::sys::fs::exists(Program)) {
+    if (ErrMsg)
+      *ErrMsg = std::string("Executable \"") + Program.str() +
+                std::string("\" doesn't exist!");
+    return false;
+  }
+
   // If this OS has posix_spawn and there is no memory limit being implied, use
   // posix_spawn.  It is more efficient than fork/exec.
 #ifdef HAVE_POSIX_SPAWN
@@ -191,18 +197,32 @@ Program::Execute(const Path &path, const char **args, const char **envp,
     posix_spawn_file_actions_t FileActionsStore;
     posix_spawn_file_actions_t *FileActions = 0;
 
+    // If we call posix_spawn_file_actions_addopen we have to make sure the
+    // c strings we pass to it stay alive until the call to posix_spawn,
+    // so we copy any StringRefs into this variable.
+    std::string RedirectsStorage[3];
+
     if (redirects) {
+      std::string *RedirectsStr[3] = {0, 0, 0};
+      for (int I = 0; I < 3; ++I) {
+        if (redirects[I]) {
+          RedirectsStorage[I] = *redirects[I];
+          RedirectsStr[I] = &RedirectsStorage[I];
+        }
+      }
+
       FileActions = &FileActionsStore;
       posix_spawn_file_actions_init(FileActions);
 
       // Redirect stdin/stdout.
-      if (RedirectIO_PS(redirects[0], 0, ErrMsg, FileActions) ||
-          RedirectIO_PS(redirects[1], 1, ErrMsg, FileActions))
+      if (RedirectIO_PS(RedirectsStr[0], 0, ErrMsg, FileActions) ||
+          RedirectIO_PS(RedirectsStr[1], 1, ErrMsg, FileActions))
         return false;
       if (redirects[1] == 0 || redirects[2] == 0 ||
           *redirects[1] != *redirects[2]) {
         // Just redirect stderr
-        if (RedirectIO_PS(redirects[2], 2, ErrMsg, FileActions)) return false;
+        if (RedirectIO_PS(RedirectsStr[2], 2, ErrMsg, FileActions))
+          return false;
       } else {
         // If stdout and stderr should go to the same place, redirect stderr
         // to the FD already open for stdout.
@@ -222,7 +242,7 @@ Program::Execute(const Path &path, const char **args, const char **envp,
     // Explicitly initialized to prevent what appears to be a valgrind false
     // positive.
     pid_t PID = 0;
-    int Err = posix_spawn(&PID, path.c_str(), FileActions, /*attrp*/0,
+    int Err = posix_spawn(&PID, Program.str().c_str(), FileActions, /*attrp*/0,
                           const_cast<char **>(args), const_cast<char **>(envp));
 
     if (FileActions)
@@ -231,7 +251,8 @@ Program::Execute(const Path &path, const char **args, const char **envp,
     if (Err)
      return !MakeErrMsg(ErrMsg, "posix_spawn failed", Err);
 
-    Data_ = reinterpret_cast<void*>(PID);
+    PI.Pid = PID;
+
     return true;
   }
 #endif
@@ -272,12 +293,13 @@ Program::Execute(const Path &path, const char **args, const char **envp,
       }
 
       // Execute!
+      std::string PathStr = Program;
       if (envp != 0)
-        execve(path.c_str(),
+        execve(PathStr.c_str(),
                const_cast<char **>(args),
                const_cast<char **>(envp));
       else
-        execv(path.c_str(),
+        execv(PathStr.c_str(),
               const_cast<char **>(args));
       // If the execve() failed, we should exit. Follow Unix protocol and
       // return 127 if the executable was not found, and 126 otherwise.
@@ -293,62 +315,71 @@ Program::Execute(const Path &path, const char **args, const char **envp,
       break;
   }
 
-  Data_ = reinterpret_cast<void*>(child);
+  PI.Pid = child;
 
   return true;
 }
 
-int
-Program::Wait(const sys::Path &path,
-              unsigned secondsToWait,
-              std::string* ErrMsg)
-{
+namespace llvm {
+
+ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
+                      bool WaitUntilTerminates, std::string *ErrMsg) {
 #ifdef HAVE_SYS_WAIT_H
   struct sigaction Act, Old;
-
-  if (Data_ == 0) {
-    MakeErrMsg(ErrMsg, "Process not started!");
-    return -1;
-  }
-
-  // Install a timeout handler.  The handler itself does nothing, but the simple
-  // fact of having a handler at all causes the wait below to return with EINTR,
-  // unlike if we used SIG_IGN.
-  if (secondsToWait) {
+  assert(PI.Pid && "invalid pid to wait on, process not started?");
+
+  int WaitPidOptions = 0;
+  pid_t ChildPid = PI.Pid;
+  if (WaitUntilTerminates) {
+    SecondsToWait = 0;
+    ChildPid = -1; // mimic a wait() using waitpid()
+  } else if (SecondsToWait) {
+    // Install a timeout handler.  The handler itself does nothing, but the
+    // simple fact of having a handler at all causes the wait below to return
+    // with EINTR, unlike if we used SIG_IGN.
     memset(&Act, 0, sizeof(Act));
     Act.sa_handler = TimeOutHandler;
     sigemptyset(&Act.sa_mask);
     sigaction(SIGALRM, &Act, &Old);
-    alarm(secondsToWait);
-  }
+    alarm(SecondsToWait);
+  } else if (SecondsToWait == 0)
+    WaitPidOptions = WNOHANG;
 
   // Parent process: Wait for the child process to terminate.
   int status;
-  uint64_t pid = reinterpret_cast<uint64_t>(Data_);
-  pid_t child = static_cast<pid_t>(pid);
-  while (waitpid(pid, &status, 0) != child)
-    if (secondsToWait && errno == EINTR) {
-      // Kill the child.
-      kill(child, SIGKILL);
-
-      // Turn off the alarm and restore the signal handler
-      alarm(0);
-      sigaction(SIGALRM, &Old, 0);
-
-      // Wait for child to die
-      if (wait(&status) != child)
-        MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
-      else
-        MakeErrMsg(ErrMsg, "Child timed out", 0);
-
-      return -2;   // Timeout detected
-    } else if (errno != EINTR) {
-      MakeErrMsg(ErrMsg, "Error waiting for child process");
-      return -1;
+  ProcessInfo WaitResult;
+  WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions);
+  if (WaitResult.Pid != PI.Pid) {
+    if (WaitResult.Pid == 0) {
+      // Non-blocking wait.
+      return WaitResult;
+    } else {
+      if (SecondsToWait && errno == EINTR) {
+        // Kill the child.
+        kill(PI.Pid, SIGKILL);
+
+        // Turn off the alarm and restore the signal handler
+        alarm(0);
+        sigaction(SIGALRM, &Old, 0);
+
+        // Wait for child to die
+        if (wait(&status) != ChildPid)
+          MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
+        else
+          MakeErrMsg(ErrMsg, "Child timed out", 0);
+
+        WaitResult.ReturnCode = -2; // Timeout detected
+        return WaitResult;
+      } else if (errno != EINTR) {
+        MakeErrMsg(ErrMsg, "Error waiting for child process");
+        WaitResult.ReturnCode = -1;
+        return WaitResult;
+      }
     }
+  }
 
   // We exited normally without timeout, so turn off the timer.
-  if (secondsToWait) {
+  if (SecondsToWait && !WaitUntilTerminates) {
     alarm(0);
     sigaction(SIGALRM, &Old, 0);
   }
@@ -358,24 +389,19 @@ Program::Wait(const sys::Path &path,
   int result = 0;
   if (WIFEXITED(status)) {
     result = WEXITSTATUS(status);
-#ifdef HAVE_POSIX_SPAWN
-    // The posix_spawn child process returns 127 on any kind of error.
-    // Following the POSIX convention for command-line tools (which posix_spawn
-    // itself apparently does not), check to see if the failure was due to some
-    // reason other than the file not existing, and return 126 in this case.
-    bool Exists;
-    if (result == 127 && !llvm::sys::fs::exists(path.str(), Exists) && Exists)
-      result = 126;
-#endif
+    WaitResult.ReturnCode = result;
+
     if (result == 127) {
       if (ErrMsg)
         *ErrMsg = llvm::sys::StrError(ENOENT);
-      return -1;
+      WaitResult.ReturnCode = -1;
+      return WaitResult;
     }
     if (result == 126) {
       if (ErrMsg)
         *ErrMsg = "Program could not be executed";
-      return -1;
+      WaitResult.ReturnCode = -1;
+      return WaitResult;
     }
   } else if (WIFSIGNALED(status)) {
     if (ErrMsg) {
@@ -387,27 +413,27 @@ Program::Wait(const sys::Path &path,
     }
     // Return a special value to indicate that the process received an unhandled
     // signal during execution as opposed to failing to execute.
-    return -2;
+    WaitResult.ReturnCode = -2;
   }
-  return result;
 #else
   if (ErrMsg)
     *ErrMsg = "Program::Wait is not implemented on this platform yet!";
-  return -1;
+  WaitResult.ReturnCode = -2;
 #endif
+  return WaitResult;
 }
 
-error_code Program::ChangeStdinToBinary(){
+error_code sys::ChangeStdinToBinary(){
   // Do nothing, as Unix doesn't differentiate between text and binary.
   return make_error_code(errc::success);
 }
 
-error_code Program::ChangeStdoutToBinary(){
+error_code sys::ChangeStdoutToBinary(){
   // Do nothing, as Unix doesn't differentiate between text and binary.
   return make_error_code(errc::success);
 }
 
-error_code Program::ChangeStderrToBinary(){
+error_code sys::ChangeStderrToBinary(){
   // Do nothing, as Unix doesn't differentiate between text and binary.
   return make_error_code(errc::success);
 }
@@ -432,5 +458,4 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
   }
   return true;
 }
-
 }
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 64d1fc1..b4c78d6 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -55,8 +55,7 @@ static std::vector<std::pair<void(*)(void*), void*> > CallBacksToRun;
 static const int IntSigs[] = {
   SIGHUP, SIGINT, SIGPIPE, SIGTERM, SIGUSR1, SIGUSR2
 };
-static const int *const IntSigsEnd =
-  IntSigs + sizeof(IntSigs) / sizeof(IntSigs[0]);
+static const int *const IntSigsEnd = array_endof(IntSigs);
 
 // KillSigs - Signals that represent that we have a bug, and our prompt
 // termination has been ordered.
@@ -75,8 +74,7 @@ static const int KillSigs[] = {
   , SIGEMT
 #endif
 };
-static const int *const KillSigsEnd =
-  KillSigs + sizeof(KillSigs) / sizeof(KillSigs[0]);
+static const int *const KillSigsEnd = array_endof(KillSigs);
 
 static unsigned NumRegisteredSignals = 0;
 static struct {
@@ -211,11 +209,11 @@ void llvm::sys::SetInterruptFunction(void (*IF)()) {
 }
 
 // RemoveFileOnSignal - The public API
-bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename,
+bool llvm::sys::RemoveFileOnSignal(StringRef Filename,
                                    std::string* ErrMsg) {
   SignalsMutex.acquire();
   std::string *OldPtr = FilesToRemove.empty() ? 0 : &FilesToRemove[0];
-  FilesToRemove.push_back(Filename.str());
+  FilesToRemove.push_back(Filename);
 
   // We want to call 'c_str()' on every std::string in this vector so that if
   // the underlying implementation requires a re-allocation, it happens here
@@ -235,10 +233,10 @@ bool llvm::sys::RemoveFileOnSignal(const sys::Path &Filename,
 }
 
 // DontRemoveFileOnSignal - The public API
-void llvm::sys::DontRemoveFileOnSignal(const sys::Path &Filename) {
+void llvm::sys::DontRemoveFileOnSignal(StringRef Filename) {
   SignalsMutex.acquire();
   std::vector<std::string>::reverse_iterator RI =
-    std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename.str());
+    std::find(FilesToRemove.rbegin(), FilesToRemove.rend(), Filename);
   std::vector<std::string>::iterator I = FilesToRemove.end();
   if (RI != FilesToRemove.rend())
     I = FilesToRemove.erase(RI.base()-1);
@@ -335,7 +333,7 @@ static void PrintStackTraceSignalHandler(void *) {
 void llvm::sys::PrintStackTraceOnErrorSignal() {
   AddSignalHandler(PrintStackTraceSignalHandler, 0);
 
-#if defined(__APPLE__)
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
   // Environment variable to disable any kind of crash dialog.
   if (getenv("LLVM_DISABLE_CRASH_REPORT")) {
     mach_port_t self = mach_task_self();
@@ -361,7 +359,7 @@ void llvm::sys::PrintStackTraceOnErrorSignal() {
 // the same linkage unit by just defining our own versions of the assert handler
 // and abort.
 
-#ifdef __APPLE__
+#if defined(__APPLE__) && defined(ENABLE_CRASH_OVERRIDES)
 
 #include <signal.h>
 #include <pthread.h>
diff --git a/lib/Support/Unix/ThreadLocal.inc b/lib/Support/Unix/ThreadLocal.inc
index 2b4c901..f14d0fa 100644
--- a/lib/Support/Unix/ThreadLocal.inc
+++ b/lib/Support/Unix/ThreadLocal.inc
@@ -18,7 +18,7 @@
 
 namespace llvm {
 using namespace sys;
-ThreadLocalImpl::ThreadLocalImpl() { }
+ThreadLocalImpl::ThreadLocalImpl() : data() { }
 ThreadLocalImpl::~ThreadLocalImpl() { }
 void ThreadLocalImpl::setInstance(const void* d) { data = const_cast<void*>(d);}
 const void* ThreadLocalImpl::getInstance() { return data; }
diff --git a/lib/Support/Unix/TimeValue.inc b/lib/Support/Unix/TimeValue.inc
index df8558b..80532b0 100644
--- a/lib/Support/Unix/TimeValue.inc
+++ b/lib/Support/Unix/TimeValue.inc
@@ -22,18 +22,13 @@ namespace llvm {
   using namespace sys;
 
 std::string TimeValue::str() const {
-  char buffer[32];
-
-  time_t ourTime = time_t(this->toEpochTime());
-#ifdef __hpux
-// note that the following line needs -D_REENTRANT on HP-UX to be picked up
-  asctime_r(localtime(&ourTime), buffer);
-#else
-  ::asctime_r(::localtime(&ourTime), buffer);
-#endif
-
-  std::string result(buffer);
-  return result.substr(0,24);
+  time_t OurTime = time_t(this->toEpochTime());
+  struct tm Storage;
+  struct tm *LT = ::localtime_r(&OurTime, &Storage);
+  assert(LT);
+  char Buffer[25];
+  strftime(Buffer, 25, "%b %e %H:%M %Y", LT);
+  return std::string(Buffer);
 }
 
 TimeValue TimeValue::now() {
diff --git a/lib/Support/Unix/Unix.h b/lib/Support/Unix/Unix.h
index 051f56f..ba688e3 100644
--- a/lib/Support/Unix/Unix.h
+++ b/lib/Support/Unix/Unix.h
@@ -22,28 +22,22 @@
 #include "llvm/Config/config.h"     // Get autoconf configuration settings
 #include "llvm/Support/Errno.h"
 #include <algorithm>
+#include <assert.h>
 #include <cerrno>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <string>
+#include <sys/types.h>
 
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
 
-#ifdef HAVE_SYS_TYPES_H
-#include <sys/types.h>
-#endif
-
 #ifdef HAVE_SYS_PARAM_H
 #include <sys/param.h>
 #endif
 
-#ifdef HAVE_ASSERT_H
-#include <assert.h>
-#endif
-
 #ifdef HAVE_SYS_TIME_H
 # include <sys/time.h>
 #endif
@@ -53,6 +47,10 @@
 # include <sys/wait.h>
 #endif
 
+#ifdef HAVE_DLFCN_H
+# include <dlfcn.h>
+#endif
+
 #ifndef WEXITSTATUS
 # define WEXITSTATUS(stat_val) ((unsigned)(stat_val) >> 8)
 #endif
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 83da82a..5a7b219 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -71,7 +71,7 @@ extern "C" {
 
 DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
                                                    std::string *errMsg) {
-  SmartScopedLock<true> lock(getMutex());
+  SmartScopedLock<true> lock(*SymbolsMutex);
 
   if (!filename) {
     // When no file is specified, enumerate all DLLs and EXEs in the process.
@@ -83,8 +83,15 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
     // This is mostly to ensure that the return value still shows up as "valid".
     return DynamicLibrary(&OpenedHandles);
   }
+
+  SmallVector<wchar_t, MAX_PATH> filenameUnicode;
+  if (error_code ec = windows::UTF8ToUTF16(filename, filenameUnicode)) {
+    SetLastError(ec.value());
+    MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16: ");
+    return DynamicLibrary();
+  }
   
-  HMODULE a_handle = LoadLibrary(filename);
+  HMODULE a_handle = LoadLibraryW(filenameUnicode.data());
 
   if (a_handle == 0) {
     MakeErrMsg(errMsg, std::string(filename) + ": Can't open : ");
@@ -114,10 +121,10 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
 #undef EXPLICIT_SYMBOL2
 
 void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
-  SmartScopedLock<true> Lock(getMutex());
+  SmartScopedLock<true> Lock(*SymbolsMutex);
 
   // First check symbols added via AddSymbol().
-  if (ExplicitSymbols) {
+  if (ExplicitSymbols.isConstructed()) {
     StringMap<void *>::iterator i = ExplicitSymbols->find(symbolName);
 
     if (i != ExplicitSymbols->end())
diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc
index 4c5aebd..1260452 100644
--- a/lib/Support/Windows/Memory.inc
+++ b/lib/Support/Windows/Memory.inc
@@ -82,7 +82,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
 
   uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
                                 NearBlock->size()
-                           : NULL;
+                           : 0;
 
   // If the requested address is not aligned to the allocation granularity,
   // round up to get beyond NearBlock. VirtualAlloc would have rounded down.
@@ -106,7 +106,7 @@ MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
   MemoryBlock Result;
   Result.Address = PA;
   Result.Size = NumBlocks*Granularity;
-                                 ;
+
   if (Flags & MF_EXEC)
     Memory::InvalidateInstructionCache(Result.Address, Result.Size);
 
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index f4898e6..0b39198 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -1,4 +1,4 @@
-//===- llvm/Support/Win32/Path.cpp - Win32 Path Implementation ---*- C++ -*-===//
+//===- llvm/Support/Windows/Path.inc - Windows Path Impl --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,920 +7,1116 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file provides the Win32 specific implementation of the Path class.
+// This file implements the Windows specific implementation of the Path API.
 //
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only generic Win32 code that
-//===          is guaranteed to work on *all* Win32 variants.
+//=== WARNING: Implementation here must contain only generic Windows code that
+//===          is guaranteed to work on *all* Windows variants.
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/STLExtras.h"
 #include "Windows.h"
-#include <cstdio>
-#include <malloc.h>
+#include <fcntl.h>
+#include <io.h>
+#include <sys/stat.h>
+#include <sys/types.h>
 
-// We need to undo a macro defined in Windows.h, otherwise we won't compile:
-#undef CopyFile
-#undef GetCurrentDirectory
+#undef max
 
-// Windows happily accepts either forward or backward slashes, though any path
-// returned by a Win32 API will have backward slashes.  As LLVM code basically
-// assumes forward slashes are used, backward slashs are converted where they
-// can be introduced into a path.
-//
-// Another invariant is that a path ends with a slash if and only if the path
-// is a root directory.  Any other use of a trailing slash is stripped.  Unlike
-// in Unix, Windows has a rather complicated notion of a root path and this
-// invariant helps simply the code.
-
-static void FlipBackSlashes(std::string& s) {
-  for (size_t i = 0; i < s.size(); i++)
-    if (s[i] == '\\')
-      s[i] = '/';
-}
+// MinGW doesn't define this.
+#ifndef _ERRNO_T_DEFINED
+#define _ERRNO_T_DEFINED
+typedef int errno_t;
+#endif
 
-namespace llvm {
-namespace sys {
+#ifdef _MSC_VER
+# pragma comment(lib, "advapi32.lib")  // This provides CryptAcquireContextW.
+#endif
 
-const char PathSeparator = ';';
+using namespace llvm;
 
-StringRef Path::GetEXESuffix() {
-  return "exe";
-}
+using llvm::sys::windows::UTF8ToUTF16;
+using llvm::sys::windows::UTF16ToUTF8;
 
-Path::Path(llvm::StringRef p)
-  : path(p) {
-  FlipBackSlashes(path);
-}
+namespace {
+  typedef BOOLEAN (WINAPI *PtrCreateSymbolicLinkW)(
+    /*__in*/ LPCWSTR lpSymlinkFileName,
+    /*__in*/ LPCWSTR lpTargetFileName,
+    /*__in*/ DWORD dwFlags);
 
-Path::Path(const char *StrStart, unsigned StrLen)
-  : path(StrStart, StrLen) {
-  FlipBackSlashes(path);
-}
+  PtrCreateSymbolicLinkW create_symbolic_link_api =
+      PtrCreateSymbolicLinkW(::GetProcAddress(
+          ::GetModuleHandleW(L"Kernel32.dll"), "CreateSymbolicLinkW"));
 
-Path&
-Path::operator=(StringRef that) {
-  path.assign(that.data(), that.size());
-  FlipBackSlashes(path);
-  return *this;
-}
+  error_code TempDir(SmallVectorImpl<wchar_t> &result) {
+  retry_temp_dir:
+    DWORD len = ::GetTempPathW(result.capacity(), result.begin());
 
-bool
-Path::isValid() const {
-  if (path.empty())
-    return false;
+    if (len == 0)
+      return windows_error(::GetLastError());
 
-  size_t len = path.size();
-  // If there is a null character, it and all its successors are ignored.
-  size_t pos = path.find_first_of('\0');
-  if (pos != std::string::npos)
-    len = pos;
-
-  // If there is a colon, it must be the second character, preceded by a letter
-  // and followed by something.
-  pos = path.rfind(':',len);
-  size_t rootslash = 0;
-  if (pos != std::string::npos) {
-    if (pos != 1 || !isalpha(static_cast<unsigned char>(path[0])) || len < 3)
-      return false;
-      rootslash = 2;
-  }
+    if (len > result.capacity()) {
+      result.reserve(len);
+      goto retry_temp_dir;
+    }
 
-  // Look for a UNC path, and if found adjust our notion of the root slash.
-  if (len > 3 && path[0] == '/' && path[1] == '/') {
-    rootslash = path.find('/', 2);
-    if (rootslash == std::string::npos)
-      rootslash = 0;
+    result.set_size(len);
+    return error_code::success();
   }
 
-  // Check for illegal characters.
-  if (path.find_first_of("\\<>\"|\001\002\003\004\005\006\007\010\011\012"
-                         "\013\014\015\016\017\020\021\022\023\024\025\026"
-                         "\027\030\031\032\033\034\035\036\037")
-      != std::string::npos)
-    return false;
-
-  // Remove trailing slash, unless it's a root slash.
-  if (len > rootslash+1 && path[len-1] == '/')
-    path.erase(--len);
-
-  // Check each component for legality.
-  for (pos = 0; pos < len; ++pos) {
-    // A component may not end in a space.
-    if (path[pos] == ' ') {
-      if (pos+1 == len || path[pos+1] == '/' || path[pos+1] == '\0')
-        return false;
+  bool is_separator(const wchar_t value) {
+    switch (value) {
+    case L'\\':
+    case L'/':
+      return true;
+    default:
+      return false;
     }
+  }
+}
 
-    // A component may not end in a period.
-    if (path[pos] == '.') {
-      if (pos+1 == len || path[pos+1] == '/') {
-        // Unless it is the pseudo-directory "."...
-        if (pos == 0 || path[pos-1] == '/' || path[pos-1] == ':')
-          return true;
-        // or "..".
-        if (pos > 0 && path[pos-1] == '.') {
-          if (pos == 1 || path[pos-2] == '/' || path[pos-2] == ':')
-            return true;
-        }
-        return false;
+// FIXME: mode should be used here and default to user r/w only,
+// it currently comes in as a UNIX mode.
+static error_code createUniqueEntity(const Twine &model, int &result_fd,
+                                     SmallVectorImpl<char> &result_path,
+                                     bool makeAbsolute, unsigned mode,
+                                     FSEntity Type) {
+  // Use result_path as temp storage.
+  result_path.set_size(0);
+  StringRef m = model.toStringRef(result_path);
+
+  SmallVector<wchar_t, 128> model_utf16;
+  if (error_code ec = UTF8ToUTF16(m, model_utf16)) return ec;
+
+  if (makeAbsolute) {
+    // Make model absolute by prepending a temp directory if it's not already.
+    bool absolute = sys::path::is_absolute(m);
+
+    if (!absolute) {
+      SmallVector<wchar_t, 64> temp_dir;
+      if (error_code ec = TempDir(temp_dir)) return ec;
+      // Handle c: by removing it.
+      if (model_utf16.size() > 2 && model_utf16[1] == L':') {
+        model_utf16.erase(model_utf16.begin(), model_utf16.begin() + 2);
       }
+      model_utf16.insert(model_utf16.begin(), temp_dir.begin(), temp_dir.end());
     }
   }
 
-  return true;
-}
+  // Replace '%' with random chars. From here on, DO NOT modify model. It may be
+  // needed if the randomly chosen path already exists.
+  SmallVector<wchar_t, 128> random_path_utf16;
+
+  // Get a Crypto Provider for CryptGenRandom.
+  HCRYPTPROV HCPC;
+  if (!::CryptAcquireContextW(&HCPC,
+                              NULL,
+                              NULL,
+                              PROV_RSA_FULL,
+                              CRYPT_VERIFYCONTEXT))
+    return windows_error(::GetLastError());
+  ScopedCryptContext CryptoProvider(HCPC);
+
+retry_random_path:
+  random_path_utf16.set_size(0);
+  for (SmallVectorImpl<wchar_t>::const_iterator i = model_utf16.begin(),
+                                                e = model_utf16.end();
+                                                i != e; ++i) {
+    if (*i == L'%') {
+      BYTE val = 0;
+      if (!::CryptGenRandom(CryptoProvider, 1, &val))
+          return windows_error(::GetLastError());
+      random_path_utf16.push_back(L"0123456789abcdef"[val & 15]);
+    }
+    else
+      random_path_utf16.push_back(*i);
+  }
+  // Make random_path_utf16 null terminated.
+  random_path_utf16.push_back(0);
+  random_path_utf16.pop_back();
+
+  HANDLE TempFileHandle = INVALID_HANDLE_VALUE;
+
+  switch (Type) {
+  case FS_File: {
+    // Try to create + open the path.
+    TempFileHandle =
+        ::CreateFileW(random_path_utf16.begin(), GENERIC_READ | GENERIC_WRITE,
+                      FILE_SHARE_READ, NULL,
+                      // Return ERROR_FILE_EXISTS if the file
+                      // already exists.
+                      CREATE_NEW, FILE_ATTRIBUTE_TEMPORARY, NULL);
+    if (TempFileHandle == INVALID_HANDLE_VALUE) {
+      // If the file existed, try again, otherwise, error.
+      error_code ec = windows_error(::GetLastError());
+      if (ec == windows_error::file_exists)
+        goto retry_random_path;
+
+      return ec;
+    }
 
-void Path::makeAbsolute() {
-  TCHAR  FullPath[MAX_PATH + 1] = {0};
-  LPTSTR FilePart = NULL;
+    // Convert the Windows API file handle into a C-runtime handle.
+    int fd = ::_open_osfhandle(intptr_t(TempFileHandle), 0);
+    if (fd == -1) {
+      ::CloseHandle(TempFileHandle);
+      ::DeleteFileW(random_path_utf16.begin());
+      // MSDN doesn't say anything about _open_osfhandle setting errno or
+      // GetLastError(), so just return invalid_handle.
+      return windows_error::invalid_handle;
+    }
 
-  DWORD RetLength = ::GetFullPathNameA(path.c_str(),
-                        sizeof(FullPath)/sizeof(FullPath[0]),
-                        FullPath, &FilePart);
+    result_fd = fd;
+    break;
+  }
 
-  if (0 == RetLength) {
-    // FIXME: Report the error GetLastError()
-    assert(0 && "Unable to make absolute path!");
-  } else if (RetLength > MAX_PATH) {
-    // FIXME: Report too small buffer (needed RetLength bytes).
-    assert(0 && "Unable to make absolute path!");
-  } else {
-    path = FullPath;
+  case FS_Name: {
+    DWORD attributes = ::GetFileAttributesW(random_path_utf16.begin());
+    if (attributes != INVALID_FILE_ATTRIBUTES)
+      goto retry_random_path;
+    error_code EC = make_error_code(windows_error(::GetLastError()));
+    if (EC != windows_error::file_not_found &&
+        EC != windows_error::path_not_found)
+      return EC;
+    break;
   }
-}
 
-bool
-Path::isAbsolute(const char *NameStart, unsigned NameLen) {
-  assert(NameStart);
-  // FIXME: This does not handle correctly an absolute path starting from
-  // a drive letter or in UNC format.
-  switch (NameLen) {
-  case 0:
-    return false;
-  case 1:
-  case 2:
-    return NameStart[0] == '/';
-  default:
-    return
-      (NameStart[0] == '/' || (NameStart[1] == ':' && NameStart[2] == '/')) ||
-      (NameStart[0] == '\\' || (NameStart[1] == ':' && NameStart[2] == '\\'));
+  case FS_Dir:
+    if (!::CreateDirectoryW(random_path_utf16.begin(), NULL)) {
+      error_code EC = windows_error(::GetLastError());
+      if (EC != windows_error::already_exists)
+        return EC;
+      goto retry_random_path;
+    }
+    break;
   }
-}
 
-bool
-Path::isAbsolute() const {
-  // FIXME: This does not handle correctly an absolute path starting from
-  // a drive letter or in UNC format.
-  switch (path.length()) {
-    case 0:
-      return false;
-    case 1:
-    case 2:
-      return path[0] == '/';
-    default:
-      return path[0] == '/' || (path[1] == ':' && path[2] == '/');
+  // Set result_path to the utf-8 representation of the path.
+  if (error_code ec = UTF16ToUTF8(random_path_utf16.begin(),
+                                  random_path_utf16.size(), result_path)) {
+    switch (Type) {
+    case FS_File:
+      ::CloseHandle(TempFileHandle);
+      ::DeleteFileW(random_path_utf16.begin());
+    case FS_Name:
+      break;
+    case FS_Dir:
+      ::RemoveDirectoryW(random_path_utf16.begin());
+      break;
+    }
+    return ec;
   }
+
+  return error_code::success();
 }
 
-static Path *TempDirectory;
+namespace llvm {
+namespace sys  {
+namespace fs {
 
-Path
-Path::GetTemporaryDirectory(std::string* ErrMsg) {
-  if (TempDirectory) {
-#if defined(_MSC_VER)
-    // Visual Studio gets confused and emits a diagnostic about calling exists,
-    // even though this is the implementation for PathV1.  Temporarily 
-    // disable the deprecated warning message
-    #pragma warning(push)
-    #pragma warning(disable:4996)
-#endif
-    assert(TempDirectory->exists() && "Who has removed TempDirectory?");
-#if defined(_MSC_VER)
-    #pragma warning(pop)
-#endif
-    return *TempDirectory;
-  }
+std::string getMainExecutable(const char *argv0, void *MainExecAddr) {
+  SmallVector<wchar_t, MAX_PATH> PathName;
+  DWORD Size = ::GetModuleFileNameW(NULL, PathName.data(), PathName.capacity());
 
-  char pathname[MAX_PATH];
-  if (!GetTempPath(MAX_PATH, pathname)) {
-    if (ErrMsg)
-      *ErrMsg = "Can't determine temporary directory";
-    return Path();
-  }
+  // A zero return value indicates a failure other than insufficient space.
+  if (Size == 0)
+    return "";
 
-  Path result;
-  result.set(pathname);
+  // Insufficient space is determined by a return value equal to the size of
+  // the buffer passed in.
+  if (Size == PathName.capacity())
+    return "";
 
-  // Append a subdirectory based on our process id so multiple LLVMs don't
-  // step on each other's toes.
-#ifdef __MINGW32__
-  // Mingw's Win32 header files are broken.
-  sprintf(pathname, "LLVM_%u", unsigned(GetCurrentProcessId()));
-#else
-  sprintf(pathname, "LLVM_%u", GetCurrentProcessId());
-#endif
-  result.appendComponent(pathname);
-
-  // If there's a directory left over from a previous LLVM execution that
-  // happened to have the same process id, get rid of it.
-  result.eraseFromDisk(true);
-
-  // And finally (re-)create the empty directory.
-  result.createDirectoryOnDisk(false);
-  TempDirectory = new Path(result);
-  return *TempDirectory;
-}
-
-// FIXME: the following set of functions don't map to Windows very well.
-Path
-Path::GetRootDirectory() {
-  // This is the only notion that that Windows has of a root directory. Nothing
-  // is here except for drives.
-  return Path("file:///");
-}
-
-void
-Path::GetSystemLibraryPaths(std::vector<sys::Path>& Paths) {
-  char buff[MAX_PATH];
-  // Generic form of C:\Windows\System32
-  HRESULT res =  SHGetFolderPathA(NULL,
-                                  CSIDL_FLAG_CREATE | CSIDL_SYSTEM,
-                                  NULL,
-                                  SHGFP_TYPE_CURRENT,
-                                  buff);
-  if (res != S_OK) {
-    assert(0 && "Failed to get system directory");
-    return;
-  }
-  Paths.push_back(sys::Path(buff));
-
-  // Reset buff.
-  buff[0] = 0;
-  // Generic form of C:\Windows
-  res =  SHGetFolderPathA(NULL,
-                          CSIDL_FLAG_CREATE | CSIDL_WINDOWS,
-                          NULL,
-                          SHGFP_TYPE_CURRENT,
-                          buff);
-  if (res != S_OK) {
-    assert(0 && "Failed to get windows directory");
-    return;
-  }
-  Paths.push_back(sys::Path(buff));
-}
+  // On success, GetModuleFileNameW returns the number of characters written to
+  // the buffer not including the NULL terminator.
+  PathName.set_size(Size);
 
-void
-Path::GetBitcodeLibraryPaths(std::vector<sys::Path>& Paths) {
-  char * env_var = getenv("LLVM_LIB_SEARCH_PATH");
-  if (env_var != 0) {
-    getPathList(env_var,Paths);
-  }
-#ifdef LLVM_LIBDIR
-  {
-    Path tmpPath;
-    if (tmpPath.set(LLVM_LIBDIR))
-      if (tmpPath.canRead())
-        Paths.push_back(tmpPath);
-  }
-#endif
-  GetSystemLibraryPaths(Paths);
-}
+  // Convert the result from UTF-16 to UTF-8.
+  SmallVector<char, MAX_PATH> PathNameUTF8;
+  if (UTF16ToUTF8(PathName.data(), PathName.size(), PathNameUTF8))
+    return "";
 
-Path
-Path::GetUserHomeDirectory() {
-  char buff[MAX_PATH];
-  HRESULT res = SHGetFolderPathA(NULL,
-                                 CSIDL_FLAG_CREATE | CSIDL_APPDATA,
-                                 NULL,
-                                 SHGFP_TYPE_CURRENT,
-                                 buff);
-  if (res != S_OK)
-    assert(0 && "Failed to get user home directory");
-  return Path(buff);
+  return std::string(PathNameUTF8.data());
 }
 
-Path
-Path::GetCurrentDirectory() {
-  char pathname[MAX_PATH];
-  ::GetCurrentDirectoryA(MAX_PATH,pathname);
-  return Path(pathname);
+UniqueID file_status::getUniqueID() const {
+  // The file is uniquely identified by the volume serial number along
+  // with the 64-bit file identifier.
+  uint64_t FileID = (static_cast<uint64_t>(FileIndexHigh) << 32ULL) |
+                    static_cast<uint64_t>(FileIndexLow);
+
+  return UniqueID(VolumeSerialNumber, FileID);
 }
 
-/// GetMainExecutable - Return the path to the main executable, given the
-/// value of argv[0] from program startup.
-Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
-  char pathname[MAX_PATH];
-  DWORD ret = ::GetModuleFileNameA(NULL, pathname, MAX_PATH);
-  return ret != MAX_PATH ? Path(pathname) : Path();
+TimeValue file_status::getLastModificationTime() const {
+  ULARGE_INTEGER UI;
+  UI.LowPart = LastWriteTimeLow;
+  UI.HighPart = LastWriteTimeHigh;
+
+  TimeValue Ret;
+  Ret.fromWin32Time(UI.QuadPart);
+  return Ret;
 }
 
+error_code current_path(SmallVectorImpl<char> &result) {
+  SmallVector<wchar_t, MAX_PATH> cur_path;
+  DWORD len = MAX_PATH;
 
-// FIXME: the above set of functions don't map to Windows very well.
+  do {
+    cur_path.reserve(len);
+    len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
+
+    // A zero return value indicates a failure other than insufficient space.
+    if (len == 0)
+      return windows_error(::GetLastError());
 
+    // If there's insufficient space, the len returned is larger than the len
+    // given.
+  } while (len > cur_path.capacity());
 
-StringRef Path::getDirname() const {
-  return getDirnameCharSep(path, "/");
+  // On success, GetCurrentDirectoryW returns the number of characters not
+  // including the null-terminator.
+  cur_path.set_size(len);
+  return UTF16ToUTF8(cur_path.begin(), cur_path.size(), result);
 }
 
-StringRef
-Path::getBasename() const {
-  // Find the last slash
-  size_t slash = path.rfind('/');
-  if (slash == std::string::npos)
-    slash = 0;
-  else
-    slash++;
+error_code create_directory(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
 
-  size_t dot = path.rfind('.');
-  if (dot == std::string::npos || dot < slash)
-    return StringRef(path).substr(slash);
-  else
-    return StringRef(path).substr(slash, dot - slash);
-}
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
 
-StringRef
-Path::getSuffix() const {
-  // Find the last slash
-  size_t slash = path.rfind('/');
-  if (slash == std::string::npos)
-    slash = 0;
-  else
-    slash++;
+  if (!::CreateDirectoryW(path_utf16.begin(), NULL)) {
+    error_code ec = windows_error(::GetLastError());
+    if (ec == windows_error::already_exists)
+      existed = true;
+    else
+      return ec;
+  } else
+    existed = false;
 
-  size_t dot = path.rfind('.');
-  if (dot == std::string::npos || dot < slash)
-    return StringRef("");
-  else
-    return StringRef(path).substr(dot + 1);
+  return error_code::success();
 }
 
-bool
-Path::exists() const {
-  DWORD attr = GetFileAttributes(path.c_str());
-  return attr != INVALID_FILE_ATTRIBUTES;
-}
+error_code create_hard_link(const Twine &to, const Twine &from) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toStringRef(from_storage);
+  StringRef t = to.toStringRef(to_storage);
+
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> wide_from;
+  SmallVector<wchar_t, 128> wide_to;
+  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
+  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
 
-bool
-Path::isDirectory() const {
-  DWORD attr = GetFileAttributes(path.c_str());
-  return (attr != INVALID_FILE_ATTRIBUTES) &&
-         (attr & FILE_ATTRIBUTE_DIRECTORY);
+  if (!::CreateHardLinkW(wide_from.begin(), wide_to.begin(), NULL))
+    return windows_error(::GetLastError());
+
+  return error_code::success();
 }
 
-bool
-Path::isSymLink() const {
-  DWORD attributes = GetFileAttributes(path.c_str());
+error_code create_symlink(const Twine &to, const Twine &from) {
+  // Only do it if the function is available at runtime.
+  if (!create_symbolic_link_api)
+    return make_error_code(errc::function_not_supported);
 
-  if (attributes == INVALID_FILE_ATTRIBUTES)
-    // There's no sane way to report this :(.
-    assert(0 && "GetFileAttributes returned INVALID_FILE_ATTRIBUTES");
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toStringRef(from_storage);
+  StringRef t = to.toStringRef(to_storage);
 
-  // This isn't exactly what defines a NTFS symlink, but it is only true for
-  // paths that act like a symlink.
-  return attributes & FILE_ATTRIBUTE_REPARSE_POINT;
-}
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> wide_from;
+  SmallVector<wchar_t, 128> wide_to;
+  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
+  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
 
-bool
-Path::canRead() const {
-  // FIXME: take security attributes into account.
-  DWORD attr = GetFileAttributes(path.c_str());
-  return attr != INVALID_FILE_ATTRIBUTES;
-}
+  if (!create_symbolic_link_api(wide_from.begin(), wide_to.begin(), 0))
+    return windows_error(::GetLastError());
 
-bool
-Path::canWrite() const {
-  // FIXME: take security attributes into account.
-  DWORD attr = GetFileAttributes(path.c_str());
-  return (attr != INVALID_FILE_ATTRIBUTES) && !(attr & FILE_ATTRIBUTE_READONLY);
+  return error_code::success();
 }
 
-bool
-Path::canExecute() const {
-  // FIXME: take security attributes into account.
-  DWORD attr = GetFileAttributes(path.c_str());
-  return attr != INVALID_FILE_ATTRIBUTES;
+error_code remove(const Twine &path, bool &existed) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  file_status st;
+  error_code EC = status(path, st);
+  if (EC) {
+    if (EC == windows_error::file_not_found ||
+        EC == windows_error::path_not_found) {
+      existed = false;
+      return error_code::success();
+    }
+    return EC;
+  }
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  if (st.type() == file_type::directory_file) {
+    if (!::RemoveDirectoryW(c_str(path_utf16))) {
+      error_code ec = windows_error(::GetLastError());
+      if (ec != windows_error::file_not_found)
+        return ec;
+      existed = false;
+    } else
+      existed = true;
+  } else {
+    if (!::DeleteFileW(c_str(path_utf16))) {
+      error_code ec = windows_error(::GetLastError());
+      if (ec != windows_error::file_not_found)
+        return ec;
+      existed = false;
+    } else
+      existed = true;
+  }
+
+  return error_code::success();
 }
 
-bool
-Path::isRegularFile() const {
-  bool res;
-  if (fs::is_regular_file(path, res))
-    return false;
-  return res;
+error_code rename(const Twine &from, const Twine &to) {
+  // Get arguments.
+  SmallString<128> from_storage;
+  SmallString<128> to_storage;
+  StringRef f = from.toStringRef(from_storage);
+  StringRef t = to.toStringRef(to_storage);
+
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> wide_from;
+  SmallVector<wchar_t, 128> wide_to;
+  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
+  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
+
+  error_code ec = error_code::success();
+  for (int i = 0; i < 2000; i++) {
+    if (::MoveFileExW(wide_from.begin(), wide_to.begin(),
+                      MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
+      return error_code::success();
+    ec = windows_error(::GetLastError());
+    if (ec != windows_error::access_denied)
+      break;
+    // Retry MoveFile() at ACCESS_DENIED.
+    // System scanners (eg. indexer) might open the source file when
+    // It is written and closed.
+    ::Sleep(1);
+  }
+
+  return ec;
 }
 
-StringRef
-Path::getLast() const {
-  // Find the last slash
-  size_t pos = path.rfind('/');
+error_code resize_file(const Twine &path, uint64_t size) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
 
-  // Handle the corner cases
-  if (pos == std::string::npos)
-    return path;
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
 
-  // If the last character is a slash, we have a root directory
-  if (pos == path.length()-1)
-    return path;
+  int fd = ::_wopen(path_utf16.begin(), O_BINARY | _O_RDWR, S_IWRITE);
+  if (fd == -1)
+    return error_code(errno, generic_category());
+#ifdef HAVE__CHSIZE_S
+  errno_t error = ::_chsize_s(fd, size);
+#else
+  errno_t error = ::_chsize(fd, size);
+#endif
+  ::close(fd);
+  return error_code(error, generic_category());
+}
 
-  // Return everything after the last slash
-  return StringRef(path).substr(pos+1);
+error_code exists(const Twine &path, bool &result) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
+
+  if (attributes == INVALID_FILE_ATTRIBUTES) {
+    // See if the file didn't actually exist.
+    error_code ec = make_error_code(windows_error(::GetLastError()));
+    if (ec != windows_error::file_not_found &&
+        ec != windows_error::path_not_found)
+      return ec;
+    result = false;
+  } else
+    result = true;
+  return error_code::success();
 }
 
-const FileStatus *
-PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const {
-  if (!fsIsValid || update) {
-    WIN32_FILE_ATTRIBUTE_DATA fi;
-    if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi)) {
-      MakeErrMsg(ErrStr, "getStatusInfo():" + std::string(path) +
-                      ": Can't get status: ");
-      return 0;
-    }
+bool can_write(const Twine &Path) {
+  // FIXME: take security attributes into account.
+  SmallString<128> PathStorage;
+  SmallVector<wchar_t, 128> PathUtf16;
 
-    status.fileSize = fi.nFileSizeHigh;
-    status.fileSize <<= sizeof(fi.nFileSizeHigh)*8;
-    status.fileSize += fi.nFileSizeLow;
+  if (UTF8ToUTF16(Path.toStringRef(PathStorage), PathUtf16))
+    return false;
 
-    status.mode = fi.dwFileAttributes & FILE_ATTRIBUTE_READONLY ? 0555 : 0777;
-    status.user = 9999;    // Not applicable to Windows, so...
-    status.group = 9999;   // Not applicable to Windows, so...
+  DWORD Attr = ::GetFileAttributesW(PathUtf16.begin());
+  return (Attr != INVALID_FILE_ATTRIBUTES) && !(Attr & FILE_ATTRIBUTE_READONLY);
+}
 
-    // FIXME: this is only unique if the file is accessed by the same file path.
-    // How do we do this for C:\dir\file and ..\dir\file ? Unix has inode
-    // numbers, but the concept doesn't exist in Windows.
-    status.uniqueID = 0;
-    for (unsigned i = 0; i < path.length(); ++i)
-      status.uniqueID += path[i];
+bool can_execute(const Twine &Path) {
+  SmallString<128> PathStorage;
+  SmallVector<wchar_t, 128> PathUtf16;
 
-    ULARGE_INTEGER ui;
-    ui.LowPart = fi.ftLastWriteTime.dwLowDateTime;
-    ui.HighPart = fi.ftLastWriteTime.dwHighDateTime;
-    status.modTime.fromWin32Time(ui.QuadPart);
+  if (UTF8ToUTF16(Path.toStringRef(PathStorage), PathUtf16))
+    return false;
 
-    status.isDir = fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY;
-    fsIsValid = true;
-  }
-  return &status;
+  DWORD Attr = ::GetFileAttributesW(PathUtf16.begin());
+  return Attr != INVALID_FILE_ATTRIBUTES;
 }
 
-bool Path::makeReadableOnDisk(std::string* ErrMsg) {
-  // All files are readable on Windows (ignoring security attributes).
-  return false;
+bool equivalent(file_status A, file_status B) {
+  assert(status_known(A) && status_known(B));
+  return A.FileIndexHigh      == B.FileIndexHigh &&
+         A.FileIndexLow       == B.FileIndexLow &&
+         A.FileSizeHigh       == B.FileSizeHigh &&
+         A.FileSizeLow        == B.FileSizeLow &&
+         A.LastWriteTimeHigh  == B.LastWriteTimeHigh &&
+         A.LastWriteTimeLow   == B.LastWriteTimeLow &&
+         A.VolumeSerialNumber == B.VolumeSerialNumber;
 }
 
-bool Path::makeWriteableOnDisk(std::string* ErrMsg) {
-  DWORD attr = GetFileAttributes(path.c_str());
+error_code equivalent(const Twine &A, const Twine &B, bool &result) {
+  file_status fsA, fsB;
+  if (error_code ec = status(A, fsA)) return ec;
+  if (error_code ec = status(B, fsB)) return ec;
+  result = equivalent(fsA, fsB);
+  return error_code::success();
+}
 
-  // If it doesn't exist, we're done.
-  if (attr == INVALID_FILE_ATTRIBUTES)
-    return false;
+static bool isReservedName(StringRef path) {
+  // This list of reserved names comes from MSDN, at:
+  // http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx
+  static const char *sReservedNames[] = { "nul", "con", "prn", "aux",
+                              "com1", "com2", "com3", "com4", "com5", "com6",
+                              "com7", "com8", "com9", "lpt1", "lpt2", "lpt3",
+                              "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9" };
+
+  // First, check to see if this is a device namespace, which always
+  // starts with \\.\, since device namespaces are not legal file paths.
+  if (path.startswith("\\\\.\\"))
+    return true;
 
-  if (attr & FILE_ATTRIBUTE_READONLY) {
-    if (!SetFileAttributes(path.c_str(), attr & ~FILE_ATTRIBUTE_READONLY)) {
-      MakeErrMsg(ErrMsg, std::string(path) + ": Can't make file writable: ");
+  // Then compare against the list of ancient reserved names
+  for (size_t i = 0; i < array_lengthof(sReservedNames); ++i) {
+    if (path.equals_lower(sReservedNames[i]))
       return true;
-    }
   }
-  return false;
-}
 
-bool Path::makeExecutableOnDisk(std::string* ErrMsg) {
-  // All files are executable on Windows (ignoring security attributes).
+  // The path isn't what we consider reserved.
   return false;
 }
 
-bool
-Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const {
-  WIN32_FILE_ATTRIBUTE_DATA fi;
-  if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi)) {
-    MakeErrMsg(ErrMsg, path + ": can't get status of file");
-    return true;
+static error_code getStatus(HANDLE FileHandle, file_status &Result) {
+  if (FileHandle == INVALID_HANDLE_VALUE)
+    goto handle_status_error;
+
+  switch (::GetFileType(FileHandle)) {
+  default:
+    llvm_unreachable("Don't know anything about this file type");
+  case FILE_TYPE_UNKNOWN: {
+    DWORD Err = ::GetLastError();
+    if (Err != NO_ERROR)
+      return windows_error(Err);
+    Result = file_status(file_type::type_unknown);
+    return error_code::success();
+  }
+  case FILE_TYPE_DISK:
+    break;
+  case FILE_TYPE_CHAR:
+    Result = file_status(file_type::character_file);
+    return error_code::success();
+  case FILE_TYPE_PIPE:
+    Result = file_status(file_type::fifo_file);
+    return error_code::success();
   }
 
-  if (!(fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
-    if (ErrMsg)
-      *ErrMsg = path + ": not a directory";
-    return true;
+  BY_HANDLE_FILE_INFORMATION Info;
+  if (!::GetFileInformationByHandle(FileHandle, &Info))
+    goto handle_status_error;
+
+  {
+    file_type Type = (Info.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
+                         ? file_type::directory_file
+                         : file_type::regular_file;
+    Result =
+        file_status(Type, Info.ftLastWriteTime.dwHighDateTime,
+                    Info.ftLastWriteTime.dwLowDateTime,
+                    Info.dwVolumeSerialNumber, Info.nFileSizeHigh,
+                    Info.nFileSizeLow, Info.nFileIndexHigh, Info.nFileIndexLow);
+    return error_code::success();
   }
 
-  result.clear();
-  WIN32_FIND_DATA fd;
-  std::string searchpath = path;
-  if (path.size() == 0 || searchpath[path.size()-1] == '/')
-    searchpath += "*";
+handle_status_error:
+  error_code EC = windows_error(::GetLastError());
+  if (EC == windows_error::file_not_found ||
+      EC == windows_error::path_not_found)
+    Result = file_status(file_type::file_not_found);
+  else if (EC == windows_error::sharing_violation)
+    Result = file_status(file_type::type_unknown);
   else
-    searchpath += "/*";
+    Result = file_status(file_type::status_error);
+  return EC;
+}
 
-  HANDLE h = FindFirstFile(searchpath.c_str(), &fd);
-  if (h == INVALID_HANDLE_VALUE) {
-    if (GetLastError() == ERROR_FILE_NOT_FOUND)
-      return true; // not really an error, now is it?
-    MakeErrMsg(ErrMsg, path + ": Can't read directory: ");
-    return true;
-  }
+error_code status(const Twine &path, file_status &result) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
 
-  do {
-    if (fd.cFileName[0] == '.')
-      continue;
-    Path aPath(path);
-    aPath.appendComponent(&fd.cFileName[0]);
-    result.insert(aPath);
-  } while (FindNextFile(h, &fd));
-
-  DWORD err = GetLastError();
-  FindClose(h);
-  if (err != ERROR_NO_MORE_FILES) {
-    SetLastError(err);
-    MakeErrMsg(ErrMsg, path + ": Can't read directory: ");
-    return true;
+  StringRef path8 = path.toStringRef(path_storage);
+  if (isReservedName(path8)) {
+    result = file_status(file_type::character_file);
+    return error_code::success();
   }
-  return false;
-}
 
-bool
-Path::set(StringRef a_path) {
-  if (a_path.empty())
-    return false;
-  std::string save(path);
-  path = a_path;
-  FlipBackSlashes(path);
-  if (!isValid()) {
-    path = save;
-    return false;
+  if (error_code ec = UTF8ToUTF16(path8, path_utf16))
+    return ec;
+
+  DWORD attr = ::GetFileAttributesW(path_utf16.begin());
+  if (attr == INVALID_FILE_ATTRIBUTES)
+    return getStatus(INVALID_HANDLE_VALUE, result);
+
+  // Handle reparse points.
+  if (attr & FILE_ATTRIBUTE_REPARSE_POINT) {
+    ScopedFileHandle h(
+      ::CreateFileW(path_utf16.begin(),
+                    0, // Attributes only.
+                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+                    NULL,
+                    OPEN_EXISTING,
+                    FILE_FLAG_BACKUP_SEMANTICS,
+                    0));
+    if (!h)
+      return getStatus(INVALID_HANDLE_VALUE, result);
   }
-  return true;
+
+  ScopedFileHandle h(
+      ::CreateFileW(path_utf16.begin(), 0, // Attributes only.
+                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
+                    NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, 0));
+    if (!h)
+      return getStatus(INVALID_HANDLE_VALUE, result);
+
+    return getStatus(h, result);
 }
 
-bool
-Path::appendComponent(StringRef name) {
-  if (name.empty())
-    return false;
-  std::string save(path);
-  if (!path.empty()) {
-    size_t last = path.size() - 1;
-    if (path[last] != '/')
-      path += '/';
-  }
-  path += name;
-  if (!isValid()) {
-    path = save;
-    return false;
-  }
-  return true;
+error_code status(int FD, file_status &Result) {
+  HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+  return getStatus(FileHandle, Result);
 }
 
-bool
-Path::eraseComponent() {
-  size_t slashpos = path.rfind('/',path.size());
-  if (slashpos == path.size() - 1 || slashpos == std::string::npos)
-    return false;
-  std::string save(path);
-  path.erase(slashpos);
-  if (!isValid()) {
-    path = save;
-    return false;
-  }
-  return true;
+error_code setLastModificationAndAccessTime(int FD, TimeValue Time) {
+  ULARGE_INTEGER UI;
+  UI.QuadPart = Time.toWin32Time();
+  FILETIME FT;
+  FT.dwLowDateTime = UI.LowPart;
+  FT.dwHighDateTime = UI.HighPart;
+  HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+  if (!SetFileTime(FileHandle, NULL, &FT, &FT))
+    return windows_error(::GetLastError());
+  return error_code::success();
 }
 
-bool
-Path::eraseSuffix() {
-  size_t dotpos = path.rfind('.',path.size());
-  size_t slashpos = path.rfind('/',path.size());
-  if (dotpos != std::string::npos) {
-    if (slashpos == std::string::npos || dotpos > slashpos+1) {
-      std::string save(path);
-      path.erase(dotpos, path.size()-dotpos);
-      if (!isValid()) {
-        path = save;
-        return false;
-      }
-      return true;
-    }
+error_code get_magic(const Twine &path, uint32_t len,
+                     SmallVectorImpl<char> &result) {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+  result.set_size(0);
+
+  // Convert path to UTF-16.
+  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
+                                  path_utf16))
+    return ec;
+
+  // Open file.
+  HANDLE file = ::CreateFileW(c_str(path_utf16),
+                              GENERIC_READ,
+                              FILE_SHARE_READ,
+                              NULL,
+                              OPEN_EXISTING,
+                              FILE_ATTRIBUTE_READONLY,
+                              NULL);
+  if (file == INVALID_HANDLE_VALUE)
+    return windows_error(::GetLastError());
+
+  // Allocate buffer.
+  result.reserve(len);
+
+  // Get magic!
+  DWORD bytes_read = 0;
+  BOOL read_success = ::ReadFile(file, result.data(), len, &bytes_read, NULL);
+  error_code ec = windows_error(::GetLastError());
+  ::CloseHandle(file);
+  if (!read_success || (bytes_read != len)) {
+    // Set result size to the number of bytes read if it's valid.
+    if (bytes_read <= len)
+      result.set_size(bytes_read);
+    // ERROR_HANDLE_EOF is mapped to errc::value_too_large.
+    return ec;
   }
-  return false;
-}
 
-inline bool PathMsg(std::string* ErrMsg, const char* pathname, const char*msg) {
-  if (ErrMsg)
-    *ErrMsg = std::string(pathname) + ": " + std::string(msg);
-  return true;
+  result.set_size(len);
+  return error_code::success();
 }
 
-bool
-Path::createDirectoryOnDisk(bool create_parents, std::string* ErrMsg) {
-  // Get a writeable copy of the path name
-  size_t len = path.length();
-  char *pathname = reinterpret_cast<char *>(_alloca(len+2));
-  path.copy(pathname, len);
-  pathname[len] = 0;
-
-  // Make sure it ends with a slash.
-  if (len == 0 || pathname[len - 1] != '/') {
-    pathname[len] = '/';
-    pathname[++len] = 0;
+error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
+  FileDescriptor = FD;
+  // Make sure that the requested size fits within SIZE_T.
+  if (Size > std::numeric_limits<SIZE_T>::max()) {
+    if (FileDescriptor) {
+      if (CloseFD)
+        _close(FileDescriptor);
+    } else
+      ::CloseHandle(FileHandle);
+    return make_error_code(errc::invalid_argument);
   }
 
-  // Determine starting point for initial / search.
-  char *next = pathname;
-  if (pathname[0] == '/' && pathname[1] == '/') {
-    // Skip host name.
-    next = strchr(pathname+2, '/');
-    if (next == NULL)
-      return PathMsg(ErrMsg, pathname, "badly formed remote directory");
+  DWORD flprotect;
+  switch (Mode) {
+  case readonly:  flprotect = PAGE_READONLY; break;
+  case readwrite: flprotect = PAGE_READWRITE; break;
+  case priv:      flprotect = PAGE_WRITECOPY; break;
+  }
 
-    // Skip share name.
-    next = strchr(next+1, '/');
-    if (next == NULL)
-      return PathMsg(ErrMsg, pathname,"badly formed remote directory");
+  FileMappingHandle =
+      ::CreateFileMappingW(FileHandle, 0, flprotect,
+                           (Offset + Size) >> 32,
+                           (Offset + Size) & 0xffffffff,
+                           0);
+  if (FileMappingHandle == NULL) {
+    error_code ec = windows_error(GetLastError());
+    if (FileDescriptor) {
+      if (CloseFD)
+        _close(FileDescriptor);
+    } else
+      ::CloseHandle(FileHandle);
+    return ec;
+  }
 
-    next++;
-    if (*next == 0)
-      return PathMsg(ErrMsg, pathname, "badly formed remote directory");
+  DWORD dwDesiredAccess;
+  switch (Mode) {
+  case readonly:  dwDesiredAccess = FILE_MAP_READ; break;
+  case readwrite: dwDesiredAccess = FILE_MAP_WRITE; break;
+  case priv:      dwDesiredAccess = FILE_MAP_COPY; break;
+  }
+  Mapping = ::MapViewOfFile(FileMappingHandle,
+                            dwDesiredAccess,
+                            Offset >> 32,
+                            Offset & 0xffffffff,
+                            Size);
+  if (Mapping == NULL) {
+    error_code ec = windows_error(GetLastError());
+    ::CloseHandle(FileMappingHandle);
+    if (FileDescriptor) {
+      if (CloseFD)
+        _close(FileDescriptor);
+    } else
+      ::CloseHandle(FileHandle);
+    return ec;
+  }
 
-  } else {
-    if (pathname[1] == ':')
-      next += 2;    // skip drive letter
-    if (*next == '/')
-      next++;       // skip root directory
-  }
-
-  // If we're supposed to create intermediate directories
-  if (create_parents) {
-    // Loop through the directory components until we're done
-    while (*next) {
-      next = strchr(next, '/');
-      *next = 0;
-      if (!CreateDirectory(pathname, NULL) &&
-          GetLastError() != ERROR_ALREADY_EXISTS)
-          return MakeErrMsg(ErrMsg,
-            std::string(pathname) + ": Can't create directory: ");
-      *next++ = '/';
-    }
-  } else {
-    // Drop trailing slash.
-    pathname[len-1] = 0;
-    if (!CreateDirectory(pathname, NULL) &&
-        GetLastError() != ERROR_ALREADY_EXISTS) {
-      return MakeErrMsg(ErrMsg, std::string(pathname) +
-                        ": Can't create directory: ");
+  if (Size == 0) {
+    MEMORY_BASIC_INFORMATION mbi;
+    SIZE_T Result = VirtualQuery(Mapping, &mbi, sizeof(mbi));
+    if (Result == 0) {
+      error_code ec = windows_error(GetLastError());
+      ::UnmapViewOfFile(Mapping);
+      ::CloseHandle(FileMappingHandle);
+      if (FileDescriptor) {
+        if (CloseFD)
+          _close(FileDescriptor);
+      } else
+        ::CloseHandle(FileHandle);
+      return ec;
     }
+    Size = mbi.RegionSize;
   }
-  return false;
-}
-
-bool
-Path::createFileOnDisk(std::string* ErrMsg) {
-  // Create the file
-  HANDLE h = CreateFile(path.c_str(), GENERIC_WRITE, 0, NULL, CREATE_NEW,
-                        FILE_ATTRIBUTE_NORMAL, NULL);
-  if (h == INVALID_HANDLE_VALUE)
-    return MakeErrMsg(ErrMsg, path + ": Can't create file: ");
 
-  CloseHandle(h);
-  return false;
+  // Close all the handles except for the view. It will keep the other handles
+  // alive.
+  ::CloseHandle(FileMappingHandle);
+  if (FileDescriptor) {
+    if (CloseFD)
+      _close(FileDescriptor); // Also closes FileHandle.
+  } else
+    ::CloseHandle(FileHandle);
+  return error_code::success();
 }
 
-bool
-Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const {
-  WIN32_FILE_ATTRIBUTE_DATA fi;
-  if (!GetFileAttributesEx(path.c_str(), GetFileExInfoStandard, &fi))
-    return true;
+mapped_file_region::mapped_file_region(const Twine &path,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec)
+  : Mode(mode)
+  , Size(length)
+  , Mapping()
+  , FileDescriptor()
+  , FileHandle(INVALID_HANDLE_VALUE)
+  , FileMappingHandle() {
+  SmallString<128> path_storage;
+  SmallVector<wchar_t, 128> path_utf16;
+
+  // Convert path to UTF-16.
+  if ((ec = UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)))
+    return;
 
-  if (fi.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
-    // If it doesn't exist, we're done.
-    bool Exists;
-    if (fs::exists(path, Exists) || !Exists)
-      return false;
+  // Get file handle for creating a file mapping.
+  FileHandle = ::CreateFileW(c_str(path_utf16),
+                             Mode == readonly ? GENERIC_READ
+                                              : GENERIC_READ | GENERIC_WRITE,
+                             Mode == readonly ? FILE_SHARE_READ
+                                              : 0,
+                             0,
+                             Mode == readonly ? OPEN_EXISTING
+                                              : OPEN_ALWAYS,
+                             Mode == readonly ? FILE_ATTRIBUTE_READONLY
+                                              : FILE_ATTRIBUTE_NORMAL,
+                             0);
+  if (FileHandle == INVALID_HANDLE_VALUE) {
+    ec = windows_error(::GetLastError());
+    return;
+  }
 
-    char *pathname = reinterpret_cast<char *>(_alloca(path.length()+3));
-    int lastchar = path.length() - 1 ;
-    path.copy(pathname, lastchar+1);
-
-    // Make path end with '/*'.
-    if (pathname[lastchar] != '/')
-      pathname[++lastchar] = '/';
-    pathname[lastchar+1] = '*';
-    pathname[lastchar+2] = 0;
-
-    if (remove_contents) {
-      WIN32_FIND_DATA fd;
-      HANDLE h = FindFirstFile(pathname, &fd);
-
-      // It's a bad idea to alter the contents of a directory while enumerating
-      // its contents. So build a list of its contents first, then destroy them.
-
-      if (h != INVALID_HANDLE_VALUE) {
-        std::vector<Path> list;
-
-        do {
-          if (strcmp(fd.cFileName, ".") == 0)
-            continue;
-          if (strcmp(fd.cFileName, "..") == 0)
-            continue;
-
-          Path aPath(path);
-          aPath.appendComponent(&fd.cFileName[0]);
-          list.push_back(aPath);
-        } while (FindNextFile(h, &fd));
-
-        DWORD err = GetLastError();
-        FindClose(h);
-        if (err != ERROR_NO_MORE_FILES) {
-          SetLastError(err);
-          return MakeErrMsg(ErrStr, path + ": Can't read directory: ");
-        }
-
-        for (std::vector<Path>::iterator I = list.begin(); I != list.end();
-             ++I) {
-          Path &aPath = *I;
-          aPath.eraseFromDisk(true);
-        }
-      } else {
-        if (GetLastError() != ERROR_FILE_NOT_FOUND)
-          return MakeErrMsg(ErrStr, path + ": Can't read directory: ");
-      }
-    }
+  FileDescriptor = 0;
+  ec = init(FileDescriptor, true, offset);
+  if (ec) {
+    Mapping = FileMappingHandle = 0;
+    FileHandle = INVALID_HANDLE_VALUE;
+    FileDescriptor = 0;
+  }
+}
 
-    pathname[lastchar] = 0;
-    if (!RemoveDirectory(pathname))
-      return MakeErrMsg(ErrStr,
-        std::string(pathname) + ": Can't destroy directory: ");
-    return false;
-  } else {
-    // Read-only files cannot be deleted on Windows.  Must remove the read-only
-    // attribute first.
-    if (fi.dwFileAttributes & FILE_ATTRIBUTE_READONLY) {
-      if (!SetFileAttributes(path.c_str(),
-                             fi.dwFileAttributes & ~FILE_ATTRIBUTE_READONLY))
-        return MakeErrMsg(ErrStr, path + ": Can't destroy file: ");
-    }
+mapped_file_region::mapped_file_region(int fd,
+                                       bool closefd,
+                                       mapmode mode,
+                                       uint64_t length,
+                                       uint64_t offset,
+                                       error_code &ec)
+  : Mode(mode)
+  , Size(length)
+  , Mapping()
+  , FileDescriptor(fd)
+  , FileHandle(INVALID_HANDLE_VALUE)
+  , FileMappingHandle() {
+  FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
+  if (FileHandle == INVALID_HANDLE_VALUE) {
+    if (closefd)
+      _close(FileDescriptor);
+    FileDescriptor = 0;
+    ec = make_error_code(errc::bad_file_descriptor);
+    return;
+  }
 
-    if (!DeleteFile(path.c_str()))
-      return MakeErrMsg(ErrStr, path + ": Can't destroy file: ");
-    return false;
+  ec = init(FileDescriptor, closefd, offset);
+  if (ec) {
+    Mapping = FileMappingHandle = 0;
+    FileHandle = INVALID_HANDLE_VALUE;
+    FileDescriptor = 0;
   }
 }
 
-bool Path::getMagicNumber(std::string& Magic, unsigned len) const {
-  assert(len < 1024 && "Request for magic string too long");
-  char* buf = reinterpret_cast<char*>(alloca(len));
+mapped_file_region::~mapped_file_region() {
+  if (Mapping)
+    ::UnmapViewOfFile(Mapping);
+}
+
+#if LLVM_HAS_RVALUE_REFERENCES
+mapped_file_region::mapped_file_region(mapped_file_region &&other)
+  : Mode(other.Mode)
+  , Size(other.Size)
+  , Mapping(other.Mapping)
+  , FileDescriptor(other.FileDescriptor)
+  , FileHandle(other.FileHandle)
+  , FileMappingHandle(other.FileMappingHandle) {
+  other.Mapping = other.FileMappingHandle = 0;
+  other.FileHandle = INVALID_HANDLE_VALUE;
+  other.FileDescriptor = 0;
+}
+#endif
 
-  HANDLE h = CreateFile(path.c_str(),
-                        GENERIC_READ,
-                        FILE_SHARE_READ,
-                        NULL,
-                        OPEN_EXISTING,
-                        FILE_ATTRIBUTE_NORMAL,
-                        NULL);
-  if (h == INVALID_HANDLE_VALUE)
-    return false;
+mapped_file_region::mapmode mapped_file_region::flags() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Mode;
+}
 
-  DWORD nRead = 0;
-  BOOL ret = ReadFile(h, buf, len, &nRead, NULL);
-  CloseHandle(h);
+uint64_t mapped_file_region::size() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return Size;
+}
 
-  if (!ret || nRead != len)
-    return false;
+char *mapped_file_region::data() const {
+  assert(Mode != readonly && "Cannot get non const data for readonly mapping!");
+  assert(Mapping && "Mapping failed but used anyway!");
+  return reinterpret_cast<char*>(Mapping);
+}
 
-  Magic = std::string(buf, len);
-  return true;
+const char *mapped_file_region::const_data() const {
+  assert(Mapping && "Mapping failed but used anyway!");
+  return reinterpret_cast<const char*>(Mapping);
 }
 
-bool
-Path::renamePathOnDisk(const Path& newName, std::string* ErrMsg) {
-  if (!MoveFileEx(path.c_str(), newName.c_str(), MOVEFILE_REPLACE_EXISTING))
-    return MakeErrMsg(ErrMsg, "Can't move '" + path + "' to '" + newName.path
-        + "': ");
-  return false;
+int mapped_file_region::alignment() {
+  SYSTEM_INFO SysInfo;
+  ::GetSystemInfo(&SysInfo);
+  return SysInfo.dwAllocationGranularity;
 }
 
-bool
-Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrMsg) const {
-  // FIXME: should work on directories also.
-  if (!si.isFile) {
-    return true;
-  }
+error_code detail::directory_iterator_construct(detail::DirIterState &it,
+                                                StringRef path){
+  SmallVector<wchar_t, 128> path_utf16;
 
-  HANDLE h = CreateFile(path.c_str(),
-                        FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES,
-                        FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
-                        NULL,
-                        OPEN_EXISTING,
-                        FILE_ATTRIBUTE_NORMAL,
-                        NULL);
-  if (h == INVALID_HANDLE_VALUE)
-    return true;
+  if (error_code ec = UTF8ToUTF16(path,
+                                  path_utf16))
+    return ec;
 
-  BY_HANDLE_FILE_INFORMATION bhfi;
-  if (!GetFileInformationByHandle(h, &bhfi)) {
-    DWORD err = GetLastError();
-    CloseHandle(h);
-    SetLastError(err);
-    return MakeErrMsg(ErrMsg, path + ": GetFileInformationByHandle: ");
-  }
-
-  ULARGE_INTEGER ui;
-  ui.QuadPart = si.modTime.toWin32Time();
-  FILETIME ft;
-  ft.dwLowDateTime = ui.LowPart;
-  ft.dwHighDateTime = ui.HighPart;
-  BOOL ret = SetFileTime(h, NULL, &ft, &ft);
-  DWORD err = GetLastError();
-  CloseHandle(h);
-  if (!ret) {
-    SetLastError(err);
-    return MakeErrMsg(ErrMsg, path + ": SetFileTime: ");
-  }
-
-  // Best we can do with Unix permission bits is to interpret the owner
-  // writable bit.
-  if (si.mode & 0200) {
-    if (bhfi.dwFileAttributes & FILE_ATTRIBUTE_READONLY) {
-      if (!SetFileAttributes(path.c_str(),
-              bhfi.dwFileAttributes & ~FILE_ATTRIBUTE_READONLY))
-        return MakeErrMsg(ErrMsg, path + ": SetFileAttributes: ");
-    }
+  // Convert path to the format that Windows is happy with.
+  if (path_utf16.size() > 0 &&
+      !is_separator(path_utf16[path.size() - 1]) &&
+      path_utf16[path.size() - 1] != L':') {
+    path_utf16.push_back(L'\\');
+    path_utf16.push_back(L'*');
   } else {
-    if (!(bhfi.dwFileAttributes & FILE_ATTRIBUTE_READONLY)) {
-      if (!SetFileAttributes(path.c_str(),
-              bhfi.dwFileAttributes | FILE_ATTRIBUTE_READONLY))
-        return MakeErrMsg(ErrMsg, path + ": SetFileAttributes: ");
-    }
+    path_utf16.push_back(L'*');
   }
 
-  return false;
+  //  Get the first directory entry.
+  WIN32_FIND_DATAW FirstFind;
+  ScopedFindHandle FindHandle(::FindFirstFileW(c_str(path_utf16), &FirstFind));
+  if (!FindHandle)
+    return windows_error(::GetLastError());
+
+  size_t FilenameLen = ::wcslen(FirstFind.cFileName);
+  while ((FilenameLen == 1 && FirstFind.cFileName[0] == L'.') ||
+         (FilenameLen == 2 && FirstFind.cFileName[0] == L'.' &&
+                              FirstFind.cFileName[1] == L'.'))
+    if (!::FindNextFileW(FindHandle, &FirstFind)) {
+      error_code ec = windows_error(::GetLastError());
+      // Check for end.
+      if (ec == windows_error::no_more_files)
+        return detail::directory_iterator_destruct(it);
+      return ec;
+    } else
+      FilenameLen = ::wcslen(FirstFind.cFileName);
+
+  // Construct the current directory entry.
+  SmallString<128> directory_entry_name_utf8;
+  if (error_code ec = UTF16ToUTF8(FirstFind.cFileName,
+                                  ::wcslen(FirstFind.cFileName),
+                                  directory_entry_name_utf8))
+    return ec;
+
+  it.IterationHandle = intptr_t(FindHandle.take());
+  SmallString<128> directory_entry_path(path);
+  path::append(directory_entry_path, directory_entry_name_utf8.str());
+  it.CurrentEntry = directory_entry(directory_entry_path.str());
+
+  return error_code::success();
 }
 
-bool
-CopyFile(const sys::Path &Dest, const sys::Path &Src, std::string* ErrMsg) {
-  // Can't use CopyFile macro defined in Windows.h because it would mess up the
-  // above line.  We use the expansion it would have in a non-UNICODE build.
-  if (!::CopyFileA(Src.c_str(), Dest.c_str(), false))
-    return MakeErrMsg(ErrMsg, "Can't copy '" + Src.str() +
-               "' to '" + Dest.str() + "': ");
-  return false;
+error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
+  if (it.IterationHandle != 0)
+    // Closes the handle if it's valid.
+    ScopedFindHandle close(HANDLE(it.IterationHandle));
+  it.IterationHandle = 0;
+  it.CurrentEntry = directory_entry();
+  return error_code::success();
 }
 
-bool
-Path::makeUnique(bool reuse_current, std::string* ErrMsg) {
-  bool Exists;
-  if (reuse_current && (fs::exists(path, Exists) || !Exists))
-    return false; // File doesn't exist already, just use it!
-
-  // Reserve space for -XXXXXX at the end.
-  char *FNBuffer = (char*) alloca(path.size()+8);
-  unsigned offset = path.size();
-  path.copy(FNBuffer, offset);
-
-  // Find a numeric suffix that isn't used by an existing file.  Assume there
-  // won't be more than 1 million files with the same prefix.  Probably a safe
-  // bet.
-  static int FCounter = -1;
-  if (FCounter < 0) {
-    // Give arbitrary initial seed.
-    // FIXME: We should use sys::fs::unique_file() in future.
-    LARGE_INTEGER cnt64;
-    DWORD x = GetCurrentProcessId();
-    x = (x << 16) | (x >> 16);
-    if (QueryPerformanceCounter(&cnt64))    // RDTSC
-      x ^= cnt64.HighPart ^ cnt64.LowPart;
-    FCounter = x % 1000000;
+error_code detail::directory_iterator_increment(detail::DirIterState &it) {
+  WIN32_FIND_DATAW FindData;
+  if (!::FindNextFileW(HANDLE(it.IterationHandle), &FindData)) {
+    error_code ec = windows_error(::GetLastError());
+    // Check for end.
+    if (ec == windows_error::no_more_files)
+      return detail::directory_iterator_destruct(it);
+    return ec;
   }
-  do {
-    sprintf(FNBuffer+offset, "-%06u", FCounter);
-    if (++FCounter > 999999)
-      FCounter = 0;
-    path = FNBuffer;
-  } while (!fs::exists(path, Exists) && Exists);
-  return false;
-}
 
-bool
-Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) {
-  // Make this into a unique file name
-  makeUnique(reuse_current, ErrMsg);
+  size_t FilenameLen = ::wcslen(FindData.cFileName);
+  if ((FilenameLen == 1 && FindData.cFileName[0] == L'.') ||
+      (FilenameLen == 2 && FindData.cFileName[0] == L'.' &&
+                           FindData.cFileName[1] == L'.'))
+    return directory_iterator_increment(it);
 
-  // Now go and create it
-  HANDLE h = CreateFile(path.c_str(), GENERIC_WRITE, 0, NULL, CREATE_NEW,
-                        FILE_ATTRIBUTE_NORMAL, NULL);
-  if (h == INVALID_HANDLE_VALUE)
-    return MakeErrMsg(ErrMsg, path + ": can't create file");
+  SmallString<128> directory_entry_path_utf8;
+  if (error_code ec = UTF16ToUTF8(FindData.cFileName,
+                                  ::wcslen(FindData.cFileName),
+                                  directory_entry_path_utf8))
+    return ec;
 
-  CloseHandle(h);
-  return false;
+  it.CurrentEntry.replace_filename(Twine(directory_entry_path_utf8));
+  return error_code::success();
 }
 
-/// MapInFilePages - Not yet implemented on win32.
-const char *Path::MapInFilePages(int FD, size_t FileSize, off_t Offset) {
-  return 0;
+error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
+                                            bool map_writable, void *&result) {
+  assert(0 && "NOT IMPLEMENTED");
+  return windows_error::invalid_function;
 }
 
-/// MapInFilePages - Not yet implemented on win32.
-void Path::UnMapFilePages(const char *Base, size_t FileSize) {
+error_code unmap_file_pages(void *base, size_t size) {
   assert(0 && "NOT IMPLEMENTED");
+  return windows_error::invalid_function;
+}
+
+error_code openFileForRead(const Twine &Name, int &ResultFD) {
+  SmallString<128> PathStorage;
+  SmallVector<wchar_t, 128> PathUTF16;
+
+  if (error_code EC = UTF8ToUTF16(Name.toStringRef(PathStorage),
+                                  PathUTF16))
+    return EC;
+
+  HANDLE H = ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
+                           FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+                           OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+  if (H == INVALID_HANDLE_VALUE) {
+    error_code EC = windows_error(::GetLastError());
+    // Provide a better error message when trying to open directories.
+    // This only runs if we failed to open the file, so there is probably
+    // no performances issues.
+    if (EC != windows_error::access_denied)
+      return EC;
+    if (is_directory(Name))
+      return error_code(errc::is_a_directory, posix_category());
+    return EC;
+  }
+
+  int FD = ::_open_osfhandle(intptr_t(H), 0);
+  if (FD == -1) {
+    ::CloseHandle(H);
+    return windows_error::invalid_handle;
+  }
+
+  ResultFD = FD;
+  return error_code::success();
+}
+
+error_code openFileForWrite(const Twine &Name, int &ResultFD,
+                            sys::fs::OpenFlags Flags, unsigned Mode) {
+  // Verify that we don't have both "append" and "excl".
+  assert((!(Flags & sys::fs::F_Excl) || !(Flags & sys::fs::F_Append)) &&
+         "Cannot specify both 'excl' and 'append' file creation flags!");
+
+  SmallString<128> PathStorage;
+  SmallVector<wchar_t, 128> PathUTF16;
+
+  if (error_code EC = UTF8ToUTF16(Name.toStringRef(PathStorage),
+                                  PathUTF16))
+    return EC;
+
+  DWORD CreationDisposition;
+  if (Flags & F_Excl)
+    CreationDisposition = CREATE_NEW;
+  else if (Flags & F_Append)
+    CreationDisposition = OPEN_ALWAYS;
+  else
+    CreationDisposition = CREATE_ALWAYS;
+
+  HANDLE H = ::CreateFileW(PathUTF16.begin(), GENERIC_WRITE,
+                           FILE_SHARE_READ | FILE_SHARE_WRITE, NULL,
+                           CreationDisposition, FILE_ATTRIBUTE_NORMAL, NULL);
+
+  if (H == INVALID_HANDLE_VALUE) {
+    error_code EC = windows_error(::GetLastError());
+    // Provide a better error message when trying to open directories.
+    // This only runs if we failed to open the file, so there is probably
+    // no performances issues.
+    if (EC != windows_error::access_denied)
+      return EC;
+    if (is_directory(Name))
+      return error_code(errc::is_a_directory, posix_category());
+    return EC;
+  }
+
+  int OpenFlags = 0;
+  if (Flags & F_Append)
+    OpenFlags |= _O_APPEND;
+
+  if (!(Flags & F_Binary))
+    OpenFlags |= _O_TEXT;
+
+  int FD = ::_open_osfhandle(intptr_t(H), OpenFlags);
+  if (FD == -1) {
+    ::CloseHandle(H);
+    return windows_error::invalid_handle;
+  }
+
+  ResultFD = FD;
+  return error_code::success();
 }
+} // end namespace fs
 
+namespace windows {
+llvm::error_code UTF8ToUTF16(llvm::StringRef utf8,
+                             llvm::SmallVectorImpl<wchar_t> &utf16) {
+  int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+                                  utf8.begin(), utf8.size(),
+                                  utf16.begin(), 0);
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  utf16.reserve(len + 1);
+  utf16.set_size(len);
+
+  len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
+                              utf8.begin(), utf8.size(),
+                              utf16.begin(), utf16.size());
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  // Make utf16 null terminated.
+  utf16.push_back(0);
+  utf16.pop_back();
+
+  return llvm::error_code::success();
 }
+
+llvm::error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+                             llvm::SmallVectorImpl<char> &utf8) {
+  // Get length.
+  int len = ::WideCharToMultiByte(CP_UTF8, 0,
+                                  utf16, utf16_len,
+                                  utf8.begin(), 0,
+                                  NULL, NULL);
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  utf8.reserve(len);
+  utf8.set_size(len);
+
+  // Now do the actual conversion.
+  len = ::WideCharToMultiByte(CP_UTF8, 0,
+                              utf16, utf16_len,
+                              utf8.data(), utf8.size(),
+                              NULL, NULL);
+
+  if (len == 0)
+    return llvm::windows_error(::GetLastError());
+
+  // Make utf8 null terminated.
+  utf8.push_back(0);
+  utf8.pop_back();
+
+  return llvm::error_code::success();
 }
+} // end namespace windows
+} // end namespace sys
+} // end namespace llvm
diff --git a/lib/Support/Windows/PathV2.inc b/lib/Support/Windows/PathV2.inc
deleted file mode 100644
index 23f3d14..0000000
--- a/lib/Support/Windows/PathV2.inc
+++ /dev/null
@@ -1,1022 +0,0 @@
-//===- llvm/Support/Windows/PathV2.inc - Windows Path Impl ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Windows specific implementation of the PathV2 API.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only generic Windows code that
-//===          is guaranteed to work on *all* Windows variants.
-//===----------------------------------------------------------------------===//
-
-#include "Windows.h"
-#include <fcntl.h>
-#include <io.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#undef max
-
-// MinGW doesn't define this.
-#ifndef _ERRNO_T_DEFINED
-#define _ERRNO_T_DEFINED
-typedef int errno_t;
-#endif
-
-using namespace llvm;
-
-namespace {
-  typedef BOOLEAN (WINAPI *PtrCreateSymbolicLinkW)(
-    /*__in*/ LPCWSTR lpSymlinkFileName,
-    /*__in*/ LPCWSTR lpTargetFileName,
-    /*__in*/ DWORD dwFlags);
-
-  PtrCreateSymbolicLinkW create_symbolic_link_api = PtrCreateSymbolicLinkW(
-    ::GetProcAddress(::GetModuleHandleA("kernel32.dll"),
-                     "CreateSymbolicLinkW"));
-
-  error_code UTF8ToUTF16(StringRef utf8, SmallVectorImpl<wchar_t> &utf16) {
-    int len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
-                                    utf8.begin(), utf8.size(),
-                                    utf16.begin(), 0);
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    utf16.reserve(len + 1);
-    utf16.set_size(len);
-
-    len = ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS,
-                                    utf8.begin(), utf8.size(),
-                                    utf16.begin(), utf16.size());
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    // Make utf16 null terminated.
-    utf16.push_back(0);
-    utf16.pop_back();
-
-    return error_code::success();
-  }
-
-  error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
-                               SmallVectorImpl<char> &utf8) {
-    // Get length.
-    int len = ::WideCharToMultiByte(CP_UTF8, 0,
-                                    utf16, utf16_len,
-                                    utf8.begin(), 0,
-                                    NULL, NULL);
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    utf8.reserve(len);
-    utf8.set_size(len);
-
-    // Now do the actual conversion.
-    len = ::WideCharToMultiByte(CP_UTF8, 0,
-                                utf16, utf16_len,
-                                utf8.data(), utf8.size(),
-                                NULL, NULL);
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    // Make utf8 null terminated.
-    utf8.push_back(0);
-    utf8.pop_back();
-
-    return error_code::success();
-  }
-
-  error_code TempDir(SmallVectorImpl<wchar_t> &result) {
-  retry_temp_dir:
-    DWORD len = ::GetTempPathW(result.capacity(), result.begin());
-
-    if (len == 0)
-      return windows_error(::GetLastError());
-
-    if (len > result.capacity()) {
-      result.reserve(len);
-      goto retry_temp_dir;
-    }
-
-    result.set_size(len);
-    return error_code::success();
-  }
-
-  bool is_separator(const wchar_t value) {
-    switch (value) {
-    case L'\\':
-    case L'/':
-      return true;
-    default:
-      return false;
-    }
-  }
-}
-
-namespace llvm {
-namespace sys  {
-namespace fs {
-
-error_code current_path(SmallVectorImpl<char> &result) {
-  SmallVector<wchar_t, 128> cur_path;
-  cur_path.reserve(128);
-retry_cur_dir:
-  DWORD len = ::GetCurrentDirectoryW(cur_path.capacity(), cur_path.data());
-
-  // A zero return value indicates a failure other than insufficient space.
-  if (len == 0)
-    return windows_error(::GetLastError());
-
-  // If there's insufficient space, the len returned is larger than the len
-  // given.
-  if (len > cur_path.capacity()) {
-    cur_path.reserve(len);
-    goto retry_cur_dir;
-  }
-
-  cur_path.set_size(len);
-  // cur_path now holds the current directory in utf-16. Convert to utf-8.
-
-  // Find out how much space we need. Sadly, this function doesn't return the
-  // size needed unless you tell it the result size is 0, which means you
-  // _always_ have to call it twice.
-  len = ::WideCharToMultiByte(CP_UTF8, 0,
-                              cur_path.data(), cur_path.size(),
-                              result.data(), 0,
-                              NULL, NULL);
-
-  if (len == 0)
-    return make_error_code(windows_error(::GetLastError()));
-
-  result.reserve(len);
-  result.set_size(len);
-  // Now do the actual conversion.
-  len = ::WideCharToMultiByte(CP_UTF8, 0,
-                              cur_path.data(), cur_path.size(),
-                              result.data(), result.size(),
-                              NULL, NULL);
-  if (len == 0)
-    return windows_error(::GetLastError());
-
-  return error_code::success();
-}
-
-error_code copy_file(const Twine &from, const Twine &to, copy_option copt) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toStringRef(from_storage);
-  StringRef t = to.toStringRef(to_storage);
-
-  // Convert to utf-16.
-  SmallVector<wchar_t, 128> wide_from;
-  SmallVector<wchar_t, 128> wide_to;
-  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
-  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
-
-  // Copy the file.
-  BOOL res = ::CopyFileW(wide_from.begin(), wide_to.begin(),
-                         copt != copy_option::overwrite_if_exists);
-
-  if (res == 0)
-    return windows_error(::GetLastError());
-
-  return error_code::success();
-}
-
-error_code create_directory(const Twine &path, bool &existed) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
-                                  path_utf16))
-    return ec;
-
-  if (!::CreateDirectoryW(path_utf16.begin(), NULL)) {
-    error_code ec = windows_error(::GetLastError());
-    if (ec == windows_error::already_exists)
-      existed = true;
-    else
-      return ec;
-  } else
-    existed = false;
-
-  return error_code::success();
-}
-
-error_code create_hard_link(const Twine &to, const Twine &from) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toStringRef(from_storage);
-  StringRef t = to.toStringRef(to_storage);
-
-  // Convert to utf-16.
-  SmallVector<wchar_t, 128> wide_from;
-  SmallVector<wchar_t, 128> wide_to;
-  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
-  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
-
-  if (!::CreateHardLinkW(wide_from.begin(), wide_to.begin(), NULL))
-    return windows_error(::GetLastError());
-
-  return error_code::success();
-}
-
-error_code create_symlink(const Twine &to, const Twine &from) {
-  // Only do it if the function is available at runtime.
-  if (!create_symbolic_link_api)
-    return make_error_code(errc::function_not_supported);
-
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toStringRef(from_storage);
-  StringRef t = to.toStringRef(to_storage);
-
-  // Convert to utf-16.
-  SmallVector<wchar_t, 128> wide_from;
-  SmallVector<wchar_t, 128> wide_to;
-  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
-  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
-
-  if (!create_symbolic_link_api(wide_from.begin(), wide_to.begin(), 0))
-    return windows_error(::GetLastError());
-
-  return error_code::success();
-}
-
-error_code remove(const Twine &path, bool &existed) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  file_status st;
-  if (error_code ec = status(path, st))
-    return ec;
-
-  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
-                                  path_utf16))
-    return ec;
-
-  if (st.type() == file_type::directory_file) {
-    if (!::RemoveDirectoryW(c_str(path_utf16))) {
-      error_code ec = windows_error(::GetLastError());
-      if (ec != windows_error::file_not_found)
-        return ec;
-      existed = false;
-    } else
-      existed = true;
-  } else {
-    if (!::DeleteFileW(c_str(path_utf16))) {
-      error_code ec = windows_error(::GetLastError());
-      if (ec != windows_error::file_not_found)
-        return ec;
-      existed = false;
-    } else
-      existed = true;
-  }
-
-  return error_code::success();
-}
-
-error_code rename(const Twine &from, const Twine &to) {
-  // Get arguments.
-  SmallString<128> from_storage;
-  SmallString<128> to_storage;
-  StringRef f = from.toStringRef(from_storage);
-  StringRef t = to.toStringRef(to_storage);
-
-  // Convert to utf-16.
-  SmallVector<wchar_t, 128> wide_from;
-  SmallVector<wchar_t, 128> wide_to;
-  if (error_code ec = UTF8ToUTF16(f, wide_from)) return ec;
-  if (error_code ec = UTF8ToUTF16(t, wide_to)) return ec;
-
-  error_code ec = error_code::success();
-  for (int i = 0; i < 2000; i++) {
-    if (::MoveFileExW(wide_from.begin(), wide_to.begin(),
-                      MOVEFILE_COPY_ALLOWED | MOVEFILE_REPLACE_EXISTING))
-      return error_code::success();
-    ec = windows_error(::GetLastError());
-    if (ec != windows_error::access_denied)
-      break;
-    // Retry MoveFile() at ACCESS_DENIED.
-    // System scanners (eg. indexer) might open the source file when
-    // It is written and closed.
-    ::Sleep(1);
-  }
-
-  return ec;
-}
-
-error_code resize_file(const Twine &path, uint64_t size) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
-                                  path_utf16))
-    return ec;
-
-  int fd = ::_wopen(path_utf16.begin(), O_BINARY | _O_RDWR, S_IWRITE);
-  if (fd == -1)
-    return error_code(errno, generic_category());
-#ifdef HAVE__CHSIZE_S
-  errno_t error = ::_chsize_s(fd, size);
-#else
-  errno_t error = ::_chsize(fd, size);
-#endif
-  ::close(fd);
-  return error_code(error, generic_category());
-}
-
-error_code exists(const Twine &path, bool &result) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
-                                  path_utf16))
-    return ec;
-
-  DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
-
-  if (attributes == INVALID_FILE_ATTRIBUTES) {
-    // See if the file didn't actually exist.
-    error_code ec = make_error_code(windows_error(::GetLastError()));
-    if (ec != windows_error::file_not_found &&
-        ec != windows_error::path_not_found)
-      return ec;
-    result = false;
-  } else
-    result = true;
-  return error_code::success();
-}
-
-bool equivalent(file_status A, file_status B) {
-  assert(status_known(A) && status_known(B));
-  return A.FileIndexHigh      == B.FileIndexHigh &&
-         A.FileIndexLow       == B.FileIndexLow &&
-         A.FileSizeHigh       == B.FileSizeHigh &&
-         A.FileSizeLow        == B.FileSizeLow &&
-         A.LastWriteTimeHigh  == B.LastWriteTimeHigh &&
-         A.LastWriteTimeLow   == B.LastWriteTimeLow &&
-         A.VolumeSerialNumber == B.VolumeSerialNumber;
-}
-
-error_code equivalent(const Twine &A, const Twine &B, bool &result) {
-  file_status fsA, fsB;
-  if (error_code ec = status(A, fsA)) return ec;
-  if (error_code ec = status(B, fsB)) return ec;
-  result = equivalent(fsA, fsB);
-  return error_code::success();
-}
-
-error_code file_size(const Twine &path, uint64_t &result) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
-                                  path_utf16))
-    return ec;
-
-  WIN32_FILE_ATTRIBUTE_DATA FileData;
-  if (!::GetFileAttributesExW(path_utf16.begin(),
-                              ::GetFileExInfoStandard,
-                              &FileData))
-    return windows_error(::GetLastError());
-
-  result =
-    (uint64_t(FileData.nFileSizeHigh) << (sizeof(FileData.nFileSizeLow) * 8))
-    + FileData.nFileSizeLow;
-
-  return error_code::success();
-}
-
-static bool isReservedName(StringRef path) {
-  // This list of reserved names comes from MSDN, at:
-  // http://msdn.microsoft.com/en-us/library/aa365247%28v=vs.85%29.aspx
-  static const char *sReservedNames[] = { "nul", "con", "prn", "aux",
-                              "com1", "com2", "com3", "com4", "com5", "com6",
-                              "com7", "com8", "com9", "lpt1", "lpt2", "lpt3",
-                              "lpt4", "lpt5", "lpt6", "lpt7", "lpt8", "lpt9" };
-
-  // First, check to see if this is a device namespace, which always
-  // starts with \\.\, since device namespaces are not legal file paths.
-  if (path.startswith("\\\\.\\"))
-    return true;
-
-  // Then compare against the list of ancient reserved names
-  for (size_t i = 0; i < sizeof(sReservedNames) / sizeof(const char *); ++i) {
-    if (path.equals_lower(sReservedNames[i]))
-      return true;
-  }
-
-  // The path isn't what we consider reserved.
-  return false;
-}
-
-error_code status(const Twine &path, file_status &result) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  StringRef path8 = path.toStringRef(path_storage);
-  if (isReservedName(path8)) {
-    result = file_status(file_type::character_file);
-    return error_code::success();
-  }
-
-  if (error_code ec = UTF8ToUTF16(path8, path_utf16))
-    return ec;
-
-  DWORD attr = ::GetFileAttributesW(path_utf16.begin());
-  if (attr == INVALID_FILE_ATTRIBUTES)
-    goto handle_status_error;
-
-  // Handle reparse points.
-  if (attr & FILE_ATTRIBUTE_REPARSE_POINT) {
-    ScopedFileHandle h(
-      ::CreateFileW(path_utf16.begin(),
-                    0, // Attributes only.
-                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
-                    NULL,
-                    OPEN_EXISTING,
-                    FILE_FLAG_BACKUP_SEMANTICS,
-                    0));
-    if (!h)
-      goto handle_status_error;
-  }
-
-  if (attr & FILE_ATTRIBUTE_DIRECTORY)
-    result = file_status(file_type::directory_file);
-  else {
-    result = file_status(file_type::regular_file);
-    ScopedFileHandle h(
-      ::CreateFileW(path_utf16.begin(),
-                    0, // Attributes only.
-                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
-                    NULL,
-                    OPEN_EXISTING,
-                    FILE_FLAG_BACKUP_SEMANTICS,
-                    0));
-    if (!h)
-      goto handle_status_error;
-    BY_HANDLE_FILE_INFORMATION Info;
-    if (!::GetFileInformationByHandle(h, &Info))
-      goto handle_status_error;
-    result.FileIndexHigh      = Info.nFileIndexHigh;
-    result.FileIndexLow       = Info.nFileIndexLow;
-    result.FileSizeHigh       = Info.nFileSizeHigh;
-    result.FileSizeLow        = Info.nFileSizeLow;
-    result.LastWriteTimeHigh  = Info.ftLastWriteTime.dwHighDateTime;
-    result.LastWriteTimeLow   = Info.ftLastWriteTime.dwLowDateTime;
-    result.VolumeSerialNumber = Info.dwVolumeSerialNumber;
-  }
-
-  return error_code::success();
-
-handle_status_error:
-  error_code ec = windows_error(::GetLastError());
-  if (ec == windows_error::file_not_found ||
-      ec == windows_error::path_not_found)
-    result = file_status(file_type::file_not_found);
-  else if (ec == windows_error::sharing_violation)
-    result = file_status(file_type::type_unknown);
-  else {
-    result = file_status(file_type::status_error);
-    return ec;
-  }
-
-  return error_code::success();
-}
-
-
-// Modifies permissions on a file.
-error_code permissions(const Twine &path, perms prms) {
-#if 0 // verify code below before enabling:
-  // If the permissions bits are not trying to modify
-  // "write" permissions, there is nothing to do.
-  if (!(prms & (owner_write|group_write|others_write)))
-    return error_code::success();
-  
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
-                                  path_utf16))
-    return ec;
-
-  DWORD attributes = ::GetFileAttributesW(path_utf16.begin());
-
-  if (prms & add_perms) {
-    attributes &= ~FILE_ATTRIBUTE_READONLY;
-  }
-  else if (prms & remove_perms) {
-    attributes |= FILE_ATTRIBUTE_READONLY;
-  }
-  else {
-    assert(0 && "neither add_perms or remove_perms is set");
-  }
-
-  if ( ! ::SetFileAttributesW(path_utf16.begin(), attributes))
-    return windows_error(::GetLastError());
-#endif    
-  return error_code::success();
-}
-
-
-// FIXME: mode should be used here and default to user r/w only,
-// it currently comes in as a UNIX mode.
-error_code unique_file(const Twine &model, int &result_fd,
-                       SmallVectorImpl<char> &result_path,
-                       bool makeAbsolute, unsigned mode) {
-  // Use result_path as temp storage.
-  result_path.set_size(0);
-  StringRef m = model.toStringRef(result_path);
-
-  SmallVector<wchar_t, 128> model_utf16;
-  if (error_code ec = UTF8ToUTF16(m, model_utf16)) return ec;
-
-  if (makeAbsolute) {
-    // Make model absolute by prepending a temp directory if it's not already.
-    bool absolute = path::is_absolute(m);
-
-    if (!absolute) {
-      SmallVector<wchar_t, 64> temp_dir;
-      if (error_code ec = TempDir(temp_dir)) return ec;
-      // Handle c: by removing it.
-      if (model_utf16.size() > 2 && model_utf16[1] == L':') {
-        model_utf16.erase(model_utf16.begin(), model_utf16.begin() + 2);
-      }
-      model_utf16.insert(model_utf16.begin(), temp_dir.begin(), temp_dir.end());
-    }
-  }
-
-  // Replace '%' with random chars. From here on, DO NOT modify model. It may be
-  // needed if the randomly chosen path already exists.
-  SmallVector<wchar_t, 128> random_path_utf16;
-
-  // Get a Crypto Provider for CryptGenRandom.
-  HCRYPTPROV HCPC;
-  if (!::CryptAcquireContextW(&HCPC,
-                              NULL,
-                              NULL,
-                              PROV_RSA_FULL,
-                              CRYPT_VERIFYCONTEXT))
-    return windows_error(::GetLastError());
-  ScopedCryptContext CryptoProvider(HCPC);
-
-retry_random_path:
-  random_path_utf16.set_size(0);
-  for (SmallVectorImpl<wchar_t>::const_iterator i = model_utf16.begin(),
-                                                e = model_utf16.end();
-                                                i != e; ++i) {
-    if (*i == L'%') {
-      BYTE val = 0;
-      if (!::CryptGenRandom(CryptoProvider, 1, &val))
-          return windows_error(::GetLastError());
-      random_path_utf16.push_back("0123456789abcdef"[val & 15]);
-    }
-    else
-      random_path_utf16.push_back(*i);
-  }
-  // Make random_path_utf16 null terminated.
-  random_path_utf16.push_back(0);
-  random_path_utf16.pop_back();
-
-  // Make sure we don't fall into an infinite loop by constantly trying
-  // to create the parent path.
-  bool TriedToCreateParent = false;
-
-  // Try to create + open the path.
-retry_create_file:
-  HANDLE TempFileHandle = ::CreateFileW(random_path_utf16.begin(),
-                                        GENERIC_READ | GENERIC_WRITE,
-                                        FILE_SHARE_READ,
-                                        NULL,
-                                        // Return ERROR_FILE_EXISTS if the file
-                                        // already exists.
-                                        CREATE_NEW,
-                                        FILE_ATTRIBUTE_TEMPORARY,
-                                        NULL);
-  if (TempFileHandle == INVALID_HANDLE_VALUE) {
-    // If the file existed, try again, otherwise, error.
-    error_code ec = windows_error(::GetLastError());
-    if (ec == windows_error::file_exists)
-      goto retry_random_path;
-    // Check for non-existing parent directories.
-    if (ec == windows_error::path_not_found && !TriedToCreateParent) {
-      TriedToCreateParent = true;
-
-      // Create the directories using result_path as temp storage.
-      if (error_code ec = UTF16ToUTF8(random_path_utf16.begin(),
-                                      random_path_utf16.size(), result_path))
-        return ec;
-      StringRef p(result_path.begin(), result_path.size());
-      SmallString<64> dir_to_create;
-      for (path::const_iterator i = path::begin(p),
-                                e = --path::end(p); i != e; ++i) {
-        path::append(dir_to_create, *i);
-        bool Exists;
-        if (error_code ec = exists(Twine(dir_to_create), Exists)) return ec;
-        if (!Exists) {
-          // If c: doesn't exist, bail.
-          if (i->endswith(":"))
-            return ec;
-
-          SmallVector<wchar_t, 64> dir_to_create_utf16;
-          if (error_code ec = UTF8ToUTF16(dir_to_create, dir_to_create_utf16))
-            return ec;
-
-          // Create the directory.
-          if (!::CreateDirectoryW(dir_to_create_utf16.begin(), NULL))
-            return windows_error(::GetLastError());
-        }
-      }
-      goto retry_create_file;
-    }
-    return ec;
-  }
-
-  // Set result_path to the utf-8 representation of the path.
-  if (error_code ec = UTF16ToUTF8(random_path_utf16.begin(),
-                                  random_path_utf16.size(), result_path)) {
-    ::CloseHandle(TempFileHandle);
-    ::DeleteFileW(random_path_utf16.begin());
-    return ec;
-  }
-
-  // Convert the Windows API file handle into a C-runtime handle.
-  int fd = ::_open_osfhandle(intptr_t(TempFileHandle), 0);
-  if (fd == -1) {
-    ::CloseHandle(TempFileHandle);
-    ::DeleteFileW(random_path_utf16.begin());
-    // MSDN doesn't say anything about _open_osfhandle setting errno or
-    // GetLastError(), so just return invalid_handle.
-    return windows_error::invalid_handle;
-  }
-
-  result_fd = fd;
-  return error_code::success();
-}
-
-error_code get_magic(const Twine &path, uint32_t len,
-                     SmallVectorImpl<char> &result) {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-  result.set_size(0);
-
-  // Convert path to UTF-16.
-  if (error_code ec = UTF8ToUTF16(path.toStringRef(path_storage),
-                                  path_utf16))
-    return ec;
-
-  // Open file.
-  HANDLE file = ::CreateFileW(c_str(path_utf16),
-                              GENERIC_READ,
-                              FILE_SHARE_READ,
-                              NULL,
-                              OPEN_EXISTING,
-                              FILE_ATTRIBUTE_READONLY,
-                              NULL);
-  if (file == INVALID_HANDLE_VALUE)
-    return windows_error(::GetLastError());
-
-  // Allocate buffer.
-  result.reserve(len);
-
-  // Get magic!
-  DWORD bytes_read = 0;
-  BOOL read_success = ::ReadFile(file, result.data(), len, &bytes_read, NULL);
-  error_code ec = windows_error(::GetLastError());
-  ::CloseHandle(file);
-  if (!read_success || (bytes_read != len)) {
-    // Set result size to the number of bytes read if it's valid.
-    if (bytes_read <= len)
-      result.set_size(bytes_read);
-    // ERROR_HANDLE_EOF is mapped to errc::value_too_large.
-    return ec;
-  }
-
-  result.set_size(len);
-  return error_code::success();
-}
-
-error_code mapped_file_region::init(int FD, bool CloseFD, uint64_t Offset) {
-  FileDescriptor = FD;
-  // Make sure that the requested size fits within SIZE_T.
-  if (Size > std::numeric_limits<SIZE_T>::max()) {
-    if (FileDescriptor) {
-      if (CloseFD)
-        _close(FileDescriptor);
-    } else
-      ::CloseHandle(FileHandle);
-    return make_error_code(errc::invalid_argument);
-  }
-
-  DWORD flprotect;
-  switch (Mode) {
-  case readonly:  flprotect = PAGE_READONLY; break;
-  case readwrite: flprotect = PAGE_READWRITE; break;
-  case priv:      flprotect = PAGE_WRITECOPY; break;
-  default: llvm_unreachable("invalid mapping mode");
-  }
-
-  FileMappingHandle = ::CreateFileMapping(FileHandle,
-                                          0,
-                                          flprotect,
-                                          Size >> 32,
-                                          Size & 0xffffffff,
-                                          0);
-  if (FileMappingHandle == NULL) {
-    error_code ec = windows_error(GetLastError());
-    if (FileDescriptor) {
-      if (CloseFD)
-        _close(FileDescriptor);
-    } else
-      ::CloseHandle(FileHandle);
-    return ec;
-  }
-
-  DWORD dwDesiredAccess;
-  switch (Mode) {
-  case readonly:  dwDesiredAccess = FILE_MAP_READ; break;
-  case readwrite: dwDesiredAccess = FILE_MAP_WRITE; break;
-  case priv:      dwDesiredAccess = FILE_MAP_COPY; break;
-  default: llvm_unreachable("invalid mapping mode");
-  }
-  Mapping = ::MapViewOfFile(FileMappingHandle,
-                            dwDesiredAccess,
-                            Offset >> 32,
-                            Offset & 0xffffffff,
-                            Size);
-  if (Mapping == NULL) {
-    error_code ec = windows_error(GetLastError());
-    ::CloseHandle(FileMappingHandle);
-    if (FileDescriptor) {
-      if (CloseFD)
-        _close(FileDescriptor);
-    } else
-      ::CloseHandle(FileHandle);
-    return ec;
-  }
-
-  if (Size == 0) {
-    MEMORY_BASIC_INFORMATION mbi;
-    SIZE_T Result = VirtualQuery(Mapping, &mbi, sizeof(mbi));
-    if (Result == 0) {
-      error_code ec = windows_error(GetLastError());
-      ::UnmapViewOfFile(Mapping);
-      ::CloseHandle(FileMappingHandle);
-      if (FileDescriptor) {
-        if (CloseFD)
-          _close(FileDescriptor);
-      } else
-        ::CloseHandle(FileHandle);
-      return ec;
-    }
-    Size = mbi.RegionSize;
-  }
-
-  // Close all the handles except for the view. It will keep the other handles
-  // alive.
-  ::CloseHandle(FileMappingHandle);
-  if (FileDescriptor) {
-    if (CloseFD)
-      _close(FileDescriptor); // Also closes FileHandle.
-  } else
-    ::CloseHandle(FileHandle);
-  return error_code::success();
-}
-
-mapped_file_region::mapped_file_region(const Twine &path,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       error_code &ec) 
-  : Mode(mode)
-  , Size(length)
-  , Mapping()
-  , FileDescriptor()
-  , FileHandle(INVALID_HANDLE_VALUE)
-  , FileMappingHandle() {
-  SmallString<128> path_storage;
-  SmallVector<wchar_t, 128> path_utf16;
-
-  // Convert path to UTF-16.
-  if ((ec = UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)))
-    return;
-
-  // Get file handle for creating a file mapping.
-  FileHandle = ::CreateFileW(c_str(path_utf16),
-                             Mode == readonly ? GENERIC_READ
-                                              : GENERIC_READ | GENERIC_WRITE,
-                             Mode == readonly ? FILE_SHARE_READ
-                                              : 0,
-                             0,
-                             Mode == readonly ? OPEN_EXISTING
-                                              : OPEN_ALWAYS,
-                             Mode == readonly ? FILE_ATTRIBUTE_READONLY
-                                              : FILE_ATTRIBUTE_NORMAL,
-                             0);
-  if (FileHandle == INVALID_HANDLE_VALUE) {
-    ec = windows_error(::GetLastError());
-    return;
-  }
-
-  FileDescriptor = 0;
-  ec = init(FileDescriptor, true, offset);
-  if (ec) {
-    Mapping = FileMappingHandle = 0;
-    FileHandle = INVALID_HANDLE_VALUE;
-    FileDescriptor = 0;
-  }
-}
-
-mapped_file_region::mapped_file_region(int fd,
-                                       bool closefd,
-                                       mapmode mode,
-                                       uint64_t length,
-                                       uint64_t offset,
-                                       error_code &ec)
-  : Mode(mode)
-  , Size(length)
-  , Mapping()
-  , FileDescriptor(fd)
-  , FileHandle(INVALID_HANDLE_VALUE)
-  , FileMappingHandle() {
-  FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
-  if (FileHandle == INVALID_HANDLE_VALUE) {
-    if (closefd)
-      _close(FileDescriptor);
-    FileDescriptor = 0;
-    ec = make_error_code(errc::bad_file_descriptor);
-    return;
-  }
-
-  ec = init(FileDescriptor, closefd, offset);
-  if (ec) {
-    Mapping = FileMappingHandle = 0;
-    FileHandle = INVALID_HANDLE_VALUE;
-    FileDescriptor = 0;
-  }
-}
-
-mapped_file_region::~mapped_file_region() {
-  if (Mapping)
-    ::UnmapViewOfFile(Mapping);
-}
-
-#if LLVM_HAS_RVALUE_REFERENCES
-mapped_file_region::mapped_file_region(mapped_file_region &&other)
-  : Mode(other.Mode)
-  , Size(other.Size)
-  , Mapping(other.Mapping)
-  , FileDescriptor(other.FileDescriptor)
-  , FileHandle(other.FileHandle)
-  , FileMappingHandle(other.FileMappingHandle) {
-  other.Mapping = other.FileMappingHandle = 0;
-  other.FileHandle = INVALID_HANDLE_VALUE;
-  other.FileDescriptor = 0;
-}
-#endif
-
-mapped_file_region::mapmode mapped_file_region::flags() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return Mode;
-}
-
-uint64_t mapped_file_region::size() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return Size;
-}
-
-char *mapped_file_region::data() const {
-  assert(Mode != readonly && "Cannot get non const data for readonly mapping!");
-  assert(Mapping && "Mapping failed but used anyway!");
-  return reinterpret_cast<char*>(Mapping);
-}
-
-const char *mapped_file_region::const_data() const {
-  assert(Mapping && "Mapping failed but used anyway!");
-  return reinterpret_cast<const char*>(Mapping);
-}
-
-int mapped_file_region::alignment() {
-  SYSTEM_INFO SysInfo;
-  ::GetSystemInfo(&SysInfo);
-  return SysInfo.dwAllocationGranularity;
-}
-
-error_code detail::directory_iterator_construct(detail::DirIterState &it,
-                                                StringRef path){
-  SmallVector<wchar_t, 128> path_utf16;
-
-  if (error_code ec = UTF8ToUTF16(path,
-                                  path_utf16))
-    return ec;
-
-  // Convert path to the format that Windows is happy with.
-  if (path_utf16.size() > 0 &&
-      !is_separator(path_utf16[path.size() - 1]) &&
-      path_utf16[path.size() - 1] != L':') {
-    path_utf16.push_back(L'\\');
-    path_utf16.push_back(L'*');
-  } else {
-    path_utf16.push_back(L'*');
-  }
-
-  //  Get the first directory entry.
-  WIN32_FIND_DATAW FirstFind;
-  ScopedFindHandle FindHandle(::FindFirstFileW(c_str(path_utf16), &FirstFind));
-  if (!FindHandle)
-    return windows_error(::GetLastError());
-
-  size_t FilenameLen = ::wcslen(FirstFind.cFileName);
-  while ((FilenameLen == 1 && FirstFind.cFileName[0] == L'.') ||
-         (FilenameLen == 2 && FirstFind.cFileName[0] == L'.' &&
-                              FirstFind.cFileName[1] == L'.'))
-    if (!::FindNextFileW(FindHandle, &FirstFind)) {
-      error_code ec = windows_error(::GetLastError());
-      // Check for end.
-      if (ec == windows_error::no_more_files)
-        return detail::directory_iterator_destruct(it);
-      return ec;
-    } else
-      FilenameLen = ::wcslen(FirstFind.cFileName);
-
-  // Construct the current directory entry.
-  SmallString<128> directory_entry_name_utf8;
-  if (error_code ec = UTF16ToUTF8(FirstFind.cFileName,
-                                  ::wcslen(FirstFind.cFileName),
-                                  directory_entry_name_utf8))
-    return ec;
-
-  it.IterationHandle = intptr_t(FindHandle.take());
-  SmallString<128> directory_entry_path(path);
-  path::append(directory_entry_path, directory_entry_name_utf8.str());
-  it.CurrentEntry = directory_entry(directory_entry_path.str());
-
-  return error_code::success();
-}
-
-error_code detail::directory_iterator_destruct(detail::DirIterState &it) {
-  if (it.IterationHandle != 0)
-    // Closes the handle if it's valid.
-    ScopedFindHandle close(HANDLE(it.IterationHandle));
-  it.IterationHandle = 0;
-  it.CurrentEntry = directory_entry();
-  return error_code::success();
-}
-
-error_code detail::directory_iterator_increment(detail::DirIterState &it) {
-  WIN32_FIND_DATAW FindData;
-  if (!::FindNextFileW(HANDLE(it.IterationHandle), &FindData)) {
-    error_code ec = windows_error(::GetLastError());
-    // Check for end.
-    if (ec == windows_error::no_more_files)
-      return detail::directory_iterator_destruct(it);
-    return ec;
-  }
-
-  size_t FilenameLen = ::wcslen(FindData.cFileName);
-  if ((FilenameLen == 1 && FindData.cFileName[0] == L'.') ||
-      (FilenameLen == 2 && FindData.cFileName[0] == L'.' &&
-                           FindData.cFileName[1] == L'.'))
-    return directory_iterator_increment(it);
-
-  SmallString<128> directory_entry_path_utf8;
-  if (error_code ec = UTF16ToUTF8(FindData.cFileName,
-                                  ::wcslen(FindData.cFileName),
-                                  directory_entry_path_utf8))
-    return ec;
-
-  it.CurrentEntry.replace_filename(Twine(directory_entry_path_utf8));
-  return error_code::success();
-}
-
-error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,  
-                                            bool map_writable, void *&result) {
-  assert(0 && "NOT IMPLEMENTED");
-  return windows_error::invalid_function;
-}
-
-error_code unmap_file_pages(void *base, size_t size) {
-  assert(0 && "NOT IMPLEMENTED");
-  return windows_error::invalid_function;
-}
-
-
-
-} // end namespace fs
-} // end namespace sys
-} // end namespace llvm
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index ad94128..f9a3db9 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -11,18 +11,25 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/Allocator.h"
+
 #include "Windows.h"
 #include <direct.h>
 #include <io.h>
 #include <malloc.h>
 #include <psapi.h>
+#include <shellapi.h>
 
 #ifdef __MINGW32__
  #if (HAVE_LIBPSAPI != 1)
   #error "libpsapi.a should be present"
  #endif
+ #if (HAVE_LIBSHELL32 != 1)
+  #error "libshell32.a should be present"
+ #endif
 #else
  #pragma comment(lib, "psapi.lib")
+ #pragma comment(lib, "shell32.lib")
 #endif
 
 //===----------------------------------------------------------------------===//
@@ -40,7 +47,7 @@ using namespace sys;
 
 
 process::id_type self_process::get_id() {
-  return GetCurrentProcess();
+  return GetCurrentProcessId();
 }
 
 static TimeValue getTimeValueFromFILETIME(FILETIME Time) {
@@ -83,6 +90,8 @@ static unsigned getPageSize() {
   // that LLVM ought to run as 64-bits on a 64-bit system, anyway.
   SYSTEM_INFO info;
   GetSystemInfo(&info);
+  // FIXME: FileOffset in MapViewOfFile() should be aligned to not dwPageSize,
+  // but dwAllocationGranularity.
   return static_cast<unsigned>(info.dwPageSize);
 }
 
@@ -119,28 +128,89 @@ void Process::GetTimeUsage(TimeValue &elapsed, TimeValue &user_time,
   sys_time = getTimeValueFromFILETIME(KernelTime);
 }
 
-int Process::GetCurrentUserId()
-{
-  return 65536;
-}
-
-int Process::GetCurrentGroupId()
-{
-  return 65536;
-}
-
 // Some LLVM programs such as bugpoint produce core files as a normal part of
-// their operation. To prevent the disk from filling up, this configuration item
-// does what's necessary to prevent their generation.
+// their operation. To prevent the disk from filling up, this configuration
+// item does what's necessary to prevent their generation.
 void Process::PreventCoreFiles() {
-  // Windows doesn't do core files, but it does do modal pop-up message
-  // boxes.  As this method is used by bugpoint, preventing these pop-ups
-  // is the moral equivalent of suppressing core files.
+  // Windows does have the concept of core files, called minidumps.  However,
+  // disabling minidumps for a particular application extends past the lifetime
+  // of that application, which is the incorrect behavior for this API.
+  // Additionally, the APIs require elevated privileges to disable and re-
+  // enable minidumps, which makes this untenable. For more information, see
+  // WerAddExcludedApplication and WerRemoveExcludedApplication (Vista and
+  // later).
+  //
+  // Windows also has modal pop-up message boxes.  As this method is used by
+  // bugpoint, preventing these pop-ups is additionally important.
   SetErrorMode(SEM_FAILCRITICALERRORS |
                SEM_NOGPFAULTERRORBOX |
                SEM_NOOPENFILEERRORBOX);
 }
 
+/// Returns the environment variable \arg Name's value as a string encoded in
+/// UTF-8. \arg Name is assumed to be in UTF-8 encoding.
+Optional<std::string> Process::GetEnv(StringRef Name) {
+  // Convert the argument to UTF-16 to pass it to _wgetenv().
+  SmallVector<wchar_t, 128> NameUTF16;
+  if (error_code ec = windows::UTF8ToUTF16(Name, NameUTF16))
+    return None;
+
+  // Environment variable can be encoded in non-UTF8 encoding, and there's no
+  // way to know what the encoding is. The only reliable way to look up
+  // multibyte environment variable is to use GetEnvironmentVariableW().
+  SmallVector<wchar_t, MAX_PATH> Buf;
+  size_t Size = MAX_PATH;
+  do {
+    Buf.reserve(Size);
+    Size =
+        GetEnvironmentVariableW(NameUTF16.data(), Buf.data(), Buf.capacity());
+    if (Size == 0)
+      return None;
+
+    // Try again with larger buffer.
+  } while (Size > Buf.capacity());
+  Buf.set_size(Size);
+
+  // Convert the result from UTF-16 to UTF-8.
+  SmallVector<char, MAX_PATH> Res;
+  if (error_code ec = windows::UTF16ToUTF8(Buf.data(), Size, Res))
+    return None;
+  return std::string(Res.data());
+}
+
+error_code
+Process::GetArgumentVector(SmallVectorImpl<const char *> &Args,
+                           ArrayRef<const char *>,
+                           SpecificBumpPtrAllocator<char> &ArgAllocator) {
+  int NewArgCount;
+  error_code ec;
+
+  wchar_t **UnicodeCommandLine = CommandLineToArgvW(GetCommandLineW(),
+                                                    &NewArgCount);
+  if (!UnicodeCommandLine)
+    return windows_error(::GetLastError());
+
+  Args.reserve(NewArgCount);
+
+  for (int i = 0; i < NewArgCount; ++i) {
+    SmallVector<char, MAX_PATH> NewArgString;
+    ec = windows::UTF16ToUTF8(UnicodeCommandLine[i],
+                              wcslen(UnicodeCommandLine[i]),
+                              NewArgString);
+    if (ec)
+      break;
+
+    char *Buffer = ArgAllocator.Allocate(NewArgString.size() + 1);
+    ::memcpy(Buffer, NewArgString.data(), NewArgString.size() + 1);
+    Args.push_back(Buffer);
+  }
+  LocalFree(UnicodeCommandLine);
+  if (ec)
+    return ec;
+
+  return error_code::success();
+}
+
 bool Process::StandardInIsUserInput() {
   return FileDescriptorIsDisplayed(0);
 }
@@ -187,6 +257,11 @@ bool Process::StandardErrHasColors() {
   return FileDescriptorHasColors(2);
 }
 
+static bool UseANSI = false;
+void Process::UseANSIEscapeCodes(bool enable) {
+  UseANSI = enable;
+}
+
 namespace {
 class DefaultColors
 {
@@ -208,10 +283,12 @@ DefaultColors defaultColors;
 }
 
 bool Process::ColorNeedsFlush() {
-  return true;
+  return !UseANSI;
 }
 
 const char *Process::OutputBold(bool bg) {
+  if (UseANSI) return "\033[1m";
+
   WORD colors = DefaultColors::GetCurrentColor();
   if (bg)
     colors |= BACKGROUND_INTENSITY;
@@ -222,6 +299,8 @@ const char *Process::OutputBold(bool bg) {
 }
 
 const char *Process::OutputColor(char code, bool bold, bool bg) {
+  if (UseANSI) return colorcodes[bg?1:0][bold?1:0][code&7];
+
   WORD colors;
   if (bg) {
     colors = ((code&1) ? BACKGROUND_RED : 0) |
@@ -247,6 +326,8 @@ static WORD GetConsoleTextAttribute(HANDLE hConsoleOutput) {
 }
 
 const char *Process::OutputReverse() {
+  if (UseANSI) return "\033[7m";
+
   const WORD attributes
    = GetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE));
 
@@ -273,6 +354,7 @@ const char *Process::OutputReverse() {
 }
 
 const char *Process::ResetColor() {
+  if (UseANSI) return "\033[0m";
   SetConsoleTextAttribute(GetStdHandle(STD_OUTPUT_HANDLE), defaultColors());
   return 0;
 }
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 619ae5d..dc09738 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "Windows.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/FileSystem.h"
 #include <cstdio>
 #include <fcntl.h>
 #include <io.h>
@@ -22,37 +24,17 @@
 //===          and must not be UNIX code
 //===----------------------------------------------------------------------===//
 
-namespace {
-  struct Win32ProcessInfo {
-    HANDLE hProcess;
-    DWORD  dwProcessId;
-  };
-}
-
 namespace llvm {
 using namespace sys;
 
-Program::Program() : Data_(0) {}
-
-Program::~Program() {
-  if (Data_) {
-    Win32ProcessInfo* wpi = reinterpret_cast<Win32ProcessInfo*>(Data_);
-    CloseHandle(wpi->hProcess);
-    delete wpi;
-    Data_ = 0;
-  }
-}
+ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
 
 // This function just uses the PATH environment variable to find the program.
-Path
-Program::FindProgramByName(const std::string& progName) {
-
+std::string sys::FindProgramByName(const std::string &progName) {
   // Check some degenerate cases
   if (progName.length() == 0) // no program
-    return Path();
-  Path temp;
-  if (!temp.set(progName)) // invalid name
-    return Path();
+    return "";
+  std::string temp = progName;
   // Return paths with slashes verbatim.
   if (progName.find('\\') != std::string::npos ||
       progName.find('/') != std::string::npos)
@@ -60,58 +42,60 @@ Program::FindProgramByName(const std::string& progName) {
 
   // At this point, the file name is valid and does not contain slashes.
   // Let Windows search for it.
-  char buffer[MAX_PATH];
-  char *dummy = NULL;
-  DWORD len = SearchPath(NULL, progName.c_str(), ".exe", MAX_PATH,
-                         buffer, &dummy);
-
-  // See if it wasn't found.
-  if (len == 0)
-    return Path();
-
-  // See if we got the entire path.
-  if (len < MAX_PATH)
-    return Path(buffer);
-
-  // Buffer was too small; grow and retry.
-  while (true) {
-    char *b = reinterpret_cast<char *>(_alloca(len+1));
-    DWORD len2 = SearchPath(NULL, progName.c_str(), ".exe", len+1, b, &dummy);
-
-    // It is unlikely the search failed, but it's always possible some file
-    // was added or removed since the last search, so be paranoid...
-    if (len2 == 0)
-      return Path();
-    else if (len2 <= len)
-      return Path(b);
-
-    len = len2;
-  }
+  SmallVector<wchar_t, MAX_PATH> progNameUnicode;
+  if (windows::UTF8ToUTF16(progName, progNameUnicode))
+    return "";
+
+  SmallVector<wchar_t, MAX_PATH> buffer;
+  DWORD len = MAX_PATH;
+  do {
+    buffer.reserve(len);
+    len = ::SearchPathW(NULL, progNameUnicode.data(), L".exe",
+                        buffer.capacity(), buffer.data(), NULL);
+
+    // See if it wasn't found.
+    if (len == 0)
+      return "";
+
+    // Buffer was too small; grow and retry.
+  } while (len > buffer.capacity());
+
+  buffer.set_size(len);
+  SmallVector<char, MAX_PATH> result;
+  if (windows::UTF16ToUTF8(buffer.begin(), buffer.size(), result))
+    return "";
+
+  return std::string(result.data(), result.size());
 }
 
-static HANDLE RedirectIO(const Path *path, int fd, std::string* ErrMsg) {
+static HANDLE RedirectIO(const StringRef *path, int fd, std::string* ErrMsg) {
   HANDLE h;
   if (path == 0) {
-    DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
-                    GetCurrentProcess(), &h,
-                    0, TRUE, DUPLICATE_SAME_ACCESS);
+    if (!DuplicateHandle(GetCurrentProcess(), (HANDLE)_get_osfhandle(fd),
+                         GetCurrentProcess(), &h,
+                         0, TRUE, DUPLICATE_SAME_ACCESS))
+      return INVALID_HANDLE_VALUE;
     return h;
   }
 
-  const char *fname;
-  if (path->isEmpty())
+  std::string fname;
+  if (path->empty())
     fname = "NUL";
   else
-    fname = path->c_str();
+    fname = *path;
 
   SECURITY_ATTRIBUTES sa;
   sa.nLength = sizeof(sa);
   sa.lpSecurityDescriptor = 0;
   sa.bInheritHandle = TRUE;
 
-  h = CreateFile(fname, fd ? GENERIC_WRITE : GENERIC_READ, FILE_SHARE_READ,
-                 &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
-                 FILE_ATTRIBUTE_NORMAL, NULL);
+  SmallVector<wchar_t, 128> fnameUnicode;
+  if (windows::UTF8ToUTF16(fname, fnameUnicode))
+    return INVALID_HANDLE_VALUE;
+
+  h = CreateFileW(fnameUnicode.data(), fd ? GENERIC_WRITE : GENERIC_READ,
+                  FILE_SHARE_READ, &sa, fd == 0 ? OPEN_EXISTING : CREATE_ALWAYS,
+                  FILE_ATTRIBUTE_NORMAL, NULL);
   if (h == INVALID_HANDLE_VALUE) {
     MakeErrMsg(ErrMsg, std::string(fname) + ": Can't open file for " +
         (fd ? "input: " : "output: "));
@@ -181,22 +165,12 @@ static unsigned int ArgLenWithQuotes(const char *Str) {
   return len;
 }
 
+}
 
-bool
-Program::Execute(const Path& path,
-                 const char** args,
-                 const char** envp,
-                 const Path** redirects,
-                 unsigned memoryLimit,
-                 std::string* ErrMsg) {
-  if (Data_) {
-    Win32ProcessInfo* wpi = reinterpret_cast<Win32ProcessInfo*>(Data_);
-    CloseHandle(wpi->hProcess);
-    delete wpi;
-    Data_ = 0;
-  }
-
-  if (!path.canExecute()) {
+static bool Execute(ProcessInfo &PI, StringRef Program, const char **args,
+                    const char **envp, const StringRef **redirects,
+                    unsigned memoryLimit, std::string *ErrMsg) {
+  if (!sys::fs::can_execute(Program)) {
     if (ErrMsg)
       *ErrMsg = "program not executable";
     return false;
@@ -213,8 +187,8 @@ Program::Execute(const Path& path,
   }
 
   // Now build the command line.
-  char *command = reinterpret_cast<char *>(_alloca(len+1));
-  char *p = command;
+  OwningArrayPtr<char> command(new char[len+1]);
+  char *p = command.get();
 
   for (unsigned i = 0; args[i]; i++) {
     const char *arg = args[i];
@@ -245,34 +219,28 @@ Program::Execute(const Path& path,
   *p = 0;
 
   // The pointer to the environment block for the new process.
-  char *envblock = 0;
+  std::vector<wchar_t> EnvBlock;
 
   if (envp) {
     // An environment block consists of a null-terminated block of
     // null-terminated strings. Convert the array of environment variables to
     // an environment block by concatenating them.
+    for (unsigned i = 0; envp[i]; ++i) {
+      SmallVector<wchar_t, MAX_PATH> EnvString;
+      if (error_code ec = windows::UTF8ToUTF16(envp[i], EnvString)) {
+        SetLastError(ec.value());
+        MakeErrMsg(ErrMsg, "Unable to convert environment variable to UTF-16");
+        return false;
+      }
 
-    // First, determine the length of the environment block.
-    len = 0;
-    for (unsigned i = 0; envp[i]; i++)
-      len += strlen(envp[i]) + 1;
-
-    // Now build the environment block.
-    envblock = reinterpret_cast<char *>(_alloca(len+1));
-    p = envblock;
-
-    for (unsigned i = 0; envp[i]; i++) {
-      const char *ev = envp[i];
-      size_t len = strlen(ev) + 1;
-      memcpy(p, ev, len);
-      p += len;
+      EnvBlock.insert(EnvBlock.end(), EnvString.begin(), EnvString.end());
+      EnvBlock.push_back(0);
     }
-
-    *p = 0;
+    EnvBlock.push_back(0);
   }
 
   // Create a child process.
-  STARTUPINFO si;
+  STARTUPINFOW si;
   memset(&si, 0, sizeof(si));
   si.cb = sizeof(si);
   si.hStdInput = INVALID_HANDLE_VALUE;
@@ -296,9 +264,14 @@ Program::Execute(const Path& path,
     if (redirects[1] && redirects[2] && *(redirects[1]) == *(redirects[2])) {
       // If stdout and stderr should go to the same place, redirect stderr
       // to the handle already open for stdout.
-      DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
-                      GetCurrentProcess(), &si.hStdError,
-                      0, TRUE, DUPLICATE_SAME_ACCESS);
+      if (!DuplicateHandle(GetCurrentProcess(), si.hStdOutput,
+                           GetCurrentProcess(), &si.hStdError,
+                           0, TRUE, DUPLICATE_SAME_ACCESS)) {
+        CloseHandle(si.hStdInput);
+        CloseHandle(si.hStdOutput);
+        MakeErrMsg(ErrMsg, "can't dup stderr to stdout");
+        return false;
+      }
     } else {
       // Just redirect stderr
       si.hStdError = RedirectIO(redirects[2], 2, ErrMsg);
@@ -316,8 +289,27 @@ Program::Execute(const Path& path,
 
   fflush(stdout);
   fflush(stderr);
-  BOOL rc = CreateProcess(path.c_str(), command, NULL, NULL, TRUE, 0,
-                          envblock, NULL, &si, &pi);
+
+  SmallVector<wchar_t, MAX_PATH> ProgramUtf16;
+  if (error_code ec = windows::UTF8ToUTF16(Program, ProgramUtf16)) {
+    SetLastError(ec.value());
+    MakeErrMsg(ErrMsg,
+               std::string("Unable to convert application name to UTF-16"));
+    return false;
+  }
+
+  SmallVector<wchar_t, MAX_PATH> CommandUtf16;
+  if (error_code ec = windows::UTF8ToUTF16(command.get(), CommandUtf16)) {
+    SetLastError(ec.value());
+    MakeErrMsg(ErrMsg,
+               std::string("Unable to convert command-line to UTF-16"));
+    return false;
+  }
+
+  BOOL rc = CreateProcessW(ProgramUtf16.data(), CommandUtf16.data(), 0, 0,
+                           TRUE, CREATE_UNICODE_ENVIRONMENT,
+                           EnvBlock.empty() ? 0 : EnvBlock.data(), 0, &si,
+                           &pi);
   DWORD err = GetLastError();
 
   // Regardless of whether the process got created or not, we are done with
@@ -330,13 +322,12 @@ Program::Execute(const Path& path,
   if (!rc) {
     SetLastError(err);
     MakeErrMsg(ErrMsg, std::string("Couldn't execute program '") +
-               path.str() + "'");
+               Program.str() + "'");
     return false;
   }
-  Win32ProcessInfo* wpi = new Win32ProcessInfo;
-  wpi->hProcess = pi.hProcess;
-  wpi->dwProcessId = pi.dwProcessId;
-  Data_ = wpi;
+
+  PI.Pid = pi.dwProcessId;
+  PI.ProcessHandle = pi.hProcess;
 
   // Make sure these get closed no matter what.
   ScopedCommonHandle hThread(pi.hThread);
@@ -344,7 +335,7 @@ Program::Execute(const Path& path,
   // Assign the process to a job if a memory limit is defined.
   ScopedJobHandle hJob;
   if (memoryLimit != 0) {
-    hJob = CreateJobObject(0, 0);
+    hJob = CreateJobObjectW(0, 0);
     bool success = false;
     if (hJob) {
       JOBOBJECT_EXTENDED_LIMIT_INFORMATION jeli;
@@ -369,72 +360,84 @@ Program::Execute(const Path& path,
   return true;
 }
 
-int
-Program::Wait(const Path &path,
-              unsigned secondsToWait,
-              std::string* ErrMsg) {
-  if (Data_ == 0) {
-    MakeErrMsg(ErrMsg, "Process not started!");
-    return -1;
-  }
-
-  Win32ProcessInfo* wpi = reinterpret_cast<Win32ProcessInfo*>(Data_);
-  HANDLE hProcess = wpi->hProcess;
-
-  // Wait for the process to terminate.
-  DWORD millisecondsToWait = INFINITE;
-  if (secondsToWait > 0)
-    millisecondsToWait = secondsToWait * 1000;
-
-  if (WaitForSingleObject(hProcess, millisecondsToWait) == WAIT_TIMEOUT) {
-    if (!TerminateProcess(hProcess, 1)) {
-      MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
-      // -2 indicates a crash or timeout as opposed to failure to execute.
-      return -2;
+namespace llvm {
+ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
+                      bool WaitUntilChildTerminates, std::string *ErrMsg) {
+  assert(PI.Pid && "invalid pid to wait on, process not started?");
+  assert(PI.ProcessHandle &&
+         "invalid process handle to wait on, process not started?");
+  DWORD milliSecondsToWait = 0;
+  if (WaitUntilChildTerminates)
+    milliSecondsToWait = INFINITE;
+  else if (SecondsToWait > 0)
+    milliSecondsToWait = SecondsToWait * 1000;
+
+  ProcessInfo WaitResult = PI;
+  DWORD WaitStatus = WaitForSingleObject(PI.ProcessHandle, milliSecondsToWait);
+  if (WaitStatus == WAIT_TIMEOUT) {
+    if (SecondsToWait) {
+      if (!TerminateProcess(PI.ProcessHandle, 1)) {
+        if (ErrMsg)
+          MakeErrMsg(ErrMsg, "Failed to terminate timed-out program.");
+
+        // -2 indicates a crash or timeout as opposed to failure to execute.
+        WaitResult.ReturnCode = -2;
+        CloseHandle(PI.ProcessHandle);
+        return WaitResult;
+      }
+      WaitForSingleObject(PI.ProcessHandle, INFINITE);
+      CloseHandle(PI.ProcessHandle);
+    } else {
+      // Non-blocking wait.
+      return ProcessInfo();
     }
-    WaitForSingleObject(hProcess, INFINITE);
   }
 
   // Get its exit status.
   DWORD status;
-  BOOL rc = GetExitCodeProcess(hProcess, &status);
+  BOOL rc = GetExitCodeProcess(PI.ProcessHandle, &status);
   DWORD err = GetLastError();
+  CloseHandle(PI.ProcessHandle);
 
   if (!rc) {
     SetLastError(err);
-    MakeErrMsg(ErrMsg, "Failed getting status for program.");
+    if (ErrMsg)
+      MakeErrMsg(ErrMsg, "Failed getting status for program.");
+
     // -2 indicates a crash or timeout as opposed to failure to execute.
-    return -2;
+    WaitResult.ReturnCode = -2;
+    return WaitResult;
   }
 
   if (!status)
-    return 0;
+    return WaitResult;
 
   // Pass 10(Warning) and 11(Error) to the callee as negative value.
   if ((status & 0xBFFF0000U) == 0x80000000U)
-    return (int)status;
-
-  if (status & 0xFF)
-    return status & 0x7FFFFFFF;
+    WaitResult.ReturnCode = static_cast<int>(status);
+  else if (status & 0xFF)
+    WaitResult.ReturnCode = status & 0x7FFFFFFF;
+  else
+    WaitResult.ReturnCode = 1;
 
-  return 1;
+  return WaitResult;
 }
 
-error_code Program::ChangeStdinToBinary(){
+error_code sys::ChangeStdinToBinary(){
   int result = _setmode( _fileno(stdin), _O_BINARY );
   if (result == -1)
     return error_code(errno, generic_category());
   return make_error_code(errc::success);
 }
 
-error_code Program::ChangeStdoutToBinary(){
+error_code sys::ChangeStdoutToBinary(){
   int result = _setmode( _fileno(stdout), _O_BINARY );
   if (result == -1)
     return error_code(errno, generic_category());
   return make_error_code(errc::success);
 }
 
-error_code Program::ChangeStderrToBinary(){
+error_code sys::ChangeStderrToBinary(){
   int result = _setmode( _fileno(stderr), _O_BINARY );
   if (result == -1)
     return error_code(errno, generic_category());
@@ -456,5 +459,4 @@ bool llvm::sys::argumentsFitWithinSystemLimits(ArrayRef<const char*> Args) {
   }
   return true;
 }
-
 }
diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 9593923..c431844 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc
@@ -48,8 +48,7 @@ static bool loadSRW() {
   if (!sChecked) {
     sChecked = true;
 
-    HMODULE hLib = ::LoadLibrary(TEXT("Kernel32"));
-    if (hLib) {
+    if (HMODULE hLib = ::GetModuleHandleW(L"Kernel32.dll")) {
       fpInitializeSRWLock =
         (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
                                                "InitializeSRWLock");
@@ -65,7 +64,6 @@ static bool loadSRW() {
       fpReleaseSRWLockShared =
         (VOID (WINAPI *)(PSRWLOCK))::GetProcAddress(hLib,
                                                "ReleaseSRWLockShared");
-      ::FreeLibrary(hLib);
 
       if (fpInitializeSRWLock != NULL) {
         sHasSRW = true;
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index b18b4d1..4b40d51 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -11,6 +11,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/FileSystem.h"
+
 #include "Windows.h"
 #include <algorithm>
 #include <stdio.h>
@@ -133,7 +135,7 @@ typedef PVOID (WINAPI *fpSymFunctionTableAccess64)(HANDLE, DWORD64);
 static fpSymFunctionTableAccess64 SymFunctionTableAccess64;
 
 static bool load64BitDebugHelp(void) {
-  HMODULE hLib = ::LoadLibrary("Dbghelp.dll");
+  HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
   if (hLib) {
     StackWalk64 = (fpStackWalk64)
                       ::GetProcAddress(hLib, "StackWalk64");
@@ -158,7 +160,7 @@ static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType);
 // InterruptFunction - The function to call if ctrl-c is pressed.
 static void (*InterruptFunction)() = 0;
 
-static std::vector<llvm::sys::Path> *FilesToRemove = NULL;
+static std::vector<std::string> *FilesToRemove = NULL;
 static std::vector<std::pair<void(*)(void*), void*> > *CallBacksToRun = 0;
 static bool RegisteredUnhandledExceptionFilter = false;
 static bool CleanupExecuted = false;
@@ -191,34 +193,6 @@ static int AvoidMessageBoxHook(int ReportType, char *Message, int *Return) {
   return TRUE;
 }
 
-/// CRTReportHook - Function called on a CRT debugging event.
-static int CRTReportHook(int ReportType, char *Message, int *Return) {
-  // Don't cause a DebugBreak() on return.
-  if (Return)
-    *Return = 0;
-
-  switch (ReportType) {
-  default:
-  case _CRT_ASSERT:
-    fprintf(stderr, "CRT assert: %s\n", Message);
-    // FIXME: Is there a way to just crash? Perhaps throw to the unhandled
-    // exception code? Perhaps SetErrorMode() handles this.
-    _exit(3);
-    break;
-  case _CRT_ERROR:
-    fprintf(stderr, "CRT error: %s\n", Message);
-    // FIXME: Is there a way to just crash? Perhaps throw to the unhandled
-    // exception code? Perhaps SetErrorMode() handles this.
-    _exit(3);
-    break;
-  case _CRT_WARN:
-    fprintf(stderr, "CRT warn: %s\n", Message);
-    break;
-  }
-
-  // Don't call _CrtDbgReport.
-  return TRUE;
-}
 #endif
 
 static void RegisterHandler() {
@@ -251,19 +225,10 @@ static void RegisterHandler() {
   OldFilter = SetUnhandledExceptionFilter(LLVMUnhandledExceptionFilter);
   SetConsoleCtrlHandler(LLVMConsoleCtrlHandler, TRUE);
 
-#ifdef _MSC_VER
-  const char *EnableMsgbox = getenv("LLVM_ENABLE_CRT_REPORT");
-  if (!EnableMsgbox || strcmp("0", EnableMsgbox) == 0) {
-    // Setting a report hook overrides the default behavior of popping an "abort,
-    // retry, or ignore" dialog.
-    _CrtSetReportHook(AvoidMessageBoxHook);
-  }
-#endif
-
   // Environment variable to disable any kind of crash dialog.
   if (getenv("LLVM_DISABLE_CRASH_REPORT")) {
 #ifdef _MSC_VER
-    _CrtSetReportHook(CRTReportHook);
+    _CrtSetReportHook(AvoidMessageBoxHook);
 #endif
     SetErrorMode(SEM_FAILCRITICALERRORS |
                  SEM_NOGPFAULTERRORBOX |
@@ -276,7 +241,7 @@ static void RegisterHandler() {
 }
 
 // RemoveFileOnSignal - The public API
-bool sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) {
+bool sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) {
   RegisterHandler();
 
   if (CleanupExecuted) {
@@ -286,7 +251,7 @@ bool sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) {
   }
 
   if (FilesToRemove == NULL)
-    FilesToRemove = new std::vector<sys::Path>;
+    FilesToRemove = new std::vector<std::string>;
 
   FilesToRemove->push_back(Filename);
 
@@ -295,14 +260,14 @@ bool sys::RemoveFileOnSignal(const sys::Path &Filename, std::string* ErrMsg) {
 }
 
 // DontRemoveFileOnSignal - The public API
-void sys::DontRemoveFileOnSignal(const sys::Path &Filename) {
+void sys::DontRemoveFileOnSignal(StringRef Filename) {
   if (FilesToRemove == NULL)
     return;
 
   RegisterHandler();
 
   FilesToRemove->push_back(Filename);
-  std::vector<sys::Path>::reverse_iterator I =
+  std::vector<std::string>::reverse_iterator I =
   std::find(FilesToRemove->rbegin(), FilesToRemove->rend(), Filename);
   if (I != FilesToRemove->rend())
     FilesToRemove->erase(I.base()-1);
@@ -352,7 +317,8 @@ static void Cleanup() {
 
   if (FilesToRemove != NULL)
     while (!FilesToRemove->empty()) {
-      FilesToRemove->back().eraseFromDisk();
+      bool Existed;
+      llvm::sys::fs::remove(FilesToRemove->back(), Existed);
       FilesToRemove->pop_back();
     }
 
diff --git a/lib/Support/Windows/TimeValue.inc b/lib/Support/Windows/TimeValue.inc
index 1227552..98b07d6 100644
--- a/lib/Support/Windows/TimeValue.inc
+++ b/lib/Support/Windows/TimeValue.inc
@@ -12,10 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "Windows.h"
+#include <cctype>
 #include <time.h>
 
-namespace llvm {
-using namespace sys;
+using namespace llvm;
+using namespace llvm::sys;
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only Win32 specific code.
@@ -31,21 +32,28 @@ TimeValue TimeValue::now() {
 }
 
 std::string TimeValue::str() const {
+  struct tm *LT;
 #ifdef __MINGW32__
-  // This ban may be lifted by either:
-  // (i) a future MinGW version other than 1.0 inherents the __time64_t type, or
-  // (ii) configure tests for either the time_t or __time64_t type.
-  time_t ourTime = time_t(this->toEpochTime());
-  struct tm *lt = ::localtime(&ourTime);
+  // Old versions of mingw don't have _localtime64_s. Remove this once we drop support
+  // for them.
+  time_t OurTime = time_t(this->toEpochTime());
+  LT = ::localtime(&OurTime);
+  assert(LT);
 #else
-  __time64_t ourTime = this->toEpochTime();
-  struct tm *lt = ::_localtime64(&ourTime);
+  struct tm Storage;
+  __time64_t OurTime = this->toEpochTime();
+  int Error = ::_localtime64_s(&Storage, &OurTime);
+  assert(!Error);
+  LT = &Storage;
 #endif
 
-  char buffer[25];
-  strftime(buffer, 25, "%a %b %d %H:%M:%S %Y", lt);
-  return std::string(buffer);
-}
-
-
+  char Buffer[25];
+  // FIXME: the windows version of strftime doesn't support %e
+  strftime(Buffer, 25, "%b %d %H:%M %Y", LT);
+  assert((Buffer[3] == ' ' && isdigit(Buffer[5]) && Buffer[6] == ' ') &&
+         "Unexpected format in strftime()!");
+  // Emulate %e on %d to mute '0'.
+  if (Buffer[4] == '0')
+    Buffer[4] = ' ';
+  return std::string(Buffer);
 }
diff --git a/lib/Support/Windows/Windows.h b/lib/Support/Windows/Windows.h
index 5c1da0d..1f3417d 100644
--- a/lib/Support/Windows/Windows.h
+++ b/lib/Support/Windows/Windows.h
@@ -24,22 +24,31 @@
 #define _WIN32_IE    0x0600 // MinGW at it again.
 #define WIN32_LEAN_AND_MEAN
 
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Config/config.h" // Get build system configuration settings
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/system_error.h"
 #include <windows.h>
 #include <wincrypt.h>
-#include <shlobj.h>
 #include <cassert>
 #include <string>
+#include <vector>
 
 inline bool MakeErrMsg(std::string* ErrMsg, const std::string& prefix) {
   if (!ErrMsg)
     return true;
   char *buffer = NULL;
-  FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM,
-      NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
-  *ErrMsg = prefix + buffer;
+  DWORD R = FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER |
+                          FORMAT_MESSAGE_FROM_SYSTEM,
+                          NULL, GetLastError(), 0, (LPSTR)&buffer, 1, NULL);
+  if (R)
+    *ErrMsg = prefix + buffer;
+  else
+    *ErrMsg = prefix + "Unknown error";
+
   LocalFree(buffer);
-  return true;
+  return R != 0;
 }
 
 template <typename HandleTraits>
@@ -75,7 +84,7 @@ public:
   }
 
   // True if Handle is valid.
-  operator bool() const {
+  LLVM_EXPLICIT operator bool() const {
     return HandleTraits::IsValid(Handle) ? true : false;
   }
 
@@ -147,4 +156,13 @@ c_str(SmallVectorImpl<T> &str) {
   str.pop_back();
   return str.data();
 }
+
+namespace sys {
+namespace windows {
+error_code UTF8ToUTF16(StringRef utf8,
+                       SmallVectorImpl<wchar_t> &utf16);
+error_code UTF16ToUTF8(const wchar_t *utf16, size_t utf16_len,
+                       SmallVectorImpl<char> &utf8);
+} // end namespace windows
+} // end namespace sys
 } // end namespace llvm.
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index 213f5e1..9495cd4 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -96,6 +96,15 @@ static EncodingInfo getUnicodeEncoding(StringRef Input) {
 
 namespace llvm {
 namespace yaml {
+/// Pin the vtables to this file.
+void Node::anchor() {}
+void NullNode::anchor() {}
+void ScalarNode::anchor() {}
+void KeyValueNode::anchor() {}
+void MappingNode::anchor() {}
+void SequenceNode::anchor() {}
+void AliasNode::anchor() {}
+
 /// Token - A single YAML token.
 struct Token : ilist_node<Token> {
   enum TokenKind {
@@ -1070,14 +1079,22 @@ bool Scanner::scanDirective() {
   Current = skip_while(&Scanner::skip_ns_char, Current);
   StringRef Name(NameStart, Current - NameStart);
   Current = skip_while(&Scanner::skip_s_white, Current);
-
+  
+  Token T;
   if (Name == "YAML") {
     Current = skip_while(&Scanner::skip_ns_char, Current);
-    Token T;
     T.Kind = Token::TK_VersionDirective;
     T.Range = StringRef(Start, Current - Start);
     TokenQueue.push_back(T);
     return true;
+  } else if(Name == "TAG") {
+    Current = skip_while(&Scanner::skip_ns_char, Current);
+    Current = skip_while(&Scanner::skip_s_white, Current);
+    Current = skip_while(&Scanner::skip_ns_char, Current);
+    T.Kind = Token::TK_TagDirective;
+    T.Range = StringRef(Start, Current - Start);
+    TokenQueue.push_back(T);
+    return true;
   }
   return false;
 }
@@ -1564,10 +1581,6 @@ void Stream::printError(Node *N, const Twine &Msg) {
                      , Ranges);
 }
 
-void Stream::handleYAMLDirective(const Token &t) {
-  // TODO: Ensure version is 1.x.
-}
-
 document_iterator Stream::begin() {
   if (CurrentDoc)
     report_fatal_error("Can only iterate over the stream once");
@@ -1588,14 +1601,59 @@ void Stream::skip() {
     i->skip();
 }
 
-Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A)
+Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A, StringRef T)
   : Doc(D)
   , TypeID(Type)
-  , Anchor(A) {
+  , Anchor(A)
+  , Tag(T) {
   SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
   SourceRange = SMRange(Start, Start);
 }
 
+std::string Node::getVerbatimTag() const {
+  StringRef Raw = getRawTag();
+  if (!Raw.empty() && Raw != "!") {
+    std::string Ret;
+    if (Raw.find_last_of('!') == 0) {
+      Ret = Doc->getTagMap().find("!")->second;
+      Ret += Raw.substr(1);
+      return llvm_move(Ret);
+    } else if (Raw.startswith("!!")) {
+      Ret = Doc->getTagMap().find("!!")->second;
+      Ret += Raw.substr(2);
+      return llvm_move(Ret);
+    } else {
+      StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
+      std::map<StringRef, StringRef>::const_iterator It =
+          Doc->getTagMap().find(TagHandle);
+      if (It != Doc->getTagMap().end())
+        Ret = It->second;
+      else {
+        Token T;
+        T.Kind = Token::TK_Tag;
+        T.Range = TagHandle;
+        setError(Twine("Unknown tag handle ") + TagHandle, T);
+      }
+      Ret += Raw.substr(Raw.find_last_of('!') + 1);
+      return llvm_move(Ret);
+    }
+  }
+
+  switch (getType()) {
+  case NK_Null:
+    return "tag:yaml.org,2002:null";
+  case NK_Scalar:
+    // TODO: Tag resolution.
+    return "tag:yaml.org,2002:str";
+  case NK_Mapping:
+    return "tag:yaml.org,2002:map";
+  case NK_Sequence:
+    return "tag:yaml.org,2002:seq";
+  }
+
+  return "";
+}
+
 Token &Node::peekNext() {
   return Doc->peekNext();
 }
@@ -1999,6 +2057,10 @@ void SequenceNode::increment() {
 }
 
 Document::Document(Stream &S) : stream(S), Root(0) {
+  // Tag maps starts with two default mappings.
+  TagMap["!"] = "!";
+  TagMap["!!"] = "tag:yaml.org,2002:";
+
   if (parseDirectives())
     expectToken(Token::TK_DocumentStart);
   Token &T = peekNext();
@@ -2042,6 +2104,7 @@ Node *Document::parseBlockNode() {
   Token T = peekNext();
   // Handle properties.
   Token AnchorInfo;
+  Token TagInfo;
 parse_property:
   switch (T.Kind) {
   case Token::TK_Alias:
@@ -2056,7 +2119,11 @@ parse_property:
     T = peekNext();
     goto parse_property;
   case Token::TK_Tag:
-    getNext(); // Skip TK_Tag.
+    if (TagInfo.Kind == Token::TK_Tag) {
+      setError("Already encountered a tag for this node!", T);
+      return 0;
+    }
+    TagInfo = getNext(); // Consume TK_Tag.
     T = peekNext();
     goto parse_property;
   default:
@@ -2070,42 +2137,49 @@ parse_property:
     // Don't eat the TK_BlockEntry, SequenceNode needs it.
     return new (NodeAllocator) SequenceNode( stream.CurrentDoc
                                            , AnchorInfo.Range.substr(1)
+                                           , TagInfo.Range
                                            , SequenceNode::ST_Indentless);
   case Token::TK_BlockSequenceStart:
     getNext();
     return new (NodeAllocator)
       SequenceNode( stream.CurrentDoc
                   , AnchorInfo.Range.substr(1)
+                  , TagInfo.Range
                   , SequenceNode::ST_Block);
   case Token::TK_BlockMappingStart:
     getNext();
     return new (NodeAllocator)
       MappingNode( stream.CurrentDoc
                  , AnchorInfo.Range.substr(1)
+                 , TagInfo.Range
                  , MappingNode::MT_Block);
   case Token::TK_FlowSequenceStart:
     getNext();
     return new (NodeAllocator)
       SequenceNode( stream.CurrentDoc
                   , AnchorInfo.Range.substr(1)
+                  , TagInfo.Range
                   , SequenceNode::ST_Flow);
   case Token::TK_FlowMappingStart:
     getNext();
     return new (NodeAllocator)
       MappingNode( stream.CurrentDoc
                  , AnchorInfo.Range.substr(1)
+                 , TagInfo.Range
                  , MappingNode::MT_Flow);
   case Token::TK_Scalar:
     getNext();
     return new (NodeAllocator)
       ScalarNode( stream.CurrentDoc
                 , AnchorInfo.Range.substr(1)
+                , TagInfo.Range
                 , T.Range);
   case Token::TK_Key:
     // Don't eat the TK_Key, KeyValueNode expects it.
     return new (NodeAllocator)
       MappingNode( stream.CurrentDoc
                  , AnchorInfo.Range.substr(1)
+                 , TagInfo.Range
                  , MappingNode::MT_Inline);
   case Token::TK_DocumentStart:
   case Token::TK_DocumentEnd:
@@ -2126,10 +2200,10 @@ bool Document::parseDirectives() {
   while (true) {
     Token T = peekNext();
     if (T.Kind == Token::TK_TagDirective) {
-      handleTagDirective(getNext());
+      parseTAGDirective();
       isDirective = true;
     } else if (T.Kind == Token::TK_VersionDirective) {
-      stream.handleYAMLDirective(getNext());
+      parseYAMLDirective();
       isDirective = true;
     } else
       break;
@@ -2137,6 +2211,21 @@ bool Document::parseDirectives() {
   return isDirective;
 }
 
+void Document::parseYAMLDirective() {
+  getNext(); // Eat %YAML <version>
+}
+
+void Document::parseTAGDirective() {
+  Token Tag = getNext(); // %TAG <handle> <prefix>
+  StringRef T = Tag.Range;
+  // Strip %TAG
+  T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
+  std::size_t HandleEnd = T.find_first_of(" \t");
+  StringRef TagHandle = T.substr(0, HandleEnd);
+  StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
+  TagMap[TagHandle] = TagPrefix;
+}
+
 bool Document::expectToken(int TK) {
   Token T = getNext();
   if (T.Kind != TK) {
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 9da2aa7..42bff96 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -15,6 +15,7 @@
 #include "llvm/Support/YAMLParser.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstring>
+#include <cctype>
 using namespace llvm;
 using namespace yaml;
 
@@ -40,32 +41,43 @@ void IO::setContext(void *Context) {
 //  Input
 //===----------------------------------------------------------------------===//
 
-Input::Input(StringRef InputContent, void *Ctxt) 
-  : IO(Ctxt), 
+Input::Input(StringRef InputContent,
+             void *Ctxt,
+             SourceMgr::DiagHandlerTy DiagHandler,
+             void *DiagHandlerCtxt)
+  : IO(Ctxt),
     Strm(new Stream(InputContent, SrcMgr)),
     CurrentNode(NULL) {
+  if (DiagHandler)
+    SrcMgr.setDiagHandler(DiagHandler, DiagHandlerCtxt);
   DocIterator = Strm->begin();
 }
 
 Input::~Input() {
-  
 }
 
 error_code Input::error() {
   return EC;
 }
 
-void Input::setDiagHandler(SourceMgr::DiagHandlerTy Handler, void *Ctxt) {
-  SrcMgr.setDiagHandler(Handler, Ctxt);
-}
+// Pin the vtables to this file.
+void Input::HNode::anchor() {}
+void Input::EmptyHNode::anchor() {}
+void Input::ScalarHNode::anchor() {}
 
-bool Input::outputting() {
+bool Input::outputting() const {
   return false;
 }
 
 bool Input::setCurrentDocument() {
   if (DocIterator != Strm->end()) {
     Node *N = DocIterator->getRoot();
+    if (!N) {
+      assert(Strm->failed() && "Root is NULL iff parsing failed");
+      EC = make_error_code(errc::invalid_argument);
+      return false;
+    }
+
     if (isa<NullNode>(N)) {
       // Empty files are allowed and ignored
       ++DocIterator;
@@ -82,10 +94,21 @@ void Input::nextDocument() {
   ++DocIterator;
 }
 
+bool Input::mapTag(StringRef Tag, bool Default) {
+  std::string foundTag = CurrentNode->_node->getVerbatimTag();
+  if (foundTag.empty()) {
+    // If no tag found and 'Tag' is the default, say it was found.
+    return Default;
+  }
+  // Return true iff found tag matches supplied tag.
+  return Tag.equals(foundTag);
+}
+
 void Input::beginMapping() {
   if (EC)
     return;
-  MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+  // CurrentNode can be null if the document is empty.
+  MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
   if (MN) {
     MN->ValidKeys.clear();
   }
@@ -96,6 +119,15 @@ bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
   UseDefault = false;
   if (EC)
     return false;
+
+  // CurrentNode is null for empty documents, which is an error in case required
+  // nodes are present.
+  if (!CurrentNode) {
+    if (Required)
+      EC = make_error_code(errc::invalid_argument);
+    return false;
+  }
+
   MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
   if (!MN) {
     setError(CurrentNode, "not a mapping");
@@ -122,13 +154,14 @@ void Input::postflightKey(void *saveInfo) {
 void Input::endMapping() {
   if (EC)
     return;
-  MapHNode *MN = dyn_cast<MapHNode>(CurrentNode);
+  // CurrentNode can be null if the document is empty.
+  MapHNode *MN = dyn_cast_or_null<MapHNode>(CurrentNode);
   if (!MN)
     return;
   for (MapHNode::NameToNode::iterator i = MN->Mapping.begin(),
        End = MN->Mapping.end(); i != End; ++i) {
-    if (!MN->isValidKey(i->first)) {
-      setError(i->second, Twine("unknown key '") + i->first + "'");
+    if (!MN->isValidKey(i->first())) {
+      setError(i->second, Twine("unknown key '") + i->first() + "'");
       break;
     }
   }
@@ -263,6 +296,7 @@ void Input::scalarString(StringRef &S) {
 }
 
 void Input::setError(HNode *hnode, const Twine &message) {
+  assert(hnode && "HNode must not be NULL");
   this->setError(hnode->_node, message);
 }
 
@@ -322,7 +356,7 @@ Input::HNode *Input::createHNodes(Node *N) {
 }
 
 bool Input::MapHNode::isValidKey(StringRef Key) {
-  for (SmallVector<const char *, 6>::iterator i = ValidKeys.begin(),
+  for (SmallVectorImpl<const char *>::iterator i = ValidKeys.begin(),
        End = ValidKeys.end(); i != End; ++i) {
     if (Key.equals(*i))
       return true;
@@ -334,6 +368,10 @@ void Input::setError(const Twine &Message) {
   this->setError(CurrentNode, Message);
 }
 
+bool Input::canElideEmptySequence() {
+  return false;
+}
+
 Input::MapHNode::~MapHNode() {
   for (MapHNode::NameToNode::iterator i = Mapping.begin(), End = Mapping.end();
                                                                 i != End; ++i) {
@@ -368,7 +406,7 @@ Output::Output(raw_ostream &yout, void *context)
 Output::~Output() {
 }
 
-bool Output::outputting() {
+bool Output::outputting() const {
   return true;
 }
 
@@ -377,6 +415,14 @@ void Output::beginMapping() {
   NeedsNewLine = true;
 }
 
+bool Output::mapTag(StringRef Tag, bool Use) {
+  if (Use) {
+    this->output(" ");
+    this->output(Tag);
+  }
+  return Use;
+}
+
 void Output::endMapping() {
   StateStack.pop_back();
 }
@@ -505,9 +551,20 @@ void Output::endBitSetScalar() {
 }
 
 void Output::scalarString(StringRef &S) {
+  const char ScalarSafeChars[] = "abcdefghijklmnopqrstuvwxyz"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-/^., \t";
+
   this->newLineCheck();
-  if (S.find('\n') == StringRef::npos) {
-    // No embedded new-line chars, just print string.
+  if (S.empty()) {
+    // Print '' for the empty string because leaving the field empty is not
+    // allowed.
+    this->outputUpToEndOfLine("''");
+    return;
+  }
+  if (S.find_first_not_of(ScalarSafeChars) == StringRef::npos &&
+      !isspace(S.front()) && !isspace(S.back())) {
+    // If the string consists only of safe characters, print it out without
+    // quotes.
     this->outputUpToEndOfLine(S);
     return;
   }
@@ -532,6 +589,19 @@ void Output::scalarString(StringRef &S) {
 void Output::setError(const Twine &message) {
 }
 
+bool Output::canElideEmptySequence() {
+  // Normally, with an optional key/value where the value is an empty sequence,
+  // the whole key/value can be not written.  But, that produces wrong yaml
+  // if the key/value is the only thing in the map and the map is used in
+  // a sequence.  This detects if the this sequence is the first key/value
+  // in map that itself is embedded in a sequnce.
+  if (StateStack.size() < 2)
+    return true;
+  if (StateStack.back() != inMapFirstKey)
+    return true;
+  return (StateStack[StateStack.size()-2] != inSeq);
+}
+
 void Output::output(StringRef s) {
   Column += s.size();
   Out << s;
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index a433088..cb96489 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Config/config.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Process.h"
 #include "llvm/Support/Program.h"
@@ -25,14 +26,15 @@
 #include <cctype>
 #include <cerrno>
 #include <sys/stat.h>
-#include <sys/types.h>
 
-#if defined(HAVE_UNISTD_H)
-# include <unistd.h>
-#endif
+// <fcntl.h> may provide O_BINARY.
 #if defined(HAVE_FCNTL_H)
 # include <fcntl.h>
 #endif
+
+#if defined(HAVE_UNISTD_H)
+# include <unistd.h>
+#endif
 #if defined(HAVE_SYS_UIO_H) && defined(HAVE_WRITEV)
 #  include <sys/uio.h>
 #endif
@@ -43,7 +45,6 @@
 
 #if defined(_MSC_VER)
 #include <io.h>
-#include <fcntl.h>
 #ifndef STDIN_FILENO
 # define STDIN_FILENO 0
 #endif
@@ -424,14 +425,9 @@ void format_object_base::home() {
 /// stream should be immediately destroyed; the string will be empty
 /// if no error occurred.
 raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
-                               unsigned Flags)
-  : Error(false), UseAtomicWrites(false), pos(0)
-{
+                               sys::fs::OpenFlags Flags)
+    : Error(false), UseAtomicWrites(false), pos(0) {
   assert(Filename != 0 && "Filename is null");
-  // Verify that we don't have both "append" and "excl".
-  assert((!(Flags & F_Excl) || !(Flags & F_Append)) &&
-         "Cannot specify both 'excl' and 'append' file creation flags!");
-
   ErrorInfo.clear();
 
   // Handle "-" as stdout. Note that when we do this, we consider ourself
@@ -441,32 +437,20 @@ raw_fd_ostream::raw_fd_ostream(const char *Filename, std::string &ErrorInfo,
     FD = STDOUT_FILENO;
     // If user requested binary then put stdout into binary mode if
     // possible.
-    if (Flags & F_Binary)
-      sys::Program::ChangeStdoutToBinary();
+    if (Flags & sys::fs::F_Binary)
+      sys::ChangeStdoutToBinary();
     // Close stdout when we're done, to detect any output errors.
     ShouldClose = true;
     return;
   }
 
-  int OpenFlags = O_WRONLY|O_CREAT;
-#ifdef O_BINARY
-  if (Flags & F_Binary)
-    OpenFlags |= O_BINARY;
-#endif
-
-  if (Flags & F_Append)
-    OpenFlags |= O_APPEND;
-  else
-    OpenFlags |= O_TRUNC;
-  if (Flags & F_Excl)
-    OpenFlags |= O_EXCL;
+  error_code EC = sys::fs::openFileForWrite(Filename, FD, Flags);
 
-  while ((FD = open(Filename, OpenFlags, 0664)) < 0) {
-    if (errno != EINTR) {
-      ErrorInfo = "Error opening output file '" + std::string(Filename) + "'";
-      ShouldClose = false;
-      return;
-    }
+  if (EC) {
+    ErrorInfo = "Error opening output file '" + std::string(Filename) + "': " +
+                EC.message();
+    ShouldClose = false;
+    return;
   }
 
   // Ok, we successfully opened the file, so it'll need to be closed.
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index dc4167b..7fe47bc 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp
@@ -83,7 +83,7 @@ int TableGenMain(char *argv0, TableGenMainFn *MainFn) {
   // Parse the input file.
   OwningPtr<MemoryBuffer> File;
   if (error_code ec =
-        MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
+        MemoryBuffer::getFileOrSTDIN(InputFilename, File)) {
     errs() << "Could not open input file '" << InputFilename << "': "
            << ec.message() <<"\n";
     return 1;
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 9ad2053..431f4aa 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -557,9 +557,23 @@ Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
   return const_cast<BitsInit *>(this);
 }
 
+namespace {
+  template<typename T>
+  class Pool : public T {
+  public:
+    ~Pool();
+  };
+  template<typename T>
+  Pool<T>::~Pool() {
+    for (typename T::iterator I = this->begin(), E = this->end(); I != E; ++I) {
+      typename T::value_type &Item = *I;
+      delete Item.second;
+    }
+  }
+}
+
 IntInit *IntInit::get(int64_t V) {
-  typedef DenseMap<int64_t, IntInit *> Pool;
-  static Pool ThePool;
+  static Pool<DenseMap<int64_t, IntInit *> > ThePool;
 
   IntInit *&I = ThePool[V];
   if (!I) I = new IntInit(V);
@@ -586,8 +600,7 @@ IntInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
 void StringInit::anchor() { }
 
 StringInit *StringInit::get(StringRef V) {
-  typedef StringMap<StringInit *> Pool;
-  static Pool ThePool;
+  static Pool<StringMap<StringInit *> > ThePool;
 
   StringInit *&I = ThePool[V];
   if (!I) I = new StringInit(V);
@@ -726,9 +739,7 @@ Init *OpInit::getBit(unsigned Bit) const {
 
 UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
   typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
-
-  typedef DenseMap<Key, UnOpInit *> Pool;
-  static Pool ThePool;  
+  static Pool<DenseMap<Key, UnOpInit *> > ThePool;
 
   Key TheKey(std::make_pair(std::make_pair(opc, lhs), Type));
 
@@ -873,8 +884,7 @@ BinOpInit *BinOpInit::get(BinaryOp opc, Init *lhs,
     RecTy *
     > Key;
 
-  typedef DenseMap<Key, BinOpInit *> Pool;
-  static Pool ThePool;  
+  static Pool<DenseMap<Key, BinOpInit *> > ThePool;
 
   Key TheKey(std::make_pair(std::make_pair(std::make_pair(opc, lhs), rhs),
                             Type));
@@ -1298,8 +1308,7 @@ VarInit *VarInit::get(const std::string &VN, RecTy *T) {
 
 VarInit *VarInit::get(Init *VN, RecTy *T) {
   typedef std::pair<RecTy *, Init *> Key;
-  typedef DenseMap<Key, VarInit *> Pool;
-  static Pool ThePool;
+  static Pool<DenseMap<Key, VarInit *> > ThePool;
 
   Key TheKey(std::make_pair(T, VN));
 
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index 86ad2a6..daac574 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -1271,10 +1271,11 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
     if (ItemType != 0) {
       ListRecTy *ListType = dyn_cast<ListRecTy>(ItemType);
       if (ListType == 0) {
-        std::stringstream s;
-        s << "Type mismatch for list, expected list type, got "
-          << ItemType->getAsString();
-        TokError(s.str());
+        std::string s;
+        raw_string_ostream ss(s);
+        ss << "Type mismatch for list, expected list type, got "
+           << ItemType->getAsString();
+        TokError(ss.str());
         return 0;
       }
       GivenListTy = ListType;
@@ -2495,6 +2496,9 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
     if (Lex.getCode() != tgtok::comma) break;
     Lex.Lex(); // eat ','.
 
+    if (Lex.getCode() != tgtok::Id)
+      return TokError("expected identifier");
+
     SubClassLoc = Lex.getLoc();
 
     // A defm can inherit from regular classes (non-multiclass) as
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e17052b..9c2c69a 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -21,8 +21,11 @@ include "llvm/Target/Target.td"
 // AArch64 Subtarget features.
 //
 
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
+  "Enable ARMv8 FP">;
+
 def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
-  "Enable Advanced SIMD instructions">;
+  "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
 
 def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
   "Enable cryptographic instructions">;
@@ -33,7 +36,7 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
 
 include "AArch64Schedule.td"
 
-def : Processor<"generic", GenericItineraries, [FeatureNEON, FeatureCrypto]>;
+def : Processor<"generic", GenericItineraries, [FeatureFPARMv8]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 47ebb82..d59ca56 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -27,32 +27,23 @@
 
 using namespace llvm;
 
-MachineLocation
-AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
-  // See emitFrameIndexDebugValue in InstrInfo for where this instruction is
-  // expected to be created.
-  assert(MI->getNumOperands() == 4 && MI->getOperand(0).isReg()
-         && MI->getOperand(1).isImm() && "unexpected custom DBG_VALUE");
-  return MachineLocation(MI->getOperand(0).getReg(),
-                         MI->getOperand(1).getImm());
-}
-
 /// Try to print a floating-point register as if it belonged to a specified
 /// register-class. For example the inline asm operand modifier "b" requires its
 /// argument to be printed as "bN".
 static bool printModifiedFPRAsmOperand(const MachineOperand &MO,
                                        const TargetRegisterInfo *TRI,
-                                       const TargetRegisterClass &RegClass,
-                                       raw_ostream &O) {
+                                       char RegType, raw_ostream &O) {
   if (!MO.isReg())
     return true;
 
   for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
-    if (RegClass.contains(*AR)) {
-      O << AArch64InstPrinter::getRegisterName(*AR);
+    if (AArch64::FPR8RegClass.contains(*AR)) {
+      O << RegType << TRI->getEncodingValue(MO.getReg());
       return false;
     }
   }
+
+  // The register doesn't correspond to anything floating-point like.
   return true;
 }
 
@@ -91,9 +82,9 @@ bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
   StringRef Modifier;
   switch (MO.getType()) {
   default:
-    llvm_unreachable("Unexpected operand for symbolic address constraint");
+    return true;
   case MachineOperand::MO_GlobalAddress:
-    Name = Mang->getSymbol(MO.getGlobal())->getName();
+    Name = getSymbol(MO.getGlobal())->getName();
 
     // Global variables may be accessed either via a GOT or in various fun and
     // interesting TLS-model specific ways. Set the prefix modifier as
@@ -155,57 +146,29 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                                         unsigned AsmVariant,
                                         const char *ExtraCode, raw_ostream &O) {
   const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
-  if (!ExtraCode || !ExtraCode[0]) {
-    // There's actually no operand modifier, which leads to a slightly eclectic
-    // set of behaviour which we have to handle here.
-    const MachineOperand &MO = MI->getOperand(OpNum);
-    switch (MO.getType()) {
-    default:
-      llvm_unreachable("Unexpected operand for inline assembly");
-    case MachineOperand::MO_Register:
-      // GCC prints the unmodified operand of a 'w' constraint as the vector
-      // register. Technically, we could allocate the argument as a VPR128, but
-      // that leads to extremely dodgy copies being generated to get the data
-      // there.
-      if (printModifiedFPRAsmOperand(MO, TRI, AArch64::VPR128RegClass, O))
-        O << AArch64InstPrinter::getRegisterName(MO.getReg());
-      break;
-    case MachineOperand::MO_Immediate:
-      O << '#' << MO.getImm();
-      break;
-    case MachineOperand::MO_FPImmediate:
-      assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
-      O << "#0.0";
-      break;
-    case MachineOperand::MO_BlockAddress:
-    case MachineOperand::MO_ConstantPoolIndex:
-    case MachineOperand::MO_GlobalAddress:
-    case MachineOperand::MO_ExternalSymbol:
-      return printSymbolicAddress(MO, false, "", O);
-    }
-    return false;
-  }
 
-  // We have a real modifier to handle.
+  if (!ExtraCode)
+    ExtraCode = "";
+
   switch(ExtraCode[0]) {
   default:
-    // See if this is a generic operand
-    return AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O);
-  case 'c': // Don't print "#" before an immediate operand.
-    if (!MI->getOperand(OpNum).isImm())
-      return true;
-    O << MI->getOperand(OpNum).getImm();
-    return false;
+    if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+        return false;
+    break;
   case 'w':
     // Output 32-bit general register operand, constant zero as wzr, or stack
     // pointer as wsp. Ignored when used with other operand types.
-    return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::GPR32RegClass, O);
+    if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                    AArch64::GPR32RegClass, O))
+      return false;
+    break;
   case 'x':
     // Output 64-bit general register operand, constant zero as xzr, or stack
     // pointer as sp. Ignored when used with other operand types.
-    return printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::GPR64RegClass, O);
+    if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                    AArch64::GPR64RegClass, O))
+      return false;
+    break;
   case 'H':
     // Output higher numbered of a 64-bit general register pair
   case 'Q':
@@ -221,40 +184,65 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     // copies ...).
     llvm_unreachable("FIXME: Unimplemented register pairs");
   case 'b':
-    // Output 8-bit FP/SIMD scalar register operand, prefixed with b.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR8RegClass, O);
   case 'h':
-    // Output 16-bit FP/SIMD scalar register operand, prefixed with h.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR16RegClass, O);
   case 's':
-    // Output 32-bit FP/SIMD scalar register operand, prefixed with s.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR32RegClass, O);
   case 'd':
-    // Output 64-bit FP/SIMD scalar register operand, prefixed with d.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR64RegClass, O);
   case 'q':
-    // Output 128-bit FP/SIMD scalar register operand, prefixed with q.
-    return printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                      AArch64::FPR128RegClass, O);
+    if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
+                                    ExtraCode[0], O))
+      return false;
+    break;
   case 'A':
     // Output symbolic address with appropriate relocation modifier (also
     // suitable for ADRP).
-    return printSymbolicAddress(MI->getOperand(OpNum), false, "", O);
+    if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O))
+      return false;
+    break;
   case 'L':
     // Output bits 11:0 of symbolic address with appropriate :lo12: relocation
     // modifier.
-    return printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O);
+    if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O))
+      return false;
+    break;
   case 'G':
     // Output bits 23:12 of symbolic address with appropriate :hi12: relocation
     // modifier (currently only for TLS local exec).
-    return printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O);
+    if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O))
+      return false;
+    break;
+  case 'a':
+    return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
   }
 
+  // There's actually no operand modifier, which leads to a slightly eclectic
+  // set of behaviour which we have to handle here.
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default:
+    llvm_unreachable("Unexpected operand for inline assembly");
+  case MachineOperand::MO_Register:
+    // GCC prints the unmodified operand of a 'w' constraint as the vector
+    // register. Technically, we could allocate the argument as a VPR128, but
+    // that leads to extremely dodgy copies being generated to get the data
+    // there.
+    if (printModifiedFPRAsmOperand(MO, TRI, 'v', O))
+      O << AArch64InstPrinter::getRegisterName(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    O << '#' << MO.getImm();
+    break;
+  case MachineOperand::MO_FPImmediate:
+    assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
+    O << "#0.0";
+    break;
+  case MachineOperand::MO_BlockAddress:
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+    return printSymbolicAddress(MO, false, "", O);
+  }
 
+  return false;
 }
 
 bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
@@ -271,24 +259,6 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   return false;
 }
 
-void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
-                                               raw_ostream &OS) {
-  unsigned NOps = MI->getNumOperands();
-  assert(NOps==4);
-  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-  // cast away const; DIetc do not take const operands for some reason.
-  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
-  OS << V.getName();
-  OS << " <- ";
-  // Frame address.  Currently handles register +- offset only.
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
-  OS << '[' << AArch64InstPrinter::getRegisterName(MI->getOperand(0).getReg());
-  OS << '+' << MI->getOperand(1).getImm();
-  OS << ']';
-  OS << "+" << MI->getOperand(NOps - 2).getImm();
-}
-
-
 #include "AArch64GenMCPseudoLowering.inc"
 
 void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
@@ -296,18 +266,6 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   if (emitPseudoExpansionLowering(OutStreamer, MI))
     return;
 
-  switch (MI->getOpcode()) {
-  case AArch64::DBG_VALUE: {
-    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
-      SmallString<128> TmpStr;
-      raw_svector_ostream OS(TmpStr);
-      PrintDebugValueComment(MI, OS);
-      OutStreamer.EmitRawText(StringRef(OS.str()));
-    }
-    return;
-  }
-  }
-
   MCInst TmpInst;
   LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this);
   OutStreamer.EmitInstruction(TmpInst);
@@ -329,7 +287,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
       for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
         OutStreamer.EmitLabel(Stubs[i].first);
         OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
-                                    TD->getPointerSize(0), 0);
+                                    TD->getPointerSize(0));
       }
       Stubs.clear();
     }
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h
index af0c9fe..824f003 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.h
+++ b/lib/Target/AArch64/AArch64AsmPrinter.h
@@ -55,8 +55,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter {
                              unsigned AsmVariant, const char *ExtraCode,
                              raw_ostream &O);
 
-  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-
   /// printSymbolicAddress - Given some kind of reasonably bare symbolic
   /// reference, print out the appropriate asm string to represent it. If
   /// appropriate, a relocation-specifier will be produced, composed of a
@@ -67,8 +65,6 @@ class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter {
                             bool PrintImmediatePrefix,
                             StringRef Suffix, raw_ostream &O);
 
-  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
-
   virtual const char *getPassName() const {
     return "AArch64 Assembly Printer";
   }
diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
index 71233ba..11e7f41 100644
--- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp
+++ b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
@@ -87,7 +87,7 @@ namespace {
         // If the block size isn't a multiple of the known bits, assume the
         // worst case padding.
         if (Size & ((1u << Bits) - 1))
-          Bits = CountTrailingZeros_32(Size);
+          Bits = countTrailingZeros(Size);
         return Bits;
       }
 
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
index b880d83..a2a9f3f 100644
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ b/lib/Target/AArch64/AArch64CallingConv.td
@@ -59,9 +59,9 @@ def CC_A64_APCS : CallingConv<[
   // Canonicalise the various types that live in different floating-point
   // registers. This makes sense because the PCS does not distinguish Short
   // Vectors and Floating-point types.
-  CCIfType<[v2i8], CCBitConvertToType<f16>>,
-  CCIfType<[v4i8, v2i16], CCBitConvertToType<f32>>,
-  CCIfType<[v8i8, v4i16, v2i32, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v1i16, v2i8], CCBitConvertToType<f16>>,
+  CCIfType<[v1i32, v4i8, v2i16, v1f32], CCBitConvertToType<f32>>,
+  CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType<f64>>,
   CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
            CCBitConvertToType<f128>>,
 
@@ -70,7 +70,8 @@ def CC_A64_APCS : CallingConv<[
   // argument is allocated to the least significant bits of register
   // v[NSRN]. The NSRN is incremented by one. The argument has now been
   // allocated."
-  CCIfType<[f16],  CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
+  CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
+  CCIfType<[f16],  CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
   CCIfType<[f32],  CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
   CCIfType<[f64],  CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index daa7f1d..7318230 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -54,7 +54,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
   MachineModuleInfo &MMI = MF.getMMI();
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   bool NeedsFrameMoves = MMI.hasDebugInfo()
     || MF.getFunction()->needsUnwindTableEntry();
 
@@ -97,8 +97,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
       .addSym(SPLabel);
 
     MachineLocation Dst(MachineLocation::VirtualFP);
-    MachineLocation Src(AArch64::XSP, NumInitialBytes);
-    Moves.push_back(MachineMove(SPLabel, Dst, Src));
+    unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
+    MMI.addFrameInst(
+        MCCFIInstruction::createDefCfa(SPLabel, Reg, -NumInitialBytes));
   }
 
   // Otherwise we need to set the frame pointer and/or add a second stack
@@ -131,9 +132,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
         MCSymbol *FPLabel = MMI.getContext().CreateTempSymbol();
         BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::PROLOG_LABEL))
           .addSym(FPLabel);
-        MachineLocation Dst(MachineLocation::VirtualFP);
-        MachineLocation Src(AArch64::X29, -MFI->getObjectOffset(X29FrameIdx));
-        Moves.push_back(MachineMove(FPLabel, Dst, Src));
+        unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true);
+        unsigned Offset = MFI->getObjectOffset(X29FrameIdx);
+        MMI.addFrameInst(MCCFIInstruction::createDefCfa(FPLabel, Reg, Offset));
       }
 
       FPNeedsSetting = false;
@@ -164,8 +165,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
       .addSym(CSLabel);
 
     MachineLocation Dst(MachineLocation::VirtualFP);
-    MachineLocation Src(AArch64::XSP, NumResidualBytes + NumInitialBytes);
-    Moves.push_back(MachineMove(CSLabel, Dst, Src));
+    unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
+    unsigned Offset = NumResidualBytes + NumInitialBytes;
+    MMI.addFrameInst(MCCFIInstruction::createDefCfa(CSLabel, Reg, -Offset));
   }
 
   // And any callee-saved registers (it's fine to leave them to the end here,
@@ -180,10 +182,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
 
     for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
            E = CSI.end(); I != E; ++I) {
-      MachineLocation Dst(MachineLocation::VirtualFP,
-                          MFI->getObjectOffset(I->getFrameIdx()));
-      MachineLocation Src(I->getReg());
-      Moves.push_back(MachineMove(CSLabel, Dst, Src));
+      unsigned Offset = MFI->getObjectOffset(I->getFrameIdx());
+      unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true);
+      MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, Reg, Offset));
     }
   }
 }
@@ -424,7 +425,7 @@ AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB,
                                       MachineBasicBlock::iterator MBBI,
                                       const std::vector<CalleeSavedInfo> &CSI,
                                       const TargetRegisterInfo *TRI,
-                                      LoadStoreMethod PossClasses[],
+                                      const LoadStoreMethod PossClasses[],
                                       unsigned NumClasses) const {
   DebugLoc DL = MBB.findDebugLoc(MBBI);
   MachineFunction &MF = *MBB.getParent();
@@ -527,11 +528,11 @@ AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   if (CSI.empty())
     return false;
 
-  static LoadStoreMethod PossibleClasses[] = {
+  static const LoadStoreMethod PossibleClasses[] = {
     {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR},
     {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR},
   };
-  unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+  const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
 
   emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI,
                   PossibleClasses, NumClasses);
@@ -548,11 +549,11 @@ AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   if (CSI.empty())
     return false;
 
-  static LoadStoreMethod PossibleClasses[] = {
+  static const LoadStoreMethod PossibleClasses[] = {
     {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR},
     {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR},
   };
-  unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
+  const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
 
   emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI,
                   PossibleClasses, NumClasses);
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 45ea0ec..032dd90 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -90,7 +90,7 @@ public:
                        MachineBasicBlock::iterator MI,
                        const std::vector<CalleeSavedInfo> &CSI,
                        const TargetRegisterInfo *TRI,
-                       LoadStoreMethod PossibleClasses[],
+                       const LoadStoreMethod PossibleClasses[],
                        unsigned NumClasses) const;
 
 
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 102c71b..ef99541 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -33,7 +33,6 @@ namespace {
 
 class AArch64DAGToDAGISel : public SelectionDAGISel {
   AArch64TargetMachine &TM;
-  const AArch64InstrInfo *TII;
 
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
@@ -43,7 +42,6 @@ public:
   explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
                                CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel), TM(tm),
-      TII(static_cast<const AArch64InstrInfo*>(TM.getInstrInfo())),
       Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
   }
 
@@ -72,10 +70,11 @@ public:
 
   /// Used for pre-lowered address-reference nodes, so we already know
   /// the fields match. This operand's job is simply to add an
-  /// appropriate shift operand (i.e. 0) to the MOVZ/MOVK instruction.
+  /// appropriate shift operand to the MOVZ/MOVK instruction.
+  template<unsigned LogShift>
   bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) {
     Imm = N;
-    Shift = CurDAG->getTargetConstant(0, MVT::i32);
+    Shift = CurDAG->getTargetConstant(LogShift, MVT::i32);
     return true;
   }
 
@@ -102,7 +101,7 @@ public:
 
   /// Put the given constant into a pool and return a DAG which will give its
   /// address.
-  SDValue getConstantPoolItemAddress(DebugLoc DL, const Constant *CV);
+  SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV);
 
   SDNode *TrySelectToMoveImm(SDNode *N);
   SDNode *LowerToFPLitPool(SDNode *Node);
@@ -110,6 +109,45 @@ public:
 
   SDNode* Select(SDNode*);
 private:
+  /// Get the opcode for table lookup instruction
+  unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec);
+
+  /// Select NEON table lookup intrinsics.  NumVecs should be 1, 2, 3 or 4.
+  /// IsExt is to indicate if the result will be extended with an argument.
+  SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
+
+  /// Select NEON load intrinsics.  NumVecs should be 1, 2, 3 or 4.
+  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *Opcode);
+
+  /// Select NEON store intrinsics.  NumVecs should be 1, 2, 3 or 4.
+  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+                    const uint16_t *Opcodes);
+
+  /// Form sequences of consecutive 64/128-bit registers for use in NEON
+  /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
+  /// between 1 and 4 elements. If it contains a single element that is returned
+  /// unchanged; otherwise a REG_SEQUENCE value is returned.
+  SDValue createDTuple(ArrayRef<SDValue> Vecs);
+  SDValue createQTuple(ArrayRef<SDValue> Vecs);
+
+  /// Generic helper for the createDTuple/createQTuple
+  /// functions. Those should almost always be called instead.
+  SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
+                      unsigned SubRegs[]);
+
+  /// Select NEON load-duplicate intrinsics.  NumVecs should be 2, 3 or 4.
+  /// The opcode array specifies the instructions used for load.
+  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+                       const uint16_t *Opcodes);
+
+  /// Select NEON load/store lane intrinsics.  NumVecs should be 2, 3 or 4.
+  /// The opcode arrays specify the instructions used for load/store.
+  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+                          unsigned NumVecs, const uint16_t *Opcodes);
+
+  SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+                               SDValue Operand);
 };
 }
 
@@ -191,7 +229,7 @@ bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) {
 
 SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
   SDNode *ResNode;
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   EVT DestType = Node->getValueType(0);
   unsigned DestWidth = DestType.getSizeInBits();
 
@@ -241,14 +279,14 @@ SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
 }
 
 SDValue
-AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL,
+AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL,
                                                 const Constant *CV) {
-  EVT PtrVT = TLI.getPointerTy();
+  EVT PtrVT = getTargetLowering()->getPointerTy();
 
-  switch (TLI.getTargetMachine().getCodeModel()) {
+  switch (getTargetLowering()->getTargetMachine().getCodeModel()) {
   case CodeModel::Small: {
     unsigned Alignment =
-        TLI.getDataLayout()->getABITypeAlignment(CV->getType());
+      getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
     return CurDAG->getNode(
         AArch64ISD::WrapperSmall, DL, PtrVT,
         CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG),
@@ -260,15 +298,15 @@ AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL,
     LitAddr = CurDAG->getMachineNode(
         AArch64::MOVZxii, DL, PtrVT,
         CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
-        CurDAG->getTargetConstant(0, MVT::i32));
+        CurDAG->getTargetConstant(3, MVT::i32));
     LitAddr = CurDAG->getMachineNode(
         AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
         CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
-        CurDAG->getTargetConstant(0, MVT::i32));
+        CurDAG->getTargetConstant(2, MVT::i32));
     LitAddr = CurDAG->getMachineNode(
         AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
         CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
-        CurDAG->getTargetConstant(0, MVT::i32));
+        CurDAG->getTargetConstant(1, MVT::i32));
     LitAddr = CurDAG->getMachineNode(
         AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
         CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC),
@@ -281,7 +319,7 @@ AArch64DAGToDAGISel::getConstantPoolItemAddress(DebugLoc DL,
 }
 
 SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   uint64_t UnsignedVal = cast<ConstantSDNode>(Node)->getZExtValue();
   int64_t SignedVal = cast<ConstantSDNode>(Node)->getSExtValue();
   EVT DestType = Node->getValueType(0);
@@ -312,7 +350,8 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
                                                   MemType.getSizeInBits()),
                                   UnsignedVal);
   SDValue PoolAddr = getConstantPoolItemAddress(DL, CV);
-  unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(CV->getType());
+  unsigned Alignment =
+    getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
 
   return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(),
                             PoolAddr,
@@ -323,11 +362,12 @@ SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
 }
 
 SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   const ConstantFP *FV = cast<ConstantFPSDNode>(Node)->getConstantFPValue();
   EVT DestType = Node->getValueType(0);
 
-  unsigned Alignment = TLI.getDataLayout()->getABITypeAlignment(FV->getType());
+  unsigned Alignment =
+    getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType());
   SDValue PoolAddr = getConstantPoolItemAddress(DL, FV);
 
   return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr,
@@ -389,12 +429,607 @@ SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
                               &Ops[0], Ops.size());
 }
 
+SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
+  static unsigned RegClassIDs[] = { AArch64::DPairRegClassID,
+                                    AArch64::DTripleRegClassID,
+                                    AArch64::DQuadRegClassID };
+  static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1,
+                                AArch64::dsub_2, AArch64::dsub_3 };
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
+  static unsigned RegClassIDs[] = { AArch64::QPairRegClassID,
+                                    AArch64::QTripleRegClassID,
+                                    AArch64::QQuadRegClassID };
+  static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1,
+                                AArch64::qsub_2, AArch64::qsub_3 };
+
+  return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
+SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
+                                         unsigned RegClassIDs[],
+                                         unsigned SubRegs[]) {
+  // There's no special register-class for a vector-list of 1 element: it's just
+  // a vector.
+  if (Regs.size() == 1)
+    return Regs[0];
+
+  assert(Regs.size() >= 2 && Regs.size() <= 4);
+
+  SDLoc DL(Regs[0].getNode());
+
+  SmallVector<SDValue, 4> Ops;
+
+  // First operand of REG_SEQUENCE is the desired RegClass.
+  Ops.push_back(
+      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32));
+
+  // Then we get pairs of source & subregister-position for the components.
+  for (unsigned i = 0; i < Regs.size(); ++i) {
+    Ops.push_back(Regs[i]);
+    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32));
+  }
+
+  SDNode *N =
+      CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
+  return SDValue(N, 0);
+}
+
+
+// Get the register stride update opcode of a VLD/VST instruction that
+// is otherwise equivalent to the given fixed stride updating instruction.
+static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
+  switch (Opc) {
+  default: break;
+  case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register;
+  case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register;
+  case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register;
+  case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register;
+  case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register;
+  case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register;
+  case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register;
+  case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register;
+
+  case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register;
+  case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register;
+  case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register;
+  case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register;
+  case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register;
+  case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register;
+  case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register;
+
+  case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register;
+  case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register;
+  case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register;
+  case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register;
+  case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register;
+  case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register;
+  case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register;
+
+  case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register;
+  case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register;
+  case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register;
+  case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register;
+  case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register;
+  case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register;
+  case AArch64::LD4WB_2D_fixed: return AArch64::LD4WB_2D_register;
+
+  case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register;
+  case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register;
+  case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register;
+  case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register;
+  case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register;
+  case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register;
+  case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register;
+  case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register;
+
+  case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register;
+  case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register;
+  case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register;
+  case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register;
+  case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register;
+  case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register;
+  case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register;
+  case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register;
+
+  case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register;
+  case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register;
+  case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register;
+  case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register;
+  case AArch64::LD1x4WB_16B_fixed: return AArch64::LD1x4WB_16B_register;
+  case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register;
+  case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register;
+  case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register;
+
+  case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register;
+  case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register;
+  case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register;
+  case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register;
+  case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register;
+  case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register;
+  case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register;
+  case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register;
+
+  case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register;
+  case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register;
+  case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register;
+  case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register;
+  case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register;
+  case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register;
+  case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register;
+
+  case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register;
+  case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register;
+  case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register;
+  case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register;
+  case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register;
+  case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register;
+  case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register;
+
+  case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register;
+  case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register;
+  case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register;
+  case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register;
+  case AArch64::ST4WB_8H_fixed: return AArch64::ST4WB_8H_register;
+  case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register;
+  case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register;
+
+  case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register;
+  case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register;
+  case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register;
+  case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register;
+  case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register;
+  case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register;
+  case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register;
+  case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register;
+
+  case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register;
+  case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register;
+  case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register;
+  case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register;
+  case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register;
+  case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register;
+  case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register;
+  case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register;
+
+  case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register;
+  case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register;
+  case AArch64::ST1x4WB_2S_fixed: return AArch64::ST1x4WB_2S_register;
+  case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register;
+  case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register;
+  case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register;
+  case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register;
+  case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register;
+
+  // Post-index of duplicate loads
+  case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register;
+  case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register;
+  case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register;
+  case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register;
+  case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register;
+  case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register;
+  case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register;
+  case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register;
+
+  case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register;
+  case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register;
+  case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register;
+  case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register;
+  case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register;
+  case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register;
+  case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register;
+  case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register;
+
+  case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register;
+  case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register;
+  case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register;
+  case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register;
+  case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register;
+  case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register;
+  case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register;
+  case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register;
+
+  // Post-index of lane loads
+  case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register;
+  case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register;
+  case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register;
+  case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register;
+
+  case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register;
+  case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register;
+  case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register;
+  case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register;
+
+  case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register;
+  case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register;
+  case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register;
+  case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register;
+
+  // Post-index of lane stores
+  case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register;
+  case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register;
+  case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register;
+  case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register;
+
+  case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register;
+  case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register;
+  case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register;
+  case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register;
+
+  case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register;
+  case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register;
+  case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register;
+  case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register;
+  }
+  return Opc; // If not one we handle, return it unchanged.
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating,
+                                       unsigned NumVecs,
+                                       const uint16_t *Opcodes) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+
+  EVT VT = N->getValueType(0);
+  unsigned OpcodeIndex;
+  bool is64BitVector = VT.is64BitVector();
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+  default: llvm_unreachable("unhandled vector load type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SmallVector<SDValue, 2> Ops;
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+
+  Ops.push_back(N->getOperand(0)); // Push back the Chain
+
+  SmallVector<EVT, 3> ResTys;
+  // Push back the type of return super register
+  if (NumVecs == 1)
+    ResTys.push_back(VT);
+  else if (NumVecs == 3)
+    ResTys.push_back(MVT::Untyped);
+  else {
+    EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+                                 is64BitVector ? NumVecs : NumVecs * 2);
+    ResTys.push_back(ResTy);
+  }
+
+  if (isUpdating)
+    ResTys.push_back(MVT::i64); // Type of the updated register
+  ResTys.push_back(MVT::Other); // Type of the Chain
+  SDLoc dl(N);
+  SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+
+  if (NumVecs == 1)
+    return VLd;
+
+  // If NumVecs > 1, the return result is a super register containing 2-4
+  // consecutive vector registers.
+  SDValue SuperReg = SDValue(VLd, 0);
+
+  unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  // Update users of the Chain
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
+
+  return NULL;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating,
+                                       unsigned NumVecs,
+                                       const uint16_t *Opcodes) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  SDLoc dl(N);
+
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3;
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  unsigned OpcodeIndex;
+  bool is64BitVector = VT.is64BitVector();
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+  default: llvm_unreachable("unhandled vector store type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SmallVector<EVT, 2> ResTys;
+  if (isUpdating)
+    ResTys.push_back(MVT::i64);
+  ResTys.push_back(MVT::Other); // Type for the Chain
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+                               N->op_begin() + Vec0Idx + NumVecs);
+  SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs);
+  Ops.push_back(SrcReg);
+
+  // Push back the Chain
+  Ops.push_back(N->getOperand(0));
+
+  // Transfer memoperands.
+  SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+
+  return VSt;
+}
+
+SDValue
+AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+                                          SDValue Operand) {
+  SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL,
+                        VT, VTD, MVT::Other,
+                        CurDAG->getTargetConstant(0, MVT::i64),
+                        Operand,
+                        CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32));
+  return SDValue(Reg, 0);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+                                          unsigned NumVecs,
+                                          const uint16_t *Opcodes) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range");
+  SDLoc dl(N);
+
+  EVT VT = N->getValueType(0);
+  unsigned OpcodeIndex;
+  bool is64BitVector = VT.is64BitVector();
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+  default: llvm_unreachable("unhandled vector duplicate lane load type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SDValue SuperReg;
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(1)); // Push back the Memory Address
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(2);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+  Ops.push_back(N->getOperand(0)); // Push back the Chain
+
+  SmallVector<EVT, 3> ResTys;
+  // Push back the type of return super register
+  if (NumVecs == 3)
+    ResTys.push_back(MVT::Untyped);
+  else {
+    EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+                                 is64BitVector ? NumVecs : NumVecs * 2);
+    ResTys.push_back(ResTy);
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i64); // Type of the updated register
+  ResTys.push_back(MVT::Other); // Type of the Chain
+  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Transfer memoperands.
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+
+  SuperReg = SDValue(VLdDup, 0);
+  unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
+  // Update uses of each registers in super register
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    ReplaceUses(SDValue(N, Vec),
+                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+  // Update uses of the Chain
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
+  return NULL;
+}
+
+// We only have 128-bit vector type of load/store lane instructions.
+// If it is 64-bit vector, we also select it to the 128-bit instructions.
+// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and
+// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output.
+SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
+                                             bool isUpdating, unsigned NumVecs,
+                                             const uint16_t *Opcodes) {
+  assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+  SDLoc dl(N);
+  unsigned AddrOpIdx = isUpdating ? 1 : 2;
+  unsigned Vec0Idx = 3;
+
+  SDValue Chain = N->getOperand(0);
+  unsigned Lane =
+      cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+  EVT VT = N->getOperand(Vec0Idx).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+  EVT VT64; // 64-bit Vector Type
+
+  if (is64BitVector) {
+    VT64 = VT;
+    VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(),
+                          VT.getVectorNumElements() * 2);
+  }
+
+  unsigned OpcodeIndex;
+  switch (VT.getScalarType().getSizeInBits()) {
+  case 8: OpcodeIndex = 0; break;
+  case 16: OpcodeIndex = 1; break;
+  case 32: OpcodeIndex = 2; break;
+  case 64: OpcodeIndex = 3; break;
+  default: llvm_unreachable("unhandled vector lane load/store type");
+  }
+  unsigned Opc = Opcodes[OpcodeIndex];
+
+  SmallVector<EVT, 3> ResTys;
+  if (IsLoad) {
+    // Push back the type of return super register
+    if (NumVecs == 3)
+      ResTys.push_back(MVT::Untyped);
+    else {
+      EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+                                   is64BitVector ? NumVecs : NumVecs * 2);
+      ResTys.push_back(ResTy);
+    }
+  }
+  if (isUpdating)
+    ResTys.push_back(MVT::i64); // Type of the updated register
+  ResTys.push_back(MVT::Other); // Type of Chain
+  SmallVector<SDValue, 5> Ops;
+  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+  if (isUpdating) {
+    SDValue Inc = N->getOperand(AddrOpIdx + 1);
+    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+      Opc = getVLDSTRegisterUpdateOpcode(Opc);
+    Ops.push_back(Inc);
+  }
+
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+                               N->op_begin() + Vec0Idx + NumVecs);
+  if (is64BitVector)
+    for (unsigned i = 0; i < Regs.size(); i++)
+      Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]);
+  SDValue SuperReg = createQTuple(Regs);
+
+  Ops.push_back(SuperReg); // Source Reg
+  SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32);
+  Ops.push_back(LaneValue);
+  Ops.push_back(Chain); // Push back the Chain
+
+  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+  if (!IsLoad)
+    return VLdLn;
+
+  // Extract the subregisters.
+  SuperReg = SDValue(VLdLn, 0);
+  unsigned Sub0 = AArch64::qsub_0;
+  // Update uses of each registers in super register
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+    SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg);
+    if (is64BitVector) {
+      SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0);
+    }
+    ReplaceUses(SDValue(N, Vec), SUB0);
+  }
+  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+  if (isUpdating)
+    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+  return NULL;
+}
+
+unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit,
+                                        unsigned NumOfVec) {
+  assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range");
+
+  unsigned Opc = 0;
+  switch (NumOfVec) {
+  default:
+    break;
+  case 1:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b;
+    break;
+  case 2:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b;
+    break;
+  case 3:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b;
+    break;
+  case 4:
+    if (IsExt)
+      Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b;
+    else
+      Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b;
+    break;
+  }
+
+  return Opc;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs,
+                                        bool IsExt) {
+  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  SDLoc dl(N);
+
+  // Check the element of look up table is 64-bit or not
+  unsigned Vec0Idx = IsExt ? 2 : 1;
+  assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() &&
+         "The element of lookup table for vtbl and vtbx must be 128-bit");
+
+  // Check the return value type is 64-bit or not
+  EVT ResVT = N->getValueType(0);
+  bool is64BitRes = ResVT.is64BitVector();
+
+  // Create new SDValue for vector list
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+                               N->op_begin() + Vec0Idx + NumVecs);
+  SDValue TblReg = createQTuple(Regs);
+  unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs);
+
+  SmallVector<SDValue, 3> Ops;
+  if (IsExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(TblReg);
+  Ops.push_back(N->getOperand(Vec0Idx + NumVecs));
+  return CurDAG->getMachineNode(Opc, dl, ResVT, Ops);
+}
+
 SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
   // Dump information about the Node being selected
   DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
 
   if (Node->isMachineOpcode()) {
     DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+    Node->setNodeId(-1);
     return NULL;
   }
 
@@ -473,7 +1108,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
                         AArch64::ATOMIC_CMP_SWAP_I64);
   case ISD::FrameIndex: {
     int FI = cast<FrameIndexSDNode>(Node)->getIndex();
-    EVT PtrTy = TLI.getPointerTy();
+    EVT PtrTy = getTargetLowering()->getPointerTy();
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy);
     return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy,
                                 TFI, CurDAG->getTargetConstant(0, PtrTy));
@@ -497,7 +1132,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
       assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type");
       uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR;
       ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
-                                       Node->getDebugLoc(),
+                                       SDLoc(Node),
                                        Register, Ty).getNode();
     }
 
@@ -534,6 +1169,399 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
     Node = ResNode;
     break;
   }
+  case AArch64ISD::NEON_LD1_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1WB_8B_fixed,  AArch64::LD1WB_4H_fixed,
+      AArch64::LD1WB_2S_fixed,  AArch64::LD1WB_1D_fixed,
+      AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed,
+      AArch64::LD1WB_4S_fixed,  AArch64::LD1WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 1, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD2WB_8B_fixed,  AArch64::LD2WB_4H_fixed,
+      AArch64::LD2WB_2S_fixed,  AArch64::LD1x2WB_1D_fixed,
+      AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed,
+      AArch64::LD2WB_4S_fixed,  AArch64::LD2WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD3WB_8B_fixed,  AArch64::LD3WB_4H_fixed,
+      AArch64::LD3WB_2S_fixed,  AArch64::LD1x3WB_1D_fixed,
+      AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed,
+      AArch64::LD3WB_4S_fixed,  AArch64::LD3WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD4WB_8B_fixed,  AArch64::LD4WB_4H_fixed,
+      AArch64::LD4WB_2S_fixed,  AArch64::LD1x4WB_1D_fixed,
+      AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed,
+      AArch64::LD4WB_4S_fixed,  AArch64::LD4WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD1x2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1x2WB_8B_fixed,  AArch64::LD1x2WB_4H_fixed,
+      AArch64::LD1x2WB_2S_fixed,  AArch64::LD1x2WB_1D_fixed,
+      AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed,
+      AArch64::LD1x2WB_4S_fixed,  AArch64::LD1x2WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD1x3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1x3WB_8B_fixed,  AArch64::LD1x3WB_4H_fixed,
+      AArch64::LD1x3WB_2S_fixed,  AArch64::LD1x3WB_1D_fixed,
+      AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed,
+      AArch64::LD1x3WB_4S_fixed,  AArch64::LD1x3WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD1x4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD1x4WB_8B_fixed,  AArch64::LD1x4WB_4H_fixed,
+      AArch64::LD1x4WB_2S_fixed,  AArch64::LD1x4WB_1D_fixed,
+      AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed,
+      AArch64::LD1x4WB_4S_fixed,  AArch64::LD1x4WB_2D_fixed
+    };
+    return SelectVLD(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1WB_8B_fixed,  AArch64::ST1WB_4H_fixed,
+      AArch64::ST1WB_2S_fixed,  AArch64::ST1WB_1D_fixed,
+      AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed,
+      AArch64::ST1WB_4S_fixed,  AArch64::ST1WB_2D_fixed
+    };
+    return SelectVST(Node, true, 1, Opcodes);
+  }
+  case AArch64ISD::NEON_ST2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST2WB_8B_fixed,  AArch64::ST2WB_4H_fixed,
+      AArch64::ST2WB_2S_fixed,  AArch64::ST1x2WB_1D_fixed,
+      AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed,
+      AArch64::ST2WB_4S_fixed,  AArch64::ST2WB_2D_fixed
+    };
+    return SelectVST(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_ST3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST3WB_8B_fixed,  AArch64::ST3WB_4H_fixed,
+      AArch64::ST3WB_2S_fixed,  AArch64::ST1x3WB_1D_fixed,
+      AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed,
+      AArch64::ST3WB_4S_fixed,  AArch64::ST3WB_2D_fixed
+    };
+    return SelectVST(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_ST4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST4WB_8B_fixed,  AArch64::ST4WB_4H_fixed,
+      AArch64::ST4WB_2S_fixed,  AArch64::ST1x4WB_1D_fixed,
+      AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed,
+      AArch64::ST4WB_4S_fixed,  AArch64::ST4WB_2D_fixed
+    };
+    return SelectVST(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2DUP: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S,
+        AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H,
+        AArch64::LD2R_4S, AArch64::LD2R_2D
+    };
+    return SelectVLDDup(Node, false, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3DUP: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S,
+        AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H,
+        AArch64::LD3R_4S, AArch64::LD3R_2D
+    };
+    return SelectVLDDup(Node, false, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4DUP: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S,
+        AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H,
+        AArch64::LD4R_4S, AArch64::LD4R_2D
+    };
+    return SelectVLDDup(Node, false, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2DUP_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD2R_WB_8B_fixed,  AArch64::LD2R_WB_4H_fixed,
+      AArch64::LD2R_WB_2S_fixed,  AArch64::LD2R_WB_1D_fixed,
+      AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed,
+      AArch64::LD2R_WB_4S_fixed,  AArch64::LD2R_WB_2D_fixed
+    };
+    return SelectVLDDup(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3DUP_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD3R_WB_8B_fixed,  AArch64::LD3R_WB_4H_fixed,
+      AArch64::LD3R_WB_2S_fixed,  AArch64::LD3R_WB_1D_fixed,
+      AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed,
+      AArch64::LD3R_WB_4S_fixed,  AArch64::LD3R_WB_2D_fixed
+    };
+    return SelectVLDDup(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4DUP_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::LD4R_WB_8B_fixed,  AArch64::LD4R_WB_4H_fixed,
+      AArch64::LD4R_WB_2S_fixed,  AArch64::LD4R_WB_1D_fixed,
+      AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed,
+      AArch64::LD4R_WB_4S_fixed,  AArch64::LD4R_WB_2D_fixed
+    };
+    return SelectVLDDup(Node, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_LD2LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed,
+        AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, true, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_LD3LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed,
+        AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, true, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_LD4LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed,
+        AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, true, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_ST2LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed,
+        AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, false, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_ST3LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed,
+        AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, false, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_ST4LN_UPD: {
+    static const uint16_t Opcodes[] = {
+        AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed,
+        AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed
+    };
+    return SelectVLDSTLane(Node, false, true, 4, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1x2_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1x2WB_8B_fixed,  AArch64::ST1x2WB_4H_fixed,
+      AArch64::ST1x2WB_2S_fixed,  AArch64::ST1x2WB_1D_fixed,
+      AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed,
+      AArch64::ST1x2WB_4S_fixed,  AArch64::ST1x2WB_2D_fixed
+    };
+    return SelectVST(Node, true, 2, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1x3_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1x3WB_8B_fixed,  AArch64::ST1x3WB_4H_fixed,
+      AArch64::ST1x3WB_2S_fixed,  AArch64::ST1x3WB_1D_fixed,
+      AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed,
+      AArch64::ST1x3WB_4S_fixed,  AArch64::ST1x3WB_2D_fixed
+    };
+    return SelectVST(Node, true, 3, Opcodes);
+  }
+  case AArch64ISD::NEON_ST1x4_UPD: {
+    static const uint16_t Opcodes[] = {
+      AArch64::ST1x4WB_8B_fixed,  AArch64::ST1x4WB_4H_fixed,
+      AArch64::ST1x4WB_2S_fixed,  AArch64::ST1x4WB_1D_fixed,
+      AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed,
+      AArch64::ST1x4WB_4S_fixed,  AArch64::ST1x4WB_2D_fixed
+    };
+    return SelectVST(Node, true, 4, Opcodes);
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+    bool IsExt = false;
+    switch (IntNo) {
+      default:
+        break;
+      case Intrinsic::aarch64_neon_vtbx1:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl1:
+        return SelectVTBL(Node, 1, IsExt);
+      case Intrinsic::aarch64_neon_vtbx2:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl2:
+        return SelectVTBL(Node, 2, IsExt);
+      case Intrinsic::aarch64_neon_vtbx3:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl3:
+        return SelectVTBL(Node, 3, IsExt);
+      case Intrinsic::aarch64_neon_vtbx4:
+        IsExt = true;
+      case Intrinsic::aarch64_neon_vtbl4:
+        return SelectVTBL(Node, 4, IsExt);
+    }
+    break;
+  }
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::arm_neon_vld1: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1_8B,  AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D,
+          AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D
+      };
+      return SelectVLD(Node, false, 1, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD2_8B,  AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D,
+          AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D
+      };
+      return SelectVLD(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD3_8B,  AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D,
+          AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D
+      };
+      return SelectVLD(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD4_8B,  AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D,
+          AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D
+      };
+      return SelectVLD(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vld1x2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1x2_8B, AArch64::LD1x2_4H,  AArch64::LD1x2_2S,
+          AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H,
+          AArch64::LD1x2_4S, AArch64::LD1x2_2D
+      };
+      return SelectVLD(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vld1x3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1x3_8B, AArch64::LD1x3_4H,  AArch64::LD1x3_2S,
+          AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H,
+          AArch64::LD1x3_4S, AArch64::LD1x3_2D
+      };
+      return SelectVLD(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vld1x4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD1x4_8B, AArch64::LD1x4_4H,  AArch64::LD1x4_2S,
+          AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H,
+          AArch64::LD1x4_4S, AArch64::LD1x4_2D
+      };
+      return SelectVLD(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst1: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1_8B,  AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D,
+          AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D
+      };
+      return SelectVST(Node, false, 1, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST2_8B,  AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D,
+          AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D
+      };
+      return SelectVST(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST3_8B,  AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D,
+          AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D
+      };
+      return SelectVST(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST4_8B,  AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D,
+          AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D
+      };
+      return SelectVST(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vst1x2: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1x2_8B, AArch64::ST1x2_4H,  AArch64::ST1x2_2S,
+          AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
+          AArch64::ST1x2_4S, AArch64::ST1x2_2D
+      };
+      return SelectVST(Node, false, 2, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vst1x3: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1x3_8B, AArch64::ST1x3_4H,  AArch64::ST1x3_2S,
+          AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
+          AArch64::ST1x3_4S, AArch64::ST1x3_2D
+      };
+      return SelectVST(Node, false, 3, Opcodes);
+    }
+    case Intrinsic::aarch64_neon_vst1x4: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST1x4_8B, AArch64::ST1x4_4H,  AArch64::ST1x4_2S,
+          AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
+          AArch64::ST1x4_4S, AArch64::ST1x4_2D
+      };
+      return SelectVST(Node, false, 4, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld2lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D
+      };
+      return SelectVLDSTLane(Node, true, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld3lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D
+      };
+      return SelectVLDSTLane(Node, true, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vld4lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D
+      };
+      return SelectVLDSTLane(Node, true, false, 4, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst2lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D
+      };
+      return SelectVLDSTLane(Node, false, false, 2, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst3lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D
+      };
+      return SelectVLDSTLane(Node, false, false, 3, Opcodes);
+    }
+    case Intrinsic::arm_neon_vst4lane: {
+      static const uint16_t Opcodes[] = {
+          AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D
+      };
+      return SelectVLDSTLane(Node, false, false, 4, Opcodes);
+    }
+    } // End of switch IntNo
+    break;
+  } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
   default:
     break; // Let generic code handle it
   }
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 56f6751..4fdb667 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -39,12 +39,10 @@ static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
   llvm_unreachable("unknown subtarget type");
 }
 
-
 AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
-  : TargetLowering(TM, createTLOF(TM)),
-    Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
-    RegInfo(TM.getRegisterInfo()),
-    Itins(TM.getInstrItineraryData()) {
+  : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
+
+  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
 
   // SIMD compares set the entire lane's bits to 1
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -52,10 +50,34 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   // Scalar register <-> type mapping
   addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
   addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
-  addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
-  addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
-  addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
-  addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+
+  if (Subtarget->hasFPARMv8()) {
+    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+  }
+
+  if (Subtarget->hasNEON()) {
+    // And the vectors
+    addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
+    addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
+    addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v1f32, &AArch64::FPR32RegClass);
+    addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v8i8,  &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
+    addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
+  }
 
   computeRegisterProperties();
 
@@ -64,6 +86,12 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
 
   setTargetDAGCombine(ISD::AND);
   setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::SHL);
+
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
 
   // AArch64 does not have i1 loads, or much of anything for i1 really.
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
@@ -253,14 +281,97 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
 
-  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
-  setOperationAction(ISD::EHSELECTION, MVT::i64, Expand);
-
   setExceptionPointerRegister(AArch64::X0);
   setExceptionSelectorRegister(AArch64::X1);
+
+  if (Subtarget->hasNEON()) {
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
+
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
+    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
+    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+    setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
+    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
+    setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
+    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+
+    setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal);
+    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v1f64, Legal);
+    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal);
+    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
+    setOperationAction(ISD::FRINT, MVT::v1f64, Legal);
+    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+
+    setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
+    setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
+  }
 }
 
-EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const {
+EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   // It's reasonably important that this value matches the "natural" legal
   // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
   // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
@@ -271,16 +382,16 @@ EVT AArch64TargetLowering::getSetCCResultType(EVT VT) const {
 static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
                                   unsigned &LdrOpc,
                                   unsigned &StrOpc) {
-  static unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
-                                 AArch64::LDXR_word, AArch64::LDXR_dword};
-  static unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
-                                AArch64::LDAXR_word, AArch64::LDAXR_dword};
-  static unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
-                                  AArch64::STXR_word, AArch64::STXR_dword};
-  static unsigned StoreRels[] = {AArch64::STLXR_byte, AArch64::STLXR_hword,
-                                 AArch64::STLXR_word, AArch64::STLXR_dword};
-
-  unsigned *LoadOps, *StoreOps;
+  static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
+                                       AArch64::LDXR_word, AArch64::LDXR_dword};
+  static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
+                                     AArch64::LDAXR_word, AArch64::LDAXR_dword};
+  static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
+                                       AArch64::STXR_word, AArch64::STXR_dword};
+  static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword,
+                                     AArch64::STLXR_word, AArch64::STLXR_dword};
+
+  const unsigned *LoadOps, *StoreOps;
   if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
     LoadOps = LoadAcqs;
   else
@@ -298,6 +409,29 @@ static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
   StrOpc = StoreOps[Log2_32(Size)];
 }
 
+// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really
+// have value type mapped, and they are both being defined as MVT::untyped.
+// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost
+// would fail to figure out the register pressure correctly.
+std::pair<const TargetRegisterClass*, uint8_t>
+AArch64TargetLowering::findRepresentativeClass(MVT VT) const{
+  const TargetRegisterClass *RRC = 0;
+  uint8_t Cost = 1;
+  switch (VT.SimpleTy) {
+  default:
+    return TargetLowering::findRepresentativeClass(VT);
+  case MVT::v4i64:
+    RRC = &AArch64::QPairRegClass;
+    Cost = 2;
+    break;
+  case MVT::v8i64:
+    RRC = &AArch64::QQuadRegClass;
+    Cost = 4;
+    break;
+  }
+  return std::make_pair(RRC, Cost);
+}
+
 MachineBasicBlock *
 AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                                         unsigned Size,
@@ -623,6 +757,12 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   MBB->addSuccessor(TrueBB);
   MBB->addSuccessor(EndBB);
 
+  if (!NZCVKilled) {
+    // NZCV is live-through TrueBB.
+    TrueBB->addLiveIn(AArch64::NZCV);
+    EndBB->addLiveIn(AArch64::NZCV);
+  }
+
   // IfTrue:
   //     str qIFTRUE, [sp]
   BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
@@ -637,8 +777,6 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
   // Done:
   //     ldr qDEST, [sp]
   //     [... rest of incoming MBB ...]
-  if (!NZCVKilled)
-    EndBB->addLiveIn(AArch64::NZCV);
   MachineInstr *StartOfEnd = EndBB->begin();
   BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
     .addFrameIndex(ScratchFI)
@@ -784,7 +922,102 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case AArch64ISD::WrapperLarge:   return "AArch64ISD::WrapperLarge";
   case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";
 
-  default:                       return NULL;
+  case AArch64ISD::NEON_BSL:
+    return "AArch64ISD::NEON_BSL";
+  case AArch64ISD::NEON_MOVIMM:
+    return "AArch64ISD::NEON_MOVIMM";
+  case AArch64ISD::NEON_MVNIMM:
+    return "AArch64ISD::NEON_MVNIMM";
+  case AArch64ISD::NEON_FMOVIMM:
+    return "AArch64ISD::NEON_FMOVIMM";
+  case AArch64ISD::NEON_CMP:
+    return "AArch64ISD::NEON_CMP";
+  case AArch64ISD::NEON_CMPZ:
+    return "AArch64ISD::NEON_CMPZ";
+  case AArch64ISD::NEON_TST:
+    return "AArch64ISD::NEON_TST";
+  case AArch64ISD::NEON_QSHLs:
+    return "AArch64ISD::NEON_QSHLs";
+  case AArch64ISD::NEON_QSHLu:
+    return "AArch64ISD::NEON_QSHLu";
+  case AArch64ISD::NEON_VDUP:
+    return "AArch64ISD::NEON_VDUP";
+  case AArch64ISD::NEON_VDUPLANE:
+    return "AArch64ISD::NEON_VDUPLANE";
+  case AArch64ISD::NEON_REV16:
+    return "AArch64ISD::NEON_REV16";
+  case AArch64ISD::NEON_REV32:
+    return "AArch64ISD::NEON_REV32";
+  case AArch64ISD::NEON_REV64:
+    return "AArch64ISD::NEON_REV64";
+  case AArch64ISD::NEON_UZP1:
+    return "AArch64ISD::NEON_UZP1";
+  case AArch64ISD::NEON_UZP2:
+    return "AArch64ISD::NEON_UZP2";
+  case AArch64ISD::NEON_ZIP1:
+    return "AArch64ISD::NEON_ZIP1";
+  case AArch64ISD::NEON_ZIP2:
+    return "AArch64ISD::NEON_ZIP2";
+  case AArch64ISD::NEON_TRN1:
+    return "AArch64ISD::NEON_TRN1";
+  case AArch64ISD::NEON_TRN2:
+    return "AArch64ISD::NEON_TRN2";
+  case AArch64ISD::NEON_LD1_UPD:
+    return "AArch64ISD::NEON_LD1_UPD";
+  case AArch64ISD::NEON_LD2_UPD:
+    return "AArch64ISD::NEON_LD2_UPD";
+  case AArch64ISD::NEON_LD3_UPD:
+    return "AArch64ISD::NEON_LD3_UPD";
+  case AArch64ISD::NEON_LD4_UPD:
+    return "AArch64ISD::NEON_LD4_UPD";
+  case AArch64ISD::NEON_ST1_UPD:
+    return "AArch64ISD::NEON_ST1_UPD";
+  case AArch64ISD::NEON_ST2_UPD:
+    return "AArch64ISD::NEON_ST2_UPD";
+  case AArch64ISD::NEON_ST3_UPD:
+    return "AArch64ISD::NEON_ST3_UPD";
+  case AArch64ISD::NEON_ST4_UPD:
+    return "AArch64ISD::NEON_ST4_UPD";
+  case AArch64ISD::NEON_LD1x2_UPD:
+    return "AArch64ISD::NEON_LD1x2_UPD";
+  case AArch64ISD::NEON_LD1x3_UPD:
+    return "AArch64ISD::NEON_LD1x3_UPD";
+  case AArch64ISD::NEON_LD1x4_UPD:
+    return "AArch64ISD::NEON_LD1x4_UPD";
+  case AArch64ISD::NEON_ST1x2_UPD:
+    return "AArch64ISD::NEON_ST1x2_UPD";
+  case AArch64ISD::NEON_ST1x3_UPD:
+    return "AArch64ISD::NEON_ST1x3_UPD";
+  case AArch64ISD::NEON_ST1x4_UPD:
+    return "AArch64ISD::NEON_ST1x4_UPD";
+  case AArch64ISD::NEON_LD2DUP:
+    return "AArch64ISD::NEON_LD2DUP";
+  case AArch64ISD::NEON_LD3DUP:
+    return "AArch64ISD::NEON_LD3DUP";
+  case AArch64ISD::NEON_LD4DUP:
+    return "AArch64ISD::NEON_LD4DUP";
+  case AArch64ISD::NEON_LD2DUP_UPD:
+    return "AArch64ISD::NEON_LD2DUP_UPD";
+  case AArch64ISD::NEON_LD3DUP_UPD:
+    return "AArch64ISD::NEON_LD3DUP_UPD";
+  case AArch64ISD::NEON_LD4DUP_UPD:
+    return "AArch64ISD::NEON_LD4DUP_UPD";
+  case AArch64ISD::NEON_LD2LN_UPD:
+    return "AArch64ISD::NEON_LD2LN_UPD";
+  case AArch64ISD::NEON_LD3LN_UPD:
+    return "AArch64ISD::NEON_LD3LN_UPD";
+  case AArch64ISD::NEON_LD4LN_UPD:
+    return "AArch64ISD::NEON_LD4LN_UPD";
+  case AArch64ISD::NEON_ST2LN_UPD:
+    return "AArch64ISD::NEON_ST2LN_UPD";
+  case AArch64ISD::NEON_ST3LN_UPD:
+    return "AArch64ISD::NEON_ST3LN_UPD";
+  case AArch64ISD::NEON_ST4LN_UPD:
+    return "AArch64ISD::NEON_ST4LN_UPD";
+  case AArch64ISD::NEON_VEXTRACT:
+    return "AArch64ISD::NEON_VEXTRACT";
+  default:
+    return NULL;
   }
 }
 
@@ -826,7 +1059,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
 
 void
 AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
-                                           DebugLoc DL, SDValue &Chain) const {
+                                           SDLoc DL, SDValue &Chain) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   AArch64MachineFunctionInfo *FuncInfo
@@ -858,24 +1091,31 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
     }
   }
 
+  if (getSubtarget()->hasFPARMv8()) {
   unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
   int FPRIdx = 0;
-  if (FPRSaveSize != 0) {
-    FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
-
-    SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
-
-    for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
-      unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
-                                   &AArch64::FPR128RegClass);
-      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
-      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
-                                   MachinePointerInfo::getStack(i * 16),
-                                   false, false, 0);
-      MemOps.push_back(Store);
-      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
-                        DAG.getConstant(16, getPointerTy()));
+    // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
+    // can omit a register save area if we know we'll never use registers of
+    // that class.
+    if (FPRSaveSize != 0) {
+      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+
+      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+
+      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+        unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
+            &AArch64::FPR128RegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
+            MachinePointerInfo::getStack(i * 16),
+            false, false, 0);
+        MemOps.push_back(Store);
+        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+            DAG.getConstant(16, getPointerTy()));
+      }
     }
+    FuncInfo->setVariadicFPRIdx(FPRIdx);
+    FuncInfo->setVariadicFPRSize(FPRSaveSize);
   }
 
   int StackIdx = MFI->CreateFixedObject(8, CCInfo.getNextStackOffset(), true);
@@ -883,8 +1123,6 @@ AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
   FuncInfo->setVariadicStackIdx(StackIdx);
   FuncInfo->setVariadicGPRIdx(GPRIdx);
   FuncInfo->setVariadicGPRSize(GPRSaveSize);
-  FuncInfo->setVariadicFPRIdx(FPRIdx);
-  FuncInfo->setVariadicFPRSize(FPRSaveSize);
 
   if (!MemOps.empty()) {
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
@@ -897,7 +1135,7 @@ SDValue
 AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
                                       CallingConv::ID CallConv, bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SDLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64MachineFunctionInfo *FuncInfo
@@ -1012,7 +1250,7 @@ AArch64TargetLowering::LowerReturn(SDValue Chain,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
-                                   DebugLoc dl, SelectionDAG &DAG) const {
+                                   SDLoc dl, SelectionDAG &DAG) const {
   // CCValAssign - represent the assignment of the return value to a location.
   SmallVector<CCValAssign, 16> RVLocs;
 
@@ -1085,10 +1323,10 @@ SDValue
 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
                                  SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &IsTailCall                      = CLI.IsTailCall;
@@ -1151,7 +1389,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   if (!IsSibCall)
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                                 dl);
 
   SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
                                         getPointerTy());
@@ -1282,7 +1521,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   // in the correct location.
   if (IsTailCall && !IsSibCall) {
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                               DAG.getIntPtrConstant(0, true), InFlag);
+                               DAG.getIntPtrConstant(0, true), InFlag, dl);
     InFlag = Chain.getValue(1);
   }
 
@@ -1336,7 +1575,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                                DAG.getIntPtrConstant(CalleePopBytes, true),
-                               InFlag);
+                               InFlag, dl);
     InFlag = Chain.getValue(1);
   }
 
@@ -1348,7 +1587,7 @@ SDValue
 AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                       CallingConv::ID CallConv, bool IsVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SDLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
@@ -1537,7 +1776,7 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
         }
 
    // Build a tokenfactor for all the chains.
-   return DAG.getNode(ISD::TokenFactor, Chain.getDebugLoc(), MVT::Other,
+   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
                       &ArgChains[0], ArgChains.size());
 }
 
@@ -1570,7 +1809,7 @@ bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
 
 SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
                                         ISD::CondCode CC, SDValue &A64cc,
-                                        SelectionDAG &DAG, DebugLoc &dl) const {
+                                        SelectionDAG &DAG, SDLoc &dl) const {
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     int64_t C = 0;
     EVT VT = RHSC->getValueType(0);
@@ -1663,7 +1902,7 @@ static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
 
 SDValue
 AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT PtrVT = getPointerTy();
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
 
@@ -1693,7 +1932,7 @@ AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
 // (BRCOND chain, val, dest)
 SDValue
 AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Chain = Op.getOperand(0);
   SDValue TheBit = Op.getOperand(1);
   SDValue DestBB = Op.getOperand(2);
@@ -1716,7 +1955,7 @@ AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 // (BR_CC chain, condcode, lhs, rhs, dest)
 SDValue
 AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
   SDValue LHS = Op.getOperand(2);
@@ -1802,7 +2041,7 @@ AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
   CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
                     0, getLibcallCallingConv(Call), isTailCall,
                     /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, Op->getDebugLoc());
+                    Callee, Args, DAG, SDLoc(Op));
   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
 
   if (!CallInfo.second.getNode())
@@ -1824,7 +2063,7 @@ AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue SrcVal = Op.getOperand(0);
   return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
-                     /*isSigned*/ false, Op.getDebugLoc());
+                     /*isSigned*/ false, SDLoc(Op)).first;
 }
 
 SDValue
@@ -1854,6 +2093,45 @@ AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
   return LowerF128ToCall(Op, DAG, LC);
 }
 
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(8, MVT::i64);
+    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return X30, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
+}
+
+
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
+                                              const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = AArch64::X29;
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(),
+                            false, false, false, 0);
+  return FrameAddr;
+}
+
 SDValue
 AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
                                                   SelectionDAG &DAG) const {
@@ -1861,7 +2139,7 @@ AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
   assert(getTargetMachine().getRelocationModel() == Reloc::Static);
 
   EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GN->getGlobal();
 
@@ -1885,7 +2163,7 @@ AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
   assert(getTargetMachine().getCodeModel() == CodeModel::Small);
 
   EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GN->getGlobal();
   unsigned Alignment = GV->getAlignment();
@@ -1927,7 +2205,7 @@ AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
   }
 
   unsigned char HiFixup, LoFixup;
-  bool UseGOT = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+  bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
 
   if (UseGOT) {
     HiFixup = AArch64II::MO_GOT;
@@ -1978,7 +2256,7 @@ AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
 
 SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
                                                 SDValue DescAddr,
-                                                DebugLoc DL,
+                                                SDLoc DL,
                                                 SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
 
@@ -2023,7 +2301,7 @@ SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
 SDValue
 AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                              SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetELF() &&
+  assert(getSubtarget()->isTargetELF() &&
          "TLS not implemented for non-ELF targets");
   assert(getTargetMachine().getCodeModel() == CodeModel::Small
          && "TLS only supported in small memory model");
@@ -2033,7 +2311,7 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
 
   SDValue TPOff;
   EVT PtrVT = getPointerTy();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   const GlobalValue *GV = GA->getGlobal();
 
   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
@@ -2054,7 +2332,7 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                AArch64II::MO_TPREL_G0_NC);
 
     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
-                                       DAG.getTargetConstant(0, MVT::i32)), 0);
+                                       DAG.getTargetConstant(1, MVT::i32)), 0);
     TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
                                        TPOff, LoVar,
                                        DAG.getTargetConstant(0, MVT::i32)), 0);
@@ -2134,7 +2412,7 @@ AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
 SDValue
 AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
-  DebugLoc dl = JT->getDebugLoc();
+  SDLoc dl(JT);
   EVT PtrVT = getPointerTy();
 
   // When compiling PIC, jump tables get put in the code section so a static
@@ -2161,7 +2439,7 @@ AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
 // (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
 SDValue
 AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue IfTrue = Op.getOperand(2);
@@ -2217,7 +2495,7 @@ AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 // (SELECT testbit, iftrue, iffalse)
 SDValue
 AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue TheBit = Op.getOperand(0);
   SDValue IfTrue = Op.getOperand(1);
   SDValue IfFalse = Op.getOperand(2);
@@ -2236,15 +2514,225 @@ AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                      DAG.getConstant(A64CC::NE, MVT::i32));
 }
 
+static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  EVT VT = Op.getValueType();
+  bool Invert = false;
+  SDValue Op0, Op1;
+  unsigned Opcode;
+
+  if (LHS.getValueType().isInteger()) {
+
+    // Attempt to use Vector Integer Compare Mask Test instruction.
+    // TST = icmp ne (and (op0, op1), zero).
+    if (CC == ISD::SETNE) {
+      if (((LHS.getOpcode() == ISD::AND) &&
+           ISD::isBuildVectorAllZeros(RHS.getNode())) ||
+          ((RHS.getOpcode() == ISD::AND) &&
+           ISD::isBuildVectorAllZeros(LHS.getNode()))) {
+
+        SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
+        SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
+        SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
+        return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
+      }
+    }
+
+    // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
+    // Note: Compare against Zero does not support unsigned predicates.
+    if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
+         ISD::isBuildVectorAllZeros(LHS.getNode())) &&
+        !isUnsignedIntSetCC(CC)) {
+
+      // If LHS is the zero value, swap operands and CondCode.
+      if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+        CC = getSetCCSwappedOperands(CC);
+        Op0 = RHS;
+      } else
+        Op0 = LHS;
+
+      // Ensure valid CondCode for Compare Mask against Zero instruction:
+      // EQ, GE, GT, LE, LT.
+      if (ISD::SETNE == CC) {
+        Invert = true;
+        CC = ISD::SETEQ;
+      }
+
+      // Using constant type to differentiate integer and FP compares with zero.
+      Op1 = DAG.getConstant(0, MVT::i32);
+      Opcode = AArch64ISD::NEON_CMPZ;
+
+    } else {
+      // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
+      // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
+      bool Swap = false;
+      switch (CC) {
+      default:
+        llvm_unreachable("Illegal integer comparison.");
+      case ISD::SETEQ:
+      case ISD::SETGT:
+      case ISD::SETGE:
+      case ISD::SETUGT:
+      case ISD::SETUGE:
+        break;
+      case ISD::SETNE:
+        Invert = true;
+        CC = ISD::SETEQ;
+        break;
+      case ISD::SETULT:
+      case ISD::SETULE:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        Swap = true;
+        CC = getSetCCSwappedOperands(CC);
+      }
+
+      if (Swap)
+        std::swap(LHS, RHS);
+
+      Opcode = AArch64ISD::NEON_CMP;
+      Op0 = LHS;
+      Op1 = RHS;
+    }
+
+    // Generate Compare Mask instr or Compare Mask against Zero instr.
+    SDValue NeonCmp =
+        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
+
+    if (Invert)
+      NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
+
+    return NeonCmp;
+  }
+
+  // Now handle Floating Point cases.
+  // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
+  if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
+      ISD::isBuildVectorAllZeros(LHS.getNode())) {
+
+    // If LHS is the zero value, swap operands and CondCode.
+    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+      CC = getSetCCSwappedOperands(CC);
+      Op0 = RHS;
+    } else
+      Op0 = LHS;
+
+    // Using constant type to differentiate integer and FP compares with zero.
+    Op1 = DAG.getConstantFP(0, MVT::f32);
+    Opcode = AArch64ISD::NEON_CMPZ;
+  } else {
+    // Attempt to use Vector Floating Point Compare Mask instruction.
+    Op0 = LHS;
+    Op1 = RHS;
+    Opcode = AArch64ISD::NEON_CMP;
+  }
+
+  SDValue NeonCmpAlt;
+  // Some register compares have to be implemented with swapped CC and operands,
+  // e.g.: OLT implemented as OGT with swapped operands.
+  bool SwapIfRegArgs = false;
+
+  // Ensure valid CondCode for FP Compare Mask against Zero instruction:
+  // EQ, GE, GT, LE, LT.
+  // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
+  switch (CC) {
+  default:
+    llvm_unreachable("Illegal FP comparison");
+  case ISD::SETUNE:
+  case ISD::SETNE:
+    Invert = true; // Fallthrough
+  case ISD::SETOEQ:
+  case ISD::SETEQ:
+    CC = ISD::SETEQ;
+    break;
+  case ISD::SETOLT:
+  case ISD::SETLT:
+    CC = ISD::SETLT;
+    SwapIfRegArgs = true;
+    break;
+  case ISD::SETOGT:
+  case ISD::SETGT:
+    CC = ISD::SETGT;
+    break;
+  case ISD::SETOLE:
+  case ISD::SETLE:
+    CC = ISD::SETLE;
+    SwapIfRegArgs = true;
+    break;
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    CC = ISD::SETGE;
+    break;
+  case ISD::SETUGE:
+    Invert = true;
+    CC = ISD::SETLT;
+    SwapIfRegArgs = true;
+    break;
+  case ISD::SETULE:
+    Invert = true;
+    CC = ISD::SETGT;
+    break;
+  case ISD::SETUGT:
+    Invert = true;
+    CC = ISD::SETLE;
+    SwapIfRegArgs = true;
+    break;
+  case ISD::SETULT:
+    Invert = true;
+    CC = ISD::SETGE;
+    break;
+  case ISD::SETUEQ:
+    Invert = true; // Fallthrough
+  case ISD::SETONE:
+    // Expand this to (OGT |OLT).
+    NeonCmpAlt =
+        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
+    CC = ISD::SETLT;
+    SwapIfRegArgs = true;
+    break;
+  case ISD::SETUO:
+    Invert = true; // Fallthrough
+  case ISD::SETO:
+    // Expand this to (OGE | OLT).
+    NeonCmpAlt =
+        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
+    CC = ISD::SETLT;
+    SwapIfRegArgs = true;
+    break;
+  }
+
+  if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
+    CC = getSetCCSwappedOperands(CC);
+    std::swap(Op0, Op1);
+  }
+
+  // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
+  SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
+
+  if (NeonCmpAlt.getNode())
+    NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
+
+  if (Invert)
+    NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
+
+  return NeonCmp;
+}
+
 // (SETCC lhs, rhs, condcode)
 SDValue
 AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
   EVT VT = Op.getValueType();
 
+  if (VT.isVector())
+    return LowerVectorSETCC(Op, DAG);
+
   if (LHS.getValueType() == MVT::f128) {
     // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
     // for the rest of the function (some i32 or i64 values).
@@ -2298,7 +2786,7 @@ AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
 
   // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
   // rather than just 8.
-  return DAG.getMemcpy(Op.getOperand(0), Op.getDebugLoc(),
+  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
                        Op.getOperand(1), Op.getOperand(2),
                        DAG.getConstant(32, MVT::i32), 8, false, false,
                        MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
@@ -2311,7 +2799,7 @@ AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   AArch64MachineFunctionInfo *FuncInfo
     = MF.getInfo<AArch64MachineFunctionInfo>();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   SDValue Chain = Op.getOperand(0);
   SDValue VAList = Op.getOperand(1);
@@ -2389,6 +2877,8 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
   case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
   case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
 
   case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
@@ -2401,16 +2891,161 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SETCC: return LowerSETCC(Op, DAG);
   case ISD::VACOPY: return LowerVACOPY(Op, DAG);
   case ISD::VASTART: return LowerVASTART(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
+  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
   }
 
   return SDValue();
 }
 
+/// Check if the specified splat value corresponds to a valid vector constant
+/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI).  If
+/// so, return the encoded 8-bit immediate and the OpCmode instruction fields
+/// values.
+static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+                              unsigned SplatBitSize, SelectionDAG &DAG,
+                              bool is128Bits, NeonModImmType type, EVT &VT,
+                              unsigned &Imm, unsigned &OpCmode) {
+  switch (SplatBitSize) {
+  default:
+    llvm_unreachable("unexpected size for isNeonModifiedImm");
+  case 8: {
+    if (type != Neon_Mov_Imm)
+      return false;
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    // Neon movi per byte: Op=0, Cmode=1110.
+    OpCmode = 0xe;
+    Imm = SplatBits;
+    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    break;
+  }
+  case 16: {
+    // Neon move inst per halfword
+    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x00nn is 0x00nn LSL 0
+      // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
+      // bic:  Op=1, Cmode=1001;  orr:  Op=0, Cmode=1001
+      // Op=x, Cmode=100y
+      Imm = SplatBits;
+      OpCmode = 0x8;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0xnn00 is 0x00nn LSL 8
+      // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
+      // bic:  Op=1, Cmode=1011;  orr:  Op=0, Cmode=1011
+      // Op=x, Cmode=101x
+      Imm = SplatBits >> 8;
+      OpCmode = 0xa;
+      break;
+    }
+    // can't handle any other
+    return false;
+  }
+
+  case 32: {
+    // First the LSL variants (MSL is unusable by some interested instructions).
+
+    // Neon move instr per word, shift zeros
+    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+    if ((SplatBits & ~0xff) == 0) {
+      // Value = 0x000000nn is 0x000000nn LSL 0
+      // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
+      // bic:  Op=1, Cmode= 0001; orr:  Op=0, Cmode= 0001
+      // Op=x, Cmode=000x
+      Imm = SplatBits;
+      OpCmode = 0;
+      break;
+    }
+    if ((SplatBits & ~0xff00) == 0) {
+      // Value = 0x0000nn00 is 0x000000nn LSL 8
+      // movi: Op=0, Cmode= 0010;  mvni: Op=1, Cmode= 0010
+      // bic:  Op=1, Cmode= 0011;  orr : Op=0, Cmode= 0011
+      // Op=x, Cmode=001x
+      Imm = SplatBits >> 8;
+      OpCmode = 0x2;
+      break;
+    }
+    if ((SplatBits & ~0xff0000) == 0) {
+      // Value = 0x00nn0000 is 0x000000nn LSL 16
+      // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
+      // bic:  Op=1, Cmode= 0101; orr:  Op=0, Cmode= 0101
+      // Op=x, Cmode=010x
+      Imm = SplatBits >> 16;
+      OpCmode = 0x4;
+      break;
+    }
+    if ((SplatBits & ~0xff000000) == 0) {
+      // Value = 0xnn000000 is 0x000000nn LSL 24
+      // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
+      // bic:  Op=1, Cmode= 0111; orr:  Op=0, Cmode= 0111
+      // Op=x, Cmode=011x
+      Imm = SplatBits >> 24;
+      OpCmode = 0x6;
+      break;
+    }
+
+    // Now the MSL immediates.
+
+    // Neon move instr per word, shift ones
+    if ((SplatBits & ~0xffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+      // Value = 0x0000nnff is 0x000000nn MSL 8
+      // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
+      // Op=x, Cmode=1100
+      Imm = SplatBits >> 8;
+      OpCmode = 0xc;
+      break;
+    }
+    if ((SplatBits & ~0xffffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+      // Value = 0x00nnffff is 0x000000nn MSL 16
+      // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101
+      // Op=x, Cmode=1101
+      Imm = SplatBits >> 16;
+      OpCmode = 0xd;
+      break;
+    }
+    // can't handle any other
+    return false;
+  }
+
+  case 64: {
+    if (type != Neon_Mov_Imm)
+      return false;
+    // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
+    // movi Op=1, Cmode=1110.
+    OpCmode = 0x1e;
+    uint64_t BitMask = 0xff;
+    uint64_t Val = 0;
+    unsigned ImmMask = 1;
+    Imm = 0;
+    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+        Val |= BitMask;
+        Imm |= ImmMask;
+      } else if ((SplatBits & BitMask) != 0) {
+        return false;
+      }
+      BitMask <<= 8;
+      ImmMask <<= 1;
+    }
+    SplatBits = Val;
+    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+    break;
+  }
+  }
+
+  return true;
+}
+
 static SDValue PerformANDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
 
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // We're looking for an SRA/SHL pair which form an SBFX.
@@ -2448,7 +3083,7 @@ static SDValue PerformANDCombine(SDNode *N,
 /// a compatible SHL operation (unless they're already low). This function
 /// checks that condition and returns the least-significant bit that's
 /// intended. If the operation not a field preparation, -1 is returned.
-static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT,
+static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
                             SDValue &MaskedVal, uint64_t Mask) {
   if (!isShiftedMask_64(Mask))
     return -1;
@@ -2464,7 +3099,7 @@ static int32_t getLSBForBFI(SelectionDAG &DAG, DebugLoc DL, EVT VT,
   // cases (e.g. bitfield to bitfield copy) may still need a real shift before
   // the BFI.
 
-  uint64_t LSB = CountTrailingZeros_64(Mask);
+  uint64_t LSB = countTrailingZeros(Mask);
   int64_t ShiftRightRequired = LSB;
   if (MaskedVal.getOpcode() == ISD::SHL &&
       isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
@@ -2524,7 +3159,7 @@ static SDValue tryCombineToBFI(SDNode *N,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const AArch64Subtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   assert(N->getOpcode() == ISD::OR && "Unexpected root");
@@ -2605,7 +3240,7 @@ static SDValue tryCombineToLargerBFI(SDNode *N,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const AArch64Subtarget *Subtarget) {
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // First job is to hunt for a MaskedBFI on either the left or right. Swap
@@ -2687,7 +3322,7 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
 static SDValue tryCombineToEXTR(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   assert(N->getOpcode() == ISD::OR && "Unexpected root");
@@ -2731,6 +3366,7 @@ static SDValue PerformORCombine(SDNode *N,
                                 const AArch64Subtarget *Subtarget) {
 
   SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -2751,6 +3387,44 @@ static SDValue PerformORCombine(SDNode *N,
   if (Res.getNode())
     return Res;
 
+  if (!Subtarget->hasNEON())
+    return SDValue();
+
+  // Attempt to use vector immediate-form BSL
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    APInt SplatBits0;
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs) &&
+        !HasAnyUndefs) {
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+      APInt SplatBits1;
+      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                        HasAnyUndefs) &&
+          !HasAnyUndefs && SplatBits0 == ~SplatBits1) {
+        // Canonicalize the vector type to make instruction selection simpler.
+        EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8;
+        SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT,
+                                     N0->getOperand(1), N0->getOperand(0),
+                                     N1->getOperand(0));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Result);
+      }
+    }
+  }
+
   return SDValue();
 }
 
@@ -2759,7 +3433,7 @@ static SDValue PerformSRACombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI) {
 
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   // We're looking for an SRA/SHL pair which form an SBFX.
@@ -2791,6 +3465,336 @@ static SDValue PerformSRACombine(SDNode *N,
                      DAG.getConstant(LSB + Width - 1, MVT::i64));
 }
 
+/// Check if this is a valid build_vector for the immediate operand of
+/// a vector shift operation, where all the elements of the build_vector
+/// must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// Check if this is a valid build_vector for the immediate operand of
+/// a vector shift left operation.  That value must be in the range:
+/// 0 <= Value < ElementBits
+static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && Cnt < ElementBits);
+}
+
+/// Check if this is a valid build_vector for the immediate operand of a
+/// vector shift right operation. The value must be in the range:
+///   1 <= Value <= ElementBits
+static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 1 && Cnt <= ElementBits);
+}
+
+/// Checks for immediate versions of vector shifts and lowers them.
+static SDValue PerformShiftCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const AArch64Subtarget *ST) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+  if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
+    return PerformSRACombine(N, DCI);
+
+  // Nothing to be done for scalar shifts.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!VT.isVector() || !TLI.isTypeLegal(VT))
+    return SDValue();
+
+  assert(ST->hasNEON() && "unexpected vector shift");
+  int64_t Cnt;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
+      SDValue RHS =
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
+                      DAG.getConstant(Cnt, MVT::i32));
+      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
+    }
+    break;
+
+  case ISD::SRA:
+  case ISD::SRL:
+    if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
+      SDValue RHS =
+          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
+                      DAG.getConstant(Cnt, MVT::i32));
+      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
+    }
+    break;
+  }
+
+  return SDValue();
+}
+
+/// ARM-specific DAG combining for intrinsics.
+static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+
+  switch (IntNo) {
+  default:
+    // Don't do anything for most intrinsics.
+    break;
+
+  case Intrinsic::arm_neon_vqshifts:
+  case Intrinsic::arm_neon_vqshiftu:
+    EVT VT = N->getOperand(1).getValueType();
+    int64_t Cnt;
+    if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
+      break;
+    unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
+                             ? AArch64ISD::NEON_QSHLs
+                             : AArch64ISD::NEON_QSHLu;
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+  }
+
+  return SDValue();
+}
+
+/// Target-specific DAG combine function for NEON load/store intrinsics
+/// to merge base address updates.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoad = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default: llvm_unreachable("unexpected intrinsic for Neon base update");
+      case Intrinsic::arm_neon_vld1:       NewOpc = AArch64ISD::NEON_LD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:       NewOpc = AArch64ISD::NEON_LD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:       NewOpc = AArch64ISD::NEON_LD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:       NewOpc = AArch64ISD::NEON_LD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vst1:       NewOpc = AArch64ISD::NEON_ST1_UPD;
+        NumVecs = 1; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2:       NewOpc = AArch64ISD::NEON_ST2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::arm_neon_vst3:       NewOpc = AArch64ISD::NEON_ST3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::arm_neon_vst4:       NewOpc = AArch64ISD::NEON_ST4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::arm_neon_vld2lane:   NewOpc = AArch64ISD::NEON_LD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane:   NewOpc = AArch64ISD::NEON_LD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane:   NewOpc = AArch64ISD::NEON_LD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst2lane:   NewOpc = AArch64ISD::NEON_ST2LN_UPD;
+        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane:   NewOpc = AArch64ISD::NEON_ST3LN_UPD;
+        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane:   NewOpc = AArch64ISD::NEON_ST4LN_UPD;
+        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode for Neon base update");
+      case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
+        NumVecs = 2; break;
+      case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
+        NumVecs = 3; break;
+      case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
+        NumVecs = 4; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.
+    EVT VecTy;
+    if (isLoad)
+      VecTy = N->getValueType(0);
+    else
+      VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getTargetConstant(IncVal, MVT::i32);
+    }
+
+    // Create the new updating load/store node.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i64;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
+                                           Ops.data(), Ops.size(),
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
+/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
+/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
+/// If so, combine them to a vldN-dup operation and return true.
+static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
+  SDNode *VLD = N->getOperand(0).getNode();
+  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return SDValue();
+  unsigned NumVecs = 0;
+  unsigned NewOpc = 0;
+  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+  if (IntNo == Intrinsic::arm_neon_vld2lane) {
+    NumVecs = 2;
+    NewOpc = AArch64ISD::NEON_LD2DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+    NumVecs = 3;
+    NewOpc = AArch64ISD::NEON_LD3DUP;
+  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+    NumVecs = 4;
+    NewOpc = AArch64ISD::NEON_LD4DUP;
+  } else {
+    return SDValue();
+  }
+
+  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+  // numbers match the load.
+  unsigned VLDLaneNo =
+      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    // Ignore uses of the chain result.
+    if (UI.getUse().getResNo() == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
+        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+      return SDValue();
+  }
+
+  // Create the vldN-dup node.
+  EVT Tys[5];
+  unsigned n;
+  for (n = 0; n < NumVecs; ++n)
+    Tys[n] = VT;
+  Tys[n] = MVT::Other;
+  SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
+  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
+                                           VLDMemInt->getMemoryVT(),
+                                           VLDMemInt->getMemOperand());
+
+  // Update the uses.
+  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+       UI != UE; ++UI) {
+    unsigned ResNo = UI.getUse().getResNo();
+    // Ignore uses of the chain result.
+    if (ResNo == NumVecs)
+      continue;
+    SDNode *User = *UI;
+    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+  }
+
+  // Now the vldN-lane intrinsic is dead except for its chain result.
+  // Update uses of the chain.
+  std::vector<SDValue> VLDDupResults;
+  for (unsigned n = 0; n < NumVecs; ++n)
+    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+  DCI.CombineTo(VLD, VLDDupResults);
+
+  return SDValue(N, 0);
+}
 
 SDValue
 AArch64TargetLowering::PerformDAGCombine(SDNode *N,
@@ -2798,12 +3802,578 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   switch (N->getOpcode()) {
   default: break;
   case ISD::AND: return PerformANDCombine(N, DCI);
-  case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
-  case ISD::SRA: return PerformSRACombine(N, DCI);
+  case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    return PerformShiftCombine(N, DCI, getSubtarget());
+  case ISD::INTRINSIC_WO_CHAIN:
+    return PerformIntrinsicCombine(N, DCI.DAG);
+  case AArch64ISD::NEON_VDUPLANE:
+    return CombineVLDDUP(N, DCI);
+  case AArch64ISD::NEON_LD2DUP:
+  case AArch64ISD::NEON_LD3DUP:
+  case AArch64ISD::NEON_LD4DUP:
+    return CombineBaseUpdate(N, DCI);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:
+    switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::arm_neon_vld1:
+    case Intrinsic::arm_neon_vld2:
+    case Intrinsic::arm_neon_vld3:
+    case Intrinsic::arm_neon_vld4:
+    case Intrinsic::arm_neon_vst1:
+    case Intrinsic::arm_neon_vst2:
+    case Intrinsic::arm_neon_vst3:
+    case Intrinsic::arm_neon_vst4:
+    case Intrinsic::arm_neon_vld2lane:
+    case Intrinsic::arm_neon_vld3lane:
+    case Intrinsic::arm_neon_vld4lane:
+    case Intrinsic::aarch64_neon_vld1x2:
+    case Intrinsic::aarch64_neon_vld1x3:
+    case Intrinsic::aarch64_neon_vld1x4:
+    case Intrinsic::aarch64_neon_vst1x2:
+    case Intrinsic::aarch64_neon_vst1x3:
+    case Intrinsic::aarch64_neon_vst1x4:
+    case Intrinsic::arm_neon_vst2lane:
+    case Intrinsic::arm_neon_vst3lane:
+    case Intrinsic::arm_neon_vst4lane:
+      return CombineBaseUpdate(N, DCI);
+    default:
+      break;
+    }
   }
   return SDValue();
 }
 
+bool
+AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f16:
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  case MVT::f128:
+    return false;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+// Check whether a Build Vector could be presented as Shuffle Vector. If yes,
+// try to call LowerVECTOR_SHUFFLE to lower it.
+bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
+                                                 SDValue &Res) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned V0NumElts = 0;
+  int Mask[16];
+  SDValue V0, V1;
+
+  // Check if all elements are extracted from less than 3 vectors.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return false;
+
+    if (V0.getNode() == 0) {
+      V0 = Elt.getOperand(0);
+      V0NumElts = V0.getValueType().getVectorNumElements();
+    }
+    if (Elt.getOperand(0) == V0) {
+      Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
+      continue;
+    } else if (V1.getNode() == 0) {
+      V1 = Elt.getOperand(0);
+    }
+    if (Elt.getOperand(0) == V1) {
+      unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
+      Mask[i] = (Lane + V0NumElts);
+      continue;
+    } else {
+      return false;
+    }
+  }
+
+  if (!V1.getNode() && V0NumElts == NumElts * 2) {
+    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+                     DAG.getConstant(NumElts, MVT::i64));
+    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+                     DAG.getConstant(0, MVT::i64));
+    V0NumElts = V0.getValueType().getVectorNumElements();
+  }
+
+  if (V1.getNode() && NumElts == V0NumElts &&
+      V0NumElts == V1.getValueType().getVectorNumElements()) {
+    SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
+    Res = LowerVECTOR_SHUFFLE(Shuffle, DAG);
+    return true;
+  } else
+    return false;
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+SDValue
+AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                                         const AArch64Subtarget *ST) const {
+
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  unsigned UseNeonMov = VT.getSizeInBits() >= 64;
+
+  // Note we favor lowering MOVI over MVNI.
+  // This has implications on the definition of patterns in TableGen to select
+  // BIC immediate instructions but not ORR immediate instructions.
+  // If this lowering order is changed, TableGen patterns for BIC immediate and
+  // ORR immediate instructions have to be updated.
+  if (UseNeonMov &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      // First attempt to use vector immediate-form MOVI
+      EVT NeonMovVT;
+      unsigned Imm = 0;
+      unsigned OpCmode = 0;
+
+      if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+                            SplatBitSize, DAG, VT.is128BitVector(),
+                            Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
+        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
+        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
+
+        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
+          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
+                                        ImmVal, OpCmodeVal);
+          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
+        }
+      }
+
+      // Then attempt to use vector immediate-form MVNI
+      uint64_t NegatedImm = (~SplatBits).getZExtValue();
+      if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
+                            DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
+                            Imm, OpCmode)) {
+        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
+        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
+        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
+          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
+                                        ImmVal, OpCmodeVal);
+          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
+        }
+      }
+
+      // Attempt to use vector immediate-form FMOV
+      if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
+          (VT == MVT::v2f64 && SplatBitSize == 64)) {
+        APFloat RealVal(
+            SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
+            SplatBits);
+        uint32_t ImmVal;
+        if (A64Imms::isFPImm(RealVal, ImmVal)) {
+          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
+          return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
+        }
+      }
+    }
+  }
+
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool hasDominantValue = false;
+  bool isConstant = true;
+
+  // Map of the number of times a particular SDValue appears in the
+  // element list.
+  DenseMap<SDValue, unsigned> ValueCounts;
+  SDValue Value;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    ValueCounts.insert(std::make_pair(V, 0));
+    unsigned &Count = ValueCounts[V];
+
+    // Is this value dominant? (takes up more than half of the lanes)
+    if (++Count > (NumElts / 2)) {
+      hasDominantValue = true;
+      Value = V;
+    }
+  }
+  if (ValueCounts.size() != 1)
+    usesOnlyOneValue = false;
+  if (!Value.getNode() && ValueCounts.size() > 0)
+    Value = ValueCounts.begin()->first;
+
+  if (ValueCounts.size() == 0)
+    return DAG.getUNDEF(VT);
+
+  // Loads are better lowered with insert_vector_elt.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (hasDominantValue && EltSize <= 64) {
+    // Use VDUP for non-constant splats.
+    if (!isConstant) {
+      SDValue N;
+
+      // If we are DUPing a value that comes directly from a vector, we could
+      // just use DUPLANE. We can only do this if the lane being extracted
+      // is at a constant index, as the DUP from lane instructions only have
+      // constant-index forms.
+      if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          isa<ConstantSDNode>(Value->getOperand(1))) {
+          N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT,
+                        Value->getOperand(0), Value->getOperand(1));
+      } else
+        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+
+      if (!usesOnlyOneValue) {
+        // The dominant value was splatted as 'N', but we now have to insert
+        // all differing elements.
+        for (unsigned I = 0; I < NumElts; ++I) {
+          if (Op.getOperand(I) == Value)
+            continue;
+          SmallVector<SDValue, 3> Ops;
+          Ops.push_back(N);
+          Ops.push_back(Op.getOperand(I));
+          Ops.push_back(DAG.getConstant(I, MVT::i64));
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
+        }
+      }
+      return N;
+    }
+    if (usesOnlyOneValue && isConstant) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
+    }
+  }
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Try to lower this in lowering ShuffleVector way.
+  SDValue Shuf;
+  if (isKnownShuffleVector(Op, DAG, Shuf))
+    return Shuf;
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+  return SDValue();
+}
+
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for REV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+    return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and
+// TRN instruction.
+static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts < 4)
+    return 0;
+
+  bool ismatch = true;
+
+  // Check UZP1
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i * 2) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_UZP1;
+
+  // Check UZP2
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i * 2 + 1) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_UZP2;
+
+  // Check ZIP1
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_ZIP1;
+
+  // Check ZIP2
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_ZIP2;
+
+  // Check TRN1
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_TRN1;
+
+  // Check TRN2
+  ismatch = true;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) {
+      ismatch = false;
+      break;
+    }
+  }
+  if (ismatch)
+    return AArch64ISD::NEON_TRN2;
+
+  return 0;
+}
+
+SDValue
+AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize > 64)
+    return SDValue();
+
+  if (isREVMask(ShuffleMask, VT, 64))
+    return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
+  if (isREVMask(ShuffleMask, VT, 32))
+    return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
+  if (isREVMask(ShuffleMask, VT, 16))
+    return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
+
+  unsigned ISDNo = isPermuteMask(ShuffleMask, VT);
+  if (ISDNo)
+    return DAG.getNode(ISDNo, dl, VT, V1, V2);
+
+  // If the element of shuffle mask are all the same constant, we can
+  // transform it into either NEON_VDUP or NEON_VDUPLANE
+  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+    int Lane = SVN->getSplatIndex();
+    // If this is undef splat, generate it via "just" vdup, if possible.
+    if (Lane == -1) Lane = 0;
+
+    // Test if V1 is a SCALAR_TO_VECTOR.
+    if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
+    }
+    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
+    if (V1.getOpcode() == ISD::BUILD_VECTOR) {
+      bool IsScalarToVector = true;
+      for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
+        if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
+            i != (unsigned)Lane) {
+          IsScalarToVector = false;
+          break;
+        }
+      if (IsScalarToVector)
+        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
+                           V1.getOperand(Lane));
+    }
+
+    // Test if V1 is a EXTRACT_SUBVECTOR.
+    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
+                         DAG.getConstant(Lane + ExtLane, MVT::i64));
+    }
+    // Test if V1 is a CONCAT_VECTORS.
+    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
+        V1.getOperand(1).getOpcode() == ISD::UNDEF) {
+      SDValue Op0 = V1.getOperand(0);
+      assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
+             "Invalid vector lane access");
+      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
+                         DAG.getConstant(Lane, MVT::i64));
+    }
+
+    return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
+                       DAG.getConstant(Lane, MVT::i64));
+  }
+
+  int Length = ShuffleMask.size();
+  int V1EltNum = V1.getValueType().getVectorNumElements();
+
+  // If the number of v1 elements is the same as the number of shuffle mask
+  // element and the shuffle masks are sequential values, we can transform
+  // it into NEON_VEXTRACT.
+  if (V1EltNum == Length) {
+    // Check if the shuffle mask is sequential.
+    bool IsSequential = true;
+    int CurMask = ShuffleMask[0];
+    for (int I = 0; I < Length; ++I) {
+      if (ShuffleMask[I] != CurMask) {
+        IsSequential = false;
+        break;
+      }
+      CurMask++;
+    }
+    if (IsSequential) {
+      assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
+      unsigned VecSize = EltSize * V1EltNum;
+      unsigned Index = (EltSize/8) * ShuffleMask[0];
+      if (VecSize == 64 || VecSize == 128)
+        return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
+                           DAG.getConstant(Index, MVT::i64));
+    }
+  }
+
+  // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
+  // by element from V2 to V1 .
+  // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
+  // better choice to be inserted than V1 as less insert needed, so we count
+  // element to be inserted for both V1 and V2, and select less one as insert
+  // target.
+
+  // Collect elements need to be inserted and their index.
+  SmallVector<int, 8> NV1Elt;
+  SmallVector<int, 8> N1Index;
+  SmallVector<int, 8> NV2Elt;
+  SmallVector<int, 8> N2Index;
+  for (int I = 0; I != Length; ++I) {
+    if (ShuffleMask[I] != I) {
+      NV1Elt.push_back(ShuffleMask[I]);
+      N1Index.push_back(I);
+    }
+  }
+  for (int I = 0; I != Length; ++I) {
+    if (ShuffleMask[I] != (I + V1EltNum)) {
+      NV2Elt.push_back(ShuffleMask[I]);
+      N2Index.push_back(I);
+    }
+  }
+
+  // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
+  // will be inserted.
+  SDValue InsV = V1;
+  SmallVector<int, 8> InsMasks = NV1Elt;
+  SmallVector<int, 8> InsIndex = N1Index;
+  if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
+    if (NV1Elt.size() > NV2Elt.size()) {
+      InsV = V2;
+      InsMasks = NV2Elt;
+      InsIndex = N2Index;
+    }
+  } else {
+    InsV = DAG.getNode(ISD::UNDEF, dl, VT);
+  }
+
+  for (int I = 0, E = InsMasks.size(); I != E; ++I) {
+    SDValue ExtV = V1;
+    int Mask = InsMasks[I];
+    if (Mask >= V1EltNum) {
+      ExtV = V2;
+      Mask -= V1EltNum;
+    }
+    // Any value type smaller than i32 is illegal in AArch64, and this lower
+    // function is called after legalize pass, so we need to legalize
+    // the result here.
+    EVT EltVT;
+    if (VT.getVectorElementType().isFloatingPoint())
+      EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
+    else
+      EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
+
+    if (Mask >= 0) {
+      ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+                         DAG.getConstant(Mask, MVT::i64));
+      InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
+                         DAG.getConstant(InsIndex[I], MVT::i64));
+    }
+  }
+  return InsV;
+}
+
 AArch64TargetLowering::ConstraintType
 AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
   if (Constraint.size() == 1) {
@@ -2899,7 +4469,7 @@ AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   case 'S': {
     // An absolute symbolic address or label reference.
     if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
-      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), Op.getDebugLoc(),
+      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
                                           GA->getValueType(0));
     } else if (const BlockAddressSDNode *BA
                  = dyn_cast<BlockAddressSDNode>(Op)) {
@@ -2935,7 +4505,7 @@ AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 std::pair<unsigned, const TargetRegisterClass*>
 AArch64TargetLowering::getRegForInlineAsmConstraint(
                                                   const std::string &Constraint,
-                                                  EVT VT) const {
+                                                  MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'r':
@@ -2949,14 +4519,10 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
         return std::make_pair(0U, &AArch64::FPR16RegClass);
       else if (VT == MVT::f32)
         return std::make_pair(0U, &AArch64::FPR32RegClass);
-      else if (VT == MVT::f64)
-        return std::make_pair(0U, &AArch64::FPR64RegClass);
       else if (VT.getSizeInBits() == 64)
-        return std::make_pair(0U, &AArch64::VPR64RegClass);
-      else if (VT == MVT::f128)
-        return std::make_pair(0U, &AArch64::FPR128RegClass);
+        return std::make_pair(0U, &AArch64::FPR64RegClass);
       else if (VT.getSizeInBits() == 128)
-        return std::make_pair(0U, &AArch64::VPR128RegClass);
+        return std::make_pair(0U, &AArch64::FPR128RegClass);
       break;
     }
   }
@@ -2965,3 +4531,69 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
   // constraint into a member of a register class.
   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
 }
+
+/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
+/// The associated MachineMemOperands record the alignment specified
+/// in the intrinsic calls.
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                               const CallInst &I,
+                                               unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::arm_neon_vld1:
+  case Intrinsic::arm_neon_vld2:
+  case Intrinsic::arm_neon_vld3:
+  case Intrinsic::arm_neon_vld4:
+  case Intrinsic::aarch64_neon_vld1x2:
+  case Intrinsic::aarch64_neon_vld1x3:
+  case Intrinsic::aarch64_neon_vld1x4:
+  case Intrinsic::arm_neon_vld2lane:
+  case Intrinsic::arm_neon_vld3lane:
+  case Intrinsic::arm_neon_vld4lane: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_neon_vst1:
+  case Intrinsic::arm_neon_vst2:
+  case Intrinsic::arm_neon_vst3:
+  case Intrinsic::arm_neon_vst4:
+  case Intrinsic::aarch64_neon_vst1x2:
+  case Intrinsic::aarch64_neon_vst1x3:
+  case Intrinsic::aarch64_neon_vst1x4:
+  case Intrinsic::arm_neon_vst2lane:
+  case Intrinsic::arm_neon_vst3lane:
+  case Intrinsic::arm_neon_vst4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
+  }
+
+  return false;
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index d49b3ee..8ad5a79 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -19,7 +19,7 @@
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/Target/TargetLowering.h"
-
+#include "llvm/IR/Intrinsics.h"
 
 namespace llvm {
 namespace AArch64ISD {
@@ -111,7 +111,92 @@ namespace AArch64ISD {
     // created using the small memory model style: i.e. adrp/add or
     // adrp/mem-op. This exists to prevent bare TargetAddresses which may never
     // get selected.
-    WrapperSmall
+    WrapperSmall,
+
+    // Vector bitwise select
+    NEON_BSL,
+
+    // Vector move immediate
+    NEON_MOVIMM,
+
+    // Vector Move Inverted Immediate
+    NEON_MVNIMM,
+
+    // Vector FP move immediate
+    NEON_FMOVIMM,
+
+    // Vector permute
+    NEON_UZP1,
+    NEON_UZP2,
+    NEON_ZIP1,
+    NEON_ZIP2,
+    NEON_TRN1,
+    NEON_TRN2,
+
+    // Vector Element reverse
+    NEON_REV64,
+    NEON_REV32,
+    NEON_REV16,
+
+    // Vector compare
+    NEON_CMP,
+
+    // Vector compare zero
+    NEON_CMPZ,
+
+    // Vector compare bitwise test
+    NEON_TST,
+
+    // Vector saturating shift
+    NEON_QSHLs,
+    NEON_QSHLu,
+
+    // Vector dup
+    NEON_VDUP,
+
+    // Vector dup by lane
+    NEON_VDUPLANE,
+
+    // Vector extract
+    NEON_VEXTRACT,
+
+    // NEON duplicate lane loads
+    NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+    NEON_LD3DUP,
+    NEON_LD4DUP,
+
+    // NEON loads with post-increment base updates:
+    NEON_LD1_UPD,
+    NEON_LD2_UPD,
+    NEON_LD3_UPD,
+    NEON_LD4_UPD,
+    NEON_LD1x2_UPD,
+    NEON_LD1x3_UPD,
+    NEON_LD1x4_UPD,
+
+    // NEON stores with post-increment base updates:
+    NEON_ST1_UPD,
+    NEON_ST2_UPD,
+    NEON_ST3_UPD,
+    NEON_ST4_UPD,
+    NEON_ST1x2_UPD,
+    NEON_ST1x3_UPD,
+    NEON_ST1x4_UPD,
+
+    // NEON duplicate lane loads with post-increment base updates:
+    NEON_LD2DUP_UPD,
+    NEON_LD3DUP_UPD,
+    NEON_LD4DUP_UPD,
+
+    // NEON lane loads with post-increment base updates:
+    NEON_LD2LN_UPD,
+    NEON_LD3LN_UPD,
+    NEON_LD4LN_UPD,
+
+    // NEON lane store with post-increment base updates:
+    NEON_ST2LN_UPD,
+    NEON_ST3LN_UPD,
+    NEON_ST4LN_UPD
   };
 }
 
@@ -130,14 +215,14 @@ public:
   SDValue LowerFormalArguments(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
-                               DebugLoc dl, SelectionDAG &DAG,
+                               SDLoc dl, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const;
 
   SDValue LowerReturn(SDValue Chain,
                       CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals,
-                      DebugLoc dl, SelectionDAG &DAG) const;
+                      SDLoc dl, SelectionDAG &DAG) const;
 
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const;
@@ -145,12 +230,18 @@ public:
   SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                           CallingConv::ID CallConv, bool IsVarArg,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
-                          DebugLoc dl, SelectionDAG &DAG,
+                          SDLoc dl, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals) const;
 
-  void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
-                           DebugLoc DL, SDValue &Chain) const;
+  bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const;
+
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+                            const AArch64Subtarget *ST) const;
+
+  SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
 
+  void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+                           SDValue &Chain) const;
 
   /// IsEligibleForTailCallOptimization - Check whether the call is eligible
   /// for tail call optimization. Targets which want to do tail call
@@ -171,7 +262,7 @@ public:
   SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
                               MachineFrameInfo *MFI, int ClobberedFI) const;
 
-  EVT getSetCCResultType(EVT VT) const;
+  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
 
   bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
 
@@ -181,7 +272,7 @@ public:
 
   bool isLegalICmpImmediate(int64_t Val) const;
   SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                         SDValue &A64cc, SelectionDAG &DAG, DebugLoc &dl) const;
+                         SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const;
 
   virtual MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
@@ -211,12 +302,14 @@ public:
   SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
+  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, DebugLoc DL,
+  SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
                            SelectionDAG &DAG) const;
   SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
@@ -229,11 +322,11 @@ public:
 
   virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
-  /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-  /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-  /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-  /// is expanded to mul + add.
-  virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+  /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+  /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+  /// expanded to FMAs when this method returns true, otherwise fmuladd is
+  /// expanded to fmul + fadd.
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
 
   ConstraintType getConstraintType(const std::string &Constraint) const;
 
@@ -245,12 +338,30 @@ public:
                                     SelectionDAG &DAG) const;
 
   std::pair<unsigned, const TargetRegisterClass*>
-  getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+  getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+
+  virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                                  unsigned Intrinsic) const LLVM_OVERRIDE;
+
+protected:
+  std::pair<const TargetRegisterClass*, uint8_t>
+  findRepresentativeClass(MVT VT) const;
+
 private:
-  const AArch64Subtarget *Subtarget;
-  const TargetRegisterInfo *RegInfo;
   const InstrItineraryData *Itins;
+
+  const AArch64Subtarget *getSubtarget() const {
+    return &getTargetMachine().getSubtarget<AArch64Subtarget>();
+  }
 };
+enum NeonModImmType {
+  Neon_Mov_Imm,
+  Neon_Mvn_Imm
+};
+
+extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
+                                bool &usesOnlyOneValue, bool &hasDominantValue,
+                                bool &isConstant, bool &isUNDEF);
 } // namespace llvm
 
 #endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 9dd122f..34f917c 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -120,6 +120,14 @@ class A64InstRdnm<dag outs, dag ins, string asmstr,
   let Inst{20-16} = Rm;
 }
 
+class A64InstRtnm<dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Rm;
+
+  let Inst{20-16} = Rm;
+}
+
 //===----------------------------------------------------------------------===//
 //
 // Actual A64 Instruction Formats
@@ -383,6 +391,8 @@ class A64I_extract<bit sf, bits<3> op, bit n,
   // Inherits Rd in 4-0
 }
 
+let Predicates = [HasFPARMv8] in {
+
 // Format for floating-point compare instructions.
 class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2,
                 dag outs, dag ins, string asmstr,
@@ -562,6 +572,8 @@ class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5,
   // Inherit Rd in 4-0
 }
 
+}
+
 // Format for load-register (literal) instructions.
 class A64I_LDRlit<bits<2> opc, bit v,
                   dag outs, dag ins, string asmstr,
@@ -959,3 +971,519 @@ class A64I_Breg<bits<4> opc, bits<5> op2, bits<6> op3, bits<5> op4,
   let Inst{4-0}   = op4;
 }
 
+
+//===----------------------------------------------------------------------===//
+//
+// Neon Instruction Format Definitions.
+//
+
+let Predicates = [HasNEON] in {
+
+class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1>
+  : InstAlias<Asm, Result, Emit> {
+}
+
+// Format AdvSIMD bitwise extract
+class NeonI_BitExtract<bit q, bits<2> op2,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-24} = 0b101110;
+  let Inst{23-22} = op2;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  // imm4 in 14-11
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD perm
+class NeonI_Perm<bit q, bits<2> size, bits<3> opcode,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  let Inst{14-12} = opcode;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD table lookup
+class NeonI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+                dag outs, dag ins, string asmstr,
+                list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = op2;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  let Inst{14-13} = len;
+  let Inst{12} = op;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 3 vector registers with same vector type
+class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 3 vector registers with different vector type
+class NeonI_3VDiff<bit q, bit u, bits<2> size, bits<4> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = opcode;
+  let Inst{11} = 0b0;
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD two registers and an element
+class NeonI_2VElem<bit q, bit u, bits<2> size, bits<4> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01111;
+  let Inst{23-22} = size;
+  // l in Inst{21}
+  // m in Inst{20}
+  // Inherit Rm in 19-16
+  let Inst{15-12} = opcode;
+  // h in Inst{11}
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 1 vector register with modified immediate
+class NeonI_1VModImm<bit q, bit op,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRd<outs,ins, asmstr, patterns, itin> {
+  bits<8> Imm;
+  bits<4> cmode;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = op;
+  let Inst{28-19} = 0b0111100000;
+  let Inst{15-12} = cmode;
+  let Inst{11} = 0b0; // o2
+  let Inst{10} = 1;
+  // Inherit Rd in 4-0
+  let Inst{18-16} = Imm{7-5}; // imm a:b:c
+  let Inst{9-5} = Imm{4-0};   // imm d:e:f:g:h
+}
+
+// Format AdvSIMD 3 scalar registers with same type
+
+class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
+                          dag outs, dag ins, string asmstr,
+                          list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+
+// Format AdvSIMD 2 vector registers miscellaneous
+class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
+                   dag outs, dag ins, string asmstr,
+                   list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 2 vector 1 immediate shift
+class NeonI_2VShiftImm<bit q, bit u, bits<5> opcode,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<7> Imm;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = Imm;
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD duplicate and insert
+class NeonI_copy<bit q, bit op, bits<4> imm4,
+                 dag outs, dag ins, string asmstr,
+                 list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Imm5;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = op;
+  let Inst{28-21} = 0b01110000;
+  let Inst{20-16} = Imm5;
+  let Inst{15} = 0b0;
+  let Inst{14-11} = imm4;
+  let Inst{10} = 0b1;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+// Format AdvSIMD insert from element to vector
+class NeonI_insert<bit q, bit op,
+                  dag outs, dag ins, string asmstr,
+                  list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<5> Imm5;
+  bits<4> Imm4;
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = op;
+  let Inst{28-21} = 0b01110000;
+  let Inst{20-16} = Imm5;
+  let Inst{15} = 0b0;
+  let Inst{14-11} = Imm4;
+  let Inst{10} = 0b1;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar pairwise
+class NeonI_ScalarPair<bit u, bits<2> size, bits<5> opcode,
+                          dag outs, dag ins, string asmstr,
+                          list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 2 vector across lanes
+class NeonI_2VAcross<bit q, bit u, bits<2> size, bits<5> opcode,
+                     dag outs, dag ins, string asmstr,
+                     list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b11000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar two registers miscellaneous
+class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode, dag outs, dag ins,
+                            string asmstr, list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD vector load/store multiple N-element structure
+class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size,
+                    dag outs, dag ins, string asmstr,
+                    list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011000;
+  let Inst{22} = l;
+  let Inst{21-16} = 0b000000;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load/store multiple N-element structure (post-index)
+class NeonI_LdStMult_Post<bit q, bit l, bits<4> opcode, bits<2> size,
+                         dag outs, dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011001;
+  let Inst{22} = l;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = size;
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+                      dag ins, string asmstr, list<dag> patterns,
+                      InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011010;
+  let Inst{22} = 0b1;
+  let Inst{21} = r;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-13} = opcode;
+  let Inst{12} = 0b0;
+  let Inst{11-10} = size;
+
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load/store Single N-element structure to/from one lane
+class NeonI_LdStOne_Lane<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+                         dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+  bits<4> lane;
+  let Inst{31} = 0b0;
+  let Inst{29-23} = 0b0011010;
+  let Inst{22} = l;
+  let Inst{21} = r;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-14} = op2_1;
+  let Inst{13} = op0;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup_Post<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+                           dag ins, string asmstr, list<dag> patterns,
+                           InstrItinClass itin>
+  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = q;
+  let Inst{29-23} = 0b0011011;
+  let Inst{22} = 0b1;
+  let Inst{21} = r;
+  // Inherit Rm in 20-16
+  let Inst{15-13} = opcode;
+  let Inst{12} = 0b0;
+  let Inst{11-10} = size;
+
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load/store Single N-element structure
+// to/from one lane
+class NeonI_LdStOne_Lane_Post<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+                         dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+  bits<4> lane;
+  let Inst{31} = 0b0;
+  let Inst{29-23} = 0b0011011;
+  let Inst{22} = l;
+  let Inst{21} = r;
+  // Inherit Rm in 20-16
+  let Inst{15-14} = op2_1;
+  let Inst{13} = op0;
+  
+  // Inherit Rn in 9-5
+  // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD 3 scalar registers with different type
+
+class NeonI_Scalar3Diff<bit u, bits<2> size, bits<4> opcode,
+                          dag outs, dag ins, string asmstr,
+                          list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-30} = 0b01;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b1;
+  // Inherit Rm in 20-16
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar shift by immediate
+
+class NeonI_ScalarShiftImm<bit u, bits<5> opcode,
+                           dag outs, dag ins, string asmstr,
+                           list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  bits<4> Imm4;
+  bits<3> Imm3;
+  let Inst{31-30} = 0b01;
+  let Inst{29} = u;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-19} = Imm4;
+  let Inst{18-16} = Imm3;
+  let Inst{15-11} = opcode;
+  let Inst{10} = 0b1;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD crypto AES
+class NeonI_Crypto_AES<bits<2> size, bits<5> opcode,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-24} = 0b01001110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10100;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD crypto SHA
+class NeonI_Crypto_SHA<bits<2> size, bits<5> opcode,
+                       dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-24} = 0b01011110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10100;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD crypto 3V SHA
+class NeonI_Crypto_3VSHA<bits<2> size, bits<3> opcode,
+                         dag outs, dag ins, string asmstr,
+                         list<dag> patterns, InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
+  let Inst{31-24} = 0b01011110;
+  let Inst{23-22} = size;
+  let Inst{21} = 0b0;
+  // Inherit Rm in 20-16
+  let Inst{15} = 0b0;
+  let Inst{14-12} = opcode;
+  let Inst{11-10} = 0b00;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD scalar x indexed element
+class NeonI_ScalarXIndexedElem<bit u, bit szhi, bit szlo,
+                               bits<4> opcode, dag outs, dag ins,
+                               string asmstr, list<dag> patterns,
+                               InstrItinClass itin>
+  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+  let Inst{31} = 0b0;
+  let Inst{30} = 0b1;
+  let Inst{29} = u;
+  let Inst{28-24} = 0b11111;
+  let Inst{23} = szhi;
+  let Inst{22} = szlo;
+  // l in Inst{21}
+  // m in Instr{20}
+  // Inherit Rm in 19-16
+  let Inst{15-12} = opcode;
+  // h in Inst{11}
+  let Inst{10} = 0b0;
+  // Inherit Rn in 9-5
+  // Inherit Rd in 4-0
+}
+// Format AdvSIMD scalar copy - insert from element to scalar
+class NeonI_ScalarCopy<dag outs, dag ins, string asmstr,
+                       list<dag> patterns, InstrItinClass itin>
+  : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> {
+  let Inst{28} = 0b1;
+}
+}
+
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index cf3a2c3..180110a 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -29,14 +29,14 @@
 
 #include <algorithm>
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "AArch64GenInstrInfo.inc"
 
 using namespace llvm;
 
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
   : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
-    RI(*this, STI), Subtarget(STI) {}
+    Subtarget(STI) {}
 
 void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I, DebugLoc DL,
@@ -68,43 +68,71 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg)
       .addImm(A64SysReg::NZCV);
   } else if (AArch64::GPR64RegClass.contains(DestReg)) {
-    assert(AArch64::GPR64RegClass.contains(SrcReg));
-    Opc = AArch64::ORRxxx_lsl;
-    ZeroReg = AArch64::XZR;
+    if(AArch64::GPR64RegClass.contains(SrcReg)){
+      Opc = AArch64::ORRxxx_lsl;
+      ZeroReg = AArch64::XZR;
+    } else{
+      assert(AArch64::FPR64RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::GPR32RegClass.contains(DestReg)) {
-    assert(AArch64::GPR32RegClass.contains(SrcReg));
-    Opc = AArch64::ORRwww_lsl;
-    ZeroReg = AArch64::WZR;
+    if(AArch64::GPR32RegClass.contains(SrcReg)){
+      Opc = AArch64::ORRwww_lsl;
+      ZeroReg = AArch64::WZR;
+    } else{
+      assert(AArch64::FPR32RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::FPR32RegClass.contains(DestReg)) {
-    assert(AArch64::FPR32RegClass.contains(SrcReg));
-    BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
-      .addReg(SrcReg);
-    return;
+    if(AArch64::FPR32RegClass.contains(SrcReg)){
+      BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
+    else {
+      assert(AArch64::GPR32RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::FPR64RegClass.contains(DestReg)) {
-    assert(AArch64::FPR64RegClass.contains(SrcReg));
-    BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
-      .addReg(SrcReg);
-    return;
+    if(AArch64::FPR64RegClass.contains(SrcReg)){
+      BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
+    else {
+      assert(AArch64::GPR64RegClass.contains(SrcReg));
+      BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg)
+        .addReg(SrcReg);
+      return;
+    }
   } else if (AArch64::FPR128RegClass.contains(DestReg)) {
     assert(AArch64::FPR128RegClass.contains(SrcReg));
 
-    // FIXME: there's no good way to do this, at least without NEON:
-    //   + There's no single move instruction for q-registers
-    //   + We can't create a spill slot and use normal STR/LDR because stack
-    //     allocation has already happened
-    //   + We can't go via X-registers with FMOV because register allocation has
-    //     already happened.
-    // This may not be efficient, but at least it works.
-    BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
-      .addReg(SrcReg)
-      .addReg(AArch64::XSP)
-      .addImm(0x1ff & -16);
-
-    BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
-      .addReg(AArch64::XSP, RegState::Define)
-      .addReg(AArch64::XSP)
-      .addImm(16);
-    return;
+    // If NEON is enable, we use ORR to implement this copy.
+    // If NEON isn't available, emit STR and LDR to handle this.
+    if(getSubTarget().hasNEON()) {
+      BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg)
+        .addReg(SrcReg)
+        .addReg(SrcReg);
+      return;
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
+        .addReg(SrcReg)
+        .addReg(AArch64::XSP)
+        .addImm(0x1ff & -16);
+
+      BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
+        .addReg(AArch64::XSP, RegState::Define)
+        .addReg(AArch64::XSP)
+        .addImm(16);
+      return;
+    }
   } else {
     llvm_unreachable("Unknown register class in copyPhysReg");
   }
@@ -116,17 +144,6 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     .addImm(0);
 }
 
-MachineInstr *
-AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
-                                           uint64_t Offset, const MDNode *MDPtr,
-                                           DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
-    .addFrameIndex(FrameIx).addImm(0)
-    .addImm(Offset)
-    .addMetadata(MDPtr);
-  return &*MIB;
-}
-
 /// Does the Opcode represent a conditional branch that we can remove and re-add
 /// at the end of a basic block?
 static bool isCondBranch(unsigned Opc) {
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 22a2ab4..620ecc9 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -43,10 +43,6 @@ public:
                    unsigned DestReg, unsigned SrcReg,
                    bool KillSrc) const;
 
-  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
-                                         uint64_t Offset, const MDNode *MDPtr,
-                                         DebugLoc DL) const;
-
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
                            unsigned SrcReg, bool isKill, int FrameIndex,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index d2cfc7d..23d81fc 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -11,6 +11,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                               AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
+def HasNEON          : Predicate<"Subtarget->hasNEON()">,
+                                 AssemblerPredicate<"FeatureNEON", "neon">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                                 AssemblerPredicate<"FeatureCrypto","crypto">;
+
+// Use fused MAC if more precision in FP computation is allowed.
+def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast)">;
 include "AArch64InstrFormats.td"
 
 //===----------------------------------------------------------------------===//
@@ -114,6 +127,8 @@ def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>;
 
 def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>;
 
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+
 //===----------------------------------------------------------------------===//
 // Call sequence pseudo-instructions
 //===----------------------------------------------------------------------===//
@@ -1263,7 +1278,7 @@ def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
 
 // UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
 // use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
-def : Pat<(zext i32:$Rn), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
+def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
                                          sub_32)>;
 
 //===-------------------------------
@@ -1967,6 +1982,13 @@ def fpz64 : Operand<f64>,
   let DecoderMethod = "DecodeFPZeroOperand";
 }
 
+def fpz64movi : Operand<i64>,
+            ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
+  let ParserMatchClass = fpzero_asmoperand;
+  let PrintMethod = "printFPZeroOperand";
+  let DecoderMethod = "DecodeFPZeroOperand";
+}
+
 multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
   def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
                           (outs), ins, "fcmp\t$Rn, $Rm", [pattern],
@@ -2173,6 +2195,29 @@ def FMSUBdddd  : A64I_fpdp3Impl<"fmsub",  FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
 def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
 def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
 
+// Extra patterns for when we're allowed to optimise separate multiplication and
+// addition.
+let Predicates = [HasFPARMv8, UseFusedMAC] in {
+def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
+          (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
+          (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+def : Pat<(f32 (fsub (f32 (fmul FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
+          (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul FPR32:$Rn, FPR32:$Rm)))),
+          (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
+          (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
+          (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+def : Pat<(f64 (fsub (f64 (fmul FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
+          (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul FPR64:$Rn, FPR64:$Rm)))),
+          (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+}
+
+
 //===----------------------------------------------------------------------===//
 // Floating-point <-> fixed-point conversion instructions
 //===----------------------------------------------------------------------===//
@@ -2308,6 +2353,7 @@ defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">;
 defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">;
 defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">;
 
+let Predicates = [HasFPARMv8] in {
 def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>;
 def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>;
 def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>;
@@ -2316,6 +2362,7 @@ def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>;
 def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>;
 def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>;
 def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>;
+}
 
 multiclass A64I_inttofp<bit o0, string asmop> {
   def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>;
@@ -2327,6 +2374,7 @@ multiclass A64I_inttofp<bit o0, string asmop> {
 defm S : A64I_inttofp<0b0, "scvtf">;
 defm U : A64I_inttofp<0b1, "ucvtf">;
 
+let Predicates = [HasFPARMv8] in {
 def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>;
 def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>;
 def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>;
@@ -2335,16 +2383,19 @@ def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>;
 def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>;
 def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>;
 def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>;
+}
 
 def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">;
 def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">;
 def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">;
 def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">;
 
+let Predicates = [HasFPARMv8] in {
 def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>;
 def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>;
 def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>;
 def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>;
+}
 
 def lane1_asmoperand : AsmOperandClass {
   let Name = "Lane1";
@@ -2367,11 +2418,13 @@ let DecoderMethod =  "DecodeFMOVLaneInstruction" in {
                           "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>;
 }
 
+let Predicates = [HasFPARMv8] in {
 def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
                 (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
 
 def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
                 (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
+}
 
 //===----------------------------------------------------------------------===//
 // Floating-point immediate instructions
@@ -2465,11 +2518,15 @@ let mayLoad = 1 in {
   def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
 }
 
+let Predicates = [HasFPARMv8] in {
 def LDRs_lit  : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
 def LDRd_lit  : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
+}
 
 let mayLoad = 1 in {
+  let Predicates = [HasFPARMv8] in {
   def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
+  }
 
 
   def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
@@ -3063,6 +3120,7 @@ defm LS32
 defm LS64
   : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>;
 
+let Predicates = [HasFPARMv8] in {
 // STR/LDR to/from a B register
 defm LSFP8
   : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>;
@@ -3081,6 +3139,7 @@ defm LSFP64
 defm LSFP128
   : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128,
                          qword_addrparams>;
+}
 
 //===------------------------------
 // 2.3 Signed loads
@@ -3536,10 +3595,13 @@ multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
 
 defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">;
 defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">;
+
+let Predicates = [HasFPARMv8] in {
 defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">;
 defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64,  dword_simm7, "LSFPPair64">;
 defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7,
                                   "LSFPPair128">;
+}
 
 
 def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1,
@@ -3974,14 +4036,17 @@ def : movalias<MOVZxii, GPR64, movz64_movimm>;
 def : movalias<MOVNwii, GPR32, movn32_movimm>;
 def : movalias<MOVNxii, GPR64, movn64_movimm>;
 
-def movw_addressref : ComplexPattern<i64, 2, "SelectMOVWAddressRef">;
+def movw_addressref_g0 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<0>">;
+def movw_addressref_g1 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<1>">;
+def movw_addressref_g2 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<2>">;
+def movw_addressref_g3 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<3>">;
 
-def : Pat<(A64WrapperLarge movw_addressref:$G3, movw_addressref:$G2,
-                           movw_addressref:$G1, movw_addressref:$G0),
-          (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref:$G3),
-                                     movw_addressref:$G2),
-                            movw_addressref:$G1),
-                   movw_addressref:$G0)>;
+def : Pat<(A64WrapperLarge movw_addressref_g3:$G3, movw_addressref_g2:$G2,
+                           movw_addressref_g1:$G1, movw_addressref_g0:$G0),
+          (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref_g3:$G3),
+                                     movw_addressref_g2:$G2),
+                            movw_addressref_g1:$G1),
+                   movw_addressref_g0:$G0)>;
 
 //===----------------------------------------------------------------------===//
 // PC-relative addressing instructions
@@ -5120,3 +5185,9 @@ defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm),
 
 defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)),
                    (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "AArch64InstrNEON.td"
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
new file mode 100644
index 0000000..d71749d
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -0,0 +1,8671 @@
+//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the AArch64 NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+def Neon_bsl       : SDNode<"AArch64ISD::NEON_BSL", SDTypeProfile<1, 3,
+                      [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                      SDTCisSameAs<0, 3>]>>;
+
+// (outs Result), (ins Imm, OpCmode)
+def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+
+def Neon_movi     : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>;
+
+def Neon_mvni     : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>;
+
+// (outs Result), (ins Imm)
+def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1,
+                        [SDTCisVec<0>, SDTCisVT<1, i32>]>>;
+
+// (outs Result), (ins LHS, RHS, CondCode)
+def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3,
+                 [SDTCisVec<0>,  SDTCisSameAs<1, 2>]>>;
+
+// (outs Result), (ins LHS, 0/0.0 constant, CondCode)
+def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3,
+                 [SDTCisVec<0>,  SDTCisVec<1>]>>;
+
+// (outs Result), (ins LHS, RHS)
+def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
+                 [SDTCisVec<0>,  SDTCisSameAs<1, 2>]>>;
+
+def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                     SDTCisVT<2, i32>]>;
+def Neon_sqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
+def Neon_uqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
+
+def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                               SDTCisSameAs<0, 2>]>;
+def Neon_uzp1    : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>;
+def Neon_uzp2    : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>;
+def Neon_zip1    : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>;
+def Neon_zip2    : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>;
+def Neon_trn1    : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>;
+def Neon_trn2    : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>;
+
+def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def Neon_rev64    : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>;
+def Neon_rev32    : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>;
+def Neon_rev16    : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>;
+def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
+                       [SDTCisVec<0>]>>;
+def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
+                           [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;
+def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3,
+                           [SDTCisVec<0>,  SDTCisSameAs<0, 1>,
+                           SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>;
+
+def SDT_assertext : SDTypeProfile<1, 1,
+  [SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 0>]>;
+def assertsext : SDNode<"ISD::AssertSext", SDT_assertext>;
+def assertzext : SDNode<"ISD::AssertZext", SDT_assertext>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size,  bits<5> opcode,
+                                string asmop, SDPatternOperator opnode8B,
+                                SDPatternOperator opnode16B,
+                                bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8B :  NeonI_3VSame<0b0, u, size, opcode,
+               (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+               asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
+               [(set (v8i8 VPR64:$Rd),
+                  (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
+               NoItinerary>;
+
+    def _16B : NeonI_3VSame<0b1, u, size, opcode,
+               (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+               asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
+               [(set (v16i8 VPR128:$Rd),
+                  (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
+               NoItinerary>;
+  }
+
+}
+
+multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
+                                  string asmop, SDPatternOperator opnode,
+                                  bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _4H : NeonI_3VSame<0b0, u, 0b01, opcode,
+              (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+              asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h",
+              [(set (v4i16 VPR64:$Rd),
+                 (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))],
+              NoItinerary>;
+
+    def _8H : NeonI_3VSame<0b1, u, 0b01, opcode,
+              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+              asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h",
+              [(set (v8i16 VPR128:$Rd),
+                 (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))],
+              NoItinerary>;
+
+    def _2S : NeonI_3VSame<0b0, u, 0b10, opcode,
+              (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+              asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
+              [(set (v2i32 VPR64:$Rd),
+                 (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))],
+              NoItinerary>;
+
+    def _4S : NeonI_3VSame<0b1, u, 0b10, opcode,
+              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+              asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+              [(set (v4i32 VPR128:$Rd),
+                 (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))],
+              NoItinerary>;
+  }
+}
+multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
+                                  string asmop, SDPatternOperator opnode,
+                                  bit Commutable = 0>
+   : NeonI_3VSame_HS_sizes<u, opcode,  asmop, opnode, Commutable> {
+  let isCommutable = Commutable in {
+    def _8B :  NeonI_3VSame<0b0, u, 0b00, opcode,
+               (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+               asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
+               [(set (v8i8 VPR64:$Rd),
+                  (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
+               NoItinerary>;
+
+    def _16B : NeonI_3VSame<0b1, u, 0b00, opcode,
+               (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+               asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
+               [(set (v16i8 VPR128:$Rd),
+                  (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
+               NoItinerary>;
+  }
+}
+
+multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode,
+                                   string asmop, SDPatternOperator opnode,
+                                   bit Commutable = 0>
+   : NeonI_3VSame_BHS_sizes<u, opcode,  asmop, opnode, Commutable> {
+  let isCommutable = Commutable in {
+    def _2D : NeonI_3VSame<0b1, u, 0b11, opcode,
+              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+              asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
+              [(set (v2i64 VPR128:$Rd),
+                 (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))],
+              NoItinerary>;
+  }
+}
+
+// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types,
+// but Result types can be integer or floating point types.
+multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
+                                 string asmop, SDPatternOperator opnode2S,
+                                 SDPatternOperator opnode4S,
+                                 SDPatternOperator opnode2D,
+                                 ValueType ResTy2S, ValueType ResTy4S,
+                                 ValueType ResTy2D, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode,
+              (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+              asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
+              [(set (ResTy2S VPR64:$Rd),
+                 (ResTy2S (opnode2S (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))],
+              NoItinerary>;
+
+    def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode,
+              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+              asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+              [(set (ResTy4S VPR128:$Rd),
+                 (ResTy4S (opnode4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))],
+              NoItinerary>;
+
+    def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode,
+              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+              asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
+              [(set (ResTy2D VPR128:$Rd),
+                 (ResTy2D (opnode2D (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))],
+               NoItinerary>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions
+//===----------------------------------------------------------------------===//
+
+// Vector Arithmetic Instructions
+
+// Vector Add (Integer and Floating-Point)
+
+defm ADDvvv :  NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>;
+defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, fadd, fadd,
+                                     v2f32, v4f32, v2f64, 1>;
+
+// Vector Sub (Integer and Floating-Point)
+
+defm SUBvvv :  NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>;
+defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, fsub, fsub,
+                                     v2f32, v4f32, v2f64, 0>;
+
+// Vector Multiply (Integer and Floating-Point)
+
+defm MULvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>;
+defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, fmul, fmul,
+                                     v2f32, v4f32, v2f64, 1>;
+
+// Vector Multiply (Polynomial)
+
+defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul",
+                                    int_arm_neon_vmulp, int_arm_neon_vmulp, 1>;
+
+// Vector Multiply-accumulate and Multiply-subtract (Integer)
+
+// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and
+// two operands constraints.
+class NeonI_3VSame_Constraint_impl<string asmop, string asmlane,
+  RegisterOperand VPRC, ValueType OpTy, bit q, bit u, bits<2> size,
+  bits<5> opcode, SDPatternOperator opnode>
+  : NeonI_3VSame<q, u, size, opcode,
+    (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, VPRC:$Rm),
+    asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane,
+    [(set (OpTy VPRC:$Rd),
+       (OpTy (opnode (OpTy VPRC:$src), (OpTy VPRC:$Rn), (OpTy VPRC:$Rm))))],
+    NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                       (add node:$Ra, (mul node:$Rn, node:$Rm))>;
+
+def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                       (sub node:$Ra, (mul node:$Rn, node:$Rm))>;
+
+
+def MLAvvv_8B:  NeonI_3VSame_Constraint_impl<"mla", ".8b",  VPR64,  v8i8,
+                                             0b0, 0b0, 0b00, 0b10010, Neon_mla>;
+def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8,
+                                             0b1, 0b0, 0b00, 0b10010, Neon_mla>;
+def MLAvvv_4H:  NeonI_3VSame_Constraint_impl<"mla", ".4h",  VPR64,  v4i16,
+                                             0b0, 0b0, 0b01, 0b10010, Neon_mla>;
+def MLAvvv_8H:  NeonI_3VSame_Constraint_impl<"mla", ".8h",  VPR128, v8i16,
+                                             0b1, 0b0, 0b01, 0b10010, Neon_mla>;
+def MLAvvv_2S:  NeonI_3VSame_Constraint_impl<"mla", ".2s",  VPR64,  v2i32,
+                                             0b0, 0b0, 0b10, 0b10010, Neon_mla>;
+def MLAvvv_4S:  NeonI_3VSame_Constraint_impl<"mla", ".4s",  VPR128, v4i32,
+                                             0b1, 0b0, 0b10, 0b10010, Neon_mla>;
+
+def MLSvvv_8B:  NeonI_3VSame_Constraint_impl<"mls", ".8b",  VPR64,  v8i8,
+                                             0b0, 0b1, 0b00, 0b10010, Neon_mls>;
+def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8,
+                                             0b1, 0b1, 0b00, 0b10010, Neon_mls>;
+def MLSvvv_4H:  NeonI_3VSame_Constraint_impl<"mls", ".4h",  VPR64,  v4i16,
+                                             0b0, 0b1, 0b01, 0b10010, Neon_mls>;
+def MLSvvv_8H:  NeonI_3VSame_Constraint_impl<"mls", ".8h",  VPR128, v8i16,
+                                             0b1, 0b1, 0b01, 0b10010, Neon_mls>;
+def MLSvvv_2S:  NeonI_3VSame_Constraint_impl<"mls", ".2s",  VPR64,  v2i32,
+                                             0b0, 0b1, 0b10, 0b10010, Neon_mls>;
+def MLSvvv_4S:  NeonI_3VSame_Constraint_impl<"mls", ".4s",  VPR128, v4i32,
+                                             0b1, 0b1, 0b10, 0b10010, Neon_mls>;
+
+// Vector Multiply-accumulate and Multiply-subtract (Floating Point)
+
+def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                        (fadd node:$Ra, (fmul node:$Rn, node:$Rm))>;
+
+def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                        (fsub node:$Ra, (fmul node:$Rn, node:$Rm))>;
+
+let Predicates = [HasNEON, UseFusedMAC] in {
+def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s",  VPR64,  v2f32,
+                                             0b0, 0b0, 0b00, 0b11001, Neon_fmla>;
+def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s",  VPR128, v4f32,
+                                             0b1, 0b0, 0b00, 0b11001, Neon_fmla>;
+def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d",  VPR128, v2f64,
+                                             0b1, 0b0, 0b01, 0b11001, Neon_fmla>;
+
+def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s",  VPR64,  v2f32,
+                                              0b0, 0b0, 0b10, 0b11001, Neon_fmls>;
+def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s",  VPR128, v4f32,
+                                             0b1, 0b0, 0b10, 0b11001, Neon_fmls>;
+def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d",  VPR128, v2f64,
+                                             0b1, 0b0, 0b11, 0b11001, Neon_fmls>;
+}
+
+// We're also allowed to match the fma instruction regardless of compile
+// options.
+def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)),
+          (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
+def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
+          (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
+          (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+
+def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)),
+          (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
+def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
+          (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
+          (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+
+// Vector Divide (Floating-Point)
+
+defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, fdiv, fdiv,
+                                     v2f32, v4f32, v2f64, 0>;
+
+// Vector Bitwise Operations
+
+// Vector Bitwise AND
+
+defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>;
+
+// Vector Bitwise Exclusive OR
+
+defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>;
+
+// Vector Bitwise OR
+
+defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>;
+
+// ORR disassembled as MOV if Vn==Vm
+
+// Vector Move - register
+// Alias for ORR if Vn=Vm.
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
+def : NeonInstAlias<"mov $Rd.8b, $Rn.8b",
+                    (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>;
+def : NeonInstAlias<"mov $Rd.16b, $Rn.16b",
+                    (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>;
+
+// The MOVI instruction takes two immediate operands.  The first is the
+// immediate encoding, while the second is the cmode.  A cmode of 14, or
+// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC.
+def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>;
+def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>;
+
+def Neon_not8B  : PatFrag<(ops node:$in),
+                          (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>;
+def Neon_not16B : PatFrag<(ops node:$in),
+                          (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>;
+
+def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm),
+                         (or node:$Rn, (Neon_not8B node:$Rm))>;
+
+def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm),
+                          (or node:$Rn, (Neon_not16B node:$Rm))>;
+
+def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm),
+                         (and node:$Rn, (Neon_not8B node:$Rm))>;
+
+def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm),
+                          (and node:$Rn, (Neon_not16B node:$Rm))>;
+
+
+// Vector Bitwise OR NOT - register
+
+defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn",
+                                   Neon_orn8B, Neon_orn16B, 0>;
+
+// Vector Bitwise Bit Clear (AND NOT) - register
+
+defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic",
+                                   Neon_bic8B, Neon_bic16B, 0>;
+
+multiclass Neon_bitwise2V_patterns<SDPatternOperator opnode8B,
+                                   SDPatternOperator opnode16B,
+                                   Instruction INST8B,
+                                   Instruction INST16B> {
+  def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)),
+            (INST16B VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)),
+            (INST16B VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)),
+            (INST16B VPR128:$Rn, VPR128:$Rm)>;
+}
+
+// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN
+defm : Neon_bitwise2V_patterns<and, and, ANDvvv_8B, ANDvvv_16B>;
+defm : Neon_bitwise2V_patterns<or,  or,  ORRvvv_8B, ORRvvv_16B>;
+defm : Neon_bitwise2V_patterns<xor, xor, EORvvv_8B, EORvvv_16B>;
+defm : Neon_bitwise2V_patterns<Neon_bic8B, Neon_bic16B, BICvvv_8B, BICvvv_16B>;
+defm : Neon_bitwise2V_patterns<Neon_orn8B, Neon_orn16B, ORNvvv_8B, ORNvvv_16B>;
+
+//   Vector Bitwise Select
+def BSLvvv_8B  : NeonI_3VSame_Constraint_impl<"bsl", ".8b",  VPR64, v8i8,
+                                              0b0, 0b1, 0b01, 0b00011, Neon_bsl>;
+
+def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8,
+                                              0b1, 0b1, 0b01, 0b00011, Neon_bsl>;
+
+multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
+                                   Instruction INST8B,
+                                   Instruction INST16B> {
+  // Disassociate type from instruction definition
+  def : Pat<(v2i32 (opnode VPR64:$src,VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v4i16 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v1i64 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v4i32 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v8i16 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v2i64 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+
+  // Allow to match BSL instruction pattern with non-constant operand
+  def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd),
+                    (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd),
+                     (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd),
+                     (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd),
+                     (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd),
+                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd),
+                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd),
+                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd),
+                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+
+  // Allow to match llvm.arm.* intrinsics.
+  def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src),
+                    (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src),
+                    (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src),
+                    (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src),
+                    (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src),
+                    (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src),
+                    (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src),
+                    (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src),
+                    (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src),
+                    (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src),
+                    (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src),
+                    (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+  def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src),
+                    (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
+            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+}
+
+// Additional patterns for bitwise instruction BSL
+defm: Neon_bitwise3V_patterns<Neon_bsl, BSLvvv_8B, BSLvvv_16B>;
+
+def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm),
+                           (Neon_bsl node:$src, node:$Rn, node:$Rm),
+                           [{ (void)N; return false; }]>;
+
+// Vector Bitwise Insert if True
+
+def BITvvv_8B  : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64,   v8i8,
+                   0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
+def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8,
+                   0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
+
+// Vector Bitwise Insert if False
+
+def BIFvvv_8B  : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64,  v8i8,
+                                0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
+def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8,
+                                0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
+
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+
+def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                       (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>;
+def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                       (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>;
+
+// Vector Absolute Difference and Accumulate (Unsigned)
+def UABAvvv_8B :  NeonI_3VSame_Constraint_impl<"uaba", ".8b",  VPR64,  v8i8,
+                    0b0, 0b1, 0b00, 0b01111, Neon_uaba>;
+def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8,
+                    0b1, 0b1, 0b00, 0b01111, Neon_uaba>;
+def UABAvvv_4H :  NeonI_3VSame_Constraint_impl<"uaba", ".4h",  VPR64,  v4i16,
+                    0b0, 0b1, 0b01, 0b01111, Neon_uaba>;
+def UABAvvv_8H :  NeonI_3VSame_Constraint_impl<"uaba", ".8h",  VPR128, v8i16,
+                    0b1, 0b1, 0b01, 0b01111, Neon_uaba>;
+def UABAvvv_2S :  NeonI_3VSame_Constraint_impl<"uaba", ".2s",  VPR64,  v2i32,
+                    0b0, 0b1, 0b10, 0b01111, Neon_uaba>;
+def UABAvvv_4S :  NeonI_3VSame_Constraint_impl<"uaba", ".4s",  VPR128, v4i32,
+                    0b1, 0b1, 0b10, 0b01111, Neon_uaba>;
+
+// Vector Absolute Difference and Accumulate (Signed)
+def SABAvvv_8B :  NeonI_3VSame_Constraint_impl<"saba", ".8b",  VPR64,  v8i8,
+                    0b0, 0b0, 0b00, 0b01111, Neon_saba>;
+def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8,
+                    0b1, 0b0, 0b00, 0b01111, Neon_saba>;
+def SABAvvv_4H :  NeonI_3VSame_Constraint_impl<"saba", ".4h",  VPR64,  v4i16,
+                    0b0, 0b0, 0b01, 0b01111, Neon_saba>;
+def SABAvvv_8H :  NeonI_3VSame_Constraint_impl<"saba", ".8h",  VPR128, v8i16,
+                    0b1, 0b0, 0b01, 0b01111, Neon_saba>;
+def SABAvvv_2S :  NeonI_3VSame_Constraint_impl<"saba", ".2s",  VPR64,  v2i32,
+                    0b0, 0b0, 0b10, 0b01111, Neon_saba>;
+def SABAvvv_4S :  NeonI_3VSame_Constraint_impl<"saba", ".4s",  VPR128, v4i32,
+                    0b1, 0b0, 0b10, 0b01111, Neon_saba>;
+
+
+// Vector Absolute Difference (Signed, Unsigned)
+defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>;
+defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>;
+
+// Vector Absolute Difference (Floating Point)
+defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd",
+                                    int_arm_neon_vabds, int_arm_neon_vabds,
+                                    int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>;
+
+// Vector Reciprocal Step (Floating Point)
+defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps",
+                                       int_arm_neon_vrecps, int_arm_neon_vrecps,
+                                       int_arm_neon_vrecps,
+                                       v2f32, v4f32, v2f64, 0>;
+
+// Vector Reciprocal Square Root Step (Floating Point)
+defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts",
+                                        int_arm_neon_vrsqrts,
+                                        int_arm_neon_vrsqrts,
+                                        int_arm_neon_vrsqrts,
+                                        v2f32, v4f32, v2f64, 0>;
+
+// Vector Comparisons
+
+def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs),
+                        (Neon_cmp node:$lhs, node:$rhs, SETEQ)>;
+def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs),
+                         (Neon_cmp node:$lhs, node:$rhs, SETUGE)>;
+def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs),
+                        (Neon_cmp node:$lhs, node:$rhs, SETGE)>;
+def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs),
+                        (Neon_cmp node:$lhs, node:$rhs, SETUGT)>;
+def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs),
+                        (Neon_cmp node:$lhs, node:$rhs, SETGT)>;
+
+// NeonI_compare_aliases class: swaps register operands to implement
+// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed.
+class NeonI_compare_aliases<string asmop, string asmlane,
+                            Instruction inst, RegisterOperand VPRC>
+  : NeonInstAlias<asmop # "\t$Rd" # asmlane #", $Rn" # asmlane #
+                    ", $Rm" # asmlane,
+                  (inst VPRC:$Rd, VPRC:$Rm, VPRC:$Rn), 0b0>;
+
+// Vector Comparisons (Integer)
+
+// Vector Compare Mask Equal (Integer)
+let isCommutable =1 in {
+defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>;
+}
+
+// Vector Compare Mask Higher or Same (Unsigned Integer)
+defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>;
+
+// Vector Compare Mask Greater Than or Equal (Integer)
+defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>;
+
+// Vector Compare Mask Higher (Unsigned Integer)
+defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>;
+
+// Vector Compare Mask Greater Than (Integer)
+defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>;
+
+// Vector Compare Mask Bitwise Test (Integer)
+defm CMTSTvvv:  NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>;
+
+// Vector Compare Mask Less or Same (Unsigned Integer)
+// CMLS is alias for CMHS with operands reversed.
+def CMLSvvv_8B  : NeonI_compare_aliases<"cmls", ".8b",  CMHSvvv_8B,  VPR64>;
+def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>;
+def CMLSvvv_4H  : NeonI_compare_aliases<"cmls", ".4h",  CMHSvvv_4H,  VPR64>;
+def CMLSvvv_8H  : NeonI_compare_aliases<"cmls", ".8h",  CMHSvvv_8H,  VPR128>;
+def CMLSvvv_2S  : NeonI_compare_aliases<"cmls", ".2s",  CMHSvvv_2S,  VPR64>;
+def CMLSvvv_4S  : NeonI_compare_aliases<"cmls", ".4s",  CMHSvvv_4S,  VPR128>;
+def CMLSvvv_2D  : NeonI_compare_aliases<"cmls", ".2d",  CMHSvvv_2D,  VPR128>;
+
+// Vector Compare Mask Less Than or Equal (Integer)
+// CMLE is alias for CMGE with operands reversed.
+def CMLEvvv_8B  : NeonI_compare_aliases<"cmle", ".8b",  CMGEvvv_8B,  VPR64>;
+def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>;
+def CMLEvvv_4H  : NeonI_compare_aliases<"cmle", ".4h",  CMGEvvv_4H,  VPR64>;
+def CMLEvvv_8H  : NeonI_compare_aliases<"cmle", ".8h",  CMGEvvv_8H,  VPR128>;
+def CMLEvvv_2S  : NeonI_compare_aliases<"cmle", ".2s",  CMGEvvv_2S,  VPR64>;
+def CMLEvvv_4S  : NeonI_compare_aliases<"cmle", ".4s",  CMGEvvv_4S,  VPR128>;
+def CMLEvvv_2D  : NeonI_compare_aliases<"cmle", ".2d",  CMGEvvv_2D,  VPR128>;
+
+// Vector Compare Mask Lower (Unsigned Integer)
+// CMLO is alias for CMHI with operands reversed.
+def CMLOvvv_8B  : NeonI_compare_aliases<"cmlo", ".8b",  CMHIvvv_8B,  VPR64>;
+def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>;
+def CMLOvvv_4H  : NeonI_compare_aliases<"cmlo", ".4h",  CMHIvvv_4H,  VPR64>;
+def CMLOvvv_8H  : NeonI_compare_aliases<"cmlo", ".8h",  CMHIvvv_8H,  VPR128>;
+def CMLOvvv_2S  : NeonI_compare_aliases<"cmlo", ".2s",  CMHIvvv_2S,  VPR64>;
+def CMLOvvv_4S  : NeonI_compare_aliases<"cmlo", ".4s",  CMHIvvv_4S,  VPR128>;
+def CMLOvvv_2D  : NeonI_compare_aliases<"cmlo", ".2d",  CMHIvvv_2D,  VPR128>;
+
+// Vector Compare Mask Less Than (Integer)
+// CMLT is alias for CMGT with operands reversed.
+def CMLTvvv_8B  : NeonI_compare_aliases<"cmlt", ".8b",  CMGTvvv_8B,  VPR64>;
+def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>;
+def CMLTvvv_4H  : NeonI_compare_aliases<"cmlt", ".4h",  CMGTvvv_4H,  VPR64>;
+def CMLTvvv_8H  : NeonI_compare_aliases<"cmlt", ".8h",  CMGTvvv_8H,  VPR128>;
+def CMLTvvv_2S  : NeonI_compare_aliases<"cmlt", ".2s",  CMGTvvv_2S,  VPR64>;
+def CMLTvvv_4S  : NeonI_compare_aliases<"cmlt", ".4s",  CMGTvvv_4S,  VPR128>;
+def CMLTvvv_2D  : NeonI_compare_aliases<"cmlt", ".2d",  CMGTvvv_2D,  VPR128>;
+
+
+def neon_uimm0_asmoperand : AsmOperandClass
+{
+  let Name = "UImm0";
+  let PredicateMethod = "isUImm<0>";
+  let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm0 : Operand<i32>, ImmLeaf<i32, [{return Imm == 0;}]> {
+  let ParserMatchClass = neon_uimm0_asmoperand;
+  let PrintMethod = "printNeonUImm0Operand";
+
+}
+
+multiclass NeonI_cmpz_sizes<bit u, bits<5> opcode, string asmop, CondCode CC>
+{
+  def _8B :  NeonI_2VMisc<0b0, u, 0b00, opcode,
+             (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
+             asmop # "\t$Rd.8b, $Rn.8b, $Imm",
+             [(set (v8i8 VPR64:$Rd),
+                (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))],
+             NoItinerary>;
+
+  def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode,
+             (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+             asmop # "\t$Rd.16b, $Rn.16b, $Imm",
+             [(set (v16i8 VPR128:$Rd),
+                (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+             NoItinerary>;
+
+  def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode,
+            (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
+            asmop # "\t$Rd.4h, $Rn.4h, $Imm",
+            [(set (v4i16 VPR64:$Rd),
+               (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))],
+            NoItinerary>;
+
+  def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode,
+            (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+            asmop # "\t$Rd.8h, $Rn.8h, $Imm",
+            [(set (v8i16 VPR128:$Rd),
+               (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+            NoItinerary>;
+
+  def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode,
+            (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
+            asmop # "\t$Rd.2s, $Rn.2s, $Imm",
+            [(set (v2i32 VPR64:$Rd),
+               (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))],
+            NoItinerary>;
+
+  def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode,
+            (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+            asmop # "\t$Rd.4s, $Rn.4s, $Imm",
+            [(set (v4i32 VPR128:$Rd),
+               (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+            NoItinerary>;
+
+  def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode,
+            (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+            asmop # "\t$Rd.2d, $Rn.2d, $Imm",
+            [(set (v2i64 VPR128:$Rd),
+               (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+            NoItinerary>;
+}
+
+// Vector Compare Mask Equal to Zero (Integer)
+defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>;
+
+// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>;
+
+// Vector Compare Mask Greater Than Zero (Signed Integer)
+defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>;
+
+// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>;
+
+// Vector Compare Mask Less Than Zero (Signed Integer)
+defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>;
+
+// Vector Comparisons (Floating Point)
+
+// Vector Compare Mask Equal (Floating Point)
+let isCommutable =1 in {
+defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq,
+                                      Neon_cmeq, Neon_cmeq,
+                                      v2i32, v4i32, v2i64, 0>;
+}
+
+// Vector Compare Mask Greater Than Or Equal (Floating Point)
+defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge,
+                                      Neon_cmge, Neon_cmge,
+                                      v2i32, v4i32, v2i64, 0>;
+
+// Vector Compare Mask Greater Than (Floating Point)
+defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt,
+                                      Neon_cmgt, Neon_cmgt,
+                                      v2i32, v4i32, v2i64, 0>;
+
+// Vector Compare Mask Less Than Or Equal (Floating Point)
+// FCMLE is alias for FCMGE with operands reversed.
+def FCMLEvvv_2S  : NeonI_compare_aliases<"fcmle", ".2s",  FCMGEvvv_2S,  VPR64>;
+def FCMLEvvv_4S  : NeonI_compare_aliases<"fcmle", ".4s",  FCMGEvvv_4S,  VPR128>;
+def FCMLEvvv_2D  : NeonI_compare_aliases<"fcmle", ".2d",  FCMGEvvv_2D,  VPR128>;
+
+// Vector Compare Mask Less Than (Floating Point)
+// FCMLT is alias for FCMGT with operands reversed.
+def FCMLTvvv_2S  : NeonI_compare_aliases<"fcmlt", ".2s",  FCMGTvvv_2S,  VPR64>;
+def FCMLTvvv_4S  : NeonI_compare_aliases<"fcmlt", ".4s",  FCMGTvvv_4S,  VPR128>;
+def FCMLTvvv_2D  : NeonI_compare_aliases<"fcmlt", ".2d",  FCMGTvvv_2D,  VPR128>;
+
+
+multiclass NeonI_fpcmpz_sizes<bit u, bit size, bits<5> opcode,
+                              string asmop, CondCode CC>
+{
+  def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode,
+            (outs VPR64:$Rd), (ins VPR64:$Rn, fpz32:$FPImm),
+            asmop # "\t$Rd.2s, $Rn.2s, $FPImm",
+            [(set (v2i32 VPR64:$Rd),
+               (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpimm:$FPImm), CC)))],
+            NoItinerary>;
+
+  def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode,
+            (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm),
+            asmop # "\t$Rd.4s, $Rn.4s, $FPImm",
+            [(set (v4i32 VPR128:$Rd),
+               (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))],
+            NoItinerary>;
+
+  def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode,
+            (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm),
+            asmop # "\t$Rd.2d, $Rn.2d, $FPImm",
+            [(set (v2i64 VPR128:$Rd),
+               (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))],
+            NoItinerary>;
+}
+
+// Vector Compare Mask Equal to Zero (Floating Point)
+defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>;
+
+// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>;
+
+// Vector Compare Mask Greater Than Zero (Floating Point)
+defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>;
+
+// Vector Compare Mask Less Than or Equal To Zero (Floating Point)
+defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>;
+
+// Vector Compare Mask Less Than Zero (Floating Point)
+defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>;
+
+// Vector Absolute Comparisons (Floating Point)
+
+// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point)
+defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge",
+                                      int_arm_neon_vacged, int_arm_neon_vacgeq,
+                                      int_aarch64_neon_vacgeq,
+                                      v2i32, v4i32, v2i64, 0>;
+
+// Vector Absolute Compare Mask Greater Than (Floating Point)
+defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt",
+                                      int_arm_neon_vacgtd, int_arm_neon_vacgtq,
+                                      int_aarch64_neon_vacgtq,
+                                      v2i32, v4i32, v2i64, 0>;
+
+// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
+// FACLE is alias for FACGE with operands reversed.
+def FACLEvvv_2S  : NeonI_compare_aliases<"facle", ".2s",  FACGEvvv_2S,  VPR64>;
+def FACLEvvv_4S  : NeonI_compare_aliases<"facle", ".4s",  FACGEvvv_4S,  VPR128>;
+def FACLEvvv_2D  : NeonI_compare_aliases<"facle", ".2d",  FACGEvvv_2D,  VPR128>;
+
+// Vector Absolute Compare Mask Less Than (Floating Point)
+// FACLT is alias for FACGT with operands reversed.
+def FACLTvvv_2S  : NeonI_compare_aliases<"faclt", ".2s",  FACGTvvv_2S,  VPR64>;
+def FACLTvvv_4S  : NeonI_compare_aliases<"faclt", ".4s",  FACGTvvv_4S,  VPR128>;
+def FACLTvvv_2D  : NeonI_compare_aliases<"faclt", ".2d",  FACGTvvv_2D,  VPR128>;
+
+// Vector halving add (Integer Signed, Unsigned)
+defm SHADDvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd",
+                                        int_arm_neon_vhadds, 1>;
+defm UHADDvvv :  NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd",
+                                        int_arm_neon_vhaddu, 1>;
+
+// Vector halving sub (Integer Signed, Unsigned)
+defm SHSUBvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub",
+                                        int_arm_neon_vhsubs, 0>;
+defm UHSUBvvv :  NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub",
+                                        int_arm_neon_vhsubu, 0>;
+
+// Vector rouding halving add (Integer Signed, Unsigned)
+defm SRHADDvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd",
+                                         int_arm_neon_vrhadds, 1>;
+defm URHADDvvv :  NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd",
+                                         int_arm_neon_vrhaddu, 1>;
+
+// Vector Saturating add (Integer Signed, Unsigned)
+defm SQADDvvv :  NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd",
+                   int_arm_neon_vqadds, 1>;
+defm UQADDvvv :  NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd",
+                   int_arm_neon_vqaddu, 1>;
+
+// Vector Saturating sub (Integer Signed, Unsigned)
+defm SQSUBvvv :  NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub",
+                   int_arm_neon_vqsubs, 1>;
+defm UQSUBvvv :  NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub",
+                   int_arm_neon_vqsubu, 1>;
+
+// Vector Shift Left (Signed and Unsigned Integer)
+defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl",
+                 int_arm_neon_vshifts, 1>;
+defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl",
+                 int_arm_neon_vshiftu, 1>;
+
+// Vector Saturating Shift Left (Signed and Unsigned Integer)
+defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl",
+                  int_arm_neon_vqshifts, 1>;
+defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl",
+                  int_arm_neon_vqshiftu, 1>;
+
+// Vector Rouding Shift Left (Signed and Unsigned Integer)
+defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl",
+                  int_arm_neon_vrshifts, 1>;
+defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl",
+                  int_arm_neon_vrshiftu, 1>;
+
+// Vector Saturating Rouding Shift Left (Signed and Unsigned Integer)
+defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl",
+                   int_arm_neon_vqrshifts, 1>;
+defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl",
+                   int_arm_neon_vqrshiftu, 1>;
+
+// Vector Maximum (Signed and Unsigned Integer)
+defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>;
+defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>;
+
+// Vector Minimum (Signed and Unsigned Integer)
+defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>;
+defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>;
+
+// Vector Maximum (Floating Point)
+defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax",
+                                     int_arm_neon_vmaxs, int_arm_neon_vmaxs,
+                                     int_arm_neon_vmaxs, v2f32, v4f32, v2f64, 1>;
+
+// Vector Minimum (Floating Point)
+defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin",
+                                     int_arm_neon_vmins, int_arm_neon_vmins,
+                                     int_arm_neon_vmins, v2f32, v4f32, v2f64, 1>;
+
+// Vector maxNum (Floating Point) -  prefer a number over a quiet NaN)
+defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm",
+                                       int_aarch64_neon_vmaxnm,
+                                       int_aarch64_neon_vmaxnm,
+                                       int_aarch64_neon_vmaxnm,
+                                       v2f32, v4f32, v2f64, 1>;
+
+// Vector minNum (Floating Point) - prefer a number over a quiet NaN)
+defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm",
+                                       int_aarch64_neon_vminnm,
+                                       int_aarch64_neon_vminnm,
+                                       int_aarch64_neon_vminnm,
+                                       v2f32, v4f32, v2f64, 1>;
+
+// Vector Maximum Pairwise (Signed and Unsigned Integer)
+defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>;
+defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>;
+
+// Vector Minimum Pairwise (Signed and Unsigned Integer)
+defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>;
+defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>;
+
+// Vector Maximum Pairwise (Floating Point)
+defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp",
+                                     int_arm_neon_vpmaxs, int_arm_neon_vpmaxs,
+                                     int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>;
+
+// Vector Minimum Pairwise (Floating Point)
+defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp",
+                                     int_arm_neon_vpmins, int_arm_neon_vpmins,
+                                     int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>;
+
+// Vector maxNum Pairwise (Floating Point) -  prefer a number over a quiet NaN)
+defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp",
+                                       int_aarch64_neon_vpmaxnm,
+                                       int_aarch64_neon_vpmaxnm,
+                                       int_aarch64_neon_vpmaxnm,
+                                       v2f32, v4f32, v2f64, 1>;
+
+// Vector minNum Pairwise (Floating Point) -  prefer a number over a quiet NaN)
+defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp",
+                                       int_aarch64_neon_vpminnm,
+                                       int_aarch64_neon_vpminnm,
+                                       int_aarch64_neon_vpminnm,
+                                       v2f32, v4f32, v2f64, 1>;
+
+// Vector Addition Pairwise (Integer)
+defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>;
+
+// Vector Addition Pairwise (Floating Point)
+defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp",
+                                       int_arm_neon_vpadd,
+                                       int_arm_neon_vpadd,
+                                       int_arm_neon_vpadd,
+                                       v2f32, v4f32, v2f64, 1>;
+
+// Vector Saturating Doubling Multiply High
+defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh",
+                    int_arm_neon_vqdmulh, 1>;
+
+// Vector Saturating Rouding Doubling Multiply High
+defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh",
+                     int_arm_neon_vqrdmulh, 1>;
+
+// Vector Multiply Extended (Floating Point)
+defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx",
+                                      int_aarch64_neon_vmulx,
+                                      int_aarch64_neon_vmulx,
+                                      int_aarch64_neon_vmulx,
+                                      v2f32, v4f32, v2f64, 1>;
+
+// Vector Immediate Instructions
+
+multiclass neon_mov_imm_shift_asmoperands<string PREFIX>
+{
+  def _asmoperand : AsmOperandClass
+    {
+      let Name = "NeonMovImmShift" # PREFIX;
+      let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands";
+      let PredicateMethod = "isNeonMovImmShift" # PREFIX;
+    }
+}
+
+// Definition of vector immediates shift operands
+
+// The selectable use-cases extract the shift operation
+// information from the OpCmode fields encoded in the immediate.
+def neon_mod_shift_imm_XFORM : SDNodeXForm<imm, [{
+  uint64_t OpCmode = N->getZExtValue();
+  unsigned ShiftImm;
+  unsigned ShiftOnesIn;
+  unsigned HasShift =
+    A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
+  if (!HasShift) return SDValue();
+  return CurDAG->getTargetConstant(ShiftImm, MVT::i32);
+}]>;
+
+// Vector immediates shift operands which accept LSL and MSL
+// shift operators with shift value in the range of 0, 8, 16, 24 (LSL),
+// or 0, 8 (LSLH) or 8, 16 (MSL).
+defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">;
+defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">;
+// LSLH restricts shift amount to  0, 8 out of 0, 8, 16, 24
+defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">;
+
+multiclass neon_mov_imm_shift_operands<string PREFIX,
+                                       string HALF, string ISHALF, code pred>
+{
+   def _operand : Operand<i32>, ImmLeaf<i32, pred, neon_mod_shift_imm_XFORM>
+    {
+      let PrintMethod =
+        "printNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
+      let DecoderMethod =
+        "DecodeNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
+      let ParserMatchClass =
+        !cast<AsmOperandClass>("neon_mov_imm_" # PREFIX # HALF # "_asmoperand");
+    }
+}
+
+defm neon_mov_imm_LSL  : neon_mov_imm_shift_operands<"LSL", "", "false", [{
+  unsigned ShiftImm;
+  unsigned ShiftOnesIn;
+  unsigned HasShift =
+    A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+  return (HasShift && !ShiftOnesIn);
+}]>;
+
+defm neon_mov_imm_MSL  : neon_mov_imm_shift_operands<"MSL", "", "false", [{
+  unsigned ShiftImm;
+  unsigned ShiftOnesIn;
+  unsigned HasShift =
+    A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+  return (HasShift && ShiftOnesIn);
+}]>;
+
+defm neon_mov_imm_LSLH  : neon_mov_imm_shift_operands<"LSL", "H", "true", [{
+  unsigned ShiftImm;
+  unsigned ShiftOnesIn;
+  unsigned HasShift =
+    A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+  return (HasShift && !ShiftOnesIn);
+}]>;
+
+def neon_uimm1_asmoperand : AsmOperandClass
+{
+  let Name = "UImm1";
+  let PredicateMethod = "isUImm<1>";
+  let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm2_asmoperand : AsmOperandClass
+{
+  let Name = "UImm2";
+  let PredicateMethod = "isUImm<2>";
+  let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm8_asmoperand : AsmOperandClass
+{
+  let Name = "UImm8";
+  let PredicateMethod = "isUImm<8>";
+  let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
+  let ParserMatchClass = neon_uimm8_asmoperand;
+  let PrintMethod = "printUImmHexOperand";
+}
+
+def neon_uimm64_mask_asmoperand : AsmOperandClass
+{
+  let Name = "NeonUImm64Mask";
+  let PredicateMethod = "isNeonUImm64Mask";
+  let RenderMethod = "addNeonUImm64MaskOperands";
+}
+
+// MCOperand for 64-bit bytemask with each byte having only the
+// value 0x00 and 0xff is encoded as an unsigned 8-bit value
+def neon_uimm64_mask : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
+  let ParserMatchClass = neon_uimm64_mask_asmoperand;
+  let PrintMethod = "printNeonUImm64MaskOperand";
+}
+
+multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
+                                   SDPatternOperator opnode>
+{
+    // shift zeros, per word
+    def _2S  : NeonI_1VModImm<0b0, op,
+                              (outs VPR64:$Rd),
+                              (ins neon_uimm8:$Imm,
+                                neon_mov_imm_LSL_operand:$Simm),
+                              !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
+                              [(set (v2i32 VPR64:$Rd),
+                                 (v2i32 (opnode (timm:$Imm),
+                                   (neon_mov_imm_LSL_operand:$Simm))))],
+                              NoItinerary> {
+       bits<2> Simm;
+       let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
+     }
+
+    def _4S  : NeonI_1VModImm<0b1, op,
+                              (outs VPR128:$Rd),
+                              (ins neon_uimm8:$Imm,
+                                neon_mov_imm_LSL_operand:$Simm),
+                              !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
+                              [(set (v4i32 VPR128:$Rd),
+                                 (v4i32 (opnode (timm:$Imm),
+                                   (neon_mov_imm_LSL_operand:$Simm))))],
+                              NoItinerary> {
+      bits<2> Simm;
+      let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
+    }
+
+    // shift zeros, per halfword
+    def _4H  : NeonI_1VModImm<0b0, op,
+                              (outs VPR64:$Rd),
+                              (ins neon_uimm8:$Imm,
+                                neon_mov_imm_LSLH_operand:$Simm),
+                              !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
+                              [(set (v4i16 VPR64:$Rd),
+                                 (v4i16 (opnode (timm:$Imm),
+                                   (neon_mov_imm_LSLH_operand:$Simm))))],
+                              NoItinerary> {
+      bit  Simm;
+      let cmode = {0b1, 0b0, Simm, 0b0};
+    }
+
+    def _8H  : NeonI_1VModImm<0b1, op,
+                              (outs VPR128:$Rd),
+                              (ins neon_uimm8:$Imm,
+                                neon_mov_imm_LSLH_operand:$Simm),
+                              !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
+                              [(set (v8i16 VPR128:$Rd),
+                                 (v8i16 (opnode (timm:$Imm),
+                                   (neon_mov_imm_LSLH_operand:$Simm))))],
+                              NoItinerary> {
+      bit Simm;
+      let cmode = {0b1, 0b0, Simm, 0b0};
+     }
+}
+
+multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
+                                                   SDPatternOperator opnode,
+                                                   SDPatternOperator neonopnode>
+{
+  let Constraints = "$src = $Rd" in {
+    // shift zeros, per word
+    def _2S  : NeonI_1VModImm<0b0, op,
+                 (outs VPR64:$Rd),
+                 (ins VPR64:$src, neon_uimm8:$Imm,
+                   neon_mov_imm_LSL_operand:$Simm),
+                 !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
+                 [(set (v2i32 VPR64:$Rd),
+                    (v2i32 (opnode (v2i32 VPR64:$src),
+                      (v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm,
+                        neon_mov_imm_LSL_operand:$Simm)))))))],
+                 NoItinerary> {
+      bits<2> Simm;
+      let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
+    }
+
+    def _4S  : NeonI_1VModImm<0b1, op,
+                 (outs VPR128:$Rd),
+                 (ins VPR128:$src, neon_uimm8:$Imm,
+                   neon_mov_imm_LSL_operand:$Simm),
+                 !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
+                 [(set (v4i32 VPR128:$Rd),
+                    (v4i32 (opnode (v4i32 VPR128:$src),
+                      (v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm,
+                        neon_mov_imm_LSL_operand:$Simm)))))))],
+                 NoItinerary> {
+      bits<2> Simm;
+      let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
+    }
+
+    // shift zeros, per halfword
+    def _4H  : NeonI_1VModImm<0b0, op,
+                 (outs VPR64:$Rd),
+                 (ins VPR64:$src, neon_uimm8:$Imm,
+                   neon_mov_imm_LSLH_operand:$Simm),
+                 !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
+                 [(set (v4i16 VPR64:$Rd),
+                    (v4i16 (opnode (v4i16 VPR64:$src),
+                       (v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm,
+                          neon_mov_imm_LSL_operand:$Simm)))))))],
+                 NoItinerary> {
+      bit  Simm;
+      let cmode = {0b1, 0b0, Simm, 0b1};
+    }
+
+    def _8H  : NeonI_1VModImm<0b1, op,
+                 (outs VPR128:$Rd),
+                 (ins VPR128:$src, neon_uimm8:$Imm,
+                   neon_mov_imm_LSLH_operand:$Simm),
+                 !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
+                 [(set (v8i16 VPR128:$Rd),
+                    (v8i16 (opnode (v8i16 VPR128:$src),
+                      (v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm,
+                        neon_mov_imm_LSL_operand:$Simm)))))))],
+                 NoItinerary> {
+      bit Simm;
+      let cmode = {0b1, 0b0, Simm, 0b1};
+    }
+  }
+}
+
+multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
+                                   SDPatternOperator opnode>
+{
+    // shift ones, per word
+    def _2S  : NeonI_1VModImm<0b0, op,
+                             (outs VPR64:$Rd),
+                             (ins neon_uimm8:$Imm,
+                               neon_mov_imm_MSL_operand:$Simm),
+                             !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
+                              [(set (v2i32 VPR64:$Rd),
+                                 (v2i32 (opnode (timm:$Imm),
+                                   (neon_mov_imm_MSL_operand:$Simm))))],
+                             NoItinerary> {
+       bit Simm;
+       let cmode = {0b1, 0b1, 0b0, Simm};
+     }
+
+   def _4S  : NeonI_1VModImm<0b1, op,
+                              (outs VPR128:$Rd),
+                              (ins neon_uimm8:$Imm,
+                                neon_mov_imm_MSL_operand:$Simm),
+                              !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
+                              [(set (v4i32 VPR128:$Rd),
+                                 (v4i32 (opnode (timm:$Imm),
+                                   (neon_mov_imm_MSL_operand:$Simm))))],
+                              NoItinerary> {
+     bit Simm;
+     let cmode = {0b1, 0b1, 0b0, Simm};
+   }
+}
+
+// Vector Move Immediate Shifted
+let isReMaterializable = 1 in {
+defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>;
+}
+
+// Vector Move Inverted Immediate Shifted
+let isReMaterializable = 1 in {
+defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>;
+}
+
+// Vector Bitwise Bit Clear (AND NOT) - immediate
+let isReMaterializable = 1 in {
+defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1,
+                                                         and, Neon_mvni>;
+}
+
+// Vector Bitwise OR - immedidate
+
+let isReMaterializable = 1 in {
+defm ORRvi_lsl   : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0,
+                                                           or, Neon_movi>;
+}
+
+// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immedidate
+// LowerBUILD_VECTOR favors lowering MOVI over MVNI.
+// BIC immediate instructions selection requires additional patterns to
+// transform Neon_movi operands into BIC immediate operands
+
+def neon_mov_imm_LSLH_transform_XFORM : SDNodeXForm<imm, [{
+  uint64_t OpCmode = N->getZExtValue();
+  unsigned ShiftImm;
+  unsigned ShiftOnesIn;
+  (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
+  // LSLH restricts shift amount to  0, 8 which are encoded as 0 and 1
+  // Transform encoded shift amount 0 to 1 and 1 to 0.
+  return CurDAG->getTargetConstant(!ShiftImm, MVT::i32);
+}]>;
+
+def neon_mov_imm_LSLH_transform_operand
+  : ImmLeaf<i32, [{
+    unsigned ShiftImm;
+    unsigned ShiftOnesIn;
+    unsigned HasShift =
+      A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+    return (HasShift && !ShiftOnesIn); }],
+  neon_mov_imm_LSLH_transform_XFORM>;
+
+// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0x00, LSL 8)
+// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0x00)
+def : Pat<(v4i16 (and VPR64:$src,
+            (v4i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))),
+          (BICvi_lsl_4H VPR64:$src, 0,
+            neon_mov_imm_LSLH_transform_operand:$Simm)>;
+
+// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0x00, LSL 8)
+// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0x00)
+def : Pat<(v8i16 (and VPR128:$src,
+            (v8i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))),
+          (BICvi_lsl_8H VPR128:$src, 0,
+            neon_mov_imm_LSLH_transform_operand:$Simm)>;
+
+
+multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode,
+                                   SDPatternOperator neonopnode,
+                                   Instruction INST4H,
+                                   Instruction INST8H> {
+  def : Pat<(v8i8 (opnode VPR64:$src,
+                    (bitconvert(v4i16 (neonopnode timm:$Imm,
+                      neon_mov_imm_LSLH_operand:$Simm))))),
+            (INST4H VPR64:$src, neon_uimm8:$Imm,
+              neon_mov_imm_LSLH_operand:$Simm)>;
+  def : Pat<(v1i64 (opnode VPR64:$src,
+                  (bitconvert(v4i16 (neonopnode timm:$Imm,
+                    neon_mov_imm_LSLH_operand:$Simm))))),
+          (INST4H VPR64:$src, neon_uimm8:$Imm,
+            neon_mov_imm_LSLH_operand:$Simm)>;
+
+  def : Pat<(v16i8 (opnode VPR128:$src,
+                   (bitconvert(v8i16 (neonopnode timm:$Imm,
+                     neon_mov_imm_LSLH_operand:$Simm))))),
+          (INST8H VPR128:$src, neon_uimm8:$Imm,
+            neon_mov_imm_LSLH_operand:$Simm)>;
+  def : Pat<(v4i32 (opnode VPR128:$src,
+                   (bitconvert(v8i16 (neonopnode timm:$Imm,
+                     neon_mov_imm_LSLH_operand:$Simm))))),
+          (INST8H VPR128:$src, neon_uimm8:$Imm,
+            neon_mov_imm_LSLH_operand:$Simm)>;
+  def : Pat<(v2i64 (opnode VPR128:$src,
+                   (bitconvert(v8i16 (neonopnode timm:$Imm,
+                     neon_mov_imm_LSLH_operand:$Simm))))),
+          (INST8H VPR128:$src, neon_uimm8:$Imm,
+            neon_mov_imm_LSLH_operand:$Simm)>;
+}
+
+// Additional patterns for Vector Vector Bitwise Bit Clear (AND NOT) - immediate
+defm : Neon_bitwiseVi_patterns<or, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H>;
+
+// Additional patterns for Vector Bitwise OR - immedidate
+defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H>;
+
+
+// Vector Move Immediate Masked
+let isReMaterializable = 1 in {
+defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>;
+}
+
+// Vector Move Inverted Immediate Masked
+let isReMaterializable = 1 in {
+defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>;
+}
+
+class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane,
+                                Instruction inst, RegisterOperand VPRC>
+  : NeonInstAlias<!strconcat(asmop, "\t$Rd," # asmlane # ", $Imm"),
+                        (inst VPRC:$Rd, neon_uimm8:$Imm,  0), 0b0>;
+
+// Aliases for Vector Move Immediate Shifted
+def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>;
+
+// Aliases for Vector Move Inverted Immediate Shifted
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>;
+
+// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate
+def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>;
+
+// Aliases for Vector Bitwise OR - immedidate
+def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>;
+
+//  Vector Move Immediate - per byte
+let isReMaterializable = 1 in {
+def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0,
+                               (outs VPR64:$Rd), (ins neon_uimm8:$Imm),
+                               "movi\t$Rd.8b, $Imm",
+                               [(set (v8i8 VPR64:$Rd),
+                                  (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))],
+                                NoItinerary> {
+  let cmode = 0b1110;
+}
+
+def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0,
+                                (outs VPR128:$Rd), (ins neon_uimm8:$Imm),
+                                "movi\t$Rd.16b, $Imm",
+                                [(set (v16i8 VPR128:$Rd),
+                                   (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))],
+                                 NoItinerary> {
+  let cmode = 0b1110;
+}
+}
+
+// Vector Move Immediate - bytemask, per double word
+let isReMaterializable = 1 in {
+def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1,
+                               (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm),
+                               "movi\t $Rd.2d, $Imm",
+                               [(set (v2i64 VPR128:$Rd),
+                                  (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))],
+                               NoItinerary> {
+  let cmode = 0b1110;
+}
+}
+
+// Vector Move Immediate - bytemask, one doubleword
+
+let isReMaterializable = 1 in {
+def MOVIdi : NeonI_1VModImm<0b0, 0b1,
+                           (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm),
+                           "movi\t $Rd, $Imm",
+                           [(set (v1i64 FPR64:$Rd),
+                             (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))],
+                           NoItinerary> {
+  let cmode = 0b1110;
+}
+}
+
+// Vector Floating Point Move Immediate
+
+class NeonI_FMOV_impl<string asmlane, RegisterOperand VPRC, ValueType OpTy,
+                      Operand immOpType, bit q, bit op>
+  : NeonI_1VModImm<q, op,
+                   (outs VPRC:$Rd), (ins immOpType:$Imm),
+                   "fmov\t$Rd" # asmlane # ", $Imm",
+                   [(set (OpTy VPRC:$Rd),
+                      (OpTy (Neon_fmovi (timm:$Imm))))],
+                   NoItinerary> {
+     let cmode = 0b1111;
+   }
+
+let isReMaterializable = 1 in {
+def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64,  v2f32, fmov32_operand, 0b0, 0b0>;
+def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>;
+def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>;
+}
+
+// Vector Shift (Immediate)
+// Immediate in [0, 63]
+def imm0_63 : Operand<i32> {
+  let ParserMatchClass = uimm6_asmoperand;
+}
+
+// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded
+// as follows:
+//
+//    Offset    Encoding
+//     8        immh:immb<6:3> = '0001xxx', <imm> is encoded in immh:immb<2:0>
+//     16       immh:immb<6:4> = '001xxxx', <imm> is encoded in immh:immb<3:0>
+//     32       immh:immb<6:5> = '01xxxxx', <imm> is encoded in immh:immb<4:0>
+//     64       immh:immb<6>   = '1xxxxxx', <imm> is encoded in immh:immb<5:0>
+//
+// The shift right immediate amount, in the range 1 to element bits, is computed
+// as Offset - UInt(immh:immb).  The shift left immediate amount, in the range 0
+// to element bits - 1, is computed as UInt(immh:immb) - Offset.
+
+class shr_imm_asmoperands<string OFFSET> : AsmOperandClass {
+  let Name = "ShrImm" # OFFSET;
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "ShrImm" # OFFSET;
+}
+
+class shr_imm<string OFFSET> : Operand<i32> {
+  let EncoderMethod = "getShiftRightImm" # OFFSET;
+  let DecoderMethod = "DecodeShiftRightImm" # OFFSET;
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("shr_imm" # OFFSET # "_asmoperand");
+}
+
+def shr_imm8_asmoperand : shr_imm_asmoperands<"8">;
+def shr_imm16_asmoperand : shr_imm_asmoperands<"16">;
+def shr_imm32_asmoperand : shr_imm_asmoperands<"32">;
+def shr_imm64_asmoperand : shr_imm_asmoperands<"64">;
+
+def shr_imm8 : shr_imm<"8">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 8;}]>;
+def shr_imm16 : shr_imm<"16">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 16;}]>;
+def shr_imm32 : shr_imm<"32">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 32;}]>;
+def shr_imm64 : shr_imm<"64">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 64;}]>;
+
+class shl_imm_asmoperands<string OFFSET> : AsmOperandClass {
+  let Name = "ShlImm" # OFFSET;
+  let RenderMethod = "addImmOperands";
+  let DiagnosticType = "ShlImm" # OFFSET;
+}
+
+class shl_imm<string OFFSET> : Operand<i32> {
+  let EncoderMethod = "getShiftLeftImm" # OFFSET;
+  let DecoderMethod = "DecodeShiftLeftImm" # OFFSET;
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("shl_imm" # OFFSET # "_asmoperand");
+}
+
+def shl_imm8_asmoperand : shl_imm_asmoperands<"8">;
+def shl_imm16_asmoperand : shl_imm_asmoperands<"16">;
+def shl_imm32_asmoperand : shl_imm_asmoperands<"32">;
+def shl_imm64_asmoperand : shl_imm_asmoperands<"64">;
+
+def shl_imm8 : shl_imm<"8">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 8;}]>;
+def shl_imm16 : shl_imm<"16">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 16;}]>;
+def shl_imm32 : shl_imm<"32">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 32;}]>;
+def shl_imm64 : shl_imm<"64">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 64;}]>;
+
+class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T,
+               RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (Ty VPRC:$Rd),
+                        (Ty (OpNode (Ty VPRC:$Rn),
+                          (Ty (Neon_vdup (i32 ImmTy:$Imm))))))],
+                     NoItinerary>;
+
+multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
+  // 64-bit vector types.
+  def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  // 128-bit vector types.
+  def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> {
+    let Inst{22} = 0b1;        // immh:immb = 1xxxxxx
+  }
+}
+
+multiclass NeonI_N2VShR<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
+  def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                     OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                     OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                     OpNode> {
+     let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                      OpNode> {
+                      let Inst{22-19} = 0b0001;
+                    }
+
+  def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                     OpNode> {
+                     let Inst{22-20} = 0b001;
+                    }
+
+  def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                     OpNode> {
+                      let Inst{22-21} = 0b01;
+                    }
+
+  def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                     OpNode> {
+                      let Inst{22} = 0b1;
+                    }
+}
+
+// Shift left
+defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">;
+
+// Shift right
+defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>;
+defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>;
+
+def Neon_High16B : PatFrag<(ops node:$in),
+                           (extract_subvector (v16i8 node:$in), (iPTR 8))>;
+def Neon_High8H  : PatFrag<(ops node:$in),
+                           (extract_subvector (v8i16 node:$in), (iPTR 4))>;
+def Neon_High4S  : PatFrag<(ops node:$in),
+                           (extract_subvector (v4i32 node:$in), (iPTR 2))>;
+def Neon_High2D  : PatFrag<(ops node:$in),
+                           (extract_subvector (v2i64 node:$in), (iPTR 1))>;
+def Neon_High4float : PatFrag<(ops node:$in),
+                               (extract_subvector (v4f32 node:$in), (iPTR 2))>;
+def Neon_High2double : PatFrag<(ops node:$in),
+                               (extract_subvector (v2f64 node:$in), (iPTR 1))>;
+
+def Neon_Low16B : PatFrag<(ops node:$in),
+                          (v8i8 (extract_subvector (v16i8 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low8H : PatFrag<(ops node:$in),
+                         (v4i16 (extract_subvector (v8i16 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low4S : PatFrag<(ops node:$in),
+                         (v2i32 (extract_subvector (v4i32 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low2D : PatFrag<(ops node:$in),
+                         (v1i64 (extract_subvector (v2i64 node:$in),
+                                                   (iPTR 0)))>;
+def Neon_Low4float : PatFrag<(ops node:$in),
+                             (v2f32 (extract_subvector (v4f32 node:$in),
+                                                       (iPTR 0)))>;
+def Neon_Low2double : PatFrag<(ops node:$in),
+                              (v1f64 (extract_subvector (v2f64 node:$in),
+                                                        (iPTR 0)))>;
+
+class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                   string SrcT, ValueType DestTy, ValueType SrcTy,
+                   Operand ImmTy, SDPatternOperator ExtOp>
+  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+                     (ins VPR64:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [(set (DestTy VPR128:$Rd),
+                        (DestTy (shl
+                          (DestTy (ExtOp (SrcTy VPR64:$Rn))),
+                            (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
+                     NoItinerary>;
+
+class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                       string SrcT, ValueType DestTy, ValueType SrcTy,
+                       int StartIndex, Operand ImmTy,
+                       SDPatternOperator ExtOp, PatFrag getTop>
+  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+                     (ins VPR128:$Rn, ImmTy:$Imm),
+                     asmop # "2\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [(set (DestTy VPR128:$Rd),
+                        (DestTy (shl
+                          (DestTy (ExtOp
+                            (SrcTy (getTop VPR128:$Rn)))),
+                              (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
+                     NoItinerary>;
+
+multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
+                         SDNode ExtOp> {
+  // 64-bit vector types.
+  def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8,
+                         shl_imm8, ExtOp> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16,
+                         shl_imm16, ExtOp> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32,
+                         shl_imm32, ExtOp> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  // 128-bit vector types
+  def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8,
+                              8, shl_imm8, ExtOp, Neon_High16B> {
+    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
+  }
+
+  def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16,
+                             4, shl_imm16, ExtOp, Neon_High8H> {
+    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
+  }
+
+  def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32,
+                             2, shl_imm32, ExtOp, Neon_High4S> {
+    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
+  }
+
+  // Use other patterns to match when the immediate is 0.
+  def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "_8B") VPR64:$Rn, 0)>;
+
+  def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "_4H") VPR64:$Rn, 0)>;
+
+  def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "_2S") VPR64:$Rn, 0)>;
+
+  def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))),
+            (!cast<Instruction>(prefix # "_16B") VPR128:$Rn, 0)>;
+
+  def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))),
+            (!cast<Instruction>(prefix # "_8H") VPR128:$Rn, 0)>;
+
+  def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))),
+            (!cast<Instruction>(prefix # "_4S") VPR128:$Rn, 0)>;
+}
+
+// Shift left long
+defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>;
+defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>;
+
+// Rounding/Saturating shift
+class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T,
+                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                  SDPatternOperator OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn),
+                        (i32 ImmTy:$Imm))))],
+                     NoItinerary>;
+
+// shift right (vector by immediate)
+multiclass NeonI_N2VShR_RQ<bit u, bits<5> opcode, string asmop,
+                           SDPatternOperator OpNode> {
+  def _8B  : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H  : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                         OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S  : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                         OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                        OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+multiclass NeonI_N2VShL_Q<bit u, bits<5> opcode, string asmop,
+                          SDPatternOperator OpNode> {
+  // 64-bit vector types.
+  def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
+                        OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  // 128-bit vector types.
+  def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
+                        OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Rounding shift right
+defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr",
+                                int_aarch64_neon_vsrshr>;
+defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr",
+                                int_aarch64_neon_vurshr>;
+
+// Saturating shift left unsigned
+defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>;
+
+// Saturating shift left
+defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>;
+defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>;
+
+class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T,
+                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                  SDNode OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+           (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+           asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+           [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
+              (Ty (OpNode (Ty VPRC:$Rn),
+                (Ty (Neon_vdup (i32 ImmTy:$Imm))))))))],
+           NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// Shift Right accumulate
+multiclass NeonI_N2VShRAdd<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
+  def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                        OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                         OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                        OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                        OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                        OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Shift right and accumulate
+defm SSRAvvi    : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>;
+defm USRAvvi    : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>;
+
+// Rounding shift accumulate
+class N2VShiftAdd_R<bit q, bit u, bits<5> opcode, string asmop, string T,
+                    RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                    SDPatternOperator OpNode>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
+                        (Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))],
+                     NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_N2VShRAdd_R<bit u, bits<5> opcode, string asmop,
+                             SDPatternOperator OpNode> {
+  def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                          OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                          OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                          OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                           OpNode> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                          OpNode> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                          OpNode> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                          OpNode> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Rounding shift right and accumulate
+defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>;
+defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>;
+
+// Shift insert by immediate
+class N2VShiftIns<bit q, bit u, bits<5> opcode, string asmop, string T,
+                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
+                  SDPatternOperator OpNode>
+    : NeonI_2VShiftImm<q, u, opcode,
+           (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
+           asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+           [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn),
+             (i32 ImmTy:$Imm))))],
+           NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// shift left insert (vector by immediate)
+multiclass NeonI_N2VShLIns<bit u, bits<5> opcode, string asmop> {
+  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-21} = 0b01;
+  }
+
+    // 128-bit vector types
+  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
+                         int_aarch64_neon_vsli> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
+                        int_aarch64_neon_vsli> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
+                        int_aarch64_neon_vsli> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// shift right insert (vector by immediate)
+multiclass NeonI_N2VShRIns<bit u, bits<5> opcode, string asmop> {
+    // 64-bit vector types.
+  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-21} = 0b01;
+  }
+
+    // 128-bit vector types
+  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
+                         int_aarch64_neon_vsri> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
+                        int_aarch64_neon_vsri> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
+                        int_aarch64_neon_vsri> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Shift left and insert
+defm SLIvvi   : NeonI_N2VShLIns<0b1, 0b01010, "sli">;
+
+// Shift right and insert
+defm SRIvvi   : NeonI_N2VShRIns<0b1, 0b01000, "sri">;
+
+class N2VShR_Narrow<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                    string SrcT, Operand ImmTy>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPR64:$Rd), (ins VPR128:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [], NoItinerary>;
+
+class N2VShR_Narrow_Hi<bit q, bit u, bits<5> opcode, string asmop, string DestT,
+                       string SrcT, Operand ImmTy>
+  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
+                     (ins VPR128:$src, VPR128:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
+                     [], NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// left long shift by immediate
+multiclass NeonI_N2VShR_Narrow<bit u, bits<5> opcode, string asmop> {
+  def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", shr_imm32> {
+    let Inst{22-21} = 0b01;
+  }
+
+  // Shift Narrow High
+  def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h",
+                              shr_imm8> {
+    let Inst{22-19} = 0b0001;
+  }
+
+  def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s",
+                             shr_imm16> {
+    let Inst{22-20} = 0b001;
+  }
+
+  def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d",
+                             shr_imm32> {
+    let Inst{22-21} = 0b01;
+  }
+}
+
+// Shift right narrow
+defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">;
+
+// Shift right narrow (prefix Q is saturating, prefix R is rounding)
+defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">;
+defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">;
+defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">;
+defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">;
+defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">;
+defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">;
+defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">;
+
+def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v2i64 (concat_vectors (v1i64 node:$Rm),
+                                                     (v1i64 node:$Rn)))>;
+def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v8i16 (concat_vectors (v4i16 node:$Rm),
+                                                     (v4i16 node:$Rn)))>;
+def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v4i32 (concat_vectors (v2i32 node:$Rm),
+                                                     (v2i32 node:$Rn)))>;
+def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v4f32 (concat_vectors (v2f32 node:$Rm),
+                                                     (v2f32 node:$Rn)))>;
+def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn),
+                              (v2f64 (concat_vectors (v1f64 node:$Rm),
+                                                     (v1f64 node:$Rn)))>;
+
+def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v8i16 (srl (v8i16 node:$lhs),
+                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v4i32 (srl (v4i32 node:$lhs),
+                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v2i64 (srl (v2i64 node:$lhs),
+                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v8i16 (sra (v8i16 node:$lhs),
+                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v4i32 (sra (v4i32 node:$lhs),
+                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
+def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
+                             (v2i64 (sra (v2i64 node:$lhs),
+                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
+
+// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors)
+multiclass Neon_shiftNarrow_patterns<string shr> {
+  def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn,
+              (i32 shr_imm8:$Imm)))),
+            (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn,
+              (i32 shr_imm16:$Imm)))),
+            (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn,
+              (i32 shr_imm32:$Imm)))),
+            (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>;
+
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+              (v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H")
+                VPR128:$Rn, (i32 shr_imm8:$Imm))))))),
+            (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+                         VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+              (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S")
+                VPR128:$Rn, (i32 shr_imm16:$Imm))))))),
+            (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                        VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
+              (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D")
+                VPR128:$Rn, (i32 shr_imm32:$Imm))))))),
+            (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                        VPR128:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_shiftNarrow_QR_patterns<SDPatternOperator op, string prefix> {
+  def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)),
+            (!cast<Instruction>(prefix # "_8B") VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)),
+            (!cast<Instruction>(prefix # "_4H") VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)),
+            (!cast<Instruction>(prefix # "_2S") VPR128:$Rn, imm:$Imm)>;
+
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                (v1i64 (bitconvert (v8i8
+                    (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))),
+            (!cast<Instruction>(prefix # "_16B")
+                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                (v1i64 (bitconvert (v4i16
+                    (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))),
+            (!cast<Instruction>(prefix # "_8H")
+                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                VPR128:$Rn, imm:$Imm)>;
+  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                (v1i64 (bitconvert (v2i32
+                    (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))),
+            (!cast<Instruction>(prefix # "_4S")
+                  (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                  VPR128:$Rn, imm:$Imm)>;
+}
+
+defm : Neon_shiftNarrow_patterns<"lshr">;
+defm : Neon_shiftNarrow_patterns<"ashr">;
+
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrun, "QSHRUNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vrshrn, "RSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrun, "QRSHRUNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrn, "SQSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqshrn, "UQSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrn, "SQRSHRNvvi">;
+defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqrshrn, "UQRSHRNvvi">;
+
+// Convert fix-point and float-pointing
+class N2VCvt_Fx<bit q, bit u, bits<5> opcode, string asmop, string T,
+                RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy,
+                Operand ImmTy, SDPatternOperator IntOp>
+  : NeonI_2VShiftImm<q, u, opcode,
+                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
+                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
+                     [(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn),
+                       (i32 ImmTy:$Imm))))],
+                     NoItinerary>;
+
+multiclass NeonI_N2VCvt_Fx2fp<bit u, bits<5> opcode, string asmop,
+                              SDPatternOperator IntOp> {
+  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64,
+                      shr_imm64, IntOp> {
+    let Inst{22} = 0b1;
+  }
+}
+
+multiclass NeonI_N2VCvt_Fp2fx<bit u, bits<5> opcode, string asmop,
+                              SDPatternOperator IntOp> {
+  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32,
+                      shr_imm32, IntOp> {
+    let Inst{22-21} = 0b01;
+  }
+
+  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64,
+                      shr_imm64, IntOp> {
+    let Inst{22} = 0b1;
+  }
+}
+
+// Convert fixed-point to floating-point
+defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf",
+                                   int_arm_neon_vcvtfxs2fp>;
+defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf",
+                                   int_arm_neon_vcvtfxu2fp>;
+
+// Convert floating-point to fixed-point
+defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs",
+                                   int_arm_neon_vcvtfp2fxs>;
+defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu",
+                                   int_arm_neon_vcvtfp2fxu>;
+
+multiclass Neon_sshll2_0<SDNode ext>
+{
+  def _v8i8  : PatFrag<(ops node:$Rn),
+                       (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>;
+  def _v4i16 : PatFrag<(ops node:$Rn),
+                       (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>;
+  def _v2i32 : PatFrag<(ops node:$Rn),
+                       (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>;
+}
+
+defm NI_sext_high : Neon_sshll2_0<sext>;
+defm NI_zext_high : Neon_sshll2_0<zext>;
+
+
+//===----------------------------------------------------------------------===//
+// Multiclasses for NeonI_Across
+//===----------------------------------------------------------------------===//
+
+// Variant 1
+
+multiclass NeonI_2VAcross_1<bit u, bits<5> opcode,
+                            string asmop, SDPatternOperator opnode>
+{
+    def _1h8b:  NeonI_2VAcross<0b0, u, 0b00, opcode,
+                (outs FPR16:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.8b",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v8i8 VPR64:$Rn))))],
+                NoItinerary>;
+
+    def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
+                (outs FPR16:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.16b",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v16i8 VPR128:$Rn))))],
+                NoItinerary>;
+
+    def _1s4h:  NeonI_2VAcross<0b0, u, 0b01, opcode,
+                (outs FPR32:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.4h",
+                [(set (v1i32 FPR32:$Rd),
+                    (v1i32 (opnode (v4i16 VPR64:$Rn))))],
+                NoItinerary>;
+
+    def _1s8h:  NeonI_2VAcross<0b1, u, 0b01, opcode,
+                (outs FPR32:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.8h",
+                [(set (v1i32 FPR32:$Rd),
+                    (v1i32 (opnode (v8i16 VPR128:$Rn))))],
+                NoItinerary>;
+
+    // _1d2s doesn't exist!
+
+    def _1d4s:  NeonI_2VAcross<0b1, u, 0b10, opcode,
+                (outs FPR64:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.4s",
+                [(set (v1i64 FPR64:$Rd),
+                    (v1i64 (opnode (v4i32 VPR128:$Rn))))],
+                NoItinerary>;
+}
+
+defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>;
+defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>;
+
+// Variant 2
+
+multiclass NeonI_2VAcross_2<bit u, bits<5> opcode,
+                            string asmop, SDPatternOperator opnode>
+{
+    def _1b8b:  NeonI_2VAcross<0b0, u, 0b00, opcode,
+                (outs FPR8:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.8b",
+                [(set (v1i8 FPR8:$Rd),
+                    (v1i8 (opnode (v8i8 VPR64:$Rn))))],
+                NoItinerary>;
+
+    def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
+                (outs FPR8:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.16b",
+                [(set (v1i8 FPR8:$Rd),
+                    (v1i8 (opnode (v16i8 VPR128:$Rn))))],
+                NoItinerary>;
+
+    def _1h4h:  NeonI_2VAcross<0b0, u, 0b01, opcode,
+                (outs FPR16:$Rd), (ins VPR64:$Rn),
+                asmop # "\t$Rd, $Rn.4h",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v4i16 VPR64:$Rn))))],
+                NoItinerary>;
+
+    def _1h8h:  NeonI_2VAcross<0b1, u, 0b01, opcode,
+                (outs FPR16:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.8h",
+                [(set (v1i16 FPR16:$Rd),
+                    (v1i16 (opnode (v8i16 VPR128:$Rn))))],
+                NoItinerary>;
+
+    // _1s2s doesn't exist!
+
+    def _1s4s:  NeonI_2VAcross<0b1, u, 0b10, opcode,
+                (outs FPR32:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.4s",
+                [(set (v1i32 FPR32:$Rd),
+                    (v1i32 (opnode (v4i32 VPR128:$Rn))))],
+                NoItinerary>;
+}
+
+defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>;
+defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>;
+
+defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>;
+defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>;
+
+defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>;
+
+// Variant 3
+
+multiclass NeonI_2VAcross_3<bit u, bits<5> opcode, bits<2> size,
+                            string asmop, SDPatternOperator opnode> {
+    def _1s4s:  NeonI_2VAcross<0b1, u, size, opcode,
+                (outs FPR32:$Rd), (ins VPR128:$Rn),
+                asmop # "\t$Rd, $Rn.4s",
+                [(set (v1f32 FPR32:$Rd),
+                    (v1f32 (opnode (v4f32 VPR128:$Rn))))],
+                NoItinerary>;
+}
+
+defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv",
+                                int_aarch64_neon_vmaxnmv>;
+defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv",
+                                int_aarch64_neon_vminnmv>;
+
+defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv",
+                              int_aarch64_neon_vmaxv>;
+defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv",
+                              int_aarch64_neon_vminv>;
+
+// The followings are for instruction class (Perm)
+
+class NeonI_Permute<bit q, bits<2> size, bits<3> opcode,
+                    string asmop, RegisterOperand OpVPR, string OpS,
+                    SDPatternOperator opnode, ValueType Ty>
+  : NeonI_Perm<q, size, opcode,
+               (outs OpVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+               asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." # OpS,
+               [(set (Ty OpVPR:$Rd),
+                  (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))],
+               NoItinerary>;
+
+multiclass NeonI_Perm_pat<bits<3> opcode, string asmop,
+                          SDPatternOperator opnode> {
+  def _8b  : NeonI_Permute<0b0, 0b00, opcode, asmop,
+                           VPR64, "8b", opnode, v8i8>;
+  def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop,
+                           VPR128, "16b",opnode, v16i8>;
+  def _4h  : NeonI_Permute<0b0, 0b01, opcode, asmop,
+                           VPR64, "4h", opnode, v4i16>;
+  def _8h  : NeonI_Permute<0b1, 0b01, opcode, asmop,
+                           VPR128, "8h", opnode, v8i16>;
+  def _2s  : NeonI_Permute<0b0, 0b10, opcode, asmop,
+                           VPR64, "2s", opnode, v2i32>;
+  def _4s  : NeonI_Permute<0b1, 0b10, opcode, asmop,
+                           VPR128, "4s", opnode, v4i32>;
+  def _2d  : NeonI_Permute<0b1, 0b11, opcode, asmop,
+                           VPR128, "2d", opnode, v2i64>;
+}
+
+defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>;
+defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>;
+defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>;
+defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>;
+defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>;
+defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>;
+
+multiclass NeonI_Perm_float_pat<string INS, SDPatternOperator opnode> {
+  def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
+            (!cast<Instruction>(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>;
+
+  def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
+            (!cast<Instruction>(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>;
+
+  def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
+            (!cast<Instruction>(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>;
+}
+
+defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>;
+defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>;
+defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>;
+defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>;
+defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>;
+defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>;
+
+// The followings are for instruction class (3V Diff)
+
+// normal long/long2 pattern
+class NeonI_3VDL<bit q, bit u, bits<2> size, bits<4> opcode,
+                 string asmop, string ResS, string OpS,
+                 SDPatternOperator opnode, SDPatternOperator ext,
+                 RegisterOperand OpVPR,
+                 ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode (ResTy (ext (OpTy OpVPR:$Rn))),
+                                   (ResTy (ext (OpTy OpVPR:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDL_s<bit u, bits<4> opcode,
+                        string asmop, SDPatternOperator opnode,
+                        bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                           opnode, sext, VPR64, v8i16, v8i8>;
+    def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                           opnode, sext, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                           opnode, sext, VPR64, v2i64, v2i32>;
+  }
+}
+
+multiclass NeonI_3VDL2_s<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                            opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
+    def _4s8h  : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                            opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
+    def _2d4s  : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                            opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
+  }
+}
+
+multiclass NeonI_3VDL_u<bit u, bits<4> opcode, string asmop,
+                        SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                           opnode, zext, VPR64, v8i16, v8i8>;
+    def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                           opnode, zext, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                           opnode, zext, VPR64, v2i64, v2i32>;
+  }
+}
+
+multiclass NeonI_3VDL2_u<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                            opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
+    def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                           opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
+    def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                           opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
+  }
+}
+
+defm SADDLvvv :  NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>;
+defm UADDLvvv :  NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>;
+
+defm SADDL2vvv :  NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>;
+defm UADDL2vvv :  NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>;
+
+defm SSUBLvvv :  NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>;
+defm USUBLvvv :  NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>;
+
+defm SSUBL2vvv :  NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>;
+defm USUBL2vvv :  NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>;
+
+// normal wide/wide2 pattern
+class NeonI_3VDW<bit q, bit u, bits<2> size, bits<4> opcode,
+                 string asmop, string ResS, string OpS,
+                 SDPatternOperator opnode, SDPatternOperator ext,
+                 RegisterOperand OpVPR,
+                 ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # ResS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode (ResTy VPR128:$Rn),
+                                   (ResTy (ext (OpTy OpVPR:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDW_s<bit u, bits<4> opcode, string asmop,
+                        SDPatternOperator opnode> {
+  def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                         opnode, sext, VPR64, v8i16, v8i8>;
+  def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                         opnode, sext, VPR64, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                         opnode, sext, VPR64, v2i64, v2i32>;
+}
+
+defm SADDWvvv :  NeonI_3VDW_s<0b0, 0b0001, "saddw", add>;
+defm SSUBWvvv :  NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>;
+
+multiclass NeonI_3VDW2_s<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode> {
+  def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                          opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
+  def _4s8h  : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                          opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
+  def _2d4s  : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                          opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
+}
+
+defm SADDW2vvv :  NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>;
+defm SSUBW2vvv :  NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>;
+
+multiclass NeonI_3VDW_u<bit u, bits<4> opcode, string asmop,
+                        SDPatternOperator opnode> {
+  def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                         opnode, zext, VPR64, v8i16, v8i8>;
+  def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                         opnode, zext, VPR64, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                         opnode, zext, VPR64, v2i64, v2i32>;
+}
+
+defm UADDWvvv :  NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>;
+defm USUBWvvv :  NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>;
+
+multiclass NeonI_3VDW2_u<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode> {
+  def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                          opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
+  def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                         opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
+  def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                         opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
+}
+
+defm UADDW2vvv :  NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>;
+defm USUBW2vvv :  NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>;
+
+// Get the high half part of the vector element.
+multiclass NeonI_get_high {
+  def _8h : PatFrag<(ops node:$Rn),
+                    (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn),
+                                             (v8i16 (Neon_vdup (i32 8)))))))>;
+  def _4s : PatFrag<(ops node:$Rn),
+                    (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn),
+                                              (v4i32 (Neon_vdup (i32 16)))))))>;
+  def _2d : PatFrag<(ops node:$Rn),
+                    (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn),
+                                              (v2i64 (Neon_vdup (i32 32)))))))>;
+}
+
+defm NI_get_hi : NeonI_get_high;
+
+// pattern for addhn/subhn with 2 operands
+class NeonI_3VDN_addhn_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                           string asmop, string ResS, string OpS,
+                           SDPatternOperator opnode, SDPatternOperator get_hi,
+                           ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR64:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR64:$Rd),
+                    (ResTy (get_hi
+                      (OpTy (opnode (OpTy VPR128:$Rn),
+                                    (OpTy VPR128:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDN_addhn_2Op<bit u, bits<4> opcode, string asmop,
+                                SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
+                                     opnode, NI_get_hi_8h, v8i8, v8i16>;
+    def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
+                                     opnode, NI_get_hi_4s, v4i16, v4i32>;
+    def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
+                                     opnode, NI_get_hi_2d, v2i32, v2i64>;
+  }
+}
+
+defm ADDHNvvv  : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>;
+defm SUBHNvvv  : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>;
+
+// pattern for operation with 2 operands
+class NeonI_3VD_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                    string asmop, string ResS, string OpS,
+                    SDPatternOperator opnode,
+                    RegisterOperand ResVPR, RegisterOperand OpVPR,
+                    ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs ResVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy ResVPR:$Rd),
+                    (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))],
+                 NoItinerary>;
+
+// normal narrow pattern
+multiclass NeonI_3VDN_2Op<bit u, bits<4> opcode, string asmop,
+                          SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
+                              opnode, VPR64, VPR128, v8i8, v8i16>;
+    def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
+                              opnode, VPR64, VPR128, v4i16, v4i32>;
+    def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
+                              opnode, VPR64, VPR128, v2i32, v2i64>;
+  }
+}
+
+defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>;
+defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>;
+
+// pattern for acle intrinsic with 3 operands
+class NeonI_3VDN_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [], NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let neverHasSideEffects = 1;
+}
+
+multiclass NeonI_3VDN_3Op_v1<bit u, bits<4> opcode, string asmop> {
+  def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">;
+  def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">;
+  def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">;
+}
+
+defm ADDHN2vvv  : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">;
+defm SUBHN2vvv  : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">;
+
+defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">;
+defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">;
+
+// Patterns have to be separate because there's a SUBREG_TO_REG in the output
+// part.
+class NarrowHighHalfPat<Instruction INST, ValueType DstTy, ValueType SrcTy,
+                        SDPatternOperator coreop>
+  : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
+                      (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn),
+                                                        (SrcTy VPR128:$Rm)))))),
+        (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+              VPR128:$Rn, VPR128:$Rm)>;
+
+// addhn2 patterns
+def : NarrowHighHalfPat<ADDHN2vvv_16b8h, v8i8,  v8i16,
+          BinOpFrag<(NI_get_hi_8h (add node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<ADDHN2vvv_8h4s,  v4i16, v4i32,
+          BinOpFrag<(NI_get_hi_4s (add node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<ADDHN2vvv_4s2d,  v2i32, v2i64,
+          BinOpFrag<(NI_get_hi_2d (add node:$LHS, node:$RHS))>>;
+
+// subhn2 patterns
+def : NarrowHighHalfPat<SUBHN2vvv_16b8h, v8i8,  v8i16,
+          BinOpFrag<(NI_get_hi_8h (sub node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<SUBHN2vvv_8h4s,  v4i16, v4i32,
+          BinOpFrag<(NI_get_hi_4s (sub node:$LHS, node:$RHS))>>;
+def : NarrowHighHalfPat<SUBHN2vvv_4s2d,  v2i32, v2i64,
+          BinOpFrag<(NI_get_hi_2d (sub node:$LHS, node:$RHS))>>;
+
+// raddhn2 patterns
+def : NarrowHighHalfPat<RADDHN2vvv_16b8h, v8i8,  v8i16, int_arm_neon_vraddhn>;
+def : NarrowHighHalfPat<RADDHN2vvv_8h4s,  v4i16, v4i32, int_arm_neon_vraddhn>;
+def : NarrowHighHalfPat<RADDHN2vvv_4s2d,  v2i32, v2i64, int_arm_neon_vraddhn>;
+
+// rsubhn2 patterns
+def : NarrowHighHalfPat<RSUBHN2vvv_16b8h, v8i8,  v8i16, int_arm_neon_vrsubhn>;
+def : NarrowHighHalfPat<RSUBHN2vvv_8h4s,  v4i16, v4i32, int_arm_neon_vrsubhn>;
+def : NarrowHighHalfPat<RSUBHN2vvv_4s2d,  v2i32, v2i64, int_arm_neon_vrsubhn>;
+
+// pattern that need to extend result
+class NeonI_3VDL_Ext<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS,
+                     SDPatternOperator opnode,
+                     RegisterOperand OpVPR,
+                     ValueType ResTy, ValueType OpTy, ValueType OpSTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (zext (OpSTy (opnode (OpTy OpVPR:$Rn),
+                                                (OpTy OpVPR:$Rm))))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDL_zext<bit u, bits<4> opcode, string asmop,
+                           SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                               opnode, VPR64, v8i16, v8i8, v8i8>;
+    def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                               opnode, VPR64, v4i32, v4i16, v4i16>;
+    def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                               opnode, VPR64, v2i64, v2i32, v2i32>;
+  }
+}
+
+defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>;
+defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>;
+
+multiclass NeonI_Op_High<SDPatternOperator op> {
+  def _16B : PatFrag<(ops node:$Rn, node:$Rm),
+                     (op (v8i8 (Neon_High16B node:$Rn)),
+                         (v8i8 (Neon_High16B node:$Rm)))>;
+  def _8H  : PatFrag<(ops node:$Rn, node:$Rm),
+                     (op (v4i16 (Neon_High8H node:$Rn)),
+                         (v4i16 (Neon_High8H node:$Rm)))>;
+  def _4S  : PatFrag<(ops node:$Rn, node:$Rm),
+                     (op (v2i32 (Neon_High4S node:$Rn)),
+                         (v2i32 (Neon_High4S node:$Rm)))>;
+}
+
+defm NI_sabdl_hi : NeonI_Op_High<int_arm_neon_vabds>;
+defm NI_uabdl_hi : NeonI_Op_High<int_arm_neon_vabdu>;
+defm NI_smull_hi : NeonI_Op_High<int_arm_neon_vmulls>;
+defm NI_umull_hi : NeonI_Op_High<int_arm_neon_vmullu>;
+defm NI_qdmull_hi : NeonI_Op_High<int_arm_neon_vqdmull>;
+defm NI_pmull_hi : NeonI_Op_High<int_arm_neon_vmullp>;
+
+multiclass NeonI_3VDL_Abd_u<bit u, bits<4> opcode, string asmop, string opnode,
+                            bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b  : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                !cast<PatFrag>(opnode # "_16B"),
+                                VPR128, v8i16, v16i8, v8i8>;
+    def _4s4h  : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                !cast<PatFrag>(opnode # "_8H"),
+                                VPR128, v4i32, v8i16, v4i16>;
+    def _2d2s  : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                !cast<PatFrag>(opnode # "_4S"),
+                                VPR128, v2i64, v4i32, v2i32>;
+  }
+}
+
+defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>;
+defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>;
+
+// For pattern that need two operators being chained.
+class NeonI_3VDL_Aba<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS,
+                     SDPatternOperator opnode, SDPatternOperator subop,
+                     RegisterOperand OpVPR,
+                     ValueType ResTy, ValueType OpTy, ValueType OpSTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode
+                      (ResTy VPR128:$src),
+                      (ResTy (zext (OpSTy (subop (OpTy OpVPR:$Rn),
+                                                 (OpTy OpVPR:$Rm))))))))],
+                 NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL_Aba_v1<bit u, bits<4> opcode, string asmop,
+                             SDPatternOperator opnode, SDPatternOperator subop>{
+  def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                             opnode, subop, VPR64, v8i16, v8i8, v8i8>;
+  def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                             opnode, subop, VPR64, v4i32, v4i16, v4i16>;
+  def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                             opnode, subop, VPR64, v2i64, v2i32, v2i32>;
+}
+
+defm SABALvvv :  NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal",
+                                   add, int_arm_neon_vabds>;
+defm UABALvvv :  NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal",
+                                   add, int_arm_neon_vabdu>;
+
+multiclass NeonI_3VDL2_Aba_v1<bit u, bits<4> opcode, string asmop,
+                              SDPatternOperator opnode, string subop> {
+  def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                             opnode, !cast<PatFrag>(subop # "_16B"),
+                             VPR128, v8i16, v16i8, v8i8>;
+  def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                             opnode, !cast<PatFrag>(subop # "_8H"),
+                             VPR128, v4i32, v8i16, v4i16>;
+  def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                             opnode, !cast<PatFrag>(subop # "_4S"),
+                             VPR128, v2i64, v4i32, v2i32>;
+}
+
+defm SABAL2vvv :  NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add,
+                                     "NI_sabdl_hi">;
+defm UABAL2vvv :  NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add,
+                                     "NI_uabdl_hi">;
+
+// Long pattern with 2 operands
+multiclass NeonI_3VDL_2Op<bit u, bits<4> opcode, string asmop,
+                          SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                              opnode, VPR128, VPR64, v8i16, v8i8>;
+    def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                              opnode, VPR128, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                              opnode, VPR128, VPR64, v2i64, v2i32>;
+  }
+}
+
+defm SMULLvvv :  NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>;
+defm UMULLvvv :  NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>;
+
+class NeonI_3VDL2_2Op_mull<bit q, bit u, bits<2> size, bits<4> opcode,
+                           string asmop, string ResS, string OpS,
+                           SDPatternOperator opnode,
+                           ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))],
+                 NoItinerary>;
+
+multiclass NeonI_3VDL2_2Op_mull_v1<bit u, bits<4> opcode, string asmop,
+                                   string opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                      !cast<PatFrag>(opnode # "_16B"),
+                                      v8i16, v16i8>;
+    def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                     !cast<PatFrag>(opnode # "_8H"),
+                                     v4i32, v8i16>;
+    def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                     !cast<PatFrag>(opnode # "_4S"),
+                                     v2i64, v4i32>;
+  }
+}
+
+defm SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2",
+                                         "NI_smull_hi", 1>;
+defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2",
+                                         "NI_umull_hi", 1>;
+
+// Long pattern with 3 operands
+class NeonI_3VDL_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
+                     string asmop, string ResS, string OpS,
+                     SDPatternOperator opnode,
+                     ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+                 (outs VPR128:$Rd), (ins VPR128:$src, VPR64:$Rn, VPR64:$Rm),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+                 [(set (ResTy VPR128:$Rd),
+                    (ResTy (opnode
+                      (ResTy VPR128:$src),
+                      (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))))],
+               NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL_3Op_v1<bit u, bits<4> opcode, string asmop,
+                             SDPatternOperator opnode> {
+  def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                             opnode, v8i16, v8i8>;
+  def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                             opnode, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                             opnode, v2i64, v2i32>;
+}
+
+def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (add node:$Rd,
+                            (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
+
+def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (add node:$Rd,
+                            (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
+
+def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (sub node:$Rd,
+                            (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
+
+def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
+                         (sub node:$Rd,
+                            (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
+
+defm SMLALvvv :  NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>;
+defm UMLALvvv :  NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>;
+
+defm SMLSLvvv :  NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>;
+defm UMLSLvvv :  NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>;
+
+class NeonI_3VDL2_3Op_mlas<bit q, bit u, bits<2> size, bits<4> opcode,
+                           string asmop, string ResS, string OpS,
+                           SDPatternOperator subop, SDPatternOperator opnode,
+                           RegisterOperand OpVPR,
+                           ValueType ResTy, ValueType OpTy>
+  : NeonI_3VDiff<q, u, size, opcode,
+               (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
+               asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
+               [(set (ResTy VPR128:$Rd),
+                  (ResTy (subop
+                    (ResTy VPR128:$src),
+                    (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))))],
+               NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NeonI_3VDL2_3Op_mlas_v1<bit u, bits<4> opcode, string asmop,
+                                   SDPatternOperator subop, string opnode> {
+  def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                    subop, !cast<PatFrag>(opnode # "_16B"),
+                                    VPR128, v8i16, v16i8>;
+  def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                   subop, !cast<PatFrag>(opnode # "_8H"),
+                                   VPR128, v4i32, v8i16>;
+  def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                   subop, !cast<PatFrag>(opnode # "_4S"),
+                                   VPR128, v2i64, v4i32>;
+}
+
+defm SMLAL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2",
+                                          add, "NI_smull_hi">;
+defm UMLAL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2",
+                                          add, "NI_umull_hi">;
+
+defm SMLSL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2",
+                                          sub, "NI_smull_hi">;
+defm UMLSL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2",
+                                          sub, "NI_umull_hi">;
+
+multiclass NeonI_3VDL_qdmlal_3Op_v2<bit u, bits<4> opcode, string asmop,
+                                    SDPatternOperator opnode> {
+  def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                                   opnode, int_arm_neon_vqdmull,
+                                   VPR64, v4i32, v4i16>;
+  def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                                   opnode, int_arm_neon_vqdmull,
+                                   VPR64, v2i64, v2i32>;
+}
+
+defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal",
+                                           int_arm_neon_vqadds>;
+defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl",
+                                           int_arm_neon_vqsubs>;
+
+multiclass NeonI_3VDL_v2<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
+                              opnode, VPR128, VPR64, v4i32, v4i16>;
+    def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
+                              opnode, VPR128, VPR64, v2i64, v2i32>;
+  }
+}
+
+defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull",
+                                int_arm_neon_vqdmull, 1>;
+
+multiclass NeonI_3VDL2_2Op_mull_v2<bit u, bits<4> opcode, string asmop,
+                                   string opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                     !cast<PatFrag>(opnode # "_8H"),
+                                     v4i32, v8i16>;
+    def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                     !cast<PatFrag>(opnode # "_4S"),
+                                     v2i64, v4i32>;
+  }
+}
+
+defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2",
+                                           "NI_qdmull_hi", 1>;
+
+multiclass NeonI_3VDL2_3Op_qdmlal_v2<bit u, bits<4> opcode, string asmop,
+                                     SDPatternOperator opnode> {
+  def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
+                                   opnode, NI_qdmull_hi_8H,
+                                   VPR128, v4i32, v8i16>;
+  def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
+                                   opnode, NI_qdmull_hi_4S,
+                                   VPR128, v2i64, v4i32>;
+}
+
+defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2",
+                                             int_arm_neon_vqadds>;
+defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2",
+                                             int_arm_neon_vqsubs>;
+
+multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
+                         SDPatternOperator opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
+                              opnode, VPR128, VPR64, v8i16, v8i8>;
+
+    def _1q1d : NeonI_3VDiff<0b0, u, 0b11, opcode,
+                             (outs VPR128:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+                             asmop # "\t$Rd.1q, $Rn.1d, $Rm.1d",
+                             [], NoItinerary>;
+  }
+}
+
+defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp, 1>;
+
+multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
+                                   string opnode, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
+                                      !cast<PatFrag>(opnode # "_16B"),
+                                      v8i16, v16i8>;
+
+    def _1q2d : NeonI_3VDiff<0b1, u, 0b11, opcode,
+                             (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+                             asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
+                             [], NoItinerary>;
+  }
+}
+
+defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi",
+                                         1>;
+
+// End of implementation for instruction class (3V Diff)
+
+// The followings are vector load/store multiple N-element structure
+// (class SIMD lselem).
+
+// ld1:         load multiple 1-element structure to 1/2/3/4 registers.
+// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4).
+//              The structure consists of a sequence of sets of N values.
+//              The first element of the structure is placed in the first lane
+//              of the first first vector, the second element in the first lane
+//              of the second vector, and so on.
+// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into
+// the three 64-bit vectors list {BA, DC, FE}.
+// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
+// 64-bit vectors list {DA, EB, FC}.
+// Store instructions store multiple structure to N registers like load.
+
+
+class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
+                    RegisterOperand VecList, string asmop>
+  : NeonI_LdStMult<q, 1, opcode, size,
+                 (outs VecList:$Rt), (ins GPR64xsp:$Rn),
+                 asmop # "\t$Rt, [$Rn]",
+                 [],
+                 NoItinerary> {
+  let mayLoad = 1;
+  let neverHasSideEffects = 1;
+}
+
+multiclass LDVList_BHSD<bits<4> opcode, string List, string asmop> {
+  def _8B : NeonI_LDVList<0, opcode, 0b00,
+                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+  def _4H : NeonI_LDVList<0, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+  def _2S : NeonI_LDVList<0, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+  def _16B : NeonI_LDVList<1, opcode, 0b00,
+                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+  def _8H : NeonI_LDVList<1, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+  def _4S : NeonI_LDVList<1, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+  def _2D : NeonI_LDVList<1, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
+defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
+def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
+
+defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
+
+defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
+
+defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+
+// Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
+defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
+def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
+
+defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">;
+def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">;
+
+defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">;
+def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">;
+
+class NeonI_STVList<bit q, bits<4> opcode, bits<2> size,
+                    RegisterOperand VecList, string asmop>
+  : NeonI_LdStMult<q, 0, opcode, size,
+                 (outs), (ins GPR64xsp:$Rn, VecList:$Rt),
+                 asmop # "\t$Rt, [$Rn]",
+                 [],
+                 NoItinerary> {
+  let mayStore = 1;
+  let neverHasSideEffects = 1;
+}
+
+multiclass STVList_BHSD<bits<4> opcode, string List, string asmop> {
+  def _8B : NeonI_STVList<0, opcode, 0b00,
+                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+  def _4H : NeonI_STVList<0, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+  def _2S : NeonI_STVList<0, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+  def _16B : NeonI_STVList<1, opcode, 0b00,
+                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+  def _8H : NeonI_STVList<1, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+  def _4S : NeonI_STVList<1, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+  def _2D : NeonI_STVList<1, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Store multiple N-element structures from N registers (N = 1,2,3,4)
+defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
+def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
+
+defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
+
+defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
+
+defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
+
+// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
+defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
+def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
+
+defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
+def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
+
+defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
+def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
+
+def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+
+def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+
+def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
+def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
+
+def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+
+def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+
+def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
+def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
+
+def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
+          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
+          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
+          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
+          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
+          (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
+def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
+          (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
+
+def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
+          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
+          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+
+def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
+          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
+          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+
+def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
+          (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
+def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
+          (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+
+// End of vector load/store multiple N-element structure(class SIMD lselem)
+
+// The followings are post-index vector load/store multiple N-element
+// structure(class SIMD lselem-post)
+def exact1_asmoperand : AsmOperandClass {
+  let Name = "Exact1";
+  let PredicateMethod = "isExactImm<1>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact1 : Operand<i32>, ImmLeaf<i32, [{return Imm == 1;}]> {
+  let ParserMatchClass = exact1_asmoperand;
+}
+
+def exact2_asmoperand : AsmOperandClass {
+  let Name = "Exact2";
+  let PredicateMethod = "isExactImm<2>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact2 : Operand<i32>, ImmLeaf<i32, [{return Imm == 2;}]> {
+  let ParserMatchClass = exact2_asmoperand;
+}
+
+def exact3_asmoperand : AsmOperandClass {
+  let Name = "Exact3";
+  let PredicateMethod = "isExactImm<3>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact3 : Operand<i32>, ImmLeaf<i32, [{return Imm == 3;}]> {
+  let ParserMatchClass = exact3_asmoperand;
+}
+
+def exact4_asmoperand : AsmOperandClass {
+  let Name = "Exact4";
+  let PredicateMethod = "isExactImm<4>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact4 : Operand<i32>, ImmLeaf<i32, [{return Imm == 4;}]> {
+  let ParserMatchClass = exact4_asmoperand;
+}
+
+def exact6_asmoperand : AsmOperandClass {
+  let Name = "Exact6";
+  let PredicateMethod = "isExactImm<6>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact6 : Operand<i32>, ImmLeaf<i32, [{return Imm == 6;}]> {
+  let ParserMatchClass = exact6_asmoperand;
+}
+
+def exact8_asmoperand : AsmOperandClass {
+  let Name = "Exact8";
+  let PredicateMethod = "isExactImm<8>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact8 : Operand<i32>, ImmLeaf<i32, [{return Imm == 8;}]> {
+  let ParserMatchClass = exact8_asmoperand;
+}
+
+def exact12_asmoperand : AsmOperandClass {
+  let Name = "Exact12";
+  let PredicateMethod = "isExactImm<12>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact12 : Operand<i32>, ImmLeaf<i32, [{return Imm == 12;}]> {
+  let ParserMatchClass = exact12_asmoperand;
+}
+
+def exact16_asmoperand : AsmOperandClass {
+  let Name = "Exact16";
+  let PredicateMethod = "isExactImm<16>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact16 : Operand<i32>, ImmLeaf<i32, [{return Imm == 16;}]> {
+  let ParserMatchClass = exact16_asmoperand;
+}
+
+def exact24_asmoperand : AsmOperandClass {
+  let Name = "Exact24";
+  let PredicateMethod = "isExactImm<24>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact24 : Operand<i32>, ImmLeaf<i32, [{return Imm == 24;}]> {
+  let ParserMatchClass = exact24_asmoperand;
+}
+
+def exact32_asmoperand : AsmOperandClass {
+  let Name = "Exact32";
+  let PredicateMethod = "isExactImm<32>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact32 : Operand<i32>, ImmLeaf<i32, [{return Imm == 32;}]> {
+  let ParserMatchClass = exact32_asmoperand;
+}
+
+def exact48_asmoperand : AsmOperandClass {
+  let Name = "Exact48";
+  let PredicateMethod = "isExactImm<48>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact48 : Operand<i32>, ImmLeaf<i32, [{return Imm == 48;}]> {
+  let ParserMatchClass = exact48_asmoperand;
+}
+
+def exact64_asmoperand : AsmOperandClass {
+  let Name = "Exact64";
+  let PredicateMethod = "isExactImm<64>";
+  let RenderMethod = "addImmOperands";
+}
+def uimm_exact64 : Operand<i32>, ImmLeaf<i32, [{return Imm == 64;}]> {
+  let ParserMatchClass = exact64_asmoperand;
+}
+
+multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size,
+                           RegisterOperand VecList, Operand ImmTy,
+                           string asmop> {
+  let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1,
+      DecoderMethod = "DecodeVLDSTPostInstruction" in {
+    def _fixed : NeonI_LdStMult_Post<q, 1, opcode, size,
+                     (outs VecList:$Rt, GPR64xsp:$wb),
+                     (ins GPR64xsp:$Rn, ImmTy:$amt),
+                     asmop # "\t$Rt, [$Rn], $amt",
+                     [],
+                     NoItinerary> {
+      let Rm = 0b11111;
+    }
+
+    def _register : NeonI_LdStMult_Post<q, 1, opcode, size,
+                        (outs VecList:$Rt, GPR64xsp:$wb),
+                        (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
+                        asmop # "\t$Rt, [$Rn], $Rm",
+                        [],
+                        NoItinerary>;
+  }
+}
+
+multiclass LDWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
+    Operand ImmTy2, string asmop> {
+  defm _8B : NeonI_LDWB_VList<0, opcode, 0b00,
+                              !cast<RegisterOperand>(List # "8B_operand"),
+                              ImmTy, asmop>;
+
+  defm _4H : NeonI_LDWB_VList<0, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "4H_operand"),
+                              ImmTy, asmop>;
+
+  defm _2S : NeonI_LDWB_VList<0, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "2S_operand"),
+                              ImmTy, asmop>;
+
+  defm _16B : NeonI_LDWB_VList<1, opcode, 0b00,
+                               !cast<RegisterOperand>(List # "16B_operand"),
+                               ImmTy2, asmop>;
+
+  defm _8H : NeonI_LDWB_VList<1, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "8H_operand"),
+                              ImmTy2, asmop>;
+
+  defm _4S : NeonI_LDWB_VList<1, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "4S_operand"),
+                              ImmTy2, asmop>;
+
+  defm _2D : NeonI_LDWB_VList<1, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "2D_operand"),
+                              ImmTy2, asmop>;
+}
+
+// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
+defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
+defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+                                 "ld1">;
+
+defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
+
+defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                             "ld3">;
+
+defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
+
+// Post-index load multiple 1-element structures from N consecutive registers
+// (N = 2,3,4)
+defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                               "ld1">;
+defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                   uimm_exact16, "ld1">;
+
+defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                               "ld1">;
+defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                   uimm_exact24, "ld1">;
+
+defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                                "ld1">;
+defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                   uimm_exact32, "ld1">;
+
+multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
+                            RegisterOperand VecList, Operand ImmTy,
+                            string asmop> {
+  let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1,
+      DecoderMethod = "DecodeVLDSTPostInstruction" in {
+    def _fixed : NeonI_LdStMult_Post<q, 0, opcode, size,
+                     (outs GPR64xsp:$wb),
+                     (ins GPR64xsp:$Rn, ImmTy:$amt, VecList:$Rt),
+                     asmop # "\t$Rt, [$Rn], $amt",
+                     [],
+                     NoItinerary> {
+      let Rm = 0b11111;
+    }
+
+    def _register : NeonI_LdStMult_Post<q, 0, opcode, size,
+                      (outs GPR64xsp:$wb),
+                      (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VecList:$Rt),
+                      asmop # "\t$Rt, [$Rn], $Rm",
+                      [],
+                      NoItinerary>;
+  }
+}
+
+multiclass STWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
+                           Operand ImmTy2, string asmop> {
+  defm _8B : NeonI_STWB_VList<0, opcode, 0b00,
+                 !cast<RegisterOperand>(List # "8B_operand"), ImmTy, asmop>;
+
+  defm _4H : NeonI_STWB_VList<0, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "4H_operand"),
+                              ImmTy, asmop>;
+
+  defm _2S : NeonI_STWB_VList<0, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "2S_operand"),
+                              ImmTy, asmop>;
+
+  defm _16B : NeonI_STWB_VList<1, opcode, 0b00,
+                               !cast<RegisterOperand>(List # "16B_operand"),
+                               ImmTy2, asmop>;
+
+  defm _8H : NeonI_STWB_VList<1, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "8H_operand"),
+                              ImmTy2, asmop>;
+
+  defm _4S : NeonI_STWB_VList<1, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "4S_operand"),
+                              ImmTy2, asmop>;
+
+  defm _2D : NeonI_STWB_VList<1, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "2D_operand"),
+                              ImmTy2, asmop>;
+}
+
+// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
+defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
+defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+                                 "st1">;
+
+defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
+
+defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                             "st3">;
+
+defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
+
+// Post-index load multiple 1-element structures from N consecutive registers
+// (N = 2,3,4)
+defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                               "st1">;
+defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                   uimm_exact16, "st1">;
+
+defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                               "st1">;
+defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                   uimm_exact24, "st1">;
+
+defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                               "st1">;
+defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                   uimm_exact32, "st1">;
+
+// End of post-index vector load/store multiple N-element structure
+// (class SIMD lselem-post)
+
+// The followings are vector load/store single N-element structure
+// (class SIMD lsone).
+def neon_uimm0_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm == 0;}]> {
+  let ParserMatchClass = neon_uimm0_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm1_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 2;}]> {
+  let ParserMatchClass = neon_uimm1_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm2_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 4;}]> {
+  let ParserMatchClass = neon_uimm2_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm3_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 8;}]> {
+  let ParserMatchClass = uimm3_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm4_bare : Operand<i64>,
+                        ImmLeaf<i64, [{return Imm < 16;}]> {
+  let ParserMatchClass = uimm4_asmoperand;
+  let PrintMethod = "printUImmBareOperand";
+}
+
+class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+                    RegisterOperand VecList, string asmop>
+    : NeonI_LdOne_Dup<q, r, opcode, size,
+                      (outs VecList:$Rt), (ins GPR64xsp:$Rn),
+                      asmop # "\t$Rt, [$Rn]",
+                      [],
+                      NoItinerary> {
+  let mayLoad = 1;
+  let neverHasSideEffects = 1;
+}
+
+multiclass LDN_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop> {
+  def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00,
+                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+  def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+  def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+  def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "1D_operand"), asmop>;
+
+  def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00,
+                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+  def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01,
+                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+  def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10,
+                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+  def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11,
+                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Load single 1-element structure to all lanes of 1 register
+defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
+
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
+defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
+defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+
+
+class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
+                    Instruction INST>
+    : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))),
+          (VTy (INST GPR64xsp:$Rn))>;
+
+// Match all LD1R instructions
+def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
+
+def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
+
+def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
+
+def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
+
+def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
+def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
+
+def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
+def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
+
+def : LD1R_pattern<v1i64, i64, load, LD1R_1D>;
+def : LD1R_pattern<v1f64, f64, load, LD1R_1D>;
+
+def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
+def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
+
+
+multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
+                                RegisterClass RegList> {
+  defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
+  defm H : VectorList_operands<PREFIX, "H", Count, RegList>;
+  defm S : VectorList_operands<PREFIX, "S", Count, RegList>;
+  defm D : VectorList_operands<PREFIX, "D", Count, RegList>;
+}
+
+// Special vector list operand of 128-bit vectors with bare layout.
+// i.e. only show ".b", ".h", ".s", ".d"
+defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>;
+defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>;
+defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>;
+defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>;
+
+class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                     Operand ImmOp, string asmop>
+    : NeonI_LdStOne_Lane<1, r, op2_1, op0,
+                         (outs VList:$Rt),
+                         (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
+                         asmop # "\t$Rt[$lane], [$Rn]",
+                         [],
+                         NoItinerary> {
+  let mayLoad = 1;
+  let neverHasSideEffects = 1;
+  let hasExtraDefRegAllocReq = 1;
+  let Constraints = "$src = $Rt";
+}
+
+multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+  def _B : NeonI_LDN_Lane<r, 0b00, op0,
+                          !cast<RegisterOperand>(List # "B_operand"),
+                          neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H : NeonI_LDN_Lane<r, 0b01, op0,
+                          !cast<RegisterOperand>(List # "H_operand"),
+                          neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S : NeonI_LDN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "S_operand"),
+                          neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D : NeonI_LDN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "D_operand"),
+                          neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Load single 1-element structure to one lane of 1 register.
+defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
+
+// Load single N-element structure to one lane of N consecutive registers
+// (N = 2,3,4)
+defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
+defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
+defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+
+multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+                          Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
+                          Instruction INST> {
+  def : Pat<(VTy (vector_insert (VTy VPR64:$src),
+                     (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
+            (VTy (EXTRACT_SUBREG
+                     (INST GPR64xsp:$Rn,
+                           (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                           ImmOp:$lane),
+                     sub_64))>;
+
+  def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
+                      (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
+            (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
+}
+
+// Match all LD1LN instructions
+defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                      extloadi8, LD1LN_B>;
+
+defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                      extloadi16, LD1LN_H>;
+
+defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                      load, LD1LN_S>;
+defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                      load, LD1LN_S>;
+
+defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                      load, LD1LN_D>;
+defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                      load, LD1LN_D>;
+
+class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                     Operand ImmOp, string asmop>
+    : NeonI_LdStOne_Lane<0, r, op2_1, op0,
+                         (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane),
+                         asmop # "\t$Rt[$lane], [$Rn]",
+                         [],
+                         NoItinerary> {
+  let mayStore = 1;
+  let neverHasSideEffects = 1;
+  let hasExtraDefRegAllocReq = 1;
+}
+
+multiclass STN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+  def _B : NeonI_STN_Lane<r, 0b00, op0,
+                          !cast<RegisterOperand>(List # "B_operand"),
+                          neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H : NeonI_STN_Lane<r, 0b01, op0,
+                          !cast<RegisterOperand>(List # "H_operand"),
+                          neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S : NeonI_STN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "S_operand"),
+                           neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D : NeonI_STN_Lane<r, 0b10, op0,
+                          !cast<RegisterOperand>(List # "D_operand"),
+                          neon_uimm1_bare, asmop>{
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Store single 1-element structure from one lane of 1 register.
+defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
+
+// Store single N-element structure from one lane of N consecutive registers
+// (N = 2,3,4)
+defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
+defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
+defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+
+multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+                          Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
+                          Instruction INST> {
+  def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)),
+                     GPR64xsp:$Rn),
+            (INST GPR64xsp:$Rn,
+                  (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64),
+                  ImmOp:$lane)>;
+
+  def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)),
+                     GPR64xsp:$Rn),
+            (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>;
+}
+
+// Match all ST1LN instructions
+defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                      truncstorei8, ST1LN_B>;
+
+defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                      truncstorei16, ST1LN_H>;
+
+defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                      store, ST1LN_S>;
+defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                      store, ST1LN_S>;
+
+defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                      store, ST1LN_D>;
+defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                      store, ST1LN_D>;
+
+// End of vector load/store single N-element structure (class SIMD lsone).
+
+
+// The following are post-index load/store single N-element instructions
+// (class SIMD lsone-post)
+
+multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+                            RegisterOperand VecList, Operand ImmTy,
+                            string asmop> {
+  let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn",
+  DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+    def _fixed : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+                      (outs VecList:$Rt, GPR64xsp:$wb),
+                      (ins GPR64xsp:$Rn, ImmTy:$amt),
+                      asmop # "\t$Rt, [$Rn], $amt",
+                      [],
+                      NoItinerary> {
+                        let Rm = 0b11111;
+                      }
+
+    def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+                      (outs VecList:$Rt, GPR64xsp:$wb),
+                      (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
+                      asmop # "\t$Rt, [$Rn], $Rm",
+                      [],
+                      NoItinerary>;
+  }
+}
+
+multiclass LDWB_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop,
+                         Operand uimm_b, Operand uimm_h,
+                         Operand uimm_s, Operand uimm_d> {
+  defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00,
+                              !cast<RegisterOperand>(List # "8B_operand"),
+                              uimm_b, asmop>;
+
+  defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "4H_operand"),
+                              uimm_h, asmop>;
+
+  defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "2S_operand"),
+                              uimm_s, asmop>;
+
+  defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "1D_operand"),
+                              uimm_d, asmop>;
+
+  defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00,
+                               !cast<RegisterOperand>(List # "16B_operand"),
+                               uimm_b, asmop>;
+
+  defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01,
+                              !cast<RegisterOperand>(List # "8H_operand"),
+                              uimm_h, asmop>;
+
+  defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10,
+                              !cast<RegisterOperand>(List # "4S_operand"),
+                              uimm_s, asmop>;
+
+  defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11,
+                              !cast<RegisterOperand>(List # "2D_operand"),
+                              uimm_d, asmop>;
+}
+
+// Post-index load single 1-element structure to all lanes of 1 register
+defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
+                             uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
+                             uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
+                             uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
+                             uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
+    Constraints = "$Rn = $wb, $Rt = $src",
+    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+  class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                                Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+                                (outs VList:$Rt, GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, ImmTy:$amt,
+                                    VList:$src, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $amt",
+                                [],
+                                NoItinerary> {
+    let Rm = 0b11111;
+  }
+
+  class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                                 Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+                                (outs VList:$Rt, GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm,
+                                    VList:$src, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
+                                [],
+                                NoItinerary>;
+}
+
+multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+                           Operand uimm_b, Operand uimm_h,
+                           Operand uimm_s, Operand uimm_d> {
+  def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0,
+                               !cast<RegisterOperand>(List # "B_operand"),
+                               uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _B_register : LDN_WBReg_Lane<r, 0b00, op0,
+                                   !cast<RegisterOperand>(List # "B_operand"),
+                                   uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0,
+                               !cast<RegisterOperand>(List # "H_operand"),
+                               uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _H_register : LDN_WBReg_Lane<r, 0b01, op0,
+                                   !cast<RegisterOperand>(List # "H_operand"),
+                                   uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "S_operand"),
+                               uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _S_register : LDN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "S_operand"),
+                                   uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "D_operand"),
+                               uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+
+  def _D_register : LDN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "D_operand"),
+                                   uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Post-index load single 1-element structure to one lane of 1 register.
+defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
+                                uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to one lane of N consecutive
+// registers
+// (N = 2,3,4)
+defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
+                                uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
+                                uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
+                                uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayStore = 1, neverHasSideEffects = 1,
+    hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
+    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+  class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                      Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+                                (outs GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, ImmTy:$amt,
+                                    VList:$Rt, ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $amt",
+                                [],
+                                NoItinerary> {
+    let Rm = 0b11111;
+  }
+
+  class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+                       Operand ImmTy, Operand ImmOp, string asmop>
+      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+                                (outs GPR64xsp:$wb),
+                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt,
+                                    ImmOp:$lane),
+                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
+                                [],
+                                NoItinerary>;
+}
+
+multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+                           Operand uimm_b, Operand uimm_h,
+                           Operand uimm_s, Operand uimm_d> {
+  def _B_fixed : STN_WBFx_Lane<r, 0b00, op0,
+                               !cast<RegisterOperand>(List # "B_operand"),
+                               uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _B_register : STN_WBReg_Lane<r, 0b00, op0,
+                                   !cast<RegisterOperand>(List # "B_operand"),
+                                   uimm_b, neon_uimm4_bare, asmop> {
+    let Inst{12-10} = lane{2-0};
+    let Inst{30} = lane{3};
+  }
+
+  def _H_fixed : STN_WBFx_Lane<r, 0b01, op0,
+                               !cast<RegisterOperand>(List # "H_operand"),
+                               uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _H_register : STN_WBReg_Lane<r, 0b01, op0,
+                                   !cast<RegisterOperand>(List # "H_operand"),
+                                   uimm_h, neon_uimm3_bare, asmop> {
+    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+    let Inst{30} = lane{2};
+  }
+
+  def _S_fixed : STN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "S_operand"),
+                               uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _S_register : STN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "S_operand"),
+                                   uimm_s, neon_uimm2_bare, asmop> {
+    let Inst{12-10} = {lane{0}, 0b0, 0b0};
+    let Inst{30} = lane{1};
+  }
+
+  def _D_fixed : STN_WBFx_Lane<r, 0b10, op0,
+                               !cast<RegisterOperand>(List # "D_operand"),
+                               uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+
+  def _D_register : STN_WBReg_Lane<r, 0b10, op0,
+                                   !cast<RegisterOperand>(List # "D_operand"),
+                                   uimm_d, neon_uimm1_bare, asmop> {
+    let Inst{12-10} = 0b001;
+    let Inst{30} = lane{0};
+  }
+}
+
+// Post-index store single 1-element structure from one lane of 1 register.
+defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
+                                uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index store single N-element structure from one lane of N consecutive
+// registers (N = 2,3,4)
+defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
+                                uimm_exact4, uimm_exact8, uimm_exact16>;
+defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
+                                uimm_exact6, uimm_exact12, uimm_exact24>;
+defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
+                                uimm_exact8, uimm_exact16, uimm_exact32>;
+
+// End of post-index load/store single N-element instructions
+// (class SIMD lsone-post)
+
+// Neon Scalar instructions implementation
+// Scalar Three Same
+
+class NeonI_Scalar3Same_size<bit u, bits<2> size, bits<5> opcode, string asmop,
+                             RegisterClass FPRC>
+  : NeonI_Scalar3Same<u, size, opcode,
+                      (outs FPRC:$Rd), (ins FPRC:$Rn, FPRC:$Rm),
+                      !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                      [],
+                      NoItinerary>;
+
+class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
+
+multiclass NeonI_Scalar3Same_HS_sizes<bit u, bits<5> opcode, string asmop,
+                                      bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
+    def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
+  }
+}
+
+multiclass NeonI_Scalar3Same_SD_sizes<bit u, bit size_high, bits<5> opcode,
+                                      string asmop, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def sss : NeonI_Scalar3Same_size<u, {size_high, 0b0}, opcode, asmop, FPR32>;
+    def ddd : NeonI_Scalar3Same_size<u, {size_high, 0b1}, opcode, asmop, FPR64>;
+  }
+}
+
+multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
+                                        string asmop, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def bbb : NeonI_Scalar3Same_size<u, 0b00, opcode, asmop, FPR8>;
+    def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
+    def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
+    def ddd : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
+  }
+}
+
+multiclass Neon_Scalar3Same_D_size_patterns<SDPatternOperator opnode,
+                                            Instruction INSTD> {
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_BHSD_size_patterns<SDPatternOperator opnode,
+                                               Instruction INSTB,
+                                               Instruction INSTH,
+                                               Instruction INSTS,
+                                               Instruction INSTD>
+  : Neon_Scalar3Same_D_size_patterns<opnode, INSTD> {
+  def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
+           (INSTB FPR8:$Rn, FPR8:$Rm)>;
+
+  def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+           (INSTH FPR16:$Rn, FPR16:$Rm)>;
+
+  def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+           (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+class Neon_Scalar3Same_cmp_D_size_patterns<SDPatternOperator opnode,
+                                           Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
+        (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTH,
+                                             Instruction INSTS> {
+  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+            (INSTH FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_SD_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTS,
+                                             Instruction INSTD> {
+  def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass Neon_Scalar3Same_cmp_SD_size_patterns<SDPatternOperator opnode,
+                                                 Instruction INSTS,
+                                                 Instruction INSTD> {
+  def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn), (v1f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class Neon_Scalar3Same_cmp_V1_D_size_patterns<CondCode CC,
+                                              Instruction INSTD>
+  : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)),
+        (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+// Scalar Three Different
+
+class NeonI_Scalar3Diff_size<bit u, bits<2> size, bits<4> opcode, string asmop,
+                             RegisterClass FPRCD, RegisterClass FPRCS>
+  : NeonI_Scalar3Diff<u, size, opcode,
+                      (outs FPRCD:$Rd), (ins FPRCS:$Rn, FPRCS:$Rm),
+                      !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                      [],
+                      NoItinerary>;
+
+multiclass NeonI_Scalar3Diff_HS_size<bit u, bits<4> opcode, string asmop> {
+  def shh : NeonI_Scalar3Diff_size<u, 0b01, opcode, asmop, FPR32, FPR16>;
+  def dss : NeonI_Scalar3Diff_size<u, 0b10, opcode, asmop, FPR64, FPR32>;
+}
+
+multiclass NeonI_Scalar3Diff_ml_HS_size<bit u, bits<4> opcode, string asmop> {
+  let Constraints = "$Src = $Rd" in {
+    def shh : NeonI_Scalar3Diff<u, 0b01, opcode,
+                       (outs FPR32:$Rd), (ins FPR32:$Src, FPR16:$Rn, FPR16:$Rm),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                       [],
+                       NoItinerary>;
+    def dss : NeonI_Scalar3Diff<u, 0b10, opcode,
+                       (outs FPR64:$Rd), (ins FPR64:$Src, FPR32:$Rn, FPR32:$Rm),
+                       !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
+                       [],
+                       NoItinerary>;
+  }
+}
+
+multiclass Neon_Scalar3Diff_HS_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTH,
+                                             Instruction INSTS> {
+  def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+            (INSTH FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass Neon_Scalar3Diff_ml_HS_size_patterns<SDPatternOperator opnode,
+                                             Instruction INSTH,
+                                             Instruction INSTS> {
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
+            (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
+            (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>;
+}
+
+// Scalar Two Registers Miscellaneous
+
+class NeonI_Scalar2SameMisc_size<bit u, bits<2> size, bits<5> opcode, string asmop,
+                             RegisterClass FPRCD, RegisterClass FPRCS>
+  : NeonI_Scalar2SameMisc<u, size, opcode,
+                          (outs FPRCD:$Rd), (ins FPRCS:$Rn),
+                          !strconcat(asmop, "\t$Rd, $Rn"),
+                          [],
+                          NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_SD_size<bit u, bit size_high, bits<5> opcode,
+                                         string asmop> {
+  def ss : NeonI_Scalar2SameMisc_size<u, {size_high, 0b0}, opcode, asmop, FPR32,
+                                      FPR32>;
+  def dd : NeonI_Scalar2SameMisc_size<u, {size_high, 0b1}, opcode, asmop, FPR64,
+                                      FPR64>;
+}
+
+multiclass NeonI_Scalar2SameMisc_D_size<bit u, bits<5> opcode, string asmop> {
+  def dd : NeonI_Scalar2SameMisc_size<u, 0b11, opcode, asmop, FPR64, FPR64>;
+}
+
+multiclass NeonI_Scalar2SameMisc_BHSD_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar2SameMisc_D_size<u, opcode, asmop> {
+  def bb : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR8>;
+  def hh : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR16>;
+  def ss : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR32>;
+}
+
+class NeonI_Scalar2SameMisc_fcvtxn_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR32, FPR64>;
+
+multiclass NeonI_Scalar2SameMisc_narrow_HSD_size<bit u, bits<5> opcode,
+                                                 string asmop> {
+  def bh : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR16>;
+  def hs : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR32>;
+  def sd : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR64>;
+}
+
+class NeonI_Scalar2SameMisc_accum_size<bit u, bits<2> size, bits<5> opcode,
+                                       string asmop, RegisterClass FPRC>
+  : NeonI_Scalar2SameMisc<u, size, opcode,
+                          (outs FPRC:$Rd), (ins FPRC:$Src, FPRC:$Rn),
+                          !strconcat(asmop, "\t$Rd, $Rn"),
+                          [],
+                          NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode,
+                                                 string asmop> {
+
+  let Constraints = "$Src = $Rd" in {
+    def bb : NeonI_Scalar2SameMisc_accum_size<u, 0b00, opcode, asmop, FPR8>;
+    def hh : NeonI_Scalar2SameMisc_accum_size<u, 0b01, opcode, asmop, FPR16>;
+    def ss : NeonI_Scalar2SameMisc_accum_size<u, 0b10, opcode, asmop, FPR32>;
+    def dd : NeonI_Scalar2SameMisc_accum_size<u, 0b11, opcode, asmop, FPR64>;
+  }
+}
+
+class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<SDPatternOperator opnode,
+                                                  Instruction INSTD>
+  : Pat<(v1f32 (opnode (v1f64 FPR64:$Rn))),
+        (INSTD FPR64:$Rn)>;
+
+multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns<SDPatternOperator opnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator Sopnode,
+                                                     SDPatternOperator Dopnode,
+                                                     Instruction INSTS,
+                                                     Instruction INSTD> {
+  def : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_SD_size_patterns<SDPatternOperator opnode,
+                                                 Instruction INSTS,
+                                                 Instruction INSTD> {
+  def : Pat<(v1f32 (opnode (v1f32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+class NeonI_Scalar2SameMisc_cmpz_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_Scalar2SameMisc<u, 0b11, opcode,
+                          (outs FPR64:$Rd), (ins FPR64:$Rn, neon_uimm0:$Imm),
+                          !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                          [],
+                          NoItinerary>;
+
+multiclass NeonI_Scalar2SameMisc_cmpz_SD_size<bit u, bits<5> opcode,
+                                              string asmop> {
+  def ssi : NeonI_Scalar2SameMisc<u, 0b10, opcode,
+                           (outs FPR32:$Rd), (ins FPR32:$Rn, fpz32:$FPImm),
+                           !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
+                           [],
+                           NoItinerary>;
+  def ddi : NeonI_Scalar2SameMisc<u, 0b11, opcode,
+                           (outs FPR64:$Rd), (ins FPR64:$Rn, fpz32:$FPImm),
+                           !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
+                           [],
+                           NoItinerary>;
+}
+
+class Neon_Scalar2SameMisc_cmpz_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
+                       (v1i64 (bitconvert (v8i8 Neon_AllZero))))),
+        (INSTD FPR64:$Rn, 0)>;
+
+class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<CondCode CC,
+                                                   Instruction INSTD>
+  : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn),
+                          (i32 neon_uimm0:$Imm), CC)),
+        (INSTD FPR64:$Rn, neon_uimm0:$Imm)>;
+
+multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns<SDPatternOperator opnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def : Pat<(v1i32 (opnode (v1f32 FPR32:$Rn),
+                           (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
+            (INSTS FPR32:$Rn, fpz32:$FPImm)>;
+  def : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn),
+                           (v1f32 (scalar_to_vector (f32 fpz32:$FPImm))))),
+            (INSTD FPR64:$Rn, fpz32:$FPImm)>;
+}
+
+multiclass Neon_Scalar2SameMisc_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD> {
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_BHSD_size_patterns<SDPatternOperator opnode,
+                                                   Instruction INSTB,
+                                                   Instruction INSTH,
+                                                   Instruction INSTS,
+                                                   Instruction INSTD>
+  : Neon_Scalar2SameMisc_D_size_patterns<opnode, INSTD> {
+  def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))),
+            (INSTB FPR8:$Rn)>;
+  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))),
+            (INSTH FPR16:$Rn)>;
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+}
+
+multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns<
+                                                       SDPatternOperator opnode,
+                                                       Instruction INSTH,
+                                                       Instruction INSTS,
+                                                       Instruction INSTD> {
+  def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))),
+            (INSTH FPR16:$Rn)>;
+  def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Rn)>;
+  def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Rn)>;
+
+}
+
+multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns<
+                                                       SDPatternOperator opnode,
+                                                       Instruction INSTB,
+                                                       Instruction INSTH,
+                                                       Instruction INSTS,
+                                                       Instruction INSTD> {
+  def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))),
+            (INSTB FPR8:$Src, FPR8:$Rn)>;
+  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))),
+            (INSTH FPR16:$Src, FPR16:$Rn)>;
+  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))),
+            (INSTS FPR32:$Src, FPR32:$Rn)>;
+  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))),
+            (INSTD FPR64:$Src, FPR64:$Rn)>;
+}
+
+// Scalar Shift By Immediate
+
+class NeonI_ScalarShiftImm_size<bit u, bits<5> opcode, string asmop,
+                                RegisterClass FPRC, Operand ImmTy>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPRC:$Rd), (ins FPRC:$Rn, ImmTy:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary>;
+
+multiclass NeonI_ScalarShiftRightImm_D_size<bit u, bits<5> opcode,
+                                            string asmop> {
+  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftRightImm_BHSD_size<bit u, bits<5> opcode,
+                                               string asmop>
+  : NeonI_ScalarShiftRightImm_D_size<u, opcode, asmop> {
+  def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shr_imm8> {
+    bits<3> Imm;
+    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+    let Inst{18-16} = Imm;
+  }
+  def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shr_imm16> {
+    bits<4> Imm;
+    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+    let Inst{19-16} = Imm;
+  }
+  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftLeftImm_D_size<bit u, bits<5> opcode,
+                                            string asmop> {
+  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shl_imm64> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftLeftImm_BHSD_size<bit u, bits<5> opcode,
+                                              string asmop>
+  : NeonI_ScalarShiftLeftImm_D_size<u, opcode, asmop> {
+  def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shl_imm8> {
+    bits<3> Imm;
+    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+    let Inst{18-16} = Imm;
+  }
+  def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shl_imm16> {
+    bits<4> Imm;
+    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+    let Inst{19-16} = Imm;
+  }
+  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shl_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+}
+
+class NeonI_ScalarShiftRightImm_accum_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPR64:$Rd),
+                         (ins FPR64:$Src, FPR64:$Rn, shr_imm64:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+    let Constraints = "$Src = $Rd";
+}
+
+class NeonI_ScalarShiftLeftImm_accum_D_size<bit u, bits<5> opcode, string asmop>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPR64:$Rd),
+                         (ins FPR64:$Src, FPR64:$Rn, shl_imm64:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+    let Constraints = "$Src = $Rd";
+}
+
+class NeonI_ScalarShiftImm_narrow_size<bit u, bits<5> opcode, string asmop,
+                                       RegisterClass FPRCD, RegisterClass FPRCS,
+                                       Operand ImmTy>
+  : NeonI_ScalarShiftImm<u, opcode,
+                         (outs FPRCD:$Rd), (ins FPRCS:$Rn, ImmTy:$Imm),
+                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
+                         [], NoItinerary>;
+
+multiclass NeonI_ScalarShiftImm_narrow_HSD_size<bit u, bits<5> opcode,
+                                                string asmop> {
+  def bhi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR8, FPR16,
+                                             shr_imm8> {
+    bits<3> Imm;
+    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
+    let Inst{18-16} = Imm;
+  }
+  def hsi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR16, FPR32,
+                                             shr_imm16> {
+    bits<4> Imm;
+    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
+    let Inst{19-16} = Imm;
+  }
+  def sdi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR32, FPR64,
+                                             shr_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+}
+
+multiclass NeonI_ScalarShiftImm_cvt_SD_size<bit u, bits<5> opcode, string asmop> {
+  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
+    bits<5> Imm;
+    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
+    let Inst{20-16} = Imm;
+  }
+  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
+    bits<6> Imm;
+    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
+    let Inst{21-16} = Imm;
+  }
+}
+
+multiclass Neon_ScalarShiftRImm_D_size_patterns<SDPatternOperator opnode,
+                                               Instruction INSTD> {
+  def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftLImm_D_size_patterns<SDPatternOperator opnode,
+                                               Instruction INSTD> {
+  def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+class Neon_ScalarShiftImm_arm_D_size_patterns<SDPatternOperator opnode,
+                                              Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
+            (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))),
+        (INSTD FPR64:$Rn, imm:$Imm)>;
+
+multiclass Neon_ScalarShiftLImm_BHSD_size_patterns<SDPatternOperator opnode,
+                                                   Instruction INSTB,
+                                                   Instruction INSTH,
+                                                   Instruction INSTS,
+                                                   Instruction INSTD>
+  : Neon_ScalarShiftLImm_D_size_patterns<opnode, INSTD> {
+  def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))),
+                (INSTB FPR8:$Rn, imm:$Imm)>;
+  def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))),
+                (INSTH FPR16:$Rn, imm:$Imm)>;
+  def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+}
+
+class Neon_ScalarShiftLImm_accum_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
+            (i32 shl_imm64:$Imm))),
+        (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
+
+class Neon_ScalarShiftRImm_accum_D_size_patterns<SDPatternOperator opnode,
+                                                Instruction INSTD>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
+            (i32 shr_imm64:$Imm))),
+        (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
+
+multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns<
+                                                       SDPatternOperator opnode,
+                                                       Instruction INSTH,
+                                                       Instruction INSTS,
+                                                       Instruction INSTD> {
+  def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))),
+                (INSTH FPR16:$Rn, imm:$Imm)>;
+  def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+  def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator Sopnode,
+                                                      SDPatternOperator Dopnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def ssi : Pat<(f32 (Sopnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+  def ddi : Pat<(f64 (Dopnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator Sopnode,
+                                                      SDPatternOperator Dopnode,
+                                                      Instruction INSTS,
+                                                      Instruction INSTD> {
+  def ssi : Pat<(v1i32 (Sopnode (v1f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
+                (INSTS FPR32:$Rn, imm:$Imm)>;
+  def ddi : Pat<(v1i64 (Dopnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+                (INSTD FPR64:$Rn, imm:$Imm)>;
+}
+
+// Scalar Signed Shift Right (Immediate)
+defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrds_n, SSHRddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<sra, SSHRddi>;
+
+// Scalar Unsigned Shift Right (Immediate)
+defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrdu_n, USHRddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<srl, USHRddi>;
+
+// Scalar Signed Rounding Shift Right (Immediate)
+defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vsrshr, SRSHRddi>;
+
+// Scalar Unigned Rounding Shift Right (Immediate)
+defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">;
+defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vurshr, URSHRddi>;
+
+// Scalar Signed Shift Right and Accumulate (Immediate)
+def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vsrads_n, SSRA>;
+
+// Scalar Unsigned Shift Right and Accumulate (Immediate)
+def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vsradu_n, USRA>;
+
+// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
+def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vrsrads_n, SRSRA>;
+
+// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
+def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vrsradu_n, URSRA>;
+
+// Scalar Shift Left (Immediate)
+defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
+defm : Neon_ScalarShiftLImm_D_size_patterns<int_aarch64_neon_vshld_n, SHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+def : Neon_ScalarShiftImm_arm_D_size_patterns<shl, SHLddi>;
+
+// Signed Saturating Shift Left (Immediate)
+defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshls_n,
+                                               SQSHLbbi, SQSHLhhi,
+                                               SQSHLssi, SQSHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_sqrshlImm, SQSHLddi>;
+
+// Unsigned Saturating Shift Left (Immediate)
+defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshlu_n,
+                                               UQSHLbbi, UQSHLhhi,
+                                               UQSHLssi, UQSHLddi>;
+// Pattern to match llvm.arm.* intrinsic.
+defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_uqrshlImm, UQSHLddi>;
+
+// Signed Saturating Shift Left Unsigned (Immediate)
+defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">;
+defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vsqshlu,
+                                               SQSHLUbbi, SQSHLUhhi,
+                                               SQSHLUssi, SQSHLUddi>;
+
+// Shift Right And Insert (Immediate)
+def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">;
+def : Neon_ScalarShiftRImm_accum_D_size_patterns
+          <int_aarch64_neon_vsri, SRI>;
+
+// Shift Left And Insert (Immediate)
+def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">;
+def : Neon_ScalarShiftLImm_accum_D_size_patterns
+          <int_aarch64_neon_vsli, SLI>;
+
+// Signed Saturating Shift Right Narrow (Immediate)
+defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrn,
+                                                    SQSHRNbhi, SQSHRNhsi,
+                                                    SQSHRNsdi>;
+
+// Unsigned Saturating Shift Right Narrow (Immediate)
+defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqshrn,
+                                                    UQSHRNbhi, UQSHRNhsi,
+                                                    UQSHRNsdi>;
+
+// Signed Saturating Rounded Shift Right Narrow (Immediate)
+defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrn,
+                                                    SQRSHRNbhi, SQRSHRNhsi,
+                                                    SQRSHRNsdi>;
+
+// Unsigned Saturating Rounded Shift Right Narrow (Immediate)
+defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqrshrn,
+                                                    UQRSHRNbhi, UQRSHRNhsi,
+                                                    UQRSHRNsdi>;
+
+// Signed Saturating Shift Right Unsigned Narrow (Immediate)
+defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrun,
+                                                    SQSHRUNbhi, SQSHRUNhsi,
+                                                    SQSHRUNsdi>;
+
+// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
+defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">;
+defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrun,
+                                                    SQRSHRUNbhi, SQRSHRUNhsi,
+                                                    SQRSHRUNsdi>;
+
+// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
+defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">;
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_s32,
+                                                  int_aarch64_neon_vcvtf64_n_s64,
+                                                  SCVTF_Nssi, SCVTF_Nddi>;
+
+// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
+defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">;
+defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtf32_n_u32,
+                                                  int_aarch64_neon_vcvtf64_n_u64,
+                                                  UCVTF_Nssi, UCVTF_Nddi>;
+
+// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
+defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">;
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_s32_f32,
+                                                  int_aarch64_neon_vcvtd_n_s64_f64,
+                                                  FCVTZS_Nssi, FCVTZS_Nddi>;
+
+// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
+defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">;
+defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvts_n_u32_f32,
+                                                  int_aarch64_neon_vcvtd_n_u64_f64,
+                                                  FCVTZU_Nssi, FCVTZU_Nddi>;
+
+// Patterns For Convert Instructions Between v1f64 and v1i64
+class Neon_ScalarShiftImm_cvtf_v1f64_pattern<SDPatternOperator opnode,
+                                             Instruction INST>
+    : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+          (INST FPR64:$Rn, imm:$Imm)>;
+
+class Neon_ScalarShiftImm_fcvt_v1f64_pattern<SDPatternOperator opnode,
+                                             Instruction INST>
+    : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
+          (INST FPR64:$Rn, imm:$Imm)>;
+
+def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxs2fp,
+                                             SCVTF_Nddi>;
+
+def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxu2fp,
+                                             UCVTF_Nddi>;
+
+def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxs,
+                                             FCVTZS_Nddi>;
+
+def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxu,
+                                             FCVTZU_Nddi>;
+
+// Scalar Integer Add
+let isCommutable = 1 in {
+def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">;
+}
+
+// Scalar Integer Sub
+def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">;
+
+// Pattern for Scalar Integer Add and Sub with D register only
+defm : Neon_Scalar3Same_D_size_patterns<add, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<sub, SUBddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vaddds, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vadddu, ADDddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubds, SUBddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubdu, SUBddd>;
+
+// Scalar Integer Saturating Add (Signed, Unsigned)
+defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>;
+defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>;
+
+// Scalar Integer Saturating Sub (Signed, Unsigned)
+defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>;
+defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>;
+
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Add, Sub  (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqadds, SQADDbbb,
+                                           SQADDhhh, SQADDsss, SQADDddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqaddu, UQADDbbb,
+                                           UQADDhhh, UQADDsss, UQADDddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubs, SQSUBbbb,
+                                           SQSUBhhh, SQSUBsss, SQSUBddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubu, UQSUBbbb,
+                                           UQSUBhhh, UQSUBsss, UQSUBddd>;
+
+// Scalar Integer Saturating Doubling Multiply Half High
+defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>;
+
+// Scalar Integer Saturating Rounding Doubling Multiply Half High
+defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Doubling Multiply Half High and
+// Scalar Integer Saturating Rounding Doubling Multiply Half High
+defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqdmulh, SQDMULHhhh,
+                                                               SQDMULHsss>;
+defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqrdmulh, SQRDMULHhhh,
+                                                                SQRDMULHsss>;
+
+// Scalar Floating-point Multiply Extended
+defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>;
+
+// Scalar Floating-point Reciprocal Step
+defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>;
+
+// Scalar Floating-point Reciprocal Square Root Step
+defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Floating-point Reciprocal Step and
+// Scalar Floating-point Reciprocal Square Root Step
+defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrecps, FRECPSsss,
+                                                              FRECPSddd>;
+defm : Neon_Scalar3Same_SD_size_patterns<int_arm_neon_vrsqrts, FRSQRTSsss,
+                                                               FRSQRTSddd>;
+
+def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Floating-point Multiply Extended,
+multiclass Neon_Scalar3Same_MULX_SD_size_patterns<SDPatternOperator opnode,
+                                                  Instruction INSTS,
+                                                  Instruction INSTD> {
+  def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
+            (INSTS FPR32:$Rn, FPR32:$Rm)>;
+  def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
+            (INSTD FPR64:$Rn, FPR64:$Rm)>;
+}
+
+defm : Neon_Scalar3Same_MULX_SD_size_patterns<int_aarch64_neon_vmulx,
+                                              FMULXsss,FMULXddd>;
+
+// Scalar Integer Shift Left (Signed, Unsigned)
+def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
+def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshlds, SSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshldu, USHLddd>;
+
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
+defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar  Integer Saturating Shift Letf (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshls, SQSHLbbb,
+                                           SQSHLhhh, SQSHLsss, SQSHLddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshlu, UQSHLbbb,
+                                           UQSHLhhh, UQSHLsss, UQSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar  Integer Saturating Shift Letf (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
+
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
+def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshlds, SRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshldu, URSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
+
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
+defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshls, SQRSHLbbb,
+                                           SQRSHLhhh, SQRSHLsss, SQRSHLddd>;
+defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshlu, UQRSHLbbb,
+                                           UQRSHLhhh, UQRSHLsss, UQRSHLddd>;
+
+// Patterns to match llvm.arm.* intrinsic for
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
+defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
+
+// Signed Saturating Doubling Multiply-Add Long
+defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">;
+defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlal,
+                                            SQDMLALshh, SQDMLALdss>;
+
+// Signed Saturating Doubling Multiply-Subtract Long
+defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">;
+defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlsl,
+                                            SQDMLSLshh, SQDMLSLdss>;
+
+// Signed Saturating Doubling Multiply Long
+defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">;
+defm : Neon_Scalar3Diff_HS_size_patterns<int_arm_neon_vqdmull,
+                                         SQDMULLshh, SQDMULLdss>;
+
+// Scalar Signed Integer Convert To Floating-point
+defm SCVTF  : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_s32,
+                                                 int_aarch64_neon_vcvtf64_s64,
+                                                 SCVTFss, SCVTFdd>;
+
+// Scalar Unsigned Integer Convert To Floating-point
+defm UCVTF  : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">;
+defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtf32_u32,
+                                                 int_aarch64_neon_vcvtf64_u64,
+                                                 UCVTFss, UCVTFdd>;
+
+// Scalar Floating-point Converts
+def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">;
+def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<int_aarch64_neon_fcvtxn,
+                                                  FCVTXN>;
+
+defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtns,
+                                                  FCVTNSss, FCVTNSdd>;
+
+defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtnu,
+                                                  FCVTNUss, FCVTNUdd>;
+
+defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtms,
+                                                  FCVTMSss, FCVTMSdd>;
+
+defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtmu,
+                                                  FCVTMUss, FCVTMUdd>;
+
+defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtas,
+                                                  FCVTASss, FCVTASdd>;
+
+defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtau,
+                                                  FCVTAUss, FCVTAUdd>;
+
+defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtps,
+                                                  FCVTPSss, FCVTPSdd>;
+
+defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtpu,
+                                                  FCVTPUss, FCVTPUdd>;
+
+defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzs,
+                                                  FCVTZSss, FCVTZSdd>;
+
+defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">;
+defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzu,
+                                                  FCVTZUss, FCVTZUdd>;
+
+// Patterns For Convert Instructions Between v1f64 and v1i64
+class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode,
+                                              Instruction INST>
+    : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+class Neon_Scalar2SameMisc_fcvt_v1f64_pattern<SDPatternOperator opnode,
+                                              Instruction INST>
+    : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<sint_to_fp, SCVTFdd>;
+def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<uint_to_fp, UCVTFdd>;
+
+def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_sint, FCVTZSdd>;
+def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_uint, FCVTZUdd>;
+
+// Scalar Floating-point Reciprocal Estimate
+defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrecpe,
+                                             FRECPEss, FRECPEdd>;
+
+// Scalar Floating-point Reciprocal Exponent
+defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpx,
+                                             FRECPXss, FRECPXdd>;
+
+// Scalar Floating-point Reciprocal Square Root Estimate
+defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
+defm : Neon_Scalar2SameMisc_SD_size_patterns<int_arm_neon_vrsqrte,
+                                             FRSQRTEss, FRSQRTEdd>;
+
+// Scalar Floating-point Round
+class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST>
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+def : Neon_ScalarFloatRound_pattern<fceil, FRINTPdd>;
+def : Neon_ScalarFloatRound_pattern<ffloor, FRINTMdd>;
+def : Neon_ScalarFloatRound_pattern<ftrunc, FRINTZdd>;
+def : Neon_ScalarFloatRound_pattern<frint, FRINTXdd>;
+def : Neon_ScalarFloatRound_pattern<fnearbyint, FRINTIdd>;
+def : Neon_ScalarFloatRound_pattern<frnd, FRINTAdd>;
+def : Neon_ScalarFloatRound_pattern<int_aarch64_neon_frintn, FRINTNdd>;
+
+// Scalar Integer Compare
+
+// Scalar Compare Bitwise Equal
+def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>;
+
+class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode,
+                                              Instruction INSTD,
+                                              CondCode CC>
+  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)),
+        (INSTD FPR64:$Rn, FPR64:$Rm)>;
+
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMEQddd, SETEQ>;
+
+// Scalar Compare Signed Greather Than Or Equal
+def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGEddd, SETGE>;
+
+// Scalar Compare Unsigned Higher Or Same
+def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHSddd, SETUGE>;
+
+// Scalar Compare Unsigned Higher
+def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHIddd, SETUGT>;
+
+// Scalar Compare Signed Greater Than
+def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>;
+def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGTddd, SETGT>;
+
+// Scalar Compare Bitwise Test Bits
+def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
+def : Neon_Scalar3Same_cmp_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>;
+def : Neon_Scalar3Same_cmp_D_size_patterns<Neon_tst, CMTSTddd>;
+
+// Scalar Compare Bitwise Equal To Zero
+def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vceq,
+                                                CMEQddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETEQ, CMEQddi>;
+
+// Scalar Compare Signed Greather Than Or Equal To Zero
+def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcge,
+                                                CMGEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGE, CMGEddi>;
+
+// Scalar Compare Signed Greater Than Zero
+def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcgt,
+                                                CMGTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGT, CMGTddi>;
+
+// Scalar Compare Signed Less Than Or Equal To Zero
+def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vclez,
+                                                CMLEddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLE, CMLEddi>;
+
+// Scalar Compare Less Than Zero
+def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
+def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcltz,
+                                                CMLTddi>;
+def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLT, CMLTddi>;
+
+// Scalar Floating-point Compare
+
+// Scalar Floating-point Compare Mask Equal
+defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vceq,
+                                             FCMEQsss, FCMEQddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETEQ, FCMEQddd>;
+
+// Scalar Floating-point Compare Mask Equal To Zero
+defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vceq,
+                                                  FCMEQZssi, FCMEQZddi>;
+def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpz32:$FPImm), SETEQ)),
+          (FCMEQZddi FPR64:$Rn, fpz32:$FPImm)>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal
+defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcge,
+                                             FCMGEsss, FCMGEddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGE, FCMGEddd>;
+
+// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
+defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcge,
+                                                  FCMGEZssi, FCMGEZddi>;
+
+// Scalar Floating-point Compare Mask Greather Than
+defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcgt,
+                                             FCMGTsss, FCMGTddd>;
+def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGT, FCMGTddd>;
+
+// Scalar Floating-point Compare Mask Greather Than Zero
+defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcgt,
+                                                  FCMGTZssi, FCMGTZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
+defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vclez,
+                                                  FCMLEZssi, FCMLEZddi>;
+
+// Scalar Floating-point Compare Mask Less Than Zero
+defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
+defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_vcltz,
+                                                  FCMLTZssi, FCMLTZddi>;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
+defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcage,
+                                             FACGEsss, FACGEddd>;
+
+// Scalar Floating-point Absolute Compare Mask Greater Than
+defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
+defm : Neon_Scalar3Same_cmp_SD_size_patterns<int_aarch64_neon_vcagt,
+                                             FACGTsss, FACGTddd>;
+
+// Scakar Floating-point Absolute Difference
+defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">;
+defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd,
+                                         FABDsss, FABDddd>;
+
+// Scalar Absolute Value
+defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">;
+defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vabs, ABSdd>;
+
+// Scalar Signed Saturating Absolute Value
+defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqabs,
+                                               SQABSbb, SQABShh, SQABSss, SQABSdd>;
+
+// Scalar Negate
+defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">;
+defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vneg, NEGdd>;
+
+// Scalar Signed Saturating Negate
+defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">;
+defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqneg,
+                                               SQNEGbb, SQNEGhh, SQNEGss, SQNEGdd>;
+
+// Scalar Signed Saturating Accumulated of Unsigned Value
+defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vuqadd,
+                                                     SUQADDbb, SUQADDhh,
+                                                     SUQADDss, SUQADDdd>;
+
+// Scalar Unsigned Saturating Accumulated of Signed Value
+defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">;
+defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vsqadd,
+                                                     USQADDbb, USQADDhh,
+                                                     USQADDss, USQADDdd>;
+
+def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src),
+                                          (v1i64 FPR64:$Rn))),
+          (SUQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src),
+                                          (v1i64 FPR64:$Rn))),
+          (USQADDdd FPR64:$Src, FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))),
+          (ABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))),
+          (SQABSdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))),
+          (SQNEGdd FPR64:$Rn)>;
+
+def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))),
+                      (v1i64 FPR64:$Rn))),
+          (NEGdd FPR64:$Rn)>;
+
+// Scalar Signed Saturating Extract Unsigned Narrow
+defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnsu,
+                                                     SQXTUNbh, SQXTUNhs,
+                                                     SQXTUNsd>;
+
+// Scalar Signed Saturating Extract Narrow
+defm SQXTN  : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovns,
+                                                     SQXTNbh, SQXTNhs,
+                                                     SQXTNsd>;
+
+// Scalar Unsigned Saturating Extract Narrow
+defm UQXTN  : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">;
+defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnu,
+                                                     UQXTNbh, UQXTNhs,
+                                                     UQXTNsd>;
+
+// Scalar Reduce Pairwise
+
+multiclass NeonI_ScalarPair_D_sizes<bit u, bit size, bits<5> opcode,
+                                     string asmop, bit Commutable = 0> {
+  let isCommutable = Commutable in {
+    def _D_2D : NeonI_ScalarPair<u, {size, 0b1}, opcode,
+                                (outs FPR64:$Rd), (ins VPR128:$Rn),
+                                !strconcat(asmop, "\t$Rd, $Rn.2d"),
+                                [],
+                                NoItinerary>;
+  }
+}
+
+multiclass NeonI_ScalarPair_SD_sizes<bit u, bit size, bits<5> opcode,
+                                     string asmop, bit Commutable = 0>
+  : NeonI_ScalarPair_D_sizes<u, size, opcode, asmop, Commutable> {
+  let isCommutable = Commutable in {
+    def _S_2S : NeonI_ScalarPair<u, {size, 0b0}, opcode,
+                                (outs FPR32:$Rd), (ins VPR64:$Rn),
+                                !strconcat(asmop, "\t$Rd, $Rn.2s"),
+                                [],
+                                NoItinerary>;
+  }
+}
+
+// Scalar Reduce Addition Pairwise (Integer) with
+// Pattern to match llvm.arm.* intrinsic
+defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>;
+
+// Pattern to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Addition Pairwise (Integer)
+def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))),
+          (ADDPvv_D_2D VPR128:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))),
+          (ADDPvv_D_2D VPR128:$Rn)>;
+
+// Scalar Reduce Addition Pairwise (Floating Point)
+defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>;
+
+// Scalar Reduce Maximum Pairwise (Floating Point)
+defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>;
+
+// Scalar Reduce Minimum Pairwise (Floating Point)
+defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>;
+
+// Scalar Reduce maxNum Pairwise (Floating Point)
+defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
+
+// Scalar Reduce minNum Pairwise (Floating Point)
+defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
+
+multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnodeS,
+                                            SDPatternOperator opnodeD,
+                                            Instruction INSTS,
+                                            Instruction INSTD> {
+  def : Pat<(v1f32 (opnodeS (v2f32 VPR64:$Rn))),
+            (INSTS VPR64:$Rn)>;
+  def : Pat<(v1f64 (opnodeD (v2f64 VPR128:$Rn))),
+            (INSTD VPR128:$Rn)>;
+}
+
+// Patterns to match llvm.aarch64.* intrinsic for
+// Scalar Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point)
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfadd,
+  int_aarch64_neon_vpfaddq, FADDPvv_S_2S, FADDPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmax,
+  int_aarch64_neon_vpmaxq, FMAXPvv_S_2S, FMAXPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmin,
+  int_aarch64_neon_vpminq, FMINPvv_S_2S, FMINPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
+  int_aarch64_neon_vpfmaxnmq, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm,
+  int_aarch64_neon_vpfminnmq, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vaddv,
+    int_aarch64_neon_vaddv, FADDPvv_S_2S, FADDPvv_D_2D>;
+
+def : Pat<(v1f32 (int_aarch64_neon_vaddv (v4f32 VPR128:$Rn))),
+          (FADDPvv_S_2S (v2f32
+               (EXTRACT_SUBREG
+                   (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
+                   sub_64)))>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxv,
+    int_aarch64_neon_vmaxv, FMAXPvv_S_2S, FMAXPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminv,
+    int_aarch64_neon_vminv, FMINPvv_S_2S, FMINPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vmaxnmv,
+    int_aarch64_neon_vmaxnmv, FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
+
+defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vminnmv,
+    int_aarch64_neon_vminnmv, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
+
+// Scalar by element Arithmetic
+
+class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
+                                    string rmlane, bit u, bit szhi, bit szlo,
+                                    RegisterClass ResFPR, RegisterClass OpFPR,
+                                    RegisterOperand OpVPR, Operand OpImm>
+  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+                             (outs ResFPR:$Rd),
+                             (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+                             [],
+                             NoItinerary> {
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode,
+                                                    string rmlane,
+                                                    bit u, bit szhi, bit szlo,
+                                                    RegisterClass ResFPR,
+                                                    RegisterClass OpFPR,
+                                                    RegisterOperand OpVPR,
+                                                    Operand OpImm>
+  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
+                             (outs ResFPR:$Rd),
+                             (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
+                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
+                             [],
+                             NoItinerary> {
+  let Constraints = "$src = $Rd";
+  bits<3> Imm;
+  bits<5> MRm;
+}
+
+// Scalar Floating Point  multiply (scalar, by element)
+def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
+  0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point  multiply extended (scalar, by element)
+def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx",
+  0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
+  0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns<
+  SDPatternOperator opnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+
+  def  : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))),
+             (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))),
+             (ResTy (INST (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // swapped operands
+  def  : Pat<(ResTy (opnode
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+               (ResTy FPRC:$Rn))),
+             (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (opnode
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+               (ResTy FPRC:$Rn))),
+             (ResTy (INST (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+}
+
+// Patterns for Scalar Floating Point  multiply (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULssv_4S,
+  f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULddv_2D,
+  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+// Patterns for Scalar Floating Point  multiply extended (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
+  FMULXssv_4S, f32, FPR32, v4f32, neon_uimm2_bare,
+  v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
+  FMULXddv_2D, f64, FPR64, v2f64, neon_uimm1_bare,
+  v1f64, v2f64, neon_uimm0_bare>;
+
+
+// Scalar Floating Point fused multiply-add (scalar, by element)
+def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+  0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
+  0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Floating Point fused multiply-subtract (scalar, by element)
+def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+  0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1}; // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
+  0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{11} = Imm{0}; // h
+  let Inst{21} = 0b0;    // l
+  let Inst{20-16} = MRm;
+}
+// We are allowed to match the fma instruction regardless of compile options.
+multiclass Neon_ScalarXIndexedElem_FMA_Patterns<
+  Instruction FMLAI, Instruction FMLSI,
+  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+  // fmla
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // swapped fmla operands
+  def  : Pat<(ResTy (fma
+               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma
+               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLAI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // fmls
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
+               (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+
+  // swapped fmls operands
+  def  : Pat<(ResTy (fma
+               (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
+
+  def  : Pat<(ResTy (fma
+               (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
+               (ResTy FPRC:$Rn),
+               (ResTy FPRC:$Ra))),
+             (ResTy (FMLSI (ResTy FPRC:$Ra),
+               (ResTy FPRC:$Rn),
+               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
+               OpNImm:$Imm))>;
+}
+
+// Scalar Floating Point fused multiply-add and
+// multiply-subtract (scalar, by element)
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAssv_4S, FMLSssv_4S,
+  f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
+  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
+  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+// Scalar Signed saturating doubling multiply long (scalar, by element)
+def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull",
+  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+multiclass Neon_ScalarXIndexedElem_MUL_Patterns<
+  SDPatternOperator opnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass FPRC,
+  ValueType OpVTy, ValueType OpTy,
+  ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
+
+  def  : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
+               (OpVTy (scalar_to_vector
+                 (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))),
+             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+
+  //swapped operands
+  def  : Pat<(ResTy (opnode
+               (OpVTy (scalar_to_vector
+                 (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
+                 (OpVTy FPRC:$Rn))),
+             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
+}
+
+
+// Patterns for Scalar Signed saturating doubling
+// multiply long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLshv_4H, v1i32, FPR16, v1i16, i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLshv_8H, v1i32, FPR16, v1i16, i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLdsv_2S, v1i64, FPR32, v1i32, i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
+  SQDMULLdsv_4S, v1i64, FPR32, v1i32, i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Signed saturating doubling multiply-add long (scalar, by element)
+def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
+  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Scalar Signed saturating doubling
+// multiply-subtract long (scalar, by element)
+def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
+  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+multiclass Neon_ScalarXIndexedElem_MLAL_Patterns<
+  SDPatternOperator opnode,
+  SDPatternOperator coreopnode,
+  Instruction INST,
+  ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC,
+  ValueType OpTy,
+  ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
+
+  def  : Pat<(ResTy (opnode
+               (ResTy ResFPRC:$Ra),
+               (ResTy (coreopnode (OpTy FPRC:$Rn),
+                 (OpTy (scalar_to_vector
+                   (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))),
+             (ResTy (INST (ResTy ResFPRC:$Ra),
+               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+
+  // swapped operands
+  def  : Pat<(ResTy (opnode
+               (ResTy ResFPRC:$Ra),
+               (ResTy (coreopnode
+                 (OpTy (scalar_to_vector
+                   (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))),
+                 (OpTy FPRC:$Rn))))),
+             (ResTy (INST (ResTy ResFPRC:$Ra),
+               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
+}
+
+// Patterns for Scalar Signed saturating
+// doubling multiply-add long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
+  int_arm_neon_vqdmull, SQDMLALdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Patterns for Scalar Signed saturating
+// doubling multiply-sub long (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
+  int_arm_neon_vqdmull, SQDMLSLdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar general arithmetic operation
+class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode,
+                                        Instruction INST> 
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
+
+class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode,
+                                        Instruction INST> 
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (INST FPR64:$Rn, FPR64:$Rm)>;
+
+class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode,
+                                        Instruction INST> 
+    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm),
+              (v1f64 FPR64:$Ra))),
+          (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+
+def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>;
+def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>;
+
+def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>;
+def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>;
+
+def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>;
+def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>;
+
+// Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
+  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+// Patterns for Scalar Signed saturating doubling multiply returning
+// high half (scalar, by element)
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16,
+  i32, VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16,
+  i32, VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32,
+  i32, VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
+  SQDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32,
+  i32, VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Signed saturating rounding doubling multiply
+// returning high half (scalar, by element)
+def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
+  let Inst{11} = 0b0; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
+  let Inst{11} = Imm{2}; // h
+  let Inst{21} = Imm{1}; // l
+  let Inst{20} = Imm{0}; // m
+  let Inst{19-16} = MRm{3-0};
+}
+def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
+  let Inst{11} = 0b0;    // h
+  let Inst{21} = Imm{0}; // l
+  let Inst{20-16} = MRm;
+}
+def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
+  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{11} = Imm{1};    // h
+  let Inst{21} = Imm{0};    // l
+  let Inst{20-16} = MRm;
+}
+
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16, i32,
+  VPR64Lo, neon_uimm2_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16, i32,
+  VPR128Lo, neon_uimm3_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32, i32,
+  VPR64Lo, neon_uimm1_bare>;
+defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
+  SQRDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, i32,
+  VPR128Lo, neon_uimm2_bare>;
+
+// Scalar Copy - DUP element to scalar
+class NeonI_Scalar_DUP<string asmop, string asmlane,
+                       RegisterClass ResRC, RegisterOperand VPRC,
+                       Operand OpImm>
+  : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm),
+                     asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]",
+                     [],
+                     NoItinerary> {
+  bits<4> Imm;
+}
+
+def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+multiclass NeonI_Scalar_DUP_Elt_pattern<Instruction DUPI, ValueType ResTy,
+  ValueType OpTy, Operand OpImm,
+  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
+  def : Pat<(ResTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
+            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+                OpNImm:$Imm))>;
+}
+
+// Patterns for vector extract of FP data using scalar DUP instructions
+defm : NeonI_Scalar_DUP_Elt_pattern<DUPsv_S, f32,
+  v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Elt_pattern<DUPdv_D, f64,
+  v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
+
+multiclass NeonI_Scalar_DUP_Ext_Vec_pattern<Instruction DUPI,
+  ValueType ResTy, ValueType OpTy,Operand OpLImm,
+  ValueType NOpTy, ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)),
+            (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>;
+
+  def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+                OpNImm:$Imm))>;
+}
+
+// Patterns for extract subvectors of v1ix data using scalar DUP instructions.
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPbv_B, v1i8, v16i8, neon_uimm4_bare,
+                                        v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPhv_H, v1i16, v8i16, neon_uimm3_bare,
+                                        v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPsv_S, v1i32, v4i32, neon_uimm2_bare,
+                                        v2i32, v4i32, neon_uimm1_bare>;
+
+multiclass NeonI_Scalar_DUP_Copy_pattern1<Instruction DUPI, ValueType ResTy,
+                                          ValueType OpTy, ValueType ElemTy,
+                                          Operand OpImm, ValueType OpNTy,
+                                          ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (vector_insert (ResTy undef),
+              (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
+              (neon_uimm0_bare:$Imm))),
+            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (vector_insert (ResTy undef),
+              (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
+              (OpNImm:$Imm))),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              OpNImm:$Imm))>;
+}
+
+multiclass NeonI_Scalar_DUP_Copy_pattern2<Instruction DUPI, ValueType ResTy,
+                                          ValueType OpTy, ValueType ElemTy,
+                                          Operand OpImm, ValueType OpNTy,
+                                          ValueType ExTy, Operand OpNImm> {
+
+  def : Pat<(ResTy (scalar_to_vector
+              (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))),
+            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
+
+  def : Pat<(ResTy (scalar_to_vector
+              (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))),
+            (ResTy (DUPI
+              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              OpNImm:$Imm))>;
+}
+
+// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP
+// instructions.
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
+  v1i64, v2i64, i64, neon_uimm1_bare,
+  v1i64, v2i64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
+  v1i32, v4i32, i32, neon_uimm2_bare,
+  v2i32, v4i32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPhv_H,
+  v1i16, v8i16, i32, neon_uimm3_bare,
+  v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPbv_B,
+  v1i8, v16i8, i32, neon_uimm4_bare,
+  v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
+  v1f64, v2f64, f64, neon_uimm1_bare,
+  v1f64, v2f64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
+  v1f32, v4f32, f32, neon_uimm2_bare,
+  v2f32, v4f32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
+  v1i64, v2i64, i64, neon_uimm1_bare,
+  v1i64, v2i64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
+  v1i32, v4i32, i32, neon_uimm2_bare,
+  v2i32, v4i32, neon_uimm1_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPhv_H,
+  v1i16, v8i16, i32, neon_uimm3_bare,
+  v4i16, v8i16, neon_uimm2_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPbv_B,
+  v1i8, v16i8, i32, neon_uimm4_bare,
+  v8i8, v16i8, neon_uimm3_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
+  v1f64, v2f64, f64, neon_uimm1_bare,
+  v1f64, v2f64, neon_uimm0_bare>;
+defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
+  v1f32, v4f32, f32, neon_uimm2_bare,
+  v2f32, v4f32, neon_uimm1_bare>;
+
+multiclass NeonI_Scalar_DUP_alias<string asmop, string asmlane,
+                                  Instruction DUPI, Operand OpImm,
+                                  RegisterClass ResRC> {
+  def : NeonInstAlias<!strconcat(asmop, "$Rd, $Rn" # asmlane # "[$Imm]"),
+          (DUPI ResRC:$Rd, VPR128:$Rn, OpImm:$Imm), 0b0>;
+}
+
+// Aliases for Scalar copy - DUP element (scalar)
+// FIXME: This is actually the preferred syntax but TableGen can't deal with
+// custom printing of aliases.
+defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>;
+defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>;
+
+multiclass NeonI_SDUP<PatFrag GetLow, PatFrag GetHigh, ValueType ResTy,
+                      ValueType OpTy> {
+  def : Pat<(ResTy (GetLow VPR128:$Rn)),
+            (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>;
+  def : Pat<(ResTy (GetHigh VPR128:$Rn)),
+            (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>;
+}
+
+defm : NeonI_SDUP<Neon_Low16B, Neon_High16B, v8i8, v16i8>;
+defm : NeonI_SDUP<Neon_Low8H, Neon_High8H, v4i16, v8i16>;
+defm : NeonI_SDUP<Neon_Low4S, Neon_High4S, v2i32, v4i32>;
+defm : NeonI_SDUP<Neon_Low2D, Neon_High2D, v1i64, v2i64>;
+defm : NeonI_SDUP<Neon_Low4float, Neon_High4float, v2f32, v4f32>;
+defm : NeonI_SDUP<Neon_Low2double, Neon_High2double, v1f64, v2f64>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// 64-bit vector bitcasts...
+
+def : Pat<(v1i64 (bitconvert (v8i8  VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  VPR64:$src))), (v4i16 VPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (v4i16  VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16  VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16  VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16  VPR64:$src))), (v8i8 VPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (v2i32  VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32  VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32  VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32  VPR64:$src))), (v8i8 VPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (v2f32  VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32  VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32  VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32  VPR64:$src))), (v8i8 VPR64:$src)>;
+
+def : Pat<(v2f32 (bitconvert (v1i64  VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64  VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64  VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v1i64  VPR64:$src))), (v8i8 VPR64:$src)>;
+
+// ..and 128-bit vector bitcasts...
+
+def : Pat<(v2f64 (bitconvert (v16i8  VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8  VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8  VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8  VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8  VPR128:$src))), (v8i16 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v8i16  VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16  VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16  VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16  VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16  VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v4i32  VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32  VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32  VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32  VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32  VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v4f32  VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32  VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32  VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32  VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32  VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v2i64  VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64  VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64  VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64  VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64  VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v2f64  VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64  VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64  VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64  VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64  VPR128:$src))), (v16i8 VPR128:$src)>;
+
+// ...and scalar bitcasts...
+def : Pat<(f16 (bitconvert (v1i16  FPR16:$src))), (f16 FPR16:$src)>;
+def : Pat<(f32 (bitconvert (v1i32  FPR32:$src))), (f32 FPR32:$src)>;
+def : Pat<(f64 (bitconvert (v1i64  FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f32 (bitconvert (v1f32  FPR32:$src))), (f32 FPR32:$src)>;
+def : Pat<(f64 (bitconvert (v1f64  FPR64:$src))), (f64 FPR64:$src)>;
+
+def : Pat<(i64 (bitconvert (v1i64  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v1f64  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v2i32  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v2f32  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v4i16  FPR64:$src))), (FMOVxd $src)>;
+def : Pat<(i64 (bitconvert (v8i8  FPR64:$src))), (FMOVxd $src)>;
+
+def : Pat<(i32 (bitconvert (v1i32  FPR32:$src))), (FMOVws $src)>;
+
+def : Pat<(v8i8  (bitconvert (v1i64  VPR64:$src))), (v8i8 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64  VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64  VPR64:$src))), (v2i32 VPR64:$src)>;
+
+def : Pat<(f64   (bitconvert (v8i8  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4i16  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v2i32  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v2f32  VPR64:$src))), (f64 VPR64:$src)>;
+def : Pat<(f64   (bitconvert (v1i64  VPR64:$src))), (f64 VPR64:$src)>;
+
+def : Pat<(f128  (bitconvert (v16i8  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v8i16  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v4i32  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v2i64  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v4f32  VPR128:$src))), (f128 VPR128:$src)>;
+def : Pat<(f128  (bitconvert (v2f64  VPR128:$src))), (f128 VPR128:$src)>;
+
+def : Pat<(v1i16 (bitconvert (f16  FPR16:$src))), (v1i16 FPR16:$src)>;
+def : Pat<(v1i32 (bitconvert (f32  FPR32:$src))), (v1i32 FPR32:$src)>;
+def : Pat<(v1i64 (bitconvert (f64  FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1f32 (bitconvert (f32  FPR32:$src))), (v1f32 FPR32:$src)>;
+def : Pat<(v1f64 (bitconvert (f64  FPR64:$src))), (v1f64 FPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v1f64 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v2i32 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v2f32 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v4i16 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+def : Pat<(v8i8 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
+
+def : Pat<(v1i32 (bitconvert (i32  GPR32:$src))), (FMOVsw $src)>;
+
+def : Pat<(v8i8   (bitconvert (f64   FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v4i16  (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v2i32  (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2f32  (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v1i64  (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
+
+def : Pat<(v16i8  (bitconvert (f128   FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v8i16  (bitconvert (f128   FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v4i32  (bitconvert (f128   FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v2i64  (bitconvert (f128   FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v4f32  (bitconvert (f128   FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v2f64  (bitconvert (f128   FPR128:$src))), (v2f64 FPR128:$src)>;
+
+// Scalar Three Same
+
+def neon_uimm3 : Operand<i64>,
+                   ImmLeaf<i64, [{return Imm < 8;}]> {
+  let ParserMatchClass = uimm3_asmoperand;
+  let PrintMethod = "printUImmHexOperand";
+}
+
+def neon_uimm4 : Operand<i64>,
+                   ImmLeaf<i64, [{return Imm < 16;}]> {
+  let ParserMatchClass = uimm4_asmoperand;
+  let PrintMethod = "printUImmHexOperand";
+}
+
+// Bitwise Extract
+class NeonI_Extract<bit q, bits<2> op2, string asmop,
+                    string OpS, RegisterOperand OpVPR, Operand OpImm>
+  : NeonI_BitExtract<q, op2, (outs OpVPR:$Rd),
+                     (ins OpVPR:$Rn, OpVPR:$Rm, OpImm:$Index),
+                     asmop # "\t$Rd." # OpS # ", $Rn." # OpS #
+                     ", $Rm." # OpS # ", $Index",
+                     [],
+                     NoItinerary>{
+  bits<4> Index;
+}
+
+def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b",
+                               VPR64, neon_uimm3> {
+  let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}};
+}
+
+def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b",
+                               VPR128, neon_uimm4> {
+  let Inst{14-11} = Index;
+}
+
+class NI_Extract<ValueType OpTy, RegisterOperand OpVPR, Instruction INST,
+                 Operand OpImm>
+  : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm),
+                                 (i64 OpImm:$Imm))),
+              (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>;
+
+def : NI_Extract<v8i8,  VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v4i16, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v2i32, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v1i64, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v2f32, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v1f64, VPR64,  EXTvvvi_8b,  neon_uimm3>;
+def : NI_Extract<v16i8, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v8i16, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v4i32, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v2i64, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v4f32, VPR128, EXTvvvi_16b, neon_uimm4>;
+def : NI_Extract<v2f64, VPR128, EXTvvvi_16b, neon_uimm4>;
+
+// Table lookup
+class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL<q, op2, len, op,
+              (outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm),
+              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+              [],
+              NoItinerary>;
+
+// The vectors in look up table are always 16b
+multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> {
+  def _8b  : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+
+  def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">;
+defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
+defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
+defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
+
+// Table lookup extention
+class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
+             string asmop, string OpS, RegisterOperand OpVPR,
+             RegisterOperand VecList>
+  : NeonI_TBL<q, op2, len, op,
+              (outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm),
+              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
+              [],
+              NoItinerary> {
+  let Constraints = "$src = $Rd";
+}
+
+// The vectors in look up table are always 16b
+multiclass NI_TBX_pat<bits<2> len, bit op, string asmop, string List> {
+  def _8b  : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+
+  def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128,
+                    !cast<RegisterOperand>(List # "16B_operand")>;
+}
+
+defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">;
+defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">;
+defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">;
+defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">;
+
+class NeonI_INS_main<string asmop, string Res, ValueType ResTy,
+                     RegisterClass OpGPR, ValueType OpTy, Operand OpImm>
+  : NeonI_copy<0b1, 0b0, 0b0011,
+               (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd." # Res # "[$Imm], $Rn",
+               [(set (ResTy VPR128:$Rd),
+                 (ResTy (vector_insert
+                   (ResTy VPR128:$src),
+                   (OpTy OpGPR:$Rn),
+                   (OpImm:$Imm))))],
+               NoItinerary> {
+  bits<4> Imm;
+  let Constraints = "$src = $Rd";
+}
+
+//Insert element (vector, from main)
+def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32,
+                           neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32,
+                           neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32,
+                           neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64,
+                           neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn",
+                    (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn",
+                    (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn",
+                    (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn",
+                    (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>;
+
+class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy,
+                             RegisterClass OpGPR, ValueType OpTy,
+                             Operand OpImm, Instruction INS>
+  : Pat<(ResTy (vector_insert
+              (ResTy VPR64:$src),
+              (OpTy OpGPR:$Rn),
+              (OpImm:$Imm))),
+        (ResTy (EXTRACT_SUBREG
+          (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+            OpGPR:$Rn, OpImm:$Imm)), sub_64))>;
+
+def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32,
+                                          neon_uimm3_bare, INSbw>;
+def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32,
+                                          neon_uimm2_bare, INShw>;
+def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32,
+                                          neon_uimm1_bare, INSsw>;
+def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64,
+                                          neon_uimm0_bare, INSdx>;
+
+class NeonI_INS_element<string asmop, string Res, Operand ResImm>
+  : NeonI_insert<0b1, 0b1,
+                 (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn,
+                 ResImm:$Immd, ResImm:$Immn),
+                 asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]",
+                 [],
+                 NoItinerary> {
+  let Constraints = "$src = $Rd";
+  bits<4> Immd;
+  bits<4> Immn;
+}
+
+//Insert element (vector, from element)
+def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> {
+  let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1};
+  let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}};
+}
+def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> {
+  let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0};
+  let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0};
+  // bit 11 is unspecified, but should be set to zero.
+}
+def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> {
+  let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0};
+  let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0};
+  // bits 11-12 are unspecified, but should be set to zero.
+}
+def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> {
+  let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0};
+  let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0};
+  // bits 11-13 are unspecified, but should be set to zero.
+}
+
+def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]",
+                    (INSELb VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]",
+                    (INSELh VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]",
+                    (INSELs VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>;
+def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]",
+                    (INSELd VPR128:$Rd, VPR128:$Rn,
+                      neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>;
+
+multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy,
+                                ValueType MidTy, Operand StImm, Operand NaImm,
+                                Instruction INS> {
+def : Pat<(ResTy (vector_insert
+            (ResTy VPR128:$src),
+            (MidTy (vector_extract
+              (ResTy VPR128:$Rn),
+              (StImm:$Immn))),
+            (StImm:$Immd))),
+          (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
+              StImm:$Immd, StImm:$Immn)>;
+
+def : Pat <(ResTy (vector_insert
+             (ResTy VPR128:$src),
+             (MidTy (vector_extract
+               (NaTy VPR64:$Rn),
+               (NaImm:$Immn))),
+             (StImm:$Immd))),
+           (INS (ResTy VPR128:$src),
+             (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
+             StImm:$Immd, NaImm:$Immn)>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy (vector_extract
+               (ResTy VPR128:$Rn),
+               (StImm:$Immn))),
+             (NaImm:$Immd))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy VPR128:$Rn),
+               NaImm:$Immd, StImm:$Immn)),
+             sub_64))>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy (vector_extract
+               (NaTy VPR64:$Rn),
+               (NaImm:$Immn))),
+             (NaImm:$Immd))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
+               NaImm:$Immd, NaImm:$Immn)),
+             sub_64))>;
+}
+
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare,
+                            neon_uimm1_bare, INSELs>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare,
+                            neon_uimm0_bare, INSELd>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+                            neon_uimm3_bare, INSELb>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+                            neon_uimm2_bare, INSELh>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                            neon_uimm1_bare, INSELs>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare,
+                            neon_uimm0_bare, INSELd>;
+
+multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy,
+                                      ValueType MidTy,
+                                      RegisterClass OpFPR, Operand ResImm,
+                                      SubRegIndex SubIndex, Instruction INS> {
+def : Pat <(ResTy (vector_insert
+             (ResTy VPR128:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (INS (ResTy VPR128:$src),
+             (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
+             ResImm:$Imm,
+             (i64 0))>;
+
+def : Pat <(NaTy (vector_insert
+             (NaTy VPR64:$src),
+             (MidTy OpFPR:$Rn),
+             (ResImm:$Imm))),
+           (NaTy (EXTRACT_SUBREG
+             (ResTy (INS
+               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
+               (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
+               ResImm:$Imm,
+               (i64 0))),
+             sub_64))>;
+}
+
+defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare,
+                                  sub_32, INSELs>;
+defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare,
+                                  sub_64, INSELd>;
+
+class NeonI_SMOV<string asmop, string Res, bit Q,
+                 ValueType OpTy, ValueType eleTy,
+                 Operand OpImm, RegisterClass ResGPR, ValueType ResTy>
+  : NeonI_copy<Q, 0b0, 0b0101,
+               (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
+               [(set (ResTy ResGPR:$Rd),
+                 (ResTy (sext_inreg
+                   (ResTy (vector_extract
+                     (OpTy VPR128:$Rn), (OpImm:$Imm))),
+                   eleTy)))],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+//Signed integer move (main, from element)
+def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare,
+                        GPR32, i32> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare,
+                        GPR32, i32> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare,
+                        GPR64, i64> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare,
+                        GPR64, i64> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare,
+                        GPR64, i64> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy,
+                               ValueType eleTy, Operand StImm,  Operand NaImm,
+                               Instruction SMOVI> {
+  def : Pat<(i64 (sext_inreg
+              (i64 (anyext
+                (i32 (vector_extract
+                  (StTy VPR128:$Rn), (StImm:$Imm))))),
+              eleTy)),
+            (SMOVI VPR128:$Rn, StImm:$Imm)>;
+
+  def : Pat<(i64 (sext
+              (i32 (vector_extract
+                (StTy VPR128:$Rn), (StImm:$Imm))))),
+            (SMOVI VPR128:$Rn, StImm:$Imm)>;
+
+  def : Pat<(i64 (sext_inreg
+              (i64 (vector_extract
+                (NaTy VPR64:$Rn), (NaImm:$Imm))),
+              eleTy)),
+            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              NaImm:$Imm)>;
+
+  def : Pat<(i64 (sext_inreg
+              (i64 (anyext
+                (i32 (vector_extract
+                  (NaTy VPR64:$Rn), (NaImm:$Imm))))),
+              eleTy)),
+            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              NaImm:$Imm)>;
+
+  def : Pat<(i64 (sext
+              (i32 (vector_extract
+                (NaTy VPR64:$Rn), (NaImm:$Imm))))),
+            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              NaImm:$Imm)>;
+}
+
+defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                          neon_uimm3_bare, SMOVxb>;
+defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                          neon_uimm2_bare, SMOVxh>;
+defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                          neon_uimm1_bare, SMOVxs>;
+
+class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy,
+                          ValueType eleTy, Operand StImm,  Operand NaImm,
+                          Instruction SMOVI>
+  : Pat<(i32 (sext_inreg
+          (i32 (vector_extract
+            (NaTy VPR64:$Rn), (NaImm:$Imm))),
+          eleTy)),
+        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+          NaImm:$Imm)>;
+
+def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
+                         neon_uimm3_bare, SMOVwb>;
+def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
+                         neon_uimm2_bare, SMOVwh>;
+
+class NeonI_UMOV<string asmop, string Res, bit Q,
+                 ValueType OpTy, Operand OpImm,
+                 RegisterClass ResGPR, ValueType ResTy>
+  : NeonI_copy<Q, 0b0, 0b0111,
+               (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
+               [(set (ResTy ResGPR:$Rd),
+                  (ResTy (vector_extract
+                    (OpTy VPR128:$Rn), (OpImm:$Imm))))],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+//Unsigned integer move (main, from element)
+def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare,
+                         GPR32, i32> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare,
+                         GPR32, i32> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare,
+                         GPR32, i32> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare,
+                         GPR64, i64> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]",
+                    (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>;
+def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]",
+                    (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>;
+
+class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy,
+                         Operand StImm,  Operand NaImm,
+                         Instruction SMOVI>
+  : Pat<(ResTy (vector_extract
+          (NaTy VPR64:$Rn), NaImm:$Imm)),
+        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+          NaImm:$Imm)>;
+
+def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
+                        neon_uimm3_bare, UMOVwb>;
+def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
+                        neon_uimm2_bare, UMOVwh>;
+def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
+                        neon_uimm1_bare, UMOVws>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))),
+            255)),
+          (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))),
+            65535)),
+          (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>;
+
+def : Pat<(i64 (zext
+            (i32 (vector_extract
+              (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))),
+          (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))),
+            255)),
+          (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+            neon_uimm3_bare:$Imm)>;
+
+def : Pat<(i32 (and
+            (i32 (vector_extract
+              (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))),
+            65535)),
+          (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+            neon_uimm2_bare:$Imm)>;
+
+def : Pat<(i64 (zext
+            (i32 (vector_extract
+              (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))),
+          (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
+            neon_uimm0_bare:$Imm)>;
+
+// Additional copy patterns for scalar types
+def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))),
+          (UMOVwb (v16i8
+            (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>;
+
+def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))),
+          (UMOVwh (v8i16
+            (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>;
+
+def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))),
+          (FMOVws FPR32:$Rn)>;
+
+def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))),
+          (FMOVxd FPR64:$Rn)>;
+
+def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))),
+          (f64 FPR64:$Rn)>;
+
+def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))),
+          (f32 FPR32:$Rn)>;
+
+def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)),
+          (v1i8 (EXTRACT_SUBREG (v16i8
+            (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
+            sub_8))>;
+
+def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)),
+          (v1i16 (EXTRACT_SUBREG (v8i16
+            (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
+            sub_16))>;
+
+def : Pat<(v1i32 (scalar_to_vector GPR32:$src)),
+          (FMOVsw $src)>;
+
+def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
+          (FMOVdx $src)>;
+
+def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (v1f32 FPR32:$Rn)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
+          (v1f64 FPR64:$Rn)>;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+          (FMOVdd $src)>;
+
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
+                         (f64 FPR64:$src), sub_64)>;
+
+class NeonI_DUP_Elt<bit Q, string asmop, string rdlane,  string rnlane,
+                    RegisterOperand ResVPR, Operand OpImm>
+  : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd),
+               (ins VPR128:$Rn, OpImm:$Imm),
+               asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
+               [],
+               NoItinerary> {
+  bits<4> Imm;
+}
+
+def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128,
+                              neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128,
+                              neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128,
+                              neon_uimm1_bare> {
+  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
+}
+
+def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64,
+                              neon_uimm4_bare> {
+  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
+}
+
+def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64,
+                              neon_uimm3_bare> {
+  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
+}
+
+def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64,
+                              neon_uimm2_bare> {
+  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
+}
+
+multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy,
+                                       ValueType OpTy,ValueType NaTy,
+                                       ValueType ExTy, Operand OpLImm,
+                                       Operand OpNImm> {
+def  : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
+        (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
+
+def : Pat<(ResTy (Neon_vduplane
+            (NaTy VPR64:$Rn), OpNImm:$Imm)),
+          (ResTy (DUPELT
+            (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
+}
+defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8,
+                             neon_uimm4_bare, neon_uimm3_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16,
+                             neon_uimm3_bare, neon_uimm2_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64,
+                             neon_uimm1_bare, neon_uimm0_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32,
+                             neon_uimm2_bare, neon_uimm1_bare>;
+defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64,
+                             neon_uimm1_bare, neon_uimm0_bare>;
+
+def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v2f32 (DUPELT2s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
+          (v4f32 (DUPELT4s
+            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
+            (i64 0)))>;
+def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
+          (v2f64 (DUPELT2d
+            (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
+            (i64 0)))>;
+
+class NeonI_DUP<bit Q, string asmop, string rdlane,
+                RegisterOperand ResVPR, ValueType ResTy,
+                RegisterClass OpGPR, ValueType OpTy>
+  : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn),
+               asmop # "\t$Rd" # rdlane # ", $Rn",
+               [(set (ResTy ResVPR:$Rd),
+                 (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
+               NoItinerary>;
+
+def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
+  let Inst{20-16} = 0b00001;
+  // bits 17-20 are unspecified, but should be set to zero.
+}
+
+def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
+  let Inst{20-16} = 0b00010;
+  // bits 18-20 are unspecified, but should be set to zero.
+}
+
+def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
+  let Inst{20-16} = 0b00100;
+  // bits 19-20 are unspecified, but should be set to zero.
+}
+
+def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
+  let Inst{20-16} = 0b01000;
+  // bit 20 is unspecified, but should be set to zero.
+}
+
+def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
+  let Inst{20-16} = 0b00001;
+  // bits 17-20 are unspecified, but should be set to zero.
+}
+
+def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
+  let Inst{20-16} = 0b00010;
+  // bits 18-20 are unspecified, but should be set to zero.
+}
+
+def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
+  let Inst{20-16} = 0b00100;
+  // bits 19-20 are unspecified, but should be set to zero.
+}
+
+// patterns for CONCAT_VECTORS
+multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> {
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
+          (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
+          (INSELd
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
+            (i64 1),
+            (i64 0))>;
+def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
+          (DUPELT2d
+            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            (i64 0))> ;
+}
+
+defm : Concat_Vector_Pattern<v16i8, v8i8>;
+defm : Concat_Vector_Pattern<v8i16, v4i16>;
+defm : Concat_Vector_Pattern<v4i32, v2i32>;
+defm : Concat_Vector_Pattern<v2i64, v1i64>;
+defm : Concat_Vector_Pattern<v4f32, v2f32>;
+defm : Concat_Vector_Pattern<v2f64, v1f64>;
+
+//patterns for EXTRACT_SUBVECTOR
+def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
+          (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
+          (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
+          (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
+          (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
+          (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
+          (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
+
+// The followings are for instruction class (3V Elem)
+
+// Variant 1
+
+class NI_2VE<bit q, bit u, bits<2> size, bits<4> opcode,
+             string asmop, string ResS, string OpS, string EleOpS,
+             Operand OpImm, RegisterOperand ResVPR,
+             RegisterOperand OpVPR, RegisterOperand EleOpVPR>
+  : NeonI_2VElem<q, u, size, opcode,
+                 (outs ResVPR:$Rd), (ins ResVPR:$src, OpVPR:$Rn,
+                                         EleOpVPR:$Re, OpImm:$Index),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
+                 ", $Re." # EleOpS # "[$Index]",
+                 [],
+                 NoItinerary> {
+  bits<3> Index;
+  bits<5> Re;
+
+  let Constraints = "$src = $Rd";
+}
+
+multiclass NI_2VE_v1<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                     neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
+                     neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
+                     neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">;
+defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VE_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                   RegisterOperand ResVPR, RegisterOperand OpVPR,
+                   RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
+                   ValueType EleOpTy>
+  : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VE_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                  RegisterOperand ResVPR, RegisterOperand OpVPR,
+                  RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
+                  ValueType EleOpTy>
+  : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST ResVPR:$src, OpVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+multiclass NI_2VE_v1_pat<string subop, SDPatternOperator op>
+{
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+                     op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+                     op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
+                     op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
+                     op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+                    op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>;
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
+                    op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
+}
+
+defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>;
+defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>;
+
+class NI_2VE_2op<bit q, bit u, bits<2> size, bits<4> opcode,
+                 string asmop, string ResS, string OpS, string EleOpS,
+                 Operand OpImm, RegisterOperand ResVPR,
+                 RegisterOperand OpVPR, RegisterOperand EleOpVPR>
+  : NeonI_2VElem<q, u, size, opcode,
+                 (outs ResVPR:$Rd), (ins OpVPR:$Rn,
+                                         EleOpVPR:$Re, OpImm:$Index),
+                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
+                 ", $Re." # EleOpS # "[$Index]",
+                 [],
+                 NoItinerary> {
+  bits<3> Index;
+  bits<5> Re;
+}
+
+multiclass NI_2VE_v1_2op<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                         neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
+                         neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
+                         neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">;
+defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">;
+defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VE_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                       RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+                       ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
+  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VE_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                      RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+                      ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
+  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST OpVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+multiclass NI_2VE_mul_v1_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+                         op, VPR64, VPR128, v2i32, v2i32, v4i32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+                         op, VPR128, VPR128, v4i32, v4i32, v4i32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
+                         op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
+                         op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+                        op, VPR64, VPR64, v2i32, v2i32, v2i32>;
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
+                        op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
+}
+
+defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>;
+defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>;
+defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>;
+
+// Variant 2
+
+multiclass NI_2VE_v2_2op<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                         neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // _1d2d doesn't exist!
+
+  def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
+                         neon_uimm1_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{0}};
+    let Inst{21} = 0b0;
+    let Inst{20-16} = Re;
+  }
+}
+
+defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">;
+defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">;
+
+class NI_2VE_mul_lane_2d<Instruction INST, Operand OpImm, SDPatternOperator op,
+                         RegisterOperand OpVPR, RegisterOperand EleOpVPR,
+                         ValueType ResTy, ValueType OpTy, ValueType EleOpTy,
+                         SDPatternOperator coreop>
+  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
+          (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))),
+        (INST OpVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>;
+
+multiclass NI_2VE_mul_v2_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
+                         op, VPR64, VPR128, v2f32, v2f32, v4f32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
+                         op, VPR128, VPR128, v4f32, v4f32, v4f32>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
+                         op, VPR128, VPR128, v2f64, v2f64, v2f64>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
+                        op, VPR64, VPR64, v2f32, v2f32, v2f32>;
+
+  def : NI_2VE_mul_lane_2d<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
+                           op, VPR128, VPR64, v2f64, v2f64, v1f64,
+                           BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
+}
+
+defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>;
+defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>;
+
+def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))),
+                       (v2f32 VPR64:$Rn))),
+          (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))),
+                       (v4f32 VPR128:$Rn))),
+          (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))),
+                       (v2f64 VPR128:$Rn))),
+          (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>;
+
+// The followings are patterns using fma
+// -ffp-contract=fast generates fma
+
+multiclass NI_2VE_v2<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
+                     neon_uimm2_bare, VPR64, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
+                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // _1d2d doesn't exist!
+
+  def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
+                     neon_uimm1_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{0}};
+    let Inst{21} = 0b0;
+    let Inst{20-16} = Re;
+  }
+}
+
+defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">;
+defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                       RegisterOperand ResVPR, RegisterOperand OpVPR,
+                       ValueType ResTy, ValueType OpTy,
+                       SDPatternOperator coreop>
+  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
+                   (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))),
+        (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane 0
+class NI_2VEfma_lane0<Instruction INST, SDPatternOperator op,
+                      RegisterOperand ResVPR, ValueType ResTy>
+  : Pat<(ResTy (op (ResTy ResVPR:$Rn),
+                   (ResTy (Neon_vdup (f32 FPR32:$Re))),
+                   (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+              (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEswap_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                      RegisterOperand ResVPR, RegisterOperand OpVPR,
+                      ValueType ResTy, ValueType OpTy,
+                      SDPatternOperator coreop>
+  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
+                   (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEswap_lane_2d2d<Instruction INST, Operand OpImm,
+                           SDPatternOperator op,
+                           RegisterOperand ResVPR, RegisterOperand OpVPR,
+                           ValueType ResTy, ValueType OpTy,
+                           SDPatternOperator coreop>
+  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))),
+                   (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+          (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>;
+
+
+multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> {
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_2s4s"),
+                        op, VPR64, v2f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_4s4s"),
+                        op, VPR128, v4f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+                        BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
+
+  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+                             BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
+}
+
+defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>;
+
+// Pattern for lane 0
+class NI_2VEfms_lane0<Instruction INST, SDPatternOperator op,
+                      RegisterOperand ResVPR, ValueType ResTy>
+  : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)),
+                   (ResTy (Neon_vdup (f32 FPR32:$Re))),
+                   (ResTy ResVPR:$src))),
+        (INST ResVPR:$src, ResVPR:$Rn,
+              (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
+
+multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op>
+{
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+                         BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
+                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
+                         BinOpFrag<(Neon_vduplane
+                                     (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_2s4s"),
+                        op, VPR64, v2f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+                         BinOpFrag<(fneg (Neon_vduplane
+                                     node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
+                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
+                         BinOpFrag<(Neon_vduplane
+                                     (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_4s4s"),
+                        op, VPR128, v4f32>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+                         BinOpFrag<(fneg (Neon_vduplane
+                                     node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
+                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
+                         BinOpFrag<(Neon_vduplane
+                                     (fneg node:$LHS), node:$RHS)>>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+                        BinOpFrag<(fneg (Neon_vduplane
+                                    node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
+                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
+                        BinOpFrag<(Neon_vduplane
+                                    (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
+                        neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
+                        BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
+                        neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
+                        BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>;
+
+  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+                             BinOpFrag<(fneg (Neon_combine_2d
+                                         node:$LHS, node:$RHS))>>;
+
+  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
+                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
+                             BinOpFrag<(Neon_combine_2d
+                                         (fneg node:$LHS), (fneg node:$RHS))>>;
+}
+
+defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>;
+
+// Variant 3: Long type
+// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S
+//      SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S
+
+multiclass NI_2VE_v3<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
+                     neon_uimm2_bare, VPR128, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
+                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
+                     neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
+                     neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">;
+defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">;
+defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">;
+defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">;
+defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">;
+defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">;
+
+multiclass NI_2VE_v3_2op<bit u, bits<4> opcode, string asmop> {
+  // vector register class for element is always 128-bit to cover the max index
+  def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
+                         neon_uimm2_bare, VPR128, VPR64, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
+                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
+    let Inst{11} = {Index{1}};
+    let Inst{21} = {Index{0}};
+    let Inst{20-16} = Re;
+  }
+
+  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
+  def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
+                         neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+
+  def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
+                         neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
+    let Inst{11} = {Index{2}};
+    let Inst{21} = {Index{1}};
+    let Inst{20} = {Index{0}};
+    let Inst{19-16} = Re{3-0};
+  }
+}
+
+defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">;
+defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">;
+defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">;
+
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
+          (FMOVdd $src)>;
+def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))),
+          (FMOVss $src)>;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                     RegisterOperand EleOpVPR, ValueType ResTy,
+                     ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                     SDPatternOperator hiop>
+  : Pat<(ResTy (op (ResTy VPR128:$src),
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEL2_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                    RegisterOperand EleOpVPR, ValueType ResTy,
+                    ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                    SDPatternOperator hiop>
+  : Pat<(ResTy (op (ResTy VPR128:$src),
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$src, VPR128:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+class NI_2VEL2_lane0<Instruction INST, SDPatternOperator op,
+                     ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
+                     SDPatternOperator hiop, Instruction DupInst>
+  : Pat<(ResTy (op (ResTy VPR128:$src),
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
+        (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>;
+
+multiclass NI_2VEL_v3_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+                     op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+                     op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+                       op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+                       op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
+                       op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
+                       op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+                    op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+                    op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+                      op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+                      op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>;
+defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>;
+defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>;
+defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>;
+
+// Pattern for lane in 128-bit vector
+class NI_2VEL2_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
+                         RegisterOperand EleOpVPR, ValueType ResTy,
+                         ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                         SDPatternOperator hiop>
+  : Pat<(ResTy (op
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
+
+// Pattern for lane in 64-bit vector
+class NI_2VEL2_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
+                        RegisterOperand EleOpVPR, ValueType ResTy,
+                        ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
+                        SDPatternOperator hiop>
+  : Pat<(ResTy (op
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vduplane
+                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
+        (INST VPR128:$Rn,
+          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
+
+// Pattern for fixed lane 0
+class NI_2VEL2_mul_lane0<Instruction INST, SDPatternOperator op,
+                         ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
+                         SDPatternOperator hiop, Instruction DupInst>
+  : Pat<(ResTy (op
+          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
+          (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
+        (INST VPR128:$Rn, (DupInst $Re), 0)>;
+
+multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> {
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+                         op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
+
+  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+                         op, VPR64, VPR128, v2i64, v2i32, v4i32>;
+
+  def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+                         op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+                           op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+  def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_4s8h"),
+                           op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+  def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_2d4s"),
+                           op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+                        op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
+
+  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+                        op, VPR64, VPR64, v2i64, v2i32, v2i32>;
+
+  def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+                          op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+                          op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>;
+defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>;
+defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>;
+
+multiclass NI_qdma<SDPatternOperator op> {
+  def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                    (op node:$Ra,
+                      (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
+
+  def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+                    (op node:$Ra,
+                      (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
+}
+
+defm Neon_qdmlal : NI_qdma<int_arm_neon_vqadds>;
+defm Neon_qdmlsl : NI_qdma<int_arm_neon_vqsubs>;
+
+multiclass NI_2VEL_v3_qdma_pat<string subop, string op> {
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
+                     !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR128Lo,
+                     v4i32, v4i16, v8i16>;
+
+  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
+                     !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR128,
+                     v2i64, v2i32, v4i32>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
+                       !cast<PatFrag>(op # "_4s"), VPR128Lo,
+                       v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
+                       !cast<PatFrag>(op # "_2d"), VPR128,
+                       v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
+                       !cast<PatFrag>(op # "_4s"),
+                       v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
+
+  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
+                       !cast<PatFrag>(op # "_2d"),
+                       v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
+
+  // Index can only be half of the max value for lane in 64-bit vector
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
+                    !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR64Lo,
+                    v4i32, v4i16, v4i16>;
+
+  def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
+                    !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR64,
+                    v2i64, v2i32, v2i32>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
+                      !cast<PatFrag>(op # "_4s"), VPR64Lo,
+                      v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
+
+  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
+                      !cast<PatFrag>(op # "_2d"), VPR64,
+                      v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
+}
+
+defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">;
+defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">;
+
+// End of implementation for instruction class (3V Elem)
+
+class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U,
+                bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy,
+                SDPatternOperator Neon_Rev>
+  : NeonI_2VMisc<Q, U, size, opcode,
+               (outs ResVPR:$Rd), (ins ResVPR:$Rn),
+               asmop # "\t$Rd." # Res # ", $Rn." # Res,
+               [(set (ResTy ResVPR:$Rd),
+                  (ResTy (Neon_Rev (ResTy ResVPR:$Rn))))],
+               NoItinerary> ;
+
+def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128,
+                          v16i8, Neon_rev64>;
+def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128,
+                         v8i16, Neon_rev64>;
+def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128,
+                         v4i32, Neon_rev64>;
+def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64,
+                         v8i8, Neon_rev64>;
+def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64,
+                         v4i16, Neon_rev64>;
+def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64,
+                         v2i32, Neon_rev64>;
+
+def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>;
+def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>;
+
+def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128,
+                          v16i8, Neon_rev32>;
+def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128,
+                          v8i16, Neon_rev32>;
+def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64,
+                         v8i8, Neon_rev32>;
+def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64,
+                         v4i16, Neon_rev32>;
+
+def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128,
+                          v16i8, Neon_rev16>;
+def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64,
+                         v8i8, Neon_rev16>;
+
+multiclass NeonI_PairwiseAdd<string asmop, bit U, bits<5> opcode,
+                             SDPatternOperator Neon_Padd> {
+  def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$Rn),
+                           asmop # "\t$Rd.8h, $Rn.16b",
+                           [(set (v8i16 VPR128:$Rd),
+                              (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))],
+                           NoItinerary>;
+
+  def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.8b",
+                          [(set (v4i16 VPR64:$Rd),
+                             (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))],
+                          NoItinerary>;
+
+  def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$Rn),
+                           asmop # "\t$Rd.4s, $Rn.8h",
+                           [(set (v4i32 VPR128:$Rd),
+                              (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))],
+                           NoItinerary>;
+
+  def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.4h",
+                          [(set (v2i32 VPR64:$Rd),
+                             (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))],
+                          NoItinerary>;
+
+  def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$Rn),
+                           asmop # "\t$Rd.2d, $Rn.4s",
+                           [(set (v2i64 VPR128:$Rd),
+                              (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))],
+                           NoItinerary>;
+
+  def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.1d, $Rn.2s",
+                          [(set (v1i64 VPR64:$Rd),
+                             (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))],
+                          NoItinerary>;
+}
+
+defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010,
+                                int_arm_neon_vpaddls>;
+defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010,
+                                int_arm_neon_vpaddlu>;
+
+multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
+                             SDPatternOperator Neon_Padd> {
+  let Constraints = "$src = $Rd" in {
+    def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                             (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                             asmop # "\t$Rd.8h, $Rn.16b",
+                             [(set (v8i16 VPR128:$Rd),
+                                (v8i16 (Neon_Padd
+                                  (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))],
+                             NoItinerary>;
+
+    def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                            asmop # "\t$Rd.4h, $Rn.8b",
+                            [(set (v4i16 VPR64:$Rd),
+                               (v4i16 (Neon_Padd
+                                 (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))],
+                            NoItinerary>;
+
+    def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "\t$Rd.4s, $Rn.8h",
+                            [(set (v4i32 VPR128:$Rd),
+                               (v4i32 (Neon_Padd
+                                 (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))],
+                            NoItinerary>;
+
+    def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                            asmop # "\t$Rd.2s, $Rn.4h",
+                            [(set (v2i32 VPR64:$Rd),
+                               (v2i32 (Neon_Padd
+                                 (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))],
+                            NoItinerary>;
+
+    def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "\t$Rd.2d, $Rn.4s",
+                            [(set (v2i64 VPR128:$Rd),
+                               (v2i64 (Neon_Padd
+                                 (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))],
+                            NoItinerary>;
+
+    def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                            asmop # "\t$Rd.1d, $Rn.2s",
+                            [(set (v1i64 VPR64:$Rd),
+                               (v1i64 (Neon_Padd
+                                 (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))],
+                            NoItinerary>;
+  }
+}
+
+defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110,
+                                   int_arm_neon_vpadals>;
+defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110,
+                                   int_arm_neon_vpadalu>;
+
+multiclass NeonI_2VMisc_BHSDsize_1Arg<string asmop, bit U, bits<5> opcode> {
+  def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                         (outs VPR128:$Rd), (ins VPR128:$Rn),
+                         asmop # "\t$Rd.16b, $Rn.16b",
+                         [], NoItinerary>;
+
+  def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.8h, $Rn.8h",
+                        [], NoItinerary>;
+
+  def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [], NoItinerary>;
+
+  def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.2d, $Rn.2d",
+                        [], NoItinerary>;
+
+  def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                         (outs VPR64:$Rd), (ins VPR64:$Rn),
+                         asmop # "\t$Rd.8b, $Rn.8b",
+                         [], NoItinerary>;
+
+  def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.4h, $Rn.4h",
+                        [], NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [], NoItinerary>;
+}
+
+defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>;
+defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>;
+defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>;
+defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>;
+
+multiclass NeonI_2VMisc_BHSD_1Arg_Pattern<string Prefix,
+                                          SDPatternOperator Neon_Op> {
+  def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))),
+            (v16i8 (!cast<Instruction>(Prefix # 16b) (v16i8 VPR128:$Rn)))>;
+
+  def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))),
+            (v8i16 (!cast<Instruction>(Prefix # 8h) (v8i16 VPR128:$Rn)))>;
+
+  def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))),
+            (v4i32 (!cast<Instruction>(Prefix # 4s) (v4i32 VPR128:$Rn)))>;
+
+  def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))),
+            (v2i64 (!cast<Instruction>(Prefix # 2d) (v2i64 VPR128:$Rn)))>;
+
+  def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))),
+            (v8i8 (!cast<Instruction>(Prefix # 8b) (v8i8 VPR64:$Rn)))>;
+
+  def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))),
+            (v4i16 (!cast<Instruction>(Prefix # 4h) (v4i16 VPR64:$Rn)))>;
+
+  def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))),
+            (v2i32 (!cast<Instruction>(Prefix # 2s) (v2i32 VPR64:$Rn)))>;
+}
+
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>;
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>;
+defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>;
+
+def : Pat<(v16i8 (sub
+            (v16i8 Neon_AllZero),
+            (v16i8 VPR128:$Rn))),
+          (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (sub
+            (v8i8 Neon_AllZero),
+            (v8i8 VPR64:$Rn))),
+          (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>;
+def : Pat<(v8i16 (sub
+            (v8i16 (bitconvert (v16i8 Neon_AllZero))),
+            (v8i16 VPR128:$Rn))),
+          (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>;
+def : Pat<(v4i16 (sub
+            (v4i16 (bitconvert (v8i8 Neon_AllZero))),
+            (v4i16 VPR64:$Rn))),
+          (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>;
+def : Pat<(v4i32 (sub
+            (v4i32 (bitconvert (v16i8 Neon_AllZero))),
+            (v4i32 VPR128:$Rn))),
+          (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>;
+def : Pat<(v2i32 (sub
+            (v2i32 (bitconvert (v8i8 Neon_AllZero))),
+            (v2i32 VPR64:$Rn))),
+          (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>;
+def : Pat<(v2i64 (sub
+            (v2i64 (bitconvert (v16i8 Neon_AllZero))),
+            (v2i64 VPR128:$Rn))),
+          (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>;
+
+multiclass NeonI_2VMisc_BHSDsize_2Args<string asmop, bit U, bits<5> opcode> {
+  let Constraints = "$src = $Rd" in {
+    def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                           (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                           asmop # "\t$Rd.16b, $Rn.16b",
+                           [], NoItinerary>;
+
+    def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "\t$Rd.8h, $Rn.8h",
+                          [], NoItinerary>;
+
+    def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "\t$Rd.4s, $Rn.4s",
+                          [], NoItinerary>;
+
+    def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "\t$Rd.2d, $Rn.2d",
+                          [], NoItinerary>;
+
+    def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                          asmop # "\t$Rd.8b, $Rn.8b",
+                          [], NoItinerary>;
+
+    def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.4h",
+                          [], NoItinerary>;
+
+    def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2s",
+                          [], NoItinerary>;
+  }
+}
+
+defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>;
+defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>;
+
+multiclass NeonI_2VMisc_BHSD_2Args_Pattern<string Prefix,
+                                           SDPatternOperator Neon_Op> {
+  def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))),
+            (v16i8 (!cast<Instruction>(Prefix # 16b)
+              (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>;
+
+  def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))),
+            (v8i16 (!cast<Instruction>(Prefix # 8h)
+              (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>;
+
+  def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))),
+            (v4i32 (!cast<Instruction>(Prefix # 4s)
+              (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>;
+
+  def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))),
+            (v2i64 (!cast<Instruction>(Prefix # 2d)
+              (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>;
+
+  def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))),
+            (v8i8 (!cast<Instruction>(Prefix # 8b)
+              (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>;
+
+  def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))),
+            (v4i16 (!cast<Instruction>(Prefix # 4h)
+              (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>;
+
+  def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))),
+            (v2i32 (!cast<Instruction>(Prefix # 2s)
+              (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>;
+}
+
+defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>;
+defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>;
+
+multiclass NeonI_2VMisc_BHSsizes<string asmop, bit U,
+                          SDPatternOperator Neon_Op> {
+  def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100,
+                         (outs VPR128:$Rd), (ins VPR128:$Rn),
+                         asmop # "\t$Rd.16b, $Rn.16b",
+                         [(set (v16i8 VPR128:$Rd),
+                            (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))],
+                         NoItinerary>;
+
+  def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.8h, $Rn.8h",
+                        [(set (v8i16 VPR128:$Rd),
+                           (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (v4i32 VPR128:$Rd),
+                           (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.8b, $Rn.8b",
+                        [(set (v8i8 VPR64:$Rd),
+                           (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))],
+                        NoItinerary>;
+
+  def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.4h, $Rn.4h",
+                        [(set (v4i16 VPR64:$Rd),
+                           (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (v2i32 VPR64:$Rd),
+                           (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>;
+defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>;
+
+multiclass NeonI_2VMisc_Bsize<string asmop, bit U, bits<2> size,
+                              bits<5> Opcode> {
+  def 16b : NeonI_2VMisc<0b1, U, size, Opcode,
+                         (outs VPR128:$Rd), (ins VPR128:$Rn),
+                         asmop # "\t$Rd.16b, $Rn.16b",
+                         [], NoItinerary>;
+
+  def 8b : NeonI_2VMisc<0b0, U, size, Opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.8b, $Rn.8b",
+                        [], NoItinerary>;
+}
+
+defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>;
+defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>;
+defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>;
+
+def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b",
+                    (NOT16b VPR128:$Rd, VPR128:$Rn), 0>;
+def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b",
+                    (NOT8b VPR64:$Rd, VPR64:$Rn), 0>;
+
+def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))),
+          (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))),
+          (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>;
+
+def : Pat<(v16i8 (xor
+            (v16i8 VPR128:$Rn),
+            (v16i8 Neon_AllOne))),
+          (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (xor
+            (v8i8 VPR64:$Rn),
+            (v8i8 Neon_AllOne))),
+          (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>;
+def : Pat<(v8i16 (xor
+            (v8i16 VPR128:$Rn),
+            (v8i16 (bitconvert (v16i8 Neon_AllOne))))),
+          (NOT16b VPR128:$Rn)>;
+def : Pat<(v4i16 (xor
+            (v4i16 VPR64:$Rn),
+            (v4i16 (bitconvert (v8i8 Neon_AllOne))))),
+          (NOT8b VPR64:$Rn)>;
+def : Pat<(v4i32 (xor
+            (v4i32 VPR128:$Rn),
+            (v4i32 (bitconvert (v16i8 Neon_AllOne))))),
+          (NOT16b VPR128:$Rn)>;
+def : Pat<(v2i32 (xor
+            (v2i32 VPR64:$Rn),
+            (v2i32 (bitconvert (v8i8 Neon_AllOne))))),
+          (NOT8b VPR64:$Rn)>;
+def : Pat<(v2i64 (xor
+            (v2i64 VPR128:$Rn),
+            (v2i64 (bitconvert (v16i8 Neon_AllOne))))),
+          (NOT16b VPR128:$Rn)>;
+
+def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))),
+          (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>;
+def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))),
+          (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>;
+
+multiclass NeonI_2VMisc_SDsizes<string asmop, bit U, bits<5> opcode,
+                                SDPatternOperator Neon_Op> {
+  def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (v4f32 VPR128:$Rd),
+                           (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.2d, $Rn.2d",
+                        [(set (v2f64 VPR128:$Rd),
+                           (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (v2f32 VPR64:$Rd),
+                           (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>;
+defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>;
+
+multiclass NeonI_2VMisc_HSD_Narrow<string asmop, bit U, bits<5> opcode> {
+  def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.8b, $Rn.8h",
+                          [], NoItinerary>;
+
+  def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.4s",
+                          [], NoItinerary>;
+
+  def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2d",
+                          [], NoItinerary>;
+
+  let Constraints = "$Rd = $src" in {
+    def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                             (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                             asmop # "2\t$Rd.16b, $Rn.8h",
+                             [], NoItinerary>;
+
+    def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.8h, $Rn.4s",
+                            [], NoItinerary>;
+
+    def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.4s, $Rn.2d",
+                            [], NoItinerary>;
+  }
+}
+
+defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>;
+defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>;
+defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>;
+defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>;
+
+multiclass NeonI_2VMisc_Narrow_Patterns<string Prefix,
+                                        SDPatternOperator Neon_Op> {
+  def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))),
+            (v8i8 (!cast<Instruction>(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>;
+
+  def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))),
+            (v4i16 (!cast<Instruction>(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>;
+
+  def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))),
+            (v2i32 (!cast<Instruction>(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>;
+
+  def : Pat<(v16i8 (concat_vectors
+              (v8i8 VPR64:$src),
+              (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))),
+            (!cast<Instruction>(Prefix # 8h16b)
+              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+              VPR128:$Rn)>;
+
+  def : Pat<(v8i16 (concat_vectors
+              (v4i16 VPR64:$src),
+              (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))),
+            (!cast<Instruction>(Prefix # 4s8h)
+              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+              VPR128:$Rn)>;
+
+  def : Pat<(v4i32 (concat_vectors
+              (v2i32 VPR64:$src),
+              (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))),
+            (!cast<Instruction>(Prefix # 2d4s)
+              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
+              VPR128:$Rn)>;
+}
+
+defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>;
+defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>;
+defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>;
+defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>;
+
+multiclass NeonI_2VMisc_SHIFT<string asmop, bit U, bits<5> opcode> {
+  let DecoderMethod = "DecodeSHLLInstruction" in {
+    def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR64:$Rn, uimm_exact8:$Imm),
+                            asmop # "\t$Rd.8h, $Rn.8b, $Imm",
+                            [], NoItinerary>;
+
+    def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR64:$Rn, uimm_exact16:$Imm),
+                            asmop # "\t$Rd.4s, $Rn.4h, $Imm",
+                            [], NoItinerary>;
+
+    def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR64:$Rn, uimm_exact32:$Imm),
+                            asmop # "\t$Rd.2d, $Rn.2s, $Imm",
+                            [], NoItinerary>;
+
+    def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR128:$Rn, uimm_exact8:$Imm),
+                            asmop # "2\t$Rd.8h, $Rn.16b, $Imm",
+                            [], NoItinerary>;
+
+    def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR128:$Rn, uimm_exact16:$Imm),
+                            asmop # "2\t$Rd.4s, $Rn.8h, $Imm",
+                            [], NoItinerary>;
+
+    def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
+                            (outs VPR128:$Rd),
+                            (ins VPR128:$Rn, uimm_exact32:$Imm),
+                            asmop # "2\t$Rd.2d, $Rn.4s, $Imm",
+                            [], NoItinerary>;
+  }
+}
+
+defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>;
+
+class NeonI_SHLL_Patterns<ValueType OpTy, ValueType DesTy,
+                          SDPatternOperator ExtOp, Operand Neon_Imm,
+                          string suffix>
+  : Pat<(DesTy (shl
+          (DesTy (ExtOp (OpTy VPR64:$Rn))),
+            (DesTy (Neon_vdup
+              (i32 Neon_Imm:$Imm))))),
+        (!cast<Instruction>("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>;
+
+class NeonI_SHLL_High_Patterns<ValueType OpTy, ValueType DesTy,
+                               SDPatternOperator ExtOp, Operand Neon_Imm,
+                               string suffix, PatFrag GetHigh>
+  : Pat<(DesTy (shl
+          (DesTy (ExtOp
+            (OpTy (GetHigh VPR128:$Rn)))),
+              (DesTy (Neon_vdup
+                (i32 Neon_Imm:$Imm))))),
+        (!cast<Instruction>("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>;
+
+def : NeonI_SHLL_Patterns<v8i8, v8i16, zext, uimm_exact8, "8b8h">;
+def : NeonI_SHLL_Patterns<v8i8, v8i16, sext, uimm_exact8, "8b8h">;
+def : NeonI_SHLL_Patterns<v4i16, v4i32, zext, uimm_exact16, "4h4s">;
+def : NeonI_SHLL_Patterns<v4i16, v4i32, sext, uimm_exact16, "4h4s">;
+def : NeonI_SHLL_Patterns<v2i32, v2i64, zext, uimm_exact32, "2s2d">;
+def : NeonI_SHLL_Patterns<v2i32, v2i64, sext, uimm_exact32, "2s2d">;
+def : NeonI_SHLL_High_Patterns<v8i8, v8i16, zext, uimm_exact8, "16b8h",
+                               Neon_High16B>;
+def : NeonI_SHLL_High_Patterns<v8i8, v8i16, sext, uimm_exact8, "16b8h",
+                               Neon_High16B>;
+def : NeonI_SHLL_High_Patterns<v4i16, v4i32, zext, uimm_exact16, "8h4s",
+                               Neon_High8H>;
+def : NeonI_SHLL_High_Patterns<v4i16, v4i32, sext, uimm_exact16, "8h4s",
+                               Neon_High8H>;
+def : NeonI_SHLL_High_Patterns<v2i32, v2i64, zext, uimm_exact32, "4s2d",
+                               Neon_High4S>;
+def : NeonI_SHLL_High_Patterns<v2i32, v2i64, sext, uimm_exact32, "4s2d",
+                               Neon_High4S>;
+
+multiclass NeonI_2VMisc_SD_Narrow<string asmop, bit U, bits<5> opcode> {
+  def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.4h, $Rn.4s",
+                          [], NoItinerary>;
+
+  def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2d",
+                          [], NoItinerary>;
+
+  let Constraints = "$src = $Rd" in {
+    def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.8h, $Rn.4s",
+                            [], NoItinerary>;
+
+    def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                            asmop # "2\t$Rd.4s, $Rn.2d",
+                            [], NoItinerary>;
+  }
+}
+
+defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>;
+
+multiclass NeonI_2VMisc_Narrow_Pattern<string prefix,
+                                       SDPatternOperator f32_to_f16_Op,
+                                       SDPatternOperator f64_to_f32_Op> {
+
+  def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))),
+              (!cast<Instruction>(prefix # "4s4h") (v4f32 VPR128:$Rn))>;
+
+  def : Pat<(v8i16 (concat_vectors
+                (v4i16 VPR64:$src),
+                (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))),
+                  (!cast<Instruction>(prefix # "4s8h")
+                    (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+                    (v4f32 VPR128:$Rn))>;
+
+  def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))),
+            (!cast<Instruction>(prefix # "2d2s") (v2f64 VPR128:$Rn))>;
+
+  def : Pat<(v4f32 (concat_vectors
+              (v2f32 VPR64:$src),
+              (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))),
+                (!cast<Instruction>(prefix # "2d4s")
+                  (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+                  (v2f64 VPR128:$Rn))>;
+}
+
+defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>;
+
+multiclass NeonI_2VMisc_D_Narrow<string asmop, string prefix, bit U,
+                                 bits<5> opcode> {
+  def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR64:$Rd), (ins VPR128:$Rn),
+                          asmop # "\t$Rd.2s, $Rn.2d",
+                          [], NoItinerary>;
+
+  def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                          asmop # "2\t$Rd.4s, $Rn.2d",
+                          [], NoItinerary> {
+    let Constraints = "$src = $Rd";
+  }
+
+  def : Pat<(v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))),
+            (!cast<Instruction>(prefix # "2d2s") VPR128:$Rn)>;
+
+  def : Pat<(v4f32 (concat_vectors
+              (v2f32 VPR64:$src),
+              (v2f32 (int_aarch64_neon_fcvtxn (v2f64 VPR128:$Rn))))),
+            (!cast<Instruction>(prefix # "2d4s")
+               (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
+               VPR128:$Rn)>;
+}
+
+defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>;
+
+def Neon_High4Float : PatFrag<(ops node:$in),
+                              (extract_subvector (v4f32 node:$in), (iPTR 2))>;
+
+multiclass NeonI_2VMisc_HS_Extend<string asmop, bit U, bits<5> opcode> {
+  def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode,
+                          (outs VPR128:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.4s, $Rn.4h",
+                          [], NoItinerary>;
+
+  def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR64:$Rn),
+                          asmop # "\t$Rd.2d, $Rn.2s",
+                          [], NoItinerary>;
+
+  def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$Rn),
+                          asmop # "2\t$Rd.4s, $Rn.8h",
+                          [], NoItinerary>;
+
+  def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode,
+                          (outs VPR128:$Rd), (ins VPR128:$Rn),
+                          asmop # "2\t$Rd.2d, $Rn.4s",
+                          [], NoItinerary>;
+}
+
+defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>;
+
+multiclass NeonI_2VMisc_Extend_Pattern<string prefix> {
+  def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "4h4s") VPR64:$Rn)>;
+
+  def : Pat<(v4f32 (int_arm_neon_vcvthf2fp
+              (v4i16 (Neon_High8H
+                (v8i16 VPR128:$Rn))))),
+            (!cast<Instruction>(prefix # "8h4s") VPR128:$Rn)>;
+
+  def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))),
+            (!cast<Instruction>(prefix # "2s2d") VPR64:$Rn)>;
+
+  def : Pat<(v2f64 (fextend
+              (v2f32 (Neon_High4Float
+                (v4f32 VPR128:$Rn))))),
+            (!cast<Instruction>(prefix # "4s2d") VPR128:$Rn)>;
+}
+
+defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">;
+
+multiclass NeonI_2VMisc_SD_Conv<string asmop, bit Size, bit U, bits<5> opcode,
+                                ValueType ResTy4s, ValueType OpTy4s,
+                                ValueType ResTy2d, ValueType OpTy2d,
+                                ValueType ResTy2s, ValueType OpTy2s,
+                                SDPatternOperator Neon_Op> {
+
+  def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (ResTy4s VPR128:$Rd),
+                           (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.2d, $Rn.2d",
+                        [(set (ResTy2d VPR128:$Rd),
+                           (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (ResTy2s VPR64:$Rd),
+                           (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+multiclass NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U,
+                                  bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4i32, v4f32, v2i64,
+                                v2f64, v2i32, v2f32, Neon_Op>;
+}
+
+defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010,
+                                     int_aarch64_neon_fcvtns>;
+defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010,
+                                     int_aarch64_neon_fcvtnu>;
+defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010,
+                                     int_aarch64_neon_fcvtps>;
+defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010,
+                                     int_aarch64_neon_fcvtpu>;
+defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011,
+                                     int_aarch64_neon_fcvtms>;
+defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011,
+                                     int_aarch64_neon_fcvtmu>;
+defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>;
+defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>;
+defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100,
+                                     int_aarch64_neon_fcvtas>;
+defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100,
+                                     int_aarch64_neon_fcvtau>;
+
+multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U,
+                                  bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4i32, v2f64,
+                                v2i64, v2f32, v2i32, Neon_Op>;
+}
+
+defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>;
+defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>;
+
+multiclass NeonI_2VMisc_fp_to_fp<string asmop, bit Size, bit U,
+                                 bits<5> opcode, SDPatternOperator Neon_Op> {
+  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4f32, v2f64,
+                                v2f64, v2f32, v2f32, Neon_Op>;
+}
+
+defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000,
+                                     int_aarch64_neon_frintn>;
+defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>;
+defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>;
+defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>;
+defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>;
+defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>;
+defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>;
+defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101,
+                                    int_arm_neon_vrecpe>;
+defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101,
+                                     int_arm_neon_vrsqrte>;
+defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>;
+
+multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
+                               bits<5> opcode, SDPatternOperator Neon_Op> {
+  def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
+                        (outs VPR128:$Rd), (ins VPR128:$Rn),
+                        asmop # "\t$Rd.4s, $Rn.4s",
+                        [(set (v4i32 VPR128:$Rd),
+                           (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
+                        NoItinerary>;
+
+  def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
+                        (outs VPR64:$Rd), (ins VPR64:$Rn),
+                        asmop # "\t$Rd.2s, $Rn.2s",
+                        [(set (v2i32 VPR64:$Rd),
+                           (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
+                        NoItinerary>;
+}
+
+defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100,
+                                  int_arm_neon_vrecpe>;
+defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100,
+                                   int_arm_neon_vrsqrte>;
+
+// Crypto Class
+class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_AES<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                     asmop # "\t$Rd.16b, $Rn.16b",
+                     [(set (v16i8 VPR128:$Rd),
+                        (v16i8 (opnode (v16i8 VPR128:$src),
+                                       (v16i8 VPR128:$Rn))))],
+                     NoItinerary>{
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>;
+def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>;
+
+class NeonI_Cryptoaes<bits<2> size, bits<5> opcode,
+                      string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_AES<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$Rn),
+                     asmop # "\t$Rd.16b, $Rn.16b",
+                     [(set (v16i8 VPR128:$Rd),
+                        (v16i8 (opnode (v16i8 VPR128:$Rn))))],
+                     NoItinerary>;
+
+def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>;
+def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>;
+
+class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_SHA<size, opcode,
+                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
+                     asmop # "\t$Rd.4s, $Rn.4s",
+                     [(set (v4i32 VPR128:$Rd),
+                        (v4i32 (opnode (v4i32 VPR128:$src),
+                                       (v4i32 VPR128:$Rn))))],
+                     NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1",
+                                 int_arm_neon_sha1su1>;
+def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0",
+                                   int_arm_neon_sha256su0>;
+
+class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode,
+                         string asmop, SDPatternOperator opnode>
+  : NeonI_Crypto_SHA<size, opcode,
+                     (outs FPR32:$Rd), (ins FPR32:$Rn),
+                     asmop # "\t$Rd, $Rn",
+                     [(set (v1i32 FPR32:$Rd),
+                        (v1i32 (opnode (v1i32 FPR32:$Rn))))],
+                     NoItinerary> {
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>;
+
+class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs VPR128:$Rd),
+                       (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+                       [(set (v4i32 VPR128:$Rd),
+                          (v4i32 (opnode (v4i32 VPR128:$src),
+                                         (v4i32 VPR128:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0",
+                                   int_arm_neon_sha1su0>;
+def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1",
+                                     int_arm_neon_sha256su1>;
+
+class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs FPR128:$Rd),
+                       (ins FPR128:$src, FPR128:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd, $Rn, $Rm.4s",
+                       [(set (v4i32 FPR128:$Rd),
+                          (v4i32 (opnode (v4i32 FPR128:$src),
+                                         (v4i32 FPR128:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h",
+                                   int_arm_neon_sha256h>;
+def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2",
+                                    int_arm_neon_sha256h2>;
+
+class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop,
+                           SDPatternOperator opnode>
+  : NeonI_Crypto_3VSHA<size, opcode,
+                       (outs FPR128:$Rd),
+                       (ins FPR128:$src, FPR32:$Rn, VPR128:$Rm),
+                       asmop # "\t$Rd, $Rn, $Rm.4s",
+                       [(set (v4i32 FPR128:$Rd),
+                          (v4i32 (opnode (v4i32 FPR128:$src),
+                                         (v1i32 FPR32:$Rn),
+                                         (v4i32 VPR128:$Rm))))],
+                       NoItinerary> {
+  let Constraints = "$src = $Rd";
+  let Predicates = [HasNEON, HasCrypto];
+}
+
+def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c", int_aarch64_neon_sha1c>;
+def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p", int_aarch64_neon_sha1p>;
+def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m", int_aarch64_neon_sha1m>;
+
+//
+// Patterns for handling half-precision values
+//
+
+// Convert f16 value coming in as i16 value to f32
+def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))),
+          (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
+def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))),
+          (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
+
+def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 (
+            f32_to_f16 (f32 FPR32:$Rn))))))),
+          (f32 FPR32:$Rn)>;
+
+// Patterns for vector extract of half-precision FP value in i16 storage type
+def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
+            (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))),
+          (FCVTsh (f16 (DUPhv_H
+            (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+            neon_uimm2_bare:$Imm)))>;
+
+def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
+            (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))),
+          (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>;
+
+// Patterns for vector insert of half-precision FP value 0 in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
+            (neon_uimm3_bare:$Imm))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn),
+            (v8i16 (SUBREG_TO_REG (i64 0),
+              (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
+              sub_16)),
+            neon_uimm3_bare:$Imm, 0))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
+            (neon_uimm2_bare:$Imm))),
+          (v4i16 (EXTRACT_SUBREG
+            (v8i16 (INSELh
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              (v8i16 (SUBREG_TO_REG (i64 0),
+                (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
+                sub_16)),
+              neon_uimm2_bare:$Imm, 0)),
+            sub_64))>;
+
+// Patterns for vector insert of half-precision FP value in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint
+              (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
+            (neon_uimm3_bare:$Imm))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn),
+            (v8i16 (SUBREG_TO_REG (i64 0),
+              (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
+              sub_16)),
+            neon_uimm3_bare:$Imm, 0))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint
+              (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
+            (neon_uimm2_bare:$Imm))),
+          (v4i16 (EXTRACT_SUBREG
+            (v8i16 (INSELh
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              (v8i16 (SUBREG_TO_REG (i64 0),
+                (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
+                sub_16)),
+              neon_uimm2_bare:$Imm, 0)),
+            sub_64))>;
+
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
+              (neon_uimm3_bare:$Imm1))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
+            neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
+
+// Patterns for vector copy of half-precision FP value in i16 storage type
+def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
+              (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
+              65535)))))))),
+            (neon_uimm3_bare:$Imm1))),
+          (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
+            neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
+
+def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
+            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
+              (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)),
+              65535)))))))),
+            (neon_uimm3_bare:$Imm1))),
+          (v4i16 (EXTRACT_SUBREG
+            (v8i16 (INSELh
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
+              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
+              neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)),
+            sub_64))>;
+
+
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 3d22330..8cfb968 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -109,6 +109,11 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
   case MachineOperand::MO_Immediate:
     MCOp = MCOperand::CreateImm(MO.getImm());
     break;
+  case MachineOperand::MO_FPImmediate: {
+    assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported");
+    MCOp = MCOperand::CreateFPImm(0.0);
+    break;
+  }
   case MachineOperand::MO_BlockAddress:
     MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
     break;
@@ -116,7 +121,7 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
     MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
     break;
   case MachineOperand::MO_GlobalAddress:
-    MCOp = lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()));
+    MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal()));
     break;
   case MachineOperand::MO_MachineBasicBlock:
     MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 20b0dcf..75ec44f 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -29,9 +29,8 @@
 
 using namespace llvm;
 
-AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo &tii,
-                                         const AArch64Subtarget &sti)
-  : AArch64GenRegisterInfo(AArch64::X30), TII(tii) {
+AArch64RegisterInfo::AArch64RegisterInfo()
+  : AArch64GenRegisterInfo(AArch64::X30) {
 }
 
 const uint16_t *
@@ -122,6 +121,8 @@ AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
     return;
   }
 
+  const AArch64InstrInfo &TII =
+    *static_cast<const AArch64InstrInfo*>(MF.getTarget().getInstrInfo());
   int MinOffset, MaxOffset, OffsetScale;
   if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s) {
     MinOffset = 0;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index bb64fd5..4d67943 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -25,12 +25,7 @@ class AArch64InstrInfo;
 class AArch64Subtarget;
 
 struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
-private:
-  const AArch64InstrInfo &TII;
-
-public:
-  AArch64RegisterInfo(const AArch64InstrInfo &tii,
-                      const AArch64Subtarget &sti);
+  AArch64RegisterInfo();
 
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index bd79546..4e2022c 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -12,15 +12,25 @@
 //===----------------------------------------------------------------------===//
 
 let Namespace = "AArch64" in {
-def sub_128 : SubRegIndex;
-def sub_64 : SubRegIndex;
-def sub_32 : SubRegIndex;
-def sub_16 : SubRegIndex;
-def sub_8  : SubRegIndex;
-
-// The VPR registers are handled as sub-registers of FPR equivalents, but
-// they're really the same thing. We give this concept a special index.
-def sub_alias : SubRegIndex;
+def sub_128 : SubRegIndex<128>;
+def sub_64 : SubRegIndex<64>;
+def sub_32 : SubRegIndex<32>;
+def sub_16 : SubRegIndex<16>;
+def sub_8  : SubRegIndex<8>;
+
+// Note: Code depends on these having consecutive numbers.
+def qqsub : SubRegIndex<256, 256>;
+
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
 }
 
 // Registers are identified with 5-bit ID numbers.
@@ -137,60 +147,51 @@ foreach Index = 0-31 in {
 }
 
 
-def FPR8 : RegisterClass<"AArch64", [i8], 8,
+def FPR8 : RegisterClass<"AArch64", [i8, v1i8], 8,
                           (sequence "B%u", 0, 31)> {
 }
 
-def FPR16 : RegisterClass<"AArch64", [f16], 16,
+def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16,
                           (sequence "H%u", 0, 31)> {
 }
 
-def FPR32 : RegisterClass<"AArch64", [f32], 32,
+def FPR32 : RegisterClass<"AArch64", [f32, v1i32, v1f32], 32,
                           (sequence "S%u", 0, 31)> {
 }
 
-def FPR64 : RegisterClass<"AArch64", [f64], 64,
-                          (sequence "D%u", 0, 31)> {
-}
+def FPR64 : RegisterClass<"AArch64",
+                          [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
+                          64, (sequence "D%u", 0, 31)>;
 
-def FPR128 : RegisterClass<"AArch64", [f128], 128,
-                          (sequence "Q%u", 0, 31)> {
-}
+def FPR128 : RegisterClass<"AArch64",
+                           [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
+                           128, (sequence "Q%u", 0, 31)>;
 
+def FPR64Lo : RegisterClass<"AArch64",
+                            [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
+                            64, (sequence "D%u", 0, 15)>;
+
+def FPR128Lo : RegisterClass<"AArch64",
+                             [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
+                             128, (sequence "Q%u", 0, 15)>;
 
 //===----------------------------------------------------------------------===//
 //  Vector registers:
 //===----------------------------------------------------------------------===//
 
-// NEON registers simply specify the overall vector, and it's expected that
-// Instructions will individually specify the acceptable data layout. In
-// principle this leaves two approaches open:
-//   + An operand, giving a single ADDvvv instruction (for example). This turns
-//     out to be unworkable in the assembly parser (without every Instruction
-//     having a "cvt" function, at least) because the constraints can't be
-//     properly enforced. It also complicates specifying patterns since each
-//     instruction will accept many types.
-//  + A bare token (e.g. ".2d"). This means the AsmParser has to know specific
-//    details about NEON registers, but simplifies most other details.
-//
-// The second approach was taken.
-
-foreach Index = 0-31 in {
-  def V # Index  : AArch64RegWithSubs<Index, "v" # Index,
-                                      [!cast<Register>("Q" # Index)],
-                                      [sub_alias]>,
-            DwarfRegNum<[!add(Index, 64)]>;
+def VPR64AsmOperand : AsmOperandClass {
+  let Name = "VPR";
+  let PredicateMethod = "isReg";
+  let RenderMethod = "addRegOperands";
 }
 
-// These two classes contain the same registers, which should be reasonably
-// sensible for MC and allocation purposes, but allows them to be treated
-// separately for things like stack spilling.
-def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8], 64,
-                          (sequence "V%u", 0, 31)>;
+def VPR64 : RegisterOperand<FPR64, "printVPRRegister">;
+
+def VPR128 : RegisterOperand<FPR128, "printVPRRegister">;
+
+def VPR64Lo : RegisterOperand<FPR64Lo, "printVPRRegister">;
 
-def VPR128 : RegisterClass<"AArch64",
-                           [v2f64, v2i64, v4f32, v4i32, v8i16, v16i8], 128,
-                           (sequence "V%u", 0, 31)>;
+def VPR128Lo : RegisterOperand<FPR128Lo, "printVPRRegister">;
 
 // Flags register
 def NZCV : Register<"nzcv"> {
@@ -201,3 +202,90 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
   let CopyCost = -1;
   let isAllocatable = 0;
 }
+
+//===----------------------------------------------------------------------===//
+//  Consecutive vector registers
+//===----------------------------------------------------------------------===//
+// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31
+def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
+                              [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+                              
+// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1
+def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
+                              [(rotl FPR64, 0), (rotl FPR64, 1),
+                               (rotl FPR64, 2)]>;
+                               
+// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2
+def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
+                              [(rotl FPR64, 0), (rotl FPR64, 1),
+                               (rotl FPR64, 2), (rotl FPR64, 3)]>;
+
+// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31
+def Tuples2Q : RegisterTuples<[qsub_0, qsub_1],
+                              [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+
+// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1
+def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2],
+                              [(rotl FPR128, 0), (rotl FPR128, 1),
+                               (rotl FPR128, 2)]>;
+                               
+// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2
+def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3],
+                              [(rotl FPR128, 0), (rotl FPR128, 1),
+                               (rotl FPR128, 2), (rotl FPR128, 3)]>;
+
+// The followings are super register classes to model 2/3/4 consecutive
+// 64-bit/128-bit registers.
+
+def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>;
+
+def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> {
+  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
+}
+
+def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>;
+
+def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>;
+
+def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> {
+  let Size = 384; // 3 x 128 bits, we have no predefined type of that size.
+}
+
+def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>;
+
+
+// The followings are vector list operands
+multiclass VectorList_operands<string PREFIX, string LAYOUT, int Count,
+                               RegisterClass RegList> {
+  def _asmoperand : AsmOperandClass {
+    let Name = PREFIX # LAYOUT # Count;
+    let RenderMethod = "addVectorListOperands";
+    let PredicateMethod = 
+        "isVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">";
+    let ParserMethod = "ParseVectorList";
+  }
+
+  def _operand : RegisterOperand<RegList,
+        "printVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">"> {
+    let ParserMatchClass =
+      !cast<AsmOperandClass>(PREFIX # LAYOUT # "_asmoperand");
+  }
+}
+
+multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList,
+                           RegisterClass QRegList> {
+  defm 8B : VectorList_operands<PREFIX, "8B", Count, DRegList>;
+  defm 4H : VectorList_operands<PREFIX, "4H", Count, DRegList>;
+  defm 2S : VectorList_operands<PREFIX, "2S", Count, DRegList>;
+  defm 1D : VectorList_operands<PREFIX, "1D", Count, DRegList>;
+  defm 16B : VectorList_operands<PREFIX, "16B", Count, QRegList>;
+  defm 8H : VectorList_operands<PREFIX, "8H", Count, QRegList>;
+  defm 4S : VectorList_operands<PREFIX, "4S", Count, QRegList>;
+  defm 2D : VectorList_operands<PREFIX, "2D", Count, QRegList>;
+}
+
+// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand
+defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>;
+defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>;
+defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>;
+defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
+\ No newline at end of file
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index d17b738..5c693c1 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -25,13 +25,31 @@
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void AArch64Subtarget::anchor() {}
+
 AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS)
-  : AArch64GenSubtargetInfo(TT, CPU, FS)
-  , HasNEON(true)
-  , HasCrypto(true)
-  , TargetTriple(TT) {
+    : AArch64GenSubtargetInfo(TT, CPU, FS), HasFPARMv8(false), HasNEON(false),
+      HasCrypto(false), TargetTriple(TT), CPUString(CPU) {
+
+  initializeSubtargetFeatures(CPU, FS);
+}
+
+void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU,
+                                                   StringRef FS) {
+  if (CPU.empty())
+    CPUString = "generic";
+
+  std::string FullFS = FS;
+  if (CPUString == "generic") {
+    // Enable FP by default.
+    if (FullFS.empty())
+      FullFS = "+fp-armv8";
+    else
+      FullFS = "+fp-armv8," + FullFS;
+  }
 
-  ParseSubtargetFeatures(CPU, FS);
+  ParseSubtargetFeatures(CPU, FullFS);
 }
 
 bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 2e9205f..bbfd3bc 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -27,18 +27,31 @@ class StringRef;
 class GlobalValue;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
+  virtual void anchor();
 protected:
+  bool HasFPARMv8;
   bool HasNEON;
   bool HasCrypto;
 
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
+
+  /// CPUString - String name of used CPU.
+  std::string CPUString;
+
+private:
+  void initializeSubtargetFeatures(StringRef CPU, StringRef FS);
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
   ///
   AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS);
 
+  virtual bool enableMachineScheduler() const {
+    return true;
+  }
+
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -46,8 +59,13 @@ public:
   bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
 
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+
+  bool hasFPARMv8() const { return HasFPARMv8; }
+  bool hasNEON() const { return HasNEON; }
+  bool hasCrypto() const { return HasCrypto; }
 
+  const std::string & getCPUString() const { return CPUString; }
 };
 } // End llvm namespace
 
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index df599d5..f1695e2 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -38,6 +38,7 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
     TLInfo(*this),
     TSInfo(*this),
     FrameLowering(Subtarget) {
+  initAsmInfo();
 }
 
 namespace {
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 69bb80a..fbbce11 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -54,8 +54,9 @@ public:
 #include "AArch64GenAsmMatcher.inc"
   };
 
-  AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+  AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+                   const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
     MCAsmParserExtension::Initialize(_Parser);
 
     // Initialize the set of available features.
@@ -126,6 +127,11 @@ public:
   OperandMatchResultTy
   ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
+  bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout,
+                      SMLoc &LayoutLoc);
+
+  OperandMatchResultTy ParseVectorList(SmallVectorImpl<MCParsedAsmOperand *> &);
+
   bool validateInstruction(MCInst &Inst,
                           const SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
@@ -153,6 +159,7 @@ private:
     k_Immediate,      // Including expressions referencing symbols
     k_Register,
     k_ShiftExtend,
+    k_VectorList,     // A sequential list of 1 to 4 registers.
     k_SysReg,         // The register operand of MRS and MSR instructions
     k_Token,          // The mnemonic; other raw tokens the auto-generated
     k_WrappedRegister // Load/store exclusive permit a wrapped register.
@@ -188,6 +195,13 @@ private:
     bool ImplicitAmount;
   };
 
+  // A vector register list is a sequential list of 1 to 4 registers.
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    A64Layout::VectorLayout Layout;
+  };
+
   struct SysRegOp {
     const char *Data;
     unsigned Length;
@@ -205,6 +219,7 @@ private:
     struct ImmOp Imm;
     struct RegOp Reg;
     struct ShiftExtendOp ShiftExtend;
+    struct VectorListOp VectorList;
     struct SysRegOp SysReg;
     struct TokOp Tok;
   };
@@ -454,7 +469,7 @@ public:
   }
 
   bool isMOVN32Imm() const {
-    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
       AArch64MCExpr::VK_AARCH64_SABS_G0,
       AArch64MCExpr::VK_AARCH64_SABS_G1,
       AArch64MCExpr::VK_AARCH64_DTPREL_G1,
@@ -463,13 +478,13 @@ public:
       AArch64MCExpr::VK_AARCH64_TPREL_G1,
       AArch64MCExpr::VK_AARCH64_TPREL_G0,
     };
-    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
 
     return isMoveWideImm(32, PermittedModifiers, NumModifiers);
   }
 
   bool isMOVN64Imm() const {
-    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
       AArch64MCExpr::VK_AARCH64_SABS_G0,
       AArch64MCExpr::VK_AARCH64_SABS_G1,
       AArch64MCExpr::VK_AARCH64_SABS_G2,
@@ -481,14 +496,14 @@ public:
       AArch64MCExpr::VK_AARCH64_TPREL_G1,
       AArch64MCExpr::VK_AARCH64_TPREL_G0,
     };
-    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
 
     return isMoveWideImm(64, PermittedModifiers, NumModifiers);
   }
 
 
   bool isMOVZ32Imm() const {
-    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
       AArch64MCExpr::VK_AARCH64_ABS_G0,
       AArch64MCExpr::VK_AARCH64_ABS_G1,
       AArch64MCExpr::VK_AARCH64_SABS_G0,
@@ -499,13 +514,13 @@ public:
       AArch64MCExpr::VK_AARCH64_TPREL_G1,
       AArch64MCExpr::VK_AARCH64_TPREL_G0,
     };
-    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
 
     return isMoveWideImm(32, PermittedModifiers, NumModifiers);
   }
 
   bool isMOVZ64Imm() const {
-    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
       AArch64MCExpr::VK_AARCH64_ABS_G0,
       AArch64MCExpr::VK_AARCH64_ABS_G1,
       AArch64MCExpr::VK_AARCH64_ABS_G2,
@@ -521,13 +536,13 @@ public:
       AArch64MCExpr::VK_AARCH64_TPREL_G1,
       AArch64MCExpr::VK_AARCH64_TPREL_G0,
     };
-    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
 
     return isMoveWideImm(64, PermittedModifiers, NumModifiers);
   }
 
   bool isMOVK32Imm() const {
-    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
       AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
       AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
       AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC,
@@ -536,13 +551,13 @@ public:
       AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
       AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
     };
-    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
 
     return isMoveWideImm(32, PermittedModifiers, NumModifiers);
   }
 
   bool isMOVK64Imm() const {
-    static AArch64MCExpr::VariantKind PermittedModifiers[] = {
+    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
       AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
       AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
       AArch64MCExpr::VK_AARCH64_ABS_G2_NC,
@@ -553,13 +568,13 @@ public:
       AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
       AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
     };
-    unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
 
     return isMoveWideImm(64, PermittedModifiers, NumModifiers);
   }
 
   bool isMoveWideImm(unsigned RegWidth,
-                     AArch64MCExpr::VariantKind *PermittedModifiers,
+                     const AArch64MCExpr::VariantKind *PermittedModifiers,
                      unsigned NumModifiers) const {
     if (!isImmWithLSL()) return false;
 
@@ -664,8 +679,86 @@ public:
     return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4;
   }
 
-  template<int MemSize>  bool isSImm7Scaled() const {
-    if (!isImm()) return false;
+  // if 0 < value <= w, return true
+  bool isShrFixedWidth(int w) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    return Value > 0 && Value <= w;
+  }
+
+  bool isShrImm8() const { return isShrFixedWidth(8); }
+
+  bool isShrImm16() const { return isShrFixedWidth(16); }
+
+  bool isShrImm32() const { return isShrFixedWidth(32); }
+
+  bool isShrImm64() const { return isShrFixedWidth(64); }
+
+  // if 0 <= value < w, return true
+  bool isShlFixedWidth(int w) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < w;
+  }
+
+  bool isShlImm8() const { return isShlFixedWidth(8); }
+
+  bool isShlImm16() const { return isShlFixedWidth(16); }
+
+  bool isShlImm32() const { return isShlFixedWidth(32); }
+
+  bool isShlImm64() const { return isShlFixedWidth(64); }
+
+  bool isNeonMovImmShiftLSL() const {
+    if (!isShiftOrExtend())
+      return false;
+
+    if (ShiftExtend.ShiftType != A64SE::LSL)
+      return false;
+
+    // Valid shift amount is 0, 8, 16 and 24.
+    return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24;
+  }
+
+  bool isNeonMovImmShiftLSLH() const {
+    if (!isShiftOrExtend())
+      return false;
+
+    if (ShiftExtend.ShiftType != A64SE::LSL)
+      return false;
+
+    // Valid shift amount is 0 and 8.
+    return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8;
+  }
+
+  bool isNeonMovImmShiftMSL() const {
+    if (!isShiftOrExtend())
+      return false;
+
+    if (ShiftExtend.ShiftType != A64SE::MSL)
+      return false;
+
+    // Valid shift amount is 8 and 16.
+    return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16;
+  }
+
+  template <A64Layout::VectorLayout Layout, unsigned Count>
+  bool isVectorList() const {
+    return Kind == k_VectorList && VectorList.Layout == Layout &&
+           VectorList.Count == Count;
+  }
+
+  template <int MemSize> bool isSImm7Scaled() const {
+    if (!isImm())
+      return false;
 
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
@@ -705,10 +798,38 @@ public:
     return isa<MCConstantExpr>(getImm());
   }
 
+  bool isNeonUImm64Mask() const {
+    if (!isImm())
+      return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE)
+      return false;
+
+    uint64_t Value = CE->getValue();
+
+    // i64 value with each byte being either 0x00 or 0xff.
+    for (unsigned i = 0; i < 8; ++i, Value >>= 8)
+      if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff)
+        return false;
+    return true;
+  }
+
+  // if value == N, return true
+  template<int N>
+  bool isExactImm() const {
+    if (!isImm()) return false;
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+
+    return CE->getValue() == N;
+  }
+
   static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
                                           unsigned ShiftAmount,
                                           bool ImplicitAmount,
-                                          SMLoc S, SMLoc E) {
+										  SMLoc S,SMLoc E) {
     AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E);
     Op->ImmWithLSL.Val = Val;
     Op->ImmWithLSL.ShiftAmount = ShiftAmount;
@@ -766,6 +887,18 @@ public:
     return Op;
   }
 
+  static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
+                                          A64Layout::VectorLayout Layout,
+                                          SMLoc S, SMLoc E) {
+    AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.Layout = Layout;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
     AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
     Op->Tok.Data = Str.data();
@@ -1026,6 +1159,40 @@ public:
     Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
   }
 
+  // For Vector Immediates shifted imm operands.
+  void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24)
+      llvm_unreachable("Invalid shift amount for vector immediate inst.");
+
+    // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3.
+    int64_t Imm = ShiftExtend.Amount / 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8)
+      llvm_unreachable("Invalid shift amount for vector immediate inst.");
+
+    // Encode LSLH shift amount 0, 8  as 0, 1.
+    int64_t Imm = ShiftExtend.Amount / 8;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16)
+      llvm_unreachable("Invalid shift amount for vector immediate inst.");
+
+    // Encode MSL shift amount 8, 16  as 0, 1.
+    int64_t Imm = ShiftExtend.Amount / 8 - 1;
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
   // For the extend in load-store (register offset) instructions.
   template<unsigned MemSize>
   void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const {
@@ -1065,6 +1232,25 @@ public:
 
     Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
   }
+
+  void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+
+    // A bit from each byte in the constant forms the encoded immediate
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+
+    unsigned Imm = 0;
+    for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
+      Imm |= (Value & 1) << i;
+    }
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
+
+  void addVectorListOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+  }
 };
 
 } // end anonymous namespace.
@@ -1104,7 +1290,6 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
       else
         return MatchOperand_Success;
     }
-
     // ... or it might be a symbolish thing
   }
     // Fall through
@@ -1148,7 +1333,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
     return ParseOperand(Operands, Mnemonic);
   }
   // The following will likely be useful later, but not in very early cases
-  case AsmToken::LCurly:  // Weird SIMD lists
+  case AsmToken::LCurly: // SIMD vector list is not parsed here
     llvm_unreachable("Don't know how to deal with '{' in operand");
     return MatchOperand_ParseFail;
   }
@@ -1306,7 +1491,7 @@ AArch64AsmParser::ParseImmWithLSLOperand(
 
   // The optional operand must be "lsl #N" where N is non-negative.
   if (Parser.getTok().is(AsmToken::Identifier)
-      && Parser.getTok().getIdentifier().lower() == "lsl") {
+      && Parser.getTok().getIdentifier().equals_lower("lsl")) {
     Parser.Lex();
 
     if (Parser.getTok().is(AsmToken::Hash)) {
@@ -1363,9 +1548,8 @@ AArch64AsmParser::ParseCRxOperand(
     return MatchOperand_ParseFail;
   }
 
-  std::string LowerTok = Parser.getTok().getIdentifier().lower();
-  StringRef Tok(LowerTok);
-  if (Tok[0] != 'c') {
+  StringRef Tok = Parser.getTok().getIdentifier();
+  if (Tok[0] != 'c' && Tok[0] != 'C') {
     Error(S, "Expected cN operand where 0 <= N <= 15");
     return MatchOperand_ParseFail;
   }
@@ -1437,22 +1621,11 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
   std::string LowerReg = Tok.getString().lower();
   size_t DotPos = LowerReg.find('.');
 
-  RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
-  if (RegNum == AArch64::NoRegister) {
-    RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
-      .Case("ip0", AArch64::X16)
-      .Case("ip1", AArch64::X17)
-      .Case("fp", AArch64::X29)
-      .Case("lr", AArch64::X30)
-      .Default(AArch64::NoRegister);
-  }
-  if (RegNum == AArch64::NoRegister)
-    return false;
-
+  bool IsVec128 = false;
   SMLoc S = Tok.getLoc();
   RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos);
 
-  if (DotPos == StringRef::npos) {
+  if (DotPos == std::string::npos) {
     Layout = StringRef();
   } else {
     // Everything afterwards needs to be a literal token, expected to be
@@ -1462,20 +1635,78 @@ AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
     // gives us a permanent string to use in the token (a pointer into LowerReg
     // would go out of scope when we return).
     LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1);
-    std::string LayoutText = LowerReg.substr(DotPos, StringRef::npos);
+    StringRef LayoutText = StringRef(LowerReg).substr(DotPos);
+
+    // See if it's a 128-bit layout first.
     Layout = StringSwitch<const char *>(LayoutText)
-      .Case(".d", ".d").Case(".1d", ".1d").Case(".2d", ".2d")
-      .Case(".s", ".s").Case(".2s", ".2s").Case(".4s", ".4s")
-      .Case(".h", ".h").Case(".4h", ".4h").Case(".8h", ".8h")
-      .Case(".b", ".b").Case(".8b", ".8b").Case(".16b", ".16b")
+      .Case(".q", ".q").Case(".1q", ".1q")
+      .Case(".d", ".d").Case(".2d", ".2d")
+      .Case(".s", ".s").Case(".4s", ".4s")
+      .Case(".h", ".h").Case(".8h", ".8h")
+      .Case(".b", ".b").Case(".16b", ".16b")
       .Default("");
 
+    if (Layout.size() != 0)
+      IsVec128 = true;
+    else {
+      Layout = StringSwitch<const char *>(LayoutText)
+                   .Case(".1d", ".1d")
+                   .Case(".2s", ".2s")
+                   .Case(".4h", ".4h")
+                   .Case(".8b", ".8b")
+                   .Default("");
+    }
+
     if (Layout.size() == 0) {
-      // Malformed register
+      // If we've still not pinned it down the register is malformed.
       return false;
     }
   }
 
+  RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
+  if (RegNum == AArch64::NoRegister) {
+    RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
+      .Case("ip0", AArch64::X16)
+      .Case("ip1", AArch64::X17)
+      .Case("fp", AArch64::X29)
+      .Case("lr", AArch64::X30)
+      .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0)
+      .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1)
+      .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2)
+      .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3)
+      .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4)
+      .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5)
+      .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6)
+      .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7)
+      .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8)
+      .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9)
+      .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10)
+      .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11)
+      .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12)
+      .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13)
+      .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14)
+      .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15)
+      .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16)
+      .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17)
+      .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18)
+      .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19)
+      .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20)
+      .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21)
+      .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22)
+      .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23)
+      .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24)
+      .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25)
+      .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26)
+      .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27)
+      .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28)
+      .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29)
+      .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30)
+      .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31)
+      .Default(AArch64::NoRegister);
+  }
+  if (RegNum == AArch64::NoRegister)
+    return false;
+
   return true;
 }
 
@@ -1507,6 +1738,7 @@ AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
       case 'h': NumLanes = 8; break;
       case 's': NumLanes = 4; break;
       case 'd': NumLanes = 2; break;
+      case 'q': NumLanes = 1; break;
       }
     }
 
@@ -1660,20 +1892,21 @@ AArch64AsmParser::ParseShiftExtend(
   std::string LowerID = IDVal.lower();
 
   A64SE::ShiftExtSpecifiers Spec =
-    StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
-      .Case("lsl", A64SE::LSL)
-      .Case("lsr", A64SE::LSR)
-      .Case("asr", A64SE::ASR)
-      .Case("ror", A64SE::ROR)
-      .Case("uxtb", A64SE::UXTB)
-      .Case("uxth", A64SE::UXTH)
-      .Case("uxtw", A64SE::UXTW)
-      .Case("uxtx", A64SE::UXTX)
-      .Case("sxtb", A64SE::SXTB)
-      .Case("sxth", A64SE::SXTH)
-      .Case("sxtw", A64SE::SXTW)
-      .Case("sxtx", A64SE::SXTX)
-      .Default(A64SE::Invalid);
+      StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
+        .Case("lsl", A64SE::LSL)
+	.Case("msl", A64SE::MSL)
+	.Case("lsr", A64SE::LSR)
+	.Case("asr", A64SE::ASR)
+	.Case("ror", A64SE::ROR)
+	.Case("uxtb", A64SE::UXTB)
+	.Case("uxth", A64SE::UXTH)
+	.Case("uxtw", A64SE::UXTW)
+	.Case("uxtx", A64SE::UXTX)
+	.Case("sxtb", A64SE::SXTB)
+	.Case("sxth", A64SE::SXTH)
+	.Case("sxtw", A64SE::SXTW)
+	.Case("sxtx", A64SE::SXTX)
+	.Default(A64SE::Invalid);
 
   if (Spec == A64SE::Invalid)
     return MatchOperand_NoMatch;
@@ -1683,8 +1916,8 @@ AArch64AsmParser::ParseShiftExtend(
   S = Parser.getTok().getLoc();
   Parser.Lex();
 
-  if (Spec != A64SE::LSL && Spec != A64SE::LSR &&
-      Spec != A64SE::ASR && Spec != A64SE::ROR) {
+  if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR &&
+      Spec != A64SE::ROR && Spec != A64SE::MSL) {
     // The shift amount can be omitted for the extending versions, but not real
     // shifts:
     //     add x0, x0, x0, uxtb
@@ -1724,6 +1957,148 @@ AArch64AsmParser::ParseShiftExtend(
   return MatchOperand_Success;
 }
 
+/// Try to parse a vector register token, If it is a vector register,
+/// the token is eaten and return true. Otherwise return false.
+bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
+                                      StringRef &Layout, SMLoc &LayoutLoc) {
+  bool IsVector = true;
+
+  if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
+    IsVector = false;
+  else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID]
+                .contains(RegNum) &&
+           !AArch64MCRegisterClasses[AArch64::FPR128RegClassID]
+                .contains(RegNum))
+    IsVector = false;
+  else if (Layout.size() == 0)
+    IsVector = false;
+
+  if (!IsVector)
+    Error(Parser.getTok().getLoc(), "expected vector type register");
+
+  Parser.Lex(); // Eat this token.
+  return IsVector;
+}
+
+
+// A vector list contains 1-4 consecutive registers.
+// Now there are two kinds of vector list when number of vector > 1:
+//   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
+//   (2) {Vn.layout - Vm.layout}
+// If the layout is like .b/.h/.s/.d, also parse the lane.
+AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  if (Parser.getTok().isNot(AsmToken::LCurly)) {
+    Error(Parser.getTok().getLoc(), "'{' expected");
+    return MatchOperand_ParseFail;
+  }
+  SMLoc SLoc = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '{' token.
+
+  unsigned Reg, Count = 1;
+  StringRef LayoutStr;
+  SMLoc RegEndLoc, LayoutLoc;
+  if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc))
+    return MatchOperand_ParseFail;
+
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the minus.
+
+    unsigned Reg2;
+    StringRef LayoutStr2;
+    SMLoc RegEndLoc2, LayoutLoc2;
+    SMLoc RegLoc2 = Parser.getTok().getLoc();
+
+    if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
+      return MatchOperand_ParseFail;
+    unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg);
+
+    if (LayoutStr != LayoutStr2) {
+      Error(LayoutLoc2, "expected the same vector layout");
+      return MatchOperand_ParseFail;
+    }
+    if (Space == 0 || Space > 3) {
+      Error(RegLoc2, "invalid number of vectors");
+      return MatchOperand_ParseFail;
+    }
+
+    Count += Space;
+  } else {
+    unsigned LastReg = Reg;
+    while (Parser.getTok().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma.
+      unsigned Reg2;
+      StringRef LayoutStr2;
+      SMLoc RegEndLoc2, LayoutLoc2;
+      SMLoc RegLoc2 = Parser.getTok().getLoc();
+
+      if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
+        return MatchOperand_ParseFail;
+      unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg)
+                                        : (Reg2 + 32 - LastReg);
+      Count++;
+
+      // The space between two vectors should be 1. And they should have the same layout.
+      // Total count shouldn't be great than 4
+      if (Space != 1) {
+        Error(RegLoc2, "invalid space between two vectors");
+        return MatchOperand_ParseFail;
+      }
+      if (LayoutStr != LayoutStr2) {
+        Error(LayoutLoc2, "expected the same vector layout");
+        return MatchOperand_ParseFail;
+      }
+      if (Count > 4) {
+        Error(RegLoc2, "invalid number of vectors");
+        return MatchOperand_ParseFail;
+      }
+
+      LastReg = Reg2;
+    }
+  }
+
+  if (Parser.getTok().isNot(AsmToken::RCurly)) {
+    Error(Parser.getTok().getLoc(), "'}' expected");
+    return MatchOperand_ParseFail;
+  }
+  SMLoc ELoc = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat '}' token.
+
+  A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
+  if (Count > 1) { // If count > 1, create vector list using super register.
+    bool IsVec64 = (Layout < A64Layout::VL_16B);
+    static unsigned SupRegIDs[3][2] = {
+      { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
+      { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
+      { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID }
+    };
+    unsigned SupRegID = SupRegIDs[Count - 2][static_cast<int>(IsVec64)];
+    unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+    Reg = MRI->getMatchingSuperReg(Reg, Sub0,
+                                   &AArch64MCRegisterClasses[SupRegID]);
+  }
+  Operands.push_back(
+      AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
+
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    uint32_t NumLanes = 0;
+    switch(Layout) {
+    case A64Layout::VL_B : NumLanes = 16; break;
+    case A64Layout::VL_H : NumLanes = 8; break;
+    case A64Layout::VL_S : NumLanes = 4; break;
+    case A64Layout::VL_D : NumLanes = 2; break;
+    default:
+      SMLoc Loc = getLexer().getLoc();
+      Error(Loc, "expected comma before next operand");
+      return MatchOperand_ParseFail;
+    }
+    return ParseNEONLane(Operands, NumLanes);
+  } else {
+    return MatchOperand_Success;
+  }
+}
+
 // FIXME: We would really like to be able to tablegen'erate this.
 bool AArch64AsmParser::
 validateInstruction(MCInst &Inst,
@@ -1918,7 +2293,7 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
       if (getParser().parseExpression(Value))
         return true;
 
-      getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/);
+      getParser().getStreamer().EmitValue(Value, Size);
 
       if (getLexer().is(AsmToken::EndOfStatement))
         break;
@@ -2019,7 +2394,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                  "expected compatible register or floating-point constant");
   case Match_FPZero:
     return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected floating-point constant #0.0");
+                 "expected floating-point constant #0.0 or invalid register type");
   case Match_Label:
     return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
                  "expected label or encodable integer pc offset");
@@ -2140,6 +2515,30 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_Width64:
     return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
                  "expected integer in range [<lsb>, 63]");
+  case Match_ShrImm8:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 8]");
+  case Match_ShrImm16:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 16]");
+  case Match_ShrImm32:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 32]");
+  case Match_ShrImm64:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [1, 64]");
+  case Match_ShlImm8:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 7]");
+  case Match_ShlImm16:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 15]");
+  case Match_ShlImm32:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 31]");
+  case Match_ShlImm64:
+    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
+                 "expected integer in range [0, 63]");
   }
 
   llvm_unreachable("Implement any new match types added!");
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 8164d6f..0f2e816 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -28,6 +28,8 @@ add_llvm_target(AArch64CodeGen
   AArch64TargetObjectFile.cpp
   )
 
+add_dependencies(LLVMAArch64CodeGen AArch64CommonTableGen)
+
 add_subdirectory(AsmParser)
 add_subdirectory(Disassembler)
 add_subdirectory(InstPrinter)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 12c1b8f..be4d7f2 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -38,7 +38,7 @@ typedef MCDisassembler::DecodeStatus DecodeStatus;
 namespace {
 /// AArch64 disassembler for all AArch64 platforms.
 class AArch64Disassembler : public MCDisassembler {
-  const MCRegisterInfo *RegInfo;
+  OwningPtr<const MCRegisterInfo> RegInfo;
 public:
   /// Initializes the disassembler.
   ///
@@ -46,8 +46,7 @@ public:
     : MCDisassembler(STI), RegInfo(Info) {
   }
 
-  ~AArch64Disassembler() {
-  }
+  ~AArch64Disassembler() {}
 
   /// See MCDisassembler.
   DecodeStatus getInstruction(MCInst &instr,
@@ -57,7 +56,7 @@ public:
                               raw_ostream &vStream,
                               raw_ostream &cStream) const;
 
-  const MCRegisterInfo *getRegInfo() const { return RegInfo; }
+  const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
 };
 
 }
@@ -83,12 +82,38 @@ static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                          uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                          uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
                                               unsigned RegNo, uint64_t Address,
                                               const void *Decoder);
-static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst,
-                                              unsigned RegNo, uint64_t Address,
-                                              const void *Decoder);
+static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst,
+                                                unsigned RegNo, uint64_t Address,
+                                                const void *Decoder);
+
+static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
+                                                  unsigned RegNo,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+
+static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
 
 static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
                                                unsigned OptionHiS,
@@ -111,6 +136,30 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
                                         uint64_t Address,
                                         const void *Decoder);
 
+static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder);
+
+static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder);
+static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
 template<int RegWidth>
 static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
                                              unsigned FullImm,
@@ -127,6 +176,10 @@ static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
                                            unsigned ShiftAmount,
                                            uint64_t Address,
                                            const void *Decoder);
+template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
+static DecodeStatus
+DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
+                             uint64_t Address, const void *Decoder);
 
 static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
                                             unsigned ShiftAmount,
@@ -177,6 +230,17 @@ static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
                                                    uint64_t Address,
                                                    const void *Decoder);
 
+static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+
+static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder);
 
 static bool Check(DecodeStatus &Out, DecodeStatus In);
 
@@ -208,7 +272,7 @@ DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
   uint8_t bytes[4];
 
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+  if (Region.readBytes(Address, 4, bytes) == -1) {
     Size = 0;
     return MCDisassembler::Fail;
   }
@@ -325,6 +389,14 @@ DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus
+DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder);
+}
 
 static DecodeStatus
 DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
@@ -338,16 +410,79 @@ DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
 }
 
 static DecodeStatus
-DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                         uint64_t Address, const void *Decoder) {
+DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                            uint64_t Address, const void *Decoder) {
+  if (RegNo > 15)
+    return MCDisassembler::Fail;
+
+  return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
+                                                  unsigned RegNo,
+                                                  uint64_t Address,
+                                                  const void *Decoder) {
+  if (RegNo > 30)
+    return MCDisassembler::Fail;
+
+  uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo,
+                                            unsigned RegID,
+                                            const void *Decoder) {
   if (RegNo > 31)
     return MCDisassembler::Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::VPR128RegClassID, RegNo);
+  uint16_t Register = getReg(Decoder, RegID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Register));
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID,
+                                 Decoder);
+}
+
+static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
+  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID,
+                                 Decoder);
+}
+
 static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
                                                unsigned OptionHiS,
                                                uint64_t Address,
@@ -396,7 +531,73 @@ static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(8 - Val));
+  return MCDisassembler::Success;
+}
 
+static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(16 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(32 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
+                                        uint64_t Address,
+                                        const void *Decoder) {
+  if (Val > 7)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  if (Val > 15)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  if (Val > 31)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  if (Val > 63)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
 
 template<int RegWidth>
 static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
@@ -553,11 +754,11 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
   unsigned IsToVec = fieldFromInstruction(Insn, 16, 1);
 
   if (IsToVec) {
-    DecodeVPR128RegisterClass(Inst, Rd, Address, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
     DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
   } else {
     DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
-    DecodeVPR128RegisterClass(Inst, Rn, Address, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
   }
 
   // Add the lane
@@ -800,4 +1001,572 @@ extern "C" void LLVMInitializeAArch64Disassembler() {
                                          createAArch64Disassembler);
 }
 
+template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
+static DecodeStatus
+DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
+                             uint64_t Address, const void *Decoder) {
+  bool IsLSL = false;
+  if (Ext == A64SE::LSL)
+    IsLSL = true;
+  else if (Ext != A64SE::MSL)
+    return MCDisassembler::Fail;
+
+  // MSL and LSLH accepts encoded shift amount 0 or 1.
+  if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1)
+    return MCDisassembler::Fail;
+
+  // LSL  accepts encoded shift amount 0, 1, 2 or 3.
+  if (IsLSL && ShiftAmount > 3)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
+  return MCDisassembler::Success;
+}
+
+// Decode post-index vector load/store instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal the the length of vector list in bytes,
+// or Rm is decoded to a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+  unsigned Opcode = fieldFromInstruction(Insn, 12, 4);
+  unsigned IsLoad = fieldFromInstruction(Insn, 22, 1);
+  // 0 for 64bit vector list, 1 for 128bit vector list
+  unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1);
+
+  unsigned NumVecs;
+  switch (Opcode) {
+  case 0: // ld4/st4
+  case 2: // ld1/st1 with 4 vectors
+    NumVecs = 4; break;
+  case 4: // ld3/st3
+  case 6: // ld1/st1 with 3 vectors
+    NumVecs = 3; break;
+  case 7: // ld1/st1 with 1 vector
+    NumVecs = 1; break;
+  case 8:  // ld2/st2
+  case 10: // ld1/st1 with 2 vectors
+    NumVecs = 2; break;
+  default:
+    llvm_unreachable("Invalid opcode for post-index load/store instructions");
+  }
+
+  // Decode vector list of 1/2/3/4 vectors for load instructions.
+  if (IsLoad) {
+    switch (NumVecs) {
+    case 1:
+      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    }
+  }
+
+  // Decode write back register, which is equal to Rn.
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+  if (Rm == 31) // If Rm is 0x11111, add the vector list length in byte
+    Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8)));
+  else // Decode Rm
+    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+  // Decode vector list of 1/2/3/4 vectors for load instructions.
+  if (!IsLoad) {
+    switch (NumVecs) {
+    case 1:
+      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
+                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    }
+  }
+
+  return MCDisassembler::Success;
+}
+
+// Decode post-index vector load/store lane instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal the the length of the changed bytes,
+// or Rm is decoded to a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder) {
+  bool Is64bitVec = false;
+  bool IsLoadDup = false;
+  bool IsLoad = false;
+  // The total number of bytes transferred.
+  // TransferBytes = NumVecs * OneLaneBytes
+  unsigned TransferBytes = 0;
+  unsigned NumVecs = 0;
+  unsigned Opc = Inst.getOpcode();
+  switch (Opc) {
+  case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+  case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+  case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+  case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+      TransferBytes = 1; break;
+    case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+      TransferBytes = 2; break;
+    case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+      TransferBytes = 4; break;
+    case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register:
+      TransferBytes = 8; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+  case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+  case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+  case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+      TransferBytes = 1; break;
+    case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+      TransferBytes = 2; break;
+    case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+      TransferBytes = 4; break;
+    case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register:
+      TransferBytes = 8; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+  case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+  case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+  case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+      TransferBytes = 2; break;
+    case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+      TransferBytes = 4; break;
+    case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+      TransferBytes = 8; break;
+    case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register:
+      TransferBytes = 16; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+  case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+  case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+  case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+      TransferBytes = 2; break;
+    case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+      TransferBytes = 4; break;
+    case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+      TransferBytes = 8; break;
+    case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register:
+      TransferBytes = 16; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+  case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+  case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+  case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
+      TransferBytes = 24; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+  case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register:
+  case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register:
+  case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register:
+      TransferBytes = 24; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+  case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+  case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+  case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: {
+    switch (Opc) {
+    case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register:
+      TransferBytes = 32; break;
+    }
+    Is64bitVec = true;
+    IsLoadDup = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+  case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register:
+  case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register:
+  case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: {
+    switch (Opc) {
+    case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register:
+      TransferBytes = 32; break;
+    }
+    IsLoadDup = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+  case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+  case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+  case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+      TransferBytes = 1; break;
+    case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+      TransferBytes = 2; break;
+    case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+      TransferBytes = 4; break;
+    case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register:
+      TransferBytes = 8; break;
+    }
+    IsLoad = true;
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+  case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+  case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+  case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+      TransferBytes = 2; break;
+    case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+      TransferBytes = 4; break;
+    case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+      TransferBytes = 8; break;
+    case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register:
+      TransferBytes = 16; break;
+    }
+    IsLoad = true;
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+  case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+  case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+  case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+      TransferBytes = 3; break;
+    case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+      TransferBytes = 6; break;
+    case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+      TransferBytes = 12; break;
+    case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register:
+      TransferBytes = 24; break;
+    }
+    IsLoad = true;
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+  case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+  case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+  case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+      TransferBytes = 4; break;
+    case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+      TransferBytes = 8; break;
+    case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+      TransferBytes = 16; break;
+    case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register:
+      TransferBytes = 32; break;
+    }
+    IsLoad = true;
+    NumVecs = 4;
+    break;
+  }
+
+  case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+  case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+  case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+  case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+      TransferBytes = 1; break;
+    case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+      TransferBytes = 2; break;
+    case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+      TransferBytes = 4; break;
+    case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register:
+      TransferBytes = 8; break;
+    }
+    NumVecs = 1;
+    break;
+  }
+
+  case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+  case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+  case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+  case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+      TransferBytes = 2; break;
+    case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+      TransferBytes = 4; break;
+    case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+      TransferBytes = 8; break;
+    case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register:
+      TransferBytes = 16; break;
+    }
+    NumVecs = 2;
+    break;
+  }
+
+  case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+  case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+  case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+  case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+      TransferBytes = 3; break;
+    case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+      TransferBytes = 6; break;
+    case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+      TransferBytes = 12; break;
+    case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register:
+      TransferBytes = 24; break;
+    }
+    NumVecs = 3;
+    break;
+  }
+
+  case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+  case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+  case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+  case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: {
+    switch (Opc) {
+    case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+      TransferBytes = 4; break;
+    case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+      TransferBytes = 8; break;
+    case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+      TransferBytes = 16; break;
+    case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register:
+      TransferBytes = 32; break;
+    }
+    NumVecs = 4;
+    break;
+  }
+
+  default:
+    return MCDisassembler::Fail;
+  } // End of switch (Opc)
+
+  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+
+  // Decode post-index of load duplicate lane
+  if (IsLoadDup) {
+    switch (NumVecs) {
+    case 1:
+      Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder)
+                 : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+    }
+
+    // Decode write back register, which is equal to Rn.
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+    if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
+      Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+    else // Decode Rm
+      DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+    return MCDisassembler::Success;
+  }
+
+  // Decode post-index of load/store lane
+  // Loads have a vector list as output.
+  if (IsLoad) {
+    switch (NumVecs) {
+    case 1:
+      DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 2:
+      DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 3:
+      DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+      break;
+    case 4:
+      DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+    }
+  }
+
+  // Decode write back register, which is equal to Rn.
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+  if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
+    Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+  else // Decode Rm
+    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+  // Decode the source vector list.
+  switch (NumVecs) {
+  case 1:
+    DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 2:
+    DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 3:
+    DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+    break;
+  case 4:
+    DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+  }
+
+  // Decode lane
+  unsigned Q = fieldFromInstruction(Insn, 30, 1);
+  unsigned S = fieldFromInstruction(Insn, 10, 3);
+  unsigned lane = 0;
+  // Calculate the number of lanes by number of vectors and transfered bytes.
+  // NumLanes = 16 bytes / bytes of each lane
+  unsigned NumLanes = 16 / (TransferBytes / NumVecs);
+  switch (NumLanes) {
+  case 16: // A vector has 16 lanes, each lane is 1 bytes.
+    lane = (Q << 3) | S;
+    break;
+  case 8:
+    lane = (Q << 2) | (S >> 1);
+    break;
+  case 4:
+    lane = (Q << 1) | (S >> 2);
+    break;
+  case 2:
+    lane = Q;
+    break;
+  }
+  Inst.addOperand(MCOperand::CreateImm(lane));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+  unsigned size = fieldFromInstruction(Insn, 22, 2);
+  unsigned Q = fieldFromInstruction(Insn, 30, 1);
+
+  DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
+
+  if(Q)
+    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+  else
+    DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder);
+
+  switch (size) {
+  case 0:
+    Inst.addOperand(MCOperand::CreateImm(8));
+    break;
+  case 1:
+    Inst.addOperand(MCOperand::CreateImm(16));
+    break;
+  case 2:
+    Inst.addOperand(MCOperand::CreateImm(32));
+    break;
+  default :
+    return MCDisassembler::Fail;
+  }
+  return MCDisassembler::Success;
+}
 
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 82ce80c..0438de3 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -368,6 +368,14 @@ AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum,
   O << "#" << (Imm * MemScale);
 }
 
+void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  unsigned Reg = MI->getOperand(OpNo).getReg();
+  std::string Name = getRegisterName(Reg);
+  Name[0] = 'v';
+  O << Name;
+}
+
 void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                       raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
@@ -406,3 +414,126 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
 
   printAnnotation(O, Annot);
 }
+
+template <A64SE::ShiftExtSpecifiers Ext, bool isHalf>
+void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI,
+                                                     unsigned OpNum,
+                                                     raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+
+  assert(MO.isImm() &&
+         "Immediate operand required for Neon vector immediate inst.");
+
+  bool IsLSL = false;
+  if (Ext == A64SE::LSL)
+    IsLSL = true;
+  else if (Ext != A64SE::MSL)
+    llvm_unreachable("Invalid shift specifier in movi instruction");
+
+  int64_t Imm = MO.getImm();
+
+  // MSL and LSLH accepts encoded shift amount 0 or 1.
+  if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1)
+    llvm_unreachable("Invalid shift amount in movi instruction");
+
+  // LSH accepts encoded shift amount 0, 1, 2 or 3.
+  if (IsLSL && (Imm < 0 || Imm > 3))
+    llvm_unreachable("Invalid shift amount in movi instruction");
+
+  // Print shift amount as multiple of 8 with MSL encoded shift amount
+  // 0 and 1 printed as 8 and 16.
+  if (!IsLSL)
+    Imm++;
+  Imm *= 8;
+
+  // LSL #0 is not printed
+  if (IsLSL) {
+    if (Imm == 0)
+      return;
+    O << ", lsl";
+  } else
+    O << ", msl";
+
+  O << " #" << Imm;
+}
+
+void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &o) {
+  o << "#0x0";
+}
+
+void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum,
+                                             raw_ostream &O) {
+  const MCOperand &MOUImm = MI->getOperand(OpNum);
+
+  assert(MOUImm.isImm() &&
+         "Immediate operand required for Neon vector immediate inst.");
+
+  unsigned Imm = MOUImm.getImm();
+
+  O << "#0x";
+  O.write_hex(Imm);
+}
+
+void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI,
+                                              unsigned OpNum,
+                                              raw_ostream &O) {
+  const MCOperand &MOUImm = MI->getOperand(OpNum);
+
+  assert(MOUImm.isImm()
+         && "Immediate operand required for Neon vector immediate inst.");
+
+  unsigned Imm = MOUImm.getImm();
+  O << Imm;
+}
+
+void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
+                                                    unsigned OpNum,
+                                                    raw_ostream &O) {
+  const MCOperand &MOUImm8 = MI->getOperand(OpNum);
+
+  assert(MOUImm8.isImm() &&
+         "Immediate operand required for Neon vector immediate bytemask inst.");
+
+  uint32_t UImm8 = MOUImm8.getImm();
+  uint64_t Mask = 0;
+
+  // Replicates 0x00 or 0xff byte in a 64-bit vector
+  for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+    if ((UImm8 >> ByteNum) & 1)
+      Mask |= (uint64_t)0xff << (8 * ByteNum);
+  }
+
+  O << "#0x";
+  O.write_hex(Mask);
+}
+
+// If Count > 1, there are two valid kinds of vector list:
+//   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
+//   (2) {Vn.layout - Vm.layout}
+// We choose the first kind as output.
+template <A64Layout::VectorLayout Layout, unsigned Count>
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors");
+
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  std::string LayoutStr = A64VectorLayoutToString(Layout);
+  O << "{";
+  if (Count > 1) { // Print sub registers separately
+    bool IsVec64 = (Layout < A64Layout::VL_16B);
+    unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
+    for (unsigned I = 0; I < Count; I++) {
+      std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
+      Name[0] = 'v';
+      O << Name << LayoutStr;
+      if (I != Count - 1)
+        O << ", ";
+    }
+  } else { // Print the register directly when NumVecs is 1.
+    std::string Name = getRegisterName(Reg);
+    Name[0] = 'v';
+    O << Name << LayoutStr;
+  }
+  O << "}";
+}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 639fa86..37b7273 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -157,6 +157,7 @@ public:
   void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
                              raw_ostream &O, A64SE::ShiftExtSpecifiers Ext);
 
+  void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
 
@@ -164,9 +165,18 @@ public:
     return RegNo == AArch64::XSP || RegNo == AArch64::WSP;
   }
 
-
+  template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
+  void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum,
+                                   raw_ostream &O);
+  void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum,
+                                  raw_ostream &O);
+
+  template <A64Layout::VectorLayout Layout, unsigned Count>
+  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 };
-
 }
 
 #endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index a3373b1..8a9077c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -578,8 +578,8 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
 }
 
 MCAsmBackend *
-llvm::createAArch64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+llvm::createAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                              StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
-
   return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS());
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 3b811df..a64c463 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -55,11 +55,10 @@ namespace {
 /// by MachO. Beware!
 class AArch64ELFStreamer : public MCELFStreamer {
 public:
-  AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                 raw_ostream &OS, MCCodeEmitter *Emitter)
-    : MCELFStreamer(Context, TAB, OS, Emitter),
-      MappingSymbolCounter(0), LastEMS(EMS_None) {
-  }
+  AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
+                     MCCodeEmitter *Emitter)
+      : MCELFStreamer(Context, 0, TAB, OS, Emitter), MappingSymbolCounter(0),
+        LastEMS(EMS_None) {}
 
   ~AArch64ELFStreamer() {}
 
@@ -85,18 +84,17 @@ public:
   /// This is one of the functions used to emit data into an ELF section, so the
   /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
   /// if necessary.
-  virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {
+  virtual void EmitBytes(StringRef Data) {
     EmitDataMappingSymbol();
-    MCELFStreamer::EmitBytes(Data, AddrSpace);
+    MCELFStreamer::EmitBytes(Data);
   }
 
   /// This is one of the functions used to emit data into an ELF section, so the
   /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
   /// if necessary.
-  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             unsigned AddrSpace) {
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
     EmitDataMappingSymbol();
-    MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace);
+    MCELFStreamer::EmitValueImpl(Value, Size);
   }
 
 private:
@@ -130,7 +128,7 @@ private:
     MCELF::SetType(SD, ELF::STT_NOTYPE);
     MCELF::SetBinding(SD, ELF::STB_LOCAL);
     SD.setExternal(false);
-    Symbol->setSection(*getCurrentSection().first);
+    AssignSection(Symbol, getCurrentSection().first);
 
     const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
     Symbol->setVariableValue(Value);
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 8ec8cbf..add874c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -31,11 +31,12 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo() {
 
   UseDataRegionDirectives = true;
 
-  WeakRefDirective = "\t.weak\t";
-
   HasLEB128 = true;
   SupportsDebugInformation = true;
 
   // Exceptions handling
   ExceptionsType = ExceptionHandling::DwarfCFI;
 }
+
+// Pin the vtable to this file.
+void AArch64ELFMCAsmInfo::anchor() {}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index a20bc47..d1dd285 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -14,13 +14,15 @@
 #ifndef LLVM_AARCH64TARGETASMINFO_H
 #define LLVM_AARCH64TARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
 
-  struct AArch64ELFMCAsmInfo : public MCAsmInfo {
-    explicit AArch64ELFMCAsmInfo();
-  };
+struct AArch64ELFMCAsmInfo : public MCAsmInfoELF {
+  explicit AArch64ELFMCAsmInfo();
+private:
+  virtual void anchor();
+};
 
 } // namespace llvm
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index a5c591e..b41c566 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -59,6 +59,23 @@ public:
   unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
                                    SmallVectorImpl<MCFixup> &Fixups) const;
 
+  unsigned getShiftRightImm8(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRightImm16(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRightImm32(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftRightImm64(const MCInst &MI, unsigned Op,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
+
+  unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op,
+                            SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
 
   // Labels are handled mostly the same way: a symbol is needed, and
   // just gets some fixup attached.
@@ -152,10 +169,10 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
   switch (Expr->getKind()) {
   default: llvm_unreachable("Unexpected operand modifier");
   case AArch64MCExpr::VK_AARCH64_LO12: {
-    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12,
-                                AArch64::fixup_a64_ldst16_lo12,
-                                AArch64::fixup_a64_ldst32_lo12,
-                                AArch64::fixup_a64_ldst64_lo12,
+    static const unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12,
+                                             AArch64::fixup_a64_ldst16_lo12,
+                                             AArch64::fixup_a64_ldst32_lo12,
+                                             AArch64::fixup_a64_ldst64_lo12,
                                 AArch64::fixup_a64_ldst128_lo12 };
     assert(MemSize <= 16 && "Invalid fixup for operation");
     FixupKind = FixupsBySize[Log2_32(MemSize)];
@@ -166,19 +183,23 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
     FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc;
     break;
   case AArch64MCExpr::VK_AARCH64_DTPREL_LO12:  {
-    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12,
-                                AArch64::fixup_a64_ldst16_dtprel_lo12,
-                                AArch64::fixup_a64_ldst32_dtprel_lo12,
-                                AArch64::fixup_a64_ldst64_dtprel_lo12 };
+    static const unsigned FixupsBySize[] = {
+      AArch64::fixup_a64_ldst8_dtprel_lo12,
+      AArch64::fixup_a64_ldst16_dtprel_lo12,
+      AArch64::fixup_a64_ldst32_dtprel_lo12,
+      AArch64::fixup_a64_ldst64_dtprel_lo12
+    };
     assert(MemSize <= 8 && "Invalid fixup for operation");
     FixupKind = FixupsBySize[Log2_32(MemSize)];
     break;
   }
   case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: {
-    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_dtprel_lo12_nc,
-                                AArch64::fixup_a64_ldst16_dtprel_lo12_nc,
-                                AArch64::fixup_a64_ldst32_dtprel_lo12_nc,
-                                AArch64::fixup_a64_ldst64_dtprel_lo12_nc };
+    static const unsigned FixupsBySize[] = {
+      AArch64::fixup_a64_ldst8_dtprel_lo12_nc,
+      AArch64::fixup_a64_ldst16_dtprel_lo12_nc,
+      AArch64::fixup_a64_ldst32_dtprel_lo12_nc,
+      AArch64::fixup_a64_ldst64_dtprel_lo12_nc
+    };
     assert(MemSize <= 8 && "Invalid fixup for operation");
     FixupKind = FixupsBySize[Log2_32(MemSize)];
     break;
@@ -188,19 +209,23 @@ getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
     FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc;
     break;
   case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{
-    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12,
-                                AArch64::fixup_a64_ldst16_tprel_lo12,
-                                AArch64::fixup_a64_ldst32_tprel_lo12,
-                                AArch64::fixup_a64_ldst64_tprel_lo12 };
+    static const unsigned FixupsBySize[] = {
+      AArch64::fixup_a64_ldst8_tprel_lo12,
+      AArch64::fixup_a64_ldst16_tprel_lo12,
+      AArch64::fixup_a64_ldst32_tprel_lo12,
+      AArch64::fixup_a64_ldst64_tprel_lo12
+    };
     assert(MemSize <= 8 && "Invalid fixup for operation");
     FixupKind = FixupsBySize[Log2_32(MemSize)];
     break;
   }
   case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: {
-    unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_tprel_lo12_nc,
-                                AArch64::fixup_a64_ldst16_tprel_lo12_nc,
-                                AArch64::fixup_a64_ldst32_tprel_lo12_nc,
-                                AArch64::fixup_a64_ldst64_tprel_lo12_nc };
+    static const unsigned FixupsBySize[] = {
+      AArch64::fixup_a64_ldst8_tprel_lo12_nc,
+      AArch64::fixup_a64_ldst16_tprel_lo12_nc,
+      AArch64::fixup_a64_ldst32_tprel_lo12_nc,
+      AArch64::fixup_a64_ldst64_tprel_lo12_nc
+    };
     assert(MemSize <= 8 && "Invalid fixup for operation");
     FixupKind = FixupsBySize[Log2_32(MemSize)];
     break;
@@ -302,6 +327,45 @@ AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
   return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6;
 }
 
+unsigned AArch64MCCodeEmitter::getShiftRightImm8(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 8 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm16(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 16 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm32(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 32 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftRightImm64(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return 64 - MI.getOperand(Op).getImm();
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm8(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 8;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm16(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 16;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm32(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 32;
+}
+
+unsigned AArch64MCCodeEmitter::getShiftLeftImm64(
+    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups) const {
+  return MI.getOperand(Op).getImm() - 64;
+}
 
 template<AArch64::Fixups fixupDesired> unsigned
 AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
@@ -346,7 +410,7 @@ AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI,
                                        const MCOperand &MO,
                                        SmallVectorImpl<MCFixup> &Fixups) const {
   if (MO.isReg()) {
-    return Ctx.getRegisterInfo().getEncodingValue(MO.getReg());
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
   } else if (MO.isImm()) {
     return static_cast<unsigned>(MO.getImm());
   }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 819eead..58fc95c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -40,7 +40,7 @@ MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT,
                                                           StringRef CPU,
                                                           StringRef FS) {
   MCSubtargetInfo *X = new MCSubtargetInfo();
-  InitAArch64MCSubtargetInfo(X, TT, CPU, "");
+  InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
   return X;
 }
 
@@ -57,13 +57,14 @@ static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
   return X;
 }
 
-static MCAsmInfo *createAArch64MCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
+                                         StringRef TT) {
   Triple TheTriple(TT);
 
   MCAsmInfo *MAI = new AArch64ELFMCAsmInfo();
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(AArch64::XSP, 0);
-  MAI->addInitialFrameState(0, Dst, Src);
+  unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
+  MAI->addInitialFrameState(Inst);
 
   return MAI;
 }
@@ -135,17 +136,17 @@ public:
     return MCInstrAnalysis::isConditionalBranch(Inst);
   }
 
-  uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
-                          uint64_t Size) const {
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                      uint64_t Size, uint64_t &Target) const {
     unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0;
     // FIXME: We only handle PCRel branches for now.
     if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType
         != MCOI::OPERAND_PCREL)
-      return -1ULL;
+      return false;
 
     int64_t Imm = Inst.getOperand(LblOperand).getImm();
-
-    return Addr + Imm;
+    Target = Addr + Imm;
+    return true;
   }
 };
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index 3849fe3..670e657 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -43,8 +43,9 @@ MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
 MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS,
                                              uint8_t OSABI);
 
-MCAsmBackend *createAArch64AsmBackend(const Target &T, StringRef TT,
-                                      StringRef CPU);
+MCAsmBackend *createAArch64AsmBackend(const Target &T,
+                                      const MCRegisterInfo &MRI,
+                                      StringRef TT, StringRef CPU);
 
 } // End llvm namespace
 
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index fc706a4..377b533 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -20,5 +20,5 @@ Target llvm::TheAArch64Target;
 
 extern "C" void LLVMInitializeAArch64TargetInfo() {
     RegisterTarget<Triple::aarch64, /*HasJIT=*/true>
-    X(TheAArch64Target, "aarch64", "AArch64");
+    X(TheAArch64Target, "aarch64", "AArch64 (ARM 64-bit target)");
 }
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index bedccb5..2a97cd6 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -972,7 +972,7 @@ bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) {
     // Now we have to work out the amount of rotation needed. The first part of
     // this calculation is actually independent of RepeatWidth, but the complex
     // case will depend on it.
-    Rotation = CountTrailingZeros_64(Imm);
+    Rotation = countTrailingZeros(Imm);
     if (Rotation == 0) {
       // There were no leading zeros, which means it's either in place or there
       // are 1s at each end (e.g. 0x8003 needs rotating).
@@ -1105,3 +1105,69 @@ bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value,
 
   return isMOVNImm(RegWidth, Value, UImm16, Shift);
 }
+
+// decodeNeonModShiftImm - Decode a Neon OpCmode value into the
+// the shift amount and the shift type (shift zeros or ones in) and
+// returns whether the OpCmode value implies a shift operation.
+bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
+                                    unsigned &ShiftOnesIn) {
+  ShiftImm = 0;
+  ShiftOnesIn = false;
+  bool HasShift = true;
+
+  if (OpCmode == 0xe) {
+    // movi byte
+    HasShift = false;
+  } else if (OpCmode == 0x1e) {
+    // movi 64-bit bytemask
+    HasShift = false;
+  } else if ((OpCmode & 0xc) == 0x8) {
+    // shift zeros, per halfword
+    ShiftImm = ((OpCmode & 0x2) >> 1);
+  } else if ((OpCmode & 0x8) == 0) {
+    // shift zeros, per word
+    ShiftImm = ((OpCmode & 0x6) >> 1);
+  } else if ((OpCmode & 0xe) == 0xc) {
+    // shift ones, per word
+    ShiftOnesIn = true;
+    ShiftImm = (OpCmode & 0x1);
+  } else {
+    // per byte, per bytemask
+    llvm_unreachable("Unsupported Neon modified immediate");
+  }
+
+  return HasShift;
+}
+
+// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values
+// into the element value and the element size in bits.
+uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode,
+                                   unsigned &EltBits) {
+  uint64_t DecodedVal = Val;
+  EltBits = 0;
+
+  if (OpCmode == 0xe) {
+    // movi byte
+    EltBits = 8;
+  } else if (OpCmode == 0x1e) {
+    // movi 64-bit bytemask
+    DecodedVal = 0;
+    for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      if ((Val >> ByteNum) & 1)
+        DecodedVal |= (uint64_t)0xff << (8 * ByteNum);
+    }
+    EltBits = 64;
+  } else if ((OpCmode & 0xc) == 0x8) {
+    // shift zeros, per halfword
+    EltBits = 16;
+  } else if ((OpCmode & 0x8) == 0) {
+    // shift zeros, per word
+    EltBits = 32;
+  } else if ((OpCmode & 0xe) == 0xc) {
+    // shift ones, per word
+    EltBits = 32;
+  } else {
+    llvm_unreachable("Unsupported Neon modified immediate");
+  }
+  return DecodedVal;
+}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9a1ca61..ce970b0 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -289,6 +289,7 @@ namespace A64SE {
     enum ShiftExtSpecifiers {
         Invalid = -1,
         LSL,
+        MSL,
         LSR,
         ASR,
         ROR,
@@ -305,6 +306,65 @@ namespace A64SE {
     };
 }
 
+namespace A64Layout {
+    enum VectorLayout {
+        Invalid = -1,
+        VL_8B,
+        VL_4H,
+        VL_2S,
+        VL_1D,
+
+        VL_16B,
+        VL_8H,
+        VL_4S,
+        VL_2D,
+
+        // Bare layout for the 128-bit vector
+        // (only show ".b", ".h", ".s", ".d" without vector number)
+        VL_B,
+        VL_H,
+        VL_S,
+        VL_D
+    };
+}
+
+inline static const char *
+A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
+  switch (Layout) {
+  case A64Layout::VL_8B:  return ".8b";
+  case A64Layout::VL_4H:  return ".4h";
+  case A64Layout::VL_2S:  return ".2s";
+  case A64Layout::VL_1D:  return ".1d";
+  case A64Layout::VL_16B:  return ".16b";
+  case A64Layout::VL_8H:  return ".8h";
+  case A64Layout::VL_4S:  return ".4s";
+  case A64Layout::VL_2D:  return ".2d";
+  case A64Layout::VL_B:  return ".b";
+  case A64Layout::VL_H:  return ".h";
+  case A64Layout::VL_S:  return ".s";
+  case A64Layout::VL_D:  return ".d";
+  default: llvm_unreachable("Unknown Vector Layout");
+  }
+}
+
+inline static A64Layout::VectorLayout
+A64StringToVectorLayout(StringRef LayoutStr) {
+  return StringSwitch<A64Layout::VectorLayout>(LayoutStr)
+             .Case(".8b", A64Layout::VL_8B)
+             .Case(".4h", A64Layout::VL_4H)
+             .Case(".2s", A64Layout::VL_2S)
+             .Case(".1d", A64Layout::VL_1D)
+             .Case(".16b", A64Layout::VL_16B)
+             .Case(".8h", A64Layout::VL_8H)
+             .Case(".4s", A64Layout::VL_4S)
+             .Case(".2d", A64Layout::VL_2D)
+             .Case(".b", A64Layout::VL_B)
+             .Case(".h", A64Layout::VL_H)
+             .Case(".s", A64Layout::VL_S)
+             .Case(".d", A64Layout::VL_D)
+             .Default(A64Layout::Invalid);
+}
+
 namespace A64SysReg {
   enum SysRegROValues {
     MDCCSR_EL0        = 0x9808, // 10  011  0000  0001  000
@@ -1068,7 +1128,10 @@ namespace A64Imms {
   // MOVN but *not* with a MOVZ (because that would take priority).
   bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
 
-}
+  uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits);
+  bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
+                             unsigned &ShiftOnesIn);
+  }
 
 } // end namespace llvm;
 
diff --git a/lib/Target/AArch64/Utils/CMakeLists.txt b/lib/Target/AArch64/Utils/CMakeLists.txt
index 2c28348..2348e44 100644
--- a/lib/Target/AArch64/Utils/CMakeLists.txt
+++ b/lib/Target/AArch64/Utils/CMakeLists.txt
@@ -3,3 +3,5 @@ include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/
 add_llvm_library(LLVMAArch64Utils
   AArch64BaseInfo.cpp
   )
+
+add_dependencies(LLVMAArch64Utils AArch64CommonTableGen)
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index f0d4dbe..ff585b4 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -615,7 +615,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
   SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
   bool Modified = false;
 
-  for (SmallVector<unsigned, 8>::iterator I = Defs.begin(), E = Defs.end();
+  for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
      I != E; ++I) {
     // Follow the def-use chain for this DPR through COPYs, and also through
     // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
@@ -630,7 +630,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
 
     elideCopiesAndPHIs(Def, DefSrcs);
 
-    for (SmallVector<MachineInstr*, 8>::iterator II = DefSrcs.begin(),
+    for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(),
       EE = DefSrcs.end(); II != EE; ++II) {
       MachineInstr *MI = *II;
 
@@ -655,8 +655,15 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
 
       if (NewReg != 0) {
         Modified = true;
-        for (SmallVector<MachineOperand*, 8>::const_iterator I = Uses.begin(),
+        for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
                E = Uses.end(); I != E; ++I) {
+          // Make sure to constrain the register class of the new register to
+          // match what we're replacing. Otherwise we can optimize a DPR_VFP2
+          // reference into a plain DPR, and that will end poorly. NewReg is
+          // always virtual here, so there will always be a matching subclass
+          // to find.
+          MRI->constrainRegClass(NewReg, MRI->getRegClass((*I)->getReg()));
+
           DEBUG(dbgs() << "Replacing operand "
                        << **I << " with "
                        << PrintReg(NewReg) << "\n");
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 2d747091..36e5680 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -38,12 +38,16 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
 def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true",
                                      "Enable Thumb2 instructions">;
 def FeatureNoARM  : SubtargetFeature<"noarm", "NoARM", "true",
-                                     "Does not support ARM mode execution">;
+                                     "Does not support ARM mode execution",
+                                     [ModeThumb]>;
 def FeatureFP16   : SubtargetFeature<"fp16", "HasFP16", "true",
                                      "Enable half-precision floating point">;
 def FeatureVFP4   : SubtargetFeature<"vfp4", "HasVFPv4", "true",
                                      "Enable VFP4 instructions",
                                      [FeatureVFP3, FeatureFP16]>;
+def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8",
+                                   "true", "Enable ARMv8 FP",
+                                   [FeatureVFP4]>;
 def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
                                      "Restrict VFP3 to 16 double registers">;
 def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
@@ -59,8 +63,15 @@ def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true",
                                          "FP compare + branch is slow">;
 def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
                           "Floating point unit supports single precision only">;
+def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true",
+                           "Enable support for Performance Monitor extensions">;
 def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true",
                           "Enable support for TrustZone security extensions">;
+def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
+                          "Enable support for Cryptography extensions",
+                          [FeatureNEON]>;
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+                          "Enable support for CRC instructions">;
 
 // Some processors have FP multiply-accumulate instructions that don't
 // play nicely with other VFP / NEON instructions, and it's generally better
@@ -108,10 +119,24 @@ def FeatureDSPThumb2 : SubtargetFeature<"t2dsp", "Thumb2DSP", "true",
 def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true",
                                  "Supports Multiprocessing extension">;
 
-// M-series ISA?
-def FeatureMClass : SubtargetFeature<"mclass", "IsMClass", "true",
+// Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8).
+def FeatureVirtualization : SubtargetFeature<"virtualization",
+                                 "HasVirtualization", "true",
+                                 "Supports Virtualization extension",
+                                 [FeatureHWDiv, FeatureHWDivARM]>;
+
+// M-series ISA
+def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass",
                                      "Is microcontroller profile ('M' series)">;
 
+// R-series ISA
+def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass",
+                                     "Is realtime profile ('R' series)">;
+
+// A-series ISA
+def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass",
+                                     "Is application profile ('A' series)">;
+
 // Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too.
 // See ARMInstrInfo.td for details.
 def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true",
@@ -129,12 +154,19 @@ def HasV5TEOps  : SubtargetFeature<"v5te", "HasV5TEOps", "true",
 def HasV6Ops    : SubtargetFeature<"v6", "HasV6Ops", "true",
                                    "Support ARM v6 instructions",
                                    [HasV5TEOps]>;
+def HasV6MOps   : SubtargetFeature<"v6m", "HasV6MOps", "true",
+                                   "Support ARM v6M instructions",
+                                   [HasV6Ops]>;
 def HasV6T2Ops  : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
                                    "Support ARM v6t2 instructions",
-                                   [HasV6Ops, FeatureThumb2]>;
+                                   [HasV6MOps, FeatureThumb2]>;
 def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
                                    "Support ARM v7 instructions",
-                                   [HasV6T2Ops]>;
+                                   [HasV6T2Ops, FeaturePerfMon]>;
+def HasV8Ops    : SubtargetFeature<"v8", "HasV8Ops", "true",
+                                   "Support ARM v8 instructions",
+                                   [HasV7Ops, FeatureVirtualization,
+                                    FeatureMP]>;
 
 //===----------------------------------------------------------------------===//
 // ARM Processors supported.
@@ -170,12 +202,27 @@ def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
 // FIXME: It has not been determined if A15 has these features.
 def ProcA15      : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
                                    "Cortex-A15 ARM processors",
-                                   [FeatureT2XtPk, FeatureFP16,
+                                   [FeatureT2XtPk, FeatureVFP4,
+                                    FeatureMP, FeatureHWDiv, FeatureHWDivARM,
                                     FeatureAvoidPartialCPSR,
-                                    FeatureTrustZone]>;
+                                    FeatureTrustZone, FeatureVirtualization]>;
+
+def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
+                                   "Cortex-A53 ARM processors",
+                                   [FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureTrustZone, FeatureT2XtPk,
+                                    FeatureCrypto, FeatureCRC]>;
+
+def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
+                                   "Cortex-A57 ARM processors",
+                                   [FeatureHWDiv, FeatureHWDivARM,
+                                    FeatureTrustZone, FeatureT2XtPk,
+                                    FeatureCrypto, FeatureCRC]>;
+
 def ProcR5      : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
                                    "Cortex-R5 ARM processors",
-                                   [FeatureSlowFPBrcc, FeatureHWDivARM,
+                                   [FeatureSlowFPBrcc,
+                                    FeatureHWDiv, FeatureHWDivARM,
                                     FeatureHasSlowFPVMLx,
                                     FeatureAvoidPartialCPSR,
                                     FeatureT2XtPk]>;
@@ -233,7 +280,7 @@ def : Processor<"mpcore",           ARMV6Itineraries, [HasV6Ops, FeatureVFP2,
                                                        FeatureHasSlowFPVMLx]>;
 
 // V6M Processors.
-def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6Ops, FeatureNoARM,
+def : Processor<"cortex-m0",        ARMV6Itineraries, [HasV6MOps, FeatureNoARM,
                                                        FeatureDB, FeatureMClass]>;
 
 // V6T2 Processors.
@@ -248,26 +295,30 @@ def : Processor<"arm1156t2f-s",     ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2,
 def : ProcessorModel<"cortex-a5",   CortexA8Model,
                                     [ProcA5, HasV7Ops, FeatureNEON, FeatureDB,
                                      FeatureVFP4, FeatureDSPThumb2,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureAClass]>;
 def : ProcessorModel<"cortex-a8",   CortexA8Model,
                                     [ProcA8, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureHasRAS]>;
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
 def : ProcessorModel<"cortex-a9",   CortexA9Model,
                                     [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureHasRAS]>;
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
 def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
                                     [ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
                                      FeatureDSPThumb2, FeatureMP,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureAClass]>;
 // FIXME: A15 has currently the same ProcessorModel as A9.
 def : ProcessorModel<"cortex-a15",   CortexA9Model,
                                     [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
-                                     FeatureDSPThumb2, FeatureHasRAS]>;
+                                     FeatureDSPThumb2, FeatureHasRAS,
+                                     FeatureAClass]>;
 // FIXME: R5 has currently the same ProcessorModel as A8.
 def : ProcessorModel<"cortex-r5",   CortexA8Model,
                                     [ProcR5, HasV7Ops, FeatureDB,
                                      FeatureVFP3, FeatureDSPThumb2,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureVFPOnlySP,
+                                     FeatureD16, FeatureRClass]>;
 
 // V7M Processors.
 def : ProcNoItin<"cortex-m3",       [HasV7Ops,
@@ -279,13 +330,22 @@ def : ProcNoItin<"cortex-m4",       [HasV7Ops,
                                      FeatureThumb2, FeatureNoARM, FeatureDB,
                                      FeatureHWDiv, FeatureDSPThumb2,
                                      FeatureT2XtPk, FeatureVFP4,
-                                     FeatureVFPOnlySP, FeatureMClass]>;
+                                     FeatureVFPOnlySP, FeatureD16,
+                                     FeatureMClass]>;
 
 // Swift uArch Processors.
 def : ProcessorModel<"swift",       SwiftModel,
                                     [ProcSwift, HasV7Ops, FeatureNEON,
                                      FeatureDB, FeatureDSPThumb2,
-                                     FeatureHasRAS]>;
+                                     FeatureHasRAS, FeatureAClass]>;
+
+// V8 Processors
+def : ProcNoItin<"cortex-a53",      [ProcA53, HasV8Ops, FeatureAClass,
+                                    FeatureDB, FeatureFPARMv8,
+                                    FeatureNEON, FeatureDSPThumb2]>;
+def : ProcNoItin<"cortex-a57",      [ProcA57, HasV8Ops, FeatureAClass,
+                                    FeatureDB, FeatureFPARMv8,
+                                    FeatureNEON, FeatureDSPThumb2]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 13ec208..e79f88d 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "ARM.h"
 #include "ARMBuildAttrs.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMFPUName.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMTargetMachine.h"
 #include "ARMTargetObjectFile.h"
@@ -55,235 +56,67 @@
 #include <cctype>
 using namespace llvm;
 
-namespace {
-
-  // Per section and per symbol attributes are not supported.
-  // To implement them we would need the ability to delay this emission
-  // until the assembly file is fully parsed/generated as only then do we
-  // know the symbol and section numbers.
-  class AttributeEmitter {
-  public:
-    virtual void MaybeSwitchVendor(StringRef Vendor) = 0;
-    virtual void EmitAttribute(unsigned Attribute, unsigned Value) = 0;
-    virtual void EmitTextAttribute(unsigned Attribute, StringRef String) = 0;
-    virtual void Finish() = 0;
-    virtual ~AttributeEmitter() {}
-  };
-
-  class AsmAttributeEmitter : public AttributeEmitter {
-    MCStreamer &Streamer;
-
-  public:
-    AsmAttributeEmitter(MCStreamer &Streamer_) : Streamer(Streamer_) {}
-    void MaybeSwitchVendor(StringRef Vendor) { }
-
-    void EmitAttribute(unsigned Attribute, unsigned Value) {
-      Streamer.EmitRawText("\t.eabi_attribute " +
-                           Twine(Attribute) + ", " + Twine(Value));
-    }
-
-    void EmitTextAttribute(unsigned Attribute, StringRef String) {
-      switch (Attribute) {
-      default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
-      case ARMBuildAttrs::CPU_name:
-        Streamer.EmitRawText(StringRef("\t.cpu ") + String.lower());
-        break;
-      /* GAS requires .fpu to be emitted regardless of EABI attribute */
-      case ARMBuildAttrs::Advanced_SIMD_arch:
-      case ARMBuildAttrs::VFP_arch:
-        Streamer.EmitRawText(StringRef("\t.fpu ") + String.lower());
-        break;
-      }
-    }
-    void Finish() { }
-  };
-
-  class ObjectAttributeEmitter : public AttributeEmitter {
-    // This structure holds all attributes, accounting for
-    // their string/numeric value, so we can later emmit them
-    // in declaration order, keeping all in the same vector
-    struct AttributeItemType {
-      enum {
-        HiddenAttribute = 0,
-        NumericAttribute,
-        TextAttribute
-      } Type;
-      unsigned Tag;
-      unsigned IntValue;
-      StringRef StringValue;
-    } AttributeItem;
-
-    MCObjectStreamer &Streamer;
-    StringRef CurrentVendor;
-    SmallVector<AttributeItemType, 64> Contents;
-
-    // Account for the ULEB/String size of each item,
-    // not just the number of items
-    size_t ContentsSize;
-    // FIXME: this should be in a more generic place, but
-    // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
-    size_t getULEBSize(int Value) {
-      size_t Size = 0;
-      do {
-        Value >>= 7;
-        Size += sizeof(int8_t); // Is this really necessary?
-      } while (Value);
-      return Size;
-    }
-
-  public:
-    ObjectAttributeEmitter(MCObjectStreamer &Streamer_) :
-      Streamer(Streamer_), CurrentVendor(""), ContentsSize(0) { }
-
-    void MaybeSwitchVendor(StringRef Vendor) {
-      assert(!Vendor.empty() && "Vendor cannot be empty.");
-
-      if (CurrentVendor.empty())
-        CurrentVendor = Vendor;
-      else if (CurrentVendor == Vendor)
-        return;
-      else
-        Finish();
-
-      CurrentVendor = Vendor;
-
-      assert(Contents.size() == 0);
-    }
-
-    void EmitAttribute(unsigned Attribute, unsigned Value) {
-      AttributeItemType attr = {
-        AttributeItemType::NumericAttribute,
-        Attribute,
-        Value,
-        StringRef("")
-      };
-      ContentsSize += getULEBSize(Attribute);
-      ContentsSize += getULEBSize(Value);
-      Contents.push_back(attr);
-    }
-
-    void EmitTextAttribute(unsigned Attribute, StringRef String) {
-      AttributeItemType attr = {
-        AttributeItemType::TextAttribute,
-        Attribute,
-        0,
-        String
-      };
-      ContentsSize += getULEBSize(Attribute);
-      // String + \0
-      ContentsSize += String.size()+1;
-
-      Contents.push_back(attr);
-    }
-
-    void Finish() {
-      // Vendor size + Vendor name + '\0'
-      const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
-
-      // Tag + Tag Size
-      const size_t TagHeaderSize = 1 + 4;
-
-      Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
-      Streamer.EmitBytes(CurrentVendor);
-      Streamer.EmitIntValue(0, 1); // '\0'
-
-      Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
-      Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
-
-      // Size should have been accounted for already, now
-      // emit each field as its type (ULEB or String)
-      for (unsigned int i=0; i<Contents.size(); ++i) {
-        AttributeItemType item = Contents[i];
-        Streamer.EmitULEB128IntValue(item.Tag);
-        switch (item.Type) {
-        default: llvm_unreachable("Invalid attribute type");
-        case AttributeItemType::NumericAttribute:
-          Streamer.EmitULEB128IntValue(item.IntValue);
-          break;
-        case AttributeItemType::TextAttribute:
-          Streamer.EmitBytes(item.StringValue.upper());
-          Streamer.EmitIntValue(0, 1); // '\0'
-          break;
-        }
-      }
-
-      Contents.clear();
-    }
-  };
-
-} // end of anonymous namespace
-
-MachineLocation ARMAsmPrinter::
-getDebugValueLocation(const MachineInstr *MI) const {
-  MachineLocation Location;
-  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-  // Frame address.  Currently handles register +- offset only.
-  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
-    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
-  else {
-    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
-  }
-  return Location;
-}
-
 /// EmitDwarfRegOp - Emit dwarf register operation.
-void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc,
+                                   bool Indirect) const {
   const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1)
-    AsmPrinter::EmitDwarfRegOp(MLoc);
-  else {
-    unsigned Reg = MLoc.getReg();
-    if (Reg >= ARM::S0 && Reg <= ARM::S31) {
-      assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
-      // S registers are described as bit-pieces of a register
-      // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
-      // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
-
-      unsigned SReg = Reg - ARM::S0;
-      bool odd = SReg & 0x1;
-      unsigned Rx = 256 + (SReg >> 1);
-
-      OutStreamer.AddComment("DW_OP_regx for S register");
-      EmitInt8(dwarf::DW_OP_regx);
-
-      OutStreamer.AddComment(Twine(SReg));
-      EmitULEB128(Rx);
-
-      if (odd) {
-        OutStreamer.AddComment("DW_OP_bit_piece 32 32");
-        EmitInt8(dwarf::DW_OP_bit_piece);
-        EmitULEB128(32);
-        EmitULEB128(32);
-      } else {
-        OutStreamer.AddComment("DW_OP_bit_piece 32 0");
-        EmitInt8(dwarf::DW_OP_bit_piece);
-        EmitULEB128(32);
-        EmitULEB128(0);
-      }
-    } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
-      assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
-      // Q registers Q0-Q15 are described by composing two D registers together.
-      // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
-      // DW_OP_piece(8)
-
-      unsigned QReg = Reg - ARM::Q0;
-      unsigned D1 = 256 + 2 * QReg;
-      unsigned D2 = D1 + 1;
-
-      OutStreamer.AddComment("DW_OP_regx for Q register: D1");
-      EmitInt8(dwarf::DW_OP_regx);
-      EmitULEB128(D1);
-      OutStreamer.AddComment("DW_OP_piece 8");
-      EmitInt8(dwarf::DW_OP_piece);
-      EmitULEB128(8);
-
-      OutStreamer.AddComment("DW_OP_regx for Q register: D2");
-      EmitInt8(dwarf::DW_OP_regx);
-      EmitULEB128(D2);
-      OutStreamer.AddComment("DW_OP_piece 8");
-      EmitInt8(dwarf::DW_OP_piece);
-      EmitULEB128(8);
+  if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) {
+    AsmPrinter::EmitDwarfRegOp(MLoc, Indirect);
+    return;
+  }
+  assert(MLoc.isReg() && !Indirect &&
+         "This doesn't support offset/indirection - implement it if needed");
+  unsigned Reg = MLoc.getReg();
+  if (Reg >= ARM::S0 && Reg <= ARM::S31) {
+    assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
+    // S registers are described as bit-pieces of a register
+    // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
+    // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
+
+    unsigned SReg = Reg - ARM::S0;
+    bool odd = SReg & 0x1;
+    unsigned Rx = 256 + (SReg >> 1);
+
+    OutStreamer.AddComment("DW_OP_regx for S register");
+    EmitInt8(dwarf::DW_OP_regx);
+
+    OutStreamer.AddComment(Twine(SReg));
+    EmitULEB128(Rx);
+
+    if (odd) {
+      OutStreamer.AddComment("DW_OP_bit_piece 32 32");
+      EmitInt8(dwarf::DW_OP_bit_piece);
+      EmitULEB128(32);
+      EmitULEB128(32);
+    } else {
+      OutStreamer.AddComment("DW_OP_bit_piece 32 0");
+      EmitInt8(dwarf::DW_OP_bit_piece);
+      EmitULEB128(32);
+      EmitULEB128(0);
     }
+  } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
+    assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
+    // Q registers Q0-Q15 are described by composing two D registers together.
+    // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
+    // DW_OP_piece(8)
+
+    unsigned QReg = Reg - ARM::Q0;
+    unsigned D1 = 256 + 2 * QReg;
+    unsigned D2 = D1 + 1;
+
+    OutStreamer.AddComment("DW_OP_regx for Q register: D1");
+    EmitInt8(dwarf::DW_OP_regx);
+    EmitULEB128(D1);
+    OutStreamer.AddComment("DW_OP_piece 8");
+    EmitInt8(dwarf::DW_OP_piece);
+    EmitULEB128(8);
+
+    OutStreamer.AddComment("DW_OP_regx for Q register: D2");
+    EmitInt8(dwarf::DW_OP_regx);
+    EmitULEB128(D2);
+    OutStreamer.AddComment("DW_OP_piece 8");
+    EmitInt8(dwarf::DW_OP_piece);
+    EmitULEB128(8);
   }
 }
 
@@ -312,7 +145,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
   const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
   assert(GV && "C++ constructor pointer was not a GlobalValue!");
 
-  const MCExpr *E = MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+  const MCExpr *E = MCSymbolRefExpr::Create(getSymbol(GV),
                                             (Subtarget->isTargetDarwin()
                                              ? MCSymbolRefExpr::VK_None
                                              : MCSymbolRefExpr::VK_ARM_TARGET1),
@@ -373,7 +206,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
              (TF & ARMII::MO_HI16))
       O << ":upper16:";
-    O << *Mang->getSymbol(GV);
+    O << *getSymbol(GV);
 
     printOffset(MO.getOffset(), O);
     if (TF == ARMII::MO_PLT)
@@ -474,8 +307,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       // This takes advantage of the 2 operand-ness of ldm/stm and that we've
       // already got the operands in registers that are operands to the
       // inline asm statement.
-
-      O << "{" << ARMInstPrinter::getRegisterName(RegBegin);
+      O << "{";
+      if (ARM::GPRPairRegClass.contains(RegBegin)) {
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
+        O << ARMInstPrinter::getRegisterName(Reg0) << ", ";;
+        RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
+      }
+      O << ARMInstPrinter::getRegisterName(RegBegin);
 
       // FIXME: The register allocator not only may not have given us the
       // registers in sequence, but may not be in ascending registers. This
@@ -500,7 +339,38 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       if (!FlagsOP.isImm())
         return true;
       unsigned Flags = FlagsOP.getImm();
+
+      // This operand may not be the one that actually provides the register. If
+      // it's tied to a previous one then we should refer instead to that one
+      // for registers and their classes.
+      unsigned TiedIdx;
+      if (InlineAsm::isUseOperandTiedToDef(Flags, TiedIdx)) {
+        for (OpNum = InlineAsm::MIOp_FirstOperand; TiedIdx; --TiedIdx) {
+          unsigned OpFlags = MI->getOperand(OpNum).getImm();
+          OpNum += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+        }
+        Flags = MI->getOperand(OpNum).getImm();
+
+        // Later code expects OpNum to be pointing at the register rather than
+        // the flags.
+        OpNum += 1;
+      }
+
       unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      unsigned RC;
+      InlineAsm::hasRegClassConstraint(Flags, RC);
+      if (RC == ARM::GPRPairRegClassID) {
+        if (NumVals != 1)
+          return true;
+        const MachineOperand &MO = MI->getOperand(OpNum);
+        if (!MO.isReg())
+          return true;
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ?
+            ARM::gsub_0 : ARM::gsub_1);
+        O << ARMInstPrinter::getRegisterName(Reg);
+        return false;
+      }
       if (NumVals != 2)
         return true;
       unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
@@ -704,11 +574,6 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
     // generates code that does this, it is always safe to set.
     OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   }
-  // FIXME: This should eventually end up somewhere else where more
-  // intelligent flag decisions can be made. For now we are just maintaining
-  // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
-  if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&OutStreamer))
-    MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
 }
 
 //===----------------------------------------------------------------------===//
@@ -718,145 +583,150 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
 // to appear in the .ARM.attributes section in ELF.
 // Instead of subclassing the MCELFStreamer, we do the work here.
 
-void ARMAsmPrinter::emitAttributes() {
-
-  emitARMAttributeSection();
-
-  /* GAS expect .fpu to be emitted, regardless of VFP build attribute */
-  bool emitFPU = false;
-  AttributeEmitter *AttrEmitter;
-  if (OutStreamer.hasRawTextSupport()) {
-    AttrEmitter = new AsmAttributeEmitter(OutStreamer);
-    emitFPU = true;
-  } else {
-    MCObjectStreamer &O = static_cast<MCObjectStreamer&>(OutStreamer);
-    AttrEmitter = new ObjectAttributeEmitter(O);
-  }
-
-  AttrEmitter->MaybeSwitchVendor("aeabi");
-
-  std::string CPUString = Subtarget->getCPUString();
-
-  if (CPUString == "cortex-a8" ||
-      Subtarget->isCortexA8()) {
-    AttrEmitter->EmitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a8");
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
-                               ARMBuildAttrs::ApplicationProfile);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::AllowThumb32);
-    // Fixme: figure out when this is emitted.
-    //AttrEmitter->EmitAttribute(ARMBuildAttrs::WMMX_arch,
-    //                           ARMBuildAttrs::AllowWMMXv1);
-    //
-
-    /// ADD additional Else-cases here!
-  } else if (CPUString == "xscale") {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TEJ);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::Allowed);
-  } else if (CPUString == "generic") {
-    // For a generic CPU, we assume a standard v7a architecture in Subtarget.
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
-                               ARMBuildAttrs::ApplicationProfile);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::AllowThumb32);
-  } else if (Subtarget->hasV7Ops()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::AllowThumb32);
+static ARMBuildAttrs::CPUArch getArchForCPU(StringRef CPU,
+                                            const ARMSubtarget *Subtarget) {
+  if (CPU == "xscale")
+    return ARMBuildAttrs::v5TEJ;
+
+  if (Subtarget->hasV8Ops())
+    return ARMBuildAttrs::v8;
+  else if (Subtarget->hasV7Ops()) {
+    if (Subtarget->isMClass() && Subtarget->hasThumb2DSP())
+      return ARMBuildAttrs::v7E_M;
+    return ARMBuildAttrs::v7;
   } else if (Subtarget->hasV6T2Ops())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+    return ARMBuildAttrs::v6T2;
+  else if (Subtarget->hasV6MOps())
+    return ARMBuildAttrs::v6S_M;
   else if (Subtarget->hasV6Ops())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+    return ARMBuildAttrs::v6;
   else if (Subtarget->hasV5TEOps())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+    return ARMBuildAttrs::v5TE;
   else if (Subtarget->hasV5TOps())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+    return ARMBuildAttrs::v5T;
   else if (Subtarget->hasV4TOps())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+    return ARMBuildAttrs::v4T;
+  else
+    return ARMBuildAttrs::v4;
+}
 
-  if (Subtarget->hasNEON() && emitFPU) {
-    /* NEON is not exactly a VFP architecture, but GAS emit one of
-     * neon/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
-    if (Subtarget->hasVFP4())
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
-                                     "neon-vfpv4");
-    else
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::Advanced_SIMD_arch, "neon");
-    /* If emitted for NEON, omit from VFP below, since you can have both
-     * NEON and VFP in build attributes but only one .fpu */
-    emitFPU = false;
+void ARMAsmPrinter::emitAttributes() {
+  MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+
+  ATS.switchVendor("aeabi");
+
+  std::string CPUString = Subtarget->getCPUString();
+
+  if (CPUString != "generic")
+    ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, CPUString);
+
+  ATS.emitAttribute(ARMBuildAttrs::CPU_arch,
+                    getArchForCPU(CPUString, Subtarget));
+
+  if (Subtarget->isAClass()) {
+    ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                      ARMBuildAttrs::ApplicationProfile);
+  } else if (Subtarget->isRClass()) {
+    ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                      ARMBuildAttrs::RealTimeProfile);
+  } else if (Subtarget->isMClass()){
+    ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                      ARMBuildAttrs::MicroControllerProfile);
   }
 
-  /* VFPv4 + .fpu */
-  if (Subtarget->hasVFP4()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
-                               ARMBuildAttrs::AllowFPv4A);
-    if (emitFPU)
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv4");
-
-  /* VFPv3 + .fpu */
-  } else if (Subtarget->hasVFP3()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
-                               ARMBuildAttrs::AllowFPv3A);
-    if (emitFPU)
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv3");
-
-  /* VFPv2 + .fpu */
-  } else if (Subtarget->hasVFP2()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
-                               ARMBuildAttrs::AllowFPv2);
-    if (emitFPU)
-      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "vfpv2");
+  ATS.emitAttribute(ARMBuildAttrs::ARM_ISA_use, Subtarget->hasARMOps() ?
+                      ARMBuildAttrs::Allowed : ARMBuildAttrs::Not_Allowed);
+  if (Subtarget->isThumb1Only()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::Allowed);
+  } else if (Subtarget->hasThumb2()) {
+    ATS.emitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                      ARMBuildAttrs::AllowThumb32);
   }
 
-  /* TODO: ARMBuildAttrs::Allowed is not completely accurate,
-   * since NEON can have 1 (allowed) or 2 (MAC operations) */
   if (Subtarget->hasNEON()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
-                               ARMBuildAttrs::Allowed);
+    /* NEON is not exactly a VFP architecture, but GAS emit one of
+     * neon/neon-fp-armv8/neon-vfpv4/vfpv3/vfpv2 for .fpu parameters */
+    if (Subtarget->hasFPARMv8()) {
+      if (Subtarget->hasCrypto())
+        ATS.emitFPU(ARM::CRYPTO_NEON_FP_ARMV8);
+      else
+        ATS.emitFPU(ARM::NEON_FP_ARMV8);
+    }
+    else if (Subtarget->hasVFP4())
+      ATS.emitFPU(ARM::NEON_VFPV4);
+    else
+      ATS.emitFPU(ARM::NEON);
+    // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture
+    if (Subtarget->hasV8Ops())
+      ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                        ARMBuildAttrs::AllowNeonARMv8);
+  } else {
+    if (Subtarget->hasFPARMv8())
+      ATS.emitFPU(ARM::FP_ARMV8);
+    else if (Subtarget->hasVFP4())
+      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV4_D16 : ARM::VFPV4);
+    else if (Subtarget->hasVFP3())
+      ATS.emitFPU(Subtarget->hasD16() ? ARM::VFPV3_D16 : ARM::VFPV3);
+    else if (Subtarget->hasVFP2())
+      ATS.emitFPU(ARM::VFPV2);
   }
 
   // Signal various FP modes.
   if (!TM.Options.UnsafeFPMath) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_denormal,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
-                               ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_exceptions,
+                      ARMBuildAttrs::Allowed);
   }
 
   if (TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath)
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
-                               ARMBuildAttrs::Allowed);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::Allowed);
   else
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_FP_number_model,
-                               ARMBuildAttrs::AllowIEE754);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_FP_number_model,
+                      ARMBuildAttrs::AllowIEE754);
 
   // FIXME: add more flags to ARMBuildAttrs.h
   // 8-bytes alignment stuff.
-  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
-  AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align8_needed, 1);
+  ATS.emitAttribute(ARMBuildAttrs::ABI_align8_preserved, 1);
+
+  // ABI_HardFP_use attribute to indicate single precision FP.
+  if (Subtarget->isFPOnlySP())
+    ATS.emitAttribute(ARMBuildAttrs::ABI_HardFP_use,
+                      ARMBuildAttrs::HardFPSinglePrecision);
 
   // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
-  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_HardFP_use, 3);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ABI_VFP_args, 1);
-  }
+  if (Subtarget->isAAPCS_ABI() && TM.Options.FloatABIType == FloatABI::Hard)
+    ATS.emitAttribute(ARMBuildAttrs::ABI_VFP_args, ARMBuildAttrs::HardFPAAPCS);
+
   // FIXME: Should we signal R9 usage?
 
-  if (Subtarget->hasDivide())
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::DIV_use, 1);
+  if (Subtarget->hasFP16())
+      ATS.emitAttribute(ARMBuildAttrs::FP_HP_extension, ARMBuildAttrs::AllowHPFP);
+
+  if (Subtarget->hasMPExtension())
+      ATS.emitAttribute(ARMBuildAttrs::MPextension_use, ARMBuildAttrs::AllowMP);
+
+  if (Subtarget->hasDivide()) {
+    // Check if hardware divide is only available in thumb2 or ARM as well.
+    ATS.emitAttribute(ARMBuildAttrs::DIV_use,
+      Subtarget->hasDivideInARMMode() ? ARMBuildAttrs::AllowDIVExt :
+                                        ARMBuildAttrs::AllowDIVIfExists);
+  }
 
-  AttrEmitter->Finish();
-  delete AttrEmitter;
+  if (Subtarget->hasTrustZone() && Subtarget->hasVirtualization())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowTZVirtualization);
+  else if (Subtarget->hasTrustZone())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowTZ);
+  else if (Subtarget->hasVirtualization())
+      ATS.emitAttribute(ARMBuildAttrs::Virtualization_use,
+                        ARMBuildAttrs::AllowVirtualization);
+
+  ATS.finishAttributeSection();
 }
 
 void ARMAsmPrinter::emitARMAttributeSection() {
@@ -908,7 +778,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
   bool isIndirect = Subtarget->isTargetDarwin() &&
     Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
   if (!isIndirect)
-    return Mang->getSymbol(GV);
+    return getSymbol(GV);
 
   // FIXME: Remove this when Darwin transition to @GOT like syntax.
   MCSymbol *MCSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
@@ -919,7 +789,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
     MMIMachO.getGVStubEntry(MCSym);
   if (StubSym.getPointer() == 0)
     StubSym = MachineModuleInfoImpl::
-      StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+      StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
   return MCSym;
 }
 
@@ -1092,27 +962,12 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
     OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
 }
 
-void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
-                                           raw_ostream &OS) {
-  unsigned NOps = MI->getNumOperands();
-  assert(NOps==4);
-  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-  // cast away const; DIetc do not take const operands for some reason.
-  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
-  OS << V.getName();
-  OS << " <- ";
-  // Frame address.  Currently handles register +- offset only.
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
-  OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
-  OS << ']';
-  OS << "+";
-  printOperand(MI, NOps-2, OS);
-}
-
 void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
   assert(MI->getFlag(MachineInstr::FrameSetup) &&
       "Only instruction which are involved into frame setup code are allowed");
 
+  MCTargetStreamer &TS = OutStreamer.getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
   const MachineFunction &MF = *MI->getParent()->getParent();
   const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
   const ARMFunctionInfo &AFI = *MF.getInfo<ARMFunctionInfo>();
@@ -1175,7 +1030,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
       RegList.push_back(SrcReg);
       break;
     }
-    OutStreamer.EmitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+    ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
   } else {
     // Changes of stack / frame pointer.
     if (SrcReg == ARM::SP) {
@@ -1223,11 +1078,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
       if (DstReg == FramePtr && FramePtr != ARM::SP)
         // Set-up of the frame pointer. Positive values correspond to "add"
         // instruction.
-        OutStreamer.EmitSetFP(FramePtr, ARM::SP, -Offset);
+        ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
       else if (DstReg == ARM::SP) {
         // Change of SP by an offset. Positive values correspond to "sub"
         // instruction.
-        OutStreamer.EmitPad(Offset);
+        ATS.emitPad(Offset);
       } else {
         MI->dump();
         llvm_unreachable("Unsupported opcode for unwinding information");
@@ -1272,15 +1127,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   unsigned Opc = MI->getOpcode();
   switch (Opc) {
   case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
-  case ARM::DBG_VALUE: {
-    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
-      SmallString<128> TmpStr;
-      raw_svector_ostream OS(TmpStr);
-      PrintDebugValueComment(MI, OS);
-      OutStreamer.EmitRawText(StringRef(OS.str()));
-    }
-    return;
-  }
+  case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing");
   case ARM::LEApcrel:
   case ARM::tLEApcrel:
   case ARM::t2LEApcrel: {
@@ -1376,7 +1223,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addReg(0));
 
     const GlobalValue *GV = MI->getOperand(0).getGlobal();
-    MCSymbol *GVSym = Mang->getSymbol(GV);
+    MCSymbol *GVSym = getSymbol(GV);
     const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(ARM::Bcc)
       .addExpr(GVSymExpr)
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index c945e4f..de72e06 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -97,13 +97,9 @@ private:
                                    const MachineInstr *MI);
 
 public:
-  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-
-  virtual MachineLocation
-    getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
-
   /// EmitDwarfRegOp - Emit dwarf register operation.
-  virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const LLVM_OVERRIDE;
+  virtual void EmitDwarfRegOp(const MachineLocation &MLoc, bool Indirect) const
+      LLVM_OVERRIDE;
 
   virtual unsigned getISAEncoding() LLVM_OVERRIDE {
     // ARM/Darwin adds ISA to the DWARF info for each function.
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 6005054..f835a4e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARMBaseInstrInfo.h"
 #include "ARM.h"
+#include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
 #include "ARMConstantPoolValue.h"
+#include "ARMFeatures.h"
 #include "ARMHazardRecognizer.h"
 #include "ARMMachineFunctionInfo.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
@@ -36,7 +37,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "ARMGenInstrInfo.inc"
 
 using namespace llvm;
@@ -113,8 +114,7 @@ ScheduleHazardRecognizer *ARMBaseInstrInfo::
 CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
                                    const ScheduleDAG *DAG) const {
   if (Subtarget.isThumb2() || Subtarget.hasVFP2())
-    return (ScheduleHazardRecognizer *)
-      new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget, DAG);
+    return (ScheduleHazardRecognizer *)new ARMHazardRecognizer(II, DAG);
   return TargetInstrInfo::CreateTargetPostRAHazardRecognizer(II, DAG);
 }
 
@@ -273,104 +273,90 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
-  // If the block has no terminators, it just falls into the block after it.
+  TBB = 0;
+  FBB = 0;
+
   MachineBasicBlock::iterator I = MBB.end();
   if (I == MBB.begin())
-    return false;
+    return false; // Empty blocks are easy.
   --I;
-  while (I->isDebugValue()) {
-    if (I == MBB.begin())
-      return false;
-    --I;
-  }
-
-  // Get the last instruction in the block.
-  MachineInstr *LastInst = I;
-  unsigned LastOpc = LastInst->getOpcode();
 
-  // Check if it's an indirect branch first, this should return 'unanalyzable'
-  // even if it's predicated.
-  if (isIndirectBranchOpcode(LastOpc))
-    return true;
+  // Walk backwards from the end of the basic block until the branch is
+  // analyzed or we give up.
+  while (isPredicated(I) || I->isTerminator()) {
 
-  if (!isUnpredicatedTerminator(I))
-    return false;
+    // Flag to be raised on unanalyzeable instructions. This is useful in cases
+    // where we want to clean up on the end of the basic block before we bail
+    // out.
+    bool CantAnalyze = false;
 
-  // If there is only one terminator instruction, process it.
-  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-    if (isUncondBranchOpcode(LastOpc)) {
-      TBB = LastInst->getOperand(0).getMBB();
-      return false;
+    // Skip over DEBUG values and predicated nonterminators.
+    while (I->isDebugValue() || !I->isTerminator()) {
+      if (I == MBB.begin())
+        return false;
+      --I;
     }
-    if (isCondBranchOpcode(LastOpc)) {
-      // Block ends with fall-through condbranch.
-      TBB = LastInst->getOperand(0).getMBB();
-      Cond.push_back(LastInst->getOperand(1));
-      Cond.push_back(LastInst->getOperand(2));
-      return false;
+
+    if (isIndirectBranchOpcode(I->getOpcode()) ||
+        isJumpTableBranchOpcode(I->getOpcode())) {
+      // Indirect branches and jump tables can't be analyzed, but we still want
+      // to clean up any instructions at the tail of the basic block.
+      CantAnalyze = true;
+    } else if (isUncondBranchOpcode(I->getOpcode())) {
+      TBB = I->getOperand(0).getMBB();
+    } else if (isCondBranchOpcode(I->getOpcode())) {
+      // Bail out if we encounter multiple conditional branches.
+      if (!Cond.empty())
+        return true;
+
+      assert(!FBB && "FBB should have been null.");
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(I->getOperand(1));
+      Cond.push_back(I->getOperand(2));
+    } else if (I->isReturn()) {
+      // Returns can't be analyzed, but we should run cleanup.
+      CantAnalyze = !isPredicated(I);
+    } else {
+      // We encountered other unrecognized terminator. Bail out immediately.
+      return true;
     }
-    return true;  // Can't handle indirect branch.
-  }
 
-  // Get the instruction before it if it is a terminator.
-  MachineInstr *SecondLastInst = I;
-  unsigned SecondLastOpc = SecondLastInst->getOpcode();
-
-  // If AllowModify is true and the block ends with two or more unconditional
-  // branches, delete all but the first unconditional branch.
-  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
-    while (isUncondBranchOpcode(SecondLastOpc)) {
-      LastInst->eraseFromParent();
-      LastInst = SecondLastInst;
-      LastOpc = LastInst->getOpcode();
-      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-        // Return now the only terminator is an unconditional branch.
-        TBB = LastInst->getOperand(0).getMBB();
-        return false;
-      } else {
-        SecondLastInst = I;
-        SecondLastOpc = SecondLastInst->getOpcode();
+    // Cleanup code - to be run for unpredicated unconditional branches and
+    //                returns.
+    if (!isPredicated(I) &&
+          (isUncondBranchOpcode(I->getOpcode()) ||
+           isIndirectBranchOpcode(I->getOpcode()) ||
+           isJumpTableBranchOpcode(I->getOpcode()) ||
+           I->isReturn())) {
+      // Forget any previous condition branch information - it no longer applies.
+      Cond.clear();
+      FBB = 0;
+
+      // If we can modify the function, delete everything below this
+      // unconditional branch.
+      if (AllowModify) {
+        MachineBasicBlock::iterator DI = llvm::next(I);
+        while (DI != MBB.end()) {
+          MachineInstr *InstToDelete = DI;
+          ++DI;
+          InstToDelete->eraseFromParent();
+        }
       }
     }
-  }
 
-  // If there are three terminators, we don't know what sort of block this is.
-  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
-    return true;
-
-  // If the block ends with a B and a Bcc, handle it.
-  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
-    TBB =  SecondLastInst->getOperand(0).getMBB();
-    Cond.push_back(SecondLastInst->getOperand(1));
-    Cond.push_back(SecondLastInst->getOperand(2));
-    FBB = LastInst->getOperand(0).getMBB();
-    return false;
-  }
+    if (CantAnalyze)
+      return true;
 
-  // If the block ends with two unconditional branches, handle it.  The second
-  // one is not executed, so remove it.
-  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    I = LastInst;
-    if (AllowModify)
-      I->eraseFromParent();
-    return false;
-  }
+    if (I == MBB.begin())
+      return false;
 
-  // ...likewise if it ends with a branch table followed by an unconditional
-  // branch. The branch folder can create these, and we must get rid of them for
-  // correctness of Thumb constant islands.
-  if ((isJumpTableBranchOpcode(SecondLastOpc) ||
-       isIndirectBranchOpcode(SecondLastOpc)) &&
-      isUncondBranchOpcode(LastOpc)) {
-    I = LastInst;
-    if (AllowModify)
-      I->eraseFromParent();
-    return true;
+    --I;
   }
 
-  // Otherwise, can't handle this.
-  return true;
+  // We made it past the terminators without bailing out - we must have
+  // analyzed this branch successfully.
+  return false;
 }
 
 
@@ -535,11 +521,17 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
   if (!MI->isPredicable())
     return false;
 
-  if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) {
-    ARMFunctionInfo *AFI =
-      MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
-    return AFI->isThumb2Function();
+  ARMFunctionInfo *AFI =
+    MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+
+  if (AFI->isThumb2Function()) {
+    if (getSubtarget().restrictIT())
+      return isV8EligibleForIT(MI);
+  } else { // non-Thumb
+    if ((MI->getDesc().TSFlags & ARMII::DomainMask) == ARMII::DomainNEON)
+      return false;
   }
+
   return true;
 }
 
@@ -660,16 +652,16 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                    unsigned DestReg, unsigned SrcReg,
                                    bool KillSrc) const {
   bool GPRDest = ARM::GPRRegClass.contains(DestReg);
-  bool GPRSrc  = ARM::GPRRegClass.contains(SrcReg);
+  bool GPRSrc = ARM::GPRRegClass.contains(SrcReg);
 
   if (GPRDest && GPRSrc) {
     AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr), DestReg)
-                                  .addReg(SrcReg, getKillRegState(KillSrc))));
+                                    .addReg(SrcReg, getKillRegState(KillSrc))));
     return;
   }
 
   bool SPRDest = ARM::SPRRegClass.contains(DestReg);
-  bool SPRSrc  = ARM::SPRRegClass.contains(SrcReg);
+  bool SPRSrc = ARM::SPRRegClass.contains(SrcReg);
 
   unsigned Opc = 0;
   if (SPRDest && SPRSrc)
@@ -698,26 +690,47 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   int Spacing = 1;
 
   // Use VORRq when possible.
-  if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 2;
-  else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VORRq, BeginIdx = ARM::qsub_0, SubRegs = 4;
+  if (ARM::QQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 2;
+  } else if (ARM::QQQQPRRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VORRq;
+    BeginIdx = ARM::qsub_0;
+    SubRegs = 4;
   // Fall back to VMOVD.
-  else if (ARM::DPairRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2;
-  else if (ARM::DTripleRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3;
-  else if (ARM::DQuadRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4;
-  else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::MOVr, BeginIdx = ARM::gsub_0, SubRegs = 2;
-
-  else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2, Spacing = 2;
-  else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3, Spacing = 2;
-  else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg))
-    Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2;
+  } else if (ARM::DPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+  } else if (ARM::DTripleRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+  } else if (ARM::DQuadRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+  } else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg)) {
+    Opc = Subtarget.isThumb2() ? ARM::tMOVr : ARM::MOVr;
+    BeginIdx = ARM::gsub_0;
+    SubRegs = 2;
+  } else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 2;
+    Spacing = 2;
+  } else if (ARM::DTripleSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 3;
+    Spacing = 2;
+  } else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg)) {
+    Opc = ARM::VMOVD;
+    BeginIdx = ARM::dsub_0;
+    SubRegs = 4;
+    Spacing = 2;
+  }
 
   assert(Opc && "Impossible reg-to-reg copy");
 
@@ -726,26 +739,28 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
   if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
-    BeginIdx = BeginIdx + ((SubRegs-1)*Spacing);
+    BeginIdx = BeginIdx + ((SubRegs - 1) * Spacing);
     Spacing = -Spacing;
   }
 #ifndef NDEBUG
   SmallSet<unsigned, 4> DstRegs;
 #endif
   for (unsigned i = 0; i != SubRegs; ++i) {
-    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
-    unsigned Src = TRI->getSubReg(SrcReg,  BeginIdx + i*Spacing);
+    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
+    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
     assert(Dst && Src && "Bad sub-register");
 #ifndef NDEBUG
     assert(!DstRegs.count(Src) && "destructive vector copy");
     DstRegs.insert(Dst);
 #endif
-    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
-      .addReg(Src);
+    Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst).addReg(Src);
     // VORR takes two source operands.
     if (Opc == ARM::VORRq)
       Mov.addReg(Src);
     Mov = AddDefaultPred(Mov);
+    // MOVr can set CC.
+    if (Opc == ARM::MOVr)
+      Mov = AddDefaultCC(Mov);
   }
   // Add implicit super-register defs and kills to the last instruction.
   Mov->addRegisterDefined(DestReg, TRI);
@@ -1214,16 +1229,6 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
   return true;
 }
 
-MachineInstr*
-ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                           int FrameIx, uint64_t Offset,
-                                           const MDNode *MDPtr,
-                                           DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::DBG_VALUE))
-    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
 /// Create a copy of a const pool value. Update CPI to the new index and return
 /// the label UID.
 static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
@@ -1426,9 +1431,11 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case ARM::VLDRD:
   case ARM::VLDRS:
   case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
   case ARM::t2LDRDi8:
   case ARM::t2LDRSHi8:
   case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
   case ARM::t2LDRSHi12:
     break;
   }
@@ -1445,8 +1452,10 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
   case ARM::VLDRD:
   case ARM::VLDRS:
   case ARM::t2LDRi8:
+  case ARM::t2LDRBi8:
   case ARM::t2LDRSHi8:
   case ARM::t2LDRi12:
+  case ARM::t2LDRBi12:
   case ARM::t2LDRSHi12:
     break;
   }
@@ -1493,7 +1502,16 @@ bool ARMBaseInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
   if ((Offset2 - Offset1) / 8 > 64)
     return false;
 
-  if (Load1->getMachineOpcode() != Load2->getMachineOpcode())
+  // Check if the machine opcodes are different. If they are different
+  // then we consider them to not be of the same base address,
+  // EXCEPT in the case of Thumb2 byte loads where one is LDRBi8 and the other LDRBi12.
+  // In this case, they are considered to be the same because they are different
+  // encoding forms of the same basic instruction.
+  if ((Load1->getMachineOpcode() != Load2->getMachineOpcode()) &&
+      !((Load1->getMachineOpcode() == ARM::t2LDRBi8 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi12) ||
+        (Load1->getMachineOpcode() == ARM::t2LDRBi12 &&
+         Load2->getMachineOpcode() == ARM::t2LDRBi8)))
     return false;  // FIXME: overly conservative?
 
   // Four loads in a row should be sufficient.
@@ -1708,7 +1726,7 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
                                                bool PreferFalse) const {
   assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
          "Unknown select instruction");
-  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
   bool Invert = !DefMI;
   if (!DefMI)
@@ -1716,11 +1734,17 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   if (!DefMI)
     return 0;
 
+  // Find new register class to use.
+  MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+  unsigned       DestReg  = MI->getOperand(0).getReg();
+  const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
+  if (!MRI.constrainRegClass(DestReg, PreviousClass))
+    return 0;
+
   // Create a new predicated version of DefMI.
   // Rfalse is the first use.
   MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                                      DefMI->getDesc(),
-                                      MI->getOperand(0).getReg());
+                                      DefMI->getDesc(), DestReg);
 
   // Copy all the DefMI operands, excluding its (null) predicate.
   const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1743,7 +1767,6 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   // register operand tied to the first def.
   // The tie makes the register allocator ensure the FalseReg is allocated the
   // same register as operand 0.
-  MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
   FalseReg.setImplicit();
   NewMI.addOperand(FalseReg);
   NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
@@ -1803,6 +1826,14 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
                                unsigned DestReg, unsigned BaseReg, int NumBytes,
                                ARMCC::CondCodes Pred, unsigned PredReg,
                                const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0)
+      .setMIFlags(MIFlags);
+    return;
+  }
+
   bool isSub = NumBytes < 0;
   if (isSub) NumBytes = -NumBytes;
 
@@ -1826,6 +1857,115 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
   }
 }
 
+bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
+                                      MachineInstr *MI,
+                                      unsigned NumBytes) {
+  // This optimisation potentially adds lots of load and store
+  // micro-operations, it's only really a great benefit to code-size.
+  if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
+    return false;
+
+  // If only one register is pushed/popped, LLVM can use an LDR/STR
+  // instead. We can't modify those so make sure we're dealing with an
+  // instruction we understand.
+  bool IsPop = isPopOpcode(MI->getOpcode());
+  bool IsPush = isPushOpcode(MI->getOpcode());
+  if (!IsPush && !IsPop)
+    return false;
+
+  bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
+                      MI->getOpcode() == ARM::VLDMDIA_UPD;
+  bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
+                     MI->getOpcode() == ARM::tPOP ||
+                     MI->getOpcode() == ARM::tPOP_RET;
+
+  assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
+                          MI->getOperand(1).getReg() == ARM::SP)) &&
+         "trying to fold sp update into non-sp-updating push/pop");
+
+  // The VFP push & pop act on D-registers, so we can only fold an adjustment
+  // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try
+  // if this is violated.
+  if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
+    return false;
+
+  // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
+  // pred) so the list starts at 4. Thumb1 starts after the predicate.
+  int RegListIdx = IsT1PushPop ? 2 : 4;
+
+  // Calculate the space we'll need in terms of registers.
+  unsigned FirstReg = MI->getOperand(RegListIdx).getReg();
+  unsigned RD0Reg, RegsNeeded;
+  if (IsVFPPushPop) {
+    RD0Reg = ARM::D0;
+    RegsNeeded = NumBytes / 8;
+  } else {
+    RD0Reg = ARM::R0;
+    RegsNeeded = NumBytes / 4;
+  }
+
+  // We're going to have to strip all list operands off before
+  // re-adding them since the order matters, so save the existing ones
+  // for later.
+  SmallVector<MachineOperand, 4> RegList;
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    RegList.push_back(MI->getOperand(i));
+
+  MachineBasicBlock *MBB = MI->getParent();
+  const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
+
+  // Now try to find enough space in the reglist to allocate NumBytes.
+  for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded;
+       --CurReg) {
+    if (!IsPop) {
+      // Pushing any register is completely harmless, mark the
+      // register involved as undef since we don't care about it in
+      // the slightest.
+      RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
+                                                  false, false, true));
+      --RegsNeeded;
+      continue;
+    }
+
+    // However, we can only pop an extra register if it's not live. For
+    // registers live within the function we might clobber a return value
+    // register; the other way a register can be live here is if it's
+    // callee-saved.
+    if (isCalleeSavedRegister(CurReg, CSRegs) ||
+        MBB->computeRegisterLiveness(TRI, CurReg, MI) !=
+            MachineBasicBlock::LQR_Dead) {
+      // VFP pops don't allow holes in the register list, so any skip is fatal
+      // for our transformation. GPR pops do, so we should just keep looking.
+      if (IsVFPPushPop)
+        return false;
+      else
+        continue;
+    }
+
+    // Mark the unimportant registers as <def,dead> in the POP.
+    RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, false,
+                                                true));
+    --RegsNeeded;
+  }
+
+  if (RegsNeeded > 0)
+    return false;
+
+  // Finally we know we can profitably perform the optimisation so go
+  // ahead: strip all existing registers off and add them back again
+  // in the right order.
+  for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
+    MI->RemoveOperand(i);
+
+  // Add the complete list back in.
+  MachineInstrBuilder MIB(MF, &*MI);
+  for (int i = RegList.size() - 1; i >= 0; --i)
+    MIB.addOperand(RegList[i]);
+
+  return true;
+}
+
 bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
                                 unsigned FrameReg, int &Offset,
                                 const ARMBaseInstrInfo &TII) {
@@ -2232,8 +2372,32 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
           isSafe = true;
           break;
         }
-        // Condition code is after the operand before CPSR.
-        ARMCC::CondCodes CC = (ARMCC::CondCodes)Instr.getOperand(IO-1).getImm();
+        // Condition code is after the operand before CPSR except for VSELs.
+        ARMCC::CondCodes CC;
+        bool IsInstrVSel = true;
+        switch (Instr.getOpcode()) {
+        default:
+          IsInstrVSel = false;
+          CC = (ARMCC::CondCodes)Instr.getOperand(IO - 1).getImm();
+          break;
+        case ARM::VSELEQD:
+        case ARM::VSELEQS:
+          CC = ARMCC::EQ;
+          break;
+        case ARM::VSELGTD:
+        case ARM::VSELGTS:
+          CC = ARMCC::GT;
+          break;
+        case ARM::VSELGED:
+        case ARM::VSELGES:
+          CC = ARMCC::GE;
+          break;
+        case ARM::VSELVSS:
+        case ARM::VSELVSD:
+          CC = ARMCC::VS;
+          break;
+        }
+
         if (Sub) {
           ARMCC::CondCodes NewCC = getSwappedCondition(CC);
           if (NewCC == ARMCC::AL)
@@ -2244,11 +2408,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
           // If it is safe to remove CmpInstr, the condition code of these
           // operands will be modified.
           if (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
-              Sub->getOperand(2).getReg() == SrcReg)
-            OperandsToUpdate.push_back(std::make_pair(&((*I).getOperand(IO-1)),
-                                                      NewCC));
-        }
-        else
+              Sub->getOperand(2).getReg() == SrcReg) {
+            // VSel doesn't support condition code update.
+            if (IsInstrVSel)
+              return false;
+            OperandsToUpdate.push_back(
+                std::make_pair(&((*I).getOperand(IO - 1)), NewCC));
+          }
+        } else
           switch (CC) {
           default:
             // CPSR can be used multiple times, we should continue.
@@ -3604,6 +3771,24 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
   return Latency;
 }
 
+unsigned ARMBaseInstrInfo::getPredicationCost(const MachineInstr *MI) const {
+   if (MI->isCopyLike() || MI->isInsertSubreg() ||
+      MI->isRegSequence() || MI->isImplicitDef())
+    return 0;
+
+  if (MI->isBundle())
+    return 0;
+
+  const MCInstrDesc &MCID = MI->getDesc();
+
+  if (MCID.isCall() || MCID.hasImplicitDefOfPhysReg(ARM::CPSR)) {
+    // When predicated, CPSR is an additional source operand for CPSR updating
+    // instructions, this apparently increases their latencies.
+    return 1;
+  }
+  return 0;
+}
+
 unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                            const MachineInstr *MI,
                                            unsigned *PredCost) const {
@@ -3685,8 +3870,7 @@ hasHighOperandLatency(const InstrItineraryData *ItinData,
     return true;
 
   // Hoist VFP / NEON instructions with 4 or higher latency.
-  int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx,
-                                      /*FindMin=*/false);
+  int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
   if (Latency < 0)
     Latency = getInstrLatency(ItinData, DefMI);
   if (Latency <= 3)
@@ -4137,7 +4321,7 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI,
   // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
   // the full D-register by loading the same value to both lanes.  The
   // instruction is micro-coded with 2 uops, so don't do this until we can
-  // properly schedule micro-coded instuctions.  The dispatcher stalls cause
+  // properly schedule micro-coded instructions.  The dispatcher stalls cause
   // too big regressions.
 
   // Insert the dependency-breaking FCONSTD before MI.
@@ -4152,6 +4336,8 @@ bool ARMBaseInstrInfo::hasNOP() const {
 }
 
 bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const {
+  if (MI->getNumOperands() < 4)
+    return true;
   unsigned ShOpVal = MI->getOperand(3).getImm();
   unsigned ShImm = ARM_AM::getSORegOffset(ShOpVal);
   // Swift supports faster shifts for: lsl 2, lsl 1, and lsr 1.
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 2ef659c..93e5964 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -46,7 +46,7 @@ public:
                                               MachineBasicBlock::iterator &MBBI,
                                               LiveVariables *LV) const;
 
-  virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
+  virtual const ARMBaseRegisterInfo &getRegisterInfo() const = 0;
   const ARMSubtarget &getSubtarget() const { return Subtarget; }
 
   ScheduleHazardRecognizer *
@@ -125,12 +125,6 @@ public:
 
   virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
 
-  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
-                                                 int FrameIx,
-                                                 uint64_t Offset,
-                                                 const MDNode *MDPtr,
-                                                 DebugLoc DL) const;
-
   virtual void reMaterialize(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              unsigned DestReg, unsigned SubIdx,
@@ -270,6 +264,8 @@ private:
                         const MCInstrDesc &UseMCID,
                         unsigned UseIdx, unsigned UseAlign) const;
 
+  unsigned getPredicationCost(const MachineInstr *MI) const;
+
   unsigned getInstrLatency(const InstrItineraryData *ItinData,
                            const MachineInstr *MI,
                            unsigned *PredCost = 0) const;
@@ -366,6 +362,17 @@ bool isIndirectBranchOpcode(int Opc) {
   return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
 }
 
+static inline bool isPopOpcode(int Opc) {
+  return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
+         Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
+         Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
+}
+
+static inline bool isPushOpcode(int Opc) {
+  return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
+         Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
+}
+
 /// getInstrPredicate - If instruction is predicated, returns its predicate
 /// condition, otherwise returns AL. It also returns the condition code
 /// register by reference.
@@ -405,6 +412,13 @@ void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
                                const ARMBaseRegisterInfo& MRI,
                                unsigned MIFlags = 0);
 
+/// Tries to add registers to the reglist of a given base-updating
+/// push/pop instruction to adjust the stack by an additional
+/// NumBytes. This can save a few bytes per function in code-size, but
+/// obviously generates more memory traffic. As such, it only takes
+/// effect in functions being optimised for size.
+bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI,
+                                unsigned NumBytes);
 
 /// rewriteARMFrameIndex / rewriteT2FrameIndex -
 /// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index b0d34a7..8717dc0 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -43,46 +43,73 @@
 
 using namespace llvm;
 
-ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
-                                         const ARMSubtarget &sti)
-  : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), TII(tii), STI(sti),
+ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
+  : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti),
     FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11),
     BasePtr(ARM::R6) {
 }
 
 const uint16_t*
 ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  bool ghcCall = false;
- 
-  if (MF) {
-    const Function *F = MF->getFunction();
-    ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
-  }
- 
-  if (ghcCall) {
-      return CSR_GHC_SaveList;
-  }
-  else {
-  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-    ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
+  const uint16_t *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+                                ? CSR_iOS_SaveList
+                                : CSR_AAPCS_SaveList;
+
+  if (!MF) return RegList;
+
+  const Function *F = MF->getFunction();
+  if (F->getCallingConv() == CallingConv::GHC) {
+    // GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around
+    return CSR_NoRegs_SaveList;
+  } else if (F->hasFnAttribute("interrupt")) {
+    if (STI.isMClass()) {
+      // M-class CPUs have hardware which saves the registers needed to allow a
+      // function conforming to the AAPCS to function as a handler.
+      return CSR_AAPCS_SaveList;
+    } else if (F->getFnAttribute("interrupt").getValueAsString() == "FIQ") {
+      // Fast interrupt mode gives the handler a private copy of R8-R14, so less
+      // need to be saved to restore user-mode state.
+      return CSR_FIQ_SaveList;
+    } else {
+      // Generally only R13-R14 (i.e. SP, LR) are automatically preserved by
+      // exception handling.
+      return CSR_GenericInt_SaveList;
+    }
   }
+
+  return RegList;
 }
 
 const uint32_t*
-ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
+ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return CSR_NoRegs_RegMask;
   return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
     ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
 }
 
 const uint32_t*
-ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
-  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-    ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask;
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
 }
 
 const uint32_t*
-ARMBaseRegisterInfo::getNoPreservedMask() const {
-  return CSR_NoRegs_RegMask;
+ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const {
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i32 argument (which must also be the register used to return a
+  // single i32 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both or otherwise does not want to enable this optimization, the function
+  // should return NULL
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return NULL;
+  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+    ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask;
 }
 
 BitVector ARMBaseRegisterInfo::
@@ -94,6 +121,7 @@ getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(ARM::SP);
   Reserved.set(ARM::PC);
   Reserved.set(ARM::FPSCR);
+  Reserved.set(ARM::APSR_NZCV);
   if (TFI->hasFP(MF))
     Reserved.set(FramePtr);
   if (hasBasePointer(MF))
@@ -309,7 +337,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
   // 1. Dynamic stack realignment is explicitly disabled,
   // 2. This is a Thumb1 function (it's not useful, so we don't bother), or
   // 3. There are VLAs in the function and the base pointer is disabled.
-  if (!MF.getTarget().Options.RealignStack)
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
     return false;
   if (AFI->isThumb1OnlyFunction())
     return false;
@@ -357,14 +385,6 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return ARM::SP;
 }
 
-unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
-
 /// emitLoadConstPool - Emits a load from constpool to materialize the
 /// specified immediate.
 void ARMBaseRegisterInfo::
@@ -375,6 +395,7 @@ emitLoadConstPool(MachineBasicBlock &MBB,
                   ARMCC::CondCodes Pred,
                   unsigned PredReg, unsigned MIFlags) const {
   MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
   const Constant *C =
         ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
@@ -556,9 +577,10 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
   if (Ins != MBB->end())
     DL = Ins->getDebugLoc();
 
-  const MCInstrDesc &MCID = TII.get(ADDriOpc);
-  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
   const MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  const MCInstrDesc &MCID = TII.get(ADDriOpc);
   MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
 
   MachineInstrBuilder MIB = AddDefaultPred(BuildMI(*MBB, Ins, DL, MCID, BaseReg)
@@ -574,6 +596,8 @@ ARMBaseRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
   MachineInstr &MI = *I;
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   int Off = Offset; // ARM doesn't need the general 64-bit offsets
   unsigned i = 0;
@@ -671,6 +695,8 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MachineInstr &MI = *II;
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
   const ARMFrameLowering *TFI =
     static_cast<const ARMFrameLowering*>(MF.getTarget().getFrameLowering());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -696,12 +722,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 #endif // NDEBUG
 
-  // Special handling of dbg_value instructions.
-  if (MI.isDebugValue()) {
-    MI.getOperand(FIOperandNum).  ChangeToRegister(FrameReg, false /*isDef*/);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
-    return;
-  }
+  assert(!MI.isDebugValue() && "DBG_VALUEs should be handled in target-independent code");
 
   // Modify MI as necessary to handle as much of 'Offset' as possible
   bool Done = false;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 0679919..e28fff6 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -72,9 +72,16 @@ static inline bool isARMArea3Register(unsigned Reg, bool isIOS) {
   }
 }
 
+static inline bool isCalleeSavedRegister(unsigned Reg,
+                                         const MCPhysReg *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
 class ARMBaseRegisterInfo : public ARMGenRegisterInfo {
 protected:
-  const ARMBaseInstrInfo &TII;
   const ARMSubtarget &STI;
 
   /// FramePtr - ARM physical register used as frame ptr.
@@ -86,8 +93,7 @@ protected:
   unsigned BasePtr;
 
   // Can be only subclassed.
-  explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
-                               const ARMSubtarget &STI);
+  explicit ARMBaseRegisterInfo(const ARMSubtarget &STI);
 
   // Return the opcode that implements 'Op', or 0 if no opcode
   unsigned getOpcode(int Op) const;
@@ -96,9 +102,18 @@ public:
   /// Code Generation virtual methods...
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const;
-  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
   const uint32_t *getNoPreservedMask() const;
 
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+  /// case that 'returned' is on an i32 first argument if the calling convention
+  /// is one that can (partially) model this attribute with a preserved mask
+  /// (i.e. it is a calling convention that uses the same register for the first
+  /// i32 argument and an i32 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+  
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
   const TargetRegisterClass*
@@ -142,10 +157,6 @@ public:
   unsigned getFrameRegister(const MachineFunction &MF) const;
   unsigned getBaseRegister() const { return BasePtr; }
 
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
-
   bool isLowRegister(unsigned Reg) const;
 
 
diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h
index 11bd6a4..b16d4ef 100644
--- a/lib/Target/ARM/ARMBuildAttrs.h
+++ b/lib/Target/ARM/ARMBuildAttrs.h
@@ -15,11 +15,13 @@
 #ifndef __TARGET_ARMBUILDATTRS_H__
 #define __TARGET_ARMBUILDATTRS_H__
 
+namespace llvm {
 namespace ARMBuildAttrs {
+
   enum SpecialAttr {
     // This is for the .cpu asm attr. It translates into one or more
     // AttrType (below) entries in the .ARM.attributes section in the ELF.
-    SEL_CPU 
+    SEL_CPU
   };
 
   enum AttrType {
@@ -57,7 +59,7 @@ namespace ARMBuildAttrs {
     ABI_FP_optimization_goals = 31,
     compatibility             = 32,
     CPU_unaligned_access      = 34,
-    VFP_HP_extension          = 36,
+    FP_HP_extension           = 36,
     ABI_FP_16bit_format       = 38,
     MPextension_use           = 42, // was 70, 2.08 ABI
     DIV_use                   = 44,
@@ -89,10 +91,11 @@ namespace ARMBuildAttrs {
     v7       = 10,  // e.g. Cortex A8, Cortex M3
     v6_M     = 11,  // e.g. Cortex M1
     v6S_M    = 12,  // v6_M with the System extensions
-    v7E_M    = 13   // v7_M with DSP extensions
+    v7E_M    = 13,  // v7_M with DSP extensions
+    v8       = 14   // v8, AArch32
   };
 
-  enum CPUArchProfile { // (=7), uleb128 
+  enum CPUArchProfile { // (=7), uleb128
     Not_Applicable = 0, // pre v7, or cross-profile code
     ApplicationProfile = (0x41), // 'A' (e.g. for Cortex A8)
     RealTimeProfile = (0x52), // 'R' (e.g. for Cortex R4)
@@ -101,31 +104,67 @@ namespace ARMBuildAttrs {
   };
 
   // The following have a lot of common use cases
-  enum { 
-    //ARMISAUse (=8), uleb128  and THUMBISAUse (=9), uleb128
+  enum {
     Not_Allowed = 0,
     Allowed = 1,
 
-    // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
+    // Tag_ARM_ISA_use (=8), uleb128
+
+    // Tag_THUMB_ISA_use, (=9), uleb128
+    AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
+
+    // Tag_FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
     AllowFPv2  = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA)
     AllowFPv3A = 3, // v3 FP ISA permitted (implies use of the v2 FP ISA)
-    AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 
-    AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) 
+    AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31
+    AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA)
     AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31
+    AllowFPARMv8A = 7, // Use of the ARM v8-A FP ISA was permitted
+    AllowFPARMv8B = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31
 
     // Tag_WMMX_arch, (=11), uleb128
-    AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
-    
-    // Tag_WMMX_arch, (=11), uleb128
-    AllowWMMXv1 = 2,  // The user permitted this entity to use WMMX v2
+    AllowWMMXv1 = 1,  // The user permitted this entity to use WMMX v1
+    AllowWMMXv2 = 2,  // The user permitted this entity to use WMMX v2
+
+    // Tag_Advanced_SIMD_arch, (=12), uleb128
+    AllowNeon = 1, // SIMDv1 was permitted
+    AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations)
+    AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted
 
-    // Tag_ABI_FP_denormal, (=20), uleb128 
+    // Tag_ABI_FP_denormal, (=20), uleb128
     PreserveFPSign = 2, // sign when flushed-to-zero is preserved
 
     // Tag_ABI_FP_number_model, (=23), uleb128
     AllowRTABI = 2,  // numbers, infinities, and one quiet NaN (see [RTABI])
-    AllowIEE754 = 3 // this code to use all the IEEE 754-defined FP encodings
+    AllowIEE754 = 3, // this code to use all the IEEE 754-defined FP encodings
+
+    // Tag_ABI_HardFP_use, (=27), uleb128
+    HardFPImplied = 0, // FP use should be implied by Tag_FP_arch
+    HardFPSinglePrecision = 1, // Single-precision only
+
+    // Tag_ABI_VFP_args, (=28), uleb128
+    BaseAAPCS = 0,
+    HardFPAAPCS = 1,
+
+    // Tag_FP_HP_extension, (=36), uleb128
+    AllowHPFP = 1, // Allow use of Half Precision FP
+
+    // Tag_MPextension_use, (=42), uleb128
+    AllowMP = 1, // Allow use of MP extensions
+
+    // Tag_DIV_use, (=44), uleb128
+    AllowDIVIfExists = 0, // Allow hardware divide if available in arch, or no info exists.
+    DisallowDIV = 1, // Hardware divide explicitly disallowed
+    AllowDIVExt = 2, // Allow hardware divide as optional architecture extension above
+                     // the base arch specified by Tag_CPU_arch and Tag_CPU_arch_profile.
+
+    // Tag_Virtualization_use, (=68), uleb128
+    AllowTZ = 1,
+    AllowVirtualization = 2,
+    AllowTZVirtualization = 3
   };
-}
+
+} // namespace ARMBuildAttrs
+} // namespace llvm
 
 #endif // __TARGET_ARMBUILDATTRS_H__
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 8ff666e..9bea4b2 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -207,10 +207,24 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
 def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 
 def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
-                                          (sub CSR_AAPCS_ThisReturn, R9))>;
+                                         (sub CSR_AAPCS_ThisReturn, R9))>;
+
+// The "interrupt" attribute is used to generate code that is acceptable in
+// exception-handlers of various kinds. It makes us use a different return
+// instruction (handled elsewhere) and affects which registers we must return to
+// our "caller" in the same state as we receive them.
+
+// For most interrupts, all registers except SP and LR are shared with
+// user-space. We mark LR to be saved anyway, since this is what the ARM backend
+// generally does rather than tracking its liveness as a normal register.
+def CSR_GenericInt : CalleeSavedRegs<(add LR, (sequence "R%u", 12, 0))>;
+
+// The fast interrupt handlers have more private state and get their own copies
+// of R8-R12, in addition to SP and LR. As before, mark LR for saving too.
+
+// FIXME: we mark R11 as callee-saved since it's often the frame-pointer, and
+// current frame lowering expects to encounter it while processing callee-saved
+// registers.
+def CSR_FIQ : CalleeSavedRegs<(add LR, R11, (sequence "R%u", 7, 0))>;
+
 
-// GHC set of callee saved regs is empty as all those regs are
-// used for passing STG regs around
-// add is a workaround for not being able to compile empty list:
-// def CSR_GHC : CalleeSavedRegs<()>;
-def CSR_GHC : CalleeSavedRegs<(add)>;
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 5e8e173..568ca85 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -167,6 +167,8 @@ namespace {
       const { return 0; }
     unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
       const { return 0; }
+    unsigned NEONThumb2V8PostEncoder(const MachineInstr &MI,unsigned Val)
+      const { return 0; }
     unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
       const { return 0; }
     unsigned getAdrLabelOpValue(const MachineInstr &MI, unsigned Op)
@@ -1044,8 +1046,8 @@ void ARMCodeEmitter::emitDataProcessingInstruction(const MachineInstr &MI,
       return;
   } else if ((MCID.Opcode == ARM::BFC) || (MCID.Opcode == ARM::BFI)) {
       uint32_t v = ~MI.getOperand(2).getImm();
-      int32_t lsb = CountTrailingZeros_32(v);
-      int32_t msb = (32 - CountLeadingZeros_32(v)) - 1;
+      int32_t lsb = countTrailingZeros(v);
+      int32_t msb = (32 - countLeadingZeros(v)) - 1;
       // Instr{20-16} = msb, Instr{11-7} = lsb
       Binary |= (msb & 0x1F) << 16;
       Binary |= (lsb & 0x1F) << 7;
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 4891609..cff5ce2 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -128,7 +128,7 @@ namespace {
         // If the block size isn't a multiple of the known bits, assume the
         // worst case padding.
         if (Size & ((1u << Bits) - 1))
-          Bits = CountTrailingZeros_32(Size);
+          Bits = countTrailingZeros(Size);
         return Bits;
       }
 
@@ -753,6 +753,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
             Scale = 4;
             break;
 
+          case ARM::LDRBi12:
           case ARM::LDRi12:
           case ARM::LDRcp:
           case ARM::t2LDRpci:
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
index 4e703ec..7d41c69 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -163,21 +163,7 @@ const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
 
 int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
                                                        unsigned Alignment) {
-  unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    if (Constants[i].isMachineConstantPoolEntry() &&
-        (Constants[i].getAlignment() & AlignMask) == 0) {
-      ARMConstantPoolValue *CPV =
-        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
-      ARMConstantPoolConstant *APC = dyn_cast<ARMConstantPoolConstant>(CPV);
-      if (!APC) continue;
-      if (APC->CVal == CVal && equals(APC))
-        return i;
-    }
-  }
-
-  return -1;
+  return getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
 }
 
 bool ARMConstantPoolConstant::hasSameValue(ARMConstantPoolValue *ACPV) {
@@ -216,22 +202,7 @@ ARMConstantPoolSymbol::Create(LLVMContext &C, const char *s,
 
 int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
                                                      unsigned Alignment) {
-  unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    if (Constants[i].isMachineConstantPoolEntry() &&
-        (Constants[i].getAlignment() & AlignMask) == 0) {
-      ARMConstantPoolValue *CPV =
-        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
-      ARMConstantPoolSymbol *APS = dyn_cast<ARMConstantPoolSymbol>(CPV);
-      if (!APS) continue;
-
-      if (APS->S == S && equals(APS))
-        return i;
-    }
-  }
-
-  return -1;
+  return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
 }
 
 bool ARMConstantPoolSymbol::hasSameValue(ARMConstantPoolValue *ACPV) {
@@ -271,22 +242,7 @@ ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
 
 int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
                                                   unsigned Alignment) {
-  unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
-  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
-    if (Constants[i].isMachineConstantPoolEntry() &&
-        (Constants[i].getAlignment() & AlignMask) == 0) {
-      ARMConstantPoolValue *CPV =
-        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
-      ARMConstantPoolMBB *APMBB = dyn_cast<ARMConstantPoolMBB>(CPV);
-      if (!APMBB) continue;
-
-      if (APMBB->MBB == MBB && equals(APMBB))
-        return i;
-    }
-  }
-
-  return -1;
+  return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
 }
 
 bool ARMConstantPoolMBB::hasSameValue(ARMConstantPoolValue *ACPV) {
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 93812fe..7ae7bf4 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -15,6 +15,7 @@
 #define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
 
 #include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <cstddef>
 
@@ -64,6 +65,26 @@ protected:
   ARMConstantPoolValue(LLVMContext &C, unsigned id, ARMCP::ARMCPKind Kind,
                        unsigned char PCAdj, ARMCP::ARMCPModifier Modifier,
                        bool AddCurrentAddress);
+
+  template <typename Derived>
+  int getExistingMachineCPValueImpl(MachineConstantPool *CP,
+                                    unsigned Alignment) {
+    unsigned AlignMask = Alignment - 1;
+    const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+    for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+      if (Constants[i].isMachineConstantPoolEntry() &&
+          (Constants[i].getAlignment() & AlignMask) == 0) {
+        ARMConstantPoolValue *CPV =
+            (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+        if (Derived *APC = dyn_cast<Derived>(CPV))
+          if (cast<Derived>(this)->equals(APC))
+            return i;
+      }
+    }
+
+    return -1;
+  }
+
 public:
   virtual ~ARMConstantPoolValue();
 
@@ -156,6 +177,10 @@ public:
   static bool classof(const ARMConstantPoolValue *APV) {
     return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
   }
+
+  bool equals(const ARMConstantPoolConstant *A) const {
+    return CVal == A->CVal && ARMConstantPoolValue::equals(A);
+  }
 };
 
 /// ARMConstantPoolSymbol - ARM-specific constantpool values for external
@@ -187,6 +212,10 @@ public:
   static bool classof(const ARMConstantPoolValue *ACPV) {
     return ACPV->isExtSymbol();
   }
+
+  bool equals(const ARMConstantPoolSymbol *A) const {
+    return S == A->S && ARMConstantPoolValue::equals(A);
+  }
 };
 
 /// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
@@ -219,6 +248,10 @@ public:
   static bool classof(const ARMConstantPoolValue *ACPV) {
     return ACPV->isMachineBasicBlock();
   }
+
+  bool equals(const ARMConstantPoolMBB *A) const {
+    return MBB == A->MBB && ARMConstantPoolValue::equals(A);
+  }
 };
 
 } // End llvm namespace
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index beb843c..e6f7f86 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -692,10 +692,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       unsigned newOpc = Opcode == ARM::VMOVScc ? ARM::VMOVS : ARM::VMOVD;
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(newOpc),
               MI.getOperand(1).getReg())
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
+        .addOperand(MI.getOperand(2))
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg());
+        .addOperand(MI.getOperand(4));
 
       MI.eraseFromParent();
       return true;
@@ -705,10 +704,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       unsigned Opc = AFI->isThumbFunction() ? ARM::t2MOVr : ARM::MOVr;
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
               MI.getOperand(1).getReg())
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
+        .addOperand(MI.getOperand(2))
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg())
+        .addOperand(MI.getOperand(4))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
@@ -717,39 +715,36 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
     case ARM::MOVCCsi: {
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
               (MI.getOperand(1).getReg()))
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
+        .addOperand(MI.getOperand(2))
         .addImm(MI.getOperand(3).getImm())
         .addImm(MI.getOperand(4).getImm()) // 'pred'
-        .addReg(MI.getOperand(5).getReg())
+        .addOperand(MI.getOperand(5))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
-
     case ARM::MOVCCsr: {
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsr),
               (MI.getOperand(1).getReg()))
-        .addReg(MI.getOperand(2).getReg(),
-                getKillRegState(MI.getOperand(2).isKill()))
-        .addReg(MI.getOperand(3).getReg(),
-                getKillRegState(MI.getOperand(3).isKill()))
+        .addOperand(MI.getOperand(2))
+        .addOperand(MI.getOperand(3))
         .addImm(MI.getOperand(4).getImm())
         .addImm(MI.getOperand(5).getImm()) // 'pred'
-        .addReg(MI.getOperand(6).getReg())
+        .addOperand(MI.getOperand(6))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
+    case ARM::t2MOVCCi16:
     case ARM::MOVCCi16: {
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi16),
+      unsigned NewOpc = AFI->isThumbFunction() ? ARM::t2MOVi16 : ARM::MOVi16;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
               MI.getOperand(1).getReg())
         .addImm(MI.getOperand(2).getImm())
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg());
-
+        .addOperand(MI.getOperand(4));
       MI.eraseFromParent();
       return true;
     }
@@ -760,23 +755,47 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
               MI.getOperand(1).getReg())
         .addImm(MI.getOperand(2).getImm())
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg())
+        .addOperand(MI.getOperand(4))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
+    case ARM::t2MVNCCi:
     case ARM::MVNCCi: {
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MVNi),
+      unsigned Opc = AFI->isThumbFunction() ? ARM::t2MVNi : ARM::MVNi;
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc),
               MI.getOperand(1).getReg())
         .addImm(MI.getOperand(2).getImm())
         .addImm(MI.getOperand(3).getImm()) // 'pred'
-        .addReg(MI.getOperand(4).getReg())
+        .addOperand(MI.getOperand(4))
         .addReg(0); // 's' bit
 
       MI.eraseFromParent();
       return true;
     }
+    case ARM::t2MOVCClsl:
+    case ARM::t2MOVCClsr:
+    case ARM::t2MOVCCasr:
+    case ARM::t2MOVCCror: {
+      unsigned NewOpc;
+      switch (Opcode) {
+      case ARM::t2MOVCClsl: NewOpc = ARM::t2LSLri; break;
+      case ARM::t2MOVCClsr: NewOpc = ARM::t2LSRri; break;
+      case ARM::t2MOVCCasr: NewOpc = ARM::t2ASRri; break;
+      case ARM::t2MOVCCror: NewOpc = ARM::t2RORri; break;
+      default: llvm_unreachable("unexpeced conditional move");
+      }
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpc),
+              MI.getOperand(1).getReg())
+        .addOperand(MI.getOperand(2))
+        .addImm(MI.getOperand(3).getImm())
+        .addImm(MI.getOperand(4).getImm()) // 'pred'
+        .addOperand(MI.getOperand(5))
+        .addReg(0); // 's' bit
+      MI.eraseFromParent();
+      return true;
+    }
     case ARM::Int_eh_sjlj_dispatchsetup: {
       MachineFunction &MF = *MI.getParent()->getParent();
       const ARMBaseInstrInfo *AII =
@@ -823,7 +842,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
 
     case ARM::MOVsrl_flag:
     case ARM::MOVsra_flag: {
-      // These are just fancy MOVs insructions.
+      // These are just fancy MOVs instructions.
       AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVsi),
                              MI.getOperand(0).getReg())
                      .addOperand(MI.getOperand(1))
@@ -938,6 +957,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
       ExpandMOV32BitImm(MBB, MBBI);
       return true;
 
+    case ARM::SUBS_PC_LR: {
+      MachineInstrBuilder MIB =
+          BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::SUBri), ARM::PC)
+              .addReg(ARM::LR)
+              .addOperand(MI.getOperand(0))
+              .addOperand(MI.getOperand(1))
+              .addOperand(MI.getOperand(2))
+              .addReg(ARM::CPSR, RegState::Undef);
+      TransferImpOps(MI, MIB, MIB);
+      MI.eraseFromParent();
+      return true;
+    }
     case ARM::VLDMQIA: {
       unsigned NewOpc = ARM::VLDMDIA;
       MachineInstrBuilder MIB =
diff --git a/lib/Target/ARM/ARMFPUName.def b/lib/Target/ARM/ARMFPUName.def
new file mode 100644
index 0000000..9a1bbe7
--- /dev/null
+++ b/lib/Target/ARM/ARMFPUName.def
@@ -0,0 +1,32 @@
+//===-- ARMFPUName.def - List of the ARM FPU names --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the list of the supported ARM FPU names.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef ARM_FPU_NAME
+#error "You must define ARM_FPU_NAME(NAME, ID) before including ARMFPUName.h"
+#endif
+
+ARM_FPU_NAME("vfp", VFP)
+ARM_FPU_NAME("vfpv2", VFPV2)
+ARM_FPU_NAME("vfpv3", VFPV3)
+ARM_FPU_NAME("vfpv3-d16", VFPV3_D16)
+ARM_FPU_NAME("vfpv4", VFPV4)
+ARM_FPU_NAME("vfpv4-d16", VFPV4_D16)
+ARM_FPU_NAME("fp-armv8", FP_ARMV8)
+ARM_FPU_NAME("neon", NEON)
+ARM_FPU_NAME("neon-vfpv4", NEON_VFPV4)
+ARM_FPU_NAME("neon-fp-armv8", NEON_FP_ARMV8)
+ARM_FPU_NAME("crypto-neon-fp-armv8", CRYPTO_NEON_FP_ARMV8)
+
+#undef ARM_FPU_NAME
diff --git a/lib/Target/ARM/ARMFPUName.h b/lib/Target/ARM/ARMFPUName.h
new file mode 100644
index 0000000..2a64cce
--- /dev/null
+++ b/lib/Target/ARM/ARMFPUName.h
@@ -0,0 +1,26 @@
+//===-- ARMFPUName.h - List of the ARM FPU names ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMFPUNAME_H
+#define ARMFPUNAME_H
+
+namespace llvm {
+namespace ARM {
+
+enum FPUKind {
+  INVALID_FPU = 0
+
+#define ARM_FPU_NAME(NAME, ID) , ID
+#include "ARMFPUName.def"
+};
+
+} // namespace ARM
+} // namespace llvm
+
+#endif // ARMFPUNAME_H
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 5d45f64..a4004f3 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -20,6 +20,7 @@
 #include "ARMSubtarget.h"
 #include "ARMTargetMachine.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -175,6 +176,8 @@ class ARMFastISel : public FastISel {
 
     // Utility routines.
   private:
+    unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned OpNum,
+                                      unsigned Op);
     bool isTypeLegal(Type *Ty, MVT &VT);
     bool isLoadTypeLegal(Type *Ty, MVT &VT);
     bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
@@ -251,10 +254,10 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) {
 bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) {
   const MCInstrDesc &MCID = MI->getDesc();
 
-  // If we're a thumb2 or not NEON function we were handled via isPredicable.
+  // If we're a thumb2 or not NEON function we'll be handled via isPredicable.
   if ((MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainNEON ||
        AFI->isThumb2Function())
-    return false;
+    return MI->isPredicable();
 
   for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i)
     if (MCID.OpInfo[i].isPredicate())
@@ -275,7 +278,7 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   // Do we use a predicate? or...
   // Are we NEON in ARM mode and have a predicate operand? If so, I know
   // we're not predicable but add it anyways.
-  if (TII.isPredicable(MI) || isARMNEONPred(MI))
+  if (isARMNEONPred(MI))
     AddDefaultPred(MIB);
 
   // Do we optionally set a predicate?  Preds is size > 0 iff the predicate
@@ -290,6 +293,23 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   return MIB;
 }
 
+unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II,
+                                               unsigned Op, unsigned OpNum) {
+  if (TargetRegisterInfo::isVirtualRegister(Op)) {
+    const TargetRegisterClass *RegClass =
+        TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
+    if (!MRI.constrainRegClass(Op, RegClass)) {
+      // If it's not legal to COPY between the register classes, something
+      // has gone very wrong before we got here.
+      unsigned NewOp = createResultReg(RegClass);
+      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                              TII.get(TargetOpcode::COPY), NewOp).addReg(Op));
+      return NewOp;
+    }
+  }
+  return Op;
+}
+
 unsigned ARMFastISel::FastEmitInst_(unsigned MachineInstOpcode,
                                     const TargetRegisterClass* RC) {
   unsigned ResultReg = createResultReg(RC);
@@ -305,6 +325,9 @@ unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill));
@@ -325,6 +348,11 @@ unsigned ARMFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -348,6 +376,12 @@ unsigned ARMFastISel::FastEmitInst_rrr(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
+  Op2 = constrainOperandRegClass(II, Op1, 3);
+
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -372,6 +406,9 @@ unsigned ARMFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -394,6 +431,9 @@ unsigned ARMFastISel::FastEmitInst_rf(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operand is sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -417,6 +457,10 @@ unsigned ARMFastISel::FastEmitInst_rri(unsigned MachineInstOpcode,
   unsigned ResultReg = createResultReg(RC);
   const MCInstrDesc &II = TII.get(MachineInstOpcode);
 
+  // Make sure the input operands are sufficiently constrained to be legal
+  // for this instruction.
+  Op0 = constrainOperandRegClass(II, Op0, 1);
+  Op1 = constrainOperandRegClass(II, Op1, 2);
   if (II.getNumDefs() >= 1) {
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II, ResultReg)
                    .addReg(Op0, Op0IsKill * RegState::Kill)
@@ -609,6 +653,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
                     .addConstantPoolIndex(Idx));
   else
     // The extra immediate is for addrmode2.
+    DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                             TII.get(ARM::LDRcp), DestReg)
                     .addConstantPoolIndex(Idx)
@@ -628,6 +673,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
     (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned DestReg = createResultReg(RC);
 
+  // FastISel TLS support on non-Darwin is broken, punt to SelectionDAG.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  if (!Subtarget->isTargetDarwin() && IsThreadLocal) return 0;
+
   // Use movw+movt when possible, it avoids constant pool entries.
   // Darwin targets don't support movt with Reloc::Static, see
   // ARMTargetLowering::LowerGlobalAddressDarwin.  Other targets only support
@@ -679,6 +729,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
       AddOptionalDefs(MIB);
     } else {
       // The extra immediate is for addrmode2.
+      DestReg = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg, 0);
       MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
                     DestReg)
         .addConstantPoolIndex(Idx)
@@ -814,22 +865,19 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
   switch (Opcode) {
     default:
     break;
-    case Instruction::BitCast: {
+    case Instruction::BitCast:
       // Look through bitcasts.
       return ARMComputeAddress(U->getOperand(0), Addr);
-    }
-    case Instruction::IntToPtr: {
+    case Instruction::IntToPtr:
       // Look past no-op inttoptrs.
       if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
         return ARMComputeAddress(U->getOperand(0), Addr);
       break;
-    }
-    case Instruction::PtrToInt: {
+    case Instruction::PtrToInt:
       // Look past no-op ptrtoints.
       if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
         return ARMComputeAddress(U->getOperand(0), Addr);
       break;
-    }
     case Instruction::GetElementPtr: {
       Address SavedAddr = Addr;
       int TmpOffset = Addr.Offset;
@@ -852,13 +900,8 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
               TmpOffset += CI->getSExtValue() * S;
               break;
             }
-            if (isa<AddOperator>(Op) &&
-                (!isa<Instruction>(Op) ||
-                 FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
-                 == FuncInfo.MBB) &&
-                isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
-              // An add (in the same block) with a constant operand. Fold the
-              // constant.
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
               ConstantInt *CI =
               cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
               TmpOffset += CI->getSExtValue() * S;
@@ -1025,7 +1068,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
           useAM3 = true;
         }
       }
-      RC = &ARM::GPRRegClass;
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
       break;
     case MVT::i16:
       if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
@@ -1040,7 +1083,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
         Opc = isZExt ? ARM::LDRH : ARM::LDRSH;
         useAM3 = true;
       }
-      RC = &ARM::GPRRegClass;
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
       break;
     case MVT::i32:
       if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
@@ -1054,7 +1097,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
       } else {
         Opc = ARM::LDRi12;
       }
-      RC = &ARM::GPRRegClass;
+      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
       break;
     case MVT::f32:
       if (!Subtarget->hasVFP2()) return false;
@@ -1063,7 +1106,7 @@ bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
         needVMOV = true;
         VT = MVT::i32;
         Opc = isThumb2 ? ARM::t2LDRi12 : ARM::LDRi12;
-        RC = &ARM::GPRRegClass;
+        RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
       } else {
         Opc = ARM::VLDRS;
         RC = TLI.getRegClassFor(VT);
@@ -1136,6 +1179,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
         (const TargetRegisterClass*)&ARM::tGPRRegClass :
         (const TargetRegisterClass*)&ARM::GPRRegClass);
       unsigned Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
+      SrcReg = constrainOperandRegClass(TII.get(Opc), SrcReg, 1);
       AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                               TII.get(Opc), Res)
                       .addReg(SrcReg).addImm(1));
@@ -1207,6 +1251,7 @@ bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr,
   ARMSimplifyAddress(Addr, VT, useAM3);
 
   // Create the base instruction, then add the operands.
+  SrcReg = constrainOperandRegClass(TII.get(StrOpc), SrcReg, 0);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                                     TII.get(StrOpc))
                             .addReg(SrcReg);
@@ -1330,6 +1375,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
         (isLoadTypeLegal(TI->getOperand(0)->getType(), SourceVT))) {
       unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
       unsigned OpReg = getRegForValue(TI->getOperand(0));
+      OpReg = constrainOperandRegClass(TII.get(TstOpc), OpReg, 0);
       AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                               TII.get(TstOpc))
                       .addReg(OpReg).addImm(1));
@@ -1367,6 +1413,7 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
   // and it left a value for us in a virtual register.  Ergo, we test
   // the one-bit value left in the virtual register.
   unsigned TstOpc = isThumb2 ? ARM::t2TSTri : ARM::TSTri;
+  CmpReg = constrainOperandRegClass(TII.get(TstOpc), CmpReg, 0);
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TstOpc))
                   .addReg(CmpReg).addImm(1));
 
@@ -1491,13 +1538,15 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
     }
   }
 
+  const MCInstrDesc &II = TII.get(CmpOpc);
+  SrcReg1 = constrainOperandRegClass(II, SrcReg1, 0);
   if (!UseImm) {
-    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-                            TII.get(CmpOpc))
+    SrcReg2 = constrainOperandRegClass(II, SrcReg2, 1);
+    AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
                     .addReg(SrcReg1).addReg(SrcReg2));
   } else {
     MachineInstrBuilder MIB;
-    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
+    MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II)
       .addReg(SrcReg1);
 
     // Only add immediate for icmp as the immediate for fcmp is an implicit 0.0.
@@ -1696,6 +1745,7 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
   }
 
   unsigned CmpOpc = isThumb2 ? ARM::t2CMPri : ARM::CMPri;
+  CondReg = constrainOperandRegClass(TII.get(CmpOpc), CondReg, 0);
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc))
                   .addReg(CondReg).addImm(0));
 
@@ -1712,12 +1762,16 @@ bool ARMFastISel::SelectSelect(const Instruction *I) {
       MovCCOpc = isThumb2 ? ARM::t2MVNCCi : ARM::MVNCCi;
   }
   unsigned ResultReg = createResultReg(RC);
-  if (!UseImm)
+  if (!UseImm) {
+    Op2Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op2Reg, 1);
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 2);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
     .addReg(Op2Reg).addReg(Op1Reg).addImm(ARMCC::NE).addReg(ARM::CPSR);
-  else
+  } else {
+    Op1Reg = constrainOperandRegClass(TII.get(MovCCOpc), Op1Reg, 1);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovCCOpc), ResultReg)
     .addReg(Op1Reg).addImm(Imm).addImm(ARMCC::EQ).addReg(ARM::CPSR);
+  }
   UpdateValueMap(I, ResultReg);
   return true;
 }
@@ -1802,7 +1856,9 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
   unsigned SrcReg2 = getRegForValue(I->getOperand(1));
   if (SrcReg2 == 0) return false;
 
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
+  SrcReg1 = constrainOperandRegClass(TII.get(Opc), SrcReg1, 1);
+  SrcReg2 = constrainOperandRegClass(TII.get(Opc), SrcReg2, 2);
   AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                           TII.get(Opc), ResultReg)
                   .addReg(SrcReg1).addReg(SrcReg2));
@@ -1930,7 +1986,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
           !VA.isRegLoc() || !ArgLocs[++i].isRegLoc())
         return false;
     } else {
-      switch (static_cast<EVT>(ArgVT).getSimpleVT().SimpleTy) {
+      switch (ArgVT.SimpleTy) {
       default:
         return false;
       case MVT::i1:
@@ -1985,7 +2041,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args,
       case CCValAssign::ZExt: {
         MVT DestVT = VA.getLocVT();
         Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true);
-        assert (Arg != 0 && "Failed to emit a sext");
+        assert (Arg != 0 && "Failed to emit a zext");
         ArgVT = DestVT;
         break;
       }
@@ -2182,10 +2238,14 @@ unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
 }
 
 unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
+  // Manually compute the global's type to avoid building it when unnecessary.
+  Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0);
+  EVT LCREVT = TLI.getValueType(GVTy);
+  if (!LCREVT.isSimple()) return 0;
+
   GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
                                        GlobalValue::ExternalLinkage, 0, Name);
-  EVT LCREVT = TLI.getValueType(GV->getType());
-  if (!LCREVT.isSimple()) return 0;
+  assert(GV->getType() == GVTy && "We miscomputed the type for the global!");
   return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
 }
 
@@ -2403,15 +2463,22 @@ bool ARMFastISel::SelectCall(const Instruction *I,
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                                     DL, TII.get(CallOpc));
 
+  unsigned char OpFlags = 0;
+
+  // Add MO_PLT for global address or external symbol in the PIC relocation
+  // model.
+  if (Subtarget->isTargetELF() && TM.getRelocationModel() == Reloc::PIC_)
+    OpFlags = ARMII::MO_PLT;
+
   // ARM calls don't take a predicate, but tBL / tBLX do.
   if(isThumb2)
     AddDefaultPred(MIB);
   if (UseReg)
     MIB.addReg(CalleeReg);
   else if (!IntrMemName)
-    MIB.addGlobalAddress(GV, 0, 0);
+    MIB.addGlobalAddress(GV, 0, OpFlags);
   else
-    MIB.addExternalSymbol(IntrMemName, 0);
+    MIB.addExternalSymbol(IntrMemName, OpFlags);
 
   // Add implicit physical register uses to the call.
   for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
@@ -2602,47 +2669,136 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
                                     bool isZExt) {
   if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
     return 0;
+  if (SrcVT != MVT::i16 && SrcVT != MVT::i8 && SrcVT != MVT::i1)
+    return 0;
 
-  unsigned Opc;
-  bool isBoolZext = false;
-  const TargetRegisterClass *RC;
-  switch (SrcVT.SimpleTy) {
-  default: return 0;
-  case MVT::i16:
-    if (!Subtarget->hasV6Ops()) return 0;
-    RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
-    if (isZExt)
-      Opc = isThumb2 ? ARM::t2UXTH : ARM::UXTH;
-    else
-      Opc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
-    break;
-  case MVT::i8:
-    if (!Subtarget->hasV6Ops()) return 0;
-    RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRnopcRegClass;
-    if (isZExt)
-      Opc = isThumb2 ? ARM::t2UXTB : ARM::UXTB;
-    else
-      Opc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
-    break;
-  case MVT::i1:
-    if (isZExt) {
-      RC = isThumb2 ? &ARM::rGPRRegClass : &ARM::GPRRegClass;
-      Opc = isThumb2 ? ARM::t2ANDri : ARM::ANDri;
-      isBoolZext = true;
-      break;
+  // Table of which combinations can be emitted as a single instruction,
+  // and which will require two.
+  static const uint8_t isSingleInstrTbl[3][2][2][2] = {
+    //            ARM                     Thumb
+    //           !hasV6Ops  hasV6Ops     !hasV6Ops  hasV6Ops
+    //    ext:     s  z      s  z          s  z      s  z
+    /*  1 */ { { { 0, 1 }, { 0, 1 } }, { { 0, 0 }, { 0, 1 } } },
+    /*  8 */ { { { 0, 1 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } },
+    /* 16 */ { { { 0, 0 }, { 1, 1 } }, { { 0, 0 }, { 1, 1 } } }
+  };
+
+  // Target registers for:
+  //  - For ARM can never be PC.
+  //  - For 16-bit Thumb are restricted to lower 8 registers.
+  //  - For 32-bit Thumb are restricted to non-SP and non-PC.
+  static const TargetRegisterClass *RCTbl[2][2] = {
+    // Instructions: Two                     Single
+    /* ARM      */ { &ARM::GPRnopcRegClass, &ARM::GPRnopcRegClass },
+    /* Thumb    */ { &ARM::tGPRRegClass,    &ARM::rGPRRegClass    }
+  };
+
+  // Table governing the instruction(s) to be emitted.
+  static const struct InstructionTable {
+    uint32_t Opc   : 16;
+    uint32_t hasS  :  1; // Some instructions have an S bit, always set it to 0.
+    uint32_t Shift :  7; // For shift operand addressing mode, used by MOVsi.
+    uint32_t Imm   :  8; // All instructions have either a shift or a mask.
+  } IT[2][2][3][2] = {
+    { // Two instructions (first is left shift, second is in this table).
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  31 },
+        /*  1 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  31 } },
+        /*  8 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  24 },
+        /*  8 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  24 } },
+        /* 16 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  16 },
+        /* 16 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  16 } }
+      },
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  31 },
+        /*  1 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  31 } },
+        /*  8 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  24 },
+        /*  8 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  24 } },
+        /* 16 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  16 },
+        /* 16 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  16 } }
+      }
+    },
+    { // Single instruction.
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::SXTB   , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::SXTH   , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::UXTH   , 0, ARM_AM::no_shift,   0 } }
+      },
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::t2SXTB , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::t2SXTH , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::t2UXTH , 0, ARM_AM::no_shift,   0 } }
+      }
     }
-    return 0;
+  };
+
+  unsigned SrcBits = SrcVT.getSizeInBits();
+  unsigned DestBits = DestVT.getSizeInBits();
+  (void) DestBits;
+  assert((SrcBits < DestBits) && "can only extend to larger types");
+  assert((DestBits == 32 || DestBits == 16 || DestBits == 8) &&
+         "other sizes unimplemented");
+  assert((SrcBits == 16 || SrcBits == 8 || SrcBits == 1) &&
+         "other sizes unimplemented");
+
+  bool hasV6Ops = Subtarget->hasV6Ops();
+  unsigned Bitness = SrcBits / 8;  // {1,8,16}=>{0,1,2}
+  assert((Bitness < 3) && "sanity-check table bounds");
+
+  bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt];
+  const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr];
+  const InstructionTable *ITP = &IT[isSingleInstr][isThumb2][Bitness][isZExt];
+  unsigned Opc = ITP->Opc;
+  assert(ARM::KILL != Opc && "Invalid table entry");
+  unsigned hasS = ITP->hasS;
+  ARM_AM::ShiftOpc Shift = (ARM_AM::ShiftOpc) ITP->Shift;
+  assert(((Shift == ARM_AM::no_shift) == (Opc != ARM::MOVsi)) &&
+         "only MOVsi has shift operand addressing mode");
+  unsigned Imm = ITP->Imm;
+
+  // 16-bit Thumb instructions always set CPSR (unless they're in an IT block).
+  bool setsCPSR = &ARM::tGPRRegClass == RC;
+  unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::MOVsi;
+  unsigned ResultReg;
+  // MOVsi encodes shift and immediate in shift operand addressing mode.
+  // The following condition has the same value when emitting two
+  // instruction sequences: both are shifts.
+  bool ImmIsSO = (Shift != ARM_AM::no_shift);
+
+  // Either one or two instructions are emitted.
+  // They're always of the form:
+  //   dst = in OP imm
+  // CPSR is set only by 16-bit Thumb instructions.
+  // Predicate, if any, is AL.
+  // S bit, if available, is always 0.
+  // When two are emitted the first's result will feed as the second's input,
+  // that value is then dead.
+  unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2;
+  for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) {
+    ResultReg = createResultReg(RC);
+    bool isLsl = (0 == Instr) && !isSingleInstr;
+    unsigned Opcode = isLsl ? LSLOpc : Opc;
+    ARM_AM::ShiftOpc ShiftAM = isLsl ? ARM_AM::lsl : Shift;
+    unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm;
+    bool isKill = 1 == Instr;
+    MachineInstrBuilder MIB = BuildMI(
+        *FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opcode), ResultReg);
+    if (setsCPSR)
+      MIB.addReg(ARM::CPSR, RegState::Define);
+    SrcReg = constrainOperandRegClass(TII.get(Opcode), SrcReg, 1 + setsCPSR);
+    AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
+    if (hasS)
+      AddDefaultCC(MIB);
+    // Second instruction consumes the first's result.
+    SrcReg = ResultReg;
   }
 
-  unsigned ResultReg = createResultReg(RC);
-  MachineInstrBuilder MIB;
-  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
-        .addReg(SrcReg);
-  if (isBoolZext)
-    MIB.addImm(1);
-  else
-    MIB.addImm(0);
-  AddOptionalDefs(MIB);
   return ResultReg;
 }
 
@@ -2707,7 +2863,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
     if (Reg2 == 0) return false;
   }
 
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(MVT::i32));
+  unsigned ResultReg = createResultReg(&ARM::GPRnopcRegClass);
   if(ResultReg == 0) return false;
 
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
@@ -2797,6 +2953,25 @@ bool ARMFastISel::TargetSelectInstruction(const Instruction *I) {
   return false;
 }
 
+namespace {
+// This table describes sign- and zero-extend instructions which can be
+// folded into a preceding load. All of these extends have an immediate
+// (sometimes a mask and sometimes a shift) that's applied after
+// extension.
+const struct FoldableLoadExtendsStruct {
+  uint16_t Opc[2];  // ARM, Thumb.
+  uint8_t ExpectedImm;
+  uint8_t isZExt     : 1;
+  uint8_t ExpectedVT : 7;
+} FoldableLoadExtends[] = {
+  { { ARM::SXTH,  ARM::t2SXTH  },   0, 0, MVT::i16 },
+  { { ARM::UXTH,  ARM::t2UXTH  },   0, 1, MVT::i16 },
+  { { ARM::ANDri, ARM::t2ANDri }, 255, 1, MVT::i8  },
+  { { ARM::SXTB,  ARM::t2SXTB  },   0, 0, MVT::i8  },
+  { { ARM::UXTB,  ARM::t2UXTB  },   0, 1, MVT::i8  }
+};
+}
+
 /// \brief The specified machine instr operand is a vreg, and that
 /// vreg is being provided by the specified load instruction.  If possible,
 /// try to fold the load as an operand to the instruction, returning true if
@@ -2812,26 +2987,23 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
   // ldrb r1, [r0]       ldrb r1, [r0]
   // uxtb r2, r1     =>
   // mov  r3, r2         mov  r3, r1
-  bool isZExt = true;
-  switch(MI->getOpcode()) {
-    default: return false;
-    case ARM::SXTH:
-    case ARM::t2SXTH:
-      isZExt = false;
-    case ARM::UXTH:
-    case ARM::t2UXTH:
-      if (VT != MVT::i16)
-        return false;
-    break;
-    case ARM::SXTB:
-    case ARM::t2SXTB:
-      isZExt = false;
-    case ARM::UXTB:
-    case ARM::t2UXTB:
-      if (VT != MVT::i8)
-        return false;
-    break;
+  if (MI->getNumOperands() < 3 || !MI->getOperand(2).isImm())
+    return false;
+  const uint64_t Imm = MI->getOperand(2).getImm();
+
+  bool Found = false;
+  bool isZExt;
+  for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends);
+       i != e; ++i) {
+    if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() &&
+        (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm &&
+        MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) {
+      Found = true;
+      isZExt = FoldableLoadExtends[i].isZExt;
+    }
   }
+  if (!Found) return false;
+
   // See if we can handle this address.
   Address Addr;
   if (!ARMComputeAddress(LI->getOperand(0), Addr)) return false;
@@ -2854,12 +3026,14 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
   unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT));
   // Load value.
   if (isThumb2) {
+    DestReg1 = constrainOperandRegClass(TII.get(ARM::t2LDRpci), DestReg1, 0);
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
                             TII.get(ARM::t2LDRpci), DestReg1)
                     .addConstantPoolIndex(Idx));
     Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
   } else {
     // The extra immediate is for addrmode2.
+    DestReg1 = constrainOperandRegClass(TII.get(ARM::LDRcp), DestReg1, 0);
     AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                             DL, TII.get(ARM::LDRcp), DestReg1)
                     .addConstantPoolIndex(Idx).addImm(0));
@@ -2873,6 +3047,9 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
   }
 
   unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT));
+  DestReg2 = constrainOperandRegClass(TII.get(Opc), DestReg2, 0);
+  DestReg1 = constrainOperandRegClass(TII.get(Opc), DestReg1, 1);
+  GlobalBaseReg = constrainOperandRegClass(TII.get(Opc), GlobalBaseReg, 2);
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
                                     DL, TII.get(Opc), DestReg2)
                             .addReg(DestReg1)
@@ -2938,12 +3115,10 @@ bool ARMFastISel::FastLowerArguments() {
     ARM::R0, ARM::R1, ARM::R2, ARM::R3
   };
 
-  const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32);
+  const TargetRegisterClass *RC = &ARM::rGPRRegClass;
   Idx = 0;
   for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
        I != E; ++I, ++Idx) {
-    if (I->use_empty())
-      continue;
     unsigned SrcReg = GPRArgRegs[Idx];
     unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
@@ -2961,13 +3136,23 @@ bool ARMFastISel::FastLowerArguments() {
 namespace llvm {
   FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
                                 const TargetLibraryInfo *libInfo) {
-    // Completely untested on non-iOS.
     const TargetMachine &TM = funcInfo.MF->getTarget();
 
-    // Darwin and thumb1 only for now.
     const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
-    if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only())
+    // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
+    bool UseFastISel = false;
+    UseFastISel |= Subtarget->isTargetIOS() && !Subtarget->isThumb1Only();
+    UseFastISel |= Subtarget->isTargetLinux() && !Subtarget->isThumb();
+    UseFastISel |= Subtarget->isTargetNaCl() && !Subtarget->isThumb();
+
+    if (UseFastISel) {
+      // iOS always has a FP for backtracking, force other targets
+      // to keep their FP when doing FastISel. The emitted code is
+      // currently superior, and in cases like test-suite's lencod
+      // FastISel isn't quite correct when FP is eliminated.
+      TM.Options.NoFramePointerElim = true;
       return new ARMFastISel(funcInfo, libInfo);
+    }
     return 0;
   }
 }
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
new file mode 100644
index 0000000..dafc4b3
--- /dev/null
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -0,0 +1,93 @@
+//===-- ARMFeatures.h - Checks for ARM instruction features ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the code shared between ARM CodeGen and ARM MC
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_FEATURES_H
+#define TARGET_ARM_FEATURES_H
+
+#include "ARM.h"
+
+namespace llvm {
+
+template<typename InstrType> // could be MachineInstr or MCInst
+inline bool isV8EligibleForIT(InstrType *Instr, int BLXOperandIndex = 0) {
+  switch (Instr->getOpcode()) {
+  default:
+    return false;
+  case ARM::tADC:
+  case ARM::tADDi3:
+  case ARM::tADDi8:
+  case ARM::tADDrSPi:
+  case ARM::tADDrr:
+  case ARM::tAND:
+  case ARM::tASRri:
+  case ARM::tASRrr:
+  case ARM::tBIC:
+  case ARM::tCMNz:
+  case ARM::tCMPi8:
+  case ARM::tCMPr:
+  case ARM::tEOR:
+  case ARM::tLDRBi:
+  case ARM::tLDRBr:
+  case ARM::tLDRHi:
+  case ARM::tLDRHr:
+  case ARM::tLDRSB:
+  case ARM::tLDRSH:
+  case ARM::tLDRi:
+  case ARM::tLDRr:
+  case ARM::tLDRspi:
+  case ARM::tLSLri:
+  case ARM::tLSLrr:
+  case ARM::tLSRri:
+  case ARM::tLSRrr:
+  case ARM::tMOVi8:
+  case ARM::tMUL:
+  case ARM::tMVN:
+  case ARM::tORR:
+  case ARM::tROR:
+  case ARM::tRSB:
+  case ARM::tSBC:
+  case ARM::tSTRBi:
+  case ARM::tSTRBr:
+  case ARM::tSTRHi:
+  case ARM::tSTRHr:
+  case ARM::tSTRi:
+  case ARM::tSTRr:
+  case ARM::tSTRspi:
+  case ARM::tSUBi3:
+  case ARM::tSUBi8:
+  case ARM::tSUBrr:
+  case ARM::tTST:
+    return true;
+// there are some "conditionally deprecated" opcodes
+  case ARM::tADDspr:
+    return Instr->getOperand(2).getReg() != ARM::PC;
+  // ADD PC, SP and BLX PC were always unpredictable,
+  // now on top of it they're deprecated
+  case ARM::tADDrSP:
+  case ARM::tBX:
+    return Instr->getOperand(0).getReg() != ARM::PC;
+  case ARM::tBLXr:
+    return Instr->getOperand(BLXOperandIndex).getReg() != ARM::PC;
+  case ARM::tADDhirr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(2).getReg() != ARM::PC;
+  case ARM::tCMPhir:
+  case ARM::tMOVr:
+    return Instr->getOperand(0).getReg() != ARM::PC &&
+           Instr->getOperand(1).getReg() != ARM::PC;
+  }
+}
+
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 483802b..d32bdbc 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -82,22 +82,11 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
   return hasReservedCallFrame(MF) || MF.getFrameInfo()->hasVarSizedObjects();
 }
 
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
 static bool isCSRestore(MachineInstr *MI,
                         const ARMBaseInstrInfo &TII,
                         const uint16_t *CSRegs) {
   // Integer spill area is handled with "pop".
-  if (MI->getOpcode() == ARM::LDMIA_RET ||
-      MI->getOpcode() == ARM::t2LDMIA_RET ||
-      MI->getOpcode() == ARM::LDMIA_UPD ||
-      MI->getOpcode() == ARM::t2LDMIA_UPD ||
-      MI->getOpcode() == ARM::VLDMDIA_UPD) {
+  if (isPopOpcode(MI->getOpcode())) {
     // The first two operands are predicates. The last two are
     // imp-def and imp-use of SP. Check everything in between.
     for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
@@ -115,20 +104,31 @@ static bool isCSRestore(MachineInstr *MI,
   return false;
 }
 
-static void
-emitSPUpdate(bool isARM,
-             MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
-             DebugLoc dl, const ARMBaseInstrInfo &TII,
-             int NumBytes, unsigned MIFlags = MachineInstr::NoFlags,
-             ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+static void emitRegPlusImmediate(bool isARM, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                                 const ARMBaseInstrInfo &TII, unsigned DestReg,
+                                 unsigned SrcReg, int NumBytes,
+                                 unsigned MIFlags = MachineInstr::NoFlags,
+                                 ARMCC::CondCodes Pred = ARMCC::AL,
+                                 unsigned PredReg = 0) {
   if (isARM)
-    emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+    emitARMRegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
                             Pred, PredReg, TII, MIFlags);
   else
-    emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+    emitT2RegPlusImmediate(MBB, MBBI, dl, DestReg, SrcReg, NumBytes,
                            Pred, PredReg, TII, MIFlags);
 }
 
+static void emitSPUpdate(bool isARM, MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                         const ARMBaseInstrInfo &TII, int NumBytes,
+                         unsigned MIFlags = MachineInstr::NoFlags,
+                         ARMCC::CondCodes Pred = ARMCC::AL,
+                         unsigned PredReg = 0) {
+  emitRegPlusImmediate(isARM, MBB, MBBI, dl, TII, ARM::SP, ARM::SP, NumBytes,
+                       MIFlags, Pred, PredReg);
+}
+
 void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -141,7 +141,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   assert(!AFI->isThumb1OnlyFunction() &&
          "This emitPrologue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
-  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   unsigned NumBytes = MFI->getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -174,6 +175,10 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
     unsigned Reg = CSI[i].getReg();
     int FI = CSI[i].getFrameIdx();
     switch (Reg) {
+    case ARM::R0:
+    case ARM::R1:
+    case ARM::R2:
+    case ARM::R3:
     case ARM::R4:
     case ARM::R5:
     case ARM::R6:
@@ -181,73 +186,61 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
     case ARM::LR:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      AFI->addGPRCalleeSavedArea1Frame(FI);
       GPRCS1Size += 4;
       break;
     case ARM::R8:
     case ARM::R9:
     case ARM::R10:
     case ARM::R11:
+    case ARM::R12:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      if (STI.isTargetIOS()) {
-        AFI->addGPRCalleeSavedArea2Frame(FI);
+      if (STI.isTargetIOS())
         GPRCS2Size += 4;
-      } else {
-        AFI->addGPRCalleeSavedArea1Frame(FI);
+      else
         GPRCS1Size += 4;
-      }
       break;
     default:
       // This is a DPR. Exclude the aligned DPRCS2 spills.
       if (Reg == ARM::D8)
         D8SpillFI = FI;
-      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs()) {
-        AFI->addDPRCalleeSavedAreaFrame(FI);
+      if (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())
         DPRCSSize += 8;
-      }
     }
   }
 
   // Move past area 1.
-  if (GPRCS1Size > 0) MBBI++;
-
-  // Set FP to point to the stack slot that contains the previous FP.
-  // For iOS, FP is R7, which has now been stored in spill area 1.
-  // Otherwise, if this is not iOS, all the callee-saved registers go
-  // into spill area 1, including the FP in R11.  In either case, it is
-  // now safe to emit this assignment.
-  bool HasFP = hasFP(MF);
-  if (HasFP) {
-    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
-    MachineInstrBuilder MIB =
-      BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
-      .addFrameIndex(FramePtrSpillFI).addImm(0)
-      .setMIFlag(MachineInstr::FrameSetup);
-    AddDefaultCC(AddDefaultPred(MIB));
-  }
-
-  // Move past area 2.
-  if (GPRCS2Size > 0) MBBI++;
+  MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush;
+  if (GPRCS1Size > 0)
+    FramePtrPush = LastPush = MBBI++;
 
   // Determine starting offsets of spill areas.
+  bool HasFP = hasFP(MF);
   unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
   unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
   unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
-  if (HasFP)
+  int FramePtrOffsetInPush = 0;
+  if (HasFP) {
+    FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
     AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
                                 NumBytes);
+  }
   AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
   AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
   AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
 
+  // Move past area 2.
+  if (GPRCS2Size > 0) {
+    LastPush = MBBI++;
+  }
+
   // Move past area 3.
   if (DPRCSSize > 0) {
-    MBBI++;
+    LastPush = MBBI++;
     // Since vpush register list cannot have gaps, there may be multiple vpush
     // instructions in the prologue.
     while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
-      MBBI++;
+      LastPush = MBBI++;
   }
 
   // Move past the aligned DPRCS2 area.
@@ -263,8 +256,13 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
 
   if (NumBytes) {
     // Adjust SP after all the callee-save spills.
-    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
-                 MachineInstr::FrameSetup);
+    if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes)) {
+      if (LastPush == FramePtrPush)
+        FramePtrOffsetInPush += NumBytes;
+    } else
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
+                   MachineInstr::FrameSetup);
+
     if (HasFP && isARM)
       // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
       // Note it's not safe to do this in Thumb2 mode because it would have
@@ -277,6 +275,18 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
       AFI->setShouldRestoreSPFromFP(true);
   }
 
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For iOS, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not iOS, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it
+  // is in area one and the adjustment needs to take place just after
+  // that push.
+  if (HasFP)
+    emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII,
+                         FramePtr, ARM::SP, FramePtrOffsetInPush,
+                         MachineInstr::FrameSetup);
+
+
   if (STI.isTargetELF() && hasFP(MF))
     MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
                              AFI->getFramePtrSpillOffset());
@@ -357,7 +367,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
          "This emitEpilogue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
 
-  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   int NumBytes = (int)MFI->getStackSize();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
 
@@ -371,11 +382,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
   } else {
     // Unwind MBBI to point to first LDR / VLDRD.
-    const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+    const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
     if (MBBI != MBB.begin()) {
-      do
+      do {
         --MBBI;
-      while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+      } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
       if (!isCSRestore(MBBI, TII, CSRegs))
         ++MBBI;
     }
@@ -419,8 +430,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
                                  ARM::SP)
             .addReg(FramePtr));
       }
-    } else if (NumBytes)
-      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+    } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
+        emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
 
     // Increment past our save areas.
     if (AFI->getDPRCalleeSavedAreaSize()) {
@@ -499,12 +510,6 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
 
   FrameReg = ARM::SP;
   Offset += SPAdj;
-  if (AFI->isGPRCalleeSavedArea1Frame(FI))
-    return Offset - AFI->getGPRCalleeSavedArea1Offset();
-  else if (AFI->isGPRCalleeSavedArea2Frame(FI))
-    return Offset - AFI->getGPRCalleeSavedArea2Offset();
-  else if (AFI->isDPRCalleeSavedAreaFrame(FI))
-    return Offset - AFI->getDPRCalleeSavedAreaOffset();
 
   // SP can move around if there are allocas.  We may also lose track of SP
   // when emergency spilling inside a non-reserved call frame setup.
@@ -656,6 +661,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
   unsigned RetOpcode = MI->getOpcode();
   bool isTailCall = (RetOpcode == ARM::TCRETURNdi ||
                      RetOpcode == ARM::TCRETURNri);
+  bool isInterrupt =
+      RetOpcode == ARM::SUBS_PC_LR || RetOpcode == ARM::t2SUBS_PC_LR;
 
   SmallVector<unsigned, 4> Regs;
   unsigned i = CSI.size();
@@ -670,7 +677,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
       if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs)
         continue;
 
-      if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) {
+      if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
+          STI.hasV5TOps()) {
         Reg = ARM::PC;
         LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET;
         // Fold the return instruction into the LDM.
@@ -1197,7 +1205,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 
   // Don't spill FP if the frame can be eliminated. This is determined
   // by scanning the callee-save registers to see if any is used.
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
   for (unsigned i = 0; CSRegs[i]; ++i) {
     unsigned Reg = CSRegs[i];
     bool Spilled = false;
@@ -1224,6 +1232,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
       case ARM::LR:
         LRSpilled = true;
         // Fallthrough
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
       case ARM::R4: case ARM::R5:
       case ARM::R6: case ARM::R7:
         CS1Spilled = true;
@@ -1238,6 +1248,8 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
       }
 
       switch (Reg) {
+      case ARM::R0: case ARM::R1:
+      case ARM::R2: case ARM::R3:
       case ARM::R4: case ARM::R5:
       case ARM::R6: case ARM::R7:
       case ARM::LR:
@@ -1293,8 +1305,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     if (!LRSpilled && CS1Spilled) {
       MRI.setPhysRegUsed(ARM::LR);
       NumGPRSpills++;
-      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
-                                    UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+      SmallVectorImpl<unsigned>::iterator LRPos;
+      LRPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+                        (unsigned)ARM::LR);
+      if (LRPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(LRPos);
+
       ForceLRSpill = false;
       ExtraCSSpill = true;
     }
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 1240169..c69d313 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -44,10 +44,16 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
     if (LastMI && (MCID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
       MachineInstr *DefMI = LastMI;
       const MCInstrDesc &LastMCID = LastMI->getDesc();
+      const TargetMachine &TM =
+        MI->getParent()->getParent()->getTarget();
+      const ARMBaseInstrInfo &TII =
+        *static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+
       // Skip over one non-VFP / NEON instruction.
       if (!LastMI->isBarrier() &&
           // On A9, AGU and NEON/FPU are muxed.
-          !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
+          !(TII.getSubtarget().isLikeA9() &&
+            (LastMI->mayLoad() || LastMI->mayStore())) &&
           (LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
         MachineBasicBlock::iterator I = LastMI;
         if (I != LastMI->getParent()->begin()) {
@@ -58,7 +64,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
 
       if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
           (TII.canCauseFpMLxStall(MI->getOpcode()) ||
-           hasRAWHazard(DefMI, MI, TRI))) {
+           hasRAWHazard(DefMI, MI, TII.getRegisterInfo()))) {
         // Try to schedule another instruction for the next 4 cycles.
         if (FpMLxStalls == 0)
           FpMLxStalls = 4;
diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h
index 98bfc4c..e1dcec3 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/lib/Target/ARM/ARMHazardRecognizer.h
@@ -28,21 +28,14 @@ class MachineInstr;
 /// ARM preRA scheduler uses an unspecialized instance of the
 /// ScoreboardHazardRecognizer.
 class ARMHazardRecognizer : public ScoreboardHazardRecognizer {
-  const ARMBaseInstrInfo &TII;
-  const ARMBaseRegisterInfo &TRI;
-  const ARMSubtarget &STI;
-
   MachineInstr *LastMI;
   unsigned FpMLxStalls;
 
 public:
   ARMHazardRecognizer(const InstrItineraryData *ItinData,
-                      const ARMBaseInstrInfo &tii,
-                      const ARMBaseRegisterInfo &tri,
-                      const ARMSubtarget &sti,
-                      const ScheduleDAG *DAG) :
-    ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"), TII(tii),
-    TRI(tri), STI(sti), LastMI(0) {}
+                      const ScheduleDAG *DAG)
+    : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
+      LastMI(0) {}
 
   virtual HazardType getHazardType(SUnit *SU, int Stalls);
   virtual void Reset();
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 9e1782e..87d1522 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -61,7 +61,6 @@ enum AddrMode2Type {
 
 class ARMDAGToDAGISel : public SelectionDAGISel {
   ARMBaseTargetMachine &TM;
-  const ARMBaseInstrInfo *TII;
 
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
   /// make the right decision when generating code for different targets.
@@ -71,7 +70,6 @@ public:
   explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
                            CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel), TM(tm),
-      TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())),
       Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
   }
 
@@ -132,6 +130,13 @@ public:
     return true;
   }
 
+  bool SelectCMOVPred(SDValue N, SDValue &Pred, SDValue &Reg) {
+    const ConstantSDNode *CN = cast<ConstantSDNode>(N);
+    Pred = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+    Reg = CurDAG->getRegister(ARM::CPSR, MVT::i32);
+    return true;
+  }
+
   bool SelectAddrMode2OffsetReg(SDNode *Op, SDValue N,
                              SDValue &Offset, SDValue &Opc);
   bool SelectAddrMode2OffsetImm(SDNode *Op, SDValue N,
@@ -177,6 +182,7 @@ public:
                                  SDValue &OffImm);
   bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
                              SDValue &OffReg, SDValue &ShImm);
+  bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
 
   inline bool is_so_imm(unsigned Imm) const {
     return ARM_AM::getSOImmVal(Imm) != -1;
@@ -240,21 +246,6 @@ private:
   /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
   SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, bool isSigned);
 
-  /// SelectCMOVOp - Select CMOV instructions for ARM.
-  SDNode *SelectCMOVOp(SDNode *N);
-  SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                              ARMCC::CondCodes CCVal, SDValue CCR,
-                              SDValue InFlag);
-  SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                               ARMCC::CondCodes CCVal, SDValue CCR,
-                               SDValue InFlag);
-  SDNode *SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                              ARMCC::CondCodes CCVal, SDValue CCR,
-                              SDValue InFlag);
-  SDNode *SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                               ARMCC::CondCodes CCVal, SDValue CCR,
-                               SDValue InFlag);
-
   // Select special operations if node forms integer ABS pattern
   SDNode *SelectABSOp(SDNode *N);
 
@@ -262,7 +253,7 @@ private:
 
   SDNode *SelectConcatVector(SDNode *N);
 
-  SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+  SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32, unsigned Op64);
 
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
@@ -364,7 +355,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
       continue;
 
     // Check if the AND mask is an immediate of the form: 000.....1111111100
-    unsigned TZ = CountTrailingZeros_32(And_imm);
+    unsigned TZ = countTrailingZeros(And_imm);
     if (TZ != 1 && TZ != 2)
       // Be conservative here. Shifter operands aren't always free. e.g. On
       // Swift, left shifter operand of 1 / 2 for free but others are not.
@@ -402,12 +393,12 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
     }
 
     // Now make the transformation.
-    Srl = CurDAG->getNode(ISD::SRL, Srl.getDebugLoc(), MVT::i32,
+    Srl = CurDAG->getNode(ISD::SRL, SDLoc(Srl), MVT::i32,
                           Srl.getOperand(0),
                           CurDAG->getConstant(Srl_imm+TZ, MVT::i32));
-    N1 = CurDAG->getNode(ISD::AND, N1.getDebugLoc(), MVT::i32,
+    N1 = CurDAG->getNode(ISD::AND, SDLoc(N1), MVT::i32,
                          Srl, CurDAG->getConstant(And_imm, MVT::i32));
-    N1 = CurDAG->getNode(ISD::SHL, N1.getDebugLoc(), MVT::i32,
+    N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
                          N1, CurDAG->getConstant(TZ, MVT::i32));
     CurDAG->UpdateNodeOperands(N, N0, N1);
   }  
@@ -423,7 +414,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (!CheckVMLxHazard)
     return true;
 
-  if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() &&
+  if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9() &&
       !Subtarget->isSwift())
     return true;
 
@@ -434,6 +425,9 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (Use->getOpcode() == ISD::CopyToReg)
     return true;
   if (Use->isMachineOpcode()) {
+    const ARMBaseInstrInfo *TII =
+      static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+
     const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
     if (MCID.mayStore())
       return true;
@@ -533,7 +527,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
     if (N.getOpcode() == ISD::FrameIndex) {
       // Match frame index.
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
       OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
       return true;
     }
@@ -557,7 +552,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
       Base   = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -703,7 +699,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                !(Subtarget->useMovt() &&
                  N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
@@ -724,7 +721,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       Offset = CurDAG->getRegister(0, MVT::i32);
 
@@ -901,7 +899,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     }
     Offset = CurDAG->getRegister(0, MVT::i32);
     Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
@@ -915,7 +914,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     }
     Offset = CurDAG->getRegister(0, MVT::i32);
 
@@ -960,7 +960,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                !(Subtarget->useMovt() &&
                  N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
@@ -978,7 +979,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     }
 
     ARM_AM::AddrOpc AddSub = ARM_AM::add;
@@ -1202,7 +1204,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
                                             SDValue &Base, SDValue &OffImm) {
   if (N.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    Base = CurDAG->getTargetFrameIndex(FI,
+                                       getTargetLowering()->getPointerTy());
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
@@ -1219,7 +1222,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1267,7 +1271,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
     if (N.getOpcode() == ISD::FrameIndex) {
       // Match frame index.
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
       OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
       return true;
     }
@@ -1297,7 +1302,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
       Base   = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1326,7 +1332,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1403,6 +1410,34 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
   return true;
 }
 
+bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
+                                                SDValue &OffImm) {
+  // This *must* succeed since it's used for the irreplacable ldrex and strex
+  // instructions.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+
+  if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N))
+    return true;
+
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS)
+    return true;
+
+  uint32_t RHSC = (int)RHS->getZExtValue();
+  if (RHSC > 1020 || RHSC % 4 != 0)
+    return true;
+
+  Base = N.getOperand(0);
+  if (Base.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy());
+  }
+
+  OffImm = CurDAG->getTargetConstant(RHSC / 4, MVT::i32);
+  return true;
+}
+
 //===--------------------------------------------------------------------===//
 
 /// getAL - Returns a ARMCC::AL immediate node.
@@ -1468,14 +1503,14 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
       SDValue Base = LD->getBasePtr();
       SDValue Ops[]= { Base, AMOpc, getAL(CurDAG),
                        CurDAG->getRegister(0, MVT::i32), Chain };
-      return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+      return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
                                     MVT::i32, MVT::Other, Ops);
     } else {
       SDValue Chain = LD->getChain();
       SDValue Base = LD->getBasePtr();
       SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
                        CurDAG->getRegister(0, MVT::i32), Chain };
-      return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32,
+      return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
                                     MVT::i32, MVT::Other, Ops);
     }
   }
@@ -1524,7 +1559,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
     SDValue Base = LD->getBasePtr();
     SDValue Ops[]= { Base, Offset, getAL(CurDAG),
                      CurDAG->getRegister(0, MVT::i32), Chain };
-    return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32,
+    return CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32, MVT::i32,
                                   MVT::Other, Ops);
   }
 
@@ -1533,7 +1568,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
 
 /// \brief Form a GPRPair pseudo register from a pair of GPR regs.
 SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDLoc dl(V0.getNode());
   SDValue RegClass =
     CurDAG->getTargetConstant(ARM::GPRPairRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::gsub_0, MVT::i32);
@@ -1544,7 +1579,7 @@ SDNode *ARMDAGToDAGISel::createGPRPairNode(EVT VT, SDValue V0, SDValue V1) {
 
 /// \brief Form a D register from a pair of S registers.
 SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDLoc dl(V0.getNode());
   SDValue RegClass =
     CurDAG->getTargetConstant(ARM::DPR_VFP2RegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
@@ -1555,7 +1590,7 @@ SDNode *ARMDAGToDAGISel::createSRegPairNode(EVT VT, SDValue V0, SDValue V1) {
 
 /// \brief Form a quad register from a pair of D registers.
 SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDLoc dl(V0.getNode());
   SDValue RegClass = CurDAG->getTargetConstant(ARM::QPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
@@ -1565,7 +1600,7 @@ SDNode *ARMDAGToDAGISel::createDRegPairNode(EVT VT, SDValue V0, SDValue V1) {
 
 /// \brief Form 4 consecutive D registers from a pair of Q registers.
 SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDLoc dl(V0.getNode());
   SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
@@ -1576,7 +1611,7 @@ SDNode *ARMDAGToDAGISel::createQRegPairNode(EVT VT, SDValue V0, SDValue V1) {
 /// \brief Form 4 consecutive S registers.
 SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDLoc dl(V0.getNode());
   SDValue RegClass =
     CurDAG->getTargetConstant(ARM::QPR_VFP2RegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::ssub_0, MVT::i32);
@@ -1591,7 +1626,7 @@ SDNode *ARMDAGToDAGISel::createQuadSRegsNode(EVT VT, SDValue V0, SDValue V1,
 /// \brief Form 4 consecutive D registers.
 SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDLoc dl(V0.getNode());
   SDValue RegClass = CurDAG->getTargetConstant(ARM::QQPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::dsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::dsub_1, MVT::i32);
@@ -1605,7 +1640,7 @@ SDNode *ARMDAGToDAGISel::createQuadDRegsNode(EVT VT, SDValue V0, SDValue V1,
 /// \brief Form 4 consecutive Q registers.
 SDNode *ARMDAGToDAGISel::createQuadQRegsNode(EVT VT, SDValue V0, SDValue V1,
                                    SDValue V2, SDValue V3) {
-  DebugLoc dl = V0.getNode()->getDebugLoc();
+  SDLoc dl(V0.getNode());
   SDValue RegClass = CurDAG->getTargetConstant(ARM::QQQQPRRegClassID, MVT::i32);
   SDValue SubReg0 = CurDAG->getTargetConstant(ARM::qsub_0, MVT::i32);
   SDValue SubReg1 = CurDAG->getTargetConstant(ARM::qsub_1, MVT::i32);
@@ -1689,7 +1724,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
                                    const uint16_t *QOpcodes0,
                                    const uint16_t *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue MemAddr, Align;
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
@@ -1821,7 +1856,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
                                    const uint16_t *QOpcodes0,
                                    const uint16_t *QOpcodes1) {
   assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue MemAddr, Align;
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
@@ -1966,7 +2001,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
                                          const uint16_t *DOpcodes,
                                          const uint16_t *QOpcodes) {
   assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue MemAddr, Align;
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
@@ -2084,7 +2119,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
                                       unsigned NumVecs,
                                       const uint16_t *Opcodes) {
   assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   SDValue MemAddr, Align;
   if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
@@ -2166,7 +2201,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
 SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
                                     unsigned Opc) {
   assert(NumVecs >= 2 && NumVecs <= 4 && "VTBL NumVecs out-of-range");
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
   unsigned FirstTblReg = IsExt ? 2 : 1;
 
@@ -2278,204 +2313,6 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
   return NULL;
 }
 
-SDNode *ARMDAGToDAGISel::
-SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                    ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  SDValue CPTmp0;
-  SDValue CPTmp1;
-  if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) {
-    unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
-    unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
-    unsigned Opc = 0;
-    switch (SOShOp) {
-    case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break;
-    case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break;
-    case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break;
-    case ARM_AM::ror: Opc = ARM::t2MOVCCror; break;
-    default:
-      llvm_unreachable("Unknown so_reg opcode!");
-    }
-    SDValue SOShImm =
-      CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32);
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6);
-  }
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                     ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  SDValue CPTmp0;
-  SDValue CPTmp1;
-  SDValue CPTmp2;
-  if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) {
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, CPTmp0, CPTmp2, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, ARM::MOVCCsi, MVT::i32, Ops, 6);
-  }
-
-  if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, ARM::MOVCCsr, MVT::i32, Ops, 7);
-  }
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectT2CMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                  ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
-  if (!T)
-    return 0;
-
-  unsigned Opc = 0;
-  unsigned TrueImm = T->getZExtValue();
-  if (is_t2_so_imm(TrueImm)) {
-    Opc = ARM::t2MOVCCi;
-  } else if (TrueImm <= 0xffff) {
-    Opc = ARM::t2MOVCCi16;
-  } else if (is_t2_so_imm_not(TrueImm)) {
-    TrueImm = ~TrueImm;
-    Opc = ARM::t2MVNCCi;
-  } else if (TrueVal.getNode()->hasOneUse() && Subtarget->hasV6T2Ops()) {
-    // Large immediate.
-    Opc = ARM::t2MOVCCi32imm;
-  }
-
-  if (Opc) {
-    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
-  }
-
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::
-SelectARMCMOVImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
-                   ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
-  ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
-  if (!T)
-    return 0;
-
-  unsigned Opc = 0;
-  unsigned TrueImm = T->getZExtValue();
-  bool isSoImm = is_so_imm(TrueImm);
-  if (isSoImm) {
-    Opc = ARM::MOVCCi;
-  } else if (Subtarget->hasV6T2Ops() && TrueImm <= 0xffff) {
-    Opc = ARM::MOVCCi16;
-  } else if (is_so_imm_not(TrueImm)) {
-    TrueImm = ~TrueImm;
-    Opc = ARM::MVNCCi;
-  } else if (TrueVal.getNode()->hasOneUse() &&
-             (Subtarget->hasV6T2Ops() || ARM_AM::isSOImmTwoPartVal(TrueImm))) {
-    // Large immediate.
-    Opc = ARM::MOVCCi32imm;
-  }
-
-  if (Opc) {
-    SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
-    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
-    SDValue Ops[] = { FalseVal, True, CC, CCR, InFlag };
-    return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
-  }
-
-  return 0;
-}
-
-SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
-  EVT VT = N->getValueType(0);
-  SDValue FalseVal = N->getOperand(0);
-  SDValue TrueVal  = N->getOperand(1);
-  SDValue CC = N->getOperand(2);
-  SDValue CCR = N->getOperand(3);
-  SDValue InFlag = N->getOperand(4);
-  assert(CC.getOpcode() == ISD::Constant);
-  assert(CCR.getOpcode() == ISD::Register);
-  ARMCC::CondCodes CCVal =
-    (ARMCC::CondCodes)cast<ConstantSDNode>(CC)->getZExtValue();
-
-  if (!Subtarget->isThumb1Only() && VT == MVT::i32) {
-    // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
-    // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
-    // Pattern complexity = 18  cost = 1  size = 0
-    if (Subtarget->isThumb()) {
-      SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal,
-                                        CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    } else {
-      SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal,
-                                         CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    }
-
-    // Pattern: (ARMcmov:i32 GPR:i32:$false,
-    //             (imm:i32)<<P:Pred_so_imm>>:$true,
-    //             (imm:i32):$cc)
-    // Emits: (MOVCCi:i32 GPR:i32:$false,
-    //           (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
-    // Pattern complexity = 10  cost = 1  size = 0
-    if (Subtarget->isThumb()) {
-      SDNode *Res = SelectT2CMOVImmOp(N, FalseVal, TrueVal,
-                                        CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectT2CMOVImmOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    } else {
-      SDNode *Res = SelectARMCMOVImmOp(N, FalseVal, TrueVal,
-                                         CCVal, CCR, InFlag);
-      if (!Res)
-        Res = SelectARMCMOVImmOp(N, TrueVal, FalseVal,
-                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
-      if (Res)
-        return Res;
-    }
-  }
-
-  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Pattern complexity = 6  cost = 1  size = 0
-  //
-  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
-  // Pattern complexity = 6  cost = 11  size = 0
-  //
-  // Also VMOVScc and VMOVDcc.
-  SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32);
-  SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag };
-  unsigned Opc = 0;
-  switch (VT.getSimpleVT().SimpleTy) {
-  default: llvm_unreachable("Illegal conditional move type!");
-  case MVT::i32:
-    Opc = Subtarget->isThumb()
-      ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo)
-      : ARM::MOVCCr;
-    break;
-  case MVT::f32:
-    Opc = ARM::VMOVScc;
-    break;
-  case MVT::f64:
-    Opc = ARM::VMOVDcc;
-    break;
-  }
-  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
-}
-
 /// Target-specific DAG combining for ISD::XOR.
 /// Target-independent combining lowers SELECT_CC nodes of the form
 /// select_cc setg[ge] X,  0,  X, -X
@@ -2524,30 +2361,45 @@ SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
   return createDRegPairNode(VT, N->getOperand(0), N->getOperand(1));
 }
 
-SDNode *ARMDAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+SDNode *ARMDAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
+                                      unsigned Op16,unsigned Op32,
+                                      unsigned Op64) {
+  // Mostly direct translation to the given operations, except that we preserve
+  // the AtomicOrdering for use later on.
+  AtomicSDNode *AN = cast<AtomicSDNode>(Node);
+  EVT VT = AN->getMemoryVT();
+
+  unsigned Op;
+  SDVTList VTs = CurDAG->getVTList(AN->getValueType(0), MVT::Other);
+  if (VT == MVT::i8)
+    Op = Op8;
+  else if (VT == MVT::i16)
+    Op = Op16;
+  else if (VT == MVT::i32)
+    Op = Op32;
+  else if (VT == MVT::i64) {
+    Op = Op64;
+    VTs = CurDAG->getVTList(MVT::i32, MVT::i32, MVT::Other);
+  } else
+    llvm_unreachable("Unexpected atomic operation");
+
   SmallVector<SDValue, 6> Ops;
-  Ops.push_back(Node->getOperand(1)); // Ptr
-  Ops.push_back(Node->getOperand(2)); // Low part of Val1
-  Ops.push_back(Node->getOperand(3)); // High part of Val1
-  if (Opc == ARM::ATOMCMPXCHG6432) {
-    Ops.push_back(Node->getOperand(4)); // Low part of Val2
-    Ops.push_back(Node->getOperand(5)); // High part of Val2
-  }
-  Ops.push_back(Node->getOperand(0)); // Chain
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
-  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
-                                           MVT::i32, MVT::i32, MVT::Other,
-                                           Ops);
-  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
-  return ResNode;
+  for (unsigned i = 1; i < AN->getNumOperands(); ++i)
+      Ops.push_back(AN->getOperand(i));
+
+  Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
+  Ops.push_back(AN->getOperand(0)); // Chain moves to the end
+
+  return CurDAG->SelectNodeTo(Node, Op, VTs, &Ops[0], Ops.size());
 }
 
 SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
   switch (N->getOpcode()) {
   default: break;
@@ -2587,7 +2439,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       SDValue CPIdx =
         CurDAG->getTargetConstantPool(ConstantInt::get(
                                   Type::getInt32Ty(*CurDAG->getContext()), Val),
-                                      TLI.getPointerTy());
+                                      getTargetLowering()->getPointerTy());
 
       SDNode *ResNode;
       if (Subtarget->isThumb1Only()) {
@@ -2617,7 +2469,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   case ISD::FrameIndex: {
     // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
     if (Subtarget->isThumb1Only()) {
       SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
                         getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
@@ -2838,8 +2691,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                 SDValue(Chain.getNode(), Chain.getResNo()));
     return NULL;
   }
-  case ARMISD::CMOV:
-    return SelectCMOVOp(N);
   case ARMISD::VZIP: {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
@@ -3121,7 +2972,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 
     case Intrinsic::arm_ldrexd: {
       SDValue MemAddr = N->getOperand(2);
-      DebugLoc dl = N->getDebugLoc();
+      SDLoc dl(N);
       SDValue Chain = N->getOperand(0);
 
       bool isThumb = Subtarget->isThumb() && Subtarget->hasThumb2();
@@ -3179,7 +3030,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     }
 
     case Intrinsic::arm_strexd: {
-      DebugLoc dl = N->getDebugLoc();
+      SDLoc dl(N);
       SDValue Chain = N->getOperand(0);
       SDValue Val0 = N->getOperand(2);
       SDValue Val1 = N->getOperand(3);
@@ -3383,7 +3234,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   }
 
   case ARMISD::VTBL1: {
-    DebugLoc dl = N->getDebugLoc();
+    SDLoc dl(N);
     EVT VT = N->getValueType(0);
     SmallVector<SDValue, 6> Ops;
 
@@ -3394,7 +3245,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(ARM::VTBL1, dl, VT, Ops);
   }
   case ARMISD::VTBL2: {
-    DebugLoc dl = N->getDebugLoc();
+    SDLoc dl(N);
     EVT VT = N->getValueType(0);
 
     // Form a REG_SEQUENCE to force register allocation.
@@ -3413,31 +3264,90 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   case ISD::CONCAT_VECTORS:
     return SelectConcatVector(N);
 
-  case ARMISD::ATOMOR64_DAG:
-    return SelectAtomic64(N, ARM::ATOMOR6432);
-  case ARMISD::ATOMXOR64_DAG:
-    return SelectAtomic64(N, ARM::ATOMXOR6432);
-  case ARMISD::ATOMADD64_DAG:
-    return SelectAtomic64(N, ARM::ATOMADD6432);
-  case ARMISD::ATOMSUB64_DAG:
-    return SelectAtomic64(N, ARM::ATOMSUB6432);
-  case ARMISD::ATOMNAND64_DAG:
-    return SelectAtomic64(N, ARM::ATOMNAND6432);
-  case ARMISD::ATOMAND64_DAG:
-    return SelectAtomic64(N, ARM::ATOMAND6432);
-  case ARMISD::ATOMSWAP64_DAG:
-    return SelectAtomic64(N, ARM::ATOMSWAP6432);
-  case ARMISD::ATOMCMPXCHG64_DAG:
-    return SelectAtomic64(N, ARM::ATOMCMPXCHG6432);
-
-  case ARMISD::ATOMMIN64_DAG:
-    return SelectAtomic64(N, ARM::ATOMMIN6432);
-  case ARMISD::ATOMUMIN64_DAG:
-    return SelectAtomic64(N, ARM::ATOMUMIN6432);
-  case ARMISD::ATOMMAX64_DAG:
-    return SelectAtomic64(N, ARM::ATOMMAX6432);
-  case ARMISD::ATOMUMAX64_DAG:
-    return SelectAtomic64(N, ARM::ATOMUMAX6432);
+  case ISD::ATOMIC_LOAD:
+    if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
+      return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_LOAD_I64);
+    else
+      break;
+
+  case ISD::ATOMIC_STORE:
+    if (cast<AtomicSDNode>(N)->getMemoryVT() == MVT::i64)
+      return SelectAtomic(N, 0, 0, 0, ARM::ATOMIC_STORE_I64);
+    else
+      break;
+
+  case ISD::ATOMIC_LOAD_ADD:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_ADD_I8,
+                        ARM::ATOMIC_LOAD_ADD_I16,
+                        ARM::ATOMIC_LOAD_ADD_I32,
+                        ARM::ATOMIC_LOAD_ADD_I64);
+  case ISD::ATOMIC_LOAD_SUB:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_SUB_I8,
+                        ARM::ATOMIC_LOAD_SUB_I16,
+                        ARM::ATOMIC_LOAD_SUB_I32,
+                        ARM::ATOMIC_LOAD_SUB_I64);
+  case ISD::ATOMIC_LOAD_AND:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_AND_I8,
+                        ARM::ATOMIC_LOAD_AND_I16,
+                        ARM::ATOMIC_LOAD_AND_I32,
+                        ARM::ATOMIC_LOAD_AND_I64);
+  case ISD::ATOMIC_LOAD_OR:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_OR_I8,
+                        ARM::ATOMIC_LOAD_OR_I16,
+                        ARM::ATOMIC_LOAD_OR_I32,
+                        ARM::ATOMIC_LOAD_OR_I64);
+  case ISD::ATOMIC_LOAD_XOR:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_XOR_I8,
+                        ARM::ATOMIC_LOAD_XOR_I16,
+                        ARM::ATOMIC_LOAD_XOR_I32,
+                        ARM::ATOMIC_LOAD_XOR_I64);
+  case ISD::ATOMIC_LOAD_NAND:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_NAND_I8,
+                        ARM::ATOMIC_LOAD_NAND_I16,
+                        ARM::ATOMIC_LOAD_NAND_I32,
+                        ARM::ATOMIC_LOAD_NAND_I64);
+  case ISD::ATOMIC_LOAD_MIN:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_MIN_I8,
+                        ARM::ATOMIC_LOAD_MIN_I16,
+                        ARM::ATOMIC_LOAD_MIN_I32,
+                        ARM::ATOMIC_LOAD_MIN_I64);
+  case ISD::ATOMIC_LOAD_MAX:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_MAX_I8,
+                        ARM::ATOMIC_LOAD_MAX_I16,
+                        ARM::ATOMIC_LOAD_MAX_I32,
+                        ARM::ATOMIC_LOAD_MAX_I64);
+  case ISD::ATOMIC_LOAD_UMIN:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_UMIN_I8,
+                        ARM::ATOMIC_LOAD_UMIN_I16,
+                        ARM::ATOMIC_LOAD_UMIN_I32,
+                        ARM::ATOMIC_LOAD_UMIN_I64);
+  case ISD::ATOMIC_LOAD_UMAX:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_LOAD_UMAX_I8,
+                        ARM::ATOMIC_LOAD_UMAX_I16,
+                        ARM::ATOMIC_LOAD_UMAX_I32,
+                        ARM::ATOMIC_LOAD_UMAX_I64);
+  case ISD::ATOMIC_SWAP:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_SWAP_I8,
+                        ARM::ATOMIC_SWAP_I16,
+                        ARM::ATOMIC_SWAP_I32,
+                        ARM::ATOMIC_SWAP_I64);
+  case ISD::ATOMIC_CMP_SWAP:
+    return SelectAtomic(N,
+                        ARM::ATOMIC_CMP_SWAP_I8,
+                        ARM::ATOMIC_CMP_SWAP_I16,
+                        ARM::ATOMIC_CMP_SWAP_I32,
+                        ARM::ATOMIC_CMP_SWAP_I64);
   }
 
   return SelectCode(N);
@@ -3449,24 +3359,20 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
   bool Changed = false;
   unsigned NumOps = N->getNumOperands();
 
-  ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(
-      N->getOperand(InlineAsm::Op_AsmString));
-  StringRef AsmString = StringRef(S->getSymbol());
-
   // Normally, i64 data is bounded to two arbitrary GRPs for "%r" constraint.
   // However, some instrstions (e.g. ldrexd/strexd in ARM mode) require
   // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs
   // respectively. Since there is no constraint to explicitly specify a
-  // reg pair, we search %H operand inside the asm string. If it is found, the
-  // transformation below enforces a GPRPair reg class for "%r" for 64-bit data.
-  if (AsmString.find(":H}") == StringRef::npos)
-    return NULL;
+  // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb,
+  // the 64-bit data may be referred by H, Q, R modifiers, so we still pack
+  // them into a GPRPair.
 
-  DebugLoc dl = N->getDebugLoc();
-  SDValue Glue = N->getOperand(NumOps-1);
+  SDLoc dl(N);
+  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) : SDValue(0,0);
 
+  SmallVector<bool, 8> OpChanged;
   // Glue node will be appended late.
-  for(unsigned i = 0; i < NumOps -1; ++i) {
+  for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
     SDValue op = N->getOperand(i);
     AsmNodeOperands.push_back(op);
 
@@ -3480,17 +3386,38 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     else
       continue;
 
+    // Immediate operands to inline asm in the SelectionDAG are modeled with
+    // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+    // the second is a constant with the value of the immediate. If we get here
+    // and we have a Kind_Imm, skip the next operand, and continue.
+    if (Kind == InlineAsm::Kind_Imm) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+    if (NumRegs)
+      OpChanged.push_back(false);
+
+    unsigned DefIdx = 0;
+    bool IsTiedToChangedOp = false;
+    // If it's a use that is tied with a previous def, it has no
+    // reg class constraint.
+    if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+      IsTiedToChangedOp = OpChanged[DefIdx];
+
     if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
         && Kind != InlineAsm::Kind_RegDefEarlyClobber)
       continue;
 
-    unsigned RegNum = InlineAsm::getNumOperandRegisters(Flag);
     unsigned RC;
     bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
-    if (!HasRC || RC != ARM::GPRRegClassID || RegNum != 2)
+    if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID))
+        || NumRegs != 2)
       continue;
 
-    assert((i+2 < NumOps-1) && "Invalid number of operands in inline asm");
+    assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
     SDValue V0 = N->getOperand(i+1);
     SDValue V1 = N->getOperand(i+2);
     unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
@@ -3551,8 +3478,12 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     Changed = true;
 
     if(PairedReg.getNode()) {
+      OpChanged[OpChanged.size() -1 ] = true;
       Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
-      Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
+      if (IsTiedToChangedOp)
+        Flag = InlineAsm::getFlagWordForMatchingOp(Flag, DefIdx);
+      else
+        Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
       // Replace the current flag.
       AsmNodeOperands[AsmNodeOperands.size() -1] = CurDAG->getTargetConstant(
           Flag, MVT::i32);
@@ -3563,11 +3494,12 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     }
   }
 
-  AsmNodeOperands.push_back(Glue);
+  if (Glue.getNode())
+    AsmNodeOperands.push_back(Glue);
   if (!Changed)
     return NULL;
 
-  SDValue New = CurDAG->getNode(ISD::INLINEASM, N->getDebugLoc(),
+  SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
       CurDAG->getVTList(MVT::Other, MVT::Glue), &AsmNodeOperands[0],
                         AsmNodeOperands.size());
   New->setNodeId(-1);
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e49cfc4..76a0a83 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
+#include <utility>
 using namespace llvm;
 
 STATISTIC(NumTailCalls, "Number of tail calls");
@@ -74,7 +75,7 @@ namespace {
   class ARMCCState : public CCState {
   public:
     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
-               const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+               const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
                LLVMContext &C, ParmContext PC)
         : CCState(CC, isVarArg, MF, TM, locs, C) {
       assert(((PC == Call) || (PC == Prologue)) &&
@@ -174,9 +175,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
 
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
-  if (Subtarget->isTargetDarwin()) {
+  if (Subtarget->isTargetIOS()) {
     // Uses VFP for Thumb libfuncs if available.
-    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+    if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
+        Subtarget->hasARMOps()) {
       // Single-precision floating-point arithmetic.
       setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
       setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
@@ -421,7 +423,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   }
 
   // Use divmod compiler-rt calls for iOS 5.0 and later.
-  if (Subtarget->getTargetTriple().getOS() == Triple::IOS &&
+  if (Subtarget->getTargetTriple().isiOS() &&
       !Subtarget->getTargetTriple().isOSVersionLT(5, 0)) {
     setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
     setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
@@ -452,6 +454,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   }
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
 
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
@@ -564,16 +567,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::FP_ROUND,   MVT::v2f32, Expand);
     setOperationAction(ISD::FP_EXTEND,  MVT::v2f64, Expand);
 
-    // Custom expand long extensions to vectors.
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32,  Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64,  Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64,  Custom);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64,  Custom);
-
     // NEON does not have single instruction CTPOP for vectors with element
     // types wider than 8-bits.  However, custom lowering can leverage the
     // v8i8/v16i8 vcnt instruction.
@@ -681,6 +674,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i32  , Expand);
   setOperationAction(ISD::CTLZ_ZERO_UNDEF  , MVT::i32  , Expand);
 
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
   // Only ARMv6 has BSWAP.
   if (!Subtarget->hasV6Ops())
     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
@@ -691,10 +686,36 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::SDIV,  MVT::i32, Expand);
     setOperationAction(ISD::UDIV,  MVT::i32, Expand);
   }
+
+  // FIXME: Also set divmod for SREM on EABI
   setOperationAction(ISD::SREM,  MVT::i32, Expand);
   setOperationAction(ISD::UREM,  MVT::i32, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  // Register based DivRem for AEABI (RTABI 4.2)
+  if (Subtarget->isTargetAEABI()) {
+    setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
+    setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
+
+    setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
+
+    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  } else {
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  }
 
   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
@@ -715,8 +736,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   if (!Subtarget->isTargetDarwin()) {
     // Non-Darwin platforms may return values in these registers via the
     // personality function.
-    setOperationAction(ISD::EHSELECTION,      MVT::i32,   Expand);
-    setOperationAction(ISD::EXCEPTIONADDR,    MVT::i32,   Expand);
     setExceptionPointerRegister(ARM::R0);
     setExceptionSelectorRegister(ARM::R1);
   }
@@ -724,12 +743,10 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
   // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
   // the default expansion.
-  // FIXME: This should be checking for v6k, not just v6.
-  if (Subtarget->hasDataBarrier() ||
-      (Subtarget->hasV6Ops() && !Subtarget->isThumb())) {
-    // membarrier needs custom lowering; the rest are legal and handled
-    // normally.
-    setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+  if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only()) {
+    // ATOMIC_FENCE needs custom lowering; the other 32-bit ones are legal and
+    // handled normally.
+    setOperationAction(ISD::ATOMIC_FENCE,     MVT::Other, Custom);
     // Custom lowering for 64-bit ops
     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB,  MVT::i64, Custom);
@@ -742,11 +759,20 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i64, Custom);
-    // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
-    setInsertFencesForAtomic(true);
+    // On v8, we have particularly efficient implementations of atomic fences
+    // if they can be combined with nearby atomic loads and stores.
+    if (!Subtarget->hasV8Ops()) {
+      // Automatically insert fences (dmb ist) around ATOMIC_SWAP etc.
+      setInsertFencesForAtomic(true);
+    }
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
   } else {
+    // If there's anything we can use as a barrier, go through custom lowering
+    // for ATOMIC_FENCE.
+    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other,
+                       Subtarget->hasAnyDataBarrier() ? Custom : Expand);
+
     // Set them all for expansion, which will force libcalls.
-    setOperationAction(ISD::ATOMIC_FENCE,   MVT::Other, Expand);
     setOperationAction(ISD::ATOMIC_CMP_SWAP,  MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_SWAP,      MVT::i32, Expand);
     setOperationAction(ISD::ATOMIC_LOAD_ADD,  MVT::i32, Expand);
@@ -843,6 +869,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
       setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
     }
   }
+      
+  // Combine sin / cos into one node or libcall if possible.
+  if (Subtarget->hasSinCos()) {
+    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+    setLibcallName(RTLIB::SINCOS_F64, "sincos");
+    if (Subtarget->getTargetTriple().getOS() == Triple::IOS) {
+      // For iOS, we don't want to the normal expansion of a libcall to
+      // sincos. We want to issue a libcall to __sincos_stret.
+      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+    }
+  }
 
   // We have target-specific dag combine patterns for the following nodes:
   // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
@@ -882,6 +920,44 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
 }
 
+static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
+                                  bool isThumb2, unsigned &LdrOpc,
+                                  unsigned &StrOpc) {
+  static const unsigned LoadBares[4][2] =  {{ARM::LDREXB, ARM::t2LDREXB},
+                                            {ARM::LDREXH, ARM::t2LDREXH},
+                                            {ARM::LDREX,  ARM::t2LDREX},
+                                            {ARM::LDREXD, ARM::t2LDREXD}};
+  static const unsigned LoadAcqs[4][2] =   {{ARM::LDAEXB, ARM::t2LDAEXB},
+                                            {ARM::LDAEXH, ARM::t2LDAEXH},
+                                            {ARM::LDAEX,  ARM::t2LDAEX},
+                                            {ARM::LDAEXD, ARM::t2LDAEXD}};
+  static const unsigned StoreBares[4][2] = {{ARM::STREXB, ARM::t2STREXB},
+                                            {ARM::STREXH, ARM::t2STREXH},
+                                            {ARM::STREX,  ARM::t2STREX},
+                                            {ARM::STREXD, ARM::t2STREXD}};
+  static const unsigned StoreRels[4][2] =  {{ARM::STLEXB, ARM::t2STLEXB},
+                                            {ARM::STLEXH, ARM::t2STLEXH},
+                                            {ARM::STLEX,  ARM::t2STLEX},
+                                            {ARM::STLEXD, ARM::t2STLEXD}};
+
+  const unsigned (*LoadOps)[2], (*StoreOps)[2];
+  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+    LoadOps = LoadAcqs;
+  else
+    LoadOps = LoadBares;
+
+  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
+    StoreOps = StoreRels;
+  else
+    StoreOps = StoreBares;
+
+  assert(isPowerOf2_32(Size) && Size <= 8 &&
+         "unsupported size for atomic binary op!");
+
+  LdrOpc = LoadOps[Log2_32(Size)][isThumb2];
+  StrOpc = StoreOps[Log2_32(Size)][isThumb2];
+}
+
 // FIXME: It might make sense to define the representative register class as the
 // nearest super-register that has a non-null superset. For example, DPR_VFP2 is
 // a super-register of SPR, and DPR is a superset if DPR_VFP2. Consequently,
@@ -944,6 +1020,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::BR_JT:         return "ARMISD::BR_JT";
   case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
   case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
+  case ARMISD::INTRET_FLAG:   return "ARMISD::INTRET_FLAG";
   case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
   case ARMISD::CMP:           return "ARMISD::CMP";
   case ARMISD::CMN:           return "ARMISD::CMN";
@@ -983,7 +1060,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
 
   case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
 
-  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
   case ARMISD::MEMBARRIER_MCR: return "ARMISD::MEMBARRIER_MCR";
 
   case ARMISD::PRELOAD:       return "ARMISD::PRELOAD";
@@ -1042,6 +1118,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::FMAX:          return "ARMISD::FMAX";
   case ARMISD::FMIN:          return "ARMISD::FMIN";
+  case ARMISD::VMAXNM:        return "ARMISD::VMAX";
+  case ARMISD::VMINNM:        return "ARMISD::VMIN";
   case ARMISD::BFI:           return "ARMISD::BFI";
   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
   case ARMISD::VBICIMM:       return "ARMISD::VBICIMM";
@@ -1069,7 +1147,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-EVT ARMTargetLowering::getSetCCResultType(EVT VT) const {
+EVT ARMTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector()) return getPointerTy();
   return VT.changeVectorElementTypeToInteger();
 }
@@ -1233,7 +1311,7 @@ SDValue
 ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
-                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SDLoc dl, SelectionDAG &DAG,
                                    SmallVectorImpl<SDValue> &InVals,
                                    bool isThisReturn, SDValue ThisVal) const {
 
@@ -1314,7 +1392,7 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 SDValue
 ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
                                     SDValue StackPtr, SDValue Arg,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     const CCValAssign &VA,
                                     ISD::ArgFlagsTy Flags) const {
   unsigned LocMemOffset = VA.getLocMemOffset();
@@ -1325,12 +1403,12 @@ ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
                       false, false, 0);
 }
 
-void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
                                          SDValue Chain, SDValue &Arg,
                                          RegsToPassVector &RegsToPass,
                                          CCValAssign &VA, CCValAssign &NextVA,
                                          SDValue &StackPtr,
-                                         SmallVector<SDValue, 8> &MemOpChains,
+                                         SmallVectorImpl<SDValue> &MemOpChains,
                                          ISD::ArgFlagsTy Flags) const {
 
   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
@@ -1357,10 +1435,10 @@ SDValue
 ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                          = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
@@ -1406,7 +1484,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!isSibCall)
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                                 dl);
 
   SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
 
@@ -1496,7 +1575,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
           SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
           SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                      MachinePointerInfo(),
-                                     false, false, false, 0);
+                                     false, false, false,
+                                     DAG.InferPtrAlignment(AddArg));
           MemOpChains.push_back(Load.getValue(1));
           RegsToPass.push_back(std::make_pair(j, Load));
         }
@@ -1705,17 +1785,26 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
 
   // Add a register mask operand representing the call-preserved registers.
-  const uint32_t *Mask;
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
-  if (isThisReturn)
-    // For 'this' returns, use the R0-preserving mask
-    Mask = ARI->getThisReturnPreservedMask(CallConv);
-  else
-    Mask = ARI->getCallPreservedMask(CallConv);
+  if (!isTailCall) {
+    const uint32_t *Mask;
+    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
+    if (isThisReturn) {
+      // For 'this' returns, use the R0-preserving mask if applicable
+      Mask = ARI->getThisReturnPreservedMask(CallConv);
+      if (!Mask) {
+        // Set isThisReturn to false if the calling convention is not one that
+        // allows 'returned' to be modeled in this way, so LowerCallResult does
+        // not try to pass 'this' straight through
+        isThisReturn = false;
+        Mask = ARI->getCallPreservedMask(CallConv);
+      }
+    } else
+      Mask = ARI->getCallPreservedMask(CallConv);
 
-  assert(Mask && "Missing call preserved mask for calling convention");
-  Ops.push_back(DAG.getRegisterMask(Mask));
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
 
   if (InFlag.getNode())
     Ops.push_back(InFlag);
@@ -1729,7 +1818,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   InFlag = Chain.getValue(1);
 
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag);
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
   if (!Ins.empty())
     InFlag = Chain.getValue(1);
 
@@ -1795,7 +1884,7 @@ ARMTargetLowering::HandleByVal(
       // else parameter would be splitted between registers and stack,
       // end register would be r4 in this case.
       unsigned ByValRegBegin = reg;
-      unsigned ByValRegEnd = (size < excess) ? reg + size/4 : ARM::R4;
+      unsigned ByValRegEnd = (size < excess) ? reg + size/4 : (unsigned)ARM::R4;
       State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
       // Note, first register is allocated in the beginning of function already,
       // allocate remained amount of registers we need.
@@ -1886,6 +1975,12 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   if (isVarArg && !Outs.empty())
     return false;
 
+  // Exception-handling functions need a special set of instructions to indicate
+  // a return to the hardware. Tail-calling another function would probably
+  // break this.
+  if (CallerF->hasFnAttribute("interrupt"))
+    return false;
+
   // Also avoid sibcall optimization if either caller or callee uses struct
   // return semantics.
   if (isCalleeStructRet || isCallerStructRet)
@@ -2014,12 +2109,45 @@ ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                                     isVarArg));
 }
 
+static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
+                                    SDLoc DL, SelectionDAG &DAG) {
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+
+  StringRef IntKind = F->getFnAttribute("interrupt").getValueAsString();
+
+  // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
+  // version of the "preferred return address". These offsets affect the return
+  // instruction if this is a return from PL1 without hypervisor extensions.
+  //    IRQ/FIQ: +4     "subs pc, lr, #4"
+  //    SWI:     0      "subs pc, lr, #0"
+  //    ABORT:   +4     "subs pc, lr, #4"
+  //    UNDEF:   +4/+2  "subs pc, lr, #0"
+  // UNDEF varies depending on where the exception came from ARM or Thumb
+  // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
+
+  int64_t LROffset;
+  if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
+      IntKind == "ABORT")
+    LROffset = 4;
+  else if (IntKind == "SWI" || IntKind == "UNDEF")
+    LROffset = 0;
+  else
+    report_fatal_error("Unsupported interrupt attribute. If present, value "
+                       "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
+
+  RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false));
+
+  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other,
+                     RetOps.data(), RetOps.size());
+}
+
 SDValue
 ARMTargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
                                const SmallVectorImpl<ISD::OutputArg> &Outs,
                                const SmallVectorImpl<SDValue> &OutVals,
-                               DebugLoc dl, SelectionDAG &DAG) const {
+                               SDLoc dl, SelectionDAG &DAG) const {
 
   // CCValAssign - represent the assignment of the return value to a location.
   SmallVector<CCValAssign, 16> RVLocs;
@@ -2099,6 +2227,19 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
+  // CPUs which aren't M-class use a special sequence to return from
+  // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
+  // though we use "subs pc, lr, #N").
+  //
+  // M-class CPUs actually use a normal return sequence with a special
+  // (hardware-provided) value in LR, so the normal code path works.
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute("interrupt") &&
+      !Subtarget->isMClass()) {
+    if (Subtarget->isThumb1Only())
+      report_fatal_error("interrupt attribute is not supported in Thumb1");
+    return LowerInterruptReturn(RetOps, dl, DAG);
+  }
+
   return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
                      RetOps.data(), RetOps.size());
 }
@@ -2147,7 +2288,7 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
     Copy = *Copy->use_begin();
     if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
       return false;
-    Chain = Copy->getOperand(0);
+    TCChain = Copy->getOperand(0);
   } else {
     return false;
   }
@@ -2155,7 +2296,8 @@ bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
   bool HasRet = false;
   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
        UI != UE; ++UI) {
-    if (UI->getOpcode() != ARMISD::RET_FLAG)
+    if (UI->getOpcode() != ARMISD::RET_FLAG &&
+        UI->getOpcode() != ARMISD::INTRET_FLAG)
       return false;
     HasRet = true;
   }
@@ -2186,7 +2328,7 @@ bool ARMTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
 static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
   EVT PtrVT = Op.getValueType();
   // FIXME there is no actual debug info here
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   SDValue Res;
   if (CP->isMachineConstantPoolEntry())
@@ -2207,7 +2349,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   unsigned ARMPCLabelIndex = 0;
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT PtrVT = getPointerTy();
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
@@ -2236,7 +2378,7 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
 SDValue
 ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                                  SelectionDAG &DAG) const {
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
   EVT PtrVT = getPointerTy();
   unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
   MachineFunction &MF = DAG.getMachineFunction();
@@ -2279,7 +2421,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
                                         SelectionDAG &DAG,
                                         TLSModel::Model model) const {
   const GlobalValue *GV = GA->getGlobal();
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
   SDValue Offset;
   SDValue Chain = DAG.getEntryNode();
   EVT PtrVT = getPointerTy();
@@ -2349,7 +2491,7 @@ ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
     bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
@@ -2392,7 +2534,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
 SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
                                                     SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   Reloc::Model RelocM = getTargetMachine().getRelocationModel();
 
@@ -2457,7 +2599,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
   EVT PtrVT = getPointerTy();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
   ARMConstantPoolValue *CPV =
     ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_",
@@ -2473,7 +2615,7 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
 
 SDValue
 ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Val = DAG.getConstant(0, MVT::i32);
   return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
                      DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
@@ -2482,7 +2624,7 @@ ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
                      Op.getOperand(1), DAG.getConstant(0, MVT::i32));
 }
@@ -2491,7 +2633,7 @@ SDValue
 ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
                                           const ARMSubtarget *Subtarget) const {
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   switch (IntNo) {
   default: return SDValue();    // Don't custom lower most intrinsics.
   case Intrinsic::arm_thread_pointer: {
@@ -2527,7 +2669,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
   case Intrinsic::arm_neon_vmullu: {
     unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
       ? ARMISD::VMULLs : ARMISD::VMULLu;
-    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
+    return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
   }
   }
@@ -2536,19 +2678,33 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
 static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                                  const ARMSubtarget *Subtarget) {
   // FIXME: handle "fence singlethread" more efficiently.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   if (!Subtarget->hasDataBarrier()) {
     // Some ARMv6 cpus can support data barriers with an mcr instruction.
     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
     // here.
     assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
-           "Unexpected ISD::MEMBARRIER encountered. Should be libcall!");
+           "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
     return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
                        DAG.getConstant(0, MVT::i32));
   }
 
-  return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(ARM_MB::ISH, MVT::i32));
+  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
+  unsigned Domain = ARM_MB::ISH;
+  if (Subtarget->isMClass()) {
+    // Only a full system barrier exists in the M-class architectures.
+    Domain = ARM_MB::SY;
+  } else if (Subtarget->isSwift() && Ord == Release) {
+    // Swift happens to implement ISHST barriers in a way that's compatible with
+    // Release semantics but weaker than ISH so we'd be fools not to use
+    // it. Beware: other processors probably don't!
+    Domain = ARM_MB::ISHST;
+  }
+
+  return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(Intrinsic::arm_dmb, MVT::i32),
+                     DAG.getConstant(Domain, MVT::i32));
 }
 
 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
@@ -2559,7 +2715,7 @@ static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
     // Just preserve the chain.
     return Op.getOperand(0);
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned isRead = ~cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() & 1;
   if (!isRead &&
       (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
@@ -2584,7 +2740,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
 
   // vastart just stores the address of the VarArgsFrameIndex slot into the
   // memory location argument.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
@@ -2595,7 +2751,7 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) {
 SDValue
 ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                         SDValue &Root, SelectionDAG &DAG,
-                                        DebugLoc dl) const {
+                                        SDLoc dl) const {
   MachineFunction &MF = DAG.getMachineFunction();
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
 
@@ -2630,6 +2786,7 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
 void
 ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
                                   unsigned InRegsParamRecordIdx,
+                                  unsigned ArgSize,
                                   unsigned &ArgRegsSize,
                                   unsigned &ArgRegsSaveSize)
   const {
@@ -2648,7 +2805,29 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
 
   unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
   ArgRegsSize = NumGPRs * 4;
-  ArgRegsSaveSize = (ArgRegsSize + Align - 1) & ~(Align - 1);
+
+  // If parameter is split between stack and GPRs...
+  if (NumGPRs && Align == 8 &&
+      (ArgRegsSize < ArgSize ||
+        InRegsParamRecordIdx >= CCInfo.getInRegsParamsCount())) {
+    // Add padding for part of param recovered from GPRs, so
+    // its last byte must be at address K*8 - 1.
+    // We need to do it, since remained (stack) part of parameter has
+    // stack alignment, and we need to "attach" "GPRs head" without gaps
+    // to it:
+    // Stack:
+    // |---- 8 bytes block ----| |---- 8 bytes block ----| |---- 8 bytes...
+    // [ [padding] [GPRs head] ] [        Tail passed via stack       ....
+    //
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned Padding =
+        ((ArgRegsSize + AFI->getArgRegsSaveSize() + Align - 1) & ~(Align-1)) -
+        (ArgRegsSize + AFI->getArgRegsSaveSize());
+    ArgRegsSaveSize = ArgRegsSize + Padding;
+  } else
+    // We don't need to extend regs save size for byval parameters if they
+    // are passed via GPRs only.
+    ArgRegsSaveSize = ArgRegsSize;
 }
 
 // The remaining GPRs hold either the beginning of variable-argument
@@ -2661,11 +2840,12 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
 // Return: The frame index registers were stored into.
 int
 ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
-                                  DebugLoc dl, SDValue &Chain,
+                                  SDLoc dl, SDValue &Chain,
                                   const Value *OrigArg,
                                   unsigned InRegsParamRecordIdx,
                                   unsigned OffsetFromOrigArg,
                                   unsigned ArgOffset,
+                                  unsigned ArgSize,
                                   bool ForceMutable) const {
 
   // Currently, two use-cases possible:
@@ -2690,12 +2870,13 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
     lastRegToSaveIndex = REnd - ARM::R0;
   } else {
     firstRegToSaveIndex = CCInfo.getFirstUnallocated
-      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+      (GPRArgRegs, array_lengthof(GPRArgRegs));
     lastRegToSaveIndex = 4;
   }
 
   unsigned ArgRegsSize, ArgRegsSaveSize;
-  computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgRegsSize, ArgRegsSaveSize);
+  computeRegArea(CCInfo, MF, InRegsParamRecordIdx, ArgSize,
+                 ArgRegsSize, ArgRegsSaveSize);
 
   // Store any by-val regs to their spots on the stack so that they may be
   // loaded by deferencing the result of formal parameter pointer or va_next.
@@ -2703,9 +2884,17 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
   // was initialized, it can't be initialized again.
   if (ArgRegsSaveSize) {
 
+    unsigned Padding = ArgRegsSaveSize - ArgRegsSize;
+
+    if (Padding) {
+      assert(AFI->getStoredByValParamsPadding() == 0 &&
+             "The only parameter may be padded.");
+      AFI->setStoredByValParamsPadding(Padding);
+    }
+
     int FrameIndex = MFI->CreateFixedObject(
                       ArgRegsSaveSize,
-                      ArgOffset + ArgRegsSaveSize - ArgRegsSize,
+                      Padding + ArgOffset,
                       false);
     SDValue FIN = DAG.getFrameIndex(FrameIndex, getPointerTy());
 
@@ -2737,13 +2926,14 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
     return FrameIndex;
   } else
     // This will point to the next argument passed via stack.
-    return MFI->CreateFixedObject(4, ArgOffset, !ForceMutable);
+    return MFI->CreateFixedObject(
+        4, AFI->getStoredByValParamsPadding() + ArgOffset, !ForceMutable);
 }
 
 // Setup stack frame, the va_list pointer will start from.
 void
 ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
-                                        DebugLoc dl, SDValue &Chain,
+                                        SDLoc dl, SDValue &Chain,
                                         unsigned ArgOffset,
                                         bool ForceMutable) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -2756,7 +2946,7 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
   // argument passed via stack.
   int FrameIndex =
     StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(),
-                   0, ArgOffset, ForceMutable);
+                   0, ArgOffset, 0, ForceMutable);
 
   AFI->setVarArgsFrameIndex(FrameIndex);
 }
@@ -2766,7 +2956,7 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                                         CallingConv::ID CallConv, bool isVarArg,
                                         const SmallVectorImpl<ISD::InputArg>
                                           &Ins,
-                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SDLoc dl, SelectionDAG &DAG,
                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -2896,12 +3086,15 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
                 CurByValIndex,
                 Ins[VA.getValNo()].PartOffset,
                 VA.getLocMemOffset(),
+                Flags.getByValSize(),
                 true /*force mutable frames*/);
             InVals.push_back(DAG.getFrameIndex(FrameIndex, getPointerTy()));
             CCInfo.nextInRegsParam();
           } else {
+            unsigned FIOffset = VA.getLocMemOffset() +
+                                AFI->getStoredByValParamsPadding();
             int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
-                                            VA.getLocMemOffset(), true);
+                                            FIOffset, true);
 
             // Create load nodes to retrieve arguments from the stack.
             SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
@@ -2943,7 +3136,7 @@ static bool isFloatingPointZero(SDValue Op) {
 SDValue
 ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDValue &ARMcc, SelectionDAG &DAG,
-                             DebugLoc dl) const {
+                             SDLoc dl) const {
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     unsigned C = RHSC->getZExtValue();
     if (!isLegalICmpImmediate(C)) {
@@ -3001,7 +3194,7 @@ ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
 SDValue
 ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
-                             DebugLoc dl) const {
+                             SDLoc dl) const {
   SDValue Cmp;
   if (!isFloatingPointZero(RHS))
     Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
@@ -3015,7 +3208,7 @@ ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
 SDValue
 ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
   unsigned Opc = Cmp.getOpcode();
-  DebugLoc DL = Cmp.getDebugLoc();
+  SDLoc DL(Cmp);
   if (Opc == ARMISD::CMP || Opc == ARMISD::CMPZ)
     return DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
 
@@ -3035,7 +3228,7 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond = Op.getOperand(0);
   SDValue SelectTrue = Op.getOperand(1);
   SDValue SelectFalse = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Convert:
   //
@@ -3083,6 +3276,61 @@ SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                          SelectTrue, SelectFalse, ISD::SETNE);
 }
 
+static ISD::CondCode getInverseCCForVSEL(ISD::CondCode CC) {
+  if (CC == ISD::SETNE)
+    return ISD::SETEQ;
+  return ISD::getSetCCSwappedOperands(CC);
+}
+
+static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                                 bool &swpCmpOps, bool &swpVselOps) {
+  // Start by selecting the GE condition code for opcodes that return true for
+  // 'equality'
+  if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
+      CC == ISD::SETULE)
+    CondCode = ARMCC::GE;
+
+  // and GT for opcodes that return false for 'equality'.
+  else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
+           CC == ISD::SETULT)
+    CondCode = ARMCC::GT;
+
+  // Since we are constrained to GE/GT, if the opcode contains 'less', we need
+  // to swap the compare operands.
+  if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
+      CC == ISD::SETULT)
+    swpCmpOps = true;
+
+  // Both GT and GE are ordered comparisons, and return false for 'unordered'.
+  // If we have an unordered opcode, we need to swap the operands to the VSEL
+  // instruction (effectively negating the condition).
+  //
+  // This also has the effect of swapping which one of 'less' or 'greater'
+  // returns true, so we also swap the compare operands. It also switches
+  // whether we return true for 'equality', so we compensate by picking the
+  // opposite condition code to our original choice.
+  if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
+      CC == ISD::SETUGT) {
+    swpCmpOps = !swpCmpOps;
+    swpVselOps = !swpVselOps;
+    CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
+  }
+
+  // 'ordered' is 'anything but unordered', so use the VS condition code and
+  // swap the VSEL operands.
+  if (CC == ISD::SETO) {
+    CondCode = ARMCC::VS;
+    swpVselOps = true;
+  }
+
+  // 'unordered or not equal' is 'anything but equal', so use the EQ condition
+  // code and swap the VSEL operands.
+  if (CC == ISD::SETUNE) {
+    CondCode = ARMCC::EQ;
+    swpVselOps = true;
+  }
+}
+
 SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
@@ -3090,18 +3338,69 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   SDValue TrueVal = Op.getOperand(2);
   SDValue FalseVal = Op.getOperand(3);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (LHS.getValueType() == MVT::i32) {
+    // Try to generate VSEL on ARMv8.
+    // The VSEL instruction can't use all the usual ARM condition
+    // codes: it only has two bits to select the condition code, so it's
+    // constrained to use only GE, GT, VS and EQ.
+    //
+    // To implement all the various ISD::SETXXX opcodes, we sometimes need to
+    // swap the operands of the previous compare instruction (effectively
+    // inverting the compare condition, swapping 'less' and 'greater') and
+    // sometimes need to swap the operands to the VSEL (which inverts the
+    // condition in the sense of firing whenever the previous condition didn't)
+    if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                      TrueVal.getValueType() == MVT::f64)) {
+      ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+      if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
+          CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
+        CC = getInverseCCForVSEL(CC);
+        std::swap(TrueVal, FalseVal);
+      }
+    }
+
     SDValue ARMcc;
     SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
     SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
-    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,Cmp);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, CCR,
+                       Cmp);
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
   FPCCToARMCC(CC, CondCode, CondCode2);
 
+  // Try to generate VSEL on ARMv8.
+  if (getSubtarget()->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
+                                    TrueVal.getValueType() == MVT::f64)) {
+    // We can select VMAXNM/VMINNM from a compare followed by a select with the
+    // same operands, as follows:
+    //   c = fcmp [ogt, olt, ugt, ult] a, b
+    //   select c, a, b
+    // We only do this in unsafe-fp-math, because signed zeros and NaNs are
+    // handled differently than the original code sequence.
+    if (getTargetMachine().Options.UnsafeFPMath && LHS == TrueVal &&
+        RHS == FalseVal) {
+      if (CC == ISD::SETOGT || CC == ISD::SETUGT)
+        return DAG.getNode(ARMISD::VMAXNM, dl, VT, TrueVal, FalseVal);
+      if (CC == ISD::SETOLT || CC == ISD::SETULT)
+        return DAG.getNode(ARMISD::VMINNM, dl, VT, TrueVal, FalseVal);
+    }
+
+    bool swpCmpOps = false;
+    bool swpVselOps = false;
+    checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
+
+    if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
+        CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
+      if (swpCmpOps)
+        std::swap(LHS, RHS);
+      if (swpVselOps)
+        std::swap(TrueVal, FalseVal);
+    }
+  }
+
   SDValue ARMcc = DAG.getConstant(CondCode, MVT::i32);
   SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
@@ -3145,7 +3444,7 @@ static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG) {
     return DAG.getConstant(0, MVT::i32);
 
   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op))
-    return DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+    return DAG.getLoad(MVT::i32, SDLoc(Op),
                        Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(),
                        Ld->isVolatile(), Ld->isNonTemporal(),
                        Ld->isInvariant(), Ld->getAlignment());
@@ -3163,7 +3462,7 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
 
   if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
     SDValue Ptr = Ld->getBasePtr();
-    RetVal1 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+    RetVal1 = DAG.getLoad(MVT::i32, SDLoc(Op),
                           Ld->getChain(), Ptr,
                           Ld->getPointerInfo(),
                           Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3171,9 +3470,9 @@ static void expandf64Toi32(SDValue Op, SelectionDAG &DAG,
 
     EVT PtrType = Ptr.getValueType();
     unsigned NewAlign = MinAlign(Ld->getAlignment(), 4);
-    SDValue NewPtr = DAG.getNode(ISD::ADD, Op.getDebugLoc(),
+    SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(Op),
                                  PtrType, Ptr, DAG.getConstant(4, PtrType));
-    RetVal2 = DAG.getLoad(MVT::i32, Op.getDebugLoc(),
+    RetVal2 = DAG.getLoad(MVT::i32, SDLoc(Op),
                           Ld->getChain(), NewPtr,
                           Ld->getPointerInfo().getWithOffset(4),
                           Ld->isVolatile(), Ld->isNonTemporal(),
@@ -3193,7 +3492,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(2);
   SDValue RHS = Op.getOperand(3);
   SDValue Dest = Op.getOperand(4);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   bool LHSSeenZero = false;
   bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
@@ -3243,7 +3542,7 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS = Op.getOperand(2);
   SDValue RHS = Op.getOperand(3);
   SDValue Dest = Op.getOperand(4);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (LHS.getValueType() == MVT::i32) {
     SDValue ARMcc;
@@ -3284,7 +3583,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Table = Op.getOperand(1);
   SDValue Index = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   EVT PTy = getPointerTy();
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
@@ -3320,7 +3619,7 @@ SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
 
 static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (Op.getValueType().getVectorElementType() == MVT::i32) {
     if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
@@ -3342,7 +3641,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
   if (VT.isVector())
     return LowerVectorFP_TO_INT(Op, DAG);
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Opc;
 
   switch (Op.getOpcode()) {
@@ -3360,7 +3659,7 @@ static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
 
 static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
     if (VT.getVectorElementType() == MVT::f32)
@@ -3396,7 +3695,7 @@ static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
   if (VT.isVector())
     return LowerVectorINT_TO_FP(Op, DAG);
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Opc;
 
   switch (Op.getOpcode()) {
@@ -3417,7 +3716,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   // Implement fcopysign with a fabs and a conditional fneg.
   SDValue Tmp0 = Op.getOperand(0);
   SDValue Tmp1 = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT VT = Op.getValueType();
   EVT SrcVT = Tmp1.getValueType();
   bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
@@ -3501,7 +3800,7 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
   MFI->setReturnAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   if (Depth) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -3521,7 +3820,7 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   MFI->setFrameAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
     ? ARM::R7 : ARM::R11;
@@ -3533,47 +3832,6 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   return FrameAddr;
 }
 
-/// Custom Expand long vector extensions, where size(DestVec) > 2*size(SrcVec),
-/// and size(DestVec) > 128-bits.
-/// This is achieved by doing the one extension from the SrcVec, splitting the
-/// result, extending these parts, and then concatenating these into the
-/// destination.
-static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
-  SDValue Op = N->getOperand(0);
-  EVT SrcVT = Op.getValueType();
-  EVT DestVT = N->getValueType(0);
-
-  assert(DestVT.getSizeInBits() > 128 &&
-         "Custom sext/zext expansion needs >128-bit vector.");
-  // If this is a normal length extension, use the default expansion.
-  if (SrcVT.getSizeInBits()*4 != DestVT.getSizeInBits() &&
-      SrcVT.getSizeInBits()*8 != DestVT.getSizeInBits())
-    return SDValue();
-
-  DebugLoc dl = N->getDebugLoc();
-  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
-  unsigned DestEltSize = DestVT.getVectorElementType().getSizeInBits();
-  unsigned NumElts = SrcVT.getVectorNumElements();
-  LLVMContext &Ctx = *DAG.getContext();
-  SDValue Mid, SplitLo, SplitHi, ExtLo, ExtHi;
-
-  EVT MidVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
-                               NumElts);
-  EVT SplitVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, SrcEltSize*2),
-                                 NumElts/2);
-  EVT ExtVT = EVT::getVectorVT(Ctx, EVT::getIntegerVT(Ctx, DestEltSize),
-                               NumElts/2);
-
-  Mid = DAG.getNode(N->getOpcode(), dl, MidVT, Op);
-  SplitLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
-                        DAG.getIntPtrConstant(0));
-  SplitHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SplitVT, Mid,
-                        DAG.getIntPtrConstant(NumElts/2));
-  ExtLo = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitLo);
-  ExtHi = DAG.getNode(N->getOpcode(), dl, ExtVT, SplitHi);
-  return DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, ExtLo, ExtHi);
-}
-
 /// ExpandBITCAST - If the target supports VFP, this function is called to
 /// expand a bit convert where either the source or destination type is i64 to
 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -3581,7 +3839,7 @@ static SDValue ExpandVectorExtension(SDNode *N, SelectionDAG &DAG) {
 /// vectors), since the legalizer won't know what to do with that.
 static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Op = N->getOperand(0);
 
   // This function is only supposed to be called for i64 types, either as the
@@ -3618,7 +3876,7 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
 /// not support i64 elements, so sometimes the zero vectors will need to be
 /// explicitly constructed.  Regardless, use a canonical VMOV to create the
 /// zero vector.
-static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, SDLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
   // The canonical modified immediate encoding of a zero vector is....0!
   SDValue EncodedVal = DAG.getTargetConstant(0, MVT::i32);
@@ -3634,7 +3892,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
@@ -3670,7 +3928,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
@@ -3703,7 +3961,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
   // so that the shift + and get folded into a bitfield extract.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
                               DAG.getConstant(Intrinsic::arm_get_fpscr,
                                               MVT::i32));
@@ -3718,7 +3976,7 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
 static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
                          const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   if (!ST->hasV6T2Ops())
     return SDValue();
@@ -3742,7 +4000,7 @@ static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
 /// vuzp:    = [k0 k1 k2 k3 k0 k1 k2 k3]  each ki is 8-bits)
 static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
   SDValue N0 = DAG.getNode(ISD::BITCAST, DL, VT8Bit, N->getOperand(0));
@@ -3764,7 +4022,7 @@ static SDValue getCTPOP16BitCounts(SDNode *N, SelectionDAG &DAG) {
 /// v4i16:Extracted = [k0    k1    k2    k3    ]
 static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   SDValue BitCounts = getCTPOP16BitCounts(N, DAG);
   if (VT.is64BitVector()) {
@@ -3799,7 +4057,7 @@ static SDValue lowerCTPOP16BitElements(SDNode *N, SelectionDAG &DAG) {
 ///
 static SDValue lowerCTPOP32BitElements(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   EVT VT16Bit = VT.is64BitVector() ? MVT::v4i16 : MVT::v8i16;
 
@@ -3838,7 +4096,7 @@ static SDValue LowerCTPOP(SDNode *N, SelectionDAG &DAG,
 static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
                           const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   if (!VT.isVector())
     return SDValue();
@@ -3873,7 +4131,7 @@ static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
 static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
                                 const ARMSubtarget *ST) {
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // We can get here for a node like i32 = ISD::SHL i32, i64
   if (VT != MVT::i64)
@@ -3919,7 +4177,7 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
   SDValue CC = Op.getOperand(2);
   EVT VT = Op.getValueType();
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (Op.getOperand(1).getValueType().isFloatingPoint()) {
     switch (SetCCOpcode) {
@@ -4177,18 +4435,26 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
 
 SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                                            const ARMSubtarget *ST) const {
-  if (!ST->useNEONForSinglePrecisionFP() || !ST->hasVFP3() || ST->hasD16())
+  if (!ST->hasVFP3())
     return SDValue();
 
+  bool IsDouble = Op.getValueType() == MVT::f64;
   ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
-  assert(Op.getValueType() == MVT::f32 &&
-         "ConstantFP custom lowering should only occur for f32.");
 
   // Try splatting with a VMOV.f32...
   APFloat FPVal = CFP->getValueAPF();
-  int ImmVal = ARM_AM::getFP32Imm(FPVal);
+  int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
+
   if (ImmVal != -1) {
-    DebugLoc DL = Op.getDebugLoc();
+    if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
+      // We have code in place to select a valid ConstantFP already, no need to
+      // do any mangling.
+      return Op;
+    }
+
+    // It's a float and we are trying to use NEON operations where
+    // possible. Lower it to a splat followed by an extract.
+    SDLoc DL(Op);
     SDValue NewVal = DAG.getTargetConstant(ImmVal, MVT::i32);
     SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
                                       NewVal);
@@ -4196,15 +4462,31 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
                        DAG.getConstant(0, MVT::i32));
   }
 
-  // If that fails, try a VMOV.i32
+  // The rest of our options are NEON only, make sure that's allowed before
+  // proceeding..
+  if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
+    return SDValue();
+
   EVT VMovVT;
-  unsigned iVal = FPVal.bitcastToAPInt().getZExtValue();
-  SDValue NewVal = isNEONModifiedImm(iVal, 0, 32, DAG, VMovVT, false,
-                                     VMOVModImm);
+  uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
+
+  // It wouldn't really be worth bothering for doubles except for one very
+  // important value, which does happen to match: 0.0. So make sure we don't do
+  // anything stupid.
+  if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
+    return SDValue();
+
+  // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
+  SDValue NewVal = isNEONModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+                                     false, VMOVModImm);
   if (NewVal != SDValue()) {
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
     SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
                                       NewVal);
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                        VecConstant);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
@@ -4212,11 +4494,16 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
   }
 
   // Finally, try a VMVN.i32
-  NewVal = isNEONModifiedImm(~iVal & 0xffffffff, 0, 32, DAG, VMovVT, false,
-                             VMVNModImm);
+  NewVal = isNEONModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, VMovVT,
+                             false, VMVNModImm);
   if (NewVal != SDValue()) {
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
     SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
+
+    if (IsDouble)
+      return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
+
+    // It's a float: cast and extract a vector element.
     SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
                                        VecConstant);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
@@ -4475,7 +4762,7 @@ static bool isReverseMask(ArrayRef<int> M, EVT VT) {
 // instruction, return an SDValue of such a constant (will become a MOV
 // instruction).  Otherwise return null.
 static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
-                                     const ARMSubtarget *ST, DebugLoc dl) {
+                                     const ARMSubtarget *ST, SDLoc dl) {
   uint64_t Val;
   if (!isa<ConstantSDNode>(N))
     return SDValue();
@@ -4496,7 +4783,7 @@ static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG,
 SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                                              const ARMSubtarget *ST) const {
   BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT VT = Op.getValueType();
 
   APInt SplatBits, SplatUndef;
@@ -4580,7 +4867,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   if (ValueCounts.size() == 0)
     return DAG.getUNDEF(VT);
 
-  if (isOnlyLowElement)
+  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
 
   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
@@ -4679,6 +4968,24 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
   return SDValue();
 }
 
@@ -4686,7 +4993,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
 // shuffle in combination with VEXTs.
 SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
                                               SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT VT = Op.getValueType();
   unsigned NumElts = VT.getVectorNumElements();
 
@@ -4875,7 +5182,7 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
 /// the specified operations to build the shuffle.
 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                       SDValue RHS, SelectionDAG &DAG,
-                                      DebugLoc dl) {
+                                      SDLoc dl) {
   unsigned OpNum = (PFEntry >> 26) & 0x0F;
   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
@@ -4955,7 +5262,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
   // Check to see if we can use the VTBL instruction.
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   SmallVector<SDValue, 8> VTBLMask;
   for (ArrayRef<int>::iterator
@@ -4974,7 +5281,7 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
 
 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
                                                       SelectionDAG &DAG) {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue OpLHS = Op.getOperand(0);
   EVT VT = OpLHS.getValueType();
 
@@ -4992,7 +5299,7 @@ static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
 static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT VT = Op.getValueType();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
@@ -5156,7 +5463,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
   SDValue Vec = Op.getOperand(0);
   if (Op.getValueType() == MVT::i32 &&
       Vec.getValueType().getVectorElementType().getSizeInBits() < 32) {
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
     return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
   }
 
@@ -5168,7 +5475,7 @@ static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   // two 64-bit vectors are concatenated to a 128-bit vector.
   assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
          "unexpected CONCAT_VECTORS");
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Val = DAG.getUNDEF(MVT::v2f64);
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
@@ -5291,7 +5598,7 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
   // Must extend size to at least 64 bits to be used as an operand for VMULL.
   EVT NewVT = getExtensionTo64Bits(OrigTy);
 
-  return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N);
+  return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
 }
 
 /// SkipLoadExtensionForVMULL - return a load of the original vector size that
@@ -5304,7 +5611,7 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
 
   // The load already has the right type.
   if (ExtendedTy == LD->getMemoryVT())
-    return DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
+    return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
                 LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
                 LD->isNonTemporal(), LD->isInvariant(),
                 LD->getAlignment());
@@ -5312,7 +5619,7 @@ static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
   // We need to create a zextload/sextload. We cannot just create a load
   // followed by a zext/zext node because LowerMUL is also run during normal
   // operation legalization where we can't create illegal types.
-  return DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(), ExtendedTy,
+  return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
                         LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
                         LD->getMemoryVT(), LD->isVolatile(),
                         LD->isNonTemporal(), LD->getAlignment());
@@ -5341,7 +5648,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
     assert(BVN->getOpcode() == ISD::BUILD_VECTOR &&
            BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
     unsigned LowElt = DAG.getTargetLoweringInfo().isBigEndian() ? 1 : 0;
-    return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), MVT::v2i32,
+    return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), MVT::v2i32,
                        BVN->getOperand(LowElt), BVN->getOperand(LowElt+2));
   }
   // Construct a new BUILD_VECTOR with elements truncated to half the size.
@@ -5358,7 +5665,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
     // The values are implicitly truncated so sext vs. zext doesn't matter.
     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
   }
-  return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
                      MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
 }
 
@@ -5430,7 +5737,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
   }
 
   // Legalize to a VMULL instruction.
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue Op0;
   SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
   if (!isMLA) {
@@ -5460,7 +5767,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
 }
 
 static SDValue
-LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
+LowerSDIV_v4i8(SDValue X, SDValue Y, SDLoc dl, SelectionDAG &DAG) {
   // Convert to float
   // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
   // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
@@ -5489,7 +5796,7 @@ LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) {
 }
 
 static SDValue
-LowerSDIV_v4i16(SDValue N0, SDValue N1, DebugLoc dl, SelectionDAG &DAG) {
+LowerSDIV_v4i16(SDValue N0, SDValue N1, SDLoc dl, SelectionDAG &DAG) {
   SDValue N2;
   // Convert to float.
   // float4 yf = vcvt_f32_s32(vmovl_s16(y));
@@ -5530,7 +5837,7 @@ static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
          "unexpected type for custom-lowering ISD::SDIV");
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue N0 = Op.getOperand(0);
   SDValue N1 = Op.getOperand(1);
   SDValue N2, N3;
@@ -5565,7 +5872,7 @@ static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG) {
   assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
          "unexpected type for custom-lowering ISD::UDIV");
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue N0 = Op.getOperand(0);
   SDValue N1 = Op.getOperand(1);
   SDValue N2, N3;
@@ -5649,12 +5956,76 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   }
 
   if (!ExtraOp)
-    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                        Op.getOperand(1));
-  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                      Op.getOperand(1), Op.getOperand(2));
 }
 
+SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin());
+
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // return values are passed via sret.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // Pair of floats / doubles used to pass the result.
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+
+  // Create stack object for sret.
+  const uint64_t ByteSize = TLI.getDataLayout()->getTypeAllocSize(RetTy);
+  const unsigned StackAlign = TLI.getDataLayout()->getPrefTypeAlignment(RetTy);
+  int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+  SDValue SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy());
+
+  ArgListTy Args;
+  ArgListEntry Entry;
+
+  Entry.Node = SRet;
+  Entry.Ty = RetTy->getPointerTo();
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Entry.isSRet = true;
+  Args.push_back(Entry);
+
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
+
+  const char *LibcallName  = (ArgVT == MVT::f64)
+  ? "__sincos_stret" : "__sincosf_stret";
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+  TargetLowering::
+  CallLoweringInfo CLI(DAG.getEntryNode(), Type::getVoidTy(*DAG.getContext()),
+                       false, false, false, false, 0,
+                       CallingConv::C, /*isTaillCall=*/false,
+                       /*doesNotRet=*/false, /*isReturnValueUsed*/false,
+                       Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+  SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
+                                MachinePointerInfo(), false, false, false, 0);
+
+  // Address of cos field.
+  SDValue Add = DAG.getNode(ISD::ADD, dl, getPointerTy(), SRet,
+                            DAG.getIntPtrConstant(ArgVT.getStoreSize()));
+  SDValue LoadCos = DAG.getLoad(ArgVT, dl, LoadSin.getValue(1), Add,
+                                MachinePointerInfo(), false, false, false, 0);
+
+  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys,
+                     LoadSin.getValue(0), LoadCos.getValue(0));
+}
+
 static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   // Monotonic load/store is legal for all targets
   if (cast<AtomicSDNode>(Op)->getOrdering() <= Monotonic)
@@ -5665,40 +6036,73 @@ static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }
 
-
 static void
 ReplaceATOMIC_OP_64(SDNode *Node, SmallVectorImpl<SDValue>& Results,
-                    SelectionDAG &DAG, unsigned NewOp) {
-  DebugLoc dl = Node->getDebugLoc();
+                    SelectionDAG &DAG) {
+  SDLoc dl(Node);
   assert (Node->getValueType(0) == MVT::i64 &&
           "Only know how to expand i64 atomics");
+  AtomicSDNode *AN = cast<AtomicSDNode>(Node);
 
   SmallVector<SDValue, 6> Ops;
   Ops.push_back(Node->getOperand(0)); // Chain
   Ops.push_back(Node->getOperand(1)); // Ptr
-  // Low part of Val1
-  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                            Node->getOperand(2), DAG.getIntPtrConstant(0)));
-  // High part of Val1
-  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                            Node->getOperand(2), DAG.getIntPtrConstant(1)));
-  if (NewOp == ARMISD::ATOMCMPXCHG64_DAG) {
-    // High part of Val1
+  for(unsigned i=2; i<Node->getNumOperands(); i++) {
+    // Low part
     Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                              Node->getOperand(3), DAG.getIntPtrConstant(0)));
-    // High part of Val2
+                              Node->getOperand(i), DAG.getIntPtrConstant(0)));
+    // High part
     Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                              Node->getOperand(3), DAG.getIntPtrConstant(1)));
+                              Node->getOperand(i), DAG.getIntPtrConstant(1)));
   }
   SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
   SDValue Result =
-    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops.data(), Ops.size(), MVT::i64,
-                            cast<MemSDNode>(Node)->getMemOperand());
+    DAG.getAtomic(Node->getOpcode(), dl, MVT::i64, Tys, Ops.data(), Ops.size(),
+                  cast<MemSDNode>(Node)->getMemOperand(), AN->getOrdering(),
+                  AN->getSynchScope());
   SDValue OpsF[] = { Result.getValue(0), Result.getValue(1) };
   Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
   Results.push_back(Result.getValue(2));
 }
 
+static void ReplaceREADCYCLECOUNTER(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget) {
+  SDLoc DL(N);
+  SDValue Cycles32, OutChain;
+
+  if (Subtarget->hasPerfMon()) {
+    // Under Power Management extensions, the cycle-count is:
+    //    mrc p15, #0, <Rt>, c9, c13, #0
+    SDValue Ops[] = { N->getOperand(0), // Chain
+                      DAG.getConstant(Intrinsic::arm_mrc, MVT::i32),
+                      DAG.getConstant(15, MVT::i32),
+                      DAG.getConstant(0, MVT::i32),
+                      DAG.getConstant(9, MVT::i32),
+                      DAG.getConstant(13, MVT::i32),
+                      DAG.getConstant(0, MVT::i32)
+    };
+
+    Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
+                           DAG.getVTList(MVT::i32, MVT::Other), &Ops[0],
+                           array_lengthof(Ops));
+    OutChain = Cycles32.getValue(1);
+  } else {
+    // Intrinsic is defined to return 0 on unsupported platforms. Technically
+    // there are older ARM CPUs that have implementation-specific ways of
+    // obtaining this information (FIXME!).
+    Cycles32 = DAG.getConstant(0, MVT::i32);
+    OutChain = DAG.getEntryNode();
+  }
+
+
+  SDValue Cycles64 = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64,
+                                 Cycles32, DAG.getConstant(0, MVT::i32));
+  Results.push_back(Cycles64);
+  Results.push_back(OutChain);
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Don't know how to custom lower this!");
@@ -5753,6 +6157,9 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
+  case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
   }
 }
 
@@ -5768,49 +6175,28 @@ void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::BITCAST:
     Res = ExpandBITCAST(N, DAG);
     break;
-  case ISD::SIGN_EXTEND:
-  case ISD::ZERO_EXTEND:
-    Res = ExpandVectorExtension(N, DAG);
-    break;
   case ISD::SRL:
   case ISD::SRA:
     Res = Expand64BitShift(N, DAG, Subtarget);
     break;
-  case ISD::ATOMIC_LOAD_ADD:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMADD64_DAG);
+  case ISD::READCYCLECOUNTER:
+    ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
     return;
+  case ISD::ATOMIC_STORE:
+  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_LOAD_ADD:
   case ISD::ATOMIC_LOAD_AND:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMAND64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_NAND:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMNAND64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_OR:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMOR64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_SUB:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSUB64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_XOR:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMXOR64_DAG);
-    return;
   case ISD::ATOMIC_SWAP:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMSWAP64_DAG);
-    return;
   case ISD::ATOMIC_CMP_SWAP:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMCMPXCHG64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_MIN:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMIN64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_UMIN:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMIN64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_MAX:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMMAX64_DAG);
-    return;
   case ISD::ATOMIC_LOAD_UMAX:
-    ReplaceATOMIC_OP_64(N, Results, DAG, ARMISD::ATOMUMAX64_DAG);
+    ReplaceATOMIC_OP_64(N, Results, DAG);
     return;
   }
   if (Res.getNode())
@@ -5830,6 +6216,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   unsigned oldval  = MI->getOperand(2).getReg();
   unsigned newval  = MI->getOperand(3).getReg();
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -5845,21 +6232,7 @@ ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
   }
 
   unsigned ldrOpc, strOpc;
-  switch (Size) {
-  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
-  case 1:
-    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
-    break;
-  case 2:
-    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
-    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
-    break;
-  case 4:
-    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
-    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
-    break;
-  }
+  getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
 
   MachineFunction *MF = BB->getParent();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -5939,6 +6312,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   unsigned dest = MI->getOperand(0).getReg();
   unsigned ptr = MI->getOperand(1).getReg();
   unsigned incr = MI->getOperand(2).getReg();
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -5946,24 +6320,11 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   if (isThumb2) {
     MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
   }
 
   unsigned ldrOpc, strOpc;
-  switch (Size) {
-  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
-  case 1:
-    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
-    break;
-  case 2:
-    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
-    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
-    break;
-  case 4:
-    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
-    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
-    break;
-  }
+  getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
 
   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
@@ -6047,6 +6408,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   unsigned ptr = MI->getOperand(1).getReg();
   unsigned incr = MI->getOperand(2).getReg();
   unsigned oldval = dest;
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -6054,24 +6416,20 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
   if (isThumb2) {
     MRI.constrainRegClass(dest, &ARM::rGPRRegClass);
     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(incr, &ARM::rGPRRegClass);
   }
 
   unsigned ldrOpc, strOpc, extendOpc;
+  getExclusiveOperation(Size, Ord, isThumb2, ldrOpc, strOpc);
   switch (Size) {
-  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  default: llvm_unreachable("unsupported size for AtomicBinaryMinMax!");
   case 1:
-    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
-    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
     extendOpc = isThumb2 ? ARM::t2SXTB : ARM::SXTB;
     break;
   case 2:
-    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
-    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
     extendOpc = isThumb2 ? ARM::t2SXTH : ARM::SXTH;
     break;
   case 4:
-    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
-    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
     extendOpc = 0;
     break;
   }
@@ -6115,7 +6473,10 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
 
   // Sign extend the value, if necessary.
   if (signExtend && extendOpc) {
-    oldval = MRI.createVirtualRegister(&ARM::GPRRegClass);
+    oldval = MRI.createVirtualRegister(isThumb2 ? &ARM::rGPRRegClass
+                                                : &ARM::GPRnopcRegClass);
+    if (!isThumb2)
+      MRI.constrainRegClass(dest, &ARM::GPRnopcRegClass);
     AddDefaultPred(BuildMI(BB, dl, TII->get(extendOpc), oldval)
                      .addReg(dest)
                      .addImm(0));
@@ -6153,7 +6514,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
                                       unsigned Op1, unsigned Op2,
                                       bool NeedsCarry, bool IsCmpxchg,
                                       bool IsMinMax, ARMCC::CondCodes CC) const {
-  // This also handles ATOMIC_SWAP, indicated by Op1==0.
+  // This also handles ATOMIC_SWAP and ATOMIC_STORE, indicated by Op1==0.
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
 
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
@@ -6161,11 +6522,15 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   MachineFunction::iterator It = BB;
   ++It;
 
+  bool isStore = (MI->getOpcode() == ARM::ATOMIC_STORE_I64);
+  unsigned offset = (isStore ? -2 : 0);
   unsigned destlo = MI->getOperand(0).getReg();
   unsigned desthi = MI->getOperand(1).getReg();
-  unsigned ptr = MI->getOperand(2).getReg();
-  unsigned vallo = MI->getOperand(3).getReg();
-  unsigned valhi = MI->getOperand(4).getReg();
+  unsigned ptr = MI->getOperand(offset+2).getReg();
+  unsigned vallo = MI->getOperand(offset+3).getReg();
+  unsigned valhi = MI->getOperand(offset+4).getReg();
+  unsigned OrdIdx = offset + (IsCmpxchg ? 7 : 5);
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(OrdIdx).getImm());
   DebugLoc dl = MI->getDebugLoc();
   bool isThumb2 = Subtarget->isThumb2();
 
@@ -6174,8 +6539,13 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
     MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
     MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
     MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(vallo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(valhi, &ARM::rGPRRegClass);
   }
 
+  unsigned ldrOpc, strOpc;
+  getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
+
   MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
   MachineBasicBlock *contBB = 0, *cont2BB = 0;
   if (IsCmpxchg || IsMinMax)
@@ -6215,21 +6585,23 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   //   fallthrough --> exitMBB
   BB = loopMBB;
 
-  // Load
-  if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2LDREXD))
-                   .addReg(destlo, RegState::Define)
-                   .addReg(desthi, RegState::Define)
-                   .addReg(ptr));
-  } else {
-    unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDREXD))
-                   .addReg(GPRPair0, RegState::Define).addReg(ptr));
-    // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
-      .addReg(GPRPair0, 0, ARM::gsub_0);
-    BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
-      .addReg(GPRPair0, 0, ARM::gsub_1);
+  if (!isStore) {
+    // Load
+    if (isThumb2) {
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+                     .addReg(destlo, RegState::Define)
+                     .addReg(desthi, RegState::Define)
+                     .addReg(ptr));
+    } else {
+      unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+      AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc))
+                     .addReg(GPRPair0, RegState::Define).addReg(ptr));
+      // Copy r2/r3 into dest.  (This copy will normally be coalesced.)
+      BuildMI(BB, dl, TII->get(TargetOpcode::COPY), destlo)
+        .addReg(GPRPair0, 0, ARM::gsub_0);
+      BuildMI(BB, dl, TII->get(TargetOpcode::COPY), desthi)
+        .addReg(GPRPair0, 0, ARM::gsub_1);
+    }
   }
 
   unsigned StoreLo, StoreHi;
@@ -6281,7 +6653,9 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
 
   // Store
   if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2STREXD), storesuccess)
+    MRI.constrainRegClass(StoreLo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(StoreHi, &ARM::rGPRRegClass);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
                    .addReg(StoreLo).addReg(StoreHi).addReg(ptr));
   } else {
     // Marshal a pair...
@@ -6299,7 +6673,7 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
       .addImm(ARM::gsub_1);
 
     // ...and store it
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::STREXD), storesuccess)
+    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), storesuccess)
                    .addReg(StorePair).addReg(ptr));
   }
   // Cmp+jump
@@ -6320,6 +6694,51 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB,
   return BB;
 }
 
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicLoad64(MachineInstr *MI, MachineBasicBlock *BB) const {
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  unsigned destlo = MI->getOperand(0).getReg();
+  unsigned desthi = MI->getOperand(1).getReg();
+  unsigned ptr = MI->getOperand(2).getReg();
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+  if (isThumb2) {
+    MRI.constrainRegClass(destlo, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(desthi, &ARM::rGPRRegClass);
+    MRI.constrainRegClass(ptr, &ARM::rGPRRegClass);
+  }
+  unsigned ldrOpc, strOpc;
+  getExclusiveOperation(8, Ord, isThumb2, ldrOpc, strOpc);
+
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(ldrOpc));
+
+  if (isThumb2) {
+    MIB.addReg(destlo, RegState::Define)
+       .addReg(desthi, RegState::Define)
+       .addReg(ptr);
+
+  } else {
+    unsigned GPRPair0 = MRI.createVirtualRegister(&ARM::GPRPairRegClass);
+    MIB.addReg(GPRPair0, RegState::Define).addReg(ptr);
+
+    // Copy GPRPair0 into dest.  (This copy will normally be coalesced.)
+    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), destlo)
+      .addReg(GPRPair0, 0, ARM::gsub_0);
+    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), desthi)
+      .addReg(GPRPair0, 0, ARM::gsub_1);
+  }
+  AddDefaultPred(MIB);
+
+  MI->eraseFromParent();   // The instruction is gone now.
+
+  return BB;
+}
+
 /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
 /// registers the function context.
 void ARMTargetLowering::
@@ -6851,8 +7270,109 @@ MachineBasicBlock *OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ) {
   llvm_unreachable("Expecting a BB with two successors!");
 }
 
-MachineBasicBlock *ARMTargetLowering::
-EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
+/// Return the load opcode for a given load size. If load size >= 8,
+/// neon opcode will be returned.
+static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
+  if (LdSize >= 8)
+    return LdSize == 16 ? ARM::VLD1q32wb_fixed
+                        : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
+  if (IsThumb1)
+    return LdSize == 4 ? ARM::tLDRi
+                       : LdSize == 2 ? ARM::tLDRHi
+                                     : LdSize == 1 ? ARM::tLDRBi : 0;
+  if (IsThumb2)
+    return LdSize == 4 ? ARM::t2LDR_POST
+                       : LdSize == 2 ? ARM::t2LDRH_POST
+                                     : LdSize == 1 ? ARM::t2LDRB_POST : 0;
+  return LdSize == 4 ? ARM::LDR_POST_IMM
+                     : LdSize == 2 ? ARM::LDRH_POST
+                                   : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
+}
+
+/// Return the store opcode for a given store size. If store size >= 8,
+/// neon opcode will be returned.
+static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
+  if (StSize >= 8)
+    return StSize == 16 ? ARM::VST1q32wb_fixed
+                        : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
+  if (IsThumb1)
+    return StSize == 4 ? ARM::tSTRi
+                       : StSize == 2 ? ARM::tSTRHi
+                                     : StSize == 1 ? ARM::tSTRBi : 0;
+  if (IsThumb2)
+    return StSize == 4 ? ARM::t2STR_POST
+                       : StSize == 2 ? ARM::t2STRH_POST
+                                     : StSize == 1 ? ARM::t2STRB_POST : 0;
+  return StSize == 4 ? ARM::STR_POST_IMM
+                     : StSize == 2 ? ARM::STRH_POST
+                                   : StSize == 1 ? ARM::STRB_POST_IMM : 0;
+}
+
+/// Emit a post-increment load operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostLd(MachineBasicBlock *BB, MachineInstr *Pos,
+                       const TargetInstrInfo *TII, DebugLoc dl,
+                       unsigned LdSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
+  assert(LdOpc != 0 && "Should have a load opcode");
+  if (LdSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(0));
+  } else if (IsThumb1) {
+    // load + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(LdSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addImm(LdSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
+                       .addReg(AddrOut, RegState::Define).addReg(AddrIn)
+                       .addReg(0).addImm(LdSize));
+  }
+}
+
+/// Emit a post-increment store operation with given size. The instructions
+/// will be added to BB at Pos.
+static void emitPostSt(MachineBasicBlock *BB, MachineInstr *Pos,
+                       const TargetInstrInfo *TII, DebugLoc dl,
+                       unsigned StSize, unsigned Data, unsigned AddrIn,
+                       unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
+  unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
+  assert(StOpc != 0 && "Should have a store opcode");
+  if (StSize >= 8) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(AddrIn).addImm(0).addReg(Data));
+  } else if (IsThumb1) {
+    // store + update AddrIn
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc)).addReg(Data)
+                       .addReg(AddrIn).addImm(0));
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(AddrIn).addImm(StSize);
+    AddDefaultPred(MIB);
+  } else if (IsThumb2) {
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addImm(StSize));
+  } else { // arm
+    AddDefaultPred(BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
+                       .addReg(Data).addReg(AddrIn).addReg(0)
+                       .addImm(StSize));
+  }
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitStructByval(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const {
   // This pseudo instruction has 3 operands: dst, src, size
   // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
   // Otherwise, we will generate unrolled scalar copies.
@@ -6867,23 +7387,18 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   unsigned Align = MI->getOperand(3).getImm();
   DebugLoc dl = MI->getDebugLoc();
 
-  bool isThumb2 = Subtarget->isThumb2();
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
-  unsigned ldrOpc, strOpc, UnitSize = 0;
+  unsigned UnitSize = 0;
+  const TargetRegisterClass *TRC = 0;
+  const TargetRegisterClass *VecTRC = 0;
 
-  const TargetRegisterClass *TRC = isThumb2 ?
-    (const TargetRegisterClass*)&ARM::tGPRRegClass :
-    (const TargetRegisterClass*)&ARM::GPRRegClass;
-  const TargetRegisterClass *TRC_Vec = 0;
+  bool IsThumb1 = Subtarget->isThumb1Only();
+  bool IsThumb2 = Subtarget->isThumb2();
 
   if (Align & 1) {
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     UnitSize = 1;
   } else if (Align & 2) {
-    ldrOpc = isThumb2 ? ARM::t2LDRH_POST : ARM::LDRH_POST;
-    strOpc = isThumb2 ? ARM::t2STRH_POST : ARM::STRH_POST;
     UnitSize = 2;
   } else {
     // Check whether we can use NEON instructions.
@@ -6891,27 +7406,27 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
           hasAttribute(AttributeSet::FunctionIndex,
                        Attribute::NoImplicitFloat) &&
         Subtarget->hasNEON()) {
-      if ((Align % 16 == 0) && SizeVal >= 16) {
-        ldrOpc = ARM::VLD1q32wb_fixed;
-        strOpc = ARM::VST1q32wb_fixed;
+      if ((Align % 16 == 0) && SizeVal >= 16)
         UnitSize = 16;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPairRegClass;
-      }
-      else if ((Align % 8 == 0) && SizeVal >= 8) {
-        ldrOpc = ARM::VLD1d32wb_fixed;
-        strOpc = ARM::VST1d32wb_fixed;
+      else if ((Align % 8 == 0) && SizeVal >= 8)
         UnitSize = 8;
-        TRC_Vec = (const TargetRegisterClass*)&ARM::DPRRegClass;
-      }
     }
     // Can't use NEON instructions.
-    if (UnitSize == 0) {
-      ldrOpc = isThumb2 ? ARM::t2LDR_POST : ARM::LDR_POST_IMM;
-      strOpc = isThumb2 ? ARM::t2STR_POST : ARM::STR_POST_IMM;
+    if (UnitSize == 0)
       UnitSize = 4;
-    }
   }
 
+  // Select the correct opcode and register class for unit size load/store
+  bool IsNeon = UnitSize >= 8;
+  TRC = (IsThumb1 || IsThumb2) ? (const TargetRegisterClass *)&ARM::tGPRRegClass
+                               : (const TargetRegisterClass *)&ARM::GPRRegClass;
+  if (IsNeon)
+    VecTRC = UnitSize == 16
+                 ? (const TargetRegisterClass *)&ARM::DPairRegClass
+                 : UnitSize == 8
+                       ? (const TargetRegisterClass *)&ARM::DPRRegClass
+                       : 0;
+
   unsigned BytesLeft = SizeVal % UnitSize;
   unsigned LoopSize = SizeVal - BytesLeft;
 
@@ -6922,34 +7437,13 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
     unsigned srcIn = src;
     unsigned destIn = dest;
     for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
-      unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
       unsigned srcOut = MRI.createVirtualRegister(TRC);
       unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (UnitSize >= 8) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(0));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(destIn).addImm(0).addReg(scratch));
-      } else if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addImm(UnitSize));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc), scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0)
-          .addImm(UnitSize));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(UnitSize));
-      }
+      unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+      emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
       srcIn = srcOut;
       destIn = destOut;
     }
@@ -6957,30 +7451,14 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
     // Handle the leftover bytes with LDRB and STRB.
     // [scratch, srcOut] = LDRB_POST(srcIn, 1)
     // [destOut] = STRB_POST(scratch, destIn, 1)
-    ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-    strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
     for (unsigned i = 0; i < BytesLeft; i++) {
-      unsigned scratch = MRI.createVirtualRegister(TRC);
       unsigned srcOut = MRI.createVirtualRegister(TRC);
       unsigned destOut = MRI.createVirtualRegister(TRC);
-      if (isThumb2) {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(1));
-      } else {
-        AddDefaultPred(BuildMI(*BB, MI, dl,
-          TII->get(ldrOpc),scratch)
-          .addReg(srcOut, RegState::Define).addReg(srcIn)
-          .addReg(0).addImm(1));
-
-        AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
-          .addReg(scratch).addReg(destIn)
-          .addReg(0).addImm(1));
-      }
+      unsigned scratch = MRI.createVirtualRegister(TRC);
+      emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
+                 IsThumb1, IsThumb2);
+      emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
+                 IsThumb1, IsThumb2);
       srcIn = srcOut;
       destIn = destOut;
     }
@@ -7021,17 +7499,16 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
 
   // Load an immediate to varEnd.
   unsigned varEnd = MRI.createVirtualRegister(TRC);
-  if (isThumb2) {
-    unsigned VReg1 = varEnd;
+  if (IsThumb2) {
+    unsigned Vtmp = varEnd;
     if ((LoopSize & 0xFFFF0000) != 0)
-      VReg1 = MRI.createVirtualRegister(TRC);
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), VReg1)
-                   .addImm(LoopSize & 0xFFFF));
+      Vtmp = MRI.createVirtualRegister(TRC);
+    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVi16), Vtmp)
+                       .addImm(LoopSize & 0xFFFF));
 
     if ((LoopSize & 0xFFFF0000) != 0)
       AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd)
-                     .addReg(VReg1)
-                     .addImm(LoopSize >> 16));
+                         .addReg(Vtmp).addImm(LoopSize >> 16));
   } else {
     MachineConstantPool *ConstantPool = MF->getConstantPool();
     Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext());
@@ -7043,10 +7520,12 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
       Align = getDataLayout()->getTypeAllocSize(C->getType());
     unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
 
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
-                   .addReg(varEnd, RegState::Define)
-                   .addConstantPoolIndex(Idx)
-                   .addImm(0));
+    if (IsThumb1)
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx));
+    else
+      AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp)).addReg(
+          varEnd, RegState::Define).addConstantPoolIndex(Idx).addImm(0));
   }
   BB->addSuccessor(loopMBB);
 
@@ -7075,39 +7554,30 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
 
   //   [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
   //   [destLoop] = STR_POST(scratch, destPhi, UnitSiz)
-  unsigned scratch = MRI.createVirtualRegister(UnitSize >= 8 ? TRC_Vec:TRC);
-  if (UnitSize >= 8) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(0));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(destPhi).addImm(0).addReg(scratch));
-  } else if (isThumb2) {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addImm(UnitSize));
-  } else {
-    AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), scratch)
-      .addReg(srcLoop, RegState::Define).addReg(srcPhi).addReg(0)
-      .addImm(UnitSize));
-
-    AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), destLoop)
-      .addReg(scratch).addReg(destPhi)
-      .addReg(0).addImm(UnitSize));
-  }
+  unsigned scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
+  emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
+             IsThumb1, IsThumb2);
+  emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
+             IsThumb1, IsThumb2);
 
   // Decrement loop variable by UnitSize.
-  MachineInstrBuilder MIB = BuildMI(BB, dl,
-    TII->get(isThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
-  AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
-  MIB->getOperand(5).setReg(ARM::CPSR);
-  MIB->getOperand(5).setIsDef(true);
-
-  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
-    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  if (IsThumb1) {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop);
+    MIB = AddDefaultT1CC(MIB);
+    MIB.addReg(varPhi).addImm(UnitSize);
+    AddDefaultPred(MIB);
+  } else {
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, BB->end(), dl,
+                TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
+    AddDefaultCC(AddDefaultPred(MIB.addReg(varPhi).addImm(UnitSize)));
+    MIB->getOperand(5).setReg(ARM::CPSR);
+    MIB->getOperand(5).setIsDef(true);
+  }
+  BuildMI(*BB, BB->end(), dl,
+          TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
+      .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
 
   // loopMBB can loop back to loopMBB or fall through to exitMBB.
   BB->addSuccessor(loopMBB);
@@ -7116,34 +7586,19 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
   // Add epilogue to handle BytesLeft.
   BB = exitMBB;
   MachineInstr *StartOfExit = exitMBB->begin();
-  ldrOpc = isThumb2 ? ARM::t2LDRB_POST : ARM::LDRB_POST_IMM;
-  strOpc = isThumb2 ? ARM::t2STRB_POST : ARM::STRB_POST_IMM;
 
   //   [scratch, srcOut] = LDRB_POST(srcLoop, 1)
   //   [destOut] = STRB_POST(scratch, destLoop, 1)
   unsigned srcIn = srcLoop;
   unsigned destIn = destLoop;
   for (unsigned i = 0; i < BytesLeft; i++) {
-    unsigned scratch = MRI.createVirtualRegister(TRC);
     unsigned srcOut = MRI.createVirtualRegister(TRC);
     unsigned destOut = MRI.createVirtualRegister(TRC);
-    if (isThumb2) {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addImm(1));
-    } else {
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl,
-        TII->get(ldrOpc),scratch)
-        .addReg(srcOut, RegState::Define).addReg(srcIn).addReg(0).addImm(1));
-
-      AddDefaultPred(BuildMI(*BB, StartOfExit, dl, TII->get(strOpc), destOut)
-        .addReg(scratch).addReg(destIn)
-        .addReg(0).addImm(1));
-    }
+    unsigned scratch = MRI.createVirtualRegister(TRC);
+    emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
+               IsThumb1, IsThumb2);
+    emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
+               IsThumb1, IsThumb2);
     srcIn = srcOut;
     destIn = destOut;
   }
@@ -7293,46 +7748,49 @@ ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
   case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
 
+  case ARM::ATOMIC_LOAD_I64:
+    return EmitAtomicLoad64(MI, BB);
 
-  case ARM::ATOMADD6432:
+  case ARM::ATOMIC_LOAD_ADD_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr,
                               isThumb2 ? ARM::t2ADCrr : ARM::ADCrr,
                               /*NeedsCarry*/ true);
-  case ARM::ATOMSUB6432:
+  case ARM::ATOMIC_LOAD_SUB_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true);
-  case ARM::ATOMOR6432:
+  case ARM::ATOMIC_LOAD_OR_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr,
                               isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
-  case ARM::ATOMXOR6432:
+  case ARM::ATOMIC_LOAD_XOR_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2EORrr : ARM::EORrr,
                               isThumb2 ? ARM::t2EORrr : ARM::EORrr);
-  case ARM::ATOMAND6432:
+  case ARM::ATOMIC_LOAD_AND_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr,
                               isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
-  case ARM::ATOMSWAP6432:
+  case ARM::ATOMIC_STORE_I64:
+  case ARM::ATOMIC_SWAP_I64:
     return EmitAtomicBinary64(MI, BB, 0, 0, false);
-  case ARM::ATOMCMPXCHG6432:
+  case ARM::ATOMIC_CMP_SWAP_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ false, /*IsCmpxchg*/true);
-  case ARM::ATOMMIN6432:
+  case ARM::ATOMIC_LOAD_MIN_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
                               /*IsMinMax*/ true, ARMCC::LT);
-  case ARM::ATOMMAX6432:
+  case ARM::ATOMIC_LOAD_MAX_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
                               /*IsMinMax*/ true, ARMCC::GE);
-  case ARM::ATOMUMIN6432:
+  case ARM::ATOMIC_LOAD_UMIN_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
                               /*IsMinMax*/ true, ARMCC::LO);
-  case ARM::ATOMUMAX6432:
+  case ARM::ATOMIC_LOAD_UMAX_I64:
     return EmitAtomicBinary64(MI, BB, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr,
                               isThumb2 ? ARM::t2SBCrr : ARM::SBCrr,
                               /*NeedsCarry*/ true, /*IsCmpxchg*/false,
@@ -7710,13 +8168,13 @@ SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
 
   // Slct is now know to be the desired identity constant when CC is true.
   SDValue TrueVal = OtherOp;
-  SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+  SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
                                  OtherOp, NonConstantVal);
   // Unless SwapSelectOps says CC should be false.
   if (SwapSelectOps)
     std::swap(TrueVal, FalseVal);
 
-  return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+  return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
                      CCOp, TrueVal, FalseVal);
 }
 
@@ -7823,9 +8281,9 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
       llvm_unreachable("Invalid vector element type for padd optimization.");
   }
 
-  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
                             widenType, &Ops[0], Ops.size());
-  return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
+  return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, tmp);
 }
 
 static SDValue findMUL_LOHI(SDValue V) {
@@ -7868,8 +8326,11 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
 
   assert(AddcNode->getNumValues() == 2 &&
          AddcNode->getValueType(0) == MVT::i32 &&
-         AddcNode->getValueType(1) == MVT::Glue &&
-         "Expect ADDC with two result values: i32, glue");
+         "Expect ADDC with two result values. First: i32");
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
 
   // Check that the ADDC adds the low result of the S/UMUL_LOHI.
   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
@@ -7950,7 +8411,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
   Ops.push_back(*LowAdd);
   Ops.push_back(*HiAdd);
 
-  SDValue MLALNode =  DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
+  SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
                                  DAG.getVTList(MVT::i32, MVT::i32),
                                  &Ops[0], Ops.size());
 
@@ -8038,6 +8499,13 @@ static SDValue PerformSUBCombine(SDNode *N,
 /// is faster than
 ///   vadd d3, d0, d1
 ///   vmul d3, d3, d2
+//  However, for (A + B) * (A + B),
+//    vadd d2, d0, d1
+//    vmul d3, d0, d2
+//    vmla d3, d1, d2
+//  is slower than
+//    vadd d2, d0, d1
+//    vmul d3, d2, d2
 static SDValue PerformVMULCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const ARMSubtarget *Subtarget) {
@@ -8057,8 +8525,11 @@ static SDValue PerformVMULCombine(SDNode *N,
     std::swap(N0, N1);
   }
 
+  if (N0 == N1)
+    return SDValue();
+
   EVT VT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   SDValue N00 = N0->getOperand(0);
   SDValue N01 = N0->getOperand(1);
   return DAG.getNode(Opcode, DL, VT,
@@ -8088,11 +8559,11 @@ static SDValue PerformMULCombine(SDNode *N,
     return SDValue();
 
   int64_t MulAmt = C->getSExtValue();
-  unsigned ShiftAmt = CountTrailingZeros_64(MulAmt);
+  unsigned ShiftAmt = countTrailingZeros<uint64_t>(MulAmt);
 
   ShiftAmt = ShiftAmt & (32 - 1);
   SDValue V = N->getOperand(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   SDValue Res;
   MulAmt >>= ShiftAmt;
@@ -8156,7 +8627,7 @@ static SDValue PerformANDCombine(SDNode *N,
 
   // Attempt to use immediate-form VBIC
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
 
@@ -8199,7 +8670,7 @@ static SDValue PerformORCombine(SDNode *N,
                                 const ARMSubtarget *Subtarget) {
   // Attempt to use immediate-form VORR
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
 
@@ -8248,22 +8719,29 @@ static SDValue PerformORCombine(SDNode *N,
     unsigned SplatBitSize;
     bool HasAnyUndefs;
 
+    APInt SplatBits0, SplatBits1;
     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
-    APInt SplatBits0;
+    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+    // Ensure that the second operand of both ands are constants
     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
-                                  HasAnyUndefs) && !HasAnyUndefs) {
-      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
-      APInt SplatBits1;
-      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
-                                    HasAnyUndefs) && !HasAnyUndefs &&
-          SplatBits0 == ~SplatBits1) {
-        // Canonicalize the vector type to make instruction selection simpler.
-        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
-        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
-                                     N0->getOperand(1), N0->getOperand(0),
-                                     N1->getOperand(0));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
-      }
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                          HasAnyUndefs) && !HasAnyUndefs) {
+            // Ensure that the bit width of the constants are the same and that
+            // the splat arguments are logical inverses as per the pattern we
+            // are trying to simplify.
+            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+                SplatBits0 == ~SplatBits1) {
+                // Canonicalize the vector type to make instruction selection
+                // simpler.
+                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                             N0->getOperand(1),
+                                             N0->getOperand(0),
+                                             N1->getOperand(0));
+                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+            }
+        }
     }
   }
 
@@ -8274,7 +8752,7 @@ static SDValue PerformORCombine(SDNode *N,
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   // 1) or (and A, mask), val => ARMbfi A, val, mask
   //      iff (val & mask) == val
   //
@@ -8309,7 +8787,7 @@ static SDValue PerformORCombine(SDNode *N,
       return SDValue();
 
     if (ARM::isBitFieldInvertedMask(Mask)) {
-      Val >>= CountTrailingZeros_32(~Mask);
+      Val >>= countTrailingZeros(~Mask);
 
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
                         DAG.getConstant(Val, MVT::i32),
@@ -8336,7 +8814,7 @@ static SDValue PerformORCombine(SDNode *N,
           (Mask == 0xffff || Mask == 0xffff0000))
         return SDValue();
       // 2a
-      unsigned amt = CountTrailingZeros_32(Mask2);
+      unsigned amt = countTrailingZeros(Mask2);
       Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
                         DAG.getConstant(amt, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
@@ -8352,7 +8830,7 @@ static SDValue PerformORCombine(SDNode *N,
           (Mask2 == 0xffff || Mask2 == 0xffff0000))
         return SDValue();
       // 2b
-      unsigned lsb = CountTrailingZeros_32(Mask);
+      unsigned lsb = countTrailingZeros(Mask);
       Res = DAG.getNode(ISD::SRL, DL, VT, N00,
                         DAG.getConstant(lsb, MVT::i32));
       Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
@@ -8370,7 +8848,7 @@ static SDValue PerformORCombine(SDNode *N,
     // where lsb(mask) == #shamt and masked bits of B are known zero.
     SDValue ShAmt = N00.getOperand(1);
     unsigned ShAmtC = cast<ConstantSDNode>(ShAmt)->getZExtValue();
-    unsigned LSB = CountTrailingZeros_32(Mask);
+    unsigned LSB = countTrailingZeros(Mask);
     if (ShAmtC != LSB)
       return SDValue();
 
@@ -8413,12 +8891,12 @@ static SDValue PerformBFICombine(SDNode *N,
     if (!N11C)
       return SDValue();
     unsigned InvMask = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
-    unsigned LSB = CountTrailingZeros_32(~InvMask);
-    unsigned Width = (32 - CountLeadingZeros_32(~InvMask)) - LSB;
+    unsigned LSB = countTrailingZeros(~InvMask);
+    unsigned Width = (32 - countLeadingZeros(~InvMask)) - LSB;
     unsigned Mask = (1 << Width)-1;
     unsigned Mask2 = N11C->getZExtValue();
     if ((Mask & (~Mask2)) == 0)
-      return DCI.DAG.getNode(ARMISD::BFI, N->getDebugLoc(), N->getValueType(0),
+      return DCI.DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
                              N->getOperand(0), N1.getOperand(0),
                              N->getOperand(2));
   }
@@ -8444,7 +8922,7 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
     LoadSDNode *LD = cast<LoadSDNode>(InNode);
 
     SelectionDAG &DAG = DCI.DAG;
-    DebugLoc DL = LD->getDebugLoc();
+    SDLoc DL(LD);
     SDValue BasePtr = LD->getBasePtr();
     SDValue NewLD1 = DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr,
                                  LD->getPointerInfo(), LD->isVolatile(),
@@ -8481,7 +8959,7 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
   if (Op0.getOpcode() == ARMISD::VMOVRRD &&
       Op0.getNode() == Op1.getNode() &&
       Op0.getResNo() == 0 && Op1.getResNo() == 1)
-    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(),
+    return DAG.getNode(ISD::BITCAST, SDLoc(N),
                        N->getValueType(0), Op0.getOperand(0));
   return SDValue();
 }
@@ -8523,7 +9001,7 @@ static SDValue PerformSTORECombine(SDNode *N,
                                      NumElems*SizeRatio);
     assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
 
-    DebugLoc DL = St->getDebugLoc();
+    SDLoc DL(St);
     SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
     SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
     for (unsigned i = 0; i < NumElems; ++i) ShuffleVec[i] = i * SizeRatio;
@@ -8584,7 +9062,7 @@ static SDValue PerformSTORECombine(SDNode *N,
   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
       StVal.getNode()->hasOneUse()) {
     SelectionDAG  &DAG = DCI.DAG;
-    DebugLoc DL = St->getDebugLoc();
+    SDLoc DL(St);
     SDValue BasePtr = St->getBasePtr();
     SDValue NewST1 = DAG.getStore(St->getChain(), DL,
                                   StVal.getNode()->getOperand(0), BasePtr,
@@ -8606,14 +9084,14 @@ static SDValue PerformSTORECombine(SDNode *N,
   // Bitcast an i64 store extracted from a vector to f64.
   // Otherwise, the i64 value will be legalized to a pair of i32 values.
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc dl = StVal.getDebugLoc();
+  SDLoc dl(StVal);
   SDValue IntVec = StVal.getOperand(0);
   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                  IntVec.getValueType().getVectorNumElements());
   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
   SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                                Vec, StVal.getOperand(1));
-  dl = N->getDebugLoc();
+  dl = SDLoc(N);
   SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
   // Make the DAGCombiner fold the bitcasts.
   DCI.AddToWorklist(Vec.getNode());
@@ -8659,7 +9137,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
   EVT VT = N->getValueType(0);
   if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
     return SDValue();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SmallVector<SDValue, 8> Ops;
   unsigned NumElts = VT.getVectorNumElements();
   for (unsigned i = 0; i < NumElts; ++i) {
@@ -8673,6 +9151,98 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
 }
 
+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+  // At that time, we may have inserted bitcasts from integer to float.
+  // If these bitcasts have survived DAGCombine, change the lowering of this
+  // BUILD_VECTOR in something more vector friendly, i.e., that does not
+  // force to use floating point types.
+
+  // Make sure we can change the type of the vector.
+  // This is possible iff:
+  // 1. The vector is only used in a bitcast to a integer type. I.e.,
+  //    1.1. Vector is used only once.
+  //    1.2. Use is a bit convert to an integer type.
+  // 2. The size of its operands are 32-bits (64-bits are not legal).
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Check 1.1. and 2.
+  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+    return SDValue();
+
+  // By construction, the input type must be float.
+  assert(EltVT == MVT::f32 && "Unexpected type!");
+
+  // Check 1.2.
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() != ISD::BITCAST ||
+      Use->getValueType(0).isFloatingPoint())
+    return SDValue();
+
+  // Check profitability.
+  // Model is, if more than half of the relevant operands are bitcast from
+  // i32, turn the build_vector into a sequence of insert_vector_elt.
+  // Relevant operands are everything that is not statically
+  // (i.e., at compile time) bitcasted.
+  unsigned NumOfBitCastedElts = 0;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumOfRelevantElts = NumElts;
+  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+    SDValue Elt = N->getOperand(Idx);
+    if (Elt->getOpcode() == ISD::BITCAST) {
+      // Assume only bit cast to i32 will go away.
+      if (Elt->getOperand(0).getValueType() == MVT::i32)
+        ++NumOfBitCastedElts;
+    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+      // Constants are statically casted, thus do not count them as
+      // relevant operands.
+      --NumOfRelevantElts;
+  }
+
+  // Check if more than half of the elements require a non-free bitcast.
+  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Create the new vector type.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  // Check if the type is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VecVT))
+    return SDValue();
+
+  // Combine:
+  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+  // => BITCAST INSERT_VECTOR_ELT
+  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+  //                      (BITCAST EN), N.
+  SDValue Vec = DAG.getUNDEF(VecVT);
+  SDLoc dl(N);
+  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
+    SDValue V = N->getOperand(Idx);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (V.getOpcode() == ISD::BITCAST &&
+        V->getOperand(0).getValueType() == MVT::i32)
+      // Fold obvious case.
+      V = V.getOperand(0);
+    else {
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 
+      // Make the DAGCombiner fold the bitcasts.
+      DCI.AddToWorklist(V.getNode());
+    }
+    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+  }
+  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  return Vec;
+}
+
 /// PerformInsertEltCombine - Target-specific dag combine xforms for
 /// ISD::INSERT_VECTOR_ELT.
 static SDValue PerformInsertEltCombine(SDNode *N,
@@ -8686,7 +9256,7 @@ static SDValue PerformInsertEltCombine(SDNode *N,
     return SDValue();
 
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
                                  VT.getVectorNumElements());
   SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
@@ -8732,7 +9302,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
       !TLI.isTypeLegal(Concat1Op1.getValueType()))
     return SDValue();
 
-  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
                                   Op0.getOperand(0), Op1.getOperand(0));
   // Translate the shuffle mask.
   SmallVector<int, 16> NewMask;
@@ -8748,7 +9318,7 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
       NewElt = HalfElts + MaskElt - NumElts;
     NewMask.push_back(NewElt);
   }
-  return DAG.getVectorShuffle(VT, N->getDebugLoc(), NewConcat,
+  return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
                               DAG.getUNDEF(VT), NewMask.data());
 }
 
@@ -8865,7 +9435,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
       Ops.push_back(N->getOperand(i));
     }
     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
                                            Ops.data(), Ops.size(),
                                            MemInt->getMemoryVT(),
                                            MemInt->getMemOperand());
@@ -8939,7 +9509,7 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
-  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, VLD->getDebugLoc(), SDTys,
+  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
                                            Ops, 2, VLDMemInt->getMemoryVT(),
                                            VLDMemInt->getMemOperand());
 
@@ -8994,7 +9564,7 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
   if (EltSize > VT.getVectorElementType().getSizeInBits())
     return SDValue();
 
-  return DCI.DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+  return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
 }
 
 // isConstVecPow2 - Return true if each vector element is a power of 2, all
@@ -9051,12 +9621,27 @@ static SDValue PerformVCVTCombine(SDNode *N,
       !isConstVecPow2(ConstVec, isSigned, C))
     return SDValue();
 
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from f32 to i32. We can handle
+    // smaller integers by generating an extra truncate, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
     Intrinsic::arm_neon_vcvtfp2fxu;
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
-                     N->getValueType(0),
-                     DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
-                     DAG.getConstant(Log2_64(C), MVT::i32));
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
+                                 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                                 DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
+                                 DAG.getConstant(Log2_64(C), MVT::i32));
+
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv);
+
+  return FixConv;
 }
 
 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
@@ -9087,12 +9672,28 @@ static SDValue PerformVDIVCombine(SDNode *N,
       !isConstVecPow2(ConstVec, isSigned, C))
     return SDValue();
 
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from i32 to f32. We can handle
+    // smaller integers by generating an extra extend, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
+  SDValue ConvInput = Op.getOperand(0);
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                            SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                            ConvInput);
+
   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
     Intrinsic::arm_neon_vcvtfxu2fp;
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, N->getDebugLoc(),
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
                      Op.getValueType(),
                      DAG.getConstant(IntrinsicOpcode, MVT::i32),
-                     Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32));
+                     ConvInput, DAG.getConstant(Log2_64(C), MVT::i32));
 }
 
 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
@@ -9273,7 +9874,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
       VShiftOpc = ARMISD::VQRSHRNsu; break;
     }
 
-    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
   }
 
@@ -9290,7 +9891,7 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
       llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
     }
 
-    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
+    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2),
                        DAG.getConstant(Cnt, MVT::i32));
   }
@@ -9321,7 +9922,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
       if (C->getZExtValue() == 16 && N0.getOpcode() == ISD::BSWAP &&
           DAG.MaskedValueIsZero(N0.getOperand(0),
                                 APInt::getHighBitsSet(32, 16)))
-        return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, N0, N1);
+        return DAG.getNode(ISD::ROTR, SDLoc(N), VT, N0, N1);
     }
   }
 
@@ -9338,7 +9939,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
 
   case ISD::SHL:
     if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
-      return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
+      return DAG.getNode(ARMISD::VSHL, SDLoc(N), VT, N->getOperand(0),
                          DAG.getConstant(Cnt, MVT::i32));
     break;
 
@@ -9347,7 +9948,7 @@ static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
     if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
       unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
                             ARMISD::VSHRs : ARMISD::VSHRu);
-      return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
+      return DAG.getNode(VShiftOpc, SDLoc(N), VT, N->getOperand(0),
                          DAG.getConstant(Cnt, MVT::i32));
     }
   }
@@ -9387,7 +9988,7 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
         Opc = ARMISD::VGETLANEu;
         break;
       }
-      return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
+      return DAG.getNode(Opc, SDLoc(N), VT, Vec, Lane);
     }
   }
 
@@ -9476,7 +10077,7 @@ static SDValue PerformSELECT_CCCombine(SDNode *N, SelectionDAG &DAG,
 
   if (!Opcode)
     return SDValue();
-  return DAG.getNode(Opcode, N->getDebugLoc(), N->getValueType(0), LHS, RHS);
+  return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), LHS, RHS);
 }
 
 /// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
@@ -9488,7 +10089,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
     return SDValue();
 
   EVT VT = N->getValueType(0);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue LHS = Cmp.getOperand(0);
   SDValue RHS = Cmp.getOperand(1);
   SDValue FalseVal = N->getOperand(0);
@@ -9578,6 +10179,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ARMISD::VLD3DUP:
   case ARMISD::VLD4DUP:
     return CombineBaseUpdate(N, DCI);
+  case ARMISD::BUILD_VECTOR:
+    return PerformARMBUILD_VECTORCombine(N, DCI);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -9702,6 +10305,21 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return false;
 }
 
+bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  if (!isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Assuming the caller doesn't have a zeroext or signext return parameter,
+  // truncation all the way down to i1 is valid.
+  return true;
+}
+
+
 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
   if (V < 0)
     return false;
@@ -10101,9 +10719,19 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                        APInt &KnownOne,
                                                        const SelectionDAG &DAG,
                                                        unsigned Depth) const {
-  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
+  unsigned BitWidth = KnownOne.getBitWidth();
+  KnownZero = KnownOne = APInt(BitWidth, 0);
   switch (Op.getOpcode()) {
   default: break;
+  case ARMISD::ADDC:
+  case ARMISD::ADDE:
+  case ARMISD::SUBC:
+  case ARMISD::SUBE:
+    // These nodes' second result is a boolean
+    if (Op.getResNo() == 0)
+      break;
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
   case ARMISD::CMOV: {
     // Bits are known zero/one if known on the LHS and RHS.
     DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
@@ -10217,7 +10845,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
 RCPair
 ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                EVT VT) const {
+                                                MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC ARM Constraint Letters
     switch (Constraint[0]) {
@@ -10232,6 +10860,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
     case 'r':
       return RCPair(0U, &ARM::GPRRegClass);
     case 'w':
+      if (VT == MVT::Other)
+        break;
       if (VT == MVT::f32)
         return RCPair(0U, &ARM::SPRRegClass);
       if (VT.getSizeInBits() == 64)
@@ -10240,6 +10870,8 @@ ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
         return RCPair(0U, &ARM::QPRRegClass);
       break;
     case 'x':
+      if (VT == MVT::Other)
+        break;
       if (VT == MVT::f32)
         return RCPair(0U, &ARM::SPR_8RegClass);
       if (VT.getSizeInBits() == 64)
@@ -10426,6 +11058,54 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
+SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
+  unsigned Opcode = Op->getOpcode();
+  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+      "Invalid opcode for Div/Rem lowering");
+  bool isSigned = (Opcode == ISD::SDIVREM);
+  EVT VT = Op->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
+  RTLIB::Libcall LC;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  }
+
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Op->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy());
+
+  Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
+
+  SDLoc dl(Op);
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true,
+                    0, getLibcallCallingConv(LC), /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  return CallInfo.first;
+}
+
 bool
 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The ARM target isn't yet aware of offsets.
@@ -10434,17 +11114,15 @@ ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
 
 bool ARM::isBitFieldInvertedMask(unsigned v) {
   if (v == 0xffffffff)
-    return 0;
+    return false;
+
   // there can be 1's on either or both "outsides", all the "inside"
   // bits must be 0's
-  unsigned int lsb = 0, msb = 31;
-  while (v & (1 << msb)) --msb;
-  while (v & (1 << lsb)) ++lsb;
-  for (unsigned int i = lsb; i <= msb; ++i) {
-    if (v & (1 << i))
-      return 0;
-  }
-  return 1;
+  unsigned TO = CountTrailingOnes_32(v);
+  unsigned LO = CountLeadingOnes_32(v);
+  v = (v >> TO) << TO;
+  v = (v << LO) >> LO;
+  return v == 0;
 }
 
 /// isFPImmLegal - Returns true if the target can instruction select the
@@ -10513,6 +11191,30 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::arm_ldrex: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_strex: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
   case Intrinsic::arm_strexd: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::i64;
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 426010e..90facdd 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -52,6 +52,7 @@ namespace llvm {
       BR_JT,        // Jumptable branch.
       BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
       RET_FLAG,     // Return with a flag operand.
+      INTRET_FLAG,  // Interrupt return with an LR-offset and a flag operand.
 
       PIC_ADD,      // Add with a PC operand and a PIC label.
 
@@ -94,7 +95,6 @@ namespace llvm {
 
       DYN_ALLOC,    // Dynamic allocation on the stack.
 
-      MEMBARRIER,   // Memory barrier (DMB)
       MEMBARRIER_MCR, // Memory barrier (MCR)
 
       PRELOAD,      // Preload
@@ -186,6 +186,8 @@ namespace llvm {
       // Floating-point max and min:
       FMAX,
       FMIN,
+      VMAXNM,
+      VMINNM,
 
       // Bit-field insert
       BFI,
@@ -222,21 +224,7 @@ namespace llvm {
       VST4_UPD,
       VST2LN_UPD,
       VST3LN_UPD,
-      VST4LN_UPD,
-
-      // 64-bit atomic ops (value split into two registers)
-      ATOMADD64_DAG,
-      ATOMSUB64_DAG,
-      ATOMOR64_DAG,
-      ATOMXOR64_DAG,
-      ATOMAND64_DAG,
-      ATOMNAND64_DAG,
-      ATOMSWAP64_DAG,
-      ATOMCMPXCHG64_DAG,
-      ATOMMIN64_DAG,
-      ATOMUMIN64_DAG,
-      ATOMMAX64_DAG,
-      ATOMUMAX64_DAG
+      VST4LN_UPD
     };
   }
 
@@ -270,7 +258,7 @@ namespace llvm {
     }
 
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
-    virtual EVT getSetCCResultType(EVT VT) const;
+    virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
 
     virtual MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
@@ -298,6 +286,9 @@ namespace llvm {
     using TargetLowering::isZExtFree;
     virtual bool isZExtFree(SDValue Val, EVT VT2) const;
 
+    virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+
+
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
     virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
@@ -349,7 +340,7 @@ namespace llvm {
 
     std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   EVT VT) const;
+                                   MVT VT) const;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
@@ -372,6 +363,12 @@ namespace llvm {
     /// be used for loads / stores from the global.
     virtual unsigned getMaximalGlobalOffset() const;
 
+    /// Returns true if a cast between SrcAS and DestAS is a noop.
+    virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const {
+      // Addrspacecasts are always noops.
+      return true;
+    }
+
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
     virtual FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -412,21 +409,21 @@ namespace llvm {
     void addQRTypeForNEON(MVT VT);
 
     typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
-    void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+    void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
                           SDValue Chain, SDValue &Arg,
                           RegsToPassVector &RegsToPass,
                           CCValAssign &VA, CCValAssign &NextVA,
                           SDValue &StackPtr,
-                          SmallVector<SDValue, 8> &MemOpChains,
+                          SmallVectorImpl<SDValue> &MemOpChains,
                           ISD::ArgFlagsTy Flags) const;
     SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                  SDValue &Root, SelectionDAG &DAG,
-                                 DebugLoc dl) const;
+                                 SDLoc dl) const;
 
     CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
                                   bool isVarArg) const;
     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
-                             DebugLoc dl, SelectionDAG &DAG,
+                             SDLoc dl, SelectionDAG &DAG,
                              const CCValAssign &VA,
                              ISD::ArgFlagsTy Flags) const;
     SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -457,13 +454,26 @@ namespace llvm {
                             const ARMSubtarget *ST) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) const;
+    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    ///
+    /// ARM supports both fused and unfused multiply-add operations; we already
+    /// lower a pair of fmul and fadd to the latter so it's not clear that there
+    /// would be a gain or that the gain would be worthwhile enough to risk
+    /// correctness bugs.
+    virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; }
 
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
+                            SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals,
                             bool isThisReturn, SDValue ThisVal) const;
 
@@ -471,24 +481,26 @@ namespace llvm {
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
 
     int StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
-                       DebugLoc dl, SDValue &Chain,
+                       SDLoc dl, SDValue &Chain,
                        const Value *OrigArg,
                        unsigned InRegsParamRecordIdx,
                        unsigned OffsetFromOrigArg,
                        unsigned ArgOffset,
+                       unsigned ArgSize,
                        bool ForceMutable) const;
 
     void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
-                              DebugLoc dl, SDValue &Chain,
+                              SDLoc dl, SDValue &Chain,
                               unsigned ArgOffset,
                               bool ForceMutable = false) const;
 
     void computeRegArea(CCState &CCInfo, MachineFunction &MF,
                         unsigned InRegsParamRecordIdx,
+                        unsigned ArgSize,
                         unsigned &ArgRegsSize,
                         unsigned &ArgRegsSaveSize) const;
 
@@ -522,16 +534,16 @@ namespace llvm {
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const;
 
     virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
 
     virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
 
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                      SDValue &ARMcc, SelectionDAG &DAG, DebugLoc dl) const;
+                      SDValue &ARMcc, SelectionDAG &DAG, SDLoc dl) const;
     SDValue getVFPCmp(SDValue LHS, SDValue RHS,
-                      SelectionDAG &DAG, DebugLoc dl) const;
+                      SelectionDAG &DAG, SDLoc dl) const;
     SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
 
     SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
@@ -556,6 +568,8 @@ namespace llvm {
                                                unsigned Size,
                                                bool signExtend,
                                                ARMCC::CondCodes Cond) const;
+    MachineBasicBlock *EmitAtomicLoad64(MachineInstr *MI,
+                                        MachineBasicBlock *BB) const;
 
     void SetupEntryBlockForSjLj(MachineInstr *MI,
                                 MachineBasicBlock *MBB,
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 67a6820..f93504f 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -155,6 +155,16 @@ def pred : PredicateOperand<OtherVT, (ops i32imm, i32imm),
   let DecoderMethod = "DecodePredicateOperand";
 }
 
+// Selectable predicate operand for CMOV instructions. We can't use a normal
+// predicate because the default values interfere with instruction selection. In
+// all other respects it is identical though: pseudo-instruction expansion
+// relies on the MachineOperands being compatible.
+def cmovpred : Operand<i32>, PredicateOp,
+               ComplexPattern<i32, 2, "SelectCMOVPred"> {
+  let MIOperandInfo = (ops i32imm, i32imm);
+  let PrintMethod = "printPredicateOperand";
+}
+
 // Conditional code result for instructions whose 's' bit is set, e.g. subs.
 def CCOutOperand : AsmOperandClass { let Name = "CCOut"; }
 def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
@@ -237,6 +247,8 @@ class t2InstAlias<string Asm, dag Result, bit Emit = 0b1>
       : InstAlias<Asm, Result, Emit>, Requires<[IsThumb2]>;
 class VFP2InstAlias<string Asm, dag Result, bit Emit = 0b1>
       : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2]>;
+class VFP2DPInstAlias<string Asm, dag Result, bit Emit = 0b1>
+      : InstAlias<Asm, Result, Emit>, Requires<[HasVFP2,HasDPVFP]>;
 class VFP3InstAlias<string Asm, dag Result, bit Emit = 0b1>
       : InstAlias<Asm, Result, Emit>, Requires<[HasVFP3]>;
 class NEONInstAlias<string Asm, dag Result, bit Emit = 0b1>
@@ -490,8 +502,7 @@ class JTI<dag oops, dag iops, InstrItinClass itin,
   : XI<oops, iops, AddrModeNone, 0, IndexModeNone, BrMiscFrm, itin,
        asm, "", pattern>;
 
-// Atomic load/store instructions
-class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+class AIldr_ex_or_acq<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
               string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
       opc, asm, "", pattern> {
@@ -502,23 +513,52 @@ class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
   let Inst{20}    = 1;
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
-  let Inst{11-0}  = 0b111110011111;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-0}   = 0b10011111;
 }
-class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+class AIstr_ex_or_rel<bits<2> opcod, bits<2> opcod2, dag oops, dag iops, InstrItinClass itin,
               string opc, string asm, list<dag> pattern>
   : I<oops, iops, AddrModeNone, 4, IndexModeNone, LdStExFrm, itin,
       opc, asm, "", pattern> {
-  bits<4> Rd;
   bits<4> Rt;
   bits<4> addr;
   let Inst{27-23} = 0b00011;
   let Inst{22-21} = opcod;
   let Inst{20}    = 0;
   let Inst{19-16} = addr;
-  let Inst{15-12} = Rd;
-  let Inst{11-4}  = 0b11111001;
+  let Inst{11-10} = 0b11;
+  let Inst{9-8}   = opcod2;
+  let Inst{7-4}   = 0b1001;
   let Inst{3-0}   = Rt;
 }
+// Atomic load/store instructions
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b11, oops, iops, itin, opc, asm, pattern>;
+
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b11, oops, iops, itin, opc, asm, pattern> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
+// Exclusive load/store instructions
+
+class AIldaex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]>;
+
+class AIstlex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b10, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]> {
+  bits<4> Rd;
+  let Inst{15-12} = Rd;
+}
+
 class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
   : AI<oops, iops, MiscFrm, NoItinerary, opc, "\t$Rt, $Rt2, $addr", pattern> {
   bits<4> Rt;
@@ -535,6 +575,18 @@ class AIswp<bit b, dag oops, dag iops, string opc, list<dag> pattern>
   let Unpredictable{11-8} = 0b1111;
   let DecoderMethod = "DecodeSwap";
 }
+// Acquire/Release load/store instructions
+class AIldracq<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIldr_ex_or_acq<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]>;
+
+class AIstrrel<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : AIstr_ex_or_rel<opcod, 0b00, oops, iops, itin, opc, asm, pattern>,
+    Requires<[IsARM, HasV8]> {
+  let Inst{15-12}   = 0b1111;
+}
 
 // addrmode1 instructions
 class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
@@ -1230,8 +1282,9 @@ class T2JTI<dag oops, dag iops, InstrItinClass itin,
   : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
 
 // Move to/from coprocessor instructions
-class T2Cop<bits<4> opc, dag oops, dag iops, string asm, list<dag> pattern>
-  : T2XI <oops, iops, NoItinerary, asm, pattern>, Requires<[IsThumb2]> {
+class T2Cop<bits<4> opc, dag oops, dag iops, string opcstr, string asm,
+            list<dag> pattern>
+  : T2I <oops, iops, NoItinerary, opcstr, asm, pattern>, Requires<[IsThumb2]> {
   let Inst{31-28} = opc;
 }
 
@@ -1389,7 +1442,6 @@ class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
   let Inst{15-12} = Dd{3-0};
   let Inst{7-0}   = addr{7-0};    // imm8
 
-  // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-24} = opcod1;
   let Inst{21-20} = opcod2;
   let Inst{11-9}  = 0b101;
@@ -1415,7 +1467,6 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
   let Inst{15-12} = Sd{4-1};
   let Inst{7-0}   = addr{7-0};    // imm8
 
-  // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-24} = opcod1;
   let Inst{21-20} = opcod2;
   let Inst{11-9}  = 0b101;
@@ -1437,6 +1488,28 @@ class PseudoVFPLdStM<dag oops, dag iops, InstrItinClass itin, string cstr,
 }
 
 // Load / store multiple
+
+// Unknown precision
+class AXXI4<dag oops, dag iops, IndexMode im,
+            string asm, string cstr, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode4, 4, im,
+          VFPLdStFrm, NoItinerary, asm, cstr, pattern> {
+  // Instruction operands.
+  bits<4>  Rn;
+  bits<13> regs;
+
+  // Encode instruction operands.
+  let Inst{19-16} = Rn;
+  let Inst{22}    = 0;
+  let Inst{15-12} = regs{11-8};
+  let Inst{7-1}   = regs{7-1};
+
+  let Inst{27-25} = 0b110;
+  let Inst{11-8}  = 0b1011;
+  let Inst{0}     = 1;
+}
+
+// Double precision
 class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
             string asm, string cstr, list<dag> pattern>
   : VFPXI<oops, iops, AddrMode4, 4, im,
@@ -1449,14 +1522,15 @@ class AXDI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
   let Inst{19-16} = Rn;
   let Inst{22}    = regs{12};
   let Inst{15-12} = regs{11-8};
-  let Inst{7-0}   = regs{7-0};
+  let Inst{7-1}   = regs{7-1};
 
-  // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-25} = 0b110;
   let Inst{11-9}  = 0b101;
   let Inst{8}     = 1;          // Double precision
+  let Inst{0}     = 0;
 }
 
+// Single Precision
 class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
             string asm, string cstr, list<dag> pattern>
   : VFPXI<oops, iops, AddrMode4, 4, im,
@@ -1471,7 +1545,6 @@ class AXSI4<dag oops, dag iops, IndexMode im, InstrItinClass itin,
   let Inst{15-12} = regs{12-9};
   let Inst{7-0}   = regs{7-0};
 
-  // TODO: Mark the instructions with the appropriate subtarget info.
   let Inst{27-25} = 0b110;
   let Inst{11-9}  = 0b101;
   let Inst{8}     = 0;          // Single precision
@@ -1499,6 +1572,34 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
   let Inst{8}     = 1;          // Double precision
   let Inst{7-6}   = opcod4;
   let Inst{4}     = opcod5;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// Double precision, unary, not-predicated
+class ADuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
 }
 
 // Double precision, binary
@@ -1525,9 +1626,42 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
   let Inst{8}     = 1;          // Double precision
   let Inst{6}     = op6;
   let Inst{4}     = op4;
+
+  let Predicates = [HasVFP2, HasDPVFP];
+}
+
+// FP, binary, not predicated
+class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPBinaryFrm, itin,
+          asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1; // double precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
-// Single precision, unary
+// Single precision, unary, predicated
 class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
            string asm, list<dag> pattern>
@@ -1551,6 +1685,33 @@ class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
   let Inst{4}     = opcod5;
 }
 
+// Single precision, unary, non-predicated
+class ASuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+             bit opcod5, dag oops, dag iops, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
 // Single precision unary, if no NEON. Same as ASuI except not available if
 // NEON is enabled.
 class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
@@ -1586,6 +1747,35 @@ class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
   let Inst{4}     = op4;
 }
 
+// Single precision, binary, not predicated
+class ASbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPBinaryFrm, itin, asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0; // Single precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+}
+
 // Single precision binary, if no NEON. Same as ASbI except not available if
 // NEON is enabled.
 class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
@@ -1698,6 +1888,21 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
   let DecoderNamespace = "NEON";
 }
 
+// Same as NeonI except it is not predicated
+class NeonInp<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+            InstrItinClass itin, string opc, string dt, string asm, string cstr,
+            list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = !strconcat(opc, ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+
+  let Inst{31-28} = 0b1111;
+}
+
 class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
             dag oops, dag iops, InstrItinClass itin,
             string opc, string dt, string asm, string cstr, list<dag> pattern>
@@ -1817,6 +2022,35 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
   let Inst{5}     = Vm{4};
 }
 
+// Same as N2V but not predicated.
+class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+            dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
+            string Dt, ValueType ResTy, ValueType OpTy, list<dag> pattern>
+   : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
+             OpcodeStr, Dt, "$Vd, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0}   = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = 0b00111;
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11} = 0;
+  let Inst{10-8} = op10_8;
+  let Inst{7} = op7;
+  let Inst{6} = op6;
+  let Inst{4} = 0;
+
+  let DecoderNamespace = "NEON";
+}
+
 // Same as N2V except it doesn't have a datatype suffix.
 class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
            bits<5> op11_7, bit op6, bit op4,
@@ -1898,6 +2132,32 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
   let Inst{5}     = Vm{4};
 }
 
+class N3Vnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops,Format f, InstrItinClass itin,
+                string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable, list<dag> pattern>
+  : NeonInp<oops, iops, AddrModeNone, IndexModeNone, f, itin, OpcodeStr,
+            Dt, "$Vd, $Vn, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22} = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7} = Vn{4};
+  let Inst{5} = Vm{4};
+  let Inst{3-0} = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = op27_23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
 class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
                 string opc, string dt, string asm, string cstr,
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 80f0ec7..df867b4 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -29,7 +30,7 @@
 using namespace llvm;
 
 ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
-  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+  : ARMBaseInstrInfo(STI), RI(STI) {
 }
 
 /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
@@ -106,29 +107,42 @@ namespace {
       if (TM->getRelocationModel() != Reloc::PIC_)
         return false;
 
-      LLVMContext* Context = &MF.getFunction()->getContext();
-      GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
-                                           GlobalValue::ExternalLinkage, 0,
-                                           "_GLOBAL_OFFSET_TABLE_");
-      unsigned Id = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id);
-      unsigned Align = TM->getDataLayout()->getPrefTypeAlignment(GV->getType());
+      LLVMContext *Context = &MF.getFunction()->getContext();
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      unsigned PCAdj = TM->getSubtarget<ARMSubtarget>().isThumb() ? 4 : 8;
+      ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
+          *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj);
+
+      unsigned Align = TM->getDataLayout()
+          ->getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
       unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
 
       MachineBasicBlock &FirstMBB = MF.front();
       MachineBasicBlock::iterator MBBI = FirstMBB.begin();
       DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
-      unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+      unsigned TempReg =
+          MF.getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
       unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
                      ARM::t2LDRpci : ARM::LDRcp;
       const TargetInstrInfo &TII = *TM->getInstrInfo();
       MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
-                                        TII.get(Opc), GlobalBaseReg)
+                                        TII.get(Opc), TempReg)
                                 .addConstantPoolIndex(Idx);
       if (Opc == ARM::LDRcp)
         MIB.addImm(0);
       AddDefaultPred(MIB);
 
+      // Fix the GOT address by adding pc.
+      unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+      Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ? ARM::tPICADD
+                                                        : ARM::PICADD;
+      MIB = BuildMI(FirstMBB, MBBI, DL, TII.get(Opc), GlobalBaseReg)
+                .addReg(TempReg)
+                .addImm(ARMPCLabelIndex);
+      if (Opc == ARM::PICADD)
+        AddDefaultPred(MIB);
+
+
       return true;
     }
 
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 1bd174e..2042c04 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -71,6 +71,9 @@ def SDT_ARMTCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
 def SDT_ARMBFI : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
                                       SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
 
+def SDT_ARMVMAXNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+def SDT_ARMVMINNM : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisFP<1>, SDTCisFP<2>]>;
+
 def SDTBinaryArithWithFlags : SDTypeProfile<2, 2,
                                             [SDTCisSameAs<0, 2>,
                                              SDTCisSameAs<0, 3>,
@@ -118,7 +121,8 @@ def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
 
 def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
                               [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
+def ARMintretflag    : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
                               [SDNPInGlue]>;
 
@@ -162,8 +166,6 @@ def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
                                SDT_ARMEH_SJLJ_Longjmp,
                                [SDNPHasChain, SDNPSideEffect]>;
 
-def ARMMemBarrier     : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
-                               [SDNPHasChain, SDNPSideEffect]>;
 def ARMMemBarrierMCR  : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
                                [SDNPHasChain, SDNPSideEffect]>;
 def ARMPreload        : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
@@ -174,9 +176,11 @@ def ARMrbit          : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
 def ARMtcret         : SDNode<"ARMISD::TC_RETURN", SDT_ARMTCRET,
                         [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
 
-
 def ARMbfi           : SDNode<"ARMISD::BFI", SDT_ARMBFI>;
 
+def ARMvmaxnm        : SDNode<"ARMISD::VMAXNM", SDT_ARMVMAXNM, []>;
+def ARMvminnm        : SDNode<"ARMISD::VMINNM", SDT_ARMVMINNM, []>;
+
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
@@ -189,11 +193,18 @@ def HasV5TE          : Predicate<"Subtarget->hasV5TEOps()">,
 def HasV6            : Predicate<"Subtarget->hasV6Ops()">,
                                  AssemblerPredicate<"HasV6Ops", "armv6">;
 def NoV6             : Predicate<"!Subtarget->hasV6Ops()">;
+def HasV6M           : Predicate<"Subtarget->hasV6MOps()">,
+                                 AssemblerPredicate<"HasV6MOps",
+                                                    "armv6m or armv6t2">;
 def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">,
                                  AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
 def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
 def HasV7            : Predicate<"Subtarget->hasV7Ops()">,
                                  AssemblerPredicate<"HasV7Ops", "armv7">;
+def HasV8            : Predicate<"Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"HasV8Ops", "armv8">;
+def PreV8            : Predicate<"!Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
 def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
 def HasVFP2          : Predicate<"Subtarget->hasVFP2()">,
                                  AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -201,14 +212,23 @@ def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
                                  AssemblerPredicate<"FeatureVFP3", "VFP3">;
 def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
                                  AssemblerPredicate<"FeatureVFP4", "VFP4">;
+def HasDPVFP         : Predicate<"!Subtarget->isFPOnlySP()">,
+                                 AssemblerPredicate<"!FeatureVFPOnlySP",
+                                                    "double precision VFP">;
+def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
+                                 AssemblerPredicate<"FeatureFPARMv8", "FPARMv8">;
 def HasNEON          : Predicate<"Subtarget->hasNEON()">,
                                  AssemblerPredicate<"FeatureNEON", "NEON">;
+def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
+                                 AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                                 AssemblerPredicate<"FeatureCRC", "crc">;
 def HasFP16          : Predicate<"Subtarget->hasFP16()">,
                                  AssemblerPredicate<"FeatureFP16","half-float">;
 def HasDivide        : Predicate<"Subtarget->hasDivide()">,
-                                 AssemblerPredicate<"FeatureHWDiv", "divide">;
+                                 AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
 def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
-                                 AssemblerPredicate<"FeatureHWDivARM">;
+                                 AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
 def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
                                  AssemblerPredicate<"FeatureT2XtPk",
                                                      "pack/extract">;
@@ -233,10 +253,10 @@ def IsThumb2         : Predicate<"Subtarget->isThumb2()">,
                                  AssemblerPredicate<"ModeThumb,FeatureThumb2",
                                                     "thumb2">;
 def IsMClass         : Predicate<"Subtarget->isMClass()">,
-                                 AssemblerPredicate<"FeatureMClass", "armv7m">;
-def IsARClass        : Predicate<"!Subtarget->isMClass()">,
+                                 AssemblerPredicate<"FeatureMClass", "armv*m">;
+def IsNotMClass      : Predicate<"!Subtarget->isMClass()">,
                                  AssemblerPredicate<"!FeatureMClass",
-                                                    "armv7a/r">;
+                                                    "!armv*m">;
 def IsARM            : Predicate<"!Subtarget->isThumb()">,
                                  AssemblerPredicate<"!ModeThumb", "arm-mode">;
 def IsIOS            : Predicate<"Subtarget->isTargetIOS()">;
@@ -258,7 +278,9 @@ def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
 def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
                                  " FPOpFusion::Fast) && "
                                  "!Subtarget->isTargetDarwin()">;
-def DontUseFusedMAC  : Predicate<"!Subtarget->hasVFP4() || "
+def DontUseFusedMAC  : Predicate<"!(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast &&"
+                                 " Subtarget->hasVFP4()) || "
                                  "Subtarget->isTargetDarwin()">;
 
 // VGETLNi32 is microcoded on Swift - prefer VMOV.
@@ -275,8 +297,8 @@ def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
 def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
 def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
 
-def IsLE             : Predicate<"TLI.isLittleEndian()">;
-def IsBE             : Predicate<"TLI.isBigEndian()">;
+def IsLE             : Predicate<"getTargetLowering()->isLittleEndian()">;
+def IsBE             : Predicate<"getTargetLowering()->isBigEndian()">;
 
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
@@ -456,7 +478,7 @@ def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
 def adrlabel : Operand<i32> {
   let EncoderMethod = "getAdrLabelOpValue";
   let ParserMatchClass = AdrLabelAsmOperand;
-  let PrintMethod = "printAdrLabelOperand";
+  let PrintMethod = "printAdrLabelOperand<0>";
 }
 
 def neon_vcvt_imm32 : Operand<i32> {
@@ -581,17 +603,6 @@ def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
 def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
 def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
 
-/// imm0_4 predicate - Immediate in the range [0,4].
-def Imm0_4AsmOperand : ImmAsmOperand
-{ 
-  let Name = "Imm0_4"; 
-  let DiagnosticType = "ImmRange0_4";  
-}
-def imm0_4 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 5; }]> {
-  let ParserMatchClass = Imm0_4AsmOperand;
-  let DecoderMethod = "DecodeImm0_4";
-}
-
 /// imm0_7 predicate - Immediate in the range [0,7].
 def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
 def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
@@ -671,6 +682,15 @@ def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
   let ParserMatchClass = Imm0_63AsmOperand;
 }
 
+/// imm0_239 predicate - Immediate in the range [0,239].
+def Imm0_239AsmOperand : ImmAsmOperand {
+  let Name = "Imm0_239";
+  let DiagnosticType = "ImmRange0_239";
+}
+def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
+  let ParserMatchClass = Imm0_239AsmOperand;
+}
+
 /// imm0_255 predicate - Immediate in the range [0,255].
 def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
 def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
@@ -702,6 +722,11 @@ def imm0_65535_expr : Operand<i32> {
   let ParserMatchClass = Imm0_65535ExprAsmOperand;
 }
 
+def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def imm256_65535_expr : Operand<i32> {
+  let ParserMatchClass = Imm256_65535ExprAsmOperand;
+}
+
 /// imm24b - True if the 32-bit immediate is encodable in 24 bits.
 def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
 def imm24b : Operand<i32>, ImmLeaf<i32, [{
@@ -1007,11 +1032,6 @@ def p_imm : Operand<i32> {
   let DecoderMethod = "DecodeCoprocessor";
 }
 
-def pf_imm : Operand<i32> {
-  let PrintMethod = "printPImmediate";
-  let ParserMatchClass = CoprocNumAsmOperand;
-}
-
 def CoprocRegAsmOperand : AsmOperandClass {
   let Name = "CoprocReg";
   let ParserMethod = "parseCoprocRegOperand";
@@ -1327,7 +1347,7 @@ class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
   : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
           IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
           [(set GPRnopc:$Rd, (opnode (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
-       Requires<[IsARM, HasV6]> {
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
   bits<4> Rd;
   bits<4> Rm;
   bits<2> rot;
@@ -1340,11 +1360,11 @@ class AI_ext_rrot<bits<8> opcod, string opc, PatFrag opnode>
 class AI_ext_rrot_np<bits<8> opcod, string opc>
   : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPRnopc:$Rm, rot_imm:$rot),
           IIC_iEXTr, opc, "\t$Rd, $Rm$rot", []>,
-       Requires<[IsARM, HasV6]> {
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsi]> {
   bits<2> rot;
   let Inst{19-16} = 0b1111;
   let Inst{11-10} = rot;
-}
+ }
 
 /// AI_exta_rrot - A binary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
@@ -1353,7 +1373,7 @@ class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
           IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot",
           [(set GPRnopc:$Rd, (opnode GPR:$Rn,
                                      (rotr GPRnopc:$Rm, rot_imm:$rot)))]>,
-        Requires<[IsARM, HasV6]> {
+        Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
   bits<4> Rd;
   bits<4> Rm;
   bits<4> Rn;
@@ -1368,7 +1388,7 @@ class AI_exta_rrot<bits<8> opcod, string opc, PatFrag opnode>
 class AI_exta_rrot_np<bits<8> opcod, string opc>
   : AExtI<opcod, (outs GPRnopc:$Rd), (ins GPR:$Rn, GPRnopc:$Rm, rot_imm:$rot),
           IIC_iEXTAr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
-       Requires<[IsARM, HasV6]> {
+       Requires<[IsARM, HasV6]>, Sched<[WriteALUsr]> {
   bits<4> Rn;
   bits<2> rot;
   let Inst{19-16} = Rn;
@@ -1664,53 +1684,11 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
            [(ARMcallseq_start timm:$amt)]>;
 }
 
-// Atomic pseudo-insts which will be lowered to ldrexd/strexd loops.
-// (These pseudos use a hand-written selection code).
-let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in {
-def ATOMOR6432   : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMXOR6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMADD6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMSUB6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMNAND6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMAND6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMSWAP6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMCMPXCHG6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                                 (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
-                                      GPR:$set1, GPR:$set2),
-                                 NoItinerary, []>;
-def ATOMMIN6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMUMIN6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMMAX6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-def ATOMUMAX6432  : PseudoInst<(outs GPR:$dst1, GPR:$dst2),
-                              (ins GPR:$addr, GPR:$src1, GPR:$src2),
-                              NoItinerary, []>;
-}
-
-def HINT : AI<(outs), (ins imm0_4:$imm), MiscFrm, NoItinerary,
+def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
               "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> {
-  bits<3> imm;
-  let Inst{27-3} = 0b0011001000001111000000000;
-  let Inst{2-0} = imm;
+  bits<8> imm;
+  let Inst{27-8} = 0b00110010000011110000;
+  let Inst{7-0} = imm;
 }
 
 def : InstAlias<"nop$p", (HINT 0, pred:$p)>, Requires<[IsARM, HasV6T2]>;
@@ -1718,6 +1696,9 @@ def : InstAlias<"yield$p", (HINT 1, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"wfe$p", (HINT 2, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>;
+def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
+
+def : Pat<(int_arm_sevl), (HINT 5)>;
 
 def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
              "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
@@ -1735,12 +1716,23 @@ def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
 
 // The 16-bit operand $val can be used by a debugger to store more information
 // about the breakpoint.
-def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
-              "bkpt", "\t$val", []>, Requires<[IsARM]> {
+def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "bkpt", "\t$val", []>, Requires<[IsARM]> {
   bits<16> val;
   let Inst{3-0} = val{3-0};
   let Inst{19-8} = val{15-4};
   let Inst{27-20} = 0b00010010;
+  let Inst{31-28} = 0xe; // AL
+  let Inst{7-4} = 0b0111;
+}
+
+def HLT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "hlt", "\t$val", []>, Requires<[IsARM, HasV8]> {
+  bits<16> val;
+  let Inst{3-0} = val{3-0};
+  let Inst{19-8} = val{15-4};
+  let Inst{27-20} = 0b00010000;
+  let Inst{31-28} = 0xe; // AL
   let Inst{7-4} = 0b0111;
 }
 
@@ -1780,7 +1772,8 @@ multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
 
   def i12 : AXI<(outs), (ins addrmode_imm12:$addr), MiscFrm, IIC_Preload,
                 !strconcat(opc, "\t$addr"),
-                [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]> {
+                [(ARMPreload addrmode_imm12:$addr, (i32 read), (i32 data))]>,
+                Sched<[WritePreLd]> {
     bits<4> Rt;
     bits<17> addr;
     let Inst{31-26} = 0b111101;
@@ -1796,7 +1789,8 @@ multiclass APreLoad<bits<1> read, bits<1> data, string opc> {
 
   def rs : AXI<(outs), (ins ldst_so_reg:$shift), MiscFrm, IIC_Preload,
                !strconcat(opc, "\t$shift"),
-               [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]> {
+               [(ARMPreload ldst_so_reg:$shift, (i32 read), (i32 data))]>,
+               Sched<[WritePreLd]> {
     bits<17> shift;
     let Inst{31-26} = 0b111101;
     let Inst{25} = 1; // 1 for register form
@@ -1816,7 +1810,7 @@ defm PLDW : APreLoad<0, 1, "pldw">, Requires<[IsARM,HasV7,HasMP]>;
 defm PLI  : APreLoad<1, 0, "pli">,  Requires<[IsARM,HasV7]>;
 
 def SETEND : AXI<(outs), (ins setend_op:$end), MiscFrm, NoItinerary,
-                 "setend\t$end", []>, Requires<[IsARM]> {
+                 "setend\t$end", []>, Requires<[IsARM]>, Deprecated<HasV8Ops> {
   bits<1> end;
   let Inst{31-10} = 0b1111000100000001000000;
   let Inst{9} = end;
@@ -1863,7 +1857,8 @@ def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary,
 let isNotDuplicable = 1 in {
 def PICADD  : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
                             4, IIC_iALUr,
-                            [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
+                            [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>,
+                            Sched<[WriteALU, ReadALU]>;
 
 let AddedComplexity = 10 in {
 def PICLDR  : ARMPseudoInst<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
@@ -1923,11 +1918,11 @@ def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
 
 let hasSideEffects = 1 in {
 def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
-                    4, IIC_iALUi, []>;
+                    4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
 
 def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
                       (ins i32imm:$label, nohash_imm:$id, pred:$p),
-                      4, IIC_iALUi, []>;
+                      4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1938,16 +1933,22 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
   // ARMV4T and above
   def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br,
                   "bx", "\tlr", [(ARMretflag)]>,
-               Requires<[IsARM, HasV4T]> {
+               Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
     let Inst{27-0}  = 0b0001001011111111111100011110;
   }
 
   // ARMV4 only
   def MOVPCLR : AI<(outs), (ins), BrMiscFrm, IIC_Br,
                   "mov", "\tpc, lr", [(ARMretflag)]>,
-               Requires<[IsARM, NoV4T]> {
+               Requires<[IsARM, NoV4T]>, Sched<[WriteBr]> {
     let Inst{27-0} = 0b0001101000001111000000001110;
   }
+
+  // Exception return: N.b. doesn't set CPSR as far as we're concerned (it sets
+  // the user-space one).
+  def SUBS_PC_LR : ARMPseudoInst<(outs), (ins i32imm:$offset, pred:$p),
+                                 4, IIC_Br,
+                                 [(ARMintretflag imm:$offset)]>;
 }
 
 // Indirect branches
@@ -1955,7 +1956,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
   // ARMV4T and above
   def BX : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
                   [(brind GPR:$dst)]>,
-              Requires<[IsARM, HasV4T]> {
+              Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
     bits<4> dst;
     let Inst{31-4} = 0b1110000100101111111111110001;
     let Inst{3-0}  = dst;
@@ -1963,7 +1964,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
 
   def BX_pred : AI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br,
                   "bx", "\t$dst", [/* pattern left blank */]>,
-              Requires<[IsARM, HasV4T]> {
+              Requires<[IsARM, HasV4T]>, Sched<[WriteBr]> {
     bits<4> dst;
     let Inst{27-4} = 0b000100101111111111110001;
     let Inst{3-0}  = dst;
@@ -1980,7 +1981,7 @@ let isCall = 1,
   def BL  : ABXI<0b1011, (outs), (ins bl_target:$func),
                 IIC_Br, "bl\t$func",
                 [(ARMcall tglobaladdr:$func)]>,
-            Requires<[IsARM]> {
+            Requires<[IsARM]>, Sched<[WriteBrL]> {
     let Inst{31-28} = 0b1110;
     bits<24> func;
     let Inst{23-0} = func;
@@ -1990,7 +1991,7 @@ let isCall = 1,
   def BL_pred : ABI<0b1011, (outs), (ins bl_target:$func),
                    IIC_Br, "bl", "\t$func",
                    [(ARMcall_pred tglobaladdr:$func)]>,
-                Requires<[IsARM]> {
+                Requires<[IsARM]>, Sched<[WriteBrL]> {
     bits<24> func;
     let Inst{23-0} = func;
     let DecoderMethod = "DecodeBranchImmInstruction";
@@ -2000,7 +2001,7 @@ let isCall = 1,
   def BLX : AXI<(outs), (ins GPR:$func), BrMiscFrm,
                 IIC_Br, "blx\t$func",
                 [(ARMcall GPR:$func)]>,
-            Requires<[IsARM, HasV5T]> {
+            Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
     bits<4> func;
     let Inst{31-4} = 0b1110000100101111111111110011;
     let Inst{3-0}  = func;
@@ -2009,7 +2010,7 @@ let isCall = 1,
   def BLX_pred : AI<(outs), (ins GPR:$func), BrMiscFrm,
                     IIC_Br, "blx", "\t$func",
                     [(ARMcall_pred GPR:$func)]>,
-                 Requires<[IsARM, HasV5T]> {
+                 Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
     bits<4> func;
     let Inst{27-4} = 0b000100101111111111110011;
     let Inst{3-0}  = func;
@@ -2019,18 +2020,18 @@ let isCall = 1,
   // Note: Restrict $func to the tGPR regclass to prevent it being in LR.
   def BX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
                    8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
-                   Requires<[IsARM, HasV4T]>;
+                   Requires<[IsARM, HasV4T]>, Sched<[WriteBr]>;
 
   // ARMv4
   def BMOVPCRX_CALL : ARMPseudoInst<(outs), (ins tGPR:$func),
                    8, IIC_Br, [(ARMcall_nolink tGPR:$func)]>,
-                   Requires<[IsARM, NoV4T]>;
+                   Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
 
   // mov lr, pc; b if callee is marked noreturn to avoid confusing the
   // return stack predictor.
   def BMOVPCB_CALL : ARMPseudoInst<(outs), (ins bl_target:$func),
                                8, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
-                      Requires<[IsARM]>;
+                      Requires<[IsARM]>, Sched<[WriteBr]>;
 }
 
 let isBranch = 1, isTerminator = 1 in {
@@ -2038,7 +2039,8 @@ let isBranch = 1, isTerminator = 1 in {
   // a two-value operand where a dag node expects two operands. :(
   def Bcc : ABI<0b1010, (outs), (ins br_target:$target),
                IIC_Br, "b", "\t$target",
-               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]> {
+               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>,
+               Sched<[WriteBr]>  {
     bits<24> target;
     let Inst{23-0} = target;
     let DecoderMethod = "DecodeBranchImmInstruction";
@@ -2051,25 +2053,27 @@ let isBranch = 1, isTerminator = 1 in {
     // should be sufficient.
     // FIXME: Is B really a Barrier? That doesn't seem right.
     def B : ARMPseudoExpand<(outs), (ins br_target:$target), 4, IIC_Br,
-                [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>;
+                [(br bb:$target)], (Bcc br_target:$target, (ops 14, zero_reg))>,
+                Sched<[WriteBr]>;
 
     let isNotDuplicable = 1, isIndirectBranch = 1 in {
     def BR_JTr : ARMPseudoInst<(outs),
                       (ins GPR:$target, i32imm:$jt, i32imm:$id),
                       0, IIC_Br,
-                      [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>;
+                      [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]>,
+                      Sched<[WriteBr]>;
     // FIXME: This shouldn't use the generic "addrmode2," but rather be split
     // into i12 and rs suffixed versions.
     def BR_JTm : ARMPseudoInst<(outs),
                      (ins addrmode2:$target, i32imm:$jt, i32imm:$id),
                      0, IIC_Br,
                      [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
-                       imm:$id)]>;
+                       imm:$id)]>, Sched<[WriteBrTbl]>;
     def BR_JTadd : ARMPseudoInst<(outs),
                    (ins GPR:$target, GPR:$idx, i32imm:$jt, i32imm:$id),
                    0, IIC_Br,
                    [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
-                     imm:$id)]>;
+                     imm:$id)]>, Sched<[WriteBrTbl]>;
     } // isNotDuplicable = 1, isIndirectBranch = 1
   } // isBarrier = 1
 
@@ -2078,7 +2082,7 @@ let isBranch = 1, isTerminator = 1 in {
 // BLX (immediate)
 def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
                "blx\t$target", []>,
-           Requires<[IsARM, HasV5T]> {
+           Requires<[IsARM, HasV5T]>, Sched<[WriteBrL]> {
   let Inst{31-25} = 0b1111101;
   bits<25> target;
   let Inst{23-0} = target{24-1};
@@ -2087,7 +2091,7 @@ def BLXi : AXI<(outs), (ins blx_target:$target), BrMiscFrm, NoItinerary,
 
 // Branch and Exchange Jazelle
 def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
-              [/* pattern left blank */]> {
+              [/* pattern left blank */]>, Sched<[WriteBr]> {
   bits<4> func;
   let Inst{23-20} = 0b0010;
   let Inst{19-8} = 0xfff;
@@ -2098,18 +2102,20 @@ def BXJ : ABI<0b0001, (outs), (ins GPR:$func), NoItinerary, "bxj", "\t$func",
 // Tail calls.
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
-  def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>;
+  def TCRETURNdi : PseudoInst<(outs), (ins i32imm:$dst), IIC_Br, []>,
+                   Sched<[WriteBr]>;
 
-  def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>;
+  def TCRETURNri : PseudoInst<(outs), (ins tcGPR:$dst), IIC_Br, []>,
+                   Sched<[WriteBr]>;
 
   def TAILJMPd : ARMPseudoExpand<(outs), (ins br_target:$dst),
                                  4, IIC_Br, [],
                                  (Bcc br_target:$dst, (ops 14, zero_reg))>,
-                                 Requires<[IsARM]>;
+                                 Requires<[IsARM]>, Sched<[WriteBr]>;
 
   def TAILJMPr : ARMPseudoExpand<(outs), (ins tcGPR:$dst),
                                  4, IIC_Br, [],
-                                 (BX GPR:$dst)>,
+                                 (BX GPR:$dst)>, Sched<[WriteBr]>,
                                  Requires<[IsARM]>;
 }
 
@@ -2123,7 +2129,8 @@ def SMC : ABI<0b0001, (outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
 
 // Supervisor Call (Software Interrupt)
 let isCall = 1, Uses = [SP] in {
-def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []> {
+def SVC : ABI<0b1111, (outs), (ins imm24b:$svc), IIC_Br, "svc", "\t$svc", []>,
+          Sched<[WriteBr]> {
   bits<24> svc;
   let Inst{23-0} = svc;
 }
@@ -2272,6 +2279,13 @@ def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rd, GPR:$dst2),
                  []>, Requires<[IsARM, HasV5TE]>;
 }
 
+def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "lda", "\t$Rt, $addr", []>;
+def LDAB : AIldracq<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldab", "\t$Rt, $addr", []>;
+def LDAH : AIldracq<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                    NoItinerary, "ldah", "\t$Rt, $addr", []>;
+
 // Indexed loads
 multiclass AI2_ldridx<bit isByte, string opc,
                       InstrItinClass iii, InstrItinClass iir> {
@@ -2284,7 +2298,6 @@ multiclass AI2_ldridx<bit isByte, string opc,
     let Inst{19-16} = addr{16-13};
     let Inst{11-0} = addr{11-0};
     let DecoderMethod = "DecodeLDRPreImm";
-    let AsmMatchConverter = "cvtLdWriteBackRegAddrModeImm12";
   }
 
   def _PRE_REG  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
@@ -2297,7 +2310,6 @@ multiclass AI2_ldridx<bit isByte, string opc,
     let Inst{11-0} = addr{11-0};
     let Inst{4} = 0;
     let DecoderMethod = "DecodeLDRPreReg";
-    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode2";
   }
 
   def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
@@ -2355,7 +2367,6 @@ multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
     let Inst{19-16} = addr{12-9};   // Rn
     let Inst{11-8}  = addr{7-4};    // imm7_4/zero
     let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
-    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode3";
     let DecoderMethod = "DecodeAddrMode3Instruction";
   }
   def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
@@ -2391,7 +2402,6 @@ def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
   let DecoderMethod = "DecodeAddrMode3Instruction";
-  let AsmMatchConverter = "cvtLdrdPre";
 }
 def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
                           (ins addr_offset_none:$addr, am3offset:$offset),
@@ -2494,7 +2504,6 @@ multiclass AI3ldrT<bits<4> op, string opc> {
     let Inst{22} = 1;
     let Inst{11-8} = offset{7-4};
     let Inst{3-0} = offset{3-0};
-    let AsmMatchConverter = "cvtLdExtTWriteBackImm";
   }
   def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
                       (ins addr_offset_none:$addr, postidx_reg:$Rm),
@@ -2506,7 +2515,6 @@ multiclass AI3ldrT<bits<4> op, string opc> {
     let Inst{11-8} = 0;
     let Unpredictable{11-8} = 0b1111;
     let Inst{3-0} = Rm{3-0};
-    let AsmMatchConverter = "cvtLdExtTWriteBackReg";
     let DecoderMethod = "DecodeLDR";
   }
 }
@@ -2544,7 +2552,6 @@ multiclass AI2_stridx<bit isByte, string opc,
     let Inst{23}    = addr{12};     // U (add = ('U' == 1))
     let Inst{19-16} = addr{16-13};  // Rn
     let Inst{11-0}  = addr{11-0};   // imm12
-    let AsmMatchConverter = "cvtStWriteBackRegAddrModeImm12";
     let DecoderMethod = "DecodeSTRPreImm";
   }
 
@@ -2558,7 +2565,6 @@ multiclass AI2_stridx<bit isByte, string opc,
     let Inst{19-16} = addr{16-13}; // Rn
     let Inst{11-0}  = addr{11-0};
     let Inst{4}     = 0;           // Inst{4} = 0
-    let AsmMatchConverter = "cvtStWriteBackRegAddrMode2";
     let DecoderMethod = "DecodeSTRPreReg";
   }
   def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
@@ -2667,7 +2673,6 @@ def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
   let Inst{19-16} = addr{12-9};   // Rn
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
-  let AsmMatchConverter = "cvtStWriteBackRegAddrMode3";
   let DecoderMethod = "DecodeAddrMode3Instruction";
 }
 
@@ -2701,7 +2706,6 @@ def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
   let DecoderMethod = "DecodeAddrMode3Instruction";
-  let AsmMatchConverter = "cvtStrdPre";
 }
 
 def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
@@ -2808,7 +2812,6 @@ multiclass AI3strT<bits<4> op, string opc> {
     let Inst{22} = 1;
     let Inst{11-8} = offset{7-4};
     let Inst{3-0} = offset{3-0};
-    let AsmMatchConverter = "cvtStExtTWriteBackImm";
   }
   def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
                       (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
@@ -2819,13 +2822,18 @@ multiclass AI3strT<bits<4> op, string opc> {
     let Inst{22} = 0;
     let Inst{11-8} = 0;
     let Inst{3-0} = Rm{3-0};
-    let AsmMatchConverter = "cvtStExtTWriteBackReg";
   }
 }
 
 
 defm STRHT : AI3strT<0b1011, "strht">;
 
+def STL : AIstrrel<0b00, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                   NoItinerary, "stl", "\t$Rt, $addr", []>;
+def STLB : AIstrrel<0b10, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlb", "\t$Rt, $addr", []>;
+def STLH : AIstrrel<0b11, (outs), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlh", "\t$Rt, $addr", []>;
 
 //===----------------------------------------------------------------------===//
 //  Load / store multiple Instructions.
@@ -2955,7 +2963,7 @@ defm sysSTM : arm_ldst_mult<"stm", " ^", 0, 1, LdStMulFrm, IIC_iStore_m,
 
 let neverHasSideEffects = 1 in
 def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
-                "mov", "\t$Rd, $Rm", []>, UnaryDP {
+                "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<4> Rm;
 
@@ -2969,7 +2977,7 @@ def MOVr : AsI1<0b1101, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMOVr,
 // A version for the smaller set of tail call registers.
 let neverHasSideEffects = 1 in
 def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
-                IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP {
+                IIC_iMOVr, "mov", "\t$Rd, $Rm", []>, UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<4> Rm;
 
@@ -2982,7 +2990,8 @@ def MOVr_TC : AsI1<0b1101, (outs tcGPR:$Rd), (ins tcGPR:$Rm), DPFrm,
 def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
                 DPSoRegRegFrm, IIC_iMOVsr,
                 "mov", "\t$Rd, $src",
-                [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP {
+                [(set GPRnopc:$Rd, shift_so_reg_reg:$src)]>, UnaryDP,
+                Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> src;
   let Inst{15-12} = Rd;
@@ -2998,7 +3007,7 @@ def MOVsr : AsI1<0b1101, (outs GPRnopc:$Rd), (ins shift_so_reg_reg:$src),
 def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
                 DPSoRegImmFrm, IIC_iMOVsr,
                 "mov", "\t$Rd, $src", [(set GPR:$Rd, shift_so_reg_imm:$src)]>,
-                UnaryDP {
+                UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> src;
   let Inst{15-12} = Rd;
@@ -3011,7 +3020,8 @@ def MOVsi : AsI1<0b1101, (outs GPR:$Rd), (ins shift_so_reg_imm:$src),
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
 def MOVi : AsI1<0b1101, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm, IIC_iMOVi,
-                "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP {
+                "mov", "\t$Rd, $imm", [(set GPR:$Rd, so_imm:$imm)]>, UnaryDP,
+                Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> imm;
   let Inst{25} = 1;
@@ -3025,7 +3035,7 @@ def MOVi16 : AI1<0b1000, (outs GPR:$Rd), (ins imm0_65535_expr:$imm),
                  DPFrm, IIC_iMOVi,
                  "movw", "\t$Rd, $imm",
                  [(set GPR:$Rd, imm0_65535:$imm)]>,
-                 Requires<[IsARM, HasV6T2]>, UnaryDP {
+                 Requires<[IsARM, HasV6T2]>, UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<16> imm;
   let Inst{15-12} = Rd;
@@ -3041,7 +3051,8 @@ def : InstAlias<"mov${p} $Rd, $imm",
         Requires<[IsARM]>;
 
 def MOVi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
-                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                      Sched<[WriteALU]>;
 
 let Constraints = "$src = $Rd" in {
 def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
@@ -3051,7 +3062,7 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
                   [(set GPRnopc:$Rd,
                         (or (and GPR:$src, 0xffff),
                             lo16AllZero:$imm))]>, UnaryDP,
-                  Requires<[IsARM, HasV6T2]> {
+                  Requires<[IsARM, HasV6T2]>, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<16> imm;
   let Inst{15-12} = Rd;
@@ -3063,7 +3074,8 @@ def MOVTi16 : AI1<0b1010, (outs GPRnopc:$Rd),
 }
 
 def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd),
-                      (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+                      (ins GPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                      Sched<[WriteALU]>;
 
 } // Constraints
 
@@ -3073,7 +3085,7 @@ def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
 let Uses = [CPSR] in
 def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
                     [(set GPR:$Rd, (ARMrrx GPR:$Rm))]>, UnaryDP,
-                    Requires<[IsARM]>;
+                    Requires<[IsARM]>, Sched<[WriteALU]>;
 
 // These aren't really mov instructions, but we have to define them this way
 // due to flag operands.
@@ -3081,10 +3093,10 @@ def RRX: PseudoInst<(outs GPR:$Rd), (ins GPR:$Rm), IIC_iMOVsi,
 let Defs = [CPSR] in {
 def MOVsrl_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
                       [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP,
-                      Requires<[IsARM]>;
+                      Sched<[WriteALU]>, Requires<[IsARM]>;
 def MOVsra_flag : PseudoInst<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
                       [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP,
-                      Requires<[IsARM]>;
+                      Sched<[WriteALU]>, Requires<[IsARM]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -3250,7 +3262,8 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
           list<dag> pattern = [],
           dag iops = (ins GPRnopc:$Rn, GPRnopc:$Rm),
           string asm = "\t$Rd, $Rn, $Rm">
-  : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern> {
+  : AI<(outs GPRnopc:$Rd), iops, DPFrm, IIC_iALUr, opc, asm, pattern>,
+    Sched<[WriteALU, ReadALU, ReadALU]> {
   bits<4> Rn;
   bits<4> Rd;
   bits<4> Rm;
@@ -3265,9 +3278,11 @@ class AAI<bits<8> op27_20, bits<8> op11_4, string opc,
 
 // Saturating add/subtract
 
+let DecoderMethod = "DecodeQADDInstruction" in
 def QADD    : AAI<0b00010000, 0b00000101, "qadd",
                   [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm, GPRnopc:$Rn))],
                   (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
+
 def QSUB    : AAI<0b00010010, 0b00000101, "qsub",
                   [(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm, GPRnopc:$Rn))],
                   (ins GPRnopc:$Rm, GPRnopc:$Rn), "\t$Rd, $Rm, $Rn">;
@@ -3326,7 +3341,7 @@ def UHSUB8  : AAI<0b01100111, 0b11111111, "uhsub8">;
 def USAD8  : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                 MulFrm /* for convenience */, NoItinerary, "usad8",
                 "\t$Rd, $Rn, $Rm", []>,
-             Requires<[IsARM, HasV6]> {
+             Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]> {
   bits<4> Rd;
   bits<4> Rn;
   bits<4> Rm;
@@ -3340,7 +3355,7 @@ def USAD8  : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
 def USADA8 : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                 MulFrm /* for convenience */, NoItinerary, "usada8",
                 "\t$Rd, $Rn, $Rm, $Ra", []>,
-             Requires<[IsARM, HasV6]> {
+             Requires<[IsARM, HasV6]>, Sched<[WriteALU, ReadALU, ReadALU]>{
   bits<4> Rd;
   bits<4> Rn;
   bits<4> Rm;
@@ -3473,7 +3488,7 @@ def BFI:I<(outs GPRnopc:$Rd), (ins GPRnopc:$src, GPR:$Rn, bf_inv_mask_imm:$imm),
 
 def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
                   "mvn", "\t$Rd, $Rm",
-                  [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP {
+                  [(set GPR:$Rd, (not GPR:$Rm))]>, UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<4> Rm;
   let Inst{25} = 0;
@@ -3484,7 +3499,8 @@ def  MVNr  : AsI1<0b1111, (outs GPR:$Rd), (ins GPR:$Rm), DPFrm, IIC_iMVNr,
 }
 def  MVNsi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
                   DPSoRegImmFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
-                  [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP {
+                  [(set GPR:$Rd, (not so_reg_imm:$shift))]>, UnaryDP,
+                  Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> shift;
   let Inst{25} = 0;
@@ -3496,7 +3512,8 @@ def  MVNsi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_imm:$shift),
 }
 def  MVNsr  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
                   DPSoRegRegFrm, IIC_iMVNsr, "mvn", "\t$Rd, $shift",
-                  [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP {
+                  [(set GPR:$Rd, (not so_reg_reg:$shift))]>, UnaryDP,
+                  Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> shift;
   let Inst{25} = 0;
@@ -3511,7 +3528,7 @@ def  MVNsr  : AsI1<0b1111, (outs GPR:$Rd), (ins so_reg_reg:$shift),
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
 def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins so_imm:$imm), DPFrm,
                   IIC_iMVNi, "mvn", "\t$Rd, $imm",
-                  [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP {
+                  [(set GPR:$Rd, so_imm_not:$imm)]>,UnaryDP, Sched<[WriteALU]> {
   bits<4> Rd;
   bits<12> imm;
   let Inst{25} = 1;
@@ -3993,14 +4010,58 @@ def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
 def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
-                   (srl GPRnopc:$src2, imm16_31:$sh)),
+                   (srl GPRnopc:$src2, imm16:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>;
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (sra GPRnopc:$src2, imm16_31:$sh)),
                (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
 def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
                    (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
                (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm1_15:$sh)>;
 
 //===----------------------------------------------------------------------===//
+// CRC Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class AI_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : AInoP<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm), MiscFrm, NoItinerary,
+               !strconcat("crc32", suffix), "\t$Rd, $Rn, $Rm",
+               [(set GPRnopc:$Rd, (builtin GPRnopc:$Rn, GPRnopc:$Rm))]>,
+               Requires<[IsARM, HasV8, HasCRC]> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{31-28} = 0b1110;
+  let Inst{27-23} = 0b00010;
+  let Inst{22-21} = sz;
+  let Inst{20}    = 0;
+  let Inst{19-16} = Rn;
+  let Inst{15-12} = Rd;
+  let Inst{11-10} = 0b00;
+  let Inst{9}     = C;
+  let Inst{8}     = 0;
+  let Inst{7-4}   = 0b0100;
+  let Inst{3-0}   = Rm;
+
+  let Unpredictable{11-8} = 0b1101;
+}
+
+def CRC32B  : AI_crc32<0, 0b00, "b", int_arm_crc32b>;
+def CRC32CB : AI_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def CRC32H  : AI_crc32<0, 0b01, "h", int_arm_crc32h>;
+def CRC32CH : AI_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def CRC32W  : AI_crc32<0, 0b10, "w", int_arm_crc32w>;
+def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
 //  Comparison Instructions...
 //
 
@@ -4022,7 +4083,8 @@ def : ARMPat<(ARMcmpZ GPR:$src, so_reg_reg:$rhs),
 let isCompare = 1, Defs = [CPSR] in {
 def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
                 "cmn", "\t$Rn, $imm",
-                [(ARMcmn GPR:$Rn, so_imm:$imm)]> {
+                [(ARMcmn GPR:$Rn, so_imm:$imm)]>,
+                Sched<[WriteCMP, ReadALU]> {
   bits<4> Rn;
   bits<12> imm;
   let Inst{25} = 1;
@@ -4038,7 +4100,7 @@ def CMNri : AI1<0b1011, (outs), (ins GPR:$Rn, so_imm:$imm), DPFrm, IIC_iCMPi,
 def CMNzrr : AI1<0b1011, (outs), (ins GPR:$Rn, GPR:$Rm), DPFrm, IIC_iCMPr,
                  "cmn", "\t$Rn, $Rm",
                  [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
-                   GPR:$Rn, GPR:$Rm)]> {
+                   GPR:$Rn, GPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
   bits<4> Rn;
   bits<4> Rm;
   let isCommutable = 1;
@@ -4056,7 +4118,8 @@ def CMNzrsi : AI1<0b1011, (outs),
                   (ins GPR:$Rn, so_reg_imm:$shift), DPSoRegImmFrm, IIC_iCMPsr,
                   "cmn", "\t$Rn, $shift",
                   [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
-                    GPR:$Rn, so_reg_imm:$shift)]> {
+                    GPR:$Rn, so_reg_imm:$shift)]>,
+                    Sched<[WriteCMPsi, ReadALU]> {
   bits<4> Rn;
   bits<12> shift;
   let Inst{25} = 0;
@@ -4074,7 +4137,8 @@ def CMNzrsr : AI1<0b1011, (outs),
                   (ins GPRnopc:$Rn, so_reg_reg:$shift), DPSoRegRegFrm, IIC_iCMPsr,
                   "cmn", "\t$Rn, $shift",
                   [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
-                    GPRnopc:$Rn, so_reg_reg:$shift)]> {
+                    GPRnopc:$Rn, so_reg_reg:$shift)]>,
+                    Sched<[WriteCMPsr, ReadALU]> {
   bits<4> Rn;
   bits<12> shift;
   let Inst{25} = 0;
@@ -4112,65 +4176,77 @@ let usesCustomInserter = 1, isBranch = 1, isTerminator = 1,
 def BCCi64 : PseudoInst<(outs),
     (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, brtarget:$dst),
      IIC_Br,
-    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>;
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, GPR:$rhs1, GPR:$rhs2, bb:$dst)]>,
+    Sched<[WriteBr]>;
 
 def BCCZi64 : PseudoInst<(outs),
      (ins i32imm:$cc, GPR:$lhs1, GPR:$lhs2, brtarget:$dst), IIC_Br,
-    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>;
+    [(ARMBcci64 imm:$cc, GPR:$lhs1, GPR:$lhs2, 0, 0, bb:$dst)]>,
+    Sched<[WriteBr]>;
 } // usesCustomInserter
 
 
 // Conditional moves
-// FIXME: should be able to write a pattern for ARMcmov, but can't use
-// a two-value operand where a dag node expects two operands. :(
 let neverHasSideEffects = 1 in {
 
 let isCommutable = 1, isSelect = 1 in
-def MOVCCr : ARMPseudoInst<(outs GPR:$Rd), (ins GPR:$false, GPR:$Rm, pred:$p),
+def MOVCCr : ARMPseudoInst<(outs GPR:$Rd),
+                           (ins GPR:$false, GPR:$Rm, cmovpred:$p),
                            4, IIC_iCMOVr,
-  [/*(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
-      RegConstraint<"$false = $Rd">;
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, GPR:$Rm,
+                                                   cmovpred:$p))]>,
+             RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 def MOVCCsi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_reg_imm:$shift, pred:$p),
-                           4, IIC_iCMOVsr,
-  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_imm:$shift,
-                            imm:$cc, CCR:$ccr))*/]>,
-      RegConstraint<"$false = $Rd">;
+                            (ins GPR:$false, so_reg_imm:$shift, cmovpred:$p),
+                            4, IIC_iCMOVsr,
+                            [(set GPR:$Rd,
+                                  (ARMcmov GPR:$false, so_reg_imm:$shift,
+                                           cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 def MOVCCsr : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_reg_reg:$shift, pred:$p),
+                            (ins GPR:$false, so_reg_reg:$shift, cmovpred:$p),
                            4, IIC_iCMOVsr,
-  [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
-                            imm:$cc, CCR:$ccr))*/]>,
-      RegConstraint<"$false = $Rd">;
+  [(set GPR:$Rd, (ARMcmov GPR:$false, so_reg_reg:$shift,
+                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 
 let isMoveImm = 1 in
-def MOVCCi16 : ARMPseudoInst<(outs GPR:$Rd),
-                             (ins GPR:$false, imm0_65535_expr:$imm, pred:$p),
-                             4, IIC_iMOVi,
-                             []>,
-      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
+def MOVCCi16
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                    4, IIC_iMOVi,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>,
+      Sched<[WriteALU]>;
 
 let isMoveImm = 1 in
 def MOVCCi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_imm:$imm, pred:$p),
+                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
                            4, IIC_iCMOVi,
-   [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
-      RegConstraint<"$false = $Rd">;
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm:$imm,
+                                                   cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 // Two instruction predicate mov immediate.
 let isMoveImm = 1 in
-def MOVCCi32imm : ARMPseudoInst<(outs GPR:$Rd),
-                                (ins GPR:$false, i32imm:$src, pred:$p),
-                  8, IIC_iCMOVix2, []>, RegConstraint<"$false = $Rd">;
+def MOVCCi32imm
+    : ARMPseudoInst<(outs GPR:$Rd),
+                    (ins GPR:$false, i32imm:$src, cmovpred:$p),
+                    8, IIC_iCMOVix2,
+                    [(set GPR:$Rd, (ARMcmov GPR:$false, imm:$src,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Requires<[IsARM, HasV6T2]>;
 
 let isMoveImm = 1 in
 def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
-                           (ins GPR:$false, so_imm:$imm, pred:$p),
+                           (ins GPR:$false, so_imm:$imm, cmovpred:$p),
                            4, IIC_iCMOVi,
- [/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
-                RegConstraint<"$false = $Rd">;
+                           [(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm,
+                                                   cmovpred:$p))]>,
+                RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 } // neverHasSideEffects
 
@@ -4189,10 +4265,20 @@ def memb_opt : Operand<i32> {
   let DecoderMethod = "DecodeMemBarrierOption";
 }
 
+def InstSyncBarrierOptOperand : AsmOperandClass {
+  let Name = "InstSyncBarrierOpt";
+  let ParserMethod = "parseInstSyncBarrierOptOperand";
+}
+def instsyncb_opt : Operand<i32> {
+  let PrintMethod = "printInstSyncBOption";
+  let ParserMatchClass = InstSyncBarrierOptOperand;
+  let DecoderMethod = "DecodeInstSyncBarrierOption";
+}
+
 // memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
 def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
-                "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
                 Requires<[IsARM, HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf57ff05;
@@ -4201,7 +4287,7 @@ def DMB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
 }
 
 def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
-                "dsb", "\t$opt", []>,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
                 Requires<[IsARM, HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf57ff04;
@@ -4209,7 +4295,7 @@ def DSB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
 }
 
 // ISB has only full system option
-def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
+def ISB : AInoP<(outs), (ins instsyncb_opt:$opt), MiscFrm, NoItinerary,
                 "isb", "\t$opt", []>,
                 Requires<[IsARM, HasDB]> {
   bits<4> opt;
@@ -4217,124 +4303,219 @@ def ISB : AInoP<(outs), (ins memb_opt:$opt), MiscFrm, NoItinerary,
   let Inst{3-0} = opt;
 }
 
+let usesCustomInserter = 1, Defs = [CPSR] in {
+
 // Pseudo instruction that combines movs + predicated rsbmi
 // to implement integer ABS
-let usesCustomInserter = 1, Defs = [CPSR] in
-def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
+  def ABS : ARMPseudoInst<(outs GPR:$dst), (ins GPR:$src), 8, NoItinerary, []>;
 
-let usesCustomInserter = 1 in {
-  let Defs = [CPSR] in {
+// Atomic pseudo-insts which will be lowered to ldrex/strex loops.
+// (64-bit pseudos use a hand-written selection code).
+  let mayLoad = 1, mayStore = 1 in {
     def ATOMIC_LOAD_ADD_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_SUB_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_AND_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_OR_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_XOR_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_NAND_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MIN_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_min_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MAX_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_max_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMIN_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umin_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMAX_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umax_8 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_CMP_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_ADD_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_SUB_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_AND_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_OR_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_XOR_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_NAND_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MIN_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_min_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MAX_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_max_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMIN_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umin_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMAX_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umax_16 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_CMP_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_ADD_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_SUB_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_AND_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_OR_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_XOR_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_NAND_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
-      [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$incr, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MIN_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_min_32 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_MAX_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_max_32 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMIN_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umin_32 GPR:$ptr, GPR:$val))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_LOAD_UMAX_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$val), NoItinerary,
-      [(set GPR:$dst, (atomic_load_umax_32 GPR:$ptr, GPR:$val))]>;
-
-    def ATOMIC_SWAP_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>;
-    def ATOMIC_SWAP_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$val, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_SWAP_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>;
-
-    def ATOMIC_CMP_SWAP_I8 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>;
-    def ATOMIC_CMP_SWAP_I16 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>;
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
     def ATOMIC_CMP_SWAP_I32 : PseudoInst<
-      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
-      [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>;
-}
+      (outs GPR:$dst),
+      (ins GPR:$ptr, GPR:$old, GPR:$new, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_ADD_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_SUB_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_AND_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_OR_I64 :  PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_XOR_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_NAND_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_MIN_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_MAX_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_UMIN_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_LOAD_UMAX_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_SWAP_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
+    def ATOMIC_CMP_SWAP_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$cmp1, GPR:$cmp2,
+           GPR:$set1, GPR:$set2, i32imm:$ordering),
+      NoItinerary, []>;
+  }
+  let mayLoad = 1 in
+    def ATOMIC_LOAD_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, i32imm:$ordering),
+      NoItinerary, []>;
+  let mayStore = 1 in
+    def ATOMIC_STORE_I64 : PseudoInst<
+      (outs GPR:$dst1, GPR:$dst2),
+      (ins GPR:$addr, GPR:$src1, GPR:$src2, i32imm:$ordering),
+      NoItinerary, []>;
 }
 
 let usesCustomInserter = 1 in {
@@ -4344,48 +4525,147 @@ let usesCustomInserter = 1 in {
       [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
 }
 
+def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def strex_1 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def strex_2 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def strex_4 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
 let mayLoad = 1 in {
 def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
-                     NoItinerary,
-                    "ldrexb", "\t$Rt, $addr", []>;
+                     NoItinerary, "ldrexb", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
 def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
-                     NoItinerary, "ldrexh", "\t$Rt, $addr", []>;
+                     NoItinerary, "ldrexh", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
 def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
-                     NoItinerary, "ldrex", "\t$Rt, $addr", []>;
+                     NoItinerary, "ldrex", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>;
 let hasExtraDefRegAllocReq = 1 in
-def LDREXD: AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+def LDREXD : AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
                       NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
   let DecoderMethod = "DecodeDoubleRegLoad";
 }
+
+def LDAEXB : AIldaex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexb", "\t$Rt, $addr", []>;
+def LDAEXH : AIldaex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaexh", "\t$Rt, $addr", []>;
+def LDAEX  : AIldaex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
+                     NoItinerary, "ldaex", "\t$Rt, $addr", []>;
+let hasExtraDefRegAllocReq = 1 in
+def LDAEXD : AIldaex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
+                      NoItinerary, "ldaexd", "\t$Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegLoad";
+}
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
 def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
-                    NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>;
+                    NoItinerary, "strexb", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_1 GPR:$Rt, addr_offset_none:$addr))]>;
 def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
-                    NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>;
+                    NoItinerary, "strexh", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_2 GPR:$Rt, addr_offset_none:$addr))]>;
 def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
-                    NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>;
+                    NoItinerary, "strex", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_4 GPR:$Rt, addr_offset_none:$addr))]>;
 let hasExtraSrcRegAllocReq = 1 in
 def STREXD : AIstrex<0b01, (outs GPR:$Rd),
                     (ins GPRPairOp:$Rt, addr_offset_none:$addr),
                     NoItinerary, "strexd", "\t$Rd, $Rt, $addr", []> {
   let DecoderMethod = "DecodeDoubleRegStore";
 }
+def STLEXB: AIstlex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexb", "\t$Rd, $Rt, $addr",
+                    []>;
+def STLEXH: AIstlex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexh", "\t$Rd, $Rt, $addr",
+                    []>;
+def STLEX : AIstlex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlex", "\t$Rd, $Rt, $addr",
+                    []>;
+let hasExtraSrcRegAllocReq = 1 in
+def STLEXD : AIstlex<0b01, (outs GPR:$Rd),
+                    (ins GPRPairOp:$Rt, addr_offset_none:$addr),
+                    NoItinerary, "stlexd", "\t$Rd, $Rt, $addr", []> {
+  let DecoderMethod = "DecodeDoubleRegStore";
+}
 }
 
-
-def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>,
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
+                [(int_arm_clrex)]>,
             Requires<[IsARM, HasV7]>  {
   let Inst{31-0} = 0b11110101011111111111000000011111;
 }
 
+def : ARMPat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
+             (LDREXB addr_offset_none:$addr)>;
+def : ARMPat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
+             (LDREXH addr_offset_none:$addr)>;
+def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+             (STREXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+             (STREXH GPR:$Rt, addr_offset_none:$addr)>;
+
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_load_acquire_8  : acquiring_load<atomic_load_8>;
+def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
+def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
+
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+def atomic_store_release_8  : releasing_store<atomic_store_8>;
+def atomic_store_release_16 : releasing_store<atomic_store_16>;
+def atomic_store_release_32 : releasing_store<atomic_store_32>;
+
+let AddedComplexity = 8 in {
+  def : ARMPat<(atomic_load_acquire_8 addr_offset_none:$addr),  (LDAB addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_16 addr_offset_none:$addr), (LDAH addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_load_acquire_32 addr_offset_none:$addr), (LDA  addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (STLB GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (STLH GPR:$val, addr_offset_none:$addr)>;
+  def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
 // SWP/SWPB are deprecated in V6/V7.
 let mayLoad = 1, mayStore = 1 in {
 def SWP : AIswp<0, (outs GPRnopc:$Rt),
-                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>;
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
+                Requires<[PreV8]>;
 def SWPB: AIswp<1, (outs GPRnopc:$Rt),
-                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>;
+                (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
+                Requires<[PreV8]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4396,7 +4676,8 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
             c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
             NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
             [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
-                          imm:$CRm, imm:$opc2)]> {
+                          imm:$CRm, imm:$opc2)]>,
+            Requires<[PreV8]> {
   bits<4> opc1;
   bits<4> CRn;
   bits<4> CRd;
@@ -4413,11 +4694,12 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
   let Inst{23-20} = opc1;
 }
 
-def CDP2 : ABXI<0b1110, (outs), (ins pf_imm:$cop, imm0_15:$opc1,
+def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
                NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
-                              imm:$CRm, imm:$opc2)]> {
+                              imm:$CRm, imm:$opc2)]>,
+               Requires<[PreV8]> {
   let Inst{31-28} = 0b1111;
   bits<4> opc1;
   bits<4> CRn;
@@ -4595,10 +4877,10 @@ defm LDC   : LdStCop <1, 0, "ldc">;
 defm LDCL  : LdStCop <1, 1, "ldcl">;
 defm STC   : LdStCop <0, 0, "stc">;
 defm STCL  : LdStCop <0, 1, "stcl">;
-defm LDC2  : LdSt2Cop<1, 0, "ldc2">;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l">;
-defm STC2  : LdSt2Cop<0, 0, "stc2">;
-defm STC2L : LdSt2Cop<0, 1, "stc2l">;
+defm LDC2  : LdSt2Cop<1, 0, "ldc2">, Requires<[PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l">, Requires<[PreV8]>;
+defm STC2  : LdSt2Cop<0, 0, "stc2">, Requires<[PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l">, Requires<[PreV8]>;
 
 //===----------------------------------------------------------------------===//
 // Move between coprocessor and ARM core register.
@@ -4631,16 +4913,17 @@ def MCR : MovRCopro<"mcr", 0 /* from ARM core register to coprocessor */,
                     (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                          c_imm:$CRm, imm0_7:$opc2),
                     [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                                  imm:$CRm, imm:$opc2)]>;
+                                  imm:$CRm, imm:$opc2)]>,
+                    ComplexDeprecationPredicate<"MCR">;
 def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
                    (MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                         c_imm:$CRm, 0, pred:$p)>;
 def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
-                    (outs GPR:$Rt),
+                    (outs GPRwithAPSR:$Rt),
                     (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
                          imm0_7:$opc2), []>;
 def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
-                   (MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                   (MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                         c_imm:$CRm, 0, pred:$p)>;
 
 def : ARMPat<(int_arm_mrc imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
@@ -4650,7 +4933,7 @@ class MovRCopro2<string opc, bit direction, dag oops, dag iops,
                  list<dag> pattern>
   : ABXI<0b1110, oops, iops, NoItinerary,
          !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"), pattern> {
-  let Inst{31-28} = 0b1111;
+  let Inst{31-24} = 0b11111110;
   let Inst{20} = direction;
   let Inst{4} = 1;
 
@@ -4674,16 +4957,18 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
                       (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                            c_imm:$CRm, imm0_7:$opc2),
                       [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                                     imm:$CRm, imm:$opc2)]>;
+                                     imm:$CRm, imm:$opc2)]>,
+                      Requires<[PreV8]>;
 def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm",
                    (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
 def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
-                      (outs GPR:$Rt),
+                      (outs GPRwithAPSR:$Rt),
                       (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
-                           imm0_7:$opc2), []>;
+                           imm0_7:$opc2), []>,
+                      Requires<[PreV8]>;
 def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm",
-                   (MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                   (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
 
 def : ARMV5TPat<(int_arm_mrc2 imm:$cop, imm:$opc1, imm:$CRn,
@@ -4718,7 +5003,8 @@ def MRRC : MovRRCopro<"mrrc", 1 /* from coprocessor to ARM core register */>;
 class MovRRCopro2<string opc, bit direction, list<dag> pattern = []>
   : ABXI<0b1100, (outs), (ins p_imm:$cop, imm0_15:$opc1,
          GPRnopc:$Rt, GPRnopc:$Rt2, c_imm:$CRm), NoItinerary,
-         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
+         !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
+    Requires<[PreV8]> {
   let Inst{31-28} = 0b1111;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -4820,7 +5106,7 @@ def MSRi : ABI<0b0011, (outs), (ins msr_mask:$mask,  so_imm:$a), NoItinerary,
 let isCall = 1,
   Defs = [R0, R12, LR, CPSR], Uses = [SP] in {
   def TPsoft : PseudoInst<(outs), (ins), IIC_Br,
-               [(set R0, ARMthread_pointer)]>;
+               [(set R0, ARMthread_pointer)]>, Sched<[WriteBr]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4884,7 +5170,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in
   def MOVPCRX : ARMPseudoExpand<(outs), (ins GPR:$dst),
                     4, IIC_Br, [(brind GPR:$dst)],
                     (MOVr PC, GPR:$dst, (ops 14, zero_reg), zero_reg)>,
-                  Requires<[IsARM, NoV4T]>;
+                  Requires<[IsARM, NoV4T]>, Sched<[WriteBr]>;
 
 // Large immediate handling.
 
@@ -5153,10 +5439,10 @@ def : MnemonicAlias<"rfeed", "rfeib">;
 def : MnemonicAlias<"rfe", "rfeia">;
 
 // SRS aliases
-def : MnemonicAlias<"srsfa", "srsda">;
-def : MnemonicAlias<"srsea", "srsdb">;
-def : MnemonicAlias<"srsfd", "srsia">;
-def : MnemonicAlias<"srsed", "srsib">;
+def : MnemonicAlias<"srsfa", "srsib">;
+def : MnemonicAlias<"srsea", "srsia">;
+def : MnemonicAlias<"srsfd", "srsdb">;
+def : MnemonicAlias<"srsed", "srsda">;
 def : MnemonicAlias<"srs", "srsia">;
 
 // QSAX == QSUBADDX
@@ -5233,7 +5519,7 @@ def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm",
                              cc_out:$s)>;
 }
 def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm",
-                        (ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+                        (ins GPR:$Rd, GPR:$Rm, pred:$p, cc_out:$s)>;
 let TwoOperandAliasConstraint = "$Rn = $Rd" in {
 def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm",
                         (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p,
@@ -5269,4 +5555,5 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm",
 
 // 'it' blocks in ARM mode just validate the predicates. The IT itself
 // is discarded.
-def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>;
+def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>,
+         ComplexDeprecationPredicate<"IT">;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 896fd0f..43bd4c2 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -626,7 +626,7 @@ class VLD1D<bits<4> op7_4, string Dt>
           "vld1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 class VLD1Q<bits<4> op7_4, string Dt>
   : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
@@ -634,7 +634,7 @@ class VLD1Q<bits<4> op7_4, string Dt>
           "vld1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 
 def  VLD1d8   : VLD1D<{0,0,0,?}, "8">;
@@ -655,16 +655,14 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 multiclass VLD1QWB<bits<4> op7_4, string Dt> {
@@ -674,16 +672,14 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 
@@ -703,7 +699,7 @@ class VLD1D3<bits<4> op7_4, string Dt>
           "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
   def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
@@ -712,16 +708,14 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 
@@ -744,7 +738,7 @@ class VLD1D4<bits<4> op7_4, string Dt>
           "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
   def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
@@ -753,16 +747,14 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 
@@ -786,7 +778,7 @@ class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
           "vld2", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST2Instruction";
 }
 
 def  VLD2d8   : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2>;
@@ -810,16 +802,14 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
+    let DecoderMethod = "DecodeVLDST2Instruction";
   }
   def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), itin,
                         "vld2", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVLDInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
+    let DecoderMethod = "DecodeVLDST2Instruction";
   }
 }
 
@@ -853,7 +843,7 @@ class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
 }
 
 def  VLD3d8   : VLD3D<0b0100, {0,0,0,?}, "8">;
@@ -872,7 +862,7 @@ class VLD3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
 }
 
 def VLD3d8_UPD  : VLD3DWB<0b0100, {0,0,0,?}, "8">;
@@ -912,7 +902,7 @@ class VLD4D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
 }
 
 def  VLD4d8   : VLD4D<0b0000, {0,0,?,?}, "8">;
@@ -931,7 +921,7 @@ class VLD4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVLDInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
 }
 
 def VLD4d8_UPD  : VLD4DWB<0b0000, {0,0,?,?}, "8">;
@@ -1348,7 +1338,6 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                         (outs VecListOneDAllLanes:$Vd, GPR:$wb),
@@ -1357,7 +1346,6 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
@@ -1369,7 +1357,6 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                         (outs VecListDPairAllLanes:$Vd, GPR:$wb),
@@ -1378,7 +1365,6 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -1419,7 +1405,6 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD2DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<1, 0b10, 0b1101, op7_4,
                         (outs VdTy:$Vd, GPR:$wb),
@@ -1428,7 +1413,6 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD2DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -1580,14 +1564,14 @@ class VST1D<bits<4> op7_4, string Dt>
           IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 class VST1Q<bits<4> op7_4, string Dt>
   : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$Rn, VecListDPair:$Vd),
           IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 
 def  VST1d8   : VST1D<{0,0,0,?}, "8">;
@@ -1608,8 +1592,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListOneD:$Vd),
@@ -1617,8 +1600,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 multiclass VST1QWB<bits<4> op7_4, string Dt> {
@@ -1628,8 +1610,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListDPair:$Vd),
@@ -1637,8 +1618,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 
@@ -1659,7 +1639,7 @@ class VST1D3<bits<4> op7_4, string Dt>
           IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 multiclass VST1D3WB<bits<4> op7_4, string Dt> {
   def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
@@ -1668,8 +1648,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
@@ -1677,8 +1656,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 
@@ -1704,7 +1682,7 @@ class VST1D4<bits<4> op7_4, string Dt>
           []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST1Instruction";
 }
 multiclass VST1D4WB<bits<4> op7_4, string Dt> {
   def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
@@ -1713,8 +1691,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
@@ -1722,8 +1699,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
 
@@ -1748,7 +1724,7 @@ class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
           itin, "vst2", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST2Instruction";
 }
 
 def  VST2d8   : VST2<0b1000, {0,0,?,?}, "8",  VecListDPair, IIC_VST2>;
@@ -1772,16 +1748,14 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST2Instruction";
   }
   def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
                         "vst2", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST2Instruction";
   }
 }
 multiclass VST2QWB<bits<4> op7_4, string Dt> {
@@ -1791,8 +1765,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
+    let DecoderMethod = "DecodeVLDST2Instruction";
   }
   def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
@@ -1800,8 +1773,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
                         "vst2", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
-    let DecoderMethod = "DecodeVSTInstruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
+    let DecoderMethod = "DecodeVLDST2Instruction";
   }
 }
 
@@ -1835,7 +1807,7 @@ class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
 }
 
 def  VST3d8   : VST3D<0b0100, {0,0,0,?}, "8">;
@@ -1854,7 +1826,7 @@ class VST3DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
           "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = Rn{4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST3Instruction";
 }
 
 def VST3d8_UPD  : VST3DWB<0b0100, {0,0,0,?}, "8">;
@@ -1894,7 +1866,7 @@ class VST4D<bits<4> op11_8, bits<4> op7_4, string Dt>
           "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
 }
 
 def  VST4d8   : VST4D<0b0000, {0,0,?,?}, "8">;
@@ -1913,7 +1885,7 @@ class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
            "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{5-4} = Rn{5-4};
-  let DecoderMethod = "DecodeVSTInstruction";
+  let DecoderMethod = "DecodeVLDST4Instruction";
 }
 
 def VST4d8_UPD  : VST4DWB<0b0000, {0,0,?,?}, "8">;
@@ -2379,6 +2351,40 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
         (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
         [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
 
+// Same as above, but not predicated.
+class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<0b10, op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+
+class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<0b10, op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Similar to NV2VQIntnp with some more encoding bits exposed (crypto).
+class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
+// Same as N2VQIntXnp but with Vd as a src register.
+class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
+              bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
 // Narrow 2-register operations.
 class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
            bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
@@ -2541,6 +2547,16 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
   let TwoOperandAliasConstraint = "$Vn = $Vd";
   let isCommutable = Commutable;
 }
+
+class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
 class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
   : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
@@ -2552,6 +2568,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                                            imm:$lane)))))]> {
   let isCommutable = 0;
 }
+
 class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                   string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
   : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
@@ -2584,6 +2601,29 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
   let TwoOperandAliasConstraint = "$Vn = $Vd";
   let isCommutable = Commutable;
 }
+
+class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
+// Same as N3VQIntnp but with Vd as a src register.
+class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr,
+          Dt, ResTy, OpTy, IntOp, Commutable,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
+                                       (OpTy QPR:$Vm))))]> {
+  let Constraints = "$src = $Vd";
+}
+
 class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
@@ -2834,6 +2874,7 @@ class N3VL<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
         [(set QPR:$Vd, (TyQ (OpNode (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
+
 class N3VLSL<bit op24, bits<2> op21_20, bits<4> op11_8,
              InstrItinClass itin, string OpcodeStr, string Dt,
              ValueType TyQ, ValueType TyD, SDNode OpNode>
@@ -2889,6 +2930,17 @@ class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
         [(set QPR:$Vd, (TyQ (IntOp (TyD DPR:$Vn), (TyD DPR:$Vm))))]> {
   let isCommutable = Commutable;
 }
+
+// Same as above, but not predicated.
+class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
 class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
@@ -3965,12 +4017,18 @@ defm VQADDu   : N3VInt_QHSD<1, 0, 0b0000, 1, N3RegFrm,
                             IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, IIC_VBINi4Q,
                             "vqadd", "u", int_arm_neon_vqaddu, 1>;
 //   VADDHN   : Vector Add and Narrow Returning High Half (D = Q + Q)
-defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i",
-                            int_arm_neon_vaddhn, 1>;
+defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i", null_frag, 1>;
 //   VRADDHN  : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
 defm VRADDHN  : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
                             int_arm_neon_vraddhn, 1>;
 
+def : Pat<(v8i8  (trunc (NEONvshru (add (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VADDHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (add (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VADDHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (add (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VADDHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
 // Vector Multiply Operations.
 
 //   VMUL     : Vector Multiply (integer, polynomial and floating-point)
@@ -4008,6 +4066,17 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
                                    (DSubReg_i32_reg imm:$lane))),
                            (SubReg_i32_lane imm:$lane)))>;
 
+
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfd DPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))),
+          (VMULslfq QPR:$Rn,
+            (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0),
+            (i32 0))>;
+
+
 //   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
 defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
                           IIC_VMULi16Q, IIC_VMULi32Q,
@@ -4053,12 +4122,18 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
                                   (SubReg_i32_lane imm:$lane)))>;
 
 //   VMULL    : Vector Multiply Long (integer and polynomial) (Q = D * D)
-defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
-                         "vmull", "s", NEONvmulls, 1>;
-defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
-                         "vmull", "u", NEONvmullu, 1>;
-def  VMULLp   : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
-                        v8i16, v8i8, int_arm_neon_vmullp, 1>;
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "NEONData" in {
+  defm VMULLs   : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "s", NEONvmulls, 1>;
+  defm VMULLu   : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
+                           "vmull", "u", NEONvmullu, 1>;
+  def  VMULLp8   :  N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+                            v8i16, v8i8, int_arm_neon_vmullp, 1>;
+  def  VMULLp64  : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
+                          "vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
+                    Requires<[HasV8, HasCrypto]>;
+}
 defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
 defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
 
@@ -4125,8 +4200,27 @@ defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
 
 //   VQDMLAL  : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
 defm VQDMLAL  : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
-                            "vqdmlal", "s", int_arm_neon_vqdmlal>;
-defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
+                            "vqdmlal", "s", null_frag>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLALv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLALv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqadds (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqadds (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLALslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
 
 //   VMLS     : Vector Multiply Subtract (integer and floating-point)
 defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -4182,25 +4276,44 @@ defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
 
 //   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
 defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
-                            "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
-defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
+                            "vqdmlsl", "s", null_frag>;
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", null_frag>;
+
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                                  (v4i16 DPR:$Vm))))),
+          (VQDMLSLv4i32 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                                  (v2i32 DPR:$Vm))))),
+          (VQDMLSLv2i64 QPR:$src1, DPR:$Vn, DPR:$Vm)>;
+def : Pat<(v4i32 (int_arm_neon_vqsubs (v4i32 QPR:$src1),
+                     (v4i32 (int_arm_neon_vqdmull (v4i16 DPR:$Vn),
+                                (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv4i16 QPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane)>;
+def : Pat<(v2i64 (int_arm_neon_vqsubs (v2i64 QPR:$src1),
+                     (v2i64 (int_arm_neon_vqdmull (v2i32 DPR:$Vn),
+                                (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm),
+                                                     imm:$lane)))))),
+          (VQDMLSLslv2i32 QPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, imm:$lane)>;
 
 // Fused Vector Multiply-Accumulate and Fused Multiply-Subtract Operations.
 def  VFMAfd   : N3VDMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACD, "vfma", "f32",
                           v2f32, fmul_su, fadd_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 
 def  VFMAfq   : N3VQMulOp<0, 0, 0b00, 0b1100, 1, IIC_VFMACQ, "vfma", "f32",
                           v4f32, fmul_su, fadd_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 
 //   Fused Vector Multiply Subtract (floating-point)
 def  VFMSfd   : N3VDMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACD, "vfms", "f32",
                           v2f32, fmul_su, fsub_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 def  VFMSfq   : N3VQMulOp<0, 0, 0b10, 0b1100, 1, IIC_VFMACQ, "vfms", "f32",
                           v4f32, fmul_su, fsub_mlx>,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasNEON,HasVFP4,UseFusedMAC]>;
 
 // Match @llvm.fma.* intrinsics
 def : Pat<(v2f32 (fma DPR:$Vn, DPR:$Vm, DPR:$src1)),
@@ -4248,12 +4361,18 @@ defm VQSUBu   : N3VInt_QHSD<1, 0, 0b0010, 1, N3RegFrm,
                             IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
                             "vqsub", "u", int_arm_neon_vqsubu, 0>;
 //   VSUBHN   : Vector Subtract and Narrow Returning High Half (D = Q - Q)
-defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i",
-                            int_arm_neon_vsubhn, 0>;
+defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i", null_frag, 0>;
 //   VRSUBHN  : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
 defm VRSUBHN  : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
                             int_arm_neon_vrsubhn, 0>;
 
+def : Pat<(v8i8  (trunc (NEONvshru (sub (v8i16 QPR:$Vn), QPR:$Vm), 8))),
+          (VSUBHNv8i8 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v4i16 (trunc (NEONvshru (sub (v4i32 QPR:$Vn), QPR:$Vm), 16))),
+          (VSUBHNv4i16 QPR:$Vn, QPR:$Vm)>;
+def : Pat<(v2i32 (trunc (NEONvshru (sub (v2i64 QPR:$Vn), QPR:$Vm), 32))),
+          (VSUBHNv2i32 QPR:$Vn, QPR:$Vm)>;
+
 // Vector Comparisons.
 
 //   VCEQ     : Vector Compare Equal
@@ -4659,6 +4778,18 @@ def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmax", "f32",
                         v4f32, v4f32, int_arm_neon_vmaxs, 1>;
 
+// VMAXNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMAXNMND  : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMAXNMNQ  : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
 //   VMIN     : Vector Minimum
 defm VMINs    : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
@@ -4673,6 +4804,18 @@ def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmin", "f32",
                         v4f32, v4f32, int_arm_neon_vmins, 1>;
 
+// VMINNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMINNMND  : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMINNMNQ  : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
 // Vector Pairwise Operations.
 
 //   VPADD    : Vector Pairwise Add
@@ -5015,10 +5158,10 @@ def  VSWPq    : N2VX<0b11, 0b11, 0b00, 0b10, 0b00000, 1, 0,
 // Vector Move Operations.
 
 //   VMOV     : Vector Move (Register)
-def : InstAlias<"vmov${p} $Vd, $Vm",
-                (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
-def : InstAlias<"vmov${p} $Vd, $Vm",
-                (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p} $Vd, $Vm",
+                    (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
 
 //   VMOV     : Vector Move (Immediate)
 
@@ -5386,6 +5529,26 @@ def  VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
 def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
                      v4f32, v4i32, uint_to_fp>;
 
+// VCVT{A, N, P, M}
+multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS,
+                    SDPatternOperator IntU> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>;
+    def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>;
+  }
+}
+
+defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>;
+defm VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>;
+defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>;
+defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>;
+
 //   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
 let DecoderMethod = "DecodeVCVTD" in {
 def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
@@ -5409,6 +5572,25 @@ def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
 }
 
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", 
+                    (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0", 
+                    (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0", 
+                    (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0", 
+                    (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
+
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0", 
+                    (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0", 
+                    (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", 
+                    (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", 
+                    (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
+
+
 //   VCVT     : Vector Convert Between Half-Precision and Single-Precision.
 def  VCVTf2h  : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0,
                         IIC_VUNAQ, "vcvt", "f16.f32",
@@ -5509,8 +5691,9 @@ class VEXTd<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
         IIC_VEXTD, OpcodeStr, Dt, "$Vd, $Vn, $Vm, $index", "",
         [(set DPR:$Vd, (Ty (NEONvext (Ty DPR:$Vn),
                                      (Ty DPR:$Vm), imm:$index)))]> {
-  bits<4> index;
-  let Inst{11-8} = index{3-0};
+  bits<3> index;
+  let Inst{11} = 0b0;
+  let Inst{10-8} = index{2-0};
 }
 
 class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
@@ -5525,14 +5708,14 @@ class VEXTq<string OpcodeStr, string Dt, ValueType Ty, Operand immTy>
 }
 
 def VEXTd8  : VEXTd<"vext", "8",  v8i8, imm0_7> {
-  let Inst{11-8} = index{3-0};
+  let Inst{10-8} = index{2-0};
 }
 def VEXTd16 : VEXTd<"vext", "16", v4i16, imm0_3> {
-  let Inst{11-9} = index{2-0};
+  let Inst{10-9} = index{1-0};
   let Inst{8}    = 0b0;
 }
 def VEXTd32 : VEXTd<"vext", "32", v2i32, imm0_1> {
-  let Inst{11-10} = index{1-0};
+  let Inst{10}     = index{0};
   let Inst{9-8}    = 0b00;
 }
 def : Pat<(v2f32 (NEONvext (v2f32 DPR:$Vn),
@@ -5657,6 +5840,77 @@ def  VTBX4Pseudo
                 IIC_VTBX4, "$orig = $dst", []>;
 } // DecoderMethod = "DecodeTBLInstruction"
 
+// VRINT      : Vector Rounding
+multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+    def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+  }
+
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+  def : NEONInstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+                  (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>;
+}
+
+defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>;
+defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>;
+defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>;
+defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
+defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
+defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
+
+// Cryptography instructions
+let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
+    DecoderNamespace = "v8Crypto" in {
+  class AES<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntXnp<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class AES2Op<string op, bit op7, bit op6, SDPatternOperator Int>
+    : N2VQIntX2np<0b00, 0b00, 0b011, op6, op7, NoItinerary,
+                 !strconcat("aes", op), "8", v16i8, v16i8, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntXnp<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N2SHA2Op<string op, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+              SDPatternOperator Int>
+    : N2VQIntX2np<0b10, op17_16, op10_8, op6, op7, NoItinerary,
+                 !strconcat("sha", op), "32", v4i32, v4i32, Int>,
+      Requires<[HasV8, HasCrypto]>;
+  class N3SHA3Op<string op, bits<5> op27_23, bits<2> op21_20, SDPatternOperator Int>
+    : N3VQInt3np<op27_23, op21_20, 0b1100, 1, 0, N3RegFrm, NoItinerary,
+                !strconcat("sha", op), "32", v4i32, v4i32, Int, 0>,
+      Requires<[HasV8, HasCrypto]>;
+}
+
+def AESD : AES2Op<"d", 0, 1, int_arm_neon_aesd>;
+def AESE : AES2Op<"e", 0, 0, int_arm_neon_aese>;
+def AESIMC : AES<"imc", 1, 1, int_arm_neon_aesimc>;
+def AESMC : AES<"mc", 1, 0, int_arm_neon_aesmc>;
+
+def SHA1H : N2SHA<"1h", 0b01, 0b010, 1, 1, int_arm_neon_sha1h>;
+def SHA1SU1 : N2SHA2Op<"1su1", 0b10, 0b011, 1, 0, int_arm_neon_sha1su1>;
+def SHA256SU0 : N2SHA2Op<"256su0", 0b10, 0b011, 1, 1, int_arm_neon_sha256su0>;
+def SHA1C : N3SHA3Op<"1c", 0b00100, 0b00, int_arm_neon_sha1c>;
+def SHA1M : N3SHA3Op<"1m", 0b00100, 0b10, int_arm_neon_sha1m>;
+def SHA1P : N3SHA3Op<"1p", 0b00100, 0b01, int_arm_neon_sha1p>;
+def SHA1SU0 : N3SHA3Op<"1su0", 0b00100, 0b11, int_arm_neon_sha1su0>;
+def SHA256H : N3SHA3Op<"256h", 0b00110, 0b00, int_arm_neon_sha256h>;
+def SHA256H2 : N3SHA3Op<"256h2", 0b00110, 0b01, int_arm_neon_sha256h2>;
+def SHA256SU1 : N3SHA3Op<"256su1", 0b00110, 0b10, int_arm_neon_sha256su1>;
+
 //===----------------------------------------------------------------------===//
 // NEON instructions for single-precision FP math
 //===----------------------------------------------------------------------===//
@@ -6697,12 +6951,17 @@ def VST4qWB_register_Asm_32 :
                   (ins VecListFourQ:$list, addrmode6:$addr,
                        rGPR:$Rm, pred:$p)>;
 
-// VMOV takes an optional datatype suffix
+// VMOV/VMVN takes an optional datatype suffix
 defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
                          (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
 defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
                          (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
 
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
 // VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
 // D-register versions.
 def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm",
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index ae7a5c0..af5ef53 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -69,11 +69,6 @@ def thumb_immshifted_shamt : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(V, MVT::i32);
 }]>;
 
-// ADR instruction labels.
-def t_adrlabel : Operand<i32> {
-  let EncoderMethod = "getThumbAdrLabelOpValue";
-}
-
 // Scaled 4 immediate.
 def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
 def t_imm0_1020s4 : Operand<i32> {
@@ -97,12 +92,34 @@ def t_imm0_508s4_neg : Operand<i32> {
 
 // Define Thumb specific addressing modes.
 
+// unsigned 8-bit, 2-scaled memory offset
+class OperandUnsignedOffset_b8s2 : AsmOperandClass {
+  let Name = "UnsignedOffset_b8s2";
+  let PredicateMethod = "isUnsignedOffset<8, 2>";
+}
+
+def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;
+
+// thumb style PC relative operand. signed, 8 bits magnitude,
+// two bits shift. can be represented as either [pc, #imm], #imm,
+// or relocatable expression...
+def ThumbMemPC : AsmOperandClass {
+  let Name = "ThumbMemPC";
+}
+
 let OperandType = "OPERAND_PCREL" in {
 def t_brtarget : Operand<OtherVT> {
   let EncoderMethod = "getThumbBRTargetOpValue";
   let DecoderMethod = "DecodeThumbBROperand";
 }
 
+// ADR instruction labels.
+def t_adrlabel : Operand<i32> {
+  let EncoderMethod = "getThumbAdrLabelOpValue";
+  let PrintMethod = "printAdrLabelOperand<2>";
+  let ParserMatchClass = UnsignedOffset_b8s2;
+}
+
 def t_bcctarget : Operand<i32> {
   let EncoderMethod = "getThumbBCCTargetOpValue";
   let DecoderMethod = "DecodeThumbBCCTargetOperand";
@@ -122,6 +139,15 @@ def t_blxtarget : Operand<i32> {
   let EncoderMethod = "getThumbBLXTargetOpValue";
   let DecoderMethod = "DecodeThumbBLXOffset";
 }
+
+// t_addrmode_pc := <label> => pc + imm8 * 4
+//
+def t_addrmode_pc : Operand<i32> {
+  let EncoderMethod = "getAddrModePCOpValue";
+  let DecoderMethod = "DecodeThumbAddrModePC";
+  let PrintMethod = "printThumbLdrLabelOperand";
+  let ParserMatchClass = ThumbMemPC;
+}
 }
 
 // t_addrmode_rr := reg + reg
@@ -218,14 +244,6 @@ def t_addrmode_sp : Operand<i32>,
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }
 
-// t_addrmode_pc := <label> => pc + imm8 * 4
-//
-def t_addrmode_pc : Operand<i32> {
-  let EncoderMethod = "getAddrModePCOpValue";
-  let DecoderMethod = "DecodeThumbAddrModePC";
-  let PrintMethod = "printThumbLdrLabelOperand";
-}
-
 //===----------------------------------------------------------------------===//
 //  Miscellaneous Instructions.
 //
@@ -251,25 +269,26 @@ class T1SystemEncoding<bits<8> opc>
   let Inst{7-0} = opc;
 }
 
-def tNOP : T1pI<(outs), (ins), NoItinerary, "nop", "", []>,
-           T1SystemEncoding<0x00>, // A8.6.110
-        Requires<[IsThumb2]>;
-
-def tYIELD : T1pI<(outs), (ins), NoItinerary, "yield", "", []>,
-           T1SystemEncoding<0x10>, // A8.6.410
-           Requires<[IsThumb2]>;
-
-def tWFE : T1pI<(outs), (ins), NoItinerary, "wfe", "", []>,
-           T1SystemEncoding<0x20>, // A8.6.408
-           Requires<[IsThumb2]>;
+def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm", []>,
+            T1SystemEncoding<0x00>,
+            Requires<[IsThumb, HasV6M]> {
+  bits<4> imm;
+  let Inst{7-4} = imm;
+}
 
-def tWFI : T1pI<(outs), (ins), NoItinerary, "wfi", "", []>,
-           T1SystemEncoding<0x30>, // A8.6.409
-           Requires<[IsThumb2]>;
+class tHintAlias<string Asm, dag Result> : tInstAlias<Asm, Result> {
+  let Predicates = [IsThumb, HasV6M];
+}
 
-def tSEV : T1pI<(outs), (ins), NoItinerary, "sev", "", []>,
-           T1SystemEncoding<0x40>, // A8.6.157
-           Requires<[IsThumb2]>;
+def : tHintAlias<"nop$p", (tHINT 0, pred:$p)>; // A8.6.110
+def : tHintAlias<"yield$p", (tHINT 1, pred:$p)>; // A8.6.410
+def : tHintAlias<"wfe$p", (tHINT 2, pred:$p)>; // A8.6.408
+def : tHintAlias<"wfi$p", (tHINT 3, pred:$p)>; // A8.6.409
+def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157
+def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> {
+  let Predicates = [IsThumb2, HasV8];
+}
+def : T2Pat<(int_arm_sevl), (tHINT 5)>;
 
 // The imm operand $val can be used by a debugger to store more information
 // about the breakpoint.
@@ -282,8 +301,15 @@ def tBKPT : T1I<(outs), (ins imm0_255:$val), NoItinerary, "bkpt\t$val",
   let Inst{7-0} = val;
 }
 
+def tHLT : T1I<(outs), (ins imm0_63:$val), NoItinerary, "hlt\t$val",
+                []>, T1Encoding<0b101110>, Requires<[IsThumb, HasV8]> {
+  let Inst{9-6} = 0b1010;
+  bits<6> val;
+  let Inst{5-0} = val;
+}
+
 def tSETEND : T1I<(outs), (ins setend_op:$end), NoItinerary, "setend\t$end",
-                  []>, T1Encoding<0b101101> {
+                  []>, T1Encoding<0b101101>, Deprecated<HasV8Ops> {
   bits<1> end;
   // A8.6.156
   let Inst{9-5} = 0b10010;
@@ -310,7 +336,7 @@ def tCPS : T1I<(outs), (ins imod_op:$imod, iflags_op:$iflags),
 let isNotDuplicable = 1, isCodeGenOnly = 1 in
 def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
                   [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
-              T1Special<{0,0,?,?}> {
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
   // A8.6.6
   bits<3> dst;
   let Inst{6-3} = 0b1111; // Rm = pc
@@ -323,7 +349,7 @@ def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr, "",
 // probably because the instruction can be moved around.
 def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
                     IIC_iALUi, "add", "\t$dst, $sp, $imm", []>,
-               T1Encoding<{1,0,1,0,1,?}> {
+               T1Encoding<{1,0,1,0,1,?}>, Sched<[WriteALU]> {
   // A6.2 & A8.6.8
   bits<3> dst;
   bits<8> imm;
@@ -335,7 +361,7 @@ def tADDrSPi : T1pI<(outs tGPR:$dst), (ins GPRsp:$sp, t_imm0_1020s4:$imm),
 // ADD sp, sp, #<imm7>
 def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
                      IIC_iALUi, "add", "\t$Rdn, $imm", []>,
-              T1Misc<{0,0,0,0,0,?,?}> {
+              T1Misc<{0,0,0,0,0,?,?}>, Sched<[WriteALU]> {
   // A6.2.5 & A8.6.8
   bits<7> imm;
   let Inst{6-0} = imm;
@@ -346,7 +372,7 @@ def tADDspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
 // FIXME: The encoding and the ASM string don't match up.
 def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
                     IIC_iALUi, "sub", "\t$Rdn, $imm", []>,
-              T1Misc<{0,0,0,0,1,?,?}> {
+              T1Misc<{0,0,0,0,1,?,?}>, Sched<[WriteALU]> {
   // A6.2.5 & A8.6.214
   bits<7> imm;
   let Inst{6-0} = imm;
@@ -367,7 +393,7 @@ def : tInstAlias<"sub${p} sp, sp, $imm",
 // ADD <Rm>, sp
 def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
                    "add", "\t$Rdn, $sp, $Rn", []>,
-              T1Special<{0,0,?,?}> {
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
   // A8.6.9 Encoding T1
   bits<4> Rdn;
   let Inst{7}   = Rdn{3};
@@ -379,7 +405,7 @@ def tADDrSP : T1pI<(outs GPR:$Rdn), (ins GPRsp:$sp, GPR:$Rn), IIC_iALUr,
 // ADD sp, <Rm>
 def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
                   "add", "\t$Rdn, $Rm", []>,
-              T1Special<{0,0,?,?}> {
+              T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
   // A8.6.9 Encoding T2
   bits<4> Rm;
   let Inst{7} = 1;
@@ -395,7 +421,7 @@ def tADDspr : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, GPR:$Rm), IIC_iALUr,
 // Indirect branches
 let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
   def tBX : TI<(outs), (ins GPR:$Rm, pred:$p), IIC_Br, "bx${p}\t$Rm", []>,
-            T1Special<{1,1,0,?}> {
+            T1Special<{1,1,0,?}>, Sched<[WriteBr]> {
     // A6.2.3 & A8.6.25
     bits<4> Rm;
     let Inst{6-3} = Rm;
@@ -406,12 +432,12 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
 
 let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
   def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
-                   [(ARMretflag)], (tBX LR, pred:$p)>;
+                   [(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
 
   // Alternative return instruction used by vararg functions.
   def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
                    2, IIC_Br, [],
-                   (tBX GPR:$Rm, pred:$p)>;
+                   (tBX GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
 }
 
 // All calls clobber the non-callee saved registers. SP is marked as a use to
@@ -424,7 +450,7 @@ let isCall = 1,
                   (outs), (ins pred:$p, t_bltarget:$func), IIC_Br,
                   "bl${p}\t$func",
                   [(ARMtcall tglobaladdr:$func)]>,
-             Requires<[IsThumb]> {
+             Requires<[IsThumb]>, Sched<[WriteBrL]> {
     bits<24> func;
     let Inst{26} = func{23};
     let Inst{25-16} = func{20-11};
@@ -438,7 +464,7 @@ let isCall = 1,
                  (outs), (ins pred:$p, t_blxtarget:$func), IIC_Br,
                    "blx${p}\t$func",
                    [(ARMcall tglobaladdr:$func)]>,
-              Requires<[IsThumb, HasV5T]> {
+              Requires<[IsThumb, HasV5T]>, Sched<[WriteBrL]> {
     bits<24> func;
     let Inst{26} = func{23};
     let Inst{25-16} = func{20-11};
@@ -453,7 +479,7 @@ let isCall = 1,
                   "blx${p}\t$func",
                   [(ARMtcall GPR:$func)]>,
               Requires<[IsThumb, HasV5T]>,
-              T1Special<{1,1,1,?}> { // A6.2.3 & A8.6.24;
+              T1Special<{1,1,1,?}>, Sched<[WriteBrL]> { // A6.2.3 & A8.6.24;
     bits<4> func;
     let Inst{6-3} = func;
     let Inst{2-0} = 0b000;
@@ -463,29 +489,32 @@ let isCall = 1,
   def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
                   4, IIC_Br,
                   [(ARMcall_nolink tGPR:$func)]>,
-            Requires<[IsThumb, IsThumb1Only]>;
+            Requires<[IsThumb, IsThumb1Only]>, Sched<[WriteBr]>;
 }
 
 let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
   let isPredicable = 1 in
   def tB   : T1pI<(outs), (ins t_brtarget:$target), IIC_Br,
                  "b", "\t$target", [(br bb:$target)]>,
-             T1Encoding<{1,1,1,0,0,?}> {
+             T1Encoding<{1,1,1,0,0,?}>, Sched<[WriteBr]> {
     bits<11> target;
     let Inst{10-0} = target;
-  }
+    let AsmMatchConverter = "cvtThumbBranches";
+ }
 
   // Far jump
   // Just a pseudo for a tBL instruction. Needed to let regalloc know about
   // the clobber of LR.
   let Defs = [LR] in
   def tBfar : tPseudoExpand<(outs), (ins t_bltarget:$target, pred:$p),
-                          4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>;
+                          4, IIC_Br, [], (tBL pred:$p, t_bltarget:$target)>,
+                          Sched<[WriteBrTbl]>;
 
   def tBR_JTr : tPseudoInst<(outs),
                       (ins tGPR:$target, i32imm:$jt, i32imm:$id),
                       0, IIC_Br,
-                      [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]> {
+                      [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>,
+                      Sched<[WriteBrTbl]> {
     list<Predicate> Predicates = [IsThumb, IsThumb1Only];
   }
 }
@@ -496,13 +525,15 @@ let isBranch = 1, isTerminator = 1 in
   def tBcc : T1I<(outs), (ins t_bcctarget:$target, pred:$p), IIC_Br,
                  "b${p}\t$target",
                  [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
-             T1BranchCond<{1,1,0,1}> {
+             T1BranchCond<{1,1,0,1}>, Sched<[WriteBr]> {
   bits<4> p;
   bits<8> target;
   let Inst{11-8} = p;
   let Inst{7-0} = target;
+  let AsmMatchConverter = "cvtThumbBranches";
 }
 
+
 // Tail calls
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // IOS versions.
@@ -510,7 +541,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
     def tTAILJMPr : tPseudoExpand<(outs), (ins tcGPR:$dst),
                      4, IIC_Br, [],
                      (tBX GPR:$dst, (ops 14, zero_reg))>,
-                     Requires<[IsThumb]>;
+                     Requires<[IsThumb]>, Sched<[WriteBr]>;
   }
   // tTAILJMPd: IOS version uses a Thumb2 branch (no Thumb1 tail calls
   // on IOS), so it's in ARMInstrThumb2.td.
@@ -520,7 +551,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
                    (ins t_brtarget:$dst, pred:$p),
                    4, IIC_Br, [],
                    (tB t_brtarget:$dst, pred:$p)>,
-                 Requires<[IsThumb, IsNotIOS]>;
+                 Requires<[IsThumb, IsNotIOS]>, Sched<[WriteBr]>;
   }
 }
 
@@ -530,7 +561,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
 // If Inst{11-8} == 0b1111 then SEE SVC
 let isCall = 1, Uses = [SP] in
 def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
-                "svc", "\t$imm", []>, Encoding16 {
+                "svc", "\t$imm", []>, Encoding16, Sched<[WriteBr]> {
   bits<8> imm;
   let Inst{15-12} = 0b1101;
   let Inst{11-8}  = 0b1111;
@@ -540,7 +571,7 @@ def tSVC : T1pI<(outs), (ins imm0_255:$imm), IIC_Br,
 // The assembler uses 0xDEFE for a trap instruction.
 let isBarrier = 1, isTerminator = 1 in
 def tTRAP : TI<(outs), (ins), IIC_Br,
-               "trap", [(trap)]>, Encoding16 {
+               "trap", [(trap)]>, Encoding16, Sched<[WriteBr]> {
   let Inst = 0xdefe;
 }
 
@@ -627,11 +658,9 @@ def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
   let Inst{7-0} = addr;
 }
 
-// Load tconstpool
-// FIXME: Use ldr.n to work around a darwin assembler bug.
-let canFoldAsLoad = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
 def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
-                  "ldr", ".n\t$Rt, $addr",
+                  "ldr", "\t$Rt, $addr",
                   [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
               T1Encoding<{0,1,0,0,1,?}> {
   // A6.2 & A8.6.59
@@ -641,18 +670,6 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
   let Inst{7-0}  = addr;
 }
 
-// FIXME: Remove this entry when the above ldr.n workaround is fixed.
-// For assembly/disassembly use only.
-def tLDRpciASM : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
-                       "ldr", "\t$Rt, $addr", []>,
-                 T1Encoding<{0,1,0,0,1,?}> {
-  // A6.2 & A8.6.59
-  bits<3> Rt;
-  bits<8> addr;
-  let Inst{10-8} = Rt;
-  let Inst{7-0}  = addr;
-}
-
 // A8.6.194 & A8.6.192
 defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
                                 t_addrmode_is4, AddrModeT1_4,
@@ -833,14 +850,15 @@ let isCommutable = 1, Uses = [CPSR] in
 def tADC :                      // A8.6.2
   T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
                 "adc", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 // Add immediate
 def tADDi3 :                    // A8.6.4 T1
   T1sIGenEncodeImm<0b01110, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
                    IIC_iALUi,
                    "add", "\t$Rd, $Rm, $imm3",
-                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]> {
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7:$imm3))]>,
+                   Sched<[WriteALU]> {
   bits<3> imm3;
   let Inst{8-6} = imm3;
 }
@@ -849,7 +867,8 @@ def tADDi8 :                    // A8.6.4 T2
   T1sItGenEncodeImm<{1,1,0,?,?}, (outs tGPR:$Rdn),
                     (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
                     "add", "\t$Rdn, $imm8",
-                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>;
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255:$imm8))]>,
+                    Sched<[WriteALU]>;
 
 // Add register
 let isCommutable = 1 in
@@ -857,12 +876,12 @@ def tADDrr :                    // A8.6.6 T1
   T1sIGenEncode<0b01100, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iALUr,
                 "add", "\t$Rd, $Rn, $Rm",
-                [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 let neverHasSideEffects = 1 in
 def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
                      "add", "\t$Rdn, $Rm", []>,
-               T1Special<{0,0,?,?}> {
+               T1Special<{0,0,?,?}>, Sched<[WriteALU]> {
   // A8.6.6 T2
   bits<4> Rdn;
   bits<4> Rm;
@@ -877,14 +896,15 @@ def tAND :                      // A8.6.12
   T1sItDPEncode<0b0000, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iBITr,
                 "and", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (and tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 // ASR immediate
 def tASRri :                    // A8.6.14
   T1sIGenEncodeImm<{0,1,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
                    IIC_iMOVsi,
                    "asr", "\t$Rd, $Rm, $imm5",
-                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
+                   [(set tGPR:$Rd, (sra tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+                   Sched<[WriteALU]> {
   bits<5> imm5;
   let Inst{10-6} = imm5;
 }
@@ -894,14 +914,15 @@ def tASRrr :                    // A8.6.15
   T1sItDPEncode<0b0100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iMOVsr,
                 "asr", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (sra tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 // BIC register
 def tBIC :                      // A8.6.20
   T1sItDPEncode<0b1110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iBITr,
                 "bic", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>;
+                [(set tGPR:$Rdn, (and tGPR:$Rn, (not tGPR:$Rm)))]>,
+                Sched<[WriteALU]>;
 
 // CMN register
 let isCompare = 1, Defs = [CPSR] in {
@@ -917,7 +938,7 @@ def tCMNz :                     // A8.6.33
   T1pIDPEncode<0b1011, (outs), (ins tGPR:$Rn, tGPR:$Rm),
                IIC_iCMPr,
                "cmn", "\t$Rn, $Rm",
-               [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>;
+               [(ARMcmpZ tGPR:$Rn, (ineg tGPR:$Rm))]>, Sched<[WriteCMP]>;
 
 } // isCompare = 1, Defs = [CPSR]
 
@@ -926,7 +947,7 @@ let isCompare = 1, Defs = [CPSR] in {
 def tCMPi8 : T1pI<(outs), (ins tGPR:$Rn, imm0_255:$imm8), IIC_iCMPi,
                   "cmp", "\t$Rn, $imm8",
                   [(ARMcmp tGPR:$Rn, imm0_255:$imm8)]>,
-             T1General<{1,0,1,?,?}> {
+             T1General<{1,0,1,?,?}>, Sched<[WriteCMP]> {
   // A8.6.35
   bits<3> Rn;
   bits<8> imm8;
@@ -939,11 +960,11 @@ def tCMPr :                     // A8.6.36 T1
   T1pIDPEncode<0b1010, (outs), (ins tGPR:$Rn, tGPR:$Rm),
                IIC_iCMPr,
                "cmp", "\t$Rn, $Rm",
-               [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>;
+               [(ARMcmp tGPR:$Rn, tGPR:$Rm)]>, Sched<[WriteCMP]>;
 
 def tCMPhir : T1pI<(outs), (ins GPR:$Rn, GPR:$Rm), IIC_iCMPr,
                    "cmp", "\t$Rn, $Rm", []>,
-              T1Special<{0,1,?,?}> {
+              T1Special<{0,1,?,?}>, Sched<[WriteCMP]> {
   // A8.6.36 T2
   bits<4> Rm;
   bits<4> Rn;
@@ -960,14 +981,15 @@ def tEOR :                      // A8.6.45
   T1sItDPEncode<0b0001, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iBITr,
                 "eor", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (xor tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 // LSL immediate
 def tLSLri :                    // A8.6.88
   T1sIGenEncodeImm<{0,0,0,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_31:$imm5),
                    IIC_iMOVsi,
                    "lsl", "\t$Rd, $Rm, $imm5",
-                   [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]> {
+                   [(set tGPR:$Rd, (shl tGPR:$Rm, (i32 imm:$imm5)))]>,
+                   Sched<[WriteALU]> {
   bits<5> imm5;
   let Inst{10-6} = imm5;
 }
@@ -977,14 +999,15 @@ def tLSLrr :                    // A8.6.89
   T1sItDPEncode<0b0010, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iMOVsr,
                 "lsl", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (shl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 // LSR immediate
 def tLSRri :                    // A8.6.90
   T1sIGenEncodeImm<{0,0,1,?,?}, (outs tGPR:$Rd), (ins tGPR:$Rm, imm_sr:$imm5),
                    IIC_iMOVsi,
                    "lsr", "\t$Rd, $Rm, $imm5",
-                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]> {
+                   [(set tGPR:$Rd, (srl tGPR:$Rm, (i32 imm_sr:$imm5)))]>,
+                   Sched<[WriteALU]> {
   bits<5> imm5;
   let Inst{10-6} = imm5;
 }
@@ -994,14 +1017,14 @@ def tLSRrr :                    // A8.6.91
   T1sItDPEncode<0b0011, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iMOVsr,
                 "lsr", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (srl tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 // Move register
 let isMoveImm = 1 in
 def tMOVi8 : T1sI<(outs tGPR:$Rd), (ins imm0_255:$imm8), IIC_iMOVi,
                   "mov", "\t$Rd, $imm8",
                   [(set tGPR:$Rd, imm0_255:$imm8)]>,
-             T1General<{1,0,0,?,?}> {
+             T1General<{1,0,0,?,?}>, Sched<[WriteALU]> {
   // A8.6.96
   bits<3> Rd;
   bits<8> imm8;
@@ -1019,7 +1042,7 @@ let neverHasSideEffects = 1 in {
 def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
                       2, IIC_iMOVr,
                       "mov", "\t$Rd, $Rm", "", []>,
-                  T1Special<{1,0,?,?}> {
+                  T1Special<{1,0,?,?}>, Sched<[WriteALU]> {
   // A8.6.97
   bits<4> Rd;
   bits<4> Rm;
@@ -1029,7 +1052,7 @@ def tMOVr : Thumb1pI<(outs GPR:$Rd), (ins GPR:$Rm), AddrModeNone,
 }
 let Defs = [CPSR] in
 def tMOVSr      : T1I<(outs tGPR:$Rd), (ins tGPR:$Rm), IIC_iMOVr,
-                      "movs\t$Rd, $Rm", []>, Encoding16 {
+                      "movs\t$Rd, $Rm", []>, Encoding16, Sched<[WriteALU]> {
   // A8.6.97
   bits<3> Rd;
   bits<3> Rm;
@@ -1060,7 +1083,7 @@ def :tInstAlias<"mul${s}${p} $Rdm, $Rn", (tMUL tGPR:$Rdm, s_cc_out:$s, tGPR:$Rn,
 def tMVN :                      // A8.6.107
   T1sIDPEncode<0b1111, (outs tGPR:$Rd), (ins tGPR:$Rn), IIC_iMVNr,
                "mvn", "\t$Rd, $Rn",
-               [(set tGPR:$Rd, (not tGPR:$Rn))]>;
+               [(set tGPR:$Rd, (not tGPR:$Rn))]>, Sched<[WriteALU]>;
 
 // Bitwise or register
 let isCommutable = 1 in
@@ -1068,7 +1091,7 @@ def tORR :                      // A8.6.114
   T1sItDPEncode<0b1100, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iBITr,
                 "orr", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (or tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
 // Swaps
 def tREV :                      // A8.6.134
@@ -1076,35 +1099,36 @@ def tREV :                      // A8.6.134
                  IIC_iUNAr,
                  "rev", "\t$Rd, $Rm",
                  [(set tGPR:$Rd, (bswap tGPR:$Rm))]>,
-                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
 
 def tREV16 :                    // A8.6.135
   T1pIMiscEncode<{1,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
                  IIC_iUNAr,
                  "rev16", "\t$Rd, $Rm",
              [(set tGPR:$Rd, (rotr (bswap tGPR:$Rm), (i32 16)))]>,
-                Requires<[IsThumb, IsThumb1Only, HasV6]>;
+                Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
 
 def tREVSH :                    // A8.6.136
   T1pIMiscEncode<{1,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
                  IIC_iUNAr,
                  "revsh", "\t$Rd, $Rm",
                  [(set tGPR:$Rd, (sra (bswap tGPR:$Rm), (i32 16)))]>,
-                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
 
 // Rotate right register
 def tROR :                      // A8.6.139
   T1sItDPEncode<0b0111, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iMOVsr,
                 "ror", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (rotr tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
 
 // Negate register
 def tRSB :                      // A8.6.141
   T1sIDPEncode<0b1001, (outs tGPR:$Rd), (ins tGPR:$Rn),
                IIC_iALUi,
                "rsb", "\t$Rd, $Rn, #0",
-               [(set tGPR:$Rd, (ineg tGPR:$Rn))]>;
+               [(set tGPR:$Rd, (ineg tGPR:$Rn))]>, Sched<[WriteALU]>;
 
 // Subtract with carry register
 let Uses = [CPSR] in
@@ -1112,14 +1136,16 @@ def tSBC :                      // A8.6.151
   T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iALUr,
                 "sbc", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
 
 // Subtract immediate
 def tSUBi3 :                    // A8.6.210 T1
   T1sIGenEncodeImm<0b01111, (outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
                    IIC_iALUi,
                    "sub", "\t$Rd, $Rm, $imm3",
-                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]> {
+                   [(set tGPR:$Rd, (add tGPR:$Rm, imm0_7_neg:$imm3))]>,
+                   Sched<[WriteALU]> {
   bits<3> imm3;
   let Inst{8-6} = imm3;
 }
@@ -1128,14 +1154,16 @@ def tSUBi8 :                    // A8.6.210 T2
   T1sItGenEncodeImm<{1,1,1,?,?}, (outs tGPR:$Rdn),
                     (ins tGPR:$Rn, imm0_255:$imm8), IIC_iALUi,
                     "sub", "\t$Rdn, $imm8",
-                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>;
+                    [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
+                    Sched<[WriteALU]>;
 
 // Subtract register
 def tSUBrr :                    // A8.6.212
   T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iALUr,
                 "sub", "\t$Rd, $Rn, $Rm",
-                [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>;
+                [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
+                Sched<[WriteALU]>;
 
 // Sign-extend byte
 def tSXTB :                     // A8.6.222
@@ -1143,7 +1171,8 @@ def tSXTB :                     // A8.6.222
                  IIC_iUNAr,
                  "sxtb", "\t$Rd, $Rm",
                  [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i8))]>,
-                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
 
 // Sign-extend short
 def tSXTH :                     // A8.6.224
@@ -1151,14 +1180,16 @@ def tSXTH :                     // A8.6.224
                  IIC_iUNAr,
                  "sxth", "\t$Rd, $Rm",
                  [(set tGPR:$Rd, (sext_inreg tGPR:$Rm, i16))]>,
-                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
 
 // Test
 let isCompare = 1, isCommutable = 1, Defs = [CPSR] in
 def tTST :                      // A8.6.230
   T1pIDPEncode<0b1000, (outs), (ins tGPR:$Rn, tGPR:$Rm), IIC_iTSTr,
                "tst", "\t$Rn, $Rm",
-               [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>;
+               [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
+               Sched<[WriteALU]>;
 
 // Zero-extend byte
 def tUXTB :                     // A8.6.262
@@ -1166,7 +1197,8 @@ def tUXTB :                     // A8.6.262
                  IIC_iUNAr,
                  "uxtb", "\t$Rd, $Rm",
                  [(set tGPR:$Rd, (and tGPR:$Rm, 0xFF))]>,
-                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>,
+                 Sched<[WriteALU]>;
 
 // Zero-extend short
 def tUXTH :                     // A8.6.264
@@ -1174,22 +1206,22 @@ def tUXTH :                     // A8.6.264
                  IIC_iUNAr,
                  "uxth", "\t$Rd, $Rm",
                  [(set tGPR:$Rd, (and tGPR:$Rm, 0xFFFF))]>,
-                 Requires<[IsThumb, IsThumb1Only, HasV6]>;
+                 Requires<[IsThumb, IsThumb1Only, HasV6]>, Sched<[WriteALU]>;
 
 // Conditional move tMOVCCr - Used to implement the Thumb SELECT_CC operation.
 // Expanded after instruction selection into a branch sequence.
 let usesCustomInserter = 1 in  // Expanded after instruction selection.
   def tMOVCCr_pseudo :
-  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
-              NoItinerary,
-             [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, cmovpred:$p),
+             NoItinerary,
+             [(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, cmovpred:$p))]>;
 
 // tLEApcrel - Load a pc-relative address into a register without offending the
 // assembler.
 
 def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
                IIC_iALUi, "adr{$p}\t$Rd, $addr", []>,
-               T1Encoding<{1,0,1,0,0,?}> {
+               T1Encoding<{1,0,1,0,0,?}>, Sched<[WriteALU]> {
   bits<3> Rd;
   bits<8> addr;
   let Inst{10-8} = Rd;
@@ -1199,12 +1231,12 @@ def tADR : T1I<(outs tGPR:$Rd), (ins t_adrlabel:$addr, pred:$p),
 
 let neverHasSideEffects = 1, isReMaterializable = 1 in
 def tLEApcrel   : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
-                              2, IIC_iALUi, []>;
+                              2, IIC_iALUi, []>, Sched<[WriteALU]>;
 
 let hasSideEffects = 1 in
 def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
                               (ins i32imm:$label, nohash_imm:$id, pred:$p),
-                              2, IIC_iALUi, []>;
+                              2, IIC_iALUi, []>, Sched<[WriteALU]>;
 
 //===----------------------------------------------------------------------===//
 // TLS Instructions
@@ -1215,7 +1247,8 @@ def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
 // complete with fixup for the aeabi_read_tp function.
 let isCall = 1, Defs = [R0, R12, LR, CPSR], Uses = [SP] in
 def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
-                          [(set R0, ARMthread_pointer)]>;
+                          [(set R0, ARMthread_pointer)]>,
+                          Sched<[WriteBr]>;
 
 //===----------------------------------------------------------------------===//
 // SJLJ Exception handling intrinsics
@@ -1381,13 +1414,13 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
     hasExtraDefRegAllocReq = 1 in
 def tPOP_RET : tPseudoExpand<(outs), (ins pred:$p, reglist:$regs, variable_ops),
                            2, IIC_iPop_Br, [],
-                           (tPOP pred:$p, reglist:$regs)>;
+                           (tPOP pred:$p, reglist:$regs)>, Sched<[WriteBrL]>;
 
 // Indirect branch using "mov pc, $Rm"
 let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
   def tBRIND : tPseudoExpand<(outs), (ins GPR:$Rm, pred:$p),
                   2, IIC_Br, [(brind GPR:$Rm)],
-                  (tMOVr PC, GPR:$Rm, pred:$p)>;
+                  (tMOVr PC, GPR:$Rm, pred:$p)>, Sched<[WriteBr]>;
 }
 
 
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 4dacb86..48acffd 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -173,14 +173,13 @@ def t2ldr_pcrel_imm12 : Operand<i32> {
 // ADR instruction labels.
 def t2adrlabel : Operand<i32> {
   let EncoderMethod = "getT2AdrLabelOpValue";
-  let PrintMethod = "printAdrLabelOperand";
+  let PrintMethod = "printAdrLabelOperand<0>";
 }
 
-
 // t2addrmode_posimm8  := reg + imm8
 def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
 def t2addrmode_posimm8 : Operand<i32> {
-  let PrintMethod = "printT2AddrModeImm8Operand";
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
   let EncoderMethod = "getT2AddrModeImm8OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8";
   let ParserMatchClass = MemPosImm8OffsetAsmOperand;
@@ -191,7 +190,7 @@ def t2addrmode_posimm8 : Operand<i32> {
 def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
 def t2addrmode_negimm8 : Operand<i32>,
                       ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
-  let PrintMethod = "printT2AddrModeImm8Operand";
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
   let EncoderMethod = "getT2AddrModeImm8OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8";
   let ParserMatchClass = MemNegImm8OffsetAsmOperand;
@@ -200,15 +199,22 @@ def t2addrmode_negimm8 : Operand<i32>,
 
 // t2addrmode_imm8  := reg +/- imm8
 def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
-def t2addrmode_imm8 : Operand<i32>,
-                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
-  let PrintMethod = "printT2AddrModeImm8Operand";
+class T2AddrMode_Imm8 : Operand<i32>,
+                        ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
   let EncoderMethod = "getT2AddrModeImm8OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8";
   let ParserMatchClass = MemImm8OffsetAsmOperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }
 
+def t2addrmode_imm8 : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+}
+
+def t2addrmode_imm8_pre : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<true>";
+}
+
 def t2am_imm8_offset : Operand<i32>,
                        ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset",
                                       [], [SDNPWantRoot]> {
@@ -219,14 +225,21 @@ def t2am_imm8_offset : Operand<i32>,
 
 // t2addrmode_imm8s4  := reg +/- (imm8 << 2)
 def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
-def t2addrmode_imm8s4 : Operand<i32> {
-  let PrintMethod = "printT2AddrModeImm8s4Operand";
+class T2AddrMode_Imm8s4 : Operand<i32> {
   let EncoderMethod = "getT2AddrModeImm8s4OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8s4";
   let ParserMatchClass = MemImm8s4OffsetAsmOperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }
 
+def t2addrmode_imm8s4 : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<false>";
+}
+
+def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<true>";
+}
+
 def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
 def t2am_imm8s4_offset : Operand<i32> {
   let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
@@ -238,7 +251,8 @@ def t2am_imm8s4_offset : Operand<i32> {
 def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
   let Name = "MemImm0_1020s4Offset";
 }
-def t2addrmode_imm0_1020s4 : Operand<i32> {
+def t2addrmode_imm0_1020s4 : Operand<i32>,
+                         ComplexPattern<i32, 2, "SelectT2AddrModeExclusive"> {
   let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
   let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
@@ -451,6 +465,18 @@ class T2ThreeReg<dag oops, dag iops, InstrItinClass itin,
   let Inst{3-0}   = Rm;
 }
 
+class T2ThreeRegNoP<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : T2XI<oops, iops, itin, asm, pattern> {
+  bits<4> Rd;
+  bits<4> Rn;
+  bits<4> Rm;
+
+  let Inst{11-8}  = Rd;
+  let Inst{19-16} = Rn;
+  let Inst{3-0}   = Rm;
+}
+
 class T2sThreeReg<dag oops, dag iops, InstrItinClass itin,
            string opc, string asm, list<dag> pattern>
   : T2sI<oops, iops, itin, opc, asm, pattern> {
@@ -554,7 +580,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
    def ri : T2sTwoRegImm<
                 (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), iii,
                  opc, "\t$Rd, $Rn, $imm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]> {
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_imm:$imm))]>,
+                 Sched<[WriteALU, ReadALU]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -563,7 +590,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
    // register
    def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), iir,
                  opc, !strconcat(wide, "\t$Rd, $Rn, $Rm"),
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
@@ -576,7 +604,8 @@ multiclass T2I_bin_irs<bits<4> opcod, string opc,
    def rs : T2sTwoRegShiftedReg<
                  (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm), iis,
                  opc, !strconcat(wide, "\t$Rd, $Rn, $ShiftedRm"),
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]> {
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm))]>,
+                 Sched<[WriteALUsi, ReadALU]>  {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -635,7 +664,8 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
    def ri : T2sTwoRegImm<
                  (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm), IIC_iALUi,
                  opc, ".w\t$Rd, $Rn, $imm",
-                 [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]> {
+                 [(set rGPR:$Rd, (opnode t2_so_imm:$imm, rGPR:$Rn))]>,
+                 Sched<[WriteALU, ReadALU]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -645,7 +675,8 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
    def rr : T2sThreeReg<
                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
                  opc, "\t$Rd, $Rn, $Rm",
-                 [/* For disassembly only; pattern left blank */]> {
+                 [/* For disassembly only; pattern left blank */]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -657,7 +688,8 @@ multiclass T2I_rbin_irs<bits<4> opcod, string opc, PatFrag opnode> {
    def rs : T2sTwoRegShiftedReg<
                  (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
                  IIC_iALUsir, opc, "\t$Rd, $Rn, $ShiftedRm",
-                 [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]> {
+                 [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm, rGPR:$Rn))]>,
+                 Sched<[WriteALUsi, ReadALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -678,12 +710,14 @@ multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
                          (ins GPRnopc:$Rn, t2_so_imm:$imm, pred:$p),
                          4, iii,
                          [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
-                                                t2_so_imm:$imm))]>;
+                                                t2_so_imm:$imm))]>,
+            Sched<[WriteALU, ReadALU]>;
    // register
    def rr : t2PseudoInst<(outs rGPR:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm, pred:$p),
                          4, iir,
                          [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
-                                                rGPR:$Rm))]> {
+                                                rGPR:$Rm))]>,
+            Sched<[WriteALU, ReadALU, ReadALU]> {
      let isCommutable = Commutable;
    }
    // shifted register
@@ -691,7 +725,8 @@ multiclass T2I_bin_s_irs<InstrItinClass iii, InstrItinClass iir,
                          (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
                          4, iis,
                          [(set rGPR:$Rd, CPSR, (opnode GPRnopc:$Rn,
-                                                t2_so_reg:$ShiftedRm))]>;
+                                                t2_so_reg:$ShiftedRm))]>,
+            Sched<[WriteALUsi, ReadALUsr]>;
 }
 }
 
@@ -704,13 +739,15 @@ multiclass T2I_rbin_s_is<PatFrag opnode> {
                          (ins rGPR:$Rn, t2_so_imm:$imm, pred:$p),
                          4, IIC_iALUi,
                          [(set rGPR:$Rd, CPSR, (opnode t2_so_imm:$imm,
-                                                rGPR:$Rn))]>;
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALU, ReadALU]>;
    // shifted register
    def rs : t2PseudoInst<(outs rGPR:$Rd),
                          (ins rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p),
                          4, IIC_iALUsi,
                          [(set rGPR:$Rd, CPSR, (opnode t2_so_reg:$ShiftedRm,
-                                                rGPR:$Rn))]>;
+                                                rGPR:$Rn))]>,
+            Sched<[WriteALUsi, ReadALU]>;
 }
 }
 
@@ -725,7 +762,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    def ri : T2sTwoRegImm<
                (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iALUi,
                opc, ".w\t$Rd, $Rn, $imm",
-               [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]> {
+               [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_imm:$imm))]>,
+               Sched<[WriteALU, ReadALU]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24} = 1;
@@ -737,7 +775,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    def ri12 : T2I<
                   (outs GPRnopc:$Rd), (ins GPR:$Rn, imm0_4095:$imm), IIC_iALUi,
                   !strconcat(opc, "w"), "\t$Rd, $Rn, $imm",
-                  [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]> {
+                  [(set GPRnopc:$Rd, (opnode GPR:$Rn, imm0_4095:$imm))]>,
+                  Sched<[WriteALU, ReadALU]> {
      bits<4> Rd;
      bits<4> Rn;
      bits<12> imm;
@@ -755,7 +794,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    // register
    def rr : T2sThreeReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, rGPR:$Rm),
                  IIC_iALUr, opc, ".w\t$Rd, $Rn, $Rm",
-                 [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]> {
+                 [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU, ReadALU, ReadALU]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
@@ -769,7 +809,8 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
    def rs : T2sTwoRegShiftedReg<
                  (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm),
                  IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
-              [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]> {
+              [(set GPRnopc:$Rd, (opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm))]>,
+              Sched<[WriteALUsi, ReadALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24} = 1;
@@ -787,7 +828,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
    def ri : T2sTwoRegImm<(outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_imm:$imm),
                  IIC_iALUi, opc, "\t$Rd, $Rn, $imm",
                [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_imm:$imm, CPSR))]>,
-                 Requires<[IsThumb2]> {
+                 Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -797,7 +838,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
    def rr : T2sThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUr,
                  opc, ".w\t$Rd, $Rn, $Rm",
                  [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, rGPR:$Rm, CPSR))]>,
-                 Requires<[IsThumb2]> {
+                 Requires<[IsThumb2]>, Sched<[WriteALU, ReadALU, ReadALU]> {
      let isCommutable = Commutable;
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
@@ -811,7 +852,7 @@ multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
                  (outs rGPR:$Rd), (ins rGPR:$Rn, t2_so_reg:$ShiftedRm),
                  IIC_iALUsi, opc, ".w\t$Rd, $Rn, $ShiftedRm",
          [(set rGPR:$Rd, CPSR, (opnode rGPR:$Rn, t2_so_reg:$ShiftedRm, CPSR))]>,
-                 Requires<[IsThumb2]> {
+                 Requires<[IsThumb2]>, Sched<[WriteALUsi, ReadALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -826,7 +867,8 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
    def ri : T2sTwoRegShiftImm<
                  (outs rGPR:$Rd), (ins rGPR:$Rm, ty:$imm), IIC_iMOVsi,
                  opc, ".w\t$Rd, $Rm, $imm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]> {
+                 [(set rGPR:$Rd, (opnode rGPR:$Rm, (i32 ty:$imm)))]>,
+                 Sched<[WriteALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-21} = 0b010010;
      let Inst{19-16} = 0b1111; // Rn
@@ -836,7 +878,8 @@ multiclass T2I_sh_ir<bits<2> opcod, string opc, Operand ty, PatFrag opnode> {
    def rr : T2sThreeReg<
                  (outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMOVsr,
                  opc, ".w\t$Rd, $Rn, $Rm",
-                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]> {
+                 [(set rGPR:$Rd, (opnode rGPR:$Rn, rGPR:$Rm))]>,
+                 Sched<[WriteALU]> {
      let Inst{31-27} = 0b11111;
      let Inst{26-23} = 0b0100;
      let Inst{22-21} = opcod;
@@ -880,7 +923,7 @@ let isCompare = 1, Defs = [CPSR] in {
    def ri : T2OneRegCmpImm<
                 (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), iii,
                 opc, ".w\t$Rn, $imm",
-                [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]> {
+                [(opnode GPRnopc:$Rn, t2_so_imm:$imm)]>, Sched<[WriteCMP]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = opcod;
@@ -892,7 +935,7 @@ let isCompare = 1, Defs = [CPSR] in {
    def rr : T2TwoRegCmp<
                 (outs), (ins GPRnopc:$Rn, rGPR:$Rm), iir,
                 opc, ".w\t$Rn, $Rm",
-                [(opnode GPRnopc:$Rn, rGPR:$Rm)]> {
+                [(opnode GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -906,7 +949,8 @@ let isCompare = 1, Defs = [CPSR] in {
    def rs : T2OneRegCmpShiftedReg<
                 (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), iis,
                 opc, ".w\t$Rn, $ShiftedRm",
-                [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
+                [(opnode GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+                Sched<[WriteCMPsi]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -941,6 +985,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{19-16} = addr{16-13}; // Rn
     let Inst{15-12} = Rt;
     let Inst{11-0}  = addr{11-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm12";
   }
   def i8  : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
                    opc, "\t$Rt, $addr",
@@ -961,6 +1007,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{9}     = addr{8};    // U
     let Inst{8} = 0; // The W bit.
     let Inst{7-0}   = addr{7-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm8";
   }
   def s   : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
                    opc, ".w\t$Rt, $addr",
@@ -993,14 +1041,18 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
-    let Inst{23} = ?; // add = (U == '1')
     let Inst{22-21} = opcod;
     let Inst{20} = 1; // load
     let Inst{19-16} = 0b1111; // Rn
+
     bits<4> Rt;
-    bits<12> addr;
     let Inst{15-12} = Rt{3-0};
+
+    bits<13> addr;
+    let Inst{23} = addr{12}; // add = (U == '1')
     let Inst{11-0}  = addr{11-0};
+
+    let DecoderMethod = "DecodeT2LoadLabel";
   }
 }
 
@@ -1167,7 +1219,8 @@ class T2PCOneRegImm<dag oops, dag iops, InstrItinClass itin,
 // assembler.
 def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
               (ins t2adrlabel:$addr, pred:$p),
-              IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []> {
+              IIC_iALUi, "adr{$p}.w\t$Rd, $addr", []>,
+              Sched<[WriteALU, ReadALU]> {
   let Inst{31-27} = 0b11110;
   let Inst{25-24} = 0b10;
   // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
@@ -1190,12 +1243,12 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
 
 let neverHasSideEffects = 1, isReMaterializable = 1 in
 def t2LEApcrel   : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
-                                4, IIC_iALUi, []>;
+                                4, IIC_iALUi, []>, Sched<[WriteALU, ReadALU]>;
 let hasSideEffects = 1 in
 def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
                                 (ins i32imm:$label, nohash_imm:$id, pred:$p),
                                 4, IIC_iALUi,
-                                []>;
+                                []>, Sched<[WriteALU, ReadALU]>;
 
 
 //===----------------------------------------------------------------------===//
@@ -1209,15 +1262,15 @@ defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
 
 // Loads with zero extension
 defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(zextloadi16 node:$Src)>>;
+                      GPR, UnOpFrag<(zextloadi16 node:$Src)>>;
 defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(zextloadi8  node:$Src)>>;
+                      GPR, UnOpFrag<(zextloadi8  node:$Src)>>;
 
 // Loads with sign extension
 defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(sextloadi16 node:$Src)>>;
+                      GPR, UnOpFrag<(sextloadi16 node:$Src)>>;
 defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(sextloadi8  node:$Src)>>;
+                      GPR, UnOpFrag<(sextloadi8  node:$Src)>>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
@@ -1275,12 +1328,9 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
 
 let mayLoad = 1, neverHasSideEffects = 1 in {
 def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
-                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
 
 def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
@@ -1288,48 +1338,42 @@ def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
 def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
                           "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
 def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
                           "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
                             "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            []>;
+
 def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
                           "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
                             "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            []>;
+
 def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
@@ -1354,6 +1398,8 @@ class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
   let Inst{11} = 1;
   let Inst{10-8} = 0b110; // PUW.
   let Inst{7-0} = addr{7-0};
+
+  let DecoderMethod = "DecodeT2LoadT";
 }
 
 def t2LDRT   : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
@@ -1362,6 +1408,32 @@ def t2LDRHT  : T2IldT<0, 0b01, "ldrht", IIC_iLoad_bh_i>;
 def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
 def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
 
+class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
+               string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary,
+            opc, asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-24} = 0b000;
+  let Inst{23-20} = bits23_20;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
+                     (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
+                      (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+
 // Store
 defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR,
                    BinOpFrag<(store node:$LHS, node:$RHS)>>;
@@ -1380,27 +1452,22 @@ def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
 
 let mayStore = 1, neverHasSideEffects = 1 in {
 def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
-                            (ins GPRnopc:$Rt, t2addrmode_imm8:$addr),
+                            (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
                             "str", "\t$Rt, $addr!",
-                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
-  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
-}
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
 def t2STRH_PRE  : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
-                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
                         "strh", "\t$Rt, $addr!",
-                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
-  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
-}
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
 
 def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
-                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
                         "strb", "\t$Rt, $addr!",
-                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
-  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
-}
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
 } // mayStore = 1, neverHasSideEffects = 1
 
 def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
@@ -1487,9 +1554,8 @@ def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
 // For disassembly only.
 
 def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
-                 (ins t2addrmode_imm8s4:$addr), IIC_iLoad_d_ru,
+                 (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
                  "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
-  let AsmMatchConverter = "cvtT2LdrdPre";
   let DecoderMethod = "DecodeT2LDRDPreInstruction";
 }
 
@@ -1499,10 +1565,9 @@ def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
                  "$addr.base = $wb", []>;
 
 def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
-                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
                  "$addr.base = $wb", []> {
-  let AsmMatchConverter = "cvtT2StrdPre";
   let DecoderMethod = "DecodeT2STRDPreInstruction";
 }
 
@@ -1512,6 +1577,31 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
                  "$addr.base = $wb", []>;
 
+class T2Istrrel<bits<2> bit54, dag oops, dag iops,
+                string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
+            asm, "", pattern>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-6} = 0b111110;
+  let Inst{5-4} = bit54;
+  let Inst{3-0} = 0b1111;
+
+  // Encode instruction operands
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+}
+
+def t2STL  : T2Istrrel<0b10, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stl", "\t$Rt, $addr", []>;
+def t2STLB : T2Istrrel<0b00, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlb", "\t$Rt, $addr", []>;
+def t2STLH : T2Istrrel<0b01, (outs), (ins rGPR:$Rt, addr_offset_none:$addr),
+                       "stlh", "\t$Rt, $addr", []>;
+
 // T2Ipl (Preload Data/Instruction) signals the memory system of possible future
 // data/instruction access.
 // instr_write is inverted for Thumb mode: (prefetch 3) -> (preload 0),
@@ -1520,24 +1610,27 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
 
   def i12 : T2Ii12<(outs), (ins t2addrmode_imm12:$addr), IIC_Preload, opc,
                 "\t$addr",
-              [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]> {
+              [(ARMPreload t2addrmode_imm12:$addr, (i32 write), (i32 instr))]>,
+              Sched<[WritePreLd]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
+    let Inst{23} = 1;
     let Inst{22} = 0;
     let Inst{21} = write;
     let Inst{20} = 1;
     let Inst{15-12} = 0b1111;
 
     bits<17> addr;
-    let addr{12}    = 1;           // add = TRUE
     let Inst{19-16} = addr{16-13}; // Rn
-    let Inst{23}    = addr{12};    // U
     let Inst{11-0}  = addr{11-0};  // imm12
+
+    let DecoderMethod = "DecodeT2LoadImm12";
   }
 
   def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
                 "\t$addr",
-            [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]> {
+            [(ARMPreload t2addrmode_negimm8:$addr, (i32 write), (i32 instr))]>,
+            Sched<[WritePreLd]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
     let Inst{23} = 0; // U = 0
@@ -1550,11 +1643,14 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
     bits<13> addr;
     let Inst{19-16} = addr{12-9}; // Rn
     let Inst{7-0}   = addr{7-0};  // imm8
+
+    let DecoderMethod = "DecodeT2LoadImm8";
   }
 
   def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
                "\t$addr",
-             [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]> {
+             [(ARMPreload t2addrmode_so_reg:$addr, (i32 write), (i32 instr))]>,
+             Sched<[WritePreLd]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
     let Inst{23} = 0; // add = TRUE for T1
@@ -1562,7 +1658,7 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
     let Inst{21} = write;
     let Inst{20} = 1;
     let Inst{15-12} = 0b1111;
-    let Inst{11-6} = 0000000;
+    let Inst{11-6} = 0b000000;
 
     bits<10> addr;
     let Inst{19-16} = addr{9-6}; // Rn
@@ -1571,15 +1667,33 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
 
     let DecoderMethod = "DecodeT2LoadShift";
   }
-  // FIXME: We should have a separate 'pci' variant here. As-is we represent
-  // it via the i12 variant, which it's related to, but that means we can
-  // represent negative immediates, which aren't legal for anything except
-  // the 'pci' case (Rn == 15).
 }
 
-defm t2PLD  : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
-defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
-defm t2PLI  : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
+defm t2PLD    : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
+defm t2PLDW   : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
+defm t2PLI    : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
+
+// pci variant is very similar to i12, but supports negative offsets
+// from the PC. Only PLD and PLI have pci variants (not PLDW)
+class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
+               IIC_Preload, opc, "\t$addr", 
+               [(ARMPreload (ARMWrapper tconstpool:$addr),
+                (i32 0), (i32 inst))]>, Sched<[WritePreLd]> {
+  let Inst{31-25} = 0b1111100;
+  let Inst{24} = inst;
+  let Inst{22-20} = 0b001;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1111;
+
+  bits<13> addr;
+  let Inst{23}   = addr{12};   // add = (U == '1')
+  let Inst{11-0} = addr{11-0}; // imm12
+
+  let DecoderMethod = "DecodeT2LoadLabel";
+}
+
+def t2PLDpci : T2Iplpci<0, "pld">,  Requires<[IsThumb2]>;
+def t2PLIpci : T2Iplpci<1, "pli">,  Requires<[IsThumb2,HasV7]>;
 
 //===----------------------------------------------------------------------===//
 //  Load / store multiple Instructions.
@@ -1743,7 +1857,7 @@ defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
 
 let neverHasSideEffects = 1 in
 def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
-                   "mov", ".w\t$Rd, $Rm", []> {
+                   "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -1763,7 +1877,7 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
     AddedComplexity = 1 in
 def t2MOVi : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), IIC_iMOVi,
                    "mov", ".w\t$Rd, $imm",
-                   [(set rGPR:$Rd, t2_so_imm:$imm)]> {
+                   [(set rGPR:$Rd, t2_so_imm:$imm)]>, Sched<[WriteALU]> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 0;
   let Inst{24-21} = 0b0010;
@@ -1786,7 +1900,7 @@ def : t2InstAlias<"mov${p} $Rd, $imm", (t2MOVi rGPR:$Rd, t2_so_imm:$imm,
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in
 def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
                    "movw", "\t$Rd, $imm",
-                   [(set rGPR:$Rd, imm0_65535:$imm)]> {
+                   [(set rGPR:$Rd, imm0_65535:$imm)]>, Sched<[WriteALU]> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
   let Inst{24-21} = 0b0010;
@@ -1804,6 +1918,9 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
   let DecoderMethod = "DecodeT2MOVTWInstruction";
 }
 
+def : t2InstAlias<"mov${p} $Rd, $imm", 
+                  (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>;
+
 def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
                                 (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
 
@@ -1812,7 +1929,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
                     (ins rGPR:$src, imm0_65535_expr:$imm), IIC_iMOVi,
                     "movt", "\t$Rd, $imm",
                     [(set rGPR:$Rd,
-                          (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]> {
+                          (or (and rGPR:$src, 0xffff), lo16AllZero:$imm))]>,
+                          Sched<[WriteALU]> {
   let Inst{31-27} = 0b11110;
   let Inst{25} = 1;
   let Inst{24-21} = 0b0110;
@@ -1831,7 +1949,8 @@ def t2MOVTi16 : T2I<(outs rGPR:$Rd),
 }
 
 def t2MOVTi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
-                     (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+                     (ins rGPR:$src, i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                     Sched<[WriteALU]>;
 } // Constraints
 
 def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
@@ -2171,7 +2290,7 @@ def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
 let Uses = [CPSR] in {
 def t2RRX : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
                    "rrx", "\t$Rd, $Rm",
-                   [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]> {
+                   [(set rGPR:$Rd, (ARMrrx rGPR:$Rm))]>, Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -2185,7 +2304,8 @@ let isCodeGenOnly = 1, Defs = [CPSR] in {
 def t2MOVsrl_flag : T2TwoRegShiftImm<
                         (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
                         "lsrs", ".w\t$Rd, $Rm, #1",
-                        [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]> {
+                        [(set rGPR:$Rd, (ARMsrl_flag rGPR:$Rm))]>,
+                        Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -2199,7 +2319,8 @@ def t2MOVsrl_flag : T2TwoRegShiftImm<
 def t2MOVsra_flag : T2TwoRegShiftImm<
                         (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iMOVsi,
                         "asrs", ".w\t$Rd, $Rm, #1",
-                        [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]> {
+                        [(set rGPR:$Rd, (ARMsra_flag rGPR:$Rm))]>,
+                        Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-21} = 0b0010;
@@ -2320,7 +2441,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
    // shifted imm
    def i : T2sOneRegImm<(outs rGPR:$Rd), (ins t2_so_imm:$imm), iii,
                 opc, "\t$Rd, $imm",
-                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]> {
+                [(set rGPR:$Rd, (opnode t2_so_imm:$imm))]>, Sched<[WriteALU]> {
      let isAsCheapAsAMove = Cheap;
      let isReMaterializable = ReMat;
      let isMoveImm = MoveImm;
@@ -2333,7 +2454,7 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
    // register
    def r : T2sTwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm), iir,
                 opc, ".w\t$Rd, $Rm",
-                [(set rGPR:$Rd, (opnode rGPR:$Rm))]> {
+                [(set rGPR:$Rd, (opnode rGPR:$Rm))]>, Sched<[WriteALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -2345,7 +2466,8 @@ multiclass T2I_un_irs<bits<4> opcod, string opc,
    // shifted register
    def s : T2sOneRegShiftedReg<(outs rGPR:$Rd), (ins t2_so_reg:$ShiftedRm), iis,
                 opc, ".w\t$Rd, $ShiftedRm",
-                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]> {
+                [(set rGPR:$Rd, (opnode t2_so_reg:$ShiftedRm))]>,
+                Sched<[WriteALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = opcod;
@@ -2804,22 +2926,27 @@ class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops,
 }
 
 def t2CLZ : T2I_misc<0b11, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
-                    "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>;
+                    "clz", "\t$Rd, $Rm", [(set rGPR:$Rd, (ctlz rGPR:$Rm))]>,
+                    Sched<[WriteALU]>;
 
 def t2RBIT : T2I_misc<0b01, 0b10, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
                       "rbit", "\t$Rd, $Rm",
-                      [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>;
+                      [(set rGPR:$Rd, (ARMrbit rGPR:$Rm))]>,
+                      Sched<[WriteALU]>;
 
 def t2REV : T2I_misc<0b01, 0b00, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
-                 "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>;
+                 "rev", ".w\t$Rd, $Rm", [(set rGPR:$Rd, (bswap rGPR:$Rm))]>,
+                 Sched<[WriteALU]>;
 
 def t2REV16 : T2I_misc<0b01, 0b01, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
                        "rev16", ".w\t$Rd, $Rm",
-                [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>;
+                [(set rGPR:$Rd, (rotr (bswap rGPR:$Rm), (i32 16)))]>,
+                Sched<[WriteALU]>;
 
 def t2REVSH : T2I_misc<0b01, 0b11, (outs rGPR:$Rd), (ins rGPR:$Rm), IIC_iUNAr,
                        "revsh", ".w\t$Rd, $Rm",
-                 [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>;
+                 [(set rGPR:$Rd, (sra (bswap rGPR:$Rm), (i32 16)))]>,
+                 Sched<[WriteALU]>;
 
 def : T2Pat<(or (sra (shl rGPR:$Rm, (i32 24)), (i32 16)),
                 (and (srl rGPR:$Rm, (i32 8)), 0xFF)),
@@ -2831,7 +2958,8 @@ def t2PKHBT : T2ThreeReg<
                   [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
                                       (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
                                            0xFFFF0000)))]>,
-                  Requires<[HasT2ExtractPack, IsThumb2]> {
+                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Sched<[WriteALUsi, ReadALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-20} = 0b01100;
@@ -2859,7 +2987,8 @@ def t2PKHTB : T2ThreeReg<
                   [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
                                        (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
                                             0xFFFF)))]>,
-                  Requires<[HasT2ExtractPack, IsThumb2]> {
+                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Sched<[WriteALUsi, ReadALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
   let Inst{24-20} = 0b01100;
@@ -2873,7 +3002,12 @@ def t2PKHTB : T2ThreeReg<
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
-def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)),
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
             (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
             Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
@@ -2882,6 +3016,34 @@ def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
             Requires<[HasT2ExtractPack, IsThumb2]>;
 
 //===----------------------------------------------------------------------===//
+// CRC32 Instructions
+//
+// Polynomials:
+// + CRC32{B,H,W}       0x04C11DB7
+// + CRC32C{B,H,W}      0x1EDC6F41
+//
+
+class T2I_crc32<bit C, bits<2> sz, string suffix, SDPatternOperator builtin>
+  : T2ThreeRegNoP<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), NoItinerary,
+               !strconcat("crc32", suffix, "\t$Rd, $Rn, $Rm"),
+               [(set rGPR:$Rd, (builtin rGPR:$Rn, rGPR:$Rm))]>,
+               Requires<[IsThumb2, HasV8, HasCRC]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-21} = 0b010110;
+  let Inst{20}    = C;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6}   = 0b10;
+  let Inst{5-4}   = sz;
+}
+
+def t2CRC32B  : T2I_crc32<0, 0b00, "b", int_arm_crc32b>;
+def t2CRC32CB : T2I_crc32<1, 0b00, "cb", int_arm_crc32cb>;
+def t2CRC32H  : T2I_crc32<0, 0b01, "h", int_arm_crc32h>;
+def t2CRC32CH : T2I_crc32<1, 0b01, "ch", int_arm_crc32ch>;
+def t2CRC32W  : T2I_crc32<0, 0b10, "w", int_arm_crc32w>;
+def t2CRC32CW : T2I_crc32<1, 0b10, "cw", int_arm_crc32cw>;
+
+//===----------------------------------------------------------------------===//
 //  Comparison Instructions...
 //
 defm t2CMP  : T2I_cmp_irs<0b1101, "cmp",
@@ -2900,7 +3062,8 @@ let isCompare = 1, Defs = [CPSR] in {
    def t2CMNri : T2OneRegCmpImm<
                 (outs), (ins GPRnopc:$Rn, t2_so_imm:$imm), IIC_iCMPi,
                 "cmn", ".w\t$Rn, $imm",
-                [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]> {
+                [(ARMcmn GPRnopc:$Rn, (ineg t2_so_imm:$imm))]>,
+                Sched<[WriteCMP, ReadALU]> {
      let Inst{31-27} = 0b11110;
      let Inst{25} = 0;
      let Inst{24-21} = 0b1000;
@@ -2913,7 +3076,7 @@ let isCompare = 1, Defs = [CPSR] in {
                 (outs), (ins GPRnopc:$Rn, rGPR:$Rm), IIC_iCMPr,
                 "cmn", ".w\t$Rn, $Rm",
                 [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
-                  GPRnopc:$Rn, rGPR:$Rm)]> {
+                  GPRnopc:$Rn, rGPR:$Rm)]>, Sched<[WriteCMP, ReadALU, ReadALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = 0b1000;
@@ -2928,7 +3091,8 @@ let isCompare = 1, Defs = [CPSR] in {
                 (outs), (ins GPRnopc:$Rn, t2_so_reg:$ShiftedRm), IIC_iCMPsi,
                 "cmn", ".w\t$Rn, $ShiftedRm",
                 [(BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>
-                  GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]> {
+                  GPRnopc:$Rn, t2_so_reg:$ShiftedRm)]>,
+                  Sched<[WriteCMPsi, ReadALU, ReadALU]> {
      let Inst{31-27} = 0b11101;
      let Inst{26-25} = 0b01;
      let Inst{24-21} = 0b1000;
@@ -2959,92 +3123,67 @@ defm t2TEQ  : T2I_cmp_irs<0b0100, "teq",
                          BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>>;
 
 // Conditional moves
-// FIXME: should be able to write a pattern for ARMcmov, but can't use
-// a two-value operand where a dag node expects two operands. :(
 let neverHasSideEffects = 1 in {
 
 let isCommutable = 1, isSelect = 1 in
 def t2MOVCCr : t2PseudoInst<(outs rGPR:$Rd),
-                            (ins rGPR:$false, rGPR:$Rm, pred:$p),
+                            (ins rGPR:$false, rGPR:$Rm, cmovpred:$p),
                             4, IIC_iCMOVr,
-   [/*(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm, imm:$cc, CCR:$ccr))*/]>,
-                RegConstraint<"$false = $Rd">;
+                            [(set rGPR:$Rd, (ARMcmov rGPR:$false, rGPR:$Rm,
+                                                     cmovpred:$p))]>,
+               RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 let isMoveImm = 1 in
-def t2MOVCCi : t2PseudoInst<(outs rGPR:$Rd),
-                            (ins rGPR:$false, t2_so_imm:$imm, pred:$p),
+def t2MOVCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
                    4, IIC_iCMOVi,
-[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm:$imm, imm:$cc, CCR:$ccr))*/]>,
-                   RegConstraint<"$false = $Rd">;
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,t2_so_imm:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
-// FIXME: Pseudo-ize these. For now, just mark codegen only.
 let isCodeGenOnly = 1 in {
 let isMoveImm = 1 in
-def t2MOVCCi16 : T2I<(outs rGPR:$Rd), (ins rGPR:$false, imm0_65535_expr:$imm),
-                      IIC_iCMOVi,
-                      "movw", "\t$Rd, $imm", []>,
-                      RegConstraint<"$false = $Rd"> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25} = 1;
-  let Inst{24-21} = 0b0010;
-  let Inst{20} = 0; // The S bit.
-  let Inst{15} = 0;
-
-  bits<4> Rd;
-  bits<16> imm;
-
-  let Inst{11-8}  = Rd;
-  let Inst{19-16} = imm{15-12};
-  let Inst{26}    = imm{11};
-  let Inst{14-12} = imm{10-8};
-  let Inst{7-0}   = imm{7-0};
-}
+def t2MOVCCi16
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins  rGPR:$false, imm0_65535_expr:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false, imm0_65535:$imm,
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
 
 let isMoveImm = 1 in
-def t2MOVCCi32imm : PseudoInst<(outs rGPR:$dst),
-                               (ins rGPR:$false, i32imm:$src, pred:$p),
-                    IIC_iCMOVix2, []>, RegConstraint<"$false = $dst">;
+def t2MVNCCi
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, t2_so_imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVi,
+                   [(set rGPR:$Rd,
+                         (ARMcmov rGPR:$false, t2_so_imm_not:$imm,
+                                  cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+class MOVCCShPseudo<SDPatternOperator opnode, Operand ty>
+    : t2PseudoInst<(outs rGPR:$Rd),
+                   (ins rGPR:$false, rGPR:$Rm, i32imm:$imm, cmovpred:$p),
+                   4, IIC_iCMOVsi,
+                   [(set rGPR:$Rd, (ARMcmov rGPR:$false,
+                                            (opnode rGPR:$Rm, (i32 ty:$imm)),
+                                            cmovpred:$p))]>,
+      RegConstraint<"$false = $Rd">, Sched<[WriteALU]>;
+
+def t2MOVCClsl : MOVCCShPseudo<shl,  imm0_31>;
+def t2MOVCClsr : MOVCCShPseudo<srl,  imm_sr>;
+def t2MOVCCasr : MOVCCShPseudo<sra,  imm_sr>;
+def t2MOVCCror : MOVCCShPseudo<rotr, imm0_31>;
 
 let isMoveImm = 1 in
-def t2MVNCCi : T2OneRegImm<(outs rGPR:$Rd), (ins rGPR:$false, t2_so_imm:$imm),
-                   IIC_iCMOVi, "mvn", "\t$Rd, $imm",
-[/*(set rGPR:$Rd,(ARMcmov rGPR:$false,t2_so_imm_not:$imm,
-                   imm:$cc, CCR:$ccr))*/]>,
-                   RegConstraint<"$false = $Rd"> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25} = 0;
-  let Inst{24-21} = 0b0011;
-  let Inst{20} = 0; // The S bit.
-  let Inst{19-16} = 0b1111; // Rn
-  let Inst{15} = 0;
-}
-
-class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
-                   string opc, string asm, list<dag> pattern>
-  : T2TwoRegShiftImm<oops, iops, itin, opc, asm, pattern> {
-  let Inst{31-27} = 0b11101;
-  let Inst{26-25} = 0b01;
-  let Inst{24-21} = 0b0010;
-  let Inst{20} = 0; // The S bit.
-  let Inst{19-16} = 0b1111; // Rn
-  let Inst{5-4} = opcod; // Shift type.
-}
-def t2MOVCClsl : T2I_movcc_sh<0b00, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "lsl", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
-def t2MOVCClsr : T2I_movcc_sh<0b01, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "lsr", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
-def t2MOVCCasr : T2I_movcc_sh<0b10, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "asr", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
-def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
-                             (ins rGPR:$false, rGPR:$Rm, i32imm:$imm),
-                             IIC_iCMOVsi, "ror", ".w\t$Rd, $Rm, $imm", []>,
-                 RegConstraint<"$false = $Rd">;
+def t2MOVCCi32imm
+    : t2PseudoInst<(outs rGPR:$dst),
+                   (ins rGPR:$false, i32imm:$src, cmovpred:$p),
+                   8, IIC_iCMOVix2,
+                   [(set rGPR:$dst, (ARMcmov rGPR:$false, imm:$src,
+                                             cmovpred:$p))]>,
+      RegConstraint<"$false = $dst">;
 } // isCodeGenOnly = 1
 
 } // neverHasSideEffects
@@ -3055,40 +3194,38 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
 
 // memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
-def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
-                  "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
-                  Requires<[IsThumb, HasDB]> {
+def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dmb", "\t$opt", [(int_arm_dmb (i32 imm0_15:$opt))]>,
+                Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f5;
   let Inst{3-0} = opt;
 }
 }
 
-def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
-                  "dsb", "\t$opt", []>,
-                  Requires<[IsThumb, HasDB]> {
+def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dsb", "\t$opt", [(int_arm_dsb (i32 imm0_15:$opt))]>,
+                Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f4;
   let Inst{3-0} = opt;
 }
 
-def t2ISB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
-                  "isb", "\t$opt",
-                  []>, Requires<[IsThumb, HasDB]> {
+def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
+                "isb", "\t$opt", []>, Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f6;
   let Inst{3-0} = opt;
 }
 
-class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+class T2I_ldrex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
                 InstrItinClass itin, string opc, string asm, string cstr,
                 list<dag> pattern, bits<4> rt2 = 0b1111>
   : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0001101;
   let Inst{11-8} = rt2;
-  let Inst{7-6} = 0b01;
-  let Inst{5-4} = opcod;
+  let Inst{7-4} = opcod;
   let Inst{3-0} = 0b1111;
 
   bits<4> addr;
@@ -3096,15 +3233,14 @@ class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
   let Inst{19-16} = addr;
   let Inst{15-12} = Rt;
 }
-class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
+class T2I_strex<bits<4> opcod, dag oops, dag iops, AddrMode am, int sz,
                 InstrItinClass itin, string opc, string asm, string cstr,
                 list<dag> pattern, bits<4> rt2 = 0b1111>
   : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
   let Inst{31-27} = 0b11101;
   let Inst{26-20} = 0b0001100;
   let Inst{11-8} = rt2;
-  let Inst{7-6} = 0b01;
-  let Inst{5-4} = opcod;
+  let Inst{7-4} = opcod;
 
   bits<4> Rd;
   bits<4> addr;
@@ -3115,15 +3251,18 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
 }
 
 let mayLoad = 1 in {
-def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+def t2LDREXB : T2I_ldrex<0b0100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "ldrexb", "\t$Rt, $addr", "", []>;
-def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         "ldrexb", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
+def t2LDREXH : T2I_ldrex<0b0101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "ldrexh", "\t$Rt, $addr", "", []>;
+                         "ldrexh", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
 def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
                        AddrModeNone, 4, NoItinerary,
-                       "ldrex", "\t$Rt, $addr", "", []> {
+                       "ldrex", "\t$Rt, $addr", "",
+                     [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]> {
   bits<4> Rt;
   bits<12> addr;
   let Inst{31-27} = 0b11101;
@@ -3134,7 +3273,7 @@ def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
   let Inst{7-0} = addr{7-0};
 }
 let hasExtraDefRegAllocReq = 1 in
-def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
+def t2LDREXD : T2I_ldrex<0b0111, (outs rGPR:$Rt, rGPR:$Rt2),
                          (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "ldrexd", "\t$Rt, $Rt2, $addr", "",
@@ -3142,22 +3281,60 @@ def t2LDREXD : T2I_ldrex<0b11, (outs rGPR:$Rt, rGPR:$Rt2),
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
+def t2LDAEXB : T2I_ldrex<0b1100, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexb", "\t$Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+def t2LDAEXH : T2I_ldrex<0b1101, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexh", "\t$Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+def t2LDAEX  : Thumb2I<(outs rGPR:$Rt), (ins addr_offset_none:$addr),
+                       AddrModeNone, 4, NoItinerary,
+                       "ldaex", "\t$Rt, $addr", "",
+                     []>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = 0b11101111;
+}
+let hasExtraDefRegAllocReq = 1 in
+def t2LDAEXD : T2I_ldrex<0b1111, (outs rGPR:$Rt, rGPR:$Rt2),
+                         (ins addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "ldaexd", "\t$Rt, $Rt2, $addr", "",
+                         [], {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+
+  let Inst{7} = 1;
+}
 }
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
-def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
+def t2STREXB : T2I_strex<0b0100, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "strexb", "\t$Rd, $Rt, $addr", "", []>;
-def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
+                         "strexb", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd, (strex_1 rGPR:$Rt,
+                                                  addr_offset_none:$addr))]>;
+def t2STREXH : T2I_strex<0b0101, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "strexh", "\t$Rd, $Rt, $addr", "", []>;
+                         "strexh", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd, (strex_2 rGPR:$Rt,
+                                                  addr_offset_none:$addr))]>;
+
 def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
                              t2addrmode_imm0_1020s4:$addr),
                   AddrModeNone, 4, NoItinerary,
                   "strex", "\t$Rd, $Rt, $addr", "",
-                  []> {
+                  [(set rGPR:$Rd, (strex_4 rGPR:$Rt,
+                                           t2addrmode_imm0_1020s4:$addr))]> {
   bits<4> Rd;
   bits<4> Rt;
   bits<12> addr;
@@ -3169,7 +3346,7 @@ def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
   let Inst{7-0} = addr{7-0};
 }
 let hasExtraSrcRegAllocReq = 1 in
-def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
+def t2STREXD : T2I_strex<0b0111, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
                          "strexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
@@ -3177,9 +3354,45 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
   bits<4> Rt2;
   let Inst{11-8} = Rt2;
 }
+def t2STLEXB : T2I_strex<0b1100, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexb", "\t$Rd, $Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+
+def t2STLEXH : T2I_strex<0b1101, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexh", "\t$Rd, $Rt, $addr", "",
+                         []>, Requires<[IsThumb, HasV8]>;
+
+def t2STLEX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
+                             addr_offset_none:$addr),
+                  AddrModeNone, 4, NoItinerary,
+                  "stlex", "\t$Rd, $Rt, $addr", "",
+                  []>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rd;
+  bits<4> Rt;
+  bits<4> addr;
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{19-16} = addr;
+  let Inst{15-12} = Rt;
+  let Inst{11-4}  = 0b11111110;
+  let Inst{3-0}   = Rd;
+}
+let hasExtraSrcRegAllocReq = 1 in
+def t2STLEXD : T2I_strex<0b1111, (outs rGPR:$Rd),
+                         (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr),
+                         AddrModeNone, 4, NoItinerary,
+                         "stlexd", "\t$Rd, $Rt, $Rt2, $addr", "", [],
+                         {?, ?, ?, ?}>, Requires<[IsThumb, HasV8]> {
+  bits<4> Rt2;
+  let Inst{11-8} = Rt2;
+}
 }
 
-def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>,
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
             Requires<[IsThumb2, HasV7]>  {
   let Inst{31-16} = 0xf3bf;
   let Inst{15-14} = 0b10;
@@ -3190,6 +3403,15 @@ def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>,
   let Inst{3-0} = 0b1111;
 }
 
+def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
+            (t2LDREXB addr_offset_none:$addr)>;
+def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
+            (t2LDREXH addr_offset_none:$addr)>;
+def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+            (t2STREXB GPR:$Rt, addr_offset_none:$addr)>;
+def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+            (t2STREXH GPR:$Rt, addr_offset_none:$addr)>;
+
 //===----------------------------------------------------------------------===//
 // SJLJ Exception handling intrinsics
 //   eh_sjlj_setjmp() is an instruction sequence to store the return
@@ -3243,35 +3465,39 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
 let isPredicable = 1 in
 def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
                  "b", ".w\t$target",
-                 [(br bb:$target)]> {
+                 [(br bb:$target)]>, Sched<[WriteBr]> {
   let Inst{31-27} = 0b11110;
   let Inst{15-14} = 0b10;
   let Inst{12} = 1;
 
   bits<24> target;
-  let Inst{26} = target{19};
-  let Inst{11} = target{18};
-  let Inst{13} = target{17};
+  let Inst{26} = target{23};
+  let Inst{13} = target{22};
+  let Inst{11} = target{21};
   let Inst{25-16} = target{20-11};
   let Inst{10-0} = target{10-0};
   let DecoderMethod = "DecodeT2BInstruction";
-}
+  let AsmMatchConverter = "cvtThumbBranches"; 
+} 
 
 let isNotDuplicable = 1, isIndirectBranch = 1 in {
 def t2BR_JT : t2PseudoInst<(outs),
           (ins GPR:$target, GPR:$index, i32imm:$jt, i32imm:$id),
            0, IIC_Br,
-          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>;
+          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]>,
+          Sched<[WriteBr]>;
 
 // FIXME: Add a non-pc based case that can be predicated.
 def t2TBB_JT : t2PseudoInst<(outs),
-        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
 
 def t2TBH_JT : t2PseudoInst<(outs),
-        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>;
+        (ins GPR:$index, i32imm:$jt, i32imm:$id), 0, IIC_Br, []>,
+        Sched<[WriteBr]>;
 
 def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
-                    "tbb", "\t$addr", []> {
+                    "tbb", "\t$addr", []>, Sched<[WriteBrTbl]> {
   bits<4> Rn;
   bits<4> Rm;
   let Inst{31-20} = 0b111010001101;
@@ -3284,7 +3510,7 @@ def t2TBB : T2I<(outs), (ins addrmode_tbb:$addr), IIC_Br,
 }
 
 def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
-                   "tbh", "\t$addr", []> {
+                   "tbh", "\t$addr", []>, Sched<[WriteBrTbl]> {
   bits<4> Rn;
   bits<4> Rm;
   let Inst{31-20} = 0b111010001101;
@@ -3304,7 +3530,7 @@ def t2TBH : T2I<(outs), (ins addrmode_tbh:$addr), IIC_Br,
 let isBranch = 1, isTerminator = 1 in
 def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
                 "b", ".w\t$target",
-                [/*(ARMbrcond bb:$target, imm:$cc)*/]> {
+                [/*(ARMbrcond bb:$target, imm:$cc)*/]>, Sched<[WriteBr]> {
   let Inst{31-27} = 0b11110;
   let Inst{15-14} = 0b10;
   let Inst{12} = 0;
@@ -3320,6 +3546,7 @@ def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
   let Inst{10-0} = target{11-1};
 
   let DecoderMethod = "DecodeThumb2BCCInstruction";
+  let AsmMatchConverter = "cvtThumbBranches";
 }
 
 // Tail calls. The IOS version of thumb tail calls uses a t2 branch, so
@@ -3331,14 +3558,15 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
                    (ins uncondbrtarget:$dst, pred:$p),
                    4, IIC_Br, [],
                    (t2B uncondbrtarget:$dst, pred:$p)>,
-                 Requires<[IsThumb2, IsIOS]>;
+                 Requires<[IsThumb2, IsIOS]>, Sched<[WriteBr]>;
 }
 
 // IT block
 let Defs = [ITSTATE] in
 def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
                     AddrModeNone, 2,  IIC_iALUx,
-                    "it$mask\t$cc", "", []> {
+                    "it$mask\t$cc", "", []>,
+           ComplexDeprecationPredicate<"IT"> {
   // 16-bit instruction.
   let Inst{31-16} = 0x0000;
   let Inst{15-8} = 0b10111111;
@@ -3353,7 +3581,8 @@ def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
 
 // Branch and Exchange Jazelle -- for disassembly only
 // Rm = Inst{19-16}
-def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []> {
+def t2BXJ : T2I<(outs), (ins rGPR:$func), NoItinerary, "bxj", "\t$func", []>,
+    Sched<[WriteBr]> {
   bits<4> func;
   let Inst{31-27} = 0b11110;
   let Inst{26} = 0;
@@ -3367,7 +3596,7 @@ let isBranch = 1, isTerminator = 1 in {
   def tCBZ  : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
                   "cbz\t$Rn, $target", []>,
               T1Misc<{0,0,?,1,?,?,?}>,
-              Requires<[IsThumb2]> {
+              Requires<[IsThumb2]>, Sched<[WriteBr]> {
     // A8.6.27
     bits<6> target;
     bits<3> Rn;
@@ -3379,7 +3608,7 @@ let isBranch = 1, isTerminator = 1 in {
   def tCBNZ : T1I<(outs), (ins tGPR:$Rn, t_cbtarget:$target), IIC_Br,
                   "cbnz\t$Rn, $target", []>,
               T1Misc<{1,0,?,1,?,?,?}>,
-              Requires<[IsThumb2]> {
+              Requires<[IsThumb2]>, Sched<[WriteBr]> {
     // A8.6.27
     bits<6> target;
     bits<3> Rn;
@@ -3411,27 +3640,34 @@ class t2CPS<dag iops, string asm_op> : T2XI<(outs), iops, NoItinerary,
 
 let M = 1 in
   def t2CPS3p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags, i32imm:$mode),
-                      "$imod.w\t$iflags, $mode">;
+                      "$imod\t$iflags, $mode">;
 let mode = 0, M = 0 in
   def t2CPS2p : t2CPS<(ins imod_op:$imod, iflags_op:$iflags),
                       "$imod.w\t$iflags">;
 let imod = 0, iflags = 0, M = 1 in
   def t2CPS1p : t2CPS<(ins imm0_31:$mode), "\t$mode">;
 
+def : t2InstAlias<"cps$imod.w $iflags, $mode",
+                   (t2CPS3p imod_op:$imod, iflags_op:$iflags, i32imm:$mode), 0>;
+def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>;
+
 // A6.3.4 Branches and miscellaneous control
 // Table A6-14 Change Processor State, and hint instructions
-def t2HINT : T2I<(outs), (ins imm0_4:$imm), NoItinerary, "hint", "\t$imm",[]> {
-  bits<3> imm;
+def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",[]> {
+  bits<8> imm;
   let Inst{31-3} = 0b11110011101011111000000000000;
-  let Inst{2-0} = imm;
+  let Inst{7-0} = imm;
 }
 
-def : t2InstAlias<"hint$p.w $imm", (t2HINT imm0_4:$imm, pred:$p)>;
+def : t2InstAlias<"hint$p $imm", (t2HINT imm0_239:$imm, pred:$p)>;
 def : t2InstAlias<"nop$p.w", (t2HINT 0, pred:$p)>;
 def : t2InstAlias<"yield$p.w", (t2HINT 1, pred:$p)>;
 def : t2InstAlias<"wfe$p.w", (t2HINT 2, pred:$p)>;
 def : t2InstAlias<"wfi$p.w", (t2HINT 3, pred:$p)>;
 def : t2InstAlias<"sev$p.w", (t2HINT 4, pred:$p)>;
+def : t2InstAlias<"sevl$p.w", (t2HINT 5, pred:$p)> {
+  let Predicates = [IsThumb2, HasV8];
+}
 
 def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
   bits<4> opt;
@@ -3454,6 +3690,20 @@ def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
   let Inst{19-16} = opt;
 }
 
+class T2DCPS<bits<2> opt, string opc>
+  : T2I<(outs), (ins), NoItinerary, opc, "", []>, Requires<[IsThumb2, HasV8]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{26-20} = 0b1111000;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1000;
+  let Inst{11-2} = 0b0000000000;
+  let Inst{1-0} = opt;
+}
+
+def t2DCPS1 : T2DCPS<0b01, "dcps1">;
+def t2DCPS2 : T2DCPS<0b10, "dcps2">;
+def t2DCPS3 : T2DCPS<0b11, "dcps3">;
+
 class T2SRS<bits<2> Op, bit W, dag oops, dag iops, InstrItinClass itin,
             string opc, string asm, list<dag> pattern>
   : T2I<oops, iops, itin, opc, asm, pattern> {
@@ -3508,6 +3758,19 @@ def t2RFEIA  : T2RFE<0b111010011001,
                    (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
                    [/* For disassembly only; pattern left blank */]>;
 
+// B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction.
+// Exception return instruction is "subs pc, lr, #imm".
+let isReturn = 1, isBarrier = 1, isTerminator = 1, Defs = [PC] in
+def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
+                        "subs", "\tpc, lr, $imm",
+                        [(ARMintretflag imm0_255:$imm)]>,
+                   Requires<[IsThumb2]> {
+  let Inst{31-8} = 0b111100111101111010001111;
+
+  bits<8> imm;
+  let Inst{7-0} = imm;
+}
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //
@@ -3591,7 +3854,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
     let DecoderMethod = "DecodeCopMemInstruction";
   }
   def _PRE : T2CI<op31_28,
-                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
                   asm, "\t$cop, $CRd, $addr!"> {
     bits<13> addr;
     bits<4> cop;
@@ -3651,10 +3914,10 @@ defm t2LDC   : t2LdStCop<0b1110, 1, 0, "ldc">;
 defm t2LDCL  : t2LdStCop<0b1110, 1, 1, "ldcl">;
 defm t2STC   : t2LdStCop<0b1110, 0, 0, "stc">;
 defm t2STCL  : t2LdStCop<0b1110, 0, 1, "stcl">;
-defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">;
-defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">;
-defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">;
-defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
+defm t2LDC2  : t2LdStCop<0b1111, 1, 0, "ldc2">, Requires<[PreV8]>;
+defm t2LDC2L : t2LdStCop<0b1111, 1, 1, "ldc2l">, Requires<[PreV8]>;
+defm t2STC2  : t2LdStCop<0b1111, 0, 0, "stc2">, Requires<[PreV8]>;
+defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">, Requires<[PreV8]>;
 
 
 //===----------------------------------------------------------------------===//
@@ -3666,7 +3929,7 @@ defm t2STC2L : t2LdStCop<0b1111, 0, 1, "stc2l">;
 //
 // A/R class can only move from CPSR or SPSR.
 def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
-                  []>, Requires<[IsThumb2,IsARClass]> {
+                  []>, Requires<[IsThumb2,IsNotMClass]> {
   bits<4> Rd;
   let Inst{31-12} = 0b11110011111011111000;
   let Inst{11-8} = Rd;
@@ -3676,7 +3939,7 @@ def t2MRS_AR : T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, apsr",
 def : t2InstAlias<"mrs${p} $Rd, cpsr", (t2MRS_AR GPR:$Rd, pred:$p)>;
 
 def t2MRSsys_AR: T2I<(outs GPR:$Rd), (ins), NoItinerary, "mrs", "\t$Rd, spsr",
-                   []>, Requires<[IsThumb2,IsARClass]> {
+                   []>, Requires<[IsThumb2,IsNotMClass]> {
   bits<4> Rd;
   let Inst{31-12} = 0b11110011111111111000;
   let Inst{11-8} = Rd;
@@ -3709,7 +3972,7 @@ def t2MRS_M : T2I<(outs rGPR:$Rd), (ins msr_mask:$mask), NoItinerary,
 // the mask with the fields to be accessed in the special register.
 def t2MSR_AR : T2I<(outs), (ins msr_mask:$mask, rGPR:$Rn),
                    NoItinerary, "msr", "\t$mask, $Rn", []>,
-               Requires<[IsThumb2,IsARClass]> {
+               Requires<[IsThumb2,IsNotMClass]> {
   bits<5> mask;
   bits<4> Rn;
   let Inst{31-21} = 0b11110011100;
@@ -3742,8 +4005,7 @@ def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
 
 class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
                   list<dag> pattern>
-  : T2Cop<Op, oops, iops,
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
+  : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
           pattern> {
   let Inst{27-24} = 0b1110;
   let Inst{20} = direction;
@@ -3768,7 +4030,7 @@ class t2MovRRCopro<bits<4> Op, string opc, bit direction,
                    list<dag> pattern = []>
   : T2Cop<Op, (outs),
           (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
+          opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
   let Inst{27-24} = 0b1100;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -3792,33 +4054,38 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
            (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                 c_imm:$CRm, imm0_7:$opc2),
            [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                         imm:$CRm, imm:$opc2)]>;
-def : t2InstAlias<"mcr $cop, $opc1, $Rt, $CRn, $CRm",
+                         imm:$CRm, imm:$opc2)]>,
+           ComplexDeprecationPredicate<"MCR">;
+def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
-                         c_imm:$CRm, 0)>;
+                         c_imm:$CRm, 0, pred:$p)>;
 def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
              (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                           c_imm:$CRm, imm0_7:$opc2),
              [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
-                            imm:$CRm, imm:$opc2)]>;
-def : t2InstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
+                            imm:$CRm, imm:$opc2)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
-                          c_imm:$CRm, 0)>;
+                          c_imm:$CRm, 0, pred:$p)>;
 
 /* from coprocessor to ARM core register */
 def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
-             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                                   c_imm:$CRm, imm0_7:$opc2), []>;
-def : t2InstAlias<"mrc $cop, $opc1, $Rt, $CRn, $CRm",
-                  (t2MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
-                         c_imm:$CRm, 0)>;
+def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0, pred:$p)>;
 
 def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
-             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
-                                  c_imm:$CRm, imm0_7:$opc2), []>;
-def : t2InstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
-                  (t2MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
-                          c_imm:$CRm, 0)>;
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                                  c_imm:$CRm, imm0_7:$opc2), []> {
+  let Predicates = [IsThumb2, PreV8];
+}
+def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                          c_imm:$CRm, 0, pred:$p)>;
 
 def : T2v6Pat<(int_arm_mrc  imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
               (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
@@ -3833,19 +4100,24 @@ def t2MCRR : t2MovRRCopro<0b1110, "mcrr", 0,
                                        imm:$CRm)]>;
 def t2MCRR2 : t2MovRRCopro<0b1111, "mcrr2", 0,
                            [(int_arm_mcrr2 imm:$cop, imm:$opc1, GPR:$Rt,
-                                           GPR:$Rt2, imm:$CRm)]>;
+                                           GPR:$Rt2, imm:$CRm)]> {
+  let Predicates = [IsThumb2, PreV8];
+}
+
 /* from coprocessor to ARM core register */
 def t2MRRC : t2MovRRCopro<0b1110, "mrrc", 1>;
 
-def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1>;
+def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1> {
+  let Predicates = [IsThumb2, PreV8];
+}
 
 //===----------------------------------------------------------------------===//
 // Other Coprocessor Instructions.
 //
 
-def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
+def t2CDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                  c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
-                 "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                 "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                  [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
                                imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
@@ -3864,11 +4136,13 @@ def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
   let Inst{15-12} = CRd;
   let Inst{19-16} = CRn;
   let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
 }
 
 def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                    c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
-                   "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                   "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                    [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
                                   imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
@@ -3887,6 +4161,8 @@ def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
   let Inst{15-12} = CRd;
   let Inst{19-16} = CRn;
   let Inst{23-20} = opc1;
+
+  let Predicates = [IsThumb2, PreV8];
 }
 
 
@@ -3960,6 +4236,15 @@ def : T2Pat<(atomic_store_32 t2addrmode_negimm8:$addr, GPR:$val),
 def : T2Pat<(atomic_store_32 t2addrmode_so_reg:$addr, GPR:$val),
             (t2STRs     GPR:$val, t2addrmode_so_reg:$addr)>;
 
+let AddedComplexity = 8 in {
+  def : T2Pat<(atomic_load_acquire_8 addr_offset_none:$addr),  (t2LDAB addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_16 addr_offset_none:$addr), (t2LDAH addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_load_acquire_32 addr_offset_none:$addr), (t2LDA  addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_8 addr_offset_none:$addr, GPR:$val),  (t2STLB GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_16 addr_offset_none:$addr, GPR:$val), (t2STLH GPR:$val, addr_offset_none:$addr)>;
+  def : T2Pat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (t2STL  GPR:$val, addr_offset_none:$addr)>;
+}
+
 
 //===----------------------------------------------------------------------===//
 // Assembler aliases
@@ -3981,7 +4266,8 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
 
 // Aliases for ADD without the ".w" optional width specifier.
 def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
-        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>;
+        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, 
+         cc_out:$s)>;
 def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
            (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
 def : t2InstAlias<"add${s}${p} $Rd, $Rn, $Rm",
@@ -4056,9 +4342,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
                   (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
 
 // Memory barriers
-def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb, HasDB]>;
-def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb, HasDB]>;
-def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb, HasDB]>;
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[HasDB]>;
 
 // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
 // width specifier.
@@ -4085,7 +4371,7 @@ def : t2InstAlias<"ldrsh${p} $Rt, $addr",
                   (t2LDRSHs rGPR:$Rt, t2addrmode_so_reg:$addr, pred:$p)>;
 
 def : t2InstAlias<"ldr${p} $Rt, $addr",
-                  (t2LDRpci GPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
+                  (t2LDRpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
 def : t2InstAlias<"ldrb${p} $Rt, $addr",
                   (t2LDRBpci rGPR:$Rt, t2ldrlabel:$addr, pred:$p)>;
 def : t2InstAlias<"ldrh${p} $Rt, $addr",
@@ -4247,16 +4533,16 @@ def : t2InstAlias<"mvn${p} $Rd, $imm",
                   (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
 // Same for AND <--> BIC
 def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
-                  (t2ANDri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                  (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
-                  (t2ANDri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                  (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
-                  (t2BICri rGPR:$Rd, rGPR:$Rn, so_imm_not:$imm,
+                  (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 def : t2InstAlias<"and${s}${p} $Rdn, $imm",
-                  (t2BICri rGPR:$Rdn, rGPR:$Rdn, so_imm_not:$imm,
+                  (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 // Likewise, "add Rd, t2_so_imm_neg" -> sub
 def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
@@ -4298,7 +4584,7 @@ def : t2InstAlias<"adr${p} $Rd, $addr",
 
 // LDR(literal) w/ alternate [pc, #imm] syntax.
 def t2LDRpcrel   : t2AsmPseudo<"ldr${p} $Rt, $addr",
-                         (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+                         (ins GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
 def t2LDRBpcrel  : t2AsmPseudo<"ldrb${p} $Rt, $addr",
                          (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
 def t2LDRHpcrel  : t2AsmPseudo<"ldrh${p} $Rt, $addr",
@@ -4309,7 +4595,7 @@ def t2LDRSHpcrel  : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
                          (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
     // Version w/ the .w suffix.
 def : t2InstAlias<"ldr${p}.w $Rt, $addr",
-                  (t2LDRpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+                  (t2LDRpcrel GPR:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
 def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
                   (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
 def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
@@ -4321,3 +4607,10 @@ def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
 
 def : t2InstAlias<"add${p} $Rd, pc, $imm",
                   (t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
+
+// PLD/PLDW/PLI with alternate literal form.
+def : t2InstAlias<"pld${p} $addr",
+                  (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p} $addr",
+                 (t2PLIpci  t2ldr_pcrel_imm12:$addr, pred:$p)>,
+      Requires<[IsThumb2,HasV7]>;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index b5a896c..a8cdc5c 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -224,7 +224,36 @@ defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
 defm : VFPDTAnyInstAlias<"vpop${p}", "$r",
                          (VLDMDIA_UPD SP, pred:$p, dpr_reglist:$r)>;
 
-// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
+// FLDMX, FSTMX - Load and store multiple unknown precision registers for
+// pre-armv6 cores.
+// These instruction are deprecated so we don't want them to get selected.
+multiclass vfp_ldstx_mult<string asm, bit L_bit> {
+  // Unknown precision
+  def XIA :
+    AXXI4<(outs), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeNone, !strconcat(asm, "iax${p}\t$Rn, $regs"), "", []> {
+    let Inst{24-23} = 0b01;       // Increment After
+    let Inst{21}    = 0;          // No writeback
+    let Inst{20}    = L_bit;
+  }
+  def XIA_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "iax${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b01;         // Increment After
+    let Inst{21}    = 1;            // Writeback
+    let Inst{20}    = L_bit;
+  }
+  def XDB_UPD :
+    AXXI4<(outs GPR:$wb), (ins GPR:$Rn, pred:$p, dpr_reglist:$regs, variable_ops),
+          IndexModeUpd, !strconcat(asm, "dbx${p}\t$Rn!, $regs"), "$Rn = $wb", []> {
+    let Inst{24-23} = 0b10;         // Decrement Before
+    let Inst{21}    = 1;
+    let Inst{20}    = L_bit;
+  }
+}
+
+defm FLDM : vfp_ldstx_mult<"fldm", 1>;
+defm FSTM : vfp_ldstx_mult<"fstm", 0>;
 
 //===----------------------------------------------------------------------===//
 // FP Binary Operations.
@@ -304,9 +333,52 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
   let D = VFPNeonA8Domain;
 }
 
+multiclass vsel_inst<string op, bits<2> opc, int CC> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "",
+      Uses = [CPSR], AddedComplexity = 4 in {
+    def S : ASbInp<0b11100, opc, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
+                   [(set SPR:$Sd, (ARMcmov SPR:$Sm, SPR:$Sn, CC))]>,
+                   Requires<[HasFPARMv8]>;
+
+    def D : ADbInp<0b11100, opc, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
+                   [(set DPR:$Dd, (ARMcmov (f64 DPR:$Dm), (f64 DPR:$Dn), CC))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
+  }
+}
+
+// The CC constants here match ARMCC::CondCodes.
+defm VSELGT : vsel_inst<"gt", 0b11, 12>;
+defm VSELGE : vsel_inst<"ge", 0b10, 10>;
+defm VSELEQ : vsel_inst<"eq", 0b00, 0>;
+defm VSELVS : vsel_inst<"vs", 0b01, 6>;
+
+multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+    def S : ASbInp<0b11101, 0b00, opc,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
+                   [(set SPR:$Sd, (SD SPR:$Sn, SPR:$Sm))]>,
+                   Requires<[HasFPARMv8]>;
+
+    def D : ADbInp<0b11101, 0b00, opc,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"),
+                   [(set DPR:$Dd, (f64 (SD (f64 DPR:$Dn), (f64 DPR:$Dm))))]>,
+                   Requires<[HasFPARMv8, HasDPVFP]>;
+  }
+}
+
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0, ARMvmaxnm>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1, ARMvminnm>;
+
 // Match reassociated forms only if not sign dependent rounding.
 def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
-          (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+          (VNMULD DPR:$a, DPR:$b)>,
+          Requires<[NoHonorSignDependentRounding,HasDPVFP]>;
 def : Pat<(fmul (fneg SPR:$a), SPR:$b),
           (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
 
@@ -437,9 +509,11 @@ def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
   let Inst{11-8}  = 0b1011;
   let Inst{7-6}   = 0b11;
   let Inst{4}     = 0;
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
-// Between half-precision and single-precision.  For disassembly only.
+// Between half, single and double-precision.  For disassembly only.
 
 // FIXME: Verify encoding after integrated assembler is working.
 def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
@@ -464,6 +538,111 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
 
+def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}     = Dm{3-0};
+  let Inst{5}       = Dm{4};
+  let Inst{15-12}   = Sd{4-1};
+  let Inst{22}      = Sd{0};
+}
+
+def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+}
+
+multiclass vcvt_inst<string opc, bits<2> rm> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
+                    []>, Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+
+    def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
+                    []>, Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+
+    def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
+                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0} = Dm{3-0};
+      let Inst{5}   = Dm{4};
+      let Inst{8} = 1;
+    }
+
+    def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
+                    []>, Requires<[HasFPARMv8, HasDPVFP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0}  = Dm{3-0};
+      let Inst{5}    = Dm{4};
+      let Inst{8} = 1;
+    }
+  }
+}
+
+defm VCVTA : vcvt_inst<"a", 0b00>;
+defm VCVTN : vcvt_inst<"n", 0b01>;
+defm VCVTP : vcvt_inst<"p", 0b10>;
+defm VCVTM : vcvt_inst<"m", 0b11>;
+
 def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
                   IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
@@ -478,6 +657,63 @@ def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
   let D = VFPNeonA8Domain;
 }
 
+multiclass vrint_inst_zrx<string opc, bit op, bit op2> {
+  def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
+               (outs SPR:$Sd), (ins SPR:$Sm),
+               NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
+               []>, Requires<[HasFPARMv8]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+  def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
+                (outs DPR:$Dd), (ins DPR:$Dm),
+                NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
+                []>, Requires<[HasFPARMv8, HasDPVFP]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+
+  def : InstAlias<!strconcat("vrint", opc, "$p.f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm, pred:$p)>,
+        Requires<[HasFPARMv8]>;
+  def : InstAlias<!strconcat("vrint", opc, "$p.f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm, pred:$p)>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0>;
+
+multiclass vrint_inst_anpm<string opc, bits<2> rm> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
+                   []>, Requires<[HasFPARMv8]> {
+      let Inst{17-16} = rm;
+    }
+    def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dm),
+                   NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
+                   []>, Requires<[HasFPARMv8, HasDPVFP]> {
+      let Inst{17-16} = rm;
+    }
+  }
+
+  def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>,
+        Requires<[HasFPARMv8]>;
+  def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>,
+        Requires<[HasFPARMv8,HasDPVFP]>;
+}
+
+defm VRINTA : vrint_inst_anpm<"a", 0b00>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11>;
+
 def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
                   IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
@@ -667,6 +903,8 @@ class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
   let Inst{5}     = Sm{0};
   let Inst{15-12} = Dd{3-0};
   let Inst{22}    = Dd{4};
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -738,6 +976,8 @@ class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
   let Inst{5}     = Dm{4};
   let Inst{15-12} = Sd{4-1};
   let Inst{22}    = Sd{0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -841,7 +1081,8 @@ let Constraints = "$a = $dst" in {
 class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
                           bit op5, dag oops, dag iops, InstrItinClass itin,
                           string opc, string asm, list<dag> pattern>
-  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+  Sched<[WriteCvtFP]> {
   bits<5> dst;
   // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
   let Inst{22} = dst{0};
@@ -852,11 +1093,14 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
 class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
                           bit op5, dag oops, dag iops, InstrItinClass itin,
                           string opc, string asm, list<dag> pattern>
-  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern> {
+  : AVConv1XI<op1, op2, op3, op4, op5, oops, iops, itin, opc, asm, pattern>,
+    Sched<[WriteCvtFP]> {
   bits<5> dst;
   // if dp_operation then UInt(D:Vd) else UInt(Vd:D);
   let Inst{22} = dst{4};
   let Inst{15-12} = dst{3-0};
+
+  let Predicates = [HasVFP2, HasDPVFP];
 }
 
 def VTOSHS : AVConv1XInsS_Encode<0b11101, 0b11, 0b1110, 0b1010, 0,
@@ -969,7 +1213,7 @@ def VMLAD : ADbI<0b11100, 0b00, 0, 0,
                  [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -985,7 +1229,7 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
 
 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
@@ -996,7 +1240,7 @@ def VMLSD : ADbI<0b11100, 0b00, 1, 0,
                  [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+              Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1012,7 +1256,7 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
 
 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1023,7 +1267,7 @@ def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
                   [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
                 RegConstraint<"$Ddin = $Dd">,
-                Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+                Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1039,7 +1283,7 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
 
 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
           (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
           (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1050,7 +1294,7 @@ def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
                   [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                            (f64 DPR:$Ddin)))]>,
                RegConstraint<"$Ddin = $Dd">,
-               Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+               Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 
 def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1065,7 +1309,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
 
 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
           (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP2,UseFPVMLx,DontUseFusedMAC]>;
+          Requires<[HasVFP2,HasDPVFP,UseFPVMLx,DontUseFusedMAC]>;
 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
           (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
@@ -1079,7 +1323,7 @@ def VFMAD : ADbI<0b11101, 0b10, 0, 0,
                  [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP4,UseFusedMAC]>;
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1094,7 +1338,7 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
 
 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VFMAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1103,7 +1347,7 @@ def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
 // (fma x, y, z) -> (vfms z, x, y)
 def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
           (VFMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
           (VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1114,7 +1358,7 @@ def VFMSD : ADbI<0b11101, 0b10, 1, 0,
                  [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
               RegConstraint<"$Ddin = $Dd">,
-              Requires<[HasVFP4,UseFusedMAC]>;
+              Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1129,7 +1373,7 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
 
 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
           (VFMSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
           (VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1138,14 +1382,14 @@ def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
 // (fma (fneg x), y, z) -> (vfms z, x, y)
 def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
           (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
           (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fma x, (fneg y), z) -> (vfms z, x, y)
 def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
           (VFMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
           (VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1156,7 +1400,7 @@ def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
                   [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
                                           (f64 DPR:$Ddin)))]>,
                 RegConstraint<"$Ddin = $Dd">,
-                Requires<[HasVFP4,UseFusedMAC]>;
+                Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1171,7 +1415,7 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
 
 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
           (VFNMAD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
           (VFNMAS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1180,14 +1424,14 @@ def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
 // (fneg (fma x, y, z)) -> (vfnma z, x, y)
 def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
           (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
           (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
 def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
           (VFNMAD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
           (VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1198,7 +1442,7 @@ def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
                   [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
                                            (f64 DPR:$Ddin)))]>,
                RegConstraint<"$Ddin = $Dd">,
-               Requires<[HasVFP4,UseFusedMAC]>;
+               Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 
 def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
                   (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
@@ -1212,7 +1456,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
 
 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
           (VFNMSD DPR:$dstin, DPR:$a, DPR:$b)>,
-          Requires<[HasVFP4,UseFusedMAC]>;
+          Requires<[HasVFP4,HasDPVFP,UseFusedMAC]>;
 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
           (VFNMSS SPR:$dstin, SPR:$a, SPR:$b)>,
           Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
@@ -1222,21 +1466,21 @@ def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
 // (fma x, y, (fneg z)) -> (vfnms z, x, y))
 def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
 def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
 // (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
 def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
           (VFNMSD DPR:$Ddin, DPR:$Dn, DPR:$Dm)>,
-      Requires<[HasVFP4]>;
+      Requires<[HasVFP4,HasDPVFP]>;
 def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
           (VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
       Requires<[HasVFP4]>;
@@ -1246,15 +1490,17 @@ def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
 //
 
 let neverHasSideEffects = 1 in {
-def VMOVDcc  : ARMPseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, pred:$p),
-                    4, IIC_fpUNA64,
-                    [/*(set DPR:$Dd, (ARMcmov DPR:$Dn, DPR:$Dm, imm:$cc))*/]>,
-                 RegConstraint<"$Dn = $Dd">;
-
-def VMOVScc  : ARMPseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, pred:$p),
-                    4, IIC_fpUNA32,
-                    [/*(set SPR:$Sd, (ARMcmov SPR:$Sn, SPR:$Sm, imm:$cc))*/]>,
-                 RegConstraint<"$Sn = $Sd">;
+def VMOVDcc  : PseudoInst<(outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm, cmovpred:$p),
+                    IIC_fpUNA64,
+                    [(set (f64 DPR:$Dd),
+                          (ARMcmov DPR:$Dn, DPR:$Dm, cmovpred:$p))]>,
+               RegConstraint<"$Dn = $Dd">, Requires<[HasVFP2,HasDPVFP]>;
+
+def VMOVScc  : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
+                    IIC_fpUNA32,
+                    [(set (f32 SPR:$Sd),
+                          (ARMcmov SPR:$Sn, SPR:$Sm, cmovpred:$p))]>,
+               RegConstraint<"$Sn = $Sd">, Requires<[HasVFP2]>;
 } // neverHasSideEffects
 
 //===----------------------------------------------------------------------===//
@@ -1300,6 +1546,12 @@ let Uses = [FPSCR] in {
                               "vmrs", "\t$Rt, mvfr0", []>;
   def VMRS_MVFR1 : MovFromVFP<0b0110 /* mvfr1 */, (outs GPR:$Rt), (ins),
                               "vmrs", "\t$Rt, mvfr1", []>;
+  def VMRS_MVFR2 : MovFromVFP<0b0101 /* mvfr2 */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, mvfr2", []>, Requires<[HasFPARMv8]>;
+  def VMRS_FPINST : MovFromVFP<0b1001 /* fpinst */, (outs GPR:$Rt), (ins),
+                              "vmrs", "\t$Rt, fpinst", []>;
+  def VMRS_FPINST2 : MovFromVFP<0b1010 /* fpinst2 */, (outs GPR:$Rt), (ins),
+                                "vmrs", "\t$Rt, fpinst2", []>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1333,6 +1585,11 @@ let Defs = [FPSCR] in {
   // System level GPR -> FPSID
   def VMSR_FPSID : MovToVFP<0b0000 /* fpsid */, (outs), (ins GPR:$src),
                       "vmsr", "\tfpsid, $src", []>;
+
+  def VMSR_FPINST : MovToVFP<0b1001 /* fpinst */, (outs), (ins GPR:$src),
+                              "vmsr", "\tfpinst, $src", []>;
+  def VMSR_FPINST2 : MovToVFP<0b1010 /* fpinst2 */, (outs), (ins GPR:$src),
+                                "vmsr", "\tfpinst2, $src", []>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1344,7 +1601,8 @@ let isReMaterializable = 1 in {
 def FCONSTD : VFPAI<(outs DPR:$Dd), (ins vfp_f64imm:$imm),
                     VFPMiscFrm, IIC_fpUNA64,
                     "vmov", ".f64\t$Dd, $imm",
-                    [(set DPR:$Dd, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
+                    [(set DPR:$Dd, vfp_f64imm:$imm)]>,
+              Requires<[HasVFP3,HasDPVFP]> {
   bits<5> Dd;
   bits<8> imm;
 
@@ -1426,23 +1684,23 @@ def : VFP2MnemonicAlias<"fmrx", "vmrs">;
 def : VFP2MnemonicAlias<"fmxr", "vmsr">;
 
 // Be friendly and accept the old form of zero-compare
-def : VFP2InstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
+def : VFP2DPInstAlias<"fcmpzd${p} $val", (VCMPZD DPR:$val, pred:$p)>;
 def : VFP2InstAlias<"fcmpzs${p} $val", (VCMPZS SPR:$val, pred:$p)>;
 
 
 def : VFP2InstAlias<"fmstat${p}", (FMSTAT pred:$p)>;
 def : VFP2InstAlias<"fadds${p} $Sd, $Sn, $Sm",
                     (VADDS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"faddd${p} $Dd, $Dn, $Dm",
-                    (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"faddd${p} $Dd, $Dn, $Dm",
+                      (VADDD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
 def : VFP2InstAlias<"fsubs${p} $Sd, $Sn, $Sm",
                     (VSUBS SPR:$Sd, SPR:$Sn, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"fsubd${p} $Dd, $Dn, $Dm",
-                    (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"fsubd${p} $Dd, $Dn, $Dm",
+                      (VSUBD DPR:$Dd, DPR:$Dn, DPR:$Dm, pred:$p)>;
 
 // No need for the size suffix on VSQRT. It's implied by the register classes.
 def : VFP2InstAlias<"vsqrt${p} $Sd, $Sm", (VSQRTS SPR:$Sd, SPR:$Sm, pred:$p)>;
-def : VFP2InstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
+def : VFP2DPInstAlias<"vsqrt${p} $Dd, $Dm", (VSQRTD DPR:$Dd, DPR:$Dm, pred:$p)>;
 
 // VLDR/VSTR accept an optional type suffix.
 def : VFP2InstAlias<"vldr${p}.32 $Sd, $addr",
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index c8ed576..61596d5 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -90,6 +90,10 @@ namespace {
     typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
     typedef MemOpQueue::iterator MemOpQueueIter;
 
+    void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
+                          const MemOpQueue &MemOps, unsigned DefReg,
+                          unsigned RangeBegin, unsigned RangeEnd);
+
     bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   int Offset, unsigned Base, bool BaseKill, int Opcode,
                   ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
@@ -109,12 +113,12 @@ namespace {
                         unsigned PredReg,
                         unsigned Scratch,
                         DebugLoc dl,
-                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
     void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                       int Opcode, unsigned Size,
                       ARMCC::CondCodes Pred, unsigned PredReg,
                       unsigned Scratch, MemOpQueue &MemOps,
-                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
 
     void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
     bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
@@ -360,6 +364,62 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
   return true;
 }
 
+/// \brief Find all instructions using a given imp-def within a range.
+///
+/// We are trying to combine a range of instructions, one of which (located at
+/// position RangeBegin) implicitly defines a register. The final LDM/STM will
+/// be placed at RangeEnd, and so any uses of this definition between RangeStart
+/// and RangeEnd must be modified to use an undefined value.
+///
+/// The live range continues until we find a second definition or one of the
+/// uses we find is a kill. Unfortunately MemOps is not sorted by Position, so
+/// we must consider all uses and decide which are relevant in a second pass.
+void ARMLoadStoreOpt::findUsesOfImpDef(
+    SmallVectorImpl<MachineOperand *> &UsesOfImpDefs, const MemOpQueue &MemOps,
+    unsigned DefReg, unsigned RangeBegin, unsigned RangeEnd) {
+  std::map<unsigned, MachineOperand *> Uses;
+  unsigned LastLivePos = RangeEnd;
+
+  // First we find all uses of this register with Position between RangeBegin
+  // and RangeEnd, any or all of these could be uses of a definition at
+  // RangeBegin. We also record the latest position a definition at RangeBegin
+  // would be considered live.
+  for (unsigned i = 0; i < MemOps.size(); ++i) {
+    MachineInstr &MI = *MemOps[i].MBBI;
+    unsigned MIPosition = MemOps[i].Position;
+    if (MIPosition <= RangeBegin || MIPosition > RangeEnd)
+      continue;
+
+    // If this instruction defines the register, then any later use will be of
+    // that definition rather than ours.
+    if (MI.definesRegister(DefReg))
+      LastLivePos = std::min(LastLivePos, MIPosition);
+
+    MachineOperand *UseOp = MI.findRegisterUseOperand(DefReg);
+    if (!UseOp)
+      continue;
+
+    // If this instruction kills the register then (assuming liveness is
+    // correct when we start) we don't need to think about anything after here.
+    if (UseOp->isKill())
+      LastLivePos = std::min(LastLivePos, MIPosition);
+
+    Uses[MIPosition] = UseOp;
+  }
+
+  // Now we traverse the list of all uses, and append the ones that actually use
+  // our definition to the requested list.
+  for (std::map<unsigned, MachineOperand *>::iterator I = Uses.begin(),
+                                                      E = Uses.end();
+       I != E; ++I) {
+    // List is sorted by position so once we've found one out of range there
+    // will be no more to consider.
+    if (I->first > LastLivePos)
+      break;
+    UsesOfImpDefs.push_back(I->second);
+  }
+}
+
 // MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
 // success.
 void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
@@ -371,7 +431,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                      ARMCC::CondCodes Pred, unsigned PredReg,
                                      unsigned Scratch,
                                      DebugLoc dl,
-                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
+                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
   // First calculate which of the registers should be killed by the merged
   // instruction.
   const unsigned insertPos = memOps[insertAfter].Position;
@@ -392,6 +452,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
 
   SmallVector<std::pair<unsigned, bool>, 8> Regs;
   SmallVector<unsigned, 8> ImpDefs;
+  SmallVector<MachineOperand *, 8> UsesOfImpDefs;
   for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
     unsigned Reg = memOps[i].Reg;
     // If we are inserting the merged operation after an operation that
@@ -406,6 +467,12 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
       unsigned DefReg = MO->getReg();
       if (std::find(ImpDefs.begin(), ImpDefs.end(), DefReg) == ImpDefs.end())
         ImpDefs.push_back(DefReg);
+
+      // There may be other uses of the definition between this instruction and
+      // the eventual LDM/STM position. These should be marked undef if the
+      // merge takes place.
+      findUsesOfImpDef(UsesOfImpDefs, memOps, DefReg, memOps[i].Position,
+                       insertPos);
     }
   }
 
@@ -418,6 +485,16 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
 
   // Merge succeeded, update records.
   Merges.push_back(prior(Loc));
+
+  // In gathering loads together, we may have moved the imp-def of a register
+  // past one of its uses. This is OK, since we know better than the rest of
+  // LLVM what's OK with ARM loads and stores; but we still have to adjust the
+  // affected uses.
+  for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
+                                                   E = UsesOfImpDefs.end();
+       I != E; ++I)
+    (*I)->setIsUndef();
+
   for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
     // Remove kill flags from any memops that come before insertPos.
     if (Regs[i-memOpsBegin].second) {
@@ -444,10 +521,10 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
 /// load / store multiple instructions.
 void
 ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
-                          unsigned Base, int Opcode, unsigned Size,
-                          ARMCC::CondCodes Pred, unsigned PredReg,
-                          unsigned Scratch, MemOpQueue &MemOps,
-                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
+                         unsigned Base, int Opcode, unsigned Size,
+                         ARMCC::CondCodes Pred, unsigned PredReg,
+                         unsigned Scratch, MemOpQueue &MemOps,
+                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
   bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
   int Offset = MemOps[SIndex].Offset;
   int SOffset = Offset;
@@ -489,7 +566,10 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
     if (Reg != ARM::SP &&
         NewOffset == Offset + (int)Size &&
         ((isNotVFP && RegNum > PRegNum) ||
-         ((Count < Limit) && RegNum == PRegNum+1))) {
+         ((Count < Limit) && RegNum == PRegNum+1)) &&
+        // On Swift we don't want vldm/vstm to start with a odd register num
+        // because Q register unaligned vldm/vstm need more uops.
+        (!STI->isSwift() || isNotVFP || Count != 1 || !(PRegNum & 0x1))) {
       Offset += Size;
       PRegNum = RegNum;
       ++Count;
@@ -1484,7 +1564,7 @@ namespace {
                           unsigned &PredReg, ARMCC::CondCodes &Pred,
                           bool &isT2);
     bool RescheduleOps(MachineBasicBlock *MBB,
-                       SmallVector<MachineInstr*, 4> &Ops,
+                       SmallVectorImpl<MachineInstr *> &Ops,
                        unsigned Base, bool isLd,
                        DenseMap<MachineInstr*, unsigned> &MI2LocMap);
     bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
@@ -1602,8 +1682,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
     return false;
 
   // Make sure the base address satisfies i64 ld / st alignment requirement.
+  // At the moment, we ignore the memoryoperand's value.
+  // If we want to use AliasAnalysis, we should check it accordingly.
   if (!Op0->hasOneMemOperand() ||
-      !(*Op0->memoperands_begin())->getValue() ||
       (*Op0->memoperands_begin())->isVolatile())
     return false;
 
@@ -1655,7 +1736,7 @@ namespace {
 }
 
 bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
-                                 SmallVector<MachineInstr*, 4> &Ops,
+                                 SmallVectorImpl<MachineInstr *> &Ops,
                                  unsigned Base, bool isLd,
                                  DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
   bool RetVal = false;
@@ -1857,9 +1938,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
           if (!StopHere)
             BI->second.push_back(MI);
         } else {
-          SmallVector<MachineInstr*, 4> MIs;
-          MIs.push_back(MI);
-          Base2LdsMap[Base] = MIs;
+          Base2LdsMap[Base].push_back(MI);
           LdBases.push_back(Base);
         }
       } else {
@@ -1875,9 +1954,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
           if (!StopHere)
             BI->second.push_back(MI);
         } else {
-          SmallVector<MachineInstr*, 4> MIs;
-          MIs.push_back(MI);
-          Base2StsMap[Base] = MIs;
+          Base2StsMap[Base].push_back(MI);
           StBases.push_back(Base);
         }
       }
@@ -1893,7 +1970,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
     // Re-schedule loads.
     for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
       unsigned Base = LdBases[i];
-      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
+      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
       if (Lds.size() > 1)
         RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
     }
@@ -1901,7 +1978,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
     // Re-schedule stores.
     for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
       unsigned Base = StBases[i];
-      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
+      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
       if (Sts.size() > 1)
         RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
     }
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index b641483..e12c9c6 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -82,7 +82,7 @@ bool ARMAsmPrinter::lowerOperand(const MachineOperand &MO,
         MO.getMBB()->getSymbol(), OutContext));
     break;
   case MachineOperand::MO_GlobalAddress:
-    MCOp = GetSymbolRef(MO, Mang->getSymbol(MO.getGlobal()));
+    MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
     break;
   case MachineOperand::MO_ExternalSymbol:
    MCOp = GetSymbolRef(MO,
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index f4248fc..010edf3 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -36,6 +36,13 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   /// 'isThumb'.
   bool hasThumb2;
 
+  /// StByValParamsPadding - For parameter that is split between
+  /// GPRs and memory; while recovering GPRs part, when
+  /// StackAlignment == 8, and GPRs-part-size mod 8 != 0,
+  /// we need to insert gap before parameter start address. It allows to
+  /// "attach" GPR-part to the part that was passed via stack.
+  unsigned StByValParamsPadding;
+
   /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
   ///
   unsigned ArgRegsSaveSize;
@@ -77,12 +84,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
   unsigned GPRCS2Size;
   unsigned DPRCSSize;
 
-  /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
-  /// which belong to these spill areas.
-  BitVector GPRCS1Frames;
-  BitVector GPRCS2Frames;
-  BitVector DPRCSFrames;
-
   /// NumAlignedDPRCS2Regs - The number of callee-saved DPRs that are saved in
   /// the aligned portion of the stack frame.  This is always a contiguous
   /// sequence of D-registers starting from d8.
@@ -121,7 +122,6 @@ public:
     LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
-    GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
     NumAlignedDPRCS2Regs(0),
     JumpTableUId(0), PICLabelUId(0),
     VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
@@ -129,11 +129,11 @@ public:
   explicit ARMFunctionInfo(MachineFunction &MF) :
     isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
     hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
+    StByValParamsPadding(0),
     ArgRegsSaveSize(0), HasStackFrame(false), RestoreSPFromFP(false),
     LRSpilledForFarJump(false),
     FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
     GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
-    GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
     JumpTableUId(0), PICLabelUId(0),
     VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
 
@@ -141,7 +141,14 @@ public:
   bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
   bool isThumb2Function() const { return isThumb && hasThumb2; }
 
-  unsigned getArgRegsSaveSize() const { return ArgRegsSaveSize; }
+  unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; }
+  void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; }
+
+  unsigned getArgRegsSaveSize(unsigned Align = 0) const {
+    if (!Align)
+      return ArgRegsSaveSize;
+    return (ArgRegsSaveSize + Align - 1) & ~(Align - 1);
+  }
   void setArgRegsSaveSize(unsigned s) { ArgRegsSaveSize = s; }
 
   bool hasStackFrame() const { return HasStackFrame; }
@@ -175,59 +182,6 @@ public:
   void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
   void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
 
-  bool isGPRCalleeSavedArea1Frame(int fi) const {
-    if (fi < 0 || fi >= (int)GPRCS1Frames.size())
-      return false;
-    return GPRCS1Frames[fi];
-  }
-  bool isGPRCalleeSavedArea2Frame(int fi) const {
-    if (fi < 0 || fi >= (int)GPRCS2Frames.size())
-      return false;
-    return GPRCS2Frames[fi];
-  }
-  bool isDPRCalleeSavedAreaFrame(int fi) const {
-    if (fi < 0 || fi >= (int)DPRCSFrames.size())
-      return false;
-    return DPRCSFrames[fi];
-  }
-
-  void addGPRCalleeSavedArea1Frame(int fi) {
-    if (fi >= 0) {
-      int Size = GPRCS1Frames.size();
-      if (fi >= Size) {
-        Size *= 2;
-        if (fi >= Size)
-          Size = fi+1;
-        GPRCS1Frames.resize(Size);
-      }
-      GPRCS1Frames[fi] = true;
-    }
-  }
-  void addGPRCalleeSavedArea2Frame(int fi) {
-    if (fi >= 0) {
-      int Size = GPRCS2Frames.size();
-      if (fi >= Size) {
-        Size *= 2;
-        if (fi >= Size)
-          Size = fi+1;
-        GPRCS2Frames.resize(Size);
-      }
-      GPRCS2Frames[fi] = true;
-    }
-  }
-  void addDPRCalleeSavedAreaFrame(int fi) {
-    if (fi >= 0) {
-      int Size = DPRCSFrames.size();
-      if (fi >= Size) {
-        Size *= 2;
-        if (fi >= Size)
-          Size = fi+1;
-        DPRCSFrames.resize(Size);
-      }
-      DPRCSFrames[fi] = true;
-    }
-  }
-
   unsigned createJumpTableUId() {
     return JumpTableUId++;
   }
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
index 6f3819a..a788036 100644
--- a/lib/Target/ARM/ARMRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -18,7 +18,6 @@ using namespace llvm;
 
 void ARMRegisterInfo::anchor() { }
 
-ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii,
-                                 const ARMSubtarget &sti)
-  : ARMBaseRegisterInfo(tii, sti) {
+ARMRegisterInfo::ARMRegisterInfo(const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(sti) {
 }
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
index 8a24842..fb1537c 100644
--- a/lib/Target/ARM/ARMRegisterInfo.h
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -19,13 +19,13 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 
 namespace llvm {
-  class ARMSubtarget;
-  class ARMBaseInstrInfo;
+
+class ARMSubtarget;
 
 struct ARMRegisterInfo : public ARMBaseRegisterInfo {
   virtual void anchor();
 public:
-  ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+  ARMRegisterInfo(const ARMSubtarget &STI);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index b0f576b..d045761 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -27,31 +27,31 @@ class ARMFReg<bits<16> Enc, string n> : Register<n> {
 
 // Subregister indices.
 let Namespace = "ARM" in {
-def qqsub_0 : SubRegIndex;
-def qqsub_1 : SubRegIndex;
+def qqsub_0 : SubRegIndex<256>;
+def qqsub_1 : SubRegIndex<256, 256>;
 
 // Note: Code depends on these having consecutive numbers.
-def qsub_0 : SubRegIndex;
-def qsub_1 : SubRegIndex;
-def qsub_2 : SubRegIndex<[qqsub_1, qsub_0]>;
-def qsub_3 : SubRegIndex<[qqsub_1, qsub_1]>;
-
-def dsub_0 : SubRegIndex;
-def dsub_1 : SubRegIndex;
-def dsub_2 : SubRegIndex<[qsub_1, dsub_0]>;
-def dsub_3 : SubRegIndex<[qsub_1, dsub_1]>;
-def dsub_4 : SubRegIndex<[qsub_2, dsub_0]>;
-def dsub_5 : SubRegIndex<[qsub_2, dsub_1]>;
-def dsub_6 : SubRegIndex<[qsub_3, dsub_0]>;
-def dsub_7 : SubRegIndex<[qsub_3, dsub_1]>;
-
-def ssub_0  : SubRegIndex;
-def ssub_1  : SubRegIndex;
-def ssub_2  : SubRegIndex<[dsub_1, ssub_0]>;
-def ssub_3  : SubRegIndex<[dsub_1, ssub_1]>;
-
-def gsub_0  : SubRegIndex;
-def gsub_1  : SubRegIndex;
+def qsub_0 : SubRegIndex<128>;
+def qsub_1 : SubRegIndex<128, 128>;
+def qsub_2 : ComposedSubRegIndex<qqsub_1, qsub_0>;
+def qsub_3 : ComposedSubRegIndex<qqsub_1, qsub_1>;
+
+def dsub_0 : SubRegIndex<64>;
+def dsub_1 : SubRegIndex<64, 64>;
+def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
+def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
+def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
+def dsub_5 : ComposedSubRegIndex<qsub_2, dsub_1>;
+def dsub_6 : ComposedSubRegIndex<qsub_3, dsub_0>;
+def dsub_7 : ComposedSubRegIndex<qsub_3, dsub_1>;
+
+def ssub_0  : SubRegIndex<32>;
+def ssub_1  : SubRegIndex<32, 32>;
+def ssub_2  : ComposedSubRegIndex<dsub_1, ssub_0>;
+def ssub_3  : ComposedSubRegIndex<dsub_1, ssub_1>;
+
+def gsub_0  : SubRegIndex<32>;
+def gsub_1  : SubRegIndex<32, 32>;
 // Let TableGen synthesize the remaining 12 ssub_* indices.
 // We don't need to name them.
 }
@@ -157,21 +157,27 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
 
 // Current Program Status Register.
 // We model fpscr with two registers: FPSCR models the control bits and will be
-// reserved. FPSCR_NZCV models the flag bits and will be unreserved. 
-def CPSR       : ARMReg<0, "cpsr">;
-def APSR       : ARMReg<1, "apsr">;
-def SPSR       : ARMReg<2, "spsr">;
-def FPSCR      : ARMReg<3, "fpscr">;
-def FPSCR_NZCV : ARMReg<3, "fpscr_nzcv"> {
+// reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
+// models the APSR when it's accessed by some special instructions. In such cases 
+// it has the same encoding as PC.
+def CPSR       : ARMReg<0,  "cpsr">;
+def APSR       : ARMReg<1,  "apsr">;
+def APSR_NZCV  : ARMReg<15, "apsr_nzcv">; 
+def SPSR       : ARMReg<2,  "spsr">;
+def FPSCR      : ARMReg<3,  "fpscr">;
+def FPSCR_NZCV : ARMReg<3,  "fpscr_nzcv"> {
   let Aliases = [FPSCR];
 }
 def ITSTATE    : ARMReg<4, "itstate">;
 
 // Special Registers - only available in privileged mode.
-def FPSID   : ARMReg<0, "fpsid">;
-def MVFR1   : ARMReg<6, "mvfr1">;
-def MVFR0   : ARMReg<7, "mvfr0">;
-def FPEXC   : ARMReg<8, "fpexc">;
+def FPSID   : ARMReg<0,  "fpsid">;
+def MVFR2   : ARMReg<5,  "mvfr2">;
+def MVFR1   : ARMReg<6,  "mvfr1">;
+def MVFR0   : ARMReg<7,  "mvfr0">;
+def FPEXC   : ARMReg<8,  "fpexc">;
+def FPINST  : ARMReg<9,  "fpinst">;
+def FPINST2 : ARMReg<10, "fpinst2">;
 
 // Register classes.
 //
@@ -207,6 +213,16 @@ def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
   }];
 }
 
+// GPRs without the PC but with APSR. Some instructions allow accessing the
+// APSR, while actually encoding PC in the register field. This is usefull
+// for assembly and disassembly only.
+def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
+  let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
+  let AltOrderSelect = [{
+      return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
+  }];
+}
+
 // GPRsp - Only the SP is legal. Used by Thumb1 instructions that want the
 // implied SP argument list.
 // FIXME: It would be better to not use this at all and refactor the
@@ -236,7 +252,7 @@ def hGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, tGPR)>;
 // to the saved value before the tail call, which would clobber a call address.
 // Note, getMinimalPhysRegClass(R0) returns tGPR because of the names of
 // this class and the preceding one(!)  This is what we want.
-def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R9, R12)> {
+def tcGPR : RegisterClass<"ARM", [i32], 32, (add R0, R1, R2, R3, R12)> {
   let AltOrders = [(and tcGPR, tGPR)];
   let AltOrderSelect = [{
       return MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 2d088de..528c4ec 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -69,6 +69,24 @@ def WriteCMP : SchedWrite;
 def WriteCMPsi : SchedWrite;
 def WriteCMPsr : SchedWrite;
 
+// Division.
+def WriteDiv : SchedWrite;
+
+// Loads.
+def WriteLd : SchedWrite;
+def WritePreLd : SchedWrite;
+
+// Branches.
+def WriteBr : SchedWrite;
+def WriteBrL : SchedWrite;
+def WriteBrTbl : SchedWrite;
+
+// Fixpoint conversions.
+def WriteCvtFP : SchedWrite;
+
+// Noop.
+def WriteNoop : SchedWrite;
+
 // Define TII for use in SchedVariant Predicates.
 def : PredicateProlog<[{
   const ARMBaseInstrInfo *TII =
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 9739ed2..603e775 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1879,17 +1879,18 @@ def CortexA9Itineraries : ProcessorItineraries<
 // The following definitions describe the simpler per-operand machine model.
 // This works with MachineScheduler and will eventually replace itineraries.
 
+class A9WriteLMOpsListType<list<WriteSequence> writes> {
+  list <WriteSequence> Writes = writes;
+  SchedMachineModel SchedModel = ?;
+}
 
 // Cortex-A9 machine model for scheduling and other instruction cost heuristics.
 def CortexA9Model : SchedMachineModel {
   let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
-  let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+  let MicroOpBufferSize = 56; // Based on available renamed registers.
   let LoadLatency = 2; // Optimistic load latency assuming bypass.
                        // This is overriden by OperandCycles if the
                        // Itineraries are queried instead.
-  let ILPWindow = 10; // Don't reschedule small blocks to hide
-                      // latency. Minimum latency requirements are already
-                      // modeled strictly by reserving resources.
   let MispredictPenalty = 8; // Based on estimate of pipeline depth.
 
   let Itineraries = CortexA9Itineraries;
@@ -1904,7 +1905,7 @@ def A9UnitALU : ProcResource<2>;
 def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
 def A9UnitAGU : ProcResource<1>;
 def A9UnitLS  : ProcResource<1>;
-def A9UnitFP  : ProcResource<1> { let Buffered = 0; }
+def A9UnitFP  : ProcResource<1> { let BufferSize = 0; }
 def A9UnitB   : ProcResource<1>;
 
 //===----------------------------------------------------------------------===//
@@ -2014,7 +2015,7 @@ def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
 
 // Define a predicate to select the LDM based on number of memory addresses.
 def A9LMAdr#NumAddr#Pred :
-  SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+  SchedPredicate<"(TII->getNumLDMAddresses(MI)+1)/2 == "#NumAddr>;
 
 } // foreach NumAddr
 
@@ -2057,48 +2058,30 @@ def A9WriteL#NumAddr#Hi : WriteSequence<
 //===----------------------------------------------------------------------===//
 // LDM: Load multiple into 32-bit integer registers.
 
+def A9WriteLMOpsList : A9WriteLMOpsListType<
+                 [A9WriteL1, A9WriteL1Hi,
+                  A9WriteL2, A9WriteL2Hi,
+                  A9WriteL3, A9WriteL3Hi,
+                  A9WriteL4, A9WriteL4Hi,
+                  A9WriteL5, A9WriteL5Hi,
+                  A9WriteL6, A9WriteL6Hi,
+                  A9WriteL7, A9WriteL7Hi,
+                  A9WriteL8, A9WriteL8Hi]>;
+
 // A9WriteLM variants expand into a pair of writes for each 64-bit
 // value loaded. When the number of registers is odd, the last
 // A9WriteLnHi is naturally ignored because the instruction has no
 // following def operands.  These variants take no issue resource, so
 // they may need to be part of a WriteSequence that includes A9WriteIssue.
 def A9WriteLM : SchedWriteVariant<[
-  SchedVar<A9LMAdr1Pred, [A9WriteL1, A9WriteL1Hi]>,
-  SchedVar<A9LMAdr2Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi]>,
-  SchedVar<A9LMAdr3Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi]>,
-  SchedVar<A9LMAdr4Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi]>,
-  SchedVar<A9LMAdr5Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi]>,
-  SchedVar<A9LMAdr6Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi,
-                          A9WriteL6, A9WriteL6Hi]>,
-  SchedVar<A9LMAdr7Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi,
-                          A9WriteL6, A9WriteL6Hi,
-                          A9WriteL7, A9WriteL7Hi]>,
-  SchedVar<A9LMAdr8Pred, [A9WriteL1, A9WriteL1Hi,
-                          A9WriteL2, A9WriteL2Hi,
-                          A9WriteL3, A9WriteL3Hi,
-                          A9WriteL4, A9WriteL4Hi,
-                          A9WriteL5, A9WriteL5Hi,
-                          A9WriteL6, A9WriteL6Hi,
-                          A9WriteL7, A9WriteL7Hi,
-                          A9WriteL8, A9WriteL8Hi]>,
+  SchedVar<A9LMAdr1Pred, A9WriteLMOpsList.Writes[0-1]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMOpsList.Writes[0-3]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMOpsList.Writes[0-5]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMOpsList.Writes[0-7]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMOpsList.Writes[0-9]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMOpsList.Writes[0-11]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMOpsList.Writes[0-13]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMOpsList.Writes[0-15]>,
   // For unknown LDMs, define the maximum number of writes, but only
   // make the first two consume resources.
   SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
@@ -2180,49 +2163,39 @@ def A9WriteLMfp#NumAddr#Hi : WriteSequence<
 // pair of writes for each 64-bit data loaded. When the number of
 // registers is odd, the last WriteLMfpnHi is naturally ignored because
 // the instruction has no following def operands.
+
+def A9WriteLMfpPostRAOpsList : A9WriteLMOpsListType<
+                 [A9WriteLMfp1, A9WriteLMfp2,       // 0-1
+                  A9WriteLMfp3, A9WriteLMfp4,       // 2-3
+                  A9WriteLMfp5, A9WriteLMfp6,       // 4-5
+                  A9WriteLMfp7, A9WriteLMfp8,       // 6-7
+                  A9WriteLMfp1Hi,                   // 8-8
+                  A9WriteLMfp2Hi, A9WriteLMfp2Hi,   // 9-10
+                  A9WriteLMfp3Hi, A9WriteLMfp3Hi,   // 11-12
+                  A9WriteLMfp4Hi, A9WriteLMfp4Hi,   // 13-14
+                  A9WriteLMfp5Hi, A9WriteLMfp5Hi,   // 15-16
+                  A9WriteLMfp6Hi, A9WriteLMfp6Hi,   // 17-18
+                  A9WriteLMfp7Hi, A9WriteLMfp7Hi,   // 19-20
+                  A9WriteLMfp8Hi, A9WriteLMfp8Hi]>; // 21-22
+
 def A9WriteLMfpPostRA : SchedWriteVariant<[
-  SchedVar<A9LMAdr1Pred, [A9WriteLMfp1, A9WriteLMfp1Hi]>,
-  SchedVar<A9LMAdr2Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi]>,
-  SchedVar<A9LMAdr3Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi]>,
-  SchedVar<A9LMAdr4Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi]>,
-  SchedVar<A9LMAdr5Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi]>,
-  SchedVar<A9LMAdr6Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi,
-                          A9WriteLMfp6, A9WriteLMfp6Hi]>,
-  SchedVar<A9LMAdr7Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi,
-                          A9WriteLMfp6, A9WriteLMfp6Hi,
-                          A9WriteLMfp7, A9WriteLMfp7Hi]>,
-  SchedVar<A9LMAdr8Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                          A9WriteLMfp2, A9WriteLMfp2Hi,
-                          A9WriteLMfp3, A9WriteLMfp3Hi,
-                          A9WriteLMfp4, A9WriteLMfp4Hi,
-                          A9WriteLMfp5, A9WriteLMfp5Hi,
-                          A9WriteLMfp6, A9WriteLMfp6Hi,
-                          A9WriteLMfp7, A9WriteLMfp7Hi,
-                          A9WriteLMfp8, A9WriteLMfp8Hi]>,
+  SchedVar<A9LMAdr1Pred, A9WriteLMfpPostRAOpsList.Writes[0-0, 8-8]>,
+  SchedVar<A9LMAdr2Pred, A9WriteLMfpPostRAOpsList.Writes[0-1, 9-10]>,
+  SchedVar<A9LMAdr3Pred, A9WriteLMfpPostRAOpsList.Writes[0-2, 10-12]>,
+  SchedVar<A9LMAdr4Pred, A9WriteLMfpPostRAOpsList.Writes[0-3, 11-14]>,
+  SchedVar<A9LMAdr5Pred, A9WriteLMfpPostRAOpsList.Writes[0-4, 12-16]>,
+  SchedVar<A9LMAdr6Pred, A9WriteLMfpPostRAOpsList.Writes[0-5, 13-18]>,
+  SchedVar<A9LMAdr7Pred, A9WriteLMfpPostRAOpsList.Writes[0-6, 14-20]>,
+  SchedVar<A9LMAdr8Pred, A9WriteLMfpPostRAOpsList.Writes[0-7, 15-22]>,
   // For unknown LDMs, define the maximum number of writes, but only
-  // make the first two consume resources.
-  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp1Hi,
-                             A9WriteLMfp2, A9WriteLMfp2Hi,
-                             A9WriteLMfp3Hi, A9WriteLMfp3Hi,
-                             A9WriteLMfp4Hi, A9WriteLMfp4Hi,
+  // make the first two consume resources. We are optimizing for the case
+  // where the operands are DPRs, and this determines the first eight
+  // types. The remaining eight types are filled to cover the case
+  // where the operands are SPRs.
+  SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp2,
+                             A9WriteLMfp3Hi, A9WriteLMfp4Hi,
+                             A9WriteLMfp5Hi, A9WriteLMfp6Hi,
+                             A9WriteLMfp7Hi, A9WriteLMfp8Hi,
                              A9WriteLMfp5Hi, A9WriteLMfp5Hi,
                              A9WriteLMfp6Hi, A9WriteLMfp6Hi,
                              A9WriteLMfp7Hi, A9WriteLMfp7Hi,
@@ -2275,10 +2248,10 @@ def A9Read4 : SchedReadAdvance<3>;
 // This table follows the ARM Cortex-A9 Technical Reference Manuals,
 // mostly in order.
 
-def :ItinRW<[A9WriteI], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+def :ItinRW<[WriteALU], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
                          IIC_iMVNi,IIC_iMVNsi,
                          IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
-def :ItinRW<[A9WriteI,A9ReadALU],[IIC_iMVNr]>;
+def :ItinRW<[WriteALU, A9ReadALU],[IIC_iMVNr]>;
 def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
 
 def :ItinRW<[A9WriteI2],   [IIC_iMOVix2,IIC_iCMOVix2]>;
@@ -2487,10 +2460,59 @@ def : SchedAlias<WriteALUsr, A9WriteALUsr>;
 def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
 def : SchedAlias<ReadALU, A9ReadALU>;
 def : SchedAlias<ReadALUsr, A9ReadALU>;
-// FIXME: need to special case AND, ORR, EOR, BIC because they don't read
-// advance. But our instrinfo claims it does.
+def : InstRW< [WriteALU],
+      (instregex "ANDri", "ORRri", "EORri", "BICri", "ANDrr", "ORRrr", "EORrr",
+                 "BICrr")>;
+def : InstRW< [WriteALUsi], (instregex "ANDrsi", "ORRrsi", "EORrsi", "BICrsi")>;
+def : InstRW< [WriteALUsr], (instregex "ANDrsr", "ORRrsr", "EORrsr", "BICrsr")>;
+
 
 def : SchedAlias<WriteCMP, A9WriteALU>;
 def : SchedAlias<WriteCMPsi, A9WriteALU>;
 def : SchedAlias<WriteCMPsr, A9WriteALU>;
+
+def : InstRW< [A9WriteIsr], (instregex "MOVsr", "MOVsi", "MVNsr", "MOVCCsi",
+                                       "MOVCCsr")>;
+def : InstRW< [WriteALU, A9ReadALU], (instregex "MVNr")>;
+def : InstRW< [A9WriteI2], (instregex "MOVCCi32imm", "MOVi32imm",
+                                      "MOV_ga_dyn")>;
+def : InstRW< [A9WriteI2pc], (instregex "MOV_ga_pcrel")>;
+def : InstRW< [A9WriteI2ld], (instregex "MOV_ga_pcrel_ldr")>;
+
+def : InstRW< [WriteALU], (instregex "SEL")>;
+
+def : InstRW< [WriteALUsi], (instregex "BFC", "BFI", "UBFX", "SBFX")>;
+
+def : InstRW< [A9WriteM],
+      (instregex "MUL", "MULv5", "SMMUL", "SMMULR", "MLA", "MLAv5", "MLS",
+      "SMMLA", "SMMLAR", "SMMLS", "SMMLSR")>;
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMULL", "SMULLv5", "UMULL", "UMULLv5", "SMLAL$", "UMLAL",
+      "UMAAL", "SMLALv5", "UMLALv5", "UMAALv5", "SMLALBB", "SMLALBT", "SMLALTB",
+      "SMLALTT")>;
+// FIXME: These instructions used to have NoItinerary. Just copied the one from above.
+def : InstRW< [A9WriteM, A9WriteMHi],
+      (instregex "SMLAD", "SMLADX", "SMLALD", "SMLALDX", "SMLSD", "SMLSDX",
+      "SMLSLD", "SMLLDX", "SMUAD", "SMUADX", "SMUSD", "SMUSDX")>;
+
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMULBB", "SMULBT", "SMULTB", "SMULTT", "SMULWB", "SMULWT")>;
+def : InstRW<[A9WriteM16, A9WriteM16Hi],
+      (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLAWB", "SMLAWT")>;
+
+def : InstRW<[A9WriteL], (instregex "LDRi12", "PICLDR$")>;
+def : InstRW<[A9WriteLsi], (instregex "LDRrs")>;
+def : InstRW<[A9WriteLb],
+      (instregex "LDRBi12", "PICLDRH", "PICLDRB", "PICLDRSH", "PICLDRSB",
+      "LDRH", "LDRSH", "LDRSB")>;
+def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>;
+
+def : WriteRes<WriteDiv, []> { let Latency = 0; }
+
+def : WriteRes<WriteBr, [A9UnitB]>;
+def : WriteRes<WriteBrL, [A9UnitB]>;
+def : WriteRes<WriteBrTbl, [A9UnitB]>;
+def : WriteRes<WritePreLd, []>;
+def : SchedAlias<WriteCvtFP, A9WriteF>;
+def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
 } // SchedModel = CortexA9Model
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 7c6df41..8d7dbc2 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1076,7 +1076,7 @@ def SwiftItineraries : ProcessorItineraries<
 // Swift machine model for scheduling and other instruction cost heuristics.
 def SwiftModel : SchedMachineModel {
   let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
-  let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+  let MicroOpBufferSize = 45; // Based on NEON renamed registers.
   let LoadLatency = 3;
   let MispredictPenalty = 14; // A branch direction mispredict.
 
@@ -1096,9 +1096,27 @@ let SchedModel = SwiftModel in {
   def SwiftUnitDiv : ProcResource<1>;
 
   // Generic resource requirements.
+  def SwiftWriteP0OneCycle : SchedWriteRes<[SwiftUnitP0]>;
+  def SwiftWriteP0TwoCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 2; }
+  def SwiftWriteP0FourCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 4; }
+  def SwiftWriteP0SixCycle : SchedWriteRes<[SwiftUnitP0]> { let Latency = 6; }
+  def SwiftWriteP0P1FourCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+    let Latency = 4;
+  }
+  def SwiftWriteP0P1SixCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP1]> {
+    let Latency = 6;
+  }
+  def SwiftWriteP01OneCycle : SchedWriteRes<[SwiftUnitP01]>;
+  def SwiftWriteP1TwoCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 2; }
+  def SwiftWriteP1FourCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 4; }
+  def SwiftWriteP1SixCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 6; }
+  def SwiftWriteP1EightCycle : SchedWriteRes<[SwiftUnitP1]> { let Latency = 8; }
+  def SwiftWriteP1TwelveCyc : SchedWriteRes<[SwiftUnitP1]> { let Latency = 12; }
+  def SwiftWriteP01OneCycle2x : WriteSequence<[SwiftWriteP01OneCycle], 2>;
+  def SwiftWriteP01OneCycle3x : WriteSequence<[SwiftWriteP01OneCycle], 3>;
   def SwiftWriteP01TwoCycle : SchedWriteRes<[SwiftUnitP01]> { let Latency = 2; }
-  def SwiftWriteP01ThreeCycleTwoUops :
-    SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]> {
+  def SwiftWriteP01ThreeCycleTwoUops : SchedWriteRes<[SwiftUnitP01,
+                                                      SwiftUnitP01]> {
     let Latency = 3;
     let NumMicroOps = 2;
   }
@@ -1107,7 +1125,23 @@ let SchedModel = SwiftModel in {
     let NumMicroOps = 3;
     let ResourceCycles = [3];
   }
-
+  // Plain load without writeback.
+  def SwiftWriteP2ThreeCycle : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 3;
+  }
+  def SwiftWriteP2FourCycle : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 4;
+  }
+  // A store does not write to a register.
+  def SwiftWriteP2 : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 0;
+  }
+  foreach Num = 1-4 in {
+    def SwiftWrite#Num#xP2 : WriteSequence<[SwiftWriteP2], Num>;
+  }
+  def SwiftWriteP01OneCycle2x_load : WriteSequence<[SwiftWriteP01OneCycle,
+                                                    SwiftWriteP01OneCycle,
+                                                    SwiftWriteP2ThreeCycle]>;
   // 4.2.4 Arithmetic and Logical.
   // ALU operation register shifted by immediate variant.
   def SwiftWriteALUsi : SchedWriteVariant<[
@@ -1137,8 +1171,906 @@ let SchedModel = SwiftModel in {
   def : ReadAdvance<ReadALU, 0>;
   def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
 
+
+  def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[
+    SchedVar<IsFastImmShiftSwiftPred, [SwiftWriteP01OneCycle]>,
+    SchedVar<NoSchedPred,             [SwiftWriteP01TwoCycle]>
+  ]>;
+
   // 4.2.5 Integer comparison
   def : WriteRes<WriteCMP, [SwiftUnitP01]>;
-  def : WriteRes<WriteCMPsi, [SwiftUnitP01]>;
-  def : WriteRes<WriteCMPsr, [SwiftUnitP01]>;
+  def : SchedAlias<WriteCMPsi, SwiftChooseShiftKindP01OneOrTwoCycle>;
+  def : SchedAlias<WriteCMPsr, SwiftWriteP01TwoCycle>;
+
+  // 4.2.6 Shift, Move
+  // Shift
+  //  ASR,LSL,ROR,RRX
+  //  MOV(register-shiftedregister)  MVN(register-shiftedregister)
+  // Move
+  //  MOV,MVN
+  //  MOVT
+  // Sign/Zero extension
+  def : InstRW<[SwiftWriteP01OneCycle],
+               (instregex "SXTB", "SXTH", "SXTB16", "UXTB", "UXTH", "UXTB16",
+                          "t2SXTB", "t2SXTH", "t2SXTB16", "t2UXTB", "t2UXTH",
+                          "t2UXTB16")>;
+  // Pseudo instructions.
+  def : InstRW<[SwiftWriteP01OneCycle2x],
+        (instregex "MOVCCi32imm", "MOVi32imm", "MOV_ga_dyn", "t2MOVCCi32imm",
+                   "t2MOVi32imm", "t2MOV_ga_dyn")>;
+  def : InstRW<[SwiftWriteP01OneCycle3x],
+        (instregex "MOV_ga_pcrel", "t2MOV_ga_pcrel", "t2MOVi16_ga_pcrel")>;
+  def : InstRW<[SwiftWriteP01OneCycle2x_load],
+        (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
+
+  def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
+
+  def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+    SchedVar<NoSchedPred,     [ SwiftWriteP0OneCycle ]>
+  ]>;
+
+  // 4.2.7 Select
+  // SEL
+  def : InstRW<[SwiftPredP0OneOrTwoCycle], (instregex "SEL", "t2SEL")>;
+
+  // 4.2.8 Bitfield
+  // BFI,BFC, SBFX,UBFX
+  def : InstRW< [SwiftWriteP01TwoCycle],
+        (instregex "BFC", "BFI", "UBFX", "SBFX", "(t|t2)BFC", "(t|t2)BFI",
+        "(t|t2)UBFX", "(t|t2)SBFX")>;
+
+  // 4.2.9 Saturating arithmetic
+  def : InstRW< [SwiftWriteP01TwoCycle],
+        (instregex "QADD", "QSUB", "QDADD", "QDSUB", "SSAT", "SSAT16", "USAT",
+        "USAT16", "QADD8", "QADD16", "QSUB8", "QSUB16", "QASX", "QSAX",
+        "UQADD8", "UQADD16","UQSUB8","UQSUB16","UQASX","UQSAX", "t2QADD",
+        "t2QSUB", "t2QDADD", "t2QDSUB", "t2SSAT", "t2SSAT16", "t2USAT",
+        "t2QADD8", "t2QADD16", "t2QSUB8", "t2QSUB16", "t2QASX", "t2QSAX",
+        "t2UQADD8", "t2UQADD16","t2UQSUB8","t2UQSUB16","t2UQASX","t2UQSAX")>;
+
+  // 4.2.10 Parallel Arithmetic
+  // Not flag setting.
+  def : InstRW< [SwiftWriteALUsr],
+        (instregex "SADD8", "SADD16", "SSUB8", "SSUB16", "SASX", "SSAX",
+        "UADD8", "UADD16", "USUB8", "USUB16", "UASX", "USAX", "t2SADD8",
+        "t2SADD16", "t2SSUB8", "t2SSUB16", "t2SASX", "t2SSAX", "t2UADD8",
+        "t2UADD16", "t2USUB8", "t2USUB16", "t2UASX", "t2USAX")>;
+  // Flag setting.
+  def : InstRW< [SwiftWriteP01TwoCycle],
+       (instregex "SHADD8", "SHADD16", "SHSUB8", "SHSUB16", "SHASX", "SHSAX",
+       "SXTAB", "SXTAB16", "SXTAH", "UHADD8", "UHADD16", "UHSUB8", "UHSUB16",
+       "UHASX", "UHSAX", "UXTAB", "UXTAB16", "UXTAH", "t2SHADD8", "t2SHADD16",
+       "t2SHSUB8", "t2SHSUB16", "t2SHASX", "t2SHSAX", "t2SXTAB", "t2SXTAB16",
+       "t2SXTAH", "t2UHADD8", "t2UHADD16", "t2UHSUB8", "t2UHSUB16", "t2UHASX",
+       "t2UHSAX", "t2UXTAB", "t2UXTAB16", "t2UXTAH")>;
+
+  // 4.2.11 Sum of Absolute Difference
+  def : InstRW< [SwiftWriteP0P1FourCycle], (instregex "USAD8") >;
+  def : InstRW<[SwiftWriteP0P1FourCycle, ReadALU, ReadALU, SchedReadAdvance<2>],
+        (instregex "USADA8")>;
+
+  // 4.2.12 Integer Multiply (32-bit result)
+  // Two sources.
+  def : InstRW< [SwiftWriteP0FourCycle],
+        (instregex "MULS", "MUL", "SMMUL", "SMMULR", "SMULBB", "SMULBT",
+        "SMULTB", "SMULTT", "SMULWB", "SMULWT", "SMUSD", "SMUSDXi", "t2MUL",
+        "t2SMMUL", "t2SMMULR", "t2SMULBB", "t2SMULBT", "t2SMULTB", "t2SMULTT",
+        "t2SMULWB", "t2SMULWT", "t2SMUSD")>;
+
+  def SwiftWriteP0P01FiveCycleTwoUops :
+      SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]>  {
+    let Latency = 5;
+  }
+
+  def SwiftPredP0P01FourFiveCycle : SchedWriteVariant<[
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0P01FiveCycleTwoUops ]>,
+    SchedVar<NoSchedPred,      [ SwiftWriteP0FourCycle ]>
+  ]>;
+
+  def SwiftReadAdvanceFourCyclesPred : SchedReadVariant<[
+     SchedVar<IsPredicatedPred, [SchedReadAdvance<4>]>,
+     SchedVar<NoSchedPred,      [ReadALU]>
+  ]>;
+
+  // Multiply accumulate, three sources
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+        (instregex "MLAS", "MLA", "MLS", "SMMLA", "SMMLAR", "SMMLS", "SMMLSR",
+        "t2MLA", "t2MLS", "t2MLAS", "t2SMMLA", "t2SMMLAR", "t2SMMLS",
+        "t2SMMLSR")>;
+
+  // 4.2.13 Integer Multiply (32-bit result, Q flag)
+  def : InstRW< [SwiftWriteP0FourCycle],
+        (instregex "SMUAD", "SMUADX", "t2SMUAD", "t2SMUADX")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle, ReadALU, ReadALU,
+                 SwiftReadAdvanceFourCyclesPred],
+        (instregex "SMLABB", "SMLABT", "SMLATB", "SMLATT", "SMLSD", "SMLSDX",
+        "SMLAWB", "SMLAWT", "t2SMLABB", "t2SMLABT", "t2SMLATB", "t2SMLATT",
+        "t2SMLSD", "t2SMLSDX", "t2SMLAWB", "t2SMLAWT")>;
+  def : InstRW< [SwiftPredP0P01FourFiveCycle],
+        (instregex "SMLAD", "SMLADX", "t2SMLAD", "t2SMLADX")>;
+
+  def SwiftP0P0P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 5;
+    let NumMicroOps = 3;
+    let ResourceCycles = [2, 1];
+  }
+  def SwiftWrite1Cycle : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite5Cycle : SchedWriteRes<[]> {
+    let Latency = 5;
+    let NumMicroOps = 0;
+  }
+  def SwiftWrite6Cycle : SchedWriteRes<[]> {
+    let Latency = 6;
+    let NumMicroOps = 0;
+  }
+
+  // 4.2.14 Integer Multiply, Long
+  def : InstRW< [SwiftP0P0P01FiveCycle, SwiftWrite5Cycle],
+        (instregex "SMULL$", "UMULL$", "t2SMULL$", "t2UMULL$")>;
+
+  def Swift2P03P01FiveCycle : SchedWriteRes<[SwiftUnitP0, SwiftUnitP01]> {
+    let Latency = 7;
+    let NumMicroOps = 5;
+    let ResourceCycles = [2, 3];
+  }
+
+  // 4.2.15 Integer Multiply Accumulate, Long
+  // 4.2.16 Integer Multiply Accumulate, Dual
+  // 4.2.17 Integer Multiply Accumulate Accumulate, Long
+  // We are being a bit inaccurate here.
+  def : InstRW< [SwiftWrite5Cycle, Swift2P03P01FiveCycle, ReadALU, ReadALU,
+                 SchedReadAdvance<4>, SchedReadAdvance<3>],
+        (instregex "SMLALS", "UMLALS", "SMLAL", "UMLAL", "MLALBB", "SMLALBT",
+        "SMLALTB", "SMLALTT", "SMLALD", "SMLALDX", "SMLSLD", "SMLSLDX",
+        "UMAAL", "t2SMLALS", "t2UMLALS", "t2SMLAL", "t2UMLAL", "t2MLALBB", "t2SMLALBT",
+        "t2SMLALTB", "t2SMLALTT", "t2SMLALD", "t2SMLALDX", "t2SMLSLD", "t2SMLSLDX",
+        "t2UMAAL")>;
+
+  def SwiftDiv : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 14;
+    let ResourceCycles = [1, 14];
+  }
+  // 4.2.18 Integer Divide
+  def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+  def : InstRW <[SwiftDiv],
+        (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+
+  // 4.2.19 Integer Load Single Element
+  // 4.2.20 Integer Load Signextended
+  def SwiftWriteP2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 4;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P01P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01,
+                                                   SwiftUnitP01]> {
+    let Latency = 4;
+    let NumMicroOps = 3;
+  }
+  def SwiftWriteP2P2ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2]> {
+    let Latency = 3;
+    let NumMicroOps = 2;
+  }
+  def SwiftWriteP2P2P01ThreeCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP2,
+                                                   SwiftUnitP01]> {
+    let Latency = 3;
+    let NumMicroOps = 3;
+  }
+  def SwiftWrBackOne : SchedWriteRes<[]> {
+    let Latency = 1;
+    let NumMicroOps = 0;
+  }
+  def SwiftWriteLdFour : SchedWriteRes<[]> {
+    let Latency = 4;
+    let NumMicroOps = 0;
+  }
+   // Not accurate.
+  def : InstRW<[SwiftWriteP2ThreeCycle],
+        (instregex "LDR(i12|rs)$", "LDRB(i12|rs)$", "t2LDR(i8|i12|s|pci)",
+        "t2LDR(H|B)(i8|i12|s|pci)", "LDREX", "tLDR[BH](r|i|spi|pci|pciASM)",
+        "tLDR(r|i|spi|pci|pciASM)")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle],
+        (instregex "LDRH$",  "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
+  def : InstRW<[SwiftWriteP2P01FourCyle],
+        (instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
+        "t2LDRpci_pic", "tLDRS(B|H)")>;
+  def : InstRW<[SwiftWriteP2P01ThreeCycle,  SwiftWrBackOne],
+        (instregex "LD(RB|R)(_|T_)(POST|PRE)_(IMM|REG)", "LDRH(_PRE|_POST)",
+        "LDR(T|BT)_POST_(REG|IMM)", "LDRHT(i|r)",
+        "t2LD(R|RB|RH)_(PRE|POST)", "t2LD(R|RB|RH)T")>;
+  def : InstRW<[SwiftWriteP2P01P01FourCycle, SwiftWrBackOne],
+        (instregex "LDR(SH|SB)(_POST|_PRE)", "t2LDR(SH|SB)(_POST|_PRE)",
+        "LDRS(B|H)T(i|r)", "t2LDRS(B|H)T(i|r)", "t2LDRS(B|H)T")>;
+
+  // 4.2.21 Integer Dual Load
+  // Not accurate.
+  def : InstRW<[SwiftWriteP2P2ThreeCycle, SwiftWriteLdFour],
+        (instregex "t2LDRDi8", "LDRD$")>;
+  def : InstRW<[SwiftWriteP2P2P01ThreeCycle, SwiftWriteLdFour, SwiftWrBackOne],
+        (instregex "LDRD_(POST|PRE)", "t2LDRD_(POST|PRE)")>;
+
+  // 4.2.22 Integer Load, Multiple
+  // NumReg = 1 .. 16
+  foreach Lat = 3-25 in {
+    def SwiftWriteLM#Lat#Cy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = Lat;
+    }
+    def SwiftWriteLM#Lat#CyNo : SchedWriteRes<[]> {
+      let Latency = Lat;
+      let NumMicroOps = 0;
+    }
+  }
+  // Predicate.
+  foreach NumAddr = 1-16 in {
+    def SwiftLMAddr#NumAddr#Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+  }
+  def SwiftWriteLDMAddrNoWB : SchedWriteRes<[SwiftUnitP01]> { let Latency = 0; }
+  def SwiftWriteLDMAddrWB : SchedWriteRes<[SwiftUnitP01, SwiftUnitP01]>;
+  def SwiftWriteLM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5Cy, SwiftWriteLM6Cy,
+                                SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM12Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM16Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18Cy]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteLM3Cy, SwiftWriteLM4Cy,
+                                SwiftWriteLM5CyNo, SwiftWriteLM6CyNo,
+                                SwiftWriteLM7CyNo, SwiftWriteLM8CyNo,
+                                SwiftWriteLM9CyNo, SwiftWriteLM10CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM12CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM15CyNo, SwiftWriteLM16CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo]>
+
+  ]> { let Variadic=1; }
+
+  def : InstRW<[SwiftWriteLM, SwiftWriteLDMAddrNoWB],
+        (instregex "LDM(IA|DA|DB|IB)$", "t2LDM(IA|DA|DB|IB)$",
+        "(t|sys)LDM(IA|DA|DB|IB)$")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM],
+        (instregex /*"t2LDMIA_RET", "tLDMIA_RET", "LDMIA_RET",*/
+        "LDM(IA|DA|DB|IB)_UPD", "(t2|sys|t)LDM(IA|DA|DB|IB)_UPD")>;
+  def : InstRW<[SwiftWriteLDMAddrWB, SwiftWriteLM, SwiftWriteP1TwoCycle],
+        (instregex "LDMIA_RET", "(t|t2)LDMIA_RET", "POP", "tPOP")>;
+  // 4.2.23 Integer Store, Single Element
+  def : InstRW<[SwiftWriteP2],
+        (instregex "PICSTR", "STR(i12|rs)", "STRB(i12|rs)", "STRH$", "STREX",
+        "t2STR(i12|i8|s)$", "t2STR[BH](i12|i8|s)$", "tSTR[BH](i|r)", "tSTR(i|r)", "tSTRspi")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2],
+        (instregex "STR(B_|_|BT_|T_)(PRE_IMM|PRE_REG|POST_REG|POST_IMM)",
+        "STR(i|r)_preidx", "STRB(i|r)_preidx", "STRH_preidx", "STR(H_|HT_)(PRE|POST)",
+        "STR(BT|HT|T)", "t2STR_(PRE|POST)", "t2STR[BH]_(PRE|POST)",
+        "t2STR_preidx", "t2STR[BH]_preidx", "t2ST(RB|RH|R)T")>;
+
+  // 4.2.24 Integer Store, Dual
+  def : InstRW<[SwiftWriteP2, SwiftWriteP2, SwiftWriteP01OneCycle],
+        (instregex "STRD$", "t2STRDi8")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteP2, SwiftWriteP2,
+                SwiftWriteP01OneCycle],
+        (instregex "(t2|t)STRD_(POST|PRE)", "STRD_(POST|PRE)")>;
+
+  // 4.2.25 Integer Store, Multiple
+  def SwiftWriteStIncAddr : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+    let Latency = 0;
+    let NumMicroOps = 2;
+  }
+  foreach NumAddr = 1-16 in {
+     def SwiftWriteSTM#NumAddr : WriteSequence<[SwiftWriteStIncAddr], NumAddr>;
+  }
+  def SwiftWriteSTM : SchedWriteVariant<[
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM2]>,
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM5]>,
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM7]>,
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM9]>,
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteSTM10]>,
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteSTM11]>,
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteSTM12]>,
+    SchedVar<SwiftLMAddr13Pred,[SwiftWriteSTM13]>,
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteSTM14]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteSTM15]>,
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteSTM16]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteSTM2]>
+  ]>;
+  def : InstRW<[SwiftWriteSTM],
+        (instregex "STM(IB|IA|DB|DA)$", "(t2|sys|t)STM(IB|IA|DB|DA)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWriteSTM],
+        (instregex "STM(IB|IA|DB|DA)_UPD", "(t2|sys|t)STM(IB|IA|DB|DA)_UPD",
+        "PUSH", "tPUSH")>;
+
+  // 4.2.26 Branch
+  def : WriteRes<WriteBr, [SwiftUnitP1]> { let Latency = 0; }
+  def : WriteRes<WriteBrL, [SwiftUnitP1]> { let Latency = 2; }
+  def : WriteRes<WriteBrTbl, [SwiftUnitP1, SwiftUnitP2]> { let Latency = 0; }
+
+  // 4.2.27 Not issued
+  def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
+  def : InstRW<[WriteNoop], (instregex "t2IT", "IT", "NOP")>;
+
+  // 4.2.28 Advanced SIMD, Integer, 2 cycle
+  def : InstRW<[SwiftWriteP0TwoCycle],
+        (instregex "VADDv", "VSUBv", "VNEG(s|f|v)", "VADDL", "VSUBL",
+                   "VADDW", "VSUBW", "VHADD", "VHSUB", "VRHADD", "VPADDi",
+                   "VPADDL", "VAND", "VBIC", "VEOR", "VORN", "VORR", "VTST",
+                   "VSHL", "VSHR(s|u)", "VSHLL", "VQSHL", "VQSHLU", "VBIF",
+                   "VBIT", "VBSL", "VSLI", "VSRI", "VCLS", "VCLZ", "VCNT")>;
+
+  def : InstRW<[SwiftWriteP1TwoCycle],
+        (instregex "VEXT", "VREV16", "VREV32", "VREV64")>;
+
+  // 4.2.29 Advanced SIMD, Integer, 4 cycle
+  // 4.2.30 Advanced SIMD, Integer with Accumulate
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VABA", "VABAL", "VPADAL", "VRSRA", "VSRA", "VACGE", "VACGT",
+        "VACLE", "VACLT", "VCEQ", "VCGE", "VCGT", "VCLE", "VCLT", "VRSHL",
+        "VQRSHL", "VRSHR(u|s)", "VABS(f|v)", "VQABS", "VQNEG", "VQADD",
+        "VQSUB")>;
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VRECPE", "VRSQRTE")>;
+
+  // 4.2.31 Advanced SIMD, Add and Shift with Narrow
+  def : InstRW<[SwiftWriteP0P1FourCycle],
+        (instregex "VADDHN", "VSUBHN", "VSHRN")>;
+  def : InstRW<[SwiftWriteP0P1SixCycle],
+        (instregex "VRADDHN", "VRSUBHN", "VRSHRN", "VQSHRN", "VQSHRUN",
+                   "VQRSHRN", "VQRSHRUN")>;
+
+  // 4.2.32 Advanced SIMD, Vector Table Lookup
+  foreach Num = 1-4 in {
+    def SwiftWrite#Num#xP1TwoCycle : WriteSequence<[SwiftWriteP1TwoCycle], Num>;
+  }
+  def : InstRW<[SwiftWrite1xP1TwoCycle],
+        (instregex "VTB(L|X)1")>;
+  def : InstRW<[SwiftWrite2xP1TwoCycle],
+        (instregex "VTB(L|X)2")>;
+  def : InstRW<[SwiftWrite3xP1TwoCycle],
+        (instregex "VTB(L|X)3")>;
+  def : InstRW<[SwiftWrite4xP1TwoCycle],
+        (instregex "VTB(L|X)4")>;
+
+  // 4.2.33 Advanced SIMD, Transpose
+  def : InstRW<[SwiftWriteP1FourCycle, SwiftWriteP1FourCycle,
+                SwiftWriteP1TwoCycle/*RsrcOnly*/, SchedReadAdvance<2>],
+        (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;
+
+  // 4.2.34 Advanced SIMD and VFP, Floating Point
+  def : InstRW<[SwiftWriteP0TwoCycle], (instregex "VABS(S|D)$", "VNEG(S|D)$")>;
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VCMP(D|S|ZD|ZS)$", "VCMPE(D|S|ZD|ZS)")>;
+  def : InstRW<[SwiftWriteP0FourCycle],
+        (instregex "VADD(S|f)", "VSUB(S|f)", "VABD", "VPADDf", "VMAX", "VMIN", "VPMAX",
+                   "VPMIN")>;
+  def : InstRW<[SwiftWriteP0SixCycle], (instregex "VADDD$", "VSUBD$")>;
+  def : InstRW<[SwiftWriteP1EightCycle], (instregex "VRECPS", "VRSQRTS")>;
+
+  // 4.2.35 Advanced SIMD and VFP, Multiply
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VMUL(S|v|p|f|s)", "VNMULS", "VQDMULH", "VQRDMULH",
+                   "VMULL", "VQDMULL")>;
+  def : InstRW<[SwiftWriteP1SixCycle],
+        (instregex "VMULD", "VNMULD")>;
+  def : InstRW<[SwiftWriteP1FourCycle],
+        (instregex "VMLA", "VMLS", "VNMLA", "VNMLS", "VFMA(S|D)", "VFMS(S|D)",
+        "VFNMA", "VFNMS", "VMLAL", "VMLSL","VQDMLAL", "VQDMLSL")>;
+  def : InstRW<[SwiftWriteP1EightCycle], (instregex "VFMAfd", "VFMSfd")>;
+  def : InstRW<[SwiftWriteP1TwelveCyc], (instregex "VFMAfq", "VFMSfq")>;
+
+  // 4.2.36 Advanced SIMD and VFP, Convert
+  def : InstRW<[SwiftWriteP1FourCycle], (instregex "VCVT", "V(S|U)IT", "VTO(S|U)")>;
+  // Fixpoint conversions.
+  def : WriteRes<WriteCvtFP, [SwiftUnitP1]> { let Latency = 4; }
+
+  // 4.2.37 Advanced SIMD and VFP, Move
+  def : InstRW<[SwiftWriteP0TwoCycle],
+        (instregex "VMOVv", "VMOV(S|D)$", "VMOV(S|D)cc",
+                   "VMVNv", "VMVN(d|q)", "VMVN(S|D)cc",
+                   "FCONST(D|S)")>;
+  def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VMOVN", "VMOVL")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP0FourCycle, SwiftWriteP1TwoCycle]>],
+        (instregex "VQMOVN")>;
+  def : InstRW<[SwiftWriteP1TwoCycle], (instregex "VDUPLN", "VDUPf")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>],
+        (instregex "VDUP(8|16|32)")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle], (instregex "VMOVRS$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP0TwoCycle]>],
+        (instregex "VMOVSR$", "VSETLN")>;
+  def : InstRW<[SwiftWriteP2ThreeCycle, SwiftWriteP2FourCycle],
+        (instregex "VMOVRR(D|S)$")>;
+  def : InstRW<[SwiftWriteP2FourCycle], (instregex "VMOVDRR$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP2FourCycle, SwiftWriteP1TwoCycle]>,
+                WriteSequence<[SwiftWrite1Cycle, SwiftWriteP2FourCycle,
+                               SwiftWriteP1TwoCycle]>],
+                (instregex "VMOVSRR$")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle]>],
+        (instregex "VGETLN(u|i)")>;
+  def : InstRW<[WriteSequence<[SwiftWriteP1TwoCycle, SwiftWriteP2ThreeCycle,
+                               SwiftWriteP01OneCycle]>],
+        (instregex "VGETLNs")>;
+
+  // 4.2.38 Advanced SIMD and VFP, Move FPSCR
+  // Serializing instructions.
+  def SwiftWaitP0For15Cy : SchedWriteRes<[SwiftUnitP0]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def SwiftWaitP1For15Cy : SchedWriteRes<[SwiftUnitP1]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def SwiftWaitP2For15Cy : SchedWriteRes<[SwiftUnitP2]> {
+    let Latency = 15;
+    let ResourceCycles = [15];
+  }
+  def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+        (instregex "VMRS")>;
+  def : InstRW<[SwiftWaitP0For15Cy, SwiftWaitP1For15Cy, SwiftWaitP2For15Cy],
+        (instregex "VMSR")>;
+  // Not serializing.
+  def : InstRW<[SwiftWriteP0TwoCycle], (instregex "FMSTAT")>;
+
+  // 4.2.39 Advanced SIMD and VFP, Load Single Element
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDRD$", "VLDRS$")>;
+
+  // 4.2.40 Advanced SIMD and VFP, Store Single Element
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VSTRD$", "VSTRS$")>;
+
+  // 4.2.41 Advanced SIMD and VFP, Load Multiple
+  // 4.2.42 Advanced SIMD and VFP, Store Multiple
+
+  // Resource requirement for permuting, just reserves the resources.
+  foreach Num = 1-28 in {
+    def SwiftVLDMPerm#Num : SchedWriteRes<[SwiftUnitP1]> {
+      let Latency = 0;
+      let NumMicroOps = Num;
+      let ResourceCycles = [Num];
+    }
+  }
+
+  // Pre RA pseudos - load/store to a Q register as a D register pair.
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLDMQIA$", "VSTMQIA$")>;
+
+  // Post RA not modelled accurately. We assume that register use of width 64
+  // bit maps to a D register, 128 maps to a Q register. Not all different kinds
+  // are accurately represented.
+  def SwiftWriteVLDM : SchedWriteVariant<[
+    // Load of one S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteLM4Cy]>,
+    // Load of one D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo]>,
+    // Load of 3 S register.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm3]>,
+    // Load of a Q register (not neccessarily true). We should not be mapping to
+    // 4 S registers, either.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteLM4Cy, SwiftWriteLM4CyNo,
+                                SwiftWriteLM4CyNo, SwiftWriteLM4CyNo]>,
+    // Load of 5 S registers.
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo,  SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm5]>,
+    // Load of 3 D registers. (Must also be able to handle s register list -
+    // though, not accurate)
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Load of 7 S registers.
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm7]>,
+    // Load of two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteP01OneCycle,  SwiftVLDMPerm2]>,
+    // Load of 9 S registers.
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 5 D registers.
+    SchedVar<SwiftLMAddr10Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
+    // Inaccurate: reuse describtion from 9 S registers.
+    SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of three Q registers.
+    SchedVar<SwiftLMAddr12Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
+    // Inaccurate: reuse describtion from 9 S registers.
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 7 D registers inaccurate.
+    SchedVar<SwiftLMAddr14Pred,[SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM10Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM14Cy, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteLM14CyNo,  SwiftWriteLM14CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm7]>,
+    SchedVar<SwiftLMAddr15Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM17Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM21CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM25CyNo, SwiftWriteP01OneCycle,
+                                SwiftVLDMPerm9]>,
+    // Load of 4 Q registers.
+    SchedVar<SwiftLMAddr16Pred,[SwiftWriteLM7Cy, SwiftWriteLM10Cy,
+                                SwiftWriteLM11Cy, SwiftWriteLM14Cy,
+                                SwiftWriteLM15Cy, SwiftWriteLM18CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteLM19CyNo, SwiftWriteLM22CyNo,
+                                SwiftWriteP01OneCycle, SwiftVLDMPerm4]>,
+    // Unknow number of registers, just use resources for two registers.
+    SchedVar<NoSchedPred,      [SwiftWriteLM7Cy, SwiftWriteLM8Cy,
+                                SwiftWriteLM13Cy, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteLM13CyNo, SwiftWriteLM13CyNo,
+                                SwiftWriteP01OneCycle,  SwiftVLDMPerm2]>
+  ]> { let Variadic = 1; }
+
+  def : InstRW<[SwiftWriteVLDM], (instregex "VLDM[SD](IA|DB)$")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVLDM],
+        (instregex "VLDM[SD](IA|DB)_UPD$")>;
+
+  def SwiftWriteVSTM : SchedWriteVariant<[
+    // One S register.
+    SchedVar<SwiftLMAddr1Pred, [SwiftWriteSTM1]>,
+    // One D register.
+    SchedVar<SwiftLMAddr2Pred, [SwiftWriteSTM1]>,
+    // Three S registers.
+    SchedVar<SwiftLMAddr3Pred, [SwiftWriteSTM4]>,
+    // Assume one Q register.
+    SchedVar<SwiftLMAddr4Pred, [SwiftWriteSTM1]>,
+    SchedVar<SwiftLMAddr5Pred, [SwiftWriteSTM6]>,
+    // Assume three D registers.
+    SchedVar<SwiftLMAddr6Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr7Pred, [SwiftWriteSTM8]>,
+    // Assume two Q registers.
+    SchedVar<SwiftLMAddr8Pred, [SwiftWriteSTM3]>,
+    SchedVar<SwiftLMAddr9Pred, [SwiftWriteSTM10]>,
+    // Assume 5 D registers.
+    SchedVar<SwiftLMAddr10Pred, [SwiftWriteSTM6]>,
+    SchedVar<SwiftLMAddr11Pred, [SwiftWriteSTM12]>,
+    // Asume three Q registers.
+    SchedVar<SwiftLMAddr12Pred, [SwiftWriteSTM4]>,
+    SchedVar<SwiftLMAddr13Pred, [SwiftWriteSTM14]>,
+    // Assume 7 D registers.
+    SchedVar<SwiftLMAddr14Pred, [SwiftWriteSTM8]>,
+    SchedVar<SwiftLMAddr15Pred, [SwiftWriteSTM16]>,
+    // Assume four Q registers.
+    SchedVar<SwiftLMAddr16Pred, [SwiftWriteSTM5]>,
+    // Asumme two Q registers.
+    SchedVar<NoSchedPred, [SwiftWriteSTM3]>
+  ]> { let Variadic = 1; }
+
+  def : InstRW<[SwiftWriteVSTM], (instregex "VSTM[SD](IA|DB)$")>;
+
+  def : InstRW<[SwiftWriteP01OneCycle2x, SwiftWriteVSTM],
+        (instregex "VSTM[SD](IA|DB)_UPD")>;
+
+  // 4.2.43 Advanced SIMD, Element or Structure Load and Store
+  def SwiftWrite2xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = 4;
+      let ResourceCycles = [2];
+  }
+  def SwiftWrite3xP2FourCy : SchedWriteRes<[SwiftUnitP2]> {
+      let Latency = 4;
+      let ResourceCycles = [3];
+  }
+  foreach Num = 1-2 in {
+    def SwiftExt#Num#xP0 : SchedWriteRes<[SwiftUnitP0]> {
+      let Latency = 0;
+      let NumMicroOps = Num;
+      let ResourceCycles = [Num];
+    }
+  }
+  // VLDx
+  // Multiple structures.
+  // Single element structure loads.
+  // We assume aligned.
+  // Single/two register.
+  def : InstRW<[SwiftWriteLM4Cy], (instregex "VLD1(d|q)(8|16|32|64)$")>;
+  def : InstRW<[SwiftWriteLM4Cy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)wb")>;
+  // Three register.
+  def : InstRW<[SwiftWrite3xP2FourCy],
+        (instregex "VLD1(d|q)(8|16|32|64)T$", "VLD1d64TPseudo")>;
+  def : InstRW<[SwiftWrite3xP2FourCy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)Twb")>;
+  /// Four Register.
+  def : InstRW<[SwiftWrite2xP2FourCy],
+        (instregex "VLD1(d|q)(8|16|32|64)Q$", "VLD1d64QPseudo")>;
+  def : InstRW<[SwiftWrite2xP2FourCy, SwiftWriteP01OneCycle],
+        (instregex "VLD1(d|q)(8|16|32|64)Qwb")>;
+  // Two element structure loads.
+  // Two/four register.
+  def : InstRW<[SwiftWriteLM9Cy, SwiftExt2xP0, SwiftVLDMPerm2],
+        (instregex "VLD2(d|q|b)(8|16|32)$", "VLD2q(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm2],
+        (instregex "VLD2(d|q|b)(8|16|32)wb", "VLD2q(8|16|32)PseudoWB")>;
+  // Three element structure.
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+                SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo$")>;
+
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteLM9CyNo, SwiftWriteLM9CyNo,
+                SwiftWriteP01OneCycle, SwiftVLDMPerm3, SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)_UPD$")>;
+  def : InstRW<[SwiftWriteLM9Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm3,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+  // Four element structure loads.
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                SwiftWriteLM11Cy, SwiftExt2xP0, SwiftVLDMPerm4,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteLM11Cy,  SwiftExt2xP0, SwiftVLDMPerm4,
+                SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteLM11Cy, SwiftWriteLM11Cy,
+                SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+        (instregex "VLD4(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM11Cy, SwiftWriteP01OneCycle, SwiftExt2xP0,
+                SwiftVLDMPerm4, SwiftWrite3xP2FourCy],
+        (instregex  "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>;
+
+  // Single all/lane loads.
+  // One element structure.
+  def : InstRW<[SwiftWriteLM6Cy, SwiftVLDMPerm2],
+        (instregex "VLD1(LN|DUP)(d|q)(8|16|32)$", "VLD1(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftVLDMPerm2],
+        (instregex "VLD1(LN|DUP)(d|q)(8|16|32)(wb|_UPD)",
+                  "VLD1LNq(8|16|32)Pseudo_UPD")>;
+  // Two element structure.
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2(DUP|LN)(d|q)(8|16|32|8x2|16x2|32x2)$",
+                   "VLD2LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteLM6Cy, SwiftWriteP01OneCycle,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2LN(d|q)(8|16|32)_UPD$")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2DUPd(8|16|32|8x2|16x2|32x2)wb")>;
+  def : InstRW<[SwiftWriteLM6Cy, SwiftWriteP01OneCycle, SwiftWriteLM6Cy,
+                SwiftExt1xP0, SwiftVLDMPerm2],
+        (instregex "VLD2LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Three element structure.
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy, SwiftExt1xP0,
+                SwiftVLDMPerm3],
+        (instregex "VLD3(DUP|LN)(d|q)(8|16|32)$",
+                   "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteLM8Cy, SwiftWriteLM8Cy,
+                SwiftWriteP01OneCycle, SwiftExt1xP0, SwiftVLDMPerm3],
+        (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
+                SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
+        (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
+  // Four element struture.
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
+        (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
+                   "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
+                SwiftWriteLM10CyNo, SwiftWriteP01OneCycle, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+        (instregex "VLD4(DUP|LN)(d|q)(8|16|32)_UPD")>;
+  def : InstRW<[SwiftWriteLM8Cy, SwiftWriteP01OneCycle, SwiftWriteLM9Cy,
+                SwiftWriteLM10CyNo, SwiftWriteLM10CyNo, SwiftExt1xP0,
+                SwiftVLDMPerm5],
+        (instregex "VLD4(DUP|LN)(d|q)(8|16|32)Pseudo_UPD")>;
+  // VSTx
+  // Multiple structures.
+  // Single element structure store.
+  def : InstRW<[SwiftWrite1xP2], (instregex "VST1d(8|16|32|64)$")>;
+  def : InstRW<[SwiftWrite2xP2], (instregex "VST1q(8|16|32|64)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2],
+        (instregex "VST1d(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2],
+        (instregex "VST1q(8|16|32|64)wb")>;
+  def : InstRW<[SwiftWrite3xP2],
+        (instregex "VST1d(8|16|32|64)T$", "VST1d64TPseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite3xP2],
+        (instregex "VST1d(8|16|32|64)Twb", "VST1d64TPseudoWB")>;
+  def : InstRW<[SwiftWrite4xP2],
+        (instregex "VST1d(8|16|32|64)(Q|QPseudo)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2],
+        (instregex "VST1d(8|16|32|64)(Qwb|QPseudoWB)")>;
+  // Two element structure store.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST2(d|b)(8|16|32)$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST2(b|d)(8|16|32)wb")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST2q(8|16|32)$", "VST2q(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST2q(8|16|32)wb", "VST2q(8|16|32)PseudoWB")>;
+  // Three element structure store.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3(d|q)(8|16|32)$", "VST3(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3(d|q)(8|16|32)_UPD",
+                   "VST3(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+  // Four element structure store.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST4(d|q)(8|16|32)$", "VST4(d|q)(8|16|32)(oddP|P)seudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm4],
+        (instregex "VST4(d|q)(8|16|32)_UPD",
+                   "VST4(d|q)(8|16|32)(oddP|P)seudo_UPD$")>;
+  // Single/all lane store.
+  // One element structure.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST1LNd(8|16|32)$", "VST1LNq(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm1],
+        (instregex "VST1LNd(8|16|32)_UPD", "VST1LNq(8|16|32)Pseudo_UPD")>;
+  // Two element structure.
+  def : InstRW<[SwiftWrite1xP2, SwiftVLDMPerm2],
+        (instregex "VST2LN(d|q)(8|16|32)$", "VST2LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite1xP2, SwiftVLDMPerm2],
+        (instregex "VST2LN(d|q)(8|16|32)_UPD",
+                   "VST2LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Three element structure.
+  def : InstRW<[SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3LN(d|q)(8|16|32)$", "VST3LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite4xP2, SwiftVLDMPerm2],
+        (instregex "VST3LN(d|q)(8|16|32)_UPD",
+                   "VST3LN(d|q)(8|16|32)Pseudo_UPD")>;
+  // Four element structure.
+  def : InstRW<[SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST4LN(d|q)(8|16|32)$", "VST4LN(d|q)(8|16|32)Pseudo$")>;
+  def : InstRW<[SwiftWriteP01OneCycle, SwiftWrite2xP2, SwiftVLDMPerm2],
+        (instregex "VST4LN(d|q)(8|16|32)_UPD",
+                   "VST4LN(d|q)(8|16|32)Pseudo_UPD")>;
+
+  // 4.2.44 VFP, Divide and Square Root
+  def SwiftDiv17 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 17;
+    let ResourceCycles = [1, 15];
+  }
+  def SwiftDiv32 : SchedWriteRes<[SwiftUnitP0, SwiftUnitDiv]> {
+    let NumMicroOps = 1;
+    let Latency = 32;
+    let ResourceCycles = [1, 30];
+  }
+  def : InstRW<[SwiftDiv17], (instregex "VDIVS", "VSQRTS")>;
+  def : InstRW<[SwiftDiv32], (instregex "VDIVD", "VSQRTD")>;
+
+  // Not specified.
+  def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>;
+  // Preload.
+  def : WriteRes<WritePreLd, [SwiftUnitP2]> { let Latency = 0;
+    let ResourceCycles = [0];
+  }
+
 }
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 41a7e0c..93add6e 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -26,7 +26,7 @@ ARMSelectionDAGInfo::~ARMSelectionDAGInfo() {
 }
 
 SDValue
-ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                              SDValue Chain,
                                              SDValue Dst, SDValue Src,
                                              SDValue Size, unsigned Align,
@@ -140,7 +140,7 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
 // GNU library uses (ptr, value, size)
 // See RTABI section 4.3.4
 SDValue ARMSelectionDAGInfo::
-EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                         SDValue Chain, SDValue Dst,
                         SDValue Src, SDValue Size,
                         unsigned Align, bool isVolatile,
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.h b/lib/Target/ARM/ARMSelectionDAGInfo.h
index 6419a73..56c9375 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -45,7 +45,7 @@ public:
   ~ARMSelectionDAGInfo();
 
   virtual
-  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
                                   SDValue Dst, SDValue Src,
                                   SDValue Size, unsigned Align,
@@ -55,7 +55,7 @@ public:
 
   // Adjust parameters for memset, see RTABI section 4.3.4
   virtual
-  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
                                   SDValue Op1, SDValue Op2,
                                   SDValue Op3, unsigned Align,
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 8653c46..a116298 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -32,20 +32,53 @@ ReserveR9("arm-reserve-r9", cl::Hidden,
           cl::desc("Reserve R9, making it unavailable as GPR"));
 
 static cl::opt<bool>
-DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden);
+ArmUseMOVT("arm-use-movt", cl::init(true), cl::Hidden);
 
 static cl::opt<bool>
 UseFusedMulOps("arm-use-mulops",
                cl::init(true), cl::Hidden);
 
-static cl::opt<bool>
-StrictAlign("arm-strict-align", cl::Hidden,
-            cl::desc("Disallow all unaligned memory accesses"));
+enum AlignMode {
+  DefaultAlign,
+  StrictAlign,
+  NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+      cl::Hidden, cl::init(DefaultAlign),
+      cl::values(
+          clEnumValN(DefaultAlign,  "arm-default-align",
+                     "Generate unaligned accesses only on hardware/OS "
+                     "combinations that are known to support them"),
+          clEnumValN(StrictAlign,   "arm-strict-align",
+                     "Disallow all unaligned memory accesses"),
+          clEnumValN(NoStrictAlign, "arm-no-strict-align",
+                     "Allow unaligned memory accesses"),
+          clEnumValEnd));
+
+enum ITMode {
+  DefaultIT,
+  RestrictedIT,
+  NoRestrictedIT
+};
+
+static cl::opt<ITMode>
+IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT),
+   cl::ZeroOrMore,
+   cl::values(clEnumValN(DefaultIT, "arm-default-it",
+                         "Generate IT block based on arch"),
+              clEnumValN(RestrictedIT, "arm-restrict-it",
+                         "Disallow deprecated IT based on ARMv8"),
+              clEnumValN(NoRestrictedIT, "arm-no-restrict-it",
+                         "Allow IT blocks based on ARMv7"),
+              clEnumValEnd));
 
 ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
                            const std::string &FS, const TargetOptions &Options)
   : ARMGenSubtargetInfo(TT, CPU, FS)
   , ARMProcFamily(Others)
+  , ARMProcClass(None)
   , stackAlignment(4)
   , CPUString(CPU)
   , TargetTriple(TT)
@@ -60,11 +93,14 @@ void ARMSubtarget::initializeEnvironment() {
   HasV5TOps = false;
   HasV5TEOps = false;
   HasV6Ops = false;
+  HasV6MOps = false;
   HasV6T2Ops = false;
   HasV7Ops = false;
+  HasV8Ops = false;
   HasVFPv2 = false;
   HasVFPv3 = false;
   HasVFPv4 = false;
+  HasFPARMv8 = false;
   HasNEON = false;
   UseNEONForSinglePrecisionFP = false;
   UseMulOps = UseFusedMulOps;
@@ -73,7 +109,6 @@ void ARMSubtarget::initializeEnvironment() {
   SlowFPBrcc = false;
   InThumbMode = false;
   HasThumb2 = false;
-  IsMClass = false;
   NoARM = false;
   PostRAScheduler = false;
   IsR9Reserved = ReserveR9;
@@ -90,8 +125,12 @@ void ARMSubtarget::initializeEnvironment() {
   AvoidMOVsShifterOperand = false;
   HasRAS = false;
   HasMPExtension = false;
+  HasVirtualization = false;
   FPOnlySP = false;
+  HasPerfMon = false;
   HasTrustZone = false;
+  HasCrypto = false;
+  HasCRC = false;
   AllowsUnalignedMem = false;
   Thumb2DSP = false;
   UseNaClTrap = false;
@@ -115,8 +154,13 @@ void ARMSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
 }
 
 void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
-  if (CPUString.empty())
-    CPUString = "generic";
+  if (CPUString.empty()) {
+    if (isTargetIOS() && TargetTriple.getArchName().endswith("v7s"))
+      // Default to the Swift CPU when targeting armv7s/thumbv7s.
+      CPUString = "swift";
+    else
+      CPUString = "generic";
+  }
 
   // Insert the architecture feature derived from the target triple into the
   // feature string. This is important for setting features that are implied
@@ -134,7 +178,7 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   // Thumb2 implies at least V6T2. FIXME: Fix tests to explicitly specify a
   // ARM version or CPU and then remove this.
   if (!HasV6T2Ops && hasThumb2())
-    HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6T2Ops = true;
+    HasV4TOps = HasV5TOps = HasV5TEOps = HasV6Ops = HasV6MOps = HasV6T2Ops = true;
 
   // Keep a pointer to static instruction cost data for the specified CPU.
   SchedModel = getSchedModelForCPU(CPUString);
@@ -151,21 +195,56 @@ void ARMSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   if (isAAPCS_ABI())
     stackAlignment = 8;
 
-  if (!isTargetIOS())
-    UseMovt = hasV6T2Ops();
-  else {
+  UseMovt = hasV6T2Ops() && ArmUseMOVT;
+
+  if (!isTargetIOS()) {
+    IsR9Reserved = ReserveR9;
+  } else {
     IsR9Reserved = ReserveR9 | !HasV6Ops;
-    UseMovt = DarwinUseMOVT && hasV6T2Ops();
     SupportsTailCall = !getTargetTriple().isOSVersionLT(5, 0);
   }
 
   if (!isThumb() || hasThumb2())
     PostRAScheduler = true;
 
-  // v6+ may or may not support unaligned mem access depending on the system
-  // configuration.
-  if (!StrictAlign && hasV6Ops() && isTargetDarwin())
-    AllowsUnalignedMem = true;
+  switch (Align) {
+    case DefaultAlign:
+      // Assume pre-ARMv6 doesn't support unaligned accesses.
+      //
+      // ARMv6 may or may not support unaligned accesses depending on the
+      // SCTLR.U bit, which is architecture-specific. We assume ARMv6
+      // Darwin targets support unaligned accesses, and others don't.
+      //
+      // ARMv7 always has SCTLR.U set to 1, but it has a new SCTLR.A bit
+      // which raises an alignment fault on unaligned accesses. Linux
+      // defaults this bit to 0 and handles it as a system-wide (not
+      // per-process) setting. It is therefore safe to assume that ARMv7+
+      // Linux targets support unaligned accesses. The same goes for NaCl.
+      //
+      // The above behavior is consistent with GCC.
+      AllowsUnalignedMem = (
+          (hasV7Ops() && (isTargetLinux() || isTargetNaCl())) ||
+          (hasV6Ops() && isTargetDarwin()));
+      break;
+    case StrictAlign:
+      AllowsUnalignedMem = false;
+      break;
+    case NoStrictAlign:
+      AllowsUnalignedMem = true;
+      break;
+  }
+
+  switch (IT) {
+  case DefaultIT:
+    RestrictIT = hasV8Ops() ? true : false;
+    break;
+  case RestrictedIT:
+    RestrictIT = true;
+    break;
+  case NoRestrictedIT:
+    RestrictIT = false;
+    break;
+  }
 
   // NEON f32 ops are non-IEEE 754 compliant. Darwin is ok with it by default.
   uint64_t Bits = getFeatureBits();
@@ -231,12 +310,15 @@ unsigned ARMSubtarget::getMispredictionPenalty() const {
   return SchedModel->MispredictPenalty;
 }
 
+bool ARMSubtarget::hasSinCos() const {
+  return getTargetTriple().getOS() == Triple::IOS &&
+    !getTargetTriple().isOSVersionLT(7, 0);
+}
+
 bool ARMSubtarget::enablePostRAScheduler(
            CodeGenOpt::Level OptLevel,
            TargetSubtargetInfo::AntiDepBreakMode& Mode,
            RegClassVector& CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
-  CriticalPathRCs.clear();
-  CriticalPathRCs.push_back(&ARM::GPRRegClass);
+  Mode = TargetSubtargetInfo::ANTIDEP_NONE;
   return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
 }
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 038eb76..5276901 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -31,26 +31,36 @@ class TargetOptions;
 class ARMSubtarget : public ARMGenSubtargetInfo {
 protected:
   enum ARMProcFamilyEnum {
-    Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift
+    Others, CortexA5, CortexA8, CortexA9, CortexA15, CortexR5, Swift, CortexA53, CortexA57
+  };
+  enum ARMProcClassEnum {
+    None, AClass, RClass, MClass
   };
 
   /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
   ARMProcFamilyEnum ARMProcFamily;
 
-  /// HasV4TOps, HasV5TOps, HasV5TEOps, HasV6Ops, HasV6T2Ops, HasV7Ops -
+  /// ARMProcClass - ARM processor class: None, AClass, RClass or MClass.
+  ARMProcClassEnum ARMProcClass;
+
+  /// HasV4TOps, HasV5TOps, HasV5TEOps,
+  /// HasV6Ops, HasV6MOps, HasV6T2Ops, HasV7Ops, HasV8Ops -
   /// Specify whether target support specific ARM ISA variants.
   bool HasV4TOps;
   bool HasV5TOps;
   bool HasV5TEOps;
   bool HasV6Ops;
+  bool HasV6MOps;
   bool HasV6T2Ops;
   bool HasV7Ops;
+  bool HasV8Ops;
 
-  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON - Specify what
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
   /// floating point ISAs are supported.
   bool HasVFPv2;
   bool HasVFPv3;
   bool HasVFPv4;
+  bool HasFPARMv8;
   bool HasNEON;
 
   /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
@@ -79,10 +89,6 @@ protected:
   /// HasThumb2 - True if Thumb2 instructions are supported.
   bool HasThumb2;
 
-  /// IsMClass - True if the subtarget belongs to the 'M' profile of CPUs -
-  /// v6m, v7m for example.
-  bool IsMClass;
-
   /// NoARM - True if subtarget does not support ARM mode execution.
   bool NoARM;
 
@@ -144,18 +150,37 @@ protected:
   /// extension (ARMv7 only).
   bool HasMPExtension;
 
+  /// HasVirtualization - True if the subtarget supports the Virtualization
+  /// extension.
+  bool HasVirtualization;
+
   /// FPOnlySP - If true, the floating point unit only supports single
   /// precision.
   bool FPOnlySP;
 
+  /// If true, the processor supports the Performance Monitor Extensions. These
+  /// include a generic cycle-counter as well as more fine-grained (often
+  /// implementation-specific) events.
+  bool HasPerfMon;
+
   /// HasTrustZone - if true, processor supports TrustZone security extensions
   bool HasTrustZone;
 
+  /// HasCrypto - if true, processor supports Cryptography extensions
+  bool HasCrypto;
+
+  /// HasCRC - if true, processor supports CRC instructions
+  bool HasCRC;
+
   /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
   /// accesses for some types.  For details, see
   /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
   bool AllowsUnalignedMem;
 
+  /// RestrictIT - If true, the subtarget disallows generation of deprecated IT
+  ///  blocks to conform to ARMv8 rule.
+  bool RestrictIT;
+
   /// Thumb2DSP - If true, the subtarget supports the v7 DSP (saturating arith
   /// and such) instructions in Thumb2 code.
   bool Thumb2DSP;
@@ -187,10 +212,6 @@ protected:
 
  public:
   enum {
-    isELF, isDarwin
-  } TargetType;
-
-  enum {
     ARM_ABI_APCS,
     ARM_ABI_AAPCS // ARM EABI
   } TargetABI;
@@ -224,8 +245,10 @@ public:
   bool hasV5TOps()  const { return HasV5TOps;  }
   bool hasV5TEOps() const { return HasV5TEOps; }
   bool hasV6Ops()   const { return HasV6Ops;   }
+  bool hasV6MOps()  const { return HasV6MOps;  }
   bool hasV6T2Ops() const { return HasV6T2Ops; }
   bool hasV7Ops()   const { return HasV7Ops;  }
+  bool hasV8Ops()   const { return HasV8Ops;  }
 
   bool isCortexA5() const { return ARMProcFamily == CortexA5; }
   bool isCortexA8() const { return ARMProcFamily == CortexA8; }
@@ -241,7 +264,11 @@ public:
   bool hasVFP2() const { return HasVFPv2; }
   bool hasVFP3() const { return HasVFPv3; }
   bool hasVFP4() const { return HasVFPv4; }
+  bool hasFPARMv8() const { return HasFPARMv8; }
   bool hasNEON() const { return HasNEON;  }
+  bool hasCrypto() const { return HasCrypto; }
+  bool hasCRC() const { return HasCRC; }
+  bool hasVirtualization() const { return HasVirtualization; }
   bool useNEONForSinglePrecisionFP() const {
     return hasNEON() && UseNEONForSinglePrecisionFP; }
 
@@ -249,11 +276,15 @@ public:
   bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
   bool hasT2ExtractPack() const { return HasT2ExtractPack; }
   bool hasDataBarrier() const { return HasDataBarrier; }
+  bool hasAnyDataBarrier() const {
+    return HasDataBarrier || (hasV6Ops() && !isThumb());
+  }
   bool useMulOps() const { return UseMulOps; }
   bool useFPVMLx() const { return !SlowFPVMLx; }
   bool hasVMLxForwarding() const { return HasVMLxForwarding; }
   bool isFPBrccSlow() const { return SlowFPBrcc; }
   bool isFPOnlySP() const { return FPOnlySP; }
+  bool hasPerfMon() const { return HasPerfMon; }
   bool hasTrustZone() const { return HasTrustZone; }
   bool prefers32BitThumb() const { return Pref32BitThumb; }
   bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
@@ -268,12 +299,19 @@ public:
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
-  bool isTargetIOS() const { return TargetTriple.getOS() == Triple::IOS; }
+  bool isTargetIOS() const { return TargetTriple.isiOS(); }
   bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
-  bool isTargetNaCl() const {
-    return TargetTriple.getOS() == Triple::NaCl;
-  }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
   bool isTargetELF() const { return !isTargetDarwin(); }
+  // ARM EABI is the bare-metal EABI described in ARM ABI documents and
+  // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
+  // FIXME: Add a flag for bare-metal for that target and set Triple::EABI
+  // even for GNUEABI, so we can make a distinction here and still conform to
+  // the EABI on GNU (and Android) mode. This requires change in Clang, too.
+  bool isTargetAEABI() const {
+    return TargetTriple.getEnvironment() == Triple::EABI;
+  }
 
   bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
   bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
@@ -282,8 +320,9 @@ public:
   bool isThumb1Only() const { return InThumbMode && !HasThumb2; }
   bool isThumb2() const { return InThumbMode && HasThumb2; }
   bool hasThumb2() const { return HasThumb2; }
-  bool isMClass() const { return IsMClass; }
-  bool isARClass() const { return !IsMClass; }
+  bool isMClass() const { return ARMProcClass == MClass; }
+  bool isRClass() const { return ARMProcClass == RClass; }
+  bool isAClass() const { return ARMProcClass == AClass; }
 
   bool isR9Reserved() const { return IsR9Reserved; }
 
@@ -292,9 +331,15 @@ public:
 
   bool allowsUnalignedMem() const { return AllowsUnalignedMem; }
 
+  bool restrictIT() const { return RestrictIT; }
+
   const std::string & getCPUString() const { return CPUString; }
 
   unsigned getMispredictionPenalty() const;
+  
+  /// This function returns true if the target has sincos() routine in its
+  /// compiler runtime or math libraries.
+  bool hasSinCos() const;
 
   /// enablePostRAScheduler - True at 'More' optimization.
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 42c7d2c..c2bf788 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -60,7 +60,7 @@ void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
   // Add first the target-independent BasicTTI pass, then our ARM pass. This
   // allows the ARM pass to delegate to the target independent layer when
   // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createBasicTargetTransformInfoPass(this));
   PM.add(createARMTargetTransformInfoPass(this));
 }
 
@@ -85,6 +85,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
     TLInfo(*this),
     TSInfo(*this),
     FrameLowering(Subtarget) {
+  initAsmInfo();
   if (!Subtarget.hasARMOps())
     report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
                        "support ARM mode execution!");
@@ -117,6 +118,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
     FrameLowering(Subtarget.hasThumb2()
               ? new ARMFrameLowering(Subtarget)
               : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
+  initAsmInfo();
 }
 
 namespace {
@@ -148,7 +150,7 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
 
 bool ARMPassConfig::addPreISel() {
   if (TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge)
-    addPass(createGlobalMergePass(TM->getTargetLowering()));
+    addPass(createGlobalMergePass(TM));
 
   return false;
 }
@@ -167,7 +169,7 @@ bool ARMPassConfig::addPreRegAlloc() {
   // FIXME: temporarily disabling load / store optimization pass for Thumb1.
   if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only())
     addPass(createARMLoadStoreOptimizationPass(true));
-  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9())
+  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
     addPass(createMLxExpansionPass());
   // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
   // enabled when NEON is available.
@@ -194,8 +196,13 @@ bool ARMPassConfig::addPreSched2() {
   addPass(createARMExpandPseudoPass());
 
   if (getOptLevel() != CodeGenOpt::None) {
-    if (!getARMSubtarget().isThumb1Only())
+    if (!getARMSubtarget().isThumb1Only()) {
+      // in v8, IfConversion depends on Thumb instruction widths
+      if (getARMSubtarget().restrictIT() &&
+          !getARMSubtarget().prefers32BitThumb())
+        addPass(createThumb2SizeReductionPass());
       addPass(&IfConverterID);
+    }
   }
   if (getARMSubtarget().isThumb2())
     addPass(createThumb2ITBlockPass());
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index dfdf6ab..7ec71b2 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -47,7 +47,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
                         MCStreamer &Streamer) const {
   assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
 
-  return MCSymbolRefExpr::Create(Mang->getSymbol(GV),
+  return MCSymbolRefExpr::Create(getSymbol(*Mang, GV),
                                  MCSymbolRefExpr::VK_ARM_TARGET2,
                                  getContext());
 }
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 53ece66..6bbb38f 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -124,11 +124,14 @@ public:
 
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
 
-  unsigned getAddressComputationCost(Type *Val) const;
+  unsigned getAddressComputationCost(Type *Val, bool IsComplex) const;
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   OperandValueKind Op1Info = OK_AnyValue,
                                   OperandValueKind Op2Info = OK_AnyValue) const;
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const;
   /// @}
 };
 
@@ -182,7 +185,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   assert(ISD && "Invalid opcode");
 
   // Single to/from double precision conversions.
-  static const CostTblEntry<MVT> NEONFltDblTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> NEONFltDblTbl[] = {
     // Vector fptrunc/fpext conversions.
     { ISD::FP_ROUND,   MVT::v2f64, 2 },
     { ISD::FP_EXTEND,  MVT::v2f32, 2 },
@@ -192,8 +195,7 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
                                           ISD == ISD::FP_EXTEND)) {
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
-    int Idx = CostTableLookup<MVT>(NEONFltDblTbl, array_lengthof(NEONFltDblTbl),
-                                ISD, LT.second);
+    int Idx = CostTableLookup(NEONFltDblTbl, ISD, LT.second);
     if (Idx != -1)
       return LT.first * NEONFltDblTbl[Idx].Cost;
   }
@@ -207,7 +209,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   // Some arithmetic, load and store operations have specific instructions
   // to cast up/down their types automatically at no extra cost.
   // TODO: Get these tables to know at least what the related operations are.
-  static const TypeConversionCostTblEntry<MVT> NEONVectorConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONVectorConversionTbl[] = {
     { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
     { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
     { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
@@ -283,15 +286,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   if (SrcTy.isVector() && ST->hasNEON()) {
-    int Idx = ConvertCostTableLookup<MVT>(NEONVectorConversionTbl,
-                                array_lengthof(NEONVectorConversionTbl),
-                                ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
       return NEONVectorConversionTbl[Idx].Cost;
   }
 
   // Scalar float to integer conversions.
-  static const TypeConversionCostTblEntry<MVT> NEONFloatConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONFloatConversionTbl[] = {
     { ISD::FP_TO_SINT,  MVT::i1, MVT::f32, 2 },
     { ISD::FP_TO_UINT,  MVT::i1, MVT::f32, 2 },
     { ISD::FP_TO_SINT,  MVT::i1, MVT::f64, 2 },
@@ -314,16 +317,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
     { ISD::FP_TO_UINT,  MVT::i64, MVT::f64, 10 }
   };
   if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
-    int Idx = ConvertCostTableLookup<MVT>(NEONFloatConversionTbl,
-                                        array_lengthof(NEONFloatConversionTbl),
-                                        ISD, DstTy.getSimpleVT(),
-                                        SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
         return NEONFloatConversionTbl[Idx].Cost;
   }
 
   // Scalar integer to float conversions.
-  static const TypeConversionCostTblEntry<MVT> NEONIntegerConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  NEONIntegerConversionTbl[] = {
     { ISD::SINT_TO_FP,  MVT::f32, MVT::i1, 2 },
     { ISD::UINT_TO_FP,  MVT::f32, MVT::i1, 2 },
     { ISD::SINT_TO_FP,  MVT::f64, MVT::i1, 2 },
@@ -347,16 +349,15 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   if (SrcTy.isInteger() && ST->hasNEON()) {
-    int Idx = ConvertCostTableLookup<MVT>(NEONIntegerConversionTbl,
-                                       array_lengthof(NEONIntegerConversionTbl),
-                                       ISD, DstTy.getSimpleVT(),
-                                       SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(NEONIntegerConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
       return NEONIntegerConversionTbl[Idx].Cost;
   }
 
   // Scalar integer conversion costs.
-  static const TypeConversionCostTblEntry<MVT> ARMIntegerConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  ARMIntegerConversionTbl[] = {
     // i16 -> i64 requires two dependent operations.
     { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
 
@@ -368,11 +369,8 @@ unsigned ARMTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
   };
 
   if (SrcTy.isInteger()) {
-    int Idx =
-      ConvertCostTableLookup<MVT>(ARMIntegerConversionTbl,
-                                  array_lengthof(ARMIntegerConversionTbl),
-                                  ISD, DstTy.getSimpleVT(),
-                                  SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
+                                     DstTy.getSimpleVT(), SrcTy.getSimpleVT());
     if (Idx != -1)
       return ARMIntegerConversionTbl[Idx].Cost;
   }
@@ -400,7 +398,8 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   // On NEON a a vector select gets lowered to vbsl.
   if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
     // Lowering of some vector selects is currently far from perfect.
-    static const TypeConversionCostTblEntry<MVT> NEONVectorSelectTbl[] = {
+    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    NEONVectorSelectTbl[] = {
       { ISD::SELECT, MVT::v16i1, MVT::v16i16, 2*16 + 1 + 3*1 + 4*1 },
       { ISD::SELECT, MVT::v8i1, MVT::v8i32, 4*8 + 1*3 + 1*4 + 1*2 },
       { ISD::SELECT, MVT::v16i1, MVT::v16i32, 4*16 + 1*6 + 1*8 + 1*4 },
@@ -411,12 +410,13 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 
     EVT SelCondTy = TLI->getValueType(CondTy);
     EVT SelValTy = TLI->getValueType(ValTy);
-    int Idx = ConvertCostTableLookup<MVT>(NEONVectorSelectTbl,
-                                          array_lengthof(NEONVectorSelectTbl),
-                                          ISD, SelCondTy.getSimpleVT(),
-                                          SelValTy.getSimpleVT());
-    if (Idx != -1)
-      return NEONVectorSelectTbl[Idx].Cost;
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      int Idx = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
+                                       SelCondTy.getSimpleVT(),
+                                       SelValTy.getSimpleVT());
+      if (Idx != -1)
+        return NEONVectorSelectTbl[Idx].Cost;
+    }
 
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
     return LT.first;
@@ -425,7 +425,16 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
   // In many cases the address computation is not merged into the instruction
   // addressing mode.
   return 1;
@@ -437,7 +446,7 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
   if (Kind != SK_Reverse)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
-  static const CostTblEntry<MVT> NEONShuffleTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> NEONShuffleTbl[] = {
     // Reverse shuffle cost one instruction if we are shuffling within a double
     // word (vrev) or two if we shuffle a quad word (vrev, vext).
     { ISD::VECTOR_SHUFFLE, MVT::v2i32, 1 },
@@ -453,8 +462,7 @@ unsigned ARMTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
 
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
 
-  int Idx = CostTableLookup<MVT>(NEONShuffleTbl, array_lengthof(NEONShuffleTbl),
-                                 ISD::VECTOR_SHUFFLE, LT.second);
+  int Idx = CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second);
   if (Idx == -1)
     return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
 
@@ -469,7 +477,7 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueK
 
   const unsigned FunctionCallDivCost = 20;
   const unsigned ReciprocalDivCost = 10;
-  static const CostTblEntry<MVT> CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> CostTbl[] = {
     // Division.
     // These costs are somewhat random. Choose a cost of 20 to indicate that
     // vectorizing devision (added function call) is going to be very expensive.
@@ -513,14 +521,37 @@ unsigned ARMTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueK
   int Idx = -1;
 
   if (ST->hasNEON())
-    Idx = CostTableLookup<MVT>(CostTbl, array_lengthof(CostTbl), ISDOpcode,
-                               LT.second);
+    Idx = CostTableLookup(CostTbl, ISDOpcode, LT.second);
 
   if (Idx != -1)
     return LT.first * CostTbl[Idx].Cost;
 
-
-  return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info,
-                                                     Op2Info);
+  unsigned Cost =
+      TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+
+  // This is somewhat of a hack. The problem that we are facing is that SROA
+  // creates a sequence of shift, and, or instructions to construct values.
+  // These sequences are recognized by the ISel and have zero-cost. Not so for
+  // the vectorized code. Because we have support for v2i64 but not i64 those
+  // sequences look particularily beneficial to vectorize.
+  // To work around this we increase the cost of v2i64 operations to make them
+  // seem less beneficial.
+  if (LT.second == MVT::v2i64 &&
+      Op2Info == TargetTransformInfo::OK_UniformConstantValue)
+    Cost += 4;
+
+  return Cost;
 }
 
+unsigned ARMTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                                 unsigned AddressSpace) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  if (Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isDoubleTy()) {
+    // Unaligned loads/stores are extremely inefficient.
+    // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+    return LT.first * 4;
+  }
+  return LT.first;
+}
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 1dd2953..e3f9e0d 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -7,6 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ARMBuildAttrs.h"
+#include "ARMFPUName.h"
+#include "ARMFeatures.h"
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
@@ -24,6 +27,7 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -47,11 +51,33 @@ enum VectorLaneTy { NoLanes, AllLanes, IndexedLane };
 class ARMAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  const MCInstrInfo &MII;
   const MCRegisterInfo *MRI;
 
+  ARMTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = getParser().getStreamer().getTargetStreamer();
+    return static_cast<ARMTargetStreamer &>(TS);
+  }
+
+  // Unwind directives state
+  SMLoc FnStartLoc;
+  SMLoc CantUnwindLoc;
+  SMLoc PersonalityLoc;
+  SMLoc HandlerDataLoc;
+  int FPReg;
+  void resetUnwindDirectiveParserState() {
+    FnStartLoc = SMLoc();
+    CantUnwindLoc = SMLoc();
+    PersonalityLoc = SMLoc();
+    HandlerDataLoc = SMLoc();
+    FPReg = -1;
+  }
+
   // Map of register aliases registers via the .req directive.
   StringMap<unsigned> RegisterReqs;
 
+  bool NextSymbolIsThumb;
+
   struct {
     ARMCC::CondCodes Cond;    // Condition for IT block.
     unsigned Mask:4;          // Condition mask for instructions.
@@ -76,7 +102,7 @@ class ARMAsmParser : public MCTargetAsmParser {
     if (!inITBlock()) return;
     // Move to the next instruction in the IT block, if there is one. If not,
     // mark the block as done.
-    unsigned TZ = CountTrailingZeros_32(ITState.Mask);
+    unsigned TZ = countTrailingZeros(ITState.Mask);
     if (++ITState.CurPosition == 5 - TZ)
       ITState.CurPosition = ~0U; // Done with the IT block after this.
   }
@@ -113,11 +139,22 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool parseDirectiveUnreq(SMLoc L);
   bool parseDirectiveArch(SMLoc L);
   bool parseDirectiveEabiAttr(SMLoc L);
+  bool parseDirectiveCPU(SMLoc L);
+  bool parseDirectiveFPU(SMLoc L);
+  bool parseDirectiveFnStart(SMLoc L);
+  bool parseDirectiveFnEnd(SMLoc L);
+  bool parseDirectiveCantUnwind(SMLoc L);
+  bool parseDirectivePersonality(SMLoc L);
+  bool parseDirectiveHandlerData(SMLoc L);
+  bool parseDirectiveSetFP(SMLoc L);
+  bool parseDirectivePad(SMLoc L);
+  bool parseDirectiveRegSave(SMLoc L, bool IsVector);
 
   StringRef splitMnemonic(StringRef Mnemonic, unsigned &PredicationCode,
                           bool &CarrySetting, unsigned &ProcessorIMod,
                           StringRef &ITMask);
-  void getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
+  void getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                             bool &CanAcceptCarrySet,
                              bool &CanAcceptPredicationCode);
 
   bool isThumb() const {
@@ -130,12 +167,25 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool isThumbTwo() const {
     return isThumb() && (STI.getFeatureBits() & ARM::FeatureThumb2);
   }
+  bool hasThumb() const {
+    return STI.getFeatureBits() & ARM::HasV4TOps;
+  }
   bool hasV6Ops() const {
     return STI.getFeatureBits() & ARM::HasV6Ops;
   }
+  bool hasV6MOps() const {
+    return STI.getFeatureBits() & ARM::HasV6MOps;
+  }
   bool hasV7Ops() const {
     return STI.getFeatureBits() & ARM::HasV7Ops;
   }
+  bool hasV8Ops() const {
+    return STI.getFeatureBits() & ARM::HasV8Ops;
+  }
+  bool hasARM() const {
+    return !(STI.getFeatureBits() & ARM::FeatureNoARM);
+  }
+
   void SwitchMode() {
     unsigned FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb));
     setAvailableFeatures(FB);
@@ -161,6 +211,8 @@ class ARMAsmParser : public MCTargetAsmParser {
     SmallVectorImpl<MCParsedAsmOperand*>&);
   OperandMatchResultTy parseMemBarrierOptOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
+  OperandMatchResultTy parseInstSyncBarrierOptOperand(
+    SmallVectorImpl<MCParsedAsmOperand*>&);
   OperandMatchResultTy parseProcIFlagsOperand(
     SmallVectorImpl<MCParsedAsmOperand*>&);
   OperandMatchResultTy parseMSRMaskOperand(
@@ -185,51 +237,19 @@ class ARMAsmParser : public MCTargetAsmParser {
                                        SMLoc &EndLoc);
 
   // Asm Match Converter Methods
-  void cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegAddrMode2(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegAddrMode2(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegAddrMode3(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdExtTWriteBackImm(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdExtTWriteBackReg(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStExtTWriteBackImm(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStExtTWriteBackReg(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegAddrMode3(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
   void cvtThumbMultiply(MCInst &Inst,
                         const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVLDwbFixed(MCInst &Inst,
-                     const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVLDwbRegister(MCInst &Inst,
-                        const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVSTwbFixed(MCInst &Inst,
-                     const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVSTwbRegister(MCInst &Inst,
+  void cvtThumbBranches(MCInst &Inst,
                         const SmallVectorImpl<MCParsedAsmOperand*> &);
+                        
   bool validateInstruction(MCInst &Inst,
                            const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
   bool processInstruction(MCInst &Inst,
                           const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
   bool shouldOmitCCOutOperand(StringRef Mnemonic,
                               SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
+  bool shouldOmitPredicateOperand(StringRef Mnemonic,
+                              SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 public:
   enum ARMMatchResultTy {
     Match_RequiresITBlock = FIRST_TARGET_MATCH_RESULT_TY,
@@ -241,12 +261,13 @@ public:
 
   };
 
-  ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+  ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), FPReg(-1) {
     MCAsmParserExtension::Initialize(_Parser);
 
     // Cache the MCRegisterInfo.
-    MRI = &getContext().getRegisterInfo();
+    MRI = getContext().getRegisterInfo();
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -254,12 +275,7 @@ public:
     // Not in an ITBlock to start with.
     ITState.CurPosition = ~0U;
 
-    // Set ELF header flags.
-    // FIXME: This should eventually end up somewhere else where more
-    // intelligent flag decisions can be made. For now we are just maintaining
-    // the statu/parseDirects quo for ARM and setting EF_ARM_EABI_VER5 as the default.
-    if (MCELFStreamer *MES = dyn_cast<MCELFStreamer>(&Parser.getStreamer()))
-      MES->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+    NextSymbolIsThumb = false;
   }
 
   // Implementation of the MCTargetAsmParser interface:
@@ -276,6 +292,8 @@ public:
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
                                bool MatchingInlineAsm);
+  void onLabelParsed(MCSymbol *Symbol);
+
 };
 } // end anonymous namespace
 
@@ -293,6 +311,7 @@ class ARMOperand : public MCParsedAsmOperand {
     k_CoprocOption,
     k_Immediate,
     k_MemBarrierOpt,
+    k_InstSyncBarrierOpt,
     k_Memory,
     k_PostIndexRegister,
     k_MSRMask,
@@ -336,6 +355,10 @@ class ARMOperand : public MCParsedAsmOperand {
     ARM_MB::MemBOpt Val;
   };
 
+  struct ISBOptOp {
+    ARM_ISB::InstSyncBOpt Val;
+  };
+
   struct IFlagsOp {
     ARM_PROC::IFlags Val;
   };
@@ -422,6 +445,7 @@ class ARMOperand : public MCParsedAsmOperand {
     struct CopOp Cop;
     struct CoprocOptionOp CoprocOption;
     struct MBOptOp MBOpt;
+    struct ISBOptOp ISBOpt;
     struct ITMaskOp ITMask;
     struct IFlagsOp IFlags;
     struct MMaskOp MMask;
@@ -482,6 +506,8 @@ public:
     case k_MemBarrierOpt:
       MBOpt = o.MBOpt;
       break;
+    case k_InstSyncBarrierOpt:
+      ISBOpt = o.ISBOpt;
     case k_Memory:
       Memory = o.Memory;
       break;
@@ -564,6 +590,11 @@ public:
     return MBOpt.Val;
   }
 
+  ARM_ISB::InstSyncBOpt getInstSyncBarrierOpt() const {
+    assert(Kind == k_InstSyncBarrierOpt && "Invalid access!");
+    return ISBOpt.Val;
+  }
+
   ARM_PROC::IFlags getProcIFlags() const {
     assert(Kind == k_ProcIFlags && "Invalid access!");
     return IFlags.Val;
@@ -582,6 +613,56 @@ public:
   bool isITMask() const { return Kind == k_ITCondMask; }
   bool isITCondCode() const { return Kind == k_CondCode; }
   bool isImm() const { return Kind == k_Immediate; }
+  // checks whether this operand is an unsigned offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isUnsignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << width) - 1);
+      return ((Val % Align) == 0) && (Val >= 0) && (Val <= Max);
+    }
+    return false;
+  }
+  // checks whether this operand is an signed offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isSignedOffset() const {
+    if (!isImm()) return false;
+    if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << (width-1)) - 1);
+      int64_t Min = -Align * (1LL << (width-1));
+      return ((Val % Align) == 0) && (Val >= Min) && (Val <= Max);
+    }
+    return false;
+  }
+
+  // checks whether this operand is a memory operand computed as an offset
+  // applied to PC. the offset may have 8 bits of magnitude and is represented
+  // with two bits of shift. textually it may be either [pc, #imm], #imm or 
+  // relocable expression...
+  bool isThumbMemPC() const {
+    int64_t Val = 0;
+    if (isImm()) {
+      if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+      if (!CE) return false;
+      Val = CE->getValue();
+    }
+    else if (isMem()) {
+      if(!Memory.OffsetImm || Memory.OffsetRegNum) return false;
+      if(Memory.BaseRegNum != ARM::PC) return false;
+      Val = Memory.OffsetImm->getValue();
+    }
+    else return false;
+    return ((Val % 4) == 0) && (Val >= 0) && (Val <= 1020);
+  }
   bool isFPImm() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -610,13 +691,6 @@ public:
     int64_t Value = CE->getValue();
     return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
   }
-  bool isImm0_4() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 5;
-  }
   bool isImm0_1020s4() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -639,6 +713,13 @@ public:
     // explicitly exclude zero. we want that to use the normal 0_508 version.
     return ((Value & 3) == 0) && Value > 0 && Value <= 508;
   }
+  bool isImm0_239() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return Value >= 0 && Value < 240;
+  }
   bool isImm0_255() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -800,6 +881,15 @@ public:
     int64_t Value = CE->getValue();
     return Value >= 0 && Value < 65536;
   }
+  bool isImm256_65535Expr() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // If it's not a constant expression, it'll generate a fixup and be
+    // handled later.
+    if (!CE) return true;
+    int64_t Value = CE->getValue();
+    return Value >= 256 && Value < 65536;
+  }
   bool isImm0_65535Expr() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -879,7 +969,8 @@ public:
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
-    return ARM_AM::getT2SOImmVal(~Value) != -1;
+    return ARM_AM::getT2SOImmVal(Value) == -1 &&
+      ARM_AM::getT2SOImmVal(~Value) != -1;
   }
   bool isT2SOImmNeg() const {
     if (!isImm()) return false;
@@ -903,6 +994,7 @@ public:
   bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
   bool isToken() const { return Kind == k_Token; }
   bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
+  bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
   bool isMem() const { return Kind == k_Memory; }
   bool isShifterImm() const { return Kind == k_ShifterImmediate; }
   bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
@@ -949,7 +1041,7 @@ public:
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Val = CE->getValue();
-    return Val > -4096 && Val < 4096;
+    return (Val == INT32_MIN) || (Val > -4096 && Val < 4096);
   }
   bool isAddrMode3() const {
     // If we have an immediate that's not a constant, treat it as a label
@@ -1659,6 +1751,37 @@ public:
     Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
   }
 
+  void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const {
+    if(const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue() >> 2));
+      return;
+    }
+
+    const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+    assert(SR && "Unknown value type!");
+    Inst.addOperand(MCOperand::CreateExpr(SR));
+  }
+
+  void addThumbMemPCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (isImm()) {
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+      if (CE) {
+        Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+        return;
+      }
+
+      const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+      assert(SR && "Unknown value type!");
+      Inst.addOperand(MCOperand::CreateExpr(SR));
+      return;
+    }
+
+    assert(isMem()  && "Unknown value type!");
+    assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
+    Inst.addOperand(MCOperand::CreateImm(Memory.OffsetImm->getValue()));
+  }
+
   void addARMSOImmNotOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     // The operand is actually a so_imm, but we have its bitwise
@@ -1680,6 +1803,11 @@ public:
     Inst.addOperand(MCOperand::CreateImm(unsigned(getMemBarrierOpt())));
   }
 
+  void addInstSyncBarrierOptOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(unsigned(getInstSyncBarrierOpt())));
+  }
+
   void addMemNoOffsetOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateReg(Memory.BaseRegNum));
@@ -1688,8 +1816,6 @@ public:
   void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     int32_t Imm = Memory.OffsetImm->getValue();
-    // FIXME: Handle #-0
-    if (Imm == INT32_MIN) Imm = 0;
     Inst.addOperand(MCOperand::CreateImm(Imm));
   }
 
@@ -2228,21 +2354,24 @@ public:
   }
 
   static ARMOperand *
-  CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
+  CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned> > &Regs,
                 SMLoc StartLoc, SMLoc EndLoc) {
+    assert (Regs.size() > 0 && "RegList contains no registers?");
     KindTy Kind = k_RegisterList;
 
-    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().first))
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().second))
       Kind = k_DPRRegisterList;
     else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
-             contains(Regs.front().first))
+             contains(Regs.front().second))
       Kind = k_SPRRegisterList;
 
+    // Sort based on the register encoding values.
+    array_pod_sort(Regs.begin(), Regs.end());
+
     ARMOperand *Op = new ARMOperand(Kind);
-    for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
+    for (SmallVectorImpl<std::pair<unsigned, unsigned> >::const_iterator
            I = Regs.begin(), E = Regs.end(); I != E; ++I)
-      Op->Registers.push_back(I->first);
-    array_pod_sort(Op->Registers.begin(), Op->Registers.end());
+      Op->Registers.push_back(I->second);
     Op->StartLoc = StartLoc;
     Op->EndLoc = EndLoc;
     return Op;
@@ -2345,6 +2474,15 @@ public:
     return Op;
   }
 
+  static ARMOperand *CreateInstSyncBarrierOpt(ARM_ISB::InstSyncBOpt Opt,
+                                              SMLoc S) {
+    ARMOperand *Op = new ARMOperand(k_InstSyncBarrierOpt);
+    Op->ISBOpt.Val = Opt;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
+
   static ARMOperand *CreateProcIFlags(ARM_PROC::IFlags IFlags, SMLoc S) {
     ARMOperand *Op = new ARMOperand(k_ProcIFlags);
     Op->IFlags.Val = IFlags;
@@ -2397,7 +2535,10 @@ void ARMOperand::print(raw_ostream &OS) const {
     getImm()->print(OS);
     break;
   case k_MemBarrierOpt:
-    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt()) << ">";
+    OS << "<ARM_MB::" << MemBOptToString(getMemBarrierOpt(), false) << ">";
+    break;
+  case k_InstSyncBarrierOpt:
+    OS << "<ARM_ISB::" << InstSyncBOptToString(getInstSyncBarrierOpt()) << ">";
     break;
   case k_Memory:
     OS << "<memory "
@@ -2731,8 +2872,9 @@ static int MatchCoprocessorOperandName(StringRef Name, char CoprocOp) {
       return -1;
     switch (Name[2]) {
     default:  return -1;
-    case '0': return 10;
-    case '1': return 11;
+    // p10 and p11 are invalid for coproc instructions (reserved for FP/NEON)
+    case '0': return CoprocOp == 'p'? -1: 10;
+    case '1': return CoprocOp == 'p'? -1: 11;
     case '2': return 12;
     case '3': return 13;
     case '4': return 14;
@@ -2910,12 +3052,14 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 
   // The reglist instructions have at most 16 registers, so reserve
   // space for that many.
-  SmallVector<std::pair<unsigned, SMLoc>, 16> Registers;
+  int EReg = 0;
+  SmallVector<std::pair<unsigned, unsigned>, 16> Registers;
 
   // Allow Q regs and just interpret them as the two D sub-registers.
   if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
     Reg = getDRegFromQReg(Reg);
-    Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
     ++Reg;
   }
   const MCRegisterClass *RC;
@@ -2929,7 +3073,8 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     return Error(RegLoc, "invalid register in register list");
 
   // Store the register.
-  Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+  EReg = MRI->getEncodingValue(Reg);
+  Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
 
   // This starts immediately after the first register token in the list,
   // so we can see either a comma or a minus (range separator) as a legal
@@ -2959,7 +3104,8 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
       // Add all the registers in the range to the register list.
       while (Reg != EndReg) {
         Reg = getNextRegister(Reg);
-        Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+        EReg = MRI->getEncodingValue(Reg);
+        Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
       }
       continue;
     }
@@ -2992,14 +3138,15 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
       continue;
     }
     // VFP register lists must also be contiguous.
-    // It's OK to use the enumeration values directly here rather, as the
-    // VFP register classes have the enum sorted properly.
     if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
         Reg != OldReg + 1)
       return Error(RegLoc, "non-contiguous register range");
-    Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
-    if (isQReg)
-      Registers.push_back(std::pair<unsigned, SMLoc>(++Reg, RegLoc));
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    if (isQReg) {
+      EReg = MRI->getEncodingValue(++Reg);
+      Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    }
   }
 
   if (Parser.getTok().isNot(AsmToken::RCurly))
@@ -3036,7 +3183,7 @@ parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index, SMLoc &EndLoc) {
     // There's an optional '#' token here. Normally there wouldn't be, but
     // inline assemble puts one in, and it's friendly to accept that.
     if (Parser.getTok().is(AsmToken::Hash))
-      Parser.Lex(); // Eat the '#'
+      Parser.Lex(); // Eat '#' or '$'.
 
     const MCExpr *LaneIndex;
     SMLoc Loc = Parser.getTok().getLoc();
@@ -3334,18 +3481,27 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     Opt = StringSwitch<unsigned>(OptStr.slice(0, OptStr.size()).lower())
       .Case("sy",    ARM_MB::SY)
       .Case("st",    ARM_MB::ST)
+      .Case("ld",    ARM_MB::LD)
       .Case("sh",    ARM_MB::ISH)
       .Case("ish",   ARM_MB::ISH)
       .Case("shst",  ARM_MB::ISHST)
       .Case("ishst", ARM_MB::ISHST)
+      .Case("ishld", ARM_MB::ISHLD)
       .Case("nsh",   ARM_MB::NSH)
       .Case("un",    ARM_MB::NSH)
       .Case("nshst", ARM_MB::NSHST)
+      .Case("nshld", ARM_MB::NSHLD)
       .Case("unst",  ARM_MB::NSHST)
       .Case("osh",   ARM_MB::OSH)
       .Case("oshst", ARM_MB::OSHST)
+      .Case("oshld", ARM_MB::OSHLD)
       .Default(~0U);
 
+    // ishld, oshld, nshld and ld are only available from ARMv8.
+    if (!hasV8Ops() && (Opt == ARM_MB::ISHLD || Opt == ARM_MB::OSHLD ||
+                        Opt == ARM_MB::NSHLD || Opt == ARM_MB::LD))
+      Opt = ~0U;
+
     if (Opt == ~0U)
       return MatchOperand_NoMatch;
 
@@ -3354,7 +3510,7 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
              Tok.is(AsmToken::Dollar) ||
              Tok.is(AsmToken::Integer)) {
     if (Parser.getTok().isNot(AsmToken::Integer))
-      Parser.Lex(); // Eat the '#'.
+      Parser.Lex(); // Eat '#' or '$'.
     SMLoc Loc = Parser.getTok().getLoc();
 
     const MCExpr *MemBarrierID;
@@ -3383,6 +3539,57 @@ parseMemBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
+/// parseInstSyncBarrierOptOperand - Try to parse ISB inst sync barrier options.
+ARMAsmParser::OperandMatchResultTy ARMAsmParser::
+parseInstSyncBarrierOptOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  SMLoc S = Parser.getTok().getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  unsigned Opt;
+
+  if (Tok.is(AsmToken::Identifier)) {
+    StringRef OptStr = Tok.getString();
+
+    if (OptStr.equals_lower("sy"))
+      Opt = ARM_ISB::SY;
+    else
+      return MatchOperand_NoMatch;
+
+    Parser.Lex(); // Eat identifier token.
+  } else if (Tok.is(AsmToken::Hash) ||
+             Tok.is(AsmToken::Dollar) ||
+             Tok.is(AsmToken::Integer)) {
+    if (Parser.getTok().isNot(AsmToken::Integer))
+      Parser.Lex(); // Eat '#' or '$'.
+    SMLoc Loc = Parser.getTok().getLoc();
+
+    const MCExpr *ISBarrierID;
+    if (getParser().parseExpression(ISBarrierID)) {
+      Error(Loc, "illegal expression");
+      return MatchOperand_ParseFail;
+    }
+
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ISBarrierID);
+    if (!CE) {
+      Error(Loc, "constant expression expected");
+      return MatchOperand_ParseFail;
+    }
+
+    int Val = CE->getValue();
+    if (Val & ~0xf) {
+      Error(Loc, "immediate value out of range");
+      return MatchOperand_ParseFail;
+    }
+
+    Opt = ARM_ISB::RESERVED_0 + Val;
+  } else
+    return MatchOperand_ParseFail;
+
+  Operands.push_back(ARMOperand::CreateInstSyncBarrierOpt(
+          (ARM_ISB::InstSyncBOpt)Opt, S));
+  return MatchOperand_Success;
+}
+
+
 /// parseProcIFlagsOperand - Try to parse iflags from CPS instruction.
 ARMAsmParser::OperandMatchResultTy ARMAsmParser::
 parseProcIFlagsOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
@@ -3602,7 +3809,7 @@ parseSetEndImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     Error(S, "'be' or 'le' operand expected");
     return MatchOperand_ParseFail;
   }
-  int Val = StringSwitch<int>(Tok.getString())
+  int Val = StringSwitch<int>(Tok.getString().lower())
     .Case("be", 1)
     .Case("le", 0)
     .Default(-1);
@@ -3875,7 +4082,7 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // Do immediates first, as we always parse those if we have a '#'.
   if (Parser.getTok().is(AsmToken::Hash) ||
       Parser.getTok().is(AsmToken::Dollar)) {
-    Parser.Lex(); // Eat the '#'.
+    Parser.Lex(); // Eat '#' or '$'.
     // Explicitly look for a '-', as we need to encode negative zero
     // differently.
     bool isNegative = Parser.getTok().is(AsmToken::Minus);
@@ -3926,260 +4133,9 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
-/// cvtT2LdrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtT2LdrdPre(MCInst &Inst,
-             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateReg(0));
-  // addr
-  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtT2StrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtT2StrdPre(MCInst &Inst,
-             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateReg(0));
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegAddrMode2(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-
-/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegAddrMode2(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegAddrMode3(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdExtTWriteBackImm(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdExtTWriteBackReg(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStExtTWriteBackImm(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStExtTWriteBackReg(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdrdPre(MCInst &Inst,
-           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // addr
-  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStrdPre(MCInst &Inst,
-           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegAddrMode3(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtThumbMultiply - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
+/// Convert parsed operands to MCInst.  Needed here because this instruction
+/// only has two register operands, but multiplication is commutative so
+/// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
 void ARMAsmParser::
 cvtThumbMultiply(MCInst &Inst,
            const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
@@ -4198,59 +4154,62 @@ cvtThumbMultiply(MCInst &Inst,
 }
 
 void ARMAsmParser::
-cvtVLDwbFixed(MCInst &Inst,
-              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Vd
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
+cvtThumbBranches(MCInst &Inst,
+           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  int CondOp = -1, ImmOp = -1;
+  switch(Inst.getOpcode()) {
+    case ARM::tB:
+    case ARM::tBcc:  CondOp = 1; ImmOp = 2; break;
 
-void ARMAsmParser::
-cvtVLDwbRegister(MCInst &Inst,
-                 const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Vd
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // Vm
-  ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
+    case ARM::t2B:
+    case ARM::t2Bcc: CondOp = 1; ImmOp = 3; break;
 
-void ARMAsmParser::
-cvtVSTwbFixed(MCInst &Inst,
-              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // Vt
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-void ARMAsmParser::
-cvtVSTwbRegister(MCInst &Inst,
-                 const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // Vm
-  ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
-  // Vt
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
+    default: llvm_unreachable("Unexpected instruction in cvtThumbBranches");
+  }
+  // first decide whether or not the branch should be conditional
+  // by looking at it's location relative to an IT block
+  if(inITBlock()) {
+    // inside an IT block we cannot have any conditional branches. any 
+    // such instructions needs to be converted to unconditional form
+    switch(Inst.getOpcode()) {
+      case ARM::tBcc: Inst.setOpcode(ARM::tB); break;
+      case ARM::t2Bcc: Inst.setOpcode(ARM::t2B); break;
+    }
+  } else {
+    // outside IT blocks we can only have unconditional branches with AL
+    // condition code or conditional branches with non-AL condition code
+    unsigned Cond = static_cast<ARMOperand*>(Operands[CondOp])->getCondCode();
+    switch(Inst.getOpcode()) {
+      case ARM::tB:
+      case ARM::tBcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::tB : ARM::tBcc); 
+        break;
+      case ARM::t2B:
+      case ARM::t2Bcc: 
+        Inst.setOpcode(Cond == ARMCC::AL ? ARM::t2B : ARM::t2Bcc);
+        break;
+    }
+  }
+  
+  // now decide on encoding size based on branch target range
+  switch(Inst.getOpcode()) {
+    // classify tB as either t2B or t1B based on range of immediate operand
+    case ARM::tB: {
+      ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
+      if(!op->isSignedOffset<11, 1>() && isThumbTwo()) 
+        Inst.setOpcode(ARM::t2B);
+      break;
+    }
+    // classify tBcc as either t2Bcc or t1Bcc based on range of immediate operand
+    case ARM::tBcc: {
+      ARMOperand* op = static_cast<ARMOperand*>(Operands[ImmOp]);
+      if(!op->isSignedOffset<8, 1>() && isThumbTwo())
+        Inst.setOpcode(ARM::t2Bcc);
+      break;
+    }
+  }
+  ((ARMOperand*)Operands[ImmOp])->addImmOperands(Inst, 1);
+  ((ARMOperand*)Operands[CondOp])->addCondCodeOperands(Inst, 2);
 }
 
 /// Parse an ARM memory expression, return false if successful else return true
@@ -4354,7 +4313,7 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
       Parser.getTok().is(AsmToken::Dollar) ||
       Parser.getTok().is(AsmToken::Integer)) {
     if (Parser.getTok().isNot(AsmToken::Integer))
-      Parser.Lex(); // Eat the '#'.
+      Parser.Lex(); // Eat '#' or '$'.
     E = Parser.getTok().getLoc();
 
     bool isNegative = getParser().getTok().is(AsmToken::Minus);
@@ -4536,7 +4495,7 @@ parseFPImm(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
                            TyOp->getToken() != ".f64"))
     return MatchOperand_NoMatch;
 
-  Parser.Lex(); // Eat the '#'.
+  Parser.Lex(); // Eat '#' or '$'.
 
   // Handle negation, as that still comes through as a separate token.
   bool isNegative = false;
@@ -4752,11 +4711,14 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
       Mnemonic == "mls"   || Mnemonic == "smmls"  || Mnemonic == "vcls"  ||
       Mnemonic == "vmls"  || Mnemonic == "vnmls"  || Mnemonic == "vacge" ||
       Mnemonic == "vcge"  || Mnemonic == "vclt"   || Mnemonic == "vacgt" ||
-      Mnemonic == "vaclt" || Mnemonic == "vacle"  ||
+      Mnemonic == "vaclt" || Mnemonic == "vacle"  || Mnemonic == "hlt" ||
       Mnemonic == "vcgt"  || Mnemonic == "vcle"   || Mnemonic == "smlal" ||
       Mnemonic == "umaal" || Mnemonic == "umlal"  || Mnemonic == "vabal" ||
       Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
-      Mnemonic == "fmuls")
+      Mnemonic == "fmuls" || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" ||
+      Mnemonic == "vcvta" || Mnemonic == "vcvtn"  || Mnemonic == "vcvtp" ||
+      Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
+      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic.startswith("vsel"))
     return Mnemonic;
 
   // First, split out any predication code. Ignore mnemonics we know aren't
@@ -4836,8 +4798,8 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
 //
 // FIXME: It would be nice to autogen this.
 void ARMAsmParser::
-getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
-                      bool &CanAcceptPredicationCode) {
+getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
+                     bool &CanAcceptCarrySet, bool &CanAcceptPredicationCode) {
   if (Mnemonic == "and" || Mnemonic == "lsl" || Mnemonic == "lsr" ||
       Mnemonic == "rrx" || Mnemonic == "ror" || Mnemonic == "sub" ||
       Mnemonic == "add" || Mnemonic == "adc" ||
@@ -4853,28 +4815,35 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
   } else
     CanAcceptCarrySet = false;
 
-  if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" ||
-      Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" ||
-      Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" ||
-      Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" ||
-      Mnemonic == "dsb" || Mnemonic == "isb" || Mnemonic == "setend" ||
-      (Mnemonic == "clrex" && !isThumb()) ||
-      (Mnemonic == "nop" && isThumbOne()) ||
-      ((Mnemonic == "pld" || Mnemonic == "pli" || Mnemonic == "pldw" ||
-        Mnemonic == "ldc2" || Mnemonic == "ldc2l" ||
-        Mnemonic == "stc2" || Mnemonic == "stc2l") && !isThumb()) ||
-      ((Mnemonic.startswith("rfe") || Mnemonic.startswith("srs")) &&
-       !isThumb()) ||
-      Mnemonic.startswith("cps") || (Mnemonic == "movs" && isThumbOne())) {
+  if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
+      Mnemonic == "cps" ||  Mnemonic == "it" ||  Mnemonic == "cbz" ||
+      Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic.startswith("crc32") ||
+      Mnemonic.startswith("cps") || Mnemonic.startswith("vsel") ||
+      Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" ||
+      Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" ||
+      Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" ||
+      Mnemonic == "vrintm" || Mnemonic.startswith("aes") ||
+      Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") ||
+      (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) {
+    // These mnemonics are never predicable
     CanAcceptPredicationCode = false;
+  } else if (!isThumb()) {
+    // Some instructions are only predicable in Thumb mode
+    CanAcceptPredicationCode
+      = Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&
+        Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" &&
+        Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" &&
+        Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" &&
+        Mnemonic != "ldc2" && Mnemonic != "ldc2l" &&
+        Mnemonic != "stc2" && Mnemonic != "stc2l" &&
+        !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
+  } else if (isThumbOne()) {
+    if (hasV6MOps())
+      CanAcceptPredicationCode = Mnemonic != "movs";
+    else
+      CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
   } else
     CanAcceptPredicationCode = true;
-
-  if (isThumb()) {
-    if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
-        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
-      CanAcceptPredicationCode = false;
-  }
 }
 
 bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
@@ -4929,15 +4898,6 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
       static_cast<ARMOperand*>(Operands[5])->isImm()) {
     // Nest conditions rather than one big 'if' statement for readability.
     //
-    // If either register is a high reg, it's either one of the SP
-    // variants (handled above) or a 32-bit encoding, so we just
-    // check against T3. If the second register is the PC, this is an
-    // alternate form of ADR, which uses encoding T4, so check for that too.
-    if ((!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
-         !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg())) &&
-        static_cast<ARMOperand*>(Operands[4])->getReg() != ARM::PC &&
-        static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
-      return false;
     // If both registers are low, we're in an IT block, and the immediate is
     // in range, we should use encoding T1 instead, which has a cc_out.
     if (inITBlock() &&
@@ -4945,6 +4905,11 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
         isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) &&
         static_cast<ARMOperand*>(Operands[5])->isImm0_7())
       return false;
+    // Check against T3. If the second register is the PC, this is an
+    // alternate form of ADR, which uses encoding T4, so check for that too.
+    if (static_cast<ARMOperand*>(Operands[4])->getReg() != ARM::PC &&
+        static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
+      return false;
 
     // Otherwise, we use encoding T4, which does not have a cc_out
     // operand.
@@ -5007,6 +4972,26 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
   return false;
 }
 
+bool ARMAsmParser::shouldOmitPredicateOperand(
+    StringRef Mnemonic, SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON
+  unsigned RegIdx = 3;
+  if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") &&
+      static_cast<ARMOperand *>(Operands[2])->getToken() == ".f32") {
+    if (static_cast<ARMOperand *>(Operands[3])->isToken() &&
+        static_cast<ARMOperand *>(Operands[3])->getToken() == ".f32")
+      RegIdx = 4;
+
+    if (static_cast<ARMOperand *>(Operands[RegIdx])->isReg() &&
+        (ARMMCRegisterClasses[ARM::DPRRegClassID]
+             .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg()) ||
+         ARMMCRegisterClasses[ARM::QPRRegClassID]
+             .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg())))
+      return true;
+  }
+  return false;
+}
+
 static bool isDataTypeToken(StringRef Tok) {
   return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
     Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
@@ -5102,7 +5087,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   // the matcher deal with finding the right instruction or generating an
   // appropriate error.
   bool CanAcceptCarrySet, CanAcceptPredicationCode;
-  getMnemonicAcceptInfo(Mnemonic, CanAcceptCarrySet, CanAcceptPredicationCode);
+  getMnemonicAcceptInfo(Mnemonic, Name, CanAcceptCarrySet, CanAcceptPredicationCode);
 
   // If we had a carry-set on an instruction that can't do that, issue an
   // error.
@@ -5153,7 +5138,17 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
         doesIgnoreDataTypeSuffix(Mnemonic, ExtraToken))
       continue;
 
-    if (ExtraToken != ".n") {
+    // For for ARM mode generate an error if the .n qualifier is used.
+    if (ExtraToken == ".n" && !isThumb()) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      return Error(Loc, "instruction with .n (narrow) qualifier not allowed in "
+                   "arm mode");
+    }
+
+    // The .n qualifier is always discarded as that is what the tables
+    // and matcher expect.  In ARM mode the .w qualifier has no effect,
+    // so discard it to avoid errors that can be caused by the matcher.
+    if (ExtraToken != ".n" && (isThumb() || ExtraToken != ".w")) {
       SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
       Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
     }
@@ -5199,6 +5194,15 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     delete Op;
   }
 
+  // Some instructions have the same mnemonic, but don't always
+  // have a predicate. Distinguish them here and delete the
+  // predicate if needed.
+  if (shouldOmitPredicateOperand(Mnemonic, Operands)) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op;
+  }
+
   // ARM mode 'blx' need special handling, as the register operand version
   // is predicable, but the label operand version is not. So, we can't rely
   // on the Mnemonic based checking to correctly figure out when to put
@@ -5218,8 +5222,9 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   // expressed as a GPRPair, so we have to manually merge them.
   // FIXME: We would really like to be able to tablegen'erate this.
   if (!isThumb() && Operands.size() > 4 &&
-      (Mnemonic == "ldrexd" || Mnemonic == "strexd")) {
-    bool isLoad = (Mnemonic == "ldrexd");
+      (Mnemonic == "ldrexd" || Mnemonic == "strexd" || Mnemonic == "ldaexd" ||
+       Mnemonic == "stlexd")) {
+    bool isLoad = (Mnemonic == "ldrexd" || Mnemonic == "ldaexd");
     unsigned Idx = isLoad ? 2 : 3;
     ARMOperand* Op1 = static_cast<ARMOperand*>(Operands[Idx]);
     ARMOperand* Op2 = static_cast<ARMOperand*>(Operands[Idx+1]);
@@ -5250,6 +5255,26 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     }
   }
 
+  // FIXME: As said above, this is all a pretty gross hack.  This instruction
+  // does not fit with other "subs" and tblgen.
+  // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
+  // so the Mnemonic is the original name "subs" and delete the predicate
+  // operand so it will match the table entry.
+  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::PC &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::LR &&
+      static_cast<ARMOperand*>(Operands[5])->isImm()) {
+    ARMOperand *Op0 = static_cast<ARMOperand*>(Operands[0]);
+    Operands.erase(Operands.begin());
+    delete Op0;
+    Operands.insert(Operands.begin(), ARMOperand::CreateToken(Name, NameLoc));
+
+    ARMOperand *Op1 = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op1;
+  }
   return false;
 }
 
@@ -5283,45 +5308,44 @@ static bool listContainsReg(MCInst &Inst, unsigned OpNo, unsigned Reg) {
   return false;
 }
 
-// FIXME: We would really prefer to have MCInstrInfo (the wrapper around
-// the ARMInsts array) instead. Getting that here requires awkward
-// API changes, though. Better way?
-namespace llvm {
-extern const MCInstrDesc ARMInsts[];
-}
-static const MCInstrDesc &getInstDesc(unsigned Opcode) {
-  return ARMInsts[Opcode];
+// Return true if instruction has the interesting property of being
+// allowed in IT blocks, but not being predicable.
+static bool instIsBreakpoint(const MCInst &Inst) {
+    return Inst.getOpcode() == ARM::tBKPT ||
+           Inst.getOpcode() == ARM::BKPT ||
+           Inst.getOpcode() == ARM::tHLT ||
+           Inst.getOpcode() == ARM::HLT;
+
 }
 
 // FIXME: We would really like to be able to tablegen'erate this.
 bool ARMAsmParser::
 validateInstruction(MCInst &Inst,
                     const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  const MCInstrDesc &MCID = getInstDesc(Inst.getOpcode());
+  const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
   SMLoc Loc = Operands[0]->getStartLoc();
+
   // Check the IT block state first.
-  // NOTE: BKPT instruction has the interesting property of being
-  // allowed in IT blocks, but not being predicable.  It just always
-  // executes.
-  if (inITBlock() && Inst.getOpcode() != ARM::tBKPT &&
-      Inst.getOpcode() != ARM::BKPT) {
-    unsigned bit = 1;
+  // NOTE: BKPT and HLT instructions have the interesting property of being
+  // allowed in IT blocks, but not being predicable. They just always execute.
+  if (inITBlock() && !instIsBreakpoint(Inst)) {
+    unsigned Bit = 1;
     if (ITState.FirstCond)
       ITState.FirstCond = false;
     else
-      bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
+      Bit = (ITState.Mask >> (5 - ITState.CurPosition)) & 1;
     // The instruction must be predicable.
     if (!MCID.isPredicable())
       return Error(Loc, "instructions in IT block must be predicable");
     unsigned Cond = Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm();
-    unsigned ITCond = bit ? ITState.Cond :
+    unsigned ITCond = Bit ? ITState.Cond :
       ARMCC::getOppositeCondition(ITState.Cond);
     if (Cond != ITCond) {
       // Find the condition code Operand to get its SMLoc information.
       SMLoc CondLoc;
-      for (unsigned i = 1; i < Operands.size(); ++i)
-        if (static_cast<ARMOperand*>(Operands[i])->isCondCode())
-          CondLoc = Operands[i]->getStartLoc();
+      for (unsigned I = 1; I < Operands.size(); ++I)
+        if (static_cast<ARMOperand*>(Operands[I])->isCondCode())
+          CondLoc = Operands[I]->getStartLoc();
       return Error(CondLoc, "incorrect condition in IT block; got '" +
                    StringRef(ARMCondCodeToString(ARMCC::CondCodes(Cond))) +
                    "', but expected '" +
@@ -5330,20 +5354,55 @@ validateInstruction(MCInst &Inst,
   // Check for non-'al' condition codes outside of the IT block.
   } else if (isThumbTwo() && MCID.isPredicable() &&
              Inst.getOperand(MCID.findFirstPredOperandIdx()).getImm() !=
-             ARMCC::AL && Inst.getOpcode() != ARM::tB &&
-             Inst.getOpcode() != ARM::t2B)
+             ARMCC::AL && Inst.getOpcode() != ARM::tBcc &&
+             Inst.getOpcode() != ARM::t2Bcc)
     return Error(Loc, "predicated instructions must be in IT block");
 
-  switch (Inst.getOpcode()) {
+  const unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
   case ARM::LDRD:
   case ARM::LDRD_PRE:
   case ARM::LDRD_POST: {
+    const unsigned RtReg = Inst.getOperand(0).getReg();
+
+    // Rt can't be R14.
+    if (RtReg == ARM::LR)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt can't be R14");
+
+    const unsigned Rt = MRI->getEncodingValue(RtReg);
+    // Rt must be even-numbered.
+    if ((Rt & 1) == 1)
+      return Error(Operands[3]->getStartLoc(),
+                   "Rt must be even-numbered");
+
     // Rt2 must be Rt + 1.
-    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
-    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    const unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
     if (Rt2 != Rt + 1)
       return Error(Operands[3]->getStartLoc(),
                    "destination operands must be sequential");
+
+    if (Opcode == ARM::LDRD_PRE || Opcode == ARM::LDRD_POST) {
+      const unsigned Rn = MRI->getEncodingValue(Inst.getOperand(3).getReg());
+      // For addressing modes with writeback, the base register needs to be
+      // different from the destination registers.
+      if (Rn == Rt || Rn == Rt2)
+        return Error(Operands[3]->getStartLoc(),
+                     "base register needs to be different from destination "
+                     "registers");
+    }
+
+    return false;
+  }
+  case ARM::t2LDRDi8:
+  case ARM::t2LDRD_PRE:
+  case ARM::t2LDRD_POST: {
+    // Rt2 must be different from Rt.
+    unsigned Rt = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+    unsigned Rt2 = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+    if (Rt2 == Rt)
+      return Error(Operands[3]->getStartLoc(),
+                   "destination operands can't be identical");
     return false;
   }
   case ARM::STRD: {
@@ -5367,50 +5426,77 @@ validateInstruction(MCInst &Inst,
   }
   case ARM::SBFX:
   case ARM::UBFX: {
-    // width must be in range [1, 32-lsb]
-    unsigned lsb = Inst.getOperand(2).getImm();
-    unsigned widthm1 = Inst.getOperand(3).getImm();
-    if (widthm1 >= 32 - lsb)
+    // Width must be in range [1, 32-lsb].
+    unsigned LSB = Inst.getOperand(2).getImm();
+    unsigned Widthm1 = Inst.getOperand(3).getImm();
+    if (Widthm1 >= 32 - LSB)
       return Error(Operands[5]->getStartLoc(),
                    "bitfield width must be in range [1,32-lsb]");
     return false;
   }
+  // Notionally handles ARM::tLDMIA_UPD too.
   case ARM::tLDMIA: {
     // If we're parsing Thumb2, the .w variant is available and handles
-    // most cases that are normally illegal for a Thumb1 LDM
-    // instruction. We'll make the transformation in processInstruction()
-    // if necessary.
+    // most cases that are normally illegal for a Thumb1 LDM instruction.
+    // We'll make the transformation in processInstruction() if necessary.
     //
     // Thumb LDM instructions are writeback iff the base register is not
     // in the register list.
     unsigned Rn = Inst.getOperand(0).getReg();
-    bool hasWritebackToken =
+    bool HasWritebackToken =
       (static_cast<ARMOperand*>(Operands[3])->isToken() &&
        static_cast<ARMOperand*>(Operands[3])->getToken() == "!");
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 3, Rn, 0, listContainsBase) && !isThumbTwo())
-      return Error(Operands[3 + hasWritebackToken]->getStartLoc(),
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 3, Rn, 0, ListContainsBase) && !isThumbTwo())
+      return Error(Operands[3 + HasWritebackToken]->getStartLoc(),
                    "registers must be in range r0-r7");
     // If we should have writeback, then there should be a '!' token.
-    if (!listContainsBase && !hasWritebackToken && !isThumbTwo())
+    if (!ListContainsBase && !HasWritebackToken && !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "writeback operator '!' expected");
     // If we should not have writeback, there must not be a '!'. This is
     // true even for the 32-bit wide encodings.
-    if (listContainsBase && hasWritebackToken)
+    if (ListContainsBase && HasWritebackToken)
       return Error(Operands[3]->getStartLoc(),
                    "writeback operator '!' not allowed when base register "
                    "in register list");
 
     break;
   }
-  case ARM::t2LDMIA_UPD: {
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+    // ARM variants loading and updating the same register are only officially
+    // UNPREDICTABLE on v7 upwards. Goodness knows what they did before.
+    if (!hasV7Ops())
+      break;
+    // Fallthrough
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD: {
     if (listContainsReg(Inst, 3, Inst.getOperand(0).getReg()))
-      return Error(Operands[4]->getStartLoc(),
-                   "writeback operator '!' not allowed when base register "
-                   "in register list");
+      return Error(Operands.back()->getStartLoc(),
+                   "writeback register not allowed in register list");
     break;
   }
+  case ARM::sysLDMIA_UPD:
+  case ARM::sysLDMDA_UPD:
+  case ARM::sysLDMDB_UPD:
+  case ARM::sysLDMIB_UPD:
+    if (!listContainsReg(Inst, 3, ARM::PC))
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback register only allowed on system LDM "
+                   "if PC in register-list");
+    break;
+  case ARM::sysSTMIA_UPD:
+  case ARM::sysSTMDA_UPD:
+  case ARM::sysSTMDB_UPD:
+  case ARM::sysSTMIB_UPD:
+    return Error(Operands[2]->getStartLoc(),
+                 "system STM cannot have writeback register");
+    break;
   case ARM::tMUL: {
     // The second source operand must be the same register as the destination
     // operand.
@@ -5434,26 +5520,35 @@ validateInstruction(MCInst &Inst,
   // so only issue a diagnostic for thumb1. The instructions will be
   // switched to the t2 encodings in processInstruction() if necessary.
   case ARM::tPOP: {
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, listContainsBase) &&
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::PC, ListContainsBase) &&
         !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "registers must be in range r0-r7 or pc");
     break;
   }
   case ARM::tPUSH: {
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, listContainsBase) &&
+    bool ListContainsBase;
+    if (checkLowRegisterList(Inst, 2, 0, ARM::LR, ListContainsBase) &&
         !isThumbTwo())
       return Error(Operands[2]->getStartLoc(),
                    "registers must be in range r0-r7 or lr");
     break;
   }
   case ARM::tSTMIA_UPD: {
-    bool listContainsBase;
-    if (checkLowRegisterList(Inst, 4, 0, 0, listContainsBase) && !isThumbTwo())
+    bool ListContainsBase, InvalidLowList;
+    InvalidLowList = checkLowRegisterList(Inst, 4, Inst.getOperand(0).getReg(),
+                                          0, ListContainsBase);
+    if (InvalidLowList && !isThumbTwo())
       return Error(Operands[4]->getStartLoc(),
                    "registers must be in range r0-r7");
+
+    // This would be converted to a 32-bit stm, but that's not valid if the
+    // writeback register is in the list.
+    if (InvalidLowList && ListContainsBase)
+      return Error(Operands[4]->getStartLoc(),
+                   "writeback operator '!' not allowed when base register "
+                   "in register list");
     break;
   }
   case ARM::tADDrSP: {
@@ -5466,6 +5561,28 @@ validateInstruction(MCInst &Inst,
     }
     break;
   }
+  // Final range checking for Thumb unconditional branch instructions.
+  case ARM::tB:
+    if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<11, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2B: {
+    int op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!(static_cast<ARMOperand*>(Operands[op]))->isSignedOffset<24, 1>())
+      return Error(Operands[op]->getStartLoc(), "branch target out of range");
+    break;
+  }
+  // Final range checking for Thumb conditional branch instructions.
+  case ARM::tBcc:
+    if (!(static_cast<ARMOperand*>(Operands[2]))->isSignedOffset<8, 1>())
+      return Error(Operands[2]->getStartLoc(), "branch target out of range");
+    break;
+  case ARM::t2Bcc: {
+    int Op = (Operands[2]->isImm()) ? 2 : 3;
+    if (!(static_cast<ARMOperand*>(Operands[Op]))->isSignedOffset<20, 1>())
+      return Error(Operands[Op]->getStartLoc(), "branch target out of range");
+    break;
+  }
   }
 
   return false;
@@ -5749,7 +5866,9 @@ processInstruction(MCInst &Inst,
   case ARM::t2LDRpcrel:
     // Select the narrow version if the immediate will fit.
     if (Inst.getOperand(1).getImm() > 0 &&
-        Inst.getOperand(1).getImm() <= 0xff)
+        Inst.getOperand(1).getImm() <= 0xff &&
+        !(static_cast<ARMOperand*>(Operands[2])->isToken() &&
+         static_cast<ARMOperand*>(Operands[2])->getToken() == ".w"))
       Inst.setOpcode(ARM::tLDRpci);
     else
       Inst.setOpcode(ARM::t2LDRpci);
@@ -7398,11 +7517,10 @@ processInstruction(MCInst &Inst,
     MCOperand &MO = Inst.getOperand(1);
     unsigned Mask = MO.getImm();
     unsigned OrigMask = Mask;
-    unsigned TZ = CountTrailingZeros_32(Mask);
+    unsigned TZ = countTrailingZeros(Mask);
     if ((Inst.getOperand(0).getImm() & 1) == 0) {
       assert(Mask && TZ <= 3 && "illegal IT mask value!");
-      for (unsigned i = 3; i != TZ; --i)
-        Mask ^= 1 << i;
+      Mask ^= (0xE << TZ) & 0xF;
     }
     MO.setImm(Mask);
 
@@ -7503,7 +7621,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
   // 16-bit thumb arithmetic instructions either require or preclude the 'S'
   // suffix depending on whether they're in an IT block or not.
   unsigned Opc = Inst.getOpcode();
-  const MCInstrDesc &MCID = getInstDesc(Opc);
+  const MCInstrDesc &MCID = MII.get(Opc);
   if (MCID.TSFlags & ARMII::ThumbArithFlagSetting) {
     assert(MCID.hasOptionalDef() &&
            "optionally flag setting instruction missing optional def operand");
@@ -7564,12 +7682,22 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       return true;
     }
 
-    // Some instructions need post-processing to, for example, tweak which
-    // encoding is selected. Loop on it while changes happen so the
-    // individual transformations can chain off each other. E.g.,
-    // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
-    while (processInstruction(Inst, Operands))
-      ;
+    { // processInstruction() updates inITBlock state, we need to save it away
+      bool wasInITBlock = inITBlock();
+
+      // Some instructions need post-processing to, for example, tweak which
+      // encoding is selected. Loop on it while changes happen so the
+      // individual transformations can chain off each other. E.g.,
+      // tPOP(r8)->t2LDMIA_UPD(sp,r8)->t2STR_POST(sp,r8)
+      while (processInstruction(Inst, Operands))
+        ;
+
+      // Only after the instruction is fully processed, we can validate it
+      if (wasInITBlock && hasV8Ops() && isThumb() &&
+          !isV8EligibleForIT(&Inst, 2)) {
+        Warning(IDLoc, "deprecated instruction in IT block");
+      }
+    }
 
     // Only move forward at the very end so that everything in validate
     // and process gets a consistent answer about whether we're in an IT
@@ -7622,15 +7750,15 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return Error(IDLoc, "instruction variant requires ARMv6 or later");
   case Match_RequiresThumb2:
     return Error(IDLoc, "instruction variant requires Thumb2");
-  case Match_ImmRange0_4: {
+  case Match_ImmRange0_15: {
     SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
     if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
-    return Error(ErrorLoc, "immediate operand must be in the range [0,4]");
+    return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
   }
-  case Match_ImmRange0_15: {
+  case Match_ImmRange0_239: {
     SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getStartLoc();
     if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
-    return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
+    return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
   }
   }
 
@@ -7658,6 +7786,28 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
     return parseDirectiveArch(DirectiveID.getLoc());
   else if (IDVal == ".eabi_attribute")
     return parseDirectiveEabiAttr(DirectiveID.getLoc());
+  else if (IDVal == ".cpu")
+    return parseDirectiveCPU(DirectiveID.getLoc());
+  else if (IDVal == ".fpu")
+    return parseDirectiveFPU(DirectiveID.getLoc());
+  else if (IDVal == ".fnstart")
+    return parseDirectiveFnStart(DirectiveID.getLoc());
+  else if (IDVal == ".fnend")
+    return parseDirectiveFnEnd(DirectiveID.getLoc());
+  else if (IDVal == ".cantunwind")
+    return parseDirectiveCantUnwind(DirectiveID.getLoc());
+  else if (IDVal == ".personality")
+    return parseDirectivePersonality(DirectiveID.getLoc());
+  else if (IDVal == ".handlerdata")
+    return parseDirectiveHandlerData(DirectiveID.getLoc());
+  else if (IDVal == ".setfp")
+    return parseDirectiveSetFP(DirectiveID.getLoc());
+  else if (IDVal == ".pad")
+    return parseDirectivePad(DirectiveID.getLoc());
+  else if (IDVal == ".save")
+    return parseDirectiveRegSave(DirectiveID.getLoc(), false);
+  else if (IDVal == ".vsave")
+    return parseDirectiveRegSave(DirectiveID.getLoc(), true);
   return true;
 }
 
@@ -7693,6 +7843,9 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
     return Error(L, "unexpected token in directive");
   Parser.Lex();
 
+  if (!hasThumb())
+    return Error(L, "target does not support Thumb mode");
+
   if (!isThumb())
     SwitchMode();
   getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
@@ -7706,19 +7859,27 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
     return Error(L, "unexpected token in directive");
   Parser.Lex();
 
+  if (!hasARM())
+    return Error(L, "target does not support ARM mode");
+
   if (isThumb())
     SwitchMode();
   getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
   return false;
 }
 
+void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
+  if (NextSymbolIsThumb) {
+    getParser().getStreamer().EmitThumbFunc(Symbol);
+    NextSymbolIsThumb = false;
+  }
+}
+
 /// parseDirectiveThumbFunc
 ///  ::= .thumbfunc symbol_name
 bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
-  const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI.hasSubsectionsViaSymbols();
-  StringRef Name;
-  bool needFuncName = true;
+  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+  bool isMachO = MAI->hasSubsectionsViaSymbols();
 
   // Darwin asm has (optionally) function name after .thumb_func direction
   // ELF doesn't
@@ -7727,29 +7888,19 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
     if (Tok.isNot(AsmToken::EndOfStatement)) {
       if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
         return Error(L, "unexpected token in .thumb_func directive");
-      Name = Tok.getIdentifier();
+      MCSymbol *Func =
+          getParser().getContext().GetOrCreateSymbol(Tok.getIdentifier());
+      getParser().getStreamer().EmitThumbFunc(Func);
       Parser.Lex(); // Consume the identifier token.
-      needFuncName = false;
+      return false;
     }
   }
 
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return Error(L, "unexpected token in directive");
 
-  // Eat the end of statement and any blank lines that follow.
-  while (getLexer().is(AsmToken::EndOfStatement))
-    Parser.Lex();
-
-  // FIXME: assuming function name will be the line following .thumb_func
-  // We really should be checking the next symbol definition even if there's
-  // stuff in between.
-  if (needFuncName) {
-    Name = Parser.getTok().getIdentifier();
-  }
+  NextSymbolIsThumb = true;
 
-  // Mark symbol as a thumb symbol.
-  MCSymbol *Func = getParser().getContext().GetOrCreateSymbol(Name);
-  getParser().getStreamer().EmitThumbFunc(Func);
   return false;
 }
 
@@ -7795,10 +7946,16 @@ bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
   Parser.Lex();
 
   if (Val == 16) {
+    if (!hasThumb())
+      return Error(L, "target does not support Thumb mode");
+
     if (!isThumb())
       SwitchMode();
     getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
   } else {
+    if (!hasARM())
+      return Error(L, "target does not support ARM mode");
+
     if (isThumb())
       SwitchMode();
     getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
@@ -7855,7 +8012,267 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
 /// parseDirectiveEabiAttr
 ///  ::= .eabi_attribute int, int
 bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
-  return true;
+  if (Parser.getTok().isNot(AsmToken::Integer))
+    return Error(L, "integer expected");
+  int64_t Tag = Parser.getTok().getIntVal();
+  Parser.Lex(); // eat tag integer
+
+  if (Parser.getTok().isNot(AsmToken::Comma))
+    return Error(L, "comma expected");
+  Parser.Lex(); // skip comma
+
+  L = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::Integer))
+    return Error(L, "integer expected");
+  int64_t Value = Parser.getTok().getIntVal();
+  Parser.Lex(); // eat value integer
+
+  getTargetStreamer().emitAttribute(Tag, Value);
+  return false;
+}
+
+/// parseDirectiveCPU
+///  ::= .cpu str
+bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
+  StringRef CPU = getParser().parseStringToEndOfStatement().trim();
+  getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
+  return false;
+}
+
+/// parseDirectiveFPU
+///  ::= .fpu str
+bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
+  StringRef FPU = getParser().parseStringToEndOfStatement().trim();
+
+  unsigned ID = StringSwitch<unsigned>(FPU)
+#define ARM_FPU_NAME(NAME, ID) .Case(NAME, ARM::ID)
+#include "ARMFPUName.def"
+    .Default(ARM::INVALID_FPU);
+
+  if (ID == ARM::INVALID_FPU)
+    return Error(L, "Unknown FPU name");
+
+  getTargetStreamer().emitFPU(ID);
+  return false;
+}
+
+/// parseDirectiveFnStart
+///  ::= .fnstart
+bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
+  if (FnStartLoc.isValid()) {
+    Error(L, ".fnstart starts before the end of previous one");
+    Error(FnStartLoc, "previous .fnstart starts here");
+    return true;
+  }
+
+  FnStartLoc = L;
+  getTargetStreamer().emitFnStart();
+  return false;
+}
+
+/// parseDirectiveFnEnd
+///  ::= .fnend
+bool ARMAsmParser::parseDirectiveFnEnd(SMLoc L) {
+  // Check the ordering of unwind directives
+  if (!FnStartLoc.isValid())
+    return Error(L, ".fnstart must precede .fnend directive");
+
+  // Reset the unwind directives parser state
+  resetUnwindDirectiveParserState();
+  getTargetStreamer().emitFnEnd();
+  return false;
+}
+
+/// parseDirectiveCantUnwind
+///  ::= .cantunwind
+bool ARMAsmParser::parseDirectiveCantUnwind(SMLoc L) {
+  // Check the ordering of unwind directives
+  CantUnwindLoc = L;
+  if (!FnStartLoc.isValid())
+    return Error(L, ".fnstart must precede .cantunwind directive");
+  if (HandlerDataLoc.isValid()) {
+    Error(L, ".cantunwind can't be used with .handlerdata directive");
+    Error(HandlerDataLoc, ".handlerdata was specified here");
+    return true;
+  }
+  if (PersonalityLoc.isValid()) {
+    Error(L, ".cantunwind can't be used with .personality directive");
+    Error(PersonalityLoc, ".personality was specified here");
+    return true;
+  }
+
+  getTargetStreamer().emitCantUnwind();
+  return false;
+}
+
+/// parseDirectivePersonality
+///  ::= .personality name
+bool ARMAsmParser::parseDirectivePersonality(SMLoc L) {
+  // Check the ordering of unwind directives
+  PersonalityLoc = L;
+  if (!FnStartLoc.isValid())
+    return Error(L, ".fnstart must precede .personality directive");
+  if (CantUnwindLoc.isValid()) {
+    Error(L, ".personality can't be used with .cantunwind directive");
+    Error(CantUnwindLoc, ".cantunwind was specified here");
+    return true;
+  }
+  if (HandlerDataLoc.isValid()) {
+    Error(L, ".personality must precede .handlerdata directive");
+    Error(HandlerDataLoc, ".handlerdata was specified here");
+    return true;
+  }
+
+  // Parse the name of the personality routine
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Parser.eatToEndOfStatement();
+    return Error(L, "unexpected input in .personality directive.");
+  }
+  StringRef Name(Parser.getTok().getIdentifier());
+  Parser.Lex();
+
+  MCSymbol *PR = getParser().getContext().GetOrCreateSymbol(Name);
+  getTargetStreamer().emitPersonality(PR);
+  return false;
+}
+
+/// parseDirectiveHandlerData
+///  ::= .handlerdata
+bool ARMAsmParser::parseDirectiveHandlerData(SMLoc L) {
+  // Check the ordering of unwind directives
+  HandlerDataLoc = L;
+  if (!FnStartLoc.isValid())
+    return Error(L, ".fnstart must precede .personality directive");
+  if (CantUnwindLoc.isValid()) {
+    Error(L, ".handlerdata can't be used with .cantunwind directive");
+    Error(CantUnwindLoc, ".cantunwind was specified here");
+    return true;
+  }
+
+  getTargetStreamer().emitHandlerData();
+  return false;
+}
+
+/// parseDirectiveSetFP
+///  ::= .setfp fpreg, spreg [, offset]
+bool ARMAsmParser::parseDirectiveSetFP(SMLoc L) {
+  // Check the ordering of unwind directives
+  if (!FnStartLoc.isValid())
+    return Error(L, ".fnstart must precede .setfp directive");
+  if (HandlerDataLoc.isValid())
+    return Error(L, ".setfp must precede .handlerdata directive");
+
+  // Parse fpreg
+  SMLoc NewFPRegLoc = Parser.getTok().getLoc();
+  int NewFPReg = tryParseRegister();
+  if (NewFPReg == -1)
+    return Error(NewFPRegLoc, "frame pointer register expected");
+
+  // Consume comma
+  if (!Parser.getTok().is(AsmToken::Comma))
+    return Error(Parser.getTok().getLoc(), "comma expected");
+  Parser.Lex(); // skip comma
+
+  // Parse spreg
+  SMLoc NewSPRegLoc = Parser.getTok().getLoc();
+  int NewSPReg = tryParseRegister();
+  if (NewSPReg == -1)
+    return Error(NewSPRegLoc, "stack pointer register expected");
+
+  if (NewSPReg != ARM::SP && NewSPReg != FPReg)
+    return Error(NewSPRegLoc,
+                 "register should be either $sp or the latest fp register");
+
+  // Update the frame pointer register
+  FPReg = NewFPReg;
+
+  // Parse offset
+  int64_t Offset = 0;
+  if (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // skip comma
+
+    if (Parser.getTok().isNot(AsmToken::Hash) &&
+        Parser.getTok().isNot(AsmToken::Dollar)) {
+      return Error(Parser.getTok().getLoc(), "'#' expected");
+    }
+    Parser.Lex(); // skip hash token.
+
+    const MCExpr *OffsetExpr;
+    SMLoc ExLoc = Parser.getTok().getLoc();
+    SMLoc EndLoc;
+    if (getParser().parseExpression(OffsetExpr, EndLoc))
+      return Error(ExLoc, "malformed setfp offset");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+    if (!CE)
+      return Error(ExLoc, "setfp offset must be an immediate");
+
+    Offset = CE->getValue();
+  }
+
+  getTargetStreamer().emitSetFP(static_cast<unsigned>(NewFPReg),
+                                static_cast<unsigned>(NewSPReg), Offset);
+  return false;
+}
+
+/// parseDirective
+///  ::= .pad offset
+bool ARMAsmParser::parseDirectivePad(SMLoc L) {
+  // Check the ordering of unwind directives
+  if (!FnStartLoc.isValid())
+    return Error(L, ".fnstart must precede .pad directive");
+  if (HandlerDataLoc.isValid())
+    return Error(L, ".pad must precede .handlerdata directive");
+
+  // Parse the offset
+  if (Parser.getTok().isNot(AsmToken::Hash) &&
+      Parser.getTok().isNot(AsmToken::Dollar)) {
+    return Error(Parser.getTok().getLoc(), "'#' expected");
+  }
+  Parser.Lex(); // skip hash token.
+
+  const MCExpr *OffsetExpr;
+  SMLoc ExLoc = Parser.getTok().getLoc();
+  SMLoc EndLoc;
+  if (getParser().parseExpression(OffsetExpr, EndLoc))
+    return Error(ExLoc, "malformed pad offset");
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(OffsetExpr);
+  if (!CE)
+    return Error(ExLoc, "pad offset must be an immediate");
+
+  getTargetStreamer().emitPad(CE->getValue());
+  return false;
+}
+
+/// parseDirectiveRegSave
+///  ::= .save  { registers }
+///  ::= .vsave { registers }
+bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
+  // Check the ordering of unwind directives
+  if (!FnStartLoc.isValid())
+    return Error(L, ".fnstart must precede .save or .vsave directives");
+  if (HandlerDataLoc.isValid())
+    return Error(L, ".save or .vsave must precede .handlerdata directive");
+
+  // RAII object to make sure parsed operands are deleted.
+  struct CleanupObject {
+    SmallVector<MCParsedAsmOperand *, 1> Operands;
+    ~CleanupObject() {
+      for (unsigned I = 0, E = Operands.size(); I != E; ++I)
+        delete Operands[I];
+    }
+  } CO;
+
+  // Parse the register list
+  if (parseRegisterList(CO.Operands))
+    return true;
+  ARMOperand *Op = (ARMOperand*)CO.Operands[0];
+  if (!IsVector && !Op->isRegList())
+    return Error(L, ".save expects GPR registers");
+  if (IsVector && !Op->isDPRRegList())
+    return Error(L, ".vsave expects DPR registers");
+
+  getTargetStreamer().emitRegSave(Op->getRegList(), IsVector);
+  return false;
 }
 
 /// Force static initialization.
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index b832508..f271a93 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -49,7 +49,7 @@ add_llvm_target(ARMCodeGen
   Thumb2SizeReduction.cpp
   )
 
-add_dependencies(LLVMARMCodeGen intrinsics_gen)
+add_dependencies(LLVMARMCodeGen ARMCommonTableGen intrinsics_gen)
 
 # workaround for hanging compilation on MSVC9, 10
 if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 )
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index ac937f3..9c7988f 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -65,7 +65,7 @@ namespace {
       void setITState(char Firstcond, char Mask) {
         // (3 - the number of trailing zeros) is the number of then / else.
         unsigned CondBit0 = Firstcond & 1;
-        unsigned NumTZ = CountTrailingZeros_32(Mask);
+        unsigned NumTZ = countTrailingZeros<uint8_t>(Mask);
         unsigned char CCBits = static_cast<unsigned char>(Firstcond & 0xf);
         assert(NumTZ <= 3 && "Invalid IT mask!");
         // push condition codes onto the stack the correct order for the pops
@@ -156,12 +156,17 @@ static DecodeStatus DecodeGPRRegisterClass(MCInst &Inst, unsigned RegNo,
 static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
                                                unsigned RegNo, uint64_t Address,
                                                const void *Decoder);
+static DecodeStatus DecodeGPRwithAPSRRegisterClass(MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
+                                               const void *Decoder);
 static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder);
 static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder);
 static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -236,6 +241,14 @@ static DecodeStatus DecodeBranchImmInstruction(MCInst &Inst,unsigned Insn,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeAddrMode6Operand(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Val,
@@ -268,6 +281,8 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Insn,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
@@ -308,8 +323,6 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
                                 uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
                                 uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const void *Decoder);
 
 
 static DecodeStatus DecodeThumbAddSpecialReg(MCInst &Inst, uint16_t Insn,
@@ -332,6 +345,14 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
 static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
@@ -348,6 +369,8 @@ static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
                                 uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeThumbCPS(MCInst &Inst, uint16_t Insn,
                                 uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Insn,
                                 uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
@@ -402,7 +425,7 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
          "Asked to disassemble an ARM instruction but Subtarget is in Thumb mode!");
 
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+  if (Region.readBytes(Address, 4, bytes) == -1) {
     Size = 0;
     return MCDisassembler::Fail;
   }
@@ -431,6 +454,13 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   }
 
   MI.clear();
+  result = decodeInstruction(DecoderTableVFPV832, MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
   result = decodeInstruction(DecoderTableNEONData32, MI, insn, Address,
                              this, STI);
   if (result != MCDisassembler::Fail) {
@@ -467,7 +497,22 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   }
 
   MI.clear();
+  result = decodeInstruction(DecoderTablev8NEON32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
+  result = decodeInstruction(DecoderTablev8Crypto32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
 
+  MI.clear();
   Size = 0;
   return MCDisassembler::Fail;
 }
@@ -492,102 +537,9 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
                                      bool isBranch, uint64_t InstSize,
                                      MCInst &MI, const void *Decoder) {
   const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
-  LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
-  struct LLVMOpInfo1 SymbolicOp;
-  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
-  SymbolicOp.Value = Value;
-  void *DisInfo = Dis->getDisInfoBlock();
-
-  if (!getOpInfo ||
-      !getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
-    // Clear SymbolicOp.Value from above and also all other fields.
-    memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
-    LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
-    if (!SymbolLookUp)
-      return false;
-    uint64_t ReferenceType;
-    if (isBranch)
-       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
-    else
-       ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
-    const char *ReferenceName;
-    uint64_t SymbolValue = 0x00000000ffffffffULL & Value;
-    const char *Name = SymbolLookUp(DisInfo, SymbolValue, &ReferenceType,
-                                    Address, &ReferenceName);
-    if (Name) {
-      SymbolicOp.AddSymbol.Name = Name;
-      SymbolicOp.AddSymbol.Present = true;
-    }
-    // For branches always create an MCExpr so it gets printed as hex address.
-    else if (isBranch) {
-      SymbolicOp.Value = Value;
-    }
-    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
-      (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
-    if (!Name && !isBranch)
-      return false;
-  }
-
-  MCContext *Ctx = Dis->getMCContext();
-  const MCExpr *Add = NULL;
-  if (SymbolicOp.AddSymbol.Present) {
-    if (SymbolicOp.AddSymbol.Name) {
-      StringRef Name(SymbolicOp.AddSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Sub = NULL;
-  if (SymbolicOp.SubtractSymbol.Present) {
-    if (SymbolicOp.SubtractSymbol.Name) {
-      StringRef Name(SymbolicOp.SubtractSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Off = NULL;
-  if (SymbolicOp.Value != 0)
-    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
-  const MCExpr *Expr;
-  if (Sub) {
-    const MCExpr *LHS;
-    if (Add)
-      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
-    else
-      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
-    else
-      Expr = LHS;
-  } else if (Add) {
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
-    else
-      Expr = Add;
-  } else {
-    if (Off != 0)
-      Expr = Off;
-    else
-      Expr = MCConstantExpr::Create(0, *Ctx);
-  }
-
-  if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_HI16)
-    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateUpper16(Expr, *Ctx)));
-  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_ARM_LO16)
-    MI.addOperand(MCOperand::CreateExpr(ARMMCExpr::CreateLower16(Expr, *Ctx)));
-  else if (SymbolicOp.VariantKind == LLVMDisassembler_VariantKind_None)
-    MI.addOperand(MCOperand::CreateExpr(Expr));
-  else
-    llvm_unreachable("bad SymbolicOp.VariantKind");
-
-  return true;
+  // FIXME: Does it make sense for value to be negative?
+  return Dis->tryAddingSymbolicOperand(MI, (uint32_t)Value, Address, isBranch,
+                                       /* Offset */ 0, InstSize);
 }
 
 /// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
@@ -602,17 +554,7 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
 static void tryAddingPcLoadReferenceComment(uint64_t Address, int Value,
                                             const void *Decoder) {
   const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
-  LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
-  if (SymbolLookUp) {
-    void *DisInfo = Dis->getDisInfoBlock();
-    uint64_t ReferenceType;
-    ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
-    const char *ReferenceName;
-    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
-    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr ||
-       ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
-      (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
-  }
+  Dis->tryAddingPcLoadReferenceComment(Value, Address);
 }
 
 // Thumb1 instructions don't have explicit S bits.  Rather, they
@@ -751,7 +693,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
          "Asked to disassemble in Thumb mode but Subtarget is in ARM mode!");
 
   // We want to read exactly 2 bytes of data.
-  if (Region.readBytes(Address, 2, (uint8_t*)bytes, NULL) == -1) {
+  if (Region.readBytes(Address, 2, bytes) == -1) {
     Size = 0;
     return MCDisassembler::Fail;
   }
@@ -803,7 +745,7 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   }
 
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, (uint8_t*)bytes, NULL) == -1) {
+  if (Region.readBytes(Address, 4, bytes) == -1) {
     Size = 0;
     return MCDisassembler::Fail;
   }
@@ -832,23 +774,34 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     return result;
   }
 
-  MI.clear();
-  result = decodeInstruction(DecoderTableVFP32, MI, insn32, Address, this, STI);
-  if (result != MCDisassembler::Fail) {
-    Size = 4;
-    UpdateThumbVFPPredicate(MI);
-    return result;
+  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+    MI.clear();
+    result = decodeInstruction(DecoderTableVFP32, MI, insn32, Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      UpdateThumbVFPPredicate(MI);
+      return result;
+    }
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableNEONDup32, MI, insn32, Address,
-                             this, STI);
+  result = decodeInstruction(DecoderTableVFPV832, MI, insn32, Address, this, STI);
   if (result != MCDisassembler::Fail) {
     Size = 4;
-    Check(result, AddThumbPredicate(MI));
     return result;
   }
 
+  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+    MI.clear();
+    result = decodeInstruction(DecoderTableNEONDup32, MI, insn32, Address,
+                               this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
+    }
+  }
+
   if (fieldFromInstruction(insn32, 24, 8) == 0xF9) {
     MI.clear();
     uint32_t NEONLdStInsn = insn32;
@@ -876,8 +829,31 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
       Check(result, AddThumbPredicate(MI));
       return result;
     }
+
+    MI.clear();
+    uint32_t NEONCryptoInsn = insn32;
+    NEONCryptoInsn &= 0xF0FFFFFF; // Clear bits 27-24
+    NEONCryptoInsn |= (NEONCryptoInsn & 0x10000000) >> 4; // Move bit 28 to bit 24
+    NEONCryptoInsn |= 0x12000000; // Set bits 28 and 25
+    result = decodeInstruction(DecoderTablev8Crypto32, MI, NEONCryptoInsn,
+                               Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      return result;
+    }
+
+    MI.clear();
+    uint32_t NEONv8Insn = insn32;
+    NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+    result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+                               this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      return result;
+    }
   }
 
+  MI.clear();
   Size = 0;
   return MCDisassembler::Fail;
 }
@@ -920,6 +896,21 @@ DecodeGPRnopcRegisterClass(MCInst &Inst, unsigned RegNo,
   return S;
 }
 
+static DecodeStatus
+DecodeGPRwithAPSRRegisterClass(MCInst &Inst, unsigned RegNo,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo == 15)
+  {
+    Inst.addOperand(MCOperand::CreateReg(ARM::APSR_NZCV));
+    return MCDisassembler::Success;
+  }
+
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+  return S;
+}
+
 static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder) {
   if (RegNo > 7)
@@ -927,6 +918,26 @@ static DecodeStatus DecodetGPRRegisterClass(MCInst &Inst, unsigned RegNo,
   return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
 }
 
+static const uint16_t GPRPairDecoderTable[] = {
+  ARM::R0_R1, ARM::R2_R3,   ARM::R4_R5,  ARM::R6_R7,
+  ARM::R8_R9, ARM::R10_R11, ARM::R12_SP
+};
+
+static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
+                                   uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  if (RegNo > 13)
+    return MCDisassembler::Fail;
+
+  if ((RegNo & 1) || RegNo == 0xe)
+     S = MCDisassembler::SoftFail;
+
+  unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
+  Inst.addOperand(MCOperand::CreateReg(RegisterPair));
+  return S;
+}
+
 static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder) {
   unsigned Register = 0;
@@ -959,8 +970,11 @@ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder) {
-  if (RegNo == 13 || RegNo == 15) return MCDisassembler::Fail;
-  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+  DecodeStatus S = MCDisassembler::Success;
+  if (RegNo == 13 || RegNo == 15)
+    S = MCDisassembler::SoftFail;
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+  return S;
 }
 
 static const uint16_t SPRDecoderTable[] = {
@@ -1030,7 +1044,7 @@ static const uint16_t QPRDecoderTable[] = {
 
 static DecodeStatus DecodeQPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
+  if (RegNo > 31 || (RegNo & 1) != 0)
     return MCDisassembler::Fail;
   RegNo >>= 1;
 
@@ -1189,30 +1203,32 @@ static DecodeStatus DecodeRegListOperand(MCInst &Inst, unsigned Val,
                                  uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
 
-  bool writebackLoad = false;
-  unsigned writebackReg = 0;
+  bool NeedDisjointWriteback = false;
+  unsigned WritebackReg = 0;
   switch (Inst.getOpcode()) {
-    default:
-      break;
-    case ARM::LDMIA_UPD:
-    case ARM::LDMDB_UPD:
-    case ARM::LDMIB_UPD:
-    case ARM::LDMDA_UPD:
-    case ARM::t2LDMIA_UPD:
-    case ARM::t2LDMDB_UPD:
-      writebackLoad = true;
-      writebackReg = Inst.getOperand(0).getReg();
-      break;
+  default:
+    break;
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    NeedDisjointWriteback = true;
+    WritebackReg = Inst.getOperand(0).getReg();
+    break;
   }
 
   // Empty register lists are not allowed.
-  if (CountPopulation_32(Val) == 0) return MCDisassembler::Fail;
+  if (Val == 0) return MCDisassembler::Fail;
   for (unsigned i = 0; i < 16; ++i) {
     if (Val & (1 << i)) {
       if (!Check(S, DecodeGPRRegisterClass(Inst, i, Address, Decoder)))
         return MCDisassembler::Fail;
       // Writeback not allowed if Rn is in the target list.
-      if (writebackLoad && writebackReg == Inst.end()[-1].getReg())
+      if (NeedDisjointWriteback && WritebackReg == Inst.end()[-1].getReg())
         Check(S, MCDisassembler::SoftFail);
     }
   }
@@ -1227,6 +1243,13 @@ static DecodeStatus DecodeSPRRegListOperand(MCInst &Inst, unsigned Val,
   unsigned Vd = fieldFromInstruction(Val, 8, 5);
   unsigned regs = fieldFromInstruction(Val, 0, 8);
 
+  // In case of unpredictable encoding, tweak the operands.
+  if (regs == 0 || (Vd + regs) > 32) {
+    regs = Vd + regs > 32 ? 32 - Vd : regs;
+    regs = std::max( 1u, regs);
+    S = MCDisassembler::SoftFail;
+  }
+
   if (!Check(S, DecodeSPRRegisterClass(Inst, Vd, Address, Decoder)))
     return MCDisassembler::Fail;
   for (unsigned i = 0; i < (regs - 1); ++i) {
@@ -1242,9 +1265,15 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val,
   DecodeStatus S = MCDisassembler::Success;
 
   unsigned Vd = fieldFromInstruction(Val, 8, 5);
-  unsigned regs = fieldFromInstruction(Val, 0, 8);
+  unsigned regs = fieldFromInstruction(Val, 1, 7);
 
-  regs = regs >> 1;
+  // In case of unpredictable encoding, tweak the operands.
+  if (regs == 0 || regs > 16 || (Vd + regs) > 32) {
+    regs = Vd + regs > 32 ? 32 - Vd : regs;
+    regs = std::max( 1u, regs);
+    regs = std::min(16u, regs);
+    S = MCDisassembler::SoftFail;
+  }
 
   if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
       return MCDisassembler::Fail;
@@ -1334,6 +1363,11 @@ static DecodeStatus DecodeCopMemInstruction(MCInst &Inst, unsigned Insn,
       break;
   }
 
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  if ((featureBits & ARM::HasV8Ops) && (coproc != 14))
+    return MCDisassembler::Fail;
+
   Inst.addOperand(MCOperand::CreateImm(coproc));
   Inst.addOperand(MCOperand::CreateImm(CRd));
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
@@ -1797,6 +1831,29 @@ static DecodeStatus DecodeRFEInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
+static DecodeStatus DecodeQADDInstruction(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rd = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rm = fieldFromInstruction(Insn, 0, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned pred = fieldFromInstruction(Insn, 28, 4);
+
+  if (pred == 0xF)
+    return DecodeCPSInstruction(Inst, Insn, Address, Decoder);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodePredicateOperand(Inst, pred, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
 static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
                                   unsigned Insn,
                                   uint64_t Address, const void *Decoder) {
@@ -1807,6 +1864,7 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
   unsigned reglist = fieldFromInstruction(Insn, 0, 16);
 
   if (pred == 0xF) {
+    // Ambiguous with RFE and SRS
     switch (Inst.getOpcode()) {
       case ARM::LDMDA:
         Inst.setOpcode(ARM::RFEDA);
@@ -1857,11 +1915,16 @@ static DecodeStatus DecodeMemMultipleWritebackInstruction(MCInst &Inst,
         Inst.setOpcode(ARM::SRSIB_UPD);
         break;
       default:
-        if (!Check(S, MCDisassembler::Fail)) return MCDisassembler::Fail;
+        return MCDisassembler::Fail;
     }
 
     // For stores (which become SRS's, the only operand is the mode.
     if (fieldFromInstruction(Insn, 20, 1) == 0) {
+      // Check SRS encoding constraints
+      if (!(fieldFromInstruction(Insn, 22, 1) == 1 &&
+            fieldFromInstruction(Insn, 20, 1) == 0))
+        return MCDisassembler::Fail;
+
       Inst.addOperand(
           MCOperand::CreateImm(fieldFromInstruction(Insn, 0, 4)));
       return S;
@@ -1891,6 +1954,13 @@ static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn,
 
   DecodeStatus S = MCDisassembler::Success;
 
+  // This decoder is called from multiple location that do not check
+  // the full encoding is valid before they do.
+  if (fieldFromInstruction(Insn, 5, 1) != 0 ||
+      fieldFromInstruction(Insn, 16, 1) != 0 ||
+      fieldFromInstruction(Insn, 20, 8) != 0x10)
+    return MCDisassembler::Fail;
+
   // imod == '01' --> UNPREDICTABLE
   // NOTE: Even though this is technically UNPREDICTABLE, we choose to
   // return failure here.  The '01' imod value is unprintable, so there's
@@ -2106,7 +2176,7 @@ DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
   unsigned imm10 = fieldFromInstruction(Insn, 16, 10);
   unsigned imm11 = fieldFromInstruction(Insn, 0, 11);
   unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11;
-  int imm32 = SignExtend32<24>(tmp << 1);
+  int imm32 = SignExtend32<25>(tmp << 1);
   if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
                                 true, 4, Inst, Decoder))
     Inst.addOperand(MCOperand::CreateImm(imm32));
@@ -2432,6 +2502,57 @@ static DecodeStatus DecodeVLDInstruction(MCInst &Inst, unsigned Insn,
   return S;
 }
 
+static DecodeStatus DecodeVLDST1Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 6 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 7 && (align & 2)) return MCDisassembler::Fail;
+  if (type == 10 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST2Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned type = fieldFromInstruction(Insn, 8, 4);
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (type == 8 && align == 3) return MCDisassembler::Fail;
+  if (type == 9 && align == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST3Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned align = fieldFromInstruction(Insn, 4, 2);
+  if (align & 2) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeVLDST4Instruction(MCInst &Inst, unsigned Insn,
+                                   uint64_t Address, const void *Decoder) {
+  unsigned size = fieldFromInstruction(Insn, 6, 2);
+  if (size == 3) return MCDisassembler::Fail;
+
+  unsigned load = fieldFromInstruction(Insn, 21, 1);
+  return load ? DecodeVLDInstruction(Inst, Insn, Address, Decoder)
+              : DecodeVSTInstruction(Inst, Insn, Address, Decoder);
+}
+
 static DecodeStatus DecodeVSTInstruction(MCInst &Inst, unsigned Insn,
                                  uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
@@ -3115,6 +3236,17 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
   unsigned Rm = fieldFromInstruction(Val, 2, 4);
   unsigned imm = fieldFromInstruction(Val, 0, 2);
 
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRHs:
+  case ARM::t2STRBs:
+  case ARM::t2STRs:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
     return MCDisassembler::Fail;
   if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
@@ -3128,53 +3260,282 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
                               uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
 
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRBs:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHs:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHs:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRSBs:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRs:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2PLDs:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIs:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHs:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHs:
+      // FIXME: this instruction is only available with MP extensions,
+      // this should be checked first but we don't have access to the
+      // feature bits here.
+      Inst.setOpcode(ARM::t2PLDWs);
+      break;
+    default:
+      break;
+    }
+  }
+
   switch (Inst.getOpcode()) {
     case ARM::t2PLDs:
     case ARM::t2PLDWs:
     case ARM::t2PLIs:
       break;
-    default: {
-      unsigned Rt = fieldFromInstruction(Insn, 12, 4);
-      if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    default:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+        return MCDisassembler::Fail;
+  }
+
+  unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
+  addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
+  addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
+  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
     return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 9, 1);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (U << 8);
+  imm |= (Rn << 9);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRi8:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBi8:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi8:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRHi8:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi8:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2PLDi8:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi8:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi8:
+      return MCDisassembler::Fail;
+    default:
+      break;
     }
   }
 
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi8:
+  case ARM::t2PLIi8:
+  case ARM::t2PLDWi8:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
   unsigned Rn = fieldFromInstruction(Insn, 16, 4);
-  if (Rn == 0xF) {
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= (Rn << 13);
+
+  if (Rn == 15) {
     switch (Inst.getOpcode()) {
-      case ARM::t2LDRBs:
-        Inst.setOpcode(ARM::t2LDRBpci);
-        break;
-      case ARM::t2LDRHs:
-        Inst.setOpcode(ARM::t2LDRHpci);
-        break;
-      case ARM::t2LDRSHs:
-        Inst.setOpcode(ARM::t2LDRSHpci);
-        break;
-      case ARM::t2LDRSBs:
-        Inst.setOpcode(ARM::t2LDRSBpci);
+    case ARM::t2LDRi12:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi12:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRBi12:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi12:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2PLDi12:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi12:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi12:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2PLDi12);
+      break;
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi12:
+  case ARM::t2PLDWi12:
+  case ARM::t2PLIi12:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm12(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (Rn << 9);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBT:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHT:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSBT:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSHT:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  int imm = fieldFromInstruction(Insn, 0, 12);
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+      case ARM::t2LDRBpci:
+      case ARM::t2LDRHpci:
+        Inst.setOpcode(ARM::t2PLDpci);
         break;
-      case ARM::t2PLDs:
-        Inst.setOpcode(ARM::t2PLDi12);
-        Inst.addOperand(MCOperand::CreateReg(ARM::PC));
+      case ARM::t2LDRSBpci:
+        Inst.setOpcode(ARM::t2PLIpci);
         break;
-      default:
+      case ARM::t2LDRSHpci:
         return MCDisassembler::Fail;
+      default:
+        break;
     }
+  }
 
-    int imm = fieldFromInstruction(Insn, 0, 12);
-    if (!fieldFromInstruction(Insn, 23, 1)) imm *= -1;
-    Inst.addOperand(MCOperand::CreateImm(imm));
-
-    return S;
+  switch(Inst.getOpcode()) {
+  case ARM::t2PLDpci:
+  case ARM::t2PLIpci:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
   }
 
-  unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
-  addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
-  addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
-  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
-    return MCDisassembler::Fail;
+  if (!U) {
+    // Special case for #-0.
+    if (imm == 0)
+      imm = INT32_MIN;
+    else
+      imm = -imm;
+  }
+  Inst.addOperand(MCOperand::CreateImm(imm));
 
   return S;
 }
@@ -3243,6 +3604,21 @@ static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
   unsigned Rn = fieldFromInstruction(Val, 9, 4);
   unsigned imm = fieldFromInstruction(Val, 0, 9);
 
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRT:
+  case ARM::t2STRBT:
+  case ARM::t2STRHT:
+  case ARM::t2STRi8:
+  case ARM::t2STRHi8:
+  case ARM::t2STRBi8:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
   // Some instructions always use an additive offset.
   switch (Inst.getOpcode()) {
     case ARM::t2LDRT:
@@ -3278,6 +3654,37 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
   addr |= Rn << 9;
   unsigned load = fieldFromInstruction(Insn, 20, 1);
 
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDR_PRE:
+    case ARM::t2LDR_POST:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRB_PRE:
+    case ARM::t2LDRB_POST:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRH_PRE:
+    case ARM::t2LDRH_POST:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSB_PRE:
+    case ARM::t2LDRSB_POST:
+      if (Rt == 15)
+        Inst.setOpcode(ARM::t2PLIpci);
+      else
+        Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSH_PRE:
+    case ARM::t2LDRSH_POST:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
   if (!load) {
     if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
       return MCDisassembler::Fail;
@@ -3304,6 +3711,17 @@ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
   unsigned Rn = fieldFromInstruction(Val, 13, 4);
   unsigned imm = fieldFromInstruction(Val, 0, 12);
 
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
     return MCDisassembler::Fail;
   Inst.addOperand(MCOperand::CreateImm(imm));
@@ -3401,6 +3819,11 @@ static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Val,
   if (Val == 0xA || Val == 0xB)
     return MCDisassembler::Fail;
 
+  uint64_t featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo()
+                                                          .getFeatureBits();
+  if ((featureBits & ARM::HasV8Ops) && !(Val == 14 || Val == 15))
+    return MCDisassembler::Fail;
+
   Inst.addOperand(MCOperand::CreateImm(Val));
   return MCDisassembler::Success;
 }
@@ -3536,6 +3959,15 @@ static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Val,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeInstSyncBarrierOption(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  if (Val & ~0xf)
+    return MCDisassembler::Fail;
+
+  Inst.addOperand(MCOperand::CreateImm(Val));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeMSRMask(MCInst &Inst, unsigned Val,
                           uint64_t Address, const void *Decoder) {
   if (!Val) return MCDisassembler::Fail;
@@ -3551,11 +3983,10 @@ static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
   unsigned Rn = fieldFromInstruction(Insn, 16, 4);
   unsigned pred = fieldFromInstruction(Insn, 28, 4);
 
-  if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
+  if (Rn == 0xF)
+    S = MCDisassembler::SoftFail;
 
-  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
-    return MCDisassembler::Fail;
-  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+  if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
     return MCDisassembler::Fail;
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
     return MCDisassembler::Fail;
@@ -3565,7 +3996,6 @@ static DecodeStatus DecodeDoubleRegLoad(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-
 static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
                                          uint64_t Address, const void *Decoder){
   DecodeStatus S = MCDisassembler::Success;
@@ -3578,12 +4008,10 @@ static DecodeStatus DecodeDoubleRegStore(MCInst &Inst, unsigned Insn,
   if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rd, Address, Decoder)))
     return MCDisassembler::Fail;
 
-  if ((Rt & 1) || Rt == 0xE || Rn == 0xF) return MCDisassembler::Fail;
-  if (Rd == Rn || Rd == Rt || Rd == Rt+1) return MCDisassembler::Fail;
+  if (Rn == 0xF || Rd == Rn || Rd == Rt || Rd == Rt+1)
+    S = MCDisassembler::SoftFail;
 
-  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
-    return MCDisassembler::Fail;
-  if (!Check(S, DecodeGPRRegisterClass(Inst, Rt+1, Address, Decoder)))
+  if (!Check(S, DecodeGPRPairRegisterClass(Inst, Rt, Address, Decoder)))
     return MCDisassembler::Fail;
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
     return MCDisassembler::Fail;
@@ -4310,10 +4738,8 @@ static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
     S = MCDisassembler::SoftFail;
   }
 
-  if (mask == 0x0) {
-    mask |= 0x8;
-    S = MCDisassembler::SoftFail;
-  }
+  if (mask == 0x0)
+    return MCDisassembler::Fail;
 
   Inst.addOperand(MCOperand::CreateImm(pred));
   Inst.addOperand(MCOperand::CreateImm(mask));
@@ -4453,16 +4879,18 @@ static DecodeStatus DecodeVCVTD(MCInst &Inst, unsigned Insn,
   Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
   unsigned imm = fieldFromInstruction(Insn, 16, 6);
   unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
 
   DecodeStatus S = MCDisassembler::Success;
 
   // VMOVv2f32 is ambiguous with these decodings.
   if (!(imm & 0x38) && cmode == 0xF) {
+    if (op == 1) return MCDisassembler::Fail;
     Inst.setOpcode(ARM::VMOVv2f32);
     return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
   }
 
-  if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail);
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
 
   if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder)))
     return MCDisassembler::Fail;
@@ -4481,16 +4909,18 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
   Vm |= (fieldFromInstruction(Insn, 5, 1) << 4);
   unsigned imm = fieldFromInstruction(Insn, 16, 6);
   unsigned cmode = fieldFromInstruction(Insn, 8, 4);
+  unsigned op = fieldFromInstruction(Insn, 5, 1);
 
   DecodeStatus S = MCDisassembler::Success;
 
   // VMOVv4f32 is ambiguous with these decodings.
   if (!(imm & 0x38) && cmode == 0xF) {
+    if (op == 1) return MCDisassembler::Fail;
     Inst.setOpcode(ARM::VMOVv4f32);
     return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder);
   }
 
-  if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail);
+  if (!(imm & 0x20)) return MCDisassembler::Fail;
 
   if (!Check(S, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder)))
     return MCDisassembler::Fail;
@@ -4501,15 +4931,6 @@ static DecodeStatus DecodeVCVTQ(MCInst &Inst, unsigned Insn,
   return S;
 }
 
-static DecodeStatus DecodeImm0_4(MCInst &Inst, unsigned Insn, uint64_t Address,
-                                 const void *Decoder)
-{
-  unsigned Imm = fieldFromInstruction(Insn, 0, 3);
-  if (Imm > 4) return MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::CreateImm(Imm));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeLDR(MCInst &Inst, unsigned Val,
                                 uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 3bcd083..f897028 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -76,14 +76,23 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
                                StringRef Annot) {
   unsigned Opcode = MI->getOpcode();
 
+  switch(Opcode) {
+
   // Check for HINT instructions w/ canonical names.
-  if (Opcode == ARM::HINT || Opcode == ARM::t2HINT) {
+  case ARM::HINT:
+  case ARM::tHINT:
+  case ARM::t2HINT:
     switch (MI->getOperand(0).getImm()) {
     case 0: O << "\tnop"; break;
     case 1: O << "\tyield"; break;
     case 2: O << "\twfe"; break;
     case 3: O << "\twfi"; break;
     case 4: O << "\tsev"; break;
+    case 5:
+      if ((getAvailableFeatures() & ARM::HasV8Ops)) {
+        O << "\tsevl";
+        break;
+      } // Fallthrough for non-v8
     default:
       // Anything else should just print normally.
       printInstruction(MI, O);
@@ -95,10 +104,9 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
       O << ".w";
     printAnnotation(O, Annot);
     return;
-  }
 
   // Check for MOVs and print canonical forms, instead.
-  if (Opcode == ARM::MOVsr) {
+  case ARM::MOVsr: {
     // FIXME: Thumb variants?
     const MCOperand &Dst = MI->getOperand(0);
     const MCOperand &MO1 = MI->getOperand(1);
@@ -121,7 +129,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     return;
   }
 
-  if (Opcode == ARM::MOVsi) {
+  case ARM::MOVsi: {
     // FIXME: Thumb variants?
     const MCOperand &Dst = MI->getOperand(0);
     const MCOperand &MO1 = MI->getOperand(1);
@@ -149,81 +157,91 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     return;
   }
 
-
   // A8.6.123 PUSH
-  if ((Opcode == ARM::STMDB_UPD || Opcode == ARM::t2STMDB_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP &&
-      MI->getNumOperands() > 5) {
-    // Should only print PUSH if there are at least two registers in the list.
-    O << '\t' << "push";
-    printPredicateOperand(MI, 2, O);
-    if (Opcode == ARM::t2STMDB_UPD)
-      O << ".w";
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-  if (Opcode == ARM::STR_PRE_IMM && MI->getOperand(2).getReg() == ARM::SP &&
-      MI->getOperand(3).getImm() == -4) {
-    O << '\t' << "push";
-    printPredicateOperand(MI, 4, O);
-    O << "\t{";
-    printRegName(O, MI->getOperand(1).getReg());
-    O << "}";
-    printAnnotation(O, Annot);
-    return;
-  }
+  case ARM::STMDB_UPD:
+  case ARM::t2STMDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print PUSH if there are at least two registers in the list.
+      O << '\t' << "push";
+      printPredicateOperand(MI, 2, O);
+      if (Opcode == ARM::t2STMDB_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::STR_PRE_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(3).getImm() == -4) {
+      O << '\t' << "push";
+      printPredicateOperand(MI, 4, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(1).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
   // A8.6.122 POP
-  if ((Opcode == ARM::LDMIA_UPD || Opcode == ARM::t2LDMIA_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP &&
-      MI->getNumOperands() > 5) {
-    // Should only print POP if there are at least two registers in the list.
-    O << '\t' << "pop";
-    printPredicateOperand(MI, 2, O);
-    if (Opcode == ARM::t2LDMIA_UPD)
-      O << ".w";
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-  if (Opcode == ARM::LDR_POST_IMM && MI->getOperand(2).getReg() == ARM::SP &&
-      MI->getOperand(4).getImm() == 4) {
-    O << '\t' << "pop";
-    printPredicateOperand(MI, 5, O);
-    O << "\t{";
-    printRegName(O, MI->getOperand(0).getReg());
-    O << "}";
-    printAnnotation(O, Annot);
-    return;
-  }
-
+  case ARM::LDMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP && MI->getNumOperands() > 5) {
+      // Should only print POP if there are at least two registers in the list.
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 2, O);
+      if (Opcode == ARM::t2LDMIA_UPD)
+        O << ".w";
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
+
+  case ARM::LDR_POST_IMM:
+    if (MI->getOperand(2).getReg() == ARM::SP &&
+        MI->getOperand(4).getImm() == 4) {
+      O << '\t' << "pop";
+      printPredicateOperand(MI, 5, O);
+      O << "\t{";
+      printRegName(O, MI->getOperand(0).getReg());
+      O << "}";
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
   // A8.6.355 VPUSH
-  if ((Opcode == ARM::VSTMSDB_UPD || Opcode == ARM::VSTMDDB_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP) {
-    O << '\t' << "vpush";
-    printPredicateOperand(MI, 2, O);
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
+  case ARM::VSTMSDB_UPD:
+  case ARM::VSTMDDB_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpush";
+      printPredicateOperand(MI, 2, O);
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
   // A8.6.354 VPOP
-  if ((Opcode == ARM::VLDMSIA_UPD || Opcode == ARM::VLDMDIA_UPD) &&
-      MI->getOperand(0).getReg() == ARM::SP) {
-    O << '\t' << "vpop";
-    printPredicateOperand(MI, 2, O);
-    O << '\t';
-    printRegisterList(MI, 4, O);
-    printAnnotation(O, Annot);
-    return;
-  }
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMDIA_UPD:
+    if (MI->getOperand(0).getReg() == ARM::SP) {
+      O << '\t' << "vpop";
+      printPredicateOperand(MI, 2, O);
+      O << '\t';
+      printRegisterList(MI, 4, O);
+      printAnnotation(O, Annot);
+      return;
+    } else
+      break;
 
-  if (Opcode == ARM::tLDMIA) {
+  case ARM::tLDMIA: {
     bool Writeback = true;
     unsigned BaseReg = MI->getOperand(0).getReg();
     for (unsigned i = 3; i < MI->getNumOperands(); ++i) {
@@ -243,24 +261,16 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     return;
   }
 
-  // Thumb1 NOP
-  if (Opcode == ARM::tMOVr && MI->getOperand(0).getReg() == ARM::R8 &&
-      MI->getOperand(1).getReg() == ARM::R8) {
-    O << "\tnop";
-    printPredicateOperand(MI, 2, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-
   // Combine 2 GPRs from disassember into a GPRPair to match with instr def.
   // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
   // a single GPRPair reg operand is used in the .td file to replace the two
   // GPRs. However, when decoding them, the two GRPs cannot be automatically
   // expressed as a GPRPair, so we have to manually merge them.
   // FIXME: We would really like to be able to tablegen'erate this.
-  if (Opcode == ARM::LDREXD || Opcode == ARM::STREXD) {
+  case ARM::LDREXD: case ARM::STREXD:
+  case ARM::LDAEXD: case ARM::STLEXD:
     const MCRegisterClass& MRC = MRI.getRegClass(ARM::GPRRegClassID);
-    bool isStore = Opcode == ARM::STREXD;
+    bool isStore = Opcode == ARM::STREXD || Opcode == ARM::STLEXD;
     unsigned Reg = MI->getOperand(isStore ? 1 : 0).getReg();
     if (MRC.contains(Reg)) {
       MCInst NewMI;
@@ -315,15 +325,29 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
 void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
                                                raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
-  if (MO1.isExpr())
+  if (MO1.isExpr()) {
     O << *MO1.getExpr();
-  else if (MO1.isImm()) {
-    O << markup("<mem:") << "[pc, "
-      << markup("<imm:") << "#" << formatImm(MO1.getImm())
-      << markup(">]>", "]");
+    return;
   }
-  else
-    llvm_unreachable("Unknown LDR label operand?");
+
+  O << markup("<mem:") << "[pc, ";
+
+  int32_t OffImm = (int32_t)MO1.getImm();
+  bool isSub = OffImm < 0;
+
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << markup("<imm:")
+      << "#-" << formatImm(-OffImm)
+      << markup(">");
+  } else {
+    O << markup("<imm:")
+      << "#" << formatImm(OffImm)
+      << markup(">");
+  }
+  O << "]" << markup(">");
 }
 
 // so_reg is a 4-operand unit corresponding to register forms of the A5.1
@@ -660,8 +684,8 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
                                                     raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNum);
   uint32_t v = ~MO.getImm();
-  int32_t lsb = CountTrailingZeros_32(v);
-  int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;
+  int32_t lsb = countTrailingZeros(v);
+  int32_t width = (32 - countLeadingZeros (v)) - lsb;
   assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
   O << markup("<imm:") << '#' << lsb << markup(">")
     << ", "
@@ -671,7 +695,13 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
 void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
                                      raw_ostream &O) {
   unsigned val = MI->getOperand(OpNum).getImm();
-  O << ARM_MB::MemBOptToString(val);
+  O << ARM_MB::MemBOptToString(val, (getAvailableFeatures() & ARM::HasV8Ops));
+}
+
+void ARMInstPrinter::printInstSyncBOption(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned val = MI->getOperand(OpNum).getImm();
+  O << ARM_ISB::InstSyncBOptToString(val);
 }
 
 void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
@@ -889,6 +919,7 @@ void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
   llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
 }
 
+template<unsigned scale>
 void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNum);
@@ -898,7 +929,7 @@ void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
     return;
   }
 
-  int32_t OffImm = (int32_t)MO.getImm();
+  int32_t OffImm = (int32_t)MO.getImm() << scale;
 
   O << markup("<imm:");
   if (OffImm == INT32_MIN)
@@ -931,7 +962,7 @@ void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
   unsigned Mask = MI->getOperand(OpNum).getImm();
   unsigned Firstcond = MI->getOperand(OpNum-1).getImm();
   unsigned CondBit0 = Firstcond & 1;
-  unsigned NumTZ = CountTrailingZeros_32(Mask);
+  unsigned NumTZ = countTrailingZeros(Mask);
   assert(NumTZ <= 3 && "Invalid IT mask!");
   for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
     bool T = ((Mask >> Pos) & 1) == CondBit0;
@@ -1059,6 +1090,7 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
   O << "]" << markup(">");
 }
 
+template<bool AlwaysPrintImm0>
 void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
                                                 unsigned OpNum,
                                                 raw_ostream &O) {
@@ -1069,22 +1101,25 @@ void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
   printRegName(O, MO1.getReg());
 
   int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
   // Don't print +0.
-  if (OffImm != 0)
-    O << ", ";
-  if (OffImm != 0 && UseMarkup)
-    O << "<imm:";
   if (OffImm == INT32_MIN)
-    O << "#-0";
-  else if (OffImm < 0)
-    O << "#-" << -OffImm;
-  else if (OffImm > 0)
-    O << "#" << OffImm;
-  if (OffImm != 0 && UseMarkup)
-    O << ">";
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:")
+      << "#-" << -OffImm
+      << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << OffImm
+      << markup(">");
+  }
   O << "]" << markup(">");
 }
 
+template<bool AlwaysPrintImm0>
 void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
                                                   unsigned OpNum,
                                                   raw_ostream &O) {
@@ -1100,22 +1135,24 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
   printRegName(O, MO1.getReg());
 
   int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
 
   assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
 
   // Don't print +0.
-  if (OffImm != 0)
-    O << ", ";
-  if (OffImm != 0 && UseMarkup)
-    O << "<imm:";
   if (OffImm == INT32_MIN)
-    O << "#-0";
-  else if (OffImm < 0)
-    O << "#-" << -OffImm;
-  else if (OffImm > 0)
-    O << "#" << OffImm;
-  if (OffImm != 0 && UseMarkup)
-    O << ">";
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:")
+      << "#-" << -OffImm
+      << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << OffImm
+      << markup(">");
+  }
   O << "]" << markup(">");
 }
 
@@ -1142,7 +1179,9 @@ void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
   const MCOperand &MO1 = MI->getOperand(OpNum);
   int32_t OffImm = (int32_t)MO1.getImm();
   O << ", " << markup("<imm:");
-  if (OffImm < 0)
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
     O << "#-" << -OffImm;
   else
     O << "#" << OffImm;
@@ -1157,19 +1196,14 @@ void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
 
   assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
 
-  // Don't print +0.
-  if (OffImm != 0)
-    O << ", ";
-  if (OffImm != 0 && UseMarkup)
-    O << "<imm:";
+  O << ", " << markup("<imm:");
   if (OffImm == INT32_MIN)
     O << "#-0";
   else if (OffImm < 0)
     O << "#-" << -OffImm;
-  else if (OffImm > 0)
+  else
     O << "#" << OffImm;
-  if (OffImm != 0 && UseMarkup)
-    O << ">";
+  O << markup(">");
 }
 
 void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 344104e..15ae8d1 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -71,10 +71,12 @@ public:
   void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum,
                                       raw_ostream &O);
   void printMemBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printInstSyncBOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printShiftImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
+  template <unsigned scale>
   void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
@@ -96,8 +98,10 @@ public:
   template<bool AlwaysPrintImm0>
   void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
                                  raw_ostream &O);
+  template<bool AlwaysPrintImm0>
   void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O);
+  template<bool AlwaysPrintImm0>
   void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
                                     raw_ostream &O);
   void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
index 62473b2..b6c85c2 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@@ -140,7 +140,7 @@ namespace ARM_AM {
     if ((Imm & ~255U) == 0) return 0;
 
     // Use CTZ to compute the rotate amount.
-    unsigned TZ = CountTrailingZeros_32(Imm);
+    unsigned TZ = countTrailingZeros(Imm);
 
     // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
     // not 9.
@@ -153,7 +153,7 @@ namespace ARM_AM {
     // For values like 0xF000000F, we should ignore the low 6 bits, then
     // retry the hunt.
     if (Imm & 63U) {
-      unsigned TZ2 = CountTrailingZeros_32(Imm & ~63U);
+      unsigned TZ2 = countTrailingZeros(Imm & ~63U);
       unsigned RotAmt2 = TZ2 & ~1;
       if ((rotr32(Imm, RotAmt2) & ~255U) == 0)
         return (32-RotAmt2)&31;  // HW rotates right, not left.
@@ -221,7 +221,7 @@ namespace ARM_AM {
     if ((Imm & ~255U) == 0) return 0;
 
     // Use CTZ to compute the shift amount.
-    return CountTrailingZeros_32(Imm);
+    return countTrailingZeros(Imm);
   }
 
   /// isThumbImmShiftedVal - Return true if the specified value can be obtained
@@ -240,7 +240,7 @@ namespace ARM_AM {
     if ((Imm & ~65535U) == 0) return 0;
 
     // Use CTZ to compute the shift amount.
-    return CountTrailingZeros_32(Imm);
+    return countTrailingZeros(Imm);
   }
 
   /// isThumbImm16ShiftedVal - Return true if the specified value can be
@@ -296,7 +296,7 @@ namespace ARM_AM {
   /// encoding is possible.
   /// See ARM Reference Manual A6.3.2.
   static inline int getT2SOImmValRotateVal(unsigned V) {
-    unsigned RotAmt = CountLeadingZeros_32(V);
+    unsigned RotAmt = countLeadingZeros(V);
     if (RotAmt >= 24)
       return -1;
 
@@ -328,7 +328,7 @@ namespace ARM_AM {
   static inline unsigned getT2SOImmValRotate(unsigned V) {
     if ((V & ~255U) == 0) return 0;
     // Use CTZ to compute the rotate amount.
-    unsigned RotAmt = CountTrailingZeros_32(V);
+    unsigned RotAmt = countTrailingZeros(V);
     return (32 - RotAmt) & 31;
   }
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index e66e985..5615b80 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -25,9 +25,9 @@
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
@@ -152,7 +152,7 @@ static unsigned getRelaxedOpcode(unsigned Op) {
   switch (Op) {
   default: return Op;
   case ARM::tBcc:       return ARM::t2Bcc;
-  case ARM::tLDRpciASM: return ARM::t2LDRpci;
+  case ARM::tLDRpci:    return ARM::t2LDRpci;
   case ARM::tADR:       return ARM::t2ADR;
   case ARM::tB:         return ARM::t2B;
   }
@@ -419,7 +419,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
      uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
      uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
      uint32_t imm11Bits = (offset & 0x000007FF);
- 
+
      uint32_t Binary = 0;
      uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
      uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
@@ -434,8 +434,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
      // four (see fixup_arm_thumb_cp). The 32-bit immediate value is encoded as
      //   imm32 = SignExtend(S:I1:I2:imm10H:imm10L:00)
      // where I1 = NOT(J1 ^ S) and I2 = NOT(J2 ^ S).
-     // The value is encoded into disjoint bit positions in the destination 
-     // opcode. x = unchanged, I = immediate value bit, S = sign extension bit, 
+     // The value is encoded into disjoint bit positions in the destination
+     // opcode. x = unchanged, I = immediate value bit, S = sign extension bit,
      // J = either J1 or J2 bit, 0 = zero.
      //
      //   BLX: xxxxxSIIIIIIIIII xxJxJIIIIIIIIII0
@@ -450,10 +450,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
      uint32_t J2Bit = (I2Bit ^ 0x1) ^ signBit;
      uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
      uint32_t imm10LBits = (offset & 0x3FF);
- 
+
      uint32_t Binary = 0;
      uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
-     uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) | 
+     uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
                            ((uint16_t)imm10LBits) << 1);
      Binary |= secondHalf << 16;
      Binary |= firstHalf;
@@ -640,16 +640,16 @@ public:
 // FIXME: This should be in a separate file.
 class DarwinARMAsmBackend : public ARMAsmBackend {
 public:
-  const object::mach::CPUSubtypeARM Subtype;
+  const MachO::CPUSubTypeARM Subtype;
   DarwinARMAsmBackend(const Target &T, const StringRef TT,
-                      object::mach::CPUSubtypeARM st)
+                      MachO::CPUSubTypeARM st)
     : ARMAsmBackend(T, TT), Subtype(st) {
       HasDataInCodeSupport = true;
     }
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
     return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
-                                     object::mach::CTM_ARM,
+                                     MachO::CPU_TYPE_ARM,
                                      Subtype);
   }
 
@@ -660,28 +660,33 @@ public:
 
 } // end anonymous namespace
 
-MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin()) {
-    object::mach::CPUSubtypeARM CS =
-      StringSwitch<object::mach::CPUSubtypeARM>(TheTriple.getArchName())
-      .Cases("armv4t", "thumbv4t", object::mach::CSARM_V4T)
-      .Cases("armv5e", "thumbv5e",object::mach::CSARM_V5TEJ)
-      .Cases("armv6", "thumbv6", object::mach::CSARM_V6)
-      .Cases("armv6m", "thumbv6m", object::mach::CSARM_V6M)
-      .Cases("armv7em", "thumbv7em", object::mach::CSARM_V7EM)
-      .Cases("armv7f", "thumbv7f", object::mach::CSARM_V7F)
-      .Cases("armv7k", "thumbv7k", object::mach::CSARM_V7K)
-      .Cases("armv7m", "thumbv7m", object::mach::CSARM_V7M)
-      .Cases("armv7s", "thumbv7s", object::mach::CSARM_V7S)
-      .Default(object::mach::CSARM_V7);
+    MachO::CPUSubTypeARM CS =
+      StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
+      .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
+      .Cases("armv5e", "thumbv5e", MachO::CPU_SUBTYPE_ARM_V5TEJ)
+      .Cases("armv6", "thumbv6", MachO::CPU_SUBTYPE_ARM_V6)
+      .Cases("armv6m", "thumbv6m", MachO::CPU_SUBTYPE_ARM_V6M)
+      .Cases("armv7em", "thumbv7em", MachO::CPU_SUBTYPE_ARM_V7EM)
+      .Cases("armv7f", "thumbv7f", MachO::CPU_SUBTYPE_ARM_V7F)
+      .Cases("armv7k", "thumbv7k", MachO::CPU_SUBTYPE_ARM_V7K)
+      .Cases("armv7m", "thumbv7m", MachO::CPU_SUBTYPE_ARM_V7M)
+      .Cases("armv7s", "thumbv7s", MachO::CPU_SUBTYPE_ARM_V7S)
+      .Default(MachO::CPU_SUBTYPE_ARM_V7);
 
     return new DarwinARMAsmBackend(T, TT, CS);
   }
 
-  if (TheTriple.isOSWindows())
+#if 0
+  // FIXME: Introduce yet another checker but assert(0).
+  if (TheTriple.isOSBinFormatCOFF())
     assert(0 && "Windows not supported on ARM");
+#endif
 
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
   return new ELFARMAsmBackend(T, TT, OSABI);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index de48a0e..af939fc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -121,46 +121,89 @@ namespace ARM_MB {
   // the option field for memory barrier operations.
   enum MemBOpt {
     RESERVED_0 = 0,
-    RESERVED_1 = 1,
+    OSHLD = 1,
     OSHST = 2,
     OSH   = 3,
     RESERVED_4 = 4,
-    RESERVED_5 = 5,
+    NSHLD = 5,
     NSHST = 6,
     NSH   = 7,
     RESERVED_8 = 8,
-    RESERVED_9 = 9,
+    ISHLD = 9,
     ISHST = 10,
     ISH   = 11,
     RESERVED_12 = 12,
-    RESERVED_13 = 13,
+    LD = 13,
     ST    = 14,
     SY    = 15
   };
 
-  inline static const char *MemBOptToString(unsigned val) {
+  inline static const char *MemBOptToString(unsigned val, bool HasV8) {
     switch (val) {
     default: llvm_unreachable("Unknown memory operation");
     case SY:    return "sy";
     case ST:    return "st";
-    case RESERVED_13: return "#0xd";
+    case LD: return HasV8 ? "ld" : "#0xd";
     case RESERVED_12: return "#0xc";
     case ISH:   return "ish";
     case ISHST: return "ishst";
-    case RESERVED_9: return "#0x9";
+    case ISHLD: return HasV8 ?  "ishld" : "#0x9";
     case RESERVED_8: return "#0x8";
     case NSH:   return "nsh";
     case NSHST: return "nshst";
-    case RESERVED_5: return "#0x5";
+    case NSHLD: return HasV8 ? "nshld" : "#0x5";
     case RESERVED_4: return "#0x4";
     case OSH:   return "osh";
     case OSHST: return "oshst";
-    case RESERVED_1: return "#0x1";
+    case OSHLD: return HasV8 ? "oshld" : "#0x1";
     case RESERVED_0: return "#0x0";
     }
   }
 } // namespace ARM_MB
 
+namespace ARM_ISB {
+  enum InstSyncBOpt {
+    RESERVED_0 = 0,
+    RESERVED_1 = 1,
+    RESERVED_2 = 2,
+    RESERVED_3 = 3,
+    RESERVED_4 = 4,
+    RESERVED_5 = 5,
+    RESERVED_6 = 6,
+    RESERVED_7 = 7,
+    RESERVED_8 = 8,
+    RESERVED_9 = 9,
+    RESERVED_10 = 10,
+    RESERVED_11 = 11,
+    RESERVED_12 = 12,
+    RESERVED_13 = 13,
+    RESERVED_14 = 14,
+    SY = 15
+  };
+
+  inline static const char *InstSyncBOptToString(unsigned val) {
+    switch (val) {
+      default: llvm_unreachable("Unkown memory operation");
+      case RESERVED_0:  return "#0x0";
+      case RESERVED_1:  return "#0x1";
+      case RESERVED_2:  return "#0x2";
+      case RESERVED_3:  return "#0x3";
+      case RESERVED_4:  return "#0x4";
+      case RESERVED_5:  return "#0x5";
+      case RESERVED_6:  return "#0x6";
+      case RESERVED_7:  return "#0x7";
+      case RESERVED_8:  return "#0x8";
+      case RESERVED_9:  return "#0x9";
+      case RESERVED_10: return "#0xa";
+      case RESERVED_11: return "#0xb";
+      case RESERVED_12: return "#0xc";
+      case RESERVED_13: return "#0xd";
+      case RESERVED_14: return "#0xe";
+      case SY:          return "sy";
+    }
+  }
+} // namespace ARM_ISB
+
 /// isARMLowRegister - Returns true if the register is a low register (r0-r7).
 ///
 static inline bool isARMLowRegister(unsigned Reg) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 6c3d247..471897d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -13,6 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "ARMBuildAttrs.h"
+#include "ARMFPUName.h"
 #include "ARMRegisterInfo.h"
 #include "ARMUnwindOp.h"
 #include "ARMUnwindOpAsm.h"
@@ -27,6 +29,7 @@
 #include "llvm/MC/MCELFSymbolFlags.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
@@ -36,7 +39,9 @@
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 
 using namespace llvm;
 
@@ -45,8 +50,218 @@ static std::string GetAEABIUnwindPersonalityName(unsigned Index) {
   return (Twine("__aeabi_unwind_cpp_pr") + Twine(Index)).str();
 }
 
+static const char *GetFPUName(unsigned ID) {
+  switch (ID) {
+  default:
+    llvm_unreachable("Unknown FPU kind");
+    break;
+#define ARM_FPU_NAME(NAME, ID) case ARM::ID: return NAME;
+#include "ARMFPUName.def"
+  }
+  return NULL;
+}
+
 namespace {
 
+class ARMELFStreamer;
+
+class ARMTargetAsmStreamer : public ARMTargetStreamer {
+  formatted_raw_ostream &OS;
+  MCInstPrinter &InstPrinter;
+
+  virtual void emitFnStart();
+  virtual void emitFnEnd();
+  virtual void emitCantUnwind();
+  virtual void emitPersonality(const MCSymbol *Personality);
+  virtual void emitHandlerData();
+  virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
+  virtual void emitPad(int64_t Offset);
+  virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                           bool isVector);
+
+  virtual void switchVendor(StringRef Vendor);
+  virtual void emitAttribute(unsigned Attribute, unsigned Value);
+  virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+  virtual void emitFPU(unsigned FPU);
+  virtual void finishAttributeSection();
+
+public:
+  ARMTargetAsmStreamer(formatted_raw_ostream &OS, MCInstPrinter &InstPrinter);
+};
+
+ARMTargetAsmStreamer::ARMTargetAsmStreamer(formatted_raw_ostream &OS,
+                                           MCInstPrinter &InstPrinter)
+    : OS(OS), InstPrinter(InstPrinter) {}
+void ARMTargetAsmStreamer::emitFnStart() { OS << "\t.fnstart\n"; }
+void ARMTargetAsmStreamer::emitFnEnd() { OS << "\t.fnend\n"; }
+void ARMTargetAsmStreamer::emitCantUnwind() { OS << "\t.cantunwind\n"; }
+void ARMTargetAsmStreamer::emitPersonality(const MCSymbol *Personality) {
+  OS << "\t.personality " << Personality->getName() << '\n';
+}
+void ARMTargetAsmStreamer::emitHandlerData() { OS << "\t.handlerdata\n"; }
+void ARMTargetAsmStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  OS << "\t.setfp\t";
+  InstPrinter.printRegName(OS, FpReg);
+  OS << ", ";
+  InstPrinter.printRegName(OS, SpReg);
+  if (Offset)
+    OS << ", #" << Offset;
+  OS << '\n';
+}
+void ARMTargetAsmStreamer::emitPad(int64_t Offset) {
+  OS << "\t.pad\t#" << Offset << '\n';
+}
+void ARMTargetAsmStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  assert(RegList.size() && "RegList should not be empty");
+  if (isVector)
+    OS << "\t.vsave\t{";
+  else
+    OS << "\t.save\t{";
+
+  InstPrinter.printRegName(OS, RegList[0]);
+
+  for (unsigned i = 1, e = RegList.size(); i != e; ++i) {
+    OS << ", ";
+    InstPrinter.printRegName(OS, RegList[i]);
+  }
+
+  OS << "}\n";
+}
+void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {
+}
+void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value) << "\n";
+}
+void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef String) {
+  switch (Attribute) {
+  default: llvm_unreachable("Unsupported Text attribute in ASM Mode");
+  case ARMBuildAttrs::CPU_name:
+    OS << "\t.cpu\t" << String.lower() << "\n";
+    break;
+  }
+}
+void ARMTargetAsmStreamer::emitFPU(unsigned FPU) {
+  OS << "\t.fpu\t" << GetFPUName(FPU) << "\n";
+}
+void ARMTargetAsmStreamer::finishAttributeSection() {
+}
+
+class ARMTargetELFStreamer : public ARMTargetStreamer {
+private:
+  // This structure holds all attributes, accounting for
+  // their string/numeric value, so we can later emmit them
+  // in declaration order, keeping all in the same vector
+  struct AttributeItem {
+    enum {
+      HiddenAttribute = 0,
+      NumericAttribute,
+      TextAttribute
+    } Type;
+    unsigned Tag;
+    unsigned IntValue;
+    StringRef StringValue;
+
+    static bool LessTag(const AttributeItem &LHS, const AttributeItem &RHS) {
+      return (LHS.Tag < RHS.Tag);
+    }
+  };
+
+  StringRef CurrentVendor;
+  unsigned FPU;
+  SmallVector<AttributeItem, 64> Contents;
+
+  const MCSection *AttributeSection;
+
+  // FIXME: this should be in a more generic place, but
+  // getULEBSize() is in MCAsmInfo and will be moved to MCDwarf
+  static size_t getULEBSize(int Value) {
+    size_t Size = 0;
+    do {
+      Value >>= 7;
+      Size += sizeof(int8_t); // Is this really necessary?
+    } while (Value);
+    return Size;
+  }
+
+  AttributeItem *getAttributeItem(unsigned Attribute) {
+    for (size_t i = 0; i < Contents.size(); ++i)
+      if (Contents[i].Tag == Attribute)
+        return &Contents[i];
+    return 0;
+  }
+
+  void setAttributeItem(unsigned Attribute, unsigned Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->IntValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::NumericAttribute,
+      Attribute,
+      Value,
+      StringRef("")
+    };
+    Contents.push_back(Item);
+  }
+
+  void setAttributeItem(unsigned Attribute, StringRef Value,
+                        bool OverwriteExisting) {
+    // Look for existing attribute item
+    if (AttributeItem *Item = getAttributeItem(Attribute)) {
+      if (!OverwriteExisting)
+        return;
+      Item->StringValue = Value;
+      return;
+    }
+
+    // Create new attribute item
+    AttributeItem Item = {
+      AttributeItem::TextAttribute,
+      Attribute,
+      0,
+      Value
+    };
+    Contents.push_back(Item);
+  }
+
+  void emitFPUDefaultAttributes();
+
+  ARMELFStreamer &getStreamer();
+
+  virtual void emitFnStart();
+  virtual void emitFnEnd();
+  virtual void emitCantUnwind();
+  virtual void emitPersonality(const MCSymbol *Personality);
+  virtual void emitHandlerData();
+  virtual void emitSetFP(unsigned FpReg, unsigned SpReg, int64_t Offset = 0);
+  virtual void emitPad(int64_t Offset);
+  virtual void emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                           bool isVector);
+
+  virtual void switchVendor(StringRef Vendor);
+  virtual void emitAttribute(unsigned Attribute, unsigned Value);
+  virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+  virtual void emitFPU(unsigned FPU);
+  virtual void finishAttributeSection();
+
+  size_t calculateContentSize() const;
+
+public:
+  ARMTargetELFStreamer()
+    : ARMTargetStreamer(), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU),
+      AttributeSection(0) {
+  }
+};
+
 /// Extend the generic ELFStreamer class so that it can emit mapping symbols at
 /// the appropriate points in the object files. These symbols are defined in the
 /// ARM ELF ABI: infocenter.arm.com/help/topic/com.arm.../IHI0044D_aaelf.pdf.
@@ -61,27 +276,29 @@ namespace {
 /// by MachO. Beware!
 class ARMELFStreamer : public MCELFStreamer {
 public:
-  ARMELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
-                 MCCodeEmitter *Emitter, bool IsThumb)
-      : MCELFStreamer(SK_ARMELFStreamer, Context, TAB, OS, Emitter),
+  friend class ARMTargetELFStreamer;
+
+  ARMELFStreamer(MCContext &Context, MCTargetStreamer *TargetStreamer,
+                 MCAsmBackend &TAB, raw_ostream &OS, MCCodeEmitter *Emitter,
+                 bool IsThumb)
+      : MCELFStreamer(Context, TargetStreamer, TAB, OS, Emitter),
         IsThumb(IsThumb), MappingSymbolCounter(0), LastEMS(EMS_None) {
     Reset();
   }
 
   ~ARMELFStreamer() {}
 
+  virtual void FinishImpl();
+
   // ARM exception handling directives
-  virtual void EmitFnStart();
-  virtual void EmitFnEnd();
-  virtual void EmitCantUnwind();
-  virtual void EmitPersonality(const MCSymbol *Per);
-  virtual void EmitHandlerData();
-  virtual void EmitSetFP(unsigned NewFpReg,
-                         unsigned NewSpReg,
-                         int64_t Offset = 0);
-  virtual void EmitPad(int64_t Offset);
-  virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
-                           bool isVector);
+  void emitFnStart();
+  void emitFnEnd();
+  void emitCantUnwind();
+  void emitPersonality(const MCSymbol *Per);
+  void emitHandlerData();
+  void emitSetFP(unsigned NewFpReg, unsigned NewSpReg, int64_t Offset = 0);
+  void emitPad(int64_t Offset);
+  void emitRegSave(const SmallVectorImpl<unsigned> &RegList, bool isVector);
 
   virtual void ChangeSection(const MCSection *Section,
                              const MCExpr *Subsection) {
@@ -109,18 +326,17 @@ public:
   /// This is one of the functions used to emit data into an ELF section, so the
   /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
   /// necessary.
-  virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {
+  virtual void EmitBytes(StringRef Data) {
     EmitDataMappingSymbol();
-    MCELFStreamer::EmitBytes(Data, AddrSpace);
+    MCELFStreamer::EmitBytes(Data);
   }
 
   /// This is one of the functions used to emit data into an ELF section, so the
   /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
   /// necessary.
-  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             unsigned AddrSpace) {
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
     EmitDataMappingSymbol();
-    MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace);
+    MCELFStreamer::EmitValueImpl(Value, Size);
   }
 
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
@@ -142,10 +358,6 @@ public:
     }
   }
 
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_ARMELFStreamer;
-  }
-
 private:
   enum ElfMappingSymbol {
     EMS_None,
@@ -184,7 +396,7 @@ private:
     MCELF::SetType(SD, ELF::STT_NOTYPE);
     MCELF::SetBinding(SD, ELF::STB_LOCAL);
     SD.setExternal(false);
-    Symbol->setSection(*getCurrentSection().first);
+    AssignSection(Symbol, getCurrentSection().first);
 
     const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
     Symbol->setVariableValue(Value);
@@ -203,7 +415,8 @@ private:
   void Reset();
 
   void EmitPersonalityFixup(StringRef Name);
-  void CollectUnwindOpcodes();
+  void FlushPendingOffset();
+  void FlushUnwindOpcodes(bool NoHandlerData);
 
   void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags,
                          SectionKind Kind, const MCSymbol &Fn);
@@ -220,17 +433,236 @@ private:
   MCSymbol *ExTab;
   MCSymbol *FnStart;
   const MCSymbol *Personality;
-  uint32_t VFPRegSave; // Register mask for {d31-d0}
-  uint32_t RegSave; // Register mask for {r15-r0}
-  int64_t SPOffset;
-  uint16_t FPReg;
-  int64_t FPOffset;
+  unsigned PersonalityIndex;
+  unsigned FPReg; // Frame pointer register
+  int64_t FPOffset; // Offset: (final frame pointer) - (initial $sp)
+  int64_t SPOffset; // Offset: (final $sp) - (initial $sp)
+  int64_t PendingOffset; // Offset: (final $sp) - (emitted $sp)
   bool UsedFP;
   bool CantUnwind;
+  SmallVector<uint8_t, 64> Opcodes;
   UnwindOpcodeAssembler UnwindOpAsm;
 };
 } // end anonymous namespace
 
+ARMELFStreamer &ARMTargetELFStreamer::getStreamer() {
+  ARMELFStreamer *S = static_cast<ARMELFStreamer *>(Streamer);
+  return *S;
+}
+
+void ARMTargetELFStreamer::emitFnStart() { getStreamer().emitFnStart(); }
+void ARMTargetELFStreamer::emitFnEnd() { getStreamer().emitFnEnd(); }
+void ARMTargetELFStreamer::emitCantUnwind() { getStreamer().emitCantUnwind(); }
+void ARMTargetELFStreamer::emitPersonality(const MCSymbol *Personality) {
+  getStreamer().emitPersonality(Personality);
+}
+void ARMTargetELFStreamer::emitHandlerData() {
+  getStreamer().emitHandlerData();
+}
+void ARMTargetELFStreamer::emitSetFP(unsigned FpReg, unsigned SpReg,
+                                     int64_t Offset) {
+  getStreamer().emitSetFP(FpReg, SpReg, Offset);
+}
+void ARMTargetELFStreamer::emitPad(int64_t Offset) {
+  getStreamer().emitPad(Offset);
+}
+void ARMTargetELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                       bool isVector) {
+  getStreamer().emitRegSave(RegList, isVector);
+}
+void ARMTargetELFStreamer::switchVendor(StringRef Vendor) {
+  assert(!Vendor.empty() && "Vendor cannot be empty.");
+
+  if (CurrentVendor == Vendor)
+    return;
+
+  if (!CurrentVendor.empty())
+    finishAttributeSection();
+
+  assert(Contents.empty() &&
+         ".ARM.attributes should be flushed before changing vendor");
+  CurrentVendor = Vendor;
+
+}
+void ARMTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+                                             StringRef Value) {
+  setAttributeItem(Attribute, Value, /* OverwriteExisting= */ true);
+}
+void ARMTargetELFStreamer::emitFPU(unsigned Value) {
+  FPU = Value;
+}
+void ARMTargetELFStreamer::emitFPUDefaultAttributes() {
+  switch (FPU) {
+  case ARM::VFP:
+  case ARM::VFPV2:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV3:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV3_D16:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv3B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV4:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::VFPV4_D16:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv4B,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv3A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON_VFPV4:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPv4A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeon2,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  case ARM::NEON_FP_ARMV8:
+  case ARM::CRYPTO_NEON_FP_ARMV8:
+    setAttributeItem(ARMBuildAttrs::VFP_arch,
+                     ARMBuildAttrs::AllowFPARMv8A,
+                     /* OverwriteExisting= */ false);
+    setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch,
+                     ARMBuildAttrs::AllowNeonARMv8,
+                     /* OverwriteExisting= */ false);
+    break;
+
+  default:
+    report_fatal_error("Unknown FPU: " + Twine(FPU));
+    break;
+  }
+}
+size_t ARMTargetELFStreamer::calculateContentSize() const {
+  size_t Result = 0;
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    switch (item.Type) {
+    case AttributeItem::HiddenAttribute:
+      break;
+    case AttributeItem::NumericAttribute:
+      Result += getULEBSize(item.Tag);
+      Result += getULEBSize(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Result += getULEBSize(item.Tag);
+      Result += item.StringValue.size() + 1; // string + '\0'
+      break;
+    }
+  }
+  return Result;
+}
+void ARMTargetELFStreamer::finishAttributeSection() {
+  // <format-version>
+  // [ <section-length> "vendor-name"
+  // [ <file-tag> <size> <attribute>*
+  //   | <section-tag> <size> <section-number>* 0 <attribute>*
+  //   | <symbol-tag> <size> <symbol-number>* 0 <attribute>*
+  //   ]+
+  // ]*
+
+  if (FPU != ARM::INVALID_FPU)
+    emitFPUDefaultAttributes();
+
+  if (Contents.empty())
+    return;
+
+  std::sort(Contents.begin(), Contents.end(), AttributeItem::LessTag);
+
+  ARMELFStreamer &Streamer = getStreamer();
+
+  // Switch to .ARM.attributes section
+  if (AttributeSection) {
+    Streamer.SwitchSection(AttributeSection);
+  } else {
+    AttributeSection =
+      Streamer.getContext().getELFSection(".ARM.attributes",
+                                          ELF::SHT_ARM_ATTRIBUTES,
+                                          0,
+                                          SectionKind::getMetadata());
+    Streamer.SwitchSection(AttributeSection);
+
+    // Format version
+    Streamer.EmitIntValue(0x41, 1);
+  }
+
+  // Vendor size + Vendor name + '\0'
+  const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+
+  // Tag + Tag Size
+  const size_t TagHeaderSize = 1 + 4;
+
+  const size_t ContentsSize = calculateContentSize();
+
+  Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
+  Streamer.EmitBytes(CurrentVendor);
+  Streamer.EmitIntValue(0, 1); // '\0'
+
+  Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
+  Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+
+  // Size should have been accounted for already, now
+  // emit each field as its type (ULEB or String)
+  for (size_t i = 0; i < Contents.size(); ++i) {
+    AttributeItem item = Contents[i];
+    Streamer.EmitULEB128IntValue(item.Tag);
+    switch (item.Type) {
+    default: llvm_unreachable("Invalid attribute type");
+    case AttributeItem::NumericAttribute:
+      Streamer.EmitULEB128IntValue(item.IntValue);
+      break;
+    case AttributeItem::TextAttribute:
+      Streamer.EmitBytes(item.StringValue.upper());
+      Streamer.EmitIntValue(0, 1); // '\0'
+      break;
+    }
+  }
+
+  Contents.clear();
+  FPU = ARM::INVALID_FPU;
+}
+
+void ARMELFStreamer::FinishImpl() {
+  MCTargetStreamer &TS = getTargetStreamer();
+  ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
+  ATS.finishAttributeSection();
+
+  MCELFStreamer::FinishImpl();
+}
+
 inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
                                               unsigned Type,
                                               unsigned Flags,
@@ -279,81 +711,37 @@ inline void ARMELFStreamer::SwitchToExIdxSection(const MCSymbol &FnStart) {
 }
 
 void ARMELFStreamer::Reset() {
-  const MCRegisterInfo &MRI = getContext().getRegisterInfo();
-
   ExTab = NULL;
   FnStart = NULL;
   Personality = NULL;
-  VFPRegSave = 0;
-  RegSave = 0;
-  FPReg = MRI.getEncodingValue(ARM::SP);
+  PersonalityIndex = NUM_PERSONALITY_INDEX;
+  FPReg = ARM::SP;
   FPOffset = 0;
   SPOffset = 0;
+  PendingOffset = 0;
   UsedFP = false;
   CantUnwind = false;
 
+  Opcodes.clear();
   UnwindOpAsm.Reset();
 }
 
-// Add the R_ARM_NONE fixup at the same position
-void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
-  const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
-
-  const MCSymbolRefExpr *PersonalityRef =
-    MCSymbolRefExpr::Create(PersonalitySym,
-                            MCSymbolRefExpr::VK_ARM_NONE,
-                            getContext());
-
-  AddValueSymbols(PersonalityRef);
-  MCDataFragment *DF = getOrCreateDataFragment();
-  DF->getFixups().push_back(
-    MCFixup::Create(DF->getContents().size(), PersonalityRef,
-                    MCFixup::getKindForSize(4, false)));
-}
-
-void ARMELFStreamer::CollectUnwindOpcodes() {
-  if (UsedFP) {
-    UnwindOpAsm.EmitSetFP(FPReg);
-    UnwindOpAsm.EmitSPOffset(-FPOffset);
-  } else {
-    UnwindOpAsm.EmitSPOffset(SPOffset);
-  }
-  UnwindOpAsm.EmitVFPRegSave(VFPRegSave);
-  UnwindOpAsm.EmitRegSave(RegSave);
-  UnwindOpAsm.Finalize();
-}
-
-void ARMELFStreamer::EmitFnStart() {
+void ARMELFStreamer::emitFnStart() {
   assert(FnStart == 0);
   FnStart = getContext().CreateTempSymbol();
   EmitLabel(FnStart);
 }
 
-void ARMELFStreamer::EmitFnEnd() {
+void ARMELFStreamer::emitFnEnd() {
   assert(FnStart && ".fnstart must preceeds .fnend");
 
   // Emit unwind opcodes if there is no .handlerdata directive
-  if (!ExTab && !CantUnwind) {
-    CollectUnwindOpcodes();
-
-    unsigned PersonalityIndex = UnwindOpAsm.getPersonalityIndex();
-    if (PersonalityIndex == AEABI_UNWIND_CPP_PR1 ||
-        PersonalityIndex == AEABI_UNWIND_CPP_PR2) {
-      // For the __aeabi_unwind_cpp_pr1 and __aeabi_unwind_cpp_pr2, we have to
-      // emit the unwind opcodes in the corresponding ".ARM.extab" section, and
-      // then emit a reference to these unwind opcodes in the second word of
-      // the exception index table entry.
-      SwitchToExTabSection(*FnStart);
-      ExTab = getContext().CreateTempSymbol();
-      EmitLabel(ExTab);
-      EmitBytes(UnwindOpAsm.data(), 0);
-    }
-  }
+  if (!ExTab && !CantUnwind)
+    FlushUnwindOpcodes(true);
 
   // Emit the exception index table entry
   SwitchToExIdxSection(*FnStart);
 
-  unsigned PersonalityIndex = UnwindOpAsm.getPersonalityIndex();
   if (PersonalityIndex < NUM_PERSONALITY_INDEX)
     EmitPersonalityFixup(GetAEABIUnwindPersonalityName(PersonalityIndex));
 
@@ -362,37 +750,80 @@ void ARMELFStreamer::EmitFnEnd() {
                             MCSymbolRefExpr::VK_ARM_PREL31,
                             getContext());
 
-  EmitValue(FnStartRef, 4, 0);
+  EmitValue(FnStartRef, 4);
 
   if (CantUnwind) {
-    EmitIntValue(EXIDX_CANTUNWIND, 4, 0);
+    EmitIntValue(EXIDX_CANTUNWIND, 4);
   } else if (ExTab) {
     // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
     const MCSymbolRefExpr *ExTabEntryRef =
       MCSymbolRefExpr::Create(ExTab,
                               MCSymbolRefExpr::VK_ARM_PREL31,
                               getContext());
-    EmitValue(ExTabEntryRef, 4, 0);
+    EmitValue(ExTabEntryRef, 4);
   } else {
     // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
     // the second word of exception index table entry.  The size of the unwind
     // opcodes should always be 4 bytes.
     assert(PersonalityIndex == AEABI_UNWIND_CPP_PR0 &&
            "Compact model must use __aeabi_cpp_unwind_pr0 as personality");
-    assert(UnwindOpAsm.size() == 4u &&
+    assert(Opcodes.size() == 4u &&
            "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be equal to 4");
-    EmitBytes(UnwindOpAsm.data(), 0);
+    EmitBytes(StringRef(reinterpret_cast<const char*>(Opcodes.data()),
+                        Opcodes.size()));
   }
 
+  // Switch to the section containing FnStart
+  SwitchSection(&FnStart->getSection());
+
   // Clean exception handling frame information
   Reset();
 }
 
-void ARMELFStreamer::EmitCantUnwind() {
-  CantUnwind = true;
+void ARMELFStreamer::emitCantUnwind() { CantUnwind = true; }
+
+// Add the R_ARM_NONE fixup at the same position
+void ARMELFStreamer::EmitPersonalityFixup(StringRef Name) {
+  const MCSymbol *PersonalitySym = getContext().GetOrCreateSymbol(Name);
+
+  const MCSymbolRefExpr *PersonalityRef = MCSymbolRefExpr::Create(
+      PersonalitySym, MCSymbolRefExpr::VK_ARM_NONE, getContext());
+
+  AddValueSymbols(PersonalityRef);
+  MCDataFragment *DF = getOrCreateDataFragment();
+  DF->getFixups().push_back(MCFixup::Create(DF->getContents().size(),
+                                            PersonalityRef,
+                                            MCFixup::getKindForSize(4, false)));
+}
+
+void ARMELFStreamer::FlushPendingOffset() {
+  if (PendingOffset != 0) {
+    UnwindOpAsm.EmitSPOffset(-PendingOffset);
+    PendingOffset = 0;
+  }
 }
 
-void ARMELFStreamer::EmitHandlerData() {
+void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
+  // Emit the unwind opcode to restore $sp.
+  if (UsedFP) {
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+    int64_t LastRegSaveSPOffset = SPOffset - PendingOffset;
+    UnwindOpAsm.EmitSPOffset(LastRegSaveSPOffset - FPOffset);
+    UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
+  } else {
+    FlushPendingOffset();
+  }
+
+  // Finalize the unwind opcode sequence
+  UnwindOpAsm.Finalize(PersonalityIndex, Opcodes);
+
+  // For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx
+  // section.  Thus, we don't have to create an entry in the .ARM.extab
+  // section.
+  if (NoHandlerData && PersonalityIndex == AEABI_UNWIND_CPP_PR0)
+    return;
+
+  // Switch to .ARM.extab section.
   SwitchToExTabSection(*FnStart);
 
   // Create .ARM.extab label for offset in .ARM.exidx
@@ -400,73 +831,117 @@ void ARMELFStreamer::EmitHandlerData() {
   ExTab = getContext().CreateTempSymbol();
   EmitLabel(ExTab);
 
-  // Emit Personality
-  assert(Personality && ".personality directive must preceed .handlerdata");
-
-  const MCSymbolRefExpr *PersonalityRef =
-    MCSymbolRefExpr::Create(Personality,
-                            MCSymbolRefExpr::VK_ARM_PREL31,
-                            getContext());
+  // Emit personality
+  if (Personality) {
+    const MCSymbolRefExpr *PersonalityRef =
+      MCSymbolRefExpr::Create(Personality,
+                              MCSymbolRefExpr::VK_ARM_PREL31,
+                              getContext());
 
-  EmitValue(PersonalityRef, 4, 0);
+    EmitValue(PersonalityRef, 4);
+  }
 
   // Emit unwind opcodes
-  CollectUnwindOpcodes();
-  EmitBytes(UnwindOpAsm.data(), 0);
+  EmitBytes(StringRef(reinterpret_cast<const char *>(Opcodes.data()),
+                      Opcodes.size()));
+
+  // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
+  // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted
+  // after the unwind opcodes.  The handler data consists of several 32-bit
+  // words, and should be terminated by zero.
+  //
+  // In case that the .handlerdata directive is not specified by the
+  // programmer, we should emit zero to terminate the handler data.
+  if (NoHandlerData && !Personality)
+    EmitIntValue(0, 4);
 }
 
-void ARMELFStreamer::EmitPersonality(const MCSymbol *Per) {
+void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); }
+
+void ARMELFStreamer::emitPersonality(const MCSymbol *Per) {
   Personality = Per;
   UnwindOpAsm.setPersonality(Per);
 }
 
-void ARMELFStreamer::EmitSetFP(unsigned NewFPReg,
-                               unsigned NewSPReg,
+void ARMELFStreamer::emitSetFP(unsigned NewFPReg, unsigned NewSPReg,
                                int64_t Offset) {
-  assert(SPOffset == 0 &&
-         "Current implementation assumes .setfp precedes .pad");
-
-  const MCRegisterInfo &MRI = getContext().getRegisterInfo();
-
-  uint16_t NewFPRegEncVal = MRI.getEncodingValue(NewFPReg);
-#ifndef NDEBUG
-  uint16_t NewSPRegEncVal = MRI.getEncodingValue(NewSPReg);
-#endif
-
-  assert((NewSPReg == ARM::SP || NewSPRegEncVal == FPReg) &&
+  assert((NewSPReg == ARM::SP || NewSPReg == FPReg) &&
          "the operand of .setfp directive should be either $sp or $fp");
 
   UsedFP = true;
-  FPReg = NewFPRegEncVal;
-  FPOffset = Offset;
-}
+  FPReg = NewFPReg;
 
-void ARMELFStreamer::EmitPad(int64_t Offset) {
-  SPOffset += Offset;
+  if (NewSPReg == ARM::SP)
+    FPOffset = SPOffset + Offset;
+  else
+    FPOffset += Offset;
 }
 
-void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
-                                 bool IsVector) {
-  const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+void ARMELFStreamer::emitPad(int64_t Offset) {
+  // Track the change of the $sp offset
+  SPOffset -= Offset;
 
-#ifndef NDEBUG
-  unsigned Max = IsVector ? 32 : 16;
-#endif
-  uint32_t &RegMask = IsVector ? VFPRegSave : RegSave;
+  // To squash multiple .pad directives, we should delay the unwind opcode
+  // until the .save, .vsave, .handlerdata, or .fnend directives.
+  PendingOffset -= Offset;
+}
 
+void ARMELFStreamer::emitRegSave(const SmallVectorImpl<unsigned> &RegList,
+                                 bool IsVector) {
+  // Collect the registers in the register list
+  unsigned Count = 0;
+  uint32_t Mask = 0;
+  const MCRegisterInfo *MRI = getContext().getRegisterInfo();
   for (size_t i = 0; i < RegList.size(); ++i) {
-    unsigned Reg = MRI.getEncodingValue(RegList[i]);
-    assert(Reg < Max && "Register encoded value out of range");
-    RegMask |= 1u << Reg;
+    unsigned Reg = MRI->getEncodingValue(RegList[i]);
+    assert(Reg < (IsVector ? 32U : 16U) && "Register out of range");
+    unsigned Bit = (1u << Reg);
+    if ((Mask & Bit) == 0) {
+      Mask |= Bit;
+      ++Count;
+    }
   }
+
+  // Track the change the $sp offset: For the .save directive, the
+  // corresponding push instruction will decrease the $sp by (4 * Count).
+  // For the .vsave directive, the corresponding vpush instruction will
+  // decrease $sp by (8 * Count).
+  SPOffset -= Count * (IsVector ? 8 : 4);
+
+  // Emit the opcode
+  FlushPendingOffset();
+  if (IsVector)
+    UnwindOpAsm.EmitVFPRegSave(Mask);
+  else
+    UnwindOpAsm.EmitRegSave(Mask);
 }
 
 namespace llvm {
+
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                                bool isVerboseAsm, bool useLoc, bool useCFI,
+                                bool useDwarfDirectory,
+                                MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                                MCAsmBackend *TAB, bool ShowInst) {
+  ARMTargetAsmStreamer *S = new ARMTargetAsmStreamer(OS, *InstPrint);
+
+  return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+                                 useDwarfDirectory, InstPrint, CE, TAB,
+                                 ShowInst);
+}
+
   MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                                       raw_ostream &OS, MCCodeEmitter *Emitter,
                                       bool RelaxAll, bool NoExecStack,
                                       bool IsThumb) {
-    ARMELFStreamer *S = new ARMELFStreamer(Context, TAB, OS, Emitter, IsThumb);
+    ARMTargetELFStreamer *TS = new ARMTargetELFStreamer();
+    ARMELFStreamer *S =
+        new ARMELFStreamer(Context, TS, TAB, OS, Emitter, IsThumb);
+    // FIXME: This should eventually end up somewhere else where more
+    // intelligent flag decisions can be made. For now we are just maintaining
+    // the status quo for ARM and setting EF_ARM_EABI_VER5 as the default.
+    S->getAssembler().setELFHeaderEFlags(ELF::EF_ARM_EABI_VER5);
+
     if (RelaxAll)
       S->getAssembler().setRelaxAll(true);
     if (NoExecStack)
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
deleted file mode 100644
index 77ae5d2..0000000
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- ARMELFStreamer.h - ELF Streamer for ARM ------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF streamer information for the ARM backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM_ELF_STREAMER_H
-#define ARM_ELF_STREAMER_H
-
-#include "llvm/MC/MCELFStreamer.h"
-
-namespace llvm {
-
-  MCELFStreamer* createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                      bool RelaxAll, bool NoExecStack,
-                                      bool IsThumb);
-}
-
-#endif // ARM_ELF_STREAMER_H
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index c1aab9c..ad796e6 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -49,8 +49,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
   Code16Directive = ".code\t16";
   Code32Directive = ".code\t32";
 
-  WeakRefDirective = "\t.weak\t";
-
   HasLEB128 = true;
   SupportsDebugInformation = true;
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index f0b289c..e1f716d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -15,6 +15,7 @@
 #define LLVM_ARMTARGETASMINFO_H
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
 
@@ -24,7 +25,7 @@ namespace llvm {
     explicit ARMMCAsmInfoDarwin();
   };
 
-  class ARMELFMCAsmInfo : public MCAsmInfo {
+  class ARMELFMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit ARMELFMCAsmInfo();
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 7a59a7d..4382d0d 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -58,8 +58,7 @@ public:
   }
   bool isTargetDarwin() const {
     Triple TT(STI.getTargetTriple());
-    Triple::OSType OS = TT.getOS();
-    return OS == Triple::Darwin || OS == Triple::MacOSX || OS == Triple::IOS;
+  	return TT.isOSDarwin();
   }
 
   unsigned getMachineSoImmOpValue(unsigned SoImm) const;
@@ -315,6 +314,8 @@ public:
                                           unsigned EncodedValue) const;
   unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
                                     unsigned EncodedValue) const;
+  unsigned NEONThumb2V8PostEncoder(const MCInst &MI,
+                                   unsigned EncodedValue) const;
 
   unsigned VFPThumb2PostEncoder(const MCInst &MI,
                                 unsigned EncodedValue) const;
@@ -389,6 +390,17 @@ unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
   return EncodedValue;
 }
 
+/// Post-process encoded NEON v8 instructions, and rewrite them to Thumb2 form
+/// if we are in Thumb2.
+unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue |= 0xC000000; // Set bits 27-26
+  }
+
+  return EncodedValue;
+}
+
 /// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
 /// them to their Thumb2 form if we are currently in Thumb2 mode.
 unsigned ARMMCCodeEmitter::
@@ -407,7 +419,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                   SmallVectorImpl<MCFixup> &Fixups) const {
   if (MO.isReg()) {
     unsigned Reg = MO.getReg();
-    unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
 
     // Q registers are encoded as 2x their register number.
     switch (Reg) {
@@ -436,7 +448,7 @@ EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
   const MCOperand &MO  = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
 
-  Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   int32_t SImm = MO1.getImm();
   bool isAdd = true;
@@ -625,8 +637,14 @@ getARMBLXTargetOpValue(const MCInst &MI, unsigned OpIdx,
 uint32_t ARMMCCodeEmitter::
 getUnconditionalBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
                        SmallVectorImpl<MCFixup> &Fixups) const {
-  unsigned Val =
-    ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+  unsigned Val = 0;
+  const MCOperand MO = MI.getOperand(OpIdx);
+    
+  if(MO.isExpr())
+    return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_t2_uncondbranch, Fixups);
+  else 
+    Val = MO.getImm() >> 1;
+
   bool I  = (Val & 0x800000);
   bool J1 = (Val & 0x400000);
   bool J2 = (Val & 0x200000);
@@ -652,7 +670,7 @@ getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
   if (MO.isExpr())
     return ::getBranchTargetOpValue(MI, OpIdx, ARM::fixup_arm_adr_pcrel_12,
                                     Fixups);
-  int32_t offset = MO.getImm();
+  int64_t offset = MO.getImm();
   uint32_t Val = 0x2000;
 
   int SoImmVal;
@@ -724,8 +742,8 @@ getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
   //   {2-0} = Rn
   const MCOperand &MO1 = MI.getOperand(OpIdx);
   const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
-  unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
   return (Rm << 3) | Rn;
 }
 
@@ -741,12 +759,12 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
   // If The first operand isn't a register, we have a label reference.
   const MCOperand &MO = MI.getOperand(OpIdx);
   if (!MO.isReg()) {
-    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
     Imm12 = 0;
-    isAdd = false ; // 'U' bit is set as part of the fixup.
 
     if (MO.isExpr()) {
       const MCExpr *Expr = MO.getExpr();
+      isAdd = false ; // 'U' bit is set as part of the fixup.
 
       MCFixupKind Kind;
       if (isThumb2())
@@ -759,8 +777,10 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
     } else {
       Reg = ARM::PC;
       int32_t Offset = MO.getImm();
-      // FIXME: Handle #-0.
-      if (Offset < 0) {
+      if (Offset == INT32_MIN) {
+        Offset = 0;
+        isAdd = false;
+      } else if (Offset < 0) {
         Offset *= -1;
         isAdd = false;
       }
@@ -821,7 +841,7 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
   // If The first operand isn't a register, we have a label reference.
   const MCOperand &MO = MI.getOperand(OpIdx);
   if (!MO.isReg()) {
-    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
     Imm8 = 0;
     isAdd = false ; // 'U' bit is set as part of the fixup.
 
@@ -857,7 +877,7 @@ getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
   // {7-0}  = imm8
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  unsigned Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   unsigned Imm8 = MO1.getImm();
   return (Reg << 8) | Imm8;
 }
@@ -940,8 +960,8 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx+1);
   const MCOperand &MO2 = MI.getOperand(OpIdx+2);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
-  unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
   unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
   bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
   ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
@@ -975,7 +995,7 @@ getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
   // {12}     isAdd
   // {11-0}   imm12/Rm
   const MCOperand &MO = MI.getOperand(OpIdx);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
   Binary |= Rn << 14;
   return Binary;
@@ -998,7 +1018,7 @@ getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
     ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
     Binary <<= 7;                    // Shift amount is bits [11:7]
     Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
-    Binary |= CTX.getRegisterInfo().getEncodingValue(MO.getReg()); // Rm is bits [3:0]
+    Binary |= CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); // Rm is bits [3:0]
   }
   return Binary | (isAdd << 12) | (isReg << 13);
 }
@@ -1011,7 +1031,7 @@ getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx+1);
   bool isAdd = MO1.getImm() != 0;
-  return CTX.getRegisterInfo().getEncodingValue(MO.getReg()) | (isAdd << 4);
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()) | (isAdd << 4);
 }
 
 uint32_t ARMMCCodeEmitter::
@@ -1029,7 +1049,7 @@ getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
   uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
   // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
   if (!isImm)
-    Imm8 = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   return Imm8 | (isAdd << 8) | (isImm << 9);
 }
 
@@ -1047,7 +1067,7 @@ getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
 
   // If The first operand isn't a register, we have a label reference.
   if (!MO.isReg()) {
-    unsigned Rn = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
 
     assert(MO.isExpr() && "Unexpected machine operand type!");
     const MCExpr *Expr = MO.getExpr();
@@ -1057,14 +1077,14 @@ getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
     ++MCNumCPRelocations;
     return (Rn << 9) | (1 << 13);
   }
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   unsigned Imm = MO2.getImm();
   bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
   bool isImm = MO1.getReg() == 0;
   uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
   // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
   if (!isImm)
-    Imm8 = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
   return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
 }
 
@@ -1092,7 +1112,7 @@ getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
   //   {2-0} = Rn
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   unsigned Imm5 = MO1.getImm();
   return ((Imm5 & 0x1f) << 3) | Rn;
 }
@@ -1119,7 +1139,7 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
   // If The first operand isn't a register, we have a label reference.
   const MCOperand &MO = MI.getOperand(OpIdx);
   if (!MO.isReg()) {
-    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
     Imm8 = 0;
     isAdd = false; // 'U' bit is handled as part of the fixup.
 
@@ -1165,7 +1185,7 @@ getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
   ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
 
   // Encode Rm.
-  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   // Encode the shift opcode.
   unsigned SBits = 0;
@@ -1190,7 +1210,7 @@ getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
   // Encode the shift operation Rs.
   // Encode Rs bit[11:8].
   assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
-  return Binary | (CTX.getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
+  return Binary | (CTX.getRegisterInfo()->getEncodingValue(Rs) << ARMII::RegRsShift);
 }
 
 unsigned ARMMCCodeEmitter::
@@ -1209,7 +1229,7 @@ getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
   ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
 
   // Encode Rm.
-  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   // Encode the shift opcode.
   unsigned SBits = 0;
@@ -1248,9 +1268,9 @@ getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
 
   // Encoded as [Rn, Rm, imm].
   // FIXME: Needs fixup support.
-  unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
   Value <<= 4;
-  Value |= CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
+  Value |= CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
   Value <<= 2;
   Value |= MO3.getImm();
 
@@ -1264,7 +1284,7 @@ getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
   const MCOperand &MO2 = MI.getOperand(OpNum+1);
 
   // FIXME: Needs fixup support.
-  unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
 
   // Even though the immediate is 8 bits long, we need 9 bits in order
   // to represent the (inverse of the) sign bit.
@@ -1326,7 +1346,7 @@ getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
   ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
 
   // Encode Rm.
-  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   // Encode the shift opcode.
   unsigned SBits = 0;
@@ -1359,8 +1379,8 @@ getBitfieldInvertedMaskOpValue(const MCInst &MI, unsigned Op,
   // msb of the mask.
   const MCOperand &MO = MI.getOperand(Op);
   uint32_t v = ~MO.getImm();
-  uint32_t lsb = CountTrailingZeros_32(v);
-  uint32_t msb = (32 - CountLeadingZeros_32 (v)) - 1;
+  uint32_t lsb = countTrailingZeros(v);
+  uint32_t msb = (32 - countLeadingZeros (v)) - 1;
   assert (v != 0 && lsb < 32 && msb < 32 && "Illegal bitfield mask!");
   return lsb | (msb << 5);
 }
@@ -1382,7 +1402,7 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
 
   if (SPRRegs || DPRRegs) {
     // VLDM/VSTM
-    unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
     unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
     Binary |= (RegNo & 0x1f) << 8;
     if (SPRRegs)
@@ -1391,7 +1411,7 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
       Binary |= NumRegs * 2;
   } else {
     for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
-      unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(MI.getOperand(I).getReg());
+      unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(MI.getOperand(I).getReg());
       Binary |= 1 << RegNo;
     }
   }
@@ -1407,7 +1427,7 @@ getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
   const MCOperand &Reg = MI.getOperand(Op);
   const MCOperand &Imm = MI.getOperand(Op + 1);
 
-  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
   unsigned Align = 0;
 
   switch (Imm.getImm()) {
@@ -1430,7 +1450,7 @@ getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
   const MCOperand &Reg = MI.getOperand(Op);
   const MCOperand &Imm = MI.getOperand(Op + 1);
 
-  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
   unsigned Align = 0;
 
   switch (Imm.getImm()) {
@@ -1456,7 +1476,7 @@ getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
   const MCOperand &Reg = MI.getOperand(Op);
   const MCOperand &Imm = MI.getOperand(Op + 1);
 
-  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
   unsigned Align = 0;
 
   switch (Imm.getImm()) {
@@ -1475,7 +1495,7 @@ getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
                           SmallVectorImpl<MCFixup> &Fixups) const {
   const MCOperand &MO = MI.getOperand(Op);
   if (MO.getReg() == 0) return 0x0D;
-  return CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 }
 
 unsigned ARMMCCodeEmitter::
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index f09fb5a..a99de0e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -12,30 +12,73 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMBaseInfo.h"
-#include "ARMELFStreamer.h"
 #include "ARMMCAsmInfo.h"
 #include "ARMMCTargetDesc.h"
 #include "InstPrinter/ARMInstPrinter.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_REGINFO_MC_DESC
 #include "ARMGenRegisterInfo.inc"
 
+static bool getMCRDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  if (STI.getFeatureBits() & llvm::ARM::HasV7Ops &&
+      (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 15) &&
+      (MI.getOperand(1).isImm() && MI.getOperand(1).getImm() == 0) &&
+      // Checks for the deprecated CP15ISB encoding:
+      // mcr p15, #0, rX, c7, c5, #4
+      (MI.getOperand(3).isImm() && MI.getOperand(3).getImm() == 7)) {
+    if ((MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 4)) {
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 5) {
+        Info = "deprecated since v7, use 'isb'";
+        return true;
+      }
+
+      // Checks for the deprecated CP15DSB encoding:
+      // mcr p15, #0, rX, c7, c10, #4
+      if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10) {
+        Info = "deprecated since v7, use 'dsb'";
+        return true;
+      }
+    }
+    // Checks for the deprecated CP15DMB encoding:
+    // mcr p15, #0, rX, c7, c10, #5
+    if (MI.getOperand(4).isImm() && MI.getOperand(4).getImm() == 10 &&
+        (MI.getOperand(5).isImm() && MI.getOperand(5).getImm() == 5)) {
+      Info = "deprecated since v7, use 'dmb'";
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool getITDeprecationInfo(MCInst &MI, MCSubtargetInfo &STI,
+                                  std::string &Info) {
+  if (STI.getFeatureBits() & llvm::ARM::HasV8Ops &&
+      MI.getOperand(1).isImm() && MI.getOperand(1).getImm() != 8) {
+    Info = "applying IT instruction to more than one subsequent instruction is deprecated";
+    return true;
+  }
+
+  return false;
+}
+
 #define GET_INSTRINFO_MC_DESC
 #include "ARMGenInstrInfo.inc"
 
 #define GET_SUBTARGETINFO_MC_DESC
 #include "ARMGenSubtargetInfo.inc"
 
-using namespace llvm;
 
 std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
   Triple triple(TT);
@@ -59,8 +102,17 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
   std::string ARMArchFeature;
   if (Idx) {
     unsigned SubVer = TT[Idx];
-    if (SubVer >= '7' && SubVer <= '9') {
+    if (SubVer == '8') {
+      if (NoCPU)
+        // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, FeatureMP,
+        //      FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, FeatureT2XtPk, FeatureCrypto, FeatureCRC
+        ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,+trustzone,+t2xtpk,+crypto,+crc";
+      else
+        // Use CPU to figure out the exact features
+        ARMArchFeature = "+v8";
+    } else if (SubVer == '7') {
       if (Len >= Idx+2 && TT[Idx+1] == 'm') {
+        isThumb = true;
         if (NoCPU)
           // v7m: FeatureNoARM, FeatureDB, FeatureHWDiv, FeatureMClass
           ARMArchFeature = "+v7,+noarm,+db,+hwdiv,+mclass";
@@ -99,9 +151,10 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
       if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
         ARMArchFeature = "+v6t2";
       else if (Len >= Idx+2 && TT[Idx+1] == 'm') {
+        isThumb = true;
         if (NoCPU)
           // v6m: FeatureNoARM, FeatureMClass
-          ARMArchFeature = "+v6,+noarm,+mclass";
+          ARMArchFeature = "+v6m,+noarm,+mclass";
         else
           ARMArchFeature = "+v6";
       } else
@@ -159,7 +212,7 @@ static MCRegisterInfo *createARMMCRegisterInfo(StringRef Triple) {
   return X;
 }
 
-static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin())
@@ -212,6 +265,15 @@ static MCInstPrinter *createARMMCInstPrinter(const Target &T,
   return 0;
 }
 
+static MCRelocationInfo *createARMMCRelocationInfo(StringRef TT,
+                                                   MCContext &Ctx) {
+  Triple TheTriple(TT);
+  if (TheTriple.isEnvironmentMachO())
+    return createARMMachORelocationInfo(Ctx);
+  // Default to the stock relocation info.
+  return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
 namespace {
 
 class ARMMCInstrAnalysis : public MCInstrAnalysis {
@@ -232,15 +294,16 @@ public:
     return MCInstrAnalysis::isConditionalBranch(Inst);
   }
 
-  uint64_t evaluateBranch(const MCInst &Inst, uint64_t Addr,
-                          uint64_t Size) const {
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                      uint64_t Size, uint64_t &Target) const {
     // We only handle PCRel branches for now.
     if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
-      return -1ULL;
+      return false;
 
     int64_t Imm = Inst.getOperand(0).getImm();
     // FIXME: This is not right for thumb.
-    return Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+    Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
+    return true;
   }
 };
 
@@ -292,7 +355,17 @@ extern "C" void LLVMInitializeARMTargetMC() {
   TargetRegistry::RegisterMCObjectStreamer(TheARMTarget, createMCStreamer);
   TargetRegistry::RegisterMCObjectStreamer(TheThumbTarget, createMCStreamer);
 
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheARMTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheThumbTarget, createMCAsmStreamer);
+
   // Register the MCInstPrinter.
   TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
   TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
+
+  // Register the MC relocation info.
+  TargetRegistry::RegisterMCRelocationInfo(TheARMTarget,
+                                           createARMMCRelocationInfo);
+  TargetRegistry::RegisterMCRelocationInfo(TheThumbTarget,
+                                           createARMMCRelocationInfo);
 }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index a89981e..959be8b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -18,13 +18,17 @@
 #include <string>
 
 namespace llvm {
+class formatted_raw_ostream;
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCInstrInfo;
+class MCInstPrinter;
 class MCObjectWriter;
 class MCRegisterInfo;
 class MCSubtargetInfo;
+class MCStreamer;
+class MCRelocationInfo;
 class StringRef;
 class Target;
 class raw_ostream;
@@ -41,12 +45,19 @@ namespace ARM_MC {
                                             StringRef FS);
 }
 
+MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                                bool isVerboseAsm, bool useLoc, bool useCFI,
+                                bool useDwarfDirectory,
+                                MCInstPrinter *InstPrint, MCCodeEmitter *CE,
+                                MCAsmBackend *TAB, bool ShowInst);
+
 MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
                                       const MCRegisterInfo &MRI,
                                       const MCSubtargetInfo &STI,
                                       MCContext &Ctx);
 
-MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createARMAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
 
 /// createARMELFObjectWriter - Construct an ELF Mach-O object writer.
 MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
@@ -58,6 +69,9 @@ MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
                                           uint32_t CPUType,
                                           uint32_t CPUSubtype);
 
+
+/// createARMMachORelocationInfo - Construct ARM Mach-O relocation info.
+MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
 } // End llvm namespace
 
 // Defines symbolic names for ARM registers.  This defines a mapping from
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
new file mode 100644
index 0000000..807c948
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -0,0 +1,43 @@
+//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "ARMMCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm-c/Disassembler.h"
+
+using namespace llvm;
+using namespace object;
+
+namespace {
+class ARMMachORelocationInfo : public MCRelocationInfo {
+public:
+  ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  const MCExpr *createExprForCAPIVariantKind(const MCExpr *SubExpr,
+                                             unsigned VariantKind) {
+    switch(VariantKind) {
+    case LLVMDisassembler_VariantKind_ARM_HI16:
+      return ARMMCExpr::CreateUpper16(SubExpr, Ctx);
+    case LLVMDisassembler_VariantKind_ARM_LO16:
+      return ARMMCExpr::CreateLower16(SubExpr, Ctx);
+    default:
+      return MCRelocationInfo::createExprForCAPIVariantKind(SubExpr,
+                                                            VariantKind);
+    }
+  }
+};
+} // End unnamed namespace
+
+/// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
+  return new ARMMachORelocationInfo(Ctx);
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index b9efe74..1f681ba 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -20,10 +20,9 @@
 #include "llvm/MC/MCMachOSymbolFlags.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 using namespace llvm;
-using namespace llvm::object;
 
 namespace {
 class ARMMachObjectWriter : public MCMachObjectTargetWriter {
@@ -63,7 +62,7 @@ public:
 
 static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
                               unsigned &Log2Size) {
-  RelocType = unsigned(macho::RIT_Vanilla);
+  RelocType = unsigned(MachO::ARM_RELOC_VANILLA);
   Log2Size = ~0U;
 
   switch (Kind) {
@@ -92,21 +91,21 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
   case ARM::fixup_arm_uncondbl:
   case ARM::fixup_arm_condbl:
   case ARM::fixup_arm_blx:
-    RelocType = unsigned(macho::RIT_ARM_Branch24Bit);
+    RelocType = unsigned(MachO::ARM_RELOC_BR24);
     // Report as 'long', even though that is not quite accurate.
     Log2Size = llvm::Log2_32(4);
     return true;
 
     // Handle Thumb branches.
   case ARM::fixup_arm_thumb_br:
-    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
     Log2Size = llvm::Log2_32(2);
     return true;
 
   case ARM::fixup_t2_uncondbranch:
   case ARM::fixup_arm_thumb_bl:
   case ARM::fixup_arm_thumb_blx:
-    RelocType = unsigned(macho::RIT_ARM_ThumbBranch22Bit);
+    RelocType = unsigned(MachO::ARM_THUMB_RELOC_BR22);
     Log2Size = llvm::Log2_32(4);
     return true;
 
@@ -121,23 +120,23 @@ static bool getARMFixupKindMachOInfo(unsigned Kind, unsigned &RelocType,
   //      1 - thumb instructions
   case ARM::fixup_arm_movt_hi16:
   case ARM::fixup_arm_movt_hi16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 1;
     return true;
   case ARM::fixup_t2_movt_hi16:
   case ARM::fixup_t2_movt_hi16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 3;
     return true;
 
   case ARM::fixup_arm_movw_lo16:
   case ARM::fixup_arm_movw_lo16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 0;
     return true;
   case ARM::fixup_t2_movw_lo16:
   case ARM::fixup_t2_movw_lo16_pcrel:
-    RelocType = unsigned(macho::RIT_ARM_Half);
+    RelocType = unsigned(MachO::ARM_RELOC_HALF);
     Log2Size = 2;
     return true;
   }
@@ -153,7 +152,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
                                  uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_ARM_Half;
+  unsigned Type = MachO::ARM_RELOC_HALF;
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -179,7 +178,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
                          "' can not be undefined in a subtraction expression");
 
     // Select the appropriate difference relocation type.
-    Type = macho::RIT_ARM_HalfDifference;
+    Type = MachO::ARM_RELOC_HALF_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B_SD, Layout);
     FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
   }
@@ -223,29 +222,29 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
     break;
   }
 
-  if (Type == macho::RIT_ARM_HalfDifference) {
+  if (Type == MachO::ARM_RELOC_HALF_SECTDIFF) {
     uint32_t OtherHalf = MovtBit
       ? (FixedValue & 0xffff) : ((FixedValue & 0xffff0000) >> 16);
 
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((OtherHalf       <<  0) |
-                 (macho::RIT_Pair << 24) |
-                 (MovtBit         << 28) |
-                 (ThumbBit        << 29) |
-                 (IsPCRel         << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((OtherHalf             <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (MovtBit               << 28) |
+                   (ThumbBit              << 29) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
     Writer->addRelocation(Fragment->getParent(), MRE);
   }
 
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (MovtBit     << 28) |
-               (ThumbBit    << 29) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (MovtBit     << 28) |
+                 (ThumbBit    << 29) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -259,7 +258,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
                                                     uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_Vanilla;
+  unsigned Type = MachO::ARM_RELOC_VANILLA;
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -284,31 +283,31 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
                          "' can not be undefined in a subtraction expression");
 
     // Select the appropriate difference relocation type.
-    Type = macho::RIT_Difference;
+    Type = MachO::ARM_RELOC_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B_SD, Layout);
     FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
   }
 
   // Relocations are written out in reverse order, so the PAIR comes first.
-  if (Type == macho::RIT_Difference ||
-      Type == macho::RIT_Generic_LocalDifference) {
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((0         <<  0) |
-                 (macho::RIT_Pair  << 24) |
-                 (Log2Size  << 28) |
-                 (IsPCRel   << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
+  if (Type == MachO::ARM_RELOC_SECTDIFF ||
+      Type == MachO::ARM_RELOC_LOCAL_SECTDIFF) {
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                     <<  0) |
+                   (MachO::ARM_RELOC_PAIR << 24) |
+                   (Log2Size              << 28) |
+                   (IsPCRel               << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
     Writer->addRelocation(Fragment->getParent(), MRE);
   }
 
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (Log2Size    << 28) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -326,13 +325,13 @@ bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
   switch (RelocType) {
   default:
     return false;
-  case macho::RIT_ARM_Branch24Bit:
+  case MachO::ARM_RELOC_BR24:
     // PC pre-adjustment of 8 for these instructions.
     Value -= 8;
     // ARM BL/BLX has a 25-bit offset.
     Range = 0x1ffffff;
     break;
-  case macho::RIT_ARM_ThumbBranch22Bit:
+  case MachO::ARM_THUMB_RELOC_BR22:
     // PC pre-adjustment of 4 for these instructions.
     Value -= 4;
     // Thumb BL/BLX has a 24-bit offset.
@@ -361,7 +360,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
                                            uint64_t &FixedValue) {
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
   unsigned Log2Size;
-  unsigned RelocType = macho::RIT_Vanilla;
+  unsigned RelocType = MachO::ARM_RELOC_VANILLA;
   if (!getARMFixupKindMachOInfo(Fixup.getKind(), RelocType, Log2Size))
     // If we failed to get fixup kind info, it's because there's no legal
     // relocation type for the fixup kind. This happens when it's a fixup that's
@@ -374,7 +373,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   // scattered relocation entry.  Differences always require scattered
   // relocations.
   if (Target.getSymB()) {
-    if (RelocType == macho::RIT_ARM_Half)
+    if (RelocType == MachO::ARM_RELOC_HALF)
       return RecordARMScatteredHalfRelocation(Writer, Asm, Layout, Fragment,
                                               Fixup, Target, FixedValue);
     return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
@@ -392,7 +391,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   //
   // Is this right for ARM?
   uint32_t Offset = Target.getConstant();
-  if (IsPCRel && RelocType == macho::RIT_Vanilla)
+  if (IsPCRel && RelocType == MachO::ARM_RELOC_VANILLA)
     Offset += 1 << Log2Size;
   if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
     return RecordARMScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
@@ -445,17 +444,17 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   }
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
 
   // Even when it's not a scattered relocation, movw/movt always uses
   // a PAIR relocation.
-  if (Type == macho::RIT_ARM_Half) {
+  if (Type == MachO::ARM_RELOC_HALF) {
     // The other-half value only gets populated for the movt and movw
     // relocation entries.
     uint32_t Value = 0;
@@ -474,11 +473,11 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
       Value = FixedValue & 0xffff;
       break;
     }
-    macho::RelocationEntry MREPair;
-    MREPair.Word0 = Value;
-    MREPair.Word1 = ((0xffffff) |
-                     (Log2Size << 25) |
-                     (macho::RIT_Pair << 28));
+    MachO::any_relocation_info MREPair;
+    MREPair.r_word0 = Value;
+    MREPair.r_word1 = ((0xffffff              <<  0) |
+                       (Log2Size              << 25) |
+                       (MachO::ARM_RELOC_PAIR << 28));
 
     Writer->addRelocation(Fragment->getParent(), MREPair);
   }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index 191db69..c943370 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -20,6 +20,48 @@
 
 using namespace llvm;
 
+namespace {
+  /// UnwindOpcodeStreamer - The simple wrapper over SmallVector to emit bytes
+  /// with MSB to LSB per uint32_t ordering.  For example, the first byte will
+  /// be placed in Vec[3], and the following bytes will be placed in 2, 1, 0,
+  /// 7, 6, 5, 4, 11, 10, 9, 8, and so on.
+  class UnwindOpcodeStreamer {
+  private:
+    SmallVectorImpl<uint8_t> &Vec;
+    size_t Pos;
+
+  public:
+    UnwindOpcodeStreamer(SmallVectorImpl<uint8_t> &V) : Vec(V), Pos(3) {
+    }
+
+    /// Emit the byte in MSB to LSB per uint32_t order.
+    inline void EmitByte(uint8_t elem) {
+      Vec[Pos] = elem;
+      Pos = (((Pos ^ 0x3u) + 1) ^ 0x3u);
+    }
+
+    /// Emit the size prefix.
+    inline void EmitSize(size_t Size) {
+      size_t SizeInWords = (Size + 3) / 4;
+      assert(SizeInWords <= 0x100u &&
+             "Only 256 additional words are allowed for unwind opcodes");
+      EmitByte(static_cast<uint8_t>(SizeInWords - 1));
+    }
+
+    /// Emit the personality index prefix.
+    inline void EmitPersonalityIndex(unsigned PI) {
+      assert(PI < NUM_PERSONALITY_INDEX && "Invalid personality prefix");
+      EmitByte(EHT_COMPACT | PI);
+    }
+
+    /// Fill the rest of bytes with FINISH opcode.
+    inline void FillFinishOpcode() {
+      while (Pos < Vec.size())
+        EmitByte(UNWIND_OPCODE_FINISH);
+    }
+  };
+}
+
 void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
   if (RegSave == 0u)
     return;
@@ -43,28 +85,22 @@ void UnwindOpcodeAssembler::EmitRegSave(uint32_t RegSave) {
     uint32_t UnmaskedReg = RegSave & 0xfff0u & (~Mask);
     if (UnmaskedReg == 0u) {
       // Pop r[4 : (4 + n)]
-      Ops.push_back(UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
+      EmitInt8(UNWIND_OPCODE_POP_REG_RANGE_R4 | Range);
       RegSave &= 0x000fu;
     } else if (UnmaskedReg == (1u << 14)) {
       // Pop r[14] + r[4 : (4 + n)]
-      Ops.push_back(UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
+      EmitInt8(UNWIND_OPCODE_POP_REG_RANGE_R4_R14 | Range);
       RegSave &= 0x000fu;
     }
   }
 
   // Two bytes opcode to save register r15-r4
-  if ((RegSave & 0xfff0u) != 0) {
-    uint32_t Op = UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4);
-    Ops.push_back(static_cast<uint8_t>(Op >> 8));
-    Ops.push_back(static_cast<uint8_t>(Op & 0xff));
-  }
+  if ((RegSave & 0xfff0u) != 0)
+    EmitInt16(UNWIND_OPCODE_POP_REG_MASK_R4 | (RegSave >> 4));
 
   // Opcode to save register r3-r0
-  if ((RegSave & 0x000fu) != 0) {
-    uint32_t Op = UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu);
-    Ops.push_back(static_cast<uint8_t>(Op >> 8));
-    Ops.push_back(static_cast<uint8_t>(Op & 0xff));
-  }
+  if ((RegSave & 0x000fu) != 0)
+    EmitInt16(UNWIND_OPCODE_POP_REG_MASK | (RegSave & 0x000fu));
 }
 
 /// Emit unwind opcodes for .vsave directives
@@ -89,10 +125,8 @@ void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
       Bit >>= 1;
     }
 
-    uint32_t Op =
-        UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 | ((i - 16) << 4) | Range;
-    Ops.push_back(static_cast<uint8_t>(Op >> 8));
-    Ops.push_back(static_cast<uint8_t>(Op & 0xff));
+    EmitInt16(UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD_D16 |
+              ((i - 16) << 4) | Range);
   }
 
   while (i > 0) {
@@ -113,86 +147,75 @@ void UnwindOpcodeAssembler::EmitVFPRegSave(uint32_t VFPRegSave) {
       Bit >>= 1;
     }
 
-    uint32_t Op = UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) | Range;
-    Ops.push_back(static_cast<uint8_t>(Op >> 8));
-    Ops.push_back(static_cast<uint8_t>(Op & 0xff));
+    EmitInt16(UNWIND_OPCODE_POP_VFP_REG_RANGE_FSTMFDD | (i << 4) | Range);
   }
 }
 
-/// Emit unwind opcodes for .setfp directives
-void UnwindOpcodeAssembler::EmitSetFP(uint16_t FPReg) {
-  Ops.push_back(UNWIND_OPCODE_SET_VSP | FPReg);
+/// Emit unwind opcodes to copy address from source register to $sp.
+void UnwindOpcodeAssembler::EmitSetSP(uint16_t Reg) {
+  EmitInt8(UNWIND_OPCODE_SET_VSP | Reg);
 }
 
-/// Emit unwind opcodes to update stack pointer
+/// Emit unwind opcodes to add $sp with an offset.
 void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
   if (Offset > 0x200) {
-    uint8_t Buff[10];
-    size_t Size = encodeULEB128((Offset - 0x204) >> 2, Buff);
-    Ops.push_back(UNWIND_OPCODE_INC_VSP_ULEB128);
-    Ops.append(Buff, Buff + Size);
+    uint8_t Buff[16];
+    Buff[0] = UNWIND_OPCODE_INC_VSP_ULEB128;
+    size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
+    EmitBytes(Buff, ULEBSize + 1);
   } else if (Offset > 0) {
     if (Offset > 0x100) {
-      Ops.push_back(UNWIND_OPCODE_INC_VSP | 0x3fu);
+      EmitInt8(UNWIND_OPCODE_INC_VSP | 0x3fu);
       Offset -= 0x100;
     }
-    Ops.push_back(UNWIND_OPCODE_INC_VSP |
-                  static_cast<uint8_t>((Offset - 4) >> 2));
+    EmitInt8(UNWIND_OPCODE_INC_VSP | static_cast<uint8_t>((Offset - 4) >> 2));
   } else if (Offset < 0) {
     while (Offset < -0x100) {
-      Ops.push_back(UNWIND_OPCODE_DEC_VSP | 0x3fu);
+      EmitInt8(UNWIND_OPCODE_DEC_VSP | 0x3fu);
       Offset += 0x100;
     }
-    Ops.push_back(UNWIND_OPCODE_DEC_VSP |
-                  static_cast<uint8_t>(((-Offset) - 4) >> 2));
+    EmitInt8(UNWIND_OPCODE_DEC_VSP |
+             static_cast<uint8_t>(((-Offset) - 4) >> 2));
   }
 }
 
-void UnwindOpcodeAssembler::AddOpcodeSizePrefix(size_t Pos) {
-  size_t SizeInWords = (size() + 3) / 4;
-  assert(SizeInWords <= 0x100u &&
-         "Only 256 additional words are allowed for unwind opcodes");
-  Ops[Pos] = static_cast<uint8_t>(SizeInWords - 1);
-}
+void UnwindOpcodeAssembler::Finalize(unsigned &PersonalityIndex,
+                                     SmallVectorImpl<uint8_t> &Result) {
 
-void UnwindOpcodeAssembler::AddPersonalityIndexPrefix(size_t Pos, unsigned PI) {
-  assert(PI < NUM_PERSONALITY_INDEX && "Invalid personality prefix");
-  Ops[Pos] = EHT_COMPACT | PI;
-}
+  UnwindOpcodeStreamer OpStreamer(Result);
 
-void UnwindOpcodeAssembler::EmitFinishOpcodes() {
-  for (size_t i = (0x4u - (size() & 0x3u)) & 0x3u; i > 0; --i)
-    Ops.push_back(UNWIND_OPCODE_FINISH);
-}
-
-void UnwindOpcodeAssembler::Finalize() {
   if (HasPersonality) {
-    // Personality specified by .personality directive
-    Offset = 1;
-    AddOpcodeSizePrefix(1);
+    // User-specifed personality routine: [ SIZE , OP1 , OP2 , ... ]
+    PersonalityIndex = NUM_PERSONALITY_INDEX;
+    size_t TotalSize = Ops.size() + 1;
+    size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+    Result.resize(RoundUpSize);
+    OpStreamer.EmitSize(RoundUpSize);
   } else {
-    if (getOpcodeSize() <= 3) {
+    if (Ops.size() <= 3) {
       // __aeabi_unwind_cpp_pr0: [ 0x80 , OP1 , OP2 , OP3 ]
-      Offset = 1;
       PersonalityIndex = AEABI_UNWIND_CPP_PR0;
-      AddPersonalityIndexPrefix(Offset, PersonalityIndex);
+      Result.resize(4);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
     } else {
       // __aeabi_unwind_cpp_pr1: [ 0x81 , SIZE , OP1 , OP2 , ... ]
-      Offset = 0;
       PersonalityIndex = AEABI_UNWIND_CPP_PR1;
-      AddPersonalityIndexPrefix(Offset, PersonalityIndex);
-      AddOpcodeSizePrefix(1);
+      size_t TotalSize = Ops.size() + 2;
+      size_t RoundUpSize = (TotalSize + 3) / 4 * 4;
+      Result.resize(RoundUpSize);
+      OpStreamer.EmitPersonalityIndex(PersonalityIndex);
+      OpStreamer.EmitSize(RoundUpSize);
     }
   }
 
-  // Emit the padding finish opcodes if the size() is not multiple of 4.
-  EmitFinishOpcodes();
+  // Copy the unwind opcodes
+  for (size_t i = OpBegins.size() - 1; i > 0; --i)
+    for (size_t j = OpBegins[i - 1], end = OpBegins[i]; j < end; ++j)
+      OpStreamer.EmitByte(Ops[j]);
 
-  // Swap the byte order
-  uint8_t *Ptr = Ops.begin() + Offset;
-  assert(size() % 4 == 0 && "Final unwind opcodes should align to 4");
-  for (size_t i = 0, n = size(); i < n; i += 4) {
-    std::swap(Ptr[i], Ptr[i + 3]);
-    std::swap(Ptr[i + 1], Ptr[i + 2]);
-  }
+  // Emit the padding finish opcodes if the size is not multiple of 4.
+  OpStreamer.FillFinishOpcode();
+
+  // Reset the assembler state
+  Reset();
 }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index f6ecaeb..ac67c6e 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -27,86 +27,61 @@ class MCSymbol;
 
 class UnwindOpcodeAssembler {
 private:
-  llvm::SmallVector<uint8_t, 8> Ops;
-
-  unsigned Offset;
-  unsigned PersonalityIndex;
+  llvm::SmallVector<uint8_t, 32> Ops;
+  llvm::SmallVector<unsigned, 8> OpBegins;
   bool HasPersonality;
 
-  enum {
-    // The number of bytes to be preserved for the size and personality index
-    // prefix of unwind opcodes.
-    NUM_PRESERVED_PREFIX_BUF = 2
-  };
-
 public:
   UnwindOpcodeAssembler()
-      : Ops(NUM_PRESERVED_PREFIX_BUF), Offset(NUM_PRESERVED_PREFIX_BUF),
-        PersonalityIndex(NUM_PERSONALITY_INDEX), HasPersonality(0) {
+      : HasPersonality(0) {
+    OpBegins.push_back(0);
   }
 
   /// Reset the unwind opcode assembler.
   void Reset() {
-    Ops.resize(NUM_PRESERVED_PREFIX_BUF);
-    Offset = NUM_PRESERVED_PREFIX_BUF;
-    PersonalityIndex = NUM_PERSONALITY_INDEX;
+    Ops.clear();
+    OpBegins.clear();
+    OpBegins.push_back(0);
     HasPersonality = 0;
   }
 
-  /// Get the size of the payload (including the size byte)
-  size_t size() const {
-    return Ops.size() - Offset;
-  }
-
-  /// Get the beginning of the payload
-  const uint8_t *begin() const {
-    return Ops.begin() + Offset;
-  }
-
-  /// Get the payload
-  StringRef data() const {
-    return StringRef(reinterpret_cast<const char *>(begin()), size());
-  }
-
   /// Set the personality index
   void setPersonality(const MCSymbol *Per) {
     HasPersonality = 1;
   }
 
-  /// Get the personality index
-  unsigned getPersonalityIndex() const {
-    return PersonalityIndex;
-  }
-
   /// Emit unwind opcodes for .save directives
   void EmitRegSave(uint32_t RegSave);
 
   /// Emit unwind opcodes for .vsave directives
   void EmitVFPRegSave(uint32_t VFPRegSave);
 
-  /// Emit unwind opcodes for .setfp directives
-  void EmitSetFP(uint16_t FPReg);
+  /// Emit unwind opcodes to copy address from source register to $sp.
+  void EmitSetSP(uint16_t Reg);
 
-  /// Emit unwind opcodes to update stack pointer
+  /// Emit unwind opcodes to add $sp with an offset.
   void EmitSPOffset(int64_t Offset);
 
   /// Finalize the unwind opcode sequence for EmitBytes()
-  void Finalize();
+  void Finalize(unsigned &PersonalityIndex,
+                SmallVectorImpl<uint8_t> &Result);
 
 private:
-  /// Get the size of the opcodes in bytes.
-  size_t getOpcodeSize() const {
-    return Ops.size() - NUM_PRESERVED_PREFIX_BUF;
+  void EmitInt8(unsigned Opcode) {
+    Ops.push_back(Opcode & 0xff);
+    OpBegins.push_back(OpBegins.back() + 1);
   }
 
-  /// Add the length prefix to the payload
-  void AddOpcodeSizePrefix(size_t Pos);
-
-  /// Add personality index prefix in some compact format
-  void AddPersonalityIndexPrefix(size_t Pos, unsigned PersonalityIndex);
+  void EmitInt16(unsigned Opcode) {
+    Ops.push_back((Opcode >> 8) & 0xff);
+    Ops.push_back(Opcode & 0xff);
+    OpBegins.push_back(OpBegins.back() + 2);
+  }
 
-  /// Fill the words with finish opcode if it is not aligned
-  void EmitFinishOpcodes();
+  void EmitBytes(const uint8_t *Opcode, size_t Size) {
+    Ops.insert(Ops.end(), Opcode, Opcode + Size);
+    OpBegins.push_back(OpBegins.back() + Size);
+  }
 };
 
 } // namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index a7ac5ca..bab59f4 100644
--- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -9,6 +9,7 @@ add_llvm_library(LLVMARMDesc
   ARMMachObjectWriter.cpp
   ARMELFObjectWriter.cpp
   ARMUnwindOpAsm.cpp
+  ARMMachORelocationInfo.cpp
   )
 add_dependencies(LLVMARMDesc ARMCommonTableGen)
 
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 1e2a8b0..cfb33f5 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -88,7 +88,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
   const Thumb1InstrInfo &TII =
     *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
 
-  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   unsigned NumBytes = MFI->getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -126,7 +127,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     case ARM::LR:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      AFI->addGPRCalleeSavedArea1Frame(FI);
       GPRCS1Size += 4;
       break;
     case ARM::R8:
@@ -135,16 +135,12 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     case ARM::R11:
       if (Reg == FramePtr)
         FramePtrSpillFI = FI;
-      if (STI.isTargetIOS()) {
-        AFI->addGPRCalleeSavedArea2Frame(FI);
+      if (STI.isTargetIOS())
         GPRCS2Size += 4;
-      } else {
-        AFI->addGPRCalleeSavedArea1Frame(FI);
+      else
         GPRCS1Size += 4;
-      }
       break;
     default:
-      AFI->addDPRCalleeSavedAreaFrame(FI);
       DPRCSSize += 8;
     }
   }
@@ -168,10 +164,17 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
   AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
   NumBytes = DPRCSOffset;
 
+  int FramePtrOffsetInBlock = 0;
+  if (tryFoldSPUpdateIntoPushPop(MF, prior(MBBI), NumBytes)) {
+    FramePtrOffsetInBlock = NumBytes;
+    NumBytes = 0;
+  }
+
   // Adjust FP so it point to the stack slot that contains the previous FP.
   if (HasFP) {
+    FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
     AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
-      .addFrameIndex(FramePtrSpillFI).addImm(0)
+      .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
       .setMIFlags(MachineInstr::FrameSetup));
     if (NumBytes > 508)
       // If offset is > 508 then sp cannot be adjusted in a single instruction,
@@ -212,13 +215,6 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     AFI->setShouldRestoreSPFromFP(true);
 }
 
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
 static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
   if (MI->getOpcode() == ARM::tLDRspi &&
       MI->getOperand(1).isFI() &&
@@ -249,7 +245,8 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
   const Thumb1InstrInfo &TII =
     *static_cast<const Thumb1InstrInfo*>(MF.getTarget().getInstrInfo());
 
-  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
+  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   int NumBytes = (int)MFI->getStackSize();
   const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
@@ -294,8 +291,9 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
           &MBB.front() != MBBI &&
           prior(MBBI)->getOpcode() == ARM::tPOP) {
         MachineBasicBlock::iterator PMBBI = prior(MBBI);
-        emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
-      } else
+        if (!tryFoldSPUpdateIntoPushPop(MF, PMBBI, NumBytes))
+          emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
+      } else if (!tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
         emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
     }
   }
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index 095736d..22a925e 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -22,7 +22,7 @@
 using namespace llvm;
 
 Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
-  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+  : ARMBaseInstrInfo(STI), RI(STI) {
 }
 
 /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
index 7452fb7..65a7221 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -40,9 +40,8 @@ extern cl::opt<bool> ReuseFrameIndexVals;
 
 using namespace llvm;
 
-Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
-                                       const ARMSubtarget &sti)
-  : ARMBaseRegisterInfo(tii, sti) {
+Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(sti) {
 }
 
 const TargetRegisterClass*
@@ -70,6 +69,7 @@ Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
                                       ARMCC::CondCodes Pred, unsigned PredReg,
                                       unsigned MIFlags) const {
   MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
   const Constant *C = ConstantInt::get(
           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
@@ -426,7 +426,7 @@ rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
                                 *this);
     } else {
       // Translate r0 = add sp, -imm to
-      // r0 = -imm (this is then translated into a series of instructons)
+      // r0 = -imm (this is then translated into a series of instructions)
       // r0 = add r0, sp
       emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
 
@@ -488,6 +488,9 @@ void
 Thumb1RegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
                                       unsigned BaseReg, int64_t Offset) const {
   MachineInstr &MI = *I;
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(
+      MI.getParent()->getParent()->getTarget().getInstrInfo());
   int Off = Offset; // ARM doesn't need the general 64-bit offsets
   unsigned i = 0;
 
@@ -513,6 +516,7 @@ Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
   // off the frame pointer (if, for example, there are alloca() calls in
   // the function, the offset will be negative. Use R12 instead since that's
   // a call clobbered register that we know won't be used in Thumb1 mode.
+  const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
   DebugLoc DL;
   AddDefaultPred(BuildMI(MBB, I, DL, TII.get(ARM::tMOVr))
     .addReg(ARM::R12, RegState::Define)
@@ -558,6 +562,8 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MachineInstr &MI = *II;
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const ARMBaseInstrInfo &TII =
+    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc dl = MI.getDebugLoc();
   MachineInstrBuilder MIB(*MBB.getParent(), &MI);
@@ -567,11 +573,7 @@ Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
                MF.getFrameInfo()->getStackSize() + SPAdj;
 
-  if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
-    Offset -= AFI->getGPRCalleeSavedArea1Offset();
-  else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
-    Offset -= AFI->getGPRCalleeSavedArea2Offset();
-  else if (MF.getFrameInfo()->hasVarSizedObjects()) {
+  if (MF.getFrameInfo()->hasVarSizedObjects()) {
     assert(SPAdj == 0 && MF.getTarget().getFrameLowering()->hasFP(MF) &&
            "Unexpected");
     // There are alloca()'s in this function, must reference off the frame
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index ebbab36..9689b23 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -25,7 +25,7 @@ namespace llvm {
 
 struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
 public:
-  Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+  Thumb1RegisterInfo(const ARMSubtarget &STI);
 
   const TargetRegisterClass*
   getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 97c254c..0b7d3bb 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -28,6 +28,7 @@ namespace {
     static char ID;
     Thumb2ITBlockPass() : MachineFunctionPass(ID) {}
 
+    bool restrictIT;
     const Thumb2InstrInfo *TII;
     const TargetRegisterInfo *TRI;
     ARMFunctionInfo *AFI;
@@ -73,15 +74,15 @@ static void TrackDefUses(MachineInstr *MI,
 
   for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
     unsigned Reg = LocalUses[i];
-    Uses.insert(Reg);
-    for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
+    for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+         Subreg.isValid(); ++Subreg)
       Uses.insert(*Subreg);
   }
 
   for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
     unsigned Reg = LocalDefs[i];
-    Defs.insert(Reg);
-    for (MCSubRegIterator Subreg(Reg, TRI); Subreg.isValid(); ++Subreg)
+    for (MCSubRegIterator Subreg(Reg, TRI, /*IncludeSelf=*/true);
+         Subreg.isValid(); ++Subreg)
       Defs.insert(*Subreg);
     if (Reg == ARM::CPSR)
       continue;
@@ -192,37 +193,42 @@ bool Thumb2ITBlockPass::InsertITInstructions(MachineBasicBlock &MBB) {
     // Form IT block.
     ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
     unsigned Mask = 0, Pos = 3;
-    // Branches, including tricky ones like LDM_RET, need to end an IT
-    // block so check the instruction we just put in the block.
-    for (; MBBI != E && Pos &&
-           (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
-      if (MBBI->isDebugValue())
-        continue;
-
-      MachineInstr *NMI = &*MBBI;
-      MI = NMI;
-
-      unsigned NPredReg = 0;
-      ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
-      if (NCC == CC || NCC == OCC) {
-        Mask |= (NCC & 1) << Pos;
-        // Add implicit use of ITSTATE.
-        NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
-                                               true/*isImp*/, false/*isKill*/));
-        LastITMI = NMI;
-      } else {
-        if (NCC == ARMCC::AL &&
-            MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
-          --MBBI;
-          MBB.remove(NMI);
-          MBB.insert(InsertPos, NMI);
-          ++NumMovedInsts;
+
+    // v8 IT blocks are limited to one conditional op unless -arm-no-restrict-it
+    // is set: skip the loop
+    if (!restrictIT) {
+      // Branches, including tricky ones like LDM_RET, need to end an IT
+      // block so check the instruction we just put in the block.
+      for (; MBBI != E && Pos &&
+             (!MI->isBranch() && !MI->isReturn()) ; ++MBBI) {
+        if (MBBI->isDebugValue())
           continue;
+
+        MachineInstr *NMI = &*MBBI;
+        MI = NMI;
+
+        unsigned NPredReg = 0;
+        ARMCC::CondCodes NCC = getITInstrPredicate(NMI, NPredReg);
+        if (NCC == CC || NCC == OCC) {
+          Mask |= (NCC & 1) << Pos;
+          // Add implicit use of ITSTATE.
+          NMI->addOperand(MachineOperand::CreateReg(ARM::ITSTATE, false/*ifDef*/,
+                                                 true/*isImp*/, false/*isKill*/));
+          LastITMI = NMI;
+        } else {
+          if (NCC == ARMCC::AL &&
+              MoveCopyOutOfITBlock(NMI, CC, OCC, Defs, Uses)) {
+            --MBBI;
+            MBB.remove(NMI);
+            MBB.insert(InsertPos, NMI);
+            ++NumMovedInsts;
+            continue;
+          }
+          break;
         }
-        break;
+        TrackDefUses(NMI, Defs, Uses, TRI);
+        --Pos;
       }
-      TrackDefUses(NMI, Defs, Uses, TRI);
-      --Pos;
     }
 
     // Finalize IT mask.
@@ -250,6 +256,7 @@ bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
   AFI = Fn.getInfo<ARMFunctionInfo>();
   TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
   TRI = TM.getRegisterInfo();
+  restrictIT = TM.getSubtarget<ARMSubtarget>().restrictIT();
 
   if (!AFI->isThumbFunction())
     return false;
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
index a1b48c2..91788ac 100644
--- a/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -31,12 +31,13 @@ OldT2IfCvt("old-thumb2-ifcvt", cl::Hidden,
            cl::init(false));
 
 Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
-  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+  : ARMBaseInstrInfo(STI), RI(STI) {
 }
 
 /// getNoopForMachoTarget - Return the noop instruction to use for a noop.
 void Thumb2InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
-  NopInst.setOpcode(ARM::tNOP);
+  NopInst.setOpcode(ARM::tHINT);
+  NopInst.addOperand(MCOperand::CreateImm(0));
   NopInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
   NopInst.addOperand(MCOperand::CreateReg(0));
 }
@@ -214,6 +215,13 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
                                unsigned DestReg, unsigned BaseReg, int NumBytes,
                                ARMCC::CondCodes Pred, unsigned PredReg,
                                const ARMBaseInstrInfo &TII, unsigned MIFlags) {
+  if (NumBytes == 0 && DestReg != BaseReg) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+      .addReg(BaseReg, RegState::Kill)
+      .addImm((unsigned)Pred).addReg(PredReg).setMIFlags(MIFlags);
+    return;
+  }
+
   bool isSub = NumBytes < 0;
   if (isSub) NumBytes = -NumBytes;
 
@@ -285,7 +293,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
         NumBytes = 0;
       } else {
         // FIXME: Move this to ARMAddressingModes.h?
-        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        unsigned RotAmt = countLeadingZeros(ThisVal);
         ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
         NumBytes &= ~ThisVal;
         assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
@@ -302,7 +310,7 @@ void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
         NumBytes = 0;
       } else {
         // FIXME: Move this to ARMAddressingModes.h?
-        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        unsigned RotAmt = countLeadingZeros(ThisVal);
         ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
         NumBytes &= ~ThisVal;
         assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
@@ -334,6 +342,7 @@ negativeOffsetOpcode(unsigned opcode)
   case ARM::t2STRi12:   return ARM::t2STRi8;
   case ARM::t2STRBi12:  return ARM::t2STRBi8;
   case ARM::t2STRHi12:  return ARM::t2STRHi8;
+  case ARM::t2PLDi12:   return ARM::t2PLDi8;
 
   case ARM::t2LDRi8:
   case ARM::t2LDRHi8:
@@ -343,6 +352,7 @@ negativeOffsetOpcode(unsigned opcode)
   case ARM::t2STRi8:
   case ARM::t2STRBi8:
   case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
     return opcode;
 
   default:
@@ -364,6 +374,7 @@ positiveOffsetOpcode(unsigned opcode)
   case ARM::t2STRi8:   return ARM::t2STRi12;
   case ARM::t2STRBi8:  return ARM::t2STRBi12;
   case ARM::t2STRHi8:  return ARM::t2STRHi12;
+  case ARM::t2PLDi8:   return ARM::t2PLDi12;
 
   case ARM::t2LDRi12:
   case ARM::t2LDRHi12:
@@ -373,6 +384,7 @@ positiveOffsetOpcode(unsigned opcode)
   case ARM::t2STRi12:
   case ARM::t2STRBi12:
   case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
     return opcode;
 
   default:
@@ -394,6 +406,7 @@ immediateOffsetOpcode(unsigned opcode)
   case ARM::t2STRs:   return ARM::t2STRi12;
   case ARM::t2STRBs:  return ARM::t2STRBi12;
   case ARM::t2STRHs:  return ARM::t2STRHi12;
+  case ARM::t2PLDs:   return ARM::t2PLDi12;
 
   case ARM::t2LDRi12:
   case ARM::t2LDRHi12:
@@ -403,6 +416,7 @@ immediateOffsetOpcode(unsigned opcode)
   case ARM::t2STRi12:
   case ARM::t2STRBi12:
   case ARM::t2STRHi12:
+  case ARM::t2PLDi12:
   case ARM::t2LDRi8:
   case ARM::t2LDRHi8:
   case ARM::t2LDRBi8:
@@ -411,6 +425,7 @@ immediateOffsetOpcode(unsigned opcode)
   case ARM::t2STRi8:
   case ARM::t2STRBi8:
   case ARM::t2STRHi8:
+  case ARM::t2PLDi8:
     return opcode;
 
   default:
@@ -484,7 +499,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
 
     // Otherwise, extract 8 adjacent bits from the immediate into this
     // t2ADDri/t2SUBri.
-    unsigned RotAmt = CountLeadingZeros_32(Offset);
+    unsigned RotAmt = countLeadingZeros<unsigned>(Offset);
     unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
 
     // We will handle these bits from offset, clear them.
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
index 1a7a4d4..4cb827f 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.cpp
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -24,9 +24,8 @@
 #include "llvm/IR/Function.h"
 using namespace llvm;
 
-Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii,
-                                       const ARMSubtarget &sti)
-  : ARMBaseRegisterInfo(tii, sti) {
+Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(sti) {
 }
 
 /// emitLoadConstPool - Emits a load from constpool to materialize the
@@ -40,6 +39,7 @@ Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
                                       ARMCC::CondCodes Pred, unsigned PredReg,
                                       unsigned MIFlags) const {
   MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   MachineConstantPool *ConstantPool = MF.getConstantPool();
   const Constant *C = ConstantInt::get(
            Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h
index 6b397e8..b1d63fa 100644
--- a/lib/Target/ARM/Thumb2RegisterInfo.h
+++ b/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -20,12 +20,12 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 
 namespace llvm {
-  class ARMSubtarget;
-  class ARMBaseInstrInfo;
+
+class ARMSubtarget;
 
 struct Thumb2RegisterInfo : public ARMBaseRegisterInfo {
 public:
-  Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+  Thumb2RegisterInfo(const ARMSubtarget &STI);
 
   /// emitLoadConstPool - Emits a load from constpool to materialize the
   /// specified immediate.
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 3e69098..ddc7a66 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include <algorithm>
+#include <cctype>
 #include <cstdio>
 #include <map>
 #include <set>
@@ -291,8 +292,6 @@ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
     Out << "GlobalValue::LinkOnceAnyLinkage "; break;
   case GlobalValue::LinkOnceODRLinkage:
     Out << "GlobalValue::LinkOnceODRLinkage "; break;
-  case GlobalValue::LinkOnceODRAutoHideLinkage:
-    Out << "GlobalValue::LinkOnceODRAutoHideLinkage"; break;
   case GlobalValue::WeakAnyLinkage:
     Out << "GlobalValue::WeakAnyLinkage"; break;
   case GlobalValue::WeakODRLinkage:
@@ -497,6 +496,7 @@ void CppWriter::printAttributes(const AttributeSet &PAL,
       HANDLE_ATTR(ReadOnly);
       HANDLE_ATTR(NoInline);
       HANDLE_ATTR(AlwaysInline);
+      HANDLE_ATTR(OptimizeNone);
       HANDLE_ATTR(OptimizeForSize);
       HANDLE_ATTR(StackProtect);
       HANDLE_ATTR(StackProtectReq);
@@ -1139,7 +1139,7 @@ void CppWriter::printInstruction(const Instruction *I,
     nl(Out);
     for (SwitchInst::ConstCaseIt i = SI->case_begin(), e = SI->case_end();
          i != e; ++i) {
-      const IntegersSubset CaseVal = i.getCaseValueEx();
+      const ConstantInt* CaseVal = i.getCaseValue();
       const BasicBlock *BB = i.getCaseSuccessor();
       Out << iName << "->addCase("
           << getOpName(CaseVal) << ", "
@@ -1160,8 +1160,7 @@ void CppWriter::printInstruction(const Instruction *I,
     break;
   }
   case Instruction::Resume: {
-    Out << "ResumeInst::Create(mod->getContext(), " << opNames[0]
-        << ", " << bbname << ");";
+    Out << "ResumeInst::Create(" << opNames[0] << ", " << bbname << ");";
     break;
   }
   case Instruction::Invoke: {
@@ -1175,7 +1174,7 @@ void CppWriter::printInstruction(const Instruction *I,
     }
     // FIXME: This shouldn't use magic numbers -3, -2, and -1.
     Out << "InvokeInst *" << iName << " = InvokeInst::Create("
-        << getOpName(inv->getCalledFunction()) << ", "
+        << getOpName(inv->getCalledValue()) << ", "
         << getOpName(inv->getNormalDest()) << ", "
         << getOpName(inv->getUnwindDest()) << ", "
         << iName << "_params, \"";
@@ -1589,6 +1588,20 @@ void CppWriter::printInstruction(const Instruction *I,
     Out << "\");";
     break;
   }
+  case Instruction::LandingPad: {
+    const LandingPadInst *lpi = cast<LandingPadInst>(I);
+    Out << "LandingPadInst* " << iName << " = LandingPadInst::Create(";
+    printCppName(lpi->getType());
+    Out << ", " << opNames[0] << ", " << lpi->getNumClauses() << ", \"";
+    printEscapedString(lpi->getName());
+    Out << "\", " << bbname << ");";
+    nl(Out) << iName << "->setCleanup("
+            << (lpi->isCleanup() ? "true" : "false")
+            << ");";
+    for (unsigned i = 0, e = lpi->getNumClauses(); i != e; ++i)
+      nl(Out) << iName << "->addClause(" << opNames[i+1] << ");";
+    break;
+  }
   }
   DefinedValues.insert(I);
   nl(Out);
@@ -1832,7 +1845,7 @@ void CppWriter::printInline(const std::string& fname,
   unsigned arg_count = 1;
   for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
        AI != AE; ++AI) {
-    Out << ", Value* arg_" << arg_count;
+    Out << ", Value* arg_" << arg_count++;
   }
   Out << ") {";
   nl(Out);
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index b5b887e..ae3c9eb 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -9,8 +9,6 @@ tablegen(LLVM HexagonGenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM HexagonGenDFAPacketizer.inc -gen-dfa-packetizer)
 add_public_tablegen_target(HexagonCommonTableGen)
 
-set(LLVM_COMMON_DEPENDS intrinsics_gen)
-
 add_llvm_target(HexagonCodeGen
   HexagonAsmPrinter.cpp
   HexagonCallingConvLower.cpp
@@ -19,6 +17,7 @@ add_llvm_target(HexagonCodeGen
   HexagonFrameLowering.cpp
   HexagonHardwareLoops.cpp
   HexagonFixupHwLoops.cpp
+  HexagonMachineFunctionInfo.cpp
   HexagonMachineScheduler.cpp
   HexagonMCInstLower.cpp
   HexagonInstrInfo.cpp
@@ -28,14 +27,18 @@ add_llvm_target(HexagonCodeGen
   HexagonRegisterInfo.cpp
   HexagonRemoveSZExtArgs.cpp
   HexagonSelectionDAGInfo.cpp
+  HexagonSplitConst32AndConst64.cpp
   HexagonSplitTFRCondSets.cpp
   HexagonSubtarget.cpp
   HexagonTargetMachine.cpp
   HexagonTargetObjectFile.cpp
   HexagonVLIWPacketizer.cpp
   HexagonNewValueJump.cpp
+  HexagonCopyToCombine.cpp
 )
 
+add_dependencies(LLVMHexagonCodeGen HexagonCommonTableGen intrinsics_gen)
+
 add_subdirectory(TargetInfo)
 add_subdirectory(InstPrinter)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Hexagon/Hexagon.h b/lib/Target/Hexagon/Hexagon.h
index a9b00a2..5467ee3 100644
--- a/lib/Target/Hexagon/Hexagon.h
+++ b/lib/Target/Hexagon/Hexagon.h
@@ -29,7 +29,7 @@ namespace llvm {
   class HexagonTargetMachine;
   class raw_ostream;
 
-  FunctionPass *createHexagonISelDag(const HexagonTargetMachine &TM,
+  FunctionPass *createHexagonISelDag(HexagonTargetMachine &TM,
                                      CodeGenOpt::Level OptLevel);
   FunctionPass *createHexagonDelaySlotFillerPass(const TargetMachine &TM);
   FunctionPass *createHexagonFPMoverPass(const TargetMachine &TM);
@@ -37,11 +37,15 @@ namespace llvm {
   FunctionPass *createHexagonCFGOptimizer(const HexagonTargetMachine &TM);
 
   FunctionPass *createHexagonSplitTFRCondSets(const HexagonTargetMachine &TM);
+  FunctionPass *createHexagonSplitConst32AndConst64(
+                      const HexagonTargetMachine &TM);
   FunctionPass *createHexagonExpandPredSpillCode(
                       const HexagonTargetMachine &TM);
   FunctionPass *createHexagonHardwareLoops();
   FunctionPass *createHexagonPeephole();
   FunctionPass *createHexagonFixupHwLoops();
+  FunctionPass *createHexagonNewValueJump();
+  FunctionPass *createHexagonCopyToCombine();
   FunctionPass *createHexagonPacketizer();
   FunctionPass *createHexagonNewValueJump();
 
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 9b3a643..568798c 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -120,15 +120,39 @@ def getPredNewOpcode : InstrMapping {
 }
 
 //===----------------------------------------------------------------------===//
+// Generate mapping table to relate .new predicated instructions with their old
+// format.
+//
+def getPredOldOpcode : InstrMapping {
+  let FilterClass = "PredNewRel";
+  let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+  let ColFields = ["PNewValue"];
+  let KeyCol = ["new"];
+  let ValueCols = [[""]];
+}
+
+//===----------------------------------------------------------------------===//
 // Generate mapping table to relate store instructions with their new-value
 // format.
 //
 def getNewValueOpcode : InstrMapping {
   let FilterClass = "NewValueRel";
   let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
-  let ColFields = ["isNVStore"];
-  let KeyCol = ["0"];
-  let ValueCols = [["1"]];
+  let ColFields = ["NValueST"];
+  let KeyCol = ["false"];
+  let ValueCols = [["true"]];
+}
+
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate new-value store instructions with their old
+// format.
+//
+def getNonNVStore : InstrMapping {
+  let FilterClass = "NewValueRel";
+  let RowFields = ["BaseOpcode", "PredSense", "PNewValue"];
+  let ColFields = ["NValueST"];
+  let KeyCol = ["true"];
+  let ValueCols = [["false"]];
 }
 
 def getBasedWithImmOffset : InstrMapping {
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 88cd3fb..a2e04ba 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -99,7 +99,7 @@ void HexagonAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
     return;
   case MachineOperand::MO_GlobalAddress:
     // Computing the address of a global symbol, not calling it.
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
     printOffset(MO.getOffset(), O);
     return;
   }
@@ -267,7 +267,7 @@ void HexagonAsmPrinter::printGlobalOperand(const MachineInstr *MI, int OpNo,
   assert( (MO.getType() == MachineOperand::MO_GlobalAddress) &&
          "Expecting global address");
 
-  O << *Mang->getSymbol(MO.getGlobal());
+  O << *getSymbol(MO.getGlobal());
   if (MO.getOffset() != 0) {
     O << " + ";
     O << MO.getOffset();
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
index 2c93d04..f5f958c 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
@@ -25,14 +25,13 @@ using namespace llvm;
 
 Hexagon_CCState::Hexagon_CCState(CallingConv::ID CC, bool isVarArg,
                                  const TargetMachine &tm,
-                                 SmallVector<CCValAssign, 16> &locs,
+                                 SmallVectorImpl<CCValAssign> &locs,
                                  LLVMContext &c)
-  : CallingConv(CC), IsVarArg(isVarArg), TM(tm),
-    TRI(*TM.getRegisterInfo()), Locs(locs), Context(c) {
+  : CallingConv(CC), IsVarArg(isVarArg), TM(tm), Locs(locs), Context(c) {
   // No stack is used.
   StackOffset = 0;
 
-  UsedRegs.resize((TRI.getNumRegs()+31)/32);
+  UsedRegs.resize((TM.getRegisterInfo()->getNumRegs()+31)/32);
 }
 
 // HandleByVal - Allocate a stack slot large enough to pass an argument by
@@ -56,6 +55,7 @@ void Hexagon_CCState::HandleByVal(unsigned ValNo, EVT ValVT,
 
 /// MarkAllocated - Mark a register and all of its aliases as allocated.
 void Hexagon_CCState::MarkAllocated(unsigned Reg) {
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
   for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
     UsedRegs[*AI/32] |= 1 << (*AI&31);
 }
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.h b/lib/Target/Hexagon/HexagonCallingConvLower.h
index 489b3a3..33c8306 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.h
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.h
@@ -48,15 +48,14 @@ class Hexagon_CCState {
   CallingConv::ID CallingConv;
   bool IsVarArg;
   const TargetMachine &TM;
-  const TargetRegisterInfo &TRI;
-  SmallVector<CCValAssign, 16> &Locs;
+  SmallVectorImpl<CCValAssign> &Locs;
   LLVMContext &Context;
 
   unsigned StackOffset;
   SmallVector<uint32_t, 16> UsedRegs;
 public:
   Hexagon_CCState(CallingConv::ID CC, bool isVarArg, const TargetMachine &TM,
-                SmallVector<CCValAssign, 16> &locs, LLVMContext &c);
+                  SmallVectorImpl<CCValAssign> &locs, LLVMContext &c);
 
   void addLoc(const CCValAssign &V) {
     Locs.push_back(V);
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
new file mode 100644
index 0000000..dc440cb
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -0,0 +1,677 @@
+//===------- HexagonCopyToCombine.cpp - Hexagon Copy-To-Combine Pass ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass replaces transfer instructions by combine instructions.
+// We walk along a basic block and look for two combinable instructions and try
+// to move them together. If we can move them next to each other we do so and
+// replace them with a combine instruction.
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "hexagon-copy-combine"
+
+#include "llvm/PassSupport.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "Hexagon.h"
+#include "HexagonInstrInfo.h"
+#include "HexagonRegisterInfo.h"
+#include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonMachineFunctionInfo.h"
+
+using namespace llvm;
+
+static
+cl::opt<bool> IsCombinesDisabled("disable-merge-into-combines",
+                                 cl::Hidden, cl::ZeroOrMore,
+                                 cl::init(false),
+                                 cl::desc("Disable merging into combines"));
+static
+cl::opt<unsigned>
+MaxNumOfInstsBetweenNewValueStoreAndTFR("max-num-inst-between-tfr-and-nv-store",
+                   cl::Hidden, cl::init(4),
+                   cl::desc("Maximum distance between a tfr feeding a store we "
+                            "consider the store still to be newifiable"));
+
+namespace llvm {
+  void initializeHexagonCopyToCombinePass(PassRegistry&);
+}
+
+
+namespace {
+
+class HexagonCopyToCombine : public MachineFunctionPass  {
+  const HexagonInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  bool ShouldCombineAggressively;
+
+  DenseSet<MachineInstr *> PotentiallyNewifiableTFR;
+public:
+  static char ID;
+
+  HexagonCopyToCombine() : MachineFunctionPass(ID) {
+    initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const char *getPassName() const {
+    return "Hexagon Copy-To-Combine Pass";
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+private:
+  MachineInstr *findPairable(MachineInstr *I1, bool &DoInsertAtI1);
+
+  void findPotentialNewifiableTFRs(MachineBasicBlock &);
+
+  void combine(MachineInstr *I1, MachineInstr *I2,
+               MachineBasicBlock::iterator &MI, bool DoInsertAtI1);
+
+  bool isSafeToMoveTogether(MachineInstr *I1, MachineInstr *I2,
+                            unsigned I1DestReg, unsigned I2DestReg,
+                            bool &DoInsertAtI1);
+
+  void emitCombineRR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineRI(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineIR(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+
+  void emitCombineII(MachineBasicBlock::iterator &Before, unsigned DestReg,
+                     MachineOperand &HiOperand, MachineOperand &LoOperand);
+};
+
+} // End anonymous namespace.
+
+char HexagonCopyToCombine::ID = 0;
+
+INITIALIZE_PASS(HexagonCopyToCombine, "hexagon-copy-combine",
+                "Hexagon Copy-To-Combine Pass", false, false)
+
+static bool isCombinableInstType(MachineInstr *MI,
+                                 const HexagonInstrInfo *TII,
+                                 bool ShouldCombineAggressively) {
+  switch(MI->getOpcode()) {
+  case Hexagon::TFR: {
+    // A COPY instruction can be combined if its arguments are IntRegs (32bit).
+    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isReg());
+
+    unsigned DestReg = MI->getOperand(0).getReg();
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    return Hexagon::IntRegsRegClass.contains(DestReg) &&
+      Hexagon::IntRegsRegClass.contains(SrcReg);
+  }
+
+  case Hexagon::TFRI: {
+    // A transfer-immediate can be combined if its argument is a signed 8bit
+    // value.
+    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+    unsigned DestReg = MI->getOperand(0).getReg();
+
+    // Only combine constant extended TFRI if we are in aggressive mode.
+    return Hexagon::IntRegsRegClass.contains(DestReg) &&
+      (ShouldCombineAggressively || isInt<8>(MI->getOperand(1).getImm()));
+  }
+
+  case Hexagon::TFRI_V4: {
+    if (!ShouldCombineAggressively)
+      return false;
+    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isGlobal());
+
+    // Ensure that TargetFlags are MO_NO_FLAG for a global. This is a
+    // workaround for an ABI bug that prevents GOT relocations on combine
+    // instructions
+    if (MI->getOperand(1).getTargetFlags() != HexagonII::MO_NO_FLAG)
+      return false;
+
+    unsigned DestReg = MI->getOperand(0).getReg();
+    return Hexagon::IntRegsRegClass.contains(DestReg);
+  }
+
+  default:
+    break;
+  }
+
+  return false;
+}
+
+static bool isGreaterThan8BitTFRI(MachineInstr *I) {
+  return I->getOpcode() == Hexagon::TFRI &&
+    !isInt<8>(I->getOperand(1).getImm());
+}
+static bool isGreaterThan6BitTFRI(MachineInstr *I) {
+  return I->getOpcode() == Hexagon::TFRI &&
+    !isUInt<6>(I->getOperand(1).getImm());
+}
+
+/// areCombinableOperations - Returns true if the two instruction can be merge
+/// into a combine (ignoring register constraints).
+static bool areCombinableOperations(const TargetRegisterInfo *TRI,
+                                    MachineInstr *HighRegInst,
+                                    MachineInstr *LowRegInst) {
+  assert((HighRegInst->getOpcode() == Hexagon::TFR ||
+          HighRegInst->getOpcode() == Hexagon::TFRI ||
+          HighRegInst->getOpcode() == Hexagon::TFRI_V4) &&
+         (LowRegInst->getOpcode() == Hexagon::TFR ||
+          LowRegInst->getOpcode() == Hexagon::TFRI ||
+          LowRegInst->getOpcode() == Hexagon::TFRI_V4) &&
+         "Assume individual instructions are of a combinable type");
+
+  const HexagonRegisterInfo *QRI =
+    static_cast<const HexagonRegisterInfo *>(TRI);
+
+  // V4 added some combine variations (mixed immediate and register source
+  // operands), if we are on < V4 we can only combine 2 register-to-register
+  // moves and 2 immediate-to-register moves. We also don't have
+  // constant-extenders.
+  if (!QRI->Subtarget.hasV4TOps())
+    return HighRegInst->getOpcode() == LowRegInst->getOpcode() &&
+      !isGreaterThan8BitTFRI(HighRegInst) &&
+      !isGreaterThan6BitTFRI(LowRegInst);
+
+  // There is no combine of two constant extended values.
+  if ((HighRegInst->getOpcode() == Hexagon::TFRI_V4 ||
+       isGreaterThan8BitTFRI(HighRegInst)) &&
+      (LowRegInst->getOpcode() == Hexagon::TFRI_V4 ||
+       isGreaterThan6BitTFRI(LowRegInst)))
+    return false;
+
+  return true;
+}
+
+static bool isEvenReg(unsigned Reg) {
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         Hexagon::IntRegsRegClass.contains(Reg));
+  return (Reg - Hexagon::R0) % 2 == 0;
+}
+
+static void removeKillInfo(MachineInstr *MI, unsigned RegNotKilled) {
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    MachineOperand &Op = MI->getOperand(I);
+    if (!Op.isReg() || Op.getReg() != RegNotKilled || !Op.isKill())
+      continue;
+    Op.setIsKill(false);
+  }
+}
+
+/// isUnsafeToMoveAcross - Returns true if it is unsafe to move a copy
+/// instruction from \p UseReg to \p DestReg over the instruction \p I.
+static bool isUnsafeToMoveAcross(MachineInstr *I, unsigned UseReg,
+                                  unsigned DestReg,
+                                  const TargetRegisterInfo *TRI) {
+  return (UseReg && (I->modifiesRegister(UseReg, TRI))) ||
+          I->modifiesRegister(DestReg, TRI) ||
+          I->readsRegister(DestReg, TRI) ||
+          I->hasUnmodeledSideEffects() ||
+          I->isInlineAsm() || I->isDebugValue();
+}
+
+/// isSafeToMoveTogether - Returns true if it is safe to move I1 next to I2 such
+/// that the two instructions can be paired in a combine.
+bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
+                                                MachineInstr *I2,
+                                                unsigned I1DestReg,
+                                                unsigned I2DestReg,
+                                                bool &DoInsertAtI1) {
+
+  bool IsImmUseReg = I2->getOperand(1).isImm() || I2->getOperand(1).isGlobal();
+  unsigned I2UseReg = IsImmUseReg ? 0 : I2->getOperand(1).getReg();
+
+  // It is not safe to move I1 and I2 into one combine if I2 has a true
+  // dependence on I1.
+  if (I2UseReg && I1->modifiesRegister(I2UseReg, TRI))
+    return false;
+
+  bool isSafe = true;
+
+  // First try to move I2 towards I1.
+  {
+    // A reverse_iterator instantiated like below starts before I2, and I1
+    // respectively.
+    // Look at instructions I in between I2 and (excluding) I1.
+    MachineBasicBlock::reverse_iterator I(I2),
+      End = --(MachineBasicBlock::reverse_iterator(I1));
+    // At 03 we got better results (dhrystone!) by being more conservative.
+    if (!ShouldCombineAggressively)
+      End = MachineBasicBlock::reverse_iterator(I1);
+    // If I2 kills its operand and we move I2 over an instruction that also
+    // uses I2's use reg we need to modify that (first) instruction to now kill
+    // this reg.
+    unsigned KilledOperand = 0;
+    if (I2->killsRegister(I2UseReg))
+      KilledOperand = I2UseReg;
+    MachineInstr *KillingInstr = 0;
+
+    for (; I != End; ++I) {
+      // If the intervening instruction I:
+      //   * modifies I2's use reg
+      //   * modifies I2's def reg
+      //   * reads I2's def reg
+      //   * or has unmodelled side effects
+      // we can't move I2 across it.
+      if (isUnsafeToMoveAcross(&*I, I2UseReg, I2DestReg, TRI)) {
+        isSafe = false;
+        break;
+      }
+
+      // Update first use of the killed operand.
+      if (!KillingInstr && KilledOperand &&
+          I->readsRegister(KilledOperand, TRI))
+        KillingInstr = &*I;
+    }
+    if (isSafe) {
+      // Update the intermediate instruction to with the kill flag.
+      if (KillingInstr) {
+        bool Added = KillingInstr->addRegisterKilled(KilledOperand, TRI, true);
+        (void)Added; // supress compiler warning
+        assert(Added && "Must successfully update kill flag");
+        removeKillInfo(I2, KilledOperand);
+      }
+      DoInsertAtI1 = true;
+      return true;
+    }
+  }
+
+  // Try to move I1 towards I2.
+  {
+    // Look at instructions I in between I1 and (excluding) I2.
+    MachineBasicBlock::iterator I(I1), End(I2);
+    // At O3 we got better results (dhrystone) by being more conservative here.
+    if (!ShouldCombineAggressively)
+      End = llvm::next(MachineBasicBlock::iterator(I2));
+    IsImmUseReg = I1->getOperand(1).isImm() || I1->getOperand(1).isGlobal();
+    unsigned I1UseReg = IsImmUseReg ? 0 : I1->getOperand(1).getReg();
+    // Track killed operands. If we move across an instruction that kills our
+    // operand, we need to update the kill information on the moved I1. It kills
+    // the operand now.
+    MachineInstr *KillingInstr = 0;
+    unsigned KilledOperand = 0;
+
+    while(++I != End) {
+      // If the intervening instruction I:
+      //   * modifies I1's use reg
+      //   * modifies I1's def reg
+      //   * reads I1's def reg
+      //   * or has unmodelled side effects
+      //   We introduce this special case because llvm has no api to remove a
+      //   kill flag for a register (a removeRegisterKilled() analogous to
+      //   addRegisterKilled) that handles aliased register correctly.
+      //   * or has a killed aliased register use of I1's use reg
+      //           %D4<def> = TFRI64 16
+      //           %R6<def> = TFR %R9
+      //           %R8<def> = KILL %R8, %D4<imp-use,kill>
+      //      If we want to move R6 = across the KILL instruction we would have
+      //      to remove the %D4<imp-use,kill> operand. For now, we are
+      //      conservative and disallow the move.
+      // we can't move I1 across it.
+      if (isUnsafeToMoveAcross(I, I1UseReg, I1DestReg, TRI) ||
+          // Check for an aliased register kill. Bail out if we see one.
+          (!I->killsRegister(I1UseReg) && I->killsRegister(I1UseReg, TRI)))
+        return false;
+
+      // Check for an exact kill (registers match).
+      if (I1UseReg && I->killsRegister(I1UseReg)) {
+        assert(KillingInstr == 0 && "Should only see one killing instruction");
+        KilledOperand = I1UseReg;
+        KillingInstr = &*I;
+      }
+    }
+    if (KillingInstr) {
+      removeKillInfo(KillingInstr, KilledOperand);
+      // Update I1 to set the kill flag. This flag will later be picked up by
+      // the new COMBINE instruction.
+      bool Added = I1->addRegisterKilled(KilledOperand, TRI);
+      (void)Added; // supress compiler warning
+      assert(Added && "Must successfully update kill flag");
+    }
+    DoInsertAtI1 = false;
+  }
+
+  return true;
+}
+
+/// findPotentialNewifiableTFRs - Finds tranfers that feed stores that could be
+/// newified. (A use of a 64 bit register define can not be newified)
+void
+HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
+  DenseMap<unsigned, MachineInstr *> LastDef;
+  for (MachineBasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+    MachineInstr *MI = I;
+    // Mark TFRs that feed a potential new value store as such.
+    if(TII->mayBeNewStore(MI)) {
+      // Look for uses of TFR instructions.
+      for (unsigned OpdIdx = 0, OpdE = MI->getNumOperands(); OpdIdx != OpdE;
+           ++OpdIdx) {
+        MachineOperand &Op = MI->getOperand(OpdIdx);
+
+        // Skip over anything except register uses.
+        if (!Op.isReg() || !Op.isUse() || !Op.getReg())
+          continue;
+
+        // Look for the defining instruction.
+        unsigned Reg = Op.getReg();
+        MachineInstr *DefInst = LastDef[Reg];
+        if (!DefInst)
+          continue;
+        if (!isCombinableInstType(DefInst, TII, ShouldCombineAggressively))
+          continue;
+
+        // Only close newifiable stores should influence the decision.
+        MachineBasicBlock::iterator It(DefInst);
+        unsigned NumInstsToDef = 0;
+        while (&*It++ != MI)
+          ++NumInstsToDef;
+
+        if (NumInstsToDef > MaxNumOfInstsBetweenNewValueStoreAndTFR)
+          continue;
+
+        PotentiallyNewifiableTFR.insert(DefInst);
+      }
+      // Skip to next instruction.
+      continue;
+    }
+
+    // Put instructions that last defined integer or double registers into the
+    // map.
+    for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+      MachineOperand &Op = MI->getOperand(I);
+      if (!Op.isReg() || !Op.isDef() || !Op.getReg())
+        continue;
+      unsigned Reg = Op.getReg();
+      if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+          LastDef[*SubRegs] = MI;
+        }
+      } else if (Hexagon::IntRegsRegClass.contains(Reg))
+        LastDef[Reg] = MI;
+    }
+  }
+}
+
+bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
+
+  if (IsCombinesDisabled) return false;
+
+  bool HasChanged = false;
+
+  // Get target info.
+  TRI = MF.getTarget().getRegisterInfo();
+  TII = static_cast<const HexagonInstrInfo *>(MF.getTarget().getInstrInfo());
+
+  // Combine aggressively (for code size)
+  ShouldCombineAggressively =
+    MF.getTarget().getOptLevel() <= CodeGenOpt::Default;
+
+  // Traverse basic blocks.
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    PotentiallyNewifiableTFR.clear();
+    findPotentialNewifiableTFRs(*BI);
+
+    // Traverse instructions in basic block.
+    for(MachineBasicBlock::iterator MI = BI->begin(), End = BI->end();
+        MI != End;) {
+      MachineInstr *I1 = MI++;
+      // Don't combine a TFR whose user could be newified (instructions that
+      // define double registers can not be newified - Programmer's Ref Manual
+      // 5.4.2 New-value stores).
+      if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I1))
+        continue;
+
+      // Ignore instructions that are not combinable.
+      if (!isCombinableInstType(I1, TII, ShouldCombineAggressively))
+        continue;
+
+      // Find a second instruction that can be merged into a combine
+      // instruction.
+      bool DoInsertAtI1 = false;
+      MachineInstr *I2 = findPairable(I1, DoInsertAtI1);
+      if (I2) {
+        HasChanged = true;
+        combine(I1, I2, MI, DoInsertAtI1);
+      }
+    }
+  }
+
+  return HasChanged;
+}
+
+/// findPairable - Returns an instruction that can be merged with \p I1 into a
+/// COMBINE instruction or 0 if no such instruction can be found. Returns true
+/// in \p DoInsertAtI1 if the combine must be inserted at instruction \p I1
+/// false if the combine must be inserted at the returned instruction.
+MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
+                                                 bool &DoInsertAtI1) {
+  MachineBasicBlock::iterator I2 = llvm::next(MachineBasicBlock::iterator(I1));
+  unsigned I1DestReg = I1->getOperand(0).getReg();
+
+  for (MachineBasicBlock::iterator End = I1->getParent()->end(); I2 != End;
+       ++I2) {
+    // Bail out early if we see a second definition of I1DestReg.
+    if (I2->modifiesRegister(I1DestReg, TRI))
+      break;
+
+    // Ignore non-combinable instructions.
+    if (!isCombinableInstType(I2, TII, ShouldCombineAggressively))
+      continue;
+
+    // Don't combine a TFR whose user could be newified.
+    if (ShouldCombineAggressively && PotentiallyNewifiableTFR.count(I2))
+      continue;
+
+    unsigned I2DestReg = I2->getOperand(0).getReg();
+
+    // Check that registers are adjacent and that the first destination register
+    // is even.
+    bool IsI1LowReg = (I2DestReg - I1DestReg) == 1;
+    bool IsI2LowReg = (I1DestReg - I2DestReg) == 1;
+    unsigned FirstRegIndex = IsI1LowReg ? I1DestReg : I2DestReg;
+    if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
+      continue;
+
+    // Check that the two instructions are combinable. V4 allows more
+    // instructions to be merged into a combine.
+    // The order matters because in a TFRI we might can encode a int8 as the
+    // hi reg operand but only a uint6 as the low reg operand.
+    if ((IsI2LowReg && !areCombinableOperations(TRI, I1, I2)) ||
+        (IsI1LowReg && !areCombinableOperations(TRI, I2, I1)))
+      break;
+
+    if (isSafeToMoveTogether(I1, I2, I1DestReg, I2DestReg,
+                             DoInsertAtI1))
+      return I2;
+
+    // Not safe. Stop searching.
+    break;
+  }
+  return 0;
+}
+
+void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2,
+                                   MachineBasicBlock::iterator &MI,
+                                   bool DoInsertAtI1) {
+  // We are going to delete I2. If MI points to I2 advance it to the next
+  // instruction.
+  if ((MachineInstr *)MI == I2) ++MI;
+
+  // Figure out whether I1 or I2 goes into the lowreg part.
+  unsigned I1DestReg = I1->getOperand(0).getReg();
+  unsigned I2DestReg = I2->getOperand(0).getReg();
+  bool IsI1Loreg = (I2DestReg - I1DestReg) == 1;
+  unsigned LoRegDef = IsI1Loreg ? I1DestReg : I2DestReg;
+
+  // Get the double word register.
+  unsigned DoubleRegDest =
+    TRI->getMatchingSuperReg(LoRegDef, Hexagon::subreg_loreg,
+                             &Hexagon::DoubleRegsRegClass);
+  assert(DoubleRegDest != 0 && "Expect a valid register");
+
+
+  // Setup source operands.
+  MachineOperand &LoOperand = IsI1Loreg ? I1->getOperand(1) :
+    I2->getOperand(1);
+  MachineOperand &HiOperand = IsI1Loreg ? I2->getOperand(1) :
+    I1->getOperand(1);
+
+  // Figure out which source is a register and which a constant.
+  bool IsHiReg = HiOperand.isReg();
+  bool IsLoReg = LoOperand.isReg();
+
+  MachineBasicBlock::iterator InsertPt(DoInsertAtI1 ? I1 : I2);
+  // Emit combine.
+  if (IsHiReg && IsLoReg)
+    emitCombineRR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else if (IsHiReg)
+    emitCombineRI(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else if (IsLoReg)
+    emitCombineIR(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+  else
+    emitCombineII(InsertPt, DoubleRegDest, HiOperand, LoOperand);
+
+  I1->eraseFromParent();
+  I2->eraseFromParent();
+}
+
+void HexagonCopyToCombine::emitCombineII(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle  globals.
+  if (HiOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+      .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+                        HiOperand.getTargetFlags())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+  if (LoOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+                        LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Handle constant extended immediates.
+  if (!isInt<8>(HiOperand.getImm())) {
+    assert(isInt<8>(LoOperand.getImm()));
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+
+  if (!isUInt<6>(LoOperand.getImm())) {
+    assert(isInt<8>(HiOperand.getImm()));
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_iI_V4), DoubleDestReg)
+      .addImm(HiOperand.getImm())
+      .addImm(LoOperand.getImm());
+    return;
+  }
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine #HiImm, #LoImm
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ii), DoubleDestReg)
+    .addImm(HiOperand.getImm())
+    .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineIR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned LoReg = LoOperand.getReg();
+  unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle global.
+  if (HiOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+      .addGlobalAddress(HiOperand.getGlobal(), HiOperand.getOffset(),
+                        HiOperand.getTargetFlags())
+      .addReg(LoReg, LoRegKillFlag);
+    return;
+  }
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine #HiImm, LoReg
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_Ir_V4), DoubleDestReg)
+    .addImm(HiOperand.getImm())
+    .addReg(LoReg, LoRegKillFlag);
+}
+
+void HexagonCopyToCombine::emitCombineRI(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+  unsigned HiReg = HiOperand.getReg();
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Handle global.
+  if (LoOperand.isGlobal()) {
+    BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+      .addReg(HiReg, HiRegKillFlag)
+      .addGlobalAddress(LoOperand.getGlobal(), LoOperand.getOffset(),
+                        LoOperand.getTargetFlags());
+    return;
+  }
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine HiReg, #LoImm
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rI_V4), DoubleDestReg)
+    .addReg(HiReg, HiRegKillFlag)
+    .addImm(LoOperand.getImm());
+}
+
+void HexagonCopyToCombine::emitCombineRR(MachineBasicBlock::iterator &InsertPt,
+                                         unsigned DoubleDestReg,
+                                         MachineOperand &HiOperand,
+                                         MachineOperand &LoOperand) {
+  unsigned LoRegKillFlag = getKillRegState(LoOperand.isKill());
+  unsigned HiRegKillFlag = getKillRegState(HiOperand.isKill());
+  unsigned LoReg = LoOperand.getReg();
+  unsigned HiReg = HiOperand.getReg();
+
+  DebugLoc DL = InsertPt->getDebugLoc();
+  MachineBasicBlock *BB = InsertPt->getParent();
+
+  // Insert new combine instruction.
+  //  DoubleRegDest = combine HiReg, LoReg
+  BuildMI(*BB, InsertPt, DL, TII->get(Hexagon::COMBINE_rr), DoubleDestReg)
+    .addReg(HiReg, HiRegKillFlag)
+    .addReg(LoReg, LoRegKillFlag);
+}
+
+FunctionPass *llvm::createHexagonCopyToCombine() {
+  return new HexagonCopyToCombine();
+}
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index de993ee..2b04f25 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -76,17 +76,12 @@ void HexagonFrameLowering::determineFrameLayout(MachineFunction &MF) const {
 void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
   MachineBasicBlock::iterator MBBI = MBB.begin();
   const HexagonRegisterInfo *QRI =
     static_cast<const HexagonRegisterInfo *>(MF.getTarget().getRegisterInfo());
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   determineFrameLayout(MF);
 
-  // Check if frame moves are needed for EH.
-  bool needsFrameMoves = MMI.hasDebugInfo() ||
-    !MF.getFunction()->needsUnwindTableEntry();
-
   // Get the number of bytes to allocate from the FrameInfo.
   int NumBytes = (int) MFI->getStackSize();
 
@@ -113,28 +108,6 @@ void HexagonFrameLowering::emitPrologue(MachineFunction &MF) const {
     MO.setImm(MFI->getMaxCallFrameSize());
   }
 
- std::vector<MachineMove> &Moves = MMI.getFrameMoves();
-
- if (needsFrameMoves) {
-   // Advance CFA. DW_CFA_def_cfa
-   unsigned FPReg = QRI->getFrameRegister();
-   unsigned RAReg = QRI->getRARegister();
-
-   MachineLocation Dst(MachineLocation::VirtualFP);
-   MachineLocation Src(FPReg, -8);
-   Moves.push_back(MachineMove(0, Dst, Src));
-
-   // R31 = (R31 - #4)
-   MachineLocation LRDst(RAReg, -4);
-   MachineLocation LRSrc(RAReg);
-   Moves.push_back(MachineMove(0, LRDst, LRSrc));
-
-   // R30 = (R30 - #8)
-   MachineLocation SPDst(FPReg, -8);
-   MachineLocation SPSrc(FPReg);
-   Moves.push_back(MachineMove(0, SPDst, SPSrc));
- }
-
   //
   // Only insert ALLOCFRAME if we need to.
   //
@@ -174,30 +147,55 @@ void HexagonFrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = prior(MBB.end());
   DebugLoc dl = MBBI->getDebugLoc();
   //
-  // Only insert deallocframe if we need to.
+  // Only insert deallocframe if we need to.  Also at -O0.  See comment
+  // in emitPrologue above.
   //
-  if (hasFP(MF)) {
+  if (hasFP(MF) || MF.getTarget().getOptLevel() == CodeGenOpt::None) {
     MachineBasicBlock::iterator MBBI = prior(MBB.end());
     MachineBasicBlock::iterator MBBI_end = MBB.end();
-    //
-    // For Hexagon, we don't need the frame size.
-    //
-    MachineFrameInfo *MFI = MF.getFrameInfo();
-    int NumBytes = (int) MFI->getStackSize();
 
     const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-
+    // Handle EH_RETURN.
+    if (MBBI->getOpcode() == Hexagon::EH_RETURN_JMPR) {
+      assert(MBBI->getOperand(0).isReg() && "Offset should be in register!");
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::ADD_rr),
+              Hexagon::R29).addReg(Hexagon::R29).addReg(Hexagon::R28);
+      return;
+    }
     // Replace 'jumpr r31' instruction with dealloc_return for V4 and higher
     // versions.
     if (STI.hasV4TOps() && MBBI->getOpcode() == Hexagon::JMPret
                         && !DisableDeallocRet) {
-      // Remove jumpr node.
-      MBB.erase(MBBI);
+      // Check for RESTORE_DEALLOC_RET_JMP_V4 call. Don't emit an extra DEALLOC
+      // instruction if we encounter it.
+      MachineBasicBlock::iterator BeforeJMPR =
+        MBB.begin() == MBBI ? MBBI : prior(MBBI);
+      if (BeforeJMPR != MBBI &&
+          BeforeJMPR->getOpcode() == Hexagon::RESTORE_DEALLOC_RET_JMP_V4) {
+        // Remove the JMPR node.
+        MBB.erase(MBBI);
+        return;
+      }
+
       // Add dealloc_return.
-      BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4))
-        .addImm(NumBytes);
-    } else { // Add deallocframe for V2 and V3.
-      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME)).addImm(NumBytes);
+      MachineInstrBuilder MIB =
+        BuildMI(MBB, MBBI_end, dl, TII.get(Hexagon::DEALLOC_RET_V4));
+      // Transfer the function live-out registers.
+      MIB->copyImplicitOps(*MBB.getParent(), &*MBBI);
+      // Remove the JUMPR node.
+      MBB.erase(MBBI);
+    } else { // Add deallocframe for V2 and V3, and V4 tail calls.
+      // Check for RESTORE_DEALLOC_BEFORE_TAILCALL_V4. We don't need an extra
+      // DEALLOCFRAME instruction after it.
+      MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+      MachineBasicBlock::iterator I =
+        Term == MBB.begin() ?  MBB.end() : prior(Term);
+      if (I != MBB.end() &&
+          I->getOpcode() == Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4)
+        return;
+
+      BuildMI(MBB, MBBI, dl, TII.get(Hexagon::DEALLOCFRAME));
     }
   }
 }
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index d002788..52d5ab2 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -134,7 +134,7 @@ namespace {
     /// has a computable trip count and, if so, return a value that represents
     /// the trip count expression.
     CountValue *getLoopTripCount(MachineLoop *L,
-                                 SmallVector<MachineInstr*, 2> &OldInsts);
+                                 SmallVectorImpl<MachineInstr *> &OldInsts);
 
     /// \brief Return the expression that represents the number of times
     /// a loop iterates.  The function takes the operands that represent the
@@ -164,7 +164,7 @@ namespace {
 
     /// \brief Return true if the instruction is now dead.
     bool isDead(const MachineInstr *MI,
-                SmallVector<MachineInstr*, 1> &DeadPhis) const;
+                SmallVectorImpl<MachineInstr *> &DeadPhis) const;
 
     /// \brief Remove the instruction if it is now dead.
     void removeIfDead(MachineInstr *MI);
@@ -428,7 +428,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
 /// induction variable patterns that are used in the calculation for
 /// the number of time the loop is executed.
 CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
-                                SmallVector<MachineInstr*, 2> &OldInsts) {
+                                    SmallVectorImpl<MachineInstr *> &OldInsts) {
   MachineBasicBlock *TopMBB = L->getTopBlock();
   MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
   assert(PI != TopMBB->pred_end() &&
@@ -871,7 +871,7 @@ bool HexagonHardwareLoops::isInvalidLoopOperation(
 /// \brief - Return true if the loop contains an instruction that inhibits
 /// the use of the hardware loop function.
 bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const {
-  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
+  const std::vector<MachineBasicBlock *> &Blocks = L->getBlocks();
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
     MachineBasicBlock *MBB = Blocks[i];
     for (MachineBasicBlock::iterator
@@ -890,7 +890,7 @@ bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const {
 /// for inline asm, physical registers and instructions with side effects
 /// removed.
 bool HexagonHardwareLoops::isDead(const MachineInstr *MI,
-                             SmallVector<MachineInstr*, 1> &DeadPhis) const {
+                              SmallVectorImpl<MachineInstr *> &DeadPhis) const {
   // Examine each operand.
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 54ca2c9..5ae93284 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -50,15 +50,13 @@ class HexagonDAGToDAGISel : public SelectionDAGISel {
 
   // Keep a reference to HexagonTargetMachine.
   const HexagonTargetMachine& TM;
-  const HexagonInstrInfo *TII;
   DenseMap<const GlobalValue *, unsigned> GlobalAddressUseCountMap;
 public:
-  explicit HexagonDAGToDAGISel(const HexagonTargetMachine &targetmachine,
+  explicit HexagonDAGToDAGISel(HexagonTargetMachine &targetmachine,
                                CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(targetmachine, OptLevel),
       Subtarget(targetmachine.getSubtarget<HexagonSubtarget>()),
-      TM(targetmachine),
-      TII(static_cast<const HexagonInstrInfo*>(TM.getInstrInfo())) {
+      TM(targetmachine) {
     initializeHexagonDAGToDAGISelPass(*PassRegistry::getPassRegistry());
   }
   bool hasNumUsesBelowThresGA(SDNode *N) const;
@@ -92,14 +90,14 @@ public:
   bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset);
 
   SDNode *SelectLoad(SDNode *N);
-  SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl);
-  SDNode *SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl);
+  SDNode *SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl);
+  SDNode *SelectIndexedLoad(LoadSDNode *LD, SDLoc dl);
   SDNode *SelectIndexedLoadZeroExtend64(LoadSDNode *LD, unsigned Opcode,
-                                        DebugLoc dl);
+                                        SDLoc dl);
   SDNode *SelectIndexedLoadSignExtend64(LoadSDNode *LD, unsigned Opcode,
-                                        DebugLoc dl);
-  SDNode *SelectBaseOffsetStore(StoreSDNode *ST, DebugLoc dl);
-  SDNode *SelectIndexedStore(StoreSDNode *ST, DebugLoc dl);
+                                        SDLoc dl);
+  SDNode *SelectBaseOffsetStore(StoreSDNode *ST, SDLoc dl);
+  SDNode *SelectIndexedStore(StoreSDNode *ST, SDLoc dl);
   SDNode *SelectStore(SDNode *N);
   SDNode *SelectSHL(SDNode *N);
   SDNode *SelectSelect(SDNode *N);
@@ -180,7 +178,7 @@ inline SDValue XformUToUM1Imm(unsigned Imm) {
 /// createHexagonISelDag - This pass converts a legalized DAG into a
 /// Hexagon-specific DAG, ready for instruction scheduling.
 ///
-FunctionPass *llvm::createHexagonISelDag(const HexagonTargetMachine &TM,
+FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM,
                                          CodeGenOpt::Level OptLevel) {
   return new HexagonDAGToDAGISel(TM, OptLevel);
 }
@@ -385,7 +383,7 @@ static bool OffsetFitsS11(EVT MemType, int64_t Offset) {
 // lowering for GlobalAddress nodes has already turned it into a
 // CONST32.
 //
-SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
+SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, SDLoc dl) {
   SDValue Chain = LD->getChain();
   SDNode* Const32 = LD->getBasePtr().getNode();
   unsigned Opcode = 0;
@@ -396,7 +394,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
     EVT LoadedVT = LD->getMemoryVT();
     int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
     if (Offset != 0 && OffsetFitsS11(LoadedVT, Offset)) {
-      MVT PointerTy = TLI.getPointerTy();
+      MVT PointerTy = getTargetLowering()->getPointerTy();
       const GlobalValue* GV =
         cast<GlobalAddressSDNode>(Base)->getGlobal();
       SDValue TargAddr =
@@ -433,7 +431,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetLoad(LoadSDNode *LD, DebugLoc dl) {
 
 SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
                                                            unsigned Opcode,
-                                                           DebugLoc dl)
+                                                           SDLoc dl)
 {
   SDValue Chain = LD->getChain();
   EVT LoadedVT = LD->getMemoryVT();
@@ -444,8 +442,11 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
   SDValue N1 = LD->getOperand(1);
   SDValue CPTmpN1_0;
   SDValue CPTmpN1_1;
+
   if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
       N1.getNode()->getValueType(0) == MVT::i32) {
+    const HexagonInstrInfo *TII =
+      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
     if (TII->isValidAutoIncImm(LoadedVT, Val)) {
       SDValue TargetConst = CurDAG->getTargetConstant(Val, MVT::i32);
       SDNode *Result_1 = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::i32,
@@ -497,7 +498,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadSignExtend64(LoadSDNode *LD,
 
 SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
                                                            unsigned Opcode,
-                                                           DebugLoc dl)
+                                                           SDLoc dl)
 {
   SDValue Chain = LD->getChain();
   EVT LoadedVT = LD->getMemoryVT();
@@ -508,8 +509,11 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
   SDValue N1 = LD->getOperand(1);
   SDValue CPTmpN1_0;
   SDValue CPTmpN1_1;
+
   if (SelectADDRriS11_2(N1, CPTmpN1_0, CPTmpN1_1) &&
       N1.getNode()->getValueType(0) == MVT::i32) {
+    const HexagonInstrInfo *TII =
+      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
     if (TII->isValidAutoIncImm(LoadedVT, Val)) {
       SDValue TargetConstVal = CurDAG->getTargetConstant(Val, MVT::i32);
       SDValue TargetConst0 = CurDAG->getTargetConstant(0, MVT::i32);
@@ -572,7 +576,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoadZeroExtend64(LoadSDNode *LD,
 }
 
 
-SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
+SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, SDLoc dl) {
   SDValue Chain = LD->getChain();
   SDValue Base = LD->getBasePtr();
   SDValue Offset = LD->getOffset();
@@ -586,6 +590,8 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
   bool zextval = (LD->getExtensionType() == ISD::ZEXTLOAD);
 
   // Figure out the opcode.
+  const HexagonInstrInfo *TII =
+    static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
   if (LoadedVT == MVT::i64) {
     if (TII->isValidAutoIncImm(LoadedVT, Val))
       Opcode = Hexagon::POST_LDrid;
@@ -667,7 +673,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, DebugLoc dl) {
 
 SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
   SDNode *result;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::MemIndexedMode AM = LD->getAddressingMode();
 
@@ -682,7 +688,7 @@ SDNode *HexagonDAGToDAGISel::SelectLoad(SDNode *N) {
 }
 
 
-SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
+SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, SDLoc dl) {
   SDValue Chain = ST->getChain();
   SDValue Base = ST->getBasePtr();
   SDValue Offset = ST->getOffset();
@@ -694,6 +700,8 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
 
   // Offset value must be within representable range
   // and must have correct alignment properties.
+  const HexagonInstrInfo *TII =
+    static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
   if (TII->isValidAutoIncImm(StoredVT, Val)) {
     SDValue Ops[] = {Base, CurDAG->getTargetConstant(Val, MVT::i32), Value,
                      Chain};
@@ -751,7 +759,7 @@ SDNode *HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, DebugLoc dl) {
 
 
 SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
-                                                   DebugLoc dl) {
+                                                   SDLoc dl) {
   SDValue Chain = ST->getChain();
   SDNode* Const32 = ST->getBasePtr().getNode();
   SDValue Value = ST->getValue();
@@ -769,7 +777,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
       EVT StoredVT = ST->getMemoryVT();
       int64_t Offset = cast<GlobalAddressSDNode>(Base)->getOffset();
       if (Offset != 0 && OffsetFitsS11(StoredVT, Offset)) {
-        MVT PointerTy = TLI.getPointerTy();
+        MVT PointerTy = getTargetLowering()->getPointerTy();
         const GlobalValue* GV =
           cast<GlobalAddressSDNode>(Base)->getGlobal();
         SDValue TargAddr =
@@ -805,7 +813,7 @@ SDNode *HexagonDAGToDAGISel::SelectBaseOffsetStore(StoreSDNode *ST,
 
 
 SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   StoreSDNode *ST = cast<StoreSDNode>(N);
   ISD::MemIndexedMode AM = ST->getAddressingMode();
 
@@ -818,7 +826,7 @@ SDNode *HexagonDAGToDAGISel::SelectStore(SDNode *N) {
 }
 
 SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   //
   // %conv.i = sext i32 %tmp1 to i64
@@ -902,7 +910,7 @@ SDNode *HexagonDAGToDAGISel::SelectMul(SDNode *N) {
 
 
 SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() == ISD::SETCC) {
     SDValue N00 = N0.getOperand(0);
@@ -969,7 +977,7 @@ SDNode *HexagonDAGToDAGISel::SelectSelect(SDNode *N) {
 
 
 SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue Shift = N->getOperand(0);
 
   //
@@ -1082,7 +1090,7 @@ SDNode *HexagonDAGToDAGISel::SelectTruncate(SDNode *N) {
 
 
 SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   if (N->getValueType(0) == MVT::i32) {
     SDValue Shl_0 = N->getOperand(0);
     SDValue Shl_1 = N->getOperand(1);
@@ -1158,7 +1166,7 @@ SDNode *HexagonDAGToDAGISel::SelectSHL(SDNode *N) {
 // We want to preserve all the lower 8-bits and, not just 1 LSB bit.
 //
 SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDNode *IsIntrinsic = N->getOperand(0).getNode();
   if ((IsIntrinsic->getOpcode() == ISD::INTRINSIC_WO_CHAIN)) {
     unsigned ID =
@@ -1201,7 +1209,7 @@ SDNode *HexagonDAGToDAGISel::SelectZeroExtend(SDNode *N) {
 // and lowering to the actual intrinsic.
 //
 SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
   unsigned IntrinsicWithPred = doesIntrinsicContainPredicate(ID);
 
@@ -1209,6 +1217,8 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
   // as at least one of the operands.
   if (IntrinsicWithPred) {
     SmallVector<SDValue, 8> Ops;
+    const HexagonInstrInfo *TII =
+      static_cast<const HexagonInstrInfo*>(TM.getInstrInfo());
     const MCInstrDesc &MCID = TII->get(IntrinsicWithPred);
     const TargetRegisterInfo *TRI = TM.getRegisterInfo();
 
@@ -1251,7 +1261,7 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
 // Map floating point constant values.
 //
 SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
   APFloat APF = CN->getValueAPF();
   if (N->getValueType(0) == MVT::f32) {
@@ -1271,7 +1281,7 @@ SDNode *HexagonDAGToDAGISel::SelectConstantFP(SDNode *N) {
 // Map predicate true (encoded as -1 in LLVM) to a XOR.
 //
 SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   if (N->getValueType(0) == MVT::i1) {
     SDNode* Result;
     int32_t Val = cast<ConstantSDNode>(N)->getSExtValue();
@@ -1310,7 +1320,7 @@ SDNode *HexagonDAGToDAGISel::SelectConstant(SDNode *N) {
 // Map add followed by a asr -> asr +=.
 //
 SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   if (N->getValueType(0) != MVT::i32) {
     return SelectCode(N);
   }
@@ -1334,8 +1344,10 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
 
 
 SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
 
   switch (N->getOpcode()) {
@@ -1660,7 +1672,7 @@ bool HexagonDAGToDAGISel::foldGlobalAddressImpl(SDValue &N, SDValue &R,
                 !hasNumUsesBelowThresGA(GA))
             return false;
         R = CurDAG->getTargetGlobalAddress(GA->getGlobal(),
-                                          Const->getDebugLoc(),
+                                          SDLoc(Const),
                                           N.getValueType(),
                                           GA->getOffset() +
                                           (uint64_t)Const->getSExtValue());
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 0e5b8dc..1374179 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -39,13 +39,24 @@
 
 using namespace llvm;
 
-const unsigned Hexagon_MAX_RET_SIZE = 64;
-
 static cl::opt<bool>
 EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden,
                cl::desc("Control jump table emission on Hexagon target"));
 
-int NumNamedVarArgParams = -1;
+namespace {
+class HexagonCCState : public CCState {
+  int NumNamedVarArgParams;
+
+public:
+  HexagonCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
+                 const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
+                 LLVMContext &C, int NumNamedVarArgParams)
+      : CCState(CC, isVarArg, MF, TM, locs, C),
+        NumNamedVarArgParams(NumNamedVarArgParams) {}
+
+  int getNumNamedVarArgParams() const { return NumNamedVarArgParams; }
+};
+}
 
 // Implement calling convention for Hexagon.
 static bool
@@ -82,12 +93,13 @@ static bool
 CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
             MVT LocVT, CCValAssign::LocInfo LocInfo,
             ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  HexagonCCState &HState = static_cast<HexagonCCState &>(State);
 
   // NumNamedVarArgParams can not be zero for a VarArg function.
-  assert ( (NumNamedVarArgParams > 0) &&
-           "NumNamedVarArgParams is not bigger than zero.");
+  assert((HState.getNumNamedVarArgParams() > 0) &&
+         "NumNamedVarArgParams is not bigger than zero.");
 
-  if ( (int)ValNo < NumNamedVarArgParams ) {
+  if ((int)ValNo < HState.getNumNamedVarArgParams()) {
     // Deal with named arguments.
     return CC_Hexagon(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State);
   }
@@ -285,7 +297,7 @@ const {
 static SDValue
 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
-                          DebugLoc dl) {
+                          SDLoc dl) {
 
   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -302,7 +314,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
-                                   DebugLoc dl, SelectionDAG &DAG) const {
+                                   SDLoc dl, SelectionDAG &DAG) const {
 
   // CCValAssign - represent the assignment of the return value to locations.
   SmallVector<CCValAssign, 16> RVLocs;
@@ -351,7 +363,7 @@ HexagonTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const
                                        SmallVectorImpl<ISD::InputArg> &Ins,
-                                       DebugLoc dl, SelectionDAG &DAG,
+                                       SDLoc dl, SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals,
                                        const SmallVectorImpl<SDValue> &OutVals,
                                        SDValue Callee) const {
@@ -382,10 +394,10 @@ SDValue
 HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                  SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
@@ -394,13 +406,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   bool IsStructRet    = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
 
-  // Analyze operands of the call, assigning locations to each operand.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-
   // Check for varargs.
-  NumNamedVarArgParams = -1;
+  int NumNamedVarArgParams = -1;
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee))
   {
     const Function* CalleeFn = NULL;
@@ -417,6 +424,12 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     }
   }
 
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  HexagonCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                        getTargetMachine(), ArgLocs, *DAG.getContext(),
+                        NumNamedVarArgParams);
+
   if (NumNamedVarArgParams > 0)
     CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_VarArg);
   else
@@ -513,7 +526,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   if (!isTailCall)
     Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
-                                                        getPointerTy(), true));
+                                                        getPointerTy(), true),
+                                 dl);
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
@@ -588,7 +602,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Create the CALLSEQ_END node.
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag);
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
   InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
@@ -730,7 +744,7 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
   SDValue Chain = Op.getOperand(0);
   SDValue Table = Op.getOperand(1);
   SDValue Index = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
   unsigned JTI = JT->getIndex();
   MachineFunction &MF = DAG.getMachineFunction();
@@ -766,7 +780,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Size = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   unsigned SPReg = getStackPointerRegisterToSaveRestore();
 
@@ -812,7 +826,7 @@ HexagonTargetLowering::LowerFormalArguments(SDValue Chain,
                                             bool isVarArg,
                                             const
                                             SmallVectorImpl<ISD::InputArg> &Ins,
-                                            DebugLoc dl, SelectionDAG &DAG,
+                                            SDLoc dl, SelectionDAG &DAG,
                                             SmallVectorImpl<SDValue> &InVals)
 const {
 
@@ -925,7 +939,7 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>();
   SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  return DAG.getStore(Op.getOperand(0), Op.getDebugLoc(), Addr,
+  return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr,
                       Op.getOperand(1), MachinePointerInfo(SV), false,
                       false, 0);
 }
@@ -937,7 +951,7 @@ HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue CC = Op.getOperand(4);
   SDValue TrueVal = Op.getOperand(2);
   SDValue FalseVal = Op.getOperand(3);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDNode* OpNode = Op.getNode();
   EVT SVT = OpNode->getValueType(0);
 
@@ -948,8 +962,7 @@ HexagonTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 SDValue
 HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   EVT ValTy = Op.getValueType();
-
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   SDValue Res;
   if (CP->isMachineConstantPoolEntry())
@@ -969,7 +982,7 @@ HexagonTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
   MFI->setReturnAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   if (Depth) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -991,7 +1004,7 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   MFI->setFrameAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
                                          TRI->getFrameRegister(), VT);
@@ -1004,7 +1017,7 @@ HexagonTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue HexagonTargetLowering::LowerATOMIC_FENCE(SDValue Op,
                                                  SelectionDAG& DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   return DAG.getNode(HexagonISD::BARRIER, dl, MVT::Other, Op.getOperand(0));
 }
 
@@ -1014,7 +1027,7 @@ SDValue HexagonTargetLowering::LowerGLOBALADDRESS(SDValue Op,
   SDValue Result;
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
 
   const HexagonTargetObjectFile &TLOF =
@@ -1030,7 +1043,7 @@ SDValue
 HexagonTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   SDValue BA_SD =  DAG.getTargetBlockAddress(BA, MVT::i32);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   return DAG.getNode(HexagonISD::CONST32_GP, dl, getPointerTy(), BA_SD);
 }
 
@@ -1361,7 +1374,6 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
     // Increase jump tables cutover to 5, was 4.
     setMinimumJumpTableEntries(5);
 
-    setOperationAction(ISD::BR_CC, MVT::Other, Expand);
     setOperationAction(ISD::BR_CC, MVT::f32, Expand);
     setOperationAction(ISD::BR_CC, MVT::f64, Expand);
     setOperationAction(ISD::BR_CC, MVT::i1,  Expand);
@@ -1429,11 +1441,6 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
     setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
     setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
 
-    setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
-    setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
-    setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
-    setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
-
     setOperationAction(ISD::EH_RETURN,     MVT::Other, Custom);
 
     if (TM.getSubtargetImpl()->isSubtargetV2()) {
@@ -1510,12 +1517,25 @@ bool HexagonTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
   return ((VT1.getSimpleVT() == MVT::i64) && (VT2.getSimpleVT() == MVT::i32));
 }
 
+bool
+HexagonTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  // Assuming the caller does not have either a signext or zeroext modifier, and
+  // only one value is accepted, any reasonable truncation is allowed.
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  // FIXME: in principle up to 64-bit could be made safe, but it would be very
+  // fragile at the moment: any support for multiple value returns would be
+  // liable to disallow tail calls involving i64 -> iN truncation in many cases.
+  return Ty1->getPrimitiveSizeInBits() <= 32;
+}
+
 SDValue
 HexagonTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain     = Op.getOperand(0);
   SDValue Offset    = Op.getOperand(1);
   SDValue Handler   = Op.getOperand(2);
-  DebugLoc dl       = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Mark function as containing a call to EH_RETURN.
   HexagonMachineFunctionInfo *FuncInfo =
@@ -1591,11 +1611,11 @@ const {
 std::pair<unsigned, const TargetRegisterClass*>
 HexagonTargetLowering::getRegForInlineAsmConstraint(const
                                                     std::string &Constraint,
-                                                    EVT VT) const {
+                                                    MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'r':   // R0-R31
-       switch (VT.getSimpleVT().SimpleTy) {
+       switch (VT.SimpleTy) {
        default:
          llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
        case MVT::i32:
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index bb1acc1..73da226 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -95,6 +95,8 @@ namespace llvm {
     virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
     virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
 
+    virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
 
     virtual const char *getTargetNodeName(unsigned Opcode) const;
@@ -106,7 +108,7 @@ namespace llvm {
     SDValue LowerFormalArguments(SDValue Chain,
                                  CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::InputArg> &Ins,
-                                 DebugLoc dl, SelectionDAG &DAG,
+                                 SDLoc dl, SelectionDAG &DAG,
                                  SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -117,7 +119,7 @@ namespace llvm {
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
+                            SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals,
                             const SmallVectorImpl<SDValue> &OutVals,
                             SDValue Callee) const;
@@ -131,7 +133,7 @@ namespace llvm {
                         CallingConv::ID CallConv, bool isVarArg,
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         const SmallVectorImpl<SDValue> &OutVals,
-                        DebugLoc dl, SelectionDAG &DAG) const;
+                        SDLoc dl, SelectionDAG &DAG) const;
 
     virtual MachineBasicBlock
     *EmitInstrWithCustomInserter(MachineInstr *MI,
@@ -139,8 +141,11 @@ namespace llvm {
 
     SDValue  LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
     SDValue  LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-    virtual EVT getSetCCResultType(EVT VT) const {
-      return MVT::i1;
+    virtual EVT getSetCCResultType(LLVMContext &C, EVT VT) const {
+      if (!VT.isVector())
+        return MVT::i1;
+      else
+        return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
     }
 
     virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
@@ -150,7 +155,7 @@ namespace llvm {
 
     std::pair<unsigned, const TargetRegisterClass*>
     getRegForInlineAsmConstraint(const std::string &Constraint,
-                                 EVT VT) const;
+                                 MVT VT) const;
 
     // Intrinsics
     virtual SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op,
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index 587fa7d..d25bfa8 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -54,6 +54,7 @@ def AbsoluteSet    : AddrModeType<2>;  // Absolute set addressing mode
 def BaseImmOffset  : AddrModeType<3>;  // Indirect with offset
 def BaseLongOffset : AddrModeType<4>;  // Indirect with long offset
 def BaseRegOffset  : AddrModeType<5>;  // Indirect with register offset
+def PostInc        : AddrModeType<6>;  // Post increment addressing mode
 
 class MemAccessSize<bits<3> value> {
   bits<3> Value = value;
@@ -62,7 +63,7 @@ class MemAccessSize<bits<3> value> {
 def NoMemAccess      : MemAccessSize<0>;// Not a memory acces instruction.
 def ByteAccess       : MemAccessSize<1>;// Byte access instruction (memb).
 def HalfWordAccess   : MemAccessSize<2>;// Half word access instruction (memh).
-def WordAccess       : MemAccessSize<3>;// Word access instrution (memw).
+def WordAccess       : MemAccessSize<3>;// Word access instruction (memw).
 def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
 
 
@@ -157,6 +158,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
   string CextOpcode = "";
   string PredSense = "";
   string PNewValue = "";
+  string NValueST  = "";    // Set to "true" for new-value stores.
   string InputType = "";    // Input is "imm" or "reg" type.
   string isMEMri = "false"; // Set to "true" for load/store with MEMri operand.
   string isFloat = "false"; // Set to "true" for the floating-point load/store.
@@ -165,6 +167,7 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
   let PredSense = !if(isPredicated, !if(isPredicatedFalse, "false", "true"),
                                     "");
   let PNewValue = !if(isPredicatedNew, "new", "");
+  let NValueST = !if(isNVStore, "true", "false");
 
   // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
 }
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index f114170..6b97609 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -26,7 +26,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRMAP_INFO
 #include "HexagonGenInstrInfo.inc"
 #include "HexagonGenDFAPacketizer.inc"
@@ -55,10 +55,12 @@ const int Hexagon_MEMH_AUTOINC_MIN = -16;
 const int Hexagon_MEMB_AUTOINC_MAX = 7;
 const int Hexagon_MEMB_AUTOINC_MIN = -8;
 
+// Pin the vtable to this file.
+void HexagonInstrInfo::anchor() {}
 
 HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
   : HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
-    RI(ST, *this), Subtarget(ST) {
+    RI(ST), Subtarget(ST) {
 }
 
 
@@ -558,16 +560,6 @@ MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   return(0);
 }
 
-MachineInstr*
-HexagonInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                           int FrameIx, uint64_t Offset,
-                                           const MDNode *MDPtr,
-                                           DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Hexagon::DBG_VALUE))
-    .addImm(0).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
 unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
 
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
@@ -630,188 +622,6 @@ bool HexagonInstrInfo::isBranch (const MachineInstr *MI) const {
   return MI->getDesc().isBranch();
 }
 
-bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
-  switch (MI->getOpcode()) {
-    default: return false;
-    // Store Byte
-    case Hexagon::STrib_nv_V4:
-    case Hexagon::STrib_indexed_nv_V4:
-    case Hexagon::STrib_indexed_shl_nv_V4:
-    case Hexagon::STrib_shl_nv_V4:
-    case Hexagon::STb_GP_nv_V4:
-    case Hexagon::POST_STbri_nv_V4:
-    case Hexagon::STrib_cPt_nv_V4:
-    case Hexagon::STrib_cdnPt_nv_V4:
-    case Hexagon::STrib_cNotPt_nv_V4:
-    case Hexagon::STrib_cdnNotPt_nv_V4:
-    case Hexagon::STrib_indexed_cPt_nv_V4:
-    case Hexagon::STrib_indexed_cdnPt_nv_V4:
-    case Hexagon::STrib_indexed_cNotPt_nv_V4:
-    case Hexagon::STrib_indexed_cdnNotPt_nv_V4:
-    case Hexagon::STrib_indexed_shl_cPt_nv_V4:
-    case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
-    case Hexagon::STrib_indexed_shl_cNotPt_nv_V4:
-    case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
-    case Hexagon::POST_STbri_cPt_nv_V4:
-    case Hexagon::POST_STbri_cdnPt_nv_V4:
-    case Hexagon::POST_STbri_cNotPt_nv_V4:
-    case Hexagon::POST_STbri_cdnNotPt_nv_V4:
-    case Hexagon::STb_GP_cPt_nv_V4:
-    case Hexagon::STb_GP_cNotPt_nv_V4:
-    case Hexagon::STb_GP_cdnPt_nv_V4:
-    case Hexagon::STb_GP_cdnNotPt_nv_V4:
-    case Hexagon::STrib_abs_nv_V4:
-    case Hexagon::STrib_abs_cPt_nv_V4:
-    case Hexagon::STrib_abs_cdnPt_nv_V4:
-    case Hexagon::STrib_abs_cNotPt_nv_V4:
-    case Hexagon::STrib_abs_cdnNotPt_nv_V4:
-
-    // Store Halfword
-    case Hexagon::STrih_nv_V4:
-    case Hexagon::STrih_indexed_nv_V4:
-    case Hexagon::STrih_indexed_shl_nv_V4:
-    case Hexagon::STrih_shl_nv_V4:
-    case Hexagon::STh_GP_nv_V4:
-    case Hexagon::POST_SThri_nv_V4:
-    case Hexagon::STrih_cPt_nv_V4:
-    case Hexagon::STrih_cdnPt_nv_V4:
-    case Hexagon::STrih_cNotPt_nv_V4:
-    case Hexagon::STrih_cdnNotPt_nv_V4:
-    case Hexagon::STrih_indexed_cPt_nv_V4:
-    case Hexagon::STrih_indexed_cdnPt_nv_V4:
-    case Hexagon::STrih_indexed_cNotPt_nv_V4:
-    case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
-    case Hexagon::STrih_indexed_shl_cPt_nv_V4:
-    case Hexagon::STrih_indexed_shl_cdnPt_nv_V4:
-    case Hexagon::STrih_indexed_shl_cNotPt_nv_V4:
-    case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4:
-    case Hexagon::POST_SThri_cPt_nv_V4:
-    case Hexagon::POST_SThri_cdnPt_nv_V4:
-    case Hexagon::POST_SThri_cNotPt_nv_V4:
-    case Hexagon::POST_SThri_cdnNotPt_nv_V4:
-    case Hexagon::STh_GP_cPt_nv_V4:
-    case Hexagon::STh_GP_cNotPt_nv_V4:
-    case Hexagon::STh_GP_cdnPt_nv_V4:
-    case Hexagon::STh_GP_cdnNotPt_nv_V4:
-    case Hexagon::STrih_abs_nv_V4:
-    case Hexagon::STrih_abs_cPt_nv_V4:
-    case Hexagon::STrih_abs_cdnPt_nv_V4:
-    case Hexagon::STrih_abs_cNotPt_nv_V4:
-    case Hexagon::STrih_abs_cdnNotPt_nv_V4:
-
-    // Store Word
-    case Hexagon::STriw_nv_V4:
-    case Hexagon::STriw_indexed_nv_V4:
-    case Hexagon::STriw_indexed_shl_nv_V4:
-    case Hexagon::STriw_shl_nv_V4:
-    case Hexagon::STw_GP_nv_V4:
-    case Hexagon::POST_STwri_nv_V4:
-    case Hexagon::STriw_cPt_nv_V4:
-    case Hexagon::STriw_cdnPt_nv_V4:
-    case Hexagon::STriw_cNotPt_nv_V4:
-    case Hexagon::STriw_cdnNotPt_nv_V4:
-    case Hexagon::STriw_indexed_cPt_nv_V4:
-    case Hexagon::STriw_indexed_cdnPt_nv_V4:
-    case Hexagon::STriw_indexed_cNotPt_nv_V4:
-    case Hexagon::STriw_indexed_cdnNotPt_nv_V4:
-    case Hexagon::STriw_indexed_shl_cPt_nv_V4:
-    case Hexagon::STriw_indexed_shl_cdnPt_nv_V4:
-    case Hexagon::STriw_indexed_shl_cNotPt_nv_V4:
-    case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4:
-    case Hexagon::POST_STwri_cPt_nv_V4:
-    case Hexagon::POST_STwri_cdnPt_nv_V4:
-    case Hexagon::POST_STwri_cNotPt_nv_V4:
-    case Hexagon::POST_STwri_cdnNotPt_nv_V4:
-    case Hexagon::STw_GP_cPt_nv_V4:
-    case Hexagon::STw_GP_cNotPt_nv_V4:
-    case Hexagon::STw_GP_cdnPt_nv_V4:
-    case Hexagon::STw_GP_cdnNotPt_nv_V4:
-    case Hexagon::STriw_abs_nv_V4:
-    case Hexagon::STriw_abs_cPt_nv_V4:
-    case Hexagon::STriw_abs_cdnPt_nv_V4:
-    case Hexagon::STriw_abs_cNotPt_nv_V4:
-    case Hexagon::STriw_abs_cdnNotPt_nv_V4:
-      return true;
-  }
-}
-
-bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
-  switch (MI->getOpcode())
-  {
-    default: return false;
-    // Load Byte
-    case Hexagon::POST_LDrib:
-    case Hexagon::POST_LDrib_cPt:
-    case Hexagon::POST_LDrib_cNotPt:
-    case Hexagon::POST_LDrib_cdnPt_V4:
-    case Hexagon::POST_LDrib_cdnNotPt_V4:
-
-    // Load unsigned byte
-    case Hexagon::POST_LDriub:
-    case Hexagon::POST_LDriub_cPt:
-    case Hexagon::POST_LDriub_cNotPt:
-    case Hexagon::POST_LDriub_cdnPt_V4:
-    case Hexagon::POST_LDriub_cdnNotPt_V4:
-
-    // Load halfword
-    case Hexagon::POST_LDrih:
-    case Hexagon::POST_LDrih_cPt:
-    case Hexagon::POST_LDrih_cNotPt:
-    case Hexagon::POST_LDrih_cdnPt_V4:
-    case Hexagon::POST_LDrih_cdnNotPt_V4:
-
-    // Load unsigned halfword
-    case Hexagon::POST_LDriuh:
-    case Hexagon::POST_LDriuh_cPt:
-    case Hexagon::POST_LDriuh_cNotPt:
-    case Hexagon::POST_LDriuh_cdnPt_V4:
-    case Hexagon::POST_LDriuh_cdnNotPt_V4:
-
-    // Load word
-    case Hexagon::POST_LDriw:
-    case Hexagon::POST_LDriw_cPt:
-    case Hexagon::POST_LDriw_cNotPt:
-    case Hexagon::POST_LDriw_cdnPt_V4:
-    case Hexagon::POST_LDriw_cdnNotPt_V4:
-
-    // Load double word
-    case Hexagon::POST_LDrid:
-    case Hexagon::POST_LDrid_cPt:
-    case Hexagon::POST_LDrid_cNotPt:
-    case Hexagon::POST_LDrid_cdnPt_V4:
-    case Hexagon::POST_LDrid_cdnNotPt_V4:
-
-    // Store byte
-    case Hexagon::POST_STbri:
-    case Hexagon::POST_STbri_cPt:
-    case Hexagon::POST_STbri_cNotPt:
-    case Hexagon::POST_STbri_cdnPt_V4:
-    case Hexagon::POST_STbri_cdnNotPt_V4:
-
-    // Store halfword
-    case Hexagon::POST_SThri:
-    case Hexagon::POST_SThri_cPt:
-    case Hexagon::POST_SThri_cNotPt:
-    case Hexagon::POST_SThri_cdnPt_V4:
-    case Hexagon::POST_SThri_cdnNotPt_V4:
-
-    // Store word
-    case Hexagon::POST_STwri:
-    case Hexagon::POST_STwri_cPt:
-    case Hexagon::POST_STwri_cNotPt:
-    case Hexagon::POST_STwri_cdnPt_V4:
-    case Hexagon::POST_STwri_cdnNotPt_V4:
-
-    // Store double word
-    case Hexagon::POST_STdri:
-    case Hexagon::POST_STdri_cPt:
-    case Hexagon::POST_STdri_cNotPt:
-    case Hexagon::POST_STdri_cdnPt_V4:
-    case Hexagon::POST_STdri_cdnNotPt_V4:
-      return true;
-  }
-}
-
 bool HexagonInstrInfo::isNewValueInst(const MachineInstr *MI) const {
   if (isNewValueJump(MI))
     return true;
@@ -917,19 +727,6 @@ bool HexagonInstrInfo::isPredicable(MachineInstr *MI) const {
 //  cPt    ---> cNotPt
 //  cNotPt ---> cPt
 //
-// however, these inversiones are NOT included:
-//
-//  cdnPt      -X-> cdnNotPt
-//  cdnNotPt   -X-> cdnPt
-//  cPt_nv     -X-> cNotPt_nv (new value stores)
-//  cNotPt_nv  -X-> cPt_nv    (new value stores)
-//
-// because only the following transformations are allowed:
-//
-//  cNotPt  ---> cdnNotPt
-//  cPt     ---> cdnPt
-//  cNotPt  ---> cNotPt_nv
-//  cPt     ---> cPt_nv
 unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
   int InvPredOpcode;
   InvPredOpcode = isPredicatedTrue(Opc) ? Hexagon::getFalsePredOpcode(Opc)
@@ -939,332 +736,12 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
 
   switch(Opc) {
     default: llvm_unreachable("Unexpected predicated instruction");
-    case Hexagon::TFR_cPt:
-      return Hexagon::TFR_cNotPt;
-    case Hexagon::TFR_cNotPt:
-      return Hexagon::TFR_cPt;
-
-    case Hexagon::TFRI_cPt:
-      return Hexagon::TFRI_cNotPt;
-    case Hexagon::TFRI_cNotPt:
-      return Hexagon::TFRI_cPt;
-
-    case Hexagon::JMP_t:
-      return Hexagon::JMP_f;
-    case Hexagon::JMP_f:
-      return Hexagon::JMP_t;
-
-    case Hexagon::ADD_ri_cPt:
-      return Hexagon::ADD_ri_cNotPt;
-    case Hexagon::ADD_ri_cNotPt:
-      return Hexagon::ADD_ri_cPt;
-
-    case Hexagon::ADD_rr_cPt:
-      return Hexagon::ADD_rr_cNotPt;
-    case Hexagon::ADD_rr_cNotPt:
-      return Hexagon::ADD_rr_cPt;
-
-    case Hexagon::XOR_rr_cPt:
-      return Hexagon::XOR_rr_cNotPt;
-    case Hexagon::XOR_rr_cNotPt:
-      return Hexagon::XOR_rr_cPt;
-
-    case Hexagon::AND_rr_cPt:
-      return Hexagon::AND_rr_cNotPt;
-    case Hexagon::AND_rr_cNotPt:
-      return Hexagon::AND_rr_cPt;
-
-    case Hexagon::OR_rr_cPt:
-      return Hexagon::OR_rr_cNotPt;
-    case Hexagon::OR_rr_cNotPt:
-      return Hexagon::OR_rr_cPt;
-
-    case Hexagon::SUB_rr_cPt:
-      return Hexagon::SUB_rr_cNotPt;
-    case Hexagon::SUB_rr_cNotPt:
-      return Hexagon::SUB_rr_cPt;
-
     case Hexagon::COMBINE_rr_cPt:
       return Hexagon::COMBINE_rr_cNotPt;
     case Hexagon::COMBINE_rr_cNotPt:
       return Hexagon::COMBINE_rr_cPt;
 
-    case Hexagon::ASLH_cPt_V4:
-      return Hexagon::ASLH_cNotPt_V4;
-    case Hexagon::ASLH_cNotPt_V4:
-      return Hexagon::ASLH_cPt_V4;
-
-    case Hexagon::ASRH_cPt_V4:
-      return Hexagon::ASRH_cNotPt_V4;
-    case Hexagon::ASRH_cNotPt_V4:
-      return Hexagon::ASRH_cPt_V4;
-
-    case Hexagon::SXTB_cPt_V4:
-      return Hexagon::SXTB_cNotPt_V4;
-    case Hexagon::SXTB_cNotPt_V4:
-      return Hexagon::SXTB_cPt_V4;
-
-    case Hexagon::SXTH_cPt_V4:
-      return Hexagon::SXTH_cNotPt_V4;
-    case Hexagon::SXTH_cNotPt_V4:
-      return Hexagon::SXTH_cPt_V4;
-
-    case Hexagon::ZXTB_cPt_V4:
-      return Hexagon::ZXTB_cNotPt_V4;
-    case Hexagon::ZXTB_cNotPt_V4:
-      return Hexagon::ZXTB_cPt_V4;
-
-    case Hexagon::ZXTH_cPt_V4:
-      return Hexagon::ZXTH_cNotPt_V4;
-    case Hexagon::ZXTH_cNotPt_V4:
-      return Hexagon::ZXTH_cPt_V4;
-
-
-    case Hexagon::JMPR_t:
-      return Hexagon::JMPR_f;
-    case Hexagon::JMPR_f:
-      return Hexagon::JMPR_t;
-
-  // V4 indexed+scaled load.
-    case Hexagon::LDrid_indexed_shl_cPt_V4:
-      return Hexagon::LDrid_indexed_shl_cNotPt_V4;
-    case Hexagon::LDrid_indexed_shl_cNotPt_V4:
-      return Hexagon::LDrid_indexed_shl_cPt_V4;
-
-    case Hexagon::LDrib_indexed_shl_cPt_V4:
-      return Hexagon::LDrib_indexed_shl_cNotPt_V4;
-    case Hexagon::LDrib_indexed_shl_cNotPt_V4:
-      return Hexagon::LDrib_indexed_shl_cPt_V4;
-
-    case Hexagon::LDriub_indexed_shl_cPt_V4:
-      return Hexagon::LDriub_indexed_shl_cNotPt_V4;
-    case Hexagon::LDriub_indexed_shl_cNotPt_V4:
-      return Hexagon::LDriub_indexed_shl_cPt_V4;
-
-    case Hexagon::LDrih_indexed_shl_cPt_V4:
-      return Hexagon::LDrih_indexed_shl_cNotPt_V4;
-    case Hexagon::LDrih_indexed_shl_cNotPt_V4:
-      return Hexagon::LDrih_indexed_shl_cPt_V4;
-
-    case Hexagon::LDriuh_indexed_shl_cPt_V4:
-      return Hexagon::LDriuh_indexed_shl_cNotPt_V4;
-    case Hexagon::LDriuh_indexed_shl_cNotPt_V4:
-      return Hexagon::LDriuh_indexed_shl_cPt_V4;
-
-    case Hexagon::LDriw_indexed_shl_cPt_V4:
-      return Hexagon::LDriw_indexed_shl_cNotPt_V4;
-    case Hexagon::LDriw_indexed_shl_cNotPt_V4:
-      return Hexagon::LDriw_indexed_shl_cPt_V4;
-
-    // Byte.
-    case Hexagon::POST_STbri_cPt:
-      return Hexagon::POST_STbri_cNotPt;
-    case Hexagon::POST_STbri_cNotPt:
-      return Hexagon::POST_STbri_cPt;
-
-    case Hexagon::STrib_cPt:
-      return Hexagon::STrib_cNotPt;
-    case Hexagon::STrib_cNotPt:
-      return Hexagon::STrib_cPt;
-
-    case Hexagon::STrib_indexed_cPt:
-      return Hexagon::STrib_indexed_cNotPt;
-    case Hexagon::STrib_indexed_cNotPt:
-      return Hexagon::STrib_indexed_cPt;
-
-    case Hexagon::STrib_imm_cPt_V4:
-      return Hexagon::STrib_imm_cNotPt_V4;
-    case Hexagon::STrib_imm_cNotPt_V4:
-      return Hexagon::STrib_imm_cPt_V4;
-
-    case Hexagon::STrib_indexed_shl_cPt_V4:
-      return Hexagon::STrib_indexed_shl_cNotPt_V4;
-    case Hexagon::STrib_indexed_shl_cNotPt_V4:
-      return Hexagon::STrib_indexed_shl_cPt_V4;
-
-  // Halfword.
-    case Hexagon::POST_SThri_cPt:
-      return Hexagon::POST_SThri_cNotPt;
-    case Hexagon::POST_SThri_cNotPt:
-      return Hexagon::POST_SThri_cPt;
-
-    case Hexagon::STrih_cPt:
-      return Hexagon::STrih_cNotPt;
-    case Hexagon::STrih_cNotPt:
-      return Hexagon::STrih_cPt;
-
-    case Hexagon::STrih_indexed_cPt:
-      return Hexagon::STrih_indexed_cNotPt;
-    case Hexagon::STrih_indexed_cNotPt:
-      return Hexagon::STrih_indexed_cPt;
-
-    case Hexagon::STrih_imm_cPt_V4:
-      return Hexagon::STrih_imm_cNotPt_V4;
-    case Hexagon::STrih_imm_cNotPt_V4:
-      return Hexagon::STrih_imm_cPt_V4;
-
-    case Hexagon::STrih_indexed_shl_cPt_V4:
-      return Hexagon::STrih_indexed_shl_cNotPt_V4;
-    case Hexagon::STrih_indexed_shl_cNotPt_V4:
-      return Hexagon::STrih_indexed_shl_cPt_V4;
-
-  // Word.
-    case Hexagon::POST_STwri_cPt:
-      return Hexagon::POST_STwri_cNotPt;
-    case Hexagon::POST_STwri_cNotPt:
-      return Hexagon::POST_STwri_cPt;
-
-    case Hexagon::STriw_cPt:
-      return Hexagon::STriw_cNotPt;
-    case Hexagon::STriw_cNotPt:
-      return Hexagon::STriw_cPt;
-
-    case Hexagon::STriw_indexed_cPt:
-      return Hexagon::STriw_indexed_cNotPt;
-    case Hexagon::STriw_indexed_cNotPt:
-      return Hexagon::STriw_indexed_cPt;
-
-    case Hexagon::STriw_indexed_shl_cPt_V4:
-      return Hexagon::STriw_indexed_shl_cNotPt_V4;
-    case Hexagon::STriw_indexed_shl_cNotPt_V4:
-      return Hexagon::STriw_indexed_shl_cPt_V4;
-
-    case Hexagon::STriw_imm_cPt_V4:
-      return Hexagon::STriw_imm_cNotPt_V4;
-    case Hexagon::STriw_imm_cNotPt_V4:
-      return Hexagon::STriw_imm_cPt_V4;
-
-  // Double word.
-    case Hexagon::POST_STdri_cPt:
-      return Hexagon::POST_STdri_cNotPt;
-    case Hexagon::POST_STdri_cNotPt:
-      return Hexagon::POST_STdri_cPt;
-
-    case Hexagon::STrid_cPt:
-      return Hexagon::STrid_cNotPt;
-    case Hexagon::STrid_cNotPt:
-      return Hexagon::STrid_cPt;
-
-    case Hexagon::STrid_indexed_cPt:
-      return Hexagon::STrid_indexed_cNotPt;
-    case Hexagon::STrid_indexed_cNotPt:
-      return Hexagon::STrid_indexed_cPt;
-
-    case Hexagon::STrid_indexed_shl_cPt_V4:
-      return Hexagon::STrid_indexed_shl_cNotPt_V4;
-    case Hexagon::STrid_indexed_shl_cNotPt_V4:
-      return Hexagon::STrid_indexed_shl_cPt_V4;
-
-    // V4 Store to global address.
-    case Hexagon::STd_GP_cPt_V4:
-      return Hexagon::STd_GP_cNotPt_V4;
-    case Hexagon::STd_GP_cNotPt_V4:
-      return Hexagon::STd_GP_cPt_V4;
-
-    case Hexagon::STb_GP_cPt_V4:
-      return Hexagon::STb_GP_cNotPt_V4;
-    case Hexagon::STb_GP_cNotPt_V4:
-      return Hexagon::STb_GP_cPt_V4;
-
-    case Hexagon::STh_GP_cPt_V4:
-      return Hexagon::STh_GP_cNotPt_V4;
-    case Hexagon::STh_GP_cNotPt_V4:
-      return Hexagon::STh_GP_cPt_V4;
-
-    case Hexagon::STw_GP_cPt_V4:
-      return Hexagon::STw_GP_cNotPt_V4;
-    case Hexagon::STw_GP_cNotPt_V4:
-      return Hexagon::STw_GP_cPt_V4;
-
-  // Load.
-    case Hexagon::LDrid_cPt:
-      return Hexagon::LDrid_cNotPt;
-    case Hexagon::LDrid_cNotPt:
-      return Hexagon::LDrid_cPt;
-
-    case Hexagon::LDriw_cPt:
-      return Hexagon::LDriw_cNotPt;
-    case Hexagon::LDriw_cNotPt:
-      return Hexagon::LDriw_cPt;
-
-    case Hexagon::LDrih_cPt:
-      return Hexagon::LDrih_cNotPt;
-    case Hexagon::LDrih_cNotPt:
-      return Hexagon::LDrih_cPt;
-
-    case Hexagon::LDriuh_cPt:
-      return Hexagon::LDriuh_cNotPt;
-    case Hexagon::LDriuh_cNotPt:
-      return Hexagon::LDriuh_cPt;
-
-    case Hexagon::LDrib_cPt:
-      return Hexagon::LDrib_cNotPt;
-    case Hexagon::LDrib_cNotPt:
-      return Hexagon::LDrib_cPt;
-
-    case Hexagon::LDriub_cPt:
-      return Hexagon::LDriub_cNotPt;
-    case Hexagon::LDriub_cNotPt:
-      return Hexagon::LDriub_cPt;
-
- // Load Indexed.
-    case Hexagon::LDrid_indexed_cPt:
-      return Hexagon::LDrid_indexed_cNotPt;
-    case Hexagon::LDrid_indexed_cNotPt:
-      return Hexagon::LDrid_indexed_cPt;
-
-    case Hexagon::LDriw_indexed_cPt:
-      return Hexagon::LDriw_indexed_cNotPt;
-    case Hexagon::LDriw_indexed_cNotPt:
-      return Hexagon::LDriw_indexed_cPt;
-
-    case Hexagon::LDrih_indexed_cPt:
-      return Hexagon::LDrih_indexed_cNotPt;
-    case Hexagon::LDrih_indexed_cNotPt:
-      return Hexagon::LDrih_indexed_cPt;
-
-    case Hexagon::LDriuh_indexed_cPt:
-      return Hexagon::LDriuh_indexed_cNotPt;
-    case Hexagon::LDriuh_indexed_cNotPt:
-      return Hexagon::LDriuh_indexed_cPt;
-
-    case Hexagon::LDrib_indexed_cPt:
-      return Hexagon::LDrib_indexed_cNotPt;
-    case Hexagon::LDrib_indexed_cNotPt:
-      return Hexagon::LDrib_indexed_cPt;
-
-    case Hexagon::LDriub_indexed_cPt:
-      return Hexagon::LDriub_indexed_cNotPt;
-    case Hexagon::LDriub_indexed_cNotPt:
-      return Hexagon::LDriub_indexed_cPt;
-
-  // Post Inc Load.
-    case Hexagon::POST_LDrid_cPt:
-      return Hexagon::POST_LDrid_cNotPt;
-    case Hexagon::POST_LDriw_cNotPt:
-      return Hexagon::POST_LDriw_cPt;
-
-    case Hexagon::POST_LDrih_cPt:
-      return Hexagon::POST_LDrih_cNotPt;
-    case Hexagon::POST_LDrih_cNotPt:
-      return Hexagon::POST_LDrih_cPt;
-
-    case Hexagon::POST_LDriuh_cPt:
-      return Hexagon::POST_LDriuh_cNotPt;
-    case Hexagon::POST_LDriuh_cNotPt:
-      return Hexagon::POST_LDriuh_cPt;
-
-    case Hexagon::POST_LDrib_cPt:
-      return Hexagon::POST_LDrib_cNotPt;
-    case Hexagon::POST_LDrib_cNotPt:
-      return Hexagon::POST_LDrib_cPt;
-
-    case Hexagon::POST_LDriub_cPt:
-      return Hexagon::POST_LDriub_cNotPt;
-    case Hexagon::POST_LDriub_cNotPt:
-      return Hexagon::POST_LDriub_cPt;
-
-  // Dealloc_return.
+      // Dealloc_return.
     case Hexagon::DEALLOC_RET_cPt_V4:
       return Hexagon::DEALLOC_RET_cNotPt_V4;
     case Hexagon::DEALLOC_RET_cNotPt_V4:
@@ -1272,6 +749,18 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
   }
 }
 
+// New Value Store instructions.
+bool HexagonInstrInfo::isNewValueStore(const MachineInstr *MI) const {
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+}
+
+bool HexagonInstrInfo::isNewValueStore(unsigned Opcode) const {
+  const uint64_t F = get(Opcode).TSFlags;
+
+  return ((F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask);
+}
 
 int HexagonInstrInfo::
 getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
@@ -1285,218 +774,21 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
   // This switch case will be removed once all the instructions have been
   // modified to use relation maps.
   switch(Opc) {
-  case Hexagon::TFR:
-    return !invertPredicate ? Hexagon::TFR_cPt :
-                              Hexagon::TFR_cNotPt;
   case Hexagon::TFRI_f:
     return !invertPredicate ? Hexagon::TFRI_cPt_f :
                               Hexagon::TFRI_cNotPt_f;
-  case Hexagon::TFRI:
-    return !invertPredicate ? Hexagon::TFRI_cPt :
-                              Hexagon::TFRI_cNotPt;
-  case Hexagon::JMP:
-    return !invertPredicate ? Hexagon::JMP_t :
-                              Hexagon::JMP_f;
-
   case Hexagon::COMBINE_rr:
     return !invertPredicate ? Hexagon::COMBINE_rr_cPt :
                               Hexagon::COMBINE_rr_cNotPt;
-  case Hexagon::ASLH:
-    return !invertPredicate ? Hexagon::ASLH_cPt_V4 :
-                              Hexagon::ASLH_cNotPt_V4;
-  case Hexagon::ASRH:
-    return !invertPredicate ? Hexagon::ASRH_cPt_V4 :
-                              Hexagon::ASRH_cNotPt_V4;
-  case Hexagon::SXTB:
-    return !invertPredicate ? Hexagon::SXTB_cPt_V4 :
-                              Hexagon::SXTB_cNotPt_V4;
-  case Hexagon::SXTH:
-    return !invertPredicate ? Hexagon::SXTH_cPt_V4 :
-                              Hexagon::SXTH_cNotPt_V4;
-  case Hexagon::ZXTB:
-    return !invertPredicate ? Hexagon::ZXTB_cPt_V4 :
-                              Hexagon::ZXTB_cNotPt_V4;
-  case Hexagon::ZXTH:
-    return !invertPredicate ? Hexagon::ZXTH_cPt_V4 :
-                              Hexagon::ZXTH_cNotPt_V4;
-
-  case Hexagon::JMPR:
-    return !invertPredicate ? Hexagon::JMPR_t :
-                              Hexagon::JMPR_f;
-
-  // V4 indexed+scaled load.
-  case Hexagon::LDrid_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::LDrid_indexed_shl_cPt_V4 :
-                              Hexagon::LDrid_indexed_shl_cNotPt_V4;
-  case Hexagon::LDrib_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::LDrib_indexed_shl_cPt_V4 :
-                              Hexagon::LDrib_indexed_shl_cNotPt_V4;
-  case Hexagon::LDriub_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::LDriub_indexed_shl_cPt_V4 :
-                              Hexagon::LDriub_indexed_shl_cNotPt_V4;
-  case Hexagon::LDrih_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::LDrih_indexed_shl_cPt_V4 :
-                              Hexagon::LDrih_indexed_shl_cNotPt_V4;
-  case Hexagon::LDriuh_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::LDriuh_indexed_shl_cPt_V4 :
-                              Hexagon::LDriuh_indexed_shl_cNotPt_V4;
-  case Hexagon::LDriw_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::LDriw_indexed_shl_cPt_V4 :
-                              Hexagon::LDriw_indexed_shl_cNotPt_V4;
-
-  // V4 Load from global address
-  case Hexagon::LDd_GP_V4:
-    return !invertPredicate ? Hexagon::LDd_GP_cPt_V4 :
-                              Hexagon::LDd_GP_cNotPt_V4;
-  case Hexagon::LDb_GP_V4:
-    return !invertPredicate ? Hexagon::LDb_GP_cPt_V4 :
-                              Hexagon::LDb_GP_cNotPt_V4;
-  case Hexagon::LDub_GP_V4:
-    return !invertPredicate ? Hexagon::LDub_GP_cPt_V4 :
-                              Hexagon::LDub_GP_cNotPt_V4;
-  case Hexagon::LDh_GP_V4:
-    return !invertPredicate ? Hexagon::LDh_GP_cPt_V4 :
-                              Hexagon::LDh_GP_cNotPt_V4;
-  case Hexagon::LDuh_GP_V4:
-    return !invertPredicate ? Hexagon::LDuh_GP_cPt_V4 :
-                              Hexagon::LDuh_GP_cNotPt_V4;
-  case Hexagon::LDw_GP_V4:
-    return !invertPredicate ? Hexagon::LDw_GP_cPt_V4 :
-                              Hexagon::LDw_GP_cNotPt_V4;
-
-    // Byte.
-  case Hexagon::POST_STbri:
-    return !invertPredicate ? Hexagon::POST_STbri_cPt :
-                              Hexagon::POST_STbri_cNotPt;
-  case Hexagon::STrib:
-    return !invertPredicate ? Hexagon::STrib_cPt :
-                              Hexagon::STrib_cNotPt;
-  case Hexagon::STrib_indexed:
-    return !invertPredicate ? Hexagon::STrib_indexed_cPt :
-                              Hexagon::STrib_indexed_cNotPt;
-  case Hexagon::STrib_imm_V4:
-    return !invertPredicate ? Hexagon::STrib_imm_cPt_V4 :
-                              Hexagon::STrib_imm_cNotPt_V4;
-  case Hexagon::STrib_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::STrib_indexed_shl_cPt_V4 :
-                              Hexagon::STrib_indexed_shl_cNotPt_V4;
-  // Halfword.
-  case Hexagon::POST_SThri:
-    return !invertPredicate ? Hexagon::POST_SThri_cPt :
-                              Hexagon::POST_SThri_cNotPt;
-  case Hexagon::STrih:
-    return !invertPredicate ? Hexagon::STrih_cPt :
-                              Hexagon::STrih_cNotPt;
-  case Hexagon::STrih_indexed:
-    return !invertPredicate ? Hexagon::STrih_indexed_cPt :
-                              Hexagon::STrih_indexed_cNotPt;
-  case Hexagon::STrih_imm_V4:
-    return !invertPredicate ? Hexagon::STrih_imm_cPt_V4 :
-                              Hexagon::STrih_imm_cNotPt_V4;
-  case Hexagon::STrih_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::STrih_indexed_shl_cPt_V4 :
-                              Hexagon::STrih_indexed_shl_cNotPt_V4;
+
   // Word.
-  case Hexagon::POST_STwri:
-    return !invertPredicate ? Hexagon::POST_STwri_cPt :
-                              Hexagon::POST_STwri_cNotPt;
-  case Hexagon::STriw:
+  case Hexagon::STriw_f:
     return !invertPredicate ? Hexagon::STriw_cPt :
                               Hexagon::STriw_cNotPt;
-  case Hexagon::STriw_indexed:
+  case Hexagon::STriw_indexed_f:
     return !invertPredicate ? Hexagon::STriw_indexed_cPt :
                               Hexagon::STriw_indexed_cNotPt;
-  case Hexagon::STriw_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::STriw_indexed_shl_cPt_V4 :
-                              Hexagon::STriw_indexed_shl_cNotPt_V4;
-  case Hexagon::STriw_imm_V4:
-    return !invertPredicate ? Hexagon::STriw_imm_cPt_V4 :
-                              Hexagon::STriw_imm_cNotPt_V4;
-  // Double word.
-  case Hexagon::POST_STdri:
-    return !invertPredicate ? Hexagon::POST_STdri_cPt :
-                              Hexagon::POST_STdri_cNotPt;
-  case Hexagon::STrid:
-    return !invertPredicate ? Hexagon::STrid_cPt :
-                              Hexagon::STrid_cNotPt;
-  case Hexagon::STrid_indexed:
-    return !invertPredicate ? Hexagon::STrid_indexed_cPt :
-                              Hexagon::STrid_indexed_cNotPt;
-  case Hexagon::STrid_indexed_shl_V4:
-    return !invertPredicate ? Hexagon::STrid_indexed_shl_cPt_V4 :
-                              Hexagon::STrid_indexed_shl_cNotPt_V4;
-
-  // V4 Store to global address
-  case Hexagon::STd_GP_V4:
-    return !invertPredicate ? Hexagon::STd_GP_cPt_V4 :
-                              Hexagon::STd_GP_cNotPt_V4;
-  case Hexagon::STb_GP_V4:
-    return !invertPredicate ? Hexagon::STb_GP_cPt_V4 :
-                              Hexagon::STb_GP_cNotPt_V4;
-  case Hexagon::STh_GP_V4:
-    return !invertPredicate ? Hexagon::STh_GP_cPt_V4 :
-                              Hexagon::STh_GP_cNotPt_V4;
-  case Hexagon::STw_GP_V4:
-    return !invertPredicate ? Hexagon::STw_GP_cPt_V4 :
-                              Hexagon::STw_GP_cNotPt_V4;
-
-  // Load.
-  case Hexagon::LDrid:
-    return !invertPredicate ? Hexagon::LDrid_cPt :
-                              Hexagon::LDrid_cNotPt;
-  case Hexagon::LDriw:
-    return !invertPredicate ? Hexagon::LDriw_cPt :
-                              Hexagon::LDriw_cNotPt;
-  case Hexagon::LDrih:
-    return !invertPredicate ? Hexagon::LDrih_cPt :
-                              Hexagon::LDrih_cNotPt;
-  case Hexagon::LDriuh:
-    return !invertPredicate ? Hexagon::LDriuh_cPt :
-                              Hexagon::LDriuh_cNotPt;
-  case Hexagon::LDrib:
-    return !invertPredicate ? Hexagon::LDrib_cPt :
-                              Hexagon::LDrib_cNotPt;
-  case Hexagon::LDriub:
-    return !invertPredicate ? Hexagon::LDriub_cPt :
-                              Hexagon::LDriub_cNotPt;
- // Load Indexed.
-  case Hexagon::LDrid_indexed:
-    return !invertPredicate ? Hexagon::LDrid_indexed_cPt :
-                              Hexagon::LDrid_indexed_cNotPt;
-  case Hexagon::LDriw_indexed:
-    return !invertPredicate ? Hexagon::LDriw_indexed_cPt :
-                              Hexagon::LDriw_indexed_cNotPt;
-  case Hexagon::LDrih_indexed:
-    return !invertPredicate ? Hexagon::LDrih_indexed_cPt :
-                              Hexagon::LDrih_indexed_cNotPt;
-  case Hexagon::LDriuh_indexed:
-    return !invertPredicate ? Hexagon::LDriuh_indexed_cPt :
-                              Hexagon::LDriuh_indexed_cNotPt;
-  case Hexagon::LDrib_indexed:
-    return !invertPredicate ? Hexagon::LDrib_indexed_cPt :
-                              Hexagon::LDrib_indexed_cNotPt;
-  case Hexagon::LDriub_indexed:
-    return !invertPredicate ? Hexagon::LDriub_indexed_cPt :
-                              Hexagon::LDriub_indexed_cNotPt;
-  // Post Increment Load.
-  case Hexagon::POST_LDrid:
-    return !invertPredicate ? Hexagon::POST_LDrid_cPt :
-                              Hexagon::POST_LDrid_cNotPt;
-  case Hexagon::POST_LDriw:
-    return !invertPredicate ? Hexagon::POST_LDriw_cPt :
-                              Hexagon::POST_LDriw_cNotPt;
-  case Hexagon::POST_LDrih:
-    return !invertPredicate ? Hexagon::POST_LDrih_cPt :
-                              Hexagon::POST_LDrih_cNotPt;
-  case Hexagon::POST_LDriuh:
-    return !invertPredicate ? Hexagon::POST_LDriuh_cPt :
-                              Hexagon::POST_LDriuh_cNotPt;
-  case Hexagon::POST_LDrib:
-    return !invertPredicate ? Hexagon::POST_LDrib_cPt :
-                              Hexagon::POST_LDrib_cNotPt;
-  case Hexagon::POST_LDriub:
-    return !invertPredicate ? Hexagon::POST_LDriub_cPt :
-                              Hexagon::POST_LDriub_cNotPt;
+
   // DEALLOC_RETURN.
   case Hexagon::DEALLOC_RET_V4:
     return !invertPredicate ? Hexagon::DEALLOC_RET_cPt_V4 :
@@ -1727,6 +1019,16 @@ bool HexagonInstrInfo::isPredicatedNew(unsigned Opcode) const {
   return ((F >> HexagonII::PredicatedNewPos) & HexagonII::PredicatedNewMask);
 }
 
+// Returns true, if a ST insn can be promoted to a new-value store.
+bool HexagonInstrInfo::mayBeNewStore(const MachineInstr *MI) const {
+  const HexagonRegisterInfo& QRI = getRegisterInfo();
+  const uint64_t F = MI->getDesc().TSFlags;
+
+  return ((F >> HexagonII::mayNVStorePos) &
+           HexagonII::mayNVStoreMask &
+           QRI.Subtarget.hasV4TOps());
+}
+
 bool
 HexagonInstrInfo::DefinesPredicate(MachineInstr *MI,
                                    std::vector<MachineOperand> &Pred) const {
@@ -1911,6 +1213,8 @@ isValidAutoIncImm(const EVT VT, const int Offset) const {
 
 bool HexagonInstrInfo::
 isMemOp(const MachineInstr *MI) const {
+//  return MI->getDesc().mayLoad() && MI->getDesc().mayStore();
+
   switch (MI->getOpcode())
   {
     default: return false;
@@ -2202,6 +1506,10 @@ bool HexagonInstrInfo::isNewValueJump(const MachineInstr *MI) const {
   return false;
 }
 
+bool HexagonInstrInfo::isPostIncrement (const MachineInstr* MI) const {
+  return (getAddrMode(MI) == HexagonII::PostInc);
+}
+
 bool HexagonInstrInfo::isNewValue(const MachineInstr* MI) const {
   const uint64_t F = MI->getDesc().TSFlags;
   return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
@@ -2214,6 +1522,97 @@ bool HexagonInstrInfo::isDotNewInst (const MachineInstr* MI) const {
      (isPredicated(MI) && isPredicatedNew(MI)));
 }
 
+// Returns the most basic instruction for the .new predicated instructions and
+// new-value stores.
+// For example, all of the following instructions will be converted back to the
+// same instruction:
+// 1) if (p0.new) memw(R0+#0) = R1.new  --->
+// 2) if (p0) memw(R0+#0)= R1.new      -------> if (p0) memw(R0+#0) = R1
+// 3) if (p0.new) memw(R0+#0) = R1      --->
+//
+
+int HexagonInstrInfo::GetDotOldOp(const int opc) const {
+  int NewOp = opc;
+  if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
+    NewOp = Hexagon::getPredOldOpcode(NewOp);
+    if (NewOp < 0)
+      assert(0 && "Couldn't change predicate new instruction to its old form.");
+  }
+
+  if (isNewValueStore(NewOp)) { // Convert into non new-value format
+    NewOp = Hexagon::getNonNVStore(NewOp);
+    if (NewOp < 0)
+      assert(0 && "Couldn't change new-value store to its old form.");
+  }
+  return NewOp;
+}
+
+// Return the new value instruction for a given store.
+int HexagonInstrInfo::GetDotNewOp(const MachineInstr* MI) const {
+  int NVOpcode = Hexagon::getNewValueOpcode(MI->getOpcode());
+  if (NVOpcode >= 0) // Valid new-value store instruction.
+    return NVOpcode;
+
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown .new type");
+  // store new value byte
+  case Hexagon::STrib_shl_V4:
+    return Hexagon::STrib_shl_nv_V4;
+
+  case Hexagon::STrih_shl_V4:
+    return Hexagon::STrih_shl_nv_V4;
+
+  case Hexagon::STriw_f:
+    return Hexagon::STriw_nv_V4;
+
+  case Hexagon::STriw_indexed_f:
+    return Hexagon::STriw_indexed_nv_V4;
+
+  case Hexagon::STriw_shl_V4:
+    return Hexagon::STriw_shl_nv_V4;
+
+  }
+  return 0;
+}
+
+// Return .new predicate version for an instruction.
+int HexagonInstrInfo::GetDotNewPredOp(MachineInstr *MI,
+                                      const MachineBranchProbabilityInfo
+                                      *MBPI) const {
+
+  int NewOpcode = Hexagon::getPredNewOpcode(MI->getOpcode());
+  if (NewOpcode >= 0) // Valid predicate new instruction
+    return NewOpcode;
+
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown .new type");
+  // Condtional Jumps
+  case Hexagon::JMP_t:
+  case Hexagon::JMP_f:
+    return getDotNewPredJumpOp(MI, MBPI);
+
+  case Hexagon::JMPR_t:
+    return Hexagon::JMPR_tnew_tV3;
+
+  case Hexagon::JMPR_f:
+    return Hexagon::JMPR_fnew_tV3;
+
+  case Hexagon::JMPret_t:
+    return Hexagon::JMPret_tnew_tV3;
+
+  case Hexagon::JMPret_f:
+    return Hexagon::JMPret_fnew_tV3;
+
+
+  // Conditional combine
+  case Hexagon::COMBINE_rr_cPt :
+    return Hexagon::COMBINE_rr_cdnPt;
+  case Hexagon::COMBINE_rr_cNotPt :
+    return Hexagon::COMBINE_rr_cdnNotPt;
+  }
+}
+
+
 unsigned HexagonInstrInfo::getAddrMode(const MachineInstr* MI) const {
   const uint64_t F = MI->getDesc().TSFlags;
 
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index b721da4..3f45b8b 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -26,8 +26,9 @@
 namespace llvm {
 
 class HexagonInstrInfo : public HexagonGenInstrInfo {
+  virtual void anchor();
   const HexagonRegisterInfo RI;
-  const HexagonSubtarget& Subtarget;
+  const HexagonSubtarget &Subtarget;
   typedef unsigned Opcode_t;
 
 public:
@@ -148,11 +149,6 @@ public:
   isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumCycles,
                             const BranchProbability &Probability) const;
 
-  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
-                                                 int FrameIx,
-                                                 uint64_t Offset,
-                                                 const MDNode *MDPtr,
-                                                 DebugLoc DL) const;
   virtual DFAPacketizer*
   CreateTargetScheduleState(const TargetMachine *TM,
                             const ScheduleDAG *DAG) const;
@@ -185,12 +181,19 @@ public:
   bool isNewValueInst(const MachineInstr* MI) const;
   bool isNewValue(const MachineInstr* MI) const;
   bool isDotNewInst(const MachineInstr* MI) const;
+  int GetDotOldOp(const int opc) const;
+  int GetDotNewOp(const MachineInstr* MI) const;
+  int GetDotNewPredOp(MachineInstr *MI,
+                      const MachineBranchProbabilityInfo
+                      *MBPI) const;
+  bool mayBeNewStore(const MachineInstr* MI) const;
   bool isDeallocRet(const MachineInstr *MI) const;
   unsigned getInvertedPredicatedOpcode(const int Opc) const;
   bool isExtendable(const MachineInstr* MI) const;
   bool isExtended(const MachineInstr* MI) const;
   bool isPostIncrement(const MachineInstr* MI) const;
   bool isNewValueStore(const MachineInstr* MI) const;
+  bool isNewValueStore(unsigned Opcode) const;
   bool isNewValueJump(const MachineInstr* MI) const;
   bool isNewValueJumpCandidate(const MachineInstr *MI) const;
 
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index 2a4b17b..c96aaca 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -384,6 +384,12 @@ def TFCR : CRInst<(outs CRRegs:$dst), (ins IntRegs:$src1),
 // ALU32/PERM +
 //===----------------------------------------------------------------------===//
 
+let neverHasSideEffects = 1 in
+def COMBINE_ii : ALU32_ii<(outs DoubleRegs:$dst),
+            (ins s8Imm:$src1, s8Imm:$src2),
+            "$dst = combine(#$src1, #$src2)",
+            []>;
+
 // Mux.
 def VMUX_prr64 : ALU64_rr<(outs DoubleRegs:$dst), (ins PredRegs:$src1,
                                                    DoubleRegs:$src2,
@@ -932,12 +938,21 @@ multiclass LD_MEMri<string mnemonic, string CextOp, RegisterClass RC,
 }
 
 let addrMode = BaseImmOffset, isMEMri = "true" in {
-  defm LDrib: LD_MEMri < "memb", "LDrib", IntRegs, 11, 6>, AddrModeRel;
-  defm LDriub: LD_MEMri < "memub" , "LDriub", IntRegs, 11, 6>, AddrModeRel;
-  defm LDrih: LD_MEMri < "memh", "LDrih", IntRegs, 12, 7>, AddrModeRel;
-  defm LDriuh: LD_MEMri < "memuh", "LDriuh", IntRegs, 12, 7>, AddrModeRel;
-  defm LDriw: LD_MEMri < "memw", "LDriw", IntRegs, 13, 8>, AddrModeRel;
-  defm LDrid: LD_MEMri < "memd", "LDrid", DoubleRegs, 14, 9>, AddrModeRel;
+  let accessSize = ByteAccess in {
+    defm LDrib: LD_MEMri < "memb", "LDrib", IntRegs, 11, 6>, AddrModeRel;
+    defm LDriub: LD_MEMri < "memub" , "LDriub", IntRegs, 11, 6>, AddrModeRel;
+ }
+
+  let accessSize = HalfWordAccess in {
+    defm LDrih: LD_MEMri < "memh", "LDrih", IntRegs, 12, 7>, AddrModeRel;
+    defm LDriuh: LD_MEMri < "memuh", "LDriuh", IntRegs, 12, 7>, AddrModeRel;
+ }
+
+  let accessSize = WordAccess in
+    defm LDriw: LD_MEMri < "memw", "LDriw", IntRegs, 13, 8>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess in
+    defm LDrid: LD_MEMri < "memd", "LDrid", DoubleRegs, 14, 9>, AddrModeRel;
 }
 
 def : Pat < (i32 (sextloadi8 ADDRriS11_0:$addr)),
@@ -1000,18 +1015,25 @@ multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
 }
 
 let addrMode = BaseImmOffset in {
-  defm LDrib_indexed: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext,
-                               11, 6>, AddrModeRel;
-  defm LDriub_indexed: LD_Idxd <"memub" , "LDriub", IntRegs, s11_0Ext, u6_0Ext,
-                                11, 6>, AddrModeRel;
-  defm LDrih_indexed: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext,
-                               12, 7>, AddrModeRel;
-  defm LDriuh_indexed: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext,
-                                12, 7>, AddrModeRel;
-  defm LDriw_indexed: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext,
-                               13, 8>, AddrModeRel;
-  defm LDrid_indexed: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext,
-                               14, 9>, AddrModeRel;
+  let accessSize = ByteAccess in {
+    defm LDrib_indexed: LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext,
+                                  11, 6>, AddrModeRel;
+    defm LDriub_indexed: LD_Idxd <"memub" , "LDriub", IntRegs, s11_0Ext, u6_0Ext,
+                                   11, 6>, AddrModeRel;
+  }
+  let accessSize = HalfWordAccess in {
+    defm LDrih_indexed: LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext,
+                                 12, 7>, AddrModeRel;
+    defm LDriuh_indexed: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext,
+                                  12, 7>, AddrModeRel;
+  }
+  let accessSize = WordAccess in
+    defm LDriw_indexed: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext,
+                                 13, 8>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess in
+    defm LDrid_indexed: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext,
+                                 14, 9>, AddrModeRel;
 }
 
 let AddedComplexity = 20 in {
@@ -1036,8 +1058,6 @@ def : Pat < (i64 (load (add IntRegs:$src1, s11_3ExtPred:$offset))),
 
 //===----------------------------------------------------------------------===//
 // Post increment load
-// Make sure that in post increment load, the first operand is always the post
-// increment operand.
 //===----------------------------------------------------------------------===//
 
 multiclass LD_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
@@ -1079,7 +1099,7 @@ multiclass LD_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
   }
 }
 
-let hasCtrlDep = 1, neverHasSideEffects = 1 in {
+let hasCtrlDep = 1, neverHasSideEffects = 1, addrMode = PostInc in {
   defm POST_LDrib : LD_PostInc<"memb", "LDrib", IntRegs, s4_0Imm>,
                     PredNewRel;
   defm POST_LDriub : LD_PostInc<"memub", "LDriub", IntRegs, s4_0Imm>,
@@ -1382,7 +1402,7 @@ multiclass ST_PostInc_Pbase<string mnemonic, RegisterClass RC, Operand ImmOp,
 multiclass ST_PostInc_Pred<string mnemonic, RegisterClass RC,
                            Operand ImmOp, bit PredNot> {
   let isPredicatedFalse = PredNot in {
-    defm _c#NAME# : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
+    defm _c#NAME : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 0>;
     // Predicate new
     let Predicates = [HasV4T], validSubTargets = HasV4SubT in
     defm _cdn#NAME#_V4 : ST_PostInc_Pbase<mnemonic, RC, ImmOp, PredNot, 1>;
@@ -1397,7 +1417,7 @@ multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
     let isPredicable = 1 in
     def NAME : STInst2PI<(outs IntRegs:$dst),
                 (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
-                #mnemonic#"($src1++#$offset) = $src2",
+                mnemonic#"($src1++#$offset) = $src2",
                 [],
                 "$src1 = $dst">;
 
@@ -1474,12 +1494,17 @@ multiclass ST_MEMri<string mnemonic, string CextOp, RegisterClass RC,
 }
 
 let addrMode = BaseImmOffset, isMEMri = "true" in {
-  defm STrib: ST_MEMri < "memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
-  defm STrih: ST_MEMri < "memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
-  defm STriw: ST_MEMri < "memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+  let accessSize = ByteAccess in
+    defm STrib: ST_MEMri < "memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih: ST_MEMri < "memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
 
-  let isNVStorable = 0 in
-  defm STrid: ST_MEMri < "memd", "STrid", DoubleRegs, 14, 9>, AddrModeRel;
+  let accessSize = WordAccess in
+    defm STriw: ST_MEMri < "memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess, isNVStorable = 0 in
+    defm STrid: ST_MEMri < "memd", "STrid", DoubleRegs, 14, 9>, AddrModeRel;
 }
 
 def : Pat<(truncstorei8 (i32 IntRegs:$src1), ADDRriS11_0:$addr),
@@ -1541,15 +1566,21 @@ multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
 }
 
 let addrMode = BaseImmOffset, InputType = "reg" in {
-  defm STrib_indexed: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext,
-                                u6_0Ext, 11, 6>, AddrModeRel, ImmRegRel;
-  defm STrih_indexed: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext,
-                                u6_1Ext, 12, 7>, AddrModeRel, ImmRegRel;
-  defm STriw_indexed: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext,
-                                u6_2Ext, 13, 8>, AddrModeRel, ImmRegRel;
-  let isNVStorable = 0 in
-  defm STrid_indexed: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
-                                u6_3Ext, 14, 9>, AddrModeRel;
+  let accessSize = ByteAccess in
+    defm STrib_indexed: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext,
+                                  u6_0Ext, 11, 6>, AddrModeRel, ImmRegRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_indexed: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext,
+                                  u6_1Ext, 12, 7>, AddrModeRel, ImmRegRel;
+
+  let accessSize = WordAccess in
+    defm STriw_indexed: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext,
+                                  u6_2Ext, 13, 8>, AddrModeRel, ImmRegRel;
+
+  let accessSize = DoubleWordAccess, isNVStorable = 0 in
+    defm STrid_indexed: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
+                                  u6_3Ext, 14, 9>, AddrModeRel;
 }
 
 let AddedComplexity = 10 in {
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index 933239d..475c23d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -213,7 +213,7 @@ def COMBINE_iI_V4 : ALU32_ii<(outs DoubleRegs:$dst),
 // Template class for load instructions with Absolute set addressing mode.
 //===----------------------------------------------------------------------===//
 let isExtended = 1, opExtendable = 2, neverHasSideEffects = 1,
-validSubTargets = HasV4SubT in
+validSubTargets = HasV4SubT, addrMode = AbsoluteSet in
 class T_LD_abs_set<string mnemonic, RegisterClass RC>:
             LDInst2<(outs RC:$dst1, IntRegs:$dst2),
             (ins u0AlwaysExt:$addr),
@@ -266,12 +266,23 @@ multiclass ld_idxd_shl<string mnemonic, string CextOp, RegisterClass RC> {
 }
 
 let addrMode = BaseRegOffset in {
-  defm LDrib_indexed_shl: ld_idxd_shl<"memb", "LDrib", IntRegs>, AddrModeRel;
-  defm LDriub_indexed_shl: ld_idxd_shl<"memub", "LDriub", IntRegs>, AddrModeRel;
-  defm LDrih_indexed_shl: ld_idxd_shl<"memh", "LDrih", IntRegs>, AddrModeRel;
-  defm LDriuh_indexed_shl: ld_idxd_shl<"memuh", "LDriuh", IntRegs>, AddrModeRel;
-  defm LDriw_indexed_shl: ld_idxd_shl<"memw", "LDriw", IntRegs>, AddrModeRel;
-  defm LDrid_indexed_shl: ld_idxd_shl<"memd", "LDrid", DoubleRegs>, AddrModeRel;
+  let accessSize = ByteAccess in {
+    defm LDrib_indexed_shl: ld_idxd_shl<"memb", "LDrib", IntRegs>,
+                                        AddrModeRel;
+    defm LDriub_indexed_shl: ld_idxd_shl<"memub", "LDriub", IntRegs>,
+                                        AddrModeRel;
+  }
+  let accessSize = HalfWordAccess in {
+    defm LDrih_indexed_shl: ld_idxd_shl<"memh", "LDrih", IntRegs>, AddrModeRel;
+    defm LDriuh_indexed_shl: ld_idxd_shl<"memuh", "LDriuh", IntRegs>,
+                             AddrModeRel;
+  }
+  let accessSize = WordAccess in
+     defm LDriw_indexed_shl: ld_idxd_shl<"memw", "LDriw", IntRegs>, AddrModeRel;
+
+  let accessSize = DoubleWordAccess in
+    defm LDrid_indexed_shl: ld_idxd_shl<"memd", "LDrid", DoubleRegs>,
+                             AddrModeRel;
 }
 
 // 'def pats' for load instructions with base + register offset and non-zero
@@ -456,7 +467,8 @@ def:  Pat <(i64 (extloadi32 (i32 (add IntRegs:$src1, s11_2ExtPred:$offset)))),
 //===----------------------------------------------------------------------===//
 // Template class for store instructions with Absolute set addressing mode.
 //===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT in
+let isExtended = 1, opExtendable = 2, validSubTargets = HasV4SubT,
+addrMode = AbsoluteSet in
 class T_ST_abs_set<string mnemonic, RegisterClass RC>:
             STInst2<(outs IntRegs:$dst1),
             (ins RC:$src1, u0AlwaysExt:$src2),
@@ -551,17 +563,20 @@ multiclass ST_Idxd_shl_nv<string mnemonic, string CextOp, RegisterClass RC> {
 
 let addrMode = BaseRegOffset, neverHasSideEffects = 1,
 validSubTargets = HasV4SubT in {
-  defm STrib_indexed_shl: ST_Idxd_shl<"memb", "STrib", IntRegs>,
-                          ST_Idxd_shl_nv<"memb", "STrib", IntRegs>, AddrModeRel;
+  let accessSize = ByteAccess in
+    defm STrib_indexed_shl: ST_Idxd_shl<"memb", "STrib", IntRegs>,
+                            ST_Idxd_shl_nv<"memb", "STrib", IntRegs>, AddrModeRel;
 
-  defm STrih_indexed_shl: ST_Idxd_shl<"memh", "STrih", IntRegs>,
-                          ST_Idxd_shl_nv<"memh", "STrih", IntRegs>, AddrModeRel;
+  let accessSize = HalfWordAccess in
+    defm STrih_indexed_shl: ST_Idxd_shl<"memh", "STrih", IntRegs>,
+                            ST_Idxd_shl_nv<"memh", "STrih", IntRegs>, AddrModeRel;
 
-  defm STriw_indexed_shl: ST_Idxd_shl<"memw", "STriw", IntRegs>,
-                          ST_Idxd_shl_nv<"memw", "STriw", IntRegs>, AddrModeRel;
+  let accessSize = WordAccess in
+    defm STriw_indexed_shl: ST_Idxd_shl<"memw", "STriw", IntRegs>,
+                            ST_Idxd_shl_nv<"memw", "STriw", IntRegs>, AddrModeRel;
 
-  let isNVStorable = 0 in
-  defm STrid_indexed_shl: ST_Idxd_shl<"memd", "STrid", DoubleRegs>, AddrModeRel;
+  let isNVStorable = 0, accessSize = DoubleWordAccess in
+    defm STrid_indexed_shl: ST_Idxd_shl<"memd", "STrid", DoubleRegs>, AddrModeRel;
 }
 
 let Predicates = [HasV4T], AddedComplexity = 10 in {
@@ -695,10 +710,15 @@ multiclass ST_Imm<string mnemonic, string CextOp, Operand OffsetOp> {
 }
 
 let addrMode = BaseImmOffset, InputType = "imm",
-    validSubTargets = HasV4SubT in {
-  defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel;
-  defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel;
-  defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel;
+validSubTargets = HasV4SubT in {
+  let accessSize = ByteAccess in
+    defm STrib_imm : ST_Imm<"memb", "STrib", u6_0Imm>, ImmRegRel, PredNewRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_imm : ST_Imm<"memh", "STrih", u6_1Imm>, ImmRegRel, PredNewRel;
+
+  let accessSize = WordAccess in
+    defm STriw_imm : ST_Imm<"memw", "STriw", u6_2Imm>, ImmRegRel, PredNewRel;
 }
 
 let Predicates = [HasV4T], AddedComplexity = 10 in {
@@ -834,12 +854,17 @@ multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
 }
 
 let addrMode = BaseImmOffset, validSubTargets = HasV4SubT in {
-  defm STrib_indexed: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
-                                 u6_0Ext, 11, 6>, AddrModeRel;
-  defm STrih_indexed: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
-                                 u6_1Ext, 12, 7>, AddrModeRel;
-  defm STriw_indexed: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
-                                 u6_2Ext, 13, 8>, AddrModeRel;
+  let accessSize = ByteAccess in
+    defm STrib_indexed: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
+                                   u6_0Ext, 11, 6>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih_indexed: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
+                                   u6_1Ext, 12, 7>, AddrModeRel;
+
+  let accessSize = WordAccess in
+    defm STriw_indexed: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
+                                   u6_2Ext, 13, 8>, AddrModeRel;
 }
 
 // multiclass for new-value store instructions with base + immediate offset.
@@ -887,9 +912,14 @@ multiclass ST_MEMri_nv<string mnemonic, string CextOp, RegisterClass RC,
 
 let addrMode = BaseImmOffset, isMEMri = "true", validSubTargets = HasV4SubT,
 mayStore = 1 in {
-  defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
-  defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
-  defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
+  let accessSize = ByteAccess in
+    defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel;
+
+  let accessSize = HalfWordAccess in
+    defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel;
+
+  let accessSize = WordAccess in
+    defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel;
 }
 
 //===----------------------------------------------------------------------===//
@@ -939,7 +969,7 @@ multiclass ST_PostInc_nv<string mnemonic, string BaseOp, RegisterClass RC,
   }
 }
 
-let validSubTargets = HasV4SubT in {
+let addrMode = PostInc, validSubTargets = HasV4SubT in {
 defm POST_STbri: ST_PostInc_nv <"memb", "STrib", IntRegs, s4_0Imm>, AddrModeRel;
 defm POST_SThri: ST_PostInc_nv <"memh", "STrih", IntRegs, s4_1Imm>, AddrModeRel;
 defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
@@ -2534,8 +2564,9 @@ def NTSTBIT_ri : SInst<(outs PredRegs:$dst), (ins IntRegs:$src1, u5Imm:$src2),
 //Deallocate frame and return.
 //    dealloc_return
 let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
-  Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1 in {
-  def DEALLOC_RET_V4 : NVInst_V4<(outs), (ins i32imm:$amt1),
+  Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_V4 : LD0Inst<(outs), (ins),
             "dealloc_return",
             []>,
             Requires<[HasV4T]>;
@@ -2544,9 +2575,10 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1, isPredicable = 1,
 // Restore registers and dealloc return function call.
 let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
   Defs = [R29, R30, R31, PC] in {
+let validSubTargets = HasV4SubT in
   def RESTORE_DEALLOC_RET_JMP_V4 : JInst<(outs),
                                    (ins calltarget:$dst),
-             "jump $dst // Restore_and_dealloc_return",
+             "jump $dst",
              []>,
              Requires<[HasV4T]>;
 }
@@ -2554,9 +2586,10 @@ let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
 // Restore registers and dealloc frame before a tail call.
 let isCall = 1, isBarrier = 1,
   Defs = [R29, R30, R31, PC] in {
+let validSubTargets = HasV4SubT in
   def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : JInst<(outs),
                                            (ins calltarget:$dst),
-             "call $dst // Restore_and_dealloc_before_tailcall",
+             "call $dst",
              []>,
              Requires<[HasV4T]>;
 }
@@ -2573,10 +2606,11 @@ let isCall = 1, isBarrier = 1,
 
 //    if (Ps) dealloc_return
 let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
     isPredicated = 1 in {
-  def DEALLOC_RET_cPt_V4 : NVInst_V4<(outs),
-                           (ins PredRegs:$src1, i32imm:$amt1),
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cPt_V4 : LD0Inst<(outs),
+                           (ins PredRegs:$src1),
             "if ($src1) dealloc_return",
             []>,
             Requires<[HasV4T]>;
@@ -2584,10 +2618,10 @@ let isReturn = 1, isTerminator = 1,
 
 //    if (!Ps) dealloc_return
 let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
-    isPredicated = 1 in {
-  def DEALLOC_RET_cNotPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
-                                                     i32imm:$amt1),
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cNotPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
             "if (!$src1) dealloc_return",
             []>,
             Requires<[HasV4T]>;
@@ -2595,10 +2629,10 @@ let isReturn = 1, isTerminator = 1,
 
 //    if (Ps.new) dealloc_return:nt
 let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
     isPredicated = 1 in {
-  def DEALLOC_RET_cdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
-                                                     i32imm:$amt1),
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
             "if ($src1.new) dealloc_return:nt",
             []>,
             Requires<[HasV4T]>;
@@ -2606,10 +2640,10 @@ let isReturn = 1, isTerminator = 1,
 
 //    if (!Ps.new) dealloc_return:nt
 let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
-    isPredicated = 1 in {
-  def DEALLOC_RET_cNotdnPnt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
-                                                        i32imm:$amt1),
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cNotdnPnt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
             "if (!$src1.new) dealloc_return:nt",
             []>,
             Requires<[HasV4T]>;
@@ -2617,21 +2651,21 @@ let isReturn = 1, isTerminator = 1,
 
 //    if (Ps.new) dealloc_return:t
 let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
     isPredicated = 1 in {
-  def DEALLOC_RET_cdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
-                                                    i32imm:$amt1),
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
             "if ($src1.new) dealloc_return:t",
             []>,
             Requires<[HasV4T]>;
 }
 
-//    if (!Ps.new) dealloc_return:nt
+// if (!Ps.new) dealloc_return:nt
 let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R29, R31], neverHasSideEffects = 1,
-    isPredicated = 1 in {
-  def DEALLOC_RET_cNotdnPt_V4 : NVInst_V4<(outs), (ins PredRegs:$src1,
-                                                       i32imm:$amt1),
+    Defs = [R29, R30, R31, PC], Uses = [R30], neverHasSideEffects = 1,
+    isPredicated = 1, isPredicatedFalse = 1 in {
+let validSubTargets = HasV4SubT in
+  def DEALLOC_RET_cNotdnPt_V4 : LD0Inst<(outs), (ins PredRegs:$src1),
             "if (!$src1.new) dealloc_return:t",
             []>,
             Requires<[HasV4T]>;
@@ -3033,37 +3067,42 @@ def : Pat<(HexagonCONST32_GP tblockaddress:$src1),
           (TFRI_V4 tblockaddress:$src1)>,
           Requires<[HasV4T]>;
 
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
 def TFRI_cPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                           (ins PredRegs:$src1, globaladdress:$src2),
-           "if($src1) $dst = ##$src2",
+                           (ins PredRegs:$src1, s16Ext:$src2),
+           "if($src1) $dst = #$src2",
            []>,
            Requires<[HasV4T]>;
 
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
 def TFRI_cNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                              (ins PredRegs:$src1, globaladdress:$src2),
-           "if(!$src1) $dst = ##$src2",
+                              (ins PredRegs:$src1, s16Ext:$src2),
+           "if(!$src1) $dst = #$src2",
            []>,
            Requires<[HasV4T]>;
 
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
 def TFRI_cdnPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                             (ins PredRegs:$src1, globaladdress:$src2),
-           "if($src1.new) $dst = ##$src2",
+                             (ins PredRegs:$src1, s16Ext:$src2),
+           "if($src1.new) $dst = #$src2",
            []>,
            Requires<[HasV4T]>;
 
-let AddedComplexity=50, neverHasSideEffects = 1, isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, AddedComplexity=50, isPredicatedFalse = 1,
+neverHasSideEffects = 1, isPredicated = 1, validSubTargets = HasV4SubT in
 def TFRI_cdnNotPt_V4 : ALU32_ri<(outs IntRegs:$dst),
-                                (ins PredRegs:$src1, globaladdress:$src2),
-           "if(!$src1.new) $dst = ##$src2",
+                                (ins PredRegs:$src1, s16Ext:$src2),
+           "if(!$src1.new) $dst = #$src2",
            []>,
            Requires<[HasV4T]>;
 
 let AddedComplexity = 50, Predicates = [HasV4T] in
 def : Pat<(HexagonCONST32_GP tglobaladdr:$src1),
-           (TFRI_V4 tglobaladdr:$src1)>;
+           (TFRI_V4 tglobaladdr:$src1)>,
+           Requires<[HasV4T]>;
 
 
 // Load - Indirect with long offset: These instructions take global address
@@ -3149,6 +3188,93 @@ def STriw_offset_ext_V4 : STInst<(outs),
                     (add IntRegs:$src1, u6_2ImmPred:$src2))]>,
             Requires<[HasV4T]>;
 
+def : Pat<(i64 (ctlz (i64 DoubleRegs:$src1))),
+          (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTLZ64_rr DoubleRegs:$src1))))>,
+          Requires<[HasV4T]>;
+
+def : Pat<(i64 (cttz (i64 DoubleRegs:$src1))),
+          (i64 (COMBINE_Ir_V4 (i32 0), (i32 (CTTZ64_rr DoubleRegs:$src1))))>,
+          Requires<[HasV4T]>;
+
+
+// i8 -> i64 loads
+// We need a complexity of 120 here to overide preceeding handling of
+// zextloadi8.
+let Predicates = [HasV4T], AddedComplexity = 120 in {
+def:  Pat <(i64 (extloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 tglobaladdr:$addr)))>;
+
+def:  Pat <(i64 (zextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 tglobaladdr:$addr)))>;
+
+def:  Pat <(i64 (sextloadi8 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (SXTW (LDrib_abs_V4 tglobaladdr:$addr)))>;
+
+def:  Pat <(i64 (extloadi8 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+
+def:  Pat <(i64 (zextloadi8 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriub_abs_V4 FoldGlobalAddr:$addr)))>;
+
+def:  Pat <(i64 (sextloadi8 FoldGlobalAddr:$addr)),
+      (i64 (SXTW (LDrib_abs_V4 FoldGlobalAddr:$addr)))>;
+}
+// i16 -> i64 loads
+// We need a complexity of 120 here to overide preceeding handling of
+// zextloadi16.
+let AddedComplexity = 120 in {
+def:  Pat <(i64 (extloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi16 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (SXTW (LDrih_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (extloadi16 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi16 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriuh_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi16 FoldGlobalAddr:$addr)),
+      (i64 (SXTW (LDrih_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+}
+// i32->i64 loads
+// We need a complexity of 120 here to overide preceeding handling of
+// zextloadi32.
+let AddedComplexity = 120 in {
+def:  Pat <(i64 (extloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi32 (NumUsesBelowThresCONST32 tglobaladdr:$addr))),
+      (i64 (SXTW (LDriw_abs_V4 tglobaladdr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (extloadi32 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (zextloadi32 FoldGlobalAddr:$addr)),
+      (i64 (COMBINE_Ir_V4 0, (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+
+def:  Pat <(i64 (sextloadi32 FoldGlobalAddr:$addr)),
+      (i64 (SXTW (LDriw_abs_V4 FoldGlobalAddr:$addr)))>,
+      Requires<[HasV4T]>;
+}
 
 // Indexed store double word - global address.
 // memw(Rs+#u6:2)=#S8
@@ -3264,4 +3390,3 @@ def : Pat<(i32 (load FoldGlobalAddrGP:$addr)),
 def : Pat<(atomic_load_32 FoldGlobalAddrGP:$addr),
           (i32 (LDriw_abs_V4 FoldGlobalAddrGP:$addr))>,
            Requires<[HasV4T]>;
-
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td
index 92d098c..9da6074 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV5.td
@@ -26,22 +26,29 @@ def CONST32_Float_Real : LDInst<(outs IntRegs:$dst), (ins f32imm:$src1),
 // Only works with single precision fp value.
 // For double precision, use CONST64_float_real, as 64bit transfer
 // can only hold 40-bit values - 32 from const ext + 8 bit immediate.
-let isMoveImm = 1, isReMaterializable = 1, isPredicable = 1 in
-def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32imm:$src1),
-           "$dst = ##$src1",
+// Make sure that complexity is more than the CONST32 pattern in
+// HexagonInstrInfo.td patterns.
+let isExtended = 1, opExtendable = 1, isMoveImm = 1, isReMaterializable = 1,
+isPredicable = 1, AddedComplexity = 30, validSubTargets = HasV5SubT,
+isCodeGenOnly = 1 in
+def TFRI_f : ALU32_ri<(outs IntRegs:$dst), (ins f32Ext:$src1),
+           "$dst = #$src1",
            [(set IntRegs:$dst, fpimm:$src1)]>,
           Requires<[HasV5T]>;
 
+let isExtended = 1, opExtendable = 2, isPredicated = 1,
+neverHasSideEffects = 1, validSubTargets = HasV5SubT in
 def TFRI_cPt_f : ALU32_ri<(outs IntRegs:$dst),
-                          (ins PredRegs:$src1, f32imm:$src2),
-           "if ($src1) $dst = ##$src2",
+                          (ins PredRegs:$src1, f32Ext:$src2),
+           "if ($src1) $dst = #$src2",
            []>,
           Requires<[HasV5T]>;
 
-let isPredicated = 1 in
+let isExtended = 1, opExtendable = 2, isPredicated = 1, isPredicatedFalse = 1,
+neverHasSideEffects = 1, validSubTargets = HasV5SubT in
 def TFRI_cNotPt_f : ALU32_ri<(outs IntRegs:$dst),
-                             (ins PredRegs:$src1, f32imm:$src2),
-           "if (!$src1) $dst = ##$src2",
+                             (ins PredRegs:$src1, f32Ext:$src2),
+           "if (!$src1) $dst =#$src2",
            []>,
           Requires<[HasV5T]>;
 
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index f011d51..bbb2fa4 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -73,7 +73,7 @@ void llvm::HexagonLowerToMC(const MachineInstr* MI, HexagonMCInst& MCI,
                AP.OutContext));
       break;
     case MachineOperand::MO_GlobalAddress:
-      MCO = GetSymbolRef(MO, AP.Mang->getSymbol(MO.getGlobal()), AP);
+      MCO = GetSymbolRef(MO, AP.getSymbol(MO.getGlobal()), AP);
       break;
     case MachineOperand::MO_ExternalSymbol:
       MCO = GetSymbolRef(MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()),
diff --git a/lib/Target/MBlaze/MBlazeMachineFunction.cpp b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
index 2217b54..9579c8b 100644
--- a/lib/Target/MBlaze/MBlazeMachineFunction.cpp
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MBlazeMachineFunctionInfo.cpp - Private data ----------------------===//
+//= HexagonMachineFunctionInfo.cpp - Hexagon machine function info *- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MBlazeMachineFunction.h"
+#include "HexagonMachineFunctionInfo.h"
 
 using namespace llvm;
 
-void MBlazeFunctionInfo::anchor() { }
+// pin vtable to this file
+void HexagonMachineFunctionInfo::anchor() {}
+
diff --git a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index bd7b26a..a59c8c9 100644
--- a/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//=- HexagonMachineFuctionInfo.h - Hexagon machine function info --*- C++ -*-=//
+//=- HexagonMachineFunctionInfo.h - Hexagon machine function info -*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,6 +10,7 @@
 #ifndef HexagonMACHINEFUNCTIONINFO_H
 #define HexagonMACHINEFUNCTIONINFO_H
 
+#include <map>
 #include "llvm/CodeGen/MachineFunction.h"
 
 namespace llvm {
@@ -30,9 +31,8 @@ class HexagonMachineFunctionInfo : public MachineFunctionInfo {
   int VarArgsFrameIndex;
   bool HasClobberLR;
   bool HasEHReturn;
-
   std::map<const MachineInstr*, unsigned> PacketInfo;
-
+  virtual void anchor();
 
 public:
   HexagonMachineFunctionInfo() : SRetReturnReg(0), HasClobberLR(0),
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 1388ad4..c94f081 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -195,7 +195,6 @@ void VLIWMachineScheduler::schedule() {
 void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
   DAG = static_cast<VLIWMachineScheduler*>(dag);
   SchedModel = DAG->getSchedModel();
-  TRI = DAG->TRI;
 
   Top.init(DAG, SchedModel);
   Bot.init(DAG, SchedModel);
@@ -209,6 +208,8 @@ void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
   Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
   Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
 
+  delete Top.ResourceModel;
+  delete Bot.ResourceModel;
   Top.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
   Bot.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
 
@@ -223,7 +224,7 @@ void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
   for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
     unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
-    unsigned MinLatency = I->getMinLatency();
+    unsigned MinLatency = I->getLatency();
 #ifndef NDEBUG
     Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
 #endif
@@ -242,7 +243,7 @@ void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
   for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
        I != E; ++I) {
     unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
-    unsigned MinLatency = I->getMinLatency();
+    unsigned MinLatency = I->getLatency();
 #ifndef NDEBUG
     Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
 #endif
@@ -406,11 +407,11 @@ SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
 #ifndef NDEBUG
 void ConvergingVLIWScheduler::traceCandidate(const char *Label,
                                              const ReadyQueue &Q,
-                                             SUnit *SU, PressureElement P) {
+                                             SUnit *SU, PressureChange P) {
   dbgs() << Label << " " << Q.getName() << " ";
   if (P.isValid())
-    dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
-           << " ";
+    dbgs() << DAG->TRI->getRegPressureSetName(P.getPSet()) << ":"
+           << P.getUnitInc() << " ";
   else
     dbgs() << "     ";
   SU->dump(DAG);
@@ -456,9 +457,7 @@ static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
 // Constants used to denote relative importance of
 // heuristic components for cost computation.
 static const unsigned PriorityOne = 200;
-static const unsigned PriorityTwo = 100;
-static const unsigned PriorityThree = 50;
-static const unsigned PriorityFour = 20;
+static const unsigned PriorityTwo = 50;
 static const unsigned ScaleTwo = 10;
 static const unsigned FactorOne = 2;
 
@@ -516,8 +515,8 @@ int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
   ResCount += (NumNodesBlocking * ScaleTwo);
 
   // Factor in reg pressure as a heuristic.
-  ResCount -= (Delta.Excess.UnitIncrease*PriorityThree);
-  ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree);
+  ResCount -= (Delta.Excess.getUnitInc()*PriorityTwo);
+  ResCount -= (Delta.CriticalMax.getUnitInc()*PriorityTwo);
 
   DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
 
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index f68dadf..8ac333f 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -190,7 +190,6 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
 
   VLIWMachineScheduler *DAG;
   const TargetSchedModel *SchedModel;
-  const TargetRegisterInfo *TRI;
 
   // State of the top and bottom scheduled instruction boundaries.
   SchedBoundary Top;
@@ -205,7 +204,7 @@ public:
   };
 
   ConvergingVLIWScheduler():
-    DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+    DAG(0), SchedModel(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
 
   virtual void initialize(ScheduleDAGMI *dag);
 
@@ -234,7 +233,7 @@ protected:
                                SchedCandidate &Candidate);
 #ifndef NDEBUG
   void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
-                      PressureElement P = PressureElement());
+                      PressureChange P = PressureChange());
 #endif
 };
 
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 05e6968..f7c4513 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -631,6 +631,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
                                     .addMBB(jmpTarget);
 
           assert(NewMI && "New Value Jump Instruction Not created!");
+          (void)NewMI;
           if (cmpInstr->getOperand(0).isReg() &&
               cmpInstr->getOperand(0).isKill())
             cmpInstr->getOperand(0).setIsKill(false);
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 89e3406..5490ecd 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -29,7 +29,7 @@
 //
 // Note: The peephole pass makes the instrucstions like
 // %vreg170<def> = SXTW %vreg166 or %vreg16<def> = NOT_p %vreg15<kill>
-// redundant and relies on some form of dead removal instrucions, like
+// redundant and relies on some form of dead removal instructions, like
 // DCE or DIE to actually eliminate them.
 
 
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index d8b4e2f..1786e9d 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -38,11 +38,9 @@
 using namespace llvm;
 
 
-HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st,
-                                     const HexagonInstrInfo &tii)
+HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st)
   : HexagonGenRegisterInfo(Hexagon::R31),
-    Subtarget(st),
-   TII(tii) {
+    Subtarget(st) {
 }
 
 const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
@@ -130,6 +128,8 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 
   // Addressable stack objects are accessed using neg. offsets from %fp.
   MachineFunction &MF = *MI.getParent()->getParent();
+  const HexagonInstrInfo &TII =
+    *static_cast<const HexagonInstrInfo*>(MF.getTarget().getInstrInfo());
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
   MachineFrameInfo &MFI = *MF.getFrameInfo();
 
@@ -295,23 +295,5 @@ unsigned HexagonRegisterInfo::getStackRegister() const {
   return Hexagon::R29;
 }
 
-void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove>
-                                               &Moves)  const
-{
-  // VirtualFP = (R30 + #0).
-  unsigned FPReg = getFrameRegister();
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(FPReg, 0);
-  Moves.push_back(MachineMove(0, Dst, Src));
-}
-
-unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned HexagonRegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
-
 #define GET_REGINFO_TARGET_DESC
 #include "HexagonGenRegisterInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 8a3f94a..89af7c3 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -44,9 +44,8 @@ class Type;
 
 struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
   HexagonSubtarget &Subtarget;
-  const HexagonInstrInfo &TII;
 
-  HexagonRegisterInfo(HexagonSubtarget &st, const HexagonInstrInfo &tii);
+  HexagonRegisterInfo(HexagonSubtarget &st);
 
   /// Code Generation virtual methods...
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
@@ -78,12 +77,7 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
   unsigned getRARegister() const;
   unsigned getFrameRegister(const MachineFunction &MF) const;
   unsigned getFrameRegister() const;
-  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
   unsigned getStackRegister() const;
-
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index fe41fc3..8ea1b7e 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -57,8 +57,8 @@ let Namespace = "Hexagon" in {
     let Aliases = [R];
   }
 
-  def subreg_loreg  : SubRegIndex;
-  def subreg_hireg  : SubRegIndex;
+  def subreg_loreg  : SubRegIndex<32>;
+  def subreg_hireg  : SubRegIndex<32, 32>;
 
   // Integer registers.
   def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index a52c604..c37bf9f 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -27,7 +27,7 @@ HexagonSelectionDAGInfo::~HexagonSelectionDAGInfo() {
 
 SDValue
 HexagonSelectionDAGInfo::
-EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, SDValue Chain,
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
                         SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
                         bool isVolatile, bool AlwaysInline,
                         MachinePointerInfo DstPtrInfo,
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 0673e4d..31f278a 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -26,7 +26,7 @@ public:
   ~HexagonSelectionDAGInfo();
 
   virtual
-  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
                                   SDValue Dst, SDValue Src,
                                   SDValue Size, unsigned Align,
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
new file mode 100644
index 0000000..5166f8e
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -0,0 +1,174 @@
+//=== HexagonSplitConst32AndConst64.cpp - split CONST32/Const64 into HI/LO ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When the compiler is invoked with no small data, for instance, with the -G0
+// command line option, then all CONST32_* opcodes should be broken down into
+// appropriate LO and HI instructions. This splitting is done by this pass.
+// The only reason this is not done in the DAG lowering itself is that there
+// is no simple way of getting the register allocator to allot the same hard
+// register to the result of LO and HI instructions. This pass is always
+// scheduled after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xfer"
+
+#include "HexagonTargetMachine.h"
+#include "HexagonSubtarget.h"
+#include "HexagonMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include <map>
+
+using namespace llvm;
+
+namespace {
+
+class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
+    const HexagonTargetMachine& QTM;
+    const HexagonSubtarget &QST;
+
+ public:
+    static char ID;
+    HexagonSplitConst32AndConst64(const HexagonTargetMachine& TM)
+      : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+
+    const char *getPassName() const {
+      return "Hexagon Split Const32s and Const64s";
+    }
+    bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+
+char HexagonSplitConst32AndConst64::ID = 0;
+
+
+bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
+
+  const TargetInstrInfo *TII = QTM.getInstrInfo();
+
+  // Loop over all of the basic blocks
+  for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
+       MBBb != MBBe; ++MBBb) {
+    MachineBasicBlock* MBB = MBBb;
+    // Traverse the basic block
+    MachineBasicBlock::iterator MII = MBB->begin();
+    MachineBasicBlock::iterator MIE = MBB->end ();
+    while (MII != MIE) {
+      MachineInstr *MI = MII;
+      int Opc = MI->getOpcode();
+      if (Opc == Hexagon::CONST32_set) {
+        int DestReg = MI->getOperand(0).getReg();
+        MachineOperand &Symbol = MI->getOperand (1);
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LO), DestReg).addOperand(Symbol);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HI), DestReg).addOperand(Symbol);
+        // MBB->erase returns the iterator to the next instruction, which is the
+        // one we want to process next
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST32_set_jt) {
+        int DestReg = MI->getOperand(0).getReg();
+        MachineOperand &Symbol = MI->getOperand (1);
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LO_jt), DestReg).addOperand(Symbol);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HI_jt), DestReg).addOperand(Symbol);
+        // MBB->erase returns the iterator to the next instruction, which is the
+        // one we want to process next
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST32_Label) {
+        int DestReg = MI->getOperand(0).getReg();
+        MachineOperand &Symbol = MI->getOperand (1);
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LO_label), DestReg).addOperand(Symbol);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HI_label), DestReg).addOperand(Symbol);
+        // MBB->erase returns the iterator to the next instruction, which is the
+        // one we want to process next
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST32_Int_Real) {
+        int DestReg = MI->getOperand(0).getReg();
+        int64_t ImmValue = MI->getOperand(1).getImm ();
+
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LOi), DestReg).addImm(ImmValue);
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HIi), DestReg).addImm(ImmValue);
+        MII = MBB->erase (MI);
+        continue;
+      }
+      else if (Opc == Hexagon::CONST64_Int_Real) {
+        int DestReg = MI->getOperand(0).getReg();
+        int64_t ImmValue = MI->getOperand(1).getImm ();
+        unsigned DestLo =
+          QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_loreg);
+        unsigned DestHi =
+          QTM.getRegisterInfo()->getSubReg (DestReg, Hexagon::subreg_hireg);
+
+        int32_t LowWord = (ImmValue & 0xFFFFFFFF);
+        int32_t HighWord = (ImmValue >> 32) & 0xFFFFFFFF;
+
+        // Lower Registers Lower Half
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LOi), DestLo).addImm(LowWord);
+        // Lower Registers Higher Half
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HIi), DestLo).addImm(LowWord);
+        // Higher Registers Lower Half
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::LOi), DestHi).addImm(HighWord);
+        // Higher Registers Higher Half.
+        BuildMI (*MBB, MII, MI->getDebugLoc(),
+                 TII->get(Hexagon::HIi), DestHi).addImm(HighWord);
+        MII = MBB->erase (MI);
+        continue;
+       }
+      ++MII;
+    }
+  }
+
+  return true;
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+//                         Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+FunctionPass *
+llvm::createHexagonSplitConst32AndConst64(const HexagonTargetMachine &TM) {
+  return new HexagonSplitConst32AndConst64(TM);
+}
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 07d5ce1..fca6707 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -86,3 +86,5 @@ HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
     ModeIEEERndNear = false;
 }
 
+// Pin the vtable to this file.
+void HexagonSubtarget::anchor() {}
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index 76a8fba..690bef0 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -27,7 +27,7 @@
 namespace llvm {
 
 class HexagonSubtarget : public HexagonGenSubtargetInfo {
-
+  virtual void anchor();
   bool UseMemOps;
   bool ModeIEEERndNear;
 
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index caa1ba4..bb950a0 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -15,6 +15,7 @@
 #include "Hexagon.h"
 #include "HexagonISelLowering.h"
 #include "HexagonMachineScheduler.h"
+#include "HexagonTargetObjectFile.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Module.h"
 #include "llvm/PassManager.h"
@@ -78,6 +79,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
     FrameLowering(Subtarget),
     InstrItins(&Subtarget.getInstrItineraryData()) {
     setMCUseCFI(false);
+    initAsmInfo();
 }
 
 // addPassesForOptimizations - Allow the backend (target) to add Target
@@ -100,17 +102,25 @@ class HexagonPassConfig : public TargetPassConfig {
 public:
   HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
     : TargetPassConfig(TM, PM) {
-    // Enable MI scheduler.
-    if (!DisableHexagonMISched) {
+    // FIXME: Rather than calling enablePass(&MachineSchedulerID) below, define
+    // HexagonSubtarget::enableMachineScheduler() { return true; }.
+    // That will bypass the SelectionDAG VLIW scheduler, which is probably just
+    // hurting compile time and will be removed eventually anyway.
+    if (DisableHexagonMISched)
+      disablePass(&MachineSchedulerID);
+    else
       enablePass(&MachineSchedulerID);
-      MachineSchedRegistry::setDefault(createVLIWMachineSched);
-    }
   }
 
   HexagonTargetMachine &getHexagonTargetMachine() const {
     return getTM<HexagonTargetMachine>();
   }
 
+  virtual ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const {
+    return createVLIWMachineSched(C);
+  }
+
   virtual bool addInstSelector();
   virtual bool addPreRegAlloc();
   virtual bool addPostRegAlloc();
@@ -124,7 +134,7 @@ TargetPassConfig *HexagonTargetMachine::createPassConfig(PassManagerBase &PM) {
 }
 
 bool HexagonPassConfig::addInstSelector() {
-  const HexagonTargetMachine &TM = getHexagonTargetMachine();
+  HexagonTargetMachine &TM = getHexagonTargetMachine();
   bool NoOpt = (getOptLevel() == CodeGenOpt::None);
 
   if (!NoOpt)
@@ -156,9 +166,18 @@ bool HexagonPassConfig::addPostRegAlloc() {
 }
 
 bool HexagonPassConfig::addPreSched2() {
+  const HexagonTargetMachine &TM = getHexagonTargetMachine();
+  const HexagonTargetObjectFile &TLOF =
+    (const HexagonTargetObjectFile &)getTargetLowering()->getObjFileLowering();
+
+  addPass(createHexagonCopyToCombine());
   if (getOptLevel() != CodeGenOpt::None)
     addPass(&IfConverterID);
-  return false;
+  if (!TLOF.IsSmallDataEnabled()) {
+    addPass(createHexagonSplitConst32AndConst64(TM));
+    printAndVerify("After hexagon split const32/64 pass");
+  }
+  return true;
 }
 
 bool HexagonPassConfig::addPreEmitPass() {
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 993fcfa..7773cff 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -25,7 +25,8 @@
 using namespace llvm;
 
 static cl::opt<int> SmallDataThreshold("hexagon-small-data-threshold",
-                                cl::init(8), cl::Hidden);
+                                cl::init(8), cl::Hidden,
+                cl::desc("The maximum size of an object in the sdata section"));
 
 void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
                                          const TargetMachine &TM) {
@@ -46,6 +47,11 @@ void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
 static bool IsInSmallSection(uint64_t Size) {
   return Size > 0 && Size <= (uint64_t)SmallDataThreshold;
 }
+
+bool HexagonTargetObjectFile::IsSmallDataEnabled () const {
+  return SmallDataThreshold > 0;
+}
+
 /// IsGlobalInSmallSection - Return true if this global value should be
 /// placed into small data/bss section.
 bool HexagonTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h
index 6933450..41f6792 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -29,6 +29,7 @@ namespace llvm {
     bool IsGlobalInSmallSection(const GlobalValue *GV,
                                 const TargetMachine &TM) const;
 
+    bool IsSmallDataEnabled () const;
     const MCSection* SelectSectionForGlobal(const GlobalValue *GV,
                                             SectionKind Kind,
                                             Mangler *Mang,
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 39995e1..41e382d 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -164,7 +164,6 @@ namespace {
                     unsigned, std::map <MachineInstr*, SUnit*>);
     bool isNewifiable(MachineInstr* MI);
     bool isCondInst(MachineInstr* MI);
-    bool IsNewifyStore (MachineInstr* MI);
     bool tryAllocateResourcesForConstExt(MachineInstr* MI);
     bool canReserveResourcesForConstExt(MachineInstr *MI);
     void reserveResourcesForConstExt(MachineInstr* MI);
@@ -383,104 +382,6 @@ static bool IsControlFlow(MachineInstr* MI) {
   return (MI->getDesc().isTerminator() || MI->getDesc().isCall());
 }
 
-// Function returns true if an instruction can be promoted to the new-value
-// store. It will always return false for v2 and v3.
-// It lists all the conditional and unconditional stores that can be promoted
-// to the new-value stores.
-
-bool HexagonPacketizerList::IsNewifyStore (MachineInstr* MI) {
-  const HexagonRegisterInfo* QRI =
-                          (const HexagonRegisterInfo *) TM.getRegisterInfo();
-  switch (MI->getOpcode())
-  {
-    // store byte
-    case Hexagon::STrib:
-    case Hexagon::STrib_indexed:
-    case Hexagon::STrib_indexed_shl_V4:
-    case Hexagon::STrib_shl_V4:
-    case Hexagon::STb_GP_V4:
-    case Hexagon::POST_STbri:
-    case Hexagon::STrib_cPt:
-    case Hexagon::STrib_cdnPt_V4:
-    case Hexagon::STrib_cNotPt:
-    case Hexagon::STrib_cdnNotPt_V4:
-    case Hexagon::STrib_indexed_cPt:
-    case Hexagon::STrib_indexed_cdnPt_V4:
-    case Hexagon::STrib_indexed_cNotPt:
-    case Hexagon::STrib_indexed_cdnNotPt_V4:
-    case Hexagon::STrib_indexed_shl_cPt_V4:
-    case Hexagon::STrib_indexed_shl_cdnPt_V4:
-    case Hexagon::STrib_indexed_shl_cNotPt_V4:
-    case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
-    case Hexagon::POST_STbri_cPt:
-    case Hexagon::POST_STbri_cdnPt_V4:
-    case Hexagon::POST_STbri_cNotPt:
-    case Hexagon::POST_STbri_cdnNotPt_V4:
-    case Hexagon::STb_GP_cPt_V4:
-    case Hexagon::STb_GP_cNotPt_V4:
-    case Hexagon::STb_GP_cdnPt_V4:
-    case Hexagon::STb_GP_cdnNotPt_V4:
-
-    // store halfword
-    case Hexagon::STrih:
-    case Hexagon::STrih_indexed:
-    case Hexagon::STrih_indexed_shl_V4:
-    case Hexagon::STrih_shl_V4:
-    case Hexagon::STh_GP_V4:
-    case Hexagon::POST_SThri:
-    case Hexagon::STrih_cPt:
-    case Hexagon::STrih_cdnPt_V4:
-    case Hexagon::STrih_cNotPt:
-    case Hexagon::STrih_cdnNotPt_V4:
-    case Hexagon::STrih_indexed_cPt:
-    case Hexagon::STrih_indexed_cdnPt_V4:
-    case Hexagon::STrih_indexed_cNotPt:
-    case Hexagon::STrih_indexed_cdnNotPt_V4:
-    case Hexagon::STrih_indexed_shl_cPt_V4:
-    case Hexagon::STrih_indexed_shl_cdnPt_V4:
-    case Hexagon::STrih_indexed_shl_cNotPt_V4:
-    case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
-    case Hexagon::POST_SThri_cPt:
-    case Hexagon::POST_SThri_cdnPt_V4:
-    case Hexagon::POST_SThri_cNotPt:
-    case Hexagon::POST_SThri_cdnNotPt_V4:
-    case Hexagon::STh_GP_cPt_V4:
-    case Hexagon::STh_GP_cNotPt_V4:
-    case Hexagon::STh_GP_cdnPt_V4:
-    case Hexagon::STh_GP_cdnNotPt_V4:
-
-    // store word
-    case Hexagon::STriw:
-    case Hexagon::STriw_indexed:
-    case Hexagon::STriw_indexed_shl_V4:
-    case Hexagon::STriw_shl_V4:
-    case Hexagon::STw_GP_V4:
-    case Hexagon::POST_STwri:
-    case Hexagon::STriw_cPt:
-    case Hexagon::STriw_cdnPt_V4:
-    case Hexagon::STriw_cNotPt:
-    case Hexagon::STriw_cdnNotPt_V4:
-    case Hexagon::STriw_indexed_cPt:
-    case Hexagon::STriw_indexed_cdnPt_V4:
-    case Hexagon::STriw_indexed_cNotPt:
-    case Hexagon::STriw_indexed_cdnNotPt_V4:
-    case Hexagon::STriw_indexed_shl_cPt_V4:
-    case Hexagon::STriw_indexed_shl_cdnPt_V4:
-    case Hexagon::STriw_indexed_shl_cNotPt_V4:
-    case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
-    case Hexagon::POST_STwri_cPt:
-    case Hexagon::POST_STwri_cdnPt_V4:
-    case Hexagon::POST_STwri_cNotPt:
-    case Hexagon::POST_STwri_cdnNotPt_V4:
-    case Hexagon::STw_GP_cPt_V4:
-    case Hexagon::STw_GP_cNotPt_V4:
-    case Hexagon::STw_GP_cdnPt_V4:
-    case Hexagon::STw_GP_cdnNotPt_V4:
-        return QRI->Subtarget.hasV4TOps();
-  }
-  return false;
-}
-
 static bool IsLoopN(MachineInstr *MI) {
   return (MI->getOpcode() == Hexagon::LOOP0_i ||
           MI->getOpcode() == Hexagon::LOOP0_r);
@@ -498,769 +399,11 @@ static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
   return false;
 }
 
-// Return the new value instruction for a given store.
-static int GetDotNewOp(const int opc) {
-  switch (opc) {
-  default: llvm_unreachable("Unknown .new type");
-  // store new value byte
-  case Hexagon::STrib:
-    return Hexagon::STrib_nv_V4;
-
-  case Hexagon::STrib_indexed:
-    return Hexagon::STrib_indexed_nv_V4;
-
-  case Hexagon::STrib_indexed_shl_V4:
-    return Hexagon::STrib_indexed_shl_nv_V4;
-
-  case Hexagon::STrib_shl_V4:
-    return Hexagon::STrib_shl_nv_V4;
-
-  case Hexagon::STb_GP_V4:
-    return Hexagon::STb_GP_nv_V4;
-
-  case Hexagon::POST_STbri:
-    return Hexagon::POST_STbri_nv_V4;
-
-  case Hexagon::STrib_cPt:
-    return Hexagon::STrib_cPt_nv_V4;
-
-  case Hexagon::STrib_cdnPt_V4:
-    return Hexagon::STrib_cdnPt_nv_V4;
-
-  case Hexagon::STrib_cNotPt:
-    return Hexagon::STrib_cNotPt_nv_V4;
-
-  case Hexagon::STrib_cdnNotPt_V4:
-    return Hexagon::STrib_cdnNotPt_nv_V4;
-
-  case Hexagon::STrib_indexed_cPt:
-    return Hexagon::STrib_indexed_cPt_nv_V4;
-
-  case Hexagon::STrib_indexed_cdnPt_V4:
-    return Hexagon::STrib_indexed_cdnPt_nv_V4;
-
-  case Hexagon::STrib_indexed_cNotPt:
-    return Hexagon::STrib_indexed_cNotPt_nv_V4;
-
-  case Hexagon::STrib_indexed_cdnNotPt_V4:
-    return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
-
-  case Hexagon::STrib_indexed_shl_cPt_V4:
-    return Hexagon::STrib_indexed_shl_cPt_nv_V4;
-
-  case Hexagon::STrib_indexed_shl_cdnPt_V4:
-    return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
-
-  case Hexagon::STrib_indexed_shl_cNotPt_V4:
-    return Hexagon::STrib_indexed_shl_cNotPt_nv_V4;
-
-  case Hexagon::STrib_indexed_shl_cdnNotPt_V4:
-    return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
-
-  case Hexagon::POST_STbri_cPt:
-    return Hexagon::POST_STbri_cPt_nv_V4;
-
-  case Hexagon::POST_STbri_cdnPt_V4:
-    return Hexagon::POST_STbri_cdnPt_nv_V4;
-
-  case Hexagon::POST_STbri_cNotPt:
-    return Hexagon::POST_STbri_cNotPt_nv_V4;
-
-  case Hexagon::POST_STbri_cdnNotPt_V4:
-    return Hexagon::POST_STbri_cdnNotPt_nv_V4;
-
-  case Hexagon::STb_GP_cPt_V4:
-    return Hexagon::STb_GP_cPt_nv_V4;
-
-  case Hexagon::STb_GP_cNotPt_V4:
-    return Hexagon::STb_GP_cNotPt_nv_V4;
-
-  case Hexagon::STb_GP_cdnPt_V4:
-    return Hexagon::STb_GP_cdnPt_nv_V4;
-
-  case Hexagon::STb_GP_cdnNotPt_V4:
-    return Hexagon::STb_GP_cdnNotPt_nv_V4;
-
-  // store new value halfword
-  case Hexagon::STrih:
-    return Hexagon::STrih_nv_V4;
-
-  case Hexagon::STrih_indexed:
-    return Hexagon::STrih_indexed_nv_V4;
-
-  case Hexagon::STrih_indexed_shl_V4:
-    return Hexagon::STrih_indexed_shl_nv_V4;
-
-  case Hexagon::STrih_shl_V4:
-    return Hexagon::STrih_shl_nv_V4;
-
-  case Hexagon::STh_GP_V4:
-    return Hexagon::STh_GP_nv_V4;
-
-  case Hexagon::POST_SThri:
-    return Hexagon::POST_SThri_nv_V4;
-
-  case Hexagon::STrih_cPt:
-    return Hexagon::STrih_cPt_nv_V4;
-
-  case Hexagon::STrih_cdnPt_V4:
-    return Hexagon::STrih_cdnPt_nv_V4;
-
-  case Hexagon::STrih_cNotPt:
-    return Hexagon::STrih_cNotPt_nv_V4;
-
-  case Hexagon::STrih_cdnNotPt_V4:
-    return Hexagon::STrih_cdnNotPt_nv_V4;
-
-  case Hexagon::STrih_indexed_cPt:
-    return Hexagon::STrih_indexed_cPt_nv_V4;
-
-  case Hexagon::STrih_indexed_cdnPt_V4:
-    return Hexagon::STrih_indexed_cdnPt_nv_V4;
-
-  case Hexagon::STrih_indexed_cNotPt:
-    return Hexagon::STrih_indexed_cNotPt_nv_V4;
-
-  case Hexagon::STrih_indexed_cdnNotPt_V4:
-    return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
-
-  case Hexagon::STrih_indexed_shl_cPt_V4:
-    return Hexagon::STrih_indexed_shl_cPt_nv_V4;
-
-  case Hexagon::STrih_indexed_shl_cdnPt_V4:
-    return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
-
-  case Hexagon::STrih_indexed_shl_cNotPt_V4:
-    return Hexagon::STrih_indexed_shl_cNotPt_nv_V4;
-
-  case Hexagon::STrih_indexed_shl_cdnNotPt_V4:
-    return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
-
-  case Hexagon::POST_SThri_cPt:
-    return Hexagon::POST_SThri_cPt_nv_V4;
-
-  case Hexagon::POST_SThri_cdnPt_V4:
-    return Hexagon::POST_SThri_cdnPt_nv_V4;
-
-  case Hexagon::POST_SThri_cNotPt:
-    return Hexagon::POST_SThri_cNotPt_nv_V4;
-
-  case Hexagon::POST_SThri_cdnNotPt_V4:
-    return Hexagon::POST_SThri_cdnNotPt_nv_V4;
-
-  case Hexagon::STh_GP_cPt_V4:
-    return Hexagon::STh_GP_cPt_nv_V4;
-
-  case Hexagon::STh_GP_cNotPt_V4:
-    return Hexagon::STh_GP_cNotPt_nv_V4;
-
-  case Hexagon::STh_GP_cdnPt_V4:
-    return Hexagon::STh_GP_cdnPt_nv_V4;
-
-  case Hexagon::STh_GP_cdnNotPt_V4:
-    return Hexagon::STh_GP_cdnNotPt_nv_V4;
-
-  // store new value word
-  case Hexagon::STriw:
-    return Hexagon::STriw_nv_V4;
-
-  case Hexagon::STriw_indexed:
-    return Hexagon::STriw_indexed_nv_V4;
-
-  case Hexagon::STriw_indexed_shl_V4:
-    return Hexagon::STriw_indexed_shl_nv_V4;
-
-  case Hexagon::STriw_shl_V4:
-    return Hexagon::STriw_shl_nv_V4;
-
-  case Hexagon::STw_GP_V4:
-    return Hexagon::STw_GP_nv_V4;
-
-  case Hexagon::POST_STwri:
-    return Hexagon::POST_STwri_nv_V4;
-
-  case Hexagon::STriw_cPt:
-    return Hexagon::STriw_cPt_nv_V4;
-
-  case Hexagon::STriw_cdnPt_V4:
-    return Hexagon::STriw_cdnPt_nv_V4;
-
-  case Hexagon::STriw_cNotPt:
-    return Hexagon::STriw_cNotPt_nv_V4;
-
-  case Hexagon::STriw_cdnNotPt_V4:
-    return Hexagon::STriw_cdnNotPt_nv_V4;
-
-  case Hexagon::STriw_indexed_cPt:
-    return Hexagon::STriw_indexed_cPt_nv_V4;
-
-  case Hexagon::STriw_indexed_cdnPt_V4:
-    return Hexagon::STriw_indexed_cdnPt_nv_V4;
-
-  case Hexagon::STriw_indexed_cNotPt:
-    return Hexagon::STriw_indexed_cNotPt_nv_V4;
-
-  case Hexagon::STriw_indexed_cdnNotPt_V4:
-    return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
-
-  case Hexagon::STriw_indexed_shl_cPt_V4:
-    return Hexagon::STriw_indexed_shl_cPt_nv_V4;
-
-  case Hexagon::STriw_indexed_shl_cdnPt_V4:
-    return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
-
-  case Hexagon::STriw_indexed_shl_cNotPt_V4:
-    return Hexagon::STriw_indexed_shl_cNotPt_nv_V4;
-
-  case Hexagon::STriw_indexed_shl_cdnNotPt_V4:
-    return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
-
-  case Hexagon::POST_STwri_cPt:
-    return Hexagon::POST_STwri_cPt_nv_V4;
-
-  case Hexagon::POST_STwri_cdnPt_V4:
-    return Hexagon::POST_STwri_cdnPt_nv_V4;
-
-  case Hexagon::POST_STwri_cNotPt:
-    return Hexagon::POST_STwri_cNotPt_nv_V4;
-
-  case Hexagon::POST_STwri_cdnNotPt_V4:
-    return Hexagon::POST_STwri_cdnNotPt_nv_V4;
-
-  case Hexagon::STw_GP_cPt_V4:
-    return Hexagon::STw_GP_cPt_nv_V4;
-
-  case Hexagon::STw_GP_cNotPt_V4:
-    return Hexagon::STw_GP_cNotPt_nv_V4;
-
-  case Hexagon::STw_GP_cdnPt_V4:
-    return Hexagon::STw_GP_cdnPt_nv_V4;
-
-  case Hexagon::STw_GP_cdnNotPt_V4:
-    return Hexagon::STw_GP_cdnNotPt_nv_V4;
-
-  }
-}
-
-// Return .new predicate version for an instruction
-static int GetDotNewPredOp(MachineInstr *MI,
-                           const MachineBranchProbabilityInfo *MBPI,
-                           const HexagonInstrInfo *QII) {
-  switch (MI->getOpcode()) {
-  default: llvm_unreachable("Unknown .new type");
-  // Conditional stores
-  // Store byte conditionally
-  case Hexagon::STrib_cPt :
-    return Hexagon::STrib_cdnPt_V4;
-
-  case Hexagon::STrib_cNotPt :
-    return Hexagon::STrib_cdnNotPt_V4;
-
-  case Hexagon::STrib_indexed_cPt :
-    return Hexagon::STrib_indexed_cdnPt_V4;
-
-  case Hexagon::STrib_indexed_cNotPt :
-    return Hexagon::STrib_indexed_cdnNotPt_V4;
-
-  case Hexagon::STrib_imm_cPt_V4 :
-    return Hexagon::STrib_imm_cdnPt_V4;
-
-  case Hexagon::STrib_imm_cNotPt_V4 :
-    return Hexagon::STrib_imm_cdnNotPt_V4;
-
-  case Hexagon::POST_STbri_cPt :
-    return Hexagon::POST_STbri_cdnPt_V4;
-
-  case Hexagon::POST_STbri_cNotPt :
-    return Hexagon::POST_STbri_cdnNotPt_V4;
-
-  case Hexagon::STrib_indexed_shl_cPt_V4 :
-    return Hexagon::STrib_indexed_shl_cdnPt_V4;
-
-  case Hexagon::STrib_indexed_shl_cNotPt_V4 :
-    return Hexagon::STrib_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::STb_GP_cPt_V4 :
-    return Hexagon::STb_GP_cdnPt_V4;
-
-  case Hexagon::STb_GP_cNotPt_V4 :
-    return Hexagon::STb_GP_cdnNotPt_V4;
-
-  // Store doubleword conditionally
-  case Hexagon::STrid_cPt :
-    return Hexagon::STrid_cdnPt_V4;
-
-  case Hexagon::STrid_cNotPt :
-    return Hexagon::STrid_cdnNotPt_V4;
-
-  case Hexagon::STrid_indexed_cPt :
-    return Hexagon::STrid_indexed_cdnPt_V4;
-
-  case Hexagon::STrid_indexed_cNotPt :
-    return Hexagon::STrid_indexed_cdnNotPt_V4;
-
-  case Hexagon::STrid_indexed_shl_cPt_V4 :
-    return Hexagon::STrid_indexed_shl_cdnPt_V4;
-
-  case Hexagon::STrid_indexed_shl_cNotPt_V4 :
-    return Hexagon::STrid_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::POST_STdri_cPt :
-    return Hexagon::POST_STdri_cdnPt_V4;
-
-  case Hexagon::POST_STdri_cNotPt :
-    return Hexagon::POST_STdri_cdnNotPt_V4;
-
-  case Hexagon::STd_GP_cPt_V4 :
-    return Hexagon::STd_GP_cdnPt_V4;
-
-  case Hexagon::STd_GP_cNotPt_V4 :
-    return Hexagon::STd_GP_cdnNotPt_V4;
-
-  // Store halfword conditionally
-  case Hexagon::STrih_cPt :
-    return Hexagon::STrih_cdnPt_V4;
-
-  case Hexagon::STrih_cNotPt :
-    return Hexagon::STrih_cdnNotPt_V4;
-
-  case Hexagon::STrih_indexed_cPt :
-    return Hexagon::STrih_indexed_cdnPt_V4;
-
-  case Hexagon::STrih_indexed_cNotPt :
-    return Hexagon::STrih_indexed_cdnNotPt_V4;
-
-  case Hexagon::STrih_imm_cPt_V4 :
-    return Hexagon::STrih_imm_cdnPt_V4;
-
-  case Hexagon::STrih_imm_cNotPt_V4 :
-    return Hexagon::STrih_imm_cdnNotPt_V4;
-
-  case Hexagon::STrih_indexed_shl_cPt_V4 :
-    return Hexagon::STrih_indexed_shl_cdnPt_V4;
-
-  case Hexagon::STrih_indexed_shl_cNotPt_V4 :
-    return Hexagon::STrih_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::POST_SThri_cPt :
-    return Hexagon::POST_SThri_cdnPt_V4;
-
-  case Hexagon::POST_SThri_cNotPt :
-    return Hexagon::POST_SThri_cdnNotPt_V4;
-
-  case Hexagon::STh_GP_cPt_V4 :
-    return Hexagon::STh_GP_cdnPt_V4;
-
-  case Hexagon::STh_GP_cNotPt_V4 :
-    return Hexagon::STh_GP_cdnNotPt_V4;
-
-  // Store word conditionally
-  case Hexagon::STriw_cPt :
-    return Hexagon::STriw_cdnPt_V4;
-
-  case Hexagon::STriw_cNotPt :
-    return Hexagon::STriw_cdnNotPt_V4;
-
-  case Hexagon::STriw_indexed_cPt :
-    return Hexagon::STriw_indexed_cdnPt_V4;
-
-  case Hexagon::STriw_indexed_cNotPt :
-    return Hexagon::STriw_indexed_cdnNotPt_V4;
-
-  case Hexagon::STriw_imm_cPt_V4 :
-    return Hexagon::STriw_imm_cdnPt_V4;
-
-  case Hexagon::STriw_imm_cNotPt_V4 :
-    return Hexagon::STriw_imm_cdnNotPt_V4;
-
-  case Hexagon::STriw_indexed_shl_cPt_V4 :
-    return Hexagon::STriw_indexed_shl_cdnPt_V4;
-
-  case Hexagon::STriw_indexed_shl_cNotPt_V4 :
-    return Hexagon::STriw_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::POST_STwri_cPt :
-    return Hexagon::POST_STwri_cdnPt_V4;
-
-  case Hexagon::POST_STwri_cNotPt :
-    return Hexagon::POST_STwri_cdnNotPt_V4;
-
-  case Hexagon::STw_GP_cPt_V4 :
-    return Hexagon::STw_GP_cdnPt_V4;
-
-  case Hexagon::STw_GP_cNotPt_V4 :
-    return Hexagon::STw_GP_cdnNotPt_V4;
-
-  // Condtional Jumps
-  case Hexagon::JMP_t:
-  case Hexagon::JMP_f:
-    return QII->getDotNewPredJumpOp(MI, MBPI);
-
-  case Hexagon::JMPR_t:
-    return Hexagon::JMPR_tnew_tV3;
-
-  case Hexagon::JMPR_f:
-    return Hexagon::JMPR_fnew_tV3;
-
-  // Conditional Transfers
-  case Hexagon::TFR_cPt:
-    return Hexagon::TFR_cdnPt;
-
-  case Hexagon::TFR_cNotPt:
-    return Hexagon::TFR_cdnNotPt;
-
-  case Hexagon::TFRI_cPt:
-    return Hexagon::TFRI_cdnPt;
-
-  case Hexagon::TFRI_cNotPt:
-    return Hexagon::TFRI_cdnNotPt;
-
-  // Load double word
-  case Hexagon::LDrid_cPt :
-    return Hexagon::LDrid_cdnPt;
-
-  case Hexagon::LDrid_cNotPt :
-    return Hexagon::LDrid_cdnNotPt;
-
-  case Hexagon::LDrid_indexed_cPt :
-    return Hexagon::LDrid_indexed_cdnPt;
-
-  case Hexagon::LDrid_indexed_cNotPt :
-    return Hexagon::LDrid_indexed_cdnNotPt;
-
-  case Hexagon::POST_LDrid_cPt :
-    return Hexagon::POST_LDrid_cdnPt_V4;
-
-  case Hexagon::POST_LDrid_cNotPt :
-    return Hexagon::POST_LDrid_cdnNotPt_V4;
-
-  // Load word
-  case Hexagon::LDriw_cPt :
-    return Hexagon::LDriw_cdnPt;
-
-  case Hexagon::LDriw_cNotPt :
-    return Hexagon::LDriw_cdnNotPt;
-
-  case Hexagon::LDriw_indexed_cPt :
-    return Hexagon::LDriw_indexed_cdnPt;
-
-  case Hexagon::LDriw_indexed_cNotPt :
-    return Hexagon::LDriw_indexed_cdnNotPt;
-
-  case Hexagon::POST_LDriw_cPt :
-    return Hexagon::POST_LDriw_cdnPt_V4;
-
-  case Hexagon::POST_LDriw_cNotPt :
-    return Hexagon::POST_LDriw_cdnNotPt_V4;
-
-  // Load halfword
-  case Hexagon::LDrih_cPt :
-    return Hexagon::LDrih_cdnPt;
-
-  case Hexagon::LDrih_cNotPt :
-    return Hexagon::LDrih_cdnNotPt;
-
-  case Hexagon::LDrih_indexed_cPt :
-    return Hexagon::LDrih_indexed_cdnPt;
-
-  case Hexagon::LDrih_indexed_cNotPt :
-    return Hexagon::LDrih_indexed_cdnNotPt;
-
-  case Hexagon::POST_LDrih_cPt :
-    return Hexagon::POST_LDrih_cdnPt_V4;
-
-  case Hexagon::POST_LDrih_cNotPt :
-    return Hexagon::POST_LDrih_cdnNotPt_V4;
-
-  // Load byte
-  case Hexagon::LDrib_cPt :
-    return Hexagon::LDrib_cdnPt;
-
-  case Hexagon::LDrib_cNotPt :
-    return Hexagon::LDrib_cdnNotPt;
-
-  case Hexagon::LDrib_indexed_cPt :
-    return Hexagon::LDrib_indexed_cdnPt;
-
-  case Hexagon::LDrib_indexed_cNotPt :
-    return Hexagon::LDrib_indexed_cdnNotPt;
-
-  case Hexagon::POST_LDrib_cPt :
-    return Hexagon::POST_LDrib_cdnPt_V4;
-
-  case Hexagon::POST_LDrib_cNotPt :
-    return Hexagon::POST_LDrib_cdnNotPt_V4;
-
-  // Load unsigned halfword
-  case Hexagon::LDriuh_cPt :
-    return Hexagon::LDriuh_cdnPt;
-
-  case Hexagon::LDriuh_cNotPt :
-    return Hexagon::LDriuh_cdnNotPt;
-
-  case Hexagon::LDriuh_indexed_cPt :
-    return Hexagon::LDriuh_indexed_cdnPt;
-
-  case Hexagon::LDriuh_indexed_cNotPt :
-    return Hexagon::LDriuh_indexed_cdnNotPt;
-
-  case Hexagon::POST_LDriuh_cPt :
-    return Hexagon::POST_LDriuh_cdnPt_V4;
-
-  case Hexagon::POST_LDriuh_cNotPt :
-    return Hexagon::POST_LDriuh_cdnNotPt_V4;
-
-  // Load unsigned byte
-  case Hexagon::LDriub_cPt :
-    return Hexagon::LDriub_cdnPt;
-
-  case Hexagon::LDriub_cNotPt :
-    return Hexagon::LDriub_cdnNotPt;
-
-  case Hexagon::LDriub_indexed_cPt :
-    return Hexagon::LDriub_indexed_cdnPt;
-
-  case Hexagon::LDriub_indexed_cNotPt :
-    return Hexagon::LDriub_indexed_cdnNotPt;
-
-  case Hexagon::POST_LDriub_cPt :
-    return Hexagon::POST_LDriub_cdnPt_V4;
-
-  case Hexagon::POST_LDriub_cNotPt :
-    return Hexagon::POST_LDriub_cdnNotPt_V4;
-
-  // V4 indexed+scaled load
-
-  case Hexagon::LDrid_indexed_shl_cPt_V4 :
-    return Hexagon::LDrid_indexed_shl_cdnPt_V4;
-
-  case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
-    return Hexagon::LDrid_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::LDrib_indexed_shl_cPt_V4 :
-    return Hexagon::LDrib_indexed_shl_cdnPt_V4;
-
-  case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
-    return Hexagon::LDrib_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::LDriub_indexed_shl_cPt_V4 :
-    return Hexagon::LDriub_indexed_shl_cdnPt_V4;
-
-  case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
-    return Hexagon::LDriub_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::LDrih_indexed_shl_cPt_V4 :
-    return Hexagon::LDrih_indexed_shl_cdnPt_V4;
-
-  case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
-    return Hexagon::LDrih_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::LDriuh_indexed_shl_cPt_V4 :
-    return Hexagon::LDriuh_indexed_shl_cdnPt_V4;
-
-  case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
-    return Hexagon::LDriuh_indexed_shl_cdnNotPt_V4;
-
-  case Hexagon::LDriw_indexed_shl_cPt_V4 :
-    return Hexagon::LDriw_indexed_shl_cdnPt_V4;
-
-  case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
-    return Hexagon::LDriw_indexed_shl_cdnNotPt_V4;
-
-  // V4 global address load
-
-  case Hexagon::LDd_GP_cPt_V4:
-    return Hexagon::LDd_GP_cdnPt_V4;
-
-  case Hexagon::LDd_GP_cNotPt_V4:
-    return Hexagon::LDd_GP_cdnNotPt_V4;
-
-  case Hexagon::LDb_GP_cPt_V4:
-    return Hexagon::LDb_GP_cdnPt_V4;
-
-  case Hexagon::LDb_GP_cNotPt_V4:
-    return Hexagon::LDb_GP_cdnNotPt_V4;
-
-  case Hexagon::LDub_GP_cPt_V4:
-    return Hexagon::LDub_GP_cdnPt_V4;
-
-  case Hexagon::LDub_GP_cNotPt_V4:
-    return Hexagon::LDub_GP_cdnNotPt_V4;
-
-  case Hexagon::LDh_GP_cPt_V4:
-    return Hexagon::LDh_GP_cdnPt_V4;
-
-  case Hexagon::LDh_GP_cNotPt_V4:
-    return Hexagon::LDh_GP_cdnNotPt_V4;
-
-  case Hexagon::LDuh_GP_cPt_V4:
-    return Hexagon::LDuh_GP_cdnPt_V4;
-
-  case Hexagon::LDuh_GP_cNotPt_V4:
-    return Hexagon::LDuh_GP_cdnNotPt_V4;
-
-  case Hexagon::LDw_GP_cPt_V4:
-    return Hexagon::LDw_GP_cdnPt_V4;
-
-  case Hexagon::LDw_GP_cNotPt_V4:
-    return Hexagon::LDw_GP_cdnNotPt_V4;
-
-  // Conditional store new-value byte
-  case Hexagon::STrib_cPt_nv_V4 :
-    return Hexagon::STrib_cdnPt_nv_V4;
-  case Hexagon::STrib_cNotPt_nv_V4 :
-    return Hexagon::STrib_cdnNotPt_nv_V4;
-
-  case Hexagon::STrib_indexed_cPt_nv_V4 :
-    return Hexagon::STrib_indexed_cdnPt_nv_V4;
-  case Hexagon::STrib_indexed_cNotPt_nv_V4 :
-    return Hexagon::STrib_indexed_cdnNotPt_nv_V4;
-
-  case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
-    return Hexagon::STrib_indexed_shl_cdnPt_nv_V4;
-  case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
-    return Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4;
-
-  case Hexagon::POST_STbri_cPt_nv_V4 :
-    return Hexagon::POST_STbri_cdnPt_nv_V4;
-  case Hexagon::POST_STbri_cNotPt_nv_V4 :
-    return Hexagon::POST_STbri_cdnNotPt_nv_V4;
-
-  case Hexagon::STb_GP_cPt_nv_V4 :
-    return Hexagon::STb_GP_cdnPt_nv_V4;
-
-  case Hexagon::STb_GP_cNotPt_nv_V4 :
-    return Hexagon::STb_GP_cdnNotPt_nv_V4;
-
-  // Conditional store new-value halfword
-  case Hexagon::STrih_cPt_nv_V4 :
-    return Hexagon::STrih_cdnPt_nv_V4;
-  case Hexagon::STrih_cNotPt_nv_V4 :
-    return Hexagon::STrih_cdnNotPt_nv_V4;
-
-  case Hexagon::STrih_indexed_cPt_nv_V4 :
-    return Hexagon::STrih_indexed_cdnPt_nv_V4;
-  case Hexagon::STrih_indexed_cNotPt_nv_V4 :
-    return Hexagon::STrih_indexed_cdnNotPt_nv_V4;
-
-  case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
-    return Hexagon::STrih_indexed_shl_cdnPt_nv_V4;
-  case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
-    return Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4;
-
-  case Hexagon::POST_SThri_cPt_nv_V4 :
-    return Hexagon::POST_SThri_cdnPt_nv_V4;
-  case Hexagon::POST_SThri_cNotPt_nv_V4 :
-    return Hexagon::POST_SThri_cdnNotPt_nv_V4;
-
-  case Hexagon::STh_GP_cPt_nv_V4 :
-    return Hexagon::STh_GP_cdnPt_nv_V4;
-
-  case Hexagon::STh_GP_cNotPt_nv_V4 :
-    return Hexagon::STh_GP_cdnNotPt_nv_V4;
-
-  // Conditional store new-value word
-  case Hexagon::STriw_cPt_nv_V4 :
-    return  Hexagon::STriw_cdnPt_nv_V4;
-  case Hexagon::STriw_cNotPt_nv_V4 :
-    return Hexagon::STriw_cdnNotPt_nv_V4;
-
-  case Hexagon::STriw_indexed_cPt_nv_V4 :
-    return Hexagon::STriw_indexed_cdnPt_nv_V4;
-  case Hexagon::STriw_indexed_cNotPt_nv_V4 :
-    return Hexagon::STriw_indexed_cdnNotPt_nv_V4;
-
-  case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
-    return Hexagon::STriw_indexed_shl_cdnPt_nv_V4;
-  case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
-    return Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4;
-
-  case Hexagon::POST_STwri_cPt_nv_V4 :
-    return Hexagon::POST_STwri_cdnPt_nv_V4;
-  case Hexagon::POST_STwri_cNotPt_nv_V4:
-    return Hexagon::POST_STwri_cdnNotPt_nv_V4;
-
-  case Hexagon::STw_GP_cPt_nv_V4 :
-    return Hexagon::STw_GP_cdnPt_nv_V4;
-
-  case Hexagon::STw_GP_cNotPt_nv_V4 :
-    return Hexagon::STw_GP_cdnNotPt_nv_V4;
-
-  // Conditional add
-  case Hexagon::ADD_ri_cPt :
-    return Hexagon::ADD_ri_cdnPt;
-  case Hexagon::ADD_ri_cNotPt :
-    return Hexagon::ADD_ri_cdnNotPt;
-
-  case Hexagon::ADD_rr_cPt :
-    return Hexagon::ADD_rr_cdnPt;
-  case Hexagon::ADD_rr_cNotPt :
-    return Hexagon::ADD_rr_cdnNotPt;
-
-  // Conditional logical Operations
-  case Hexagon::XOR_rr_cPt :
-    return Hexagon::XOR_rr_cdnPt;
-  case Hexagon::XOR_rr_cNotPt :
-    return Hexagon::XOR_rr_cdnNotPt;
-
-  case Hexagon::AND_rr_cPt :
-    return Hexagon::AND_rr_cdnPt;
-  case Hexagon::AND_rr_cNotPt :
-    return Hexagon::AND_rr_cdnNotPt;
-
-  case Hexagon::OR_rr_cPt :
-    return Hexagon::OR_rr_cdnPt;
-  case Hexagon::OR_rr_cNotPt :
-    return Hexagon::OR_rr_cdnNotPt;
-
-  // Conditional Subtract
-  case Hexagon::SUB_rr_cPt :
-    return Hexagon::SUB_rr_cdnPt;
-  case Hexagon::SUB_rr_cNotPt :
-    return Hexagon::SUB_rr_cdnNotPt;
-
-  // Conditional combine
-  case Hexagon::COMBINE_rr_cPt :
-    return Hexagon::COMBINE_rr_cdnPt;
-  case Hexagon::COMBINE_rr_cNotPt :
-    return Hexagon::COMBINE_rr_cdnNotPt;
-
-  case Hexagon::ASLH_cPt_V4 :
-    return Hexagon::ASLH_cdnPt_V4;
-  case Hexagon::ASLH_cNotPt_V4 :
-    return Hexagon::ASLH_cdnNotPt_V4;
-
-  case Hexagon::ASRH_cPt_V4 :
-    return Hexagon::ASRH_cdnPt_V4;
-  case Hexagon::ASRH_cNotPt_V4 :
-    return Hexagon::ASRH_cdnNotPt_V4;
-
-  case Hexagon::SXTB_cPt_V4 :
-    return Hexagon::SXTB_cdnPt_V4;
-  case Hexagon::SXTB_cNotPt_V4 :
-    return Hexagon::SXTB_cdnNotPt_V4;
-
-  case Hexagon::SXTH_cPt_V4 :
-    return Hexagon::SXTH_cdnPt_V4;
-  case Hexagon::SXTH_cNotPt_V4 :
-    return Hexagon::SXTH_cdnNotPt_V4;
-
-  case Hexagon::ZXTB_cPt_V4 :
-    return Hexagon::ZXTB_cdnPt_V4;
-  case Hexagon::ZXTB_cNotPt_V4 :
-    return Hexagon::ZXTB_cdnNotPt_V4;
-
-  case Hexagon::ZXTH_cPt_V4 :
-    return Hexagon::ZXTH_cdnPt_V4;
-  case Hexagon::ZXTH_cNotPt_V4 :
-    return Hexagon::ZXTH_cdnNotPt_V4;
-  }
-}
-
 // Returns true if an instruction can be promoted to .new predicate
 // or new-value store.
 bool HexagonPacketizerList::isNewifiable(MachineInstr* MI) {
-  if ( isCondInst(MI) || IsNewifyStore(MI))
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  if ( isCondInst(MI) || QII->mayBeNewStore(MI))
     return true;
   else
     return false;
@@ -1294,896 +437,38 @@ bool HexagonPacketizerList::PromoteToDotNew(MachineInstr* MI,
 
   int NewOpcode;
   if (RC == &Hexagon::PredRegsRegClass)
-    NewOpcode = GetDotNewPredOp(MI, MBPI, QII);
+    NewOpcode = QII->GetDotNewPredOp(MI, MBPI);
   else
-    NewOpcode = GetDotNewOp(MI->getOpcode());
+    NewOpcode = QII->GetDotNewOp(MI);
   MI->setDesc(QII->get(NewOpcode));
 
   return true;
 }
 
-// Returns the most basic instruction for the .new predicated instructions and
-// new-value stores.
-// For example, all of the following instructions will be converted back to the
-// same instruction:
-// 1) if (p0.new) memw(R0+#0) = R1.new  --->
-// 2) if (p0) memw(R0+#0)= R1.new      -------> if (p0) memw(R0+#0) = R1
-// 3) if (p0.new) memw(R0+#0) = R1      --->
-//
-// To understand the translation of instruction 1 to its original form, consider
-// a packet with 3 instructions.
-// { p0 = cmp.eq(R0,R1)
-//   if (p0.new) R2 = add(R3, R4)
-//   R5 = add (R3, R1)
-//   }
-// if (p0) memw(R5+#0) = R2 <--- trying to include it in the previous packet
-//
-// This instruction can be part of the previous packet only if both p0 and R2
-// are promoted to .new values. This promotion happens in steps, first
-// predicate register is promoted to .new and in the next iteration R2 is
-// promoted. Therefore, in case of dependence check failure (due to R5) during
-// next iteration, it should be converted back to its most basic form.
-
-static int GetDotOldOp(const int opc) {
-  switch (opc) {
-  default: llvm_unreachable("Unknown .old type");
-  case Hexagon::TFR_cdnPt:
-    return Hexagon::TFR_cPt;
-
-  case Hexagon::TFR_cdnNotPt:
-    return Hexagon::TFR_cNotPt;
-
-  case Hexagon::TFRI_cdnPt:
-    return Hexagon::TFRI_cPt;
-
-  case Hexagon::TFRI_cdnNotPt:
-    return Hexagon::TFRI_cNotPt;
-
-  case Hexagon::JMP_tnew_t:
-    return Hexagon::JMP_t;
-
-  case Hexagon::JMP_fnew_t:
-    return Hexagon::JMP_f;
-
-  case Hexagon::JMPR_tnew_tV3:
-    return Hexagon::JMPR_t;
-
-  case Hexagon::JMPR_fnew_tV3:
-    return Hexagon::JMPR_f;
-
-  // Load double word
-
-  case Hexagon::LDrid_cdnPt :
-    return Hexagon::LDrid_cPt;
-
-  case Hexagon::LDrid_cdnNotPt :
-    return Hexagon::LDrid_cNotPt;
-
-  case Hexagon::LDrid_indexed_cdnPt :
-    return Hexagon::LDrid_indexed_cPt;
-
-  case Hexagon::LDrid_indexed_cdnNotPt :
-    return Hexagon::LDrid_indexed_cNotPt;
-
-  case Hexagon::POST_LDrid_cdnPt_V4 :
-    return Hexagon::POST_LDrid_cPt;
-
-  case Hexagon::POST_LDrid_cdnNotPt_V4 :
-    return Hexagon::POST_LDrid_cNotPt;
-
-  // Load word
-
-  case Hexagon::LDriw_cdnPt :
-    return Hexagon::LDriw_cPt;
-
-  case Hexagon::LDriw_cdnNotPt :
-    return Hexagon::LDriw_cNotPt;
-
-  case Hexagon::LDriw_indexed_cdnPt :
-    return Hexagon::LDriw_indexed_cPt;
-
-  case Hexagon::LDriw_indexed_cdnNotPt :
-    return Hexagon::LDriw_indexed_cNotPt;
-
-  case Hexagon::POST_LDriw_cdnPt_V4 :
-    return Hexagon::POST_LDriw_cPt;
-
-  case Hexagon::POST_LDriw_cdnNotPt_V4 :
-    return Hexagon::POST_LDriw_cNotPt;
-
-  // Load half
-
-  case Hexagon::LDrih_cdnPt :
-    return Hexagon::LDrih_cPt;
-
-  case Hexagon::LDrih_cdnNotPt :
-    return Hexagon::LDrih_cNotPt;
-
-  case Hexagon::LDrih_indexed_cdnPt :
-    return Hexagon::LDrih_indexed_cPt;
-
-  case Hexagon::LDrih_indexed_cdnNotPt :
-    return Hexagon::LDrih_indexed_cNotPt;
-
-  case Hexagon::POST_LDrih_cdnPt_V4 :
-    return Hexagon::POST_LDrih_cPt;
-
-  case Hexagon::POST_LDrih_cdnNotPt_V4 :
-    return Hexagon::POST_LDrih_cNotPt;
-
-  // Load byte
-
-  case Hexagon::LDrib_cdnPt :
-    return Hexagon::LDrib_cPt;
-
-  case Hexagon::LDrib_cdnNotPt :
-    return Hexagon::LDrib_cNotPt;
-
-  case Hexagon::LDrib_indexed_cdnPt :
-    return Hexagon::LDrib_indexed_cPt;
-
-  case Hexagon::LDrib_indexed_cdnNotPt :
-    return Hexagon::LDrib_indexed_cNotPt;
-
-  case Hexagon::POST_LDrib_cdnPt_V4 :
-    return Hexagon::POST_LDrib_cPt;
-
-  case Hexagon::POST_LDrib_cdnNotPt_V4 :
-    return Hexagon::POST_LDrib_cNotPt;
-
-  // Load unsigned half
-
-  case Hexagon::LDriuh_cdnPt :
-    return Hexagon::LDriuh_cPt;
-
-  case Hexagon::LDriuh_cdnNotPt :
-    return Hexagon::LDriuh_cNotPt;
-
-  case Hexagon::LDriuh_indexed_cdnPt :
-    return Hexagon::LDriuh_indexed_cPt;
-
-  case Hexagon::LDriuh_indexed_cdnNotPt :
-    return Hexagon::LDriuh_indexed_cNotPt;
-
-  case Hexagon::POST_LDriuh_cdnPt_V4 :
-    return Hexagon::POST_LDriuh_cPt;
-
-  case Hexagon::POST_LDriuh_cdnNotPt_V4 :
-    return Hexagon::POST_LDriuh_cNotPt;
-
-  // Load unsigned byte
-  case Hexagon::LDriub_cdnPt :
-    return Hexagon::LDriub_cPt;
-
-  case Hexagon::LDriub_cdnNotPt :
-    return Hexagon::LDriub_cNotPt;
-
-  case Hexagon::LDriub_indexed_cdnPt :
-    return Hexagon::LDriub_indexed_cPt;
-
-  case Hexagon::LDriub_indexed_cdnNotPt :
-    return Hexagon::LDriub_indexed_cNotPt;
-
-  case Hexagon::POST_LDriub_cdnPt_V4 :
-    return Hexagon::POST_LDriub_cPt;
-
-  case Hexagon::POST_LDriub_cdnNotPt_V4 :
-    return Hexagon::POST_LDriub_cNotPt;
-
-  // V4 indexed+scaled Load
-
-  case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
-    return Hexagon::LDrid_indexed_shl_cPt_V4;
-
-  case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::LDrid_indexed_shl_cNotPt_V4;
-
-  case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
-    return Hexagon::LDrib_indexed_shl_cPt_V4;
-
-  case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::LDrib_indexed_shl_cNotPt_V4;
-
-  case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
-    return Hexagon::LDriub_indexed_shl_cPt_V4;
-
-  case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::LDriub_indexed_shl_cNotPt_V4;
-
-  case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
-    return Hexagon::LDrih_indexed_shl_cPt_V4;
-
-  case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::LDrih_indexed_shl_cNotPt_V4;
-
-  case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
-    return Hexagon::LDriuh_indexed_shl_cPt_V4;
-
-  case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::LDriuh_indexed_shl_cNotPt_V4;
-
-  case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
-    return Hexagon::LDriw_indexed_shl_cPt_V4;
-
-  case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::LDriw_indexed_shl_cNotPt_V4;
-
-  // V4 global address load
-
-  case Hexagon::LDd_GP_cdnPt_V4:
-    return Hexagon::LDd_GP_cPt_V4;
-
-  case Hexagon::LDd_GP_cdnNotPt_V4:
-    return Hexagon::LDd_GP_cNotPt_V4;
-
-  case Hexagon::LDb_GP_cdnPt_V4:
-    return Hexagon::LDb_GP_cPt_V4;
-
-  case Hexagon::LDb_GP_cdnNotPt_V4:
-    return Hexagon::LDb_GP_cNotPt_V4;
-
-  case Hexagon::LDub_GP_cdnPt_V4:
-    return Hexagon::LDub_GP_cPt_V4;
-
-  case Hexagon::LDub_GP_cdnNotPt_V4:
-    return Hexagon::LDub_GP_cNotPt_V4;
-
-  case Hexagon::LDh_GP_cdnPt_V4:
-    return Hexagon::LDh_GP_cPt_V4;
-
-  case Hexagon::LDh_GP_cdnNotPt_V4:
-    return Hexagon::LDh_GP_cNotPt_V4;
-
-  case Hexagon::LDuh_GP_cdnPt_V4:
-    return Hexagon::LDuh_GP_cPt_V4;
-
-  case Hexagon::LDuh_GP_cdnNotPt_V4:
-    return Hexagon::LDuh_GP_cNotPt_V4;
-
-  case Hexagon::LDw_GP_cdnPt_V4:
-    return Hexagon::LDw_GP_cPt_V4;
-
-  case Hexagon::LDw_GP_cdnNotPt_V4:
-    return Hexagon::LDw_GP_cNotPt_V4;
-
-  // Conditional add
-
-  case Hexagon::ADD_ri_cdnPt :
-    return Hexagon::ADD_ri_cPt;
-  case Hexagon::ADD_ri_cdnNotPt :
-    return Hexagon::ADD_ri_cNotPt;
-
-  case Hexagon::ADD_rr_cdnPt :
-    return Hexagon::ADD_rr_cPt;
-  case Hexagon::ADD_rr_cdnNotPt:
-    return Hexagon::ADD_rr_cNotPt;
-
-  // Conditional logical Operations
-
-  case Hexagon::XOR_rr_cdnPt :
-    return Hexagon::XOR_rr_cPt;
-  case Hexagon::XOR_rr_cdnNotPt :
-    return Hexagon::XOR_rr_cNotPt;
-
-  case Hexagon::AND_rr_cdnPt :
-    return Hexagon::AND_rr_cPt;
-  case Hexagon::AND_rr_cdnNotPt :
-    return Hexagon::AND_rr_cNotPt;
-
-  case Hexagon::OR_rr_cdnPt :
-    return Hexagon::OR_rr_cPt;
-  case Hexagon::OR_rr_cdnNotPt :
-    return Hexagon::OR_rr_cNotPt;
-
-  // Conditional Subtract
-
-  case Hexagon::SUB_rr_cdnPt :
-    return Hexagon::SUB_rr_cPt;
-  case Hexagon::SUB_rr_cdnNotPt :
-    return Hexagon::SUB_rr_cNotPt;
-
-  // Conditional combine
-
-  case Hexagon::COMBINE_rr_cdnPt :
-    return Hexagon::COMBINE_rr_cPt;
-  case Hexagon::COMBINE_rr_cdnNotPt :
-    return Hexagon::COMBINE_rr_cNotPt;
-
-// Conditional shift operations
-
-  case Hexagon::ASLH_cdnPt_V4 :
-    return Hexagon::ASLH_cPt_V4;
-  case Hexagon::ASLH_cdnNotPt_V4 :
-    return Hexagon::ASLH_cNotPt_V4;
-
-  case Hexagon::ASRH_cdnPt_V4 :
-    return Hexagon::ASRH_cPt_V4;
-  case Hexagon::ASRH_cdnNotPt_V4 :
-    return Hexagon::ASRH_cNotPt_V4;
-
-  case Hexagon::SXTB_cdnPt_V4 :
-    return Hexagon::SXTB_cPt_V4;
-  case Hexagon::SXTB_cdnNotPt_V4 :
-    return Hexagon::SXTB_cNotPt_V4;
-
-  case Hexagon::SXTH_cdnPt_V4 :
-    return Hexagon::SXTH_cPt_V4;
-  case Hexagon::SXTH_cdnNotPt_V4 :
-    return Hexagon::SXTH_cNotPt_V4;
-
-  case Hexagon::ZXTB_cdnPt_V4 :
-    return Hexagon::ZXTB_cPt_V4;
-  case Hexagon::ZXTB_cdnNotPt_V4 :
-    return Hexagon::ZXTB_cNotPt_V4;
-
-  case Hexagon::ZXTH_cdnPt_V4 :
-    return Hexagon::ZXTH_cPt_V4;
-  case Hexagon::ZXTH_cdnNotPt_V4 :
-    return Hexagon::ZXTH_cNotPt_V4;
-
-  // Store byte
-
-  case Hexagon::STrib_imm_cdnPt_V4 :
-    return Hexagon::STrib_imm_cPt_V4;
-
-  case Hexagon::STrib_imm_cdnNotPt_V4 :
-    return Hexagon::STrib_imm_cNotPt_V4;
-
-  case Hexagon::STrib_cdnPt_nv_V4 :
-  case Hexagon::STrib_cPt_nv_V4 :
-  case Hexagon::STrib_cdnPt_V4 :
-    return Hexagon::STrib_cPt;
-
-  case Hexagon::STrib_cdnNotPt_nv_V4 :
-  case Hexagon::STrib_cNotPt_nv_V4 :
-  case Hexagon::STrib_cdnNotPt_V4 :
-    return Hexagon::STrib_cNotPt;
-
-  case Hexagon::STrib_indexed_cdnPt_V4 :
-  case Hexagon::STrib_indexed_cPt_nv_V4 :
-  case Hexagon::STrib_indexed_cdnPt_nv_V4 :
-    return Hexagon::STrib_indexed_cPt;
-
-  case Hexagon::STrib_indexed_cdnNotPt_V4 :
-  case Hexagon::STrib_indexed_cNotPt_nv_V4 :
-  case Hexagon::STrib_indexed_cdnNotPt_nv_V4 :
-    return Hexagon::STrib_indexed_cNotPt;
-
-  case Hexagon::STrib_indexed_shl_cdnPt_nv_V4:
-  case Hexagon::STrib_indexed_shl_cPt_nv_V4 :
-  case Hexagon::STrib_indexed_shl_cdnPt_V4 :
-    return Hexagon::STrib_indexed_shl_cPt_V4;
-
-  case Hexagon::STrib_indexed_shl_cdnNotPt_nv_V4:
-  case Hexagon::STrib_indexed_shl_cNotPt_nv_V4 :
-  case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::STrib_indexed_shl_cNotPt_V4;
-
-  case Hexagon::POST_STbri_cdnPt_nv_V4 :
-  case Hexagon::POST_STbri_cPt_nv_V4 :
-  case Hexagon::POST_STbri_cdnPt_V4 :
-    return Hexagon::POST_STbri_cPt;
-
-  case Hexagon::POST_STbri_cdnNotPt_nv_V4 :
-  case Hexagon::POST_STbri_cNotPt_nv_V4:
-  case Hexagon::POST_STbri_cdnNotPt_V4 :
-    return Hexagon::POST_STbri_cNotPt;
-
-  case Hexagon::STb_GP_cdnPt_nv_V4:
-  case Hexagon::STb_GP_cdnPt_V4:
-  case Hexagon::STb_GP_cPt_nv_V4:
-    return Hexagon::STb_GP_cPt_V4;
-
-  case Hexagon::STb_GP_cdnNotPt_nv_V4:
-  case Hexagon::STb_GP_cdnNotPt_V4:
-  case Hexagon::STb_GP_cNotPt_nv_V4:
-    return Hexagon::STb_GP_cNotPt_V4;
-
-  // Store new-value byte - unconditional
-  case Hexagon::STrib_nv_V4:
-    return Hexagon::STrib;
-
-  case Hexagon::STrib_indexed_nv_V4:
-    return Hexagon::STrib_indexed;
-
-  case Hexagon::STrib_indexed_shl_nv_V4:
-    return Hexagon::STrib_indexed_shl_V4;
-
-  case Hexagon::STrib_shl_nv_V4:
-    return Hexagon::STrib_shl_V4;
-
-  case Hexagon::STb_GP_nv_V4:
-    return Hexagon::STb_GP_V4;
-
-  case Hexagon::POST_STbri_nv_V4:
-    return Hexagon::POST_STbri;
-
-  // Store halfword
-  case Hexagon::STrih_imm_cdnPt_V4 :
-    return Hexagon::STrih_imm_cPt_V4;
-
-  case Hexagon::STrih_imm_cdnNotPt_V4 :
-    return Hexagon::STrih_imm_cNotPt_V4;
-
-  case Hexagon::STrih_cdnPt_nv_V4 :
-  case Hexagon::STrih_cPt_nv_V4 :
-  case Hexagon::STrih_cdnPt_V4 :
-    return Hexagon::STrih_cPt;
-
-  case Hexagon::STrih_cdnNotPt_nv_V4 :
-  case Hexagon::STrih_cNotPt_nv_V4 :
-  case Hexagon::STrih_cdnNotPt_V4 :
-    return Hexagon::STrih_cNotPt;
-
-  case Hexagon::STrih_indexed_cdnPt_nv_V4:
-  case Hexagon::STrih_indexed_cPt_nv_V4 :
-  case Hexagon::STrih_indexed_cdnPt_V4 :
-    return Hexagon::STrih_indexed_cPt;
-
-  case Hexagon::STrih_indexed_cdnNotPt_nv_V4:
-  case Hexagon::STrih_indexed_cNotPt_nv_V4 :
-  case Hexagon::STrih_indexed_cdnNotPt_V4 :
-    return Hexagon::STrih_indexed_cNotPt;
-
-  case Hexagon::STrih_indexed_shl_cdnPt_nv_V4 :
-  case Hexagon::STrih_indexed_shl_cPt_nv_V4 :
-  case Hexagon::STrih_indexed_shl_cdnPt_V4 :
-    return Hexagon::STrih_indexed_shl_cPt_V4;
-
-  case Hexagon::STrih_indexed_shl_cdnNotPt_nv_V4 :
-  case Hexagon::STrih_indexed_shl_cNotPt_nv_V4 :
-  case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::STrih_indexed_shl_cNotPt_V4;
-
-  case Hexagon::POST_SThri_cdnPt_nv_V4 :
-  case Hexagon::POST_SThri_cPt_nv_V4 :
-  case Hexagon::POST_SThri_cdnPt_V4 :
-    return Hexagon::POST_SThri_cPt;
-
-  case Hexagon::POST_SThri_cdnNotPt_nv_V4 :
-  case Hexagon::POST_SThri_cNotPt_nv_V4 :
-  case Hexagon::POST_SThri_cdnNotPt_V4 :
-    return Hexagon::POST_SThri_cNotPt;
-
-  case Hexagon::STh_GP_cdnPt_nv_V4:
-  case Hexagon::STh_GP_cdnPt_V4:
-  case Hexagon::STh_GP_cPt_nv_V4:
-    return Hexagon::STh_GP_cPt_V4;
-
-  case Hexagon::STh_GP_cdnNotPt_nv_V4:
-  case Hexagon::STh_GP_cdnNotPt_V4:
-  case Hexagon::STh_GP_cNotPt_nv_V4:
-    return Hexagon::STh_GP_cNotPt_V4;
-
-  // Store new-value halfword - unconditional
-
-  case Hexagon::STrih_nv_V4:
-    return Hexagon::STrih;
-
-  case Hexagon::STrih_indexed_nv_V4:
-    return Hexagon::STrih_indexed;
-
-  case Hexagon::STrih_indexed_shl_nv_V4:
-    return Hexagon::STrih_indexed_shl_V4;
-
-  case Hexagon::STrih_shl_nv_V4:
-    return Hexagon::STrih_shl_V4;
-
-  case Hexagon::STh_GP_nv_V4:
-    return Hexagon::STh_GP_V4;
-
-  case Hexagon::POST_SThri_nv_V4:
-    return Hexagon::POST_SThri;
-
-   // Store word
-
-   case Hexagon::STriw_imm_cdnPt_V4 :
-    return Hexagon::STriw_imm_cPt_V4;
-
-  case Hexagon::STriw_imm_cdnNotPt_V4 :
-    return Hexagon::STriw_imm_cNotPt_V4;
-
-  case Hexagon::STriw_cdnPt_nv_V4 :
-  case Hexagon::STriw_cPt_nv_V4 :
-  case Hexagon::STriw_cdnPt_V4 :
-    return Hexagon::STriw_cPt;
-
-  case Hexagon::STriw_cdnNotPt_nv_V4 :
-  case Hexagon::STriw_cNotPt_nv_V4 :
-  case Hexagon::STriw_cdnNotPt_V4 :
-    return Hexagon::STriw_cNotPt;
-
-  case Hexagon::STriw_indexed_cdnPt_nv_V4 :
-  case Hexagon::STriw_indexed_cPt_nv_V4 :
-  case Hexagon::STriw_indexed_cdnPt_V4 :
-    return Hexagon::STriw_indexed_cPt;
-
-  case Hexagon::STriw_indexed_cdnNotPt_nv_V4 :
-  case Hexagon::STriw_indexed_cNotPt_nv_V4 :
-  case Hexagon::STriw_indexed_cdnNotPt_V4 :
-    return Hexagon::STriw_indexed_cNotPt;
-
-  case Hexagon::STriw_indexed_shl_cdnPt_nv_V4 :
-  case Hexagon::STriw_indexed_shl_cPt_nv_V4 :
-  case Hexagon::STriw_indexed_shl_cdnPt_V4 :
-    return Hexagon::STriw_indexed_shl_cPt_V4;
-
-  case Hexagon::STriw_indexed_shl_cdnNotPt_nv_V4 :
-  case Hexagon::STriw_indexed_shl_cNotPt_nv_V4 :
-  case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::STriw_indexed_shl_cNotPt_V4;
-
-  case Hexagon::POST_STwri_cdnPt_nv_V4 :
-  case Hexagon::POST_STwri_cPt_nv_V4 :
-  case Hexagon::POST_STwri_cdnPt_V4 :
-    return Hexagon::POST_STwri_cPt;
-
-  case Hexagon::POST_STwri_cdnNotPt_nv_V4 :
-  case Hexagon::POST_STwri_cNotPt_nv_V4 :
-  case Hexagon::POST_STwri_cdnNotPt_V4 :
-    return Hexagon::POST_STwri_cNotPt;
-
-  case Hexagon::STw_GP_cdnPt_nv_V4:
-  case Hexagon::STw_GP_cdnPt_V4:
-  case Hexagon::STw_GP_cPt_nv_V4:
-    return Hexagon::STw_GP_cPt_V4;
-
-  case Hexagon::STw_GP_cdnNotPt_nv_V4:
-  case Hexagon::STw_GP_cdnNotPt_V4:
-  case Hexagon::STw_GP_cNotPt_nv_V4:
-    return Hexagon::STw_GP_cNotPt_V4;
-
-  // Store new-value word - unconditional
-
-  case Hexagon::STriw_nv_V4:
-    return Hexagon::STriw;
-
-  case Hexagon::STriw_indexed_nv_V4:
-    return Hexagon::STriw_indexed;
-
-  case Hexagon::STriw_indexed_shl_nv_V4:
-    return Hexagon::STriw_indexed_shl_V4;
-
-  case Hexagon::STriw_shl_nv_V4:
-    return Hexagon::STriw_shl_V4;
-
-  case Hexagon::STw_GP_nv_V4:
-    return Hexagon::STw_GP_V4;
-
-  case Hexagon::POST_STwri_nv_V4:
-    return Hexagon::POST_STwri;
-
- // Store doubleword
-
-  case Hexagon::STrid_cdnPt_V4 :
-    return Hexagon::STrid_cPt;
-
-  case Hexagon::STrid_cdnNotPt_V4 :
-    return Hexagon::STrid_cNotPt;
-
-  case Hexagon::STrid_indexed_cdnPt_V4 :
-    return Hexagon::STrid_indexed_cPt;
-
-  case Hexagon::STrid_indexed_cdnNotPt_V4 :
-    return Hexagon::STrid_indexed_cNotPt;
-
-  case Hexagon::STrid_indexed_shl_cdnPt_V4 :
-    return Hexagon::STrid_indexed_shl_cPt_V4;
-
-  case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
-    return Hexagon::STrid_indexed_shl_cNotPt_V4;
-
-  case Hexagon::POST_STdri_cdnPt_V4 :
-    return Hexagon::POST_STdri_cPt;
-
-  case Hexagon::POST_STdri_cdnNotPt_V4 :
-    return Hexagon::POST_STdri_cNotPt;
-
-  case Hexagon::STd_GP_cdnPt_V4 :
-    return Hexagon::STd_GP_cPt_V4;
-
-  case Hexagon::STd_GP_cdnNotPt_V4 :
-    return Hexagon::STd_GP_cNotPt_V4;
-
-  }
-}
-
 bool HexagonPacketizerList::DemoteToDotOld(MachineInstr* MI) {
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  int NewOpcode = GetDotOldOp(MI->getOpcode());
+  int NewOpcode = QII->GetDotOldOp(MI->getOpcode());
   MI->setDesc(QII->get(NewOpcode));
   return true;
 }
 
-// Returns true if an instruction is predicated on p0 and false if it's
-// predicated on !p0.
+enum PredicateKind {
+  PK_False,
+  PK_True,
+  PK_Unknown
+};
 
-static bool GetPredicateSense(MachineInstr* MI,
-                              const HexagonInstrInfo *QII) {
+/// Returns true if an instruction is predicated on p0 and false if it's
+/// predicated on !p0.
+static PredicateKind getPredicateSense(MachineInstr* MI,
+                                       const HexagonInstrInfo *QII) {
+  if (!QII->isPredicated(MI))
+    return PK_Unknown;
 
-  switch (MI->getOpcode()) {
-  default: llvm_unreachable("Unknown predicate sense of the instruction");
-  case Hexagon::TFR_cPt:
-  case Hexagon::TFR_cdnPt:
-  case Hexagon::TFRI_cPt:
-  case Hexagon::TFRI_cdnPt:
-  case Hexagon::STrib_cPt :
-  case Hexagon::STrib_cdnPt_V4 :
-  case Hexagon::STrib_indexed_cPt :
-  case Hexagon::STrib_indexed_cdnPt_V4 :
-  case Hexagon::STrib_indexed_shl_cPt_V4 :
-  case Hexagon::STrib_indexed_shl_cdnPt_V4 :
-  case Hexagon::POST_STbri_cPt :
-  case Hexagon::POST_STbri_cdnPt_V4 :
-  case Hexagon::STrih_cPt :
-  case Hexagon::STrih_cdnPt_V4 :
-  case Hexagon::STrih_indexed_cPt :
-  case Hexagon::STrih_indexed_cdnPt_V4 :
-  case Hexagon::STrih_indexed_shl_cPt_V4 :
-  case Hexagon::STrih_indexed_shl_cdnPt_V4 :
-  case Hexagon::POST_SThri_cPt :
-  case Hexagon::POST_SThri_cdnPt_V4 :
-  case Hexagon::STriw_cPt :
-  case Hexagon::STriw_cdnPt_V4 :
-  case Hexagon::STriw_indexed_cPt :
-  case Hexagon::STriw_indexed_cdnPt_V4 :
-  case Hexagon::STriw_indexed_shl_cPt_V4 :
-  case Hexagon::STriw_indexed_shl_cdnPt_V4 :
-  case Hexagon::POST_STwri_cPt :
-  case Hexagon::POST_STwri_cdnPt_V4 :
-  case Hexagon::STrib_imm_cPt_V4 :
-  case Hexagon::STrib_imm_cdnPt_V4 :
-  case Hexagon::STrid_cPt :
-  case Hexagon::STrid_cdnPt_V4 :
-  case Hexagon::STrid_indexed_cPt :
-  case Hexagon::STrid_indexed_cdnPt_V4 :
-  case Hexagon::STrid_indexed_shl_cPt_V4 :
-  case Hexagon::STrid_indexed_shl_cdnPt_V4 :
-  case Hexagon::POST_STdri_cPt :
-  case Hexagon::POST_STdri_cdnPt_V4 :
-  case Hexagon::STrih_imm_cPt_V4 :
-  case Hexagon::STrih_imm_cdnPt_V4 :
-  case Hexagon::STriw_imm_cPt_V4 :
-  case Hexagon::STriw_imm_cdnPt_V4 :
-  case Hexagon::JMP_tnew_t :
-  case Hexagon::LDrid_cPt :
-  case Hexagon::LDrid_cdnPt :
-  case Hexagon::LDrid_indexed_cPt :
-  case Hexagon::LDrid_indexed_cdnPt :
-  case Hexagon::POST_LDrid_cPt :
-  case Hexagon::POST_LDrid_cdnPt_V4 :
-  case Hexagon::LDriw_cPt :
-  case Hexagon::LDriw_cdnPt :
-  case Hexagon::LDriw_indexed_cPt :
-  case Hexagon::LDriw_indexed_cdnPt :
-  case Hexagon::POST_LDriw_cPt :
-  case Hexagon::POST_LDriw_cdnPt_V4 :
-  case Hexagon::LDrih_cPt :
-  case Hexagon::LDrih_cdnPt :
-  case Hexagon::LDrih_indexed_cPt :
-  case Hexagon::LDrih_indexed_cdnPt :
-  case Hexagon::POST_LDrih_cPt :
-  case Hexagon::POST_LDrih_cdnPt_V4 :
-  case Hexagon::LDrib_cPt :
-  case Hexagon::LDrib_cdnPt :
-  case Hexagon::LDrib_indexed_cPt :
-  case Hexagon::LDrib_indexed_cdnPt :
-  case Hexagon::POST_LDrib_cPt :
-  case Hexagon::POST_LDrib_cdnPt_V4 :
-  case Hexagon::LDriuh_cPt :
-  case Hexagon::LDriuh_cdnPt :
-  case Hexagon::LDriuh_indexed_cPt :
-  case Hexagon::LDriuh_indexed_cdnPt :
-  case Hexagon::POST_LDriuh_cPt :
-  case Hexagon::POST_LDriuh_cdnPt_V4 :
-  case Hexagon::LDriub_cPt :
-  case Hexagon::LDriub_cdnPt :
-  case Hexagon::LDriub_indexed_cPt :
-  case Hexagon::LDriub_indexed_cdnPt :
-  case Hexagon::POST_LDriub_cPt :
-  case Hexagon::POST_LDriub_cdnPt_V4 :
-  case Hexagon::LDrid_indexed_shl_cPt_V4 :
-  case Hexagon::LDrid_indexed_shl_cdnPt_V4 :
-  case Hexagon::LDrib_indexed_shl_cPt_V4 :
-  case Hexagon::LDrib_indexed_shl_cdnPt_V4 :
-  case Hexagon::LDriub_indexed_shl_cPt_V4 :
-  case Hexagon::LDriub_indexed_shl_cdnPt_V4 :
-  case Hexagon::LDrih_indexed_shl_cPt_V4 :
-  case Hexagon::LDrih_indexed_shl_cdnPt_V4 :
-  case Hexagon::LDriuh_indexed_shl_cPt_V4 :
-  case Hexagon::LDriuh_indexed_shl_cdnPt_V4 :
-  case Hexagon::LDriw_indexed_shl_cPt_V4 :
-  case Hexagon::LDriw_indexed_shl_cdnPt_V4 :
-  case Hexagon::ADD_ri_cPt :
-  case Hexagon::ADD_ri_cdnPt :
-  case Hexagon::ADD_rr_cPt :
-  case Hexagon::ADD_rr_cdnPt :
-  case Hexagon::XOR_rr_cPt :
-  case Hexagon::XOR_rr_cdnPt :
-  case Hexagon::AND_rr_cPt :
-  case Hexagon::AND_rr_cdnPt :
-  case Hexagon::OR_rr_cPt :
-  case Hexagon::OR_rr_cdnPt :
-  case Hexagon::SUB_rr_cPt :
-  case Hexagon::SUB_rr_cdnPt :
-  case Hexagon::COMBINE_rr_cPt :
-  case Hexagon::COMBINE_rr_cdnPt :
-  case Hexagon::ASLH_cPt_V4 :
-  case Hexagon::ASLH_cdnPt_V4 :
-  case Hexagon::ASRH_cPt_V4 :
-  case Hexagon::ASRH_cdnPt_V4 :
-  case Hexagon::SXTB_cPt_V4 :
-  case Hexagon::SXTB_cdnPt_V4 :
-  case Hexagon::SXTH_cPt_V4 :
-  case Hexagon::SXTH_cdnPt_V4 :
-  case Hexagon::ZXTB_cPt_V4 :
-  case Hexagon::ZXTB_cdnPt_V4 :
-  case Hexagon::ZXTH_cPt_V4 :
-  case Hexagon::ZXTH_cdnPt_V4 :
-  case Hexagon::LDd_GP_cPt_V4 :
-  case Hexagon::LDb_GP_cPt_V4 :
-  case Hexagon::LDub_GP_cPt_V4 :
-  case Hexagon::LDh_GP_cPt_V4 :
-  case Hexagon::LDuh_GP_cPt_V4 :
-  case Hexagon::LDw_GP_cPt_V4 :
-  case Hexagon::STd_GP_cPt_V4 :
-  case Hexagon::STb_GP_cPt_V4 :
-  case Hexagon::STh_GP_cPt_V4 :
-  case Hexagon::STw_GP_cPt_V4 :
-  case Hexagon::LDd_GP_cdnPt_V4 :
-  case Hexagon::LDb_GP_cdnPt_V4 :
-  case Hexagon::LDub_GP_cdnPt_V4 :
-  case Hexagon::LDh_GP_cdnPt_V4 :
-  case Hexagon::LDuh_GP_cdnPt_V4 :
-  case Hexagon::LDw_GP_cdnPt_V4 :
-  case Hexagon::STd_GP_cdnPt_V4 :
-  case Hexagon::STb_GP_cdnPt_V4 :
-  case Hexagon::STh_GP_cdnPt_V4 :
-  case Hexagon::STw_GP_cdnPt_V4 :
-    return true;
+  if (QII->isPredicatedTrue(MI))
+    return PK_True;
 
-  case Hexagon::TFR_cNotPt:
-  case Hexagon::TFR_cdnNotPt:
-  case Hexagon::TFRI_cNotPt:
-  case Hexagon::TFRI_cdnNotPt:
-  case Hexagon::STrib_cNotPt :
-  case Hexagon::STrib_cdnNotPt_V4 :
-  case Hexagon::STrib_indexed_cNotPt :
-  case Hexagon::STrib_indexed_cdnNotPt_V4 :
-  case Hexagon::STrib_indexed_shl_cNotPt_V4 :
-  case Hexagon::STrib_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::POST_STbri_cNotPt :
-  case Hexagon::POST_STbri_cdnNotPt_V4 :
-  case Hexagon::STrih_cNotPt :
-  case Hexagon::STrih_cdnNotPt_V4 :
-  case Hexagon::STrih_indexed_cNotPt :
-  case Hexagon::STrih_indexed_cdnNotPt_V4 :
-  case Hexagon::STrih_indexed_shl_cNotPt_V4 :
-  case Hexagon::STrih_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::POST_SThri_cNotPt :
-  case Hexagon::POST_SThri_cdnNotPt_V4 :
-  case Hexagon::STriw_cNotPt :
-  case Hexagon::STriw_cdnNotPt_V4 :
-  case Hexagon::STriw_indexed_cNotPt :
-  case Hexagon::STriw_indexed_cdnNotPt_V4 :
-  case Hexagon::STriw_indexed_shl_cNotPt_V4 :
-  case Hexagon::STriw_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::POST_STwri_cNotPt :
-  case Hexagon::POST_STwri_cdnNotPt_V4 :
-  case Hexagon::STrib_imm_cNotPt_V4 :
-  case Hexagon::STrib_imm_cdnNotPt_V4 :
-  case Hexagon::STrid_cNotPt :
-  case Hexagon::STrid_cdnNotPt_V4 :
-  case Hexagon::STrid_indexed_cdnNotPt_V4 :
-  case Hexagon::STrid_indexed_cNotPt :
-  case Hexagon::STrid_indexed_shl_cNotPt_V4 :
-  case Hexagon::STrid_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::POST_STdri_cNotPt :
-  case Hexagon::POST_STdri_cdnNotPt_V4 :
-  case Hexagon::STrih_imm_cNotPt_V4 :
-  case Hexagon::STrih_imm_cdnNotPt_V4 :
-  case Hexagon::STriw_imm_cNotPt_V4 :
-  case Hexagon::STriw_imm_cdnNotPt_V4 :
-  case Hexagon::JMP_fnew_t :
-  case Hexagon::LDrid_cNotPt :
-  case Hexagon::LDrid_cdnNotPt :
-  case Hexagon::LDrid_indexed_cNotPt :
-  case Hexagon::LDrid_indexed_cdnNotPt :
-  case Hexagon::POST_LDrid_cNotPt :
-  case Hexagon::POST_LDrid_cdnNotPt_V4 :
-  case Hexagon::LDriw_cNotPt :
-  case Hexagon::LDriw_cdnNotPt :
-  case Hexagon::LDriw_indexed_cNotPt :
-  case Hexagon::LDriw_indexed_cdnNotPt :
-  case Hexagon::POST_LDriw_cNotPt :
-  case Hexagon::POST_LDriw_cdnNotPt_V4 :
-  case Hexagon::LDrih_cNotPt :
-  case Hexagon::LDrih_cdnNotPt :
-  case Hexagon::LDrih_indexed_cNotPt :
-  case Hexagon::LDrih_indexed_cdnNotPt :
-  case Hexagon::POST_LDrih_cNotPt :
-  case Hexagon::POST_LDrih_cdnNotPt_V4 :
-  case Hexagon::LDrib_cNotPt :
-  case Hexagon::LDrib_cdnNotPt :
-  case Hexagon::LDrib_indexed_cNotPt :
-  case Hexagon::LDrib_indexed_cdnNotPt :
-  case Hexagon::POST_LDrib_cNotPt :
-  case Hexagon::POST_LDrib_cdnNotPt_V4 :
-  case Hexagon::LDriuh_cNotPt :
-  case Hexagon::LDriuh_cdnNotPt :
-  case Hexagon::LDriuh_indexed_cNotPt :
-  case Hexagon::LDriuh_indexed_cdnNotPt :
-  case Hexagon::POST_LDriuh_cNotPt :
-  case Hexagon::POST_LDriuh_cdnNotPt_V4 :
-  case Hexagon::LDriub_cNotPt :
-  case Hexagon::LDriub_cdnNotPt :
-  case Hexagon::LDriub_indexed_cNotPt :
-  case Hexagon::LDriub_indexed_cdnNotPt :
-  case Hexagon::POST_LDriub_cNotPt :
-  case Hexagon::POST_LDriub_cdnNotPt_V4 :
-  case Hexagon::LDrid_indexed_shl_cNotPt_V4 :
-  case Hexagon::LDrid_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::LDrib_indexed_shl_cNotPt_V4 :
-  case Hexagon::LDrib_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::LDriub_indexed_shl_cNotPt_V4 :
-  case Hexagon::LDriub_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::LDrih_indexed_shl_cNotPt_V4 :
-  case Hexagon::LDrih_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::LDriuh_indexed_shl_cNotPt_V4 :
-  case Hexagon::LDriuh_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::LDriw_indexed_shl_cNotPt_V4 :
-  case Hexagon::LDriw_indexed_shl_cdnNotPt_V4 :
-  case Hexagon::ADD_ri_cNotPt :
-  case Hexagon::ADD_ri_cdnNotPt :
-  case Hexagon::ADD_rr_cNotPt :
-  case Hexagon::ADD_rr_cdnNotPt :
-  case Hexagon::XOR_rr_cNotPt :
-  case Hexagon::XOR_rr_cdnNotPt :
-  case Hexagon::AND_rr_cNotPt :
-  case Hexagon::AND_rr_cdnNotPt :
-  case Hexagon::OR_rr_cNotPt :
-  case Hexagon::OR_rr_cdnNotPt :
-  case Hexagon::SUB_rr_cNotPt :
-  case Hexagon::SUB_rr_cdnNotPt :
-  case Hexagon::COMBINE_rr_cNotPt :
-  case Hexagon::COMBINE_rr_cdnNotPt :
-  case Hexagon::ASLH_cNotPt_V4 :
-  case Hexagon::ASLH_cdnNotPt_V4 :
-  case Hexagon::ASRH_cNotPt_V4 :
-  case Hexagon::ASRH_cdnNotPt_V4 :
-  case Hexagon::SXTB_cNotPt_V4 :
-  case Hexagon::SXTB_cdnNotPt_V4 :
-  case Hexagon::SXTH_cNotPt_V4 :
-  case Hexagon::SXTH_cdnNotPt_V4 :
-  case Hexagon::ZXTB_cNotPt_V4 :
-  case Hexagon::ZXTB_cdnNotPt_V4 :
-  case Hexagon::ZXTH_cNotPt_V4 :
-  case Hexagon::ZXTH_cdnNotPt_V4 :
-
-  case Hexagon::LDd_GP_cNotPt_V4 :
-  case Hexagon::LDb_GP_cNotPt_V4 :
-  case Hexagon::LDub_GP_cNotPt_V4 :
-  case Hexagon::LDh_GP_cNotPt_V4 :
-  case Hexagon::LDuh_GP_cNotPt_V4 :
-  case Hexagon::LDw_GP_cNotPt_V4 :
-  case Hexagon::STd_GP_cNotPt_V4 :
-  case Hexagon::STb_GP_cNotPt_V4 :
-  case Hexagon::STh_GP_cNotPt_V4 :
-  case Hexagon::STw_GP_cNotPt_V4 :
-  case Hexagon::LDd_GP_cdnNotPt_V4 :
-  case Hexagon::LDb_GP_cdnNotPt_V4 :
-  case Hexagon::LDub_GP_cdnNotPt_V4 :
-  case Hexagon::LDh_GP_cdnNotPt_V4 :
-  case Hexagon::LDuh_GP_cdnNotPt_V4 :
-  case Hexagon::LDw_GP_cdnNotPt_V4 :
-  case Hexagon::STd_GP_cdnNotPt_V4 :
-  case Hexagon::STb_GP_cdnNotPt_V4 :
-  case Hexagon::STh_GP_cdnNotPt_V4 :
-  case Hexagon::STw_GP_cdnNotPt_V4 :
-    return false;
-  }
-  // return *some value* to avoid compiler warning
-  return false;
+  return PK_False;
 }
 
 static MachineOperand& GetPostIncrementOperand(MachineInstr *MI,
@@ -2252,10 +537,10 @@ static MachineOperand& GetStoreValueOperand(MachineInstr *MI) {
 //    Arch Spec: 3.4.4.2
 bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
                 MachineInstr *PacketMI, unsigned DepReg,
-                std::map <MachineInstr*, SUnit*> MIToSUnit)
-{
-  // Make sure we are looking at the store
-  if (!IsNewifyStore(MI))
+                std::map <MachineInstr*, SUnit*> MIToSUnit) {
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
+  // Make sure we are looking at the store, that can be promoted.
+  if (!QII->mayBeNewStore(MI))
     return false;
 
   // Make sure there is dependency and can be new value'ed
@@ -2263,12 +548,11 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
       GetStoreValueOperand(MI).getReg() != DepReg)
     return false;
 
-  const HexagonRegisterInfo* QRI = 
+  const HexagonRegisterInfo* QRI =
                             (const HexagonRegisterInfo *) TM.getRegisterInfo();
   const MCInstrDesc& MCID = PacketMI->getDesc();
   // first operand is always the result
 
-  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
   const TargetRegisterClass* PacketRC = QII->getRegClass(MCID, 0, QRI, MF);
 
   // if there is already an store in the packet, no can do new value store
@@ -2311,7 +595,7 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
   }
 
   // If the source that feeds the store is predicated, new value store must
-  // also be also predicated.
+  // also be predicated.
   if (QII->isPredicated(PacketMI)) {
     if (!QII->isPredicated(MI))
       return false;
@@ -2357,7 +641,7 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
 
     if (( predRegNumDst != predRegNumSrc) ||
           QII->isDotNewInst(PacketMI) != QII->isDotNewInst(MI)  ||
-          GetPredicateSense(MI, QII) != GetPredicateSense(PacketMI, QII)) {
+          getPredicateSense(MI, QII) != getPredicateSense(PacketMI, QII)) {
       return false;
     }
   }
@@ -2438,10 +722,11 @@ bool HexagonPacketizerList::CanPromoteToNewValue( MachineInstr *MI,
                 MachineBasicBlock::iterator &MII)
 {
 
+  const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
   const HexagonRegisterInfo* QRI =
                             (const HexagonRegisterInfo *) TM.getRegisterInfo();
   if (!QRI->Subtarget.hasV4TOps() ||
-      !IsNewifyStore(MI))
+      !QII->mayBeNewStore(MI))
     return false;
 
   MachineInstr *PacketMI = PacketSU->getInstr();
@@ -2468,7 +753,7 @@ bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI,
 {
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
   // Already a dot new instruction.
-  if (QII->isDotNewInst(MI) && !IsNewifyStore(MI))
+  if (QII->isDotNewInst(MI) && !QII->mayBeNewStore(MI))
     return false;
 
   if (!isNewifiable(MI))
@@ -2478,12 +763,12 @@ bool HexagonPacketizerList::CanPromoteToDotNew( MachineInstr *MI,
   if (RC == &Hexagon::PredRegsRegClass && isCondInst(MI))
       return true;
   else if (RC != &Hexagon::PredRegsRegClass &&
-      !IsNewifyStore(MI)) // MI is not a new-value store
+      !QII->mayBeNewStore(MI)) // MI is not a new-value store
     return false;
   else {
     // Create a dot new machine instruction to see if resources can be
     // allocated. If not, bail out now.
-    int NewOpcode = GetDotNewOp(MI->getOpcode());
+    int NewOpcode = QII->GetDotNewOp(MI);
     const MCInstrDesc &desc = QII->get(NewOpcode);
     DebugLoc dl;
     MachineInstr *NewMI =
@@ -2552,16 +837,39 @@ bool HexagonPacketizerList::RestrictingDepExistInPacket (MachineInstr* MI,
 }
 
 
+/// Gets the predicate register of a predicated instruction.
+static unsigned getPredicatedRegister(MachineInstr *MI,
+                                      const HexagonInstrInfo *QII) {
+  /// We use the following rule: The first predicate register that is a use is
+  /// the predicate register of a predicated instruction.
+
+  assert(QII->isPredicated(MI) && "Must be predicated instruction");
+
+  for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+       OE = MI->operands_end(); OI != OE; ++OI) {
+    MachineOperand &Op = *OI;
+    if (Op.isReg() && Op.getReg() && Op.isUse() &&
+        Hexagon::PredRegsRegClass.contains(Op.getReg()))
+      return Op.getReg();
+  }
+
+  llvm_unreachable("Unknown instruction operand layout");
+
+  return 0;
+}
+
 // Given two predicated instructions, this function detects whether
 // the predicates are complements
 bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
      MachineInstr* MI2, std::map <MachineInstr*, SUnit*> MIToSUnit) {
 
   const HexagonInstrInfo *QII = (const HexagonInstrInfo *) TII;
-  // Currently can only reason about conditional transfers
-  if (!QII->isConditionalTransfer(MI1) || !QII->isConditionalTransfer(MI2)) {
+
+  // If we don't know the predicate sense of the instructions bail out early, we
+  // need it later.
+  if (getPredicateSense(MI1, QII) == PK_Unknown ||
+      getPredicateSense(MI2, QII) == PK_Unknown)
     return false;
-  }
 
   // Scheduling unit for candidate
   SUnit* SU = MIToSUnit[MI1];
@@ -2600,9 +908,9 @@ bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
         // there already exist anti dep on the same pred in
         // the packet.
         if (PacketSU->Succs[i].getSUnit() == SU &&
+            PacketSU->Succs[i].getKind() == SDep::Data &&
             Hexagon::PredRegsRegClass.contains(
               PacketSU->Succs[i].getReg()) &&
-            PacketSU->Succs[i].getKind() == SDep::Data &&
             // Here I know that *VIN is predicate setting instruction
             // with true data dep to candidate on the register
             // we care about - c) in the above example.
@@ -2623,8 +931,12 @@ bool HexagonPacketizerList::ArePredicatesComplements (MachineInstr* MI1,
   // that the predicate sense is different
   // We also need to differentiate .old vs. .new:
   // !p0 is not complimentary to p0.new
-  return ((MI1->getOperand(1).getReg() == MI2->getOperand(1).getReg()) &&
-          (GetPredicateSense(MI1, QII) != GetPredicateSense(MI2, QII)) &&
+  unsigned PReg1 = getPredicatedRegister(MI1, QII);
+  unsigned PReg2 = getPredicatedRegister(MI2, QII);
+  return ((PReg1 == PReg2) &&
+          Hexagon::PredRegsRegClass.contains(PReg1) &&
+          Hexagon::PredRegsRegClass.contains(PReg2) &&
+          (getPredicateSense(MI1, QII) != getPredicateSense(MI2, QII)) &&
           (QII->isDotNewInst(MI1) == QII->isDotNewInst(MI2)));
 }
 
@@ -2722,24 +1034,21 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
   }
 
   // A LoopN instruction cannot appear in the same packet as a jump or call.
-  if (IsLoopN(I) && (   IsDirectJump(J)
-                     || MCIDJ.isCall()
-                     || QII->isDeallocRet(J))) {
+  if (IsLoopN(I) &&
+     (IsDirectJump(J) || MCIDJ.isCall() || QII->isDeallocRet(J))) {
     Dependence = true;
     return false;
   }
-  if (IsLoopN(J) && (   IsDirectJump(I)
-                     || MCIDI.isCall()
-                     || QII->isDeallocRet(I))) {
+  if (IsLoopN(J) &&
+     (IsDirectJump(I) || MCIDI.isCall() || QII->isDeallocRet(I))) {
     Dependence = true;
     return false;
   }
 
   // dealloc_return cannot appear in the same packet as a conditional or
   // unconditional jump.
-  if (QII->isDeallocRet(I) && (   MCIDJ.isBranch()
-                               || MCIDJ.isCall()
-                               || MCIDJ.isBarrier())) {
+  if (QII->isDeallocRet(I) &&
+     (MCIDJ.isBranch() || MCIDJ.isCall() || MCIDJ.isBarrier())) {
     Dependence = true;
     return false;
   }
@@ -2764,7 +1073,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
     }
 
     //if dealloc_return
-    if (MCIDJ.mayStore() && QII->isDeallocRet(I)){
+    if (MCIDJ.mayStore() && QII->isDeallocRet(I)) {
       Dependence = true;
       return false;
     }
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index 36da6df..7c41507 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -21,7 +21,6 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/Support/raw_ostream.h"
-#include <cstdio>
 
 using namespace llvm;
 
@@ -179,7 +178,7 @@ void HexagonInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
                                             raw_ostream &O) const {
   // Branches can take an immediate operand.  This is used by the branch
   // selection pass to print $+8, an eight byte displacement from the PC.
-  assert("Unknown branch operand.");
+  llvm_unreachable("Unknown branch operand.");
 }
 
 void HexagonInstPrinter::printCallOperand(const MCInst *MI, unsigned OpNo,
@@ -196,15 +195,9 @@ void HexagonInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
 
 void HexagonInstPrinter::printSymbol(const MCInst *MI, unsigned OpNo,
                                      raw_ostream &O, bool hi) const {
-  const MCOperand& MO = MI->getOperand(OpNo);
+  assert(MI->getOperand(OpNo).isImm() && "Unknown symbol operand");
 
-  O << '#' << (hi? "HI": "LO") << '(';
-  if (MO.isImm()) {
-    O << '#';
-    printOperand(MI, OpNo, O);
-  } else {
-    assert("Unknown symbol operand");
-    printOperand(MI, OpNo, O);
-  }
+  O << '#' << (hi ? "HI" : "LO") << "(#";
+  printOperand(MI, OpNo, O);
   O << ')';
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index d4a93b5..8519cf3 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -65,14 +65,15 @@ namespace HexagonII {
     AbsoluteSet    = 2,  // Absolute set addressing mode
     BaseImmOffset  = 3,  // Indirect with offset
     BaseLongOffset = 4,  // Indirect with long offset
-    BaseRegOffset  = 5   // Indirect with register offset
+    BaseRegOffset  = 5,  // Indirect with register offset
+    PostInc        = 6   // Post increment addressing mode
   };
 
   enum MemAccessSize {
     NoMemAccess = 0,            // Not a memory acces instruction.
     ByteAccess = 1,             // Byte access instruction (memb).
     HalfWordAccess = 2,         // Half word access instruction (memh).
-    WordAccess = 3,             // Word access instrution (memw).
+    WordAccess = 3,             // Word access instruction (memw).
     DoubleWordAccess = 4        // Double word access instruction (memd)
   };
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index 3deb8d1..3f9415b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -15,7 +15,10 @@
 
 using namespace llvm;
 
-HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) {
+// Pin the vtable to this file.
+void HexagonMCAsmInfo::anchor() {}
+
+HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
   Data64bitsDirective = 0;  // .xword is only supported by V9.
@@ -29,7 +32,6 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) {
   InlineAsmEnd = "# InlineAsm End";
   ZeroDirective = "\t.space\t";
   AscizDirective = "\t.string\t";
-  WeakRefDirective = "\t.weak\t";
 
   SupportsDebugInformation = true;
   UsesELFSectionDirectiveForBSS  = true;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index d336cd5..bd8cb76 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -15,14 +15,13 @@
 #define HexagonMCASMINFO_H
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
-  class Target;
-
-  class HexagonMCAsmInfo : public MCAsmInfo {
+  class HexagonMCAsmInfo : public MCAsmInfoELF {
+    virtual void anchor();
   public:
-    explicit HexagonMCAsmInfo(const Target &T, StringRef TT);
+    explicit HexagonMCAsmInfo(StringRef TT);
   };
 
 } // namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 6b1d2d1..2f93a52 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -54,13 +54,14 @@ static MCSubtargetInfo *createHexagonMCSubtargetInfo(StringRef TT,
   return X;
 }
 
-static MCAsmInfo *createHexagonMCAsmInfo(const Target &T, StringRef TT) {
-  MCAsmInfo *MAI = new HexagonMCAsmInfo(T, TT);
+static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
+                                         StringRef TT) {
+  MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
 
   // VirtualFP = (R30 + #0).
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(Hexagon::R30, 0);
-  MAI->addInitialFrameState(0, Dst, Src);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+      0, Hexagon::R30, 0);
+  MAI->addInitialFrameState(Inst);
 
   return MAI;
 }
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 1022ae9..98d26bc 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = AArch64 ARM CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
+subdirectories = AArch64 ARM CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
 
 ; This is a special group whose required libraries are extended (by llvm-build)
 ; with the best execution engine (the native JIT, if available, or the
diff --git a/lib/Target/MBlaze/AsmParser/CMakeLists.txt b/lib/Target/MBlaze/AsmParser/CMakeLists.txt
deleted file mode 100644
index 4a7d8e8..0000000
--- a/lib/Target/MBlaze/AsmParser/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
-                     ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMMBlazeAsmParser
-  MBlazeAsmParser.cpp
-  )
-
-add_dependencies(LLVMMBlazeAsmParser MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/AsmParser/LLVMBuild.txt b/lib/Target/MBlaze/AsmParser/LLVMBuild.txt
deleted file mode 100644
index b10189a..0000000
--- a/lib/Target/MBlaze/AsmParser/LLVMBuild.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-;===- ./lib/Target/MBlaze/AsmParser/LLVMBuild.txt --------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = MBlazeAsmParser
-parent = MBlaze
-required_libraries = MBlazeInfo MC MCParser Support
-add_to_library_groups = MBlaze
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
deleted file mode 100644
index dda6e24..0000000
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ /dev/null
@@ -1,572 +0,0 @@
-//===-- MBlazeAsmParser.cpp - Parse MBlaze asm to MCInst instructions -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/MBlazeBaseInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-namespace {
-struct MBlazeOperand;
-
-class MBlazeAsmParser : public MCTargetAsmParser {
-  MCAsmParser &Parser;
-
-  MCAsmParser &getParser() const { return Parser; }
-  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
-  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
-  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
-
-  MBlazeOperand *ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-  MBlazeOperand *ParseRegister();
-  MBlazeOperand *ParseRegister(SMLoc &StartLoc, SMLoc &EndLoc);
-  MBlazeOperand *ParseImmediate();
-  MBlazeOperand *ParseFsl();
-  MBlazeOperand* ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
-
-  bool ParseDirectiveWord(unsigned Size, SMLoc L);
-
-  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                               MCStreamer &Out, unsigned &ErrorInfo,
-                               bool MatchingInlineAsm);
-
-  /// @name Auto-generated Match Functions
-  /// {
-
-#define GET_ASSEMBLER_HEADER
-#include "MBlazeGenAsmMatcher.inc"
-
-  /// }
-
-public:
-  MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : MCTargetAsmParser(), Parser(_Parser) {}
-
-  virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
-                                SMLoc NameLoc,
-                                SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  virtual bool ParseDirective(AsmToken DirectiveID);
-};
-
-/// MBlazeOperand - Instances of this class represent a parsed MBlaze machine
-/// instruction.
-struct MBlazeOperand : public MCParsedAsmOperand {
-  enum KindTy {
-    Token,
-    Immediate,
-    Register,
-    Memory,
-    Fsl
-  } Kind;
-
-  SMLoc StartLoc, EndLoc;
-
-  struct TokOp {
-    const char *Data;
-    unsigned Length;
-  };
-
-  struct RegOp {
-    unsigned RegNum;
-  };
-
-  struct ImmOp {
-    const MCExpr *Val;
-  };
-
-  struct MemOp {
-    unsigned Base;
-    unsigned OffReg;
-    const MCExpr *Off;
-  };
-
-  struct FslImmOp {
-    const MCExpr *Val;
-  };
-
-  union {
-    struct TokOp Tok;
-    struct RegOp Reg;
-    struct ImmOp Imm;
-    struct MemOp Mem;
-    struct FslImmOp FslImm;
-  };
-
-  MBlazeOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
-public:
-  MBlazeOperand(const MBlazeOperand &o) : MCParsedAsmOperand() {
-    Kind = o.Kind;
-    StartLoc = o.StartLoc;
-    EndLoc = o.EndLoc;
-    switch (Kind) {
-    case Register:
-      Reg = o.Reg;
-      break;
-    case Immediate:
-      Imm = o.Imm;
-      break;
-    case Token:
-      Tok = o.Tok;
-      break;
-    case Memory:
-      Mem = o.Mem;
-      break;
-    case Fsl:
-      FslImm = o.FslImm;
-      break;
-    }
-  }
-
-  /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const { return StartLoc; }
-
-  /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const { return EndLoc; }
-
-  unsigned getReg() const {
-    assert(Kind == Register && "Invalid access!");
-    return Reg.RegNum;
-  }
-
-  const MCExpr *getImm() const {
-    assert(Kind == Immediate && "Invalid access!");
-    return Imm.Val;
-  }
-
-  const MCExpr *getFslImm() const {
-    assert(Kind == Fsl && "Invalid access!");
-    return FslImm.Val;
-  }
-
-  unsigned getMemBase() const {
-    assert(Kind == Memory && "Invalid access!");
-    return Mem.Base;
-  }
-
-  const MCExpr* getMemOff() const {
-    assert(Kind == Memory && "Invalid access!");
-    return Mem.Off;
-  }
-
-  unsigned getMemOffReg() const {
-    assert(Kind == Memory && "Invalid access!");
-    return Mem.OffReg;
-  }
-
-  bool isToken() const { return Kind == Token; }
-  bool isImm() const { return Kind == Immediate; }
-  bool isMem() const { return Kind == Memory; }
-  bool isFsl() const { return Kind == Fsl; }
-  bool isReg() const { return Kind == Register; }
-
-  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
-    // Add as immediates when possible.  Null MCExpr = 0.
-    if (Expr == 0)
-      Inst.addOperand(MCOperand::CreateImm(0));
-    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
-      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
-    else
-      Inst.addOperand(MCOperand::CreateExpr(Expr));
-  }
-
-  void addRegOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getReg()));
-  }
-
-  void addImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    addExpr(Inst, getImm());
-  }
-
-  void addFslOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    addExpr(Inst, getFslImm());
-  }
-
-  void addMemOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && "Invalid number of operands!");
-
-    Inst.addOperand(MCOperand::CreateReg(getMemBase()));
-
-    unsigned RegOff = getMemOffReg();
-    if (RegOff)
-      Inst.addOperand(MCOperand::CreateReg(RegOff));
-    else
-      addExpr(Inst, getMemOff());
-  }
-
-  StringRef getToken() const {
-    assert(Kind == Token && "Invalid access!");
-    return StringRef(Tok.Data, Tok.Length);
-  }
-
-  virtual void print(raw_ostream &OS) const;
-
-  static MBlazeOperand *CreateToken(StringRef Str, SMLoc S) {
-    MBlazeOperand *Op = new MBlazeOperand(Token);
-    Op->Tok.Data = Str.data();
-    Op->Tok.Length = Str.size();
-    Op->StartLoc = S;
-    Op->EndLoc = S;
-    return Op;
-  }
-
-  static MBlazeOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
-    MBlazeOperand *Op = new MBlazeOperand(Register);
-    Op->Reg.RegNum = RegNum;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static MBlazeOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
-    MBlazeOperand *Op = new MBlazeOperand(Immediate);
-    Op->Imm.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static MBlazeOperand *CreateFslImm(const MCExpr *Val, SMLoc S, SMLoc E) {
-    MBlazeOperand *Op = new MBlazeOperand(Fsl);
-    Op->Imm.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static MBlazeOperand *CreateMem(unsigned Base, const MCExpr *Off, SMLoc S,
-                                  SMLoc E) {
-    MBlazeOperand *Op = new MBlazeOperand(Memory);
-    Op->Mem.Base = Base;
-    Op->Mem.Off = Off;
-    Op->Mem.OffReg = 0;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static MBlazeOperand *CreateMem(unsigned Base, unsigned Off, SMLoc S,
-                                  SMLoc E) {
-    MBlazeOperand *Op = new MBlazeOperand(Memory);
-    Op->Mem.Base = Base;
-    Op->Mem.OffReg = Off;
-    Op->Mem.Off = 0;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-};
-
-} // end anonymous namespace.
-
-void MBlazeOperand::print(raw_ostream &OS) const {
-  switch (Kind) {
-  case Immediate:
-    getImm()->print(OS);
-    break;
-  case Register:
-    OS << "<register R";
-    OS << getMBlazeRegisterNumbering(getReg()) << ">";
-    break;
-  case Token:
-    OS << "'" << getToken() << "'";
-    break;
-  case Memory: {
-    OS << "<memory R";
-    OS << getMBlazeRegisterNumbering(getMemBase());
-    OS << ", ";
-
-    unsigned RegOff = getMemOffReg();
-    if (RegOff)
-      OS << "R" << getMBlazeRegisterNumbering(RegOff);
-    else
-      OS << getMemOff();
-    OS << ">";
-    }
-    break;
-  case Fsl:
-    getFslImm()->print(OS);
-    break;
-  }
-}
-
-/// @name Auto-generated Match Functions
-/// {
-
-static unsigned MatchRegisterName(StringRef Name);
-
-/// }
-//
-bool MBlazeAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                        MCStreamer &Out, unsigned &ErrorInfo,
-                        bool MatchingInlineAsm) {
-  MCInst Inst;
-  switch (MatchInstructionImpl(Operands, Inst, ErrorInfo,
-                               MatchingInlineAsm)) {
-  default: break;
-  case Match_Success:
-    Out.EmitInstruction(Inst);
-    return false;
-  case Match_MissingFeature:
-    return Error(IDLoc, "instruction use requires an option to be enabled");
-  case Match_MnemonicFail:
-      return Error(IDLoc, "unrecognized instruction mnemonic");
-  case Match_InvalidOperand: {
-    SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
-      if (ErrorInfo >= Operands.size())
-        return Error(IDLoc, "too few operands for instruction");
-
-      ErrorLoc = ((MBlazeOperand*)Operands[ErrorInfo])->getStartLoc();
-      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
-    }
-
-    return Error(ErrorLoc, "invalid operand for instruction");
-  }
-  }
-
-  llvm_unreachable("Implement any new match types added!");
-}
-
-MBlazeOperand *MBlazeAsmParser::
-ParseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  if (Operands.size() != 4)
-    return 0;
-
-  MBlazeOperand &Base = *(MBlazeOperand*)Operands[2];
-  MBlazeOperand &Offset = *(MBlazeOperand*)Operands[3];
-
-  SMLoc S = Base.getStartLoc();
-  SMLoc O = Offset.getStartLoc();
-  SMLoc E = Offset.getEndLoc();
-
-  if (!Base.isReg()) {
-    Error(S, "base address must be a register");
-    return 0;
-  }
-
-  if (!Offset.isReg() && !Offset.isImm()) {
-    Error(O, "offset must be a register or immediate");
-    return 0;
-  }
-
-  MBlazeOperand *Op;
-  if (Offset.isReg())
-    Op = MBlazeOperand::CreateMem(Base.getReg(), Offset.getReg(), S, E);
-  else
-    Op = MBlazeOperand::CreateMem(Base.getReg(), Offset.getImm(), S, E);
-
-  delete Operands.pop_back_val();
-  delete Operands.pop_back_val();
-  Operands.push_back(Op);
-
-  return Op;
-}
-
-bool MBlazeAsmParser::ParseRegister(unsigned &RegNo,
-                                    SMLoc &StartLoc, SMLoc &EndLoc) {
-  MBlazeOperand *Reg = ParseRegister(StartLoc, EndLoc);
-  if (!Reg)
-    return true;
-  RegNo = Reg->getReg();
-  return false;
-}
-
-MBlazeOperand *MBlazeAsmParser::ParseRegister() {
-  SMLoc S, E;
-  return ParseRegister(S, E);
-}
-
-MBlazeOperand *MBlazeAsmParser::ParseRegister(SMLoc &StartLoc, SMLoc &EndLoc) {
-  StartLoc = Parser.getTok().getLoc();
-  EndLoc = Parser.getTok().getEndLoc();
-
-  if (getLexer().getKind() != AsmToken::Identifier)
-    return 0;
-
-  unsigned RegNo = MatchRegisterName(getLexer().getTok().getIdentifier());
-  if (RegNo == 0)
-    return 0;
-
-  getLexer().Lex();
-  return MBlazeOperand::CreateReg(RegNo, StartLoc, EndLoc);
-}
-
-static unsigned MatchFslRegister(StringRef String) {
-  if (!String.startswith("rfsl"))
-    return -1;
-
-  unsigned regNum;
-  if (String.substr(4).getAsInteger(10,regNum))
-    return -1;
-
-  return regNum;
-}
-
-MBlazeOperand *MBlazeAsmParser::ParseFsl() {
-  SMLoc S = Parser.getTok().getLoc();
-  SMLoc E = Parser.getTok().getEndLoc();
-
-  switch (getLexer().getKind()) {
-  default: return 0;
-  case AsmToken::Identifier:
-    unsigned reg = MatchFslRegister(getLexer().getTok().getIdentifier());
-    if (reg >= 16)
-      return 0;
-
-    getLexer().Lex();
-    const MCExpr *EVal = MCConstantExpr::Create(reg,getContext());
-    return MBlazeOperand::CreateFslImm(EVal,S,E);
-  }
-}
-
-MBlazeOperand *MBlazeAsmParser::ParseImmediate() {
-  SMLoc S = Parser.getTok().getLoc();
-  SMLoc E = Parser.getTok().getEndLoc();
-
-  const MCExpr *EVal;
-  switch (getLexer().getKind()) {
-  default: return 0;
-  case AsmToken::LParen:
-  case AsmToken::Plus:
-  case AsmToken::Minus:
-  case AsmToken::Integer:
-  case AsmToken::Identifier:
-    if (getParser().parseExpression(EVal))
-      return 0;
-
-    return MBlazeOperand::CreateImm(EVal, S, E);
-  }
-}
-
-MBlazeOperand *MBlazeAsmParser::
-ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  MBlazeOperand *Op;
-
-  // Attempt to parse the next token as a register name
-  Op = ParseRegister();
-
-  // Attempt to parse the next token as an FSL immediate
-  if (!Op)
-    Op = ParseFsl();
-
-  // Attempt to parse the next token as an immediate
-  if (!Op)
-    Op = ParseImmediate();
-
-  // If the token could not be parsed then fail
-  if (!Op) {
-    Error(Parser.getTok().getLoc(), "unknown operand");
-    return 0;
-  }
-
-  // Push the parsed operand into the list of operands
-  Operands.push_back(Op);
-  return Op;
-}
-
-/// Parse an mblaze instruction mnemonic followed by its operands.
-bool MBlazeAsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
-                 SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // The first operands is the token for the instruction name
-  size_t dotLoc = Name.find('.');
-  Operands.push_back(MBlazeOperand::CreateToken(Name.substr(0,dotLoc),NameLoc));
-  if (dotLoc < Name.size())
-    Operands.push_back(MBlazeOperand::CreateToken(Name.substr(dotLoc),NameLoc));
-
-  // If there are no more operands then finish
-  if (getLexer().is(AsmToken::EndOfStatement))
-    return false;
-
-  // Parse the first operand
-  if (!ParseOperand(Operands))
-    return true;
-
-  while (getLexer().isNot(AsmToken::EndOfStatement) &&
-         getLexer().is(AsmToken::Comma)) {
-    // Consume the comma token
-    getLexer().Lex();
-
-    // Parse the next operand
-    if (!ParseOperand(Operands))
-      return true;
-  }
-
-  // If the instruction requires a memory operand then we need to
-  // replace the last two operands (base+offset) with a single
-  // memory operand.
-  if (Name.startswith("lw") || Name.startswith("sw") ||
-      Name.startswith("lh") || Name.startswith("sh") ||
-      Name.startswith("lb") || Name.startswith("sb"))
-    return (ParseMemory(Operands) == NULL);
-
-  return false;
-}
-
-/// ParseDirective parses the MBlaze specific directives
-bool MBlazeAsmParser::ParseDirective(AsmToken DirectiveID) {
-  StringRef IDVal = DirectiveID.getIdentifier();
-  if (IDVal == ".word")
-    return ParseDirectiveWord(2, DirectiveID.getLoc());
-  return true;
-}
-
-/// ParseDirectiveWord
-///  ::= .word [ expression (, expression)* ]
-bool MBlazeAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    for (;;) {
-      const MCExpr *Value;
-      if (getParser().parseExpression(Value))
-        return true;
-
-      getParser().getStreamer().EmitValue(Value, Size);
-
-      if (getLexer().is(AsmToken::EndOfStatement))
-        break;
-
-      // FIXME: Improve diagnostic.
-      if (getLexer().isNot(AsmToken::Comma))
-        return Error(L, "unexpected token in directive");
-      Parser.Lex();
-    }
-  }
-
-  Parser.Lex();
-  return false;
-}
-
-/// Force static initialization.
-extern "C" void LLVMInitializeMBlazeAsmParser() {
-  RegisterMCAsmParser<MBlazeAsmParser> X(TheMBlazeTarget);
-}
-
-#define GET_REGISTER_MATCHER
-#define GET_MATCHER_IMPLEMENTATION
-#include "MBlazeGenAsmMatcher.inc"
diff --git a/lib/Target/MBlaze/AsmParser/Makefile b/lib/Target/MBlaze/AsmParser/Makefile
deleted file mode 100644
index 1e68766..0000000
--- a/lib/Target/MBlaze/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/MBlaze/AsmParser/Makefile ----------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMBlazeAsmParser
-
-# Hack: we need to include 'main' MBlaze target directory for private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt
deleted file mode 100644
index 91a41f3..0000000
--- a/lib/Target/MBlaze/CMakeLists.txt
+++ /dev/null
@@ -1,37 +0,0 @@
-set(LLVM_TARGET_DEFINITIONS MBlaze.td)
-
-tablegen(LLVM MBlazeGenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM MBlazeGenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM MBlazeGenCodeEmitter.inc -gen-emitter)
-tablegen(LLVM MBlazeGenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM MBlazeGenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM MBlazeGenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM MBlazeGenCallingConv.inc -gen-callingconv)
-tablegen(LLVM MBlazeGenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM MBlazeGenIntrinsics.inc -gen-tgt-intrinsic)
-add_public_tablegen_target(MBlazeCommonTableGen)
-
-add_llvm_target(MBlazeCodeGen
-  MBlazeDelaySlotFiller.cpp
-  MBlazeInstrInfo.cpp
-  MBlazeISelDAGToDAG.cpp
-  MBlazeISelLowering.cpp
-  MBlazeFrameLowering.cpp
-  MBlazeMachineFunction.cpp
-  MBlazeRegisterInfo.cpp
-  MBlazeSubtarget.cpp
-  MBlazeTargetMachine.cpp
-  MBlazeTargetObjectFile.cpp
-  MBlazeIntrinsicInfo.cpp
-  MBlazeSelectionDAGInfo.cpp
-  MBlazeAsmPrinter.cpp
-  MBlazeMCInstLower.cpp
-  )
-
-add_dependencies(LLVMMBlazeCodeGen intrinsics_gen)
-
-add_subdirectory(AsmParser)
-add_subdirectory(Disassembler)
-add_subdirectory(InstPrinter)
-add_subdirectory(TargetInfo)
-add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/MBlaze/Disassembler/CMakeLists.txt b/lib/Target/MBlaze/Disassembler/CMakeLists.txt
deleted file mode 100644
index be2dce1..0000000
--- a/lib/Target/MBlaze/Disassembler/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
-                     ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMMBlazeDisassembler
-  MBlazeDisassembler.cpp
-  )
-
-# workaround for hanging compilation on MSVC9 and 10
-if( MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
-set_property(
-  SOURCE MBlazeDisassembler.cpp
-  PROPERTY COMPILE_FLAGS "/Od"
-  )
-endif()
-
-add_dependencies(LLVMMBlazeDisassembler MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
deleted file mode 100644
index c03ab38..0000000
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.cpp
+++ /dev/null
@@ -1,719 +0,0 @@
-//===-- MBlazeDisassembler.cpp - Disassembler for MicroBlaze  -------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the MBlaze Disassembler. It contains code to translate
-// the data produced by the decoder into MCInsts.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeDisassembler.h"
-#include "MBlaze.h"
-#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryObject.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-
-// #include "MBlazeGenDecoderTables.inc"
-// #include "MBlazeGenRegisterNames.inc"
-
-namespace llvm {
-extern const MCInstrDesc MBlazeInsts[];
-}
-
-using namespace llvm;
-
-const uint16_t UNSUPPORTED = -1;
-
-static const uint16_t mblazeBinary2Opcode[] = {
-  MBlaze::ADD,   MBlaze::RSUB,   MBlaze::ADDC,   MBlaze::RSUBC,   //00,01,02,03
-  MBlaze::ADDK,  MBlaze::RSUBK,  MBlaze::ADDKC,  MBlaze::RSUBKC,  //04,05,06,07
-  MBlaze::ADDI,  MBlaze::RSUBI,  MBlaze::ADDIC,  MBlaze::RSUBIC,  //08,09,0A,0B
-  MBlaze::ADDIK, MBlaze::RSUBIK, MBlaze::ADDIKC, MBlaze::RSUBIKC, //0C,0D,0E,0F
-
-  MBlaze::MUL,   MBlaze::BSRL,   MBlaze::IDIV,   MBlaze::GETD,    //10,11,12,13
-  UNSUPPORTED,   UNSUPPORTED,    MBlaze::FADD,   UNSUPPORTED,     //14,15,16,17
-  MBlaze::MULI,  MBlaze::BSRLI,  UNSUPPORTED,    MBlaze::GET,     //18,19,1A,1B
-  UNSUPPORTED,   UNSUPPORTED,    UNSUPPORTED,    UNSUPPORTED,     //1C,1D,1E,1F
-
-  MBlaze::OR,    MBlaze::AND,    MBlaze::XOR,    MBlaze::ANDN,    //20,21,22,23
-  MBlaze::SEXT8, MBlaze::MFS,    MBlaze::BR,     MBlaze::BEQ,     //24,25,26,27
-  MBlaze::ORI,   MBlaze::ANDI,   MBlaze::XORI,   MBlaze::ANDNI,   //28,29,2A,2B
-  MBlaze::IMM,   MBlaze::RTSD,   MBlaze::BRI,    MBlaze::BEQI,    //2C,2D,2E,2F
-
-  MBlaze::LBU,   MBlaze::LHU,    MBlaze::LW,     UNSUPPORTED,     //30,31,32,33
-  MBlaze::SB,    MBlaze::SH,     MBlaze::SW,     UNSUPPORTED,     //34,35,36,37
-  MBlaze::LBUI,  MBlaze::LHUI,   MBlaze::LWI,    UNSUPPORTED,     //38,39,3A,3B
-  MBlaze::SBI,   MBlaze::SHI,    MBlaze::SWI,    UNSUPPORTED,     //3C,3D,3E,3F
-};
-
-static unsigned getRD(uint32_t insn) {
-  if (!isMBlazeRegister((insn>>21)&0x1F))
-    return UNSUPPORTED;
-  return getMBlazeRegisterFromNumbering((insn>>21)&0x1F);
-}
-
-static unsigned getRA(uint32_t insn) {
-  if (!getMBlazeRegisterFromNumbering((insn>>16)&0x1F))
-    return UNSUPPORTED;
-  return getMBlazeRegisterFromNumbering((insn>>16)&0x1F);
-}
-
-static unsigned getRB(uint32_t insn) {
-  if (!getMBlazeRegisterFromNumbering((insn>>11)&0x1F))
-    return UNSUPPORTED;
-  return getMBlazeRegisterFromNumbering((insn>>11)&0x1F);
-}
-
-static int64_t getRS(uint32_t insn) {
-  if (!isSpecialMBlazeRegister(insn&0x3FFF))
-    return UNSUPPORTED;
-  return getSpecialMBlazeRegisterFromNumbering(insn&0x3FFF);
-}
-
-static int64_t getIMM(uint32_t insn) {
-    int16_t val = (insn & 0xFFFF);
-    return val;
-}
-
-static int64_t getSHT(uint32_t insn) {
-    int16_t val = (insn & 0x1F);
-    return val;
-}
-
-static unsigned getFLAGS(int32_t insn) {
-    return (insn & 0x7FF);
-}
-
-static int64_t getFSL(uint32_t insn) {
-    int16_t val = (insn & 0xF);
-    return val;
-}
-
-static unsigned decodeMUL(uint32_t insn) {
-    switch (getFLAGS(insn)) {
-    default: return UNSUPPORTED;
-    case 0:  return MBlaze::MUL;
-    case 1:  return MBlaze::MULH;
-    case 2:  return MBlaze::MULHSU;
-    case 3:  return MBlaze::MULHU;
-    }
-}
-
-static unsigned decodeSEXT(uint32_t insn) {
-    switch (insn&0x7FF) {
-    default:   return UNSUPPORTED;
-    case 0x60: return MBlaze::SEXT8;
-    case 0x68: return MBlaze::WIC;
-    case 0x64: return MBlaze::WDC;
-    case 0x66: return MBlaze::WDCC;
-    case 0x74: return MBlaze::WDCF;
-    case 0x61: return MBlaze::SEXT16;
-    case 0x41: return MBlaze::SRL;
-    case 0x21: return MBlaze::SRC;
-    case 0x01: return MBlaze::SRA;
-    case 0xE0: return MBlaze::CLZ;
-    }
-}
-
-static unsigned decodeBEQ(uint32_t insn) {
-    switch ((insn>>21)&0x1F) {
-    default:    return UNSUPPORTED;
-    case 0x00:  return MBlaze::BEQ;
-    case 0x10:  return MBlaze::BEQD;
-    case 0x05:  return MBlaze::BGE;
-    case 0x15:  return MBlaze::BGED;
-    case 0x04:  return MBlaze::BGT;
-    case 0x14:  return MBlaze::BGTD;
-    case 0x03:  return MBlaze::BLE;
-    case 0x13:  return MBlaze::BLED;
-    case 0x02:  return MBlaze::BLT;
-    case 0x12:  return MBlaze::BLTD;
-    case 0x01:  return MBlaze::BNE;
-    case 0x11:  return MBlaze::BNED;
-    }
-}
-
-static unsigned decodeBEQI(uint32_t insn) {
-    switch ((insn>>21)&0x1F) {
-    default:    return UNSUPPORTED;
-    case 0x00:  return MBlaze::BEQI;
-    case 0x10:  return MBlaze::BEQID;
-    case 0x05:  return MBlaze::BGEI;
-    case 0x15:  return MBlaze::BGEID;
-    case 0x04:  return MBlaze::BGTI;
-    case 0x14:  return MBlaze::BGTID;
-    case 0x03:  return MBlaze::BLEI;
-    case 0x13:  return MBlaze::BLEID;
-    case 0x02:  return MBlaze::BLTI;
-    case 0x12:  return MBlaze::BLTID;
-    case 0x01:  return MBlaze::BNEI;
-    case 0x11:  return MBlaze::BNEID;
-    }
-}
-
-static unsigned decodeBR(uint32_t insn) {
-    switch ((insn>>16)&0x1F) {
-    default:   return UNSUPPORTED;
-    case 0x00: return MBlaze::BR;
-    case 0x08: return MBlaze::BRA;
-    case 0x0C: return MBlaze::BRK;
-    case 0x10: return MBlaze::BRD;
-    case 0x14: return MBlaze::BRLD;
-    case 0x18: return MBlaze::BRAD;
-    case 0x1C: return MBlaze::BRALD;
-    }
-}
-
-static unsigned decodeBRI(uint32_t insn) {
-    switch (insn&0x3FFFFFF) {
-    default:        break;
-    case 0x0020004: return MBlaze::IDMEMBAR;
-    case 0x0220004: return MBlaze::DMEMBAR;
-    case 0x0420004: return MBlaze::IMEMBAR;
-    }
-
-    switch ((insn>>16)&0x1F) {
-    default:   return UNSUPPORTED;
-    case 0x00: return MBlaze::BRI;
-    case 0x08: return MBlaze::BRAI;
-    case 0x0C: return MBlaze::BRKI;
-    case 0x10: return MBlaze::BRID;
-    case 0x14: return MBlaze::BRLID;
-    case 0x18: return MBlaze::BRAID;
-    case 0x1C: return MBlaze::BRALID;
-    }
-}
-
-static unsigned decodeBSRL(uint32_t insn) {
-    switch ((insn>>9)&0x3) {
-    default:  return UNSUPPORTED;
-    case 0x2: return MBlaze::BSLL;
-    case 0x1: return MBlaze::BSRA;
-    case 0x0: return MBlaze::BSRL;
-    }
-}
-
-static unsigned decodeBSRLI(uint32_t insn) {
-    switch ((insn>>9)&0x3) {
-    default:  return UNSUPPORTED;
-    case 0x2: return MBlaze::BSLLI;
-    case 0x1: return MBlaze::BSRAI;
-    case 0x0: return MBlaze::BSRLI;
-    }
-}
-
-static unsigned decodeRSUBK(uint32_t insn) {
-    switch (getFLAGS(insn)) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::RSUBK;
-    case 0x1: return MBlaze::CMP;
-    case 0x3: return MBlaze::CMPU;
-    }
-}
-
-static unsigned decodeFADD(uint32_t insn) {
-    switch (getFLAGS(insn)) {
-    default:    return UNSUPPORTED;
-    case 0x000: return MBlaze::FADD;
-    case 0x080: return MBlaze::FRSUB;
-    case 0x100: return MBlaze::FMUL;
-    case 0x180: return MBlaze::FDIV;
-    case 0x200: return MBlaze::FCMP_UN;
-    case 0x210: return MBlaze::FCMP_LT;
-    case 0x220: return MBlaze::FCMP_EQ;
-    case 0x230: return MBlaze::FCMP_LE;
-    case 0x240: return MBlaze::FCMP_GT;
-    case 0x250: return MBlaze::FCMP_NE;
-    case 0x260: return MBlaze::FCMP_GE;
-    case 0x280: return MBlaze::FLT;
-    case 0x300: return MBlaze::FINT;
-    case 0x380: return MBlaze::FSQRT;
-    }
-}
-
-static unsigned decodeGET(uint32_t insn) {
-    switch ((insn>>10)&0x3F) {
-    default:   return UNSUPPORTED;
-    case 0x00: return MBlaze::GET;
-    case 0x01: return MBlaze::EGET;
-    case 0x02: return MBlaze::AGET;
-    case 0x03: return MBlaze::EAGET;
-    case 0x04: return MBlaze::TGET;
-    case 0x05: return MBlaze::TEGET;
-    case 0x06: return MBlaze::TAGET;
-    case 0x07: return MBlaze::TEAGET;
-    case 0x08: return MBlaze::CGET;
-    case 0x09: return MBlaze::ECGET;
-    case 0x0A: return MBlaze::CAGET;
-    case 0x0B: return MBlaze::ECAGET;
-    case 0x0C: return MBlaze::TCGET;
-    case 0x0D: return MBlaze::TECGET;
-    case 0x0E: return MBlaze::TCAGET;
-    case 0x0F: return MBlaze::TECAGET;
-    case 0x10: return MBlaze::NGET;
-    case 0x11: return MBlaze::NEGET;
-    case 0x12: return MBlaze::NAGET;
-    case 0x13: return MBlaze::NEAGET;
-    case 0x14: return MBlaze::TNGET;
-    case 0x15: return MBlaze::TNEGET;
-    case 0x16: return MBlaze::TNAGET;
-    case 0x17: return MBlaze::TNEAGET;
-    case 0x18: return MBlaze::NCGET;
-    case 0x19: return MBlaze::NECGET;
-    case 0x1A: return MBlaze::NCAGET;
-    case 0x1B: return MBlaze::NECAGET;
-    case 0x1C: return MBlaze::TNCGET;
-    case 0x1D: return MBlaze::TNECGET;
-    case 0x1E: return MBlaze::TNCAGET;
-    case 0x1F: return MBlaze::TNECAGET;
-    case 0x20: return MBlaze::PUT;
-    case 0x22: return MBlaze::APUT;
-    case 0x24: return MBlaze::TPUT;
-    case 0x26: return MBlaze::TAPUT;
-    case 0x28: return MBlaze::CPUT;
-    case 0x2A: return MBlaze::CAPUT;
-    case 0x2C: return MBlaze::TCPUT;
-    case 0x2E: return MBlaze::TCAPUT;
-    case 0x30: return MBlaze::NPUT;
-    case 0x32: return MBlaze::NAPUT;
-    case 0x34: return MBlaze::TNPUT;
-    case 0x36: return MBlaze::TNAPUT;
-    case 0x38: return MBlaze::NCPUT;
-    case 0x3A: return MBlaze::NCAPUT;
-    case 0x3C: return MBlaze::TNCPUT;
-    case 0x3E: return MBlaze::TNCAPUT;
-    }
-}
-
-static unsigned decodeGETD(uint32_t insn) {
-    switch ((insn>>5)&0x3F) {
-    default:   return UNSUPPORTED;
-    case 0x00: return MBlaze::GETD;
-    case 0x01: return MBlaze::EGETD;
-    case 0x02: return MBlaze::AGETD;
-    case 0x03: return MBlaze::EAGETD;
-    case 0x04: return MBlaze::TGETD;
-    case 0x05: return MBlaze::TEGETD;
-    case 0x06: return MBlaze::TAGETD;
-    case 0x07: return MBlaze::TEAGETD;
-    case 0x08: return MBlaze::CGETD;
-    case 0x09: return MBlaze::ECGETD;
-    case 0x0A: return MBlaze::CAGETD;
-    case 0x0B: return MBlaze::ECAGETD;
-    case 0x0C: return MBlaze::TCGETD;
-    case 0x0D: return MBlaze::TECGETD;
-    case 0x0E: return MBlaze::TCAGETD;
-    case 0x0F: return MBlaze::TECAGETD;
-    case 0x10: return MBlaze::NGETD;
-    case 0x11: return MBlaze::NEGETD;
-    case 0x12: return MBlaze::NAGETD;
-    case 0x13: return MBlaze::NEAGETD;
-    case 0x14: return MBlaze::TNGETD;
-    case 0x15: return MBlaze::TNEGETD;
-    case 0x16: return MBlaze::TNAGETD;
-    case 0x17: return MBlaze::TNEAGETD;
-    case 0x18: return MBlaze::NCGETD;
-    case 0x19: return MBlaze::NECGETD;
-    case 0x1A: return MBlaze::NCAGETD;
-    case 0x1B: return MBlaze::NECAGETD;
-    case 0x1C: return MBlaze::TNCGETD;
-    case 0x1D: return MBlaze::TNECGETD;
-    case 0x1E: return MBlaze::TNCAGETD;
-    case 0x1F: return MBlaze::TNECAGETD;
-    case 0x20: return MBlaze::PUTD;
-    case 0x22: return MBlaze::APUTD;
-    case 0x24: return MBlaze::TPUTD;
-    case 0x26: return MBlaze::TAPUTD;
-    case 0x28: return MBlaze::CPUTD;
-    case 0x2A: return MBlaze::CAPUTD;
-    case 0x2C: return MBlaze::TCPUTD;
-    case 0x2E: return MBlaze::TCAPUTD;
-    case 0x30: return MBlaze::NPUTD;
-    case 0x32: return MBlaze::NAPUTD;
-    case 0x34: return MBlaze::TNPUTD;
-    case 0x36: return MBlaze::TNAPUTD;
-    case 0x38: return MBlaze::NCPUTD;
-    case 0x3A: return MBlaze::NCAPUTD;
-    case 0x3C: return MBlaze::TNCPUTD;
-    case 0x3E: return MBlaze::TNCAPUTD;
-    }
-}
-
-static unsigned decodeIDIV(uint32_t insn) {
-    switch (insn&0x3) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::IDIV;
-    case 0x2: return MBlaze::IDIVU;
-    }
-}
-
-static unsigned decodeLBU(uint32_t insn) {
-    switch ((insn>>9)&0x1) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::LBU;
-    case 0x1: return MBlaze::LBUR;
-    }
-}
-
-static unsigned decodeLHU(uint32_t insn) {
-    switch ((insn>>9)&0x1) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::LHU;
-    case 0x1: return MBlaze::LHUR;
-    }
-}
-
-static unsigned decodeLW(uint32_t insn) {
-    switch ((insn>>9)&0x3) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::LW;
-    case 0x1: return MBlaze::LWR;
-    case 0x2: return MBlaze::LWX;
-    }
-}
-
-static unsigned decodeSB(uint32_t insn) {
-    switch ((insn>>9)&0x1) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::SB;
-    case 0x1: return MBlaze::SBR;
-    }
-}
-
-static unsigned decodeSH(uint32_t insn) {
-    switch ((insn>>9)&0x1) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::SH;
-    case 0x1: return MBlaze::SHR;
-    }
-}
-
-static unsigned decodeSW(uint32_t insn) {
-    switch ((insn>>9)&0x3) {
-    default:  return UNSUPPORTED;
-    case 0x0: return MBlaze::SW;
-    case 0x1: return MBlaze::SWR;
-    case 0x2: return MBlaze::SWX;
-    }
-}
-
-static unsigned decodeMFS(uint32_t insn) {
-    switch ((insn>>15)&0x1) {
-    default:   return UNSUPPORTED;
-    case 0x0:
-      switch ((insn>>16)&0x1) {
-      default:   return UNSUPPORTED;
-      case 0x0: return MBlaze::MSRSET;
-      case 0x1: return MBlaze::MSRCLR;
-      }
-    case 0x1:
-      switch ((insn>>14)&0x1) {
-      default:   return UNSUPPORTED;
-      case 0x0: return MBlaze::MFS;
-      case 0x1: return MBlaze::MTS;
-      }
-    }
-}
-
-static unsigned decodeOR(uint32_t insn) {
-    switch (getFLAGS(insn)) {
-    default:    return UNSUPPORTED;
-    case 0x000: return MBlaze::OR;
-    case 0x400: return MBlaze::PCMPBF;
-    }
-}
-
-static unsigned decodeXOR(uint32_t insn) {
-    switch (getFLAGS(insn)) {
-    default:    return UNSUPPORTED;
-    case 0x000: return MBlaze::XOR;
-    case 0x400: return MBlaze::PCMPEQ;
-    }
-}
-
-static unsigned decodeANDN(uint32_t insn) {
-    switch (getFLAGS(insn)) {
-    default:    return UNSUPPORTED;
-    case 0x000: return MBlaze::ANDN;
-    case 0x400: return MBlaze::PCMPNE;
-    }
-}
-
-static unsigned decodeRTSD(uint32_t insn) {
-    switch ((insn>>21)&0x1F) {
-    default:   return UNSUPPORTED;
-    case 0x10: return MBlaze::RTSD;
-    case 0x11: return MBlaze::RTID;
-    case 0x12: return MBlaze::RTBD;
-    case 0x14: return MBlaze::RTED;
-    }
-}
-
-static unsigned getOPCODE(uint32_t insn) {
-  unsigned opcode = mblazeBinary2Opcode[ (insn>>26)&0x3F ];
-  switch (opcode) {
-  case MBlaze::MUL:     return decodeMUL(insn);
-  case MBlaze::SEXT8:   return decodeSEXT(insn);
-  case MBlaze::BEQ:     return decodeBEQ(insn);
-  case MBlaze::BEQI:    return decodeBEQI(insn);
-  case MBlaze::BR:      return decodeBR(insn);
-  case MBlaze::BRI:     return decodeBRI(insn);
-  case MBlaze::BSRL:    return decodeBSRL(insn);
-  case MBlaze::BSRLI:   return decodeBSRLI(insn);
-  case MBlaze::RSUBK:   return decodeRSUBK(insn);
-  case MBlaze::FADD:    return decodeFADD(insn);
-  case MBlaze::GET:     return decodeGET(insn);
-  case MBlaze::GETD:    return decodeGETD(insn);
-  case MBlaze::IDIV:    return decodeIDIV(insn);
-  case MBlaze::LBU:     return decodeLBU(insn);
-  case MBlaze::LHU:     return decodeLHU(insn);
-  case MBlaze::LW:      return decodeLW(insn);
-  case MBlaze::SB:      return decodeSB(insn);
-  case MBlaze::SH:      return decodeSH(insn);
-  case MBlaze::SW:      return decodeSW(insn);
-  case MBlaze::MFS:     return decodeMFS(insn);
-  case MBlaze::OR:      return decodeOR(insn);
-  case MBlaze::XOR:     return decodeXOR(insn);
-  case MBlaze::ANDN:    return decodeANDN(insn);
-  case MBlaze::RTSD:    return decodeRTSD(insn);
-  default:              return opcode;
-  }
-}
-
-//
-// Public interface for the disassembler
-//
-
-MCDisassembler::DecodeStatus MBlazeDisassembler::getInstruction(MCInst &instr,
-                                        uint64_t &size,
-                                        const MemoryObject &region,
-                                        uint64_t address,
-                                        raw_ostream &vStream,
-                                        raw_ostream &cStream) const {
-  // The machine instruction.
-  uint32_t insn;
-  uint64_t read;
-  uint8_t bytes[4];
-
-  // By default we consume 1 byte on failure
-  size = 1;
-
-  // We want to read exactly 4 bytes of data.
-  if (region.readBytes(address, 4, (uint8_t*)bytes, &read) == -1 || read < 4)
-    return Fail;
-
-  // Encoded as a big-endian 32-bit word in the stream.
-  insn = (bytes[0]<<24) | (bytes[1]<<16) | (bytes[2]<< 8) | (bytes[3]<<0);
-
-  // Get the MCInst opcode from the binary instruction and make sure
-  // that it is a valid instruction.
-  unsigned opcode = getOPCODE(insn);
-  if (opcode == UNSUPPORTED)
-    return Fail;
-
-  instr.setOpcode(opcode);
-
-  unsigned RD = getRD(insn);
-  unsigned RA = getRA(insn);
-  unsigned RB = getRB(insn);
-  unsigned RS = getRS(insn);
-
-  uint64_t tsFlags = MBlazeInsts[opcode].TSFlags;
-  switch ((tsFlags & MBlazeII::FormMask)) {
-  default: 
-    return Fail;
-
-  case MBlazeII::FC:
-    break;
-
-  case MBlazeII::FRRRR:
-    if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RB));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    break;
-
-  case MBlazeII::FRRR:
-    if (RD == UNSUPPORTED || RA == UNSUPPORTED || RB == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    instr.addOperand(MCOperand::CreateReg(RB));
-    break;
-
-  case MBlazeII::FRR:
-    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    break;
-
-  case MBlazeII::FRI:
-    switch (opcode) {
-    default: 
-      return Fail;
-    case MBlaze::MFS:
-      if (RD == UNSUPPORTED)
-        return Fail;
-      instr.addOperand(MCOperand::CreateReg(RD));
-      instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
-      break;
-    case MBlaze::MTS:
-      if (RA == UNSUPPORTED)
-        return Fail;
-      instr.addOperand(MCOperand::CreateImm(insn&0x3FFF));
-      instr.addOperand(MCOperand::CreateReg(RA));
-      break;
-    case MBlaze::MSRSET:
-    case MBlaze::MSRCLR:
-      if (RD == UNSUPPORTED)
-        return Fail;
-      instr.addOperand(MCOperand::CreateReg(RD));
-      instr.addOperand(MCOperand::CreateImm(insn&0x7FFF));
-      break;
-    }
-    break;
-
-  case MBlazeII::FRRI:
-    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    switch (opcode) {
-    default:
-      instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
-      break;
-    case MBlaze::BSRLI:
-    case MBlaze::BSRAI:
-    case MBlaze::BSLLI:
-      instr.addOperand(MCOperand::CreateImm(insn&0x1F));
-      break;
-    }
-    break;
-
-  case MBlazeII::FCRR:
-    if (RA == UNSUPPORTED || RB == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RA));
-    instr.addOperand(MCOperand::CreateReg(RB));
-    break;
-
-  case MBlazeII::FCRI:
-    if (RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RA));
-    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
-    break;
-
-  case MBlazeII::FRCR:
-    if (RD == UNSUPPORTED || RB == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RB));
-    break;
-
-  case MBlazeII::FRCI:
-    if (RD == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
-    break;
-
-  case MBlazeII::FCCR:
-    if (RB == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RB));
-    break;
-
-  case MBlazeII::FCCI:
-    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
-    break;
-
-  case MBlazeII::FRRCI:
-    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    instr.addOperand(MCOperand::CreateImm(getSHT(insn)));
-    break;
-
-  case MBlazeII::FRRC:
-    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    break;
-
-  case MBlazeII::FRCX:
-    if (RD == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
-    break;
-
-  case MBlazeII::FRCS:
-    if (RD == UNSUPPORTED || RS == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateReg(RS));
-    break;
-
-  case MBlazeII::FCRCS:
-    if (RS == UNSUPPORTED || RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RS));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    break;
-
-  case MBlazeII::FCRCX:
-    if (RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RA));
-    instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
-    break;
-
-  case MBlazeII::FCX:
-    instr.addOperand(MCOperand::CreateImm(getFSL(insn)));
-    break;
-
-  case MBlazeII::FCR:
-    if (RB == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RB));
-    break;
-
-  case MBlazeII::FRIR:
-    if (RD == UNSUPPORTED || RA == UNSUPPORTED)
-      return Fail;
-    instr.addOperand(MCOperand::CreateReg(RD));
-    instr.addOperand(MCOperand::CreateImm(getIMM(insn)));
-    instr.addOperand(MCOperand::CreateReg(RA));
-    break;
-  }
-
-  // We always consume 4 bytes of data on success
-  size = 4;
-
-  return Success;
-}
-
-static MCDisassembler *createMBlazeDisassembler(const Target &T,
-                                                const MCSubtargetInfo &STI) {
-  return new MBlazeDisassembler(STI);
-}
-
-extern "C" void LLVMInitializeMBlazeDisassembler() {
-  // Register the disassembler.
-  TargetRegistry::RegisterMCDisassembler(TheMBlazeTarget,
-                                         createMBlazeDisassembler);
-}
diff --git a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h b/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
deleted file mode 100644
index b8ff8f6..0000000
--- a/lib/Target/MBlaze/Disassembler/MBlazeDisassembler.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===-- MBlazeDisassembler.h - Disassembler for MicroBlaze  -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the MBlaze Disassembler. It it the header for
-// MBlazeDisassembler, a subclass of MCDisassembler.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZEDISASSEMBLER_H
-#define MBLAZEDISASSEMBLER_H
-
-#include "llvm/MC/MCDisassembler.h"
-
-namespace llvm {
-  
-class MCInst;
-class MemoryObject;
-class raw_ostream;
-
-/// MBlazeDisassembler - Disassembler for all MBlaze platforms.
-class MBlazeDisassembler : public MCDisassembler {
-public:
-  /// Constructor     - Initializes the disassembler.
-  ///
-  MBlazeDisassembler(const MCSubtargetInfo &STI) :
-    MCDisassembler(STI) {
-  }
-
-  ~MBlazeDisassembler() {
-  }
-
-  /// getInstruction - See MCDisassembler.
-  MCDisassembler::DecodeStatus getInstruction(MCInst &instr,
-                      uint64_t &size,
-                      const MemoryObject &region,
-                      uint64_t address,
-                      raw_ostream &vStream,
-                      raw_ostream &cStream) const;
-};
-
-} // namespace llvm
-  
-#endif
diff --git a/lib/Target/MBlaze/Disassembler/Makefile b/lib/Target/MBlaze/Disassembler/Makefile
deleted file mode 100644
index 0530b32..0000000
--- a/lib/Target/MBlaze/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/MBlaze/Disassembler/Makefile -------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMBlazeDisassembler
-
-# Hack: we need to include 'main' MBlaze target directory to grab headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
deleted file mode 100644
index 586e2d3..0000000
--- a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
-                     ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMMBlazeAsmPrinter
-  MBlazeInstPrinter.cpp
-  )
-
-add_dependencies(LLVMMBlazeAsmPrinter MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/InstPrinter/LLVMBuild.txt b/lib/Target/MBlaze/InstPrinter/LLVMBuild.txt
deleted file mode 100644
index 3a21a05..0000000
--- a/lib/Target/MBlaze/InstPrinter/LLVMBuild.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-;===- ./lib/Target/MBlaze/InstPrinter/LLVMBuild.txt ------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = MBlazeAsmPrinter
-parent = MBlaze
-required_libraries = MC Support
-add_to_library_groups = MBlaze
diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
deleted file mode 100644
index fc2b3d5..0000000
--- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.cpp
+++ /dev/null
@@ -1,71 +0,0 @@
-//===-- MBlazeInstPrinter.cpp - Convert MBlaze MCInst to assembly syntax --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an MBlaze MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "asm-printer"
-#include "MBlazeInstPrinter.h"
-#include "MBlaze.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-using namespace llvm;
-
-
-// Include the auto-generated portion of the assembly writer.
-#include "MBlazeGenAsmWriter.inc"
-
-void MBlazeInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                  StringRef Annot) {
-  printInstruction(MI, O);
-  printAnnotation(O, Annot);
-}
-
-void MBlazeInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                     raw_ostream &O, const char *Modifier) {
-  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    O << getRegisterName(Op.getReg());
-  } else if (Op.isImm()) {
-    O << (int32_t)Op.getImm();
-  } else {
-    assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << *Op.getExpr();
-  }
-}
-
-void MBlazeInstPrinter::printFSLImm(const MCInst *MI, int OpNo,
-                                    raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNo);
-  if (MO.isImm())
-    O << "rfsl" << MO.getImm();
-  else
-    printOperand(MI, OpNo, O, NULL);
-}
-
-void MBlazeInstPrinter::printUnsignedImm(const MCInst *MI, int OpNo,
-                                        raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNo);
-  if (MO.isImm())
-    O << (uint32_t)MO.getImm();
-  else
-    printOperand(MI, OpNo, O, NULL);
-}
-
-void MBlazeInstPrinter::printMemOperand(const MCInst *MI, int OpNo,
-                                        raw_ostream &O, const char *Modifier) {
-  printOperand(MI, OpNo, O, NULL);
-  O << ", ";
-  printOperand(MI, OpNo+1, O, NULL);
-}
diff --git a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h b/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
deleted file mode 100644
index 51ba7c3..0000000
--- a/lib/Target/MBlaze/InstPrinter/MBlazeInstPrinter.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//= MBlazeInstPrinter.h - Convert MBlaze MCInst to assembly syntax -*- C++ -*-//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints a MBlaze MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZEINSTPRINTER_H
-#define MBLAZEINSTPRINTER_H
-
-#include "llvm/MC/MCInstPrinter.h"
-
-namespace llvm {
-  class MCOperand;
-
-  class MBlazeInstPrinter : public MCInstPrinter {
-  public:
-    MBlazeInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                      const MCRegisterInfo &MRI)
-      : MCInstPrinter(MAI, MII, MRI) {}
-
-    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
-
-    // Autogenerated by tblgen.
-    void printInstruction(const MCInst *MI, raw_ostream &O);
-    static const char *getRegisterName(unsigned RegNo);
-
-    void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
-                      const char *Modifier = 0);
-    void printFSLImm(const MCInst *MI, int OpNo, raw_ostream &O);
-    void printUnsignedImm(const MCInst *MI, int OpNo, raw_ostream &O);
-    void printMemOperand(const MCInst *MI, int OpNo,raw_ostream &O,
-                         const char *Modifier = 0);
-  };
-}
-
-#endif
diff --git a/lib/Target/MBlaze/InstPrinter/Makefile b/lib/Target/MBlaze/InstPrinter/Makefile
deleted file mode 100644
index 9fb6e86..0000000
--- a/lib/Target/MBlaze/InstPrinter/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/MBlaze/AsmPrinter/Makefile ---------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMBlazeAsmPrinter
-
-# Hack: we need to include 'main' MBlaze target directory to grab
-#       private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MBlaze/LLVMBuild.txt b/lib/Target/MBlaze/LLVMBuild.txt
deleted file mode 100644
index 0b29007..0000000
--- a/lib/Target/MBlaze/LLVMBuild.txt
+++ /dev/null
@@ -1,34 +0,0 @@
-;===- ./lib/Target/MBlaze/LLVMBuild.txt ------------------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[common]
-subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
-
-[component_0]
-type = TargetGroup
-name = MBlaze
-parent = Target
-has_asmparser = 1
-has_asmprinter = 1
-has_disassembler = 1
-
-[component_1]
-type = Library
-name = MBlazeCodeGen
-parent = MBlaze
-required_libraries = AsmPrinter CodeGen Core MBlazeAsmPrinter MBlazeDesc MBlazeInfo MC SelectionDAG Support Target
-add_to_library_groups = MBlaze
diff --git a/lib/Target/MBlaze/MBlaze.h b/lib/Target/MBlaze/MBlaze.h
deleted file mode 100644
index 1399b85..0000000
--- a/lib/Target/MBlaze/MBlaze.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===-- MBlaze.h - Top-level interface for MBlaze ---------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the entry points for global functions defined in
-// the LLVM MBlaze back-end.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef TARGET_MBLAZE_H
-#define TARGET_MBLAZE_H
-
-#include "MCTargetDesc/MBlazeBaseInfo.h"
-#include "MCTargetDesc/MBlazeMCTargetDesc.h"
-#include "llvm/Target/TargetMachine.h"
-
-namespace llvm {
-  class MBlazeTargetMachine;
-  class FunctionPass;
-  class MachineCodeEmitter;
-
-  FunctionPass *createMBlazeISelDag(MBlazeTargetMachine &TM);
-  FunctionPass *createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &TM);
-
-} // end namespace llvm;
-
-#endif
diff --git a/lib/Target/MBlaze/MBlaze.td b/lib/Target/MBlaze/MBlaze.td
deleted file mode 100644
index c288855..0000000
--- a/lib/Target/MBlaze/MBlaze.td
+++ /dev/null
@@ -1,73 +0,0 @@
-//===-- MBlaze.td - Describe the MBlaze Target Machine -----*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This is the top level entry point for the MBlaze target.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Target-independent interfaces
-//===----------------------------------------------------------------------===//
-
-include "llvm/Target/Target.td"
-
-//===----------------------------------------------------------------------===//
-// Register File, Calling Conv, Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-include "MBlazeRegisterInfo.td"
-include "MBlazeSchedule.td"
-include "MBlazeIntrinsics.td"
-include "MBlazeInstrInfo.td"
-include "MBlazeCallingConv.td"
-
-def MBlazeInstrInfo : InstrInfo;
-
-//===----------------------------------------------------------------------===//
-// Microblaze Subtarget features                                              //
-//===----------------------------------------------------------------------===//
-
-def FeatureBarrel      : SubtargetFeature<"barrel", "HasBarrel", "true",
-                                "Implements barrel shifter">;
-def FeatureDiv         : SubtargetFeature<"div", "HasDiv", "true",
-                                "Implements hardware divider">;
-def FeatureMul         : SubtargetFeature<"mul", "HasMul", "true",
-                                "Implements hardware multiplier">;
-def FeaturePatCmp      : SubtargetFeature<"patcmp", "HasPatCmp", "true",
-                                "Implements pattern compare instruction">;
-def FeatureFPU         : SubtargetFeature<"fpu", "HasFPU", "true",
-                                "Implements floating point unit">;
-def FeatureMul64       : SubtargetFeature<"mul64", "HasMul64", "true",
-                                "Implements multiplier with 64-bit result">;
-def FeatureSqrt        : SubtargetFeature<"sqrt", "HasSqrt", "true",
-                                "Implements sqrt and floating point convert">;
-
-//===----------------------------------------------------------------------===//
-// MBlaze processors supported.
-//===----------------------------------------------------------------------===//
-
-def : Processor<"mblaze",  NoItineraries, []>;
-def : Processor<"mblaze3", MBlazePipe3Itineraries, []>;
-def : Processor<"mblaze5", MBlazePipe5Itineraries, []>;
-
-//===----------------------------------------------------------------------===//
-// Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-def MBlazeAsmWriter : AsmWriter {
-  string AsmWriterClassName  = "InstPrinter";
-  bit isMCAsmWriter = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// Target Declaration
-//===----------------------------------------------------------------------===//
-
-def MBlaze : Target {
-  let InstructionSet = MBlazeInstrInfo;
-  let AssemblyWriters = [MBlazeAsmWriter];
-}
diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
deleted file mode 100644
index 7dafaef..0000000
--- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-//===-- MBlazeAsmPrinter.cpp - MBlaze LLVM assembly writer ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a printer that converts from our internal representation
-// of machine-dependent LLVM code to GAS-format MBlaze assembly language.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mblaze-asm-printer"
-
-#include "MBlaze.h"
-#include "InstPrinter/MBlazeInstPrinter.h"
-#include "MBlazeInstrInfo.h"
-#include "MBlazeMCInstLower.h"
-#include "MBlazeMachineFunction.h"
-#include "MBlazeSubtarget.h"
-#include "MBlazeTargetMachine.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include <cctype>
-
-using namespace llvm;
-
-namespace {
-  class MBlazeAsmPrinter : public AsmPrinter {
-    const MBlazeSubtarget *Subtarget;
-  public:
-    explicit MBlazeAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer) {
-      Subtarget = &TM.getSubtarget<MBlazeSubtarget>();
-    }
-
-    virtual const char *getPassName() const {
-      return "MBlaze Assembly Printer";
-    }
-
-    void printSavedRegsBitmask();
-    void emitFrameDirective();
-    virtual void EmitFunctionBodyStart();
-    virtual void EmitFunctionBodyEnd();
-    virtual void EmitFunctionEntryLabel();
-
-    virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
-      const;
-
-    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                         unsigned AsmVariant, const char *ExtraCode,
-                         raw_ostream &O);
-    void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
-    void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
-    void printFSLImm(const MachineInstr *MI, int opNum, raw_ostream &O);
-    void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
-                         const char *Modifier = 0);
-
-    void EmitInstruction(const MachineInstr *MI);
-  };
-} // end of anonymous namespace
-
-// #include "MBlazeGenAsmWriter.inc"
-
-//===----------------------------------------------------------------------===//
-//
-//  MBlaze Asm Directives
-//
-//  -- Frame directive "frame Stackpointer, Stacksize, RARegister"
-//  Describe the stack frame.
-//
-//  -- Mask directives "mask  bitmask, offset"
-//  Tells the assembler which registers are saved and where.
-//  bitmask - contain a little endian bitset indicating which registers are
-//            saved on function prologue (e.g. with a 0x80000000 mask, the
-//            assembler knows the register 31 (RA) is saved at prologue.
-//  offset  - the position before stack pointer subtraction indicating where
-//            the first saved register on prologue is located. (e.g. with a
-//
-//  Consider the following function prologue:
-//
-//    .frame  R19,48,R15
-//    .mask   0xc0000000,-8
-//       addiu R1, R1, -48
-//       sw R15, 40(R1)
-//       sw R19, 36(R1)
-//
-//    With a 0xc0000000 mask, the assembler knows the register 15 (R15) and
-//    19 (R19) are saved at prologue. As the save order on prologue is from
-//    left to right, R15 is saved first. A -8 offset means that after the
-//    stack pointer subtration, the first register in the mask (R15) will be
-//    saved at address 48-8=40.
-//
-//===----------------------------------------------------------------------===//
-
-// Print a 32 bit hex number with all numbers.
-static void printHex32(unsigned int Value, raw_ostream &O) {
-  O << "0x";
-  for (int i = 7; i >= 0; i--)
-    O.write_hex((Value & (0xF << (i*4))) >> (i*4));
-}
-
-// Create a bitmask with all callee saved registers for CPU or Floating Point
-// registers. For CPU registers consider RA, GP and FP for saving if necessary.
-void MBlazeAsmPrinter::printSavedRegsBitmask() {
-  const TargetFrameLowering *TFI = TM.getFrameLowering();
-  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
-
-  // CPU Saved Registers Bitmasks
-  unsigned int CPUBitmask = 0;
-
-  // Set the CPU Bitmasks
-  const MachineFrameInfo *MFI = MF->getFrameInfo();
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-    unsigned RegNum = getMBlazeRegisterNumbering(Reg);
-    if (MBlaze::GPRRegClass.contains(Reg))
-      CPUBitmask |= (1 << RegNum);
-  }
-
-  // Return Address and Frame registers must also be set in CPUBitmask.
-  if (TFI->hasFP(*MF))
-    CPUBitmask |= (1 <<  getMBlazeRegisterNumbering(RI.getFrameRegister(*MF)));
-
-  if (MFI->adjustsStack())
-    CPUBitmask |= (1 << getMBlazeRegisterNumbering(RI.getRARegister()));
-
-  // Print CPUBitmask
-  OutStreamer.EmitRawText("\t.mask\t0x" + Twine::utohexstr(CPUBitmask));
-}
-
-/// Frame Directive
-void MBlazeAsmPrinter::emitFrameDirective() {
-  if (!OutStreamer.hasRawTextSupport())
-    return;
-
-  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
-  unsigned stkReg = RI.getFrameRegister(*MF);
-  unsigned retReg = RI.getRARegister();
-  unsigned stkSze = MF->getFrameInfo()->getStackSize();
-
-  OutStreamer.EmitRawText("\t.frame\t" +
-                          Twine(MBlazeInstPrinter::getRegisterName(stkReg)) +
-                          "," + Twine(stkSze) + "," +
-                          Twine(MBlazeInstPrinter::getRegisterName(retReg)));
-}
-
-void MBlazeAsmPrinter::EmitFunctionEntryLabel() {
-  if (OutStreamer.hasRawTextSupport())
-    OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName()));
-  AsmPrinter::EmitFunctionEntryLabel();
-}
-
-void MBlazeAsmPrinter::EmitFunctionBodyStart() {
-  if (!OutStreamer.hasRawTextSupport())
-    return;
-
-  emitFrameDirective();
-  printSavedRegsBitmask();
-}
-
-void MBlazeAsmPrinter::EmitFunctionBodyEnd() {
-  if (OutStreamer.hasRawTextSupport())
-    OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
-}
-
-//===----------------------------------------------------------------------===//
-void MBlazeAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  MBlazeMCInstLower MCInstLowering(OutContext, *this);
-
-  MCInst TmpInst;
-  MCInstLowering.Lower(MI, TmpInst);
-  OutStreamer.EmitInstruction(TmpInst);
-}
-
-// Print out an operand for an inline asm expression.
-bool MBlazeAsmPrinter::
-PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                unsigned AsmVariant,const char *ExtraCode, raw_ostream &O) {
-  // Does this asm operand have a single letter operand modifier?
-  if (ExtraCode && ExtraCode[0])
-    if (ExtraCode[1] != 0) return true; // Unknown modifier.
-
-    switch (ExtraCode[0]) {
-    default:
-      // See if this is a generic print operand
-      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
-    }
-
-  printOperand(MI, OpNo, O);
-  return false;
-}
-
-void MBlazeAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
-                                    raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(opNum);
-
-  switch (MO.getType()) {
-  case MachineOperand::MO_Register:
-    O << MBlazeInstPrinter::getRegisterName(MO.getReg());
-    break;
-
-  case MachineOperand::MO_Immediate:
-    O << (int32_t)MO.getImm();
-    break;
-
-  case MachineOperand::MO_FPImmediate: {
-    const ConstantFP *fp = MO.getFPImm();
-    printHex32(fp->getValueAPF().bitcastToAPInt().getZExtValue(), O);
-    O << ";\t# immediate = " << *fp;
-    break;
-  }
-
-  case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
-    return;
-
-  case MachineOperand::MO_GlobalAddress:
-    O << *Mang->getSymbol(MO.getGlobal());
-    break;
-
-  case MachineOperand::MO_ExternalSymbol:
-    O << *GetExternalSymbolSymbol(MO.getSymbolName());
-    break;
-
-  case MachineOperand::MO_JumpTableIndex:
-    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
-      << '_' << MO.getIndex();
-    break;
-
-  case MachineOperand::MO_ConstantPoolIndex:
-    O << MAI->getPrivateGlobalPrefix() << "CPI"
-      << getFunctionNumber() << "_" << MO.getIndex();
-    if (MO.getOffset())
-      O << "+" << MO.getOffset();
-    break;
-
-  default:
-    llvm_unreachable("<unknown operand type>");
-  }
-}
-
-void MBlazeAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
-                                        raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(opNum);
-  if (MO.isImm())
-    O << (uint32_t)MO.getImm();
-  else
-    printOperand(MI, opNum, O);
-}
-
-void MBlazeAsmPrinter::printFSLImm(const MachineInstr *MI, int opNum,
-                                   raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(opNum);
-  if (MO.isImm())
-    O << "rfsl" << (unsigned int)MO.getImm();
-  else
-    printOperand(MI, opNum, O);
-}
-
-void MBlazeAsmPrinter::
-printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
-                const char *Modifier) {
-  printOperand(MI, opNum, O);
-  O << ", ";
-  printOperand(MI, opNum+1, O);
-}
-
-/// isBlockOnlyReachableByFallthough - Return true if the basic block has
-/// exactly one predecessor and the control transfer mechanism between
-/// the predecessor and this block is a fall-through.
-bool MBlazeAsmPrinter::
-isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
-  // If this is a landing pad, it isn't a fall through.  If it has no preds,
-  // then nothing falls through to it.
-  if (MBB->isLandingPad() || MBB->pred_empty())
-    return false;
-
-  // If there isn't exactly one predecessor, it can't be a fall through.
-  MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
-  ++PI2;
-  if (PI2 != MBB->pred_end())
-    return false;
-
-  // The predecessor has to be immediately before this block.
-  const MachineBasicBlock *Pred = *PI;
-
-  if (!Pred->isLayoutSuccessor(MBB))
-    return false;
-
-  // If the block is completely empty, then it definitely does fall through.
-  if (Pred->empty())
-    return true;
-
-  // Check if the last terminator is an unconditional branch.
-  MachineBasicBlock::const_iterator I = Pred->end();
-  while (I != Pred->begin() && !(--I)->isTerminator())
-    ; // Noop
-  return I == Pred->end() || !I->isBarrier();
-}
-
-// Force static initialization.
-extern "C" void LLVMInitializeMBlazeAsmPrinter() {
-  RegisterAsmPrinter<MBlazeAsmPrinter> X(TheMBlazeTarget);
-}
diff --git a/lib/Target/MBlaze/MBlazeCallingConv.td b/lib/Target/MBlaze/MBlazeCallingConv.td
deleted file mode 100644
index 00a4219..0000000
--- a/lib/Target/MBlaze/MBlazeCallingConv.td
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- MBlazeCallingConv.td - Calling Conventions for MBlaze -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This describes the calling conventions for MBlaze architecture.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MBlaze ABI Calling Convention
-//===----------------------------------------------------------------------===//
-
-def RetCC_MBlaze : CallingConv<[
-  // i32 are returned in registers R3, R4
-  CCIfType<[i32,f32], CCAssignToReg<[R3, R4]>>
-]>;
-
-def CC_MBlaze : CallingConv<[
-  CCIfType<[i32,f32], CCCustom<"CC_MBlaze_AssignReg">>,
-  CCIfType<[i32,f32], CCAssignToStack<4, 4>>
-]>;
diff --git a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp b/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
deleted file mode 100644
index 3d0d1ce..0000000
--- a/lib/Target/MBlaze/MBlazeDelaySlotFiller.cpp
+++ /dev/null
@@ -1,254 +0,0 @@
-//===-- DelaySlotFiller.cpp - MBlaze delay slot filler --------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// A pass that attempts to fill instructions with delay slots. If no
-// instructions can be moved into the delay slot then a NOP is placed there.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "delay-slot-filler"
-
-#include "MBlaze.h"
-#include "MBlazeTargetMachine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-using namespace llvm;
-
-STATISTIC(FilledSlots, "Number of delay slots filled");
-
-static cl::opt<bool> MBDisableDelaySlotFiller(
-  "disable-mblaze-delay-filler",
-  cl::init(false),
-  cl::desc("Disable the MBlaze delay slot filter."),
-  cl::Hidden);
-
-namespace {
-  struct Filler : public MachineFunctionPass {
-
-    TargetMachine &TM;
-    const TargetInstrInfo *TII;
-
-    static char ID;
-    Filler(TargetMachine &tm)
-      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
-
-    virtual const char *getPassName() const {
-      return "MBlaze Delay Slot Filler";
-    }
-
-    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
-    bool runOnMachineFunction(MachineFunction &F) {
-      bool Changed = false;
-      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
-           FI != FE; ++FI)
-        Changed |= runOnMachineBasicBlock(*FI);
-      return Changed;
-    }
-
-  };
-  char Filler::ID = 0;
-} // end of anonymous namespace
-
-static bool hasImmInstruction(MachineBasicBlock::iterator &candidate) {
-    // Any instruction with an immediate mode operand greater than
-    // 16-bits requires an implicit IMM instruction.
-    unsigned numOper = candidate->getNumOperands();
-    for (unsigned op = 0; op < numOper; ++op) {
-        MachineOperand &mop = candidate->getOperand(op);
-
-        // The operand requires more than 16-bits to represent.
-        if (mop.isImm() && (mop.getImm() < -0x8000 || mop.getImm() > 0x7fff))
-          return true;
-
-        // We must assume that unknown immediate values require more than
-        // 16-bits to represent.
-        if (mop.isGlobal() || mop.isSymbol() || mop.isJTI() || mop.isCPI())
-          return true;
-
-        // FIXME: we could probably check to see if the FP value happens
-        //        to not need an IMM instruction. For now we just always
-        //        assume that FP values do.
-        if (mop.isFPImm())
-          return true;
-    }
-
-    return false;
-}
-
-static unsigned getLastRealOperand(MachineBasicBlock::iterator &instr) {
-  switch (instr->getOpcode()) {
-  default: return instr->getNumOperands();
-
-  // These instructions have a variable number of operands but the first two
-  // are the "real" operands that we care about during hazard detection.
-  case MBlaze::BRLID:
-  case MBlaze::BRALID:
-  case MBlaze::BRLD:
-  case MBlaze::BRALD:
-    return 2;
-  }
-}
-
-static bool delayHasHazard(MachineBasicBlock::iterator &candidate,
-                           MachineBasicBlock::iterator &slot) {
-  // Hazard check
-  MachineBasicBlock::iterator a = candidate;
-  MachineBasicBlock::iterator b = slot;
-
-  // MBB layout:-
-  //    candidate := a0 = operation(a1, a2)
-  //    ...middle bit...
-  //    slot := b0 = operation(b1, b2)
-
-  // Possible hazards:-/
-  // 1. a1 or a2 was written during the middle bit
-  // 2. a0 was read or written during the middle bit
-  // 3. a0 is one or more of {b0, b1, b2}
-  // 4. b0 is one or more of {a1, a2}
-  // 5. a accesses memory, and the middle bit
-  //    contains a store operation.
-  bool a_is_memory = candidate->mayLoad() || candidate->mayStore();
-
-  // Determine the number of operands in the slot instruction and in the
-  // candidate instruction.
-  const unsigned aend = getLastRealOperand(a);
-  const unsigned bend = getLastRealOperand(b);
-
-  // Check hazards type 1, 2 and 5 by scanning the middle bit
-  MachineBasicBlock::iterator m = a;
-  for (++m; m != b; ++m) {
-    for (unsigned aop = 0; aop<aend; ++aop) {
-      bool aop_is_reg = a->getOperand(aop).isReg();
-      if (!aop_is_reg) continue;
-
-      bool aop_is_def = a->getOperand(aop).isDef();
-      unsigned aop_reg = a->getOperand(aop).getReg();
-
-      const unsigned mend = getLastRealOperand(m);
-      for (unsigned mop = 0; mop<mend; ++mop) {
-        bool mop_is_reg = m->getOperand(mop).isReg();
-        if (!mop_is_reg) continue;
-
-        bool mop_is_def = m->getOperand(mop).isDef();
-        unsigned mop_reg = m->getOperand(mop).getReg();
-
-        if (aop_is_def && (mop_reg == aop_reg))
-            return true; // Hazard type 2, because aop = a0
-        else if (mop_is_def && (mop_reg == aop_reg))
-            return true; // Hazard type 1, because aop in {a1, a2}
-      }
-    }
-
-    // Check hazard type 5
-    if (a_is_memory && m->mayStore())
-      return true;
-  }
-
-  // Check hazard type 3 & 4
-  for (unsigned aop = 0; aop<aend; ++aop) {
-    if (a->getOperand(aop).isReg()) {
-      unsigned aop_reg = a->getOperand(aop).getReg();
-
-      for (unsigned bop = 0; bop<bend; ++bop) {
-        if (b->getOperand(bop).isReg() && !b->getOperand(bop).isImplicit()) {
-          unsigned bop_reg = b->getOperand(bop).getReg();
-          if (aop_reg == bop_reg)
-            return true;
-        }
-      }
-    }
-  }
-
-  return false;
-}
-
-static bool isDelayFiller(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator candidate) {
-  if (candidate == MBB.begin())
-    return false;
-
-  --candidate;
-  return (candidate->hasDelaySlot());
-}
-
-static bool hasUnknownSideEffects(MachineBasicBlock::iterator &I) {
-  if (!I->hasUnmodeledSideEffects())
-    return false;
-
-  unsigned op = I->getOpcode();
-  if (op == MBlaze::ADDK || op == MBlaze::ADDIK ||
-      op == MBlaze::ADDC || op == MBlaze::ADDIC ||
-      op == MBlaze::ADDKC || op == MBlaze::ADDIKC ||
-      op == MBlaze::RSUBK || op == MBlaze::RSUBIK ||
-      op == MBlaze::RSUBC || op == MBlaze::RSUBIC ||
-      op == MBlaze::RSUBKC || op == MBlaze::RSUBIKC)
-    return false;
-
-  return true;
-}
-
-static MachineBasicBlock::iterator
-findDelayInstr(MachineBasicBlock &MBB,MachineBasicBlock::iterator slot) {
-  MachineBasicBlock::iterator I = slot;
-  while (true) {
-    if (I == MBB.begin())
-      break;
-
-    --I;
-    if (I->hasDelaySlot() || I->isBranch() || isDelayFiller(MBB,I) ||
-        I->isCall() || I->isReturn() || I->isBarrier() ||
-        hasUnknownSideEffects(I))
-      break;
-
-    if (hasImmInstruction(I) || delayHasHazard(I,slot))
-      continue;
-
-    return I;
-  }
-
-  return MBB.end();
-}
-
-/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
-/// Currently, we fill delay slots with NOPs. We assume there is only one
-/// delay slot per delayed instruction.
-bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
-  bool Changed = false;
-  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
-    if (I->hasDelaySlot()) {
-      MachineBasicBlock::iterator D = MBB.end();
-      MachineBasicBlock::iterator J = I;
-
-      if (!MBDisableDelaySlotFiller)
-        D = findDelayInstr(MBB,I);
-
-      ++FilledSlots;
-      Changed = true;
-
-      if (D == MBB.end())
-        BuildMI(MBB, ++J, I->getDebugLoc(), TII->get(MBlaze::NOP));
-      else
-        MBB.splice(++J, &MBB, D);
-    }
-  return Changed;
-}
-
-/// createMBlazeDelaySlotFillerPass - Returns a pass that fills in delay
-/// slots in MBlaze MachineFunctions
-FunctionPass *llvm::createMBlazeDelaySlotFillerPass(MBlazeTargetMachine &tm) {
-  return new Filler(tm);
-}
-
diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/lib/Target/MBlaze/MBlazeFrameLowering.cpp
deleted file mode 100644
index 172304b..0000000
--- a/lib/Target/MBlaze/MBlazeFrameLowering.cpp
+++ /dev/null
@@ -1,488 +0,0 @@
-//===-- MBlazeFrameLowering.cpp - MBlaze Frame Information ---------------====//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the MBlaze implementation of TargetFrameLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mblaze-frame-lowering"
-
-#include "MBlazeFrameLowering.h"
-#include "InstPrinter/MBlazeInstPrinter.h"
-#include "MBlazeInstrInfo.h"
-#include "MBlazeMachineFunction.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
-
-using namespace llvm;
-
-static cl::opt<bool> MBDisableStackAdjust(
-  "disable-mblaze-stack-adjust",
-  cl::init(false),
-  cl::desc("Disable MBlaze stack layout adjustment."),
-  cl::Hidden);
-
-static void replaceFrameIndexes(MachineFunction &MF, 
-                                SmallVector<std::pair<int,int64_t>, 16> &FR) {
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-  const SmallVector<std::pair<int,int64_t>, 16>::iterator FRB = FR.begin();
-  const SmallVector<std::pair<int,int64_t>, 16>::iterator FRE = FR.end();
-
-  SmallVector<std::pair<int,int64_t>, 16>::iterator FRI = FRB;
-  for (; FRI != FRE; ++FRI) {
-    MFI->RemoveStackObject(FRI->first);
-    int NFI = MFI->CreateFixedObject(4, FRI->second, true);
-    MBlazeFI->recordReplacement(FRI->first, NFI);
-
-    for (MachineFunction::iterator MB=MF.begin(), ME=MF.end(); MB!=ME; ++MB) {
-      MachineBasicBlock::iterator MBB = MB->begin();
-      const MachineBasicBlock::iterator MBE = MB->end();
-
-      for (; MBB != MBE; ++MBB) {
-        MachineInstr::mop_iterator MIB = MBB->operands_begin();
-        const MachineInstr::mop_iterator MIE = MBB->operands_end();
-
-        for (MachineInstr::mop_iterator MII = MIB; MII != MIE; ++MII) {
-          if (!MII->isFI() || MII->getIndex() != FRI->first) continue;
-          DEBUG(dbgs() << "FOUND FI#" << MII->getIndex() << "\n");
-          MII->setIndex(NFI);
-        }
-      }
-    }
-  }
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Stack Frame Processing methods
-// +----------------------------+
-//
-// The stack is allocated decrementing the stack pointer on
-// the first instruction of a function prologue. Once decremented,
-// all stack references are are done through a positive offset
-// from the stack/frame pointer, so the stack is considered
-// to grow up.
-//
-//===----------------------------------------------------------------------===//
-
-static void analyzeFrameIndexes(MachineFunction &MF) {
-  if (MBDisableStackAdjust) return;
-
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  MachineRegisterInfo::livein_iterator LII = MRI.livein_begin();
-  MachineRegisterInfo::livein_iterator LIE = MRI.livein_end();
-  const SmallVector<int, 16> &LiveInFI = MBlazeFI->getLiveIn();
-  SmallVector<MachineInstr*, 16> EraseInstr;
-  SmallVector<std::pair<int,int64_t>, 16> FrameRelocate;
-
-  MachineBasicBlock *MBB = MF.getBlockNumbered(0);
-  MachineBasicBlock::iterator MIB = MBB->begin();
-  MachineBasicBlock::iterator MIE = MBB->end();
-
-  int StackAdjust = 0;
-  int StackOffset = -28;
-
-  // In this loop we are searching frame indexes that corrospond to incoming
-  // arguments that are already in the stack. We look for instruction sequences
-  // like the following:
-  //    
-  //    LWI REG, FI1, 0
-  //    ...
-  //    SWI REG, FI2, 0
-  //
-  // As long as there are no defs of REG in the ... part, we can eliminate
-  // the SWI instruction because the value has already been stored to the
-  // stack by the caller. All we need to do is locate FI at the correct
-  // stack location according to the calling convensions.
-  //
-  // Additionally, if the SWI operation kills the def of REG then we don't
-  // need the LWI operation so we can erase it as well.
-  for (unsigned i = 0, e = LiveInFI.size(); i < e; ++i) {
-    for (MachineBasicBlock::iterator I=MIB; I != MIE; ++I) {
-      if (I->getOpcode() != MBlaze::LWI || I->getNumOperands() != 3 ||
-          !I->getOperand(1).isFI() || !I->getOperand(0).isReg() ||
-          I->getOperand(1).getIndex() != LiveInFI[i]) continue;
-
-      unsigned FIReg = I->getOperand(0).getReg();
-      MachineBasicBlock::iterator SI = I;
-      for (SI++; SI != MIE; ++SI) {
-        if (!SI->getOperand(0).isReg() ||
-            !SI->getOperand(1).isFI() ||
-            SI->getOpcode() != MBlaze::SWI) continue;
-
-        int FI = SI->getOperand(1).getIndex();
-        if (SI->getOperand(0).getReg() != FIReg ||
-            MFI->isFixedObjectIndex(FI) ||
-            MFI->getObjectSize(FI) != 4) continue;
-
-        if (SI->getOperand(0).isDef()) break;
-
-        if (SI->getOperand(0).isKill()) {
-          DEBUG(dbgs() << "LWI for FI#" << I->getOperand(1).getIndex() 
-                       << " removed\n");
-          EraseInstr.push_back(I);
-        }
-
-        EraseInstr.push_back(SI);
-        DEBUG(dbgs() << "SWI for FI#" << FI << " removed\n");
-
-        FrameRelocate.push_back(std::make_pair(FI,StackOffset));
-        DEBUG(dbgs() << "FI#" << FI << " relocated to " << StackOffset << "\n");
-
-        StackOffset -= 4;
-        StackAdjust += 4;
-        break;
-      }
-    }
-  }
-
-  // In this loop we are searching for frame indexes that corrospond to
-  // incoming arguments that are in registers. We look for instruction
-  // sequences like the following:
-  //    
-  //    ...  SWI REG, FI, 0
-  // 
-  // As long as the ... part does not define REG and if REG is an incoming
-  // parameter register then we know that, according to ABI convensions, the
-  // caller has allocated stack space for it already.  Instead of allocating
-  // stack space on our frame, we record the correct location in the callers
-  // frame.
-  for (MachineRegisterInfo::livein_iterator LI = LII; LI != LIE; ++LI) {
-    for (MachineBasicBlock::iterator I=MIB; I != MIE; ++I) {
-      if (I->definesRegister(LI->first))
-        break;
-
-      if (I->getOpcode() != MBlaze::SWI || I->getNumOperands() != 3 ||
-          !I->getOperand(1).isFI() || !I->getOperand(0).isReg() ||
-          I->getOperand(1).getIndex() < 0) continue;
-
-      if (I->getOperand(0).getReg() == LI->first) {
-        int FI = I->getOperand(1).getIndex();
-        MBlazeFI->recordLiveIn(FI);
-
-        int FILoc = 0;
-        switch (LI->first) {
-        default: llvm_unreachable("invalid incoming parameter!");
-        case MBlaze::R5:  FILoc = -4; break;
-        case MBlaze::R6:  FILoc = -8; break;
-        case MBlaze::R7:  FILoc = -12; break;
-        case MBlaze::R8:  FILoc = -16; break;
-        case MBlaze::R9:  FILoc = -20; break;
-        case MBlaze::R10: FILoc = -24; break;
-        }
-
-        StackAdjust += 4;
-        FrameRelocate.push_back(std::make_pair(FI,FILoc));
-        DEBUG(dbgs() << "FI#" << FI << " relocated to " << FILoc << "\n");
-        break;
-      }
-    }
-  }
-
-  // Go ahead and erase all of the instructions that we determined were
-  // no longer needed.
-  for (int i = 0, e = EraseInstr.size(); i < e; ++i)
-    MBB->erase(EraseInstr[i]);
-
-  // Replace all of the frame indexes that we have relocated with new
-  // fixed object frame indexes.
-  replaceFrameIndexes(MF, FrameRelocate);
-}
-
-static void interruptFrameLayout(MachineFunction &MF) {
-  const Function *F = MF.getFunction();
-  CallingConv::ID CallConv = F->getCallingConv();
-
-  // If this function is not using either the interrupt_handler
-  // calling convention or the save_volatiles calling convention
-  // then we don't need to do any additional frame layout.
-  if (CallConv != CallingConv::MBLAZE_INTR &&
-      CallConv != CallingConv::MBLAZE_SVOL)
-      return;
-
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const MBlazeInstrInfo &TII =
-    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
-
-  // Determine if the calling convention is the interrupt_handler
-  // calling convention. Some pieces of the prologue and epilogue
-  // only need to be emitted if we are lowering and interrupt handler.
-  bool isIntr = CallConv == CallingConv::MBLAZE_INTR;
-
-  // Determine where to put prologue and epilogue additions
-  MachineBasicBlock &MENT   = MF.front();
-  MachineBasicBlock &MEXT   = MF.back();
-
-  MachineBasicBlock::iterator MENTI = MENT.begin();
-  MachineBasicBlock::iterator MEXTI = prior(MEXT.end());
-
-  DebugLoc ENTDL = MENTI != MENT.end() ? MENTI->getDebugLoc() : DebugLoc();
-  DebugLoc EXTDL = MEXTI != MEXT.end() ? MEXTI->getDebugLoc() : DebugLoc();
-
-  // Store the frame indexes generated during prologue additions for use
-  // when we are generating the epilogue additions.
-  SmallVector<int, 10> VFI;
-
-  // Build the prologue SWI for R3 - R12 if needed. Note that R11 must
-  // always have a SWI because it is used when processing RMSR.
-  for (unsigned r = MBlaze::R3; r <= MBlaze::R12; ++r) {
-    if (!MRI.isPhysRegUsed(r) && !(isIntr && r == MBlaze::R11)) continue;
-    
-    int FI = MFI->CreateStackObject(4,4,false,false);
-    VFI.push_back(FI);
-
-    BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), r)
-      .addFrameIndex(FI).addImm(0);
-  }
-    
-  // Build the prologue SWI for R17, R18
-  int R17FI = MFI->CreateStackObject(4,4,false,false);
-  int R18FI = MFI->CreateStackObject(4,4,false,false);
-
-  BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), MBlaze::R17)
-    .addFrameIndex(R17FI).addImm(0);
-    
-  BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), MBlaze::R18)
-    .addFrameIndex(R18FI).addImm(0);
-
-  // Buid the prologue SWI and the epilogue LWI for RMSR if needed
-  if (isIntr) {
-    int MSRFI = MFI->CreateStackObject(4,4,false,false);
-    BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::MFS), MBlaze::R11)
-      .addReg(MBlaze::RMSR);
-    BuildMI(MENT, MENTI, ENTDL, TII.get(MBlaze::SWI), MBlaze::R11)
-      .addFrameIndex(MSRFI).addImm(0);
-
-    BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), MBlaze::R11)
-      .addFrameIndex(MSRFI).addImm(0);
-    BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::MTS), MBlaze::RMSR)
-      .addReg(MBlaze::R11);
-  }
-
-  // Build the epilogue LWI for R17, R18
-  BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), MBlaze::R18)
-    .addFrameIndex(R18FI).addImm(0);
-
-  BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), MBlaze::R17)
-    .addFrameIndex(R17FI).addImm(0);
-
-  // Build the epilogue LWI for R3 - R12 if needed
-  for (unsigned r = MBlaze::R12, i = VFI.size(); r >= MBlaze::R3; --r) {
-    if (!MRI.isPhysRegUsed(r)) continue;
-    BuildMI(MEXT, MEXTI, EXTDL, TII.get(MBlaze::LWI), r)
-      .addFrameIndex(VFI[--i]).addImm(0);
-  }
-}
-
-static void determineFrameLayout(MachineFunction &MF) {
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-
-  // Replace the dummy '0' SPOffset by the negative offsets, as explained on
-  // LowerFORMAL_ARGUMENTS. Leaving '0' for while is necessary to avoid
-  // the approach done by calculateFrameObjectOffsets to the stack frame.
-  MBlazeFI->adjustLoadArgsFI(MFI);
-  MBlazeFI->adjustStoreVarArgsFI(MFI);
-
-  // Get the number of bytes to allocate from the FrameInfo
-  unsigned FrameSize = MFI->getStackSize();
-  DEBUG(dbgs() << "Original Frame Size: " << FrameSize << "\n" );
-
-  // Get the alignments provided by the target, and the maximum alignment
-  // (if any) of the fixed frame objects.
-  // unsigned MaxAlign = MFI->getMaxAlignment();
-  unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
-  unsigned AlignMask = TargetAlign - 1;
-
-  // Make sure the frame is aligned.
-  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
-  MFI->setStackSize(FrameSize);
-  DEBUG(dbgs() << "Aligned Frame Size: " << FrameSize << "\n" );
-}
-
-int MBlazeFrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) 
-  const {
-  const MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-  if (MBlazeFI->hasReplacement(FI))
-    FI = MBlazeFI->getReplacement(FI);
-  return TargetFrameLowering::getFrameIndexOffset(MF,FI);
-}
-
-// hasFP - Return true if the specified function should have a dedicated frame
-// pointer register.  This is true if the function has variable sized allocas or
-// if frame pointer elimination is disabled.
-bool MBlazeFrameLowering::hasFP(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
-         MFI->hasVarSizedObjects();
-}
-
-void MBlazeFrameLowering::emitPrologue(MachineFunction &MF) const {
-  MachineBasicBlock &MBB   = MF.front();
-  MachineFrameInfo *MFI    = MF.getFrameInfo();
-  const MBlazeInstrInfo &TII =
-    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
-  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-  MachineBasicBlock::iterator MBBI = MBB.begin();
-  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
-
-  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
-  bool requiresRA = CallConv == CallingConv::MBLAZE_INTR;
-
-  // Determine the correct frame layout
-  determineFrameLayout(MF);
-
-  // Get the number of bytes to allocate from the FrameInfo.
-  unsigned StackSize = MFI->getStackSize();
-
-  // No need to allocate space on the stack.
-  if (StackSize == 0 && !MFI->adjustsStack() && !requiresRA) return;
-
-  int FPOffset = MBlazeFI->getFPStackOffset();
-  int RAOffset = MBlazeFI->getRAStackOffset();
-
-  // Adjust stack : addi R1, R1, -imm
-  BuildMI(MBB, MBBI, DL, TII.get(MBlaze::ADDIK), MBlaze::R1)
-      .addReg(MBlaze::R1).addImm(-StackSize);
-
-  // swi  R15, R1, stack_loc
-  if (MFI->adjustsStack() || requiresRA) {
-    BuildMI(MBB, MBBI, DL, TII.get(MBlaze::SWI))
-        .addReg(MBlaze::R15).addReg(MBlaze::R1).addImm(RAOffset);
-  }
-
-  if (hasFP(MF)) {
-    // swi  R19, R1, stack_loc
-    BuildMI(MBB, MBBI, DL, TII.get(MBlaze::SWI))
-      .addReg(MBlaze::R19).addReg(MBlaze::R1).addImm(FPOffset);
-
-    // add R19, R1, R0
-    BuildMI(MBB, MBBI, DL, TII.get(MBlaze::ADD), MBlaze::R19)
-      .addReg(MBlaze::R1).addReg(MBlaze::R0);
-  }
-}
-
-void MBlazeFrameLowering::emitEpilogue(MachineFunction &MF,
-                                   MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
-  MachineFrameInfo *MFI            = MF.getFrameInfo();
-  MBlazeFunctionInfo *MBlazeFI     = MF.getInfo<MBlazeFunctionInfo>();
-  const MBlazeInstrInfo &TII =
-    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
-
-  DebugLoc dl = MBBI->getDebugLoc();
-
-  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
-  bool requiresRA = CallConv == CallingConv::MBLAZE_INTR;
-
-  // Get the FI's where RA and FP are saved.
-  int FPOffset = MBlazeFI->getFPStackOffset();
-  int RAOffset = MBlazeFI->getRAStackOffset();
-
-  if (hasFP(MF)) {
-    // add R1, R19, R0
-    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADD), MBlaze::R1)
-      .addReg(MBlaze::R19).addReg(MBlaze::R0);
-
-    // lwi  R19, R1, stack_loc
-    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::LWI), MBlaze::R19)
-      .addReg(MBlaze::R1).addImm(FPOffset);
-  }
-
-  // lwi R15, R1, stack_loc
-  if (MFI->adjustsStack() || requiresRA) {
-    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::LWI), MBlaze::R15)
-      .addReg(MBlaze::R1).addImm(RAOffset);
-  }
-
-  // Get the number of bytes from FrameInfo
-  int StackSize = (int) MFI->getStackSize();
-
-  // addi R1, R1, imm
-  if (StackSize) {
-    BuildMI(MBB, MBBI, dl, TII.get(MBlaze::ADDIK), MBlaze::R1)
-      .addReg(MBlaze::R1).addImm(StackSize);
-  }
-}
-
-// Eliminate ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudo instructions
-void MBlazeFrameLowering::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
-                              MachineBasicBlock::iterator I) const {
-  const MBlazeInstrInfo &TII =
-    *static_cast<const MBlazeInstrInfo*>(MF.getTarget().getInstrInfo());
-  if (!hasReservedCallFrame(MF)) {
-    // If we have a frame pointer, turn the adjcallstackup instruction into a
-    // 'addi r1, r1, -<amt>' and the adjcallstackdown instruction into
-    // 'addi r1, r1, <amt>'
-    MachineInstr *Old = I;
-    int Amount = Old->getOperand(0).getImm() + 4;
-    if (Amount != 0) {
-      // We need to keep the stack aligned properly.  To do this, we round the
-      // amount of space needed for the outgoing arguments up to the next
-      // alignment boundary.
-      unsigned Align = getStackAlignment();
-      Amount = (Amount+Align-1)/Align*Align;
-
-      MachineInstr *New;
-      if (Old->getOpcode() == MBlaze::ADJCALLSTACKDOWN) {
-        New = BuildMI(MF,Old->getDebugLoc(), TII.get(MBlaze::ADDIK),MBlaze::R1)
-                .addReg(MBlaze::R1).addImm(-Amount);
-      } else {
-        assert(Old->getOpcode() == MBlaze::ADJCALLSTACKUP);
-        New = BuildMI(MF,Old->getDebugLoc(), TII.get(MBlaze::ADDIK),MBlaze::R1)
-                .addReg(MBlaze::R1).addImm(Amount);
-      }
-
-      // Replace the pseudo instruction with a new instruction...
-      MBB.insert(I, New);
-    }
-  }
-
-  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
-  MBB.erase(I);
-}
-
-
-void MBlazeFrameLowering::
-processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                     RegScavenger *RS) const {
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-  CallingConv::ID CallConv = MF.getFunction()->getCallingConv();
-  bool requiresRA = CallConv == CallingConv::MBLAZE_INTR;
-
-  if (MFI->adjustsStack() || requiresRA) {
-    MBlazeFI->setRAStackOffset(0);
-    MFI->CreateFixedObject(4,0,true);
-  }
-
-  if (hasFP(MF)) {
-    MBlazeFI->setFPStackOffset(4);
-    MFI->CreateFixedObject(4,4,true);
-  }
-
-  interruptFrameLayout(MF);
-  analyzeFrameIndexes(MF);
-}
diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.h b/lib/Target/MBlaze/MBlazeFrameLowering.h
deleted file mode 100644
index f4228c5..0000000
--- a/lib/Target/MBlaze/MBlazeFrameLowering.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//=- MBlazeFrameLowering.h - Define frame lowering for MicroBlaze -*- C++ -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZE_FRAMEINFO_H
-#define MBLAZE_FRAMEINFO_H
-
-#include "MBlaze.h"
-#include "llvm/Target/TargetFrameLowering.h"
-
-namespace llvm {
-class MBlazeSubtarget;
-
-class MBlazeFrameLowering : public TargetFrameLowering {
-protected:
-  const MBlazeSubtarget &STI;
-
-public:
-  explicit MBlazeFrameLowering(const MBlazeSubtarget &sti)
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 4, 0), STI(sti) {
-  }
-
-  /// targetHandlesStackFrameRounding - Returns true if the target is
-  /// responsible for rounding up the stack frame (probably at emitPrologue
-  /// time).
-  bool targetHandlesStackFrameRounding() const { return true; }
-
-  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
-  /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-
-  void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
-
-  bool hasFP(const MachineFunction &MF) const;
-
-  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
-
-  virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                                    RegScavenger *RS) const;
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp b/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp
deleted file mode 100644
index 34e33fd..0000000
--- a/lib/Target/MBlaze/MBlazeISelDAGToDAG.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-//===-- MBlazeISelDAGToDAG.cpp - A dag to dag inst selector for MBlaze ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an instruction selector for the MBlaze target.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mblaze-isel"
-#include "MBlaze.h"
-#include "MBlazeMachineFunction.h"
-#include "MBlazeRegisterInfo.h"
-#include "MBlazeSubtarget.h"
-#include "MBlazeTargetMachine.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// Instruction Selector Implementation
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MBlazeDAGToDAGISel - MBlaze specific code to select MBlaze machine
-// instructions for SelectionDAG operations.
-//===----------------------------------------------------------------------===//
-namespace {
-
-class MBlazeDAGToDAGISel : public SelectionDAGISel {
-
-  /// TM - Keep a reference to MBlazeTargetMachine.
-  MBlazeTargetMachine &TM;
-
-  /// Subtarget - Keep a pointer to the MBlazeSubtarget around so that we can
-  /// make the right decision when generating code for different targets.
-  const MBlazeSubtarget &Subtarget;
-
-public:
-  explicit MBlazeDAGToDAGISel(MBlazeTargetMachine &tm) :
-  SelectionDAGISel(tm),
-  TM(tm), Subtarget(tm.getSubtarget<MBlazeSubtarget>()) {}
-
-  // Pass Name
-  virtual const char *getPassName() const {
-    return "MBlaze DAG->DAG Pattern Instruction Selection";
-  }
-private:
-  // Include the pieces autogenerated from the target description.
-  #include "MBlazeGenDAGISel.inc"
-
-  /// getTargetMachine - Return a reference to the TargetMachine, casted
-  /// to the target-specific type.
-  const MBlazeTargetMachine &getTargetMachine() {
-    return static_cast<const MBlazeTargetMachine &>(TM);
-  }
-
-  /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
-  /// to the target-specific type.
-  const MBlazeInstrInfo *getInstrInfo() {
-    return getTargetMachine().getInstrInfo();
-  }
-
-  SDNode *getGlobalBaseReg();
-  SDNode *Select(SDNode *N);
-
-  // Address Selection
-  bool SelectAddrRegReg(SDValue N, SDValue &Base, SDValue &Index);
-  bool SelectAddrRegImm(SDValue N, SDValue &Disp, SDValue &Base);
-
-  // getI32Imm - Return a target constant with the specified value, of type i32.
-  inline SDValue getI32Imm(unsigned Imm) {
-    return CurDAG->getTargetConstant(Imm, MVT::i32);
-  }
-};
-
-}
-
-/// isIntS32Immediate - This method tests to see if the node is either a 32-bit
-/// or 64-bit immediate, and if the value can be accurately represented as a
-/// sign extension from a 32-bit value.  If so, this returns true and the
-/// immediate.
-static bool isIntS32Immediate(SDNode *N, int32_t &Imm) {
-  unsigned Opc = N->getOpcode();
-  if (Opc != ISD::Constant)
-    return false;
-
-  Imm = (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
-  if (N->getValueType(0) == MVT::i32)
-    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
-  else
-    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
-}
-
-static bool isIntS32Immediate(SDValue Op, int32_t &Imm) {
-  return isIntS32Immediate(Op.getNode(), Imm);
-}
-
-
-/// SelectAddressRegReg - Given the specified addressed, check to see if it
-/// can be represented as an indexed [r+r] operation.  Returns false if it
-/// can be more efficiently represented with [r+imm].
-bool MBlazeDAGToDAGISel::
-SelectAddrRegReg(SDValue N, SDValue &Base, SDValue &Index) {
-  if (N.getOpcode() == ISD::FrameIndex) return false;
-  if (N.getOpcode() == ISD::TargetExternalSymbol ||
-      N.getOpcode() == ISD::TargetGlobalAddress)
-    return false;  // direct calls.
-
-  int32_t imm = 0;
-  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
-    if (isIntS32Immediate(N.getOperand(1), imm))
-      return false;    // r+i
-
-    if (N.getOperand(0).getOpcode() == ISD::TargetJumpTable ||
-        N.getOperand(1).getOpcode() == ISD::TargetJumpTable)
-      return false; // jump tables.
-
-    Base = N.getOperand(0);
-    Index = N.getOperand(1);
-    return true;
-  }
-
-  return false;
-}
-
-/// Returns true if the address N can be represented by a base register plus
-/// a signed 32-bit displacement [r+imm], and if it is not better
-/// represented as reg+reg.
-bool MBlazeDAGToDAGISel::
-SelectAddrRegImm(SDValue N, SDValue &Base, SDValue &Disp) {
-  // If this can be more profitably realized as r+r, fail.
-  if (SelectAddrRegReg(N, Base, Disp))
-    return false;
-
-  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::OR) {
-    int32_t imm = 0;
-    if (isIntS32Immediate(N.getOperand(1), imm)) {
-      Disp = CurDAG->getTargetConstant(imm, MVT::i32);
-      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
-        Base = CurDAG->getTargetFrameIndex(FI->getIndex(), N.getValueType());
-      } else {
-        Base = N.getOperand(0);
-      }
-      return true; // [r+i]
-    }
-  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
-    // Loading from a constant address.
-    uint32_t Imm = CN->getZExtValue();
-    Disp = CurDAG->getTargetConstant(Imm, CN->getValueType(0));
-    Base = CurDAG->getRegister(MBlaze::R0, CN->getValueType(0));
-    return true;
-  }
-
-  Disp = CurDAG->getTargetConstant(0, TM.getTargetLowering()->getPointerTy());
-  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
-    Base = CurDAG->getTargetFrameIndex(FI->getIndex(), N.getValueType());
-  else
-    Base = N;
-  return true;      // [r+0]
-}
-
-/// getGlobalBaseReg - Output the instructions required to put the
-/// GOT address into a register.
-SDNode *MBlazeDAGToDAGISel::getGlobalBaseReg() {
-  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
-  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
-}
-
-/// Select instructions not customized! Used for
-/// expanded, promoted and normal instructions
-SDNode* MBlazeDAGToDAGISel::Select(SDNode *Node) {
-  unsigned Opcode = Node->getOpcode();
-  DebugLoc dl = Node->getDebugLoc();
-
-  // If we have a custom node, we already have selected!
-  if (Node->isMachineOpcode())
-    return NULL;
-
-  ///
-  // Instruction Selection not handled by the auto-generated
-  // tablegen selection should be handled here.
-  ///
-  switch (Opcode) {
-    default: break;
-
-    // Get target GOT address.
-    case ISD::GLOBAL_OFFSET_TABLE:
-      return getGlobalBaseReg();
-
-    case ISD::FrameIndex: {
-        SDValue imm = CurDAG->getTargetConstant(0, MVT::i32);
-        int FI = dyn_cast<FrameIndexSDNode>(Node)->getIndex();
-        EVT VT = Node->getValueType(0);
-        SDValue TFI = CurDAG->getTargetFrameIndex(FI, VT);
-        unsigned Opc = MBlaze::ADDIK;
-        if (Node->hasOneUse())
-          return CurDAG->SelectNodeTo(Node, Opc, VT, TFI, imm);
-        return CurDAG->getMachineNode(Opc, dl, VT, TFI, imm);
-    }
-
-
-    /// Handle direct and indirect calls when using PIC. On PIC, when
-    /// GOT is smaller than about 64k (small code) the GA target is
-    /// loaded with only one instruction. Otherwise GA's target must
-    /// be loaded with 3 instructions.
-    case MBlazeISD::JmpLink: {
-      if (TM.getRelocationModel() == Reloc::PIC_) {
-        SDValue Chain  = Node->getOperand(0);
-        SDValue Callee = Node->getOperand(1);
-        SDValue R20Reg = CurDAG->getRegister(MBlaze::R20, MVT::i32);
-        SDValue InFlag(0, 0);
-
-        if ((isa<GlobalAddressSDNode>(Callee)) ||
-            (isa<ExternalSymbolSDNode>(Callee)))
-        {
-          /// Direct call for global addresses and external symbols
-          SDValue GPReg = CurDAG->getRegister(MBlaze::R15, MVT::i32);
-
-          // Use load to get GOT target
-          SDValue Ops[] = { Callee, GPReg, Chain };
-          SDValue Load = SDValue(CurDAG->getMachineNode(MBlaze::LW, dl,
-                                 MVT::i32, MVT::Other, Ops), 0);
-          Chain = Load.getValue(1);
-
-          // Call target must be on T9
-          Chain = CurDAG->getCopyToReg(Chain, dl, R20Reg, Load, InFlag);
-        } else
-          /// Indirect call
-          Chain = CurDAG->getCopyToReg(Chain, dl, R20Reg, Callee, InFlag);
-
-        // Emit Jump and Link Register
-        SDNode *ResNode = CurDAG->getMachineNode(MBlaze::BRLID, dl, MVT::Other,
-                                                 MVT::Glue, R20Reg, Chain);
-        Chain  = SDValue(ResNode, 0);
-        InFlag = SDValue(ResNode, 1);
-        ReplaceUses(SDValue(Node, 0), Chain);
-        ReplaceUses(SDValue(Node, 1), InFlag);
-        return ResNode;
-      }
-    }
-  }
-
-  // Select the default instruction
-  SDNode *ResNode = SelectCode(Node);
-
-  DEBUG(errs() << "=> ");
-  if (ResNode == NULL || ResNode == Node)
-    DEBUG(Node->dump(CurDAG));
-  else
-    DEBUG(ResNode->dump(CurDAG));
-  DEBUG(errs() << "\n");
-  return ResNode;
-}
-
-/// createMBlazeISelDag - This pass converts a legalized DAG into a
-/// MBlaze-specific DAG, ready for instruction scheduling.
-FunctionPass *llvm::createMBlazeISelDag(MBlazeTargetMachine &TM) {
-  return new MBlazeDAGToDAGISel(TM);
-}
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.cpp b/lib/Target/MBlaze/MBlazeISelLowering.cpp
deleted file mode 100644
index d4f9432..0000000
--- a/lib/Target/MBlaze/MBlazeISelLowering.cpp
+++ /dev/null
@@ -1,1154 +0,0 @@
-//===-- MBlazeISelLowering.cpp - MBlaze DAG Lowering Implementation -------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interfaces that MBlaze uses to lower LLVM code into a
-// selection DAG.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mblaze-lower"
-#include "MBlazeISelLowering.h"
-#include "MBlazeMachineFunction.h"
-#include "MBlazeSubtarget.h"
-#include "MBlazeTargetMachine.h"
-#include "MBlazeTargetObjectFile.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                CCValAssign::LocInfo &LocInfo,
-                                ISD::ArgFlagsTy &ArgFlags,
-                                CCState &State);
-
-const char *MBlazeTargetLowering::getTargetNodeName(unsigned Opcode) const {
-  switch (Opcode) {
-    case MBlazeISD::JmpLink    : return "MBlazeISD::JmpLink";
-    case MBlazeISD::GPRel      : return "MBlazeISD::GPRel";
-    case MBlazeISD::Wrap       : return "MBlazeISD::Wrap";
-    case MBlazeISD::ICmp       : return "MBlazeISD::ICmp";
-    case MBlazeISD::Ret        : return "MBlazeISD::Ret";
-    case MBlazeISD::Select_CC  : return "MBlazeISD::Select_CC";
-    default                    : return NULL;
-  }
-}
-
-MBlazeTargetLowering::MBlazeTargetLowering(MBlazeTargetMachine &TM)
-  : TargetLowering(TM, new MBlazeTargetObjectFile()) {
-  Subtarget = &TM.getSubtarget<MBlazeSubtarget>();
-
-  // MBlaze does not have i1 type, so use i32 for
-  // setcc operations results (slt, sgt, ...).
-  setBooleanContents(ZeroOrOneBooleanContent);
-  setBooleanVectorContents(ZeroOrOneBooleanContent); // FIXME: Is this correct?
-
-  // Set up the register classes
-  addRegisterClass(MVT::i32, &MBlaze::GPRRegClass);
-  if (Subtarget->hasFPU()) {
-    addRegisterClass(MVT::f32, &MBlaze::GPRRegClass);
-    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
-  }
-
-  // Floating point operations which are not supported
-  setOperationAction(ISD::FREM,       MVT::f32, Expand);
-  setOperationAction(ISD::FMA,        MVT::f32, Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
-  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
-  setOperationAction(ISD::FP_ROUND,   MVT::f32, Expand);
-  setOperationAction(ISD::FP_ROUND,   MVT::f64, Expand);
-  setOperationAction(ISD::FCOPYSIGN,  MVT::f32, Expand);
-  setOperationAction(ISD::FCOPYSIGN,  MVT::f64, Expand);
-  setOperationAction(ISD::FSIN,       MVT::f32, Expand);
-  setOperationAction(ISD::FCOS,       MVT::f32, Expand);
-  setOperationAction(ISD::FSINCOS,    MVT::f32, Expand);
-  setOperationAction(ISD::FPOWI,      MVT::f32, Expand);
-  setOperationAction(ISD::FPOW,       MVT::f32, Expand);
-  setOperationAction(ISD::FLOG,       MVT::f32, Expand);
-  setOperationAction(ISD::FLOG2,      MVT::f32, Expand);
-  setOperationAction(ISD::FLOG10,     MVT::f32, Expand);
-  setOperationAction(ISD::FEXP,       MVT::f32, Expand);
-
-  // Load extented operations for i1 types must be promoted
-  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
-
-  // Sign extended loads must be expanded
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
-
-  // MBlaze has no REM or DIVREM operations.
-  setOperationAction(ISD::UREM,    MVT::i32, Expand);
-  setOperationAction(ISD::SREM,    MVT::i32, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
-
-  // If the processor doesn't support multiply then expand it
-  if (!Subtarget->hasMul()) {
-    setOperationAction(ISD::MUL, MVT::i32, Expand);
-  }
-
-  // If the processor doesn't support 64-bit multiply then expand
-  if (!Subtarget->hasMul() || !Subtarget->hasMul64()) {
-    setOperationAction(ISD::MULHS, MVT::i32, Expand);
-    setOperationAction(ISD::MULHS, MVT::i64, Expand);
-    setOperationAction(ISD::MULHU, MVT::i32, Expand);
-    setOperationAction(ISD::MULHU, MVT::i64, Expand);
-  }
-
-  // If the processor doesn't support division then expand
-  if (!Subtarget->hasDiv()) {
-    setOperationAction(ISD::UDIV, MVT::i32, Expand);
-    setOperationAction(ISD::SDIV, MVT::i32, Expand);
-  }
-
-  // Expand unsupported conversions
-  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
-  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
-
-  // Expand SELECT_CC
-  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
-
-  // MBlaze doesn't have MUL_LOHI
-  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-
-  // Used by legalize types to correctly generate the setcc result.
-  // Without this, every float setcc comes with a AND/OR with the result,
-  // we don't want this, since the fpcmp result goes to a flag register,
-  // which is used implicitly by brcond and select operations.
-  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
-  AddPromotedToType(ISD::SELECT, MVT::i1, MVT::i32);
-  AddPromotedToType(ISD::SELECT_CC, MVT::i1, MVT::i32);
-
-  // MBlaze Custom Operations
-  setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
-  setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
-  setOperationAction(ISD::JumpTable,          MVT::i32,   Custom);
-  setOperationAction(ISD::ConstantPool,       MVT::i32,   Custom);
-
-  // Variable Argument support
-  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
-  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
-  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
-  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
-
-
-  // Operations not directly supported by MBlaze.
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,   Expand);
-  setOperationAction(ISD::BR_JT,              MVT::Other, Expand);
-  setOperationAction(ISD::BR_CC,              MVT::f32,   Expand);
-  setOperationAction(ISD::BR_CC,              MVT::i32,   Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG,  MVT::i1,    Expand);
-  setOperationAction(ISD::ROTL,               MVT::i32,   Expand);
-  setOperationAction(ISD::ROTR,               MVT::i32,   Expand);
-  setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Expand);
-  setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Expand);
-  setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Expand);
-  setOperationAction(ISD::CTLZ,               MVT::i32,   Expand);
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF,    MVT::i32,   Expand);
-  setOperationAction(ISD::CTTZ,               MVT::i32,   Expand);
-  setOperationAction(ISD::CTTZ_ZERO_UNDEF,    MVT::i32,   Expand);
-  setOperationAction(ISD::CTPOP,              MVT::i32,   Expand);
-  setOperationAction(ISD::BSWAP,              MVT::i32,   Expand);
-
-  // We don't have line number support yet.
-  setOperationAction(ISD::EH_LABEL,          MVT::Other, Expand);
-
-  // Use the default for now
-  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
-  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
-
-  // MBlaze doesn't have extending float->double load/store
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-
-  setMinFunctionAlignment(2);
-
-  setStackPointerRegisterToSaveRestore(MBlaze::R1);
-  computeRegisterProperties();
-}
-
-EVT MBlazeTargetLowering::getSetCCResultType(EVT VT) const {
-  return MVT::i32;
-}
-
-SDValue MBlazeTargetLowering::LowerOperation(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  switch (Op.getOpcode())
-  {
-    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
-    case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
-    case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
-    case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
-    case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
-    case ISD::VASTART:            return LowerVASTART(Op, DAG);
-  }
-  return SDValue();
-}
-
-//===----------------------------------------------------------------------===//
-//  Lower helper functions
-//===----------------------------------------------------------------------===//
-MachineBasicBlock*
-MBlazeTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                  MachineBasicBlock *MBB)
-                                                  const {
-  switch (MI->getOpcode()) {
-  default: llvm_unreachable("Unexpected instr type to insert");
-
-  case MBlaze::ShiftRL:
-  case MBlaze::ShiftRA:
-  case MBlaze::ShiftL:
-    return EmitCustomShift(MI, MBB);
-
-  case MBlaze::Select_FCC:
-  case MBlaze::Select_CC:
-    return EmitCustomSelect(MI, MBB);
-
-  case MBlaze::CAS32:
-  case MBlaze::SWP32:
-  case MBlaze::LAA32:
-  case MBlaze::LAS32:
-  case MBlaze::LAD32:
-  case MBlaze::LAO32:
-  case MBlaze::LAX32:
-  case MBlaze::LAN32:
-    return EmitCustomAtomic(MI, MBB);
-
-  case MBlaze::MEMBARRIER:
-    // The Microblaze does not need memory barriers. Just delete the pseudo
-    // instruction and finish.
-    MI->eraseFromParent();
-    return MBB;
-  }
-}
-
-MachineBasicBlock*
-MBlazeTargetLowering::EmitCustomShift(MachineInstr *MI,
-                                      MachineBasicBlock *MBB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-
-  // To "insert" a shift left instruction, we actually have to insert a
-  // simple loop.  The incoming instruction knows the destination vreg to
-  // set, the source vreg to operate over and the shift amount.
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  MachineFunction::iterator It = MBB;
-  ++It;
-
-  // start:
-  //   andi     samt, samt, 31
-  //   beqid    samt, finish
-  //   add      dst, src, r0
-  // loop:
-  //   addik    samt, samt, -1
-  //   sra      dst, dst
-  //   bneid    samt, loop
-  //   nop
-  // finish:
-  MachineFunction *F = MBB->getParent();
-  MachineRegisterInfo &R = F->getRegInfo();
-  MachineBasicBlock *loop = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *finish = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, loop);
-  F->insert(It, finish);
-
-  // Update machine-CFG edges by transferring adding all successors and
-  // remaining instructions from the current block to the new block which
-  // will contain the Phi node for the select.
-  finish->splice(finish->begin(), MBB,
-                 llvm::next(MachineBasicBlock::iterator(MI)),
-                 MBB->end());
-  finish->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // Add the true and fallthrough blocks as its successors.
-  MBB->addSuccessor(loop);
-  MBB->addSuccessor(finish);
-
-  // Next, add the finish block as a successor of the loop block
-  loop->addSuccessor(finish);
-  loop->addSuccessor(loop);
-
-  unsigned IAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
-  BuildMI(MBB, dl, TII->get(MBlaze::ANDI), IAMT)
-    .addReg(MI->getOperand(2).getReg())
-    .addImm(31);
-
-  unsigned IVAL = R.createVirtualRegister(&MBlaze::GPRRegClass);
-  BuildMI(MBB, dl, TII->get(MBlaze::ADDIK), IVAL)
-    .addReg(MI->getOperand(1).getReg())
-    .addImm(0);
-
-  BuildMI(MBB, dl, TII->get(MBlaze::BEQID))
-    .addReg(IAMT)
-    .addMBB(finish);
-
-  unsigned DST = R.createVirtualRegister(&MBlaze::GPRRegClass);
-  unsigned NDST = R.createVirtualRegister(&MBlaze::GPRRegClass);
-  BuildMI(loop, dl, TII->get(MBlaze::PHI), DST)
-    .addReg(IVAL).addMBB(MBB)
-    .addReg(NDST).addMBB(loop);
-
-  unsigned SAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
-  unsigned NAMT = R.createVirtualRegister(&MBlaze::GPRRegClass);
-  BuildMI(loop, dl, TII->get(MBlaze::PHI), SAMT)
-    .addReg(IAMT).addMBB(MBB)
-    .addReg(NAMT).addMBB(loop);
-
-  if (MI->getOpcode() == MBlaze::ShiftL)
-    BuildMI(loop, dl, TII->get(MBlaze::ADD), NDST).addReg(DST).addReg(DST);
-  else if (MI->getOpcode() == MBlaze::ShiftRA)
-    BuildMI(loop, dl, TII->get(MBlaze::SRA), NDST).addReg(DST);
-  else if (MI->getOpcode() == MBlaze::ShiftRL)
-    BuildMI(loop, dl, TII->get(MBlaze::SRL), NDST).addReg(DST);
-  else
-    llvm_unreachable("Cannot lower unknown shift instruction");
-
-  BuildMI(loop, dl, TII->get(MBlaze::ADDIK), NAMT)
-    .addReg(SAMT)
-    .addImm(-1);
-
-  BuildMI(loop, dl, TII->get(MBlaze::BNEID))
-    .addReg(NAMT)
-    .addMBB(loop);
-
-  BuildMI(*finish, finish->begin(), dl,
-          TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
-    .addReg(IVAL).addMBB(MBB)
-    .addReg(NDST).addMBB(loop);
-
-  // The pseudo instruction is no longer needed so remove it
-  MI->eraseFromParent();
-  return finish;
-}
-
-MachineBasicBlock*
-MBlazeTargetLowering::EmitCustomSelect(MachineInstr *MI,
-                                       MachineBasicBlock *MBB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-
-  // To "insert" a SELECT_CC instruction, we actually have to insert the
-  // diamond control-flow pattern.  The incoming instruction knows the
-  // destination vreg to set, the condition code register to branch on, the
-  // true/false values to select between, and a branch opcode to use.
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  MachineFunction::iterator It = MBB;
-  ++It;
-
-  //  thisMBB:
-  //  ...
-  //   TrueVal = ...
-  //   setcc r1, r2, r3
-  //   bNE   r1, r0, copy1MBB
-  //   fallthrough --> copy0MBB
-  MachineFunction *F = MBB->getParent();
-  MachineBasicBlock *flsBB = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *dneBB = F->CreateMachineBasicBlock(LLVM_BB);
-
-  unsigned Opc;
-  switch (MI->getOperand(4).getImm()) {
-  default: llvm_unreachable("Unknown branch condition");
-  case MBlazeCC::EQ: Opc = MBlaze::BEQID; break;
-  case MBlazeCC::NE: Opc = MBlaze::BNEID; break;
-  case MBlazeCC::GT: Opc = MBlaze::BGTID; break;
-  case MBlazeCC::LT: Opc = MBlaze::BLTID; break;
-  case MBlazeCC::GE: Opc = MBlaze::BGEID; break;
-  case MBlazeCC::LE: Opc = MBlaze::BLEID; break;
-  }
-
-  F->insert(It, flsBB);
-  F->insert(It, dneBB);
-
-  // Transfer the remainder of MBB and its successor edges to dneBB.
-  dneBB->splice(dneBB->begin(), MBB,
-                llvm::next(MachineBasicBlock::iterator(MI)),
-                MBB->end());
-  dneBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  MBB->addSuccessor(flsBB);
-  MBB->addSuccessor(dneBB);
-  flsBB->addSuccessor(dneBB);
-
-  BuildMI(MBB, dl, TII->get(Opc))
-    .addReg(MI->getOperand(3).getReg())
-    .addMBB(dneBB);
-
-  //  sinkMBB:
-  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
-  //  ...
-  //BuildMI(dneBB, dl, TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
-  //  .addReg(MI->getOperand(1).getReg()).addMBB(flsBB)
-  //  .addReg(MI->getOperand(2).getReg()).addMBB(BB);
-
-  BuildMI(*dneBB, dneBB->begin(), dl,
-          TII->get(MBlaze::PHI), MI->getOperand(0).getReg())
-    .addReg(MI->getOperand(2).getReg()).addMBB(flsBB)
-    .addReg(MI->getOperand(1).getReg()).addMBB(MBB);
-
-  MI->eraseFromParent();   // The pseudo instruction is gone now.
-  return dneBB;
-}
-
-MachineBasicBlock*
-MBlazeTargetLowering::EmitCustomAtomic(MachineInstr *MI,
-                                       MachineBasicBlock *MBB) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-
-  // All atomic instructions on the Microblaze are implemented using the
-  // load-linked / store-conditional style atomic instruction sequences.
-  // Thus, all operations will look something like the following:
-  //
-  //  start:
-  //    lwx     RV, RP, 0
-  //    <do stuff>
-  //    swx     RV, RP, 0
-  //    addic   RC, R0, 0
-  //    bneid   RC, start
-  //
-  //  exit:
-  //
-  // To "insert" a shift left instruction, we actually have to insert a
-  // simple loop.  The incoming instruction knows the destination vreg to
-  // set, the source vreg to operate over and the shift amount.
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  MachineFunction::iterator It = MBB;
-  ++It;
-
-  // start:
-  //   andi     samt, samt, 31
-  //   beqid    samt, finish
-  //   add      dst, src, r0
-  // loop:
-  //   addik    samt, samt, -1
-  //   sra      dst, dst
-  //   bneid    samt, loop
-  //   nop
-  // finish:
-  MachineFunction *F = MBB->getParent();
-  MachineRegisterInfo &R = F->getRegInfo();
-
-  // Create the start and exit basic blocks for the atomic operation
-  MachineBasicBlock *start = F->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exit = F->CreateMachineBasicBlock(LLVM_BB);
-  F->insert(It, start);
-  F->insert(It, exit);
-
-  // Update machine-CFG edges by transferring adding all successors and
-  // remaining instructions from the current block to the new block which
-  // will contain the Phi node for the select.
-  exit->splice(exit->begin(), MBB, llvm::next(MachineBasicBlock::iterator(MI)),
-               MBB->end());
-  exit->transferSuccessorsAndUpdatePHIs(MBB);
-
-  // Add the fallthrough block as its successors.
-  MBB->addSuccessor(start);
-
-  BuildMI(start, dl, TII->get(MBlaze::LWX), MI->getOperand(0).getReg())
-    .addReg(MI->getOperand(1).getReg())
-    .addReg(MBlaze::R0);
-
-  MachineBasicBlock *final = start;
-  unsigned finalReg = 0;
-
-  switch (MI->getOpcode()) {
-  default: llvm_unreachable("Cannot lower unknown atomic instruction!");
-
-  case MBlaze::SWP32:
-    finalReg = MI->getOperand(2).getReg();
-    start->addSuccessor(exit);
-    start->addSuccessor(start);
-    break;
-
-  case MBlaze::LAN32:
-  case MBlaze::LAX32:
-  case MBlaze::LAO32:
-  case MBlaze::LAD32:
-  case MBlaze::LAS32:
-  case MBlaze::LAA32: {
-    unsigned opcode = 0;
-    switch (MI->getOpcode()) {
-    default: llvm_unreachable("Cannot lower unknown atomic load!");
-    case MBlaze::LAA32: opcode = MBlaze::ADDIK; break;
-    case MBlaze::LAS32: opcode = MBlaze::RSUBIK; break;
-    case MBlaze::LAD32: opcode = MBlaze::AND; break;
-    case MBlaze::LAO32: opcode = MBlaze::OR; break;
-    case MBlaze::LAX32: opcode = MBlaze::XOR; break;
-    case MBlaze::LAN32: opcode = MBlaze::AND; break;
-    }
-
-    finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass);
-    start->addSuccessor(exit);
-    start->addSuccessor(start);
-
-    BuildMI(start, dl, TII->get(opcode), finalReg)
-      .addReg(MI->getOperand(0).getReg())
-      .addReg(MI->getOperand(2).getReg());
-
-    if (MI->getOpcode() == MBlaze::LAN32) {
-      unsigned tmp = finalReg;
-      finalReg = R.createVirtualRegister(&MBlaze::GPRRegClass);
-      BuildMI(start, dl, TII->get(MBlaze::XORI), finalReg)
-        .addReg(tmp)
-        .addImm(-1);
-    }
-    break;
-  }
-
-  case MBlaze::CAS32: {
-    finalReg = MI->getOperand(3).getReg();
-    final = F->CreateMachineBasicBlock(LLVM_BB);
-
-    F->insert(It, final);
-    start->addSuccessor(exit);
-    start->addSuccessor(final);
-    final->addSuccessor(exit);
-    final->addSuccessor(start);
-
-    unsigned CMP = R.createVirtualRegister(&MBlaze::GPRRegClass);
-    BuildMI(start, dl, TII->get(MBlaze::CMP), CMP)
-      .addReg(MI->getOperand(0).getReg())
-      .addReg(MI->getOperand(2).getReg());
-
-    BuildMI(start, dl, TII->get(MBlaze::BNEID))
-      .addReg(CMP)
-      .addMBB(exit);
-
-    final->moveAfter(start);
-    exit->moveAfter(final);
-    break;
-  }
-  }
-
-  unsigned CHK = R.createVirtualRegister(&MBlaze::GPRRegClass);
-  BuildMI(final, dl, TII->get(MBlaze::SWX))
-    .addReg(finalReg)
-    .addReg(MI->getOperand(1).getReg())
-    .addReg(MBlaze::R0);
-
-  BuildMI(final, dl, TII->get(MBlaze::ADDIC), CHK)
-    .addReg(MBlaze::R0)
-    .addImm(0);
-
-  BuildMI(final, dl, TII->get(MBlaze::BNEID))
-    .addReg(CHK)
-    .addMBB(start);
-
-  // The pseudo instruction is no longer needed so remove it
-  MI->eraseFromParent();
-  return exit;
-}
-
-//===----------------------------------------------------------------------===//
-//  Misc Lower Operation implementation
-//===----------------------------------------------------------------------===//
-//
-
-SDValue MBlazeTargetLowering::LowerSELECT_CC(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue TrueVal = Op.getOperand(2);
-  SDValue FalseVal = Op.getOperand(3);
-  DebugLoc dl = Op.getDebugLoc();
-  unsigned Opc;
-
-  SDValue CompareFlag;
-  if (LHS.getValueType() == MVT::i32) {
-    Opc = MBlazeISD::Select_CC;
-    CompareFlag = DAG.getNode(MBlazeISD::ICmp, dl, MVT::i32, LHS, RHS)
-                    .getValue(1);
-  } else {
-    llvm_unreachable("Cannot lower select_cc with unknown type");
-  }
-
-  return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
-                     CompareFlag);
-}
-
-SDValue MBlazeTargetLowering::
-LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
-  // FIXME there isn't actually debug info here
-  DebugLoc dl = Op.getDebugLoc();
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32);
-
-  return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, GA);
-}
-
-SDValue MBlazeTargetLowering::
-LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
-  llvm_unreachable("TLS not implemented for MicroBlaze.");
-}
-
-SDValue MBlazeTargetLowering::
-LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
-  SDValue ResNode;
-  SDValue HiPart;
-  // FIXME there isn't actually debug info here
-  DebugLoc dl = Op.getDebugLoc();
-
-  EVT PtrVT = Op.getValueType();
-  JumpTableSDNode *JT  = cast<JumpTableSDNode>(Op);
-
-  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, 0);
-  return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, JTI);
-}
-
-SDValue MBlazeTargetLowering::
-LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
-  SDValue ResNode;
-  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
-  const Constant *C = N->getConstVal();
-  DebugLoc dl = Op.getDebugLoc();
-
-  SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
-                                         N->getOffset(), 0);
-  return DAG.getNode(MBlazeISD::Wrap, dl, MVT::i32, CP);
-}
-
-SDValue MBlazeTargetLowering::LowerVASTART(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MBlazeFunctionInfo *FuncInfo = MF.getInfo<MBlazeFunctionInfo>();
-
-  DebugLoc dl = Op.getDebugLoc();
-  SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
-                                 getPointerTy());
-
-  // vastart just stores the address of the VarArgsFrameIndex slot into the
-  // memory location argument.
-  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  return DAG.getStore(Op.getOperand(0), dl, FI, Op.getOperand(1),
-                      MachinePointerInfo(SV),
-                      false, false, 0);
-}
-
-//===----------------------------------------------------------------------===//
-//                      Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeGenCallingConv.inc"
-
-static bool CC_MBlaze_AssignReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                CCValAssign::LocInfo &LocInfo,
-                                ISD::ArgFlagsTy &ArgFlags,
-                                CCState &State) {
-  static const uint16_t ArgRegs[] = {
-    MBlaze::R5, MBlaze::R6, MBlaze::R7,
-    MBlaze::R8, MBlaze::R9, MBlaze::R10
-  };
-
-  const unsigned NumArgRegs = array_lengthof(ArgRegs);
-  unsigned Reg = State.AllocateReg(ArgRegs, NumArgRegs);
-  if (!Reg) return false;
-
-  unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
-  State.AllocateStack(SizeInBytes, SizeInBytes);
-  State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-
-  return true;
-}
-
-//===----------------------------------------------------------------------===//
-//                  Call Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-/// LowerCall - functions arguments are copied from virtual regs to
-/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-/// TODO: isVarArg, isTailCall.
-SDValue MBlazeTargetLowering::
-LowerCall(TargetLowering::CallLoweringInfo &CLI,
-          SmallVectorImpl<SDValue> &InVals) const {
-  SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
-  SDValue Chain                         = CLI.Chain;
-  SDValue Callee                        = CLI.Callee;
-  bool &isTailCall                      = CLI.IsTailCall;
-  CallingConv::ID CallConv              = CLI.CallConv;
-  bool isVarArg                         = CLI.IsVarArg;
-
-  // MBlaze does not yet support tail call optimization
-  isTailCall = false;
-
-  // The MBlaze requires stack slots for arguments passed to var arg
-  // functions even if they are passed in registers.
-  bool needsRegArgSlots = isVarArg;
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
-
-  // Analyze operands of the call, assigning locations to each operand.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeCallOperands(Outs, CC_MBlaze);
-
-  // Get a count of how many bytes are to be pushed on the stack.
-  unsigned NumBytes = CCInfo.getNextStackOffset();
-
-  // Variable argument function calls require a minimum of 24-bytes of stack
-  if (isVarArg && NumBytes < 24) NumBytes = 24;
-
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
-
-  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
-  SmallVector<SDValue, 8> MemOpChains;
-
-  // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-    MVT RegVT = VA.getLocVT();
-    SDValue Arg = OutVals[i];
-
-    // Promote the value if needed.
-    switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full: break;
-    case CCValAssign::SExt:
-      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
-      break;
-    case CCValAssign::ZExt:
-      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
-      break;
-    case CCValAssign::AExt:
-      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
-      break;
-    }
-
-    // Arguments that can be passed on register must be kept at
-    // RegsToPass vector
-    if (VA.isRegLoc()) {
-      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    } else {
-      // Register can't get to this point...
-      assert(VA.isMemLoc());
-
-      // Since we are alread passing values on the stack we don't
-      // need to worry about creating additional slots for the
-      // values passed via registers.
-      needsRegArgSlots = false;
-
-      // Create the frame index object for this incoming parameter
-      unsigned ArgSize = VA.getValVT().getSizeInBits()/8;
-      unsigned StackLoc = VA.getLocMemOffset() + 4;
-      int FI = MFI->CreateFixedObject(ArgSize, StackLoc, true);
-
-      SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
-
-      // emit ISD::STORE whichs stores the
-      // parameter value to a stack Location
-      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
-                                         MachinePointerInfo(),
-                                         false, false, 0));
-    }
-  }
-
-  // If we need to reserve stack space for the arguments passed via registers
-  // then create a fixed stack object at the beginning of the stack.
-  if (needsRegArgSlots && TFI.hasReservedCallFrame(MF))
-    MFI->CreateFixedObject(28,0,true);
-
-  // Transform all store nodes into one single node because all store
-  // nodes are independent of each other.
-  if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
-
-  // Build a sequence of copy-to-reg nodes chained together with token
-  // chain and flag operands which copy the outgoing args into registers.
-  // The InFlag in necessary since all emitted instructions must be
-  // stuck together.
-  SDValue InFlag;
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
-  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
-  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
-  // node so that legalize doesn't hack it.
-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
-    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
-                                getPointerTy(), 0, 0);
-  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
-    Callee = DAG.getTargetExternalSymbol(S->getSymbol(),
-                                getPointerTy(), 0);
-
-  // MBlazeJmpLink = #chain, #target_address, #opt_in_flags...
-  //             = Chain, Callee, Reg#1, Reg#2, ...
-  //
-  // Returns a chain & a flag for retval copy to use.
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SmallVector<SDValue, 8> Ops;
-  Ops.push_back(Chain);
-  Ops.push_back(Callee);
-
-  // Add argument registers to the end of the list so that they are
-  // known live into the call.
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
-                                  RegsToPass[i].second.getValueType()));
-  }
-
-  if (InFlag.getNode())
-    Ops.push_back(InFlag);
-
-  Chain  = DAG.getNode(MBlazeISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
-  InFlag = Chain.getValue(1);
-
-  // Create the CALLSEQ_END node.
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag);
-  if (!Ins.empty())
-    InFlag = Chain.getValue(1);
-
-  // Handle result values, copying them out of physregs into vregs that we
-  // return.
-  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
-                         Ins, dl, DAG, InVals);
-}
-
-/// LowerCallResult - Lower the result values of a call into the
-/// appropriate copies out of appropriate physical registers.
-SDValue MBlazeTargetLowering::
-LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv,
-                bool isVarArg, const SmallVectorImpl<ISD::InputArg> &Ins,
-                DebugLoc dl, SelectionDAG &DAG,
-                SmallVectorImpl<SDValue> &InVals) const {
-  // Assign locations to each value returned by this call.
-  SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
-
-  CCInfo.AnalyzeCallResult(Ins, RetCC_MBlaze);
-
-  // Copy all of the result registers out of their specified physreg.
-  for (unsigned i = 0; i != RVLocs.size(); ++i) {
-    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
-                               RVLocs[i].getValVT(), InFlag).getValue(1);
-    InFlag = Chain.getValue(2);
-    InVals.push_back(Chain.getValue(0));
-  }
-
-  return Chain;
-}
-
-//===----------------------------------------------------------------------===//
-//             Formal Arguments Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-/// LowerFormalArguments - transform physical registers into
-/// virtual registers and generate load operations for
-/// arguments places on the stack.
-SDValue MBlazeTargetLowering::
-LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                     DebugLoc dl, SelectionDAG &DAG,
-                     SmallVectorImpl<SDValue> &InVals) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-
-  unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
-  MBlazeFI->setVarArgsFrameIndex(0);
-
-  // Used with vargs to acumulate store chains.
-  std::vector<SDValue> OutChains;
-
-  // Keep track of the last register used for arguments
-  unsigned ArgRegEnd = 0;
-
-  // Assign locations to all of the incoming arguments.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-
-  CCInfo.AnalyzeFormalArguments(Ins, CC_MBlaze);
-  SDValue StackPtr;
-
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-
-    // Arguments stored on registers
-    if (VA.isRegLoc()) {
-      MVT RegVT = VA.getLocVT();
-      ArgRegEnd = VA.getLocReg();
-      const TargetRegisterClass *RC;
-
-      if (RegVT == MVT::i32)
-        RC = &MBlaze::GPRRegClass;
-      else if (RegVT == MVT::f32)
-        RC = &MBlaze::GPRRegClass;
-      else
-        llvm_unreachable("RegVT not supported by LowerFormalArguments");
-
-      // Transform the arguments stored on
-      // physical registers into virtual ones
-      unsigned Reg = MF.addLiveIn(ArgRegEnd, RC);
-      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
-
-      // If this is an 8 or 16-bit value, it has been passed promoted
-      // to 32 bits.  Insert an assert[sz]ext to capture this, then
-      // truncate to the right size. If if is a floating point value
-      // then convert to the correct type.
-      if (VA.getLocInfo() != CCValAssign::Full) {
-        unsigned Opcode = 0;
-        if (VA.getLocInfo() == CCValAssign::SExt)
-          Opcode = ISD::AssertSext;
-        else if (VA.getLocInfo() == CCValAssign::ZExt)
-          Opcode = ISD::AssertZext;
-        if (Opcode)
-          ArgValue = DAG.getNode(Opcode, dl, RegVT, ArgValue,
-                                 DAG.getValueType(VA.getValVT()));
-        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
-      }
-
-      InVals.push_back(ArgValue);
-    } else { // VA.isRegLoc()
-      // sanity check
-      assert(VA.isMemLoc());
-
-      // The last argument is not a register
-      ArgRegEnd = 0;
-
-      // The stack pointer offset is relative to the caller stack frame.
-      // Since the real stack size is unknown here, a negative SPOffset
-      // is used so there's a way to adjust these offsets when the stack
-      // size get known (on EliminateFrameIndex). A dummy SPOffset is
-      // used instead of a direct negative address (which is recorded to
-      // be used on emitPrologue) to avoid mis-calc of the first stack
-      // offset on PEI::calculateFrameObjectOffsets.
-      // Arguments are always 32-bit.
-      unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
-      unsigned StackLoc = VA.getLocMemOffset() + 4;
-      int FI = MFI->CreateFixedObject(ArgSize, 0, true);
-      MBlazeFI->recordLoadArgsFI(FI, -StackLoc);
-      MBlazeFI->recordLiveIn(FI);
-
-      // Create load nodes to retrieve arguments from the stack
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
-                                   false, false, false, 0));
-    }
-  }
-
-  // To meet ABI, when VARARGS are passed on registers, the registers
-  // must have their values written to the caller stack frame. If the last
-  // argument was placed in the stack, there's no need to save any register.
-  if ((isVarArg) && ArgRegEnd) {
-    if (StackPtr.getNode() == 0)
-      StackPtr = DAG.getRegister(StackReg, getPointerTy());
-
-    // The last register argument that must be saved is MBlaze::R10
-    const TargetRegisterClass *RC = &MBlaze::GPRRegClass;
-
-    unsigned Begin = getMBlazeRegisterNumbering(MBlaze::R5);
-    unsigned Start = getMBlazeRegisterNumbering(ArgRegEnd+1);
-    unsigned End   = getMBlazeRegisterNumbering(MBlaze::R10);
-    unsigned StackLoc = Start - Begin + 1;
-
-    for (; Start <= End; ++Start, ++StackLoc) {
-      unsigned Reg = getMBlazeRegisterFromNumbering(Start);
-      unsigned LiveReg = MF.addLiveIn(Reg, RC);
-      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, LiveReg, MVT::i32);
-
-      int FI = MFI->CreateFixedObject(4, 0, true);
-      MBlazeFI->recordStoreVarArgsFI(FI, -(StackLoc*4));
-      SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
-      OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff,
-                                       MachinePointerInfo(),
-                                       false, false, 0));
-
-      // Record the frame index of the first variable argument
-      // which is a value necessary to VASTART.
-      if (!MBlazeFI->getVarArgsFrameIndex())
-        MBlazeFI->setVarArgsFrameIndex(FI);
-    }
-  }
-
-  // All stores are grouped in one node to allow the matching between
-  // the size of Ins and InVals. This only happens when on varg functions
-  if (!OutChains.empty()) {
-    OutChains.push_back(Chain);
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &OutChains[0], OutChains.size());
-  }
-
-  return Chain;
-}
-
-//===----------------------------------------------------------------------===//
-//               Return Value Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-SDValue MBlazeTargetLowering::
-LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-            const SmallVectorImpl<ISD::OutputArg> &Outs,
-            const SmallVectorImpl<SDValue> &OutVals,
-            DebugLoc dl, SelectionDAG &DAG) const {
-  // CCValAssign - represent the assignment of
-  // the return value to a location
-  SmallVector<CCValAssign, 16> RVLocs;
-
-  // CCState - Info about the registers and stack slot.
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
-
-  // Analize return values.
-  CCInfo.AnalyzeReturn(Outs, RetCC_MBlaze);
-
-  SDValue Flag;
-  SmallVector<SDValue, 4> RetOps(1, Chain);
-
-  // If this function is using the interrupt_handler calling convention
-  // then use "rtid r14, 0" otherwise use "rtsd r15, 8"
-  unsigned Ret = (CallConv == CallingConv::MBLAZE_INTR) ? MBlazeISD::IRet
-                                                        : MBlazeISD::Ret;
-  unsigned Reg = (CallConv == CallingConv::MBLAZE_INTR) ? MBlaze::R14
-                                                        : MBlaze::R15;
-  RetOps.push_back(DAG.getRegister(Reg, MVT::i32));
-
-
-  // Copy the result values into the output registers.
-  for (unsigned i = 0; i != RVLocs.size(); ++i) {
-    CCValAssign &VA = RVLocs[i];
-    assert(VA.isRegLoc() && "Can only return in registers!");
-
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                             OutVals[i], Flag);
-
-    // guarantee that all emitted copies are
-    // stuck together, avoiding something bad
-    Flag = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
-  }
-
-  RetOps[0] = Chain;  // Update chain.
-
-  // Add the flag if we have it.
-  if (Flag.getNode())
-    RetOps.push_back(Flag);
-
-  return DAG.getNode(Ret, dl, MVT::Other, &RetOps[0], RetOps.size());
-}
-
-//===----------------------------------------------------------------------===//
-//                           MBlaze Inline Assembly Support
-//===----------------------------------------------------------------------===//
-
-/// getConstraintType - Given a constraint letter, return the type of
-/// constraint it is for this target.
-MBlazeTargetLowering::ConstraintType MBlazeTargetLowering::
-getConstraintType(const std::string &Constraint) const
-{
-  // MBlaze specific constrainy
-  //
-  // 'd' : An address register. Equivalent to r.
-  // 'y' : Equivalent to r; retained for
-  //       backwards compatibility.
-  // 'f' : Floating Point registers.
-  if (Constraint.size() == 1) {
-    switch (Constraint[0]) {
-      default : break;
-      case 'd':
-      case 'y':
-      case 'f':
-        return C_RegisterClass;
-    }
-  }
-  return TargetLowering::getConstraintType(Constraint);
-}
-
-/// Examine constraint type and operand type and determine a weight value.
-/// This object must already have been set up with the operand type
-/// and the current alternative constraint selected.
-TargetLowering::ConstraintWeight
-MBlazeTargetLowering::getSingleConstraintMatchWeight(
-    AsmOperandInfo &info, const char *constraint) const {
-  ConstraintWeight weight = CW_Invalid;
-  Value *CallOperandVal = info.CallOperandVal;
-    // If we don't have a value, we can't do a match,
-    // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
-    return CW_Default;
-  Type *type = CallOperandVal->getType();
-  // Look at the constraint type.
-  switch (*constraint) {
-  default:
-    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
-    break;
-  case 'd':
-  case 'y':
-    if (type->isIntegerTy())
-      weight = CW_Register;
-    break;
-  case 'f':
-    if (type->isFloatTy())
-      weight = CW_Register;
-    break;
-  }
-  return weight;
-}
-
-/// Given a register class constraint, like 'r', if this corresponds directly
-/// to an LLVM register class, return a register of 0 and the register class
-/// pointer.
-std::pair<unsigned, const TargetRegisterClass*> MBlazeTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
-  if (Constraint.size() == 1) {
-    switch (Constraint[0]) {
-    case 'r':
-      return std::make_pair(0U, &MBlaze::GPRRegClass);
-      // TODO: These can't possibly be right, but match what was in
-      // getRegClassForInlineAsmConstraint.
-    case 'd':
-    case 'y':
-    case 'f':
-      if (VT == MVT::f32)
-        return std::make_pair(0U, &MBlaze::GPRRegClass);
-    }
-  }
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
-}
-
-bool MBlazeTargetLowering::
-isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
-  // The MBlaze target isn't yet aware of offsets.
-  return false;
-}
-
-bool MBlazeTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-  return VT != MVT::f32;
-}
diff --git a/lib/Target/MBlaze/MBlazeISelLowering.h b/lib/Target/MBlaze/MBlazeISelLowering.h
deleted file mode 100644
index f6b4095..0000000
--- a/lib/Target/MBlaze/MBlazeISelLowering.h
+++ /dev/null
@@ -1,179 +0,0 @@
-//===-- MBlazeISelLowering.h - MBlaze DAG Lowering Interface ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interfaces that MBlaze uses to lower LLVM code into a
-// selection DAG.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBlazeISELLOWERING_H
-#define MBlazeISELLOWERING_H
-
-#include "MBlaze.h"
-#include "MBlazeSubtarget.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetLowering.h"
-
-namespace llvm {
-  namespace MBlazeCC {
-    enum CC {
-      FIRST = 0,
-      EQ,
-      NE,
-      GT,
-      LT,
-      GE,
-      LE
-    };
-
-    inline static CC getOppositeCondition(CC cc) {
-      switch (cc) {
-      default: llvm_unreachable("Unknown condition code");
-      case EQ: return NE;
-      case NE: return EQ;
-      case GT: return LE;
-      case LT: return GE;
-      case GE: return LT;
-      case LE: return GE;
-      }
-    }
-
-    inline static const char *MBlazeCCToString(CC cc) {
-      switch (cc) {
-      default: llvm_unreachable("Unknown condition code");
-      case EQ: return "eq";
-      case NE: return "ne";
-      case GT: return "gt";
-      case LT: return "lt";
-      case GE: return "ge";
-      case LE: return "le";
-      }
-    }
-  }
-
-  namespace MBlazeISD {
-    enum NodeType {
-      // Start the numbering from where ISD NodeType finishes.
-      FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
-      // Jump and link (call)
-      JmpLink,
-
-      // Handle gp_rel (small data/bss sections) relocation.
-      GPRel,
-
-      // Select CC Pseudo Instruction
-      Select_CC,
-
-      // Wrap up multiple types of instructions
-      Wrap,
-
-      // Integer Compare
-      ICmp,
-
-      // Return from subroutine
-      Ret,
-
-      // Return from interrupt
-      IRet
-    };
-  }
-
-  //===--------------------------------------------------------------------===//
-  // TargetLowering Implementation
-  //===--------------------------------------------------------------------===//
-
-  class MBlazeTargetLowering : public TargetLowering  {
-  public:
-    explicit MBlazeTargetLowering(MBlazeTargetMachine &TM);
-
-    /// LowerOperation - Provide custom lowering hooks for some operations.
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
-
-    /// getTargetNodeName - This method returns the name of a target specific
-    //  DAG node.
-    virtual const char *getTargetNodeName(unsigned Opcode) const;
-
-    /// getSetCCResultType - get the ISD::SETCC result ValueType
-    EVT getSetCCResultType(EVT VT) const;
-
-  private:
-    // Subtarget Info
-    const MBlazeSubtarget *Subtarget;
-
-
-    // Lower Operand helpers
-    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
-                            CallingConv::ID CallConv, bool isVarArg,
-                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
-                            SmallVectorImpl<SDValue> &InVals) const;
-
-    // Lower Operand specifics
-    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
-
-    virtual SDValue
-      LowerFormalArguments(SDValue Chain,
-                           CallingConv::ID CallConv, bool isVarArg,
-                           const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
-                           SmallVectorImpl<SDValue> &InVals) const;
-
-    virtual SDValue
-      LowerCall(TargetLowering::CallLoweringInfo &CLI,
-                SmallVectorImpl<SDValue> &InVals) const;
-
-    virtual SDValue
-      LowerReturn(SDValue Chain,
-                  CallingConv::ID CallConv, bool isVarArg,
-                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                  const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
-
-    virtual MachineBasicBlock*
-      EmitCustomShift(MachineInstr *MI, MachineBasicBlock *MBB) const;
-
-    virtual MachineBasicBlock*
-      EmitCustomSelect(MachineInstr *MI, MachineBasicBlock *MBB) const;
-
-    virtual MachineBasicBlock*
-            EmitCustomAtomic(MachineInstr *MI, MachineBasicBlock *MBB) const;
-
-    virtual MachineBasicBlock *
-      EmitInstrWithCustomInserter(MachineInstr *MI,
-                                  MachineBasicBlock *MBB) const;
-
-    // Inline asm support
-    ConstraintType getConstraintType(const std::string &Constraint) const;
-
-    /// Examine constraint string and operand type and determine a weight value.
-    /// The operand object must already have been set up with the operand type.
-    ConstraintWeight getSingleConstraintMatchWeight(
-      AsmOperandInfo &info, const char *constraint) const;
-
-    std::pair<unsigned, const TargetRegisterClass*>
-              getRegForInlineAsmConstraint(const std::string &Constraint,
-              EVT VT) const;
-
-    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
-
-    /// isFPImmLegal - Returns true if the target can instruction select the
-    /// specified FP immediate natively. If false, the legalizer will
-    /// materialize the FP immediate as a load from a constant pool.
-    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
-  };
-}
-
-#endif // MBlazeISELLOWERING_H
diff --git a/lib/Target/MBlaze/MBlazeInstrFPU.td b/lib/Target/MBlaze/MBlazeInstrFPU.td
deleted file mode 100644
index 3f14593..0000000
--- a/lib/Target/MBlaze/MBlazeInstrFPU.td
+++ /dev/null
@@ -1,219 +0,0 @@
-//===-- MBlazeInstrFPU.td - MBlaze FPU Instruction defs ----*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MBlaze profiles and nodes
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MBlaze Operand, Complex Patterns and Transformations Definitions.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Memory Access Instructions
-//===----------------------------------------------------------------------===//
-class LoadFM<bits<6> op, string instr_asm, PatFrag OpNode> :
-             TA<op, 0x000, (outs GPR:$dst), (ins memrr:$addr),
-                !strconcat(instr_asm, "   $dst, $addr"),
-                [(set (f32 GPR:$dst), (OpNode xaddr:$addr))], IIC_MEMl>;
-
-class LoadFMI<bits<6> op, string instr_asm, PatFrag OpNode> :
-              TB<op, (outs GPR:$dst), (ins memri:$addr),
-                 !strconcat(instr_asm, "   $dst, $addr"),
-                 [(set (f32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>;
-
-class StoreFM<bits<6> op, string instr_asm, PatFrag OpNode> :
-              TA<op, 0x000, (outs), (ins GPR:$dst, memrr:$addr),
-                 !strconcat(instr_asm, "   $dst, $addr"),
-                 [(OpNode (f32 GPR:$dst), xaddr:$addr)], IIC_MEMs>;
-
-class StoreFMI<bits<6> op, string instr_asm, PatFrag OpNode> :
-               TB<op, (outs), (ins GPR:$dst, memrr:$addr),
-                  !strconcat(instr_asm, "   $dst, $addr"),
-                  [(OpNode (f32 GPR:$dst), iaddr:$addr)], IIC_MEMs>;
-
-class ArithF<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
-             InstrItinClass itin> :
-             TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
-
-class CmpFN<bits<6> op, bits<11> flags, string instr_asm,
-            InstrItinClass itin> :
-            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-               !strconcat(instr_asm, "   $dst, $b, $c"),
-               [], itin>;
-
-class ArithFR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
-             InstrItinClass itin> :
-             TAR<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-                 !strconcat(instr_asm, "   $dst, $c, $b"),
-                 [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
-
-class LogicFI<bits<6> op, string instr_asm> :
-             TB<op, (outs GPR:$dst), (ins GPR:$b, fimm:$c),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                [], IIC_ALU>;
-
-let rb=0 in {
-  class ArithF2<bits<6> op, bits<11> flags, string instr_asm,
-                InstrItinClass itin> :
-                TA<op, flags, (outs GPR:$dst), (ins GPR:$b),
-                   !strconcat(instr_asm, "   $dst, $b"),
-                   [], itin>;
-
-  class ArithIF<bits<6> op, bits<11> flags, string instr_asm,
-                InstrItinClass itin> :
-                TA<op, flags, (outs GPR:$dst), (ins GPR:$b),
-                   !strconcat(instr_asm, "   $dst, $b"),
-                   [], itin>;
-
-  class ArithFI<bits<6> op, bits<11> flags, string instr_asm,
-                InstrItinClass itin> :
-                TA<op, flags, (outs GPR:$dst), (ins GPR:$b),
-                   !strconcat(instr_asm, "   $dst, $b"),
-                   [], itin>;
-}
-
-//===----------------------------------------------------------------------===//
-// Pseudo instructions
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// FPU Arithmetic Instructions
-//===----------------------------------------------------------------------===//
-let Predicates=[HasFPU] in {
-  def FORI   : LogicFI<0x28, "ori    ">;
-  def FADD   :  ArithF<0x16, 0x000, "fadd   ", fadd, IIC_FPU>;
-  def FRSUB  : ArithFR<0x16, 0x080, "frsub  ", fsub, IIC_FPU>;
-  def FMUL   :  ArithF<0x16, 0x100, "fmul   ", fmul, IIC_FPU>;
-  def FDIV   :  ArithF<0x16, 0x180, "fdiv   ", fdiv, IIC_FPUd>;
-}
-
-let Predicates=[HasFPU], isCodeGenOnly=1 in {
-  def LWF    :   LoadFM<0x32, "lw      ", load>;
-  def LWFI   :  LoadFMI<0x3A, "lwi     ", load>;
-
-  def SWF    :  StoreFM<0x36, "sw      ", store>;
-  def SWFI   : StoreFMI<0x3E, "swi     ", store>;
-}
-
-let Predicates=[HasFPU,HasSqrt] in {
-  def FLT    : ArithIF<0x16, 0x280, "flt    ", IIC_FPUf>;
-  def FINT   : ArithFI<0x16, 0x300, "fint   ", IIC_FPUi>;
-  def FSQRT  : ArithF2<0x16, 0x380, "fsqrt  ", IIC_FPUs>;
-}
-
-let isAsCheapAsAMove = 1 in {
-  def FCMP_UN : CmpFN<0x16, 0x200, "fcmp.un", IIC_FPUc>;
-  def FCMP_LT : CmpFN<0x16, 0x210, "fcmp.lt", IIC_FPUc>;
-  def FCMP_EQ : CmpFN<0x16, 0x220, "fcmp.eq", IIC_FPUc>;
-  def FCMP_LE : CmpFN<0x16, 0x230, "fcmp.le", IIC_FPUc>;
-  def FCMP_GT : CmpFN<0x16, 0x240, "fcmp.gt", IIC_FPUc>;
-  def FCMP_NE : CmpFN<0x16, 0x250, "fcmp.ne", IIC_FPUc>;
-  def FCMP_GE : CmpFN<0x16, 0x260, "fcmp.ge", IIC_FPUc>;
-}
-
-
-let usesCustomInserter = 1 in {
-  def Select_FCC : MBlazePseudo<(outs GPR:$dst),
-    (ins GPR:$T, GPR:$F, GPR:$CMP, i32imm:$CC),
-    "; SELECT_FCC PSEUDO!",
-    []>;
-}
-
-// Floating point conversions
-let Predicates=[HasFPU] in {
-  def : Pat<(sint_to_fp GPR:$V), (FLT GPR:$V)>;
-  def : Pat<(fp_to_sint GPR:$V), (FINT GPR:$V)>;
-  def : Pat<(fsqrt GPR:$V), (FSQRT GPR:$V)>;
-}
-
-// SET_CC operations
-let Predicates=[HasFPU] in {
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETEQ),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_EQ GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETNE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_EQ GPR:$L, GPR:$R), 1)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOEQ),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_EQ GPR:$L, GPR:$R), 2)>;
- def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETONE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (XOR (FCMP_UN GPR:$L, GPR:$R),
-                            (FCMP_EQ GPR:$L, GPR:$R)), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETONE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (OR (FCMP_UN GPR:$L, GPR:$R),
-                           (FCMP_EQ GPR:$L, GPR:$R)), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETGT),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_GT GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETLT),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_LT GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETGE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_GE GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETLE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_LE GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOGT),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_GT GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOLT),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_LT GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOGE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_GE GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETOLE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_LE GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUEQ),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (OR (FCMP_UN GPR:$L, GPR:$R),
-                           (FCMP_EQ GPR:$L, GPR:$R)), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUNE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_NE GPR:$L, GPR:$R), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUGT),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (OR (FCMP_UN GPR:$L, GPR:$R),
-                           (FCMP_GT GPR:$L, GPR:$R)), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETULT),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (OR (FCMP_UN GPR:$L, GPR:$R),
-                           (FCMP_LT GPR:$L, GPR:$R)), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUGE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (OR (FCMP_UN GPR:$L, GPR:$R),
-                           (FCMP_GE GPR:$L, GPR:$R)), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETULE),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (OR (FCMP_UN GPR:$L, GPR:$R),
-                           (FCMP_LE GPR:$L, GPR:$R)), 2)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETO),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_UN GPR:$L, GPR:$R), 1)>;
-  def : Pat<(setcc (f32 GPR:$L), (f32 GPR:$R), SETUO),
-            (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                       (FCMP_UN GPR:$L, GPR:$R), 2)>;
-}
-
-// SELECT operations
-def : Pat<(select (i32 GPR:$C), (f32 GPR:$T), (f32 GPR:$F)),
-          (Select_FCC GPR:$T, GPR:$F, GPR:$C, 2)>;
-
-//===----------------------------------------------------------------------===//
-// Patterns for Floating Point Instructions
-//===----------------------------------------------------------------------===//
-def : Pat<(f32 fpimm:$imm), (FORI (i32 R0), fpimm:$imm)>;
diff --git a/lib/Target/MBlaze/MBlazeInstrFSL.td b/lib/Target/MBlaze/MBlazeInstrFSL.td
deleted file mode 100644
index 91b69de..0000000
--- a/lib/Target/MBlaze/MBlazeInstrFSL.td
+++ /dev/null
@@ -1,229 +0,0 @@
-//===-- MBlazeInstrFSL.td - MBlaze FSL Instruction defs ----*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// FSL Instruction Formats
-//===----------------------------------------------------------------------===//
-class FSLGet<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
-             MBlazeInst<op, FRCX, (outs GPR:$dst), (ins fslimm:$b),
-                        !strconcat(instr_asm, " $dst, $b"),
-                        [(set GPR:$dst, (OpNode immZExt4:$b))],IIC_FSLg>
-{
-    bits<5> rd;
-    bits<4> fslno;
-
-    let Inst{6-10}  = rd;
-    let Inst{11-15} = 0x0;
-    let Inst{16}    = 0x0;
-    let Inst{17-21} = flags; // NCTAE
-    let Inst{22-27} = 0x0;
-    let Inst{28-31} = fslno;
-}
-
-class FSLGetD<bits<6> op, bits<5> flags, string instr_asm, Intrinsic OpNode> :
-              MBlazeInst<op, FRCR, (outs GPR:$dst), (ins GPR:$b),
-                         !strconcat(instr_asm, " $dst, $b"),
-                         [(set GPR:$dst, (OpNode GPR:$b))], IIC_FSLg>
-{
-    bits<5> rd;
-    bits<5> rb;
-
-    let Inst{6-10}  = rd;
-    let Inst{11-15} = 0x0;
-    let Inst{16-20} = rb;
-    let Inst{21}    = 0x0;
-    let Inst{22-26} = flags; // NCTAE
-    let Inst{27-31} = 0x0;
-}
-
-class FSLPut<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
-             MBlazeInst<op, FCRCX, (outs), (ins GPR:$v, fslimm:$b),
-                        !strconcat(instr_asm, " $v, $b"),
-                        [(OpNode GPR:$v, immZExt4:$b)], IIC_FSLp>
-{
-    bits<5> ra;
-    bits<4> fslno;
-
-    let Inst{6-10}  = 0x0;
-    let Inst{11-15} = ra;
-    let Inst{16}    = 0x1;
-    let Inst{17-20} = flags; // NCTA
-    let Inst{21-27} = 0x0;
-    let Inst{28-31} = fslno;
-}
-
-class FSLPutD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
-              MBlazeInst<op, FCRR, (outs), (ins GPR:$v, GPR:$b),
-                         !strconcat(instr_asm, " $v, $b"),
-                         [(OpNode GPR:$v, GPR:$b)], IIC_FSLp>
-{
-    bits<5> ra;
-    bits<5> rb;
-
-    let Inst{6-10}  = 0x0;
-    let Inst{11-15} = ra;
-    let Inst{16-20} = rb;
-    let Inst{21}    = 0x1;
-    let Inst{22-25} = flags; // NCTA
-    let Inst{26-31} = 0x0;
-}
-
-class FSLPutT<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
-              MBlazeInst<op, FCX, (outs), (ins fslimm:$b),
-                         !strconcat(instr_asm, " $b"),
-                         [(OpNode immZExt4:$b)], IIC_FSLp>
-{
-    bits<4> fslno;
-
-    let Inst{6-10}  = 0x0;
-    let Inst{11-15} = 0x0;
-    let Inst{16}    = 0x1;
-    let Inst{17-20} = flags; // NCTA
-    let Inst{21-27} = 0x0;
-    let Inst{28-31} = fslno;
-}
-
-class FSLPutTD<bits<6> op, bits<4> flags, string instr_asm, Intrinsic OpNode> :
-               MBlazeInst<op, FCR, (outs), (ins GPR:$b),
-                          !strconcat(instr_asm, " $b"),
-                          [(OpNode GPR:$b)], IIC_FSLp>
-{
-    bits<5> rb;
-
-    let Inst{6-10}  = 0x0;
-    let Inst{11-15} = 0x0;
-    let Inst{16-20} = rb;
-    let Inst{21}    = 0x1;
-    let Inst{22-25} = flags; // NCTA
-    let Inst{26-31} = 0x0;
-}
-
-//===----------------------------------------------------------------------===//
-// FSL Get Instructions
-//===----------------------------------------------------------------------===//
-def GET      : FSLGet<0x1B, 0x00, "get      ", int_mblaze_fsl_get>;
-def AGET     : FSLGet<0x1B, 0x02, "aget     ", int_mblaze_fsl_aget>;
-def CGET     : FSLGet<0x1B, 0x08, "cget     ", int_mblaze_fsl_cget>;
-def CAGET    : FSLGet<0x1B, 0x0A, "caget    ", int_mblaze_fsl_caget>;
-def EGET     : FSLGet<0x1B, 0x01, "eget     ", int_mblaze_fsl_eget>;
-def EAGET    : FSLGet<0x1B, 0x03, "eaget    ", int_mblaze_fsl_eaget>;
-def ECGET    : FSLGet<0x1B, 0x09, "ecget    ", int_mblaze_fsl_ecget>;
-def ECAGET   : FSLGet<0x1B, 0x0B, "ecaget   ", int_mblaze_fsl_ecaget>;
-def TGET     : FSLGet<0x1B, 0x04, "tget     ", int_mblaze_fsl_tget>;
-def TAGET    : FSLGet<0x1B, 0x06, "taget    ", int_mblaze_fsl_taget>;
-def TCGET    : FSLGet<0x1B, 0x0C, "tcget    ", int_mblaze_fsl_tcget>;
-def TCAGET   : FSLGet<0x1B, 0x0E, "tcaget   ", int_mblaze_fsl_tcaget>;
-def TEGET    : FSLGet<0x1B, 0x05, "teget    ", int_mblaze_fsl_teget>;
-def TEAGET   : FSLGet<0x1B, 0x07, "teaget   ", int_mblaze_fsl_teaget>;
-def TECGET   : FSLGet<0x1B, 0x0D, "tecget   ", int_mblaze_fsl_tecget>;
-def TECAGET  : FSLGet<0x1B, 0x0F, "tecaget  ", int_mblaze_fsl_tecaget>;
-
-let Defs = [CARRY] in {
-  def NGET     : FSLGet<0x1B, 0x10, "nget     ", int_mblaze_fsl_nget>;
-  def NAGET    : FSLGet<0x1B, 0x12, "naget    ", int_mblaze_fsl_naget>;
-  def NCGET    : FSLGet<0x1B, 0x18, "ncget    ", int_mblaze_fsl_ncget>;
-  def NCAGET   : FSLGet<0x1B, 0x1A, "ncaget   ", int_mblaze_fsl_ncaget>;
-  def NEGET    : FSLGet<0x1B, 0x11, "neget    ", int_mblaze_fsl_neget>;
-  def NEAGET   : FSLGet<0x1B, 0x13, "neaget   ", int_mblaze_fsl_neaget>;
-  def NECGET   : FSLGet<0x1B, 0x19, "necget   ", int_mblaze_fsl_necget>;
-  def NECAGET  : FSLGet<0x1B, 0x1B, "necaget  ", int_mblaze_fsl_necaget>;
-  def TNGET    : FSLGet<0x1B, 0x14, "tnget    ", int_mblaze_fsl_tnget>;
-  def TNAGET   : FSLGet<0x1B, 0x16, "tnaget   ", int_mblaze_fsl_tnaget>;
-  def TNCGET   : FSLGet<0x1B, 0x1C, "tncget   ", int_mblaze_fsl_tncget>;
-  def TNCAGET  : FSLGet<0x1B, 0x1E, "tncaget  ", int_mblaze_fsl_tncaget>;
-  def TNEGET   : FSLGet<0x1B, 0x15, "tneget   ", int_mblaze_fsl_tneget>;
-  def TNEAGET  : FSLGet<0x1B, 0x17, "tneaget  ", int_mblaze_fsl_tneaget>;
-  def TNECGET  : FSLGet<0x1B, 0x1D, "tnecget  ", int_mblaze_fsl_tnecget>;
-  def TNECAGET : FSLGet<0x1B, 0x1F, "tnecaget ", int_mblaze_fsl_tnecaget>;
-}
-
-//===----------------------------------------------------------------------===//
-// FSL Dynamic Get Instructions
-//===----------------------------------------------------------------------===//
-def GETD      : FSLGetD<0x13, 0x00, "getd     ", int_mblaze_fsl_get>;
-def AGETD     : FSLGetD<0x13, 0x02, "agetd    ", int_mblaze_fsl_aget>;
-def CGETD     : FSLGetD<0x13, 0x08, "cgetd    ", int_mblaze_fsl_cget>;
-def CAGETD    : FSLGetD<0x13, 0x0A, "cagetd   ", int_mblaze_fsl_caget>;
-def EGETD     : FSLGetD<0x13, 0x01, "egetd    ", int_mblaze_fsl_eget>;
-def EAGETD    : FSLGetD<0x13, 0x03, "eagetd   ", int_mblaze_fsl_eaget>;
-def ECGETD    : FSLGetD<0x13, 0x09, "ecgetd   ", int_mblaze_fsl_ecget>;
-def ECAGETD   : FSLGetD<0x13, 0x0B, "ecagetd  ", int_mblaze_fsl_ecaget>;
-def TGETD     : FSLGetD<0x13, 0x04, "tgetd    ", int_mblaze_fsl_tget>;
-def TAGETD    : FSLGetD<0x13, 0x06, "tagetd   ", int_mblaze_fsl_taget>;
-def TCGETD    : FSLGetD<0x13, 0x0C, "tcgetd   ", int_mblaze_fsl_tcget>;
-def TCAGETD   : FSLGetD<0x13, 0x0E, "tcagetd  ", int_mblaze_fsl_tcaget>;
-def TEGETD    : FSLGetD<0x13, 0x05, "tegetd   ", int_mblaze_fsl_teget>;
-def TEAGETD   : FSLGetD<0x13, 0x07, "teagetd  ", int_mblaze_fsl_teaget>;
-def TECGETD   : FSLGetD<0x13, 0x0D, "tecgetd  ", int_mblaze_fsl_tecget>;
-def TECAGETD  : FSLGetD<0x13, 0x0F, "tecagetd ", int_mblaze_fsl_tecaget>;
-
-let Defs = [CARRY] in {
-  def NGETD     : FSLGetD<0x13, 0x10, "ngetd    ", int_mblaze_fsl_nget>;
-  def NAGETD    : FSLGetD<0x13, 0x12, "nagetd   ", int_mblaze_fsl_naget>;
-  def NCGETD    : FSLGetD<0x13, 0x18, "ncgetd   ", int_mblaze_fsl_ncget>;
-  def NCAGETD   : FSLGetD<0x13, 0x1A, "ncagetd  ", int_mblaze_fsl_ncaget>;
-  def NEGETD    : FSLGetD<0x13, 0x11, "negetd   ", int_mblaze_fsl_neget>;
-  def NEAGETD   : FSLGetD<0x13, 0x13, "neagetd  ", int_mblaze_fsl_neaget>;
-  def NECGETD   : FSLGetD<0x13, 0x19, "necgetd  ", int_mblaze_fsl_necget>;
-  def NECAGETD  : FSLGetD<0x13, 0x1B, "necagetd ", int_mblaze_fsl_necaget>;
-  def TNGETD    : FSLGetD<0x13, 0x14, "tngetd   ", int_mblaze_fsl_tnget>;
-  def TNAGETD   : FSLGetD<0x13, 0x16, "tnagetd  ", int_mblaze_fsl_tnaget>;
-  def TNCGETD   : FSLGetD<0x13, 0x1C, "tncgetd  ", int_mblaze_fsl_tncget>;
-  def TNCAGETD  : FSLGetD<0x13, 0x1E, "tncagetd ", int_mblaze_fsl_tncaget>;
-  def TNEGETD   : FSLGetD<0x13, 0x15, "tnegetd  ", int_mblaze_fsl_tneget>;
-  def TNEAGETD  : FSLGetD<0x13, 0x17, "tneagetd ", int_mblaze_fsl_tneaget>;
-  def TNECGETD  : FSLGetD<0x13, 0x1D, "tnecgetd ", int_mblaze_fsl_tnecget>;
-  def TNECAGETD : FSLGetD<0x13, 0x1F, "tnecagetd", int_mblaze_fsl_tnecaget>;
-}
-
-//===----------------------------------------------------------------------===//
-// FSL Put Instructions
-//===----------------------------------------------------------------------===//
-def PUT     :  FSLPut<0x1B, 0x0, "put      ", int_mblaze_fsl_put>;
-def APUT    :  FSLPut<0x1B, 0x1, "aput     ", int_mblaze_fsl_aput>;
-def CPUT    :  FSLPut<0x1B, 0x4, "cput     ", int_mblaze_fsl_cput>;
-def CAPUT   :  FSLPut<0x1B, 0x5, "caput    ", int_mblaze_fsl_caput>;
-def TPUT    : FSLPutT<0x1B, 0x2, "tput     ", int_mblaze_fsl_tput>;
-def TAPUT   : FSLPutT<0x1B, 0x3, "taput    ", int_mblaze_fsl_taput>;
-def TCPUT   : FSLPutT<0x1B, 0x6, "tcput    ", int_mblaze_fsl_tcput>;
-def TCAPUT  : FSLPutT<0x1B, 0x7, "tcaput   ", int_mblaze_fsl_tcaput>;
-
-let Defs = [CARRY] in {
-  def NPUT    :  FSLPut<0x1B, 0x8, "nput     ", int_mblaze_fsl_nput>;
-  def NAPUT   :  FSLPut<0x1B, 0x9, "naput    ", int_mblaze_fsl_naput>;
-  def NCPUT   :  FSLPut<0x1B, 0xC, "ncput    ", int_mblaze_fsl_ncput>;
-  def NCAPUT  :  FSLPut<0x1B, 0xD, "ncaput   ", int_mblaze_fsl_ncaput>;
-  def TNPUT   : FSLPutT<0x1B, 0xA, "tnput    ", int_mblaze_fsl_tnput>;
-  def TNAPUT  : FSLPutT<0x1B, 0xB, "tnaput   ", int_mblaze_fsl_tnaput>;
-  def TNCPUT  : FSLPutT<0x1B, 0xE, "tncput   ", int_mblaze_fsl_tncput>;
-  def TNCAPUT : FSLPutT<0x1B, 0xF, "tncaput  ", int_mblaze_fsl_tncaput>;
-}
-
-//===----------------------------------------------------------------------===//
-// FSL Dynamic Put Instructions
-//===----------------------------------------------------------------------===//
-def PUTD     :  FSLPutD<0x13, 0x0, "putd     ", int_mblaze_fsl_put>;
-def APUTD    :  FSLPutD<0x13, 0x1, "aputd    ", int_mblaze_fsl_aput>;
-def CPUTD    :  FSLPutD<0x13, 0x4, "cputd    ", int_mblaze_fsl_cput>;
-def CAPUTD   :  FSLPutD<0x13, 0x5, "caputd   ", int_mblaze_fsl_caput>;
-def TPUTD    : FSLPutTD<0x13, 0x2, "tputd    ", int_mblaze_fsl_tput>;
-def TAPUTD   : FSLPutTD<0x13, 0x3, "taputd   ", int_mblaze_fsl_taput>;
-def TCPUTD   : FSLPutTD<0x13, 0x6, "tcputd   ", int_mblaze_fsl_tcput>;
-def TCAPUTD  : FSLPutTD<0x13, 0x7, "tcaputd  ", int_mblaze_fsl_tcaput>;
-
-let Defs = [CARRY] in {
-  def NPUTD    :  FSLPutD<0x13, 0x8, "nputd    ", int_mblaze_fsl_nput>;
-  def NAPUTD   :  FSLPutD<0x13, 0x9, "naputd   ", int_mblaze_fsl_naput>;
-  def NCPUTD   :  FSLPutD<0x13, 0xC, "ncputd   ", int_mblaze_fsl_ncput>;
-  def NCAPUTD  :  FSLPutD<0x13, 0xD, "ncaputd  ", int_mblaze_fsl_ncaput>;
-  def TNPUTD   : FSLPutTD<0x13, 0xA, "tnputd   ", int_mblaze_fsl_tnput>;
-  def TNAPUTD  : FSLPutTD<0x13, 0xB, "tnaputd  ", int_mblaze_fsl_tnaput>;
-  def TNCPUTD  : FSLPutTD<0x13, 0xE, "tncputd  ", int_mblaze_fsl_tncput>;
-  def TNCAPUTD : FSLPutTD<0x13, 0xF, "tncaputd ", int_mblaze_fsl_tncaput>;
-}
diff --git a/lib/Target/MBlaze/MBlazeInstrFormats.td b/lib/Target/MBlaze/MBlazeInstrFormats.td
deleted file mode 100644
index e40432a..0000000
--- a/lib/Target/MBlaze/MBlazeInstrFormats.td
+++ /dev/null
@@ -1,228 +0,0 @@
-//===-- MBlazeInstrFormats.td - MB Instruction defs --------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-// Format specifies the encoding used by the instruction.  This is part of the
-// ad-hoc solution used to emit machine instruction encodings by our machine
-// code emitter.
-class Format<bits<6> val> {
-      bits<6> Value = val;
-}
-
-def FPseudo : Format<0>;
-def FRRR    : Format<1>;  // ADD, OR, etc.
-def FRRI    : Format<2>;  // ADDI, ORI, etc.
-def FCRR    : Format<3>;  // PUTD, WDC, WIC, BEQ, BNE, BGE, etc.
-def FCRI    : Format<4>;  // RTID, RTED, RTSD, BEQI, BNEI, BGEI, etc.
-def FRCR    : Format<5>;  // BRLD, BRALD, GETD
-def FRCI    : Format<6>;  // BRLID, BRALID, MSRCLR, MSRSET
-def FCCR    : Format<7>;  // BR, BRA, BRD, etc.
-def FCCI    : Format<8>;  // IMM, BRI, BRAI, BRID, etc.
-def FRRCI   : Format<9>;  // BSRLI, BSRAI, BSLLI
-def FRRC    : Format<10>; // SEXT8, SEXT16, SRA, SRC, SRL, FLT, FINT, FSQRT
-def FRCX    : Format<11>; // GET
-def FRCS    : Format<12>; // MFS
-def FCRCS   : Format<13>; // MTS
-def FCRCX   : Format<14>; // PUT
-def FCX     : Format<15>; // TPUT
-def FCR     : Format<16>; // TPUTD
-def FRIR    : Format<17>; // RSUBI
-def FRRRR   : Format<18>; // RSUB, FRSUB
-def FRI     : Format<19>; // RSUB, FRSUB
-def FC      : Format<20>; // NOP
-def FRR     : Format<21>; // CLZ
-
-//===----------------------------------------------------------------------===//
-//  Describe MBlaze instructions format
-//
-//  CPU INSTRUCTION FORMATS
-//
-//  opcode  - operation code.
-//  rd      - dst reg.
-//  ra      - first src. reg.
-//  rb      - second src. reg.
-//  imm16   - 16-bit immediate value.
-//
-//===----------------------------------------------------------------------===//
-
-// Generic MBlaze Format
-class MBlazeInst<bits<6> op, Format form, dag outs, dag ins, string asmstr,
-                 list<dag> pattern, InstrItinClass itin> : Instruction {
-  let Namespace = "MBlaze";
-  field bits<32> Inst;
-
-  bits<6> opcode = op;
-  Format Form = form;
-  bits<6> FormBits = Form.Value;
-
-  // Top 6 bits are the 'opcode' field
-  let Inst{0-5} = opcode;
-
-  // If the instruction is marked as a pseudo, set isCodeGenOnly so that the
-  // assembler and disassmbler ignore it.
-  let isCodeGenOnly = !eq(!cast<string>(form), "FPseudo");
-
-  dag OutOperandList = outs;
-  dag InOperandList  = ins;
-
-  let AsmString   = asmstr;
-  let Pattern     = pattern;
-  let Itinerary   = itin;
-
-  // TSFlags layout should be kept in sync with MBlazeInstrInfo.h.
-  let TSFlags{5-0}   = FormBits;
-}
-
-//===----------------------------------------------------------------------===//
-// Pseudo instruction class
-//===----------------------------------------------------------------------===//
-class MBlazePseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
-      MBlazeInst<0x0, FPseudo, outs, ins, asmstr, pattern, IIC_Pseudo>;
-
-//===----------------------------------------------------------------------===//
-// Type A instruction class in MBlaze : <|opcode|rd|ra|rb|flags|>
-//===----------------------------------------------------------------------===//
-
-class TA<bits<6> op, bits<11> flags, dag outs, dag ins, string asmstr,
-         list<dag> pattern, InstrItinClass itin> :
-         MBlazeInst<op,FRRR,outs, ins, asmstr, pattern, itin>
-{
-  bits<5> rd;
-  bits<5> ra;
-  bits<5> rb;
-
-  let Inst{6-10}  = rd;
-  let Inst{11-15} = ra;
-  let Inst{16-20} = rb;
-  let Inst{21-31} = flags;
-}
-
-//===----------------------------------------------------------------------===//
-// Type B instruction class in MBlaze : <|opcode|rd|ra|immediate|>
-//===----------------------------------------------------------------------===//
-
-class TB<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
-         InstrItinClass itin> :
-         MBlazeInst<op, FRRI, outs, ins, asmstr, pattern, itin>
-{
-  bits<5>  rd;
-  bits<5>  ra;
-  bits<16> imm16;
-
-  let Inst{6-10}  = rd;
-  let Inst{11-15} = ra;
-  let Inst{16-31} = imm16;
-}
-
-//===----------------------------------------------------------------------===//
-// Type A instruction class in MBlaze but with the operands reversed
-// in the LLVM DAG : <|opcode|rd|ra|rb|flags|>
-//===----------------------------------------------------------------------===//
-
-class TAR<bits<6> op, bits<11> flags, dag outs, dag ins, string asmstr,
-          list<dag> pattern, InstrItinClass itin> :
-          TA<op, flags, outs, ins, asmstr, pattern, itin>
-{
-  bits<5> rrd;
-  bits<5> rrb;
-  bits<5> rra;
-
-  let Form = FRRRR;
-
-  let rd = rrd;
-  let ra = rra;
-  let rb = rrb;
-}
-
-//===----------------------------------------------------------------------===//
-// Type B instruction class in MBlaze but with the operands reversed in
-// the LLVM DAG : <|opcode|rd|ra|immediate|>
-//===----------------------------------------------------------------------===//
-class TBR<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
-         InstrItinClass itin> :
-         TB<op, outs, ins, asmstr, pattern, itin> {
-  bits<5>  rrd;
-  bits<16> rimm16;
-  bits<5>  rra;
-
-  let Form = FRIR;
-
-  let rd = rrd;
-  let ra = rra;
-  let imm16 = rimm16;
-}
-
-//===----------------------------------------------------------------------===//
-// Shift immediate instruction class in MBlaze : <|opcode|rd|ra|immediate|>
-//===----------------------------------------------------------------------===//
-class SHT<bits<6> op, bits<2> flags, dag outs, dag ins, string asmstr,
-          list<dag> pattern, InstrItinClass itin> :
-          MBlazeInst<op, FRRI, outs, ins, asmstr, pattern, itin> {
-  bits<5>  rd;
-  bits<5>  ra;
-  bits<5>  imm5;
-
-  let Inst{6-10}  = rd;
-  let Inst{11-15} = ra;
-  let Inst{16-20} = 0x0;
-  let Inst{21-22} = flags;
-  let Inst{23-26} = 0x0;
-  let Inst{27-31} = imm5;
-}
-
-//===----------------------------------------------------------------------===//
-// Special instruction class in MBlaze : <|opcode|rd|imm14|>
-//===----------------------------------------------------------------------===//
-class SPC<bits<6> op, bits<2> flags, dag outs, dag ins, string asmstr,
-          list<dag> pattern, InstrItinClass itin> :
-          MBlazeInst<op, FRI, outs, ins, asmstr, pattern, itin> {
-  bits<5>  rd;
-  bits<14> imm14;
-
-  let Inst{6-10}  = rd;
-  let Inst{11-15} = 0x0;
-  let Inst{16-17} = flags;
-  let Inst{18-31} = imm14;
-}
-
-//===----------------------------------------------------------------------===//
-// MSR instruction class in MBlaze : <|opcode|rd|imm15|>
-//===----------------------------------------------------------------------===//
-class MSR<bits<6> op, bits<6> flags, dag outs, dag ins, string asmstr,
-          list<dag> pattern, InstrItinClass itin> :
-          MBlazeInst<op, FRI, outs, ins, asmstr, pattern, itin> {
-  bits<5>  rd;
-  bits<15> imm15;
-
-  let Inst{6-10}  = rd;
-  let Inst{11-16} = flags;
-  let Inst{17-31} = imm15;
-}
-
-//===----------------------------------------------------------------------===//
-// TCLZ instruction class in MBlaze : <|opcode|rd|imm15|>
-//===----------------------------------------------------------------------===//
-class TCLZ<bits<6> op, bits<16> flags, dag outs, dag ins, string asmstr,
-           list<dag> pattern, InstrItinClass itin> :
-           MBlazeInst<op, FRR, outs, ins, asmstr, pattern, itin> {
-  bits<5>  rd;
-  bits<5>  ra;
-
-  let Inst{6-10}  = rd;
-  let Inst{11-15}  = ra;
-  let Inst{16-31}  = flags;
-}
-
-//===----------------------------------------------------------------------===//
-// MBAR instruction class in MBlaze : <|opcode|rd|imm15|>
-//===----------------------------------------------------------------------===//
-class MBAR<bits<6> op, bits<26> flags, dag outs, dag ins, string asmstr,
-           list<dag> pattern, InstrItinClass itin> :
-           MBlazeInst<op, FC, outs, ins, asmstr, pattern, itin> {
-  let Inst{6-31}  = flags;
-}
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.cpp b/lib/Target/MBlaze/MBlazeInstrInfo.cpp
deleted file mode 100644
index 79449f7..0000000
--- a/lib/Target/MBlaze/MBlazeInstrInfo.cpp
+++ /dev/null
@@ -1,297 +0,0 @@
-//===-- MBlazeInstrInfo.cpp - MBlaze Instruction Information --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the MBlaze implementation of the TargetInstrInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeInstrInfo.h"
-#include "MBlazeMachineFunction.h"
-#include "MBlazeTargetMachine.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_INSTRINFO_CTOR
-#include "MBlazeGenInstrInfo.inc"
-
-using namespace llvm;
-
-MBlazeInstrInfo::MBlazeInstrInfo(MBlazeTargetMachine &tm)
-  : MBlazeGenInstrInfo(MBlaze::ADJCALLSTACKDOWN, MBlaze::ADJCALLSTACKUP),
-    TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
-
-static bool isZeroImm(const MachineOperand &op) {
-  return op.isImm() && op.getImm() == 0;
-}
-
-/// isLoadFromStackSlot - If the specified machine instruction is a direct
-/// load from a stack slot, return the virtual or physical register number of
-/// the destination along with the FrameIndex of the loaded stack slot.  If
-/// not, return 0.  This predicate must return 0 if the instruction has
-/// any side effects other than loading from the stack slot.
-unsigned MBlazeInstrInfo::
-isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const {
-  if (MI->getOpcode() == MBlaze::LWI) {
-    if ((MI->getOperand(1).isFI()) && // is a stack slot
-        (MI->getOperand(2).isImm()) &&  // the imm is zero
-        (isZeroImm(MI->getOperand(2)))) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-  }
-
-  return 0;
-}
-
-/// isStoreToStackSlot - If the specified machine instruction is a direct
-/// store to a stack slot, return the virtual or physical register number of
-/// the source reg along with the FrameIndex of the loaded stack slot.  If
-/// not, return 0.  This predicate must return 0 if the instruction has
-/// any side effects other than storing to the stack slot.
-unsigned MBlazeInstrInfo::
-isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const {
-  if (MI->getOpcode() == MBlaze::SWI) {
-    if ((MI->getOperand(1).isFI()) && // is a stack slot
-        (MI->getOperand(2).isImm()) &&  // the imm is zero
-        (isZeroImm(MI->getOperand(2)))) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-  }
-  return 0;
-}
-
-/// insertNoop - If data hazard condition is found insert the target nop
-/// instruction.
-void MBlazeInstrInfo::
-insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const {
-  DebugLoc DL;
-  BuildMI(MBB, MI, DL, get(MBlaze::NOP));
-}
-
-void MBlazeInstrInfo::
-copyPhysReg(MachineBasicBlock &MBB,
-            MachineBasicBlock::iterator I, DebugLoc DL,
-            unsigned DestReg, unsigned SrcReg,
-            bool KillSrc) const {
-  llvm::BuildMI(MBB, I, DL, get(MBlaze::ADDK), DestReg)
-    .addReg(SrcReg, getKillRegState(KillSrc)).addReg(MBlaze::R0);
-}
-
-void MBlazeInstrInfo::
-storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                    unsigned SrcReg, bool isKill, int FI,
-                    const TargetRegisterClass *RC,
-                    const TargetRegisterInfo *TRI) const {
-  DebugLoc DL;
-  BuildMI(MBB, I, DL, get(MBlaze::SWI)).addReg(SrcReg,getKillRegState(isKill))
-    .addFrameIndex(FI).addImm(0); //.addFrameIndex(FI);
-}
-
-void MBlazeInstrInfo::
-loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                     unsigned DestReg, int FI,
-                     const TargetRegisterClass *RC,
-                     const TargetRegisterInfo *TRI) const {
-  DebugLoc DL;
-  BuildMI(MBB, I, DL, get(MBlaze::LWI), DestReg)
-      .addFrameIndex(FI).addImm(0); //.addFrameIndex(FI);
-}
-
-//===----------------------------------------------------------------------===//
-// Branch Analysis
-//===----------------------------------------------------------------------===//
-bool MBlazeInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
-                                    MachineBasicBlock *&TBB,
-                                    MachineBasicBlock *&FBB,
-                                    SmallVectorImpl<MachineOperand> &Cond,
-                                    bool AllowModify) const {
-  // If the block has no terminators, it just falls into the block after it.
-  MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin())
-    return false;
-  --I;
-  while (I->isDebugValue()) {
-    if (I == MBB.begin())
-      return false;
-    --I;
-  }
-  if (!isUnpredicatedTerminator(I))
-    return false;
-
-  // Get the last instruction in the block.
-  MachineInstr *LastInst = I;
-
-  // If there is only one terminator instruction, process it.
-  unsigned LastOpc = LastInst->getOpcode();
-  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-    if (MBlaze::isUncondBranchOpcode(LastOpc)) {
-      TBB = LastInst->getOperand(0).getMBB();
-      return false;
-    }
-    if (MBlaze::isCondBranchOpcode(LastOpc)) {
-      // Block ends with fall-through condbranch.
-      TBB = LastInst->getOperand(1).getMBB();
-      Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
-      Cond.push_back(LastInst->getOperand(0));
-      return false;
-    }
-    // Otherwise, don't know what this is.
-    return true;
-  }
-
-  // Get the instruction before it if it's a terminator.
-  MachineInstr *SecondLastInst = I;
-
-  // If there are three terminators, we don't know what sort of block this is.
-  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
-    return true;
-
-  // If the block ends with something like BEQID then BRID, handle it.
-  if (MBlaze::isCondBranchOpcode(SecondLastInst->getOpcode()) &&
-      MBlaze::isUncondBranchOpcode(LastInst->getOpcode())) {
-    TBB = SecondLastInst->getOperand(1).getMBB();
-    Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
-    Cond.push_back(SecondLastInst->getOperand(0));
-    FBB = LastInst->getOperand(0).getMBB();
-    return false;
-  }
-
-  // If the block ends with two unconditional branches, handle it.
-  // The second one is not executed, so remove it.
-  if (MBlaze::isUncondBranchOpcode(SecondLastInst->getOpcode()) &&
-      MBlaze::isUncondBranchOpcode(LastInst->getOpcode())) {
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    I = LastInst;
-    if (AllowModify)
-      I->eraseFromParent();
-    return false;
-  }
-
-  // Otherwise, can't handle this.
-  return true;
-}
-
-unsigned MBlazeInstrInfo::
-InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-             MachineBasicBlock *FBB,
-             const SmallVectorImpl<MachineOperand> &Cond,
-             DebugLoc DL) const {
-  // Shouldn't be a fall through.
-  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
-  assert((Cond.size() == 2 || Cond.size() == 0) &&
-         "MBlaze branch conditions have two components!");
-
-  unsigned Opc = MBlaze::BRID;
-  if (!Cond.empty())
-    Opc = (unsigned)Cond[0].getImm();
-
-  if (FBB == 0) {
-    if (Cond.empty()) // Unconditional branch
-      BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
-    else              // Conditional branch
-      BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()).addMBB(TBB);
-    return 1;
-  }
-
-  BuildMI(&MBB, DL, get(Opc)).addReg(Cond[1].getReg()).addMBB(TBB);
-  BuildMI(&MBB, DL, get(MBlaze::BRID)).addMBB(FBB);
-  return 2;
-}
-
-unsigned MBlazeInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin()) return 0;
-  --I;
-  while (I->isDebugValue()) {
-    if (I == MBB.begin())
-      return 0;
-    --I;
-  }
-
-  if (!MBlaze::isUncondBranchOpcode(I->getOpcode()) &&
-      !MBlaze::isCondBranchOpcode(I->getOpcode()))
-    return 0;
-
-  // Remove the branch.
-  I->eraseFromParent();
-
-  I = MBB.end();
-
-  if (I == MBB.begin()) return 1;
-  --I;
-  if (!MBlaze::isCondBranchOpcode(I->getOpcode()))
-    return 1;
-
-  // Remove the branch.
-  I->eraseFromParent();
-  return 2;
-}
-
-bool MBlazeInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand>
-                                               &Cond) const {
-  assert(Cond.size() == 2 && "Invalid MBlaze branch opcode!");
-  switch (Cond[0].getImm()) {
-  default:            return true;
-  case MBlaze::BEQ:   Cond[0].setImm(MBlaze::BNE); return false;
-  case MBlaze::BNE:   Cond[0].setImm(MBlaze::BEQ); return false;
-  case MBlaze::BGT:   Cond[0].setImm(MBlaze::BLE); return false;
-  case MBlaze::BGE:   Cond[0].setImm(MBlaze::BLT); return false;
-  case MBlaze::BLT:   Cond[0].setImm(MBlaze::BGE); return false;
-  case MBlaze::BLE:   Cond[0].setImm(MBlaze::BGT); return false;
-  case MBlaze::BEQI:  Cond[0].setImm(MBlaze::BNEI); return false;
-  case MBlaze::BNEI:  Cond[0].setImm(MBlaze::BEQI); return false;
-  case MBlaze::BGTI:  Cond[0].setImm(MBlaze::BLEI); return false;
-  case MBlaze::BGEI:  Cond[0].setImm(MBlaze::BLTI); return false;
-  case MBlaze::BLTI:  Cond[0].setImm(MBlaze::BGEI); return false;
-  case MBlaze::BLEI:  Cond[0].setImm(MBlaze::BGTI); return false;
-  case MBlaze::BEQD:  Cond[0].setImm(MBlaze::BNED); return false;
-  case MBlaze::BNED:  Cond[0].setImm(MBlaze::BEQD); return false;
-  case MBlaze::BGTD:  Cond[0].setImm(MBlaze::BLED); return false;
-  case MBlaze::BGED:  Cond[0].setImm(MBlaze::BLTD); return false;
-  case MBlaze::BLTD:  Cond[0].setImm(MBlaze::BGED); return false;
-  case MBlaze::BLED:  Cond[0].setImm(MBlaze::BGTD); return false;
-  case MBlaze::BEQID: Cond[0].setImm(MBlaze::BNEID); return false;
-  case MBlaze::BNEID: Cond[0].setImm(MBlaze::BEQID); return false;
-  case MBlaze::BGTID: Cond[0].setImm(MBlaze::BLEID); return false;
-  case MBlaze::BGEID: Cond[0].setImm(MBlaze::BLTID); return false;
-  case MBlaze::BLTID: Cond[0].setImm(MBlaze::BGEID); return false;
-  case MBlaze::BLEID: Cond[0].setImm(MBlaze::BGTID); return false;
-  }
-}
-
-/// getGlobalBaseReg - Return a virtual register initialized with the
-/// the global base register value. Output instructions required to
-/// initialize the register in the function entry block, if necessary.
-///
-unsigned MBlazeInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
-  MBlazeFunctionInfo *MBlazeFI = MF->getInfo<MBlazeFunctionInfo>();
-  unsigned GlobalBaseReg = MBlazeFI->getGlobalBaseReg();
-  if (GlobalBaseReg != 0)
-    return GlobalBaseReg;
-
-  // Insert the set of GlobalBaseReg into the first MBB of the function
-  MachineBasicBlock &FirstMBB = MF->front();
-  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
-
-  GlobalBaseReg = RegInfo.createVirtualRegister(&MBlaze::GPRRegClass);
-  BuildMI(FirstMBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),
-          GlobalBaseReg).addReg(MBlaze::R20);
-  RegInfo.addLiveIn(MBlaze::R20);
-
-  MBlazeFI->setGlobalBaseReg(GlobalBaseReg);
-  return GlobalBaseReg;
-}
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.h b/lib/Target/MBlaze/MBlazeInstrInfo.h
deleted file mode 100644
index 5252147..0000000
--- a/lib/Target/MBlaze/MBlazeInstrInfo.h
+++ /dev/null
@@ -1,240 +0,0 @@
-//===-- MBlazeInstrInfo.h - MBlaze Instruction Information ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the MBlaze implementation of the TargetInstrInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZEINSTRUCTIONINFO_H
-#define MBLAZEINSTRUCTIONINFO_H
-
-#include "MBlaze.h"
-#include "MBlazeRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-#define GET_INSTRINFO_HEADER
-#include "MBlazeGenInstrInfo.inc"
-
-namespace llvm {
-
-namespace MBlaze {
-
-  // MBlaze Branch Codes
-  enum FPBranchCode {
-    BRANCH_F,
-    BRANCH_T,
-    BRANCH_FL,
-    BRANCH_TL,
-    BRANCH_INVALID
-  };
-
-  // MBlaze Condition Codes
-  enum CondCode {
-    // To be used with float branch True
-    FCOND_F,
-    FCOND_UN,
-    FCOND_EQ,
-    FCOND_UEQ,
-    FCOND_OLT,
-    FCOND_ULT,
-    FCOND_OLE,
-    FCOND_ULE,
-    FCOND_SF,
-    FCOND_NGLE,
-    FCOND_SEQ,
-    FCOND_NGL,
-    FCOND_LT,
-    FCOND_NGE,
-    FCOND_LE,
-    FCOND_NGT,
-
-    // To be used with float branch False
-    // This conditions have the same mnemonic as the
-    // above ones, but are used with a branch False;
-    FCOND_T,
-    FCOND_OR,
-    FCOND_NEQ,
-    FCOND_OGL,
-    FCOND_UGE,
-    FCOND_OGE,
-    FCOND_UGT,
-    FCOND_OGT,
-    FCOND_ST,
-    FCOND_GLE,
-    FCOND_SNE,
-    FCOND_GL,
-    FCOND_NLT,
-    FCOND_GE,
-    FCOND_NLE,
-    FCOND_GT,
-
-    // Only integer conditions
-    COND_EQ,
-    COND_GT,
-    COND_GE,
-    COND_LT,
-    COND_LE,
-    COND_NE,
-    COND_INVALID
-  };
-
-  // Turn condition code into conditional branch opcode.
-  inline static unsigned GetCondBranchFromCond(CondCode CC) {
-    switch (CC) {
-    default: llvm_unreachable("Unknown condition code");
-    case COND_EQ: return MBlaze::BEQID;
-    case COND_NE: return MBlaze::BNEID;
-    case COND_GT: return MBlaze::BGTID;
-    case COND_GE: return MBlaze::BGEID;
-    case COND_LT: return MBlaze::BLTID;
-    case COND_LE: return MBlaze::BLEID;
-    }
-  }
-
-  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
-  /// e.g. turning COND_E to COND_NE.
-  // CondCode GetOppositeBranchCondition(MBlaze::CondCode CC);
-
-  /// MBlazeCCToString - Map each FP condition code to its string
-  inline static const char *MBlazeFCCToString(MBlaze::CondCode CC) {
-    switch (CC) {
-    default: llvm_unreachable("Unknown condition code");
-    case FCOND_F:
-    case FCOND_T:   return "f";
-    case FCOND_UN:
-    case FCOND_OR:  return "un";
-    case FCOND_EQ:
-    case FCOND_NEQ: return "eq";
-    case FCOND_UEQ:
-    case FCOND_OGL: return "ueq";
-    case FCOND_OLT:
-    case FCOND_UGE: return "olt";
-    case FCOND_ULT:
-    case FCOND_OGE: return "ult";
-    case FCOND_OLE:
-    case FCOND_UGT: return "ole";
-    case FCOND_ULE:
-    case FCOND_OGT: return "ule";
-    case FCOND_SF:
-    case FCOND_ST:  return "sf";
-    case FCOND_NGLE:
-    case FCOND_GLE: return "ngle";
-    case FCOND_SEQ:
-    case FCOND_SNE: return "seq";
-    case FCOND_NGL:
-    case FCOND_GL:  return "ngl";
-    case FCOND_LT:
-    case FCOND_NLT: return "lt";
-    case FCOND_NGE:
-    case FCOND_GE:  return "ge";
-    case FCOND_LE:
-    case FCOND_NLE: return "nle";
-    case FCOND_NGT:
-    case FCOND_GT:  return "gt";
-    }
-  }
-
-  inline static bool isUncondBranchOpcode(int Opc) {
-    switch (Opc) {
-    default: return false;
-    case MBlaze::BRI:
-    case MBlaze::BRAI:
-    case MBlaze::BRID:
-    case MBlaze::BRAID:
-      return true;
-    }
-  }
-
-  inline static bool isCondBranchOpcode(int Opc) {
-    switch (Opc) {
-    default: return false;
-    case MBlaze::BEQI: case MBlaze::BEQID:
-    case MBlaze::BNEI: case MBlaze::BNEID:
-    case MBlaze::BGTI: case MBlaze::BGTID:
-    case MBlaze::BGEI: case MBlaze::BGEID:
-    case MBlaze::BLTI: case MBlaze::BLTID:
-    case MBlaze::BLEI: case MBlaze::BLEID:
-      return true;
-    }
-  }
-}
-
-class MBlazeInstrInfo : public MBlazeGenInstrInfo {
-  MBlazeTargetMachine &TM;
-  const MBlazeRegisterInfo RI;
-public:
-  explicit MBlazeInstrInfo(MBlazeTargetMachine &TM);
-
-  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
-  /// such, whenever a client has an instance of instruction info, it should
-  /// always be able to get register info as well (through this method).
-  ///
-  virtual const MBlazeRegisterInfo &getRegisterInfo() const { return RI; }
-
-  /// isLoadFromStackSlot - If the specified machine instruction is a direct
-  /// load from a stack slot, return the virtual or physical register number of
-  /// the destination along with the FrameIndex of the loaded stack slot.  If
-  /// not, return 0.  This predicate must return 0 if the instruction has
-  /// any side effects other than loading from the stack slot.
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
-
-  /// isStoreToStackSlot - If the specified machine instruction is a direct
-  /// store to a stack slot, return the virtual or physical register number of
-  /// the source reg along with the FrameIndex of the loaded stack slot.  If
-  /// not, return 0.  This predicate must return 0 if the instruction has
-  /// any side effects other than storing to the stack slot.
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
-
-  /// Branch Analysis
-  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                             MachineBasicBlock *&FBB,
-                             SmallVectorImpl<MachineOperand> &Cond,
-                             bool AllowModify) const;
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond,
-                                DebugLoc DL) const;
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-
-  virtual bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
-    const;
-
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator I, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
-
-  /// Insert nop instruction when hazard condition is found
-  virtual void insertNoop(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI) const;
-
-  /// getGlobalBaseReg - Return a virtual register initialized with the
-  /// the global base register value. Output instructions required to
-  /// initialize the register in the function entry block, if necessary.
-  ///
-  unsigned getGlobalBaseReg(MachineFunction *MF) const;
-};
-
-}
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeInstrInfo.td b/lib/Target/MBlaze/MBlazeInstrInfo.td
deleted file mode 100644
index d27cd39..0000000
--- a/lib/Target/MBlaze/MBlazeInstrInfo.td
+++ /dev/null
@@ -1,1051 +0,0 @@
-//===-- MBlazeInstrInfo.td - MBlaze Instruction defs -------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Instruction format superclass
-//===----------------------------------------------------------------------===//
-include "MBlazeInstrFormats.td"
-
-//===----------------------------------------------------------------------===//
-// MBlaze type profiles
-//===----------------------------------------------------------------------===//
-
-// def SDTMBlazeSelectCC : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>]>;
-def SDT_MBlazeRet     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDT_MBlazeIRet    : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
-def SDT_MBlazeJmpLink : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
-def SDT_MBCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
-def SDT_MBCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-
-//===----------------------------------------------------------------------===//
-// MBlaze specific nodes
-//===----------------------------------------------------------------------===//
-
-def MBlazeRet     : SDNode<"MBlazeISD::Ret", SDT_MBlazeRet,
-                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def MBlazeIRet    : SDNode<"MBlazeISD::IRet", SDT_MBlazeIRet,
-                           [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
-def MBlazeJmpLink : SDNode<"MBlazeISD::JmpLink",SDT_MBlazeJmpLink,
-                           [SDNPHasChain,SDNPOptInGlue,SDNPOutGlue,
-                            SDNPVariadic]>;
-
-def MBWrapper   : SDNode<"MBlazeISD::Wrap", SDTIntUnaryOp>;
-
-def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MBCallSeqStart,
-                           [SDNPHasChain, SDNPOutGlue]>;
-
-def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MBCallSeqEnd,
-                           [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-
-//===----------------------------------------------------------------------===//
-// MBlaze Instruction Predicate Definitions.
-//===----------------------------------------------------------------------===//
-// def HasPipe3     : Predicate<"Subtarget.hasPipe3()">;
-def HasBarrel    : Predicate<"Subtarget.hasBarrel()">;
-// def NoBarrel     : Predicate<"!Subtarget.hasBarrel()">;
-def HasDiv       : Predicate<"Subtarget.hasDiv()">;
-def HasMul       : Predicate<"Subtarget.hasMul()">;
-// def HasFSL       : Predicate<"Subtarget.hasFSL()">;
-// def HasEFSL      : Predicate<"Subtarget.hasEFSL()">;
-// def HasMSRSet    : Predicate<"Subtarget.hasMSRSet()">;
-// def HasException : Predicate<"Subtarget.hasException()">;
-def HasPatCmp    : Predicate<"Subtarget.hasPatCmp()">;
-def HasFPU       : Predicate<"Subtarget.hasFPU()">;
-// def HasESR       : Predicate<"Subtarget.hasESR()">;
-// def HasPVR       : Predicate<"Subtarget.hasPVR()">;
-def HasMul64     : Predicate<"Subtarget.hasMul64()">;
-def HasSqrt      : Predicate<"Subtarget.hasSqrt()">;
-// def HasMMU       : Predicate<"Subtarget.hasMMU()">;
-
-//===----------------------------------------------------------------------===//
-// MBlaze Operand, Complex Patterns and Transformations Definitions.
-//===----------------------------------------------------------------------===//
-
-def MBlazeMemAsmOperand : AsmOperandClass {
-  let Name = "Mem";
-  let SuperClasses = [];
-}
-
-def MBlazeFslAsmOperand : AsmOperandClass {
-  let Name = "Fsl";
-  let SuperClasses = [];
-}
-
-// Instruction operand types
-def brtarget    : Operand<OtherVT>;
-def calltarget  : Operand<i32>;
-def simm16      : Operand<i32>;
-def uimm5       : Operand<i32>;
-def uimm15      : Operand<i32>;
-def fimm        : Operand<f32>;
-
-// Unsigned Operand
-def uimm16      : Operand<i32> {
-  let PrintMethod = "printUnsignedImm";
-}
-
-// FSL Operand
-def fslimm      : Operand<i32> {
-  let PrintMethod = "printFSLImm";
-  let ParserMatchClass = MBlazeFslAsmOperand;
-}
-
-// Address operand
-def memri : Operand<i32> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops GPR, simm16);
-  let ParserMatchClass = MBlazeMemAsmOperand;
-}
-
-def memrr : Operand<i32> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops GPR, GPR);
-  let ParserMatchClass = MBlazeMemAsmOperand;
-}
-
-// Node immediate fits as 16-bit sign extended on target immediate.
-def immSExt16  : PatLeaf<(imm), [{
-  return (N->getZExtValue() >> 16) == 0;
-}]>;
-
-// Node immediate fits as 16-bit zero extended on target immediate.
-// The LO16 param means that only the lower 16 bits of the node
-// immediate are caught.
-// e.g. addiu, sltiu
-def immZExt16  : PatLeaf<(imm), [{
-  return (N->getZExtValue() >> 16) == 0;
-}]>;
-
-// FSL immediate field must fit in 4 bits.
-def immZExt4 : PatLeaf<(imm), [{
-  return N->getZExtValue() == ((N->getZExtValue()) & 0xf) ;
-}]>;
-
-// shamt field must fit in 5 bits.
-def immZExt5 : PatLeaf<(imm), [{
-  return N->getZExtValue() == ((N->getZExtValue()) & 0x1f) ;
-}]>;
-
-// MBlaze Address Mode. SDNode frameindex could possibily be a match
-// since load and store instructions from stack used it.
-def iaddr : ComplexPattern<i32, 2, "SelectAddrRegImm", [frameindex], []>;
-def xaddr : ComplexPattern<i32, 2, "SelectAddrRegReg", [], []>;
-
-//===----------------------------------------------------------------------===//
-// Pseudo instructions
-//===----------------------------------------------------------------------===//
-
-// As stack alignment is always done with addiu, we need a 16-bit immediate
-let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : MBlazePseudo<(outs), (ins simm16:$amt),
-                                  "#ADJCALLSTACKDOWN $amt",
-                                  [(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP   : MBlazePseudo<(outs),
-                                  (ins uimm16:$amt1, simm16:$amt2),
-                                  "#ADJCALLSTACKUP $amt1",
-                                  [(callseq_end timm:$amt1, timm:$amt2)]>;
-}
-
-//===----------------------------------------------------------------------===//
-// Instructions specific format
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Arithmetic Instructions
-//===----------------------------------------------------------------------===//
-class Arith<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
-            InstrItinClass itin> :
-            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-               !strconcat(instr_asm, "   $dst, $b, $c"),
-               [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
-
-class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
-             Operand Od, PatLeaf imm_type> :
-             TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_ALU>;
-
-class ArithI32<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
-               TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
-                  !strconcat(instr_asm, "   $dst, $b, $c"),
-                  [], IIC_ALU>;
-
-class ShiftI<bits<6> op, bits<2> flags, string instr_asm, SDNode OpNode,
-             Operand Od, PatLeaf imm_type> :
-             SHT<op, flags, (outs GPR:$dst), (ins GPR:$b, Od:$c),
-                 !strconcat(instr_asm, "   $dst, $b, $c"),
-                 [(set GPR:$dst, (OpNode GPR:$b, imm_type:$c))], IIC_SHT>;
-
-class ArithR<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode,
-            InstrItinClass itin> :
-            TAR<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-                !strconcat(instr_asm, "   $dst, $c, $b"),
-                [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], itin>;
-
-class ArithRI<bits<6> op, string instr_asm, SDNode OpNode,
-             Operand Od, PatLeaf imm_type> :
-             TBR<op, (outs GPR:$dst), (ins Od:$b, GPR:$c),
-                 !strconcat(instr_asm, "   $dst, $c, $b"),
-                 [(set GPR:$dst, (OpNode imm_type:$b, GPR:$c))], IIC_ALU>;
-
-class ArithN<bits<6> op, bits<11> flags, string instr_asm,
-            InstrItinClass itin> :
-            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-               !strconcat(instr_asm, "   $dst, $b, $c"),
-               [], itin>;
-
-class ArithNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
-             TB<op, (outs GPR:$dst), (ins GPR:$b, Od:$c),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                [], IIC_ALU>;
-
-class ArithRN<bits<6> op, bits<11> flags, string instr_asm,
-            InstrItinClass itin> :
-            TAR<op, flags, (outs GPR:$dst), (ins GPR:$c, GPR:$b),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                [], itin>;
-
-class ArithRNI<bits<6> op, string instr_asm,Operand Od, PatLeaf imm_type> :
-             TBR<op, (outs GPR:$dst), (ins Od:$c, GPR:$b),
-                 !strconcat(instr_asm, "   $dst, $b, $c"),
-                 [], IIC_ALU>;
-
-//===----------------------------------------------------------------------===//
-// Misc Arithmetic Instructions
-//===----------------------------------------------------------------------===//
-
-class Logic<bits<6> op, bits<11> flags, string instr_asm, SDNode OpNode> :
-            TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-               !strconcat(instr_asm, "   $dst, $b, $c"),
-               [(set GPR:$dst, (OpNode GPR:$b, GPR:$c))], IIC_ALU>;
-
-class LogicI<bits<6> op, string instr_asm, SDNode OpNode> :
-             TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                [(set GPR:$dst, (OpNode GPR:$b, immZExt16:$c))],
-                IIC_ALU>;
-
-class LogicI32<bits<6> op, string instr_asm> :
-               TB<op, (outs GPR:$dst), (ins GPR:$b, uimm16:$c),
-                  !strconcat(instr_asm, "   $dst, $b, $c"),
-                  [], IIC_ALU>;
-
-class PatCmp<bits<6> op, bits<11> flags, string instr_asm> :
-             TA<op, flags, (outs GPR:$dst), (ins GPR:$b, GPR:$c),
-                !strconcat(instr_asm, "   $dst, $b, $c"),
-                 [], IIC_ALU>;
-
-//===----------------------------------------------------------------------===//
-// Memory Access Instructions
-//===----------------------------------------------------------------------===//
-
-let mayLoad = 1 in {
-class LoadM<bits<6> op, bits<11> flags, string instr_asm> :
-            TA<op, flags, (outs GPR:$dst), (ins memrr:$addr),
-               !strconcat(instr_asm, "   $dst, $addr"),
-               [], IIC_MEMl>;
-}
-
-class LoadMI<bits<6> op, string instr_asm, PatFrag OpNode> :
-             TB<op, (outs GPR:$dst), (ins memri:$addr),
-                !strconcat(instr_asm, "   $dst, $addr"),
-                [(set (i32 GPR:$dst), (OpNode iaddr:$addr))], IIC_MEMl>;
-
-let mayStore = 1 in {
-class StoreM<bits<6> op, bits<11> flags, string instr_asm> :
-             TA<op, flags, (outs), (ins GPR:$dst, memrr:$addr),
-                !strconcat(instr_asm, "   $dst, $addr"),
-                [], IIC_MEMs>;
-}
-
-class StoreMI<bits<6> op, string instr_asm, PatFrag OpNode> :
-              TB<op, (outs), (ins GPR:$dst, memri:$addr),
-                 !strconcat(instr_asm, "   $dst, $addr"),
-                 [(OpNode (i32 GPR:$dst), iaddr:$addr)], IIC_MEMs>;
-
-//===----------------------------------------------------------------------===//
-// Branch Instructions
-//===----------------------------------------------------------------------===//
-class Branch<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
-             TA<op, flags, (outs), (ins GPR:$target),
-                !strconcat(instr_asm, "   $target"),
-                [], IIC_BR> {
-  let rd = 0x0;
-  let ra = br;
-  let Form = FCCR;
-}
-
-class BranchI<bits<6> op, bits<5> br, string instr_asm> :
-              TB<op, (outs), (ins brtarget:$target),
-                 !strconcat(instr_asm, "   $target"),
-                 [], IIC_BR> {
-  let rd = 0;
-  let ra = br;
-  let Form = FCCI;
-}
-
-//===----------------------------------------------------------------------===//
-// Branch and Link Instructions
-//===----------------------------------------------------------------------===//
-class BranchL<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
-              TA<op, flags, (outs), (ins GPR:$link, GPR:$target),
-                 !strconcat(instr_asm, "   $link, $target"),
-                 [], IIC_BRl> {
-  let ra = br;
-  let Form = FRCR;
-}
-
-class BranchLI<bits<6> op, bits<5> br, string instr_asm> :
-               TB<op, (outs), (ins GPR:$link, calltarget:$target),
-                  !strconcat(instr_asm, "   $link, $target"),
-                  [], IIC_BRl> {
-  let ra = br;
-  let Form = FRCI;
-}
-
-//===----------------------------------------------------------------------===//
-// Conditional Branch Instructions
-//===----------------------------------------------------------------------===//
-class BranchC<bits<6> op, bits<5> br, bits<11> flags, string instr_asm> :
-              TA<op, flags, (outs),
-                 (ins GPR:$a, GPR:$b),
-                 !strconcat(instr_asm, "   $a, $b"),
-                 [], IIC_BRc> {
-  let rd = br;
-  let Form = FCRR;
-}
-
-class BranchCI<bits<6> op, bits<5> br, string instr_asm> :
-               TB<op, (outs), (ins GPR:$a, brtarget:$offset),
-                  !strconcat(instr_asm, "   $a, $offset"),
-                  [], IIC_BRc> {
-  let rd = br;
-  let Form = FCRI;
-}
-
-//===----------------------------------------------------------------------===//
-// MBlaze arithmetic instructions
-//===----------------------------------------------------------------------===//
-
-let isCommutable = 1, isAsCheapAsAMove = 1 in {
-  def ADDK   :  Arith<0x04, 0x000, "addk   ", add,  IIC_ALU>;
-  def AND    :  Logic<0x21, 0x000, "and    ", and>;
-  def OR     :  Logic<0x20, 0x000, "or     ", or>;
-  def XOR    :  Logic<0x22, 0x000, "xor    ", xor>;
-
-  let Predicates=[HasPatCmp] in {
-    def PCMPBF : PatCmp<0x20, 0x400, "pcmpbf ">;
-    def PCMPEQ : PatCmp<0x22, 0x400, "pcmpeq ">;
-    def PCMPNE : PatCmp<0x23, 0x400, "pcmpne ">;
-  }
-
-  let Defs = [CARRY] in {
-    def ADD    :  Arith<0x00, 0x000, "add    ", addc, IIC_ALU>;
-
-    let Uses = [CARRY] in {
-      def ADDC   :  Arith<0x02, 0x000, "addc   ", adde, IIC_ALU>;
-    }
-  }
-
-  let Uses = [CARRY] in {
-    def ADDKC  : ArithN<0x06, 0x000, "addkc  ", IIC_ALU>;
-  }
-}
-
-let isAsCheapAsAMove = 1 in {
-  def ANDN   :  ArithN<0x23, 0x000, "andn   ", IIC_ALU>;
-  def CMP    :  ArithN<0x05, 0x001, "cmp    ", IIC_ALU>;
-  def CMPU   :  ArithN<0x05, 0x003, "cmpu   ", IIC_ALU>;
-  def RSUBK  :  ArithR<0x05, 0x000, "rsubk  ", sub,  IIC_ALU>;
-
-  let Defs = [CARRY] in {
-    def RSUB   :  ArithR<0x01, 0x000, "rsub   ", subc, IIC_ALU>;
-
-    let Uses = [CARRY] in {
-      def RSUBC  :  ArithR<0x03, 0x000, "rsubc  ", sube, IIC_ALU>;
-    }
-  }
-
-  let Uses = [CARRY] in {
-    def RSUBKC : ArithRN<0x07, 0x000, "rsubkc ", IIC_ALU>;
-  }
-}
-
-let isCommutable = 1, Predicates=[HasMul] in {
-  def MUL    : Arith<0x10, 0x000, "mul    ", mul,   IIC_ALUm>;
-}
-
-let isCommutable = 1, Predicates=[HasMul,HasMul64] in {
-  def MULH   : Arith<0x10, 0x001, "mulh   ", mulhs, IIC_ALUm>;
-  def MULHU  : Arith<0x10, 0x003, "mulhu  ", mulhu, IIC_ALUm>;
-}
-
-let Predicates=[HasMul,HasMul64] in {
-  def MULHSU : ArithN<0x10, 0x002, "mulhsu ", IIC_ALUm>;
-}
-
-let Predicates=[HasBarrel] in {
-  def BSRL   :   Arith<0x11, 0x000, "bsrl   ", srl, IIC_SHT>;
-  def BSRA   :   Arith<0x11, 0x200, "bsra   ", sra, IIC_SHT>;
-  def BSLL   :   Arith<0x11, 0x400, "bsll   ", shl, IIC_SHT>;
-  def BSRLI  :  ShiftI<0x19, 0x0, "bsrli  ", srl, uimm5, immZExt5>;
-  def BSRAI  :  ShiftI<0x19, 0x1, "bsrai  ", sra, uimm5, immZExt5>;
-  def BSLLI  :  ShiftI<0x19, 0x2, "bslli  ", shl, uimm5, immZExt5>;
-}
-
-let Predicates=[HasDiv] in {
-  def IDIV   :  ArithR<0x12, 0x000, "idiv   ", sdiv, IIC_ALUd>;
-  def IDIVU  :  ArithR<0x12, 0x002, "idivu  ", udiv, IIC_ALUd>;
-}
-
-//===----------------------------------------------------------------------===//
-// MBlaze immediate mode arithmetic instructions
-//===----------------------------------------------------------------------===//
-
-let isAsCheapAsAMove = 1 in {
-  def ADDIK   :   ArithI<0x0C, "addik  ", add,  simm16, immSExt16>;
-  def RSUBIK  :  ArithRI<0x0D, "rsubik ", sub, simm16, immSExt16>;
-  def ANDNI   :  ArithNI<0x2B, "andni  ", uimm16, immZExt16>;
-  def ANDI    :   LogicI<0x29, "andi   ", and>;
-  def ORI     :   LogicI<0x28, "ori    ", or>;
-  def XORI    :   LogicI<0x2A, "xori   ", xor>;
-
-  let Defs = [CARRY] in {
-    def ADDI    :   ArithI<0x08, "addi   ", addc, simm16, immSExt16>;
-    def RSUBI   :  ArithRI<0x09, "rsubi  ", subc,  simm16, immSExt16>;
-
-    let Uses = [CARRY] in {
-      def ADDIC   :   ArithI<0x0A, "addic  ", adde, simm16, immSExt16>;
-      def RSUBIC  :  ArithRI<0x0B, "rsubic ", sube, simm16, immSExt16>;
-    }
-  }
-
-  let Uses = [CARRY] in {
-    def ADDIKC  :  ArithNI<0x0E, "addikc ", simm16, immSExt16>;
-    def RSUBIKC : ArithRNI<0x0F, "rsubikc", simm16, immSExt16>;
-  }
-}
-
-let Predicates=[HasMul] in {
-  def MULI    :   ArithI<0x18, "muli   ", mul, simm16, immSExt16>;
-}
-
-//===----------------------------------------------------------------------===//
-// MBlaze memory access instructions
-//===----------------------------------------------------------------------===//
-
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
-  let neverHasSideEffects = 1 in {
-    def LBU  :  LoadM<0x30, 0x000, "lbu    ">;
-    def LBUR :  LoadM<0x30, 0x200, "lbur   ">;
-
-    def LHU  :  LoadM<0x31, 0x000, "lhu    ">;
-    def LHUR :  LoadM<0x31, 0x200, "lhur   ">;
-
-    def LW   :  LoadM<0x32, 0x000, "lw     ">;
-    def LWR  :  LoadM<0x32, 0x200, "lwr    ">;
-
-    let Defs = [CARRY] in {
-      def LWX  :  LoadM<0x32, 0x400, "lwx    ">;
-    }
-  }
-
-  def LBUI : LoadMI<0x38, "lbui   ", zextloadi8>;
-  def LHUI : LoadMI<0x39, "lhui   ", zextloadi16>;
-  def LWI  : LoadMI<0x3A, "lwi    ", load>;
-}
-
-def SB  :  StoreM<0x34, 0x000, "sb     ">;
-def SBR :  StoreM<0x34, 0x200, "sbr    ">;
-
-def SH  :  StoreM<0x35, 0x000, "sh     ">;
-def SHR :  StoreM<0x35, 0x200, "shr    ">;
-
-def SW  :  StoreM<0x36, 0x000, "sw     ">;
-def SWR :  StoreM<0x36, 0x200, "swr    ">;
-
-let Defs = [CARRY] in {
-  def SWX :  StoreM<0x36, 0x400, "swx    ">;
-}
-
-def SBI : StoreMI<0x3C, "sbi    ", truncstorei8>;
-def SHI : StoreMI<0x3D, "shi    ", truncstorei16>;
-def SWI : StoreMI<0x3E, "swi    ", store>;
-
-//===----------------------------------------------------------------------===//
-// MBlaze branch instructions
-//===----------------------------------------------------------------------===//
-
-let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-  def BRI    :  BranchI<0x2E, 0x00, "bri    ">;
-  def BRAI   :  BranchI<0x2E, 0x08, "brai   ">;
-}
-
-let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
-  def BEQI   : BranchCI<0x2F, 0x00, "beqi   ">;
-  def BNEI   : BranchCI<0x2F, 0x01, "bnei   ">;
-  def BLTI   : BranchCI<0x2F, 0x02, "blti   ">;
-  def BLEI   : BranchCI<0x2F, 0x03, "blei   ">;
-  def BGTI   : BranchCI<0x2F, 0x04, "bgti   ">;
-  def BGEI   : BranchCI<0x2F, 0x05, "bgei   ">;
-}
-
-let isBranch = 1, isIndirectBranch = 1, isTerminator = 1, hasCtrlDep = 1,
-    isBarrier = 1 in {
-  def BR     :   Branch<0x26, 0x00, 0x000, "br     ">;
-  def BRA    :   Branch<0x26, 0x08, 0x000, "bra    ">;
-}
-
-let isBranch = 1, isIndirectBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
-  def BEQ    :  BranchC<0x27, 0x00, 0x000, "beq    ">;
-  def BNE    :  BranchC<0x27, 0x01, 0x000, "bne    ">;
-  def BLT    :  BranchC<0x27, 0x02, 0x000, "blt    ">;
-  def BLE    :  BranchC<0x27, 0x03, 0x000, "ble    ">;
-  def BGT    :  BranchC<0x27, 0x04, 0x000, "bgt    ">;
-  def BGE    :  BranchC<0x27, 0x05, 0x000, "bge    ">;
-}
-
-let isBranch = 1, isTerminator = 1, hasDelaySlot = 1, hasCtrlDep = 1,
-    isBarrier = 1 in {
-  def BRID   :  BranchI<0x2E, 0x10, "brid   ">;
-  def BRAID  :  BranchI<0x2E, 0x18, "braid  ">;
-}
-
-let isBranch = 1, isTerminator = 1, hasDelaySlot = 1, hasCtrlDep = 1 in {
-  def BEQID  : BranchCI<0x2F, 0x10, "beqid  ">;
-  def BNEID  : BranchCI<0x2F, 0x11, "bneid  ">;
-  def BLTID  : BranchCI<0x2F, 0x12, "bltid  ">;
-  def BLEID  : BranchCI<0x2F, 0x13, "bleid  ">;
-  def BGTID  : BranchCI<0x2F, 0x14, "bgtid  ">;
-  def BGEID  : BranchCI<0x2F, 0x15, "bgeid  ">;
-}
-
-let isBranch = 1, isIndirectBranch = 1, isTerminator = 1,
-    hasDelaySlot = 1, hasCtrlDep = 1, isBarrier = 1 in {
-  def BRD    :   Branch<0x26, 0x10, 0x000, "brd    ">;
-  def BRAD   :   Branch<0x26, 0x18, 0x000, "brad   ">;
-}
-
-let isBranch = 1, isIndirectBranch = 1, isTerminator = 1,
-    hasDelaySlot = 1, hasCtrlDep = 1 in {
-  def BEQD   :  BranchC<0x27, 0x10, 0x000, "beqd   ">;
-  def BNED   :  BranchC<0x27, 0x11, 0x000, "bned   ">;
-  def BLTD   :  BranchC<0x27, 0x12, 0x000, "bltd   ">;
-  def BLED   :  BranchC<0x27, 0x13, 0x000, "bled   ">;
-  def BGTD   :  BranchC<0x27, 0x14, 0x000, "bgtd   ">;
-  def BGED   :  BranchC<0x27, 0x15, 0x000, "bged   ">;
-}
-
-let isCall =1, hasDelaySlot = 1,
-    Defs = [R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,CARRY],
-    Uses = [R1] in {
-  def BRLID  : BranchLI<0x2E, 0x14, "brlid  ">;
-  def BRALID : BranchLI<0x2E, 0x1C, "bralid ">;
-}
-
-let isCall = 1, hasDelaySlot = 1,
-    Defs = [R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,CARRY],
-    Uses = [R1] in {
-  def BRLD   : BranchL<0x26, 0x14, 0x000, "brld   ">;
-  def BRALD  : BranchL<0x26, 0x1C, 0x000, "brald  ">;
-}
-
-let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
-    rd=0x10, Form=FCRI in {
-  def RTSD   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
-                  "rtsd      $target, $imm",
-                  [],
-                  IIC_BR>;
-}
-
-let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
-    rd=0x11, Form=FCRI in {
-  def RTID   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
-                  "rtid      $target, $imm",
-                  [],
-                  IIC_BR>;
-}
-
-let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
-    rd=0x12, Form=FCRI in {
-  def RTBD   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
-                  "rtbd      $target, $imm",
-                  [],
-                  IIC_BR>;
-}
-
-let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1,
-    rd=0x14, Form=FCRI in {
-  def RTED   : TB<0x2D, (outs), (ins GPR:$target, simm16:$imm),
-                  "rted      $target, $imm",
-                  [],
-                  IIC_BR>;
-}
-
-//===----------------------------------------------------------------------===//
-// MBlaze misc instructions
-//===----------------------------------------------------------------------===//
-
-let neverHasSideEffects = 1 in {
-  def NOP :  MBlazeInst<0x20, FC, (outs), (ins), "nop    ", [], IIC_ALU>;
-}
-
-let Predicates=[HasPatCmp] in {
-  def CLZ :  TCLZ<0x24, 0x00E0, (outs GPR:$dst), (ins GPR:$src),
-                  "clz    $dst, $src", [], IIC_ALU>;
-}
-
-def IMEMBAR  : MBAR<0x2E, 0x0420004, (outs), (ins), "mbar   2", [], IIC_ALU>;
-def DMEMBAR  : MBAR<0x2E, 0x0220004, (outs), (ins), "mbar   1", [], IIC_ALU>;
-def IDMEMBAR : MBAR<0x2E, 0x0020004, (outs), (ins), "mbar   0", [], IIC_ALU>;
-
-let usesCustomInserter = 1 in {
-  def Select_CC : MBlazePseudo<(outs GPR:$dst),
-    (ins GPR:$T, GPR:$F, GPR:$CMP, i32imm:$CC), // F T reversed
-    "; SELECT_CC PSEUDO!",
-    []>;
-
-  def ShiftL : MBlazePseudo<(outs GPR:$dst),
-    (ins GPR:$L, GPR:$R),
-    "; ShiftL PSEUDO!",
-    []>;
-
-  def ShiftRA : MBlazePseudo<(outs GPR:$dst),
-    (ins GPR:$L, GPR:$R),
-    "; ShiftRA PSEUDO!",
-    []>;
-
-  def ShiftRL : MBlazePseudo<(outs GPR:$dst),
-    (ins GPR:$L, GPR:$R),
-    "; ShiftRL PSEUDO!",
-    []>;
-}
-
-let rb = 0 in {
-  def SEXT16 : TA<0x24, 0x061, (outs GPR:$dst), (ins GPR:$src),
-                  "sext16    $dst, $src", [], IIC_ALU>;
-  def SEXT8  : TA<0x24, 0x060, (outs GPR:$dst), (ins GPR:$src),
-                  "sext8     $dst, $src", [], IIC_ALU>;
-  let Defs = [CARRY] in {
-    def SRL    : TA<0x24, 0x041, (outs GPR:$dst), (ins GPR:$src),
-                    "srl       $dst, $src", [], IIC_ALU>;
-    def SRA    : TA<0x24, 0x001, (outs GPR:$dst), (ins GPR:$src),
-                    "sra       $dst, $src", [], IIC_ALU>;
-    let Uses = [CARRY] in {
-      def SRC    : TA<0x24, 0x021, (outs GPR:$dst), (ins GPR:$src),
-                      "src       $dst, $src", [], IIC_ALU>;
-    }
-  }
-}
-
-let isCodeGenOnly=1 in {
-  def ADDIK32 : ArithI32<0x08, "addik  ", simm16, immSExt16>;
-  def ORI32   : LogicI32<0x28, "ori    ">;
-  def BRLID32 : BranchLI<0x2E, 0x14, "brlid  ">;
-}
-
-//===----------------------------------------------------------------------===//
-// Misc. instructions
-//===----------------------------------------------------------------------===//
-let Form=FRCS in {
-  def MFS : SPC<0x25, 0x2, (outs GPR:$dst), (ins SPR:$src),
-                "mfs       $dst, $src", [], IIC_ALU>;
-}
-
-let Form=FCRCS in {
-  def MTS : SPC<0x25, 0x3, (outs SPR:$dst), (ins GPR:$src),
-                "mts       $dst, $src", [], IIC_ALU>;
-}
-
-def MSRSET : MSR<0x25, 0x20, (outs GPR:$dst), (ins uimm15:$set),
-                 "msrset    $dst, $set", [], IIC_ALU>;
-
-def MSRCLR : MSR<0x25, 0x22, (outs GPR:$dst), (ins uimm15:$clr),
-                 "msrclr    $dst, $clr", [], IIC_ALU>;
-
-let rd=0x0, Form=FCRR in {
-  def WDC  : TA<0x24, 0x64, (outs), (ins GPR:$a, GPR:$b),
-                "wdc       $a, $b", [], IIC_WDC>;
-  def WDCF : TA<0x24, 0x74, (outs), (ins GPR:$a, GPR:$b),
-                "wdc.flush $a, $b", [], IIC_WDC>;
-  def WDCC : TA<0x24, 0x66, (outs), (ins GPR:$a, GPR:$b),
-                "wdc.clear $a, $b", [], IIC_WDC>;
-  def WIC  : TA<0x24, 0x68, (outs), (ins GPR:$a, GPR:$b),
-                "wic       $a, $b", [], IIC_WDC>;
-}
-
-def BRK  :  BranchL<0x26, 0x0C, 0x000, "brk    ">;
-def BRKI : BranchLI<0x2E, 0x0C, "brki   ">;
-
-def IMM : MBlazeInst<0x2C, FCCI, (outs), (ins simm16:$imm),
-                     "imm       $imm", [], IIC_ALU>;
-
-//===----------------------------------------------------------------------===//
-// Pseudo instructions for atomic operations
-//===----------------------------------------------------------------------===//
-let usesCustomInserter=1 in {
-  def CAS32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$cmp, GPR:$swp),
-    "# atomic compare and swap",
-    [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$cmp, GPR:$swp))]>;
-
-  def SWP32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$swp),
-    "# atomic swap",
-    [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$swp))]>;
-
-  def LAA32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
-    "# atomic load and add",
-    [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$val))]>;
-
-  def LAS32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
-    "# atomic load and sub",
-    [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$val))]>;
-
-  def LAD32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
-    "# atomic load and and",
-    [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$val))]>;
-
-  def LAO32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
-    "# atomic load and or",
-    [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$val))]>;
-
-  def LAX32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
-    "# atomic load and xor",
-    [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$val))]>;
-
-  def LAN32 : MBlazePseudo<(outs GPR:$dst), (ins GPR:$ptr, GPR:$val),
-    "# atomic load and nand",
-    [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$val))]>;
-
-  def MEMBARRIER : MBlazePseudo<(outs), (ins),
-    "# memory barrier", []>;
-}
-
-//===----------------------------------------------------------------------===//
-//  Arbitrary patterns that map to one or more instructions
-//===----------------------------------------------------------------------===//
-
-// Small immediates
-def : Pat<(i32 0), (ADDK (i32 R0), (i32 R0))>;
-def : Pat<(i32 immSExt16:$imm), (ADDIK (i32 R0), imm:$imm)>;
-def : Pat<(i32 immZExt16:$imm), (ORI (i32 R0), imm:$imm)>;
-
-// Arbitrary immediates
-def : Pat<(i32 imm:$imm), (ADDIK (i32 R0), imm:$imm)>;
-
-// In register sign extension
-def : Pat<(sext_inreg GPR:$src, i16), (SEXT16 GPR:$src)>;
-def : Pat<(sext_inreg GPR:$src, i8),  (SEXT8 GPR:$src)>;
-
-// Call
-def : Pat<(MBlazeJmpLink (i32 tglobaladdr:$dst)),
-          (BRLID (i32 R15), tglobaladdr:$dst)>;
-
-def : Pat<(MBlazeJmpLink (i32 texternalsym:$dst)),
-          (BRLID (i32 R15), texternalsym:$dst)>;
-
-def : Pat<(MBlazeJmpLink GPR:$dst),
-          (BRALD (i32 R15), GPR:$dst)>;
-
-// Shift Instructions
-def : Pat<(shl GPR:$L, GPR:$R), (ShiftL GPR:$L, GPR:$R)>;
-def : Pat<(sra GPR:$L, GPR:$R), (ShiftRA GPR:$L, GPR:$R)>;
-def : Pat<(srl GPR:$L, GPR:$R), (ShiftRL GPR:$L, GPR:$R)>;
-
-// SET_CC operations
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETEQ),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 1)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETNE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 2)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETGT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 3)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETLT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 4)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETGE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 5)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETLE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$L, 6)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETUGT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU (i32 R0), GPR:$L), 3)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETULT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU (i32 R0), GPR:$L), 4)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETUGE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU (i32 R0), GPR:$L), 5)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 0), SETULE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU (i32 R0), GPR:$L), 6)>;
-
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETEQ),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 1)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETNE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 2)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETGT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 3)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETLT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 4)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETGE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 5)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETLE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0), GPR:$R, 6)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETUGT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, (i32 R0)), 3)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETULT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, (i32 R0)), 4)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETUGE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, (i32 R0)), 5)>;
-def : Pat<(setcc (i32 0), (i32 GPR:$R), SETULE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, (i32 R0)), 6)>;
-
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETEQ),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMP GPR:$R, GPR:$L), 1)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETNE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMP GPR:$R, GPR:$L), 2)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETGT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMP GPR:$R, GPR:$L), 3)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETLT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMP GPR:$R, GPR:$L), 4)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETGE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMP GPR:$R, GPR:$L), 5)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETLE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMP GPR:$R, GPR:$L), 6)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETUGT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, GPR:$L), 3)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETULT),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, GPR:$L), 4)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETUGE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, GPR:$L), 5)>;
-def : Pat<(setcc (i32 GPR:$L), (i32 GPR:$R), SETULE),
-          (Select_CC (ADDIK (i32 R0), 1), (ADDIK (i32 R0), 0),
-                     (CMPU GPR:$R, GPR:$L), 6)>;
-
-// SELECT operations
-def : Pat<(select (i32 GPR:$C), (i32 GPR:$T), (i32 GPR:$F)),
-          (Select_CC GPR:$T, GPR:$F, GPR:$C, 2)>;
-
-// SELECT_CC
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETEQ),
-          (Select_CC GPR:$T, GPR:$F, GPR:$L, 1)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETNE),
-          (Select_CC GPR:$T, GPR:$F, GPR:$L, 2)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETGT),
-          (Select_CC GPR:$T, GPR:$F, GPR:$L, 3)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETLT),
-          (Select_CC GPR:$T, GPR:$F, GPR:$L, 4)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETGE),
-          (Select_CC GPR:$T, GPR:$F, GPR:$L, 5)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETLE),
-          (Select_CC GPR:$T, GPR:$F, GPR:$L, 6)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETUGT),
-          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 3)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETULT),
-          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 4)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETUGE),
-          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 5)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 0),
-                    (i32 GPR:$T), (i32 GPR:$F), SETULE),
-          (Select_CC GPR:$T, GPR:$F, (CMPU (i32 R0), GPR:$L), 6)>;
-
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETEQ),
-          (Select_CC GPR:$T, GPR:$F, GPR:$R, 1)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETNE),
-          (Select_CC GPR:$T, GPR:$F, GPR:$R, 2)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETGT),
-          (Select_CC GPR:$T, GPR:$F, GPR:$R, 3)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETLT),
-          (Select_CC GPR:$T, GPR:$F, GPR:$R, 4)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETGE),
-          (Select_CC GPR:$T, GPR:$F, GPR:$R, 5)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETLE),
-          (Select_CC GPR:$T, GPR:$F, GPR:$R, 6)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETUGT),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 3)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETULT),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 4)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETUGE),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 5)>;
-def : Pat<(selectcc (i32 0), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETULE),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, (i32 R0)), 6)>;
-
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETEQ),
-          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 1)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETNE),
-          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 2)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETGT),
-          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 3)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETLT),
-          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 4)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETGE),
-          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 5)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETLE),
-          (Select_CC GPR:$T, GPR:$F, (CMP GPR:$R, GPR:$L), 6)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETUGT),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 3)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETULT),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 4)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETUGE),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 5)>;
-def : Pat<(selectcc (i32 GPR:$L), (i32 GPR:$R),
-                    (i32 GPR:$T), (i32 GPR:$F), SETULE),
-          (Select_CC GPR:$T, GPR:$F, (CMPU GPR:$R, GPR:$L), 6)>;
-
-// Ret instructions
-def : Pat<(MBlazeRet GPR:$target), (RTSD GPR:$target, 0x8)>;
-def : Pat<(MBlazeIRet GPR:$target), (RTID GPR:$target, 0x0)>;
-
-// BR instructions
-def : Pat<(br bb:$T), (BRID bb:$T)>;
-def : Pat<(brind GPR:$T), (BRAD GPR:$T)>;
-
-// BRCOND instructions
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETEQ), bb:$T),
-          (BEQID GPR:$L, bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETNE), bb:$T),
-          (BNEID GPR:$L, bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETGT), bb:$T),
-          (BGTID GPR:$L, bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETLT), bb:$T),
-          (BLTID GPR:$L, bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETGE), bb:$T),
-          (BGEID GPR:$L, bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETLE), bb:$T),
-          (BLEID GPR:$L, bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETUGT), bb:$T),
-          (BGTID (CMPU (i32 R0), GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETULT), bb:$T),
-          (BLTID (CMPU (i32 R0), GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETUGE), bb:$T),
-          (BGEID (CMPU (i32 R0), GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 0), SETULE), bb:$T),
-          (BLEID (CMPU (i32 R0), GPR:$L), bb:$T)>;
-
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETEQ), bb:$T),
-          (BEQID GPR:$R, bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETNE), bb:$T),
-          (BNEID GPR:$R, bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETGT), bb:$T),
-          (BGTID GPR:$R, bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETLT), bb:$T),
-          (BLTID GPR:$R, bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETGE), bb:$T),
-          (BGEID GPR:$R, bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETLE), bb:$T),
-          (BLEID GPR:$R, bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETUGT), bb:$T),
-          (BGTID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETULT), bb:$T),
-          (BLTID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETUGE), bb:$T),
-          (BGEID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
-def : Pat<(brcond (setcc (i32 0), (i32 GPR:$R), SETULE), bb:$T),
-          (BLEID (CMPU GPR:$R, (i32 R0)), bb:$T)>;
-
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETEQ), bb:$T),
-          (BEQID (CMP GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETNE), bb:$T),
-          (BNEID (CMP GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETGT), bb:$T),
-          (BGTID (CMP GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETLT), bb:$T),
-          (BLTID (CMP GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETGE), bb:$T),
-          (BGEID (CMP GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETLE), bb:$T),
-          (BLEID (CMP GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETUGT), bb:$T),
-          (BGTID (CMPU GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETULT), bb:$T),
-          (BLTID (CMPU GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETUGE), bb:$T),
-          (BGEID (CMPU GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (setcc (i32 GPR:$L), (i32 GPR:$R), SETULE), bb:$T),
-          (BLEID (CMPU GPR:$R, GPR:$L), bb:$T)>;
-def : Pat<(brcond (i32 GPR:$C), bb:$T),
-          (BNEID GPR:$C, bb:$T)>;
-
-// Jump tables, global addresses, and constant pools
-def : Pat<(MBWrapper tglobaladdr:$in), (ORI (i32 R0), tglobaladdr:$in)>;
-def : Pat<(MBWrapper tjumptable:$in),  (ORI (i32 R0), tjumptable:$in)>;
-def : Pat<(MBWrapper tconstpool:$in),  (ORI (i32 R0), tconstpool:$in)>;
-
-// Misc instructions
-def : Pat<(and (i32 GPR:$lh), (not (i32 GPR:$rh))),(ANDN GPR:$lh, GPR:$rh)>;
-
-// Convert any extend loads into zero extend loads
-def : Pat<(extloadi8  iaddr:$src), (i32 (LBUI iaddr:$src))>;
-def : Pat<(extloadi16 iaddr:$src), (i32 (LHUI iaddr:$src))>;
-def : Pat<(extloadi8  xaddr:$src), (i32 (LBU xaddr:$src))>;
-def : Pat<(extloadi16 xaddr:$src), (i32 (LHU xaddr:$src))>;
-
-// 32-bit load and store
-def : Pat<(store (i32 GPR:$dst), xaddr:$addr), (SW GPR:$dst, xaddr:$addr)>;
-def : Pat<(load xaddr:$addr), (i32 (LW xaddr:$addr))>;
-
-// 16-bit load and store
-def : Pat<(truncstorei16 (i32 GPR:$dst), xaddr:$ad), (SH GPR:$dst, xaddr:$ad)>;
-def : Pat<(zextloadi16 xaddr:$addr), (i32 (LHU xaddr:$addr))>;
-
-// 8-bit load and store
-def : Pat<(truncstorei8 (i32 GPR:$dst), xaddr:$ad), (SB GPR:$dst, xaddr:$ad)>;
-def : Pat<(zextloadi8 xaddr:$addr), (i32 (LBU xaddr:$addr))>;
-
-// Peepholes
-def : Pat<(store (i32 0), iaddr:$dst), (SWI (i32 R0), iaddr:$dst)>;
-
-// Atomic fence
-def : Pat<(atomic_fence (imm), (imm)), (MEMBARRIER)>;
-
-//===----------------------------------------------------------------------===//
-// Floating Point Support
-//===----------------------------------------------------------------------===//
-include "MBlazeInstrFSL.td"
-include "MBlazeInstrFPU.td"
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
deleted file mode 100644
index 8d262a0..0000000
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-//===-- MBlazeIntrinsicInfo.cpp - Intrinsic Information -------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the MBlaze implementation of TargetIntrinsicInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeIntrinsicInfo.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstring>
-
-using namespace llvm;
-
-namespace mblazeIntrinsic {
-
-  enum ID {
-    last_non_mblaze_intrinsic = Intrinsic::num_intrinsics-1,
-#define GET_INTRINSIC_ENUM_VALUES
-#include "MBlazeGenIntrinsics.inc"
-#undef GET_INTRINSIC_ENUM_VALUES
-    , num_mblaze_intrinsics
-  };
-
-#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-#include "MBlazeGenIntrinsics.inc"
-#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
-}
-
-std::string MBlazeIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
-                                         unsigned numTys) const {
-  static const char *const names[] = {
-#define GET_INTRINSIC_NAME_TABLE
-#include "MBlazeGenIntrinsics.inc"
-#undef GET_INTRINSIC_NAME_TABLE
-  };
-
-  assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded");
-  if (IntrID < Intrinsic::num_intrinsics)
-    return 0;
-  assert(IntrID < mblazeIntrinsic::num_mblaze_intrinsics &&
-         "Invalid intrinsic ID");
-
-  std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
-  return Result;
-}
-
-unsigned MBlazeIntrinsicInfo::
-lookupName(const char *Name, unsigned Len) const {
-  if (Len < 5 || Name[4] != '.' || Name[0] != 'l' || Name[1] != 'l'
-      || Name[2] != 'v' || Name[3] != 'm')
-    return 0;  // All intrinsics start with 'llvm.'
-
-#define GET_FUNCTION_RECOGNIZER
-#include "MBlazeGenIntrinsics.inc"
-#undef GET_FUNCTION_RECOGNIZER
-  return 0;
-}
-
-unsigned MBlazeIntrinsicInfo::
-lookupGCCName(const char *Name) const {
-    return mblazeIntrinsic::getIntrinsicForGCCBuiltin("mblaze",Name);
-}
-
-bool MBlazeIntrinsicInfo::isOverloaded(unsigned IntrID) const {
-  if (IntrID == 0)
-    return false;
-
-  unsigned id = IntrID - Intrinsic::num_intrinsics + 1;
-#define GET_INTRINSIC_OVERLOAD_TABLE
-#include "MBlazeGenIntrinsics.inc"
-#undef GET_INTRINSIC_OVERLOAD_TABLE
-}
-
-/// This defines the "getAttributes(LLVMContext &C, ID id)" method.
-#define GET_INTRINSIC_ATTRIBUTES
-#include "MBlazeGenIntrinsics.inc"
-#undef GET_INTRINSIC_ATTRIBUTES
-
-static FunctionType *getType(LLVMContext &Context, unsigned id) {
-  Type *ResultTy = NULL;
-  SmallVector<Type*, 8> ArgTys;
-  bool IsVarArg = false;
-
-#define GET_INTRINSIC_GENERATOR
-#include "MBlazeGenIntrinsics.inc"
-#undef GET_INTRINSIC_GENERATOR
-
-  return FunctionType::get(ResultTy, ArgTys, IsVarArg);
-}
-
-Function *MBlazeIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
-                                                Type **Tys,
-                                                unsigned numTy) const {
-  assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded");
-  AttributeSet AList = getAttributes(M->getContext(),
-                                    (mblazeIntrinsic::ID) IntrID);
-  return cast<Function>(M->getOrInsertFunction(getName(IntrID),
-                                               getType(M->getContext(), IntrID),
-                                               AList));
-}
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.h b/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
deleted file mode 100644
index 34f3792..0000000
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.h
+++ /dev/null
@@ -1,33 +0,0 @@
-//===-- MBlazeIntrinsicInfo.h - MBlaze Intrinsic Information ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the MBlaze implementation of TargetIntrinsicInfo.
-//
-//===----------------------------------------------------------------------===//
-#ifndef MBLAZEINTRINSICS_H
-#define MBLAZEINTRINSICS_H
-
-#include "llvm/Target/TargetIntrinsicInfo.h"
-
-namespace llvm {
-
-  class MBlazeIntrinsicInfo : public TargetIntrinsicInfo {
-  public:
-    std::string getName(unsigned IntrID, Type **Tys = 0,
-                        unsigned numTys = 0) const;
-    unsigned lookupName(const char *Name, unsigned Len) const;
-    unsigned lookupGCCName(const char *Name) const;
-    bool isOverloaded(unsigned IID) const;
-    Function *getDeclaration(Module *M, unsigned ID, Type **Tys = 0,
-                             unsigned numTys = 0) const;
-  };
-
-}
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeIntrinsics.td b/lib/Target/MBlaze/MBlazeIntrinsics.td
deleted file mode 100644
index b5dc595..0000000
--- a/lib/Target/MBlaze/MBlazeIntrinsics.td
+++ /dev/null
@@ -1,131 +0,0 @@
-//===-- IntrinsicsMBlaze.td - Defines MBlaze intrinsics ----*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines all of the MicroBlaze-specific intrinsics.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Definitions for all MBlaze intrinsics.
-//
-
-// MBlaze intrinsic classes.
-let TargetPrefix = "mblaze", isTarget = 1 in {
-  class MBFSL_Get_Intrinsic : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>;
-
-  class MBFSL_Put_Intrinsic : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>;
-
-  class MBFSL_PutT_Intrinsic : Intrinsic<[], [llvm_i32_ty], []>;
-}
-
-//===----------------------------------------------------------------------===//
-// MicroBlaze FSL Get Intrinsic Definitions.
-//
-
-def int_mblaze_fsl_get      : GCCBuiltin<"__builtin_mblaze_fsl_get">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_aget     : GCCBuiltin<"__builtin_mblaze_fsl_aget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_cget     : GCCBuiltin<"__builtin_mblaze_fsl_cget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_caget    : GCCBuiltin<"__builtin_mblaze_fsl_caget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_eget     : GCCBuiltin<"__builtin_mblaze_fsl_eget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_eaget    : GCCBuiltin<"__builtin_mblaze_fsl_eaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_ecget    : GCCBuiltin<"__builtin_mblaze_fsl_ecget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_ecaget   : GCCBuiltin<"__builtin_mblaze_fsl_ecaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_nget     : GCCBuiltin<"__builtin_mblaze_fsl_nget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_naget    : GCCBuiltin<"__builtin_mblaze_fsl_naget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_ncget    : GCCBuiltin<"__builtin_mblaze_fsl_ncget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_ncaget   : GCCBuiltin<"__builtin_mblaze_fsl_ncaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_neget    : GCCBuiltin<"__builtin_mblaze_fsl_neget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_neaget   : GCCBuiltin<"__builtin_mblaze_fsl_neaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_necget   : GCCBuiltin<"__builtin_mblaze_fsl_necget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_necaget  : GCCBuiltin<"__builtin_mblaze_fsl_necaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tget     : GCCBuiltin<"__builtin_mblaze_fsl_tget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_taget    : GCCBuiltin<"__builtin_mblaze_fsl_taget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tcget    : GCCBuiltin<"__builtin_mblaze_fsl_tcget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tcaget   : GCCBuiltin<"__builtin_mblaze_fsl_tcaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_teget    : GCCBuiltin<"__builtin_mblaze_fsl_teget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_teaget   : GCCBuiltin<"__builtin_mblaze_fsl_teaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tecget   : GCCBuiltin<"__builtin_mblaze_fsl_tecget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tecaget  : GCCBuiltin<"__builtin_mblaze_fsl_tecaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tnget    : GCCBuiltin<"__builtin_mblaze_fsl_tnget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tnaget   : GCCBuiltin<"__builtin_mblaze_fsl_tnaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tncget   : GCCBuiltin<"__builtin_mblaze_fsl_tncget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tncaget  : GCCBuiltin<"__builtin_mblaze_fsl_tncaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tneget   : GCCBuiltin<"__builtin_mblaze_fsl_tneget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tneaget  : GCCBuiltin<"__builtin_mblaze_fsl_tneaget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tnecget  : GCCBuiltin<"__builtin_mblaze_fsl_tnecget">,
-                              MBFSL_Get_Intrinsic;
-def int_mblaze_fsl_tnecaget : GCCBuiltin<"__builtin_mblaze_fsl_tnecaget">,
-                              MBFSL_Get_Intrinsic;
-
-//===----------------------------------------------------------------------===//
-// MicroBlaze FSL Put Intrinsic Definitions.
-//
-
-def int_mblaze_fsl_put     : GCCBuiltin<"__builtin_mblaze_fsl_put">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_aput    : GCCBuiltin<"__builtin_mblaze_fsl_aput">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_cput    : GCCBuiltin<"__builtin_mblaze_fsl_cput">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_caput   : GCCBuiltin<"__builtin_mblaze_fsl_caput">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_nput    : GCCBuiltin<"__builtin_mblaze_fsl_nput">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_naput   : GCCBuiltin<"__builtin_mblaze_fsl_naput">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_ncput   : GCCBuiltin<"__builtin_mblaze_fsl_ncput">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_ncaput  : GCCBuiltin<"__builtin_mblaze_fsl_ncaput">,
-                             MBFSL_Put_Intrinsic;
-def int_mblaze_fsl_tput    : GCCBuiltin<"__builtin_mblaze_fsl_tput">,
-                             MBFSL_PutT_Intrinsic;
-def int_mblaze_fsl_taput   : GCCBuiltin<"__builtin_mblaze_fsl_taput">,
-                             MBFSL_PutT_Intrinsic;
-def int_mblaze_fsl_tcput   : GCCBuiltin<"__builtin_mblaze_fsl_tcput">,
-                             MBFSL_PutT_Intrinsic;
-def int_mblaze_fsl_tcaput  : GCCBuiltin<"__builtin_mblaze_fsl_tcaput">,
-                             MBFSL_PutT_Intrinsic;
-def int_mblaze_fsl_tnput   : GCCBuiltin<"__builtin_mblaze_fsl_tnput">,
-                             MBFSL_PutT_Intrinsic;
-def int_mblaze_fsl_tnaput  : GCCBuiltin<"__builtin_mblaze_fsl_tnaput">,
-                             MBFSL_PutT_Intrinsic;
-def int_mblaze_fsl_tncput  : GCCBuiltin<"__builtin_mblaze_fsl_tncput">,
-                             MBFSL_PutT_Intrinsic;
-def int_mblaze_fsl_tncaput : GCCBuiltin<"__builtin_mblaze_fsl_tncaput">,
-                             MBFSL_PutT_Intrinsic;
diff --git a/lib/Target/MBlaze/MBlazeMCInstLower.cpp b/lib/Target/MBlaze/MBlazeMCInstLower.cpp
deleted file mode 100644
index ad414ac4..0000000
--- a/lib/Target/MBlaze/MBlazeMCInstLower.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-//===-- MBlazeMCInstLower.cpp - Convert MBlaze MachineInstr to an MCInst---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains code to lower MBlaze MachineInstrs to their corresponding
-// MCInst records.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeMCInstLower.h"
-#include "MBlazeInstrInfo.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/Mangler.h"
-using namespace llvm;
-
-MCSymbol *MBlazeMCInstLower::
-GetGlobalAddressSymbol(const MachineOperand &MO) const {
-  switch (MO.getTargetFlags()) {
-  default: llvm_unreachable("Unknown target flag on GV operand");
-  case 0:  break;
-  }
-
-  return Printer.Mang->getSymbol(MO.getGlobal());
-}
-
-MCSymbol *MBlazeMCInstLower::
-GetExternalSymbolSymbol(const MachineOperand &MO) const {
-  switch (MO.getTargetFlags()) {
-  default: llvm_unreachable("Unknown target flag on GV operand");
-  case 0:  break;
-  }
-
-  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
-}
-
-MCSymbol *MBlazeMCInstLower::
-GetJumpTableSymbol(const MachineOperand &MO) const {
-  SmallString<256> Name;
-  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
-                            << Printer.getFunctionNumber() << '_'
-                            << MO.getIndex();
-  switch (MO.getTargetFlags()) {
-  default: llvm_unreachable("Unknown target flag on GV operand");
-  case 0:  break;
-  }
-
-  // Create a symbol for the name.
-  return Ctx.GetOrCreateSymbol(Name.str());
-}
-
-MCSymbol *MBlazeMCInstLower::
-GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
-  SmallString<256> Name;
-  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
-                            << Printer.getFunctionNumber() << '_'
-                            << MO.getIndex();
-
-  switch (MO.getTargetFlags()) {
-  default:
-      llvm_unreachable("Unknown target flag on GV operand");
-
-  case 0: break;
-  }
-
-  // Create a symbol for the name.
-  return Ctx.GetOrCreateSymbol(Name.str());
-}
-
-MCSymbol *MBlazeMCInstLower::
-GetBlockAddressSymbol(const MachineOperand &MO) const {
-  switch (MO.getTargetFlags()) {
-  default: llvm_unreachable("Unknown target flag on GV operand");
-  case 0: break;
-  }
-
-  return Printer.GetBlockAddressSymbol(MO.getBlockAddress());
-}
-
-MCOperand MBlazeMCInstLower::
-LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
-  // FIXME: We would like an efficient form for this, so we don't have to do a
-  // lot of extra uniquing.
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
-
-  switch (MO.getTargetFlags()) {
-  default:
-      llvm_unreachable("Unknown target flag on GV operand");
-
-  case 0: break;
-  }
-
-  if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
-                                   Ctx);
-  return MCOperand::CreateExpr(Expr);
-}
-
-void MBlazeMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
-  OutMI.setOpcode(MI->getOpcode());
-
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-
-    MCOperand MCOp;
-    switch (MO.getType()) {
-    default: llvm_unreachable("unknown operand type");
-    case MachineOperand::MO_Register:
-      // Ignore all implicit register operands.
-      if (MO.isImplicit()) continue;
-      MCOp = MCOperand::CreateReg(MO.getReg());
-      break;
-    case MachineOperand::MO_Immediate:
-      MCOp = MCOperand::CreateImm(MO.getImm());
-      break;
-    case MachineOperand::MO_MachineBasicBlock:
-      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
-                         MO.getMBB()->getSymbol(), Ctx));
-      break;
-    case MachineOperand::MO_GlobalAddress:
-      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
-      break;
-    case MachineOperand::MO_ExternalSymbol:
-      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
-      break;
-    case MachineOperand::MO_JumpTableIndex:
-      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
-      break;
-    case MachineOperand::MO_ConstantPoolIndex:
-      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
-      break;
-    case MachineOperand::MO_BlockAddress:
-      MCOp = LowerSymbolOperand(MO, GetBlockAddressSymbol(MO));
-      break;
-    case MachineOperand::MO_FPImmediate: {
-      bool ignored;
-      APFloat FVal = MO.getFPImm()->getValueAPF();
-      FVal.convert(APFloat::IEEEsingle, APFloat::rmTowardZero, &ignored);
-
-      APInt IVal = FVal.bitcastToAPInt();
-      uint64_t Val = *IVal.getRawData();
-      MCOp = MCOperand::CreateImm(Val);
-      break;
-    }
-    case MachineOperand::MO_RegisterMask:
-      continue;
-    }
-
-    OutMI.addOperand(MCOp);
-  }
-}
diff --git a/lib/Target/MBlaze/MBlazeMCInstLower.h b/lib/Target/MBlaze/MBlazeMCInstLower.h
deleted file mode 100644
index 8ab2c9a..0000000
--- a/lib/Target/MBlaze/MBlazeMCInstLower.h
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- MBlazeMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZE_MCINSTLOWER_H
-#define MBLAZE_MCINSTLOWER_H
-
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-  class AsmPrinter;
-  class MCContext;
-  class MCInst;
-  class MCOperand;
-  class MCSymbol;
-  class MachineInstr;
-  class MachineModuleInfoMachO;
-  class MachineOperand;
-
-  /// MBlazeMCInstLower - This class is used to lower an MachineInstr
-  /// into an MCInst.
-class LLVM_LIBRARY_VISIBILITY MBlazeMCInstLower {
-  MCContext &Ctx;
-
-  AsmPrinter &Printer;
-public:
-  MBlazeMCInstLower(MCContext &ctx, AsmPrinter &printer)
-    : Ctx(ctx), Printer(printer) {}
-  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
-
-  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
-
-  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
-  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
-  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
-  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
-  MCSymbol *GetBlockAddressSymbol(const MachineOperand &MO) const;
-};
-
-}
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeMachineFunction.h b/lib/Target/MBlaze/MBlazeMachineFunction.h
deleted file mode 100644
index 10d507f..0000000
--- a/lib/Target/MBlaze/MBlazeMachineFunction.h
+++ /dev/null
@@ -1,169 +0,0 @@
-//===-- MBlazeMachineFunctionInfo.h - Private data --------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the MBlaze specific subclass of MachineFunctionInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZE_MACHINE_FUNCTION_INFO_H
-#define MBLAZE_MACHINE_FUNCTION_INFO_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-
-namespace llvm {
-
-/// MBlazeFunctionInfo - This class is derived from MachineFunction private
-/// MBlaze target-specific information for each MachineFunction.
-class MBlazeFunctionInfo : public MachineFunctionInfo {
-  virtual void anchor();
-
-  /// Holds for each function where on the stack the Frame Pointer must be
-  /// saved. This is used on Prologue and Epilogue to emit FP save/restore
-  int FPStackOffset;
-
-  /// Holds for each function where on the stack the Return Address must be
-  /// saved. This is used on Prologue and Epilogue to emit RA save/restore
-  int RAStackOffset;
-
-  /// MBlazeFIHolder - Holds a FrameIndex and it's Stack Pointer Offset
-  struct MBlazeFIHolder {
-
-    int FI;
-    int SPOffset;
-
-    MBlazeFIHolder(int FrameIndex, int StackPointerOffset)
-      : FI(FrameIndex), SPOffset(StackPointerOffset) {}
-  };
-
-  /// When PIC is used the GP must be saved on the stack on the function
-  /// prologue and must be reloaded from this stack location after every
-  /// call. A reference to its stack location and frame index must be kept
-  /// to be used on emitPrologue and processFunctionBeforeFrameFinalized.
-  MBlazeFIHolder GPHolder;
-
-  /// On LowerFormalArguments the stack size is unknown, so the Stack
-  /// Pointer Offset calculation of "not in register arguments" must be
-  /// postponed to emitPrologue.
-  SmallVector<MBlazeFIHolder, 16> FnLoadArgs;
-  bool HasLoadArgs;
-
-  // When VarArgs, we must write registers back to caller stack, preserving
-  // on register arguments. Since the stack size is unknown on
-  // LowerFormalArguments, the Stack Pointer Offset calculation must be
-  // postponed to emitPrologue.
-  SmallVector<MBlazeFIHolder, 4> FnStoreVarArgs;
-  bool HasStoreVarArgs;
-
-  // When determining the final stack layout some of the frame indexes may
-  // be replaced by new frame indexes that reside in the caller's stack
-  // frame. The replacements are recorded in this structure.
-  DenseMap<int,int> FIReplacements;
-
-  /// SRetReturnReg - Some subtargets require that sret lowering includes
-  /// returning the value of the returned struct in a register. This field
-  /// holds the virtual register into which the sret argument is passed.
-  unsigned SRetReturnReg;
-
-  /// GlobalBaseReg - keeps track of the virtual register initialized for
-  /// use as the global base register. This is used for PIC in some PIC
-  /// relocation models.
-  unsigned GlobalBaseReg;
-
-  // VarArgsFrameIndex - FrameIndex for start of varargs area.
-  int VarArgsFrameIndex;
-
-  /// LiveInFI - keeps track of the frame indexes in a callers stack
-  /// frame that are live into a function.
-  SmallVector<int, 16> LiveInFI;
-
-public:
-  MBlazeFunctionInfo(MachineFunction& MF)
-  : FPStackOffset(0), RAStackOffset(0), GPHolder(-1,-1), HasLoadArgs(false),
-    HasStoreVarArgs(false), SRetReturnReg(0), GlobalBaseReg(0),
-    VarArgsFrameIndex(0), LiveInFI()
-  {}
-
-  int getFPStackOffset() const { return FPStackOffset; }
-  void setFPStackOffset(int Off) { FPStackOffset = Off; }
-
-  int getRAStackOffset() const { return RAStackOffset; }
-  void setRAStackOffset(int Off) { RAStackOffset = Off; }
-
-  int getGPStackOffset() const { return GPHolder.SPOffset; }
-  int getGPFI() const { return GPHolder.FI; }
-  void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; }
-  void setGPFI(int FI) { GPHolder.FI = FI; }
-  bool needGPSaveRestore() const { return GPHolder.SPOffset != -1; }
-
-  bool hasLoadArgs() const { return HasLoadArgs; }
-  bool hasStoreVarArgs() const { return HasStoreVarArgs; }
-
-  void recordLiveIn(int FI) {
-    LiveInFI.push_back(FI);
-  }
-
-  bool isLiveIn(int FI) {
-    for (unsigned i = 0, e = LiveInFI.size(); i < e; ++i)
-      if (FI == LiveInFI[i]) return true;
-
-    return false;
-  }
-
-  const SmallVector<int, 16>& getLiveIn() const { return LiveInFI; }
-
-  void recordReplacement(int OFI, int NFI) {
-    FIReplacements.insert(std::make_pair(OFI,NFI));
-  }
-
-  bool hasReplacement(int OFI) const {
-    return FIReplacements.find(OFI) != FIReplacements.end();
-  }
-
-  int getReplacement(int OFI) const {
-    return FIReplacements.lookup(OFI);
-  }
-
-  void recordLoadArgsFI(int FI, int SPOffset) {
-    if (!HasLoadArgs) HasLoadArgs=true;
-    FnLoadArgs.push_back(MBlazeFIHolder(FI, SPOffset));
-  }
-
-  void recordStoreVarArgsFI(int FI, int SPOffset) {
-    if (!HasStoreVarArgs) HasStoreVarArgs=true;
-    FnStoreVarArgs.push_back(MBlazeFIHolder(FI, SPOffset));
-  }
-
-  void adjustLoadArgsFI(MachineFrameInfo *MFI) const {
-    if (!hasLoadArgs()) return;
-    for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i)
-      MFI->setObjectOffset(FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset);
-  }
-
-  void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const {
-    if (!hasStoreVarArgs()) return;
-    for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i)
-      MFI->setObjectOffset(FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset);
-  }
-
-  unsigned getSRetReturnReg() const { return SRetReturnReg; }
-  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
-
-  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
-  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
-
-  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
-  void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
-};
-
-} // end of namespace llvm
-
-#endif // MBLAZE_MACHINE_FUNCTION_INFO_H
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
deleted file mode 100644
index bd83afc..0000000
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ /dev/null
@@ -1,145 +0,0 @@
-//===-- MBlazeRegisterInfo.cpp - MBlaze Register Information --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the MBlaze implementation of the TargetRegisterInfo
-// class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mblaze-frame-info"
-
-#include "MBlazeRegisterInfo.h"
-#include "MBlaze.h"
-#include "MBlazeMachineFunction.h"
-#include "MBlazeSubtarget.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-
-#define GET_REGINFO_TARGET_DESC
-#include "MBlazeGenRegisterInfo.inc"
-
-using namespace llvm;
-
-MBlazeRegisterInfo::
-MBlazeRegisterInfo(const MBlazeSubtarget &ST, const TargetInstrInfo &tii)
-  : MBlazeGenRegisterInfo(MBlaze::R15), Subtarget(ST), TII(tii) {}
-
-unsigned MBlazeRegisterInfo::getPICCallReg() {
-  return MBlaze::R20;
-}
-
-//===----------------------------------------------------------------------===//
-// Callee Saved Registers methods
-//===----------------------------------------------------------------------===//
-
-/// MBlaze Callee Saved Registers
-const uint16_t* MBlazeRegisterInfo::
-getCalleeSavedRegs(const MachineFunction *MF) const {
-  // MBlaze callee-save register range is R20 - R31
-  static const uint16_t CalleeSavedRegs[] = {
-    MBlaze::R20, MBlaze::R21, MBlaze::R22, MBlaze::R23,
-    MBlaze::R24, MBlaze::R25, MBlaze::R26, MBlaze::R27,
-    MBlaze::R28, MBlaze::R29, MBlaze::R30, MBlaze::R31,
-    0
-  };
-
-  return CalleeSavedRegs;
-}
-
-BitVector MBlazeRegisterInfo::
-getReservedRegs(const MachineFunction &MF) const {
-  BitVector Reserved(getNumRegs());
-  Reserved.set(MBlaze::R0);
-  Reserved.set(MBlaze::R1);
-  Reserved.set(MBlaze::R2);
-  Reserved.set(MBlaze::R13);
-  Reserved.set(MBlaze::R14);
-  Reserved.set(MBlaze::R15);
-  Reserved.set(MBlaze::R16);
-  Reserved.set(MBlaze::R17);
-  Reserved.set(MBlaze::R18);
-  Reserved.set(MBlaze::R19);
-  return Reserved;
-}
-
-// FrameIndex represent objects inside a abstract stack.
-// We must replace FrameIndex with an stack/frame pointer
-// direct reference.
-void MBlazeRegisterInfo::
-eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
-                    unsigned FIOperandNum, RegScavenger *RS) const {
-  MachineInstr &MI = *II;
-  MachineFunction &MF = *MI.getParent()->getParent();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  unsigned OFIOperandNum = FIOperandNum == 2 ? 1 : 2;
-
-  DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n";
-        dbgs() << "<--------->\n" << MI);
-
-  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
-  int stackSize  = MFI->getStackSize();
-  int spOffset   = MFI->getObjectOffset(FrameIndex);
-
-  DEBUG(MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-        dbgs() << "FrameIndex : " << FrameIndex << "\n"
-               << "spOffset   : " << spOffset << "\n"
-               << "stackSize  : " << stackSize << "\n"
-               << "isFixed    : " << MFI->isFixedObjectIndex(FrameIndex) << "\n"
-               << "isLiveIn   : " << MBlazeFI->isLiveIn(FrameIndex) << "\n"
-               << "isSpill    : " << MFI->isSpillSlotObjectIndex(FrameIndex)
-               << "\n" );
-
-  // as explained on LowerFormalArguments, detect negative offsets
-  // and adjust SPOffsets considering the final stack size.
-  int Offset = (spOffset < 0) ? (stackSize - spOffset) : spOffset;
-  Offset += MI.getOperand(OFIOperandNum).getImm();
-
-  DEBUG(dbgs() << "Offset     : " << Offset << "\n" << "<--------->\n");
-
-  MI.getOperand(OFIOperandNum).ChangeToImmediate(Offset);
-  MI.getOperand(FIOperandNum).ChangeToRegister(getFrameRegister(MF), false);
-}
-
-void MBlazeRegisterInfo::
-processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *) const {
-  // Set the stack offset where GP must be saved/loaded from.
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MBlazeFunctionInfo *MBlazeFI = MF.getInfo<MBlazeFunctionInfo>();
-  if (MBlazeFI->needGPSaveRestore())
-    MFI->setObjectOffset(MBlazeFI->getGPFI(), MBlazeFI->getGPStackOffset());
-}
-
-unsigned MBlazeRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
-  return TFI->hasFP(MF) ? MBlaze::R19 : MBlaze::R1;
-}
-
-unsigned MBlazeRegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned MBlazeRegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.h b/lib/Target/MBlaze/MBlazeRegisterInfo.h
deleted file mode 100644
index 497f386..0000000
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.h
+++ /dev/null
@@ -1,71 +0,0 @@
-//===-- MBlazeRegisterInfo.h - MBlaze Register Information Impl -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the MBlaze implementation of the TargetRegisterInfo
-// class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZEREGISTERINFO_H
-#define MBLAZEREGISTERINFO_H
-
-#include "MBlaze.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-
-#define GET_REGINFO_HEADER
-#include "MBlazeGenRegisterInfo.inc"
-
-namespace llvm {
-class MBlazeSubtarget;
-class TargetInstrInfo;
-class Type;
-
-namespace MBlaze {
-  /// SubregIndex - The index of various sized subregister classes. Note that
-  /// these indices must be kept in sync with the class indices in the
-  /// MBlazeRegisterInfo.td file.
-  enum SubregIndex {
-    SUBREG_FPEVEN = 1, SUBREG_FPODD = 2
-  };
-}
-
-struct MBlazeRegisterInfo : public MBlazeGenRegisterInfo {
-  const MBlazeSubtarget &Subtarget;
-  const TargetInstrInfo &TII;
-
-  MBlazeRegisterInfo(const MBlazeSubtarget &Subtarget,
-                     const TargetInstrInfo &tii);
-
-  /// Get PIC indirect call register
-  static unsigned getPICCallReg();
-
-  /// Code Generation virtual methods...
-  const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
-
-  BitVector getReservedRegs(const MachineFunction &MF) const;
-
-  /// Stack Frame Processing Methods
-  void eliminateFrameIndex(MachineBasicBlock::iterator II,
-                           int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
-
-  void processFunctionBeforeFrameFinalized(MachineFunction &MF,
-                                           RegScavenger *RS = NULL) const;
-
-  /// Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const;
-
-  /// Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.td b/lib/Target/MBlaze/MBlazeRegisterInfo.td
deleted file mode 100644
index 64cae5c..0000000
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.td
+++ /dev/null
@@ -1,148 +0,0 @@
-//===-- MBlazeRegisterInfo.td - MBlaze Register defs -------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//  Declarations that describe the MicroBlaze register file
-//===----------------------------------------------------------------------===//
-
-// We have banks of 32 registers each.
-class MBlazeReg<string n> : Register<n> {
-  field bits<5> Num;
-  let Namespace = "MBlaze";
-}
-
-// Special purpose registers have 15-bit values
-class MBlazeSReg<string n> : Register<n> {
-  field bits<15> Num;
-  let Namespace = "MBlaze";
-}
-
-// MBlaze general purpose registers
-class MBlazeGPRReg<bits<5> num, string n> : MBlazeReg<n> {
-  let Num = num;
-}
-
-// MBlaze special purpose registers
-class MBlazeSPRReg<bits<15> num, string n> : MBlazeSReg<n> {
-  let Num = num;
-}
-
-//===----------------------------------------------------------------------===//
-//  Registers
-//===----------------------------------------------------------------------===//
-
-let Namespace = "MBlaze" in {
-  // General Purpose Registers
-  def R0  : MBlazeGPRReg< 0,  "r0">,   DwarfRegNum<[0]>;
-  def R1  : MBlazeGPRReg< 1,  "r1">,   DwarfRegNum<[1]>;
-  def R2  : MBlazeGPRReg< 2,  "r2">,   DwarfRegNum<[2]>;
-  def R3  : MBlazeGPRReg< 3,  "r3">,   DwarfRegNum<[3]>;
-  def R4  : MBlazeGPRReg< 4,  "r4">,   DwarfRegNum<[4]>;
-  def R5  : MBlazeGPRReg< 5,  "r5">,   DwarfRegNum<[5]>;
-  def R6  : MBlazeGPRReg< 6,  "r6">,   DwarfRegNum<[6]>;
-  def R7  : MBlazeGPRReg< 7,  "r7">,   DwarfRegNum<[7]>;
-  def R8  : MBlazeGPRReg< 8,  "r8">,   DwarfRegNum<[8]>;
-  def R9  : MBlazeGPRReg< 9,  "r9">,   DwarfRegNum<[9]>;
-  def R10 : MBlazeGPRReg< 10, "r10">,  DwarfRegNum<[10]>;
-  def R11 : MBlazeGPRReg< 11, "r11">,  DwarfRegNum<[11]>;
-  def R12 : MBlazeGPRReg< 12, "r12">,  DwarfRegNum<[12]>;
-  def R13 : MBlazeGPRReg< 13, "r13">,  DwarfRegNum<[13]>;
-  def R14 : MBlazeGPRReg< 14, "r14">,  DwarfRegNum<[14]>;
-  def R15 : MBlazeGPRReg< 15, "r15">,  DwarfRegNum<[15]>;
-  def R16 : MBlazeGPRReg< 16, "r16">,  DwarfRegNum<[16]>;
-  def R17 : MBlazeGPRReg< 17, "r17">,  DwarfRegNum<[17]>;
-  def R18 : MBlazeGPRReg< 18, "r18">,  DwarfRegNum<[18]>;
-  def R19 : MBlazeGPRReg< 19, "r19">,  DwarfRegNum<[19]>;
-  def R20 : MBlazeGPRReg< 20, "r20">,  DwarfRegNum<[20]>;
-  def R21 : MBlazeGPRReg< 21, "r21">,  DwarfRegNum<[21]>;
-  def R22 : MBlazeGPRReg< 22, "r22">,  DwarfRegNum<[22]>;
-  def R23 : MBlazeGPRReg< 23, "r23">,  DwarfRegNum<[23]>;
-  def R24 : MBlazeGPRReg< 24, "r24">,  DwarfRegNum<[24]>;
-  def R25 : MBlazeGPRReg< 25, "r25">,  DwarfRegNum<[25]>;
-  def R26 : MBlazeGPRReg< 26, "r26">,  DwarfRegNum<[26]>;
-  def R27 : MBlazeGPRReg< 27, "r27">,  DwarfRegNum<[27]>;
-  def R28 : MBlazeGPRReg< 28, "r28">,  DwarfRegNum<[28]>;
-  def R29 : MBlazeGPRReg< 29, "r29">,  DwarfRegNum<[29]>;
-  def R30 : MBlazeGPRReg< 30, "r30">,  DwarfRegNum<[30]>;
-  def R31 : MBlazeGPRReg< 31, "r31">,  DwarfRegNum<[31]>;
-
-  // Special Purpose Registers
-  def RPC    : MBlazeSPRReg<0x0000, "rpc">,    DwarfRegNum<[32]>;
-  def RMSR   : MBlazeSPRReg<0x0001, "rmsr">,   DwarfRegNum<[33]>;
-  def REAR   : MBlazeSPRReg<0x0003, "rear">,   DwarfRegNum<[34]>;
-  def RESR   : MBlazeSPRReg<0x0005, "resr">,   DwarfRegNum<[35]>;
-  def RFSR   : MBlazeSPRReg<0x0007, "rfsr">,   DwarfRegNum<[36]>;
-  def RBTR   : MBlazeSPRReg<0x000B, "rbtr">,   DwarfRegNum<[37]>;
-  def REDR   : MBlazeSPRReg<0x000D, "redr">,   DwarfRegNum<[38]>;
-  def RPID   : MBlazeSPRReg<0x1000, "rpid">,   DwarfRegNum<[39]>;
-  def RZPR   : MBlazeSPRReg<0x1001, "rzpr">,   DwarfRegNum<[40]>;
-  def RTLBX  : MBlazeSPRReg<0x1002, "rtlbx">,  DwarfRegNum<[41]>;
-  def RTLBLO : MBlazeSPRReg<0x1003, "rtlblo">, DwarfRegNum<[42]>;
-  def RTLBHI : MBlazeSPRReg<0x1004, "rtlbhi">, DwarfRegNum<[43]>;
-  def RTLBSX : MBlazeSPRReg<0x1004, "rtlbsx">, DwarfRegNum<[44]>;
-  def RPVR0  : MBlazeSPRReg<0x2000, "rpvr0">,  DwarfRegNum<[45]>;
-  def RPVR1  : MBlazeSPRReg<0x2001, "rpvr1">,  DwarfRegNum<[46]>;
-  def RPVR2  : MBlazeSPRReg<0x2002, "rpvr2">,  DwarfRegNum<[47]>;
-  def RPVR3  : MBlazeSPRReg<0x2003, "rpvr3">,  DwarfRegNum<[48]>;
-  def RPVR4  : MBlazeSPRReg<0x2004, "rpvr4">,  DwarfRegNum<[49]>;
-  def RPVR5  : MBlazeSPRReg<0x2005, "rpvr5">,  DwarfRegNum<[50]>;
-  def RPVR6  : MBlazeSPRReg<0x2006, "rpvr6">,  DwarfRegNum<[51]>;
-  def RPVR7  : MBlazeSPRReg<0x2007, "rpvr7">,  DwarfRegNum<[52]>;
-  def RPVR8  : MBlazeSPRReg<0x2008, "rpvr8">,  DwarfRegNum<[53]>;
-  def RPVR9  : MBlazeSPRReg<0x2009, "rpvr9">,  DwarfRegNum<[54]>;
-  def RPVR10 : MBlazeSPRReg<0x200A, "rpvr10">, DwarfRegNum<[55]>;
-  def RPVR11 : MBlazeSPRReg<0x200B, "rpvr11">, DwarfRegNum<[56]>;
-
-  // The carry bit. In the Microblaze this is really bit 29 of the
-  // MSR register but this is the only bit of that register that we
-  // are interested in modeling.
-  def CARRY  : MBlazeSPRReg<0x0000, "rmsr[c]">;
-}
-
-//===----------------------------------------------------------------------===//
-// Register Classes
-//===----------------------------------------------------------------------===//
-
-def GPR : RegisterClass<"MBlaze", [i32,f32], 32, (sequence "R%u", 0, 31)>;
-
-def SPR : RegisterClass<"MBlaze", [i32], 32, (add
-  // Reserved
-  RPC,
-  RMSR,
-  REAR,
-  RESR,
-  RFSR,
-  RBTR,
-  REDR,
-  RPID,
-  RZPR,
-  RTLBX,
-  RTLBLO,
-  RTLBHI,
-  RPVR0,
-  RPVR1,
-  RPVR2,
-  RPVR3,
-  RPVR4,
-  RPVR5,
-  RPVR6,
-  RPVR7,
-  RPVR8,
-  RPVR9,
-  RPVR10,
-  RPVR11
-  )>
-{
-  // None of the special purpose registers are allocatable.
-  let isAllocatable = 0;
-}
-
-def CRC : RegisterClass<"MBlaze", [i32], 32, (add CARRY)> {
-  let CopyCost = -1;
-}
diff --git a/lib/Target/MBlaze/MBlazeRelocations.h b/lib/Target/MBlaze/MBlazeRelocations.h
deleted file mode 100644
index 6387ee2..0000000
--- a/lib/Target/MBlaze/MBlazeRelocations.h
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- MBlazeRelocations.h - MBlaze Code Relocations -----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the MBlaze target-specific relocation types.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZERELOCATIONS_H
-#define MBLAZERELOCATIONS_H
-
-#include "llvm/CodeGen/MachineRelocation.h"
-
-namespace llvm {
-  namespace MBlaze {
-    enum RelocationType {
-      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
-      /// the value already in memory, after we adjust it for where the PC is.
-      reloc_pcrel_word = 0,
-
-      /// reloc_picrel_word - PIC base relative relocation, add the relocated
-      /// value to the value already in memory, after we adjust it for where the
-      /// PIC base is.
-      reloc_picrel_word = 1,
-
-      /// reloc_absolute_word - absolute relocation, just add the relocated
-      /// value to the value already in memory.
-      reloc_absolute_word = 2,
-
-      /// reloc_absolute_word_sext - absolute relocation, just add the relocated
-      /// value to the value already in memory. In object files, it represents a
-      /// value which must be sign-extended when resolving the relocation.
-      reloc_absolute_word_sext = 3,
-
-      /// reloc_absolute_dword - absolute relocation, just add the relocated
-      /// value to the value already in memory.
-      reloc_absolute_dword = 4
-    };
-  }
-}
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeSchedule.td b/lib/Target/MBlaze/MBlazeSchedule.td
deleted file mode 100644
index cd5691c..0000000
--- a/lib/Target/MBlaze/MBlazeSchedule.td
+++ /dev/null
@@ -1,50 +0,0 @@
-//===-- MBlazeSchedule.td - MBlaze Scheduling Definitions --*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MBlaze functional units.
-//===----------------------------------------------------------------------===//
-def IF : FuncUnit;
-def ID : FuncUnit;
-def EX : FuncUnit;
-def MA : FuncUnit;
-def WB : FuncUnit;
-
-//===----------------------------------------------------------------------===//
-// Instruction Itinerary classes used for MBlaze
-//===----------------------------------------------------------------------===//
-def IIC_ALU    : InstrItinClass;
-def IIC_ALUm   : InstrItinClass;
-def IIC_ALUd   : InstrItinClass;
-def IIC_SHT    : InstrItinClass;
-def IIC_FSLg   : InstrItinClass;
-def IIC_FSLp   : InstrItinClass;
-def IIC_MEMs   : InstrItinClass;
-def IIC_MEMl   : InstrItinClass;
-def IIC_FPU    : InstrItinClass;
-def IIC_FPUd   : InstrItinClass;
-def IIC_FPUf   : InstrItinClass;
-def IIC_FPUi   : InstrItinClass;
-def IIC_FPUs   : InstrItinClass;
-def IIC_FPUc   : InstrItinClass;
-def IIC_BR     : InstrItinClass;
-def IIC_BRc    : InstrItinClass;
-def IIC_BRl    : InstrItinClass;
-def IIC_WDC    : InstrItinClass;
-def IIC_Pseudo : InstrItinClass;
-
-//===----------------------------------------------------------------------===//
-// MBlaze instruction itineraries for three stage pipeline.
-//===----------------------------------------------------------------------===//
-include "MBlazeSchedule3.td"
-
-//===----------------------------------------------------------------------===//
-// MBlaze instruction itineraries for five stage pipeline.
-//===----------------------------------------------------------------------===//
-include "MBlazeSchedule5.td"
diff --git a/lib/Target/MBlaze/MBlazeSchedule3.td b/lib/Target/MBlaze/MBlazeSchedule3.td
deleted file mode 100644
index 20257a6..0000000
--- a/lib/Target/MBlaze/MBlazeSchedule3.td
+++ /dev/null
@@ -1,236 +0,0 @@
-//===-- MBlazeSchedule3.td - MBlaze Scheduling Definitions -*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MBlaze instruction itineraries for the three stage pipeline.
-//===----------------------------------------------------------------------===//
-def MBlazePipe3Itineraries : ProcessorItineraries<
-  [IF,ID,EX], [], [
-
-  // ALU instruction with one destination register and either two register
-  // source operands or one register source operand and one immediate operand.
-  // The instruction takes one cycle to execute in each of the stages. The
-  // two source operands are read during the decode stage and the result is
-  // ready after the execute stage.
-  InstrItinData< IIC_ALU,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>], // one cycle in execute stage
-               [ 2                    // result ready after two cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // ALU multiply instruction with one destination register and either two
-  // register source operands or one register source operand and one immediate
-  // operand.  The instruction takes one cycle to execute in each of the
-  // pipeline stages except the execute stage, which takes three cycles. The
-  // two source operands are read during the decode stage and the result is
-  // ready after the execute stage.
-  InstrItinData< IIC_ALUm,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<3,[EX]>], // three cycles in execute stage
-               [ 4                    // result ready after four cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // ALU divide instruction with one destination register two register source
-  // operands. The instruction takes one cycle to execute in each the pipeline
-  // stages except the execute stage, which takes 34 cycles. The two
-  // source operands are read during the decode stage and the result is ready
-  // after the execute stage.
-  InstrItinData< IIC_ALUd,
-               [ InstrStage<1,[IF]>    // one cycle in fetch stage
-               , InstrStage<1,[ID]>    // one cycle in decode stage
-               , InstrStage<34,[EX]>], // 34 cycles in execute stage
-               [ 35                    // result ready after 35 cycles
-               , 1                     // first operand read after one cycle
-               , 1 ]>,                 // second operand read after one cycle
-
-  // Shift instruction with one destination register and either two register
-  // source operands or one register source operand and one immediate operand.
-  // The instruction takes one cycle to execute in each of the pipeline stages
-  // except the execute stage, which takes two cycles.  The two source operands
-  // are read during the decode stage and the result is ready after the execute
-  // stage.
-  InstrItinData< IIC_SHT,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<2,[EX]>], // two cycles in execute stage
-               [ 3                    // result ready after three cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Branch instruction with one source operand register. The instruction takes
-  // one cycle to execute in each of the pipeline stages. The source operand is
-  // read during the decode stage.
-  InstrItinData< IIC_BR,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>], // one cycle in execute stage
-               [ 1 ]>,                // first operand read after one cycle
-
-  // Conditional branch instruction with two source operand registers. The
-  // instruction takes one cycle to execute in each of the pipeline stages. The
-  // two source operands are read during the decode stage.
-  InstrItinData< IIC_BRc,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>], // one cycle in execute stage
-               [ 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Branch and link instruction with one destination register and one source
-  // operand register. The instruction takes one cycle to execute in each of
-  // the pipeline stages. The source operand is read during the decode stage
-  // and the destination register is ready after the execute stage.
-  InstrItinData< IIC_BRl,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>], // one cycle in execute stage
-               [ 2                    // result ready after two cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // Cache control instruction with two source operand registers. The
-  // instruction takes one cycle to execute in each of the pipeline stages
-  // except the memory access stage, which takes two cycles. The source
-  // operands are read during the decode stage.
-  InstrItinData< IIC_WDC,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<2,[EX]>], // two cycles in execute stage
-               [ 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Floating point instruction with one destination register and two source
-  // operand registers. The instruction takes one cycle to execute in each of
-  // the pipeline stages except the execute stage, which takes six cycles. The
-  // source operands are read during the decode stage and the results are ready
-  // after the execute stage.
-  InstrItinData< IIC_FPU,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<6,[EX]>], // six cycles in execute stage
-               [ 7                    // result ready after seven cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Floating point divide instruction with one destination register and two
-  // source operand registers. The instruction takes one cycle to execute in
-  // each of the pipeline stages except the execute stage, which takes 30
-  // cycles. The source operands are read during the decode stage and the
-  // results are ready after the execute stage.
-  InstrItinData< IIC_FPUd,
-               [ InstrStage<1,[IF]>    // one cycle in fetch stage
-               , InstrStage<1,[ID]>    // one cycle in decode stage
-               , InstrStage<30,[EX]>], // one cycle in execute stage
-               [ 31                    // result ready after 31 cycles
-               , 1                     // first operand read after one cycle
-               , 1 ]>,                 // second operand read after one cycle
-
-  // Convert floating point to integer instruction with one destination
-  // register and one source operand register. The instruction takes one cycle
-  // to execute in each of the pipeline stages except the execute stage,
-  // which takes seven cycles. The source operands are read during the decode
-  // stage and the results are ready after the execute stage.
-  InstrItinData< IIC_FPUi,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<7,[EX]>], // seven cycles in execute stage
-               [ 8                    // result ready after eight cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // Convert integer to floating point instruction with one destination
-  // register and one source operand register. The instruction takes one cycle
-  // to execute in each of the pipeline stages except the execute stage,
-  // which takes six cycles. The source operands are read during the decode
-  // stage and the results are ready after the execute stage.
-  InstrItinData< IIC_FPUf,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<6,[EX]>], // six cycles in execute stage
-               [ 7                    // result ready after seven cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // Floating point square root instruction with one destination register and
-  // one source operand register. The instruction takes one cycle to execute in
-  // each of the pipeline stages except the execute stage, which takes 29
-  // cycles. The source operands are read during the decode stage and the
-  // results are ready after the execute stage.
-  InstrItinData< IIC_FPUs,
-               [ InstrStage<1,[IF]>    // one cycle in fetch stage
-               , InstrStage<1,[ID]>    // one cycle in decode stage
-               , InstrStage<29,[EX]>], // 29 cycles in execute stage
-               [ 30                    // result ready after 30 cycles
-               , 1 ]>,                 // first operand read after one cycle
-
-  // Floating point comparison instruction with one destination register and
-  // two source operand registers. The instruction takes one cycle to execute
-  // in each of the pipeline stages except the execute stage, which takes three
-  // cycles. The source operands are read during the decode stage and the
-  // results are ready after the execute stage.
-  InstrItinData< IIC_FPUc,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<3,[EX]>], // three cycles in execute stage
-               [ 4                    // result ready after four cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // FSL get instruction with one register or immediate source operand and one
-  // destination register. The instruction takes one cycle to execute in each
-  // of the pipeline stages except the execute stage, which takes two cycles.
-  // The one source operand is read during the decode stage and the result is
-  // ready after the execute stage.
-  InstrItinData< IIC_FSLg,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<2,[EX]>], // two cycles in execute stage
-               [ 3                    // result ready after two cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // FSL put instruction with either two register source operands or one
-  // register source operand and one immediate operand. There is no result
-  // produced by the instruction. The instruction takes one cycle to execute in
-  // each of the pipeline stages except the execute stage, which takes two
-  // cycles. The two source operands are read during the decode stage.
-  InstrItinData< IIC_FSLp,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<2,[EX]>], // two cycles in execute stage
-               [ 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Memory store instruction with either three register source operands or two
-  // register source operands and one immediate operand. There is no result
-  // produced by the instruction. The instruction takes one cycle to execute in
-  // each of the pipeline stages except the execute stage, which takes two
-  // cycles. All of the source operands are read during the decode stage.
-  InstrItinData< IIC_MEMs,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<2,[EX]>], // two cycles in execute stage
-               [ 1                    // first operand read after one cycle
-               , 1                    // second operand read after one cycle
-               , 1 ]>,                // third operand read after one cycle
-
-  // Memory load instruction with one destination register and either two
-  // register source operands or one register source operand and one immediate
-  // operand. The instruction takes one cycle to execute in each of the
-  // pipeline stages except the execute stage, which takes two cycles. All of
-  // the source operands are read during the decode stage and the result is
-  // ready after the execute stage.
-  InstrItinData< IIC_MEMl,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<2,[EX]>], // two cycles in execute stage
-               [ 3                    // result ready after four cycles
-               , 1                    // second operand read after one cycle
-               , 1 ]>                 // third operand read after one cycle
-]>;
diff --git a/lib/Target/MBlaze/MBlazeSchedule5.td b/lib/Target/MBlaze/MBlazeSchedule5.td
deleted file mode 100644
index ab53b42..0000000
--- a/lib/Target/MBlaze/MBlazeSchedule5.td
+++ /dev/null
@@ -1,267 +0,0 @@
-//===-- MBlazeSchedule5.td - MBlaze Scheduling Definitions -*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MBlaze instruction itineraries for the five stage pipeline.
-//===----------------------------------------------------------------------===//
-def MBlazePipe5Itineraries : ProcessorItineraries<
-  [IF,ID,EX,MA,WB], [], [
-
-  // ALU instruction with one destination register and either two register
-  // source operands or one register source operand and one immediate operand.
-  // The instruction takes one cycle to execute in each of the stages. The
-  // two source operands are read during the decode stage and the result is
-  // ready after the execute stage.
-  InstrItinData< IIC_ALU,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 2                    // result ready after two cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // ALU multiply instruction with one destination register and either two
-  // register source operands or one register source operand and one immediate
-  // operand.  The instruction takes one cycle to execute in each of the
-  // pipeline stages. The two source operands are read during the decode stage
-  // and the result is ready after the execute stage.
-  InstrItinData< IIC_ALUm,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 2                    // result ready after two cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // ALU divide instruction with one destination register two register source
-  // operands. The instruction takes one cycle to execute in each the pipeline
-  // stages except the memory access stage, which takes 31 cycles. The two
-  // source operands are read during the decode stage and the result is ready
-  // after the memory access stage.
-  InstrItinData< IIC_ALUd,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<31,[MA]>  // 31 cycles in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 33                   // result ready after 33 cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Shift instruction with one destination register and either two register
-  // source operands or one register source operand and one immediate operand.
-  // The instruction takes one cycle to execute in each of the pipeline stages.
-  // The two source operands are read during the decode stage and the result is
-  // ready after the memory access stage.
-  InstrItinData< IIC_SHT,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 3                    // result ready after three cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Branch instruction with one source operand register. The instruction takes
-  // one cycle to execute in each of the pipeline stages. The source operand is
-  // read during the decode stage.
-  InstrItinData< IIC_BR,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 1 ]>,                // first operand read after one cycle
-
-  // Conditional branch instruction with two source operand registers. The
-  // instruction takes one cycle to execute in each of the pipeline stages. The
-  // two source operands are read during the decode stage.
-  InstrItinData< IIC_BRc,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Branch and link instruction with one destination register and one source
-  // operand register. The instruction takes one cycle to execute in each of
-  // the pipeline stages. The source operand is read during the decode stage
-  // and the destination register is ready after the writeback stage.
-  InstrItinData< IIC_BRl,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 4                    // result ready after four cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // Cache control instruction with two source operand registers. The
-  // instruction takes one cycle to execute in each of the pipeline stages
-  // except the memory access stage, which takes two cycles. The source
-  // operands are read during the decode stage.
-  InstrItinData< IIC_WDC,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<2,[MA]>   // two cycles in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Floating point instruction with one destination register and two source
-  // operand registers. The instruction takes one cycle to execute in each of
-  // the pipeline stages except the memory access stage, which takes two
-  // cycles. The source operands are read during the decode stage and the
-  // results are ready after the writeback stage.
-  InstrItinData< IIC_FPU,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<2,[MA]>   // two cycles in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 5                    // result ready after five cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Floating point divide instruction with one destination register and two
-  // source operand registers. The instruction takes one cycle to execute in
-  // each of the pipeline stages except the memory access stage, which takes 26
-  // cycles. The source operands are read during the decode stage and the
-  // results are ready after the writeback stage.
-  InstrItinData< IIC_FPUd,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<26,[MA]>  // 26 cycles in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 29                   // result ready after 29 cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Convert floating point to integer instruction with one destination
-  // register and one source operand register. The instruction takes one cycle
-  // to execute in each of the pipeline stages except the memory access stage,
-  // which takes three cycles. The source operands are read during the decode
-  // stage and the results are ready after the writeback stage.
-  InstrItinData< IIC_FPUi,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<3,[MA]>   // three cycles in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 6                   // result ready after six cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // Convert integer to floating point instruction with one destination
-  // register and one source operand register. The instruction takes one cycle
-  // to execute in each of the pipeline stages except the memory access stage,
-  // which takes two cycles. The source operands are read during the decode
-  // stage and the results are ready after the writeback stage.
-  InstrItinData< IIC_FPUf,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<2,[MA]>   // two cycles in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 5                    // result ready after five cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // Floating point square root instruction with one destination register and
-  // one source operand register. The instruction takes one cycle to execute in
-  // each of the pipeline stages except the memory access stage, which takes 25
-  // cycles. The source operands are read during the decode stage and the
-  // results are ready after the writeback stage.
-  InstrItinData< IIC_FPUs,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<25,[MA]>  // 25 cycles in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 28                   // result ready after 28 cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // Floating point comparison instruction with one destination register and
-  // two source operand registers. The instruction takes one cycle to execute
-  // in each of the pipeline stages. The source operands are read during the
-  // decode stage and the results are ready after the execute stage.
-  InstrItinData< IIC_FPUc,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 2                    // result ready after two cycles
-               , 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // FSL get instruction with one register or immediate source operand and one
-  // destination register. The instruction takes one cycle to execute in each
-  // of the pipeline stages. The one source operand is read during the decode
-  // stage and the result is ready after the execute stage.
-  InstrItinData< IIC_FSLg,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 2                    // result ready after two cycles
-               , 1 ]>,                // first operand read after one cycle
-
-  // FSL put instruction with either two register source operands or one
-  // register source operand and one immediate operand. There is no result
-  // produced by the instruction. The instruction takes one cycle to execute in
-  // each of the pipeline stages. The two source operands are read during the
-  // decode stage.
-  InstrItinData< IIC_FSLp,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 1                    // first operand read after one cycle
-               , 1 ]>,                // second operand read after one cycle
-
-  // Memory store instruction with either three register source operands or two
-  // register source operands and one immediate operand. There is no result
-  // produced by the instruction. The instruction takes one cycle to execute in
-  // each of the pipeline stages. All of the source operands are read during
-  // the decode stage.
-  InstrItinData< IIC_MEMs,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 1                    // first operand read after one cycle
-               , 1                    // second operand read after one cycle
-               , 1 ]>,                // third operand read after one cycle
-
-  // Memory load instruction with one destination register and either two
-  // register source operands or one register source operand and one immediate
-  // operand. The instruction takes one cycle to execute in each of the
-  // pipeline stages. All of the source operands are read during the decode
-  // stage and the result is ready after the writeback stage.
-  InstrItinData< IIC_MEMl,
-               [ InstrStage<1,[IF]>   // one cycle in fetch stage
-               , InstrStage<1,[ID]>   // one cycle in decode stage
-               , InstrStage<1,[EX]>   // one cycle in execute stage
-               , InstrStage<1,[MA]>   // one cycle in memory access stage
-               , InstrStage<1,[WB]>], // one cycle in write back stage
-               [ 4                    // result ready after four cycles
-               , 1                    // second operand read after one cycle
-               , 1 ]>                 // third operand read after one cycle
-]>;
diff --git a/lib/Target/MBlaze/MBlazeSelectionDAGInfo.h b/lib/Target/MBlaze/MBlazeSelectionDAGInfo.h
deleted file mode 100644
index 9f8e2aa..0000000
--- a/lib/Target/MBlaze/MBlazeSelectionDAGInfo.h
+++ /dev/null
@@ -1,31 +0,0 @@
-//===-- MBlazeSelectionDAGInfo.h - MBlaze SelectionDAG Info -----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the MBlaze subclass for TargetSelectionDAGInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZESELECTIONDAGINFO_H
-#define MBLAZESELECTIONDAGINFO_H
-
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-
-namespace llvm {
-
-class MBlazeTargetMachine;
-
-class MBlazeSelectionDAGInfo : public TargetSelectionDAGInfo {
-public:
-  explicit MBlazeSelectionDAGInfo(const MBlazeTargetMachine &TM);
-  ~MBlazeSelectionDAGInfo();
-};
-
-}
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.cpp b/lib/Target/MBlaze/MBlazeSubtarget.cpp
deleted file mode 100644
index dc2ad29..0000000
--- a/lib/Target/MBlaze/MBlazeSubtarget.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-//===-- MBlazeSubtarget.cpp - MBlaze Subtarget Information ----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the MBlaze specific subclass of TargetSubtargetInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeSubtarget.h"
-#include "MBlaze.h"
-#include "MBlazeRegisterInfo.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_SUBTARGETINFO_TARGET_DESC
-#define GET_SUBTARGETINFO_CTOR
-#include "MBlazeGenSubtargetInfo.inc"
-
-using namespace llvm;
-
-MBlazeSubtarget::MBlazeSubtarget(const std::string &TT,
-                                 const std::string &CPU,
-                                 const std::string &FS):
-  MBlazeGenSubtargetInfo(TT, CPU, FS),
-  HasBarrel(false), HasDiv(false), HasMul(false), HasPatCmp(false),
-  HasFPU(false), HasMul64(false), HasSqrt(false)
-{
-  // Parse features string.
-  std::string CPUName = CPU;
-  if (CPUName.empty())
-    CPUName = "mblaze";
-  ParseSubtargetFeatures(CPUName, FS);
-
-  // Only use instruction scheduling if the selected CPU has an instruction
-  // itinerary (the default CPU is the only one that doesn't).
-  HasItin = CPUName != "mblaze";
-  DEBUG(dbgs() << "CPU " << CPUName << "(" << HasItin << ")\n");
-
-  // Initialize scheduling itinerary for the specified CPU.
-  InstrItins = getInstrItineraryForCPU(CPUName);
-}
-
-bool MBlazeSubtarget::
-enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                      TargetSubtargetInfo::AntiDepBreakMode& Mode,
-                      RegClassVector& CriticalPathRCs) const {
-  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
-  CriticalPathRCs.clear();
-  CriticalPathRCs.push_back(&MBlaze::GPRRegClass);
-  return HasItin && OptLevel >= CodeGenOpt::Default;
-}
diff --git a/lib/Target/MBlaze/MBlazeSubtarget.h b/lib/Target/MBlaze/MBlazeSubtarget.h
deleted file mode 100644
index ed43d21..0000000
--- a/lib/Target/MBlaze/MBlazeSubtarget.h
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- MBlazeSubtarget.h - Define Subtarget for the MBlaze ----*- C++ -*--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the MBlaze specific subclass of TargetSubtargetInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZESUBTARGET_H
-#define MBLAZESUBTARGET_H
-
-#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
-#include <string>
-
-#define GET_SUBTARGETINFO_HEADER
-#include "MBlazeGenSubtargetInfo.inc"
-
-namespace llvm {
-class StringRef;
-
-class MBlazeSubtarget : public MBlazeGenSubtargetInfo {
-
-protected:
-  bool HasBarrel;
-  bool HasDiv;
-  bool HasMul;
-  bool HasPatCmp;
-  bool HasFPU;
-  bool HasMul64;
-  bool HasSqrt;
-  bool HasItin;
-
-  InstrItineraryData InstrItins;
-
-public:
-
-  /// This constructor initializes the data members to match that
-  /// of the specified triple.
-  MBlazeSubtarget(const std::string &TT, const std::string &CPU,
-                  const std::string &FS);
-
-  /// ParseSubtargetFeatures - Parses features string setting specified
-  /// subtarget options.  Definition of function is auto generated by tblgen.
-  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
-  /// Compute the number of maximum number of issues per cycle for the
-  /// MBlaze scheduling itineraries.
-  void computeIssueWidth();
-
-  /// enablePostRAScheduler - True at 'More' optimization.
-  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                             TargetSubtargetInfo::AntiDepBreakMode& Mode,
-                             RegClassVector& CriticalPathRCs) const;
-
-  /// getInstrItins - Return the instruction itineraies based on subtarget.
-  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
-
-  bool hasItin()   const { return HasItin; }
-  bool hasPCMP()   const { return HasPatCmp; }
-  bool hasFPU()    const { return HasFPU; }
-  bool hasSqrt()   const { return HasSqrt; }
-  bool hasMul()    const { return HasMul; }
-  bool hasMul64()  const { return HasMul64; }
-  bool hasDiv()    const { return HasDiv; }
-  bool hasBarrel() const { return HasBarrel; }
-};
-} // End llvm namespace
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
deleted file mode 100644
index bcdd32f..0000000
--- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===-- MBlazeTargetMachine.cpp - Define TargetMachine for MBlaze ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Implements the info about MBlaze target spec.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeTargetMachine.h"
-#include "MBlaze.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/PassManager.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetOptions.h"
-using namespace llvm;
-
-extern "C" void LLVMInitializeMBlazeTarget() {
-  // Register the target.
-  RegisterTargetMachine<MBlazeTargetMachine> X(TheMBlazeTarget);
-}
-
-// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
-// The stack is always 8 byte aligned
-// On function prologue, the stack is created by decrementing
-// its pointer. Once decremented, all references are done with positive
-// offset from the stack/frame pointer, using StackGrowsUp enables
-// an easier handling.
-MBlazeTargetMachine::
-MBlazeTargetMachine(const Target &T, StringRef TT,
-                    StringRef CPU, StringRef FS, const TargetOptions &Options,
-                    Reloc::Model RM, CodeModel::Model CM,
-                    CodeGenOpt::Level OL)
-  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-    Subtarget(TT, CPU, FS),
-    DL("E-p:32:32:32-i8:8:8-i16:16:16"),
-    InstrInfo(*this),
-    FrameLowering(Subtarget),
-    TLInfo(*this), TSInfo(*this),
-    InstrItins(Subtarget.getInstrItineraryData()) {
-}
-
-namespace {
-/// MBlaze Code Generator Pass Configuration Options.
-class MBlazePassConfig : public TargetPassConfig {
-public:
-  MBlazePassConfig(MBlazeTargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
-
-  MBlazeTargetMachine &getMBlazeTargetMachine() const {
-    return getTM<MBlazeTargetMachine>();
-  }
-
-  virtual bool addInstSelector();
-  virtual bool addPreEmitPass();
-};
-} // namespace
-
-TargetPassConfig *MBlazeTargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new MBlazePassConfig(this, PM);
-}
-
-// Install an instruction selector pass using
-// the ISelDag to gen MBlaze code.
-bool MBlazePassConfig::addInstSelector() {
-  addPass(createMBlazeISelDag(getMBlazeTargetMachine()));
-  return false;
-}
-
-// Implemented by targets that want to run passes immediately before
-// machine code is emitted. return true if -print-machineinstrs should
-// print out the code after the passes.
-bool MBlazePassConfig::addPreEmitPass() {
-  addPass(createMBlazeDelaySlotFillerPass(getMBlazeTargetMachine()));
-  return true;
-}
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h
deleted file mode 100644
index 956794d..0000000
--- a/lib/Target/MBlaze/MBlazeTargetMachine.h
+++ /dev/null
@@ -1,80 +0,0 @@
-//===-- MBlazeTargetMachine.h - Define TargetMachine for MBlaze -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the MBlaze specific subclass of TargetMachine.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZE_TARGETMACHINE_H
-#define MBLAZE_TARGETMACHINE_H
-
-#include "MBlazeFrameLowering.h"
-#include "MBlazeISelLowering.h"
-#include "MBlazeInstrInfo.h"
-#include "MBlazeIntrinsicInfo.h"
-#include "MBlazeSelectionDAGInfo.h"
-#include "MBlazeSubtarget.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetMachine.h"
-
-namespace llvm {
-  class formatted_raw_ostream;
-
-  class MBlazeTargetMachine : public LLVMTargetMachine {
-    MBlazeSubtarget        Subtarget;
-    const DataLayout       DL; // Calculates type size & alignment
-    MBlazeInstrInfo        InstrInfo;
-    MBlazeFrameLowering    FrameLowering;
-    MBlazeTargetLowering   TLInfo;
-    MBlazeSelectionDAGInfo TSInfo;
-    MBlazeIntrinsicInfo    IntrinsicInfo;
-    InstrItineraryData     InstrItins;
-
-  public:
-    MBlazeTargetMachine(const Target &T, StringRef TT,
-                        StringRef CPU, StringRef FS,
-                        const TargetOptions &Options,
-                        Reloc::Model RM, CodeModel::Model CM,
-                        CodeGenOpt::Level OL);
-
-    virtual const MBlazeInstrInfo *getInstrInfo() const
-    { return &InstrInfo; }
-
-    virtual const InstrItineraryData *getInstrItineraryData() const
-    {  return &InstrItins; }
-
-    virtual const TargetFrameLowering *getFrameLowering() const
-    { return &FrameLowering; }
-
-    virtual const MBlazeSubtarget *getSubtargetImpl() const
-    { return &Subtarget; }
-
-    virtual const DataLayout *getDataLayout() const
-    { return &DL;}
-
-    virtual const MBlazeRegisterInfo *getRegisterInfo() const
-    { return &InstrInfo.getRegisterInfo(); }
-
-    virtual const MBlazeTargetLowering *getTargetLowering() const
-    { return &TLInfo; }
-
-    virtual const MBlazeSelectionDAGInfo* getSelectionDAGInfo() const
-    { return &TSInfo; }
-
-    const TargetIntrinsicInfo *getIntrinsicInfo() const
-    { return &IntrinsicInfo; }
-
-    // Pass Pipeline Configuration
-    virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
-  };
-} // End llvm namespace
-
-#endif
diff --git a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
deleted file mode 100644
index a7a0a68..0000000
--- a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-//===-- MBlazeTargetObjectFile.cpp - MBlaze object files ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeTargetObjectFile.h"
-#include "MBlazeSubtarget.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-void MBlazeTargetObjectFile::
-Initialize(MCContext &Ctx, const TargetMachine &TM) {
-  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
-
-  SmallDataSection =
-    getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
-                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
-                               SectionKind::getDataRel());
-
-  SmallBSSSection =
-    getContext().getELFSection(".sbss", ELF::SHT_NOBITS,
-                               ELF::SHF_WRITE |ELF::SHF_ALLOC,
-                               SectionKind::getBSS());
-
-}
-
-// A address must be loaded from a small section if its size is less than the
-// small section size threshold. Data in this section must be addressed using
-// gp_rel operator.
-static bool IsInSmallSection(uint64_t Size) {
-  return Size > 0 && Size <= 8;
-}
-
-bool MBlazeTargetObjectFile::
-IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM) const {
-  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
-    return false;
-
-  return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
-}
-
-/// IsGlobalInSmallSection - Return true if this global address should be
-/// placed into small data/bss section.
-bool MBlazeTargetObjectFile::
-IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
-                       SectionKind Kind) const {
-  // Only global variables, not functions.
-  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
-  if (!GVA)
-    return false;
-
-  // We can only do this for datarel or BSS objects for now.
-  if (!Kind.isBSS() && !Kind.isDataRel())
-    return false;
-
-  // If this is a internal constant string, there is a special
-  // section for it, but not in small data/bss.
-  if (Kind.isMergeable1ByteCString())
-    return false;
-
-  Type *Ty = GV->getType()->getElementType();
-  return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
-}
-
-const MCSection *MBlazeTargetObjectFile::
-SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
-                       Mangler *Mang, const TargetMachine &TM) const {
-  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
-  // sections?
-
-  // Handle Small Section classification here.
-  if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
-    return SmallBSSSection;
-  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
-    return SmallDataSection;
-
-  // Otherwise, we work the same as ELF.
-  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
-}
diff --git a/lib/Target/MBlaze/MBlazeTargetObjectFile.h b/lib/Target/MBlaze/MBlazeTargetObjectFile.h
deleted file mode 100644
index c313722..0000000
--- a/lib/Target/MBlaze/MBlazeTargetObjectFile.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===-- llvm/Target/MBlazeTargetObjectFile.h - MBlaze Obj. Info -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_MBLAZE_TARGETOBJECTFILE_H
-#define LLVM_TARGET_MBLAZE_TARGETOBJECTFILE_H
-
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-
-namespace llvm {
-
-  class MBlazeTargetObjectFile : public TargetLoweringObjectFileELF {
-    const MCSection *SmallDataSection;
-    const MCSection *SmallBSSSection;
-  public:
-
-    void Initialize(MCContext &Ctx, const TargetMachine &TM);
-
-    /// IsGlobalInSmallSection - Return true if this global address should be
-    /// placed into small data/bss section.
-    bool IsGlobalInSmallSection(const GlobalValue *GV,
-                                const TargetMachine &TM,
-                                SectionKind Kind) const;
-
-    bool IsGlobalInSmallSection(const GlobalValue *GV,
-                                const TargetMachine &TM) const;
-
-    const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
-                                            SectionKind Kind,
-                                            Mangler *Mang,
-                                            const TargetMachine &TM) const;
-  };
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt b/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
deleted file mode 100644
index 36134a6..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-add_llvm_library(LLVMMBlazeDesc
-  MBlazeAsmBackend.cpp
-  MBlazeMCAsmInfo.cpp
-  MBlazeMCCodeEmitter.cpp
-  MBlazeMCTargetDesc.cpp
-  MBlazeELFObjectWriter.cpp
-  )
-
-add_dependencies(LLVMMBlazeDesc MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt b/lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt
deleted file mode 100644
index 4982f0f..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-;===- ./lib/Target/MBlaze/MCTargetDesc/LLVMBuild.txt -----------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = MBlazeDesc
-parent = MBlaze
-required_libraries = MBlazeAsmPrinter MBlazeInfo MC Support
-add_to_library_groups = MBlaze
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
deleted file mode 100644
index 6f9752c..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
+++ /dev/null
@@ -1,171 +0,0 @@
-//===-- MBlazeAsmBackend.cpp - MBlaze Assembler Backend -------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/MBlazeMCTargetDesc.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-static unsigned getFixupKindSize(unsigned Kind) {
-  switch (Kind) {
-  default: llvm_unreachable("invalid fixup kind!");
-  case FK_Data_1: return 1;
-  case FK_PCRel_2:
-  case FK_Data_2: return 2;
-  case FK_PCRel_4:
-  case FK_Data_4: return 4;
-  case FK_Data_8: return 8;
-  }
-}
-
-
-namespace {
-
-class MBlazeAsmBackend : public MCAsmBackend {
-public:
-  MBlazeAsmBackend(const Target &T)
-    : MCAsmBackend() {
-  }
-
-  unsigned getNumFixupKinds() const {
-    return 2;
-  }
-
-  bool mayNeedRelaxation(const MCInst &Inst) const;
-
-  bool fixupNeedsRelaxation(const MCFixup &Fixup,
-                            uint64_t Value,
-                            const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const;
-
-  void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
-
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
-
-  unsigned getPointerSize() const {
-    return 4;
-  }
-};
-
-static unsigned getRelaxedOpcode(unsigned Op) {
-    switch (Op) {
-    default:            return Op;
-    case MBlaze::ADDIK: return MBlaze::ADDIK32;
-    case MBlaze::ORI:   return MBlaze::ORI32;
-    case MBlaze::BRLID: return MBlaze::BRLID32;
-    }
-}
-
-bool MBlazeAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
-  if (getRelaxedOpcode(Inst.getOpcode()) == Inst.getOpcode())
-    return false;
-
-  bool hasExprOrImm = false;
-  for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
-    hasExprOrImm |= Inst.getOperand(i).isExpr();
-
-  return hasExprOrImm;
-}
-
-bool MBlazeAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
-                                            uint64_t Value,
-                                            const MCRelaxableFragment *DF,
-                                            const MCAsmLayout &Layout) const {
-  // FIXME: Is this right? It's what the "generic" code was doing before,
-  // but is X86 specific. Is it actually true for MBlaze also, or was it
-  // just close enough to not be a big deal?
-  //
-  // Relax if the value is too big for a (signed) i8.
-  return int64_t(Value) != int64_t(int8_t(Value));
-}
-
-void MBlazeAsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
-  Res = Inst;
-  Res.setOpcode(getRelaxedOpcode(Inst.getOpcode()));
-}
-
-bool MBlazeAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
-  if ((Count % 4) != 0)
-    return false;
-
-  for (uint64_t i = 0; i < Count; i += 4)
-      OW->Write32(0x00000000);
-
-  return true;
-}
-} // end anonymous namespace
-
-namespace {
-class ELFMBlazeAsmBackend : public MBlazeAsmBackend {
-public:
-  uint8_t OSABI;
-  ELFMBlazeAsmBackend(const Target &T, uint8_t _OSABI)
-    : MBlazeAsmBackend(T), OSABI(_OSABI) { }
-
-  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value) const;
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createMBlazeELFObjectWriter(OS, OSABI);
-  }
-};
-
-void ELFMBlazeAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
-                                     unsigned DataSize, uint64_t Value) const {
-  unsigned Size = getFixupKindSize(Fixup.getKind());
-
-  assert(Fixup.getOffset() + Size <= DataSize &&
-         "Invalid fixup offset!");
-
-  char *data = Data + Fixup.getOffset();
-  switch (Size) {
-  default: llvm_unreachable("Cannot fixup unknown value.");
-  case 1:  llvm_unreachable("Cannot fixup 1 byte value.");
-  case 8:  llvm_unreachable("Cannot fixup 8 byte value.");
-
-  case 4:
-    *(data+7) = uint8_t(Value);
-    *(data+6) = uint8_t(Value >> 8);
-    *(data+3) = uint8_t(Value >> 16);
-    *(data+2) = uint8_t(Value >> 24);
-    break;
-
-  case 2:
-    *(data+3) = uint8_t(Value >> 0);
-    *(data+2) = uint8_t(Value >> 8);
-  }
-}
-} // end anonymous namespace
-
-MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT,
-                                           StringRef CPU) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin())
-    assert(0 && "Mac not supported on MBlaze");
-
-  if (TheTriple.isOSWindows())
-    assert(0 && "Windows not supported on MBlaze");
-
-  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
-  return new ELFMBlazeAsmBackend(T, OSABI);
-}
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
deleted file mode 100644
index 437026e..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeBaseInfo.h
+++ /dev/null
@@ -1,237 +0,0 @@
-//===-- MBlazeBaseInfo.h - Top level definitions for MBlaze -- --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains small standalone helper functions and enum definitions for
-// the MBlaze target useful for the compiler back-end and the MC libraries.
-// As such, it deliberately does not include references to LLVM core
-// code gen types, passes, etc..
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBlazeBASEINFO_H
-#define MBlazeBASEINFO_H
-
-#include "MBlazeMCTargetDesc.h"
-#include "llvm/Support/ErrorHandling.h"
-
-namespace llvm {
-
-/// MBlazeII - This namespace holds all of the target specific flags that
-/// instruction info tracks.
-///
-namespace MBlazeII {
-  enum {
-    // PseudoFrm - This represents an instruction that is a pseudo instruction
-    // or one that has not been implemented yet.  It is illegal to code generate
-    // it, but tolerated for intermediate implementation stages.
-    FPseudo = 0,
-    FRRR,
-    FRRI,
-    FCRR,
-    FCRI,
-    FRCR,
-    FRCI,
-    FCCR,
-    FCCI,
-    FRRCI,
-    FRRC,
-    FRCX,
-    FRCS,
-    FCRCS,
-    FCRCX,
-    FCX,
-    FCR,
-    FRIR,
-    FRRRR,
-    FRI,
-    FC,
-    FRR,
-    FormMask = 63
-
-    //===------------------------------------------------------------------===//
-    // MBlaze Specific MachineOperand flags.
-    // MO_NO_FLAG,
-
-    /// MO_GOT - Represents the offset into the global offset table at which
-    /// the address the relocation entry symbol resides during execution.
-    // MO_GOT,
-
-    /// MO_GOT_CALL - Represents the offset into the global offset table at
-    /// which the address of a call site relocation entry symbol resides
-    /// during execution. This is different from the above since this flag
-    /// can only be present in call instructions.
-    // MO_GOT_CALL,
-
-    /// MO_GPREL - Represents the offset from the current gp value to be used
-    /// for the relocatable object file being produced.
-    // MO_GPREL,
-
-    /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
-    /// address.
-    // MO_ABS_HILO
-
-  };
-}
-
-static inline bool isMBlazeRegister(unsigned Reg) {
-  return Reg <= 31;
-}
-
-static inline bool isSpecialMBlazeRegister(unsigned Reg) {
-  switch (Reg) {
-    case 0x0000 : case 0x0001 : case 0x0003 : case 0x0005 : 
-    case 0x0007 : case 0x000B : case 0x000D : case 0x1000 : 
-    case 0x1001 : case 0x1002 : case 0x1003 : case 0x1004 : 
-    case 0x2000 : case 0x2001 : case 0x2002 : case 0x2003 : 
-    case 0x2004 : case 0x2005 : case 0x2006 : case 0x2007 : 
-    case 0x2008 : case 0x2009 : case 0x200A : case 0x200B : 
-      return true;
-
-    default:
-      return false;
-  }
-}
-
-/// getMBlazeRegisterNumbering - Given the enum value for some register, e.g.
-/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
-static inline unsigned getMBlazeRegisterNumbering(unsigned RegEnum) {
-  switch (RegEnum) {
-    case MBlaze::R0     : return 0;
-    case MBlaze::R1     : return 1;
-    case MBlaze::R2     : return 2;
-    case MBlaze::R3     : return 3;
-    case MBlaze::R4     : return 4;
-    case MBlaze::R5     : return 5;
-    case MBlaze::R6     : return 6;
-    case MBlaze::R7     : return 7;
-    case MBlaze::R8     : return 8;
-    case MBlaze::R9     : return 9;
-    case MBlaze::R10    : return 10;
-    case MBlaze::R11    : return 11;
-    case MBlaze::R12    : return 12;
-    case MBlaze::R13    : return 13;
-    case MBlaze::R14    : return 14;
-    case MBlaze::R15    : return 15;
-    case MBlaze::R16    : return 16;
-    case MBlaze::R17    : return 17;
-    case MBlaze::R18    : return 18;
-    case MBlaze::R19    : return 19;
-    case MBlaze::R20    : return 20;
-    case MBlaze::R21    : return 21;
-    case MBlaze::R22    : return 22;
-    case MBlaze::R23    : return 23;
-    case MBlaze::R24    : return 24;
-    case MBlaze::R25    : return 25;
-    case MBlaze::R26    : return 26;
-    case MBlaze::R27    : return 27;
-    case MBlaze::R28    : return 28;
-    case MBlaze::R29    : return 29;
-    case MBlaze::R30    : return 30;
-    case MBlaze::R31    : return 31;
-    case MBlaze::RPC    : return 0x0000;
-    case MBlaze::RMSR   : return 0x0001;
-    case MBlaze::REAR   : return 0x0003;
-    case MBlaze::RESR   : return 0x0005;
-    case MBlaze::RFSR   : return 0x0007;
-    case MBlaze::RBTR   : return 0x000B;
-    case MBlaze::REDR   : return 0x000D;
-    case MBlaze::RPID   : return 0x1000;
-    case MBlaze::RZPR   : return 0x1001;
-    case MBlaze::RTLBX  : return 0x1002;
-    case MBlaze::RTLBLO : return 0x1003;
-    case MBlaze::RTLBHI : return 0x1004;
-    case MBlaze::RPVR0  : return 0x2000;
-    case MBlaze::RPVR1  : return 0x2001;
-    case MBlaze::RPVR2  : return 0x2002;
-    case MBlaze::RPVR3  : return 0x2003;
-    case MBlaze::RPVR4  : return 0x2004;
-    case MBlaze::RPVR5  : return 0x2005;
-    case MBlaze::RPVR6  : return 0x2006;
-    case MBlaze::RPVR7  : return 0x2007;
-    case MBlaze::RPVR8  : return 0x2008;
-    case MBlaze::RPVR9  : return 0x2009;
-    case MBlaze::RPVR10 : return 0x200A;
-    case MBlaze::RPVR11 : return 0x200B;
-    default: llvm_unreachable("Unknown register number!");
-  }
-}
-
-/// getRegisterFromNumbering - Given the enum value for some register, e.g.
-/// MBlaze::R0, return the number that it corresponds to (e.g. 0).
-static inline unsigned getMBlazeRegisterFromNumbering(unsigned Reg) {
-  switch (Reg) {
-    case 0  : return MBlaze::R0;
-    case 1  : return MBlaze::R1;
-    case 2  : return MBlaze::R2;
-    case 3  : return MBlaze::R3;
-    case 4  : return MBlaze::R4;
-    case 5  : return MBlaze::R5;
-    case 6  : return MBlaze::R6;
-    case 7  : return MBlaze::R7;
-    case 8  : return MBlaze::R8;
-    case 9  : return MBlaze::R9;
-    case 10 : return MBlaze::R10;
-    case 11 : return MBlaze::R11;
-    case 12 : return MBlaze::R12;
-    case 13 : return MBlaze::R13;
-    case 14 : return MBlaze::R14;
-    case 15 : return MBlaze::R15;
-    case 16 : return MBlaze::R16;
-    case 17 : return MBlaze::R17;
-    case 18 : return MBlaze::R18;
-    case 19 : return MBlaze::R19;
-    case 20 : return MBlaze::R20;
-    case 21 : return MBlaze::R21;
-    case 22 : return MBlaze::R22;
-    case 23 : return MBlaze::R23;
-    case 24 : return MBlaze::R24;
-    case 25 : return MBlaze::R25;
-    case 26 : return MBlaze::R26;
-    case 27 : return MBlaze::R27;
-    case 28 : return MBlaze::R28;
-    case 29 : return MBlaze::R29;
-    case 30 : return MBlaze::R30;
-    case 31 : return MBlaze::R31;
-    default: llvm_unreachable("Unknown register number!");
-  }
-}
-
-static inline unsigned getSpecialMBlazeRegisterFromNumbering(unsigned Reg) {
-  switch (Reg) {
-    case 0x0000 : return MBlaze::RPC;
-    case 0x0001 : return MBlaze::RMSR;
-    case 0x0003 : return MBlaze::REAR;
-    case 0x0005 : return MBlaze::RESR;
-    case 0x0007 : return MBlaze::RFSR;
-    case 0x000B : return MBlaze::RBTR;
-    case 0x000D : return MBlaze::REDR;
-    case 0x1000 : return MBlaze::RPID;
-    case 0x1001 : return MBlaze::RZPR;
-    case 0x1002 : return MBlaze::RTLBX;
-    case 0x1003 : return MBlaze::RTLBLO;
-    case 0x1004 : return MBlaze::RTLBHI;
-    case 0x2000 : return MBlaze::RPVR0;
-    case 0x2001 : return MBlaze::RPVR1;
-    case 0x2002 : return MBlaze::RPVR2;
-    case 0x2003 : return MBlaze::RPVR3;
-    case 0x2004 : return MBlaze::RPVR4;
-    case 0x2005 : return MBlaze::RPVR5;
-    case 0x2006 : return MBlaze::RPVR6;
-    case 0x2007 : return MBlaze::RPVR7;
-    case 0x2008 : return MBlaze::RPVR8;
-    case 0x2009 : return MBlaze::RPVR9;
-    case 0x200A : return MBlaze::RPVR10;
-    case 0x200B : return MBlaze::RPVR11;
-    default: llvm_unreachable("Unknown register number!");
-  }
-}
-
-} // end namespace llvm;
-
-#endif
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp
deleted file mode 100644
index 2824b3c..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeELFObjectWriter.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===-- MBlazeELFObjectWriter.cpp - MBlaze ELF Writer ---------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/MBlazeMCTargetDesc.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-namespace {
-  class MBlazeELFObjectWriter : public MCELFObjectTargetWriter {
-  public:
-    MBlazeELFObjectWriter(uint8_t OSABI);
-
-    virtual ~MBlazeELFObjectWriter();
-  protected:
-    virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
-                                  bool IsPCRel, bool IsRelocWithSymbol,
-                                  int64_t Addend) const;
-  };
-}
-
-MBlazeELFObjectWriter::MBlazeELFObjectWriter(uint8_t OSABI)
-  : MCELFObjectTargetWriter(/*Is64Bit*/ false, OSABI, ELF::EM_MBLAZE,
-                            /*HasRelocationAddend*/ false) {}
-
-MBlazeELFObjectWriter::~MBlazeELFObjectWriter() {
-}
-
-unsigned MBlazeELFObjectWriter::GetRelocType(const MCValue &Target,
-                                             const MCFixup &Fixup,
-                                             bool IsPCRel,
-                                             bool IsRelocWithSymbol,
-                                             int64_t Addend) const {
-  // determine the type of the relocation
-  unsigned Type;
-  if (IsPCRel) {
-    switch ((unsigned)Fixup.getKind()) {
-    default:
-      llvm_unreachable("Unimplemented");
-    case FK_PCRel_4:
-      Type = ELF::R_MICROBLAZE_64_PCREL;
-      break;
-    case FK_PCRel_2:
-      Type = ELF::R_MICROBLAZE_32_PCREL;
-      break;
-    }
-  } else {
-    switch ((unsigned)Fixup.getKind()) {
-    default: llvm_unreachable("invalid fixup kind!");
-    case FK_Data_4:
-      Type = ((IsRelocWithSymbol || Addend !=0)
-              ? ELF::R_MICROBLAZE_32
-              : ELF::R_MICROBLAZE_64);
-      break;
-    case FK_Data_2:
-      Type = ELF::R_MICROBLAZE_32;
-      break;
-    }
-  }
-  return Type;
-}
-
-
-
-MCObjectWriter *llvm::createMBlazeELFObjectWriter(raw_ostream &OS,
-                                                  uint8_t OSABI) {
-  MCELFObjectTargetWriter *MOTW = new MBlazeELFObjectWriter(OSABI);
-  return createELFObjectWriter(MOTW, OS,  /*IsLittleEndian=*/ false);
-}
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp
deleted file mode 100644
index 8231f07..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.cpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-- MBlazeMCAsmInfo.cpp - MBlaze asm properties -----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declarations of the MBlazeMCAsmInfo properties.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeMCAsmInfo.h"
-using namespace llvm;
-
-void MBlazeMCAsmInfo::anchor() { }
-
-MBlazeMCAsmInfo::MBlazeMCAsmInfo() {
-  IsLittleEndian              = false;
-  StackGrowsUp                = false;
-  SupportsDebugInformation    = true;
-  AlignmentIsInBytes          = false;
-  PrivateGlobalPrefix         = "$";
-  GPRel32Directive            = "\t.gpword\t";
-}
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h
deleted file mode 100644
index 977f9a6..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCAsmInfo.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===-- MBlazeMCAsmInfo.h - MBlaze asm properties --------------*- C++ -*--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the MBlazeMCAsmInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZETARGETASMINFO_H
-#define MBLAZETARGETASMINFO_H
-
-#include "llvm/MC/MCAsmInfo.h"
-
-namespace llvm {
-  class Target;
-
-  class MBlazeMCAsmInfo : public MCAsmInfo {
-    virtual void anchor();
-  public:
-    explicit MBlazeMCAsmInfo();
-  };
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
deleted file mode 100644
index 8faff6a..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
+++ /dev/null
@@ -1,222 +0,0 @@
-//===-- MBlazeMCCodeEmitter.cpp - Convert MBlaze code to machine code -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the MBlazeMCCodeEmitter class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mccodeemitter"
-#include "MCTargetDesc/MBlazeMCTargetDesc.h"
-#include "MCTargetDesc/MBlazeBaseInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixup.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
-
-namespace {
-class MBlazeMCCodeEmitter : public MCCodeEmitter {
-  MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  const MCInstrInfo &MCII;
-
-public:
-  MBlazeMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
-                      MCContext &ctx)
-    : MCII(mcii) {
-  }
-
-  ~MBlazeMCCodeEmitter() {}
-
-  // getBinaryCodeForInstr - TableGen'erated function for getting the
-  // binary encoding for an instruction.
-  uint64_t getBinaryCodeForInstr(const MCInst &MI) const;
-
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO) const;
-  unsigned getMachineOpValue(const MCInst &MI, unsigned OpIdx) const {
-    return getMachineOpValue(MI, MI.getOperand(OpIdx));
-  }
-
-  static unsigned GetMBlazeRegNum(const MCOperand &MO) {
-    // FIXME: getMBlazeRegisterNumbering() is sufficient?
-    llvm_unreachable("MBlazeMCCodeEmitter::GetMBlazeRegNum() not yet "
-                     "implemented.");
-  }
-
-  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
-    // The MicroBlaze uses a bit reversed format so we need to reverse the
-    // order of the bits. Taken from:
-    // http://graphics.stanford.edu/~seander/bithacks.html
-    C = ((C * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32;
-
-    OS << (char)C;
-    ++CurByte;
-  }
-
-  void EmitRawByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
-    OS << (char)C;
-    ++CurByte;
-  }
-
-  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
-                    raw_ostream &OS) const {
-    assert(Size <= 8 && "size too big in emit constant");
-
-    for (unsigned i = 0; i != Size; ++i) {
-      EmitByte(Val & 255, CurByte, OS);
-      Val >>= 8;
-    }
-  }
-
-  void EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const;
-  void EmitIMM(const MCInst &MI, unsigned &CurByte, raw_ostream &OS) const;
-
-  void EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel,
-                     unsigned &CurByte, raw_ostream &OS,
-                     SmallVectorImpl<MCFixup> &Fixups) const;
-
-  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups) const;
-};
-
-} // end anonymous namespace
-
-
-MCCodeEmitter *llvm::createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
-                                               const MCRegisterInfo &MRI,
-                                               const MCSubtargetInfo &STI,
-                                               MCContext &Ctx) {
-  return new MBlazeMCCodeEmitter(MCII, STI, Ctx);
-}
-
-/// getMachineOpValue - Return binary encoding of operand. If the machine
-/// operand requires relocation, record the relocation and return zero.
-unsigned MBlazeMCCodeEmitter::getMachineOpValue(const MCInst &MI,
-                                             const MCOperand &MO) const {
-  if (MO.isReg())
-    return getMBlazeRegisterNumbering(MO.getReg());
-  if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
-  if (MO.isExpr())
-    return 0; // The relocation has already been recorded at this point.
-#ifndef NDEBUG
-  errs() << MO;
-#endif
-  llvm_unreachable(0);
-}
-
-void MBlazeMCCodeEmitter::
-EmitIMM(const MCOperand &imm, unsigned &CurByte, raw_ostream &OS) const {
-  int32_t val = (int32_t)imm.getImm();
-  if (val > 32767 || val < -32768) {
-    EmitByte(0x0D, CurByte, OS);
-    EmitByte(0x00, CurByte, OS);
-    EmitRawByte((val >> 24) & 0xFF, CurByte, OS);
-    EmitRawByte((val >> 16) & 0xFF, CurByte, OS);
-  }
-}
-
-void MBlazeMCCodeEmitter::
-EmitIMM(const MCInst &MI, unsigned &CurByte,raw_ostream &OS) const {
-  switch (MI.getOpcode()) {
-  default: break;
-
-  case MBlaze::ADDIK32:
-  case MBlaze::ORI32:
-  case MBlaze::BRLID32:
-    EmitByte(0x0D, CurByte, OS);
-    EmitByte(0x00, CurByte, OS);
-    EmitRawByte(0, CurByte, OS);
-    EmitRawByte(0, CurByte, OS);
-  }
-}
-
-void MBlazeMCCodeEmitter::
-EmitImmediate(const MCInst &MI, unsigned opNo, bool pcrel, unsigned &CurByte,
-              raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const {
-  assert(MI.getNumOperands()>opNo && "Not enought operands for instruction");
-
-  MCOperand oper = MI.getOperand(opNo);
-
-  if (oper.isImm()) {
-    EmitIMM(oper, CurByte, OS);
-  } else if (oper.isExpr()) {
-    MCFixupKind FixupKind;
-    switch (MI.getOpcode()) {
-    default:
-      FixupKind = pcrel ? FK_PCRel_2 : FK_Data_2;
-      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
-      break;
-    case MBlaze::ORI32:
-    case MBlaze::ADDIK32:
-    case MBlaze::BRLID32:
-      FixupKind = pcrel ? FK_PCRel_4 : FK_Data_4;
-      Fixups.push_back(MCFixup::Create(0,oper.getExpr(),FixupKind));
-      break;
-    }
-  }
-}
-
-
-
-void MBlazeMCCodeEmitter::
-EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                  SmallVectorImpl<MCFixup> &Fixups) const {
-  unsigned Opcode = MI.getOpcode();
-  const MCInstrDesc &Desc = MCII.get(Opcode);
-  uint64_t TSFlags = Desc.TSFlags;
-  // Keep track of the current byte being emitted.
-  unsigned CurByte = 0;
-
-  // Emit an IMM instruction if the instruction we are encoding requires it
-  EmitIMM(MI,CurByte,OS);
-
-  switch ((TSFlags & MBlazeII::FormMask)) {
-  default: break;
-  case MBlazeII::FPseudo:
-    // Pseudo instructions don't get encoded.
-    return;
-  case MBlazeII::FRRI:
-    EmitImmediate(MI, 2, false, CurByte, OS, Fixups);
-    break;
-  case MBlazeII::FRIR:
-    EmitImmediate(MI, 1, false, CurByte, OS, Fixups);
-    break;
-  case MBlazeII::FCRI:
-    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
-    break;
-  case MBlazeII::FRCI:
-    EmitImmediate(MI, 1, true, CurByte, OS, Fixups);
-  case MBlazeII::FCCI:
-    EmitImmediate(MI, 0, true, CurByte, OS, Fixups);
-    break;
-  }
-
-  ++MCNumEmitted;  // Keep track of the # of mi's emitted
-  unsigned Value = getBinaryCodeForInstr(MI);
-  EmitConstant(Value, 4, CurByte, OS);
-}
-
-// FIXME: These #defines shouldn't be necessary. Instead, tblgen should
-// be able to generate code emitter helpers for either variant, like it
-// does for the AsmWriter.
-#define MBlazeCodeEmitter MBlazeMCCodeEmitter
-#define MachineInstr MCInst
-#include "MBlazeGenCodeEmitter.inc"
-#undef MBlazeCodeEmitter
-#undef MachineInstr
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
deleted file mode 100644
index 380750d..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===-- MBlazeMCTargetDesc.cpp - MBlaze Target Descriptions ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides MBlaze specific target descriptions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeMCTargetDesc.h"
-#include "InstPrinter/MBlazeInstPrinter.h"
-#include "MBlazeMCAsmInfo.h"
-#include "llvm/MC/MCCodeGenInfo.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_INSTRINFO_MC_DESC
-#include "MBlazeGenInstrInfo.inc"
-
-#define GET_SUBTARGETINFO_MC_DESC
-#include "MBlazeGenSubtargetInfo.inc"
-
-#define GET_REGINFO_MC_DESC
-#include "MBlazeGenRegisterInfo.inc"
-
-using namespace llvm;
-
-
-static MCInstrInfo *createMBlazeMCInstrInfo() {
-  MCInstrInfo *X = new MCInstrInfo();
-  InitMBlazeMCInstrInfo(X);
-  return X;
-}
-
-static MCRegisterInfo *createMBlazeMCRegisterInfo(StringRef TT) {
-  MCRegisterInfo *X = new MCRegisterInfo();
-  InitMBlazeMCRegisterInfo(X, MBlaze::R15);
-  return X;
-}
-
-static MCSubtargetInfo *createMBlazeMCSubtargetInfo(StringRef TT, StringRef CPU,
-                                                    StringRef FS) {
-  MCSubtargetInfo *X = new MCSubtargetInfo();
-  InitMBlazeMCSubtargetInfo(X, TT, CPU, FS);
-  return X;
-}
-
-static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
-  Triple TheTriple(TT);
-  switch (TheTriple.getOS()) {
-  default:
-    return new MBlazeMCAsmInfo();
-  }
-}
-
-static MCCodeGenInfo *createMBlazeMCCodeGenInfo(StringRef TT, Reloc::Model RM,
-                                                CodeModel::Model CM,
-                                                CodeGenOpt::Level OL) {
-  MCCodeGenInfo *X = new MCCodeGenInfo();
-  if (RM == Reloc::Default)
-    RM = Reloc::Static;
-  if (CM == CodeModel::Default)
-    CM = CodeModel::Small;
-  X->InitMCCodeGenInfo(RM, CM, OL);
-  return X;
-}
-
-static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
-                                    MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &_OS,
-                                    MCCodeEmitter *_Emitter,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin()) {
-    llvm_unreachable("MBlaze does not support Darwin MACH-O format");
-  }
-
-  if (TheTriple.isOSWindows()) {
-    llvm_unreachable("MBlaze does not support Windows COFF format");
-  }
-
-  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
-}
-
-static MCInstPrinter *createMBlazeMCInstPrinter(const Target &T,
-                                                unsigned SyntaxVariant,
-                                                const MCAsmInfo &MAI,
-                                                const MCInstrInfo &MII,
-                                                const MCRegisterInfo &MRI,
-                                                const MCSubtargetInfo &STI) {
-  if (SyntaxVariant == 0)
-    return new MBlazeInstPrinter(MAI, MII, MRI);
-  return 0;
-}
-
-// Force static initialization.
-extern "C" void LLVMInitializeMBlazeTargetMC() {
-  // Register the MC asm info.
-  RegisterMCAsmInfoFn X(TheMBlazeTarget, createMCAsmInfo);
-
-  // Register the MC codegen info.
-  TargetRegistry::RegisterMCCodeGenInfo(TheMBlazeTarget,
-                                        createMBlazeMCCodeGenInfo);
-
-  // Register the MC instruction info.
-  TargetRegistry::RegisterMCInstrInfo(TheMBlazeTarget, createMBlazeMCInstrInfo);
-
-  // Register the MC register info.
-  TargetRegistry::RegisterMCRegInfo(TheMBlazeTarget,
-                                    createMBlazeMCRegisterInfo);
-
-  // Register the MC subtarget info.
-  TargetRegistry::RegisterMCSubtargetInfo(TheMBlazeTarget,
-                                          createMBlazeMCSubtargetInfo);
-
-  // Register the MC code emitter
-  TargetRegistry::RegisterMCCodeEmitter(TheMBlazeTarget,
-                                        llvm::createMBlazeMCCodeEmitter);
-
-  // Register the asm backend
-  TargetRegistry::RegisterMCAsmBackend(TheMBlazeTarget,
-                                       createMBlazeAsmBackend);
-
-  // Register the object streamer
-  TargetRegistry::RegisterMCObjectStreamer(TheMBlazeTarget,
-                                           createMCStreamer);
-
-  // Register the MCInstPrinter.
-  TargetRegistry::RegisterMCInstPrinter(TheMBlazeTarget,
-                                        createMBlazeMCInstPrinter);
-}
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
deleted file mode 100644
index 7bc7d8f..0000000
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
+++ /dev/null
@@ -1,56 +0,0 @@
-//===-- MBlazeMCTargetDesc.h - MBlaze Target Descriptions -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides MBlaze specific target descriptions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZEMCTARGETDESC_H
-#define MBLAZEMCTARGETDESC_H
-
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-class MCAsmBackend;
-class MCContext;
-class MCCodeEmitter;
-class MCInstrInfo;
-class MCObjectWriter;
-class MCRegisterInfo;
-class MCSubtargetInfo;
-class Target;
-class StringRef;
-class raw_ostream;
-
-extern Target TheMBlazeTarget;
-
-MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
-                                         const MCRegisterInfo &MRI,
-                                         const MCSubtargetInfo &STI,
-                                         MCContext &Ctx);
-
-MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT,
-                                     StringRef CPU);
-
-MCObjectWriter *createMBlazeELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
-} // End llvm namespace
-
-// Defines symbolic names for MBlaze registers.  This defines a mapping from
-// register name to register number.
-#define GET_REGINFO_ENUM
-#include "MBlazeGenRegisterInfo.inc"
-
-// Defines symbolic names for the MBlaze instructions.
-#define GET_INSTRINFO_ENUM
-#include "MBlazeGenInstrInfo.inc"
-
-#define GET_SUBTARGETINFO_ENUM
-#include "MBlazeGenSubtargetInfo.inc"
-
-#endif
diff --git a/lib/Target/MBlaze/Makefile b/lib/Target/MBlaze/Makefile
deleted file mode 100644
index 512ce9a..0000000
--- a/lib/Target/MBlaze/Makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-##===- lib/Target/MBlaze/Makefile --------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../..
-LIBRARYNAME = LLVMMBlazeCodeGen
-TARGET = MBlaze
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = MBlazeGenRegisterInfo.inc MBlazeGenInstrInfo.inc \
-		MBlazeGenAsmWriter.inc \
-                MBlazeGenDAGISel.inc MBlazeGenAsmMatcher.inc \
-                MBlazeGenCodeEmitter.inc MBlazeGenCallingConv.inc \
-                MBlazeGenSubtargetInfo.inc MBlazeGenIntrinsics.inc
-
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
-
-include $(LEVEL)/Makefile.common
-
diff --git a/lib/Target/MBlaze/TODO b/lib/Target/MBlaze/TODO
deleted file mode 100644
index 317d7c0..0000000
--- a/lib/Target/MBlaze/TODO
+++ /dev/null
@@ -1,21 +0,0 @@
-* Writing out ELF files is close to working but the following needs to
-  be examined more closely:
-    - Relocations use 2-byte / 4-byte to terminology in reference to
-      the size of the immediate value being changed. The Xilinx
-      terminology seems to be (???) 4-byte / 8-byte in reference
-      to the number of bytes of instructions that are being changed.
-
-* Code generation seems to work relatively well now but the following
-  needs to be examined more closely:
-    - The stack layout needs to be examined to make sure it meets
-      the standard, especially in regards to var arg functions.
-    - Look at the MBlazeGenFastISel.inc stuff and make use of it
-      if appropriate.
-
-* A basic assembly parser is present now and seems to parse most things.
-  There are a few things that need to be looked at:
-    - There are some instructions that are not generated by the backend
-      and have not been tested as far as the parser is concerned.
-    - The assembly parser does not use many MicroBlaze specific directives.
-      I should investigate if there are MicroBlaze specific directive and,
-      if there are, add them.
diff --git a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt b/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
deleted file mode 100644
index b554d9b..0000000
--- a/lib/Target/MBlaze/TargetInfo/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. 
-                     ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMMBlazeInfo
-  MBlazeTargetInfo.cpp
-  )
-
-add_dependencies(LLVMMBlazeInfo MBlazeCommonTableGen)
diff --git a/lib/Target/MBlaze/TargetInfo/LLVMBuild.txt b/lib/Target/MBlaze/TargetInfo/LLVMBuild.txt
deleted file mode 100644
index ba7ee5d6..0000000
--- a/lib/Target/MBlaze/TargetInfo/LLVMBuild.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-;===- ./lib/Target/MBlaze/TargetInfo/LLVMBuild.txt -------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = MBlazeInfo
-parent = MBlaze
-required_libraries = MC Support Target
-add_to_library_groups = MBlaze
diff --git a/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp b/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
deleted file mode 100644
index 323a7f6..0000000
--- a/lib/Target/MBlaze/TargetInfo/MBlazeTargetInfo.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-//===-- MBlazeTargetInfo.cpp - MBlaze Target Implementation ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlaze.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/TargetRegistry.h"
-using namespace llvm;
-
-Target llvm::TheMBlazeTarget;
-
-extern "C" void LLVMInitializeMBlazeTargetInfo() {
-  RegisterTarget<Triple::mblaze> X(TheMBlazeTarget, "mblaze", "MBlaze");
-}
diff --git a/lib/Target/MBlaze/TargetInfo/Makefile b/lib/Target/MBlaze/TargetInfo/Makefile
deleted file mode 100644
index fb7ea11..0000000
--- a/lib/Target/MBlaze/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/MBlaze/TargetInfo/Makefile ---------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMMBlazeInfo
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
index f9ecaed..c9b3c3d 100644
--- a/lib/Target/MSP430/CMakeLists.txt
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -23,7 +23,7 @@ add_llvm_target(MSP430CodeGen
   MSP430MCInstLower.cpp
   )
 
-add_dependencies(LLVMMSP430CodeGen intrinsics_gen)
+add_dependencies(LLVMMSP430CodeGen MSP430CommonTableGen intrinsics_gen)
 
 add_subdirectory(InstPrinter)
 add_subdirectory(TargetInfo)
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index 3c95760..acf2ab8 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -17,15 +17,12 @@ using namespace llvm;
 
 void MSP430MCAsmInfo::anchor() { }
 
-MSP430MCAsmInfo::MSP430MCAsmInfo(const Target &T, StringRef TT) {
+MSP430MCAsmInfo::MSP430MCAsmInfo(StringRef TT) {
   PointerSize = CalleeSaveStackSlotSize = 2;
 
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective ="\t.weak\t";
-  PCSymbol=".";
   CommentString = ";";
 
   AlignmentIsInBytes = false;
-  AllowNameToStartWithDigit = true;
   UsesELFSectionDirectiveForBSS = true;
 }
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index e5c2fc2..a7e0e58 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -14,16 +14,15 @@
 #ifndef MSP430TARGETASMINFO_H
 #define MSP430TARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
-  class Target;
 
-  class MSP430MCAsmInfo : public MCAsmInfo {
+  class MSP430MCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
-    explicit MSP430MCAsmInfo(const Target &T, StringRef TT);
+    explicit MSP430MCAsmInfo(StringRef TT);
   };
 
 } // namespace llvm
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 0a04e5d..18311c3 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -92,7 +92,7 @@ void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
     if (Offset)
       O << '(' << Offset << '+';
 
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
 
     if (Offset)
       O << ')';
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
index b448cc4..8a69d1e 100644
--- a/lib/Target/MSP430/MSP430CallingConv.td
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -23,18 +23,15 @@ def RetCC_MSP430 : CallingConv<[
 //===----------------------------------------------------------------------===//
 // MSP430 Argument Calling Conventions
 //===----------------------------------------------------------------------===//
-def CC_MSP430 : CallingConv<[
+def CC_MSP430_AssignStack : CallingConv<[
   // Pass by value if the byval attribute is given
   CCIfByVal<CCPassByVal<2, 2>>,
 
   // Promote i8 arguments to i16.
   CCIfType<[i8], CCPromoteToType<i16>>,
 
-  // The first 4 integer arguments of non-varargs functions are passed in
-  // integer registers.
-  CCIfNotVarArg<CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>>,
-
   // Integer values get stored in stack slots that are 2 bytes in
   // size and 2-byte aligned.
   CCIfType<[i16], CCAssignToStack<2, 2>>
 ]>;
+
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index c673f59..8370714 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -27,8 +27,8 @@ protected:
 
 public:
   explicit MSP430FrameLowering(const MSP430Subtarget &sti)
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2), STI(sti) {
-  }
+    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 2, -2, 2),
+      STI(sti) {}
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 1566c09..4152829 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -259,11 +259,12 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
   }
 
   Base  = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ?
-    CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+    CurDAG->getTargetFrameIndex(AM.Base.FrameIndex,
+                                getTargetLowering()->getPointerTy()) :
     AM.Base.Reg;
 
   if (AM.GV)
-    Disp = CurDAG->getTargetGlobalAddress(AM.GV, N->getDebugLoc(),
+    Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(N),
                                           MVT::i16, AM.Disp,
                                           0/*AM.SymbolFlags*/);
   else if (AM.CP)
@@ -345,7 +346,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
     return NULL;
   }
 
-   return CurDAG->getMachineNode(Opcode, N->getDebugLoc(),
+   return CurDAG->getMachineNode(Opcode, SDLoc(N),
                                  VT, MVT::i16, MVT::Other,
                                  LD->getBasePtr(), LD->getChain());
 }
@@ -382,7 +383,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
 
 
 SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
 
   // Dump information about the Node being selected
   DEBUG(errs() << "Selecting: ");
@@ -394,6 +395,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
     DEBUG(errs() << "== ";
           Node->dump(CurDAG);
           errs() << "\n");
+    Node->setNodeId(-1);
     return NULL;
   }
 
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 09cdf32..745cdf5 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -45,7 +45,7 @@ typedef enum {
 } HWMultUseMode;
 
 static cl::opt<HWMultUseMode>
-HWMultMode("msp430-hwmult-mode",
+HWMultMode("msp430-hwmult-mode", cl::Hidden,
            cl::desc("Hardware multiplier use mode"),
            cl::init(HWMultNoIntr),
            cl::values(
@@ -169,6 +169,7 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
   setOperationAction(ISD::VAARG,            MVT::Other, Expand);
   setOperationAction(ISD::VAEND,            MVT::Other, Expand);
   setOperationAction(ISD::VACOPY,           MVT::Other, Expand);
+  setOperationAction(ISD::JumpTable,        MVT::i16,   Custom);
 
   // Libcalls names.
   if (HWMultMode == HWMultIntr) {
@@ -199,6 +200,7 @@ SDValue MSP430TargetLowering::LowerOperation(SDValue Op,
   case ISD::RETURNADDR:       return LowerRETURNADDR(Op, DAG);
   case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
   case ISD::VASTART:          return LowerVASTART(Op, DAG);
+  case ISD::JumpTable:        return LowerJumpTable(Op, DAG);
   default:
     llvm_unreachable("unimplemented operand");
   }
@@ -226,7 +228,7 @@ MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
 std::pair<unsigned, const TargetRegisterClass*>
 MSP430TargetLowering::
 getRegForInlineAsmConstraint(const std::string &Constraint,
-                             EVT VT) const {
+                             MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC Constraint Letters
     switch (Constraint[0]) {
@@ -248,13 +250,130 @@ getRegForInlineAsmConstraint(const std::string &Constraint,
 
 #include "MSP430GenCallingConv.inc"
 
+/// For each argument in a function store the number of pieces it is composed
+/// of.
+template<typename ArgT>
+static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
+                              SmallVectorImpl<unsigned> &Out) {
+  unsigned CurrentArgIndex = ~0U;
+  for (unsigned i = 0, e = Args.size(); i != e; i++) {
+    if (CurrentArgIndex == Args[i].OrigArgIndex) {
+      Out.back()++;
+    } else {
+      Out.push_back(1);
+      CurrentArgIndex++;
+    }
+  }
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeCallOperands(Outs, CC_MSP430_AssignStack);
+}
+
+static void AnalyzeVarArgs(CCState &State,
+                           const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeFormalArguments(Ins, CC_MSP430_AssignStack);
+}
+
+/// Analyze incoming and outgoing function arguments. We need custom C++ code
+/// to handle special constraints in the ABI like reversing the order of the
+/// pieces of splitted arguments. In addition, all pieces of a certain argument
+/// have to be passed either using registers or the stack but never mixing both.
+template<typename ArgT>
+static void AnalyzeArguments(CCState &State,
+                             SmallVectorImpl<CCValAssign> &ArgLocs,
+                             const SmallVectorImpl<ArgT> &Args) {
+  static const uint16_t RegList[] = {
+    MSP430::R15W, MSP430::R14W, MSP430::R13W, MSP430::R12W
+  };
+  static const unsigned NbRegs = array_lengthof(RegList);
+
+  if (State.isVarArg()) {
+    AnalyzeVarArgs(State, Args);
+    return;
+  }
+
+  SmallVector<unsigned, 4> ArgsParts;
+  ParseFunctionArgs(Args, ArgsParts);
+
+  unsigned RegsLeft = NbRegs;
+  bool UseStack = false;
+  unsigned ValNo = 0;
+
+  for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
+    MVT ArgVT = Args[ValNo].VT;
+    ISD::ArgFlagsTy ArgFlags = Args[ValNo].Flags;
+    MVT LocVT = ArgVT;
+    CCValAssign::LocInfo LocInfo = CCValAssign::Full;
+
+    // Promote i8 to i16
+    if (LocVT == MVT::i8) {
+      LocVT = MVT::i16;
+      if (ArgFlags.isSExt())
+          LocInfo = CCValAssign::SExt;
+      else if (ArgFlags.isZExt())
+          LocInfo = CCValAssign::ZExt;
+      else
+          LocInfo = CCValAssign::AExt;
+    }
+
+    // Handle byval arguments
+    if (ArgFlags.isByVal()) {
+      State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, 2, ArgFlags);
+      continue;
+    }
+
+    unsigned Parts = ArgsParts[i];
+
+    if (!UseStack && Parts <= RegsLeft) {
+      unsigned FirstVal = ValNo;
+      for (unsigned j = 0; j < Parts; j++) {
+        unsigned Reg = State.AllocateReg(RegList, NbRegs);
+        State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+        RegsLeft--;
+      }
+
+      // Reverse the order of the pieces to agree with the "big endian" format
+      // required in the calling convention ABI.
+      SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
+      std::reverse(B, B + Parts);
+    } else {
+      UseStack = true;
+      for (unsigned j = 0; j < Parts; j++)
+        CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+    }
+  }
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) {
+  State.AnalyzeCallResult(Ins, RetCC_MSP430);
+}
+
+static void AnalyzeRetResult(CCState &State,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  State.AnalyzeReturn(Outs, RetCC_MSP430);
+}
+
+template<typename ArgT>
+static void AnalyzeReturnValues(CCState &State,
+                                SmallVectorImpl<CCValAssign> &RVLocs,
+                                const SmallVectorImpl<ArgT> &Args) {
+  AnalyzeRetResult(State, Args);
+
+  // Reverse splitted return values to get the "big endian" format required
+  // to agree with the calling convention ABI.
+  std::reverse(RVLocs.begin(), RVLocs.end());
+}
+
 SDValue
 MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
                                            CallingConv::ID CallConv,
                                            bool isVarArg,
                                            const SmallVectorImpl<ISD::InputArg>
                                              &Ins,
-                                           DebugLoc dl,
+                                           SDLoc dl,
                                            SelectionDAG &DAG,
                                            SmallVectorImpl<SDValue> &InVals)
                                              const {
@@ -276,10 +395,10 @@ SDValue
 MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                 SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
@@ -310,7 +429,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
                                         bool isVarArg,
                                         const SmallVectorImpl<ISD::InputArg>
                                           &Ins,
-                                        DebugLoc dl,
+                                        SDLoc dl,
                                         SelectionDAG &DAG,
                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
@@ -323,7 +442,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
+  AnalyzeArguments(CCInfo, ArgLocs, Ins);
 
   // Create frame index for the start of the first vararg value
   if (isVarArg) {
@@ -407,7 +526,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   const SmallVectorImpl<SDValue> &OutVals,
-                                  DebugLoc dl, SelectionDAG &DAG) const {
+                                  SDLoc dl, SelectionDAG &DAG) const {
 
   // CCValAssign - represent the assignment of the return value to a location
   SmallVector<CCValAssign, 16> RVLocs;
@@ -421,7 +540,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
                  getTargetMachine(), RVLocs, *DAG.getContext());
 
   // Analize return values.
-  CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
+  AnalyzeReturnValues(CCInfo, RVLocs, Outs);
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
@@ -454,7 +573,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
 
 /// LowerCCCCallTo - functions arguments are copied from virtual regs to
 /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-/// TODO: sret.
+// TODO: sret.
 SDValue
 MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
                                      CallingConv::ID CallConv, bool isVarArg,
@@ -463,20 +582,20 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
                                        &Outs,
                                      const SmallVectorImpl<SDValue> &OutVals,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                     DebugLoc dl, SelectionDAG &DAG,
+                                     SDLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
-
-  CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
+  AnalyzeArguments(CCInfo, ArgLocs, Outs);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
   Chain = DAG.getCALLSEQ_START(Chain ,DAG.getConstant(NumBytes,
-                                                      getPointerTy(), true));
+                                                      getPointerTy(), true),
+                               dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
   SmallVector<SDValue, 12> MemOpChains;
@@ -583,7 +702,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   Chain = DAG.getCALLSEQ_END(Chain,
                              DAG.getConstant(NumBytes, getPointerTy(), true),
                              DAG.getConstant(0, getPointerTy(), true),
-                             InFlag);
+                             InFlag, dl);
   InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
@@ -599,7 +718,7 @@ SDValue
 MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                       CallingConv::ID CallConv, bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SDLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
 
   // Assign locations to each value returned by this call.
@@ -607,7 +726,7 @@ MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), RVLocs, *DAG.getContext());
 
-  CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
+  AnalyzeReturnValues(CCInfo, RVLocs, Ins);
 
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
@@ -625,7 +744,7 @@ SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
   unsigned Opc = Op.getOpcode();
   SDNode* N = Op.getNode();
   EVT VT = Op.getValueType();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Expand non-constant shifts to loops:
   if (!isa<ConstantSDNode>(N->getOperand(1)))
@@ -669,15 +788,15 @@ SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op,
   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
 
   // Create the TargetGlobalAddress node, folding in the constant offset.
-  SDValue Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+  SDValue Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                               getPointerTy(), Offset);
-  return DAG.getNode(MSP430ISD::Wrapper, Op.getDebugLoc(),
+  return DAG.getNode(MSP430ISD::Wrapper, SDLoc(Op),
                      getPointerTy(), Result);
 }
 
 SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
                                                   SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
 
@@ -686,7 +805,7 @@ SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
 
 SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
 
@@ -695,7 +814,7 @@ SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
 
 static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
                        ISD::CondCode CC,
-                       DebugLoc dl, SelectionDAG &DAG) {
+                       SDLoc dl, SelectionDAG &DAG) {
   // FIXME: Handle bittests someday
   assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
 
@@ -782,7 +901,7 @@ SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS   = Op.getOperand(2);
   SDValue RHS   = Op.getOperand(3);
   SDValue Dest  = Op.getOperand(4);
-  DebugLoc dl   = Op.getDebugLoc();
+  SDLoc dl  (Op);
 
   SDValue TargetCC;
   SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
@@ -794,7 +913,7 @@ SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
 SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   SDValue LHS   = Op.getOperand(0);
   SDValue RHS   = Op.getOperand(1);
-  DebugLoc dl   = Op.getDebugLoc();
+  SDLoc dl  (Op);
 
   // If we are doing an AND and testing against zero, then the CMP
   // will not be generated.  The AND (or BIT) will generate the condition codes,
@@ -878,7 +997,7 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
   SDValue TrueV  = Op.getOperand(2);
   SDValue FalseV = Op.getOperand(3);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-  DebugLoc dl    = Op.getDebugLoc();
+  SDLoc dl   (Op);
 
   SDValue TargetCC;
   SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
@@ -897,7 +1016,7 @@ SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDValue Val = Op.getOperand(0);
   EVT VT      = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   assert(VT == MVT::i16 && "Only support i16 for now!");
 
@@ -929,7 +1048,7 @@ SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op,
   MFI->setReturnAddressIsTaken(true);
 
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (Depth > 0) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
@@ -953,7 +1072,7 @@ SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
   MFI->setFrameAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
                                          MSP430::FPW, VT);
@@ -975,11 +1094,19 @@ SDValue MSP430TargetLowering::LowerVASTART(SDValue Op,
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
 
   // Create a store of the frame index to the location operand
-  return DAG.getStore(Op.getOperand(0), Op.getDebugLoc(), FrameIndex,
+  return DAG.getStore(Op.getOperand(0), SDLoc(Op), FrameIndex,
                       Op.getOperand(1), MachinePointerInfo(SV),
                       false, false, 0);
 }
 
+SDValue MSP430TargetLowering::LowerJumpTable(SDValue Op,
+                                             SelectionDAG &DAG) const {
+    JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+    SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+    return DAG.getNode(MSP430ISD::Wrapper, SDLoc(JT),
+                       getPointerTy(), Result);
+}
+
 /// getPostIndexedAddressParts - returns true by value, base pointer and
 /// offset pointer and addressing mode by reference if this node can be
 /// combined with a load / store to form a post-indexed load / store.
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index e0ed870..85a861e 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -93,12 +93,13 @@ namespace llvm {
     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
 
     TargetLowering::ConstraintType
     getConstraintType(const std::string &Constraint) const;
     std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+    getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
 
     /// isTruncateFree - Return true if it's free to truncate a value of type
     /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
@@ -130,28 +131,28 @@ namespace llvm {
                            const SmallVectorImpl<ISD::OutputArg> &Outs,
                            const SmallVectorImpl<SDValue> &OutVals,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
 
     SDValue LowerCCCArguments(SDValue Chain,
                               CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
-                              DebugLoc dl,
+                              SDLoc dl,
                               SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &InVals) const;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
+                            SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
     virtual SDValue
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -162,7 +163,7 @@ namespace llvm {
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const;
 
     virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
                                             SDValue &Base,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index a6b5f2f..7a0b00a 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -22,14 +22,17 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "MSP430GenInstrInfo.inc"
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void MSP430InstrInfo::anchor() {}
+
 MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
   : MSP430GenInstrInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
-    RI(tm, *this) {}
+    RI(tm) {}
 
 void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index d79f992..ad2b8cc 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -42,6 +42,7 @@ namespace MSP430II {
 
 class MSP430InstrInfo : public MSP430GenInstrInfo {
   const MSP430RegisterInfo RI;
+  virtual void anchor();
 public:
   explicit MSP430InstrInfo(MSP430TargetMachine &TM);
 
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index e45780d..50e3fda 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -183,10 +183,10 @@ let isBarrier = 1 in {
                     "br\t$brdst",
                     [(brind tblockaddress:$brdst)]>;
     def Br  : I16rr<0, (outs), (ins GR16:$brdst),
-                    "mov.w\t{$brdst, pc}",
+                    "br\t$brdst",
                     [(brind GR16:$brdst)]>;
     def Bm  : I16rm<0, (outs), (ins memsrc:$brdst),
-                    "mov.w\t{$brdst, pc}",
+                    "br\t$brdst",
                     [(brind (load addr:$brdst))]>;
   }
 }
diff --git a/lib/Target/MSP430/MSP430MCInstLower.cpp b/lib/Target/MSP430/MSP430MCInstLower.cpp
index 043e5be..52f9ee5 100644
--- a/lib/Target/MSP430/MSP430MCInstLower.cpp
+++ b/lib/Target/MSP430/MSP430MCInstLower.cpp
@@ -33,7 +33,7 @@ GetGlobalAddressSymbol(const MachineOperand &MO) const {
   case 0: break;
   }
 
-  return Printer.Mang->getSymbol(MO.getGlobal());
+  return Printer.getSymbol(MO.getGlobal());
 }
 
 MCSymbol *MSP430MCInstLower::
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index 0b3e9e2..1a5e312 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -32,9 +32,8 @@
 using namespace llvm;
 
 // FIXME: Provide proper call frame setup / destroy opcodes.
-MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
-                                       const TargetInstrInfo &tii)
-  : MSP430GenRegisterInfo(MSP430::PCW), TM(tm), TII(tii) {
+MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm)
+  : MSP430GenRegisterInfo(MSP430::PCW), TM(tm) {
   StackAlign = TM.getFrameLowering()->getStackAlignment();
 }
 
@@ -132,6 +131,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
     // This is actually "load effective address" of the stack slot
     // instruction. We have only two-address instructions, thus we need to
     // expand it into mov + add
+    const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
 
     MI.setDesc(TII.get(MSP430::MOV16rr));
     MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 69cccb2..78047cc 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -27,13 +27,12 @@ class MSP430TargetMachine;
 struct MSP430RegisterInfo : public MSP430GenRegisterInfo {
 private:
   MSP430TargetMachine &TM;
-  const TargetInstrInfo &TII;
 
   /// StackAlign - Default stack alignment.
   ///
   unsigned StackAlign;
 public:
-  MSP430RegisterInfo(MSP430TargetMachine &tm, const TargetInstrInfo &tii);
+  MSP430RegisterInfo(MSP430TargetMachine &tm);
 
   /// Code Generation virtual methods...
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
index 07619d0..4010781 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -43,7 +43,7 @@ def R13B : MSP430Reg<13, "r13">;
 def R14B : MSP430Reg<14, "r14">;
 def R15B : MSP430Reg<15, "r15">;
 
-def subreg_8bit : SubRegIndex { let Namespace = "MSP430"; }
+def subreg_8bit : SubRegIndex<8> { let Namespace = "MSP430"; }
 
 let SubRegIndices = [subreg_8bit] in {
 def PCW  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 164e351..6710a09 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -36,7 +36,9 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T,
     // FIXME: Check DataLayout string.
     DL("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
     InstrInfo(*this), TLInfo(*this), TSInfo(*this),
-    FrameLowering(Subtarget) { }
+    FrameLowering(Subtarget) {
+  initAsmInfo();
+}
 
 namespace {
 /// MSP430 Code Generator Pass Configuration Options.
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index d31efa8..38be25c 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -19,138 +19,48 @@
 #include "llvm/IR/Function.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-static bool isAcceptableChar(char C, bool AllowPeriod, bool AllowUTF8) {
-  if ((C < 'a' || C > 'z') &&
-      (C < 'A' || C > 'Z') &&
-      (C < '0' || C > '9') &&
-      C != '_' && C != '$' && C != '@' &&
-      !(AllowPeriod && C == '.') &&
-      !(AllowUTF8 && (C & 0x80)))
-    return false;
-  return true;
-}
-
-static char HexDigit(int V) {
-  return V < 10 ? V+'0' : V+'A'-10;
-}
-
-static void MangleLetter(SmallVectorImpl<char> &OutName, unsigned char C) {
-  OutName.push_back('_');
-  OutName.push_back(HexDigit(C >> 4));
-  OutName.push_back(HexDigit(C & 15));
-  OutName.push_back('_');
-}
-
-/// NameNeedsEscaping - Return true if the identifier \p Str needs quotes
-/// for this assembler.
-static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo &MAI) {
-  assert(!Str.empty() && "Cannot create an empty MCSymbol");
-  
-  // If the first character is a number and the target does not allow this, we
-  // need quotes.
-  if (!MAI.doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9')
-    return true;
-  
-  // If any of the characters in the string is an unacceptable character, force
-  // quotes.
-  bool AllowPeriod = MAI.doesAllowPeriodsInName();
-  bool AllowUTF8 = MAI.doesAllowUTF8();
-  for (unsigned i = 0, e = Str.size(); i != e; ++i)
-    if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
-      return true;
-  return false;
-}
-
-/// appendMangledName - Add the specified string in mangled form if it uses
-/// any unusual characters.
-static void appendMangledName(SmallVectorImpl<char> &OutName, StringRef Str,
-                              const MCAsmInfo &MAI) {
-  // The first character is not allowed to be a number unless the target
-  // explicitly allows it.
-  if (!MAI.doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9') {
-    MangleLetter(OutName, Str[0]);
-    Str = Str.substr(1);
-  }
-
-  bool AllowPeriod = MAI.doesAllowPeriodsInName();
-  bool AllowUTF8 = MAI.doesAllowUTF8();
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    if (!isAcceptableChar(Str[i], AllowPeriod, AllowUTF8))
-      MangleLetter(OutName, Str[i]);
-    else
-      OutName.push_back(Str[i]);
-  }
-}
-
-
-/// appendMangledQuotedName - On systems that support quoted symbols, we still
-/// have to escape some (obscure) characters like " and \n which would break the
-/// assembler's lexing.
-static void appendMangledQuotedName(SmallVectorImpl<char> &OutName,
-                                   StringRef Str) {
-  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
-    if (Str[i] == '"' || Str[i] == '\n')
-      MangleLetter(OutName, Str[i]);
-    else
-      OutName.push_back(Str[i]);
-  }
-}
-
-
 /// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
 /// and the specified name as the global variable name.  GVName must not be
 /// empty.
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
-                                const Twine &GVName, ManglerPrefixTy PrefixTy) {
+                                const Twine &GVName, ManglerPrefixTy PrefixTy,
+                                bool UseGlobalPrefix) {
   SmallString<256> TmpData;
   StringRef Name = GVName.toStringRef(TmpData);
   assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
   
-  const MCAsmInfo &MAI = Context.getAsmInfo();
+  const MCAsmInfo *MAI = TM->getMCAsmInfo();
   
   // If the global name is not led with \1, add the appropriate prefixes.
   if (Name[0] == '\1') {
     Name = Name.substr(1);
   } else {
     if (PrefixTy == Mangler::Private) {
-      const char *Prefix = MAI.getPrivateGlobalPrefix();
+      const char *Prefix = MAI->getPrivateGlobalPrefix();
       OutName.append(Prefix, Prefix+strlen(Prefix));
     } else if (PrefixTy == Mangler::LinkerPrivate) {
-      const char *Prefix = MAI.getLinkerPrivateGlobalPrefix();
+      const char *Prefix = MAI->getLinkerPrivateGlobalPrefix();
       OutName.append(Prefix, Prefix+strlen(Prefix));
     }
 
-    const char *Prefix = MAI.getGlobalPrefix();
-    if (Prefix[0] == 0)
-      ; // Common noop, no prefix.
-    else if (Prefix[1] == 0)
-      OutName.push_back(Prefix[0]);  // Common, one character prefix.
-    else
-      OutName.append(Prefix, Prefix+strlen(Prefix)); // Arbitrary length prefix.
+    if (UseGlobalPrefix) {
+      const char *Prefix = MAI->getGlobalPrefix();
+      if (Prefix[0] == 0)
+        ; // Common noop, no prefix.
+      else if (Prefix[1] == 0)
+        OutName.push_back(Prefix[0]);  // Common, one character prefix.
+      else
+        // Arbitrary length prefix.
+        OutName.append(Prefix, Prefix+strlen(Prefix));
+    }
   }
-  
+
   // If this is a simple string that doesn't need escaping, just append it.
-  if (!NameNeedsEscaping(Name, MAI) ||
-      // If quotes are supported, they can be used unless the string contains
-      // a quote or newline.
-      (MAI.doesAllowQuotesInName() &&
-       Name.find_first_of("\n\"") == StringRef::npos)) {
-    OutName.append(Name.begin(), Name.end());
-    return;
-  }
-  
-  // On systems that do not allow quoted names, we need to mangle most
-  // strange characters.
-  if (!MAI.doesAllowQuotesInName())
-    return appendMangledName(OutName, Name, MAI);
-  
-  // Okay, the system allows quoted strings.  We can quote most anything, the
-  // only characters that need escaping are " and \n.
-  assert(Name.find_first_of("\n\"") != StringRef::npos);
-  return appendMangledQuotedName(OutName, Name);
+  OutName.append(Name.begin(), Name.end());
 }
 
 /// AddFastCallStdCallSuffix - Microsoft fastcall and stdcall functions require
@@ -178,8 +88,8 @@ static void AddFastCallStdCallSuffix(SmallVectorImpl<char> &OutName,
 /// and the specified global variable's name.  If the global variable doesn't
 /// have a name, this fills in a unique name for the global.
 void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
-                                const GlobalValue *GV,
-                                bool isImplicitlyPrivate) {
+                                const GlobalValue *GV, bool isImplicitlyPrivate,
+                                bool UseGlobalPrefix) {
   ManglerPrefixTy PrefixTy = Mangler::Default;
   if (GV->hasPrivateLinkage() || isImplicitlyPrivate)
     PrefixTy = Mangler::Private;
@@ -189,7 +99,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
   // If this global has a name, handle it simply.
   if (GV->hasName()) {
     StringRef Name = GV->getName();
-    getNameWithPrefix(OutName, Name, PrefixTy);
+    getNameWithPrefix(OutName, Name, PrefixTy, UseGlobalPrefix);
     // No need to do anything else if the global has the special "do not mangle"
     // flag in the name.
     if (Name[0] == 1)
@@ -201,12 +111,13 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
     if (ID == 0) ID = NextAnonGlobalID++;
   
     // Must mangle the global into a unique ID.
-    getNameWithPrefix(OutName, "__unnamed_" + Twine(ID), PrefixTy);
+    getNameWithPrefix(OutName, "__unnamed_" + Twine(ID), PrefixTy,
+                      UseGlobalPrefix);
   }
   
   // If we are supposed to add a microsoft-style suffix for stdcall/fastcall,
   // add it.
-  if (Context.getAsmInfo().hasMicrosoftFastStdCallMangling()) {
+  if (TM->getMCAsmInfo()->hasMicrosoftFastStdCallMangling()) {
     if (const Function *F = dyn_cast<Function>(GV)) {
       CallingConv::ID CC = F->getCallingConv();
     
@@ -226,17 +137,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
           // "Pure" variadic functions do not receive @0 suffix.
           (!FT->isVarArg() || FT->getNumParams() == 0 ||
            (FT->getNumParams() == 1 && F->hasStructRetAttr())))
-        AddFastCallStdCallSuffix(OutName, F, TD);
+        AddFastCallStdCallSuffix(OutName, F, *TM->getDataLayout());
     }
   }
 }
-
-/// getSymbol - Return the MCSymbol for the specified global value.  This
-/// symbol is the main label that is the address of the global.
-MCSymbol *Mangler::getSymbol(const GlobalValue *GV) {
-  SmallString<60> NameStr;
-  getNameWithPrefix(NameStr, GV, false);
-  return Context.GetOrCreateSymbol(NameStr.str());
-}
-
-
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 0795cb9..cdae6c2 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -9,6 +9,7 @@
 
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MipsRegisterInfo.h"
+#include "MipsTargetStreamer.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
@@ -20,26 +21,29 @@
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/ADT/APInt.h"
 
 using namespace llvm;
 
+namespace llvm {
+class MCInstrInfo;
+}
+
 namespace {
 class MipsAssemblerOptions {
 public:
-  MipsAssemblerOptions():
-    aTReg(1), reorder(true), macro(true) {
-  }
+  MipsAssemblerOptions() : aTReg(1), reorder(true), macro(true) {}
 
-  unsigned getATRegNum() {return aTReg;}
+  unsigned getATRegNum() { return aTReg; }
   bool setATReg(unsigned Reg);
 
-  bool isReorder() {return reorder;}
-  void setReorder() {reorder = true;}
-  void setNoreorder() {reorder = false;}
+  bool isReorder() { return reorder; }
+  void setReorder() { reorder = true; }
+  void setNoreorder() { reorder = false; }
 
-  bool isMacro() {return macro;}
-  void setMacro() {macro = true;}
-  void setNomacro() {macro = false;}
+  bool isMacro() { return macro; }
+  void setMacro() { macro = true; }
+  void setNomacro() { macro = false; }
 
 private:
   unsigned aTReg;
@@ -51,23 +55,21 @@ private:
 namespace {
 class MipsAsmParser : public MCTargetAsmParser {
 
-  enum FpFormatTy {
-    FP_FORMAT_NONE = -1,
-    FP_FORMAT_S,
-    FP_FORMAT_D,
-    FP_FORMAT_L,
-    FP_FORMAT_W
-  } FpFormat;
+  MipsTargetStreamer &getTargetStreamer() {
+    MCTargetStreamer &TS = Parser.getStreamer().getTargetStreamer();
+    return static_cast<MipsTargetStreamer &>(TS);
+  }
 
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
   MipsAssemblerOptions Options;
+  bool hasConsumedDollar;
 
 #define GET_ASSEMBLER_HEADER
 #include "MipsGenAsmMatcher.inc"
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
                                bool MatchingInlineAsm);
 
@@ -75,40 +77,98 @@ class MipsAsmParser : public MCTargetAsmParser {
 
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                         SMLoc NameLoc,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  bool parseMathOperation(StringRef Name, SMLoc NameLoc,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+                        SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   bool ParseDirective(AsmToken DirectiveID);
 
   MipsAsmParser::OperandMatchResultTy
-  parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands, int RegKind);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                   int RegKind);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMemOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  bool parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                   int RegKind);
+
+  MipsAsmParser::OperandMatchResultTy
+  parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseAFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseFGRH32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseCPURegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseCPU64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseHW64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   MipsAsmParser::OperandMatchResultTy
-  parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128BRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128HRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128WRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128DRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseMSA128CtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+
+  MipsAsmParser::OperandMatchResultTy
+  parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
   bool searchSymbolAlias(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                         unsigned RegisterClass);
+                         unsigned RegKind);
 
-  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &,
+  bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &,
                     StringRef Mnemonic);
 
   int tryParseRegister(bool is64BitReg);
 
-  bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+  bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                                bool is64BitReg);
 
   bool needsExpansion(MCInst &Inst);
@@ -122,17 +182,19 @@ class MipsAsmParser : public MCTargetAsmParser {
   void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
                             SmallVectorImpl<MCInst> &Instructions);
   void expandMemInst(MCInst &Inst, SMLoc IDLoc,
-                     SmallVectorImpl<MCInst> &Instructions,
-                     bool isLoad,bool isImmOpnd);
+                     SmallVectorImpl<MCInst> &Instructions, bool isLoad,
+                     bool isImmOpnd);
   bool reportParseError(StringRef ErrorMsg);
 
   bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
   bool parseRelocOperand(const MCExpr *&Res);
 
-  const MCExpr* evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
+  const MCExpr *evaluateRelocExpr(const MCExpr *Expr, StringRef RelocStr);
 
   bool isEvaluated(const MCExpr *Expr);
   bool parseDirectiveSet();
+  bool parseDirectiveMipsHackStocg();
+  bool parseDirectiveMipsHackELFFlags();
 
   bool parseSetAtDirective();
   bool parseSetNoAtDirective();
@@ -144,6 +206,7 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool parseSetAssignment();
 
   bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveGpWord();
 
   MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
 
@@ -155,40 +218,49 @@ class MipsAsmParser : public MCTargetAsmParser {
     return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
   }
 
+  bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
+
   int matchRegisterName(StringRef Symbol, bool is64BitReg);
 
   int matchCPURegisterName(StringRef Symbol);
 
   int matchRegisterByNumber(unsigned RegNum, unsigned RegClass);
 
-  void setFpFormat(FpFormatTy Format) {
-    FpFormat = Format;
-  }
+  int matchFPURegisterName(StringRef Name);
 
-  void setDefaultFpFormat();
+  int matchFCCRegisterName(StringRef Name);
 
-  void setFpFormat(StringRef Format);
+  int matchACRegisterName(StringRef Name);
 
-  FpFormatTy getFpFormat() {return FpFormat;}
+  int matchMSA128RegisterName(StringRef Name);
 
-  bool requestsDoubleOperand(StringRef Mnemonic);
+  int matchMSA128CtrlRegisterName(StringRef Name);
+
+  int regKindToRegClass(int RegKind);
 
   unsigned getReg(int RC, int RegNo);
 
   int getATReg();
 
   bool processInstruction(MCInst &Inst, SMLoc IDLoc,
-                        SmallVectorImpl<MCInst> &Instructions);
+                          SmallVectorImpl<MCInst> &Instructions);
+
+  // Helper function that checks if the value of a vector index is within the
+  // boundaries of accepted values for each RegisterKind
+  // Example: INSERT.B $w0[n], $1 => 16 > n >= 0
+  bool validateMSAIndex(int Val, int RegKind);
+
 public:
-  MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser(), STI(sti), Parser(parser) {
+  MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+                const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(sti), Parser(parser),
+        hasConsumedDollar(false) {
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
   MCAsmParser &getParser() const { return Parser; }
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
 };
 }
 
@@ -201,14 +273,24 @@ class MipsOperand : public MCParsedAsmOperand {
 public:
   enum RegisterKind {
     Kind_None,
-    Kind_CPURegs,
-    Kind_CPU64Regs,
+    Kind_GPR32,
+    Kind_GPR64,
     Kind_HWRegs,
-    Kind_HW64Regs,
     Kind_FGR32Regs,
+    Kind_FGRH32Regs,
     Kind_FGR64Regs,
     Kind_AFGR64Regs,
-    Kind_CCRRegs
+    Kind_CCRRegs,
+    Kind_FCCRegs,
+    Kind_ACC64DSP,
+    Kind_LO32DSP,
+    Kind_HI32DSP,
+    Kind_COP2,
+    Kind_MSA128BRegs,
+    Kind_MSA128HRegs,
+    Kind_MSA128WRegs,
+    Kind_MSA128DRegs,
+    Kind_MSA128CtrlRegs
   };
 
 private:
@@ -219,7 +301,9 @@ private:
     k_Memory,
     k_PostIndexRegister,
     k_Register,
-    k_Token
+    k_PtrReg,
+    k_Token,
+    k_LSAImm
   } Kind;
 
   MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -258,7 +342,12 @@ public:
     Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
-  void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+  void addPtrRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getPtrReg()));
+  }
+
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediate when possible.  Null MCExpr = 0.
     if (Expr == 0)
       Inst.addOperand(MCOperand::CreateImm(0));
@@ -287,6 +376,9 @@ public:
   bool isImm() const { return Kind == k_Immediate; }
   bool isToken() const { return Kind == k_Token; }
   bool isMem() const { return Kind == k_Memory; }
+  bool isPtrReg() const { return Kind == k_PtrReg; }
+  bool isInvNum() const { return Kind == k_Immediate; }
+  bool isLSAImm() const { return Kind == k_LSAImm; }
 
   StringRef getToken() const {
     assert(Kind == k_Token && "Invalid access!");
@@ -298,13 +390,18 @@ public:
     return Reg.RegNum;
   }
 
+  unsigned getPtrReg() const {
+    assert((Kind == k_PtrReg) && "Invalid access!");
+    return Reg.RegNum;
+  }
+
   void setRegKind(RegisterKind RegKind) {
-    assert((Kind == k_Register) && "Invalid access!");
+    assert((Kind == k_Register || Kind == k_PtrReg) && "Invalid access!");
     Reg.Kind = RegKind;
   }
 
   const MCExpr *getImm() const {
-    assert((Kind == k_Immediate) && "Invalid access!");
+    assert((Kind == k_Immediate || Kind == k_LSAImm) && "Invalid access!");
     return Imm.Val;
   }
 
@@ -335,6 +432,14 @@ public:
     return Op;
   }
 
+  static MipsOperand *CreatePtrReg(unsigned RegNum, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_PtrReg);
+    Op->Reg.RegNum = RegNum;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
     MipsOperand *Op = new MipsOperand(k_Immediate);
     Op->Imm.Val = Val;
@@ -343,8 +448,16 @@ public:
     return Op;
   }
 
+  static MipsOperand *CreateLSAImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+    MipsOperand *Op = new MipsOperand(k_LSAImm);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
+
   static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
-                                 SMLoc S, SMLoc E) {
+                                SMLoc S, SMLoc E) {
     MipsOperand *Op = new MipsOperand(k_Memory);
     Op->Mem.Base = Base;
     Op->Mem.Off = Off;
@@ -353,59 +466,91 @@ public:
     return Op;
   }
 
-  bool isCPURegsAsm() const {
-    return Kind == k_Register && Reg.Kind == Kind_CPURegs;
+  bool isGPR32Asm() const {
+    return Kind == k_Register && Reg.Kind == Kind_GPR32;
   }
-  void addCPURegsAsmOperands(MCInst &Inst, unsigned N) const {
+  void addRegAsmOperands(MCInst &Inst, unsigned N) const {
     Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
   }
 
-  bool isCPU64RegsAsm() const {
-    return Kind == k_Register && Reg.Kind == Kind_CPU64Regs;
-  }
-  void addCPU64RegsAsmOperands(MCInst &Inst, unsigned N) const {
-    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+  bool isGPR64Asm() const {
+    return Kind == k_Register && Reg.Kind == Kind_GPR64;
   }
 
   bool isHWRegsAsm() const {
     assert((Kind == k_Register) && "Invalid access!");
     return Reg.Kind == Kind_HWRegs;
   }
-  void addHWRegsAsmOperands(MCInst &Inst, unsigned N) const {
-    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
-  }
 
-  bool isHW64RegsAsm() const {
+  bool isCCRAsm() const {
     assert((Kind == k_Register) && "Invalid access!");
-    return Reg.Kind == Kind_HW64Regs;
+    return Reg.Kind == Kind_CCRRegs;
   }
-  void addHW64RegsAsmOperands(MCInst &Inst, unsigned N) const {
-    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+
+  bool isAFGR64Asm() const {
+    return Kind == k_Register && Reg.Kind == Kind_AFGR64Regs;
   }
 
-  void addCCRAsmOperands(MCInst &Inst, unsigned N) const {
-    Inst.addOperand(MCOperand::CreateReg(Reg.RegNum));
+  bool isFGR64Asm() const {
+    return Kind == k_Register && Reg.Kind == Kind_FGR64Regs;
   }
 
-  bool isCCRAsm() const {
-    assert((Kind == k_Register) && "Invalid access!");
-    return Reg.Kind == Kind_CCRRegs;
+  bool isFGR32Asm() const {
+    return (Kind == k_Register) && Reg.Kind == Kind_FGR32Regs;
   }
 
-  /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const {
-    return StartLoc;
+  bool isFGRH32Asm() const {
+    return (Kind == k_Register) && Reg.Kind == Kind_FGRH32Regs;
   }
-  /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const {
-    return EndLoc;
+
+  bool isFCCRegsAsm() const {
+    return (Kind == k_Register) && Reg.Kind == Kind_FCCRegs;
+  }
+
+  bool isACC64DSPAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_ACC64DSP;
+  }
+
+  bool isLO32DSPAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_LO32DSP;
+  }
+
+  bool isHI32DSPAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_HI32DSP;
+  }
+
+  bool isCOP2Asm() const { return Kind == k_Register && Reg.Kind == Kind_COP2; }
+
+  bool isMSA128BAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128BRegs;
+  }
+
+  bool isMSA128HAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128HRegs;
+  }
+
+  bool isMSA128WAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128WRegs;
+  }
+
+  bool isMSA128DAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128DRegs;
+  }
+
+  bool isMSA128CRAsm() const {
+    return Kind == k_Register && Reg.Kind == Kind_MSA128CtrlRegs;
   }
 
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+
   virtual void print(raw_ostream &OS) const {
     llvm_unreachable("unimplemented!");
   }
 }; // class MipsOperand
-}  // namespace
+} // namespace
 
 namespace llvm {
 extern const MCInstrDesc MipsInsts[];
@@ -436,8 +581,8 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
     // reference or immediate we may have to expand instructions.
     for (unsigned i = 0; i < MCID.getNumOperands(); i++) {
       const MCOperandInfo &OpInfo = MCID.OpInfo[i];
-      if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY)
-          || (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
+      if ((OpInfo.OperandType == MCOI::OPERAND_MEMORY) ||
+          (OpInfo.OperandType == MCOI::OPERAND_UNKNOWN)) {
         MCOperand &Op = Inst.getOperand(i);
         if (Op.isImm()) {
           int MemOffset = Op.getImm();
@@ -450,7 +595,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
           const MCExpr *Expr = Op.getExpr();
           if (Expr->getKind() == MCExpr::SymbolRef) {
             const MCSymbolRefExpr *SR =
-                static_cast<const MCSymbolRefExpr*>(Expr);
+                static_cast<const MCSymbolRefExpr *>(Expr);
             if (SR->getKind() == MCSymbolRefExpr::VK_None) {
               // Expand symbol.
               expandMemInst(Inst, IDLoc, Instructions, MCID.mayLoad(), false);
@@ -463,7 +608,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
         }
       }
     } // for
-  } // if load/store
+  }   // if load/store
 
   if (needsExpansion(Inst))
     expandInstruction(Inst, IDLoc, Instructions);
@@ -486,7 +631,7 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
 }
 
 void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
-                                       SmallVectorImpl<MCInst> &Instructions) {
+                                      SmallVectorImpl<MCInst> &Instructions) {
   switch (Inst.getOpcode()) {
   case Mips::LoadImm32Reg:
     return expandLoadImm(Inst, IDLoc, Instructions);
@@ -541,8 +686,9 @@ void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
   }
 }
 
-void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
-                                       SmallVectorImpl<MCInst> &Instructions) {
+void
+MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+                                    SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   const MCOperand &ImmOp = Inst.getOperand(2);
   assert(ImmOp.isImm() && "expected immediate operand kind");
@@ -583,8 +729,9 @@ void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
   }
 }
 
-void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
-                                       SmallVectorImpl<MCInst> &Instructions) {
+void
+MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+                                    SmallVectorImpl<MCInst> &Instructions) {
   MCInst tmpInst;
   const MCOperand &ImmOp = Inst.getOperand(1);
   assert(ImmOp.isImm() && "expected immediate operand kind");
@@ -617,14 +764,15 @@ void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
 }
 
 void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
-          SmallVectorImpl<MCInst> &Instructions, bool isLoad, bool isImmOpnd) {
+                                  SmallVectorImpl<MCInst> &Instructions,
+                                  bool isLoad, bool isImmOpnd) {
   const MCSymbolRefExpr *SR;
   MCInst TempInst;
   unsigned ImmOffset, HiOffset, LoOffset;
   const MCExpr *ExprOffset;
   unsigned TmpRegNum;
-  unsigned AtRegNum = getReg((isMips64()) ? Mips::CPU64RegsRegClassID
-                             : Mips::CPURegsRegClassID, getATReg());
+  unsigned AtRegNum = getReg(
+      (isMips64()) ? Mips::GPR64RegClassID : Mips::GPR32RegClassID, getATReg());
   // 1st operand is either the source or destination register.
   assert(Inst.getOperand(0).isReg() && "expected register operand kind");
   unsigned RegOpNum = Inst.getOperand(0).getReg();
@@ -654,7 +802,7 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
     TempInst.addOperand(MCOperand::CreateImm(HiOffset));
   else {
     if (ExprOffset->getKind() == MCExpr::SymbolRef) {
-      SR = static_cast<const MCSymbolRefExpr*>(ExprOffset);
+      SR = static_cast<const MCSymbolRefExpr *>(ExprOffset);
       const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
           SR->getSymbol().getName(), MCSymbolRefExpr::VK_Mips_ABS_HI,
           getContext());
@@ -697,15 +845,14 @@ void MipsAsmParser::expandMemInst(MCInst &Inst, SMLoc IDLoc,
   TempInst.clear();
 }
 
-bool MipsAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                        MCStreamer &Out, unsigned &ErrorInfo,
-                        bool MatchingInlineAsm) {
+bool MipsAsmParser::MatchAndEmitInstruction(
+    SMLoc IDLoc, unsigned &Opcode,
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands, MCStreamer &Out,
+    unsigned &ErrorInfo, bool MatchingInlineAsm) {
   MCInst Inst;
   SmallVector<MCInst, 8> Instructions;
-  unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
-                                              MatchingInlineAsm);
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
 
   switch (MatchResult) {
   default:
@@ -726,7 +873,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
       if (ErrorInfo >= Operands.size())
         return Error(IDLoc, "too few operands for instruction");
 
-      ErrorLoc = ((MipsOperand*) Operands[ErrorInfo])->getStartLoc();
+      ErrorLoc = ((MipsOperand *)Operands[ErrorInfo])->getStartLoc();
       if (ErrorLoc == SMLoc())
         ErrorLoc = IDLoc;
     }
@@ -740,44 +887,44 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 }
 
 int MipsAsmParser::matchCPURegisterName(StringRef Name) {
-   int CC;
+  int CC;
 
   if (Name == "at")
     return getATReg();
 
-    CC = StringSwitch<unsigned>(Name)
-    .Case("zero", 0)
-    .Case("a0",   4)
-    .Case("a1",   5)
-    .Case("a2",   6)
-    .Case("a3",   7)
-    .Case("v0",   2)
-    .Case("v1",   3)
-    .Case("s0",  16)
-    .Case("s1",  17)
-    .Case("s2",  18)
-    .Case("s3",  19)
-    .Case("s4",  20)
-    .Case("s5",  21)
-    .Case("s6",  22)
-    .Case("s7",  23)
-    .Case("k0",  26)
-    .Case("k1",  27)
-    .Case("sp",  29)
-    .Case("fp",  30)
-    .Case("gp",  28)
-    .Case("ra",  31)
-    .Case("t0",   8)
-    .Case("t1",   9)
-    .Case("t2",  10)
-    .Case("t3",  11)
-    .Case("t4",  12)
-    .Case("t5",  13)
-    .Case("t6",  14)
-    .Case("t7",  15)
-    .Case("t8",  24)
-    .Case("t9",  25)
-    .Default(-1);
+  CC = StringSwitch<unsigned>(Name)
+           .Case("zero", 0)
+           .Case("a0", 4)
+           .Case("a1", 5)
+           .Case("a2", 6)
+           .Case("a3", 7)
+           .Case("v0", 2)
+           .Case("v1", 3)
+           .Case("s0", 16)
+           .Case("s1", 17)
+           .Case("s2", 18)
+           .Case("s3", 19)
+           .Case("s4", 20)
+           .Case("s5", 21)
+           .Case("s6", 22)
+           .Case("s7", 23)
+           .Case("k0", 26)
+           .Case("k1", 27)
+           .Case("sp", 29)
+           .Case("fp", 30)
+           .Case("gp", 28)
+           .Case("ra", 31)
+           .Case("t0", 8)
+           .Case("t1", 9)
+           .Case("t2", 10)
+           .Case("t3", 11)
+           .Case("t4", 12)
+           .Case("t5", 13)
+           .Case("t6", 14)
+           .Case("t7", 15)
+           .Case("t8", 24)
+           .Case("t9", 25)
+           .Default(-1);
 
   // Although SGI documentation just cuts out t0-t3 for n32/n64,
   // GNU pushes the values of t0-t3 to override the o32/o64 values for t4-t7
@@ -787,83 +934,140 @@ int MipsAsmParser::matchCPURegisterName(StringRef Name) {
 
   if (CC == -1 && isMips64())
     CC = StringSwitch<unsigned>(Name)
-      .Case("a4",   8)
-      .Case("a5",   9)
-      .Case("a6",  10)
-      .Case("a7",  11)
-      .Case("kt0", 26)
-      .Case("kt1", 27)
-      .Case("s8",  30)
-      .Default(-1);
+             .Case("a4", 8)
+             .Case("a5", 9)
+             .Case("a6", 10)
+             .Case("a7", 11)
+             .Case("kt0", 26)
+             .Case("kt1", 27)
+             .Case("s8", 30)
+             .Default(-1);
 
   return CC;
 }
 
-int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
-
-  if (Name.equals("fcc0"))
-    return Mips::FCC0;
-
-  int CC;
-  CC = matchCPURegisterName(Name);
-  if (CC != -1)
-    return matchRegisterByNumber(CC, is64BitReg ? Mips::CPU64RegsRegClassID
-                                                : Mips::CPURegsRegClassID);
+int MipsAsmParser::matchFPURegisterName(StringRef Name) {
 
   if (Name[0] == 'f') {
     StringRef NumString = Name.substr(1);
     unsigned IntVal;
     if (NumString.getAsInteger(10, IntVal))
-      return -1; // This is not an integer.
-    if (IntVal > 31)
+      return -1;     // This is not an integer.
+    if (IntVal > 31) // Maximum index for fpu register.
       return -1;
+    return IntVal;
+  }
+  return -1;
+}
 
-    FpFormatTy Format = getFpFormat();
+int MipsAsmParser::matchFCCRegisterName(StringRef Name) {
 
-    if (Format == FP_FORMAT_S || Format == FP_FORMAT_W)
-      return getReg(Mips::FGR32RegClassID, IntVal);
-    if (Format == FP_FORMAT_D) {
-      if (isFP64()) {
-        return getReg(Mips::FGR64RegClassID, IntVal);
-      }
-      // Only even numbers available as register pairs.
-      if ((IntVal > 31) || (IntVal % 2 != 0))
-        return -1;
-      return getReg(Mips::AFGR64RegClassID, IntVal / 2);
-    }
+  if (Name.startswith("fcc")) {
+    StringRef NumString = Name.substr(3);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 7) // There are only 8 fcc registers.
+      return -1;
+    return IntVal;
   }
+  return -1;
+}
 
+int MipsAsmParser::matchACRegisterName(StringRef Name) {
+
+  if (Name.startswith("ac")) {
+    StringRef NumString = Name.substr(2);
+    unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1;    // This is not an integer.
+    if (IntVal > 3) // There are only 3 acc registers.
+      return -1;
+    return IntVal;
+  }
   return -1;
 }
 
-void MipsAsmParser::setDefaultFpFormat() {
+int MipsAsmParser::matchMSA128RegisterName(StringRef Name) {
+  unsigned IntVal;
 
-  if (isMips64() || isFP64())
-    FpFormat = FP_FORMAT_D;
-  else
-    FpFormat = FP_FORMAT_S;
+  if (Name.front() != 'w' || Name.drop_front(1).getAsInteger(10, IntVal))
+    return -1;
+
+  if (IntVal > 31)
+    return -1;
+
+  return IntVal;
 }
 
-bool MipsAsmParser::requestsDoubleOperand(StringRef Mnemonic){
+int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
+  int CC;
 
-  bool IsDouble = StringSwitch<bool>(Mnemonic.lower())
-    .Case("ldxc1", true)
-    .Case("ldc1",  true)
-    .Case("sdxc1", true)
-    .Case("sdc1",  true)
-    .Default(false);
+  CC = StringSwitch<unsigned>(Name)
+           .Case("msair", 0)
+           .Case("msacsr", 1)
+           .Case("msaaccess", 2)
+           .Case("msasave", 3)
+           .Case("msamodify", 4)
+           .Case("msarequest", 5)
+           .Case("msamap", 6)
+           .Case("msaunmap", 7)
+           .Default(-1);
 
-  return IsDouble;
+  return CC;
 }
 
-void MipsAsmParser::setFpFormat(StringRef Format) {
+int MipsAsmParser::matchRegisterName(StringRef Name, bool is64BitReg) {
 
-  FpFormat = StringSwitch<FpFormatTy>(Format.lower())
-    .Case(".s",  FP_FORMAT_S)
-    .Case(".d",  FP_FORMAT_D)
-    .Case(".l",  FP_FORMAT_L)
-    .Case(".w",  FP_FORMAT_W)
-    .Default(FP_FORMAT_NONE);
+  int CC;
+  CC = matchCPURegisterName(Name);
+  if (CC != -1)
+    return matchRegisterByNumber(CC, is64BitReg ? Mips::GPR64RegClassID
+                                                : Mips::GPR32RegClassID);
+  CC = matchFPURegisterName(Name);
+  // TODO: decide about fpu register class
+  if (CC != -1)
+    return matchRegisterByNumber(CC, isFP64() ? Mips::FGR64RegClassID
+                                              : Mips::FGR32RegClassID);
+  return matchMSA128RegisterName(Name);
+}
+
+int MipsAsmParser::regKindToRegClass(int RegKind) {
+
+  switch (RegKind) {
+  case MipsOperand::Kind_GPR32:
+    return Mips::GPR32RegClassID;
+  case MipsOperand::Kind_GPR64:
+    return Mips::GPR64RegClassID;
+  case MipsOperand::Kind_HWRegs:
+    return Mips::HWRegsRegClassID;
+  case MipsOperand::Kind_FGR32Regs:
+    return Mips::FGR32RegClassID;
+  case MipsOperand::Kind_FGRH32Regs:
+    return Mips::FGRH32RegClassID;
+  case MipsOperand::Kind_FGR64Regs:
+    return Mips::FGR64RegClassID;
+  case MipsOperand::Kind_AFGR64Regs:
+    return Mips::AFGR64RegClassID;
+  case MipsOperand::Kind_CCRRegs:
+    return Mips::CCRRegClassID;
+  case MipsOperand::Kind_ACC64DSP:
+    return Mips::ACC64DSPRegClassID;
+  case MipsOperand::Kind_FCCRegs:
+    return Mips::FCCRegClassID;
+  case MipsOperand::Kind_MSA128BRegs:
+    return Mips::MSA128BRegClassID;
+  case MipsOperand::Kind_MSA128HRegs:
+    return Mips::MSA128HRegClassID;
+  case MipsOperand::Kind_MSA128WRegs:
+    return Mips::MSA128WRegClassID;
+  case MipsOperand::Kind_MSA128DRegs:
+    return Mips::MSA128DRegClassID;
+  case MipsOperand::Kind_MSA128CtrlRegs:
+    return Mips::MSACtrlRegClassID;
+  default:
+    return -1;
+  }
 }
 
 bool MipsAssemblerOptions::setATReg(unsigned Reg) {
@@ -874,17 +1078,15 @@ bool MipsAssemblerOptions::setATReg(unsigned Reg) {
   return true;
 }
 
-int MipsAsmParser::getATReg() {
-  return Options.getATRegNum();
-}
+int MipsAsmParser::getATReg() { return Options.getATRegNum(); }
 
 unsigned MipsAsmParser::getReg(int RC, int RegNo) {
-  return *(getContext().getRegisterInfo().getRegClass(RC).begin() + RegNo);
+  return *(getContext().getRegisterInfo()->getRegClass(RC).begin() + RegNo);
 }
 
 int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, unsigned RegClass) {
-
-  if (RegNum > 31)
+  if (RegNum >
+      getContext().getRegisterInfo()->getRegClass(RegClass).getNumRegs())
     return -1;
 
   return getReg(RegClass, RegNum);
@@ -899,12 +1101,13 @@ int MipsAsmParser::tryParseRegister(bool is64BitReg) {
     RegNum = matchRegisterName(lowerCase, is64BitReg);
   } else if (Tok.is(AsmToken::Integer))
     RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()),
-        is64BitReg ? Mips::CPU64RegsRegClassID : Mips::CPURegsRegClassID);
+                                   is64BitReg ? Mips::GPR64RegClassID
+                                              : Mips::GPR32RegClassID);
   return RegNum;
 }
 
 bool MipsAsmParser::tryParseRegisterOperand(
-             SmallVectorImpl<MCParsedAsmOperand*> &Operands, bool is64BitReg) {
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands, bool is64BitReg) {
 
   SMLoc S = Parser.getTok().getLoc();
   int RegNo = -1;
@@ -913,14 +1116,15 @@ bool MipsAsmParser::tryParseRegisterOperand(
   if (RegNo == -1)
     return true;
 
-  Operands.push_back(MipsOperand::CreateReg(RegNo, S,
-                                            Parser.getTok().getLoc()));
+  Operands.push_back(
+      MipsOperand::CreateReg(RegNo, S, Parser.getTok().getLoc()));
   Parser.Lex(); // Eat register token.
   return false;
 }
 
-bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
-                                 StringRef Mnemonic) {
+bool
+MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                            StringRef Mnemonic) {
   // Check if the current operand has a custom associated parser, if so, try to
   // custom parse the operand, or fallback to the general approach.
   OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
@@ -968,22 +1172,39 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
       return true;
 
     SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-
     MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
-
     // Otherwise create a symbol reference.
-    const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
-                                                getContext());
+    const MCExpr *Res =
+        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
 
     Operands.push_back(MipsOperand::CreateImm(Res, S, E));
     return false;
   }
   case AsmToken::Identifier:
+    // For instruction aliases like "bc1f $Label" dedicated parser will
+    // eat the '$' sign before failing. So in order to look for appropriate
+    // label we must check first if we have already consumed '$'.
+    if (hasConsumedDollar) {
+      hasConsumedDollar = false;
+      SMLoc S = Parser.getTok().getLoc();
+      StringRef Identifier;
+      if (Parser.parseIdentifier(Identifier))
+        return true;
+      SMLoc E =
+          SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+      MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+      // Create a symbol reference.
+      const MCExpr *Res =
+          MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, getContext());
+
+      Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+      return false;
+    }
     // Look for the existing symbol, we should check if
     // we need to assigne the propper RegisterKind.
     if (searchSymbolAlias(Operands, MipsOperand::Kind_None))
       return false;
-    // Else drop to expression parsing.
+  // Else drop to expression parsing.
   case AsmToken::LParen:
   case AsmToken::Minus:
   case AsmToken::Plus:
@@ -1014,7 +1235,7 @@ bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
   return true;
 }
 
-const MCExpr* MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
+const MCExpr *MipsAsmParser::evaluateRelocExpr(const MCExpr *Expr,
                                                StringRef RelocStr) {
   const MCExpr *Res;
   // Check the type of the expression.
@@ -1084,7 +1305,7 @@ bool MipsAsmParser::isEvaluated(const MCExpr *Expr) {
 }
 
 bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
-  Parser.Lex(); // Eat the % token.
+  Parser.Lex();                          // Eat the % token.
   const AsmToken &Tok = Parser.getTok(); // Get next token, operation.
   if (Tok.isNot(AsmToken::Identifier))
     return true;
@@ -1130,7 +1351,7 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
   StartLoc = Parser.getTok().getLoc();
   RegNo = tryParseRegister(isMips64());
   EndLoc = Parser.getTok().getLoc();
-  return (RegNo == (unsigned) -1);
+  return (RegNo == (unsigned)-1);
 }
 
 bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
@@ -1162,11 +1383,12 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
 }
 
 MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*>&Operands) {
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
 
   const MCExpr *IdVal = 0;
   SMLoc S;
   bool isParenExpr = false;
+  MipsAsmParser::OperandMatchResultTy Res = MatchOperand_NoMatch;
   // First operand is the offset.
   S = Parser.getTok().getLoc();
 
@@ -1181,21 +1403,20 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
 
     const AsmToken &Tok = Parser.getTok(); // Get the next token.
     if (Tok.isNot(AsmToken::LParen)) {
-      MipsOperand *Mnemonic = static_cast<MipsOperand*>(Operands[0]);
+      MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
       if (Mnemonic->getToken() == "la") {
-        SMLoc E = SMLoc::getFromPointer(
-            Parser.getTok().getLoc().getPointer() - 1);
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
         Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
         return MatchOperand_Success;
       }
       if (Tok.is(AsmToken::EndOfStatement)) {
-        SMLoc E = SMLoc::getFromPointer(
-            Parser.getTok().getLoc().getPointer() - 1);
+        SMLoc E =
+            SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
 
         // Zero register assumed, add a memory operand with ZERO as its base.
-        Operands.push_back(MipsOperand::CreateMem(isMips64() ? Mips::ZERO_64
-                                                             : Mips::ZERO,
-                           IdVal, S, E));
+        Operands.push_back(MipsOperand::CreateMem(
+            isMips64() ? Mips::ZERO_64 : Mips::ZERO, IdVal, S, E));
         return MatchOperand_Success;
       }
       Error(Parser.getTok().getLoc(), "'(' expected");
@@ -1205,21 +1426,12 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
     Parser.Lex(); // Eat the '(' token.
   }
 
-  const AsmToken &Tok1 = Parser.getTok(); // Get next token
-  if (Tok1.is(AsmToken::Dollar)) {
-    Parser.Lex(); // Eat the '$' token.
-    if (tryParseRegisterOperand(Operands, isMips64())) {
-      Error(Parser.getTok().getLoc(), "unexpected token in operand");
-      return MatchOperand_ParseFail;
-    }
-
-  } else {
-    Error(Parser.getTok().getLoc(), "unexpected token in operand");
-    return MatchOperand_ParseFail;
-  }
+  Res = parseRegs(Operands, isMips64() ? (int)MipsOperand::Kind_GPR64
+                                       : (int)MipsOperand::Kind_GPR32);
+  if (Res != MatchOperand_Success)
+    return Res;
 
-  const AsmToken &Tok2 = Parser.getTok(); // Get next token.
-  if (Tok2.isNot(AsmToken::RParen)) {
+  if (Parser.getTok().isNot(AsmToken::RParen)) {
     Error(Parser.getTok().getLoc(), "')' expected");
     return MatchOperand_ParseFail;
   }
@@ -1232,7 +1444,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
     IdVal = MCConstantExpr::Create(0, getContext());
 
   // Replace the register operand with the memory operand.
-  MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+  MipsOperand *op = static_cast<MipsOperand *>(Operands.back());
   int RegNo = op->getReg();
   // Remove the register from the operands.
   Operands.pop_back();
@@ -1251,336 +1463,694 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
   return MatchOperand_Success;
 }
 
+bool MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                                int RegKind) {
+  // If the first token is not '$' we have an error.
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return false;
+
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex();
+  AsmToken::TokenKind TkKind = getLexer().getKind();
+  int Reg;
+
+  if (TkKind == AsmToken::Integer) {
+    Reg = matchRegisterByNumber(Parser.getTok().getIntVal(),
+                                regKindToRegClass(RegKind));
+    if (Reg == -1)
+      return false;
+  } else if (TkKind == AsmToken::Identifier) {
+    if ((Reg = matchCPURegisterName(Parser.getTok().getString().lower())) == -1)
+      return false;
+    Reg = getReg(regKindToRegClass(RegKind), Reg);
+  } else {
+    return false;
+  }
+
+  MipsOperand *Op = MipsOperand::CreatePtrReg(Reg, S, Parser.getTok().getLoc());
+  Op->setRegKind((MipsOperand::RegisterKind)RegKind);
+  Operands.push_back(Op);
+  Parser.Lex();
+  return true;
+}
+
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseCPU64Regs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parsePtrReg(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  MipsOperand::RegisterKind RegKind =
+      isN64() ? MipsOperand::Kind_GPR64 : MipsOperand::Kind_GPR32;
 
-  if (!isMips64())
+  // Parse index register.
+  if (!parsePtrReg(Operands, RegKind))
     return MatchOperand_NoMatch;
-  if (getLexer().getKind() == AsmToken::Identifier) {
-    if (searchSymbolAlias(Operands, MipsOperand::Kind_CPU64Regs))
+
+  // Parse '('.
+  if (Parser.getTok().isNot(AsmToken::LParen))
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
+  Parser.Lex();
+
+  // Parse base register.
+  if (!parsePtrReg(Operands, RegKind))
+    return MatchOperand_NoMatch;
+
+  // Parse ')'.
+  if (Parser.getTok().isNot(AsmToken::RParen))
+    return MatchOperand_NoMatch;
+
+  Operands.push_back(MipsOperand::CreateToken(")", getLexer().getLoc()));
+  Parser.Lex();
+
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                         int RegKind) {
+  MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+  if (getLexer().getKind() == AsmToken::Identifier && !hasConsumedDollar) {
+    if (searchSymbolAlias(Operands, Kind))
       return MatchOperand_Success;
     return MatchOperand_NoMatch;
   }
+  SMLoc S = Parser.getTok().getLoc();
   // If the first token is not '$', we have an error.
-  if (Parser.getTok().isNot(AsmToken::Dollar))
+  if (Parser.getTok().isNot(AsmToken::Dollar) && !hasConsumedDollar)
     return MatchOperand_NoMatch;
-
-  Parser.Lex(); // Eat $
-  if (!tryParseRegisterOperand(Operands, true)) {
-    // Set the proper register kind.
-    MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
-    op->setRegKind(MipsOperand::Kind_CPU64Regs);
+  if (!hasConsumedDollar) {
+    Parser.Lex(); // Eat the '$'
+    hasConsumedDollar = true;
+  }
+  if (getLexer().getKind() == AsmToken::Identifier) {
+    int RegNum = -1;
+    std::string RegName = Parser.getTok().getString().lower();
+    // Match register by name
+    switch (RegKind) {
+    case MipsOperand::Kind_GPR32:
+    case MipsOperand::Kind_GPR64:
+      RegNum = matchCPURegisterName(RegName);
+      break;
+    case MipsOperand::Kind_AFGR64Regs:
+    case MipsOperand::Kind_FGR64Regs:
+    case MipsOperand::Kind_FGR32Regs:
+    case MipsOperand::Kind_FGRH32Regs:
+      RegNum = matchFPURegisterName(RegName);
+      if (RegKind == MipsOperand::Kind_AFGR64Regs)
+        RegNum /= 2;
+      else if (RegKind == MipsOperand::Kind_FGRH32Regs && !isFP64())
+        if (RegNum != -1 && RegNum % 2 != 0)
+          Warning(S, "Float register should be even.");
+      break;
+    case MipsOperand::Kind_FCCRegs:
+      RegNum = matchFCCRegisterName(RegName);
+      break;
+    case MipsOperand::Kind_ACC64DSP:
+      RegNum = matchACRegisterName(RegName);
+      break;
+    default:
+      break; // No match, value is set to -1.
+    }
+    // No match found, return _NoMatch to give a chance to other round.
+    if (RegNum < 0)
+      return MatchOperand_NoMatch;
+
+    int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+    if (RegVal == -1)
+      return MatchOperand_NoMatch;
+
+    MipsOperand *Op =
+        MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+    Op->setRegKind(Kind);
+    Operands.push_back(Op);
+    hasConsumedDollar = false;
+    Parser.Lex(); // Eat the register name.
+    return MatchOperand_Success;
+  } else if (getLexer().getKind() == AsmToken::Integer) {
+    unsigned RegNum = Parser.getTok().getIntVal();
+    if (Kind == MipsOperand::Kind_HWRegs) {
+      if (RegNum != 29)
+        return MatchOperand_NoMatch;
+      // Only hwreg 29 is supported, found at index 0.
+      RegNum = 0;
+    }
+    int Reg = matchRegisterByNumber(RegNum, regKindToRegClass(Kind));
+    if (Reg == -1)
+      return MatchOperand_NoMatch;
+    MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+    Op->setRegKind(Kind);
+    Operands.push_back(Op);
+    hasConsumedDollar = false;
+    Parser.Lex(); // Eat the register number.
+    if ((RegKind == MipsOperand::Kind_GPR32) &&
+        (getLexer().is(AsmToken::LParen))) {
+      // Check if it is indexed addressing operand.
+      Operands.push_back(MipsOperand::CreateToken("(", getLexer().getLoc()));
+      Parser.Lex(); // Eat the parenthesis.
+      if (parseRegs(Operands, RegKind) != MatchOperand_Success)
+        return MatchOperand_NoMatch;
+      if (getLexer().isNot(AsmToken::RParen))
+        return MatchOperand_NoMatch;
+      Operands.push_back(MipsOperand::CreateToken(")", getLexer().getLoc()));
+      Parser.Lex();
+    }
     return MatchOperand_Success;
   }
   return MatchOperand_NoMatch;
 }
 
-bool MipsAsmParser::searchSymbolAlias(
-    SmallVectorImpl<MCParsedAsmOperand*> &Operands, unsigned RegisterKind) {
+bool MipsAsmParser::validateMSAIndex(int Val, int RegKind) {
+  MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
 
-  MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
-  if (Sym) {
-    SMLoc S = Parser.getTok().getLoc();
-    const MCExpr *Expr;
-    if (Sym->isVariable())
-      Expr = Sym->getVariableValue();
-    else
-      return false;
-    if (Expr->getKind() == MCExpr::SymbolRef) {
-      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
-      const StringRef DefSymbol = Ref->getSymbol().getName();
-      if (DefSymbol.startswith("$")) {
-        // Lookup for the register with the corresponding name.
-        int RegNum = matchRegisterName(DefSymbol.substr(1), isMips64());
-        if (RegNum > -1) {
-          Parser.Lex();
-          MipsOperand *op = MipsOperand::CreateReg(RegNum, S,
-                                                   Parser.getTok().getLoc());
-          op->setRegKind((MipsOperand::RegisterKind) RegisterKind);
-          Operands.push_back(op);
-          return true;
-        }
-      }
-    } else if (Expr->getKind() == MCExpr::Constant) {
-      Parser.Lex();
-      const MCConstantExpr *Const = static_cast<const MCConstantExpr*>(Expr);
-      MipsOperand *op = MipsOperand::CreateImm(Const, S,
-          Parser.getTok().getLoc());
-      Operands.push_back(op);
-      return true;
-    }
+  if (Val < 0)
+    return false;
+
+  switch (Kind) {
+  default:
+    return false;
+  case MipsOperand::Kind_MSA128BRegs:
+    return Val < 16;
+  case MipsOperand::Kind_MSA128HRegs:
+    return Val < 8;
+  case MipsOperand::Kind_MSA128WRegs:
+    return Val < 4;
+  case MipsOperand::Kind_MSA128DRegs:
+    return Val < 2;
   }
-  return false;
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseCPURegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseMSARegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                            int RegKind) {
+  MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+  SMLoc S = Parser.getTok().getLoc();
+  std::string RegName;
 
-  if (getLexer().getKind() == AsmToken::Identifier) {
-    if (searchSymbolAlias(Operands, MipsOperand::Kind_CPURegs))
-      return MatchOperand_Success;
-    return MatchOperand_NoMatch;
-  }
-  // If the first token is not '$' we have an error.
   if (Parser.getTok().isNot(AsmToken::Dollar))
     return MatchOperand_NoMatch;
 
-  Parser.Lex(); // Eat $
-  if (!tryParseRegisterOperand(Operands, false)) {
-    // Set the proper register kind.
-    MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
-    op->setRegKind(MipsOperand::Kind_CPURegs);
+  switch (RegKind) {
+  default:
+    return MatchOperand_ParseFail;
+  case MipsOperand::Kind_MSA128BRegs:
+  case MipsOperand::Kind_MSA128HRegs:
+  case MipsOperand::Kind_MSA128WRegs:
+  case MipsOperand::Kind_MSA128DRegs:
+    break;
+  }
+
+  Parser.Lex(); // Eat the '$'.
+  if (getLexer().getKind() == AsmToken::Identifier)
+    RegName = Parser.getTok().getString().lower();
+  else
+    return MatchOperand_ParseFail;
+
+  int RegNum = matchMSA128RegisterName(RegName);
+
+  if (RegNum < 0 || RegNum > 31)
+    return MatchOperand_ParseFail;
+
+  int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+  if (RegVal == -1)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *Op = MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+  Op->setRegKind(Kind);
+  Operands.push_back(Op);
+
+  Parser.Lex(); // Eat the register identifier.
+
+  // MSA registers may be suffixed with an index in the form of:
+  // 1) Immediate expression.
+  // 2) General Purpose Register.
+  // Examples:
+  //   1) copy_s.b $29,$w0[0]
+  //   2) sld.b $w0,$w1[$1]
+
+  if (Parser.getTok().isNot(AsmToken::LBrac))
+    return MatchOperand_Success;
+
+  MipsOperand *Mnemonic = static_cast<MipsOperand *>(Operands[0]);
+
+  Operands.push_back(MipsOperand::CreateToken("[", Parser.getTok().getLoc()));
+  Parser.Lex(); // Parse the '[' token.
+
+  if (Parser.getTok().is(AsmToken::Dollar)) {
+    // This must be a GPR.
+    MipsOperand *RegOp;
+    SMLoc VIdx = Parser.getTok().getLoc();
+    Parser.Lex(); // Parse the '$' token.
+
+    // GPR have aliases and we must account for that. Example: $30 == $fp
+    if (getLexer().getKind() == AsmToken::Integer) {
+      unsigned RegNum = Parser.getTok().getIntVal();
+      int Reg = matchRegisterByNumber(
+          RegNum, regKindToRegClass(MipsOperand::Kind_GPR32));
+      if (Reg == -1) {
+        Error(VIdx, "invalid general purpose register");
+        return MatchOperand_ParseFail;
+      }
+
+      RegOp = MipsOperand::CreateReg(Reg, VIdx, Parser.getTok().getLoc());
+    } else if (getLexer().getKind() == AsmToken::Identifier) {
+      int RegNum = -1;
+      std::string RegName = Parser.getTok().getString().lower();
+
+      RegNum = matchCPURegisterName(RegName);
+      if (RegNum == -1) {
+        Error(VIdx, "general purpose register expected");
+        return MatchOperand_ParseFail;
+      }
+      RegNum = getReg(regKindToRegClass(MipsOperand::Kind_GPR32), RegNum);
+      RegOp = MipsOperand::CreateReg(RegNum, VIdx, Parser.getTok().getLoc());
+    } else
+      return MatchOperand_ParseFail;
+
+    RegOp->setRegKind(MipsOperand::Kind_GPR32);
+    Operands.push_back(RegOp);
+    Parser.Lex(); // Eat the register identifier.
+
+    if (Parser.getTok().isNot(AsmToken::RBrac))
+      return MatchOperand_ParseFail;
+
+    Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
+    Parser.Lex(); // Parse the ']' token.
+
     return MatchOperand_Success;
   }
-  return MatchOperand_NoMatch;
+
+  // The index must be a constant expression then.
+  SMLoc VIdx = Parser.getTok().getLoc();
+  const MCExpr *ImmVal;
+
+  if (getParser().parseExpression(ImmVal))
+    return MatchOperand_ParseFail;
+
+  const MCConstantExpr *expr = dyn_cast<MCConstantExpr>(ImmVal);
+  if (!expr || !validateMSAIndex((int)expr->getValue(), Kind)) {
+    Error(VIdx, "invalid immediate value");
+    return MatchOperand_ParseFail;
+  }
+
+  SMLoc E = Parser.getTok().getEndLoc();
+
+  if (Parser.getTok().isNot(AsmToken::RBrac))
+    return MatchOperand_ParseFail;
+
+  bool insve =
+      Mnemonic->getToken() == "insve.b" || Mnemonic->getToken() == "insve.h" ||
+      Mnemonic->getToken() == "insve.w" || Mnemonic->getToken() == "insve.d";
+
+  // The second vector index of insve instructions is always 0.
+  if (insve && Operands.size() > 6) {
+    if (expr->getValue() != 0) {
+      Error(VIdx, "immediate value must be 0");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(MipsOperand::CreateToken("0", VIdx));
+  } else
+    Operands.push_back(MipsOperand::CreateImm(expr, VIdx, E));
+
+  Operands.push_back(MipsOperand::CreateToken("]", Parser.getTok().getLoc()));
+
+  Parser.Lex(); // Parse the ']' token.
+
+  return MatchOperand_Success;
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseMSACtrlRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands,
+                                int RegKind) {
+  MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
 
-  if (isMips64())
+  if (Kind != MipsOperand::Kind_MSA128CtrlRegs)
     return MatchOperand_NoMatch;
 
-  // If the first token is not '$' we have error.
   if (Parser.getTok().isNot(AsmToken::Dollar))
-    return MatchOperand_NoMatch;
+    return MatchOperand_ParseFail;
+
   SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat the '$'.
 
-  const AsmToken &Tok = Parser.getTok(); // Get the next token.
-  if (Tok.isNot(AsmToken::Integer))
-    return MatchOperand_NoMatch;
+  Parser.Lex(); // Eat the '$' symbol.
 
-  unsigned RegNum = Tok.getIntVal();
-  // At the moment only hwreg29 is supported.
-  if (RegNum != 29)
+  int RegNum = -1;
+  if (getLexer().getKind() == AsmToken::Identifier)
+    RegNum = matchMSA128CtrlRegisterName(Parser.getTok().getString().lower());
+  else if (getLexer().getKind() == AsmToken::Integer)
+    RegNum = Parser.getTok().getIntVal();
+  else
     return MatchOperand_ParseFail;
 
-  MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29, S,
-      Parser.getTok().getLoc());
-  op->setRegKind(MipsOperand::Kind_HWRegs);
-  Operands.push_back(op);
+  if (RegNum < 0 || RegNum > 7)
+    return MatchOperand_ParseFail;
+
+  int RegVal = getReg(regKindToRegClass(Kind), RegNum);
+  if (RegVal == -1)
+    return MatchOperand_ParseFail;
+
+  MipsOperand *RegOp =
+      MipsOperand::CreateReg(RegVal, S, Parser.getTok().getLoc());
+  RegOp->setRegKind(MipsOperand::Kind_MSA128CtrlRegs);
+  Operands.push_back(RegOp);
+  Parser.Lex(); // Eat the register identifier.
 
-  Parser.Lex(); // Eat the register number.
   return MatchOperand_Success;
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseHW64Regs(
-    SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+MipsAsmParser::parseGPR64(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
 
   if (!isMips64())
     return MatchOperand_NoMatch;
+  return parseRegs(Operands, (int)MipsOperand::Kind_GPR64);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseGPR32(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_GPR32);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseAFGR64Regs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+
+  if (isFP64())
+    return MatchOperand_NoMatch;
+  return parseRegs(Operands, (int)MipsOperand::Kind_AFGR64Regs);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseFGR64Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  if (!isFP64())
+    return MatchOperand_NoMatch;
+  return parseRegs(Operands, (int)MipsOperand::Kind_FGR64Regs);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseFGR32Regs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_FGR32Regs);
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseFGRH32Regs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_FGRH32Regs);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseFCCRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_FCCRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseACC64DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_ACC64DSP);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseLO32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
   // If the first token is not '$' we have an error.
   if (Parser.getTok().isNot(AsmToken::Dollar))
     return MatchOperand_NoMatch;
+
   SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat $
+  Parser.Lex(); // Eat the '$'
 
-  const AsmToken &Tok = Parser.getTok(); // Get the next token.
-  if (Tok.isNot(AsmToken::Integer))
+  const AsmToken &Tok = Parser.getTok(); // Get next token.
+
+  if (Tok.isNot(AsmToken::Identifier))
     return MatchOperand_NoMatch;
 
-  unsigned RegNum = Tok.getIntVal();
-  // At the moment only hwreg29 is supported.
-  if (RegNum != 29)
-    return MatchOperand_ParseFail;
+  if (!Tok.getIdentifier().startswith("ac"))
+    return MatchOperand_NoMatch;
+
+  StringRef NumString = Tok.getIdentifier().substr(2);
+
+  unsigned IntVal;
+  if (NumString.getAsInteger(10, IntVal))
+    return MatchOperand_NoMatch;
 
-  MipsOperand *op = MipsOperand::CreateReg(Mips::HWR29_64, S,
-                                           Parser.getTok().getLoc());
-  op->setRegKind(MipsOperand::Kind_HW64Regs);
-  Operands.push_back(op);
+  unsigned Reg = matchRegisterByNumber(IntVal, Mips::LO32DSPRegClassID);
+
+  MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+  Op->setRegKind(MipsOperand::Kind_LO32DSP);
+  Operands.push_back(Op);
 
   Parser.Lex(); // Eat the register number.
   return MatchOperand_Success;
 }
 
 MipsAsmParser::OperandMatchResultTy
-MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  unsigned RegNum;
+MipsAsmParser::parseHI32DSP(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
   // If the first token is not '$' we have an error.
   if (Parser.getTok().isNot(AsmToken::Dollar))
     return MatchOperand_NoMatch;
+
   SMLoc S = Parser.getTok().getLoc();
   Parser.Lex(); // Eat the '$'
 
   const AsmToken &Tok = Parser.getTok(); // Get next token.
-  if (Tok.is(AsmToken::Integer)) {
-    RegNum = Tok.getIntVal();
-    // At the moment only fcc0 is supported.
-    if (RegNum != 0)
-      return MatchOperand_ParseFail;
-  } else if (Tok.is(AsmToken::Identifier)) {
-    // At the moment only fcc0 is supported.
-    if (Tok.getIdentifier() != "fcc0")
-      return MatchOperand_ParseFail;
-  } else
+
+  if (Tok.isNot(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
+
+  if (!Tok.getIdentifier().startswith("ac"))
+    return MatchOperand_NoMatch;
+
+  StringRef NumString = Tok.getIdentifier().substr(2);
+
+  unsigned IntVal;
+  if (NumString.getAsInteger(10, IntVal))
     return MatchOperand_NoMatch;
 
-  MipsOperand *op = MipsOperand::CreateReg(Mips::FCC0, S,
-                                           Parser.getTok().getLoc());
-  op->setRegKind(MipsOperand::Kind_CCRRegs);
-  Operands.push_back(op);
+  unsigned Reg = matchRegisterByNumber(IntVal, Mips::HI32DSPRegClassID);
+
+  MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+  Op->setRegKind(MipsOperand::Kind_HI32DSP);
+  Operands.push_back(Op);
 
   Parser.Lex(); // Eat the register number.
   return MatchOperand_Success;
 }
 
-MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseCOP2(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  // If the first token is not '$' we have an error.
+  if (Parser.getTok().isNot(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
 
-  MCSymbolRefExpr::VariantKind VK
-                   = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
-    .Case("hi",          MCSymbolRefExpr::VK_Mips_ABS_HI)
-    .Case("lo",          MCSymbolRefExpr::VK_Mips_ABS_LO)
-    .Case("gp_rel",      MCSymbolRefExpr::VK_Mips_GPREL)
-    .Case("call16",      MCSymbolRefExpr::VK_Mips_GOT_CALL)
-    .Case("got",         MCSymbolRefExpr::VK_Mips_GOT)
-    .Case("tlsgd",       MCSymbolRefExpr::VK_Mips_TLSGD)
-    .Case("tlsldm",      MCSymbolRefExpr::VK_Mips_TLSLDM)
-    .Case("dtprel_hi",   MCSymbolRefExpr::VK_Mips_DTPREL_HI)
-    .Case("dtprel_lo",   MCSymbolRefExpr::VK_Mips_DTPREL_LO)
-    .Case("gottprel",    MCSymbolRefExpr::VK_Mips_GOTTPREL)
-    .Case("tprel_hi",    MCSymbolRefExpr::VK_Mips_TPREL_HI)
-    .Case("tprel_lo",    MCSymbolRefExpr::VK_Mips_TPREL_LO)
-    .Case("got_disp",    MCSymbolRefExpr::VK_Mips_GOT_DISP)
-    .Case("got_page",    MCSymbolRefExpr::VK_Mips_GOT_PAGE)
-    .Case("got_ofst",    MCSymbolRefExpr::VK_Mips_GOT_OFST)
-    .Case("hi(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_HI)
-    .Case("lo(%neg(%gp_rel",    MCSymbolRefExpr::VK_Mips_GPOFF_LO)
-    .Default(MCSymbolRefExpr::VK_None);
+  SMLoc S = Parser.getTok().getLoc();
+  Parser.Lex(); // Eat the '$'
 
-  return VK;
+  const AsmToken &Tok = Parser.getTok(); // Get next token.
+
+  if (Tok.isNot(AsmToken::Integer))
+    return MatchOperand_NoMatch;
+
+  unsigned IntVal = Tok.getIntVal();
+
+  unsigned Reg = matchRegisterByNumber(IntVal, Mips::COP2RegClassID);
+
+  MipsOperand *Op = MipsOperand::CreateReg(Reg, S, Parser.getTok().getLoc());
+  Op->setRegKind(MipsOperand::Kind_COP2);
+  Operands.push_back(Op);
+
+  Parser.Lex(); // Eat the register number.
+  return MatchOperand_Success;
 }
 
-static int ConvertCcString(StringRef CondString) {
-  int CC = StringSwitch<unsigned>(CondString)
-    .Case(".f",    0)
-    .Case(".un",   1)
-    .Case(".eq",   2)
-    .Case(".ueq",  3)
-    .Case(".olt",  4)
-    .Case(".ult",  5)
-    .Case(".ole",  6)
-    .Case(".ule",  7)
-    .Case(".sf",   8)
-    .Case(".ngle", 9)
-    .Case(".seq",  10)
-    .Case(".ngl",  11)
-    .Case(".lt",   12)
-    .Case(".nge",  13)
-    .Case(".le",   14)
-    .Case(".ngt",  15)
-    .Default(-1);
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128BRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128BRegs);
+}
 
-  return CC;
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128HRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128HRegs);
 }
 
-bool MipsAsmParser::
-parseMathOperation(StringRef Name, SMLoc NameLoc,
-                   SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Split the format.
-  size_t Start = Name.find('.'), Next = Name.rfind('.');
-  StringRef Format1 = Name.slice(Start, Next);
-  // Add the first format to the operands.
-  Operands.push_back(MipsOperand::CreateToken(Format1, NameLoc));
-  // Now for the second format.
-  StringRef Format2 = Name.slice(Next, StringRef::npos);
-  Operands.push_back(MipsOperand::CreateToken(Format2, NameLoc));
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128WRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128WRegs);
+}
 
-  // Set the format for the first register.
-  setFpFormat(Format1);
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128DRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSARegs(Operands, (int)MipsOperand::Kind_MSA128DRegs);
+}
 
-  // Read the remaining operands.
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    // Read the first operand.
-    if (ParseOperand(Operands, Name)) {
-      SMLoc Loc = getLexer().getLoc();
-      Parser.eatToEndOfStatement();
-      return Error(Loc, "unexpected token in argument list");
-    }
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMSA128CtrlRegs(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseMSACtrlRegs(Operands, (int)MipsOperand::Kind_MSA128CtrlRegs);
+}
 
-    if (getLexer().isNot(AsmToken::Comma)) {
-      SMLoc Loc = getLexer().getLoc();
-      Parser.eatToEndOfStatement();
-      return Error(Loc, "unexpected token in argument list");
+bool MipsAsmParser::searchSymbolAlias(
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands, unsigned RegKind) {
+
+  MCSymbol *Sym = getContext().LookupSymbol(Parser.getTok().getIdentifier());
+  if (Sym) {
+    SMLoc S = Parser.getTok().getLoc();
+    const MCExpr *Expr;
+    if (Sym->isVariable())
+      Expr = Sym->getVariableValue();
+    else
+      return false;
+    if (Expr->getKind() == MCExpr::SymbolRef) {
+      MipsOperand::RegisterKind Kind = (MipsOperand::RegisterKind)RegKind;
+      const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
+      const StringRef DefSymbol = Ref->getSymbol().getName();
+      if (DefSymbol.startswith("$")) {
+        int RegNum = -1;
+        APInt IntVal(32, -1);
+        if (!DefSymbol.substr(1).getAsInteger(10, IntVal))
+          RegNum = matchRegisterByNumber(IntVal.getZExtValue(),
+                                         isMips64() ? Mips::GPR64RegClassID
+                                                    : Mips::GPR32RegClassID);
+        else {
+          // Lookup for the register with the corresponding name.
+          switch (Kind) {
+          case MipsOperand::Kind_AFGR64Regs:
+          case MipsOperand::Kind_FGR64Regs:
+            RegNum = matchFPURegisterName(DefSymbol.substr(1));
+            break;
+          case MipsOperand::Kind_FGR32Regs:
+            RegNum = matchFPURegisterName(DefSymbol.substr(1));
+            break;
+          case MipsOperand::Kind_GPR64:
+          case MipsOperand::Kind_GPR32:
+          default:
+            RegNum = matchCPURegisterName(DefSymbol.substr(1));
+            break;
+          }
+          if (RegNum > -1)
+            RegNum = getReg(regKindToRegClass(Kind), RegNum);
+        }
+        if (RegNum > -1) {
+          Parser.Lex();
+          MipsOperand *op =
+              MipsOperand::CreateReg(RegNum, S, Parser.getTok().getLoc());
+          op->setRegKind(Kind);
+          Operands.push_back(op);
+          return true;
+        }
+      }
+    } else if (Expr->getKind() == MCExpr::Constant) {
+      Parser.Lex();
+      const MCConstantExpr *Const = static_cast<const MCConstantExpr *>(Expr);
+      MipsOperand *op =
+          MipsOperand::CreateImm(Const, S, Parser.getTok().getLoc());
+      Operands.push_back(op);
+      return true;
     }
-    Parser.Lex(); // Eat the comma.
+  }
+  return false;
+}
 
-    // Set the format for the first register
-    setFpFormat(Format2);
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseHWRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_HWRegs);
+}
 
-    // Parse and remember the operand.
-    if (ParseOperand(Operands, Name)) {
-      SMLoc Loc = getLexer().getLoc();
-      Parser.eatToEndOfStatement();
-      return Error(Loc, "unexpected token in argument list");
-    }
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseCCRRegs(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  return parseRegs(Operands, (int)MipsOperand::Kind_CCRRegs);
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseInvNum(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  const MCExpr *IdVal;
+  // If the first token is '$' we may have register operand.
+  if (Parser.getTok().is(AsmToken::Dollar))
+    return MatchOperand_NoMatch;
+  SMLoc S = Parser.getTok().getLoc();
+  if (getParser().parseExpression(IdVal))
+    return MatchOperand_ParseFail;
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal);
+  assert(MCE && "Unexpected MCExpr type.");
+  int64_t Val = MCE->getValue();
+  SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(MipsOperand::CreateImm(
+      MCConstantExpr::Create(0 - Val, getContext()), S, E));
+  return MatchOperand_Success;
+}
+
+MipsAsmParser::OperandMatchResultTy
+MipsAsmParser::parseLSAImm(SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  switch (getLexer().getKind()) {
+  default:
+    return MatchOperand_NoMatch;
+  case AsmToken::LParen:
+  case AsmToken::Plus:
+  case AsmToken::Minus:
+  case AsmToken::Integer:
+    break;
   }
 
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    SMLoc Loc = getLexer().getLoc();
-    Parser.eatToEndOfStatement();
-    return Error(Loc, "unexpected token in argument list");
+  const MCExpr *Expr;
+  SMLoc S = Parser.getTok().getLoc();
+
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_ParseFail;
+
+  int64_t Val;
+  if (!Expr->EvaluateAsAbsolute(Val)) {
+    Error(S, "expected immediate value");
+    return MatchOperand_ParseFail;
   }
 
-  Parser.Lex(); // Consume the EndOfStatement.
-  return false;
+  // The LSA instruction allows a 2-bit unsigned immediate. For this reason
+  // and because the CPU always adds one to the immediate field, the allowed
+  // range becomes 1..4. We'll only check the range here and will deal
+  // with the addition/subtraction when actually decoding/encoding
+  // the instruction.
+  if (Val < 1 || Val > 4) {
+    Error(S, "immediate not in range (1..4)");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(MipsOperand::CreateLSAImm(Expr, S,
+                                               Parser.getTok().getLoc()));
+  return MatchOperand_Success;
 }
 
-bool MipsAsmParser::
-ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
-                 SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  StringRef Mnemonic;
-  // Floating point instructions: Should the register be treated as a double?
-  if (requestsDoubleOperand(Name)) {
-    setFpFormat(FP_FORMAT_D);
-    Operands.push_back(MipsOperand::CreateToken(Name, NameLoc));
-    Mnemonic = Name;
-  } else {
-    setDefaultFpFormat();
-    // Create the leading tokens for the mnemonic, split by '.' characters.
-    size_t Start = 0, Next = Name.find('.');
-    Mnemonic = Name.slice(Start, Next);
-
-    Operands.push_back(MipsOperand::CreateToken(Mnemonic, NameLoc));
-
-    if (Next != StringRef::npos) {
-      // There is a format token in mnemonic.
-      size_t Dot = Name.find('.', Next + 1);
-      StringRef Format = Name.slice(Next, Dot);
-      if (Dot == StringRef::npos) // Only one '.' in a string, it's a format.
-        Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
-      else {
-        if (Name.startswith("c.")) {
-          // Floating point compare, add '.' and immediate represent for cc.
-          Operands.push_back(MipsOperand::CreateToken(".", NameLoc));
-          int Cc = ConvertCcString(Format);
-          if (Cc == -1) {
-            return Error(NameLoc, "Invalid conditional code");
-          }
-          SMLoc E = SMLoc::getFromPointer(
-              Parser.getTok().getLoc().getPointer() - 1);
-          Operands.push_back(
-              MipsOperand::CreateImm(MCConstantExpr::Create(Cc, getContext()),
-                                     NameLoc, E));
-        } else {
-          // trunc, ceil, floor ...
-          return parseMathOperation(Name, NameLoc, Operands);
-        }
+MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
 
-        // The rest is a format.
-        Format = Name.slice(Dot, StringRef::npos);
-        Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
-      }
+  MCSymbolRefExpr::VariantKind VK =
+      StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+          .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
+          .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
+          .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
+          .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
+          .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
+          .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
+          .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
+          .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+          .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+          .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
+          .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
+          .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
+          .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
+          .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+          .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
+          .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+          .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+          .Default(MCSymbolRefExpr::VK_None);
 
-      setFpFormat(Format);
-    }
+  return VK;
+}
+
+bool MipsAsmParser::ParseInstruction(
+    ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  // Check if we have valid mnemonic
+  if (!mnemonicIsValid(Name, 0)) {
+    Parser.eatToEndOfStatement();
+    return Error(NameLoc, "Unknown instruction");
   }
+  // First operand in MCInst is instruction mnemonic.
+  Operands.push_back(MipsOperand::CreateToken(Name, NameLoc));
 
   // Read the remaining operands.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     // Read the first operand.
-    if (ParseOperand(Operands, Mnemonic)) {
+    if (ParseOperand(Operands, Name)) {
       SMLoc Loc = getLexer().getLoc();
       Parser.eatToEndOfStatement();
       return Error(Loc, "unexpected token in argument list");
@@ -1588,7 +2158,6 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
 
     while (getLexer().is(AsmToken::Comma)) {
       Parser.Lex(); // Eat the comma.
-
       // Parse and remember the operand.
       if (ParseOperand(Operands, Name)) {
         SMLoc Loc = getLexer().getLoc();
@@ -1597,13 +2166,11 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
       }
     }
   }
-
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     SMLoc Loc = getLexer().getLoc();
     Parser.eatToEndOfStatement();
     return Error(Loc, "unexpected token in argument list");
   }
-
   Parser.Lex(); // Consume the EndOfStatement.
   return false;
 }
@@ -1741,8 +2308,23 @@ bool MipsAsmParser::parseSetAssignment() {
     return reportParseError("unexpected token in .set directive");
   Lex(); // Eat comma
 
-  if (Parser.parseExpression(Value))
-    reportParseError("expected valid expression after comma");
+  if (getLexer().is(AsmToken::Dollar)) {
+    MCSymbol *Symbol;
+    SMLoc DollarLoc = getLexer().getLoc();
+    // Consume the dollar sign, and check for a following identifier.
+    Parser.Lex();
+    // We have a '$' followed by something, make sure they are adjacent.
+    if (DollarLoc.getPointer() + 1 != getTok().getLoc().getPointer())
+      return true;
+    StringRef Res =
+        StringRef(DollarLoc.getPointer(),
+                  getTok().getEndLoc().getPointer() - DollarLoc.getPointer());
+    Symbol = getContext().GetOrCreateSymbol(Res);
+    Parser.Lex();
+    Value =
+        MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None, getContext());
+  } else if (Parser.parseExpression(Value))
+    return reportParseError("expected valid expression after comma");
 
   // Check if the Name already exists as a symbol.
   MCSymbol *Sym = getContext().LookupSymbol(Name);
@@ -1788,6 +2370,34 @@ bool MipsAsmParser::parseDirectiveSet() {
   return true;
 }
 
+bool MipsAsmParser::parseDirectiveMipsHackStocg() {
+  MCAsmParser &Parser = getParser();
+  StringRef Name;
+  if (Parser.parseIdentifier(Name))
+    reportParseError("expected identifier");
+
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("unexpected token");
+  Lex();
+
+  int64_t Flags = 0;
+  if (Parser.parseAbsoluteExpression(Flags))
+    return TokError("unexpected token");
+
+  getTargetStreamer().emitMipsHackSTOCG(Sym, Flags);
+  return false;
+}
+
+bool MipsAsmParser::parseDirectiveMipsHackELFFlags() {
+  int64_t Flags = 0;
+  if (Parser.parseAbsoluteExpression(Flags))
+    return TokError("unexpected token");
+
+  getTargetStreamer().emitMipsHackELFFlags(Flags);
+  return false;
+}
+
 /// parseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
 bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
@@ -1813,6 +2423,22 @@ bool MipsAsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
   return false;
 }
 
+/// parseDirectiveGpWord
+///  ::= .gpword local_sym
+bool MipsAsmParser::parseDirectiveGpWord() {
+  const MCExpr *Value;
+  // EmitGPRel32Value requires an expression, so we are using base class
+  // method to evaluate the expression.
+  if (getParser().parseExpression(Value))
+    return true;
+  getParser().getStreamer().EmitGPRel32Value(Value);
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(getLexer().getLoc(), "unexpected token in directive");
+  Parser.Lex(); // Eat EndOfStatement token.
+  return false;
+}
+
 bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
 
   StringRef IDVal = DirectiveID.getString();
@@ -1853,7 +2479,7 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
 
   if (IDVal == ".gpword") {
     // Ignore this directive for now.
-    Parser.eatToEndOfStatement();
+    parseDirectiveGpWord();
     return false;
   }
 
@@ -1862,6 +2488,12 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
     return false;
   }
 
+  if (IDVal == ".mips_hack_stocg")
+    return parseDirectiveMipsHackStocg();
+
+  if (IDVal == ".mips_hack_elf_flags")
+    return parseDirectiveMipsHackELFFlags();
+
   return true;
 }
 
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index 78a9f70..6acc9a8 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -15,6 +15,7 @@ add_public_tablegen_target(MipsCommonTableGen)
 
 add_llvm_target(MipsCodeGen
   Mips16FrameLowering.cpp
+  Mips16HardFloat.cpp
   Mips16InstrInfo.cpp
   Mips16ISelDAGToDAG.cpp
   Mips16ISelLowering.cpp
@@ -46,10 +47,11 @@ add_llvm_target(MipsCodeGen
   MipsSelectionDAGInfo.cpp
   )
 
-add_dependencies(LLVMMipsCodeGen intrinsics_gen)
+add_dependencies(LLVMMipsCodeGen MipsCommonTableGen intrinsics_gen)
 
 add_subdirectory(InstPrinter)
 add_subdirectory(Disassembler)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
 add_subdirectory(AsmParser)
+
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index 0dba33a..60508a8 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -35,26 +35,33 @@ public:
   ///
   MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
                        bool bigEndian) :
-    MCDisassembler(STI), RegInfo(Info), isBigEndian(bigEndian) {}
+    MCDisassembler(STI), RegInfo(Info),
+    IsN64(STI.getFeatureBits() & Mips::FeatureN64), isBigEndian(bigEndian) {}
 
   virtual ~MipsDisassemblerBase() {}
 
-  const MCRegisterInfo *getRegInfo() const { return RegInfo; }
+  const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
+
+  bool isN64() const { return IsN64; }
 
 private:
-  const MCRegisterInfo *RegInfo;
+  OwningPtr<const MCRegisterInfo> RegInfo;
+  bool IsN64;
 protected:
   bool isBigEndian;
 };
 
 /// MipsDisassembler - a disasembler class for Mips32.
 class MipsDisassembler : public MipsDisassemblerBase {
+  bool IsMicroMips;
 public:
   /// Constructor     - Initializes the disassembler.
   ///
   MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
                    bool bigEndian) :
-    MipsDisassemblerBase(STI, Info, bigEndian) {}
+    MipsDisassemblerBase(STI, Info, bigEndian) {
+      IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+    }
 
   /// getInstruction - See MCDisassembler.
   virtual DecodeStatus getInstruction(MCInst &instr,
@@ -88,25 +95,30 @@ public:
 
 // Forward declare these because the autogenerated code will reference them.
 // Definitions are further down.
-static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
 
 static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
                                                  unsigned RegNo,
                                                  uint64_t Address,
                                                  const void *Decoder);
 
-static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst,
-                                               unsigned RegNo,
-                                               uint64_t Address,
-                                               const void *Decoder);
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
 
-static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
-                                               unsigned RegNo,
-                                               uint64_t Address,
-                                               const void *Decoder);
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned Insn,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
 
 static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
                                              unsigned RegNo,
@@ -118,11 +130,21 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
                                              uint64_t Address,
                                              const void *Decoder);
 
+static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder);
+
 static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
                                            unsigned RegNo,
                                            uint64_t Address,
                                            const void *Decoder);
 
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+
 static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
                                               unsigned Insn,
                                               uint64_t Address,
@@ -133,47 +155,88 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
                                               uint64_t Address,
                                               const void *Decoder);
 
-static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst,
-                                                unsigned Insn,
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
+                                                unsigned RegNo,
                                                 uint64_t Address,
                                                 const void *Decoder);
 
-static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
 
-static DecodeStatus DecodeHIRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
 
-static DecodeStatus DecodeLORegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder);
 
 static DecodeStatus DecodeBranchTarget(MCInst &Inst,
                                        unsigned Offset,
                                        uint64_t Address,
                                        const void *Decoder);
 
-static DecodeStatus DecodeBC1(MCInst &Inst,
-                              unsigned Insn,
-                              uint64_t Address,
-                              const void *Decoder);
-
-
 static DecodeStatus DecodeJumpTarget(MCInst &Inst,
                                      unsigned Insn,
                                      uint64_t Address,
                                      const void *Decoder);
 
+// DecodeBranchTargetMM - Decode microMIPS branch offset, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+// DecodeJumpTargetMM - Decode microMIPS jump target, which is
+// shifted left by 1 bit.
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder);
+
 static DecodeStatus DecodeMem(MCInst &Inst,
                               unsigned Insn,
                               uint64_t Address,
                               const void *Decoder);
 
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder);
+
 static DecodeStatus DecodeFMem(MCInst &Inst, unsigned Insn,
                                uint64_t Address,
                                const void *Decoder);
@@ -183,10 +246,12 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
                                  uint64_t Address,
                                  const void *Decoder);
 
-static DecodeStatus DecodeCondCode(MCInst &Inst,
-                                   unsigned Insn,
-                                   uint64_t Address,
-                                   const void *Decoder);
+// Decode the immediate field of an LSA instruction which
+// is off by one.
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder);
 
 static DecodeStatus DecodeInsSize(MCInst &Inst,
                                   unsigned Insn,
@@ -248,11 +313,12 @@ static DecodeStatus readInstruction32(const MemoryObject &region,
                                       uint64_t address,
                                       uint64_t &size,
                                       uint32_t &insn,
-                                      bool isBigEndian) {
+                                      bool isBigEndian,
+                                      bool IsMicroMips) {
   uint8_t Bytes[4];
 
   // We want to read exactly 4 Bytes of data.
-  if (region.readBytes(address, 4, (uint8_t*)Bytes, NULL) == -1) {
+  if (region.readBytes(address, 4, Bytes) == -1) {
     size = 0;
     return MCDisassembler::Fail;
   }
@@ -266,10 +332,20 @@ static DecodeStatus readInstruction32(const MemoryObject &region,
   }
   else {
     // Encoded as a small-endian 32-bit word in the stream.
-    insn = (Bytes[0] <<  0) |
-           (Bytes[1] <<  8) |
-           (Bytes[2] << 16) |
-           (Bytes[3] << 24);
+    // Little-endian byte ordering:
+    //   mips32r2:   4 | 3 | 2 | 1
+    //   microMIPS:  2 | 1 | 4 | 3
+    if (IsMicroMips) {
+      insn = (Bytes[2] <<  0) |
+             (Bytes[3] <<  8) |
+             (Bytes[0] << 16) |
+             (Bytes[1] << 24);
+    } else {
+      insn = (Bytes[0] <<  0) |
+             (Bytes[1] <<  8) |
+             (Bytes[2] << 16) |
+             (Bytes[3] << 24);
+    }
   }
 
   return MCDisassembler::Success;
@@ -285,10 +361,21 @@ MipsDisassembler::getInstruction(MCInst &instr,
   uint32_t Insn;
 
   DecodeStatus Result = readInstruction32(Region, Address, Size,
-                                          Insn, isBigEndian);
+                                          Insn, isBigEndian, IsMicroMips);
   if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
 
+  if (IsMicroMips) {
+    // Calling the auto-generated decoder function.
+    Result = decodeInstruction(DecoderTableMicroMips32, instr, Insn, Address,
+                               this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+    return MCDisassembler::Fail;
+  }
+
   // Calling the auto-generated decoder function.
   Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
                              this, STI);
@@ -310,7 +397,7 @@ Mips64Disassembler::getInstruction(MCInst &instr,
   uint32_t Insn;
 
   DecodeStatus Result = readInstruction32(Region, Address, Size,
-                                          Insn, isBigEndian);
+                                          Insn, isBigEndian, false);
   if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
 
@@ -346,35 +433,45 @@ static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
 
 }
 
-static DecodeStatus DecodeCPU64RegsRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder) {
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
 
   if (RegNo > 31)
     return MCDisassembler::Fail;
 
-  unsigned Reg = getReg(Decoder, Mips::CPU64RegsRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::GPR64RegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst,
-                                               unsigned RegNo,
-                                               uint64_t Address,
-                                               const void *Decoder) {
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst,
+                                             unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder) {
   if (RegNo > 31)
     return MCDisassembler::Fail;
-  unsigned Reg = getReg(Decoder, Mips::CPURegsRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::GPR32RegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
-                                               unsigned RegNo,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  return DecodeCPURegsRegisterClass(Inst, RegNo, Address, Decoder);
+static DecodeStatus DecodePtrRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (static_cast<const MipsDisassembler *>(Decoder)->isN64())
+    return DecodeGPR64RegisterClass(Inst, RegNo, Address, Decoder);
+
+  return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
+}
+
+static DecodeStatus DecodeDSPRRegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  return DecodeGPR32RegisterClass(Inst, RegNo, Address, Decoder);
 }
 
 static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
@@ -401,11 +498,37 @@ static DecodeStatus DecodeFGR32RegisterClass(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeFGRH32RegisterClass(MCInst &Inst,
+                                              unsigned RegNo,
+                                              uint64_t Address,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::FGRH32RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeCCRRegisterClass(MCInst &Inst,
                                            unsigned RegNo,
                                            uint64_t Address,
                                            const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateReg(RegNo));
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::CCRRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeFCCRegisterClass(MCInst &Inst,
+                                           unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  if (RegNo > 7)
+    return MCDisassembler::Fail;
+  unsigned Reg = getReg(Decoder, Mips::FCCRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
@@ -417,8 +540,8 @@ static DecodeStatus DecodeMem(MCInst &Inst,
   unsigned Reg = fieldFromInstruction(Insn, 16, 5);
   unsigned Base = fieldFromInstruction(Insn, 21, 5);
 
-  Reg = getReg(Decoder, Mips::CPURegsRegClassID, Reg);
-  Base = getReg(Decoder, Mips::CPURegsRegClassID, Base);
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
 
   if(Inst.getOpcode() == Mips::SC){
     Inst.addOperand(MCOperand::CreateReg(Reg));
@@ -431,6 +554,58 @@ static DecodeStatus DecodeMem(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
+                                    uint64_t Address, const void *Decoder) {
+  int Offset = SignExtend32<10>(fieldFromInstruction(Insn, 16, 10));
+  unsigned Reg = fieldFromInstruction(Insn, 6, 5);
+  unsigned Base = fieldFromInstruction(Insn, 11, 5);
+
+  Reg = getReg(Decoder, Mips::MSA128BRegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm12(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<12>(Insn & 0x0fff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemMMImm16(MCInst &Inst,
+                                     unsigned Insn,
+                                     uint64_t Address,
+                                     const void *Decoder) {
+  int Offset = SignExtend32<16>(Insn & 0xffff);
+  unsigned Reg = fieldFromInstruction(Insn, 21, 5);
+  unsigned Base = fieldFromInstruction(Insn, 16, 5);
+
+  Reg = getReg(Decoder, Mips::GPR32RegClassID, Reg);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
+
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  Inst.addOperand(MCOperand::CreateReg(Base));
+  Inst.addOperand(MCOperand::CreateImm(Offset));
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeFMem(MCInst &Inst,
                                unsigned Insn,
                                uint64_t Address,
@@ -440,7 +615,7 @@ static DecodeStatus DecodeFMem(MCInst &Inst,
   unsigned Base = fieldFromInstruction(Insn, 21, 5);
 
   Reg = getReg(Decoder, Mips::FGR64RegClassID, Reg);
-  Base = getReg(Decoder, Mips::CPURegsRegClassID, Base);
+  Base = getReg(Decoder, Mips::GPR32RegClassID, Base);
 
   Inst.addOperand(MCOperand::CreateReg(Reg));
   Inst.addOperand(MCOperand::CreateReg(Base));
@@ -461,15 +636,6 @@ static DecodeStatus DecodeHWRegsRegisterClass(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeCondCode(MCInst &Inst,
-                                   unsigned Insn,
-                                   uint64_t Address,
-                                   const void *Decoder) {
-  int CondCode = Insn & 0xf;
-  Inst.addOperand(MCOperand::CreateImm(CondCode));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
                                               unsigned RegNo,
                                               uint64_t Address,
@@ -483,49 +649,98 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst,
+static DecodeStatus DecodeACC64DSPRegisterClass(MCInst &Inst,
                                                 unsigned RegNo,
                                                 uint64_t Address,
                                                 const void *Decoder) {
-  //Currently only hardware register 29 is supported
-  if (RegNo != 29)
-    return  MCDisassembler::Fail;
-  Inst.addOperand(MCOperand::CreateReg(Mips::HWR29_64));
+  if (RegNo >= 4)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::ACC64DSPRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeACRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder) {
+static DecodeStatus DecodeHI32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
   if (RegNo >= 4)
     return MCDisassembler::Fail;
 
-  unsigned Reg = getReg(Decoder, Mips::ACRegsDSPRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::HI32DSPRegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeHIRegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder) {
+static DecodeStatus DecodeLO32DSPRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
   if (RegNo >= 4)
     return MCDisassembler::Fail;
 
-  unsigned Reg = getReg(Decoder, Mips::HIRegsDSPRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::LO32DSPRegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeLORegsDSPRegisterClass(MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder) {
-  if (RegNo >= 4)
+static DecodeStatus DecodeMSA128BRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128BRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128HRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128HRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128WRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128WRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSA128DRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::MSA128DRegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
+                                               unsigned RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  if (RegNo > 7)
     return MCDisassembler::Fail;
 
-  unsigned Reg = getReg(Decoder, Mips::LORegsDSPRegClassID, RegNo);
+  unsigned Reg = getReg(Decoder, Mips::MSACtrlRegClassID, RegNo);
   Inst.addOperand(MCOperand::CreateReg(Reg));
   return MCDisassembler::Success;
 }
@@ -540,16 +755,6 @@ static DecodeStatus DecodeBranchTarget(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
-static DecodeStatus DecodeBC1(MCInst &Inst,
-                              unsigned Insn,
-                              uint64_t Address,
-                              const void *Decoder) {
-  unsigned BranchOffset = Insn & 0xffff;
-  BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4;
-  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
-  return MCDisassembler::Success;
-}
-
 static DecodeStatus DecodeJumpTarget(MCInst &Inst,
                                      unsigned Insn,
                                      uint64_t Address,
@@ -560,6 +765,24 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  unsigned BranchOffset = Offset & 0xffff;
+  BranchOffset = SignExtend32<18>(BranchOffset << 1);
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeJumpTargetMM(MCInst &Inst,
+                                       unsigned Insn,
+                                       uint64_t Address,
+                                       const void *Decoder) {
+  unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 1;
+  Inst.addOperand(MCOperand::CreateImm(JumpOffset));
+  return MCDisassembler::Success;
+}
 
 static DecodeStatus DecodeSimm16(MCInst &Inst,
                                  unsigned Insn,
@@ -569,6 +792,15 @@ static DecodeStatus DecodeSimm16(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeLSAImm(MCInst &Inst,
+                                 unsigned Insn,
+                                 uint64_t Address,
+                                 const void *Decoder) {
+  // We add one to the immediate field as it was encoded as 'imm - 1'.
+  Inst.addOperand(MCOperand::CreateImm(Insn + 1));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeInsSize(MCInst &Inst,
                                   unsigned Insn,
                                   uint64_t Address,
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index fc23cd3..7884589 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -26,6 +26,12 @@ using namespace llvm;
 #define PRINT_ALIAS_INSTR
 #include "MipsGenAsmWriter.inc"
 
+template<unsigned R>
+static bool isReg(const MCInst &MI, unsigned OpNo) {
+  assert(MI.getOperand(OpNo).isReg() && "Register operand expected.");
+  return MI.getOperand(OpNo).getReg() == R;
+}
+
 const char* Mips::MipsFCCToString(Mips::CondCode CC) {
   switch (CC) {
   case FCOND_F:
@@ -80,7 +86,7 @@ void MipsInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
   }
 
   // Try to print any aliases first.
-  if (!printAliasInstr(MI, O))
+  if (!printAliasInstr(MI, O) && !printAlias(*MI, O))
     printInstruction(MI, O);
   printAnnotation(O, Annot);
 
@@ -152,11 +158,6 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
     OS << ')';
 }
 
-void MipsInstPrinter::printCPURegs(const MCInst *MI, unsigned OpNo,
-                                   raw_ostream &O) {
-  printRegName(O, MI->getOperand(OpNo).getReg());
-}
-
 void MipsInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                    raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
@@ -183,6 +184,15 @@ void MipsInstPrinter::printUnsignedImm(const MCInst *MI, int opNum,
     printOperand(MI, opNum, O);
 }
 
+void MipsInstPrinter::printUnsignedImm8(const MCInst *MI, int opNum,
+                                        raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)(unsigned char)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
 void MipsInstPrinter::
 printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) {
   // Load/Store memory operands -- imm($reg)
@@ -209,3 +219,70 @@ printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O) {
   const MCOperand& MO = MI->getOperand(opNum);
   O << MipsFCCToString((Mips::CondCode)MO.getImm());
 }
+
+void MipsInstPrinter::
+printSHFMask(const MCInst *MI, int opNum, raw_ostream &O) {
+  llvm_unreachable("TODO");
+}
+
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo, raw_ostream &OS) {
+  OS << "\t" << Str << "\t";
+  printOperand(&MI, OpNo, OS);
+  return true;
+}
+
+bool MipsInstPrinter::printAlias(const char *Str, const MCInst &MI,
+                                 unsigned OpNo0, unsigned OpNo1,
+                                 raw_ostream &OS) {
+  printAlias(Str, MI, OpNo0, OS);
+  OS << ", ";
+  printOperand(&MI, OpNo1, OS);
+  return true;
+}
+
+bool MipsInstPrinter::printAlias(const MCInst &MI, raw_ostream &OS) {
+  switch (MI.getOpcode()) {
+  case Mips::BEQ:
+    // beq $zero, $zero, $L2 => b $L2
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    return (isReg<Mips::ZERO>(MI, 0) && isReg<Mips::ZERO>(MI, 1) &&
+            printAlias("b", MI, 2, OS)) ||
+           (isReg<Mips::ZERO>(MI, 1) && printAlias("beqz", MI, 0, 2, OS));
+  case Mips::BEQ64:
+    // beq $r0, $zero, $L2 => beqz $r0, $L2
+    return isReg<Mips::ZERO_64>(MI, 1) && printAlias("beqz", MI, 0, 2, OS);
+  case Mips::BNE:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    return isReg<Mips::ZERO>(MI, 1) && printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BNE64:
+    // bne $r0, $zero, $L2 => bnez $r0, $L2
+    return isReg<Mips::ZERO_64>(MI, 1) && printAlias("bnez", MI, 0, 2, OS);
+  case Mips::BGEZAL:
+    // bgezal $zero, $L1 => bal $L1
+    return isReg<Mips::ZERO>(MI, 0) && printAlias("bal", MI, 1, OS);
+  case Mips::BC1T:
+    // bc1t $fcc0, $L1 => bc1t $L1
+    return isReg<Mips::FCC0>(MI, 0) && printAlias("bc1t", MI, 1, OS);
+  case Mips::BC1F:
+    // bc1f $fcc0, $L1 => bc1f $L1
+    return isReg<Mips::FCC0>(MI, 0) && printAlias("bc1f", MI, 1, OS);
+  case Mips::JALR:
+    // jalr $ra, $r1 => jalr $r1
+    return isReg<Mips::RA>(MI, 0) && printAlias("jalr", MI, 1, OS);
+  case Mips::JALR64:
+    // jalr $ra, $r1 => jalr $r1
+    return isReg<Mips::RA_64>(MI, 0) && printAlias("jalr", MI, 1, OS);
+  case Mips::NOR:
+  case Mips::NOR_MM:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    return isReg<Mips::ZERO>(MI, 2) && printAlias("not", MI, 0, 1, OS);
+  case Mips::NOR64:
+    // nor $r0, $r1, $zero => not $r0, $r1
+    return isReg<Mips::ZERO_64>(MI, 2) && printAlias("not", MI, 0, 1, OS);
+  case Mips::OR:
+    // or $r0, $r1, $zero => move $r0, $r1
+    return isReg<Mips::ZERO>(MI, 2) && printAlias("move", MI, 0, 1, OS);
+  default: return false;
+  }
+}
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index d1b561f..f75ae24 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -87,16 +87,23 @@ public:
 
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
   virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
-  void printCPURegs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 
   bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
 
 private:
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printUnsignedImm(const MCInst *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm8(const MCInst *MI, int opNum, raw_ostream &O);
   void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
   void printMemOperandEA(const MCInst *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MCInst *MI, int opNum, raw_ostream &O);
+  void printSHFMask(const MCInst *MI, int opNum, raw_ostream &O);
+
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo,
+                  raw_ostream &OS);
+  bool printAlias(const char *Str, const MCInst &MI, unsigned OpNo0,
+                  unsigned OpNo1, raw_ostream &OS);
+  bool printAlias(const MCInst &MI, raw_ostream &OS);
 };
 } // end namespace llvm
 
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index 4212c94..9116748 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,12 +1,11 @@
 add_llvm_library(LLVMMipsDesc
   MipsAsmBackend.cpp
-  MipsDirectObjLower.cpp
   MipsMCAsmInfo.cpp
   MipsMCCodeEmitter.cpp
   MipsMCTargetDesc.cpp
   MipsELFObjectWriter.cpp
   MipsReginfo.cpp
-  MipsELFStreamer.cpp
+  MipsTargetStreamer.cpp
   )
 
 add_dependencies(LLVMMipsDesc MipsCommonTableGen)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 0b13607..3e70b23 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -45,6 +45,10 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
   case Mips::fixup_Mips_GOT_DISP:
   case Mips::fixup_Mips_GOT_LO16:
   case Mips::fixup_Mips_CALL_LO16:
+  case Mips::fixup_MICROMIPS_LO16:
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+  case Mips::fixup_MICROMIPS_GOT_DISP:
     break;
   case Mips::fixup_Mips_PC16:
     // So far we are only using this type for branches.
@@ -65,6 +69,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
   case Mips::fixup_Mips_GOT_Local:
   case Mips::fixup_Mips_GOT_HI16:
   case Mips::fixup_Mips_CALL_HI16:
+  case Mips::fixup_MICROMIPS_HI16:
     // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
     Value = ((Value + 0x8000) >> 16) & 0xffff;
     break;
@@ -76,6 +81,13 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
     // Get the 4th 16-bits.
     Value = ((Value + 0x800080008000LL) >> 48) & 0xffff;
     break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Value >>= 1;
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Value -= 4;
+    Value >>= 1;
+    break;
   }
 
   return Value;
@@ -188,7 +200,20 @@ public:
       { "fixup_Mips_GOT_HI16",     0,     16,   0 },
       { "fixup_Mips_GOT_LO16",     0,     16,   0 },
       { "fixup_Mips_CALL_HI16",    0,     16,   0 },
-      { "fixup_Mips_CALL_LO16",    0,     16,   0 }
+      { "fixup_Mips_CALL_LO16",    0,     16,   0 },
+      { "fixup_MICROMIPS_26_S1",   0,     26,   0 },
+      { "fixup_MICROMIPS_HI16",    0,     16,   0 },
+      { "fixup_MICROMIPS_LO16",    0,     16,   0 },
+      { "fixup_MICROMIPS_GOT16",   0,     16,   0 },
+      { "fixup_MICROMIPS_PC16_S1", 0,     16,   MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_MICROMIPS_CALL16",  0,     16,   0 },
+      { "fixup_MICROMIPS_GOT_DISP",        0,     16,   0 },
+      { "fixup_MICROMIPS_GOT_PAGE",        0,     16,   0 },
+      { "fixup_MICROMIPS_GOT_OFST",        0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_DTPREL_HI16", 0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_DTPREL_LO16", 0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_TPREL_HI16",  0,     16,   0 },
+      { "fixup_MICROMIPS_TLS_TPREL_LO16",  0,     16,   0 }
     };
 
     if (Kind < FirstTargetFixupKind)
@@ -253,25 +278,33 @@ public:
 } // namespace
 
 // MCAsmBackend
-MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/true, /*Is64Bit*/false);
 }
 
-MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/false, /*Is64Bit*/false);
 }
 
-MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/true, /*Is64Bit*/true);
 }
 
-MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT,
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T,
+                                             const MCRegisterInfo &MRI,
+                                             StringRef TT,
                                              StringRef CPU) {
   return new MipsAsmBackend(T, Triple(TT).getOS(),
                             /*IsLittle*/false, /*Is64Bit*/true);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
deleted file mode 100644
index 15c4282..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
+++ /dev/null
@@ -1,81 +0,0 @@
-//===-- MipsDirectObjLower.cpp - Mips LLVM direct object lowering -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains code to lower Mips MCInst records that are normally
-// left to the assembler to lower such as large shifts.
-//
-//===----------------------------------------------------------------------===//
-#include "MipsInstrInfo.h"
-#include "MCTargetDesc/MipsDirectObjLower.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCStreamer.h"
-
-using namespace llvm;
-
-// If the D<shift> instruction has a shift amount that is greater
-// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
-void Mips::LowerLargeShift(MCInst& Inst) {
-
-  assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
-  assert(Inst.getOperand(2).isImm());
-
-  int64_t Shift = Inst.getOperand(2).getImm();
-  if (Shift <= 31)
-    return; // Do nothing
-  Shift -= 32;
-
-  // saminus32
-  Inst.getOperand(2).setImm(Shift);
-
-  switch (Inst.getOpcode()) {
-  default:
-    // Calling function is not synchronized
-    llvm_unreachable("Unexpected shift instruction");
-  case Mips::DSLL:
-    Inst.setOpcode(Mips::DSLL32);
-    return;
-  case Mips::DSRL:
-    Inst.setOpcode(Mips::DSRL32);
-    return;
-  case Mips::DSRA:
-    Inst.setOpcode(Mips::DSRA32);
-    return;
-  }
-}
-
-// Pick a DEXT or DINS instruction variant based on the pos and size operands
-void Mips::LowerDextDins(MCInst& InstIn) {
-  int Opcode = InstIn.getOpcode();
-
-  if (Opcode == Mips::DEXT)
-    assert(InstIn.getNumOperands() == 4 &&
-           "Invalid no. of machine operands for DEXT!");
-  else // Only DEXT and DINS are possible
-    assert(InstIn.getNumOperands() == 5 &&
-           "Invalid no. of machine operands for DINS!");
-
-  assert(InstIn.getOperand(2).isImm());
-  int64_t pos = InstIn.getOperand(2).getImm();
-  assert(InstIn.getOperand(3).isImm());
-  int64_t size = InstIn.getOperand(3).getImm();
-
-  if (size <= 32) {
-    if (pos < 32)  // DEXT/DINS, do nothing
-      return;
-    // DEXTU/DINSU
-    InstIn.getOperand(2).setImm(pos - 32);
-    InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
-    return;
-  }
-  // DEXTM/DINSM
-  assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
-  InstIn.getOperand(3).setImm(size - 32);
-  InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
-  return;
-}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
deleted file mode 100644
index 8813cc9..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
+++ /dev/null
@@ -1,28 +0,0 @@
-//===-- MipsDirectObjLower.h - Mips LLVM direct object lowering *- C++ -*--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MIPSDIRECTOBJLOWER_H
-#define MIPSDIRECTOBJLOWER_H
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-  class MCInst;
-  class MCStreamer;
-
-  namespace Mips {
-  /// MipsDirectObjLower - This name space is used to lower MCInstr in cases
-  //                       where the assembler usually finishes the lowering
-  //                       such as large shifts.
-    void LowerLargeShift(MCInst &Inst);
-    void LowerDextDins(MCInst &Inst);
-  }
-}
-
-#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 6471b51..83c7d4b 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -183,6 +183,45 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
   case Mips::fixup_Mips_CALL_LO16:
     Type = ELF::R_MIPS_CALL_LO16;
     break;
+  case Mips::fixup_MICROMIPS_26_S1:
+    Type = ELF::R_MICROMIPS_26_S1;
+    break;
+  case Mips::fixup_MICROMIPS_HI16:
+    Type = ELF::R_MICROMIPS_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_LO16:
+    Type = ELF::R_MICROMIPS_LO16;
+    break;
+  case Mips::fixup_MICROMIPS_GOT16:
+    Type = ELF::R_MICROMIPS_GOT16;
+    break;
+  case Mips::fixup_MICROMIPS_PC16_S1:
+    Type = ELF::R_MICROMIPS_PC16_S1;
+    break;
+  case Mips::fixup_MICROMIPS_CALL16:
+    Type = ELF::R_MICROMIPS_CALL16;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_DISP:
+    Type = ELF::R_MICROMIPS_GOT_DISP;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_PAGE:
+    Type = ELF::R_MICROMIPS_GOT_PAGE;
+    break;
+  case Mips::fixup_MICROMIPS_GOT_OFST:
+    Type = ELF::R_MICROMIPS_GOT_OFST;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_HI16:
+    Type = ELF::R_MICROMIPS_TLS_DTPREL_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_DTPREL_LO16:
+    Type = ELF::R_MICROMIPS_TLS_DTPREL_LO16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_HI16:
+    Type = ELF::R_MICROMIPS_TLS_TPREL_HI16;
+    break;
+  case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
+    Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
+    break;
   }
   return Type;
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
deleted file mode 100644
index c33bc9a..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//===-- MipsELFStreamer.cpp - MipsELFStreamer ---------------------------===//
-//
-//                       The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-#include "MCTargetDesc/MipsELFStreamer.h"
-#include "MipsSubtarget.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCELF.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-
-namespace llvm {
-
-  MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                       raw_ostream &OS, MCCodeEmitter *Emitter,
-                                       bool RelaxAll, bool NoExecStack) {
-    MipsELFStreamer *S = new MipsELFStreamer(Context, TAB, OS, Emitter,
-                                             RelaxAll, NoExecStack);
-    return S;
-  }
-
-  // For llc. Set a group of ELF header flags
-  void
-  MipsELFStreamer::emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget) {
-
-    if (hasRawTextSupport())
-      return;
-
-    // Update e_header flags
-    MCAssembler& MCA = getAssembler();
-    unsigned EFlags = MCA.getELFHeaderEFlags();
-
-    if (Subtarget.inMips16Mode())
-      EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
-    else
-      EFlags |= ELF::EF_MIPS_NOREORDER;
-
-    // Architecture
-    if (Subtarget.hasMips64r2())
-      EFlags |= ELF::EF_MIPS_ARCH_64R2;
-    else if (Subtarget.hasMips64())
-      EFlags |= ELF::EF_MIPS_ARCH_64;
-    else if (Subtarget.hasMips32r2())
-      EFlags |= ELF::EF_MIPS_ARCH_32R2;
-    else
-      EFlags |= ELF::EF_MIPS_ARCH_32;
-
-    if (Subtarget.inMicroMipsMode())
-      EFlags |= ELF::EF_MIPS_MICROMIPS;
-
-    // ABI
-    if (Subtarget.isABI_O32())
-      EFlags |= ELF::EF_MIPS_ABI_O32;
-
-    // Relocation Model
-    Reloc::Model RM = Subtarget.getRelocationModel();
-    if (RM == Reloc::PIC_ || RM == Reloc::Default)
-      EFlags |= ELF::EF_MIPS_PIC;
-    else if (RM == Reloc::Static)
-      ; // Do nothing for Reloc::Static
-    else
-      llvm_unreachable("Unsupported relocation model for e_flags");
-
-    MCA.setELFHeaderEFlags(EFlags);
-  }
-
-  // For llc. Set a symbol's STO flags
-  void
-  MipsELFStreamer::emitMipsSTOCG(const MipsSubtarget &Subtarget,
-                                 MCSymbol *Sym,
-                                 unsigned Val) {
-
-    if (hasRawTextSupport())
-      return;
-
-    MCSymbolData &Data = getOrCreateSymbolData(Sym);
-    // The "other" values are stored in the last 6 bits of the second byte
-    // The traditional defines for STO values assume the full byte and thus
-    // the shift to pack it.
-    MCELF::setOther(Data, Val >> 2);
-  }
-
-} // namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
deleted file mode 100644
index b10ccc7..0000000
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//=== MipsELFStreamer.h - MipsELFStreamer ------------------------------===//
-//
-//                    The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENCE.TXT for details.
-//
-//===-------------------------------------------------------------------===//
-#ifndef MIPSELFSTREAMER_H_
-#define MIPSELFSTREAMER_H_
-
-#include "llvm/MC/MCELFStreamer.h"
-
-namespace llvm {
-class MipsAsmPrinter;
-class MipsSubtarget;
-class MCSymbol;
-
-class MipsELFStreamer : public MCELFStreamer {
-public:
-  MipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                  raw_ostream &OS, MCCodeEmitter *Emitter,
-                  bool RelaxAll, bool NoExecStack)
-    : MCELFStreamer(SK_MipsELFStreamer, Context, TAB, OS, Emitter) {
-  }
-
-  ~MipsELFStreamer() {}
-  void emitELFHeaderFlagsCG(const MipsSubtarget &Subtarget);
-  void emitMipsSTOCG(const MipsSubtarget &Subtarget,
-                     MCSymbol *Sym,
-                     unsigned Val);
-
-  static bool classof(const MCStreamer *S) {
-    return S->getKind() == SK_MipsELFStreamer;
-  }
-};
-
-  MCELFStreamer* createMipsELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                       raw_ostream &OS, MCCodeEmitter *Emitter,
-                                       bool RelaxAll, bool NoExecStack);
-}
-
-#endif /* MIPSELFSTREAMER_H_ */
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index f963900..6ed44b7 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -128,6 +128,45 @@ namespace Mips {
     // resulting in - R_MIPS_CALL_LO16
     fixup_Mips_CALL_LO16,
 
+    // resulting in - R_MICROMIPS_26_S1
+    fixup_MICROMIPS_26_S1,
+
+    // resulting in - R_MICROMIPS_HI16
+    fixup_MICROMIPS_HI16,
+
+    // resulting in - R_MICROMIPS_LO16
+    fixup_MICROMIPS_LO16,
+
+    // resulting in - R_MICROMIPS_GOT16
+    fixup_MICROMIPS_GOT16,
+
+    // resulting in - R_MICROMIPS_PC16_S1
+    fixup_MICROMIPS_PC16_S1,
+
+    // resulting in - R_MICROMIPS_CALL16
+    fixup_MICROMIPS_CALL16,
+
+    // resulting in - R_MICROMIPS_GOT_DISP
+    fixup_MICROMIPS_GOT_DISP,
+
+    // resulting in - R_MICROMIPS_GOT_PAGE
+    fixup_MICROMIPS_GOT_PAGE,
+
+    // resulting in - R_MICROMIPS_GOT_OFST
+    fixup_MICROMIPS_GOT_OFST,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_HI16
+    fixup_MICROMIPS_TLS_DTPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_DTPREL_LO16
+    fixup_MICROMIPS_TLS_DTPREL_LO16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_HI16
+    fixup_MICROMIPS_TLS_TPREL_HI16,
+
+    // resulting in - R_MICROMIPS_TLS_TPREL_LO16
+    fixup_MICROMIPS_TLS_TPREL_LO16,
+
     // Marker
     LastTargetFixupKind,
     NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 5d4b32d..6aa3c76 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
 
 void MipsMCAsmInfo::anchor() { }
 
-MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
+MipsMCAsmInfo::MipsMCAsmInfo(StringRef TT) {
   Triple TheTriple(TT);
   if ((TheTriple.getArch() == Triple::mips) ||
       (TheTriple.getArch() == Triple::mips64))
@@ -38,7 +38,6 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) {
   ZeroDirective               = "\t.space\t";
   GPRel32Directive            = "\t.gpword\t";
   GPRel64Directive            = "\t.gpdword\t";
-  WeakRefDirective            = "\t.weak\t";
   DebugLabelSuffix            = "=.";
   SupportsDebugInformation = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index e1d8789..1000113 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -14,16 +14,15 @@
 #ifndef MIPSTARGETASMINFO_H
 #define MIPSTARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
-  class Target;
 
-  class MipsMCAsmInfo : public MCAsmInfo {
+  class MipsMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
-    explicit MipsMCAsmInfo(const Target &T, StringRef TT);
+    explicit MipsMCAsmInfo(StringRef TT);
   };
 
 } // namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 9460731..66428bd 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -13,7 +13,6 @@
 //
 #define DEBUG_TYPE "mccodeemitter"
 #include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsDirectObjLower.h"
 #include "MCTargetDesc/MipsFixupKinds.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/ADT/APFloat.h"
@@ -40,11 +39,14 @@ class MipsMCCodeEmitter : public MCCodeEmitter {
   MCContext &Ctx;
   const MCSubtargetInfo &STI;
   bool IsLittleEndian;
+  bool IsMicroMips;
 
 public:
   MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_,
                     const MCSubtargetInfo &sti, bool IsLittle) :
-    MCII(mcii), Ctx(Ctx_), STI (sti), IsLittleEndian(IsLittle) {}
+    MCII(mcii), Ctx(Ctx_), STI (sti), IsLittleEndian(IsLittle) {
+      IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
+    }
 
   ~MipsMCCodeEmitter() {}
 
@@ -54,9 +56,17 @@ public:
 
   void EmitInstruction(uint64_t Val, unsigned Size, raw_ostream &OS) const {
     // Output the instruction encoding in little endian byte order.
-    for (unsigned i = 0; i < Size; ++i) {
-      unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
-      EmitByte((Val >> Shift) & 0xff, OS);
+    // Little-endian byte ordering:
+    //   mips32r2:   4 | 3 | 2 | 1
+    //   microMIPS:  2 | 1 | 4 | 3
+    if (IsLittleEndian && Size == 4 && IsMicroMips) {
+      EmitInstruction(Val>>16, 2, OS);
+      EmitInstruction(Val, 2, OS);
+    } else {
+      for (unsigned i = 0; i < Size; ++i) {
+        unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
+        EmitByte((Val >> Shift) & 0xff, OS);
+      }
     }
   }
 
@@ -74,12 +84,24 @@ public:
    unsigned getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
                                  SmallVectorImpl<MCFixup> &Fixups) const;
 
+  // getBranchJumpOpValueMM - Return binary encoding of the microMIPS jump
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+
    // getBranchTargetOpValue - Return binary encoding of the branch
    // target operand. If the machine operand requires relocation,
    // record the relocation and return zero.
   unsigned getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
                                   SmallVectorImpl<MCFixup> &Fixups) const;
 
+  // getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+  // target operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
    // getMachineOpValue - Return binary encoding of operand. If the machin
    // operand requires relocation, record the relocation and return zero.
   unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
@@ -87,11 +109,17 @@ public:
 
   unsigned getMemEncoding(const MCInst &MI, unsigned OpNo,
                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
                               SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
                               SmallVectorImpl<MCFixup> &Fixups) const;
 
+  // getLSAImmEncoding - Return binary encoding of LSA immediate.
+  unsigned getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+
   unsigned
   getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const;
 
@@ -114,8 +142,74 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
   return new MipsMCCodeEmitter(MCII, Ctx, STI, true);
 }
 
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+static void LowerLargeShift(MCInst& Inst) {
+
+  assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+  assert(Inst.getOperand(2).isImm());
+
+  int64_t Shift = Inst.getOperand(2).getImm();
+  if (Shift <= 31)
+    return; // Do nothing
+  Shift -= 32;
+
+  // saminus32
+  Inst.getOperand(2).setImm(Shift);
+
+  switch (Inst.getOpcode()) {
+  default:
+    // Calling function is not synchronized
+    llvm_unreachable("Unexpected shift instruction");
+  case Mips::DSLL:
+    Inst.setOpcode(Mips::DSLL32);
+    return;
+  case Mips::DSRL:
+    Inst.setOpcode(Mips::DSRL32);
+    return;
+  case Mips::DSRA:
+    Inst.setOpcode(Mips::DSRA32);
+    return;
+  case Mips::DROTR:
+    Inst.setOpcode(Mips::DROTR32);
+    return;
+  }
+}
+
+// Pick a DEXT or DINS instruction variant based on the pos and size operands
+static void LowerDextDins(MCInst& InstIn) {
+  int Opcode = InstIn.getOpcode();
+
+  if (Opcode == Mips::DEXT)
+    assert(InstIn.getNumOperands() == 4 &&
+           "Invalid no. of machine operands for DEXT!");
+  else // Only DEXT and DINS are possible
+    assert(InstIn.getNumOperands() == 5 &&
+           "Invalid no. of machine operands for DINS!");
+
+  assert(InstIn.getOperand(2).isImm());
+  int64_t pos = InstIn.getOperand(2).getImm();
+  assert(InstIn.getOperand(3).isImm());
+  int64_t size = InstIn.getOperand(3).getImm();
+
+  if (size <= 32) {
+    if (pos < 32)  // DEXT/DINS, do nothing
+      return;
+    // DEXTU/DINSU
+    InstIn.getOperand(2).setImm(pos - 32);
+    InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+    return;
+  }
+  // DEXTM/DINSM
+  assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+  InstIn.getOperand(3).setImm(size - 32);
+  InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+  return;
+}
+
 /// EncodeInstruction - Emit the instruction.
-/// Size the instruction (currently only 4 bytes
+/// Size the instruction with Desc.getSize().
 void MipsMCCodeEmitter::
 EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                   SmallVectorImpl<MCFixup> &Fixups) const
@@ -131,14 +225,16 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   case Mips::DSLL:
   case Mips::DSRL:
   case Mips::DSRA:
-    Mips::LowerLargeShift(TmpInst);
+  case Mips::DROTR:
+    LowerLargeShift(TmpInst);
     break;
     // Double extract instruction is chosen by pos and size operands
   case Mips::DEXT:
   case Mips::DINS:
-    Mips::LowerDextDins(TmpInst);
+    LowerDextDins(TmpInst);
   }
 
+  unsigned long N = Fixups.size();
   uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups);
 
   // Check for unimplemented opcodes.
@@ -151,6 +247,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   if (STI.getFeatureBits() & Mips::FeatureMicroMips) {
     int NewOpcode = Mips::Std2MicroMips (Opcode, Mips::Arch_micromips);
     if (NewOpcode != -1) {
+      if (Fixups.size() > N)
+        Fixups.pop_back();
       Opcode = NewOpcode;
       TmpInst.setOpcode (NewOpcode);
       Binary = getBinaryCodeForInstr(TmpInst, Fixups);
@@ -188,6 +286,28 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
   return 0;
 }
 
+/// getBranchTargetOpValue - Return binary encoding of the microMIPS branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getBranchTargetOpValueMM expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                   MCFixupKind(Mips::
+                               fixup_MICROMIPS_PC16_S1)));
+  return 0;
+}
+
 /// getJumpTargetOpValue - Return binary encoding of the jump
 /// target operand. If the machine operand requires relocation,
 /// record the relocation and return zero.
@@ -209,6 +329,23 @@ getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
 }
 
 unsigned MipsMCCodeEmitter::
+getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+  // If the destination is an immediate, divide by 2.
+  if (MO.isImm()) return MO.getImm() >> 1;
+
+  assert(MO.isExpr() &&
+         "getJumpTargetOpValueMM expects only expressions or an immediate");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MICROMIPS_26_S1)));
+  return 0;
+}
+
+unsigned MipsMCCodeEmitter::
 getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
   int64_t Res;
 
@@ -238,31 +375,39 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
     FixupKind = Mips::fixup_Mips_GPOFF_LO;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_PAGE :
-    FixupKind = Mips::fixup_Mips_GOT_PAGE;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_PAGE
+                            : Mips::fixup_Mips_GOT_PAGE;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_OFST :
-    FixupKind = Mips::fixup_Mips_GOT_OFST;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_OFST
+                            : Mips::fixup_Mips_GOT_OFST;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_DISP :
-    FixupKind = Mips::fixup_Mips_GOT_DISP;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT_DISP
+                            : Mips::fixup_Mips_GOT_DISP;
     break;
   case MCSymbolRefExpr::VK_Mips_GPREL:
     FixupKind = Mips::fixup_Mips_GPREL16;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT_CALL:
-    FixupKind = Mips::fixup_Mips_CALL16;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_CALL16
+                            : Mips::fixup_Mips_CALL16;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT16:
-    FixupKind = Mips::fixup_Mips_GOT_Global;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
+                            : Mips::fixup_Mips_GOT_Global;
     break;
   case MCSymbolRefExpr::VK_Mips_GOT:
-    FixupKind = Mips::fixup_Mips_GOT_Local;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_GOT16
+                            : Mips::fixup_Mips_GOT_Local;
     break;
   case MCSymbolRefExpr::VK_Mips_ABS_HI:
-    FixupKind = Mips::fixup_Mips_HI16;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_HI16
+                            : Mips::fixup_Mips_HI16;
     break;
   case MCSymbolRefExpr::VK_Mips_ABS_LO:
-    FixupKind = Mips::fixup_Mips_LO16;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_LO16
+                            : Mips::fixup_Mips_LO16;
     break;
   case MCSymbolRefExpr::VK_Mips_TLSGD:
     FixupKind = Mips::fixup_Mips_TLSGD;
@@ -271,19 +416,23 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups) const {
     FixupKind = Mips::fixup_Mips_TLSLDM;
     break;
   case MCSymbolRefExpr::VK_Mips_DTPREL_HI:
-    FixupKind = Mips::fixup_Mips_DTPREL_HI;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_HI16
+                            : Mips::fixup_Mips_DTPREL_HI;
     break;
   case MCSymbolRefExpr::VK_Mips_DTPREL_LO:
-    FixupKind = Mips::fixup_Mips_DTPREL_LO;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_DTPREL_LO16
+                            : Mips::fixup_Mips_DTPREL_LO;
     break;
   case MCSymbolRefExpr::VK_Mips_GOTTPREL:
     FixupKind = Mips::fixup_Mips_GOTTPREL;
     break;
   case MCSymbolRefExpr::VK_Mips_TPREL_HI:
-    FixupKind = Mips::fixup_Mips_TPREL_HI;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_HI16
+                            : Mips::fixup_Mips_TPREL_HI;
     break;
   case MCSymbolRefExpr::VK_Mips_TPREL_LO:
-    FixupKind = Mips::fixup_Mips_TPREL_LO;
+    FixupKind = IsMicroMips ? Mips::fixup_MICROMIPS_TLS_TPREL_LO16
+                            : Mips::fixup_Mips_TPREL_LO;
     break;
   case MCSymbolRefExpr::VK_Mips_HIGHER:
     FixupKind = Mips::fixup_Mips_HIGHER;
@@ -318,7 +467,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                   SmallVectorImpl<MCFixup> &Fixups) const {
   if (MO.isReg()) {
     unsigned Reg = MO.getReg();
-    unsigned RegNo = Ctx.getRegisterInfo().getEncodingValue(Reg);
+    unsigned RegNo = Ctx.getRegisterInfo()->getEncodingValue(Reg);
     return RegNo;
   } else if (MO.isImm()) {
     return static_cast<unsigned>(MO.getImm());
@@ -344,6 +493,17 @@ MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo,
   return (OffBits & 0xFFFF) | RegBits;
 }
 
+unsigned MipsMCCodeEmitter::
+getMemEncodingMMImm12(const MCInst &MI, unsigned OpNo,
+                      SmallVectorImpl<MCFixup> &Fixups) const {
+  // Base register is encoded in bits 20-16, offset is encoded in bits 11-0.
+  assert(MI.getOperand(OpNo).isReg());
+  unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) << 16;
+  unsigned OffBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups);
+
+  return (OffBits & 0x0FFF) | RegBits;
+}
+
 unsigned
 MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo,
                                       SmallVectorImpl<MCFixup> &Fixups) const {
@@ -365,5 +525,13 @@ MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo,
   return Position + Size - 1;
 }
 
+unsigned
+MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
+                                     SmallVectorImpl<MCFixup> &Fixups) const {
+  assert(MI.getOperand(OpNo).isImm());
+  // The immediate is encoded as 'immediate - 1'.
+  return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups) - 1;
+}
+
 #include "MipsGenMCCodeEmitter.inc"
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index be83b54..5548aaa 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -11,17 +11,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/MipsELFStreamer.h"
 #include "MipsMCTargetDesc.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MipsMCAsmInfo.h"
+#include "MipsTargetStreamer.h"
 #include "llvm/MC/MCCodeGenInfo.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
@@ -93,12 +97,12 @@ static MCSubtargetInfo *createMipsMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-static MCAsmInfo *createMipsMCAsmInfo(const Target &T, StringRef TT) {
-  MCAsmInfo *MAI = new MipsMCAsmInfo(T, TT);
+static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
+  MCAsmInfo *MAI = new MipsMCAsmInfo(TT);
 
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(Mips::SP, 0);
-  MAI->addInitialFrameState(0, Dst, Src);
+  unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, SP, 0);
+  MAI->addInitialFrameState(Inst);
 
   return MAI;
 }
@@ -125,14 +129,23 @@ static MCInstPrinter *createMipsMCInstPrinter(const Target &T,
 }
 
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
-                                    MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &_OS,
-                                    MCCodeEmitter *_Emitter,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  return createMipsELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+                                    MCContext &Context, MCAsmBackend &MAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    bool RelaxAll, bool NoExecStack) {
+  MipsTargetELFStreamer *S = new MipsTargetELFStreamer();
+  return createELFStreamer(Context, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                    bool isVerboseAsm, bool useLoc, bool useCFI,
+                    bool useDwarfDirectory, MCInstPrinter *InstPrint,
+                    MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
+  MipsTargetAsmStreamer *S = new MipsTargetAsmStreamer(OS);
+
+  return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+                                 useDwarfDirectory, InstPrint, CE, TAB,
+                                 ShowInst);
 }
 
 extern "C" void LLVMInitializeMipsTargetMC() {
@@ -183,6 +196,12 @@ extern "C" void LLVMInitializeMipsTargetMC() {
   TargetRegistry::RegisterMCObjectStreamer(TheMips64elTarget,
                                            createMCStreamer);
 
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(TheMipsTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMipselTarget, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMips64Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(TheMips64elTarget, createMCAsmStreamer);
+
   // Register the asm backend.
   TargetRegistry::RegisterMCAsmBackend(TheMipsTarget,
                                        createMipsAsmBackendEB32);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 71954a4..eabebfe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -42,14 +42,14 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
                                          const MCSubtargetInfo &STI,
                                          MCContext &Ctx);
 
-MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT,
-                                       StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT,
-                                       StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT,
-                                       StringRef CPU);
-MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT,
-                                       StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T, const MCRegisterInfo &MRI,
+                                       StringRef TT, StringRef CPU);
 
 MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
                                           uint8_t OSABI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
new file mode 100644
index 0000000..5e90bbc
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -0,0 +1,67 @@
+//===-- MipsTargetStreamer.cpp - Mips Target Streamer Methods -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides Mips specific target streamer methods.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetStreamer.h"
+#include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> PrintHackDirectives("print-hack-directives",
+                                         cl::init(false), cl::Hidden);
+
+// pin vtable to this file
+void MipsTargetStreamer::anchor() {}
+
+MipsTargetAsmStreamer::MipsTargetAsmStreamer(formatted_raw_ostream &OS)
+    : OS(OS) {}
+
+void MipsTargetAsmStreamer::emitMipsHackELFFlags(unsigned Flags) {
+  if (!PrintHackDirectives)
+    return;
+
+  OS << "\t.mips_hack_elf_flags 0x";
+  OS.write_hex(Flags);
+  OS << '\n';
+}
+void MipsTargetAsmStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
+  if (!PrintHackDirectives)
+    return;
+
+  OS << "\t.mips_hack_stocg ";
+  OS << Sym->getName();
+  OS << ", ";
+  OS << Val;
+  OS << '\n';
+}
+
+MCELFStreamer &MipsTargetELFStreamer::getStreamer() {
+  return static_cast<MCELFStreamer &>(*Streamer);
+}
+
+void MipsTargetELFStreamer::emitMipsHackELFFlags(unsigned Flags) {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+// Set a symbol's STO flags
+void MipsTargetELFStreamer::emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) {
+  MCSymbolData &Data = getStreamer().getOrCreateSymbolData(Sym);
+  // The "other" values are stored in the last 6 bits of the second byte
+  // The traditional defines for STO values assume the full byte and thus
+  // the shift to pack it.
+  MCELF::setOther(Data, Val >> 2);
+}
diff --git a/lib/Target/Mips/MSA.txt b/lib/Target/Mips/MSA.txt
new file mode 100644
index 0000000..d1c4193
--- /dev/null
+++ b/lib/Target/Mips/MSA.txt
@@ -0,0 +1,78 @@
+Code Generation Notes for MSA
+=============================
+
+Intrinsics are lowered to SelectionDAG nodes where possible in order to enable
+optimisation, reduce the size of the ISel matcher, and reduce repetition in
+the implementation. In a small number of cases, this can cause different
+(semantically equivalent) instructions to be used in place of the requested
+instruction, even when no optimisation has taken place.
+
+Instructions
+============
+
+This section describes any quirks of instruction selection for MSA. For
+example, two instructions might be equally valid for some given IR and one is
+chosen in preference to the other.
+
+bclri.b:
+        It is not possible to emit bclri.b since andi.b covers exactly the
+        same cases. andi.b should use fractionally less power than bclri.b in
+        most hardware implementations so it is used in preference to bclri.b.
+
+vshf.w:
+        It is not possible to emit vshf.w when the shuffle description is
+        constant since shf.w covers exactly the same cases. shf.w is used
+        instead. It is also impossible for the shuffle description to be
+        unknown at compile-time due to the definition of shufflevector in
+        LLVM IR.
+
+vshf.[bhwd]
+        When the shuffle description describes a splat operation, splat.[bhwd]
+        instructions will be selected instead of vshf.[bhwd]. Unlike the ilv*,
+        and pck* instructions, this is matched from MipsISD::VSHF instead of
+        a special-case MipsISD node.
+
+ilvl.d, pckev.d:
+        It is not possible to emit ilvl.d, or pckev.d since ilvev.d covers the
+        same shuffle. ilvev.d will be emitted instead.
+
+ilvr.d, ilvod.d, pckod.d:
+        It is not possible to emit ilvr.d, or pckod.d since ilvod.d covers the
+        same shuffle. ilvod.d will be emitted instead.
+
+splat.[bhwd]
+        The intrinsic will work as expected. However, unlike other intrinsics
+        it lowers directly to MipsISD::VSHF instead of using common IR.
+
+splati.w:
+        It is not possible to emit splati.w since shf.w covers the same cases.
+        shf.w will be emitted instead.
+
+copy_s.w:
+        On MIPS32, the copy_u.d intrinsic will emit this instruction instead of
+        copy_u.w. This is semantically equivalent since the general-purpose
+        register file is 32-bits wide.
+
+binsri.[bhwd],  binsli.[bhwd]:
+        These two operations are equivalent to each other with the operands
+        swapped and condition inverted. The compiler may use either one as
+        appropriate.
+        Furthermore, the compiler may use bsel.[bhwd] for some masks that do
+        not survive the legalization process (this is a bug and will be fixed).
+
+bmnz.v, bmz.v, bsel.v:
+        These three operations differ only in the operand that is tied to the
+        result.
+        It is (currently) not possible to emit bmz.v, or bsel.v since bmnz.v is
+        the same operation and will be emitted instead.
+        In future, the compiler may choose between these three instructions
+        according to register allocation.
+
+bmnzi.b, bmzi.b:
+        Like their non-immediate counterparts, bmnzi.v and bmzi.v are the same
+        operation with the operands swapped. bmnzi.v will (currently) be emitted
+        for both cases.
+
+bseli.v:
+        Unlike the non-immediate versions, bseli.v is distinguishable from
+        bmnzi.b and bmzi.b and can be emitted.
diff --git a/lib/Target/Mips/MicroMipsInstrFormats.td b/lib/Target/Mips/MicroMipsInstrFormats.td
index 665b4d2..c12a32e 100644
--- a/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -39,8 +39,8 @@ class SLTI_FM_MM<bits<6> op> : MMArch {
   bits<32> Inst;
 
   let Inst{31-26} = op;
-  let Inst{25-21} = rs;
-  let Inst{20-16} = rt;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
   let Inst{15-0}  = imm16;
 }
 
@@ -110,3 +110,195 @@ class LW_FM_MM<bits<6> op> : MMArch {
   let Inst{20-16} = addr{20-16};
   let Inst{15-0}  = addr{15-0};
 }
+
+class LWL_FM_MM<bits<4> funct> {
+  bits<5> rt;
+  bits<21> addr;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x18;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = addr{20-16};
+  let Inst{15-12} = funct;
+  let Inst{11-0}  = addr{11-0};
+}
+
+class CMov_F_I_FM_MM<bits<7> func> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+  bits<3> fcc;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x15;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-13} = fcc;
+  let Inst{12-6}  = func;
+  let Inst{5-0}   = 0x3b;
+}
+
+class MTLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class MFLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = 0x00;
+  let Inst{20-16} = rd;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class CLO_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class SEB_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class EXT_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rt;
+  bits<5> rs;
+  bits<5> pos;
+  bits<5> size;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-11} = size;
+  let Inst{10-6}  = pos;
+  let Inst{5-0}   = funct;
+}
+
+class J_FM_MM<bits<6> op> : MMArch {
+  bits<26> target;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-0}  = target;
+}
+
+class JR_FM_MM<bits<8> funct> : MMArch {
+  bits<5> rs;
+
+  bits<32> Inst;
+
+  let Inst{31-21} = 0x00;
+  let Inst{20-16} = rs;
+  let Inst{15-14} = 0x0;
+  let Inst{13-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class JALR_FM_MM<bits<10> funct> : MMArch {
+  bits<5> rs;
+  bits<5> rd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rd;
+  let Inst{20-16} = rs;
+  let Inst{15-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class BEQ_FM_MM<bits<6> op> : MMArch {
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = op;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZ_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class BGEZAL_FM_MM<bits<5> funct> : MMArch {
+  bits<5>  rs;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = offset;
+}
+
+class TEQ_FM_MM<bits<6> funct> : MMArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<4> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x00;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rs;
+  let Inst{15-12} = code_;
+  let Inst{11-6}  = funct;
+  let Inst{5-0}   = 0x3c;
+}
+
+class TEQI_FM_MM<bits<5> funct> : MMArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = funct;
+  let Inst{20-16} = rs;
+  let Inst{15-0}  = imm16;
+}
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 74cdccd..d9507fa 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1,67 +1,219 @@
-let isCodeGenOnly = 1 in {
+def addrimm12 : ComplexPattern<iPTR, 2, "selectIntAddrMM", [frameindex]>;
+
+def simm12 : Operand<i32> {
+  let DecoderMethod = "DecodeSimm12";
+}
+
+def mem_mm_12 : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops GPR32, simm12);
+  let EncoderMethod = "getMemEncodingMMImm12";
+  let ParserMatchClass = MipsMemAsmOperand;
+  let OperandType = "OPERAND_MEMORY";
+}
+
+def jmptarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def calltarget_mm : Operand<iPTR> {
+  let EncoderMethod = "getJumpTargetOpValueMM";
+}
+
+def brtarget_mm : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValueMM";
+  let OperandType   = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTargetMM";
+}
+
+let canFoldAsLoad = 1 in
+class LoadLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                      Operand MemOpnd> :
+  InstSE<(outs RO:$rt), (ins MemOpnd:$addr, RO:$src),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode addrimm12:$addr, RO:$src))],
+         NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+  string Constraints = "$src = $rt";
+}
+
+class StoreLeftRightMM<string opstr, SDNode OpNode, RegisterOperand RO,
+                       Operand MemOpnd>:
+  InstSE<(outs), (ins RO:$rt, MemOpnd:$addr),
+         !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addrimm12:$addr)], NoItinerary, FrmI> {
+  let DecoderMethod = "DecodeMemMMImm12";
+}
+
+let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   /// Arithmetic Instructions (ALU Immediate)
-  def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, CPURegsOpnd>,
+  def ADDiu_MM : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd>,
                  ADDI_FM_MM<0xc>;
-  def ADDi_MM  : MMRel, ArithLogicI<"addi", simm16, CPURegsOpnd>,
+  def ADDi_MM  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>,
                  ADDI_FM_MM<0x4>;
-  def SLTi_MM  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, CPURegs>,
+  def SLTi_MM  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
                  SLTI_FM_MM<0x24>;
-  def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, CPURegs>,
+  def SLTiu_MM : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
                  SLTI_FM_MM<0x2c>;
-  def ANDi_MM  : MMRel, ArithLogicI<"andi", uimm16, CPURegsOpnd, immZExt16, and>,
+  def ANDi_MM  : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd>,
                  ADDI_FM_MM<0x34>;
-  def ORi_MM   : MMRel, ArithLogicI<"ori", uimm16, CPURegsOpnd, immZExt16, or>,
+  def ORi_MM   : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd>,
                  ADDI_FM_MM<0x14>;
-  def XORi_MM  : MMRel, ArithLogicI<"xori", uimm16, CPURegsOpnd, immZExt16, xor>,
+  def XORi_MM  : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd>,
                  ADDI_FM_MM<0x1c>;
-  def LUi_MM   : MMRel, LoadUpper<"lui", CPURegs, uimm16>, LUI_FM_MM;
+  def LUi_MM   : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM_MM;
 
   /// Arithmetic Instructions (3-Operand, R-Type)
-  def ADDu_MM  : MMRel, ArithLogicR<"addu", CPURegsOpnd>, ADD_FM_MM<0, 0x150>;
-  def SUBu_MM  : MMRel, ArithLogicR<"subu", CPURegsOpnd>, ADD_FM_MM<0, 0x1d0>;
-  def MUL_MM   : MMRel, ArithLogicR<"mul", CPURegsOpnd>, ADD_FM_MM<0, 0x210>;
-  def ADD_MM   : MMRel, ArithLogicR<"add", CPURegsOpnd>, ADD_FM_MM<0, 0x110>;
-  def SUB_MM   : MMRel, ArithLogicR<"sub", CPURegsOpnd>, ADD_FM_MM<0, 0x190>;
-  def SLT_MM   : MMRel, SetCC_R<"slt", setlt, CPURegs>, ADD_FM_MM<0, 0x350>;
-  def SLTu_MM  : MMRel, SetCC_R<"sltu", setult, CPURegs>,
+  def ADDu_MM  : MMRel, ArithLogicR<"addu", GPR32Opnd>, ADD_FM_MM<0, 0x150>;
+  def SUBu_MM  : MMRel, ArithLogicR<"subu", GPR32Opnd>, ADD_FM_MM<0, 0x1d0>;
+  def MUL_MM   : MMRel, ArithLogicR<"mul", GPR32Opnd>, ADD_FM_MM<0, 0x210>;
+  def ADD_MM   : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM_MM<0, 0x110>;
+  def SUB_MM   : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM_MM<0, 0x190>;
+  def SLT_MM   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM_MM<0, 0x350>;
+  def SLTu_MM  : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>,
                  ADD_FM_MM<0, 0x390>;
-  def AND_MM   : MMRel, ArithLogicR<"and", CPURegsOpnd, 1, IIAlu, and>,
+  def AND_MM   : MMRel, ArithLogicR<"and", GPR32Opnd, 1, IIAlu, and>,
                  ADD_FM_MM<0, 0x250>;
-  def OR_MM    : MMRel, ArithLogicR<"or", CPURegsOpnd, 1, IIAlu, or>,
+  def OR_MM    : MMRel, ArithLogicR<"or", GPR32Opnd, 1, IIAlu, or>,
                  ADD_FM_MM<0, 0x290>;
-  def XOR_MM   : MMRel, ArithLogicR<"xor", CPURegsOpnd, 1, IIAlu, xor>,
+  def XOR_MM   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IIAlu, xor>,
                  ADD_FM_MM<0, 0x310>;
-  def NOR_MM   : MMRel, LogicNOR<"nor", CPURegsOpnd>, ADD_FM_MM<0, 0x2d0>;
-  def MULT_MM  : MMRel, Mult<"mult", IIImul, CPURegsOpnd, [HI, LO]>,
+  def NOR_MM   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM_MM<0, 0x2d0>;
+  def MULT_MM  : MMRel, Mult<"mult", IIImul, GPR32Opnd, [HI0, LO0]>,
                  MULT_FM_MM<0x22c>;
-  def MULTu_MM : MMRel, Mult<"multu", IIImul, CPURegsOpnd, [HI, LO]>,
+  def MULTu_MM : MMRel, Mult<"multu", IIImul, GPR32Opnd, [HI0, LO0]>,
                  MULT_FM_MM<0x26c>;
+  def SDIV_MM  : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ac>;
+  def UDIV_MM  : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+                 MULT_FM_MM<0x2ec>;
 
   /// Shift Instructions
-  def SLL_MM   : MMRel, shift_rotate_imm<"sll", shamt, CPURegsOpnd>,
+  def SLL_MM   : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0, 0>;
-  def SRL_MM   : MMRel, shift_rotate_imm<"srl", shamt, CPURegsOpnd>,
+  def SRL_MM   : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0x40, 0>;
-  def SRA_MM   : MMRel, shift_rotate_imm<"sra", shamt, CPURegsOpnd>,
+  def SRA_MM   : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0x80, 0>;
-  def SLLV_MM  : MMRel, shift_rotate_reg<"sllv", CPURegsOpnd>,
+  def SLLV_MM  : MMRel, shift_rotate_reg<"sllv", GPR32Opnd>,
                  SRLV_FM_MM<0x10, 0>;
-  def SRLV_MM  : MMRel, shift_rotate_reg<"srlv", CPURegsOpnd>,
+  def SRLV_MM  : MMRel, shift_rotate_reg<"srlv", GPR32Opnd>,
                  SRLV_FM_MM<0x50, 0>;
-  def SRAV_MM  : MMRel, shift_rotate_reg<"srav", CPURegsOpnd>,
+  def SRAV_MM  : MMRel, shift_rotate_reg<"srav", GPR32Opnd>,
                  SRLV_FM_MM<0x90, 0>;
-  def ROTR_MM  : MMRel, shift_rotate_imm<"rotr", shamt, CPURegsOpnd>,
+  def ROTR_MM  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd>,
                  SRA_FM_MM<0xc0, 0>;
-  def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", CPURegsOpnd>,
+  def ROTRV_MM : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd>,
                  SRLV_FM_MM<0xd0, 0>;
 
   /// Load and Store Instructions - aligned
-  defm LB_MM  : LoadM<"lb", CPURegs, sextloadi8>, MMRel, LW_FM_MM<0x7>;
-  defm LBu_MM : LoadM<"lbu", CPURegs, zextloadi8>, MMRel, LW_FM_MM<0x5>;
-  defm LH_MM  : LoadM<"lh", CPURegs, sextloadi16>, MMRel, LW_FM_MM<0xf>;
-  defm LHu_MM : LoadM<"lhu", CPURegs, zextloadi16>, MMRel, LW_FM_MM<0xd>;
-  defm LW_MM  : LoadM<"lw", CPURegs>, MMRel, LW_FM_MM<0x3f>;
-  defm SB_MM  : StoreM<"sb", CPURegs, truncstorei8>, MMRel, LW_FM_MM<0x6>;
-  defm SH_MM  : StoreM<"sh", CPURegs, truncstorei16>, MMRel, LW_FM_MM<0xe>;
-  defm SW_MM  : StoreM<"sw", CPURegs>, MMRel, LW_FM_MM<0x3e>;
+  let DecoderMethod = "DecodeMemMMImm16" in {
+    def LB_MM  : Load<"lb", GPR32Opnd>, MMRel, LW_FM_MM<0x7>;
+    def LBu_MM : Load<"lbu", GPR32Opnd>, MMRel, LW_FM_MM<0x5>;
+    def LH_MM  : Load<"lh", GPR32Opnd>, MMRel, LW_FM_MM<0xf>;
+    def LHu_MM : Load<"lhu", GPR32Opnd>, MMRel, LW_FM_MM<0xd>;
+    def LW_MM  : Load<"lw", GPR32Opnd>, MMRel, LW_FM_MM<0x3f>;
+    def SB_MM  : Store<"sb", GPR32Opnd>, MMRel, LW_FM_MM<0x6>;
+    def SH_MM  : Store<"sh", GPR32Opnd>, MMRel, LW_FM_MM<0xe>;
+    def SW_MM  : Store<"sw", GPR32Opnd>, MMRel, LW_FM_MM<0x3e>;
+  }
+
+  /// Load and Store Instructions - unaligned
+  def LWL_MM : LoadLeftRightMM<"lwl", MipsLWL, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x0>;
+  def LWR_MM : LoadLeftRightMM<"lwr", MipsLWR, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x1>;
+  def SWL_MM : StoreLeftRightMM<"swl", MipsSWL, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x8>;
+  def SWR_MM : StoreLeftRightMM<"swr", MipsSWR, GPR32Opnd, mem_mm_12>,
+               LWL_FM_MM<0x9>;
+
+  /// Move Conditional
+  def MOVZ_I_MM : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x58>;
+  def MOVN_I_MM : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd,
+                  NoItinerary>, ADD_FM_MM<0, 0x18>;
+  def MOVT_I_MM : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIAlu>,
+                  CMov_F_I_FM_MM<0x25>;
+  def MOVF_I_MM : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIAlu>,
+                  CMov_F_I_FM_MM<0x5>;
+
+  /// Move to/from HI/LO
+  def MTHI_MM : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>,
+                MTLO_FM_MM<0x0b5>;
+  def MTLO_MM : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>,
+                MTLO_FM_MM<0x0f5>;
+  def MFHI_MM : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x035>;
+  def MFLO_MM : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>,
+                MFLO_FM_MM<0x075>;
+
+  /// Multiply Add/Sub Instructions
+  def MADD_MM  : MMRel, MArithR<"madd", 1>, MULT_FM_MM<0x32c>;
+  def MADDU_MM : MMRel, MArithR<"maddu", 1>, MULT_FM_MM<0x36c>;
+  def MSUB_MM  : MMRel, MArithR<"msub">, MULT_FM_MM<0x3ac>;
+  def MSUBU_MM : MMRel, MArithR<"msubu">, MULT_FM_MM<0x3ec>;
+
+  /// Count Leading
+  def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>;
+  def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>;
+
+  /// Sign Ext In Register Instructions.
+  def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM_MM<0x0ac>;
+  def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM_MM<0x0ec>;
+
+  /// Word Swap Bytes Within Halfwords
+  def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>;
+
+  def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>,
+               EXT_FM_MM<0x2c>;
+  def INS_MM : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>,
+               EXT_FM_MM<0x0c>;
+
+  /// Jump Instructions
+  let DecoderMethod = "DecodeJumpTargetMM" in {
+    def J_MM        : MMRel, JumpFJ<jmptarget_mm, "j", br, bb, "j">,
+                      J_FM_MM<0x35>;
+    def JAL_MM      : MMRel, JumpLink<"jal", calltarget_mm>, J_FM_MM<0x3d>;
+    def TAILCALL_MM : MMRel, JumpFJ<calltarget_mm, "j", MipsTailCall, imm,
+                                    "tcall">, J_FM_MM<0x3d>, IsTailCall;
+  }
+  def JR_MM   : MMRel, IndirectBranch<"jr", GPR32Opnd>, JR_FM_MM<0x3c>;
+  def JALR_MM : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM_MM<0x03c>;
+  def TAILCALL_R_MM : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>,
+                      JR_FM_MM<0x3c>, IsTailCall;
+  def RET_MM : MMRel, RetBase<"ret", GPR32Opnd>, JR_FM_MM<0x3c>;
+
+  /// Branch Instructions
+  def BEQ_MM  : MMRel, CBranch<"beq", brtarget_mm, seteq, GPR32Opnd>,
+                BEQ_FM_MM<0x25>;
+  def BNE_MM  : MMRel, CBranch<"bne", brtarget_mm, setne, GPR32Opnd>,
+                BEQ_FM_MM<0x2d>;
+  def BGEZ_MM : MMRel, CBranchZero<"bgez", brtarget_mm, setge, GPR32Opnd>,
+                BGEZ_FM_MM<0x2>;
+  def BGTZ_MM : MMRel, CBranchZero<"bgtz", brtarget_mm, setgt, GPR32Opnd>,
+                BGEZ_FM_MM<0x6>;
+  def BLEZ_MM : MMRel, CBranchZero<"blez", brtarget_mm, setle, GPR32Opnd>,
+                BGEZ_FM_MM<0x4>;
+  def BLTZ_MM : MMRel, CBranchZero<"bltz", brtarget_mm, setlt, GPR32Opnd>,
+                BGEZ_FM_MM<0x0>;
+  def BGEZAL_MM : MMRel, BGEZAL_FT<"bgezal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x03>;
+  def BLTZAL_MM : MMRel, BGEZAL_FT<"bltzal", brtarget_mm, GPR32Opnd>,
+                  BGEZAL_FM_MM<0x01>;
+
+  /// Trap Instructions
+  def TEQ_MM  : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>;
+  def TGE_MM  : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM_MM<0x08>;
+  def TGEU_MM : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM_MM<0x10>;
+  def TLT_MM  : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM_MM<0x20>;
+  def TLTU_MM : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM_MM<0x28>;
+  def TNE_MM  : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM_MM<0x30>;
+
+  def TEQI_MM  : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM_MM<0x0e>;
+  def TGEI_MM  : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM_MM<0x09>;
+  def TGEIU_MM : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM_MM<0x0b>;
+  def TLTI_MM  : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM_MM<0x08>;
+  def TLTIU_MM : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM_MM<0x0a>;
+  def TNEI_MM  : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM_MM<0x0c>;
 }
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
index 8c65bb4..e796deb 100644
--- a/lib/Target/Mips/Mips.h
+++ b/lib/Target/Mips/Mips.h
@@ -28,7 +28,6 @@ namespace llvm {
   FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM,
                                              JITCodeEmitter &JCE);
   FunctionPass *createMipsConstantIslandPass(MipsTargetMachine &tm);
-
 } // end namespace llvm;
 
 #endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index eefb02a..b8e3f39 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -78,6 +78,8 @@ def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
 def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
                                     "Mips DSP-R2 ASE", [FeatureDSP]>;
 
+def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
+
 def FeatureMicroMips  : SubtargetFeature<"micromips", "InMicroMipsMode", "true",
                                          "microMips mode">;
 
@@ -101,6 +103,7 @@ def MipsAsmWriter : AsmWriter {
 
 def MipsAsmParser : AsmParser {
   let ShouldEmitMatchRegisterName = 0;
+  let MnemonicContainsDot = 1;
 }
 
 def MipsAsmParserVariant : AsmParserVariant {
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 1bb6fe4..6655ff9 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -40,7 +40,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
   if (StackSize == 0 && !MFI->adjustsStack()) return;
 
   MachineModuleInfo &MMI = MF.getMMI();
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   MachineLocation DstML, SrcML;
 
   // Adjust stack.
@@ -50,24 +50,23 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
   MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
   BuildMI(MBB, MBBI, dl,
           TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
-  DstML = MachineLocation(MachineLocation::VirtualFP);
-  SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
-  Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
+  MMI.addFrameInst(
+      MCCFIInstruction::createDefCfaOffset(AdjustSPLabel, -StackSize));
 
   MCSymbol *CSLabel = MMI.getContext().CreateTempSymbol();
   BuildMI(MBB, MBBI, dl,
           TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel);
-  DstML = MachineLocation(MachineLocation::VirtualFP, -8);
-  SrcML = MachineLocation(Mips::S1);
-  Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+  unsigned S2 = MRI->getDwarfRegNum(Mips::S2, true);
+  MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S2, -8));
 
-  DstML = MachineLocation(MachineLocation::VirtualFP, -12);
-  SrcML = MachineLocation(Mips::S0);
-  Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+  unsigned S1 = MRI->getDwarfRegNum(Mips::S1, true);
+  MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S1, -12));
 
-  DstML = MachineLocation(MachineLocation::VirtualFP, -4);
-  SrcML = MachineLocation(Mips::RA);
-  Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+  unsigned S0 = MRI->getDwarfRegNum(Mips::S0, true);
+  MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, S0, -16));
+
+  unsigned RA = MRI->getDwarfRegNum(Mips::RA, true);
+  MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel, RA, -4));
 
   if (hasFP(MF))
     BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
@@ -172,6 +171,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   MF.getRegInfo().setPhysRegUsed(Mips::RA);
   MF.getRegInfo().setPhysRegUsed(Mips::S0);
   MF.getRegInfo().setPhysRegUsed(Mips::S1);
+  MF.getRegInfo().setPhysRegUsed(Mips::S2);
 }
 
 const MipsFrameLowering *
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 54fdb78..8ce2ced 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -20,7 +20,7 @@ namespace llvm {
 class Mips16FrameLowering : public MipsFrameLowering {
 public:
   explicit Mips16FrameLowering(const MipsSubtarget &STI)
-    : MipsFrameLowering(STI, 8) {}
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
new file mode 100644
index 0000000..81bf18c
--- /dev/null
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -0,0 +1,517 @@
+//===---- Mips16HardFloat.cpp for Mips16 Hard Float               --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass needed for Mips16 Hard Float
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips16-hard-float"
+#include "Mips16HardFloat.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <string>
+
+static void inlineAsmOut
+  (LLVMContext &C, StringRef AsmString, BasicBlock *BB ) {
+  std::vector<llvm::Type *> AsmArgTypes;
+  std::vector<llvm::Value*> AsmArgs;
+  llvm::FunctionType *AsmFTy =
+    llvm::FunctionType::get(Type::getVoidTy(C),
+                            AsmArgTypes, false);
+  llvm::InlineAsm *IA =
+    llvm::InlineAsm::get(AsmFTy, AsmString, "", true,
+                         /* IsAlignStack */ false,
+                         llvm::InlineAsm::AD_ATT);
+  CallInst::Create(IA, AsmArgs, "", BB);
+}
+
+namespace {
+
+class InlineAsmHelper {
+  LLVMContext &C;
+  BasicBlock *BB;
+public:
+  InlineAsmHelper(LLVMContext &C_, BasicBlock *BB_) :
+    C(C_), BB(BB_) {
+  }
+
+  void Out(StringRef AsmString) {
+    inlineAsmOut(C, AsmString, BB);
+  }
+
+};
+}
+//
+// Return types that matter for hard float are:
+// float, double, complex float, and complex double
+//
+enum FPReturnVariant {
+  FRet, DRet, CFRet, CDRet, NoFPRet
+};
+
+//
+// Determine which FP return type this function has
+//
+static FPReturnVariant whichFPReturnVariant(Type *T) {
+  switch (T->getTypeID()) {
+  case Type::FloatTyID:
+    return FRet;
+  case Type::DoubleTyID:
+    return DRet;
+  case Type::StructTyID:
+    if (T->getStructNumElements() != 2)
+      break;
+    if ((T->getContainedType(0)->isFloatTy()) &&
+        (T->getContainedType(1)->isFloatTy()))
+      return CFRet;
+    if ((T->getContainedType(0)->isDoubleTy()) &&
+        (T->getContainedType(1)->isDoubleTy()))
+      return CDRet;
+    break;
+  default:
+    break;
+  }
+  return NoFPRet;
+}
+
+//
+// Parameter type that matter are float, (float, float), (float, double),
+// double, (double, double), (double, float)
+//
+enum FPParamVariant {
+  FSig, FFSig, FDSig,
+  DSig, DDSig, DFSig, NoSig
+};
+
+// which floating point parameter signature variant we are dealing with
+//
+typedef Type::TypeID TypeID;
+const Type::TypeID FloatTyID = Type::FloatTyID;
+const Type::TypeID DoubleTyID = Type::DoubleTyID;
+
+static FPParamVariant whichFPParamVariantNeeded(Function &F) {
+  switch (F.arg_size()) {
+  case 0:
+    return NoSig;
+  case 1:{
+    TypeID ArgTypeID = F.getFunctionType()->getParamType(0)->getTypeID();
+    switch (ArgTypeID) {
+    case FloatTyID:
+      return FSig;
+    case DoubleTyID:
+      return DSig;
+    default:
+      return NoSig;
+    }
+  }
+  default: {
+    TypeID ArgTypeID0 = F.getFunctionType()->getParamType(0)->getTypeID();
+    TypeID ArgTypeID1 = F.getFunctionType()->getParamType(1)->getTypeID();
+    switch(ArgTypeID0) {
+    case FloatTyID: {
+      switch (ArgTypeID1) {
+      case FloatTyID:
+        return FFSig;
+      case DoubleTyID:
+        return FDSig;
+      default:
+        return FSig;
+      }
+    }
+    case DoubleTyID: {
+      switch (ArgTypeID1) {
+      case FloatTyID:
+        return DFSig;
+      case DoubleTyID:
+        return DDSig;
+      default:
+        return DSig;
+      }
+    }
+    default:
+      return NoSig;
+    }
+  }
+  }
+  llvm_unreachable("can't get here");
+}
+
+// Figure out if we need float point based on the function parameters.
+// We need to move variables in and/or out of floating point
+// registers because of the ABI
+//
+static bool needsFPStubFromParams(Function &F) {
+  if (F.arg_size() >=1) {
+    Type *ArgType = F.getFunctionType()->getParamType(0);
+    switch (ArgType->getTypeID()) {
+      case Type::FloatTyID:
+      case Type::DoubleTyID:
+        return true;
+      default:
+        break;
+    }
+  }
+  return false;
+}
+
+static bool needsFPReturnHelper(Function &F) {
+  Type* RetType = F.getReturnType();
+  return whichFPReturnVariant(RetType) != NoFPRet;
+}
+
+static bool needsFPHelperFromSig(Function &F) {
+  return needsFPStubFromParams(F) || needsFPReturnHelper(F);
+}
+
+//
+// We swap between FP and Integer registers to allow Mips16 and Mips32 to
+// interoperate
+//
+
+static void swapFPIntParams
+  (FPParamVariant PV, Module *M, InlineAsmHelper &IAH,
+   bool LE, bool ToFP) {
+  //LLVMContext &Context = M->getContext();
+  std::string MI = ToFP? "mtc1 ": "mfc1 ";
+  switch (PV) {
+  case FSig:
+    IAH.Out(MI + "$$4,$$f12");
+    break;
+  case FFSig:
+    IAH.Out(MI +"$$4,$$f12");
+    IAH.Out(MI + "$$5,$$f14");
+    break;
+  case FDSig:
+    IAH.Out(MI + "$$4,$$f12");
+    if (LE) {
+      IAH.Out(MI + "$$6,$$f14");
+      IAH.Out(MI + "$$7,$$f15");
+    } else {
+      IAH.Out(MI + "$$7,$$f14");
+      IAH.Out(MI + "$$6,$$f15");
+    }
+    break;
+  case DSig:
+    if (LE) {
+      IAH.Out(MI + "$$4,$$f12");
+      IAH.Out(MI + "$$5,$$f13");
+    } else {
+      IAH.Out(MI + "$$5,$$f12");
+      IAH.Out(MI + "$$4,$$f13");
+    }
+    break;
+  case DDSig:
+    if (LE) {
+      IAH.Out(MI + "$$4,$$f12");
+      IAH.Out(MI + "$$5,$$f13");
+      IAH.Out(MI + "$$6,$$f14");
+      IAH.Out(MI + "$$7,$$f15");
+    } else {
+      IAH.Out(MI + "$$5,$$f12");
+      IAH.Out(MI + "$$4,$$f13");
+      IAH.Out(MI + "$$7,$$f14");
+      IAH.Out(MI + "$$6,$$f15");
+    }
+    break;
+  case DFSig:
+    if (LE) {
+      IAH.Out(MI + "$$4,$$f12");
+      IAH.Out(MI + "$$5,$$f13");
+    } else {
+      IAH.Out(MI + "$$5,$$f12");
+      IAH.Out(MI + "$$4,$$f13");
+    }
+    IAH.Out(MI + "$$6,$$f14");
+    break;
+  case NoSig:
+    return;
+  }
+}
+//
+// Make sure that we know we already need a stub for this function.
+// Having called needsFPHelperFromSig
+//
+static void assureFPCallStub(Function &F, Module *M,  
+                             const MipsSubtarget &Subtarget){
+  // for now we only need them for static relocation
+  if (Subtarget.getRelocationModel() == Reloc::PIC_)
+    return;
+  LLVMContext &Context = M->getContext();
+  bool LE = Subtarget.isLittle();
+  std::string Name = F.getName();
+  std::string SectionName = ".mips16.call.fp." + Name;
+  std::string StubName = "__call_stub_fp_" + Name;
+  //
+  // see if we already have the stub
+  //
+  Function *FStub = M->getFunction(StubName);
+  if (FStub && !FStub->isDeclaration()) return;
+  FStub = Function::Create(F.getFunctionType(),
+                           Function::InternalLinkage, StubName, M);
+  FStub->addFnAttr("mips16_fp_stub");
+  FStub->addFnAttr(llvm::Attribute::Naked);
+  FStub->addFnAttr(llvm::Attribute::NoInline);
+  FStub->addFnAttr(llvm::Attribute::NoUnwind);
+  FStub->addFnAttr("nomips16");
+  FStub->setSection(SectionName);
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+  InlineAsmHelper IAH(Context, BB);
+  IAH.Out(".set reorder");
+  FPReturnVariant RV = whichFPReturnVariant(FStub->getReturnType());
+  FPParamVariant PV = whichFPParamVariantNeeded(F);
+  swapFPIntParams(PV, M, IAH, LE, true);
+  if (RV != NoFPRet) {
+    IAH.Out("move $$18, $$31");
+    IAH.Out("jal " + Name);
+  } else {
+    IAH.Out("lui  $$25,%hi(" + Name + ")");
+    IAH.Out("addiu  $$25,$$25,%lo(" + Name + ")" );
+  }
+  switch (RV) {
+  case FRet:
+    IAH.Out("mfc1 $$2,$$f0");
+    break;
+  case DRet:
+    if (LE) {
+      IAH.Out("mfc1 $$2,$$f0");
+      IAH.Out("mfc1 $$3,$$f1");
+    } else {
+      IAH.Out("mfc1 $$3,$$f0");
+      IAH.Out("mfc1 $$2,$$f1");
+    }
+    break;
+  case CFRet:
+    if (LE) {
+    IAH.Out("mfc1 $$2,$$f0");
+    IAH.Out("mfc1 $$3,$$f2");
+    } else {
+      IAH.Out("mfc1 $$3,$$f0");
+      IAH.Out("mfc1 $$3,$$f2");
+    }
+    break;
+  case CDRet:
+    if (LE) {
+      IAH.Out("mfc1 $$4,$$f2");
+      IAH.Out("mfc1 $$5,$$f3");
+      IAH.Out("mfc1 $$2,$$f0");
+      IAH.Out("mfc1 $$3,$$f1");
+
+    } else {
+      IAH.Out("mfc1 $$5,$$f2");
+      IAH.Out("mfc1 $$4,$$f3");
+      IAH.Out("mfc1 $$3,$$f0");
+      IAH.Out("mfc1 $$2,$$f1");
+    }
+    break;
+  case NoFPRet:
+    break;
+  }
+  if (RV != NoFPRet)
+    IAH.Out("jr $$18");
+  else
+    IAH.Out("jr $$25");
+  new UnreachableInst(Context, BB);
+}
+
+//
+// Functions that are llvm intrinsics and don't need helpers.
+//
+static const char *IntrinsicInline[] =
+  {"fabs",
+   "fabsf",
+   "llvm.ceil.f32", "llvm.ceil.f64",
+   "llvm.copysign.f32", "llvm.copysign.f64",
+   "llvm.cos.f32", "llvm.cos.f64",
+   "llvm.exp.f32", "llvm.exp.f64",
+   "llvm.exp2.f32", "llvm.exp2.f64",
+   "llvm.fabs.f32", "llvm.fabs.f64",
+   "llvm.floor.f32", "llvm.floor.f64",
+   "llvm.fma.f32", "llvm.fma.f64",
+   "llvm.log.f32", "llvm.log.f64",
+   "llvm.log10.f32", "llvm.log10.f64",
+   "llvm.nearbyint.f32", "llvm.nearbyint.f64",
+   "llvm.pow.f32", "llvm.pow.f64",
+   "llvm.powi.f32", "llvm.powi.f64",
+   "llvm.rint.f32", "llvm.rint.f64",
+   "llvm.round.f32", "llvm.round.f64",
+   "llvm.sin.f32", "llvm.sin.f64",
+   "llvm.sqrt.f32", "llvm.sqrt.f64",
+   "llvm.trunc.f32", "llvm.trunc.f64",
+  };
+
+static bool isIntrinsicInline(Function *F) {
+  return std::binary_search(
+    IntrinsicInline, array_endof(IntrinsicInline),
+    F->getName());
+}
+//
+// Returns of float, double and complex need to be handled with a helper
+// function.
+//
+static bool fixupFPReturnAndCall
+  (Function &F, Module *M,  const MipsSubtarget &Subtarget) {
+  bool Modified = false;
+  LLVMContext &C = M->getContext();
+  Type *MyVoid = Type::getVoidTy(C);
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      Instruction &Inst = *I;
+      if (const ReturnInst *RI = dyn_cast<ReturnInst>(I)) {
+        Value *RVal = RI->getReturnValue();
+        if (!RVal) continue;
+        //
+        // If there is a return value and it needs a helper function,
+        // figure out which one and add a call before the actual
+        // return to this helper. The purpose of the helper is to move
+        // floating point values from their soft float return mapping to
+        // where they would have been mapped to in floating point registers.
+        //
+        Type *T = RVal->getType();
+        FPReturnVariant RV = whichFPReturnVariant(T);
+        if (RV == NoFPRet) continue;
+        static const char* Helper[NoFPRet] =
+          {"__mips16_ret_sf", "__mips16_ret_df", "__mips16_ret_sc",
+           "__mips16_ret_dc"};
+        const char *Name = Helper[RV];
+        AttributeSet A;
+        Value *Params[] = {RVal};
+        Modified = true;
+        //
+        // These helper functions have a different calling ABI so
+        // this __Mips16RetHelper indicates that so that later
+        // during call setup, the proper call lowering to the helper
+        // functions will take place.
+        //
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           "__Mips16RetHelper");
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           Attribute::ReadNone);
+        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+                           Attribute::NoInline);
+        Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, NULL));
+        CallInst::Create(F, Params, "", &Inst );
+      } else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+          // pic mode calls are handled by already defined
+          // helper functions
+          if (Subtarget.getRelocationModel() != Reloc::PIC_ ) {
+            Function *F_ =  CI->getCalledFunction();
+            if (F_ && !isIntrinsicInline(F_) && needsFPHelperFromSig(*F_)) {
+              assureFPCallStub(*F_, M, Subtarget);
+              Modified=true;
+            }
+          }
+      }
+    }
+  return Modified;
+}
+
+static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
+                  const MipsSubtarget &Subtarget ) {
+  bool PicMode = Subtarget.getRelocationModel() == Reloc::PIC_;
+  bool LE = Subtarget.isLittle();
+  LLVMContext &Context = M->getContext();
+  std::string Name = F->getName();
+  std::string SectionName = ".mips16.fn." + Name;
+  std::string StubName = "__fn_stub_" + Name;
+  std::string LocalName = "$$__fn_local_" + Name;
+  Function *FStub = Function::Create
+    (F->getFunctionType(),
+     Function::InternalLinkage, StubName, M);
+  FStub->addFnAttr("mips16_fp_stub");
+  FStub->addFnAttr(llvm::Attribute::Naked);
+  FStub->addFnAttr(llvm::Attribute::NoUnwind);
+  FStub->addFnAttr(llvm::Attribute::NoInline);
+  FStub->addFnAttr("nomips16");
+  FStub->setSection(SectionName);
+  BasicBlock *BB = BasicBlock::Create(Context, "entry", FStub);
+  InlineAsmHelper IAH(Context, BB);
+  IAH.Out(" .set  macro");
+  if (PicMode) {
+    IAH.Out(".set noreorder");
+    IAH.Out(".cpload  $$25");
+    IAH.Out(".set reorder");
+    IAH.Out(".reloc 0,R_MIPS_NONE," + Name);
+    IAH.Out("la $$25," + LocalName);
+  }
+  else {
+    IAH.Out(".set reorder");
+    IAH.Out("la $$25," + Name);
+  }
+  swapFPIntParams(PV, M, IAH, LE, false);
+  IAH.Out("jr $$25");
+  IAH.Out(LocalName + " = " + Name);
+  new UnreachableInst(FStub->getContext(), BB);
+}
+
+//
+// remove the use-soft-float attribute
+//
+static void removeUseSoftFloat(Function &F) {
+  AttributeSet A;
+  DEBUG(errs() << "removing -use-soft-float\n");
+  A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+                     "use-soft-float", "false");
+  F.removeAttributes(AttributeSet::FunctionIndex, A);
+  if (F.hasFnAttribute("use-soft-float")) {
+    DEBUG(errs() << "still has -use-soft-float\n");
+  }
+  F.addAttributes(AttributeSet::FunctionIndex, A);
+}
+
+namespace llvm {
+
+//
+// This pass only makes sense when the underlying chip has floating point but
+// we are compiling as mips16.
+// For all mips16 functions (that are not stubs we have already generated), or
+// declared via attributes as nomips16, we must:
+//    1) fixup all returns of float, double, single and double complex
+//       by calling a helper function before the actual return.
+//    2) generate helper functions (stubs) that can be called by mips32 functions
+//       that will move parameters passed normally passed in floating point
+//       registers the soft float equivalents.
+//    3) in the case of static relocation, generate helper functions so that
+//       mips16 functions can call extern functions of unknown type (mips16 or
+//       mips32).
+//    4) TBD. For pic, calls to extern functions of unknown type are handled by
+//       predefined helper functions in libc but this work is currently done
+//       during call lowering but it should be moved here in the future.
+//
+bool Mips16HardFloat::runOnModule(Module &M) {
+  DEBUG(errs() << "Run on Module Mips16HardFloat\n");
+  bool Modified = false;
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    if (F->hasFnAttribute("nomips16") &&
+        F->hasFnAttribute("use-soft-float")) {
+      removeUseSoftFloat(*F);
+      continue;
+    }
+    if (F->isDeclaration() || F->hasFnAttribute("mips16_fp_stub") ||
+        F->hasFnAttribute("nomips16")) continue;
+    Modified |= fixupFPReturnAndCall(*F, &M, Subtarget);
+    FPParamVariant V = whichFPParamVariantNeeded(*F);
+    if (V != NoSig) {
+      Modified = true;
+      createFPFnStub(F, &M, V, Subtarget);
+    }
+  }
+  return Modified;
+}
+
+char Mips16HardFloat::ID = 0;
+
+}
+
+ModulePass *llvm::createMips16HardFloat(MipsTargetMachine &TM) {
+  return new Mips16HardFloat(TM);
+}
+
diff --git a/lib/Target/Mips/Mips16HardFloat.h b/lib/Target/Mips/Mips16HardFloat.h
new file mode 100644
index 0000000..b7f712a
--- /dev/null
+++ b/lib/Target/Mips/Mips16HardFloat.h
@@ -0,0 +1,54 @@
+//===---- Mips16HardFloat.h for Mips16 Hard Float                  --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a phase which implements part of the floating point
+// interoperability between Mips16 and Mips32 code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsTargetMachine.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+
+#ifndef MIPS16HARDFLOAT_H
+#define MIPS16HARDFLOAT_H
+
+using namespace llvm;
+
+namespace llvm {
+
+class Mips16HardFloat : public ModulePass {
+
+public:
+  static char ID;
+
+  Mips16HardFloat(MipsTargetMachine &TM_) : ModulePass(ID),
+    TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {
+  }
+
+  virtual const char *getPassName() const {
+    return "MIPS16 Hard Float Pass";
+  }
+
+  virtual bool runOnModule(Module &M);
+
+protected:
+  /// Keep a pointer to the MipsSubtarget around so that we can make the right
+  /// decision when generating code for different targets.
+  const TargetMachine &TM;
+  const MipsSubtarget &Subtarget;
+
+};
+
+ModulePass *createMips16HardFloat(MipsTargetMachine &TM);
+
+}
+#endif
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index c1c635c..4948f40 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -42,7 +42,7 @@ bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
 }
 /// Select multiply instructions.
 std::pair<SDNode*, SDNode*>
-Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, DebugLoc DL, EVT Ty,
+Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, SDLoc DL, EVT Ty,
                                bool HasLo, bool HasHi) {
   SDNode *Lo = 0, *Hi = 0;
   SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
@@ -80,10 +80,11 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   V1 = RegInfo.createVirtualRegister(RC);
   V2 = RegInfo.createVirtualRegister(RC);
 
-  BuildMI(MBB, I, DL, TII.get(Mips::LiRxImmX16), V0)
-    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI);
-  BuildMI(MBB, I, DL, TII.get(Mips::AddiuRxPcImmX16), V1)
-    .addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+  BuildMI(MBB, I, DL, TII.get(Mips::GotPrologue16), V0).
+    addReg(V1, RegState::Define).
+    addExternalSymbol("_gp_disp", MipsII::MO_ABS_HI).
+    addExternalSymbol("_gp_disp", MipsII::MO_ABS_LO);
+
   BuildMI(MBB, I, DL, TII.get(Mips::SllX16), V2).addReg(V0).addImm(16);
   BuildMI(MBB, I, DL, TII.get(Mips::AdduRxRyRz16), GlobalBaseReg)
     .addReg(V1).addReg(V2);
@@ -118,11 +119,13 @@ void Mips16DAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
 SDValue Mips16DAGToDAGISel::getMips16SPAliasReg() {
   unsigned Mips16SPAliasReg =
     MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
-  return CurDAG->getRegister(Mips16SPAliasReg, TLI.getPointerTy());
+  return CurDAG->getRegister(Mips16SPAliasReg,
+                             getTargetLowering()->getPointerTy());
 }
 
 void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
-  SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI.getPointerTy());
+  SDValue AliasFPReg = CurDAG->getRegister(Mips::S0,
+                                           getTargetLowering()->getPointerTy());
   if (Parent) {
     switch (Parent->getOpcode()) {
       case ISD::LOAD: {
@@ -149,7 +152,7 @@ void Mips16DAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
       }
     }
   }
-  AliasReg = CurDAG->getRegister(Mips::SP, TLI.getPointerTy());
+  AliasReg = CurDAG->getRegister(Mips::SP, getTargetLowering()->getPointerTy());
   return;
 
 }
@@ -235,7 +238,7 @@ bool Mips16DAGToDAGISel::selectAddr16(
 /// expanded, promoted and normal instructions
 std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
   unsigned Opcode = Node->getOpcode();
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
 
   ///
   // Instruction Selection not handled by the auto-generated
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h
index f05f9b7..49dc6e5 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.h
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -23,7 +23,7 @@ public:
   explicit Mips16DAGToDAGISel(MipsTargetMachine &TM) : MipsDAGToDAGISel(TM) {}
 
 private:
-  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc DL,
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc DL,
                                          EVT Ty, bool HasLo, bool HasHi);
 
   SDValue getMips16SPAliasReg();
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index f63318f..61d8bb8 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -13,19 +13,14 @@
 #define DEBUG_TYPE "mips-lower"
 #include "Mips16ISelLowering.h"
 #include "MipsRegisterInfo.h"
+#include "MipsTargetMachine.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include <set>
 
 using namespace llvm;
 
-static cl::opt<bool>
-Mips16HardFloat("mips16-hard-float", cl::NotHidden,
-                cl::desc("MIPS: mips16 hard float enable."),
-                cl::init(false));
-
 static cl::opt<bool> DontExpandCondPseudos16(
   "mips16-dont-expand-cond-pseudo",
   cl::init(false),
@@ -34,15 +29,98 @@ static cl::opt<bool> DontExpandCondPseudos16(
   cl::Hidden);
 
 namespace {
-  std::set<const char*, MipsTargetLowering::LTStr> NoHelperNeeded;
+struct Mips16Libcall {
+  RTLIB::Libcall Libcall;
+  const char *Name;
+
+  bool operator<(const Mips16Libcall &RHS) const {
+    return std::strcmp(Name, RHS.Name) < 0;
+  }
+};
+
+struct Mips16IntrinsicHelperType{
+  const char* Name;
+  const char* Helper;
+
+  bool operator<(const Mips16IntrinsicHelperType &RHS) const {
+    return std::strcmp(Name, RHS.Name) < 0;
+  }
+  bool operator==(const Mips16IntrinsicHelperType &RHS) const {
+    return std::strcmp(Name, RHS.Name) == 0;
+  }
+};
 }
 
+// Libcalls for which no helper is generated. Sorted by name for binary search.
+static const Mips16Libcall HardFloatLibCalls[] = {
+  { RTLIB::ADD_F64, "__mips16_adddf3" },
+  { RTLIB::ADD_F32, "__mips16_addsf3" },
+  { RTLIB::DIV_F64, "__mips16_divdf3" },
+  { RTLIB::DIV_F32, "__mips16_divsf3" },
+  { RTLIB::OEQ_F64, "__mips16_eqdf2" },
+  { RTLIB::OEQ_F32, "__mips16_eqsf2" },
+  { RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2" },
+  { RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi" },
+  { RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi" },
+  { RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf" },
+  { RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf" },
+  { RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf" },
+  { RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf" },
+  { RTLIB::OGE_F64, "__mips16_gedf2" },
+  { RTLIB::OGE_F32, "__mips16_gesf2" },
+  { RTLIB::OGT_F64, "__mips16_gtdf2" },
+  { RTLIB::OGT_F32, "__mips16_gtsf2" },
+  { RTLIB::OLE_F64, "__mips16_ledf2" },
+  { RTLIB::OLE_F32, "__mips16_lesf2" },
+  { RTLIB::OLT_F64, "__mips16_ltdf2" },
+  { RTLIB::OLT_F32, "__mips16_ltsf2" },
+  { RTLIB::MUL_F64, "__mips16_muldf3" },
+  { RTLIB::MUL_F32, "__mips16_mulsf3" },
+  { RTLIB::UNE_F64, "__mips16_nedf2" },
+  { RTLIB::UNE_F32, "__mips16_nesf2" },
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_dc" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_df" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sc" }, // No associated libcall.
+  { RTLIB::UNKNOWN_LIBCALL, "__mips16_ret_sf" }, // No associated libcall.
+  { RTLIB::SUB_F64, "__mips16_subdf3" },
+  { RTLIB::SUB_F32, "__mips16_subsf3" },
+  { RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2" },
+  { RTLIB::UO_F64, "__mips16_unorddf2" },
+  { RTLIB::UO_F32, "__mips16_unordsf2" }
+};
+
+static const Mips16IntrinsicHelperType Mips16IntrinsicHelper[] = {
+  {"__fixunsdfsi", "__mips16_call_stub_2" },
+  {"ceil",  "__mips16_call_stub_df_2"},
+  {"ceilf", "__mips16_call_stub_sf_1"},
+  {"copysign",  "__mips16_call_stub_df_10"},
+  {"copysignf", "__mips16_call_stub_sf_5"},
+  {"cos",  "__mips16_call_stub_df_2"},
+  {"cosf", "__mips16_call_stub_sf_1"},
+  {"exp2",  "__mips16_call_stub_df_2"},
+  {"exp2f", "__mips16_call_stub_sf_1"},
+  {"floor",  "__mips16_call_stub_df_2"},
+  {"floorf", "__mips16_call_stub_sf_1"},
+  {"log2",  "__mips16_call_stub_df_2"},
+  {"log2f", "__mips16_call_stub_sf_1"},
+  {"nearbyint",  "__mips16_call_stub_df_2"},
+  {"nearbyintf", "__mips16_call_stub_sf_1"},
+  {"rint",  "__mips16_call_stub_df_2"},
+  {"rintf", "__mips16_call_stub_sf_1"},
+  {"sin",  "__mips16_call_stub_df_2"},
+  {"sinf", "__mips16_call_stub_sf_1"},
+  {"sqrt",  "__mips16_call_stub_df_2"},
+  {"sqrtf", "__mips16_call_stub_sf_1"},
+  {"trunc",  "__mips16_call_stub_df_2"},
+  {"truncf", "__mips16_call_stub_sf_1"},
+};
+
 Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
   : MipsTargetLowering(TM) {
   //
   // set up as if mips32 and then revert so we can test the mechanism
   // for switching
-  addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
+  addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
   addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
   computeRegisterProperties();
   clearRegisterClasses();
@@ -50,7 +128,7 @@ Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
   // Set up the register classes
   addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
 
-  if (Mips16HardFloat)
+  if (Subtarget->inMips16HardFloat())
     setMips16HardFloatLibCalls();
 
   setOperationAction(ISD::ATOMIC_FENCE,       MVT::Other, Expand);
@@ -67,6 +145,11 @@ Mips16TargetLowering::Mips16TargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::ATOMIC_LOAD_UMIN,   MVT::i32,   Expand);
   setOperationAction(ISD::ATOMIC_LOAD_UMAX,   MVT::i32,   Expand);
 
+  setOperationAction(ISD::ROTR, MVT::i32,  Expand);
+  setOperationAction(ISD::ROTR, MVT::i64,  Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+
   computeRegisterProperties();
 }
 
@@ -91,57 +174,57 @@ Mips16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case Mips::SelBneZ:
     return emitSel16(Mips::BnezRxImm16, MI, BB);
   case Mips::SelTBteqZCmpi:
-    return emitSeliT16(Mips::BteqzX16, Mips::CmpiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Bteqz16, Mips::CmpiRxImmX16, MI, BB);
   case Mips::SelTBteqZSlti:
-    return emitSeliT16(Mips::BteqzX16, Mips::SltiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiRxImmX16, MI, BB);
   case Mips::SelTBteqZSltiu:
-    return emitSeliT16(Mips::BteqzX16, Mips::SltiuRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Bteqz16, Mips::SltiuRxImmX16, MI, BB);
   case Mips::SelTBtneZCmpi:
-    return emitSeliT16(Mips::BtnezX16, Mips::CmpiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Btnez16, Mips::CmpiRxImmX16, MI, BB);
   case Mips::SelTBtneZSlti:
-    return emitSeliT16(Mips::BtnezX16, Mips::SltiRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Btnez16, Mips::SltiRxImmX16, MI, BB);
   case Mips::SelTBtneZSltiu:
-    return emitSeliT16(Mips::BtnezX16, Mips::SltiuRxImmX16, MI, BB);
+    return emitSeliT16(Mips::Btnez16, Mips::SltiuRxImmX16, MI, BB);
   case Mips::SelTBteqZCmp:
-    return emitSelT16(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+    return emitSelT16(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
   case Mips::SelTBteqZSlt:
-    return emitSelT16(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+    return emitSelT16(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
   case Mips::SelTBteqZSltu:
-    return emitSelT16(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+    return emitSelT16(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
   case Mips::SelTBtneZCmp:
-    return emitSelT16(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+    return emitSelT16(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
   case Mips::SelTBtneZSlt:
-    return emitSelT16(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+    return emitSelT16(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
   case Mips::SelTBtneZSltu:
-    return emitSelT16(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+    return emitSelT16(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
   case Mips::BteqzT8CmpX16:
-    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::CmpRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::CmpRxRy16, MI, BB);
   case Mips::BteqzT8SltX16:
-    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltRxRy16, MI, BB);
   case Mips::BteqzT8SltuX16:
     // TBD: figure out a way to get this or remove the instruction
     // altogether.
-    return emitFEXT_T8I816_ins(Mips::BteqzX16, Mips::SltuRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Bteqz16, Mips::SltuRxRy16, MI, BB);
   case Mips::BtnezT8CmpX16:
-    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::CmpRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::CmpRxRy16, MI, BB);
   case Mips::BtnezT8SltX16:
-    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltRxRy16, MI, BB);
   case Mips::BtnezT8SltuX16:
     // TBD: figure out a way to get this or remove the instruction
     // altogether.
-    return emitFEXT_T8I816_ins(Mips::BtnezX16, Mips::SltuRxRy16, MI, BB);
+    return emitFEXT_T8I816_ins(Mips::Btnez16, Mips::SltuRxRy16, MI, BB);
   case Mips::BteqzT8CmpiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+    Mips::Bteqz16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
   case Mips::BteqzT8SltiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+    Mips::Bteqz16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
   case Mips::BteqzT8SltiuX16: return emitFEXT_T8I8I16_ins(
-    Mips::BteqzX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+    Mips::Bteqz16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
   case Mips::BtnezT8CmpiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, MI, BB);
+    Mips::Btnez16, Mips::CmpiRxImm16, Mips::CmpiRxImmX16, false, MI, BB);
   case Mips::BtnezT8SltiX16: return emitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::SltiRxImm16, Mips::SltiRxImmX16, MI, BB);
+    Mips::Btnez16, Mips::SltiRxImm16, Mips::SltiRxImmX16, true, MI, BB);
   case Mips::BtnezT8SltiuX16: return emitFEXT_T8I8I16_ins(
-    Mips::BtnezX16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, MI, BB);
+    Mips::Btnez16, Mips::SltiuRxImm16, Mips::SltiuRxImmX16, false, MI, BB);
     break;
   case Mips::SltCCRxRy16:
     return emitFEXT_CCRX16_ins(Mips::SltRxRy16, MI, BB);
@@ -166,47 +249,17 @@ isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
   return false;
 }
 
-void Mips16TargetLowering::setMips16LibcallName
-  (RTLIB::Libcall L, const char *Name) {
-  setLibcallName(L, Name);
-  NoHelperNeeded.insert(Name);
-}
-
 void Mips16TargetLowering::setMips16HardFloatLibCalls() {
-  setMips16LibcallName(RTLIB::ADD_F32, "__mips16_addsf3");
-  setMips16LibcallName(RTLIB::ADD_F64, "__mips16_adddf3");
-  setMips16LibcallName(RTLIB::SUB_F32, "__mips16_subsf3");
-  setMips16LibcallName(RTLIB::SUB_F64, "__mips16_subdf3");
-  setMips16LibcallName(RTLIB::MUL_F32, "__mips16_mulsf3");
-  setMips16LibcallName(RTLIB::MUL_F64, "__mips16_muldf3");
-  setMips16LibcallName(RTLIB::DIV_F32, "__mips16_divsf3");
-  setMips16LibcallName(RTLIB::DIV_F64, "__mips16_divdf3");
-  setMips16LibcallName(RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2");
-  setMips16LibcallName(RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2");
-  setMips16LibcallName(RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi");
-  setMips16LibcallName(RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi");
-  setMips16LibcallName(RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf");
-  setMips16LibcallName(RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf");
-  setMips16LibcallName(RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf");
-  setMips16LibcallName(RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf");
-  setMips16LibcallName(RTLIB::OEQ_F32, "__mips16_eqsf2");
-  setMips16LibcallName(RTLIB::OEQ_F64, "__mips16_eqdf2");
-  setMips16LibcallName(RTLIB::UNE_F32, "__mips16_nesf2");
-  setMips16LibcallName(RTLIB::UNE_F64, "__mips16_nedf2");
-  setMips16LibcallName(RTLIB::OGE_F32, "__mips16_gesf2");
-  setMips16LibcallName(RTLIB::OGE_F64, "__mips16_gedf2");
-  setMips16LibcallName(RTLIB::OLT_F32, "__mips16_ltsf2");
-  setMips16LibcallName(RTLIB::OLT_F64, "__mips16_ltdf2");
-  setMips16LibcallName(RTLIB::OLE_F32, "__mips16_lesf2");
-  setMips16LibcallName(RTLIB::OLE_F64, "__mips16_ledf2");
-  setMips16LibcallName(RTLIB::OGT_F32, "__mips16_gtsf2");
-  setMips16LibcallName(RTLIB::OGT_F64, "__mips16_gtdf2");
-  setMips16LibcallName(RTLIB::UO_F32, "__mips16_unordsf2");
-  setMips16LibcallName(RTLIB::UO_F64, "__mips16_unorddf2");
-  setMips16LibcallName(RTLIB::O_F32, "__mips16_unordsf2");
-  setMips16LibcallName(RTLIB::O_F64, "__mips16_unorddf2");
-}
+  for (unsigned I = 0; I != array_lengthof(HardFloatLibCalls); ++I) {
+    assert((I == 0 || HardFloatLibCalls[I - 1] < HardFloatLibCalls[I]) &&
+           "Array not sorted!");
+    if (HardFloatLibCalls[I].Libcall != RTLIB::UNKNOWN_LIBCALL)
+      setLibcallName(HardFloatLibCalls[I].Libcall, HardFloatLibCalls[I].Name);
+  }
 
+  setLibcallName(RTLIB::O_F64, "__mips16_unorddf2");
+  setLibcallName(RTLIB::O_F32, "__mips16_unordsf2");
+}
 
 //
 // The Mips16 hard float is a crazy quilt inherited from gcc. I have a much
@@ -371,10 +424,13 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
             bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
             CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const {
   SelectionDAG &DAG = CLI.DAG;
+  MachineFunction &MF = DAG.getMachineFunction();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
   const char* Mips16HelperFunction = 0;
   bool NeedMips16Helper = false;
 
-  if (getTargetMachine().Options.UseSoftFloat && Mips16HardFloat) {
+  if (getTargetMachine().Options.UseSoftFloat &&
+      Subtarget->inMips16HardFloat()) {
     //
     // currently we don't have symbols tagged with the mips16 or mips32
     // qualifier so we will assume that we don't know what kind it is.
@@ -382,9 +438,34 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
     //
     bool LookupHelper = true;
     if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
-      if (NoHelperNeeded.find(S->getSymbol()) != NoHelperNeeded.end()) {
+      Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL, S->getSymbol() };
+
+      if (std::binary_search(HardFloatLibCalls, array_endof(HardFloatLibCalls),
+                             Find))
         LookupHelper = false;
+      else {
+        Mips16IntrinsicHelperType IntrinsicFind = {S->getSymbol(), ""};
+        // one more look at list of intrinsics
+        if (std::binary_search(Mips16IntrinsicHelper,
+            array_endof(Mips16IntrinsicHelper),
+                                     IntrinsicFind)) {
+          const Mips16IntrinsicHelperType *h =(std::find(Mips16IntrinsicHelper,
+              array_endof(Mips16IntrinsicHelper),
+                                       IntrinsicFind));
+          Mips16HelperFunction = h->Helper;
+          NeedMips16Helper = true;
+          LookupHelper = false;
+        }
+
       }
+    } else if (GlobalAddressSDNode *G =
+                   dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+      Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL,
+                             G->getGlobal()->getName().data() };
+
+      if (std::binary_search(HardFloatLibCalls, array_endof(HardFloatLibCalls),
+                             Find))
+        LookupHelper = false;
     }
     if (LookupHelper) Mips16HelperFunction =
       getMips16HelperFunction(CLI.RetTy, CLI.Args, NeedMips16Helper);
@@ -400,7 +481,10 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
     if (NeedMips16Helper) {
       RegsToPass.push_front(std::make_pair(V0Reg, Callee));
       JumpTarget = DAG.getExternalSymbol(Mips16HelperFunction, getPointerTy());
-      JumpTarget = getAddrGlobal(JumpTarget, DAG, MipsII::MO_GOT);
+      ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
+      JumpTarget = getAddrGlobal(S, JumpTarget.getValueType(), DAG,
+                                 MipsII::MO_GOT, Chain,
+                                 FuncInfo->callPtrInfo(S->getSymbol()));
     } else
       RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
   }
@@ -621,7 +705,7 @@ MachineBasicBlock
 }
 
 MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
-  unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+  unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
   MachineInstr *MI,  MachineBasicBlock *BB) const {
   if (DontExpandCondPseudos16)
     return BB;
@@ -632,7 +716,8 @@ MachineBasicBlock *Mips16TargetLowering::emitFEXT_T8I8I16_ins(
   unsigned CmpOpc;
   if (isUInt<8>(imm))
     CmpOpc = CmpiOpc;
-  else if (isUInt<16>(imm))
+  else if ((!ImmSigned && isUInt<16>(imm)) ||
+           (ImmSigned && isInt<16>(imm)))
     CmpOpc = CmpiXOpc;
   else
     llvm_unreachable("immediate field not usable");
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index b23e2a1..33b953f 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -32,8 +32,6 @@ namespace llvm {
                                       unsigned NextStackOffset,
                                       const MipsFunctionInfo& FI) const;
 
-    void setMips16LibcallName(RTLIB::Libcall, const char *Name);
-
     void setMips16HardFloatLibCalls();
 
     unsigned int
@@ -64,7 +62,7 @@ namespace llvm {
                                            MachineBasicBlock *BB) const;
 
     MachineBasicBlock *emitFEXT_T8I8I16_ins(
-      unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc,
+      unsigned BtOpc, unsigned CmpiOpc, unsigned CmpiXOpc, bool ImmSigned,
       MachineInstr *MI,  MachineBasicBlock *BB) const;
 
     MachineBasicBlock *emitFEXT_CCRX16_ins(
diff --git a/lib/Target/Mips/Mips16InstrFormats.td b/lib/Target/Mips/Mips16InstrFormats.td
index 4ff62ef..da3a1f1 100644
--- a/lib/Target/Mips/Mips16InstrFormats.td
+++ b/lib/Target/Mips/Mips16InstrFormats.td
@@ -61,7 +61,7 @@ class MipsInst16<dag outs, dag ins, string asmstr, list<dag> pattern,
 
   // Top 5 bits are the 'opcode' field
   let Inst{15-11} = Opcode;
-  
+
   let Size=2;
   field bits<16> SoftFail = 0;
 }
@@ -74,7 +74,7 @@ class MipsInst16_32<dag outs, dag ins, string asmstr, list<dag> pattern,
   MipsInst16_Base<outs, ins, asmstr, pattern, itin>
 {
   field bits<32> Inst;
-  
+
   let Size=4;
   field bits<32> SoftFail = 0;
 }
@@ -148,6 +148,20 @@ class FRR16<bits<5> _funct, dag outs, dag ins, string asmstr,
   let Inst{4-0}   = funct;
 }
 
+class FRRBreak16<dag outs, dag ins, string asmstr,
+                 list<dag> pattern, InstrItinClass itin>:
+  MipsInst16<outs, ins, asmstr, pattern, itin>
+{
+  bits<6>  Code;
+  bits<5>  funct;
+
+  let Opcode = 0b11101;
+  let funct  = 0b00101;
+
+  let Inst{10-5} = Code;
+  let Inst{4-0}   = funct;
+}
+
 //
 // For conversion functions.
 //
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 17dd2c0..000ea28 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -10,7 +10,6 @@
 // This file contains the Mips16 implementation of the TargetInstrInfo class.
 //
 //===----------------------------------------------------------------------===//
-
 #include "Mips16InstrInfo.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MipsMachineFunction.h"
@@ -20,10 +19,12 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
+#include <cctype>
 
 using namespace llvm;
 
@@ -36,8 +37,8 @@ static cl::opt<bool> NeverUseSaveRestore(
 
 
 Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
-  : MipsInstrInfo(tm, Mips::BimmX16),
-    RI(*tm.getSubtargetImpl(), *this) {}
+  : MipsInstrInfo(tm, Mips::Bimm16),
+    RI(*tm.getSubtargetImpl()) {}
 
 const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
   return RI;
@@ -72,16 +73,16 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   unsigned Opc = 0;
 
   if (Mips::CPU16RegsRegClass.contains(DestReg) &&
-      Mips::CPURegsRegClass.contains(SrcReg))
+      Mips::GPR32RegClass.contains(SrcReg))
     Opc = Mips::MoveR3216;
-  else if (Mips::CPURegsRegClass.contains(DestReg) &&
+  else if (Mips::GPR32RegClass.contains(DestReg) &&
            Mips::CPU16RegsRegClass.contains(SrcReg))
     Opc = Mips::Move32R16;
-  else if ((SrcReg == Mips::HI) &&
+  else if ((SrcReg == Mips::HI0) &&
            (Mips::CPU16RegsRegClass.contains(DestReg)))
     Opc = Mips::Mfhi16, SrcReg = 0;
 
-  else if ((SrcReg == Mips::LO) &&
+  else if ((SrcReg == Mips::LO0) &&
            (Mips::CPU16RegsRegClass.contains(DestReg)))
     Opc = Mips::Mflo16, SrcReg = 0;
 
@@ -109,8 +110,9 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
     Opc = Mips::SwRxSpImmX16;
   assert(Opc && "Register class not handled!");
-  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
-    .addFrameIndex(FI).addImm(Offset).addMemOperand(MMO);
+  BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill)).
+      addFrameIndex(FI).addImm(Offset)
+      .addMemOperand(MMO);
 }
 
 void Mips16InstrInfo::
@@ -145,18 +147,22 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
 
 /// GetOppositeBranchOpc - Return the inverse of the specified
 /// opcode, e.g. turning BEQ to BNE.
-unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
   switch (Opc) {
   default:  llvm_unreachable("Illegal opcode!");
   case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
   case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+  case Mips::BeqzRxImm16: return Mips::BnezRxImm16;
+  case Mips::BnezRxImm16: return Mips::BeqzRxImm16;
   case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
   case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
   case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+  case Mips::Btnez16: return Mips::Bteqz16;
   case Mips::BtnezX16: return Mips::BteqzX16;
   case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
   case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
   case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+  case Mips::Bteqz16: return Mips::Btnez16;
   case Mips::BteqzX16: return Mips::BtnezX16;
   case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
   case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
@@ -323,48 +329,88 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg,
   //
   RegScavenger rs;
   int32_t lo = Imm & 0xFFFF;
-  int32_t hi = ((Imm >> 16) + (lo >> 15)) & 0xFFFF;
   NewImm = lo;
-  unsigned Reg =0;
-  unsigned SpReg = 0;
+  int Reg =0;
+  int SpReg = 0;
+
   rs.enterBasicBlock(&MBB);
   rs.forward(II);
   //
+  // We need to know which registers can be used, in the case where there
+  // are not enough free registers. We exclude all registers that
+  // are used in the instruction that we are helping.
+  //  // Consider all allocatable registers in the register class initially
+  BitVector Candidates =
+      RI.getAllocatableSet
+      (*II->getParent()->getParent(), &Mips::CPU16RegsRegClass);
+  // Exclude all the registers being used by the instruction.
+  for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = II->getOperand(i);
+    if (MO.isReg() && MO.getReg() != 0 && !MO.isDef() &&
+        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      Candidates.reset(MO.getReg());
+  }
+  //
+  // If the same register was used and defined in an instruction, then
+  // it will not be in the list of candidates.
+  //
+  // we need to analyze the instruction that we are helping.
+  // we need to know if it defines register x but register x is not
+  // present as an operand of the instruction. this tells
+  // whether the register is live before the instruction. if it's not
+  // then we don't need to save it in case there are no free registers.
+  //
+  int DefReg = 0;
+  for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = II->getOperand(i);
+    if (MO.isReg() && MO.isDef()) {
+      DefReg = MO.getReg();
+      break;
+    }
+  }
+  //
+  BitVector Available = rs.getRegsAvailable(&Mips::CPU16RegsRegClass);
+
+  Available &= Candidates;
+  //
   // we use T0 for the first register, if we need to save something away.
   // we use T1 for the second register, if we need to save something away.
   //
   unsigned FirstRegSaved =0, SecondRegSaved=0;
   unsigned FirstRegSavedTo = 0, SecondRegSavedTo = 0;
 
-  Reg = rs.FindUnusedReg(&Mips::CPU16RegsRegClass);
-  if (Reg == 0) {
-    FirstRegSaved = Reg = Mips::V0;
-    FirstRegSavedTo = Mips::T0;
-    copyPhysReg(MBB, II, DL, FirstRegSavedTo, FirstRegSaved, true);
+
+  Reg = Available.find_first();
+
+  if (Reg == -1) {
+    Reg = Candidates.find_first();
+    Candidates.reset(Reg);
+    if (DefReg != Reg) {
+      FirstRegSaved = Reg;
+      FirstRegSavedTo = Mips::T0;
+      copyPhysReg(MBB, II, DL, FirstRegSavedTo, FirstRegSaved, true);
+    }
   }
   else
-    rs.setUsed(Reg);
-  BuildMI(MBB, II, DL, get(Mips::LiRxImmX16), Reg).addImm(hi);
-  BuildMI(MBB, II, DL, get(Mips::SllX16), Reg).addReg(Reg).
-    addImm(16);
+    Available.reset(Reg);
+  BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm);
+  NewImm = 0;
   if (FrameReg == Mips::SP) {
-    SpReg = rs.FindUnusedReg(&Mips::CPU16RegsRegClass);
-    if (SpReg == 0) {
-      if (Reg != Mips::V1) {
-        SecondRegSaved = SpReg = Mips::V1;
+    SpReg = Available.find_first();
+    if (SpReg == -1) {
+      SpReg = Candidates.find_first();
+      // Candidates.reset(SpReg); // not really needed
+      if (DefReg!= SpReg) {
+        SecondRegSaved = SpReg;
         SecondRegSavedTo = Mips::T1;
       }
-      else {
-        SecondRegSaved = SpReg = Mips::V0;
-        SecondRegSavedTo = Mips::T0;
-      }
-      copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true);
+      if (SecondRegSaved)
+        copyPhysReg(MBB, II, DL, SecondRegSavedTo, SecondRegSaved, true);
     }
-    else
-      rs.setUsed(SpReg);
-
+   else
+     Available.reset(SpReg);
     copyPhysReg(MBB, II, DL, SpReg, Mips::SP, false);
-    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(SpReg)
+    BuildMI(MBB, II, DL, get(Mips::  AdduRxRyRz16), Reg).addReg(SpReg, RegState::Kill)
       .addReg(Reg);
   }
   else
@@ -380,8 +426,27 @@ Mips16InstrInfo::loadImmediate(unsigned FrameReg,
   return Reg;
 }
 
-unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned
+Mips16InstrInfo::basicLoadImmediate(
+  unsigned FrameReg,
+  int64_t Imm, MachineBasicBlock &MBB,
+  MachineBasicBlock::iterator II, DebugLoc DL,
+  unsigned &NewImm) const {
+  const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
+  MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+  unsigned Reg = RegInfo.createVirtualRegister(RC);
+  BuildMI(MBB, II, DL, get(Mips::LwConstant32), Reg).addImm(Imm);
+  NewImm = 0;
+  return Reg;
+}
+
+unsigned Mips16InstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
   return (Opc == Mips::BeqzRxImmX16   || Opc == Mips::BimmX16  ||
+          Opc == Mips::Bimm16  ||
+          Opc == Mips::Bteqz16        || Opc == Mips::Btnez16 ||
+          Opc == Mips::BeqzRxImm16    || Opc == Mips::BnezRxImm16   ||
           Opc == Mips::BnezRxImmX16   || Opc == Mips::BteqzX16 ||
           Opc == Mips::BteqzT8CmpX16  || Opc == Mips::BteqzT8CmpiX16 ||
           Opc == Mips::BteqzT8SltX16  || Opc == Mips::BteqzT8SltuX16  ||
@@ -415,3 +480,69 @@ void Mips16InstrInfo::BuildAddiuSpImm
 const MipsInstrInfo *llvm::createMips16InstrInfo(MipsTargetMachine &TM) {
   return new Mips16InstrInfo(TM);
 }
+
+bool Mips16InstrInfo::validImmediate(unsigned Opcode, unsigned Reg,
+                                     int64_t Amount) {
+  switch (Opcode) {
+  case Mips::LbRxRyOffMemX16:
+  case Mips::LbuRxRyOffMemX16:
+  case Mips::LhRxRyOffMemX16:
+  case Mips::LhuRxRyOffMemX16:
+  case Mips::SbRxRyOffMemX16:
+  case Mips::ShRxRyOffMemX16:
+  case Mips::LwRxRyOffMemX16:
+  case Mips::SwRxRyOffMemX16:
+  case Mips::SwRxSpImmX16:
+  case Mips::LwRxSpImmX16:
+    return isInt<16>(Amount);
+  case Mips::AddiuRxRyOffMemX16:
+    if ((Reg == Mips::PC) || (Reg == Mips::SP))
+      return isInt<16>(Amount);
+    return isInt<15>(Amount);
+  }
+  llvm_unreachable("unexpected Opcode in validImmediate");
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+/// We implement the special case of the .space directive taking only an
+/// integer argument, which is the size in bytes. This is used for creating
+/// inline code spacing for testing purposes using inline assembly.
+///
+unsigned Mips16InstrInfo::getInlineAsmLength(const char *Str,
+                                             const MCAsmInfo &MAI) const {
+
+
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+                                strlen(MAI.getSeparatorString())) == 0)
+      atInsnStart = true;
+    if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+      if (strncmp(Str, ".space", 6)==0) {
+        char *EStr; int Sz;
+        Sz = strtol(Str+6, &EStr, 10);
+        while (isspace(*EStr)) ++EStr;
+        if (*EStr=='\0') {
+          DEBUG(dbgs() << "parsed .space " << Sz << '\n');
+          return Sz;
+        }
+      }
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+                               strlen(MAI.getCommentString())) == 0)
+      atInsnStart = false;
+  }
+
+  return Length;
+}
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index a77a904..d9a594b 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -64,11 +64,11 @@ public:
 
   virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
 
-  virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+  virtual unsigned getOppositeBranchOpc(unsigned Opc) const;
 
   // Adjust SP by FrameSize bytes. Save RA, S0, S1
   void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
-                      MachineBasicBlock::iterator I) const;
+                 MachineBasicBlock::iterator I) const;
 
   // Adjust SP by FrameSize bytes. Restore RA, S0, S1
   void restoreFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
@@ -88,6 +88,13 @@ public:
                          MachineBasicBlock::iterator II, DebugLoc DL,
                          unsigned &NewImm) const;
 
+  unsigned basicLoadImmediate(unsigned FrameReg,
+                              int64_t Imm, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator II, DebugLoc DL,
+                              unsigned &NewImm) const;
+
+  static bool validImmediate(unsigned Opcode, unsigned Reg, int64_t Amount);
+
   static bool validSpImm8(int offset) {
     return ((offset & 7) == 0) && isInt<11>(offset);
   }
@@ -101,8 +108,10 @@ public:
   void BuildAddiuSpImm
     (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
 
+  unsigned getInlineAsmLength(const char *Str,
+                              const MCAsmInfo &MAI) const;
 private:
-  virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+  virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
 
   void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    unsigned Opc) const;
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index aa51aaf..7441c78 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -21,17 +21,27 @@ def addr16 :
 // Address operand
 def mem16 : Operand<i32> {
   let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops CPU16Regs, simm16, CPU16Regs);
+  let MIOperandInfo = (ops CPU16Regs, simm16, CPU16RegsPlusSP);
   let EncoderMethod = "getMemEncoding";
 }
 
 def mem16_ea : Operand<i32> {
   let PrintMethod = "printMemOperandEA";
-  let MIOperandInfo = (ops CPU16Regs, simm16);
+  let MIOperandInfo = (ops CPU16RegsPlusSP, simm16);
   let EncoderMethod = "getMemEncoding";
 }
 
 //
+// I-type instruction format
+//
+// this is only used by bimm. the actual assembly value is a 12 bit signed
+// number
+//
+class FI16_ins<bits<5> op, string asmstr, InstrItinClass itin>:
+  FI16<op, (outs), (ins brtarget:$imm16),
+            !strconcat(asmstr, "\t$imm16 # 16 bit inst"), [], itin>;
+
+//
 //
 // I8 instruction format
 //
@@ -41,7 +51,10 @@ class FI816_ins_base<bits<3> _func, string asmstr,
   FI816<_func, (outs), (ins simm16:$imm), !strconcat(asmstr, asmstr2),
         [], itin>;
 
-
+class FI816_ins<bits<3> _func, string asmstr,
+                InstrItinClass itin>:
+  FI816_ins_base<_func, asmstr, "\t$imm  # 16 bit inst", itin>;
+ 
 class FI816_SP_ins<bits<3> _func, string asmstr,
                    InstrItinClass itin>:
   FI816_ins_base<_func, asmstr, "\t$$sp, $imm # 16 bit inst", itin>;
@@ -60,6 +73,11 @@ class FRI16_ins<bits<5> op, string asmstr,
                 InstrItinClass itin>:
   FRI16_ins_base<op, asmstr, "\t$rx, $imm \t# 16 bit inst", itin>;
 
+class FRI16_TCP_ins<bits<5> _op, string asmstr,
+                    InstrItinClass itin>:
+  FRI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm\t# 16 bit inst"), [], itin>;
+            
 class FRI16R_ins_base<bits<5> op, string asmstr, string asmstr2,
                      InstrItinClass itin>:
   FRI16<op, (outs), (ins CPU16Regs:$rx, simm16:$imm),
@@ -172,6 +190,11 @@ class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
   FEXT_RI16<_op, (outs), (ins  CPU16Regs:$rx, brtarget:$imm),
             !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
 
+class FEXT_RI16_TCP_ins<bits<5> _op, string asmstr,
+                        InstrItinClass itin>:
+  FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins pcrel16:$imm, i32imm:$size),
+            !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
+
 class FEXT_2RI16_ins<bits<5> _op, string asmstr,
                      InstrItinClass itin>:
   FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_, simm16:$imm),
@@ -187,6 +210,11 @@ class FEXT_RI16_SP_explicit_ins<bits<5> _op, string asmstr,
   FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPUSPReg:$ry, simm16:$imm),
             !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
 
+class FEXT_RI16_SP_Store_explicit_ins<bits<5> _op, string asmstr,
+                                InstrItinClass itin>:
+  FEXT_RI16<_op, (outs), (ins  CPU16Regs:$rx, CPUSPReg:$ry, simm16:$imm),
+            !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
+
 //
 // EXT-RRI instruction format
 //
@@ -215,7 +243,7 @@ class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
 // EXT-SHIFT instruction format
 //
 class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
-  FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa),
+  FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, uimm5:$sa),
                !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
 
 //
@@ -248,7 +276,7 @@ class FEXT_T8I8I16_ins<string asmstr, string asmstr2>:
 // I8_MOVR32 instruction format (used only by the MOVR32 instructio
 //
 class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
-       FI8_MOVR3216<(outs CPU16Regs:$rz), (ins CPURegs:$r32),
+       FI8_MOVR3216<(outs CPU16Regs:$rz), (ins GPR32:$r32),
        !strconcat(asmstr,  "\t$rz, $r32"), [], itin>;
 
 //
@@ -256,7 +284,7 @@ class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
 //
 
 class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
-  FI8_MOV32R16<(outs CPURegs:$r32), (ins CPU16Regs:$rz),
+  FI8_MOV32R16<(outs GPR32:$r32), (ins CPU16Regs:$rz),
                !strconcat(asmstr,  "\t$r32, $rz"), [], itin>;
 
 //
@@ -287,6 +315,11 @@ class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
         !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
 }
 
+class FRRBreakNull16_ins<string asmstr, InstrItinClass itin> :
+  FRRBreak16<(outs), (ins), asmstr, [], itin> {
+  let Code=0;
+}
+
 class FRR16R_ins<bits<5> f, string asmstr, InstrItinClass itin> :
   FRR16<f, (outs), (ins  CPU16Regs:$rx, CPU16Regs:$ry),
         !strconcat(asmstr, "\t$rx, $ry"), [], itin> {
@@ -333,6 +366,14 @@ class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
   FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
               !strconcat(asmstr, "\t $rx"), [], itin> ;
 
+class FRR_SF16_ins
+  <bits<5> _funct, bits<3> _subfunc,
+    string asmstr, InstrItinClass itin>:
+  FRR_SF16<_funct, _subfunc, (outs CPU16Regs:$rx), (ins CPU16Regs:$rx_),
+           !strconcat(asmstr, "\t $rx"),
+           [], itin> {
+  let Constraints = "$rx_ = $rx";
+  }
 //
 // RRR-type instruction format
 //
@@ -437,7 +478,7 @@ def Constant32:
   MipsPseudo16<(outs), (ins imm32:$imm), "\t.word $imm", []>;
 
 def LwConstant32:
-  MipsPseudo16<(outs), (ins CPU16Regs:$rx, imm32:$imm),
+  MipsPseudo16<(outs CPU16Regs:$rx), (ins imm32:$imm, imm32:$constid),
     "lw\t$rx, 1f\n\tb\t2f\n\t.align\t2\n1: \t.word\t$imm\n2:", []>;
 
 
@@ -549,6 +590,14 @@ def BeqzRxImm16: FRI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
 //
 def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
 
+//
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch (Extended)
+// To do an unconditional PC-relative branch.
+//
+
+def Bimm16: FI16_ins<0b00010, "b", IIAlu>, branch16;
+
 // Format: B offset MIPS16e
 // Purpose: Unconditional Branch
 // To do an unconditional PC-relative branch.
@@ -569,11 +618,22 @@ def BnezRxImm16: FRI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
 //
 def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
 
+
+//
+//Format: BREAK immediate
+// Purpose: Breakpoint
+// To cause a Breakpoint exception.
+
+def Break16: FRRBreakNull16_ins<"break 0", NoItinerary>; 
 //
 // Format: BTEQZ offset MIPS16e
 // Purpose: Branch on T Equal to Zero (Extended)
 // To test special register T then do a PC-relative conditional branch.
 //
+def Bteqz16: FI816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
 def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16 {
   let Uses = [T8];
 }
@@ -597,6 +657,11 @@ def BteqzT8SltiuX16: FEXT_T8I8I16_ins<"bteqz", "sltiu">,
 // Purpose: Branch on T Not Equal to Zero (Extended)
 // To test special register T then do a PC-relative conditional branch.
 //
+
+def Btnez16: FI816_ins<0b001, "btnez", IIAlu>, cbranch16 {
+  let Uses = [T8];
+}
+
 def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16 {
   let Uses = [T8];
 }
@@ -648,7 +713,7 @@ def CmpiRxImmX16: FEXT_RI16R_ins<0b01110, "cmpi", IIAlu> {
 // To divide 32-bit signed integers.
 //
 def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -657,7 +722,7 @@ def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
 // To divide 32-bit unsigned integers.
 //
 def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 //
 // Format: JAL target MIPS16e
@@ -667,10 +732,8 @@ def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
 //
 
 def Jal16 : FJAL16_ins<0b0, "jal", IIAlu> {
-  let isBranch = 1;
   let hasDelaySlot = 0;  // not true, but we add the nop for now
-  let isTerminator=1;
-  let isBarrier=1;
+  let isCall=1;
 }
 
 //
@@ -753,6 +816,10 @@ def LiRxImm16: FRI16_ins<0b01101, "li", IIAlu>;
 //
 def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
 
+def LiRxImmAlignX16: FEXT_RI16_ins<0b01101, ".align 2\n\tli", IIAlu> {
+  let isCodeGenOnly = 1;
+}
+
 //
 // Format: LW ry, offset(rx) MIPS16e
 // Purpose: Load Word (Extended)
@@ -766,10 +833,13 @@ def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad{
 // Purpose: Load Word (SP-Relative, Extended)
 // To load an SP-relative word from memory as a signed value.
 //
-def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10110, "lw", IILoad>, MayLoad{
+def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10010, "lw", IILoad>, MayLoad{
   let Uses = [SP];
 }
 
+def LwRxPcTcp16: FRI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
+
+def LwRxPcTcpX16: FEXT_RI16_TCP_ins<0b10110, "lw", IILoad>, MayLoad;
 //
 // Format: MOVE r32, rz MIPS16e
 // Purpose: Move
@@ -790,7 +860,7 @@ def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
 // To copy the special purpose HI register to a GPR.
 //
 def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
-  let Uses = [HI];
+  let Uses = [HI0];
   let neverHasSideEffects = 1;
 }
 
@@ -800,7 +870,7 @@ def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
 // To copy the special purpose LO register to a GPR.
 //
 def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
-  let Uses = [LO];
+  let Uses = [LO0];
   let neverHasSideEffects = 1;
 }
 
@@ -810,13 +880,13 @@ def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
 def MultRxRy16:  FMULT16_ins<"mult",  IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -827,7 +897,7 @@ def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
 def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -838,7 +908,7 @@ def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
 def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
   let isCommutable = 1;
   let neverHasSideEffects = 1;
-  let Defs = [HI, LO];
+  let Defs = [HI0, LO0];
 }
 
 //
@@ -878,9 +948,9 @@ def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
 let ra=1, s=0,s0=1,s1=1 in
 def RestoreRaF16:
   FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
-             "restore\t$$ra,  $$s0, $$s1, $frame_size", [], IILoad >, MayLoad {
+             "restore\t$$ra,  $$s0, $$s1, $$s2, $frame_size", [], IILoad >, MayLoad {
   let isCodeGenOnly = 1;
-  let Defs = [S0, S1, RA, SP];
+  let Defs = [S0, S1, S2, RA, SP];
   let Uses = [SP];
 }
 
@@ -906,9 +976,9 @@ def RestoreIncSpF16:
 let ra=1, s=1,s0=1,s1=1 in
 def SaveRaF16:
   FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
-             "save\t$$ra, $$s0, $$s1, $frame_size", [], IIStore >, MayStore {
+             "save\t$$ra, $$s0, $$s1, $$s2, $frame_size", [], IIStore >, MayStore {
   let isCodeGenOnly = 1;
-  let Uses = [RA, SP, S0, S1];
+  let Uses = [RA, SP, S0, S1, S2];
   let Defs = [SP];
 }
 
@@ -933,6 +1003,22 @@ def SbRxRyOffMemX16:
   FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIStore>, MayStore;
 
 //
+// Format: SEB rx MIPS16e
+// Purpose: Sign-Extend Byte
+// Sign-extend least significant byte in register rx.
+//
+def SebRx16
+  : FRR_SF16_ins<0b10001, 0b100, "seb", IIAlu>;
+
+//
+// Format: SEH rx MIPS16e
+// Purpose: Sign-Extend Halfword
+// Sign-extend least significant word in register rx.
+//
+def SehRx16
+  : FRR_SF16_ins<0b10001, 0b101, "seh", IIAlu>;
+
+//
 // The Sel(T) instructions are pseudos
 // T means that they use T8 implicitly.
 //
@@ -1057,7 +1143,7 @@ def ShRxRyOffMemX16:
 //
 // Format: SLL rx, ry, sa MIPS16e
 // Purpose: Shift Word Left Logical (Extended)
-// To execute a left-shift of a word by a fixed number of bits—0 to 31 bits.
+// To execute a left-shift of a word by a fixed number of bits-0 to 31 bits.
 //
 def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
 
@@ -1153,7 +1239,7 @@ def SravRxRy16: FRxRxRy16_ins<0b00111, "srav", IIAlu>;
 // Format: SRA rx, ry, sa MIPS16e
 // Purpose: Shift Word Right Arithmetic (Extended)
 // To execute an arithmetic right-shift of a word by a fixed
-// number of bits—1 to 8 bits.
+// number of bits-1 to 8 bits.
 //
 def SraX16: FEXT_SHIFT16_ins<0b11, "sra", IIAlu>;
 
@@ -1171,7 +1257,7 @@ def SrlvRxRy16: FRxRxRy16_ins<0b00110, "srlv", IIAlu>;
 // Format: SRL rx, ry, sa MIPS16e
 // Purpose: Shift Word Right Logical (Extended)
 // To execute a logical right-shift of a word by a fixed
-// number of bits—1 to 31 bits.
+// number of bits-1 to 31 bits.
 //
 def SrlX16: FEXT_SHIFT16_ins<0b10, "srl", IIAlu>;
 
@@ -1195,7 +1281,8 @@ def SwRxRyOffMemX16:
 // Purpose: Store Word rx (SP-Relative)
 // To store an SP-relative word to memory.
 //
-def SwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b11010, "sw", IIStore>, MayStore;
+def SwRxSpImmX16: FEXT_RI16_SP_Store_explicit_ins
+  <0b11010, "sw", IIStore>, MayStore;
 
 //
 //
@@ -1311,9 +1398,7 @@ def: Mips16Pat<(i32  addr16:$addr),
 
 
 // Large (>16 bit) immediate loads
-def : Mips16Pat<(i32 imm:$imm),
-                (OrRxRxRy16 (SllX16 (LiRxImmX16 (HI16 imm:$imm)), 16),
-                (LiRxImmX16 (LO16 imm:$imm)))>;
+def : Mips16Pat<(i32 imm:$imm), (LwConstant32 imm:$imm, -1)>;
 
 // Carry MipsPatterns
 def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
@@ -1354,7 +1439,7 @@ def: Mips16Pat
 
 def: Mips16Pat
   <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
-   (BeqzRxImmX16 CPU16Regs:$rx, bb:$targ16)
+   (BeqzRxImm16 CPU16Regs:$rx, bb:$targ16)
   >;
 
 //
@@ -1416,7 +1501,7 @@ def: Mips16Pat
 
 def: Mips16Pat
   <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
-   (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
   >;
 
 //
@@ -1424,7 +1509,7 @@ def: Mips16Pat
 //
 def: Mips16Pat
   <(brcond CPU16Regs:$rx, bb:$targ16),
-   (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+   (BnezRxImm16 CPU16Regs:$rx, bb:$targ16)
   >;
 
 //
@@ -1454,7 +1539,7 @@ def: Mips16Pat
 //   (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry,  bb:$imm16)
 //  >;
 
-def: UncondBranch16_pat<br, BimmX16>;
+def: UncondBranch16_pat<br, Bimm16>;
 
 // Small immediates
 def: Mips16Pat<(i32 immSExt16:$in),
@@ -1768,7 +1853,8 @@ def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
                (AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
 
 // hi/lo relocs
-
+def : Mips16Pat<(MipsHi tblockaddress:$in),
+                (SllX16 (LiRxImmX16 tblockaddress:$in), 16)>;
 def : Mips16Pat<(MipsHi tglobaladdr:$in),
                 (SllX16 (LiRxImmX16 tglobaladdr:$in), 16)>;
 def : Mips16Pat<(MipsHi tjumptable:$in),
@@ -1776,6 +1862,8 @@ def : Mips16Pat<(MipsHi tjumptable:$in),
 def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
                 (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
 
+def : Mips16Pat<(MipsLo tblockaddress:$in), (LiRxImmX16 tblockaddress:$in)>;
+
 // wrapper_pic
 class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
   Mips16Pat<(MipsWrapper RC:$gp, node:$in),
@@ -1789,3 +1877,33 @@ def : Mips16Pat<(i32 (extloadi8   addr16:$src)),
                 (LbuRxRyOffMemX16  addr16:$src)>;
 def : Mips16Pat<(i32 (extloadi16  addr16:$src)),
                 (LhuRxRyOffMemX16  addr16:$src)>;
+
+def: Mips16Pat<(trap), (Break16)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i8),
+                (SebRx16 CPU16Regs:$val)>;
+
+def : Mips16Pat<(sext_inreg CPU16Regs:$val, i16),
+                (SehRx16 CPU16Regs:$val)>;
+
+def GotPrologue16:   
+  MipsPseudo16<
+    (outs CPU16Regs:$rh, CPU16Regs:$rl),
+    (ins simm16:$immHi, simm16:$immLo),
+    ".align 2\n\tli\t$rh, $immHi\n\taddiu\t$rl, $$pc, $immLo\n ",[]> ;
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  // let PrintMethod = "printCPInstOperand";
+}
+
+// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+// the function.  The first operand is the ID# for this instruction, the second
+// is the index into the MachineConstantPool that this is, the third is the
+// size in bytes of this constant pool entry.
+//
+let neverHasSideEffects = 1, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+MipsPseudo16<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                      i32imm:$size), "foo", []>;
+
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 7ad18f2..9d0f2c9 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -41,17 +41,16 @@
 
 using namespace llvm;
 
-Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST,
-    const Mips16InstrInfo &I)
-  : MipsRegisterInfo(ST), TII(I) {}
+Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST)
+  : MipsRegisterInfo(ST) {}
 
 bool Mips16RegisterInfo::requiresRegisterScavenging
   (const MachineFunction &MF) const {
-  return true;
+  return false;
 }
 bool Mips16RegisterInfo::requiresFrameIndexScavenging
   (const MachineFunction &MF) const {
-  return true;
+  return false;
 }
 
 bool Mips16RegisterInfo::useFPForScavengingIndex
@@ -66,6 +65,7 @@ bool Mips16RegisterInfo::saveScavengerRegister
    const TargetRegisterClass *RC,
    unsigned Reg) const {
   DebugLoc DL;
+  const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
   TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
   TII.copyPhysReg(MBB, UseMI, DL, Reg, Mips::T0, true);
   return true;
@@ -134,11 +134,14 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
 
   DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
 
-  if (!MI.isDebugValue() && ( ((FrameReg != Mips::SP) && !isInt<16>(Offset)) ||
-      ((FrameReg == Mips::SP) && !isInt<15>(Offset)) )) {
+  if (!MI.isDebugValue() &&
+      !Mips16InstrInfo::validImmediate(MI.getOpcode(), FrameReg, Offset)) {
     MachineBasicBlock &MBB = *MI.getParent();
     DebugLoc DL = II->getDebugLoc();
     unsigned NewImm;
+    const Mips16InstrInfo &TII =
+      *static_cast<const Mips16InstrInfo*>(
+        MBB.getParent()->getTarget().getInstrInfo());
     FrameReg = TII.loadImmediate(FrameReg, Offset, MBB, II, DL, NewImm);
     Offset = SignExtend64<16>(NewImm);
     IsKill = true;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index 2b3d2b1..13e82a3 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -20,10 +20,8 @@ namespace llvm {
 class Mips16InstrInfo;
 
 class Mips16RegisterInfo : public MipsRegisterInfo {
-  const Mips16InstrInfo &TII;
 public:
-  Mips16RegisterInfo(const MipsSubtarget &Subtarget,
-                     const Mips16InstrInfo &TII);
+  Mips16RegisterInfo(const MipsSubtarget &Subtarget);
 
   bool requiresRegisterScavenging(const MachineFunction &MF) const;
 
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index fc533fb..15ef654 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -15,9 +15,6 @@
 // Mips Operand, Complex Patterns and Transformations Definitions.
 //===----------------------------------------------------------------------===//
 
-// Instruction operand types
-def shamt_64       : Operand<i64>;
-
 // Unsigned Operand
 def uimm16_64      : Operand<i64> {
   let PrintMethod = "printUnsignedImm";
@@ -34,42 +31,21 @@ def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
 //===----------------------------------------------------------------------===//
 // Instructions specific format
 //===----------------------------------------------------------------------===//
-let DecoderNamespace = "Mips64" in {
-
-multiclass Atomic2Ops64<PatFrag Op> {
-  def NAME : Atomic2Ops<Op, CPU64Regs, CPURegs>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Atomic2Ops<Op, CPU64Regs, CPU64Regs>,
-             Requires<[IsN64, HasStdEnc]> {
-    let isCodeGenOnly = 1;
-  }
-}
-
-multiclass AtomicCmpSwap64<PatFrag Op>  {
-  def NAME : AtomicCmpSwap<Op, CPU64Regs, CPURegs>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : AtomicCmpSwap<Op, CPU64Regs, CPU64Regs>,
-             Requires<[IsN64, HasStdEnc]> {
-    let isCodeGenOnly = 1;
-  }
-}
-}
-let usesCustomInserter = 1, Predicates = [HasStdEnc],
-  DecoderNamespace = "Mips64" in {
-  defm ATOMIC_LOAD_ADD_I64  : Atomic2Ops64<atomic_load_add_64>;
-  defm ATOMIC_LOAD_SUB_I64  : Atomic2Ops64<atomic_load_sub_64>;
-  defm ATOMIC_LOAD_AND_I64  : Atomic2Ops64<atomic_load_and_64>;
-  defm ATOMIC_LOAD_OR_I64   : Atomic2Ops64<atomic_load_or_64>;
-  defm ATOMIC_LOAD_XOR_I64  : Atomic2Ops64<atomic_load_xor_64>;
-  defm ATOMIC_LOAD_NAND_I64 : Atomic2Ops64<atomic_load_nand_64>;
-  defm ATOMIC_SWAP_I64      : Atomic2Ops64<atomic_swap_64>;
-  defm ATOMIC_CMP_SWAP_I64  : AtomicCmpSwap64<atomic_cmp_swap_64>;
+let usesCustomInserter = 1 in {
+  def ATOMIC_LOAD_ADD_I64  : Atomic2Ops<atomic_load_add_64, GPR64>;
+  def ATOMIC_LOAD_SUB_I64  : Atomic2Ops<atomic_load_sub_64, GPR64>;
+  def ATOMIC_LOAD_AND_I64  : Atomic2Ops<atomic_load_and_64, GPR64>;
+  def ATOMIC_LOAD_OR_I64   : Atomic2Ops<atomic_load_or_64, GPR64>;
+  def ATOMIC_LOAD_XOR_I64  : Atomic2Ops<atomic_load_xor_64, GPR64>;
+  def ATOMIC_LOAD_NAND_I64 : Atomic2Ops<atomic_load_nand_64, GPR64>;
+  def ATOMIC_SWAP_I64      : Atomic2Ops<atomic_swap_64, GPR64>;
+  def ATOMIC_CMP_SWAP_I64  : AtomicCmpSwap<atomic_cmp_swap_64, GPR64>;
 }
 
 /// Pseudo instructions for loading and storing accumulator registers.
-let isPseudo = 1 in {
-  defm LOAD_AC128  : LoadM<"load_ac128", ACRegs128>;
-  defm STORE_AC128 : StoreM<"store_ac128", ACRegs128>;
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  def LOAD_ACC128  : Load<"", ACC128>;
+  def STORE_ACC128 : Store<"", ACC128>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -77,166 +53,174 @@ let isPseudo = 1 in {
 //===----------------------------------------------------------------------===//
 let DecoderNamespace = "Mips64" in {
 /// Arithmetic Instructions (ALU Immediate)
-def DADDi   : ArithLogicI<"daddi", simm16_64, CPU64RegsOpnd>, ADDI_FM<0x18>;
-def DADDiu  : ArithLogicI<"daddiu", simm16_64, CPU64RegsOpnd, immSExt16, add>,
+def DADDi   : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>;
+def DADDiu  : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, IIArith,
+                          immSExt16, add>,
               ADDI_FM<0x19>, IsAsCheapAsAMove;
-def DANDi   : ArithLogicI<"andi", uimm16_64, CPU64RegsOpnd, immZExt16, and>,
-              ADDI_FM<0xc>;
-def SLTi64  : SetCC_I<"slti", setlt, simm16_64, immSExt16, CPU64Regs>,
+
+let isCodeGenOnly = 1 in {
+def SLTi64  : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
               SLTI_FM<0xa>;
-def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, CPU64Regs>,
+def SLTiu64 : SetCC_I<"sltiu", setult, simm16_64, immSExt16, GPR64Opnd>,
               SLTI_FM<0xb>;
-def ORi64   : ArithLogicI<"ori", uimm16_64, CPU64RegsOpnd, immZExt16, or>,
+def ANDi64 : ArithLogicI<"andi", uimm16_64, GPR64Opnd, IILogic, immZExt16,
+                         and>,
+             ADDI_FM<0xc>;
+def ORi64   : ArithLogicI<"ori", uimm16_64, GPR64Opnd, IILogic, immZExt16,
+                          or>,
               ADDI_FM<0xd>;
-def XORi64  : ArithLogicI<"xori", uimm16_64, CPU64RegsOpnd, immZExt16, xor>,
+def XORi64  : ArithLogicI<"xori", uimm16_64, GPR64Opnd, IILogic, immZExt16,
+                          xor>,
               ADDI_FM<0xe>;
-def LUi64   : LoadUpper<"lui", CPU64Regs, uimm16_64>, LUI_FM;
+def LUi64   : LoadUpper<"lui", GPR64Opnd, uimm16_64>, LUI_FM;
+}
 
 /// Arithmetic Instructions (3-Operand, R-Type)
-def DADD   : ArithLogicR<"dadd", CPU64RegsOpnd>, ADD_FM<0, 0x2c>;
-def DADDu  : ArithLogicR<"daddu", CPU64RegsOpnd, 1, IIAlu, add>,
+def DADD   : ArithLogicR<"dadd", GPR64Opnd>, ADD_FM<0, 0x2c>;
+def DADDu  : ArithLogicR<"daddu", GPR64Opnd, 1, IIArith, add>,
                               ADD_FM<0, 0x2d>;
-def DSUBu  : ArithLogicR<"dsubu", CPU64RegsOpnd, 0, IIAlu, sub>,
+def DSUBu  : ArithLogicR<"dsubu", GPR64Opnd, 0, IIArith, sub>,
                               ADD_FM<0, 0x2f>;
-def SLT64  : SetCC_R<"slt", setlt, CPU64Regs>, ADD_FM<0, 0x2a>;
-def SLTu64 : SetCC_R<"sltu", setult, CPU64Regs>, ADD_FM<0, 0x2b>;
-def AND64  : ArithLogicR<"and", CPU64RegsOpnd, 1, IIAlu, and>, ADD_FM<0, 0x24>;
-def OR64   : ArithLogicR<"or", CPU64RegsOpnd, 1, IIAlu, or>, ADD_FM<0, 0x25>;
-def XOR64  : ArithLogicR<"xor", CPU64RegsOpnd, 1, IIAlu, xor>, ADD_FM<0, 0x26>;
-def NOR64  : LogicNOR<"nor", CPU64RegsOpnd>, ADD_FM<0, 0x27>;
+
+let isCodeGenOnly = 1 in {
+def SLT64  : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
+def SLTu64 : SetCC_R<"sltu", setult, GPR64Opnd>, ADD_FM<0, 0x2b>;
+def AND64  : ArithLogicR<"and", GPR64Opnd, 1, IIArith, and>, ADD_FM<0, 0x24>;
+def OR64   : ArithLogicR<"or", GPR64Opnd, 1, IIArith, or>, ADD_FM<0, 0x25>;
+def XOR64  : ArithLogicR<"xor", GPR64Opnd, 1, IIArith, xor>, ADD_FM<0, 0x26>;
+def NOR64  : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
+}
 
 /// Shift Instructions
-def DSLL   : shift_rotate_imm<"dsll", shamt, CPU64RegsOpnd, shl, immZExt6>,
+def DSLL   : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, shl, immZExt6>,
              SRA_FM<0x38, 0>;
-def DSRL   : shift_rotate_imm<"dsrl", shamt, CPU64RegsOpnd, srl, immZExt6>,
+def DSRL   : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, srl, immZExt6>,
              SRA_FM<0x3a, 0>;
-def DSRA   : shift_rotate_imm<"dsra", shamt, CPU64RegsOpnd, sra, immZExt6>,
+def DSRA   : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, sra, immZExt6>,
              SRA_FM<0x3b, 0>;
-def DSLLV  : shift_rotate_reg<"dsllv", CPU64RegsOpnd, shl>, SRLV_FM<0x14, 0>;
-def DSRLV  : shift_rotate_reg<"dsrlv", CPU64RegsOpnd, srl>, SRLV_FM<0x16, 0>;
-def DSRAV  : shift_rotate_reg<"dsrav", CPU64RegsOpnd, sra>, SRLV_FM<0x17, 0>;
-def DSLL32 : shift_rotate_imm<"dsll32", shamt, CPU64RegsOpnd>, SRA_FM<0x3c, 0>;
-def DSRL32 : shift_rotate_imm<"dsrl32", shamt, CPU64RegsOpnd>, SRA_FM<0x3e, 0>;
-def DSRA32 : shift_rotate_imm<"dsra32", shamt, CPU64RegsOpnd>, SRA_FM<0x3f, 0>;
-}
+def DSLLV  : shift_rotate_reg<"dsllv", GPR64Opnd, shl>, SRLV_FM<0x14, 0>;
+def DSRLV  : shift_rotate_reg<"dsrlv", GPR64Opnd, srl>, SRLV_FM<0x16, 0>;
+def DSRAV  : shift_rotate_reg<"dsrav", GPR64Opnd, sra>, SRLV_FM<0x17, 0>;
+def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd>, SRA_FM<0x3c, 0>;
+def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 0>;
+def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd>, SRA_FM<0x3f, 0>;
+
 // Rotate Instructions
-let Predicates = [HasMips64r2, HasStdEnc],
-    DecoderNamespace = "Mips64" in {
-  def DROTR  : shift_rotate_imm<"drotr", shamt, CPU64RegsOpnd, rotr, immZExt6>,
+let Predicates = [HasMips64r2, HasStdEnc] in {
+  def DROTR  : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, rotr, immZExt6>,
                SRA_FM<0x3a, 1>;
-  def DROTRV : shift_rotate_reg<"drotrv", CPU64RegsOpnd, rotr>,
+  def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, rotr>,
                SRLV_FM<0x16, 1>;
+  def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd>, SRA_FM<0x3e, 1>;
 }
 
-let DecoderNamespace = "Mips64" in {
 /// Load and Store Instructions
 ///  aligned
-defm LB64  : LoadM<"lb", CPU64Regs, sextloadi8>, LW_FM<0x20>;
-defm LBu64 : LoadM<"lbu", CPU64Regs, zextloadi8>, LW_FM<0x24>;
-defm LH64  : LoadM<"lh", CPU64Regs, sextloadi16>, LW_FM<0x21>;
-defm LHu64 : LoadM<"lhu", CPU64Regs, zextloadi16>, LW_FM<0x25>;
-defm LW64  : LoadM<"lw", CPU64Regs, sextloadi32>, LW_FM<0x23>;
-defm LWu64 : LoadM<"lwu", CPU64Regs, zextloadi32>, LW_FM<0x27>;
-defm SB64  : StoreM<"sb", CPU64Regs, truncstorei8>, LW_FM<0x28>;
-defm SH64  : StoreM<"sh", CPU64Regs, truncstorei16>, LW_FM<0x29>;
-defm SW64  : StoreM<"sw", CPU64Regs, truncstorei32>, LW_FM<0x2b>;
-defm LD    : LoadM<"ld", CPU64Regs, load>, LW_FM<0x37>;
-defm SD    : StoreM<"sd", CPU64Regs, store>, LW_FM<0x3f>;
+let isCodeGenOnly = 1 in {
+def LB64  : Load<"lb", GPR64Opnd, sextloadi8, IILoad>, LW_FM<0x20>;
+def LBu64 : Load<"lbu", GPR64Opnd, zextloadi8, IILoad>, LW_FM<0x24>;
+def LH64  : Load<"lh", GPR64Opnd, sextloadi16, IILoad>, LW_FM<0x21>;
+def LHu64 : Load<"lhu", GPR64Opnd, zextloadi16, IILoad>, LW_FM<0x25>;
+def LW64  : Load<"lw", GPR64Opnd, sextloadi32, IILoad>, LW_FM<0x23>;
+def SB64  : Store<"sb", GPR64Opnd, truncstorei8, IIStore>, LW_FM<0x28>;
+def SH64  : Store<"sh", GPR64Opnd, truncstorei16, IIStore>, LW_FM<0x29>;
+def SW64  : Store<"sw", GPR64Opnd, truncstorei32, IIStore>, LW_FM<0x2b>;
+}
+
+def LWu   : Load<"lwu", GPR64Opnd, zextloadi32, IILoad>, LW_FM<0x27>;
+def LD    : Load<"ld", GPR64Opnd, load, IILoad>, LW_FM<0x37>;
+def SD    : Store<"sd", GPR64Opnd, store, IIStore>, LW_FM<0x3f>;
 
 /// load/store left/right
-defm LWL64 : LoadLeftRightM<"lwl", MipsLWL, CPU64Regs>, LW_FM<0x22>;
-defm LWR64 : LoadLeftRightM<"lwr", MipsLWR, CPU64Regs>, LW_FM<0x26>;
-defm SWL64 : StoreLeftRightM<"swl", MipsSWL, CPU64Regs>, LW_FM<0x2a>;
-defm SWR64 : StoreLeftRightM<"swr", MipsSWR, CPU64Regs>, LW_FM<0x2e>;
+let isCodeGenOnly = 1 in {
+def LWL64 : LoadLeftRight<"lwl", MipsLWL, GPR64Opnd, IILoad>, LW_FM<0x22>;
+def LWR64 : LoadLeftRight<"lwr", MipsLWR, GPR64Opnd, IILoad>, LW_FM<0x26>;
+def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, IIStore>, LW_FM<0x2a>;
+def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, IIStore>, LW_FM<0x2e>;
+}
 
-defm LDL   : LoadLeftRightM<"ldl", MipsLDL, CPU64Regs>, LW_FM<0x1a>;
-defm LDR   : LoadLeftRightM<"ldr", MipsLDR, CPU64Regs>, LW_FM<0x1b>;
-defm SDL   : StoreLeftRightM<"sdl", MipsSDL, CPU64Regs>, LW_FM<0x2c>;
-defm SDR   : StoreLeftRightM<"sdr", MipsSDR, CPU64Regs>, LW_FM<0x2d>;
+def LDL   : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, IILoad>, LW_FM<0x1a>;
+def LDR   : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, IILoad>, LW_FM<0x1b>;
+def SDL   : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, IIStore>, LW_FM<0x2c>;
+def SDR   : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, IIStore>, LW_FM<0x2d>;
 
 /// Load-linked, Store-conditional
-let Predicates = [NotN64, HasStdEnc] in {
-  def LLD : LLBase<"lld", CPU64RegsOpnd, mem>, LW_FM<0x34>;
-  def SCD : SCBase<"scd", CPU64RegsOpnd, mem>, LW_FM<0x3c>;
-}
-
-let Predicates = [IsN64, HasStdEnc], isCodeGenOnly = 1 in {
-  def LLD_P8 : LLBase<"lld", CPU64RegsOpnd, mem64>, LW_FM<0x34>;
-  def SCD_P8 : SCBase<"scd", CPU64RegsOpnd, mem64>, LW_FM<0x3c>;
-}
+def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>;
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>;
 
 /// Jump and Branch Instructions
-def JR64   : IndirectBranch<CPU64Regs>, MTLO_FM<8>;
-def BEQ64  : CBranch<"beq", seteq, CPU64Regs>, BEQ_FM<4>;
-def BNE64  : CBranch<"bne", setne, CPU64Regs>, BEQ_FM<5>;
-def BGEZ64 : CBranchZero<"bgez", setge, CPU64Regs>, BGEZ_FM<1, 1>;
-def BGTZ64 : CBranchZero<"bgtz", setgt, CPU64Regs>, BGEZ_FM<7, 0>;
-def BLEZ64 : CBranchZero<"blez", setle, CPU64Regs>, BGEZ_FM<6, 0>;
-def BLTZ64 : CBranchZero<"bltz", setlt, CPU64Regs>, BGEZ_FM<1, 0>;
+let isCodeGenOnly = 1 in {
+def JR64   : IndirectBranch<"jr", GPR64Opnd>, MTLO_FM<8>;
+def BEQ64  : CBranch<"beq", brtarget, seteq, GPR64Opnd>, BEQ_FM<4>;
+def BNE64  : CBranch<"bne", brtarget, setne, GPR64Opnd>, BEQ_FM<5>;
+def BGEZ64 : CBranchZero<"bgez", brtarget, setge, GPR64Opnd>, BGEZ_FM<1, 1>;
+def BGTZ64 : CBranchZero<"bgtz", brtarget, setgt, GPR64Opnd>, BGEZ_FM<7, 0>;
+def BLEZ64 : CBranchZero<"blez", brtarget, setle, GPR64Opnd>, BGEZ_FM<6, 0>;
+def BLTZ64 : CBranchZero<"bltz", brtarget, setlt, GPR64Opnd>, BGEZ_FM<1, 0>;
+def JALR64 : JumpLinkReg<"jalr", GPR64Opnd>, JALR_FM;
+def JALR64Pseudo : JumpLinkRegPseudo<GPR64Opnd, JALR, RA, GPR32Opnd>;
+def TAILCALL64_R : JumpFR<"tcallr", GPR64Opnd, MipsTailCall>,
+                   MTLO_FM<8>, IsTailCall;
 }
-let DecoderNamespace = "Mips64" in
-def JALR64 : JumpLinkReg<"jalr", CPU64Regs>, JALR_FM;
-def JALR64Pseudo : JumpLinkRegPseudo<CPU64Regs, JALR64, RA_64>;
-def TAILCALL64_R : JumpFR<CPU64Regs, MipsTailCall>, MTLO_FM<8>, IsTailCall;
 
-let DecoderNamespace = "Mips64" in {
 /// Multiply and Divide Instructions.
-def DMULT  : Mult<"dmult", IIImul, CPU64RegsOpnd, [HI64, LO64]>,
+def DMULT  : Mult<"dmult", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
              MULT_FM<0, 0x1c>;
-def DMULTu : Mult<"dmultu", IIImul, CPU64RegsOpnd, [HI64, LO64]>,
+def DMULTu : Mult<"dmultu", IIImult, GPR64Opnd, [HI0_64, LO0_64]>,
              MULT_FM<0, 0x1d>;
-def PseudoDMULT  : MultDivPseudo<DMULT, ACRegs128, CPU64RegsOpnd, MipsMult,
-                                 IIImul>;
-def PseudoDMULTu : MultDivPseudo<DMULTu, ACRegs128, CPU64RegsOpnd, MipsMultu,
-                                 IIImul>;
-def DSDIV : Div<"ddiv", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1e>;
-def DUDIV : Div<"ddivu", IIIdiv, CPU64RegsOpnd, [HI64, LO64]>, MULT_FM<0, 0x1f>;
-def PseudoDSDIV : MultDivPseudo<DSDIV, ACRegs128, CPU64RegsOpnd, MipsDivRem,
-                                IIIdiv, 0>;
-def PseudoDUDIV : MultDivPseudo<DUDIV, ACRegs128, CPU64RegsOpnd, MipsDivRemU,
-                                IIIdiv, 0>;
-
-def MTHI64 : MoveToLOHI<"mthi", CPU64Regs, [HI64]>, MTLO_FM<0x11>;
-def MTLO64 : MoveToLOHI<"mtlo", CPU64Regs, [LO64]>, MTLO_FM<0x13>;
-def MFHI64 : MoveFromLOHI<"mfhi", CPU64Regs, [HI64]>, MFLO_FM<0x10>;
-def MFLO64 : MoveFromLOHI<"mflo", CPU64Regs, [LO64]>, MFLO_FM<0x12>;
+def PseudoDMULT  : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
+                                 IIImult>;
+def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
+                                 IIImult>;
+def DSDIV : Div<"ddiv", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1e>;
+def DUDIV : Div<"ddivu", IIIdiv, GPR64Opnd, [HI0_64, LO0_64]>, MULT_FM<0, 0x1f>;
+def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
+                                IIIdiv, 0, 1, 1>;
+def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
+                                IIIdiv, 0, 1, 1>;
+
+let isCodeGenOnly = 1 in {
+def MTHI64 : MoveToLOHI<"mthi", GPR64Opnd, [HI0_64]>, MTLO_FM<0x11>;
+def MTLO64 : MoveToLOHI<"mtlo", GPR64Opnd, [LO0_64]>, MTLO_FM<0x13>;
+def MFHI64 : MoveFromLOHI<"mfhi", GPR64Opnd, AC0_64>, MFLO_FM<0x10>;
+def MFLO64 : MoveFromLOHI<"mflo", GPR64Opnd, AC0_64>, MFLO_FM<0x12>;
+def PseudoMFHI64 : PseudoMFLOHI<GPR64, ACC128, MipsMFHI>;
+def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>;
+def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>;
 
 /// Sign Ext In Register Instructions.
-def SEB64 : SignExtInReg<"seb", i8, CPU64Regs>, SEB_FM<0x10, 0x20>;
-def SEH64 : SignExtInReg<"seh", i16, CPU64Regs>, SEB_FM<0x18, 0x20>;
+def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd>, SEB_FM<0x10, 0x20>;
+def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd>, SEB_FM<0x18, 0x20>;
+}
 
 /// Count Leading
-def DCLZ : CountLeading0<"dclz", CPU64RegsOpnd>, CLO_FM<0x24>;
-def DCLO : CountLeading1<"dclo", CPU64RegsOpnd>, CLO_FM<0x25>;
+def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>;
+def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>;
 
 /// Double Word Swap Bytes/HalfWords
-def DSBH : SubwordSwap<"dsbh", CPU64RegsOpnd>, SEB_FM<2, 0x24>;
-def DSHD : SubwordSwap<"dshd", CPU64RegsOpnd>, SEB_FM<5, 0x24>;
+def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>;
+def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>;
 
-def LEA_ADDiu64 : EffectiveAddress<"daddiu", CPU64Regs, mem_ea_64>, LW_FM<0x19>;
+def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
 
-}
-let DecoderNamespace = "Mips64" in {
-def RDHWR64 : ReadHardware<CPU64Regs, HW64RegsOpnd>, RDHWR_FM;
+let isCodeGenOnly = 1 in
+def RDHWR64 : ReadHardware<GPR64Opnd, HWRegsOpnd>, RDHWR_FM;
 
-def DEXT : ExtBase<"dext", CPU64RegsOpnd>, EXT_FM<3>;
-let Pattern = []<dag> in {
-  def DEXTU : ExtBase<"dextu", CPU64RegsOpnd>, EXT_FM<2>;
-  def DEXTM : ExtBase<"dextm", CPU64RegsOpnd>, EXT_FM<1>;
-}
-def DINS : InsBase<"dins", CPU64RegsOpnd>, EXT_FM<7>;
-let Pattern = []<dag> in {
-  def DINSU : InsBase<"dinsu", CPU64RegsOpnd>, EXT_FM<6>;
-  def DINSM : InsBase<"dinsm", CPU64RegsOpnd>, EXT_FM<5>;
-}
+def DEXT : ExtBase<"dext", GPR64Opnd, uimm6, MipsExt>, EXT_FM<3>;
+def DEXTU : ExtBase<"dextu", GPR64Opnd, uimm6>, EXT_FM<2>;
+def DEXTM : ExtBase<"dextm", GPR64Opnd, uimm5>, EXT_FM<1>;
+
+def DINS : InsBase<"dins", GPR64Opnd, uimm6, MipsIns>, EXT_FM<7>;
+def DINSU : InsBase<"dinsu", GPR64Opnd, uimm6>, EXT_FM<6>;
+def DINSM : InsBase<"dinsm", GPR64Opnd, uimm5>, EXT_FM<5>;
 
 let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
-  def DSLL64_32 : FR<0x00, 0x3c, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
-                     "dsll\t$rd, $rt, 32", [], IIAlu>;
-  def SLL64_32 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
-                    "sll\t$rd, $rt, 0", [], IIAlu>;
-  def SLL64_64 : FR<0x0, 0x00, (outs CPU64Regs:$rd), (ins CPU64Regs:$rt),
-                    "sll\t$rd, $rt, 0", [], IIAlu>;
+  def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
+                     "dsll\t$rd, $rt, 32", [], IIArith>;
+  def SLL64_32 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR32:$rt),
+                    "sll\t$rd, $rt, 0", [], IIArith>;
+  def SLL64_64 : FR<0x0, 0x00, (outs GPR64:$rd), (ins GPR64:$rt),
+                    "sll\t$rd, $rt, 0", [], IIArith>;
 }
 }
 //===----------------------------------------------------------------------===//
@@ -244,18 +228,12 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
 //===----------------------------------------------------------------------===//
 
 // extended loads
-let Predicates = [NotN64, HasStdEnc] in {
+let Predicates = [HasStdEnc] in {
   def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64 addr:$src)>;
   def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64 addr:$src)>;
   def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
   def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
 }
-let Predicates = [IsN64, HasStdEnc] in {
-  def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64_P8 addr:$src)>;
-  def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64_P8 addr:$src)>;
-  def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64_P8 addr:$src)>;
-  def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64_P8 addr:$src)>;
-}
 
 // hi/lo relocs
 def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
@@ -273,118 +251,80 @@ def : MipsPat<(MipsLo tglobaltlsaddr:$in),
               (DADDiu ZERO_64, tglobaltlsaddr:$in)>;
 def : MipsPat<(MipsLo texternalsym:$in), (DADDiu ZERO_64, texternalsym:$in)>;
 
-def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)),
-              (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>;
-def : MipsPat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)),
-              (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>;
-def : MipsPat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)),
-              (DADDiu CPU64Regs:$hi, tjumptable:$lo)>;
-def : MipsPat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)),
-              (DADDiu CPU64Regs:$hi, tconstpool:$lo)>;
-def : MipsPat<(add CPU64Regs:$hi, (MipsLo tglobaltlsaddr:$lo)),
-              (DADDiu CPU64Regs:$hi, tglobaltlsaddr:$lo)>;
-
-def : WrapperPat<tglobaladdr, DADDiu, CPU64Regs>;
-def : WrapperPat<tconstpool, DADDiu, CPU64Regs>;
-def : WrapperPat<texternalsym, DADDiu, CPU64Regs>;
-def : WrapperPat<tblockaddress, DADDiu, CPU64Regs>;
-def : WrapperPat<tjumptable, DADDiu, CPU64Regs>;
-def : WrapperPat<tglobaltlsaddr, DADDiu, CPU64Regs>;
-
-defm : BrcondPats<CPU64Regs, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaladdr:$lo)),
+              (DADDiu GPR64:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tblockaddress:$lo)),
+              (DADDiu GPR64:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tjumptable:$lo)),
+              (DADDiu GPR64:$hi, tjumptable:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tconstpool:$lo)),
+              (DADDiu GPR64:$hi, tconstpool:$lo)>;
+def : MipsPat<(add GPR64:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (DADDiu GPR64:$hi, tglobaltlsaddr:$lo)>;
+
+def : WrapperPat<tglobaladdr, DADDiu, GPR64>;
+def : WrapperPat<tconstpool, DADDiu, GPR64>;
+def : WrapperPat<texternalsym, DADDiu, GPR64>;
+def : WrapperPat<tblockaddress, DADDiu, GPR64>;
+def : WrapperPat<tjumptable, DADDiu, GPR64>;
+def : WrapperPat<tglobaltlsaddr, DADDiu, GPR64>;
+
+defm : BrcondPats<GPR64, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64,
                   ZERO_64>;
 
+def : MipsPat<(brcond (i32 (setlt i64:$lhs, 1)), bb:$dst),
+              (BLEZ64 i64:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i64:$lhs, -1)), bb:$dst),
+              (BGEZ64 i64:$lhs, bb:$dst)>;
+
 // setcc patterns
-defm : SeteqPats<CPU64Regs, SLTiu64, XOR64, SLTu64, ZERO_64>;
-defm : SetlePats<CPU64Regs, SLT64, SLTu64>;
-defm : SetgtPats<CPU64Regs, SLT64, SLTu64>;
-defm : SetgePats<CPU64Regs, SLT64, SLTu64>;
-defm : SetgeImmPats<CPU64Regs, SLTi64, SLTiu64>;
+defm : SeteqPats<GPR64, SLTiu64, XOR64, SLTu64, ZERO_64>;
+defm : SetlePats<GPR64, SLT64, SLTu64>;
+defm : SetgtPats<GPR64, SLT64, SLTu64>;
+defm : SetgePats<GPR64, SLT64, SLTu64>;
+defm : SetgeImmPats<GPR64, SLTi64, SLTiu64>;
 
 // truncate
-def : MipsPat<(i32 (trunc CPU64Regs:$src)),
-              (SLL (EXTRACT_SUBREG CPU64Regs:$src, sub_32), 0)>,
-      Requires<[IsN64, HasStdEnc]>;
+def : MipsPat<(i32 (trunc GPR64:$src)),
+              (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>,
+      Requires<[HasStdEnc]>;
 
 // 32-to-64-bit extension
-def : MipsPat<(i64 (anyext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
-def : MipsPat<(i64 (zext CPURegs:$src)), (DSRL (DSLL64_32 CPURegs:$src), 32)>;
-def : MipsPat<(i64 (sext CPURegs:$src)), (SLL64_32 CPURegs:$src)>;
+def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>;
+def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
+def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
 
 // Sign extend in register
-def : MipsPat<(i64 (sext_inreg CPU64Regs:$src, i32)),
-              (SLL64_64 CPU64Regs:$src)>;
+def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
+              (SLL64_64 GPR64:$src)>;
 
 // bswap MipsPattern
-def : MipsPat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>;
-
-// mflo/hi patterns.
-def : MipsPat<(i64 (ExtractLOHI ACRegs128:$ac, imm:$lohi_idx)),
-              (EXTRACT_SUBREG ACRegs128:$ac, imm:$lohi_idx)>;
+def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
 
 //===----------------------------------------------------------------------===//
 // Instruction aliases
 //===----------------------------------------------------------------------===//
 def : InstAlias<"move $dst, $src",
-                (DADDu CPU64RegsOpnd:$dst,  CPU64RegsOpnd:$src, ZERO_64), 1>,
+                (DADDu GPR64Opnd:$dst,  GPR64Opnd:$src, ZERO_64), 1>,
       Requires<[HasMips64]>;
-def : InstAlias<"move $dst, $src",
-                (OR64 CPU64RegsOpnd:$dst, CPU64RegsOpnd:$src, ZERO_64), 1>,
-      Requires<[HasMips64]>;
-def : InstAlias<"and $rs, $rt, $imm",
-                (DANDi CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
-                1>,
-      Requires<[HasMips64]>;
-def : InstAlias<"slt $rs, $rt, $imm",
-                (SLTi64 CPURegsOpnd:$rs, CPU64Regs:$rt, simm16_64:$imm), 1>,
-      Requires<[HasMips64]>;
-def : InstAlias<"xor $rs, $rt, $imm",
-                (XORi64 CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
-                1>,
-      Requires<[HasMips64]>;
-def : InstAlias<"not $rt, $rs",
-                (NOR64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rs, ZERO_64), 1>,
-      Requires<[HasMips64]>;
-def : InstAlias<"j $rs", (JR64 CPU64Regs:$rs), 0>, Requires<[HasMips64]>;
-def : InstAlias<"jalr $rs", (JALR64 RA_64, CPU64Regs:$rs)>,
-      Requires<[HasMips64]>;
-def : InstAlias<"jal $rs", (JALR64 RA_64, CPU64Regs:$rs), 0>,
-                 Requires<[HasMips64]>;
-def : InstAlias<"jal $rd,$rs", (JALR64 CPU64Regs:$rd, CPU64Regs:$rs), 0>,
-                 Requires<[HasMips64]>;
 def : InstAlias<"daddu $rs, $rt, $imm",
-                (DADDiu CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, simm16_64:$imm),
-                1>;
+                (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                0>;
 def : InstAlias<"dadd $rs, $rt, $imm",
-                (DADDi CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, simm16_64:$imm),
-                1>;
-def : InstAlias<"or $rs, $rt, $imm",
-                (ORi64 CPU64RegsOpnd:$rs, CPU64RegsOpnd:$rt, uimm16_64:$imm),
-                1>, Requires<[HasMips64]>;
-/// Move between CPU and coprocessor registers
+                (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                0>;
 
-let DecoderNamespace = "Mips64" in {
-def DMFC0_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rt),
-                         (ins CPU64RegsOpnd:$rd, uimm16:$sel),
-                         "dmfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 1>;
-def DMTC0_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rd, uimm16:$sel),
-                         (ins CPU64RegsOpnd:$rt),
-                         "dmtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 5>;
-def DMFC2_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rt),
-                         (ins CPU64RegsOpnd:$rd, uimm16:$sel),
-                         "dmfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 1>;
-def DMTC2_3OP64 : MFC3OP<(outs CPU64RegsOpnd:$rd, uimm16:$sel),
-                         (ins CPU64RegsOpnd:$rt),
-                         "dmtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 5>;
+/// Move between CPU and coprocessor registers
+let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
+def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
+def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>;
+def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>;
 }
 
 // Two operand (implicit 0 selector) versions:
-def : InstAlias<"dmfc0 $rt, $rd",
-                (DMFC0_3OP64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rd, 0), 0>;
-def : InstAlias<"dmtc0 $rt, $rd",
-                (DMTC0_3OP64 CPU64RegsOpnd:$rd, 0, CPU64RegsOpnd:$rt), 0>;
-def : InstAlias<"dmfc2 $rt, $rd",
-                (DMFC2_3OP64 CPU64RegsOpnd:$rt, CPU64RegsOpnd:$rd, 0), 0>;
-def : InstAlias<"dmtc2 $rt, $rd",
-                (DMTC2_3OP64 CPU64RegsOpnd:$rd, 0, CPU64RegsOpnd:$rt), 0>;
+def : InstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : InstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
 
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index 99b163e..31a9b7d 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -40,7 +40,7 @@ void MipsAnalyzeImmediate::GetInstSeqLsORi(uint64_t Imm, unsigned RemSize,
 
 void MipsAnalyzeImmediate::GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize,
                                            InstSeqLs &SeqLs) {
-  unsigned Shamt = CountTrailingZeros_64(Imm);
+  unsigned Shamt = countTrailingZeros(Imm);
   GetInstSeqLs(Imm >> Shamt, RemSize - Shamt, SeqLs);
   AddInstr(SeqLs, Inst(SLL, Shamt));
 }
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.h b/lib/Target/Mips/MipsAnalyzeImmediate.h
index a094dda..cc09034 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.h
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.h
@@ -22,7 +22,7 @@ namespace llvm {
     };
     typedef SmallVector<Inst, 7 > InstSeq;
 
-    /// Analyze - Get an instrucion sequence to load immediate Imm. The last
+    /// Analyze - Get an instruction sequence to load immediate Imm. The last
     /// instruction in the sequence must be an ADDiu if LastInstrIsADDiu is
     /// true;
     const InstSeq &Analyze(uint64_t Imm, unsigned Size, bool LastInstrIsADDiu);
@@ -32,19 +32,19 @@ namespace llvm {
     /// AddInstr - Add I to all instruction sequences in SeqLs.
     void AddInstr(InstSeqLs &SeqLs, const Inst &I);
 
-    /// GetInstSeqLsADDiu - Get instrucion sequences which end with an ADDiu to
+    /// GetInstSeqLsADDiu - Get instruction sequences which end with an ADDiu to
     /// load immediate Imm
     void GetInstSeqLsADDiu(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
-    /// GetInstSeqLsORi - Get instrucion sequences which end with an ORi to
+    /// GetInstSeqLsORi - Get instrutcion sequences which end with an ORi to
     /// load immediate Imm
     void GetInstSeqLsORi(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
-    /// GetInstSeqLsSLL - Get instrucion sequences which end with a SLL to
+    /// GetInstSeqLsSLL - Get instruction sequences which end with a SLL to
     /// load immediate Imm
     void GetInstSeqLsSLL(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
-    /// GetInstSeqLs - Get instrucion sequences to load immediate Imm.
+    /// GetInstSeqLs - Get instruction sequences to load immediate Imm.
     void GetInstSeqLs(uint64_t Imm, unsigned RemSize, InstSeqLs &SeqLs);
 
     /// ReplaceADDiuSLLWithLUi - Replace an ADDiu & SLL pair with a LUi.
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 6e4feda..45c4398 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,11 +15,11 @@
 #define DEBUG_TYPE "mips-asm-printer"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
-#include "MCTargetDesc/MipsELFStreamer.h"
 #include "Mips.h"
 #include "MipsAsmPrinter.h"
 #include "MipsInstrInfo.h"
 #include "MipsMCInstLower.h"
+#include "MipsTargetStreamer.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
@@ -33,8 +33,8 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -45,12 +45,17 @@
 
 using namespace llvm;
 
+MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() {
+  return static_cast<MipsTargetStreamer &>(OutStreamer.getTargetStreamer());
+}
+
 bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   // Initialize TargetLoweringObjectFile.
   if (Subtarget->allowMixed16_32())
     const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
       .Initialize(OutContext, TM);
   MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MCP = MF.getConstantPool();
   AsmPrinter::runOnMachineFunction(MF);
   return true;
 }
@@ -71,6 +76,39 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  // If we just ended a constant pool, mark it as such.
+  if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+    OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
+    InConstantPool = false;
+  }
+  if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+    // CONSTPOOL_ENTRY - This instruction represents a floating
+    //constant pool in the function.  The first operand is the ID#
+    // for this instruction, the second is the index into the
+    // MachineConstantPool that this is, the third is the size in
+    // bytes of this constant pool entry.
+    // The required alignment is specified on the basic block holding this MI.
+    //
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    // If this is the first entry of the pool, mark it.
+    if (!InConstantPool) {
+      OutStreamer.EmitDataRegion(MCDR_DataRegion);
+      InConstantPool = true;
+    }
+
+    OutStreamer.EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    return;
+  }
+
+
   MachineBasicBlock::const_instr_iterator I = MI;
   MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
 
@@ -141,7 +179,7 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
   const MachineFrameInfo *MFI = MF->getFrameInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   // size of stack area to which FP callee-saved regs are saved.
-  unsigned CPURegSize = Mips::CPURegsRegClass.getSize();
+  unsigned CPURegSize = Mips::GPR32RegClass.getSize();
   unsigned FGR32RegSize = Mips::FGR32RegClass.getSize();
   unsigned AFGR64RegSize = Mips::AFGR64RegClass.getSize();
   bool HasAFGR64Reg = false;
@@ -151,7 +189,7 @@ void MipsAsmPrinter::printSavedRegsBitmask(raw_ostream &O) {
   // Set FPU Bitmask.
   for (i = 0; i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
-    if (Mips::CPURegsRegClass.contains(Reg))
+    if (Mips::GPR32RegClass.contains(Reg))
       break;
 
     unsigned RegNum = TM.getRegisterInfo()->getEncodingValue(Reg);
@@ -238,16 +276,15 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() {
   }
 
   if (Subtarget->inMicroMipsMode())
-    if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
-      MES->emitMipsSTOCG(*Subtarget, CurrentFnSym,
-      (unsigned)ELF::STO_MIPS_MICROMIPS);
+    getTargetStreamer().emitMipsHackSTOCG(CurrentFnSym,
+                                          (unsigned)ELF::STO_MIPS_MICROMIPS);
   OutStreamer.EmitLabel(CurrentFnSym);
 }
 
 /// EmitFunctionBodyStart - Targets can override this to emit stuff before
 /// the first basic block in the function.
 void MipsAsmPrinter::EmitFunctionBodyStart() {
-  MCInstLowering.Initialize(Mang, &MF->getContext());
+  MCInstLowering.Initialize(&MF->getContext());
 
   bool IsNakedFunction =
     MF->getFunction()->
@@ -284,6 +321,12 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() {
     }
     OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
   }
+  // Make sure to terminate any constant pools that were at the end
+  // of the function.
+  if (!InConstantPool)
+    return;
+  InConstantPool = false;
+  OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
 }
 
 /// isBlockOnlyReachableByFallthough - Return true if the basic block has
@@ -418,6 +461,11 @@ bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
         return false;
       }
     }
+    case 'w':
+      // Print MSA registers for the 'f' constraint
+      // In LLVM, the 'w' modifier doesn't need to do anything.
+      // We can just call printOperand as normal.
+      break;
     }
   }
 
@@ -485,7 +533,7 @@ void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       return;
 
     case MachineOperand::MO_GlobalAddress:
-      O << *Mang->getSymbol(MO.getGlobal());
+      O << *getSymbol(MO.getGlobal());
       break;
 
     case MachineOperand::MO_BlockAddress: {
@@ -526,6 +574,15 @@ void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum,
     printOperand(MI, opNum, O);
 }
 
+void MipsAsmPrinter::printUnsignedImm8(const MachineInstr *MI, int opNum,
+                                       raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.isImm())
+    O << (unsigned short int)(unsigned char)MO.getImm();
+  else
+    printOperand(MI, opNum, O);
+}
+
 void MipsAsmPrinter::
 printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O) {
   // Load/Store memory operands -- imm($reg)
@@ -557,6 +614,15 @@ printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
 void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
   // FIXME: Use SwitchSection.
 
+  // TODO: Need to add -mabicalls and -mno-abicalls flags.
+  // Currently we assume that -mabicalls is the default.
+  if (OutStreamer.hasRawTextSupport()) {
+    OutStreamer.EmitRawText(StringRef("\t.abicalls"));
+    Reloc::Model RM = Subtarget->getRelocationModel();
+    if (RM == Reloc::Static && !Subtarget->hasMips64())
+      OutStreamer.EmitRawText(StringRef("\t.option\tpic0"));
+  }
+
   // Tell the assembler which ABI we are using
   if (OutStreamer.hasRawTextSupport())
     OutStreamer.EmitRawText("\t.section .mdebug." +
@@ -578,25 +644,54 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
 
 }
 
-void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+static void emitELFHeaderFlagsCG(MipsTargetStreamer &TargetStreamer,
+                                 const MipsSubtarget &Subtarget) {
+  // Update e_header flags
+  unsigned EFlags = 0;
+
+  // TODO: Need to add -mabicalls and -mno-abicalls flags.
+  // Currently we assume that -mabicalls is the default.
+  EFlags |= ELF::EF_MIPS_CPIC;
+
+  if (Subtarget.inMips16Mode())
+    EFlags |= ELF::EF_MIPS_ARCH_ASE_M16;
+  else
+    EFlags |= ELF::EF_MIPS_NOREORDER;
+
+  // Architecture
+  if (Subtarget.hasMips64r2())
+    EFlags |= ELF::EF_MIPS_ARCH_64R2;
+  else if (Subtarget.hasMips64())
+    EFlags |= ELF::EF_MIPS_ARCH_64;
+  else if (Subtarget.hasMips32r2())
+    EFlags |= ELF::EF_MIPS_ARCH_32R2;
+  else
+    EFlags |= ELF::EF_MIPS_ARCH_32;
+
+  if (Subtarget.inMicroMipsMode())
+    EFlags |= ELF::EF_MIPS_MICROMIPS;
 
-  if (OutStreamer.hasRawTextSupport()) return;
+  // ABI
+  if (Subtarget.isABI_O32())
+    EFlags |= ELF::EF_MIPS_ABI_O32;
 
+  // Relocation Model
+  Reloc::Model RM = Subtarget.getRelocationModel();
+  if (RM == Reloc::PIC_ || RM == Reloc::Default)
+    EFlags |= ELF::EF_MIPS_PIC;
+  else if (RM == Reloc::Static)
+    ; // Do nothing for Reloc::Static
+  else
+    llvm_unreachable("Unsupported relocation model for e_flags");
+
+  TargetStreamer.emitMipsHackELFFlags(EFlags);
+}
+
+void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
   // Emit Mips ELF register info
   Subtarget->getMReginfo().emitMipsReginfoSectionCG(
              OutStreamer, getObjFileLowering(), *Subtarget);
-  if (MipsELFStreamer *MES = dyn_cast<MipsELFStreamer>(&OutStreamer))
-    MES->emitELFHeaderFlagsCG(*Subtarget);
-}
-
-MachineLocation
-MipsAsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
-  // Handles frame addresses emitted in MipsInstrInfo::emitFrameIndexDebugValue.
-  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
-         "Unexpected MachineOperand types");
-  return MachineLocation(MI->getOperand(0).getReg(),
-                         MI->getOperand(1).getImm());
+  emitELFHeaderFlagsCG(getTargetStreamer(), *Subtarget);
 }
 
 void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index dbdaf26..11c6acd 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -25,10 +25,12 @@ namespace llvm {
 class MCStreamer;
 class MachineInstr;
 class MachineBasicBlock;
+class MipsTargetStreamer;
 class Module;
 class raw_ostream;
 
 class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
+  MipsTargetStreamer &getTargetStreamer();
 
   void EmitInstrWithMacroNoAT(const MachineInstr *MI);
 
@@ -40,6 +42,16 @@ private:
   // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
 
+  /// MCP - Keep a pointer to constantpool entries of the current
+  /// MachineFunction.
+  const MachineConstantPool *MCP;
+
+  /// InConstantPool - Maintain state when emitting a sequence of constant
+  /// pool entries so we can properly mark them as data regions.
+  bool InConstantPool;
+
+  bool UsingConstantPools;
+
 public:
 
   const MipsSubtarget *Subtarget;
@@ -47,8 +59,11 @@ public:
   MipsMCInstLower MCInstLowering;
 
   explicit MipsAsmPrinter(TargetMachine &TM,  MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), MCInstLowering(*this) {
+    : AsmPrinter(TM, Streamer), MCP(0), InConstantPool(false),
+      MCInstLowering(*this) {
     Subtarget = &TM.getSubtarget<MipsSubtarget>();
+    UsingConstantPools =
+      (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
   }
 
   virtual const char *getPassName() const {
@@ -57,6 +72,12 @@ public:
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
+  virtual void EmitConstantPool() LLVM_OVERRIDE {
+    if (!UsingConstantPools)
+      AsmPrinter::EmitConstantPool();
+    // we emit constant pools customly!
+  }
+
   void EmitInstruction(const MachineInstr *MI);
   void printSavedRegsBitmask(raw_ostream &O);
   void printHex32(unsigned int Value, raw_ostream &O);
@@ -75,13 +96,13 @@ public:
                              raw_ostream &O);
   void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
+  void printUnsignedImm8(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
                        const char *Modifier = 0);
   void EmitStartOfAsmFile(Module &M);
   void EmitEndOfAsmFile(Module &M);
-  virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
 };
 }
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 462def7..66391cb 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -26,8 +26,10 @@ def RetCC_MipsO32 : CallingConv<[
   // f32 are returned in registers F0, F2
   CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
 
-  // f64 are returned in register D0, D1
-  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0, D1]>>>
+  // f64 arguments are returned in D0_64 and D1_64 in FP64bit mode or
+  // in D0 and D1 in FP32bit mode.
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()", CCAssignToReg<[D0_64, D1_64]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()", CCAssignToReg<[D0, D1]>>>
 ]>;
 
 //===----------------------------------------------------------------------===//
@@ -149,7 +151,16 @@ def RetCC_MipsEABI : CallingConv<[
 //===----------------------------------------------------------------------===//
 def CC_MipsO32_FastCC : CallingConv<[
   // f64 arguments are passed in double-precision floating pointer registers.
-  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7, D8, D9]>>,
+  CCIfType<[f64], CCIfSubtarget<"isNotFP64bit()",
+                                CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7,
+                                               D8, D9]>>>,
+  CCIfType<[f64], CCIfSubtarget<"isFP64bit()",
+                                CCAssignToReg<[D0_64, D1_64, D2_64, D3_64,
+                                               D4_64, D5_64, D6_64, D7_64,
+                                               D8_64, D9_64, D10_64, D11_64,
+                                               D12_64, D13_64, D14_64, D15_64,
+                                               D16_64, D17_64, D18_64,
+                                               D19_64]>>>,
 
   // Stack parameter slots for f64 are 64-bit doublewords and 8-byte aligned.
   CCIfType<[f64], CCAssignToStack<8, 8>>
@@ -196,6 +207,13 @@ def CC_Mips_FastCC : CallingConv<[
   CCDelegateTo<CC_MipsN_FastCC>
 ]>;
 
+//==
+
+def CC_Mips16RetHelper : CallingConv<[
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[V0, V1, A0, A1]>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // Mips Calling Convention Dispatch
 //===----------------------------------------------------------------------===//
@@ -217,9 +235,15 @@ def CSR_SingleFloatOnly : CalleeSavedRegs<(add (sequence "F%u", 31, 20), RA, FP,
 def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
                                    (sequence "S%u", 7, 0))>;
 
+def CSR_O32_FP64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 20), RA, FP,
+                                        (sequence "S%u", 7, 0))>;
+
 def CSR_N32 : CalleeSavedRegs<(add D31_64, D29_64, D27_64, D25_64, D24_64,
                                    D23_64, D22_64, D21_64, RA_64, FP_64, GP_64,
                                    (sequence "S%u_64", 7, 0))>;
 
 def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
                                    GP_64, (sequence "S%u_64", 7, 0))>;
+
+def CSR_Mips16RetHelper :
+  CalleeSavedRegs<(add V0, V1, (sequence "A%u", 3, 0), S0, S1)>;
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index 3fc402b..ca4163d 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -65,8 +65,7 @@ class MipsCodeEmitter : public MachineFunctionPass {
 
 public:
   MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-    : MachineFunctionPass(ID), JTI(0),
-      II((const MipsInstrInfo *) tm.getInstrInfo()), TD(tm.getDataLayout()),
+    : MachineFunctionPass(ID), JTI(0), II(0), TD(0),
       TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
       IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
 
@@ -106,11 +105,16 @@ private:
                          const MachineOperand &MO) const;
 
   unsigned getJumpTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getJumpTargetOpValueMM(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getBranchTargetOpValueMM(const MachineInstr &MI,
+                                    unsigned OpNo) const;
 
   unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
 
   void emitGlobalAddressUnaligned(const GlobalValue *GV, unsigned Reloc,
                                   int Offset) const;
@@ -187,6 +191,18 @@ unsigned MipsCodeEmitter::getJumpTargetOpValue(const MachineInstr &MI,
   return 0;
 }
 
+unsigned MipsCodeEmitter::getJumpTargetOpValueMM(const MachineInstr &MI,
+                                                 unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTargetOpValueMM(const MachineInstr &MI,
+                                                   unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
                                                  unsigned OpNo) const {
   MachineOperand MO = MI.getOperand(OpNo);
@@ -202,6 +218,12 @@ unsigned MipsCodeEmitter::getMemEncoding(const MachineInstr &MI,
   return (getMachineOpValue(MI, MI.getOperand(OpNo+1)) & 0xFFFF) | RegBits;
 }
 
+unsigned MipsCodeEmitter::getMemEncodingMMImm12(const MachineInstr &MI,
+                                                unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 unsigned MipsCodeEmitter::getSizeExtEncoding(const MachineInstr &MI,
                                              unsigned OpNo) const {
   // size is encoded as size-1.
@@ -215,6 +237,12 @@ unsigned MipsCodeEmitter::getSizeInsEncoding(const MachineInstr &MI,
          getMachineOpValue(MI, MI.getOperand(OpNo)) - 1;
 }
 
+unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI,
+                                            unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 /// getMachineOpValue - Return binary encoding of operand. If the machine
 /// operand requires relocation, record the relocation and return zero.
 unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
@@ -317,6 +345,14 @@ bool MipsCodeEmitter::expandPseudos(MachineBasicBlock::instr_iterator &MI,
     BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::SLL), Mips::ZERO)
       .addReg(Mips::ZERO).addImm(0);
     break;
+  case Mips::B:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BEQ)).addReg(Mips::ZERO)
+      .addReg(Mips::ZERO).addOperand(MI->getOperand(0));
+    break;
+  case Mips::TRAP:
+    BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::BREAK)).addImm(0)
+      .addImm(0);
+    break;
   case Mips::JALRPseudo:
     BuildMI(MBB, &*MI, MI->getDebugLoc(), II->get(Mips::JALR), Mips::RA)
       .addReg(MI->getOperand(0).getReg());
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 42e4c99..2de1430 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -16,15 +16,15 @@
 // MipsISelLowering::EmitInstrWithCustomInserter if target does not have
 // conditional move instructions.
 // cond:int, data:int
-class CMov_I_I_FT<string opstr, RegisterClass CRC, RegisterClass DRC,
+class CMov_I_I_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
                   InstrItinClass Itin> :
   InstSE<(outs DRC:$rd), (ins DRC:$rs, CRC:$rt, DRC:$F),
-         !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR> {
+         !strconcat(opstr, "\t$rd, $rs, $rt"), [], Itin, FrmFR, opstr> {
   let Constraints = "$F = $rd";
 }
 
 // cond:int, data:float
-class CMov_I_F_FT<string opstr, RegisterClass CRC, RegisterClass DRC,
+class CMov_I_F_FT<string opstr, RegisterOperand CRC, RegisterOperand DRC,
                   InstrItinClass Itin> :
   InstSE<(outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F),
          !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR> {
@@ -32,22 +32,22 @@ class CMov_I_F_FT<string opstr, RegisterClass CRC, RegisterClass DRC,
 }
 
 // cond:float, data:int
-class CMov_F_I_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+class CMov_F_I_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
                   SDPatternOperator OpNode = null_frag> :
-  InstSE<(outs RC:$rd), (ins RC:$rs, RC:$F),
-         !strconcat(opstr, "\t$rd, $rs, $$fcc0"),
-         [(set RC:$rd, (OpNode RC:$rs, RC:$F))], Itin, FrmFR> {
-  let Uses = [FCR31];
+  InstSE<(outs RC:$rd), (ins RC:$rs, FCCRegsOpnd:$fcc, RC:$F),
+         !strconcat(opstr, "\t$rd, $rs, $fcc"),
+         [(set RC:$rd, (OpNode RC:$rs, FCCRegsOpnd:$fcc, RC:$F))],
+         Itin, FrmFR, opstr> {
   let Constraints = "$F = $rd";
 }
 
 // cond:float, data:float
-class CMov_F_F_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+class CMov_F_F_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
                   SDPatternOperator OpNode = null_frag> :
-  InstSE<(outs RC:$fd), (ins RC:$fs, RC:$F),
-         !strconcat(opstr, "\t$fd, $fs, $$fcc0"),
-         [(set RC:$fd, (OpNode RC:$fs, RC:$F))], Itin, FrmFR> {
-  let Uses = [FCR31];
+  InstSE<(outs RC:$fd), (ins RC:$fs, FCCRegsOpnd:$fcc, RC:$F),
+         !strconcat(opstr, "\t$fd, $fs, $fcc"),
+         [(set RC:$fd, (OpNode RC:$fs, FCCRegsOpnd:$fcc, RC:$F))],
+         Itin, FrmFR> {
   let Constraints = "$F = $fd";
 }
 
@@ -103,151 +103,143 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
 }
 
 // Instantiation of instructions.
-def MOVZ_I_I : CMov_I_I_FT<"movz", CPURegs, CPURegs, NoItinerary>,
+def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, IIArith>,
                ADD_FM<0, 0xa>;
-let Predicates = [HasStdEnc],
-                  DecoderNamespace = "Mips64" in {
-  def MOVZ_I_I64   : CMov_I_I_FT<"movz", CPURegs, CPU64Regs, NoItinerary>,
+
+let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
+  def MOVZ_I_I64   : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, IIArith>,
+                     ADD_FM<0, 0xa>;
+  def MOVZ_I64_I   : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, IIArith>,
+                     ADD_FM<0, 0xa>;
+  def MOVZ_I64_I64 : CMov_I_I_FT<"movz", GPR64Opnd, GPR64Opnd, IIArith>,
                      ADD_FM<0, 0xa>;
-  def MOVZ_I64_I   : CMov_I_I_FT<"movz", CPU64Regs, CPURegs, NoItinerary>,
-                     ADD_FM<0, 0xa> {
-    let isCodeGenOnly = 1;
-  }
-  def MOVZ_I64_I64 : CMov_I_I_FT<"movz", CPU64Regs, CPU64Regs, NoItinerary>,
-                     ADD_FM<0, 0xa> {
-    let isCodeGenOnly = 1;
-  }
 }
 
-def MOVN_I_I       : CMov_I_I_FT<"movn", CPURegs, CPURegs, NoItinerary>,
+def MOVN_I_I       : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, IIArith>,
                      ADD_FM<0, 0xb>;
-let Predicates = [HasStdEnc],
-                  DecoderNamespace = "Mips64" in {
-  def MOVN_I_I64   : CMov_I_I_FT<"movn", CPURegs, CPU64Regs, NoItinerary>,
+
+let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
+  def MOVN_I_I64   : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, IIArith>,
+                     ADD_FM<0, 0xb>;
+  def MOVN_I64_I   : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, IIArith>,
+                     ADD_FM<0, 0xb>;
+  def MOVN_I64_I64 : CMov_I_I_FT<"movn", GPR64Opnd, GPR64Opnd, IIArith>,
                      ADD_FM<0, 0xb>;
-  def MOVN_I64_I   : CMov_I_I_FT<"movn", CPU64Regs, CPURegs, NoItinerary>,
-                     ADD_FM<0, 0xb> {
-    let isCodeGenOnly = 1;
-  }
-  def MOVN_I64_I64 : CMov_I_I_FT<"movn", CPU64Regs, CPU64Regs, NoItinerary>,
-                     ADD_FM<0, 0xb> {
-    let isCodeGenOnly = 1;
-  }
 }
 
-def MOVZ_I_S : CMov_I_F_FT<"movz.s", CPURegs, FGR32, IIFmove>,
+def MOVZ_I_S : CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, IIFmove>,
                CMov_I_F_FM<18, 16>;
-def MOVZ_I64_S : CMov_I_F_FT<"movz.s", CPU64Regs, FGR32, IIFmove>,
-                 CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]> {
-  let DecoderNamespace = "Mips64";
-}
 
-def MOVN_I_S : CMov_I_F_FT<"movn.s", CPURegs, FGR32, IIFmove>,
+let isCodeGenOnly = 1 in
+def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, IIFmove>,
+                 CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]>;
+
+def MOVN_I_S : CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, IIFmove>,
                CMov_I_F_FM<19, 16>;
-def MOVN_I64_S : CMov_I_F_FT<"movn.s", CPU64Regs, FGR32, IIFmove>,
-                 CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]> {
-  let DecoderNamespace = "Mips64";
-}
+
+let isCodeGenOnly = 1 in
+def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, IIFmove>,
+                 CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", CPURegs, AFGR64, IIFmove>,
+  def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
                    CMov_I_F_FM<18, 17>;
-  def MOVN_I_D32 : CMov_I_F_FT<"movn.d", CPURegs, AFGR64, IIFmove>,
+  def MOVN_I_D32 : CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd, IIFmove>,
                    CMov_I_F_FM<19, 17>;
 }
-let Predicates = [IsFP64bit, HasStdEnc],
-                  DecoderNamespace = "Mips64" in {
-  def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", CPURegs, FGR64, IIFmove>,
+
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, IIFmove>,
                    CMov_I_F_FM<18, 17>;
-  def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", CPU64Regs, FGR64, IIFmove>,
-                     CMov_I_F_FM<18, 17> {
-    let isCodeGenOnly = 1;
-  }
-  def MOVN_I_D64 : CMov_I_F_FT<"movn.d", CPURegs, FGR64, IIFmove>,
+  def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, IIFmove>,
                    CMov_I_F_FM<19, 17>;
-  def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", CPU64Regs, FGR64, IIFmove>,
-                     CMov_I_F_FM<19, 17> {
-    let isCodeGenOnly = 1;
+  let isCodeGenOnly = 1 in {
+    def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd,
+                                   IIFmove>, CMov_I_F_FM<18, 17>;
+    def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd,
+                                   IIFmove>, CMov_I_F_FM<19, 17>;
   }
 }
 
-def MOVT_I : CMov_F_I_FT<"movt", CPURegs, IIAlu, MipsCMovFP_T>, CMov_F_I_FM<1>;
-def MOVT_I64 : CMov_F_I_FT<"movt", CPU64Regs, IIAlu, MipsCMovFP_T>,
-               CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]> {
-  let DecoderNamespace = "Mips64";
-}
+def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, IIArith, MipsCMovFP_T>,
+             CMov_F_I_FM<1>;
 
-def MOVF_I : CMov_F_I_FT<"movf", CPURegs, IIAlu, MipsCMovFP_F>, CMov_F_I_FM<0>;
-def MOVF_I64 : CMov_F_I_FT<"movf", CPU64Regs, IIAlu, MipsCMovFP_F>,
-               CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]> {
-  let DecoderNamespace = "Mips64";
-}
+let isCodeGenOnly = 1 in
+def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, IIArith, MipsCMovFP_T>,
+               CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]>;
+
+def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, IIArith, MipsCMovFP_F>,
+             CMov_F_I_FM<0>;
 
-def MOVT_S : CMov_F_F_FT<"movt.s", FGR32, IIFmove, MipsCMovFP_T>,
+let isCodeGenOnly = 1 in
+def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, IIArith, MipsCMovFP_F>,
+               CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]>;
+
+def MOVT_S : CMov_F_F_FT<"movt.s", FGR32Opnd, IIFmove, MipsCMovFP_T>,
              CMov_F_F_FM<16, 1>;
-def MOVF_S : CMov_F_F_FT<"movf.s", FGR32, IIFmove, MipsCMovFP_F>,
+def MOVF_S : CMov_F_F_FT<"movf.s", FGR32Opnd, IIFmove, MipsCMovFP_F>,
              CMov_F_F_FM<16, 0>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64, IIFmove, MipsCMovFP_T>,
+  def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64Opnd, IIFmove, MipsCMovFP_T>,
                  CMov_F_F_FM<17, 1>;
-  def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64, IIFmove, MipsCMovFP_F>,
+  def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64Opnd, IIFmove, MipsCMovFP_F>,
                  CMov_F_F_FM<17, 0>;
 }
-let Predicates = [IsFP64bit, HasStdEnc],
-    DecoderNamespace = "Mips64" in {
-  def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64, IIFmove, MipsCMovFP_T>,
+
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, IIFmove, MipsCMovFP_T>,
                  CMov_F_F_FM<17, 1>;
-  def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64, IIFmove, MipsCMovFP_F>,
+  def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, IIFmove, MipsCMovFP_F>,
                  CMov_F_F_FM<17, 0>;
 }
 
 // Instantiation of conditional move patterns.
-defm : MovzPats0<CPURegs, CPURegs, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>;
-defm : MovzPats1<CPURegs, CPURegs, MOVZ_I_I, XOR>;
-defm : MovzPats2<CPURegs, CPURegs, MOVZ_I_I, XORi>;
+defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>;
+defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>;
+defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>;
 let Predicates = [HasMips64, HasStdEnc] in {
-  defm : MovzPats0<CPURegs, CPU64Regs, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>;
-  defm : MovzPats0<CPU64Regs, CPURegs, MOVZ_I_I, SLT64, SLTu64, SLTi64,
+  defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>;
+  defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64,
                    SLTiu64>;
-  defm : MovzPats0<CPU64Regs, CPU64Regs, MOVZ_I_I64, SLT64, SLTu64, SLTi64,
+  defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64,
                    SLTiu64>;
-  defm : MovzPats1<CPURegs, CPU64Regs, MOVZ_I_I64, XOR>;
-  defm : MovzPats1<CPU64Regs, CPURegs, MOVZ_I64_I, XOR64>;
-  defm : MovzPats1<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XOR64>;
-  defm : MovzPats2<CPURegs, CPU64Regs, MOVZ_I_I64, XORi>;
-  defm : MovzPats2<CPU64Regs, CPURegs, MOVZ_I64_I, XORi64>;
-  defm : MovzPats2<CPU64Regs, CPU64Regs, MOVZ_I64_I64, XORi64>;
+  defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>;
+  defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>;
+  defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>;
+  defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>;
+  defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>;
+  defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>;
 }
 
-defm : MovnPats<CPURegs, CPURegs, MOVN_I_I, XOR>;
+defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>;
 let Predicates = [HasMips64, HasStdEnc] in {
-  defm : MovnPats<CPURegs, CPU64Regs, MOVN_I_I64, XOR>;
-  defm : MovnPats<CPU64Regs, CPURegs, MOVN_I64_I, XOR64>;
-  defm : MovnPats<CPU64Regs, CPU64Regs, MOVN_I64_I64, XOR64>;
+  defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>;
+  defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>;
+  defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>;
 }
 
-defm : MovzPats0<CPURegs, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>;
-defm : MovzPats1<CPURegs, FGR32, MOVZ_I_S, XOR>;
-defm : MovnPats<CPURegs, FGR32, MOVN_I_S, XOR>;
+defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>;
+defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>;
+defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>;
 let Predicates = [HasMips64, HasStdEnc] in {
-  defm : MovzPats0<CPU64Regs, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64,
+  defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64,
                    SLTiu64>;
-  defm : MovzPats1<CPU64Regs, FGR32, MOVZ_I64_S, XOR64>;
-  defm : MovnPats<CPU64Regs, FGR32, MOVN_I64_S, XOR64>;
+  defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>;
+  defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>;
 }
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  defm : MovzPats0<CPURegs, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>;
-  defm : MovzPats1<CPURegs, AFGR64, MOVZ_I_D32, XOR>;
-  defm : MovnPats<CPURegs, AFGR64, MOVN_I_D32, XOR>;
+  defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>;
+  defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>;
+  defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>;
 }
 let Predicates = [IsFP64bit, HasStdEnc] in {
-  defm : MovzPats0<CPURegs, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>;
-  defm : MovzPats0<CPU64Regs, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64,
+  defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>;
+  defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64,
                    SLTiu64>;
-  defm : MovzPats1<CPURegs, FGR64, MOVZ_I_D64, XOR>;
-  defm : MovzPats1<CPU64Regs, FGR64, MOVZ_I64_D64, XOR64>;
-  defm : MovnPats<CPURegs, FGR64, MOVN_I_D64, XOR>;
-  defm : MovnPats<CPU64Regs, FGR64, MOVN_I64_D64, XOR64>;
+  defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>;
+  defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>;
+  defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>;
+  defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>;
 }
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index 1951324..c46bbac 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -9,9 +9,7 @@
 //
 //
 // This pass is used to make Pc relative loads of constants.
-// For now, only Mips16 will use this. While it has the same name and
-// uses many ideas from the LLVM ARM Constant Island Pass, it's not intended
-// to reuse any of the code from the ARM version.
+// For now, only Mips16 will use this. 
 //
 // Loading constants inline is expensive on Mips16 and it's in general better
 // to place the constant nearby in code space and then it can be loaded with a
@@ -27,32 +25,244 @@
 
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
+#include "Mips16InstrInfo.h"
+#include "MipsMachineFunction.h"
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
 
 using namespace llvm;
 
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+
+// FIXME: This option should be removed once it has received sufficient testing.
+static cl::opt<bool>
+AlignConstantIslands("mips-align-constant-islands", cl::Hidden, cl::init(true),
+          cl::desc("Align constant islands in code"));
+
+
+// Rather than do make check tests with huge amounts of code, we force
+// the test to use this amount.
+//
+static cl::opt<int> ConstantIslandsSmallOffset(
+  "mips-constant-islands-small-offset",
+  cl::init(0),
+  cl::desc("Make small offsets be this amount for testing purposes"),
+  cl::Hidden);
+
+//
+// For testing purposes we tell it to not use relaxed load forms so that it
+// will split blocks.
+//
+static cl::opt<bool> NoLoadRelaxation(
+  "mips-constant-islands-no-load-relaxation",
+  cl::init(false),
+  cl::desc("Don't relax loads to long loads - for testing purposes"),
+  cl::Hidden);
+
+
 namespace {
+
+
   typedef MachineBasicBlock::iterator Iter;
   typedef MachineBasicBlock::reverse_iterator ReverseIter;
 
+  /// MipsConstantIslands - Due to limited PC-relative displacements, Mips
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+
   class MipsConstantIslands : public MachineFunctionPass {
 
+    /// BasicBlockInfo - Information about the offset and size of a single
+    /// basic block.
+    struct BasicBlockInfo {
+      /// Offset - Distance from the beginning of the function to the beginning
+      /// of this basic block.
+      ///
+      /// Offsets are computed assuming worst case padding before an aligned
+      /// block. This means that subtracting basic block offsets always gives a
+      /// conservative estimate of the real distance which may be smaller.
+      ///
+      /// Because worst case padding is used, the computed offset of an aligned
+      /// block may not actually be aligned.
+      unsigned Offset;
+
+      /// Size - Size of the basic block in bytes.  If the block contains
+      /// inline assembly, this is a worst case estimate.
+      ///
+      /// The size does not include any alignment padding whether from the
+      /// beginning of the block, or from an aligned jump table at the end.
+      unsigned Size;
+
+      // FIXME: ignore LogAlign for this patch
+      //
+      unsigned postOffset(unsigned LogAlign = 0) const {
+        unsigned PO = Offset + Size;
+        return PO;
+      }
+
+      BasicBlockInfo() : Offset(0), Size(0) {}
+
+    };
+
+    std::vector<BasicBlockInfo> BBInfo;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+    private:
+      unsigned MaxDisp;
+      unsigned LongFormMaxDisp; // mips16 has 16/32 bit instructions
+                                // with different displacements
+      unsigned LongFormOpcode;
+    public:
+      bool NegOk;
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg,
+             unsigned longformmaxdisp, unsigned longformopcode)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp),
+          LongFormMaxDisp(longformmaxdisp), LongFormOpcode(longformopcode),
+          NegOk(neg){
+        HighWaterMark = CPEMI->getParent();
+      }
+      /// getMaxDisp - Returns the maximum displacement supported by MI.
+      unsigned getMaxDisp() const {
+        unsigned xMaxDisp = ConstantIslandsSmallOffset?
+                            ConstantIslandsSmallOffset: MaxDisp;
+        return xMaxDisp;
+      }
+      void setMaxDisp(unsigned val) {
+        MaxDisp = val;
+      }
+      unsigned getLongFormMaxDisp() const {
+        return LongFormMaxDisp;
+      }
+      unsigned getLongFormOpcode() const {
+          return LongFormOpcode;
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+  /// CPEntry - One per constant pool entry, keeping the machine instruction
+  /// pointer, the constpool index, and the number of CPUser's which
+  /// reference this entry.
+  struct CPEntry {
+    MachineInstr *CPEMI;
+    unsigned CPI;
+    unsigned RefCount;
+    CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+      : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+  };
+
+  /// CPEntries - Keep track of all of the constant pool entry machine
+  /// instructions. For each original constpool index (i.e. those that
+  /// existed upon entry to this pass), it keeps a vector of entries.
+  /// Original elements are cloned as we go along; the clones are
+  /// put in the vector of the original element, but have distinct CPIs.
+  std::vector<std::vector<CPEntry> > CPEntries;
+
+  /// ImmBranch - One per immediate branch, keeping the machine instruction
+  /// pointer, conditional or unconditional, the max displacement,
+  /// and (if isCond is true) the corresponding unconditional branch
+  /// opcode.
+  struct ImmBranch {
+    MachineInstr *MI;
+    unsigned MaxDisp : 31;
+    bool isCond : 1;
+    int UncondBr;
+    ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+      : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+  };
+
+  /// ImmBranches - Keep track of all the immediate branch instructions.
+  ///
+  std::vector<ImmBranch> ImmBranches;
+
+  /// HasFarJump - True if any far jump instruction has been emitted during
+  /// the branch fix up pass.
+  bool HasFarJump;
+
+  const TargetMachine &TM;
+  bool IsPIC;
+  unsigned ABI;
+  const MipsSubtarget *STI;
+  const Mips16InstrInfo *TII;
+  MipsFunctionInfo *MFI;
+  MachineFunction *MF;
+  MachineConstantPool *MCP;
+
+  unsigned PICLabelUId;
+  bool PrescannedForConstants;
+
+  void initPICLabelUId(unsigned UId) {
+    PICLabelUId = UId;
+  }
+
+
+  unsigned createPICLabelUId() {
+    return PICLabelUId++;
+  }
+
   public:
     static char ID;
     MipsConstantIslands(TargetMachine &tm)
       : MachineFunctionPass(ID), TM(tm),
-        TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_),
-        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()) {}
+        ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+        STI(&TM.getSubtarget<MipsSubtarget>()), MF(0), MCP(0),
+        PrescannedForConstants(false){}
 
     virtual const char *getPassName() const {
       return "Mips Constant Islands";
@@ -60,30 +270,1264 @@ namespace {
 
     bool runOnMachineFunction(MachineFunction &F);
 
-  private:
+    void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    unsigned getCPELogAlign(const MachineInstr *CPEMI);
+    void initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs);
+    unsigned getOffsetOf(MachineInstr *MI) const;
+    unsigned getUserOffset(CPUser&) const;
+    void dumpBBs();
+    void verify();
+
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK);
+    bool isOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                         const CPUser &U);
+
+    bool isLongFormOffsetInRange(unsigned UserOffset, unsigned TrialOffset,
+                                const CPUser &U);
 
+    void computeBlockSize(MachineBasicBlock *MBB);
+    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
+    void updateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
+    bool decrementCPEReferenceCount(unsigned CPI, MachineInstr* CPEMI);
+    int findInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    int findLongFormInRangeCPEntry(CPUser& U, unsigned UserOffset);
+    bool findAvailableWater(CPUser&U, unsigned UserOffset,
+                            water_iterator &WaterIter);
+    void createNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool handleConstantPoolUser(unsigned CPUserIndex);
+    void removeDeadCPEMI(MachineInstr *CPEMI);
+    bool removeUnusedCPEntries();
+    bool isCPEntryInRange(MachineInstr *MI, unsigned UserOffset,
+                          MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                          bool DoDump = false);
+    bool isWaterInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U, unsigned &Growth);
+    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool fixupImmediateBr(ImmBranch &Br);
+    bool fixupConditionalBr(ImmBranch &Br);
+    bool fixupUnconditionalBr(ImmBranch &Br);
 
-    const TargetMachine &TM;
-    const MipsInstrInfo *TII;
-    bool IsPIC;
-    unsigned ABI;
+    void prescanForConstants();
+
+  private:
 
   };
 
   char MipsConstantIslands::ID = 0;
 } // end of anonymous namespace
 
+
+bool MipsConstantIslands::isLongFormOffsetInRange
+  (unsigned UserOffset, unsigned TrialOffset,
+   const CPUser &U) {
+  return isOffsetInRange(UserOffset, TrialOffset,
+                         U.getLongFormMaxDisp(), U.NegOk);
+}
+
+bool MipsConstantIslands::isOffsetInRange
+  (unsigned UserOffset, unsigned TrialOffset,
+   const CPUser &U) {
+  return isOffsetInRange(UserOffset, TrialOffset,
+                         U.getMaxDisp(), U.NegOk);
+}
+/// print block size and offset information - debugging
+void MipsConstantIslands::dumpBBs() {
+  DEBUG({
+    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
+      const BasicBlockInfo &BBI = BBInfo[J];
+      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
+             << format(" size=%#x\n", BBInfo[J].Size);
+    }
+  });
+}
 /// createMipsLongBranchPass - Returns a pass that converts branches to long
 /// branches.
 FunctionPass *llvm::createMipsConstantIslandPass(MipsTargetMachine &tm) {
   return new MipsConstantIslands(tm);
 }
 
-bool MipsConstantIslands::runOnMachineFunction(MachineFunction &F) {
+bool MipsConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   // The intention is for this to be a mips16 only pass for now
   // FIXME:
-  // if (!TM.getSubtarget<MipsSubtarget>().inMips16Mode())
-  //  return false;
+  MF = &mf;
+  MCP = mf.getConstantPool();
+  DEBUG(dbgs() << "constant island machine function " << "\n");
+  if (!TM.getSubtarget<MipsSubtarget>().inMips16Mode() ||
+      !MipsSubtarget::useConstantIslands()) {
+    return false;
+  }
+  TII = (const Mips16InstrInfo*)MF->getTarget().getInstrInfo();
+  MFI = MF->getInfo<MipsFunctionInfo>();
+  DEBUG(dbgs() << "constant island processing " << "\n");
+  //
+  // will need to make predermination if there is any constants we need to
+  // put in constant islands. TBD.
+  //
+  if (!PrescannedForConstants) prescanForConstants();
+
+  HasFarJump = false;
+  // This pass invalidates liveness information when it splits basic blocks.
+  MF->getRegInfo().invalidateLiveness();
+
+  // Renumber all of the machine basic blocks in the function, guaranteeing that
+  // the numbers agree with the position of the block in the function.
+  MF->RenumberBlocks();
+
+  bool MadeChange = false;
+
+  // Perform the initial placement of the constant pool entries.  To start with,
+  // we put them all at the end of the function.
+  std::vector<MachineInstr*> CPEMIs;
+  if (!MCP->isEmpty())
+    doInitialPlacement(CPEMIs);
+
+  /// The next UID to take is the first unused one.
+  initPICLabelUId(CPEMIs.size());
+
+  // Do the initial scan of the function, building up information about the
+  // sizes of each block, the location of all the water, and finding all of the
+  // constant pool users.
+  initializeFunctionInfo(CPEMIs);
+  CPEMIs.clear();
+  DEBUG(dumpBBs());
+
+  /// Remove dead constant pool entries.
+  MadeChange |= removeUnusedCPEntries();
+
+  // Iteratively place constant pool entries and fix up branches until there
+  // is no change.
+  unsigned NoCPIters = 0, NoBRIters = 0;
+  (void)NoBRIters;
+  while (true) {
+    DEBUG(dbgs() << "Beginning CP iteration #" << NoCPIters << '\n');
+    bool CPChange = false;
+    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
+      CPChange |= handleConstantPoolUser(i);
+    if (CPChange && ++NoCPIters > 30)
+      report_fatal_error("Constant Island pass failed to converge!");
+    DEBUG(dumpBBs());
+
+    // Clear NewWaterList now.  If we split a block for branches, it should
+    // appear as "new water" for the next iteration of constant pool placement.
+    NewWaterList.clear();
+
+    DEBUG(dbgs() << "Beginning BR iteration #" << NoBRIters << '\n');
+    bool BRChange = false;
+    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
+      BRChange |= fixupImmediateBr(ImmBranches[i]);
+    if (BRChange && ++NoBRIters > 30)
+      report_fatal_error("Branch Fix Up pass failed to converge!");
+    DEBUG(dumpBBs());
+    if (!CPChange && !BRChange)
+      break;
+    MadeChange = true;
+  }
+
+  DEBUG(dbgs() << '\n'; dumpBBs());
+
+  BBInfo.clear();
+  WaterList.clear();
+  CPUsers.clear();
+  CPEntries.clear();
+  ImmBranches.clear();
+  return MadeChange;
+}
+
+/// doInitialPlacement - Perform the initial placement of the constant pool
+/// entries.  To start with, we put them all at the end of the function.
+void
+MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
+  // Create the basic block to hold the CPE's.
+  MachineBasicBlock *BB = MF->CreateMachineBasicBlock();
+  MF->push_back(BB);
+
+
+  // MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
+  unsigned MaxAlign = Log2_32(MCP->getConstantPoolAlignment());
+
+  // Mark the basic block as required by the const-pool.
+  // If AlignConstantIslands isn't set, use 4-byte alignment for everything.
+  BB->setAlignment(AlignConstantIslands ? MaxAlign : 2);
+
+  // The function needs to be as aligned as the basic blocks. The linker may
+  // move functions around based on their alignment.
+  MF->ensureAlignment(BB->getAlignment());
+
+  // Order the entries in BB by descending alignment.  That ensures correct
+  // alignment of all entries as long as BB is sufficiently aligned.  Keep
+  // track of the insertion point for each alignment.  We are going to bucket
+  // sort the entries as they are created.
+  SmallVector<MachineBasicBlock::iterator, 8> InsPoint(MaxAlign + 1, BB->end());
+
+  // Add all of the constants from the constant pool to the end block, use an
+  // identity mapping of CPI's to CPE's.
+  const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
+
+  const DataLayout &TD = *MF->getTarget().getDataLayout();
+  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
+    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
+    assert(Size >= 4 && "Too small constant pool entry");
+    unsigned Align = CPs[i].getAlignment();
+    assert(isPowerOf2_32(Align) && "Invalid alignment");
+    // Verify that all constant pool entries are a multiple of their alignment.
+    // If not, we would have to pad them out so that instructions stay aligned.
+    assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+
+    // Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
+    unsigned LogAlign = Log2_32(Align);
+    MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
+
+    MachineInstr *CPEMI =
+      BuildMI(*BB, InsAt, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+        .addImm(i).addConstantPoolIndex(i).addImm(Size);
+
+    CPEMIs.push_back(CPEMI);
+
+    // Ensure that future entries with higher alignment get inserted before
+    // CPEMI. This is bucket sort with iterators.
+    for (unsigned a = LogAlign + 1; a <= MaxAlign; ++a)
+      if (InsPoint[a] == InsAt)
+        InsPoint[a] = CPEMI;
+    // Add a new CPEntry, but no corresponding CPUser yet.
+    std::vector<CPEntry> CPEs;
+    CPEs.push_back(CPEntry(CPEMI, i));
+    CPEntries.push_back(CPEs);
+    ++NumCPEs;
+    DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
+                 << Size << ", align = " << Align <<'\n');
+  }
+  DEBUG(BB->dump());
+}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  // Can't fall off end of function.
+  if (llvm::next(MBBI) == MBB->getParent()->end())
+    return false;
+
+  MachineBasicBlock *NextBB = llvm::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+MipsConstantIslands::CPEntry
+*MipsConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return NULL;
+}
+
+/// getCPELogAlign - Returns the required alignment of the constant pool entry
+/// represented by CPEMI.  Alignment is measured in log2(bytes) units.
+unsigned MipsConstantIslands::getCPELogAlign(const MachineInstr *CPEMI) {
+  assert(CPEMI && CPEMI->getOpcode() == Mips::CONSTPOOL_ENTRY);
+
+  // Everything is 4-byte aligned unless AlignConstantIslands is set.
+  if (!AlignConstantIslands)
+    return 2;
+
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
+  unsigned Align = MCP->getConstants()[CPI].getAlignment();
+  assert(isPowerOf2_32(Align) && "Invalid CPE alignment");
+  return Log2_32(Align);
+}
+
+/// initializeFunctionInfo - Do the initial scan of the function, building up
+/// information about the sizes of each block, the location of all the water,
+/// and finding all of the constant pool users.
+void MipsConstantIslands::
+initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
+  BBInfo.clear();
+  BBInfo.resize(MF->getNumBlockIDs());
+
+  // First thing, compute the size of all basic blocks, and see if the function
+  // has any inline assembly in it. If so, we have to be conservative about
+  // alignment assumptions, as we don't know for sure the size of any
+  // instructions in the inline assembly.
+  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
+    computeBlockSize(I);
+
+
+  // Compute block offsets.
+  adjustBBOffsetsAfter(MF->begin());
+
+  // Now go back through the instructions and build up our data structures.
+  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    // If this block doesn't fall through into the next MBB, then this is
+    // 'water' that a constant pool island could be placed.
+    if (!BBHasFallthrough(&MBB))
+      WaterList.push_back(&MBB);
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      if (I->isDebugValue())
+        continue;
+
+      int Opc = I->getOpcode();
+      if (I->isBranch()) {
+        bool isCond = false;
+        unsigned Bits = 0;
+        unsigned Scale = 1;
+        int UOpc = Opc;
+        switch (Opc) {
+        default:
+          continue;  // Ignore other branches for now
+        case Mips::Bimm16:
+          Bits = 11;
+          Scale = 2;
+          isCond = false;
+          break;
+        case Mips::BimmX16:
+          Bits = 16;
+          Scale = 2;
+          isCond = false;
+        }
+        // Record this immediate branch.
+        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
+      }
+
+      if (Opc == Mips::CONSTPOOL_ENTRY)
+        continue;
+
+
+      // Scan the instructions for constant pool operands.
+      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+        if (I->getOperand(op).isCPI()) {
+
+          // We found one.  The addressing mode tells us the max displacement
+          // from the PC that this instruction permits.
+
+          // Basic size info comes from the TSFlags field.
+          unsigned Bits = 0;
+          unsigned Scale = 1;
+          bool NegOk = false;
+          unsigned LongFormBits = 0;
+          unsigned LongFormScale = 0;
+          unsigned LongFormOpcode = 0;
+          switch (Opc) {
+          default:
+            llvm_unreachable("Unknown addressing mode for CP reference!");
+          case Mips::LwRxPcTcp16:
+            Bits = 8;
+            Scale = 4;
+            LongFormOpcode = Mips::LwRxPcTcpX16;
+            LongFormBits = 16;
+            LongFormScale = 1;
+            break;
+          case Mips::LwRxPcTcpX16:
+            Bits = 16;
+            Scale = 1;
+            NegOk = true;
+            break;
+          }
+          // Remember that this is a user of a CP entry.
+          unsigned CPI = I->getOperand(op).getIndex();
+          MachineInstr *CPEMI = CPEMIs[CPI];
+          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
+          unsigned LongFormMaxOffs = ((1 << LongFormBits)-1) * LongFormScale;
+          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk,
+                                   LongFormMaxOffs, LongFormOpcode));
+
+          // Increment corresponding CPEntry reference count.
+          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+          assert(CPE && "Cannot find a corresponding CPEntry!");
+          CPE->RefCount++;
+
+          // Instructions can only use one CP entry, don't bother scanning the
+          // rest of the operands.
+          break;
+
+        }
+
+    }
+  }
+
+}
+
+/// computeBlockSize - Compute the size and some alignment information for MBB.
+/// This function updates BBInfo directly.
+void MipsConstantIslands::computeBlockSize(MachineBasicBlock *MBB) {
+  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
+  BBI.Size = 0;
+
+  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+       ++I)
+    BBI.Size += TII->GetInstSizeInBytes(I);
+
+}
+
+/// getOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned MipsConstantIslands::getOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+  return Offset;
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// updateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void MipsConstantIslands::updateForInsertedWaterBlock
+  (MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consecutive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+unsigned MipsConstantIslands::getUserOffset(CPUser &U) const {
+  return getOffsetOf(U.MI);
+}
+
+/// Split the basic block containing MI into two blocks, which are joined by
+/// an unconditional branch.  Update data structures and renumber blocks to
+/// account for this change and returns the newly created block.
+MachineBasicBlock *MipsConstantIslands::splitBlockBeforeInstr
+  (MachineInstr *MI) {
+  MachineBasicBlock *OrigBB = MI->getParent();
+
+  // Create a new MBB for the code after the OrigBB.
+  MachineBasicBlock *NewBB =
+    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
+  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
+  MF->insert(MBBI, NewBB);
+
+  // Splice the instructions starting with MI over to NewBB.
+  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
+
+  // Add an unconditional branch from OrigBB to NewBB.
+  // Note the new unconditional branch is not being recorded.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond to anything in the source.
+  BuildMI(OrigBB, DebugLoc(), TII->get(Mips::Bimm16)).addMBB(NewBB);
+  ++NumSplit;
+
+  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
+  NewBB->transferSuccessors(OrigBB);
+
+  // OrigBB branches to NewBB.
+  OrigBB->addSuccessor(NewBB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  // This is almost the same as updateForInsertedWaterBlock, except that
+  // the Water goes after OrigBB, not NewBB.
+  MF->RenumberBlocks(NewBB);
+
+  // Insert an entry into BBInfo to align it properly with the (newly
+  // renumbered) block numbers.
+  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
+
+  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
+  // available water after it (but not if it's already there, which happens
+  // when splitting before a conditional branch that is followed by an
+  // unconditional branch - in that case we want to insert NewBB).
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
+                     CompareMBBNumbers);
+  MachineBasicBlock* WaterBB = *IP;
+  if (WaterBB == OrigBB)
+    WaterList.insert(llvm::next(IP), NewBB);
+  else
+    WaterList.insert(IP, OrigBB);
+  NewWaterList.insert(OrigBB);
+
+  // Figure out how large the OrigBB is.  As the first half of the original
+  // block, it cannot contain a tablejump.  The size includes
+  // the new jump we added.  (It should be possible to do this without
+  // recounting everything, but it's very confusing, and this is rarely
+  // executed.)
+  computeBlockSize(OrigBB);
+
+  // Figure out how large the NewMBB is.  As the second half of the original
+  // block, it may contain a tablejump.
+  computeBlockSize(NewBB);
+
+  // All BBOffsets following these blocks must be modified.
+  adjustBBOffsetsAfter(OrigBB);
+
+  return NewBB;
+}
+
+
+
+/// isOffsetInRange - Checks whether UserOffset (the location of a constant pool
+/// reference) is within MaxDisp of TrialOffset (a proposed location of a
+/// constant pool entry).
+bool MipsConstantIslands::isOffsetInRange(unsigned UserOffset,
+                                         unsigned TrialOffset, unsigned MaxDisp,
+                                         bool NegativeOK) {
+  if (UserOffset <= TrialOffset) {
+    // User before the Trial.
+    if (TrialOffset - UserOffset <= MaxDisp)
+      return true;
+  } else if (NegativeOK) {
+    if (UserOffset - TrialOffset <= MaxDisp)
+      return true;
+  }
+  return false;
+}
+
+/// isWaterInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+///
+/// Compute how much the function will grow by inserting a CPE after Water.
+bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U,
+                                        unsigned &Growth) {
+  unsigned CPELogAlign = getCPELogAlign(U.CPEMI);
+  unsigned CPEOffset = BBInfo[Water->getNumber()].postOffset(CPELogAlign);
+  unsigned NextBlockOffset, NextBlockAlignment;
+  MachineFunction::const_iterator NextBlock = Water;
+  if (++NextBlock == MF->end()) {
+    NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
+    NextBlockAlignment = 0;
+  } else {
+    NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
+    NextBlockAlignment = NextBlock->getAlignment();
+  }
+  unsigned Size = U.CPEMI->getOperand(2).getImm();
+  unsigned CPEEnd = CPEOffset + Size;
+
+  // The CPE may be able to hide in the alignment padding before the next
+  // block. It may also cause more padding to be required if it is more aligned
+  // that the next block.
+  if (CPEEnd > NextBlockOffset) {
+    Growth = CPEEnd - NextBlockOffset;
+    // Compute the padding that would go at the end of the CPE to align the next
+    // block.
+    Growth += OffsetToAlignment(CPEEnd, 1u << NextBlockAlignment);
+
+    // If the CPE is to be inserted before the instruction, that will raise
+    // the offset of the instruction. Also account for unknown alignment padding
+    // in blocks between CPE and the user.
+    if (CPEOffset < UserOffset)
+      UserOffset += Growth;
+  } else
+    // CPE fits in existing padding.
+    Growth = 0;
+
+  return isOffsetInRange(UserOffset, CPEOffset, U);
+}
+
+/// isCPEntryInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool MipsConstantIslands::isCPEntryInRange
+  (MachineInstr *MI, unsigned UserOffset,
+   MachineInstr *CPEMI, unsigned MaxDisp,
+   bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = getOffsetOf(CPEMI);
+
+  if (DoDump) {
+    DEBUG({
+      unsigned Block = MI->getParent()->getNumber();
+      const BasicBlockInfo &BBI = BBInfo[Block];
+      dbgs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+             << " max delta=" << MaxDisp
+             << format(" insn address=%#x", UserOffset)
+             << " in BB#" << Block << ": "
+             << format("%#x-%x\t", BBI.Offset, BBI.postOffset()) << *MI
+             << format("CPE address=%#x offset=%+d: ", CPEOffset,
+                       int(CPEOffset-UserOffset));
+    });
+  }
+
+  return isOffsetInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == Mips::Bimm16)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif
+
+void MipsConstantIslands::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
+  unsigned BBNum = BB->getNumber();
+  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
+    // Get the offset and known bits at the end of the layout predecessor.
+    // Include the alignment of the current block.
+    unsigned Offset = BBInfo[i - 1].Offset + BBInfo[i - 1].Size;
+    BBInfo[i].Offset = Offset;
+  }
+}
+
+/// decrementCPEReferenceCount - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI,
+                                                    MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    removeDeadCPEMI(CPEMI);
+    CPE->CPEMI = NULL;
+    --NumCPEs;
+    return true;
+  }
+  return false;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI, U.getMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    return 1;
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == NULL)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
+                     U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// LookForCPEntryInRange - see if the currently referenced CPE is in range;
+/// This version checks if the longer form of the instruction can be used to
+/// to satisfy things.
+/// if not, see if an in-range clone of the CPE is in range, and if so,
+/// change the data structures so the user references the clone.  Returns:
+/// 0 = no existing entry found
+/// 1 = entry found, and there were no code insertions or deletions
+/// 2 = entry found, and there were code insertions or deletions
+int MipsConstantIslands::findLongFormInRangeCPEntry
+  (CPUser& U, unsigned UserOffset)
+{
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+
+  // Check to see if the CPE is already in-range.
+  if (isCPEntryInRange(UserMI, UserOffset, CPEMI,
+                       U.getLongFormMaxDisp(), U.NegOk,
+                       true)) {
+    DEBUG(dbgs() << "In range\n");
+    UserMI->setDesc(TII->get(U.getLongFormOpcode()));
+    U.setMaxDisp(U.getLongFormMaxDisp());
+    return 2;  // instruction is longer length now
+  }
+
+  // No.  Look for previously created clones of the CPE that are in range.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    // We already tried this one
+    if (CPEs[i].CPEMI == CPEMI)
+      continue;
+    // Removing CPEs can leave empty entries, skip
+    if (CPEs[i].CPEMI == NULL)
+      continue;
+    if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
+                         U.getLongFormMaxDisp(), U.NegOk)) {
+      DEBUG(dbgs() << "Replacing CPE#" << CPI << " with CPE#"
+                   << CPEs[i].CPI << "\n");
+      // Point the CPUser node to the replacement
+      U.CPEMI = CPEs[i].CPEMI;
+      // Change the CPI in the instruction operand to refer to the clone.
+      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
+        if (UserMI->getOperand(j).isCPI()) {
+          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
+          break;
+        }
+      // Adjust the refcount of the clone...
+      CPEs[i].RefCount++;
+      // ...and the original.  If we didn't remove the old entry, none of the
+      // addresses changed, so we don't need another pass.
+      return decrementCPEReferenceCount(CPI, CPEMI) ? 2 : 1;
+    }
+  }
+  return 0;
+}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case Mips::Bimm16:
+    return ((1<<10)-1)*2;
+  case Mips::BimmX16:
+    return ((1<<16)-1)*2;
+  default:
+    break;
+  }
+  return ((1<<16)-1)*2;
+}
+
+/// findAvailableWater - Look for an existing entry in the WaterList in which
+/// we can place the CPE referenced from U so it's within range of U's MI.
+/// Returns true if found, false if not.  If it returns true, WaterIter
+/// is set to the WaterList entry.  
+/// To ensure that this pass
+/// terminates, the CPE location for a particular CPUser is only allowed to
+/// move to a lower address, so search backward from the end of the list and
+/// prefer the first water that is in range.
+bool MipsConstantIslands::findAvailableWater(CPUser &U, unsigned UserOffset,
+                                      water_iterator &WaterIter) {
+  if (WaterList.empty())
+    return false;
+
+  unsigned BestGrowth = ~0u;
+  for (water_iterator IP = prior(WaterList.end()), B = WaterList.begin();;
+       --IP) {
+    MachineBasicBlock* WaterBB = *IP;
+    // Check if water is in range and is either at a lower address than the
+    // current "high water mark" or a new water block that was created since
+    // the previous iteration by inserting an unconditional branch.  In the
+    // latter case, we want to allow resetting the high water mark back to
+    // this new water since we haven't seen it before.  Inserting branches
+    // should be relatively uncommon and when it does happen, we want to be
+    // sure to take advantage of it for all the CPEs near that block, so that
+    // we don't insert more branches than necessary.
+    unsigned Growth;
+    if (isWaterInRange(UserOffset, WaterBB, U, Growth) &&
+        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
+         NewWaterList.count(WaterBB)) && Growth < BestGrowth) {
+      // This is the least amount of required padding seen so far.
+      BestGrowth = Growth;
+      WaterIter = IP;
+      DEBUG(dbgs() << "Found water after BB#" << WaterBB->getNumber()
+                   << " Growth=" << Growth << '\n');
+
+      // Keep looking unless it is perfect.
+      if (BestGrowth == 0)
+        return true;
+    }
+    if (IP == B)
+      break;
+  }
+  return BestGrowth != ~0u;
+}
+
+/// createNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void MipsConstantIslands::createNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPELogAlign = getCPELogAlign(CPEMI);
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  const BasicBlockInfo &UserBBI = BBInfo[UserMBB->getNumber()];
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  
+  if (BBHasFallthrough(UserMBB)) {
+    // Size of branch to insert.
+    unsigned Delta = 2;
+    // Compute the offset where the CPE will begin.
+    unsigned CPEOffset = UserBBI.postOffset(CPELogAlign) + Delta;
+
+    if (isOffsetInRange(UserOffset, CPEOffset, U)) {
+      DEBUG(dbgs() << "Split at end of BB#" << UserMBB->getNumber()
+            << format(", expected CPE offset %#x\n", CPEOffset));
+      NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
+      // Add an unconditional branch from UserMBB to fallthrough block.  Record
+      // it for branch lengthening; this new branch will not get out of range,
+      // but if the preceding conditional branch is out of range, the targets
+      // will be exchanged, and the altered branch may be out of range, so the
+      // machinery has to know about it.
+      int UncondBr = Mips::Bimm16;
+      BuildMI(UserMBB, DebugLoc(), TII->get(UncondBr)).addMBB(NewMBB);
+      unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+      ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                                      MaxDisp, false, UncondBr));
+      BBInfo[UserMBB->getNumber()].Size += Delta;
+      adjustBBOffsetsAfter(UserMBB);
+      return;
+    }
+  }
+
+  // What a big block.  Find a place within the block to split it.  
+
+  // Try to split the block so it's fully aligned.  Compute the latest split
+  // point where we can add a 4-byte branch instruction, and then align to
+  // LogAlign which is the largest possible alignment in the function.
+  unsigned LogAlign = MF->getAlignment();
+  assert(LogAlign >= CPELogAlign && "Over-aligned constant pool entry");
+  unsigned BaseInsertOffset = UserOffset + U.getMaxDisp();
+  DEBUG(dbgs() << format("Split in middle of big block before %#x",
+                         BaseInsertOffset));
+
+  // The 4 in the following is for the unconditional branch we'll be inserting
+  // Alignment of the island is handled
+  // inside isOffsetInRange.
+  BaseInsertOffset -= 4;
+
+  DEBUG(dbgs() << format(", adjusted to %#x", BaseInsertOffset)
+               << " la=" << LogAlign << '\n');
+
+  // This could point off the end of the block if we've already got constant
+  // pool entries following this block; only the last one is in the water list.
+  // Back past any possible branches (allow for a conditional and a maximally
+  // long unconditional).
+  if (BaseInsertOffset + 8 >= UserBBI.postOffset()) {
+    BaseInsertOffset = UserBBI.postOffset() - 8;
+    DEBUG(dbgs() << format("Move inside block: %#x\n", BaseInsertOffset));
+  }
+  unsigned EndInsertOffset = BaseInsertOffset + 4 +
+    CPEMI->getOperand(2).getImm();
+  MachineBasicBlock::iterator MI = UserMI;
+  ++MI;
+  unsigned CPUIndex = CPUserIndex+1;
+  unsigned NumCPUsers = CPUsers.size();
+  //MachineInstr *LastIT = 0;
+  for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+       Offset < BaseInsertOffset;
+       Offset += TII->GetInstSizeInBytes(MI),
+       MI = llvm::next(MI)) {
+    assert(MI != UserMBB->end() && "Fell off end of block");
+    if (CPUIndex < NumCPUsers && CPUsers[CPUIndex].MI == MI) {
+      CPUser &U = CPUsers[CPUIndex];
+      if (!isOffsetInRange(Offset, EndInsertOffset, U)) {
+        // Shift intertion point by one unit of alignment so it is within reach.
+        BaseInsertOffset -= 1u << LogAlign;
+        EndInsertOffset  -= 1u << LogAlign;
+      }
+      // This is overly conservative, as we don't account for CPEMIs being
+      // reused within the block, but it doesn't matter much.  Also assume CPEs
+      // are added in order with alignment padding.  We may eventually be able
+      // to pack the aligned CPEs better.
+      EndInsertOffset += U.CPEMI->getOperand(2).getImm();
+      CPUIndex++;
+    }
+  }
+
+  --MI;
+  NewMBB = splitBlockBeforeInstr(MI);
+}
+
+/// handleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool MipsConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.
+  unsigned UserOffset = getUserOffset(U);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  int result = findInRangeCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (findAvailableWater(U, UserOffset, IP)) {
+    DEBUG(dbgs() << "Found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.erase(WaterBB))
+      NewWaterList.insert(NewIsland);
+
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
+
+  } else {
+    // No water found.
+    // we first see if a longer form of the instrucion could have reached
+    // the constant. in that case we won't bother to split
+    if (!NoLoadRelaxation) {
+      result = findLongFormInRangeCPEntry(U, UserOffset);
+      if (result != 0) return true;
+    }
+    DEBUG(dbgs() << "No water found\n");
+    createNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // splitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF->insert(NewMBB, NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  updateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  decrementCPEReferenceCount(CPI, CPEMI);
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc(), TII->get(Mips::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  ++NumCPEs;
+
+  // Mark the basic block as aligned as required by the const-pool entry.
+  NewIsland->setAlignment(getCPELogAlign(U.CPEMI));
+
+  // Increase the size of the island block to account for the new entry.
+  BBInfo[NewIsland->getNumber()].Size += Size;
+  adjustBBOffsetsAfter(llvm::prior(MachineFunction::iterator(NewIsland)));
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = createPICLabelUId();
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(dbgs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+        << format(" offset=%#x\n", BBInfo[NewIsland->getNumber()].Offset));
+
+  return true;
+}
+
+/// removeDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void MipsConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBInfo[CPEBB->getNumber()].Size -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    BBInfo[CPEBB->getNumber()].Size = 0;
+
+    // This block no longer needs to be aligned.
+    CPEBB->setAlignment(0);
+  } else
+    // Entries are sorted by descending alignment, so realign from the front.
+    CPEBB->setAlignment(getCPELogAlign(CPEBB->begin()));
+
+  adjustBBOffsetsAfter(CPEBB);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// removeUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.
+bool MipsConstantIslands::removeUnusedCPEntries() {
+  unsigned MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+      std::vector<CPEntry> &CPEs = CPEntries[i];
+      for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+        if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+          removeDeadCPEMI(CPEs[j].CPEMI);
+          CPEs[j].CPEMI = NULL;
+          MadeChange = true;
+        }
+      }
+  }
+  return MadeChange;
+}
+
+/// isBBInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool MipsConstantIslands::isBBInRange
+  (MachineInstr *MI,MachineBasicBlock *DestBB, unsigned MaxDisp) {
+
+unsigned PCAdj = 4;
+
+  unsigned BrOffset   = getOffsetOf(MI) + PCAdj;
+  unsigned DestOffset = BBInfo[DestBB->getNumber()].Offset;
+
+  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << getOffsetOf(MI) << " to " << DestOffset
+               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
+
+  if (BrOffset <= DestOffset) {
+    // Branch before the Dest.
+    if (DestOffset-BrOffset <= MaxDisp)
+      return true;
+  } else {
+    if (BrOffset-DestOffset <= MaxDisp)
+      return true;
+  }
   return false;
 }
 
+/// fixupImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool MipsConstantIslands::fixupImmediateBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Check to see if the DestBB is already in-range.
+  if (isBBInRange(MI, DestBB, Br.MaxDisp))
+    return false;
+
+  if (!Br.isCond)
+    return fixupUnconditionalBr(Br);
+  return fixupConditionalBr(Br);
+}
+
+/// fixupUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+MipsConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  // Use BL to implement far jump.
+  Br.MaxDisp = ((1 << 16)-1) * 2;
+  MI->setDesc(TII->get(Mips::BimmX16));
+  BBInfo[MBB->getNumber()].Size += 2;
+  adjustBBOffsetsAfter(MBB);
+  HasFarJump = true;
+  ++NumUBrFixed;
+
+  DEBUG(dbgs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+/// fixupConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+MipsConstantIslands::fixupConditionalBr(ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // blt L1
+  // =>
+  // bge L2
+  // b   L1
+  // L2:
+  unsigned CCReg = 0;  // FIXME
+  unsigned CC=0; //FIXME
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  ++NumCBrFixed;
+  if (BMI != MI) {
+    if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+        BMI->getOpcode() == Br.UncondBr) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (isBBInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        MI->getOperand(0).setMBB(NewDest);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    splitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BBInfo[MBB->getNumber()].Size -= delta;
+    MBB->back().eraseFromParent();
+    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+
+  DEBUG(dbgs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  BuildMI(MBB, DebugLoc(), TII->get(MI->getOpcode()))
+    .addMBB(NextBB).addImm(CC).addReg(CCReg);
+  Br.MI = &MBB->back();
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  BuildMI(MBB, DebugLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBInfo[MI->getParent()->getNumber()].Size -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+  adjustBBOffsetsAfter(MBB);
+  return true;
+}
+
+
+void MipsConstantIslands::prescanForConstants() {
+  unsigned J = 0;
+  (void)J;
+  PrescannedForConstants = true;
+  for (MachineFunction::iterator B =
+         MF->begin(), E = MF->end(); B != E; ++B) {
+    for (MachineBasicBlock::instr_iterator I =
+        B->instr_begin(), EB = B->instr_end(); I != EB; ++I) {
+      switch(I->getDesc().getOpcode()) {
+        case Mips::LwConstant32: {
+          DEBUG(dbgs() << "constant island constant " << *I << "\n");
+          J = I->getNumOperands();
+          DEBUG(dbgs() << "num operands " << J  << "\n");
+          MachineOperand& Literal = I->getOperand(1);
+          if (Literal.isImm()) {
+            int64_t V = Literal.getImm();
+            DEBUG(dbgs() << "literal " << V  << "\n");
+            Type *Int32Ty =
+              Type::getInt32Ty(MF->getFunction()->getContext());
+            const Constant *C = ConstantInt::get(Int32Ty, V);
+            unsigned index = MCP->getConstantPoolIndex(C, 4);
+            I->getOperand(2).ChangeToImmediate(index);
+            DEBUG(dbgs() << "constant island constant " << *I << "\n");
+            I->setDesc(TII->get(Mips::LwRxPcTcp16));
+            I->RemoveOperand(1);
+            I->RemoveOperand(1);
+            I->addOperand(MachineOperand::CreateCPI(index, 0));
+            I->addOperand(MachineOperand::CreateImm(4));
+          }
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+}
+
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
index c12878a..d268384 100644
--- a/lib/Target/Mips/MipsDSPInstrInfo.td
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -256,236 +256,235 @@ class PREPEND_ENC : APPEND_FMT<0b00001>;
 
 // Instruction desc.
 class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                        InstrItinClass itin, RegisterClass RCD,
-                        RegisterClass RCS,  RegisterClass RCT = RCS> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+                        InstrItinClass itin, RegisterOperand ROD,
+                        RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                           InstrItinClass itin, RegisterClass RCD,
-                           RegisterClass RCS = RCD> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs);
+                           InstrItinClass itin, RegisterOperand ROD,
+                           RegisterOperand ROS = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs))];
   InstrItinClass Itinerary = itin;
 }
 
 class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                             InstrItinClass itin, RegisterClass RCS,
-                             RegisterClass RCT = RCS> {
+                             InstrItinClass itin, RegisterOperand ROS,
+                             RegisterOperand ROT = ROS> {
   dag OutOperandList = (outs);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
-  list<dag> Pattern = [(OpNode RCS:$rs, RCT:$rt)];
+  list<dag> Pattern = [(OpNode ROS:$rs, ROT:$rt)];
   InstrItinClass Itinerary = itin;
 }
 
 class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                             InstrItinClass itin, RegisterClass RCD,
-                             RegisterClass RCS,  RegisterClass RCT = RCS> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROS,  RegisterOperand ROT = ROS> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                               InstrItinClass itin, RegisterClass RCT,
-                               RegisterClass RCS = RCT> {
-  dag OutOperandList = (outs RCT:$rt);
-  dag InOperandList = (ins RCS:$rs, shamt:$sa, RCS:$src);
+                               InstrItinClass itin, RegisterOperand ROT,
+                               RegisterOperand ROS = ROT> {
+  dag OutOperandList = (outs ROT:$rt);
+  dag InOperandList = (ins ROS:$rs, uimm5:$sa, ROS:$src);
   string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
-  list<dag> Pattern = [(set RCT:$rt, (OpNode RCS:$src, RCS:$rs, immZExt5:$sa))];
+  list<dag> Pattern = [(set ROT:$rt, (OpNode ROS:$src, ROS:$rs, immZExt5:$sa))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
 }
 
 class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                             InstrItinClass itin, RegisterClass RCD,
-                             RegisterClass RCT = RCD> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCT:$rt);
+                             InstrItinClass itin, RegisterOperand ROD,
+                             RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                     ImmLeaf immPat, InstrItinClass itin, RegisterClass RC> {
-  dag OutOperandList = (outs RC:$rd);
+                     ImmLeaf immPat, InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
   dag InOperandList = (ins uimm16:$imm);
   string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
-  list<dag> Pattern = [(set RC:$rd, (OpNode immPat:$imm))];
+  list<dag> Pattern = [(set RO:$rd, (OpNode immPat:$imm))];
   InstrItinClass Itinerary = itin;
 }
 
 class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                           InstrItinClass itin, RegisterClass RC> {
-  dag OutOperandList = (outs RC:$rd);
-  dag InOperandList =  (ins RC:$rt, CPURegs:$rs_sa);
+                           InstrItinClass itin, RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList =  (ins RO:$rt, GPR32Opnd:$rs_sa);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
-  list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, CPURegs:$rs_sa))];
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs_sa))];
   InstrItinClass Itinerary = itin;
 }
 
 class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                            SDPatternOperator ImmPat, InstrItinClass itin,
-                           RegisterClass RC> {
-  dag OutOperandList = (outs RC:$rd);
-  dag InOperandList = (ins RC:$rt, uimm16:$rs_sa);
+                           RegisterOperand RO> {
+  dag OutOperandList = (outs RO:$rd);
+  dag InOperandList = (ins RO:$rt, uimm16:$rs_sa);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
-  list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, ImmPat:$rs_sa))];
+  list<dag> Pattern = [(set RO:$rd, (OpNode RO:$rt, ImmPat:$rs_sa))];
   InstrItinClass Itinerary = itin;
   bit hasSideEffects = 1;
 }
 
 class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                    InstrItinClass itin> {
-  dag OutOperandList = (outs CPURegs:$rd);
-  dag InOperandList = (ins CPURegs:$base, CPURegs:$index);
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins PtrRC:$base, PtrRC:$index);
   string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})");
-  list<dag> Pattern = [(set CPURegs:$rd,
-                       (OpNode CPURegs:$base, CPURegs:$index))];
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode iPTR:$base, iPTR:$index))];
   InstrItinClass Itinerary = itin;
   bit mayLoad = 1;
 }
 
 class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
-                         InstrItinClass itin, RegisterClass RCD,
-                         RegisterClass RCS = RCD,  RegisterClass RCT = RCD> {
-  dag OutOperandList = (outs RCD:$rd);
-  dag InOperandList = (ins RCS:$rs, RCT:$rt);
+                         InstrItinClass itin, RegisterOperand ROD,
+                         RegisterOperand ROS = ROD,  RegisterOperand ROT = ROD> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROS:$rs, ROT:$rt);
   string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
-  list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+  list<dag> Pattern = [(set ROD:$rd, (OpNode ROS:$rs, ROT:$rt))];
   InstrItinClass Itinerary = itin;
 }
 
 class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                        SDPatternOperator ImmOp, InstrItinClass itin> {
-  dag OutOperandList = (outs CPURegs:$rt);
-  dag InOperandList = (ins CPURegs:$rs, shamt:$sa, CPURegs:$src);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm5:$sa, GPR32Opnd:$src);
   string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
-  list<dag> Pattern =  [(set CPURegs:$rt,
-                        (OpNode CPURegs:$src, CPURegs:$rs, ImmOp:$sa))];
+  list<dag> Pattern =  [(set GPR32Opnd:$rt,
+                        (OpNode GPR32Opnd:$src, GPR32Opnd:$rs, ImmOp:$sa))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
 }
 
 class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                               InstrItinClass itin> {
-  dag OutOperandList = (outs CPURegs:$rt);
-  dag InOperandList = (ins ACRegsDSP:$ac, CPURegs:$shift_rs);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, GPR32Opnd:$shift_rs);
   string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
   InstrItinClass Itinerary = itin;
 }
 
 class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                               InstrItinClass itin> {
-  dag OutOperandList = (outs CPURegs:$rt);
-  dag InOperandList = (ins ACRegsDSP:$ac, uimm16:$shift_rs);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins ACC64DSPOpnd:$ac, uimm16:$shift_rs);
   string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
   InstrItinClass Itinerary = itin;
 }
 
 class SHILO_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins simm16:$shift, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins simm16:$shift, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode immSExt6:$shift, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode immSExt6:$shift, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class SHILO_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins CPURegs:$rs, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode CPURegs:$rs, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class MTHLIP_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins CPURegs:$rs, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode CPURegs:$rs, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                       InstrItinClass itin> {
-  dag OutOperandList = (outs CPURegs:$rd);
+  dag OutOperandList = (outs GPR32Opnd:$rd);
   dag InOperandList = (ins uimm16:$mask);
   string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
-  list<dag> Pattern = [(set CPURegs:$rd, (OpNode immZExt10:$mask))];
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode immZExt10:$mask))];
   InstrItinClass Itinerary = itin;
 }
 
 class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                       InstrItinClass itin> {
   dag OutOperandList = (outs);
-  dag InOperandList = (ins CPURegs:$rs, uimm16:$mask);
+  dag InOperandList = (ins GPR32Opnd:$rs, uimm16:$mask);
   string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
-  list<dag> Pattern = [(OpNode CPURegs:$rs, immZExt10:$mask)];
+  list<dag> Pattern = [(OpNode GPR32Opnd:$rs, immZExt10:$mask)];
   InstrItinClass Itinerary = itin;
 }
 
 class DPA_W_PH_DESC_BASE<string instr_asm, SDPatternOperator OpNode> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
   string Constraints = "$acin = $ac";
 }
 
 class MULT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                      InstrItinClass itin> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
-  list<dag> Pattern = [(set ACRegsDSP:$ac, (OpNode CPURegs:$rs, CPURegs:$rt))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac, (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt))];
   InstrItinClass Itinerary = itin;
-  int AddedComplexity = 20;
   bit isCommutable = 1;
 }
 
 class MADD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                      InstrItinClass itin> {
-  dag OutOperandList = (outs ACRegsDSP:$ac);
-  dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin);
+  dag OutOperandList = (outs ACC64DSPOpnd:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin);
   string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
-  list<dag> Pattern = [(set ACRegsDSP:$ac,
-                        (OpNode CPURegs:$rs, CPURegs:$rt, ACRegsDSP:$acin))];
+  list<dag> Pattern = [(set ACC64DSPOpnd:$ac,
+                        (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64DSPOpnd:$acin))];
   InstrItinClass Itinerary = itin;
-  int AddedComplexity = 20;
   string Constraints = "$acin = $ac";
 }
 
-class MFHI_DESC_BASE<string instr_asm, RegisterClass RC, InstrItinClass itin> {
-  dag OutOperandList = (outs CPURegs:$rd);
-  dag InOperandList = (ins RC:$ac);
+class MFHI_DESC_BASE<string instr_asm, RegisterOperand RO, SDNode OpNode,
+                     InstrItinClass itin> {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins RO:$ac);
   string AsmString = !strconcat(instr_asm, "\t$rd, $ac");
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (OpNode RO:$ac))];
   InstrItinClass Itinerary = itin;
 }
 
-class MTHI_DESC_BASE<string instr_asm, RegisterClass RC, InstrItinClass itin> {
-  dag OutOperandList = (outs RC:$ac);
-  dag InOperandList = (ins CPURegs:$rs);
+class MTHI_DESC_BASE<string instr_asm, RegisterOperand RO, InstrItinClass itin> {
+  dag OutOperandList = (outs RO:$ac);
+  dag InOperandList = (ins GPR32Opnd:$rs);
   string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
   InstrItinClass Itinerary = itin;
 }
 
 class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
-  MipsPseudo<(outs CPURegs:$dst), (ins), [(set CPURegs:$dst, (OpNode))]> {
+  MipsPseudo<(outs GPR32Opnd:$dst), (ins), [(set GPR32Opnd:$dst, (OpNode))]> {
   bit usesCustomInserter = 1;
 }
 
@@ -501,10 +500,10 @@ class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
 
 class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                      InstrItinClass itin> {
-  dag OutOperandList = (outs CPURegs:$rt);
-  dag InOperandList = (ins CPURegs:$src, CPURegs:$rs);
+  dag OutOperandList = (outs GPR32Opnd:$rt);
+  dag InOperandList = (ins GPR32Opnd:$src, GPR32Opnd:$rs);
   string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
-  list<dag> Pattern = [(set CPURegs:$rt, (OpNode CPURegs:$src, CPURegs:$rs))];
+  list<dag> Pattern = [(set GPR32Opnd:$rt, (OpNode GPR32Opnd:$src, GPR32Opnd:$rs))];
   InstrItinClass Itinerary = itin;
   string Constraints = "$src = $rt";
 }
@@ -515,209 +514,209 @@ class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
 
 // Addition/subtraction
 class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>, IsCommutable,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
                      Defs<[DSPOutFlag20]>;
 
 class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>,
+                                       DSPROpnd, DSPROpnd>,
                      Defs<[DSPOutFlag20]>;
 
 class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>, IsCommutable,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
                      Defs<[DSPOutFlag20]>;
 
 class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", null_frag, NoItinerary,
-                                       DSPRegs, DSPRegs>,
+                                       DSPROpnd, DSPROpnd>,
                      Defs<[DSPOutFlag20]>;
 
 class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
-                                        NoItinerary, CPURegs, CPURegs>,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
                       IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
-                                        NoItinerary, CPURegs, CPURegs>,
+                                        NoItinerary, GPR32Opnd, GPR32Opnd>,
                       Defs<[DSPOutFlag20]>;
 
 class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", null_frag, NoItinerary,
-                                     CPURegs, CPURegs>, IsCommutable,
+                                     GPR32Opnd, GPR32Opnd>, IsCommutable,
                    Defs<[DSPCarry]>;
 
 class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", null_frag, NoItinerary,
-                                     CPURegs, CPURegs>,
+                                     GPR32Opnd, GPR32Opnd>,
                    IsCommutable, Uses<[DSPCarry]>, Defs<[DSPOutFlag20]>;
 
 class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
-                                      CPURegs, CPURegs>;
+                                      GPR32Opnd, GPR32Opnd>;
 
 class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
-                                             NoItinerary, CPURegs, DSPRegs>;
+                                             NoItinerary, GPR32Opnd, DSPROpnd>;
 
 // Absolute value
 class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
-                                             NoItinerary, CPURegs>,
+                                             NoItinerary, GPR32Opnd>,
                       Defs<[DSPOutFlag20]>;
 
 // Precision reduce/expand
 class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
                                                  int_mips_precrq_qb_ph,
-                                                 NoItinerary, DSPRegs, DSPRegs>;
+                                                 NoItinerary, DSPROpnd, DSPROpnd>;
 
 class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
                                                 int_mips_precrq_ph_w,
-                                                NoItinerary, DSPRegs, CPURegs>;
+                                                NoItinerary, DSPROpnd, GPR32Opnd>;
 
 class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
                                                    int_mips_precrq_rs_ph_w,
-                                                   NoItinerary, DSPRegs,
-                                                   CPURegs>,
+                                                   NoItinerary, DSPROpnd,
+                                                   GPR32Opnd>,
                             Defs<[DSPOutFlag22]>;
 
 class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
                                                     int_mips_precrqu_s_qb_ph,
-                                                    NoItinerary, DSPRegs,
-                                                    DSPRegs>,
+                                                    NoItinerary, DSPROpnd,
+                                                    DSPROpnd>,
                              Defs<[DSPOutFlag22]>;
 
 class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
                                                  int_mips_preceq_w_phl,
-                                                 NoItinerary, CPURegs, DSPRegs>;
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
                                                  int_mips_preceq_w_phr,
-                                                 NoItinerary, CPURegs, DSPRegs>;
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
                                                    int_mips_precequ_ph_qbl,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
                                                    int_mips_precequ_ph_qbr,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
                                                     int_mips_precequ_ph_qbla,
-                                                    NoItinerary, DSPRegs>;
+                                                    NoItinerary, DSPROpnd>;
 
 class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
                                                     int_mips_precequ_ph_qbra,
-                                                    NoItinerary, DSPRegs>;
+                                                    NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
                                                   int_mips_preceu_ph_qbl,
-                                                  NoItinerary, DSPRegs>;
+                                                  NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
                                                   int_mips_preceu_ph_qbr,
-                                                  NoItinerary, DSPRegs>;
+                                                  NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
                                                    int_mips_preceu_ph_qbla,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
                                                    int_mips_preceu_ph_qbra,
-                                                   NoItinerary, DSPRegs>;
+                                                   NoItinerary, DSPROpnd>;
 
 // Shift
 class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", null_frag, immZExt3,
-                                          NoItinerary, DSPRegs>,
+                                          NoItinerary, DSPROpnd>,
                      Defs<[DSPOutFlag22]>;
 
 class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
-                                           NoItinerary, DSPRegs>,
+                                           NoItinerary, DSPROpnd>,
                       Defs<[DSPOutFlag22]>;
 
 class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", null_frag, immZExt3,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", null_frag, immZExt4,
-                                          NoItinerary, DSPRegs>,
+                                          NoItinerary, DSPROpnd>,
                      Defs<[DSPOutFlag22]>;
 
 class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
-                                           NoItinerary, DSPRegs>,
+                                           NoItinerary, DSPROpnd>,
                       Defs<[DSPOutFlag22]>;
 
 class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
-                                            immZExt4, NoItinerary, DSPRegs>,
+                                            immZExt4, NoItinerary, DSPROpnd>,
                        Defs<[DSPOutFlag22]>;
 
 class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
-                                             NoItinerary, DSPRegs>,
+                                             NoItinerary, DSPROpnd>,
                         Defs<[DSPOutFlag22]>;
 
 class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", null_frag, immZExt4,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
-                                            immZExt4, NoItinerary, DSPRegs>;
+                                            immZExt4, NoItinerary, DSPROpnd>;
 
 class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
-                                             NoItinerary, DSPRegs>;
+                                             NoItinerary, DSPROpnd>;
 
 class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
-                                           immZExt5, NoItinerary, CPURegs>,
+                                           immZExt5, NoItinerary, GPR32Opnd>,
                       Defs<[DSPOutFlag22]>;
 
 class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
-                                            NoItinerary, CPURegs>,
+                                            NoItinerary, GPR32Opnd>,
                        Defs<[DSPOutFlag22]>;
 
 class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
-                                           immZExt5, NoItinerary, CPURegs>;
+                                           immZExt5, NoItinerary, GPR32Opnd>;
 
 class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
-                                            NoItinerary, CPURegs>;
+                                            NoItinerary, GPR32Opnd>;
 
 // Multiplication
 class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
                                               int_mips_muleu_s_ph_qbl,
-                                              NoItinerary, DSPRegs, DSPRegs>,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
                             Defs<[DSPOutFlag21]>;
 
 class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
                                               int_mips_muleu_s_ph_qbr,
-                                              NoItinerary, DSPRegs, DSPRegs>,
+                                              NoItinerary, DSPROpnd, DSPROpnd>,
                             Defs<[DSPOutFlag21]>;
 
 class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
                                              int_mips_muleq_s_w_phl,
-                                             NoItinerary, CPURegs, DSPRegs>,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
                            IsCommutable, Defs<[DSPOutFlag21]>;
 
 class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
                                              int_mips_muleq_s_w_phr,
-                                             NoItinerary, CPURegs, DSPRegs>,
+                                             NoItinerary, GPR32Opnd, DSPROpnd>,
                            IsCommutable, Defs<[DSPOutFlag21]>;
 
 class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
-                                          NoItinerary, DSPRegs, DSPRegs>,
+                                          NoItinerary, DSPROpnd, DSPROpnd>,
                         IsCommutable, Defs<[DSPOutFlag21]>;
 
 class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph",
@@ -737,10 +736,10 @@ class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr", MipsMAQ_SA_W_PHR>,
                           Defs<[DSPOutFlag16_19]>;
 
 // Move from/to hi/lo.
-class MFHI_DESC : MFHI_DESC_BASE<"mfhi", HIRegsDSP, NoItinerary>;
-class MFLO_DESC : MFHI_DESC_BASE<"mflo", LORegsDSP, NoItinerary>;
-class MTHI_DESC : MTHI_DESC_BASE<"mthi", HIRegsDSP, NoItinerary>;
-class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LORegsDSP, NoItinerary>;
+class MFHI_DESC : MFHI_DESC_BASE<"mfhi", ACC64DSPOpnd, MipsMFHI, NoItinerary>;
+class MFLO_DESC : MFHI_DESC_BASE<"mflo", ACC64DSPOpnd, MipsMFLO, NoItinerary>;
+class MTHI_DESC : MTHI_DESC_BASE<"mthi", HI32DSPOpnd, NoItinerary>;
+class MTLO_DESC : MTHI_DESC_BASE<"mtlo", LO32DSPOpnd, NoItinerary>;
 
 // Dot product with accumulate/subtract
 class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl", MipsDPAU_H_QBL>;
@@ -773,67 +772,67 @@ class MSUBU_DSP_DESC : MADD_DESC_BASE<"msubu", MipsMSubu, NoItinerary>;
 // Comparison
 class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
                                                int_mips_cmpu_eq_qb, NoItinerary,
-                                               DSPRegs>,
+                                               DSPROpnd>,
                         IsCommutable, Defs<[DSPCCond]>;
 
 class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
                                                int_mips_cmpu_lt_qb, NoItinerary,
-                                               DSPRegs>, Defs<[DSPCCond]>;
+                                               DSPROpnd>, Defs<[DSPCCond]>;
 
 class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
                                                int_mips_cmpu_le_qb, NoItinerary,
-                                               DSPRegs>, Defs<[DSPCCond]>;
+                                               DSPROpnd>, Defs<[DSPCCond]>;
 
 class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
                                                 int_mips_cmpgu_eq_qb,
-                                                NoItinerary, CPURegs, DSPRegs>,
+                                                NoItinerary, GPR32Opnd, DSPROpnd>,
                          IsCommutable;
 
 class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
                                                 int_mips_cmpgu_lt_qb,
-                                                NoItinerary, CPURegs, DSPRegs>;
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
                                                 int_mips_cmpgu_le_qb,
-                                                NoItinerary, CPURegs, DSPRegs>;
+                                                NoItinerary, GPR32Opnd, DSPROpnd>;
 
 class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        IsCommutable, Defs<[DSPCCond]>;
 
 class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPCCond]>;
 
 class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPCCond]>;
 
 // Misc
 class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
-                                           NoItinerary, CPURegs>;
+                                           NoItinerary, GPR32Opnd>;
 
 class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
-                                              NoItinerary, DSPRegs, DSPRegs>;
+                                              NoItinerary, DSPROpnd, DSPROpnd>;
 
 class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
-                                    NoItinerary, DSPRegs>;
+                                    NoItinerary, DSPROpnd>;
 
 class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
-                                    NoItinerary, DSPRegs>;
+                                    NoItinerary, DSPROpnd>;
 
 class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
-                                             NoItinerary, DSPRegs, CPURegs>;
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
 
 class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
-                                             NoItinerary, DSPRegs, CPURegs>;
+                                             NoItinerary, DSPROpnd, GPR32Opnd>;
 
 class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
-                                            NoItinerary, DSPRegs, DSPRegs>,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
                      Uses<[DSPCCond]>;
 
 class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
-                                            NoItinerary, DSPRegs, DSPRegs>,
+                                            NoItinerary, DSPROpnd, DSPROpnd>,
                      Uses<[DSPCCond]>;
 
 class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>;
@@ -905,97 +904,97 @@ class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>,
 // MIPS DSP Rev 2
 // Addition/subtraction
 class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
-                                       DSPRegs, DSPRegs>, IsCommutable,
+                                       DSPROpnd, DSPROpnd>, IsCommutable,
                      Defs<[DSPOutFlag20]>;
 
 class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag20]>;
 
 class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
-                                       DSPRegs, DSPRegs>,
+                                       DSPROpnd, DSPROpnd>,
                      Defs<[DSPOutFlag20]>;
 
 class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
-                                         NoItinerary, DSPRegs>, IsCommutable;
+                                         NoItinerary, DSPROpnd>, IsCommutable;
 
 class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
-                                           NoItinerary, DSPRegs>, IsCommutable;
+                                           NoItinerary, DSPROpnd>, IsCommutable;
 
 class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
-                                         NoItinerary, DSPRegs>;
+                                         NoItinerary, DSPROpnd>;
 
 class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
-                                         NoItinerary, DSPRegs>, IsCommutable;
+                                         NoItinerary, DSPROpnd>, IsCommutable;
 
 class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
-                                           NoItinerary, DSPRegs>, IsCommutable;
+                                           NoItinerary, DSPROpnd>, IsCommutable;
 
 class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
-                                         NoItinerary, DSPRegs>;
+                                         NoItinerary, DSPROpnd>;
 
 class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
-                                        NoItinerary, CPURegs>, IsCommutable;
+                                        NoItinerary, GPR32Opnd>, IsCommutable;
 
 class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
-                                          NoItinerary, CPURegs>, IsCommutable;
+                                          NoItinerary, GPR32Opnd>, IsCommutable;
 
 class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
-                                        NoItinerary, CPURegs>;
+                                        NoItinerary, GPR32Opnd>;
 
 class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
-                                          NoItinerary, CPURegs>;
+                                          NoItinerary, GPR32Opnd>;
 
 // Comparison
 class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
                                                  int_mips_cmpgdu_eq_qb,
-                                                 NoItinerary, CPURegs, DSPRegs>,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
                           IsCommutable, Defs<[DSPCCond]>;
 
 class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
                                                  int_mips_cmpgdu_lt_qb,
-                                                 NoItinerary, CPURegs, DSPRegs>,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
                           Defs<[DSPCCond]>;
 
 class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
                                                  int_mips_cmpgdu_le_qb,
-                                                 NoItinerary, CPURegs, DSPRegs>,
+                                                 NoItinerary, GPR32Opnd, DSPROpnd>,
                           Defs<[DSPCCond]>;
 
 // Absolute
 class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
-                                              NoItinerary, DSPRegs>,
+                                              NoItinerary, DSPROpnd>,
                        Defs<[DSPOutFlag20]>;
 
 // Multiplication
 class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", null_frag, NoItinerary,
-                                       DSPRegs>, IsCommutable,
+                                       DSPROpnd>, IsCommutable,
                     Defs<[DSPOutFlag21]>;
 
 class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
-                                         NoItinerary, DSPRegs>, IsCommutable,
+                                         NoItinerary, DSPROpnd>, IsCommutable,
                       Defs<[DSPOutFlag21]>;
 
 class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
-                                         NoItinerary, CPURegs>, IsCommutable,
+                                         NoItinerary, GPR32Opnd>, IsCommutable,
                       Defs<[DSPOutFlag21]>;
 
 class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
-                                          NoItinerary, CPURegs>, IsCommutable,
+                                          NoItinerary, GPR32Opnd>, IsCommutable,
                        Defs<[DSPOutFlag21]>;
 
 class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
-                                         NoItinerary, DSPRegs, DSPRegs>,
+                                         NoItinerary, DSPROpnd, DSPROpnd>,
                        IsCommutable, Defs<[DSPOutFlag21]>;
 
 // Dot product with accumulate/subtract
@@ -1026,36 +1025,36 @@ class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph", MipsMULSA_W_PH>;
 // Precision reduce/expand
 class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
                                                 int_mips_precr_qb_ph,
-                                                NoItinerary, DSPRegs, DSPRegs>;
+                                                NoItinerary, DSPROpnd, DSPROpnd>;
 
 class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
                                                      int_mips_precr_sra_ph_w,
-                                                     NoItinerary, DSPRegs,
-                                                     CPURegs>;
+                                                     NoItinerary, DSPROpnd,
+                                                     GPR32Opnd>;
 
 class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
                                                       int_mips_precr_sra_r_ph_w,
-                                                       NoItinerary, DSPRegs,
-                                                       CPURegs>;
+                                                       NoItinerary, DSPROpnd,
+                                                       GPR32Opnd>;
 
 // Shift
 class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", null_frag, immZExt3,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
-                                            immZExt3, NoItinerary, DSPRegs>;
+                                            immZExt3, NoItinerary, DSPROpnd>;
 
 class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
-                                             NoItinerary, DSPRegs>;
+                                             NoItinerary, DSPROpnd>;
 
 class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", null_frag, immZExt4,
-                                          NoItinerary, DSPRegs>;
+                                          NoItinerary, DSPROpnd>;
 
 class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
-                                           NoItinerary, DSPRegs>;
+                                           NoItinerary, DSPROpnd>;
 
 // Misc
 class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
@@ -1240,24 +1239,24 @@ def PREPEND : PREPEND_ENC, PREPEND_DESC;
 }
 
 // Pseudos.
-let isPseudo = 1 in {
+let isPseudo = 1, isCodeGenOnly = 1 in {
   // Pseudo instructions for loading and storing accumulator registers.
-  defm LOAD_AC_DSP  : LoadM<"load_ac_dsp", ACRegsDSP>;
-  defm STORE_AC_DSP : StoreM<"store_ac_dsp", ACRegsDSP>;
+  def LOAD_ACC64DSP  : Load<"", ACC64DSPOpnd>;
+  def STORE_ACC64DSP : Store<"", ACC64DSPOpnd>;
 
   // Pseudos for loading and storing ccond field of DSP control register.
-  defm LOAD_CCOND_DSP  : LoadM<"load_ccond_dsp", DSPCC>;
-  defm STORE_CCOND_DSP : StoreM<"store_ccond_dsp", DSPCC>;
+  def LOAD_CCOND_DSP  : Load<"load_ccond_dsp", DSPCC>;
+  def STORE_CCOND_DSP : Store<"store_ccond_dsp", DSPCC>;
 }
 
 // Pseudo CMP and PICK instructions.
 class PseudoCMP<Instruction RealInst> :
-  PseudoDSP<(outs DSPCC:$cmp), (ins DSPRegs:$rs, DSPRegs:$rt), []>,
-  PseudoInstExpansion<(RealInst DSPRegs:$rs, DSPRegs:$rt)>, NeverHasSideEffects;
+  PseudoDSP<(outs DSPCC:$cmp), (ins DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rs, DSPROpnd:$rt)>, NeverHasSideEffects;
 
 class PseudoPICK<Instruction RealInst> :
-  PseudoDSP<(outs DSPRegs:$rd), (ins DSPCC:$cmp, DSPRegs:$rs, DSPRegs:$rt), []>,
-  PseudoInstExpansion<(RealInst DSPRegs:$rd, DSPRegs:$rs, DSPRegs:$rt)>,
+  PseudoDSP<(outs DSPROpnd:$rd), (ins DSPCC:$cmp, DSPROpnd:$rs, DSPROpnd:$rt), []>,
+  PseudoInstExpansion<(RealInst DSPROpnd:$rd, DSPROpnd:$rs, DSPROpnd:$rt)>,
   NeverHasSideEffects;
 
 def PseudoCMP_EQ_PH : PseudoCMP<CMP_EQ_PH>;
@@ -1270,6 +1269,8 @@ def PseudoCMPU_LE_QB : PseudoCMP<CMPU_LE_QB>;
 def PseudoPICK_PH : PseudoPICK<PICK_PH>;
 def PseudoPICK_QB : PseudoPICK<PICK_QB>;
 
+def PseudoMTLOHI_DSP : PseudoMTLOHI<ACC64DSP, GPR32>;
+
 // Patterns.
 class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
   Pat<pattern, result>, Requires<[pred]>;
@@ -1279,19 +1280,19 @@ class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
    DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
           (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
 
-def : BitconvertPat<i32, v2i16, CPURegs, DSPRegs>;
-def : BitconvertPat<i32, v4i8, CPURegs, DSPRegs>;
-def : BitconvertPat<v2i16, i32, DSPRegs, CPURegs>;
-def : BitconvertPat<v4i8, i32, DSPRegs, CPURegs>;
+def : BitconvertPat<i32, v2i16, GPR32, DSPR>;
+def : BitconvertPat<i32, v4i8, GPR32, DSPR>;
+def : BitconvertPat<v2i16, i32, DSPR, GPR32>;
+def : BitconvertPat<v4i8, i32, DSPR, GPR32>;
 
 def : DSPPat<(v2i16 (load addr:$a)),
-             (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
+             (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
 def : DSPPat<(v4i8 (load addr:$a)),
-             (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
-def : DSPPat<(store (v2i16 DSPRegs:$val), addr:$a),
-             (SW (COPY_TO_REGCLASS DSPRegs:$val, CPURegs), addr:$a)>;
-def : DSPPat<(store (v4i8 DSPRegs:$val), addr:$a),
-             (SW (COPY_TO_REGCLASS DSPRegs:$val, CPURegs), addr:$a)>;
+             (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPR))>;
+def : DSPPat<(store (v2i16 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPR:$val), addr:$a),
+             (SW (COPY_TO_REGCLASS DSPR:$val, GPR32), addr:$a)>;
 
 // Binary operations.
 class DSPBinPat<Instruction Inst, ValueType ValTy, SDPatternOperator Node,
@@ -1336,7 +1337,7 @@ class DSPSetCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
                   CondCode CC> :
   DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
          (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
-                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPRegs)),
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR)),
                       (ValTy ZERO)))>;
 
 class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
@@ -1344,7 +1345,7 @@ class DSPSetCCPatInv<Instruction Cmp, Instruction Pick, ValueType ValTy,
   DSPPat<(ValTy (MipsSETCC_DSP ValTy:$a, ValTy:$b, CC)),
          (ValTy (Pick (ValTy (Cmp ValTy:$a, ValTy:$b)),
                       (ValTy ZERO),
-                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPRegs))))>;
+                      (ValTy (COPY_TO_REGCLASS (ADDiu ZERO, -1), DSPR))))>;
 
 class DSPSelectCCPat<Instruction Cmp, Instruction Pick, ValueType ValTy,
                      CondCode CC> :
@@ -1384,12 +1385,12 @@ def : DSPSelectCCPatInv<PseudoCMPU_LE_QB, PseudoPICK_QB, v4i8, SETUGT>;
 
 // Extr patterns.
 class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
-  DSPPat<(i32 (OpNode CPURegs:$rs, ACRegsDSP:$ac)),
-         (Instr ACRegsDSP:$ac, CPURegs:$rs)>;
+  DSPPat<(i32 (OpNode GPR32:$rs, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, GPR32:$rs)>;
 
 class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
-  DSPPat<(i32 (OpNode immZExt5:$shift, ACRegsDSP:$ac)),
-         (Instr ACRegsDSP:$ac, immZExt5:$shift)>;
+  DSPPat<(i32 (OpNode immZExt5:$shift, ACC64DSP:$ac)),
+         (Instr ACC64DSP:$ac, immZExt5:$shift)>;
 
 def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
 def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
@@ -1404,11 +1405,6 @@ def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
 def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
 def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
 
-// mflo/hi patterns.
-let AddedComplexity = 20 in
-def : DSPPat<(i32 (ExtractLOHI ACRegsDSP:$ac, imm:$lohi_idx)),
-             (EXTRACT_SUBREG ACRegsDSP:$ac, imm:$lohi_idx)>;
-
 // Indexed load patterns.
 class IndexedLoadPat<SDPatternOperator LoadNode, Instruction Instr> :
   DSPPat<(i32 (LoadNode (add i32:$base, i32:$index))),
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index d07a595..ffbd83b 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -177,7 +177,7 @@ namespace {
   class Filler : public MachineFunctionPass {
   public:
     Filler(TargetMachine &tm)
-      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+      : MachineFunctionPass(ID), TM(tm) { }
 
     virtual const char *getPassName() const {
       return "Mips Delay Slot Filler";
@@ -243,7 +243,6 @@ namespace {
     bool terminateSearch(const MachineInstr &Candidate) const;
 
     TargetMachine &TM;
-    const TargetInstrInfo *TII;
 
     static char ID;
   };
@@ -422,8 +421,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
     return false;
 
   if (const PseudoSourceValue *PSV = dyn_cast<const PseudoSourceValue>(V))
-    return !PSV->PseudoSourceValue::isConstant(0) &&
-      (V != PseudoSourceValue::getStack());
+    return !PSV->isConstant(0) && V != PseudoSourceValue::getStack();
 
   return true;
 }
@@ -438,7 +436,7 @@ bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
 
   // Check underlying object list.
   if (getUnderlyingObjects(MI, Objs)) {
-    for (SmallVector<const Value *, 4>::const_iterator I = Objs.begin();
+    for (SmallVectorImpl<const Value *>::const_iterator I = Objs.begin();
          I != Objs.end(); ++I)
       HasHazard |= updateDefsUses(*I, MI.mayStore());
 
@@ -474,7 +472,7 @@ getUnderlyingObjects(const MachineInstr &MI,
   SmallVector<Value *, 4> Objs;
   GetUnderlyingObjects(const_cast<Value *>(V), Objs);
 
-  for (SmallVector<Value*, 4>::iterator I = Objs.begin(), E = Objs.end();
+  for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), E = Objs.end();
        I != E; ++I) {
     if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(*I)) {
       if (PSV->isAliased(MFI))
@@ -514,6 +512,8 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
     }
 
     // Bundle the NOP to the instruction with the delay slot.
+    const MipsInstrInfo *TII =
+      static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
     BuildMI(MBB, llvm::next(I), I->getDebugLoc(), TII->get(Mips::NOP));
     MIBundleBuilder(MBB, I, llvm::next(llvm::next(I)));
   }
@@ -562,14 +562,13 @@ bool Filler::searchBackward(MachineBasicBlock &MBB, Iter Slot) const {
 
   RegDU.init(*Slot);
 
-  if (searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler)) {
-    MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
-    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
-    ++UsefulSlots;
-    return true;
-  }
+  if (!searchRange(MBB, ReverseIter(Slot), MBB.rend(), RegDU, MemDU, Filler))
+    return false;
 
-  return false;
+  MBB.splice(llvm::next(Slot), &MBB, llvm::next(Filler).base());
+  MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+  ++UsefulSlots;
+  return true;
 }
 
 bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
@@ -583,14 +582,13 @@ bool Filler::searchForward(MachineBasicBlock &MBB, Iter Slot) const {
 
   RegDU.setCallerSaved(*Slot);
 
-  if (searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler)) {
-    MBB.splice(llvm::next(Slot), &MBB, Filler);
-    MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
-    ++UsefulSlots;
-    return true;
-  }
+  if (!searchRange(MBB, llvm::next(Slot), MBB.end(), RegDU, NM, Filler))
+    return false;
 
-  return false;
+  MBB.splice(llvm::next(Slot), &MBB, Filler);
+  MIBundleBuilder(MBB, Slot, llvm::next(llvm::next(Slot)));
+  ++UsefulSlots;
+  return true;
 }
 
 bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 968e536..c417bd5 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -57,7 +57,8 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
 /// GOT address into a register.
 SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
   unsigned GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
-  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+  return CurDAG->getRegister(GlobalBaseReg,
+                             getTargetLowering()->getPointerTy()).getNode();
 }
 
 /// ComplexPattern used on MipsInstrInfo
@@ -68,6 +69,12 @@ bool MipsDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
   return false;
 }
 
+bool MipsDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                        SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
 bool MipsDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
                                          SDValue &Offset) const {
   llvm_unreachable("Unimplemented function.");
@@ -80,12 +87,83 @@ bool MipsDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
   return false;
 }
 
+bool MipsDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+                                       SDValue &Offset) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
 bool MipsDAGToDAGISel::selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
                                     SDValue &Offset, SDValue &Alias) {
   llvm_unreachable("Unimplemented function.");
   return false;
 }
 
+bool MipsDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
+bool MipsDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  llvm_unreachable("Unimplemented function.");
+  return false;
+}
+
 /// Select instructions not customized! Used for
 /// expanded, promoted and normal instructions
 SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
@@ -97,6 +175,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
     DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
     return NULL;
   }
 
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index cf0f9c5..a4d9da5 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -57,6 +57,11 @@ private:
   virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
                                 SDValue &Offset) const;
 
+  // Complex Pattern.
+  /// (reg + reg).
+  virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
   /// Fall back on this function if all else fails.
   virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
                                  SDValue &Offset) const;
@@ -65,9 +70,42 @@ private:
   virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
                              SDValue &Offset) const;
 
+  virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const;
+
   virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
                             SDValue &Offset, SDValue &Alias);
 
+  /// \brief Select constant vector splats.
+  virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
   virtual SDNode *Select(SDNode *N);
 
   virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 4d76181..1e8250c 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "MipsTargetMachine.h"
 #include "MipsTargetObjectFile.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -34,6 +35,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cctype>
 
 using namespace llvm;
 
@@ -43,6 +45,11 @@ static cl::opt<bool>
 LargeGOT("mxgot", cl::Hidden,
          cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false));
 
+static cl::opt<bool>
+NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
+               cl::desc("MIPS: Don't trap on integer division by zero."),
+               cl::init(false));
+
 static const uint16_t O32IntRegs[4] = {
   Mips::A0, Mips::A1, Mips::A2, Mips::A3
 };
@@ -62,10 +69,10 @@ static const uint16_t Mips64DPRegs[8] = {
 // For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
 static bool isShiftedMask(uint64_t I, uint64_t &Pos, uint64_t &Size) {
   if (!isShiftedMask_64(I))
-     return false;
+    return false;
 
   Size = CountPopulation_64(I);
-  Pos = CountTrailingZeros_64(I);
+  Pos = countTrailingZeros(I);
   return true;
 }
 
@@ -74,72 +81,35 @@ SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
   return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
 }
 
-static SDValue getTargetNode(SDValue Op, SelectionDAG &DAG, unsigned Flag) {
-  EVT Ty = Op.getValueType();
+SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty, 0, Flag);
+}
 
-  if (GlobalAddressSDNode *N = dyn_cast<GlobalAddressSDNode>(Op))
-    return DAG.getTargetGlobalAddress(N->getGlobal(), Op.getDebugLoc(), Ty, 0,
-                                      Flag);
-  if (ExternalSymbolSDNode *N = dyn_cast<ExternalSymbolSDNode>(Op))
-    return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
-  if (BlockAddressSDNode *N = dyn_cast<BlockAddressSDNode>(Op))
-    return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
-  if (JumpTableSDNode *N = dyn_cast<JumpTableSDNode>(Op))
-    return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
-  if (ConstantPoolSDNode *N = dyn_cast<ConstantPoolSDNode>(Op))
-    return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
-                                     N->getOffset(), Flag);
-
-  llvm_unreachable("Unexpected node type.");
-  return SDValue();
+SDValue MipsTargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
 }
 
-static SDValue getAddrNonPIC(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT Ty = Op.getValueType();
-  SDValue Hi = getTargetNode(Op, DAG, MipsII::MO_ABS_HI);
-  SDValue Lo = getTargetNode(Op, DAG, MipsII::MO_ABS_LO);
-  return DAG.getNode(ISD::ADD, DL, Ty,
-                     DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
-                     DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+SDValue MipsTargetLowering::getTargetNode(BlockAddressSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
 }
 
-SDValue MipsTargetLowering::getAddrLocal(SDValue Op, SelectionDAG &DAG,
-                                         bool HasMips64) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT Ty = Op.getValueType();
-  unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
-  SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
-                            getTargetNode(Op, DAG, GOTFlag));
-  SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
-                             MachinePointerInfo::getGOT(), false, false, false,
-                             0);
-  unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
-  SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty, getTargetNode(Op, DAG, LoFlag));
-  return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
-}
-
-SDValue MipsTargetLowering::getAddrGlobal(SDValue Op, SelectionDAG &DAG,
+SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
                                           unsigned Flag) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT Ty = Op.getValueType();
-  SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
-                            getTargetNode(Op, DAG, Flag));
-  return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Tgt,
-                     MachinePointerInfo::getGOT(), false, false, false, 0);
+  return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
 }
 
-SDValue MipsTargetLowering::getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
-                                                  unsigned HiFlag,
-                                                  unsigned LoFlag) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT Ty = Op.getValueType();
-  SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty, getTargetNode(Op, DAG, HiFlag));
-  Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
-  SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
-                                getTargetNode(Op, DAG, LoFlag));
-  return DAG.getLoad(Ty, DL, DAG.getEntryNode(), Wrapper,
-                     MachinePointerInfo::getGOT(), false, false, false, 0);
+SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
+                                          SelectionDAG &DAG,
+                                          unsigned Flag) const {
+  return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+                                   N->getOffset(), Flag);
 }
 
 const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -156,9 +126,10 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::FPCmp:             return "MipsISD::FPCmp";
   case MipsISD::CMovFP_T:          return "MipsISD::CMovFP_T";
   case MipsISD::CMovFP_F:          return "MipsISD::CMovFP_F";
-  case MipsISD::FPRound:           return "MipsISD::FPRound";
-  case MipsISD::ExtractLOHI:       return "MipsISD::ExtractLOHI";
-  case MipsISD::InsertLOHI:        return "MipsISD::InsertLOHI";
+  case MipsISD::TruncIntFP:        return "MipsISD::TruncIntFP";
+  case MipsISD::MFHI:              return "MipsISD::MFHI";
+  case MipsISD::MFLO:              return "MipsISD::MFLO";
+  case MipsISD::MTLOHI:            return "MipsISD::MTLOHI";
   case MipsISD::Mult:              return "MipsISD::Mult";
   case MipsISD::Multu:             return "MipsISD::Multu";
   case MipsISD::MAdd:              return "MipsISD::MAdd";
@@ -202,6 +173,30 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::SHRL_DSP:          return "MipsISD::SHRL_DSP";
   case MipsISD::SETCC_DSP:         return "MipsISD::SETCC_DSP";
   case MipsISD::SELECT_CC_DSP:     return "MipsISD::SELECT_CC_DSP";
+  case MipsISD::VALL_ZERO:         return "MipsISD::VALL_ZERO";
+  case MipsISD::VANY_ZERO:         return "MipsISD::VANY_ZERO";
+  case MipsISD::VALL_NONZERO:      return "MipsISD::VALL_NONZERO";
+  case MipsISD::VANY_NONZERO:      return "MipsISD::VANY_NONZERO";
+  case MipsISD::VCEQ:              return "MipsISD::VCEQ";
+  case MipsISD::VCLE_S:            return "MipsISD::VCLE_S";
+  case MipsISD::VCLE_U:            return "MipsISD::VCLE_U";
+  case MipsISD::VCLT_S:            return "MipsISD::VCLT_S";
+  case MipsISD::VCLT_U:            return "MipsISD::VCLT_U";
+  case MipsISD::VSMAX:             return "MipsISD::VSMAX";
+  case MipsISD::VSMIN:             return "MipsISD::VSMIN";
+  case MipsISD::VUMAX:             return "MipsISD::VUMAX";
+  case MipsISD::VUMIN:             return "MipsISD::VUMIN";
+  case MipsISD::VEXTRACT_SEXT_ELT: return "MipsISD::VEXTRACT_SEXT_ELT";
+  case MipsISD::VEXTRACT_ZEXT_ELT: return "MipsISD::VEXTRACT_ZEXT_ELT";
+  case MipsISD::VNOR:              return "MipsISD::VNOR";
+  case MipsISD::VSHF:              return "MipsISD::VSHF";
+  case MipsISD::SHF:               return "MipsISD::SHF";
+  case MipsISD::ILVEV:             return "MipsISD::ILVEV";
+  case MipsISD::ILVOD:             return "MipsISD::ILVOD";
+  case MipsISD::ILVL:              return "MipsISD::ILVL";
+  case MipsISD::ILVR:              return "MipsISD::ILVR";
+  case MipsISD::PCKEV:             return "MipsISD::PCKEV";
+  case MipsISD::PCKOD:             return "MipsISD::PCKOD";
   default:                         return NULL;
   }
 }
@@ -250,6 +245,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::VASTART,            MVT::Other, Custom);
   setOperationAction(ISD::FCOPYSIGN,          MVT::f32,   Custom);
   setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
+  setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
 
   if (!TM.Options.NoNaNsFPMath) {
     setOperationAction(ISD::FABS,             MVT::f32,   Custom);
@@ -265,6 +261,7 @@ MipsTargetLowering(MipsTargetMachine &TM)
     setOperationAction(ISD::SELECT,             MVT::i64,   Custom);
     setOperationAction(ISD::LOAD,               MVT::i64,   Custom);
     setOperationAction(ISD::STORE,              MVT::i64,   Custom);
+    setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
   }
 
   if (!HasMips64) {
@@ -339,11 +336,6 @@ MipsTargetLowering(MipsTargetMachine &TM)
     setOperationAction(ISD::FNEG,             MVT::f64,   Expand);
   }
 
-  setOperationAction(ISD::EXCEPTIONADDR,     MVT::i32, Expand);
-  setOperationAction(ISD::EXCEPTIONADDR,     MVT::i64, Expand);
-  setOperationAction(ISD::EHSELECTION,       MVT::i32, Expand);
-  setOperationAction(ISD::EHSELECTION,       MVT::i64, Expand);
-
   setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
 
   setOperationAction(ISD::VAARG,             MVT::Other, Expand);
@@ -383,6 +375,8 @@ MipsTargetLowering(MipsTargetMachine &TM)
     setTruncStoreAction(MVT::i64, MVT::i32, Custom);
   }
 
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
   setTargetDAGCombine(ISD::SDIVREM);
   setTargetDAGCombine(ISD::UDIVREM);
   setTargetDAGCombine(ISD::SELECT);
@@ -407,7 +401,7 @@ const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) {
   return llvm::createMipsSETargetLowering(TM);
 }
 
-EVT MipsTargetLowering::getSetCCResultType(EVT VT) const {
+EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector())
     return MVT::i32;
   return VT.changeVectorElementTypeToInteger();
@@ -420,11 +414,11 @@ static SDValue performDivRemCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT Ty = N->getValueType(0);
-  unsigned LO = (Ty == MVT::i32) ? Mips::LO : Mips::LO64;
-  unsigned HI = (Ty == MVT::i32) ? Mips::HI : Mips::HI64;
+  unsigned LO = (Ty == MVT::i32) ? Mips::LO0 : Mips::LO0_64;
+  unsigned HI = (Ty == MVT::i32) ? Mips::HI0 : Mips::HI0_64;
   unsigned Opc = N->getOpcode() == ISD::SDIVREM ? MipsISD::DivRem16 :
                                                   MipsISD::DivRemU16;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   SDValue DivRem = DAG.getNode(Opc, DL, MVT::Glue,
                                N->getOperand(0), N->getOperand(1));
@@ -502,7 +496,7 @@ static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
     return Op;
 
   SDValue RHS = Op.getOperand(1);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // Assume the 3rd operand is a CondCodeSDNode. Add code to check the type of
   // node if necessary.
@@ -514,12 +508,13 @@ static SDValue createFPCmp(SelectionDAG &DAG, const SDValue &Op) {
 
 // Creates and returns a CMovFPT/F node.
 static SDValue createCMovFP(SelectionDAG &DAG, SDValue Cond, SDValue True,
-                            SDValue False, DebugLoc DL) {
+                            SDValue False, SDLoc DL) {
   ConstantSDNode *CC = cast<ConstantSDNode>(Cond.getOperand(2));
   bool invert = invertFPCondCodeUser((Mips::CondCode)CC->getSExtValue());
+  SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
 
   return DAG.getNode((invert ? MipsISD::CMovFP_F : MipsISD::CMovFP_T), DL,
-                     True.getValueType(), True, False, Cond);
+                     True.getValueType(), True, FCC0, False, Cond);
 }
 
 static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -545,7 +540,7 @@ static SDValue performSELECTCombine(SDNode *N, SelectionDAG &DAG,
   if (!CN || CN->getZExtValue())
     return SDValue();
 
-  const DebugLoc DL = N->getDebugLoc();
+  const SDLoc DL(N);
   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
   SDValue True = N->getOperand(1);
 
@@ -561,7 +556,7 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
   // Pattern match EXT.
   //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
   //  => ext $dst, $src, size, pos
-  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
     return SDValue();
 
   SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
@@ -590,7 +585,7 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
   if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
     return SDValue();
 
-  return DAG.getNode(MipsISD::Ext, N->getDebugLoc(), ValTy,
+  return DAG.getNode(MipsISD::Ext, SDLoc(N), ValTy,
                      ShiftRight.getOperand(0), DAG.getConstant(Pos, MVT::i32),
                      DAG.getConstant(SMSize, MVT::i32));
 }
@@ -602,7 +597,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
   //  $dst = or (and $src1 , mask0), (and (shl $src, pos), mask1),
   //  where mask1 = (2**size - 1) << pos, mask0 = ~mask1
   //  => ins $dst, $src, size, pos, $src1
-  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasMips32r2())
+  if (DCI.isBeforeLegalizeOps() || !Subtarget->hasExtractInsert())
     return SDValue();
 
   SDValue And0 = N->getOperand(0), And1 = N->getOperand(1);
@@ -644,7 +639,7 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
   if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits()))
     return SDValue();
 
-  return DAG.getNode(MipsISD::Ins, N->getDebugLoc(), ValTy, Shl.getOperand(0),
+  return DAG.getNode(MipsISD::Ins, SDLoc(N), ValTy, Shl.getOperand(0),
                      DAG.getConstant(SMPos0, MVT::i32),
                      DAG.getConstant(SMSize0, MVT::i32), And0.getOperand(0));
 }
@@ -669,7 +664,7 @@ static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT ValTy = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   SDValue Add1 = DAG.getNode(ISD::ADD, DL, ValTy, N->getOperand(0),
                              Add.getOperand(0));
@@ -744,6 +739,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case ISD::LOAD:               return lowerLOAD(Op, DAG);
   case ISD::STORE:              return lowerSTORE(Op, DAG);
   case ISD::ADD:                return lowerADD(Op, DAG);
+  case ISD::FP_TO_SINT:         return lowerFP_TO_SINT(Op, DAG);
   }
   return SDValue();
 }
@@ -763,6 +759,30 @@ addLiveIn(MachineFunction &MF, unsigned PReg, const TargetRegisterClass *RC)
   return VReg;
 }
 
+static MachineBasicBlock *expandPseudoDIV(MachineInstr *MI,
+                                          MachineBasicBlock &MBB,
+                                          const TargetInstrInfo &TII,
+                                          bool Is64Bit) {
+  if (NoZeroDivCheck)
+    return &MBB;
+
+  // Insert instruction "teq $divisor_reg, $zero, 7".
+  MachineBasicBlock::iterator I(MI);
+  MachineInstrBuilder MIB;
+  MachineOperand &Divisor = MI->getOperand(2);
+  MIB = BuildMI(MBB, llvm::next(I), MI->getDebugLoc(), TII.get(Mips::TEQ))
+    .addReg(Divisor.getReg(), getKillRegState(Divisor.isKill()))
+    .addReg(Mips::ZERO).addImm(7);
+
+  // Use the 32-bit sub-register if this is a 64-bit division.
+  if (Is64Bit)
+    MIB->getOperand(0).setSubReg(Mips::sub_32);
+
+  // Clear Divisor's kill flag.
+  Divisor.setIsKill(false);
+  return &MBB;
+}
+
 MachineBasicBlock *
 MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                 MachineBasicBlock *BB) const {
@@ -770,108 +790,82 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   default:
     llvm_unreachable("Unexpected instr type to insert");
   case Mips::ATOMIC_LOAD_ADD_I8:
-  case Mips::ATOMIC_LOAD_ADD_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I16:
-  case Mips::ATOMIC_LOAD_ADD_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I32:
-  case Mips::ATOMIC_LOAD_ADD_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::ADDu);
   case Mips::ATOMIC_LOAD_ADD_I64:
-  case Mips::ATOMIC_LOAD_ADD_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::DADDu);
 
   case Mips::ATOMIC_LOAD_AND_I8:
-  case Mips::ATOMIC_LOAD_AND_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I16:
-  case Mips::ATOMIC_LOAD_AND_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I32:
-  case Mips::ATOMIC_LOAD_AND_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::AND);
   case Mips::ATOMIC_LOAD_AND_I64:
-  case Mips::ATOMIC_LOAD_AND_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::AND64);
 
   case Mips::ATOMIC_LOAD_OR_I8:
-  case Mips::ATOMIC_LOAD_OR_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I16:
-  case Mips::ATOMIC_LOAD_OR_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I32:
-  case Mips::ATOMIC_LOAD_OR_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::OR);
   case Mips::ATOMIC_LOAD_OR_I64:
-  case Mips::ATOMIC_LOAD_OR_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::OR64);
 
   case Mips::ATOMIC_LOAD_XOR_I8:
-  case Mips::ATOMIC_LOAD_XOR_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I16:
-  case Mips::ATOMIC_LOAD_XOR_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I32:
-  case Mips::ATOMIC_LOAD_XOR_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::XOR);
   case Mips::ATOMIC_LOAD_XOR_I64:
-  case Mips::ATOMIC_LOAD_XOR_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::XOR64);
 
   case Mips::ATOMIC_LOAD_NAND_I8:
-  case Mips::ATOMIC_LOAD_NAND_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I16:
-  case Mips::ATOMIC_LOAD_NAND_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I32:
-  case Mips::ATOMIC_LOAD_NAND_I32_P8:
     return emitAtomicBinary(MI, BB, 4, 0, true);
   case Mips::ATOMIC_LOAD_NAND_I64:
-  case Mips::ATOMIC_LOAD_NAND_I64_P8:
     return emitAtomicBinary(MI, BB, 8, 0, true);
 
   case Mips::ATOMIC_LOAD_SUB_I8:
-  case Mips::ATOMIC_LOAD_SUB_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I16:
-  case Mips::ATOMIC_LOAD_SUB_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I32:
-  case Mips::ATOMIC_LOAD_SUB_I32_P8:
     return emitAtomicBinary(MI, BB, 4, Mips::SUBu);
   case Mips::ATOMIC_LOAD_SUB_I64:
-  case Mips::ATOMIC_LOAD_SUB_I64_P8:
     return emitAtomicBinary(MI, BB, 8, Mips::DSUBu);
 
   case Mips::ATOMIC_SWAP_I8:
-  case Mips::ATOMIC_SWAP_I8_P8:
     return emitAtomicBinaryPartword(MI, BB, 1, 0);
   case Mips::ATOMIC_SWAP_I16:
-  case Mips::ATOMIC_SWAP_I16_P8:
     return emitAtomicBinaryPartword(MI, BB, 2, 0);
   case Mips::ATOMIC_SWAP_I32:
-  case Mips::ATOMIC_SWAP_I32_P8:
     return emitAtomicBinary(MI, BB, 4, 0);
   case Mips::ATOMIC_SWAP_I64:
-  case Mips::ATOMIC_SWAP_I64_P8:
     return emitAtomicBinary(MI, BB, 8, 0);
 
   case Mips::ATOMIC_CMP_SWAP_I8:
-  case Mips::ATOMIC_CMP_SWAP_I8_P8:
     return emitAtomicCmpSwapPartword(MI, BB, 1);
   case Mips::ATOMIC_CMP_SWAP_I16:
-  case Mips::ATOMIC_CMP_SWAP_I16_P8:
     return emitAtomicCmpSwapPartword(MI, BB, 2);
   case Mips::ATOMIC_CMP_SWAP_I32:
-  case Mips::ATOMIC_CMP_SWAP_I32_P8:
     return emitAtomicCmpSwap(MI, BB, 4);
   case Mips::ATOMIC_CMP_SWAP_I64:
-  case Mips::ATOMIC_CMP_SWAP_I64_P8:
     return emitAtomicCmpSwap(MI, BB, 8);
+  case Mips::PseudoSDIV:
+  case Mips::PseudoUDIV:
+    return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), false);
+  case Mips::PseudoDSDIV:
+  case Mips::PseudoDUDIV:
+    return expandPseudoDIV(MI, *BB, *getTargetMachine().getInstrInfo(), true);
   }
 }
 
@@ -891,16 +885,16 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   unsigned LL, SC, AND, NOR, ZERO, BEQ;
 
   if (Size == 4) {
-    LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-    SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+    LL = Mips::LL;
+    SC = Mips::SC;
     AND = Mips::AND;
     NOR = Mips::NOR;
     ZERO = Mips::ZERO;
     BEQ = Mips::BEQ;
   }
   else {
-    LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
-    SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+    LL = Mips::LLD;
+    SC = Mips::SCD;
     AND = Mips::AND64;
     NOR = Mips::NOR64;
     ZERO = Mips::ZERO_64;
@@ -926,8 +920,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
 
   // Transfer the remainder of BB and its successor edges to exitMBB.
   exitMBB->splice(exitMBB->begin(), BB,
-                  llvm::next(MachineBasicBlock::iterator(MI)),
-                  BB->end());
+                  llvm::next(MachineBasicBlock::iterator(MI)), BB->end());
   exitMBB->transferSuccessorsAndUpdatePHIs(BB);
 
   //  thisMBB:
@@ -958,7 +951,7 @@ MipsTargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
   BuildMI(BB, DL, TII->get(SC), Success).addReg(StoreVal).addReg(Ptr).addImm(0);
   BuildMI(BB, DL, TII->get(BEQ)).addReg(Success).addReg(ZERO).addMBB(loopMBB);
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  MI->eraseFromParent(); // The instruction is gone now.
 
   return exitMBB;
 }
@@ -969,15 +962,13 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
                                              unsigned Size, unsigned BinOpcode,
                                              bool Nand) const {
   assert((Size == 1 || Size == 2) &&
-      "Unsupported size for EmitAtomicBinaryPartial.");
+         "Unsupported size for EmitAtomicBinaryPartial.");
 
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &RegInfo = MF->getRegInfo();
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
-  unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-  unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
 
   unsigned Dest = MI->getOperand(0).getReg();
   unsigned Ptr = MI->getOperand(1).getReg();
@@ -1039,13 +1030,20 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
   BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
     .addReg(Ptr).addReg(MaskLSB2);
   BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
-  BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  if (Subtarget->isLittle()) {
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  } else {
+    unsigned Off = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+  }
   BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
     .addReg(Mips::ZERO).addImm(MaskImm);
   BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
-    .addReg(ShiftAmt).addReg(MaskUpper);
+    .addReg(MaskUpper).addReg(ShiftAmt);
   BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
-  BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(ShiftAmt).addReg(Incr);
+  BuildMI(BB, DL, TII->get(Mips::SLLV), Incr2).addReg(Incr).addReg(ShiftAmt);
 
   // atomic.load.binop
   // loopMBB:
@@ -1067,7 +1065,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
   //   beq     success,$0,loopMBB
 
   BB = loopMBB;
-  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
   if (Nand) {
     //  and andres, oldval, incr2
     //  nor binopres, $0, andres
@@ -1081,7 +1079,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
     //  and newval, binopres, mask
     BuildMI(BB, DL, TII->get(BinOpcode), BinOpRes).addReg(OldVal).addReg(Incr2);
     BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(BinOpRes).addReg(Mask);
-  } else {// atomic.swap
+  } else { // atomic.swap
     //  and newval, incr2, mask
     BuildMI(BB, DL, TII->get(Mips::AND), NewVal).addReg(Incr2).addReg(Mask);
   }
@@ -1090,7 +1088,7 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
     .addReg(OldVal).addReg(Mask2);
   BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal0).addReg(NewVal);
-  BuildMI(BB, DL, TII->get(SC), Success)
+  BuildMI(BB, DL, TII->get(Mips::SC), Success)
     .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::BEQ))
     .addReg(Success).addReg(Mips::ZERO).addMBB(loopMBB);
@@ -1106,21 +1104,20 @@ MipsTargetLowering::emitAtomicBinaryPartword(MachineInstr *MI,
   BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal1)
     .addReg(OldVal).addReg(Mask);
   BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
-      .addReg(ShiftAmt).addReg(MaskedOldVal1);
+      .addReg(MaskedOldVal1).addReg(ShiftAmt);
   BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
       .addReg(SrlRes).addImm(ShiftImm);
   BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
       .addReg(SllRes).addImm(ShiftImm);
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  MI->eraseFromParent(); // The instruction is gone now.
 
   return exitMBB;
 }
 
-MachineBasicBlock *
-MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
-                                      MachineBasicBlock *BB,
-                                      unsigned Size) const {
+MachineBasicBlock * MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
+                                                          MachineBasicBlock *BB,
+                                                          unsigned Size) const {
   assert((Size == 4 || Size == 8) && "Unsupported size for EmitAtomicCmpSwap.");
 
   MachineFunction *MF = BB->getParent();
@@ -1131,15 +1128,14 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
   unsigned LL, SC, ZERO, BNE, BEQ;
 
   if (Size == 4) {
-    LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-    SC = IsN64 ? Mips::SC_P8 : Mips::SC;
+    LL = Mips::LL;
+    SC = Mips::SC;
     ZERO = Mips::ZERO;
     BNE = Mips::BNE;
     BEQ = Mips::BEQ;
-  }
-  else {
-    LL = IsN64 ? Mips::LLD_P8 : Mips::LLD;
-    SC = IsN64 ? Mips::SCD_P8 : Mips::SCD;
+  } else {
+    LL = Mips::LLD;
+    SC = Mips::SCD;
     ZERO = Mips::ZERO_64;
     BNE = Mips::BNE64;
     BEQ = Mips::BEQ64;
@@ -1194,7 +1190,7 @@ MipsTargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
   BuildMI(BB, DL, TII->get(BEQ))
     .addReg(Success).addReg(ZERO).addMBB(loop1MBB);
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  MI->eraseFromParent(); // The instruction is gone now.
 
   return exitMBB;
 }
@@ -1211,8 +1207,6 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   const TargetRegisterClass *RC = getRegClassFor(MVT::i32);
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   DebugLoc DL = MI->getDebugLoc();
-  unsigned LL = IsN64 ? Mips::LL_P8 : Mips::LL;
-  unsigned SC = IsN64 ? Mips::SC_P8 : Mips::SC;
 
   unsigned Dest    = MI->getOperand(0).getReg();
   unsigned Ptr     = MI->getOperand(1).getReg();
@@ -1282,27 +1276,34 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   BuildMI(BB, DL, TII->get(Mips::AND), AlignedAddr)
     .addReg(Ptr).addReg(MaskLSB2);
   BuildMI(BB, DL, TII->get(Mips::ANDi), PtrLSB2).addReg(Ptr).addImm(3);
-  BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  if (Subtarget->isLittle()) {
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(PtrLSB2).addImm(3);
+  } else {
+    unsigned Off = RegInfo.createVirtualRegister(RC);
+    BuildMI(BB, DL, TII->get(Mips::XORi), Off)
+      .addReg(PtrLSB2).addImm((Size == 1) ? 3 : 2);
+    BuildMI(BB, DL, TII->get(Mips::SLL), ShiftAmt).addReg(Off).addImm(3);
+  }
   BuildMI(BB, DL, TII->get(Mips::ORi), MaskUpper)
     .addReg(Mips::ZERO).addImm(MaskImm);
   BuildMI(BB, DL, TII->get(Mips::SLLV), Mask)
-    .addReg(ShiftAmt).addReg(MaskUpper);
+    .addReg(MaskUpper).addReg(ShiftAmt);
   BuildMI(BB, DL, TII->get(Mips::NOR), Mask2).addReg(Mips::ZERO).addReg(Mask);
   BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedCmpVal)
     .addReg(CmpVal).addImm(MaskImm);
   BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedCmpVal)
-    .addReg(ShiftAmt).addReg(MaskedCmpVal);
+    .addReg(MaskedCmpVal).addReg(ShiftAmt);
   BuildMI(BB, DL, TII->get(Mips::ANDi), MaskedNewVal)
     .addReg(NewVal).addImm(MaskImm);
   BuildMI(BB, DL, TII->get(Mips::SLLV), ShiftedNewVal)
-    .addReg(ShiftAmt).addReg(MaskedNewVal);
+    .addReg(MaskedNewVal).addReg(ShiftAmt);
 
   //  loop1MBB:
   //    ll      oldval,0(alginedaddr)
   //    and     maskedoldval0,oldval,mask
   //    bne     maskedoldval0,shiftedcmpval,sinkMBB
   BB = loop1MBB;
-  BuildMI(BB, DL, TII->get(LL), OldVal).addReg(AlignedAddr).addImm(0);
+  BuildMI(BB, DL, TII->get(Mips::LL), OldVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::AND), MaskedOldVal0)
     .addReg(OldVal).addReg(Mask);
   BuildMI(BB, DL, TII->get(Mips::BNE))
@@ -1318,7 +1319,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
     .addReg(OldVal).addReg(Mask2);
   BuildMI(BB, DL, TII->get(Mips::OR), StoreVal)
     .addReg(MaskedOldVal1).addReg(ShiftedNewVal);
-  BuildMI(BB, DL, TII->get(SC), Success)
+  BuildMI(BB, DL, TII->get(Mips::SC), Success)
       .addReg(StoreVal).addReg(AlignedAddr).addImm(0);
   BuildMI(BB, DL, TII->get(Mips::BEQ))
       .addReg(Success).addReg(Mips::ZERO).addMBB(loop1MBB);
@@ -1331,7 +1332,7 @@ MipsTargetLowering::emitAtomicCmpSwapPartword(MachineInstr *MI,
   int64_t ShiftImm = (Size == 1) ? 24 : 16;
 
   BuildMI(BB, DL, TII->get(Mips::SRLV), SrlRes)
-      .addReg(ShiftAmt).addReg(MaskedOldVal0);
+      .addReg(MaskedOldVal0).addReg(ShiftAmt);
   BuildMI(BB, DL, TII->get(Mips::SLL), SllRes)
       .addReg(SrlRes).addImm(ShiftImm);
   BuildMI(BB, DL, TII->get(Mips::SRA), Dest)
@@ -1349,7 +1350,7 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Table = Op.getOperand(1);
   SDValue Index = Op.getOperand(2);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT PTy = getPointerTy();
   unsigned EntrySize =
     DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(*getDataLayout());
@@ -1375,14 +1376,12 @@ SDValue MipsTargetLowering::lowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BRIND, DL, MVT::Other, Chain, Addr);
 }
 
-SDValue MipsTargetLowering::
-lowerBRCOND(SDValue Op, SelectionDAG &DAG) const
-{
+SDValue MipsTargetLowering::lowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   // The first operand is the chain, the second is the condition, the third is
   // the block to branch to if the condition is true.
   SDValue Chain = Op.getOperand(0);
   SDValue Dest = Op.getOperand(2);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   SDValue CondRes = createFPCmp(DAG, Op.getOperand(1));
 
@@ -1395,8 +1394,9 @@ lowerBRCOND(SDValue Op, SelectionDAG &DAG) const
     (Mips::CondCode)cast<ConstantSDNode>(CCNode)->getZExtValue();
   unsigned Opc = invertFPCondCodeUser(CC) ? Mips::BRANCH_F : Mips::BRANCH_T;
   SDValue BrCode = DAG.getConstant(Opc, MVT::i32);
+  SDValue FCC0 = DAG.getRegister(Mips::FCC0, MVT::i32);
   return DAG.getNode(MipsISD::FPBrcond, DL, Op.getValueType(), Chain, BrCode,
-                     Dest, CondRes);
+                     FCC0, Dest, CondRes);
 }
 
 SDValue MipsTargetLowering::
@@ -1409,15 +1409,16 @@ lowerSELECT(SDValue Op, SelectionDAG &DAG) const
     return Op;
 
   return createCMovFP(DAG, Cond, Op.getOperand(1), Op.getOperand(2),
-                      Op.getDebugLoc());
+                      SDLoc(Op));
 }
 
 SDValue MipsTargetLowering::
 lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
 {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT Ty = Op.getOperand(0).getValueType();
-  SDValue Cond = DAG.getNode(ISD::SETCC, DL, getSetCCResultType(Ty),
+  SDValue Cond = DAG.getNode(ISD::SETCC, DL,
+                             getSetCCResultType(*DAG.getContext(), Ty),
                              Op.getOperand(0), Op.getOperand(1),
                              Op.getOperand(4));
 
@@ -1434,14 +1435,16 @@ SDValue MipsTargetLowering::lowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   SDValue True  = DAG.getConstant(1, MVT::i32);
   SDValue False = DAG.getConstant(0, MVT::i32);
 
-  return createCMovFP(DAG, Cond, True, False, Op.getDebugLoc());
+  return createCMovFP(DAG, Cond, True, False, SDLoc(Op));
 }
 
 SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
                                                SelectionDAG &DAG) const {
   // FIXME there isn't actually debug info here
-  DebugLoc DL = Op.getDebugLoc();
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDLoc DL(Op);
+  EVT Ty = Op.getValueType();
+  GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = N->getGlobal();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
     const MipsTargetObjectFile &TLOF =
@@ -1458,26 +1461,31 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
     }
 
     // %hi/%lo relocation
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
   }
 
   if (GV->hasInternalLinkage() || (GV->hasLocalLinkage() && !isa<Function>(GV)))
-    return getAddrLocal(Op, DAG, HasMips64);
+    return getAddrLocal(N, Ty, DAG, HasMips64);
 
   if (LargeGOT)
-    return getAddrGlobalLargeGOT(Op, DAG, MipsII::MO_GOT_HI16,
-                                 MipsII::MO_GOT_LO16);
+    return getAddrGlobalLargeGOT(N, Ty, DAG, MipsII::MO_GOT_HI16,
+                                 MipsII::MO_GOT_LO16, DAG.getEntryNode(),
+                                 MachinePointerInfo::getGOT());
 
-  return getAddrGlobal(Op, DAG,
-                       HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16);
+  return getAddrGlobal(N, Ty, DAG,
+                       HasMips64 ? MipsII::MO_GOT_DISP : MipsII::MO_GOT16,
+                       DAG.getEntryNode(), MachinePointerInfo::getGOT());
 }
 
 SDValue MipsTargetLowering::lowerBlockAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
+  BlockAddressSDNode *N = cast<BlockAddressSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
 
-  return getAddrLocal(Op, DAG, HasMips64);
+  return getAddrLocal(N, Ty, DAG, HasMips64);
 }
 
 SDValue MipsTargetLowering::
@@ -1488,7 +1496,7 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
   // Local Exec TLS Model.
 
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
-  DebugLoc DL = GA->getDebugLoc();
+  SDLoc DL(GA);
   const GlobalValue *GV = GA->getGlobal();
   EVT PtrVT = getPointerTy();
 
@@ -1564,10 +1572,13 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
 SDValue MipsTargetLowering::
 lowerJumpTable(SDValue Op, SelectionDAG &DAG) const
 {
+  JumpTableSDNode *N = cast<JumpTableSDNode>(Op);
+  EVT Ty = Op.getValueType();
+
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
 
-  return getAddrLocal(Op, DAG, HasMips64);
+  return getAddrLocal(N, Ty, DAG, HasMips64);
 }
 
 SDValue MipsTargetLowering::
@@ -1582,18 +1593,20 @@ lowerConstantPool(SDValue Op, SelectionDAG &DAG) const
   //  SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
   //  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
   //  ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode);
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  EVT Ty = Op.getValueType();
 
   if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64)
-    return getAddrNonPIC(Op, DAG);
+    return getAddrNonPIC(N, Ty, DAG);
 
-  return getAddrLocal(Op, DAG, HasMips64);
+  return getAddrLocal(N, Ty, DAG, HasMips64);
 }
 
 SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
 
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue FI = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                  getPointerTy());
 
@@ -1604,12 +1617,13 @@ SDValue MipsTargetLowering::lowerVASTART(SDValue Op, SelectionDAG &DAG) const {
                       MachinePointerInfo(SV), false, false, 0);
 }
 
-static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
   EVT TyX = Op.getOperand(0).getValueType();
   EVT TyY = Op.getOperand(1).getValueType();
   SDValue Const1 = DAG.getConstant(1, MVT::i32);
   SDValue Const31 = DAG.getConstant(31, MVT::i32);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue Res;
 
   // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
@@ -1623,7 +1637,7 @@ static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
     DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(1),
                 Const1);
 
-  if (HasR2) {
+  if (HasExtractInsert) {
     // ext  E, Y, 31, 1  ; extract bit31 of Y
     // ins  X, E, 31, 1  ; insert extracted bit at bit31 of X
     SDValue E = DAG.getNode(MipsISD::Ext, DL, MVT::i32, Y, Const31, Const1);
@@ -1649,18 +1663,19 @@ static SDValue lowerFCOPYSIGN32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
 }
 
-static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
+                                bool HasExtractInsert) {
   unsigned WidthX = Op.getOperand(0).getValueSizeInBits();
   unsigned WidthY = Op.getOperand(1).getValueSizeInBits();
   EVT TyX = MVT::getIntegerVT(WidthX), TyY = MVT::getIntegerVT(WidthY);
   SDValue Const1 = DAG.getConstant(1, MVT::i32);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // Bitcast to integer nodes.
   SDValue X = DAG.getNode(ISD::BITCAST, DL, TyX, Op.getOperand(0));
   SDValue Y = DAG.getNode(ISD::BITCAST, DL, TyY, Op.getOperand(1));
 
-  if (HasR2) {
+  if (HasExtractInsert) {
     // ext  E, Y, width(Y) - 1, 1  ; extract bit width(Y)-1 of Y
     // ins  X, E, width(X) - 1, 1  ; insert extracted bit at bit width(X)-1 of X
     SDValue E = DAG.getNode(MipsISD::Ext, DL, TyY, Y,
@@ -1700,14 +1715,15 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
 SDValue
 MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget->hasMips64())
-    return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasMips32r2());
+    return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasExtractInsert());
 
-  return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasMips32r2());
+  return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasExtractInsert());
 }
 
-static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG,
+                           bool HasExtractInsert) {
   SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
   // to i32.
@@ -1717,7 +1733,7 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
                 Const1);
 
   // Clear MSB.
-  if (HasR2)
+  if (HasExtractInsert)
     Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32,
                       DAG.getRegister(Mips::ZERO, MVT::i32),
                       DAG.getConstant(31, MVT::i32), Const1, X);
@@ -1734,15 +1750,16 @@ static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG, bool HasR2) {
   return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
 }
 
-static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
+static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG,
+                           bool HasExtractInsert) {
   SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // Bitcast to integer node.
   SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0));
 
   // Clear MSB.
-  if (HasR2)
+  if (HasExtractInsert)
     Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64,
                       DAG.getRegister(Mips::ZERO_64, MVT::i64),
                       DAG.getConstant(63, MVT::i32), Const1, X);
@@ -1757,9 +1774,9 @@ static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG, bool HasR2) {
 SDValue
 MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
   if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64))
-    return lowerFABS64(Op, DAG, Subtarget->hasMips32r2());
+    return lowerFABS64(Op, DAG, Subtarget->hasExtractInsert());
 
-  return lowerFABS32(Op, DAG, Subtarget->hasMips32r2());
+  return lowerFABS32(Op, DAG, Subtarget->hasExtractInsert());
 }
 
 SDValue MipsTargetLowering::
@@ -1771,7 +1788,7 @@ lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   MFI->setFrameAddressIsTaken(true);
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL,
                                          IsN64 ? Mips::FP_64 : Mips::FP, VT);
   return FrameAddr;
@@ -1791,7 +1808,7 @@ SDValue MipsTargetLowering::lowerRETURNADDR(SDValue Op,
 
   // Return RA, which contains the return address. Mark it an implicit live-in.
   unsigned Reg = MF.addLiveIn(RA, getRegClassFor(VT));
-  return DAG.getCopyFromReg(DAG.getEntryNode(), Op.getDebugLoc(), Reg, VT);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(Op), Reg, VT);
 }
 
 // An EH_RETURN is the result of lowering llvm.eh.return which in turn is
@@ -1807,7 +1824,7 @@ SDValue MipsTargetLowering::lowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
   SDValue Chain     = Op.getOperand(0);
   SDValue Offset    = Op.getOperand(1);
   SDValue Handler   = Op.getOperand(2);
-  DebugLoc DL       = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT Ty = IsN64 ? MVT::i64 : MVT::i32;
 
   // Store stack offset in V1, store jump target in V0. Glue CopyToReg and
@@ -1827,14 +1844,14 @@ SDValue MipsTargetLowering::lowerATOMIC_FENCE(SDValue Op,
   // FIXME: Need pseudo-fence for 'singlethread' fences
   // FIXME: Set SType for weaker fences where supported/appropriate.
   unsigned SType = 0;
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   return DAG.getNode(MipsISD::Sync, DL, MVT::Other, Op.getOperand(0),
                      DAG.getConstant(SType, MVT::i32));
 }
 
 SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
                                                 SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
   SDValue Shamt = Op.getOperand(2);
 
@@ -1865,7 +1882,7 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
 
 SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
                                                  bool IsSRA) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue Lo = Op.getOperand(0), Hi = Op.getOperand(1);
   SDValue Shamt = Op.getOperand(2);
 
@@ -1909,7 +1926,7 @@ static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
   SDValue Ptr = LD->getBasePtr();
   EVT VT = LD->getValueType(0), MemVT = LD->getMemoryVT();
   EVT BasePtrVT = Ptr.getValueType();
-  DebugLoc DL = LD->getDebugLoc();
+  SDLoc DL(LD);
   SDVTList VTList = DAG.getVTList(VT, MVT::Other);
 
   if (Offset)
@@ -1975,7 +1992,7 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   //  (set tmp1, (lwr baseptr, tmp0))
   //  (set tmp2, (shl tmp1, 32))
   //  (set dst, (srl tmp2, 32))
-  DebugLoc DL = LD->getDebugLoc();
+  SDLoc DL(LD);
   SDValue Const32 = DAG.getConstant(32, MVT::i32);
   SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
   SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
@@ -1987,7 +2004,7 @@ static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
                              SDValue Chain, unsigned Offset) {
   SDValue Ptr = SD->getBasePtr(), Value = SD->getValue();
   EVT MemVT = SD->getMemoryVT(), BasePtrVT = Ptr.getValueType();
-  DebugLoc DL = SD->getDebugLoc();
+  SDLoc DL(SD);
   SDVTList VTList = DAG.getVTList(MVT::Other);
 
   if (Offset)
@@ -2000,16 +2017,8 @@ static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
 }
 
 // Expand an unaligned 32 or 64-bit integer store node.
-SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  StoreSDNode *SD = cast<StoreSDNode>(Op);
-  EVT MemVT = SD->getMemoryVT();
-
-  // Return if store is aligned or if MemVT is neither i32 nor i64.
-  if ((SD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
-      ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
-    return SDValue();
-
-  bool IsLittle = Subtarget->isLittle();
+static SDValue lowerUnalignedIntStore(StoreSDNode *SD, SelectionDAG &DAG,
+                                      bool IsLittle) {
   SDValue Value = SD->getValue(), Chain = SD->getChain();
   EVT VT = Value.getValueType();
 
@@ -2036,6 +2045,34 @@ SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   return createStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
 }
 
+// Lower (store (fp_to_sint $fp) $ptr) to (store (TruncIntFP $fp), $ptr).
+static SDValue lowerFP_TO_SINT_STORE(StoreSDNode *SD, SelectionDAG &DAG) {
+  SDValue Val = SD->getValue();
+
+  if (Val.getOpcode() != ISD::FP_TO_SINT)
+    return SDValue();
+
+  EVT FPTy = EVT::getFloatingPointVT(Val.getValueSizeInBits());
+  SDValue Tr = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Val), FPTy,
+                           Val.getOperand(0));
+
+  return DAG.getStore(SD->getChain(), SDLoc(SD), Tr, SD->getBasePtr(),
+                      SD->getPointerInfo(), SD->isVolatile(),
+                      SD->isNonTemporal(), SD->getAlignment());
+}
+
+SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode *SD = cast<StoreSDNode>(Op);
+  EVT MemVT = SD->getMemoryVT();
+
+  // Lower unaligned integer stores.
+  if ((SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
+      ((MemVT == MVT::i32) || (MemVT == MVT::i64)))
+    return lowerUnalignedIntStore(SD, DAG, Subtarget->isLittle());
+
+  return lowerFP_TO_SINT_STORE(SD, DAG);
+}
+
 SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
   if (Op->getOperand(0).getOpcode() != ISD::FRAMEADDR
       || cast<ConstantSDNode>
@@ -2053,10 +2090,18 @@ SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
   EVT ValTy = Op->getValueType(0);
   int FI = MFI->CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
   SDValue InArgsAddr = DAG.getFrameIndex(FI, ValTy);
-  return DAG.getNode(ISD::ADD, Op->getDebugLoc(), ValTy, InArgsAddr,
+  return DAG.getNode(ISD::ADD, SDLoc(Op), ValTy, InArgsAddr,
                      DAG.getConstant(0, ValTy));
 }
 
+SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  EVT FPTy = EVT::getFloatingPointVT(Op.getValueSizeInBits());
+  SDValue Trunc = DAG.getNode(MipsISD::TruncIntFP, SDLoc(Op), FPTy,
+                              Op.getOperand(0));
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), Op.getValueType(), Trunc);
+}
+
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -2076,21 +2121,14 @@ SDValue MipsTargetLowering::lowerADD(SDValue Op, SelectionDAG &DAG) const {
 //  For vararg functions, all arguments are passed in A0, A1, A2, A3 and stack.
 //===----------------------------------------------------------------------===//
 
-static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
-                       MVT LocVT, CCValAssign::LocInfo LocInfo,
-                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
+static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
+                       CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
+                       CCState &State, const uint16_t *F64Regs) {
 
-  static const unsigned IntRegsSize=4, FloatRegsSize=2;
+  static const unsigned IntRegsSize = 4, FloatRegsSize = 2;
 
-  static const uint16_t IntRegs[] = {
-      Mips::A0, Mips::A1, Mips::A2, Mips::A3
-  };
-  static const uint16_t F32Regs[] = {
-      Mips::F12, Mips::F14
-  };
-  static const uint16_t F64Regs[] = {
-      Mips::D6, Mips::D7
-  };
+  static const uint16_t IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+  static const uint16_t F32Regs[] = { Mips::F12, Mips::F14 };
 
   // Do not process byval args here.
   if (ArgFlags.isByVal())
@@ -2159,14 +2197,28 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
   return false;
 }
 
+static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const uint16_t F64Regs[] = { Mips::D6, Mips::D7 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
+static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
+                            MVT LocVT, CCValAssign::LocInfo LocInfo,
+                            ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  static const uint16_t F64Regs[] = { Mips::D12_64, Mips::D14_64 };
+
+  return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
+}
+
 #include "MipsGenCallingConv.inc"
 
 //===----------------------------------------------------------------------===//
 //                  Call Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
-static const unsigned O32IntRegsSize = 4;
-
 // Return next O32 integer argument register.
 static unsigned getNextIntArgReg(unsigned Reg) {
   assert((Reg == Mips::A0) || (Reg == Mips::A2));
@@ -2175,7 +2227,7 @@ static unsigned getNextIntArgReg(unsigned Reg) {
 
 SDValue
 MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
-                                   SDValue Chain, SDValue Arg, DebugLoc DL,
+                                   SDValue Chain, SDValue Arg, SDLoc DL,
                                    bool IsTailCall, SelectionDAG &DAG) const {
   if (!IsTailCall) {
     SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
@@ -2229,6 +2281,15 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   const uint32_t *Mask = TRI->getCallPreservedMask(CLI.CallConv);
   assert(Mask && "Missing call preserved mask for calling convention");
+  if (Subtarget->inMips16HardFloat()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(CLI.Callee)) {
+      llvm::StringRef Sym = G->getGlobal()->getName();
+      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+      if (F->hasFnAttribute("__Mips16RetHelper")) {
+        Mask = MipsRegisterInfo::getMips16RetHelperMask();
+      }
+    }
+  }
   Ops.push_back(CLI.DAG.getRegisterMask(Mask));
 
   if (InFlag.getNode())
@@ -2241,10 +2302,10 @@ SDValue
 MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                               SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &DL                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc DL                              = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &IsTailCall                      = CLI.IsTailCall;
@@ -2254,16 +2315,20 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
+  MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
   bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  MipsCC::SpecialCallingConvType SpecialCallingConv =
+    getSpecialCallingConv(Callee);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo,
+                    SpecialCallingConv);
 
   MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
-                                 getTargetMachine().Options.UseSoftFloat,
+                                 Subtarget->mipsSEUsesSoftFloat(),
                                  Callee.getNode(), CLI.Args);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -2286,7 +2351,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
 
   if (!IsTailCall)
-    Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal);
+    Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL);
 
   SDValue StackPtr = DAG.getCopyFromReg(Chain, DL,
                                         IsN64 ? Mips::SP_64 : Mips::SP,
@@ -2380,32 +2445,40 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   bool IsPICCall = (IsN64 || IsPIC); // true if calls are translated to jalr $25
   bool GlobalOrExternal = false, InternalLinkage = false;
   SDValue CalleeLo;
+  EVT Ty = Callee.getValueType();
 
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     if (IsPICCall) {
-      InternalLinkage = G->getGlobal()->hasInternalLinkage();
+      const GlobalValue *Val = G->getGlobal();
+      InternalLinkage = Val->hasInternalLinkage();
 
       if (InternalLinkage)
-        Callee = getAddrLocal(Callee, DAG, HasMips64);
+        Callee = getAddrLocal(G, Ty, DAG, HasMips64);
       else if (LargeGOT)
-        Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
-                                       MipsII::MO_CALL_LO16);
+        Callee = getAddrGlobalLargeGOT(G, Ty, DAG, MipsII::MO_CALL_HI16,
+                                       MipsII::MO_CALL_LO16, Chain,
+                                       FuncInfo->callPtrInfo(Val));
       else
-        Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+        Callee = getAddrGlobal(G, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                               FuncInfo->callPtrInfo(Val));
     } else
       Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy(), 0,
                                           MipsII::MO_NO_FLAG);
     GlobalOrExternal = true;
   }
   else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+
     if (!IsN64 && !IsPIC) // !N64 && static
-      Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(),
                                             MipsII::MO_NO_FLAG);
     else if (LargeGOT)
-      Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16,
-                                     MipsII::MO_CALL_LO16);
+      Callee = getAddrGlobalLargeGOT(S, Ty, DAG, MipsII::MO_CALL_HI16,
+                                     MipsII::MO_CALL_LO16, Chain,
+                                     FuncInfo->callPtrInfo(Sym));
     else // N64 || PIC
-      Callee = getAddrGlobal(Callee, DAG, MipsII::MO_GOT_CALL);
+      Callee = getAddrGlobal(S, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
+                             FuncInfo->callPtrInfo(Sym));
 
     GlobalOrExternal = true;
   }
@@ -2424,7 +2497,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Create the CALLSEQ_END node.
   Chain = DAG.getCALLSEQ_END(Chain, NextStackOffsetVal,
-                             DAG.getIntPtrConstant(0, true), InFlag);
+                             DAG.getIntPtrConstant(0, true), InFlag, DL);
   InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
@@ -2439,7 +2512,7 @@ SDValue
 MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                     CallingConv::ID CallConv, bool IsVarArg,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc DL, SelectionDAG &DAG,
+                                    SDLoc DL, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals,
                                     const SDNode *CallNode,
                                     const Type *RetTy) const {
@@ -2447,9 +2520,9 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), RVLocs, *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
 
-  MipsCCInfo.analyzeCallResult(Ins, getTargetMachine().Options.UseSoftFloat,
+  MipsCCInfo.analyzeCallResult(Ins, Subtarget->mipsSEUsesSoftFloat(),
                                CallNode, RetTy);
 
   // Copy all of the result registers out of their specified physreg.
@@ -2478,7 +2551,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
                                          CallingConv::ID CallConv,
                                          bool IsVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                         DebugLoc DL, SelectionDAG &DAG,
+                                         SDLoc DL, SelectionDAG &DAG,
                                          SmallVectorImpl<SDValue> &InVals)
                                           const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -2494,10 +2567,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
   Function::const_arg_iterator FuncArg =
     DAG.getMachineFunction().getFunction()->arg_begin();
-  bool UseSoftFloat = getTargetMachine().Options.UseSoftFloat;
+  bool UseSoftFloat = Subtarget->mipsSEUsesSoftFloat();
 
   MipsCCInfo.analyzeFormalArguments(Ins, UseSoftFloat, FuncArg);
   MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
@@ -2526,21 +2599,9 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
     // Arguments stored on registers
     if (IsRegLoc) {
-      EVT RegVT = VA.getLocVT();
+      MVT RegVT = VA.getLocVT();
       unsigned ArgReg = VA.getLocReg();
-      const TargetRegisterClass *RC;
-
-      if (RegVT == MVT::i32)
-        RC = Subtarget->inMips16Mode()? &Mips::CPU16RegsRegClass :
-                                        &Mips::CPURegsRegClass;
-      else if (RegVT == MVT::i64)
-        RC = &Mips::CPU64RegsRegClass;
-      else if (RegVT == MVT::f32)
-        RC = &Mips::FGR32RegClass;
-      else if (RegVT == MVT::f64)
-        RC = HasMips64 ? &Mips::FGR64RegClass : &Mips::AFGR64RegClass;
-      else
-        llvm_unreachable("RegVT not supported by FormalArguments Lowering");
+      const TargetRegisterClass *RC = getRegClassFor(RegVT);
 
       // Transform the arguments stored on
       // physical registers into virtual ones
@@ -2590,9 +2651,11 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
 
       // Create load nodes to retrieve arguments from the stack
       SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      InVals.push_back(DAG.getLoad(ValVT, DL, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
-                                   false, false, false, 0));
+      SDValue Load = DAG.getLoad(ValVT, DL, Chain, FIN,
+                                 MachinePointerInfo::getFixedStack(FI),
+                                 false, false, false, 0);
+      InVals.push_back(Load);
+      OutChains.push_back(Load.getValue(1));
     }
   }
 
@@ -2644,7 +2707,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
                                 CallingConv::ID CallConv, bool IsVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
-                                DebugLoc DL, SelectionDAG &DAG) const {
+                                SDLoc DL, SelectionDAG &DAG) const {
   // CCValAssign - represent the assignment of
   // the return value to a location
   SmallVector<CCValAssign, 16> RVLocs;
@@ -2653,10 +2716,10 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
   // CCState - Info about the registers and stack slot.
   CCState CCInfo(CallConv, IsVarArg, MF, getTargetMachine(), RVLocs,
                  *DAG.getContext());
-  MipsCC MipsCCInfo(CallConv, IsO32, CCInfo);
+  MipsCC MipsCCInfo(CallConv, IsO32, Subtarget->isFP64bit(), CCInfo);
 
   // Analyze return values.
-  MipsCCInfo.analyzeReturn(Outs, getTargetMachine().Options.UseSoftFloat,
+  MipsCCInfo.analyzeReturn(Outs, Subtarget->mipsSEUsesSoftFloat(),
                            MF.getFunction()->getReturnType());
 
   SDValue Flag;
@@ -2715,7 +2778,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
 MipsTargetLowering::ConstraintType MipsTargetLowering::
 getConstraintType(const std::string &Constraint) const
 {
-  // Mips specific constrainy
+  // Mips specific constraints
   // GCC config/mips/constraints.md
   //
   // 'd' : An address register. Equivalent to r
@@ -2766,16 +2829,19 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
     if (type->isIntegerTy())
       weight = CW_Register;
     break;
-  case 'f':
-    if (type->isFloatTy())
+  case 'f': // FPU or MSA register
+    if (Subtarget->hasMSA() && type->isVectorTy() &&
+        cast<VectorType>(type)->getBitWidth() == 128)
+      weight = CW_Register;
+    else if (type->isFloatTy())
       weight = CW_Register;
     break;
   case 'c': // $25 for indirect jumps
   case 'l': // lo register
   case 'x': // hilo register pair
-      if (type->isIntegerTy())
+    if (type->isIntegerTy())
       weight = CW_SpecificReg;
-      break;
+    break;
   case 'I': // signed 16 bit immediate
   case 'J': // integer zero
   case 'K': // unsigned 16 bit immediate
@@ -2793,11 +2859,109 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
   return weight;
 }
 
+/// This is a helper function to parse a physical register string and split it
+/// into non-numeric and numeric parts (Prefix and Reg). The first boolean flag
+/// that is returned indicates whether parsing was successful. The second flag
+/// is true if the numeric part exists.
+static std::pair<bool, bool>
+parsePhysicalReg(const StringRef &C, std::string &Prefix,
+                 unsigned long long &Reg) {
+  if (C.front() != '{' || C.back() != '}')
+    return std::make_pair(false, false);
+
+  // Search for the first numeric character.
+  StringRef::const_iterator I, B = C.begin() + 1, E = C.end() - 1;
+  I = std::find_if(B, E, std::ptr_fun(isdigit));
+
+  Prefix.assign(B, I - B);
+
+  // The second flag is set to false if no numeric characters were found.
+  if (I == E)
+    return std::make_pair(true, false);
+
+  // Parse the numeric characters.
+  return std::make_pair(!getAsUnsignedInteger(StringRef(I, E - I), 10, Reg),
+                        true);
+}
+
+std::pair<unsigned, const TargetRegisterClass *> MipsTargetLowering::
+parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterClass *RC;
+  std::string Prefix;
+  unsigned long long Reg;
+
+  std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
+
+  if (!R.first)
+    return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+  if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo.
+    // No numeric characters follow "hi" or "lo".
+    if (R.second)
+      return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+    RC = TRI->getRegClass(Prefix == "hi" ?
+                          Mips::HI32RegClassID : Mips::LO32RegClassID);
+    return std::make_pair(*(RC->begin()), RC);
+  } else if (Prefix.compare(0, 4, "$msa") == 0) {
+    // Parse $msa(ir|csr|access|save|modify|request|map|unmap)
+
+    // No numeric characters follow the name.
+    if (R.second)
+      return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+
+    Reg = StringSwitch<unsigned long long>(Prefix)
+              .Case("$msair", Mips::MSAIR)
+              .Case("$msacsr", Mips::MSACSR)
+              .Case("$msaaccess", Mips::MSAAccess)
+              .Case("$msasave", Mips::MSASave)
+              .Case("$msamodify", Mips::MSAModify)
+              .Case("$msarequest", Mips::MSARequest)
+              .Case("$msamap", Mips::MSAMap)
+              .Case("$msaunmap", Mips::MSAUnmap)
+              .Default(0);
+
+    if (!Reg)
+      return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+
+    RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
+    return std::make_pair(Reg, RC);
+  }
+
+  if (!R.second)
+    return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+
+  if (Prefix == "$f") { // Parse $f0-$f31.
+    // If the size of FP registers is 64-bit or Reg is an even number, select
+    // the 64-bit register class. Otherwise, select the 32-bit register class.
+    if (VT == MVT::Other)
+      VT = (Subtarget->isFP64bit() || !(Reg % 2)) ? MVT::f64 : MVT::f32;
+
+    RC = getRegClassFor(VT);
+
+    if (RC == &Mips::AFGR64RegClass) {
+      assert(Reg % 2 == 0);
+      Reg >>= 1;
+    }
+  } else if (Prefix == "$fcc") // Parse $fcc0-$fcc7.
+    RC = TRI->getRegClass(Mips::FCCRegClassID);
+  else if (Prefix == "$w") { // Parse $w0-$w31.
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::v16i8 : VT);
+  } else { // Parse $0-$31.
+    assert(Prefix == "$");
+    RC = getRegClassFor((VT == MVT::Other) ? MVT::i32 : VT);
+  }
+
+  assert(Reg < RC->getNumRegs());
+  return std::make_pair(*(RC->begin() + Reg), RC);
+}
+
 /// Given a register class constraint, like 'r', if this corresponds directly
 /// to an LLVM register class, return a register of 0 and the register class
 /// pointer.
 std::pair<unsigned, const TargetRegisterClass*> MipsTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
+getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
 {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
@@ -2807,18 +2971,26 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
       if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
         if (Subtarget->inMips16Mode())
           return std::make_pair(0U, &Mips::CPU16RegsRegClass);
-        return std::make_pair(0U, &Mips::CPURegsRegClass);
+        return std::make_pair(0U, &Mips::GPR32RegClass);
       }
       if (VT == MVT::i64 && !HasMips64)
-        return std::make_pair(0U, &Mips::CPURegsRegClass);
+        return std::make_pair(0U, &Mips::GPR32RegClass);
       if (VT == MVT::i64 && HasMips64)
-        return std::make_pair(0U, &Mips::CPU64RegsRegClass);
+        return std::make_pair(0U, &Mips::GPR64RegClass);
       // This will generate an error message
       return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
-    case 'f':
-      if (VT == MVT::f32)
+    case 'f': // FPU or MSA register
+      if (VT == MVT::v16i8)
+        return std::make_pair(0U, &Mips::MSA128BRegClass);
+      else if (VT == MVT::v8i16 || VT == MVT::v8f16)
+        return std::make_pair(0U, &Mips::MSA128HRegClass);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return std::make_pair(0U, &Mips::MSA128WRegClass);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return std::make_pair(0U, &Mips::MSA128DRegClass);
+      else if (VT == MVT::f32)
         return std::make_pair(0U, &Mips::FGR32RegClass);
-      if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
+      else if ((VT == MVT::f64) && (!Subtarget->isSingleFloat())) {
         if (Subtarget->isFP64bit())
           return std::make_pair(0U, &Mips::FGR64RegClass);
         return std::make_pair(0U, &Mips::AFGR64RegClass);
@@ -2826,19 +2998,26 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
       break;
     case 'c': // register suitable for indirect jump
       if (VT == MVT::i32)
-        return std::make_pair((unsigned)Mips::T9, &Mips::CPURegsRegClass);
+        return std::make_pair((unsigned)Mips::T9, &Mips::GPR32RegClass);
       assert(VT == MVT::i64 && "Unexpected type.");
-      return std::make_pair((unsigned)Mips::T9_64, &Mips::CPU64RegsRegClass);
+      return std::make_pair((unsigned)Mips::T9_64, &Mips::GPR64RegClass);
     case 'l': // register suitable for indirect jump
       if (VT == MVT::i32)
-        return std::make_pair((unsigned)Mips::LO, &Mips::LORegsRegClass);
-      return std::make_pair((unsigned)Mips::LO64, &Mips::LORegs64RegClass);
+        return std::make_pair((unsigned)Mips::LO0, &Mips::LO32RegClass);
+      return std::make_pair((unsigned)Mips::LO0_64, &Mips::LO64RegClass);
     case 'x': // register suitable for indirect jump
       // Fixme: Not triggering the use of both hi and low
       // This will generate an error message
       return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
     }
   }
+
+  std::pair<unsigned, const TargetRegisterClass *> R;
+  R = parseRegForInlineAsmConstraint(Constraint, VT);
+
+  if (R.second)
+    return R;
+
   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
 }
 
@@ -2937,8 +3116,8 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
-bool
-MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM, Type *Ty) const {
+bool MipsTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                               Type *Ty) const {
   // No global is ever allowed as a base.
   if (AM.BaseGV)
     return false;
@@ -3002,13 +3181,13 @@ static bool isF128SoftLibCall(const char *CallSym) {
      "log10l", "log2l", "logl", "nearbyintl", "powl", "rintl", "sinl", "sqrtl",
      "truncl"};
 
-  const char * const *End = LibCalls + array_lengthof(LibCalls);
+  const char *const *End = LibCalls + array_lengthof(LibCalls);
 
   // Check that LibCalls is sorted alphabetically.
   MipsTargetLowering::LTStr Comp;
 
 #ifndef NDEBUG
-  for (const char * const *I = LibCalls; I < End - 1; ++I)
+  for (const char *const *I = LibCalls; I < End - 1; ++I)
     assert(Comp(*I, *(I + 1)));
 #endif
 
@@ -3029,13 +3208,32 @@ static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
   return (ES && Ty->isIntegerTy(128) && isF128SoftLibCall(ES->getSymbol()));
 }
 
-MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CC, bool IsO32_,
-                                   CCState &Info)
-  : CCInfo(Info), CallConv(CC), IsO32(IsO32_) {
+MipsTargetLowering::MipsCC::SpecialCallingConvType
+  MipsTargetLowering::getSpecialCallingConv(SDValue Callee) const {
+  MipsCC::SpecialCallingConvType SpecialCallingConv =
+    MipsCC::NoSpecialCallingConv;;
+  if (Subtarget->inMips16HardFloat()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      llvm::StringRef Sym = G->getGlobal()->getName();
+      Function *F = G->getGlobal()->getParent()->getFunction(Sym);
+      if (F->hasFnAttribute("__Mips16RetHelper")) {
+        SpecialCallingConv = MipsCC::Mips16RetHelperConv;
+      }
+    }
+  }
+  return SpecialCallingConv;
+}
+
+MipsTargetLowering::MipsCC::MipsCC(
+  CallingConv::ID CC, bool IsO32_, bool IsFP64_, CCState &Info,
+  MipsCC::SpecialCallingConvType SpecialCallingConv_)
+  : CCInfo(Info), CallConv(CC), IsO32(IsO32_), IsFP64(IsFP64_),
+    SpecialCallingConv(SpecialCallingConv_){
   // Pre-allocate reserved argument area.
   CCInfo.AllocateStack(reservedArgArea(), 1);
 }
 
+
 void MipsTargetLowering::MipsCC::
 analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
                     bool IsVarArg, bool IsSoftFloat, const SDNode *CallNode,
@@ -3143,11 +3341,10 @@ analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
   analyzeReturn(Outs, IsSoftFloat, 0, RetTy);
 }
 
-void
-MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
-                                           MVT LocVT,
-                                           CCValAssign::LocInfo LocInfo,
-                                           ISD::ArgFlagsTy ArgFlags) {
+void MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
+                                                MVT LocVT,
+                                                CCValAssign::LocInfo LocInfo,
+                                                ISD::ArgFlagsTy ArgFlags) {
   assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0.");
 
   struct ByValArgInfo ByVal;
@@ -3183,11 +3380,13 @@ llvm::CCAssignFn *MipsTargetLowering::MipsCC::fixedArgFn() const {
   if (CallConv == CallingConv::Fast)
     return CC_Mips_FastCC;
 
-  return IsO32 ? CC_MipsO32 : CC_MipsN;
+  if (SpecialCallingConv == Mips16RetHelperConv)
+    return CC_Mips16RetHelper;
+  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN;
 }
 
 llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const {
-  return IsO32 ? CC_MipsO32 : CC_MipsN_VarArg;
+  return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN_VarArg;
 }
 
 const uint16_t *MipsTargetLowering::MipsCC::shadowRegs() const {
@@ -3233,7 +3432,7 @@ MVT MipsTargetLowering::MipsCC::getRegVT(MVT VT, const Type *OrigTy,
 }
 
 void MipsTargetLowering::
-copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
+copyByValRegs(SDValue Chain, SDLoc DL, std::vector<SDValue> &OutChains,
               SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
               SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
               const MipsCC &CC, const ByValArgInfo &ByVal) const {
@@ -3277,9 +3476,9 @@ copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
 
 // Copy byVal arg to registers and stack.
 void MipsTargetLowering::
-passByValArg(SDValue Chain, DebugLoc DL,
+passByValArg(SDValue Chain, SDLoc DL,
              std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
-             SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
+             SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
              MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
              const MipsCC &CC, const ByValArgInfo &ByVal,
              const ISD::ArgFlagsTy &Flags, bool isLittle) const {
@@ -3365,17 +3564,15 @@ passByValArg(SDValue Chain, DebugLoc DL,
                             DAG.getConstant(Offset, PtrTy));
   SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
                             DAG.getIntPtrConstant(ByVal.Address));
-  Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
-                        DAG.getConstant(MemCpySize, PtrTy), Alignment,
-                        /*isVolatile=*/false, /*AlwaysInline=*/false,
+  Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy),
+                        Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
                         MachinePointerInfo(0), MachinePointerInfo(0));
   MemOpChains.push_back(Chain);
 }
 
-void
-MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
-                                    const MipsCC &CC, SDValue Chain,
-                                    DebugLoc DL, SelectionDAG &DAG) const {
+void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+                                         const MipsCC &CC, SDValue Chain,
+                                         SDLoc DL, SelectionDAG &DAG) const {
   unsigned NumRegs = CC.numIntArgRegs();
   const uint16_t *ArgRegs = CC.intArgRegs();
   const CCState &CCInfo = CC.getCCInfo();
@@ -3393,8 +3590,7 @@ MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
   if (NumRegs == Idx)
     VaArgOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), RegSize);
   else
-    VaArgOffset =
-      (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
+    VaArgOffset = (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
 
   // Record the frame index of the first variable argument
   // which is a value necessary to VASTART.
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 5587e8f..65f68f0 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -17,6 +17,7 @@
 
 #include "Mips.h"
 #include "MipsSubtarget.h"
+#include "MCTargetDesc/MipsBaseInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/Function.h"
@@ -60,8 +61,8 @@ namespace llvm {
       CMovFP_T,
       CMovFP_F,
 
-      // Floating Point Rounding
-      FPRound,
+      // FP-to-int truncation node.
+      TruncIntFP,
 
       // Return
       Ret,
@@ -69,10 +70,11 @@ namespace llvm {
       EH_RETURN,
 
       // Node used to extract integer from accumulator.
-      ExtractLOHI,
+      MFHI,
+      MFLO,
 
       // Node used to insert integers to accumulator.
-      InsertLOHI,
+      MTLOHI,
 
       // Mult nodes.
       Mult,
@@ -152,6 +154,43 @@ namespace llvm {
       SETCC_DSP,
       SELECT_CC_DSP,
 
+      // Vector comparisons.
+      // These take a vector and return a boolean.
+      VALL_ZERO,
+      VANY_ZERO,
+      VALL_NONZERO,
+      VANY_NONZERO,
+
+      // These take a vector and return a vector bitmask.
+      VCEQ,
+      VCLE_S,
+      VCLE_U,
+      VCLT_S,
+      VCLT_U,
+
+      // Element-wise vector max/min.
+      VSMAX,
+      VSMIN,
+      VUMAX,
+      VUMIN,
+
+      // Vector Shuffle with mask as an operand
+      VSHF,  // Generic shuffle
+      SHF,   // 4-element set shuffle.
+      ILVEV, // Interleave even elements
+      ILVOD, // Interleave odd elements
+      ILVL,  // Interleave left elements
+      ILVR,  // Interleave right elements
+      PCKEV, // Pack even elements
+      PCKOD, // Pack odd elements
+
+      // Combined (XOR (OR $a, $b), -1)
+      VNOR,
+
+      // Extended vector element extraction
+      VEXTRACT_SEXT_ELT,
+      VEXTRACT_ZEXT_ELT,
+
       // Load/Store Left/Right nodes.
       LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
       LWR,
@@ -195,7 +234,7 @@ namespace llvm {
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     /// getSetCCResultType - get the ISD::SETCC result ValueType
-    EVT getSetCCResultType(EVT VT) const;
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
 
     virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
@@ -211,12 +250,72 @@ namespace llvm {
   protected:
     SDValue getGlobalReg(SelectionDAG &DAG, EVT Ty) const;
 
-    SDValue getAddrLocal(SDValue Op, SelectionDAG &DAG, bool HasMips64) const;
-
-    SDValue getAddrGlobal(SDValue Op, SelectionDAG &DAG, unsigned Flag) const;
-
-    SDValue getAddrGlobalLargeGOT(SDValue Op, SelectionDAG &DAG,
-                                  unsigned HiFlag, unsigned LoFlag) const;
+    // This method creates the following nodes, which are necessary for
+    // computing a local symbol's address:
+    //
+    // (add (load (wrapper $gp, %got(sym)), %lo(sym))
+    template<class NodeTy>
+    SDValue getAddrLocal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                         bool HasMips64) const {
+      SDLoc DL(N);
+      unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
+      SDValue GOT = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, GOTFlag));
+      SDValue Load = DAG.getLoad(Ty, DL, DAG.getEntryNode(), GOT,
+                                 MachinePointerInfo::getGOT(), false, false,
+                                 false, 0);
+      unsigned LoFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
+      SDValue Lo = DAG.getNode(MipsISD::Lo, DL, Ty,
+                               getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getNode(ISD::ADD, DL, Ty, Load, Lo);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address:
+    //
+    // (load (wrapper $gp, %got(sym)))
+    template<class NodeTy>
+    SDValue getAddrGlobal(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag, SDValue Chain,
+                          const MachinePointerInfo &PtrInfo) const {
+      SDLoc DL(N);
+      SDValue Tgt = DAG.getNode(MipsISD::Wrapper, DL, Ty, getGlobalReg(DAG, Ty),
+                                getTargetNode(N, Ty, DAG, Flag));
+      return DAG.getLoad(Ty, DL, Chain, Tgt, PtrInfo, false, false, false, 0);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a global symbol's address in large-GOT mode:
+    //
+    // (load (wrapper (add %hi(sym), $gp), %lo(sym)))
+    template<class NodeTy>
+    SDValue getAddrGlobalLargeGOT(NodeTy *N, EVT Ty, SelectionDAG &DAG,
+                                  unsigned HiFlag, unsigned LoFlag,
+                                  SDValue Chain,
+                                  const MachinePointerInfo &PtrInfo) const {
+      SDLoc DL(N);
+      SDValue Hi = DAG.getNode(MipsISD::Hi, DL, Ty,
+                               getTargetNode(N, Ty, DAG, HiFlag));
+      Hi = DAG.getNode(ISD::ADD, DL, Ty, Hi, getGlobalReg(DAG, Ty));
+      SDValue Wrapper = DAG.getNode(MipsISD::Wrapper, DL, Ty, Hi,
+                                    getTargetNode(N, Ty, DAG, LoFlag));
+      return DAG.getLoad(Ty, DL, Chain, Wrapper, PtrInfo, false, false, false,
+                         0);
+    }
+
+    // This method creates the following nodes, which are necessary for
+    // computing a symbol's address in non-PIC mode:
+    //
+    // (add %hi(sym), %lo(sym))
+    template<class NodeTy>
+    SDValue getAddrNonPIC(NodeTy *N, EVT Ty, SelectionDAG &DAG) const {
+      SDLoc DL(N);
+      SDValue Hi = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_HI);
+      SDValue Lo = getTargetNode(N, Ty, DAG, MipsII::MO_ABS_LO);
+      return DAG.getNode(ISD::ADD, DL, Ty,
+                         DAG.getNode(MipsISD::Hi, DL, Ty, Hi),
+                         DAG.getNode(MipsISD::Lo, DL, Ty, Lo));
+    }
 
     /// This function fills Ops, which is the list of operands that will later
     /// be used when a function call node is created. It also generates
@@ -240,7 +339,13 @@ namespace llvm {
     /// arguments and inquire about calling convention information.
     class MipsCC {
     public:
-      MipsCC(CallingConv::ID CallConv, bool IsO32, CCState &Info);
+      enum SpecialCallingConvType {
+        Mips16RetHelperConv, NoSpecialCallingConv
+      };
+
+      MipsCC(CallingConv::ID CallConv, bool IsO32, bool IsFP64, CCState &Info,
+             SpecialCallingConvType SpecialCallingConv = NoSpecialCallingConv);
+
 
       void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
                                bool IsVarArg, bool IsSoftFloat,
@@ -275,7 +380,7 @@ namespace llvm {
       /// Return pointer to array of integer argument registers.
       const uint16_t *intArgRegs() const;
 
-      typedef SmallVector<ByValArgInfo, 2>::const_iterator byval_iterator;
+      typedef SmallVectorImpl<ByValArgInfo>::const_iterator byval_iterator;
       byval_iterator byval_begin() const { return ByValArgs.begin(); }
       byval_iterator byval_end() const { return ByValArgs.end(); }
 
@@ -312,9 +417,13 @@ namespace llvm {
 
       CCState &CCInfo;
       CallingConv::ID CallConv;
-      bool IsO32;
+      bool IsO32, IsFP64;
+      SpecialCallingConvType SpecialCallingConv;
       SmallVector<ByValArgInfo, 2> ByValArgs;
     };
+  protected:
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 
     // Subtarget Info
     const MipsSubtarget *Subtarget;
@@ -322,11 +431,32 @@ namespace llvm {
     bool HasMips64, IsN64, IsO32;
 
   private:
+    // Create a TargetGlobalAddress node.
+    SDValue getTargetNode(GlobalAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetExternalSymbol node.
+    SDValue getTargetNode(ExternalSymbolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetBlockAddress node.
+    SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetJumpTable node.
+    SDValue getTargetNode(JumpTableSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    // Create a TargetConstantPool node.
+    SDValue getTargetNode(ConstantPoolSDNode *N, EVT Ty, SelectionDAG &DAG,
+                          unsigned Flag) const;
+
+    MipsCC::SpecialCallingConvType getSpecialCallingConv(SDValue Callee) const;
     // Lower Operand helpers
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
+                            SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals,
                             const SDNode *CallNode, const Type *RetTy) const;
 
@@ -351,9 +481,8 @@ namespace llvm {
     SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG& DAG) const;
     SDValue lowerShiftRightParts(SDValue Op, SelectionDAG& DAG,
                                  bool IsSRA) const;
-    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerADD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
 
     /// isEligibleForTailCallOptimization - Check whether the call is eligible
     /// for tail call optimization.
@@ -365,7 +494,7 @@ namespace llvm {
     /// copyByValArg - Copy argument registers which were used to pass a byval
     /// argument to the stack. Create a stack frame object for the byval
     /// argument.
-    void copyByValRegs(SDValue Chain, DebugLoc DL,
+    void copyByValRegs(SDValue Chain, SDLoc DL,
                        std::vector<SDValue> &OutChains, SelectionDAG &DAG,
                        const ISD::ArgFlagsTy &Flags,
                        SmallVectorImpl<SDValue> &InVals,
@@ -373,9 +502,9 @@ namespace llvm {
                        const MipsCC &CC, const ByValArgInfo &ByVal) const;
 
     /// passByValArg - Pass a byval argument in registers or on stack.
-    void passByValArg(SDValue Chain, DebugLoc DL,
+    void passByValArg(SDValue Chain, SDLoc DL,
                       std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
-                      SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
+                      SmallVectorImpl<SDValue> &MemOpChains, SDValue StackPtr,
                       MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
                       const MipsCC &CC, const ByValArgInfo &ByVal,
                       const ISD::ArgFlagsTy &Flags, bool isLittle) const;
@@ -384,17 +513,17 @@ namespace llvm {
     /// to the stack. Also create a stack frame object for the first variable
     /// argument.
     void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC,
-                         SDValue Chain, DebugLoc DL, SelectionDAG &DAG) const;
+                         SDValue Chain, SDLoc DL, SelectionDAG &DAG) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
 
     SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
-                           SDValue Arg, DebugLoc DL, bool IsTailCall,
+                           SDValue Arg, SDLoc DL, bool IsTailCall,
                            SelectionDAG &DAG) const;
 
     virtual SDValue
@@ -412,7 +541,7 @@ namespace llvm {
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const;
 
     // Inline asm support
     ConstraintType getConstraintType(const std::string &Constraint) const;
@@ -422,9 +551,14 @@ namespace llvm {
     ConstraintWeight getSingleConstraintMatchWeight(
       AsmOperandInfo &info, const char *constraint) const;
 
+    /// This function parses registers that appear in inline-asm constraints.
+    /// It returns pair (0, 0) on failure.
+    std::pair<unsigned, const TargetRegisterClass *>
+    parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const;
+
     std::pair<unsigned, const TargetRegisterClass*>
               getRegForInlineAsmConstraint(const std::string &Constraint,
-              EVT VT) const;
+                                           MVT VT) const;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 6b23057..9f7ce9a 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -24,12 +24,14 @@
 //===----------------------------------------------------------------------===//
 
 // Floating Point Compare and Branch
-def SDT_MipsFPBrcond : SDTypeProfile<0, 2, [SDTCisInt<0>,
-                                            SDTCisVT<1, OtherVT>]>;
+def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisInt<0>,
+                                            SDTCisVT<1, i32>,
+                                            SDTCisVT<2, OtherVT>]>;
 def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<1>,
                                          SDTCisVT<2, i32>]>;
-def SDT_MipsCMovFP : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
-                                          SDTCisSameAs<1, 2>]>;
+def SDT_MipsCMovFP : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisVT<2, i32>,
+                                          SDTCisSameAs<1, 3>]>;
+def SDT_MipsTruncIntFP : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>;
 def SDT_MipsBuildPairF64 : SDTypeProfile<1, 2, [SDTCisVT<0, f64>,
                                                 SDTCisVT<1, i32>,
                                                 SDTCisSameAs<1, 2>]>;
@@ -42,6 +44,7 @@ def MipsCMovFP_T : SDNode<"MipsISD::CMovFP_T", SDT_MipsCMovFP, [SDNPInGlue]>;
 def MipsCMovFP_F : SDNode<"MipsISD::CMovFP_F", SDT_MipsCMovFP, [SDNPInGlue]>;
 def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
                           [SDNPHasChain, SDNPOptInGlue]>;
+def MipsTruncIntFP : SDNode<"MipsISD::TruncIntFP", SDT_MipsTruncIntFP>;
 def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
 def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
                                    SDT_MipsExtractElementF64>;
@@ -86,7 +89,7 @@ def fpimm0neg : PatLeaf<(fpimm), [{
 // Only S32 and D32 are supported right now.
 //===----------------------------------------------------------------------===//
 
-class ADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin, bit IsComm,
+class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
               SDPatternOperator OpNode= null_frag> :
   InstSE<(outs RC:$fd), (ins RC:$fs, RC:$ft),
          !strconcat(opstr, "\t$fd, $fs, $ft"),
@@ -96,15 +99,15 @@ class ADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin, bit IsComm,
 
 multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
                   SDPatternOperator OpNode = null_frag> {
-  def _D32 : ADDS_FT<opstr, AFGR64, Itin, IsComm, OpNode>,
+  def _D32 : ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
              Requires<[NotFP64bit, HasStdEnc]>;
-  def _D64 : ADDS_FT<opstr, FGR64, Itin, IsComm, OpNode>,
+  def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin, IsComm, OpNode>,
              Requires<[IsFP64bit, HasStdEnc]> {
     string DecoderNamespace = "Mips64";
   }
 }
 
-class ABSS_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC,
+class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
               InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
   InstSE<(outs DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"),
          [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR>,
@@ -112,95 +115,87 @@ class ABSS_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC,
 
 multiclass ABSS_M<string opstr, InstrItinClass Itin,
                   SDPatternOperator OpNode= null_frag> {
-  def _D32 : ABSS_FT<opstr, AFGR64, AFGR64, Itin, OpNode>,
+  def _D32 : ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
              Requires<[NotFP64bit, HasStdEnc]>;
-  def _D64 : ABSS_FT<opstr, FGR64, FGR64, Itin, OpNode>,
+  def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
              Requires<[IsFP64bit, HasStdEnc]> {
     string DecoderNamespace = "Mips64";
   }
 }
 
 multiclass ROUND_M<string opstr, InstrItinClass Itin> {
-  def _D32 : ABSS_FT<opstr, FGR32, AFGR64, Itin>,
+  def _D32 : ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>,
              Requires<[NotFP64bit, HasStdEnc]>;
-  def _D64 : ABSS_FT<opstr, FGR32, FGR64, Itin>,
+  def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>,
              Requires<[IsFP64bit, HasStdEnc]> {
     let DecoderNamespace = "Mips64";
   }
 }
 
-class MFC1_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC,
+class MFC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
               InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
   InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
          [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR>;
 
-class MTC1_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC,
+class MTC1_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
               InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
   InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
          [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>;
 
-class MFC1_FT_CCR<string opstr, RegisterClass DstRC, RegisterOperand SrcRC,
-              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
-  InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"),
-         [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR>;
-
-class MTC1_FT_CCR<string opstr, RegisterOperand DstRC, RegisterClass SrcRC,
-              InstrItinClass Itin, SDPatternOperator OpNode= null_frag> :
-  InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"),
-         [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>;
-
-class LW_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
-            Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
-  InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class LW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+            SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs RC:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [(set RC:$rt, (OpNode addrDefault:$addr))], Itin, FrmFI> {
   let DecoderMethod = "DecodeFMem";
+  let mayLoad = 1;
 }
 
-class SW_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
-            Operand MemOpnd, SDPatternOperator OpNode= null_frag> :
-  InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class SW_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
+            SDPatternOperator OpNode= null_frag> :
+  InstSE<(outs), (ins RC:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [(OpNode RC:$rt, addrDefault:$addr)], Itin, FrmFI> {
   let DecoderMethod = "DecodeFMem";
+  let mayStore = 1;
 }
 
-class MADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+class MADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
                SDPatternOperator OpNode = null_frag> :
   InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
          !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
          [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin, FrmFR>;
 
-class NMADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin,
+class NMADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin,
                 SDPatternOperator OpNode = null_frag> :
   InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft),
          !strconcat(opstr, "\t$fd, $fr, $fs, $ft"),
          [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))],
          Itin, FrmFR>;
 
-class LWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC,
+class LWXC1_FT<string opstr, RegisterOperand DRC,
                InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
-  InstSE<(outs DRC:$fd), (ins PRC:$base, PRC:$index),
+  InstSE<(outs DRC:$fd), (ins PtrRC:$base, PtrRC:$index),
          !strconcat(opstr, "\t$fd, ${index}(${base})"),
-         [(set DRC:$fd, (OpNode (add PRC:$base, PRC:$index)))], Itin, FrmFI> {
+         [(set DRC:$fd, (OpNode (add iPTR:$base, iPTR:$index)))], Itin, FrmFI> {
   let AddedComplexity = 20;
 }
 
-class SWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC,
+class SWXC1_FT<string opstr, RegisterOperand DRC,
                InstrItinClass Itin, SDPatternOperator OpNode = null_frag> :
-  InstSE<(outs), (ins DRC:$fs, PRC:$base, PRC:$index),
+  InstSE<(outs), (ins DRC:$fs, PtrRC:$base, PtrRC:$index),
          !strconcat(opstr, "\t$fs, ${index}(${base})"),
-         [(OpNode DRC:$fs, (add PRC:$base, PRC:$index))], Itin, FrmFI> {
+         [(OpNode DRC:$fs, (add iPTR:$base, iPTR:$index))], Itin, FrmFI> {
   let AddedComplexity = 20;
 }
 
 class BC1F_FT<string opstr, InstrItinClass Itin,
               SDPatternOperator Op = null_frag>  :
-  InstSE<(outs), (ins brtarget:$offset), !strconcat(opstr, "\t$offset"),
-         [(MipsFPBrcond Op, bb:$offset)], Itin, FrmFI> {
+  InstSE<(outs), (ins FCCRegsOpnd:$fcc, brtarget:$offset),
+         !strconcat(opstr, "\t$fcc, $offset"),
+         [(MipsFPBrcond Op, FCCRegsOpnd:$fcc, bb:$offset)], Itin, FrmFI> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
   let Defs = [AT];
-  let Uses = [FCR31];
 }
 
 class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin,
@@ -208,17 +203,53 @@ class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin,
   InstSE<(outs), (ins RC:$fs, RC:$ft, condcode:$cond),
          !strconcat("c.$cond.", typestr, "\t$fs, $ft"),
          [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR> {
-  let Defs = [FCR31];
-}
+  let Defs = [FCC0];
+  let isCodeGenOnly = 1;
+}
+
+class C_COND_FT<string CondStr, string Typestr, RegisterOperand RC>  :
+   InstSE<(outs), (ins RC:$fs, RC:$ft),
+          !strconcat("c.", CondStr, ".", Typestr, "\t$fs, $ft"), [], IIFcmp,
+          FrmFR>;
+
+multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt> {
+  def C_F_#NAME : C_COND_FT<"f", TypeStr, RC>, C_COND_FM<fmt, 0>;
+  def C_UN_#NAME : C_COND_FT<"un", TypeStr, RC>, C_COND_FM<fmt, 1>;
+  def C_EQ_#NAME : C_COND_FT<"eq", TypeStr, RC>, C_COND_FM<fmt, 2>;
+  def C_UEQ_#NAME : C_COND_FT<"ueq", TypeStr, RC>, C_COND_FM<fmt, 3>;
+  def C_OLT_#NAME : C_COND_FT<"olt", TypeStr, RC>, C_COND_FM<fmt, 4>;
+  def C_ULT_#NAME : C_COND_FT<"ult", TypeStr, RC>, C_COND_FM<fmt, 5>;
+  def C_OLE_#NAME : C_COND_FT<"ole", TypeStr, RC>, C_COND_FM<fmt, 6>;
+  def C_ULE_#NAME : C_COND_FT<"ule", TypeStr, RC>, C_COND_FM<fmt, 7>;
+  def C_SF_#NAME : C_COND_FT<"sf", TypeStr, RC>, C_COND_FM<fmt, 8>;
+  def C_NGLE_#NAME : C_COND_FT<"ngle", TypeStr, RC>, C_COND_FM<fmt, 9>;
+  def C_SEQ_#NAME : C_COND_FT<"seq", TypeStr, RC>, C_COND_FM<fmt, 10>;
+  def C_NGL_#NAME : C_COND_FT<"ngl", TypeStr, RC>, C_COND_FM<fmt, 11>;
+  def C_LT_#NAME : C_COND_FT<"lt", TypeStr, RC>, C_COND_FM<fmt, 12>;
+  def C_NGE_#NAME : C_COND_FT<"nge", TypeStr, RC>, C_COND_FM<fmt, 13>;
+  def C_LE_#NAME : C_COND_FT<"le", TypeStr, RC>, C_COND_FM<fmt, 14>;
+  def C_NGT_#NAME : C_COND_FT<"ngt", TypeStr, RC>, C_COND_FM<fmt, 15>;
+}
+
+defm S : C_COND_M<"s", FGR32Opnd, 16>;
+defm D32 : C_COND_M<"d", AFGR64Opnd, 17>,
+                    Requires<[NotFP64bit, HasStdEnc]>;
+let DecoderNamespace = "Mips64" in
+defm D64 : C_COND_M<"d", FGR64Opnd, 17>, Requires<[IsFP64bit, HasStdEnc]>;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Instructions
 //===----------------------------------------------------------------------===//
-def ROUND_W_S  : ABSS_FT<"round.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xc, 16>;
-def TRUNC_W_S  : ABSS_FT<"trunc.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xd, 16>;
-def CEIL_W_S   : ABSS_FT<"ceil.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xe, 16>;
-def FLOOR_W_S  : ABSS_FT<"floor.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xf, 16>;
-def CVT_W_S    : ABSS_FT<"cvt.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0x24, 16>;
+def ROUND_W_S  : ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+                 ABSS_FM<0xc, 16>;
+def TRUNC_W_S  : ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+                 ABSS_FM<0xd, 16>;
+def CEIL_W_S   : ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+                 ABSS_FM<0xe, 16>;
+def FLOOR_W_S  : ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+                 ABSS_FM<0xf, 16>;
+def CVT_W_S    : ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, IIFcvt>,
+                 ABSS_FM<0x24, 16>;
 
 defm ROUND_W : ROUND_M<"round.w.d", IIFcvt>, ABSS_FM<0xc, 17>;
 defm TRUNC_W : ROUND_M<"trunc.w.d", IIFcvt>, ABSS_FM<0xd, 17>;
@@ -227,46 +258,72 @@ defm FLOOR_W : ROUND_M<"floor.w.d", IIFcvt>, ABSS_FM<0xf, 17>;
 defm CVT_W   : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>;
 
 let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def ROUND_L_S : ABSS_FT<"round.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x8, 16>;
-  def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64, FGR64, IIFcvt>,
+  def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0x8, 16>;
+  def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                     ABSS_FM<0x8, 17>;
-  def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x9, 16>;
-  def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64, FGR64, IIFcvt>,
+  def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0x9, 16>;
+  def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                     ABSS_FM<0x9, 17>;
-  def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0xa, 16>;
-  def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0xa, 17>;
-  def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0xb, 16>;
-  def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64, FGR64, IIFcvt>,
+  def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0xa, 16>;
+  def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
+                   ABSS_FM<0xa, 17>;
+  def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0xb, 16>;
+  def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
                     ABSS_FM<0xb, 17>;
 }
 
-def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32, FGR32, IIFcvt>, ABSS_FM<0x20, 20>;
-def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x25, 16>;
-def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0x25, 17>;
+def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, IIFcvt>,
+              ABSS_FM<0x20, 20>;
+def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+              ABSS_FM<0x25, 16>;
+def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, IIFcvt>,
+               ABSS_FM<0x25, 17>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32, AFGR64, IIFcvt>, ABSS_FM<0x20, 17>;
-  def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>;
-  def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>;
+  def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, IIFcvt>,
+                  ABSS_FM<0x20, 17>;
+  def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0x21, 20>;
+  def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0x21, 16>;
 }
 
 let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
- def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 17>;
- def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 21>;
- def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>;
- def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>;
- def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64, FGR64, IIFcvt>, ABSS_FM<0x21, 21>;
+  def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, IIFcvt>,
+                  ABSS_FM<0x20, 17>;
+  def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, IIFcvt>,
+                  ABSS_FM<0x20, 21>;
+  def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0x21, 20>;
+  def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, IIFcvt>,
+                  ABSS_FM<0x21, 16>;
+  def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, IIFcvt>,
+                  ABSS_FM<0x21, 21>;
+}
+
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  def PseudoCVT_S_W : ABSS_FT<"", FGR32Opnd, GPR32Opnd, IIFcvt>;
+  def PseudoCVT_D32_W : ABSS_FT<"", AFGR64Opnd, GPR32Opnd, IIFcvt>;
+  def PseudoCVT_S_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
+  def PseudoCVT_D64_W : ABSS_FT<"", FGR64Opnd, GPR32Opnd, IIFcvt>;
+  def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, IIFcvt>;
 }
 
 let Predicates = [NoNaNsFPMath, HasStdEnc] in {
-  def FABS_S : ABSS_FT<"abs.s", FGR32, FGR32, IIFcvt, fabs>, ABSS_FM<0x5, 16>;
-  def FNEG_S : ABSS_FT<"neg.s", FGR32, FGR32, IIFcvt, fneg>, ABSS_FM<0x7, 16>;
+  def FABS_S : ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, IIFcvt, fabs>,
+               ABSS_FM<0x5, 16>;
+  def FNEG_S : ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, IIFcvt, fneg>,
+               ABSS_FM<0x7, 16>;
   defm FABS : ABSS_M<"abs.d", IIFcvt, fabs>, ABSS_FM<0x5, 17>;
   defm FNEG : ABSS_M<"neg.d", IIFcvt, fneg>, ABSS_FM<0x7, 17>;
 }
 
-def  FSQRT_S : ABSS_FT<"sqrt.s", FGR32, FGR32, IIFsqrtSingle, fsqrt>,
-               ABSS_FM<0x4, 16>;
+def  FSQRT_S : ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, IIFsqrtSingle,
+               fsqrt>, ABSS_FM<0x4, 16>;
 defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
 
 // The odd-numbered registers are only referenced when doing loads,
@@ -275,130 +332,136 @@ defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>;
 // regardless of register aliasing.
 
 /// Move Control Registers From/To CPU Registers
-def CFC1 : MFC1_FT_CCR<"cfc1", CPURegs, CCROpnd, IIFmove>, MFC1_FM<2>;
-def CTC1 : MTC1_FT_CCR<"ctc1", CCROpnd, CPURegs, IIFmove>, MFC1_FM<6>;
-def MFC1 : MFC1_FT<"mfc1", CPURegs, FGR32, IIFmove, bitconvert>, MFC1_FM<0>;
-def MTC1 : MTC1_FT<"mtc1", FGR32, CPURegs, IIFmove, bitconvert>, MFC1_FM<4>;
-def DMFC1 : MFC1_FT<"dmfc1", CPU64Regs, FGR64, IIFmove, bitconvert>, MFC1_FM<1>;
-def DMTC1 : MTC1_FT<"dmtc1", FGR64, CPU64Regs, IIFmove, bitconvert>, MFC1_FM<5>;
-
-def FMOV_S   : ABSS_FT<"mov.s", FGR32, FGR32, IIFmove>, ABSS_FM<0x6, 16>;
-def FMOV_D32 : ABSS_FT<"mov.d", AFGR64, AFGR64, IIFmove>, ABSS_FM<0x6, 17>,
-               Requires<[NotFP64bit, HasStdEnc]>;
-def FMOV_D64 : ABSS_FT<"mov.d", FGR64, FGR64, IIFmove>, ABSS_FM<0x6, 17>,
-               Requires<[IsFP64bit, HasStdEnc]> {
-  let DecoderNamespace = "Mips64";
+def CFC1 : MFC1_FT<"cfc1", GPR32Opnd, CCROpnd, IIFmove>, MFC1_FM<2>;
+def CTC1 : MTC1_FT<"ctc1", CCROpnd, GPR32Opnd, IIFmove>, MFC1_FM<6>;
+def MFC1 : MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, IIFmoveC1, bitconvert>,
+           MFC1_FM<0>;
+def MTC1 : MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, IIFmoveC1, bitconvert>,
+           MFC1_FM<4>;
+def MFHC1 : MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, IIFmoveC1>,
+            MFC1_FM<3>;
+def MTHC1 : MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, IIFmoveC1>,
+            MFC1_FM<7>;
+def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, IIFmoveC1,
+            bitconvert>, MFC1_FM<1>;
+def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, IIFmoveC1,
+            bitconvert>, MFC1_FM<5>;
+
+def FMOV_S   : ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, IIFmove>,
+               ABSS_FM<0x6, 16>;
+def FMOV_D32 : ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, IIFmove>,
+               ABSS_FM<0x6, 17>, Requires<[NotFP64bit, HasStdEnc]>;
+def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, IIFmove>,
+               ABSS_FM<0x6, 17>, Requires<[IsFP64bit, HasStdEnc]> {
+                 let DecoderNamespace = "Mips64";
 }
 
 /// Floating Point Memory Instructions
-let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def LWC1_P8 : LW_FT<"lwc1", FGR32, IILoad, mem64, load>, LW_FM<0x31>;
-  def SWC1_P8 : SW_FT<"swc1", FGR32, IIStore, mem64, store>, LW_FM<0x39>;
-  def LDC164_P8 : LW_FT<"ldc1", FGR64, IILoad, mem64, load>, LW_FM<0x35> {
-    let isCodeGenOnly =1;
-  }
-  def SDC164_P8 : SW_FT<"sdc1", FGR64, IIStore, mem64, store>, LW_FM<0x3d> {
-    let isCodeGenOnly =1;
-  }
+let Predicates = [HasStdEnc] in {
+  def LWC1 : LW_FT<"lwc1", FGR32Opnd, IIFLoad, load>, LW_FM<0x31>;
+  def SWC1 : SW_FT<"swc1", FGR32Opnd, IIFStore, store>, LW_FM<0x39>;
 }
 
-let Predicates = [NotN64, HasStdEnc] in {
-  def LWC1 : LW_FT<"lwc1", FGR32, IILoad, mem, load>, LW_FM<0x31>;
-  def SWC1 : SW_FT<"swc1", FGR32, IIStore, mem, store>, LW_FM<0x39>;
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+  def LDC164 : LW_FT<"ldc1", FGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
+  def SDC164 : SW_FT<"sdc1", FGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
 }
 
-let Predicates = [NotN64, HasMips64, HasStdEnc],
-  DecoderNamespace = "Mips64" in {
-  def LDC164 : LW_FT<"ldc1", FGR64, IILoad, mem, load>, LW_FM<0x35>;
-  def SDC164 : SW_FT<"sdc1", FGR64, IIStore, mem, store>, LW_FM<0x3d>;
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def LDC1 : LW_FT<"ldc1", AFGR64Opnd, IIFLoad, load>, LW_FM<0x35>;
+  def SDC1 : SW_FT<"sdc1", AFGR64Opnd, IIFStore, store>, LW_FM<0x3d>;
 }
 
-let Predicates = [NotN64, NotMips64, HasStdEnc] in {
-  def LDC1 : LW_FT<"ldc1", AFGR64, IILoad, mem, load>, LW_FM<0x35>;
-  def SDC1 : SW_FT<"sdc1", AFGR64, IIStore, mem, store>, LW_FM<0x3d>;
+/// Cop2 Memory Instructions
+let Predicates = [HasStdEnc] in {
+  def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>;
+  def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>;
+  def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>;
+  def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>;
 }
 
 // Indexed loads and stores.
 let Predicates = [HasFPIdx, HasStdEnc] in {
-  def LWXC1 : LWXC1_FT<"lwxc1", FGR32, CPURegs, IILoad, load>, LWXC1_FM<0>;
-  def SWXC1 : SWXC1_FT<"swxc1", FGR32, CPURegs, IIStore, store>, SWXC1_FM<8>;
-}
-
-let Predicates = [HasMips32r2, NotMips64, HasStdEnc] in {
-  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64, CPURegs, IILoad, load>, LWXC1_FM<1>;
-  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64, CPURegs, IIStore, store>, SWXC1_FM<9>;
+  def LWXC1 : LWXC1_FT<"lwxc1", FGR32Opnd, IIFLoad, load>, LWXC1_FM<0>;
+  def SWXC1 : SWXC1_FT<"swxc1", FGR32Opnd, IIFStore, store>, SWXC1_FM<8>;
 }
 
-let Predicates = [HasMips64, NotN64, HasStdEnc], DecoderNamespace="Mips64" in {
-  def LDXC164 : LWXC1_FT<"ldxc1", FGR64, CPURegs, IILoad, load>, LWXC1_FM<1>;
-  def SDXC164 : SWXC1_FT<"sdxc1", FGR64, CPURegs, IIStore, store>, SWXC1_FM<9>;
+let Predicates = [HasFPIdx, NotFP64bit, HasStdEnc] in {
+  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
+  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
 }
 
-// n64
-let Predicates = [IsN64, HasStdEnc], isCodeGenOnly=1 in {
-  def LWXC1_P8 : LWXC1_FT<"lwxc1", FGR32, CPU64Regs, IILoad, load>, LWXC1_FM<0>;
-  def LDXC164_P8 : LWXC1_FT<"ldxc1", FGR64, CPU64Regs, IILoad, load>,
-                   LWXC1_FM<1>;
-  def SWXC1_P8 : SWXC1_FT<"swxc1", FGR32, CPU64Regs, IIStore, store>,
-                 SWXC1_FM<8>;
-  def SDXC164_P8 : SWXC1_FT<"sdxc1", FGR64, CPU64Regs, IIStore, store>,
-                   SWXC1_FM<9>;
+let Predicates = [HasFPIdx, IsFP64bit, HasStdEnc],
+    DecoderNamespace="Mips64" in {
+  def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, IIFLoad, load>, LWXC1_FM<1>;
+  def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, IIFStore, store>, SWXC1_FM<9>;
 }
 
 // Load/store doubleword indexed unaligned.
-let Predicates = [NotMips64, HasStdEnc] in {
-  def LUXC1 : LWXC1_FT<"luxc1", AFGR64, CPURegs, IILoad>, LWXC1_FM<0x5>;
-  def SUXC1 : SWXC1_FT<"suxc1", AFGR64, CPURegs, IIStore>, SWXC1_FM<0xd>;
+let Predicates = [NotFP64bit, HasStdEnc] in {
+  def LUXC1 : LWXC1_FT<"luxc1", AFGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
+  def SUXC1 : SWXC1_FT<"suxc1", AFGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
 }
 
-let Predicates = [HasMips64, HasStdEnc],
-  DecoderNamespace="Mips64" in {
-  def LUXC164 : LWXC1_FT<"luxc1", FGR64, CPURegs, IILoad>, LWXC1_FM<0x5>;
-  def SUXC164 : SWXC1_FT<"suxc1", FGR64, CPURegs, IIStore>, SWXC1_FM<0xd>;
+let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace="Mips64" in {
+  def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, IIFLoad>, LWXC1_FM<0x5>;
+  def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, IIFStore>, SWXC1_FM<0xd>;
 }
 
 /// Floating-point Aritmetic
-def FADD_S : ADDS_FT<"add.s", FGR32, IIFadd, 1, fadd>, ADDS_FM<0x00, 16>;
-defm FADD : ADDS_M<"add.d", IIFadd, 1, fadd>, ADDS_FM<0x00, 17>;
-def FDIV_S : ADDS_FT<"div.s", FGR32, IIFdivSingle, 0, fdiv>, ADDS_FM<0x03, 16>;
-defm FDIV : ADDS_M<"div.d", IIFdivDouble, 0, fdiv>, ADDS_FM<0x03, 17>;
-def FMUL_S : ADDS_FT<"mul.s", FGR32, IIFmulSingle, 1, fmul>, ADDS_FM<0x02, 16>;
-defm FMUL : ADDS_M<"mul.d", IIFmulDouble, 1, fmul>, ADDS_FM<0x02, 17>;
-def FSUB_S : ADDS_FT<"sub.s", FGR32, IIFadd, 0, fsub>, ADDS_FM<0x01, 16>;
-defm FSUB : ADDS_M<"sub.d", IIFadd, 0, fsub>, ADDS_FM<0x01, 17>;
+def FADD_S : ADDS_FT<"add.s", FGR32Opnd, IIFadd, 1, fadd>,
+             ADDS_FM<0x00, 16>;
+defm FADD :  ADDS_M<"add.d", IIFadd, 1, fadd>, ADDS_FM<0x00, 17>;
+def FDIV_S : ADDS_FT<"div.s", FGR32Opnd, IIFdivSingle, 0, fdiv>,
+             ADDS_FM<0x03, 16>;
+defm FDIV :  ADDS_M<"div.d", IIFdivDouble, 0, fdiv>, ADDS_FM<0x03, 17>;
+def FMUL_S : ADDS_FT<"mul.s", FGR32Opnd, IIFmulSingle, 1, fmul>,
+             ADDS_FM<0x02, 16>;
+defm FMUL :  ADDS_M<"mul.d", IIFmulDouble, 1, fmul>, ADDS_FM<0x02, 17>;
+def FSUB_S : ADDS_FT<"sub.s", FGR32Opnd, IIFadd, 0, fsub>,
+             ADDS_FM<0x01, 16>;
+defm FSUB :  ADDS_M<"sub.d", IIFadd, 0, fsub>, ADDS_FM<0x01, 17>;
 
 let Predicates = [HasMips32r2, HasStdEnc] in {
-  def MADD_S : MADDS_FT<"madd.s", FGR32, IIFmulSingle, fadd>, MADDS_FM<4, 0>;
-  def MSUB_S : MADDS_FT<"msub.s", FGR32, IIFmulSingle, fsub>, MADDS_FM<5, 0>;
+  def MADD_S : MADDS_FT<"madd.s", FGR32Opnd, IIFmulSingle, fadd>,
+               MADDS_FM<4, 0>;
+  def MSUB_S : MADDS_FT<"msub.s", FGR32Opnd, IIFmulSingle, fsub>,
+               MADDS_FM<5, 0>;
 }
 
 let Predicates = [HasMips32r2, NoNaNsFPMath, HasStdEnc] in {
-  def NMADD_S : NMADDS_FT<"nmadd.s", FGR32, IIFmulSingle, fadd>, MADDS_FM<6, 0>;
-  def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32, IIFmulSingle, fsub>, MADDS_FM<7, 0>;
+  def NMADD_S : NMADDS_FT<"nmadd.s", FGR32Opnd, IIFmulSingle, fadd>,
+                MADDS_FM<6, 0>;
+  def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32Opnd, IIFmulSingle, fsub>,
+                MADDS_FM<7, 0>;
 }
 
 let Predicates = [HasMips32r2, NotFP64bit, HasStdEnc] in {
-  def MADD_D32 : MADDS_FT<"madd.d", AFGR64, IIFmulDouble, fadd>, MADDS_FM<4, 1>;
-  def MSUB_D32 : MADDS_FT<"msub.d", AFGR64, IIFmulDouble, fsub>, MADDS_FM<5, 1>;
+  def MADD_D32 : MADDS_FT<"madd.d", AFGR64Opnd, IIFmulDouble, fadd>,
+                 MADDS_FM<4, 1>;
+  def MSUB_D32 : MADDS_FT<"msub.d", AFGR64Opnd, IIFmulDouble, fsub>,
+                 MADDS_FM<5, 1>;
 }
 
 let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in {
-  def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64, IIFmulDouble, fadd>,
+  def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64Opnd, IIFmulDouble, fadd>,
                   MADDS_FM<6, 1>;
-  def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64, IIFmulDouble, fsub>,
+  def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64Opnd, IIFmulDouble, fsub>,
                   MADDS_FM<7, 1>;
 }
 
 let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in {
-  def MADD_D64 : MADDS_FT<"madd.d", FGR64, IIFmulDouble, fadd>, MADDS_FM<4, 1>;
-  def MSUB_D64 : MADDS_FT<"msub.d", FGR64, IIFmulDouble, fsub>, MADDS_FM<5, 1>;
+  def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, IIFmulDouble, fadd>,
+                 MADDS_FM<4, 1>;
+  def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, IIFmulDouble, fsub>,
+                 MADDS_FM<5, 1>;
 }
 
 let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
     isCodeGenOnly=1 in {
-  def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64, IIFmulDouble, fadd>,
+  def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, IIFmulDouble, fadd>,
                   MADDS_FM<6, 1>;
-  def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64, IIFmulDouble, fsub>,
+  def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, IIFmulDouble, fsub>,
                   MADDS_FM<7, 1>;
 }
 
@@ -410,10 +473,9 @@ let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
 def MIPS_BRANCH_F  : PatLeaf<(i32 0)>;
 def MIPS_BRANCH_T  : PatLeaf<(i32 1)>;
 
-let DecoderMethod = "DecodeBC1" in {
 def BC1F : BC1F_FT<"bc1f", IIBranch, MIPS_BRANCH_F>, BC1F_FM<0, 0>;
 def BC1T : BC1F_FT<"bc1t", IIBranch, MIPS_BRANCH_T>, BC1F_FM<0, 1>;
-}
+
 //===----------------------------------------------------------------------===//
 // Floating Point Flag Conditions
 //===----------------------------------------------------------------------===//
@@ -447,22 +509,36 @@ def FCMP_D64 : CEQS_FT<"d", FGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>,
 //===----------------------------------------------------------------------===//
 // Floating Point Pseudo-Instructions
 //===----------------------------------------------------------------------===//
-def MOVCCRToCCR : PseudoSE<(outs CCR:$dst), (ins CCROpnd:$src), []>;
 
 // This pseudo instr gets expanded into 2 mtc1 instrs after register
 // allocation.
-def BuildPairF64 :
-  PseudoSE<(outs AFGR64:$dst),
-           (ins CPURegs:$lo, CPURegs:$hi),
-           [(set AFGR64:$dst, (MipsBuildPairF64 CPURegs:$lo, CPURegs:$hi))]>;
+class BuildPairF64Base<RegisterOperand RO> :
+  PseudoSE<(outs RO:$dst), (ins GPR32Opnd:$lo, GPR32Opnd:$hi),
+           [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
+
+def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>,
+                   Requires<[NotFP64bit, HasStdEnc]>;
+def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>,
+                      Requires<[IsFP64bit, HasStdEnc]>;
 
 // This pseudo instr gets expanded into 2 mfc1 instrs after register
 // allocation.
 // if n is 0, lower part of src is extracted.
 // if n is 1, higher part of src is extracted.
-def ExtractElementF64 :
-  PseudoSE<(outs CPURegs:$dst), (ins AFGR64:$src, i32imm:$n),
-           [(set CPURegs:$dst, (MipsExtractElementF64 AFGR64:$src, imm:$n))]>;
+class ExtractElementF64Base<RegisterOperand RO> :
+  PseudoSE<(outs GPR32Opnd:$dst), (ins RO:$src, i32imm:$n),
+           [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))]>;
+
+def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>,
+                        Requires<[NotFP64bit, HasStdEnc]>;
+def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>,
+                           Requires<[IsFP64bit, HasStdEnc]>;
+
+//===----------------------------------------------------------------------===//
+// InstAliases.
+//===----------------------------------------------------------------------===//
+def : InstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>;
+def : InstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Patterns
@@ -470,59 +546,59 @@ def ExtractElementF64 :
 def : MipsPat<(f32 fpimm0), (MTC1 ZERO)>;
 def : MipsPat<(f32 fpimm0neg), (FNEG_S (MTC1 ZERO))>;
 
-def : MipsPat<(f32 (sint_to_fp CPURegs:$src)), (CVT_S_W (MTC1 CPURegs:$src))>;
-def : MipsPat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S FGR32:$src))>;
+def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_S_W GPR32Opnd:$src)>;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_W_S FGR32Opnd:$src)>;
 
 let Predicates = [NotFP64bit, HasStdEnc] in {
-  def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
-                (CVT_D32_W (MTC1 CPURegs:$src))>;
-  def : MipsPat<(i32 (fp_to_sint AFGR64:$src)),
-                (MFC1 (TRUNC_W_D32 AFGR64:$src))>;
-  def : MipsPat<(f32 (fround AFGR64:$src)), (CVT_S_D32 AFGR64:$src)>;
-  def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D32_S FGR32:$src)>;
+  def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+                (PseudoCVT_D32_W GPR32Opnd:$src)>;
+  def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+                (TRUNC_W_D32 AFGR64Opnd:$src)>;
+  def : MipsPat<(f32 (fround AFGR64Opnd:$src)),
+                (CVT_S_D32 AFGR64Opnd:$src)>;
+  def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+                (CVT_D32_S FGR32Opnd:$src)>;
 }
 
 let Predicates = [IsFP64bit, HasStdEnc] in {
   def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>;
   def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>;
 
-  def : MipsPat<(f64 (sint_to_fp CPURegs:$src)),
-                (CVT_D64_W (MTC1 CPURegs:$src))>;
-  def : MipsPat<(f32 (sint_to_fp CPU64Regs:$src)),
-                (CVT_S_L (DMTC1 CPU64Regs:$src))>;
-  def : MipsPat<(f64 (sint_to_fp CPU64Regs:$src)),
-                (CVT_D64_L (DMTC1 CPU64Regs:$src))>;
+  def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+                (PseudoCVT_D64_W GPR32Opnd:$src)>;
+  def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
+                (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>;
+  def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
+                (PseudoCVT_D64_L GPR64Opnd:$src)>;
 
-  def : MipsPat<(i32 (fp_to_sint FGR64:$src)),
-                (MFC1 (TRUNC_W_D64 FGR64:$src))>;
-  def : MipsPat<(i64 (fp_to_sint FGR32:$src)), (DMFC1 (TRUNC_L_S FGR32:$src))>;
-  def : MipsPat<(i64 (fp_to_sint FGR64:$src)),
-                (DMFC1 (TRUNC_L_D64 FGR64:$src))>;
+  def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+                (TRUNC_W_D64 FGR64Opnd:$src)>;
+  def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+                (TRUNC_L_S FGR32Opnd:$src)>;
+  def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+                (TRUNC_L_D64 FGR64Opnd:$src)>;
 
-  def : MipsPat<(f32 (fround FGR64:$src)), (CVT_S_D64 FGR64:$src)>;
-  def : MipsPat<(f64 (fextend FGR32:$src)), (CVT_D64_S FGR32:$src)>;
+  def : MipsPat<(f32 (fround FGR64Opnd:$src)),
+                (CVT_S_D64 FGR64Opnd:$src)>;
+  def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+                (CVT_D64_S FGR32Opnd:$src)>;
 }
 
 // Patterns for loads/stores with a reg+imm operand.
 let AddedComplexity = 40 in {
-  let Predicates = [IsN64, HasStdEnc] in {
-    def : LoadRegImmPat<LWC1_P8, f32, load>;
-    def : StoreRegImmPat<SWC1_P8, f32>;
-    def : LoadRegImmPat<LDC164_P8, f64, load>;
-    def : StoreRegImmPat<SDC164_P8, f64>;
-  }
-
-  let Predicates = [NotN64, HasStdEnc] in {
+  let Predicates = [HasStdEnc] in {
     def : LoadRegImmPat<LWC1, f32, load>;
     def : StoreRegImmPat<SWC1, f32>;
   }
 
-  let Predicates = [NotN64, HasMips64, HasStdEnc] in {
+  let Predicates = [IsFP64bit, HasStdEnc] in {
     def : LoadRegImmPat<LDC164, f64, load>;
     def : StoreRegImmPat<SDC164, f64>;
   }
 
-  let Predicates = [NotN64, NotMips64, HasStdEnc] in {
+  let Predicates = [NotFP64bit, HasStdEnc] in {
     def : LoadRegImmPat<LDC1, f64, load>;
     def : StoreRegImmPat<SDC1, f64>;
   }
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index ea07372..737a018 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -183,7 +183,7 @@ class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
 // Format J instruction class in Mips : <|opcode|address|>
 //===----------------------------------------------------------------------===//
 
-class FJ<bits<6> op>
+class FJ<bits<6> op> : StdArch
 {
   bits<26> target;
 
@@ -272,7 +272,7 @@ class SRLV_FM<bits<6> funct, bit rotate> : StdArch {
   let Inst{5-0}   = funct;
 }
 
-class BEQ_FM<bits<6> op> {
+class BEQ_FM<bits<6> op> : StdArch {
   bits<5>  rs;
   bits<5>  rt;
   bits<16> offset;
@@ -285,7 +285,7 @@ class BEQ_FM<bits<6> op> {
   let Inst{15-0}  = offset;
 }
 
-class BGEZ_FM<bits<6> op, bits<5> funct> {
+class BGEZ_FM<bits<6> op, bits<5> funct> : StdArch {
   bits<5>  rs;
   bits<16> offset;
 
@@ -297,17 +297,6 @@ class BGEZ_FM<bits<6> op, bits<5> funct> {
   let Inst{15-0}  = offset;
 }
 
-class B_FM {
-  bits<16> offset;
-
-  bits<32> Inst;
-
-  let Inst{31-26} = 4;
-  let Inst{25-21} = 0;
-  let Inst{20-16} = 0;
-  let Inst{15-0}  = offset;
-}
-
 class SLTI_FM<bits<6> op> : StdArch {
   bits<5> rt;
   bits<5> rs;
@@ -321,7 +310,7 @@ class SLTI_FM<bits<6> op> : StdArch {
   let Inst{15-0}  = imm16;
 }
 
-class MFLO_FM<bits<6> funct> {
+class MFLO_FM<bits<6> funct> : StdArch {
   bits<5> rd;
 
   bits<32> Inst;
@@ -333,7 +322,7 @@ class MFLO_FM<bits<6> funct> {
   let Inst{5-0}   = funct;
 }
 
-class MTLO_FM<bits<6> funct> {
+class MTLO_FM<bits<6> funct> : StdArch {
   bits<5> rs;
 
   bits<32> Inst;
@@ -344,7 +333,7 @@ class MTLO_FM<bits<6> funct> {
   let Inst{5-0}   = funct;
 }
 
-class SEB_FM<bits<5> funct, bits<6> funct2> {
+class SEB_FM<bits<5> funct, bits<6> funct2> : StdArch {
   bits<5> rd;
   bits<5> rt;
 
@@ -358,7 +347,7 @@ class SEB_FM<bits<5> funct, bits<6> funct2> {
   let Inst{5-0}   = funct2;
 }
 
-class CLO_FM<bits<6> funct> {
+class CLO_FM<bits<6> funct> : StdArch {
   bits<5> rd;
   bits<5> rs;
   bits<5> rt;
@@ -374,7 +363,7 @@ class CLO_FM<bits<6> funct> {
   let rt = rd;
 }
 
-class LUI_FM {
+class LUI_FM : StdArch {
   bits<5> rt;
   bits<16> imm16;
 
@@ -386,7 +375,7 @@ class LUI_FM {
   let Inst{15-0}  = imm16;
 }
 
-class JALR_FM {
+class JALR_FM : StdArch {
   bits<5> rd;
   bits<5> rs;
 
@@ -400,18 +389,7 @@ class JALR_FM {
   let Inst{5-0}   = 9;
 }
 
-class BAL_FM {
-  bits<16> offset;
-
-  bits<32> Inst;
-
-  let Inst{31-26} = 1;
-  let Inst{25-21} = 0;
-  let Inst{20-16} = 0x11;
-  let Inst{15-0}  = offset;
-}
-
-class BGEZAL_FM<bits<5> funct> {
+class BGEZAL_FM<bits<5> funct> : StdArch {
   bits<5>  rs;
   bits<16> offset;
 
@@ -446,7 +424,7 @@ class MULT_FM<bits<6> op, bits<6> funct> : StdArch {
   let Inst{5-0}   = funct;
 }
 
-class EXT_FM<bits<6> funct> {
+class EXT_FM<bits<6> funct> : StdArch {
   bits<5> rt;
   bits<5> rs;
   bits<5> pos;
@@ -476,6 +454,90 @@ class RDHWR_FM {
   let Inst{5-0}   = 0x3b;
 }
 
+class TEQ_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<10> code_;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = code_;
+  let Inst{5-0}   = funct;
+}
+
+class TEQI_FM<bits<5> funct> : StdArch {
+  bits<5> rs;
+  bits<16> imm16;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 1;
+  let Inst{25-21} = rs;
+  let Inst{20-16}   = funct;
+  let Inst{15-0}  = imm16;
+}
+//===----------------------------------------------------------------------===//
+//  System calls format <op|code_|funct>
+//===----------------------------------------------------------------------===//
+
+class SYS_FM<bits<6> funct>
+{
+  bits<20> code_;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-6} = code_;
+  let Inst{5-0}  = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  Break instruction format <op|code_1|funct>
+//===----------------------------------------------------------------------===//
+
+class BRK_FM<bits<6> funct>
+{
+  bits<10> code_1;
+  bits<10> code_2;
+  bits<32> Inst;
+  let Inst{31-26} = 0x0;
+  let Inst{25-16} = code_1;
+  let Inst{15-6}  = code_2;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+//  Exception return format <Cop0|1|0|funct>
+//===----------------------------------------------------------------------===//
+
+class ER_FM<bits<6> funct>
+{
+  bits<32> Inst;
+  let Inst{31-26} = 0x10;
+  let Inst{25}    = 1;
+  let Inst{24-6}  = 0;
+  let Inst{5-0}   = funct;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Enable/disable interrupt instruction format <Cop0|MFMC0|rt|12|0|sc|0|0>
+//===----------------------------------------------------------------------===//
+
+class EI_FM<bits<1> sc>
+{
+  bits<32> Inst;
+  bits<5> rt;
+  let Inst{31-26} = 0x10;
+  let Inst{25-21} = 0xb;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = 0xc;
+  let Inst{10-6}  = 0;
+  let Inst{5}     = sc;
+  let Inst{4-0}   = 0;
+}
+
 //===----------------------------------------------------------------------===//
 //
 //  FLOATING POINT INSTRUCTION FORMATS
@@ -609,13 +671,14 @@ class SWXC1_FM<bits<6> funct> {
 }
 
 class BC1F_FM<bit nd, bit tf> {
+  bits<3>  fcc;
   bits<16> offset;
 
   bits<32> Inst;
 
   let Inst{31-26} = 0x11;
   let Inst{25-21} = 0x8;
-  let Inst{20-18} = 0; // cc
+  let Inst{20-18} = fcc;
   let Inst{17} = nd;
   let Inst{16} = tf;
   let Inst{15-0} = offset;
@@ -637,6 +700,10 @@ class CEQS_FM<bits<5> fmt> {
   let Inst{3-0} = cond;
 }
 
+class C_COND_FM<bits<5> fmt, bits<4> c> : CEQS_FM<fmt> {
+  let cond = c;
+}
+
 class CMov_I_F_FM<bits<6> funct, bits<5> fmt> {
   bits<5> fd;
   bits<5> fs;
@@ -652,15 +719,16 @@ class CMov_I_F_FM<bits<6> funct, bits<5> fmt> {
   let Inst{5-0} = funct;
 }
 
-class CMov_F_I_FM<bit tf> {
+class CMov_F_I_FM<bit tf> : StdArch {
   bits<5> rd;
   bits<5> rs;
+  bits<3> fcc;
 
   bits<32> Inst;
 
   let Inst{31-26} = 0;
   let Inst{25-21} = rs;
-  let Inst{20-18} = 0; // cc
+  let Inst{20-18} = fcc;
   let Inst{17} = 0;
   let Inst{16} = tf;
   let Inst{15-11} = rd;
@@ -671,12 +739,13 @@ class CMov_F_I_FM<bit tf> {
 class CMov_F_F_FM<bits<5> fmt, bit tf> {
   bits<5> fd;
   bits<5> fs;
+  bits<3> fcc;
 
   bits<32> Inst;
 
   let Inst{31-26} = 0x11;
   let Inst{25-21} = fmt;
-  let Inst{20-18} = 0; // cc
+  let Inst{20-18} = fcc;
   let Inst{17} = 0;
   let Inst{16} = tf;
   let Inst{15-11} = fs;
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index ad92d41..0ebad05 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -22,11 +22,14 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "MipsGenInstrInfo.inc"
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void MipsInstrInfo::anchor() {}
+
 MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm, unsigned UncondBr)
   : MipsGenInstrInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
     TM(tm), UncondBrOpc(UncondBr) {}
@@ -61,15 +64,6 @@ MachineMemOperand *MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
                                  MFI.getObjectSize(FI), Align);
 }
 
-MachineInstr*
-MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
-                                        uint64_t Offset, const MDNode *MDPtr,
-                                        DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Mips::DBG_VALUE))
-    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
 //===----------------------------------------------------------------------===//
 // Branch Analysis
 //===----------------------------------------------------------------------===//
@@ -77,7 +71,7 @@ MipsInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
 void MipsInstrInfo::AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
                                   MachineBasicBlock *&BB,
                                   SmallVectorImpl<MachineOperand> &Cond) const {
-  assert(GetAnalyzableBrOpc(Opc) && "Not an analyzable branch");
+  assert(getAnalyzableBrOpc(Opc) && "Not an analyzable branch");
   int NumOp = Inst->getNumExplicitOperands();
 
   // for both int and fp branches, the last explicit operand is the
@@ -167,7 +161,7 @@ RemoveBranch(MachineBasicBlock &MBB) const
   // Up to 2 branches are removed.
   // Note that indirect branches are not removed.
   for(removed = 0; I != REnd && removed < 2; ++I, ++removed)
-    if (!GetAnalyzableBrOpc(I->getOpcode()))
+    if (!getAnalyzableBrOpc(I->getOpcode()))
       break;
 
   MBB.erase(I.base(), FirstBr.base());
@@ -182,7 +176,7 @@ ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const
 {
   assert( (Cond.size() && Cond.size() <= 3) &&
           "Invalid Mips branch condition!");
-  Cond[0].setImm(GetOppositeBranchOpc(Cond[0].getImm()));
+  Cond[0].setImm(getOppositeBranchOpc(Cond[0].getImm()));
   return false;
 }
 
@@ -210,7 +204,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   BranchInstrs.push_back(LastInst);
 
   // Not an analyzable branch (e.g., indirect jump).
-  if (!GetAnalyzableBrOpc(LastOpc))
+  if (!getAnalyzableBrOpc(LastOpc))
     return LastInst->isIndirectBranch() ? BT_Indirect : BT_None;
 
   // Get the second to last instruction in the block.
@@ -219,7 +213,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
 
   if (++I != REnd) {
     SecondLastInst = &*I;
-    SecondLastOpc = GetAnalyzableBrOpc(SecondLastInst->getOpcode());
+    SecondLastOpc = getAnalyzableBrOpc(SecondLastInst->getOpcode());
 
     // Not an analyzable branch (must be an indirect jump).
     if (isUnpredicatedTerminator(SecondLastInst) && !SecondLastOpc)
@@ -228,7 +222,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
 
   // If there is only one terminator instruction, process it.
   if (!SecondLastOpc) {
-    // Unconditional branch
+    // Unconditional branch.
     if (LastOpc == UncondBrOpc) {
       TBB = LastInst->getOperand(0).getMBB();
       return BT_Uncond;
@@ -280,5 +274,22 @@ unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
     const char *AsmStr = MI->getOperand(0).getSymbolName();
     return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
   }
+  case Mips::CONSTPOOL_ENTRY:
+    // If this machine instr is a constant pool entry, its size is recorded as
+    // operand #2.
+    return MI->getOperand(2).getImm();
   }
 }
+
+MachineInstrBuilder
+MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
+                                  MachineBasicBlock::iterator I) const {
+  MachineInstrBuilder MIB;
+  MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), get(NewOpc));
+
+  for (unsigned J = 0, E = I->getDesc().getNumOperands(); J < E; ++J)
+    MIB.addOperand(I->getOperand(J));
+
+  MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
+  return MIB;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 8c05d97..d9ac961 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -17,6 +17,7 @@
 #include "Mips.h"
 #include "MipsAnalyzeImmediate.h"
 #include "MipsRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
@@ -26,6 +27,7 @@
 namespace llvm {
 
 class MipsInstrInfo : public MipsGenInstrInfo {
+  virtual void anchor();
 protected:
   MipsTargetMachine &TM;
   unsigned UncondBrOpc;
@@ -66,11 +68,6 @@ public:
                            bool AllowModify,
                            SmallVectorImpl<MachineInstr*> &BranchInstrs) const;
 
-  virtual MachineInstr* emitFrameIndexDebugValue(MachineFunction &MF,
-                                                 int FrameIx, uint64_t Offset,
-                                                 const MDNode *MDPtr,
-                                                 DebugLoc DL) const;
-
   /// Insert nop instruction when hazard condition is found
   virtual void insertNoop(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MI) const;
@@ -81,7 +78,7 @@ public:
   ///
   virtual const MipsRegisterInfo &getRegisterInfo() const = 0;
 
-  virtual unsigned GetOppositeBranchOpc(unsigned Opc) const = 0;
+  virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0;
 
   /// Return the number of bytes of code the specified instruction may be.
   unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
@@ -116,6 +113,11 @@ public:
                                 const TargetRegisterInfo *TRI,
                                 int64_t Offset) const = 0;
 
+  /// Create an instruction which has the same operands and memory operands
+  /// as MI but has a new opcode.
+  MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
+                                         MachineBasicBlock::iterator I) const;
+
 protected:
   bool isZeroImm(const MachineOperand &op) const;
 
@@ -123,7 +125,7 @@ protected:
                                    unsigned Flag) const;
 
 private:
-  virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const = 0;
+  virtual unsigned getAnalyzableBrOpc(unsigned Opc) const = 0;
 
   void AnalyzeCondBr(const MachineInstr *Inst, unsigned Opc,
                      MachineBasicBlock *&BB,
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 86ec729..ebdbaa4 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -23,10 +23,9 @@ def SDT_MipsCMov         : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
                                                 SDTCisInt<4>]>;
 def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
 def SDT_MipsCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
-def SDT_ExtractLOHI : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVT<1, untyped>,
-                                           SDTCisVT<2, i32>]>;
-def SDT_InsertLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
-                                          SDTCisVT<1, i32>, SDTCisSameAs<1, 2>]>;
+def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>;
+def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>,
+                                      SDTCisInt<1>, SDTCisSameAs<1, 2>]>;
 def SDT_MipsMultDiv : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, SDTCisInt<1>,
                                     SDTCisSameAs<1, 2>]>;
 def SDT_MipsMAddMSub : SDTypeProfile<1, 3,
@@ -85,11 +84,12 @@ def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
                            [SDNPHasChain, SDNPSideEffect,
                             SDNPOptInGlue, SDNPOutGlue]>;
 
-// Node used to extract integer from LO/HI register.
-def ExtractLOHI : SDNode<"MipsISD::ExtractLOHI", SDT_ExtractLOHI>;
+// Nodes used to extract LO/HI registers.
+def MipsMFHI : SDNode<"MipsISD::MFHI", SDT_MFLOHI>;
+def MipsMFLO : SDNode<"MipsISD::MFLO", SDT_MFLOHI>;
 
 // Node used to insert 32-bit integers to LOHI register pair.
-def InsertLOHI : SDNode<"MipsISD::InsertLOHI", SDT_InsertLOHI>;
+def MipsMTLOHI : SDNode<"MipsISD::MTLOHI", SDT_MTLOHI>;
 
 // Mult nodes.
 def MipsMult  : SDNode<"MipsISD::Mult", SDT_MipsMultDiv>;
@@ -104,7 +104,8 @@ def MipsMSubu : SDNode<"MipsISD::MSubu", SDT_MipsMAddMSub>;
 // DivRem(u) nodes
 def MipsDivRem    : SDNode<"MipsISD::DivRem", SDT_MipsMultDiv>;
 def MipsDivRemU   : SDNode<"MipsISD::DivRemU", SDT_MipsMultDiv>;
-def MipsDivRem16  : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16, [SDNPOutGlue]>;
+def MipsDivRem16  : SDNode<"MipsISD::DivRem16", SDT_MipsDivRem16,
+                           [SDNPOutGlue]>;
 def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
                            [SDNPOutGlue]>;
 
@@ -113,7 +114,7 @@ def MipsDivRemU16 : SDNode<"MipsISD::DivRemU16", SDT_MipsDivRem16,
 // Wrapper node patterns give the instruction selector a chance to replace
 // target constant nodes that would otherwise remain unchanged with ADDiu
 // nodes. Without these wrapper node patterns, the following conditional move
-// instrucion is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
+// instruction is emitted when function cmov2 in test/CodeGen/Mips/cmov.ll is
 // compiled:
 //  movn  %got(d)($gp), %got(c)($gp), $4
 // This instruction is illegal since movn can take only register operands.
@@ -180,6 +181,12 @@ def NoNaNsFPMath :    Predicate<"TM.Options.NoNaNsFPMath">,
 def HasStdEnc :       Predicate<"Subtarget.hasStandardEncoding()">,
                       AssemblerPredicate<"!FeatureMips16">;
 def NotDSP :          Predicate<"!Subtarget.hasDSP()">;
+def InMicroMips    :  Predicate<"Subtarget.inMicroMipsMode()">,
+                      AssemblerPredicate<"FeatureMicroMips">;
+def NotInMicroMips :  Predicate<"!Subtarget.inMicroMipsMode()">,
+                      AssemblerPredicate<"!FeatureMicroMips">;
+def IsLE           :  Predicate<"Subtarget.isLittle()">;
+def IsBE           :  Predicate<"!Subtarget.isLittle()">;
 
 class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
   let Predicates = [HasStdEnc];
@@ -240,7 +247,7 @@ def brtarget    : Operand<OtherVT> {
 def calltarget  : Operand<iPTR> {
   let EncoderMethod = "getJumpTargetOpValue";
 }
-def calltarget64: Operand<i64>;
+
 def simm16      : Operand<i32> {
   let DecoderMethod= "DecodeSimm16";
 }
@@ -248,48 +255,73 @@ def simm16      : Operand<i32> {
 def simm20      : Operand<i32> {
 }
 
-def simm16_64   : Operand<i64>;
-def shamt       : Operand<i32>;
+def uimm20      : Operand<i32> {
+}
+
+def uimm10      : Operand<i32> {
+}
+
+def simm16_64   : Operand<i64> {
+  let DecoderMethod = "DecodeSimm16";
+}
 
 // Unsigned Operand
+def uimm5       : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def uimm6 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
 def uimm16      : Operand<i32> {
   let PrintMethod = "printUnsignedImm";
 }
 
+def pcrel16      : Operand<i32> {
+}
+
 def MipsMemAsmOperand : AsmOperandClass {
   let Name = "Mem";
   let ParserMethod = "parseMemOperand";
 }
 
-// Address operand
-def mem : Operand<i32> {
-  let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops CPURegs, simm16);
-  let EncoderMethod = "getMemEncoding";
-  let ParserMatchClass = MipsMemAsmOperand;
-  let OperandType = "OPERAND_MEMORY";
+def MipsInvertedImmoperand : AsmOperandClass {
+  let Name = "InvNum";
+  let RenderMethod = "addImmOperands";
+  let ParserMethod = "parseInvNum";
+}
+
+def PtrRegAsmOperand : AsmOperandClass {
+  let Name = "PtrReg";
+  let ParserMethod = "parsePtrReg";
 }
 
-def mem64 : Operand<i64> {
+
+def InvertedImOperand : Operand<i32> {
+  let ParserMatchClass = MipsInvertedImmoperand;
+}
+
+// Address operand
+def mem : Operand<iPTR> {
   let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops CPU64Regs, simm16_64);
+  let MIOperandInfo = (ops ptr_rc, simm16);
   let EncoderMethod = "getMemEncoding";
   let ParserMatchClass = MipsMemAsmOperand;
   let OperandType = "OPERAND_MEMORY";
 }
 
-def mem_ea : Operand<i32> {
+def mem_ea : Operand<iPTR> {
   let PrintMethod = "printMemOperandEA";
-  let MIOperandInfo = (ops CPURegs, simm16);
+  let MIOperandInfo = (ops ptr_rc, simm16);
   let EncoderMethod = "getMemEncoding";
   let OperandType = "OPERAND_MEMORY";
 }
 
-def mem_ea_64 : Operand<i64> {
-  let PrintMethod = "printMemOperandEA";
-  let MIOperandInfo = (ops CPU64Regs, simm16_64);
-  let EncoderMethod = "getMemEncoding";
-  let OperandType = "OPERAND_MEMORY";
+def PtrRC : Operand<iPTR> {
+  let MIOperandInfo = (ops ptr_rc);
+  let DecoderMethod = "DecodePtrRegisterClass";
+  let ParserMatchClass = PtrRegAsmOperand;
 }
 
 // size operand of ext instruction
@@ -362,6 +394,9 @@ def addr :
 def addrRegImm :
   ComplexPattern<iPTR, 2, "selectAddrRegImm", [frameindex]>;
 
+def addrRegReg :
+  ComplexPattern<iPTR, 2, "selectAddrRegReg", [frameindex]>;
+
 def addrDefault :
   ComplexPattern<iPTR, 2, "selectAddrDefault", [frameindex]>;
 
@@ -382,160 +417,111 @@ class ArithLogicR<string opstr, RegisterOperand RO, bit isComm = 0,
 
 // Arithmetic and logical instructions with 2 register operands.
 class ArithLogicI<string opstr, Operand Od, RegisterOperand RO,
+                  InstrItinClass Itin = NoItinerary,
                   SDPatternOperator imm_type = null_frag,
                   SDPatternOperator OpNode = null_frag> :
   InstSE<(outs RO:$rt), (ins RO:$rs, Od:$imm16),
          !strconcat(opstr, "\t$rt, $rs, $imm16"),
          [(set RO:$rt, (OpNode RO:$rs, imm_type:$imm16))],
-         IIAlu, FrmI, opstr> {
+         Itin, FrmI, opstr> {
   let isReMaterializable = 1;
+  let TwoOperandAliasConstraint = "$rs = $rt";
 }
 
 // Arithmetic Multiply ADD/SUB
 class MArithR<string opstr, bit isComm = 0> :
-  InstSE<(outs), (ins CPURegsOpnd:$rs, CPURegsOpnd:$rt),
-         !strconcat(opstr, "\t$rs, $rt"), [], IIImul, FrmR> {
-  let Defs = [HI, LO];
-  let Uses = [HI, LO];
+  InstSE<(outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+         !strconcat(opstr, "\t$rs, $rt"), [], IIImult, FrmR, opstr> {
+  let Defs = [HI0, LO0];
+  let Uses = [HI0, LO0];
   let isCommutable = isComm;
 }
 
 //  Logical
-class LogicNOR<string opstr, RegisterOperand RC>:
-  InstSE<(outs RC:$rd), (ins RC:$rs, RC:$rt),
+class LogicNOR<string opstr, RegisterOperand RO>:
+  InstSE<(outs RO:$rd), (ins RO:$rs, RO:$rt),
          !strconcat(opstr, "\t$rd, $rs, $rt"),
-         [(set RC:$rd, (not (or RC:$rs, RC:$rt)))], IIAlu, FrmR, opstr> {
+         [(set RO:$rd, (not (or RO:$rs, RO:$rt)))], IIArith, FrmR, opstr> {
   let isCommutable = 1;
 }
 
 // Shifts
 class shift_rotate_imm<string opstr, Operand ImmOpnd,
-                       RegisterOperand RC, SDPatternOperator OpNode = null_frag,
+                       RegisterOperand RO, SDPatternOperator OpNode = null_frag,
                        SDPatternOperator PF = null_frag> :
-  InstSE<(outs RC:$rd), (ins RC:$rt, ImmOpnd:$shamt),
+  InstSE<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
          !strconcat(opstr, "\t$rd, $rt, $shamt"),
-         [(set RC:$rd, (OpNode RC:$rt, PF:$shamt))], IIAlu, FrmR, opstr>;
+         [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], IIArith, FrmR, opstr>;
 
-class shift_rotate_reg<string opstr, RegisterOperand RC,
+class shift_rotate_reg<string opstr, RegisterOperand RO,
                        SDPatternOperator OpNode = null_frag>:
-  InstSE<(outs RC:$rd), (ins CPURegsOpnd:$rs, RC:$rt),
+  InstSE<(outs RO:$rd), (ins RO:$rt, GPR32Opnd:$rs),
          !strconcat(opstr, "\t$rd, $rt, $rs"),
-         [(set RC:$rd, (OpNode RC:$rt, CPURegsOpnd:$rs))], IIAlu, FrmR, opstr>;
+         [(set RO:$rd, (OpNode RO:$rt, GPR32Opnd:$rs))], IIArith, FrmR, opstr>;
 
 // Load Upper Imediate
-class LoadUpper<string opstr, RegisterClass RC, Operand Imm>:
-  InstSE<(outs RC:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
-         [], IIAlu, FrmI>, IsAsCheapAsAMove {
+class LoadUpper<string opstr, RegisterOperand RO, Operand Imm>:
+  InstSE<(outs RO:$rt), (ins Imm:$imm16), !strconcat(opstr, "\t$rt, $imm16"),
+         [], IIArith, FrmI, opstr>, IsAsCheapAsAMove {
   let neverHasSideEffects = 1;
   let isReMaterializable = 1;
 }
 
-class FMem<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
-          InstrItinClass itin>: FFI<op, outs, ins, asmstr, pattern> {
-  bits<21> addr;
-  let Inst{25-21} = addr{20-16};
-  let Inst{15-0}  = addr{15-0};
-  let DecoderMethod = "DecodeMem";
-}
-
 // Memory Load/Store
-class Load<string opstr, SDPatternOperator OpNode, RegisterClass RC,
-           Operand MemOpnd, ComplexPattern Addr, string ofsuffix> :
-  InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
-         [(set RC:$rt, (OpNode Addr:$addr))], NoItinerary, FrmI,
-         !strconcat(opstr, ofsuffix)> {
+class Load<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+           InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, (OpNode Addr:$addr))], Itin, FrmI, opstr> {
   let DecoderMethod = "DecodeMem";
   let canFoldAsLoad = 1;
   let mayLoad = 1;
 }
 
-class Store<string opstr, SDPatternOperator OpNode, RegisterClass RC,
-            Operand MemOpnd, ComplexPattern Addr, string ofsuffix> :
-  InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
-         [(OpNode RC:$rt, Addr:$addr)], NoItinerary, FrmI,
-         !strconcat(opstr, ofsuffix)> {
+class Store<string opstr, DAGOperand RO, SDPatternOperator OpNode = null_frag,
+            InstrItinClass Itin = NoItinerary, ComplexPattern Addr = addr> :
+  InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, Addr:$addr)], Itin, FrmI, opstr> {
   let DecoderMethod = "DecodeMem";
   let mayStore = 1;
 }
 
-multiclass LoadM<string opstr, RegisterClass RC,
-                 SDPatternOperator OpNode = null_frag,
-                 ComplexPattern Addr = addr> {
-  def NAME : Load<opstr, OpNode, RC, mem, Addr, "">,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Load<opstr, OpNode, RC, mem64, Addr, "_p8">,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
-multiclass StoreM<string opstr, RegisterClass RC,
-                  SDPatternOperator OpNode = null_frag,
-                  ComplexPattern Addr = addr> {
-  def NAME : Store<opstr, OpNode, RC, mem, Addr, "">,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Store<opstr, OpNode, RC, mem64, Addr, "_p8">,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
 // Load/Store Left/Right
 let canFoldAsLoad = 1 in
-class LoadLeftRight<string opstr, SDNode OpNode, RegisterClass RC,
-                    Operand MemOpnd> :
-  InstSE<(outs RC:$rt), (ins MemOpnd:$addr, RC:$src),
+class LoadLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+                    InstrItinClass Itin> :
+  InstSE<(outs RO:$rt), (ins mem:$addr, RO:$src),
          !strconcat(opstr, "\t$rt, $addr"),
-         [(set RC:$rt, (OpNode addr:$addr, RC:$src))], NoItinerary, FrmI> {
+         [(set RO:$rt, (OpNode addr:$addr, RO:$src))], Itin, FrmI> {
   let DecoderMethod = "DecodeMem";
   string Constraints = "$src = $rt";
 }
 
-class StoreLeftRight<string opstr, SDNode OpNode, RegisterClass RC,
-                     Operand MemOpnd>:
-  InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"),
-         [(OpNode RC:$rt, addr:$addr)], NoItinerary, FrmI> {
+class StoreLeftRight<string opstr, SDNode OpNode, RegisterOperand RO,
+                     InstrItinClass Itin> :
+  InstSE<(outs), (ins RO:$rt, mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(OpNode RO:$rt, addr:$addr)], Itin, FrmI> {
   let DecoderMethod = "DecodeMem";
 }
 
-multiclass LoadLeftRightM<string opstr, SDNode OpNode, RegisterClass RC> {
-  def NAME : LoadLeftRight<opstr, OpNode, RC, mem>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : LoadLeftRight<opstr, OpNode, RC, mem64>,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
-multiclass StoreLeftRightM<string opstr, SDNode OpNode, RegisterClass RC> {
-  def NAME : StoreLeftRight<opstr, OpNode, RC, mem>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : StoreLeftRight<opstr, OpNode, RC, mem64>,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-    let isCodeGenOnly = 1;
-  }
-}
-
 // Conditional Branch
-class CBranch<string opstr, PatFrag cond_op, RegisterClass RC> :
-  InstSE<(outs), (ins RC:$rs, RC:$rt, brtarget:$offset),
+class CBranch<string opstr, DAGOperand opnd, PatFrag cond_op,
+              RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt, opnd:$offset),
          !strconcat(opstr, "\t$rs, $rt, $offset"),
-         [(brcond (i32 (cond_op RC:$rs, RC:$rt)), bb:$offset)], IIBranch,
-         FrmI> {
+         [(brcond (i32 (cond_op RO:$rs, RO:$rt)), bb:$offset)], IIBranch,
+         FrmI, opstr> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
   let Defs = [AT];
 }
 
-class CBranchZero<string opstr, PatFrag cond_op, RegisterClass RC> :
-  InstSE<(outs), (ins RC:$rs, brtarget:$offset),
+class CBranchZero<string opstr, DAGOperand opnd, PatFrag cond_op,
+                  RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, opnd:$offset),
          !strconcat(opstr, "\t$rs, $offset"),
-         [(brcond (i32 (cond_op RC:$rs, 0)), bb:$offset)], IIBranch, FrmI> {
+         [(brcond (i32 (cond_op RO:$rs, 0)), bb:$offset)], IIBranch,
+         FrmI, opstr> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
@@ -543,24 +529,24 @@ class CBranchZero<string opstr, PatFrag cond_op, RegisterClass RC> :
 }
 
 // SetCC
-class SetCC_R<string opstr, PatFrag cond_op, RegisterClass RC> :
-  InstSE<(outs CPURegsOpnd:$rd), (ins RC:$rs, RC:$rt),
+class SetCC_R<string opstr, PatFrag cond_op, RegisterOperand RO> :
+  InstSE<(outs GPR32Opnd:$rd), (ins RO:$rs, RO:$rt),
          !strconcat(opstr, "\t$rd, $rs, $rt"),
-         [(set CPURegsOpnd:$rd, (cond_op RC:$rs, RC:$rt))],
-         IIAlu, FrmR, opstr>;
+         [(set GPR32Opnd:$rd, (cond_op RO:$rs, RO:$rt))],
+         IIslt, FrmR, opstr>;
 
 class SetCC_I<string opstr, PatFrag cond_op, Operand Od, PatLeaf imm_type,
-              RegisterClass RC>:
-  InstSE<(outs CPURegsOpnd:$rt), (ins RC:$rs, Od:$imm16),
+              RegisterOperand RO>:
+  InstSE<(outs GPR32Opnd:$rt), (ins RO:$rs, Od:$imm16),
          !strconcat(opstr, "\t$rt, $rs, $imm16"),
-         [(set CPURegsOpnd:$rt, (cond_op RC:$rs, imm_type:$imm16))],
-         IIAlu, FrmI, opstr>;
+         [(set GPR32Opnd:$rt, (cond_op RO:$rs, imm_type:$imm16))],
+         IIslt, FrmI, opstr>;
 
 // Jump
 class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
-             SDPatternOperator targetoperator> :
+             SDPatternOperator targetoperator, string bopstr> :
   InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
-         [(operator targetoperator:$target)], IIBranch, FrmJ> {
+         [(operator targetoperator:$target)], IIBranch, FrmJ, bopstr> {
   let isTerminator=1;
   let isBarrier=1;
   let hasDelaySlot = 1;
@@ -569,9 +555,9 @@ class JumpFJ<DAGOperand opnd, string opstr, SDPatternOperator operator,
 }
 
 // Unconditional branch
-class UncondBranch<string opstr> :
-  InstSE<(outs), (ins brtarget:$offset), !strconcat(opstr, "\t$offset"),
-         [(br bb:$offset)], IIBranch, FrmI> {
+class UncondBranch<Instruction BEQInst> :
+  PseudoSE<(outs), (ins brtarget:$offset), [(br bb:$offset)], IIBranch>,
+  PseudoInstExpansion<(BEQInst ZERO, ZERO, brtarget:$offset)> {
   let isBranch = 1;
   let isTerminator = 1;
   let isBarrier = 1;
@@ -582,17 +568,20 @@ class UncondBranch<string opstr> :
 
 // Base class for indirect branch and return instruction classes.
 let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
-class JumpFR<RegisterClass RC, SDPatternOperator operator = null_frag>:
-  InstSE<(outs), (ins RC:$rs), "jr\t$rs", [(operator RC:$rs)], IIBranch, FrmR>;
+class JumpFR<string opstr, RegisterOperand RO,
+             SDPatternOperator operator = null_frag>:
+  InstSE<(outs), (ins RO:$rs), "jr\t$rs", [(operator RO:$rs)], IIBranch,
+         FrmR, opstr>;
 
 // Indirect branch
-class IndirectBranch<RegisterClass RC>: JumpFR<RC, brind> {
+class IndirectBranch<string opstr, RegisterOperand RO> :
+      JumpFR<opstr, RO, brind> {
   let isBranch = 1;
   let isIndirectBranch = 1;
 }
 
 // Return instruction
-class RetBase<RegisterClass RC>: JumpFR<RC> {
+class RetBase<string opstr, RegisterOperand RO>: JumpFR<opstr, RO> {
   let isReturn = 1;
   let isCodeGenOnly = 1;
   let hasCtrlDep = 1;
@@ -601,29 +590,30 @@ class RetBase<RegisterClass RC>: JumpFR<RC> {
 
 // Jump and Link (Call)
 let isCall=1, hasDelaySlot=1, Defs = [RA] in {
-  class JumpLink<string opstr> :
-    InstSE<(outs), (ins calltarget:$target), !strconcat(opstr, "\t$target"),
-           [(MipsJmpLink imm:$target)], IIBranch, FrmJ> {
+  class JumpLink<string opstr, DAGOperand opnd> :
+    InstSE<(outs), (ins opnd:$target), !strconcat(opstr, "\t$target"),
+           [(MipsJmpLink imm:$target)], IIBranch, FrmJ, opstr> {
     let DecoderMethod = "DecodeJumpTarget";
   }
 
-  class JumpLinkRegPseudo<RegisterClass RC, Instruction JALRInst,
-                          Register RetReg>:
-    PseudoSE<(outs), (ins RC:$rs), [(MipsJmpLink RC:$rs)], IIBranch>,
-    PseudoInstExpansion<(JALRInst RetReg, RC:$rs)>;
+  class JumpLinkRegPseudo<RegisterOperand RO, Instruction JALRInst,
+                          Register RetReg, RegisterOperand ResRO = RO>:
+    PseudoSE<(outs), (ins RO:$rs), [(MipsJmpLink RO:$rs)], IIBranch>,
+    PseudoInstExpansion<(JALRInst RetReg, ResRO:$rs)>;
 
-  class JumpLinkReg<string opstr, RegisterClass RC>:
-    InstSE<(outs RC:$rd), (ins RC:$rs), !strconcat(opstr, "\t$rd, $rs"),
-           [], IIBranch, FrmR>;
+  class JumpLinkReg<string opstr, RegisterOperand RO>:
+    InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
+           [], IIBranch, FrmR, opstr>;
 
-  class BGEZAL_FT<string opstr, RegisterOperand RO> :
-    InstSE<(outs), (ins RO:$rs, brtarget:$offset),
-           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI>;
+  class BGEZAL_FT<string opstr, DAGOperand opnd, RegisterOperand RO> :
+    InstSE<(outs), (ins RO:$rs, opnd:$offset),
+           !strconcat(opstr, "\t$rs, $offset"), [], IIBranch, FrmI, opstr>;
 
 }
 
-class BAL_FT :
-  InstSE<(outs), (ins brtarget:$offset), "bal\t$offset", [], IIBranch, FrmI> {
+class BAL_BR_Pseudo<Instruction RealInst> :
+  PseudoSE<(outs), (ins brtarget:$offset), [], IIBranch>,
+  PseudoInstExpansion<(RealInst ZERO, brtarget:$offset)> {
   let isBranch = 1;
   let isTerminator = 1;
   let isBarrier = 1;
@@ -631,12 +621,49 @@ class BAL_FT :
   let Defs = [RA];
 }
 
+// Syscall
+class SYS_FT<string opstr> :
+  InstSE<(outs), (ins uimm20:$code_),
+         !strconcat(opstr, "\t$code_"), [], NoItinerary, FrmI>;
+// Break
+class BRK_FT<string opstr> :
+  InstSE<(outs), (ins uimm10:$code_1, uimm10:$code_2),
+         !strconcat(opstr, "\t$code_1, $code_2"), [], NoItinerary, FrmOther>;
+
+// (D)Eret
+class ER_FT<string opstr> :
+  InstSE<(outs), (ins),
+         opstr, [], NoItinerary, FrmOther>;
+
+// Interrupts
+class DEI_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins),
+         !strconcat(opstr, "\t$rt"), [], NoItinerary, FrmOther>;
+
+// Wait
+class WAIT_FT<string opstr> :
+  InstSE<(outs), (ins), opstr, [], NoItinerary, FrmOther> {
+  let Inst{31-26} = 0x10;
+  let Inst{25}    = 1;
+  let Inst{24-6}  = 0;
+  let Inst{5-0}   = 0x20;
+}
+
 // Sync
 let hasSideEffects = 1 in
 class SYNC_FT :
   InstSE<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)],
          NoItinerary, FrmOther>;
 
+let hasSideEffects = 1 in
+class TEQ_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, RO:$rt, uimm16:$code_),
+         !strconcat(opstr, "\t$rs, $rt, $code_"), [], NoItinerary,
+         FrmI, opstr>;
+
+class TEQI_FT<string opstr, RegisterOperand RO> :
+  InstSE<(outs), (ins RO:$rs, uimm16:$imm16),
+         !strconcat(opstr, "\t$rs, $imm16"), [], NoItinerary, FrmOther, opstr>;
 // Mul, Div
 class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
            list<Register> DefRegs> :
@@ -651,49 +678,61 @@ class Mult<string opstr, InstrItinClass itin, RegisterOperand RO,
 // operands.
 class MultDivPseudo<Instruction RealInst, RegisterClass R0, RegisterOperand R1,
                     SDPatternOperator OpNode, InstrItinClass Itin,
-                    bit IsComm = 1, bit HasSideEffects = 0> :
+                    bit IsComm = 1, bit HasSideEffects = 0,
+                    bit UsesCustomInserter = 0> :
   PseudoSE<(outs R0:$ac), (ins R1:$rs, R1:$rt),
            [(set R0:$ac, (OpNode R1:$rs, R1:$rt))], Itin>,
   PseudoInstExpansion<(RealInst R1:$rs, R1:$rt)> {
   let isCommutable = IsComm;
   let hasSideEffects = HasSideEffects;
+  let usesCustomInserter = UsesCustomInserter;
 }
 
 // Pseudo multiply add/sub instruction with explicit accumulator register
 // operands.
 class MAddSubPseudo<Instruction RealInst, SDPatternOperator OpNode>
-  : PseudoSE<(outs ACRegs:$ac),
-             (ins CPURegsOpnd:$rs, CPURegsOpnd:$rt, ACRegs:$acin),
-             [(set ACRegs:$ac,
-              (OpNode CPURegsOpnd:$rs, CPURegsOpnd:$rt, ACRegs:$acin))],
-             IIImul>,
-    PseudoInstExpansion<(RealInst CPURegsOpnd:$rs, CPURegsOpnd:$rt)> {
+  : PseudoSE<(outs ACC64:$ac),
+             (ins GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin),
+             [(set ACC64:$ac,
+              (OpNode GPR32Opnd:$rs, GPR32Opnd:$rt, ACC64:$acin))],
+             IIImult>,
+    PseudoInstExpansion<(RealInst GPR32Opnd:$rs, GPR32Opnd:$rt)> {
   string Constraints = "$acin = $ac";
 }
 
 class Div<string opstr, InstrItinClass itin, RegisterOperand RO,
           list<Register> DefRegs> :
   InstSE<(outs), (ins RO:$rs, RO:$rt), !strconcat(opstr, "\t$$zero, $rs, $rt"),
-         [], itin, FrmR> {
+         [], itin, FrmR, opstr> {
   let Defs = DefRegs;
 }
 
 // Move from Hi/Lo
-class MoveFromLOHI<string opstr, RegisterClass RC, list<Register> UseRegs>:
-  InstSE<(outs RC:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR> {
-  let Uses = UseRegs;
+class PseudoMFLOHI<RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode>
+  : PseudoSE<(outs DstRC:$rd), (ins SrcRC:$hilo),
+             [(set DstRC:$rd, (OpNode SrcRC:$hilo))], IIHiLo>;
+
+class MoveFromLOHI<string opstr, RegisterOperand RO, Register UseReg>:
+  InstSE<(outs RO:$rd), (ins), !strconcat(opstr, "\t$rd"), [], IIHiLo, FrmR,
+         opstr> {
+  let Uses = [UseReg];
   let neverHasSideEffects = 1;
 }
 
-class MoveToLOHI<string opstr, RegisterClass RC, list<Register> DefRegs>:
-  InstSE<(outs), (ins RC:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo, FrmR> {
+class PseudoMTLOHI<RegisterClass DstRC, RegisterClass SrcRC>
+  : PseudoSE<(outs DstRC:$lohi), (ins SrcRC:$lo, SrcRC:$hi),
+             [(set DstRC:$lohi, (MipsMTLOHI SrcRC:$lo, SrcRC:$hi))], IIHiLo>;
+
+class MoveToLOHI<string opstr, RegisterOperand RO, list<Register> DefRegs>:
+  InstSE<(outs), (ins RO:$rs), !strconcat(opstr, "\t$rs"), [], IIHiLo,
+  FrmR, opstr> {
   let Defs = DefRegs;
   let neverHasSideEffects = 1;
 }
 
-class EffectiveAddress<string opstr, RegisterClass RC, Operand Mem> :
-  InstSE<(outs RC:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
-         [(set RC:$rt, addr:$addr)], NoItinerary, FrmI> {
+class EffectiveAddress<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem_ea:$addr), !strconcat(opstr, "\t$rt, $addr"),
+         [(set RO:$rt, addr:$addr)], NoItinerary, FrmI> {
   let isCodeGenOnly = 1;
   let DecoderMethod = "DecodeMem";
 }
@@ -701,97 +740,91 @@ class EffectiveAddress<string opstr, RegisterClass RC, Operand Mem> :
 // Count Leading Ones/Zeros in Word
 class CountLeading0<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-         [(set RO:$rd, (ctlz RO:$rs))], IIAlu, FrmR>,
+         [(set RO:$rd, (ctlz RO:$rs))], IIArith, FrmR, opstr>,
   Requires<[HasBitCount, HasStdEnc]>;
 
 class CountLeading1<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-         [(set RO:$rd, (ctlz (not RO:$rs)))], IIAlu, FrmR>,
+         [(set RO:$rd, (ctlz (not RO:$rs)))], IIArith, FrmR, opstr>,
   Requires<[HasBitCount, HasStdEnc]>;
 
 
 // Sign Extend in Register.
-class SignExtInReg<string opstr, ValueType vt, RegisterClass RC> :
-  InstSE<(outs RC:$rd), (ins RC:$rt), !strconcat(opstr, "\t$rd, $rt"),
-         [(set RC:$rd, (sext_inreg RC:$rt, vt))], NoItinerary, FrmR> {
+class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO> :
+  InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
+         [(set RO:$rd, (sext_inreg RO:$rt, vt))], IIseb, FrmR, opstr> {
   let Predicates = [HasSEInReg, HasStdEnc];
 }
 
 // Subword Swap
 class SubwordSwap<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
-         NoItinerary, FrmR> {
+         NoItinerary, FrmR, opstr> {
   let Predicates = [HasSwap, HasStdEnc];
   let neverHasSideEffects = 1;
 }
 
 // Read Hardware
-class ReadHardware<RegisterClass CPURegClass, RegisterOperand RO> :
-  InstSE<(outs CPURegClass:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
-         IIAlu, FrmR>;
+class ReadHardware<RegisterOperand CPURegOperand, RegisterOperand RO> :
+  InstSE<(outs CPURegOperand:$rt), (ins RO:$rd), "rdhwr\t$rt, $rd", [],
+         IIArith, FrmR>;
 
 // Ext and Ins
-class ExtBase<string opstr, RegisterOperand RO>:
-  InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ext:$size),
+class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
-         [(set RO:$rt, (MipsExt RO:$rs, imm:$pos, imm:$size))], NoItinerary,
-         FrmR> {
+         [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], NoItinerary,
+         FrmR, opstr> {
   let Predicates = [HasMips32r2, HasStdEnc];
 }
 
-class InsBase<string opstr, RegisterOperand RO>:
-  InstSE<(outs RO:$rt), (ins RO:$rs, uimm16:$pos, size_ins:$size, RO:$src),
+class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
+              SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
-         [(set RO:$rt, (MipsIns RO:$rs, imm:$pos, imm:$size, RO:$src))],
-         NoItinerary, FrmR> {
+         [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
+         NoItinerary, FrmR, opstr> {
   let Predicates = [HasMips32r2, HasStdEnc];
   let Constraints = "$src = $rt";
 }
 
 // Atomic instructions with 2 source operands (ATOMIC_SWAP & ATOMIC_LOAD_*).
-class Atomic2Ops<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
-  PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$incr),
-           [(set DRC:$dst, (Op PRC:$ptr, DRC:$incr))]>;
-
-multiclass Atomic2Ops32<PatFrag Op> {
-  def NAME : Atomic2Ops<Op, CPURegs, CPURegs>, Requires<[NotN64, HasStdEnc]>;
-  def _P8  : Atomic2Ops<Op, CPURegs, CPU64Regs>,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-  }
-}
+class Atomic2Ops<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$incr),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$incr))]>;
 
 // Atomic Compare & Swap.
-class AtomicCmpSwap<PatFrag Op, RegisterClass DRC, RegisterClass PRC> :
-  PseudoSE<(outs DRC:$dst), (ins PRC:$ptr, DRC:$cmp, DRC:$swap),
-           [(set DRC:$dst, (Op PRC:$ptr, DRC:$cmp, DRC:$swap))]>;
-
-multiclass AtomicCmpSwap32<PatFrag Op>  {
-  def NAME : AtomicCmpSwap<Op, CPURegs, CPURegs>,
-             Requires<[NotN64, HasStdEnc]>;
-  def _P8  : AtomicCmpSwap<Op, CPURegs, CPU64Regs>,
-             Requires<[IsN64, HasStdEnc]> {
-    let DecoderNamespace = "Mips64";
-  }
-}
+class AtomicCmpSwap<PatFrag Op, RegisterClass DRC> :
+  PseudoSE<(outs DRC:$dst), (ins PtrRC:$ptr, DRC:$cmp, DRC:$swap),
+           [(set DRC:$dst, (Op iPTR:$ptr, DRC:$cmp, DRC:$swap))]>;
 
-class LLBase<string opstr, RegisterOperand RO, Operand Mem> :
-  InstSE<(outs RO:$rt), (ins Mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
+class LLBase<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt), (ins mem:$addr), !strconcat(opstr, "\t$rt, $addr"),
          [], NoItinerary, FrmI> {
   let DecoderMethod = "DecodeMem";
   let mayLoad = 1;
 }
 
-class SCBase<string opstr, RegisterOperand RO, Operand Mem> :
-  InstSE<(outs RO:$dst), (ins RO:$rt, Mem:$addr),
+class SCBase<string opstr, RegisterOperand RO> :
+  InstSE<(outs RO:$dst), (ins RO:$rt, mem:$addr),
          !strconcat(opstr, "\t$rt, $addr"), [], NoItinerary, FrmI> {
   let DecoderMethod = "DecodeMem";
   let mayStore = 1;
   let Constraints = "$rt = $dst";
 }
 
-class MFC3OP<dag outs, dag ins, string asmstr> :
-  InstSE<outs, ins, asmstr, [], NoItinerary, FrmFR>;
+class MFC3OP<string asmstr, RegisterOperand RO> :
+  InstSE<(outs RO:$rt, RO:$rd, uimm16:$sel), (ins),
+         !strconcat(asmstr, "\t$rt, $rd, $sel"), [], NoItinerary, FrmFR>;
+
+class TrapBase<Instruction RealInst>
+  : PseudoSE<(outs), (ins), [(trap)], NoItinerary>,
+    PseudoInstExpansion<(RealInst 0, 0)> {
+  let isBarrier = 1;
+  let isTerminator = 1;
+  let isCodeGenOnly = 1;
+}
 
 //===----------------------------------------------------------------------===//
 // Pseudo instructions
@@ -809,38 +842,38 @@ def ADJCALLSTACKUP   : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
 }
 
 let usesCustomInserter = 1 in {
-  defm ATOMIC_LOAD_ADD_I8   : Atomic2Ops32<atomic_load_add_8>;
-  defm ATOMIC_LOAD_ADD_I16  : Atomic2Ops32<atomic_load_add_16>;
-  defm ATOMIC_LOAD_ADD_I32  : Atomic2Ops32<atomic_load_add_32>;
-  defm ATOMIC_LOAD_SUB_I8   : Atomic2Ops32<atomic_load_sub_8>;
-  defm ATOMIC_LOAD_SUB_I16  : Atomic2Ops32<atomic_load_sub_16>;
-  defm ATOMIC_LOAD_SUB_I32  : Atomic2Ops32<atomic_load_sub_32>;
-  defm ATOMIC_LOAD_AND_I8   : Atomic2Ops32<atomic_load_and_8>;
-  defm ATOMIC_LOAD_AND_I16  : Atomic2Ops32<atomic_load_and_16>;
-  defm ATOMIC_LOAD_AND_I32  : Atomic2Ops32<atomic_load_and_32>;
-  defm ATOMIC_LOAD_OR_I8    : Atomic2Ops32<atomic_load_or_8>;
-  defm ATOMIC_LOAD_OR_I16   : Atomic2Ops32<atomic_load_or_16>;
-  defm ATOMIC_LOAD_OR_I32   : Atomic2Ops32<atomic_load_or_32>;
-  defm ATOMIC_LOAD_XOR_I8   : Atomic2Ops32<atomic_load_xor_8>;
-  defm ATOMIC_LOAD_XOR_I16  : Atomic2Ops32<atomic_load_xor_16>;
-  defm ATOMIC_LOAD_XOR_I32  : Atomic2Ops32<atomic_load_xor_32>;
-  defm ATOMIC_LOAD_NAND_I8  : Atomic2Ops32<atomic_load_nand_8>;
-  defm ATOMIC_LOAD_NAND_I16 : Atomic2Ops32<atomic_load_nand_16>;
-  defm ATOMIC_LOAD_NAND_I32 : Atomic2Ops32<atomic_load_nand_32>;
-
-  defm ATOMIC_SWAP_I8       : Atomic2Ops32<atomic_swap_8>;
-  defm ATOMIC_SWAP_I16      : Atomic2Ops32<atomic_swap_16>;
-  defm ATOMIC_SWAP_I32      : Atomic2Ops32<atomic_swap_32>;
-
-  defm ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap32<atomic_cmp_swap_8>;
-  defm ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap32<atomic_cmp_swap_16>;
-  defm ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap32<atomic_cmp_swap_32>;
+  def ATOMIC_LOAD_ADD_I8   : Atomic2Ops<atomic_load_add_8, GPR32>;
+  def ATOMIC_LOAD_ADD_I16  : Atomic2Ops<atomic_load_add_16, GPR32>;
+  def ATOMIC_LOAD_ADD_I32  : Atomic2Ops<atomic_load_add_32, GPR32>;
+  def ATOMIC_LOAD_SUB_I8   : Atomic2Ops<atomic_load_sub_8, GPR32>;
+  def ATOMIC_LOAD_SUB_I16  : Atomic2Ops<atomic_load_sub_16, GPR32>;
+  def ATOMIC_LOAD_SUB_I32  : Atomic2Ops<atomic_load_sub_32, GPR32>;
+  def ATOMIC_LOAD_AND_I8   : Atomic2Ops<atomic_load_and_8, GPR32>;
+  def ATOMIC_LOAD_AND_I16  : Atomic2Ops<atomic_load_and_16, GPR32>;
+  def ATOMIC_LOAD_AND_I32  : Atomic2Ops<atomic_load_and_32, GPR32>;
+  def ATOMIC_LOAD_OR_I8    : Atomic2Ops<atomic_load_or_8, GPR32>;
+  def ATOMIC_LOAD_OR_I16   : Atomic2Ops<atomic_load_or_16, GPR32>;
+  def ATOMIC_LOAD_OR_I32   : Atomic2Ops<atomic_load_or_32, GPR32>;
+  def ATOMIC_LOAD_XOR_I8   : Atomic2Ops<atomic_load_xor_8, GPR32>;
+  def ATOMIC_LOAD_XOR_I16  : Atomic2Ops<atomic_load_xor_16, GPR32>;
+  def ATOMIC_LOAD_XOR_I32  : Atomic2Ops<atomic_load_xor_32, GPR32>;
+  def ATOMIC_LOAD_NAND_I8  : Atomic2Ops<atomic_load_nand_8, GPR32>;
+  def ATOMIC_LOAD_NAND_I16 : Atomic2Ops<atomic_load_nand_16, GPR32>;
+  def ATOMIC_LOAD_NAND_I32 : Atomic2Ops<atomic_load_nand_32, GPR32>;
+
+  def ATOMIC_SWAP_I8       : Atomic2Ops<atomic_swap_8, GPR32>;
+  def ATOMIC_SWAP_I16      : Atomic2Ops<atomic_swap_16, GPR32>;
+  def ATOMIC_SWAP_I32      : Atomic2Ops<atomic_swap_32, GPR32>;
+
+  def ATOMIC_CMP_SWAP_I8   : AtomicCmpSwap<atomic_cmp_swap_8, GPR32>;
+  def ATOMIC_CMP_SWAP_I16  : AtomicCmpSwap<atomic_cmp_swap_16, GPR32>;
+  def ATOMIC_CMP_SWAP_I32  : AtomicCmpSwap<atomic_cmp_swap_32, GPR32>;
 }
 
 /// Pseudo instructions for loading and storing accumulator registers.
-let isPseudo = 1 in {
-  defm LOAD_AC64  : LoadM<"load_ac64", ACRegs>;
-  defm STORE_AC64 : StoreM<"store_ac64", ACRegs>;
+let isPseudo = 1, isCodeGenOnly = 1 in {
+  def LOAD_ACC64  : Load<"", ACC64>;
+  def STORE_ACC64 : Store<"", ACC64>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -851,113 +884,146 @@ let isPseudo = 1 in {
 //===----------------------------------------------------------------------===//
 
 /// Arithmetic Instructions (ALU Immediate)
-def ADDiu : MMRel, ArithLogicI<"addiu", simm16, CPURegsOpnd, immSExt16, add>,
+def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, IIArith, immSExt16,
+                               add>,
             ADDI_FM<0x9>, IsAsCheapAsAMove;
-def ADDi  : MMRel, ArithLogicI<"addi", simm16, CPURegsOpnd>, ADDI_FM<0x8>;
-def SLTi  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, CPURegs>,
+def ADDi  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>;
+def SLTi  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
             SLTI_FM<0xa>;
-def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, CPURegs>,
+def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
             SLTI_FM<0xb>;
-def ANDi  : MMRel, ArithLogicI<"andi", uimm16, CPURegsOpnd, immZExt16, and>,
+def ANDi  : MMRel, ArithLogicI<"andi", uimm16, GPR32Opnd, IILogic, immZExt16,
+                               and>,
             ADDI_FM<0xc>;
-def ORi   : MMRel, ArithLogicI<"ori", uimm16, CPURegsOpnd, immZExt16, or>,
+def ORi   : MMRel, ArithLogicI<"ori", uimm16, GPR32Opnd, IILogic, immZExt16,
+                               or>,
             ADDI_FM<0xd>;
-def XORi  : MMRel, ArithLogicI<"xori", uimm16, CPURegsOpnd, immZExt16, xor>,
+def XORi  : MMRel, ArithLogicI<"xori", uimm16, GPR32Opnd, IILogic, immZExt16,
+                               xor>,
             ADDI_FM<0xe>;
-def LUi   : MMRel, LoadUpper<"lui", CPURegs, uimm16>, LUI_FM;
+def LUi   : MMRel, LoadUpper<"lui", GPR32Opnd, uimm16>, LUI_FM;
 
 /// Arithmetic Instructions (3-Operand, R-Type)
-def ADDu  : MMRel, ArithLogicR<"addu", CPURegsOpnd, 1, IIAlu, add>,
+def ADDu  : MMRel, ArithLogicR<"addu", GPR32Opnd, 1, IIArith, add>,
             ADD_FM<0, 0x21>;
-def SUBu  : MMRel, ArithLogicR<"subu", CPURegsOpnd, 0, IIAlu, sub>,
+def SUBu  : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, IIArith, sub>,
             ADD_FM<0, 0x23>;
-def MUL   : MMRel, ArithLogicR<"mul", CPURegsOpnd, 1, IIImul, mul>,
+let Defs = [HI0, LO0] in
+def MUL   : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, IIImul, mul>,
             ADD_FM<0x1c, 2>;
-def ADD   : MMRel, ArithLogicR<"add", CPURegsOpnd>, ADD_FM<0, 0x20>;
-def SUB   : MMRel, ArithLogicR<"sub", CPURegsOpnd>, ADD_FM<0, 0x22>;
-def SLT   : MMRel, SetCC_R<"slt", setlt, CPURegs>, ADD_FM<0, 0x2a>;
-def SLTu  : MMRel, SetCC_R<"sltu", setult, CPURegs>, ADD_FM<0, 0x2b>;
-def AND   : MMRel, ArithLogicR<"and", CPURegsOpnd, 1, IIAlu, and>,
+def ADD   : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
+def SUB   : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>;
+def SLT   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
+def SLTu  : MMRel, SetCC_R<"sltu", setult, GPR32Opnd>, ADD_FM<0, 0x2b>;
+def AND   : MMRel, ArithLogicR<"and", GPR32Opnd, 1, IILogic, and>,
             ADD_FM<0, 0x24>;
-def OR    : MMRel, ArithLogicR<"or", CPURegsOpnd, 1, IIAlu, or>,
+def OR    : MMRel, ArithLogicR<"or", GPR32Opnd, 1, IILogic, or>,
             ADD_FM<0, 0x25>;
-def XOR   : MMRel, ArithLogicR<"xor", CPURegsOpnd, 1, IIAlu, xor>,
+def XOR   : MMRel, ArithLogicR<"xor", GPR32Opnd, 1, IILogic, xor>,
             ADD_FM<0, 0x26>;
-def NOR   : MMRel, LogicNOR<"nor", CPURegsOpnd>, ADD_FM<0, 0x27>;
+def NOR   : MMRel, LogicNOR<"nor", GPR32Opnd>, ADD_FM<0, 0x27>;
 
 /// Shift Instructions
-def SLL  : MMRel, shift_rotate_imm<"sll", shamt, CPURegsOpnd, shl, immZExt5>,
+def SLL  : MMRel, shift_rotate_imm<"sll", uimm5, GPR32Opnd, shl, immZExt5>,
            SRA_FM<0, 0>;
-def SRL  : MMRel, shift_rotate_imm<"srl", shamt, CPURegsOpnd, srl, immZExt5>,
+def SRL  : MMRel, shift_rotate_imm<"srl", uimm5, GPR32Opnd, srl, immZExt5>,
            SRA_FM<2, 0>;
-def SRA  : MMRel, shift_rotate_imm<"sra", shamt, CPURegsOpnd, sra, immZExt5>,
+def SRA  : MMRel, shift_rotate_imm<"sra", uimm5, GPR32Opnd, sra, immZExt5>,
            SRA_FM<3, 0>;
-def SLLV : MMRel, shift_rotate_reg<"sllv", CPURegsOpnd, shl>, SRLV_FM<4, 0>;
-def SRLV : MMRel, shift_rotate_reg<"srlv", CPURegsOpnd, srl>, SRLV_FM<6, 0>;
-def SRAV : MMRel, shift_rotate_reg<"srav", CPURegsOpnd, sra>, SRLV_FM<7, 0>;
+def SLLV : MMRel, shift_rotate_reg<"sllv", GPR32Opnd, shl>, SRLV_FM<4, 0>;
+def SRLV : MMRel, shift_rotate_reg<"srlv", GPR32Opnd, srl>, SRLV_FM<6, 0>;
+def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, sra>, SRLV_FM<7, 0>;
 
 // Rotate Instructions
 let Predicates = [HasMips32r2, HasStdEnc] in {
-  def ROTR  : MMRel, shift_rotate_imm<"rotr", shamt, CPURegsOpnd, rotr,
+  def ROTR  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, rotr,
                                       immZExt5>,
               SRA_FM<2, 1>;
-  def ROTRV : MMRel, shift_rotate_reg<"rotrv", CPURegsOpnd, rotr>,
+  def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, rotr>,
               SRLV_FM<6, 1>;
 }
 
 /// Load and Store Instructions
 ///  aligned
-defm LB  : LoadM<"lb", CPURegs, sextloadi8>, MMRel, LW_FM<0x20>;
-defm LBu : LoadM<"lbu", CPURegs, zextloadi8, addrDefault>, MMRel, LW_FM<0x24>;
-defm LH  : LoadM<"lh", CPURegs, sextloadi16, addrDefault>, MMRel, LW_FM<0x21>;
-defm LHu : LoadM<"lhu", CPURegs, zextloadi16>, MMRel, LW_FM<0x25>;
-defm LW  : LoadM<"lw", CPURegs, load, addrDefault>, MMRel, LW_FM<0x23>;
-defm SB  : StoreM<"sb", CPURegs, truncstorei8>, MMRel, LW_FM<0x28>;
-defm SH  : StoreM<"sh", CPURegs, truncstorei16>, MMRel, LW_FM<0x29>;
-defm SW  : StoreM<"sw", CPURegs, store>, MMRel, LW_FM<0x2b>;
+def LB  : Load<"lb", GPR32Opnd, sextloadi8, IILoad>, MMRel, LW_FM<0x20>;
+def LBu : Load<"lbu", GPR32Opnd, zextloadi8, IILoad, addrDefault>, MMRel,
+          LW_FM<0x24>;
+def LH  : Load<"lh", GPR32Opnd, sextloadi16, IILoad, addrDefault>, MMRel,
+          LW_FM<0x21>;
+def LHu : Load<"lhu", GPR32Opnd, zextloadi16, IILoad>, MMRel, LW_FM<0x25>;
+def LW  : Load<"lw", GPR32Opnd, load, IILoad, addrDefault>, MMRel,
+          LW_FM<0x23>;
+def SB  : Store<"sb", GPR32Opnd, truncstorei8, IIStore>, MMRel, LW_FM<0x28>;
+def SH  : Store<"sh", GPR32Opnd, truncstorei16, IIStore>, MMRel, LW_FM<0x29>;
+def SW  : Store<"sw", GPR32Opnd, store, IIStore>, MMRel, LW_FM<0x2b>;
 
 /// load/store left/right
-defm LWL : LoadLeftRightM<"lwl", MipsLWL, CPURegs>, LW_FM<0x22>;
-defm LWR : LoadLeftRightM<"lwr", MipsLWR, CPURegs>, LW_FM<0x26>;
-defm SWL : StoreLeftRightM<"swl", MipsSWL, CPURegs>, LW_FM<0x2a>;
-defm SWR : StoreLeftRightM<"swr", MipsSWR, CPURegs>, LW_FM<0x2e>;
+let Predicates = [NotInMicroMips] in {
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, IILoad>, LW_FM<0x22>;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, IILoad>, LW_FM<0x26>;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, IIStore>, LW_FM<0x2a>;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, IIStore>, LW_FM<0x2e>;
+}
 
 def SYNC : SYNC_FT, SYNC_FM;
+def TEQ : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM<0x34>;
+def TGE : MMRel, TEQ_FT<"tge", GPR32Opnd>, TEQ_FM<0x30>;
+def TGEU : MMRel, TEQ_FT<"tgeu", GPR32Opnd>, TEQ_FM<0x31>;
+def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>;
+def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>;
+def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>;
 
-/// Load-linked, Store-conditional
-let Predicates = [NotN64, HasStdEnc] in {
-  def LL : LLBase<"ll", CPURegsOpnd, mem>, LW_FM<0x30>;
-  def SC : SCBase<"sc", CPURegsOpnd, mem>, LW_FM<0x38>;
-}
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>;
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>;
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>;
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>;
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>;
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>;
 
-let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def LL_P8 : LLBase<"ll", CPURegsOpnd, mem64>, LW_FM<0x30>;
-  def SC_P8 : SCBase<"sc", CPURegsOpnd, mem64>, LW_FM<0x38>;
-}
+def BREAK : BRK_FT<"break">, BRK_FM<0xd>;
+def SYSCALL : SYS_FT<"syscall">, SYS_FM<0xc>;
+def TRAP : TrapBase<BREAK>;
+
+def ERET : ER_FT<"eret">, ER_FM<0x18>;
+def DERET : ER_FT<"deret">, ER_FM<0x1f>;
+
+def EI : DEI_FT<"ei", GPR32Opnd>, EI_FM<1>;
+def DI : DEI_FT<"di", GPR32Opnd>, EI_FM<0>;
+
+def WAIT : WAIT_FT<"wait">;
+
+/// Load-linked, Store-conditional
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>;
 
 /// Jump and Branch Instructions
-def J       : JumpFJ<jmptarget, "j", br, bb>, FJ<2>,
+def J       : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
               Requires<[RelocStatic, HasStdEnc]>, IsBranch;
-def JR      : IndirectBranch<CPURegs>, MTLO_FM<8>;
-def B       : UncondBranch<"b">, B_FM;
-def BEQ     : CBranch<"beq", seteq, CPURegs>, BEQ_FM<4>;
-def BNE     : CBranch<"bne", setne, CPURegs>, BEQ_FM<5>;
-def BGEZ    : CBranchZero<"bgez", setge, CPURegs>, BGEZ_FM<1, 1>;
-def BGTZ    : CBranchZero<"bgtz", setgt, CPURegs>, BGEZ_FM<7, 0>;
-def BLEZ    : CBranchZero<"blez", setle, CPURegs>, BGEZ_FM<6, 0>;
-def BLTZ    : CBranchZero<"bltz", setlt, CPURegs>, BGEZ_FM<1, 0>;
-
-def BAL_BR: BAL_FT, BAL_FM;
-
-def JAL  : JumpLink<"jal">, FJ<3>;
-def JALR : JumpLinkReg<"jalr", CPURegs>, JALR_FM;
-def JALRPseudo : JumpLinkRegPseudo<CPURegs, JALR, RA>;
-def BGEZAL : BGEZAL_FT<"bgezal", CPURegsOpnd>, BGEZAL_FM<0x11>;
-def BLTZAL : BGEZAL_FT<"bltzal", CPURegsOpnd>, BGEZAL_FM<0x10>;
-def TAILCALL : JumpFJ<calltarget, "j", MipsTailCall, imm>, FJ<2>, IsTailCall;
-def TAILCALL_R : JumpFR<CPURegs, MipsTailCall>, MTLO_FM<8>, IsTailCall;
-
-def RET : RetBase<CPURegs>, MTLO_FM<8>;
+def JR      : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>;
+def BEQ     : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
+def BNE     : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
+def BGEZ    : MMRel, CBranchZero<"bgez", brtarget, setge, GPR32Opnd>,
+              BGEZ_FM<1, 1>;
+def BGTZ    : MMRel, CBranchZero<"bgtz", brtarget, setgt, GPR32Opnd>,
+              BGEZ_FM<7, 0>;
+def BLEZ    : MMRel, CBranchZero<"blez", brtarget, setle, GPR32Opnd>,
+              BGEZ_FM<6, 0>;
+def BLTZ    : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
+              BGEZ_FM<1, 0>;
+def B       : UncondBranch<BEQ>;
+
+def JAL  : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
+def JALR : MMRel, JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
+def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
+def BGEZAL : MMRel, BGEZAL_FT<"bgezal", brtarget, GPR32Opnd>, BGEZAL_FM<0x11>;
+def BLTZAL : MMRel, BGEZAL_FT<"bltzal", brtarget, GPR32Opnd>, BGEZAL_FM<0x10>;
+def BAL_BR : BAL_BR_Pseudo<BGEZAL>;
+def TAILCALL : MMRel, JumpFJ<calltarget, "j", MipsTailCall, imm, "tcall">,
+               FJ<2>, IsTailCall;
+def TAILCALL_R : MMRel, JumpFR<"tcallr", GPR32Opnd, MipsTailCall>, MTLO_FM<8>,
+                 IsTailCall;
+
+def RET : MMRel, RetBase<"ret", GPR32Opnd>, MTLO_FM<8>;
 
 // Exception handling related node and instructions.
 // The conversion sequence is:
@@ -973,41 +1039,38 @@ def MIPSehret : SDNode<"MipsISD::EH_RETURN", SDT_MipsEHRET,
                       [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 
 let Uses = [V0, V1], isTerminator = 1, isReturn = 1, isBarrier = 1 in {
-  def MIPSeh_return32 : MipsPseudo<(outs), (ins CPURegs:$spoff, CPURegs:$dst),
-                                [(MIPSehret CPURegs:$spoff, CPURegs:$dst)]>;
-  def MIPSeh_return64 : MipsPseudo<(outs), (ins CPU64Regs:$spoff,
-                                                CPU64Regs:$dst),
-                                [(MIPSehret CPU64Regs:$spoff, CPU64Regs:$dst)]>;
+  def MIPSeh_return32 : MipsPseudo<(outs), (ins GPR32:$spoff, GPR32:$dst),
+                                [(MIPSehret GPR32:$spoff, GPR32:$dst)]>;
+  def MIPSeh_return64 : MipsPseudo<(outs), (ins GPR64:$spoff,
+                                                GPR64:$dst),
+                                [(MIPSehret GPR64:$spoff, GPR64:$dst)]>;
 }
 
 /// Multiply and Divide Instructions.
-def MULT  : MMRel, Mult<"mult", IIImul, CPURegsOpnd, [HI, LO]>,
+def MULT  : MMRel, Mult<"mult", IIImult, GPR32Opnd, [HI0, LO0]>,
             MULT_FM<0, 0x18>;
-def MULTu : MMRel, Mult<"multu", IIImul, CPURegsOpnd, [HI, LO]>,
+def MULTu : MMRel, Mult<"multu", IIImult, GPR32Opnd, [HI0, LO0]>,
             MULT_FM<0, 0x19>;
-def PseudoMULT  : MultDivPseudo<MULT, ACRegs, CPURegsOpnd, MipsMult, IIImul>;
-def PseudoMULTu : MultDivPseudo<MULTu, ACRegs, CPURegsOpnd, MipsMultu, IIImul>;
-def SDIV  : Div<"div", IIIdiv, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x1a>;
-def UDIV  : Div<"divu", IIIdiv, CPURegsOpnd, [HI, LO]>, MULT_FM<0, 0x1b>;
-def PseudoSDIV : MultDivPseudo<SDIV, ACRegs, CPURegsOpnd, MipsDivRem, IIIdiv, 0>;
-def PseudoUDIV : MultDivPseudo<UDIV, ACRegs, CPURegsOpnd, MipsDivRemU, IIIdiv,
-                               0>;
-
-def MTHI : MoveToLOHI<"mthi", CPURegs, [HI]>, MTLO_FM<0x11>;
-def MTLO : MoveToLOHI<"mtlo", CPURegs, [LO]>, MTLO_FM<0x13>;
-def MFHI : MoveFromLOHI<"mfhi", CPURegs, [HI]>, MFLO_FM<0x10>;
-def MFLO : MoveFromLOHI<"mflo", CPURegs, [LO]>, MFLO_FM<0x12>;
+def SDIV  : MMRel, Div<"div", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x1a>;
+def UDIV  : MMRel, Div<"divu", IIIdiv, GPR32Opnd, [HI0, LO0]>,
+            MULT_FM<0, 0x1b>;
+
+def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>;
+def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>;
+def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>;
+def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>;
 
 /// Sign Ext In Register Instructions.
-def SEB : SignExtInReg<"seb", i8, CPURegs>, SEB_FM<0x10, 0x20>;
-def SEH : SignExtInReg<"seh", i16, CPURegs>, SEB_FM<0x18, 0x20>;
+def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd>, SEB_FM<0x10, 0x20>;
+def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd>, SEB_FM<0x18, 0x20>;
 
 /// Count Leading
-def CLZ : CountLeading0<"clz", CPURegsOpnd>, CLO_FM<0x20>;
-def CLO : CountLeading1<"clo", CPURegsOpnd>, CLO_FM<0x21>;
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>;
 
 /// Word Swap Bytes Within Halfwords
-def WSBH : SubwordSwap<"wsbh", CPURegsOpnd>, SEB_FM<2, 0x20>;
+def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>;
 
 /// No operation.
 def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
@@ -1016,85 +1079,98 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
 // instructions. The same not happens for stack address copies, so an
 // add op with mem ComplexPattern is used and the stack address copy
 // can be matched. It's similar to Sparc LEA_ADDRi
-def LEA_ADDiu : EffectiveAddress<"addiu", CPURegs, mem_ea>, LW_FM<9>;
+def LEA_ADDiu : EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
 
 // MADD*/MSUB*
-def MADD  : MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
-def MADDU : MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
-def MSUB  : MArithR<"msub">, MULT_FM<0x1c, 4>;
-def MSUBU : MArithR<"msubu">, MULT_FM<0x1c, 5>;
+def MADD  : MMRel, MArithR<"madd", 1>, MULT_FM<0x1c, 0>;
+def MADDU : MMRel, MArithR<"maddu", 1>, MULT_FM<0x1c, 1>;
+def MSUB  : MMRel, MArithR<"msub">, MULT_FM<0x1c, 4>;
+def MSUBU : MMRel, MArithR<"msubu">, MULT_FM<0x1c, 5>;
+
+let Predicates = [HasStdEnc, NotDSP] in {
+def PseudoMULT  : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, IIImult>;
+def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, IIImult>;
+def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>;
+def PseudoMFLO : PseudoMFLOHI<GPR32, ACC64, MipsMFLO>;
+def PseudoMTLOHI : PseudoMTLOHI<ACC64, GPR32>;
 def PseudoMADD  : MAddSubPseudo<MADD, MipsMAdd>;
 def PseudoMADDU : MAddSubPseudo<MADDU, MipsMAddu>;
 def PseudoMSUB  : MAddSubPseudo<MSUB, MipsMSub>;
 def PseudoMSUBU : MAddSubPseudo<MSUBU, MipsMSubu>;
+}
 
-def RDHWR : ReadHardware<CPURegs, HWRegsOpnd>, RDHWR_FM;
-
-def EXT : ExtBase<"ext", CPURegsOpnd>, EXT_FM<0>;
-def INS : InsBase<"ins", CPURegsOpnd>, EXT_FM<4>;
-
-/// Move Control Registers From/To CPU Registers
-def MFC0_3OP : MFC3OP<(outs CPURegsOpnd:$rt),
-                      (ins CPURegsOpnd:$rd, uimm16:$sel),
-                      "mfc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 0>;
+def PseudoSDIV : MultDivPseudo<SDIV, ACC64, GPR32Opnd, MipsDivRem, IIIdiv,
+                               0, 1, 1>;
+def PseudoUDIV : MultDivPseudo<UDIV, ACC64, GPR32Opnd, MipsDivRemU, IIIdiv,
+                               0, 1, 1>;
 
-def MTC0_3OP : MFC3OP<(outs CPURegsOpnd:$rd, uimm16:$sel),
-                      (ins CPURegsOpnd:$rt),
-                      "mtc0\t$rt, $rd, $sel">, MFC3OP_FM<0x10, 4>;
+def RDHWR : ReadHardware<GPR32Opnd, HWRegsOpnd>, RDHWR_FM;
 
-def MFC2_3OP : MFC3OP<(outs CPURegsOpnd:$rt),
-                      (ins CPURegsOpnd:$rd, uimm16:$sel),
-                      "mfc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 0>;
+def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>;
+def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
 
-def MTC2_3OP : MFC3OP<(outs CPURegsOpnd:$rd, uimm16:$sel),
-                      (ins CPURegsOpnd:$rt),
-                      "mtc2\t$rt, $rd, $sel">, MFC3OP_FM<0x12, 4>;
+/// Move Control Registers From/To CPU Registers
+def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>;
+def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>;
+def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>;
+def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>;
 
 //===----------------------------------------------------------------------===//
 // Instruction aliases
 //===----------------------------------------------------------------------===//
 def : InstAlias<"move $dst, $src",
-                (ADDu CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 1>,
-      Requires<[NotMips64]>;
-def : InstAlias<"move $dst, $src",
-                (OR CPURegsOpnd:$dst, CPURegsOpnd:$src,ZERO), 1>,
+                (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>,
       Requires<[NotMips64]>;
-def : InstAlias<"bal $offset", (BGEZAL RA, brtarget:$offset), 1>;
+def : InstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>;
 def : InstAlias<"addu $rs, $rt, $imm",
-                (ADDiu CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>;
+                (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
 def : InstAlias<"add $rs, $rt, $imm",
-                (ADDi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>;
+                (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
 def : InstAlias<"and $rs, $rt, $imm",
-                (ANDi CPURegsOpnd:$rs, CPURegsOpnd:$rt, simm16:$imm), 0>;
-def : InstAlias<"j $rs", (JR CPURegs:$rs), 0>,
-      Requires<[NotMips64]>;
-def : InstAlias<"jalr $rs", (JALR RA, CPURegs:$rs)>, Requires<[NotMips64]>;
-def : InstAlias<"jal $rs", (JALR RA, CPURegs:$rs), 0>, Requires<[NotMips64]>;
-def : InstAlias<"jal $rd,$rs", (JALR CPURegs:$rd, CPURegs:$rs), 0>,
-                 Requires<[NotMips64]>;
+                (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : InstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
+def : InstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+def : InstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+def : InstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
 def : InstAlias<"not $rt, $rs",
-                (NOR CPURegsOpnd:$rt, CPURegsOpnd:$rs, ZERO), 1>;
+                (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
 def : InstAlias<"neg $rt, $rs",
-                (SUB CPURegsOpnd:$rt, ZERO, CPURegsOpnd:$rs), 1>;
+                (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
 def : InstAlias<"negu $rt, $rs",
-                (SUBu CPURegsOpnd:$rt, ZERO, CPURegsOpnd:$rs), 1>;
+                (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
 def : InstAlias<"slt $rs, $rt, $imm",
-                (SLTi CPURegsOpnd:$rs, CPURegs:$rt, simm16:$imm), 0>;
+                (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
 def : InstAlias<"xor $rs, $rt, $imm",
-                (XORi CPURegsOpnd:$rs, CPURegsOpnd:$rt, uimm16:$imm), 1>,
-      Requires<[NotMips64]>;
+                (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
 def : InstAlias<"or $rs, $rt, $imm",
-                (ORi CPURegsOpnd:$rs, CPURegsOpnd:$rt, uimm16:$imm), 1>,
-                 Requires<[NotMips64]>;
+                (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
 def : InstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
-def : InstAlias<"mfc0 $rt, $rd",
-                (MFC0_3OP CPURegsOpnd:$rt, CPURegsOpnd:$rd, 0), 0>;
-def : InstAlias<"mtc0 $rt, $rd",
-                (MTC0_3OP CPURegsOpnd:$rd, 0, CPURegsOpnd:$rt), 0>;
-def : InstAlias<"mfc2 $rt, $rd",
-                (MFC2_3OP CPURegsOpnd:$rt, CPURegsOpnd:$rd, 0), 0>;
-def : InstAlias<"mtc2 $rt, $rd",
-                (MTC2_3OP CPURegsOpnd:$rd, 0, CPURegsOpnd:$rt), 0>;
+def : InstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : InstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
+def : InstAlias<"bnez $rs,$offset",
+                (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : InstAlias<"beqz $rs,$offset",
+                (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : InstAlias<"syscall", (SYSCALL 0), 1>;
+
+def : InstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
+def : InstAlias<"break", (BREAK 0, 0), 1>;
+def : InstAlias<"ei", (EI ZERO), 1>;
+def : InstAlias<"di", (DI ZERO), 1>;
+
+def  : InstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : InstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def : InstAlias<"sub, $rd, $rs, $imm",
+                (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
+def : InstAlias<"subu, $rd, $rs, $imm",
+                (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
 
 //===----------------------------------------------------------------------===//
 // Assembler Pseudo Instructions
@@ -1103,19 +1179,17 @@ def : InstAlias<"mtc2 $rt, $rd",
 class LoadImm32< string instr_asm, Operand Od, RegisterOperand RO> :
   MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
                      !strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadImm32Reg : LoadImm32<"li", shamt,CPURegsOpnd>;
+def LoadImm32Reg : LoadImm32<"li", uimm5, GPR32Opnd>;
 
 class LoadAddress<string instr_asm, Operand MemOpnd, RegisterOperand RO> :
   MipsAsmPseudoInst<(outs RO:$rt), (ins MemOpnd:$addr),
                      !strconcat(instr_asm, "\t$rt, $addr")> ;
-def LoadAddr32Reg : LoadAddress<"la", mem, CPURegsOpnd>;
+def LoadAddr32Reg : LoadAddress<"la", mem, GPR32Opnd>;
 
 class LoadAddressImm<string instr_asm, Operand Od, RegisterOperand RO> :
   MipsAsmPseudoInst<(outs RO:$rt), (ins Od:$imm32),
                      !strconcat(instr_asm, "\t$rt, $imm32")> ;
-def LoadAddr32Imm : LoadAddressImm<"la", shamt,CPURegsOpnd>;
-
-
+def LoadAddr32Imm : LoadAddressImm<"la", uimm5, GPR32Opnd>;
 
 //===----------------------------------------------------------------------===//
 //  Arbitrary patterns that map to one or more instructions
@@ -1141,13 +1215,13 @@ def : MipsPat<(i32 imm:$imm),
           (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
 
 // Carry MipsPatterns
-def : MipsPat<(subc CPURegs:$lhs, CPURegs:$rhs),
-              (SUBu CPURegs:$lhs, CPURegs:$rhs)>;
+def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
+              (SUBu GPR32:$lhs, GPR32:$rhs)>;
 let Predicates = [HasStdEnc, NotDSP] in {
-  def : MipsPat<(addc CPURegs:$lhs, CPURegs:$rhs),
-                (ADDu CPURegs:$lhs, CPURegs:$rhs)>;
-  def : MipsPat<(addc  CPURegs:$src, immSExt16:$imm),
-                (ADDiu CPURegs:$src, imm:$imm)>;
+  def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
+                (ADDu GPR32:$lhs, GPR32:$rhs)>;
+  def : MipsPat<(addc  GPR32:$src, immSExt16:$imm),
+                (ADDiu GPR32:$src, imm:$imm)>;
 }
 
 // Call
@@ -1155,8 +1229,8 @@ def : MipsPat<(MipsJmpLink (i32 tglobaladdr:$dst)),
               (JAL tglobaladdr:$dst)>;
 def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
               (JAL texternalsym:$dst)>;
-//def : MipsPat<(MipsJmpLink CPURegs:$dst),
-//              (JALR CPURegs:$dst)>;
+//def : MipsPat<(MipsJmpLink GPR32:$dst),
+//              (JALR GPR32:$dst)>;
 
 // Tail call
 def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
@@ -1178,58 +1252,49 @@ def : MipsPat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>;
 def : MipsPat<(MipsLo tglobaltlsaddr:$in), (ADDiu ZERO, tglobaltlsaddr:$in)>;
 def : MipsPat<(MipsLo texternalsym:$in), (ADDiu ZERO, texternalsym:$in)>;
 
-def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
-              (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
-def : MipsPat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)),
-              (ADDiu CPURegs:$hi, tblockaddress:$lo)>;
-def : MipsPat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
-              (ADDiu CPURegs:$hi, tjumptable:$lo)>;
-def : MipsPat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
-              (ADDiu CPURegs:$hi, tconstpool:$lo)>;
-def : MipsPat<(add CPURegs:$hi, (MipsLo tglobaltlsaddr:$lo)),
-              (ADDiu CPURegs:$hi, tglobaltlsaddr:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaladdr:$lo)),
+              (ADDiu GPR32:$hi, tglobaladdr:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tblockaddress:$lo)),
+              (ADDiu GPR32:$hi, tblockaddress:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tjumptable:$lo)),
+              (ADDiu GPR32:$hi, tjumptable:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tconstpool:$lo)),
+              (ADDiu GPR32:$hi, tconstpool:$lo)>;
+def : MipsPat<(add GPR32:$hi, (MipsLo tglobaltlsaddr:$lo)),
+              (ADDiu GPR32:$hi, tglobaltlsaddr:$lo)>;
 
 // gp_rel relocs
-def : MipsPat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)),
-              (ADDiu CPURegs:$gp, tglobaladdr:$in)>;
-def : MipsPat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)),
-              (ADDiu CPURegs:$gp, tconstpool:$in)>;
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tglobaladdr:$in)),
+              (ADDiu GPR32:$gp, tglobaladdr:$in)>;
+def : MipsPat<(add GPR32:$gp, (MipsGPRel tconstpool:$in)),
+              (ADDiu GPR32:$gp, tconstpool:$in)>;
 
 // wrapper_pic
 class WrapperPat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
       MipsPat<(MipsWrapper RC:$gp, node:$in),
               (ADDiuOp RC:$gp, node:$in)>;
 
-def : WrapperPat<tglobaladdr, ADDiu, CPURegs>;
-def : WrapperPat<tconstpool, ADDiu, CPURegs>;
-def : WrapperPat<texternalsym, ADDiu, CPURegs>;
-def : WrapperPat<tblockaddress, ADDiu, CPURegs>;
-def : WrapperPat<tjumptable, ADDiu, CPURegs>;
-def : WrapperPat<tglobaltlsaddr, ADDiu, CPURegs>;
+def : WrapperPat<tglobaladdr, ADDiu, GPR32>;
+def : WrapperPat<tconstpool, ADDiu, GPR32>;
+def : WrapperPat<texternalsym, ADDiu, GPR32>;
+def : WrapperPat<tblockaddress, ADDiu, GPR32>;
+def : WrapperPat<tjumptable, ADDiu, GPR32>;
+def : WrapperPat<tglobaltlsaddr, ADDiu, GPR32>;
 
 // Mips does not have "not", so we expand our way
-def : MipsPat<(not CPURegs:$in),
-              (NOR CPURegsOpnd:$in, ZERO)>;
+def : MipsPat<(not GPR32:$in),
+              (NOR GPR32Opnd:$in, ZERO)>;
 
 // extended loads
-let Predicates = [NotN64, HasStdEnc] in {
+let Predicates = [HasStdEnc] in {
   def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu addr:$src)>;
   def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
   def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
 }
-let Predicates = [IsN64, HasStdEnc] in {
-  def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu_P8 addr:$src)>;
-  def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu_P8 addr:$src)>;
-  def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_P8 addr:$src)>;
-}
 
 // peepholes
-let Predicates = [NotN64, HasStdEnc] in {
-  def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
-}
-let Predicates = [IsN64, HasStdEnc] in {
-  def : MipsPat<(store (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
-}
+let Predicates = [HasStdEnc] in
+def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
 
 // brcond patterns
 multiclass BrcondPats<RegisterClass RC, Instruction BEQOp, Instruction BNEOp,
@@ -1248,6 +1313,10 @@ def : MipsPat<(brcond (i32 (setge RC:$lhs, immSExt16:$rhs)), bb:$dst),
               (BEQ (SLTiOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
 def : MipsPat<(brcond (i32 (setuge RC:$lhs, immSExt16:$rhs)), bb:$dst),
               (BEQ (SLTiuOp RC:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+              (BEQ (SLTiOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setugt RC:$lhs, immSExt16Plus1:$rhs)), bb:$dst),
+              (BEQ (SLTiuOp RC:$lhs, (Plus1 imm:$rhs)), ZERO, bb:$dst)>;
 
 def : MipsPat<(brcond (i32 (setle RC:$lhs, RC:$rhs)), bb:$dst),
               (BEQ (SLTOp RC:$rhs, RC:$lhs), ZERO, bb:$dst)>;
@@ -1258,11 +1327,20 @@ def : MipsPat<(brcond RC:$cond, bb:$dst),
               (BNEOp RC:$cond, ZEROReg, bb:$dst)>;
 }
 
-defm : BrcondPats<CPURegs, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+defm : BrcondPats<GPR32, BEQ, BNE, SLT, SLTu, SLTi, SLTiu, ZERO>;
+
+def : MipsPat<(brcond (i32 (setlt i32:$lhs, 1)), bb:$dst),
+              (BLEZ i32:$lhs, bb:$dst)>;
+def : MipsPat<(brcond (i32 (setgt i32:$lhs, -1)), bb:$dst),
+              (BGEZ i32:$lhs, bb:$dst)>;
 
 // setcc patterns
 multiclass SeteqPats<RegisterClass RC, Instruction SLTiuOp, Instruction XOROp,
                      Instruction SLTuOp, Register ZEROReg> {
+  def : MipsPat<(seteq RC:$lhs, 0),
+                (SLTiuOp RC:$lhs, 1)>;
+  def : MipsPat<(setne RC:$lhs, 0),
+                (SLTuOp ZEROReg, RC:$lhs)>;
   def : MipsPat<(seteq RC:$lhs, RC:$rhs),
                 (SLTiuOp (XOROp RC:$lhs, RC:$rhs), 1)>;
   def : MipsPat<(setne RC:$lhs, RC:$rhs),
@@ -1298,31 +1376,22 @@ multiclass SetgeImmPats<RegisterClass RC, Instruction SLTiOp,
                 (XORi (SLTiuOp RC:$lhs, immSExt16:$rhs), 1)>;
 }
 
-defm : SeteqPats<CPURegs, SLTiu, XOR, SLTu, ZERO>;
-defm : SetlePats<CPURegs, SLT, SLTu>;
-defm : SetgtPats<CPURegs, SLT, SLTu>;
-defm : SetgePats<CPURegs, SLT, SLTu>;
-defm : SetgeImmPats<CPURegs, SLTi, SLTiu>;
+defm : SeteqPats<GPR32, SLTiu, XOR, SLTu, ZERO>;
+defm : SetlePats<GPR32, SLT, SLTu>;
+defm : SetgtPats<GPR32, SLT, SLTu>;
+defm : SetgePats<GPR32, SLT, SLTu>;
+defm : SetgeImmPats<GPR32, SLTi, SLTiu>;
 
 // bswap pattern
-def : MipsPat<(bswap CPURegs:$rt), (ROTR (WSBH CPURegs:$rt), 16)>;
-
-// mflo/hi patterns.
-def : MipsPat<(i32 (ExtractLOHI ACRegs:$ac, imm:$lohi_idx)),
-              (EXTRACT_SUBREG ACRegs:$ac, imm:$lohi_idx)>;
+def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
 
 // Load halfword/word patterns.
 let AddedComplexity = 40 in {
-  let Predicates = [NotN64, HasStdEnc] in {
+  let Predicates = [HasStdEnc] in {
     def : LoadRegImmPat<LBu, i32, zextloadi8>;
     def : LoadRegImmPat<LH, i32, sextloadi16>;
     def : LoadRegImmPat<LW, i32, load>;
   }
-  let Predicates = [IsN64, HasStdEnc] in {
-    def : LoadRegImmPat<LBu_P8, i32, zextloadi8>;
-    def : LoadRegImmPat<LH_P8, i32, sextloadi16>;
-    def : LoadRegImmPat<LW_P8, i32, load>;
-  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -1343,6 +1412,10 @@ include "Mips16InstrInfo.td"
 include "MipsDSPInstrFormats.td"
 include "MipsDSPInstrInfo.td"
 
+// MSA
+include "MipsMSAInstrFormats.td"
+include "MipsMSAInstrInfo.td"
+
 // Micromips
 include "MicroMipsInstrFormats.td"
 include "MicroMipsInstrInfo.td"
diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp
index 1b2a325..d76cb1d 100644
--- a/lib/Target/Mips/MipsJITInfo.cpp
+++ b/lib/Target/Mips/MipsJITInfo.cpp
@@ -218,9 +218,9 @@ void *MipsJITInfo::emitFunctionStub(const Function *F, void *Fn,
     Hi++;
   int Lo = (int)(EmittedAddr & 0xffff);
 
-  // lui t9, %hi(EmittedAddr)
-  // addiu t9, t9, %lo(EmittedAddr)
-  // jalr t8, t9
+  // lui $t9, %hi(EmittedAddr)
+  // addiu $t9, $t9, %lo(EmittedAddr)
+  // jalr $t8, $t9
   // nop
   if (IsLittleEndian) {
     JCE.emitWordLE(0xf << 26 | 25 << 16 | Hi);
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index bf5ad37..2efe578 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -65,7 +65,6 @@ namespace {
     static char ID;
     MipsLongBranch(TargetMachine &tm)
       : MachineFunctionPass(ID), TM(tm),
-        TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_),
         ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
         LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {}
@@ -85,7 +84,6 @@ namespace {
     void expandToLongBranch(MBBInfo &Info);
 
     const TargetMachine &TM;
-    const MipsInstrInfo *TII;
     MachineFunction *MF;
     SmallVector<MBBInfo, 16> MBBInfos;
     bool IsPIC;
@@ -172,6 +170,8 @@ void MipsLongBranch::initMBBInfo() {
   MBBInfos.clear();
   MBBInfos.resize(MF->size());
 
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
   for (unsigned I = 0, E = MBBInfos.size(); I < E; ++I) {
     MachineBasicBlock *MBB = MF->getBlockNumbered(I);
 
@@ -217,7 +217,9 @@ int64_t MipsLongBranch::computeOffset(const MachineInstr *Br) {
 // MachineBasicBlock operand MBBOpnd.
 void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
                                    DebugLoc DL, MachineBasicBlock *MBBOpnd) {
-  unsigned NewOpc = TII->GetOppositeBranchOpc(Br->getOpcode());
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+  unsigned NewOpc = TII->getOppositeBranchOpc(Br->getOpcode());
   const MCInstrDesc &NewDesc = TII->get(NewOpc);
 
   MachineInstrBuilder MIB = BuildMI(MBB, Br, DL, NewDesc);
@@ -235,6 +237,11 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
 
   MIB.addMBB(MBBOpnd);
 
+  // Bundle the instruction in the delay slot to the newly created branch
+  // and erase the original branch.
+  assert(Br->isBundledWithSucc());
+  MachineBasicBlock::instr_iterator II(Br);
+  MIBundleBuilder(&*MIB).append((++II)->removeFromBundle());
   Br->eraseFromParent();
 }
 
@@ -247,6 +254,9 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
   MachineFunction::iterator FallThroughMBB = ++MachineFunction::iterator(MBB);
   MachineBasicBlock *LongBrMBB = MF->CreateMachineBasicBlock(BB);
 
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+
   MF->insert(FallThroughMBB, LongBrMBB);
   MBB->removeSuccessor(TgtMBB);
   MBB->addSuccessor(LongBrMBB);
@@ -399,6 +409,9 @@ static void emitGPDisp(MachineFunction &F, const MipsInstrInfo *TII) {
 }
 
 bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
+  const MipsInstrInfo *TII =
+    static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
+
   if (TM.getSubtarget<MipsSubtarget>().inMips16Mode())
     return false;
   if ((TM.getRelocationModel() == Reloc::PIC_) &&
@@ -412,7 +425,7 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
   MF = &F;
   initMBBInfo();
 
-  SmallVector<MBBInfo, 16>::iterator I, E = MBBInfos.end();
+  SmallVectorImpl<MBBInfo>::iterator I, E = MBBInfos.end();
   bool EverMadeChange = false, MadeChange = true;
 
   while (MadeChange) {
@@ -424,8 +437,10 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
       if (!I->Br || I->HasLongBranch)
         continue;
 
+      int ShVal = TM.getSubtarget<MipsSubtarget>().inMicroMipsMode() ? 2 : 4;
+
       // Check if offset fits into 16-bit immediate field of branches.
-      if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4))
+      if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / ShVal))
         continue;
 
       I->HasLongBranch = true;
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index d836975..b6dfadc 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -28,8 +28,7 @@ using namespace llvm;
 MipsMCInstLower::MipsMCInstLower(MipsAsmPrinter &asmprinter)
   : AsmPrinter(asmprinter) {}
 
-void MipsMCInstLower::Initialize(Mangler *M, MCContext *C) {
-  Mang = M;
+void MipsMCInstLower::Initialize(MCContext *C) {
   Ctx = C;
 }
 
@@ -74,7 +73,7 @@ MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
     break;
 
   case MachineOperand::MO_GlobalAddress:
-    Symbol = Mang->getSymbol(MO.getGlobal());
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
     Offset += MO.getOffset();
     break;
 
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index c4a6016..4570bd9 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -19,7 +19,6 @@ namespace llvm {
   class MCOperand;
   class MachineInstr;
   class MachineFunction;
-  class Mangler;
   class MipsAsmPrinter;
 
 /// MipsMCInstLower - This class is used to lower an MachineInstr into an
@@ -27,11 +26,10 @@ namespace llvm {
 class LLVM_LIBRARY_VISIBILITY MipsMCInstLower {
   typedef MachineOperand::MachineOperandType MachineOperandType;
   MCContext *Ctx;
-  Mangler *Mang;
   MipsAsmPrinter &AsmPrinter;
 public:
   MipsMCInstLower(MipsAsmPrinter &asmprinter);
-  void Initialize(Mangler *mang, MCContext *C);
+  void Initialize(MCContext *C);
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
   MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
 
diff --git a/lib/Target/Mips/MipsMSAInstrFormats.td b/lib/Target/Mips/MipsMSAInstrFormats.td
new file mode 100644
index 0000000..875dc0b
--- /dev/null
+++ b/lib/Target/Mips/MipsMSAInstrFormats.td
@@ -0,0 +1,406 @@
+//===- MipsMSAInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasMSA : Predicate<"Subtarget.hasMSA()">,
+             AssemblerPredicate<"FeatureMSA">;
+
+class MSAInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+  let Predicates = [HasMSA];
+  let Inst{31-26} = 0b011110;
+}
+
+class MSACBranch : MSAInst {
+  let Inst{31-26} = 0b010001;
+}
+
+class MSASpecial : MSAInst {
+  let Inst{31-26} = 0b000000;
+}
+
+class PseudoMSA<dag outs, dag ins, list<dag> pattern,
+                InstrItinClass itin = IIPseudo>:
+  MipsPseudo<outs, ins, pattern, itin> {
+  let Predicates = [HasMSA];
+}
+
+class MSA_BIT_B_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<3> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-19} = 0b1110;
+  let Inst{18-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_H_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<4> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-20} = 0b110;
+  let Inst{19-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_W_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<5> m;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = 0b10;
+  let Inst{20-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_BIT_D_FMT<bits<3> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+  bits<6> m;
+
+  let Inst{25-23} = major;
+  let Inst{22} = 0b0;
+  let Inst{21-16} = m;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FILL_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2R_FMT<bits<8> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-18} = major;
+  let Inst{17-16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_2RF_FMT<bits<9> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-17} = major;
+  let Inst{16} = df;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3RF_FMT<bits<4> major, bits<1> df, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_3R_INDEX_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> rt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CFCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rd;
+  bits<5> cs;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = cs;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_CTCMSA_FMT<bits<10> major, bits<6> minor>: MSAInst {
+  bits<5> rs;
+  bits<5> cd;
+
+  let Inst{25-16} = major;
+  let Inst{15-11} = rs;
+  let Inst{10-6} = cd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_D_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-17} = 0b11100;
+  let Inst{16} = n{0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_COPY_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<4> n;
+  bits<5> ws;
+  bits<5> rd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = ws;
+  let Inst{10-6} = rd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_B_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-20} = 0b00;
+  let Inst{19-16} = n{3-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_H_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-19} = 0b100;
+  let Inst{18-16} = n{2-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_ELM_INSERT_W_FMT<bits<4> major, bits<6> minor>: MSAInst {
+  bits<6> n;
+  bits<5> rs;
+  bits<5> wd;
+
+  let Inst{25-22} = major;
+  let Inst{21-18} = 0b1100;
+  let Inst{17-16} = n{1-0};
+  let Inst{15-11} = rs;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I5_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<5> imm;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = imm;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I8_FMT<bits<2> major, bits<6> minor>: MSAInst {
+  bits<8> u8;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-24} = major;
+  let Inst{23-16} = u8;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_I10_FMT<bits<3> major, bits<2> df, bits<6> minor>: MSAInst {
+  bits<10> s10;
+  bits<5> wd;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-11} = s10;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_MI10_FMT<bits<2> df, bits<4> minor>: MSAInst {
+  bits<21> addr;
+  bits<5> wd;
+
+  let Inst{25-16} = addr{9-0};
+  let Inst{15-11} = addr{20-16};
+  let Inst{10-6} = wd;
+  let Inst{5-2} = minor;
+  let Inst{1-0} = df;
+}
+
+class MSA_VEC_FMT<bits<5> major, bits<6> minor>: MSAInst {
+  bits<5> wt;
+  bits<5> ws;
+  bits<5> wd;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-11} = ws;
+  let Inst{10-6} = wd;
+  let Inst{5-0} = minor;
+}
+
+class MSA_CBRANCH_FMT<bits<3> major, bits<2> df>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-23} = major;
+  let Inst{22-21} = df;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class MSA_CBRANCH_V_FMT<bits<5> major>: MSACBranch {
+  bits<16> offset;
+  bits<5> wt;
+
+  let Inst{25-21} = major;
+  let Inst{20-16} = wt;
+  let Inst{15-0} = offset;
+}
+
+class SPECIAL_LSA_FMT<bits<6> minor>: MSASpecial {
+  bits<5> rs;
+  bits<5> rt;
+  bits<5> rd;
+  bits<2> sa;
+
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8} = 0b000;
+  let Inst{7-6} = sa;
+  let Inst{5-0} = minor;
+}
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
new file mode 100644
index 0000000..82c51a6
--- /dev/null
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -0,0 +1,3694 @@
+//===- MipsMSAInstrInfo.td - MSA ASE instructions -*- tablegen ------------*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips MSA ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsVecCond : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVec<1>]>;
+def SDT_VSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                      SDTCisInt<1>,
+                                      SDTCisSameAs<1, 2>,
+                                      SDTCisVT<3, OtherVT>]>;
+def SDT_VFSetCC : SDTypeProfile<1, 3, [SDTCisInt<0>,
+                                       SDTCisFP<1>,
+                                       SDTCisSameAs<1, 2>,
+                                       SDTCisVT<3, OtherVT>]>;
+def SDT_VSHF : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisVec<0>,
+                                    SDTCisInt<1>, SDTCisVec<1>,
+                                    SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+def SDT_SHF : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisVT<1, i32>, SDTCisSameAs<0, 2>]>;
+def SDT_ILV : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisVec<0>,
+                                   SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>;
+
+def MipsVAllNonZero : SDNode<"MipsISD::VALL_NONZERO", SDT_MipsVecCond>;
+def MipsVAnyNonZero : SDNode<"MipsISD::VANY_NONZERO", SDT_MipsVecCond>;
+def MipsVAllZero : SDNode<"MipsISD::VALL_ZERO", SDT_MipsVecCond>;
+def MipsVAnyZero : SDNode<"MipsISD::VANY_ZERO", SDT_MipsVecCond>;
+def MipsVSMax : SDNode<"MipsISD::VSMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVSMin : SDNode<"MipsISD::VSMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMax : SDNode<"MipsISD::VUMAX", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVUMin : SDNode<"MipsISD::VUMIN", SDTIntBinOp,
+                       [SDNPCommutative, SDNPAssociative]>;
+def MipsVNOR : SDNode<"MipsISD::VNOR", SDTIntBinOp,
+                      [SDNPCommutative, SDNPAssociative]>;
+def MipsVSHF : SDNode<"MipsISD::VSHF", SDT_VSHF>;
+def MipsSHF : SDNode<"MipsISD::SHF", SDT_SHF>;
+def MipsILVEV : SDNode<"MipsISD::ILVEV", SDT_ILV>;
+def MipsILVOD : SDNode<"MipsISD::ILVOD", SDT_ILV>;
+def MipsILVL  : SDNode<"MipsISD::ILVL",  SDT_ILV>;
+def MipsILVR  : SDNode<"MipsISD::ILVR",  SDT_ILV>;
+def MipsPCKEV : SDNode<"MipsISD::PCKEV", SDT_ILV>;
+def MipsPCKOD : SDNode<"MipsISD::PCKOD", SDT_ILV>;
+
+def vsetcc : SDNode<"ISD::SETCC", SDT_VSetCC>;
+def vfsetcc : SDNode<"ISD::SETCC", SDT_VFSetCC>;
+
+def MipsVExtractSExt : SDNode<"MipsISD::VEXTRACT_SEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
+    SDTypeProfile<1, 3, [SDTCisPtrTy<2>]>, []>;
+
+// Operands
+
+def uimm2 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// The immediate of an LSA instruction needs special handling
+// as the encoded value should be subtracted by one.
+def uimm2LSAAsmOperand : AsmOperandClass {
+  let Name = "LSAImm";
+  let ParserMethod = "parseLSAImm";
+  let RenderMethod = "addImmOperands";
+}
+
+def LSAImm : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+  let EncoderMethod = "getLSAImmEncoding";
+  let DecoderMethod = "DecodeLSAImm";
+  let ParserMatchClass = uimm2LSAAsmOperand;
+}
+
+def uimm3 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def uimm4 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def uimm8 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def simm5 : Operand<i32>;
+
+def simm10 : Operand<i32>;
+
+def vsplat_uimm1 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm2 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm3 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm4 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm5 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm6 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_uimm8 : Operand<vAny> {
+  let PrintMethod = "printUnsignedImm8";
+}
+
+def vsplat_simm5 : Operand<vAny>;
+
+def vsplat_simm10 : Operand<vAny>;
+
+def immZExt2Lsa : ImmLeaf<i32, [{return isUInt<2>(Imm - 1);}]>;
+
+// Pattern fragments
+def vextract_sext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i8)>;
+def vextract_sext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i16)>;
+def vextract_sext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractSExt node:$vec, node:$idx, i32)>;
+
+def vextract_zext_i8  : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i8)>;
+def vextract_zext_i16 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i16)>;
+def vextract_zext_i32 : PatFrag<(ops node:$vec, node:$idx),
+                                (MipsVExtractZExt node:$vec, node:$idx, i32)>;
+
+def vinsert_v16i8 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v16i8 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v8i16 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v8i16 (vector_insert node:$vec, node:$val, node:$idx))>;
+def vinsert_v4i32 : PatFrag<(ops node:$vec, node:$val, node:$idx),
+    (v4i32 (vector_insert node:$vec, node:$val, node:$idx))>;
+
+class vfsetcc_type<ValueType ResTy, ValueType OpTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vfsetcc (OpTy node:$lhs), (OpTy node:$rhs), CC))>;
+
+// ISD::SETFALSE cannot occur
+def vfsetoeq_v4f32 : vfsetcc_type<v4i32, v4f32, SETOEQ>;
+def vfsetoeq_v2f64 : vfsetcc_type<v2i64, v2f64, SETOEQ>;
+def vfsetoge_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGE>;
+def vfsetoge_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGE>;
+def vfsetogt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOGT>;
+def vfsetogt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOGT>;
+def vfsetole_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLE>;
+def vfsetole_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLE>;
+def vfsetolt_v4f32 : vfsetcc_type<v4i32, v4f32, SETOLT>;
+def vfsetolt_v2f64 : vfsetcc_type<v2i64, v2f64, SETOLT>;
+def vfsetone_v4f32 : vfsetcc_type<v4i32, v4f32, SETONE>;
+def vfsetone_v2f64 : vfsetcc_type<v2i64, v2f64, SETONE>;
+def vfsetord_v4f32 : vfsetcc_type<v4i32, v4f32, SETO>;
+def vfsetord_v2f64 : vfsetcc_type<v2i64, v2f64, SETO>;
+def vfsetun_v4f32  : vfsetcc_type<v4i32, v4f32, SETUO>;
+def vfsetun_v2f64  : vfsetcc_type<v2i64, v2f64, SETUO>;
+def vfsetueq_v4f32 : vfsetcc_type<v4i32, v4f32, SETUEQ>;
+def vfsetueq_v2f64 : vfsetcc_type<v2i64, v2f64, SETUEQ>;
+def vfsetuge_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGE>;
+def vfsetuge_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGE>;
+def vfsetugt_v4f32 : vfsetcc_type<v4i32, v4f32, SETUGT>;
+def vfsetugt_v2f64 : vfsetcc_type<v2i64, v2f64, SETUGT>;
+def vfsetule_v4f32 : vfsetcc_type<v4i32, v4f32, SETULE>;
+def vfsetule_v2f64 : vfsetcc_type<v2i64, v2f64, SETULE>;
+def vfsetult_v4f32 : vfsetcc_type<v4i32, v4f32, SETULT>;
+def vfsetult_v2f64 : vfsetcc_type<v2i64, v2f64, SETULT>;
+def vfsetune_v4f32 : vfsetcc_type<v4i32, v4f32, SETUNE>;
+def vfsetune_v2f64 : vfsetcc_type<v2i64, v2f64, SETUNE>;
+// ISD::SETTRUE cannot occur
+// ISD::SETFALSE2 cannot occur
+// ISD::SETTRUE2 cannot occur
+
+class vsetcc_type<ValueType ResTy, CondCode CC> :
+  PatFrag<(ops node:$lhs, node:$rhs),
+          (ResTy (vsetcc node:$lhs, node:$rhs, CC))>;
+
+def vseteq_v16i8  : vsetcc_type<v16i8, SETEQ>;
+def vseteq_v8i16  : vsetcc_type<v8i16, SETEQ>;
+def vseteq_v4i32  : vsetcc_type<v4i32, SETEQ>;
+def vseteq_v2i64  : vsetcc_type<v2i64, SETEQ>;
+def vsetle_v16i8  : vsetcc_type<v16i8, SETLE>;
+def vsetle_v8i16  : vsetcc_type<v8i16, SETLE>;
+def vsetle_v4i32  : vsetcc_type<v4i32, SETLE>;
+def vsetle_v2i64  : vsetcc_type<v2i64, SETLE>;
+def vsetlt_v16i8  : vsetcc_type<v16i8, SETLT>;
+def vsetlt_v8i16  : vsetcc_type<v8i16, SETLT>;
+def vsetlt_v4i32  : vsetcc_type<v4i32, SETLT>;
+def vsetlt_v2i64  : vsetcc_type<v2i64, SETLT>;
+def vsetule_v16i8 : vsetcc_type<v16i8, SETULE>;
+def vsetule_v8i16 : vsetcc_type<v8i16, SETULE>;
+def vsetule_v4i32 : vsetcc_type<v4i32, SETULE>;
+def vsetule_v2i64 : vsetcc_type<v2i64, SETULE>;
+def vsetult_v16i8 : vsetcc_type<v16i8, SETULT>;
+def vsetult_v8i16 : vsetcc_type<v8i16, SETULT>;
+def vsetult_v4i32 : vsetcc_type<v4i32, SETULT>;
+def vsetult_v2i64 : vsetcc_type<v2i64, SETULT>;
+
+def vsplati8  : PatFrag<(ops node:$e0),
+                        (v16i8 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati16 : PatFrag<(ops node:$e0),
+                        (v8i16 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati32 : PatFrag<(ops node:$e0),
+                        (v4i32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplati64 : PatFrag<(ops node:$e0),
+                        (v2i64 (build_vector:$v0 node:$e0, node:$e0))>;
+def vsplatf32 : PatFrag<(ops node:$e0),
+                        (v4f32 (build_vector node:$e0, node:$e0,
+                                             node:$e0, node:$e0))>;
+def vsplatf64 : PatFrag<(ops node:$e0),
+                        (v2f64 (build_vector node:$e0, node:$e0))>;
+
+def vsplati8_elt  : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati8 node:$i), node:$v, node:$v)>;
+def vsplati16_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati16 node:$i), node:$v, node:$v)>;
+def vsplati32_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati32 node:$i), node:$v, node:$v)>;
+def vsplati64_elt : PatFrag<(ops node:$v, node:$i),
+                            (MipsVSHF (vsplati64 node:$i), node:$v, node:$v)>;
+
+class SplatPatLeaf<Operand opclass, dag frag, code pred = [{}],
+                   SDNodeXForm xform = NOOP_SDNodeXForm>
+  : PatLeaf<frag, pred, xform> {
+  Operand OpClass = opclass;
+}
+
+class SplatComplexPattern<Operand opclass, ValueType ty, int numops, string fn,
+                          list<SDNode> roots = [],
+                          list<SDNodeProperty> props = []> :
+  ComplexPattern<ty, numops, fn, roots, props> {
+  Operand OpClass = opclass;
+}
+
+def vsplati8_uimm3 : SplatComplexPattern<vsplat_uimm3, v16i8, 1,
+                                         "selectVSplatUimm3",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm4 : SplatComplexPattern<vsplat_uimm4, v16i8, 1,
+                                         "selectVSplatUimm4",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm5 : SplatComplexPattern<vsplat_uimm5, v16i8, 1,
+                                         "selectVSplatUimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_uimm8 : SplatComplexPattern<vsplat_uimm8, v16i8, 1,
+                                         "selectVSplatUimm8",
+                                         [build_vector, bitconvert]>;
+
+def vsplati8_simm5 : SplatComplexPattern<vsplat_simm5, v16i8, 1,
+                                         "selectVSplatSimm5",
+                                         [build_vector, bitconvert]>;
+
+def vsplati16_uimm3 : SplatComplexPattern<vsplat_uimm3, v8i16, 1,
+                                          "selectVSplatUimm3",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm4 : SplatComplexPattern<vsplat_uimm4, v8i16, 1,
+                                          "selectVSplatUimm4",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_uimm5 : SplatComplexPattern<vsplat_uimm5, v8i16, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati16_simm5 : SplatComplexPattern<vsplat_simm5, v8i16, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm2 : SplatComplexPattern<vsplat_uimm2, v4i32, 1,
+                                          "selectVSplatUimm2",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_uimm5 : SplatComplexPattern<vsplat_uimm5, v4i32, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati32_simm5 : SplatComplexPattern<vsplat_simm5, v4i32, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm1 : SplatComplexPattern<vsplat_uimm1, v2i64, 1,
+                                          "selectVSplatUimm1",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm5 : SplatComplexPattern<vsplat_uimm5, v2i64, 1,
+                                          "selectVSplatUimm5",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_uimm6 : SplatComplexPattern<vsplat_uimm6, v2i64, 1,
+                                          "selectVSplatUimm6",
+                                          [build_vector, bitconvert]>;
+
+def vsplati64_simm5 : SplatComplexPattern<vsplat_simm5, v2i64, 1,
+                                          "selectVSplatSimm5",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is an exact
+// power of 2
+def vsplat_uimm_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmPow2",
+                                      [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that is the bitwise
+// inverse of an exact power of 2
+def vsplat_uimm_inv_pow2 : ComplexPattern<vAny, 1, "selectVSplatUimmInvPow2",
+                                          [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of left-most bits set.
+def vsplat_maskl_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+                                            "selectVSplatMaskL",
+                                            [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with only a consecutive sequence
+// of right-most bits set.
+def vsplat_maskr_bits : SplatComplexPattern<vsplat_uimm8, vAny, 1,
+                                            "selectVSplatMaskR",
+                                            [build_vector, bitconvert]>;
+
+// Any build_vector that is a constant splat with a value that equals 1
+// FIXME: These should be a ComplexPattern but we can't use them because the
+//        ISel generator requires the uses to have a name, but providing a name
+//        causes other errors ("used in pattern but not operand list")
+def vsplat_imm_eq_1 : PatLeaf<(build_vector), [{
+  APInt Imm;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat (N, Imm) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vsplati64_imm_eq_1 : PatLeaf<(bitconvert (v4i32 (build_vector))), [{
+  APInt Imm;
+  SDNode *BV = N->getOperand(0).getNode();
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  return selectVSplat (BV, Imm) &&
+         Imm.getBitWidth() == EltTy.getSizeInBits() && Imm == 1;
+}]>;
+
+def vbclr_b : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_h : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_w : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl vsplat_imm_eq_1, node:$wt),
+                                          immAllOnesV))>;
+def vbclr_d : PatFrag<(ops node:$ws, node:$wt),
+                      (and node:$ws, (xor (shl (v2i64 vsplati64_imm_eq_1),
+                                               node:$wt),
+                                          (bitconvert (v4i32 immAllOnesV))))>;
+
+def vbneg_b : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_h : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_w : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbneg_d : PatFrag<(ops node:$ws, node:$wt),
+                      (xor node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                          node:$wt))>;
+
+def vbset_b : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_h : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_w : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl vsplat_imm_eq_1, node:$wt))>;
+def vbset_d : PatFrag<(ops node:$ws, node:$wt),
+                      (or node:$ws, (shl (v2i64 vsplati64_imm_eq_1),
+                                         node:$wt))>;
+
+def fms : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                  (fsub node:$wd, (fmul node:$ws, node:$wt))>;
+
+def muladd : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (add node:$wd, (mul node:$ws, node:$wt))>;
+
+def mulsub : PatFrag<(ops node:$wd, node:$ws, node:$wt),
+                     (sub node:$wd, (mul node:$ws, node:$wt))>;
+
+def mul_fexp2 : PatFrag<(ops node:$ws, node:$wt),
+                        (fmul node:$ws, (fexp2 node:$wt))>;
+
+// Immediates
+def immSExt5 : ImmLeaf<i32, [{return isInt<5>(Imm);}]>;
+def immSExt10: ImmLeaf<i32, [{return isInt<10>(Imm);}]>;
+
+// Instruction encoding.
+class ADD_A_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010000>;
+class ADD_A_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010000>;
+class ADD_A_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010000>;
+class ADD_A_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010000>;
+
+class ADDS_A_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010000>;
+class ADDS_A_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010000>;
+class ADDS_A_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010000>;
+class ADDS_A_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010000>;
+
+class ADDS_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010000>;
+class ADDS_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010000>;
+class ADDS_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010000>;
+class ADDS_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010000>;
+
+class ADDS_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010000>;
+class ADDS_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010000>;
+class ADDS_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010000>;
+class ADDS_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010000>;
+
+class ADDV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001110>;
+class ADDV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001110>;
+class ADDV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001110>;
+class ADDV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001110>;
+
+class ADDVI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000110>;
+class ADDVI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000110>;
+class ADDVI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000110>;
+class ADDVI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000110>;
+
+class AND_V_ENC : MSA_VEC_FMT<0b00000, 0b011110>;
+
+class ANDI_B_ENC : MSA_I8_FMT<0b00, 0b000000>;
+
+class ASUB_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010001>;
+class ASUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010001>;
+class ASUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010001>;
+class ASUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010001>;
+
+class ASUB_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010001>;
+class ASUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010001>;
+class ASUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010001>;
+class ASUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010001>;
+
+class AVE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010000>;
+class AVE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010000>;
+class AVE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010000>;
+class AVE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010000>;
+
+class AVE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010000>;
+class AVE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010000>;
+class AVE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010000>;
+class AVE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010000>;
+
+class AVER_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010000>;
+class AVER_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010000>;
+class AVER_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010000>;
+class AVER_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010000>;
+
+class AVER_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010000>;
+class AVER_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010000>;
+class AVER_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010000>;
+class AVER_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010000>;
+
+class BCLR_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001101>;
+class BCLR_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001101>;
+class BCLR_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001101>;
+class BCLR_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001101>;
+
+class BCLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001001>;
+class BCLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001001>;
+class BCLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001001>;
+class BCLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001001>;
+
+class BINSL_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001101>;
+class BINSL_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001101>;
+class BINSL_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001101>;
+class BINSL_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001101>;
+
+class BINSLI_B_ENC : MSA_BIT_B_FMT<0b110, 0b001001>;
+class BINSLI_H_ENC : MSA_BIT_H_FMT<0b110, 0b001001>;
+class BINSLI_W_ENC : MSA_BIT_W_FMT<0b110, 0b001001>;
+class BINSLI_D_ENC : MSA_BIT_D_FMT<0b110, 0b001001>;
+
+class BINSR_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001101>;
+class BINSR_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001101>;
+class BINSR_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001101>;
+class BINSR_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001101>;
+
+class BINSRI_B_ENC : MSA_BIT_B_FMT<0b111, 0b001001>;
+class BINSRI_H_ENC : MSA_BIT_H_FMT<0b111, 0b001001>;
+class BINSRI_W_ENC : MSA_BIT_W_FMT<0b111, 0b001001>;
+class BINSRI_D_ENC : MSA_BIT_D_FMT<0b111, 0b001001>;
+
+class BMNZ_V_ENC : MSA_VEC_FMT<0b00100, 0b011110>;
+
+class BMNZI_B_ENC : MSA_I8_FMT<0b00, 0b000001>;
+
+class BMZ_V_ENC : MSA_VEC_FMT<0b00101, 0b011110>;
+
+class BMZI_B_ENC : MSA_I8_FMT<0b01, 0b000001>;
+
+class BNEG_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001101>;
+class BNEG_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001101>;
+class BNEG_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001101>;
+class BNEG_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001101>;
+
+class BNEGI_B_ENC : MSA_BIT_B_FMT<0b101, 0b001001>;
+class BNEGI_H_ENC : MSA_BIT_H_FMT<0b101, 0b001001>;
+class BNEGI_W_ENC : MSA_BIT_W_FMT<0b101, 0b001001>;
+class BNEGI_D_ENC : MSA_BIT_D_FMT<0b101, 0b001001>;
+
+class BNZ_B_ENC : MSA_CBRANCH_FMT<0b111, 0b00>;
+class BNZ_H_ENC : MSA_CBRANCH_FMT<0b111, 0b01>;
+class BNZ_W_ENC : MSA_CBRANCH_FMT<0b111, 0b10>;
+class BNZ_D_ENC : MSA_CBRANCH_FMT<0b111, 0b11>;
+
+class BNZ_V_ENC : MSA_CBRANCH_V_FMT<0b01111>;
+
+class BSEL_V_ENC : MSA_VEC_FMT<0b00110, 0b011110>;
+
+class BSELI_B_ENC : MSA_I8_FMT<0b10, 0b000001>;
+
+class BSET_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001101>;
+class BSET_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001101>;
+class BSET_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001101>;
+class BSET_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001101>;
+
+class BSETI_B_ENC : MSA_BIT_B_FMT<0b100, 0b001001>;
+class BSETI_H_ENC : MSA_BIT_H_FMT<0b100, 0b001001>;
+class BSETI_W_ENC : MSA_BIT_W_FMT<0b100, 0b001001>;
+class BSETI_D_ENC : MSA_BIT_D_FMT<0b100, 0b001001>;
+
+class BZ_B_ENC : MSA_CBRANCH_FMT<0b110, 0b00>;
+class BZ_H_ENC : MSA_CBRANCH_FMT<0b110, 0b01>;
+class BZ_W_ENC : MSA_CBRANCH_FMT<0b110, 0b10>;
+class BZ_D_ENC : MSA_CBRANCH_FMT<0b110, 0b11>;
+
+class BZ_V_ENC : MSA_CBRANCH_V_FMT<0b01011>;
+
+class CEQ_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001111>;
+class CEQ_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001111>;
+class CEQ_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001111>;
+class CEQ_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001111>;
+
+class CEQI_B_ENC : MSA_I5_FMT<0b000, 0b00, 0b000111>;
+class CEQI_H_ENC : MSA_I5_FMT<0b000, 0b01, 0b000111>;
+class CEQI_W_ENC : MSA_I5_FMT<0b000, 0b10, 0b000111>;
+class CEQI_D_ENC : MSA_I5_FMT<0b000, 0b11, 0b000111>;
+
+class CFCMSA_ENC : MSA_ELM_CFCMSA_FMT<0b0001111110, 0b011001>;
+
+class CLE_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001111>;
+class CLE_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001111>;
+class CLE_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001111>;
+class CLE_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001111>;
+
+class CLE_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001111>;
+class CLE_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001111>;
+class CLE_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001111>;
+class CLE_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001111>;
+
+class CLEI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000111>;
+class CLEI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000111>;
+class CLEI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000111>;
+class CLEI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000111>;
+
+class CLEI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000111>;
+class CLEI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000111>;
+class CLEI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000111>;
+class CLEI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000111>;
+
+class CLT_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001111>;
+class CLT_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001111>;
+class CLT_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001111>;
+class CLT_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001111>;
+
+class CLT_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001111>;
+class CLT_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001111>;
+class CLT_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001111>;
+class CLT_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001111>;
+
+class CLTI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000111>;
+class CLTI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000111>;
+class CLTI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000111>;
+class CLTI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000111>;
+
+class CLTI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000111>;
+class CLTI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000111>;
+class CLTI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000111>;
+class CLTI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000111>;
+
+class COPY_S_B_ENC : MSA_ELM_COPY_B_FMT<0b0010, 0b011001>;
+class COPY_S_H_ENC : MSA_ELM_COPY_H_FMT<0b0010, 0b011001>;
+class COPY_S_W_ENC : MSA_ELM_COPY_W_FMT<0b0010, 0b011001>;
+
+class COPY_U_B_ENC : MSA_ELM_COPY_B_FMT<0b0011, 0b011001>;
+class COPY_U_H_ENC : MSA_ELM_COPY_H_FMT<0b0011, 0b011001>;
+class COPY_U_W_ENC : MSA_ELM_COPY_W_FMT<0b0011, 0b011001>;
+
+class CTCMSA_ENC : MSA_ELM_CTCMSA_FMT<0b0000111110, 0b011001>;
+
+class DIV_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010010>;
+class DIV_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010010>;
+class DIV_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010010>;
+class DIV_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010010>;
+
+class DIV_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010010>;
+class DIV_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010010>;
+class DIV_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010010>;
+class DIV_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010010>;
+
+class DOTP_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010011>;
+class DOTP_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010011>;
+class DOTP_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010011>;
+
+class DOTP_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010011>;
+class DOTP_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010011>;
+class DOTP_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010011>;
+
+class DPADD_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010011>;
+class DPADD_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010011>;
+class DPADD_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010011>;
+
+class DPADD_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010011>;
+class DPADD_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010011>;
+class DPADD_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010011>;
+
+class DPSUB_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010011>;
+class DPSUB_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010011>;
+class DPSUB_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010011>;
+
+class DPSUB_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010011>;
+class DPSUB_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010011>;
+class DPSUB_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010011>;
+
+class FADD_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011011>;
+class FADD_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011011>;
+
+class FCAF_W_ENC : MSA_3RF_FMT<0b0000, 0b0, 0b011010>;
+class FCAF_D_ENC : MSA_3RF_FMT<0b0000, 0b1, 0b011010>;
+
+class FCEQ_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011010>;
+class FCEQ_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011010>;
+
+class FCLASS_W_ENC : MSA_2RF_FMT<0b110010000, 0b0, 0b011110>;
+class FCLASS_D_ENC : MSA_2RF_FMT<0b110010000, 0b1, 0b011110>;
+
+class FCLE_W_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011010>;
+class FCLE_D_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011010>;
+
+class FCLT_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011010>;
+class FCLT_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011010>;
+
+class FCNE_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011100>;
+class FCNE_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011100>;
+
+class FCOR_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011100>;
+class FCOR_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011100>;
+
+class FCUEQ_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011010>;
+class FCUEQ_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011010>;
+
+class FCULE_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011010>;
+class FCULE_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011010>;
+
+class FCULT_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011010>;
+class FCULT_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011010>;
+
+class FCUN_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011010>;
+class FCUN_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011010>;
+
+class FCUNE_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011100>;
+class FCUNE_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011100>;
+
+class FDIV_W_ENC : MSA_3RF_FMT<0b0011, 0b0, 0b011011>;
+class FDIV_D_ENC : MSA_3RF_FMT<0b0011, 0b1, 0b011011>;
+
+class FEXDO_H_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011011>;
+class FEXDO_W_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011011>;
+
+class FEXP2_W_ENC : MSA_3RF_FMT<0b0111, 0b0, 0b011011>;
+class FEXP2_D_ENC : MSA_3RF_FMT<0b0111, 0b1, 0b011011>;
+
+class FEXUPL_W_ENC : MSA_2RF_FMT<0b110011000, 0b0, 0b011110>;
+class FEXUPL_D_ENC : MSA_2RF_FMT<0b110011000, 0b1, 0b011110>;
+
+class FEXUPR_W_ENC : MSA_2RF_FMT<0b110011001, 0b0, 0b011110>;
+class FEXUPR_D_ENC : MSA_2RF_FMT<0b110011001, 0b1, 0b011110>;
+
+class FFINT_S_W_ENC : MSA_2RF_FMT<0b110011110, 0b0, 0b011110>;
+class FFINT_S_D_ENC : MSA_2RF_FMT<0b110011110, 0b1, 0b011110>;
+
+class FFINT_U_W_ENC : MSA_2RF_FMT<0b110011111, 0b0, 0b011110>;
+class FFINT_U_D_ENC : MSA_2RF_FMT<0b110011111, 0b1, 0b011110>;
+
+class FFQL_W_ENC : MSA_2RF_FMT<0b110011010, 0b0, 0b011110>;
+class FFQL_D_ENC : MSA_2RF_FMT<0b110011010, 0b1, 0b011110>;
+
+class FFQR_W_ENC : MSA_2RF_FMT<0b110011011, 0b0, 0b011110>;
+class FFQR_D_ENC : MSA_2RF_FMT<0b110011011, 0b1, 0b011110>;
+
+class FILL_B_ENC : MSA_2R_FILL_FMT<0b11000000, 0b00, 0b011110>;
+class FILL_H_ENC : MSA_2R_FILL_FMT<0b11000000, 0b01, 0b011110>;
+class FILL_W_ENC : MSA_2R_FILL_FMT<0b11000000, 0b10, 0b011110>;
+
+class FLOG2_W_ENC : MSA_2RF_FMT<0b110010111, 0b0, 0b011110>;
+class FLOG2_D_ENC : MSA_2RF_FMT<0b110010111, 0b1, 0b011110>;
+
+class FMADD_W_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011011>;
+class FMADD_D_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011011>;
+
+class FMAX_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011011>;
+class FMAX_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011011>;
+
+class FMAX_A_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011011>;
+class FMAX_A_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011011>;
+
+class FMIN_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011011>;
+class FMIN_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011011>;
+
+class FMIN_A_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011011>;
+class FMIN_A_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011011>;
+
+class FMSUB_W_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011011>;
+class FMSUB_D_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011011>;
+
+class FMUL_W_ENC : MSA_3RF_FMT<0b0010, 0b0, 0b011011>;
+class FMUL_D_ENC : MSA_3RF_FMT<0b0010, 0b1, 0b011011>;
+
+class FRINT_W_ENC : MSA_2RF_FMT<0b110010110, 0b0, 0b011110>;
+class FRINT_D_ENC : MSA_2RF_FMT<0b110010110, 0b1, 0b011110>;
+
+class FRCP_W_ENC : MSA_2RF_FMT<0b110010101, 0b0, 0b011110>;
+class FRCP_D_ENC : MSA_2RF_FMT<0b110010101, 0b1, 0b011110>;
+
+class FRSQRT_W_ENC : MSA_2RF_FMT<0b110010100, 0b0, 0b011110>;
+class FRSQRT_D_ENC : MSA_2RF_FMT<0b110010100, 0b1, 0b011110>;
+
+class FSAF_W_ENC : MSA_3RF_FMT<0b1000, 0b0, 0b011010>;
+class FSAF_D_ENC : MSA_3RF_FMT<0b1000, 0b1, 0b011010>;
+
+class FSEQ_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011010>;
+class FSEQ_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011010>;
+
+class FSLE_W_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011010>;
+class FSLE_D_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011010>;
+
+class FSLT_W_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011010>;
+class FSLT_D_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011010>;
+
+class FSNE_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011100>;
+class FSNE_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011100>;
+
+class FSOR_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011100>;
+class FSOR_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011100>;
+
+class FSQRT_W_ENC : MSA_2RF_FMT<0b110010011, 0b0, 0b011110>;
+class FSQRT_D_ENC : MSA_2RF_FMT<0b110010011, 0b1, 0b011110>;
+
+class FSUB_W_ENC : MSA_3RF_FMT<0b0001, 0b0, 0b011011>;
+class FSUB_D_ENC : MSA_3RF_FMT<0b0001, 0b1, 0b011011>;
+
+class FSUEQ_W_ENC : MSA_3RF_FMT<0b1011, 0b0, 0b011010>;
+class FSUEQ_D_ENC : MSA_3RF_FMT<0b1011, 0b1, 0b011010>;
+
+class FSULE_W_ENC : MSA_3RF_FMT<0b1111, 0b0, 0b011010>;
+class FSULE_D_ENC : MSA_3RF_FMT<0b1111, 0b1, 0b011010>;
+
+class FSULT_W_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011010>;
+class FSULT_D_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011010>;
+
+class FSUN_W_ENC : MSA_3RF_FMT<0b1001, 0b0, 0b011010>;
+class FSUN_D_ENC : MSA_3RF_FMT<0b1001, 0b1, 0b011010>;
+
+class FSUNE_W_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011100>;
+class FSUNE_D_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011100>;
+
+class FTINT_S_W_ENC : MSA_2RF_FMT<0b110011100, 0b0, 0b011110>;
+class FTINT_S_D_ENC : MSA_2RF_FMT<0b110011100, 0b1, 0b011110>;
+
+class FTINT_U_W_ENC : MSA_2RF_FMT<0b110011101, 0b0, 0b011110>;
+class FTINT_U_D_ENC : MSA_2RF_FMT<0b110011101, 0b1, 0b011110>;
+
+class FTQ_H_ENC : MSA_3RF_FMT<0b1010, 0b0, 0b011011>;
+class FTQ_W_ENC : MSA_3RF_FMT<0b1010, 0b1, 0b011011>;
+
+class FTRUNC_S_W_ENC : MSA_2RF_FMT<0b110010001, 0b0, 0b011110>;
+class FTRUNC_S_D_ENC : MSA_2RF_FMT<0b110010001, 0b1, 0b011110>;
+
+class FTRUNC_U_W_ENC : MSA_2RF_FMT<0b110010010, 0b0, 0b011110>;
+class FTRUNC_U_D_ENC : MSA_2RF_FMT<0b110010010, 0b1, 0b011110>;
+
+class HADD_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010101>;
+class HADD_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010101>;
+class HADD_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010101>;
+
+class HADD_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010101>;
+class HADD_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010101>;
+class HADD_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010101>;
+
+class HSUB_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010101>;
+class HSUB_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010101>;
+class HSUB_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010101>;
+
+class HSUB_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010101>;
+class HSUB_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010101>;
+class HSUB_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010101>;
+
+class ILVEV_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010100>;
+class ILVEV_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010100>;
+class ILVEV_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010100>;
+class ILVEV_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010100>;
+
+class ILVL_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b010100>;
+class ILVL_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b010100>;
+class ILVL_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b010100>;
+class ILVL_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b010100>;
+
+class ILVOD_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010100>;
+class ILVOD_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010100>;
+class ILVOD_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010100>;
+class ILVOD_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010100>;
+
+class ILVR_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b010100>;
+class ILVR_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b010100>;
+class ILVR_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b010100>;
+class ILVR_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b010100>;
+
+class INSERT_B_ENC : MSA_ELM_INSERT_B_FMT<0b0100, 0b011001>;
+class INSERT_H_ENC : MSA_ELM_INSERT_H_FMT<0b0100, 0b011001>;
+class INSERT_W_ENC : MSA_ELM_INSERT_W_FMT<0b0100, 0b011001>;
+
+class INSVE_B_ENC : MSA_ELM_B_FMT<0b0101, 0b011001>;
+class INSVE_H_ENC : MSA_ELM_H_FMT<0b0101, 0b011001>;
+class INSVE_W_ENC : MSA_ELM_W_FMT<0b0101, 0b011001>;
+class INSVE_D_ENC : MSA_ELM_D_FMT<0b0101, 0b011001>;
+
+class LD_B_ENC   : MSA_MI10_FMT<0b00, 0b1000>;
+class LD_H_ENC   : MSA_MI10_FMT<0b01, 0b1000>;
+class LD_W_ENC   : MSA_MI10_FMT<0b10, 0b1000>;
+class LD_D_ENC   : MSA_MI10_FMT<0b11, 0b1000>;
+
+class LDI_B_ENC  : MSA_I10_FMT<0b110, 0b00, 0b000111>;
+class LDI_H_ENC  : MSA_I10_FMT<0b110, 0b01, 0b000111>;
+class LDI_W_ENC  : MSA_I10_FMT<0b110, 0b10, 0b000111>;
+class LDI_D_ENC  : MSA_I10_FMT<0b110, 0b11, 0b000111>;
+
+class LSA_ENC : SPECIAL_LSA_FMT<0b000101>;
+
+class MADD_Q_H_ENC : MSA_3RF_FMT<0b0101, 0b0, 0b011100>;
+class MADD_Q_W_ENC : MSA_3RF_FMT<0b0101, 0b1, 0b011100>;
+
+class MADDR_Q_H_ENC : MSA_3RF_FMT<0b1101, 0b0, 0b011100>;
+class MADDR_Q_W_ENC : MSA_3RF_FMT<0b1101, 0b1, 0b011100>;
+
+class MADDV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010010>;
+class MADDV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010010>;
+class MADDV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010010>;
+class MADDV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010010>;
+
+class MAX_A_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b001110>;
+class MAX_A_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b001110>;
+class MAX_A_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b001110>;
+class MAX_A_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b001110>;
+
+class MAX_S_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001110>;
+class MAX_S_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001110>;
+class MAX_S_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001110>;
+class MAX_S_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001110>;
+
+class MAX_U_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b001110>;
+class MAX_U_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b001110>;
+class MAX_U_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b001110>;
+class MAX_U_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b001110>;
+
+class MAXI_S_B_ENC : MSA_I5_FMT<0b010, 0b00, 0b000110>;
+class MAXI_S_H_ENC : MSA_I5_FMT<0b010, 0b01, 0b000110>;
+class MAXI_S_W_ENC : MSA_I5_FMT<0b010, 0b10, 0b000110>;
+class MAXI_S_D_ENC : MSA_I5_FMT<0b010, 0b11, 0b000110>;
+
+class MAXI_U_B_ENC : MSA_I5_FMT<0b011, 0b00, 0b000110>;
+class MAXI_U_H_ENC : MSA_I5_FMT<0b011, 0b01, 0b000110>;
+class MAXI_U_W_ENC : MSA_I5_FMT<0b011, 0b10, 0b000110>;
+class MAXI_U_D_ENC : MSA_I5_FMT<0b011, 0b11, 0b000110>;
+
+class MIN_A_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b001110>;
+class MIN_A_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b001110>;
+class MIN_A_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b001110>;
+class MIN_A_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b001110>;
+
+class MIN_S_B_ENC : MSA_3R_FMT<0b100, 0b00, 0b001110>;
+class MIN_S_H_ENC : MSA_3R_FMT<0b100, 0b01, 0b001110>;
+class MIN_S_W_ENC : MSA_3R_FMT<0b100, 0b10, 0b001110>;
+class MIN_S_D_ENC : MSA_3R_FMT<0b100, 0b11, 0b001110>;
+
+class MIN_U_B_ENC : MSA_3R_FMT<0b101, 0b00, 0b001110>;
+class MIN_U_H_ENC : MSA_3R_FMT<0b101, 0b01, 0b001110>;
+class MIN_U_W_ENC : MSA_3R_FMT<0b101, 0b10, 0b001110>;
+class MIN_U_D_ENC : MSA_3R_FMT<0b101, 0b11, 0b001110>;
+
+class MINI_S_B_ENC : MSA_I5_FMT<0b100, 0b00, 0b000110>;
+class MINI_S_H_ENC : MSA_I5_FMT<0b100, 0b01, 0b000110>;
+class MINI_S_W_ENC : MSA_I5_FMT<0b100, 0b10, 0b000110>;
+class MINI_S_D_ENC : MSA_I5_FMT<0b100, 0b11, 0b000110>;
+
+class MINI_U_B_ENC : MSA_I5_FMT<0b101, 0b00, 0b000110>;
+class MINI_U_H_ENC : MSA_I5_FMT<0b101, 0b01, 0b000110>;
+class MINI_U_W_ENC : MSA_I5_FMT<0b101, 0b10, 0b000110>;
+class MINI_U_D_ENC : MSA_I5_FMT<0b101, 0b11, 0b000110>;
+
+class MOD_S_B_ENC : MSA_3R_FMT<0b110, 0b00, 0b010010>;
+class MOD_S_H_ENC : MSA_3R_FMT<0b110, 0b01, 0b010010>;
+class MOD_S_W_ENC : MSA_3R_FMT<0b110, 0b10, 0b010010>;
+class MOD_S_D_ENC : MSA_3R_FMT<0b110, 0b11, 0b010010>;
+
+class MOD_U_B_ENC : MSA_3R_FMT<0b111, 0b00, 0b010010>;
+class MOD_U_H_ENC : MSA_3R_FMT<0b111, 0b01, 0b010010>;
+class MOD_U_W_ENC : MSA_3R_FMT<0b111, 0b10, 0b010010>;
+class MOD_U_D_ENC : MSA_3R_FMT<0b111, 0b11, 0b010010>;
+
+class MOVE_V_ENC : MSA_ELM_FMT<0b0010111110, 0b011001>;
+
+class MSUB_Q_H_ENC : MSA_3RF_FMT<0b0110, 0b0, 0b011100>;
+class MSUB_Q_W_ENC : MSA_3RF_FMT<0b0110, 0b1, 0b011100>;
+
+class MSUBR_Q_H_ENC : MSA_3RF_FMT<0b1110, 0b0, 0b011100>;
+class MSUBR_Q_W_ENC : MSA_3RF_FMT<0b1110, 0b1, 0b011100>;
+
+class MSUBV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010010>;
+class MSUBV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010010>;
+class MSUBV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010010>;
+class MSUBV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010010>;
+
+class MUL_Q_H_ENC : MSA_3RF_FMT<0b0100, 0b0, 0b011100>;
+class MUL_Q_W_ENC : MSA_3RF_FMT<0b0100, 0b1, 0b011100>;
+
+class MULR_Q_H_ENC : MSA_3RF_FMT<0b1100, 0b0, 0b011100>;
+class MULR_Q_W_ENC : MSA_3RF_FMT<0b1100, 0b1, 0b011100>;
+
+class MULV_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010010>;
+class MULV_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010010>;
+class MULV_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010010>;
+class MULV_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010010>;
+
+class NLOC_B_ENC : MSA_2R_FMT<0b11000010, 0b00, 0b011110>;
+class NLOC_H_ENC : MSA_2R_FMT<0b11000010, 0b01, 0b011110>;
+class NLOC_W_ENC : MSA_2R_FMT<0b11000010, 0b10, 0b011110>;
+class NLOC_D_ENC : MSA_2R_FMT<0b11000010, 0b11, 0b011110>;
+
+class NLZC_B_ENC : MSA_2R_FMT<0b11000011, 0b00, 0b011110>;
+class NLZC_H_ENC : MSA_2R_FMT<0b11000011, 0b01, 0b011110>;
+class NLZC_W_ENC : MSA_2R_FMT<0b11000011, 0b10, 0b011110>;
+class NLZC_D_ENC : MSA_2R_FMT<0b11000011, 0b11, 0b011110>;
+
+class NOR_V_ENC : MSA_VEC_FMT<0b00010, 0b011110>;
+
+class NORI_B_ENC : MSA_I8_FMT<0b10, 0b000000>;
+
+class OR_V_ENC : MSA_VEC_FMT<0b00001, 0b011110>;
+
+class ORI_B_ENC  : MSA_I8_FMT<0b01, 0b000000>;
+
+class PCKEV_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010100>;
+class PCKEV_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010100>;
+class PCKEV_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010100>;
+class PCKEV_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010100>;
+
+class PCKOD_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010100>;
+class PCKOD_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010100>;
+class PCKOD_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010100>;
+class PCKOD_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010100>;
+
+class PCNT_B_ENC : MSA_2R_FMT<0b11000001, 0b00, 0b011110>;
+class PCNT_H_ENC : MSA_2R_FMT<0b11000001, 0b01, 0b011110>;
+class PCNT_W_ENC : MSA_2R_FMT<0b11000001, 0b10, 0b011110>;
+class PCNT_D_ENC : MSA_2R_FMT<0b11000001, 0b11, 0b011110>;
+
+class SAT_S_B_ENC : MSA_BIT_B_FMT<0b000, 0b001010>;
+class SAT_S_H_ENC : MSA_BIT_H_FMT<0b000, 0b001010>;
+class SAT_S_W_ENC : MSA_BIT_W_FMT<0b000, 0b001010>;
+class SAT_S_D_ENC : MSA_BIT_D_FMT<0b000, 0b001010>;
+
+class SAT_U_B_ENC : MSA_BIT_B_FMT<0b001, 0b001010>;
+class SAT_U_H_ENC : MSA_BIT_H_FMT<0b001, 0b001010>;
+class SAT_U_W_ENC : MSA_BIT_W_FMT<0b001, 0b001010>;
+class SAT_U_D_ENC : MSA_BIT_D_FMT<0b001, 0b001010>;
+
+class SHF_B_ENC  : MSA_I8_FMT<0b00, 0b000010>;
+class SHF_H_ENC  : MSA_I8_FMT<0b01, 0b000010>;
+class SHF_W_ENC  : MSA_I8_FMT<0b10, 0b000010>;
+
+class SLD_B_ENC : MSA_3R_INDEX_FMT<0b000, 0b00, 0b010100>;
+class SLD_H_ENC : MSA_3R_INDEX_FMT<0b000, 0b01, 0b010100>;
+class SLD_W_ENC : MSA_3R_INDEX_FMT<0b000, 0b10, 0b010100>;
+class SLD_D_ENC : MSA_3R_INDEX_FMT<0b000, 0b11, 0b010100>;
+
+class SLDI_B_ENC : MSA_ELM_B_FMT<0b0000, 0b011001>;
+class SLDI_H_ENC : MSA_ELM_H_FMT<0b0000, 0b011001>;
+class SLDI_W_ENC : MSA_ELM_W_FMT<0b0000, 0b011001>;
+class SLDI_D_ENC : MSA_ELM_D_FMT<0b0000, 0b011001>;
+
+class SLL_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b001101>;
+class SLL_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b001101>;
+class SLL_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b001101>;
+class SLL_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b001101>;
+
+class SLLI_B_ENC : MSA_BIT_B_FMT<0b000, 0b001001>;
+class SLLI_H_ENC : MSA_BIT_H_FMT<0b000, 0b001001>;
+class SLLI_W_ENC : MSA_BIT_W_FMT<0b000, 0b001001>;
+class SLLI_D_ENC : MSA_BIT_D_FMT<0b000, 0b001001>;
+
+class SPLAT_B_ENC : MSA_3R_INDEX_FMT<0b001, 0b00, 0b010100>;
+class SPLAT_H_ENC : MSA_3R_INDEX_FMT<0b001, 0b01, 0b010100>;
+class SPLAT_W_ENC : MSA_3R_INDEX_FMT<0b001, 0b10, 0b010100>;
+class SPLAT_D_ENC : MSA_3R_INDEX_FMT<0b001, 0b11, 0b010100>;
+
+class SPLATI_B_ENC : MSA_ELM_B_FMT<0b0001, 0b011001>;
+class SPLATI_H_ENC : MSA_ELM_H_FMT<0b0001, 0b011001>;
+class SPLATI_W_ENC : MSA_ELM_W_FMT<0b0001, 0b011001>;
+class SPLATI_D_ENC : MSA_ELM_D_FMT<0b0001, 0b011001>;
+
+class SRA_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001101>;
+class SRA_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001101>;
+class SRA_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001101>;
+class SRA_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001101>;
+
+class SRAI_B_ENC : MSA_BIT_B_FMT<0b001, 0b001001>;
+class SRAI_H_ENC : MSA_BIT_H_FMT<0b001, 0b001001>;
+class SRAI_W_ENC : MSA_BIT_W_FMT<0b001, 0b001001>;
+class SRAI_D_ENC : MSA_BIT_D_FMT<0b001, 0b001001>;
+
+class SRAR_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010101>;
+class SRAR_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010101>;
+class SRAR_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010101>;
+class SRAR_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010101>;
+
+class SRARI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001010>;
+class SRARI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001010>;
+class SRARI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001010>;
+class SRARI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001010>;
+
+class SRL_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b001101>;
+class SRL_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b001101>;
+class SRL_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b001101>;
+class SRL_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b001101>;
+
+class SRLI_B_ENC : MSA_BIT_B_FMT<0b010, 0b001001>;
+class SRLI_H_ENC : MSA_BIT_H_FMT<0b010, 0b001001>;
+class SRLI_W_ENC : MSA_BIT_W_FMT<0b010, 0b001001>;
+class SRLI_D_ENC : MSA_BIT_D_FMT<0b010, 0b001001>;
+
+class SRLR_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010101>;
+class SRLR_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010101>;
+class SRLR_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010101>;
+class SRLR_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010101>;
+
+class SRLRI_B_ENC : MSA_BIT_B_FMT<0b011, 0b001010>;
+class SRLRI_H_ENC : MSA_BIT_H_FMT<0b011, 0b001010>;
+class SRLRI_W_ENC : MSA_BIT_W_FMT<0b011, 0b001010>;
+class SRLRI_D_ENC : MSA_BIT_D_FMT<0b011, 0b001010>;
+
+class ST_B_ENC   : MSA_MI10_FMT<0b00, 0b1001>;
+class ST_H_ENC   : MSA_MI10_FMT<0b01, 0b1001>;
+class ST_W_ENC   : MSA_MI10_FMT<0b10, 0b1001>;
+class ST_D_ENC   : MSA_MI10_FMT<0b11, 0b1001>;
+
+class SUBS_S_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010001>;
+class SUBS_S_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010001>;
+class SUBS_S_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010001>;
+class SUBS_S_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010001>;
+
+class SUBS_U_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b010001>;
+class SUBS_U_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b010001>;
+class SUBS_U_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b010001>;
+class SUBS_U_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b010001>;
+
+class SUBSUS_U_B_ENC : MSA_3R_FMT<0b010, 0b00, 0b010001>;
+class SUBSUS_U_H_ENC : MSA_3R_FMT<0b010, 0b01, 0b010001>;
+class SUBSUS_U_W_ENC : MSA_3R_FMT<0b010, 0b10, 0b010001>;
+class SUBSUS_U_D_ENC : MSA_3R_FMT<0b010, 0b11, 0b010001>;
+
+class SUBSUU_S_B_ENC : MSA_3R_FMT<0b011, 0b00, 0b010001>;
+class SUBSUU_S_H_ENC : MSA_3R_FMT<0b011, 0b01, 0b010001>;
+class SUBSUU_S_W_ENC : MSA_3R_FMT<0b011, 0b10, 0b010001>;
+class SUBSUU_S_D_ENC : MSA_3R_FMT<0b011, 0b11, 0b010001>;
+
+class SUBV_B_ENC : MSA_3R_FMT<0b001, 0b00, 0b001110>;
+class SUBV_H_ENC : MSA_3R_FMT<0b001, 0b01, 0b001110>;
+class SUBV_W_ENC : MSA_3R_FMT<0b001, 0b10, 0b001110>;
+class SUBV_D_ENC : MSA_3R_FMT<0b001, 0b11, 0b001110>;
+
+class SUBVI_B_ENC : MSA_I5_FMT<0b001, 0b00, 0b000110>;
+class SUBVI_H_ENC : MSA_I5_FMT<0b001, 0b01, 0b000110>;
+class SUBVI_W_ENC : MSA_I5_FMT<0b001, 0b10, 0b000110>;
+class SUBVI_D_ENC : MSA_I5_FMT<0b001, 0b11, 0b000110>;
+
+class VSHF_B_ENC : MSA_3R_FMT<0b000, 0b00, 0b010101>;
+class VSHF_H_ENC : MSA_3R_FMT<0b000, 0b01, 0b010101>;
+class VSHF_W_ENC : MSA_3R_FMT<0b000, 0b10, 0b010101>;
+class VSHF_D_ENC : MSA_3R_FMT<0b000, 0b11, 0b010101>;
+
+class XOR_V_ENC : MSA_VEC_FMT<0b00011, 0b011110>;
+
+class XORI_B_ENC : MSA_I8_FMT<0b11, 0b000000>;
+
+// Instruction desc.
+class MSA_BIT_B_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm3:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_H_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm4:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm5:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_D_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          ComplexPattern Imm, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, vsplat_uimm6:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, Imm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_B_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm3:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt3:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_H_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_W_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm5:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt5:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed soon.
+class MSA_BIT_D_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm6:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt6:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_BIT_BINSXI_DESC_BASE<string instr_asm, ValueType Ty,
+                               ComplexPattern Mask, RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, vsplat_uimm8:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (vselect (Ty Mask:$m), (Ty ROWD:$wd_in),
+                                               ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_BIT_BINSLI_DESC_BASE<string instr_asm, ValueType Ty,
+                               RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskl_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_BINSRI_DESC_BASE<string instr_asm, ValueType Ty,
+                               RegisterOperand ROWD,
+                               RegisterOperand ROWS = ROWD,
+                               InstrItinClass itin = NoItinerary> :
+  MSA_BIT_BINSXI_DESC_BASE<instr_asm, Ty, vsplat_maskr_bits, ROWD, ROWS, itin>;
+
+class MSA_BIT_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                              SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$m);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $m");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$m))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         ValueType VecTy, RegisterOperand ROD,
+                         RegisterOperand ROWS,
+                         InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROD:$rd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $ws[$n]");
+  list<dag> Pattern = [(set ROD:$rd, (OpNode (VecTy ROWS:$ws), immZExt4:$n))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm4:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt4:$n))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_COPY_PSEUDO_BASE<SDPatternOperator OpNode, ValueType VecTy,
+                           RegisterClass RCD, RegisterClass RCWS> :
+      MipsPseudo<(outs RCD:$wd), (ins RCWS:$ws, uimm4:$n),
+                 [(set RCD:$wd, (OpNode (VecTy RCWS:$ws), immZExt4:$n))]> {
+  bit usesCustomInserter = 1;
+}
+
+class MSA_I5_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $imm");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$imm))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       SplatComplexPattern SplatImm, RegisterOperand ROWD,
+                       RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, SplatImm:$u8))];
+  InstrItinClass Itinerary = itin;
+}
+
+// This class is deprecated and will be removed in the next few patches
+class MSA_I8_X_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                         RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                         InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, immZExt8:$u8))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I8_SHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                           RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, uimm8:$u8);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $u8");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsSHF immZExt8:$u8, ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_I10_LDI_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins vsplat_simm10:$s10);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $s10");
+  // LDI is matched using custom matching code in MipsSEISelDAGToDAG.cpp
+  list<dag> Pattern = [];
+  bit hasSideEffects = 0;
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_DESC_BASE<string instr_asm, ValueType VT,
+                            SDPatternOperator OpNode, RegisterOperand ROWD,
+                            RegisterOperand ROS = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROS:$rs);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (VT (OpNode ROS:$rs)))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_2R_FILL_PSEUDO_BASE<ValueType VT, SDPatternOperator OpNode,
+                              RegisterClass RCWD, RegisterClass RCWS = RCWD> :
+      MipsPseudo<(outs RCWD:$wd), (ins RCWS:$fs),
+                 [(set RCWD:$wd, (OpNode RCWS:$fs))]> {
+  let usesCustomInserter = 1;
+}
+
+class MSA_2RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                       RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                       RegisterOperand ROWT = ROWD,
+                       InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_BINSX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             RegisterOperand ROWT = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in, ROWS:$ws,
+                                              ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SPLAT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                             RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                             InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_VSHF_DESC_BASE<string instr_asm, RegisterOperand ROWD,
+                            RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF ROWD:$wd_in, ROWS:$ws,
+                                                ROWT:$wt))];
+  string Constraints = "$wd = $wd_in";
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_SLD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, GPR32:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$rt]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, GPR32:$rt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_3R_4R_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd,
+                       (OpNode ROWD:$wd_in, ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_3RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> :
+  MSA_3R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_3RF_4RF_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                            RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                            RegisterOperand ROWT = ROWD,
+                            InstrItinClass itin = NoItinerary> :
+  MSA_3R_4R_DESC_BASE<instr_asm, OpNode, ROWD, ROWS, ROWT, itin>;
+
+class MSA_CBRANCH_DESC_BASE<string instr_asm, RegisterOperand ROWD> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wt, brtarget:$offset);
+  string AsmString = !strconcat(instr_asm, "\t$wt, $offset");
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = IIBranch;
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 1;
+  list<Register> Defs = [AT];
+}
+
+class MSA_INSERT_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                           RegisterOperand ROWD, RegisterOperand ROS,
+                           InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, ROS:$rs, uimm6:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $rs");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+                                              ROS:$rs,
+                                              immZExt6:$n))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                             RegisterOperand ROWD, RegisterOperand ROFS> :
+      MipsPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, uimm6:$n, ROFS:$fs),
+                 [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+                                        immZExt6:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                          RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                          InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWD:$wd_in, uimm6:$n, ROWS:$ws);
+  string AsmString = !strconcat(instr_asm, "\t$wd[$n], $ws[0]");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWD:$wd_in,
+                                              immZExt6:$n,
+                                              ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+  string Constraints = "$wd = $wd_in";
+}
+
+class MSA_VEC_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                        RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
+                        RegisterOperand ROWT = ROWD,
+                        InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, ROWT:$wt);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws, $wt");
+  list<dag> Pattern = [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_ELM_SPLAT_DESC_BASE<string instr_asm, SplatComplexPattern SplatImm,
+                              RegisterOperand ROWD,
+                              RegisterOperand ROWS = ROWD,
+                              InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins ROWS:$ws, SplatImm.OpClass:$n);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $ws[$n]");
+  list<dag> Pattern = [(set ROWD:$wd, (MipsVSHF SplatImm:$n, ROWS:$ws,
+                                                ROWS:$ws))];
+  InstrItinClass Itinerary = itin;
+}
+
+class MSA_VEC_PSEUDO_BASE<SDPatternOperator OpNode, RegisterOperand ROWD,
+                          RegisterOperand ROWS = ROWD,
+                          RegisterOperand ROWT = ROWD> :
+      MipsPseudo<(outs ROWD:$wd), (ins ROWS:$ws, ROWT:$wt),
+                 [(set ROWD:$wd, (OpNode ROWS:$ws, ROWT:$wt))]>;
+
+class ADD_A_B_DESC : MSA_3R_DESC_BASE<"add_a.b", int_mips_add_a_b, MSA128BOpnd>,
+                     IsCommutable;
+class ADD_A_H_DESC : MSA_3R_DESC_BASE<"add_a.h", int_mips_add_a_h, MSA128HOpnd>,
+                     IsCommutable;
+class ADD_A_W_DESC : MSA_3R_DESC_BASE<"add_a.w", int_mips_add_a_w, MSA128WOpnd>,
+                     IsCommutable;
+class ADD_A_D_DESC : MSA_3R_DESC_BASE<"add_a.d", int_mips_add_a_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class ADDS_A_B_DESC : MSA_3R_DESC_BASE<"adds_a.b", int_mips_adds_a_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_A_H_DESC : MSA_3R_DESC_BASE<"adds_a.h", int_mips_adds_a_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_A_W_DESC : MSA_3R_DESC_BASE<"adds_a.w", int_mips_adds_a_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_A_D_DESC : MSA_3R_DESC_BASE<"adds_a.d", int_mips_adds_a_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_S_B_DESC : MSA_3R_DESC_BASE<"adds_s.b", int_mips_adds_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_S_H_DESC : MSA_3R_DESC_BASE<"adds_s.h", int_mips_adds_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_S_W_DESC : MSA_3R_DESC_BASE<"adds_s.w", int_mips_adds_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_S_D_DESC : MSA_3R_DESC_BASE<"adds_s.d", int_mips_adds_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDS_U_B_DESC : MSA_3R_DESC_BASE<"adds_u.b", int_mips_adds_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class ADDS_U_H_DESC : MSA_3R_DESC_BASE<"adds_u.h", int_mips_adds_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class ADDS_U_W_DESC : MSA_3R_DESC_BASE<"adds_u.w", int_mips_adds_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class ADDS_U_D_DESC : MSA_3R_DESC_BASE<"adds_u.d", int_mips_adds_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class ADDV_B_DESC : MSA_3R_DESC_BASE<"addv.b", add, MSA128BOpnd>, IsCommutable;
+class ADDV_H_DESC : MSA_3R_DESC_BASE<"addv.h", add, MSA128HOpnd>, IsCommutable;
+class ADDV_W_DESC : MSA_3R_DESC_BASE<"addv.w", add, MSA128WOpnd>, IsCommutable;
+class ADDV_D_DESC : MSA_3R_DESC_BASE<"addv.d", add, MSA128DOpnd>, IsCommutable;
+
+class ADDVI_B_DESC : MSA_I5_DESC_BASE<"addvi.b", add, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class ADDVI_H_DESC : MSA_I5_DESC_BASE<"addvi.h", add, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class ADDVI_W_DESC : MSA_I5_DESC_BASE<"addvi.w", add, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class ADDVI_D_DESC : MSA_I5_DESC_BASE<"addvi.d", add, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class AND_V_DESC : MSA_VEC_DESC_BASE<"and.v", and, MSA128BOpnd>;
+class AND_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128HOpnd>;
+class AND_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128WOpnd>;
+class AND_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<and, MSA128DOpnd>;
+
+class ANDI_B_DESC : MSA_I8_DESC_BASE<"andi.b", and, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class ASUB_S_B_DESC : MSA_3R_DESC_BASE<"asub_s.b", int_mips_asub_s_b,
+                                       MSA128BOpnd>;
+class ASUB_S_H_DESC : MSA_3R_DESC_BASE<"asub_s.h", int_mips_asub_s_h,
+                                       MSA128HOpnd>;
+class ASUB_S_W_DESC : MSA_3R_DESC_BASE<"asub_s.w", int_mips_asub_s_w,
+                                       MSA128WOpnd>;
+class ASUB_S_D_DESC : MSA_3R_DESC_BASE<"asub_s.d", int_mips_asub_s_d,
+                                       MSA128DOpnd>;
+
+class ASUB_U_B_DESC : MSA_3R_DESC_BASE<"asub_u.b", int_mips_asub_u_b,
+                                       MSA128BOpnd>;
+class ASUB_U_H_DESC : MSA_3R_DESC_BASE<"asub_u.h", int_mips_asub_u_h,
+                                       MSA128HOpnd>;
+class ASUB_U_W_DESC : MSA_3R_DESC_BASE<"asub_u.w", int_mips_asub_u_w,
+                                       MSA128WOpnd>;
+class ASUB_U_D_DESC : MSA_3R_DESC_BASE<"asub_u.d", int_mips_asub_u_d,
+                                       MSA128DOpnd>;
+
+class AVE_S_B_DESC : MSA_3R_DESC_BASE<"ave_s.b", int_mips_ave_s_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_S_H_DESC : MSA_3R_DESC_BASE<"ave_s.h", int_mips_ave_s_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_S_W_DESC : MSA_3R_DESC_BASE<"ave_s.w", int_mips_ave_s_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_S_D_DESC : MSA_3R_DESC_BASE<"ave_s.d", int_mips_ave_s_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVE_U_B_DESC : MSA_3R_DESC_BASE<"ave_u.b", int_mips_ave_u_b, MSA128BOpnd>,
+                     IsCommutable;
+class AVE_U_H_DESC : MSA_3R_DESC_BASE<"ave_u.h", int_mips_ave_u_h, MSA128HOpnd>,
+                     IsCommutable;
+class AVE_U_W_DESC : MSA_3R_DESC_BASE<"ave_u.w", int_mips_ave_u_w, MSA128WOpnd>,
+                     IsCommutable;
+class AVE_U_D_DESC : MSA_3R_DESC_BASE<"ave_u.d", int_mips_ave_u_d, MSA128DOpnd>,
+                     IsCommutable;
+
+class AVER_S_B_DESC : MSA_3R_DESC_BASE<"aver_s.b", int_mips_aver_s_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_S_H_DESC : MSA_3R_DESC_BASE<"aver_s.h", int_mips_aver_s_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_S_W_DESC : MSA_3R_DESC_BASE<"aver_s.w", int_mips_aver_s_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_S_D_DESC : MSA_3R_DESC_BASE<"aver_s.d", int_mips_aver_s_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class AVER_U_B_DESC : MSA_3R_DESC_BASE<"aver_u.b", int_mips_aver_u_b,
+                                       MSA128BOpnd>, IsCommutable;
+class AVER_U_H_DESC : MSA_3R_DESC_BASE<"aver_u.h", int_mips_aver_u_h,
+                                       MSA128HOpnd>, IsCommutable;
+class AVER_U_W_DESC : MSA_3R_DESC_BASE<"aver_u.w", int_mips_aver_u_w,
+                                       MSA128WOpnd>, IsCommutable;
+class AVER_U_D_DESC : MSA_3R_DESC_BASE<"aver_u.d", int_mips_aver_u_d,
+                                       MSA128DOpnd>, IsCommutable;
+
+class BCLR_B_DESC : MSA_3R_DESC_BASE<"bclr.b", vbclr_b, MSA128BOpnd>;
+class BCLR_H_DESC : MSA_3R_DESC_BASE<"bclr.h", vbclr_h, MSA128HOpnd>;
+class BCLR_W_DESC : MSA_3R_DESC_BASE<"bclr.w", vbclr_w, MSA128WOpnd>;
+class BCLR_D_DESC : MSA_3R_DESC_BASE<"bclr.d", vbclr_d, MSA128DOpnd>;
+
+class BCLRI_B_DESC : MSA_BIT_B_DESC_BASE<"bclri.b", and, vsplat_uimm_inv_pow2,
+                                         MSA128BOpnd>;
+class BCLRI_H_DESC : MSA_BIT_H_DESC_BASE<"bclri.h", and, vsplat_uimm_inv_pow2,
+                                         MSA128HOpnd>;
+class BCLRI_W_DESC : MSA_BIT_W_DESC_BASE<"bclri.w", and, vsplat_uimm_inv_pow2,
+                                         MSA128WOpnd>;
+class BCLRI_D_DESC : MSA_BIT_D_DESC_BASE<"bclri.d", and, vsplat_uimm_inv_pow2,
+                                         MSA128DOpnd>;
+
+class BINSL_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.b", int_mips_binsl_b,
+                                            MSA128BOpnd>;
+class BINSL_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.h", int_mips_binsl_h,
+                                            MSA128HOpnd>;
+class BINSL_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.w", int_mips_binsl_w,
+                                            MSA128WOpnd>;
+class BINSL_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsl.d", int_mips_binsl_d,
+                                            MSA128DOpnd>;
+
+class BINSLI_B_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.b", v16i8, MSA128BOpnd>;
+class BINSLI_H_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.h", v8i16, MSA128HOpnd>;
+class BINSLI_W_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.w", v4i32, MSA128WOpnd>;
+class BINSLI_D_DESC : MSA_BIT_BINSLI_DESC_BASE<"binsli.d", v2i64, MSA128DOpnd>;
+
+class BINSR_B_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.b", int_mips_binsr_b,
+                                            MSA128BOpnd>;
+class BINSR_H_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.h", int_mips_binsr_h,
+                                            MSA128HOpnd>;
+class BINSR_W_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.w", int_mips_binsr_w,
+                                            MSA128WOpnd>;
+class BINSR_D_DESC : MSA_3R_BINSX_DESC_BASE<"binsr.d", int_mips_binsr_d,
+                                            MSA128DOpnd>;
+
+class BINSRI_B_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.b", v16i8, MSA128BOpnd>;
+class BINSRI_H_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.h", v8i16, MSA128HOpnd>;
+class BINSRI_W_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.w", v4i32, MSA128WOpnd>;
+class BINSRI_D_DESC : MSA_BIT_BINSRI_DESC_BASE<"binsri.d", v2i64, MSA128DOpnd>;
+
+class BMNZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmnz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMNZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmnzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$ws,
+                                                      MSA128BOpnd:$wd_in))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZ_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bmz.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wt,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BMZI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bmzi.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect vsplati8_uimm8:$u8,
+                                                      MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BNEG_B_DESC : MSA_3R_DESC_BASE<"bneg.b", vbneg_b, MSA128BOpnd>;
+class BNEG_H_DESC : MSA_3R_DESC_BASE<"bneg.h", vbneg_h, MSA128HOpnd>;
+class BNEG_W_DESC : MSA_3R_DESC_BASE<"bneg.w", vbneg_w, MSA128WOpnd>;
+class BNEG_D_DESC : MSA_3R_DESC_BASE<"bneg.d", vbneg_d, MSA128DOpnd>;
+
+class BNEGI_B_DESC : MSA_BIT_B_DESC_BASE<"bnegi.b", xor, vsplat_uimm_pow2, MSA128BOpnd>;
+class BNEGI_H_DESC : MSA_BIT_H_DESC_BASE<"bnegi.h", xor, vsplat_uimm_pow2, MSA128HOpnd>;
+class BNEGI_W_DESC : MSA_BIT_W_DESC_BASE<"bnegi.w", xor, vsplat_uimm_pow2, MSA128WOpnd>;
+class BNEGI_D_DESC : MSA_BIT_D_DESC_BASE<"bnegi.d", xor, vsplat_uimm_pow2, MSA128DOpnd>;
+
+class BNZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bnz.b", MSA128BOpnd>;
+class BNZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bnz.h", MSA128HOpnd>;
+class BNZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bnz.w", MSA128WOpnd>;
+class BNZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bnz.d", MSA128DOpnd>;
+
+class BNZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bnz.v", MSA128BOpnd>;
+
+class BSEL_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                       MSA128BOpnd:$wt);
+  string AsmString = "bsel.v\t$wd, $ws, $wt";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd,
+                        (vselect MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                                                  MSA128BOpnd:$wt))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSELI_B_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$wd_in, MSA128BOpnd:$ws,
+                           vsplat_uimm8:$u8);
+  string AsmString = "bseli.b\t$wd, $ws, $u8";
+  list<dag> Pattern = [(set MSA128BOpnd:$wd, (vselect MSA128BOpnd:$wd_in,
+                                                      MSA128BOpnd:$ws,
+                                                      vsplati8_uimm8:$u8))];
+  InstrItinClass Itinerary = NoItinerary;
+  string Constraints = "$wd = $wd_in";
+}
+
+class BSET_B_DESC : MSA_3R_DESC_BASE<"bset.b", vbset_b, MSA128BOpnd>;
+class BSET_H_DESC : MSA_3R_DESC_BASE<"bset.h", vbset_h, MSA128HOpnd>;
+class BSET_W_DESC : MSA_3R_DESC_BASE<"bset.w", vbset_w, MSA128WOpnd>;
+class BSET_D_DESC : MSA_3R_DESC_BASE<"bset.d", vbset_d, MSA128DOpnd>;
+
+class BSETI_B_DESC : MSA_BIT_B_DESC_BASE<"bseti.b", or, vsplat_uimm_pow2,
+                                         MSA128BOpnd>;
+class BSETI_H_DESC : MSA_BIT_H_DESC_BASE<"bseti.h", or, vsplat_uimm_pow2,
+                                         MSA128HOpnd>;
+class BSETI_W_DESC : MSA_BIT_W_DESC_BASE<"bseti.w", or, vsplat_uimm_pow2,
+                                         MSA128WOpnd>;
+class BSETI_D_DESC : MSA_BIT_D_DESC_BASE<"bseti.d", or, vsplat_uimm_pow2,
+                                         MSA128DOpnd>;
+
+class BZ_B_DESC : MSA_CBRANCH_DESC_BASE<"bz.b", MSA128BOpnd>;
+class BZ_H_DESC : MSA_CBRANCH_DESC_BASE<"bz.h", MSA128HOpnd>;
+class BZ_W_DESC : MSA_CBRANCH_DESC_BASE<"bz.w", MSA128WOpnd>;
+class BZ_D_DESC : MSA_CBRANCH_DESC_BASE<"bz.d", MSA128DOpnd>;
+
+class BZ_V_DESC : MSA_CBRANCH_DESC_BASE<"bz.v", MSA128BOpnd>;
+
+class CEQ_B_DESC : MSA_3R_DESC_BASE<"ceq.b", vseteq_v16i8, MSA128BOpnd>,
+                   IsCommutable;
+class CEQ_H_DESC : MSA_3R_DESC_BASE<"ceq.h", vseteq_v8i16, MSA128HOpnd>,
+                   IsCommutable;
+class CEQ_W_DESC : MSA_3R_DESC_BASE<"ceq.w", vseteq_v4i32, MSA128WOpnd>,
+                   IsCommutable;
+class CEQ_D_DESC : MSA_3R_DESC_BASE<"ceq.d", vseteq_v2i64, MSA128DOpnd>,
+                   IsCommutable;
+
+class CEQI_B_DESC : MSA_I5_DESC_BASE<"ceqi.b", vseteq_v16i8, vsplati8_simm5,
+                                     MSA128BOpnd>;
+class CEQI_H_DESC : MSA_I5_DESC_BASE<"ceqi.h", vseteq_v8i16, vsplati16_simm5,
+                                     MSA128HOpnd>;
+class CEQI_W_DESC : MSA_I5_DESC_BASE<"ceqi.w", vseteq_v4i32, vsplati32_simm5,
+                                     MSA128WOpnd>;
+class CEQI_D_DESC : MSA_I5_DESC_BASE<"ceqi.d", vseteq_v2i64, vsplati64_simm5,
+                                     MSA128DOpnd>;
+
+class CFCMSA_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins MSA128CROpnd:$cs);
+  string AsmString = "cfcmsa\t$rd, $cs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class CLE_S_B_DESC : MSA_3R_DESC_BASE<"cle_s.b", vsetle_v16i8, MSA128BOpnd>;
+class CLE_S_H_DESC : MSA_3R_DESC_BASE<"cle_s.h", vsetle_v8i16, MSA128HOpnd>;
+class CLE_S_W_DESC : MSA_3R_DESC_BASE<"cle_s.w", vsetle_v4i32, MSA128WOpnd>;
+class CLE_S_D_DESC : MSA_3R_DESC_BASE<"cle_s.d", vsetle_v2i64, MSA128DOpnd>;
+
+class CLE_U_B_DESC : MSA_3R_DESC_BASE<"cle_u.b", vsetule_v16i8, MSA128BOpnd>;
+class CLE_U_H_DESC : MSA_3R_DESC_BASE<"cle_u.h", vsetule_v8i16, MSA128HOpnd>;
+class CLE_U_W_DESC : MSA_3R_DESC_BASE<"cle_u.w", vsetule_v4i32, MSA128WOpnd>;
+class CLE_U_D_DESC : MSA_3R_DESC_BASE<"cle_u.d", vsetule_v2i64, MSA128DOpnd>;
+
+class CLEI_S_B_DESC : MSA_I5_DESC_BASE<"clei_s.b", vsetle_v16i8,
+                                       vsplati8_simm5,  MSA128BOpnd>;
+class CLEI_S_H_DESC : MSA_I5_DESC_BASE<"clei_s.h", vsetle_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLEI_S_W_DESC : MSA_I5_DESC_BASE<"clei_s.w", vsetle_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLEI_S_D_DESC : MSA_I5_DESC_BASE<"clei_s.d", vsetle_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLEI_U_B_DESC : MSA_I5_DESC_BASE<"clei_u.b", vsetule_v16i8,
+                                       vsplati8_uimm5,  MSA128BOpnd>;
+class CLEI_U_H_DESC : MSA_I5_DESC_BASE<"clei_u.h", vsetule_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLEI_U_W_DESC : MSA_I5_DESC_BASE<"clei_u.w", vsetule_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLEI_U_D_DESC : MSA_I5_DESC_BASE<"clei_u.d", vsetule_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class CLT_S_B_DESC : MSA_3R_DESC_BASE<"clt_s.b", vsetlt_v16i8, MSA128BOpnd>;
+class CLT_S_H_DESC : MSA_3R_DESC_BASE<"clt_s.h", vsetlt_v8i16, MSA128HOpnd>;
+class CLT_S_W_DESC : MSA_3R_DESC_BASE<"clt_s.w", vsetlt_v4i32, MSA128WOpnd>;
+class CLT_S_D_DESC : MSA_3R_DESC_BASE<"clt_s.d", vsetlt_v2i64, MSA128DOpnd>;
+
+class CLT_U_B_DESC : MSA_3R_DESC_BASE<"clt_u.b", vsetult_v16i8, MSA128BOpnd>;
+class CLT_U_H_DESC : MSA_3R_DESC_BASE<"clt_u.h", vsetult_v8i16, MSA128HOpnd>;
+class CLT_U_W_DESC : MSA_3R_DESC_BASE<"clt_u.w", vsetult_v4i32, MSA128WOpnd>;
+class CLT_U_D_DESC : MSA_3R_DESC_BASE<"clt_u.d", vsetult_v2i64, MSA128DOpnd>;
+
+class CLTI_S_B_DESC : MSA_I5_DESC_BASE<"clti_s.b", vsetlt_v16i8,
+                                       vsplati8_simm5, MSA128BOpnd>;
+class CLTI_S_H_DESC : MSA_I5_DESC_BASE<"clti_s.h", vsetlt_v8i16,
+                                       vsplati16_simm5, MSA128HOpnd>;
+class CLTI_S_W_DESC : MSA_I5_DESC_BASE<"clti_s.w", vsetlt_v4i32,
+                                       vsplati32_simm5, MSA128WOpnd>;
+class CLTI_S_D_DESC : MSA_I5_DESC_BASE<"clti_s.d", vsetlt_v2i64,
+                                       vsplati64_simm5, MSA128DOpnd>;
+
+class CLTI_U_B_DESC : MSA_I5_DESC_BASE<"clti_u.b", vsetult_v16i8,
+                                       vsplati8_uimm5, MSA128BOpnd>;
+class CLTI_U_H_DESC : MSA_I5_DESC_BASE<"clti_u.h", vsetult_v8i16,
+                                       vsplati16_uimm5, MSA128HOpnd>;
+class CLTI_U_W_DESC : MSA_I5_DESC_BASE<"clti_u.w", vsetult_v4i32,
+                                       vsplati32_uimm5, MSA128WOpnd>;
+class CLTI_U_D_DESC : MSA_I5_DESC_BASE<"clti_u.d", vsetult_v2i64,
+                                       vsplati64_uimm5, MSA128DOpnd>;
+
+class COPY_S_B_DESC : MSA_COPY_DESC_BASE<"copy_s.b", vextract_sext_i8,  v16i8,
+                                         GPR32Opnd, MSA128BOpnd>;
+class COPY_S_H_DESC : MSA_COPY_DESC_BASE<"copy_s.h", vextract_sext_i16, v8i16,
+                                         GPR32Opnd, MSA128HOpnd>;
+class COPY_S_W_DESC : MSA_COPY_DESC_BASE<"copy_s.w", vextract_sext_i32, v4i32,
+                                         GPR32Opnd, MSA128WOpnd>;
+
+class COPY_U_B_DESC : MSA_COPY_DESC_BASE<"copy_u.b", vextract_zext_i8,  v16i8,
+                                         GPR32Opnd, MSA128BOpnd>;
+class COPY_U_H_DESC : MSA_COPY_DESC_BASE<"copy_u.h", vextract_zext_i16, v8i16,
+                                         GPR32Opnd, MSA128HOpnd>;
+class COPY_U_W_DESC : MSA_COPY_DESC_BASE<"copy_u.w", vextract_zext_i32, v4i32,
+                                         GPR32Opnd, MSA128WOpnd>;
+
+class COPY_FW_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v4f32, FGR32,
+                                                 MSA128W>;
+class COPY_FD_PSEUDO_DESC : MSA_COPY_PSEUDO_BASE<vector_extract, v2f64, FGR64,
+                                                 MSA128D>;
+
+class CTCMSA_DESC {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins MSA128CROpnd:$cd, GPR32Opnd:$rs);
+  string AsmString = "ctcmsa\t$cd, $rs";
+  InstrItinClass Itinerary = NoItinerary;
+  bit hasSideEffects = 1;
+}
+
+class DIV_S_B_DESC : MSA_3R_DESC_BASE<"div_s.b", sdiv, MSA128BOpnd>;
+class DIV_S_H_DESC : MSA_3R_DESC_BASE<"div_s.h", sdiv, MSA128HOpnd>;
+class DIV_S_W_DESC : MSA_3R_DESC_BASE<"div_s.w", sdiv, MSA128WOpnd>;
+class DIV_S_D_DESC : MSA_3R_DESC_BASE<"div_s.d", sdiv, MSA128DOpnd>;
+
+class DIV_U_B_DESC : MSA_3R_DESC_BASE<"div_u.b", udiv, MSA128BOpnd>;
+class DIV_U_H_DESC : MSA_3R_DESC_BASE<"div_u.h", udiv, MSA128HOpnd>;
+class DIV_U_W_DESC : MSA_3R_DESC_BASE<"div_u.w", udiv, MSA128WOpnd>;
+class DIV_U_D_DESC : MSA_3R_DESC_BASE<"div_u.d", udiv, MSA128DOpnd>;
+
+class DOTP_S_H_DESC : MSA_3R_DESC_BASE<"dotp_s.h", int_mips_dotp_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_S_W_DESC : MSA_3R_DESC_BASE<"dotp_s.w", int_mips_dotp_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_S_D_DESC : MSA_3R_DESC_BASE<"dotp_s.d", int_mips_dotp_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DOTP_U_H_DESC : MSA_3R_DESC_BASE<"dotp_u.h", int_mips_dotp_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>,
+                      IsCommutable;
+class DOTP_U_W_DESC : MSA_3R_DESC_BASE<"dotp_u.w", int_mips_dotp_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>,
+                      IsCommutable;
+class DOTP_U_D_DESC : MSA_3R_DESC_BASE<"dotp_u.d", int_mips_dotp_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>,
+                      IsCommutable;
+
+class DPADD_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.h", int_mips_dpadd_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.w", int_mips_dpadd_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_s.d", int_mips_dpadd_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPADD_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.h", int_mips_dpadd_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>, IsCommutable;
+class DPADD_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.w", int_mips_dpadd_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>, IsCommutable;
+class DPADD_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpadd_u.d", int_mips_dpadd_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>, IsCommutable;
+
+class DPSUB_S_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.h", int_mips_dpsub_s_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_S_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.w", int_mips_dpsub_s_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_S_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_s.d", int_mips_dpsub_s_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class DPSUB_U_H_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.h", int_mips_dpsub_u_h,
+                                           MSA128HOpnd, MSA128BOpnd,
+                                           MSA128BOpnd>;
+class DPSUB_U_W_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.w", int_mips_dpsub_u_w,
+                                           MSA128WOpnd, MSA128HOpnd,
+                                           MSA128HOpnd>;
+class DPSUB_U_D_DESC : MSA_3R_4R_DESC_BASE<"dpsub_u.d", int_mips_dpsub_u_d,
+                                           MSA128DOpnd, MSA128WOpnd,
+                                           MSA128WOpnd>;
+
+class FADD_W_DESC : MSA_3RF_DESC_BASE<"fadd.w", fadd, MSA128WOpnd>,
+                    IsCommutable;
+class FADD_D_DESC : MSA_3RF_DESC_BASE<"fadd.d", fadd, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCAF_W_DESC : MSA_3RF_DESC_BASE<"fcaf.w", int_mips_fcaf_w, MSA128WOpnd>,
+                    IsCommutable;
+class FCAF_D_DESC : MSA_3RF_DESC_BASE<"fcaf.d", int_mips_fcaf_d, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCEQ_W_DESC : MSA_3RF_DESC_BASE<"fceq.w", vfsetoeq_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCEQ_D_DESC : MSA_3RF_DESC_BASE<"fceq.d", vfsetoeq_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCLASS_W_DESC : MSA_2RF_DESC_BASE<"fclass.w", int_mips_fclass_w,
+                                        MSA128WOpnd>;
+class FCLASS_D_DESC : MSA_2RF_DESC_BASE<"fclass.d", int_mips_fclass_d,
+                                        MSA128DOpnd>;
+
+class FCLE_W_DESC : MSA_3RF_DESC_BASE<"fcle.w", vfsetole_v4f32, MSA128WOpnd>;
+class FCLE_D_DESC : MSA_3RF_DESC_BASE<"fcle.d", vfsetole_v2f64, MSA128DOpnd>;
+
+class FCLT_W_DESC : MSA_3RF_DESC_BASE<"fclt.w", vfsetolt_v4f32, MSA128WOpnd>;
+class FCLT_D_DESC : MSA_3RF_DESC_BASE<"fclt.d", vfsetolt_v2f64, MSA128DOpnd>;
+
+class FCNE_W_DESC : MSA_3RF_DESC_BASE<"fcne.w", vfsetone_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCNE_D_DESC : MSA_3RF_DESC_BASE<"fcne.d", vfsetone_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCOR_W_DESC : MSA_3RF_DESC_BASE<"fcor.w", vfsetord_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCOR_D_DESC : MSA_3RF_DESC_BASE<"fcor.d", vfsetord_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUEQ_W_DESC : MSA_3RF_DESC_BASE<"fcueq.w", vfsetueq_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUEQ_D_DESC : MSA_3RF_DESC_BASE<"fcueq.d", vfsetueq_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULE_W_DESC : MSA_3RF_DESC_BASE<"fcule.w", vfsetule_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULE_D_DESC : MSA_3RF_DESC_BASE<"fcule.d", vfsetule_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCULT_W_DESC : MSA_3RF_DESC_BASE<"fcult.w", vfsetult_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCULT_D_DESC : MSA_3RF_DESC_BASE<"fcult.d", vfsetult_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FCUN_W_DESC : MSA_3RF_DESC_BASE<"fcun.w", vfsetun_v4f32, MSA128WOpnd>,
+                    IsCommutable;
+class FCUN_D_DESC : MSA_3RF_DESC_BASE<"fcun.d", vfsetun_v2f64, MSA128DOpnd>,
+                    IsCommutable;
+
+class FCUNE_W_DESC : MSA_3RF_DESC_BASE<"fcune.w", vfsetune_v4f32, MSA128WOpnd>,
+                     IsCommutable;
+class FCUNE_D_DESC : MSA_3RF_DESC_BASE<"fcune.d", vfsetune_v2f64, MSA128DOpnd>,
+                     IsCommutable;
+
+class FDIV_W_DESC : MSA_3RF_DESC_BASE<"fdiv.w", fdiv, MSA128WOpnd>;
+class FDIV_D_DESC : MSA_3RF_DESC_BASE<"fdiv.d", fdiv, MSA128DOpnd>;
+
+class FEXDO_H_DESC : MSA_3RF_DESC_BASE<"fexdo.h", int_mips_fexdo_h,
+                                       MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FEXDO_W_DESC : MSA_3RF_DESC_BASE<"fexdo.w", int_mips_fexdo_w,
+                                       MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+// The fexp2.df instruction multiplies the first operand by 2 to the power of
+// the second operand. We therefore need a pseudo-insn in order to invent the
+// 1.0 when we only need to match ISD::FEXP2.
+class FEXP2_W_DESC : MSA_3RF_DESC_BASE<"fexp2.w", mul_fexp2, MSA128WOpnd>;
+class FEXP2_D_DESC : MSA_3RF_DESC_BASE<"fexp2.d", mul_fexp2, MSA128DOpnd>;
+let usesCustomInserter = 1 in {
+  class FEXP2_W_1_PSEUDO_DESC :
+      MipsPseudo<(outs MSA128W:$wd), (ins MSA128W:$ws),
+                 [(set MSA128W:$wd, (fexp2 MSA128W:$ws))]>;
+  class FEXP2_D_1_PSEUDO_DESC :
+      MipsPseudo<(outs MSA128D:$wd), (ins MSA128D:$ws),
+                 [(set MSA128D:$wd, (fexp2 MSA128D:$ws))]>;
+}
+
+class FEXUPL_W_DESC : MSA_2RF_DESC_BASE<"fexupl.w", int_mips_fexupl_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPL_D_DESC : MSA_2RF_DESC_BASE<"fexupl.d", int_mips_fexupl_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FEXUPR_W_DESC : MSA_2RF_DESC_BASE<"fexupr.w", int_mips_fexupr_w,
+                                        MSA128WOpnd, MSA128HOpnd>;
+class FEXUPR_D_DESC : MSA_2RF_DESC_BASE<"fexupr.d", int_mips_fexupr_d,
+                                        MSA128DOpnd, MSA128WOpnd>;
+
+class FFINT_S_W_DESC : MSA_2RF_DESC_BASE<"ffint_s.w", sint_to_fp, MSA128WOpnd>;
+class FFINT_S_D_DESC : MSA_2RF_DESC_BASE<"ffint_s.d", sint_to_fp, MSA128DOpnd>;
+
+class FFINT_U_W_DESC : MSA_2RF_DESC_BASE<"ffint_u.w", uint_to_fp, MSA128WOpnd>;
+class FFINT_U_D_DESC : MSA_2RF_DESC_BASE<"ffint_u.d", uint_to_fp, MSA128DOpnd>;
+
+class FFQL_W_DESC : MSA_2RF_DESC_BASE<"ffql.w", int_mips_ffql_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQL_D_DESC : MSA_2RF_DESC_BASE<"ffql.d", int_mips_ffql_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FFQR_W_DESC : MSA_2RF_DESC_BASE<"ffqr.w", int_mips_ffqr_w,
+                                      MSA128WOpnd, MSA128HOpnd>;
+class FFQR_D_DESC : MSA_2RF_DESC_BASE<"ffqr.d", int_mips_ffqr_d,
+                                      MSA128DOpnd, MSA128WOpnd>;
+
+class FILL_B_DESC : MSA_2R_FILL_DESC_BASE<"fill.b", v16i8, vsplati8,
+                                          MSA128BOpnd, GPR32Opnd>;
+class FILL_H_DESC : MSA_2R_FILL_DESC_BASE<"fill.h", v8i16, vsplati16,
+                                          MSA128HOpnd, GPR32Opnd>;
+class FILL_W_DESC : MSA_2R_FILL_DESC_BASE<"fill.w", v4i32, vsplati32,
+                                          MSA128WOpnd, GPR32Opnd>;
+
+class FILL_FW_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v4f32, vsplatf32, MSA128W,
+                                                    FGR32>;
+class FILL_FD_PSEUDO_DESC : MSA_2R_FILL_PSEUDO_BASE<v2f64, vsplatf64, MSA128D,
+                                                    FGR64>;
+
+class FLOG2_W_DESC : MSA_2RF_DESC_BASE<"flog2.w", flog2, MSA128WOpnd>;
+class FLOG2_D_DESC : MSA_2RF_DESC_BASE<"flog2.d", flog2, MSA128DOpnd>;
+
+class FMADD_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.w", fma, MSA128WOpnd>;
+class FMADD_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmadd.d", fma, MSA128DOpnd>;
+
+class FMAX_W_DESC : MSA_3RF_DESC_BASE<"fmax.w", int_mips_fmax_w, MSA128WOpnd>;
+class FMAX_D_DESC : MSA_3RF_DESC_BASE<"fmax.d", int_mips_fmax_d, MSA128DOpnd>;
+
+class FMAX_A_W_DESC : MSA_3RF_DESC_BASE<"fmax_a.w", int_mips_fmax_a_w,
+                                        MSA128WOpnd>;
+class FMAX_A_D_DESC : MSA_3RF_DESC_BASE<"fmax_a.d", int_mips_fmax_a_d,
+                                        MSA128DOpnd>;
+
+class FMIN_W_DESC : MSA_3RF_DESC_BASE<"fmin.w", int_mips_fmin_w, MSA128WOpnd>;
+class FMIN_D_DESC : MSA_3RF_DESC_BASE<"fmin.d", int_mips_fmin_d, MSA128DOpnd>;
+
+class FMIN_A_W_DESC : MSA_3RF_DESC_BASE<"fmin_a.w", int_mips_fmin_a_w,
+                                        MSA128WOpnd>;
+class FMIN_A_D_DESC : MSA_3RF_DESC_BASE<"fmin_a.d", int_mips_fmin_a_d,
+                                        MSA128DOpnd>;
+
+class FMSUB_W_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.w", fms, MSA128WOpnd>;
+class FMSUB_D_DESC : MSA_3RF_4RF_DESC_BASE<"fmsub.d", fms, MSA128DOpnd>;
+
+class FMUL_W_DESC : MSA_3RF_DESC_BASE<"fmul.w", fmul, MSA128WOpnd>;
+class FMUL_D_DESC : MSA_3RF_DESC_BASE<"fmul.d", fmul, MSA128DOpnd>;
+
+class FRINT_W_DESC : MSA_2RF_DESC_BASE<"frint.w", frint, MSA128WOpnd>;
+class FRINT_D_DESC : MSA_2RF_DESC_BASE<"frint.d", frint, MSA128DOpnd>;
+
+class FRCP_W_DESC : MSA_2RF_DESC_BASE<"frcp.w", int_mips_frcp_w, MSA128WOpnd>;
+class FRCP_D_DESC : MSA_2RF_DESC_BASE<"frcp.d", int_mips_frcp_d, MSA128DOpnd>;
+
+class FRSQRT_W_DESC : MSA_2RF_DESC_BASE<"frsqrt.w", int_mips_frsqrt_w,
+                                        MSA128WOpnd>;
+class FRSQRT_D_DESC : MSA_2RF_DESC_BASE<"frsqrt.d", int_mips_frsqrt_d,
+                                        MSA128DOpnd>;
+
+class FSAF_W_DESC : MSA_3RF_DESC_BASE<"fsaf.w", int_mips_fsaf_w, MSA128WOpnd>;
+class FSAF_D_DESC : MSA_3RF_DESC_BASE<"fsaf.d", int_mips_fsaf_d, MSA128DOpnd>;
+
+class FSEQ_W_DESC : MSA_3RF_DESC_BASE<"fseq.w", int_mips_fseq_w, MSA128WOpnd>;
+class FSEQ_D_DESC : MSA_3RF_DESC_BASE<"fseq.d", int_mips_fseq_d, MSA128DOpnd>;
+
+class FSLE_W_DESC : MSA_3RF_DESC_BASE<"fsle.w", int_mips_fsle_w, MSA128WOpnd>;
+class FSLE_D_DESC : MSA_3RF_DESC_BASE<"fsle.d", int_mips_fsle_d, MSA128DOpnd>;
+
+class FSLT_W_DESC : MSA_3RF_DESC_BASE<"fslt.w", int_mips_fslt_w, MSA128WOpnd>;
+class FSLT_D_DESC : MSA_3RF_DESC_BASE<"fslt.d", int_mips_fslt_d, MSA128DOpnd>;
+
+class FSNE_W_DESC : MSA_3RF_DESC_BASE<"fsne.w", int_mips_fsne_w, MSA128WOpnd>;
+class FSNE_D_DESC : MSA_3RF_DESC_BASE<"fsne.d", int_mips_fsne_d, MSA128DOpnd>;
+
+class FSOR_W_DESC : MSA_3RF_DESC_BASE<"fsor.w", int_mips_fsor_w, MSA128WOpnd>;
+class FSOR_D_DESC : MSA_3RF_DESC_BASE<"fsor.d", int_mips_fsor_d, MSA128DOpnd>;
+
+class FSQRT_W_DESC : MSA_2RF_DESC_BASE<"fsqrt.w", fsqrt, MSA128WOpnd>;
+class FSQRT_D_DESC : MSA_2RF_DESC_BASE<"fsqrt.d", fsqrt, MSA128DOpnd>;
+
+class FSUB_W_DESC : MSA_3RF_DESC_BASE<"fsub.w", fsub, MSA128WOpnd>;
+class FSUB_D_DESC : MSA_3RF_DESC_BASE<"fsub.d", fsub, MSA128DOpnd>;
+
+class FSUEQ_W_DESC : MSA_3RF_DESC_BASE<"fsueq.w", int_mips_fsueq_w,
+                                       MSA128WOpnd>;
+class FSUEQ_D_DESC : MSA_3RF_DESC_BASE<"fsueq.d", int_mips_fsueq_d,
+                                       MSA128DOpnd>;
+
+class FSULE_W_DESC : MSA_3RF_DESC_BASE<"fsule.w", int_mips_fsule_w,
+                                       MSA128WOpnd>;
+class FSULE_D_DESC : MSA_3RF_DESC_BASE<"fsule.d", int_mips_fsule_d,
+                                       MSA128DOpnd>;
+
+class FSULT_W_DESC : MSA_3RF_DESC_BASE<"fsult.w", int_mips_fsult_w,
+                                       MSA128WOpnd>;
+class FSULT_D_DESC : MSA_3RF_DESC_BASE<"fsult.d", int_mips_fsult_d,
+                                       MSA128DOpnd>;
+
+class FSUN_W_DESC : MSA_3RF_DESC_BASE<"fsun.w", int_mips_fsun_w,
+                                      MSA128WOpnd>;
+class FSUN_D_DESC : MSA_3RF_DESC_BASE<"fsun.d", int_mips_fsun_d,
+                                      MSA128DOpnd>;
+
+class FSUNE_W_DESC : MSA_3RF_DESC_BASE<"fsune.w", int_mips_fsune_w,
+                                       MSA128WOpnd>;
+class FSUNE_D_DESC : MSA_3RF_DESC_BASE<"fsune.d", int_mips_fsune_d,
+                                       MSA128DOpnd>;
+
+class FTINT_S_W_DESC : MSA_2RF_DESC_BASE<"ftint_s.w", int_mips_ftint_s_w,
+                                         MSA128WOpnd>;
+class FTINT_S_D_DESC : MSA_2RF_DESC_BASE<"ftint_s.d", int_mips_ftint_s_d,
+                                         MSA128DOpnd>;
+
+class FTINT_U_W_DESC : MSA_2RF_DESC_BASE<"ftint_u.w", int_mips_ftint_u_w,
+                                         MSA128WOpnd>;
+class FTINT_U_D_DESC : MSA_2RF_DESC_BASE<"ftint_u.d", int_mips_ftint_u_d,
+                                         MSA128DOpnd>;
+
+class FTQ_H_DESC : MSA_3RF_DESC_BASE<"ftq.h", int_mips_ftq_h,
+                                     MSA128HOpnd, MSA128WOpnd, MSA128WOpnd>;
+class FTQ_W_DESC : MSA_3RF_DESC_BASE<"ftq.w", int_mips_ftq_w,
+                                     MSA128WOpnd, MSA128DOpnd, MSA128DOpnd>;
+
+class FTRUNC_S_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.w", fp_to_sint,
+                                          MSA128WOpnd>;
+class FTRUNC_S_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_s.d", fp_to_sint,
+                                          MSA128DOpnd>;
+
+class FTRUNC_U_W_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.w", fp_to_uint,
+                                          MSA128WOpnd>;
+class FTRUNC_U_D_DESC : MSA_2RF_DESC_BASE<"ftrunc_u.d", fp_to_uint,
+                                          MSA128DOpnd>;
+
+class HADD_S_H_DESC : MSA_3R_DESC_BASE<"hadd_s.h", int_mips_hadd_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_S_W_DESC : MSA_3R_DESC_BASE<"hadd_s.w", int_mips_hadd_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_S_D_DESC : MSA_3R_DESC_BASE<"hadd_s.d", int_mips_hadd_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HADD_U_H_DESC : MSA_3R_DESC_BASE<"hadd_u.h", int_mips_hadd_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HADD_U_W_DESC : MSA_3R_DESC_BASE<"hadd_u.w", int_mips_hadd_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HADD_U_D_DESC : MSA_3R_DESC_BASE<"hadd_u.d", int_mips_hadd_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_S_H_DESC : MSA_3R_DESC_BASE<"hsub_s.h", int_mips_hsub_s_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_S_W_DESC : MSA_3R_DESC_BASE<"hsub_s.w", int_mips_hsub_s_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_S_D_DESC : MSA_3R_DESC_BASE<"hsub_s.d", int_mips_hsub_s_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class HSUB_U_H_DESC : MSA_3R_DESC_BASE<"hsub_u.h", int_mips_hsub_u_h,
+                                       MSA128HOpnd, MSA128BOpnd, MSA128BOpnd>;
+class HSUB_U_W_DESC : MSA_3R_DESC_BASE<"hsub_u.w", int_mips_hsub_u_w,
+                                       MSA128WOpnd, MSA128HOpnd, MSA128HOpnd>;
+class HSUB_U_D_DESC : MSA_3R_DESC_BASE<"hsub_u.d", int_mips_hsub_u_d,
+                                       MSA128DOpnd, MSA128WOpnd, MSA128WOpnd>;
+
+class ILVEV_B_DESC : MSA_3R_DESC_BASE<"ilvev.b", MipsILVEV, MSA128BOpnd>;
+class ILVEV_H_DESC : MSA_3R_DESC_BASE<"ilvev.h", MipsILVEV, MSA128HOpnd>;
+class ILVEV_W_DESC : MSA_3R_DESC_BASE<"ilvev.w", MipsILVEV, MSA128WOpnd>;
+class ILVEV_D_DESC : MSA_3R_DESC_BASE<"ilvev.d", MipsILVEV, MSA128DOpnd>;
+
+class ILVL_B_DESC : MSA_3R_DESC_BASE<"ilvl.b", MipsILVL, MSA128BOpnd>;
+class ILVL_H_DESC : MSA_3R_DESC_BASE<"ilvl.h", MipsILVL, MSA128HOpnd>;
+class ILVL_W_DESC : MSA_3R_DESC_BASE<"ilvl.w", MipsILVL, MSA128WOpnd>;
+class ILVL_D_DESC : MSA_3R_DESC_BASE<"ilvl.d", MipsILVL, MSA128DOpnd>;
+
+class ILVOD_B_DESC : MSA_3R_DESC_BASE<"ilvod.b", MipsILVOD, MSA128BOpnd>;
+class ILVOD_H_DESC : MSA_3R_DESC_BASE<"ilvod.h", MipsILVOD, MSA128HOpnd>;
+class ILVOD_W_DESC : MSA_3R_DESC_BASE<"ilvod.w", MipsILVOD, MSA128WOpnd>;
+class ILVOD_D_DESC : MSA_3R_DESC_BASE<"ilvod.d", MipsILVOD, MSA128DOpnd>;
+
+class ILVR_B_DESC : MSA_3R_DESC_BASE<"ilvr.b", MipsILVR, MSA128BOpnd>;
+class ILVR_H_DESC : MSA_3R_DESC_BASE<"ilvr.h", MipsILVR, MSA128HOpnd>;
+class ILVR_W_DESC : MSA_3R_DESC_BASE<"ilvr.w", MipsILVR, MSA128WOpnd>;
+class ILVR_D_DESC : MSA_3R_DESC_BASE<"ilvr.d", MipsILVR, MSA128DOpnd>;
+
+class INSERT_B_DESC : MSA_INSERT_DESC_BASE<"insert.b", vinsert_v16i8,
+                                           MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_DESC : MSA_INSERT_DESC_BASE<"insert.h", vinsert_v8i16,
+                                           MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
+                                           MSA128WOpnd, GPR32Opnd>;
+
+class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
+                                                     MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
+                                                     MSA128DOpnd, FGR64Opnd>;
+
+class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", int_mips_insve_b,
+                                         MSA128BOpnd>;
+class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", int_mips_insve_h,
+                                         MSA128HOpnd>;
+class INSVE_W_DESC : MSA_INSVE_DESC_BASE<"insve.w", int_mips_insve_w,
+                                         MSA128WOpnd>;
+class INSVE_D_DESC : MSA_INSVE_DESC_BASE<"insve.d", int_mips_insve_d,
+                                         MSA128DOpnd>;
+
+class LD_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs ROWD:$wd);
+  dag InOperandList = (ins MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(set ROWD:$wd, (TyNode (OpNode Addr:$addr)))];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class LD_B_DESC : LD_DESC_BASE<"ld.b", load, v16i8, MSA128BOpnd>;
+class LD_H_DESC : LD_DESC_BASE<"ld.h", load, v8i16, MSA128HOpnd>;
+class LD_W_DESC : LD_DESC_BASE<"ld.w", load, v4i32, MSA128WOpnd>;
+class LD_D_DESC : LD_DESC_BASE<"ld.d", load, v2i64, MSA128DOpnd>;
+
+class LDI_B_DESC : MSA_I10_LDI_DESC_BASE<"ldi.b", MSA128BOpnd>;
+class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
+class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
+class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
+
+class LSA_DESC {
+  dag OutOperandList = (outs GPR32Opnd:$rd);
+  dag InOperandList = (ins GPR32Opnd:$rs, GPR32Opnd:$rt, LSAImm:$sa);
+  string AsmString = "lsa\t$rd, $rs, $rt, $sa";
+  list<dag> Pattern = [(set GPR32Opnd:$rd, (add GPR32Opnd:$rs,
+                                                (shl GPR32Opnd:$rt,
+                                                     immZExt2Lsa:$sa)))];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class MADD_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.h", int_mips_madd_q_h,
+                                            MSA128HOpnd>;
+class MADD_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"madd_q.w", int_mips_madd_q_w,
+                                            MSA128WOpnd>;
+
+class MADDR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.h", int_mips_maddr_q_h,
+                                             MSA128HOpnd>;
+class MADDR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"maddr_q.w", int_mips_maddr_q_w,
+                                             MSA128WOpnd>;
+
+class MADDV_B_DESC : MSA_3R_4R_DESC_BASE<"maddv.b", muladd, MSA128BOpnd>;
+class MADDV_H_DESC : MSA_3R_4R_DESC_BASE<"maddv.h", muladd, MSA128HOpnd>;
+class MADDV_W_DESC : MSA_3R_4R_DESC_BASE<"maddv.w", muladd, MSA128WOpnd>;
+class MADDV_D_DESC : MSA_3R_4R_DESC_BASE<"maddv.d", muladd, MSA128DOpnd>;
+
+class MAX_A_B_DESC : MSA_3R_DESC_BASE<"max_a.b", int_mips_max_a_b, MSA128BOpnd>;
+class MAX_A_H_DESC : MSA_3R_DESC_BASE<"max_a.h", int_mips_max_a_h, MSA128HOpnd>;
+class MAX_A_W_DESC : MSA_3R_DESC_BASE<"max_a.w", int_mips_max_a_w, MSA128WOpnd>;
+class MAX_A_D_DESC : MSA_3R_DESC_BASE<"max_a.d", int_mips_max_a_d, MSA128DOpnd>;
+
+class MAX_S_B_DESC : MSA_3R_DESC_BASE<"max_s.b", MipsVSMax, MSA128BOpnd>;
+class MAX_S_H_DESC : MSA_3R_DESC_BASE<"max_s.h", MipsVSMax, MSA128HOpnd>;
+class MAX_S_W_DESC : MSA_3R_DESC_BASE<"max_s.w", MipsVSMax, MSA128WOpnd>;
+class MAX_S_D_DESC : MSA_3R_DESC_BASE<"max_s.d", MipsVSMax, MSA128DOpnd>;
+
+class MAX_U_B_DESC : MSA_3R_DESC_BASE<"max_u.b", MipsVUMax, MSA128BOpnd>;
+class MAX_U_H_DESC : MSA_3R_DESC_BASE<"max_u.h", MipsVUMax, MSA128HOpnd>;
+class MAX_U_W_DESC : MSA_3R_DESC_BASE<"max_u.w", MipsVUMax, MSA128WOpnd>;
+class MAX_U_D_DESC : MSA_3R_DESC_BASE<"max_u.d", MipsVUMax, MSA128DOpnd>;
+
+class MAXI_S_B_DESC : MSA_I5_DESC_BASE<"maxi_s.b", MipsVSMax, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MAXI_S_H_DESC : MSA_I5_DESC_BASE<"maxi_s.h", MipsVSMax, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MAXI_S_W_DESC : MSA_I5_DESC_BASE<"maxi_s.w", MipsVSMax, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MAXI_S_D_DESC : MSA_I5_DESC_BASE<"maxi_s.d", MipsVSMax, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MAXI_U_B_DESC : MSA_I5_DESC_BASE<"maxi_u.b", MipsVUMax, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MAXI_U_H_DESC : MSA_I5_DESC_BASE<"maxi_u.h", MipsVUMax, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MAXI_U_W_DESC : MSA_I5_DESC_BASE<"maxi_u.w", MipsVUMax, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MAXI_U_D_DESC : MSA_I5_DESC_BASE<"maxi_u.d", MipsVUMax, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MIN_A_B_DESC : MSA_3R_DESC_BASE<"min_a.b", int_mips_min_a_b, MSA128BOpnd>;
+class MIN_A_H_DESC : MSA_3R_DESC_BASE<"min_a.h", int_mips_min_a_h, MSA128HOpnd>;
+class MIN_A_W_DESC : MSA_3R_DESC_BASE<"min_a.w", int_mips_min_a_w, MSA128WOpnd>;
+class MIN_A_D_DESC : MSA_3R_DESC_BASE<"min_a.d", int_mips_min_a_d, MSA128DOpnd>;
+
+class MIN_S_B_DESC : MSA_3R_DESC_BASE<"min_s.b", MipsVSMin, MSA128BOpnd>;
+class MIN_S_H_DESC : MSA_3R_DESC_BASE<"min_s.h", MipsVSMin, MSA128HOpnd>;
+class MIN_S_W_DESC : MSA_3R_DESC_BASE<"min_s.w", MipsVSMin, MSA128WOpnd>;
+class MIN_S_D_DESC : MSA_3R_DESC_BASE<"min_s.d", MipsVSMin, MSA128DOpnd>;
+
+class MIN_U_B_DESC : MSA_3R_DESC_BASE<"min_u.b", MipsVUMin, MSA128BOpnd>;
+class MIN_U_H_DESC : MSA_3R_DESC_BASE<"min_u.h", MipsVUMin, MSA128HOpnd>;
+class MIN_U_W_DESC : MSA_3R_DESC_BASE<"min_u.w", MipsVUMin, MSA128WOpnd>;
+class MIN_U_D_DESC : MSA_3R_DESC_BASE<"min_u.d", MipsVUMin, MSA128DOpnd>;
+
+class MINI_S_B_DESC : MSA_I5_DESC_BASE<"mini_s.b", MipsVSMin, vsplati8_simm5,
+                                       MSA128BOpnd>;
+class MINI_S_H_DESC : MSA_I5_DESC_BASE<"mini_s.h", MipsVSMin, vsplati16_simm5,
+                                       MSA128HOpnd>;
+class MINI_S_W_DESC : MSA_I5_DESC_BASE<"mini_s.w", MipsVSMin, vsplati32_simm5,
+                                       MSA128WOpnd>;
+class MINI_S_D_DESC : MSA_I5_DESC_BASE<"mini_s.d", MipsVSMin, vsplati64_simm5,
+                                       MSA128DOpnd>;
+
+class MINI_U_B_DESC : MSA_I5_DESC_BASE<"mini_u.b", MipsVUMin, vsplati8_uimm5,
+                                       MSA128BOpnd>;
+class MINI_U_H_DESC : MSA_I5_DESC_BASE<"mini_u.h", MipsVUMin, vsplati16_uimm5,
+                                       MSA128HOpnd>;
+class MINI_U_W_DESC : MSA_I5_DESC_BASE<"mini_u.w", MipsVUMin, vsplati32_uimm5,
+                                       MSA128WOpnd>;
+class MINI_U_D_DESC : MSA_I5_DESC_BASE<"mini_u.d", MipsVUMin, vsplati64_uimm5,
+                                       MSA128DOpnd>;
+
+class MOD_S_B_DESC : MSA_3R_DESC_BASE<"mod_s.b", srem, MSA128BOpnd>;
+class MOD_S_H_DESC : MSA_3R_DESC_BASE<"mod_s.h", srem, MSA128HOpnd>;
+class MOD_S_W_DESC : MSA_3R_DESC_BASE<"mod_s.w", srem, MSA128WOpnd>;
+class MOD_S_D_DESC : MSA_3R_DESC_BASE<"mod_s.d", srem, MSA128DOpnd>;
+
+class MOD_U_B_DESC : MSA_3R_DESC_BASE<"mod_u.b", urem, MSA128BOpnd>;
+class MOD_U_H_DESC : MSA_3R_DESC_BASE<"mod_u.h", urem, MSA128HOpnd>;
+class MOD_U_W_DESC : MSA_3R_DESC_BASE<"mod_u.w", urem, MSA128WOpnd>;
+class MOD_U_D_DESC : MSA_3R_DESC_BASE<"mod_u.d", urem, MSA128DOpnd>;
+
+class MOVE_V_DESC {
+  dag OutOperandList = (outs MSA128BOpnd:$wd);
+  dag InOperandList = (ins MSA128BOpnd:$ws);
+  string AsmString = "move.v\t$wd, $ws";
+  list<dag> Pattern = [];
+  InstrItinClass Itinerary = NoItinerary;
+}
+
+class MSUB_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.h", int_mips_msub_q_h,
+                                            MSA128HOpnd>;
+class MSUB_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msub_q.w", int_mips_msub_q_w,
+                                            MSA128WOpnd>;
+
+class MSUBR_Q_H_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.h", int_mips_msubr_q_h,
+                                             MSA128HOpnd>;
+class MSUBR_Q_W_DESC : MSA_3RF_4RF_DESC_BASE<"msubr_q.w", int_mips_msubr_q_w,
+                                             MSA128WOpnd>;
+
+class MSUBV_B_DESC : MSA_3R_4R_DESC_BASE<"msubv.b", mulsub, MSA128BOpnd>;
+class MSUBV_H_DESC : MSA_3R_4R_DESC_BASE<"msubv.h", mulsub, MSA128HOpnd>;
+class MSUBV_W_DESC : MSA_3R_4R_DESC_BASE<"msubv.w", mulsub, MSA128WOpnd>;
+class MSUBV_D_DESC : MSA_3R_4R_DESC_BASE<"msubv.d", mulsub, MSA128DOpnd>;
+
+class MUL_Q_H_DESC : MSA_3RF_DESC_BASE<"mul_q.h", int_mips_mul_q_h,
+                                       MSA128HOpnd>;
+class MUL_Q_W_DESC : MSA_3RF_DESC_BASE<"mul_q.w", int_mips_mul_q_w,
+                                       MSA128WOpnd>;
+
+class MULR_Q_H_DESC : MSA_3RF_DESC_BASE<"mulr_q.h", int_mips_mulr_q_h,
+                                        MSA128HOpnd>;
+class MULR_Q_W_DESC : MSA_3RF_DESC_BASE<"mulr_q.w", int_mips_mulr_q_w,
+                                        MSA128WOpnd>;
+
+class MULV_B_DESC : MSA_3R_DESC_BASE<"mulv.b", mul, MSA128BOpnd>;
+class MULV_H_DESC : MSA_3R_DESC_BASE<"mulv.h", mul, MSA128HOpnd>;
+class MULV_W_DESC : MSA_3R_DESC_BASE<"mulv.w", mul, MSA128WOpnd>;
+class MULV_D_DESC : MSA_3R_DESC_BASE<"mulv.d", mul, MSA128DOpnd>;
+
+class NLOC_B_DESC : MSA_2R_DESC_BASE<"nloc.b", int_mips_nloc_b, MSA128BOpnd>;
+class NLOC_H_DESC : MSA_2R_DESC_BASE<"nloc.h", int_mips_nloc_h, MSA128HOpnd>;
+class NLOC_W_DESC : MSA_2R_DESC_BASE<"nloc.w", int_mips_nloc_w, MSA128WOpnd>;
+class NLOC_D_DESC : MSA_2R_DESC_BASE<"nloc.d", int_mips_nloc_d, MSA128DOpnd>;
+
+class NLZC_B_DESC : MSA_2R_DESC_BASE<"nlzc.b", ctlz, MSA128BOpnd>;
+class NLZC_H_DESC : MSA_2R_DESC_BASE<"nlzc.h", ctlz, MSA128HOpnd>;
+class NLZC_W_DESC : MSA_2R_DESC_BASE<"nlzc.w", ctlz, MSA128WOpnd>;
+class NLZC_D_DESC : MSA_2R_DESC_BASE<"nlzc.d", ctlz, MSA128DOpnd>;
+
+class NOR_V_DESC : MSA_VEC_DESC_BASE<"nor.v", MipsVNOR, MSA128BOpnd>;
+class NOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128HOpnd>;
+class NOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128WOpnd>;
+class NOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<MipsVNOR, MSA128DOpnd>;
+
+class NORI_B_DESC : MSA_I8_DESC_BASE<"nori.b", MipsVNOR, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+class OR_V_DESC : MSA_VEC_DESC_BASE<"or.v", or, MSA128BOpnd>;
+class OR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128HOpnd>;
+class OR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128WOpnd>;
+class OR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<or, MSA128DOpnd>;
+
+class ORI_B_DESC : MSA_I8_DESC_BASE<"ori.b", or, vsplati8_uimm8, MSA128BOpnd>;
+
+class PCKEV_B_DESC : MSA_3R_DESC_BASE<"pckev.b", MipsPCKEV, MSA128BOpnd>;
+class PCKEV_H_DESC : MSA_3R_DESC_BASE<"pckev.h", MipsPCKEV, MSA128HOpnd>;
+class PCKEV_W_DESC : MSA_3R_DESC_BASE<"pckev.w", MipsPCKEV, MSA128WOpnd>;
+class PCKEV_D_DESC : MSA_3R_DESC_BASE<"pckev.d", MipsPCKEV, MSA128DOpnd>;
+
+class PCKOD_B_DESC : MSA_3R_DESC_BASE<"pckod.b", MipsPCKOD, MSA128BOpnd>;
+class PCKOD_H_DESC : MSA_3R_DESC_BASE<"pckod.h", MipsPCKOD, MSA128HOpnd>;
+class PCKOD_W_DESC : MSA_3R_DESC_BASE<"pckod.w", MipsPCKOD, MSA128WOpnd>;
+class PCKOD_D_DESC : MSA_3R_DESC_BASE<"pckod.d", MipsPCKOD, MSA128DOpnd>;
+
+class PCNT_B_DESC : MSA_2R_DESC_BASE<"pcnt.b", ctpop, MSA128BOpnd>;
+class PCNT_H_DESC : MSA_2R_DESC_BASE<"pcnt.h", ctpop, MSA128HOpnd>;
+class PCNT_W_DESC : MSA_2R_DESC_BASE<"pcnt.w", ctpop, MSA128WOpnd>;
+class PCNT_D_DESC : MSA_2R_DESC_BASE<"pcnt.d", ctpop, MSA128DOpnd>;
+
+class SAT_S_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_s.b", int_mips_sat_s_b,
+                                           MSA128BOpnd>;
+class SAT_S_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_s.h", int_mips_sat_s_h,
+                                           MSA128HOpnd>;
+class SAT_S_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_s.w", int_mips_sat_s_w,
+                                           MSA128WOpnd>;
+class SAT_S_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_s.d", int_mips_sat_s_d,
+                                           MSA128DOpnd>;
+
+class SAT_U_B_DESC : MSA_BIT_B_X_DESC_BASE<"sat_u.b", int_mips_sat_u_b,
+                                           MSA128BOpnd>;
+class SAT_U_H_DESC : MSA_BIT_H_X_DESC_BASE<"sat_u.h", int_mips_sat_u_h,
+                                           MSA128HOpnd>;
+class SAT_U_W_DESC : MSA_BIT_W_X_DESC_BASE<"sat_u.w", int_mips_sat_u_w,
+                                           MSA128WOpnd>;
+class SAT_U_D_DESC : MSA_BIT_D_X_DESC_BASE<"sat_u.d", int_mips_sat_u_d,
+                                           MSA128DOpnd>;
+
+class SHF_B_DESC : MSA_I8_SHF_DESC_BASE<"shf.b", MSA128BOpnd>;
+class SHF_H_DESC : MSA_I8_SHF_DESC_BASE<"shf.h", MSA128HOpnd>;
+class SHF_W_DESC : MSA_I8_SHF_DESC_BASE<"shf.w", MSA128WOpnd>;
+
+class SLD_B_DESC : MSA_3R_SLD_DESC_BASE<"sld.b", int_mips_sld_b, MSA128BOpnd>;
+class SLD_H_DESC : MSA_3R_SLD_DESC_BASE<"sld.h", int_mips_sld_h, MSA128HOpnd>;
+class SLD_W_DESC : MSA_3R_SLD_DESC_BASE<"sld.w", int_mips_sld_w, MSA128WOpnd>;
+class SLD_D_DESC : MSA_3R_SLD_DESC_BASE<"sld.d", int_mips_sld_d, MSA128DOpnd>;
+
+class SLDI_B_DESC : MSA_ELM_DESC_BASE<"sldi.b", int_mips_sldi_b, MSA128BOpnd>;
+class SLDI_H_DESC : MSA_ELM_DESC_BASE<"sldi.h", int_mips_sldi_h, MSA128HOpnd>;
+class SLDI_W_DESC : MSA_ELM_DESC_BASE<"sldi.w", int_mips_sldi_w, MSA128WOpnd>;
+class SLDI_D_DESC : MSA_ELM_DESC_BASE<"sldi.d", int_mips_sldi_d, MSA128DOpnd>;
+
+class SLL_B_DESC : MSA_3R_DESC_BASE<"sll.b", shl, MSA128BOpnd>;
+class SLL_H_DESC : MSA_3R_DESC_BASE<"sll.h", shl, MSA128HOpnd>;
+class SLL_W_DESC : MSA_3R_DESC_BASE<"sll.w", shl, MSA128WOpnd>;
+class SLL_D_DESC : MSA_3R_DESC_BASE<"sll.d", shl, MSA128DOpnd>;
+
+class SLLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.b", shl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SLLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.h", shl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SLLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.w", shl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SLLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"slli.d", shl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SPLAT_B_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.b", vsplati8_elt,
+                                            MSA128BOpnd>;
+class SPLAT_H_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.h", vsplati16_elt,
+                                            MSA128HOpnd>;
+class SPLAT_W_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.w", vsplati32_elt,
+                                            MSA128WOpnd>;
+class SPLAT_D_DESC : MSA_3R_SPLAT_DESC_BASE<"splat.d", vsplati64_elt,
+                                            MSA128DOpnd>;
+
+class SPLATI_B_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.b", vsplati8_uimm4,
+                                              MSA128BOpnd>;
+class SPLATI_H_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.h", vsplati16_uimm3,
+                                              MSA128HOpnd>;
+class SPLATI_W_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.w", vsplati32_uimm2,
+                                              MSA128WOpnd>;
+class SPLATI_D_DESC : MSA_ELM_SPLAT_DESC_BASE<"splati.d", vsplati64_uimm1,
+                                              MSA128DOpnd>;
+
+class SRA_B_DESC : MSA_3R_DESC_BASE<"sra.b", sra, MSA128BOpnd>;
+class SRA_H_DESC : MSA_3R_DESC_BASE<"sra.h", sra, MSA128HOpnd>;
+class SRA_W_DESC : MSA_3R_DESC_BASE<"sra.w", sra, MSA128WOpnd>;
+class SRA_D_DESC : MSA_3R_DESC_BASE<"sra.d", sra, MSA128DOpnd>;
+
+class SRAI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.b", sra, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRAI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.h", sra, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRAI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.w", sra, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRAI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srai.d", sra, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRAR_B_DESC : MSA_3R_DESC_BASE<"srar.b", int_mips_srar_b, MSA128BOpnd>;
+class SRAR_H_DESC : MSA_3R_DESC_BASE<"srar.h", int_mips_srar_h, MSA128HOpnd>;
+class SRAR_W_DESC : MSA_3R_DESC_BASE<"srar.w", int_mips_srar_w, MSA128WOpnd>;
+class SRAR_D_DESC : MSA_3R_DESC_BASE<"srar.d", int_mips_srar_d, MSA128DOpnd>;
+
+class SRARI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srari.b", int_mips_srari_b,
+                                           MSA128BOpnd>;
+class SRARI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srari.h", int_mips_srari_h,
+                                           MSA128HOpnd>;
+class SRARI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srari.w", int_mips_srari_w,
+                                           MSA128WOpnd>;
+class SRARI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srari.d", int_mips_srari_d,
+                                           MSA128DOpnd>;
+
+class SRL_B_DESC : MSA_3R_DESC_BASE<"srl.b", srl, MSA128BOpnd>;
+class SRL_H_DESC : MSA_3R_DESC_BASE<"srl.h", srl, MSA128HOpnd>;
+class SRL_W_DESC : MSA_3R_DESC_BASE<"srl.w", srl, MSA128WOpnd>;
+class SRL_D_DESC : MSA_3R_DESC_BASE<"srl.d", srl, MSA128DOpnd>;
+
+class SRLI_B_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.b", srl, vsplati8_uimm3,
+                                            MSA128BOpnd>;
+class SRLI_H_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.h", srl, vsplati16_uimm4,
+                                            MSA128HOpnd>;
+class SRLI_W_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.w", srl, vsplati32_uimm5,
+                                            MSA128WOpnd>;
+class SRLI_D_DESC : MSA_BIT_SPLAT_DESC_BASE<"srli.d", srl, vsplati64_uimm6,
+                                            MSA128DOpnd>;
+
+class SRLR_B_DESC : MSA_3R_DESC_BASE<"srlr.b", int_mips_srlr_b, MSA128BOpnd>;
+class SRLR_H_DESC : MSA_3R_DESC_BASE<"srlr.h", int_mips_srlr_h, MSA128HOpnd>;
+class SRLR_W_DESC : MSA_3R_DESC_BASE<"srlr.w", int_mips_srlr_w, MSA128WOpnd>;
+class SRLR_D_DESC : MSA_3R_DESC_BASE<"srlr.d", int_mips_srlr_d, MSA128DOpnd>;
+
+class SRLRI_B_DESC : MSA_BIT_B_X_DESC_BASE<"srlri.b", int_mips_srlri_b,
+                                           MSA128BOpnd>;
+class SRLRI_H_DESC : MSA_BIT_H_X_DESC_BASE<"srlri.h", int_mips_srlri_h,
+                                           MSA128HOpnd>;
+class SRLRI_W_DESC : MSA_BIT_W_X_DESC_BASE<"srlri.w", int_mips_srlri_w,
+                                           MSA128WOpnd>;
+class SRLRI_D_DESC : MSA_BIT_D_X_DESC_BASE<"srlri.d", int_mips_srlri_d,
+                                           MSA128DOpnd>;
+
+class ST_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+                   ValueType TyNode, RegisterOperand ROWD,
+                   Operand MemOpnd = mem, ComplexPattern Addr = addrRegImm,
+                   InstrItinClass itin = NoItinerary> {
+  dag OutOperandList = (outs);
+  dag InOperandList = (ins ROWD:$wd, MemOpnd:$addr);
+  string AsmString = !strconcat(instr_asm, "\t$wd, $addr");
+  list<dag> Pattern = [(OpNode (TyNode ROWD:$wd), Addr:$addr)];
+  InstrItinClass Itinerary = itin;
+  string DecoderMethod = "DecodeMSA128Mem";
+}
+
+class ST_B_DESC : ST_DESC_BASE<"st.b", store, v16i8, MSA128BOpnd>;
+class ST_H_DESC : ST_DESC_BASE<"st.h", store, v8i16, MSA128HOpnd>;
+class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd>;
+class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd>;
+
+class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
+                                       MSA128BOpnd>;
+class SUBS_S_H_DESC : MSA_3R_DESC_BASE<"subs_s.h", int_mips_subs_s_h,
+                                       MSA128HOpnd>;
+class SUBS_S_W_DESC : MSA_3R_DESC_BASE<"subs_s.w", int_mips_subs_s_w,
+                                       MSA128WOpnd>;
+class SUBS_S_D_DESC : MSA_3R_DESC_BASE<"subs_s.d", int_mips_subs_s_d,
+                                       MSA128DOpnd>;
+
+class SUBS_U_B_DESC : MSA_3R_DESC_BASE<"subs_u.b", int_mips_subs_u_b,
+                                       MSA128BOpnd>;
+class SUBS_U_H_DESC : MSA_3R_DESC_BASE<"subs_u.h", int_mips_subs_u_h,
+                                       MSA128HOpnd>;
+class SUBS_U_W_DESC : MSA_3R_DESC_BASE<"subs_u.w", int_mips_subs_u_w,
+                                       MSA128WOpnd>;
+class SUBS_U_D_DESC : MSA_3R_DESC_BASE<"subs_u.d", int_mips_subs_u_d,
+                                       MSA128DOpnd>;
+
+class SUBSUS_U_B_DESC : MSA_3R_DESC_BASE<"subsus_u.b", int_mips_subsus_u_b,
+                                         MSA128BOpnd>;
+class SUBSUS_U_H_DESC : MSA_3R_DESC_BASE<"subsus_u.h", int_mips_subsus_u_h,
+                                         MSA128HOpnd>;
+class SUBSUS_U_W_DESC : MSA_3R_DESC_BASE<"subsus_u.w", int_mips_subsus_u_w,
+                                         MSA128WOpnd>;
+class SUBSUS_U_D_DESC : MSA_3R_DESC_BASE<"subsus_u.d", int_mips_subsus_u_d,
+                                         MSA128DOpnd>;
+
+class SUBSUU_S_B_DESC : MSA_3R_DESC_BASE<"subsuu_s.b", int_mips_subsuu_s_b,
+                                         MSA128BOpnd>;
+class SUBSUU_S_H_DESC : MSA_3R_DESC_BASE<"subsuu_s.h", int_mips_subsuu_s_h,
+                                         MSA128HOpnd>;
+class SUBSUU_S_W_DESC : MSA_3R_DESC_BASE<"subsuu_s.w", int_mips_subsuu_s_w,
+                                         MSA128WOpnd>;
+class SUBSUU_S_D_DESC : MSA_3R_DESC_BASE<"subsuu_s.d", int_mips_subsuu_s_d,
+                                         MSA128DOpnd>;
+
+class SUBV_B_DESC : MSA_3R_DESC_BASE<"subv.b", sub, MSA128BOpnd>;
+class SUBV_H_DESC : MSA_3R_DESC_BASE<"subv.h", sub, MSA128HOpnd>;
+class SUBV_W_DESC : MSA_3R_DESC_BASE<"subv.w", sub, MSA128WOpnd>;
+class SUBV_D_DESC : MSA_3R_DESC_BASE<"subv.d", sub, MSA128DOpnd>;
+
+class SUBVI_B_DESC : MSA_I5_DESC_BASE<"subvi.b", sub, vsplati8_uimm5,
+                                      MSA128BOpnd>;
+class SUBVI_H_DESC : MSA_I5_DESC_BASE<"subvi.h", sub, vsplati16_uimm5,
+                                      MSA128HOpnd>;
+class SUBVI_W_DESC : MSA_I5_DESC_BASE<"subvi.w", sub, vsplati32_uimm5,
+                                      MSA128WOpnd>;
+class SUBVI_D_DESC : MSA_I5_DESC_BASE<"subvi.d", sub, vsplati64_uimm5,
+                                      MSA128DOpnd>;
+
+class VSHF_B_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.b", MSA128BOpnd>;
+class VSHF_H_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.h", MSA128HOpnd>;
+class VSHF_W_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.w", MSA128WOpnd>;
+class VSHF_D_DESC : MSA_3R_VSHF_DESC_BASE<"vshf.d", MSA128DOpnd>;
+
+class XOR_V_DESC : MSA_VEC_DESC_BASE<"xor.v", xor, MSA128BOpnd>;
+class XOR_V_H_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128HOpnd>;
+class XOR_V_W_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128WOpnd>;
+class XOR_V_D_PSEUDO_DESC : MSA_VEC_PSEUDO_BASE<xor, MSA128DOpnd>;
+
+class XORI_B_DESC : MSA_I8_DESC_BASE<"xori.b", xor, vsplati8_uimm8,
+                                     MSA128BOpnd>;
+
+// Instruction defs.
+def ADD_A_B : ADD_A_B_ENC, ADD_A_B_DESC;
+def ADD_A_H : ADD_A_H_ENC, ADD_A_H_DESC;
+def ADD_A_W : ADD_A_W_ENC, ADD_A_W_DESC;
+def ADD_A_D : ADD_A_D_ENC, ADD_A_D_DESC;
+
+def ADDS_A_B : ADDS_A_B_ENC, ADDS_A_B_DESC;
+def ADDS_A_H : ADDS_A_H_ENC, ADDS_A_H_DESC;
+def ADDS_A_W : ADDS_A_W_ENC, ADDS_A_W_DESC;
+def ADDS_A_D : ADDS_A_D_ENC, ADDS_A_D_DESC;
+
+def ADDS_S_B : ADDS_S_B_ENC, ADDS_S_B_DESC;
+def ADDS_S_H : ADDS_S_H_ENC, ADDS_S_H_DESC;
+def ADDS_S_W : ADDS_S_W_ENC, ADDS_S_W_DESC;
+def ADDS_S_D : ADDS_S_D_ENC, ADDS_S_D_DESC;
+
+def ADDS_U_B : ADDS_U_B_ENC, ADDS_U_B_DESC;
+def ADDS_U_H : ADDS_U_H_ENC, ADDS_U_H_DESC;
+def ADDS_U_W : ADDS_U_W_ENC, ADDS_U_W_DESC;
+def ADDS_U_D : ADDS_U_D_ENC, ADDS_U_D_DESC;
+
+def ADDV_B : ADDV_B_ENC, ADDV_B_DESC;
+def ADDV_H : ADDV_H_ENC, ADDV_H_DESC;
+def ADDV_W : ADDV_W_ENC, ADDV_W_DESC;
+def ADDV_D : ADDV_D_ENC, ADDV_D_DESC;
+
+def ADDVI_B : ADDVI_B_ENC, ADDVI_B_DESC;
+def ADDVI_H : ADDVI_H_ENC, ADDVI_H_DESC;
+def ADDVI_W : ADDVI_W_ENC, ADDVI_W_DESC;
+def ADDVI_D : ADDVI_D_ENC, ADDVI_D_DESC;
+
+def AND_V : AND_V_ENC, AND_V_DESC;
+def AND_V_H_PSEUDO : AND_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_W_PSEUDO : AND_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def AND_V_D_PSEUDO : AND_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(AND_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def ANDI_B : ANDI_B_ENC, ANDI_B_DESC;
+
+def ASUB_S_B : ASUB_S_B_ENC, ASUB_S_B_DESC;
+def ASUB_S_H : ASUB_S_H_ENC, ASUB_S_H_DESC;
+def ASUB_S_W : ASUB_S_W_ENC, ASUB_S_W_DESC;
+def ASUB_S_D : ASUB_S_D_ENC, ASUB_S_D_DESC;
+
+def ASUB_U_B : ASUB_U_B_ENC, ASUB_U_B_DESC;
+def ASUB_U_H : ASUB_U_H_ENC, ASUB_U_H_DESC;
+def ASUB_U_W : ASUB_U_W_ENC, ASUB_U_W_DESC;
+def ASUB_U_D : ASUB_U_D_ENC, ASUB_U_D_DESC;
+
+def AVE_S_B : AVE_S_B_ENC, AVE_S_B_DESC;
+def AVE_S_H : AVE_S_H_ENC, AVE_S_H_DESC;
+def AVE_S_W : AVE_S_W_ENC, AVE_S_W_DESC;
+def AVE_S_D : AVE_S_D_ENC, AVE_S_D_DESC;
+
+def AVE_U_B : AVE_U_B_ENC, AVE_U_B_DESC;
+def AVE_U_H : AVE_U_H_ENC, AVE_U_H_DESC;
+def AVE_U_W : AVE_U_W_ENC, AVE_U_W_DESC;
+def AVE_U_D : AVE_U_D_ENC, AVE_U_D_DESC;
+
+def AVER_S_B : AVER_S_B_ENC, AVER_S_B_DESC;
+def AVER_S_H : AVER_S_H_ENC, AVER_S_H_DESC;
+def AVER_S_W : AVER_S_W_ENC, AVER_S_W_DESC;
+def AVER_S_D : AVER_S_D_ENC, AVER_S_D_DESC;
+
+def AVER_U_B : AVER_U_B_ENC, AVER_U_B_DESC;
+def AVER_U_H : AVER_U_H_ENC, AVER_U_H_DESC;
+def AVER_U_W : AVER_U_W_ENC, AVER_U_W_DESC;
+def AVER_U_D : AVER_U_D_ENC, AVER_U_D_DESC;
+
+def BCLR_B : BCLR_B_ENC, BCLR_B_DESC;
+def BCLR_H : BCLR_H_ENC, BCLR_H_DESC;
+def BCLR_W : BCLR_W_ENC, BCLR_W_DESC;
+def BCLR_D : BCLR_D_ENC, BCLR_D_DESC;
+
+def BCLRI_B : BCLRI_B_ENC, BCLRI_B_DESC;
+def BCLRI_H : BCLRI_H_ENC, BCLRI_H_DESC;
+def BCLRI_W : BCLRI_W_ENC, BCLRI_W_DESC;
+def BCLRI_D : BCLRI_D_ENC, BCLRI_D_DESC;
+
+def BINSL_B : BINSL_B_ENC, BINSL_B_DESC;
+def BINSL_H : BINSL_H_ENC, BINSL_H_DESC;
+def BINSL_W : BINSL_W_ENC, BINSL_W_DESC;
+def BINSL_D : BINSL_D_ENC, BINSL_D_DESC;
+
+def BINSLI_B : BINSLI_B_ENC, BINSLI_B_DESC;
+def BINSLI_H : BINSLI_H_ENC, BINSLI_H_DESC;
+def BINSLI_W : BINSLI_W_ENC, BINSLI_W_DESC;
+def BINSLI_D : BINSLI_D_ENC, BINSLI_D_DESC;
+
+def BINSR_B : BINSR_B_ENC, BINSR_B_DESC;
+def BINSR_H : BINSR_H_ENC, BINSR_H_DESC;
+def BINSR_W : BINSR_W_ENC, BINSR_W_DESC;
+def BINSR_D : BINSR_D_ENC, BINSR_D_DESC;
+
+def BINSRI_B : BINSRI_B_ENC, BINSRI_B_DESC;
+def BINSRI_H : BINSRI_H_ENC, BINSRI_H_DESC;
+def BINSRI_W : BINSRI_W_ENC, BINSRI_W_DESC;
+def BINSRI_D : BINSRI_D_ENC, BINSRI_D_DESC;
+
+def BMNZ_V : BMNZ_V_ENC, BMNZ_V_DESC;
+
+def BMNZI_B : BMNZI_B_ENC, BMNZI_B_DESC;
+
+def BMZ_V : BMZ_V_ENC, BMZ_V_DESC;
+
+def BMZI_B : BMZI_B_ENC, BMZI_B_DESC;
+
+def BNEG_B : BNEG_B_ENC, BNEG_B_DESC;
+def BNEG_H : BNEG_H_ENC, BNEG_H_DESC;
+def BNEG_W : BNEG_W_ENC, BNEG_W_DESC;
+def BNEG_D : BNEG_D_ENC, BNEG_D_DESC;
+
+def BNEGI_B : BNEGI_B_ENC, BNEGI_B_DESC;
+def BNEGI_H : BNEGI_H_ENC, BNEGI_H_DESC;
+def BNEGI_W : BNEGI_W_ENC, BNEGI_W_DESC;
+def BNEGI_D : BNEGI_D_ENC, BNEGI_D_DESC;
+
+def BNZ_B : BNZ_B_ENC, BNZ_B_DESC;
+def BNZ_H : BNZ_H_ENC, BNZ_H_DESC;
+def BNZ_W : BNZ_W_ENC, BNZ_W_DESC;
+def BNZ_D : BNZ_D_ENC, BNZ_D_DESC;
+
+def BNZ_V : BNZ_V_ENC, BNZ_V_DESC;
+
+def BSEL_V : BSEL_V_ENC, BSEL_V_DESC;
+
+class MSA_BSEL_PSEUDO_BASE<RegisterOperand RO, ValueType Ty> :
+  MipsPseudo<(outs RO:$wd), (ins RO:$wd_in, RO:$ws, RO:$wt),
+             [(set RO:$wd, (Ty (vselect RO:$wd_in, RO:$ws, RO:$wt)))]>,
+  PseudoInstExpansion<(BSEL_V MSA128BOpnd:$wd, MSA128BOpnd:$wd_in,
+                              MSA128BOpnd:$ws, MSA128BOpnd:$wt)> {
+  let Constraints = "$wd_in = $wd";
+}
+
+def BSEL_H_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128HOpnd, v8i16>;
+def BSEL_W_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4i32>;
+def BSEL_D_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2i64>;
+def BSEL_FW_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128WOpnd, v4f32>;
+def BSEL_FD_PSEUDO : MSA_BSEL_PSEUDO_BASE<MSA128DOpnd, v2f64>;
+
+def BSELI_B : BSELI_B_ENC, BSELI_B_DESC;
+
+def BSET_B : BSET_B_ENC, BSET_B_DESC;
+def BSET_H : BSET_H_ENC, BSET_H_DESC;
+def BSET_W : BSET_W_ENC, BSET_W_DESC;
+def BSET_D : BSET_D_ENC, BSET_D_DESC;
+
+def BSETI_B : BSETI_B_ENC, BSETI_B_DESC;
+def BSETI_H : BSETI_H_ENC, BSETI_H_DESC;
+def BSETI_W : BSETI_W_ENC, BSETI_W_DESC;
+def BSETI_D : BSETI_D_ENC, BSETI_D_DESC;
+
+def BZ_B : BZ_B_ENC, BZ_B_DESC;
+def BZ_H : BZ_H_ENC, BZ_H_DESC;
+def BZ_W : BZ_W_ENC, BZ_W_DESC;
+def BZ_D : BZ_D_ENC, BZ_D_DESC;
+
+def BZ_V : BZ_V_ENC, BZ_V_DESC;
+
+def CEQ_B : CEQ_B_ENC, CEQ_B_DESC;
+def CEQ_H : CEQ_H_ENC, CEQ_H_DESC;
+def CEQ_W : CEQ_W_ENC, CEQ_W_DESC;
+def CEQ_D : CEQ_D_ENC, CEQ_D_DESC;
+
+def CEQI_B : CEQI_B_ENC, CEQI_B_DESC;
+def CEQI_H : CEQI_H_ENC, CEQI_H_DESC;
+def CEQI_W : CEQI_W_ENC, CEQI_W_DESC;
+def CEQI_D : CEQI_D_ENC, CEQI_D_DESC;
+
+def CFCMSA : CFCMSA_ENC, CFCMSA_DESC;
+
+def CLE_S_B : CLE_S_B_ENC, CLE_S_B_DESC;
+def CLE_S_H : CLE_S_H_ENC, CLE_S_H_DESC;
+def CLE_S_W : CLE_S_W_ENC, CLE_S_W_DESC;
+def CLE_S_D : CLE_S_D_ENC, CLE_S_D_DESC;
+
+def CLE_U_B : CLE_U_B_ENC, CLE_U_B_DESC;
+def CLE_U_H : CLE_U_H_ENC, CLE_U_H_DESC;
+def CLE_U_W : CLE_U_W_ENC, CLE_U_W_DESC;
+def CLE_U_D : CLE_U_D_ENC, CLE_U_D_DESC;
+
+def CLEI_S_B : CLEI_S_B_ENC, CLEI_S_B_DESC;
+def CLEI_S_H : CLEI_S_H_ENC, CLEI_S_H_DESC;
+def CLEI_S_W : CLEI_S_W_ENC, CLEI_S_W_DESC;
+def CLEI_S_D : CLEI_S_D_ENC, CLEI_S_D_DESC;
+
+def CLEI_U_B : CLEI_U_B_ENC, CLEI_U_B_DESC;
+def CLEI_U_H : CLEI_U_H_ENC, CLEI_U_H_DESC;
+def CLEI_U_W : CLEI_U_W_ENC, CLEI_U_W_DESC;
+def CLEI_U_D : CLEI_U_D_ENC, CLEI_U_D_DESC;
+
+def CLT_S_B : CLT_S_B_ENC, CLT_S_B_DESC;
+def CLT_S_H : CLT_S_H_ENC, CLT_S_H_DESC;
+def CLT_S_W : CLT_S_W_ENC, CLT_S_W_DESC;
+def CLT_S_D : CLT_S_D_ENC, CLT_S_D_DESC;
+
+def CLT_U_B : CLT_U_B_ENC, CLT_U_B_DESC;
+def CLT_U_H : CLT_U_H_ENC, CLT_U_H_DESC;
+def CLT_U_W : CLT_U_W_ENC, CLT_U_W_DESC;
+def CLT_U_D : CLT_U_D_ENC, CLT_U_D_DESC;
+
+def CLTI_S_B : CLTI_S_B_ENC, CLTI_S_B_DESC;
+def CLTI_S_H : CLTI_S_H_ENC, CLTI_S_H_DESC;
+def CLTI_S_W : CLTI_S_W_ENC, CLTI_S_W_DESC;
+def CLTI_S_D : CLTI_S_D_ENC, CLTI_S_D_DESC;
+
+def CLTI_U_B : CLTI_U_B_ENC, CLTI_U_B_DESC;
+def CLTI_U_H : CLTI_U_H_ENC, CLTI_U_H_DESC;
+def CLTI_U_W : CLTI_U_W_ENC, CLTI_U_W_DESC;
+def CLTI_U_D : CLTI_U_D_ENC, CLTI_U_D_DESC;
+
+def COPY_S_B : COPY_S_B_ENC, COPY_S_B_DESC;
+def COPY_S_H : COPY_S_H_ENC, COPY_S_H_DESC;
+def COPY_S_W : COPY_S_W_ENC, COPY_S_W_DESC;
+
+def COPY_U_B : COPY_U_B_ENC, COPY_U_B_DESC;
+def COPY_U_H : COPY_U_H_ENC, COPY_U_H_DESC;
+def COPY_U_W : COPY_U_W_ENC, COPY_U_W_DESC;
+
+def COPY_FW_PSEUDO : COPY_FW_PSEUDO_DESC;
+def COPY_FD_PSEUDO : COPY_FD_PSEUDO_DESC;
+
+def CTCMSA : CTCMSA_ENC, CTCMSA_DESC;
+
+def DIV_S_B : DIV_S_B_ENC, DIV_S_B_DESC;
+def DIV_S_H : DIV_S_H_ENC, DIV_S_H_DESC;
+def DIV_S_W : DIV_S_W_ENC, DIV_S_W_DESC;
+def DIV_S_D : DIV_S_D_ENC, DIV_S_D_DESC;
+
+def DIV_U_B : DIV_U_B_ENC, DIV_U_B_DESC;
+def DIV_U_H : DIV_U_H_ENC, DIV_U_H_DESC;
+def DIV_U_W : DIV_U_W_ENC, DIV_U_W_DESC;
+def DIV_U_D : DIV_U_D_ENC, DIV_U_D_DESC;
+
+def DOTP_S_H : DOTP_S_H_ENC, DOTP_S_H_DESC;
+def DOTP_S_W : DOTP_S_W_ENC, DOTP_S_W_DESC;
+def DOTP_S_D : DOTP_S_D_ENC, DOTP_S_D_DESC;
+
+def DOTP_U_H : DOTP_U_H_ENC, DOTP_U_H_DESC;
+def DOTP_U_W : DOTP_U_W_ENC, DOTP_U_W_DESC;
+def DOTP_U_D : DOTP_U_D_ENC, DOTP_U_D_DESC;
+
+def DPADD_S_H : DPADD_S_H_ENC, DPADD_S_H_DESC;
+def DPADD_S_W : DPADD_S_W_ENC, DPADD_S_W_DESC;
+def DPADD_S_D : DPADD_S_D_ENC, DPADD_S_D_DESC;
+
+def DPADD_U_H : DPADD_U_H_ENC, DPADD_U_H_DESC;
+def DPADD_U_W : DPADD_U_W_ENC, DPADD_U_W_DESC;
+def DPADD_U_D : DPADD_U_D_ENC, DPADD_U_D_DESC;
+
+def DPSUB_S_H : DPSUB_S_H_ENC, DPSUB_S_H_DESC;
+def DPSUB_S_W : DPSUB_S_W_ENC, DPSUB_S_W_DESC;
+def DPSUB_S_D : DPSUB_S_D_ENC, DPSUB_S_D_DESC;
+
+def DPSUB_U_H : DPSUB_U_H_ENC, DPSUB_U_H_DESC;
+def DPSUB_U_W : DPSUB_U_W_ENC, DPSUB_U_W_DESC;
+def DPSUB_U_D : DPSUB_U_D_ENC, DPSUB_U_D_DESC;
+
+def FADD_W : FADD_W_ENC, FADD_W_DESC;
+def FADD_D : FADD_D_ENC, FADD_D_DESC;
+
+def FCAF_W : FCAF_W_ENC, FCAF_W_DESC;
+def FCAF_D : FCAF_D_ENC, FCAF_D_DESC;
+
+def FCEQ_W : FCEQ_W_ENC, FCEQ_W_DESC;
+def FCEQ_D : FCEQ_D_ENC, FCEQ_D_DESC;
+
+def FCLE_W : FCLE_W_ENC, FCLE_W_DESC;
+def FCLE_D : FCLE_D_ENC, FCLE_D_DESC;
+
+def FCLT_W : FCLT_W_ENC, FCLT_W_DESC;
+def FCLT_D : FCLT_D_ENC, FCLT_D_DESC;
+
+def FCLASS_W : FCLASS_W_ENC, FCLASS_W_DESC;
+def FCLASS_D : FCLASS_D_ENC, FCLASS_D_DESC;
+
+def FCNE_W : FCNE_W_ENC, FCNE_W_DESC;
+def FCNE_D : FCNE_D_ENC, FCNE_D_DESC;
+
+def FCOR_W : FCOR_W_ENC, FCOR_W_DESC;
+def FCOR_D : FCOR_D_ENC, FCOR_D_DESC;
+
+def FCUEQ_W : FCUEQ_W_ENC, FCUEQ_W_DESC;
+def FCUEQ_D : FCUEQ_D_ENC, FCUEQ_D_DESC;
+
+def FCULE_W : FCULE_W_ENC, FCULE_W_DESC;
+def FCULE_D : FCULE_D_ENC, FCULE_D_DESC;
+
+def FCULT_W : FCULT_W_ENC, FCULT_W_DESC;
+def FCULT_D : FCULT_D_ENC, FCULT_D_DESC;
+
+def FCUN_W : FCUN_W_ENC, FCUN_W_DESC;
+def FCUN_D : FCUN_D_ENC, FCUN_D_DESC;
+
+def FCUNE_W : FCUNE_W_ENC, FCUNE_W_DESC;
+def FCUNE_D : FCUNE_D_ENC, FCUNE_D_DESC;
+
+def FDIV_W : FDIV_W_ENC, FDIV_W_DESC;
+def FDIV_D : FDIV_D_ENC, FDIV_D_DESC;
+
+def FEXDO_H : FEXDO_H_ENC, FEXDO_H_DESC;
+def FEXDO_W : FEXDO_W_ENC, FEXDO_W_DESC;
+
+def FEXP2_W : FEXP2_W_ENC, FEXP2_W_DESC;
+def FEXP2_D : FEXP2_D_ENC, FEXP2_D_DESC;
+def FEXP2_W_1_PSEUDO : FEXP2_W_1_PSEUDO_DESC;
+def FEXP2_D_1_PSEUDO : FEXP2_D_1_PSEUDO_DESC;
+
+def FEXUPL_W : FEXUPL_W_ENC, FEXUPL_W_DESC;
+def FEXUPL_D : FEXUPL_D_ENC, FEXUPL_D_DESC;
+
+def FEXUPR_W : FEXUPR_W_ENC, FEXUPR_W_DESC;
+def FEXUPR_D : FEXUPR_D_ENC, FEXUPR_D_DESC;
+
+def FFINT_S_W : FFINT_S_W_ENC, FFINT_S_W_DESC;
+def FFINT_S_D : FFINT_S_D_ENC, FFINT_S_D_DESC;
+
+def FFINT_U_W : FFINT_U_W_ENC, FFINT_U_W_DESC;
+def FFINT_U_D : FFINT_U_D_ENC, FFINT_U_D_DESC;
+
+def FFQL_W : FFQL_W_ENC, FFQL_W_DESC;
+def FFQL_D : FFQL_D_ENC, FFQL_D_DESC;
+
+def FFQR_W : FFQR_W_ENC, FFQR_W_DESC;
+def FFQR_D : FFQR_D_ENC, FFQR_D_DESC;
+
+def FILL_B : FILL_B_ENC, FILL_B_DESC;
+def FILL_H : FILL_H_ENC, FILL_H_DESC;
+def FILL_W : FILL_W_ENC, FILL_W_DESC;
+def FILL_FW_PSEUDO : FILL_FW_PSEUDO_DESC;
+def FILL_FD_PSEUDO : FILL_FD_PSEUDO_DESC;
+
+def FLOG2_W : FLOG2_W_ENC, FLOG2_W_DESC;
+def FLOG2_D : FLOG2_D_ENC, FLOG2_D_DESC;
+
+def FMADD_W : FMADD_W_ENC, FMADD_W_DESC;
+def FMADD_D : FMADD_D_ENC, FMADD_D_DESC;
+
+def FMAX_W : FMAX_W_ENC, FMAX_W_DESC;
+def FMAX_D : FMAX_D_ENC, FMAX_D_DESC;
+
+def FMAX_A_W : FMAX_A_W_ENC, FMAX_A_W_DESC;
+def FMAX_A_D : FMAX_A_D_ENC, FMAX_A_D_DESC;
+
+def FMIN_W : FMIN_W_ENC, FMIN_W_DESC;
+def FMIN_D : FMIN_D_ENC, FMIN_D_DESC;
+
+def FMIN_A_W : FMIN_A_W_ENC, FMIN_A_W_DESC;
+def FMIN_A_D : FMIN_A_D_ENC, FMIN_A_D_DESC;
+
+def FMSUB_W : FMSUB_W_ENC, FMSUB_W_DESC;
+def FMSUB_D : FMSUB_D_ENC, FMSUB_D_DESC;
+
+def FMUL_W : FMUL_W_ENC, FMUL_W_DESC;
+def FMUL_D : FMUL_D_ENC, FMUL_D_DESC;
+
+def FRINT_W : FRINT_W_ENC, FRINT_W_DESC;
+def FRINT_D : FRINT_D_ENC, FRINT_D_DESC;
+
+def FRCP_W : FRCP_W_ENC, FRCP_W_DESC;
+def FRCP_D : FRCP_D_ENC, FRCP_D_DESC;
+
+def FRSQRT_W : FRSQRT_W_ENC, FRSQRT_W_DESC;
+def FRSQRT_D : FRSQRT_D_ENC, FRSQRT_D_DESC;
+
+def FSAF_W : FSAF_W_ENC, FSAF_W_DESC;
+def FSAF_D : FSAF_D_ENC, FSAF_D_DESC;
+
+def FSEQ_W : FSEQ_W_ENC, FSEQ_W_DESC;
+def FSEQ_D : FSEQ_D_ENC, FSEQ_D_DESC;
+
+def FSLE_W : FSLE_W_ENC, FSLE_W_DESC;
+def FSLE_D : FSLE_D_ENC, FSLE_D_DESC;
+
+def FSLT_W : FSLT_W_ENC, FSLT_W_DESC;
+def FSLT_D : FSLT_D_ENC, FSLT_D_DESC;
+
+def FSNE_W : FSNE_W_ENC, FSNE_W_DESC;
+def FSNE_D : FSNE_D_ENC, FSNE_D_DESC;
+
+def FSOR_W : FSOR_W_ENC, FSOR_W_DESC;
+def FSOR_D : FSOR_D_ENC, FSOR_D_DESC;
+
+def FSQRT_W : FSQRT_W_ENC, FSQRT_W_DESC;
+def FSQRT_D : FSQRT_D_ENC, FSQRT_D_DESC;
+
+def FSUB_W : FSUB_W_ENC, FSUB_W_DESC;
+def FSUB_D : FSUB_D_ENC, FSUB_D_DESC;
+
+def FSUEQ_W : FSUEQ_W_ENC, FSUEQ_W_DESC;
+def FSUEQ_D : FSUEQ_D_ENC, FSUEQ_D_DESC;
+
+def FSULE_W : FSULE_W_ENC, FSULE_W_DESC;
+def FSULE_D : FSULE_D_ENC, FSULE_D_DESC;
+
+def FSULT_W : FSULT_W_ENC, FSULT_W_DESC;
+def FSULT_D : FSULT_D_ENC, FSULT_D_DESC;
+
+def FSUN_W : FSUN_W_ENC, FSUN_W_DESC;
+def FSUN_D : FSUN_D_ENC, FSUN_D_DESC;
+
+def FSUNE_W : FSUNE_W_ENC, FSUNE_W_DESC;
+def FSUNE_D : FSUNE_D_ENC, FSUNE_D_DESC;
+
+def FTINT_S_W : FTINT_S_W_ENC, FTINT_S_W_DESC;
+def FTINT_S_D : FTINT_S_D_ENC, FTINT_S_D_DESC;
+
+def FTINT_U_W : FTINT_U_W_ENC, FTINT_U_W_DESC;
+def FTINT_U_D : FTINT_U_D_ENC, FTINT_U_D_DESC;
+
+def FTQ_H : FTQ_H_ENC, FTQ_H_DESC;
+def FTQ_W : FTQ_W_ENC, FTQ_W_DESC;
+
+def FTRUNC_S_W : FTRUNC_S_W_ENC, FTRUNC_S_W_DESC;
+def FTRUNC_S_D : FTRUNC_S_D_ENC, FTRUNC_S_D_DESC;
+
+def FTRUNC_U_W : FTRUNC_U_W_ENC, FTRUNC_U_W_DESC;
+def FTRUNC_U_D : FTRUNC_U_D_ENC, FTRUNC_U_D_DESC;
+
+def HADD_S_H : HADD_S_H_ENC, HADD_S_H_DESC;
+def HADD_S_W : HADD_S_W_ENC, HADD_S_W_DESC;
+def HADD_S_D : HADD_S_D_ENC, HADD_S_D_DESC;
+
+def HADD_U_H : HADD_U_H_ENC, HADD_U_H_DESC;
+def HADD_U_W : HADD_U_W_ENC, HADD_U_W_DESC;
+def HADD_U_D : HADD_U_D_ENC, HADD_U_D_DESC;
+
+def HSUB_S_H : HSUB_S_H_ENC, HSUB_S_H_DESC;
+def HSUB_S_W : HSUB_S_W_ENC, HSUB_S_W_DESC;
+def HSUB_S_D : HSUB_S_D_ENC, HSUB_S_D_DESC;
+
+def HSUB_U_H : HSUB_U_H_ENC, HSUB_U_H_DESC;
+def HSUB_U_W : HSUB_U_W_ENC, HSUB_U_W_DESC;
+def HSUB_U_D : HSUB_U_D_ENC, HSUB_U_D_DESC;
+
+def ILVEV_B : ILVEV_B_ENC, ILVEV_B_DESC;
+def ILVEV_H : ILVEV_H_ENC, ILVEV_H_DESC;
+def ILVEV_W : ILVEV_W_ENC, ILVEV_W_DESC;
+def ILVEV_D : ILVEV_D_ENC, ILVEV_D_DESC;
+
+def ILVL_B : ILVL_B_ENC, ILVL_B_DESC;
+def ILVL_H : ILVL_H_ENC, ILVL_H_DESC;
+def ILVL_W : ILVL_W_ENC, ILVL_W_DESC;
+def ILVL_D : ILVL_D_ENC, ILVL_D_DESC;
+
+def ILVOD_B : ILVOD_B_ENC, ILVOD_B_DESC;
+def ILVOD_H : ILVOD_H_ENC, ILVOD_H_DESC;
+def ILVOD_W : ILVOD_W_ENC, ILVOD_W_DESC;
+def ILVOD_D : ILVOD_D_ENC, ILVOD_D_DESC;
+
+def ILVR_B : ILVR_B_ENC, ILVR_B_DESC;
+def ILVR_H : ILVR_H_ENC, ILVR_H_DESC;
+def ILVR_W : ILVR_W_ENC, ILVR_W_DESC;
+def ILVR_D : ILVR_D_ENC, ILVR_D_DESC;
+
+def INSERT_B : INSERT_B_ENC, INSERT_B_DESC;
+def INSERT_H : INSERT_H_ENC, INSERT_H_DESC;
+def INSERT_W : INSERT_W_ENC, INSERT_W_DESC;
+
+// INSERT_FW_PSEUDO defined after INSVE_W
+// INSERT_FD_PSEUDO defined after INSVE_D
+
+def INSVE_B : INSVE_B_ENC, INSVE_B_DESC;
+def INSVE_H : INSVE_H_ENC, INSVE_H_DESC;
+def INSVE_W : INSVE_W_ENC, INSVE_W_DESC;
+def INSVE_D : INSVE_D_ENC, INSVE_D_DESC;
+
+def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
+def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
+
+def LD_B: LD_B_ENC, LD_B_DESC;
+def LD_H: LD_H_ENC, LD_H_DESC;
+def LD_W: LD_W_ENC, LD_W_DESC;
+def LD_D: LD_D_ENC, LD_D_DESC;
+
+def LDI_B : LDI_B_ENC, LDI_B_DESC;
+def LDI_H : LDI_H_ENC, LDI_H_DESC;
+def LDI_W : LDI_W_ENC, LDI_W_DESC;
+def LDI_D : LDI_D_ENC, LDI_D_DESC;
+
+def LSA : LSA_ENC, LSA_DESC;
+
+def MADD_Q_H : MADD_Q_H_ENC, MADD_Q_H_DESC;
+def MADD_Q_W : MADD_Q_W_ENC, MADD_Q_W_DESC;
+
+def MADDR_Q_H : MADDR_Q_H_ENC, MADDR_Q_H_DESC;
+def MADDR_Q_W : MADDR_Q_W_ENC, MADDR_Q_W_DESC;
+
+def MADDV_B : MADDV_B_ENC, MADDV_B_DESC;
+def MADDV_H : MADDV_H_ENC, MADDV_H_DESC;
+def MADDV_W : MADDV_W_ENC, MADDV_W_DESC;
+def MADDV_D : MADDV_D_ENC, MADDV_D_DESC;
+
+def MAX_A_B : MAX_A_B_ENC, MAX_A_B_DESC;
+def MAX_A_H : MAX_A_H_ENC, MAX_A_H_DESC;
+def MAX_A_W : MAX_A_W_ENC, MAX_A_W_DESC;
+def MAX_A_D : MAX_A_D_ENC, MAX_A_D_DESC;
+
+def MAX_S_B : MAX_S_B_ENC, MAX_S_B_DESC;
+def MAX_S_H : MAX_S_H_ENC, MAX_S_H_DESC;
+def MAX_S_W : MAX_S_W_ENC, MAX_S_W_DESC;
+def MAX_S_D : MAX_S_D_ENC, MAX_S_D_DESC;
+
+def MAX_U_B : MAX_U_B_ENC, MAX_U_B_DESC;
+def MAX_U_H : MAX_U_H_ENC, MAX_U_H_DESC;
+def MAX_U_W : MAX_U_W_ENC, MAX_U_W_DESC;
+def MAX_U_D : MAX_U_D_ENC, MAX_U_D_DESC;
+
+def MAXI_S_B : MAXI_S_B_ENC, MAXI_S_B_DESC;
+def MAXI_S_H : MAXI_S_H_ENC, MAXI_S_H_DESC;
+def MAXI_S_W : MAXI_S_W_ENC, MAXI_S_W_DESC;
+def MAXI_S_D : MAXI_S_D_ENC, MAXI_S_D_DESC;
+
+def MAXI_U_B : MAXI_U_B_ENC, MAXI_U_B_DESC;
+def MAXI_U_H : MAXI_U_H_ENC, MAXI_U_H_DESC;
+def MAXI_U_W : MAXI_U_W_ENC, MAXI_U_W_DESC;
+def MAXI_U_D : MAXI_U_D_ENC, MAXI_U_D_DESC;
+
+def MIN_A_B : MIN_A_B_ENC, MIN_A_B_DESC;
+def MIN_A_H : MIN_A_H_ENC, MIN_A_H_DESC;
+def MIN_A_W : MIN_A_W_ENC, MIN_A_W_DESC;
+def MIN_A_D : MIN_A_D_ENC, MIN_A_D_DESC;
+
+def MIN_S_B : MIN_S_B_ENC, MIN_S_B_DESC;
+def MIN_S_H : MIN_S_H_ENC, MIN_S_H_DESC;
+def MIN_S_W : MIN_S_W_ENC, MIN_S_W_DESC;
+def MIN_S_D : MIN_S_D_ENC, MIN_S_D_DESC;
+
+def MIN_U_B : MIN_U_B_ENC, MIN_U_B_DESC;
+def MIN_U_H : MIN_U_H_ENC, MIN_U_H_DESC;
+def MIN_U_W : MIN_U_W_ENC, MIN_U_W_DESC;
+def MIN_U_D : MIN_U_D_ENC, MIN_U_D_DESC;
+
+def MINI_S_B : MINI_S_B_ENC, MINI_S_B_DESC;
+def MINI_S_H : MINI_S_H_ENC, MINI_S_H_DESC;
+def MINI_S_W : MINI_S_W_ENC, MINI_S_W_DESC;
+def MINI_S_D : MINI_S_D_ENC, MINI_S_D_DESC;
+
+def MINI_U_B : MINI_U_B_ENC, MINI_U_B_DESC;
+def MINI_U_H : MINI_U_H_ENC, MINI_U_H_DESC;
+def MINI_U_W : MINI_U_W_ENC, MINI_U_W_DESC;
+def MINI_U_D : MINI_U_D_ENC, MINI_U_D_DESC;
+
+def MOD_S_B : MOD_S_B_ENC, MOD_S_B_DESC;
+def MOD_S_H : MOD_S_H_ENC, MOD_S_H_DESC;
+def MOD_S_W : MOD_S_W_ENC, MOD_S_W_DESC;
+def MOD_S_D : MOD_S_D_ENC, MOD_S_D_DESC;
+
+def MOD_U_B : MOD_U_B_ENC, MOD_U_B_DESC;
+def MOD_U_H : MOD_U_H_ENC, MOD_U_H_DESC;
+def MOD_U_W : MOD_U_W_ENC, MOD_U_W_DESC;
+def MOD_U_D : MOD_U_D_ENC, MOD_U_D_DESC;
+
+def MOVE_V : MOVE_V_ENC, MOVE_V_DESC;
+
+def MSUB_Q_H : MSUB_Q_H_ENC, MSUB_Q_H_DESC;
+def MSUB_Q_W : MSUB_Q_W_ENC, MSUB_Q_W_DESC;
+
+def MSUBR_Q_H : MSUBR_Q_H_ENC, MSUBR_Q_H_DESC;
+def MSUBR_Q_W : MSUBR_Q_W_ENC, MSUBR_Q_W_DESC;
+
+def MSUBV_B : MSUBV_B_ENC, MSUBV_B_DESC;
+def MSUBV_H : MSUBV_H_ENC, MSUBV_H_DESC;
+def MSUBV_W : MSUBV_W_ENC, MSUBV_W_DESC;
+def MSUBV_D : MSUBV_D_ENC, MSUBV_D_DESC;
+
+def MUL_Q_H : MUL_Q_H_ENC, MUL_Q_H_DESC;
+def MUL_Q_W : MUL_Q_W_ENC, MUL_Q_W_DESC;
+
+def MULR_Q_H : MULR_Q_H_ENC, MULR_Q_H_DESC;
+def MULR_Q_W : MULR_Q_W_ENC, MULR_Q_W_DESC;
+
+def MULV_B : MULV_B_ENC, MULV_B_DESC;
+def MULV_H : MULV_H_ENC, MULV_H_DESC;
+def MULV_W : MULV_W_ENC, MULV_W_DESC;
+def MULV_D : MULV_D_ENC, MULV_D_DESC;
+
+def NLOC_B : NLOC_B_ENC, NLOC_B_DESC;
+def NLOC_H : NLOC_H_ENC, NLOC_H_DESC;
+def NLOC_W : NLOC_W_ENC, NLOC_W_DESC;
+def NLOC_D : NLOC_D_ENC, NLOC_D_DESC;
+
+def NLZC_B : NLZC_B_ENC, NLZC_B_DESC;
+def NLZC_H : NLZC_H_ENC, NLZC_H_DESC;
+def NLZC_W : NLZC_W_ENC, NLZC_W_DESC;
+def NLZC_D : NLZC_D_ENC, NLZC_D_DESC;
+
+def NOR_V : NOR_V_ENC, NOR_V_DESC;
+def NOR_V_H_PSEUDO : NOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_W_PSEUDO : NOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def NOR_V_D_PSEUDO : NOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(NOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def NORI_B : NORI_B_ENC, NORI_B_DESC;
+
+def OR_V : OR_V_ENC, OR_V_DESC;
+def OR_V_H_PSEUDO : OR_V_H_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_W_PSEUDO : OR_V_W_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+def OR_V_D_PSEUDO : OR_V_D_PSEUDO_DESC,
+                    PseudoInstExpansion<(OR_V MSA128BOpnd:$wd,
+                                              MSA128BOpnd:$ws,
+                                              MSA128BOpnd:$wt)>;
+
+def ORI_B : ORI_B_ENC, ORI_B_DESC;
+
+def PCKEV_B : PCKEV_B_ENC, PCKEV_B_DESC;
+def PCKEV_H : PCKEV_H_ENC, PCKEV_H_DESC;
+def PCKEV_W : PCKEV_W_ENC, PCKEV_W_DESC;
+def PCKEV_D : PCKEV_D_ENC, PCKEV_D_DESC;
+
+def PCKOD_B : PCKOD_B_ENC, PCKOD_B_DESC;
+def PCKOD_H : PCKOD_H_ENC, PCKOD_H_DESC;
+def PCKOD_W : PCKOD_W_ENC, PCKOD_W_DESC;
+def PCKOD_D : PCKOD_D_ENC, PCKOD_D_DESC;
+
+def PCNT_B : PCNT_B_ENC, PCNT_B_DESC;
+def PCNT_H : PCNT_H_ENC, PCNT_H_DESC;
+def PCNT_W : PCNT_W_ENC, PCNT_W_DESC;
+def PCNT_D : PCNT_D_ENC, PCNT_D_DESC;
+
+def SAT_S_B : SAT_S_B_ENC, SAT_S_B_DESC;
+def SAT_S_H : SAT_S_H_ENC, SAT_S_H_DESC;
+def SAT_S_W : SAT_S_W_ENC, SAT_S_W_DESC;
+def SAT_S_D : SAT_S_D_ENC, SAT_S_D_DESC;
+
+def SAT_U_B : SAT_U_B_ENC, SAT_U_B_DESC;
+def SAT_U_H : SAT_U_H_ENC, SAT_U_H_DESC;
+def SAT_U_W : SAT_U_W_ENC, SAT_U_W_DESC;
+def SAT_U_D : SAT_U_D_ENC, SAT_U_D_DESC;
+
+def SHF_B : SHF_B_ENC, SHF_B_DESC;
+def SHF_H : SHF_H_ENC, SHF_H_DESC;
+def SHF_W : SHF_W_ENC, SHF_W_DESC;
+
+def SLD_B : SLD_B_ENC, SLD_B_DESC;
+def SLD_H : SLD_H_ENC, SLD_H_DESC;
+def SLD_W : SLD_W_ENC, SLD_W_DESC;
+def SLD_D : SLD_D_ENC, SLD_D_DESC;
+
+def SLDI_B : SLDI_B_ENC, SLDI_B_DESC;
+def SLDI_H : SLDI_H_ENC, SLDI_H_DESC;
+def SLDI_W : SLDI_W_ENC, SLDI_W_DESC;
+def SLDI_D : SLDI_D_ENC, SLDI_D_DESC;
+
+def SLL_B : SLL_B_ENC, SLL_B_DESC;
+def SLL_H : SLL_H_ENC, SLL_H_DESC;
+def SLL_W : SLL_W_ENC, SLL_W_DESC;
+def SLL_D : SLL_D_ENC, SLL_D_DESC;
+
+def SLLI_B : SLLI_B_ENC, SLLI_B_DESC;
+def SLLI_H : SLLI_H_ENC, SLLI_H_DESC;
+def SLLI_W : SLLI_W_ENC, SLLI_W_DESC;
+def SLLI_D : SLLI_D_ENC, SLLI_D_DESC;
+
+def SPLAT_B : SPLAT_B_ENC, SPLAT_B_DESC;
+def SPLAT_H : SPLAT_H_ENC, SPLAT_H_DESC;
+def SPLAT_W : SPLAT_W_ENC, SPLAT_W_DESC;
+def SPLAT_D : SPLAT_D_ENC, SPLAT_D_DESC;
+
+def SPLATI_B : SPLATI_B_ENC, SPLATI_B_DESC;
+def SPLATI_H : SPLATI_H_ENC, SPLATI_H_DESC;
+def SPLATI_W : SPLATI_W_ENC, SPLATI_W_DESC;
+def SPLATI_D : SPLATI_D_ENC, SPLATI_D_DESC;
+
+def SRA_B : SRA_B_ENC, SRA_B_DESC;
+def SRA_H : SRA_H_ENC, SRA_H_DESC;
+def SRA_W : SRA_W_ENC, SRA_W_DESC;
+def SRA_D : SRA_D_ENC, SRA_D_DESC;
+
+def SRAI_B : SRAI_B_ENC, SRAI_B_DESC;
+def SRAI_H : SRAI_H_ENC, SRAI_H_DESC;
+def SRAI_W : SRAI_W_ENC, SRAI_W_DESC;
+def SRAI_D : SRAI_D_ENC, SRAI_D_DESC;
+
+def SRAR_B : SRAR_B_ENC, SRAR_B_DESC;
+def SRAR_H : SRAR_H_ENC, SRAR_H_DESC;
+def SRAR_W : SRAR_W_ENC, SRAR_W_DESC;
+def SRAR_D : SRAR_D_ENC, SRAR_D_DESC;
+
+def SRARI_B : SRARI_B_ENC, SRARI_B_DESC;
+def SRARI_H : SRARI_H_ENC, SRARI_H_DESC;
+def SRARI_W : SRARI_W_ENC, SRARI_W_DESC;
+def SRARI_D : SRARI_D_ENC, SRARI_D_DESC;
+
+def SRL_B : SRL_B_ENC, SRL_B_DESC;
+def SRL_H : SRL_H_ENC, SRL_H_DESC;
+def SRL_W : SRL_W_ENC, SRL_W_DESC;
+def SRL_D : SRL_D_ENC, SRL_D_DESC;
+
+def SRLI_B : SRLI_B_ENC, SRLI_B_DESC;
+def SRLI_H : SRLI_H_ENC, SRLI_H_DESC;
+def SRLI_W : SRLI_W_ENC, SRLI_W_DESC;
+def SRLI_D : SRLI_D_ENC, SRLI_D_DESC;
+
+def SRLR_B : SRLR_B_ENC, SRLR_B_DESC;
+def SRLR_H : SRLR_H_ENC, SRLR_H_DESC;
+def SRLR_W : SRLR_W_ENC, SRLR_W_DESC;
+def SRLR_D : SRLR_D_ENC, SRLR_D_DESC;
+
+def SRLRI_B : SRLRI_B_ENC, SRLRI_B_DESC;
+def SRLRI_H : SRLRI_H_ENC, SRLRI_H_DESC;
+def SRLRI_W : SRLRI_W_ENC, SRLRI_W_DESC;
+def SRLRI_D : SRLRI_D_ENC, SRLRI_D_DESC;
+
+def ST_B: ST_B_ENC, ST_B_DESC;
+def ST_H: ST_H_ENC, ST_H_DESC;
+def ST_W: ST_W_ENC, ST_W_DESC;
+def ST_D: ST_D_ENC, ST_D_DESC;
+
+def SUBS_S_B : SUBS_S_B_ENC, SUBS_S_B_DESC;
+def SUBS_S_H : SUBS_S_H_ENC, SUBS_S_H_DESC;
+def SUBS_S_W : SUBS_S_W_ENC, SUBS_S_W_DESC;
+def SUBS_S_D : SUBS_S_D_ENC, SUBS_S_D_DESC;
+
+def SUBS_U_B : SUBS_U_B_ENC, SUBS_U_B_DESC;
+def SUBS_U_H : SUBS_U_H_ENC, SUBS_U_H_DESC;
+def SUBS_U_W : SUBS_U_W_ENC, SUBS_U_W_DESC;
+def SUBS_U_D : SUBS_U_D_ENC, SUBS_U_D_DESC;
+
+def SUBSUS_U_B : SUBSUS_U_B_ENC, SUBSUS_U_B_DESC;
+def SUBSUS_U_H : SUBSUS_U_H_ENC, SUBSUS_U_H_DESC;
+def SUBSUS_U_W : SUBSUS_U_W_ENC, SUBSUS_U_W_DESC;
+def SUBSUS_U_D : SUBSUS_U_D_ENC, SUBSUS_U_D_DESC;
+
+def SUBSUU_S_B : SUBSUU_S_B_ENC, SUBSUU_S_B_DESC;
+def SUBSUU_S_H : SUBSUU_S_H_ENC, SUBSUU_S_H_DESC;
+def SUBSUU_S_W : SUBSUU_S_W_ENC, SUBSUU_S_W_DESC;
+def SUBSUU_S_D : SUBSUU_S_D_ENC, SUBSUU_S_D_DESC;
+
+def SUBV_B : SUBV_B_ENC, SUBV_B_DESC;
+def SUBV_H : SUBV_H_ENC, SUBV_H_DESC;
+def SUBV_W : SUBV_W_ENC, SUBV_W_DESC;
+def SUBV_D : SUBV_D_ENC, SUBV_D_DESC;
+
+def SUBVI_B : SUBVI_B_ENC, SUBVI_B_DESC;
+def SUBVI_H : SUBVI_H_ENC, SUBVI_H_DESC;
+def SUBVI_W : SUBVI_W_ENC, SUBVI_W_DESC;
+def SUBVI_D : SUBVI_D_ENC, SUBVI_D_DESC;
+
+def VSHF_B : VSHF_B_ENC, VSHF_B_DESC;
+def VSHF_H : VSHF_H_ENC, VSHF_H_DESC;
+def VSHF_W : VSHF_W_ENC, VSHF_W_DESC;
+def VSHF_D : VSHF_D_ENC, VSHF_D_DESC;
+
+def XOR_V : XOR_V_ENC, XOR_V_DESC;
+def XOR_V_H_PSEUDO : XOR_V_H_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_W_PSEUDO : XOR_V_W_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+def XOR_V_D_PSEUDO : XOR_V_D_PSEUDO_DESC,
+                     PseudoInstExpansion<(XOR_V MSA128BOpnd:$wd,
+                                                MSA128BOpnd:$ws,
+                                                MSA128BOpnd:$wt)>;
+
+def XORI_B : XORI_B_ENC, XORI_B_DESC;
+
+// Patterns.
+class MSAPat<dag pattern, dag result, list<Predicate> pred = [HasMSA]> :
+  Pat<pattern, result>, Requires<pred>;
+
+def : MSAPat<(extractelt (v4i32 MSA128W:$ws), immZExt4:$idx),
+             (COPY_S_W MSA128W:$ws, immZExt4:$idx)>;
+
+def : MSAPat<(v16i8 (load addr:$addr)), (LD_B addr:$addr)>;
+def : MSAPat<(v8i16 (load addr:$addr)), (LD_H addr:$addr)>;
+def : MSAPat<(v4i32 (load addr:$addr)), (LD_W addr:$addr)>;
+def : MSAPat<(v2i64 (load addr:$addr)), (LD_D addr:$addr)>;
+def : MSAPat<(v8f16 (load addr:$addr)), (LD_H addr:$addr)>;
+def : MSAPat<(v4f32 (load addr:$addr)), (LD_W addr:$addr)>;
+def : MSAPat<(v2f64 (load addr:$addr)), (LD_D addr:$addr)>;
+
+def : MSAPat<(v8f16 (load addrRegImm:$addr)), (LD_H addrRegImm:$addr)>;
+def : MSAPat<(v4f32 (load addrRegImm:$addr)), (LD_W addrRegImm:$addr)>;
+def : MSAPat<(v2f64 (load addrRegImm:$addr)), (LD_D addrRegImm:$addr)>;
+
+def : MSAPat<(store (v16i8 MSA128B:$ws), addr:$addr),
+             (ST_B MSA128B:$ws, addr:$addr)>;
+def : MSAPat<(store (v8i16 MSA128H:$ws), addr:$addr),
+             (ST_H MSA128H:$ws, addr:$addr)>;
+def : MSAPat<(store (v4i32 MSA128W:$ws), addr:$addr),
+             (ST_W MSA128W:$ws, addr:$addr)>;
+def : MSAPat<(store (v2i64 MSA128D:$ws), addr:$addr),
+             (ST_D MSA128D:$ws, addr:$addr)>;
+def : MSAPat<(store (v8f16 MSA128H:$ws), addr:$addr),
+             (ST_H MSA128H:$ws, addr:$addr)>;
+def : MSAPat<(store (v4f32 MSA128W:$ws), addr:$addr),
+             (ST_W MSA128W:$ws, addr:$addr)>;
+def : MSAPat<(store (v2f64 MSA128D:$ws), addr:$addr),
+             (ST_D MSA128D:$ws, addr:$addr)>;
+
+def ST_FH : MSAPat<(store (v8f16 MSA128H:$ws), addrRegImm:$addr),
+                   (ST_H MSA128H:$ws, addrRegImm:$addr)>;
+def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addrRegImm:$addr),
+                   (ST_W MSA128W:$ws, addrRegImm:$addr)>;
+def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addrRegImm:$addr),
+                   (ST_D MSA128D:$ws, addrRegImm:$addr)>;
+
+class MSA_FABS_PSEUDO_DESC_BASE<RegisterOperand ROWD,
+                                RegisterOperand ROWS = ROWD,
+                                InstrItinClass itin = NoItinerary> :
+  MipsPseudo<(outs ROWD:$wd),
+             (ins ROWS:$ws),
+             [(set ROWD:$wd, (fabs ROWS:$ws))]> {
+  InstrItinClass Itinerary = itin;
+}
+def FABS_W : MSA_FABS_PSEUDO_DESC_BASE<MSA128WOpnd>,
+             PseudoInstExpansion<(FMAX_A_W MSA128WOpnd:$wd, MSA128WOpnd:$ws,
+                                           MSA128WOpnd:$ws)>;
+def FABS_D : MSA_FABS_PSEUDO_DESC_BASE<MSA128DOpnd>,
+             PseudoInstExpansion<(FMAX_A_D MSA128DOpnd:$wd, MSA128DOpnd:$ws,
+                                           MSA128DOpnd:$ws)>;
+
+class MSABitconvertPat<ValueType DstVT, ValueType SrcVT,
+                       RegisterClass DstRC, list<Predicate> preds = [HasMSA]> :
+   MSAPat<(DstVT (bitconvert SrcVT:$src)),
+          (COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>;
+
+// These are endian-independant because the element size doesnt change
+def : MSABitconvertPat<v8i16, v8f16, MSA128H>;
+def : MSABitconvertPat<v4i32, v4f32, MSA128W>;
+def : MSABitconvertPat<v2i64, v2f64, MSA128D>;
+def : MSABitconvertPat<v8f16, v8i16, MSA128H>;
+def : MSABitconvertPat<v4f32, v4i32, MSA128W>;
+def : MSABitconvertPat<v2f64, v2i64, MSA128D>;
+
+// Little endian bitcasts are always no-ops
+def : MSABitconvertPat<v16i8, v8i16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4i32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2i64, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v8f16, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v4f32, MSA128B, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v16i8, v2f64, MSA128B, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v8i16, v16i8, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4i32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2i64, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v4f32, MSA128H, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v8i16, v2f64, MSA128H, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4i32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4i32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2i64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2i64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v4f32, v16i8, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8i16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2i64, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v8f16, MSA128W, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v4f32, v2f64, MSA128W, [HasMSA, IsLE]>;
+
+def : MSABitconvertPat<v2f64, v16i8, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8i16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4i32, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v8f16, MSA128D, [HasMSA, IsLE]>;
+def : MSABitconvertPat<v2f64, v4f32, MSA128D, [HasMSA, IsLE]>;
+
+// Big endian bitcasts expand to shuffle instructions.
+// This is because bitcast is defined to be a store/load sequence and the
+// vector store/load instructions are mixed-endian with respect to the vector
+// as a whole (little endian with respect to element order, but big endian
+// elements).
+
+class MSABitconvertReverseQuartersPat<ValueType DstVT, ValueType SrcVT,
+                                      RegisterClass DstRC, MSAInst Insn,
+                                      RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 27),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHalvesPat<ValueType DstVT, ValueType SrcVT,
+                                    RegisterClass DstRC, MSAInst Insn,
+                                    RegisterClass ViaRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 177),
+                           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseBInHPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_B, MSA128B>;
+
+class MSABitconvertReverseBInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSAPat<(DstVT (bitconvert SrcVT:$src)),
+         (COPY_TO_REGCLASS
+           (SHF_W
+             (COPY_TO_REGCLASS
+               (SHF_B (COPY_TO_REGCLASS SrcVT:$src, MSA128B), 27),
+               MSA128W), 177),
+           DstRC),
+         [HasMSA, IsBE]>;
+
+class MSABitconvertReverseHInWPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseHInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseQuartersPat<DstVT, SrcVT, DstRC, SHF_H, MSA128H>;
+
+class MSABitconvertReverseWInDPat<ValueType DstVT, ValueType SrcVT,
+                                  RegisterClass DstRC> :
+  MSABitconvertReverseHalvesPat<DstVT, SrcVT, DstRC, SHF_W, MSA128W>;
+
+def : MSABitconvertReverseBInHPat<v8i16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInHPat<v8f16, v16i8, MSA128H>;
+def : MSABitconvertReverseBInWPat<v4i32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInWPat<v4f32, v16i8, MSA128W>;
+def : MSABitconvertReverseBInDPat<v2i64, v16i8, MSA128D>;
+def : MSABitconvertReverseBInDPat<v2f64, v16i8, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8i16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8i16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8i16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8i16, MSA128D>;
+
+def : MSABitconvertReverseBInHPat<v16i8, v8f16, MSA128B>;
+def : MSABitconvertReverseHInWPat<v4i32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInWPat<v4f32, v8f16, MSA128W>;
+def : MSABitconvertReverseHInDPat<v2i64, v8f16, MSA128D>;
+def : MSABitconvertReverseHInDPat<v2f64, v8f16, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4i32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4i32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4i32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4i32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4i32, MSA128D>;
+
+def : MSABitconvertReverseBInWPat<v16i8, v4f32, MSA128B>;
+def : MSABitconvertReverseHInWPat<v8i16, v4f32, MSA128H>;
+def : MSABitconvertReverseHInWPat<v8f16, v4f32, MSA128H>;
+def : MSABitconvertReverseWInDPat<v2i64, v4f32, MSA128D>;
+def : MSABitconvertReverseWInDPat<v2f64, v4f32, MSA128D>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2i64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2i64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2i64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2i64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2i64, MSA128W>;
+
+def : MSABitconvertReverseBInDPat<v16i8, v2f64, MSA128B>;
+def : MSABitconvertReverseHInDPat<v8i16, v2f64, MSA128H>;
+def : MSABitconvertReverseHInDPat<v8f16, v2f64, MSA128H>;
+def : MSABitconvertReverseWInDPat<v4i32, v2f64, MSA128W>;
+def : MSABitconvertReverseWInDPat<v4f32, v2f64, MSA128W>;
+
+// Pseudos used to implement BNZ.df, and BZ.df
+
+class MSA_CBRANCH_PSEUDO_DESC_BASE<SDPatternOperator OpNode, ValueType TyNode,
+                                   RegisterClass RCWS,
+                                   InstrItinClass itin = NoItinerary> :
+  MipsPseudo<(outs GPR32:$dst),
+             (ins RCWS:$ws),
+             [(set GPR32:$dst, (OpNode (TyNode RCWS:$ws)))]> {
+  bit usesCustomInserter = 1;
+}
+
+def SNZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+def SNZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v8i16,
+                                                MSA128H, NoItinerary>;
+def SNZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v4i32,
+                                                MSA128W, NoItinerary>;
+def SNZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllNonZero, v2i64,
+                                                MSA128D, NoItinerary>;
+def SNZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyNonZero, v16i8,
+                                                MSA128B, NoItinerary>;
+
+def SZ_B_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v16i8,
+                                               MSA128B, NoItinerary>;
+def SZ_H_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v8i16,
+                                               MSA128H, NoItinerary>;
+def SZ_W_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v4i32,
+                                               MSA128W, NoItinerary>;
+def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
+                                               MSA128D, NoItinerary>;
+def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
+                                               MSA128B, NoItinerary>;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index 59b23f7..dedf802 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -22,6 +23,53 @@ static cl::opt<bool>
 FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
                  cl::desc("Always use $gp as the global base register."));
 
+// class MipsCallEntry.
+MipsCallEntry::MipsCallEntry(const StringRef &N) {
+#ifndef NDEBUG
+  Name = N;
+  Val = 0;
+#endif
+}
+
+MipsCallEntry::MipsCallEntry(const GlobalValue *V) {
+#ifndef NDEBUG
+  Val = V;
+#endif
+}
+
+bool MipsCallEntry::isConstant(const MachineFrameInfo *) const {
+  return false;
+}
+
+bool MipsCallEntry::isAliased(const MachineFrameInfo *) const {
+  return false;
+}
+
+bool MipsCallEntry::mayAlias(const MachineFrameInfo *) const {
+  return false;
+}
+
+void MipsCallEntry::printCustom(raw_ostream &O) const {
+  O << "MipsCallEntry: ";
+#ifndef NDEBUG
+  if (Val)
+    O << Val->getName();
+  else
+    O << Name;
+#endif
+}
+
+MipsFunctionInfo::~MipsFunctionInfo() {
+  for (StringMap<const MipsCallEntry *>::iterator
+       I = ExternalCallEntries.begin(), E = ExternalCallEntries.end(); I != E;
+       ++I)
+    delete I->getValue();
+
+  for (ValueMap<const GlobalValue *, const MipsCallEntry *>::iterator
+       I = GlobalCallEntries.begin(), E = GlobalCallEntries.end(); I != E; ++I)
+    delete I->second;
+}
+
 bool MipsFunctionInfo::globalBaseRegSet() const {
   return GlobalBaseReg;
 }
@@ -38,8 +86,8 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {
     RC=(const TargetRegisterClass*)&Mips::CPU16RegsRegClass;
   else
     RC = ST.isABI_N64() ?
-      (const TargetRegisterClass*)&Mips::CPU64RegsRegClass :
-      (const TargetRegisterClass*)&Mips::CPURegsRegClass;
+      (const TargetRegisterClass*)&Mips::GPR64RegClass :
+      (const TargetRegisterClass*)&Mips::GPR32RegClass;
   return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
 }
 
@@ -60,7 +108,7 @@ void MipsFunctionInfo::createEhDataRegsFI() {
   for (int I = 0; I < 4; ++I) {
     const MipsSubtarget &ST = MF.getTarget().getSubtarget<MipsSubtarget>();
     const TargetRegisterClass *RC = ST.isABI_N64() ?
-        &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
 
     EhDataRegFI[I] = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
         RC->getAlignment(), false);
@@ -72,4 +120,22 @@ bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
                         || FI == EhDataRegFI[2] || FI == EhDataRegFI[3]);
 }
 
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const StringRef &Name) {
+  const MipsCallEntry *&E = ExternalCallEntries[Name];
+
+  if (!E)
+    E = new MipsCallEntry(Name);
+
+  return MachinePointerInfo(E);
+}
+
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *Val) {
+  const MipsCallEntry *&E = GlobalCallEntries[Val];
+
+  if (!E)
+    E = new MipsCallEntry(Val);
+
+  return MachinePointerInfo(E);
+}
+
 void MipsFunctionInfo::anchor() { }
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index b05b348..43bf682 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -15,56 +15,48 @@
 #define MIPS_MACHINE_FUNCTION_INFO_H
 
 #include "MipsSubtarget.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/ValueMap.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include <utility>
 
 namespace llvm {
 
+/// \brief A class derived from PseudoSourceValue that represents a GOT entry
+/// resolved by lazy-binding.
+class MipsCallEntry : public PseudoSourceValue {
+public:
+  explicit MipsCallEntry(const StringRef &N);
+  explicit MipsCallEntry(const GlobalValue *V);
+  virtual bool isConstant(const MachineFrameInfo *) const;
+  virtual bool isAliased(const MachineFrameInfo *) const;
+  virtual bool mayAlias(const MachineFrameInfo *) const;
+
+private:
+  virtual void printCustom(raw_ostream &O) const;
+#ifndef NDEBUG
+  std::string Name;
+  const GlobalValue *Val;
+#endif
+};
+
 /// MipsFunctionInfo - This class is derived from MachineFunction private
 /// Mips target-specific information for each MachineFunction.
 class MipsFunctionInfo : public MachineFunctionInfo {
-  virtual void anchor();
-
-  MachineFunction& MF;
-  /// SRetReturnReg - Some subtargets require that sret lowering includes
-  /// returning the value of the returned struct in a register. This field
-  /// holds the virtual register into which the sret argument is passed.
-  unsigned SRetReturnReg;
-
-  /// GlobalBaseReg - keeps track of the virtual register initialized for
-  /// use as the global base register. This is used for PIC in some PIC
-  /// relocation models.
-  unsigned GlobalBaseReg;
-
-  /// Mips16SPAliasReg - keeps track of the virtual register initialized for
-  /// use as an alias for SP for use in load/store of halfword/byte from/to
-  /// the stack
-  unsigned Mips16SPAliasReg;
-
-  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
-  int VarArgsFrameIndex;
-
-  /// True if function has a byval argument.
-  bool HasByvalArg;
-
-  /// Size of incoming argument area.
-  unsigned IncomingArgSize;
-
-  /// CallsEhReturn - Whether the function calls llvm.eh.return.
-  bool CallsEhReturn;
-
-  /// Frame objects for spilling eh data registers.
-  int EhDataRegFI[4];
-
 public:
   MipsFunctionInfo(MachineFunction& MF)
    : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
      VarArgsFrameIndex(0), CallsEhReturn(false)
   {}
 
+  ~MipsFunctionInfo();
+
   unsigned getSRetReturnReg() const { return SRetReturnReg; }
   void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
 
@@ -92,6 +84,51 @@ public:
   int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
   bool isEhDataRegFI(int FI) const;
 
+  /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
+  /// representing a GOT entry for an external function.
+  MachinePointerInfo callPtrInfo(const StringRef &Name);
+
+  /// \brief Create a MachinePointerInfo that has a MipsCallEntr object
+  /// representing a GOT entry for a global function.
+  MachinePointerInfo callPtrInfo(const GlobalValue *Val);
+
+private:
+  virtual void anchor();
+
+  MachineFunction& MF;
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// Mips16SPAliasReg - keeps track of the virtual register initialized for
+  /// use as an alias for SP for use in load/store of halfword/byte from/to
+  /// the stack
+  unsigned Mips16SPAliasReg;
+
+  /// VarArgsFrameIndex - FrameIndex for start of varargs area.
+  int VarArgsFrameIndex;
+
+  /// True if function has a byval argument.
+  bool HasByvalArg;
+
+  /// Size of incoming argument area.
+  unsigned IncomingArgSize;
+
+  /// CallsEhReturn - Whether the function calls llvm.eh.return.
+  bool CallsEhReturn;
+
+  /// Frame objects for spilling eh data registers.
+  int EhDataRegFI[4];
+
+  /// MipsCallEntry maps.
+  StringMap<const MipsCallEntry *> ExternalCallEntries;
+  ValueMap<const GlobalValue *, const MipsCallEntry *> GlobalCallEntries;
 };
 
 } // end of namespace llvm
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 1919077..fe60841 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -14,9 +14,17 @@
 #define DEBUG_TYPE "mips-os16"
 #include "MipsOs16.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+
+static cl::opt<std::string> Mips32FunctionMask(
+  "mips32-function-mask",
+  cl::init(""),
+  cl::desc("Force function to be mips32"),
+  cl::Hidden);
+
 namespace {
 
   // Figure out if we need float point based on the function signature.
@@ -85,18 +93,43 @@ namespace llvm {
 
 
 bool MipsOs16::runOnModule(Module &M) {
-  DEBUG(errs() << "Run on Module MipsOs16\n");
+  bool usingMask = Mips32FunctionMask.length() > 0;
+  bool doneUsingMask = false; // this will make it stop repeating
+  DEBUG(dbgs() << "Run on Module MipsOs16 \n" << Mips32FunctionMask << "\n");
+  if (usingMask)
+    DEBUG(dbgs() << "using mask \n" << Mips32FunctionMask << "\n");
+  unsigned int functionIndex = 0;
   bool modified = false;
   for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
     if (F->isDeclaration()) continue;
     DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-    if (needsFP(*F)) {
-      DEBUG(dbgs() << " need to compile as nomips16 \n");
-      F->addFnAttr("nomips16");
+    if (usingMask) {
+      if (!doneUsingMask) {
+        if (functionIndex == Mips32FunctionMask.length())
+          functionIndex = 0;
+        switch (Mips32FunctionMask[functionIndex]) {
+        case '1':
+          DEBUG(dbgs() << "mask forced mips32: " << F->getName() << "\n");
+          F->addFnAttr("nomips16");
+          break;
+        case '.':
+          doneUsingMask = true;
+          break;
+        default:
+          break;
+        }
+        functionIndex++;
+      }
     }
     else {
-      F->addFnAttr("mips16");
-      DEBUG(dbgs() << " no need to compile as nomips16 \n");
+      if (needsFP(*F)) {
+        DEBUG(dbgs() << "os16 forced mips32: " << F->getName() << "\n");
+        F->addFnAttr("nomips16");
+      }
+      else {
+        DEBUG(dbgs() << "os16 forced mips16: " << F->getName() << "\n");
+        F->addFnAttr("mips16");
+      }
     }
   }
   return modified;
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index dead07b..3105b02 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -47,6 +47,11 @@ MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
 
 unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
 
+const TargetRegisterClass *
+MipsRegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                     unsigned Kind) const {
+  return Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+}
 
 unsigned
 MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
@@ -54,9 +59,9 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   switch (RC->getID()) {
   default:
     return 0;
-  case Mips::CPURegsRegClassID:
-  case Mips::CPU64RegsRegClassID:
-  case Mips::DSPRegsRegClassID: {
+  case Mips::GPR32RegClassID:
+  case Mips::GPR64RegClassID:
+  case Mips::DSPRRegClassID: {
     const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
     return 28 - TFI->hasFP(MF);
   }
@@ -78,48 +83,60 @@ const uint16_t* MipsRegisterInfo::
 getCalleeSavedRegs(const MachineFunction *MF) const {
   if (Subtarget.isSingleFloat())
     return CSR_SingleFloatOnly_SaveList;
-  else if (!Subtarget.hasMips64())
-    return CSR_O32_SaveList;
-  else if (Subtarget.isABI_N32())
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_SaveList;
+
+  if (Subtarget.isABI_N32())
     return CSR_N32_SaveList;
 
-  assert(Subtarget.isABI_N64());
-  return CSR_N64_SaveList;
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_SaveList;
+
+  return CSR_O32_SaveList;
 }
 
 const uint32_t*
 MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
   if (Subtarget.isSingleFloat())
     return CSR_SingleFloatOnly_RegMask;
-  else if (!Subtarget.hasMips64())
-    return CSR_O32_RegMask;
-  else if (Subtarget.isABI_N32())
+
+  if (Subtarget.isABI_N64())
+    return CSR_N64_RegMask;
+
+  if (Subtarget.isABI_N32())
     return CSR_N32_RegMask;
 
-  assert(Subtarget.isABI_N64());
-  return CSR_N64_RegMask;
+  if (Subtarget.isFP64bit())
+    return CSR_O32_FP64_RegMask;
+
+  return CSR_O32_RegMask;
+}
+
+const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
+  return CSR_Mips16RetHelper_RegMask;
 }
 
 BitVector MipsRegisterInfo::
 getReservedRegs(const MachineFunction &MF) const {
-  static const uint16_t ReservedCPURegs[] = {
+  static const uint16_t ReservedGPR32[] = {
     Mips::ZERO, Mips::K0, Mips::K1, Mips::SP
   };
 
-  static const uint16_t ReservedCPU64Regs[] = {
+  static const uint16_t ReservedGPR64[] = {
     Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
   };
 
   BitVector Reserved(getNumRegs());
   typedef TargetRegisterClass::const_iterator RegIter;
 
-  for (unsigned I = 0; I < array_lengthof(ReservedCPURegs); ++I)
-    Reserved.set(ReservedCPURegs[I]);
+  for (unsigned I = 0; I < array_lengthof(ReservedGPR32); ++I)
+    Reserved.set(ReservedGPR32[I]);
 
-  for (unsigned I = 0; I < array_lengthof(ReservedCPU64Regs); ++I)
-    Reserved.set(ReservedCPU64Regs[I]);
+  for (unsigned I = 0; I < array_lengthof(ReservedGPR64); ++I)
+    Reserved.set(ReservedGPR64[I]);
 
-  if (Subtarget.hasMips64()) {
+  if (Subtarget.isFP64bit()) {
     // Reserve all registers in AFGR64.
     for (RegIter Reg = Mips::AFGR64RegClass.begin(),
          EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
@@ -142,7 +159,6 @@ getReservedRegs(const MachineFunction &MF) const {
 
   // Reserve hardware registers.
   Reserved.set(Mips::HWR29);
-  Reserved.set(Mips::HWR29_64);
 
   // Reserve DSP control register.
   Reserved.set(Mips::DSPPos);
@@ -151,10 +167,22 @@ getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(Mips::DSPEFI);
   Reserved.set(Mips::DSPOutFlag);
 
+  // Reserve MSA control registers.
+  Reserved.set(Mips::MSAIR);
+  Reserved.set(Mips::MSACSR);
+  Reserved.set(Mips::MSAAccess);
+  Reserved.set(Mips::MSASave);
+  Reserved.set(Mips::MSAModify);
+  Reserved.set(Mips::MSARequest);
+  Reserved.set(Mips::MSAMap);
+  Reserved.set(Mips::MSAUnmap);
+
   // Reserve RA if in mips16 mode.
   if (Subtarget.inMips16Mode()) {
     Reserved.set(Mips::RA);
     Reserved.set(Mips::RA_64);
+    Reserved.set(Mips::T0);
+    Reserved.set(Mips::T1);
   }
 
   // Reserve GP if small section is used.
@@ -212,12 +240,3 @@ getFrameRegister(const MachineFunction &MF) const {
 
 }
 
-unsigned MipsRegisterInfo::
-getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned MipsRegisterInfo::
-getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 5ed5124..0450c6f 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -42,10 +42,14 @@ public:
   void adjustMipsStackFrame(MachineFunction &MF) const;
 
   /// Code Generation virtual methods...
+  const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
+                                                unsigned Kind) const;
+
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
                                MachineFunction &MF) const;
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+  static const uint32_t *getMips16RetHelperMask();
 
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
@@ -64,10 +68,6 @@ public:
   /// Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
-  /// Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
-
   /// \brief Return GPR register class.
   virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
 
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 229f167..3173d09 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -11,16 +11,15 @@
 //  Declarations that describe the MIPS register file
 //===----------------------------------------------------------------------===//
 let Namespace = "Mips" in {
-def sub_fpeven : SubRegIndex;
-def sub_fpodd  : SubRegIndex;
-def sub_32     : SubRegIndex;
-def sub_lo     : SubRegIndex;
-def sub_hi     : SubRegIndex;
-def sub_dsp16_19 : SubRegIndex;
-def sub_dsp20    : SubRegIndex;
-def sub_dsp21    : SubRegIndex;
-def sub_dsp22    : SubRegIndex;
-def sub_dsp23    : SubRegIndex;
+def sub_32     : SubRegIndex<32>;
+def sub_64     : SubRegIndex<64>;
+def sub_lo     : SubRegIndex<32>;
+def sub_hi     : SubRegIndex<32, 32>;
+def sub_dsp16_19 : SubRegIndex<4, 16>;
+def sub_dsp20    : SubRegIndex<1, 20>;
+def sub_dsp21    : SubRegIndex<1, 21>;
+def sub_dsp22    : SubRegIndex<1, 22>;
+def sub_dsp23    : SubRegIndex<1, 23>;
 }
 
 class Unallocatable {
@@ -54,17 +53,24 @@ class FPR<bits<16> Enc, string n> : MipsReg<Enc, n>;
 // Mips 64-bit (aliased) FPU Registers
 class AFPR<bits<16> Enc, string n, list<Register> subregs>
   : MipsRegWithSubRegs<Enc, n, subregs> {
-  let SubRegIndices = [sub_fpeven, sub_fpodd];
+  let SubRegIndices = [sub_lo, sub_hi];
   let CoveredBySubRegs = 1;
 }
 
 class AFPR64<bits<16> Enc, string n, list<Register> subregs>
   : MipsRegWithSubRegs<Enc, n, subregs> {
-  let SubRegIndices = [sub_32];
+  let SubRegIndices = [sub_lo, sub_hi];
+  let CoveredBySubRegs = 1;
+}
+
+// Mips 128-bit (aliased) MSA Registers
+class AFPR128<bits<16> Enc, string n, list<Register> subregs>
+  : MipsRegWithSubRegs<Enc, n, subregs> {
+  let SubRegIndices = [sub_64];
 }
 
 // Accumulator Registers
-class ACC<bits<16> Enc, string n, list<Register> subregs>
+class ACCReg<bits<16> Enc, string n, list<Register> subregs>
   : MipsRegWithSubRegs<Enc, n, subregs> {
   let SubRegIndices = [sub_lo, sub_hi];
   let CoveredBySubRegs = 1;
@@ -147,127 +153,70 @@ let Namespace = "Mips" in {
   def RA_64   : Mips64GPRReg< 31, "ra",  [RA]>, DwarfRegNum<[31]>;
 
   /// Mips Single point precision FPU Registers
-  def F0  : FPR< 0,  "f0">, DwarfRegNum<[32]>;
-  def F1  : FPR< 1,  "f1">, DwarfRegNum<[33]>;
-  def F2  : FPR< 2,  "f2">, DwarfRegNum<[34]>;
-  def F3  : FPR< 3,  "f3">, DwarfRegNum<[35]>;
-  def F4  : FPR< 4,  "f4">, DwarfRegNum<[36]>;
-  def F5  : FPR< 5,  "f5">, DwarfRegNum<[37]>;
-  def F6  : FPR< 6,  "f6">, DwarfRegNum<[38]>;
-  def F7  : FPR< 7,  "f7">, DwarfRegNum<[39]>;
-  def F8  : FPR< 8,  "f8">, DwarfRegNum<[40]>;
-  def F9  : FPR< 9,  "f9">, DwarfRegNum<[41]>;
-  def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
-  def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
-  def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
-  def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
-  def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
-  def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
-  def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
-  def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
-  def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
-  def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
-  def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
-  def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
-  def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
-  def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
-  def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
-  def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
-  def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
-  def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
-  def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
-  def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
-  def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
-  def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;
+  foreach I = 0-31 in
+  def F#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
+
+  // Higher half of 64-bit FP registers.
+  foreach I = 0-31 in
+  def F_HI#I : FPR<I, "f"#I>, DwarfRegNum<[!add(I, 32)]>;
 
   /// Mips Double point precision FPU Registers (aliased
   /// with the single precision to hold 64 bit values)
-  def D0  : AFPR< 0,  "f0", [F0,   F1]>;
-  def D1  : AFPR< 2,  "f2", [F2,   F3]>;
-  def D2  : AFPR< 4,  "f4", [F4,   F5]>;
-  def D3  : AFPR< 6,  "f6", [F6,   F7]>;
-  def D4  : AFPR< 8,  "f8", [F8,   F9]>;
-  def D5  : AFPR<10, "f10", [F10, F11]>;
-  def D6  : AFPR<12, "f12", [F12, F13]>;
-  def D7  : AFPR<14, "f14", [F14, F15]>;
-  def D8  : AFPR<16, "f16", [F16, F17]>;
-  def D9  : AFPR<18, "f18", [F18, F19]>;
-  def D10 : AFPR<20, "f20", [F20, F21]>;
-  def D11 : AFPR<22, "f22", [F22, F23]>;
-  def D12 : AFPR<24, "f24", [F24, F25]>;
-  def D13 : AFPR<26, "f26", [F26, F27]>;
-  def D14 : AFPR<28, "f28", [F28, F29]>;
-  def D15 : AFPR<30, "f30", [F30, F31]>;
+  foreach I = 0-15 in
+  def D#I : AFPR<!shl(I, 1), "f"#!shl(I, 1),
+                 [!cast<FPR>("F"#!shl(I, 1)),
+                  !cast<FPR>("F"#!add(!shl(I, 1), 1))]>;
 
   /// Mips Double point precision FPU Registers in MFP64 mode.
-  def D0_64  : AFPR64<0, "f0", [F0]>, DwarfRegNum<[32]>;
-  def D1_64  : AFPR64<1, "f1", [F1]>, DwarfRegNum<[33]>;
-  def D2_64  : AFPR64<2, "f2", [F2]>, DwarfRegNum<[34]>;
-  def D3_64  : AFPR64<3, "f3", [F3]>, DwarfRegNum<[35]>;
-  def D4_64  : AFPR64<4, "f4", [F4]>, DwarfRegNum<[36]>;
-  def D5_64  : AFPR64<5, "f5", [F5]>, DwarfRegNum<[37]>;
-  def D6_64  : AFPR64<6, "f6", [F6]>, DwarfRegNum<[38]>;
-  def D7_64  : AFPR64<7, "f7", [F7]>, DwarfRegNum<[39]>;
-  def D8_64  : AFPR64<8, "f8", [F8]>, DwarfRegNum<[40]>;
-  def D9_64  : AFPR64<9, "f9", [F9]>, DwarfRegNum<[41]>;
-  def D10_64  : AFPR64<10, "f10", [F10]>, DwarfRegNum<[42]>;
-  def D11_64  : AFPR64<11, "f11", [F11]>, DwarfRegNum<[43]>;
-  def D12_64  : AFPR64<12, "f12", [F12]>, DwarfRegNum<[44]>;
-  def D13_64  : AFPR64<13, "f13", [F13]>, DwarfRegNum<[45]>;
-  def D14_64  : AFPR64<14, "f14", [F14]>, DwarfRegNum<[46]>;
-  def D15_64  : AFPR64<15, "f15", [F15]>, DwarfRegNum<[47]>;
-  def D16_64  : AFPR64<16, "f16", [F16]>, DwarfRegNum<[48]>;
-  def D17_64  : AFPR64<17, "f17", [F17]>, DwarfRegNum<[49]>;
-  def D18_64  : AFPR64<18, "f18", [F18]>, DwarfRegNum<[50]>;
-  def D19_64  : AFPR64<19, "f19", [F19]>, DwarfRegNum<[51]>;
-  def D20_64  : AFPR64<20, "f20", [F20]>, DwarfRegNum<[52]>;
-  def D21_64  : AFPR64<21, "f21", [F21]>, DwarfRegNum<[53]>;
-  def D22_64  : AFPR64<22, "f22", [F22]>, DwarfRegNum<[54]>;
-  def D23_64  : AFPR64<23, "f23", [F23]>, DwarfRegNum<[55]>;
-  def D24_64  : AFPR64<24, "f24", [F24]>, DwarfRegNum<[56]>;
-  def D25_64  : AFPR64<25, "f25", [F25]>, DwarfRegNum<[57]>;
-  def D26_64  : AFPR64<26, "f26", [F26]>, DwarfRegNum<[58]>;
-  def D27_64  : AFPR64<27, "f27", [F27]>, DwarfRegNum<[59]>;
-  def D28_64  : AFPR64<28, "f28", [F28]>, DwarfRegNum<[60]>;
-  def D29_64  : AFPR64<29, "f29", [F29]>, DwarfRegNum<[61]>;
-  def D30_64  : AFPR64<30, "f30", [F30]>, DwarfRegNum<[62]>;
-  def D31_64  : AFPR64<31, "f31", [F31]>, DwarfRegNum<[63]>;
+  foreach I = 0-31 in
+  def D#I#_64 : AFPR64<I, "f"#I, [!cast<FPR>("F"#I), !cast<FPR>("F_HI"#I)]>,
+                DwarfRegNum<[!add(I, 32)]>;
+
+  /// Mips MSA registers
+  /// MSA and FPU cannot both be present unless the FPU has 64-bit registers
+  foreach I = 0-31 in
+  def W#I : AFPR128<I, "w"#I, [!cast<AFPR64>("D"#I#"_64")]>,
+            DwarfRegNum<[!add(I, 32)]>;
 
   // Hi/Lo registers
-  def HI  : Register<"ac0">, DwarfRegNum<[64]>;
-  def HI1 : Register<"ac1">, DwarfRegNum<[176]>;
-  def HI2 : Register<"ac2">, DwarfRegNum<[178]>;
-  def HI3 : Register<"ac3">, DwarfRegNum<[180]>;
-  def LO  : Register<"ac0">, DwarfRegNum<[65]>;
-  def LO1 : Register<"ac1">, DwarfRegNum<[177]>;
-  def LO2 : Register<"ac2">, DwarfRegNum<[179]>;
-  def LO3 : Register<"ac3">, DwarfRegNum<[181]>;
+  def HI0 : MipsReg<0, "ac0">, DwarfRegNum<[64]>;
+  def HI1 : MipsReg<1, "ac1">, DwarfRegNum<[176]>;
+  def HI2 : MipsReg<2, "ac2">, DwarfRegNum<[178]>;
+  def HI3 : MipsReg<3, "ac3">, DwarfRegNum<[180]>;
+  def LO0 : MipsReg<0, "ac0">, DwarfRegNum<[65]>;
+  def LO1 : MipsReg<1, "ac1">, DwarfRegNum<[177]>;
+  def LO2 : MipsReg<2, "ac2">, DwarfRegNum<[179]>;
+  def LO3 : MipsReg<3, "ac3">, DwarfRegNum<[181]>;
 
   let SubRegIndices = [sub_32] in {
-  def HI64  : RegisterWithSubRegs<"hi", [HI]>;
-  def LO64  : RegisterWithSubRegs<"lo", [LO]>;
+  def HI0_64  : RegisterWithSubRegs<"hi", [HI0]>;
+  def LO0_64  : RegisterWithSubRegs<"lo", [LO0]>;
   }
 
-  // Status flags register
-  def FCR31 : Register<"31">;
+  // FP control registers.
+  foreach I = 0-31 in
+  def FCR#I : MipsReg<#I, ""#I>;
+
+  // FP condition code registers.
+  foreach I = 0-7 in
+  def FCC#I : MipsReg<#I, "fcc"#I>;
 
-  // fcc0 register
-  def FCC0 : MipsReg<0, "fcc0">;
+  // COP2 registers.
+  foreach I = 0-31 in
+  def COP2#I : MipsReg<#I, ""#I>;
 
   // PC register
   def PC : Register<"pc">;
 
   // Hardware register $29
   def HWR29 : MipsReg<29, "29">;
-  def HWR29_64 : MipsReg<29, "29">;
 
   // Accum registers
-  def AC0 : ACC<0, "ac0", [LO, HI]>;
-  def AC1 : ACC<1, "ac1", [LO1, HI1]>;
-  def AC2 : ACC<2, "ac2", [LO2, HI2]>;
-  def AC3 : ACC<3, "ac3", [LO3, HI3]>;
+  foreach I = 0-3 in
+  def AC#I : ACCReg<#I, "ac"#I,
+                    [!cast<Register>("LO"#I), !cast<Register>("HI"#I)]>;
 
-  def AC0_64 : ACC<0, "ac0", [LO64, HI64]>;
+  def AC0_64 : ACCReg<0, "ac0", [LO0_64, HI0_64]>;
 
   // DSP-ASE control register fields.
   def DSPPos : Register<"">;
@@ -286,13 +235,23 @@ let Namespace = "Mips" in {
   def DSPOutFlag : RegisterWithSubRegs<"", [DSPOutFlag16_19, DSPOutFlag20,
                                             DSPOutFlag21, DSPOutFlag22,
                                             DSPOutFlag23]>;
+
+  // MSA-ASE control registers.
+  def MSAIR      : MipsReg<0, "0">;
+  def MSACSR     : MipsReg<1, "1">;
+  def MSAAccess  : MipsReg<2, "2">;
+  def MSASave    : MipsReg<3, "3">;
+  def MSAModify  : MipsReg<4, "4">;
+  def MSARequest : MipsReg<5, "5">;
+  def MSAMap     : MipsReg<6, "6">;
+  def MSAUnmap   : MipsReg<7, "7">;
 }
 
 //===----------------------------------------------------------------------===//
 // Register Classes
 //===----------------------------------------------------------------------===//
 
-class CPURegsClass<list<ValueType> regTypes> :
+class GPR32Class<list<ValueType> regTypes> :
   RegisterClass<"Mips", regTypes, 32, (add
   // Reserved
   ZERO, AT,
@@ -307,10 +266,10 @@ class CPURegsClass<list<ValueType> regTypes> :
   // Reserved
   K0, K1, GP, SP, FP, RA)>;
 
-def CPURegs : CPURegsClass<[i32]>;
-def DSPRegs : CPURegsClass<[v4i8, v2i16]>;
+def GPR32 : GPR32Class<[i32]>;
+def DSPR  : GPR32Class<[v4i8, v2i16]>;
 
-def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add
+def GPR64 : RegisterClass<"Mips", [i64], 64, (add
 // Reserved
   ZERO_64, AT_64,
   // Return Values and Arguments
@@ -330,6 +289,13 @@ def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
   // Callee save
   S0, S1)>;
 
+def CPU16RegsPlusSP : RegisterClass<"Mips", [i32], 32, (add
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Callee save
+  S0, S1,
+  SP)>;
+
 def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>, Unallocatable;
 
 def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
@@ -343,6 +309,9 @@ def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>, Unallocatable;
 // * FGR32 - 32 32-bit registers (single float only mode)
 def FGR32 : RegisterClass<"Mips", [f32], 32, (sequence "F%u", 0, 31)>;
 
+def FGRH32 : RegisterClass<"Mips", [f32], 32, (sequence "F_HI%u", 0, 31)>,
+             Unallocatable;
+
 def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
   // Return Values and Arguments
   D0, D1,
@@ -357,78 +326,224 @@ def AFGR64 : RegisterClass<"Mips", [f64], 64, (add
 
 def FGR64 : RegisterClass<"Mips", [f64], 64, (sequence "D%u_64", 0, 31)>;
 
-// Condition Register for floating point operations
-def CCR  : RegisterClass<"Mips", [i32], 32, (add FCR31,FCC0)>, Unallocatable;
+// FP control registers.
+def CCR : RegisterClass<"Mips", [i32], 32, (sequence "FCR%u", 0, 31)>,
+          Unallocatable;
+
+// FP condition code registers.
+def FCC : RegisterClass<"Mips", [i32], 32, (sequence "FCC%u", 0, 7)>,
+          Unallocatable;
+
+def MSA128B: RegisterClass<"Mips", [v16i8], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128H: RegisterClass<"Mips", [v8i16, v8f16], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128W: RegisterClass<"Mips", [v4i32, v4f32], 128,
+                           (sequence "W%u", 0, 31)>;
+def MSA128D: RegisterClass<"Mips", [v2i64, v2f64], 128,
+                           (sequence "W%u", 0, 31)>;
+
+def MSACtrl: RegisterClass<"Mips", [i32], 32, (add
+  MSAIR, MSACSR, MSAAccess, MSASave, MSAModify, MSARequest, MSAMap, MSAUnmap)>;
 
 // Hi/Lo Registers
-def LORegs : RegisterClass<"Mips", [i32], 32, (add LO)>;
-def HIRegs : RegisterClass<"Mips", [i32], 32, (add HI)>;
-def LORegsDSP : RegisterClass<"Mips", [i32], 32, (add LO, LO1, LO2, LO3)>;
-def HIRegsDSP : RegisterClass<"Mips", [i32], 32, (add HI, HI1, HI2, HI3)>;
-def LORegs64 : RegisterClass<"Mips", [i64], 64, (add LO64)>;
-def HIRegs64 : RegisterClass<"Mips", [i64], 64, (add HI64)>;
+def LO32 : RegisterClass<"Mips", [i32], 32, (add LO0)>;
+def HI32 : RegisterClass<"Mips", [i32], 32, (add HI0)>;
+def LO32DSP : RegisterClass<"Mips", [i32], 32, (sequence "LO%u", 0, 3)>;
+def HI32DSP : RegisterClass<"Mips", [i32], 32, (sequence "HI%u", 0, 3)>;
+def LO64 : RegisterClass<"Mips", [i64], 64, (add LO0_64)>;
+def HI64 : RegisterClass<"Mips", [i64], 64, (add HI0_64)>;
 
 // Hardware registers
 def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>, Unallocatable;
-def HWRegs64 : RegisterClass<"Mips", [i64], 64, (add HWR29_64)>, Unallocatable;
 
 // Accumulator Registers
-def ACRegs : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
+def ACC64 : RegisterClass<"Mips", [untyped], 64, (add AC0)> {
   let Size = 64;
 }
 
-def ACRegs128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
+def ACC128 : RegisterClass<"Mips", [untyped], 128, (add AC0_64)> {
   let Size = 128;
 }
 
-def ACRegsDSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
+def ACC64DSP : RegisterClass<"Mips", [untyped], 64, (sequence "AC%u", 0, 3)> {
   let Size = 64;
 }
 
 def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
 
+// Coprocessor 2 registers.
+def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
+           Unallocatable;
+
 // Register Operands.
-def CPURegsAsmOperand : AsmOperandClass {
-  let Name = "CPURegsAsm";
-  let ParserMethod = "parseCPURegs";
+
+class MipsAsmRegOperand : AsmOperandClass {
+  let RenderMethod = "addRegAsmOperands";
+}
+def GPR32AsmOperand : MipsAsmRegOperand {
+  let Name = "GPR32Asm";
+  let ParserMethod = "parseGPR32";
+}
+
+def GPR64AsmOperand : MipsAsmRegOperand {
+  let Name = "GPR64Asm";
+  let ParserMethod = "parseGPR64";
+}
+
+def ACC64DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "ACC64DSPAsm";
+  let ParserMethod = "parseACC64DSP";
+}
+
+def LO32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "LO32DSPAsm";
+  let ParserMethod = "parseLO32DSP";
 }
 
-def CPU64RegsAsmOperand : AsmOperandClass {
-  let Name = "CPU64RegsAsm";
-  let ParserMethod = "parseCPU64Regs";
+def HI32DSPAsmOperand : MipsAsmRegOperand {
+  let Name = "HI32DSPAsm";
+  let ParserMethod = "parseHI32DSP";
 }
 
-def CCRAsmOperand : AsmOperandClass {
+def CCRAsmOperand : MipsAsmRegOperand {
   let Name = "CCRAsm";
   let ParserMethod = "parseCCRRegs";
 }
 
-def CPURegsOpnd : RegisterOperand<CPURegs, "printCPURegs"> {
-  let ParserMatchClass = CPURegsAsmOperand;
+def AFGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "AFGR64Asm";
+  let ParserMethod = "parseAFGR64Regs";
+}
+
+def FGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "FGR64Asm";
+  let ParserMethod = "parseFGR64Regs";
 }
 
-def CPU64RegsOpnd : RegisterOperand<CPU64Regs, "printCPURegs"> {
-  let ParserMatchClass = CPU64RegsAsmOperand;
+def FGR32AsmOperand : MipsAsmRegOperand {
+  let Name = "FGR32Asm";
+  let ParserMethod = "parseFGR32Regs";
 }
 
-def CCROpnd : RegisterOperand<CCR, "printCPURegs"> {
+def FGRH32AsmOperand : MipsAsmRegOperand {
+  let Name = "FGRH32Asm";
+  let ParserMethod = "parseFGRH32Regs";
+}
+
+def FCCRegsAsmOperand : MipsAsmRegOperand {
+  let Name = "FCCRegsAsm";
+  let ParserMethod = "parseFCCRegs";
+}
+
+def MSA128BAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128BAsm";
+  let ParserMethod = "parseMSA128BRegs";
+}
+
+def MSA128HAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128HAsm";
+  let ParserMethod = "parseMSA128HRegs";
+}
+
+def MSA128WAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128WAsm";
+  let ParserMethod = "parseMSA128WRegs";
+}
+
+def MSA128DAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128DAsm";
+  let ParserMethod = "parseMSA128DRegs";
+}
+
+def MSA128CRAsmOperand : MipsAsmRegOperand {
+  let Name = "MSA128CRAsm";
+  let ParserMethod = "parseMSA128CtrlRegs";
+}
+
+def GPR32Opnd : RegisterOperand<GPR32> {
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+def GPR64Opnd : RegisterOperand<GPR64> {
+  let ParserMatchClass = GPR64AsmOperand;
+}
+
+def DSPROpnd : RegisterOperand<DSPR> {
+  let ParserMatchClass = GPR32AsmOperand;
+}
+
+def CCROpnd : RegisterOperand<CCR> {
   let ParserMatchClass = CCRAsmOperand;
 }
 
-def HWRegsAsmOperand : AsmOperandClass {
+def HWRegsAsmOperand : MipsAsmRegOperand {
   let Name = "HWRegsAsm";
   let ParserMethod = "parseHWRegs";
 }
 
-def HW64RegsAsmOperand : AsmOperandClass {
-  let Name = "HW64RegsAsm";
-  let ParserMethod = "parseHW64Regs";
+def COP2AsmOperand : MipsAsmRegOperand {
+  let Name = "COP2Asm";
+  let ParserMethod = "parseCOP2";
 }
 
-def HWRegsOpnd : RegisterOperand<HWRegs, "printCPURegs"> {
+def HWRegsOpnd : RegisterOperand<HWRegs> {
   let ParserMatchClass = HWRegsAsmOperand;
 }
 
-def HW64RegsOpnd : RegisterOperand<HWRegs64, "printCPURegs"> {
-  let ParserMatchClass = HW64RegsAsmOperand;
+def AFGR64Opnd : RegisterOperand<AFGR64> {
+  let ParserMatchClass = AFGR64AsmOperand;
+}
+
+def FGR64Opnd : RegisterOperand<FGR64> {
+  let ParserMatchClass = FGR64AsmOperand;
+}
+
+def FGR32Opnd : RegisterOperand<FGR32> {
+  let ParserMatchClass = FGR32AsmOperand;
+}
+
+def FGRH32Opnd : RegisterOperand<FGRH32> {
+  let ParserMatchClass = FGRH32AsmOperand;
+}
+
+def FCCRegsOpnd : RegisterOperand<FCC> {
+  let ParserMatchClass = FCCRegsAsmOperand;
 }
+
+def LO32DSPOpnd : RegisterOperand<LO32DSP> {
+  let ParserMatchClass = LO32DSPAsmOperand;
+}
+
+def HI32DSPOpnd : RegisterOperand<HI32DSP> {
+  let ParserMatchClass = HI32DSPAsmOperand;
+}
+
+def ACC64DSPOpnd : RegisterOperand<ACC64DSP> {
+  let ParserMatchClass = ACC64DSPAsmOperand;
+}
+
+def COP2Opnd : RegisterOperand<COP2> {
+  let ParserMatchClass = COP2AsmOperand;
+}
+
+def MSA128BOpnd : RegisterOperand<MSA128B> {
+  let ParserMatchClass = MSA128BAsmOperand;
+}
+
+def MSA128HOpnd : RegisterOperand<MSA128H> {
+  let ParserMatchClass = MSA128HAsmOperand;
+}
+
+def MSA128WOpnd : RegisterOperand<MSA128W> {
+  let ParserMatchClass = MSA128WAsmOperand;
+}
+
+def MSA128DOpnd : RegisterOperand<MSA128D> {
+  let ParserMatchClass = MSA128DAsmOperand;
+}
+
+def MSA128CROpnd : RegisterOperand<MSACtrl> {
+  let ParserMatchClass = MSA128CRAsmOperand;
+}
+
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index b295e91..33ed4b3 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -32,6 +32,21 @@ using namespace llvm;
 namespace {
 typedef MachineBasicBlock::iterator Iter;
 
+static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
+  if (Mips::ACC64RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI,
+                          (unsigned)Mips::PseudoMFLO);
+
+  if (Mips::ACC64DSPRegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::MFHI_DSP, (unsigned)Mips::MFLO_DSP);
+
+  if (Mips::ACC128RegClass.contains(Src))
+    return std::make_pair((unsigned)Mips::PseudoMFHI64,
+                          (unsigned)Mips::PseudoMFLO64);
+
+  return std::make_pair(0, 0);
+}
+
 /// Helper class to expand pseudos.
 class ExpandPseudo {
 public:
@@ -43,22 +58,19 @@ private:
   void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
   void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
   void expandLoadACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
-  void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned RegSize);
+  void expandStoreACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                      unsigned MFLoOpc, unsigned RegSize);
   bool expandCopy(MachineBasicBlock &MBB, Iter I);
-  bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
-                     unsigned Src, unsigned RegSize);
+  bool expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned MFHiOpc,
+                     unsigned MFLoOpc);
 
   MachineFunction &MF;
-  const MipsSEInstrInfo &TII;
-  const MipsRegisterInfo &RegInfo;
   MachineRegisterInfo &MRI;
 };
 }
 
 ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
-  : MF(MF_),
-    TII(*static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo())),
-    RegInfo(TII.getRegisterInfo()), MRI(MF.getRegInfo()) {}
+  : MF(MF_), MRI(MF.getRegInfo()) {}
 
 bool ExpandPseudo::expand() {
   bool Expanded = false;
@@ -74,32 +86,26 @@ bool ExpandPseudo::expand() {
 bool ExpandPseudo::expandInstr(MachineBasicBlock &MBB, Iter I) {
   switch(I->getOpcode()) {
   case Mips::LOAD_CCOND_DSP:
-  case Mips::LOAD_CCOND_DSP_P8:
     expandLoadCCond(MBB, I);
     break;
   case Mips::STORE_CCOND_DSP:
-  case Mips::STORE_CCOND_DSP_P8:
     expandStoreCCond(MBB, I);
     break;
-  case Mips::LOAD_AC64:
-  case Mips::LOAD_AC64_P8:
-  case Mips::LOAD_AC_DSP:
-  case Mips::LOAD_AC_DSP_P8:
+  case Mips::LOAD_ACC64:
+  case Mips::LOAD_ACC64DSP:
     expandLoadACC(MBB, I, 4);
     break;
-  case Mips::LOAD_AC128:
-  case Mips::LOAD_AC128_P8:
+  case Mips::LOAD_ACC128:
     expandLoadACC(MBB, I, 8);
     break;
-  case Mips::STORE_AC64:
-  case Mips::STORE_AC64_P8:
-  case Mips::STORE_AC_DSP:
-  case Mips::STORE_AC_DSP_P8:
-    expandStoreACC(MBB, I, 4);
+  case Mips::STORE_ACC64:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI, Mips::PseudoMFLO, 4);
+    break;
+  case Mips::STORE_ACC64DSP:
+    expandStoreACC(MBB, I, Mips::MFHI_DSP, Mips::MFLO_DSP, 4);
     break;
-  case Mips::STORE_AC128:
-  case Mips::STORE_AC128_P8:
-    expandStoreACC(MBB, I, 8);
+  case Mips::STORE_ACC128:
+    expandStoreACC(MBB, I, Mips::PseudoMFHI64, Mips::PseudoMFLO64, 8);
     break;
   case TargetOpcode::COPY:
     if (!expandCopy(MBB, I))
@@ -119,6 +125,11 @@ void ExpandPseudo::expandLoadCCond(MachineBasicBlock &MBB, Iter I) {
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   const TargetRegisterClass *RC = RegInfo.intRegClass(4);
   unsigned VR = MRI.createVirtualRegister(RC);
   unsigned Dst = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
@@ -134,6 +145,11 @@ void ExpandPseudo::expandStoreCCond(MachineBasicBlock &MBB, Iter I) {
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   const TargetRegisterClass *RC = RegInfo.intRegClass(4);
   unsigned VR = MRI.createVirtualRegister(RC);
   unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
@@ -152,6 +168,11 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
   unsigned VR1 = MRI.createVirtualRegister(RC);
@@ -168,62 +189,69 @@ void ExpandPseudo::expandLoadACC(MachineBasicBlock &MBB, Iter I,
 }
 
 void ExpandPseudo::expandStoreACC(MachineBasicBlock &MBB, Iter I,
+                                  unsigned MFHiOpc, unsigned MFLoOpc,
                                   unsigned RegSize) {
-  //  copy $vr0, lo
+  //  mflo $vr0, src
   //  store $vr0, FI
-  //  copy $vr1, hi
+  //  mfhi $vr1, src
   //  store $vr1, FI + 4
 
   assert(I->getOperand(0).isReg() && I->getOperand(1).isFI());
 
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
   unsigned VR1 = MRI.createVirtualRegister(RC);
   unsigned Src = I->getOperand(0).getReg(), FI = I->getOperand(1).getIndex();
   unsigned SrcKill = getKillRegState(I->getOperand(0).isKill());
-  unsigned Lo = RegInfo.getSubReg(Src, Mips::sub_lo);
-  unsigned Hi = RegInfo.getSubReg(Src, Mips::sub_hi);
   DebugLoc DL = I->getDebugLoc();
 
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(Lo, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
   TII.storeRegToStack(MBB, I, VR0, true, FI, RC, &RegInfo, 0);
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(Hi, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
   TII.storeRegToStack(MBB, I, VR1, true, FI, RC, &RegInfo, RegSize);
 }
 
 bool ExpandPseudo::expandCopy(MachineBasicBlock &MBB, Iter I) {
-  unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
+  unsigned Src = I->getOperand(1).getReg();
+  std::pair<unsigned, unsigned> Opcodes = getMFHiLoOpc(Src);
 
-  if (Mips::ACRegsDSPRegClass.contains(Dst, Src))
-    return expandCopyACC(MBB, I, Dst, Src, 4);
-
-  if (Mips::ACRegs128RegClass.contains(Dst, Src))
-    return expandCopyACC(MBB, I, Dst, Src, 8);
+  if (!Opcodes.first)
+    return false;
 
-  return false;
+  return expandCopyACC(MBB, I, Opcodes.first, Opcodes.second);
 }
 
-bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I, unsigned Dst,
-                                 unsigned Src, unsigned RegSize) {
-  //  copy $vr0, src_lo
+bool ExpandPseudo::expandCopyACC(MachineBasicBlock &MBB, Iter I,
+                                 unsigned MFHiOpc, unsigned MFLoOpc) {
+  //  mflo $vr0, src
   //  copy dst_lo, $vr0
-  //  copy $vr1, src_hi
+  //  mfhi $vr1, src
   //  copy dst_hi, $vr1
 
-  const TargetRegisterClass *RC = RegInfo.intRegClass(RegSize);
+  const MipsSEInstrInfo &TII =
+    *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
+  unsigned Dst = I->getOperand(0).getReg(), Src = I->getOperand(1).getReg();
+  unsigned VRegSize = RegInfo.getMinimalPhysRegClass(Dst)->getSize() / 2;
+  const TargetRegisterClass *RC = RegInfo.intRegClass(VRegSize);
   unsigned VR0 = MRI.createVirtualRegister(RC);
   unsigned VR1 = MRI.createVirtualRegister(RC);
   unsigned SrcKill = getKillRegState(I->getOperand(1).isKill());
   unsigned DstLo = RegInfo.getSubReg(Dst, Mips::sub_lo);
   unsigned DstHi = RegInfo.getSubReg(Dst, Mips::sub_hi);
-  unsigned SrcLo = RegInfo.getSubReg(Src, Mips::sub_lo);
-  unsigned SrcHi = RegInfo.getSubReg(Src, Mips::sub_hi);
   DebugLoc DL = I->getDebugLoc();
 
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR0).addReg(SrcLo, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFLoOpc), VR0).addReg(Src);
   BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstLo)
     .addReg(VR0, RegState::Kill);
-  BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), VR1).addReg(SrcHi, SrcKill);
+  BuildMI(MBB, I, DL, TII.get(MFHiOpc), VR1).addReg(Src, SrcKill);
   BuildMI(MBB, I, DL, TII.get(TargetOpcode::COPY), DstHi)
     .addReg(VR1, RegState::Kill);
   return true;
@@ -244,10 +272,12 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB   = MF.front();
   MachineFrameInfo *MFI    = MF.getFrameInfo();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  const MipsRegisterInfo *RegInfo =
-    static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   const MipsSEInstrInfo &TII =
     *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
   unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
@@ -262,7 +292,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
   if (StackSize == 0 && !MFI->adjustsStack()) return;
 
   MachineModuleInfo &MMI = MF.getMMI();
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   MachineLocation DstML, SrcML;
 
   // Adjust stack.
@@ -272,9 +302,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
   MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
   BuildMI(MBB, MBBI, dl,
           TII.get(TargetOpcode::PROLOG_LABEL)).addSym(AdjustSPLabel);
-  DstML = MachineLocation(MachineLocation::VirtualFP);
-  SrcML = MachineLocation(MachineLocation::VirtualFP, -StackSize);
-  Moves.push_back(MachineMove(AdjustSPLabel, DstML, SrcML));
+  MMI.addFrameInst(
+      MCCFIInstruction::createDefCfaOffset(AdjustSPLabel, -StackSize));
 
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
 
@@ -298,35 +327,36 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
       // If Reg is a double precision register, emit two cfa_offsets,
       // one for each of the paired single precision registers.
       if (Mips::AFGR64RegClass.contains(Reg)) {
-        MachineLocation DstML0(MachineLocation::VirtualFP, Offset);
-        MachineLocation DstML1(MachineLocation::VirtualFP, Offset + 4);
-        MachineLocation SrcML0(RegInfo->getSubReg(Reg, Mips::sub_fpeven));
-        MachineLocation SrcML1(RegInfo->getSubReg(Reg, Mips::sub_fpodd));
+        unsigned Reg0 =
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_lo), true);
+        unsigned Reg1 =
+            MRI->getDwarfRegNum(RegInfo.getSubReg(Reg, Mips::sub_hi), true);
 
         if (!STI.isLittle())
-          std::swap(SrcML0, SrcML1);
+          std::swap(Reg0, Reg1);
 
-        Moves.push_back(MachineMove(CSLabel, DstML0, SrcML0));
-        Moves.push_back(MachineMove(CSLabel, DstML1, SrcML1));
+        MMI.addFrameInst(
+            MCCFIInstruction::createOffset(CSLabel, Reg0, Offset));
+        MMI.addFrameInst(
+            MCCFIInstruction::createOffset(CSLabel, Reg1, Offset + 4));
       } else {
-        // Reg is either in CPURegs or FGR32.
-        DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
-        SrcML = MachineLocation(Reg);
-        Moves.push_back(MachineMove(CSLabel, DstML, SrcML));
+        // Reg is either in GPR32 or FGR32.
+        MMI.addFrameInst(MCCFIInstruction::createOffset(
+            CSLabel, MRI->getDwarfRegNum(Reg, 1), Offset));
       }
     }
   }
 
   if (MipsFI->callsEhReturn()) {
     const TargetRegisterClass *RC = STI.isABI_N64() ?
-        &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
 
     // Insert instructions that spill eh data registers.
     for (int I = 0; I < 4; ++I) {
       if (!MBB.isLiveIn(ehDataReg(I)))
         MBB.addLiveIn(ehDataReg(I));
       TII.storeRegToStackSlot(MBB, MBBI, ehDataReg(I), false,
-                              MipsFI->getEhDataRegFI(I), RC, RegInfo);
+                              MipsFI->getEhDataRegFI(I), RC, &RegInfo);
     }
 
     // Emit .cfi_offset directives for eh data registers.
@@ -335,9 +365,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
             TII.get(TargetOpcode::PROLOG_LABEL)).addSym(CSLabel2);
     for (int I = 0; I < 4; ++I) {
       int64_t Offset = MFI->getObjectOffset(MipsFI->getEhDataRegFI(I));
-      DstML = MachineLocation(MachineLocation::VirtualFP, Offset);
-      SrcML = MachineLocation(ehDataReg(I));
-      Moves.push_back(MachineMove(CSLabel2, DstML, SrcML));
+      unsigned Reg = MRI->getDwarfRegNum(ehDataReg(I), true);
+      MMI.addFrameInst(MCCFIInstruction::createOffset(CSLabel2, Reg, Offset));
     }
   }
 
@@ -350,9 +379,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
     MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
     BuildMI(MBB, MBBI, dl,
             TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel);
-    DstML = MachineLocation(FP);
-    SrcML = MachineLocation(MachineLocation::VirtualFP);
-    Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML));
+    MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
+        SetFPLabel, MRI->getDwarfRegNum(FP, true)));
   }
 }
 
@@ -361,10 +389,12 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   MachineFrameInfo *MFI            = MF.getFrameInfo();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-  const MipsRegisterInfo *RegInfo =
-    static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   const MipsSEInstrInfo &TII =
     *static_cast<const MipsSEInstrInfo*>(MF.getTarget().getInstrInfo());
+  const MipsRegisterInfo &RegInfo =
+    *static_cast<const MipsRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   DebugLoc dl = MBBI->getDebugLoc();
   unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
   unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP;
@@ -385,7 +415,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
 
   if (MipsFI->callsEhReturn()) {
     const TargetRegisterClass *RC = STI.isABI_N64() ?
-        &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+        &Mips::GPR64RegClass : &Mips::GPR32RegClass;
 
     // Find first instruction that restores a callee-saved register.
     MachineBasicBlock::iterator I = MBBI;
@@ -395,7 +425,7 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
     // Insert instructions that restore eh data registers.
     for (int J = 0; J < 4; ++J) {
       TII.loadRegFromStackSlot(MBB, I, ehDataReg(J), MipsFI->getEhDataRegFI(J),
-                               RC, RegInfo);
+                               RC, &RegInfo);
     }
   }
 
@@ -493,7 +523,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     // The spill slot should be half the size of the accumulator. If target is
     // mips64, it should be 64-bit, otherwise it should be 32-bt.
     const TargetRegisterClass *RC = STI.hasMips64() ?
-      &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+      &Mips::GPR64RegClass : &Mips::GPR32RegClass;
     int FI = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
                                                   RC->getAlignment(), false);
     RS->addScavengingFrameIndex(FI);
@@ -507,7 +537,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     return;
 
   const TargetRegisterClass *RC = STI.isABI_N64() ?
-    &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+    &Mips::GPR64RegClass : &Mips::GPR32RegClass;
   int FI = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
                                                 RC->getAlignment(), false);
   RS->addScavengingFrameIndex(FI);
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 193a66c..8fa9e46 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -21,7 +21,7 @@ namespace llvm {
 class MipsSEFrameLowering : public MipsFrameLowering {
 public:
   explicit MipsSEFrameLowering(const MipsSubtarget &STI)
-    : MipsFrameLowering(STI, STI.hasMips64() ? 16 : 8) {}
+    : MipsFrameLowering(STI, STI.stackAlignment()) {}
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 8a6523a..737660e 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -66,6 +66,21 @@ void MipsSEDAGToDAGISel::addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
     MIB.addReg(Mips::DSPEFI, Flag);
 }
 
+unsigned MipsSEDAGToDAGISel::getMSACtrlReg(const SDValue RegIdx) const {
+  switch (cast<ConstantSDNode>(RegIdx)->getZExtValue()) {
+  default:
+    llvm_unreachable("Could not map int to register");
+  case 0: return Mips::MSAIR;
+  case 1: return Mips::MSACSR;
+  case 2: return Mips::MSAAccess;
+  case 3: return Mips::MSASave;
+  case 4: return Mips::MSAModify;
+  case 5: return Mips::MSARequest;
+  case 6: return Mips::MSAMap;
+  case 7: return Mips::MSAUnmap;
+  }
+}
+
 bool MipsSEDAGToDAGISel::replaceUsesWithZeroReg(MachineRegisterInfo *MRI,
                                                 const MachineInstr& MI) {
   unsigned DstReg = 0, ZeroReg = 0;
@@ -119,9 +134,9 @@ void MipsSEDAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
   const TargetRegisterClass *RC;
 
   if (Subtarget.isABI_N64())
-    RC = (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
+    RC = (const TargetRegisterClass*)&Mips::GPR64RegClass;
   else
-    RC = (const TargetRegisterClass*)&Mips::CPURegsRegClass;
+    RC = (const TargetRegisterClass*)&Mips::GPR32RegClass;
 
   V0 = RegInfo.createVirtualRegister(RC);
   V1 = RegInfo.createVirtualRegister(RC);
@@ -214,7 +229,7 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
 }
 
 SDNode *MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag,
-                                           SDValue CmpLHS, DebugLoc DL,
+                                           SDValue CmpLHS, SDLoc DL,
                                            SDNode *Node) const {
   unsigned Opc = InFlag.getOpcode(); (void)Opc;
 
@@ -301,6 +316,20 @@ bool MipsSEDAGToDAGISel::selectAddrRegImm(SDValue Addr, SDValue &Base,
   return false;
 }
 
+/// ComplexPattern used on MipsInstrInfo
+/// Used on Mips Load/Store instructions
+bool MipsSEDAGToDAGISel::selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                          SDValue &Offset) const {
+  // Operand is a result from an ADD.
+  if (Addr.getOpcode() == ISD::ADD) {
+    Base = Addr.getOperand(0);
+    Offset = Addr.getOperand(1);
+    return true;
+  }
+
+  return false;
+}
+
 bool MipsSEDAGToDAGISel::selectAddrDefault(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) const {
   Base = Addr;
@@ -314,9 +343,266 @@ bool MipsSEDAGToDAGISel::selectIntAddr(SDValue Addr, SDValue &Base,
     selectAddrDefault(Addr, Base, Offset);
 }
 
+/// Used on microMIPS Load/Store unaligned instructions (12-bit offset)
+bool MipsSEDAGToDAGISel::selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) const {
+  EVT ValTy = Addr.getValueType();
+
+  // Addresses of the form FI+const or FI|const
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+    if (isInt<12>(CN->getSExtValue())) {
+
+      // If the first operand is a FI then get the TargetFI Node
+      if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+                                  (Addr.getOperand(0)))
+        Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+      else
+        Base = Addr.getOperand(0);
+
+      Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectIntAddrMM(SDValue Addr, SDValue &Base,
+                                         SDValue &Offset) const {
+  return selectAddrRegImm12(Addr, Base, Offset) ||
+    selectAddrDefault(Addr, Base, Offset);
+}
+
+// Select constant vector splats.
+//
+// Returns true and sets Imm if:
+// * MSA is enabled
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
+  if (!Subtarget.hasMSA())
+    return false;
+
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
+
+  if (Node == NULL)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                             HasAnyUndefs, 8,
+                             !Subtarget.isLittle()))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Select constant vector splats.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value fits in an integer with the specified signed-ness and
+//   width.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+//
+// It's worth noting that this function is not used as part of the selection
+// of ldi.[bhwd] since it does not permit using the wrong-typed ldi.[bhwd]
+// instruction to achieve the desired bit pattern. ldi.[bhwd] is selected in
+// MipsSEDAGToDAGISel::selectNode.
+bool MipsSEDAGToDAGISel::
+selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                   unsigned ImmBitSize) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat (N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    if (( Signed && ImmValue.isSignedIntN(ImmBitSize)) ||
+        (!Signed && ImmValue.isIntN(ImmBitSize))) {
+      Imm = CurDAG->getTargetConstant(ImmValue, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm1(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 1);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm2(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 2);
+}
+
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm3(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 3);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm4(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 4);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 5);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm6(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 6);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatUimm8(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, false, 8);
+}
+
+// Select constant vector splats.
+bool MipsSEDAGToDAGISel::
+selectVSplatSimm5(SDValue N, SDValue &Imm) const {
+  return selectVSplatCommon(N, Imm, true, 5);
+}
+
+// Select constant vector splats whose value is a power of 2.
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a power of two.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatUimmPow2(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat (N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = ImmValue.exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of left-most bits set (e.g. 0b11...1100...00).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of left-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero from the bitwise
+    // inverse of ImmValue, and test that the inverse of this is the same
+    // as the original value.
+    if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
+
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Select constant vector splats whose value only has a consecutive sequence
+// of right-most bits set (e.g. 0b00...0011...11).
+//
+// In addition to the requirements of selectVSplat(), this function returns
+// true and sets Imm if:
+// * The splat value is the same width as the elements of the vector
+// * The splat value is a consecutive sequence of right-most bits.
+//
+// This function looks through ISD::BITCAST nodes.
+// TODO: This might not be appropriate for big-endian MSA since BITCAST is
+//       sometimes a shuffle in big-endian mode.
+bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    // Extract the run of set bits starting with bit zero, and test that the
+    // result is the same as the original value
+    if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool MipsSEDAGToDAGISel::selectVSplatUimmInvPow2(SDValue N,
+                                                 SDValue &Imm) const {
+  APInt ImmValue;
+  EVT EltTy = N->getValueType(0).getVectorElementType();
+
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  if (selectVSplat(N.getNode(), ImmValue) &&
+      ImmValue.getBitWidth() == EltTy.getSizeInBits()) {
+    int32_t Log2 = (~ImmValue).exactLogBase2();
+
+    if (Log2 != -1) {
+      Imm = CurDAG->getTargetConstant(Log2, EltTy);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
   unsigned Opcode = Node->getOpcode();
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
 
   ///
   // Instruction Selection not handled by the auto-generated
@@ -348,6 +634,11 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
         SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                               Mips::ZERO_64, MVT::i64);
         Result = CurDAG->getMachineNode(Mips::DMTC1, DL, MVT::f64, Zero);
+      } else if (Subtarget.isFP64bit()) {
+        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
+                                              Mips::ZERO, MVT::i32);
+        Result = CurDAG->getMachineNode(Mips::BuildPairF64_64, DL, MVT::f64,
+                                        Zero, Zero);
       } else {
         SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
                                               Mips::ZERO, MVT::i32);
@@ -374,7 +665,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
       AnalyzeImm.Analyze(Imm, Size, false);
 
     MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
-    DebugLoc DL = CN->getDebugLoc();
+    SDLoc DL(CN);
     SDNode *RegOpnd;
     SDValue ImmOpnd = CurDAG->getTargetConstant(SignExtend64<16>(Inst->ImmOpnd),
                                                 MVT::i64);
@@ -401,24 +692,71 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     return std::make_pair(true, RegOpnd);
   }
 
+  case ISD::INTRINSIC_W_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_cfcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx = Node->getOperand(2);
+      SDValue Reg = CurDAG->getCopyFromReg(ChainIn, DL,
+                                           getMSACtrlReg(RegIdx), MVT::i32);
+      return std::make_pair(true, Reg.getNode());
+    }
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_move_v:
+      // Like an assignment but will always produce a move.v even if
+      // unnecessary.
+      return std::make_pair(true,
+                            CurDAG->getMachineNode(Mips::MOVE_V, DL,
+                                                   Node->getValueType(0),
+                                                   Node->getOperand(1)));
+    }
+    break;
+  }
+
+  case ISD::INTRINSIC_VOID: {
+    switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+    default:
+      break;
+
+    case Intrinsic::mips_ctcmsa: {
+      SDValue ChainIn = Node->getOperand(0);
+      SDValue RegIdx  = Node->getOperand(2);
+      SDValue Value   = Node->getOperand(3);
+      SDValue ChainOut = CurDAG->getCopyToReg(ChainIn, DL,
+                                              getMSACtrlReg(RegIdx), Value);
+      return std::make_pair(true, ChainOut.getNode());
+    }
+    }
+    break;
+  }
+
   case MipsISD::ThreadPointer: {
-    EVT PtrVT = TLI.getPointerTy();
-    unsigned RdhwrOpc, SrcReg, DestReg;
+    EVT PtrVT = getTargetLowering()->getPointerTy();
+    unsigned RdhwrOpc, DestReg;
 
     if (PtrVT == MVT::i32) {
       RdhwrOpc = Mips::RDHWR;
-      SrcReg = Mips::HWR29;
       DestReg = Mips::V1;
     } else {
       RdhwrOpc = Mips::RDHWR64;
-      SrcReg = Mips::HWR29_64;
       DestReg = Mips::V1_64;
     }
 
     SDNode *Rdhwr =
-      CurDAG->getMachineNode(RdhwrOpc, Node->getDebugLoc(),
+      CurDAG->getMachineNode(RdhwrOpc, SDLoc(Node),
                              Node->getValueType(0),
-                             CurDAG->getRegister(SrcReg, PtrVT));
+                             CurDAG->getRegister(Mips::HWR29, MVT::i32));
     SDValue Chain = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, DestReg,
                                          SDValue(Rdhwr, 0));
     SDValue ResNode = CurDAG->getCopyFromReg(Chain, DL, DestReg, PtrVT);
@@ -426,18 +764,81 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     return std::make_pair(true, ResNode.getNode());
   }
 
-  case MipsISD::InsertLOHI: {
-    unsigned RCID = Subtarget.hasDSP() ? Mips::ACRegsDSPRegClassID :
-                                         Mips::ACRegsRegClassID;
-    SDValue RegClass = CurDAG->getTargetConstant(RCID, MVT::i32);
-    SDValue LoIdx = CurDAG->getTargetConstant(Mips::sub_lo, MVT::i32);
-    SDValue HiIdx = CurDAG->getTargetConstant(Mips::sub_hi, MVT::i32);
-    const SDValue Ops[] = { RegClass, Node->getOperand(0), LoIdx,
-                            Node->getOperand(1), HiIdx };
-    SDNode *Res = CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL,
-                                         MVT::Untyped, Ops);
+  case ISD::BUILD_VECTOR: {
+    // Select appropriate ldi.[bhwd] instructions for constant splats of
+    // 128-bit when MSA is enabled. Fixup any register class mismatches that
+    // occur as a result.
+    //
+    // This allows the compiler to use a wider range of immediates than would
+    // otherwise be allowed. If, for example, v4i32 could only use ldi.h then
+    // it would not be possible to load { 0x01010101, 0x01010101, 0x01010101,
+    // 0x01010101 } without using a constant pool. This would be sub-optimal
+    // when // 'ldi.b wd, 1' is capable of producing that bit-pattern in the
+    // same set/ of registers. Similarly, ldi.h isn't capable of producing {
+    // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
+
+    BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    unsigned LdiOp;
+    EVT ResVecTy = BVN->getValueType(0);
+    EVT ViaVecTy;
+
+    if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector())
+      return std::make_pair(false, (SDNode*)NULL);
+
+    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                              HasAnyUndefs, 8,
+                              !Subtarget.isLittle()))
+      return std::make_pair(false, (SDNode*)NULL);
+
+    switch (SplatBitSize) {
+    default:
+      return std::make_pair(false, (SDNode*)NULL);
+    case 8:
+      LdiOp = Mips::LDI_B;
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      LdiOp = Mips::LDI_H;
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      LdiOp = Mips::LDI_W;
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      LdiOp = Mips::LDI_D;
+      ViaVecTy = MVT::v2i64;
+      break;
+    }
+
+    if (!SplatValue.isSignedIntN(10))
+      return std::make_pair(false, (SDNode*)NULL);
+
+    SDValue Imm = CurDAG->getTargetConstant(SplatValue,
+                                            ViaVecTy.getVectorElementType());
+
+    SDNode *Res = CurDAG->getMachineNode(LdiOp, SDLoc(Node), ViaVecTy, Imm);
+
+    if (ResVecTy != ViaVecTy) {
+      // If LdiOp is writing to a different register class to ResVecTy, then
+      // fix it up here. This COPY_TO_REGCLASS should never cause a move.v
+      // since the source and destination register sets contain the same
+      // registers.
+      const TargetLowering *TLI = getTargetLowering();
+      MVT ResVecTySimple = ResVecTy.getSimpleVT();
+      const TargetRegisterClass *RC = TLI->getRegClassFor(ResVecTySimple);
+      Res = CurDAG->getMachineNode(Mips::COPY_TO_REGCLASS, SDLoc(Node),
+                                   ResVecTy, SDValue(Res, 0),
+                                   CurDAG->getTargetConstant(RC->getID(),
+                                                             MVT::i32));
+    }
+
     return std::make_pair(true, Res);
   }
+
   }
 
   return std::make_pair(false, (SDNode*)NULL);
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index a235e96..dc52064 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -30,23 +30,67 @@ private:
   void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
                              MachineFunction &MF);
 
+  unsigned getMSACtrlReg(const SDValue RegIdx) const;
+
   bool replaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
 
-  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, DebugLoc dl,
+  std::pair<SDNode*, SDNode*> selectMULT(SDNode *N, unsigned Opc, SDLoc dl,
                                          EVT Ty, bool HasLo, bool HasHi);
 
   SDNode *selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS,
-                         DebugLoc DL, SDNode *Node) const;
+                         SDLoc DL, SDNode *Node) const;
 
   virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
                                 SDValue &Offset) const;
 
+  virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+                                SDValue &Offset) const;
+
   virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
                                  SDValue &Offset) const;
 
   virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
                              SDValue &Offset) const;
 
+  virtual bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                                  SDValue &Offset) const;
+
+  virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+                               SDValue &Offset) const;
+
+  /// \brief Select constant vector splats.
+  virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a given integer.
+  virtual bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+                                  unsigned ImmBitSize) const;
+  /// \brief Select constant vector splats whose value fits in a uimm1.
+  virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm2.
+  virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm3.
+  virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm4.
+  virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm5.
+  virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm6.
+  virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a uimm8.
+  virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value fits in a simm5.
+  virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a power of 2.
+  virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is the inverse of a
+  /// power of 2.
+  virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// ending at the most significant bit
+  virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+  /// \brief Select constant vector splats whose value is a run of set bits
+  /// starting at bit zero.
+  virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+
   virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
 
   virtual void processFunctionAfterISel(MachineFunction &MF);
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 8544bb8..809adc0 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -10,6 +10,7 @@
 // Subclass of MipsTargetLowering specialized for mips32/64.
 //
 //===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "mips-isel"
 #include "MipsSEISelLowering.h"
 #include "MipsRegisterInfo.h"
 #include "MipsTargetMachine.h"
@@ -17,6 +18,8 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
@@ -25,22 +28,40 @@ static cl::opt<bool>
 EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
                     cl::desc("MIPS: Enable tail calls."), cl::init(false));
 
+static cl::opt<bool> NoDPLoadStore("mno-ldc1-sdc1", cl::init(false),
+                                   cl::desc("Expand double precision loads and "
+                                            "stores to their single precision "
+                                            "counterparts"));
+
 MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
   : MipsTargetLowering(TM) {
   // Set up the register classes
+  addRegisterClass(MVT::i32, &Mips::GPR32RegClass);
 
-  clearRegisterClasses();
+  if (HasMips64)
+    addRegisterClass(MVT::i64, &Mips::GPR64RegClass);
 
-  addRegisterClass(MVT::i32, &Mips::CPURegsRegClass);
+  if (Subtarget->hasDSP() || Subtarget->hasMSA()) {
+    // Expand all truncating stores and extending loads.
+    unsigned FirstVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+    unsigned LastVT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
 
-  if (HasMips64)
-    addRegisterClass(MVT::i64, &Mips::CPU64RegsRegClass);
+    for (unsigned VT0 = FirstVT; VT0 <= LastVT; ++VT0) {
+      for (unsigned VT1 = FirstVT; VT1 <= LastVT; ++VT1)
+        setTruncStoreAction((MVT::SimpleValueType)VT0,
+                            (MVT::SimpleValueType)VT1, Expand);
+
+      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT0, Expand);
+    }
+  }
 
   if (Subtarget->hasDSP()) {
     MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
 
     for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
-      addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
+      addRegisterClass(VecTys[i], &Mips::DSPRRegClass);
 
       // Expand all builtin opcodes.
       for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
@@ -63,12 +84,28 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
   if (Subtarget->hasDSPR2())
     setOperationAction(ISD::MUL, MVT::v2i16, Legal);
 
-  if (!TM.Options.UseSoftFloat) {
+  if (Subtarget->hasMSA()) {
+    addMSAIntType(MVT::v16i8, &Mips::MSA128BRegClass);
+    addMSAIntType(MVT::v8i16, &Mips::MSA128HRegClass);
+    addMSAIntType(MVT::v4i32, &Mips::MSA128WRegClass);
+    addMSAIntType(MVT::v2i64, &Mips::MSA128DRegClass);
+    addMSAFloatType(MVT::v8f16, &Mips::MSA128HRegClass);
+    addMSAFloatType(MVT::v4f32, &Mips::MSA128WRegClass);
+    addMSAFloatType(MVT::v2f64, &Mips::MSA128DRegClass);
+
+    setTargetDAGCombine(ISD::AND);
+    setTargetDAGCombine(ISD::OR);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::VSELECT);
+    setTargetDAGCombine(ISD::XOR);
+  }
+
+  if (!Subtarget->mipsSEUsesSoftFloat()) {
     addRegisterClass(MVT::f32, &Mips::FGR32RegClass);
 
     // When dealing with single precision only, use libcalls
     if (!Subtarget->isSingleFloat()) {
-      if (HasMips64)
+      if (Subtarget->isFP64bit())
         addRegisterClass(MVT::f64, &Mips::FGR64RegClass);
       else
         addRegisterClass(MVT::f64, &Mips::AFGR64RegClass);
@@ -99,6 +136,16 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
 
   setTargetDAGCombine(ISD::ADDE);
   setTargetDAGCombine(ISD::SUBE);
+  setTargetDAGCombine(ISD::MUL);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+  if (NoDPLoadStore) {
+    setOperationAction(ISD::LOAD, MVT::f64, Custom);
+    setOperationAction(ISD::STORE, MVT::f64, Custom);
+  }
 
   computeRegisterProperties();
 }
@@ -108,6 +155,93 @@ llvm::createMipsSETargetLowering(MipsTargetMachine &TM) {
   return new MipsSETargetLowering(TM);
 }
 
+// Enable MSA support for the given integer type and Register class.
+void MipsSETargetLowering::
+addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  setOperationAction(ISD::ADD, Ty, Legal);
+  setOperationAction(ISD::AND, Ty, Legal);
+  setOperationAction(ISD::CTLZ, Ty, Legal);
+  setOperationAction(ISD::CTPOP, Ty, Legal);
+  setOperationAction(ISD::MUL, Ty, Legal);
+  setOperationAction(ISD::OR, Ty, Legal);
+  setOperationAction(ISD::SDIV, Ty, Legal);
+  setOperationAction(ISD::SREM, Ty, Legal);
+  setOperationAction(ISD::SHL, Ty, Legal);
+  setOperationAction(ISD::SRA, Ty, Legal);
+  setOperationAction(ISD::SRL, Ty, Legal);
+  setOperationAction(ISD::SUB, Ty, Legal);
+  setOperationAction(ISD::UDIV, Ty, Legal);
+  setOperationAction(ISD::UREM, Ty, Legal);
+  setOperationAction(ISD::VECTOR_SHUFFLE, Ty, Custom);
+  setOperationAction(ISD::VSELECT, Ty, Legal);
+  setOperationAction(ISD::XOR, Ty, Legal);
+
+  if (Ty == MVT::v4i32 || Ty == MVT::v2i64) {
+    setOperationAction(ISD::FP_TO_SINT, Ty, Legal);
+    setOperationAction(ISD::FP_TO_UINT, Ty, Legal);
+    setOperationAction(ISD::SINT_TO_FP, Ty, Legal);
+    setOperationAction(ISD::UINT_TO_FP, Ty, Legal);
+  }
+
+  setOperationAction(ISD::SETCC, Ty, Legal);
+  setCondCodeAction(ISD::SETNE, Ty, Expand);
+  setCondCodeAction(ISD::SETGE, Ty, Expand);
+  setCondCodeAction(ISD::SETGT, Ty, Expand);
+  setCondCodeAction(ISD::SETUGE, Ty, Expand);
+  setCondCodeAction(ISD::SETUGT, Ty, Expand);
+}
+
+// Enable MSA support for the given floating-point type and Register class.
+void MipsSETargetLowering::
+addMSAFloatType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC) {
+  addRegisterClass(Ty, RC);
+
+  // Expand all builtin opcodes.
+  for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+    setOperationAction(Opc, Ty, Expand);
+
+  setOperationAction(ISD::LOAD, Ty, Legal);
+  setOperationAction(ISD::STORE, Ty, Legal);
+  setOperationAction(ISD::BITCAST, Ty, Legal);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, Ty, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, Ty, Custom);
+
+  if (Ty != MVT::v8f16) {
+    setOperationAction(ISD::FABS,  Ty, Legal);
+    setOperationAction(ISD::FADD,  Ty, Legal);
+    setOperationAction(ISD::FDIV,  Ty, Legal);
+    setOperationAction(ISD::FEXP2, Ty, Legal);
+    setOperationAction(ISD::FLOG2, Ty, Legal);
+    setOperationAction(ISD::FMA,   Ty, Legal);
+    setOperationAction(ISD::FMUL,  Ty, Legal);
+    setOperationAction(ISD::FRINT, Ty, Legal);
+    setOperationAction(ISD::FSQRT, Ty, Legal);
+    setOperationAction(ISD::FSUB,  Ty, Legal);
+    setOperationAction(ISD::VSELECT, Ty, Legal);
+
+    setOperationAction(ISD::SETCC, Ty, Legal);
+    setCondCodeAction(ISD::SETOGE, Ty, Expand);
+    setCondCodeAction(ISD::SETOGT, Ty, Expand);
+    setCondCodeAction(ISD::SETUGE, Ty, Expand);
+    setCondCodeAction(ISD::SETUGT, Ty, Expand);
+    setCondCodeAction(ISD::SETGE,  Ty, Expand);
+    setCondCodeAction(ISD::SETGT,  Ty, Expand);
+  }
+}
 
 bool
 MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
@@ -127,6 +261,8 @@ MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
 SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
                                              SelectionDAG &DAG) const {
   switch(Op.getOpcode()) {
+  case ISD::LOAD:  return lowerLOAD(Op, DAG);
+  case ISD::STORE: return lowerSTORE(Op, DAG);
   case ISD::SMUL_LOHI: return lowerMulDiv(Op, MipsISD::Mult, true, true, DAG);
   case ISD::UMUL_LOHI: return lowerMulDiv(Op, MipsISD::Multu, true, true, DAG);
   case ISD::MULHS:     return lowerMulDiv(Op, MipsISD::Mult, false, true, DAG);
@@ -137,6 +273,10 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
                                           DAG);
   case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::INTRINSIC_W_CHAIN:  return lowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:     return lowerINTRINSIC_VOID(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:       return lowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return lowerVECTOR_SHUFFLE(Op, DAG);
   }
 
   return MipsTargetLowering::LowerOperation(Op, DAG);
@@ -186,10 +326,10 @@ static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
   if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
     return false;
 
-  DebugLoc DL = ADDENode->getDebugLoc();
+  SDLoc DL(ADDENode);
 
   // Initialize accumulator.
-  SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
                                   ADDCNode->getOperand(1),
                                   ADDENode->getOperand(1));
 
@@ -203,15 +343,11 @@ static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) {
 
   // replace uses of adde and addc here
   if (!SDValue(ADDCNode, 0).use_empty()) {
-    SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
-    SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
-                                    LoIdx);
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut);
   }
   if (!SDValue(ADDENode, 0).use_empty()) {
-    SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
-    SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MAdd,
-                                    HiIdx);
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut);
   }
 
@@ -262,10 +398,10 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
   if (!MultHi.hasOneUse() || !MultLo.hasOneUse())
     return false;
 
-  DebugLoc DL = SUBENode->getDebugLoc();
+  SDLoc DL(SUBENode);
 
   // Initialize accumulator.
-  SDValue ACCIn = CurDAG->getNode(MipsISD::InsertLOHI, DL, MVT::Untyped,
+  SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
                                   SUBCNode->getOperand(0),
                                   SUBENode->getOperand(0));
 
@@ -279,15 +415,11 @@ static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) {
 
   // replace uses of sube and subc here
   if (!SDValue(SUBCNode, 0).use_empty()) {
-    SDValue LoIdx = CurDAG->getConstant(Mips::sub_lo, MVT::i32);
-    SDValue LoOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
-                                    LoIdx);
+    SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut);
   }
   if (!SDValue(SUBENode, 0).use_empty()) {
-    SDValue HiIdx = CurDAG->getConstant(Mips::sub_hi, MVT::i32);
-    SDValue HiOut = CurDAG->getNode(MipsISD::ExtractLOHI, DL, MVT::i32, MSub,
-                                    HiIdx);
+    SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub);
     CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut);
   }
 
@@ -307,6 +439,248 @@ static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to zero extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::AND.
+// - Removes redundant zero extensions performed by an ISD::AND.
+static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget *Subtarget) {
+  if (!Subtarget->hasMSA())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  unsigned Op0Opcode = Op0->getOpcode();
+
+  // (and (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d)
+  // where $d + 1 == 2^n and n == 32
+  // or    $d + 1 == 2^n and n <= 32 and ZExt
+  // -> (MipsVExtractZExt $a, $b, $c)
+  if (Op0Opcode == MipsISD::VEXTRACT_SEXT_ELT ||
+      Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT) {
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Op1);
+
+    if (!Mask)
+      return SDValue();
+
+    int32_t Log2IfPositive = (Mask->getAPIntValue() + 1).exactLogBase2();
+
+    if (Log2IfPositive <= 0)
+      return SDValue(); // Mask+1 is not a power of 2
+
+    SDValue Op0Op2 = Op0->getOperand(2);
+    EVT ExtendTy = cast<VTSDNode>(Op0Op2)->getVT();
+    unsigned ExtendTySize = ExtendTy.getSizeInBits();
+    unsigned Log2 = Log2IfPositive;
+
+    if ((Op0Opcode == MipsISD::VEXTRACT_ZEXT_ELT && Log2 >= ExtendTySize) ||
+        Log2 == ExtendTySize) {
+      SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 };
+      DAG.MorphNodeTo(Op0.getNode(), MipsISD::VEXTRACT_ZEXT_ELT,
+                      Op0->getVTList(), Ops, Op0->getNumOperands());
+      return Op0;
+    }
+  }
+
+  return SDValue();
+}
+
+// Determine if the specified node is a constant vector splat.
+//
+// Returns true and sets Imm if:
+// * N is a ISD::BUILD_VECTOR representing a constant splat
+//
+// This function is quite similar to MipsSEDAGToDAGISel::selectVSplat. The
+// differences are that it assumes the MSA has already been checked and the
+// arbitrary requirement for a maximum of 32-bit integers isn't applied (and
+// must not be in order for binsri.d to be selectable).
+static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) {
+  BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N.getNode());
+
+  if (Node == NULL)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
+                             8, !IsLittleEndian))
+    return false;
+
+  Imm = SplatValue;
+
+  return true;
+}
+
+// Test whether the given node is an all-ones build_vector.
+static bool isVectorAllOnes(SDValue N) {
+  // Look through bitcasts. Endianness doesn't matter because we are looking
+  // for an all-ones value.
+  if (N->getOpcode() == ISD::BITCAST)
+    N = N->getOperand(0);
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+
+  if (!BVN)
+    return false;
+
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  // Endianness doesn't matter in this context because we are looking for
+  // an all-ones value.
+  if (BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs))
+    return SplatValue.isAllOnesValue();
+
+  return false;
+}
+
+// Test whether N is the bitwise inverse of OfNode.
+static bool isBitwiseInverse(SDValue N, SDValue OfNode) {
+  if (N->getOpcode() != ISD::XOR)
+    return false;
+
+  if (isVectorAllOnes(N->getOperand(0)))
+    return N->getOperand(1) == OfNode;
+
+  if (isVectorAllOnes(N->getOperand(1)))
+    return N->getOperand(0) == OfNode;
+
+  return false;
+}
+
+// Perform combines where ISD::OR is the root node.
+//
+// Performs the following transformations:
+// - (or (and $a, $mask), (and $b, $inv_mask)) => (vselect $mask, $a, $b)
+//   where $inv_mask is the bitwise inverse of $mask and the 'or' has a 128-bit
+//   vector type.
+static SDValue performORCombine(SDNode *N, SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const MipsSubtarget *Subtarget) {
+  if (!Subtarget->hasMSA())
+    return SDValue();
+
+  EVT Ty = N->getValueType(0);
+
+  if (!Ty.is128BitVector())
+    return SDValue();
+
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  if (Op0->getOpcode() == ISD::AND && Op1->getOpcode() == ISD::AND) {
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+    SDValue Op1Op0 = Op1->getOperand(0);
+    SDValue Op1Op1 = Op1->getOperand(1);
+    bool IsLittleEndian = !Subtarget->isLittle();
+
+    SDValue IfSet, IfClr, Cond;
+    bool IsConstantMask = false;
+    APInt Mask, InvMask;
+
+    // If Op0Op0 is an appropriate mask, try to find it's inverse in either
+    // Op1Op0, or Op1Op1. Keep track of the Cond, IfSet, and IfClr nodes, while
+    // looking.
+    // IfClr will be set if we find a valid match.
+    if (isVSplat(Op0Op0, Mask, IsLittleEndian)) {
+      Cond = Op0Op0;
+      IfSet = Op0Op1;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, and Op0Op1 is an appropriate mask, try the same
+    // thing again using this mask.
+    // IfClr will be set if we find a valid match.
+    if (!IfClr.getNode() && isVSplat(Op0Op1, Mask, IsLittleEndian)) {
+      Cond = Op0Op1;
+      IfSet = Op0Op0;
+
+      if (isVSplat(Op1Op0, InvMask, IsLittleEndian) &&
+          Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op1;
+      else if (isVSplat(Op1Op1, InvMask, IsLittleEndian) &&
+               Mask.getBitWidth() == InvMask.getBitWidth() && Mask == ~InvMask)
+        IfClr = Op1Op0;
+
+      IsConstantMask = true;
+    }
+
+    // If IfClr is not yet set, try looking for a non-constant match.
+    // IfClr will be set if we find a valid match amongst the eight
+    // possibilities.
+    if (!IfClr.getNode()) {
+      if (isBitwiseInverse(Op0Op0, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op0)) {
+        Cond = Op1Op0;
+        IfSet = Op1Op1;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op0Op0, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op1;
+      } else if (isBitwiseInverse(Op0Op1, Op1Op1)) {
+        Cond = Op1Op1;
+        IfSet = Op1Op0;
+        IfClr = Op0Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op0)) {
+        Cond = Op0Op0;
+        IfSet = Op0Op1;
+        IfClr = Op1Op0;
+      } else if (isBitwiseInverse(Op1Op0, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op1;
+      } else if (isBitwiseInverse(Op1Op1, Op0Op1)) {
+        Cond = Op0Op1;
+        IfSet = Op0Op0;
+        IfClr = Op1Op0;
+      }
+    }
+
+    // At this point, IfClr will be set if we have a valid match.
+    if (!IfClr.getNode())
+      return SDValue();
+
+    assert(Cond.getNode() && IfSet.getNode());
+
+    // Fold degenerate cases.
+    if (IsConstantMask) {
+      if (Mask.isAllOnesValue())
+        return IfSet;
+      else if (Mask == 0)
+        return IfClr;
+    }
+
+    // Transform the DAG into an equivalent VSELECT.
+    return DAG.getNode(ISD::VSELECT, SDLoc(N), Ty, Cond, IfClr, IfSet);
+  }
+
+  return SDValue();
+}
+
 static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const MipsSubtarget *Subtarget) {
@@ -320,6 +694,57 @@ static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue genConstMult(SDValue X, uint64_t C, SDLoc DL, EVT VT,
+                            EVT ShiftTy, SelectionDAG &DAG) {
+  // Clear the upper (64 - VT.sizeInBits) bits.
+  C &= ((uint64_t)-1) >> (64 - VT.getSizeInBits());
+
+  // Return 0.
+  if (C == 0)
+    return DAG.getConstant(0, VT);
+
+  // Return x.
+  if (C == 1)
+    return X;
+
+  // If c is power of 2, return (shl x, log2(c)).
+  if (isPowerOf2_64(C))
+    return DAG.getNode(ISD::SHL, DL, VT, X,
+                       DAG.getConstant(Log2_64(C), ShiftTy));
+
+  unsigned Log2Ceil = Log2_64_Ceil(C);
+  uint64_t Floor = 1LL << Log2_64(C);
+  uint64_t Ceil = Log2Ceil == 64 ? 0LL : 1LL << Log2Ceil;
+
+  // If |c - floor_c| <= |c - ceil_c|,
+  // where floor_c = pow(2, floor(log2(c))) and ceil_c = pow(2, ceil(log2(c))),
+  // return (add constMult(x, floor_c), constMult(x, c - floor_c)).
+  if (C - Floor <= Ceil - C) {
+    SDValue Op0 = genConstMult(X, Floor, DL, VT, ShiftTy, DAG);
+    SDValue Op1 = genConstMult(X, C - Floor, DL, VT, ShiftTy, DAG);
+    return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
+  }
+
+  // If |c - floor_c| > |c - ceil_c|,
+  // return (sub constMult(x, ceil_c), constMult(x, ceil_c - c)).
+  SDValue Op0 = genConstMult(X, Ceil, DL, VT, ShiftTy, DAG);
+  SDValue Op1 = genConstMult(X, Ceil - C, DL, VT, ShiftTy, DAG);
+  return DAG.getNode(ISD::SUB, DL, VT, Op0, Op1);
+}
+
+static SDValue performMULCombine(SDNode *N, SelectionDAG &DAG,
+                                 const TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSETargetLowering *TL) {
+  EVT VT = N->getValueType(0);
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    if (!VT.isVector())
+      return genConstMult(N->getOperand(0), C->getZExtValue(), SDLoc(N),
+                          VT, TL->getScalarShiftAmountTy(VT), DAG);
+
+  return SDValue(N, 0);
+}
+
 static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
                                       SelectionDAG &DAG,
                                       const MipsSubtarget *Subtarget) {
@@ -330,6 +755,9 @@ static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
   unsigned EltSize = Ty.getVectorElementType().getSizeInBits();
   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
 
+  if (!Subtarget->hasDSP())
+    return SDValue();
+
   if (!BV ||
       !BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, HasAnyUndefs,
                            EltSize, !Subtarget->isLittle()) ||
@@ -337,7 +765,7 @@ static SDValue performDSPShiftCombine(unsigned Opc, SDNode *N, EVT Ty,
       (SplatValue.getZExtValue() >= EltSize))
     return SDValue();
 
-  return DAG.getNode(Opc, N->getDebugLoc(), Ty, N->getOperand(0),
+  return DAG.getNode(Opc, SDLoc(N), Ty, N->getOperand(0),
                      DAG.getConstant(SplatValue.getZExtValue(), MVT::i32));
 }
 
@@ -352,11 +780,57 @@ static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
   return performDSPShiftCombine(MipsISD::SHLL_DSP, N, Ty, DAG, Subtarget);
 }
 
+// Fold sign-extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT for MSA and fold
+// constant splats into MipsISD::SHRA_DSP for DSPr2.
+//
+// Performs the following transformations:
+// - Changes MipsISD::VEXTRACT_[SZ]EXT_ELT to sign extension if its
+//   sign/zero-extension is completely overwritten by the new one performed by
+//   the ISD::SRA and ISD::SHL nodes.
+// - Removes redundant sign extensions performed by an ISD::SRA and ISD::SHL
+//   sequence.
+//
+// See performDSPShiftCombine for more information about the transformation
+// used for DSPr2.
 static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const MipsSubtarget *Subtarget) {
   EVT Ty = N->getValueType(0);
 
+  if (Subtarget->hasMSA()) {
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+
+    // (sra (shl (MipsVExtract[SZ]Ext $a, $b, $c), imm:$d), imm:$d)
+    // where $d + sizeof($c) == 32
+    // or    $d + sizeof($c) <= 32 and SExt
+    // -> (MipsVExtractSExt $a, $b, $c)
+    if (Op0->getOpcode() == ISD::SHL && Op1 == Op0->getOperand(1)) {
+      SDValue Op0Op0 = Op0->getOperand(0);
+      ConstantSDNode *ShAmount = dyn_cast<ConstantSDNode>(Op1);
+
+      if (!ShAmount)
+        return SDValue();
+
+      if (Op0Op0->getOpcode() != MipsISD::VEXTRACT_SEXT_ELT &&
+          Op0Op0->getOpcode() != MipsISD::VEXTRACT_ZEXT_ELT)
+        return SDValue();
+
+      EVT ExtendTy = cast<VTSDNode>(Op0Op0->getOperand(2))->getVT();
+      unsigned TotalBits = ShAmount->getZExtValue() + ExtendTy.getSizeInBits();
+
+      if (TotalBits == 32 ||
+          (Op0Op0->getOpcode() == MipsISD::VEXTRACT_SEXT_ELT &&
+           TotalBits <= 32)) {
+        SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1),
+                          Op0Op0->getOperand(2) };
+        DAG.MorphNodeTo(Op0Op0.getNode(), MipsISD::VEXTRACT_SEXT_ELT,
+                        Op0Op0->getVTList(), Ops, Op0Op0->getNumOperands());
+        return Op0Op0;
+      }
+    }
+  }
+
   if ((Ty != MVT::v2i16) && ((Ty != MVT::v4i8) || !Subtarget->hasDSPR2()))
     return SDValue();
 
@@ -402,24 +876,91 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   if (!isLegalDSPCondCode(Ty, cast<CondCodeSDNode>(N->getOperand(2))->get()))
     return SDValue();
 
-  return DAG.getNode(MipsISD::SETCC_DSP, N->getDebugLoc(), Ty, N->getOperand(0),
+  return DAG.getNode(MipsISD::SETCC_DSP, SDLoc(N), Ty, N->getOperand(0),
                      N->getOperand(1), N->getOperand(2));
 }
 
 static SDValue performVSELECTCombine(SDNode *N, SelectionDAG &DAG) {
   EVT Ty = N->getValueType(0);
 
-  if ((Ty != MVT::v2i16) && (Ty != MVT::v4i8))
-    return SDValue();
+  if (Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (vselect (setcc $a, $b, SETLT), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $b, $a)) -> (vsmax $a, $b)
+    //   (vselect (setcc $a, $b, SETLT), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETLE), $a, $b)) -> (vsmin $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $b, $a)) -> (vumax $a, $b)
+    //   (vselect (setcc $a, $b, SETULT), $a, $b)) -> (vumin $a, $b)
+    //   (vselect (setcc $a, $b, SETULE), $a, $b)) -> (vumin $a, $b)
+    // SETGT/SETGE/SETUGT/SETUGE variants of these will show up initially but
+    // will be expanded to equivalent SETLT/SETLE/SETULT/SETULE versions by the
+    // legalizer.
+    SDValue Op0 = N->getOperand(0);
+
+    if (Op0->getOpcode() != ISD::SETCC)
+      return SDValue();
+
+    ISD::CondCode CondCode = cast<CondCodeSDNode>(Op0->getOperand(2))->get();
+    bool Signed;
+
+    if (CondCode == ISD::SETLT  || CondCode == ISD::SETLE)
+      Signed = true;
+    else if (CondCode == ISD::SETULT || CondCode == ISD::SETULE)
+      Signed = false;
+    else
+      return SDValue();
+
+    SDValue Op1 = N->getOperand(1);
+    SDValue Op2 = N->getOperand(2);
+    SDValue Op0Op0 = Op0->getOperand(0);
+    SDValue Op0Op1 = Op0->getOperand(1);
+
+    if (Op1 == Op0Op0 && Op2 == Op0Op1)
+      return DAG.getNode(Signed ? MipsISD::VSMIN : MipsISD::VUMIN, SDLoc(N),
+                         Ty, Op1, Op2);
+    else if (Op1 == Op0Op1 && Op2 == Op0Op0)
+      return DAG.getNode(Signed ? MipsISD::VSMAX : MipsISD::VUMAX, SDLoc(N),
+                         Ty, Op1, Op2);
+  } else if ((Ty == MVT::v2i16) || (Ty == MVT::v4i8)) {
+    SDValue SetCC = N->getOperand(0);
+
+    if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
+      return SDValue();
+
+    return DAG.getNode(MipsISD::SELECT_CC_DSP, SDLoc(N), Ty,
+                       SetCC.getOperand(0), SetCC.getOperand(1),
+                       N->getOperand(1), N->getOperand(2), SetCC.getOperand(2));
+  }
 
-  SDValue SetCC = N->getOperand(0);
+  return SDValue();
+}
 
-  if (SetCC.getOpcode() != MipsISD::SETCC_DSP)
-    return SDValue();
+static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
+                                 const MipsSubtarget *Subtarget) {
+  EVT Ty = N->getValueType(0);
 
-  return DAG.getNode(MipsISD::SELECT_CC_DSP, N->getDebugLoc(), Ty,
-                     SetCC.getOperand(0), SetCC.getOperand(1), N->getOperand(1),
-                     N->getOperand(2), SetCC.getOperand(2));
+  if (Subtarget->hasMSA() && Ty.is128BitVector() && Ty.isInteger()) {
+    // Try the following combines:
+    //   (xor (or $a, $b), (build_vector allones))
+    //   (xor (or $a, $b), (bitcast (build_vector allones)))
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    SDValue NotOp;
+
+    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+      NotOp = Op1;
+    else if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+      NotOp = Op0;
+    else
+      return SDValue();
+
+    if (NotOp->getOpcode() == ISD::OR)
+      return DAG.getNode(MipsISD::VNOR, SDLoc(N), Ty, NotOp->getOperand(0),
+                         NotOp->getOperand(1));
+  }
+
+  return SDValue();
 }
 
 SDValue
@@ -430,8 +971,16 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   case ISD::ADDE:
     return performADDECombine(N, DAG, DCI, Subtarget);
+  case ISD::AND:
+    Val = performANDCombine(N, DAG, DCI, Subtarget);
+    break;
+  case ISD::OR:
+    Val = performORCombine(N, DAG, DCI, Subtarget);
+    break;
   case ISD::SUBE:
     return performSUBECombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMULCombine(N, DAG, DCI, this);
   case ISD::SHL:
     return performSHLCombine(N, DAG, DCI, Subtarget);
   case ISD::SRA:
@@ -440,14 +989,22 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
     return performSRLCombine(N, DAG, DCI, Subtarget);
   case ISD::VSELECT:
     return performVSELECTCombine(N, DAG);
-  case ISD::SETCC: {
+  case ISD::XOR:
+    Val = performXORCombine(N, DAG, Subtarget);
+    break;
+  case ISD::SETCC:
     Val = performSETCCCombine(N, DAG);
     break;
   }
-  }
 
-  if (Val.getNode())
+  if (Val.getNode()) {
+    DEBUG(dbgs() << "\nMipsSE DAG Combine:\n";
+          N->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n=> \n";
+          Val.getNode()->printrWithDepth(dbgs(), &DAG);
+          dbgs() << "\n");
     return Val;
+  }
 
   return MipsTargetLowering::PerformDAGCombine(N, DCI);
 }
@@ -460,6 +1017,42 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return MipsTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case Mips::BPOSGE32_PSEUDO:
     return emitBPOSGE32(MI, BB);
+  case Mips::SNZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_B);
+  case Mips::SNZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_H);
+  case Mips::SNZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_W);
+  case Mips::SNZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_D);
+  case Mips::SNZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BNZ_V);
+  case Mips::SZ_B_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_B);
+  case Mips::SZ_H_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_H);
+  case Mips::SZ_W_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_W);
+  case Mips::SZ_D_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_D);
+  case Mips::SZ_V_PSEUDO:
+    return emitMSACBranchPseudo(MI, BB, Mips::BZ_V);
+  case Mips::COPY_FW_PSEUDO:
+    return emitCOPY_FW(MI, BB);
+  case Mips::COPY_FD_PSEUDO:
+    return emitCOPY_FD(MI, BB);
+  case Mips::INSERT_FW_PSEUDO:
+    return emitINSERT_FW(MI, BB);
+  case Mips::INSERT_FD_PSEUDO:
+    return emitINSERT_FD(MI, BB);
+  case Mips::FILL_FW_PSEUDO:
+    return emitFILL_FW(MI, BB);
+  case Mips::FILL_FD_PSEUDO:
+    return emitFILL_FD(MI, BB);
+  case Mips::FEXP2_W_1_PSEUDO:
+    return emitFEXP2_W_1(MI, BB);
+  case Mips::FEXP2_D_1_PSEUDO:
+    return emitFEXP2_D_1(MI, BB);
   }
 }
 
@@ -496,21 +1089,81 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
                                   InternalLinkage, CLI, Callee, Chain);
 }
 
+SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  LoadSDNode &Nd = *cast<LoadSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerLOAD(Op, DAG);
+
+  // Replace a double precision load with two i32 loads and a buildpair64.
+  SDLoc DL(Op);
+  SDValue Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+
+  // i32 load from lower address.
+  SDValue Lo = DAG.getLoad(MVT::i32, DL, Chain, Ptr,
+                           MachinePointerInfo(), Nd.isVolatile(),
+                           Nd.isNonTemporal(), Nd.isInvariant(),
+                           Nd.getAlignment());
+
+  // i32 load from higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+  SDValue Hi = DAG.getLoad(MVT::i32, DL, Lo.getValue(1), Ptr,
+                           MachinePointerInfo(), Nd.isVolatile(),
+                           Nd.isNonTemporal(), Nd.isInvariant(),
+                           std::min(Nd.getAlignment(), 4U));
+
+  if (!Subtarget->isLittle())
+    std::swap(Lo, Hi);
+
+  SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
+  SDValue Ops[2] = {BP, Hi.getValue(1)};
+  return DAG.getMergeValues(Ops, 2, DL);
+}
+
+SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  StoreSDNode &Nd = *cast<StoreSDNode>(Op);
+
+  if (Nd.getMemoryVT() != MVT::f64 || !NoDPLoadStore)
+    return MipsTargetLowering::lowerSTORE(Op, DAG);
+
+  // Replace a double precision store with two extractelement64s and i32 stores.
+  SDLoc DL(Op);
+  SDValue Val = Nd.getValue(), Ptr = Nd.getBasePtr(), Chain = Nd.getChain();
+  EVT PtrVT = Ptr.getValueType();
+  SDValue Lo = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(0, MVT::i32));
+  SDValue Hi = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
+                           Val, DAG.getConstant(1, MVT::i32));
+
+  if (!Subtarget->isLittle())
+    std::swap(Lo, Hi);
+
+  // i32 store to lower address.
+  Chain = DAG.getStore(Chain, DL, Lo, Ptr, MachinePointerInfo(),
+                       Nd.isVolatile(), Nd.isNonTemporal(), Nd.getAlignment(),
+                       Nd.getTBAAInfo());
+
+  // i32 store to higher address.
+  Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Ptr, DAG.getConstant(4, PtrVT));
+  return DAG.getStore(Chain, DL, Hi, Ptr, MachinePointerInfo(),
+                      Nd.isVolatile(), Nd.isNonTemporal(),
+                      std::min(Nd.getAlignment(), 4U), Nd.getTBAAInfo());
+}
+
 SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
                                           bool HasLo, bool HasHi,
                                           SelectionDAG &DAG) const {
   EVT Ty = Op.getOperand(0).getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
                              Op.getOperand(0), Op.getOperand(1));
   SDValue Lo, Hi;
 
   if (HasLo)
-    Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
-                     DAG.getConstant(Mips::sub_lo, MVT::i32));
+    Lo = DAG.getNode(MipsISD::MFLO, DL, Ty, Mult);
   if (HasHi)
-    Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, Ty, Mult,
-                     DAG.getConstant(Mips::sub_hi, MVT::i32));
+    Hi = DAG.getNode(MipsISD::MFHI, DL, Ty, Mult);
 
   if (!HasLo || !HasHi)
     return HasLo ? Lo : Hi;
@@ -520,19 +1173,17 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
 }
 
 
-static SDValue initAccumulator(SDValue In, DebugLoc DL, SelectionDAG &DAG) {
+static SDValue initAccumulator(SDValue In, SDLoc DL, SelectionDAG &DAG) {
   SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
                              DAG.getConstant(0, MVT::i32));
   SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
                              DAG.getConstant(1, MVT::i32));
-  return DAG.getNode(MipsISD::InsertLOHI, DL, MVT::Untyped, InLo, InHi);
+  return DAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped, InLo, InHi);
 }
 
-static SDValue extractLOHI(SDValue Op, DebugLoc DL, SelectionDAG &DAG) {
-  SDValue Lo = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
-                           DAG.getConstant(Mips::sub_lo, MVT::i32));
-  SDValue Hi = DAG.getNode(MipsISD::ExtractLOHI, DL, MVT::i32, Op,
-                           DAG.getConstant(Mips::sub_hi, MVT::i32));
+static SDValue extractLOHI(SDValue Op, SDLoc DL, SelectionDAG &DAG) {
+  SDValue Lo = DAG.getNode(MipsISD::MFLO, DL, MVT::i32, Op);
+  SDValue Hi = DAG.getNode(MipsISD::MFHI, DL, MVT::i32, Op);
   return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Lo, Hi);
 }
 
@@ -549,7 +1200,7 @@ static SDValue extractLOHI(SDValue Op, DebugLoc DL, SelectionDAG &DAG) {
 // out64 = merge-values (v0, v1)
 //
 static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
   SmallVector<SDValue, 3> Ops;
   unsigned OpNo = 0;
@@ -596,8 +1247,156 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
   return DAG.getMergeValues(Vals, 2, DL);
 }
 
+// Lower an MSA copy intrinsic into the specified SelectionDAG node
+static SDValue lowerMSACopyIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
+  SDLoc DL(Op);
+  SDValue Vec = Op->getOperand(1);
+  SDValue Idx = Op->getOperand(2);
+  EVT ResTy = Op->getValueType(0);
+  EVT EltTy = Vec->getValueType(0).getVectorElementType();
+
+  SDValue Result = DAG.getNode(Opc, DL, ResTy, Vec, Idx,
+                               DAG.getValueType(EltTy));
+
+  return Result;
+}
+
+static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
+  EVT ResVecTy = Op->getValueType(0);
+  EVT ViaVecTy = ResVecTy;
+  SDLoc DL(Op);
+
+  // When ResVecTy == MVT::v2i64, LaneA is the upper 32 bits of the lane and
+  // LaneB is the lower 32-bits. Otherwise LaneA and LaneB are alternating
+  // lanes.
+  SDValue LaneA;
+  SDValue LaneB = Op->getOperand(2);
+
+  if (ResVecTy == MVT::v2i64) {
+    LaneA = DAG.getConstant(0, MVT::i32);
+    ViaVecTy = MVT::v4i32;
+  } else
+    LaneA = LaneB;
+
+  SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
+                      LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+                               ViaVecTy.getVectorNumElements());
+
+  if (ViaVecTy != ResVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSASplatImm(SDValue Op, unsigned ImmOp, SelectionDAG &DAG) {
+  return DAG.getConstant(Op->getConstantOperandVal(ImmOp), Op->getValueType(0));
+}
+
+static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
+                                   bool BigEndian, SelectionDAG &DAG) {
+  EVT ViaVecTy = VecTy;
+  SDValue SplatValueA = SplatValue;
+  SDValue SplatValueB = SplatValue;
+  SDLoc DL(SplatValue);
+
+  if (VecTy == MVT::v2i64) {
+    // v2i64 BUILD_VECTOR must be performed via v4i32 so split into i32's.
+    ViaVecTy = MVT::v4i32;
+
+    SplatValueA = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValue);
+    SplatValueB = DAG.getNode(ISD::SRL, DL, MVT::i64, SplatValue,
+                              DAG.getConstant(32, MVT::i32));
+    SplatValueB = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, SplatValueB);
+  }
+
+  // We currently hold the parts in little endian order. Swap them if
+  // necessary.
+  if (BigEndian)
+    std::swap(SplatValueA, SplatValueB);
+
+  SDValue Ops[16] = { SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB,
+                      SplatValueA, SplatValueB, SplatValueA, SplatValueB };
+
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
+                               ViaVecTy.getVectorNumElements());
+
+  if (VecTy != ViaVecTy)
+    Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
+
+  return Result;
+}
+
+static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
+                                        unsigned Opc, SDValue Imm,
+                                        bool BigEndian) {
+  EVT VecTy = Op->getValueType(0);
+  SDValue Exp2Imm;
+  SDLoc DL(Op);
+
+  // The DAG Combiner can't constant fold bitcasted vectors yet so we must do it
+  // here for now.
+  if (VecTy == MVT::v2i64) {
+    if (ConstantSDNode *CImm = dyn_cast<ConstantSDNode>(Imm)) {
+      APInt BitImm = APInt(64, 1) << CImm->getAPIntValue();
+
+      SDValue BitImmHiOp = DAG.getConstant(BitImm.lshr(32).trunc(32), MVT::i32);
+      SDValue BitImmLoOp = DAG.getConstant(BitImm.trunc(32), MVT::i32);
+
+      if (BigEndian)
+        std::swap(BitImmLoOp, BitImmHiOp);
+
+      Exp2Imm =
+          DAG.getNode(ISD::BITCAST, DL, MVT::v2i64,
+                      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, BitImmLoOp,
+                                  BitImmHiOp, BitImmLoOp, BitImmHiOp));
+    }
+  }
+
+  if (Exp2Imm.getNode() == NULL) {
+    // We couldnt constant fold, do a vector shift instead
+
+    // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
+    // only values 0-63 are valid.
+    if (VecTy == MVT::v2i64)
+      Imm = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Imm);
+
+    Exp2Imm = getBuildVectorSplat(VecTy, Imm, BigEndian, DAG);
+
+    Exp2Imm =
+        DAG.getNode(ISD::SHL, DL, VecTy, DAG.getConstant(1, VecTy), Exp2Imm);
+  }
+
+  return DAG.getNode(Opc, DL, VecTy, Op->getOperand(1), Exp2Imm);
+}
+
+static SDValue lowerMSABitClear(SDValue Op, SelectionDAG &DAG) {
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  SDValue One = DAG.getConstant(1, ResTy);
+  SDValue Bit = DAG.getNode(ISD::SHL, DL, ResTy, One, Op->getOperand(2));
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1),
+                     DAG.getNOT(DL, Bit, ResTy));
+}
+
+static SDValue lowerMSABitClearImm(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  APInt BitImm = APInt(ResTy.getVectorElementType().getSizeInBits(), 1)
+                 << cast<ConstantSDNode>(Op->getOperand(2))->getAPIntValue();
+  SDValue BitMask = DAG.getConstant(~BitImm, ResTy);
+
+  return DAG.getNode(ISD::AND, DL, ResTy, Op->getOperand(1), BitMask);
+}
+
 SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                       SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+
   switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
   default:
     return SDValue();
@@ -633,12 +1432,610 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     return lowerDSPIntr(Op, DAG, MipsISD::MSub);
   case Intrinsic::mips_msubu:
     return lowerDSPIntr(Op, DAG, MipsISD::MSubu);
+  case Intrinsic::mips_addv_b:
+  case Intrinsic::mips_addv_h:
+  case Intrinsic::mips_addv_w:
+  case Intrinsic::mips_addv_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_addvi_b:
+  case Intrinsic::mips_addvi_h:
+  case Intrinsic::mips_addvi_w:
+  case Intrinsic::mips_addvi_d:
+    return DAG.getNode(ISD::ADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_and_v:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_andi_b:
+    return DAG.getNode(ISD::AND, DL, Op->getValueType(0), Op->getOperand(1),
+                       lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_bclr_b:
+  case Intrinsic::mips_bclr_h:
+  case Intrinsic::mips_bclr_w:
+  case Intrinsic::mips_bclr_d:
+    return lowerMSABitClear(Op, DAG);
+  case Intrinsic::mips_bclri_b:
+  case Intrinsic::mips_bclri_h:
+  case Intrinsic::mips_bclri_w:
+  case Intrinsic::mips_bclri_d:
+    return lowerMSABitClearImm(Op, DAG);
+  case Intrinsic::mips_binsli_b:
+  case Intrinsic::mips_binsli_h:
+  case Intrinsic::mips_binsli_w:
+  case Intrinsic::mips_binsli_d: {
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
+                                       Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  case Intrinsic::mips_binsri_b:
+  case Intrinsic::mips_binsri_h:
+  case Intrinsic::mips_binsri_w:
+  case Intrinsic::mips_binsri_d: {
+    EVT VecTy = Op->getValueType(0);
+    EVT EltTy = VecTy.getVectorElementType();
+    APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
+                                      Op->getConstantOperandVal(3));
+    return DAG.getNode(ISD::VSELECT, DL, VecTy,
+                       DAG.getConstant(Mask, VecTy, true), Op->getOperand(1),
+                       Op->getOperand(2));
+  }
+  case Intrinsic::mips_bmnz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_bmnzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(2),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bmz_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0), Op->getOperand(3),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_bmzi_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 3, DAG), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_bneg_b:
+  case Intrinsic::mips_bneg_h:
+  case Intrinsic::mips_bneg_w:
+  case Intrinsic::mips_bneg_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, VecTy);
+
+    return DAG.getNode(ISD::XOR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bnegi_b:
+  case Intrinsic::mips_bnegi_h:
+  case Intrinsic::mips_bnegi_w:
+  case Intrinsic::mips_bnegi_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::XOR, Op->getOperand(2),
+                                    !Subtarget->isLittle());
+  case Intrinsic::mips_bnz_b:
+  case Intrinsic::mips_bnz_h:
+  case Intrinsic::mips_bnz_w:
+  case Intrinsic::mips_bnz_d:
+    return DAG.getNode(MipsISD::VALL_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bnz_v:
+    return DAG.getNode(MipsISD::VANY_NONZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bsel_v:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2),
+                       Op->getOperand(3));
+  case Intrinsic::mips_bseli_b:
+    return DAG.getNode(ISD::VSELECT, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2),
+                       lowerMSASplatImm(Op, 3, DAG));
+  case Intrinsic::mips_bset_b:
+  case Intrinsic::mips_bset_h:
+  case Intrinsic::mips_bset_w:
+  case Intrinsic::mips_bset_d: {
+    EVT VecTy = Op->getValueType(0);
+    SDValue One = DAG.getConstant(1, VecTy);
+
+    return DAG.getNode(ISD::OR, DL, VecTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, DL, VecTy, One,
+                                   Op->getOperand(2)));
+  }
+  case Intrinsic::mips_bseti_b:
+  case Intrinsic::mips_bseti_h:
+  case Intrinsic::mips_bseti_w:
+  case Intrinsic::mips_bseti_d:
+    return lowerMSABinaryBitImmIntr(Op, DAG, ISD::OR, Op->getOperand(2),
+                                    !Subtarget->isLittle());
+  case Intrinsic::mips_bz_b:
+  case Intrinsic::mips_bz_h:
+  case Intrinsic::mips_bz_w:
+  case Intrinsic::mips_bz_d:
+    return DAG.getNode(MipsISD::VALL_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_bz_v:
+    return DAG.getNode(MipsISD::VANY_ZERO, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ceq_b:
+  case Intrinsic::mips_ceq_h:
+  case Intrinsic::mips_ceq_w:
+  case Intrinsic::mips_ceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETEQ);
+  case Intrinsic::mips_ceqi_b:
+  case Intrinsic::mips_ceqi_h:
+  case Intrinsic::mips_ceqi_w:
+  case Intrinsic::mips_ceqi_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETEQ);
+  case Intrinsic::mips_cle_s_b:
+  case Intrinsic::mips_cle_s_h:
+  case Intrinsic::mips_cle_s_w:
+  case Intrinsic::mips_cle_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLE);
+  case Intrinsic::mips_clei_s_b:
+  case Intrinsic::mips_clei_s_h:
+  case Intrinsic::mips_clei_s_w:
+  case Intrinsic::mips_clei_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLE);
+  case Intrinsic::mips_cle_u_b:
+  case Intrinsic::mips_cle_u_h:
+  case Intrinsic::mips_cle_u_w:
+  case Intrinsic::mips_cle_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_clei_u_b:
+  case Intrinsic::mips_clei_u_h:
+  case Intrinsic::mips_clei_u_w:
+  case Intrinsic::mips_clei_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULE);
+  case Intrinsic::mips_clt_s_b:
+  case Intrinsic::mips_clt_s_h:
+  case Intrinsic::mips_clt_s_w:
+  case Intrinsic::mips_clt_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETLT);
+  case Intrinsic::mips_clti_s_b:
+  case Intrinsic::mips_clti_s_h:
+  case Intrinsic::mips_clti_s_w:
+  case Intrinsic::mips_clti_s_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETLT);
+  case Intrinsic::mips_clt_u_b:
+  case Intrinsic::mips_clt_u_h:
+  case Intrinsic::mips_clt_u_w:
+  case Intrinsic::mips_clt_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_clti_u_b:
+  case Intrinsic::mips_clti_u_h:
+  case Intrinsic::mips_clti_u_w:
+  case Intrinsic::mips_clti_u_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        lowerMSASplatImm(Op, 2, DAG), ISD::SETULT);
+  case Intrinsic::mips_copy_s_b:
+  case Intrinsic::mips_copy_s_h:
+  case Intrinsic::mips_copy_s_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_SEXT_ELT);
+  case Intrinsic::mips_copy_s_d:
+    // Don't lower directly into VEXTRACT_SEXT_ELT since i64 might be illegal.
+    // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
+    // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_copy_u_b:
+  case Intrinsic::mips_copy_u_h:
+  case Intrinsic::mips_copy_u_w:
+    return lowerMSACopyIntr(Op, DAG, MipsISD::VEXTRACT_ZEXT_ELT);
+  case Intrinsic::mips_copy_u_d:
+    // Don't lower directly into VEXTRACT_ZEXT_ELT since i64 might be illegal.
+    // Instead lower to the generic EXTRACT_VECTOR_ELT node and let the type
+    // legalizer and EXTRACT_VECTOR_ELT lowering sort it out.
+    //
+    // Note: When i64 is illegal, this results in copy_s.w instructions instead
+    // of copy_u.w instructions. This makes no difference to the behaviour
+    // since i64 is only illegal when the register file is 32-bit.
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_div_s_b:
+  case Intrinsic::mips_div_s_h:
+  case Intrinsic::mips_div_s_w:
+  case Intrinsic::mips_div_s_d:
+    return DAG.getNode(ISD::SDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_div_u_b:
+  case Intrinsic::mips_div_u_h:
+  case Intrinsic::mips_div_u_w:
+  case Intrinsic::mips_div_u_d:
+    return DAG.getNode(ISD::UDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_fadd_w:
+  case Intrinsic::mips_fadd_d:
+    return DAG.getNode(ISD::FADD, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  // Don't lower mips_fcaf_[wd] since LLVM folds SETFALSE condcodes away
+  case Intrinsic::mips_fceq_w:
+  case Intrinsic::mips_fceq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOEQ);
+  case Intrinsic::mips_fcle_w:
+  case Intrinsic::mips_fcle_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLE);
+  case Intrinsic::mips_fclt_w:
+  case Intrinsic::mips_fclt_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETOLT);
+  case Intrinsic::mips_fcne_w:
+  case Intrinsic::mips_fcne_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETONE);
+  case Intrinsic::mips_fcor_w:
+  case Intrinsic::mips_fcor_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETO);
+  case Intrinsic::mips_fcueq_w:
+  case Intrinsic::mips_fcueq_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUEQ);
+  case Intrinsic::mips_fcule_w:
+  case Intrinsic::mips_fcule_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULE);
+  case Intrinsic::mips_fcult_w:
+  case Intrinsic::mips_fcult_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETULT);
+  case Intrinsic::mips_fcun_w:
+  case Intrinsic::mips_fcun_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUO);
+  case Intrinsic::mips_fcune_w:
+  case Intrinsic::mips_fcune_d:
+    return DAG.getSetCC(DL, Op->getValueType(0), Op->getOperand(1),
+                        Op->getOperand(2), ISD::SETUNE);
+  case Intrinsic::mips_fdiv_w:
+  case Intrinsic::mips_fdiv_d:
+    return DAG.getNode(ISD::FDIV, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ffint_u_w:
+  case Intrinsic::mips_ffint_u_d:
+    return DAG.getNode(ISD::UINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ffint_s_w:
+  case Intrinsic::mips_ffint_s_d:
+    return DAG.getNode(ISD::SINT_TO_FP, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_fill_b:
+  case Intrinsic::mips_fill_h:
+  case Intrinsic::mips_fill_w:
+  case Intrinsic::mips_fill_d: {
+    SmallVector<SDValue, 16> Ops;
+    EVT ResTy = Op->getValueType(0);
+
+    for (unsigned i = 0; i < ResTy.getVectorNumElements(); ++i)
+      Ops.push_back(Op->getOperand(1));
+
+    // If ResTy is v2i64 then the type legalizer will break this node down into
+    // an equivalent v4i32.
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, &Ops[0], Ops.size());
+  }
+  case Intrinsic::mips_fexp2_w:
+  case Intrinsic::mips_fexp2_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(
+        ISD::FMUL, SDLoc(Op), ResTy, Op->getOperand(1),
+        DAG.getNode(ISD::FEXP2, SDLoc(Op), ResTy, Op->getOperand(2)));
+  }
+  case Intrinsic::mips_flog2_w:
+  case Intrinsic::mips_flog2_d:
+    return DAG.getNode(ISD::FLOG2, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fmadd_w:
+  case Intrinsic::mips_fmadd_d:
+    return DAG.getNode(ISD::FMA, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_fmul_w:
+  case Intrinsic::mips_fmul_d:
+    return DAG.getNode(ISD::FMUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_fmsub_w:
+  case Intrinsic::mips_fmsub_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::FSUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::FMUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_frint_w:
+  case Intrinsic::mips_frint_d:
+    return DAG.getNode(ISD::FRINT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsqrt_w:
+  case Intrinsic::mips_fsqrt_d:
+    return DAG.getNode(ISD::FSQRT, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_fsub_w:
+  case Intrinsic::mips_fsub_d:
+    return DAG.getNode(ISD::FSUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ftrunc_u_w:
+  case Intrinsic::mips_ftrunc_u_d:
+    return DAG.getNode(ISD::FP_TO_UINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ftrunc_s_w:
+  case Intrinsic::mips_ftrunc_s_d:
+    return DAG.getNode(ISD::FP_TO_SINT, DL, Op->getValueType(0),
+                       Op->getOperand(1));
+  case Intrinsic::mips_ilvev_b:
+  case Intrinsic::mips_ilvev_h:
+  case Intrinsic::mips_ilvev_w:
+  case Intrinsic::mips_ilvev_d:
+    return DAG.getNode(MipsISD::ILVEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvl_b:
+  case Intrinsic::mips_ilvl_h:
+  case Intrinsic::mips_ilvl_w:
+  case Intrinsic::mips_ilvl_d:
+    return DAG.getNode(MipsISD::ILVL, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvod_b:
+  case Intrinsic::mips_ilvod_h:
+  case Intrinsic::mips_ilvod_w:
+  case Intrinsic::mips_ilvod_d:
+    return DAG.getNode(MipsISD::ILVOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_ilvr_b:
+  case Intrinsic::mips_ilvr_h:
+  case Intrinsic::mips_ilvr_w:
+  case Intrinsic::mips_ilvr_d:
+    return DAG.getNode(MipsISD::ILVR, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_insert_b:
+  case Intrinsic::mips_insert_h:
+  case Intrinsic::mips_insert_w:
+  case Intrinsic::mips_insert_d:
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(3), Op->getOperand(2));
+  case Intrinsic::mips_ldi_b:
+  case Intrinsic::mips_ldi_h:
+  case Intrinsic::mips_ldi_w:
+  case Intrinsic::mips_ldi_d:
+    return lowerMSASplatImm(Op, 1, DAG);
+  case Intrinsic::mips_lsa: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::SHL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_maddv_b:
+  case Intrinsic::mips_maddv_h:
+  case Intrinsic::mips_maddv_w:
+  case Intrinsic::mips_maddv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::ADD, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_max_s_b:
+  case Intrinsic::mips_max_s_h:
+  case Intrinsic::mips_max_s_w:
+  case Intrinsic::mips_max_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_max_u_b:
+  case Intrinsic::mips_max_u_h:
+  case Intrinsic::mips_max_u_w:
+  case Intrinsic::mips_max_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_maxi_s_b:
+  case Intrinsic::mips_maxi_s_h:
+  case Intrinsic::mips_maxi_s_w:
+  case Intrinsic::mips_maxi_s_d:
+    return DAG.getNode(MipsISD::VSMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_maxi_u_b:
+  case Intrinsic::mips_maxi_u_h:
+  case Intrinsic::mips_maxi_u_w:
+  case Intrinsic::mips_maxi_u_d:
+    return DAG.getNode(MipsISD::VUMAX, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_min_s_b:
+  case Intrinsic::mips_min_s_h:
+  case Intrinsic::mips_min_s_w:
+  case Intrinsic::mips_min_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_min_u_b:
+  case Intrinsic::mips_min_u_h:
+  case Intrinsic::mips_min_u_w:
+  case Intrinsic::mips_min_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_mini_s_b:
+  case Intrinsic::mips_mini_s_h:
+  case Intrinsic::mips_mini_s_w:
+  case Intrinsic::mips_mini_s_d:
+    return DAG.getNode(MipsISD::VSMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mini_u_b:
+  case Intrinsic::mips_mini_u_h:
+  case Intrinsic::mips_mini_u_w:
+  case Intrinsic::mips_mini_u_d:
+    return DAG.getNode(MipsISD::VUMIN, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_mod_s_b:
+  case Intrinsic::mips_mod_s_h:
+  case Intrinsic::mips_mod_s_w:
+  case Intrinsic::mips_mod_s_d:
+    return DAG.getNode(ISD::SREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mod_u_b:
+  case Intrinsic::mips_mod_u_h:
+  case Intrinsic::mips_mod_u_w:
+  case Intrinsic::mips_mod_u_d:
+    return DAG.getNode(ISD::UREM, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_mulv_b:
+  case Intrinsic::mips_mulv_h:
+  case Intrinsic::mips_mulv_w:
+  case Intrinsic::mips_mulv_d:
+    return DAG.getNode(ISD::MUL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_msubv_b:
+  case Intrinsic::mips_msubv_h:
+  case Intrinsic::mips_msubv_w:
+  case Intrinsic::mips_msubv_d: {
+    EVT ResTy = Op->getValueType(0);
+    return DAG.getNode(ISD::SUB, SDLoc(Op), ResTy, Op->getOperand(1),
+                       DAG.getNode(ISD::MUL, SDLoc(Op), ResTy,
+                                   Op->getOperand(2), Op->getOperand(3)));
+  }
+  case Intrinsic::mips_nlzc_b:
+  case Intrinsic::mips_nlzc_h:
+  case Intrinsic::mips_nlzc_w:
+  case Intrinsic::mips_nlzc_d:
+    return DAG.getNode(ISD::CTLZ, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_nor_v: {
+    SDValue Res = DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                              Op->getOperand(1), Op->getOperand(2));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_nori_b: {
+    SDValue Res =  DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                               Op->getOperand(1),
+                               lowerMSASplatImm(Op, 2, DAG));
+    return DAG.getNOT(DL, Res, Res->getValueType(0));
+  }
+  case Intrinsic::mips_or_v:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_ori_b:
+    return DAG.getNode(ISD::OR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_pckev_b:
+  case Intrinsic::mips_pckev_h:
+  case Intrinsic::mips_pckev_w:
+  case Intrinsic::mips_pckev_d:
+    return DAG.getNode(MipsISD::PCKEV, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pckod_b:
+  case Intrinsic::mips_pckod_h:
+  case Intrinsic::mips_pckod_w:
+  case Intrinsic::mips_pckod_d:
+    return DAG.getNode(MipsISD::PCKOD, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2));
+  case Intrinsic::mips_pcnt_b:
+  case Intrinsic::mips_pcnt_h:
+  case Intrinsic::mips_pcnt_w:
+  case Intrinsic::mips_pcnt_d:
+    return DAG.getNode(ISD::CTPOP, DL, Op->getValueType(0), Op->getOperand(1));
+  case Intrinsic::mips_shf_b:
+  case Intrinsic::mips_shf_h:
+  case Intrinsic::mips_shf_w:
+    return DAG.getNode(MipsISD::SHF, DL, Op->getValueType(0),
+                       Op->getOperand(2), Op->getOperand(1));
+  case Intrinsic::mips_sll_b:
+  case Intrinsic::mips_sll_h:
+  case Intrinsic::mips_sll_w:
+  case Intrinsic::mips_sll_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_slli_b:
+  case Intrinsic::mips_slli_h:
+  case Intrinsic::mips_slli_w:
+  case Intrinsic::mips_slli_d:
+    return DAG.getNode(ISD::SHL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_splat_b:
+  case Intrinsic::mips_splat_h:
+  case Intrinsic::mips_splat_w:
+  case Intrinsic::mips_splat_d:
+    // We can't lower via VECTOR_SHUFFLE because it requires constant shuffle
+    // masks, nor can we lower via BUILD_VECTOR & EXTRACT_VECTOR_ELT because
+    // EXTRACT_VECTOR_ELT can't extract i64's on MIPS32.
+    // Instead we lower to MipsISD::VSHF and match from there.
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatZExt(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_splati_b:
+  case Intrinsic::mips_splati_h:
+  case Intrinsic::mips_splati_w:
+  case Intrinsic::mips_splati_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       lowerMSASplatImm(Op, 2, DAG), Op->getOperand(1),
+                       Op->getOperand(1));
+  case Intrinsic::mips_sra_b:
+  case Intrinsic::mips_sra_h:
+  case Intrinsic::mips_sra_w:
+  case Intrinsic::mips_sra_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srai_b:
+  case Intrinsic::mips_srai_h:
+  case Intrinsic::mips_srai_w:
+  case Intrinsic::mips_srai_d:
+    return DAG.getNode(ISD::SRA, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_srl_b:
+  case Intrinsic::mips_srl_h:
+  case Intrinsic::mips_srl_w:
+  case Intrinsic::mips_srl_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_srli_b:
+  case Intrinsic::mips_srli_h:
+  case Intrinsic::mips_srli_w:
+  case Intrinsic::mips_srli_d:
+    return DAG.getNode(ISD::SRL, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_subv_b:
+  case Intrinsic::mips_subv_h:
+  case Intrinsic::mips_subv_w:
+  case Intrinsic::mips_subv_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_subvi_b:
+  case Intrinsic::mips_subvi_h:
+  case Intrinsic::mips_subvi_w:
+  case Intrinsic::mips_subvi_d:
+    return DAG.getNode(ISD::SUB, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
+  case Intrinsic::mips_vshf_b:
+  case Intrinsic::mips_vshf_h:
+  case Intrinsic::mips_vshf_w:
+  case Intrinsic::mips_vshf_d:
+    return DAG.getNode(MipsISD::VSHF, DL, Op->getValueType(0),
+                       Op->getOperand(1), Op->getOperand(2), Op->getOperand(3));
+  case Intrinsic::mips_xor_v:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0), Op->getOperand(1),
+                       Op->getOperand(2));
+  case Intrinsic::mips_xori_b:
+    return DAG.getNode(ISD::XOR, DL, Op->getValueType(0),
+                       Op->getOperand(1), lowerMSASplatImm(Op, 2, DAG));
   }
 }
 
+static SDValue lowerMSALoadIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Address = Op->getOperand(2);
+  SDValue Offset  = Op->getOperand(3);
+  EVT ResTy = Op->getValueType(0);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+  return DAG.getLoad(ResTy, DL, ChainIn, Address, MachinePointerInfo(), false,
+                     false, false, 16);
+}
+
 SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
                                                      SelectionDAG &DAG) const {
-  switch (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue()) {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
   default:
     return SDValue();
   case Intrinsic::mips_extp:
@@ -681,9 +2078,524 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_W_CHAIN(SDValue Op,
     return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH);
   case Intrinsic::mips_dpsqx_sa_w_ph:
     return lowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH);
+  case Intrinsic::mips_ld_b:
+  case Intrinsic::mips_ld_h:
+  case Intrinsic::mips_ld_w:
+  case Intrinsic::mips_ld_d:
+   return lowerMSALoadIntr(Op, DAG, Intr);
+  }
+}
+
+static SDValue lowerMSAStoreIntr(SDValue Op, SelectionDAG &DAG, unsigned Intr) {
+  SDLoc DL(Op);
+  SDValue ChainIn = Op->getOperand(0);
+  SDValue Value   = Op->getOperand(2);
+  SDValue Address = Op->getOperand(3);
+  SDValue Offset  = Op->getOperand(4);
+  EVT PtrTy = Address->getValueType(0);
+
+  Address = DAG.getNode(ISD::ADD, DL, PtrTy, Address, Offset);
+
+  return DAG.getStore(ChainIn, DL, Value, Address, MachinePointerInfo(), false,
+                      false, 16);
+}
+
+SDValue MipsSETargetLowering::lowerINTRINSIC_VOID(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  unsigned Intr = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+  switch (Intr) {
+  default:
+    return SDValue();
+  case Intrinsic::mips_st_b:
+  case Intrinsic::mips_st_h:
+  case Intrinsic::mips_st_w:
+  case Intrinsic::mips_st_d:
+    return lowerMSAStoreIntr(Op, DAG, Intr);
   }
 }
 
+/// \brief Check if the given BuildVectorSDNode is a splat.
+/// This method currently relies on DAG nodes being reused when equivalent,
+/// so it's possible for this to return false even when isConstantSplat returns
+/// true.
+static bool isSplatVector(const BuildVectorSDNode *N) {
+  unsigned int nOps = N->getNumOperands();
+  assert(nOps > 1 && "isSplatVector has 0 or 1 sized build vector");
+
+  SDValue Operand0 = N->getOperand(0);
+
+  for (unsigned int i = 1; i < nOps; ++i) {
+    if (N->getOperand(i) != Operand0)
+      return false;
+  }
+
+  return true;
+}
+
+// Lower ISD::EXTRACT_VECTOR_ELT into MipsISD::VEXTRACT_SEXT_ELT.
+//
+// The non-value bits resulting from ISD::EXTRACT_VECTOR_ELT are undefined. We
+// choose to sign-extend but we could have equally chosen zero-extend. The
+// DAGCombiner will fold any sign/zero extension of the ISD::EXTRACT_VECTOR_ELT
+// result into this node later (possibly changing it to a zero-extend in the
+// process).
+SDValue MipsSETargetLowering::
+lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDValue Op0 = Op->getOperand(0);
+  EVT VecTy = Op0->getValueType(0);
+
+  if (!VecTy.is128BitVector())
+    return SDValue();
+
+  if (ResTy.isInteger()) {
+    SDValue Op1 = Op->getOperand(1);
+    EVT EltTy = VecTy.getVectorElementType();
+    return DAG.getNode(MipsISD::VEXTRACT_SEXT_ELT, DL, ResTy, Op0, Op1,
+                       DAG.getValueType(EltTy));
+  }
+
+  return Op;
+}
+
+static bool isConstantOrUndef(const SDValue Op) {
+  if (Op->getOpcode() == ISD::UNDEF)
+    return true;
+  if (dyn_cast<ConstantSDNode>(Op))
+    return true;
+  if (dyn_cast<ConstantFPSDNode>(Op))
+    return true;
+  return false;
+}
+
+static bool isConstantOrUndefBUILD_VECTOR(const BuildVectorSDNode *Op) {
+  for (unsigned i = 0; i < Op->getNumOperands(); ++i)
+    if (isConstantOrUndef(Op->getOperand(i)))
+      return true;
+  return false;
+}
+
+// Lowers ISD::BUILD_VECTOR into appropriate SelectionDAG nodes for the
+// backend.
+//
+// Lowers according to the following rules:
+// - Constant splats are legal as-is as long as the SplatBitSize is a power of
+//   2 less than or equal to 64 and the value fits into a signed 10-bit
+//   immediate
+// - Constant splats are lowered to bitconverted BUILD_VECTORs if SplatBitSize
+//   is a power of 2 less than or equal to 64 and the value does not fit into a
+//   signed 10-bit immediate
+// - Non-constant splats are legal as-is.
+// - Non-constant non-splats are lowered to sequences of INSERT_VECTOR_ELT.
+// - All others are illegal and must be expanded.
+SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+  SDLoc DL(Op);
+  APInt SplatValue, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  if (!Subtarget->hasMSA() || !ResTy.is128BitVector())
+    return SDValue();
+
+  if (Node->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                            HasAnyUndefs, 8,
+                            !Subtarget->isLittle()) && SplatBitSize <= 64) {
+    // We can only cope with 8, 16, 32, or 64-bit elements
+    if (SplatBitSize != 8 && SplatBitSize != 16 && SplatBitSize != 32 &&
+        SplatBitSize != 64)
+      return SDValue();
+
+    // If the value fits into a simm10 then we can use ldi.[bhwd]
+    // However, if it isn't an integer type we will have to bitcast from an
+    // integer type first. Also, if there are any undefs, we must lower them
+    // to defined values first.
+    if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+      return Op;
+
+    EVT ViaVecTy;
+
+    switch (SplatBitSize) {
+    default:
+      return SDValue();
+    case 8:
+      ViaVecTy = MVT::v16i8;
+      break;
+    case 16:
+      ViaVecTy = MVT::v8i16;
+      break;
+    case 32:
+      ViaVecTy = MVT::v4i32;
+      break;
+    case 64:
+      // There's no fill.d to fall back on for 64-bit values
+      return SDValue();
+    }
+
+    // SelectionDAG::getConstant will promote SplatValue appropriately.
+    SDValue Result = DAG.getConstant(SplatValue, ViaVecTy);
+
+    // Bitcast to the type we originally wanted
+    if (ViaVecTy != ResTy)
+      Result = DAG.getNode(ISD::BITCAST, SDLoc(Node), ResTy, Result);
+
+    return Result;
+  } else if (isSplatVector(Node))
+    return Op;
+  else if (!isConstantOrUndefBUILD_VECTOR(Node)) {
+    // Use INSERT_VECTOR_ELT operations rather than expand to stores.
+    // The resulting code is the same length as the expansion, but it doesn't
+    // use memory operations
+    EVT ResTy = Node->getValueType(0);
+
+    assert(ResTy.isVector());
+
+    unsigned NumElts = ResTy.getVectorNumElements();
+    SDValue Vector = DAG.getUNDEF(ResTy);
+    for (unsigned i = 0; i < NumElts; ++i) {
+      Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector,
+                           Node->getOperand(i),
+                           DAG.getConstant(i, MVT::i32));
+    }
+    return Vector;
+  }
+
+  return SDValue();
+}
+
+// Lower VECTOR_SHUFFLE into SHF (if possible).
+//
+// SHF splits the vector into blocks of four elements, then shuffles these
+// elements according to a <4 x i2> constant (encoded as an integer immediate).
+//
+// It is therefore possible to lower into SHF when the mask takes the form:
+//   <a, b, c, d, a+4, b+4, c+4, d+4, a+8, b+8, c+8, d+8, ...>
+// When undef's appear they are treated as if they were whatever value is
+// necessary in order to fit the above form.
+//
+// For example:
+//   %2 = shufflevector <8 x i16> %0, <8 x i16> undef,
+//                      <8 x i32> <i32 3, i32 2, i32 1, i32 0,
+//                                 i32 7, i32 6, i32 5, i32 4>
+// is lowered to:
+//   (SHF_H $w0, $w1, 27)
+// where the 27 comes from:
+//   3 + (2 << 2) + (1 << 4) + (0 << 6)
+static SDValue lowerVECTOR_SHUFFLE_SHF(SDValue Op, EVT ResTy,
+                                       SmallVector<int, 16> Indices,
+                                       SelectionDAG &DAG) {
+  int SHFIndices[4] = { -1, -1, -1, -1 };
+
+  if (Indices.size() < 4)
+    return SDValue();
+
+  for (unsigned i = 0; i < 4; ++i) {
+    for (unsigned j = i; j < Indices.size(); j += 4) {
+      int Idx = Indices[j];
+
+      // Convert from vector index to 4-element subvector index
+      // If an index refers to an element outside of the subvector then give up
+      if (Idx != -1) {
+        Idx -= 4 * (j / 4);
+        if (Idx < 0 || Idx >= 4)
+          return SDValue();
+      }
+
+      // If the mask has an undef, replace it with the current index.
+      // Note that it might still be undef if the current index is also undef
+      if (SHFIndices[i] == -1)
+        SHFIndices[i] = Idx;
+
+      // Check that non-undef values are the same as in the mask. If they
+      // aren't then give up
+      if (!(Idx == -1 || Idx == SHFIndices[i]))
+        return SDValue();
+    }
+  }
+
+  // Calculate the immediate. Replace any remaining undefs with zero
+  APInt Imm(32, 0);
+  for (int i = 3; i >= 0; --i) {
+    int Idx = SHFIndices[i];
+
+    if (Idx == -1)
+      Idx = 0;
+
+    Imm <<= 2;
+    Imm |= Idx & 0x3;
+  }
+
+  return DAG.getNode(MipsISD::SHF, SDLoc(Op), ResTy,
+                     DAG.getConstant(Imm, MVT::i32), Op->getOperand(0));
+}
+
+// Lower VECTOR_SHUFFLE into ILVEV (if possible).
+//
+// ILVEV interleaves the even elements from each vector.
+//
+// It is possible to lower into ILVEV when the mask takes the form:
+//   <0, n, 2, n+2, 4, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 0;
+  int WtIdx = ResTy.getVectorNumElements();
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx += 2;
+    WtIdx += 2;
+  }
+
+  return DAG.getNode(MipsISD::ILVEV, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVOD (if possible).
+//
+// ILVOD interleaves the odd elements from each vector.
+//
+// It is possible to lower into ILVOD when the mask takes the form:
+//   <1, n+1, 3, n+3, 5, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 1;
+  int WtIdx = ResTy.getVectorNumElements() + 1;
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx += 2;
+    WtIdx += 2;
+  }
+
+  return DAG.getNode(MipsISD::ILVOD, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVL (if possible).
+//
+// ILVL interleaves consecutive elements from the left half of each vector.
+//
+// It is possible to lower into ILVL when the mask takes the form:
+//   <0, n, 1, n+1, 2, n+2, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVL(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int WsIdx = 0;
+  int WtIdx = ResTy.getVectorNumElements();
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx ++;
+    WtIdx ++;
+  }
+
+  return DAG.getNode(MipsISD::ILVL, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into ILVR (if possible).
+//
+// ILVR interleaves consecutive elements from the right half of each vector.
+//
+// It is possible to lower into ILVR when the mask takes the form:
+//   <x, n+x, x+1, n+x+1, x+2, n+x+2, ...>
+// where n is the number of elements in the vector and x is half n.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_ILVR(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  unsigned NumElts = ResTy.getVectorNumElements();
+  int WsIdx = NumElts / 2;
+  int WtIdx = NumElts + NumElts / 2;
+
+  for (unsigned i = 0; i < Indices.size(); i += 2) {
+    if (Indices[i] != -1 && Indices[i] != WsIdx)
+      return SDValue();
+    if (Indices[i+1] != -1 && Indices[i+1] != WtIdx)
+      return SDValue();
+    WsIdx ++;
+    WtIdx ++;
+  }
+
+  return DAG.getNode(MipsISD::ILVR, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into PCKEV (if possible).
+//
+// PCKEV copies the even elements of each vector into the result vector.
+//
+// It is possible to lower into PCKEV when the mask takes the form:
+//   <0, 2, 4, ..., n, n+2, n+4, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKEV(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int Idx = 0;
+
+  for (unsigned i = 0; i < Indices.size(); ++i) {
+    if (Indices[i] != -1 && Indices[i] != Idx)
+      return SDValue();
+    Idx += 2;
+  }
+
+  return DAG.getNode(MipsISD::PCKEV, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into PCKOD (if possible).
+//
+// PCKOD copies the odd elements of each vector into the result vector.
+//
+// It is possible to lower into PCKOD when the mask takes the form:
+//   <1, 3, 5, ..., n+1, n+3, n+5, ...>
+// where n is the number of elements in the vector.
+//
+// When undef's appear in the mask they are treated as if they were whatever
+// value is necessary in order to fit the above form.
+static SDValue lowerVECTOR_SHUFFLE_PCKOD(SDValue Op, EVT ResTy,
+                                         SmallVector<int, 16> Indices,
+                                         SelectionDAG &DAG) {
+  assert ((Indices.size() % 2) == 0);
+  int Idx = 1;
+
+  for (unsigned i = 0; i < Indices.size(); ++i) {
+    if (Indices[i] != -1 && Indices[i] != Idx)
+      return SDValue();
+    Idx += 2;
+  }
+
+  return DAG.getNode(MipsISD::PCKOD, SDLoc(Op), ResTy, Op->getOperand(0),
+                     Op->getOperand(1));
+}
+
+// Lower VECTOR_SHUFFLE into VSHF.
+//
+// This mostly consists of converting the shuffle indices in Indices into a
+// BUILD_VECTOR and adding it as an operand to the resulting VSHF. There is
+// also code to eliminate unused operands of the VECTOR_SHUFFLE. For example,
+// if the type is v8i16 and all the indices are less than 8 then the second
+// operand is unused and can be replaced with anything. We choose to replace it
+// with the used operand since this reduces the number of instructions overall.
+static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
+                                        SmallVector<int, 16> Indices,
+                                        SelectionDAG &DAG) {
+  SmallVector<SDValue, 16> Ops;
+  SDValue Op0;
+  SDValue Op1;
+  EVT MaskVecTy = ResTy.changeVectorElementTypeToInteger();
+  EVT MaskEltTy = MaskVecTy.getVectorElementType();
+  bool Using1stVec = false;
+  bool Using2ndVec = false;
+  SDLoc DL(Op);
+  int ResTyNumElts = ResTy.getVectorNumElements();
+
+  for (int i = 0; i < ResTyNumElts; ++i) {
+    // Idx == -1 means UNDEF
+    int Idx = Indices[i];
+
+    if (0 <= Idx && Idx < ResTyNumElts)
+      Using1stVec = true;
+    if (ResTyNumElts <= Idx && Idx < ResTyNumElts * 2)
+      Using2ndVec = true;
+  }
+
+  for (SmallVector<int, 16>::iterator I = Indices.begin(); I != Indices.end();
+       ++I)
+    Ops.push_back(DAG.getTargetConstant(*I, MaskEltTy));
+
+  SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, &Ops[0],
+                                Ops.size());
+
+  if (Using1stVec && Using2ndVec) {
+    Op0 = Op->getOperand(0);
+    Op1 = Op->getOperand(1);
+  } else if (Using1stVec)
+    Op0 = Op1 = Op->getOperand(0);
+  else if (Using2ndVec)
+    Op0 = Op1 = Op->getOperand(1);
+  else
+    llvm_unreachable("shuffle vector mask references neither vector operand?");
+
+  return DAG.getNode(MipsISD::VSHF, DL, ResTy, MaskVec, Op0, Op1);
+}
+
+// Lower VECTOR_SHUFFLE into one of a number of instructions depending on the
+// indices in the shuffle.
+SDValue MipsSETargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  ShuffleVectorSDNode *Node = cast<ShuffleVectorSDNode>(Op);
+  EVT ResTy = Op->getValueType(0);
+
+  if (!ResTy.is128BitVector())
+    return SDValue();
+
+  int ResTyNumElts = ResTy.getVectorNumElements();
+  SmallVector<int, 16> Indices;
+
+  for (int i = 0; i < ResTyNumElts; ++i)
+    Indices.push_back(Node->getMaskElt(i));
+
+  SDValue Result = lowerVECTOR_SHUFFLE_SHF(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVEV(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVOD(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVL(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_ILVR(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_PCKEV(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  Result = lowerVECTOR_SHUFFLE_PCKOD(Op, ResTy, Indices, DAG);
+  if (Result.getNode())
+    return Result;
+  return lowerVECTOR_SHUFFLE_VSHF(Op, ResTy, Indices, DAG);
+}
+
 MachineBasicBlock * MipsSETargetLowering::
 emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
   // $bb:
@@ -701,7 +2613,7 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
 
   MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  const TargetRegisterClass *RC = &Mips::CPURegsRegClass;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
   DebugLoc DL = MI->getDebugLoc();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
   MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
@@ -746,3 +2658,318 @@ emitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
   MI->eraseFromParent();   // The pseudo instruction is gone now.
   return Sink;
 }
+
+MachineBasicBlock * MipsSETargetLowering::
+emitMSACBranchPseudo(MachineInstr *MI, MachineBasicBlock *BB,
+                     unsigned BranchOp) const{
+  // $bb:
+  //  vany_nonzero $rd, $ws
+  //  =>
+  // $bb:
+  //  bnz.b $ws, $tbb
+  //  b $fbb
+  // $fbb:
+  //  li $rd1, 0
+  //  b $sink
+  // $tbb:
+  //  li $rd2, 1
+  // $sink:
+  //  $rd = phi($rd1, $fbb, $rd2, $tbb)
+
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  DebugLoc DL = MI->getDebugLoc();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *Sink  = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, FBB);
+  F->insert(It, TBB);
+  F->insert(It, Sink);
+
+  // Transfer the remainder of BB and its successor edges to Sink.
+  Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+               BB->end());
+  Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+  // Add successors.
+  BB->addSuccessor(FBB);
+  BB->addSuccessor(TBB);
+  FBB->addSuccessor(Sink);
+  TBB->addSuccessor(Sink);
+
+  // Insert the real bnz.b instruction to $BB.
+  BuildMI(BB, DL, TII->get(BranchOp))
+    .addReg(MI->getOperand(1).getReg())
+    .addMBB(TBB);
+
+  // Fill $FBB.
+  unsigned RD1 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), RD1)
+    .addReg(Mips::ZERO).addImm(0);
+  BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+  // Fill $TBB.
+  unsigned RD2 = RegInfo.createVirtualRegister(RC);
+  BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), RD2)
+    .addReg(Mips::ZERO).addImm(1);
+
+  // Insert phi function to $Sink.
+  BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(RD1).addMBB(FBB).addReg(RD2).addMBB(TBB);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return Sink;
+}
+
+// Emit the COPY_FW pseudo instruction.
+//
+// copy_fw_pseudo $fd, $ws, n
+// =>
+// copy_u_w $rt, $ws, $n
+// mtc1     $rt, $fd
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is never valid
+// for lane 1 because it would require FR=0 mode which isn't supported by MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FW(MachineInstr *MI, MachineBasicBlock *BB) const{
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Fd = MI->getOperand(0).getReg();
+  unsigned Ws = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_lo);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wt).addReg(Ws).addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_lo);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the COPY_FD pseudo instruction.
+//
+// copy_fd_pseudo $fd, $ws, n
+// =>
+// splati.d $wt, $ws, $n
+// copy $fd, $wt:sub_64
+//
+// When n is zero, the equivalent operation can be performed with (potentially)
+// zero instructions due to register overlaps. This optimization is always
+// valid because FR=1 mode which is the only supported mode in MSA.
+MachineBasicBlock * MipsSETargetLowering::
+emitCOPY_FD(MachineInstr *MI, MachineBasicBlock *BB) const{
+  assert(Subtarget->isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  unsigned Fd  = MI->getOperand(0).getReg();
+  unsigned Ws  = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm() * 2;
+  DebugLoc DL = MI->getDebugLoc();
+
+  if (Lane == 0)
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Ws, 0, Mips::sub_64);
+  else {
+    unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+    BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wt).addReg(Ws).addImm(1);
+    BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Fd).addReg(Wt, 0, Mips::sub_64);
+  }
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FW pseudo instruction.
+//
+// insert_fw_pseudo $wd, $wd_in, $n, $fs
+// =>
+// subreg_to_reg $wt:sub_lo, $fs
+// insve_w $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FW(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Wd_in = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+  unsigned Fs = MI->getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_W), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the INSERT_FD pseudo instruction.
+//
+// insert_fd_pseudo $wd, $fs, n
+// =>
+// subreg_to_reg $wt:sub_64, $fs
+// insve_d $wd[$n], $wd_in, $wt[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  assert(Subtarget->isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Wd_in = MI->getOperand(1).getReg();
+  unsigned Lane = MI->getOperand(2).getImm();
+  unsigned Fs = MI->getOperand(3).getReg();
+  unsigned Wt = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+      .addImm(0)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSVE_D), Wd)
+      .addReg(Wd_in)
+      .addImm(Lane)
+      .addReg(Wt);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FW pseudo instruction.
+//
+// fill_fw_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_lo, $wt1, $fs
+// splati.w $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FW(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Fs = MI->getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_lo);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_W), Wd).addReg(Wt2).addImm(0);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FILL_FD pseudo instruction.
+//
+// fill_fd_pseudo $wd, $fs
+// =>
+// implicit_def $wt1
+// insert_subreg $wt2:subreg_64, $wt1, $fs
+// splati.d $wd, $wt2[0]
+MachineBasicBlock *
+MipsSETargetLowering::emitFILL_FD(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const {
+  assert(Subtarget->isFP64bit());
+
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned Fs = MI->getOperand(1).getReg();
+  unsigned Wt1 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+  unsigned Wt2 = RegInfo.createVirtualRegister(&Mips::MSA128DRegClass);
+
+  BuildMI(*BB, MI, DL, TII->get(Mips::IMPLICIT_DEF), Wt1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::INSERT_SUBREG), Wt2)
+      .addReg(Wt1)
+      .addReg(Fs)
+      .addImm(Mips::sub_64);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SPLATI_D), Wd).addReg(Wt2).addImm(0);
+
+  MI->eraseFromParent();   // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FEXP2_W_1 pseudo instructions.
+//
+// fexp2_w_1_pseudo $wd, $wt
+// =>
+// ldi.w $ws, 1
+// fexp2.w $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_W_1(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128WRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_W), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_W), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_W), MI->getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI->getOperand(1).getReg());
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
+// Emit the FEXP2_D_1 pseudo instructions.
+//
+// fexp2_d_1_pseudo $wd, $wt
+// =>
+// ldi.d $ws, 1
+// fexp2.d $wd, $ws, $wt
+MachineBasicBlock *
+MipsSETargetLowering::emitFEXP2_D_1(MachineInstr *MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  const TargetRegisterClass *RC = &Mips::MSA128DRegClass;
+  unsigned Ws1 = RegInfo.createVirtualRegister(RC);
+  unsigned Ws2 = RegInfo.createVirtualRegister(RC);
+  DebugLoc DL = MI->getDebugLoc();
+
+  // Splat 1.0 into a vector
+  BuildMI(*BB, MI, DL, TII->get(Mips::LDI_D), Ws1).addImm(1);
+  BuildMI(*BB, MI, DL, TII->get(Mips::FFINT_U_D), Ws2).addReg(Ws1);
+
+  // Emit 1.0 * fexp2(Wt)
+  BuildMI(*BB, MI, DL, TII->get(Mips::FEXP2_D), MI->getOperand(0).getReg())
+      .addReg(Ws2)
+      .addReg(MI->getOperand(1).getReg());
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index ec8a5c7..c5210d9 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -22,6 +22,14 @@ namespace llvm {
   public:
     explicit MipsSETargetLowering(MipsTargetMachine &TM);
 
+    /// \brief Enable MSA support for the given integer type and Register
+    /// class.
+    void addMSAIntType(MVT::SimpleValueType Ty, const TargetRegisterClass *RC);
+    /// \brief Enable MSA support for the given floating-point type and
+    /// Register class.
+    void addMSAFloatType(MVT::SimpleValueType Ty,
+                         const TargetRegisterClass *RC);
+
     virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const;
 
     virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
@@ -38,8 +46,8 @@ namespace llvm {
 
     virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
       if (VT == MVT::Untyped)
-        return Subtarget->hasDSP() ? &Mips::ACRegsDSPRegClass :
-                                     &Mips::ACRegsRegClass;
+        return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass :
+                                     &Mips::ACC64RegClass;
 
       return TargetLowering::getRepRegClassFor(VT);
     }
@@ -56,14 +64,50 @@ namespace llvm {
                 bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
                 CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
 
+    SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
     SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
                         SelectionDAG &DAG) const;
 
     SDValue lowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    /// \brief Lower VECTOR_SHUFFLE into one of a number of instructions
+    /// depending on the indices in the shuffle.
+    SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
 
     MachineBasicBlock *emitBPOSGE32(MachineInstr *MI,
                                     MachineBasicBlock *BB) const;
+    MachineBasicBlock *emitMSACBranchPseudo(MachineInstr *MI,
+                                            MachineBasicBlock *BB,
+                                            unsigned BranchOp) const;
+    /// \brief Emit the COPY_FW pseudo instruction
+    MachineBasicBlock *emitCOPY_FW(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the COPY_FD pseudo instruction
+    MachineBasicBlock *emitCOPY_FD(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FW pseudo instruction
+    MachineBasicBlock *emitINSERT_FW(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_FD pseudo instruction
+    MachineBasicBlock *emitINSERT_FD(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FW pseudo instruction
+    MachineBasicBlock *emitFILL_FW(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FILL_FD pseudo instruction
+    MachineBasicBlock *emitFILL_FD(MachineInstr *MI,
+                                   MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_W_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_W_1(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
+    /// \brief Emit the FEXP2_D_1 pseudo instructions.
+    MachineBasicBlock *emitFEXP2_D_1(MachineInstr *MI,
+                                     MachineBasicBlock *BB) const;
   };
 }
 
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index a0768e5..02931a3 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
@@ -26,7 +27,7 @@ using namespace llvm;
 MipsSEInstrInfo::MipsSEInstrInfo(MipsTargetMachine &tm)
   : MipsInstrInfo(tm,
                   tm.getRelocationModel() == Reloc::PIC_ ? Mips::B : Mips::J),
-    RI(*tm.getSubtargetImpl(), *this),
+    RI(*tm.getSubtargetImpl()),
     IsN64(tm.getSubtarget<MipsSubtarget>().isABI_N64()) {}
 
 const MipsRegisterInfo &MipsSEInstrInfo::getRegisterInfo() const {
@@ -43,10 +44,8 @@ isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const
 {
   unsigned Opc = MI->getOpcode();
 
-  if ((Opc == Mips::LW)    || (Opc == Mips::LW_P8)  || (Opc == Mips::LD) ||
-      (Opc == Mips::LD_P8) || (Opc == Mips::LWC1)   || (Opc == Mips::LWC1_P8) ||
-      (Opc == Mips::LDC1)  || (Opc == Mips::LDC164) ||
-      (Opc == Mips::LDC164_P8)) {
+  if ((Opc == Mips::LW)   || (Opc == Mips::LD)   ||
+      (Opc == Mips::LWC1) || (Opc == Mips::LDC1) || (Opc == Mips::LDC164)) {
     if ((MI->getOperand(1).isFI()) && // is a stack slot
         (MI->getOperand(2).isImm()) &&  // the imm is zero
         (isZeroImm(MI->getOperand(2)))) {
@@ -68,10 +67,8 @@ isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const
 {
   unsigned Opc = MI->getOpcode();
 
-  if ((Opc == Mips::SW)    || (Opc == Mips::SW_P8)  || (Opc == Mips::SD) ||
-      (Opc == Mips::SD_P8) || (Opc == Mips::SWC1)   || (Opc == Mips::SWC1_P8) ||
-      (Opc == Mips::SDC1)  || (Opc == Mips::SDC164) ||
-      (Opc == Mips::SDC164_P8)) {
+  if ((Opc == Mips::SW)   || (Opc == Mips::SD)   ||
+      (Opc == Mips::SWC1) || (Opc == Mips::SDC1) || (Opc == Mips::SDC164)) {
     if ((MI->getOperand(1).isFI()) && // is a stack slot
         (MI->getOperand(2).isImm()) &&  // the imm is zero
         (isZeroImm(MI->getOperand(2)))) {
@@ -88,39 +85,41 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                   bool KillSrc) const {
   unsigned Opc = 0, ZeroReg = 0;
 
-  if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
-    if (Mips::CPURegsRegClass.contains(SrcReg))
-      Opc = Mips::OR, ZeroReg = Mips::ZERO;
+  if (Mips::GPR32RegClass.contains(DestReg)) { // Copy to CPU Reg.
+    if (Mips::GPR32RegClass.contains(SrcReg))
+      Opc = Mips::ADDu, ZeroReg = Mips::ZERO;
     else if (Mips::CCRRegClass.contains(SrcReg))
       Opc = Mips::CFC1;
     else if (Mips::FGR32RegClass.contains(SrcReg))
       Opc = Mips::MFC1;
-    else if (Mips::HIRegsRegClass.contains(SrcReg))
+    else if (Mips::HI32RegClass.contains(SrcReg))
       Opc = Mips::MFHI, SrcReg = 0;
-    else if (Mips::LORegsRegClass.contains(SrcReg))
+    else if (Mips::LO32RegClass.contains(SrcReg))
       Opc = Mips::MFLO, SrcReg = 0;
-    else if (Mips::HIRegsDSPRegClass.contains(SrcReg))
+    else if (Mips::HI32DSPRegClass.contains(SrcReg))
       Opc = Mips::MFHI_DSP;
-    else if (Mips::LORegsDSPRegClass.contains(SrcReg))
+    else if (Mips::LO32DSPRegClass.contains(SrcReg))
       Opc = Mips::MFLO_DSP;
     else if (Mips::DSPCCRegClass.contains(SrcReg)) {
       BuildMI(MBB, I, DL, get(Mips::RDDSP), DestReg).addImm(1 << 4)
         .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
       return;
     }
+    else if (Mips::MSACtrlRegClass.contains(SrcReg))
+      Opc = Mips::CFCMSA;
   }
-  else if (Mips::CPURegsRegClass.contains(SrcReg)) { // Copy from CPU Reg.
+  else if (Mips::GPR32RegClass.contains(SrcReg)) { // Copy from CPU Reg.
     if (Mips::CCRRegClass.contains(DestReg))
       Opc = Mips::CTC1;
     else if (Mips::FGR32RegClass.contains(DestReg))
       Opc = Mips::MTC1;
-    else if (Mips::HIRegsRegClass.contains(DestReg))
+    else if (Mips::HI32RegClass.contains(DestReg))
       Opc = Mips::MTHI, DestReg = 0;
-    else if (Mips::LORegsRegClass.contains(DestReg))
+    else if (Mips::LO32RegClass.contains(DestReg))
       Opc = Mips::MTLO, DestReg = 0;
-    else if (Mips::HIRegsDSPRegClass.contains(DestReg))
+    else if (Mips::HI32DSPRegClass.contains(DestReg))
       Opc = Mips::MTHI_DSP;
-    else if (Mips::LORegsDSPRegClass.contains(DestReg))
+    else if (Mips::LO32DSPRegClass.contains(DestReg))
       Opc = Mips::MTLO_DSP;
     else if (Mips::DSPCCRegClass.contains(DestReg)) {
       BuildMI(MBB, I, DL, get(Mips::WRDSP))
@@ -128,6 +127,8 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
         .addReg(DestReg, RegState::ImplicitDefine);
       return;
     }
+    else if (Mips::MSACtrlRegClass.contains(DestReg))
+      Opc = Mips::CTCMSA;
   }
   else if (Mips::FGR32RegClass.contains(DestReg, SrcReg))
     Opc = Mips::FMOV_S;
@@ -135,26 +136,28 @@ void MipsSEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Opc = Mips::FMOV_D32;
   else if (Mips::FGR64RegClass.contains(DestReg, SrcReg))
     Opc = Mips::FMOV_D64;
-  else if (Mips::CCRRegClass.contains(DestReg, SrcReg))
-    Opc = Mips::MOVCCRToCCR;
-  else if (Mips::CPU64RegsRegClass.contains(DestReg)) { // Copy to CPU64 Reg.
-    if (Mips::CPU64RegsRegClass.contains(SrcReg))
-      Opc = Mips::OR64, ZeroReg = Mips::ZERO_64;
-    else if (Mips::HIRegs64RegClass.contains(SrcReg))
+  else if (Mips::GPR64RegClass.contains(DestReg)) { // Copy to CPU64 Reg.
+    if (Mips::GPR64RegClass.contains(SrcReg))
+      Opc = Mips::DADDu, ZeroReg = Mips::ZERO_64;
+    else if (Mips::HI64RegClass.contains(SrcReg))
       Opc = Mips::MFHI64, SrcReg = 0;
-    else if (Mips::LORegs64RegClass.contains(SrcReg))
+    else if (Mips::LO64RegClass.contains(SrcReg))
       Opc = Mips::MFLO64, SrcReg = 0;
     else if (Mips::FGR64RegClass.contains(SrcReg))
       Opc = Mips::DMFC1;
   }
-  else if (Mips::CPU64RegsRegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
-    if (Mips::HIRegs64RegClass.contains(DestReg))
+  else if (Mips::GPR64RegClass.contains(SrcReg)) { // Copy from CPU64 Reg.
+    if (Mips::HI64RegClass.contains(DestReg))
       Opc = Mips::MTHI64, DestReg = 0;
-    else if (Mips::LORegs64RegClass.contains(DestReg))
+    else if (Mips::LO64RegClass.contains(DestReg))
       Opc = Mips::MTLO64, DestReg = 0;
     else if (Mips::FGR64RegClass.contains(DestReg))
       Opc = Mips::DMTC1;
   }
+  else if (Mips::MSA128BRegClass.contains(DestReg)) { // Copy to MSA reg
+    if (Mips::MSA128BRegClass.contains(SrcReg))
+      Opc = Mips::MOVE_V;
+  }
 
   assert(Opc && "Cannot copy registers");
 
@@ -181,24 +184,32 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
 
   unsigned Opc = 0;
 
-  if (Mips::CPURegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SW_P8 : Mips::SW;
-  else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SD_P8 : Mips::SD;
-  else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_AC64_P8 : Mips::STORE_AC64;
-  else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_AC_DSP_P8 : Mips::STORE_AC_DSP;
-  else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_AC128_P8 : Mips::STORE_AC128;
+  if (Mips::GPR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::SW;
+  else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::SD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::STORE_ACC128;
   else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::STORE_CCOND_DSP_P8 : Mips::STORE_CCOND_DSP;
+    Opc = Mips::STORE_CCOND_DSP;
   else if (Mips::FGR32RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SWC1_P8 : Mips::SWC1;
+    Opc = Mips::SWC1;
   else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
     Opc = Mips::SDC1;
   else if (Mips::FGR64RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::SDC164_P8 : Mips::SDC164;
+    Opc = Mips::SDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::ST_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::ST_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::ST_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::ST_D;
 
   assert(Opc && "Register class not handled!");
   BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
@@ -214,24 +225,32 @@ loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
   unsigned Opc = 0;
 
-  if (Mips::CPURegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LW_P8 : Mips::LW;
-  else if (Mips::CPU64RegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LD_P8 : Mips::LD;
-  else if (Mips::ACRegsRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_AC64_P8 : Mips::LOAD_AC64;
-  else if (Mips::ACRegsDSPRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_AC_DSP_P8 : Mips::LOAD_AC_DSP;
-  else if (Mips::ACRegs128RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_AC128_P8 : Mips::LOAD_AC128;
+  if (Mips::GPR32RegClass.hasSubClassEq(RC))
+    Opc = Mips::LW;
+  else if (Mips::GPR64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LD;
+  else if (Mips::ACC64RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64;
+  else if (Mips::ACC64DSPRegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC64DSP;
+  else if (Mips::ACC128RegClass.hasSubClassEq(RC))
+    Opc = Mips::LOAD_ACC128;
   else if (Mips::DSPCCRegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LOAD_CCOND_DSP_P8 : Mips::LOAD_CCOND_DSP;
+    Opc = Mips::LOAD_CCOND_DSP;
   else if (Mips::FGR32RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LWC1_P8 : Mips::LWC1;
+    Opc = Mips::LWC1;
   else if (Mips::AFGR64RegClass.hasSubClassEq(RC))
     Opc = Mips::LDC1;
   else if (Mips::FGR64RegClass.hasSubClassEq(RC))
-    Opc = IsN64 ? Mips::LDC164_P8 : Mips::LDC164;
+    Opc = Mips::LDC164;
+  else if (RC->hasType(MVT::v16i8))
+    Opc = Mips::LD_B;
+  else if (RC->hasType(MVT::v8i16) || RC->hasType(MVT::v8f16))
+    Opc = Mips::LD_H;
+  else if (RC->hasType(MVT::v4i32) || RC->hasType(MVT::v4f32))
+    Opc = Mips::LD_W;
+  else if (RC->hasType(MVT::v2i64) || RC->hasType(MVT::v2f64))
+    Opc = Mips::LD_D;
 
   assert(Opc && "Register class not handled!");
   BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(Offset)
@@ -245,17 +264,59 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   default:
     return false;
   case Mips::RetRA:
-    ExpandRetRA(MBB, MI, Mips::RET);
+    expandRetRA(MBB, MI, Mips::RET);
+    break;
+  case Mips::PseudoMFHI:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFHI);
+    break;
+  case Mips::PseudoMFLO:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFLO);
+    break;
+  case Mips::PseudoMFHI64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFHI64);
+    break;
+  case Mips::PseudoMFLO64:
+    expandPseudoMFHiLo(MBB, MI, Mips::MFLO64);
+    break;
+  case Mips::PseudoMTLOHI:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO, Mips::MTHI, false);
+    break;
+  case Mips::PseudoMTLOHI64:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO64, Mips::MTHI64, false);
+    break;
+  case Mips::PseudoMTLOHI_DSP:
+    expandPseudoMTLoHi(MBB, MI, Mips::MTLO_DSP, Mips::MTHI_DSP, true);
+    break;
+  case Mips::PseudoCVT_S_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_S_W, Mips::MTC1, false);
+    break;
+  case Mips::PseudoCVT_D32_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D32_W, Mips::MTC1, false);
+    break;
+  case Mips::PseudoCVT_S_L:
+    expandCvtFPInt(MBB, MI, Mips::CVT_S_L, Mips::DMTC1, true);
+    break;
+  case Mips::PseudoCVT_D64_W:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D64_W, Mips::MTC1, true);
+    break;
+  case Mips::PseudoCVT_D64_L:
+    expandCvtFPInt(MBB, MI, Mips::CVT_D64_L, Mips::DMTC1, true);
     break;
   case Mips::BuildPairF64:
-    ExpandBuildPairF64(MBB, MI);
+    expandBuildPairF64(MBB, MI, false);
+    break;
+  case Mips::BuildPairF64_64:
+    expandBuildPairF64(MBB, MI, true);
     break;
   case Mips::ExtractElementF64:
-    ExpandExtractElementF64(MBB, MI);
+    expandExtractElementF64(MBB, MI, false);
+    break;
+  case Mips::ExtractElementF64_64:
+    expandExtractElementF64(MBB, MI, true);
     break;
   case Mips::MIPSeh_return32:
   case Mips::MIPSeh_return64:
-    ExpandEhReturn(MBB, MI);
+    expandEhReturn(MBB, MI);
     break;
   }
 
@@ -263,9 +324,9 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   return true;
 }
 
-/// GetOppositeBranchOpc - Return the inverse of the specified
+/// getOppositeBranchOpc - Return the inverse of the specified
 /// opcode, e.g. turning BEQ to BNE.
-unsigned MipsSEInstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
   switch (Opc) {
   default:           llvm_unreachable("Illegal opcode!");
   case Mips::BEQ:    return Mips::BNE;
@@ -315,7 +376,7 @@ MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
   unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
   unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
   const TargetRegisterClass *RC = STI.isABI_N64() ?
-    &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+    &Mips::GPR64RegClass : &Mips::GPR32RegClass;
   bool LastInstrIsADDiu = NewImm;
 
   const MipsAnalyzeImmediate::InstSeq &Seq =
@@ -346,7 +407,7 @@ MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
   return Reg;
 }
 
-unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
+unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
   return (Opc == Mips::BEQ    || Opc == Mips::BNE    || Opc == Mips::BGTZ   ||
           Opc == Mips::BGEZ   || Opc == Mips::BLTZ   || Opc == Mips::BLEZ   ||
           Opc == Mips::BEQ64  || Opc == Mips::BNE64  || Opc == Mips::BGTZ64 ||
@@ -356,51 +417,134 @@ unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
          Opc : 0;
 }
 
-void MipsSEInstrInfo::ExpandRetRA(MachineBasicBlock &MBB,
+void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 unsigned Opc) const {
   BuildMI(MBB, I, I->getDebugLoc(), get(Opc)).addReg(Mips::RA);
 }
 
-void MipsSEInstrInfo::ExpandExtractElementF64(MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator I) const {
+std::pair<bool, bool>
+MipsSEInstrInfo::compareOpndSize(unsigned Opc,
+                                 const MachineFunction &MF) const {
+  const MCInstrDesc &Desc = get(Opc);
+  assert(Desc.NumOperands == 2 && "Unary instruction expected.");
+  const MipsRegisterInfo *RI = &getRegisterInfo();
+  unsigned DstRegSize = getRegClass(Desc, 0, RI, MF)->getSize();
+  unsigned SrcRegSize = getRegClass(Desc, 1, RI, MF)->getSize();
+
+  return std::make_pair(DstRegSize > SrcRegSize, DstRegSize < SrcRegSize);
+}
+
+void MipsSEInstrInfo::expandPseudoMFHiLo(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned NewOpc) const {
+  BuildMI(MBB, I, I->getDebugLoc(), get(NewOpc), I->getOperand(0).getReg());
+}
+
+void MipsSEInstrInfo::expandPseudoMTLoHi(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned LoOpc,
+                                         unsigned HiOpc,
+                                         bool HasExplicitDef) const {
+  // Expand
+  //  lo_hi pseudomtlohi $gpr0, $gpr1
+  // to these two instructions:
+  //  mtlo $gpr0
+  //  mthi $gpr1
+
+  DebugLoc DL = I->getDebugLoc();
+  const MachineOperand &SrcLo = I->getOperand(1), &SrcHi = I->getOperand(2);
+  MachineInstrBuilder LoInst = BuildMI(MBB, I, DL, get(LoOpc));
+  MachineInstrBuilder HiInst = BuildMI(MBB, I, DL, get(HiOpc));
+  LoInst.addReg(SrcLo.getReg(), getKillRegState(SrcLo.isKill()));
+  HiInst.addReg(SrcHi.getReg(), getKillRegState(SrcHi.isKill()));
+
+  // Add lo/hi registers if the mtlo/hi instructions created have explicit
+  // def registers.
+  if (HasExplicitDef) {
+    unsigned DstReg = I->getOperand(0).getReg();
+    unsigned DstLo = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+    unsigned DstHi = getRegisterInfo().getSubReg(DstReg, Mips::sub_hi);
+    LoInst.addReg(DstLo, RegState::Define);
+    HiInst.addReg(DstHi, RegState::Define);
+  }
+}
+
+void MipsSEInstrInfo::expandCvtFPInt(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned CvtOpc, unsigned MovOpc,
+                                     bool IsI64) const {
+  const MCInstrDesc &CvtDesc = get(CvtOpc), &MovDesc = get(MovOpc);
+  const MachineOperand &Dst = I->getOperand(0), &Src = I->getOperand(1);
+  unsigned DstReg = Dst.getReg(), SrcReg = Src.getReg(), TmpReg = DstReg;
+  unsigned KillSrc =  getKillRegState(Src.isKill());
+  DebugLoc DL = I->getDebugLoc();
+  bool DstIsLarger, SrcIsLarger;
+
+  tie(DstIsLarger, SrcIsLarger) = compareOpndSize(CvtOpc, *MBB.getParent());
+
+  if (DstIsLarger)
+    TmpReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+
+  if (SrcIsLarger)
+    DstReg = getRegisterInfo().getSubReg(DstReg, Mips::sub_lo);
+
+  BuildMI(MBB, I, DL, MovDesc, TmpReg).addReg(SrcReg, KillSrc);
+  BuildMI(MBB, I, DL, CvtDesc, DstReg).addReg(TmpReg, RegState::Kill);
+}
+
+void MipsSEInstrInfo::expandExtractElementF64(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              bool FP64) const {
   unsigned DstReg = I->getOperand(0).getReg();
   unsigned SrcReg = I->getOperand(1).getReg();
   unsigned N = I->getOperand(2).getImm();
-  const MCInstrDesc& Mfc1Tdd = get(Mips::MFC1);
   DebugLoc dl = I->getDebugLoc();
 
   assert(N < 2 && "Invalid immediate");
-  unsigned SubIdx = N ? Mips::sub_fpodd : Mips::sub_fpeven;
+  unsigned SubIdx = N ? Mips::sub_hi : Mips::sub_lo;
   unsigned SubReg = getRegisterInfo().getSubReg(SrcReg, SubIdx);
 
-  BuildMI(MBB, I, dl, Mfc1Tdd, DstReg).addReg(SubReg);
+  if (SubIdx == Mips::sub_hi && FP64)
+    BuildMI(MBB, I, dl, get(Mips::MFHC1), DstReg).addReg(SubReg);
+  else
+    BuildMI(MBB, I, dl, get(Mips::MFC1), DstReg).addReg(SubReg);
 }
 
-void MipsSEInstrInfo::ExpandBuildPairF64(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator I) const {
+void MipsSEInstrInfo::expandBuildPairF64(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         bool FP64) const {
   unsigned DstReg = I->getOperand(0).getReg();
   unsigned LoReg = I->getOperand(1).getReg(), HiReg = I->getOperand(2).getReg();
   const MCInstrDesc& Mtc1Tdd = get(Mips::MTC1);
   DebugLoc dl = I->getDebugLoc();
   const TargetRegisterInfo &TRI = getRegisterInfo();
 
-  // mtc1 Lo, $fp
-  // mtc1 Hi, $fp + 1
-  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpeven))
+  // For FP32 mode:
+  //   mtc1 Lo, $fp
+  //   mtc1 Hi, $fp + 1
+  // For FP64 mode:
+  //   mtc1 Lo, $fp
+  //   mthc1 Hi, $fp
+
+  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_lo))
     .addReg(LoReg);
-  BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_fpodd))
-    .addReg(HiReg);
+
+  if (FP64)
+    BuildMI(MBB, I, dl, get(Mips::MTHC1), TRI.getSubReg(DstReg, Mips::sub_hi))
+      .addReg(HiReg);
+  else
+    BuildMI(MBB, I, dl, Mtc1Tdd, TRI.getSubReg(DstReg, Mips::sub_hi))
+      .addReg(HiReg);
 }
 
-void MipsSEInstrInfo::ExpandEhReturn(MachineBasicBlock &MBB,
+void MipsSEInstrInfo::expandEhReturn(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const {
   // This pseudo instruction is generated as part of the lowering of
   // ISD::EH_RETURN. We convert it to a stack increment by OffsetReg, and
   // indirect jump to TargetReg
   const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
   unsigned ADDU = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu;
-  unsigned OR = STI.isABI_N64() ? Mips::OR64 : Mips::OR;
   unsigned JR = STI.isABI_N64() ? Mips::JR64 : Mips::JR;
   unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP;
   unsigned RA = STI.isABI_N64() ? Mips::RA_64 : Mips::RA;
@@ -409,13 +553,13 @@ void MipsSEInstrInfo::ExpandEhReturn(MachineBasicBlock &MBB,
   unsigned OffsetReg = I->getOperand(0).getReg();
   unsigned TargetReg = I->getOperand(1).getReg();
 
-  // or   $ra, $v0, $zero
+  // addu $ra, $v0, $zero
   // addu $sp, $sp, $v1
   // jr   $ra
   if (TM.getRelocationModel() == Reloc::PIC_)
-    BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(OR), T9)
+    BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), T9)
         .addReg(TargetReg).addReg(ZERO);
-  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(OR), RA)
+  BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), RA)
       .addReg(TargetReg).addReg(ZERO);
   BuildMI(MBB, I, I->getDebugLoc(), TM.getInstrInfo()->get(ADDU), SP)
       .addReg(SP).addReg(OffsetReg);
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 0bf7876..6d2dd90 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -65,7 +65,7 @@ public:
 
   virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
 
-  virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+  virtual unsigned getOppositeBranchOpc(unsigned Opc) const;
 
   /// Adjust SP by Amount bytes.
   void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
@@ -79,15 +79,39 @@ public:
                          unsigned *NewImm) const;
 
 private:
-  virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
+  virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
 
-  void ExpandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+  void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    unsigned Opc) const;
-  void ExpandExtractElementF64(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator I) const;
-  void ExpandBuildPairF64(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator I) const;
-  void ExpandEhReturn(MachineBasicBlock &MBB,
+
+  std::pair<bool, bool> compareOpndSize(unsigned Opc,
+                                        const MachineFunction &MF) const;
+
+  void expandPseudoMFHiLo(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned NewOpc) const;
+
+  void expandPseudoMTLoHi(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                          unsigned LoOpc, unsigned HiOpc,
+                          bool HasExplicitDef) const;
+
+  /// Expand pseudo Int-to-FP conversion instructions.
+  ///
+  /// For example, the following pseudo instruction
+  ///  PseudoCVT_D32_W D2, A5
+  /// gets expanded into these two instructions:
+  ///  MTC1 F4, A5
+  ///  CVT_D32_W D2, F4
+  ///
+  /// We do this expansion post-RA to avoid inserting a floating point copy
+  /// instruction between MTC1 and CVT_D32_W.
+  void expandCvtFPInt(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                      unsigned CvtOpc, unsigned MovOpc, bool IsI64) const;
+
+  void expandExtractElementF64(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, bool FP64) const;
+  void expandBuildPairF64(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I, bool FP64) const;
+  void expandEhReturn(MachineBasicBlock &MBB,
                       MachineBasicBlock::iterator I) const;
 };
 
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 9696738..2d44084 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -40,9 +40,8 @@
 
 using namespace llvm;
 
-MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST,
-                                       const MipsSEInstrInfo &I)
-  : MipsRegisterInfo(ST), TII(I) {}
+MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST)
+  : MipsRegisterInfo(ST) {}
 
 bool MipsSERegisterInfo::
 requiresRegisterScavenging(const MachineFunction &MF) const {
@@ -57,10 +56,28 @@ requiresFrameIndexScavenging(const MachineFunction &MF) const {
 const TargetRegisterClass *
 MipsSERegisterInfo::intRegClass(unsigned Size) const {
   if (Size == 4)
-    return &Mips::CPURegsRegClass;
+    return &Mips::GPR32RegClass;
 
   assert(Size == 8);
-  return &Mips::CPU64RegsRegClass;
+  return &Mips::GPR64RegClass;
+}
+
+/// Determine whether a given opcode is an MSA load/store (supporting 10-bit
+/// offsets) or a non-MSA load/store (supporting 16-bit offsets).
+static inline bool isMSALoadOrStore(const unsigned Opcode) {
+  switch (Opcode) {
+  case Mips::LD_B:
+  case Mips::LD_H:
+  case Mips::LD_W:
+  case Mips::LD_D:
+  case Mips::ST_B:
+  case Mips::ST_H:
+  case Mips::ST_W:
+  case Mips::ST_D:
+    return true;
+  default:
+    return false;
+  }
 }
 
 void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
@@ -112,21 +129,49 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
 
   DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
 
-  // If MI is not a debug value, make sure Offset fits in the 16-bit immediate
-  // field.
-  if (!MI.isDebugValue() && !isInt<16>(Offset)) {
-    MachineBasicBlock &MBB = *MI.getParent();
-    DebugLoc DL = II->getDebugLoc();
-    unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
-    unsigned NewImm;
-
-    unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
-    BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
-      .addReg(Reg, RegState::Kill);
-
-    FrameReg = Reg;
-    Offset = SignExtend64<16>(NewImm);
-    IsKill = true;
+  if (!MI.isDebugValue()) {
+    // Make sure Offset fits within the field available.
+    // For MSA instructions, this is a 10-bit signed immediate, otherwise it is
+    // a 16-bit signed immediate.
+    unsigned OffsetBitSize = isMSALoadOrStore(MI.getOpcode()) ? 10 : 16;
+
+    if (OffsetBitSize == 10 && !isInt<10>(Offset) && isInt<16>(Offset)) {
+      // If we have an offset that needs to fit into a signed 10-bit immediate
+      // and doesn't, but does fit into 16-bits then use an ADDiu
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      unsigned ADDiu = Subtarget.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
+      const TargetRegisterClass *RC =
+          Subtarget.isABI_N64() ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
+      MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+      unsigned Reg = RegInfo.createVirtualRegister(RC);
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+               MBB.getParent()->getTarget().getInstrInfo());
+      BuildMI(MBB, II, DL, TII.get(ADDiu), Reg).addReg(FrameReg).addImm(Offset);
+
+      FrameReg = Reg;
+      Offset = 0;
+      IsKill = true;
+    } else if (!isInt<16>(Offset)) {
+      // Otherwise split the offset into 16-bit pieces and add it in multiple
+      // instructions.
+      MachineBasicBlock &MBB = *MI.getParent();
+      DebugLoc DL = II->getDebugLoc();
+      unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
+      unsigned NewImm = 0;
+      const MipsSEInstrInfo &TII =
+          *static_cast<const MipsSEInstrInfo *>(
+               MBB.getParent()->getTarget().getInstrInfo());
+      unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
+                                       OffsetBitSize == 16 ? &NewImm : NULL);
+      BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
+        .addReg(Reg, RegState::Kill);
+
+      FrameReg = Reg;
+      Offset = SignExtend64<16>(NewImm);
+      IsKill = true;
+    }
   }
 
   MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index 2f7c37b..76cdd9d 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -21,11 +21,8 @@ namespace llvm {
 class MipsSEInstrInfo;
 
 class MipsSERegisterInfo : public MipsRegisterInfo {
-  const MipsSEInstrInfo &TII;
-
 public:
-  MipsSERegisterInfo(const MipsSubtarget &Subtarget,
-                     const MipsSEInstrInfo &TII);
+  MipsSERegisterInfo(const MipsSubtarget &Subtarget);
 
   bool requiresRegisterScavenging(const MachineFunction &MF) const;
 
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
index 1add02f..2779064 100644
--- a/lib/Target/Mips/MipsSchedule.td
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -17,13 +17,18 @@ def IMULDIV : FuncUnit;
 // Instruction Itinerary classes used for Mips
 //===----------------------------------------------------------------------===//
 def IIAlu              : InstrItinClass;
+def IIArith            : InstrItinClass;
+def IILogic            : InstrItinClass;
 def IILoad             : InstrItinClass;
 def IIStore            : InstrItinClass;
 def IIXfer             : InstrItinClass;
 def IIBranch           : InstrItinClass;
 def IIHiLo             : InstrItinClass;
 def IIImul             : InstrItinClass;
+def IIImult            : InstrItinClass;
 def IIIdiv             : InstrItinClass;
+def IIseb              : InstrItinClass;
+def IIslt              : InstrItinClass;
 def IIFcvt             : InstrItinClass;
 def IIFmove            : InstrItinClass;
 def IIFcmp             : InstrItinClass;
@@ -35,6 +40,9 @@ def IIFdivDouble       : InstrItinClass;
 def IIFsqrtSingle      : InstrItinClass;
 def IIFsqrtDouble      : InstrItinClass;
 def IIFrecipFsqrtStep  : InstrItinClass;
+def IIFLoad            : InstrItinClass;
+def IIFStore           : InstrItinClass;
+def IIFmoveC1          : InstrItinClass;
 def IIPseudo           : InstrItinClass;
 
 //===----------------------------------------------------------------------===//
@@ -42,6 +50,8 @@ def IIPseudo           : InstrItinClass;
 //===----------------------------------------------------------------------===//
 def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
   InstrItinData<IIAlu              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIArith            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IILogic            , [InstrStage<1,  [ALU]>]>,
   InstrItinData<IILoad             , [InstrStage<3,  [ALU]>]>,
   InstrItinData<IIStore            , [InstrStage<1,  [ALU]>]>,
   InstrItinData<IIXfer             , [InstrStage<2,  [ALU]>]>,
@@ -59,5 +69,8 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [
   InstrItinData<IIFdivDouble       , [InstrStage<36, [ALU]>]>,
   InstrItinData<IIFsqrtSingle      , [InstrStage<54, [ALU]>]>,
   InstrItinData<IIFsqrtDouble      , [InstrStage<12, [ALU]>]>,
-  InstrItinData<IIFrecipFsqrtStep  , [InstrStage<5,  [ALU]>]>
+  InstrItinData<IIFrecipFsqrtStep  , [InstrStage<5,  [ALU]>]>,
+  InstrItinData<IIFLoad            , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<IIFStore           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIFmoveC1          , [InstrStage<2,  [ALU]>]>
 ]>;
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 14a2b27..0a81072 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -48,6 +48,17 @@ static cl::opt<bool> Mips_Os16(
            "floating point as Mips 16"),
   cl::Hidden);
 
+static cl::opt<bool>
+Mips16HardFloat("mips16-hard-float", cl::NotHidden,
+                cl::desc("MIPS: mips16 hard float enable."),
+                cl::init(false));
+
+static cl::opt<bool>
+Mips16ConstantIslands(
+  "mips16-constant-islands", cl::Hidden,
+  cl::desc("MIPS: mips16 constant islands enable. experimental feature"),
+  cl::init(false));
+
 void MipsSubtarget::anchor() { }
 
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
@@ -58,8 +69,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
   IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
   IsLinux(true), HasSEInReg(false), HasCondMov(false), HasSwap(false),
   HasBitCount(false), HasFPIdx(false),
-  InMips16Mode(false), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
-  AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16),
+  InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
+  InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
+  AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
   RM(_RM), OverrideMode(NoOverride), TM(_TM)
 {
   std::string CPUName = CPU;
@@ -83,12 +95,20 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
           (hasMips64() && (isABI_N32() || isABI_N64()))) &&
          "Invalid  Arch & ABI pair.");
 
+  if (hasMSA() && !isFP64bit())
+    report_fatal_error("MSA requires a 64-bit FPU register file (FR=1 mode). "
+                       "See -mattr=+fp64.",
+                       false);
+
   // Is the target system Linux ?
   if (TT.find("linux") == std::string::npos)
     IsLinux = false;
 
   // Set UseSmallSection.
   UseSmallSection = !IsLinux && (RM == Reloc::Static);
+  // set some subtarget specific features
+  if (inMips16Mode())
+    HasBitCount=false;
 }
 
 bool
@@ -98,7 +118,7 @@ MipsSubtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
   Mode = TargetSubtargetInfo::ANTIDEP_NONE;
   CriticalPathRCs.clear();
   CriticalPathRCs.push_back(hasMips64() ?
-                            &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass);
+                            &Mips::GPR64RegClass : &Mips::GPR32RegClass);
   return OptLevel >= CodeGenOpt::Aggressive;
 }
 
@@ -146,3 +166,11 @@ void MipsSubtarget::resetSubtarget(MachineFunction *MF) {
   }
 }
 
+bool MipsSubtarget::mipsSEUsesSoftFloat() const {
+  return TM->Options.UseSoftFloat && !InMips16HardFloat;
+}
+
+bool MipsSubtarget::useConstantIslands() {
+  DEBUG(dbgs() << "use constant islands " << Mips16ConstantIslands << "\n");
+  return Mips16ConstantIslands;
+}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index f2f0e15..6b2ab12 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -93,6 +93,9 @@ protected:
   // InMips16 -- can process Mips16 instructions
   bool InMips16Mode;
 
+  // Mips16 hard float
+  bool InMips16HardFloat;
+
   // PreviousInMips16 -- the function we just processed was in Mips 16 Mode
   bool PreviousInMips16Mode;
 
@@ -110,6 +113,9 @@ protected:
   // compiled as Mips32
   bool Os16;
 
+  // HasMSA -- supports MSA ASE.
+  bool HasMSA;
+
   InstrItineraryData InstrItins;
 
   // The instance to the register info section object
@@ -154,6 +160,7 @@ public:
 
   bool isLittle() const { return IsLittle; }
   bool isFP64bit() const { return IsFP64bit; }
+  bool isNotFP64bit() const { return !IsFP64bit; }
   bool isGP64bit() const { return IsGP64bit; }
   bool isGP32bit() const { return !IsGP64bit; }
   bool isSingleFloat() const { return IsSingleFloat; }
@@ -170,28 +177,48 @@ public:
     }
     llvm_unreachable("Unexpected mode");
   }
-  bool inMips16ModeDefault() {
+  bool inMips16ModeDefault() const {
     return InMips16Mode;
   }
+  bool inMips16HardFloat() const {
+    return inMips16Mode() && InMips16HardFloat;
+  }
   bool inMicroMipsMode() const { return InMicroMipsMode; }
   bool hasDSP() const { return HasDSP; }
   bool hasDSPR2() const { return HasDSPR2; }
+  bool hasMSA() const { return HasMSA; }
   bool isLinux() const { return IsLinux; }
   bool useSmallSection() const { return UseSmallSection; }
 
   bool hasStandardEncoding() const { return !inMips16Mode(); }
 
+  bool mipsSEUsesSoftFloat() const;
+
+  bool enableLongBranchPass() const {
+    return hasStandardEncoding() || allowMixed16_32();
+  }
+
   /// Features related to the presence of specific instructions.
   bool hasSEInReg()   const { return HasSEInReg; }
   bool hasCondMov()   const { return HasCondMov; }
   bool hasSwap()      const { return HasSwap; }
   bool hasBitCount()  const { return HasBitCount; }
   bool hasFPIdx()     const { return HasFPIdx; }
+  bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
 
-  bool allowMixed16_32() const { return AllowMixed16_32;};
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+  bool allowMixed16_32() const { return inMips16ModeDefault() |
+                                        AllowMixed16_32;}
 
   bool os16() const { return Os16;};
 
+// for now constant islands are on for the whole compilation unit but we only
+// really use them if in addition we are in mips16 mode
+//
+static bool useConstantIslands();
+
+  unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
+
   // Grab MipsRegInfo object
   const MipsReginfo &getMReginfo() const { return MRI; }
 
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index ee28e2a..5046c1b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -22,6 +22,7 @@
 #include "MipsSEISelLowering.h"
 #include "MipsSEISelDAGToDAG.h"
 #include "Mips16FrameLowering.h"
+#include "Mips16HardFloat.h"
 #include "Mips16InstrInfo.h"
 #include "Mips16ISelDAGToDAG.h"
 #include "Mips16ISelLowering.h"
@@ -31,6 +32,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
 
@@ -69,8 +71,9 @@ MipsTargetMachine(const Target &T, StringRef TT,
                 "E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32-S64")),
     InstrInfo(MipsInstrInfo::create(*this)),
     FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
-    TLInfo(MipsTargetLowering::create(*this)),
-    TSInfo(*this), JITInfo() {
+    TLInfo(MipsTargetLowering::create(*this)), TSInfo(*this),
+    InstrItins(Subtarget.getInstrItineraryData()), JITInfo() {
+  initAsmInfo();
 }
 
 
@@ -132,7 +135,13 @@ namespace {
 class MipsPassConfig : public TargetPassConfig {
 public:
   MipsPassConfig(MipsTargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+    : TargetPassConfig(TM, PM) {
+    // The current implementation of long branch pass requires a scratch
+    // register ($at) to be available before branch instructions. Tail merging
+    // can break this requirement, so disable it when long branch pass is
+    // enabled.
+    EnableTailMerge = !getMipsSubtarget().enableLongBranchPass();
+  }
 
   MipsTargetMachine &getMipsTargetMachine() const {
     return getTM<MipsTargetMachine>();
@@ -156,6 +165,9 @@ void MipsPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
   if (getMipsSubtarget().os16())
     addPass(createMipsOs16(getMipsTargetMachine()));
+  if (getMipsSubtarget().inMips16HardFloat())
+    addPass(createMips16HardFloat(getMipsTargetMachine()));
+  addPass(createPartiallyInlineLibCallsPass());
 }
 // Install an instruction selector pass using
 // the ISelDag to gen Mips code.
@@ -191,8 +203,7 @@ bool MipsPassConfig::addPreEmitPass() {
   const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
   addPass(createMipsDelaySlotFillerPass(TM));
 
-  if (Subtarget.hasStandardEncoding() ||
-      Subtarget.allowMixed16_32())
+  if (Subtarget.enableLongBranchPass())
     addPass(createMipsLongBranchPass(TM));
   if (Subtarget.inMips16Mode() ||
       Subtarget.allowMixed16_32())
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index ee55708..5a9a11d 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -44,6 +44,7 @@ class MipsTargetMachine : public LLVMTargetMachine {
   OwningPtr<const MipsFrameLowering> FrameLoweringSE;
   OwningPtr<const MipsTargetLowering> TLInfoSE;
   MipsSelectionDAGInfo TSInfo;
+  const InstrItineraryData &InstrItins;
   MipsJITInfo JITInfo;
 
 public:
@@ -65,6 +66,11 @@ public:
   { return &Subtarget; }
   virtual const DataLayout *getDataLayout()    const
   { return &DL;}
+
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return Subtarget.inMips16Mode() ? 0 : &InstrItins;
+  }
+
   virtual MipsJITInfo *getJITInfo()
   { return &JITInfo; }
 
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
new file mode 100644
index 0000000..96966fd
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -0,0 +1,44 @@
+//===-- MipsTargetStreamer.h - Mips Target Streamer ------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETSTREAMER_H
+#define MIPSTARGETSTREAMER_H
+
+#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class MipsTargetStreamer : public MCTargetStreamer {
+  virtual void anchor();
+
+public:
+  virtual void emitMipsHackELFFlags(unsigned Flags) = 0;
+  virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val) = 0;
+};
+
+// This part is for ascii assembly output
+class MipsTargetAsmStreamer : public MipsTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  MipsTargetAsmStreamer(formatted_raw_ostream &OS);
+  virtual void emitMipsHackELFFlags(unsigned Flags);
+  virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+};
+
+// This part is for ELF object output
+class MipsTargetELFStreamer : public MipsTargetStreamer {
+public:
+  MCELFStreamer &getStreamer();
+  virtual void emitMipsHackELFFlags(unsigned Flags);
+  virtual void emitMipsHackSTOCG(MCSymbol *Sym, unsigned Val);
+};
+}
+
+#endif
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 735ca9b..4f1324c 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -24,11 +24,13 @@ set(NVPTXCodeGen_sources
   NVPTXUtilities.cpp
   NVVMReflect.cpp
   NVPTXGenericToNVVM.cpp
+  NVPTXPrologEpilogPass.cpp
+  NVPTXMCExpr.cpp
   )
 
 add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
 
-add_dependencies(LLVMNVPTXCodeGen intrinsics_gen)
+add_dependencies(LLVMNVPTXCodeGen NVPTXCommonTableGen intrinsics_gen)
 
 add_subdirectory(TargetInfo)
 add_subdirectory(InstPrinter)
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index 10051c7..d5be0e4 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -1 +1,289 @@
-// Placeholder
+//===-- NVPTXInstPrinter.cpp - PTX assembly instruction printing ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Print MCInst instructions to .ptx format.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "InstPrinter/NVPTXInstPrinter.h"
+#include "NVPTX.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cctype>
+using namespace llvm;
+
+#include "NVPTXGenAsmWriter.inc"
+
+
+NVPTXInstPrinter::NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                                   const MCRegisterInfo &MRI,
+                                   const MCSubtargetInfo &STI)
+  : MCInstPrinter(MAI, MII, MRI) {
+  setAvailableFeatures(STI.getFeatureBits());
+}
+
+void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // Decode the virtual register
+  // Must be kept in sync with NVPTXAsmPrinter::encodeVirtualRegister
+  unsigned RCId = (RegNo >> 28);
+  switch (RCId) {
+  default: report_fatal_error("Bad virtual register encoding");
+  case 0:
+    // This is actually a physical register, so defer to the autogenerated
+    // register printer
+    OS << getRegisterName(RegNo);
+    return;
+  case 1:
+    OS << "%p";
+    break;
+  case 2:
+    OS << "%rs";
+    break;
+  case 3:
+    OS << "%r";
+    break;
+  case 4:
+    OS << "%rl";
+    break;
+  case 5:
+    OS << "%f";
+    break;
+  case 6:
+    OS << "%fl";
+    break;
+  }
+
+  unsigned VReg = RegNo & 0x0FFFFFFF;
+  OS << VReg;
+}
+
+void NVPTXInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
+                                 StringRef Annot) {
+  printInstruction(MI, OS);
+
+  // Next always print the annotation.
+  printAnnotation(OS, Annot);
+}
+
+void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                    raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    printRegName(O, Reg);
+  } else if (Op.isImm()) {
+    O << markup("<imm:") << formatImm(Op.getImm()) << markup(">");
+  } else {
+    assert(Op.isExpr() && "Unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+void NVPTXInstPrinter::printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCvtMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "sat") == 0) {
+    // SAT flag
+    if (Imm & NVPTX::PTXCvtMode::SAT_FLAG)
+      O << ".sat";
+  } else if (strcmp(Modifier, "base") == 0) {
+    // Default operand
+    switch (Imm & NVPTX::PTXCvtMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCvtMode::NONE:
+      break;
+    case NVPTX::PTXCvtMode::RNI:
+      O << ".rni";
+      break;
+    case NVPTX::PTXCvtMode::RZI:
+      O << ".rzi";
+      break;
+    case NVPTX::PTXCvtMode::RMI:
+      O << ".rmi";
+      break;
+    case NVPTX::PTXCvtMode::RPI:
+      O << ".rpi";
+      break;
+    case NVPTX::PTXCvtMode::RN:
+      O << ".rn";
+      break;
+    case NVPTX::PTXCvtMode::RZ:
+      O << ".rz";
+      break;
+    case NVPTX::PTXCvtMode::RM:
+      O << ".rm";
+      break;
+    case NVPTX::PTXCvtMode::RP:
+      O << ".rp";
+      break;
+    }
+  } else {
+    llvm_unreachable("Invalid conversion modifier");
+  }
+}
+
+void NVPTXInstPrinter::printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                                    const char *Modifier) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  int64_t Imm = MO.getImm();
+
+  if (strcmp(Modifier, "ftz") == 0) {
+    // FTZ flag
+    if (Imm & NVPTX::PTXCmpMode::FTZ_FLAG)
+      O << ".ftz";
+  } else if (strcmp(Modifier, "base") == 0) {
+    switch (Imm & NVPTX::PTXCmpMode::BASE_MASK) {
+    default:
+      return;
+    case NVPTX::PTXCmpMode::EQ:
+      O << ".eq";
+      break;
+    case NVPTX::PTXCmpMode::NE:
+      O << ".ne";
+      break;
+    case NVPTX::PTXCmpMode::LT:
+      O << ".lt";
+      break;
+    case NVPTX::PTXCmpMode::LE:
+      O << ".le";
+      break;
+    case NVPTX::PTXCmpMode::GT:
+      O << ".gt";
+      break;
+    case NVPTX::PTXCmpMode::GE:
+      O << ".ge";
+      break;
+    case NVPTX::PTXCmpMode::LO:
+      O << ".lo";
+      break;
+    case NVPTX::PTXCmpMode::LS:
+      O << ".ls";
+      break;
+    case NVPTX::PTXCmpMode::HI:
+      O << ".hi";
+      break;
+    case NVPTX::PTXCmpMode::HS:
+      O << ".hs";
+      break;
+    case NVPTX::PTXCmpMode::EQU:
+      O << ".equ";
+      break;
+    case NVPTX::PTXCmpMode::NEU:
+      O << ".neu";
+      break;
+    case NVPTX::PTXCmpMode::LTU:
+      O << ".ltu";
+      break;
+    case NVPTX::PTXCmpMode::LEU:
+      O << ".leu";
+      break;
+    case NVPTX::PTXCmpMode::GTU:
+      O << ".gtu";
+      break;
+    case NVPTX::PTXCmpMode::GEU:
+      O << ".geu";
+      break;
+    case NVPTX::PTXCmpMode::NUM:
+      O << ".num";
+      break;
+    case NVPTX::PTXCmpMode::NotANumber:
+      O << ".nan";
+      break;
+    }
+  } else {
+    llvm_unreachable("Empty Modifier");
+  }
+}
+
+void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
+                                     raw_ostream &O, const char *Modifier) {
+  if (Modifier) {
+    const MCOperand &MO = MI->getOperand(OpNum);
+    int Imm = (int) MO.getImm();
+    if (!strcmp(Modifier, "volatile")) {
+      if (Imm)
+        O << ".volatile";
+    } else if (!strcmp(Modifier, "addsp")) {
+      switch (Imm) {
+      case NVPTX::PTXLdStInstCode::GLOBAL:
+        O << ".global";
+        break;
+      case NVPTX::PTXLdStInstCode::SHARED:
+        O << ".shared";
+        break;
+      case NVPTX::PTXLdStInstCode::LOCAL:
+        O << ".local";
+        break;
+      case NVPTX::PTXLdStInstCode::PARAM:
+        O << ".param";
+        break;
+      case NVPTX::PTXLdStInstCode::CONSTANT:
+        O << ".const";
+        break;
+      case NVPTX::PTXLdStInstCode::GENERIC:
+        break;
+      default:
+        llvm_unreachable("Wrong Address Space");
+      }
+    } else if (!strcmp(Modifier, "sign")) {
+      if (Imm == NVPTX::PTXLdStInstCode::Signed)
+        O << "s";
+      else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
+        O << "u";
+      else
+        O << "f";
+    } else if (!strcmp(Modifier, "vec")) {
+      if (Imm == NVPTX::PTXLdStInstCode::V2)
+        O << ".v2";
+      else if (Imm == NVPTX::PTXLdStInstCode::V4)
+        O << ".v4";
+    } else
+      llvm_unreachable("Unknown Modifier");
+  } else
+    llvm_unreachable("Empty Modifier");
+}
+
+void NVPTXInstPrinter::printMemOperand(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  printOperand(MI, OpNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, OpNum + 1, O);
+  } else {
+    if (MI->getOperand(OpNum + 1).isImm() &&
+        MI->getOperand(OpNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, OpNum + 1, O);
+  }
+}
+
+void NVPTXInstPrinter::printProtoIdent(const MCInst *MI, int OpNum,
+                                       raw_ostream &O, const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+  assert(Op.isExpr() && "Call prototype is not an MCExpr?");
+  const MCExpr *Expr = Op.getExpr();
+  const MCSymbol &Sym = cast<MCSymbolRefExpr>(Expr)->getSymbol();
+  O << Sym.getName();
+}
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
new file mode 100644
index 0000000..93029ae
--- /dev/null
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -0,0 +1,53 @@
+//= NVPTXInstPrinter.h - Convert NVPTX MCInst to assembly syntax --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an NVPTX MCInst to .ptx file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef NVPTX_INST_PRINTER_H
+#define NVPTX_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCOperand;
+class MCSubtargetInfo;
+
+class NVPTXInstPrinter : public MCInstPrinter {
+public:
+  NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                   const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
+
+  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
+  virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+  // End
+
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = 0);
+  void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
+                    const char *Modifier = 0);
+  void printLdStCode(const MCInst *MI, int OpNum,
+                     raw_ostream &O, const char *Modifier = 0);
+  void printMemOperand(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = 0);
+  void printProtoIdent(const MCInst *MI, int OpNum,
+                       raw_ostream &O, const char *Modifier = 0);
+};
+
+}
+
+#endif
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
index b3e8b5d..edf4a80 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -22,7 +22,6 @@ namespace llvm {
 enum AddressSpace {
   ADDRESS_SPACE_GENERIC = 0,
   ADDRESS_SPACE_GLOBAL = 1,
-  ADDRESS_SPACE_CONST_NOT_GEN = 2, // Not part of generic space
   ADDRESS_SPACE_SHARED = 3,
   ADDRESS_SPACE_CONST = 4,
   ADDRESS_SPACE_LOCAL = 5,
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 459cd96..f2784b8 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -17,17 +17,15 @@
 
 using namespace llvm;
 
-bool CompileForDebugging;
-
 // -debug-compile - Command line option to inform opt and llc passes to
 // compile for debugging
-static cl::opt<bool, true>
-Debug("debug-compile", cl::desc("Compile for debugging"), cl::Hidden,
-      cl::location(CompileForDebugging), cl::init(false));
+static cl::opt<bool> CompileForDebugging("debug-compile",
+                                         cl::desc("Compile for debugging"),
+                                         cl::Hidden, cl::init(false));
 
 void NVPTXMCAsmInfo::anchor() {}
 
-NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) {
+NVPTXMCAsmInfo::NVPTXMCAsmInfo(const StringRef &TT) {
   Triple TheTriple(TT);
   if (TheTriple.getArch() == Triple::nvptx64) {
     PointerSize = CalleeSaveStackSlotSize = 8;
@@ -37,8 +35,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Target &T, const StringRef &TT) {
 
   PrivateGlobalPrefix = "$L__";
 
-  AllowPeriodsInName = false;
-
   HasSetDirective = false;
 
   HasSingleParameterDotFile = false;
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index 82097da..7d1633f 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -23,7 +23,7 @@ class StringRef;
 class NVPTXMCAsmInfo : public MCAsmInfo {
   virtual void anchor();
 public:
-  explicit NVPTXMCAsmInfo(const Target &T, const StringRef &TT);
+  explicit NVPTXMCAsmInfo(const StringRef &TT);
 };
 } // namespace llvm
 
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index ccd2970..871bac9 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -13,6 +13,7 @@
 
 #include "NVPTXMCTargetDesc.h"
 #include "NVPTXMCAsmInfo.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
 #include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
@@ -57,6 +58,17 @@ static MCCodeGenInfo *createNVPTXMCCodeGenInfo(
   return X;
 }
 
+static MCInstPrinter *createNVPTXMCInstPrinter(const Target &T,
+                                               unsigned SyntaxVariant,
+                                               const MCAsmInfo &MAI,
+                                               const MCInstrInfo &MII,
+                                               const MCRegisterInfo &MRI,
+                                               const MCSubtargetInfo &STI) {
+  if (SyntaxVariant == 0)
+    return new NVPTXInstPrinter(MAI, MII, MRI, STI);
+  return 0;
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeNVPTXTargetMC() {
   // Register the MC asm info.
@@ -85,4 +97,9 @@ extern "C" void LLVMInitializeNVPTXTargetMC() {
   TargetRegistry::RegisterMCSubtargetInfo(TheNVPTXTarget64,
                                           createNVPTXMCSubtargetInfo);
 
+  // Register the MCInstPrinter.
+  TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget32,
+                                        createNVPTXMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheNVPTXTarget64,
+                                        createNVPTXMCInstPrinter);
 }
diff --git a/lib/Target/NVPTX/ManagedStringPool.h b/lib/Target/NVPTX/ManagedStringPool.h
index d6c79b5..f9fb059 100644
--- a/lib/Target/NVPTX/ManagedStringPool.h
+++ b/lib/Target/NVPTX/ManagedStringPool.h
@@ -29,7 +29,7 @@ class ManagedStringPool {
 public:
   ManagedStringPool() {}
   ~ManagedStringPool() {
-    SmallVector<std::string *, 8>::iterator Current = Pool.begin();
+    SmallVectorImpl<std::string *>::iterator Current = Pool.begin();
     while (Current != Pool.end()) {
       delete *Current;
       Current++;
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 072c65d..490b49d 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -27,6 +27,7 @@
 namespace llvm {
 class NVPTXTargetMachine;
 class FunctionPass;
+class MachineFunctionPass;
 class formatted_raw_ostream;
 
 namespace NVPTXCC {
@@ -60,12 +61,10 @@ inline static const char *NVPTXCondCodeToString(NVPTXCC::CondCodes CC) {
 
 FunctionPass *
 createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
-FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &);
-FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &);
-FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);
 ModulePass *createGenericToNVVMPass();
 ModulePass *createNVVMReflectPass();
 ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
+MachineFunctionPass *createNVPTXPrologEpilogPass();
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 
@@ -75,8 +74,7 @@ extern Target TheNVPTXTarget64;
 namespace NVPTX {
 enum DrvInterface {
   NVCL,
-  CUDA,
-  TEST
+  CUDA
 };
 
 // A field inside TSFlags needs a shift and a mask. The usage is
@@ -130,6 +128,53 @@ enum VecType {
   V4 = 4
 };
 }
+
+/// PTXCvtMode - Conversion code enumeration
+namespace PTXCvtMode {
+enum CvtMode {
+  NONE = 0,
+  RNI,
+  RZI,
+  RMI,
+  RPI,
+  RN,
+  RZ,
+  RM,
+  RP,
+
+  BASE_MASK = 0x0F,
+  FTZ_FLAG = 0x10,
+  SAT_FLAG = 0x20
+};
+}
+
+/// PTXCmpMode - Comparison mode enumeration
+namespace PTXCmpMode {
+enum CmpMode {
+  EQ = 0,
+  NE,
+  LT,
+  LE,
+  GT,
+  GE,
+  LO,
+  LS,
+  HI,
+  HS,
+  EQU,
+  NEU,
+  LTU,
+  LEU,
+  GTU,
+  GEU,
+  NUM,
+  // NAN is a MACRO
+  NotANumber,
+
+  BASE_MASK = 0xFF,
+  FTZ_FLAG = 0x100
+};
+}
 }
 } // end namespace llvm;
 
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index d78b4e8..6183a75 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -57,6 +57,12 @@ def : Proc<"sm_35", [SM35]>;
 def NVPTXInstrInfo : InstrInfo {
 }
 
+def NVPTXAsmWriter : AsmWriter {
+  bit isMCAsmWriter = 1;
+  string AsmWriterClassName  = "InstPrinter";
+}
+
 def NVPTX : Target {
   let InstructionSet = NVPTXInstrInfo;
+  let AssemblyWriters = [NVPTXAsmWriter];
 }
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
index 0f792ec..1f37696 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.cpp
@@ -37,7 +37,7 @@ bool NVPTXAllocaHoisting::runOnFunction(Function &function) {
 }
 
 char NVPTXAllocaHoisting::ID = 1;
-RegisterPass<NVPTXAllocaHoisting>
+static RegisterPass<NVPTXAllocaHoisting>
 X("alloca-hoisting", "Hoisting alloca instructions in non-entry "
                      "blocks to the entry block");
 
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 229e4e5..7552fe7 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -16,10 +16,11 @@
 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
-#include "NVPTXNumRegisters.h"
+#include "NVPTXMCExpr.h"
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXTargetMachine.h"
 #include "NVPTXUtilities.h"
+#include "InstPrinter/NVPTXInstPrinter.h"
 #include "cl_common_defines.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/ConstantFolding.h"
@@ -47,23 +48,17 @@
 #include <sstream>
 using namespace llvm;
 
-#include "NVPTXGenAsmWriter.inc"
-
-bool RegAllocNilUsed = true;
-
 #define DEPOTNAME "__local_depot"
 
 static cl::opt<bool>
-EmitLineNumbers("nvptx-emit-line-numbers",
+EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
                 cl::desc("NVPTX Specific: Emit Line numbers even without -G"),
                 cl::init(true));
 
-namespace llvm { bool InterleaveSrcInPtx = false; }
-
-static cl::opt<bool, true>
-InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore,
+static cl::opt<bool>
+InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, cl::Hidden,
               cl::desc("NVPTX Specific: Emit source line in ptx file"),
-              cl::location(llvm::InterleaveSrcInPtx));
+              cl::init(false));
 
 namespace {
 /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
@@ -131,7 +126,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
     return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.Mang->getSymbol(GV), Ctx);
+    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
     return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
@@ -279,8 +274,10 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
   const LLVMContext &ctx = MF->getFunction()->getContext();
   DIScope Scope(curLoc.getScope(ctx));
 
-  if (!Scope.Verify())
-    return;
+  assert((!Scope || Scope.isScope()) &&
+    "Scope of a DebugLoc should be null or a DIScope.");
+  if (!Scope)
+     return;
 
   StringRef fileName(Scope.getFilename());
   StringRef dirName(Scope.getDirectory());
@@ -294,7 +291,7 @@ void NVPTXAsmPrinter::emitLineNumberAsDotLoc(const MachineInstr &MI) {
     return;
 
   // Emit the line from the source file.
-  if (llvm::InterleaveSrcInPtx)
+  if (InterleaveSrc)
     this->emitSrcInText(fileName.str(), curLoc.getLine());
 
   std::stringstream temp;
@@ -308,8 +305,115 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   raw_svector_ostream OS(Str);
   if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
     emitLineNumberAsDotLoc(*MI);
-  printInstruction(MI, OS);
-  OutStreamer.EmitRawText(OS.str());
+
+  MCInst Inst;
+  lowerToMCInst(MI, Inst);
+  OutStreamer.EmitInstruction(Inst);
+}
+
+void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
+  OutMI.setOpcode(MI->getOpcode());
+
+  // Special: Do not mangle symbol operand of CALL_PROTOTYPE
+  if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
+    const MachineOperand &MO = MI->getOperand(0);
+    OutMI.addOperand(GetSymbolRef(MO,
+      OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName()))));
+    return;
+  }
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    if (lowerOperand(MO, MCOp))
+      OutMI.addOperand(MCOp);
+  }
+}
+
+bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
+                                   MCOperand &MCOp) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown operand type");
+  case MachineOperand::MO_Register:
+    MCOp = MCOperand::CreateReg(encodeVirtualRegister(MO.getReg()));
+    break;
+  case MachineOperand::MO_Immediate:
+    MCOp = MCOperand::CreateImm(MO.getImm());
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+        MO.getMBB()->getSymbol(), OutContext));
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
+    break;
+  case MachineOperand::MO_FPImmediate: {
+    const ConstantFP *Cnt = MO.getFPImm();
+    APFloat Val = Cnt->getValueAPF();
+
+    switch (Cnt->getType()->getTypeID()) {
+    default: report_fatal_error("Unsupported FP type"); break;
+    case Type::FloatTyID:
+      MCOp = MCOperand::CreateExpr(
+        NVPTXFloatMCExpr::CreateConstantFPSingle(Val, OutContext));
+      break;
+    case Type::DoubleTyID:
+      MCOp = MCOperand::CreateExpr(
+        NVPTXFloatMCExpr::CreateConstantFPDouble(Val, OutContext));
+      break;
+    }
+    break;
+  }
+  }
+  return true;
+}
+
+unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+    DenseMap<unsigned, unsigned> &RegMap = VRegMapping[RC];
+    unsigned RegNum = RegMap[Reg];
+
+    // Encode the register class in the upper 4 bits
+    // Must be kept in sync with NVPTXInstPrinter::printRegName
+    unsigned Ret = 0;
+    if (RC == &NVPTX::Int1RegsRegClass) {
+      Ret = (1 << 28);
+    } else if (RC == &NVPTX::Int16RegsRegClass) {
+      Ret = (2 << 28);
+    } else if (RC == &NVPTX::Int32RegsRegClass) {
+      Ret = (3 << 28);
+    } else if (RC == &NVPTX::Int64RegsRegClass) {
+      Ret = (4 << 28);
+    } else if (RC == &NVPTX::Float32RegsRegClass) {
+      Ret = (5 << 28);
+    } else if (RC == &NVPTX::Float64RegsRegClass) {
+      Ret = (6 << 28);
+    } else {
+      report_fatal_error("Bad register class");
+    }
+
+    // Insert the vreg number
+    Ret |= (RegNum & 0x0FFFFFFF);
+    return Ret;
+  } else {
+    // Some special-use registers are actually physical registers.
+    // Encode this as the register class ID of 0 and the real register ID.
+    return Reg & 0x0FFFFFFF;
+  }
+}
+
+MCOperand NVPTXAsmPrinter::GetSymbolRef(const MachineOperand &MO,
+                                        const MCSymbol *Symbol) {
+  const MCExpr *Expr;
+  Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
+                                 OutContext);
+  return MCOperand::CreateExpr(Expr);
 }
 
 void NVPTXAsmPrinter::printReturnValStr(const Function *F, raw_ostream &O) {
@@ -436,9 +540,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
 }
 
 void NVPTXAsmPrinter::EmitFunctionBodyStart() {
-  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
-  unsigned numRegClasses = TRI.getNumRegClasses();
-  VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses + 1];
+  VRegMapping.clear();
   OutStreamer.EmitRawText(StringRef("{\n"));
   setAndEmitFunctionVirtualRegisters(*MF);
 
@@ -450,7 +552,20 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() {
 
 void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
   OutStreamer.EmitRawText(StringRef("}\n"));
-  delete[] VRidGlobal2LocalMap;
+  VRegMapping.clear();
+}
+
+void NVPTXAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+  unsigned RegNo = MI->getOperand(0).getReg();
+  const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+  if (TRI->isVirtualRegister(RegNo)) {
+    OutStreamer.AddComment(Twine("implicit-def: ") +
+                           getVirtualRegisterName(RegNo));
+  } else {
+    OutStreamer.AddComment(Twine("implicit-def: ") +
+                           TM.getRegisterInfo()->getName(RegNo));
+  }
+  OutStreamer.AddBlankLine();
 }
 
 void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
@@ -504,24 +619,30 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
     O << ".minnctapersm " << mincta << "\n";
 }
 
-void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
-                                             raw_ostream &O) {
-  const TargetRegisterClass *RC = MRI->getRegClass(vr);
-  unsigned id = RC->getID();
+std::string
+NVPTXAsmPrinter::getVirtualRegisterName(unsigned Reg) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
 
-  std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[id];
-  unsigned mapped_vr = regmap[vr];
+  std::string Name;
+  raw_string_ostream NameStr(Name);
 
-  if (!isVec) {
-    O << getNVPTXRegClassStr(RC) << mapped_vr;
-    return;
-  }
-  report_fatal_error("Bad register!");
+  VRegRCMap::const_iterator I = VRegMapping.find(RC);
+  assert(I != VRegMapping.end() && "Bad register class");
+  const DenseMap<unsigned, unsigned> &RegMap = I->second;
+
+  VRegMap::const_iterator VI = RegMap.find(Reg);
+  assert(VI != RegMap.end() && "Bad virtual register");
+  unsigned MappedVR = VI->second;
+
+  NameStr << getNVPTXRegClassStr(RC) << MappedVR;
+
+  NameStr.flush();
+  return Name;
 }
 
-void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr, bool isVec,
+void NVPTXAsmPrinter::emitVirtualRegister(unsigned int vr,
                                           raw_ostream &O) {
-  getVirtualRegisterName(vr, isVec, O);
+  O << getVirtualRegisterName(vr);
 }
 
 void NVPTXAsmPrinter::printVecModifiedImmediate(
@@ -554,145 +675,7 @@ void NVPTXAsmPrinter::printVecModifiedImmediate(
     llvm_unreachable("Unknown Modifier on immediate operand");
 }
 
-void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
-                                   raw_ostream &O, const char *Modifier) {
-  const MachineOperand &MO = MI->getOperand(opNum);
-  switch (MO.getType()) {
-  case MachineOperand::MO_Register:
-    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
-      if (MO.getReg() == NVPTX::VRDepot)
-        O << DEPOTNAME << getFunctionNumber();
-      else
-        O << getRegisterName(MO.getReg());
-    } else {
-      if (!Modifier)
-        emitVirtualRegister(MO.getReg(), false, O);
-      else {
-        if (strcmp(Modifier, "vecfull") == 0)
-          emitVirtualRegister(MO.getReg(), true, O);
-        else
-          llvm_unreachable(
-              "Don't know how to handle the modifier on virtual register.");
-      }
-    }
-    return;
 
-  case MachineOperand::MO_Immediate:
-    if (!Modifier)
-      O << MO.getImm();
-    else if (strstr(Modifier, "vec") == Modifier)
-      printVecModifiedImmediate(MO, Modifier, O);
-    else
-      llvm_unreachable(
-          "Don't know how to handle modifier on immediate operand");
-    return;
-
-  case MachineOperand::MO_FPImmediate:
-    printFPConstant(MO.getFPImm(), O);
-    break;
-
-  case MachineOperand::MO_GlobalAddress:
-    O << *Mang->getSymbol(MO.getGlobal());
-    break;
-
-  case MachineOperand::MO_ExternalSymbol: {
-    const char *symbname = MO.getSymbolName();
-    if (strstr(symbname, ".PARAM") == symbname) {
-      unsigned index;
-      sscanf(symbname + 6, "%u[];", &index);
-      printParamName(index, O);
-    } else if (strstr(symbname, ".HLPPARAM") == symbname) {
-      unsigned index;
-      sscanf(symbname + 9, "%u[];", &index);
-      O << *CurrentFnSym << "_param_" << index << "_offset";
-    } else
-      O << symbname;
-    break;
-  }
-
-  case MachineOperand::MO_MachineBasicBlock:
-    O << *MO.getMBB()->getSymbol();
-    return;
-
-  default:
-    llvm_unreachable("Operand type not supported.");
-  }
-}
-
-void NVPTXAsmPrinter::printImplicitDef(const MachineInstr *MI,
-                                       raw_ostream &O) const {
-#ifndef __OPTIMIZE__
-  O << "\t// Implicit def :";
-  //printOperand(MI, 0);
-  O << "\n";
-#endif
-}
-
-void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
-                                      raw_ostream &O, const char *Modifier) {
-  printOperand(MI, opNum, O);
-
-  if (Modifier && !strcmp(Modifier, "add")) {
-    O << ", ";
-    printOperand(MI, opNum + 1, O);
-  } else {
-    if (MI->getOperand(opNum + 1).isImm() &&
-        MI->getOperand(opNum + 1).getImm() == 0)
-      return; // don't print ',0' or '+0'
-    O << "+";
-    printOperand(MI, opNum + 1, O);
-  }
-}
-
-void NVPTXAsmPrinter::printLdStCode(const MachineInstr *MI, int opNum,
-                                    raw_ostream &O, const char *Modifier) {
-  if (Modifier) {
-    const MachineOperand &MO = MI->getOperand(opNum);
-    int Imm = (int) MO.getImm();
-    if (!strcmp(Modifier, "volatile")) {
-      if (Imm)
-        O << ".volatile";
-    } else if (!strcmp(Modifier, "addsp")) {
-      switch (Imm) {
-      case NVPTX::PTXLdStInstCode::GLOBAL:
-        O << ".global";
-        break;
-      case NVPTX::PTXLdStInstCode::SHARED:
-        O << ".shared";
-        break;
-      case NVPTX::PTXLdStInstCode::LOCAL:
-        O << ".local";
-        break;
-      case NVPTX::PTXLdStInstCode::PARAM:
-        O << ".param";
-        break;
-      case NVPTX::PTXLdStInstCode::CONSTANT:
-        O << ".const";
-        break;
-      case NVPTX::PTXLdStInstCode::GENERIC:
-        if (!nvptxSubtarget.hasGenericLdSt())
-          O << ".global";
-        break;
-      default:
-        llvm_unreachable("Wrong Address Space");
-      }
-    } else if (!strcmp(Modifier, "sign")) {
-      if (Imm == NVPTX::PTXLdStInstCode::Signed)
-        O << "s";
-      else if (Imm == NVPTX::PTXLdStInstCode::Unsigned)
-        O << "u";
-      else
-        O << "f";
-    } else if (!strcmp(Modifier, "vec")) {
-      if (Imm == NVPTX::PTXLdStInstCode::V2)
-        O << ".v2";
-      else if (Imm == NVPTX::PTXLdStInstCode::V4)
-        O << ".v4";
-    } else
-      llvm_unreachable("Unknown Modifier");
-  } else
-    llvm_unreachable("Empty Modifier");
-}
 
 void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
 
@@ -702,7 +685,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
   else
     O << ".func ";
   printReturnValStr(F, O);
-  O << *Mang->getSymbol(F) << "\n";
+  O << *getSymbol(F) << "\n";
   emitFunctionParamList(F, O);
   O << ";\n";
 }
@@ -912,7 +895,7 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
   const_cast<TargetLoweringObjectFile &>(getObjFileLowering())
       .Initialize(OutContext, TM);
 
-  Mang = new Mangler(OutContext, *TM.getDataLayout());
+  Mang = new Mangler(&TM);
 
   // Emit header before any dwarf directives are emitted below.
   emitHeader(M, OS1);
@@ -921,6 +904,16 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
   // Already commented out
   //bool Result = AsmPrinter::doInitialization(M);
 
+  // Emit module-level inline asm if it exists.
+  if (!M.getModuleInlineAsm().empty()) {
+    OutStreamer.AddComment("Start of file scope inline assembly");
+    OutStreamer.AddBlankLine();
+    OutStreamer.EmitRawText(StringRef(M.getModuleInlineAsm()));
+    OutStreamer.AddBlankLine();
+    OutStreamer.AddComment("End of file scope inline assembly");
+    OutStreamer.AddBlankLine();
+  }
+
   if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
     recordAndEmitFilenames(M);
 
@@ -1222,12 +1215,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
     else
       O << getPTXFundamentalTypeStr(ETy, false);
     O << " ";
-    O << *Mang->getSymbol(GVar);
+    O << *getSymbol(GVar);
 
     // Ptx allows variable initilization only for constant and global state
     // spaces.
     if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
-         (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
          (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
         GVar->hasInitializer()) {
       const Constant *Initializer = GVar->getInitializer();
@@ -1251,7 +1243,6 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
       // Ptx allows variable initilization only for constant and
       // global state spaces.
       if (((PTy->getAddressSpace() == llvm::ADDRESS_SPACE_GLOBAL) ||
-           (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
            (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
           GVar->hasInitializer()) {
         const Constant *Initializer = GVar->getInitializer();
@@ -1260,15 +1251,15 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           bufferAggregateConstant(Initializer, &aggBuffer);
           if (aggBuffer.numSymbols) {
             if (nvptxSubtarget.is64Bit()) {
-              O << " .u64 " << *Mang->getSymbol(GVar) << "[";
+              O << " .u64 " << *getSymbol(GVar) << "[";
               O << ElementSize / 8;
             } else {
-              O << " .u32 " << *Mang->getSymbol(GVar) << "[";
+              O << " .u32 " << *getSymbol(GVar) << "[";
               O << ElementSize / 4;
             }
             O << "]";
           } else {
-            O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+            O << " .b8 " << *getSymbol(GVar) << "[";
             O << ElementSize;
             O << "]";
           }
@@ -1276,7 +1267,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           aggBuffer.print();
           O << "}";
         } else {
-          O << " .b8 " << *Mang->getSymbol(GVar);
+          O << " .b8 " << *getSymbol(GVar);
           if (ElementSize) {
             O << "[";
             O << ElementSize;
@@ -1284,7 +1275,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
           }
         }
       } else {
-        O << " .b8 " << *Mang->getSymbol(GVar);
+        O << " .b8 " << *getSymbol(GVar);
         if (ElementSize) {
           O << "[";
           O << ElementSize;
@@ -1322,14 +1313,6 @@ void NVPTXAsmPrinter::emitPTXAddressSpace(unsigned int AddressSpace,
     O << "global";
     break;
   case llvm::ADDRESS_SPACE_CONST:
-    // This logic should be consistent with that in
-    // getCodeAddrSpace() (NVPTXISelDATToDAT.cpp)
-    if (nvptxSubtarget.hasGenericLdSt())
-      O << "global";
-    else
-      O << "const";
-    break;
-  case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
     O << "const";
     break;
   case llvm::ADDRESS_SPACE_SHARED:
@@ -1399,7 +1382,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
     O << " .";
     O << getPTXFundamentalTypeStr(ETy);
     O << " ";
-    O << *Mang->getSymbol(GVar);
+    O << *getSymbol(GVar);
     return;
   }
 
@@ -1414,7 +1397,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
   case Type::ArrayTyID:
   case Type::VectorTyID:
     ElementSize = TD->getTypeStoreSize(ETy);
-    O << " .b8 " << *Mang->getSymbol(GVar) << "[";
+    O << " .b8 " << *getSymbol(GVar) << "[";
     if (ElementSize) {
       O << itostr(ElementSize);
     }
@@ -1469,7 +1452,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
                                      int paramIndex, raw_ostream &O) {
   if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
       (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
-    O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex;
+    O << *getSymbol(I->getParent()) << "_param_" << paramIndex;
   else {
     std::string argName = I->getName();
     const char *p = argName.c_str();
@@ -1528,13 +1511,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
       if (llvm::isImage(*I)) {
         std::string sname = I->getName();
         if (llvm::isImageWriteOnly(*I))
-          O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_"
+          O << "\t.param .surfref " << *getSymbol(F) << "_param_"
             << paramIndex;
         else // Default image is read_only
-          O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_"
+          O << "\t.param .texref " << *getSymbol(F) << "_param_"
             << paramIndex;
       } else // Should be llvm::isSampler(*I)
-        O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_"
+        O << "\t.param .samplerref " << *getSymbol(F) << "_param_"
           << paramIndex;
       continue;
     }
@@ -1569,14 +1552,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
             default:
               O << ".ptr ";
               break;
-            case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
+            case llvm::ADDRESS_SPACE_CONST:
               O << ".ptr .const ";
               break;
             case llvm::ADDRESS_SPACE_SHARED:
               O << ".ptr .shared ";
               break;
             case llvm::ADDRESS_SPACE_GLOBAL:
-            case llvm::ADDRESS_SPACE_CONST:
               O << ".ptr .global ";
               break;
             }
@@ -1709,48 +1691,36 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
   for (unsigned i = 0; i < numVRs; i++) {
     unsigned int vr = TRI->index2VirtReg(i);
     const TargetRegisterClass *RC = MRI->getRegClass(vr);
-    std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[RC->getID()];
+    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
     int n = regmap.size();
     regmap.insert(std::make_pair(vr, n + 1));
   }
 
   // Emit register declarations
   // @TODO: Extract out the real register usage
-  O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
-  O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
-  O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
-  O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
-  O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
-  O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
-  O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+  // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
 
   // Emit declaration of the virtual registers or 'physical' registers for
   // each register class
-  //for (unsigned i=0; i< numRegClasses; i++) {
-  //    std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[i];
-  //    const TargetRegisterClass *RC = TRI->getRegClass(i);
-  //    std::string rcname = getNVPTXRegClassName(RC);
-  //    std::string rcStr = getNVPTXRegClassStr(RC);
-  //    //int n = regmap.size();
-  //    if (!isNVPTXVectorRegClass(RC)) {
-  //      O << "\t.reg " << rcname << " \t" << rcStr << "<"
-  //        << NVPTXNumRegisters << ">;\n";
-  //    }
-
-  // Only declare those registers that may be used. And do not emit vector
-  // registers as
-  // they are all elementized to scalar registers.
-  //if (n && !isNVPTXVectorRegClass(RC)) {
-  //    if (RegAllocNilUsed) {
-  //        O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
-  //          << ">;\n";
-  //    }
-  //    else {
-  //        O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr)
-  //          << "<" << 32 << ">;\n";
-  //    }
-  //}
-  //}
+  for (unsigned i=0; i< TRI->getNumRegClasses(); i++) {
+    const TargetRegisterClass *RC = TRI->getRegClass(i);
+    DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+    std::string rcname = getNVPTXRegClassName(RC);
+    std::string rcStr = getNVPTXRegClassStr(RC);
+    int n = regmap.size();
+
+    // Only declare those registers that may be used.
+    if (n) {
+       O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+         << ">;\n";
+    }
+  }
 
   OutStreamer.EmitRawText(O.str());
 }
@@ -1794,13 +1764,13 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
     return;
   }
   if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
-    O << *Mang->getSymbol(GVar);
+    O << *getSymbol(GVar);
     return;
   }
   if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
     const Value *v = Cexpr->stripPointerCasts();
     if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
-      O << *Mang->getSymbol(GVar);
+      O << *getSymbol(GVar);
       return;
     } else {
       O << *LowerConstant(CPV, *this);
@@ -1918,7 +1888,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
   case Type::VectorTyID:
   case Type::StructTyID: {
     if (isa<ConstantArray>(CPV) || isa<ConstantVector>(CPV) ||
-        isa<ConstantStruct>(CPV)) {
+        isa<ConstantStruct>(CPV) || isa<ConstantDataSequential>(CPV)) {
       int ElementSize = TD->getTypeAllocSize(CPV->getType());
       bufferAggregateConstant(CPV, aggBuffer);
       if (Bytes > ElementSize)
@@ -1991,41 +1961,6 @@ bool NVPTXAsmPrinter::isImageType(const Type *Ty) {
   return false;
 }
 
-/// PrintAsmOperand - Print out an operand for an inline asm expression.
-///
-bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                                      unsigned AsmVariant,
-                                      const char *ExtraCode, raw_ostream &O) {
-  if (ExtraCode && ExtraCode[0]) {
-    if (ExtraCode[1] != 0)
-      return true; // Unknown modifier.
-
-    switch (ExtraCode[0]) {
-    default:
-      // See if this is a generic print operand
-      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
-    case 'r':
-      break;
-    }
-  }
-
-  printOperand(MI, OpNo, O);
-
-  return false;
-}
-
-bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
-    const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
-    const char *ExtraCode, raw_ostream &O) {
-  if (ExtraCode && ExtraCode[0])
-    return true; // Unknown modifier
-
-  O << '[';
-  printMemOperand(MI, OpNo, O);
-  O << ']';
-
-  return false;
-}
 
 bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
@@ -2040,7 +1975,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
   case NVPTX::CallArgI32:
   case NVPTX::CallArgI32imm:
   case NVPTX::CallArgI64:
-  case NVPTX::CallArgI8:
   case NVPTX::CallArgParam:
   case NVPTX::CallVoidInst:
   case NVPTX::CallVoidInstReg:
@@ -2058,10 +1992,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
   case NVPTX::StoreParamI32:
   case NVPTX::StoreParamI64:
   case NVPTX::StoreParamI8:
-  case NVPTX::StoreParamS32I8:
-  case NVPTX::StoreParamU32I8:
-  case NVPTX::StoreParamS32I16:
-  case NVPTX::StoreParamU32I16:
   case NVPTX::StoreRetvalF32:
   case NVPTX::StoreRetvalF64:
   case NVPTX::StoreRetvalI16:
@@ -2074,7 +2004,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
   case NVPTX::LastCallArgI32:
   case NVPTX::LastCallArgI32imm:
   case NVPTX::LastCallArgI64:
-  case NVPTX::LastCallArgI8:
   case NVPTX::LastCallArgParam:
   case NVPTX::LoadParamMemF32:
   case NVPTX::LoadParamMemF64:
@@ -2082,12 +2011,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
   case NVPTX::LoadParamMemI32:
   case NVPTX::LoadParamMemI64:
   case NVPTX::LoadParamMemI8:
-  case NVPTX::LoadParamRegF32:
-  case NVPTX::LoadParamRegF64:
-  case NVPTX::LoadParamRegI16:
-  case NVPTX::LoadParamRegI32:
-  case NVPTX::LoadParamRegI64:
-  case NVPTX::LoadParamRegI8:
   case NVPTX::PrototypeInst:
   case NVPTX::DBG_VALUE:
     return true;
@@ -2095,6 +2018,116 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) {
   return false;
 }
 
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool NVPTXAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      // See if this is a generic print operand
+      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
+    case 'r':
+      break;
+    }
+  }
+
+  printOperand(MI, OpNo, O);
+
+  return false;
+}
+
+bool NVPTXAsmPrinter::PrintAsmMemoryOperand(
+    const MachineInstr *MI, unsigned OpNo, unsigned AsmVariant,
+    const char *ExtraCode, raw_ostream &O) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier
+
+  O << '[';
+  printMemOperand(MI, OpNo, O);
+  O << ']';
+
+  return false;
+}
+
+void NVPTXAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
+                                   raw_ostream &O, const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+      if (MO.getReg() == NVPTX::VRDepot)
+        O << DEPOTNAME << getFunctionNumber();
+      else
+        O << NVPTXInstPrinter::getRegisterName(MO.getReg());
+    } else {
+      emitVirtualRegister(MO.getReg(), O);
+    }
+    return;
+
+  case MachineOperand::MO_Immediate:
+    if (!Modifier)
+      O << MO.getImm();
+    else if (strstr(Modifier, "vec") == Modifier)
+      printVecModifiedImmediate(MO, Modifier, O);
+    else
+      llvm_unreachable(
+          "Don't know how to handle modifier on immediate operand");
+    return;
+
+  case MachineOperand::MO_FPImmediate:
+    printFPConstant(MO.getFPImm(), O);
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol: {
+    const char *symbname = MO.getSymbolName();
+    if (strstr(symbname, ".PARAM") == symbname) {
+      unsigned index;
+      sscanf(symbname + 6, "%u[];", &index);
+      printParamName(index, O);
+    } else if (strstr(symbname, ".HLPPARAM") == symbname) {
+      unsigned index;
+      sscanf(symbname + 9, "%u[];", &index);
+      O << *CurrentFnSym << "_param_" << index << "_offset";
+    } else
+      O << symbname;
+    break;
+  }
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol();
+    return;
+
+  default:
+    llvm_unreachable("Operand type not supported.");
+  }
+}
+
+void NVPTXAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      raw_ostream &O, const char *Modifier) {
+  printOperand(MI, opNum, O);
+
+  if (Modifier && !strcmp(Modifier, "add")) {
+    O << ", ";
+    printOperand(MI, opNum + 1, O);
+  } else {
+    if (MI->getOperand(opNum + 1).isImm() &&
+        MI->getOperand(opNum + 1).getImm() == 0)
+      return; // don't print ',0' or '+0'
+    O << "+";
+    printOperand(MI, opNum + 1, O);
+  }
+}
+
+
 // Force static initialization.
 extern "C" void LLVMInitializeNVPTXBackendAsmPrinter() {
   RegisterAsmPrinter<NVPTXAsmPrinter> X(TheNVPTXTarget32);
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 7faa6b2..3abe5d1 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -155,7 +155,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
           if (pos == nextSymbolPos) {
             const Value *v = Symbols[nSym];
             if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
-              MCSymbol *Name = AP.Mang->getSymbol(GVar);
+              MCSymbol *Name = AP.getSymbol(GVar);
               O << *Name;
             } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
               O << *nvptx::LowerConstant(Cexpr, AP);
@@ -188,16 +188,17 @@ private:
   void EmitFunctionEntryLabel();
   void EmitFunctionBodyStart();
   void EmitFunctionBodyEnd();
+  void emitImplicitDef(const MachineInstr *MI) const;
 
   void EmitInstruction(const MachineInstr *);
+  void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+  MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+  unsigned encodeVirtualRegister(unsigned Reg);
 
   void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {}
 
   void printGlobalVariable(const GlobalVariable *GVar);
-  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
-                    const char *Modifier = 0);
-  void printLdStCode(const MachineInstr *MI, int opNum, raw_ostream &O,
-                     const char *Modifier = 0);
   void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
                                  raw_ostream &O);
   void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
@@ -213,22 +214,23 @@ private:
   void emitGlobals(const Module &M);
   void emitHeader(Module &M, raw_ostream &O);
   void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
-  void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
+  void emitVirtualRegister(unsigned int vr, raw_ostream &);
   void emitFunctionExternParamList(const MachineFunction &MF);
   void emitFunctionParamList(const Function *, raw_ostream &O);
   void emitFunctionParamList(const MachineFunction &MF, raw_ostream &O);
   void setAndEmitFunctionVirtualRegisters(const MachineFunction &MF);
   void emitFunctionTempData(const MachineFunction &MF, unsigned &FrameSize);
   bool isImageType(const Type *Ty);
+  void printReturnValStr(const Function *, raw_ostream &O);
+  void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
                        raw_ostream &);
+  void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
+                    const char *Modifier = 0);
   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                              unsigned AsmVariant, const char *ExtraCode,
                              raw_ostream &);
-  void printReturnValStr(const Function *, raw_ostream &O);
-  void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
-
 protected:
   bool doInitialization(Module &M);
   bool doFinalization(Module &M);
@@ -243,7 +245,9 @@ private:
   // The contents are specific for each
   // MachineFunction. But the size of the
   // array is not.
-  std::map<unsigned, unsigned> *VRidGlobal2LocalMap;
+  typedef DenseMap<unsigned, unsigned> VRegMap;
+  typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
+  VRegRCMap VRegMapping;
   // cache the subtarget here.
   const NVPTXSubtarget &nvptxSubtarget;
   // Build the map between type name and ID based on module's type
@@ -281,7 +285,6 @@ public:
       : AsmPrinter(TM, Streamer),
         nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
     CurrentBankselLabelInBasicBlock = "";
-    VRidGlobal2LocalMap = NULL;
     reader = NULL;
   }
 
@@ -292,7 +295,7 @@ public:
 
   bool ignoreLoc(const MachineInstr &);
 
-  virtual void getVirtualRegisterName(unsigned, bool, raw_ostream &);
+  std::string getVirtualRegisterName(unsigned) const;
 
   DebugLoc prevDebugLoc;
   void emitLineNumberAsDotLoc(const MachineInstr &);
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index 6533da5..9030584f 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
@@ -36,30 +37,24 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF) const {
     // in the BB, so giving it no debug location.
     DebugLoc dl = DebugLoc();
 
-    if (tm.getSubtargetImpl()->hasGenericLdSt()) {
-      // mov %SPL, %depot;
-      // cvta.local %SP, %SPL;
-      if (is64bit) {
-        MachineInstr *MI = BuildMI(
-            MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64),
-            NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
-        BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::IMOV64rr),
-                NVPTX::VRFrameLocal).addReg(NVPTX::VRDepot);
-      } else {
-        MachineInstr *MI = BuildMI(
-            MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes),
-            NVPTX::VRFrame).addReg(NVPTX::VRFrameLocal);
-        BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::IMOV32rr),
-                NVPTX::VRFrameLocal).addReg(NVPTX::VRDepot);
-      }
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    // mov %SPL, %depot;
+    // cvta.local %SP, %SPL;
+    if (is64bit) {
+      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int64RegsRegClass);
+      MachineInstr *MI = BuildMI(
+          MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes_64),
+          NVPTX::VRFrame).addReg(LocalReg);
+      BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR_64),
+              LocalReg).addImm(MF.getFunctionNumber());
     } else {
-      // mov %SP, %depot;
-      if (is64bit)
-        BuildMI(MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::IMOV64rr),
-                NVPTX::VRFrame).addReg(NVPTX::VRDepot);
-      else
-        BuildMI(MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::IMOV32rr),
-                NVPTX::VRFrame).addReg(NVPTX::VRDepot);
+      unsigned LocalReg = MRI.createVirtualRegister(&NVPTX::Int32RegsRegClass);
+      MachineInstr *MI = BuildMI(
+          MBB, MBBI, dl, tm.getInstrInfo()->get(NVPTX::cvta_local_yes),
+          NVPTX::VRFrame).addReg(LocalReg);
+      BuildMI(MBB, MI, dl, tm.getInstrInfo()->get(NVPTX::MOV_DEPOT_ADDR),
+              LocalReg).addImm(MF.getFunctionNumber());
     }
   }
 }
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 1077c46..9fb0dd8 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -142,7 +142,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
     GlobalVariable *GV = I->first;
     GlobalVariable *NewGV = I->second;
     ++I;
-    Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType());
+    Constant *BitCastNewGV = ConstantExpr::getPointerCast(NewGV, GV->getType());
     // At this point, the remaining uses of GV should be found only in global
     // variable initializers, as other uses have been already been removed
     // while walking through the instructions in function definitions.
@@ -384,7 +384,7 @@ void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
 
   // Replace the old operands with the new operands.
   N->dropAllReferences();
-  for (SmallVector<MDNode *, 16>::iterator I = NewOperands.begin(),
+  for (SmallVectorImpl<MDNode *>::iterator I = NewOperands.begin(),
                                            E = NewOperands.end();
        I != E; ++I) {
     N->addOperand(*I);
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index d4378c2..4b8b306 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -25,28 +25,29 @@
 
 using namespace llvm;
 
-static cl::opt<bool> UseFMADInstruction(
-    "nvptx-mad-enable", cl::ZeroOrMore,
-    cl::desc("NVPTX Specific: Enable generating FMAD instructions"),
-    cl::init(false));
-
 static cl::opt<int>
-FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore,
+FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                  cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
                           " 1: do it  2: do it aggressively"),
                  cl::init(2));
 
 static cl::opt<int> UsePrecDivF32(
-    "nvptx-prec-divf32", cl::ZeroOrMore,
+    "nvptx-prec-divf32", cl::ZeroOrMore, cl::Hidden,
     cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use"
              " IEEE Compliant F32 div.rnd if avaiable."),
     cl::init(2));
 
 static cl::opt<bool>
-UsePrecSqrtF32("nvptx-prec-sqrtf32",
+UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden,
           cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
           cl::init(true));
 
+static cl::opt<bool>
+FtzEnabled("nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
+           cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
+           cl::init(false));
+
+
 /// createNVPTXISelDag - This pass converts a legalized DAG into a
 /// NVPTX-specific DAG, ready for instruction scheduling.
 FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
@@ -58,12 +59,7 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                      CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel),
       Subtarget(tm.getSubtarget<NVPTXSubtarget>()) {
-  // Always do fma.f32 fpcontract if the target supports the instruction.
-  // Always do fma.f64 fpcontract if the target supports the instruction.
-  // Do mad.f32 is nvptx-mad-enable is specified and the target does not
-  // support fma.f32.
 
-  doFMADF32 = (OptLevel > 0) && UseFMADInstruction && !Subtarget.hasFMAF32();
   doFMAF32 = (OptLevel > 0) && Subtarget.hasFMAF32() && (FMAContractLevel >= 1);
   doFMAF64 = (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel >= 1);
   doFMAF32AGG =
@@ -71,28 +67,61 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
   doFMAF64AGG =
       (OptLevel > 0) && Subtarget.hasFMAF64() && (FMAContractLevel == 2);
 
-  allowFMA = (FMAContractLevel >= 1) || UseFMADInstruction;
-
-  UseF32FTZ = false;
+  allowFMA = (FMAContractLevel >= 1);
 
   doMulWide = (OptLevel > 0);
+}
 
-  // Decide how to translate f32 div
-  do_DIVF32_PREC = UsePrecDivF32;
-  // Decide how to translate f32 sqrt
-  do_SQRTF32_PREC = UsePrecSqrtF32;
-  // sm less than sm_20 does not support div.rnd. Use div.full.
-  if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
-    do_DIVF32_PREC = 1;
+int NVPTXDAGToDAGISel::getDivF32Level() const {
+  if (UsePrecDivF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-div32=N is used on the command-line, always honor it
+    return UsePrecDivF32;
+  } else {
+    // Otherwise, use div.approx if fast math is enabled
+    if (TM.Options.UnsafeFPMath)
+      return 0;
+    else
+      return 2;
+  }
+}
+
+bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
+  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
+    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
+    return UsePrecSqrtF32;
+  } else {
+    // Otherwise, use sqrt.approx if fast math is enabled
+    if (TM.Options.UnsafeFPMath)
+      return false;
+    else
+      return true;
+  }
+}
 
+bool NVPTXDAGToDAGISel::useF32FTZ() const {
+  if (FtzEnabled.getNumOccurrences() > 0) {
+    // If nvptx-f32ftz is used on the command-line, always honor it
+    return FtzEnabled;
+  } else {
+    const Function *F = MF->getFunction();
+    // Otherwise, check for an nvptx-f32ftz attribute on the function
+    if (F->hasFnAttribute("nvptx-f32ftz"))
+      return (F->getAttributes().getAttribute(AttributeSet::FunctionIndex,
+                                              "nvptx-f32ftz")
+                                              .getValueAsString() == "true");
+    else
+      return false;
+  }
 }
 
 /// Select - Select instructions not customized! Used for
 /// expanded, promoted and normal instructions.
 SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
 
-  if (N->isMachineOpcode())
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL; // Already selected.
+  }
 
   SDNode *ResNode = NULL;
   switch (N->getOpcode()) {
@@ -116,6 +145,23 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
   case NVPTXISD::StoreV4:
     ResNode = SelectStoreVector(N);
     break;
+  case NVPTXISD::LoadParam:
+  case NVPTXISD::LoadParamV2:
+  case NVPTXISD::LoadParamV4:
+    ResNode = SelectLoadParam(N);
+    break;
+  case NVPTXISD::StoreRetval:
+  case NVPTXISD::StoreRetvalV2:
+  case NVPTXISD::StoreRetvalV4:
+    ResNode = SelectStoreRetval(N);
+    break;
+  case NVPTXISD::StoreParam:
+  case NVPTXISD::StoreParamV2:
+  case NVPTXISD::StoreParamV4:
+  case NVPTXISD::StoreParamS32:
+  case NVPTXISD::StoreParamU32:
+    ResNode = SelectStoreParam(N);
+    break;
   default:
     break;
   }
@@ -127,42 +173,26 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
 static unsigned int getCodeAddrSpace(MemSDNode *N,
                                      const NVPTXSubtarget &Subtarget) {
   const Value *Src = N->getSrcValue();
+
   if (!Src)
-    return NVPTX::PTXLdStInstCode::LOCAL;
+    return NVPTX::PTXLdStInstCode::GENERIC;
 
   if (const PointerType *PT = dyn_cast<PointerType>(Src->getType())) {
     switch (PT->getAddressSpace()) {
-    case llvm::ADDRESS_SPACE_LOCAL:
-      return NVPTX::PTXLdStInstCode::LOCAL;
-    case llvm::ADDRESS_SPACE_GLOBAL:
-      return NVPTX::PTXLdStInstCode::GLOBAL;
-    case llvm::ADDRESS_SPACE_SHARED:
-      return NVPTX::PTXLdStInstCode::SHARED;
-    case llvm::ADDRESS_SPACE_CONST_NOT_GEN:
-      return NVPTX::PTXLdStInstCode::CONSTANT;
-    case llvm::ADDRESS_SPACE_GENERIC:
-      return NVPTX::PTXLdStInstCode::GENERIC;
-    case llvm::ADDRESS_SPACE_PARAM:
-      return NVPTX::PTXLdStInstCode::PARAM;
-    case llvm::ADDRESS_SPACE_CONST:
-      // If the arch supports generic address space, translate it to GLOBAL
-      // for correctness.
-      // If the arch does not support generic address space, then the arch
-      // does not really support ADDRESS_SPACE_CONST, translate it to
-      // to CONSTANT for better performance.
-      if (Subtarget.hasGenericLdSt())
-        return NVPTX::PTXLdStInstCode::GLOBAL;
-      else
-        return NVPTX::PTXLdStInstCode::CONSTANT;
-    default:
-      break;
+    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
+    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
+    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
+    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
+    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
+    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
+    default: break;
     }
   }
-  return NVPTX::PTXLdStInstCode::LOCAL;
+  return NVPTX::PTXLdStInstCode::GENERIC;
 }
 
 SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   LoadSDNode *LD = cast<LoadSDNode>(N);
   EVT LoadedVT = LD->getMemoryVT();
   SDNode *NVPTXLD = NULL;
@@ -205,7 +235,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
   //          type is integer
   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
   MVT ScalarVT = SimpleVT.getScalarType();
-  unsigned fromTypeWidth = ScalarVT.getSizeInBits();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
   unsigned int fromType;
   if ((LD->getExtensionType() == ISD::SEXTLOAD))
     fromType = NVPTX::PTXLdStInstCode::Signed;
@@ -220,7 +251,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
   SDValue Addr;
   SDValue Offset, Base;
   unsigned Opcode;
-  MVT::SimpleValueType TargetVT = LD->getValueType(0).getSimpleVT().SimpleTy;
+  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
 
   if (SelectDirectAddr(N1, Addr)) {
     switch (TargetVT) {
@@ -401,7 +432,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
   SDValue Op1 = N->getOperand(1);
   SDValue Addr, Offset, Base;
   unsigned Opcode;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   SDNode *LD;
   MemSDNode *MemSD = cast<MemSDNode>(N);
   EVT LoadedVT = MemSD->getMemoryVT();
@@ -430,7 +461,8 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
   //          type is integer
   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
   MVT ScalarVT = SimpleVT.getScalarType();
-  unsigned FromTypeWidth = ScalarVT.getSizeInBits();
+  // Read at least 8 bits (predicates are stored as 8-bit values)
+  unsigned FromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
   unsigned int FromType;
   // The last operand holds the original LoadSDNode::getExtensionType() value
   unsigned ExtensionType = cast<ConstantSDNode>(
@@ -782,194 +814,478 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
   SDValue Chain = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   unsigned Opcode;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   SDNode *LD;
+  MemSDNode *Mem = cast<MemSDNode>(N);
+  SDValue Base, Offset, Addr;
 
-  EVT RetVT = N->getValueType(0);
+  EVT EltVT = Mem->getMemoryVT().getVectorElementType();
 
-  // Select opcode
-  if (Subtarget.is64Bit()) {
+  if (SelectDirectAddr(Op1, Addr)) {
     switch (N->getOpcode()) {
     default:
       return NULL;
     case NVPTXISD::LDGV2:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+      switch (EltVT.getSimpleVT().SimpleTy) {
       default:
         return NULL;
       case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
         break;
       case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
         break;
       case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
         break;
       case MVT::i64:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
         break;
       case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
         break;
       case MVT::f64:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
         break;
       }
       break;
-    case NVPTXISD::LDGV4:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+    case NVPTXISD::LDUV2:
+      switch (EltVT.getSimpleVT().SimpleTy) {
       default:
         return NULL;
       case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
         break;
       case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
         break;
       case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
         break;
       case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
         break;
       }
       break;
-    case NVPTXISD::LDUV2:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+    case NVPTXISD::LDGV4:
+      switch (EltVT.getSimpleVT().SimpleTy) {
       default:
         return NULL;
       case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
         break;
       case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
         break;
       case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_64;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
         break;
       case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_64;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
         break;
       }
       break;
     case NVPTXISD::LDUV4:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+      switch (EltVT.getSimpleVT().SimpleTy) {
       default:
         return NULL;
       case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
         break;
       case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
         break;
       case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
         break;
       case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_64;
+        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
         break;
       }
       break;
     }
-  } else {
-    switch (N->getOpcode()) {
-    default:
-      return NULL;
-    case NVPTXISD::LDGV2:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+
+    SDValue Ops[] = { Addr, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(),
+                                ArrayRef<SDValue>(Ops, 2));
+  } else if (Subtarget.is64Bit()
+                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
+                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
       default:
         return NULL;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_32;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_32;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_32;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
+          break;
+        }
         break;
-      case MVT::i64:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_32;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
+          break;
+        }
         break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_32;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
+          break;
+        }
         break;
-      case MVT::f64:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_32;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
+          break;
+        }
         break;
       }
-      break;
-    case NVPTXISD::LDGV4:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+    } else {
+      switch (N->getOpcode()) {
       default:
         return NULL;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_32;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
+          break;
+        }
         break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_32;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
+          break;
+        }
         break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_32;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
+          break;
+        }
         break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_32;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
+          break;
+        }
         break;
       }
-      break;
-    case NVPTXISD::LDUV2:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+    }
+
+    SDValue Ops[] = { Base, Offset, Chain };
+
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(),
+                                ArrayRef<SDValue>(Ops, 3));
+  } else {
+    if (Subtarget.is64Bit()) {
+      switch (N->getOpcode()) {
       default:
         return NULL;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_32;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_32;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_32;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
+          break;
+        }
         break;
-      case MVT::i64:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_32;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
+          break;
+        }
         break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_32;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
+          break;
+        }
         break;
-      case MVT::f64:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_32;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
+          break;
+        }
         break;
       }
-      break;
-    case NVPTXISD::LDUV4:
-      switch (RetVT.getSimpleVT().SimpleTy) {
+    } else {
+      switch (N->getOpcode()) {
       default:
         return NULL;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_32;
+      case NVPTXISD::LDGV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
+          break;
+        }
         break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_32;
+      case NVPTXISD::LDUV2:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
+          break;
+        case MVT::i64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
+          break;
+        case MVT::f64:
+          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
+          break;
+        }
         break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_32;
+      case NVPTXISD::LDGV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
+          break;
+        }
         break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_32;
+      case NVPTXISD::LDUV4:
+        switch (EltVT.getSimpleVT().SimpleTy) {
+        default:
+          return NULL;
+        case MVT::i8:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
+          break;
+        case MVT::i16:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
+          break;
+        case MVT::i32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
+          break;
+        case MVT::f32:
+          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
+          break;
+        }
         break;
       }
-      break;
     }
-  }
 
-  SDValue Ops[] = { Op1, Chain };
-  LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    SDValue Ops[] = { Op1, Chain };
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(),
+                                ArrayRef<SDValue>(Ops, 2));
+  }
 
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
   MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
@@ -979,7 +1295,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
 }
 
 SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   StoreSDNode *ST = cast<StoreSDNode>(N);
   EVT StoreVT = ST->getMemoryVT();
   SDNode *NVPTXST = NULL;
@@ -1033,8 +1349,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
   SDValue Addr;
   SDValue Offset, Base;
   unsigned Opcode;
-  MVT::SimpleValueType SourceVT =
-      N1.getNode()->getValueType(0).getSimpleVT().SimpleTy;
+  MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
 
   if (SelectDirectAddr(N2, Addr)) {
     switch (SourceVT) {
@@ -1214,7 +1529,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
   SDValue Op1 = N->getOperand(1);
   SDValue Addr, Offset, Base;
   unsigned Opcode;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   SDNode *ST;
   EVT EltVT = Op1.getValueType();
   MemSDNode *MemSD = cast<MemSDNode>(N);
@@ -1585,6 +1900,414 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
   return ST;
 }
 
+SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue Offset = Node->getOperand(2);
+  SDValue Flag = Node->getOperand(3);
+  SDLoc DL(Node);
+  MemSDNode *Mem = cast<MemSDNode>(Node);
+
+  unsigned VecSize;
+  switch (Node->getOpcode()) {
+  default:
+    return NULL;
+  case NVPTXISD::LoadParam:
+    VecSize = 1;
+    break;
+  case NVPTXISD::LoadParamV2:
+    VecSize = 2;
+    break;
+  case NVPTXISD::LoadParamV4:
+    VecSize = 4;
+    break;
+  }
+
+  EVT EltVT = Node->getValueType(0);
+  EVT MemVT = Mem->getMemoryVT();
+
+  unsigned Opc = 0;
+
+  switch (VecSize) {
+  default:
+    return NULL;
+  case 1:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return NULL;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemI8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemI8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemI16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemI32;
+      break;
+    case MVT::i64:
+      Opc = NVPTX::LoadParamMemI64;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemF32;
+      break;
+    case MVT::f64:
+      Opc = NVPTX::LoadParamMemF64;
+      break;
+    }
+    break;
+  case 2:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return NULL;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemV2I8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemV2I8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemV2I16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemV2I32;
+      break;
+    case MVT::i64:
+      Opc = NVPTX::LoadParamMemV2I64;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemV2F32;
+      break;
+    case MVT::f64:
+      Opc = NVPTX::LoadParamMemV2F64;
+      break;
+    }
+    break;
+  case 4:
+    switch (MemVT.getSimpleVT().SimpleTy) {
+    default:
+      return NULL;
+    case MVT::i1:
+      Opc = NVPTX::LoadParamMemV4I8;
+      break;
+    case MVT::i8:
+      Opc = NVPTX::LoadParamMemV4I8;
+      break;
+    case MVT::i16:
+      Opc = NVPTX::LoadParamMemV4I16;
+      break;
+    case MVT::i32:
+      Opc = NVPTX::LoadParamMemV4I32;
+      break;
+    case MVT::f32:
+      Opc = NVPTX::LoadParamMemV4F32;
+      break;
+    }
+    break;
+  }
+
+  SDVTList VTs;
+  if (VecSize == 1) {
+    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
+  } else if (VecSize == 2) {
+    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
+  } else {
+    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
+    VTs = CurDAG->getVTList(&EVTs[0], 5);
+  }
+
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+
+  SmallVector<SDValue, 2> Ops;
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+  Ops.push_back(Chain);
+  Ops.push_back(Flag);
+
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opc, DL, VTs, Ops);
+  return Ret;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Offset = N->getOperand(1);
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+  MemSDNode *Mem = cast<MemSDNode>(N);
+
+  // How many elements do we have?
+  unsigned NumElts = 1;
+  switch (N->getOpcode()) {
+  default:
+    return NULL;
+  case NVPTXISD::StoreRetval:
+    NumElts = 1;
+    break;
+  case NVPTXISD::StoreRetvalV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::StoreRetvalV4:
+    NumElts = 4;
+    break;
+  }
+
+  // Build vector of operands
+  SmallVector<SDValue, 6> Ops;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Ops.push_back(N->getOperand(i + 2));
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+  Ops.push_back(Chain);
+
+  // Determine target opcode
+  // If we have an i1, use an 8-bit store. The lowering code in
+  // NVPTXISelLowering will have already emitted an upcast.
+  unsigned Opcode = 0;
+  switch (NumElts) {
+  default:
+    return NULL;
+  case 1:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return NULL;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalI8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalI8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalI16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalI32;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::StoreRetvalI64;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalF32;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::StoreRetvalF64;
+      break;
+    }
+    break;
+  case 2:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return NULL;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalV2I8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalV2I8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalV2I16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalV2I32;
+      break;
+    case MVT::i64:
+      Opcode = NVPTX::StoreRetvalV2I64;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalV2F32;
+      break;
+    case MVT::f64:
+      Opcode = NVPTX::StoreRetvalV2F64;
+      break;
+    }
+    break;
+  case 4:
+    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+    default:
+      return NULL;
+    case MVT::i1:
+      Opcode = NVPTX::StoreRetvalV4I8;
+      break;
+    case MVT::i8:
+      Opcode = NVPTX::StoreRetvalV4I8;
+      break;
+    case MVT::i16:
+      Opcode = NVPTX::StoreRetvalV4I16;
+      break;
+    case MVT::i32:
+      Opcode = NVPTX::StoreRetvalV4I32;
+      break;
+    case MVT::f32:
+      Opcode = NVPTX::StoreRetvalV4F32;
+      break;
+    }
+    break;
+  }
+
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return Ret;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Param = N->getOperand(1);
+  unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue();
+  SDValue Offset = N->getOperand(2);
+  unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
+  MemSDNode *Mem = cast<MemSDNode>(N);
+  SDValue Flag = N->getOperand(N->getNumOperands() - 1);
+
+  // How many elements do we have?
+  unsigned NumElts = 1;
+  switch (N->getOpcode()) {
+  default:
+    return NULL;
+  case NVPTXISD::StoreParamU32:
+  case NVPTXISD::StoreParamS32:
+  case NVPTXISD::StoreParam:
+    NumElts = 1;
+    break;
+  case NVPTXISD::StoreParamV2:
+    NumElts = 2;
+    break;
+  case NVPTXISD::StoreParamV4:
+    NumElts = 4;
+    break;
+  }
+
+  // Build vector of operands
+  SmallVector<SDValue, 8> Ops;
+  for (unsigned i = 0; i < NumElts; ++i)
+    Ops.push_back(N->getOperand(i + 3));
+  Ops.push_back(CurDAG->getTargetConstant(ParamVal, MVT::i32));
+  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32));
+  Ops.push_back(Chain);
+  Ops.push_back(Flag);
+
+  // Determine target opcode
+  // If we have an i1, use an 8-bit store. The lowering code in
+  // NVPTXISelLowering will have already emitted an upcast.
+  unsigned Opcode = 0;
+  switch (N->getOpcode()) {
+  default:
+    switch (NumElts) {
+    default:
+      return NULL;
+    case 1:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamI8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamI8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamI16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamI32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamI64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamF32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamF64;
+        break;
+      }
+      break;
+    case 2:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamV2I8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV2I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV2I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV2I32;
+        break;
+      case MVT::i64:
+        Opcode = NVPTX::StoreParamV2I64;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV2F32;
+        break;
+      case MVT::f64:
+        Opcode = NVPTX::StoreParamV2F64;
+        break;
+      }
+      break;
+    case 4:
+      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
+      default:
+        return NULL;
+      case MVT::i1:
+        Opcode = NVPTX::StoreParamV4I8;
+        break;
+      case MVT::i8:
+        Opcode = NVPTX::StoreParamV4I8;
+        break;
+      case MVT::i16:
+        Opcode = NVPTX::StoreParamV4I16;
+        break;
+      case MVT::i32:
+        Opcode = NVPTX::StoreParamV4I32;
+        break;
+      case MVT::f32:
+        Opcode = NVPTX::StoreParamV4F32;
+        break;
+      }
+      break;
+    }
+    break;
+  // Special case: if we have a sign-extend/zero-extend node, insert the
+  // conversion instruction first, and use that as the value operand to
+  // the selected StoreParam node.
+  case NVPTXISD::StoreParamU32: {
+    Opcode = NVPTX::StoreParamI32;
+    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                                MVT::i32);
+    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
+                                         MVT::i32, Ops[0], CvtNone);
+    Ops[0] = SDValue(Cvt, 0);
+    break;
+  }
+  case NVPTXISD::StoreParamS32: {
+    Opcode = NVPTX::StoreParamI32;
+    SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
+                                                MVT::i32);
+    SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
+                                         MVT::i32, Ops[0], CvtNone);
+    Ops[0] = SDValue(Cvt, 0);
+    break;
+  }
+  }
+
+  SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+  SDNode *Ret =
+      CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
+  cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
+
+  return Ret;
+}
+
 // SelectDirectAddr - Match a direct address for DAG.
 // A direct address could be a globaladdress or externalsymbol.
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index ed16d44..d961e50 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -28,38 +28,22 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
 
   // If true, generate corresponding FPCONTRACT. This is
   // language dependent (i.e. CUDA and OpenCL works differently).
-  bool doFMADF32;
   bool doFMAF64;
   bool doFMAF32;
   bool doFMAF64AGG;
   bool doFMAF32AGG;
   bool allowFMA;
 
-  // 0: use div.approx
-  // 1: use div.full
-  // 2: For sm_20 and later, ieee-compliant div.rnd.f32 can be generated;
-  //    Otherwise, use div.full
-  int do_DIVF32_PREC;
-
-  // If true, generate sqrt.rn, else generate sqrt.approx. If FTZ
-  // is true, then generate the corresponding FTZ version.
-  bool do_SQRTF32_PREC;
-
-  // If true, add .ftz to f32 instructions.
-  // This is only meaningful for sm_20 and later, as the default
-  // is not ftz.
-  // For sm earlier than sm_20, f32 denorms are always ftz by the
-  // hardware.
-  // We always add the .ftz modifier regardless of the sm value
-  // when Use32FTZ is true.
-  bool UseF32FTZ;
-
   // If true, generate mul.wide from sext and mul
   bool doMulWide;
 
+  int getDivF32Level() const;
+  bool usePrecSqrtF32() const;
+  bool useF32FTZ() const;
+
 public:
   explicit NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
-                             CodeGenOpt::Level OptLevel);
+                             CodeGenOpt::Level   OptLevel);
 
   // Pass Name
   virtual const char *getPassName() const {
@@ -80,7 +64,10 @@ private:
   SDNode *SelectLDGLDUVector(SDNode *N);
   SDNode *SelectStore(SDNode *N);
   SDNode *SelectStoreVector(SDNode *N);
-
+  SDNode *SelectLoadParam(SDNode *N);
+  SDNode *SelectStoreRetval(SDNode *N);
+  SDNode *SelectStoreParam(SDNode *N);
+        
   inline SDValue getI32Imm(unsigned Imm) {
     return CurDAG->getTargetConstant(Imm, MVT::i32);
   }
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6e01a5a..6a8be75 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -51,6 +51,8 @@ static bool IsPTXVectorType(MVT VT) {
   switch (VT.SimpleTy) {
   default:
     return false;
+  case MVT::v2i1:
+  case MVT::v4i1:
   case MVT::v2i8:
   case MVT::v4i8:
   case MVT::v2i16:
@@ -65,6 +67,37 @@ static bool IsPTXVectorType(MVT VT) {
   }
 }
 
+/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
+/// EVTs that compose it.  Unlike ComputeValueVTs, this will break apart vectors
+/// into their primitive components.
+/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
+/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
+/// LowerCall, and LowerReturn.
+static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
+                               SmallVectorImpl<EVT> &ValueVTs,
+                               SmallVectorImpl<uint64_t> *Offsets = 0,
+                               uint64_t StartingOffset = 0) {
+  SmallVector<EVT, 16> TempVTs;
+  SmallVector<uint64_t, 16> TempOffsets;
+
+  ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset);
+  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
+    EVT VT = TempVTs[i];
+    uint64_t Off = TempOffsets[i];
+    if (VT.isVector())
+      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
+        ValueVTs.push_back(VT.getVectorElementType());
+        if (Offsets)
+          Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
+      }
+    else {
+      ValueVTs.push_back(VT);
+      if (Offsets)
+        Offsets->push_back(Off);
+    }
+  }
+}
+
 // NVPTXTargetLowering Constructor.
 NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
     : TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM),
@@ -90,7 +123,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
     setSchedulingPreference(Sched::Source);
 
   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
-  addRegisterClass(MVT::i8, &NVPTX::Int8RegsRegClass);
   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
   addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
@@ -106,10 +138,12 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   setOperationAction(ISD::BR_CC, MVT::i16, Expand);
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   setOperationAction(ISD::BR_CC, MVT::i64, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
+  // For others we will expand to a SHL/SRA pair.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
   if (nvptxSubtarget.hasROT64()) {
@@ -170,6 +204,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   // TRAP can be lowered to PTX trap
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
+  setOperationAction(ISD::ADDC, MVT::i64, Expand);
+  setOperationAction(ISD::ADDE, MVT::i64, Expand);
+
   // Register custom handling for vector loads/stores
   for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE;
        ++i) {
@@ -181,6 +218,25 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
     }
   }
 
+  // Custom handling for i8 intrinsics
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+  setOperationAction(ISD::CTLZ, MVT::i16, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ, MVT::i64, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Legal);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Legal);
+  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i16, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+  setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
   // Now deduce the information based on the above mentioned
   // actions
   computeRegisterProperties();
@@ -196,8 +252,6 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::RET_FLAG";
   case NVPTXISD::Wrapper:
     return "NVPTXISD::Wrapper";
-  case NVPTXISD::NVBuiltin:
-    return "NVPTXISD::NVBuiltin";
   case NVPTXISD::DeclareParam:
     return "NVPTXISD::DeclareParam";
   case NVPTXISD::DeclareScalarParam:
@@ -210,14 +264,20 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::PrintCall";
   case NVPTXISD::LoadParam:
     return "NVPTXISD::LoadParam";
+  case NVPTXISD::LoadParamV2:
+    return "NVPTXISD::LoadParamV2";
+  case NVPTXISD::LoadParamV4:
+    return "NVPTXISD::LoadParamV4";
   case NVPTXISD::StoreParam:
     return "NVPTXISD::StoreParam";
+  case NVPTXISD::StoreParamV2:
+    return "NVPTXISD::StoreParamV2";
+  case NVPTXISD::StoreParamV4:
+    return "NVPTXISD::StoreParamV4";
   case NVPTXISD::StoreParamS32:
     return "NVPTXISD::StoreParamS32";
   case NVPTXISD::StoreParamU32:
     return "NVPTXISD::StoreParamU32";
-  case NVPTXISD::MoveToParam:
-    return "NVPTXISD::MoveToParam";
   case NVPTXISD::CallArgBegin:
     return "NVPTXISD::CallArgBegin";
   case NVPTXISD::CallArg:
@@ -236,12 +296,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::Prototype";
   case NVPTXISD::MoveParam:
     return "NVPTXISD::MoveParam";
-  case NVPTXISD::MoveRetval:
-    return "NVPTXISD::MoveRetval";
-  case NVPTXISD::MoveToRetval:
-    return "NVPTXISD::MoveToRetval";
   case NVPTXISD::StoreRetval:
     return "NVPTXISD::StoreRetval";
+  case NVPTXISD::StoreRetvalV2:
+    return "NVPTXISD::StoreRetvalV2";
+  case NVPTXISD::StoreRetvalV4:
+    return "NVPTXISD::StoreRetvalV4";
   case NVPTXISD::PseudoUseParam:
     return "NVPTXISD::PseudoUseParam";
   case NVPTXISD::RETURN:
@@ -250,6 +310,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::CallSeqBegin";
   case NVPTXISD::CallSeqEnd:
     return "NVPTXISD::CallSeqEnd";
+  case NVPTXISD::CallPrototype:
+    return "NVPTXISD::CallPrototype";
   case NVPTXISD::LoadV2:
     return "NVPTXISD::LoadV2";
   case NVPTXISD::LoadV4:
@@ -275,89 +337,68 @@ bool NVPTXTargetLowering::shouldSplitVectorElementType(EVT VT) const {
 
 SDValue
 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   Op = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
   return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op);
 }
 
-std::string NVPTXTargetLowering::getPrototype(
-    Type *retTy, const ArgListTy &Args,
-    const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment) const {
+std::string
+NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  unsigned retAlignment,
+                                  const ImmutableCallSite *CS) const {
 
   bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return "";
 
   std::stringstream O;
   O << "prototype_" << uniqueCallSite << " : .callprototype ";
 
-  if (retTy->getTypeID() == Type::VoidTyID)
+  if (retTy->getTypeID() == Type::VoidTyID) {
     O << "()";
-  else {
+  } else {
     O << "(";
-    if (isABI) {
-      if (retTy->isPrimitiveType() || retTy->isIntegerTy()) {
-        unsigned size = 0;
-        if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
-          size = ITy->getBitWidth();
-          if (size < 32)
-            size = 32;
-        } else {
-          assert(retTy->isFloatingPointTy() &&
-                 "Floating point type expected here");
-          size = retTy->getPrimitiveSizeInBits();
-        }
-
-        O << ".param .b" << size << " _";
-      } else if (isa<PointerType>(retTy))
-        O << ".param .b" << getPointerTy().getSizeInBits() << " _";
-      else {
-        if ((retTy->getTypeID() == Type::StructTyID) ||
-            isa<VectorType>(retTy)) {
-          SmallVector<EVT, 16> vtparts;
-          ComputeValueVTs(*this, retTy, vtparts);
-          unsigned totalsz = 0;
-          for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
-            unsigned elems = 1;
-            EVT elemtype = vtparts[i];
-            if (vtparts[i].isVector()) {
-              elems = vtparts[i].getVectorNumElements();
-              elemtype = vtparts[i].getVectorElementType();
-            }
-            for (unsigned j = 0, je = elems; j != je; ++j) {
-              unsigned sz = elemtype.getSizeInBits();
-              if (elemtype.isInteger() && (sz < 8))
-                sz = 8;
-              totalsz += sz / 8;
-            }
-          }
-          O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]";
-        } else {
-          assert(false && "Unknown return type");
-        }
+    if (retTy->isPrimitiveType() || retTy->isIntegerTy()) {
+      unsigned size = 0;
+      if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) {
+        size = ITy->getBitWidth();
+        if (size < 32)
+          size = 32;
+      } else {
+        assert(retTy->isFloatingPointTy() &&
+               "Floating point type expected here");
+        size = retTy->getPrimitiveSizeInBits();
       }
-    } else {
-      SmallVector<EVT, 16> vtparts;
-      ComputeValueVTs(*this, retTy, vtparts);
-      unsigned idx = 0;
-      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
-        unsigned elems = 1;
-        EVT elemtype = vtparts[i];
-        if (vtparts[i].isVector()) {
-          elems = vtparts[i].getVectorNumElements();
-          elemtype = vtparts[i].getVectorElementType();
-        }
 
-        for (unsigned j = 0, je = elems; j != je; ++j) {
-          unsigned sz = elemtype.getSizeInBits();
-          if (elemtype.isInteger() && (sz < 32))
-            sz = 32;
-          O << ".reg .b" << sz << " _";
-          if (j < je - 1)
-            O << ", ";
-          ++idx;
+      O << ".param .b" << size << " _";
+    } else if (isa<PointerType>(retTy)) {
+      O << ".param .b" << getPointerTy().getSizeInBits() << " _";
+    } else {
+      if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) {
+        SmallVector<EVT, 16> vtparts;
+        ComputeValueVTs(*this, retTy, vtparts);
+        unsigned totalsz = 0;
+        for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
+          unsigned elems = 1;
+          EVT elemtype = vtparts[i];
+          if (vtparts[i].isVector()) {
+            elems = vtparts[i].getVectorNumElements();
+            elemtype = vtparts[i].getVectorElementType();
+          }
+          // TODO: no need to loop
+          for (unsigned j = 0, je = elems; j != je; ++j) {
+            unsigned sz = elemtype.getSizeInBits();
+            if (elemtype.isInteger() && (sz < 8))
+              sz = 8;
+            totalsz += sz / 8;
+          }
         }
-        if (i < e - 1)
-          O << ", ";
+        O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]";
+      } else {
+        assert(false && "Unknown return type");
       }
     }
     O << ") ";
@@ -367,14 +408,38 @@ std::string NVPTXTargetLowering::getPrototype(
   bool first = true;
   MVT thePointerTy = getPointerTy();
 
-  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
-    const Type *Ty = Args[i].Ty;
+  unsigned OIdx = 0;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    Type *Ty = Args[i].Ty;
     if (!first) {
       O << ", ";
     }
     first = false;
 
-    if (Outs[i].Flags.isByVal() == false) {
+    if (Outs[OIdx].Flags.isByVal() == false) {
+      if (Ty->isAggregateType() || Ty->isVectorTy()) {
+        unsigned align = 0;
+        const CallInst *CallI = cast<CallInst>(CS->getInstruction());
+        const DataLayout *TD = getDataLayout();
+        // +1 because index 0 is reserved for return type alignment
+        if (!llvm::getAlign(*CallI, i + 1, align))
+          align = TD->getABITypeAlignment(Ty);
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        O << ".param .align " << align << " .b8 ";
+        O << "_";
+        O << "[" << sz << "]";
+        // update the index for Outs
+        SmallVector<EVT, 16> vtparts;
+        ComputeValueVTs(*this, Ty, vtparts);
+        if (unsigned len = vtparts.size())
+          OIdx += len - 1;
+        continue;
+      }
+       // i8 types in IR will be i16 types in SDAG
+      assert((getValueType(Ty) == Outs[OIdx].VT ||
+             (getValueType(Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
+             "type mismatch between callee prototype and arguments");
+      // scalar type
       unsigned sz = 0;
       if (isa<IntegerType>(Ty)) {
         sz = cast<IntegerType>(Ty)->getBitWidth();
@@ -384,10 +449,7 @@ std::string NVPTXTargetLowering::getPrototype(
         sz = thePointerTy.getSizeInBits();
       else
         sz = Ty->getPrimitiveSizeInBits();
-      if (isABI)
-        O << ".param .b" << sz << " ";
-      else
-        O << ".reg .b" << sz << " ";
+      O << ".param .b" << sz << " ";
       O << "_";
       continue;
     }
@@ -395,50 +457,72 @@ std::string NVPTXTargetLowering::getPrototype(
     assert(PTy && "Param with byval attribute should be a pointer type");
     Type *ETy = PTy->getElementType();
 
-    if (isABI) {
-      unsigned align = Outs[i].Flags.getByValAlign();
-      unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
-      O << ".param .align " << align << " .b8 ";
-      O << "_";
-      O << "[" << sz << "]";
-      continue;
-    } else {
-      SmallVector<EVT, 16> vtparts;
-      ComputeValueVTs(*this, ETy, vtparts);
-      for (unsigned i = 0, e = vtparts.size(); i != e; ++i) {
-        unsigned elems = 1;
-        EVT elemtype = vtparts[i];
-        if (vtparts[i].isVector()) {
-          elems = vtparts[i].getVectorNumElements();
-          elemtype = vtparts[i].getVectorElementType();
-        }
+    unsigned align = Outs[OIdx].Flags.getByValAlign();
+    unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
+    O << ".param .align " << align << " .b8 ";
+    O << "_";
+    O << "[" << sz << "]";
+  }
+  O << ");";
+  return O.str();
+}
 
-        for (unsigned j = 0, je = elems; j != je; ++j) {
-          unsigned sz = elemtype.getSizeInBits();
-          if (elemtype.isInteger() && (sz < 32))
-            sz = 32;
-          O << ".reg .b" << sz << " ";
-          O << "_";
-          if (j < je - 1)
-            O << ", ";
-        }
-        if (i < e - 1)
-          O << ", ";
+unsigned
+NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+                                          const ImmutableCallSite *CS,
+                                          Type *Ty,
+                                          unsigned Idx) const {
+  const DataLayout *TD = getDataLayout();
+  unsigned Align = 0;
+  const Value *DirectCallee = CS->getCalledFunction();
+
+  if (!DirectCallee) {
+    // We don't have a direct function symbol, but that may be because of
+    // constant cast instructions in the call.
+    const Instruction *CalleeI = CS->getInstruction();
+    assert(CalleeI && "Call target is not a function or derived value?");
+
+    // With bitcast'd call targets, the instruction will be the call
+    if (isa<CallInst>(CalleeI)) {
+      // Check if we have call alignment metadata
+      if (llvm::getAlign(*cast<CallInst>(CalleeI), Idx, Align))
+        return Align;
+
+      const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+      // Ignore any bitcast instructions
+      while(isa<ConstantExpr>(CalleeV)) {
+        const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
+        if (!CE->isCast())
+          break;
+        // Look through the bitcast
+        CalleeV = cast<ConstantExpr>(CalleeV)->getOperand(0);
       }
-      continue;
+
+      // We have now looked past all of the bitcasts.  Do we finally have a
+      // Function?
+      if (isa<Function>(CalleeV))
+        DirectCallee = CalleeV;
     }
   }
-  O << ");";
-  return O.str();
+
+  // Check for function alignment information if we found that the
+  // ultimate target is a Function
+  if (DirectCallee)
+    if (llvm::getAlign(*cast<Function>(DirectCallee), Idx, Align))
+      return Align;
+
+  // Call is indirect or alignment information is not available, fall back to
+  // the ABI type alignment
+  return TD->getABITypeAlignment(Ty);
 }
 
 SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                        SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
-  DebugLoc &dl = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDLoc dl = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
   SDValue Chain = CLI.Chain;
   SDValue Callee = CLI.Callee;
   bool &isTailCall = CLI.IsTailCall;
@@ -447,53 +531,258 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   ImmutableCallSite *CS = CLI.CS;
 
   bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+  const DataLayout *TD = getDataLayout();
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
 
   SDValue tempChain = Chain;
   Chain =
-      DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true));
+      DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
+                           dl);
   SDValue InFlag = Chain.getValue(1);
 
-  assert((Outs.size() == Args.size()) &&
-         "Unexpected number of arguments to function call");
   unsigned paramCount = 0;
+  // Args.size() and Outs.size() need not match.
+  // Outs.size() will be larger
+  //   * if there is an aggregate argument with multiple fields (each field
+  //     showing up separately in Outs)
+  //   * if there is a vector argument with more than typical vector-length
+  //     elements (generally if more than 4) where each vector element is
+  //     individually present in Outs.
+  // So a different index should be used for indexing into Outs/OutVals.
+  // See similar issue in LowerFormalArguments.
+  unsigned OIdx = 0;
   // Declare the .params or .reg need to pass values
   // to the function
-  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-    EVT VT = Outs[i].VT;
+  for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
+    EVT VT = Outs[OIdx].VT;
+    Type *Ty = Args[i].Ty;
+
+    if (Outs[OIdx].Flags.isByVal() == false) {
+      if (Ty->isAggregateType()) {
+        // aggregate
+        SmallVector<EVT, 16> vtparts;
+        ComputeValueVTs(*this, Ty, vtparts);
+
+        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
+        // declare .param .align <align> .b8 .param<n>[<size>];
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
+                                      DAG.getConstant(paramCount, MVT::i32),
+                                      DAG.getConstant(sz, MVT::i32), InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                            DeclareParamOps, 5);
+        InFlag = Chain.getValue(1);
+        unsigned curOffset = 0;
+        for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
+          unsigned elems = 1;
+          EVT elemtype = vtparts[j];
+          if (vtparts[j].isVector()) {
+            elems = vtparts[j].getVectorNumElements();
+            elemtype = vtparts[j].getVectorElementType();
+          }
+          for (unsigned k = 0, ke = elems; k != ke; ++k) {
+            unsigned sz = elemtype.getSizeInBits();
+            if (elemtype.isInteger() && (sz < 8))
+              sz = 8;
+            SDValue StVal = OutVals[OIdx];
+            if (elemtype.getSizeInBits() < 16) {
+              StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+            }
+            SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+            SDValue CopyParamOps[] = { Chain,
+                                       DAG.getConstant(paramCount, MVT::i32),
+                                       DAG.getConstant(curOffset, MVT::i32),
+                                       StVal, InFlag };
+            Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+                                            CopyParamVTs, &CopyParamOps[0], 5,
+                                            elemtype, MachinePointerInfo());
+            InFlag = Chain.getValue(1);
+            curOffset += sz / 8;
+            ++OIdx;
+          }
+        }
+        if (vtparts.size() > 0)
+          --OIdx;
+        ++paramCount;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(Ty);
+        unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1);
+        // declare .param .align <align> .b8 .param<n>[<size>];
+        unsigned sz = TD->getTypeAllocSize(Ty);
+        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32),
+                                      DAG.getConstant(paramCount, MVT::i32),
+                                      DAG.getConstant(sz, MVT::i32), InFlag };
+        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                            DeclareParamOps, 5);
+        InFlag = Chain.getValue(1);
+        unsigned NumElts = ObjectVT.getVectorNumElements();
+        EVT EltVT = ObjectVT.getVectorElementType();
+        EVT MemVT = EltVT;
+        bool NeedExtend = false;
+        if (EltVT.getSizeInBits() < 16) {
+          NeedExtend = true;
+          EltVT = MVT::i16;
+        }
+
+        // V1 store
+        if (NumElts == 1) {
+          SDValue Elt = OutVals[OIdx++];
+          if (NeedExtend)
+            Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
+
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, MVT::i32),
+                                     DAG.getConstant(0, MVT::i32), Elt,
+                                     InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
+                                          CopyParamVTs, &CopyParamOps[0], 5,
+                                          MemVT, MachinePointerInfo());
+          InFlag = Chain.getValue(1);
+        } else if (NumElts == 2) {
+          SDValue Elt0 = OutVals[OIdx++];
+          SDValue Elt1 = OutVals[OIdx++];
+          if (NeedExtend) {
+            Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
+            Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
+          }
+
+          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+          SDValue CopyParamOps[] = { Chain,
+                                     DAG.getConstant(paramCount, MVT::i32),
+                                     DAG.getConstant(0, MVT::i32), Elt0, Elt1,
+                                     InFlag };
+          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
+                                          CopyParamVTs, &CopyParamOps[0], 6,
+                                          MemVT, MachinePointerInfo());
+          InFlag = Chain.getValue(1);
+        } else {
+          unsigned curOffset = 0;
+          // V4 stores
+          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+          // the
+          // vector will be expanded to a power of 2 elements, so we know we can
+          // always round up to the next multiple of 4 when creating the vector
+          // stores.
+          // e.g.  4 elem => 1 st.v4
+          //       6 elem => 2 st.v4
+          //       8 elem => 2 st.v4
+          //      11 elem => 3 st.v4
+          unsigned VecSize = 4;
+          if (EltVT.getSizeInBits() == 64)
+            VecSize = 2;
+
+          // This is potentially only part of a vector, so assume all elements
+          // are packed together.
+          unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
+
+          for (unsigned i = 0; i < NumElts; i += VecSize) {
+            // Get values
+            SDValue StoreVal;
+            SmallVector<SDValue, 8> Ops;
+            Ops.push_back(Chain);
+            Ops.push_back(DAG.getConstant(paramCount, MVT::i32));
+            Ops.push_back(DAG.getConstant(curOffset, MVT::i32));
+
+            unsigned Opc = NVPTXISD::StoreParamV2;
+
+            StoreVal = OutVals[OIdx++];
+            if (NeedExtend)
+              StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+            Ops.push_back(StoreVal);
+
+            if (i + 1 < NumElts) {
+              StoreVal = OutVals[OIdx++];
+              if (NeedExtend)
+                StoreVal =
+                    DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+            } else {
+              StoreVal = DAG.getUNDEF(EltVT);
+            }
+            Ops.push_back(StoreVal);
+
+            if (VecSize == 4) {
+              Opc = NVPTXISD::StoreParamV4;
+              if (i + 2 < NumElts) {
+                StoreVal = OutVals[OIdx++];
+                if (NeedExtend)
+                  StoreVal =
+                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+              } else {
+                StoreVal = DAG.getUNDEF(EltVT);
+              }
+              Ops.push_back(StoreVal);
+
+              if (i + 3 < NumElts) {
+                StoreVal = OutVals[OIdx++];
+                if (NeedExtend)
+                  StoreVal =
+                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+              } else {
+                StoreVal = DAG.getUNDEF(EltVT);
+              }
+              Ops.push_back(StoreVal);
+            }
 
-    if (Outs[i].Flags.isByVal() == false) {
+            Ops.push_back(InFlag);
+
+            SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, &Ops[0],
+                                            Ops.size(), MemVT,
+                                            MachinePointerInfo());
+            InFlag = Chain.getValue(1);
+            curOffset += PerStoreOffset;
+          }
+        }
+        ++paramCount;
+        --OIdx;
+        continue;
+      }
       // Plain scalar
       // for ABI,    declare .param .b<size> .param<n>;
-      // for nonABI, declare .reg .b<size> .param<n>;
-      unsigned isReg = 1;
-      if (isABI)
-        isReg = 0;
       unsigned sz = VT.getSizeInBits();
-      if (VT.isInteger() && (sz < 32))
-        sz = 32;
+      bool needExtend = false;
+      if (VT.isInteger()) {
+        if (sz < 16)
+          needExtend = true;
+        if (sz < 32)
+          sz = 32;
+      }
       SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
       SDValue DeclareParamOps[] = { Chain,
                                     DAG.getConstant(paramCount, MVT::i32),
                                     DAG.getConstant(sz, MVT::i32),
-                                    DAG.getConstant(isReg, MVT::i32), InFlag };
+                                    DAG.getConstant(0, MVT::i32), InFlag };
       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
                           DeclareParamOps, 5);
       InFlag = Chain.getValue(1);
+      SDValue OutV = OutVals[OIdx];
+      if (needExtend) {
+        // zext/sext i1 to i16
+        unsigned opc = ISD::ZERO_EXTEND;
+        if (Outs[OIdx].Flags.isSExt())
+          opc = ISD::SIGN_EXTEND;
+        OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
+      }
       SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
       SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
-                                 DAG.getConstant(0, MVT::i32), OutVals[i],
-                                 InFlag };
+                                 DAG.getConstant(0, MVT::i32), OutV, InFlag };
 
       unsigned opcode = NVPTXISD::StoreParam;
-      if (isReg)
-        opcode = NVPTXISD::MoveToParam;
-      else {
-        if (Outs[i].Flags.isZExt())
-          opcode = NVPTXISD::StoreParamU32;
-        else if (Outs[i].Flags.isSExt())
-          opcode = NVPTXISD::StoreParamS32;
-      }
-      Chain = DAG.getNode(opcode, dl, CopyParamVTs, CopyParamOps, 5);
+      if (Outs[OIdx].Flags.isZExt())
+        opcode = NVPTXISD::StoreParamU32;
+      else if (Outs[OIdx].Flags.isSExt())
+        opcode = NVPTXISD::StoreParamS32;
+      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, 5,
+                                      VT, MachinePointerInfo());
 
       InFlag = Chain.getValue(1);
       ++paramCount;
@@ -505,55 +794,20 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     assert(PTy && "Type of a byval parameter should be pointer");
     ComputeValueVTs(*this, PTy->getElementType(), vtparts);
 
-    if (isABI) {
-      // declare .param .align 16 .b8 .param<n>[<size>];
-      unsigned sz = Outs[i].Flags.getByValSize();
-      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-      // The ByValAlign in the Outs[i].Flags is alway set at this point, so we
-      // don't need to
-      // worry about natural alignment or not. See TargetLowering::LowerCallTo()
-      SDValue DeclareParamOps[] = {
-        Chain, DAG.getConstant(Outs[i].Flags.getByValAlign(), MVT::i32),
-        DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32),
-        InFlag
-      };
-      Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
-                          DeclareParamOps, 5);
-      InFlag = Chain.getValue(1);
-      unsigned curOffset = 0;
-      for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
-        unsigned elems = 1;
-        EVT elemtype = vtparts[j];
-        if (vtparts[j].isVector()) {
-          elems = vtparts[j].getVectorNumElements();
-          elemtype = vtparts[j].getVectorElementType();
-        }
-        for (unsigned k = 0, ke = elems; k != ke; ++k) {
-          unsigned sz = elemtype.getSizeInBits();
-          if (elemtype.isInteger() && (sz < 8))
-            sz = 8;
-          SDValue srcAddr =
-              DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i],
-                          DAG.getConstant(curOffset, getPointerTy()));
-          SDValue theVal =
-              DAG.getLoad(elemtype, dl, tempChain, srcAddr,
-                          MachinePointerInfo(), false, false, false, 0);
-          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-          SDValue CopyParamOps[] = { Chain,
-                                     DAG.getConstant(paramCount, MVT::i32),
-                                     DAG.getConstant(curOffset, MVT::i32),
-                                     theVal, InFlag };
-          Chain = DAG.getNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
-                              CopyParamOps, 5);
-          InFlag = Chain.getValue(1);
-          curOffset += sz / 8;
-        }
-      }
-      ++paramCount;
-      continue;
-    }
-    // Non-abi, struct or vector
-    // Declare a bunch or .reg .b<size> .param<n>
+    // declare .param .align <align> .b8 .param<n>[<size>];
+    unsigned sz = Outs[OIdx].Flags.getByValSize();
+    SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    // The ByValAlign in the Outs[OIdx].Flags is alway set at this point,
+    // so we don't need to worry about natural alignment or not.
+    // See TargetLowering::LowerCallTo().
+    SDValue DeclareParamOps[] = {
+      Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), MVT::i32),
+      DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32),
+      InFlag
+    };
+    Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
+                        DeclareParamOps, 5);
+    InFlag = Chain.getValue(1);
     unsigned curOffset = 0;
     for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
       unsigned elems = 1;
@@ -564,107 +818,66 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
       for (unsigned k = 0, ke = elems; k != ke; ++k) {
         unsigned sz = elemtype.getSizeInBits();
-        if (elemtype.isInteger() && (sz < 32))
-          sz = 32;
-        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-        SDValue DeclareParamOps[] = { Chain,
-                                      DAG.getConstant(paramCount, MVT::i32),
-                                      DAG.getConstant(sz, MVT::i32),
-                                      DAG.getConstant(1, MVT::i32), InFlag };
-        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
-                            DeclareParamOps, 5);
-        InFlag = Chain.getValue(1);
+        if (elemtype.isInteger() && (sz < 8))
+          sz = 8;
         SDValue srcAddr =
-            DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i],
+            DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx],
                         DAG.getConstant(curOffset, getPointerTy()));
-        SDValue theVal =
-            DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(),
-                        false, false, false, 0);
+        SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
+                                     MachinePointerInfo(), false, false, false,
+                                     0);
+        if (elemtype.getSizeInBits() < 16) {
+          theVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, theVal);
+        }
         SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
         SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32),
-                                   DAG.getConstant(0, MVT::i32), theVal,
+                                   DAG.getConstant(curOffset, MVT::i32), theVal,
                                    InFlag };
-        Chain = DAG.getNode(NVPTXISD::MoveToParam, dl, CopyParamVTs,
-                            CopyParamOps, 5);
+        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
+                                        CopyParamOps, 5, elemtype,
+                                        MachinePointerInfo());
+
         InFlag = Chain.getValue(1);
-        ++paramCount;
+        curOffset += sz / 8;
       }
     }
+    ++paramCount;
   }
 
   GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
   unsigned retAlignment = 0;
 
   // Handle Result
-  unsigned retCount = 0;
   if (Ins.size() > 0) {
     SmallVector<EVT, 16> resvtparts;
     ComputeValueVTs(*this, retTy, resvtparts);
 
-    // Declare one .param .align 16 .b8 func_retval0[<size>] for ABI or
-    // individual .reg .b<size> func_retval<0..> for non ABI
-    unsigned resultsz = 0;
-    for (unsigned i = 0, e = resvtparts.size(); i != e; ++i) {
-      unsigned elems = 1;
-      EVT elemtype = resvtparts[i];
-      if (resvtparts[i].isVector()) {
-        elems = resvtparts[i].getVectorNumElements();
-        elemtype = resvtparts[i].getVectorElementType();
-      }
-      for (unsigned j = 0, je = elems; j != je; ++j) {
-        unsigned sz = elemtype.getSizeInBits();
-        if (isABI == false) {
-          if (elemtype.isInteger() && (sz < 32))
-            sz = 32;
-        } else {
-          if (elemtype.isInteger() && (sz < 8))
-            sz = 8;
-        }
-        if (isABI == false) {
-          SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-          SDValue DeclareRetOps[] = { Chain, DAG.getConstant(2, MVT::i32),
-                                      DAG.getConstant(sz, MVT::i32),
-                                      DAG.getConstant(retCount, MVT::i32),
-                                      InFlag };
-          Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
-                              DeclareRetOps, 5);
-          InFlag = Chain.getValue(1);
-          ++retCount;
-        }
-        resultsz += sz;
-      }
-    }
-    if (isABI) {
-      if (retTy->isPrimitiveType() || retTy->isIntegerTy() ||
-          retTy->isPointerTy()) {
-        // Scalar needs to be at least 32bit wide
-        if (resultsz < 32)
-          resultsz = 32;
-        SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-        SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
-                                    DAG.getConstant(resultsz, MVT::i32),
-                                    DAG.getConstant(0, MVT::i32), InFlag };
-        Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
-                            DeclareRetOps, 5);
-        InFlag = Chain.getValue(1);
-      } else {
-        if (Func) { // direct call
-          if (!llvm::getAlign(*(CS->getCalledFunction()), 0, retAlignment))
-            retAlignment = getDataLayout()->getABITypeAlignment(retTy);
-        } else { // indirect call
-          const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction());
-          if (!llvm::getAlign(*CallI, 0, retAlignment))
-            retAlignment = getDataLayout()->getABITypeAlignment(retTy);
-        }
-        SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-        SDValue DeclareRetOps[] = { Chain,
-                                    DAG.getConstant(retAlignment, MVT::i32),
-                                    DAG.getConstant(resultsz / 8, MVT::i32),
-                                    DAG.getConstant(0, MVT::i32), InFlag };
-        Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
-                            DeclareRetOps, 5);
-        InFlag = Chain.getValue(1);
-      }
+    // Declare
+    //  .param .align 16 .b8 retval0[<size-in-bytes>], or
+    //  .param .b<size-in-bits> retval0
+    unsigned resultsz = TD->getTypeAllocSizeInBits(retTy);
+    if (retTy->isPrimitiveType() || retTy->isIntegerTy() ||
+        retTy->isPointerTy()) {
+      // Scalar needs to be at least 32bit wide
+      if (resultsz < 32)
+        resultsz = 32;
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
+                                  DAG.getConstant(resultsz, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
+                          DeclareRetOps, 5);
+      InFlag = Chain.getValue(1);
+    } else {
+      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
+      SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      SDValue DeclareRetOps[] = { Chain,
+                                  DAG.getConstant(retAlignment, MVT::i32),
+                                  DAG.getConstant(resultsz / 8, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32), InFlag };
+      Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
+                          DeclareRetOps, 5);
+      InFlag = Chain.getValue(1);
     }
   }
 
@@ -674,25 +887,22 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
     // to be emitted, and the label has to used as the last arg of call
     // instruction.
-    // The prototype is embedded in a string and put as the operand for an
-    // INLINEASM SDNode.
-    SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    std::string proto_string = getPrototype(retTy, Args, Outs, retAlignment);
-    const char *asmstr = nvTM->getManagedStrPool()
-        ->getManagedString(proto_string.c_str())->c_str();
-    SDValue InlineAsmOps[] = {
-      Chain, DAG.getTargetExternalSymbol(asmstr, getPointerTy()),
-      DAG.getMDNode(0), DAG.getTargetConstant(0, MVT::i32), InFlag
+    // The prototype is embedded in a string and put as the operand for a
+    // CallPrototype SDNode which will print out to the value of the string.
+    SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+    std::string Proto = getPrototype(retTy, Args, Outs, retAlignment, CS);
+    const char *ProtoStr =
+      nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
+    SDValue ProtoOps[] = {
+      Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
     };
-    Chain = DAG.getNode(ISD::INLINEASM, dl, InlineAsmVTs, InlineAsmOps, 5);
+    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, &ProtoOps[0], 3);
     InFlag = Chain.getValue(1);
   }
   // Op to just print "call"
   SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue PrintCallOps[] = {
-    Chain,
-    DAG.getConstant(isABI ? ((Ins.size() == 0) ? 0 : 1) : retCount, MVT::i32),
-    InFlag
+    Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag
   };
   Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
                       dl, PrintCallVTs, PrintCallOps, 3);
@@ -740,62 +950,183 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Generate loads from param memory/moves from registers for result
   if (Ins.size() > 0) {
-    if (isABI) {
-      unsigned resoffset = 0;
-      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-        unsigned sz = Ins[i].VT.getSizeInBits();
-        if (Ins[i].VT.isInteger() && (sz < 8))
-          sz = 8;
-        EVT LoadRetVTs[] = { Ins[i].VT, MVT::Other, MVT::Glue };
-        SDValue LoadRetOps[] = { Chain, DAG.getConstant(1, MVT::i32),
-                                 DAG.getConstant(resoffset, MVT::i32), InFlag };
-        SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, LoadRetVTs,
-                                     LoadRetOps, array_lengthof(LoadRetOps));
+    unsigned resoffset = 0;
+    if (retTy && retTy->isVectorTy()) {
+      EVT ObjectVT = getValueType(retTy);
+      unsigned NumElts = ObjectVT.getVectorNumElements();
+      EVT EltVT = ObjectVT.getVectorElementType();
+      assert(nvTM->getTargetLowering()->getNumRegisters(F->getContext(),
+                                                        ObjectVT) == NumElts &&
+             "Vector was not scalarized");
+      unsigned sz = EltVT.getSizeInBits();
+      bool needTruncate = sz < 16 ? true : false;
+
+      if (NumElts == 1) {
+        // Just a simple load
+        std::vector<EVT> LoadRetVTs;
+        if (needTruncate) {
+          // If loading i1 result, generate
+          //   load i16
+          //   trunc i16 to i1
+          LoadRetVTs.push_back(MVT::i16);
+        } else
+          LoadRetVTs.push_back(EltVT);
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+        std::vector<SDValue> LoadRetOps;
+        LoadRetOps.push_back(Chain);
+        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+        LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
+        LoadRetOps.push_back(InFlag);
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParam, dl,
+            DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0],
+            LoadRetOps.size(), EltVT, MachinePointerInfo());
         Chain = retval.getValue(1);
         InFlag = retval.getValue(2);
-        InVals.push_back(retval);
-        resoffset += sz / 8;
+        SDValue Ret0 = retval;
+        if (needTruncate)
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
+        InVals.push_back(Ret0);
+      } else if (NumElts == 2) {
+        // LoadV2
+        std::vector<EVT> LoadRetVTs;
+        if (needTruncate) {
+          // If loading i1 result, generate
+          //   load i16
+          //   trunc i16 to i1
+          LoadRetVTs.push_back(MVT::i16);
+          LoadRetVTs.push_back(MVT::i16);
+        } else {
+          LoadRetVTs.push_back(EltVT);
+          LoadRetVTs.push_back(EltVT);
+        }
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+        std::vector<SDValue> LoadRetOps;
+        LoadRetOps.push_back(Chain);
+        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+        LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
+        LoadRetOps.push_back(InFlag);
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParamV2, dl,
+            DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0],
+            LoadRetOps.size(), EltVT, MachinePointerInfo());
+        Chain = retval.getValue(2);
+        InFlag = retval.getValue(3);
+        SDValue Ret0 = retval.getValue(0);
+        SDValue Ret1 = retval.getValue(1);
+        if (needTruncate) {
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
+          InVals.push_back(Ret0);
+          Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
+          InVals.push_back(Ret1);
+        } else {
+          InVals.push_back(Ret0);
+          InVals.push_back(Ret1);
+        }
+      } else {
+        // Split into N LoadV4
+        unsigned Ofst = 0;
+        unsigned VecSize = 4;
+        unsigned Opc = NVPTXISD::LoadParamV4;
+        if (EltVT.getSizeInBits() == 64) {
+          VecSize = 2;
+          Opc = NVPTXISD::LoadParamV2;
+        }
+        EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+        for (unsigned i = 0; i < NumElts; i += VecSize) {
+          SmallVector<EVT, 8> LoadRetVTs;
+          if (needTruncate) {
+            // If loading i1 result, generate
+            //   load i16
+            //   trunc i16 to i1
+            for (unsigned j = 0; j < VecSize; ++j)
+              LoadRetVTs.push_back(MVT::i16);
+          } else {
+            for (unsigned j = 0; j < VecSize; ++j)
+              LoadRetVTs.push_back(EltVT);
+          }
+          LoadRetVTs.push_back(MVT::Other);
+          LoadRetVTs.push_back(MVT::Glue);
+          SmallVector<SDValue, 4> LoadRetOps;
+          LoadRetOps.push_back(Chain);
+          LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+          LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32));
+          LoadRetOps.push_back(InFlag);
+          SDValue retval = DAG.getMemIntrinsicNode(
+              Opc, dl, DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()),
+              &LoadRetOps[0], LoadRetOps.size(), EltVT, MachinePointerInfo());
+          if (VecSize == 2) {
+            Chain = retval.getValue(2);
+            InFlag = retval.getValue(3);
+          } else {
+            Chain = retval.getValue(4);
+            InFlag = retval.getValue(5);
+          }
+
+          for (unsigned j = 0; j < VecSize; ++j) {
+            if (i + j >= NumElts)
+              break;
+            SDValue Elt = retval.getValue(j);
+            if (needTruncate)
+              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+            InVals.push_back(Elt);
+          }
+          Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+        }
       }
     } else {
-      SmallVector<EVT, 16> resvtparts;
-      ComputeValueVTs(*this, retTy, resvtparts);
-
-      assert(Ins.size() == resvtparts.size() &&
-             "Unexpected number of return values in non-ABI case");
-      unsigned paramNum = 0;
+      SmallVector<EVT, 16> VTs;
+      ComputePTXValueVTs(*this, retTy, VTs);
+      assert(VTs.size() == Ins.size() && "Bad value decomposition");
       for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-        assert(EVT(Ins[i].VT) == resvtparts[i] &&
-               "Unexpected EVT type in non-ABI case");
-        unsigned numelems = 1;
-        EVT elemtype = Ins[i].VT;
-        if (Ins[i].VT.isVector()) {
-          numelems = Ins[i].VT.getVectorNumElements();
-          elemtype = Ins[i].VT.getVectorElementType();
-        }
-        std::vector<SDValue> tempRetVals;
-        for (unsigned j = 0; j < numelems; ++j) {
-          EVT MoveRetVTs[] = { elemtype, MVT::Other, MVT::Glue };
-          SDValue MoveRetOps[] = { Chain, DAG.getConstant(0, MVT::i32),
-                                   DAG.getConstant(paramNum, MVT::i32),
-                                   InFlag };
-          SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, MoveRetVTs,
-                                       MoveRetOps, array_lengthof(MoveRetOps));
-          Chain = retval.getValue(1);
-          InFlag = retval.getValue(2);
-          tempRetVals.push_back(retval);
-          ++paramNum;
-        }
-        if (Ins[i].VT.isVector())
-          InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, Ins[i].VT,
-                                       &tempRetVals[0], tempRetVals.size()));
-        else
-          InVals.push_back(tempRetVals[0]);
+        unsigned sz = VTs[i].getSizeInBits();
+        bool needTruncate = sz < 8 ? true : false;
+        if (VTs[i].isInteger() && (sz < 8))
+          sz = 8;
+
+        SmallVector<EVT, 4> LoadRetVTs;
+        EVT TheLoadType = VTs[i];
+        if (retTy->isIntegerTy() &&
+            TD->getTypeAllocSizeInBits(retTy) < 32) {
+          // This is for integer types only, and specifically not for
+          // aggregates.
+          LoadRetVTs.push_back(MVT::i32);
+          TheLoadType = MVT::i32;
+        } else if (sz < 16) {
+          // If loading i1/i8 result, generate
+          //   load i8 (-> i16)
+          //   trunc i16 to i1/i8
+          LoadRetVTs.push_back(MVT::i16);
+        } else
+          LoadRetVTs.push_back(Ins[i].VT);
+        LoadRetVTs.push_back(MVT::Other);
+        LoadRetVTs.push_back(MVT::Glue);
+
+        SmallVector<SDValue, 4> LoadRetOps;
+        LoadRetOps.push_back(Chain);
+        LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
+        LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32));
+        LoadRetOps.push_back(InFlag);
+        SDValue retval = DAG.getMemIntrinsicNode(
+            NVPTXISD::LoadParam, dl,
+            DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0],
+            LoadRetOps.size(), TheLoadType, MachinePointerInfo());
+        Chain = retval.getValue(1);
+        InFlag = retval.getValue(2);
+        SDValue Ret0 = retval.getValue(0);
+        if (needTruncate)
+          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
+        InVals.push_back(Ret0);
+        resoffset += sz / 8;
       }
     }
   }
+
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true),
                              DAG.getIntPtrConstant(uniqueCallSite + 1, true),
-                             InFlag);
+                             InFlag, dl);
   uniqueCallSite++;
 
   // set isTailCall to false for now, until we figure out how to express
@@ -810,7 +1141,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 SDValue
 NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   SDNode *Node = Op.getNode();
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   SmallVector<SDValue, 8> Ops;
   unsigned NumOperands = Node->getNumOperands();
   for (unsigned i = 0; i < NumOperands; ++i) {
@@ -861,17 +1192,17 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
 // v = ld i1* addr
 //   =>
-// v1 = ld i8* addr
-// v = trunc v1 to i1
+// v1 = ld i8* addr (-> i16)
+// v = trunc i16 to i1
 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
   SDNode *Node = Op.getNode();
   LoadSDNode *LD = cast<LoadSDNode>(Node);
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
   assert(Node->getValueType(0) == MVT::i1 &&
          "Custom lowering for i1 load only");
   SDValue newLD =
-      DAG.getLoad(MVT::i8, dl, LD->getChain(), LD->getBasePtr(),
+      DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(),
                   LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(),
                   LD->isInvariant(), LD->getAlignment());
   SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
@@ -896,7 +1227,7 @@ SDValue
 NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
   SDNode *N = Op.getNode();
   SDValue Val = N->getOperand(1);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT ValVT = Val.getValueType();
 
   if (ValVT.isVector()) {
@@ -955,8 +1286,6 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
       SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
                                    DAG.getIntPtrConstant(i));
       if (NeedExt)
-        // ANY_EXTEND is correct here since the store will only look at the
-        // lower-order bits anyway.
         ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
       Ops.push_back(ExtVal);
     }
@@ -981,11 +1310,11 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
 
 // st i1 v, addr
 //    =>
-// v1 = zxt v to i8
-// st i8, addr
+// v1 = zxt v to i16
+// st.u8 i16, addr
 SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
   SDNode *Node = Op.getNode();
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   StoreSDNode *ST = cast<StoreSDNode>(Node);
   SDValue Tmp1 = ST->getChain();
   SDValue Tmp2 = ST->getBasePtr();
@@ -994,9 +1323,10 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
   unsigned Alignment = ST->getAlignment();
   bool isVolatile = ST->isVolatile();
   bool isNonTemporal = ST->isNonTemporal();
-  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Tmp3);
-  SDValue Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(),
-                                isVolatile, isNonTemporal, Alignment);
+  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
+  SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,
+                                     ST->getPointerInfo(), MVT::i8, isNonTemporal,
+                                     isVolatile, Alignment);
   return Result;
 }
 
@@ -1011,7 +1341,15 @@ SDValue NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname,
 
 SDValue
 NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx, EVT v) const {
-  return getExtSymb(DAG, ".PARAM", idx, v);
+  std::string ParamSym;
+  raw_string_ostream ParamStr(ParamSym);
+
+  ParamStr << DAG.getMachineFunction().getName() << "_param_" << idx;
+  ParamStr.flush();
+
+  std::string *SavedStr =
+    nvTM->getManagedStrPool()->getManagedString(ParamSym.c_str());
+  return DAG.getTargetExternalSymbol(SavedStr->c_str(), v);
 }
 
 SDValue NVPTXTargetLowering::getParamHelpSymbol(SelectionDAG &DAG, int idx) {
@@ -1046,19 +1384,23 @@ bool llvm::isImageOrSamplerVal(const Value *arg, const Module *context) {
 
 SDValue NVPTXTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
     SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
   const DataLayout *TD = getDataLayout();
 
   const Function *F = MF.getFunction();
   const AttributeSet &PAL = F->getAttributes();
+  const TargetLowering *TLI = nvTM->getTargetLowering();
 
   SDValue Root = DAG.getRoot();
   std::vector<SDValue> OutChains;
 
   bool isKernel = llvm::isKernelFunction(*F);
   bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
 
   std::vector<Type *> argTypes;
   std::vector<const Argument *> theArgs;
@@ -1067,15 +1409,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
     theArgs.push_back(I);
     argTypes.push_back(I->getType());
   }
-  //assert(argTypes.size() == Ins.size() &&
-  //       "Ins types and function types did not match");
+  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
+  // Ins.size() will be larger
+  //   * if there is an aggregate argument with multiple fields (each field
+  //     showing up separately in Ins)
+  //   * if there is a vector argument with more than typical vector-length
+  //     elements (generally if more than 4) where each vector element is
+  //     individually present in Ins.
+  // So a different index should be used for indexing into Ins.
+  // See similar issue in LowerCall.
+  unsigned InsIdx = 0;
 
   int idx = 0;
-  for (unsigned i = 0, e = argTypes.size(); i != e; ++i, ++idx) {
+  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++idx, ++InsIdx) {
     Type *Ty = argTypes[i];
-    EVT ObjectVT = getValueType(Ty);
-    //assert(ObjectVT == Ins[i].VT &&
-    //       "Ins type did not match function type");
 
     // If the kernel argument is image*_t or sampler_t, convert it to
     // a i32 constant holding the parameter position. This can later
@@ -1091,142 +1438,248 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
 
     if (theArgs[i]->use_empty()) {
       // argument is dead
-      if (ObjectVT.isVector()) {
-        EVT EltVT = ObjectVT.getVectorElementType();
-        unsigned NumElts = ObjectVT.getVectorNumElements();
-        for (unsigned vi = 0; vi < NumElts; ++vi) {
-          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, EltVT));
+      if (Ty->isAggregateType()) {
+        SmallVector<EVT, 16> vtparts;
+
+        ComputePTXValueVTs(*this, Ty, vtparts);
+        assert(vtparts.size() > 0 && "empty aggregate type not expected");
+        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+             ++parti) {
+          EVT partVT = vtparts[parti];
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, partVT));
+          ++InsIdx;
         }
-      } else {
-        InVals.push_back(DAG.getNode(ISD::UNDEF, dl, ObjectVT));
+        if (vtparts.size() > 0)
+          --InsIdx;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(Ty);
+        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
+        for (unsigned parti = 0; parti < NumRegs; ++parti) {
+          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
+          ++InsIdx;
+        }
+        if (NumRegs > 0)
+          --InsIdx;
+        continue;
       }
+      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
       continue;
     }
 
     // In the following cases, assign a node order of "idx+1"
-    // to newly created nodes. The SDNOdes for params have to
+    // to newly created nodes. The SDNodes for params have to
     // appear in the same order as their order of appearance
     // in the original function. "idx+1" holds that order.
     if (PAL.hasAttribute(i + 1, Attribute::ByVal) == false) {
-      if (ObjectVT.isVector()) {
+      if (Ty->isAggregateType()) {
+        SmallVector<EVT, 16> vtparts;
+        SmallVector<uint64_t, 16> offsets;
+
+        // NOTE: Here, we lose the ability to issue vector loads for vectors
+        // that are a part of a struct.  This should be investigated in the
+        // future.
+        ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0);
+        assert(vtparts.size() > 0 && "empty aggregate type not expected");
+        bool aggregateIsPacked = false;
+        if (StructType *STy = llvm::dyn_cast<StructType>(Ty))
+          aggregateIsPacked = STy->isPacked();
+
+        SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
+             ++parti) {
+          EVT partVT = vtparts[parti];
+          Value *srcValue = Constant::getNullValue(
+              PointerType::get(partVT.getTypeForEVT(F->getContext()),
+                               llvm::ADDRESS_SPACE_PARAM));
+          SDValue srcAddr =
+              DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                          DAG.getConstant(offsets[parti], getPointerTy()));
+          unsigned partAlign =
+              aggregateIsPacked ? 1
+                                : TD->getABITypeAlignment(
+                                      partVT.getTypeForEVT(F->getContext()));
+          SDValue p;
+          if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
+            ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
+                                     ISD::SEXTLOAD : ISD::ZEXTLOAD;
+            p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
+                               MachinePointerInfo(srcValue), partVT, false,
+                               false, partAlign);
+          } else {
+            p = DAG.getLoad(partVT, dl, Root, srcAddr,
+                            MachinePointerInfo(srcValue), false, false, false,
+                            partAlign);
+          }
+          if (p.getNode())
+            p.getNode()->setIROrder(idx + 1);
+          InVals.push_back(p);
+          ++InsIdx;
+        }
+        if (vtparts.size() > 0)
+          --InsIdx;
+        continue;
+      }
+      if (Ty->isVectorTy()) {
+        EVT ObjectVT = getValueType(Ty);
+        SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
         unsigned NumElts = ObjectVT.getVectorNumElements();
+        assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
+               "Vector was not scalarized");
+        unsigned Ofst = 0;
         EVT EltVT = ObjectVT.getVectorElementType();
-        unsigned Offset = 0;
-        for (unsigned vi = 0; vi < NumElts; ++vi) {
-          SDValue A = getParamSymbol(DAG, idx, getPointerTy());
-          SDValue B = DAG.getIntPtrConstant(Offset);
-          SDValue Addr = DAG.getNode(ISD::ADD, dl, getPointerTy(),
-                                     //getParamSymbol(DAG, idx, EltVT),
-                                     //DAG.getConstant(Offset, getPointerTy()));
-                                     A, B);
+
+        // V1 load
+        // f32 = load ...
+        if (NumElts == 1) {
+          // We only have one element, so just directly load it
           Value *SrcValue = Constant::getNullValue(PointerType::get(
               EltVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
-          SDValue Ld = DAG.getLoad(
-              EltVT, dl, Root, Addr, MachinePointerInfo(SrcValue), false, false,
-              false,
+          SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                                        DAG.getConstant(Ofst, getPointerTy()));
+          SDValue P = DAG.getLoad(
+              EltVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+              false, true,
               TD->getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())));
-          Offset += EltVT.getStoreSizeInBits() / 8;
-          InVals.push_back(Ld);
+          if (P.getNode())
+            P.getNode()->setIROrder(idx + 1);
+
+          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+            P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
+          InVals.push_back(P);
+          Ofst += TD->getTypeAllocSize(EltVT.getTypeForEVT(F->getContext()));
+          ++InsIdx;
+        } else if (NumElts == 2) {
+          // V2 load
+          // f32,f32 = load ...
+          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
+          Value *SrcValue = Constant::getNullValue(PointerType::get(
+              VecVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+          SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                                        DAG.getConstant(Ofst, getPointerTy()));
+          SDValue P = DAG.getLoad(
+              VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+              false, true,
+              TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+          if (P.getNode())
+            P.getNode()->setIROrder(idx + 1);
+
+          SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                     DAG.getIntPtrConstant(0));
+          SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                     DAG.getIntPtrConstant(1));
+
+          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
+            Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
+            Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
+          }
+
+          InVals.push_back(Elt0);
+          InVals.push_back(Elt1);
+          Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+          InsIdx += 2;
+        } else {
+          // V4 loads
+          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
+          // the
+          // vector will be expanded to a power of 2 elements, so we know we can
+          // always round up to the next multiple of 4 when creating the vector
+          // loads.
+          // e.g.  4 elem => 1 ld.v4
+          //       6 elem => 2 ld.v4
+          //       8 elem => 2 ld.v4
+          //      11 elem => 3 ld.v4
+          unsigned VecSize = 4;
+          if (EltVT.getSizeInBits() == 64) {
+            VecSize = 2;
+          }
+          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
+          for (unsigned i = 0; i < NumElts; i += VecSize) {
+            Value *SrcValue = Constant::getNullValue(
+                PointerType::get(VecVT.getTypeForEVT(F->getContext()),
+                                 llvm::ADDRESS_SPACE_PARAM));
+            SDValue SrcAddr =
+                DAG.getNode(ISD::ADD, dl, getPointerTy(), Arg,
+                            DAG.getConstant(Ofst, getPointerTy()));
+            SDValue P = DAG.getLoad(
+                VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue), false,
+                false, true,
+                TD->getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())));
+            if (P.getNode())
+              P.getNode()->setIROrder(idx + 1);
+
+            for (unsigned j = 0; j < VecSize; ++j) {
+              if (i + j >= NumElts)
+                break;
+              SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
+                                        DAG.getIntPtrConstant(j));
+              if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
+                Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
+              InVals.push_back(Elt);
+            }
+            Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+          }
+          InsIdx += NumElts;
         }
+
+        if (NumElts > 0)
+          --InsIdx;
         continue;
       }
-
       // A plain scalar.
-      if (isABI || isKernel) {
-        // If ABI, load from the param symbol
-        SDValue Arg = getParamSymbol(DAG, idx);
-        // Conjure up a value that we can get the address space from.
-        // FIXME: Using a constant here is a hack.
-        Value *srcValue = Constant::getNullValue(
-            PointerType::get(ObjectVT.getTypeForEVT(F->getContext()),
-                             llvm::ADDRESS_SPACE_PARAM));
-        SDValue p = DAG.getLoad(
-            ObjectVT, dl, Root, Arg, MachinePointerInfo(srcValue), false, false,
-            false,
-            TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
-        if (p.getNode())
-          DAG.AssignOrdering(p.getNode(), idx + 1);
-        InVals.push_back(p);
+      EVT ObjectVT = getValueType(Ty);
+      // If ABI, load from the param symbol
+      SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+      Value *srcValue = Constant::getNullValue(PointerType::get(
+          ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM));
+      SDValue p;
+       if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
+        ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
+                                       ISD::SEXTLOAD : ISD::ZEXTLOAD;
+        p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, Arg,
+                           MachinePointerInfo(srcValue), ObjectVT, false, false,
+        TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
       } else {
-        // If no ABI, just move the param symbol
-        SDValue Arg = getParamSymbol(DAG, idx, ObjectVT);
-        SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
-        if (p.getNode())
-          DAG.AssignOrdering(p.getNode(), idx + 1);
-        InVals.push_back(p);
+        p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg,
+                        MachinePointerInfo(srcValue), false, false, false,
+        TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
       }
+      if (p.getNode())
+        p.getNode()->setIROrder(idx + 1);
+      InVals.push_back(p);
       continue;
     }
 
     // Param has ByVal attribute
-    if (isABI || isKernel) {
-      // Return MoveParam(param symbol).
-      // Ideally, the param symbol can be returned directly,
-      // but when SDNode builder decides to use it in a CopyToReg(),
-      // machine instruction fails because TargetExternalSymbol
-      // (not lowered) is target dependent, and CopyToReg assumes
-      // the source is lowered.
-      SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
-      SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
-      if (p.getNode())
-        DAG.AssignOrdering(p.getNode(), idx + 1);
-      if (isKernel)
-        InVals.push_back(p);
-      else {
-        SDValue p2 = DAG.getNode(
-            ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
-            DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p);
-        InVals.push_back(p2);
-      }
-    } else {
-      // Have to move a set of param symbols to registers and
-      // store them locally and return the local pointer in InVals
-      const PointerType *elemPtrType = dyn_cast<PointerType>(argTypes[i]);
-      assert(elemPtrType && "Byval parameter should be a pointer type");
-      Type *elemType = elemPtrType->getElementType();
-      // Compute the constituent parts
-      SmallVector<EVT, 16> vtparts;
-      SmallVector<uint64_t, 16> offsets;
-      ComputeValueVTs(*this, elemType, vtparts, &offsets, 0);
-      unsigned totalsize = 0;
-      for (unsigned j = 0, je = vtparts.size(); j != je; ++j)
-        totalsize += vtparts[j].getStoreSizeInBits();
-      SDValue localcopy = DAG.getFrameIndex(
-          MF.getFrameInfo()->CreateStackObject(totalsize / 8, 16, false),
-          getPointerTy());
-      unsigned sizesofar = 0;
-      std::vector<SDValue> theChains;
-      for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
-        unsigned numElems = 1;
-        if (vtparts[j].isVector())
-          numElems = vtparts[j].getVectorNumElements();
-        for (unsigned k = 0, ke = numElems; k != ke; ++k) {
-          EVT tmpvt = vtparts[j];
-          if (tmpvt.isVector())
-            tmpvt = tmpvt.getVectorElementType();
-          SDValue arg = DAG.getNode(NVPTXISD::MoveParam, dl, tmpvt,
-                                    getParamSymbol(DAG, idx, tmpvt));
-          SDValue addr =
-              DAG.getNode(ISD::ADD, dl, getPointerTy(), localcopy,
-                          DAG.getConstant(sizesofar, getPointerTy()));
-          theChains.push_back(DAG.getStore(
-              Chain, dl, arg, addr, MachinePointerInfo(), false, false, 0));
-          sizesofar += tmpvt.getStoreSizeInBits() / 8;
-          ++idx;
-        }
-      }
-      --idx;
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &theChains[0],
-                          theChains.size());
-      InVals.push_back(localcopy);
+    // Return MoveParam(param symbol).
+    // Ideally, the param symbol can be returned directly,
+    // but when SDNode builder decides to use it in a CopyToReg(),
+    // machine instruction fails because TargetExternalSymbol
+    // (not lowered) is target dependent, and CopyToReg assumes
+    // the source is lowered.
+    EVT ObjectVT = getValueType(Ty);
+    assert(ObjectVT == Ins[InsIdx].VT &&
+           "Ins type did not match function type");
+    SDValue Arg = getParamSymbol(DAG, idx, getPointerTy());
+    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
+    if (p.getNode())
+      p.getNode()->setIROrder(idx + 1);
+    if (isKernel)
+      InVals.push_back(p);
+    else {
+      SDValue p2 = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
+          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, MVT::i32), p);
+      InVals.push_back(p2);
     }
   }
 
   // Clang will check explicit VarArg and issue error if any. However, Clang
   // will let code with
-  // implicit var arg like f() pass.
+  // implicit var arg like f() pass. See bug 617733.
   // We treat this case as if the arg list is empty.
-  //if (F.isVarArg()) {
+  // if (F.isVarArg()) {
   // assert(0 && "VarArg not supported yet!");
   //}
 
@@ -1237,43 +1690,185 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   return Chain;
 }
 
-SDValue NVPTXTargetLowering::LowerReturn(
-    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
-    SelectionDAG &DAG) const {
+
+SDValue
+NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                 bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<SDValue> &OutVals,
+                                 SDLoc dl, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+  Type *RetTy = F->getReturnType();
+  const DataLayout *TD = getDataLayout();
 
   bool isABI = (nvptxSubtarget.getSmVersion() >= 20);
+  assert(isABI && "Non-ABI compilation is not supported");
+  if (!isABI)
+    return Chain;
+
+  if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
+    // If we have a vector type, the OutVals array will be the scalarized
+    // components and we have combine them into 1 or more vector stores.
+    unsigned NumElts = VTy->getNumElements();
+    assert(NumElts == Outs.size() && "Bad scalarization of return value");
+
+    // const_cast can be removed in later LLVM versions
+    EVT EltVT = getValueType(RetTy).getVectorElementType();
+    bool NeedExtend = false;
+    if (EltVT.getSizeInBits() < 16)
+      NeedExtend = true;
+
+    // V1 store
+    if (NumElts == 1) {
+      SDValue StoreVal = OutVals[0];
+      // We only have one element, so just directly store it
+      if (NeedExtend)
+        StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
+      SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                      DAG.getVTList(MVT::Other), &Ops[0], 3,
+                                      EltVT, MachinePointerInfo());
+
+    } else if (NumElts == 2) {
+      // V2 store
+      SDValue StoreVal0 = OutVals[0];
+      SDValue StoreVal1 = OutVals[1];
+
+      if (NeedExtend) {
+        StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
+        StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
+      }
 
-  unsigned sizesofar = 0;
-  unsigned idx = 0;
-  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-    SDValue theVal = OutVals[i];
-    EVT theValType = theVal.getValueType();
-    unsigned numElems = 1;
-    if (theValType.isVector())
-      numElems = theValType.getVectorNumElements();
-    for (unsigned j = 0, je = numElems; j != je; ++j) {
-      SDValue tmpval = theVal;
-      if (theValType.isVector())
-        tmpval = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                             theValType.getVectorElementType(), tmpval,
-                             DAG.getIntPtrConstant(j));
-      Chain = DAG.getNode(
-          isABI ? NVPTXISD::StoreRetval : NVPTXISD::MoveToRetval, dl,
-          MVT::Other, Chain, DAG.getConstant(isABI ? sizesofar : idx, MVT::i32),
-          tmpval);
-      if (theValType.isVector())
-        sizesofar += theValType.getVectorElementType().getStoreSizeInBits() / 8;
-      else
-        sizesofar += theValType.getStoreSizeInBits() / 8;
-      ++idx;
+      SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0,
+                        StoreVal1 };
+      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
+                                      DAG.getVTList(MVT::Other), &Ops[0], 4,
+                                      EltVT, MachinePointerInfo());
+    } else {
+      // V4 stores
+      // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
+      // vector will be expanded to a power of 2 elements, so we know we can
+      // always round up to the next multiple of 4 when creating the vector
+      // stores.
+      // e.g.  4 elem => 1 st.v4
+      //       6 elem => 2 st.v4
+      //       8 elem => 2 st.v4
+      //      11 elem => 3 st.v4
+
+      unsigned VecSize = 4;
+      if (OutVals[0].getValueType().getSizeInBits() == 64)
+        VecSize = 2;
+
+      unsigned Offset = 0;
+
+      EVT VecVT =
+          EVT::getVectorVT(F->getContext(), OutVals[0].getValueType(), VecSize);
+      unsigned PerStoreOffset =
+          TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+
+      for (unsigned i = 0; i < NumElts; i += VecSize) {
+        // Get values
+        SDValue StoreVal;
+        SmallVector<SDValue, 8> Ops;
+        Ops.push_back(Chain);
+        Ops.push_back(DAG.getConstant(Offset, MVT::i32));
+        unsigned Opc = NVPTXISD::StoreRetvalV2;
+        EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
+
+        StoreVal = OutVals[i];
+        if (NeedExtend)
+          StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+        Ops.push_back(StoreVal);
+
+        if (i + 1 < NumElts) {
+          StoreVal = OutVals[i + 1];
+          if (NeedExtend)
+            StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+        } else {
+          StoreVal = DAG.getUNDEF(ExtendedVT);
+        }
+        Ops.push_back(StoreVal);
+
+        if (VecSize == 4) {
+          Opc = NVPTXISD::StoreRetvalV4;
+          if (i + 2 < NumElts) {
+            StoreVal = OutVals[i + 2];
+            if (NeedExtend)
+              StoreVal =
+                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+          } else {
+            StoreVal = DAG.getUNDEF(ExtendedVT);
+          }
+          Ops.push_back(StoreVal);
+
+          if (i + 3 < NumElts) {
+            StoreVal = OutVals[i + 3];
+            if (NeedExtend)
+              StoreVal =
+                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
+          } else {
+            StoreVal = DAG.getUNDEF(ExtendedVT);
+          }
+          Ops.push_back(StoreVal);
+        }
+
+        // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
+        Chain =
+            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), &Ops[0],
+                                    Ops.size(), EltVT, MachinePointerInfo());
+        Offset += PerStoreOffset;
+      }
+    }
+  } else {
+    SmallVector<EVT, 16> ValVTs;
+    // const_cast is necessary since we are still using an LLVM version from
+    // before the type system re-write.
+    ComputePTXValueVTs(*this, RetTy, ValVTs);
+    assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
+
+    unsigned SizeSoFar = 0;
+    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+      SDValue theVal = OutVals[i];
+      EVT TheValType = theVal.getValueType();
+      unsigned numElems = 1;
+      if (TheValType.isVector())
+        numElems = TheValType.getVectorNumElements();
+      for (unsigned j = 0, je = numElems; j != je; ++j) {
+        SDValue TmpVal = theVal;
+        if (TheValType.isVector())
+          TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                               TheValType.getVectorElementType(), TmpVal,
+                               DAG.getIntPtrConstant(j));
+        EVT TheStoreType = ValVTs[i];
+        if (RetTy->isIntegerTy() &&
+            TD->getTypeAllocSizeInBits(RetTy) < 32) {
+          // The following zero-extension is for integer types only, and
+          // specifically not for aggregates.
+          TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
+          TheStoreType = MVT::i32;
+        }
+        else if (TmpVal.getValueType().getSizeInBits() < 16)
+          TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
+
+        SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal };
+        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
+                                        DAG.getVTList(MVT::Other), &Ops[0],
+                                        3, TheStoreType,
+                                        MachinePointerInfo());
+        if(TheValType.isVector())
+          SizeSoFar += 
+            TheStoreType.getVectorElementType().getStoreSizeInBits() / 8;
+        else
+          SizeSoFar += TheStoreType.getStoreSizeInBits()/8;
+      }
     }
   }
 
   return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain);
 }
 
+
 void NVPTXTargetLowering::LowerAsmOperandForConstraint(
     SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
     SelectionDAG &DAG) const {
@@ -1337,9 +1932,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
 
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
-      Info.memVT = MVT::i32;
+      Info.memVT = getValueType(I.getType());
     else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
-      Info.memVT = getPointerTy();
+      Info.memVT = getValueType(I.getType());
     else
       Info.memVT = MVT::f32;
     Info.ptrVal = I.getArgOperand(0);
@@ -1420,11 +2015,11 @@ NVPTXTargetLowering::getConstraintType(const std::string &Constraint) const {
 
 std::pair<unsigned, const TargetRegisterClass *>
 NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                  EVT VT) const {
+                                                  MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'c':
-      return std::make_pair(0U, &NVPTX::Int8RegsRegClass);
+      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
     case 'h':
       return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
     case 'r':
@@ -1450,7 +2045,7 @@ unsigned NVPTXTargetLowering::getFunctionAlignment(const Function *) const {
 static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &Results) {
   EVT ResVT = N->getValueType(0);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   assert(ResVT.isVector() && "Vector load must have vector type");
 
@@ -1543,7 +2138,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &Results) {
   SDValue Chain = N->getOperand(0);
   SDValue Intrin = N->getOperand(1);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Get the intrinsic ID
   unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
@@ -1564,7 +2159,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
       unsigned NumElts = ResVT.getVectorNumElements();
       EVT EltVT = ResVT.getVectorElementType();
 
-      // Since LDU/LDG are target nodes, we cannot rely on DAG type legalization.
+      // Since LDU/LDG are target nodes, we cannot rely on DAG type
+      // legalization.
       // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
       // loaded type to i16 and propogate the "real" type as the memory type.
       bool NeedTrunc = false;
@@ -1623,7 +2219,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
 
       OtherOps.push_back(Chain); // Chain
                                  // Skip operand 1 (intrinsic ID)
-                                 // Others
+      // Others
       for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i)
         OtherOps.push_back(N->getOperand(i));
 
@@ -1671,7 +2267,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
           DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, &Ops[0],
                                   Ops.size(), MVT::i8, MemSD->getMemOperand());
 
-      Results.push_back(NewLD.getValue(0));
+      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
+                                    NewLD.getValue(0)));
       Results.push_back(NewLD.getValue(1));
     }
   }
@@ -1691,3 +2288,29 @@ void NVPTXTargetLowering::ReplaceNodeResults(
     return;
   }
 }
+
+// Pin NVPTXSection's and NVPTXTargetObjectFile's vtables to this file.
+void NVPTXSection::anchor() {}
+
+NVPTXTargetObjectFile::~NVPTXTargetObjectFile() {
+  delete TextSection;
+  delete DataSection;
+  delete BSSSection;
+  delete ReadOnlySection;
+
+  delete StaticCtorSection;
+  delete StaticDtorSection;
+  delete LSDASection;
+  delete EHFrameSection;
+  delete DwarfAbbrevSection;
+  delete DwarfInfoSection;
+  delete DwarfLineSection;
+  delete DwarfFrameSection;
+  delete DwarfPubTypesSection;
+  delete DwarfDebugInlineSection;
+  delete DwarfStrSection;
+  delete DwarfLocSection;
+  delete DwarfARangesSection;
+  delete DwarfRangesSection;
+  delete DwarfMacroInfoSection;
+}
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 3cd49d3..66e708f 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -29,17 +29,11 @@ enum NodeType {
   CALL,
   RET_FLAG,
   LOAD_PARAM,
-  NVBuiltin,
   DeclareParam,
   DeclareScalarParam,
   DeclareRetParam,
   DeclareRet,
   DeclareScalarRet,
-  LoadParam,
-  StoreParam,
-  StoreParamS32, // to sext and store a <32bit value, not used currently
-  StoreParamU32, // to zext and store a <32bit value, not used currently
-  MoveToParam,
   PrintCall,
   PrintCallUni,
   CallArgBegin,
@@ -51,13 +45,11 @@ enum NodeType {
   CallSymbol,
   Prototype,
   MoveParam,
-  MoveRetval,
-  MoveToRetval,
-  StoreRetval,
   PseudoUseParam,
   RETURN,
   CallSeqBegin,
   CallSeqEnd,
+  CallPrototype,
   Dummy,
 
   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -67,7 +59,18 @@ enum NodeType {
   LDUV2, // LDU.v2
   LDUV4, // LDU.v4
   StoreV2,
-  StoreV4
+  StoreV4,
+  LoadParam,
+  LoadParamV2,
+  LoadParamV4,
+  StoreParam,
+  StoreParamV2,
+  StoreParamV4,
+  StoreParamS32, // to sext and store a <32bit value, not used currently
+  StoreParamU32, // to zext and store a <32bit value, not used currently 
+  StoreRetval,
+  StoreRetvalV2,
+  StoreRetvalV4
 };
 }
 
@@ -100,7 +103,7 @@ public:
   /// getFunctionAlignment - Return the Log2 alignment of this function.
   virtual unsigned getFunctionAlignment(const Function *F) const;
 
-  virtual EVT getSetCCResultType(EVT VT) const {
+  virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const {
     if (VT.isVector())
       return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
     return MVT::i1;
@@ -108,11 +111,11 @@ public:
 
   ConstraintType getConstraintType(const std::string &Constraint) const;
   std::pair<unsigned, const TargetRegisterClass *>
-  getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+  getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
 
   virtual SDValue LowerFormalArguments(
       SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-      const SmallVectorImpl<ISD::InputArg> &Ins, DebugLoc dl, SelectionDAG &DAG,
+      const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
       SmallVectorImpl<SDValue> &InVals) const;
 
   virtual SDValue
@@ -120,12 +123,13 @@ public:
 
   std::string getPrototype(Type *, const ArgListTy &,
                            const SmallVectorImpl<ISD::OutputArg> &,
-                           unsigned retAlignment) const;
+                           unsigned retAlignment,
+                           const ImmutableCallSite *CS) const;
 
   virtual SDValue
   LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
               const SmallVectorImpl<ISD::OutputArg> &Outs,
-              const SmallVectorImpl<SDValue> &OutVals, DebugLoc dl,
+              const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
               SelectionDAG &DAG) const;
 
   virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
@@ -144,7 +148,7 @@ private:
 
   SDValue getExtSymb(SelectionDAG &DAG, const char *name, int idx,
                      EVT = MVT::i32) const;
-  SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT = MVT::i32) const;
+  SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
   SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
 
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
@@ -158,6 +162,9 @@ private:
 
   virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                   SelectionDAG &DAG) const;
+
+  unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
+                                Type *Ty, unsigned Idx) const;
 };
 } // namespace llvm
 
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 33a63c2..86ddd38 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -14,54 +14,53 @@
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
 #include "NVPTXTargetMachine.h"
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "NVPTXGenInstrInfo.inc"
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include <cstdio>
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+void NVPTXInstrInfo::anchor() {}
+
 // FIXME: Add the subtarget support on this constructor.
 NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
-    : NVPTXGenInstrInfo(), TM(tm), RegInfo(*this, *TM.getSubtargetImpl()) {}
+    : NVPTXGenInstrInfo(), TM(tm), RegInfo(*TM.getSubtargetImpl()) {}
 
 void NVPTXInstrInfo::copyPhysReg(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
     unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
-  if (NVPTX::Int32RegsRegClass.contains(DestReg) &&
-      NVPTX::Int32RegsRegClass.contains(SrcReg))
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
+  const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+
+  if (DestRC != SrcRC)
+    report_fatal_error("Attempted to created cross-class register copy");
+
+  if (DestRC == &NVPTX::Int32RegsRegClass)
     BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-  else if (NVPTX::Int8RegsRegClass.contains(DestReg) &&
-           NVPTX::Int8RegsRegClass.contains(SrcReg))
-    BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-  else if (NVPTX::Int1RegsRegClass.contains(DestReg) &&
-           NVPTX::Int1RegsRegClass.contains(SrcReg))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Int1RegsRegClass)
     BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-  else if (NVPTX::Float32RegsRegClass.contains(DestReg) &&
-           NVPTX::Float32RegsRegClass.contains(SrcReg))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Float32RegsRegClass)
     BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-  else if (NVPTX::Int16RegsRegClass.contains(DestReg) &&
-           NVPTX::Int16RegsRegClass.contains(SrcReg))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Int16RegsRegClass)
     BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-  else if (NVPTX::Int64RegsRegClass.contains(DestReg) &&
-           NVPTX::Int64RegsRegClass.contains(SrcReg))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Int64RegsRegClass)
     BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-  else if (NVPTX::Float64RegsRegClass.contains(DestReg) &&
-           NVPTX::Float64RegsRegClass.contains(SrcReg))
+      .addReg(SrcReg, getKillRegState(KillSrc));
+  else if (DestRC == &NVPTX::Float64RegsRegClass)
     BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
+      .addReg(SrcReg, getKillRegState(KillSrc));
   else {
-    llvm_unreachable("Don't know how to copy a register");
+    llvm_unreachable("Bad register copy");
   }
 }
 
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index b1972e9..600fc5c 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -26,6 +26,7 @@ namespace llvm {
 class NVPTXInstrInfo : public NVPTXGenInstrInfo {
   NVPTXTargetMachine &TM;
   const NVPTXRegisterInfo RegInfo;
+  virtual void anchor();
 public:
   explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
 
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index da6dd39..b23f1e4 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -32,6 +32,86 @@ def isVecOther   : VecInstTypeEnum<15>;
 
 def brtarget    : Operand<OtherVT>;
 
+// CVT conversion modes
+// These must match the enum in NVPTX.h
+def CvtNONE : PatLeaf<(i32 0x0)>;
+def CvtRNI  : PatLeaf<(i32 0x1)>;
+def CvtRZI  : PatLeaf<(i32 0x2)>;
+def CvtRMI  : PatLeaf<(i32 0x3)>;
+def CvtRPI  : PatLeaf<(i32 0x4)>;
+def CvtRN   : PatLeaf<(i32 0x5)>;
+def CvtRZ   : PatLeaf<(i32 0x6)>;
+def CvtRM   : PatLeaf<(i32 0x7)>;
+def CvtRP   : PatLeaf<(i32 0x8)>;
+
+def CvtNONE_FTZ : PatLeaf<(i32 0x10)>;
+def CvtRNI_FTZ  : PatLeaf<(i32 0x11)>;
+def CvtRZI_FTZ  : PatLeaf<(i32 0x12)>;
+def CvtRMI_FTZ  : PatLeaf<(i32 0x13)>;
+def CvtRPI_FTZ  : PatLeaf<(i32 0x14)>;
+def CvtRN_FTZ   : PatLeaf<(i32 0x15)>;
+def CvtRZ_FTZ   : PatLeaf<(i32 0x16)>;
+def CvtRM_FTZ   : PatLeaf<(i32 0x17)>;
+def CvtRP_FTZ   : PatLeaf<(i32 0x18)>;
+
+def CvtSAT      : PatLeaf<(i32 0x20)>;
+def CvtSAT_FTZ  : PatLeaf<(i32 0x30)>;
+
+def CvtMode : Operand<i32> {
+  let PrintMethod = "printCvtMode";
+}
+
+// Compare modes
+// These must match the enum in NVPTX.h
+def CmpEQ   : PatLeaf<(i32 0)>;
+def CmpNE   : PatLeaf<(i32 1)>;
+def CmpLT   : PatLeaf<(i32 2)>;
+def CmpLE   : PatLeaf<(i32 3)>;
+def CmpGT   : PatLeaf<(i32 4)>;
+def CmpGE   : PatLeaf<(i32 5)>;
+def CmpLO   : PatLeaf<(i32 6)>;
+def CmpLS   : PatLeaf<(i32 7)>;
+def CmpHI   : PatLeaf<(i32 8)>;
+def CmpHS   : PatLeaf<(i32 9)>;
+def CmpEQU  : PatLeaf<(i32 10)>;
+def CmpNEU  : PatLeaf<(i32 11)>;
+def CmpLTU  : PatLeaf<(i32 12)>;
+def CmpLEU  : PatLeaf<(i32 13)>;
+def CmpGTU  : PatLeaf<(i32 14)>;
+def CmpGEU  : PatLeaf<(i32 15)>;
+def CmpNUM  : PatLeaf<(i32 16)>;
+def CmpNAN  : PatLeaf<(i32 17)>;
+
+def CmpEQ_FTZ   : PatLeaf<(i32 0x100)>;
+def CmpNE_FTZ   : PatLeaf<(i32 0x101)>;
+def CmpLT_FTZ   : PatLeaf<(i32 0x102)>;
+def CmpLE_FTZ   : PatLeaf<(i32 0x103)>;
+def CmpGT_FTZ   : PatLeaf<(i32 0x104)>;
+def CmpGE_FTZ   : PatLeaf<(i32 0x105)>;
+def CmpLO_FTZ   : PatLeaf<(i32 0x106)>;
+def CmpLS_FTZ   : PatLeaf<(i32 0x107)>;
+def CmpHI_FTZ   : PatLeaf<(i32 0x108)>;
+def CmpHS_FTZ   : PatLeaf<(i32 0x109)>;
+def CmpEQU_FTZ  : PatLeaf<(i32 0x10A)>;
+def CmpNEU_FTZ  : PatLeaf<(i32 0x10B)>;
+def CmpLTU_FTZ  : PatLeaf<(i32 0x10C)>;
+def CmpLEU_FTZ  : PatLeaf<(i32 0x10D)>;
+def CmpGTU_FTZ  : PatLeaf<(i32 0x10E)>;
+def CmpGEU_FTZ  : PatLeaf<(i32 0x10F)>;
+def CmpNUM_FTZ  : PatLeaf<(i32 0x110)>;
+def CmpNAN_FTZ  : PatLeaf<(i32 0x111)>;
+
+def CmpMode : Operand<i32> {
+  let PrintMethod = "printCmpMode";
+}
+
+def F32ConstZero : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
+    return CurDAG->getTargetConstantFP(0.0, MVT::f32);
+  }]>;
+def F32ConstOne : Operand<f32>, PatLeaf<(f32 fpimm)>, SDNodeXForm<fpimm, [{
+    return CurDAG->getTargetConstantFP(1.0, MVT::f32);
+  }]>;
+
 //===----------------------------------------------------------------------===//
 // NVPTX Instruction Predicate Definitions
 //===----------------------------------------------------------------------===//
@@ -56,127 +136,31 @@ def hasLDG : Predicate<"Subtarget.hasLDG()">;
 def hasLDU : Predicate<"Subtarget.hasLDU()">;
 def hasGenericLdSt : Predicate<"Subtarget.hasGenericLdSt()">;
 
-def doF32FTZ : Predicate<"UseF32FTZ">;
+def doF32FTZ : Predicate<"useF32FTZ()">;
+def doNoF32FTZ : Predicate<"!useF32FTZ()">;
 
 def doFMAF32      : Predicate<"doFMAF32">;
-def doFMAF32_ftz  : Predicate<"(doFMAF32 && UseF32FTZ)">;
+def doFMAF32_ftz  : Predicate<"(doFMAF32 && useF32FTZ())">;
 def doFMAF32AGG      : Predicate<"doFMAF32AGG">;
-def doFMAF32AGG_ftz  : Predicate<"(doFMAF32AGG && UseF32FTZ)">;
+def doFMAF32AGG_ftz  : Predicate<"(doFMAF32AGG && useF32FTZ())">;
 def doFMAF64      : Predicate<"doFMAF64">;
 def doFMAF64AGG      : Predicate<"doFMAF64AGG">;
-def doFMADF32     : Predicate<"doFMADF32">;
-def doFMADF32_ftz : Predicate<"(doFMADF32 && UseF32FTZ)">;
 
 def doMulWide      : Predicate<"doMulWide">;
 
 def allowFMA : Predicate<"allowFMA">;
-def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
+def allowFMA_ftz : Predicate<"(allowFMA && useF32FTZ())">;
 
-def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
-def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
+def do_DIVF32_APPROX : Predicate<"getDivF32Level()==0">;
+def do_DIVF32_FULL : Predicate<"getDivF32Level()==1">;
 
-def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">;
-def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">;
+def do_SQRTF32_APPROX : Predicate<"!usePrecSqrtF32()">;
+def do_SQRTF32_RN : Predicate<"usePrecSqrtF32()">;
 
 def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
 
 def true : Predicate<"1">;
 
-//===----------------------------------------------------------------------===//
-// Special Handling for 8-bit Operands and Operations
-//
-// PTX supports 8-bit signed and unsigned types, but does not support 8-bit
-// operations (like add, shift, etc) except for ld/st/cvt. SASS does not have
-// 8-bit registers.
-//
-// PTX ld, st and cvt instructions permit source and destination data operands
-// to be wider than the instruction-type size, so that narrow values may be
-// loaded, stored, and converted using regular-width registers.
-//
-// So in PTX generation, we
-// - always use 16-bit registers in place in 8-bit registers.
-//   (8-bit variables should stay as 8-bit as they represent memory layout.)
-// - for the following 8-bit operations, we sign-ext/zero-ext the 8-bit values
-//   before operation
-//   . div
-//   . rem
-//   . neg (sign)
-//   . set, setp
-//   . shr
-//
-// We are patching the operations by inserting the cvt instructions in the
-// asm strings of the affected instructions.
-//
-// Since vector operations, except for ld/st, are eventually elementized. We
-// do not need to special-hand the vector 8-bit operations.
-//
-//
-//===----------------------------------------------------------------------===//
-
-// Generate string block like
-// {
-//   .reg .s16 %temp1;
-//   .reg .s16 %temp2;
-//   cvt.s16.s8 %temp1, %a;
-//   cvt.s16.s8 %temp2, %b;
-//   opc.s16    %dst, %temp1, %temp2;
-// }
-// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
-class Handle_i8rr<string OpcStr, string TypeStr, string CVTStr> {
-  string s = !strconcat("{{\n\t",
-             !strconcat(".reg .", !strconcat(TypeStr,
-             !strconcat(" \t%temp1;\n\t",
-             !strconcat(".reg .", !strconcat(TypeStr,
-             !strconcat(" \t%temp2;\n\t",
-             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
-             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
-             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}"))))))))))));
-}
-
-// Generate string block like
-// {
-//   .reg .s16 %temp1;
-//   .reg .s16 %temp2;
-//   cvt.s16.s8 %temp1, %a;
-//   mov.b16    %temp2, %b;
-//   cvt.s16.s8 %temp2, %temp2;
-//   opc.s16    %dst, %temp1, %temp2;
-// }
-// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
-class Handle_i8ri<string OpcStr, string TypeStr, string CVTStr> {
-  string s = !strconcat("{{\n\t",
-             !strconcat(".reg .", !strconcat(TypeStr,
-             !strconcat(" \t%temp1;\n\t",
-             !strconcat(".reg .",
-             !strconcat(TypeStr, !strconcat(" \t%temp2;\n\t",
-             !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t",
-             !strconcat("mov.b16 \t%temp2, $b;\n\t",
-             !strconcat(CVTStr, !strconcat(" \t%temp2, %temp2;\n\t",
-             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
-}
-
-// Generate string block like
-// {
-//   .reg .s16 %temp1;
-//   .reg .s16 %temp2;
-//   mov.b16    %temp1, %b;
-//   cvt.s16.s8 %temp1, %temp1;
-//   cvt.s16.s8 %temp2, %a;
-//   opc.s16    %dst, %temp1, %temp2;
-// }
-// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8
-class Handle_i8ir<string OpcStr, string TypeStr, string CVTStr> {
-  string s = !strconcat("{{\n\t",
-             !strconcat(".reg .", !strconcat(TypeStr,
-             !strconcat(" \t%temp1;\n\t",
-             !strconcat(".reg .", !strconcat(TypeStr,
-             !strconcat(" \t%temp2;\n\t",
-             !strconcat("mov.b16 \t%temp1, $a;\n\t",
-             !strconcat(CVTStr, !strconcat(" \t%temp1, %temp1;\n\t",
-             !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t",
-             !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))));
-}
-
 
 //===----------------------------------------------------------------------===//
 // Some Common Instruction Class Templates
@@ -204,66 +188,6 @@ multiclass I3<string OpcStr, SDNode OpNode> {
   def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
                      [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
-  def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
-  def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
-                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>;
-}
-
-multiclass I3_i8<string OpcStr, SDNode OpNode, string TypeStr, string CVTStr> {
-  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
-                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
-                       Int64Regs:$b))]>;
-  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
-                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
-  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
-                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
-                       Int32Regs:$b))]>;
-  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
-                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
-  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
-                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
-                       Int16Regs:$b))]>;
-  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
-                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
-  def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-                     Handle_i8rr<OpcStr, TypeStr, CVTStr>.s,
-                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
-  def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
-                     Handle_i8ri<OpcStr, TypeStr, CVTStr>.s,
-                     [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>;
-}
-
-multiclass I3_noi8<string OpcStr, SDNode OpNode> {
-  def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
-                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a,
-                       Int64Regs:$b))]>;
-  def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
-                     !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-                     [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
-  def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
-                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a,
-                       Int32Regs:$b))]>;
-  def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
-                     !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-                     [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
-  def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
-                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
-                       Int16Regs:$b))]>;
-  def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
-                     !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                     [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>;
 }
 
 multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> {
@@ -369,6 +293,90 @@ multiclass F2<string OpcStr, SDNode OpNode> {
 //===----------------------------------------------------------------------===//
 
 //-----------------------------------
+// General Type Conversion
+//-----------------------------------
+
+let neverHasSideEffects = 1 in {
+// Generate a cvt to the given type from all possible types.
+// Each instance takes a CvtMode immediate that defines the conversion mode to
+// use.  It can be CvtNONE to omit a conversion mode.
+multiclass CVT_FROM_ALL<string FromName, RegisterClass RC> {
+  def _s16 : NVPTXInst<(outs RC:$dst),
+                       (ins Int16Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".s16\t$dst, $src;"),
+                       []>;
+  def _u16 : NVPTXInst<(outs RC:$dst),
+                       (ins Int16Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".u16\t$dst, $src;"),
+                       []>;
+  def _f16 : NVPTXInst<(outs RC:$dst),
+                       (ins Int16Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".f16\t$dst, $src;"),
+                       []>;
+  def _s32 : NVPTXInst<(outs RC:$dst),
+                       (ins Int32Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".s32\t$dst, $src;"),
+                       []>;
+  def _u32 : NVPTXInst<(outs RC:$dst),
+                       (ins Int32Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".u32\t$dst, $src;"),
+                       []>;
+  def _s64 : NVPTXInst<(outs RC:$dst),
+                       (ins Int64Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".s64\t$dst, $src;"),
+                       []>;
+  def _u64 : NVPTXInst<(outs RC:$dst),
+                       (ins Int64Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".u64\t$dst, $src;"),
+                       []>;
+  def _f32 : NVPTXInst<(outs RC:$dst),
+                       (ins Float32Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".f32\t$dst, $src;"),
+                       []>;
+  def _f64 : NVPTXInst<(outs RC:$dst),
+                       (ins Float64Regs:$src, CvtMode:$mode),
+                       !strconcat("cvt${mode:base}${mode:ftz}${mode:sat}.",
+                       FromName, ".f64\t$dst, $src;"),
+                       []>;
+}
+
+// Generate a cvt to all possible types.
+defm CVT_s16 : CVT_FROM_ALL<"s16", Int16Regs>;
+defm CVT_u16 : CVT_FROM_ALL<"u16", Int16Regs>;
+defm CVT_f16 : CVT_FROM_ALL<"f16", Int16Regs>;
+defm CVT_s32 : CVT_FROM_ALL<"s32", Int32Regs>;
+defm CVT_u32 : CVT_FROM_ALL<"u32", Int32Regs>;
+defm CVT_s64 : CVT_FROM_ALL<"s64", Int64Regs>;
+defm CVT_u64 : CVT_FROM_ALL<"u64", Int64Regs>;
+defm CVT_f32 : CVT_FROM_ALL<"f32", Float32Regs>;
+defm CVT_f64 : CVT_FROM_ALL<"f64", Float64Regs>;
+
+// This set of cvt is different from the above. The type of the source
+// and target are the same.
+//
+def CVT_INREG_s16_s8 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
+                        "cvt.s16.s8 \t$dst, $src;", []>;
+def CVT_INREG_s32_s8 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                        "cvt.s32.s8 \t$dst, $src;", []>;
+def CVT_INREG_s32_s16 : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src),
+                        "cvt.s32.s16 \t$dst, $src;", []>;
+def CVT_INREG_s64_s8 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                        "cvt.s64.s8 \t$dst, $src;", []>;
+def CVT_INREG_s64_s16 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                        "cvt.s64.s16 \t$dst, $src;", []>;
+def CVT_INREG_s64_s32 : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src),
+                        "cvt.s64.s32 \t$dst, $src;", []>;
+}
+
+//-----------------------------------
 // Integer Arithmetic
 //-----------------------------------
 
@@ -522,81 +530,17 @@ def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)),
 
 defm MULT : I3<"mul.lo.s", mul>;
 
-defm MULTHS : I3_noi8<"mul.hi.s", mulhs>;
-defm MULTHU : I3_noi8<"mul.hi.u", mulhu>;
-def MULTHSi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-            !strconcat("{{ \n\t",
-            !strconcat(".reg \t.s16 temp1; \n\t",
-            !strconcat(".reg \t.s16 temp2; \n\t",
-            !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t",
-            !strconcat("cvt.s16.s8 \ttemp2, $b; \n\t",
-            !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t",
-            !strconcat("shr.s16 \t$dst, $dst, 8; \n\t",
-            !strconcat("}}", "")))))))),
-      [(set Int8Regs:$dst, (mulhs Int8Regs:$a, Int8Regs:$b))]>;
-def MULTHSi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
-            !strconcat("{{ \n\t",
-            !strconcat(".reg \t.s16 temp1; \n\t",
-            !strconcat(".reg \t.s16 temp2; \n\t",
-            !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t",
-            !strconcat("mov.b16 \ttemp2, $b; \n\t",
-            !strconcat("cvt.s16.s8 \ttemp2, temp2; \n\t",
-            !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t",
-            !strconcat("shr.s16 \t$dst, $dst, 8; \n\t",
-            !strconcat("}}", ""))))))))),
-      [(set Int8Regs:$dst, (mulhs Int8Regs:$a, imm:$b))]>;
-def MULTHUi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-            !strconcat("{{ \n\t",
-            !strconcat(".reg \t.u16 temp1; \n\t",
-            !strconcat(".reg \t.u16 temp2; \n\t",
-            !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t",
-            !strconcat("cvt.u16.u8 \ttemp2, $b; \n\t",
-            !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t",
-            !strconcat("shr.u16 \t$dst, $dst, 8; \n\t",
-            !strconcat("}}", "")))))))),
-      [(set Int8Regs:$dst, (mulhu Int8Regs:$a, Int8Regs:$b))]>;
-def MULTHUi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
-            !strconcat("{{ \n\t",
-            !strconcat(".reg \t.u16 temp1; \n\t",
-            !strconcat(".reg \t.u16 temp2; \n\t",
-            !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t",
-            !strconcat("mov.b16 \ttemp2, $b; \n\t",
-            !strconcat("cvt.u16.u8 \ttemp2, temp2; \n\t",
-            !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t",
-            !strconcat("shr.u16 \t$dst, $dst, 8; \n\t",
-            !strconcat("}}", ""))))))))),
-      [(set Int8Regs:$dst, (mulhu Int8Regs:$a, imm:$b))]>;
-
-
-defm SDIV : I3_i8<"div.s", sdiv, "s16", "cvt.s16.s8">;
-defm UDIV : I3_i8<"div.u", udiv, "u16", "cvt.u16.u8">;
-
-defm SREM : I3_i8<"rem.s", srem, "s16", "cvt.s16.s8">;
+defm MULTHS : I3<"mul.hi.s", mulhs>;
+defm MULTHU : I3<"mul.hi.u", mulhu>;
+
+defm SDIV : I3<"div.s", sdiv>;
+defm UDIV : I3<"div.u", udiv>;
+
+defm SREM : I3<"rem.s", srem>;
 // The ri version will not be selected as DAGCombiner::visitSREM will lower it.
-defm UREM : I3_i8<"rem.u", urem, "u16", "cvt.u16.u8">;
+defm UREM : I3<"rem.u", urem>;
 // The ri version will not be selected as DAGCombiner::visitUREM will lower it.
 
-def MAD8rrr : NVPTXInst<(outs Int8Regs:$dst),
-                      (ins Int8Regs:$a, Int8Regs:$b, Int8Regs:$c),
-                      "mad.lo.s16 \t$dst, $a, $b, $c;",
-                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b),
-                        Int8Regs:$c))]>;
-def MAD8rri : NVPTXInst<(outs Int8Regs:$dst),
-                      (ins Int8Regs:$a, Int8Regs:$b, i8imm:$c),
-                      "mad.lo.s16 \t$dst, $a, $b, $c;",
-                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b),
-                        imm:$c))]>;
-def MAD8rir : NVPTXInst<(outs Int8Regs:$dst),
-                      (ins Int8Regs:$a, i8imm:$b, Int8Regs:$c),
-                      "mad.lo.s16 \t$dst, $a, $b, $c;",
-                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b),
-                        Int8Regs:$c))]>;
-def MAD8rii : NVPTXInst<(outs Int8Regs:$dst),
-                      (ins Int8Regs:$a, i8imm:$b, i8imm:$c),
-                      "mad.lo.s16 \t$dst, $a, $b, $c;",
-                      [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b),
-                        imm:$c))]>;
-
 def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst),
                       (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c),
                       "mad.lo.s16 \t$dst, $a, $b, $c;",
@@ -661,10 +605,6 @@ def MAD64rii : NVPTXInst<(outs Int64Regs:$dst),
                         (mul Int64Regs:$a, imm:$b), imm:$c))]>;
 
 
-def INEG8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
-                     !strconcat("cvt.s16.s8 \t$dst, $src;\n\t",
-                                 "neg.s16 \t$dst, $dst;"),
-         [(set Int8Regs:$dst, (ineg Int8Regs:$src))]>;
 def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                      "neg.s16 \t$dst, $src;",
          [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>;
@@ -842,6 +782,16 @@ def FDIV32ri_prec : NVPTXInst<(outs Float32Regs:$dst),
                         (fdiv Float32Regs:$a, fpimm:$b))]>,
                       Requires<[reqPTX20]>;
 
+//
+// F32 rsqrt
+//
+
+def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
+                       "rsqrt.approx.f32 \t$dst, $b;", []>;
+
+def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
+         (RSQRTF32approx1r Float32Regs:$b)>,
+         Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
 
 multiclass FPCONTRACT32<string OpcStr, Predicate Pred> {
    def rrr : NVPTXInst<(outs Float32Regs:$dst),
@@ -912,8 +862,6 @@ multiclass FPCONTRACT64<string OpcStr, Predicate Pred> {
 // If we reverse the order of the following two lines, then rrr2 rule will be
 // generated for FMA32, but not for rrr.
 // Therefore, we manually write the rrr2 rule in FPCONTRACT32.
-defm FMAD32_ftz : FPCONTRACT32<"mad.ftz.f32", doFMADF32_ftz>;
-defm FMAD32 : FPCONTRACT32<"mad.f32", doFMADF32>;
 defm FMA32_ftz  : FPCONTRACT32<"fma.rn.ftz.f32", doFMAF32_ftz>;
 defm FMA32  : FPCONTRACT32<"fma.rn.f32", doFMAF32>;
 defm FMA64  : FPCONTRACT64<"fma.rn.f64", doFMAF64>;
@@ -952,8 +900,6 @@ multiclass FPCONTRACT64_SUB_PAT<NVPTXInst Inst, Predicate Pred> {
 
 defm FMAF32ext_ftz  : FPCONTRACT32_SUB_PAT<FMA32_ftzrrr, doFMAF32AGG_ftz>;
 defm FMAF32ext  : FPCONTRACT32_SUB_PAT<FMA32rrr, doFMAF32AGG>;
-defm FMADF32ext_ftz : FPCONTRACT32_SUB_PAT_MAD<FMAD32_ftzrrr, doFMADF32_ftz>;
-defm FMADF32ext : FPCONTRACT32_SUB_PAT_MAD<FMAD32rrr, doFMADF32>;
 defm FMAF64ext  : FPCONTRACT64_SUB_PAT<FMA64rrr, doFMAF64AGG>;
 
 def SINF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
@@ -963,6 +909,41 @@ def COSF:  NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
                       "cos.approx.f32 \t$dst, $src;",
                       [(set Float32Regs:$dst, (fcos Float32Regs:$src))]>;
 
+// Lower (frem x, y) into (sub x, (mul (floor (div x, y)) y))
+// e.g. "poor man's fmod()"
+
+// frem - f32 FTZ
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32rr_ftz (CVT_f32_f32
+            (FDIV32rr_prec_ftz Float32Regs:$x, Float32Regs:$y), CvtRMI_FTZ),
+             Float32Regs:$y))>,
+          Requires<[doF32FTZ]>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr_ftz Float32Regs:$x, (FMULf32ri_ftz (CVT_f32_f32
+            (FDIV32ri_prec_ftz Float32Regs:$x, fpimm:$y), CvtRMI_FTZ),
+             fpimm:$y))>,
+          Requires<[doF32FTZ]>;
+
+// frem - f32
+def : Pat<(frem Float32Regs:$x, Float32Regs:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32rr (CVT_f32_f32
+            (FDIV32rr_prec Float32Regs:$x, Float32Regs:$y), CvtRMI),
+             Float32Regs:$y))>;
+def : Pat<(frem Float32Regs:$x, fpimm:$y),
+          (FSUBf32rr Float32Regs:$x, (FMULf32ri (CVT_f32_f32
+            (FDIV32ri_prec Float32Regs:$x, fpimm:$y), CvtRMI),
+             fpimm:$y))>;
+
+// frem - f64
+def : Pat<(frem Float64Regs:$x, Float64Regs:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64rr (CVT_f64_f64
+            (FDIV64rr Float64Regs:$x, Float64Regs:$y), CvtRMI),
+             Float64Regs:$y))>;
+def : Pat<(frem Float64Regs:$x, fpimm:$y),
+          (FSUBf64rr Float64Regs:$x, (FMULf64ri (CVT_f64_f64
+            (FDIV64ri Float64Regs:$x, fpimm:$y), CvtRMI),
+             fpimm:$y))>;
+
 //-----------------------------------
 // Logical Arithmetic
 //-----------------------------------
@@ -974,12 +955,6 @@ multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> {
   def b1ri:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b),
                       !strconcat(OpcStr, ".pred  \t$dst, $a, $b;"),
                       [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>;
-  def b8rr:  NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
-                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
-  def b8ri:  NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
-                      !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
-                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
   def b16rr:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
                       !strconcat(OpcStr, ".b16  \t$dst, $a, $b;"),
                       [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
@@ -1010,9 +985,6 @@ defm XOR : LOG_FORMAT<"xor", xor>;
 def NOT1:  NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src),
                       "not.pred \t$dst, $src;",
                       [(set Int1Regs:$dst, (not Int1Regs:$src))]>;
-def NOT8:  NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
-                      "not.b16 \t$dst, $src;",
-                      [(set Int8Regs:$dst, (not Int8Regs:$src))]>;
 def NOT16:  NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                       "not.b16 \t$dst, $src;",
                       [(set Int16Regs:$dst, (not Int16Regs:$src))]>;
@@ -1056,21 +1028,13 @@ multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
                       !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
                       [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
                         (i32 imm:$b)))]>;
-   def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b),
-                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
-                        Int32Regs:$b))]>;
-   def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b),
-                      !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
-                        (i32 imm:$b)))]>;
 }
 
 defm SHL : LSHIFT_FORMAT<"shl.b", shl>;
 
 // For shifts, the second src operand must be 32-bit value
 // Need to add cvt for the 8-bits.
-multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode, string CVTStr> {
+multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode> {
    def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
                       Int32Regs:$b),
                       !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
@@ -1102,20 +1066,10 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode, string CVTStr> {
                       !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
                       [(set Int16Regs:$dst, (OpNode Int16Regs:$a,
                         (i32 imm:$b)))]>;
-   def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b),
-                      !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t",
-                      !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))),
-                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
-                        Int32Regs:$b))]>;
-   def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b),
-                      !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t",
-                      !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))),
-                      [(set Int8Regs:$dst, (OpNode Int8Regs:$a,
-                        (i32 imm:$b)))]>;
 }
 
-defm SRA : RSHIFT_FORMAT<"shr.s", sra, "cvt.s16.s8">;
-defm SRL : RSHIFT_FORMAT<"shr.u", srl, "cvt.u16.u8">;
+defm SRA : RSHIFT_FORMAT<"shr.s", sra>;
+defm SRL : RSHIFT_FORMAT<"shr.u", srl>;
 
 // 32bit
 def ROT32imm_sw : NVPTXInst<(outs Int32Regs:$dst),
@@ -1213,6 +1167,120 @@ def ROTR64reg_sw : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src,
 
 
 //-----------------------------------
+// General Comparison
+//-----------------------------------
+
+// General setp instructions
+multiclass SETP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+  def rr : NVPTXInst<(outs Int1Regs:$dst),
+                     (ins RC:$a, RC:$b, CmpMode:$cmp),
+            !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+                     []>;
+  def ri : NVPTXInst<(outs Int1Regs:$dst),
+                     (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+            !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+                     []>;
+  def ir : NVPTXInst<(outs Int1Regs:$dst),
+                     (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+            !strconcat("setp${cmp:base}${cmp:ftz}.", TypeStr, "\t$dst, $a, $b;"),
+                     []>;
+}
+
+defm SETP_b16 : SETP<"b16", Int16Regs, i16imm>;
+defm SETP_s16 : SETP<"s16", Int16Regs, i16imm>;
+defm SETP_u16 : SETP<"u16", Int16Regs, i16imm>;
+defm SETP_b32 : SETP<"b32", Int32Regs, i32imm>;
+defm SETP_s32 : SETP<"s32", Int32Regs, i32imm>;
+defm SETP_u32 : SETP<"u32", Int32Regs, i32imm>;
+defm SETP_b64 : SETP<"b64", Int64Regs, i64imm>;
+defm SETP_s64 : SETP<"s64", Int64Regs, i64imm>;
+defm SETP_u64 : SETP<"u64", Int64Regs, i64imm>;
+defm SETP_f32 : SETP<"f32", Float32Regs, f32imm>;
+defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
+
+// General set instructions
+multiclass SET<string TypeStr, RegisterClass RC, Operand ImmCls> {
+  def rr : NVPTXInst<(outs Int32Regs:$dst),
+                     (ins RC:$a, RC:$b, CmpMode:$cmp),
+                     !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+  def ri : NVPTXInst<(outs Int32Regs:$dst),
+                     (ins RC:$a, ImmCls:$b, CmpMode:$cmp),
+                     !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+  def ir : NVPTXInst<(outs Int32Regs:$dst),
+                     (ins ImmCls:$a, RC:$b, CmpMode:$cmp),
+                     !strconcat("set$cmp.", TypeStr, "\t$dst, $a, $b;"), []>;
+}
+
+defm SET_b16 : SET<"b16", Int16Regs, i16imm>;
+defm SET_s16 : SET<"s16", Int16Regs, i16imm>;
+defm SET_u16 : SET<"u16", Int16Regs, i16imm>;
+defm SET_b32 : SET<"b32", Int32Regs, i32imm>;
+defm SET_s32 : SET<"s32", Int32Regs, i32imm>;
+defm SET_u32 : SET<"u32", Int32Regs, i32imm>;
+defm SET_b64 : SET<"b64", Int64Regs, i64imm>;
+defm SET_s64 : SET<"s64", Int64Regs, i64imm>;
+defm SET_u64 : SET<"u64", Int64Regs, i64imm>;
+defm SET_f32 : SET<"f32", Float32Regs, f32imm>;
+defm SET_f64 : SET<"f64", Float64Regs, f64imm>;
+
+//-----------------------------------
+// General Selection
+//-----------------------------------
+
+// General selp instructions
+multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
+  def rr : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+  def ri : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+  def ir : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+  def ii : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"), []>;
+}
+
+multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,
+                        SDNode ImmNode> {
+  def rr : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                     [(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;
+  def ri : NVPTXInst<(outs RC:$dst),
+                     (ins RC:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                     [(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;
+  def ir : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, RC:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                     [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;
+  def ii : NVPTXInst<(outs RC:$dst),
+                     (ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
+                     !strconcat("selp.", TypeStr, "\t$dst, $a, $b, $p;"),
+                 [(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
+}
+
+defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;
+defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
+defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
+defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;
+defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
+defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
+defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;
+defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
+defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
+defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
+defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
+
+// Special select for predicate operands
+def : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
+              (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
+              (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
+
+//-----------------------------------
 // Data Movement (Load / Store, Move)
 //-----------------------------------
 
@@ -1253,12 +1321,19 @@ def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a),
                      "mov.u64 \t$dst, $a;",
                      [(set Int64Regs:$dst, (Wrapper tglobaladdr:$a))]>;
 
+// Get pointer to local stack
+def MOV_DEPOT_ADDR
+  : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+              "mov.u32 \t$d, __local_depot$num;", []>;
+def MOV_DEPOT_ADDR_64
+  : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+              "mov.u64 \t$d, __local_depot$num;", []>;
+
+
 // copyPhysreg is hard-coded in NVPTXInstrInfo.cpp
 let IsSimpleMove=1 in {
 def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss),
                    "mov.pred \t$dst, $sss;", []>;
-def IMOV8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$sss),
-                    "mov.u16 \t$dst, $sss;", []>;
 def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss),
                     "mov.u16 \t$dst, $sss;", []>;
 def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss),
@@ -1274,9 +1349,6 @@ def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src),
 def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src),
                     "mov.pred \t$dst, $src;",
           [(set Int1Regs:$dst, imm:$src)]>;
-def IMOV8ri: NVPTXInst<(outs Int8Regs:$dst), (ins i8imm:$src),
-                    "mov.u16 \t$dst, $src;",
-          [(set Int8Regs:$dst, imm:$src)]>;
 def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src),
                     "mov.u16 \t$dst, $src;",
           [(set Int16Regs:$dst, imm:$src)]>;
@@ -1308,440 +1380,194 @@ def LEA_ADDRi64 : NVPTXInst<(outs Int64Regs:$dst), (ins MEMri64:$addr),
 // Comparison and Selection
 //-----------------------------------
 
-// Generate string block like
-// {
-//   .reg .pred p;
-//   setp.gt.s16 p, %a, %b;
-//   selp.s16 %dst, -1, 0, p;
-// }
-// when OpcStr=setp.gt.s sz1=16 sz2=16 d=%dst a=%a b=%b
-class Set_Str<string OpcStr, string sz1, string sz2, string d, string a,
-  string b> {
-  string t1  = "{{\n\t.reg .pred p;\n\t";
-  string t2  = !strconcat(t1 , OpcStr);
-  string t3  = !strconcat(t2 , sz1);
-  string t4  = !strconcat(t3 , " \tp, ");
-  string t5  = !strconcat(t4 , a);
-  string t6  = !strconcat(t5 , ", ");
-  string t7  = !strconcat(t6 , b);
-  string t8  = !strconcat(t7 , ";\n\tselp.s");
-  string t9  = !strconcat(t8 , sz2);
-  string t10 = !strconcat(t9, " \t");
-  string t11 = !strconcat(t10, d);
-  string s   = !strconcat(t11, ", -1, 0, p;\n\t}}");
+multiclass ISET_FORMAT<PatFrag OpNode, PatLeaf Mode,
+                       Instruction setp_16rr,
+                       Instruction setp_16ri,
+                       Instruction setp_16ir,
+                       Instruction setp_32rr,
+                       Instruction setp_32ri,
+                       Instruction setp_32ir,
+                       Instruction setp_64rr,
+                       Instruction setp_64ri,
+                       Instruction setp_64ir,
+                       Instruction set_16rr,
+                       Instruction set_16ri,
+                       Instruction set_16ir,
+                       Instruction set_32rr,
+                       Instruction set_32ri,
+                       Instruction set_32ir,
+                       Instruction set_64rr,
+                       Instruction set_64ri,
+                       Instruction set_64ir> {
+  // i16 -> pred
+  def : Pat<(i1 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (setp_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int16Regs:$a, imm:$b)),
+            (setp_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int16Regs:$b)),
+            (setp_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 -> pred
+  def : Pat<(i1 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (setp_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int32Regs:$a, imm:$b)),
+            (setp_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int32Regs:$b)),
+            (setp_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 -> pred
+  def : Pat<(i1 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (setp_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Int64Regs:$a, imm:$b)),
+            (setp_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i1 (OpNode imm:$a, Int64Regs:$b)),
+            (setp_64ir imm:$a, Int64Regs:$b, Mode)>;
+
+  // i16 -> i32
+  def : Pat<(i32 (OpNode Int16Regs:$a, Int16Regs:$b)),
+            (set_16rr Int16Regs:$a, Int16Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int16Regs:$a, imm:$b)),
+            (set_16ri Int16Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int16Regs:$b)),
+            (set_16ir imm:$a, Int16Regs:$b, Mode)>;
+  // i32 -> i32
+  def : Pat<(i32 (OpNode Int32Regs:$a, Int32Regs:$b)),
+            (set_32rr Int32Regs:$a, Int32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int32Regs:$a, imm:$b)),
+            (set_32ri Int32Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int32Regs:$b)),
+            (set_32ir imm:$a, Int32Regs:$b, Mode)>;
+  // i64 -> i32
+  def : Pat<(i32 (OpNode Int64Regs:$a, Int64Regs:$b)),
+            (set_64rr Int64Regs:$a, Int64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Int64Regs:$a, imm:$b)),
+            (set_64ri Int64Regs:$a, imm:$b, Mode)>;
+  def : Pat<(i32 (OpNode imm:$a, Int64Regs:$b)),
+            (set_64ir imm:$a, Int64Regs:$b, Mode)>;
 }
 
-// Generate string block like
-// {
-//   .reg .pred p;
-//   .reg .s16 %temp1;
-//   .reg .s16 %temp2;
-//   cvt.s16.s8 %temp1, %a;
-//   cvt s16.s8 %temp1, %b;
-//   setp.gt.s16 p, %temp1, %temp2;
-//   selp.s16 %dst, -1, 0, p;
-// }
-// when OpcStr=setp.gt.s d=%dst a=%a b=%b type=s16 cvt=cvt.s16.s8
-class Set_Stri8<string OpcStr, string d, string a, string b, string type,
-  string cvt> {
-  string t1  = "{{\n\t.reg .pred p;\n\t";
-  string t2  = !strconcat(t1, ".reg .");
-  string t3  = !strconcat(t2, type);
-  string t4  = !strconcat(t3, " %temp1;\n\t");
-  string t5  = !strconcat(t4, ".reg .");
-  string t6  = !strconcat(t5, type);
-  string t7  = !strconcat(t6, " %temp2;\n\t");
-  string t8  = !strconcat(t7, cvt);
-  string t9  = !strconcat(t8, " \t%temp1, ");
-  string t10 = !strconcat(t9, a);
-  string t11 = !strconcat(t10, ";\n\t");
-  string t12 = !strconcat(t11, cvt);
-  string t13 = !strconcat(t12, " \t%temp2, ");
-  string t14 = !strconcat(t13, b);
-  string t15 = !strconcat(t14, ";\n\t");
-  string t16 = !strconcat(t15, OpcStr);
-  string t17 = !strconcat(t16, "16");
-  string t18 = !strconcat(t17, " \tp, %temp1, %temp2;\n\t");
-  string t19 = !strconcat(t18, "selp.s16 \t");
-  string t20 = !strconcat(t19, d);
-  string s   = !strconcat(t20, ", -1, 0, p;\n\t}}");
+multiclass ISET_FORMAT_SIGNED<PatFrag OpNode, PatLeaf Mode>
+  : ISET_FORMAT<OpNode, Mode,
+                SETP_s16rr, SETP_s16ri, SETP_s16ir,
+                SETP_s32rr, SETP_s32ri, SETP_s32ir,
+                SETP_s64rr, SETP_s64ri, SETP_s64ir,
+                SET_s16rr, SET_s16ri, SET_s16ir,
+                SET_s32rr, SET_s32ri, SET_s32ir,
+                SET_s64rr, SET_s64ri, SET_s64ir> {
+  // TableGen doesn't like empty multiclasses
+  def : PatLeaf<(i32 0)>;
 }
 
-multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode,
-  string TypeStr, string CVTStr> {
-  def i8rr_toi8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-                     Set_Stri8<OpcStr, "$dst", "$a", "$b", TypeStr, CVTStr>.s,
-               []>;
-  def i16rr_toi16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a,
-      Int16Regs:$b),
-                     Set_Str<OpcStr, "16", "16", "$dst", "$a", "$b">.s,
-               []>;
-  def i32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
-      Int32Regs:$b),
-                     Set_Str<OpcStr, "32", "32", "$dst", "$a", "$b">.s,
-               []>;
-  def i64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a,
-      Int64Regs:$b),
-                     Set_Str<OpcStr, "64", "64", "$dst", "$a", "$b">.s,
-               []>;
-
-  def i8rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-                     Handle_i8rr<OpcStr, TypeStr, CVTStr>.s,
-               [(set Int1Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
-  def i8ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
-                     Handle_i8ri<OpcStr, TypeStr, CVTStr>.s,
-               [(set Int1Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
-  def i8ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i8imm:$a, Int8Regs:$b),
-                     Handle_i8ir<OpcStr, TypeStr, CVTStr>.s,
-               [(set Int1Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>;
-  def i16rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b),
-                 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
-  def i16ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
-                 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
-  def i16ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i16imm:$a, Int16Regs:$b),
-                 !strconcat(OpcStr, "16 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>;
-  def i32rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b),
-                 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
-  def i32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
-                 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
-  def i32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i32imm:$a, Int32Regs:$b),
-                 !strconcat(OpcStr, "32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>;
-  def i64rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b),
-                 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
-  def i64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
-                 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
-  def i64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i64imm:$a, Int64Regs:$b),
-                 !strconcat(OpcStr, "64 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>;
-
-  def i8rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b),
-                     Handle_i8rr<OpcStr_u32, TypeStr, CVTStr>.s,
-               [(set Int32Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>;
-  def i8ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, i8imm:$b),
-                     Handle_i8ri<OpcStr_u32, TypeStr, CVTStr>.s,
-               [(set Int32Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>;
-  def i8ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i8imm:$a, Int8Regs:$b),
-                     Handle_i8ir<OpcStr_u32, TypeStr, CVTStr>.s,
-               [(set Int32Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>;
-  def i16rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a,
-      Int16Regs:$b),
-                 !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>;
-  def i16ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, i16imm:$b),
-                 !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Int16Regs:$a, imm:$b))]>;
-  def i16ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i16imm:$a, Int16Regs:$b),
-                 !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode imm:$a, Int16Regs:$b))]>;
-  def i32rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a,
-      Int32Regs:$b),
-                 !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, Int32Regs:$b))]>;
-  def i32ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b),
-                 !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>;
-  def i32ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i32imm:$a, Int32Regs:$b),
-                 !strconcat(OpcStr_u32, "32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode imm:$a, Int32Regs:$b))]>;
-  def i64rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a,
-      Int64Regs:$b),
-                 !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Int64Regs:$a, Int64Regs:$b))]>;
-  def i64ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int64Regs:$a, i64imm:$b),
-                 !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>;
-  def i64ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i64imm:$a, Int64Regs:$b),
-                 !strconcat(OpcStr_u32, "64 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>;
+multiclass ISET_FORMAT_UNSIGNED<PatFrag OpNode, PatLeaf Mode>
+  : ISET_FORMAT<OpNode, Mode,
+                SETP_u16rr, SETP_u16ri, SETP_u16ir,
+                SETP_u32rr, SETP_u32ri, SETP_u32ir,
+                SETP_u64rr, SETP_u64ri, SETP_u64ir,
+                SET_u16rr, SET_u16ri, SET_u16ir,
+                SET_u32rr, SET_u32ri, SET_u32ir,
+                SET_u64rr, SET_u64ri, SET_u64ir> {
+  // TableGen doesn't like empty multiclasses
+  def : PatLeaf<(i32 0)>;
 }
 
-multiclass FSET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode> {
-  def f32rr_toi32_ftz: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a,
-      Float32Regs:$b),
-                     Set_Str<OpcStr, "ftz.f32", "32", "$dst", "$a", "$b">.s,
-               []>, Requires<[doF32FTZ]>;
-  def f32rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float32Regs:$a,
-      Float32Regs:$b),
-                     Set_Str<OpcStr, "f32", "32", "$dst", "$a", "$b">.s,
-               []>;
-  def f64rr_toi64: NVPTXInst<(outs Int64Regs:$dst), (ins Float64Regs:$a,
-      Float64Regs:$b),
-                     Set_Str<OpcStr, "f64", "64", "$dst", "$a", "$b">.s,
-               []>;
-  def f64rr_toi32: NVPTXInst<(outs Int32Regs:$dst), (ins Float64Regs:$a,
-      Float64Regs:$b),
-                     Set_Str<OpcStr, "f64", "32", "$dst", "$a", "$b">.s,
-               []>;
-
-  def f32rr_p_ftz: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a
-      , Float32Regs:$b),
-                 !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>
-  , Requires<[doF32FTZ]>;
-  def f32rr_p: NVPTXInst<(outs Int1Regs:$dst),
-    (ins Float32Regs:$a, Float32Regs:$b),
-                 !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
-  def f32ri_p_ftz: NVPTXInst<(outs Int1Regs:$dst),
-    (ins Float32Regs:$a, f32imm:$b),
-                 !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
-  Requires<[doF32FTZ]>;
-  def f32ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float32Regs:$a, f32imm:$b),
-                 !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
-  def f32ir_p_ftz: NVPTXInst<(outs Int1Regs:$dst),
-    (ins f32imm:$a, Float32Regs:$b),
-                 !strconcat(OpcStr, "ftz.f32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>,
-  Requires<[doF32FTZ]>;
-  def f32ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f32imm:$a, Float32Regs:$b),
-                 !strconcat(OpcStr, "f32 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
-  def f64rr_p: NVPTXInst<(outs Int1Regs:$dst),
-    (ins Float64Regs:$a, Float64Regs:$b),
-                 !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
-  def f64ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Float64Regs:$a, f64imm:$b),
-                 !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
-  def f64ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins f64imm:$a, Float64Regs:$b),
-                 !strconcat(OpcStr, "f64 \t$dst, $a, $b;"),
-               [(set Int1Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>;
-
-  def f32rr_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
-    (ins Float32Regs:$a, Float32Regs:$b),
-                 !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
-  def f32rr_u32: NVPTXInst<(outs Int32Regs:$dst),
-    (ins Float32Regs:$a, Float32Regs:$b),
-                 !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, Float32Regs:$b))]>;
-  def f32ri_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
-    (ins Float32Regs:$a, f32imm:$b),
-                 !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
-  def f32ri_u32: NVPTXInst<(outs Int32Regs:$dst),
-    (ins Float32Regs:$a, f32imm:$b),
-                 !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;
-  def f32ir_u32_ftz: NVPTXInst<(outs Int32Regs:$dst),
-    (ins f32imm:$a, Float32Regs:$b),
-                 !strconcat(OpcStr_u32, "ftz.f32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
-  def f32ir_u32: NVPTXInst<(outs Int32Regs:$dst),
-    (ins f32imm:$a, Float32Regs:$b),
-                 !strconcat(OpcStr_u32, "f32 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode fpimm:$a, Float32Regs:$b))]>;
-  def f64rr_u32: NVPTXInst<(outs Int32Regs:$dst),
-    (ins Float64Regs:$a, Float64Regs:$b),
-                 !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>;
-  def f64ri_u32: NVPTXInst<(outs Int32Regs:$dst),
-    (ins Float64Regs:$a, f64imm:$b),
-                 !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode Float64Regs:$a, fpimm:$b))]>;
-  def f64ir_u32: NVPTXInst<(outs Int32Regs:$dst),
-    (ins f64imm:$a, Float64Regs:$b),
-                 !strconcat(OpcStr_u32, "f64 \t$dst, $a, $b;"),
-               [(set Int32Regs:$dst, (OpNode fpimm:$a, Float64Regs:$b))]>;
+defm : ISET_FORMAT_SIGNED<setgt, CmpGT>;
+defm : ISET_FORMAT_UNSIGNED<setugt, CmpGT>;
+defm : ISET_FORMAT_SIGNED<setlt, CmpLT>;
+defm : ISET_FORMAT_UNSIGNED<setult, CmpLT>;
+defm : ISET_FORMAT_SIGNED<setge, CmpGE>;
+defm : ISET_FORMAT_UNSIGNED<setuge, CmpGE>;
+defm : ISET_FORMAT_SIGNED<setle, CmpLE>;
+defm : ISET_FORMAT_UNSIGNED<setule, CmpLE>;
+defm : ISET_FORMAT_SIGNED<seteq, CmpEQ>;
+defm : ISET_FORMAT_UNSIGNED<setueq, CmpEQ>;
+defm : ISET_FORMAT_SIGNED<setne, CmpNE>;
+defm : ISET_FORMAT_UNSIGNED<setune, CmpNE>;
+
+// i1 compares
+def : Pat<(setne Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(setune Int1Regs:$a, Int1Regs:$b),
+          (XORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
+def : Pat<(seteq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(setueq Int1Regs:$a, Int1Regs:$b),
+          (NOT1 (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+// i1 compare -> i32
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
+          (SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
+
+
+
+multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
+  // f32 -> pred
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SETP_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SETP_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SETP_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> pred
+  def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
+
+  // f32 -> i32
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
+            (SET_f32rr Float32Regs:$a, Float32Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode Float32Regs:$a, fpimm:$b)),
+            (SET_f32ri Float32Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, ModeFTZ)>,
+        Requires<[doF32FTZ]>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float32Regs:$b)),
+            (SET_f32ir fpimm:$a, Float32Regs:$b, Mode)>;
+
+  // f64 -> i32
+  def : Pat<(i32 (OpNode Float64Regs:$a, Float64Regs:$b)),
+            (SET_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
+  def : Pat<(i32 (OpNode Float64Regs:$a, fpimm:$b)),
+            (SET_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
+  def : Pat<(i32 (OpNode fpimm:$a, Float64Regs:$b)),
+            (SET_f64ir fpimm:$a, Float64Regs:$b, Mode)>;
 }
 
-defm ISetSGT
-: ISET_FORMAT<"setp.gt.s", "set.gt.u32.s", setgt, "s16", "cvt.s16.s8">;
-defm ISetUGT
-: ISET_FORMAT<"setp.gt.u", "set.gt.u32.u", setugt, "u16", "cvt.u16.u8">;
-defm ISetSLT
-: ISET_FORMAT<"setp.lt.s", "set.lt.u32.s", setlt, "s16", "cvt.s16.s8">;
-defm ISetULT
-: ISET_FORMAT<"setp.lt.u", "set.lt.u32.u", setult, "u16", "cvt.u16.u8">;
-defm ISetSGE
-: ISET_FORMAT<"setp.ge.s", "set.ge.u32.s", setge, "s16", "cvt.s16.s8">;
-defm ISetUGE
-: ISET_FORMAT<"setp.ge.u", "set.ge.u32.u", setuge, "u16", "cvt.u16.u8">;
-defm ISetSLE
-: ISET_FORMAT<"setp.le.s", "set.le.u32.s", setle, "s16", "cvt.s16.s8">;
-defm ISetULE
-: ISET_FORMAT<"setp.le.u", "set.le.u32.u", setule, "u16", "cvt.u16.u8">;
-defm ISetSEQ
-: ISET_FORMAT<"setp.eq.s", "set.eq.u32.s", seteq, "s16", "cvt.s16.s8">;
-defm ISetUEQ
-: ISET_FORMAT<"setp.eq.u", "set.eq.u32.u", setueq, "u16", "cvt.u16.u8">;
-defm ISetSNE
-: ISET_FORMAT<"setp.ne.s", "set.ne.u32.s", setne, "s16", "cvt.s16.s8">;
-defm ISetUNE
-: ISET_FORMAT<"setp.ne.u", "set.ne.u32.u", setune, "u16", "cvt.u16.u8">;
-
-def ISetSNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
-  (ins Int1Regs:$a, Int1Regs:$b),
-                      "xor.pred \t$dst, $a, $b;",
-            [(set Int1Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>;
-def ISetUNEi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
-  (ins Int1Regs:$a, Int1Regs:$b),
-                      "xor.pred \t$dst, $a, $b;",
-            [(set Int1Regs:$dst, (setune Int1Regs:$a, Int1Regs:$b))]>;
-def ISetSEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
-  (ins Int1Regs:$a, Int1Regs:$b),
-            !strconcat("{{\n\t",
-            !strconcat(".reg .pred temp;\n\t",
-            !strconcat("xor.pred \ttemp, $a, $b;\n\t",
-            !strconcat("not.pred \t$dst, temp;\n\t}}","")))),
-            [(set Int1Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>;
-def ISetUEQi1rr_p : NVPTXInst<(outs Int1Regs:$dst),
-  (ins Int1Regs:$a, Int1Regs:$b),
-            !strconcat("{{\n\t",
-            !strconcat(".reg .pred temp;\n\t",
-            !strconcat("xor.pred \ttemp, $a, $b;\n\t",
-            !strconcat("not.pred \t$dst, temp;\n\t}}","")))),
-            [(set Int1Regs:$dst, (setueq Int1Regs:$a, Int1Regs:$b))]>;
-
-// Compare 2 i1's and produce a u32
-def ISETSNEi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst),
-  (ins Int1Regs:$a, Int1Regs:$b),
-                  !strconcat("{{\n\t",
-                  !strconcat(".reg .pred temp;\n\t",
-                  !strconcat("xor.pred \ttemp, $a, $b;\n\t",
-                  !strconcat("selp.u32 \t$dst, -1, 0, temp;", "\n\t}}")))),
-                  [(set Int32Regs:$dst, (setne Int1Regs:$a, Int1Regs:$b))]>;
-def ISETSEQi1rr_u32 : NVPTXInst<(outs Int32Regs:$dst),
-  (ins Int1Regs:$a, Int1Regs:$b),
-                  !strconcat("{{\n\t",
-                  !strconcat(".reg .pred temp;\n\t",
-                  !strconcat("xor.pred \ttemp, $a, $b;\n\t",
-                  !strconcat("selp.u32 \t$dst, 0, -1, temp;", "\n\t}}")))),
-                  [(set Int32Regs:$dst, (seteq Int1Regs:$a, Int1Regs:$b))]>;
-
-defm FSetGT : FSET_FORMAT<"setp.gt.", "set.gt.u32.", setogt>;
-defm FSetLT : FSET_FORMAT<"setp.lt.", "set.lt.u32.", setolt>;
-defm FSetGE : FSET_FORMAT<"setp.ge.", "set.ge.u32.", setoge>;
-defm FSetLE : FSET_FORMAT<"setp.le.", "set.le.u32.", setole>;
-defm FSetEQ : FSET_FORMAT<"setp.eq.", "set.eq.u32.", setoeq>;
-defm FSetNE : FSET_FORMAT<"setp.ne.", "set.ne.u32.", setone>;
-
-defm FSetUGT : FSET_FORMAT<"setp.gtu.", "set.gtu.u32.", setugt>;
-defm FSetULT : FSET_FORMAT<"setp.ltu.", "set.ltu.u32.",setult>;
-defm FSetUGE : FSET_FORMAT<"setp.geu.", "set.geu.u32.",setuge>;
-defm FSetULE : FSET_FORMAT<"setp.leu.", "set.leu.u32.",setule>;
-defm FSetUEQ : FSET_FORMAT<"setp.equ.", "set.equ.u32.",setueq>;
-defm FSetUNE : FSET_FORMAT<"setp.neu.", "set.neu.u32.",setune>;
-
-defm FSetNUM : FSET_FORMAT<"setp.num.", "set.num.u32.",seto>;
-defm FSetNAN : FSET_FORMAT<"setp.nan.", "set.nan.u32.",setuo>;
-
-def SELECTi1rr : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)),
-                     (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a),
-                             (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>;
-def SELECTi8rr : NVPTXInst<(outs Int8Regs:$dst),
-  (ins Int8Regs:$a, Int8Regs:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, Int8Regs:$b))]>;
-def SELECTi8ri : NVPTXInst<(outs Int8Regs:$dst),
-  (ins Int8Regs:$a, i8imm:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, imm:$b))]>;
-def SELECTi8ir : NVPTXInst<(outs Int8Regs:$dst),
-  (ins i8imm:$a, Int8Regs:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, Int8Regs:$b))]>;
-def SELECTi8ii : NVPTXInst<(outs Int8Regs:$dst),
-  (ins i8imm:$a, i8imm:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
-
-def SELECTi16rr : NVPTXInst<(outs Int16Regs:$dst),
-  (ins Int16Regs:$a, Int16Regs:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, Int16Regs:$b))]>;
-def SELECTi16ri : NVPTXInst<(outs Int16Regs:$dst),
-  (ins Int16Regs:$a, i16imm:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int16Regs:$dst, (select Int1Regs:$p, Int16Regs:$a, imm:$b))]>;
-def SELECTi16ir : NVPTXInst<(outs Int16Regs:$dst),
-  (ins i16imm:$a, Int16Regs:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, Int16Regs:$b))]>;
-def SELECTi16ii : NVPTXInst<(outs Int16Regs:$dst),
-  (ins i16imm:$a, i16imm:$b, Int1Regs:$p),
-                      "selp.b16 \t$dst, $a, $b, $p;",
-      [(set Int16Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
-
-def SELECTi32rr : NVPTXInst<(outs Int32Regs:$dst),
-  (ins Int32Regs:$a, Int32Regs:$b, Int1Regs:$p),
-                      "selp.b32 \t$dst, $a, $b, $p;",
-      [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, Int32Regs:$b))]>;
-def SELECTi32ri : NVPTXInst<(outs Int32Regs:$dst),
-  (ins Int32Regs:$a, i32imm:$b, Int1Regs:$p),
-                      "selp.b32 \t$dst, $a, $b, $p;",
-      [(set Int32Regs:$dst, (select Int1Regs:$p, Int32Regs:$a, imm:$b))]>;
-def SELECTi32ir : NVPTXInst<(outs Int32Regs:$dst),
-  (ins i32imm:$a, Int32Regs:$b, Int1Regs:$p),
-                      "selp.b32 \t$dst, $a, $b, $p;",
-      [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, Int32Regs:$b))]>;
-def SELECTi32ii : NVPTXInst<(outs Int32Regs:$dst),
-  (ins i32imm:$a, i32imm:$b, Int1Regs:$p),
-                      "selp.b32 \t$dst, $a, $b, $p;",
-      [(set Int32Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
-
-def SELECTi64rr : NVPTXInst<(outs Int64Regs:$dst),
-  (ins Int64Regs:$a, Int64Regs:$b, Int1Regs:$p),
-                      "selp.b64 \t$dst, $a, $b, $p;",
-      [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, Int64Regs:$b))]>;
-def SELECTi64ri : NVPTXInst<(outs Int64Regs:$dst),
-  (ins Int64Regs:$a, i64imm:$b, Int1Regs:$p),
-                      "selp.b64 \t$dst, $a, $b, $p;",
-      [(set Int64Regs:$dst, (select Int1Regs:$p, Int64Regs:$a, imm:$b))]>;
-def SELECTi64ir : NVPTXInst<(outs Int64Regs:$dst),
-  (ins i64imm:$a, Int64Regs:$b, Int1Regs:$p),
-                      "selp.b64 \t$dst, $a, $b, $p;",
-      [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, Int64Regs:$b))]>;
-def SELECTi64ii : NVPTXInst<(outs Int64Regs:$dst),
-  (ins i64imm:$a, i64imm:$b, Int1Regs:$p),
-                      "selp.b64 \t$dst, $a, $b, $p;",
-      [(set Int64Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>;
-
-def SELECTf32rr : NVPTXInst<(outs Float32Regs:$dst),
-  (ins Float32Regs:$a, Float32Regs:$b, Int1Regs:$p),
-                      "selp.f32 \t$dst, $a, $b, $p;",
-      [(set Float32Regs:$dst,
-        (select Int1Regs:$p, Float32Regs:$a, Float32Regs:$b))]>;
-def SELECTf32ri : NVPTXInst<(outs Float32Regs:$dst),
-  (ins Float32Regs:$a, f32imm:$b, Int1Regs:$p),
-                      "selp.f32 \t$dst, $a, $b, $p;",
-      [(set Float32Regs:$dst, (select Int1Regs:$p, Float32Regs:$a, fpimm:$b))]>;
-def SELECTf32ir : NVPTXInst<(outs Float32Regs:$dst),
-  (ins f32imm:$a, Float32Regs:$b, Int1Regs:$p),
-                      "selp.f32 \t$dst, $a, $b, $p;",
-      [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float32Regs:$b))]>;
-def SELECTf32ii : NVPTXInst<(outs Float32Regs:$dst),
-  (ins f32imm:$a, f32imm:$b, Int1Regs:$p),
-                      "selp.f32 \t$dst, $a, $b, $p;",
-      [(set Float32Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>;
-
-def SELECTf64rr : NVPTXInst<(outs Float64Regs:$dst),
-  (ins Float64Regs:$a, Float64Regs:$b, Int1Regs:$p),
-                      "selp.f64 \t$dst, $a, $b, $p;",
-      [(set Float64Regs:$dst,
-        (select Int1Regs:$p, Float64Regs:$a, Float64Regs:$b))]>;
-def SELECTf64ri : NVPTXInst<(outs Float64Regs:$dst),
-  (ins Float64Regs:$a, f64imm:$b, Int1Regs:$p),
-                      "selp.f64 \t$dst, $a, $b, $p;",
-      [(set Float64Regs:$dst, (select Int1Regs:$p, Float64Regs:$a, fpimm:$b))]>;
-def SELECTf64ir : NVPTXInst<(outs Float64Regs:$dst),
-  (ins f64imm:$a, Float64Regs:$b, Int1Regs:$p),
-                      "selp.f64 \t$dst, $a, $b, $p;",
-      [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, Float64Regs:$b))]>;
-def SELECTf64ii : NVPTXInst<(outs Float64Regs:$dst),
-  (ins f64imm:$a, f64imm:$b, Int1Regs:$p),
-                      "selp.f64 \t $dst, $a, $b, $p;",
-      [(set Float64Regs:$dst, (select Int1Regs:$p, fpimm:$a, fpimm:$b))]>;
+defm FSetGT : FSET_FORMAT<setogt, CmpGT, CmpGT_FTZ>;
+defm FSetLT : FSET_FORMAT<setolt, CmpLT, CmpLT_FTZ>;
+defm FSetGE : FSET_FORMAT<setoge, CmpGE, CmpGE_FTZ>;
+defm FSetLE : FSET_FORMAT<setole, CmpLE, CmpLE_FTZ>;
+defm FSetEQ : FSET_FORMAT<setoeq, CmpEQ, CmpEQ_FTZ>;
+defm FSetNE : FSET_FORMAT<setone, CmpNE, CmpNE_FTZ>;
+
+defm FSetUGT : FSET_FORMAT<setugt, CmpGTU, CmpGTU_FTZ>;
+defm FSetULT : FSET_FORMAT<setult, CmpLTU, CmpLTU_FTZ>;
+defm FSetUGE : FSET_FORMAT<setuge, CmpGEU, CmpGEU_FTZ>;
+defm FSetULE : FSET_FORMAT<setule, CmpLEU, CmpLEU_FTZ>;
+defm FSetUEQ : FSET_FORMAT<setueq, CmpEQU, CmpEQU_FTZ>;
+defm FSetUNE : FSET_FORMAT<setune, CmpNEU, CmpNEU_FTZ>;
+
+defm FSetNUM : FSET_FORMAT<seto, CmpNUM, CmpNUM_FTZ>;
+defm FSetNAN : FSET_FORMAT<setuo, CmpNAN, CmpNAN_FTZ>;
 
 //def ld_param         : SDNode<"NVPTXISD::LOAD_PARAM", SDTLoad,
 //                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -1751,17 +1577,22 @@ def SDTDeclareParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
 def SDTDeclareScalarParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>,
   SDTCisInt<1>, SDTCisInt<2>]>;
 def SDTLoadParamProfile : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDTLoadParamV2Profile : SDTypeProfile<2, 2, [SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisInt<3>]>;
+def SDTLoadParamV4Profile : SDTypeProfile<4, 2, [SDTCisInt<4>, SDTCisInt<5>]>;
 def SDTPrintCallProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 def SDTPrintCallUniProfile : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 def SDTStoreParamProfile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV2Profile : SDTypeProfile<0, 4, [SDTCisInt<0>, SDTCisInt<1>]>;
+def SDTStoreParamV4Profile : SDTypeProfile<0, 6, [SDTCisInt<0>, SDTCisInt<1>]>;
 def SDTStoreParam32Profile : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>]>;
 def SDTCallArgProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
 def SDTCallArgMarkProfile : SDTypeProfile<0, 0, []>;
 def SDTCallVoidProfile : SDTypeProfile<0, 1, []>;
 def SDTCallValProfile : SDTypeProfile<1, 0, []>;
 def SDTMoveParamProfile : SDTypeProfile<1, 1, []>;
-def SDTMoveRetvalProfile : SDTypeProfile<0, 1, []>;
 def SDTStoreRetvalProfile : SDTypeProfile<0, 2, [SDTCisInt<0>]>;
+def SDTStoreRetvalV2Profile : SDTypeProfile<0, 3, [SDTCisInt<0>]>;
+def SDTStoreRetvalV4Profile : SDTypeProfile<0, 5, [SDTCisInt<0>]>;
 def SDTPseudoUseParamProfile : SDTypeProfile<0, 1, []>;
 
 def DeclareParam : SDNode<"NVPTXISD::DeclareParam", SDTDeclareParamProfile,
@@ -1776,18 +1607,24 @@ def DeclareRet   : SDNode<"NVPTXISD::DeclareRet", SDTDeclareScalarParamProfile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def LoadParam    : SDNode<"NVPTXISD::LoadParam", SDTLoadParamProfile,
                          [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV2  : SDNode<"NVPTXISD::LoadParamV2", SDTLoadParamV2Profile,
+                         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
+def LoadParamV4  : SDNode<"NVPTXISD::LoadParamV4", SDTLoadParamV4Profile,
+                         [SDNPHasChain, SDNPMayLoad, SDNPOutGlue, SDNPInGlue]>;
 def PrintCall    : SDNode<"NVPTXISD::PrintCall", SDTPrintCallProfile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def PrintCallUni : SDNode<"NVPTXISD::PrintCallUni", SDTPrintCallUniProfile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def StoreParam   : SDNode<"NVPTXISD::StoreParam", SDTStoreParamProfile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV2 : SDNode<"NVPTXISD::StoreParamV2", SDTStoreParamV2Profile,
+                         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def StoreParamV4 : SDNode<"NVPTXISD::StoreParamV4", SDTStoreParamV4Profile,
+                         [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def StoreParamU32 : SDNode<"NVPTXISD::StoreParamU32", SDTStoreParam32Profile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def StoreParamS32 : SDNode<"NVPTXISD::StoreParamS32", SDTStoreParam32Profile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
-def MoveToParam  : SDNode<"NVPTXISD::MoveToParam", SDTStoreParamProfile,
-                       [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def CallArgBegin : SDNode<"NVPTXISD::CallArgBegin", SDTCallArgMarkProfile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def CallArg      : SDNode<"NVPTXISD::CallArg", SDTCallArgProfile,
@@ -1804,12 +1641,12 @@ def CallVal      : SDNode<"NVPTXISD::CallVal", SDTCallValProfile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
 def MoveParam    : SDNode<"NVPTXISD::MoveParam", SDTMoveParamProfile,
                          []>;
-def MoveRetval   : SDNode<"NVPTXISD::MoveRetval", SDTMoveRetvalProfile,
-                         [SDNPHasChain, SDNPSideEffect]>;
 def StoreRetval  : SDNode<"NVPTXISD::StoreRetval", SDTStoreRetvalProfile,
                          [SDNPHasChain, SDNPSideEffect]>;
-def MoveToRetval : SDNode<"NVPTXISD::MoveToRetval", SDTStoreRetvalProfile,
-                         [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV2  : SDNode<"NVPTXISD::StoreRetvalV2", SDTStoreRetvalV2Profile,
+                           [SDNPHasChain, SDNPSideEffect]>;
+def StoreRetvalV4  : SDNode<"NVPTXISD::StoreRetvalV4", SDTStoreRetvalV4Profile,
+                           [SDNPHasChain, SDNPSideEffect]>;
 def PseudoUseParam : SDNode<"NVPTXISD::PseudoUseParam",
   SDTPseudoUseParamProfile,
                        [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
@@ -1820,7 +1657,7 @@ class LoadParamMemInst<NVPTXRegClass regclass, string opstr> :
       NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
                 !strconcat(!strconcat("ld.param", opstr),
                 "\t$dst, [retval0+$b];"),
-                [(set regclass:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
+                []>;
 
 class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
       NVPTXInst<(outs regclass:$dst), (ins i32imm:$b),
@@ -1828,35 +1665,57 @@ class LoadParamRegInst<NVPTXRegClass regclass, string opstr> :
                 "\t$dst, retval$b;"),
                 [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>;
 
+class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b),
+                !strconcat(!strconcat("ld.param.v2", opstr),
+                "\t{{$dst, $dst2}}, [retval0+$b];"), []>;
+
+class LoadParamV4MemInst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs regclass:$dst, regclass:$dst2, regclass:$dst3,
+                      regclass:$dst4),
+                (ins i32imm:$b),
+                !strconcat(!strconcat("ld.param.v4", opstr),
+                "\t{{$dst, $dst2, $dst3, $dst4}}, [retval0+$b];"), []>;
+
 class StoreParamInst<NVPTXRegClass regclass, string opstr> :
       NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
                 !strconcat(!strconcat("st.param", opstr),
                 "\t[param$a+$b], $val;"),
-                [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>;
+                []>;
 
-class MoveToParamInst<NVPTXRegClass regclass, string opstr> :
-      NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b),
-                !strconcat(!strconcat("mov", opstr),
-                "\tparam$a, $val;"),
-                [(MoveToParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>;
+class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, regclass:$val2,
+                             i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param.v2", opstr),
+                "\t[param$a+$b], {{$val, $val2}};"),
+                []>;
+
+class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, regclass:$val1, regclass:$val2,
+                             regclass:$val3, i32imm:$a, i32imm:$b),
+                !strconcat(!strconcat("st.param.v4", opstr),
+                "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"),
+                []>;
 
 class StoreRetvalInst<NVPTXRegClass regclass, string opstr> :
       NVPTXInst<(outs), (ins regclass:$val, i32imm:$a),
                 !strconcat(!strconcat("st.param", opstr),
                 "\t[func_retval0+$a], $val;"),
-                [(StoreRetval (i32 imm:$a), regclass:$val)]>;
+                []>;
 
-class MoveToRetvalInst<NVPTXRegClass regclass, string opstr> :
-      NVPTXInst<(outs), (ins i32imm:$num, regclass:$val),
-                !strconcat(!strconcat("mov", opstr),
-                "\tfunc_retval$num, $val;"),
-                [(MoveToRetval (i32 imm:$num), regclass:$val)]>;
+class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a),
+                !strconcat(!strconcat("st.param.v2", opstr),
+                "\t[func_retval0+$a], {{$val, $val2}};"),
+                []>;
 
-class MoveRetvalInst<NVPTXRegClass regclass, string opstr> :
-      NVPTXInst<(outs), (ins regclass:$val),
-                !strconcat(!strconcat("mov", opstr),
-                "\tfunc_retval0, $val;"),
-                [(MoveRetval regclass:$val)]>;
+class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> :
+      NVPTXInst<(outs),
+                (ins regclass:$val, regclass:$val2, regclass:$val3,
+                     regclass:$val4, i32imm:$a),
+                !strconcat(!strconcat("st.param.v4", opstr),
+                "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"),
+                []>;
 
 def PrintCallRetInst1 : NVPTXInst<(outs), (ins),
 "call (retval0), ",
@@ -1919,126 +1778,81 @@ def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ",
 def LoadParamMemI64    : LoadParamMemInst<Int64Regs, ".b64">;
 def LoadParamMemI32    : LoadParamMemInst<Int32Regs, ".b32">;
 def LoadParamMemI16    : LoadParamMemInst<Int16Regs, ".b16">;
-def LoadParamMemI8     : LoadParamMemInst<Int8Regs, ".b8">;
-
-//def LoadParamMemI16    : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b),
-//                !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t",
-//                "cvt.u16.u32\t$dst, temp_param_reg;"),
-//                [(set Int16Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
-//def LoadParamMemI8     : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b),
-//                !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t",
-//                "cvt.u16.u32\t$dst, temp_param_reg;"),
-//                [(set Int8Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>;
-
+def LoadParamMemI8     : LoadParamMemInst<Int16Regs, ".b8">;
+def LoadParamMemV2I64  : LoadParamV2MemInst<Int64Regs, ".b64">;
+def LoadParamMemV2I32  : LoadParamV2MemInst<Int32Regs, ".b32">;
+def LoadParamMemV2I16  : LoadParamV2MemInst<Int16Regs, ".b16">;
+def LoadParamMemV2I8   : LoadParamV2MemInst<Int16Regs, ".b8">;
+def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
+def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
+def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
 def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
 def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
-
-def LoadParamRegI64    : LoadParamRegInst<Int64Regs, ".b64">;
-def LoadParamRegI32    : LoadParamRegInst<Int32Regs, ".b32">;
-def LoadParamRegI16    : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b),
-                         "cvt.u16.u32\t$dst, retval$b;",
-                         [(set Int16Regs:$dst,
-                           (LoadParam (i32 0), (i32 imm:$b)))]>;
-def LoadParamRegI8     : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b),
-                         "cvt.u16.u32\t$dst, retval$b;",
-                         [(set Int8Regs:$dst,
-                           (LoadParam (i32 0), (i32 imm:$b)))]>;
-
-def LoadParamRegF32    : LoadParamRegInst<Float32Regs, ".f32">;
-def LoadParamRegF64    : LoadParamRegInst<Float64Regs, ".f64">;
+def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
+def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
 
 def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
 def StoreParamI32    : StoreParamInst<Int32Regs, ".b32">;
 
-def StoreParamI16    : NVPTXInst<(outs),
-  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
-                       "st.param.b16\t[param$a+$b], $val;",
-           [(StoreParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
-
-def StoreParamI8     : NVPTXInst<(outs),
-  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
-                       "st.param.b8\t[param$a+$b], $val;",
-                       [(StoreParam
-                         (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
-
-def StoreParamS32I16 : NVPTXInst<(outs),
-  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
-                 !strconcat("cvt.s32.s16\ttemp_param_reg, $val;\n\t",
-                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
-                 [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
-def StoreParamU32I16 : NVPTXInst<(outs),
-  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
-                 !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
-                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
-                 [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
-
-def StoreParamU32I8   : NVPTXInst<(outs),
-  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
-                 !strconcat("cvt.u32.u8\ttemp_param_reg, $val;\n\t",
-                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
-                 [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
-def StoreParamS32I8   : NVPTXInst<(outs),
-  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
-                 !strconcat("cvt.s32.s8\ttemp_param_reg, $val;\n\t",
-                            "st.param.b32\t[param$a+$b], temp_param_reg;"),
-                 [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
+def StoreParamI16    : StoreParamInst<Int16Regs, ".b16">;
+def StoreParamI8     : StoreParamInst<Int16Regs, ".b8">;
+def StoreParamV2I64  : StoreParamV2Inst<Int64Regs, ".b64">;
+def StoreParamV2I32  : StoreParamV2Inst<Int32Regs, ".b32">;
+def StoreParamV2I16  : StoreParamV2Inst<Int16Regs, ".b16">;
+def StoreParamV2I8   : StoreParamV2Inst<Int16Regs, ".b8">;
+
+// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
+//def StoreParamV4I32    : StoreParamV4Inst<Int32Regs, ".b32">;
+def StoreParamV4I32    : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2,
+                                               Int32Regs:$val3, Int32Regs:$val4,
+                                                i32imm:$a, i32imm:$b),
+                   "st.param.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                         []>;
+
+def StoreParamV4I16    : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
+                                               Int16Regs:$val3, Int16Regs:$val4,
+                                                i32imm:$a, i32imm:$b),
+                "st.param.v4.b16\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                         []>;
+
+def StoreParamV4I8     : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2,
+                                                Int16Regs:$val3, Int16Regs:$val4,
+                                                i32imm:$a, i32imm:$b),
+                 "st.param.v4.b8\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                         []>;
 
 def StoreParamF32    : StoreParamInst<Float32Regs, ".f32">;
 def StoreParamF64    : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
+def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
+// FIXME: StoreParamV4Inst crashes llvm-tblgen :(
+//def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
+def StoreParamV4F32    : NVPTXInst<(outs),
+                                   (ins Float32Regs:$val, Float32Regs:$val2,
+                                        Float32Regs:$val3, Float32Regs:$val4,
+                                        i32imm:$a, i32imm:$b),
+                "st.param.v4.f32\t[param$a+$b], {{$val, $val2, $val3, $val4}};",
+                        []>;
 
-def MoveToParamI64   : MoveToParamInst<Int64Regs, ".b64">;
-def MoveToParamI32   : MoveToParamInst<Int32Regs, ".b32">;
-def MoveToParamF64   : MoveToParamInst<Float64Regs, ".f64">;
-def MoveToParamF32   : MoveToParamInst<Float32Regs, ".f32">;
-def MoveToParamI16   : NVPTXInst<(outs),
-  (ins Int16Regs:$val, i32imm:$a, i32imm:$b),
-                   !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
-                              "mov.b32\tparam$a, temp_param_reg;"),
-                   [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>;
-def MoveToParamI8    : NVPTXInst<(outs),
-  (ins Int8Regs:$val, i32imm:$a, i32imm:$b),
-                   !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t",
-                              "mov.b32\tparam$a, temp_param_reg;"),
-                   [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>;
 
 def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
 def StoreRetvalI32    : StoreRetvalInst<Int32Regs, ".b32">;
 def StoreRetvalI16    : StoreRetvalInst<Int16Regs, ".b16">;
-def StoreRetvalI8     : StoreRetvalInst<Int8Regs, ".b8">;
-
-//def StoreRetvalI16    : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a),
-//     !strconcat("\{\n\t",
-//     !strconcat(".reg .b32 temp_retval_reg;\n\t",
-//     !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t",
-//                "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))),
-//     [(StoreRetval (i32 imm:$a), Int16Regs:$val)]>;
-//def StoreRetvalI8     : NVPTXInst<(outs), (ins Int8Regs:$val, i32imm:$a),
-//     !strconcat("\{\n\t",
-//     !strconcat(".reg .b32 temp_retval_reg;\n\t",
-//     !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t",
-//                "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))),
-//     [(StoreRetval (i32 imm:$a), Int8Regs:$val)]>;
+def StoreRetvalI8     : StoreRetvalInst<Int16Regs, ".b8">;
+def StoreRetvalV2I64  : StoreRetvalV2Inst<Int64Regs, ".b64">;
+def StoreRetvalV2I32  : StoreRetvalV2Inst<Int32Regs, ".b32">;
+def StoreRetvalV2I16  : StoreRetvalV2Inst<Int16Regs, ".b16">;
+def StoreRetvalV2I8   : StoreRetvalV2Inst<Int16Regs, ".b8">;
+def StoreRetvalV4I32  : StoreRetvalV4Inst<Int32Regs, ".b32">;
+def StoreRetvalV4I16  : StoreRetvalV4Inst<Int16Regs, ".b16">;
+def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
 
 def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
 def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
-
-def MoveRetvalI64    : MoveRetvalInst<Int64Regs, ".b64">;
-def MoveRetvalI32    : MoveRetvalInst<Int32Regs, ".b32">;
-def MoveRetvalI16    : MoveRetvalInst<Int16Regs, ".b16">;
-def MoveRetvalI8     : MoveRetvalInst<Int8Regs, ".b8">;
-def MoveRetvalF64    : MoveRetvalInst<Float64Regs, ".f64">;
-def MoveRetvalF32    : MoveRetvalInst<Float32Regs, ".f32">;
-
-def MoveToRetvalI64    : MoveToRetvalInst<Int64Regs, ".b64">;
-def MoveToRetvalI32    : MoveToRetvalInst<Int32Regs, ".b32">;
-def MoveToRetvalF64    : MoveToRetvalInst<Float64Regs, ".f64">;
-def MoveToRetvalF32    : MoveToRetvalInst<Float32Regs, ".f32">;
-def MoveToRetvalI16    : NVPTXInst<(outs), (ins i32imm:$num, Int16Regs:$val),
-                         "cvt.u32.u16\tfunc_retval$num, $val;",
-                         [(MoveToRetval (i32 imm:$num), Int16Regs:$val)]>;
-def MoveToRetvalI8     : NVPTXInst<(outs), (ins i32imm:$num, Int8Regs:$val),
-                         "cvt.u32.u16\tfunc_retval$num, $val;",
-                         [(MoveToRetval (i32 imm:$num), Int8Regs:$val)]>;
+def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
+def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
 
 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
 def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
@@ -2056,7 +1870,6 @@ class LastCallArgInst<NVPTXRegClass regclass> :
 def CallArgI64     : CallArgInst<Int64Regs>;
 def CallArgI32     : CallArgInst<Int32Regs>;
 def CallArgI16     : CallArgInst<Int16Regs>;
-def CallArgI8      : CallArgInst<Int8Regs>;
 
 def CallArgF64     : CallArgInst<Float64Regs>;
 def CallArgF32     : CallArgInst<Float32Regs>;
@@ -2064,7 +1877,6 @@ def CallArgF32     : CallArgInst<Float32Regs>;
 def LastCallArgI64 : LastCallArgInst<Int64Regs>;
 def LastCallArgI32 : LastCallArgInst<Int32Regs>;
 def LastCallArgI16 : LastCallArgInst<Int16Regs>;
-def LastCallArgI8  : LastCallArgInst<Int8Regs>;
 
 def LastCallArgF64 : LastCallArgInst<Float64Regs>;
 def LastCallArgF32 : LastCallArgInst<Float32Regs>;
@@ -2124,9 +1936,6 @@ def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;
 def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
                    "cvt.u16.u32\t$dst, $src;",
                    [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
-def MoveParamI8  : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src),
-                   "cvt.u16.u32\t$dst, $src;",
-                   [(set Int8Regs:$dst, (MoveParam Int8Regs:$src))]>;
 def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;
 def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;
 
@@ -2138,7 +1947,6 @@ class PseudoUseParamInst<NVPTXRegClass regclass> :
 def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
 def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
 def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
-def PseudoUseParamI8  : PseudoUseParamInst<Int8Regs>;
 def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
 def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;
 
@@ -2180,7 +1988,7 @@ multiclass LD<NVPTXRegClass regclass> {
 }
 
 let mayLoad=1, neverHasSideEffects=1 in {
-defm LD_i8  : LD<Int8Regs>;
+defm LD_i8  : LD<Int16Regs>;
 defm LD_i16 : LD<Int16Regs>;
 defm LD_i32 : LD<Int32Regs>;
 defm LD_i64 : LD<Int64Regs>;
@@ -2222,7 +2030,7 @@ multiclass ST<NVPTXRegClass regclass> {
 }
 
 let mayStore=1, neverHasSideEffects=1 in {
-defm ST_i8  : ST<Int8Regs>;
+defm ST_i8  : ST<Int16Regs>;
 defm ST_i16 : ST<Int16Regs>;
 defm ST_i32 : ST<Int32Regs>;
 defm ST_i64 : ST<Int64Regs>;
@@ -2306,7 +2114,7 @@ multiclass LD_VEC<NVPTXRegClass regclass> {
                 []>;
 }
 let mayLoad=1, neverHasSideEffects=1 in {
-defm LDV_i8  : LD_VEC<Int8Regs>;
+defm LDV_i8  : LD_VEC<Int16Regs>;
 defm LDV_i16 : LD_VEC<Int16Regs>;
 defm LDV_i32 : LD_VEC<Int32Regs>;
 defm LDV_i64 : LD_VEC<Int64Regs>;
@@ -2389,7 +2197,7 @@ multiclass ST_VEC<NVPTXRegClass regclass> {
     []>;
 }
 let mayStore=1, neverHasSideEffects=1 in {
-defm STV_i8  : ST_VEC<Int8Regs>;
+defm STV_i8  : ST_VEC<Int16Regs>;
 defm STV_i16 : ST_VEC<Int16Regs>;
 defm STV_i32 : ST_VEC<Int32Regs>;
 defm STV_i64 : ST_VEC<Int64Regs>;
@@ -2400,291 +2208,6 @@ defm STV_f64 : ST_VEC<Float64Regs>;
 
 //---- Conversion ----
 
-multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> {
-// FIXME: need to add f16 support
-//  def CVTf16i8 :
-//    NVPTXInst<(outs Float16Regs:$d), (ins Int8Regs:$a),
-//              !strconcat(!strconcat("cvt.rn.f16.", OpStr), "8 \t$d, $a;"),
-//        [(set Float16Regs:$d, (OpNode Int8Regs:$a))]>;
-//  def CVTf16i16 :
-//    NVPTXInst<(outs Float16Regs:$d), (ins Int16Regs:$a),
-//              !strconcat(!strconcat("cvt.rn.f16.", OpStr), "16 \t$d, $a;"),
-//        [(set Float16Regs:$d, (OpNode Int16Regs:$a))]>;
-//  def CVTf16i32 :
-//    NVPTXInst<(outs Float16Regs:$d), (ins Int32Regs:$a),
-//              !strconcat(!strconcat("cvt.rn.f16.", OpStr), "32 \t$d, $a;"),
-//        [(set Float16Regs:$d, (OpNode Int32Regs:$a))]>;
-//  def CVTf16i64:
-//    NVPTXInst<(outs Float16Regs:$d), (ins Int64Regs:$a),
-//          !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"),
-//            [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>;
-
-  def CVTf32i1 :
-    NVPTXInst<(outs Float32Regs:$d), (ins Int1Regs:$a),
-              "selp.f32 \t$d, 1.0, 0.0, $a;",
-        [(set Float32Regs:$d, (OpNode Int1Regs:$a))]>;
-  def CVTf32i8 :
-    NVPTXInst<(outs Float32Regs:$d), (ins Int8Regs:$a),
-              !strconcat(!strconcat("cvt.rn.f32.", OpStr), "8 \t$d, $a;"),
-        [(set Float32Regs:$d, (OpNode Int8Regs:$a))]>;
-  def CVTf32i16 :
-    NVPTXInst<(outs Float32Regs:$d), (ins Int16Regs:$a),
-              !strconcat(!strconcat("cvt.rn.f32.", OpStr), "16 \t$d, $a;"),
-        [(set Float32Regs:$d, (OpNode Int16Regs:$a))]>;
-  def CVTf32i32 :
-    NVPTXInst<(outs Float32Regs:$d), (ins Int32Regs:$a),
-              !strconcat(!strconcat("cvt.rn.f32.", OpStr), "32 \t$d, $a;"),
-        [(set Float32Regs:$d, (OpNode Int32Regs:$a))]>;
-  def CVTf32i64:
-    NVPTXInst<(outs Float32Regs:$d), (ins Int64Regs:$a),
-          !strconcat(!strconcat("cvt.rn.f32.", OpStr), "64 \t$d, $a;"),
-            [(set Float32Regs:$d, (OpNode Int64Regs:$a))]>;
-
-  def CVTf64i1 :
-    NVPTXInst<(outs Float64Regs:$d), (ins Int1Regs:$a),
-              "selp.f64 \t$d, 1.0, 0.0, $a;",
-        [(set Float64Regs:$d, (OpNode Int1Regs:$a))]>;
-  def CVTf64i8 :
-    NVPTXInst<(outs Float64Regs:$d), (ins Int8Regs:$a),
-              !strconcat(!strconcat("cvt.rn.f64.", OpStr), "8 \t$d, $a;"),
-        [(set Float64Regs:$d, (OpNode Int8Regs:$a))]>;
-  def CVTf64i16 :
-    NVPTXInst<(outs Float64Regs:$d), (ins Int16Regs:$a),
-              !strconcat(!strconcat("cvt.rn.f64.", OpStr), "16 \t$d, $a;"),
-        [(set Float64Regs:$d, (OpNode Int16Regs:$a))]>;
-  def CVTf64i32 :
-    NVPTXInst<(outs Float64Regs:$d), (ins Int32Regs:$a),
-              !strconcat(!strconcat("cvt.rn.f64.", OpStr), "32 \t$d, $a;"),
-        [(set Float64Regs:$d, (OpNode Int32Regs:$a))]>;
-  def CVTf64i64:
-    NVPTXInst<(outs Float64Regs:$d), (ins Int64Regs:$a),
-          !strconcat(!strconcat("cvt.rn.f64.", OpStr), "64 \t$d, $a;"),
-            [(set Float64Regs:$d, (OpNode Int64Regs:$a))]>;
-}
-
-defm Sint_to_fp : CVT_INT_TO_FP <"s", sint_to_fp>;
-defm Uint_to_fp : CVT_INT_TO_FP <"u", uint_to_fp>;
-
-multiclass CVT_FP_TO_INT <string OpStr, SDNode OpNode> {
-// FIXME: need to add f16 support
-//  def CVTi8f16:
-//    NVPTXInst<(outs Int8Regs:$d), (ins Float16Regs:$a),
-//              !strconcat(!strconcat("cvt.rzi.", OpStr), "8.f16 $d, $a;"),
-//        [(set Int8Regs:$d, (OpNode Float16Regs:$a))]>;
-  def CVTi8f32_ftz:
-    NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"),
-        [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
-  def CVTi8f32:
-    NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"),
-        [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>;
-  def CVTi8f64:
-    NVPTXInst<(outs Int8Regs:$d), (ins Float64Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"),
-        [(set Int8Regs:$d, (OpNode Float64Regs:$a))]>;
-
-// FIXME: need to add f16 support
-//  def CVTi16f16:
-//    NVPTXInst<(outs Int16Regs:$d), (ins Float16Regs:$a),
-//              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f16 \t$d, $a;"),
-//        [(set Int16Regs:$d, (OpNode Float16Regs:$a))]>;
-  def CVTi16f32_ftz:
-    NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"),
-        [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
-  def CVTi16f32:
-    NVPTXInst<(outs Int16Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"),
-        [(set Int16Regs:$d, (OpNode Float32Regs:$a))]>;
-  def CVTi16f64:
-    NVPTXInst<(outs Int16Regs:$d), (ins Float64Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"),
-        [(set Int16Regs:$d, (OpNode Float64Regs:$a))]>;
-
-// FIXME: need to add f16 support
-//  def CVTi32f16:  def CVTi32f16:
-//    NVPTXInst<(outs Int32Regs:$d), (ins Float16Regs:$a),
-//              !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f16 \t$d, $a;"),
-//        [(set Int32Regs:$d, (OpNode Float16Regs:$a))]>;
-  def CVTi32f32_ftz:
-    NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "32.f32 \t$d, $a;"),
-        [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
-  def CVTi32f32:
-    NVPTXInst<(outs Int32Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f32 \t$d, $a;"),
-        [(set Int32Regs:$d, (OpNode Float32Regs:$a))]>;
-  def CVTi32f64:
-    NVPTXInst<(outs Int32Regs:$d), (ins Float64Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "32.f64 \t$d, $a;"),
-        [(set Int32Regs:$d, (OpNode Float64Regs:$a))]>;
-
-// FIXME: need to add f16 support
-//  def CVTi64f16:
-//    NVPTXInst<(outs Int64Regs:$d), (ins Float16Regs:$a),
-//              !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f16 \t$d, $a;"),
-//        [(set Int64Regs:$d, (OpNode Float16Regs:$a))]>;
-  def CVTi64f32_ftz:
-    NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "64.f32 \t$d, $a;"),
-        [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>;
-  def CVTi64f32:
-    NVPTXInst<(outs Int64Regs:$d), (ins Float32Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f32 \t$d, $a;"),
-        [(set Int64Regs:$d, (OpNode Float32Regs:$a))]>;
-  def CVTi64f64:
-    NVPTXInst<(outs Int64Regs:$d), (ins Float64Regs:$a),
-              !strconcat(!strconcat("cvt.rzi.", OpStr), "64.f64 \t$d, $a;"),
-        [(set Int64Regs:$d, (OpNode Float64Regs:$a))]>;
-}
-
-defm Fp_to_sint : CVT_FP_TO_INT <"s", fp_to_sint>;
-defm Fp_to_uint : CVT_FP_TO_INT <"u", fp_to_uint>;
-
-multiclass INT_EXTEND_UNSIGNED_1 <SDNode OpNode> {
-  def ext1to8:
-       NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
-           "selp.u16 \t$d, 1, 0, $a;",
-     [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
-  def ext1to16:
-       NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
-           "selp.u16 \t$d, 1, 0, $a;",
-     [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
-  def ext1to32:
-       NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
-           "selp.u32 \t$d, 1, 0, $a;",
-     [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
-  def ext1to64:
-       NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
-           "selp.u64 \t$d, 1, 0, $a;",
-     [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
-}
-
-multiclass INT_EXTEND_SIGNED_1 <SDNode OpNode> {
-  def ext1to8:
-       NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a),
-           "selp.s16 \t$d, -1, 0, $a;",
-     [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>;
-  def ext1to16:
-       NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a),
-           "selp.s16 \t$d, -1, 0, $a;",
-     [(set Int16Regs:$d, (OpNode Int1Regs:$a))]>;
-  def ext1to32:
-       NVPTXInst<(outs Int32Regs:$d), (ins Int1Regs:$a),
-           "selp.s32 \t$d, -1, 0, $a;",
-     [(set Int32Regs:$d, (OpNode Int1Regs:$a))]>;
-  def ext1to64:
-       NVPTXInst<(outs Int64Regs:$d), (ins Int1Regs:$a),
-           "selp.s64 \t$d, -1, 0, $a;",
-     [(set Int64Regs:$d, (OpNode Int1Regs:$a))]>;
-}
-
-multiclass INT_EXTEND <string OpStr, SDNode OpNode> {
-  // All Int8Regs are emiited as 16bit registers in ptx.
-  // And there is no selp.u8 in ptx.
-  def ext8to16:
-       NVPTXInst<(outs Int16Regs:$d), (ins Int8Regs:$a),
-           !strconcat("cvt.", !strconcat(OpStr, !strconcat("16.",
-             !strconcat(OpStr, "8 \t$d, $a;")))),
-     [(set Int16Regs:$d, (OpNode Int8Regs:$a))]>;
-  def ext8to32:
-       NVPTXInst<(outs Int32Regs:$d), (ins Int8Regs:$a),
-           !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
-             !strconcat(OpStr, "8 \t$d, $a;")))),
-     [(set Int32Regs:$d, (OpNode Int8Regs:$a))]>;
-  def ext8to64:
-       NVPTXInst<(outs Int64Regs:$d), (ins Int8Regs:$a),
-           !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
-             !strconcat(OpStr, "8 \t$d, $a;")))),
-     [(set Int64Regs:$d, (OpNode Int8Regs:$a))]>;
-  def ext16to32:
-       NVPTXInst<(outs Int32Regs:$d), (ins Int16Regs:$a),
-           !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.",
-             !strconcat(OpStr, "16 \t$d, $a;")))),
-     [(set Int32Regs:$d, (OpNode Int16Regs:$a))]>;
-  def ext16to64:
-       NVPTXInst<(outs Int64Regs:$d), (ins Int16Regs:$a),
-           !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
-             !strconcat(OpStr, "16 \t$d, $a;")))),
-     [(set Int64Regs:$d, (OpNode Int16Regs:$a))]>;
-  def ext32to64:
-       NVPTXInst<(outs Int64Regs:$d), (ins Int32Regs:$a),
-           !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.",
-             !strconcat(OpStr, "32 \t$d, $a;")))),
-     [(set Int64Regs:$d, (OpNode Int32Regs:$a))]>;
-}
-
-defm Sint_extend_1 : INT_EXTEND_SIGNED_1<sext>;
-defm Zint_extend_1 : INT_EXTEND_UNSIGNED_1<zext>;
-defm Aint_extend_1 : INT_EXTEND_UNSIGNED_1<anyext>;
-
-defm Sint_extend : INT_EXTEND <"s", sext>;
-defm Zint_extend : INT_EXTEND <"u", zext>;
-defm Aint_extend : INT_EXTEND <"u", anyext>;
-
-class TRUNC_to1_asm<string sz> {
-  string s = !strconcat("{{\n\t",
-             !strconcat(".reg ",
-             !strconcat(sz,
-             !strconcat(" temp;\n\t",
-             !strconcat("and",
-             !strconcat(sz,
-             !strconcat("\t temp, $a, 1;\n\t",
-             !strconcat("setp",
-             !strconcat(sz, ".eq \t $d, temp, 1;\n\t}}")))))))));
-}
-
-def TRUNC_64to32 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
-             "cvt.u32.u64 \t$d, $a;",
-       [(set Int32Regs:$d, (trunc Int64Regs:$a))]>;
-def TRUNC_64to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int64Regs:$a),
-             "cvt.u16.u64 \t$d, $a;",
-       [(set Int16Regs:$d, (trunc Int64Regs:$a))]>;
-def TRUNC_64to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int64Regs:$a),
-             "cvt.u8.u64 \t$d, $a;",
-       [(set Int8Regs:$d, (trunc Int64Regs:$a))]>;
-def TRUNC_32to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int32Regs:$a),
-             "cvt.u16.u32 \t$d, $a;",
-       [(set Int16Regs:$d, (trunc Int32Regs:$a))]>;
-def TRUNC_32to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int32Regs:$a),
-             "cvt.u8.u32 \t$d, $a;",
-       [(set Int8Regs:$d, (trunc Int32Regs:$a))]>;
-def TRUNC_16to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int16Regs:$a),
-             "cvt.u8.u16 \t$d, $a;",
-       [(set Int8Regs:$d, (trunc Int16Regs:$a))]>;
-def TRUNC_64to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
-             TRUNC_to1_asm<".b64">.s,
-             [(set Int1Regs:$d, (trunc Int64Regs:$a))]>;
-def TRUNC_32to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a),
-             TRUNC_to1_asm<".b32">.s,
-             [(set Int1Regs:$d, (trunc Int32Regs:$a))]>;
-def TRUNC_16to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int16Regs:$a),
-             TRUNC_to1_asm<".b16">.s,
-             [(set Int1Regs:$d, (trunc Int16Regs:$a))]>;
-def TRUNC_8to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int8Regs:$a),
-             TRUNC_to1_asm<".b16">.s,
-             [(set Int1Regs:$d, (trunc Int8Regs:$a))]>;
-
-// Select instructions
-def : Pat<(select Int32Regs:$pred, Int8Regs:$a, Int8Regs:$b),
-          (SELECTi8rr Int8Regs:$a, Int8Regs:$b, (TRUNC_32to1 Int32Regs:$pred))>;
-def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
-          (SELECTi16rr Int16Regs:$a, Int16Regs:$b,
-            (TRUNC_32to1 Int32Regs:$pred))>;
-def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
-          (SELECTi32rr Int32Regs:$a, Int32Regs:$b,
-            (TRUNC_32to1 Int32Regs:$pred))>;
-def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
-          (SELECTi64rr Int64Regs:$a, Int64Regs:$b,
-            (TRUNC_32to1 Int32Regs:$pred))>;
-def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
-          (SELECTf32rr Float32Regs:$a, Float32Regs:$b,
-            (TRUNC_32to1 Int32Regs:$pred))>;
-def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
-          (SELECTf64rr Float64Regs:$a, Float64Regs:$b,
-            (TRUNC_32to1 Int32Regs:$pred))>;
-
 class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
   NVPTXRegClass regclassOut> :
            NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
@@ -2696,29 +2219,209 @@ def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
 def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
 def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
 
+// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
+// we cannot specify floating-point literals in isel patterns.  Therefore, we
+// use an integer selp to select either 1 or 0 and then cvt to floating-point.
+
+// sint -> f32
+def : Pat<(f32 (sint_to_fp Int1Regs:$a)),
+          (CVT_f32_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int16Regs:$a)),
+          (CVT_f32_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int32Regs:$a)),
+          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (sint_to_fp Int64Regs:$a)),
+          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f32
+def : Pat<(f32 (uint_to_fp Int1Regs:$a)),
+          (CVT_f32_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int16Regs:$a)),
+          (CVT_f32_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int32Regs:$a)),
+          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f32 (uint_to_fp Int64Regs:$a)),
+          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+
+// sint -> f64
+def : Pat<(f64 (sint_to_fp Int1Regs:$a)),
+          (CVT_f64_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int16Regs:$a)),
+          (CVT_f64_s16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int32Regs:$a)),
+          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (sint_to_fp Int64Regs:$a)),
+          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+
+// uint -> f64
+def : Pat<(f64 (uint_to_fp Int1Regs:$a)),
+          (CVT_f64_u32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
+          (CVT_f64_u16 Int16Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
+          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
+          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+
+
+// f32 -> sint
+def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
+          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+          (CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
+          (CVT_s16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_sint Float32Regs:$a)),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_sint Float32Regs:$a)),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f32 -> uint
+def : Pat<(i1 (fp_to_uint Float32Regs:$a)),
+          (SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+          (CVT_u16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (fp_to_uint Float32Regs:$a)),
+          (CVT_u16_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i32 (fp_to_uint Float32Regs:$a)),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i64 (fp_to_uint Float32Regs:$a)),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+
+// f64 -> sint
+def : Pat<(i1 (fp_to_sint Float64Regs:$a)),
+          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_sint Float64Regs:$a)),
+          (CVT_s16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_sint Float64Regs:$a)),
+          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_sint Float64Regs:$a)),
+          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+
+// f64 -> uint
+def : Pat<(i1 (fp_to_uint Float64Regs:$a)),
+          (SETP_b64ri (BITCONVERT_64_F2I Float64Regs:$a), 0, CmpEQ)>;
+def : Pat<(i16 (fp_to_uint Float64Regs:$a)),
+          (CVT_u16_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i32 (fp_to_uint Float64Regs:$a)),
+          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(i64 (fp_to_uint Float64Regs:$a)),
+          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+
+// sext i1
+def : Pat<(i16 (sext Int1Regs:$a)),
+          (SELP_s16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (sext Int1Regs:$a)),
+          (SELP_s32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (sext Int1Regs:$a)),
+          (SELP_s64ii -1, 0, Int1Regs:$a)>;
+
+// zext i1
+def : Pat<(i16 (zext Int1Regs:$a)),
+          (SELP_u16ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (zext Int1Regs:$a)),
+          (SELP_u32ii 1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (zext Int1Regs:$a)),
+          (SELP_u64ii 1, 0, Int1Regs:$a)>;
+
+// anyext i1
+def : Pat<(i16 (anyext Int1Regs:$a)),
+          (SELP_u16ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i32 (anyext Int1Regs:$a)),
+          (SELP_u32ii -1, 0, Int1Regs:$a)>;
+def : Pat<(i64 (anyext Int1Regs:$a)),
+          (SELP_u64ii -1, 0, Int1Regs:$a)>;
+
+// sext i16
+def : Pat<(i32 (sext Int16Regs:$a)),
+          (CVT_s32_s16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (sext Int16Regs:$a)),
+          (CVT_s64_s16 Int16Regs:$a, CvtNONE)>;
+
+// zext i16
+def : Pat<(i32 (zext Int16Regs:$a)),
+          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (zext Int16Regs:$a)),
+          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// anyext i16
+def : Pat<(i32 (anyext Int16Regs:$a)),
+          (CVT_u32_u16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i64 (anyext Int16Regs:$a)),
+          (CVT_u64_u16 Int16Regs:$a, CvtNONE)>;
+
+// sext i32
+def : Pat<(i64 (sext Int32Regs:$a)),
+          (CVT_s64_s32 Int32Regs:$a, CvtNONE)>;
+
+// zext i32
+def : Pat<(i64 (zext Int32Regs:$a)),
+          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+// anyext i32
+def : Pat<(i64 (anyext Int32Regs:$a)),
+          (CVT_u64_u32 Int32Regs:$a, CvtNONE)>;
+
+
+// truncate i64
+def : Pat<(i32 (trunc Int64Regs:$a)),
+          (CVT_u32_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i16 (trunc Int64Regs:$a)),
+          (CVT_u16_u64 Int64Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int64Regs:$a)),
+          (SETP_b64ri (ANDb64ri Int64Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i32
+def : Pat<(i16 (trunc Int32Regs:$a)),
+          (CVT_u16_u32 Int32Regs:$a, CvtNONE)>;
+def : Pat<(i1 (trunc Int32Regs:$a)),
+          (SETP_b32ri (ANDb32ri Int32Regs:$a, 1), 1, CmpEQ)>;
+
+// truncate i16
+def : Pat<(i1 (trunc Int16Regs:$a)),
+          (SETP_b16ri (ANDb16ri Int16Regs:$a, 1), 1, CmpEQ)>;
+
+// sext_inreg
+def : Pat<(sext_inreg Int16Regs:$a, i8), (CVT_INREG_s16_s8 Int16Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i8), (CVT_INREG_s32_s8 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int32Regs:$a, i16), (CVT_INREG_s32_s16 Int32Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i8), (CVT_INREG_s64_s8 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i16), (CVT_INREG_s64_s16 Int64Regs:$a)>;
+def : Pat<(sext_inreg Int64Regs:$a, i32), (CVT_INREG_s64_s32 Int64Regs:$a)>;
+
+
+// Select instructions with 32-bit predicates
+def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
+          (SELP_b16rr Int16Regs:$a, Int16Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
+          (SELP_b32rr Int32Regs:$a, Int32Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
+          (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
+          (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
+          (SELP_f64rr Float64Regs:$a, Float64Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+
+
 // pack a set of smaller int registers to a larger int register
-def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d),
-                          (ins Int8Regs:$s1, Int8Regs:$s2,
-                               Int8Regs:$s3, Int8Regs:$s4),
-                          !strconcat("{{\n\t.reg .b8\t%t<4>;",
-                          !strconcat("\n\tcvt.u8.u8\t%t0, $s1;",
-                          !strconcat("\n\tcvt.u8.u8\t%t1, $s2;",
-                          !strconcat("\n\tcvt.u8.u8\t%t2, $s3;",
-                          !strconcat("\n\tcvt.u8.u8\t%t3, $s4;",
-                           "\n\tmov.b32\t$d, {%t0, %t1, %t2, %t3};\n\t}}"))))),
-                          []>;
 def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d),
                           (ins Int16Regs:$s1, Int16Regs:$s2,
                                Int16Regs:$s3, Int16Regs:$s4),
                           "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};",
                           []>;
-def V2I8toI16 : NVPTXInst<(outs Int16Regs:$d),
-                          (ins Int8Regs:$s1, Int8Regs:$s2),
-                          !strconcat("{{\n\t.reg .b8\t%t<2>;",
-                          !strconcat("\n\tcvt.u8.u8\t%t0, $s1;",
-                          !strconcat("\n\tcvt.u8.u8\t%t1, $s2;",
-                                     "\n\tmov.b16\t$d, {%t0, %t1};\n\t}}"))),
-                          []>;
 def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d),
                           (ins Int16Regs:$s1, Int16Regs:$s2),
                           "mov.b32\t$d, {{$s1, $s2}};",
@@ -2733,28 +2436,11 @@ def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d),
                           []>;
 
 // unpack a larger int register to a set of smaller int registers
-def I32toV4I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2,
-                                Int8Regs:$d3, Int8Regs:$d4),
-                          (ins Int32Regs:$s),
-                          !strconcat("{{\n\t.reg .b8\t%t<4>;",
-                          !strconcat("\n\tmov.b32\t{%t0, %t1, %t2, %t3}, $s;",
-                          !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
-                          !strconcat("\n\tcvt.u8.u8\t$d2, %t1;",
-                          !strconcat("\n\tcvt.u8.u8\t$d3, %t2;",
-                                     "\n\tcvt.u8.u8\t$d4, %t3;\n\t}}"))))),
-                          []>;
 def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2,
                                  Int16Regs:$d3, Int16Regs:$d4),
                            (ins Int64Regs:$s),
                            "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;",
                           []>;
-def I16toV2I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2),
-                          (ins Int16Regs:$s),
-                          !strconcat("{{\n\t.reg .b8\t%t<2>;",
-                          !strconcat("\n\tmov.b16\t{%t0, %t1}, $s;",
-                          !strconcat("\n\tcvt.u8.u8\t$d1, %t0;",
-                                     "\n\tcvt.u8.u8\t$d2, %t1;\n\t}}"))),
-                          []>;
 def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2),
                            (ins Int32Regs:$s),
                            "mov.b32\t{{$d1, $d2}}, $s;",
@@ -2768,21 +2454,75 @@ def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                            "mov.b64\t{{$d1, $d2}}, $s;",
                           []>;
 
-def FPRound_ftz : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
-            "cvt.rn.ftz.f32.f64 \t$d, $a;",
-      [(set Float32Regs:$d, (fround Float64Regs:$a))]>, Requires<[doF32FTZ]>;
-
-def FPRound : NVPTXInst<(outs Float32Regs:$d), (ins Float64Regs:$a),
-            "cvt.rn.f32.f64 \t$d, $a;",
-      [(set Float32Regs:$d, (fround Float64Regs:$a))]>;
-
-def FPExtend_ftz : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
-            "cvt.ftz.f64.f32 \t$d, $a;",
-      [(set Float64Regs:$d, (fextend Float32Regs:$a))]>, Requires<[doF32FTZ]>;
-
-def FPExtend : NVPTXInst<(outs Float64Regs:$d), (ins Float32Regs:$a),
-            "cvt.f64.f32 \t$d, $a;",
-      [(set Float64Regs:$d, (fextend Float32Regs:$a))]>;
+// Count leading zeros
+def CLZr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                       "clz.b32\t$d, $a;",
+                       []>;
+def CLZr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                       "clz.b64\t$d, $a;",
+                       []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctlz Int32Regs:$a),
+          (CLZr32 Int32Regs:$a)>;
+def : Pat<(ctlz_zero_undef Int32Regs:$a),
+          (CLZr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctlz Int64Regs:$a),
+          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(ctlz_zero_undef Int64Regs:$a),
+          (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
+// than 16 bits to store). We also need to subtract 16 because the
+// high-order 16 zeros were counted.
+def : Pat<(ctlz Int16Regs:$a),
+          (SUBi16ri (CVT_u16_u32 (CLZr32
+            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE), 16)>;
+def : Pat<(ctlz_zero_undef Int16Regs:$a),
+          (SUBi16ri (CVT_u16_u32 (CLZr32
+            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE), 16)>;
+
+// Population count
+def POPCr32 : NVPTXInst<(outs Int32Regs:$d), (ins Int32Regs:$a),
+                        "popc.b32\t$d, $a;",
+                        []>;
+def POPCr64 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+                        "popc.b64\t$d, $a;",
+                        []>;
+
+// 32-bit has a direct PTX instruction
+def : Pat<(ctpop Int32Regs:$a),
+          (POPCr32 Int32Regs:$a)>;
+
+// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
+// to 64-bit to match the LLVM semantics
+def : Pat<(ctpop Int64Regs:$a),
+          (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back
+// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
+// than 16 bits to store)
+def : Pat<(ctpop Int16Regs:$a),
+          (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
+           CvtNONE)>;
+
+// fround f64 -> f32
+def : Pat<(f32 (fround Float64Regs:$a)),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f32 (fround Float64Regs:$a)),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+
+// fextend f32 -> f64
+def : Pat<(f64 (fextend Float32Regs:$a)),
+          (CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(f64 (fextend Float32Regs:$a)),
+          (CVT_f64_f32 Float32Regs:$a, CvtNONE)>;
 
 def retflag       : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
                            [SDNPHasChain, SDNPOptInGlue]>;
@@ -2810,8 +2550,8 @@ let isTerminator=1 in {
                   [(br bb:$target)]>;
 }
 
-def : Pat<(brcond Int32Regs:$a, bb:$target), (CBranch
-    (ISetUNEi32ri_p Int32Regs:$a, 0), bb:$target)>;
+def : Pat<(brcond Int32Regs:$a, bb:$target),
+          (CBranch (SETP_u32ri Int32Regs:$a, 0, CmpNE), bb:$target)>;
 
 // SelectionDAGBuilder::visitSWitchCase() will invert the condition of a
 // conditional branch if
@@ -2867,6 +2607,20 @@ def trapinst : NVPTXInst<(outs), (ins),
                          "trap;",
                          [(trap)]>;
 
+// Call prototype wrapper
+def SDTCallPrototype : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def CallPrototype
+  : SDNode<"NVPTXISD::CallPrototype", SDTCallPrototype,
+           [SDNPHasChain, SDNPOutGlue, SDNPInGlue, SDNPSideEffect]>;
+def ProtoIdent : Operand<i32> {
+  let PrintMethod = "printProtoIdent";
+}
+def CALL_PROTOTYPE
+  : NVPTXInst<(outs), (ins ProtoIdent:$ident),
+              "$ident", [(CallPrototype (i32 texternalsym:$ident))]>;
+
+
+
 include "NVPTXIntrinsics.td"
 
 
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 24037ca..14049b1 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -82,49 +82,36 @@ def INT_MEMBAR_SYS : MEMBAR<"membar.sys;", int_nvvm_membar_sys>;
 //-----------------------------------
 
 // Map min(1.0, max(0.0, x)) to sat(x)
-multiclass SAT<NVPTXRegClass regclass, Operand fimm, Intrinsic IntMinOp,
-  Intrinsic IntMaxOp, PatLeaf f0, PatLeaf f1, string OpStr> {
-
-   // fmin(1.0, fmax(0.0, x)) => sat(x)
-   def SAT11 : NVPTXInst<(outs regclass:$dst),
-     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
-           OpStr,
-     [(set regclass:$dst, (IntMinOp f1:$srcf0 ,
-       (IntMaxOp f0:$srcf1, regclass:$src)))]>;
-
-   // fmin(1.0, fmax(x, 0.0)) => sat(x)
-   def SAT12 : NVPTXInst<(outs regclass:$dst),
-     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
-           OpStr,
-     [(set regclass:$dst, (IntMinOp f1:$srcf0 ,
-       (IntMaxOp regclass:$src, f0:$srcf1)))]>;
-
-   // fmin(fmax(0.0, x), 1.0) => sat(x)
-   def SAT13 : NVPTXInst<(outs regclass:$dst),
-     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
-           OpStr,
-     [(set regclass:$dst, (IntMinOp
-       (IntMaxOp f0:$srcf0, regclass:$src), f1:$srcf1))]>;
-
-   // fmin(fmax(x, 0.0), 1.0) => sat(x)
-   def SAT14 : NVPTXInst<(outs regclass:$dst),
-     (ins fimm:$srcf0, fimm:$srcf1, regclass:$src),
-         OpStr,
-     [(set regclass:$dst, (IntMinOp
-       (IntMaxOp regclass:$src, f0:$srcf0), f1:$srcf1))]>;
-
-}
-// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x) because when x
-// is NaN
+// Note that max(0.0, min(x, 1.0)) cannot be mapped to sat(x) because when x is
+// NaN
 // max(0.0, min(x, 1.0)) is 1.0 while sat(x) is 0.
 // Same story for fmax, fmin.
 
-defm SAT_fmin_fmax_f : SAT<Float32Regs, f32imm, int_nvvm_fmin_f,
-  int_nvvm_fmax_f, immFloat0, immFloat1,
-           "cvt.sat.f32.f32 \t$dst, $src; \n">;
-defm SAT_fmin_fmax_d : SAT<Float64Regs, f64imm, int_nvvm_fmin_d,
-  int_nvvm_fmax_d, immDouble0, immDouble1,
-           "cvt.sat.f64.f64 \t$dst, $src; \n">;
+def : Pat<(int_nvvm_fmin_f immFloat1,
+            (int_nvvm_fmax_f immFloat0, Float32Regs:$a)),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f immFloat1,
+            (int_nvvm_fmax_f Float32Regs:$a, immFloat0)),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+            (int_nvvm_fmax_f immFloat0, Float32Regs:$a), immFloat1),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_f
+            (int_nvvm_fmax_f Float32Regs:$a, immFloat0), immFloat1),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+
+def : Pat<(int_nvvm_fmin_d immDouble1,
+            (int_nvvm_fmax_d immDouble0, Float64Regs:$a)),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d immDouble1,
+            (int_nvvm_fmax_d Float64Regs:$a, immDouble0)),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+            (int_nvvm_fmax_d immDouble0, Float64Regs:$a), immDouble1),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_fmin_d
+            (int_nvvm_fmax_d Float64Regs:$a, immDouble0), immDouble1),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
 
 
 // We need a full string for OpcStr here because we need to deal with case like
@@ -312,19 +299,19 @@ def INT_NVVM_SAD_UI : F_MATH_3<"sad.u32 \t$dst, $src0, $src1, $src2;",
 // Floor  Ceil
 //
 
-def INT_NVVM_FLOOR_FTZ_F : F_MATH_1<"cvt.rmi.ftz.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_floor_ftz_f>;
-def INT_NVVM_FLOOR_F : F_MATH_1<"cvt.rmi.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_floor_f>;
-def INT_NVVM_FLOOR_D : F_MATH_1<"cvt.rmi.f64.f64 \t$dst, $src0;",
-  Float64Regs, Float64Regs, int_nvvm_floor_d>;
+def : Pat<(int_nvvm_floor_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_floor_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_floor_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRMI)>;
 
-def INT_NVVM_CEIL_FTZ_F : F_MATH_1<"cvt.rpi.ftz.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_ceil_ftz_f>;
-def INT_NVVM_CEIL_F : F_MATH_1<"cvt.rpi.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_ceil_f>;
-def INT_NVVM_CEIL_D : F_MATH_1<"cvt.rpi.f64.f64 \t$dst, $src0;",
-  Float64Regs, Float64Regs, int_nvvm_ceil_d>;
+def : Pat<(int_nvvm_ceil_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_ceil_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRPI)>;
+def : Pat<(int_nvvm_ceil_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRPI)>;
 
 //
 // Abs
@@ -347,37 +334,34 @@ def INT_NVVM_FABS_D : F_MATH_1<"abs.f64 \t$dst, $src0;", Float64Regs,
 // Round
 //
 
-def INT_NVVM_ROUND_FTZ_F : F_MATH_1<"cvt.rni.ftz.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_round_ftz_f>;
-def INT_NVVM_ROUND_F : F_MATH_1<"cvt.rni.f32.f32 \t$dst, $src0;", Float32Regs,
-  Float32Regs, int_nvvm_round_f>;
-
-def INT_NVVM_ROUND_D : F_MATH_1<"cvt.rni.f64.f64 \t$dst, $src0;", Float64Regs,
-  Float64Regs, int_nvvm_round_d>;
+def : Pat<(int_nvvm_round_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_round_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_round_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRNI)>;
 
 //
 // Trunc
 //
 
-def INT_NVVM_TRUNC_FTZ_F : F_MATH_1<"cvt.rzi.ftz.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_trunc_ftz_f>;
-def INT_NVVM_TRUNC_F : F_MATH_1<"cvt.rzi.f32.f32 \t$dst, $src0;", Float32Regs,
-  Float32Regs, int_nvvm_trunc_f>;
-
-def INT_NVVM_TRUNC_D : F_MATH_1<"cvt.rzi.f64.f64 \t$dst, $src0;", Float64Regs,
-  Float64Regs, int_nvvm_trunc_d>;
+def : Pat<(int_nvvm_trunc_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_trunc_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_trunc_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtRZI)>;
 
 //
 // Saturate
 //
 
-def INT_NVVM_SATURATE_FTZ_F : F_MATH_1<"cvt.sat.ftz.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_saturate_ftz_f>;
-def INT_NVVM_SATURATE_F : F_MATH_1<"cvt.sat.f32.f32 \t$dst, $src0;",
-  Float32Regs, Float32Regs, int_nvvm_saturate_f>;
-
-def INT_NVVM_SATURATE_D : F_MATH_1<"cvt.sat.f64.f64 \t$dst, $src0;",
-  Float64Regs, Float64Regs, int_nvvm_saturate_d>;
+def : Pat<(int_nvvm_saturate_ftz_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT_FTZ)>;
+def : Pat<(int_nvvm_saturate_f Float32Regs:$a),
+          (CVT_f32_f32 Float32Regs:$a, CvtSAT)>;
+def : Pat<(int_nvvm_saturate_d Float64Regs:$a),
+          (CVT_f64_f64 Float64Regs:$a, CvtSAT)>;
 
 //
 // Exp2  Log2
@@ -568,110 +552,110 @@ def INT_NVVM_ADD_RP_D : F_MATH_2<"add.rp.f64 \t$dst, $src0, $src1;",
 // Convert
 //
 
-def INT_NVVM_D2F_RN_FTZ : F_MATH_1<"cvt.rn.ftz.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rn_ftz>;
-def INT_NVVM_D2F_RN : F_MATH_1<"cvt.rn.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rn>;
-def INT_NVVM_D2F_RZ_FTZ : F_MATH_1<"cvt.rz.ftz.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rz_ftz>;
-def INT_NVVM_D2F_RZ : F_MATH_1<"cvt.rz.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rz>;
-def INT_NVVM_D2F_RM_FTZ : F_MATH_1<"cvt.rm.ftz.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rm_ftz>;
-def INT_NVVM_D2F_RM : F_MATH_1<"cvt.rm.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rm>;
-def INT_NVVM_D2F_RP_FTZ : F_MATH_1<"cvt.rp.ftz.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rp_ftz>;
-def INT_NVVM_D2F_RP : F_MATH_1<"cvt.rp.f32.f64 \t$dst, $src0;",
-  Float32Regs, Float64Regs, int_nvvm_d2f_rp>;
-
-def INT_NVVM_D2I_RN : F_MATH_1<"cvt.rni.s32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2i_rn>;
-def INT_NVVM_D2I_RZ : F_MATH_1<"cvt.rzi.s32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2i_rz>;
-def INT_NVVM_D2I_RM : F_MATH_1<"cvt.rmi.s32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2i_rm>;
-def INT_NVVM_D2I_RP : F_MATH_1<"cvt.rpi.s32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2i_rp>;
-
-def INT_NVVM_D2UI_RN : F_MATH_1<"cvt.rni.u32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2ui_rn>;
-def INT_NVVM_D2UI_RZ : F_MATH_1<"cvt.rzi.u32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2ui_rz>;
-def INT_NVVM_D2UI_RM : F_MATH_1<"cvt.rmi.u32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2ui_rm>;
-def INT_NVVM_D2UI_RP : F_MATH_1<"cvt.rpi.u32.f64 \t$dst, $src0;",
-  Int32Regs, Float64Regs, int_nvvm_d2ui_rp>;
-
-def INT_NVVM_I2D_RN : F_MATH_1<"cvt.rn.f64.s32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_i2d_rn>;
-def INT_NVVM_I2D_RZ : F_MATH_1<"cvt.rz.f64.s32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_i2d_rz>;
-def INT_NVVM_I2D_RM : F_MATH_1<"cvt.rm.f64.s32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_i2d_rm>;
-def INT_NVVM_I2D_RP : F_MATH_1<"cvt.rp.f64.s32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_i2d_rp>;
-
-def INT_NVVM_UI2D_RN : F_MATH_1<"cvt.rn.f64.u32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_ui2d_rn>;
-def INT_NVVM_UI2D_RZ : F_MATH_1<"cvt.rz.f64.u32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_ui2d_rz>;
-def INT_NVVM_UI2D_RM : F_MATH_1<"cvt.rm.f64.u32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_ui2d_rm>;
-def INT_NVVM_UI2D_RP : F_MATH_1<"cvt.rp.f64.u32 \t$dst, $src0;",
-  Float64Regs, Int32Regs, int_nvvm_ui2d_rp>;
-
-def INT_NVVM_F2I_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2i_rn_ftz>;
-def INT_NVVM_F2I_RN : F_MATH_1<"cvt.rni.s32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2i_rn>;
-def INT_NVVM_F2I_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2i_rz_ftz>;
-def INT_NVVM_F2I_RZ : F_MATH_1<"cvt.rzi.s32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2i_rz>;
-def INT_NVVM_F2I_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2i_rm_ftz>;
-def INT_NVVM_F2I_RM : F_MATH_1<"cvt.rmi.s32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2i_rm>;
-def INT_NVVM_F2I_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2i_rp_ftz>;
-def INT_NVVM_F2I_RP : F_MATH_1<"cvt.rpi.s32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2i_rp>;
-
-def INT_NVVM_F2UI_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2ui_rn_ftz>;
-def INT_NVVM_F2UI_RN : F_MATH_1<"cvt.rni.u32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2ui_rn>;
-def INT_NVVM_F2UI_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2ui_rz_ftz>;
-def INT_NVVM_F2UI_RZ : F_MATH_1<"cvt.rzi.u32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2ui_rz>;
-def INT_NVVM_F2UI_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2ui_rm_ftz>;
-def INT_NVVM_F2UI_RM : F_MATH_1<"cvt.rmi.u32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2ui_rm>;
-def INT_NVVM_F2UI_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u32.f32 \t$dst, $src0;",
-  Int32Regs, Float32Regs, int_nvvm_f2ui_rp_ftz>;
-def INT_NVVM_F2UI_RP : F_MATH_1<"cvt.rpi.u32.f32 \t$dst, $src0;", Int32Regs,
-  Float32Regs, int_nvvm_f2ui_rp>;
-
-def INT_NVVM_I2F_RN : F_MATH_1<"cvt.rn.f32.s32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_i2f_rn>;
-def INT_NVVM_I2F_RZ : F_MATH_1<"cvt.rz.f32.s32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_i2f_rz>;
-def INT_NVVM_I2F_RM : F_MATH_1<"cvt.rm.f32.s32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_i2f_rm>;
-def INT_NVVM_I2F_RP : F_MATH_1<"cvt.rp.f32.s32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_i2f_rp>;
-
-def INT_NVVM_UI2F_RN : F_MATH_1<"cvt.rn.f32.u32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_ui2f_rn>;
-def INT_NVVM_UI2F_RZ : F_MATH_1<"cvt.rz.f32.u32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_ui2f_rz>;
-def INT_NVVM_UI2F_RM : F_MATH_1<"cvt.rm.f32.u32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_ui2f_rm>;
-def INT_NVVM_UI2F_RP : F_MATH_1<"cvt.rp.f32.u32 \t$dst, $src0;", Float32Regs,
-  Int32Regs, int_nvvm_ui2f_rp>;
+def : Pat<(int_nvvm_d2f_rn_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>;
+def : Pat<(int_nvvm_d2f_rn Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_d2f_rz_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRZ_FTZ)>;
+def : Pat<(int_nvvm_d2f_rz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_d2f_rm_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRM_FTZ)>;
+def : Pat<(int_nvvm_d2f_rm Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_d2f_rp_ftz Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRP_FTZ)>;
+def : Pat<(int_nvvm_d2f_rp Float64Regs:$a),
+          (CVT_f32_f64 Float64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_d2i_rn Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2i_rz Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2i_rm Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2i_rp Float64Regs:$a),
+          (CVT_s32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ui_rn Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ui_rz Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ui_rm Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ui_rp Float64Regs:$a),
+          (CVT_u32_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2d_rn Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2d_rz Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2d_rm Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2d_rp Int32Regs:$a),
+          (CVT_f64_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2d_rn Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2d_rz Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2d_rm Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2d_rp Int32Regs:$a),
+          (CVT_f64_u32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_f2i_rn_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rn Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2i_rz_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2i_rm_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rm Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2i_rp_ftz Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2i_rp Float32Regs:$a),
+          (CVT_s32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ui_rn_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rn Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ui_rz_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ui_rm_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rm Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ui_rp_ftz Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ui_rp Float32Regs:$a),
+          (CVT_u32_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_i2f_rn Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_i2f_rz Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_i2f_rm Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_i2f_rp Int32Regs:$a),
+          (CVT_f32_s32 Int32Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ui2f_rn Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ui2f_rz Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ui2f_rm Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ui2f_rp Int32Regs:$a),
+          (CVT_f32_u32 Int32Regs:$a, CvtRP)>;
 
 def INT_NVVM_LOHI_I2D : F_MATH_2<"mov.b64 \t$dst, {{$src0, $src1}};",
   Float64Regs, Int32Regs, Int32Regs, int_nvvm_lohi_i2d>;
@@ -687,91 +671,106 @@ def INT_NVVM_D2I_HI : F_MATH_1<!strconcat("{{\n\t",
                            "}}"))),
              Int32Regs, Float64Regs, int_nvvm_d2i_hi>;
 
-def INT_NVVM_F2LL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.s64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ll_rn_ftz>;
-def INT_NVVM_F2LL_RN : F_MATH_1<"cvt.rni.s64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ll_rn>;
-def INT_NVVM_F2LL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.s64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ll_rz_ftz>;
-def INT_NVVM_F2LL_RZ : F_MATH_1<"cvt.rzi.s64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ll_rz>;
-def INT_NVVM_F2LL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.s64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ll_rm_ftz>;
-def INT_NVVM_F2LL_RM : F_MATH_1<"cvt.rmi.s64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ll_rm>;
-def INT_NVVM_F2LL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.s64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ll_rp_ftz>;
-def INT_NVVM_F2LL_RP : F_MATH_1<"cvt.rpi.s64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ll_rp>;
-
-def INT_NVVM_F2ULL_RN_FTZ : F_MATH_1<"cvt.rni.ftz.u64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ull_rn_ftz>;
-def INT_NVVM_F2ULL_RN : F_MATH_1<"cvt.rni.u64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ull_rn>;
-def INT_NVVM_F2ULL_RZ_FTZ : F_MATH_1<"cvt.rzi.ftz.u64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ull_rz_ftz>;
-def INT_NVVM_F2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ull_rz>;
-def INT_NVVM_F2ULL_RM_FTZ : F_MATH_1<"cvt.rmi.ftz.u64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ull_rm_ftz>;
-def INT_NVVM_F2ULL_RM : F_MATH_1<"cvt.rmi.u64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ull_rm>;
-def INT_NVVM_F2ULL_RP_FTZ : F_MATH_1<"cvt.rpi.ftz.u64.f32 \t$dst, $src0;",
-  Int64Regs, Float32Regs, int_nvvm_f2ull_rp_ftz>;
-def INT_NVVM_F2ULL_RP : F_MATH_1<"cvt.rpi.u64.f32 \t$dst, $src0;", Int64Regs,
-  Float32Regs, int_nvvm_f2ull_rp>;
-
-def INT_NVVM_D2LL_RN : F_MATH_1<"cvt.rni.s64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ll_rn>;
-def INT_NVVM_D2LL_RZ : F_MATH_1<"cvt.rzi.s64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ll_rz>;
-def INT_NVVM_D2LL_RM : F_MATH_1<"cvt.rmi.s64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ll_rm>;
-def INT_NVVM_D2LL_RP : F_MATH_1<"cvt.rpi.s64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ll_rp>;
-
-def INT_NVVM_D2ULL_RN : F_MATH_1<"cvt.rni.u64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ull_rn>;
-def INT_NVVM_D2ULL_RZ : F_MATH_1<"cvt.rzi.u64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ull_rz>;
-def INT_NVVM_D2ULL_RM : F_MATH_1<"cvt.rmi.u64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ull_rm>;
-def INT_NVVM_D2ULL_RP : F_MATH_1<"cvt.rpi.u64.f64 \t$dst, $src0;", Int64Regs,
-  Float64Regs, int_nvvm_d2ull_rp>;
-
-def INT_NVVM_LL2F_RN : F_MATH_1<"cvt.rn.f32.s64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ll2f_rn>;
-def INT_NVVM_LL2F_RZ : F_MATH_1<"cvt.rz.f32.s64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ll2f_rz>;
-def INT_NVVM_LL2F_RM : F_MATH_1<"cvt.rm.f32.s64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ll2f_rm>;
-def INT_NVVM_LL2F_RP : F_MATH_1<"cvt.rp.f32.s64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ll2f_rp>;
-def INT_NVVM_ULL2F_RN : F_MATH_1<"cvt.rn.f32.u64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ull2f_rn>;
-def INT_NVVM_ULL2F_RZ : F_MATH_1<"cvt.rz.f32.u64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ull2f_rz>;
-def INT_NVVM_ULL2F_RM : F_MATH_1<"cvt.rm.f32.u64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ull2f_rm>;
-def INT_NVVM_ULL2F_RP : F_MATH_1<"cvt.rp.f32.u64 \t$dst, $src0;", Float32Regs,
-  Int64Regs, int_nvvm_ull2f_rp>;
-
-def INT_NVVM_LL2D_RN : F_MATH_1<"cvt.rn.f64.s64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ll2d_rn>;
-def INT_NVVM_LL2D_RZ : F_MATH_1<"cvt.rz.f64.s64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ll2d_rz>;
-def INT_NVVM_LL2D_RM : F_MATH_1<"cvt.rm.f64.s64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ll2d_rm>;
-def INT_NVVM_LL2D_RP : F_MATH_1<"cvt.rp.f64.s64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ll2d_rp>;
-def INT_NVVM_ULL2D_RN : F_MATH_1<"cvt.rn.f64.u64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ull2d_rn>;
-def INT_NVVM_ULL2D_RZ : F_MATH_1<"cvt.rz.f64.u64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ull2d_rz>;
-def INT_NVVM_ULL2D_RM : F_MATH_1<"cvt.rm.f64.u64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ull2d_rm>;
-def INT_NVVM_ULL2D_RP : F_MATH_1<"cvt.rp.f64.u64 \t$dst, $src0;", Float64Regs,
-  Int64Regs, int_nvvm_ull2d_rp>;
+def : Pat<(int_nvvm_f2ll_rn_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rn Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ll_rz_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ll_rm_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rm Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ll_rp_ftz Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ll_rp Float32Regs:$a),
+          (CVT_s64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_f2ull_rn_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRNI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rn Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_f2ull_rz_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_f2ull_rm_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRMI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rm Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_f2ull_rp_ftz Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRPI_FTZ)>;
+def : Pat<(int_nvvm_f2ull_rp Float32Regs:$a),
+          (CVT_u64_f32 Float32Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ll_rn Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ll_rz Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ll_rm Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ll_rp Float64Regs:$a),
+          (CVT_s64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_d2ull_rn Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRNI)>;
+def : Pat<(int_nvvm_d2ull_rz Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRZI)>;
+def : Pat<(int_nvvm_d2ull_rm Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRMI)>;
+def : Pat<(int_nvvm_d2ull_rp Float64Regs:$a),
+          (CVT_u64_f64 Float64Regs:$a, CvtRPI)>;
+
+def : Pat<(int_nvvm_ll2f_rn Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2f_rz Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2f_rm Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2f_rp Int64Regs:$a),
+          (CVT_f32_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2f_rn Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2f_rz Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2f_rm Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2f_rp Int64Regs:$a),
+          (CVT_f32_u64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ll2d_rn Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ll2d_rz Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ll2d_rm Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ll2d_rp Int64Regs:$a),
+          (CVT_f64_s64 Int64Regs:$a, CvtRP)>;
+
+def : Pat<(int_nvvm_ull2d_rn Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRN)>;
+def : Pat<(int_nvvm_ull2d_rz Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRZ)>;
+def : Pat<(int_nvvm_ull2d_rm Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRM)>;
+def : Pat<(int_nvvm_ull2d_rp Int64Regs:$a),
+          (CVT_f64_u64 Int64Regs:$a, CvtRP)>;
+
+
+// FIXME: Ideally, we could use these patterns instead of the scope-creating
+// patterns, but ptxas does not like these since .s16 is not compatible with
+// .f16.  The solution is to use .bXX for all integer register types, but we
+// are not there yet.
+//def : Pat<(int_nvvm_f2h_rn_ftz Float32Regs:$a),
+//          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>;
+//def : Pat<(int_nvvm_f2h_rn Float32Regs:$a),
+//          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+//
+//def : Pat<(int_nvvm_h2f Int16Regs:$a),
+//          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
 
 def INT_NVVM_F2H_RN_FTZ : F_MATH_1<!strconcat("{{\n\t",
                                    !strconcat(".reg .b16 %temp;\n\t",
@@ -793,6 +792,13 @@ def INT_NVVM_H2F : F_MATH_1<!strconcat("{{\n\t",
             "}}")))),
           Float32Regs, Int16Regs, int_nvvm_h2f>;
 
+def : Pat<(f32 (f16_to_f32 Int16Regs:$a)),
+          (CVT_f32_f16 Int16Regs:$a, CvtNONE)>;
+def : Pat<(i16 (f32_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
+def : Pat<(i16 (f32_to_f16 Float32Regs:$a)),
+          (CVT_f16_f32 Float32Regs:$a, CvtRN)>;
+
 //
 // Bitcast
 //
@@ -1270,6 +1276,11 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs,
 // Support for ldu on sm_20 or later
 //-----------------------------------
 
+def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{
+  MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N);
+  return M->getMemoryVT() == MVT::i8;
+}]>;
+
 // Scalar
 // @TODO: Revisit this, Changed imemAny to imem
 multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
@@ -1291,8 +1302,27 @@ multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> {
          [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
 }
 
-defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];",  Int8Regs,
-int_nvvm_ldu_global_i>;
+multiclass LDU_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> {
+  def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>;
+  def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>;
+ def avar:  NVPTXInst<(outs regclass:$result), (ins imem:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>,
+         Requires<[hasLDU]>;
+ def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>;
+ def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
+               !strconcat("ldu.global.", TyStr),
+         [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>;
+}
+
+defm INT_PTX_LDU_GLOBAL_i8  : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs,
+                                             ldu_i8>;
 defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs,
 int_nvvm_ldu_global_i>;
 defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs,
@@ -1312,25 +1342,43 @@ int_nvvm_ldu_global_p>;
 
 // Elementized vector ldu
 multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
- def _32:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
-   (ins Int32Regs:$src),
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int32Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins Int64Regs:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri:$src),
                      !strconcat("ldu.global.", TyStr), []>;
- def _64:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
-   (ins Int64Regs:$src),
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri64:$src),
+                     !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src),
                      !strconcat("ldu.global.", TyStr), []>;
 }
 
-multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
- def _32:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
-     regclass:$dst4), (ins Int32Regs:$src),
+multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { 
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins Int32Regs:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins Int64Regs:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins MEMri:$src), 
                !strconcat("ldu.global.", TyStr), []>;
- def _64:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
-     regclass:$dst4), (ins Int64Regs:$src),
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins MEMri64:$src), 
+               !strconcat("ldu.global.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins imemAny:$src), 
                !strconcat("ldu.global.", TyStr), []>;
 }
 
 defm INT_PTX_LDU_G_v2i8_ELE
-  : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int8Regs>;
+  : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
 defm INT_PTX_LDU_G_v2i16_ELE
   : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
 defm INT_PTX_LDU_G_v2i32_ELE
@@ -1342,7 +1390,7 @@ defm INT_PTX_LDU_G_v2i64_ELE
 defm INT_PTX_LDU_G_v2f64_ELE
   : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
 defm INT_PTX_LDU_G_v4i8_ELE
-  : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int8Regs>;
+  : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
 defm INT_PTX_LDU_G_v4i16_ELE
   : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
     Int16Regs>;
@@ -1422,20 +1470,38 @@ defm INT_PTX_LDG_GLOBAL_p64
 
 // Elementized vector ldg 
 multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
- def _32:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                      (ins Int32Regs:$src),
                      !strconcat("ld.global.nc.", TyStr), []>;
- def _64:     NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                      (ins Int64Regs:$src),
                      !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins MEMri64:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
+ def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src),
+                     !strconcat("ld.global.nc.", TyStr), []>;
 }
 
 multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { 
- def _32:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
-                        regclass:$dst3, regclass:$dst4), (ins Int32Regs:$src),
+  def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins Int32Regs:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                               regclass:$dst4), (ins Int64Regs:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins MEMri:$src), 
                !strconcat("ld.global.nc.", TyStr), []>;
- def _64:    NVPTXInst<(outs regclass:$dst1, regclass:$dst2,
-                        regclass:$dst3, regclass:$dst4), (ins Int64Regs:$src),
+  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                              regclass:$dst4), (ins MEMri64:$src), 
+               !strconcat("ld.global.nc.", TyStr), []>;
+  def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                             regclass:$dst4), (ins imemAny:$src), 
                !strconcat("ld.global.nc.", TyStr), []>;
 }
 
@@ -1542,10 +1608,6 @@ def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result),
 
 
 // nvvm.move intrinsicc
-def nvvm_move_i8 : NVPTXInst<(outs Int8Regs:$r), (ins Int8Regs:$s),
-                             "mov.b16 \t$r, $s;",
-                             [(set Int8Regs:$r,
-                               (int_nvvm_move_i8 Int8Regs:$s))]>;
 def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s),
                              "mov.b16 \t$r, $s;",
                              [(set Int16Regs:$r,
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
new file mode 100644
index 0000000..ca24764
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -0,0 +1,46 @@
+//===-- NVPTXMCExpr.cpp - NVPTX specific MC expression classes ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "nvptx-mcexpr"
+#include "NVPTXMCExpr.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+using namespace llvm;
+
+const NVPTXFloatMCExpr*
+NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
+  return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
+}
+
+void NVPTXFloatMCExpr::PrintImpl(raw_ostream &OS) const {
+  bool Ignored;
+  unsigned NumHex;
+  APFloat APF = getAPFloat();
+
+  switch (Kind) {
+  default: llvm_unreachable("Invalid kind!");
+  case VK_NVPTX_SINGLE_PREC_FLOAT:
+    OS << "0f";
+    NumHex = 8;
+    APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  case VK_NVPTX_DOUBLE_PREC_FLOAT:
+    OS << "0d";
+    NumHex = 16;
+    APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Ignored);
+    break;
+  }
+
+  APInt API = APF.bitcastToAPInt();
+  std::string HexStr(utohexstr(API.getZExtValue()));
+  if (HexStr.length() < NumHex)
+    OS << std::string(NumHex - HexStr.length(), '0');
+  OS << utohexstr(API.getZExtValue());
+}
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
new file mode 100644
index 0000000..0efb231
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -0,0 +1,83 @@
+//===-- NVPTXMCExpr.h - NVPTX specific MC expression classes ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// Modeled after ARMMCExpr
+
+#ifndef NVPTXMCEXPR_H
+#define NVPTXMCEXPR_H
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class NVPTXFloatMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_NVPTX_None,
+    VK_NVPTX_SINGLE_PREC_FLOAT,   // FP constant in single-precision
+    VK_NVPTX_DOUBLE_PREC_FLOAT    // FP constant in double-precision
+  };
+
+private:
+  const VariantKind Kind;
+  const APFloat Flt;
+
+  explicit NVPTXFloatMCExpr(VariantKind _Kind, APFloat _Flt)
+    : Kind(_Kind), Flt(_Flt) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const NVPTXFloatMCExpr *Create(VariantKind Kind, APFloat Flt,
+                                        MCContext &Ctx);
+
+  static const NVPTXFloatMCExpr *CreateConstantFPSingle(APFloat Flt,
+                                                        MCContext &Ctx) {
+    return Create(VK_NVPTX_SINGLE_PREC_FLOAT, Flt, Ctx);
+  }
+
+  static const NVPTXFloatMCExpr *CreateConstantFPDouble(APFloat Flt,
+                                                        MCContext &Ctx) {
+    return Create(VK_NVPTX_DOUBLE_PREC_FLOAT, Flt, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  APFloat getAPFloat() const { return Flt; }
+
+/// @}
+
+  void PrintImpl(raw_ostream &OS) const;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const {
+    return false;
+  }
+  void AddValueSymbols(MCAssembler *) const {};
+  const MCSection *FindAssociatedSection() const {
+    return NULL;
+  }
+
+  // There are no TLS NVPTXMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
new file mode 100644
index 0000000..843ebed
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -0,0 +1,225 @@
+//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. Most code
+// here is a copy of PrologEpilogInserter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXPrologEpilogPass : public MachineFunctionPass {
+public:
+  static char ID;
+  NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+  void calculateFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
+  return new NVPTXPrologEpilogPass();
+}
+
+char NVPTXPrologEpilogPass::ID = 0;
+
+bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
+  const TargetFrameLowering &TFI = *TM.getFrameLowering();
+  const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+  bool Modified = false;
+
+  calculateFrameObjectOffsets(MF);
+
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+      MachineInstr *MI = I;
+      for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+        if (!MI->getOperand(i).isFI())
+          continue;
+        TRI.eliminateFrameIndex(MI, 0, i, NULL);
+        Modified = true;
+      }
+    }
+  }
+
+  // Add function prolog/epilog
+  TFI.emitPrologue(MF);
+
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+    // If last instruction is a return instruction, add an epilogue
+    if (!I->empty() && I->back().isReturn())
+      TFI.emitEpilogue(MF, *I);
+  }
+
+  return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI->getObjectSize(FrameIdx);
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+    MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+    MFI->setObjectOffset(FrameIdx, Offset);
+    Offset += MFI->getObjectSize(FrameIdx);
+  }
+}
+
+void
+NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+
+  bool StackGrowsDown =
+    TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0
+         && "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of local area.
+  // We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of last fixed sized
+  // preallocated object.
+  for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at lower address of
+      // the object -- which is given by offset. For down growing stack
+      // the offset is negative, so we negate the offset to get the distance.
+      FixedOff = -MFI->getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // NOTE: We do not have a call stack
+
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // No scavenger
+
+  // FIXME: Once this is working, then enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI->getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI->getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+      DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+            FIOffset << "]\n");
+      MFI->setObjectOffset(Entry.first, FIOffset);
+    }
+    // Allocate the local block
+    Offset += MFI->getLocalFrameSize();
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  // No stack protector
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+    if (MFI->isObjectPreAllocated(i) &&
+        MFI->getUseLocalStackAllocationBlock())
+      continue;
+    if (MFI->isDeadObjectIndex(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // No scavenger
+
+  if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+      Offset += MFI->getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment.  If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative to
+    // SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  int64_t StackSize = Offset - LocalAreaOffset;
+  MFI->setStackSize(StackSize);
+}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 2824653..4d3a1d9 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -38,10 +38,6 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
     return ".s32";
   } else if (RC == &NVPTX::Int16RegsRegClass) {
     return ".s16";
-  }
-      // Int8Regs become 16-bit registers in PTX
-      else if (RC == &NVPTX::Int8RegsRegClass) {
-    return ".s16";
   } else if (RC == &NVPTX::Int1RegsRegClass) {
     return ".pred";
   } else if (RC == &NVPTX::SpecialRegsRegClass) {
@@ -57,15 +53,13 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
     return "%f";
   }
   if (RC == &NVPTX::Float64RegsRegClass) {
-    return "%fd";
+    return "%fl";
   } else if (RC == &NVPTX::Int64RegsRegClass) {
-    return "%rd";
+    return "%rl";
   } else if (RC == &NVPTX::Int32RegsRegClass) {
     return "%r";
   } else if (RC == &NVPTX::Int16RegsRegClass) {
     return "%rs";
-  } else if (RC == &NVPTX::Int8RegsRegClass) {
-    return "%rc";
   } else if (RC == &NVPTX::Int1RegsRegClass) {
     return "%p";
   } else if (RC == &NVPTX::SpecialRegsRegClass) {
@@ -77,8 +71,7 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
 }
 }
 
-NVPTXRegisterInfo::NVPTXRegisterInfo(const TargetInstrInfo &tii,
-                                     const NVPTXSubtarget &st)
+NVPTXRegisterInfo::NVPTXRegisterInfo(const NVPTXSubtarget &st)
     : NVPTXGenRegisterInfo(0), Is64Bit(st.is64Bit()) {}
 
 #define GET_REGINFO_TARGET_DESC
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h
index d406820..0a20f29 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -35,7 +35,7 @@ private:
   ManagedStringPool ManagedStrPool;
 
 public:
-  NVPTXRegisterInfo(const TargetInstrInfo &tii, const NVPTXSubtarget &st);
+  NVPTXRegisterInfo(const NVPTXSubtarget &st);
 
   //------------------------------------------------------
   // Pure virtual functions from TargetRegisterInfo
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index 8d100d6..7a38a66 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -29,9 +29,10 @@ def VRFrameLocal    : NVPTXReg<"%SPL">;
 // Special Registers used as the stack
 def VRDepot  : NVPTXReg<"%Depot">;
 
-foreach i = 0-395 in {
+// We use virtual registers, but define a few physical registers here to keep
+// SDAG and the MachineInstr layers happy.
+foreach i = 0-4 in {
   def P#i  : NVPTXReg<"%p"#i>;  // Predicate
-  def RC#i : NVPTXReg<"%rc"#i>; // 8-bit
   def RS#i : NVPTXReg<"%rs"#i>; // 16-bit
   def R#i  : NVPTXReg<"%r"#i>;  // 32-bit
   def RL#i : NVPTXReg<"%rl"#i>; // 64-bit
@@ -48,17 +49,16 @@ foreach i = 0-395 in {
 //===----------------------------------------------------------------------===//
 //  Register classes
 //===----------------------------------------------------------------------===//
-def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 395))>;
-def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%u", 0, 395))>;
-def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 395))>;
-def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 395))>;
-def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 395))>;
-def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 395))>;
-def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 395))>;
-def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 395))>;
-def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 395))>;
-def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 395))>;
-def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 395))>;
+def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
+def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
+def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
+def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
+def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
+def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
+def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
+def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;
+def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
+def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;
 
 // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
 def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRDepot)>;
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index e57ace9..f8a692e 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -24,10 +24,10 @@ namespace llvm {
 /// the ASMPrint interface.
 ///
 class NVPTXSection : public MCSection {
-
+  virtual void anchor();
 public:
   NVPTXSection(SectionVariant V, SectionKind K) : MCSection(V, K) {}
-  ~NVPTXSection() {}
+  virtual ~NVPTXSection() {}
 
   /// Override this as NVPTX has its own way of printing switching
   /// to a section.
diff --git a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
index 83dfe12..b64c308 100644
--- a/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
+++ b/lib/Target/NVPTX/NVPTXSplitBBatBar.cpp
@@ -36,7 +36,7 @@ bool NVPTXSplitBBatBar::runOnFunction(Function &F) {
     BasicBlock::iterator II = IB;
     BasicBlock::iterator IE = BI->end();
 
-    // Skit the first intruction. No splitting is needed at this
+    // Skit the first instruction. No splitting is needed at this
     // point even if this is a bar.
     while (II != IE) {
       if (IntrinsicInst *inst = dyn_cast<IntrinsicInst>(II)) {
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 2dcd73d..9771a17 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -19,23 +19,21 @@
 
 using namespace llvm;
 
-// Select Driver Interface
-#include "llvm/Support/CommandLine.h"
-namespace {
-cl::opt<NVPTX::DrvInterface> DriverInterface(
-    cl::desc("Choose driver interface:"),
-    cl::values(clEnumValN(NVPTX::NVCL, "drvnvcl", "Nvidia OpenCL driver"),
-               clEnumValN(NVPTX::CUDA, "drvcuda", "Nvidia CUDA driver"),
-               clEnumValN(NVPTX::TEST, "drvtest", "Plain Test"), clEnumValEnd),
-    cl::init(NVPTX::NVCL));
-}
+
+// Pin the vtable to this file.
+void NVPTXSubtarget::anchor() {}
 
 NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
                                const std::string &FS, bool is64Bit)
     : NVPTXGenSubtargetInfo(TT, CPU, FS), Is64Bit(is64Bit), PTXVersion(0),
       SmVersion(20) {
 
-  drvInterface = DriverInterface;
+  Triple T(TT);
+
+  if (T.getOS() == Triple::NVCL)
+    drvInterface = NVPTX::NVCL;
+  else
+    drvInterface = NVPTX::CUDA;
 
   // Provide the default CPU if none
   std::string defCPU = "sm_20";
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 670077d..004be11 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -25,7 +25,7 @@
 namespace llvm {
 
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
-
+  virtual void anchor();
   std::string TargetName;
   NVPTX::DrvInterface drvInterface;
   bool Is64Bit;
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 1ae2a7c..46edd6d 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -57,9 +57,6 @@ extern "C" void LLVMInitializeNVPTXTarget() {
   RegisterTargetMachine<NVPTXTargetMachine32> X(TheNVPTXTarget32);
   RegisterTargetMachine<NVPTXTargetMachine64> Y(TheNVPTXTarget64);
 
-  RegisterMCAsmInfo<NVPTXMCAsmInfo> A(TheNVPTXTarget32);
-  RegisterMCAsmInfo<NVPTXMCAsmInfo> B(TheNVPTXTarget64);
-
   // FIXME: This pass is really intended to be invoked during IR optimization,
   // but it's very NVPTX-specific.
   initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
@@ -74,7 +71,9 @@ NVPTXTargetMachine::NVPTXTargetMachine(
       Subtarget(TT, CPU, FS, is64bit), DL(Subtarget.getDataLayout()),
       InstrInfo(*this), TLInfo(*this), TSInfo(*this),
       FrameLowering(
-          *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {}
+          *this, is64bit) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
+  initAsmInfo();
+}
 
 void NVPTXTargetMachine32::anchor() {}
 
@@ -92,7 +91,7 @@ NVPTXTargetMachine64::NVPTXTargetMachine64(
     CodeGenOpt::Level OL)
     : NVPTXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {}
 
-namespace llvm {
+namespace {
 class NVPTXPassConfig : public TargetPassConfig {
 public:
   NVPTXPassConfig(NVPTXTargetMachine *TM, PassManagerBase &PM)
@@ -105,8 +104,13 @@ public:
   virtual void addIRPasses();
   virtual bool addInstSelector();
   virtual bool addPreRegAlloc();
+  virtual bool addPostRegAlloc();
+
+  virtual FunctionPass *createTargetRegisterAllocator(bool) LLVM_OVERRIDE;
+  virtual void addFastRegAlloc(FunctionPass *RegAllocPass);
+  virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
 };
-}
+} // end anonymous namespace
 
 TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
   NVPTXPassConfig *PassConfig = new NVPTXPassConfig(this, PM);
@@ -114,6 +118,16 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
 }
 
 void NVPTXPassConfig::addIRPasses() {
+  // The following passes are known to not play well with virtual regs hanging
+  // around after register allocation (which in our case, is *all* registers).
+  // We explicitly disable them here.  We do, however, need some functionality
+  // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+  // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+  disablePass(&PrologEpilogCodeInserterID);
+  disablePass(&MachineCopyPropagationID);
+  disablePass(&BranchFolderPassID);
+  disablePass(&TailDuplicateID);
+
   TargetPassConfig::addIRPasses();
   addPass(createGenericToNVVMPass());
 }
@@ -127,3 +141,41 @@ bool NVPTXPassConfig::addInstSelector() {
 }
 
 bool NVPTXPassConfig::addPreRegAlloc() { return false; }
+bool NVPTXPassConfig::addPostRegAlloc() {
+  addPass(createNVPTXPrologEpilogPass());
+  return false;
+}
+
+FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
+  return 0; // No reg alloc
+}
+
+void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+  assert(!RegAllocPass && "NVPTX uses no regalloc!");
+  addPass(&PHIEliminationID);
+  addPass(&TwoAddressInstructionPassID);
+}
+
+void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+  assert(!RegAllocPass && "NVPTX uses no regalloc!");
+
+  addPass(&ProcessImplicitDefsID);
+  addPass(&LiveVariablesID);
+  addPass(&MachineLoopInfoID);
+  addPass(&PHIEliminationID);
+
+  addPass(&TwoAddressInstructionPassID);
+  addPass(&RegisterCoalescerID);
+
+  // PreRA instruction scheduling.
+  if (addPass(&MachineSchedulerID))
+    printAndVerify("After Machine Scheduling");
+
+
+  addPass(&StackSlotColoringID);
+
+  // FIXME: Needs physical registers
+  //addPass(&PostRAMachineLICMID);
+
+  printAndVerify("After StackSlotColoring");
+}
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 6ab0e08..2a7394b 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -21,31 +21,33 @@ class Module;
 class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
 
 public:
-  NVPTXTargetObjectFile() {}
-  ~NVPTXTargetObjectFile() {
-    delete TextSection;
-    delete DataSection;
-    delete BSSSection;
-    delete ReadOnlySection;
+  NVPTXTargetObjectFile() {
+    TextSection = 0;
+    DataSection = 0;
+    BSSSection = 0;
+    ReadOnlySection = 0;
 
-    delete StaticCtorSection;
-    delete StaticDtorSection;
-    delete LSDASection;
-    delete EHFrameSection;
-    delete DwarfAbbrevSection;
-    delete DwarfInfoSection;
-    delete DwarfLineSection;
-    delete DwarfFrameSection;
-    delete DwarfPubTypesSection;
-    delete DwarfDebugInlineSection;
-    delete DwarfStrSection;
-    delete DwarfLocSection;
-    delete DwarfARangesSection;
-    delete DwarfRangesSection;
-    delete DwarfMacroInfoSection;
+    StaticCtorSection = 0;
+    StaticDtorSection = 0;
+    LSDASection = 0;
+    EHFrameSection = 0;
+    DwarfAbbrevSection = 0;
+    DwarfInfoSection = 0;
+    DwarfLineSection = 0;
+    DwarfFrameSection = 0;
+    DwarfPubTypesSection = 0;
+    DwarfDebugInlineSection = 0;
+    DwarfStrSection = 0;
+    DwarfLocSection = 0;
+    DwarfARangesSection = 0;
+    DwarfRangesSection = 0;
+    DwarfMacroInfoSection = 0;
   }
 
+  virtual ~NVPTXTargetObjectFile();
+
   virtual void Initialize(MCContext &ctx, const TargetMachine &TM) {
+    TargetLoweringObjectFile::Initialize(ctx, TM);
     TextSection = new NVPTXSection(MCSection::SV_ELF, SectionKind::getText());
     DataSection =
         new NVPTXSection(MCSection::SV_ELF, SectionKind::getDataRel());
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 3cc324b..7406207 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -79,7 +79,7 @@ ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
 }
 
 static cl::opt<bool>
-NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true),
+NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::Hidden,
                    cl::desc("NVVM reflection, enabled by default"));
 
 char NVVMReflect::ID = 0;
@@ -88,7 +88,7 @@ INITIALIZE_PASS(NVVMReflect, "nvvm-reflect",
                 false)
 
 static cl::list<std::string>
-ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"),
+ReflectList("nvvm-reflect-list", cl::value_desc("name=<int>"), cl::Hidden,
             cl::desc("A list of string=num assignments"),
             cl::ValueRequired);
 
diff --git a/lib/Target/PowerPC/AsmParser/LLVMBuild.txt b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
index bd08c13..02ebf1d 100644
--- a/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
+++ b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = PowerPCAsmParser
 parent = PowerPC
-required_libraries = PowerPCInfo MC MCParser Support
+required_libraries = PowerPCDesc PowerPCInfo MC MCParser Support
 add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index f2cb8b8..fe83fe1 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -8,15 +8,18 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCMCExpr.h"
 #include "llvm/MC/MCTargetAsmParser.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -106,11 +109,73 @@ static unsigned CRRegs[8] = {
   PPC::CR4, PPC::CR5, PPC::CR6, PPC::CR7
 };
 
+// Evaluate an expression containing condition register
+// or condition register field symbols.  Returns positive
+// value on success, or -1 on error.
+static int64_t
+EvaluateCRExpr(const MCExpr *E) {
+  switch (E->getKind()) {
+  case MCExpr::Target:
+    return -1;
+
+  case MCExpr::Constant: {
+    int64_t Res = cast<MCConstantExpr>(E)->getValue();
+    return Res < 0 ? -1 : Res;
+  }
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    StringRef Name = SRE->getSymbol().getName();
+
+    if (Name == "lt") return 0;
+    if (Name == "gt") return 1;
+    if (Name == "eq") return 2;
+    if (Name == "so") return 3;
+    if (Name == "un") return 3;
+
+    if (Name == "cr0") return 0;
+    if (Name == "cr1") return 1;
+    if (Name == "cr2") return 2;
+    if (Name == "cr3") return 3;
+    if (Name == "cr4") return 4;
+    if (Name == "cr5") return 5;
+    if (Name == "cr6") return 6;
+    if (Name == "cr7") return 7;
+
+    return -1;
+  }
+
+  case MCExpr::Unary:
+    return -1;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    int64_t LHSVal = EvaluateCRExpr(BE->getLHS());
+    int64_t RHSVal = EvaluateCRExpr(BE->getRHS());
+    int64_t Res;
+
+    if (LHSVal < 0 || RHSVal < 0)
+      return -1;
+
+    switch (BE->getOpcode()) {
+    default: return -1;
+    case MCBinaryExpr::Add: Res = LHSVal + RHSVal; break;
+    case MCBinaryExpr::Mul: Res = LHSVal * RHSVal; break;
+    }
+
+    return Res < 0 ? -1 : Res;
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
 struct PPCOperand;
 
 class PPCAsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  const MCInstrInfo &MII;
   bool IsPPC64;
 
   MCAsmParser &getParser() const { return Parser; }
@@ -126,10 +191,16 @@ class PPCAsmParser : public MCTargetAsmParser {
 
   virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
 
+  const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
+                                        PPCMCExpr::VariantKind &Variant);
+  const MCExpr *FixupVariantKind(const MCExpr *E);
+  bool ParseExpression(const MCExpr *&EVal);
+
   bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
   bool ParseDirectiveWord(unsigned Size, SMLoc L);
   bool ParseDirectiveTC(unsigned Size, SMLoc L);
+  bool ParseDirectiveMachine(SMLoc L);
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
@@ -149,11 +220,13 @@ class PPCAsmParser : public MCTargetAsmParser {
 
 
 public:
-  PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
-    : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
+  PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
+               const MCInstrInfo &_MII)
+      : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) {
     // Check for 64-bit vs. 32-bit pointer mode.
     Triple TheTriple(STI.getTargetTriple());
-    IsPPC64 = TheTriple.getArch() == Triple::ppc64;
+    IsPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+               TheTriple.getArch() == Triple::ppc64le);
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
@@ -163,6 +236,12 @@ public:
                                 SmallVectorImpl<MCParsedAsmOperand*> &Operands);
 
   virtual bool ParseDirective(AsmToken DirectiveID);
+
+  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
+
+  virtual const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                            MCSymbolRefExpr::VariantKind,
+                                            MCContext &Ctx);
 };
 
 /// PPCOperand - Instances of this class represent a parsed PowerPC machine
@@ -171,7 +250,8 @@ struct PPCOperand : public MCParsedAsmOperand {
   enum KindTy {
     Token,
     Immediate,
-    Expression
+    Expression,
+    TLSRegister
   } Kind;
 
   SMLoc StartLoc, EndLoc;
@@ -188,12 +268,18 @@ struct PPCOperand : public MCParsedAsmOperand {
 
   struct ExprOp {
     const MCExpr *Val;
+    int64_t CRVal;     // Cached result of EvaluateCRExpr(Val)
+  };
+
+  struct TLSRegOp {
+    const MCSymbolRefExpr *Sym;
   };
 
   union {
     struct TokOp Tok;
     struct ImmOp Imm;
     struct ExprOp Expr;
+    struct TLSRegOp TLSReg;
   };
 
   PPCOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -213,6 +299,9 @@ public:
     case Expression:
       Expr = o.Expr;
       break;
+    case TLSRegister:
+      TLSReg = o.TLSReg;
+      break;
     }
   }
 
@@ -235,6 +324,16 @@ public:
     return Expr.Val;
   }
 
+  int64_t getExprCRVal() const {
+    assert(Kind == Expression && "Invalid access!");
+    return Expr.CRVal;
+  }
+
+  const MCExpr *getTLSReg() const {
+    assert(Kind == TLSRegister && "Invalid access!");
+    return TLSReg.Sym;
+  }
+
   unsigned getReg() const {
     assert(isRegNumber() && "Invalid access!");
     return (unsigned) Imm.Val;
@@ -242,12 +341,17 @@ public:
 
   unsigned getCCReg() const {
     assert(isCCRegNumber() && "Invalid access!");
-    return (unsigned) Imm.Val;
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
+  }
+
+  unsigned getCRBit() const {
+    assert(isCRBitNumber() && "Invalid access!");
+    return (unsigned) (Kind == Immediate ? Imm.Val : Expr.CRVal);
   }
 
   unsigned getCRBitMask() const {
     assert(isCRBitMask() && "Invalid access!");
-    return 7 - CountTrailingZeros_32(Imm.Val);
+    return 7 - countTrailingZeros<uint64_t>(Imm.Val);
   }
 
   bool isToken() const { return Kind == Token; }
@@ -262,9 +366,24 @@ public:
   bool isS16ImmX4() const { return Kind == Expression ||
                                    (Kind == Immediate && isInt<16>(getImm()) &&
                                     (getImm() & 3) == 0); }
+  bool isS17Imm() const { return Kind == Expression ||
+                                 (Kind == Immediate && isInt<17>(getImm())); }
+  bool isTLSReg() const { return Kind == TLSRegister; }
+  bool isDirectBr() const { return Kind == Expression ||
+                                   (Kind == Immediate && isInt<26>(getImm()) &&
+                                    (getImm() & 3) == 0); }
+  bool isCondBr() const { return Kind == Expression ||
+                                 (Kind == Immediate && isInt<16>(getImm()) &&
+                                  (getImm() & 3) == 0); }
   bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
-  bool isCCRegNumber() const { return Kind == Immediate &&
-                                      isUInt<3>(getImm()); }
+  bool isCCRegNumber() const { return (Kind == Expression
+                                       && isUInt<3>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<3>(getImm())); }
+  bool isCRBitNumber() const { return (Kind == Expression
+                                       && isUInt<5>(getExprCRVal())) ||
+                                      (Kind == Immediate
+                                       && isUInt<5>(getImm())); }
   bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
                                     isPowerOf2_32(getImm()); }
   bool isMem() const { return false; }
@@ -325,7 +444,7 @@ public:
 
   void addRegCRBITRCOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(CRBITRegs[getReg()]));
+    Inst.addOperand(MCOperand::CreateReg(CRBITRegs[getCRBit()]));
   }
 
   void addRegCRRCOperands(MCInst &Inst, unsigned N) const {
@@ -346,20 +465,17 @@ public:
       Inst.addOperand(MCOperand::CreateExpr(getExpr()));
   }
 
-  void addDispRIOperands(MCInst &Inst, unsigned N) const {
+  void addBranchTargetOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     if (Kind == Immediate)
-      Inst.addOperand(MCOperand::CreateImm(getImm()));
+      Inst.addOperand(MCOperand::CreateImm(getImm() / 4));
     else
       Inst.addOperand(MCOperand::CreateExpr(getExpr()));
   }
 
-  void addDispRIXOperands(MCInst &Inst, unsigned N) const {
+  void addTLSRegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    if (Kind == Immediate)
-      Inst.addOperand(MCOperand::CreateImm(getImm() / 4));
-    else
-      Inst.addOperand(MCOperand::CreateExpr(getExpr()));
+    Inst.addOperand(MCOperand::CreateExpr(getTLSReg()));
   }
 
   StringRef getToken() const {
@@ -379,6 +495,20 @@ public:
     return Op;
   }
 
+  static PPCOperand *CreateTokenWithStringCopy(StringRef Str, SMLoc S,
+                                               bool IsPPC64) {
+    // Allocate extra memory for the string and copy it.
+    void *Mem = ::operator new(sizeof(PPCOperand) + Str.size());
+    PPCOperand *Op = new (Mem) PPCOperand(Token);
+    Op->Tok.Data = (const char *)(Op + 1);
+    Op->Tok.Length = Str.size();
+    std::memcpy((char *)(Op + 1), Str.data(), Str.size());
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
   static PPCOperand *CreateImm(int64_t Val, SMLoc S, SMLoc E, bool IsPPC64) {
     PPCOperand *Op = new PPCOperand(Immediate);
     Op->Imm.Val = Val;
@@ -392,11 +522,34 @@ public:
                                 SMLoc S, SMLoc E, bool IsPPC64) {
     PPCOperand *Op = new PPCOperand(Expression);
     Op->Expr.Val = Val;
+    Op->Expr.CRVal = EvaluateCRExpr(Val);
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    Op->IsPPC64 = IsPPC64;
+    return Op;
+  }
+
+  static PPCOperand *CreateTLSReg(const MCSymbolRefExpr *Sym,
+                                  SMLoc S, SMLoc E, bool IsPPC64) {
+    PPCOperand *Op = new PPCOperand(TLSRegister);
+    Op->TLSReg.Sym = Sym;
     Op->StartLoc = S;
     Op->EndLoc = E;
     Op->IsPPC64 = IsPPC64;
     return Op;
   }
+
+  static PPCOperand *CreateFromMCExpr(const MCExpr *Val,
+                                      SMLoc S, SMLoc E, bool IsPPC64) {
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Val))
+      return CreateImm(CE->getValue(), S, E, IsPPC64);
+
+    if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Val))
+      if (SRE->getKind() == MCSymbolRefExpr::VK_PPC_TLS)
+        return CreateTLSReg(SRE, S, E, IsPPC64);
+
+    return CreateExpr(Val, S, E, IsPPC64);
+  }
 };
 
 } // end anonymous namespace.
@@ -412,6 +565,9 @@ void PPCOperand::print(raw_ostream &OS) const {
   case Expression:
     getExpr()->print(OS);
     break;
+  case TLSRegister:
+    getTLSReg()->print(OS);
+    break;
   }
 }
 
@@ -419,11 +575,133 @@ void PPCOperand::print(raw_ostream &OS) const {
 void PPCAsmParser::
 ProcessInstruction(MCInst &Inst,
                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  switch (Inst.getOpcode()) {
-  case PPC::SLWI: {
+  int Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  case PPC::LAx: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(PPC::LA);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(2));
+    TmpInst.addOperand(Inst.getOperand(1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBI: {
     MCInst TmpInst;
     int64_t N = Inst.getOperand(2).getImm();
-    TmpInst.setOpcode(PPC::RLWINM);
+    TmpInst.setOpcode(PPC::ADDI);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIS: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(PPC::ADDIS);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBIC: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(PPC::ADDIC);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SUBICo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(PPC::ADDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(-N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLWI:
+  case PPC::EXTLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRWI:
+  case PPC::EXTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B + N));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSLWI:
+  case PPC::INSLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSLWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - B));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRWI:
+  case PPC::INSRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRWI? PPC::RLWIMI : PPC::RLWIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - (B + N)));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm((B + N) - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRWI:
+  case PPC::ROTRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::ROTRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(32 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(31));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLWI:
+  case PPC::SLWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLWI? PPC::RLWINM : PPC::RLWINMo);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
     TmpInst.addOperand(MCOperand::CreateImm(N));
@@ -432,10 +710,11 @@ ProcessInstruction(MCInst &Inst,
     Inst = TmpInst;
     break;
   }
-  case PPC::SRWI: {
+  case PPC::SRWI:
+  case PPC::SRWIo: {
     MCInst TmpInst;
     int64_t N = Inst.getOperand(2).getImm();
-    TmpInst.setOpcode(PPC::RLWINM);
+    TmpInst.setOpcode(Opcode == PPC::SRWI? PPC::RLWINM : PPC::RLWINMo);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
     TmpInst.addOperand(MCOperand::CreateImm(32 - N));
@@ -444,10 +723,90 @@ ProcessInstruction(MCInst &Inst,
     Inst = TmpInst;
     break;
   }
-  case PPC::SLDI: {
+  case PPC::CLRRWI:
+  case PPC::CLRRWIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLWI:
+  case PPC::CLRLSLWIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLWI? PPC::RLWINM : PPC::RLWINMo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    TmpInst.addOperand(MCOperand::CreateImm(B - N));
+    TmpInst.addOperand(MCOperand::CreateImm(31 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTLDI:
+  case PPC::EXTLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTLDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    TmpInst.addOperand(MCOperand::CreateImm(N - 1));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::EXTRDI:
+  case PPC::EXTRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::EXTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(B + N));
+    TmpInst.addOperand(MCOperand::CreateImm(64 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::INSRDI:
+  case PPC::INSRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    int64_t B = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::INSRDI? PPC::RLDIMI : PPC::RLDIMIo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(64 - (B + N)));
+    TmpInst.addOperand(MCOperand::CreateImm(B));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::ROTRDI:
+  case PPC::ROTRDIo: {
     MCInst TmpInst;
     int64_t N = Inst.getOperand(2).getImm();
-    TmpInst.setOpcode(PPC::RLDICR);
+    TmpInst.setOpcode(Opcode == PPC::ROTRDI? PPC::RLDICL : PPC::RLDICLo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(64 - N));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::SLDI:
+  case PPC::SLDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::SLDI? PPC::RLDICR : PPC::RLDICRo);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
     TmpInst.addOperand(MCOperand::CreateImm(N));
@@ -455,10 +814,11 @@ ProcessInstruction(MCInst &Inst,
     Inst = TmpInst;
     break;
   }
-  case PPC::SRDI: {
+  case PPC::SRDI:
+  case PPC::SRDIo: {
     MCInst TmpInst;
     int64_t N = Inst.getOperand(2).getImm();
-    TmpInst.setOpcode(PPC::RLDICL);
+    TmpInst.setOpcode(Opcode == PPC::SRDI? PPC::RLDICL : PPC::RLDICLo);
     TmpInst.addOperand(Inst.getOperand(0));
     TmpInst.addOperand(Inst.getOperand(1));
     TmpInst.addOperand(MCOperand::CreateImm(64 - N));
@@ -466,6 +826,31 @@ ProcessInstruction(MCInst &Inst,
     Inst = TmpInst;
     break;
   }
+  case PPC::CLRRDI:
+  case PPC::CLRRDIo: {
+    MCInst TmpInst;
+    int64_t N = Inst.getOperand(2).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRRDI? PPC::RLDICR : PPC::RLDICRo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(0));
+    TmpInst.addOperand(MCOperand::CreateImm(63 - N));
+    Inst = TmpInst;
+    break;
+  }
+  case PPC::CLRLSLDI:
+  case PPC::CLRLSLDIo: {
+    MCInst TmpInst;
+    int64_t B = Inst.getOperand(2).getImm();
+    int64_t N = Inst.getOperand(3).getImm();
+    TmpInst.setOpcode(Opcode == PPC::CLRLSLDI? PPC::RLDIC : PPC::RLDICo);
+    TmpInst.addOperand(Inst.getOperand(0));
+    TmpInst.addOperand(Inst.getOperand(1));
+    TmpInst.addOperand(MCOperand::CreateImm(N));
+    TmpInst.addOperand(MCOperand::CreateImm(B - N));
+    Inst = TmpInst;
+    break;
+  }
   }
 }
 
@@ -518,19 +903,23 @@ MatchRegisterName(const AsmToken &Tok, unsigned &RegNo, int64_t &IntVal) {
       RegNo = isPPC64()? PPC::CTR8 : PPC::CTR;
       IntVal = 9;
       return false;
-    } else if (Name.substr(0, 1).equals_lower("r") &&
+    } else if (Name.equals_lower("vrsave")) {
+      RegNo = PPC::VRSAVE;
+      IntVal = 256;
+      return false;
+    } else if (Name.startswith_lower("r") &&
                !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
       RegNo = isPPC64()? XRegs[IntVal] : RRegs[IntVal];
       return false;
-    } else if (Name.substr(0, 1).equals_lower("f") &&
+    } else if (Name.startswith_lower("f") &&
                !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
       RegNo = FRegs[IntVal];
       return false;
-    } else if (Name.substr(0, 1).equals_lower("v") &&
+    } else if (Name.startswith_lower("v") &&
                !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) {
       RegNo = VRegs[IntVal];
       return false;
-    } else if (Name.substr(0, 2).equals_lower("cr") &&
+    } else if (Name.startswith_lower("cr") &&
                !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) {
       RegNo = CRRegs[IntVal];
       return false;
@@ -556,6 +945,159 @@ ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
   return Error(StartLoc, "invalid register name");
 }
 
+/// Extract \code @l/@ha \endcode modifier from expression.  Recursively scan
+/// the expression and check for VK_PPC_LO/HI/HA
+/// symbol variants.  If all symbols with modifier use the same
+/// variant, return the corresponding PPCMCExpr::VariantKind,
+/// and a modified expression using the default symbol variant.
+/// Otherwise, return NULL.
+const MCExpr *PPCAsmParser::
+ExtractModifierFromExpr(const MCExpr *E,
+                        PPCMCExpr::VariantKind &Variant) {
+  MCContext &Context = getParser().getContext();
+  Variant = PPCMCExpr::VK_PPC_None;
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return 0;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_PPC_LO:
+      Variant = PPCMCExpr::VK_PPC_LO;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HI:
+      Variant = PPCMCExpr::VK_PPC_HI;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HA:
+      Variant = PPCMCExpr::VK_PPC_HA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHER:
+      Variant = PPCMCExpr::VK_PPC_HIGHER;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHERA:
+      Variant = PPCMCExpr::VK_PPC_HIGHERA;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHEST:
+      Variant = PPCMCExpr::VK_PPC_HIGHEST;
+      break;
+    case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+      Variant = PPCMCExpr::VK_PPC_HIGHESTA;
+      break;
+    default:
+      return 0;
+    }
+
+    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant);
+    if (!Sub)
+      return 0;
+    return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    PPCMCExpr::VariantKind LHSVariant, RHSVariant;
+    const MCExpr *LHS = ExtractModifierFromExpr(BE->getLHS(), LHSVariant);
+    const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant);
+
+    if (!LHS && !RHS)
+      return 0;
+
+    if (!LHS) LHS = BE->getLHS();
+    if (!RHS) RHS = BE->getRHS();
+
+    if (LHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = RHSVariant;
+    else if (RHSVariant == PPCMCExpr::VK_PPC_None)
+      Variant = LHSVariant;
+    else if (LHSVariant == RHSVariant)
+      Variant = LHSVariant;
+    else
+      return 0;
+
+    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// Find all VK_TLSGD/VK_TLSLD symbol references in expression and replace
+/// them by VK_PPC_TLSGD/VK_PPC_TLSLD.  This is necessary to avoid having
+/// _GLOBAL_OFFSET_TABLE_ created via ELFObjectWriter::RelocNeedsGOT.
+/// FIXME: This is a hack.
+const MCExpr *PPCAsmParser::
+FixupVariantKind(const MCExpr *E) {
+  MCContext &Context = getParser().getContext();
+
+  switch (E->getKind()) {
+  case MCExpr::Target:
+  case MCExpr::Constant:
+    return E;
+
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+    MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+
+    switch (SRE->getKind()) {
+    case MCSymbolRefExpr::VK_TLSGD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSGD;
+      break;
+    case MCSymbolRefExpr::VK_TLSLD:
+      Variant = MCSymbolRefExpr::VK_PPC_TLSLD;
+      break;
+    default:
+      return E;
+    }
+    return MCSymbolRefExpr::Create(&SRE->getSymbol(), Variant, Context);
+  }
+
+  case MCExpr::Unary: {
+    const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+    const MCExpr *Sub = FixupVariantKind(UE->getSubExpr());
+    if (Sub == UE->getSubExpr())
+      return E;
+    return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
+  }
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+    const MCExpr *LHS = FixupVariantKind(BE->getLHS());
+    const MCExpr *RHS = FixupVariantKind(BE->getRHS());
+    if (LHS == BE->getLHS() && RHS == BE->getRHS())
+      return E;
+    return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
+  }
+  }
+
+  llvm_unreachable("Invalid expression kind!");
+}
+
+/// Parse an expression.  This differs from the default "parseExpression"
+/// in that it handles complex \code @l/@ha \endcode modifiers.
+bool PPCAsmParser::
+ParseExpression(const MCExpr *&EVal) {
+  if (getParser().parseExpression(EVal))
+    return true;
+
+  EVal = FixupVariantKind(EVal);
+
+  PPCMCExpr::VariantKind Variant;
+  const MCExpr *E = ExtractModifierFromExpr(EVal, Variant);
+  if (E)
+    EVal = PPCMCExpr::Create(Variant, E, false, getParser().getContext());
+
+  return false;
+}
+
 bool PPCAsmParser::
 ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   SMLoc S = Parser.getTok().getLoc();
@@ -587,23 +1129,40 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   case AsmToken::Identifier:
   case AsmToken::Dot:
   case AsmToken::Dollar:
-    if (!getParser().parseExpression(EVal))
+    if (!ParseExpression(EVal))
       break;
     /* fall through */
   default:
     return Error(S, "unknown operand");
   }
 
-  if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(EVal))
-    Op = PPCOperand::CreateImm(CE->getValue(), S, E, isPPC64());
-  else
-    Op = PPCOperand::CreateExpr(EVal, S, E, isPPC64());
-
   // Push the parsed operand into the list of operands
+  Op = PPCOperand::CreateFromMCExpr(EVal, S, E, isPPC64());
   Operands.push_back(Op);
 
-  // Check for D-form memory operands
-  if (getLexer().is(AsmToken::LParen)) {
+  // Check whether this is a TLS call expression
+  bool TLSCall = false;
+  if (const MCSymbolRefExpr *Ref = dyn_cast<MCSymbolRefExpr>(EVal))
+    TLSCall = Ref->getSymbol().getName() == "__tls_get_addr";
+
+  if (TLSCall && getLexer().is(AsmToken::LParen)) {
+    const MCExpr *TLSSym;
+
+    Parser.Lex(); // Eat the '('.
+    S = Parser.getTok().getLoc();
+    if (ParseExpression(TLSSym))
+      return Error(S, "invalid TLS call expression");
+    if (getLexer().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "missing ')'");
+    E = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat the ')'.
+
+    Op = PPCOperand::CreateFromMCExpr(TLSSym, S, E, isPPC64());
+    Operands.push_back(Op);
+  }
+
+  // Otherwise, check for D-form memory operands
+  if (!TLSCall && getLexer().is(AsmToken::LParen)) {
     Parser.Lex(); // Eat the '('.
     S = Parser.getTok().getLoc();
 
@@ -644,15 +1203,38 @@ bool PPCAsmParser::
 ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
                  SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   // The first operand is the token for the instruction name.
+  // If the next character is a '+' or '-', we need to add it to the
+  // instruction name, to match what TableGen is doing.
+  std::string NewOpcode;
+  if (getLexer().is(AsmToken::Plus)) {
+    getLexer().Lex();
+    NewOpcode = Name;
+    NewOpcode += '+';
+    Name = NewOpcode;
+  }
+  if (getLexer().is(AsmToken::Minus)) {
+    getLexer().Lex();
+    NewOpcode = Name;
+    NewOpcode += '-';
+    Name = NewOpcode;
+  }
   // If the instruction ends in a '.', we need to create a separate
   // token for it, to match what TableGen is doing.
   size_t Dot = Name.find('.');
   StringRef Mnemonic = Name.slice(0, Dot);
-  Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64()));
+  if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+    Operands.push_back(
+        PPCOperand::CreateTokenWithStringCopy(Mnemonic, NameLoc, isPPC64()));
+  else
+    Operands.push_back(PPCOperand::CreateToken(Mnemonic, NameLoc, isPPC64()));
   if (Dot != StringRef::npos) {
     SMLoc DotLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Dot);
     StringRef DotStr = Name.slice(Dot, StringRef::npos);
-    Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64()));
+    if (!NewOpcode.empty()) // Underlying memory for Name is volatile.
+      Operands.push_back(
+          PPCOperand::CreateTokenWithStringCopy(DotStr, DotLoc, isPPC64()));
+    else
+      Operands.push_back(PPCOperand::CreateToken(DotStr, DotLoc, isPPC64()));
   }
 
   // If there are no more operands then finish
@@ -680,9 +1262,13 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
 bool PPCAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
-    return ParseDirectiveWord(4, DirectiveID.getLoc());
+    return ParseDirectiveWord(2, DirectiveID.getLoc());
+  if (IDVal == ".llong")
+    return ParseDirectiveWord(8, DirectiveID.getLoc());
   if (IDVal == ".tc")
     return ParseDirectiveTC(isPPC64()? 8 : 4, DirectiveID.getLoc());
+  if (IDVal == ".machine")
+    return ParseDirectiveMachine(DirectiveID.getLoc());
   return true;
 }
 
@@ -728,12 +1314,84 @@ bool PPCAsmParser::ParseDirectiveTC(unsigned Size, SMLoc L) {
   return ParseDirectiveWord(Size, L);
 }
 
+/// ParseDirectiveMachine
+///  ::= .machine [ cpu | "push" | "pop" ]
+bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
+  if (getLexer().isNot(AsmToken::Identifier) &&
+      getLexer().isNot(AsmToken::String))
+    return Error(L, "unexpected token in directive");
+
+  StringRef CPU = Parser.getTok().getIdentifier();
+  Parser.Lex();
+
+  // FIXME: Right now, the parser always allows any available
+  // instruction, so the .machine directive is not useful.
+  // Implement ".machine any" (by doing nothing) for the benefit
+  // of existing assembler code.  Likewise, we can then implement
+  // ".machine push" and ".machine pop" as no-op.
+  if (CPU != "any" && CPU != "push" && CPU != "pop")
+    return Error(L, "unrecognized machine type");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+
+  return false;
+}
+
 /// Force static initialization.
 extern "C" void LLVMInitializePowerPCAsmParser() {
   RegisterMCAsmParser<PPCAsmParser> A(ThePPC32Target);
   RegisterMCAsmParser<PPCAsmParser> B(ThePPC64Target);
+  RegisterMCAsmParser<PPCAsmParser> C(ThePPC64LETarget);
 }
 
 #define GET_REGISTER_MATCHER
 #define GET_MATCHER_IMPLEMENTATION
 #include "PPCGenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned PPCAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+                                                  unsigned Kind) {
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ImmVal;
+  switch (Kind) {
+    case MCK_0: ImmVal = 0; break;
+    case MCK_1: ImmVal = 1; break;
+    case MCK_2: ImmVal = 2; break;
+    case MCK_3: ImmVal = 3; break;
+    default: return Match_InvalidOperand;
+  }
+
+  PPCOperand *Op = static_cast<PPCOperand*>(AsmOp);
+  if (Op->isImm() && Op->getImm() == ImmVal)
+    return Match_Success;
+
+  return Match_InvalidOperand;
+}
+
+const MCExpr *
+PPCAsmParser::applyModifierToExpr(const MCExpr *E,
+                                  MCSymbolRefExpr::VariantKind Variant,
+                                  MCContext &Ctx) {
+  switch (Variant) {
+  case MCSymbolRefExpr::VK_PPC_LO:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HI:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHER:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHERA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHEST:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
+  case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+    return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
+  default:
+    return 0;
+  }
+}
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
index 71803cd..9a763f5 100644
--- a/lib/Target/PowerPC/CMakeLists.txt
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -7,6 +7,7 @@ tablegen(LLVM PPCGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
 tablegen(LLVM PPCGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM PPCGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM PPCGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM PPCGenFastISel.inc -gen-fast-isel)
 tablegen(LLVM PPCGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM PPCGenSubtargetInfo.inc -gen-subtarget)
 add_public_tablegen_target(PowerPCCommonTableGen)
@@ -20,6 +21,7 @@ add_llvm_target(PowerPCCodeGen
   PPCInstrInfo.cpp
   PPCISelDAGToDAG.cpp
   PPCISelLowering.cpp
+  PPCFastISel.cpp
   PPCFrameLowering.cpp
   PPCJITInfo.cpp
   PPCMCInstLower.cpp
@@ -27,11 +29,12 @@ add_llvm_target(PowerPCCodeGen
   PPCRegisterInfo.cpp
   PPCSubtarget.cpp
   PPCTargetMachine.cpp
+  PPCTargetObjectFile.cpp
   PPCTargetTransformInfo.cpp
   PPCSelectionDAGInfo.cpp
   )
 
-add_dependencies(LLVMPowerPCCodeGen intrinsics_gen)
+add_dependencies(LLVMPowerPCCodeGen PowerPCCommonTableGen intrinsics_gen)
 
 add_subdirectory(AsmParser)
 add_subdirectory(InstPrinter)
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index 93fca00..8281b5c 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -18,9 +18,17 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
+// FIXME: Once the integrated assembler supports full register names, tie this
+// to the verbose-asm setting.
+static cl::opt<bool>
+FullRegNames("ppc-asm-full-reg-names", cl::Hidden, cl::init(false),
+             cl::desc("Use full register names when printing assembly"));
+
 #include "PPCGenAsmWriter.inc"
 
 void PPCInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
@@ -78,6 +86,17 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     }
   }
   
+  // For fast-isel, a COPY_TO_REGCLASS may survive this long.  This is
+  // used when converting a 32-bit float to a 64-bit float as part of
+  // conversion to an integer (see PPCFastISel.cpp:SelectFPToI()),
+  // as otherwise we have problems with incorrect register classes
+  // in machine instruction verification.  For now, just avoid trying
+  // to print it as such an instruction has no effect (a 32-bit float
+  // in a register is already in 64-bit form, just with lower
+  // precision).  FIXME: Is there a better solution?
+  if (MI->getOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+    return;
+  
   printInstruction(MI, O);
   printAnnotation(O, Annot);
 }
@@ -90,19 +109,87 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
 
   if (StringRef(Modifier) == "cc") {
     switch ((PPC::Predicate)Code) {
-    case PPC::PRED_LT: O << "lt"; return;
-    case PPC::PRED_LE: O << "le"; return;
-    case PPC::PRED_EQ: O << "eq"; return;
-    case PPC::PRED_GE: O << "ge"; return;
-    case PPC::PRED_GT: O << "gt"; return;
-    case PPC::PRED_NE: O << "ne"; return;
-    case PPC::PRED_UN: O << "un"; return;
-    case PPC::PRED_NU: O << "nu"; return;
+    case PPC::PRED_LT_MINUS:
+    case PPC::PRED_LT_PLUS:
+    case PPC::PRED_LT:
+      O << "lt";
+      return;
+    case PPC::PRED_LE_MINUS:
+    case PPC::PRED_LE_PLUS:
+    case PPC::PRED_LE:
+      O << "le";
+      return;
+    case PPC::PRED_EQ_MINUS:
+    case PPC::PRED_EQ_PLUS:
+    case PPC::PRED_EQ:
+      O << "eq";
+      return;
+    case PPC::PRED_GE_MINUS:
+    case PPC::PRED_GE_PLUS:
+    case PPC::PRED_GE:
+      O << "ge";
+      return;
+    case PPC::PRED_GT_MINUS:
+    case PPC::PRED_GT_PLUS:
+    case PPC::PRED_GT:
+      O << "gt";
+      return;
+    case PPC::PRED_NE_MINUS:
+    case PPC::PRED_NE_PLUS:
+    case PPC::PRED_NE:
+      O << "ne";
+      return;
+    case PPC::PRED_UN_MINUS:
+    case PPC::PRED_UN_PLUS:
+    case PPC::PRED_UN:
+      O << "un";
+      return;
+    case PPC::PRED_NU_MINUS:
+    case PPC::PRED_NU_PLUS:
+    case PPC::PRED_NU:
+      O << "nu";
+      return;
     }
+    llvm_unreachable("Invalid predicate code");
+  }
+
+  if (StringRef(Modifier) == "pm") {
+    switch ((PPC::Predicate)Code) {
+    case PPC::PRED_LT:
+    case PPC::PRED_LE:
+    case PPC::PRED_EQ:
+    case PPC::PRED_GE:
+    case PPC::PRED_GT:
+    case PPC::PRED_NE:
+    case PPC::PRED_UN:
+    case PPC::PRED_NU:
+      return;
+    case PPC::PRED_LT_MINUS:
+    case PPC::PRED_LE_MINUS:
+    case PPC::PRED_EQ_MINUS:
+    case PPC::PRED_GE_MINUS:
+    case PPC::PRED_GT_MINUS:
+    case PPC::PRED_NE_MINUS:
+    case PPC::PRED_UN_MINUS:
+    case PPC::PRED_NU_MINUS:
+      O << "-";
+      return;
+    case PPC::PRED_LT_PLUS:
+    case PPC::PRED_LE_PLUS:
+    case PPC::PRED_EQ_PLUS:
+    case PPC::PRED_GE_PLUS:
+    case PPC::PRED_GT_PLUS:
+    case PPC::PRED_NE_PLUS:
+    case PPC::PRED_UN_PLUS:
+    case PPC::PRED_NU_PLUS:
+      O << "+";
+      return;
+    }
+    llvm_unreachable("Invalid predicate code");
   }
   
   assert(StringRef(Modifier) == "reg" &&
-         "Need to specify 'cc' or 'reg' as predicate op modifier!");
+         "Need to specify 'cc', 'pm' or 'reg' as predicate op modifier!");
   printOperand(MI, OpNo+1, O);
 }
 
@@ -129,18 +216,16 @@ void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
 
 void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
                                         raw_ostream &O) {
-  O << (short)MI->getOperand(OpNo).getImm();
+  if (MI->getOperand(OpNo).isImm())
+    O << (short)MI->getOperand(OpNo).getImm();
+  else
+    printOperand(MI, OpNo, O);
 }
 
 void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
                                         raw_ostream &O) {
-  O << (unsigned short)MI->getOperand(OpNo).getImm();
-}
-
-void PPCInstPrinter::printS16X4ImmOperand(const MCInst *MI, unsigned OpNo,
-                                          raw_ostream &O) {
   if (MI->getOperand(OpNo).isImm())
-    O << (short)(MI->getOperand(OpNo).getImm()*4);
+    O << (unsigned short)MI->getOperand(OpNo).getImm();
   else
     printOperand(MI, OpNo, O);
 }
@@ -153,11 +238,14 @@ void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
   // Branches can take an immediate operand.  This is used by the branch
   // selection pass to print .+8, an eight byte displacement from the PC.
   O << ".+";
-  printAbsAddrOperand(MI, OpNo, O);
+  printAbsBranchOperand(MI, OpNo, O);
 }
 
-void PPCInstPrinter::printAbsAddrOperand(const MCInst *MI, unsigned OpNo,
-                                         raw_ostream &O) {
+void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  if (!MI->getOperand(OpNo).isImm())
+    return printOperand(MI, OpNo, O);
+
   O << (int)MI->getOperand(OpNo).getImm()*4;
 }
 
@@ -182,7 +270,7 @@ void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo,
 
 void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
                                     raw_ostream &O) {
-  printSymbolLo(MI, OpNo, O);
+  printS16ImmOperand(MI, OpNo, O);
   O << '(';
   if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
     O << "0";
@@ -191,22 +279,6 @@ void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
   O << ')';
 }
 
-void PPCInstPrinter::printMemRegImmShifted(const MCInst *MI, unsigned OpNo,
-                                           raw_ostream &O) {
-  if (MI->getOperand(OpNo).isImm())
-    printS16X4ImmOperand(MI, OpNo, O);
-  else
-    printSymbolLo(MI, OpNo, O);
-  O << '(';
-  
-  if (MI->getOperand(OpNo+1).getReg() == PPC::R0)
-    O << "0";
-  else
-    printOperand(MI, OpNo+1, O);
-  O << ')';
-}
-
-
 void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
                                     raw_ostream &O) {
   // When used as the base register, r0 reads constant zero rather than
@@ -220,11 +292,21 @@ void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
   printOperand(MI, OpNo+1, O);
 }
 
+void PPCInstPrinter::printTLSCall(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  printBranchOperand(MI, OpNo, O);
+  O << '(';
+  printOperand(MI, OpNo+1, O);
+  O << ')';
+}
 
 
 /// stripRegisterPrefix - This method strips the character prefix from a
 /// register name so that only the number is left.  Used by for linux asm.
 static const char *stripRegisterPrefix(const char *RegName) {
+  if (FullRegNames)
+    return RegName;
+
   switch (RegName[0]) {
   case 'r':
   case 'f':
@@ -256,39 +338,4 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   assert(Op.isExpr() && "unknown operand kind in printOperand");
   O << *Op.getExpr();
 }
-  
-void PPCInstPrinter::printSymbolLo(const MCInst *MI, unsigned OpNo,
-                                   raw_ostream &O) {
-  if (MI->getOperand(OpNo).isImm())
-    return printS16ImmOperand(MI, OpNo, O);
-  
-  // FIXME: This is a terrible hack because we can't encode lo16() as an operand
-  // flag of a subtraction.  See the FIXME in GetSymbolRef in PPCMCInstLower.
-  if (MI->getOperand(OpNo).isExpr() &&
-      isa<MCBinaryExpr>(MI->getOperand(OpNo).getExpr())) {
-    O << "lo16(";
-    printOperand(MI, OpNo, O);
-    O << ')';
-  } else {
-    printOperand(MI, OpNo, O);
-  }
-}
-
-void PPCInstPrinter::printSymbolHi(const MCInst *MI, unsigned OpNo,
-                                   raw_ostream &O) {
-  if (MI->getOperand(OpNo).isImm())
-    return printS16ImmOperand(MI, OpNo, O);
-
-  // FIXME: This is a terrible hack because we can't encode lo16() as an operand
-  // flag of a subtraction.  See the FIXME in GetSymbolRef in PPCMCInstLower.
-  if (MI->getOperand(OpNo).isExpr() &&
-      isa<MCBinaryExpr>(MI->getOperand(OpNo).getExpr())) {
-    O << "ha16(";
-    printOperand(MI, OpNo, O);
-    O << ')';
-  } else {
-    printOperand(MI, OpNo, O);
-  }
-}
-
 
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 8f1e211..8a4c03d 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -21,15 +21,14 @@ namespace llvm {
 class MCOperand;
 
 class PPCInstPrinter : public MCInstPrinter {
-  // 0 -> AIX, 1 -> Darwin.
-  unsigned SyntaxVariant;
+  bool IsDarwin;
 public:
   PPCInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                 const MCRegisterInfo &MRI, unsigned syntaxVariant)
-    : MCInstPrinter(MAI, MII, MRI), SyntaxVariant(syntaxVariant) {}
+                 const MCRegisterInfo &MRI, bool isDarwin)
+    : MCInstPrinter(MAI, MII, MRI), IsDarwin(isDarwin) {}
   
   bool isDarwinSyntax() const {
-    return SyntaxVariant == 1;
+    return IsDarwin;
   }
   
   virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
@@ -50,19 +49,14 @@ public:
   void printU6ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printS16X4ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printAbsAddrOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 
   void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 
   void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printMemRegImmShifted(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  
-  // FIXME: Remove
-  void printSymbolLo(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printSymbolHi(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 };
 } // end namespace llvm
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
index b674883..3efa5ec 100644
--- a/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/PowerPC/MCTargetDesc/CMakeLists.txt
@@ -3,7 +3,9 @@ add_llvm_library(LLVMPowerPCDesc
   PPCMCTargetDesc.cpp
   PPCMCAsmInfo.cpp
   PPCMCCodeEmitter.cpp
+  PPCMCExpr.cpp
   PPCPredicates.cpp
+  PPCMachObjectWriter.cpp
   PPCELFObjectWriter.cpp
   )
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index ec26574..0d42081 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -16,13 +16,13 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
-static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
   switch (Kind) {
   default:
     llvm_unreachable("Unknown fixup kind!");
@@ -30,40 +30,45 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
   case FK_Data_2:
   case FK_Data_4:
   case FK_Data_8:
-  case PPC::fixup_ppc_tlsreg:
   case PPC::fixup_ppc_nofixup:
     return Value;
   case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
     return Value & 0xfffc;
   case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
     return Value & 0x3fffffc;
-#if 0
-  case PPC::fixup_ppc_hi16:
-    return (Value >> 16) & 0xffff;
-#endif
-  case PPC::fixup_ppc_ha16:
-    return ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff;
-  case PPC::fixup_ppc_lo16:
+  case PPC::fixup_ppc_half16:
     return Value & 0xffff;
-  case PPC::fixup_ppc_lo16_ds:
+  case PPC::fixup_ppc_half16ds:
     return Value & 0xfffc;
   }
 }
 
-namespace {
-class PPCMachObjectWriter : public MCMachObjectTargetWriter {
-public:
-  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
-                      uint32_t CPUSubtype)
-    : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
-
-  void RecordRelocation(MachObjectWriter *Writer,
-                        const MCAssembler &Asm, const MCAsmLayout &Layout,
-                        const MCFragment *Fragment, const MCFixup &Fixup,
-                        MCValue Target, uint64_t &FixedValue) {
-    llvm_unreachable("Relocation emission for MachO/PPC unimplemented!");
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    llvm_unreachable("Unknown fixup kind!");
+  case FK_Data_1:
+    return 1;
+  case FK_Data_2:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_half16ds:
+    return 2;
+  case FK_Data_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_brcond14abs:
+  case PPC::fixup_ppc_br24:
+  case PPC::fixup_ppc_br24abs:
+    return 4;
+  case FK_Data_8:
+    return 8;
+  case PPC::fixup_ppc_nofixup:
+    return 0;
   }
-};
+}
+
+namespace {
 
 class PPCAsmBackend : public MCAsmBackend {
 const Target &TheTarget;
@@ -77,10 +82,10 @@ public:
       // name                    offset  bits  flags
       { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
       { "fixup_ppc_brcond14",    16,     14,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_ppc_lo16",        16,     16,   0 },
-      { "fixup_ppc_ha16",        16,     16,   0 },
-      { "fixup_ppc_lo16_ds",     16,     14,   0 },
-      { "fixup_ppc_tlsreg",       0,      0,   0 },
+      { "fixup_ppc_br24abs",     6,      24,   0 },
+      { "fixup_ppc_brcond14abs", 16,     14,   0 },
+      { "fixup_ppc_half16",       0,     16,   0 },
+      { "fixup_ppc_half16ds",     0,     14,   0 },
       { "fixup_ppc_nofixup",      0,      0,   0 }
     };
 
@@ -98,12 +103,13 @@ public:
     if (!Value) return;           // Doesn't change encoding.
 
     unsigned Offset = Fixup.getOffset();
+    unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
 
     // For each byte of the fragment that the fixup touches, mask in the bits
     // from the fixup value. The Value has been "split up" into the appropriate
     // bitfields above.
-    for (unsigned i = 0; i != 4; ++i)
-      Data[Offset + i] |= uint8_t((Value >> ((4 - i - 1)*8)) & 0xff);
+    for (unsigned i = 0; i != NumBytes; ++i)
+      Data[Offset + i] |= uint8_t((Value >> ((NumBytes - i - 1)*8)) & 0xff);
   }
 
   bool mayNeedRelaxation(const MCInst &Inst) const {
@@ -126,16 +132,23 @@ public:
   }
 
   bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
-    // FIXME: Zero fill for now. That's not right, but at least will get the
-    // section size right.
-    for (uint64_t i = 0; i != Count; ++i)
-      OW->Write8(0);
+    uint64_t NumNops = Count / 4;
+    for (uint64_t i = 0; i != NumNops; ++i)
+      OW->Write32(0x60000000);
+
+    switch (Count % 4) {
+    default: break; // No leftover bytes to write
+    case 1: OW->Write8(0); break;
+    case 2: OW->Write16(0); break;
+    case 3: OW->Write16(0); OW->Write8(0); break;
+    }
+
     return true;
   }
 
   unsigned getPointerSize() const {
     StringRef Name = TheTarget.getName();
-    if (Name == "ppc64") return 8;
+    if (Name == "ppc64" || Name == "ppc64le") return 8;
     assert(Name == "ppc32" && "Unknown target name!");
     return 4;
   }
@@ -151,12 +164,11 @@ namespace {
 
     MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
       bool is64 = getPointerSize() == 8;
-      return createMachObjectWriter(new PPCMachObjectWriter(
-                                      /*Is64Bit=*/is64,
-                                      (is64 ? object::mach::CTM_PowerPC64 :
-                                       object::mach::CTM_PowerPC),
-                                      object::mach::CSPPC_ALL),
-                                    OS, /*IsLittleEndian=*/false);
+      return createPPCMachObjectWriter(
+          OS,
+          /*Is64Bit=*/is64,
+          (is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
+          MachO::CPU_SUBTYPE_POWERPC_ALL);
     }
 
     virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
@@ -183,10 +195,9 @@ namespace {
 
 } // end anonymous namespace
 
-
-
-
-MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU) {
   if (Triple(TT).isOSDarwin())
     return new DarwinPPCAsmBackend(T);
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 7a84723..54de70e 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -30,29 +30,17 @@ namespace {
     virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
                                   bool IsPCRel, bool IsRelocWithSymbol,
                                   int64_t Addend) const;
+    virtual const MCSymbol *ExplicitRelSym(const MCAssembler &Asm,
+                                           const MCValue &Target,
+                                           const MCFragment &F,
+                                           const MCFixup &Fixup,
+                                           bool IsPCRel) const;
     virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
                                                     const MCFixup &Fixup,
                                                     bool IsPCRel) const;
-    virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset);
-
-    virtual void sortRelocs(const MCAssembler &Asm,
-                            std::vector<ELFRelocationEntry> &Relocs);
-  };
-
-  class PPCELFRelocationEntry : public ELFRelocationEntry {
-  public:
-    PPCELFRelocationEntry(const ELFRelocationEntry &RE);
-    bool operator<(const PPCELFRelocationEntry &RE) const {
-      return (RE.r_offset < r_offset ||
-              (RE.r_offset == r_offset && RE.Type > Type));
-    }
   };
 }
 
-PPCELFRelocationEntry::PPCELFRelocationEntry(const ELFRelocationEntry &RE)
-  : ELFRelocationEntry(RE.r_offset, RE.Index, RE.Type, RE.Symbol,
-                       RE.r_addend, *RE.Fixup) {}
-
 PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
   : MCELFObjectTargetWriter(Is64Bit, OSABI,
                             Is64Bit ?  ELF::EM_PPC64 : ELF::EM_PPC,
@@ -75,11 +63,30 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
     default:
       llvm_unreachable("Unimplemented");
     case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
       Type = ELF::R_PPC_REL24;
       break;
     case PPC::fixup_ppc_brcond14:
+    case PPC::fixup_ppc_brcond14abs:
       Type = ELF::R_PPC_REL14;
       break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_REL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC_REL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = ELF::R_PPC_REL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = ELF::R_PPC_REL16_HA;
+        break;
+      }
+      break;
     case FK_Data_4:
     case FK_PCRel_4:
       Type = ELF::R_PPC_REL32;
@@ -92,93 +99,216 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
   } else {
     switch ((unsigned)Fixup.getKind()) {
       default: llvm_unreachable("invalid fixup kind!");
-    case PPC::fixup_ppc_br24:
+    case PPC::fixup_ppc_br24abs:
       Type = ELF::R_PPC_ADDR24;
       break;
-    case PPC::fixup_ppc_brcond14:
+    case PPC::fixup_ppc_brcond14abs:
       Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_
       break;
-    case PPC::fixup_ppc_ha16:
+    case PPC::fixup_ppc_half16:
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
-      case MCSymbolRefExpr::VK_PPC_TPREL16_HA:
-        Type = ELF::R_PPC_TPREL16_HA;
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_PPC_ADDR16;
         break;
-      case MCSymbolRefExpr::VK_PPC_DTPREL16_HA:
-        Type = ELF::R_PPC64_DTPREL16_HA;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = ELF::R_PPC_ADDR16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = ELF::R_PPC_ADDR16_HI;
         break;
-      case MCSymbolRefExpr::VK_PPC_GAS_HA16:
-      case MCSymbolRefExpr::VK_PPC_DARWIN_HA16:
+      case MCSymbolRefExpr::VK_PPC_HA:
         Type = ELF::R_PPC_ADDR16_HA;
-	break;
-      case MCSymbolRefExpr::VK_PPC_TOC16_HA:
-        Type = ELF::R_PPC64_TOC16_HA;
         break;
-      case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA:
-        Type = ELF::R_PPC64_GOT_TPREL16_HA;
+      case MCSymbolRefExpr::VK_PPC_HIGHER:
+        Type = ELF::R_PPC64_ADDR16_HIGHER;
         break;
-      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA:
-        Type = ELF::R_PPC64_GOT_TLSGD16_HA;
+      case MCSymbolRefExpr::VK_PPC_HIGHERA:
+        Type = ELF::R_PPC64_ADDR16_HIGHERA;
         break;
-      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA:
-        Type = ELF::R_PPC64_GOT_TLSLD16_HA;
+      case MCSymbolRefExpr::VK_PPC_HIGHEST:
+        Type = ELF::R_PPC64_ADDR16_HIGHEST;
         break;
-      }
-      break;
-    case PPC::fixup_ppc_lo16:
-      switch (Modifier) {
-      default: llvm_unreachable("Unsupported Modifier");
-      case MCSymbolRefExpr::VK_PPC_TPREL16_LO:
-        Type = ELF::R_PPC_TPREL16_LO;
+      case MCSymbolRefExpr::VK_PPC_HIGHESTA:
+        Type = ELF::R_PPC64_ADDR16_HIGHESTA;
         break;
-      case MCSymbolRefExpr::VK_PPC_DTPREL16_LO:
-        Type = ELF::R_PPC64_DTPREL16_LO;
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_PPC_GOT16;
         break;
-      case MCSymbolRefExpr::VK_None:
-        Type = ELF::R_PPC_ADDR16;
+      case MCSymbolRefExpr::VK_PPC_GOT_LO:
+        Type = ELF::R_PPC_GOT16_LO;
         break;
-      case MCSymbolRefExpr::VK_PPC_GAS_LO16:
-      case MCSymbolRefExpr::VK_PPC_DARWIN_LO16:
-        Type = ELF::R_PPC_ADDR16_LO;
-	break;
-      case MCSymbolRefExpr::VK_PPC_TOC_ENTRY:
+      case MCSymbolRefExpr::VK_PPC_GOT_HI:
+        Type = ELF::R_PPC_GOT16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_HA:
+        Type = ELF::R_PPC_GOT16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC:
         Type = ELF::R_PPC64_TOC16;
         break;
-      case MCSymbolRefExpr::VK_PPC_TOC16_LO:
+      case MCSymbolRefExpr::VK_PPC_TOC_LO:
         Type = ELF::R_PPC64_TOC16_LO;
         break;
-      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO:
+      case MCSymbolRefExpr::VK_PPC_TOC_HI:
+        Type = ELF::R_PPC64_TOC16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC_HA:
+        Type = ELF::R_PPC64_TOC16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL:
+        Type = ELF::R_PPC_TPREL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+        Type = ELF::R_PPC_TPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HI:
+        Type = ELF::R_PPC_TPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HA:
+        Type = ELF::R_PPC_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHER:
+        Type = ELF::R_PPC64_TPREL16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHERA:
+        Type = ELF::R_PPC64_TPREL16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHEST:
+        Type = ELF::R_PPC64_TPREL16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_HIGHESTA:
+        Type = ELF::R_PPC64_TPREL16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL:
+        Type = ELF::R_PPC64_DTPREL16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HI:
+        Type = ELF::R_PPC64_DTPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HA:
+        Type = ELF::R_PPC64_DTPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHER:
+        Type = ELF::R_PPC64_DTPREL16_HIGHER;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHERA:
+        Type = ELF::R_PPC64_DTPREL16_HIGHERA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHEST:
+        Type = ELF::R_PPC64_DTPREL16_HIGHEST;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_HIGHESTA:
+        Type = ELF::R_PPC64_DTPREL16_HIGHESTA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD:
+        Type = ELF::R_PPC64_GOT_TLSGD16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO:
         Type = ELF::R_PPC64_GOT_TLSGD16_LO;
         break;
-      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO:
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HI:
+        Type = ELF::R_PPC64_GOT_TLSGD16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA:
+        Type = ELF::R_PPC64_GOT_TLSGD16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD:
+        Type = ELF::R_PPC64_GOT_TLSLD16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO:
         Type = ELF::R_PPC64_GOT_TLSLD16_LO;
         break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HI:
+        Type = ELF::R_PPC64_GOT_TLSLD16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA:
+        Type = ELF::R_PPC64_GOT_TLSLD16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+        /* We don't have R_PPC64_GOT_TPREL16, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_TPREL16_DS.  */
+        Type = ELF::R_PPC64_GOT_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
+        /* We don't have R_PPC64_GOT_TPREL16_LO, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_TPREL16_LO_DS.  */
+        Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HI:
+        Type = ELF::R_PPC64_GOT_TPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+        /* We don't have R_PPC64_GOT_DTPREL16, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_DS.  */
+        Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+        /* We don't have R_PPC64_GOT_DTPREL16_LO, but since GOT offsets
+           are always 4-aligned, we can use R_PPC64_GOT_DTPREL16_LO_DS.  */
+        Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA:
+        Type = ELF::R_PPC64_GOT_TPREL16_HA;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HI:
+        Type = ELF::R_PPC64_GOT_DTPREL16_HI;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_HA:
+        Type = ELF::R_PPC64_GOT_DTPREL16_HA;
+        break;
       }
       break;
-    case PPC::fixup_ppc_lo16_ds:
+    case PPC::fixup_ppc_half16ds:
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
       case MCSymbolRefExpr::VK_None:
         Type = ELF::R_PPC64_ADDR16_DS;
         break;
-      case MCSymbolRefExpr::VK_PPC_GAS_LO16:
-      case MCSymbolRefExpr::VK_PPC_DARWIN_LO16:
+      case MCSymbolRefExpr::VK_PPC_LO:
         Type = ELF::R_PPC64_ADDR16_LO_DS;
         break;
-      case MCSymbolRefExpr::VK_PPC_TOC_ENTRY:
+      case MCSymbolRefExpr::VK_GOT:
+        Type = ELF::R_PPC64_GOT16_DS;
+	break;
+      case MCSymbolRefExpr::VK_PPC_GOT_LO:
+        Type = ELF::R_PPC64_GOT16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TOC:
         Type = ELF::R_PPC64_TOC16_DS;
 	break;
-      case MCSymbolRefExpr::VK_PPC_TOC16_LO:
+      case MCSymbolRefExpr::VK_PPC_TOC_LO:
         Type = ELF::R_PPC64_TOC16_LO_DS;
         break;
-      case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO:
+      case MCSymbolRefExpr::VK_PPC_TPREL:
+        Type = ELF::R_PPC64_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_TPREL_LO:
+        Type = ELF::R_PPC64_TPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL:
+        Type = ELF::R_PPC64_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL_LO:
+        Type = ELF::R_PPC64_DTPREL16_LO_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL:
+        Type = ELF::R_PPC64_GOT_TPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO:
         Type = ELF::R_PPC64_GOT_TPREL16_LO_DS;
         break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL:
+        Type = ELF::R_PPC64_GOT_DTPREL16_DS;
+        break;
+      case MCSymbolRefExpr::VK_PPC_GOT_DTPREL_LO:
+        Type = ELF::R_PPC64_GOT_DTPREL16_LO_DS;
+        break;
       }
       break;
-    case PPC::fixup_ppc_tlsreg:
-      Type = ELF::R_PPC64_TLS;
-      break;
     case PPC::fixup_ppc_nofixup:
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
@@ -188,17 +318,29 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
       case MCSymbolRefExpr::VK_PPC_TLSLD:
         Type = ELF::R_PPC64_TLSLD;
         break;
+      case MCSymbolRefExpr::VK_PPC_TLS:
+        Type = ELF::R_PPC64_TLS;
+        break;
       }
       break;
     case FK_Data_8:
       switch (Modifier) {
       default: llvm_unreachable("Unsupported Modifier");
-      case MCSymbolRefExpr::VK_PPC_TOC:
+      case MCSymbolRefExpr::VK_PPC_TOCBASE:
         Type = ELF::R_PPC64_TOC;
         break;
       case MCSymbolRefExpr::VK_None:
         Type = ELF::R_PPC64_ADDR64;
 	break;
+      case MCSymbolRefExpr::VK_PPC_DTPMOD:
+        Type = ELF::R_PPC64_DTPMOD64;
+	break;
+      case MCSymbolRefExpr::VK_PPC_TPREL:
+        Type = ELF::R_PPC64_TPREL64;
+	break;
+      case MCSymbolRefExpr::VK_PPC_DTPREL:
+        Type = ELF::R_PPC64_DTPREL64;
+	break;
       }
       break;
     case FK_Data_4:
@@ -220,6 +362,35 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
   return getRelocTypeInner(Target, Fixup, IsPCRel);
 }
 
+const MCSymbol *PPCELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm,
+                                                   const MCValue &Target,
+                                                   const MCFragment &F,
+                                                   const MCFixup &Fixup,
+                                                   bool IsPCRel) const {
+  assert(Target.getSymA() && "SymA cannot be 0");
+  MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+    MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  bool EmitThisSym;
+  switch (Modifier) {
+  // GOT references always need a relocation, even if the
+  // target symbol is local.
+  case MCSymbolRefExpr::VK_GOT:
+  case MCSymbolRefExpr::VK_PPC_GOT_LO:
+  case MCSymbolRefExpr::VK_PPC_GOT_HI:
+  case MCSymbolRefExpr::VK_PPC_GOT_HA:
+    EmitThisSym = true;
+    break;
+  default:
+    EmitThisSym = false;
+    break;
+  } 
+
+  if (EmitThisSym)
+    return &Target.getSymA()->getSymbol().AliasedSymbol();
+  return NULL;
+}
+
 const MCSymbol *PPCELFObjectWriter::undefinedExplicitRelSym(const MCValue &Target,
                                                             const MCFixup &Fixup,
                                                             bool IsPCRel) const {
@@ -240,47 +411,6 @@ const MCSymbol *PPCELFObjectWriter::undefinedExplicitRelSym(const MCValue &Targe
   return NULL;
 }
 
-void PPCELFObjectWriter::
-adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) {
-  switch ((unsigned)Fixup.getKind()) {
-    case PPC::fixup_ppc_ha16:
-    case PPC::fixup_ppc_lo16:
-    case PPC::fixup_ppc_lo16_ds:
-      RelocOffset += 2;
-      break;
-    default:
-      break;
-  }
-}
-
-// The standard sorter only sorts on the r_offset field, but PowerPC can
-// have multiple relocations at the same offset.  Sort secondarily on the
-// relocation type to avoid nondeterminism.
-void PPCELFObjectWriter::sortRelocs(const MCAssembler &Asm,
-                                    std::vector<ELFRelocationEntry> &Relocs) {
-
-  // Copy to a temporary vector of relocation entries having a different
-  // sort function.
-  std::vector<PPCELFRelocationEntry> TmpRelocs;
-  
-  for (std::vector<ELFRelocationEntry>::iterator R = Relocs.begin();
-       R != Relocs.end(); ++R) {
-    TmpRelocs.push_back(PPCELFRelocationEntry(*R));
-  }
-
-  // Sort in place by ascending r_offset and descending r_type.
-  array_pod_sort(TmpRelocs.begin(), TmpRelocs.end());
-
-  // Copy back to the original vector.
-  unsigned I = 0;
-  for (std::vector<PPCELFRelocationEntry>::iterator R = TmpRelocs.begin();
-       R != TmpRelocs.end(); ++R, ++I) {
-    Relocs[I] = ELFRelocationEntry(R->r_offset, R->Index, R->Type,
-                                   R->Symbol, R->r_addend, *R->Fixup);
-  }
-}
-
-
 MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS,
                                                bool Is64Bit,
                                                uint8_t OSABI) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 86c44f5..68de8c1 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -25,23 +25,25 @@ enum Fixups {
   /// branches.
   fixup_ppc_brcond14,
   
-  /// fixup_ppc_lo16 - A 16-bit fixup corresponding to lo16(_foo) for instrs
-  /// like 'li'.
-  fixup_ppc_lo16,
-  
-  /// fixup_ppc_ha16 - A 16-bit fixup corresponding to ha16(_foo) for instrs
-  /// like 'lis'.
-  fixup_ppc_ha16,
+  /// fixup_ppc_br24abs - 24-bit absolute relocation for direct branches
+  /// like 'ba' and 'bla'.
+  fixup_ppc_br24abs,
+
+  /// fixup_ppc_brcond14abs - 14-bit absolute relocation for conditional
+  /// branches.
+  fixup_ppc_brcond14abs,
+
+  /// fixup_ppc_half16 - A 16-bit fixup corresponding to lo16(_foo)
+  /// or ha16(_foo) for instrs like 'li' or 'addis'.
+  fixup_ppc_half16,
   
-  /// fixup_ppc_lo16_ds - A 14-bit fixup corresponding to lo16(_foo) with
+  /// fixup_ppc_half16ds - A 14-bit fixup corresponding to lo16(_foo) with
   /// implied 2 zero bits for instrs like 'std'.
-  fixup_ppc_lo16_ds,
-
-  /// fixup_ppc_tlsreg - Insert thread-pointer register number.
-  fixup_ppc_tlsreg,
+  fixup_ppc_half16ds,
 
   /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call
-  /// to __tls_get_addr for the TLS general and local dynamic models.
+  /// to __tls_get_addr for the TLS general and local dynamic models,
+  /// or inserts the thread-pointer register number.
   fixup_ppc_nofixup,
   
   // Marker
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index a25d7fe..f3dddce 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -22,7 +22,6 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
   }
   IsLittleEndian = false;
 
-  PCSymbol = ".";
   CommentString = ";";
   ExceptionsType = ExceptionHandling::DwarfCFI;
 
@@ -47,24 +46,24 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
   CommentString = "#";
   GlobalPrefix = "";
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective = "\t.weak\t";
-  
+
   // Uses '.section' before '.bss' directive
   UsesELFSectionDirectiveForBSS = true;  
 
   // Debug Information
   SupportsDebugInformation = true;
 
-  PCSymbol = ".";
+  DollarIsPC = true;
 
   // Set up DWARF directives
   HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+  MinInstAlignment = 4;
 
   // Exceptions handling
   ExceptionsType = ExceptionHandling::DwarfCFI;
     
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
-  AssemblerDialect = 0;           // Old-Style mnemonics.
+  AssemblerDialect = 1;           // New-Style mnemonics.
 }
 
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 7b4ed9f..1530e77 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -15,6 +15,7 @@
 #define PPCTARGETASMINFO_H
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
 
@@ -24,7 +25,7 @@ namespace llvm {
     explicit PPCMCAsmInfoDarwin(bool is64Bit);
   };
 
-  class PPCLinuxMCAsmInfo : public MCAsmInfo {
+  class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit PPCLinuxMCAsmInfo(bool is64Bit);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 2223cd6..346a9be 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -23,6 +23,7 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
@@ -48,16 +49,20 @@ public:
                                SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getHA16Encoding(const MCInst &MI, unsigned OpNo,
-                           SmallVectorImpl<MCFixup> &Fixups) const;
-  unsigned getLO16Encoding(const MCInst &MI, unsigned OpNo,
-                           SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
                             SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups) const;
+  unsigned getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+                              SmallVectorImpl<MCFixup> &Fixups) const;
   unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
                                SmallVectorImpl<MCFixup> &Fixups) const;
 
@@ -72,13 +77,19 @@ public:
                                  SmallVectorImpl<MCFixup> &Fixups) const;
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups) const {
+    // For fast-isel, a float COPY_TO_REGCLASS can survive this long.
+    // It's just a nop to keep the register classes happy, so don't
+    // generate anything.
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == TargetOpcode::COPY_TO_REGCLASS)
+      return;
+
     uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
 
     // BL8_NOP etc. all have a size of 8 because of the following 'nop'.
     unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value!
-    unsigned Opcode = MI.getOpcode();
     if (Opcode == PPC::BL8_NOP || Opcode == PPC::BLA8_NOP ||
-        Opcode == PPC::BL8_NOP_TLSGD || Opcode == PPC::BL8_NOP_TLSLD)
+        Opcode == PPC::BL8_NOP_TLS)
       Size = 8;
     
     // Output the constant in big endian byte order.
@@ -111,17 +122,6 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
   // Add a fixup for the branch target.
   Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
                                    (MCFixupKind)PPC::fixup_ppc_br24));
-
-  // For special TLS calls, add another fixup for the symbol.  Apparently
-  // BL8_NOP, BL8_NOP_TLSGD, and BL8_NOP_TLSLD are sufficiently
-  // similar that TblGen will not generate a separate case for the latter
-  // two, so this is the only way to get the extra fixup generated.
-  unsigned Opcode = MI.getOpcode();
-  if (Opcode == PPC::BL8_NOP_TLSGD || Opcode == PPC::BL8_NOP_TLSLD) {
-    const MCOperand &MO2 = MI.getOperand(OpNo+1);
-    Fixups.push_back(MCFixup::Create(0, MO2.getExpr(),
-                                     (MCFixupKind)PPC::fixup_ppc_nofixup));
-  }
   return 0;
 }
 
@@ -136,25 +136,38 @@ unsigned PPCMCCodeEmitter::getCondBrEncoding(const MCInst &MI, unsigned OpNo,
   return 0;
 }
 
-unsigned PPCMCCodeEmitter::getHA16Encoding(const MCInst &MI, unsigned OpNo,
-                                       SmallVectorImpl<MCFixup> &Fixups) const {
+unsigned PPCMCCodeEmitter::
+getAbsDirectBrEncoding(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups) const {
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
-  
+
   // Add a fixup for the branch target.
   Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_ha16));
+                                   (MCFixupKind)PPC::fixup_ppc_br24abs));
   return 0;
 }
 
-unsigned PPCMCCodeEmitter::getLO16Encoding(const MCInst &MI, unsigned OpNo,
-                                       SmallVectorImpl<MCFixup> &Fixups) const {
+unsigned PPCMCCodeEmitter::
+getAbsCondBrEncoding(const MCInst &MI, unsigned OpNo,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
-  
+
   // Add a fixup for the branch target.
   Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_lo16));
+                                   (MCFixupKind)PPC::fixup_ppc_brcond14abs));
+  return 0;
+}
+
+unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  const MCOperand &MO = MI.getOperand(OpNo);
+  if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO, Fixups);
+  
+  // Add a fixup for the immediate field.
+  Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16));
   return 0;
 }
 
@@ -170,8 +183,8 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
     return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits;
   
   // Add a fixup for the displacement field.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_lo16));
+  Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16));
   return RegBits;
 }
 
@@ -185,11 +198,11 @@ unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
   
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isImm())
-    return (getMachineOpValue(MI, MO, Fixups) & 0x3FFF) | RegBits;
+    return ((getMachineOpValue(MI, MO, Fixups) >> 2) & 0x3FFF) | RegBits;
   
   // Add a fixup for the displacement field.
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_lo16_ds));
+  Fixups.push_back(MCFixup::Create(2, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_half16ds));
   return RegBits;
 }
 
@@ -203,19 +216,29 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo,
   // hint to the linker that this statement is part of a relocation sequence.
   // Return the thread-pointer register's encoding.
   Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
-                                   (MCFixupKind)PPC::fixup_ppc_tlsreg));
-  return CTX.getRegisterInfo().getEncodingValue(PPC::X13);
+                                   (MCFixupKind)PPC::fixup_ppc_nofixup));
+  return CTX.getRegisterInfo()->getEncodingValue(PPC::X13);
+}
+
+unsigned PPCMCCodeEmitter::getTLSCallEncoding(const MCInst &MI, unsigned OpNo,
+                                       SmallVectorImpl<MCFixup> &Fixups) const {
+  // For special TLS calls, we need two fixups; one for the branch target
+  // (__tls_get_addr), which we create via getDirectBrEncoding as usual,
+  // and one for the TLSGD or TLSLD symbol, which is emitted here.
+  const MCOperand &MO = MI.getOperand(OpNo+1);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+                                   (MCFixupKind)PPC::fixup_ppc_nofixup));
+  return getDirectBrEncoding(MI, OpNo, Fixups);
 }
 
 unsigned PPCMCCodeEmitter::
 get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
                     SmallVectorImpl<MCFixup> &Fixups) const {
   const MCOperand &MO = MI.getOperand(OpNo);
-  assert((MI.getOpcode() == PPC::MTCRF || 
-          MI.getOpcode() == PPC::MFOCRF ||
-          MI.getOpcode() == PPC::MTCRF8) &&
+  assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
+          MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
          (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
-  return 0x80 >> CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  return 0x80 >> CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 }
 
 
@@ -223,11 +246,12 @@ unsigned PPCMCCodeEmitter::
 getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                   SmallVectorImpl<MCFixup> &Fixups) const {
   if (MO.isReg()) {
-    // MTCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
     // The GPR operand should come through here though.
-    assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MFOCRF) ||
+    assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
+            MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
            MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
-    return CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+    return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   }
   
   assert(MO.isImm() &&
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
new file mode 100644
index 0000000..d7e8402
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -0,0 +1,155 @@
+//===-- PPCMCExpr.cpp - PPC specific MC expression classes ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppcmcexpr"
+#include "PPCMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+using namespace llvm;
+
+const PPCMCExpr*
+PPCMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
+                  bool isDarwin, MCContext &Ctx) {
+  return new (Ctx) PPCMCExpr(Kind, Expr, isDarwin);
+}
+
+void PPCMCExpr::PrintImpl(raw_ostream &OS) const {
+  if (isDarwinSyntax()) {
+    switch (Kind) {
+    default: llvm_unreachable("Invalid kind!");
+    case VK_PPC_LO: OS << "lo16"; break;
+    case VK_PPC_HI: OS << "hi16"; break;
+    case VK_PPC_HA: OS << "ha16"; break;
+    }
+
+    OS << '(';
+    getSubExpr()->print(OS);
+    OS << ')';
+  } else {
+    getSubExpr()->print(OS);
+
+    switch (Kind) {
+    default: llvm_unreachable("Invalid kind!");
+    case VK_PPC_LO: OS << "@l"; break;
+    case VK_PPC_HI: OS << "@h"; break;
+    case VK_PPC_HA: OS << "@ha"; break;
+    case VK_PPC_HIGHER: OS << "@higher"; break;
+    case VK_PPC_HIGHERA: OS << "@highera"; break;
+    case VK_PPC_HIGHEST: OS << "@highest"; break;
+    case VK_PPC_HIGHESTA: OS << "@highesta"; break;
+    }
+  }
+}
+
+bool
+PPCMCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                     const MCAsmLayout *Layout) const {
+  MCValue Value;
+
+  if (!Layout || !getSubExpr()->EvaluateAsRelocatable(Value, *Layout))
+    return false;
+
+  if (Value.isAbsolute()) {
+    int64_t Result = Value.getConstant();
+    switch (Kind) {
+      default:
+        llvm_unreachable("Invalid kind!");
+      case VK_PPC_LO:
+        Result = Result & 0xffff;
+        break;
+      case VK_PPC_HI:
+        Result = (Result >> 16) & 0xffff;
+        break;
+      case VK_PPC_HA:
+        Result = ((Result + 0x8000) >> 16) & 0xffff;
+        break;
+      case VK_PPC_HIGHER:
+        Result = (Result >> 32) & 0xffff;
+        break;
+      case VK_PPC_HIGHERA:
+        Result = ((Result + 0x8000) >> 32) & 0xffff;
+        break;
+      case VK_PPC_HIGHEST:
+        Result = (Result >> 48) & 0xffff;
+        break;
+      case VK_PPC_HIGHESTA:
+        Result = ((Result + 0x8000) >> 48) & 0xffff;
+        break;
+    }
+    Res = MCValue::get(Result);
+  } else {
+    MCContext &Context = Layout->getAssembler().getContext();
+    const MCSymbolRefExpr *Sym = Value.getSymA();
+    MCSymbolRefExpr::VariantKind Modifier = Sym->getKind();
+    if (Modifier != MCSymbolRefExpr::VK_None)
+      return false;
+    switch (Kind) {
+      default:
+        llvm_unreachable("Invalid kind!");
+      case VK_PPC_LO:
+        Modifier = MCSymbolRefExpr::VK_PPC_LO;
+        break;
+      case VK_PPC_HI:
+        Modifier = MCSymbolRefExpr::VK_PPC_HI;
+        break;
+      case VK_PPC_HA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HA;
+        break;
+      case VK_PPC_HIGHERA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHERA;
+        break;
+      case VK_PPC_HIGHER:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHER;
+        break;
+      case VK_PPC_HIGHEST:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHEST;
+        break;
+      case VK_PPC_HIGHESTA:
+        Modifier = MCSymbolRefExpr::VK_PPC_HIGHESTA;
+        break;
+    }
+    Sym = MCSymbolRefExpr::Create(&Sym->getSymbol(), Modifier, Context);
+    Res = MCValue::get(Sym, Value.getSymB(), Value.getConstant());
+  }
+
+  return true;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+static void AddValueSymbols_(const MCExpr *Value, MCAssembler *Asm) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+
+  case MCExpr::Constant:
+    break;
+
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbols_(BE->getLHS(), Asm);
+    AddValueSymbols_(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbols_(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void PPCMCExpr::AddValueSymbols(MCAssembler *Asm) const {
+  AddValueSymbols_(getSubExpr(), Asm);
+}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
new file mode 100644
index 0000000..e44c7c1
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -0,0 +1,96 @@
+//===-- PPCMCExpr.h - PPC specific MC expression classes --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCMCEXPR_H
+#define PPCMCEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCAsmLayout.h"
+
+namespace llvm {
+
+class PPCMCExpr : public MCTargetExpr {
+public:
+  enum VariantKind {
+    VK_PPC_None,
+    VK_PPC_LO,
+    VK_PPC_HI,
+    VK_PPC_HA,
+    VK_PPC_HIGHER,
+    VK_PPC_HIGHERA,
+    VK_PPC_HIGHEST,
+    VK_PPC_HIGHESTA
+  };
+
+private:
+  const VariantKind Kind;
+  const MCExpr *Expr;
+  bool IsDarwin;
+
+  explicit PPCMCExpr(VariantKind _Kind, const MCExpr *_Expr,
+                     bool _IsDarwin)
+    : Kind(_Kind), Expr(_Expr), IsDarwin(_IsDarwin) {}
+
+public:
+  /// @name Construction
+  /// @{
+
+  static const PPCMCExpr *Create(VariantKind Kind, const MCExpr *Expr,
+                                 bool isDarwin, MCContext &Ctx);
+
+  static const PPCMCExpr *CreateLo(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return Create(VK_PPC_LO, Expr, isDarwin, Ctx);
+  }
+
+  static const PPCMCExpr *CreateHi(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return Create(VK_PPC_HI, Expr, isDarwin, Ctx);
+  }
+
+  static const PPCMCExpr *CreateHa(const MCExpr *Expr,
+                                   bool isDarwin, MCContext &Ctx) {
+    return Create(VK_PPC_HA, Expr, isDarwin, Ctx);
+  }
+
+  /// @}
+  /// @name Accessors
+  /// @{
+
+  /// getOpcode - Get the kind of this expression.
+  VariantKind getKind() const { return Kind; }
+
+  /// getSubExpr - Get the child of this expression.
+  const MCExpr *getSubExpr() const { return Expr; }
+
+  /// isDarwinSyntax - True if expression is to be printed using Darwin syntax.
+  bool isDarwinSyntax() const { return IsDarwin; }
+
+
+  /// @}
+
+  void PrintImpl(raw_ostream &OS) const;
+  bool EvaluateAsRelocatableImpl(MCValue &Res,
+                                 const MCAsmLayout *Layout) const;
+  void AddValueSymbols(MCAssembler *) const;
+  const MCSection *FindAssociatedSection() const {
+    return getSubExpr()->FindAssociatedSection();
+  }
+
+  // There are no TLS PPCMCExprs at the moment.
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+
+  static bool classof(const MCExpr *E) {
+    return E->getKind() == MCExpr::Target;
+  }
+};
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 2209f93..f18d095 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -14,13 +14,16 @@
 #include "PPCMCTargetDesc.h"
 #include "InstPrinter/PPCInstPrinter.h"
 #include "PPCMCAsmInfo.h"
+#include "PPCTargetStreamer.h"
 #include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
 #define GET_INSTRINFO_MC_DESC
@@ -34,6 +37,9 @@
 
 using namespace llvm;
 
+// Pin the vtable to this file.
+PPCTargetStreamer::~PPCTargetStreamer() {}
+
 static MCInstrInfo *createPPCMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitPPCMCInstrInfo(X);
@@ -42,7 +48,8 @@ static MCInstrInfo *createPPCMCInstrInfo() {
 
 static MCRegisterInfo *createPPCMCRegisterInfo(StringRef TT) {
   Triple TheTriple(TT);
-  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64);
+  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+                  TheTriple.getArch() == Triple::ppc64le);
   unsigned Flavour = isPPC64 ? 0 : 1;
   unsigned RA = isPPC64 ? PPC::LR8 : PPC::LR;
 
@@ -58,9 +65,10 @@ static MCSubtargetInfo *createPPCMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   Triple TheTriple(TT);
-  bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
+  bool isPPC64 = (TheTriple.getArch() == Triple::ppc64 ||
+                  TheTriple.getArch() == Triple::ppc64le);
 
   MCAsmInfo *MAI;
   if (TheTriple.isOSDarwin())
@@ -69,9 +77,10 @@ static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
     MAI = new PPCLinuxMCAsmInfo(isPPC64);
 
   // Initial state of the frame pointer is R1.
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(isPPC64? PPC::X1 : PPC::R1, 0);
-  MAI->addInitialFrameState(0, Dst, Src);
+  unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(Reg, true), 0);
+  MAI->addInitialFrameState(Inst);
 
   return MAI;
 }
@@ -90,13 +99,37 @@ static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM,
   }
   if (CM == CodeModel::Default) {
     Triple T(TT);
-    if (!T.isOSDarwin() && T.getArch() == Triple::ppc64)
+    if (!T.isOSDarwin() &&
+        (T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le))
       CM = CodeModel::Medium;
   }
   X->InitMCCodeGenInfo(RM, CM, OL);
   return X;
 }
 
+namespace {
+class PPCTargetAsmStreamer : public PPCTargetStreamer {
+  formatted_raw_ostream &OS;
+
+public:
+  PPCTargetAsmStreamer(formatted_raw_ostream &OS) : OS(OS) {}
+  virtual void emitTCEntry(const MCSymbol &S) {
+    OS << "\t.tc ";
+    OS << S.getName();
+    OS << "[TC],";
+    OS << S.getName();
+    OS << '\n';
+  }
+};
+
+class PPCTargetELFStreamer : public PPCTargetStreamer {
+  virtual void emitTCEntry(const MCSymbol &S) {
+    // Creates a R_PPC64_TOC relocation
+    Streamer->EmitSymbolValue(&S, 8);
+  }
+};
+}
+
 // This is duplicated code. Refactor this.
 static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCContext &Ctx, MCAsmBackend &MAB,
@@ -107,7 +140,20 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
   if (Triple(TT).isOSDarwin())
     return createMachOStreamer(Ctx, MAB, OS, Emitter, RelaxAll);
 
-  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+  PPCTargetStreamer *S = new PPCTargetELFStreamer();
+  return createELFStreamer(Ctx, S, MAB, OS, Emitter, RelaxAll, NoExecStack);
+}
+
+static MCStreamer *
+createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
+                    bool isVerboseAsm, bool useLoc, bool useCFI,
+                    bool useDwarfDirectory, MCInstPrinter *InstPrint,
+                    MCCodeEmitter *CE, MCAsmBackend *TAB, bool ShowInst) {
+  PPCTargetStreamer *S = new PPCTargetAsmStreamer(OS);
+
+  return llvm::createAsmStreamer(Ctx, S, OS, isVerboseAsm, useLoc, useCFI,
+                                 useDwarfDirectory, InstPrint, CE, TAB,
+                                 ShowInst);
 }
 
 static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
@@ -116,45 +162,65 @@ static MCInstPrinter *createPPCMCInstPrinter(const Target &T,
                                              const MCInstrInfo &MII,
                                              const MCRegisterInfo &MRI,
                                              const MCSubtargetInfo &STI) {
-  return new PPCInstPrinter(MAI, MII, MRI, SyntaxVariant);
+  bool isDarwin = Triple(STI.getTargetTriple()).isOSDarwin();
+  return new PPCInstPrinter(MAI, MII, MRI, isDarwin);
 }
 
 extern "C" void LLVMInitializePowerPCTargetMC() {
   // Register the MC asm info.
   RegisterMCAsmInfoFn C(ThePPC32Target, createPPCMCAsmInfo);
   RegisterMCAsmInfoFn D(ThePPC64Target, createPPCMCAsmInfo);  
+  RegisterMCAsmInfoFn E(ThePPC64LETarget, createPPCMCAsmInfo);  
 
   // Register the MC codegen info.
   TargetRegistry::RegisterMCCodeGenInfo(ThePPC32Target, createPPCMCCodeGenInfo);
   TargetRegistry::RegisterMCCodeGenInfo(ThePPC64Target, createPPCMCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(ThePPC64LETarget,
+                                        createPPCMCCodeGenInfo);
 
   // Register the MC instruction info.
   TargetRegistry::RegisterMCInstrInfo(ThePPC32Target, createPPCMCInstrInfo);
   TargetRegistry::RegisterMCInstrInfo(ThePPC64Target, createPPCMCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(ThePPC64LETarget,
+                                      createPPCMCInstrInfo);
 
   // Register the MC register info.
   TargetRegistry::RegisterMCRegInfo(ThePPC32Target, createPPCMCRegisterInfo);
   TargetRegistry::RegisterMCRegInfo(ThePPC64Target, createPPCMCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(ThePPC64LETarget, createPPCMCRegisterInfo);
 
   // Register the MC subtarget info.
   TargetRegistry::RegisterMCSubtargetInfo(ThePPC32Target,
                                           createPPCMCSubtargetInfo);
   TargetRegistry::RegisterMCSubtargetInfo(ThePPC64Target,
                                           createPPCMCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(ThePPC64LETarget,
+                                          createPPCMCSubtargetInfo);
 
   // Register the MC Code Emitter
   TargetRegistry::RegisterMCCodeEmitter(ThePPC32Target, createPPCMCCodeEmitter);
   TargetRegistry::RegisterMCCodeEmitter(ThePPC64Target, createPPCMCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(ThePPC64LETarget,
+                                        createPPCMCCodeEmitter);
   
     // Register the asm backend.
   TargetRegistry::RegisterMCAsmBackend(ThePPC32Target, createPPCAsmBackend);
   TargetRegistry::RegisterMCAsmBackend(ThePPC64Target, createPPCAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(ThePPC64LETarget, createPPCAsmBackend);
   
   // Register the object streamer.
   TargetRegistry::RegisterMCObjectStreamer(ThePPC32Target, createMCStreamer);
   TargetRegistry::RegisterMCObjectStreamer(ThePPC64Target, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(ThePPC64LETarget, createMCStreamer);
+
+  // Register the asm streamer.
+  TargetRegistry::RegisterAsmStreamer(ThePPC32Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(ThePPC64Target, createMCAsmStreamer);
+  TargetRegistry::RegisterAsmStreamer(ThePPC64LETarget, createMCAsmStreamer);
 
   // Register the MCInstPrinter.
   TargetRegistry::RegisterMCInstPrinter(ThePPC32Target, createPPCMCInstPrinter);
   TargetRegistry::RegisterMCInstPrinter(ThePPC64Target, createPPCMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(ThePPC64LETarget,
+                                        createPPCMCInstPrinter);
 }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 38a7420..0b0ca24 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -33,18 +33,24 @@ class raw_ostream;
 
 extern Target ThePPC32Target;
 extern Target ThePPC64Target;
+extern Target ThePPC64LETarget;
   
 MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
                                       const MCRegisterInfo &MRI,
                                       const MCSubtargetInfo &STI,
                                       MCContext &Ctx);
 
-MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createPPCAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                  StringRef TT, StringRef CPU);
 
 /// createPPCELFObjectWriter - Construct an PPC ELF object writer.
 MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
                                          bool Is64Bit,
                                          uint8_t OSABI);
+/// createPPCELFObjectWriter - Construct a PPC Mach-O object writer.
+MCObjectWriter *createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+                                          uint32_t CPUType,
+                                          uint32_t CPUSubtype);
 } // End llvm namespace
 
 // Generated files will use "namespace PPC". To avoid symbol clash,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
new file mode 100644
index 0000000..bbafe2e
--- /dev/null
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -0,0 +1,389 @@
+//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
+
+using namespace llvm;
+
+namespace {
+class PPCMachObjectWriter : public MCMachObjectTargetWriter {
+  bool RecordScatteredRelocation(MachObjectWriter *Writer,
+                                 const MCAssembler &Asm,
+                                 const MCAsmLayout &Layout,
+                                 const MCFragment *Fragment,
+                                 const MCFixup &Fixup, MCValue Target,
+                                 unsigned Log2Size, uint64_t &FixedValue);
+
+  void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                           const MCAsmLayout &Layout,
+                           const MCFragment *Fragment, const MCFixup &Fixup,
+                           MCValue Target, uint64_t &FixedValue);
+
+public:
+  PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
+      : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype,
+                                 /*UseAggressiveSymbolFolding=*/Is64Bit) {}
+
+  void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
+                        const MCAsmLayout &Layout, const MCFragment *Fragment,
+                        const MCFixup &Fixup, MCValue Target,
+                        uint64_t &FixedValue) {
+    if (Writer->is64Bit()) {
+      report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
+    } else
+      RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                          FixedValue);
+  }
+};
+}
+
+/// computes the log2 of the size of the relocation,
+/// used for relocation_info::r_length.
+static unsigned getFixupKindLog2Size(unsigned Kind) {
+  switch (Kind) {
+  default:
+    report_fatal_error("log2size(FixupKind): Unhandled fixup kind!");
+  case FK_PCRel_1:
+  case FK_Data_1:
+    return 0;
+  case FK_PCRel_2:
+  case FK_Data_2:
+    return 1;
+  case FK_PCRel_4:
+  case PPC::fixup_ppc_brcond14:
+  case PPC::fixup_ppc_half16:
+  case PPC::fixup_ppc_br24:
+  case FK_Data_4:
+    return 2;
+  case FK_PCRel_8:
+  case FK_Data_8:
+    return 3;
+  }
+  return 0;
+}
+
+/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
+/// Outline based on PPCELFObjectWriter::getRelocTypeInner().
+static unsigned getRelocType(const MCValue &Target,
+                             const MCFixupKind FixupKind, // from
+                                                          // Fixup.getKind()
+                             const bool IsPCRel) {
+  const MCSymbolRefExpr::VariantKind Modifier =
+      Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+                          : Target.getSymA()->getKind();
+  // determine the type of the relocation
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
+  if (IsPCRel) { // relative to PC
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (relative)");
+    case PPC::fixup_ppc_br24:
+      Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24
+      break;
+    case PPC::fixup_ppc_brcond14:
+      Type = MachO::PPC_RELOC_BR14;
+      break;
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16;
+        break;
+      }
+      break;
+    }
+  } else {
+    switch ((unsigned)FixupKind) {
+    default:
+      report_fatal_error("Unimplemented fixup kind (absolute)!");
+    case PPC::fixup_ppc_half16:
+      switch (Modifier) {
+      default:
+        llvm_unreachable("Unsupported modifier for half16 fixup");
+      case MCSymbolRefExpr::VK_PPC_HA:
+        Type = MachO::PPC_RELOC_HA16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_LO:
+        Type = MachO::PPC_RELOC_LO16_SECTDIFF;
+        break;
+      case MCSymbolRefExpr::VK_PPC_HI:
+        Type = MachO::PPC_RELOC_HI16_SECTDIFF;
+        break;
+      }
+      break;
+    case FK_Data_4:
+      break;
+    case FK_Data_2:
+      break;
+    }
+  }
+  return Type;
+}
+
+static void makeRelocationInfo(MachO::any_relocation_info &MRE,
+                               const uint32_t FixupOffset, const uint32_t Index,
+                               const unsigned IsPCRel, const unsigned Log2Size,
+                               const unsigned IsExtern, const unsigned Type) {
+  MRE.r_word0 = FixupOffset;
+  // The bitfield offsets that work (as determined by trial-and-error)
+  // are different than what is documented in the mach-o manuals.
+  // This appears to be an endianness issue; reversing the order of the
+  // documented bitfields in <llvm/Support/MachO.h> fixes this (but
+  // breaks x86/ARM assembly).
+  MRE.r_word1 = ((Index << 8) |    // was << 0
+                 (IsPCRel << 7) |  // was << 24
+                 (Log2Size << 5) | // was << 25
+                 (IsExtern << 4) | // was << 27
+                 (Type << 0));     // was << 28
+}
+
+static void
+makeScatteredRelocationInfo(MachO::any_relocation_info &MRE,
+                            const uint32_t Addr, const unsigned Type,
+                            const unsigned Log2Size, const unsigned IsPCRel,
+                            const uint32_t Value2) {
+  // For notes on bitfield positions and endianness, see:
+  // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry
+  MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) |
+                 (IsPCRel << 30) | MachO::R_SCATTERED);
+  MRE.r_word1 = Value2;
+}
+
+/// Compute fixup offset (address).
+static uint32_t getFixupOffset(const MCAsmLayout &Layout,
+                               const MCFragment *Fragment,
+                               const MCFixup &Fixup) {
+  uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  // On Mach-O, ppc_fixup_half16 relocations must refer to the
+  // start of the instruction, not the second halfword, as ELF does
+  if (unsigned(Fixup.getKind()) == PPC::fixup_ppc_half16)
+    FixupOffset &= ~uint32_t(3);
+  return FixupOffset;
+}
+
+/// \return false if falling back to using non-scattered relocation,
+/// otherwise true for normal scattered relocation.
+/// based on X86MachObjectWriter::RecordScatteredRelocation
+/// and ARMMachObjectWriter::RecordScatteredRelocation
+bool PPCMachObjectWriter::RecordScatteredRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    unsigned Log2Size, uint64_t &FixedValue) {
+  // caller already computes these, can we just pass and reuse?
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  const MCFixupKind FK = Fixup.getKind();
+  const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned Type = getRelocType(Target, FK, IsPCRel);
+
+  // Is this a local or SECTDIFF relocation entry?
+  // SECTDIFF relocation entries have symbol subtractions,
+  // and require two entries, the first for the add-symbol value,
+  // the second for the subtract-symbol value.
+
+  // See <reloc.h>.
+  const MCSymbol *A = &Target.getSymA()->getSymbol();
+  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+
+  if (!A_SD->getFragment())
+    report_fatal_error("symbol '" + A->getName() +
+                       "' can not be undefined in a subtraction expression");
+
+  uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
+  uint64_t SecAddr =
+      Writer->getSectionAddress(A_SD->getFragment()->getParent());
+  FixedValue += SecAddr;
+  uint32_t Value2 = 0;
+
+  if (const MCSymbolRefExpr *B = Target.getSymB()) {
+    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+
+    if (!B_SD->getFragment())
+      report_fatal_error("symbol '" + B->getSymbol().getName() +
+                         "' can not be undefined in a subtraction expression");
+
+    // FIXME: is Type correct? see include/llvm/Support/MachO.h
+    Value2 = Writer->getSymbolAddress(B_SD, Layout);
+    FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
+  }
+  // FIXME: does FixedValue get used??
+
+  // Relocations are written out in reverse order, so the PAIR comes first.
+  if (Type == MachO::PPC_RELOC_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HI16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_HA16_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LO14_SECTDIFF ||
+      Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) {
+    // X86 had this piece, but ARM does not
+    // If the offset is too large to fit in a scattered relocation,
+    // we're hosed. It's an unfortunate limitation of the MachO format.
+    if (FixupOffset > 0xffffff) {
+      char Buffer[32];
+      format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+      Asm.getContext().FatalError(Fixup.getLoc(),
+                                  Twine("Section too large, can't encode "
+                                        "r_address (") +
+                                      Buffer + ") into 24 bits of scattered "
+                                               "relocation entry.");
+      llvm_unreachable("fatal error returned?!");
+    }
+
+    // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
+    // see PPCMCExpr::EvaluateAsRelocatableImpl()
+    uint32_t other_half = 0;
+    switch (Type) {
+    case MachO::PPC_RELOC_LO16_SECTDIFF:
+      other_half = (FixedValue >> 16) & 0xffff;
+      // applyFixupOffset longer extracts the high part because it now assumes
+      // this was already done.
+      // It looks like this is not true for the FixedValue needed with Mach-O
+      // relocs.
+      // So we need to adjust FixedValue again here.
+      FixedValue &= 0xffff;
+      break;
+    case MachO::PPC_RELOC_HA16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue =
+          ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff;
+      break;
+    case MachO::PPC_RELOC_HI16_SECTDIFF:
+      other_half = FixedValue & 0xffff;
+      FixedValue = (FixedValue >> 16) & 0xffff;
+      break;
+    default:
+      llvm_unreachable("Invalid PPC scattered relocation type.");
+      break;
+    }
+
+    MachO::any_relocation_info MRE;
+    makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
+                                Log2Size, IsPCRel, Value2);
+    Writer->addRelocation(Fragment->getParent(), MRE);
+  } else {
+    // If the offset is more than 24-bits, it won't fit in a scattered
+    // relocation offset field, so we fall back to using a non-scattered
+    // relocation. This is a bit risky, as if the offset reaches out of
+    // the block and the linker is doing scattered loading on this
+    // symbol, things can go badly.
+    //
+    // Required for 'as' compatibility.
+    if (FixupOffset > 0xffffff)
+      return false;
+  }
+  MachO::any_relocation_info MRE;
+  makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
+  Writer->addRelocation(Fragment->getParent(), MRE);
+  return true;
+}
+
+// see PPCELFObjectWriter for a general outline of cases
+void PPCMachObjectWriter::RecordPPCRelocation(
+    MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
+    const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
+    uint64_t &FixedValue) {
+  const MCFixupKind FK = Fixup.getKind(); // unsigned
+  const unsigned Log2Size = getFixupKindLog2Size(FK);
+  const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
+  const unsigned RelocType = getRelocType(Target, FK, IsPCRel);
+
+  // If this is a difference or a defined symbol plus an offset, then we need a
+  // scattered relocation entry. Differences always require scattered
+  // relocations.
+  if (Target.getSymB() &&
+      // Q: are branch targets ever scattered?
+      RelocType != MachO::PPC_RELOC_BR24 &&
+      RelocType != MachO::PPC_RELOC_BR14) {
+    RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
+                              Log2Size, FixedValue);
+    return;
+  }
+
+  // this doesn't seem right for RIT_PPC_BR24
+  // Get the symbol data, if any.
+  MCSymbolData *SD = 0;
+  if (Target.getSymA())
+    SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+
+  // See <reloc.h>.
+  const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
+  unsigned Index = 0;
+  unsigned IsExtern = 0;
+  unsigned Type = RelocType;
+
+  if (Target.isAbsolute()) { // constant
+                             // SymbolNum of 0 indicates the absolute section.
+                             //
+    // FIXME: Currently, these are never generated (see code below). I cannot
+    // find a case where they are actually emitted.
+    report_fatal_error("FIXME: relocations to absolute targets "
+                       "not yet implemented");
+    // the above line stolen from ARM, not sure
+  } else {
+    // Resolve constant variables.
+    if (SD->getSymbol().isVariable()) {
+      int64_t Res;
+      if (SD->getSymbol().getVariableValue()->EvaluateAsAbsolute(
+              Res, Layout, Writer->getSectionAddressMap())) {
+        FixedValue = Res;
+        return;
+      }
+    }
+
+    // Check whether we need an external or internal relocation.
+    if (Writer->doesSymbolRequireExternRelocation(SD)) {
+      IsExtern = 1;
+      Index = SD->getIndex();
+      // For external relocations, make sure to offset the fixup value to
+      // compensate for the addend of the symbol address, if it was
+      // undefined. This occurs with weak definitions, for example.
+      if (!SD->Symbol->isUndefined())
+        FixedValue -= Layout.getSymbolOffset(SD);
+    } else {
+      // The index is the section ordinal (1-based).
+      const MCSectionData &SymSD =
+          Asm.getSectionData(SD->getSymbol().getSection());
+      Index = SymSD.getOrdinal() + 1;
+      FixedValue += Writer->getSectionAddress(&SymSD);
+    }
+    if (IsPCRel)
+      FixedValue -= Writer->getSectionAddress(Fragment->getParent());
+  }
+
+  // struct relocation_info (8 bytes)
+  MachO::any_relocation_info MRE;
+  makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, IsExtern,
+                     Type);
+  Writer->addRelocation(Fragment->getParent(), MRE);
+}
+
+MCObjectWriter *llvm::createPPCMachObjectWriter(raw_ostream &OS, bool Is64Bit,
+                                                uint32_t CPUType,
+                                                uint32_t CPUSubtype) {
+  return createMachObjectWriter(
+      new PPCMachObjectWriter(Is64Bit, CPUType, CPUSubtype), OS,
+      /*IsLittleEndian=*/false);
+}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
index 853e505..63facc5 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.cpp
@@ -26,6 +26,22 @@ PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
   case PPC::PRED_LE: return PPC::PRED_GT;
   case PPC::PRED_NU: return PPC::PRED_UN;
   case PPC::PRED_UN: return PPC::PRED_NU;
+  case PPC::PRED_EQ_MINUS: return PPC::PRED_NE_PLUS;
+  case PPC::PRED_NE_MINUS: return PPC::PRED_EQ_PLUS;
+  case PPC::PRED_LT_MINUS: return PPC::PRED_GE_PLUS;
+  case PPC::PRED_GE_MINUS: return PPC::PRED_LT_PLUS;
+  case PPC::PRED_GT_MINUS: return PPC::PRED_LE_PLUS;
+  case PPC::PRED_LE_MINUS: return PPC::PRED_GT_PLUS;
+  case PPC::PRED_NU_MINUS: return PPC::PRED_UN_PLUS;
+  case PPC::PRED_UN_MINUS: return PPC::PRED_NU_PLUS;
+  case PPC::PRED_EQ_PLUS: return PPC::PRED_NE_MINUS;
+  case PPC::PRED_NE_PLUS: return PPC::PRED_EQ_MINUS;
+  case PPC::PRED_LT_PLUS: return PPC::PRED_GE_MINUS;
+  case PPC::PRED_GE_PLUS: return PPC::PRED_LT_MINUS;
+  case PPC::PRED_GT_PLUS: return PPC::PRED_LE_MINUS;
+  case PPC::PRED_LE_PLUS: return PPC::PRED_GT_MINUS;
+  case PPC::PRED_NU_PLUS: return PPC::PRED_UN_MINUS;
+  case PPC::PRED_UN_PLUS: return PPC::PRED_NU_MINUS;
   }
   llvm_unreachable("Unknown PPC branch opcode!");
 }
@@ -40,6 +56,22 @@ PPC::Predicate PPC::getSwappedPredicate(PPC::Predicate Opcode) {
   case PPC::PRED_LE: return PPC::PRED_GE;
   case PPC::PRED_NU: return PPC::PRED_NU;
   case PPC::PRED_UN: return PPC::PRED_UN;
+  case PPC::PRED_EQ_MINUS: return PPC::PRED_EQ_MINUS;
+  case PPC::PRED_NE_MINUS: return PPC::PRED_NE_MINUS;
+  case PPC::PRED_LT_MINUS: return PPC::PRED_GT_MINUS;
+  case PPC::PRED_GE_MINUS: return PPC::PRED_LE_MINUS;
+  case PPC::PRED_GT_MINUS: return PPC::PRED_LT_MINUS;
+  case PPC::PRED_LE_MINUS: return PPC::PRED_GE_MINUS;
+  case PPC::PRED_NU_MINUS: return PPC::PRED_NU_MINUS;
+  case PPC::PRED_UN_MINUS: return PPC::PRED_UN_MINUS;
+  case PPC::PRED_EQ_PLUS: return PPC::PRED_EQ_PLUS;
+  case PPC::PRED_NE_PLUS: return PPC::PRED_NE_PLUS;
+  case PPC::PRED_LT_PLUS: return PPC::PRED_GT_PLUS;
+  case PPC::PRED_GE_PLUS: return PPC::PRED_LE_PLUS;
+  case PPC::PRED_GT_PLUS: return PPC::PRED_LT_PLUS;
+  case PPC::PRED_LE_PLUS: return PPC::PRED_GE_PLUS;
+  case PPC::PRED_NU_PLUS: return PPC::PRED_NU_PLUS;
+  case PPC::PRED_UN_PLUS: return PPC::PRED_UN_PLUS;
   }
   llvm_unreachable("Unknown PPC branch opcode!");
 }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
index 444758c..d498c2f 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCPredicates.h
@@ -25,14 +25,30 @@ namespace llvm {
 namespace PPC {
   /// Predicate - These are "(BI << 5) | BO"  for various predicates.
   enum Predicate {
-    PRED_LT     = (0 << 5) | 12,
-    PRED_LE     = (1 << 5) |  4,
-    PRED_EQ     = (2 << 5) | 12,
-    PRED_GE     = (0 << 5) |  4,
-    PRED_GT     = (1 << 5) | 12,
-    PRED_NE     = (2 << 5) |  4,
-    PRED_UN     = (3 << 5) | 12,
-    PRED_NU     = (3 << 5) |  4
+    PRED_LT       = (0 << 5) | 12,
+    PRED_LE       = (1 << 5) |  4,
+    PRED_EQ       = (2 << 5) | 12,
+    PRED_GE       = (0 << 5) |  4,
+    PRED_GT       = (1 << 5) | 12,
+    PRED_NE       = (2 << 5) |  4,
+    PRED_UN       = (3 << 5) | 12,
+    PRED_NU       = (3 << 5) |  4,
+    PRED_LT_MINUS = (0 << 5) | 14,
+    PRED_LE_MINUS = (1 << 5) |  6,
+    PRED_EQ_MINUS = (2 << 5) | 14,
+    PRED_GE_MINUS = (0 << 5) |  6,
+    PRED_GT_MINUS = (1 << 5) | 14,
+    PRED_NE_MINUS = (2 << 5) |  6,
+    PRED_UN_MINUS = (3 << 5) | 14,
+    PRED_NU_MINUS = (3 << 5) |  6,
+    PRED_LT_PLUS  = (0 << 5) | 15,
+    PRED_LE_PLUS  = (1 << 5) |  7,
+    PRED_EQ_PLUS  = (2 << 5) | 15,
+    PRED_GE_PLUS  = (0 << 5) |  7,
+    PRED_GT_PLUS  = (1 << 5) | 15,
+    PRED_NE_PLUS  = (2 << 5) |  7,
+    PRED_UN_PLUS  = (3 << 5) | 15,
+    PRED_NU_PLUS  = (3 << 5) |  7
   };
   
   /// Invert the specified predicate.  != -> ==, < -> >=.
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
index 6666694..21fdcd9 100644
--- a/lib/Target/PowerPC/Makefile
+++ b/lib/Target/PowerPC/Makefile
@@ -16,7 +16,7 @@ BUILT_SOURCES = PPCGenRegisterInfo.inc PPCGenAsmMatcher.inc \
                 PPCGenAsmWriter.inc  PPCGenCodeEmitter.inc \
                 PPCGenInstrInfo.inc PPCGenDAGISel.inc \
                 PPCGenSubtargetInfo.inc PPCGenCallingConv.inc \
-                PPCGenMCCodeEmitter.inc
+                PPCGenMCCodeEmitter.inc PPCGenFastISel.inc
 
 DIRS = AsmParser InstPrinter TargetInfo MCTargetDesc
 
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
index b4be51a..f0d5af2 100644
--- a/lib/Target/PowerPC/PPC.h
+++ b/lib/Target/PowerPC/PPC.h
@@ -30,7 +30,10 @@ namespace llvm {
   class AsmPrinter;
   class MCInst;
 
-  FunctionPass *createPPCCTRLoops();
+  FunctionPass *createPPCCTRLoops(PPCTargetMachine &TM);
+#ifndef NDEBUG
+  FunctionPass *createPPCCTRLoopsVerify();
+#endif
   FunctionPass *createPPCEarlyReturnPass();
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
@@ -71,18 +74,21 @@ namespace llvm {
     /// The next are not flags but distinct values.
     MO_ACCESS_MASK = 0xf0,
 
-    /// MO_LO16, MO_HA16 - lo16(symbol) and ha16(symbol)
-    MO_LO16 = 1 << 4,
-    MO_HA16 = 2 << 4,
+    /// MO_LO, MO_HA - lo16(symbol) and ha16(symbol)
+    MO_LO = 1 << 4,
+    MO_HA = 2 << 4,
 
-    MO_TPREL16_HA = 3 << 4,
-    MO_TPREL16_LO = 4 << 4,
+    MO_TPREL_LO = 4 << 4,
+    MO_TPREL_HA = 3 << 4,
 
     /// These values identify relocations on immediates folded
     /// into memory operations.
-    MO_DTPREL16_LO = 5 << 4,
-    MO_TLSLD16_LO  = 6 << 4,
-    MO_TOC16_LO    = 7 << 4
+    MO_DTPREL_LO = 5 << 4,
+    MO_TLSLD_LO  = 6 << 4,
+    MO_TOC_LO    = 7 << 4,
+
+    // Symbol for VK_PPC_TLS fixup attached to an ADD instruction
+    MO_TLS       = 8 << 4
   };
   } // end namespace PPCII
   
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index eb73c67..54e3d40 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -57,6 +57,8 @@ def FeatureMFOCRF    : SubtargetFeature<"mfocrf","HasMFOCRF", "true",
                                         "Enable the MFOCRF instruction">;
 def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
                                         "Enable the fsqrt instruction">;
+def FeatureFCPSGN    : SubtargetFeature<"fcpsgn", "HasFCPSGN", "true",
+                                        "Enable the fcpsgn instruction">;
 def FeatureFRE       : SubtargetFeature<"fre", "HasFRE", "true",
                                         "Enable the fre instruction">;
 def FeatureFRES      : SubtargetFeature<"fres", "HasFRES", "true",
@@ -85,6 +87,13 @@ def FeatureBookE     : SubtargetFeature<"booke", "IsBookE", "true",
                                         "Enable Book E instructions">;
 def FeatureQPX       : SubtargetFeature<"qpx","HasQPX", "true",
                                         "Enable QPX instructions">;
+def FeatureVSX       : SubtargetFeature<"vsx","HasVSX", "true",
+                                        "Enable VSX instructions">;
+
+def DeprecatedMFTB   : SubtargetFeature<"", "DeprecatedMFTB", "true",
+                                        "Treat mftb as deprecated">;
+def DeprecatedDST    : SubtargetFeature<"", "DeprecatedDST", "true",
+  "Treat vector data stream cache control instructions as deprecated">;
 
 // Note: Future features to add when support is extended to more
 // recent ISA levels:
@@ -146,10 +155,10 @@ include "PPCInstrInfo.td"
 def : Processor<"generic", G3Itineraries, [Directive32]>;
 def : Processor<"440", PPC440Itineraries, [Directive440, FeatureISEL,
                                            FeatureFRES, FeatureFRSQRTE,
-                                           FeatureBookE]>;
+                                           FeatureBookE, DeprecatedMFTB]>;
 def : Processor<"450", PPC440Itineraries, [Directive440, FeatureISEL,
                                            FeatureFRES, FeatureFRSQRTE,
-                                           FeatureBookE]>;
+                                           FeatureBookE, DeprecatedMFTB]>;
 def : Processor<"601", G3Itineraries, [Directive601]>;
 def : Processor<"602", G3Itineraries, [Directive602]>;
 def : Processor<"603", G3Itineraries, [Directive603,
@@ -185,29 +194,32 @@ def : ProcessorModel<"g5", G5Model,
                   [Directive970, FeatureAltivec,
                    FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
                    FeatureFRES, FeatureFRSQRTE,
-                   Feature64Bit /*, Feature64BitRegs */]>;
+                   Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"e500mc", PPCE500mcModel,
                   [DirectiveE500mc, FeatureMFOCRF,
-                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
+                   DeprecatedMFTB]>;
 def : ProcessorModel<"e5500", PPCE5500Model,
                   [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
-                   FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+                   FeatureSTFIWX, FeatureBookE, FeatureISEL,
+                   DeprecatedMFTB]>;
 def : ProcessorModel<"a2", PPCA2Model,
                   [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
-               /*, Feature64BitRegs */]>;
+               /*, Feature64BitRegs */, DeprecatedMFTB]>;
 def : ProcessorModel<"a2q", PPCA2Model,
                   [DirectiveA2, FeatureBookE, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX, Feature64Bit
-               /*, Feature64BitRegs */, FeatureQPX]>;
+               /*, Feature64BitRegs */, FeatureQPX, DeprecatedMFTB]>;
 def : ProcessorModel<"pwr3", G5Model,
                   [DirectivePwr3, FeatureAltivec,
                    FeatureFRES, FeatureFRSQRTE, FeatureMFOCRF,
@@ -220,38 +232,48 @@ def : ProcessorModel<"pwr5", G5Model,
                   [DirectivePwr5, FeatureAltivec, FeatureMFOCRF,
                    FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES,
-                   FeatureSTFIWX, Feature64Bit]>;
+                   FeatureSTFIWX, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr5x", G5Model,
                   [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
                    FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES,
-                   FeatureSTFIWX, FeatureFPRND, Feature64Bit]>;
+                   FeatureSTFIWX, FeatureFPRND, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr6", G5Model,
                   [DirectivePwr6, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
                    FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
                    FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
-                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */]>;
+                   FeatureFPRND, Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr6x", G5Model,
                   [DirectivePwr5x, FeatureAltivec, FeatureMFOCRF,
-                   FeatureFSqrt, FeatureFRE, FeatureFRES,
+                   FeatureFCPSGN, FeatureFSqrt, FeatureFRE, FeatureFRES,
                    FeatureFRSQRTE, FeatureFRSQRTES, FeatureRecipPrec,
                    FeatureSTFIWX, FeatureLFIWAX,
-                   FeatureFPRND, Feature64Bit]>;
+                   FeatureFPRND, Feature64Bit,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : ProcessorModel<"pwr7", G5Model,
                   [DirectivePwr7, FeatureAltivec,
-                   FeatureMFOCRF, FeatureFSqrt, FeatureFRE,
+                   FeatureMFOCRF, FeatureFCPSGN, FeatureFSqrt, FeatureFRE,
                    FeatureFRES, FeatureFRSQRTE, FeatureFRSQRTES,
                    FeatureRecipPrec, FeatureSTFIWX, FeatureLFIWAX,
                    FeatureFPRND, FeatureFPCVT, FeatureISEL,
                    FeaturePOPCNTD, FeatureLDBRX,
-                   Feature64Bit /*, Feature64BitRegs */]>;
+                   Feature64Bit /*, Feature64BitRegs */,
+                   DeprecatedMFTB, DeprecatedDST]>;
 def : Processor<"ppc", G3Itineraries, [Directive32]>;
 def : ProcessorModel<"ppc64", G5Model,
                   [Directive64, FeatureAltivec,
                    FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
                    FeatureFRSQRTE, FeatureSTFIWX,
                    Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"ppc64le", G5Model,
+                  [Directive64, FeatureAltivec,
+                   FeatureMFOCRF, FeatureFSqrt, FeatureFRES,
+                   FeatureFRSQRTE, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
 
 //===----------------------------------------------------------------------===//
 // Calling Conventions
@@ -272,10 +294,20 @@ def PPCAsmParser : AsmParser {
   let ShouldEmitMatchRegisterName = 0;
 }
 
+def PPCAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+
+  // We do not use hard coded registers in asm strings.  However, some
+  // InstAlias definitions use immediate literals.  Set RegisterPrefix
+  // so that those are not misinterpreted as registers.
+  string RegisterPrefix = "%";
+}
+
 def PPC : Target {
   // Information about the instructions.
   let InstructionSet = PPCInstrInfo;
   
   let AssemblyWriters = [PPCAsmWriter];
   let AssemblyParsers = [PPCAsmParser];
+  let AssemblyParserVariants = [PPCAsmParserVariant];
 }
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 3c7cc4e..ada34ed 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -20,8 +20,10 @@
 #include "PPC.h"
 #include "InstPrinter/PPCInstPrinter.h"
 #include "MCTargetDesc/PPCPredicates.h"
+#include "MCTargetDesc/PPCMCExpr.h"
 #include "PPCSubtarget.h"
 #include "PPCTargetMachine.h"
+#include "PPCTargetStreamer.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
@@ -85,18 +87,6 @@ namespace {
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                unsigned AsmVariant, const char *ExtraCode,
                                raw_ostream &O);
-
-    MachineLocation getDebugValueLocation(const MachineInstr *MI) const {
-      MachineLocation Location;
-      assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-      // Frame address.  Currently handles register +- offset only.
-      if (MI->getOperand(0).isReg() && MI->getOperand(2).isImm())
-        Location.set(MI->getOperand(0).getReg(), MI->getOperand(2).getImm());
-      else {
-        DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
-      }
-      return Location;
-    }
   };
 
   /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
@@ -213,7 +203,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
             .getGVStubEntry(SymToPrint);
         if (StubSym.getPointer() == 0)
           StubSym = MachineModuleInfoImpl::
-            StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+            StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
       } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
                  GV->hasAvailableExternallyLinkage()) {
         SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
@@ -223,12 +213,12 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
                     getHiddenGVStubEntry(SymToPrint);
         if (StubSym.getPointer() == 0)
           StubSym = MachineModuleInfoImpl::
-            StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+            StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
       } else {
-        SymToPrint = Mang->getSymbol(GV);
+        SymToPrint = getSymbol(GV);
       }
     } else {
-      SymToPrint = Mang->getSymbol(GV);
+      SymToPrint = getSymbol(GV);
     }
     
     O << *SymToPrint;
@@ -339,28 +329,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   // Lower multi-instruction pseudo operations.
   switch (MI->getOpcode()) {
   default: break;
-  case TargetOpcode::DBG_VALUE: {
-    if (!isVerbose() || !OutStreamer.hasRawTextSupport()) return;
-      
-    SmallString<32> Str;
-    raw_svector_ostream O(Str);
-    unsigned NOps = MI->getNumOperands();
-    assert(NOps==4);
-    O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-    // cast away const; DIetc do not take const operands for some reason.
-    DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
-    O << V.getName();
-    O << " <- ";
-    // Frame address.  Currently handles register +- offset only.
-    assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
-    O << '['; printOperand(MI, 0, O); O << '+'; printOperand(MI, 1, O);
-    O << ']';
-    O << "+";
-    printOperand(MI, NOps-2, O);
-    OutStreamer.EmitRawText(O.str());
-    return;
-  }
-      
+  case TargetOpcode::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
   case PPC::MovePCtoLR:
   case PPC::MovePCtoLR8: {
     // Transform %LR = MovePCtoLR
@@ -394,7 +364,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
     MCSymbol *MOSymbol = 0;
     if (MO.isGlobal())
-      MOSymbol = Mang->getSymbol(MO.getGlobal());
+      MOSymbol = getSymbol(MO.getGlobal());
     else if (MO.isCPI())
       MOSymbol = GetCPISymbol(MO.getIndex());
     else if (MO.isJTI())
@@ -403,7 +373,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     MCSymbol *TOCEntry = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC_ENTRY,
+      MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
                               OutContext);
     TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
     OutStreamer.EmitInstruction(TmpInst);
@@ -433,7 +403,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
       const GlobalValue *RealGValue = GAlias ?
         GAlias->resolveAliasedGlobal(false) : GValue;
-      MOSymbol = Mang->getSymbol(RealGValue);
+      MOSymbol = getSymbol(RealGValue);
       const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
       IsExternal = GVar && !GVar->hasInitializer();
       IsCommon = GVar && RealGValue->hasCommonLinkage();
@@ -444,11 +414,12 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     else if (MO.isJTI())
       MOSymbol = GetJTISymbol(MO.getIndex());
 
-    if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI())
+    if (IsExternal || IsFunction || IsCommon || IsAvailExt || MO.isJTI() ||
+        TM.getCodeModel() == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC16_HA,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_HA,
                               OutContext);
     TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
     OutStreamer.EmitInstruction(TmpInst);
@@ -469,23 +440,27 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     if (MO.isJTI())
       MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
-    else if (MO.isCPI())
+    else if (MO.isCPI()) {
       MOSymbol = GetCPISymbol(MO.getIndex());
+      if (TM.getCodeModel() == CodeModel::Large)
+        MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
+    }
     else if (MO.isGlobal()) {
       const GlobalValue *GValue = MO.getGlobal();
       const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
       const GlobalValue *RealGValue = GAlias ?
         GAlias->resolveAliasedGlobal(false) : GValue;
-      MOSymbol = Mang->getSymbol(RealGValue);
+      MOSymbol = getSymbol(RealGValue);
       const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
     
       if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
-          RealGValue->hasAvailableExternallyLinkage())
+          RealGValue->hasAvailableExternallyLinkage() ||
+          TM.getCodeModel() == CodeModel::Large)
         MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
     }
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC16_LO,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
                               OutContext);
     TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
     OutStreamer.EmitInstruction(TmpInst);
@@ -510,18 +485,18 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
       const GlobalValue *RealGValue = GAlias ?
         GAlias->resolveAliasedGlobal(false) : GValue;
-      MOSymbol = Mang->getSymbol(RealGValue);
+      MOSymbol = getSymbol(RealGValue);
       const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
       IsExternal = GVar && !GVar->hasInitializer();
       IsFunction = !GVar;
     } else if (MO.isCPI())
       MOSymbol = GetCPISymbol(MO.getIndex());
 
-    if (IsFunction || IsExternal)
+    if (IsFunction || IsExternal || TM.getCodeModel() == CodeModel::Large)
       MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
 
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC16_LO,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TOC_LO,
                               OutContext);
     TmpInst.getOperand(2) = MCOperand::CreateExpr(Exp);
     OutStreamer.EmitInstruction(TmpInst);
@@ -533,9 +508,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTprel =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_HA,
                               OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
                                 .addReg(MI->getOperand(0).getReg())
@@ -551,9 +526,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     TmpInst.setOpcode(PPC::LD);
     const MachineOperand &MO = MI->getOperand(1);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *Exp =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL_LO,
                               OutContext);
     TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
     OutStreamer.EmitInstruction(TmpInst);
@@ -565,9 +540,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_HA,
                               OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
                                 .addReg(MI->getOperand(0).getReg())
@@ -581,9 +556,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsGD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD_LO,
                               OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
                                 .addReg(MI->getOperand(0).getReg())
@@ -593,7 +568,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   }
   case PPC::GETtlsADDR: {
     // Transform: %X3 = GETtlsADDR %X3, <ga:@sym>
-    // Into:      BL8_NOP_TLSGD __tls_get_addr(sym@tlsgd)
+    // Into:      BL8_NOP_TLS __tls_get_addr(sym@tlsgd)
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
 
     StringRef Name = "__tls_get_addr";
@@ -602,11 +577,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymVar =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD,
                               OutContext);
-    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLSGD)
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLS)
                                 .addExpr(TlsRef)
                                 .addExpr(SymVar));
     return;
@@ -617,9 +592,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_HA,
                               OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
                                 .addReg(MI->getOperand(0).getReg())
@@ -633,9 +608,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymGotTlsLD =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO,
                               OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
                                 .addReg(MI->getOperand(0).getReg())
@@ -645,7 +620,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   }
   case PPC::GETtlsldADDR: {
     // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym>
-    // Into:      BL8_NOP_TLSLD __tls_get_addr(sym@tlsld)
+    // Into:      BL8_NOP_TLS __tls_get_addr(sym@tlsld)
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
 
     StringRef Name = "__tls_get_addr";
@@ -654,11 +629,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext);
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymVar =
       MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSLD,
                               OutContext);
-    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLSLD)
+    OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_TLS)
                                 .addExpr(TlsRef)
                                 .addExpr(SymVar));
     return;
@@ -669,9 +644,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL16_HA,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_HA,
                               OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8)
                                 .addReg(MI->getOperand(0).getReg())
@@ -685,9 +660,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC");
     const MachineOperand &MO = MI->getOperand(2);
     const GlobalValue *GValue = MO.getGlobal();
-    MCSymbol *MOSymbol = Mang->getSymbol(GValue);
+    MCSymbol *MOSymbol = getSymbol(GValue);
     const MCExpr *SymDtprel =
-      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL16_LO,
+      MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL_LO,
                               OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8)
                                 .addReg(MI->getOperand(0).getReg())
@@ -695,21 +670,63 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
                                 .addExpr(SymDtprel));
     return;
   }
-  case PPC::MFCRpseud:
-  case PPC::MFCR8pseud:
-    // Transform: %R3 = MFCRpseud %CR7
-    // Into:      %R3 = MFCR      ;; cr7
-    OutStreamer.AddComment(PPCInstPrinter::
-                           getRegisterName(MI->getOperand(1).getReg()));
-    OutStreamer.EmitInstruction(MCInstBuilder(Subtarget.isPPC64() ? PPC::MFCR8 : PPC::MFCR)
-      .addReg(MI->getOperand(0).getReg()));
-    return;
+  case PPC::MFOCRF:
+  case PPC::MFOCRF8:
+    if (!Subtarget.hasMFOCRF()) {
+      // Transform: %R3 = MFOCRF %CR7
+      // Into:      %R3 = MFCR   ;; cr7
+      unsigned NewOpcode =
+        MI->getOpcode() == PPC::MFOCRF ? PPC::MFCR : PPC::MFCR8;
+      OutStreamer.AddComment(PPCInstPrinter::
+                             getRegisterName(MI->getOperand(1).getReg()));
+      OutStreamer.EmitInstruction(MCInstBuilder(NewOpcode)
+                                  .addReg(MI->getOperand(0).getReg()));
+      return;
+    }
+    break;
+  case PPC::MTOCRF:
+  case PPC::MTOCRF8:
+    if (!Subtarget.hasMFOCRF()) {
+      // Transform: %CR7 = MTOCRF %R3
+      // Into:      MTCRF mask, %R3 ;; cr7
+      unsigned NewOpcode =
+        MI->getOpcode() == PPC::MTOCRF ? PPC::MTCRF : PPC::MTCRF8;
+      unsigned Mask = 0x80 >> OutContext.getRegisterInfo()
+                              ->getEncodingValue(MI->getOperand(0).getReg());
+      OutStreamer.AddComment(PPCInstPrinter::
+                             getRegisterName(MI->getOperand(0).getReg()));
+      OutStreamer.EmitInstruction(MCInstBuilder(NewOpcode)
+                                  .addImm(Mask)
+                                  .addReg(MI->getOperand(1).getReg()));
+      return;
+    }
+    break;
   case PPC::SYNC:
     // In Book E sync is called msync, handle this special case here...
     if (Subtarget.isBookE()) {
       OutStreamer.EmitRawText(StringRef("\tmsync"));
       return;
     }
+    break;
+  case PPC::LD:
+  case PPC::STD:
+  case PPC::LWA_32:
+  case PPC::LWA: {
+    // Verify alignment is legal, so we don't create relocations
+    // that can't be supported.
+    // FIXME:  This test is currently disabled for Darwin.  The test
+    // suite shows a handful of test cases that fail this check for
+    // Darwin.  Those need to be investigated before this sanity test
+    // can be enabled for those subtargets.
+    if (!Subtarget.isDarwin()) {
+      unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+      const MachineOperand &MO = MI->getOperand(OpNum);
+      if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+        llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
+    }
+    // Now process the instruction normally.
+    break;
+  }
   }
 
   LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
@@ -737,7 +754,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
   MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
   // Generates a R_PPC64_TOC relocation for TOC base insertion.
   OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2,
-                        MCSymbolRefExpr::VK_PPC_TOC, OutContext),
+                        MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
                         8/*size*/);
   // Emit a null environment pointer.
   OutStreamer.EmitIntValue(0, 8 /* size */);
@@ -755,6 +772,9 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
 
   bool isPPC64 = TD->getPointerSizeInBits() == 64;
 
+  PPCTargetStreamer &TS =
+      static_cast<PPCTargetStreamer &>(OutStreamer.getTargetStreamer());
+
   if (isPPC64 && !TOC.empty()) {
     const MCSectionELF *Section = OutStreamer.getContext().getELFSection(".toc",
         ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC,
@@ -765,7 +785,7 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
          E = TOC.end(); I != E; ++I) {
       OutStreamer.EmitLabel(I->second);
       MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
-      OutStreamer.EmitTCEntry(*S);
+      TS.emitTCEntry(*S);
     }
   }
 
@@ -781,7 +801,7 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
       //   .long _foo
       OutStreamer.EmitValue(MCSymbolRefExpr::Create(Stubs[i].second.getPointer(),
                                                     OutContext),
-                            isPPC64 ? 8 : 4/*size*/, 0/*addrspace*/);
+                            isPPC64 ? 8 : 4/*size*/);
     }
 
     Stubs.clear();
@@ -829,7 +849,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
     "power6",
     "power6x",
     "power7",
-    "ppc64"
+    "ppc64",
+    "ppc64le"
   };
 
   unsigned Directive = Subtarget.getDarwinDirective();
@@ -843,7 +864,7 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
   
   // FIXME: This is a total hack, finish mc'izing the PPC backend.
   if (OutStreamer.hasRawTextSupport()) {
-    assert(Directive < sizeof(CPUDirectives) / sizeof(*CPUDirectives) &&
+    assert(Directive < array_lengthof(CPUDirectives) &&
            "CPUDirectives[] might not be up-to-date!");
     OutStreamer.EmitRawText("\t.machine " + Twine(CPUDirectives[Directive]));
   }
@@ -883,6 +904,7 @@ static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
 void PPCDarwinAsmPrinter::
 EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
   bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
+  bool isDarwin = Subtarget.isDarwin();
   
   const TargetLoweringObjectFileMachO &TLOFMacho = 
     static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
@@ -910,6 +932,9 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
       OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
 
       const MCExpr *Anon = MCSymbolRefExpr::Create(AnonSymbol, OutContext);
+      const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
+      const MCExpr *Sub =
+        MCBinaryExpr::CreateSub(LazyPtrExpr, Anon, OutContext);
 
       // mflr r0
       OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R0));
@@ -919,21 +944,20 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
       // mflr r11
       OutStreamer.EmitInstruction(MCInstBuilder(PPC::MFLR).addReg(PPC::R11));
       // addis r11, r11, ha16(LazyPtr - AnonSymbol)
-      const MCExpr *Sub =
-        MCBinaryExpr::CreateSub(MCSymbolRefExpr::Create(LazyPtr, OutContext),
-                                Anon, OutContext);
+      const MCExpr *SubHa16 = PPCMCExpr::CreateHa(Sub, isDarwin, OutContext);
       OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS)
         .addReg(PPC::R11)
         .addReg(PPC::R11)
-        .addExpr(Sub));
+        .addExpr(SubHa16));
       // mtlr r0
       OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTLR).addReg(PPC::R0));
 
       // ldu r12, lo16(LazyPtr - AnonSymbol)(r11)
       // lwzu r12, lo16(LazyPtr - AnonSymbol)(r11)
+      const MCExpr *SubLo16 = PPCMCExpr::CreateLo(Sub, isDarwin, OutContext);
       OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
         .addReg(PPC::R12)
-        .addExpr(Sub).addExpr(Sub)
+        .addExpr(SubLo16).addExpr(SubLo16)
         .addReg(PPC::R11));
       // mtctr r12
       OutStreamer.EmitInstruction(MCInstBuilder(PPC::MTCTR).addReg(PPC::R12));
@@ -967,24 +991,24 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
     MCSymbol *Stub = Stubs[i].first;
     MCSymbol *RawSym = Stubs[i].second.getPointer();
     MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
+    const MCExpr *LazyPtrExpr = MCSymbolRefExpr::Create(LazyPtr, OutContext);
 
     OutStreamer.SwitchSection(StubSection);
     EmitAlignment(4);
     OutStreamer.EmitLabel(Stub);
     OutStreamer.EmitSymbolAttribute(RawSym, MCSA_IndirectSymbol);
+
     // lis r11, ha16(LazyPtr)
     const MCExpr *LazyPtrHa16 =
-      MCSymbolRefExpr::Create(LazyPtr, MCSymbolRefExpr::VK_PPC_DARWIN_HA16,
-                              OutContext);
+      PPCMCExpr::CreateHa(LazyPtrExpr, isDarwin, OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(PPC::LIS)
       .addReg(PPC::R11)
       .addExpr(LazyPtrHa16));
 
-    const MCExpr *LazyPtrLo16 =
-      MCSymbolRefExpr::Create(LazyPtr, MCSymbolRefExpr::VK_PPC_DARWIN_LO16,
-                              OutContext);
     // ldu r12, lo16(LazyPtr)(r11)
     // lwzu r12, lo16(LazyPtr)(r11)
+    const MCExpr *LazyPtrLo16 =
+      PPCMCExpr::CreateLo(LazyPtrExpr, isDarwin, OutContext);
     OutStreamer.EmitInstruction(MCInstBuilder(isPPC64 ? PPC::LDU : PPC::LWZU)
       .addReg(PPC::R12)
       .addExpr(LazyPtrLo16).addExpr(LazyPtrLo16)
@@ -1037,7 +1061,7 @@ bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
         MCSymbol *NLPSym = GetSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
         MachineModuleInfoImpl::StubValueTy &StubSym =
           MMIMacho.getGVStubEntry(NLPSym);
-        StubSym = MachineModuleInfoImpl::StubValueTy(Mang->getSymbol(*I), true);
+        StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(*I), true);
       }
     }
   }
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 81a54d7..4224ae2 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -9,767 +9,641 @@
 //
 // This pass identifies loops where we can generate the PPC branch instructions
 // that decrement and test the count register (CTR) (bdnz and friends).
-// This pass is based on the HexagonHardwareLoops pass.
 //
 // The pattern that defines the induction variable can changed depending on
 // prior optimizations.  For example, the IndVarSimplify phase run by 'opt'
 // normalizes induction variables, and the Loop Strength Reduction pass
 // run by 'llc' may also make changes to the induction variable.
-// The pattern detected by this phase is due to running Strength Reduction.
 //
 // Criteria for CTR loops:
 //  - Countable loops (w/ ind. var for a trip count)
-//  - Assumes loops are normalized by IndVarSimplify
 //  - Try inner-most loops first
 //  - No nested CTR loops.
 //  - No function calls in loops.
 //
-//  Note: As with unconverted loops, PPCBranchSelector must be run after this
-//  pass in order to convert long-displacement jumps into jump pairs.
-//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "ctrloops"
-#include "PPC.h"
-#include "MCTargetDesc/PPCPredicates.h"
-#include "PPCTargetMachine.h"
-#include "llvm/ADT/DenseMap.h"
+
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineLoopInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
 #include "llvm/PassSupport.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "PPCTargetMachine.h"
+#include "PPC.h"
+
+#ifndef NDEBUG
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#endif
+
 #include <algorithm>
+#include <vector>
 
 using namespace llvm;
 
+#ifndef NDEBUG
+static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
+#endif
+
 STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
 
 namespace llvm {
   void initializePPCCTRLoopsPass(PassRegistry&);
+#ifndef NDEBUG
+  void initializePPCCTRLoopsVerifyPass(PassRegistry&);
+#endif
 }
 
 namespace {
-  class CountValue;
-  struct PPCCTRLoops : public MachineFunctionPass {
-    MachineLoopInfo       *MLI;
-    MachineRegisterInfo   *MRI;
-    const TargetInstrInfo *TII;
+  struct PPCCTRLoops : public FunctionPass {
+
+#ifndef NDEBUG
+    static int Counter;
+#endif
 
   public:
-    static char ID;   // Pass identification, replacement for typeid
+    static char ID;
 
-    PPCCTRLoops() : MachineFunctionPass(ID) {
+    PPCCTRLoops() : FunctionPass(ID), TM(0) {
+      initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
+    }
+    PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
       initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnMachineFunction(MachineFunction &MF);
-
-    const char *getPassName() const { return "PPC CTR Loops"; }
+    virtual bool runOnFunction(Function &F);
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesCFG();
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
-      AU.addRequired<MachineLoopInfo>();
-      AU.addPreserved<MachineLoopInfo>();
-      MachineFunctionPass::getAnalysisUsage(AU);
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addPreserved<DominatorTree>();
+      AU.addRequired<ScalarEvolution>();
     }
 
   private:
-    /// getCanonicalInductionVariable - Check to see if the loop has a canonical
-    /// induction variable.
-    /// Should be defined in MachineLoop. Based upon version in class Loop.
-    void getCanonicalInductionVariable(MachineLoop *L,
-                              SmallVector<MachineInstr *, 4> &IVars,
-                              SmallVector<MachineInstr *, 4> &IOps) const;
-
-    /// getTripCount - Return a loop-invariant LLVM register indicating the
-    /// number of times the loop will be executed.  If the trip-count cannot
-    /// be determined, this return null.
-    CountValue *getTripCount(MachineLoop *L,
-                             SmallVector<MachineInstr *, 2> &OldInsts) const;
-
-    /// isInductionOperation - Return true if the instruction matches the
-    /// pattern for an opertion that defines an induction variable.
-    bool isInductionOperation(const MachineInstr *MI, unsigned IVReg) const;
-
-    /// isInvalidOperation - Return true if the instruction is not valid within
-    /// a CTR loop.
-    bool isInvalidLoopOperation(const MachineInstr *MI) const;
-
-    /// containsInavlidInstruction - Return true if the loop contains an
-    /// instruction that inhibits using the CTR loop.
-    bool containsInvalidInstruction(MachineLoop *L) const;
-
-    /// converToCTRLoop - Given a loop, check if we can convert it to a
-    /// CTR loop.  If so, then perform the conversion and return true.
-    bool convertToCTRLoop(MachineLoop *L);
-
-    /// isDead - Return true if the instruction is now dead.
-    bool isDead(const MachineInstr *MI,
-                SmallVector<MachineInstr *, 1> &DeadPhis) const;
-
-    /// removeIfDead - Remove the instruction if it is now dead.
-    void removeIfDead(MachineInstr *MI);
+    bool mightUseCTR(const Triple &TT, BasicBlock *BB);
+    bool convertToCTRLoop(Loop *L);
+
+  private:
+    PPCTargetMachine *TM;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    DataLayout *TD;
+    DominatorTree *DT;
+    const TargetLibraryInfo *LibInfo;
   };
 
   char PPCCTRLoops::ID = 0;
+#ifndef NDEBUG
+  int PPCCTRLoops::Counter = 0;
+#endif
 
-
-  // CountValue class - Abstraction for a trip count of a loop. A
-  // smaller vesrsion of the MachineOperand class without the concerns
-  // of changing the operand representation.
-  class CountValue {
+#ifndef NDEBUG
+  struct PPCCTRLoopsVerify : public MachineFunctionPass {
   public:
-    enum CountValueType {
-      CV_Register,
-      CV_Immediate
-    };
-  private:
-    CountValueType Kind;
-    union Values {
-      unsigned RegNum;
-      int64_t ImmVal;
-      Values(unsigned r) : RegNum(r) {}
-      Values(int64_t i) : ImmVal(i) {}
-    } Contents;
-    bool isNegative;
+    static char ID;
 
-  public:
-    CountValue(unsigned r, bool neg) : Kind(CV_Register), Contents(r),
-                                       isNegative(neg) {}
-    explicit CountValue(int64_t i) : Kind(CV_Immediate), Contents(i),
-                                     isNegative(i < 0) {}
-    CountValueType getType() const { return Kind; }
-    bool isReg() const { return Kind == CV_Register; }
-    bool isImm() const { return Kind == CV_Immediate; }
-    bool isNeg() const { return isNegative; }
-
-    unsigned getReg() const {
-      assert(isReg() && "Wrong CountValue accessor");
-      return Contents.RegNum;
-    }
-    void setReg(unsigned Val) {
-      Contents.RegNum = Val;
-    }
-    int64_t getImm() const {
-      assert(isImm() && "Wrong CountValue accessor");
-      if (isNegative) {
-        return -Contents.ImmVal;
-      }
-      return Contents.ImmVal;
-    }
-    void setImm(int64_t Val) {
-      Contents.ImmVal = Val;
+    PPCCTRLoopsVerify() : MachineFunctionPass(ID) {
+      initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
     }
 
-    void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
-      if (isReg()) { OS << PrintReg(getReg()); }
-      if (isImm()) { OS << getImm(); }
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineDominatorTree>();
+      MachineFunctionPass::getAnalysisUsage(AU);
     }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  private:
+    MachineDominatorTree *MDT;
   };
+
+  char PPCCTRLoopsVerify::ID = 0;
+#endif // NDEBUG
 } // end anonymous namespace
 
 INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
                     false, false)
 
-/// isCompareEquals - Returns true if the instruction is a compare equals
-/// instruction with an immediate operand.
-static bool isCompareEqualsImm(const MachineInstr *MI, bool &SignedCmp,
-                               bool &Int64Cmp) {
-  if (MI->getOpcode() == PPC::CMPWI) {
-    SignedCmp = true;
-    Int64Cmp = false;
-    return true;
-  } else if (MI->getOpcode() == PPC::CMPDI) {
-    SignedCmp = true;
-    Int64Cmp = true;
-    return true;
-  } else if (MI->getOpcode() == PPC::CMPLWI) {
-    SignedCmp = false;
-    Int64Cmp = false;
-    return true;
-  } else if (MI->getOpcode() == PPC::CMPLDI) {
-    SignedCmp = false;
-    Int64Cmp = true;
-    return true;
-  }
-
-  return false;
+FunctionPass *llvm::createPPCCTRLoops(PPCTargetMachine &TM) {
+  return new PPCCTRLoops(TM);
 }
 
+#ifndef NDEBUG
+INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+                      "PowerPC CTR Loops Verify", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
+                    "PowerPC CTR Loops Verify", false, false)
 
-/// createPPCCTRLoops - Factory for creating
-/// the CTR loop phase.
-FunctionPass *llvm::createPPCCTRLoops() {
-  return new PPCCTRLoops();
+FunctionPass *llvm::createPPCCTRLoopsVerify() {
+  return new PPCCTRLoopsVerify();
 }
+#endif // NDEBUG
 
+bool PPCCTRLoops::runOnFunction(Function &F) {
+  LI = &getAnalysis<LoopInfo>();
+  SE = &getAnalysis<ScalarEvolution>();
+  DT = &getAnalysis<DominatorTree>();
+  TD = getAnalysisIfAvailable<DataLayout>();
+  LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();
 
-bool PPCCTRLoops::runOnMachineFunction(MachineFunction &MF) {
-  DEBUG(dbgs() << "********* PPC CTR Loops *********\n");
-
-  bool Changed = false;
+  bool MadeChange = false;
 
-  // get the loop information
-  MLI = &getAnalysis<MachineLoopInfo>();
-  // get the register information
-  MRI = &MF.getRegInfo();
-  // the target specific instructio info.
-  TII = MF.getTarget().getInstrInfo();
-
-  for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end();
+  for (LoopInfo::iterator I = LI->begin(), E = LI->end();
        I != E; ++I) {
-    MachineLoop *L = *I;
-    if (!L->getParentLoop()) {
-      Changed |= convertToCTRLoop(L);
-    }
+    Loop *L = *I;
+    if (!L->getParentLoop())
+      MadeChange |= convertToCTRLoop(L);
   }
 
-  return Changed;
+  return MadeChange;
 }
 
-/// getCanonicalInductionVariable - Check to see if the loop has a canonical
-/// induction variable. We check for a simple recurrence pattern - an
-/// integer recurrence that decrements by one each time through the loop and
-/// ends at zero.  If so, return the phi node that corresponds to it.
-///
-/// Based upon the similar code in LoopInfo except this code is specific to
-/// the machine.
-/// This method assumes that the IndVarSimplify pass has been run by 'opt'.
-///
-void
-PPCCTRLoops::getCanonicalInductionVariable(MachineLoop *L,
-                                  SmallVector<MachineInstr *, 4> &IVars,
-                                  SmallVector<MachineInstr *, 4> &IOps) const {
-  MachineBasicBlock *TopMBB = L->getTopBlock();
-  MachineBasicBlock::pred_iterator PI = TopMBB->pred_begin();
-  assert(PI != TopMBB->pred_end() &&
-         "Loop must have more than one incoming edge!");
-  MachineBasicBlock *Backedge = *PI++;
-  if (PI == TopMBB->pred_end()) return;  // dead loop
-  MachineBasicBlock *Incoming = *PI++;
-  if (PI != TopMBB->pred_end()) return;  // multiple backedges?
-
-  // make sure there is one incoming and one backedge and determine which
-  // is which.
-  if (L->contains(Incoming)) {
-    if (L->contains(Backedge))
-      return;
-    std::swap(Incoming, Backedge);
-  } else if (!L->contains(Backedge))
-    return;
-
-  // Loop over all of the PHI nodes, looking for a canonical induction variable:
-  //   - The PHI node is "reg1 = PHI reg2, BB1, reg3, BB2".
-  //   - The recurrence comes from the backedge.
-  //   - the definition is an induction operatio.n
-  for (MachineBasicBlock::iterator I = TopMBB->begin(), E = TopMBB->end();
-       I != E && I->isPHI(); ++I) {
-    MachineInstr *MPhi = &*I;
-    unsigned DefReg = MPhi->getOperand(0).getReg();
-    for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2) {
-      // Check each operand for the value from the backedge.
-      MachineBasicBlock *MBB = MPhi->getOperand(i+1).getMBB();
-      if (L->contains(MBB)) { // operands comes from the backedge
-        // Check if the definition is an induction operation.
-        MachineInstr *DI = MRI->getVRegDef(MPhi->getOperand(i).getReg());
-        if (isInductionOperation(DI, DefReg)) {
-          IOps.push_back(DI);
-          IVars.push_back(MPhi);
+bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
+  for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
+       J != JE; ++J) {
+    if (CallInst *CI = dyn_cast<CallInst>(J)) {
+      if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+        // Inline ASM is okay, unless it clobbers the ctr register.
+        InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
+        for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
+          InlineAsm::ConstraintInfo &C = CIV[i];
+          if (C.Type != InlineAsm::isInput)
+            for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
+              if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
+                return true;
         }
-      }
-    }
-  }
-  return;
-}
 
-/// getTripCount - Return a loop-invariant LLVM value indicating the
-/// number of times the loop will be executed.  The trip count can
-/// be either a register or a constant value.  If the trip-count
-/// cannot be determined, this returns null.
-///
-/// We find the trip count from the phi instruction that defines the
-/// induction variable.  We follow the links to the CMP instruction
-/// to get the trip count.
-///
-/// Based upon getTripCount in LoopInfo.
-///
-CountValue *PPCCTRLoops::getTripCount(MachineLoop *L,
-                           SmallVector<MachineInstr *, 2> &OldInsts) const {
-  MachineBasicBlock *LastMBB = L->getExitingBlock();
-  // Don't generate a CTR loop if the loop has more than one exit.
-  if (LastMBB == 0)
-    return 0;
-
-  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
-  if (LastI->getOpcode() != PPC::BCC)
-    return 0;
-
-  // We need to make sure that this compare is defining the condition
-  // register actually used by the terminating branch.
-
-  unsigned PredReg = LastI->getOperand(1).getReg();
-  DEBUG(dbgs() << "Examining loop with first terminator: " << *LastI);
-
-  unsigned PredCond = LastI->getOperand(0).getImm();
-  if (PredCond != PPC::PRED_EQ && PredCond != PPC::PRED_NE)
-    return 0;
-
-  // Check that the loop has a induction variable.
-  SmallVector<MachineInstr *, 4> IVars, IOps;
-  getCanonicalInductionVariable(L, IVars, IOps);
-  for (unsigned i = 0; i < IVars.size(); ++i) {
-    MachineInstr *IOp = IOps[i];
-    MachineInstr *IV_Inst = IVars[i];
-
-    // Canonical loops will end with a 'cmpwi/cmpdi cr, IV, Imm',
-    //  if Imm is 0, get the count from the PHI opnd
-    //  if Imm is -M, than M is the count
-    //  Otherwise, Imm is the count
-    MachineOperand *IV_Opnd;
-    const MachineOperand *InitialValue;
-    if (!L->contains(IV_Inst->getOperand(2).getMBB())) {
-      InitialValue = &IV_Inst->getOperand(1);
-      IV_Opnd = &IV_Inst->getOperand(3);
-    } else {
-      InitialValue = &IV_Inst->getOperand(3);
-      IV_Opnd = &IV_Inst->getOperand(1);
-    }
+        continue;
+      }
 
-    DEBUG(dbgs() << "Considering:\n");
-    DEBUG(dbgs() << "  induction operation: " << *IOp);
-    DEBUG(dbgs() << "  induction variable: " << *IV_Inst);
-    DEBUG(dbgs() << "  initial value: " << *InitialValue << "\n");
-  
-    // Look for the cmp instruction to determine if we
-    // can get a useful trip count.  The trip count can
-    // be either a register or an immediate.  The location
-    // of the value depends upon the type (reg or imm).
-    for (MachineRegisterInfo::reg_iterator
-         RI = MRI->reg_begin(IV_Opnd->getReg()), RE = MRI->reg_end();
-         RI != RE; ++RI) {
-      IV_Opnd = &RI.getOperand();
-      bool SignedCmp, Int64Cmp;
-      MachineInstr *MI = IV_Opnd->getParent();
-      if (L->contains(MI) && isCompareEqualsImm(MI, SignedCmp, Int64Cmp) &&
-          MI->getOperand(0).getReg() == PredReg) {
-
-        OldInsts.push_back(MI);
-        OldInsts.push_back(IOp);
- 
-        DEBUG(dbgs() << "  compare: " << *MI);
- 
-        const MachineOperand &MO = MI->getOperand(2);
-        assert(MO.isImm() && "IV Cmp Operand should be an immediate");
-
-        int64_t ImmVal;
-        if (SignedCmp)
-          ImmVal = (short) MO.getImm();
-        else
-          ImmVal = MO.getImm();
-  
-        const MachineInstr *IV_DefInstr = MRI->getVRegDef(IV_Opnd->getReg());
-        assert(L->contains(IV_DefInstr->getParent()) &&
-               "IV definition should occurs in loop");
-        int64_t iv_value = (short) IV_DefInstr->getOperand(2).getImm();
-  
-        assert(InitialValue->isReg() && "Expecting register for init value");
-        unsigned InitialValueReg = InitialValue->getReg();
-  
-        MachineInstr *DefInstr = MRI->getVRegDef(InitialValueReg);
-  
-        // Here we need to look for an immediate load (an li or lis/ori pair).
-        if (DefInstr && (DefInstr->getOpcode() == PPC::ORI8 ||
-                         DefInstr->getOpcode() == PPC::ORI)) {
-          int64_t start = DefInstr->getOperand(2).getImm();
-          MachineInstr *DefInstr2 =
-            MRI->getVRegDef(DefInstr->getOperand(1).getReg());
-          if (DefInstr2 && (DefInstr2->getOpcode() == PPC::LIS8 ||
-                            DefInstr2->getOpcode() == PPC::LIS)) {
-            DEBUG(dbgs() << "  initial constant: " << *DefInstr);
-            DEBUG(dbgs() << "  initial constant: " << *DefInstr2);
-
-            start |= int64_t(short(DefInstr2->getOperand(1).getImm())) << 16;
-  
-            int64_t count = ImmVal - start;
-            if ((count % iv_value) != 0) {
-              return 0;
-            }
-
-            OldInsts.push_back(DefInstr);
-            OldInsts.push_back(DefInstr2);
-
-            // count/iv_value, the trip count, should be positive here. If it
-            // is negative, that indicates that the counter will wrap.
-            if (Int64Cmp)
-              return new CountValue(count/iv_value);
+      if (!TM)
+        return true;
+      const TargetLowering *TLI = TM->getTargetLowering();
+
+      if (Function *F = CI->getCalledFunction()) {
+        // Most intrinsics don't become function calls, but some might.
+        // sin, cos, exp and log are always calls.
+        unsigned Opcode;
+        if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
+          switch (F->getIntrinsicID()) {
+          default: continue;
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+                       !defined(setjmp_undefined_for_msvc)
+#  pragma push_macro("setjmp")
+#  undef setjmp
+#  define setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::setjmp:
+
+#if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
+ // let's return it to _setjmp state
+#  pragma pop_macro("setjmp")
+#  undef setjmp_undefined_for_msvc
+#endif
+
+          case Intrinsic::longjmp:
+
+          // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
+          // because, although it does clobber the counter register, the
+          // control can't then return to inside the loop unless there is also
+          // an eh_sjlj_setjmp.
+          case Intrinsic::eh_sjlj_setjmp:
+
+          case Intrinsic::memcpy:
+          case Intrinsic::memmove:
+          case Intrinsic::memset:
+          case Intrinsic::powi:
+          case Intrinsic::log:
+          case Intrinsic::log2:
+          case Intrinsic::log10:
+          case Intrinsic::exp:
+          case Intrinsic::exp2:
+          case Intrinsic::pow:
+          case Intrinsic::sin:
+          case Intrinsic::cos:
+            return true;
+          case Intrinsic::copysign:
+            if (CI->getArgOperand(0)->getType()->getScalarType()->
+                isPPC_FP128Ty())
+              return true;
             else
-              return new CountValue(uint32_t(count/iv_value));
-          }
-        } else if (DefInstr && (DefInstr->getOpcode() == PPC::LI8 ||
-                                DefInstr->getOpcode() == PPC::LI)) {
-          DEBUG(dbgs() << "  initial constant: " << *DefInstr);
-
-          int64_t count = ImmVal -
-            int64_t(short(DefInstr->getOperand(1).getImm()));
-          if ((count % iv_value) != 0) {
-            return 0;
+              continue; // ISD::FCOPYSIGN is never a library call.
+          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
           }
+        }
 
-          OldInsts.push_back(DefInstr);
-
-          if (Int64Cmp)
-            return new CountValue(count/iv_value);
-          else
-            return new CountValue(uint32_t(count/iv_value));
-        } else if (iv_value == 1 || iv_value == -1) {
-          // We can't determine a constant starting value.
-          if (ImmVal == 0) {
-            return new CountValue(InitialValueReg, iv_value > 0);
+        // PowerPC does not use [US]DIVREM or other library calls for
+        // operations on regular types which are not otherwise library calls
+        // (i.e. soft float or atomics). If adapting for targets that do,
+        // additional care is required here.
+
+        LibFunc::Func Func;
+        if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
+            LibInfo->getLibFunc(F->getName(), Func) &&
+            LibInfo->hasOptimizedCodeGen(Func)) {
+          // Non-read-only functions are never treated as intrinsics.
+          if (!CI->onlyReadsMemory())
+            return true;
+
+          // Conversion happens only for FP calls.
+          if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
+            return true;
+
+          switch (Func) {
+          default: return true;
+          case LibFunc::copysign:
+          case LibFunc::copysignf:
+            continue; // ISD::FCOPYSIGN is never a library call.
+          case LibFunc::copysignl:
+            return true;
+          case LibFunc::fabs:
+          case LibFunc::fabsf:
+          case LibFunc::fabsl:
+            continue; // ISD::FABS is never a library call.
+          case LibFunc::sqrt:
+          case LibFunc::sqrtf:
+          case LibFunc::sqrtl:
+            Opcode = ISD::FSQRT; break;
+          case LibFunc::floor:
+          case LibFunc::floorf:
+          case LibFunc::floorl:
+            Opcode = ISD::FFLOOR; break;
+          case LibFunc::nearbyint:
+          case LibFunc::nearbyintf:
+          case LibFunc::nearbyintl:
+            Opcode = ISD::FNEARBYINT; break;
+          case LibFunc::ceil:
+          case LibFunc::ceilf:
+          case LibFunc::ceill:
+            Opcode = ISD::FCEIL; break;
+          case LibFunc::rint:
+          case LibFunc::rintf:
+          case LibFunc::rintl:
+            Opcode = ISD::FRINT; break;
+          case LibFunc::round:
+          case LibFunc::roundf:
+          case LibFunc::roundl:
+            Opcode = ISD::FROUND; break;
+          case LibFunc::trunc:
+          case LibFunc::truncf:
+          case LibFunc::truncl:
+            Opcode = ISD::FTRUNC; break;
           }
-          // FIXME: handle non-zero end value.
+
+          MVT VTy =
+            TLI->getSimpleValueType(CI->getArgOperand(0)->getType(), true);
+          if (VTy == MVT::Other)
+            return true;
+          
+          if (TLI->isOperationLegalOrCustom(Opcode, VTy))
+            continue;
+          else if (VTy.isVector() &&
+                   TLI->isOperationLegalOrCustom(Opcode, VTy.getScalarType()))
+            continue;
+
+          return true;
         }
-        // FIXME: handle non-unit increments (we might not want to introduce
-        // division but we can handle some 2^n cases with shifts).
-  
       }
-    }
-  }
-  return 0;
-}
-
-/// isInductionOperation - return true if the operation is matches the
-/// pattern that defines an induction variable:
-///    addi iv, c
-///
-bool
-PPCCTRLoops::isInductionOperation(const MachineInstr *MI,
-                                           unsigned IVReg) const {
-  return ((MI->getOpcode() == PPC::ADDI || MI->getOpcode() == PPC::ADDI8) &&
-          MI->getOperand(1).isReg() && // could be a frame index instead
-          MI->getOperand(1).getReg() == IVReg);
-}
 
-/// isInvalidOperation - Return true if the operation is invalid within
-/// CTR loop.
-bool
-PPCCTRLoops::isInvalidLoopOperation(const MachineInstr *MI) const {
-
-  // call is not allowed because the callee may use a CTR loop
-  if (MI->getDesc().isCall()) {
-    return true;
-  }
-  // check if the instruction defines a CTR loop register
-  // (this will also catch nested CTR loops)
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (MO.isReg() && MO.isDef() &&
-        (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8)) {
       return true;
-    }
-  }
-  return false;
-}
+    } else if (isa<BinaryOperator>(J) &&
+               J->getType()->getScalarType()->isPPC_FP128Ty()) {
+      // Most operations on ppc_f128 values become calls.
+      return true;
+    } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
+               isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
+      CastInst *CI = cast<CastInst>(J);
+      if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
+          CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
+          (TT.isArch32Bit() &&
+           (CI->getSrcTy()->getScalarType()->isIntegerTy(64) ||
+            CI->getDestTy()->getScalarType()->isIntegerTy(64))
+          ))
+        return true;
+    } else if (TT.isArch32Bit() &&
+               J->getType()->getScalarType()->isIntegerTy(64) &&
+               (J->getOpcode() == Instruction::UDiv ||
+                J->getOpcode() == Instruction::SDiv ||
+                J->getOpcode() == Instruction::URem ||
+                J->getOpcode() == Instruction::SRem)) {
+      return true;
+    } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
+      // On PowerPC, indirect jumps use the counter register.
+      return true;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
+      if (!TM)
+        return true;
+      const TargetLowering *TLI = TM->getTargetLowering();
 
-/// containsInvalidInstruction - Return true if the loop contains
-/// an instruction that inhibits the use of the CTR loop function.
-///
-bool PPCCTRLoops::containsInvalidInstruction(MachineLoop *L) const {
-  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
-  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-    MachineBasicBlock *MBB = Blocks[i];
-    for (MachineBasicBlock::iterator
-           MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) {
-      const MachineInstr *MI = &*MII;
-      if (isInvalidLoopOperation(MI)) {
+      if (TLI->supportJumpTables() &&
+          SI->getNumCases()+1 >= (unsigned) TLI->getMinimumJumpTableEntries())
         return true;
-      }
     }
   }
+
   return false;
 }
 
-/// isDead returns true if the instruction is dead
-/// (this was essentially copied from DeadMachineInstructionElim::isDead, but
-/// with special cases for inline asm, physical registers and instructions with
-/// side effects removed)
-bool PPCCTRLoops::isDead(const MachineInstr *MI,
-                         SmallVector<MachineInstr *, 1> &DeadPhis) const {
-  // Examine each operand.
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (MO.isReg() && MO.isDef()) {
-      unsigned Reg = MO.getReg();
-      if (!MRI->use_nodbg_empty(Reg)) {
-        // This instruction has users, but if the only user is the phi node for
-        // the parent block, and the only use of that phi node is this
-        // instruction, then this instruction is dead: both it (and the phi
-        // node) can be removed.
-        MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg);
-        if (llvm::next(I) == MRI->use_end() &&
-            I.getOperand().getParent()->isPHI()) {
-          MachineInstr *OnePhi = I.getOperand().getParent();
-
-          for (unsigned j = 0, f = OnePhi->getNumOperands(); j != f; ++j) {
-            const MachineOperand &OPO = OnePhi->getOperand(j);
-            if (OPO.isReg() && OPO.isDef()) {
-              unsigned OPReg = OPO.getReg();
-
-              MachineRegisterInfo::use_iterator nextJ;
-              for (MachineRegisterInfo::use_iterator J = MRI->use_begin(OPReg),
-                   E = MRI->use_end(); J!=E; J=nextJ) {
-                nextJ = llvm::next(J);
-                MachineOperand& Use = J.getOperand();
-                MachineInstr *UseMI = Use.getParent();
-
-                if (MI != UseMI) {
-                  // The phi node has a user that is not MI, bail...
-                  return false;
-                }
-              }
-            }
-          }
+bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
+  bool MadeChange = false;
 
-          DeadPhis.push_back(OnePhi);
-        } else {
-          // This def has a non-debug use. Don't delete the instruction!
-          return false;
-        }
-      }
-    }
+  Triple TT = Triple(L->getHeader()->getParent()->getParent()->
+                     getTargetTriple());
+  if (!TT.isArch32Bit() && !TT.isArch64Bit())
+    return MadeChange; // Unknown arch. type.
+
+  // Process nested loops first.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
+    MadeChange |= convertToCTRLoop(*I);
   }
 
-  // If there are no defs with uses, the instruction is dead.
-  return true;
-}
+  // If a nested loop has been converted, then we can't convert this loop.
+  if (MadeChange)
+    return MadeChange;
+
+#ifndef NDEBUG
+  // Stop trying after reaching the limit (if any).
+  int Limit = CTRLoopLimit;
+  if (Limit >= 0) {
+    if (Counter >= CTRLoopLimit)
+      return false;
+    Counter++;
+  }
+#endif
+
+  // We don't want to spill/restore the counter register, and so we don't
+  // want to use the counter register if the loop contains calls.
+  for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
+       I != IE; ++I)
+    if (mightUseCTR(TT, *I))
+      return MadeChange;
+
+  SmallVector<BasicBlock*, 4> ExitingBlocks;
+  L->getExitingBlocks(ExitingBlocks);
+
+  BasicBlock *CountedExitBlock = 0;
+  const SCEV *ExitCount = 0;
+  BranchInst *CountedExitBranch = 0;
+  for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
+       IE = ExitingBlocks.end(); I != IE; ++I) {
+    const SCEV *EC = SE->getExitCount(L, *I);
+    DEBUG(dbgs() << "Exit Count for " << *L << " from block " <<
+                    (*I)->getName() << ": " << *EC << "\n");
+    if (isa<SCEVCouldNotCompute>(EC))
+      continue;
+    if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
+      if (ConstEC->getValue()->isZero())
+        continue;
+    } else if (!SE->isLoopInvariant(EC, L))
+      continue;
+
+    if (SE->getTypeSizeInBits(EC->getType()) > (TT.isArch64Bit() ? 64 : 32))
+      continue;
+
+    // We now have a loop-invariant count of loop iterations (which is not the
+    // constant zero) for which we know that this loop will not exit via this
+    // exisiting block.
+
+    // We need to make sure that this block will run on every loop iteration.
+    // For this to be true, we must dominate all blocks with backedges. Such
+    // blocks are in-loop predecessors to the header block.
+    bool NotAlways = false;
+    for (pred_iterator PI = pred_begin(L->getHeader()),
+         PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
+      if (!L->contains(*PI))
+        continue;
 
-void PPCCTRLoops::removeIfDead(MachineInstr *MI) {
-  // This procedure was essentially copied from DeadMachineInstructionElim
+      if (!DT->dominates(*I, *PI)) {
+        NotAlways = true;
+        break;
+      }
+    }
 
-  SmallVector<MachineInstr *, 1> DeadPhis;
-  if (isDead(MI, DeadPhis)) {
-    DEBUG(dbgs() << "CTR looping will remove: " << *MI);
+    if (NotAlways)
+      continue;
 
-    // It is possible that some DBG_VALUE instructions refer to this
-    // instruction.  Examine each def operand for such references;
-    // if found, mark the DBG_VALUE as undef (but don't delete it).
-    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg() || !MO.isDef())
+    // Make sure this blocks ends with a conditional branch.
+    Instruction *TI = (*I)->getTerminator();
+    if (!TI)
+      continue;
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional())
         continue;
-      unsigned Reg = MO.getReg();
-      MachineRegisterInfo::use_iterator nextI;
-      for (MachineRegisterInfo::use_iterator I = MRI->use_begin(Reg),
-           E = MRI->use_end(); I!=E; I=nextI) {
-        nextI = llvm::next(I);  // I is invalidated by the setReg
-        MachineOperand& Use = I.getOperand();
-        MachineInstr *UseMI = Use.getParent();
-        if (UseMI==MI)
-          continue;
-        if (Use.isDebug()) // this might also be a instr -> phi -> instr case
-                           // which can also be removed.
-          UseMI->getOperand(0).setReg(0U);
-      }
-    }
 
-    MI->eraseFromParent();
-    for (unsigned i = 0; i < DeadPhis.size(); ++i) {
-      DeadPhis[i]->eraseFromParent();
-    }
+      CountedExitBranch = BI;
+    } else
+      continue;
+
+    // Note that this block may not be the loop latch block, even if the loop
+    // has a latch block.
+    CountedExitBlock = *I;
+    ExitCount = EC;
+    break;
   }
+
+  if (!CountedExitBlock)
+    return MadeChange;
+
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // If we don't have a preheader, then insert one. If we already have a
+  // preheader, then we can use it (except if the preheader contains a use of
+  // the CTR register because some such uses might be reordered by the
+  // selection DAG after the mtctr instruction).
+  if (!Preheader || mightUseCTR(TT, Preheader))
+    Preheader = InsertPreheaderForLoop(L, this);
+  if (!Preheader)
+    return MadeChange;
+
+  DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName() << "\n");
+
+  // Insert the count into the preheader and replace the condition used by the
+  // selected branch.
+  MadeChange = true;
+
+  SCEVExpander SCEVE(*SE, "loopcnt");
+  LLVMContext &C = SE->getContext();
+  Type *CountType = TT.isArch64Bit() ? Type::getInt64Ty(C) :
+                                       Type::getInt32Ty(C);
+  if (!ExitCount->getType()->isPointerTy() &&
+      ExitCount->getType() != CountType)
+    ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
+  ExitCount = SE->getAddExpr(ExitCount,
+                             SE->getConstant(CountType, 1)); 
+  Value *ECValue = SCEVE.expandCodeFor(ExitCount, CountType,
+                                       Preheader->getTerminator());
+
+  IRBuilder<> CountBuilder(Preheader->getTerminator());
+  Module *M = Preheader->getParent()->getParent();
+  Value *MTCTRFunc = Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr,
+                                               CountType);
+  CountBuilder.CreateCall(MTCTRFunc, ECValue);
+
+  IRBuilder<> CondBuilder(CountedExitBranch);
+  Value *DecFunc =
+    Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
+  Value *NewCond = CondBuilder.CreateCall(DecFunc);
+  Value *OldCond = CountedExitBranch->getCondition();
+  CountedExitBranch->setCondition(NewCond);
+
+  // The false branch must exit the loop.
+  if (!L->contains(CountedExitBranch->getSuccessor(0)))
+    CountedExitBranch->swapSuccessors();
+
+  // The old condition may be dead now, and may have even created a dead PHI
+  // (the original induction variable).
+  RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+  DeleteDeadPHIs(CountedExitBlock);
+
+  ++NumCTRLoops;
+  return MadeChange;
 }
 
-/// converToCTRLoop - check if the loop is a candidate for
-/// converting to a CTR loop.  If so, then perform the
-/// transformation.
-///
-/// This function works on innermost loops first.  A loop can
-/// be converted if it is a counting loop; either a register
-/// value or an immediate.
-///
-/// The code makes several assumptions about the representation
-/// of the loop in llvm.
-bool PPCCTRLoops::convertToCTRLoop(MachineLoop *L) {
-  bool Changed = false;
-  // Process nested loops first.
-  for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
-    Changed |= convertToCTRLoop(*I);
-  }
-  // If a nested loop has been converted, then we can't convert this loop.
-  if (Changed) {
-    return Changed;
+#ifndef NDEBUG
+static bool clobbersCTR(const MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg()) {
+      if (MO.isDef() && (MO.getReg() == PPC::CTR || MO.getReg() == PPC::CTR8))
+        return true;
+    } else if (MO.isRegMask()) {
+      if (MO.clobbersPhysReg(PPC::CTR) || MO.clobbersPhysReg(PPC::CTR8))
+        return true;
+    }
   }
 
-  SmallVector<MachineInstr *, 2> OldInsts;
-  // Are we able to determine the trip count for the loop?
-  CountValue *TripCount = getTripCount(L, OldInsts);
-  if (TripCount == 0) {
-    DEBUG(dbgs() << "failed to get trip count!\n");
-    return false;
-  }
+  return false;
+}
 
-  if (TripCount->isImm()) {
-    DEBUG(dbgs() << "constant trip count: " << TripCount->getImm() << "\n");
+static bool verifyCTRBranch(MachineBasicBlock *MBB,
+                            MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator BI = I;
+  SmallSet<MachineBasicBlock *, 16>   Visited;
+  SmallVector<MachineBasicBlock *, 8> Preds;
+  bool CheckPreds;
+
+  if (I == MBB->begin()) {
+    Visited.insert(MBB);
+    goto queue_preds;
+  } else
+    --I;
+
+check_block:
+  Visited.insert(MBB);
+  if (I == MBB->end())
+    goto queue_preds;
+
+  CheckPreds = true;
+  for (MachineBasicBlock::iterator IE = MBB->begin();; --I) {
+    unsigned Opc = I->getOpcode();
+    if (Opc == PPC::MTCTRloop || Opc == PPC::MTCTR8loop) {
+      CheckPreds = false;
+      break;
+    }
 
-    // FIXME: We currently can't form 64-bit constants
-    // (including 32-bit unsigned constants)
-    if (!isInt<32>(TripCount->getImm()))
+    if (I != BI && clobbersCTR(I)) {
+      DEBUG(dbgs() << "BB#" << MBB->getNumber() << " (" <<
+                      MBB->getFullName() << ") instruction " << *I <<
+                      " clobbers CTR, invalidating " << "BB#" <<
+                      BI->getParent()->getNumber() << " (" <<
+                      BI->getParent()->getFullName() << ") instruction " <<
+                      *BI << "\n");
       return false;
-  }
+    }
 
-  // Does the loop contain any invalid instructions?
-  if (containsInvalidInstruction(L)) {
-    return false;
+    if (I == IE)
+      break;
   }
-  MachineBasicBlock *Preheader = L->getLoopPreheader();
-  // No preheader means there's not place for the loop instr.
-  if (Preheader == 0) {
-    return false;
-  }
-  MachineBasicBlock::iterator InsertPos = Preheader->getFirstTerminator();
 
-  DebugLoc dl;
-  if (InsertPos != Preheader->end())
-    dl = InsertPos->getDebugLoc();
+  if (!CheckPreds && Preds.empty())
+    return true;
 
-  MachineBasicBlock *LastMBB = L->getExitingBlock();
-  // Don't generate CTR loop if the loop has more than one exit.
-  if (LastMBB == 0) {
-    return false;
-  }
-  MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
-
-  // Determine the loop start.
-  MachineBasicBlock *LoopStart = L->getTopBlock();
-  if (L->getLoopLatch() != LastMBB) {
-    // When the exit and latch are not the same, use the latch block as the
-    // start.
-    // The loop start address is used only after the 1st iteration, and the loop
-    // latch may contains instrs. that need to be executed after the 1st iter.
-    LoopStart = L->getLoopLatch();
-    // Make sure the latch is a successor of the exit, otherwise it won't work.
-    if (!LastMBB->isSuccessor(LoopStart)) {
+  if (CheckPreds) {
+queue_preds:
+    if (MachineFunction::iterator(MBB) == MBB->getParent()->begin()) {
+      DEBUG(dbgs() << "Unable to find a MTCTR instruction for BB#" <<
+                      BI->getParent()->getNumber() << " (" <<
+                      BI->getParent()->getFullName() << ") instruction " <<
+                      *BI << "\n");
       return false;
     }
+
+    for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+         PIE = MBB->pred_end(); PI != PIE; ++PI)
+      Preds.push_back(*PI);
   }
 
-  // Convert the loop to a CTR loop
-  DEBUG(dbgs() << "Change to CTR loop at "; L->dump());
-
-  MachineFunction *MF = LastMBB->getParent();
-  const PPCSubtarget &Subtarget = MF->getTarget().getSubtarget<PPCSubtarget>();
-  bool isPPC64 = Subtarget.isPPC64();
-
-  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
-  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
-  const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
-
-  unsigned CountReg;
-  if (TripCount->isReg()) {
-    // Create a copy of the loop count register.
-    const TargetRegisterClass *SrcRC =
-      MF->getRegInfo().getRegClass(TripCount->getReg());
-    CountReg = MF->getRegInfo().createVirtualRegister(RC);
-    unsigned CopyOp = (isPPC64 && GPRC->hasSubClassEq(SrcRC)) ?
-                        (unsigned) PPC::EXTSW_32_64 :
-                        (unsigned) TargetOpcode::COPY;
-    BuildMI(*Preheader, InsertPos, dl,
-            TII->get(CopyOp), CountReg).addReg(TripCount->getReg());
-    if (TripCount->isNeg()) {
-      unsigned CountReg1 = CountReg;
-      CountReg = MF->getRegInfo().createVirtualRegister(RC);
-      BuildMI(*Preheader, InsertPos, dl,
-              TII->get(isPPC64 ? PPC::NEG8 : PPC::NEG),
-                       CountReg).addReg(CountReg1);
+  do {
+    MBB = Preds.pop_back_val();
+    if (!Visited.count(MBB)) {
+      I = MBB->getLastNonDebugInstr();
+      goto check_block;
     }
-  } else {
-    assert(TripCount->isImm() && "Expecting immedate vaule for trip count");
-    // Put the trip count in a register for transfer into the count register.
-
-    int64_t CountImm = TripCount->getImm();
-    if (TripCount->isNeg())
-      CountImm = -CountImm;
-
-    CountReg = MF->getRegInfo().createVirtualRegister(RC);
-    if (abs64(CountImm) > 0x7FFF) {
-      BuildMI(*Preheader, InsertPos, dl,
-              TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS),
-              CountReg).addImm((CountImm >> 16) & 0xFFFF);
-      unsigned CountReg1 = CountReg;
-      CountReg = MF->getRegInfo().createVirtualRegister(RC);
-      BuildMI(*Preheader, InsertPos, dl,
-              TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
-              CountReg).addReg(CountReg1).addImm(CountImm & 0xFFFF);
-    } else {
-      BuildMI(*Preheader, InsertPos, dl,
-              TII->get(isPPC64 ? PPC::LI8 : PPC::LI),
-              CountReg).addImm(CountImm);
-    }
-  }
+  } while (!Preds.empty());
 
-  // Add the mtctr instruction to the beginning of the loop.
-  BuildMI(*Preheader, InsertPos, dl,
-          TII->get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(CountReg,
-            TripCount->isImm() ? RegState::Kill : 0);
-
-  // Make sure the loop start always has a reference in the CFG.  We need to
-  // create a BlockAddress operand to get this mechanism to work both the
-  // MachineBasicBlock and BasicBlock objects need the flag set.
-  LoopStart->setHasAddressTaken();
-  // This line is needed to set the hasAddressTaken flag on the BasicBlock
-  // object
-  BlockAddress::get(const_cast<BasicBlock *>(LoopStart->getBasicBlock()));
-
-  // Replace the loop branch with a bdnz instruction.
-  dl = LastI->getDebugLoc();
-  const std::vector<MachineBasicBlock*> Blocks = L->getBlocks();
-  for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
-    MachineBasicBlock *MBB = Blocks[i];
-    if (MBB != Preheader)
-      MBB->addLiveIn(isPPC64 ? PPC::CTR8 : PPC::CTR);
-  }
+  return true;
+}
 
-  // The loop ends with either:
-  //  - a conditional branch followed by an unconditional branch, or
-  //  - a conditional branch to the loop start.
-  assert(LastI->getOpcode() == PPC::BCC &&
-         "loop end must start with a BCC instruction");
-  // Either the BCC branches to the beginning of the loop, or it
-  // branches out of the loop and there is an unconditional branch
-  // to the start of the loop.
-  MachineBasicBlock *BranchTarget = LastI->getOperand(2).getMBB();
-  BuildMI(*LastMBB, LastI, dl,
-        TII->get((BranchTarget == LoopStart) ?
-                 (isPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
-                 (isPPC64 ? PPC::BDZ8 : PPC::BDZ))).addMBB(BranchTarget);
-
-  // Conditional branch; just delete it.
-  DEBUG(dbgs() << "Removing old branch: " << *LastI);
-  LastMBB->erase(LastI);
-
-  delete TripCount;
-
-  // The induction operation (add) and the comparison (cmpwi) may now be
-  // unneeded. If these are unneeded, then remove them.
-  for (unsigned i = 0; i < OldInsts.size(); ++i)
-    removeIfDead(OldInsts[i]);
+bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
+  MDT = &getAnalysis<MachineDominatorTree>();
+
+  // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
+  // any other instructions that might clobber the ctr register.
+  for (MachineFunction::iterator I = MF.begin(), IE = MF.end();
+       I != IE; ++I) {
+    MachineBasicBlock *MBB = I;
+    if (!MDT->isReachableFromEntry(MBB))
+      continue;
+
+    for (MachineBasicBlock::iterator MII = MBB->getFirstTerminator(),
+      MIIE = MBB->end(); MII != MIIE; ++MII) {
+      unsigned Opc = MII->getOpcode();
+      if (Opc == PPC::BDNZ8 || Opc == PPC::BDNZ ||
+          Opc == PPC::BDZ8  || Opc == PPC::BDZ)
+        if (!verifyCTRBranch(MBB, MII))
+          llvm_unreachable("Invalid PPC CTR loop!");
+    }
+  }
 
-  ++NumCTRLoops;
-  return true;
+  return false;
 }
+#endif // NDEBUG
 
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index c8a29a3..e8e7f4c 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -37,6 +37,37 @@ def RetCC_PPC : CallingConv<[
 ]>;
 
 
+// Note that we don't currently have calling conventions for 64-bit
+// PowerPC, but handle all the complexities of the ABI in the lowering
+// logic.  FIXME: See if the logic can be simplified with use of CCs.
+// This may require some extensions to current table generation.
+
+// Simple calling convention for 64-bit ELF PowerPC fast isel.
+// Only handle ints and floats.  All ints are promoted to i64.
+// Vector types and quadword ints are not handled.
+def CC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i8],  CCPromoteToType<i64>>,
+  CCIfType<[i16], CCPromoteToType<i64>>,
+  CCIfType<[i32], CCPromoteToType<i64>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>
+]>;
+
+// Simple return-value convention for 64-bit ELF PowerPC fast isel.
+// All small ints are promoted to i64.  Vector types, quadword ints,
+// and multiple register returns are "supported" to avoid compile
+// errors, but none are handled by the fast selector.
+def RetCC_PPC64_ELF_FIS : CallingConv<[
+  CCIfType<[i8],   CCPromoteToType<i64>>,
+  CCIfType<[i16],  CCPromoteToType<i64>>,
+  CCIfType<[i32],  CCPromoteToType<i64>>,
+  CCIfType<[i64],  CCAssignToReg<[X3, X4]>>,
+  CCIfType<[i128], CCAssignToReg<[X3, X4, X5, X6]>>,
+  CCIfType<[f32],  CCAssignToReg<[F1, F2]>>,
+  CCIfType<[f64],  CCAssignToReg<[F1, F2, F3, F4]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC System V Release 4 32-bit ABI
 //===----------------------------------------------------------------------===//
@@ -105,40 +136,45 @@ def CC_PPC32_SVR4_ByVal : CallingConv<[
   CCCustom<"CC_PPC32_SVR4_Custom_Dummy">
 ]>;
 
+def CSR_Altivec : CalleeSavedRegs<(add V20, V21, V22, V23, V24, V25, V26, V27,
+                                       V28, V29, V30, V31)>;
+
 def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
                                         R21, R22, R23, R24, R25, R26, R27, R28,
                                         R29, R30, R31, F14, F15, F16, F17, F18,
                                         F19, F20, F21, F22, F23, F24, F25, F26,
-                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
-                                        V20, V21, V22, V23, V24, V25, V26, V27,
-                                        V28, V29, V30, V31)>;
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
 
-def CSR_SVR432   : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20, VRSAVE,
+def CSR_SVR432   : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
                                         R21, R22, R23, R24, R25, R26, R27, R28,
                                         R29, R30, R31, F14, F15, F16, F17, F18,
                                         F19, F20, F21, F22, F23, F24, F25, F26,
-                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
-                                        V20, V21, V22, V23, V24, V25, V26, V27,
-                                        V28, V29, V30, V31)>;
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
+def CSR_SVR432_Altivec : CalleeSavedRegs<(add CSR_SVR432, CSR_Altivec)>;
 
 def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
                                         X21, X22, X23, X24, X25, X26, X27, X28,
                                         X29, X30, X31, F14, F15, F16, F17, F18,
                                         F19, F20, F21, F22, F23, F24, F25, F26,
-                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
-                                        V20, V21, V22, V23, V24, V25, V26, V27,
-                                        V28, V29, V30, V31)>;
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
 
-def CSR_SVR464   : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20, VRSAVE,
+def CSR_Darwin64_Altivec : CalleeSavedRegs<(add CSR_Darwin64, CSR_Altivec)>;
+
+def CSR_SVR464   : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
                                         X21, X22, X23, X24, X25, X26, X27, X28,
                                         X29, X30, X31, F14, F15, F16, F17, F18,
                                         F19, F20, F21, F22, F23, F24, F25, F26,
-                                        F27, F28, F29, F30, F31, CR2, CR3, CR4,
-                                        V20, V21, V22, V23, V24, V25, V26, V27,
-                                        V28, V29, V30, V31)>;
+                                        F27, F28, F29, F30, F31, CR2, CR3, CR4
+                                   )>;
+
 
-def CSR_NoRegs : CalleeSavedRegs<(add VRSAVE)>;
-def CSR_NoRegs_Darwin : CalleeSavedRegs<(add)>;
+def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
 
-def CSR_NoRegs_Altivec : CalleeSavedRegs<(add (sequence "V%u", 0, 31), VRSAVE)>;
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
 
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index 6478718..418736e 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -63,12 +63,15 @@ namespace {
     unsigned get_crbitm_encoding(const MachineInstr &MI, unsigned OpNo) const;
     unsigned getDirectBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
     unsigned getCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getAbsDirectBrEncoding(const MachineInstr &MI,
+                                    unsigned OpNo) const;
+    unsigned getAbsCondBrEncoding(const MachineInstr &MI, unsigned OpNo) const;
 
-    unsigned getHA16Encoding(const MachineInstr &MI, unsigned OpNo) const;
-    unsigned getLO16Encoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getImm16Encoding(const MachineInstr &MI, unsigned OpNo) const;
     unsigned getMemRIEncoding(const MachineInstr &MI, unsigned OpNo) const;
     unsigned getMemRIXEncoding(const MachineInstr &MI, unsigned OpNo) const;
     unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const;
+    unsigned getTLSCallEncoding(const MachineInstr &MI, unsigned OpNo) const;
 
     const char *getPassName() const { return "PowerPC Machine Code Emitter"; }
 
@@ -139,8 +142,8 @@ void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
 unsigned PPCCodeEmitter::get_crbitm_encoding(const MachineInstr &MI,
                                              unsigned OpNo) const {
   const MachineOperand &MO = MI.getOperand(OpNo);
-  assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MTCRF8 ||
-            MI.getOpcode() == PPC::MFOCRF) &&
+  assert((MI.getOpcode() == PPC::MTOCRF || MI.getOpcode() == PPC::MTOCRF8 ||
+          MI.getOpcode() == PPC::MFOCRF || MI.getOpcode() == PPC::MFOCRF8) &&
          (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
   return 0x80 >> TM.getRegisterInfo()->getEncodingValue(MO.getReg());
 }
@@ -194,21 +197,32 @@ unsigned PPCCodeEmitter::getCondBrEncoding(const MachineInstr &MI,
   return 0;
 }
 
-unsigned PPCCodeEmitter::getHA16Encoding(const MachineInstr &MI,
-                                         unsigned OpNo) const {
+unsigned PPCCodeEmitter::getAbsDirectBrEncoding(const MachineInstr &MI,
+                                                unsigned OpNo) const {
   const MachineOperand &MO = MI.getOperand(OpNo);
   if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
 
-  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_high));
-  return 0;
+  llvm_unreachable("Absolute branch relocations unsupported on the old JIT.");
+}
+
+unsigned PPCCodeEmitter::getAbsCondBrEncoding(const MachineInstr &MI,
+                                              unsigned OpNo) const {
+  llvm_unreachable("Absolute branch relocations unsupported on the old JIT.");
 }
 
-unsigned PPCCodeEmitter::getLO16Encoding(const MachineInstr &MI,
-                                         unsigned OpNo) const {
+unsigned PPCCodeEmitter::getImm16Encoding(const MachineInstr &MI,
+                                          unsigned OpNo) const {
   const MachineOperand &MO = MI.getOperand(OpNo);
   if (MO.isReg() || MO.isImm()) return getMachineOpValue(MI, MO);
-  
-  MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low));
+
+  unsigned RelocID;
+  switch (MO.getTargetFlags() & PPCII::MO_ACCESS_MASK) {
+    default: llvm_unreachable("Unsupported target operand flags!");
+    case PPCII::MO_LO: RelocID = PPC::reloc_absolute_low; break;
+    case PPCII::MO_HA: RelocID = PPC::reloc_absolute_high; break;
+  }
+
+  MCE.addRelocation(GetRelocation(MO, RelocID));
   return 0;
 }
 
@@ -237,7 +251,7 @@ unsigned PPCCodeEmitter::getMemRIXEncoding(const MachineInstr &MI,
   
   const MachineOperand &MO = MI.getOperand(OpNo);
   if (MO.isImm())
-    return (getMachineOpValue(MI, MO) & 0x3FFF) | RegBits;
+    return ((getMachineOpValue(MI, MO) >> 2) & 0x3FFF) | RegBits;
   
   MCE.addRelocation(GetRelocation(MO, PPC::reloc_absolute_low_ix));
   return RegBits;
@@ -250,15 +264,20 @@ unsigned PPCCodeEmitter::getTLSRegEncoding(const MachineInstr &MI,
   return 0;
 }
 
+unsigned PPCCodeEmitter::getTLSCallEncoding(const MachineInstr &MI,
+                                            unsigned OpNo) const {
+  llvm_unreachable("TLS not supported on the old JIT.");
+  return 0;
+}
 
 unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
                                            const MachineOperand &MO) const {
 
   if (MO.isReg()) {
-    // MTCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
+    // MTOCRF/MFOCRF should go through get_crbitm_encoding for the CR operand.
     // The GPR operand should come through here though.
-    assert((MI.getOpcode() != PPC::MTCRF && MI.getOpcode() != PPC::MTCRF8 &&
-             MI.getOpcode() != PPC::MFOCRF) ||
+    assert((MI.getOpcode() != PPC::MTOCRF && MI.getOpcode() != PPC::MTOCRF8 &&
+            MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) ||
            MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7);
     return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
   }
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
new file mode 100644
index 0000000..09117e7
--- /dev/null
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -0,0 +1,2236 @@
+//===-- PPCFastISel.cpp - PowerPC FastISel implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC-specific support for the FastISel class. Some
+// of the target-specific code is generated by tablegen in the file
+// PPCGenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppcfastisel"
+#include "PPC.h"
+#include "PPCISelLowering.h"
+#include "PPCSubtarget.h"
+#include "PPCTargetMachine.h"
+#include "MCTargetDesc/PPCPredicates.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+
+//===----------------------------------------------------------------------===//
+//
+// TBD:
+//   FastLowerArguments: Handle simple cases.
+//   PPCMaterializeGV: Handle TLS.
+//   SelectCall: Handle function pointers.
+//   SelectCall: Handle multi-register return values.
+//   SelectCall: Optimize away nops for local calls.
+//   processCallArgs: Handle bit-converted arguments.
+//   finishCall: Handle multi-register return values.
+//   PPCComputeAddress: Handle parameter references as FrameIndex's.
+//   PPCEmitCmp: Handle immediate as operand 1.
+//   SelectCall: Handle small byval arguments.
+//   SelectIntrinsicCall: Implement.
+//   SelectSelect: Implement.
+//   Consider factoring isTypeLegal into the base class.
+//   Implement switches and jump tables.
+//
+//===----------------------------------------------------------------------===//
+using namespace llvm;
+
+namespace {
+
+typedef struct Address {
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  union {
+    unsigned Reg;
+    int FI;
+  } Base;
+
+  long Offset;
+
+  // Innocuous defaults for our address.
+  Address()
+   : BaseType(RegBase), Offset(0) {
+     Base.Reg = 0;
+   }
+} Address;
+
+class PPCFastISel : public FastISel {
+
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  const PPCSubtarget &PPCSubTarget;
+  LLVMContext *Context;
+
+  public:
+    explicit PPCFastISel(FunctionLoweringInfo &FuncInfo,
+                         const TargetLibraryInfo *LibInfo)
+    : FastISel(FuncInfo, LibInfo),
+      TM(FuncInfo.MF->getTarget()),
+      TII(*TM.getInstrInfo()),
+      TLI(*TM.getTargetLowering()),
+      PPCSubTarget(
+       *((static_cast<const PPCTargetMachine *>(&TM))->getSubtargetImpl())
+      ),
+      Context(&FuncInfo.Fn->getContext()) { }
+
+  // Backend specific FastISel code.
+  private:
+    virtual bool TargetSelectInstruction(const Instruction *I);
+    virtual unsigned TargetMaterializeConstant(const Constant *C);
+    virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
+    virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                     const LoadInst *LI);
+    virtual bool FastLowerArguments();
+    virtual unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm);
+    virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill,
+                                     uint64_t Imm);
+    virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+                                    const TargetRegisterClass *RC,
+                                    unsigned Op0, bool Op0IsKill);
+    virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass *RC,
+                                     unsigned Op0, bool Op0IsKill,
+                                     unsigned Op1, bool Op1IsKill);
+
+  // Instruction selection routines.
+  private:
+    bool SelectLoad(const Instruction *I);
+    bool SelectStore(const Instruction *I);
+    bool SelectBranch(const Instruction *I);
+    bool SelectIndirectBr(const Instruction *I);
+    bool SelectCmp(const Instruction *I);
+    bool SelectFPExt(const Instruction *I);
+    bool SelectFPTrunc(const Instruction *I);
+    bool SelectIToFP(const Instruction *I, bool IsSigned);
+    bool SelectFPToI(const Instruction *I, bool IsSigned);
+    bool SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode);
+    bool SelectCall(const Instruction *I);
+    bool SelectRet(const Instruction *I);
+    bool SelectTrunc(const Instruction *I);
+    bool SelectIntExt(const Instruction *I);
+
+  // Utility routines.
+  private:
+    bool isTypeLegal(Type *Ty, MVT &VT);
+    bool isLoadTypeLegal(Type *Ty, MVT &VT);
+    bool PPCEmitCmp(const Value *Src1Value, const Value *Src2Value,
+                    bool isZExt, unsigned DestReg);
+    bool PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                     const TargetRegisterClass *RC, bool IsZExt = true,
+                     unsigned FP64LoadOpc = PPC::LFD);
+    bool PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr);
+    bool PPCComputeAddress(const Value *Obj, Address &Addr);
+    void PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+                            unsigned &IndexReg);
+    bool PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                           unsigned DestReg, bool IsZExt);
+    unsigned PPCMaterializeFP(const ConstantFP *CFP, MVT VT);
+    unsigned PPCMaterializeGV(const GlobalValue *GV, MVT VT);
+    unsigned PPCMaterializeInt(const Constant *C, MVT VT);
+    unsigned PPCMaterialize32BitInt(int64_t Imm,
+                                    const TargetRegisterClass *RC);
+    unsigned PPCMaterialize64BitInt(int64_t Imm,
+                                    const TargetRegisterClass *RC);
+    unsigned PPCMoveToIntReg(const Instruction *I, MVT VT,
+                             unsigned SrcReg, bool IsSigned);
+    unsigned PPCMoveToFPReg(MVT VT, unsigned SrcReg, bool IsSigned);
+
+  // Call handling routines.
+  private:
+    bool processCallArgs(SmallVectorImpl<Value*> &Args,
+                         SmallVectorImpl<unsigned> &ArgRegs,
+                         SmallVectorImpl<MVT> &ArgVTs,
+                         SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                         SmallVectorImpl<unsigned> &RegArgs,
+                         CallingConv::ID CC,
+                         unsigned &NumBytes,
+                         bool IsVarArg);
+    void finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                    const Instruction *I, CallingConv::ID CC,
+                    unsigned &NumBytes, bool IsVarArg);
+    CCAssignFn *usePPC32CCs(unsigned Flag);
+
+  private:
+  #include "PPCGenFastISel.inc"
+
+};
+
+} // end anonymous namespace
+
+#include "PPCGenCallingConv.inc"
+
+// Function whose sole purpose is to kill compiler warnings 
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCFastISel::usePPC32CCs(unsigned Flag) {
+  if (Flag == 1)
+    return CC_PPC32_SVR4;
+  else if (Flag == 2)
+    return CC_PPC32_SVR4_ByVal;
+  else if (Flag == 3)
+    return CC_PPC32_SVR4_VarArg;
+  else
+    return RetCC_PPC;
+}
+
+static Optional<PPC::Predicate> getComparePred(CmpInst::Predicate Pred) {
+  switch (Pred) {
+    // These are not representable with any single compare.
+    case CmpInst::FCMP_FALSE:
+    case CmpInst::FCMP_UEQ:
+    case CmpInst::FCMP_UGT:
+    case CmpInst::FCMP_UGE:
+    case CmpInst::FCMP_ULT:
+    case CmpInst::FCMP_ULE:
+    case CmpInst::FCMP_UNE:
+    case CmpInst::FCMP_TRUE:
+    default:
+      return Optional<PPC::Predicate>();
+
+    case CmpInst::FCMP_OEQ:
+    case CmpInst::ICMP_EQ:
+      return PPC::PRED_EQ;
+
+    case CmpInst::FCMP_OGT:
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_SGT:
+      return PPC::PRED_GT;
+
+    case CmpInst::FCMP_OGE:
+    case CmpInst::ICMP_UGE:
+    case CmpInst::ICMP_SGE:
+      return PPC::PRED_GE;
+
+    case CmpInst::FCMP_OLT:
+    case CmpInst::ICMP_ULT:
+    case CmpInst::ICMP_SLT:
+      return PPC::PRED_LT;
+
+    case CmpInst::FCMP_OLE:
+    case CmpInst::ICMP_ULE:
+    case CmpInst::ICMP_SLE:
+      return PPC::PRED_LE;
+
+    case CmpInst::FCMP_ONE:
+    case CmpInst::ICMP_NE:
+      return PPC::PRED_NE;
+
+    case CmpInst::FCMP_ORD:
+      return PPC::PRED_NU;
+
+    case CmpInst::FCMP_UNO:
+      return PPC::PRED_UN;
+  }
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel, and return its equivalent machine type in VT.
+// FIXME: Copied directly from ARM -- factor into base class?
+bool PPCFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT Evt = TLI.getValueType(Ty, true);
+
+  // Only handle simple types.
+  if (Evt == MVT::Other || !Evt.isSimple()) return false;
+  VT = Evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+// Determine whether the type Ty is simple enough to be handled by
+// fast-isel as a load target, and return its equivalent machine type in VT.
+bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT)) return true;
+
+  // If this is a type than can be sign or zero-extended to a basic operation
+  // go ahead and accept it now.
+  if (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) {
+    return true;
+  }
+
+  return false;
+}
+
+// Given a value Obj, create an Address object Addr that represents its
+// address.  Return false if we can't handle it.
+bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = NULL;
+  unsigned Opcode = Instruction::UserOp1;
+  if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
+    // Don't walk into other basic blocks unless the object is an alloca from
+    // another block, otherwise it may not have a virtual register assigned.
+    if (FuncInfo.StaticAllocaMap.count(static_cast<const AllocaInst *>(Obj)) ||
+        FuncInfo.MBBMap[I->getParent()] == FuncInfo.MBB) {
+      Opcode = I->getOpcode();
+      U = I;
+    }
+  } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(Obj)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+    default:
+      break;
+    case Instruction::BitCast:
+      // Look through bitcasts.
+      return PPCComputeAddress(U->getOperand(0), Addr);
+    case Instruction::IntToPtr:
+      // Look past no-op inttoptrs.
+      if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::PtrToInt:
+      // Look past no-op ptrtoints.
+      if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+        return PPCComputeAddress(U->getOperand(0), Addr);
+      break;
+    case Instruction::GetElementPtr: {
+      Address SavedAddr = Addr;
+      long TmpOffset = Addr.Offset;
+
+      // Iterate through the GEP folding the constants into offsets where
+      // we can.
+      gep_type_iterator GTI = gep_type_begin(U);
+      for (User::const_op_iterator II = U->op_begin() + 1, IE = U->op_end();
+           II != IE; ++II, ++GTI) {
+        const Value *Op = *II;
+        if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+          const StructLayout *SL = TD.getStructLayout(STy);
+          unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+          TmpOffset += SL->getElementOffset(Idx);
+        } else {
+          uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+          for (;;) {
+            if (const ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+              // Constant-offset addressing.
+              TmpOffset += CI->getSExtValue() * S;
+              break;
+            }
+            if (canFoldAddIntoGEP(U, Op)) {
+              // A compatible add with a constant operand. Fold the constant.
+              ConstantInt *CI =
+              cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
+              TmpOffset += CI->getSExtValue() * S;
+              // Iterate on the other operand.
+              Op = cast<AddOperator>(Op)->getOperand(0);
+              continue;
+            }
+            // Unsupported
+            goto unsupported_gep;
+          }
+        }
+      }
+
+      // Try to grab the base operand now.
+      Addr.Offset = TmpOffset;
+      if (PPCComputeAddress(U->getOperand(0), Addr)) return true;
+
+      // We failed, restore everything and try the other options.
+      Addr = SavedAddr;
+
+      unsupported_gep:
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst *AI = cast<AllocaInst>(Obj);
+      DenseMap<const AllocaInst*, int>::iterator SI =
+        FuncInfo.StaticAllocaMap.find(AI);
+      if (SI != FuncInfo.StaticAllocaMap.end()) {
+        Addr.BaseType = Address::FrameIndexBase;
+        Addr.Base.FI = SI->second;
+        return true;
+      }
+      break;
+    }
+  }
+
+  // FIXME: References to parameters fall through to the behavior
+  // below.  They should be able to reference a frame index since
+  // they are stored to the stack, so we can get "ld rx, offset(r1)"
+  // instead of "addi ry, r1, offset / ld rx, 0(ry)".  Obj will
+  // just contain the parameter.  Try to handle this with a FI.
+
+  // Try to get this in a register if nothing else has worked.
+  if (Addr.Base.Reg == 0)
+    Addr.Base.Reg = getRegForValue(Obj);
+
+  // Prevent assignment of base register to X0, which is inappropriate
+  // for loads and stores alike.
+  if (Addr.Base.Reg != 0)
+    MRI.setRegClass(Addr.Base.Reg, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  return Addr.Base.Reg != 0;
+}
+
+// Fix up some addresses that can't be used directly.  For example, if
+// an offset won't fit in an instruction field, we may need to move it
+// into an index register.
+void PPCFastISel::PPCSimplifyAddress(Address &Addr, MVT VT, bool &UseOffset,
+                                     unsigned &IndexReg) {
+
+  // Check whether the offset fits in the instruction field.
+  if (!isInt<16>(Addr.Offset))
+    UseOffset = false;
+
+  // If this is a stack pointer and the offset needs to be simplified then
+  // put the alloca address into a register, set the base type back to
+  // register and continue. This should almost never happen.
+  if (!UseOffset && Addr.BaseType == Address::FrameIndexBase) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(Addr.Base.FI).addImm(0);
+    Addr.Base.Reg = ResultReg;
+    Addr.BaseType = Address::RegBase;
+  }
+
+  if (!UseOffset) {
+    IntegerType *OffsetTy = ((VT == MVT::i32) ? Type::getInt32Ty(*Context)
+                             : Type::getInt64Ty(*Context));
+    const ConstantInt *Offset =
+      ConstantInt::getSigned(OffsetTy, (int64_t)(Addr.Offset));
+    IndexReg = PPCMaterializeInt(Offset, MVT::i64);
+    assert(IndexReg && "Unexpected error in PPCMaterializeInt!");
+  }
+}
+
+// Emit a load instruction if possible, returning true if we succeeded,
+// otherwise false.  See commentary below for how the register class of
+// the load is determined. 
+bool PPCFastISel::PPCEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
+                              const TargetRegisterClass *RC,
+                              bool IsZExt, unsigned FP64LoadOpc) {
+  unsigned Opc;
+  bool UseOffset = true;
+
+  // If ResultReg is given, it determines the register class of the load.
+  // Otherwise, RC is the register class to use.  If the result of the
+  // load isn't anticipated in this block, both may be zero, in which
+  // case we must make a conservative guess.  In particular, don't assign
+  // R0 or X0 to the result register, as the result may be used in a load,
+  // store, add-immediate, or isel that won't permit this.  (Though
+  // perhaps the spill and reload of live-exit values would handle this?)
+  const TargetRegisterClass *UseRC =
+    (ResultReg ? MRI.getRegClass(ResultReg) :
+     (RC ? RC :
+      (VT == MVT::f64 ? &PPC::F8RCRegClass :
+       (VT == MVT::f32 ? &PPC::F4RCRegClass :
+        (VT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+         &PPC::GPRC_and_GPRC_NOR0RegClass)))));
+
+  bool Is32BitInt = UseRC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::LBZ : PPC::LBZ8;
+      break;
+    case MVT::i16:
+      Opc = (IsZExt ?
+             (Is32BitInt ? PPC::LHZ : PPC::LHZ8) : 
+             (Is32BitInt ? PPC::LHA : PPC::LHA8));
+      break;
+    case MVT::i32:
+      Opc = (IsZExt ? 
+             (Is32BitInt ? PPC::LWZ : PPC::LWZ8) :
+             (Is32BitInt ? PPC::LWA_32 : PPC::LWA));
+      if ((Opc == PPC::LWA || Opc == PPC::LWA_32) && ((Addr.Offset & 3) != 0))
+        UseOffset = false;
+      break;
+    case MVT::i64:
+      Opc = PPC::LD;
+      assert(UseRC->hasSuperClassEq(&PPC::G8RCRegClass) && 
+             "64-bit load with 32-bit target??");
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::LFS;
+      break;
+    case MVT::f64:
+      Opc = FP64LoadOpc;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+  if (ResultReg == 0)
+    ResultReg = createResultReg(UseRC);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+
+    MachineMemOperand *MMO =
+      FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+        MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset) {
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+      .addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  } else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::LBZ:    Opc = PPC::LBZX;    break;
+      case PPC::LBZ8:   Opc = PPC::LBZX8;   break;
+      case PPC::LHZ:    Opc = PPC::LHZX;    break;
+      case PPC::LHZ8:   Opc = PPC::LHZX8;   break;
+      case PPC::LHA:    Opc = PPC::LHAX;    break;
+      case PPC::LHA8:   Opc = PPC::LHAX8;   break;
+      case PPC::LWZ:    Opc = PPC::LWZX;    break;
+      case PPC::LWZ8:   Opc = PPC::LWZX8;   break;
+      case PPC::LWA:    Opc = PPC::LWAX;    break;
+      case PPC::LWA_32: Opc = PPC::LWAX_32; break;
+      case PPC::LD:     Opc = PPC::LDX;     break;
+      case PPC::LFS:    Opc = PPC::LFSX;    break;
+      case PPC::LFD:    Opc = PPC::LFDX;    break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+      .addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a load instruction.
+bool PPCFastISel::SelectLoad(const Instruction *I) {
+  // FIXME: No atomic loads are supported.
+  if (cast<LoadInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getType(), VT))
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(0), Addr))
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  This is necessary
+  // to constrain RA from using R0/X0 when this is not legal.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
+    return false;
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Emit a store instruction to store SrcReg at Addr.
+bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
+  assert(SrcReg && "Nothing to store!");
+  unsigned Opc;
+  bool UseOffset = true;
+
+  const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+  bool Is32BitInt = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  switch (VT.SimpleTy) {
+    default: // e.g., vector types not handled
+      return false;
+    case MVT::i8:
+      Opc = Is32BitInt ? PPC::STB : PPC::STB8;
+      break;
+    case MVT::i16:
+      Opc = Is32BitInt ? PPC::STH : PPC::STH8;
+      break;
+    case MVT::i32:
+      assert(Is32BitInt && "Not GPRC for i32??");
+      Opc = PPC::STW;
+      break;
+    case MVT::i64:
+      Opc = PPC::STD;
+      UseOffset = ((Addr.Offset & 3) == 0);
+      break;
+    case MVT::f32:
+      Opc = PPC::STFS;
+      break;
+    case MVT::f64:
+      Opc = PPC::STFD;
+      break;
+  }
+
+  // If necessary, materialize the offset into a register and use
+  // the indexed form.  Also handle stack pointers with special needs.
+  unsigned IndexReg = 0;
+  PPCSimplifyAddress(Addr, VT, UseOffset, IndexReg);
+
+  // Note: If we still have a frame index here, we know the offset is
+  // in range, as otherwise PPCSimplifyAddress would have converted it
+  // into a RegBase.
+  if (Addr.BaseType == Address::FrameIndexBase) {
+    MachineMemOperand *MMO =
+      FuncInfo.MF->getMachineMemOperand(
+        MachinePointerInfo::getFixedStack(Addr.Base.FI, Addr.Offset),
+        MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
+        MFI.getObjectAlignment(Addr.Base.FI));
+
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc)).addReg(SrcReg)
+      .addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
+
+  // Base reg with offset in range.
+  } else if (UseOffset)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+      .addReg(SrcReg).addImm(Addr.Offset).addReg(Addr.Base.Reg);
+
+  // Indexed form.
+  else {
+    // Get the RR opcode corresponding to the RI one.  FIXME: It would be
+    // preferable to use the ImmToIdxMap from PPCRegisterInfo.cpp, but it
+    // is hard to get at.
+    switch (Opc) {
+      default:        llvm_unreachable("Unexpected opcode!");
+      case PPC::STB:  Opc = PPC::STBX;  break;
+      case PPC::STH : Opc = PPC::STHX;  break;
+      case PPC::STW : Opc = PPC::STWX;  break;
+      case PPC::STB8: Opc = PPC::STBX8; break;
+      case PPC::STH8: Opc = PPC::STHX8; break;
+      case PPC::STW8: Opc = PPC::STWX8; break;
+      case PPC::STD:  Opc = PPC::STDX;  break;
+      case PPC::STFS: Opc = PPC::STFSX; break;
+      case PPC::STFD: Opc = PPC::STFDX; break;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
+      .addReg(SrcReg).addReg(Addr.Base.Reg).addReg(IndexReg);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select a store instruction.
+bool PPCFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // FIXME: No atomics loads are supported.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(Op0->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!PPCEmitStore(VT, SrcReg, Addr))
+    return false;
+
+  return true;
+}
+
+// Attempt to fast-select a branch instruction.
+bool PPCFastISel::SelectBranch(const Instruction *I) {
+  const BranchInst *BI = cast<BranchInst>(I);
+  MachineBasicBlock *BrBB = FuncInfo.MBB;
+  MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
+  MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
+
+  // For now, just try the simplest case where it's fed by a compare.
+  if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
+    Optional<PPC::Predicate> OptPPCPred = getComparePred(CI->getPredicate());
+    if (!OptPPCPred)
+      return false;
+
+    PPC::Predicate PPCPred = OptPPCPred.getValue();
+
+    // Take advantage of fall-through opportunities.
+    if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
+      std::swap(TBB, FBB);
+      PPCPred = PPC::InvertPredicate(PPCPred);
+    }
+
+    unsigned CondReg = createResultReg(&PPC::CRRCRegClass);
+
+    if (!PPCEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+                    CondReg))
+      return false;
+
+    BuildMI(*BrBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCC))
+      .addImm(PPCPred).addReg(CondReg).addMBB(TBB);
+    FastEmitBranch(FBB, DL);
+    FuncInfo.MBB->addSuccessor(TBB);
+    return true;
+
+  } else if (const ConstantInt *CI =
+             dyn_cast<ConstantInt>(BI->getCondition())) {
+    uint64_t Imm = CI->getZExtValue();
+    MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
+    FastEmitBranch(Target, DL);
+    return true;
+  }
+
+  // FIXME: ARM looks for a case where the block containing the compare
+  // has been split from the block containing the branch.  If this happens,
+  // there is a vreg available containing the result of the compare.  I'm
+  // not sure we can do much, as we've lost the predicate information with
+  // the compare instruction -- we have a 4-bit CR but don't know which bit
+  // to test here.
+  return false;
+}
+
+// Attempt to emit a compare of the two source values.  Signed and unsigned
+// comparisons are supported.  Return false if we can't handle it.
+bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
+                             bool IsZExt, unsigned DestReg) {
+  Type *Ty = SrcValue1->getType();
+  EVT SrcEVT = TLI.getValueType(Ty, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  // See if operand 2 is an immediate encodeable in the compare.
+  // FIXME: Operands are not in canonical order at -O0, so an immediate
+  // operand in position 1 is a lost opportunity for now.  We are
+  // similar to ARM in this regard.
+  long Imm = 0;
+  bool UseImm = false;
+
+  // Only 16-bit integer constants can be represented in compares for 
+  // PowerPC.  Others will be materialized into a register.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(SrcValue2)) {
+    if (SrcVT == MVT::i64 || SrcVT == MVT::i32 || SrcVT == MVT::i16 ||
+        SrcVT == MVT::i8 || SrcVT == MVT::i1) {
+      const APInt &CIVal = ConstInt->getValue();
+      Imm = (IsZExt) ? (long)CIVal.getZExtValue() : (long)CIVal.getSExtValue();
+      if ((IsZExt && isUInt<16>(Imm)) || (!IsZExt && isInt<16>(Imm)))
+        UseImm = true;
+    }
+  }
+
+  unsigned CmpOpc;
+  bool NeedsExt = false;
+  switch (SrcVT.SimpleTy) {
+    default: return false;
+    case MVT::f32:
+      CmpOpc = PPC::FCMPUS;
+      break;
+    case MVT::f64:
+      CmpOpc = PPC::FCMPUD;
+      break;
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+      NeedsExt = true;
+      // Intentional fall-through.
+    case MVT::i32:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLW : PPC::CMPW;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLWI : PPC::CMPWI;
+      break;
+    case MVT::i64:
+      if (!UseImm)
+        CmpOpc = IsZExt ? PPC::CMPLD : PPC::CMPD;
+      else
+        CmpOpc = IsZExt ? PPC::CMPLDI : PPC::CMPDI;
+      break;
+  }
+
+  unsigned SrcReg1 = getRegForValue(SrcValue1);
+  if (SrcReg1 == 0)
+    return false;
+
+  unsigned SrcReg2 = 0;
+  if (!UseImm) {
+    SrcReg2 = getRegForValue(SrcValue2);
+    if (SrcReg2 == 0)
+      return false;
+  }
+
+  if (NeedsExt) {
+    unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg1, MVT::i32, ExtReg, IsZExt))
+      return false;
+    SrcReg1 = ExtReg;
+
+    if (!UseImm) {
+      unsigned ExtReg = createResultReg(&PPC::GPRCRegClass);
+      if (!PPCEmitIntExt(SrcVT, SrcReg2, MVT::i32, ExtReg, IsZExt))
+        return false;
+      SrcReg2 = ExtReg;
+    }
+  }
+
+  if (!UseImm)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addReg(SrcReg2);
+  else
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CmpOpc), DestReg)
+      .addReg(SrcReg1).addImm(Imm);
+
+  return true;
+}
+
+// Attempt to fast-select a floating-point extend instruction.
+bool PPCFastISel::SelectFPExt(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f32 || DestVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // No code is generated for a FP extend.
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select a floating-point truncate instruction.
+bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::f64 || DestVT != MVT::f32)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // Round the result to single precision.
+  unsigned DestReg = createResultReg(&PPC::F4RCRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP), DestReg)
+    .addReg(SrcReg);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+// Move an i32 or i64 value in a GPR to an f64 value in an FPR.
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+// FIXME: The code here is sloppy for the 4-byte case.  Can use a 4-byte
+// stack slot and 4-byte store/load sequence.  Or just sext the 4-byte
+// case to 8 bytes which produces tighter code but wastes stack space.
+unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
+                                     bool IsSigned) {
+
+  // If necessary, extend 32-bit int to 64-bit.
+  if (SrcVT == MVT::i32) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(MVT::i32, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return 0;
+    SrcReg = TmpReg;
+  }
+
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the GPR.
+  if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
+    return 0;
+
+  // Load the integer value into an FPR.  The kind of load used depends
+  // on a number of conditions.
+  unsigned LoadOpc = PPC::LFD;
+
+  if (SrcVT == MVT::i32) {
+    Addr.Offset = 4;
+    if (!IsSigned)
+      LoadOpc = PPC::LFIWZX;
+    else if (PPCSubTarget.hasLFIWAX())
+      LoadOpc = PPC::LFIWAX;
+  }
+
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(MVT::f64, ResultReg, Addr, RC, !IsSigned, LoadOpc))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select an integer-to-floating-point conversion.
+bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
+  MVT DstVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::f32 && DstVT != MVT::f64)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  EVT SrcEVT = TLI.getValueType(Src->getType(), true);
+  if (!SrcEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+
+  if (SrcVT != MVT::i8  && SrcVT != MVT::i16 &&
+      SrcVT != MVT::i32 && SrcVT != MVT::i64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // We can only lower an unsigned convert if we have the newer
+  // floating-point conversion operations.
+  if (!IsSigned && !PPCSubTarget.hasFPCVT())
+    return false;
+
+  // FIXME: For now we require the newer floating-point conversion operations
+  // (which are present only on P7 and A2 server models) when converting
+  // to single-precision float.  Otherwise we have to generate a lot of
+  // fiddly code to avoid double rounding.  If necessary, the fiddly code
+  // can be found in PPCTargetLowering::LowerINT_TO_FP().
+  if (DstVT == MVT::f32 && !PPCSubTarget.hasFPCVT())
+    return false;
+
+  // Extend the input if necessary.
+  if (SrcVT == MVT::i8 || SrcVT == MVT::i16) {
+    unsigned TmpReg = createResultReg(&PPC::G8RCRegClass);
+    if (!PPCEmitIntExt(SrcVT, SrcReg, MVT::i64, TmpReg, !IsSigned))
+      return false;
+    SrcVT = MVT::i64;
+    SrcReg = TmpReg;
+  }
+
+  // Move the integer value to an FPR.
+  unsigned FPReg = PPCMoveToFPReg(SrcVT, SrcReg, IsSigned);
+  if (FPReg == 0)
+    return false;
+
+  // Determine the opcode for the conversion.
+  const TargetRegisterClass *RC = &PPC::F8RCRegClass;
+  unsigned DestReg = createResultReg(RC);
+  unsigned Opc;
+
+  if (DstVT == MVT::f32)
+    Opc = IsSigned ? PPC::FCFIDS : PPC::FCFIDUS;
+  else
+    Opc = IsSigned ? PPC::FCFID : PPC::FCFIDU;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+    .addReg(FPReg);
+
+  UpdateValueMap(I, DestReg);
+  return true;
+}
+
+// Move the floating-point value in SrcReg into an integer destination
+// register, and return the register (or zero if we can't handle it).
+// FIXME: When direct register moves are implemented (see PowerISA 2.08),
+// those should be used instead of moving via a stack slot when the
+// subtarget permits.
+unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
+                                      unsigned SrcReg, bool IsSigned) {
+  // Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
+  // Note that if have STFIWX available, we could use a 4-byte stack
+  // slot for i32, but this being fast-isel we'll just go with the
+  // easiest code gen possible.
+  Address Addr;
+  Addr.BaseType = Address::FrameIndexBase;
+  Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+
+  // Store the value from the FPR.
+  if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
+    return 0;
+
+  // Reload it into a GPR.  If we want an i32, modify the address
+  // to have a 4-byte offset so we load from the right place.
+  if (VT == MVT::i32)
+    Addr.Offset = 4;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+
+  unsigned ResultReg = 0;
+  if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
+    return 0;
+
+  return ResultReg;
+}
+
+// Attempt to fast-select a floating-point-to-integer conversion.
+bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
+  MVT DstVT, SrcVT;
+  Type *DstTy = I->getType();
+  if (!isTypeLegal(DstTy, DstVT))
+    return false;
+
+  if (DstVT != MVT::i32 && DstVT != MVT::i64)
+    return false;
+
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+  if (!isTypeLegal(SrcTy, SrcVT))
+    return false;
+
+  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (SrcReg == 0)
+    return false;
+
+  // Convert f32 to f64 if necessary.  This is just a meaningless copy
+  // to get the register class right.  COPY_TO_REGCLASS is needed since
+  // a COPY from F4RC to F8RC is converted to a F4RC-F4RC copy downstream.
+  const TargetRegisterClass *InRC = MRI.getRegClass(SrcReg);
+  if (InRC == &PPC::F4RCRegClass) {
+    unsigned TmpReg = createResultReg(&PPC::F8RCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(TargetOpcode::COPY_TO_REGCLASS), TmpReg)
+      .addReg(SrcReg).addImm(PPC::F8RCRegClassID);
+    SrcReg = TmpReg;
+  }
+
+  // Determine the opcode for the conversion, which takes place
+  // entirely within FPRs.
+  unsigned DestReg = createResultReg(&PPC::F8RCRegClass);
+  unsigned Opc;
+
+  if (DstVT == MVT::i32)
+    if (IsSigned)
+      Opc = PPC::FCTIWZ;
+    else
+      Opc = PPCSubTarget.hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+  else
+    Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
+
+  // Generate the convert.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+    .addReg(SrcReg);
+
+  // Now move the integer value from a float register to an integer register.
+  unsigned IntReg = PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+  if (IntReg == 0)
+    return false;
+
+  UpdateValueMap(I, IntReg);
+  return true;
+}
+
+// Attempt to fast-select a binary integer operation that isn't already
+// handled automatically.
+bool PPCFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) {
+  EVT DestVT  = TLI.getValueType(I->getType(), true);
+
+  // We can get here in the case when we have a binary operation on a non-legal
+  // type and the target independent selector doesn't know how to handle it.
+  if (DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  // Look at the currently assigned register for this instruction
+  // to determine the required register class.  If there is no register,
+  // make a conservative choice (don't assign R0).
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     &PPC::GPRC_and_GPRC_NOR0RegClass);
+  bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  unsigned Opc;
+  switch (ISDOpcode) {
+    default: return false;
+    case ISD::ADD:
+      Opc = IsGPRC ? PPC::ADD4 : PPC::ADD8;
+      break;
+    case ISD::OR:
+      Opc = IsGPRC ? PPC::OR : PPC::OR8;
+      break;
+    case ISD::SUB:
+      Opc = IsGPRC ? PPC::SUBF : PPC::SUBF8;
+      break;
+  }
+
+  unsigned ResultReg = createResultReg(RC ? RC : &PPC::G8RCRegClass);
+  unsigned SrcReg1 = getRegForValue(I->getOperand(0));
+  if (SrcReg1 == 0) return false;
+
+  // Handle case of small immediate operand.
+  if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+    const APInt &CIVal = ConstInt->getValue();
+    int Imm = (int)CIVal.getSExtValue();
+    bool UseImm = true;
+    if (isInt<16>(Imm)) {
+      switch (Opc) {
+        default:
+          llvm_unreachable("Missing case!");
+        case PPC::ADD4:
+          Opc = PPC::ADDI;
+          MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+          break;
+        case PPC::ADD8:
+          Opc = PPC::ADDI8;
+          MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+          break;
+        case PPC::OR:
+          Opc = PPC::ORI;
+          break;
+        case PPC::OR8:
+          Opc = PPC::ORI8;
+          break;
+        case PPC::SUBF:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI;
+            MRI.setRegClass(SrcReg1, &PPC::GPRC_and_GPRC_NOR0RegClass);
+            Imm = -Imm;
+          }
+          break;
+        case PPC::SUBF8:
+          if (Imm == -32768)
+            UseImm = false;
+          else {
+            Opc = PPC::ADDI8;
+            MRI.setRegClass(SrcReg1, &PPC::G8RC_and_G8RC_NOX0RegClass);
+            Imm = -Imm;
+          }
+          break;
+      }
+
+      if (UseImm) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+          .addReg(SrcReg1).addImm(Imm);
+        UpdateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  // Reg-reg case.
+  unsigned SrcReg2 = getRegForValue(I->getOperand(1));
+  if (SrcReg2 == 0) return false;
+
+  // Reverse operands for subtract-from.
+  if (ISDOpcode == ISD::SUB)
+    std::swap(SrcReg1, SrcReg2);
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg)
+    .addReg(SrcReg1).addReg(SrcReg2);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Handle arguments to a call that we're attempting to fast-select.
+// Return false if the arguments are too complex for us at the moment.
+bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
+                                  SmallVectorImpl<unsigned> &ArgRegs,
+                                  SmallVectorImpl<MVT> &ArgVTs,
+                                  SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+                                  SmallVectorImpl<unsigned> &RegArgs,
+                                  CallingConv::ID CC,
+                                  unsigned &NumBytes,
+                                  bool IsVarArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, ArgLocs, *Context);
+  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
+
+  // Bail out if we can't handle any of the arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Skip vector arguments for now, as well as long double and
+    // uint128_t, and anything that isn't passed in a register.
+    if (ArgVT.isVector() || ArgVT.getSizeInBits() > 64 ||
+        !VA.isRegLoc() || VA.needsCustom())
+      return false;
+
+    // Skip bit-converted arguments for now.
+    if (VA.getLocInfo() == CCValAssign::BCvt)
+      return false;
+  }
+
+  // Get a count of how many bytes are to be pushed onto the stack.
+  NumBytes = CCInfo.getNextStackOffset();
+
+  // Issue CALLSEQ_START.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameSetupOpcode()))
+    .addImm(NumBytes);
+
+  // Prepare to assign register arguments.  Every argument uses up a
+  // GPR protocol register even if it's passed in a floating-point
+  // register.
+  unsigned NextGPR = PPC::X3;
+  unsigned NextFPR = PPC::F1;
+
+  // Process arguments.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    unsigned Arg = ArgRegs[VA.getValNo()];
+    MVT ArgVT = ArgVTs[VA.getValNo()];
+
+    // Handle argument promotion and bitcasts.
+    switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::SExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/false))
+          llvm_unreachable("Failed to emit a sext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::AExt:
+      case CCValAssign::ZExt: {
+        MVT DestVT = VA.getLocVT();
+        const TargetRegisterClass *RC =
+          (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+        unsigned TmpReg = createResultReg(RC);
+        if (!PPCEmitIntExt(ArgVT, Arg, DestVT, TmpReg, /*IsZExt*/true))
+          llvm_unreachable("Failed to emit a zext!");
+        ArgVT = DestVT;
+        Arg = TmpReg;
+        break;
+      }
+      case CCValAssign::BCvt: {
+        // FIXME: Not yet handled.
+        llvm_unreachable("Should have bailed before getting here!");
+        break;
+      }
+    }
+
+    // Copy this argument to the appropriate register.
+    unsigned ArgReg;
+    if (ArgVT == MVT::f32 || ArgVT == MVT::f64) {
+      ArgReg = NextFPR++;
+      ++NextGPR;
+    } else
+      ArgReg = NextGPR++;
+      
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ArgReg).addReg(Arg);
+    RegArgs.push_back(ArgReg);
+  }
+
+  return true;
+}
+
+// For a call that we've determined we can fast-select, finish the
+// call sequence and generate a copy to obtain the return value (if any).
+void PPCFastISel::finishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                             const Instruction *I, CallingConv::ID CC,
+                             unsigned &NumBytes, bool IsVarArg) {
+  // Issue CallSEQ_END.
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+          TII.get(TII.getCallFrameDestroyOpcode()))
+    .addImm(NumBytes).addImm(0);
+
+  // Next, generate a copy to obtain the return value.
+  // FIXME: No multi-register return values yet, though I don't foresee
+  // any real difficulties there.
+  if (RetVT != MVT::isVoid) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    CCValAssign &VA = RVLocs[0];
+    assert(RVLocs.size() == 1 && "No support for multi-reg return values!");
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    MVT DestVT = VA.getValVT();
+    MVT CopyVT = DestVT;
+
+    // Ints smaller than a register still arrive in a full 64-bit
+    // register, so make sure we recognize this.
+    if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32)
+      CopyVT = MVT::i64;
+
+    unsigned SourcePhysReg = VA.getLocReg();
+    unsigned ResultReg = 0;
+
+    if (RetVT == CopyVT) {
+      const TargetRegisterClass *CpyRC = TLI.getRegClassFor(CopyVT);
+      ResultReg = createResultReg(CpyRC);
+
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+
+    // If necessary, round the floating result to single precision.
+    } else if (CopyVT == MVT::f64) {
+      ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::FRSP),
+              ResultReg).addReg(SourcePhysReg);
+
+    // If only the low half of a general register is needed, generate
+    // a GPRC copy instead of a G8RC copy.  (EXTRACT_SUBREG can't be
+    // used along the fast-isel path (not lowered), and downstream logic
+    // also doesn't like a direct subreg copy on a physical reg.)
+    } else if (RetVT == MVT::i8 || RetVT == MVT::i16 || RetVT == MVT::i32) {
+      ResultReg = createResultReg(&PPC::GPRCRegClass);
+      // Convert physical register from G8RC to GPRC.
+      SourcePhysReg -= PPC::X0 - PPC::R0;
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+              TII.get(TargetOpcode::COPY), ResultReg)
+        .addReg(SourcePhysReg);
+    }
+
+    assert(ResultReg && "ResultReg unset!");
+    UsedRegs.push_back(SourcePhysReg);
+    UpdateValueMap(I, ResultReg);
+  }
+}
+
+// Attempt to fast-select a call instruction.
+bool PPCFastISel::SelectCall(const Instruction *I) {
+  const CallInst *CI = cast<CallInst>(I);
+  const Value *Callee = CI->getCalledValue();
+
+  // Can't handle inline asm.
+  if (isa<InlineAsm>(Callee))
+    return false;
+
+  // Allow SelectionDAG isel to handle tail calls.
+  if (CI->isTailCall())
+    return false;
+
+  // Obtain calling convention.
+  ImmutableCallSite CS(CI);
+  CallingConv::ID CC = CS.getCallingConv();
+
+  PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
+  FunctionType *FTy = cast<FunctionType>(PT->getElementType());
+  bool IsVarArg = FTy->isVarArg();
+
+  // Not ready for varargs yet.
+  if (IsVarArg)
+    return false;
+
+  // Handle simple calls for now, with legal return types and
+  // those that can be extended.
+  Type *RetTy = I->getType();
+  MVT RetVT;
+  if (RetTy->isVoidTy())
+    RetVT = MVT::isVoid;
+  else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
+           RetVT != MVT::i8)
+    return false;
+
+  // FIXME: No multi-register return values yet.
+  if (RetVT != MVT::isVoid && RetVT != MVT::i8 && RetVT != MVT::i16 &&
+      RetVT != MVT::i32 && RetVT != MVT::i64 && RetVT != MVT::f32 &&
+      RetVT != MVT::f64) {
+    SmallVector<CCValAssign, 16> RVLocs;
+    CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, TM, RVLocs, *Context);
+    CCInfo.AnalyzeCallResult(RetVT, RetCC_PPC64_ELF_FIS);
+    if (RVLocs.size() > 1)
+      return false;
+  }
+
+  // Bail early if more than 8 arguments, as we only currently
+  // handle arguments passed in registers.
+  unsigned NumArgs = CS.arg_size();
+  if (NumArgs > 8)
+    return false;
+
+  // Set up the argument vectors.
+  SmallVector<Value*, 8> Args;
+  SmallVector<unsigned, 8> ArgRegs;
+  SmallVector<MVT, 8> ArgVTs;
+  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
+
+  Args.reserve(NumArgs);
+  ArgRegs.reserve(NumArgs);
+  ArgVTs.reserve(NumArgs);
+  ArgFlags.reserve(NumArgs);
+
+  for (ImmutableCallSite::arg_iterator II = CS.arg_begin(), IE = CS.arg_end();
+       II != IE; ++II) {
+    // FIXME: ARM does something for intrinsic calls here, check into that.
+
+    unsigned AttrIdx = II - CS.arg_begin() + 1;
+    
+    // Only handle easy calls for now.  It would be reasonably easy
+    // to handle <= 8-byte structures passed ByVal in registers, but we
+    // have to ensure they are right-justified in the register.
+    if (CS.paramHasAttr(AttrIdx, Attribute::InReg) ||
+        CS.paramHasAttr(AttrIdx, Attribute::StructRet) ||
+        CS.paramHasAttr(AttrIdx, Attribute::Nest) ||
+        CS.paramHasAttr(AttrIdx, Attribute::ByVal))
+      return false;
+
+    ISD::ArgFlagsTy Flags;
+    if (CS.paramHasAttr(AttrIdx, Attribute::SExt))
+      Flags.setSExt();
+    if (CS.paramHasAttr(AttrIdx, Attribute::ZExt))
+      Flags.setZExt();
+
+    Type *ArgTy = (*II)->getType();
+    MVT ArgVT;
+    if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8)
+      return false;
+
+    if (ArgVT.isVector())
+      return false;
+
+    unsigned Arg = getRegForValue(*II);
+    if (Arg == 0)
+      return false;
+
+    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
+    Flags.setOrigAlign(OriginalAlignment);
+
+    Args.push_back(*II);
+    ArgRegs.push_back(Arg);
+    ArgVTs.push_back(ArgVT);
+    ArgFlags.push_back(Flags);
+  }
+
+  // Process the arguments.
+  SmallVector<unsigned, 8> RegArgs;
+  unsigned NumBytes;
+
+  if (!processCallArgs(Args, ArgRegs, ArgVTs, ArgFlags,
+                       RegArgs, CC, NumBytes, IsVarArg))
+    return false;
+
+  // FIXME: No handling for function pointers yet.  This requires
+  // implementing the function descriptor (OPD) setup.
+  const GlobalValue *GV = dyn_cast<GlobalValue>(Callee);
+  if (!GV)
+    return false;
+
+  // Build direct call with NOP for TOC restore.
+  // FIXME: We can and should optimize away the NOP for local calls.
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(PPC::BL8_NOP));
+  // Add callee.
+  MIB.addGlobalAddress(GV);
+
+  // Add implicit physical register uses to the call.
+  for (unsigned II = 0, IE = RegArgs.size(); II != IE; ++II)
+    MIB.addReg(RegArgs[II], RegState::Implicit);
+
+  // Add a register mask with the call-preserved registers.  Proper
+  // defs for return values will be added by setPhysRegsDeadExcept().
+  MIB.addRegMask(TRI.getCallPreservedMask(CC));
+
+  // Finish off the call including any return values.
+  SmallVector<unsigned, 4> UsedRegs;
+  finishCall(RetVT, UsedRegs, I, CC, NumBytes, IsVarArg);
+
+  // Set all unused physregs defs as dead.
+  static_cast<MachineInstr *>(MIB)->setPhysRegsDeadExcept(UsedRegs, TRI);
+
+  return true;
+}
+
+// Attempt to fast-select a return instruction.
+bool PPCFastISel::SelectRet(const Instruction *I) {
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+  const Function &F = *I->getParent()->getParent();
+
+  // Build a list of return value registers.
+  SmallVector<unsigned, 4> RetRegs;
+  CallingConv::ID CC = F.getCallingConv();
+
+  if (Ret->getNumOperands() > 0) {
+    SmallVector<ISD::OutputArg, 4> Outs;
+    GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
+
+    // Analyze operands of the call, assigning locations to each operand.
+    SmallVector<CCValAssign, 16> ValLocs;
+    CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs, *Context);
+    CCInfo.AnalyzeReturn(Outs, RetCC_PPC64_ELF_FIS);
+    const Value *RV = Ret->getOperand(0);
+    
+    // FIXME: Only one output register for now.
+    if (ValLocs.size() > 1)
+      return false;
+
+    // Special case for returning a constant integer of any size.
+    // Materialize the constant as an i64 and copy it to the return
+    // register.  This avoids an unnecessary extend or truncate.
+    if (isa<ConstantInt>(*RV)) {
+      const Constant *C = cast<Constant>(RV);
+      unsigned SrcReg = PPCMaterializeInt(C, MVT::i64);
+      unsigned RetReg = ValLocs[0].getLocReg();
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+              RetReg).addReg(SrcReg);
+      RetRegs.push_back(RetReg);
+
+    } else {
+      unsigned Reg = getRegForValue(RV);
+
+      if (Reg == 0)
+        return false;
+
+      // Copy the result values into the output registers.
+      for (unsigned i = 0; i < ValLocs.size(); ++i) {
+
+        CCValAssign &VA = ValLocs[i];
+        assert(VA.isRegLoc() && "Can only return in registers!");
+        RetRegs.push_back(VA.getLocReg());
+        unsigned SrcReg = Reg + VA.getValNo();
+
+        EVT RVEVT = TLI.getValueType(RV->getType());
+        if (!RVEVT.isSimple())
+          return false;
+        MVT RVVT = RVEVT.getSimpleVT();
+        MVT DestVT = VA.getLocVT();
+
+        if (RVVT != DestVT && RVVT != MVT::i8 &&
+            RVVT != MVT::i16 && RVVT != MVT::i32)
+          return false;
+      
+        if (RVVT != DestVT) {
+          switch (VA.getLocInfo()) {
+            default:
+              llvm_unreachable("Unknown loc info!");
+            case CCValAssign::Full:
+              llvm_unreachable("Full value assign but types don't match?");
+            case CCValAssign::AExt:
+            case CCValAssign::ZExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, true))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+            case CCValAssign::SExt: {
+              const TargetRegisterClass *RC =
+                (DestVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+              unsigned TmpReg = createResultReg(RC);
+              if (!PPCEmitIntExt(RVVT, SrcReg, DestVT, TmpReg, false))
+                return false;
+              SrcReg = TmpReg;
+              break;
+            }
+          }
+        }
+
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(TargetOpcode::COPY), RetRegs[i])
+          .addReg(SrcReg);
+      }
+    }
+  }
+
+  MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                                    TII.get(PPC::BLR));
+
+  for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+    MIB.addReg(RetRegs[i], RegState::Implicit);
+
+  return true;
+}
+
+// Attempt to emit an integer extend of SrcReg into DestReg.  Both
+// signed and zero extensions are supported.  Return false if we
+// can't handle it.
+bool PPCFastISel::PPCEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                unsigned DestReg, bool IsZExt) {
+  if (DestVT != MVT::i32 && DestVT != MVT::i64)
+    return false;
+  if (SrcVT != MVT::i8 && SrcVT != MVT::i16 && SrcVT != MVT::i32)
+    return false;
+
+  // Signed extensions use EXTSB, EXTSH, EXTSW.
+  if (!IsZExt) {
+    unsigned Opc;
+    if (SrcVT == MVT::i8)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSB : PPC::EXTSB8_32_64;
+    else if (SrcVT == MVT::i16)
+      Opc = (DestVT == MVT::i32) ? PPC::EXTSH : PPC::EXTSH8_32_64;
+    else {
+      assert(DestVT == MVT::i64 && "Signed extend from i32 to i32??");
+      Opc = PPC::EXTSW_32_64;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+      .addReg(SrcReg);
+
+  // Unsigned 32-bit extensions use RLWINM.
+  } else if (DestVT == MVT::i32) {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 24;
+    else {
+      assert(SrcVT == MVT::i16 && "Unsigned extend from i32 to i32??");
+      MB = 16;
+    }
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::RLWINM),
+            DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB).addImm(/*ME=*/31);
+
+  // Unsigned 64-bit extensions use RLDICL (with a 32-bit source).
+  } else {
+    unsigned MB;
+    if (SrcVT == MVT::i8)
+      MB = 56;
+    else if (SrcVT == MVT::i16)
+      MB = 48;
+    else
+      MB = 32;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(PPC::RLDICL_32_64), DestReg)
+      .addReg(SrcReg).addImm(/*SH=*/0).addImm(MB);
+  }
+
+  return true;
+}
+
+// Attempt to fast-select an indirect branch instruction.
+bool PPCFastISel::SelectIndirectBr(const Instruction *I) {
+  unsigned AddrReg = getRegForValue(I->getOperand(0));
+  if (AddrReg == 0)
+    return false;
+
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::MTCTR8))
+    .addReg(AddrReg);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::BCTR8));
+
+  const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+  for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
+    FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+
+  return true;
+}
+
+// Attempt to fast-select an integer truncate instruction.
+bool PPCFastISel::SelectTrunc(const Instruction *I) {
+  Value *Src  = I->getOperand(0);
+  EVT SrcVT  = TLI.getValueType(Src->getType(), true);
+  EVT DestVT = TLI.getValueType(I->getType(), true);
+
+  if (SrcVT != MVT::i64 && SrcVT != MVT::i32 && SrcVT != MVT::i16)
+    return false;
+
+  if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8)
+    return false;
+
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg)
+    return false;
+
+  // The only interesting case is when we need to switch register classes.
+  if (SrcVT == MVT::i64) {
+    unsigned ResultReg = createResultReg(&PPC::GPRCRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+            ResultReg).addReg(SrcReg, 0, PPC::sub_32);
+    SrcReg = ResultReg;
+  }
+
+  UpdateValueMap(I, SrcReg);
+  return true;
+}
+
+// Attempt to fast-select an integer extend instruction.
+bool PPCFastISel::SelectIntExt(const Instruction *I) {
+  Type *DestTy = I->getType();
+  Value *Src = I->getOperand(0);
+  Type *SrcTy = Src->getType();
+
+  bool IsZExt = isa<ZExtInst>(I);
+  unsigned SrcReg = getRegForValue(Src);
+  if (!SrcReg) return false;
+
+  EVT SrcEVT, DestEVT;
+  SrcEVT = TLI.getValueType(SrcTy, true);
+  DestEVT = TLI.getValueType(DestTy, true);
+  if (!SrcEVT.isSimple())
+    return false;
+  if (!DestEVT.isSimple())
+    return false;
+
+  MVT SrcVT = SrcEVT.getSimpleVT();
+  MVT DestVT = DestEVT.getSimpleVT();
+
+  // If we know the register class needed for the result of this
+  // instruction, use it.  Otherwise pick the register class of the
+  // correct size that does not contain X0/R0, since we don't know
+  // whether downstream uses permit that assignment.
+  unsigned AssignedReg = FuncInfo.ValueMap[I];
+  const TargetRegisterClass *RC =
+    (AssignedReg ? MRI.getRegClass(AssignedReg) :
+     (DestVT == MVT::i64 ? &PPC::G8RC_and_G8RC_NOX0RegClass :
+      &PPC::GPRC_and_GPRC_NOR0RegClass));
+  unsigned ResultReg = createResultReg(RC);
+
+  if (!PPCEmitIntExt(SrcVT, SrcReg, DestVT, ResultReg, IsZExt))
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+// Attempt to fast-select an instruction that wasn't handled by
+// the table-generated machinery.
+bool PPCFastISel::TargetSelectInstruction(const Instruction *I) {
+
+  switch (I->getOpcode()) {
+    case Instruction::Load:
+      return SelectLoad(I);
+    case Instruction::Store:
+      return SelectStore(I);
+    case Instruction::Br:
+      return SelectBranch(I);
+    case Instruction::IndirectBr:
+      return SelectIndirectBr(I);
+    case Instruction::FPExt:
+      return SelectFPExt(I);
+    case Instruction::FPTrunc:
+      return SelectFPTrunc(I);
+    case Instruction::SIToFP:
+      return SelectIToFP(I, /*IsSigned*/ true);
+    case Instruction::UIToFP:
+      return SelectIToFP(I, /*IsSigned*/ false);
+    case Instruction::FPToSI:
+      return SelectFPToI(I, /*IsSigned*/ true);
+    case Instruction::FPToUI:
+      return SelectFPToI(I, /*IsSigned*/ false);
+    case Instruction::Add:
+      return SelectBinaryIntOp(I, ISD::ADD);
+    case Instruction::Or:
+      return SelectBinaryIntOp(I, ISD::OR);
+    case Instruction::Sub:
+      return SelectBinaryIntOp(I, ISD::SUB);
+    case Instruction::Call:
+      if (dyn_cast<IntrinsicInst>(I))
+        return false;
+      return SelectCall(I);
+    case Instruction::Ret:
+      return SelectRet(I);
+    case Instruction::Trunc:
+      return SelectTrunc(I);
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      return SelectIntExt(I);
+    // Here add other flavors of Instruction::XXX that automated
+    // cases don't catch.  For example, switches are terminators
+    // that aren't yet handled.
+    default:
+      break;
+  }
+  return false;
+}
+
+// Materialize a floating-point constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
+  // No plans to handle long double here.
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
+  // All FP constants are loaded from the constant pool.
+  unsigned Align = TD.getPrefTypeAlignment(CFP->getType());
+  assert(Align > 0 && "Unexpectedly missing alignment information!");
+  unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+  unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  MachineMemOperand *MMO =
+    FuncInfo.MF->getMachineMemOperand(
+      MachinePointerInfo::getConstantPool(), MachineMemOperand::MOLoad,
+      (VT == MVT::f32) ? 4 : 8, Align);
+
+  unsigned Opc = (VT == MVT::f32) ? PPC::LFS : PPC::LFD;
+  unsigned TmpReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  // For small code model, generate a LF[SD](0, LDtocCPT(Idx, X2)).
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocCPT),
+            TmpReg)
+      .addConstantPoolIndex(Idx).addReg(PPC::X2);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+      .addImm(0).addReg(TmpReg).addMemOperand(MMO);
+  } else {
+    // Otherwise we generate LF[SD](Idx[lo], ADDIStocHA(X2, Idx)).
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
+            TmpReg).addReg(PPC::X2).addConstantPoolIndex(Idx);
+    // But for large code model, we must generate a LDtocL followed
+    // by the LF[SD].
+    if (CModel == CodeModel::Large) {
+      unsigned TmpReg2 = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+              TmpReg2).addConstantPoolIndex(Idx).addReg(TmpReg);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+        .addImm(0).addReg(TmpReg2);
+    } else 
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), DestReg)
+        .addConstantPoolIndex(Idx, 0, PPCII::MO_TOC_LO)
+        .addReg(TmpReg)
+        .addMemOperand(MMO);
+  }
+
+  return DestReg;
+}
+
+// Materialize the address of a global value into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
+  assert(VT == MVT::i64 && "Non-address!");
+  const TargetRegisterClass *RC = &PPC::G8RC_and_G8RC_NOX0RegClass;
+  unsigned DestReg = createResultReg(RC);
+
+  // Global values may be plain old object addresses, TLS object
+  // addresses, constant pool entries, or jump tables.  How we generate
+  // code for these may depend on small, medium, or large code model.
+  CodeModel::Model CModel = TM.getCodeModel();
+
+  // FIXME: Jump tables are not yet required because fast-isel doesn't
+  // handle switches; if that changes, we need them as well.  For now,
+  // what follows assumes everything's a generic (or TLS) global address.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar) {
+    // If GV is an alias, use the aliasee for determining thread-locality.
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false));
+  }
+
+  // FIXME: We don't yet handle the complexity of TLS.
+  bool IsTLS = GVar && GVar->isThreadLocal();
+  if (IsTLS)
+    return 0;
+
+  // For small code model, generate a simple TOC load.
+  if (CModel == CodeModel::Small || CModel == CodeModel::JITDefault)
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtoc), DestReg)
+      .addGlobalAddress(GV).addReg(PPC::X2);
+  else {
+    // If the address is an externally defined symbol, a symbol with
+    // common or externally available linkage, a function address, or a
+    // jump table address (not yet needed), or if we are generating code
+    // for large code model, we generate:
+    //       LDtocL(GV, ADDIStocHA(%X2, GV))
+    // Otherwise we generate:
+    //       ADDItocL(ADDIStocHA(%X2, GV), GV)
+    // Either way, start with the ADDIStocHA:
+    unsigned HighPartReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDIStocHA),
+            HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
+
+    // !GVar implies a function address.  An external variable is one
+    // without an initializer.
+    // If/when switches are implemented, jump tables should be handled
+    // on the "if" path here.
+    if (CModel == CodeModel::Large || !GVar || !GVar->hasInitializer() ||
+        GVar->hasCommonLinkage() || GVar->hasAvailableExternallyLinkage())
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::LDtocL),
+              DestReg).addGlobalAddress(GV).addReg(HighPartReg);
+    else
+      // Otherwise generate the ADDItocL.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDItocL),
+              DestReg).addReg(HighPartReg).addGlobalAddress(GV);
+  }
+
+  return DestReg;
+}
+
+// Materialize a 32-bit integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterialize32BitInt(int64_t Imm,
+                                             const TargetRegisterClass *RC) {
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+
+  unsigned ResultReg = createResultReg(RC);
+  bool IsGPRC = RC->hasSuperClassEq(&PPC::GPRCRegClass);
+
+  if (isInt<16>(Imm))
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(IsGPRC ? PPC::LI : PPC::LI8), ResultReg)
+      .addImm(Imm);
+  else if (Lo) {
+    // Both Lo and Hi have nonzero bits.
+    unsigned TmpReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), TmpReg)
+      .addImm(Hi);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(IsGPRC ? PPC::ORI : PPC::ORI8), ResultReg)
+      .addReg(TmpReg).addImm(Lo);
+  } else
+    // Just Hi bits.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(IsGPRC ? PPC::LIS : PPC::LIS8), ResultReg)
+      .addImm(Hi);
+  
+  return ResultReg;
+}
+
+// Materialize a 64-bit integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
+                                             const TargetRegisterClass *RC) {
+  unsigned Remainder = 0;
+  unsigned Shift = 0;
+
+  // If the value doesn't fit in 32 bits, see if we can shift it
+  // so that it fits in 32 bits.
+  if (!isInt<32>(Imm)) {
+    Shift = countTrailingZeros<uint64_t>(Imm);
+    int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+
+    if (isInt<32>(ImmSh))
+      Imm = ImmSh;
+    else {
+      Remainder = Imm;
+      Shift = 32;
+      Imm >>= 32;
+    }
+  }
+
+  // Handle the high-order 32 bits (if shifted) or the whole 32 bits
+  // (if not shifted).
+  unsigned TmpReg1 = PPCMaterialize32BitInt(Imm, RC);
+  if (!Shift)
+    return TmpReg1;
+
+  // If upper 32 bits were not zero, we've built them and need to shift
+  // them into place.
+  unsigned TmpReg2;
+  if (Imm) {
+    TmpReg2 = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::RLDICR),
+            TmpReg2).addReg(TmpReg1).addImm(Shift).addImm(63 - Shift);
+  } else
+    TmpReg2 = TmpReg1;
+
+  unsigned TmpReg3, Hi, Lo;
+  if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+    TmpReg3 = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ORIS8),
+            TmpReg3).addReg(TmpReg2).addImm(Hi);
+  } else
+    TmpReg3 = TmpReg2;
+
+  if ((Lo = Remainder & 0xFFFF)) {
+    unsigned ResultReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ORI8),
+            ResultReg).addReg(TmpReg3).addImm(Lo);
+    return ResultReg;
+  }
+
+  return TmpReg3;
+}
+
+
+// Materialize an integer constant into a register, and return
+// the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) {
+
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
+      VT != MVT::i8 && VT != MVT::i1) 
+    return 0;
+
+  const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+                                   &PPC::GPRCRegClass);
+
+  // If the constant is in range, use a load-immediate.
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  if (isInt<16>(CI->getSExtValue())) {
+    unsigned Opc = (VT == MVT::i64) ? PPC::LI8 : PPC::LI;
+    unsigned ImmReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ImmReg)
+      .addImm(CI->getSExtValue());
+    return ImmReg;
+  }
+
+  // Construct the constant piecewise.
+  int64_t Imm = CI->getZExtValue();
+
+  if (VT == MVT::i64)
+    return PPCMaterialize64BitInt(Imm, RC);
+  else if (VT == MVT::i32)
+    return PPCMaterialize32BitInt(Imm, RC);
+
+  return 0;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned PPCFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple()) return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return PPCMaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return PPCMaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return PPCMaterializeInt(C, VT);
+
+  return 0;
+}
+
+// Materialize the address created by an alloca into a register, and
+// return the register number (or zero if we failed to handle it).
+unsigned PPCFastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+  // Don't handle dynamic allocas.
+  if (!FuncInfo.StaticAllocaMap.count(AI)) return 0;
+
+  MVT VT;
+  if (!isLoadTypeLegal(AI->getType(), VT)) return 0;
+
+  DenseMap<const AllocaInst*, int>::iterator SI =
+    FuncInfo.StaticAllocaMap.find(AI);
+
+  if (SI != FuncInfo.StaticAllocaMap.end()) {
+    unsigned ResultReg = createResultReg(&PPC::G8RC_and_G8RC_NOX0RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(PPC::ADDI8),
+            ResultReg).addFrameIndex(SI->second).addImm(0);
+    return ResultReg;
+  }
+
+  return 0;
+}
+
+// Fold loads into extends when possible.
+// FIXME: We can have multiple redundant extend/trunc instructions
+// following a load.  The folding only picks up one.  Extend this
+// to check subsequent instructions for the same pattern and remove
+// them.  Thus ResultReg should be the def reg for the last redundant
+// instruction in a chain, and all intervening instructions can be
+// removed from parent.  Change test/CodeGen/PowerPC/fast-isel-fold.ll
+// to add ELF64-NOT: rldicl to the appropriate tests when this works.
+bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                                      const LoadInst *LI) {
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(LI->getType(), VT))
+    return false;
+
+  // Combine load followed by zero- or sign-extend.
+  bool IsZExt = false;
+  switch(MI->getOpcode()) {
+    default:
+      return false;
+
+    case PPC::RLDICL:
+    case PPC::RLDICL_32_64: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 56) ||
+          (VT == MVT::i16 && MB <= 48) ||
+          (VT == MVT::i32 && MB <= 32))
+        break;
+      return false;
+    }
+
+    case PPC::RLWINM:
+    case PPC::RLWINM8: {
+      IsZExt = true;
+      unsigned MB = MI->getOperand(3).getImm();
+      if ((VT == MVT::i8 && MB <= 24) ||
+          (VT == MVT::i16 && MB <= 16))
+        break;
+      return false;
+    }
+
+    case PPC::EXTSB:
+    case PPC::EXTSB8:
+    case PPC::EXTSB8_32_64:
+      /* There is no sign-extending load-byte instruction. */
+      return false;
+
+    case PPC::EXTSH:
+    case PPC::EXTSH8:
+    case PPC::EXTSH8_32_64: {
+      if (VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+
+    case PPC::EXTSW:
+    case PPC::EXTSW_32_64: {
+      if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8)
+        return false;
+      break;
+    }
+  }
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!PPCComputeAddress(LI->getOperand(0), Addr))
+    return false;
+
+  unsigned ResultReg = MI->getOperand(0).getReg();
+
+  if (!PPCEmitLoad(VT, ResultReg, Addr, 0, IsZExt))
+    return false;
+
+  MI->eraseFromParent();
+  return true;
+}
+
+// Attempt to lower call arguments in a faster way than done by
+// the selection DAG code.
+bool PPCFastISel::FastLowerArguments() {
+  // Defer to normal argument lowering for now.  It's reasonably
+  // efficient.  Consider doing something like ARM to handle the
+  // case where all args fit in registers, no varargs, no float
+  // or vector args.
+  return false;
+}
+
+// Handle materializing integer constants into a register.  This is not
+// automatically generated for PowerPC, so must be explicitly created here.
+unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
+  
+  if (Opc != ISD::Constant)
+    return 0;
+
+  if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16 &&
+      VT != MVT::i8 && VT != MVT::i1) 
+    return 0;
+
+  const TargetRegisterClass *RC = ((VT == MVT::i64) ? &PPC::G8RCRegClass :
+                                   &PPC::GPRCRegClass);
+  if (VT == MVT::i64)
+    return PPCMaterialize64BitInt(Imm, RC);
+  else
+    return PPCMaterialize32BitInt(Imm, RC);
+}
+
+// Override for ADDI and ADDI8 to set the correct register class
+// on RHS operand 0.  The automatic infrastructure naively assumes
+// GPRC for i32 and G8RC for i64; the concept of "no R0" is lost
+// for these cases.  At the moment, none of the other automatically
+// generated RI instructions require special treatment.  However, once
+// SelectSelect is implemented, "isel" requires similar handling.
+//
+// Also be conservative about the output register class.  Avoid
+// assigning R0 or X0 to the output register for GPRC and G8RC
+// register classes, as any such result could be used in ADDI, etc.,
+// where those regs have another meaning.
+unsigned PPCFastISel::FastEmitInst_ri(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass *RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      uint64_t Imm) {
+  if (MachineInstOpcode == PPC::ADDI)
+    MRI.setRegClass(Op0, &PPC::GPRC_and_GPRC_NOR0RegClass);
+  else if (MachineInstOpcode == PPC::ADDI8)
+    MRI.setRegClass(Op0, &PPC::G8RC_and_G8RC_NOX0RegClass);
+
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_ri(MachineInstOpcode, UseRC,
+                                   Op0, Op0IsKill, Imm);
+}
+
+// Override for instructions with one register operand to avoid use of
+// R0/X0.  The automatic infrastructure isn't aware of the context so
+// we must be conservative.
+unsigned PPCFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
+                                     const TargetRegisterClass* RC,
+                                     unsigned Op0, bool Op0IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_r(MachineInstOpcode, UseRC, Op0, Op0IsKill);
+}
+
+// Override for instructions with two register operands to avoid use
+// of R0/X0.  The automatic infrastructure isn't aware of the context
+// so we must be conservative.
+unsigned PPCFastISel::FastEmitInst_rr(unsigned MachineInstOpcode,
+                                      const TargetRegisterClass* RC,
+                                      unsigned Op0, bool Op0IsKill,
+                                      unsigned Op1, bool Op1IsKill) {
+  const TargetRegisterClass *UseRC =
+    (RC == &PPC::GPRCRegClass ? &PPC::GPRC_and_GPRC_NOR0RegClass :
+     (RC == &PPC::G8RCRegClass ? &PPC::G8RC_and_G8RC_NOX0RegClass : RC));
+
+  return FastISel::FastEmitInst_rr(MachineInstOpcode, UseRC, Op0, Op0IsKill,
+                                   Op1, Op1IsKill);
+}
+
+namespace llvm {
+  // Create the fast instruction selector for PowerPC64 ELF.
+  FastISel *PPC::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                const TargetLibraryInfo *LibInfo) {
+    const TargetMachine &TM = FuncInfo.MF->getTarget();
+
+    // Only available on 64-bit ELF for now.
+    const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
+    if (Subtarget->isPPC64() && Subtarget->isSVR4ABI())
+      return new PPCFastISel(FuncInfo, LibInfo);
+
+    return 0;
+  }
+}
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index c845909..0ac2ced 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -26,17 +26,6 @@
 
 using namespace llvm;
 
-// FIXME This disables some code that aligns the stack to a boundary bigger than
-// the default (16 bytes on Darwin) when there is a stack local of greater
-// alignment.  This does not currently work, because the delta between old and
-// new stack pointers is added to offsets that reference incoming parameters
-// after the prolog is generated, and the code that does that doesn't handle a
-// variable delta.  You don't want to do that anyway; a better approach is to
-// reserve another register that retains to the incoming stack pointer, and
-// reference parameters relative to that.
-#define ALIGN_STACK 0
-
-
 /// VRRegNo - Map from a numbered VR register to its enum value.
 ///
 static const uint16_t VRRegNo[] = {
@@ -215,11 +204,13 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
   unsigned FrameSize =
     UseEstimate ? MFI->estimateStackSize(MF) : MFI->getStackSize();
 
-  // Get the alignments provided by the target, and the maximum alignment
-  // (if any) of the fixed frame objects.
-  unsigned MaxAlign = MFI->getMaxAlignment();
-  unsigned TargetAlign = getStackAlignment();
-  unsigned AlignMask = TargetAlign - 1;  //
+  // Get stack alignments. The frame must be aligned to the greatest of these:
+  unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
+  unsigned MaxAlign = MFI->getMaxAlignment(); // algmt required by data in frame
+  unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
+
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
 
   // If we are a leaf function, and use up to 224 bytes of stack space,
   // don't have a frame pointer, calls, or dynamic alloca then we do not need
@@ -235,7 +226,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
       FrameSize <= 224 &&                          // Fits in red zone.
       !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
       !MFI->adjustsStack() &&                      // No calls.
-      (!ALIGN_STACK || MaxAlign <= TargetAlign)) { // No special alignment.
+      !RegInfo->hasBasePointer(MF)) { // No special alignment.
     // No need for frame
     if (UpdateMF)
       MFI->setStackSize(0);
@@ -305,6 +296,12 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
   unsigned FPReg  = is31 ? PPC::R31 : PPC::R1;
   unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
 
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  bool HasBP = RegInfo->hasBasePointer(MF);
+  unsigned BPReg  = HasBP ? (unsigned) PPC::R30 : FPReg;
+  unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI)
     for (MachineBasicBlock::iterator MBBI = BI->end(); MBBI != BI->begin(); ) {
@@ -321,6 +318,13 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
         case PPC::FP8:
           MO.setReg(FP8Reg);
           break;
+        case PPC::BP:
+          MO.setReg(BPReg);
+          break;
+        case PPC::BP8:
+          MO.setReg(BP8Reg);
+          break;
+
         }
       }
     }
@@ -332,18 +336,29 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const PPCInstrInfo &TII =
     *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
 
   MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   DebugLoc dl;
   bool needsFrameMoves = MMI.hasDebugInfo() ||
     MF.getFunction()->needsUnwindTableEntry();
 
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get the ABI.
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+  assert((isDarwinABI || isSVR4ABI) &&
+         "Currently only Darwin and SVR4 ABIs are supported for PowerPC.");
+
   // Prepare for frame info.
   MCSymbol *FrameLabel = 0;
 
   // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
   // process it.
-  if (!Subtarget.isSVR4ABI())
+  if (!isSVR4ABI)
     for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
       if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
         HandleVRSaveUpdate(MBBI, TII);
@@ -357,26 +372,58 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
   // Work out frame sizes.
   unsigned FrameSize = determineFrameLayout(MF);
   int NegFrameSize = -FrameSize;
+  if (!isInt<32>(NegFrameSize))
+    llvm_unreachable("Unhandled stack size!");
 
   if (MFI->isFrameAddressTaken())
     replaceFPWithRealFP(MF);
 
-  // Get processor type.
-  bool isPPC64 = Subtarget.isPPC64();
-  // Get operating system
-  bool isDarwinABI = Subtarget.isDarwinABI();
   // Check if the link register (LR) must be saved.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
   bool MustSaveLR = FI->mustSaveLR();
-  const SmallVector<unsigned, 3> &MustSaveCRs = FI->getMustSaveCRs();
-  // Do we have a frame pointer for this function?
+  const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+  // Do we have a frame pointer and/or base pointer for this function?
   bool HasFP = hasFP(MF);
+  bool HasBP = RegInfo->hasBasePointer(MF);
+
+  unsigned SPReg       = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg       = isPPC64 ? PPC::X30 : PPC::R30;
+  unsigned FPReg       = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned LRReg       = isPPC64 ? PPC::LR8 : PPC::LR;
+  unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  //  ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
+  const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
+                                                : PPC::MFLR );
+  const MCInstrDesc& StoreInst = TII.get(isPPC64 ? PPC::STD
+                                                 : PPC::STW );
+  const MCInstrDesc& StoreUpdtInst = TII.get(isPPC64 ? PPC::STDU
+                                                     : PPC::STWU );
+  const MCInstrDesc& StoreUpdtIdxInst = TII.get(isPPC64 ? PPC::STDUX
+                                                        : PPC::STWUX);
+  const MCInstrDesc& LoadImmShiftedInst = TII.get(isPPC64 ? PPC::LIS8
+                                                          : PPC::LIS );
+  const MCInstrDesc& OrImmInst = TII.get(isPPC64 ? PPC::ORI8
+                                                 : PPC::ORI );
+  const MCInstrDesc& OrInst = TII.get(isPPC64 ? PPC::OR8
+                                              : PPC::OR );
+  const MCInstrDesc& SubtractCarryingInst = TII.get(isPPC64 ? PPC::SUBFC8
+                                                            : PPC::SUBFC);
+  const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8
+                                                               : PPC::SUBFIC);
+
+  // Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
+  // LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
+  // Red Zone, an asynchronous event (a form of "callee") could claim a frame &
+  // overwrite it, so PPC32 SVR4 must claim at least a minimal frame to save LR.
+  assert((isPPC64 || !isSVR4ABI || !(!FrameSize && (MustSaveLR || HasFP))) &&
+         "FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
 
   int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
 
   int FPOffset = 0;
   if (HasFP) {
-    if (Subtarget.isSVR4ABI()) {
+    if (isSVR4ABI) {
       MachineFrameInfo *FFI = MF.getFrameInfo();
       int FPIndex = FI->getFramePointerSaveIndex();
       assert(FPIndex && "No Frame Pointer Save Slot!");
@@ -386,136 +433,130 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
     }
   }
 
-  if (isPPC64) {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR8), PPC::X0);
-
-    if (!MustSaveCRs.empty()) {
-      MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), PPC::X12);
-      for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
-        MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
+  int BPOffset = 0;
+  if (HasBP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int BPIndex = FI->getBasePointerSaveIndex();
+      assert(BPIndex && "No Base Pointer Save Slot!");
+      BPOffset = FFI->getObjectOffset(BPIndex);
+    } else {
+      BPOffset =
+        PPCFrameLowering::getBasePointerSaveOffset(isPPC64, isDarwinABI);
     }
-
-    if (HasFP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
-        .addReg(PPC::X31)
-        .addImm(FPOffset/4)
-        .addReg(PPC::X1);
-
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
-        .addReg(PPC::X0)
-        .addImm(LROffset / 4)
-        .addReg(PPC::X1);
-
-    if (!MustSaveCRs.empty())
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
-        .addReg(PPC::X12, getKillRegState(true))
-        .addImm(8)
-        .addReg(PPC::X1);
-  } else {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0);
-
-    if (HasFP)
-      // FIXME: On PPC32 SVR4, FPOffset is negative and access to negative
-      // offsets of R1 is not allowed.
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
-        .addReg(PPC::R31)
-        .addImm(FPOffset)
-        .addReg(PPC::R1);
-
-    assert(MustSaveCRs.empty() &&
-           "Prologue CR saving supported only in 64-bit mode");
-
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
-        .addReg(PPC::R0)
-        .addImm(LROffset)
-        .addReg(PPC::R1);
   }
 
-  // Skip if a leaf routine.
-  if (!FrameSize) return;
-
   // Get stack alignments.
-  unsigned TargetAlign = getStackAlignment();
   unsigned MaxAlign = MFI->getMaxAlignment();
+  if (HasBP && MaxAlign > 1)
+    assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
+           "Invalid alignment!");
+
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
+  bool isLargeFrame = !isInt<16>(NegFrameSize);
+
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
+
+  assert((isPPC64 || MustSaveCRs.empty()) &&
+         "Prologue CR saving supported only in 64-bit mode");
+
+  if (!MustSaveCRs.empty()) { // will only occur for PPC64
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFCR8), TempReg);
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      MIB.addReg(MustSaveCRs[i], RegState::ImplicitKill);
+  }
+
+  if (HasFP)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(FPReg)
+      .addImm(FPOffset)
+      .addReg(SPReg);
+
+  if (HasBP)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(BPReg)
+      .addImm(BPOffset)
+      .addReg(SPReg);
+
+  if (MustSaveLR)
+    // FIXME: On PPC32 SVR4, we must not spill before claiming the stackframe.
+    BuildMI(MBB, MBBI, dl, StoreInst)
+      .addReg(ScratchReg)
+      .addImm(LROffset)
+      .addReg(SPReg);
+
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+      .addReg(TempReg, getKillRegState(true))
+      .addImm(8)
+      .addReg(SPReg);
+
+  // Skip the rest if this is a leaf function & all spills fit in the Red Zone.
+  if (!FrameSize) return;
 
   // Adjust stack pointer: r1 += NegFrameSize.
   // If there is a preferred stack alignment, align R1 now
-  if (!isPPC64) {
-    // PPC32.
-    if (ALIGN_STACK && MaxAlign > TargetAlign) {
-      assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
-             "Invalid alignment!");
-      assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!");
-
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0)
-        .addReg(PPC::R1)
+
+  if (HasBP) {
+    // Save a copy of r1 as the base pointer.
+    BuildMI(MBB, MBBI, dl, OrInst, BPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
+  }
+
+  if (HasBP && MaxAlign > 1) {
+    if (isPPC64)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
+        .addReg(SPReg)
+        .addImm(0)
+        .addImm(64 - Log2_32(MaxAlign));
+    else // PPC32...
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
+        .addReg(SPReg)
         .addImm(0)
         .addImm(32 - Log2_32(MaxAlign))
         .addImm(31);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC) ,PPC::R0)
-        .addReg(PPC::R0, RegState::Kill)
+    if (!isLargeFrame) {
+      BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
         .addImm(NegFrameSize);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
-        .addReg(PPC::R1, RegState::Kill)
-        .addReg(PPC::R1)
-        .addReg(PPC::R0);
-    } else if (isInt<16>(NegFrameSize)) {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1)
-        .addReg(PPC::R1)
-        .addImm(NegFrameSize)
-        .addReg(PPC::R1);
     } else {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+      BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
         .addImm(NegFrameSize >> 16);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
-        .addReg(PPC::R0, RegState::Kill)
+      BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
+        .addReg(TempReg, RegState::Kill)
         .addImm(NegFrameSize & 0xFFFF);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX), PPC::R1)
-        .addReg(PPC::R1, RegState::Kill)
-        .addReg(PPC::R1)
-        .addReg(PPC::R0);
+      BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
+        .addReg(ScratchReg, RegState::Kill)
+        .addReg(TempReg, RegState::Kill);
     }
-  } else {    // PPC64.
-    if (ALIGN_STACK && MaxAlign > TargetAlign) {
-      assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
-             "Invalid alignment!");
-      assert(isInt<16>(NegFrameSize) && "Unhandled stack size and alignment!");
-
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0)
-        .addReg(PPC::X1)
-        .addImm(0)
-        .addImm(64 - Log2_32(MaxAlign));
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0)
-        .addReg(PPC::X0)
-        .addImm(NegFrameSize);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
-        .addReg(PPC::X1, RegState::Kill)
-        .addReg(PPC::X1)
-        .addReg(PPC::X0);
-    } else if (isInt<16>(NegFrameSize)) {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
-        .addReg(PPC::X1)
-        .addImm(NegFrameSize / 4)
-        .addReg(PPC::X1);
-    } else {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
-        .addImm(NegFrameSize >> 16);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
-        .addReg(PPC::X0, RegState::Kill)
-        .addImm(NegFrameSize & 0xFFFF);
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX), PPC::X1)
-        .addReg(PPC::X1, RegState::Kill)
-        .addReg(PPC::X1)
-        .addReg(PPC::X0);
-    }
-  }
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
 
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  } else if (!isLargeFrame) {
+    BuildMI(MBB, MBBI, dl, StoreUpdtInst, SPReg)
+      .addReg(SPReg)
+      .addImm(NegFrameSize)
+      .addReg(SPReg);
+
+  } else {
+    BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+      .addImm(NegFrameSize >> 16);
+    BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+      .addReg(ScratchReg, RegState::Kill)
+      .addImm(NegFrameSize & 0xFFFF);
+    BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+      .addReg(SPReg, RegState::Kill)
+      .addReg(SPReg)
+      .addReg(ScratchReg);
+  }
 
   // Add the "machine moves" for the instructions we generated above, but in
   // reverse order.
@@ -525,25 +566,26 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
     BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(FrameLabel);
 
     // Show update of SP.
-    if (NegFrameSize) {
-      MachineLocation SPDst(MachineLocation::VirtualFP);
-      MachineLocation SPSrc(MachineLocation::VirtualFP, NegFrameSize);
-      Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
-    } else {
-      MachineLocation SP(isPPC64 ? PPC::X31 : PPC::R31);
-      Moves.push_back(MachineMove(FrameLabel, SP, SP));
-    }
+    assert(NegFrameSize);
+    MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaOffset(FrameLabel, NegFrameSize));
 
     if (HasFP) {
-      MachineLocation FPDst(MachineLocation::VirtualFP, FPOffset);
-      MachineLocation FPSrc(isPPC64 ? PPC::X31 : PPC::R31);
-      Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+      MMI.addFrameInst(
+          MCCFIInstruction::createOffset(FrameLabel, Reg, FPOffset));
+    }
+
+    if (HasBP) {
+      unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
+      MMI.addFrameInst(
+          MCCFIInstruction::createOffset(FrameLabel, Reg, BPOffset));
     }
 
     if (MustSaveLR) {
-      MachineLocation LRDst(MachineLocation::VirtualFP, LROffset);
-      MachineLocation LRSrc(isPPC64 ? PPC::LR8 : PPC::LR);
-      Moves.push_back(MachineMove(FrameLabel, LRDst, LRSrc));
+      unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
+      MMI.addFrameInst(
+          MCCFIInstruction::createOffset(FrameLabel, Reg, LROffset));
     }
   }
 
@@ -551,15 +593,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
 
   // If there is a frame pointer, copy R1 into R31
   if (HasFP) {
-    if (!isPPC64) {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R31)
-        .addReg(PPC::R1)
-        .addReg(PPC::R1);
-    } else {
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X31)
-        .addReg(PPC::X1)
-        .addReg(PPC::X1);
-    }
+    BuildMI(MBB, MBBI, dl, OrInst, FPReg)
+      .addReg(SPReg)
+      .addReg(SPReg);
 
     if (needsFrameMoves) {
       ReadyLabel = MMI.getContext().CreateTempSymbol();
@@ -567,10 +603,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       // Mark effective beginning of when frame pointer is ready.
       BuildMI(MBB, MBBI, dl, TII.get(PPC::PROLOG_LABEL)).addSym(ReadyLabel);
 
-      MachineLocation FPDst(HasFP ? (isPPC64 ? PPC::X31 : PPC::R31) :
-                                    (isPPC64 ? PPC::X1 : PPC::R1));
-      MachineLocation FPSrc(MachineLocation::VirtualFP);
-      Moves.push_back(MachineMove(ReadyLabel, FPDst, FPSrc));
+      unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
+      MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(ReadyLabel, Reg));
     }
   }
 
@@ -590,26 +624,21 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
 
       // For SVR4, don't emit a move for the CR spill slot if we haven't
       // spilled CRs.
-      if (Subtarget.isSVR4ABI()
-	  && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
-	  && MustSaveCRs.empty())
-	continue;
+      if (isSVR4ABI && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+          && MustSaveCRs.empty())
+        continue;
 
       // For 64-bit SVR4 when we have spilled CRs, the spill location
       // is SP+8, not a frame-relative slot.
-      if (Subtarget.isSVR4ABI()
-	  && Subtarget.isPPC64()
-	  && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
-	MachineLocation CSDst(PPC::X1, 8);
-	MachineLocation CSSrc(PPC::CR2);
-	Moves.push_back(MachineMove(Label, CSDst, CSSrc));
-	continue;
+      if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+        MMI.addFrameInst(MCCFIInstruction::createOffset(
+            Label, MRI->getDwarfRegNum(PPC::CR2, true), 8));
+        continue;
       }
 
       int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
-      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
-      MachineLocation CSSrc(Reg);
-      Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+      MMI.addFrameInst(MCCFIInstruction::createOffset(
+          Label, MRI->getDwarfRegNum(Reg, true), Offset));
     }
   }
 }
@@ -620,6 +649,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
   assert(MBBI != MBB.end() && "Returning block has no terminator");
   const PPCInstrInfo &TII =
     *static_cast<const PPCInstrInfo*>(MF.getTarget().getInstrInfo());
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
 
   unsigned RetOpcode = MBBI->getOpcode();
   DebugLoc dl;
@@ -633,30 +664,49 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
           RetOpcode == PPC::TCRETURNai8) &&
          "Can only insert epilog into returning blocks");
 
-  // Get alignment info so we know how to restore r1
+  // Get alignment info so we know how to restore the SP.
   const MachineFrameInfo *MFI = MF.getFrameInfo();
-  unsigned TargetAlign = getStackAlignment();
-  unsigned MaxAlign = MFI->getMaxAlignment();
 
   // Get the number of bytes allocated from the FrameInfo.
   int FrameSize = MFI->getStackSize();
 
   // Get processor type.
   bool isPPC64 = Subtarget.isPPC64();
-  // Get operating system
+  // Get the ABI.
   bool isDarwinABI = Subtarget.isDarwinABI();
+  bool isSVR4ABI = Subtarget.isSVR4ABI();
+
   // Check if the link register (LR) has been saved.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
   bool MustSaveLR = FI->mustSaveLR();
-  const SmallVector<unsigned, 3> &MustSaveCRs = FI->getMustSaveCRs();
-  // Do we have a frame pointer for this function?
+  const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+  // Do we have a frame pointer and/or base pointer for this function?
   bool HasFP = hasFP(MF);
+  bool HasBP = RegInfo->hasBasePointer(MF);
+
+  unsigned SPReg      = isPPC64 ? PPC::X1  : PPC::R1;
+  unsigned BPReg      = isPPC64 ? PPC::X30 : PPC::R30;
+  unsigned FPReg      = isPPC64 ? PPC::X31 : PPC::R31;
+  unsigned ScratchReg  = isPPC64 ? PPC::X0  : PPC::R0;
+  unsigned TempReg     = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+  const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
+                                                 : PPC::MTLR );
+  const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD
+                                                 : PPC::LWZ );
+  const MCInstrDesc& LoadImmShiftedInst = TII.get( isPPC64 ? PPC::LIS8
+                                                           : PPC::LIS );
+  const MCInstrDesc& OrImmInst = TII.get( isPPC64 ? PPC::ORI8
+                                                  : PPC::ORI );
+  const MCInstrDesc& AddImmInst = TII.get( isPPC64 ? PPC::ADDI8
+                                                   : PPC::ADDI );
+  const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
+                                                : PPC::ADD4 );
 
   int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
 
   int FPOffset = 0;
   if (HasFP) {
-    if (Subtarget.isSVR4ABI()) {
+    if (isSVR4ABI) {
       MachineFrameInfo *FFI = MF.getFrameInfo();
       int FPIndex = FI->getFramePointerSaveIndex();
       assert(FPIndex && "No Frame Pointer Save Slot!");
@@ -666,6 +716,19 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
     }
   }
 
+  int BPOffset = 0;
+  if (HasBP) {
+    if (isSVR4ABI) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int BPIndex = FI->getBasePointerSaveIndex();
+      assert(BPIndex && "No Base Pointer Save Slot!");
+      BPOffset = FFI->getObjectOffset(BPIndex);
+    } else {
+      BPOffset =
+        PPCFrameLowering::getBasePointerSaveOffset(isPPC64, isDarwinABI);
+    }
+  }
+
   bool UsesTCRet =  RetOpcode == PPC::TCRETURNri ||
     RetOpcode == PPC::TCRETURNdi ||
     RetOpcode == PPC::TCRETURNai ||
@@ -687,98 +750,76 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       FrameSize += StackAdj;
   }
 
+  // Frames of 32KB & larger require special handling because they cannot be
+  // indexed into with a simple LD/LWZ immediate offset operand.
+  bool isLargeFrame = !isInt<16>(FrameSize);
+
   if (FrameSize) {
-    // The loaded (or persistent) stack pointer value is offset by the 'stwu'
-    // on entry to the function.  Add this offset back now.
-    if (!isPPC64) {
-      // If this function contained a fastcc call and GuaranteedTailCallOpt is
-      // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
-      // call which invalidates the stack pointer value in SP(0). So we use the
-      // value of R31 in this case.
-      if (FI->hasFastCall() && isInt<16>(FrameSize)) {
-        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
-          .addReg(PPC::R31).addImm(FrameSize);
-      } else if(FI->hasFastCall()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
-          .addImm(FrameSize >> 16);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
-          .addReg(PPC::R0, RegState::Kill)
-          .addImm(FrameSize & 0xFFFF);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD4))
-          .addReg(PPC::R1)
-          .addReg(PPC::R31)
-          .addReg(PPC::R0);
-      } else if (isInt<16>(FrameSize) &&
-                 (!ALIGN_STACK || TargetAlign >= MaxAlign) &&
-                 !MFI->hasVarSizedObjects()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
-          .addReg(PPC::R1).addImm(FrameSize);
+    // In the prologue, the loaded (or persistent) stack pointer value is offset
+    // by the STDU/STDUX/STWU/STWUX instruction.  Add this offset back now.
+
+    // If this function contained a fastcc call and GuaranteedTailCallOpt is
+    // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+    // call which invalidates the stack pointer value in SP(0). So we use the
+    // value of R31 in this case.
+    if (FI->hasFastCall()) {
+      assert(HasFP && "Expecting a valid frame pointer.");
+      if (!isLargeFrame) {
+        BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+          .addReg(FPReg).addImm(FrameSize);
       } else {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ),PPC::R1)
-          .addImm(0).addReg(PPC::R1);
-      }
-    } else {
-      if (FI->hasFastCall() && isInt<16>(FrameSize)) {
-        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
-          .addReg(PPC::X31).addImm(FrameSize);
-      } else if(FI->hasFastCall()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+        BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
           .addImm(FrameSize >> 16);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
-          .addReg(PPC::X0, RegState::Kill)
+        BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
           .addImm(FrameSize & 0xFFFF);
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD8))
-          .addReg(PPC::X1)
-          .addReg(PPC::X31)
-          .addReg(PPC::X0);
-      } else if (isInt<16>(FrameSize) && TargetAlign >= MaxAlign &&
-            !MFI->hasVarSizedObjects()) {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
-           .addReg(PPC::X1).addImm(FrameSize);
-      } else {
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X1)
-           .addImm(0).addReg(PPC::X1);
+        BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(SPReg)
+          .addReg(FPReg)
+          .addReg(ScratchReg);
       }
+    } else if (!isLargeFrame && !HasBP && !MFI->hasVarSizedObjects()) {
+      BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+        .addReg(SPReg)
+        .addImm(FrameSize);
+    } else {
+      BuildMI(MBB, MBBI, dl, LoadInst, SPReg)
+        .addImm(0)
+        .addReg(SPReg);
     }
-  }
 
-  if (isPPC64) {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X0)
-        .addImm(LROffset/4).addReg(PPC::X1);
+  }
 
-    if (!MustSaveCRs.empty())
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), PPC::X12)
-        .addImm(8).addReg(PPC::X1);
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, LoadInst, ScratchReg)
+      .addImm(LROffset)
+      .addReg(SPReg);
 
-    if (HasFP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X31)
-        .addImm(FPOffset/4).addReg(PPC::X1);
+  assert((isPPC64 || MustSaveCRs.empty()) &&
+         "Epilogue CR restoring supported only in 64-bit mode");
 
-    if (!MustSaveCRs.empty())
-      for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::MTCRF8), MustSaveCRs[i])
-          .addReg(PPC::X12, getKillRegState(i == e-1));
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
+      .addImm(8)
+      .addReg(SPReg);
 
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR8)).addReg(PPC::X0);
-  } else {
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R0)
-          .addImm(LROffset).addReg(PPC::R1);
+  if (HasFP)
+    BuildMI(MBB, MBBI, dl, LoadInst, FPReg)
+      .addImm(FPOffset)
+      .addReg(SPReg);
 
-    assert(MustSaveCRs.empty() &&
-           "Epilogue CR restoring supported only in 64-bit mode");
+  if (HasBP)
+    BuildMI(MBB, MBBI, dl, LoadInst, BPReg)
+      .addImm(BPOffset)
+      .addReg(SPReg);
 
-    if (HasFP)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R31)
-          .addImm(FPOffset).addReg(PPC::R1);
+  if (!MustSaveCRs.empty()) // will only occur for PPC64
+    for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+        .addReg(TempReg, getKillRegState(i == e-1));
 
-    if (MustSaveLR)
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR)).addReg(PPC::R0);
-  }
+  if (MustSaveLR)
+    BuildMI(MBB, MBBI, dl, MTLRInst).addReg(ScratchReg);
 
   // Callee pop calling convention. Pop parameter/linkage area. Used for tail
   // call optimization
@@ -786,27 +827,20 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       MF.getFunction()->getCallingConv() == CallingConv::Fast) {
      PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
      unsigned CallerAllocatedAmt = FI->getMinReservedArea();
-     unsigned StackReg = isPPC64 ? PPC::X1 : PPC::R1;
-     unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
-     unsigned TmpReg = isPPC64 ? PPC::X0 : PPC::R0;
-     unsigned ADDIInstr = isPPC64 ? PPC::ADDI8 : PPC::ADDI;
-     unsigned ADDInstr = isPPC64 ? PPC::ADD8 : PPC::ADD4;
-     unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS;
-     unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI;
 
      if (CallerAllocatedAmt && isInt<16>(CallerAllocatedAmt)) {
-       BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg)
-         .addReg(StackReg).addImm(CallerAllocatedAmt);
+       BuildMI(MBB, MBBI, dl, AddImmInst, SPReg)
+         .addReg(SPReg).addImm(CallerAllocatedAmt);
      } else {
-       BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+       BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
           .addImm(CallerAllocatedAmt >> 16);
-       BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
-          .addReg(TmpReg, RegState::Kill)
+       BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+          .addReg(ScratchReg, RegState::Kill)
           .addImm(CallerAllocatedAmt & 0xFFFF);
-       BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
-          .addReg(StackReg)
+       BuildMI(MBB, MBBI, dl, AddInst)
+          .addReg(SPReg)
           .addReg(FPReg)
-          .addReg(TmpReg);
+          .addReg(ScratchReg);
      }
   } else if (RetOpcode == PPC::TCRETURNdi) {
     MBBI = MBB.getLastNonDebugInstr();
@@ -854,7 +888,8 @@ static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
 void
 PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                                    RegScavenger *) const {
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
 
   //  Save and clear the LR state.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -879,6 +914,15 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     FI->setFramePointerSaveIndex(FPSI);
   }
 
+  int BPSI = FI->getBasePointerSaveIndex();
+  if (!BPSI && RegInfo->hasBasePointer(MF)) {
+    int BPOffset = getBasePointerSaveOffset(isPPC64, isDarwinABI);
+    // Allocate the frame index for the base pointer save area.
+    BPSI = MFI->CreateFixedObject(isPPC64? 8 : 4, BPOffset, true);
+    // Save the result.
+    FI->setBasePointerSaveIndex(BPSI);
+  }
+
   // Reserve stack space to move the linkage area to in case of a tail call.
   int TCSPDelta = 0;
   if (MF.getTarget().Options.GuaranteedTailCallOpt &&
@@ -1010,6 +1054,17 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
     FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
   }
 
+  const PPCRegisterInfo *RegInfo =
+    static_cast<const PPCRegisterInfo*>(MF.getTarget().getRegisterInfo());
+  if (RegInfo->hasBasePointer(MF)) {
+    HasGPSaveArea = true;
+
+    int FI = PFI->getBasePointerSaveIndex();
+    assert(FI && "No Base Pointer Save Slot!");
+
+    FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
+  }
+
   // General register save area starts right below the Floating-point
   // register save area.
   if (HasGPSaveArea || HasG8SaveArea) {
@@ -1122,8 +1177,12 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
                                                        RC->getAlignment(),
                                                        false));
 
+    // Might we have over-aligned allocas?
+    bool HasAlVars = MFI->hasVarSizedObjects() &&
+                     MFI->getMaxAlignment() > getStackAlignment();
+
     // These kinds of spills might need two registers.
-    if (spillsCR(MF) || spillsVRSAVE(MF))
+    if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars)
       RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
                                                          RC->getAlignment(),
                                                          false));
@@ -1151,6 +1210,12 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
+    // Only Darwin actually uses the VRSAVE register, but it can still appear
+    // here if, for example, @llvm.eh.unwind.init() is used.  If we're not on
+    // Darwin, ignore it.
+    if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+      continue;
+
     // CR2 through CR4 are the nonvolatile CR fields.
     bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
 
@@ -1212,7 +1277,7 @@ restoreCRs(bool isPPC64, bool is31,
     MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
 					     PPC::R12),
 				     CSI[CSIIndex].getFrameIdx()));
-    RestoreOp = PPC::MTCRF;
+    RestoreOp = PPC::MTOCRF;
     MoveReg = PPC::R12;
   }
   
@@ -1300,6 +1365,12 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
 
+    // Only Darwin actually uses the VRSAVE register, but it can still appear
+    // here if, for example, @llvm.eh.unwind.init() is used.  If we're not on
+    // Darwin, ignore it.
+    if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+      continue;
+
     if (Reg == PPC::CR2) {
       CR2Spilled = true;
       // The spill slot is associated only with CR2, which is the
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 6f5f936..7aab37e 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -94,6 +94,16 @@ public:
     return isPPC64 ? -8U : -4U;
   }
 
+  /// getBasePointerSaveOffset - Return the previous frame offset to save the
+  /// base pointer.
+  static unsigned getBasePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
+    if (isDarwinABI)
+      return isPPC64 ? -16U : -8U;
+
+    // SVR4 ABI: First slot in the general register save area.
+    return isPPC64 ? -16U : -8U;
+  }
+
   /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
   ///
   static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI) {
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 4bf1e33..0df50e1 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -71,8 +71,8 @@ void PPCScoreboardHazardRecognizer::Reset() {
 //   3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
 //
 
-PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii)
-  : TII(tii) {
+PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetMachine &TM)
+  : TM(TM) {
   EndDispatchGroup();
 }
 
@@ -91,7 +91,7 @@ PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
                                      bool &isFirst, bool &isSingle,
                                      bool &isCracked,
                                      bool &isLoad, bool &isStore) {
-  const MCInstrDesc &MCID = TII.get(Opcode);
+  const MCInstrDesc &MCID = TM.getInstrInfo()->get(Opcode);
 
   isLoad  = MCID.mayLoad();
   isStore = MCID.mayStore();
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
index 55b45d0..84b8e6de 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.h
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -43,7 +43,7 @@ public:
 /// setting the CTR register then branching through it within a dispatch group),
 /// or storing then loading from the same address within a dispatch group.
 class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
-  const TargetInstrInfo &TII;
+  const TargetMachine &TM;
 
   unsigned NumIssued;  // Number of insts issued, including advanced cycles.
 
@@ -64,7 +64,7 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
   unsigned NumStores;
 
 public:
-  PPCHazardRecognizer970(const TargetInstrInfo &TII);
+  PPCHazardRecognizer970(const TargetMachine &TM);
   virtual HazardType getHazardType(SUnit *SU, int Stalls);
   virtual void EmitInstruction(SUnit *SU);
   virtual void AdvanceCycle();
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index aed0fbb..6ba6af6 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -110,13 +110,13 @@ namespace {
 
     /// SelectCC - Select a comparison of the specified values with the
     /// specified condition code, returning the CR# of the expression.
-    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, DebugLoc dl);
+    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, SDLoc dl);
 
     /// SelectAddrImm - Returns true if the address N can be represented by
     /// a base register plus a signed 16-bit displacement [r+imm].
     bool SelectAddrImm(SDValue N, SDValue &Disp,
                        SDValue &Base) {
-      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG);
+      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
     }
 
     /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
@@ -145,11 +145,11 @@ namespace {
       return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
     }
 
-    /// SelectAddrImmShift - Returns true if the address N can be represented by
-    /// a base register plus a signed 14-bit displacement [r+imm*4].  Suitable
-    /// for use by STD and friends.
-    bool SelectAddrImmShift(SDValue N, SDValue &Disp, SDValue &Base) {
-      return PPCLowering.SelectAddressRegImmShift(N, Disp, Base, *CurDAG);
+    /// SelectAddrImmX4 - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement that is a multiple of 4.
+    /// Suitable for use by STD and friends.
+    bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
+      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
     }
 
     // Select an address into a single register.
@@ -330,19 +330,22 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
 }
 
 bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+  if (!Val)
+    return false;
+
   if (isShiftedMask_32(Val)) {
     // look for the first non-zero bit
-    MB = CountLeadingZeros_32(Val);
+    MB = countLeadingZeros(Val);
     // look for the first zero bit after the run of ones
-    ME = CountLeadingZeros_32((Val - 1) ^ Val);
+    ME = countLeadingZeros((Val - 1) ^ Val);
     return true;
   } else {
     Val = ~Val; // invert mask
     if (isShiftedMask_32(Val)) {
       // effectively look for the first zero bit
-      ME = CountLeadingZeros_32(Val) - 1;
+      ME = countLeadingZeros(Val) - 1;
       // effectively look for the first one bit after the run of zeros
-      MB = CountLeadingZeros_32((Val - 1) ^ Val) + 1;
+      MB = countLeadingZeros((Val - 1) ^ Val) + 1;
       return true;
     }
   }
@@ -397,7 +400,7 @@ bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask,
 SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   APInt LKZ, LKO, RKZ, RKO;
   CurDAG->ComputeMaskedBits(Op0, LKZ, LKO);
@@ -435,7 +438,7 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
     }
 
     unsigned MB, ME;
-    if (InsertMask && isRunOfOnes(InsertMask, MB, ME)) {
+    if (isRunOfOnes(InsertMask, MB, ME)) {
       SDValue Tmp1, Tmp2;
 
       if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
@@ -447,10 +450,10 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
         unsigned SHOpc = Op1.getOperand(0).getOpcode();
         if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) &&
             isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+	  // Note that Value must be in range here (less than 32) because
+	  // otherwise there would not be any bits set in InsertMask.
           Op1 = Op1.getOperand(0).getOperand(0);
           SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
-        } else {
-          Op1 = Op1.getOperand(0);
         }
       }
 
@@ -466,7 +469,7 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
 /// SelectCC - Select a comparison of the specified values with the specified
 /// condition code, returning the CR# of the expression.
 SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
-                                    ISD::CondCode CC, DebugLoc dl) {
+                                    ISD::CondCode CC, SDLoc dl) {
   // Always select the LHS.
   unsigned Opc;
 
@@ -594,12 +597,8 @@ static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
 /// getCRIdxForSetCC - Return the index of the condition register field
 /// associated with the SetCC condition, and whether or not the field is
 /// treated as inverted.  That is, lt = 0; ge = 0 inverted.
-///
-/// If this returns with Other != -1, then the returned comparison is an or of
-/// two simpler comparisons.  In this case, Invert is guaranteed to be false.
-static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert, int &Other) {
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert) {
   Invert = false;
-  Other = -1;
   switch (CC) {
   default: llvm_unreachable("Unknown condition!");
   case ISD::SETOLT:
@@ -710,7 +709,7 @@ static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT) {
 
 
 SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned Imm;
   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
   EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
@@ -847,8 +846,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   }
 
   bool Inv;
-  int OtherCondIdx;
-  unsigned Idx = getCRIdxForSetCC(CC, Inv, OtherCondIdx);
+  unsigned Idx = getCRIdxForSetCC(CC, Inv);
   SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
   SDValue IntCR;
 
@@ -859,44 +857,29 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
                                InFlag).getValue(1);
 
-  if (PPCSubTarget.hasMFOCRF() && OtherCondIdx == -1)
-    IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
-                                           CCReg), 0);
-  else
-    IntCR = SDValue(CurDAG->getMachineNode(PPC::MFCRpseud, dl, MVT::i32,
-                                           CR7Reg, CCReg), 0);
+  IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+                                         CCReg), 0);
 
   SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
                       getI32Imm(31), getI32Imm(31) };
-  if (OtherCondIdx == -1 && !Inv)
+  if (!Inv)
     return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
 
   // Get the specified bit.
   SDValue Tmp =
     SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
-  if (Inv) {
-    assert(OtherCondIdx == -1 && "Can't have split plus negation");
-    return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
-  }
-
-  // Otherwise, we have to turn an operation like SETONE -> SETOLT | SETOGT.
-  // We already got the bit for the first part of the comparison (e.g. SETULE).
-
-  // Get the other bit of the comparison.
-  Ops[1] = getI32Imm((32-(3-OtherCondIdx)) & 31);
-  SDValue OtherCond =
-    SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops), 0);
-
-  return CurDAG->SelectNodeTo(N, PPC::OR, MVT::i32, Tmp, OtherCond);
+  return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
 }
 
 
 // Select - Convert the specified operand from a target-independent to a
 // target-specific node if it hasn't already been changed.
 SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
-  if (N->isMachineOpcode())
+  SDLoc dl(N);
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
   switch (N->getOpcode()) {
   default: break;
@@ -912,7 +895,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
 
       // If it can't be represented as a 32 bit value.
       if (!isInt<32>(Imm)) {
-        Shift = CountTrailingZeros_64(Imm);
+        Shift = countTrailingZeros<uint64_t>(Imm);
         int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
 
         // If the shifted value fits 32 bits.
@@ -992,15 +975,10 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
                                   getSmallIPtrImm(0));
   }
 
-  case PPCISD::MFCR: {
+  case PPCISD::MFOCRF: {
     SDValue InFlag = N->getOperand(1);
-    // Use MFOCRF if supported.
-    if (PPCSubTarget.hasMFOCRF())
-      return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
-                                    N->getOperand(0), InFlag);
-    else
-      return CurDAG->getMachineNode(PPC::MFCRpseud, dl, MVT::i32,
-                                    N->getOperand(0), InFlag);
+    return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+                                  N->getOperand(0), InFlag);
   }
 
   case ISD::SDIV: {
@@ -1242,6 +1220,15 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
                         getI32Imm(BROpc) };
     return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
   }
+  case PPCISD::BDNZ:
+  case PPCISD::BDZ: {
+    bool IsPPC64 = PPCSubTarget.isPPC64();
+    SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
+    return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ?
+                                   (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
+                                   (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
+                                MVT::Other, Ops, 2);
+  }
   case PPCISD::COND_BRANCH: {
     // Op #0 is the Chain.
     // Op #1 is the PPC::PRED_* number.
@@ -1493,13 +1480,13 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
         continue;
       break;
     case PPC::ADDIdtprelL:
-      Flags = PPCII::MO_DTPREL16_LO;
+      Flags = PPCII::MO_DTPREL_LO;
       break;
     case PPC::ADDItlsldL:
-      Flags = PPCII::MO_TLSLD16_LO;
+      Flags = PPCII::MO_TLSLD_LO;
       break;
     case PPC::ADDItocL:
-      Flags = PPCII::MO_TOC16_LO;
+      Flags = PPCII::MO_TOC_LO;
       break;
     }
 
@@ -1519,8 +1506,16 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
     // immediate operand, add it now.
     if (ReplaceFlags) {
       if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
-        DebugLoc dl = GA->getDebugLoc();
+        SDLoc dl(GA);
         const GlobalValue *GV = GA->getGlobal();
+        // We can't perform this optimization for data whose alignment
+        // is insufficient for the instruction encoding.
+        if (GV->getAlignment() < 4 &&
+            (StorageOpcode == PPC::LD || StorageOpcode == PPC::STD ||
+             StorageOpcode == PPC::LWA)) {
+          DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
+          continue;
+        }
         ImmOpnd = CurDAG->getTargetGlobalAddress(GV, dl, MVT::i64, 0, Flags);
       } else if (ConstantPoolSDNode *CP =
                  dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 3fcafdc..8da5f05 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -16,6 +16,7 @@
 #include "PPCMachineFunctionInfo.h"
 #include "PPCPerfectShuffle.h"
 #include "PPCTargetMachine.h"
+#include "PPCTargetObjectFile.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -36,21 +37,6 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
-static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                       CCValAssign::LocInfo &LocInfo,
-                                       ISD::ArgFlagsTy &ArgFlags,
-                                       CCState &State);
-static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
-                                              MVT &LocVT,
-                                              CCValAssign::LocInfo &LocInfo,
-                                              ISD::ArgFlagsTy &ArgFlags,
-                                              CCState &State);
-static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
-                                                MVT &LocVT,
-                                                CCValAssign::LocInfo &LocInfo,
-                                                ISD::ArgFlagsTy &ArgFlags,
-                                                CCState &State);
-
 static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
 cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);
 
@@ -64,14 +50,15 @@ static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
   if (TM.getSubtargetImpl()->isDarwin())
     return new TargetLoweringObjectFileMachO();
 
+  if (TM.getSubtargetImpl()->isSVR4ABI())
+    return new PPC64LinuxTargetObjectFile();
+
   return new TargetLoweringObjectFileELF();
 }
 
 PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
   const PPCSubtarget *Subtarget = &TM.getSubtarget<PPCSubtarget>();
-  PPCRegInfo = TM.getRegisterInfo();
-  PPCII = TM.getInstrInfo();
 
   setPow2DivIsCheap();
 
@@ -162,28 +149,24 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
         Subtarget->hasFRSQRTES() && Subtarget->hasFRES()))
     setOperationAction(ISD::FSQRT, MVT::f32, Expand);
 
-  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  if (Subtarget->hasFCPSGN()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
+  } else {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  }
 
   if (Subtarget->hasFPRND()) {
     setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
+    setOperationAction(ISD::FROUND, MVT::f64, Legal);
 
     setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
     setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
     setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
-
-    // frin does not implement "ties to even." Thus, this is safe only in
-    // fast-math mode.
-    if (TM.Options.UnsafeFPMath) {
-      setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-      setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-
-      // These need to set FE_INEXACT, and use a custom inserter.
-      setOperationAction(ISD::FRINT, MVT::f64, Legal);
-      setOperationAction(ISD::FRINT, MVT::f32, Legal);
-    }
+    setOperationAction(ISD::FROUND, MVT::f32, Legal);
   }
 
   // PowerPC does not have BSWAP, CTPOP or CTTZ
@@ -241,11 +224,6 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   // We cannot sextinreg(i1).  Expand to shifts.
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
 
-  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
-  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
-  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
-  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
-
   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
   // support continuation, user-level threading, and etc.. As a result, no
@@ -298,8 +276,13 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   } else
     setOperationAction(ISD::VAARG, MVT::Other, Expand);
 
+  if (Subtarget->isSVR4ABI() && !isPPC64)
+    // VACOPY is custom lowered with the 32-bit SVR4 ABI.
+    setOperationAction(ISD::VACOPY            , MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+
   // Use the default implementation.
-  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
   setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Custom);
@@ -309,6 +292,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
 
+  // To handle counter-based loop conditions.
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
+
   // Comparisons that require checking two conditions.
   setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
   setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
@@ -407,6 +393,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
       setOperationAction(ISD::UDIV, VT, Expand);
       setOperationAction(ISD::UREM, VT, Expand);
       setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FREM, VT, Expand);
       setOperationAction(ISD::FNEG, VT, Expand);
       setOperationAction(ISD::FSQRT, VT, Expand);
       setOperationAction(ISD::FLOG, VT, Expand);
@@ -501,6 +488,9 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
     setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
     setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
     setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);
+
+    setCondCodeAction(ISD::SETO,   MVT::v4f32, Expand);
+    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
   }
 
   if (Subtarget->has64BitSupport()) {
@@ -529,9 +519,11 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
 
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::LOAD);
   setTargetDAGCombine(ISD::STORE);
   setTargetDAGCombine(ISD::BR_CC);
   setTargetDAGCombine(ISD::BSWAP);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 
   // Use reciprocal estimates.
   if (TM.Options.UnsafeFPMath) {
@@ -564,7 +556,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
 
   setInsertFencesForAtomic(true);
 
-  setSchedulingPreference(Sched::Hybrid);
+  if (Subtarget->enableMachineScheduler())
+    setSchedulingPreference(Sched::Source);
+  else
+    setSchedulingPreference(Sched::Hybrid);
 
   computeRegisterProperties();
 
@@ -583,24 +578,47 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
   }
 }
 
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment.
+static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
+                             unsigned MaxMaxAlign) {
+  if (MaxAlign == MaxMaxAlign)
+    return;
+  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
+      MaxAlign = 32;
+    else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
+      MaxAlign = 16;
+  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == MaxMaxAlign)
+        break;
+    }
+  }
+}
+
 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
 /// function arguments in the caller parameter area.
 unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
-  const TargetMachine &TM = getTargetMachine();
   // Darwin passes everything on 4 byte boundary.
-  if (TM.getSubtarget<PPCSubtarget>().isDarwin())
+  if (PPCSubTarget.isDarwin())
     return 4;
 
   // 16byte and wider vectors are passed on 16byte boundary.
-  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
-    if (VTy->getBitWidth() >= 128)
-      return 16;
-
   // The rest is 8 on PPC64 and 4 on PPC32 boundary.
-   if (PPCSubTarget.isPPC64())
-     return 8;
-
-  return 4;
+  unsigned Align = PPCSubTarget.isPPC64() ? 8 : 4;
+  if (PPCSubTarget.hasAltivec() || PPCSubTarget.hasQPX())
+    getMaxByValAlign(Ty, Align, PPCSubTarget.hasQPX() ? 32 : 16);
+  return Align;
 }
 
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -634,7 +652,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
   case PPCISD::EH_SJLJ_SETJMP:  return "PPCISD::EH_SJLJ_SETJMP";
   case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
-  case PPCISD::MFCR:            return "PPCISD::MFCR";
+  case PPCISD::MFOCRF:          return "PPCISD::MFOCRF";
   case PPCISD::VCMP:            return "PPCISD::VCMP";
   case PPCISD::VCMPo:           return "PPCISD::VCMPo";
   case PPCISD::LBRX:            return "PPCISD::LBRX";
@@ -642,6 +660,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::LARX:            return "PPCISD::LARX";
   case PPCISD::STCX:            return "PPCISD::STCX";
   case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
+  case PPCISD::BDNZ:            return "PPCISD::BDNZ";
+  case PPCISD::BDZ:             return "PPCISD::BDZ";
   case PPCISD::MFFS:            return "PPCISD::MFFS";
   case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
   case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
@@ -662,10 +682,11 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
   case PPCISD::ADDI_DTPREL_L:   return "PPCISD::ADDI_DTPREL_L";
   case PPCISD::VADD_SPLAT:      return "PPCISD::VADD_SPLAT";
+  case PPCISD::SC:              return "PPCISD::SC";
   }
 }
 
-EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
+EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector())
     return MVT::i32;
   return VT.changeVectorElementTypeToInteger();
@@ -1036,24 +1057,68 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
   return false;
 }
 
+// If we happen to be doing an i64 load or store into a stack slot that has
+// less than a 4-byte alignment, then the frame-index elimination may need to
+// use an indexed load or store instruction (because the offset may not be a
+// multiple of 4). The extra register needed to hold the offset comes from the
+// register scavenger, and it is possible that the scavenger will need to use
+// an emergency spill slot. As a result, we need to make sure that a spill slot
+// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
+// stack slot.
+static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
+  // FIXME: This does not handle the LWA case.
+  if (VT != MVT::i64)
+    return;
+
+  // NOTE: We'll exclude negative FIs here, which come from argument
+  // lowering, because there are no known test cases triggering this problem
+  // using packed structures (or similar). We can remove this exclusion if
+  // we find such a test case. The reason why this is so test-case driven is
+  // because this entire 'fixup' is only to prevent crashes (from the
+  // register scavenger) on not-really-valid inputs. For example, if we have:
+  //   %a = alloca i1
+  //   %b = bitcast i1* %a to i64*
+  //   store i64* a, i64 b
+  // then the store should really be marked as 'align 1', but is not. If it
+  // were marked as 'align 1' then the indexed form would have been
+  // instruction-selected initially, and the problem this 'fixup' is preventing
+  // won't happen regardless.
+  if (FrameIdx < 0)
+    return;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  unsigned Align = MFI->getObjectAlignment(FrameIdx);
+  if (Align >= 4)
+    return;
+
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+  FuncInfo->setHasNonRISpills();
+}
+
 /// Returns true if the address N can be represented by a base register plus
 /// a signed 16-bit displacement [r+imm], and if it is not better
-/// represented as reg+reg.
+/// represented as reg+reg.  If Aligned is true, only accept displacements
+/// suitable for STD and friends, i.e. multiples of 4.
 bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
                                             SDValue &Base,
-                                            SelectionDAG &DAG) const {
+                                            SelectionDAG &DAG,
+                                            bool Aligned) const {
   // FIXME dl should come from parent load or store, not from address
-  DebugLoc dl = N.getDebugLoc();
+  SDLoc dl(N);
   // If this can be more profitably realized as r+r, fail.
   if (SelectAddressRegReg(N, Disp, Base, DAG))
     return false;
 
   if (N.getOpcode() == ISD::ADD) {
     short imm = 0;
-    if (isIntS16Immediate(N.getOperand(1), imm)) {
-      Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+    if (isIntS16Immediate(N.getOperand(1), imm) &&
+        (!Aligned || (imm & 3) == 0)) {
+      Disp = DAG.getTargetConstant(imm, N.getValueType());
       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
         Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+        fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
       } else {
         Base = N.getOperand(0);
       }
@@ -1072,7 +1137,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
     }
   } else if (N.getOpcode() == ISD::OR) {
     short imm = 0;
-    if (isIntS16Immediate(N.getOperand(1), imm)) {
+    if (isIntS16Immediate(N.getOperand(1), imm) &&
+        (!Aligned || (imm & 3) == 0)) {
       // If this is an or of disjoint bitfields, we can codegen this as an add
       // (for better address arithmetic) if the LHS and RHS of the OR are
       // provably disjoint.
@@ -1083,7 +1149,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
         // If all of the bits are known zero on the LHS or RHS, the add won't
         // carry.
         Base = N.getOperand(0);
-        Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
+        Disp = DAG.getTargetConstant(imm, N.getValueType());
         return true;
       }
     }
@@ -1093,7 +1159,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
     // If this address fits entirely in a 16-bit sext immediate field, codegen
     // this as "d, 0"
     short Imm;
-    if (isIntS16Immediate(CN, Imm)) {
+    if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
       Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
       Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                              CN->getValueType(0));
@@ -1101,8 +1167,9 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
     }
 
     // Handle 32-bit sext immediates with LIS + addr mode.
-    if (CN->getValueType(0) == MVT::i32 ||
-        (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
+    if ((CN->getValueType(0) == MVT::i32 ||
+         (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
+        (!Aligned || (CN->getZExtValue() & 3) == 0)) {
       int Addr = (int)CN->getZExtValue();
 
       // Otherwise, break this down into an LIS + disp.
@@ -1116,9 +1183,10 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
   }
 
   Disp = DAG.getTargetConstant(0, getPointerTy());
-  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
+  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
     Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
-  else
+    fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
+  } else
     Base = N;
   return true;      // [r+0]
 }
@@ -1150,92 +1218,6 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
   return true;
 }
 
-/// SelectAddressRegImmShift - Returns true if the address N can be
-/// represented by a base register plus a signed 14-bit displacement
-/// [r+imm*4].  Suitable for use by STD and friends.
-bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
-                                                 SDValue &Base,
-                                                 SelectionDAG &DAG) const {
-  // FIXME dl should come from the parent load or store, not the address
-  DebugLoc dl = N.getDebugLoc();
-  // If this can be more profitably realized as r+r, fail.
-  if (SelectAddressRegReg(N, Disp, Base, DAG))
-    return false;
-
-  if (N.getOpcode() == ISD::ADD) {
-    short imm = 0;
-    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
-      Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
-      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
-        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
-      } else {
-        Base = N.getOperand(0);
-      }
-      return true; // [r+i]
-    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
-      // Match LOAD (ADD (X, Lo(G))).
-      assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
-             && "Cannot handle constant offsets yet!");
-      Disp = N.getOperand(1).getOperand(0);  // The global address.
-      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
-             Disp.getOpcode() == ISD::TargetConstantPool ||
-             Disp.getOpcode() == ISD::TargetJumpTable);
-      Base = N.getOperand(0);
-      return true;  // [&g+r]
-    }
-  } else if (N.getOpcode() == ISD::OR) {
-    short imm = 0;
-    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
-      // If this is an or of disjoint bitfields, we can codegen this as an add
-      // (for better address arithmetic) if the LHS and RHS of the OR are
-      // provably disjoint.
-      APInt LHSKnownZero, LHSKnownOne;
-      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
-      if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
-        // If all of the bits are known zero on the LHS or RHS, the add won't
-        // carry.
-        Base = N.getOperand(0);
-        Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
-        return true;
-      }
-    }
-  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
-    // Loading from a constant address.  Verify low two bits are clear.
-    if ((CN->getZExtValue() & 3) == 0) {
-      // If this address fits entirely in a 14-bit sext immediate field, codegen
-      // this as "d, 0"
-      short Imm;
-      if (isIntS16Immediate(CN, Imm)) {
-        Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
-        Base = DAG.getRegister(PPCSubTarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
-                               CN->getValueType(0));
-        return true;
-      }
-
-      // Fold the low-part of 32-bit absolute addresses into addr mode.
-      if (CN->getValueType(0) == MVT::i32 ||
-          (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
-        int Addr = (int)CN->getZExtValue();
-
-        // Otherwise, break this down into an LIS + disp.
-        Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);
-        Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
-        unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
-        Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base),0);
-        return true;
-      }
-    }
-  }
-
-  Disp = DAG.getTargetConstant(0, getPointerTy());
-  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
-    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
-  else
-    Base = N;
-  return true;      // [r+0]
-}
-
-
 /// getPreIndexedAddressParts - returns true by value, base pointer and
 /// offset pointer and addressing mode by reference if the node's address
 /// can be legally represented as pre-indexed load / store address.
@@ -1288,18 +1270,16 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
     return true;
   }
 
-  // LDU/STU use reg+imm*4, others use reg+imm.
+  // LDU/STU can only handle immediates that are a multiple of 4.
   if (VT != MVT::i64) {
-    // reg + imm
-    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false))
       return false;
   } else {
     // LDU/STU need an address with at least 4-byte alignment.
     if (Alignment < 4)
       return false;
 
-    // reg + imm * 4.
-    if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true))
       return false;
   }
 
@@ -1324,8 +1304,8 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
 /// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags.
 static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags,
                                unsigned &LoOpFlags, const GlobalValue *GV = 0) {
-  HiOpFlags = PPCII::MO_HA16;
-  LoOpFlags = PPCII::MO_LO16;
+  HiOpFlags = PPCII::MO_HA;
+  LoOpFlags = PPCII::MO_LO;
 
   // Don't use the pic base if not in PIC relocation model.  Or if we are on a
   // non-darwin platform.  We don't support PIC on other platforms yet.
@@ -1355,7 +1335,7 @@ static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
                              SelectionDAG &DAG) {
   EVT PtrVT = HiPart.getValueType();
   SDValue Zero = DAG.getConstant(0, PtrVT);
-  DebugLoc DL = HiPart.getDebugLoc();
+  SDLoc DL(HiPart);
 
   SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
   SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
@@ -1380,7 +1360,7 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
   // The actual address of the GlobalValue is stored in the TOC.
   if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
     SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
-    return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA,
                        DAG.getRegister(PPC::X2, MVT::i64));
   }
 
@@ -1401,7 +1381,7 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   // The actual address of the GlobalValue is stored in the TOC.
   if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
     SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
-    return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+    return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA,
                        DAG.getRegister(PPC::X2, MVT::i64));
   }
 
@@ -1428,8 +1408,12 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
 SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
 
+  // FIXME: TLS addresses currently use medium model code sequences,
+  // which is the most useful form.  Eventually support for small and
+  // large models could be added if users need it, at the cost of
+  // additional complexity.
   GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
   const GlobalValue *GV = GA->getGlobal();
   EVT PtrVT = getPointerTy();
   bool is64bit = PPCSubTarget.isPPC64();
@@ -1438,9 +1422,9 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
 
   if (Model == TLSModel::LocalExec) {
     SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
-                                               PPCII::MO_TPREL16_HA);
+                                               PPCII::MO_TPREL_HA);
     SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
-                                               PPCII::MO_TPREL16_LO);
+                                               PPCII::MO_TPREL_LO);
     SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
                                      is64bit ? MVT::i64 : MVT::i32);
     SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
@@ -1452,12 +1436,14 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
 
   if (Model == TLSModel::InitialExec) {
     SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
+    SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
+                                                PPCII::MO_TLS);
     SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
     SDValue TPOffsetHi = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
                                      PtrVT, GOTReg, TGA);
     SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
                                    PtrVT, TGA, TPOffsetHi);
-    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGA);
+    return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
   }
 
   if (Model == TLSModel::GeneralDynamic) {
@@ -1515,7 +1501,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
                                               SelectionDAG &DAG) const {
   EVT PtrVT = Op.getValueType();
   GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
-  DebugLoc DL = GSDN->getDebugLoc();
+  SDLoc DL(GSDN);
   const GlobalValue *GV = GSDN->getGlobal();
 
   // 64-bit SVR4 ABI code is always position-independent.
@@ -1546,7 +1532,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
 
 SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // If we're comparing for equality to zero, expose the fact that this is
   // implented as a ctlz/srl pair on ppc, so that the dag combiner can
@@ -1595,7 +1581,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
   SDValue InChain = Node->getOperand(0);
   SDValue VAListPtr = Node->getOperand(1);
   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
 
   assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
 
@@ -1695,6 +1681,18 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
                      false, false, false, 0);
 }
 
+SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG,
+                                       const PPCSubtarget &Subtarget) const {
+  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");
+
+  // We have to copy the entire va_list struct:
+  // 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
+  return DAG.getMemcpy(Op.getOperand(0), Op,
+                       Op.getOperand(1), Op.getOperand(2),
+                       DAG.getConstant(12, MVT::i32), 8, false, true,
+                       MachinePointerInfo(), MachinePointerInfo());
+}
+
 SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                   SelectionDAG &DAG) const {
   return Op.getOperand(0);
@@ -1706,7 +1704,7 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   bool isPPC64 = (PtrVT == MVT::i64);
@@ -1748,7 +1746,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
   MachineFunction &MF = DAG.getMachineFunction();
   PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
     // vastart just stores the address of the VarArgsFrameIndex slot into the
@@ -1842,18 +1840,24 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
 
 #include "PPCGenCallingConv.inc"
 
-static bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                       CCValAssign::LocInfo &LocInfo,
-                                       ISD::ArgFlagsTy &ArgFlags,
-                                       CCState &State) {
+// Function whose sole purpose is to kill compiler warnings 
+// stemming from unused functions included from PPCGenCallingConv.inc.
+CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
+  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
+}
+
+bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
   return true;
 }
 
-static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
-                                              MVT &LocVT,
-                                              CCValAssign::LocInfo &LocInfo,
-                                              ISD::ArgFlagsTy &ArgFlags,
-                                              CCState &State) {
+bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                             MVT &LocVT,
+                                             CCValAssign::LocInfo &LocInfo,
+                                             ISD::ArgFlagsTy &ArgFlags,
+                                             CCState &State) {
   static const uint16_t ArgRegs[] = {
     PPC::R3, PPC::R4, PPC::R5, PPC::R6,
     PPC::R7, PPC::R8, PPC::R9, PPC::R10,
@@ -1876,11 +1880,11 @@ static bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
   return false;
 }
 
-static bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
-                                                MVT &LocVT,
-                                                CCValAssign::LocInfo &LocInfo,
-                                                ISD::ArgFlagsTy &ArgFlags,
-                                                CCState &State) {
+bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                               MVT &LocVT,
+                                               CCValAssign::LocInfo &LocInfo,
+                                               ISD::ArgFlagsTy &ArgFlags,
+                                               CCState &State) {
   static const uint16_t ArgRegs[] = {
     PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
     PPC::F8
@@ -1931,7 +1935,7 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
                                         CallingConv::ID CallConv, bool isVarArg,
                                         const SmallVectorImpl<ISD::InputArg>
                                           &Ins,
-                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SDLoc dl, SelectionDAG &DAG,
                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
   if (PPCSubTarget.isSVR4ABI()) {
@@ -1953,7 +1957,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
                                       CallingConv::ID CallConv, bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg>
                                         &Ins,
-                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SDLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
 
   // 32-bit SVR4 ABI Stack Frame Layout:
@@ -2170,14 +2174,14 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
 SDValue
 PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
                                      SelectionDAG &DAG, SDValue ArgVal,
-                                     DebugLoc dl) const {
+                                     SDLoc dl) const {
   if (Flags.isSExt())
     ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
                          DAG.getValueType(ObjectVT));
   else if (Flags.isZExt())
     ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
                          DAG.getValueType(ObjectVT));
-  
+
   return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
 }
 
@@ -2213,7 +2217,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
                                       CallingConv::ID CallConv, bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg>
                                         &Ins,
-                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SDLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
   // TODO: add description of PPC stack frame format, or at least some docs.
   //
@@ -2304,6 +2308,13 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
         InVals.push_back(FIN);
         continue;
       }
+
+      unsigned BVAlign = Flags.getByValAlign();
+      if (BVAlign > 8) {
+        ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
+        CurArgOffset = ArgOffset;
+      }
+
       // All aggregates smaller than 8 bytes must be passed right-justified.
       if (ObjSize < PtrByteSize)
         CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
@@ -2502,7 +2513,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
                                       CallingConv::ID CallConv, bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg>
                                         &Ins,
-                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SDLoc dl, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
   // TODO: add description of PPC stack frame format, or at least some docs.
   //
@@ -2600,17 +2611,17 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
 
   SmallVector<SDValue, 8> MemOps;
   unsigned nAltivecParamsAtEnd = 0;
-  // FIXME: FuncArg and Ins[ArgNo] must reference the same argument.
-  // When passing anonymous aggregates, this is currently not true.
-  // See LowerFormalArguments_64SVR4 for a fix.
   Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
-  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
+  unsigned CurArgIdx = 0;
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
     SDValue ArgVal;
     bool needsLoad = false;
     EVT ObjectVT = Ins[ArgNo].VT;
     unsigned ObjSize = ObjectVT.getSizeInBits()/8;
     unsigned ArgSize = ObjSize;
     ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+    std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[ArgNo].OrigArgIndex;
 
     unsigned CurArgOffset = ArgOffset;
 
@@ -3002,9 +3013,9 @@ struct TailCallArgumentInfo {
 static void
 StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
                                            SDValue Chain,
-                   const SmallVector<TailCallArgumentInfo, 8> &TailCallArgs,
-                   SmallVector<SDValue, 8> &MemOpChains,
-                   DebugLoc dl) {
+                   const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
+                   SmallVectorImpl<SDValue> &MemOpChains,
+                   SDLoc dl) {
   for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
     SDValue Arg = TailCallArgs[i].Arg;
     SDValue FIN = TailCallArgs[i].FrameIdxOp;
@@ -3026,7 +3037,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
                                                int SPDiff,
                                                bool isPPC64,
                                                bool isDarwinABI,
-                                               DebugLoc dl) {
+                                               SDLoc dl) {
   if (SPDiff) {
     // Calculate the new stack slot for the return address.
     int SlotSize = isPPC64 ? 8 : 4;
@@ -3061,7 +3072,7 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
 static void
 CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
                          SDValue Arg, int SPDiff, unsigned ArgOffset,
-                      SmallVector<TailCallArgumentInfo, 8>& TailCallArguments) {
+                     SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
   int Offset = ArgOffset + SPDiff;
   uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8;
   int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
@@ -3083,7 +3094,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
                                                         SDValue &LROpOut,
                                                         SDValue &FPOpOut,
                                                         bool isDarwinABI,
-                                                        DebugLoc dl) const {
+                                                        SDLoc dl) const {
   if (SPDiff) {
     // Load the LR and FP stack slot for later adjusting.
     EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
@@ -3113,7 +3124,7 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
 static SDValue
 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
-                          DebugLoc dl) {
+                          SDLoc dl) {
   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                        false, false, MachinePointerInfo(0),
@@ -3126,9 +3137,9 @@ static void
 LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
                  SDValue Arg, SDValue PtrOff, int SPDiff,
                  unsigned ArgOffset, bool isPPC64, bool isTailCall,
-                 bool isVector, SmallVector<SDValue, 8> &MemOpChains,
-                 SmallVector<TailCallArgumentInfo, 8> &TailCallArguments,
-                 DebugLoc dl) {
+                 bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
+                 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments,
+                 SDLoc dl) {
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
   if (!isTailCall) {
     if (isVector) {
@@ -3149,9 +3160,9 @@ LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
 
 static
 void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
-                     DebugLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
+                     SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
                      SDValue LROp, SDValue FPOp, bool isDarwinABI,
-                     SmallVector<TailCallArgumentInfo, 8> &TailCallArguments) {
+                     SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
   MachineFunction &MF = DAG.getMachineFunction();
 
   // Emit a sequence of copyto/copyfrom virtual registers for arguments that
@@ -3171,15 +3182,15 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
 
   // Emit callseq_end just before tailcall node.
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag);
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
   InFlag = Chain.getValue(1);
 }
 
 static
 unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
-                     SDValue &Chain, DebugLoc dl, int SPDiff, bool isTailCall,
-                     SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
-                     SmallVector<SDValue, 8> &Ops, std::vector<EVT> &NodeTys,
+                     SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
+                     SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
+                     SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
                      const PPCSubtarget &PPCSubTarget) {
 
   bool isPPC64 = PPCSubTarget.isPPC64();
@@ -3363,7 +3374,7 @@ SDValue
 PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
-                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SDLoc dl, SelectionDAG &DAG,
                                    SmallVectorImpl<SDValue> &InVals) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
@@ -3406,7 +3417,7 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 }
 
 SDValue
-PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
+PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
                               bool isTailCall, bool isVarArg,
                               SelectionDAG &DAG,
                               SmallVector<std::pair<unsigned, SDValue>, 8>
@@ -3476,7 +3487,9 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
       // from allocating it), resulting in an additional register being
       // allocated and an unnecessary move instruction being generated.
       needsTOCRestore = true;
-    } else if ((CallOpc == PPCISD::CALL) && !isLocalCall(Callee)) {
+    } else if ((CallOpc == PPCISD::CALL) &&
+               (!isLocalCall(Callee) ||
+                DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
       // Otherwise insert NOP for non-local calls.
       CallOpc = PPCISD::CALL_NOP;
     }
@@ -3493,7 +3506,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
 
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                              DAG.getIntPtrConstant(BytesCalleePops, true),
-                             InFlag);
+                             InFlag, dl);
   if (!Ins.empty())
     InFlag = Chain.getValue(1);
 
@@ -3505,10 +3518,10 @@ SDValue
 PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
@@ -3542,7 +3555,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
   // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
   // of the 32-bit SVR4 ABI stack frame layout.
@@ -3628,7 +3641,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               dl);
   SDValue CallSeqStart = Chain;
 
   // Load the return address and frame pointer so it can be moved somewhere else
@@ -3679,7 +3693,8 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
 
       // This must go outside the CALLSEQ_START..END.
       SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                           CallSeqStart.getNode()->getOperand(1));
+                           CallSeqStart.getNode()->getOperand(1),
+                           SDLoc(MemcpyCall));
       DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                              NewCallSeqStart.getNode());
       Chain = CallSeqStart = NewCallSeqStart;
@@ -3755,13 +3770,14 @@ PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
                                               SDValue CallSeqStart,
                                               ISD::ArgFlagsTy Flags,
                                               SelectionDAG &DAG,
-                                              DebugLoc dl) const {
+                                              SDLoc dl) const {
   SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
                         CallSeqStart.getNode()->getOperand(0),
                         Flags, DAG, dl);
   // The MEMCPY must go outside the CALLSEQ_START..END.
   SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
-                             CallSeqStart.getNode()->getOperand(1));
+                             CallSeqStart.getNode()->getOperand(1),
+                             SDLoc(MemcpyCall));
   DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                          NewCallSeqStart.getNode());
   return NewCallSeqStart;
@@ -3774,7 +3790,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
 
   unsigned NumOps = Outs.size();
@@ -3815,7 +3831,8 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               dl);
   SDValue CallSeqStart = Chain;
 
   // Load the return address and frame pointer so it can be move somewhere else
@@ -3889,6 +3906,15 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       if (Size == 0)
         continue;
 
+      unsigned BVAlign = Flags.getByValAlign();
+      if (BVAlign > 8) {
+        if (BVAlign % PtrByteSize != 0)
+          llvm_unreachable(
+            "ByVal alignment is not a multiple of the pointer size");
+
+        ArgOffset = ((ArgOffset+BVAlign-1)/BVAlign)*BVAlign;
+      }
+
       // All aggregates smaller than 8 bytes must be passed right-justified.
       if (Size==1 || Size==2 || Size==4) {
         EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
@@ -3940,7 +3966,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
         // register.
         // FIXME: The memcpy seems to produce pretty awful code for
         // small aggregates, particularly for packed ones.
-        // FIXME: It would be preferable to use the slot in the 
+        // FIXME: It would be preferable to use the slot in the
         // parameter save area instead of a new local variable.
         SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
         SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
@@ -3980,7 +4006,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
       continue;
     }
 
-    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    switch (Arg.getSimpleValueType().SimpleTy) {
     default: llvm_unreachable("Unexpected ValueType for argument!");
     case MVT::i32:
     case MVT::i64:
@@ -4003,7 +4029,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
           // must be passed right-justified in the stack doubleword, and
           // in the GPR, if one is available.
           SDValue StoreOff;
-          if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) {
+          if (Arg.getSimpleValueType().SimpleTy == MVT::f32) {
             SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
             StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
           } else
@@ -4145,7 +4171,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
 
   unsigned NumOps = Outs.size();
@@ -4186,7 +4212,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
 
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               dl);
   SDValue CallSeqStart = Chain;
 
   // Load the return address and frame pointer so it can be move somewhere else
@@ -4310,7 +4337,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
       continue;
     }
 
-    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    switch (Arg.getSimpleValueType().SimpleTy) {
     default: llvm_unreachable("Unexpected ValueType for argument!");
     case MVT::i32:
     case MVT::i64:
@@ -4502,7 +4529,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
                                const SmallVectorImpl<ISD::OutputArg> &Outs,
                                const SmallVectorImpl<SDValue> &OutVals,
-                               DebugLoc dl, SelectionDAG &DAG) const {
+                               SDLoc dl, SelectionDAG &DAG) const {
 
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
@@ -4551,7 +4578,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
 SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
                                    const PPCSubtarget &Subtarget) const {
   // When we pop the dynamic allocation we need to restore the SP link.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Get the corect type for pointers.
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -4636,7 +4663,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   // Get the inputs.
   SDValue Chain = Op.getOperand(0);
   SDValue Size  = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Get the corect type for pointers.
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -4653,7 +4680,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 
 SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
                      DAG.getVTList(MVT::i32, MVT::Other),
                      Op.getOperand(0), Op.getOperand(1));
@@ -4661,7 +4688,7 @@ SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
 
 SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                 SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                      Op.getOperand(0), Op.getOperand(1));
 }
@@ -4687,7 +4714,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT CmpVT = Op.getOperand(0).getValueType();
   SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
   SDValue TV  = Op.getOperand(2), FV  = Op.getOperand(3);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // If the RHS of the comparison is a 0.0, we don't need to do the
   // subtraction at all.
@@ -4768,14 +4795,14 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 
 // FIXME: Split this code up when LegalizeDAGTypes lands.
 SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
-                                           DebugLoc dl) const {
+                                           SDLoc dl) const {
   assert(Op.getOperand(0).getValueType().isFloatingPoint());
   SDValue Src = Op.getOperand(0);
   if (Src.getValueType() == MVT::f32)
     Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
 
   SDValue Tmp;
-  switch (Op.getValueType().getSimpleVT().SimpleTy) {
+  switch (Op.getSimpleValueType().SimpleTy) {
   default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
   case MVT::i32:
     Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
@@ -4827,7 +4854,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
 
 SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   // Don't handle ppc_fp128 here; let it be lowered to a libcall.
   if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
     return SDValue();
@@ -4961,7 +4988,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
 
 SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                             SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   /*
    The rounding mode is in bits 30:31 of FPSR, and has the following
    settings:
@@ -5027,7 +5054,7 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
 SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   unsigned BitWidth = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   assert(Op.getNumOperands() == 3 &&
          VT == Op.getOperand(1).getValueType() &&
          "Unexpected SHL!");
@@ -5055,7 +5082,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned BitWidth = VT.getSizeInBits();
   assert(Op.getNumOperands() == 3 &&
          VT == Op.getOperand(1).getValueType() &&
@@ -5083,7 +5110,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT VT = Op.getValueType();
   unsigned BitWidth = VT.getSizeInBits();
   assert(Op.getNumOperands() == 3 &&
@@ -5118,7 +5145,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
 /// BuildSplatI - Build a canonical splati of Val with an element size of
 /// SplatSize.  Cast the result to VT.
 static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
-                             SelectionDAG &DAG, DebugLoc dl) {
+                             SelectionDAG &DAG, SDLoc dl) {
   assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
 
   static const EVT VTys[] = { // canonical VT to use for each size.
@@ -5142,10 +5169,20 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
   return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
 }
 
+/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op,
+                                SelectionDAG &DAG, SDLoc dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), Op);
+}
+
 /// BuildIntrinsicOp - Return a binary operator intrinsic node with the
 /// specified intrinsic ID.
 static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
-                                SelectionDAG &DAG, DebugLoc dl,
+                                SelectionDAG &DAG, SDLoc dl,
                                 EVT DestVT = MVT::Other) {
   if (DestVT == MVT::Other) DestVT = LHS.getValueType();
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
@@ -5156,7 +5193,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
 /// specified intrinsic ID.
 static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
                                 SDValue Op2, SelectionDAG &DAG,
-                                DebugLoc dl, EVT DestVT = MVT::Other) {
+                                SDLoc dl, EVT DestVT = MVT::Other) {
   if (DestVT == MVT::Other) DestVT = Op0.getValueType();
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
                      DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
@@ -5166,7 +5203,7 @@ static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
 /// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
 /// amount.  The result has the specified value type.
 static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
-                             EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+                             EVT VT, SelectionDAG &DAG, SDLoc dl) {
   // Force LHS/RHS to be the right type.
   LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
   RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
@@ -5185,7 +5222,7 @@ static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
 // sequence of ops that should be used.
 SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                              SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
   assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
 
@@ -5341,7 +5378,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
 /// the specified operations to build the shuffle.
 static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
                                       SDValue RHS, SelectionDAG &DAG,
-                                      DebugLoc dl) {
+                                      SDLoc dl) {
   unsigned OpNum = (PFEntry >> 26) & 0x0F;
   unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
   unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
@@ -5420,7 +5457,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
 /// lowered into a vperm.
 SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
                                                SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -5587,7 +5624,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                                                    SelectionDAG &DAG) const {
   // If this is a lowered altivec predicate compare, CompareOpc is set to the
   // opcode number of the comparison.
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   int CompareOpc;
   bool isDot;
   if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
@@ -5612,7 +5649,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
   // Now that we have the comparison, emit a copy from the CR to a GPR.
   // This is flagged to the above dot comparison.
-  SDValue Flags = DAG.getNode(PPCISD::MFCR, dl, MVT::i32,
+  SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
                                 DAG.getRegister(PPC::CR6, MVT::i32),
                                 CompNode.getValue(1));
 
@@ -5651,7 +5688,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
 SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
                                                    SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   // Create a stack slot that is 16-byte aligned.
   MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
   int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
@@ -5668,7 +5705,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
 }
 
 SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   if (Op.getValueType() == MVT::v4i32) {
     SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
 
@@ -5745,6 +5782,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VAARG:
     return LowerVAARG(Op, DAG, PPCSubTarget);
 
+  case ISD::VACOPY:
+    return LowerVACOPY(Op, DAG, PPCSubTarget);
+
   case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
@@ -5755,7 +5795,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
   case ISD::FP_TO_UINT:
   case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
-                                                       Op.getDebugLoc());
+                                                       SDLoc(Op));
   case ISD::UINT_TO_FP:
   case ISD::SINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
   case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
@@ -5772,6 +5812,9 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
   case ISD::MUL:                return LowerMUL(Op, DAG);
 
+  // For counter-based loop handling.
+  case ISD::INTRINSIC_W_CHAIN:  return SDValue();
+
   // Frame & Return address.
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
@@ -5782,10 +5825,26 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue>&Results,
                                            SelectionDAG &DAG) const {
   const TargetMachine &TM = getTargetMachine();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
+  case ISD::INTRINSIC_W_CHAIN: {
+    if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
+        Intrinsic::ppc_is_decremented_ctr_nonzero)
+      break;
+
+    assert(N->getValueType(0) == MVT::i1 &&
+           "Unexpected result type for CTR decrement intrinsic");
+    EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0));
+    SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
+    SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
+                                 N->getOperand(1)); 
+
+    Results.push_back(NewInt);
+    Results.push_back(NewInt.getValue(1));
+    break;
+  }
   case ISD::VAARG: {
     if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
         || TM.getSubtarget<PPCSubtarget>().isPPC64())
@@ -5821,6 +5880,9 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::FP_TO_SINT:
+    // LowerFP_TO_INT() can only handle f32 and f64.
+    if (N->getOperand(0).getValueType() == MVT::ppcf128)
+      return;
     Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
     return;
   }
@@ -6092,6 +6154,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   // thisMBB:
   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+  const int64_t BPOffset    = 4 * PVT.getStoreSize();
 
   // Prepare IP either in reg.
   const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
@@ -6101,15 +6164,32 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   if (PPCSubTarget.isPPC64() && PPCSubTarget.isSVR4ABI()) {
     MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
             .addReg(PPC::X2)
-            .addImm(TOCOffset / 4)
+            .addImm(TOCOffset)
             .addReg(BufReg);
-
     MIB.setMemRefs(MMOBegin, MMOEnd);
   }
 
+  // Naked functions never have a base pointer, and so we use r1. For all
+  // other functions, this decision must be delayed until during PEI.
+  unsigned BaseReg;
+  if (MF->getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::Naked))
+    BaseReg = PPCSubTarget.isPPC64() ? PPC::X1 : PPC::R1;
+  else
+    BaseReg = PPCSubTarget.isPPC64() ? PPC::BP8 : PPC::BP;
+
+  MIB = BuildMI(*thisMBB, MI, DL,
+                TII->get(PPCSubTarget.isPPC64() ? PPC::STD : PPC::STW))
+          .addReg(BaseReg)
+          .addImm(BPOffset)
+          .addReg(BufReg);
+  MIB.setMemRefs(MMOBegin, MMOEnd);
+
   // Setup
   MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
-  MIB.addRegMask(PPCRegInfo->getNoPreservedMask());
+  const PPCRegisterInfo *TRI =
+    static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
+  MIB.addRegMask(TRI->getNoPreservedMask());
 
   BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);
 
@@ -6129,7 +6209,7 @@ PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   if (PPCSubTarget.isPPC64()) {
     MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
             .addReg(LabelReg)
-            .addImm(LabelOffset / 4)
+            .addImm(LabelOffset)
             .addReg(BufReg);
   } else {
     MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
@@ -6176,12 +6256,14 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
   unsigned FP  = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
   unsigned SP  = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
+  unsigned BP  = (PVT == MVT::i64) ? PPC::X30 : PPC::R30;
 
   MachineInstrBuilder MIB;
 
   const int64_t LabelOffset = 1 * PVT.getStoreSize();
   const int64_t SPOffset    = 2 * PVT.getStoreSize();
   const int64_t TOCOffset   = 3 * PVT.getStoreSize();
+  const int64_t BPOffset    = 4 * PVT.getStoreSize();
 
   unsigned BufReg = MI->getOperand(0).getReg();
 
@@ -6202,7 +6284,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   // Reload IP
   if (PVT == MVT::i64) {
     MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
-            .addImm(LabelOffset / 4)
+            .addImm(LabelOffset)
             .addReg(BufReg);
   } else {
     MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
@@ -6214,7 +6296,7 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   // Reload SP
   if (PVT == MVT::i64) {
     MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
-            .addImm(SPOffset / 4)
+            .addImm(SPOffset)
             .addReg(BufReg);
   } else {
     MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
@@ -6223,13 +6305,22 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
   }
   MIB.setMemRefs(MMOBegin, MMOEnd);
 
-  // FIXME: When we also support base pointers, that register must also be
-  // restored here.
+  // Reload BP
+  if (PVT == MVT::i64) {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
+            .addImm(BPOffset)
+            .addReg(BufReg);
+  } else {
+    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
+            .addImm(BPOffset)
+            .addReg(BufReg);
+  }
+  MIB.setMemRefs(MMOBegin, MMOEnd);
 
   // Reload TOC
   if (PVT == MVT::i64 && PPCSubTarget.isSVR4ABI()) {
     MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
-            .addImm(TOCOffset / 4)
+            .addImm(TOCOffset)
             .addReg(BufReg);
 
     MIB.setMemRefs(MMOBegin, MMOEnd);
@@ -6272,8 +6363,10 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     Cond.push_back(MI->getOperand(1));
 
     DebugLoc dl = MI->getDebugLoc();
-    PPCII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(), Cond,
-                        MI->getOperand(2).getReg(), MI->getOperand(3).getReg());
+    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+    TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
+                      Cond, MI->getOperand(2).getReg(),
+                      MI->getOperand(3).getReg());
   } else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
              MI->getOpcode() == PPC::SELECT_CC_I8 ||
              MI->getOpcode() == PPC::SELECT_CC_F4 ||
@@ -6633,51 +6726,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 
     // Restore FPSCR value.
     BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
-  } else if (MI->getOpcode() == PPC::FRINDrint ||
-             MI->getOpcode() == PPC::FRINSrint) {
-    bool isf32 = MI->getOpcode() == PPC::FRINSrint;
-    unsigned Dest = MI->getOperand(0).getReg();
-    unsigned Src = MI->getOperand(1).getReg();
-    DebugLoc dl   = MI->getDebugLoc();
-
-    MachineRegisterInfo &RegInfo = F->getRegInfo();
-    unsigned CRReg = RegInfo.createVirtualRegister(&PPC::CRRCRegClass);
-
-    // Perform the rounding.
-    BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FRINS : PPC::FRIND), Dest)
-      .addReg(Src);
-
-    // Compare the results.
-    BuildMI(*BB, MI, dl, TII->get(isf32 ? PPC::FCMPUS : PPC::FCMPUD), CRReg)
-      .addReg(Dest).addReg(Src);
-
-    // If the results were not equal, then set the FPSCR XX bit.
-    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
-    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
-    F->insert(It, midMBB);
-    F->insert(It, exitMBB);
-    exitMBB->splice(exitMBB->begin(), BB,
-                    llvm::next(MachineBasicBlock::iterator(MI)),
-                    BB->end());
-    exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-    BuildMI(*BB, MI, dl, TII->get(PPC::BCC))
-      .addImm(PPC::PRED_EQ).addReg(CRReg).addMBB(exitMBB);
-
-    BB->addSuccessor(midMBB);
-    BB->addSuccessor(exitMBB);
-
-    BB = midMBB;
-
-    // Set the FPSCR XX bit (FE_INEXACT). Note that we cannot just set
-    // the FI bit here because that will not automatically set XX also,
-    // and XX is what libm interprets as the FE_INEXACT flag.
-    BuildMI(BB, dl, TII->get(PPC::MTFSB1)).addImm(/* 38 - 32 = */ 6);
-    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
-
-    BB->addSuccessor(exitMBB);
-
-    BB = exitMBB;
   } else {
     llvm_unreachable("Unexpected instr type to insert");
   }
@@ -6717,7 +6765,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
       ++Iterations;
 
     SelectionDAG &DAG = DCI.DAG;
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
 
     SDValue FPOne =
       DAG.getConstantFP(1.0, VT.getScalarType());
@@ -6779,7 +6827,7 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
       ++Iterations;
 
     SelectionDAG &DAG = DCI.DAG;
-    DebugLoc dl = Op.getDebugLoc();
+    SDLoc dl(Op);
 
     SDValue FPThreeHalves =
       DAG.getConstantFP(1.5, VT.getScalarType());
@@ -6823,11 +6871,120 @@ SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
   return SDValue();
 }
 
+// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
+// not enforce equality of the chain operands.
+static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
+                            unsigned Bytes, int Dist,
+                            SelectionDAG &DAG) {
+  EVT VT = LS->getMemoryVT();
+  if (VT.getSizeInBits() / 8 != Bytes)
+    return false;
+
+  SDValue Loc = LS->getBasePtr();
+  SDValue BaseLoc = Base->getBasePtr();
+  if (Loc.getOpcode() == ISD::FrameIndex) {
+    if (BaseLoc.getOpcode() != ISD::FrameIndex)
+      return false;
+    const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
+    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+    int FS  = MFI->getObjectSize(FI);
+    int BFS = MFI->getObjectSize(BFI);
+    if (FS != BFS || FS != (int)Bytes) return false;
+    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+  }
+
+  // Handle X+C
+  if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
+      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
+    return true;
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const GlobalValue *GV1 = NULL;
+  const GlobalValue *GV2 = NULL;
+  int64_t Offset1 = 0;
+  int64_t Offset2 = 0;
+  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+  if (isGA1 && isGA2 && GV1 == GV2)
+    return Offset1 == (Offset2 + Dist*Bytes);
+  return false;
+}
+
+// Return true is there is a nearyby consecutive load to the one provided
+// (regardless of alignment). We search up and down the chain, looking though
+// token factors and other loads (but nothing else). As a result, a true
+// results indicates that it is safe to create a new consecutive load adjacent
+// to the load provided.
+static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
+  SDValue Chain = LD->getChain();
+  EVT VT = LD->getMemoryVT();
+
+  SmallSet<SDNode *, 16> LoadRoots;
+  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
+  SmallSet<SDNode *, 16> Visited;
+
+  // First, search up the chain, branching to follow all token-factor operands.
+  // If we find a consecutive load, then we're done, otherwise, record all
+  // nodes just above the top-level loads and token factors.
+  while (!Queue.empty()) {
+    SDNode *ChainNext = Queue.pop_back_val();
+    if (!Visited.insert(ChainNext))
+      continue;
+
+    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
+      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+        return true;
+
+      if (!Visited.count(ChainLD->getChain().getNode()))
+        Queue.push_back(ChainLD->getChain().getNode());
+    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
+      for (SDNode::op_iterator O = ChainNext->op_begin(),
+           OE = ChainNext->op_end(); O != OE; ++O)
+        if (!Visited.count(O->getNode()))
+          Queue.push_back(O->getNode());
+    } else
+      LoadRoots.insert(ChainNext);
+  }
+
+  // Second, search down the chain, starting from the top-level nodes recorded
+  // in the first phase. These top-level nodes are the nodes just above all
+  // loads and token factors. Starting with their uses, recursively look though
+  // all loads (just the chain uses) and token factors to find a consecutive
+  // load.
+  Visited.clear();
+  Queue.clear();
+
+  for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
+       IE = LoadRoots.end(); I != IE; ++I) {
+    Queue.push_back(*I);
+       
+    while (!Queue.empty()) {
+      SDNode *LoadRoot = Queue.pop_back_val();
+      if (!Visited.insert(LoadRoot))
+        continue;
+
+      if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
+        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
+          return true;
+
+      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
+           UE = LoadRoot->use_end(); UI != UE; ++UI)
+        if (((isa<LoadSDNode>(*UI) &&
+            cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
+            UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
+          Queue.push_back(*UI);
+    }
+  }
+
+  return false;
+}
+
 SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   const TargetMachine &TM = getTargetMachine();
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   switch (N->getOpcode()) {
   default: break;
   case PPCISD::SHL:
@@ -6868,7 +7025,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                  DCI);
       if (RV.getNode() != 0) {
         DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_EXTEND, N->getOperand(1).getDebugLoc(),
+        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
                          N->getValueType(0), RV);
         DCI.AddToWorklist(RV.getNode());
         return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
@@ -6881,7 +7038,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                  DCI);
       if (RV.getNode() != 0) {
         DCI.AddToWorklist(RV.getNode());
-        RV = DAG.getNode(ISD::FP_ROUND, N->getOperand(1).getDebugLoc(),
+        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
                          N->getValueType(0), RV,
                          N->getOperand(1).getOperand(1));
         DCI.AddToWorklist(RV.getNode());
@@ -6909,8 +7066,28 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     if (RV.getNode() != 0) {
       DCI.AddToWorklist(RV.getNode());
       RV = DAGCombineFastRecip(RV, DCI);
-      if (RV.getNode() != 0)
+      if (RV.getNode() != 0) {
+	// Unfortunately, RV is now NaN if the input was exactly 0. Select out
+	// this case and force the answer to 0.
+
+        EVT VT = RV.getValueType();
+
+        SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
+        if (VT.isVector()) {
+          assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
+          Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
+        }
+
+        SDValue ZeroCmp =
+          DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
+                       N->getOperand(0), Zero, ISD::SETEQ);
+        DCI.AddToWorklist(ZeroCmp.getNode());
+        DCI.AddToWorklist(RV.getNode());
+
+        RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
+                         ZeroCmp, Zero, RV);
         return RV;
+      }
     }
 
     }
@@ -6999,6 +7176,160 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                 cast<StoreSDNode>(N)->getMemOperand());
     }
     break;
+  case ISD::LOAD: {
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT VT = LD->getValueType(0);
+    Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
+    if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
+        TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
+        (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+         VT == MVT::v4i32 || VT == MVT::v4f32) &&
+        LD->getAlignment() < ABIAlignment) {
+      // This is a type-legal unaligned Altivec load.
+      SDValue Chain = LD->getChain();
+      SDValue Ptr = LD->getBasePtr();
+
+      // This implements the loading of unaligned vectors as described in
+      // the venerable Apple Velocity Engine overview. Specifically:
+      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
+      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
+      //
+      // The general idea is to expand a sequence of one or more unaligned
+      // loads into a alignment-based permutation-control instruction (lvsl),
+      // a series of regular vector loads (which always truncate their
+      // input address to an aligned address), and a series of permutations.
+      // The results of these permutations are the requested loaded values.
+      // The trick is that the last "extra" load is not taken from the address
+      // you might suspect (sizeof(vector) bytes after the last requested
+      // load), but rather sizeof(vector) - 1 bytes after the last
+      // requested vector. The point of this is to avoid a page fault if the
+      // base address happend to be aligned. This works because if the base
+      // address is aligned, then adding less than a full vector length will
+      // cause the last vector in the sequence to be (re)loaded. Otherwise,
+      // the next vector will be fetched as you might suspect was necessary.
+
+      // We might be able to reuse the permutation generation from
+      // a different base address offset from this one by an aligned amount.
+      // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
+      // optimization later.
+      SDValue PermCntl = BuildIntrinsicOp(Intrinsic::ppc_altivec_lvsl, Ptr,
+                                          DAG, dl, MVT::v16i8);
+
+      // Refine the alignment of the original load (a "new" load created here
+      // which was identical to the first except for the alignment would be
+      // merged with the existing node regardless).
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineMemOperand *MMO =
+        MF.getMachineMemOperand(LD->getPointerInfo(),
+                                LD->getMemOperand()->getFlags(),
+                                LD->getMemoryVT().getStoreSize(),
+                                ABIAlignment);
+      LD->refineAlignment(MMO);
+      SDValue BaseLoad = SDValue(LD, 0);
+
+      // Note that the value of IncOffset (which is provided to the next
+      // load's pointer info offset value, and thus used to calculate the
+      // alignment), and the value of IncValue (which is actually used to
+      // increment the pointer value) are different! This is because we
+      // require the next load to appear to be aligned, even though it
+      // is actually offset from the base pointer by a lesser amount.
+      int IncOffset = VT.getSizeInBits() / 8;
+      int IncValue = IncOffset;
+
+      // Walk (both up and down) the chain looking for another load at the real
+      // (aligned) offset (the alignment of the other load does not matter in
+      // this case). If found, then do not use the offset reduction trick, as
+      // that will prevent the loads from being later combined (as they would
+      // otherwise be duplicates).
+      if (!findConsecutiveLoad(LD, DAG))
+        --IncValue;
+
+      SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
+      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+
+      SDValue ExtraLoad =
+        DAG.getLoad(VT, dl, Chain, Ptr,
+                    LD->getPointerInfo().getWithOffset(IncOffset),
+                    LD->isVolatile(), LD->isNonTemporal(),
+                    LD->isInvariant(), ABIAlignment);
+
+      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+        BaseLoad.getValue(1), ExtraLoad.getValue(1));
+
+      if (BaseLoad.getValueType() != MVT::v4i32)
+        BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);
+
+      if (ExtraLoad.getValueType() != MVT::v4i32)
+        ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);
+
+      SDValue Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
+                                      BaseLoad, ExtraLoad, PermCntl, DAG, dl);
+
+      if (VT != MVT::v4i32)
+        Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);
+
+      // Now we need to be really careful about how we update the users of the
+      // original load. We cannot just call DCI.CombineTo (or
+      // DAG.ReplaceAllUsesWith for that matter), because the load still has
+      // uses created here (the permutation for example) that need to stay.
+      SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+      while (UI != UE) {
+        SDUse &Use = UI.getUse();
+        SDNode *User = *UI;
+        // Note: BaseLoad is checked here because it might not be N, but a
+        // bitcast of N.
+        if (User == Perm.getNode() || User == BaseLoad.getNode() ||
+            User == TF.getNode() || Use.getResNo() > 1) {
+          ++UI;
+          continue;
+        }
+
+        SDValue To = Use.getResNo() ? TF : Perm;
+        ++UI;
+
+        SmallVector<SDValue, 8> Ops;
+        for (SDNode::op_iterator O = User->op_begin(),
+             OE = User->op_end(); O != OE; ++O) {
+          if (*O == Use)
+            Ops.push_back(To);
+          else
+            Ops.push_back(*O);
+        }
+
+        DAG.UpdateNodeOperands(User, Ops.data(), Ops.size());
+      }
+
+      return SDValue(N, 0);
+    }
+    }
+    break;
+  case ISD::INTRINSIC_WO_CHAIN:
+    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() ==
+          Intrinsic::ppc_altivec_lvsl &&
+        N->getOperand(1)->getOpcode() == ISD::ADD) {
+      SDValue Add = N->getOperand(1);
+
+      if (DAG.MaskedValueIsZero(Add->getOperand(1),
+            APInt::getAllOnesValue(4 /* 16 byte alignment */).zext(
+              Add.getValueType().getScalarType().getSizeInBits()))) {
+        SDNode *BasePtr = Add->getOperand(0).getNode();
+        for (SDNode::use_iterator UI = BasePtr->use_begin(),
+             UE = BasePtr->use_end(); UI != UE; ++UI) {
+          if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
+              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
+                Intrinsic::ppc_altivec_lvsl) {
+            // We've found another LVSL, and this address if an aligned
+            // multiple of that one. The results will be the same, so use the
+            // one we've just found instead.
+
+            return SDValue(*UI, 0);
+          }
+        }
+      }
+    }
+
+    break;
   case ISD::BSWAP:
     // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
     if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
@@ -7083,20 +7414,53 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         }
       }
 
-      // If the user is a MFCR instruction, we know this is safe.  Otherwise we
-      // give up for right now.
-      if (FlagUser->getOpcode() == PPCISD::MFCR)
+      // If the user is a MFOCRF instruction, we know this is safe.
+      // Otherwise we give up for right now.
+      if (FlagUser->getOpcode() == PPCISD::MFOCRF)
         return SDValue(VCMPoNode, 0);
     }
     break;
   }
   case ISD::BR_CC: {
     // If this is a branch on an altivec predicate comparison, lower this so
-    // that we don't have to do a MFCR: instead, branch directly on CR6.  This
+    // that we don't have to do a MFOCRF: instead, branch directly on CR6.  This
     // lowering is done pre-legalize, because the legalizer lowers the predicate
     // compare down to code that is difficult to reassemble.
     ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
     SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
+
+    // Sometimes the promoted value of the intrinsic is ANDed by some non-zero
+    // value. If so, pass-through the AND to get to the intrinsic.
+    if (LHS.getOpcode() == ISD::AND &&
+        LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+        isa<ConstantSDNode>(LHS.getOperand(1)) &&
+        !cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()->
+          isZero())
+      LHS = LHS.getOperand(0);
+
+    if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
+        cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
+          Intrinsic::ppc_is_decremented_ctr_nonzero &&
+        isa<ConstantSDNode>(RHS)) {
+      assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+             "Counter decrement comparison is not EQ or NE");
+
+      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
+      bool isBDNZ = (CC == ISD::SETEQ && Val) ||
+                    (CC == ISD::SETNE && !Val);
+
+      // We now need to make the intrinsic dead (it cannot be instruction
+      // selected).
+      DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
+      assert(LHS.getNode()->hasOneUse() &&
+             "Counter decrement has more than one use");
+
+      return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
+                         N->getOperand(0), N->getOperand(4));
+    }
+
     int CompareOpc;
     bool isDot;
 
@@ -7271,7 +7635,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
 
 std::pair<unsigned, const TargetRegisterClass*>
 PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                EVT VT) const {
+                                                MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC RS6000 Constraint Letters
     switch (Constraint[0]) {
@@ -7296,7 +7660,24 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
     }
   }
 
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+  std::pair<unsigned, const TargetRegisterClass*> R =
+    TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
+  // (which we call X[0-9]+). If a 64-bit value has been requested, and a
+  // 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
+  // register.
+  // FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
+  // the AsmName field from *RegisterInfo.td, then this would not be necessary.
+  if (R.first && VT == MVT::i64 && PPCSubTarget.isPPC64() &&
+      PPC::GPRCRegClass.contains(R.first)) {
+    const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+    return std::make_pair(TRI->getMatchingSuperReg(R.first,
+                            PPC::sub_32, &PPC::G8RCRegClass),
+                          &PPC::G8RCRegClass);
+  }
+
+  return R;
 }
 
 
@@ -7406,25 +7787,13 @@ bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
   return true;
 }
 
-/// isLegalAddressImmediate - Return true if the integer value can be used
-/// as the offset of the target addressing mode for load / store of the
-/// given type.
-bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,Type *Ty) const{
-  // PPC allows a sign-extended 16-bit immediate field.
-  return (V > -(1 << 16) && V < (1 << 16)-1);
-}
-
-bool PPCTargetLowering::isLegalAddressImmediate(GlobalValue* GV) const {
-  return false;
-}
-
 SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                            SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MFI->setReturnAddressIsTaken(true);
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 
   // Make sure the function does not optimize away the store of the RA to
@@ -7454,7 +7823,7 @@ SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
 
 SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
 
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -7537,18 +7906,15 @@ bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
   return true;
 }
 
-/// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-/// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-/// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-/// is expanded to mul + add.
-bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
   if (!VT.isSimple())
     return false;
 
   switch (VT.getSimpleVT().SimpleTy) {
   case MVT::f32:
   case MVT::f64:
-  case MVT::v4f32:
     return true;
   default:
     break;
@@ -7558,9 +7924,15 @@ bool PPCTargetLowering::isFMAFasterThanMulAndAdd(EVT VT) const {
 }
 
 Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
-  if (DisableILPPref)
+  if (DisableILPPref || PPCSubTarget.enableMachineScheduler())
     return TargetLowering::getSchedulingPreference(N);
 
   return Sched::ILP;
 }
 
+// Create a fast isel object.
+FastISel *
+PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
+                                  const TargetLibraryInfo *LibInfo) const {
+  return PPC::createFastISel(FuncInfo, LibInfo);
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 423e983..df3af35 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -20,6 +20,7 @@
 #include "PPCRegisterInfo.h"
 #include "PPCSubtarget.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
@@ -115,11 +116,10 @@ namespace llvm {
       /// Return with a flag operand, matched by 'blr'
       RET_FLAG,
 
-      /// R32 = MFCR(CRREG, INFLAG) - Represents the MFCRpseud/MFOCRF
-      /// instructions.  This copies the bits corresponding to the specified
-      /// CRREG into the resultant GPR.  Bits corresponding to other CR regs
-      /// are undefined.
-      MFCR,
+      /// R32 = MFOCRF(CRREG, INFLAG) - Represents the MFOCRF instruction.
+      /// This copies the bits corresponding to the specified CRREG into the
+      /// resultant GPR.  Bits corresponding to other CR regs are undefined.
+      MFOCRF,
 
       // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
       EH_SJLJ_SETJMP,
@@ -146,6 +146,10 @@ namespace llvm {
       /// an optional input flag argument.
       COND_BRANCH,
 
+      /// CHAIN = BDNZ CHAIN, DESTBB - These are used to create counter-based
+      /// loops.
+      BDNZ, BDZ,
+
       /// F8RC = FADDRTZ F8RC, F8RC - This is an FADD done with rounding
       /// towards zero.  Used only as part of the long double-to-int
       /// conversion sequence.
@@ -175,61 +179,61 @@ namespace llvm {
 
       /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec
       /// TLS model, produces an ADDIS8 instruction that adds the GOT
-      /// base to sym@got@tprel@ha.
+      /// base to sym\@got\@tprel\@ha.
       ADDIS_GOT_TPREL_HA,
 
       /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec
       /// TLS model, produces a LD instruction with base register G8RReg
-      /// and offset sym@got@tprel@l.  This completes the addition that
+      /// and offset sym\@got\@tprel\@l.  This completes the addition that
       /// finds the offset of "sym" relative to the thread pointer.
       LD_GOT_TPREL_L,
 
       /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS
       /// model, produces an ADD instruction that adds the contents of
       /// G8RReg to the thread pointer.  Symbol contains a relocation
-      /// sym@tls which is to be replaced by the thread pointer and
+      /// sym\@tls which is to be replaced by the thread pointer and
       /// identifies to the linker that the instruction is part of a
       /// TLS sequence.
       ADD_TLS,
 
       /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS
       /// model, produces an ADDIS8 instruction that adds the GOT base
-      /// register to sym@got@tlsgd@ha.
+      /// register to sym\@got\@tlsgd\@ha.
       ADDIS_TLSGD_HA,
 
       /// G8RC = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS
       /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym@got@tlsgd@l.
+      /// sym\@got\@tlsgd\@l.
       ADDI_TLSGD_L,
 
       /// G8RC = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS
-      /// model, produces a call to __tls_get_addr(sym@tlsgd).
+      /// model, produces a call to __tls_get_addr(sym\@tlsgd).
       GET_TLS_ADDR,
 
       /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS
       /// model, produces an ADDIS8 instruction that adds the GOT base
-      /// register to sym@got@tlsld@ha.
+      /// register to sym\@got\@tlsld\@ha.
       ADDIS_TLSLD_HA,
 
       /// G8RC = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS
       /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym@got@tlsld@l.
+      /// sym\@got\@tlsld\@l.
       ADDI_TLSLD_L,
 
       /// G8RC = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS
-      /// model, produces a call to __tls_get_addr(sym@tlsld).
+      /// model, produces a call to __tls_get_addr(sym\@tlsld).
       GET_TLSLD_ADDR,
 
       /// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the
       /// local-dynamic TLS model, produces an ADDIS8 instruction
-      /// that adds X3 to sym@dtprel@ha.  The Chain operand is needed 
+      /// that adds X3 to sym\@dtprel\@ha. The Chain operand is needed
       /// to tie this in place following a copy to %X3 from the result
       /// of a GET_TLSLD_ADDR.
       ADDIS_DTPREL_HA,
 
       /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS
       /// model, produces an ADDI8 instruction that adds G8RReg to
-      /// sym@got@dtprel@l.
+      /// sym\@got\@dtprel\@l.
       ADDI_DTPREL_L,
 
       /// VRRC = VADD_SPLAT Elt, EltSize - Temporary node to be expanded
@@ -238,6 +242,10 @@ namespace llvm {
       /// optimizations due to constant folding.
       VADD_SPLAT,
 
+      /// CHAIN = SC CHAIN, Imm128 - System call.  The 7-bit unsigned
+      /// operand identifies the operating system entry point.
+      SC,
+
       /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
       /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
       /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
@@ -266,16 +274,16 @@ namespace llvm {
 
       /// G8RC = ADDIS_TOC_HA %X2, Symbol - For medium and large code model,
       /// produces an ADDIS8 instruction that adds the TOC base register to
-      /// sym@toc@ha.
+      /// sym\@toc\@ha.
       ADDIS_TOC_HA,
 
       /// G8RC = LD_TOC_L Symbol, G8RReg - For medium and large code model,
       /// produces a LD instruction with base register G8RReg and offset
-      /// sym@toc@l.  Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
+      /// sym\@toc\@l. Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
       LD_TOC_L,
 
       /// G8RC = ADDI_TOC_L G8RReg, Symbol - For medium code model, produces
-      /// an ADDI8 instruction that adds G8RReg to sym@toc@l.
+      /// an ADDI8 instruction that adds G8RReg to sym\@toc\@l.
       /// Preceded by an ADDIS_TOC_HA to form a full 32-bit offset.
       ADDI_TOC_L
     };
@@ -327,8 +335,6 @@ namespace llvm {
 
   class PPCTargetLowering : public TargetLowering {
     const PPCSubtarget &PPCSubTarget;
-    const PPCRegisterInfo *PPCRegInfo;
-    const PPCInstrInfo *PPCII;
 
   public:
     explicit PPCTargetLowering(PPCTargetMachine &TM);
@@ -340,7 +346,7 @@ namespace llvm {
     virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
     /// getSetCCResultType - Return the ISD::SETCC ValueType
-    virtual EVT getSetCCResultType(EVT VT) const;
+    virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
 
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
@@ -358,21 +364,16 @@ namespace llvm {
 
     /// SelectAddressRegImm - Returns true if the address N can be represented
     /// by a base register plus a signed 16-bit displacement [r+imm], and if it
-    /// is not better represented as reg+reg.
+    /// is not better represented as reg+reg.  If Aligned is true, only accept
+    /// displacements suitable for STD and friends, i.e. multiples of 4.
     bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
-                             SelectionDAG &DAG) const;
+                             SelectionDAG &DAG, bool Aligned) const;
 
     /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
     /// represented as an indexed [r+r] operation.
     bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
                                  SelectionDAG &DAG) const;
 
-    /// SelectAddressRegImmShift - Returns true if the address N can be
-    /// represented by a base register plus a signed 14-bit displacement
-    /// [r+imm*4].  Suitable for use by STD and friends.
-    bool SelectAddressRegImmShift(SDValue N, SDValue &Disp, SDValue &Base,
-                                  SelectionDAG &DAG) const;
-
     Sched::Preference getSchedulingPreference(SDNode *N) const;
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
@@ -418,7 +419,7 @@ namespace llvm {
 
     std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   EVT VT) const;
+                                   MVT VT) const;
 
     /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
     /// function arguments in the caller parameter area.  This is the actual
@@ -436,15 +437,6 @@ namespace llvm {
     /// by AM is legal for this target, for a load/store of the specified type.
     virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
 
-    /// isLegalAddressImmediate - Return true if the integer value can be used
-    /// as the offset of the target addressing mode for load / store of the
-    /// given type.
-    virtual bool isLegalAddressImmediate(int64_t V, Type *Ty) const;
-
-    /// isLegalAddressImmediate - Return true if the GlobalValue can be used as
-    /// the offset of the target addressing mode.
-    virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
-
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
 
     /// getOptimalMemOpType - Returns the target specific optimal type for load
@@ -459,7 +451,7 @@ namespace llvm {
     /// It returns EVT::Other if the type should be determined using generic
     /// target-independent logic.
     virtual EVT
-    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, 
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                         bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                         MachineFunction &MF) const;
 
@@ -467,11 +459,16 @@ namespace llvm {
     /// relative to software emulation.
     virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast = 0) const;
 
-    /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-    /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-    /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-    /// is expanded to mul + add.
-    virtual bool isFMAFasterThanMulAndAdd(EVT VT) const;
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
+
+    /// createFastISel - This method returns a target-specific FastISel object,
+    /// or null if the target does not support "fast" instruction selection.
+    virtual FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                                     const TargetLibraryInfo *LibInfo) const;
 
   private:
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
@@ -490,7 +487,7 @@ namespace llvm {
                                          SDValue &LROpOut,
                                          SDValue &FPOpOut,
                                          bool isDarwinABI,
-                                         DebugLoc dl) const;
+                                         SDLoc dl) const;
 
     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
@@ -506,12 +503,14 @@ namespace llvm {
                          const PPCSubtarget &Subtarget) const;
     SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG,
                        const PPCSubtarget &Subtarget) const;
+    SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG,
+                        const PPCSubtarget &Subtarget) const;
     SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
                                 const PPCSubtarget &Subtarget) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
                                       const PPCSubtarget &Subtarget) const;
     SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, DebugLoc dl) const;
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, SDLoc dl) const;
     SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const;
@@ -526,9 +525,9 @@ namespace llvm {
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
+                            SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const;
-    SDValue FinishCall(CallingConv::ID CallConv, DebugLoc dl, bool isTailCall,
+    SDValue FinishCall(CallingConv::ID CallConv, SDLoc dl, bool isTailCall,
                        bool isVarArg,
                        SelectionDAG &DAG,
                        SmallVector<std::pair<unsigned, SDValue>, 8>
@@ -543,7 +542,7 @@ namespace llvm {
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
 
     virtual SDValue
@@ -561,11 +560,11 @@ namespace llvm {
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const;
 
     SDValue
       extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
-                        SDValue ArgVal, DebugLoc dl) const;
+                        SDValue ArgVal, SDLoc dl) const;
 
     void
       setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
@@ -576,25 +575,25 @@ namespace llvm {
       LowerFormalArguments_Darwin(SDValue Chain,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
-                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SDLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const;
     SDValue
       LowerFormalArguments_64SVR4(SDValue Chain,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
-                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SDLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const;
     SDValue
       LowerFormalArguments_32SVR4(SDValue Chain,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
-                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SDLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const;
 
     SDValue
       createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
                                  SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
-                                 SelectionDAG &DAG, DebugLoc dl) const;
+                                 SelectionDAG &DAG, SDLoc dl) const;
 
     SDValue
       LowerCall_Darwin(SDValue Chain, SDValue Callee,
@@ -603,7 +602,7 @@ namespace llvm {
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
                        const SmallVectorImpl<ISD::InputArg> &Ins,
-                       DebugLoc dl, SelectionDAG &DAG,
+                       SDLoc dl, SelectionDAG &DAG,
                        SmallVectorImpl<SDValue> &InVals) const;
     SDValue
       LowerCall_64SVR4(SDValue Chain, SDValue Callee,
@@ -612,7 +611,7 @@ namespace llvm {
                        const SmallVectorImpl<ISD::OutputArg> &Outs,
                        const SmallVectorImpl<SDValue> &OutVals,
                        const SmallVectorImpl<ISD::InputArg> &Ins,
-                       DebugLoc dl, SelectionDAG &DAG,
+                       SDLoc dl, SelectionDAG &DAG,
                        SmallVectorImpl<SDValue> &InVals) const;
     SDValue
     LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
@@ -620,7 +619,7 @@ namespace llvm {
                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                      const SmallVectorImpl<SDValue> &OutVals,
                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                     DebugLoc dl, SelectionDAG &DAG,
+                     SDLoc dl, SelectionDAG &DAG,
                      SmallVectorImpl<SDValue> &InVals) const;
 
     SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -628,7 +627,31 @@ namespace llvm {
 
     SDValue DAGCombineFastRecip(SDValue Op, DAGCombinerInfo &DCI) const;
     SDValue DAGCombineFastRecipFSQRT(SDValue Op, DAGCombinerInfo &DCI) const;
+
+    CCAssignFn *useFastISelCCs(unsigned Flag) const;
   };
+
+  namespace PPC {
+    FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                             const TargetLibraryInfo *LibInfo);
+  }
+
+  bool CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                  CCValAssign::LocInfo &LocInfo,
+                                  ISD::ArgFlagsTy &ArgFlags,
+                                  CCState &State);
+
+  bool CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
+                                         MVT &LocVT,
+                                         CCValAssign::LocInfo &LocInfo,
+                                         ISD::ArgFlagsTy &ArgFlags,
+                                         CCState &State);
+
+  bool CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
+                                           MVT &LocVT,
+                                           CCValAssign::LocInfo &LocInfo,
+                                           ISD::ArgFlagsTy &ArgFlags,
+                                           CCState &State);
 }
 
 #endif   // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index bff4c23..46db4fe 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -17,29 +17,39 @@
 //
 def s16imm64 : Operand<i64> {
   let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
   let ParserMatchClass = PPCS16ImmAsmOperand;
 }
 def u16imm64 : Operand<i64> {
   let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
   let ParserMatchClass = PPCU16ImmAsmOperand;
 }
-def symbolHi64 : Operand<i64> {
-  let PrintMethod = "printSymbolHi";
-  let EncoderMethod = "getHA16Encoding";
-  let ParserMatchClass = PPCS16ImmAsmOperand;
-}
-def symbolLo64 : Operand<i64> {
-  let PrintMethod = "printSymbolLo";
-  let EncoderMethod = "getLO16Encoding";
-  let ParserMatchClass = PPCS16ImmAsmOperand;
+def s17imm64 : Operand<i64> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler.  The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
 }
 def tocentry : Operand<iPTR> {
   let MIOperandInfo = (ops i64imm:$imm);
 }
+def PPCTLSRegOperand : AsmOperandClass {
+  let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
+  let RenderMethod = "addTLSRegOperands";
+}
 def tlsreg : Operand<i64> {
   let EncoderMethod = "getTLSRegEncoding";
+  let ParserMatchClass = PPCTLSRegOperand;
 }
 def tlsgd : Operand<i64> {}
+def tlscall : Operand<i64> {
+  let PrintMethod = "printTLSCall";
+  let MIOperandInfo = (ops calltarget:$func, tlsgd:$sym);
+  let EncoderMethod = "getTLSCallEncoding";
+}
 
 //===----------------------------------------------------------------------===//
 // 64-bit transformation functions.
@@ -78,7 +88,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
 
     let isCodeGenOnly = 1 in
     def BCCTR8 : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
-                             "b${cond:cc}ctr ${cond:reg}", BrB, []>,
+                             "b${cond:cc}ctr${cond:pm} ${cond:reg}", BrB, []>,
         Requires<[In64BitMode]>;
   }
 }
@@ -111,7 +121,10 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
     def BL8  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
                      "bl $func", BrB, []>;  // See Pat patterns below.
 
-    def BLA8 : IForm<18, 1, 1, (outs), (ins aaddr:$func),
+    def BL8_TLS  : IForm<18, 0, 1, (outs), (ins tlscall:$func),
+                         "bl $func", BrB, []>;
+
+    def BLA8 : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
                      "bla $func", BrB, [(PPCcall (i64 imm:$func))]>;
   }
   let Uses = [RM], isCodeGenOnly = 1 in {
@@ -119,16 +132,12 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
                              (outs), (ins calltarget:$func),
                              "bl $func\n\tnop", BrB, []>;
 
-    def BL8_NOP_TLSGD : IForm_and_DForm_4_zero<18, 0, 1, 24,
-                                  (outs), (ins calltarget:$func, tlsgd:$sym),
-                                  "bl $func($sym)\n\tnop", BrB, []>;
-
-    def BL8_NOP_TLSLD : IForm_and_DForm_4_zero<18, 0, 1, 24,
-                                  (outs), (ins calltarget:$func, tlsgd:$sym),
-                                  "bl $func($sym)\n\tnop", BrB, []>;
+    def BL8_NOP_TLS : IForm_and_DForm_4_zero<18, 0, 1, 24,
+                                  (outs), (ins tlscall:$func),
+                                  "bl $func\n\tnop", BrB, []>;
 
     def BLA8_NOP : IForm_and_DForm_4_zero<18, 1, 1, 24,
-                             (outs), (ins aaddr:$func),
+                             (outs), (ins abscalltarget:$func),
                              "bla $func\n\tnop", BrB,
                              [(PPCcall_nop (i64 imm:$func))]>;
   }
@@ -139,7 +148,7 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
 
     let isCodeGenOnly = 1 in
     def BCCTRL8 : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
-                              "b${cond:cc}ctrl ${cond:reg}", BrB, []>,
+                              "b${cond:cc}ctrl${cond:pm} ${cond:reg}", BrB, []>,
         Requires<[In64BitMode]>;
   }
 }
@@ -207,7 +216,7 @@ def TCRETURNdi8 :Pseudo< (outs),
                  []>;
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
+def TCRETURNai8 :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
                  "#TC_RETURNa8 $func $offset",
                  [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
 
@@ -233,7 +242,7 @@ def TAILB8   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
 
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
     isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
-def TAILBA8   : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+def TAILBA8   : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
                   "ba $dst", BrB,
                   []>;
 
@@ -253,22 +262,26 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
 // 64-bit CR instructions
 let Interpretation64Bit = 1 in {
 let neverHasSideEffects = 1 in {
-def MTCRF8 : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins g8rc:$rS),
+def MTOCRF8: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins g8rc:$ST),
+                        "mtocrf $FXM, $ST", BrMCRX>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def MTCRF8 : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, g8rc:$rS),
                       "mtcrf $FXM, $rS", BrMCRX>,
             PPC970_MicroCode, PPC970_Unit_CRU;
 
-let isCodeGenOnly = 1 in
-def MFCR8pseud: XFXForm_3<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
-                       "#MFCR8pseud", SprMFCR>,
-            PPC970_MicroCode, PPC970_Unit_CRU;
-} // neverHasSideEffects = 1
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
+def MFOCRF8: XFXForm_5a<31, 19, (outs g8rc:$rT), (ins crbitm:$FXM),
+                        "mfocrf $rT, $FXM", SprMFCR>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
 
-let neverHasSideEffects = 1 in
 def MFCR8 : XFXForm_3<31, 19, (outs g8rc:$rT), (ins),
                      "mfcr $rT", SprMFCR>,
                      PPC970_MicroCode, PPC970_Unit_CRU;
+} // neverHasSideEffects = 1
 
 let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  let Defs = [CTR8] in
   def EH_SjLj_SetJmp64  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
                             "#EH_SJLJ_SETJMP64",
                             [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
@@ -293,8 +306,14 @@ def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
                            "mtctr $rS", SprMTSPR>,
              PPC970_DGroup_First, PPC970_Unit_FXU;
 }
+let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR8] in {
+let Pattern = [(int_ppc_mtctr i64:$rS)] in
+def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
+                               "mtctr $rS", SprMTSPR>,
+                 PPC970_DGroup_First, PPC970_Unit_FXU;
+}
 
-let Pattern = [(set i64:$rT, readcyclecounter)] in
+let isCodeGenOnly = 1, Pattern = [(set i64:$rT, readcyclecounter)] in
 def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs g8rc:$rT), (ins),
                           "mfspr $rT, 268", SprMFTB>,
             PPC970_DGroup_First, PPC970_Unit_FXU;
@@ -329,10 +348,10 @@ let Interpretation64Bit = 1 in {
 let neverHasSideEffects = 1 in {
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
-def LI8  : DForm_2_r0<14, (outs g8rc:$rD), (ins symbolLo64:$imm),
+def LI8  : DForm_2_r0<14, (outs g8rc:$rD), (ins s16imm64:$imm),
                       "li $rD, $imm", IntSimple,
-                      [(set i64:$rD, immSExt16:$imm)]>;
-def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins symbolHi64:$imm),
+                      [(set i64:$rD, imm64SExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs g8rc:$rD), (ins s17imm64:$imm),
                       "lis $rD, $imm", IntSimple,
                       [(set i64:$rD, imm16ShiftedSExt:$imm)]>;
 }
@@ -392,9 +411,8 @@ defm ADD8  : XOForm_1r<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                        [(set i64:$rT, (add i64:$rA, i64:$rB))]>;
 // ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the
 // initial-exec thread-local storage model.
-let isCodeGenOnly = 1 in
 def ADD8TLS  : XOForm_1<31, 266, 0, (outs g8rc:$rT), (ins g8rc:$rA, tlsreg:$rB),
-                        "add $rT, $rA, $rB@tls", IntSimple,
+                        "add $rT, $rA, $rB", IntSimple,
                         [(set i64:$rT, (add i64:$rA, tglobaltlsaddr:$rB))]>;
                      
 defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
@@ -404,18 +422,18 @@ defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
 let Defs = [CARRY] in
 def ADDIC8 : DForm_2<12, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
                      "addic $rD, $rA, $imm", IntGeneral,
-                     [(set i64:$rD, (addc i64:$rA, immSExt16:$imm))]>;
-def ADDI8  : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, symbolLo64:$imm),
+                     [(set i64:$rD, (addc i64:$rA, imm64SExt16:$imm))]>;
+def ADDI8  : DForm_2<14, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s16imm64:$imm),
                      "addi $rD, $rA, $imm", IntSimple,
-                     [(set i64:$rD, (add i64:$rA, immSExt16:$imm))]>;
-def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, symbolHi64:$imm),
+                     [(set i64:$rD, (add i64:$rA, imm64SExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs g8rc:$rD), (ins g8rc_nox0:$rA, s17imm64:$imm),
                      "addis $rD, $rA, $imm", IntSimple,
                      [(set i64:$rD, (add i64:$rA, imm16ShiftedSExt:$imm))]>;
 
 let Defs = [CARRY] in {
 def SUBFIC8: DForm_2< 8, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
                      "subfic $rD, $rA, $imm", IntGeneral,
-                     [(set i64:$rD, (subc immSExt16:$imm, i64:$rA))]>;
+                     [(set i64:$rD, (subc imm64SExt16:$imm, i64:$rA))]>;
 defm SUBFC8 : XOForm_1r<31, 8, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                         "subfc", "$rT, $rA, $rB", IntGeneral,
                         [(set i64:$rT, (subc i64:$rB, i64:$rA))]>,
@@ -489,6 +507,14 @@ defm EXTSH8 : XForm_11r<31, 922, (outs g8rc:$rA), (ins g8rc:$rS),
                         [(set i64:$rA, (sext_inreg i64:$rS, i16))]>;
 } // Interpretation64Bit
 
+// For fast-isel:
+let isCodeGenOnly = 1 in {
+def EXTSB8_32_64 : XForm_11<31, 954, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsb $rA, $rS", IntSimple, []>, isPPC64;
+def EXTSH8_32_64 : XForm_11<31, 922, (outs g8rc:$rA), (ins gprc:$rS),
+                           "extsh $rA, $rS", IntSimple, []>, isPPC64;
+} // isCodeGenOnly for fast-isel
+
 defm EXTSW  : XForm_11r<31, 986, (outs g8rc:$rA), (ins g8rc:$rS),
                         "extsw", "$rA, $rS", IntSimple,
                         [(set i64:$rA, (sext_inreg i64:$rS, i32))]>, isPPC64;
@@ -503,16 +529,16 @@ defm SRADI  : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH),
 defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS),
                         "cntlzd", "$rA, $rS", IntGeneral,
                         [(set i64:$rA, (ctlz i64:$rS))]>;
-defm POPCNTD : XForm_11r<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
-                         "popcntd", "$rA, $rS", IntGeneral,
-                         [(set i64:$rA, (ctpop i64:$rS))]>;
+def POPCNTD : XForm_11<31, 506, (outs g8rc:$rA), (ins g8rc:$rS),
+                       "popcntd $rA, $rS", IntGeneral,
+                       [(set i64:$rA, (ctpop i64:$rS))]>;
 
 // popcntw also does a population count on the high 32 bits (storing the
 // results in the high 32-bits of the output). We'll ignore that here (which is
 // safe because we never separately use the high part of the 64-bit registers).
-defm POPCNTW : XForm_11r<31, 378, (outs gprc:$rA), (ins gprc:$rS),
-                         "popcntw", "$rA, $rS", IntGeneral,
-                         [(set i32:$rA, (ctpop i32:$rS))]>;
+def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
+                       "popcntw $rA, $rS", IntGeneral,
+                       [(set i32:$rA, (ctpop i32:$rS))]>;
 
 defm DIVD  : XOForm_1r<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                        "divd", "$rT, $rA, $rB", IntDivD,
@@ -525,6 +551,9 @@ defm DIVDU : XOForm_1r<31, 457, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
 defm MULLD : XOForm_1r<31, 233, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
                        "mulld", "$rT, $rA, $rB", IntMulHD,
                        [(set i64:$rT, (mul i64:$rA, i64:$rB))]>, isPPC64;
+def MULLI8 : DForm_2<7, (outs g8rc:$rD), (ins g8rc:$rA, s16imm64:$imm),
+                       "mulli $rD, $rA, $imm", IntMulLI,
+                       [(set i64:$rD, (mul i64:$rA, imm64SExt16:$imm))]>;
 }
 
 let neverHasSideEffects = 1 in {
@@ -541,14 +570,30 @@ defm RLDCL  : MDSForm_1r<30, 8,
                         (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
                         "rldcl", "$rA, $rS, $rB, $MBE", IntRotateD,
                         []>, isPPC64;
+defm RLDCR  : MDSForm_1r<30, 9,
+                        (outs g8rc:$rA), (ins g8rc:$rS, gprc:$rB, u6imm:$MBE),
+                        "rldcr", "$rA, $rS, $rB, $MBE", IntRotateD,
+                        []>, isPPC64;
 defm RLDICL : MDForm_1r<30, 0,
                         (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
                         "rldicl", "$rA, $rS, $SH, $MBE", IntRotateDI,
                         []>, isPPC64;
+// For fast-isel:
+let isCodeGenOnly = 1 in
+def RLDICL_32_64 : MDForm_1<30, 0,
+                           (outs g8rc:$rA),
+                           (ins gprc:$rS, u6imm:$SH, u6imm:$MBE),
+                           "rldicl $rA, $rS, $SH, $MBE", IntRotateDI,
+                           []>, isPPC64;
+// End fast-isel.
 defm RLDICR : MDForm_1r<30, 1,
                         (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
                         "rldicr", "$rA, $rS, $SH, $MBE", IntRotateDI,
                         []>, isPPC64;
+defm RLDIC  : MDForm_1r<30, 2,
+                        (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE),
+                        "rldic", "$rA, $rS, $SH, $MBE", IntRotateDI,
+                        []>, isPPC64;
 
 let Interpretation64Bit = 1 in {
 defm RLWINM8 : MForm_2r<21, (outs g8rc:$rA),
@@ -592,6 +637,15 @@ def LWAX : XForm_1<31, 341, (outs g8rc:$rD), (ins memrr:$src),
                    "lwax $rD, $src", LdStLHA,
                    [(set i64:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
                    PPC970_DGroup_Cracked;
+// For fast-isel:
+let isCodeGenOnly = 1, mayLoad = 1 in {
+def LWA_32  : DSForm_1<58, 2, (outs gprc:$rD), (ins memrix:$src),
+                      "lwa $rD, $src", LdStLWA, []>, isPPC64,
+                      PPC970_DGroup_Cracked;
+def LWAX_32 : XForm_1<31, 341, (outs gprc:$rD), (ins memrr:$src),
+                     "lwax $rD, $src", LdStLHA, []>, isPPC64,
+                     PPC970_DGroup_Cracked;
+} // end fast-isel isCodeGenOnly
 
 // Update forms.
 let mayLoad = 1, neverHasSideEffects = 1 in {
@@ -750,25 +804,25 @@ def ADDItocL: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
                        (PPCaddiTocL i64:$reg, tglobaladdr:$disp))]>, isPPC64;
 
 // Support for thread-local storage.
-def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDISgotTprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDISgotTprelHA",
                          [(set i64:$rD,
                            (PPCaddisGotTprelHA i64:$reg,
                                                tglobaltlsaddr:$disp))]>,
                   isPPC64;
-def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins symbolLo64:$disp, g8rc_nox0:$reg),
+def LDgotTprelL: Pseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
                         "#LDgotTprelL",
                         [(set i64:$rD,
                           (PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
                  isPPC64;
 def : Pat<(PPCaddTls i64:$in, tglobaltlsaddr:$g),
           (ADD8TLS $in, tglobaltlsaddr:$g)>;
-def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDIStlsgdHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIStlsgdHA",
                          [(set i64:$rD,
                            (PPCaddisTlsgdHA i64:$reg, tglobaltlsaddr:$disp))]>,
                   isPPC64;
-def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolLo64:$disp),
+def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                        "#ADDItlsgdL",
                        [(set i64:$rD,
                          (PPCaddiTlsgdL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -778,12 +832,12 @@ def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
                         [(set i64:$rD,
                           (PPCgetTlsAddr i64:$reg, tglobaltlsaddr:$sym))]>,
                  isPPC64;
-def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDIStlsldHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIStlsldHA",
                          [(set i64:$rD,
                            (PPCaddisTlsldHA i64:$reg, tglobaltlsaddr:$disp))]>,
                   isPPC64;
-def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolLo64:$disp),
+def ADDItlsldL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                        "#ADDItlsldL",
                        [(set i64:$rD,
                          (PPCaddiTlsldL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -793,13 +847,13 @@ def GETtlsldADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym),
                           [(set i64:$rD,
                             (PPCgetTlsldAddr i64:$reg, tglobaltlsaddr:$sym))]>,
                    isPPC64;
-def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolHi64:$disp),
+def ADDISdtprelHA: Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                           "#ADDISdtprelHA",
                           [(set i64:$rD,
                             (PPCaddisDtprelHA i64:$reg,
                                               tglobaltlsaddr:$disp))]>,
                    isPPC64;
-def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, symbolLo64:$disp),
+def ADDIdtprelL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp),
                          "#ADDIdtprelL",
                          [(set i64:$rD,
                            (PPCaddiDtprelL i64:$reg, tglobaltlsaddr:$disp))]>,
@@ -914,6 +968,9 @@ let PPC970_Unit = 3, neverHasSideEffects = 1,
 defm FCFID  : XForm_26r<63, 846, (outs f8rc:$frD), (ins f8rc:$frB),
                         "fcfid", "$frD, $frB", FPGeneral,
                         [(set f64:$frD, (PPCfcfid f64:$frB))]>, isPPC64;
+defm FCTID  : XForm_26r<63, 814, (outs f8rc:$frD), (ins f8rc:$frB),
+                        "fctid", "$frD, $frB", FPGeneral,
+                        []>, isPPC64;
 defm FCTIDZ : XForm_26r<63, 815, (outs f8rc:$frD), (ins f8rc:$frB),
                         "fctidz", "$frD, $frB", FPGeneral,
                         [(set f64:$frD, (PPCfctidz f64:$frB))]>, isPPC64;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index cc9cf0a..a55abe3 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -229,35 +229,45 @@ let Predicates = [HasAltivec] in {
 let isCodeGenOnly = 1 in {
 def DSS      : DSS_Form<822, (outs),
                         (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
-                        "dss $STRM", LdStLoad /*FIXME*/, []>;
+                        "dss $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSSALL   : DSS_Form<822, (outs),
                         (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
-                        "dssall", LdStLoad /*FIXME*/, []>;
+                        "dssall", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DST      : DSS_Form<342, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTT     : DSS_Form<342, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTST    : DSS_Form<374, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTSTT   : DSS_Form<374, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, gprc:$rA, gprc:$rB),
-                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 
 def DST64    : DSS_Form<342, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTT64   : DSS_Form<342, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTST64  : DSS_Form<374, (outs),
                         (ins u5imm:$ZERO, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dstst $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 def DSTSTT64 : DSS_Form<374, (outs),
                         (ins u5imm:$ONE, u5imm:$STRM, g8rc:$rA, gprc:$rB),
-                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>;
+                        "dststt $rA, $rB, $STRM", LdStLoad /*FIXME*/, []>,
+                        Deprecated<DeprecatedDST>;
 }
 
 def MFVSCR : VXForm_4<1540, (outs vrrc:$vD), (ins),
@@ -392,7 +402,7 @@ def VCTUXS : VXForm_1<906, (outs vrrc:$vD), (ins u5imm:$UIMM, vrrc:$vB),
 // Defines with the UIM field set to 0 for floating-point
 // to integer (fp_to_sint/fp_to_uint) conversions and integer
 // to floating-point (sint_to_fp/uint_to_fp) conversions.
-let VA = 0 in {
+let isCodeGenOnly = 1, VA = 0 in {
 def VCFSX_0 : VXForm_1<842, (outs vrrc:$vD), (ins vrrc:$vB),
                        "vcfsx $vD, $vB, 0", VecFP,
                        [(set v4f32:$vD,
@@ -664,15 +674,29 @@ def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
 def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
 def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
                       
-let isCodeGenOnly = 1 in
-def V_SET0 : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+let isCodeGenOnly = 1 in {
+def V_SET0B : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", VecFP,
+                      [(set v16i8:$vD, (v16i8 immAllZerosV))]>;
+def V_SET0H : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
+                      "vxor $vD, $vD, $vD", VecFP,
+                      [(set v8i16:$vD, (v8i16 immAllZerosV))]>;
+def V_SET0  : VXForm_setzero<1220, (outs vrrc:$vD), (ins),
                       "vxor $vD, $vD, $vD", VecFP,
                       [(set v4i32:$vD, (v4i32 immAllZerosV))]>;
+
 let IMM=-1 in {
-def V_SETALLONES : VXForm_3<908, (outs vrrc:$vD), (ins),
+def V_SETALLONESB : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", VecFP,
+                      [(set v16i8:$vD, (v16i8 immAllOnesV))]>;
+def V_SETALLONESH : VXForm_3<908, (outs vrrc:$vD), (ins),
+                      "vspltisw $vD, -1", VecFP,
+                      [(set v8i16:$vD, (v8i16 immAllOnesV))]>;
+def V_SETALLONES  : VXForm_3<908, (outs vrrc:$vD), (ins),
                       "vspltisw $vD, -1", VecFP,
                       [(set v4i32:$vD, (v4i32 immAllOnesV))]>;
 }
+}
 } // VALU Operations.
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index b6f4e85..29233d4 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -145,6 +145,33 @@ class BForm_2<bits<6> opcode, bits<5> bo, bits<5> bi, bit aa, bit lk,
   let Inst{31}    = lk;
 }
 
+class BForm_3<bits<6> opcode, bit aa, bit lk,
+              dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, BrB> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<14> BD;
+
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.3 SC-Form
+class SCForm<bits<6> opcode, bits<1> xo,
+                     dag OOL, dag IOL, string asmstr, InstrItinClass itin,
+                     list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<7>  LEV;
+
+  let Pattern = pattern;
+
+  let Inst{20-26} = LEV;
+  let Inst{30}    = xo;
+}
+
 // 1.7.4 D-Form
 class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
                  InstrItinClass itin, list<dag> pattern> 
@@ -371,6 +398,13 @@ class XForm_1a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let RST = 0;
 }
 
+class XForm_rs<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let B = 0;
+}
+
 class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern> 
   : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -411,6 +445,17 @@ class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = 0;
 }
 
+class XForm_mtmsr<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RS;
+  bits<1> L;
+
+  let Inst{6-10} = RS;
+  let Inst{15} = L;
+  let Inst{21-30} = xo;
+}
+
 class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                    InstrItinClass itin>
   : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
@@ -446,14 +491,23 @@ class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
 class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
                string asmstr, InstrItinClass itin, list<dag> pattern> 
   : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<2> L;
+
   let Pattern = pattern;
-  let Inst{6-10}  = 0;
+  let Inst{6-8}   = 0;
+  let Inst{9-10}  = L;
   let Inst{11-15} = 0;
   let Inst{16-20} = 0;
   let Inst{21-30} = xo;
   let Inst{31}    = 0;
 }
 
+class XForm_24_eieio<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+               string asmstr, InstrItinClass itin, list<dag> pattern> 
+  : XForm_24_sync<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let L = 0;
+}
+
 class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
                InstrItinClass itin, list<dag> pattern> 
   : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
@@ -498,6 +552,21 @@ class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
   let Inst{31}    = RC;
 }
 
+class XForm_0<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+  let B = 0;
+}
+
+class XForm_16b<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let RST = 0;
+  let A = 0;
+}
+
 // DCB_Form - Form X instruction, used for dcb* instructions.
 class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, 
                       InstrItinClass itin, list<dag> pattern>
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 1fb17eb..315ad04 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -33,7 +33,7 @@
 #include "llvm/Support/raw_ostream.h"
 
 #define GET_INSTRMAP_INFO
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "PPCGenInstrInfo.inc"
 
 using namespace llvm;
@@ -45,9 +45,12 @@ opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
 static cl::opt<bool> DisableCmpOpt("disable-ppc-cmp-opt",
 cl::desc("Disable compare instruction optimization"), cl::Hidden);
 
+// Pin the vtable to this file.
+void PPCInstrInfo::anchor() {}
+
 PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
   : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
-    TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+    TM(tm), RI(*TM.getSubtargetImpl()) {}
 
 /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
 /// this target when scheduling the DAG.
@@ -74,10 +77,9 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
   // Most subtargets use a PPC970 recognizer.
   if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
       Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
-    const TargetInstrInfo *TII = TM.getInstrInfo();
-    assert(TII && "No InstrInfo?");
+    assert(TM.getInstrInfo() && "No InstrInfo?");
 
-    return new PPCHazardRecognizer970(*TII);
+    return new PPCHazardRecognizer970(TM);
   }
 
   return new PPCScoreboardHazardRecognizer(II, DAG);
@@ -449,7 +451,9 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
 
   // isel is for regular integer GPRs only.
   if (!PPC::GPRCRegClass.hasSubClassEq(RC) &&
-      !PPC::G8RCRegClass.hasSubClassEq(RC))
+      !PPC::GPRC_NOR0RegClass.hasSubClassEq(RC) &&
+      !PPC::G8RCRegClass.hasSubClassEq(RC) &&
+      !PPC::G8RC_NOX0RegClass.hasSubClassEq(RC))
     return false;
 
   // FIXME: These numbers are for the A2, how well they work for other cores is
@@ -479,12 +483,15 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
   const TargetRegisterClass *RC =
     RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
   assert(RC && "TrueReg and FalseReg must have overlapping register classes");
-  assert((PPC::GPRCRegClass.hasSubClassEq(RC) ||
-          PPC::G8RCRegClass.hasSubClassEq(RC)) &&
+
+  bool Is64Bit = PPC::G8RCRegClass.hasSubClassEq(RC) ||
+                 PPC::G8RC_NOX0RegClass.hasSubClassEq(RC);
+  assert((Is64Bit ||
+          PPC::GPRCRegClass.hasSubClassEq(RC) ||
+          PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) &&
          "isel is for regular integer GPRs only");
 
-  unsigned OpCode =
-    PPC::GPRCRegClass.hasSubClassEq(RC) ? PPC::ISEL : PPC::ISEL8;
+  unsigned OpCode = Is64Bit ? PPC::ISEL8 : PPC::ISEL;
   unsigned SelectPred = Cond[0].getImm();
 
   unsigned SubIdx;
@@ -792,16 +799,6 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   NewMIs.back()->addMemOperand(MF, MMO);
 }
 
-MachineInstr*
-PPCInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                       int FrameIx, uint64_t Offset,
-                                       const MDNode *MDPtr,
-                                       DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(PPC::DBG_VALUE));
-  addFrameReference(MIB, FrameIx, 0, false).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
 bool PPCInstrInfo::
 ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
@@ -991,6 +988,10 @@ bool PPCInstrInfo::SubsumesPredicate(
   if (Pred2[1].getReg() == PPC::CTR8 || Pred2[1].getReg() == PPC::CTR)
     return false;
 
+  // P1 can only subsume P2 if they test the same condition register.
+  if (Pred1[1].getReg() != Pred2[1].getReg())
+    return false;
+
   PPC::Predicate P1 = (PPC::Predicate) Pred1[0].getImm();
   PPC::Predicate P2 = (PPC::Predicate) Pred2[0].getImm();
 
@@ -1156,25 +1157,19 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
       MachineInstr *UseMI = &*I;
       if (UseMI->getOpcode() == PPC::BCC) {
         unsigned Pred = UseMI->getOperand(0).getImm();
-        if (Pred == PPC::PRED_EQ || Pred == PPC::PRED_NE)
-          continue;
-
-        return false;
+        if (Pred != PPC::PRED_EQ && Pred != PPC::PRED_NE)
+          return false;
       } else if (UseMI->getOpcode() == PPC::ISEL ||
                  UseMI->getOpcode() == PPC::ISEL8) {
         unsigned SubIdx = UseMI->getOperand(3).getSubReg();
-        if (SubIdx == PPC::sub_eq)
-          continue;
-
-        return false;
+        if (SubIdx != PPC::sub_eq)
+          return false;
       } else
         return false;
     }
   }
 
-  // Get ready to iterate backward from CmpInstr.
-  MachineBasicBlock::iterator I = CmpInstr, E = MI,
-                              B = CmpInstr->getParent()->begin();
+  MachineBasicBlock::iterator I = CmpInstr;
 
   // Scan forward to find the first use of the compare.
   for (MachineBasicBlock::iterator EL = CmpInstr->getParent()->end();
@@ -1191,9 +1186,6 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
       break;
   }
 
-  // Early exit if we're at the beginning of the BB.
-  if (I == B) return false;
-
   // There are two possible candidates which can be changed to set CR[01].
   // One is MI, the other is a SUB instruction.
   // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
@@ -1213,6 +1205,11 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
   // Search for Sub.
   const TargetRegisterInfo *TRI = &getRegisterInfo();
   --I;
+
+  // Get ready to iterate backward from CmpInstr.
+  MachineBasicBlock::iterator E = MI,
+                              B = CmpInstr->getParent()->begin();
+
   for (; I != E && !noSub; --I) {
     const MachineInstr &Instr = *I;
     unsigned IOpC = Instr.getOpcode();
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 34a1a73..f140c41 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -78,6 +78,7 @@ class PPCInstrInfo : public PPCGenInstrInfo {
                             const TargetRegisterClass *RC,
                             SmallVectorImpl<MachineInstr*> &NewMIs,
                             bool &NonRI, bool &SpillsVRS) const;
+  virtual void anchor();
 public:
   explicit PPCInstrInfo(PPCTargetMachine &TM);
 
@@ -148,12 +149,6 @@ public:
                                     const TargetRegisterClass *RC,
                                     const TargetRegisterInfo *TRI) const;
 
-  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
-                                                 int FrameIx,
-                                                 uint64_t Offset,
-                                                 const MDNode *MDPtr,
-                                                 DebugLoc DL) const;
-
   virtual
   bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 4763069..2bd3aad 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -162,6 +162,10 @@ def PPCeh_sjlj_longjmp : SDNode<"PPCISD::EH_SJLJ_LONGJMP",
                                 SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
                                 [SDNPHasChain, SDNPSideEffect]>;
 
+def SDT_PPCsc     : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def PPCsc         : SDNode<"PPCISD::SC", SDT_PPCsc,
+                           [SDNPHasChain, SDNPSideEffect]>;
+
 def PPCvcmp       : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
 def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutGlue]>;
 
@@ -246,13 +250,15 @@ def maskimm32 : PatLeaf<(imm), [{
     return false;
 }]>;
 
-def immSExt16  : PatLeaf<(imm), [{
-  // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
-  // field.  Used by instructions like 'addi'.
-  if (N->getValueType(0) == MVT::i32)
-    return (int32_t)N->getZExtValue() == (short)N->getZExtValue();
-  else
-    return (int64_t)N->getZExtValue() == (short)N->getZExtValue();
+def imm32SExt16  : Operand<i32>, ImmLeaf<i32, [{
+  // imm32SExt16 predicate - True if the i32 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int32_t)Imm == (short)Imm;
+}]>;
+def imm64SExt16  : Operand<i64>, ImmLeaf<i64, [{
+  // imm64SExt16 predicate - True if the i64 immediate fits in a 16-bit
+  // sign extended field.  Used by instructions like 'addi'.
+  return (int64_t)Imm == (short)Imm;
 }]>;
 def immZExt16  : PatLeaf<(imm), [{
   // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
@@ -283,7 +289,7 @@ def imm16ShiftedSExt : PatLeaf<(imm), [{
 }], HI16>;
 
 // Some r+i load/store instructions (such as LD, STD, LDU, etc.) that require
-// restricted memrix (offset/4) constants are alignment sensitive. If these
+// restricted memrix (4-aligned) constants are alignment sensitive. If these
 // offsets are hidden behind TOC entries than the values of the lower-order
 // bits cannot be checked directly. As a result, we need to also incorporate
 // an alignment check into the relevant patterns.
@@ -386,7 +392,7 @@ def vrrc : RegisterOperand<VRRC> {
   let ParserMatchClass = PPCRegVRRCAsmOperand;
 }
 def PPCRegCRBITRCAsmOperand : AsmOperandClass {
-  let Name = "RegCRBITRC"; let PredicateMethod = "isRegNumber";
+  let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
 }
 def crbitrc : RegisterOperand<CRBITRC> {
   let ParserMatchClass = PPCRegCRBITRCAsmOperand;
@@ -428,6 +434,7 @@ def PPCS16ImmAsmOperand : AsmOperandClass {
 }
 def s16imm  : Operand<i32> {
   let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
   let ParserMatchClass = PPCS16ImmAsmOperand;
 }
 def PPCU16ImmAsmOperand : AsmOperandClass {
@@ -436,31 +443,58 @@ def PPCU16ImmAsmOperand : AsmOperandClass {
 }
 def u16imm  : Operand<i32> {
   let PrintMethod = "printU16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
   let ParserMatchClass = PPCU16ImmAsmOperand;
 }
+def PPCS17ImmAsmOperand : AsmOperandClass {
+  let Name = "S17Imm"; let PredicateMethod = "isS17Imm";
+  let RenderMethod = "addImmOperands";
+}
+def s17imm  : Operand<i32> {
+  // This operand type is used for addis/lis to allow the assembler parser
+  // to accept immediates in the range -65536..65535 for compatibility with
+  // the GNU assembler.  The operand is treated as 16-bit otherwise.
+  let PrintMethod = "printS16ImmOperand";
+  let EncoderMethod = "getImm16Encoding";
+  let ParserMatchClass = PPCS17ImmAsmOperand;
+}
+def PPCDirectBrAsmOperand : AsmOperandClass {
+  let Name = "DirectBr"; let PredicateMethod = "isDirectBr";
+  let RenderMethod = "addBranchTargetOperands";
+}
 def directbrtarget : Operand<OtherVT> {
   let PrintMethod = "printBranchOperand";
   let EncoderMethod = "getDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def absdirectbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
+}
+def PPCCondBrAsmOperand : AsmOperandClass {
+  let Name = "CondBr"; let PredicateMethod = "isCondBr";
+  let RenderMethod = "addBranchTargetOperands";
 }
 def condbrtarget : Operand<OtherVT> {
   let PrintMethod = "printBranchOperand";
   let EncoderMethod = "getCondBrEncoding";
+  let ParserMatchClass = PPCCondBrAsmOperand;
+}
+def abscondbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsCondBrEncoding";
+  let ParserMatchClass = PPCCondBrAsmOperand;
 }
 def calltarget : Operand<iPTR> {
+  let PrintMethod = "printBranchOperand";
   let EncoderMethod = "getDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
 }
-def aaddr : Operand<iPTR> {
-  let PrintMethod = "printAbsAddrOperand";
-}
-def symbolHi: Operand<i32> {
-  let PrintMethod = "printSymbolHi";
-  let EncoderMethod = "getHA16Encoding";
-  let ParserMatchClass = PPCS16ImmAsmOperand;
-}
-def symbolLo: Operand<i32> {
-  let PrintMethod = "printSymbolLo";
-  let EncoderMethod = "getLO16Encoding";
-  let ParserMatchClass = PPCS16ImmAsmOperand;
+def abscalltarget : Operand<iPTR> {
+  let PrintMethod = "printAbsBranchOperand";
+  let EncoderMethod = "getAbsDirectBrEncoding";
+  let ParserMatchClass = PPCDirectBrAsmOperand;
 }
 def PPCCRBitMaskOperand : AsmOperandClass {
  let Name = "CRBitMask"; let PredicateMethod = "isCRBitMask";
@@ -488,12 +522,14 @@ def ptr_rc_idx : Operand<iPTR>, PointerLikeRegClass<0> {
 
 def PPCDispRIOperand : AsmOperandClass {
  let Name = "DispRI"; let PredicateMethod = "isS16Imm";
+ let RenderMethod = "addImmOperands";
 }
 def dispRI : Operand<iPTR> {
   let ParserMatchClass = PPCDispRIOperand;
 }
 def PPCDispRIXOperand : AsmOperandClass {
  let Name = "DispRIX"; let PredicateMethod = "isS16ImmX4";
+ let RenderMethod = "addImmOperands";
 }
 def dispRIX : Operand<iPTR> {
   let ParserMatchClass = PPCDispRIXOperand;
@@ -508,8 +544,8 @@ def memrr : Operand<iPTR> {
   let PrintMethod = "printMemRegReg";
   let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg, ptr_rc_idx:$offreg);
 }
-def memrix : Operand<iPTR> {   // memri where the imm is shifted 2 bits.
-  let PrintMethod = "printMemRegImmShifted";
+def memrix : Operand<iPTR> {   // memri where the imm is 4-aligned.
+  let PrintMethod = "printMemRegImm";
   let MIOperandInfo = (ops dispRIX:$imm, ptr_rc_nor0:$reg);
   let EncoderMethod = "getMemRIXEncoding";
 }
@@ -530,7 +566,7 @@ def pred : Operand<OtherVT> {
 def iaddr  : ComplexPattern<iPTR, 2, "SelectAddrImm",    [], []>;
 def xaddr  : ComplexPattern<iPTR, 2, "SelectAddrIdx",    [], []>;
 def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
-def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std"
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4",  [], []>; // "std"
 
 // The address in a single register. This is used with the SjLj
 // pseudo-instructions.
@@ -749,6 +785,20 @@ multiclass XForm_26r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
   }
 }
 
+multiclass XForm_28r<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+                    string asmbase, string asmstr, InstrItinClass itin,
+                    list<dag> pattern> {
+  let BaseName = asmbase in {
+    def NAME : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(" ", asmstr)), itin,
+                       pattern>, RecFormRel;
+    let Defs = [CR1] in
+    def o    : XForm_28<opcode, xo, OOL, IOL,
+                       !strconcat(asmbase, !strconcat(". ", asmstr)), itin,
+                       []>, isDOT, RecFormRel;
+  }
+}
+
 multiclass AForm_1r<bits<6> opcode, bits<5> xo, dag OOL, dag IOL,
                     string asmbase, string asmstr, InstrItinClass itin,
                     list<dag> pattern> {
@@ -860,7 +910,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
 
     let isCodeGenOnly = 1 in
     def BCCTR : XLForm_2_br<19, 528, 0, (outs), (ins pred:$cond),
-                            "b${cond:cc}ctr ${cond:reg}", BrB, []>;
+                            "b${cond:cc}ctr${cond:pm} ${cond:reg}", BrB, []>;
   }
 }
 
@@ -873,6 +923,8 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
   def B   : IForm<18, 0, 0, (outs), (ins directbrtarget:$dst),
                   "b $dst", BrB,
                   [(br bb:$dst)]>;
+  def BA  : IForm<18, 1, 0, (outs), (ins absdirectbrtarget:$dst),
+                  "ba $dst", BrB, []>;
   }
 
   // BCC represents an arbitrary conditional branch on a predicate.
@@ -880,18 +932,29 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
   // a two-value operand where a dag node expects two operands. :(
   let isCodeGenOnly = 1 in {
     def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
-                    "b${cond:cc} ${cond:reg}, $dst"
+                    "b${cond:cc}${cond:pm} ${cond:reg}, $dst"
                     /*[(PPCcondbranch crrc:$crS, imm:$opc, bb:$dst)]*/>;
+    def BCCA : BForm<16, 1, 0, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+                     "b${cond:cc}a${cond:pm} ${cond:reg}, $dst">;
+
     let isReturn = 1, Uses = [LR, RM] in
     def BCLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$cond),
-                           "b${cond:cc}lr ${cond:reg}", BrB, []>;
+                           "b${cond:cc}lr${cond:pm} ${cond:reg}", BrB, []>;
+  }
 
-    let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
-      def BDZLR  : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
+  let isReturn = 1, Defs = [CTR], Uses = [CTR, LR, RM] in {
+   def BDZLR  : XLForm_2_ext<19, 16, 18, 0, 0, (outs), (ins),
                              "bdzlr", BrB, []>;
-      def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
+   def BDNZLR : XLForm_2_ext<19, 16, 16, 0, 0, (outs), (ins),
                              "bdnzlr", BrB, []>;
-    }
+   def BDZLRp : XLForm_2_ext<19, 16, 27, 0, 0, (outs), (ins),
+                             "bdzlr+", BrB, []>;
+   def BDNZLRp: XLForm_2_ext<19, 16, 25, 0, 0, (outs), (ins),
+                             "bdnzlr+", BrB, []>;
+   def BDZLRm : XLForm_2_ext<19, 16, 26, 0, 0, (outs), (ins),
+                             "bdzlr-", BrB, []>;
+   def BDNZLRm: XLForm_2_ext<19, 16, 24, 0, 0, (outs), (ins),
+                             "bdnzlr-", BrB, []>;
   }
 
   let Defs = [CTR], Uses = [CTR] in {
@@ -899,6 +962,26 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
                        "bdz $dst">;
     def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
                        "bdnz $dst">;
+    def BDZA  : BForm_1<16, 18, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza $dst">;
+    def BDNZA : BForm_1<16, 16, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza $dst">;
+    def BDZp : BForm_1<16, 27, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz+ $dst">;
+    def BDNZp: BForm_1<16, 25, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz+ $dst">;
+    def BDZAp : BForm_1<16, 27, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza+ $dst">;
+    def BDNZAp: BForm_1<16, 25, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza+ $dst">;
+    def BDZm : BForm_1<16, 26, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdz- $dst">;
+    def BDNZm: BForm_1<16, 24, 0, 0, (outs), (ins condbrtarget:$dst),
+                       "bdnz- $dst">;
+    def BDZAm : BForm_1<16, 26, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdza- $dst">;
+    def BDNZAm: BForm_1<16, 24, 1, 0, (outs), (ins abscondbrtarget:$dst),
+                        "bdnza- $dst">;
   }
 }
 
@@ -915,8 +998,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
   let Uses = [RM] in {
     def BL  : IForm<18, 0, 1, (outs), (ins calltarget:$func),
                     "bl $func", BrB, []>;  // See Pat patterns below.
-    def BLA : IForm<18, 1, 1, (outs), (ins aaddr:$func),
+    def BLA : IForm<18, 1, 1, (outs), (ins abscalltarget:$func),
                     "bla $func", BrB, [(PPCcall (i32 imm:$func))]>;
+
+    let isCodeGenOnly = 1 in {
+      def BCCL : BForm<16, 0, 1, (outs), (ins pred:$cond, condbrtarget:$dst),
+                       "b${cond:cc}l${cond:pm} ${cond:reg}, $dst">;
+      def BCCLA : BForm<16, 1, 1, (outs), (ins pred:$cond, abscondbrtarget:$dst),
+                        "b${cond:cc}la${cond:pm} ${cond:reg}, $dst">;
+    }
   }
   let Uses = [CTR, RM] in {
     def BCTRL : XLForm_2_ext<19, 528, 20, 0, 1, (outs), (ins),
@@ -925,7 +1015,55 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR] in {
 
     let isCodeGenOnly = 1 in
     def BCCTRL : XLForm_2_br<19, 528, 1, (outs), (ins pred:$cond),
-                             "b${cond:cc}ctrl ${cond:reg}", BrB, []>;
+                             "b${cond:cc}ctrl${cond:pm} ${cond:reg}", BrB, []>;
+  }
+  let Uses = [LR, RM] in {
+    def BLRL : XLForm_2_ext<19, 16, 20, 0, 1, (outs), (ins),
+                            "blrl", BrB, []>;
+
+    let isCodeGenOnly = 1 in
+    def BCLRL : XLForm_2_br<19, 16, 1, (outs), (ins pred:$cond),
+                            "b${cond:cc}lrl${cond:pm} ${cond:reg}", BrB, []>;
+  }
+  let Defs = [CTR], Uses = [CTR, RM] in {
+    def BDZL  : BForm_1<16, 18, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl $dst">;
+    def BDNZL : BForm_1<16, 16, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl $dst">;
+    def BDZLA  : BForm_1<16, 18, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla $dst">;
+    def BDNZLA : BForm_1<16, 16, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla $dst">;
+    def BDZLp : BForm_1<16, 27, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl+ $dst">;
+    def BDNZLp: BForm_1<16, 25, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl+ $dst">;
+    def BDZLAp : BForm_1<16, 27, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla+ $dst">;
+    def BDNZLAp: BForm_1<16, 25, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla+ $dst">;
+    def BDZLm : BForm_1<16, 26, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdzl- $dst">;
+    def BDNZLm: BForm_1<16, 24, 0, 1, (outs), (ins condbrtarget:$dst),
+                        "bdnzl- $dst">;
+    def BDZLAm : BForm_1<16, 26, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdzla- $dst">;
+    def BDNZLAm: BForm_1<16, 24, 1, 1, (outs), (ins abscondbrtarget:$dst),
+                         "bdnzla- $dst">;
+  }
+  let Defs = [CTR], Uses = [CTR, LR, RM] in {
+    def BDZLRL  : XLForm_2_ext<19, 16, 18, 0, 1, (outs), (ins),
+                               "bdzlrl", BrB, []>;
+    def BDNZLRL : XLForm_2_ext<19, 16, 16, 0, 1, (outs), (ins),
+                               "bdnzlrl", BrB, []>;
+    def BDZLRLp : XLForm_2_ext<19, 16, 27, 0, 1, (outs), (ins),
+                               "bdzlrl+", BrB, []>;
+    def BDNZLRLp: XLForm_2_ext<19, 16, 25, 0, 1, (outs), (ins),
+                               "bdnzlrl+", BrB, []>;
+    def BDZLRLm : XLForm_2_ext<19, 16, 26, 0, 1, (outs), (ins),
+                               "bdzlrl-", BrB, []>;
+    def BDNZLRLm: XLForm_2_ext<19, 16, 24, 0, 1, (outs), (ins),
+                               "bdnzlrl-", BrB, []>;
   }
 }
 
@@ -937,7 +1075,7 @@ def TCRETURNdi :Pseudo< (outs),
 
 
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
-def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset),
+def TCRETURNai :Pseudo<(outs), (ins abscalltarget:$func, i32imm:$offset),
                  "#TC_RETURNa $func $offset",
                  [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
 
@@ -954,23 +1092,22 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
 def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
      Requires<[In32BitMode]>;
 
-
-
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
     isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
 def TAILB   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
                   "b $dst", BrB,
                   []>;
 
-}
-
 let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
     isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
-def TAILBA   : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+def TAILBA   : IForm<18, 0, 0, (outs), (ins abscalltarget:$dst),
                   "ba $dst", BrB,
                   []>;
 
+}
+
 let hasSideEffects = 1, isBarrier = 1, usesCustomInserter = 1 in {
+  let Defs = [CTR] in
   def EH_SjLj_SetJmp32  : Pseudo<(outs gprc:$dst), (ins memr:$buf),
                             "#EH_SJLJ_SETJMP32",
                             [(set i32:$dst, (PPCeh_sjlj_setjmp addr:$buf))]>,
@@ -987,6 +1124,12 @@ let isBranch = 1, isTerminator = 1 in {
                         "#EH_SjLj_Setup\t$dst", []>;
 }
 
+// System call.
+let PPC970_Unit = 7 in {
+  def SC     : SCForm<17, 1, (outs), (ins i32imm:$lev),
+                      "sc $lev", BrB, [(PPCsc (i32 imm:$lev))]>;
+}
+
 // DCB* instructions.
 def DCBA   : DCB_Form<758, 0, (outs), (ins memrr:$dst),
                       "dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
@@ -1110,6 +1253,15 @@ def STWCX : XForm_1<31, 150, (outs), (ins gprc:$rS, memrr:$dst),
 let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
 def TRAP  : XForm_24<31, 4, (outs), (ins), "trap", LdStLoad, [(trap)]>;
 
+def TWI : DForm_base<3, (outs), (ins u5imm:$to, gprc:$rA, s16imm:$imm),
+                     "twi $to, $rA, $imm", IntTrapW, []>;
+def TW : XForm_1<31, 4, (outs), (ins u5imm:$to, gprc:$rA, gprc:$rB),
+                 "tw $to, $rA, $rB", IntTrapW, []>;
+def TDI : DForm_base<2, (outs), (ins u5imm:$to, g8rc:$rA, s16imm:$imm),
+                     "tdi $to, $rA, $imm", IntTrapD, []>;
+def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
+                 "td $to, $rA, $rB", IntTrapD, []>;
+
 //===----------------------------------------------------------------------===//
 // PPC32 Load Instructions.
 //
@@ -1250,6 +1402,10 @@ def LFIWZX : XForm_25<31, 887, (outs f8rc:$frD), (ins memrr:$src),
                       [(set f64:$frD, (PPClfiwzx xoaddr:$src))]>;
 }
 
+// Load Multiple
+def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
+                  "lmw $rD, $src", LdStLMW, []>;
+
 //===----------------------------------------------------------------------===//
 // PPC32 Store Instructions.
 //
@@ -1380,50 +1536,54 @@ def : Pat<(pre_store f32:$rS, iPTR:$ptrreg, iPTR:$ptroff),
 def : Pat<(pre_store f64:$rS, iPTR:$ptrreg, iPTR:$ptroff),
           (STFDUX $rS, $ptrreg, $ptroff)>;
 
-def SYNC : XForm_24_sync<31, 598, (outs), (ins),
-                        "sync", LdStSync,
-                        [(int_ppc_sync)]>;
+// Store Multiple
+def STMW : DForm_1<47, (outs), (ins gprc:$rS, memri:$dst),
+                   "stmw $rS, $dst", LdStLMW, []>;
+
+def SYNC : XForm_24_sync<31, 598, (outs), (ins i32imm:$L),
+                        "sync $L", LdStSync, []>;
+def : Pat<(int_ppc_sync), (SYNC 0)>;
 
 //===----------------------------------------------------------------------===//
 // PPC32 Arithmetic Instructions.
 //
 
 let PPC970_Unit = 1 in {  // FXU Operations.
-def ADDI   : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, symbolLo:$imm),
+def ADDI   : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$imm),
                      "addi $rD, $rA, $imm", IntSimple,
-                     [(set i32:$rD, (add i32:$rA, immSExt16:$imm))]>;
+                     [(set i32:$rD, (add i32:$rA, imm32SExt16:$imm))]>;
 let BaseName = "addic" in {
 let Defs = [CARRY] in
 def ADDIC  : DForm_2<12, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                      "addic $rD, $rA, $imm", IntGeneral,
-                     [(set i32:$rD, (addc i32:$rA, immSExt16:$imm))]>,
+                     [(set i32:$rD, (addc i32:$rA, imm32SExt16:$imm))]>,
                      RecFormRel, PPC970_DGroup_Cracked;
 let Defs = [CARRY, CR0] in
 def ADDICo : DForm_2<13, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                      "addic. $rD, $rA, $imm", IntGeneral,
                      []>, isDOT, RecFormRel;
 }
-def ADDIS  : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, symbolHi:$imm),
+def ADDIS  : DForm_2<15, (outs gprc:$rD), (ins gprc_nor0:$rA, s17imm:$imm),
                      "addis $rD, $rA, $imm", IntSimple,
                      [(set i32:$rD, (add i32:$rA, imm16ShiftedSExt:$imm))]>;
 let isCodeGenOnly = 1 in
-def LA     : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, symbolLo:$sym),
+def LA     : DForm_2<14, (outs gprc:$rD), (ins gprc_nor0:$rA, s16imm:$sym),
                      "la $rD, $sym($rA)", IntGeneral,
                      [(set i32:$rD, (add i32:$rA,
                                           (PPClo tglobaladdr:$sym, 0)))]>;
 def MULLI  : DForm_2< 7, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                      "mulli $rD, $rA, $imm", IntMulLI,
-                     [(set i32:$rD, (mul i32:$rA, immSExt16:$imm))]>;
+                     [(set i32:$rD, (mul i32:$rA, imm32SExt16:$imm))]>;
 let Defs = [CARRY] in
 def SUBFIC : DForm_2< 8, (outs gprc:$rD), (ins gprc:$rA, s16imm:$imm),
                      "subfic $rD, $rA, $imm", IntGeneral,
-                     [(set i32:$rD, (subc immSExt16:$imm, i32:$rA))]>;
+                     [(set i32:$rD, (subc imm32SExt16:$imm, i32:$rA))]>;
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
-  def LI  : DForm_2_r0<14, (outs gprc:$rD), (ins symbolLo:$imm),
+  def LI  : DForm_2_r0<14, (outs gprc:$rD), (ins s16imm:$imm),
                        "li $rD, $imm", IntSimple,
-                       [(set i32:$rD, immSExt16:$imm)]>;
-  def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins symbolHi:$imm),
+                       [(set i32:$rD, imm32SExt16:$imm)]>;
+  def LIS : DForm_2_r0<15, (outs gprc:$rD), (ins s17imm:$imm),
                        "lis $rD, $imm", IntSimple,
                        [(set i32:$rD, imm16ShiftedSExt:$imm)]>;
 }
@@ -1532,6 +1692,9 @@ let isCompare = 1, neverHasSideEffects = 1 in {
 
 let Uses = [RM] in {
   let neverHasSideEffects = 1 in {
+  defm FCTIW  : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
+                          "fctiw", "$frD, $frB", FPGeneral,
+                          []>;
   defm FCTIWZ : XForm_26r<63, 15, (outs f8rc:$frD), (ins f8rc:$frB),
                           "fctiwz", "$frD, $frB", FPGeneral,
                           [(set f64:$frD, (PPCfctiwz f64:$frB))]>;
@@ -1540,23 +1703,13 @@ let Uses = [RM] in {
                           "frsp", "$frD, $frB", FPGeneral,
                           [(set f32:$frD, (fround f64:$frB))]>;
 
-  // The frin -> nearbyint mapping is valid only in fast-math mode.
   let Interpretation64Bit = 1 in
   defm FRIND  : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
                           "frin", "$frD, $frB", FPGeneral,
-                          [(set f64:$frD, (fnearbyint f64:$frB))]>;
+                          [(set f64:$frD, (frnd f64:$frB))]>;
   defm FRINS  : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
                           "frin", "$frD, $frB", FPGeneral,
-                          [(set f32:$frD, (fnearbyint f32:$frB))]>;
-  }
-
-  // These pseudos expand to rint but also set FE_INEXACT when the result does
-  // not equal the argument.
-  let usesCustomInserter = 1, Defs = [RM] in { // FIXME: Model FPSCR!
-    def FRINDrint : Pseudo<(outs f8rc:$frD), (ins f8rc:$frB),
-                            "#FRINDrint", [(set f64:$frD, (frint f64:$frB))]>;
-    def FRINSrint : Pseudo<(outs f4rc:$frD), (ins f4rc:$frB),
-                            "#FRINSrint", [(set f32:$frD, (frint f32:$frB))]>;
+                          [(set f32:$frD, (frnd f32:$frB))]>;
   }
 
   let neverHasSideEffects = 1 in {
@@ -1626,6 +1779,14 @@ defm FNEGD  : XForm_26r<63, 40, (outs f8rc:$frD), (ins f8rc:$frB),
                         "fneg", "$frD, $frB", FPGeneral,
                         [(set f64:$frD, (fneg f64:$frB))]>;
 
+defm FCPSGNS : XForm_28r<63, 8, (outs f4rc:$frD), (ins f4rc:$frA, f4rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+                        [(set f32:$frD, (fcopysign f32:$frB, f32:$frA))]>;
+let Interpretation64Bit = 1 in
+defm FCPSGND : XForm_28r<63, 8, (outs f8rc:$frD), (ins f8rc:$frA, f8rc:$frB),
+                        "fcpsgn", "$frD, $frA, $frB", FPGeneral,
+                        [(set f64:$frD, (fcopysign f64:$frB, f64:$frA))]>;
+
 // Reciprocal estimates.
 defm FRE      : XForm_26r<63, 24, (outs f8rc:$frD), (ins f8rc:$frB),
                           "fre", "$frD, $frB", FPGeneral,
@@ -1648,15 +1809,37 @@ def MCRF   : XLForm_3<19, 0, (outs crrc:$BF), (ins crrc:$BFA),
                       "mcrf $BF, $BFA", BrMCR>,
              PPC970_DGroup_First, PPC970_Unit_CRU;
 
+def CRAND  : XLForm_1<19, 257, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crand $CRD, $CRA, $CRB", BrCR, []>;
+
+def CRNAND : XLForm_1<19, 225, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crnand $CRD, $CRA, $CRB", BrCR, []>;
+
+def CROR   : XLForm_1<19, 449, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "cror $CRD, $CRA, $CRB", BrCR, []>;
+
+def CRXOR  : XLForm_1<19, 193, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crxor $CRD, $CRA, $CRB", BrCR, []>;
+
+def CRNOR  : XLForm_1<19, 33, (outs crbitrc:$CRD),
+                              (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crnor $CRD, $CRA, $CRB", BrCR, []>;
+
 def CREQV  : XLForm_1<19, 289, (outs crbitrc:$CRD),
                                (ins crbitrc:$CRA, crbitrc:$CRB),
-                      "creqv $CRD, $CRA, $CRB", BrCR,
-                      []>;
+                      "creqv $CRD, $CRA, $CRB", BrCR, []>;
+
+def CRANDC : XLForm_1<19, 129, (outs crbitrc:$CRD),
+                               (ins crbitrc:$CRA, crbitrc:$CRB),
+                      "crandc $CRD, $CRA, $CRB", BrCR, []>;
 
-def CROR  : XLForm_1<19, 449, (outs crbitrc:$CRD),
+def CRORC  : XLForm_1<19, 417, (outs crbitrc:$CRD),
                                (ins crbitrc:$CRA, crbitrc:$CRB),
-                      "cror $CRD, $CRA, $CRB", BrCR,
-                      []>;
+                      "crorc $CRD, $CRA, $CRB", BrCR, []>;
 
 let isCodeGenOnly = 1 in {
 def CRSET  : XLForm_1_ext<19, 289, (outs crbitrc:$dst), (ins),
@@ -1680,6 +1863,15 @@ def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
 
 // XFX-Form instructions.  Instructions that deal with SPRs.
 //
+
+def MFSPR : XFXForm_1<31, 339, (outs gprc:$RT), (ins i32imm:$SPR),
+                      "mfspr $RT, $SPR", SprMFSPR>;
+def MTSPR : XFXForm_1<31, 467, (outs), (ins i32imm:$SPR, gprc:$RT),
+                      "mtspr $SPR, $RT", SprMTSPR>;
+
+def MFTB : XFXForm_1<31, 371, (outs gprc:$RT), (ins i32imm:$SPR),
+                     "mftb $RT, $SPR", SprMFTB>, Deprecated<DeprecatedMFTB>;
+
 let Uses = [CTR] in {
 def MFCTR : XFXForm_1_ext<31, 339, 9, (outs gprc:$rT), (ins),
                           "mfctr $rT", SprMFSPR>,
@@ -1690,6 +1882,12 @@ def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
                           "mtctr $rS", SprMTSPR>,
             PPC970_DGroup_First, PPC970_Unit_FXU;
 }
+let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
+let Pattern = [(int_ppc_mtctr i32:$rS)] in
+def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
+                              "mtctr $rS", SprMTSPR>,
+                PPC970_DGroup_First, PPC970_Unit_FXU;
+}
 
 let Defs = [LR] in {
 def MTLR  : XFXForm_7_ext<31, 467, 8, (outs), (ins gprc:$rS),
@@ -1702,17 +1900,17 @@ def MFLR  : XFXForm_1_ext<31, 339, 8, (outs gprc:$rT), (ins),
             PPC970_DGroup_First, PPC970_Unit_FXU;
 }
 
-// Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed like
-// a GPR on the PPC970.  As such, copies in and out have the same performance
-// characteristics as an OR instruction.
-def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
-                             "mtspr 256, $rS", IntGeneral>,
-               PPC970_DGroup_Single, PPC970_Unit_FXU;
-def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
-                             "mfspr $rT, 256", IntGeneral>,
-               PPC970_DGroup_First, PPC970_Unit_FXU;
-
 let isCodeGenOnly = 1 in {
+  // Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed
+  // like a GPR on the PPC970.  As such, copies in and out have the same
+  // performance characteristics as an OR instruction.
+  def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins gprc:$rS),
+                               "mtspr 256, $rS", IntGeneral>,
+                 PPC970_DGroup_Single, PPC970_Unit_FXU;
+  def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs gprc:$rT), (ins),
+                               "mfspr $rT, 256", IntGeneral>,
+                 PPC970_DGroup_First, PPC970_Unit_FXU;
+
   def MTVRSAVEv : XFXForm_7_ext<31, 467, 256,
                                 (outs VRSAVERC:$reg), (ins gprc:$rS),
                                 "mtspr 256, $rS", IntGeneral>,
@@ -1736,34 +1934,23 @@ def RESTORE_VRSAVE : Pseudo<(outs VRSAVERC:$vrsave), (ins memri:$F),
                      "#RESTORE_VRSAVE", []>;
 
 let neverHasSideEffects = 1 in {
-def MTCRF : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins gprc:$rS),
-                      "mtcrf $FXM, $rS", BrMCRX>,
-            PPC970_MicroCode, PPC970_Unit_CRU;
+def MTOCRF: XFXForm_5a<31, 144, (outs crbitm:$FXM), (ins gprc:$ST),
+                       "mtocrf $FXM, $ST", BrMCRX>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
 
-// This is a pseudo for MFCR, which implicitly uses all 8 of its subregisters;
-// declaring that here gives the local register allocator problems with this:
-//  vreg = MCRF  CR0
-//  MFCR  <kill of whatever preg got assigned to vreg>
-// while not declaring it breaks DeadMachineInstructionElimination.
-// As it turns out, in all cases where we currently use this,
-// we're only interested in one subregister of it.  Represent this in the
-// instruction to keep the register allocator from becoming confused.
-//
-// FIXME: Make this a real Pseudo instruction when the JIT switches to MC.
-let isCodeGenOnly = 1 in
-def MFCRpseud: XFXForm_3<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
-                       "#MFCRpseud", SprMFCR>,
+def MTCRF : XFXForm_5<31, 144, (outs), (ins i32imm:$FXM, gprc:$rS),
+                      "mtcrf $FXM, $rS", BrMCRX>,
             PPC970_MicroCode, PPC970_Unit_CRU;
 
+let hasExtraSrcRegAllocReq = 1 in // to enable post-ra anti-dep breaking.
 def MFOCRF: XFXForm_5a<31, 19, (outs gprc:$rT), (ins crbitm:$FXM),
                        "mfocrf $rT, $FXM", SprMFCR>,
             PPC970_DGroup_First, PPC970_Unit_CRU;
-} // neverHasSideEffects = 1
 
-let neverHasSideEffects = 1 in
 def MFCR : XFXForm_3<31, 19, (outs gprc:$rT), (ins),
                      "mfcr $rT", SprMFCR>,
                      PPC970_MicroCode, PPC970_Unit_CRU;
+} // neverHasSideEffects = 1
 
 // Pseudo instruction to perform FADD in round-to-zero mode.
 let usesCustomInserter = 1, Uses = [RM] in {
@@ -2004,7 +2191,7 @@ def : Pat<(or i32:$in, imm:$imm),
 def : Pat<(xor i32:$in, imm:$imm),
           (XORIS (XORI $in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
 // SUBFIC
-def : Pat<(sub immSExt16:$imm, i32:$in),
+def : Pat<(sub imm32SExt16:$imm, i32:$in),
           (SUBFIC $in, imm:$imm)>;
 
 // SHL/SRL
@@ -2097,7 +2284,7 @@ def : Pat<(f64 (extloadf32 xaddr:$src)),
 def : Pat<(f64 (fextend f32:$src)),
           (COPY_TO_REGCLASS $src, F8RC)>;
 
-def : Pat<(atomic_fence (imm), (imm)), (SYNC)>;
+def : Pat<(atomic_fence (imm), (imm)), (SYNC 0)>;
 
 // Additional FNMSUB patterns: -a*c + b == -(a*c - b)
 def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
@@ -2109,6 +2296,12 @@ def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
 def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
           (FNMSUBS $A, $C, $B)>;
 
+// FCOPYSIGN's operand types need not agree.
+def : Pat<(fcopysign f64:$frB, f32:$frA),
+          (FCPSGND (COPY_TO_REGCLASS $frA, F8RC), $frB)>;
+def : Pat<(fcopysign f32:$frB, f64:$frA),
+          (FCPSGNS (COPY_TO_REGCLASS $frA, F4RC), $frB)>;
+
 include "PPCInstrAltivec.td"
 include "PPCInstr64Bit.td"
 
@@ -2123,6 +2316,41 @@ def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
 def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
                     "icbi $src", LdStICBI, []>;
 
+def EIEIO : XForm_24_eieio<31, 854, (outs), (ins),
+                           "eieio", LdStLoad, []>;
+
+def WAIT : XForm_24_sync<31, 62, (outs), (ins i32imm:$L),
+                         "wait $L", LdStLoad, []>;
+
+def MTMSR: XForm_mtmsr<31, 146, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsr $RS, $L", SprMTMSR>;
+
+def MFMSR : XForm_rs<31, 83, (outs gprc:$RT), (ins),
+                  "mfmsr $RT", SprMFMSR, []>;
+
+def MTMSRD : XForm_mtmsr<31, 178, (outs), (ins gprc:$RS, i32imm:$L),
+                    "mtmsrd $RS, $L", SprMTMSRD>;
+
+def SLBIE : XForm_16b<31, 434, (outs), (ins gprc:$RB),
+                        "slbie $RB", SprSLBIE, []>;
+
+def SLBMTE : XForm_26<31, 402, (outs), (ins gprc:$RS, gprc:$RB),
+                    "slbmte $RS, $RB", SprSLBMTE, []>;
+
+def SLBMFEE : XForm_26<31, 915, (outs gprc:$RT), (ins gprc:$RB),
+                       "slbmfee $RT, $RB", SprSLBMFEE, []>;
+
+def SLBIA : XForm_0<31, 498, (outs), (ins), "slbia", SprSLBIA, []>;
+
+def TLBSYNC : XForm_0<31, 566, (outs), (ins),
+                        "tlbsync", SprTLBSYNC, []>;
+
+def TLBIEL : XForm_16b<31, 274, (outs), (ins gprc:$RB),
+                          "tlbiel $RB", SprTLBIEL, []>;
+
+def TLBIE : XForm_26<31, 306, (outs), (ins gprc:$RS, gprc:$RB),
+                          "tlbie $RB,$RS", SprTLBIE, []>;
+
 //===----------------------------------------------------------------------===//
 // PowerPC Assembler Instruction Aliases
 //
@@ -2143,66 +2371,339 @@ class PPCAsmPseudo<string asm, dag iops>
   let isPseudo = 1;
 }
 
-def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"sc", (SC 0)>;
+
+def : InstAlias<"sync", (SYNC 0)>;
+def : InstAlias<"msync", (SYNC 0)>;
+def : InstAlias<"lwsync", (SYNC 1)>;
+def : InstAlias<"ptesync", (SYNC 2)>;
+
+def : InstAlias<"wait", (WAIT 0)>;
+def : InstAlias<"waitrsv", (WAIT 1)>;
+def : InstAlias<"waitimpl", (WAIT 2)>;
+
+def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
+def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+
+def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
+def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
 
+def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;
+
+def : InstAlias<"xnop", (XORI R0, R0, 0)>;
+
+def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"mr. $rA, $rB", (OR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"not. $rA, $rB", (NOR8o g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
+
+def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;
+
+def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
+                        (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm",
+                         (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm",
+                         (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBICo : PPCAsmPseudo<"subic. $rA, $rB, $imm",
+                          (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+
+def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8o g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+
+def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
+def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
+
+def : InstAlias<"mfsprg $RT, 0", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg $RT, 1", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg $RT, 2", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg $RT, 3", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mfsprg0 $RT", (MFSPR gprc:$RT, 272)>;
+def : InstAlias<"mfsprg1 $RT", (MFSPR gprc:$RT, 273)>;
+def : InstAlias<"mfsprg2 $RT", (MFSPR gprc:$RT, 274)>;
+def : InstAlias<"mfsprg3 $RT", (MFSPR gprc:$RT, 275)>;
+
+def : InstAlias<"mtsprg 0, $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg 1, $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg 2, $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg 3, $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtsprg0 $RT", (MTSPR 272, gprc:$RT)>;
+def : InstAlias<"mtsprg1 $RT", (MTSPR 273, gprc:$RT)>;
+def : InstAlias<"mtsprg2 $RT", (MTSPR 274, gprc:$RT)>;
+def : InstAlias<"mtsprg3 $RT", (MTSPR 275, gprc:$RT)>;
+
+def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
+
+def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
+def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;
+
+def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
+
+def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
+def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;
+
+def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
+def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
+def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
+def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;
+
+def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
+
+def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTLWIo : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWI : PPCAsmPseudo<"extrwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def EXTRWIo : PPCAsmPseudo<"extrwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWI : PPCAsmPseudo<"inslwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSLWIo : PPCAsmPseudo<"inslwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWI : PPCAsmPseudo<"insrwi $rA, $rS, $n, $b",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def INSRWIo : PPCAsmPseudo<"insrwi. $rA, $rS, $n, $b",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
+def ROTRWI : PPCAsmPseudo<"rotrwi $rA, $rS, $n",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def ROTRWIo : PPCAsmPseudo<"rotrwi. $rA, $rS, $n",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
 def SLWI : PPCAsmPseudo<"slwi $rA, $rS, $n",
                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SLWIo : PPCAsmPseudo<"slwi. $rA, $rS, $n",
+                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
 def SRWI : PPCAsmPseudo<"srwi $rA, $rS, $n",
                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def SRWIo : PPCAsmPseudo<"srwi. $rA, $rS, $n",
+                         (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWI : PPCAsmPseudo<"clrrwi $rA, $rS, $n",
+                          (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRRWIo : PPCAsmPseudo<"clrrwi. $rA, $rS, $n",
+                           (ins gprc:$rA, gprc:$rS, u5imm:$n)>;
+def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n",
+                            (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+def CLRLSLWIo : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n",
+                             (ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+
+def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNMo gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
+def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINMo gprc:$rA, gprc:$rS, 0, u5imm:$n, 31)>;
+
+def EXTLDI : PPCAsmPseudo<"extldi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTLDIo : PPCAsmPseudo<"extldi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDI : PPCAsmPseudo<"extrdi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def EXTRDIo : PPCAsmPseudo<"extrdi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDI : PPCAsmPseudo<"insrdi $rA, $rS, $n, $b",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def INSRDIo : PPCAsmPseudo<"insrdi. $rA, $rS, $n, $b",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n, u6imm:$b)>;
+def ROTRDI : PPCAsmPseudo<"rotrdi $rA, $rS, $n",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def ROTRDIo : PPCAsmPseudo<"rotrdi. $rA, $rS, $n",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
 def SLDI : PPCAsmPseudo<"sldi $rA, $rS, $n",
                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def SLDIo : PPCAsmPseudo<"sldi. $rA, $rS, $n",
+                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
 def SRDI : PPCAsmPseudo<"srdi $rA, $rS, $n",
                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
-
-def : InstAlias<"blt $cc, $dst", (BCC 12, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bgt $cc, $dst", (BCC 44, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"beq $cc, $dst", (BCC 76, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bun $cc, $dst", (BCC 108, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bso $cc, $dst", (BCC 108, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bge $cc, $dst", (BCC 4, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bnl $cc, $dst", (BCC 4, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"ble $cc, $dst", (BCC 36, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bng $cc, $dst", (BCC 36, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bne $cc, $dst", (BCC 68, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bnu $cc, $dst", (BCC 100, crrc:$cc, condbrtarget:$dst)>;
-def : InstAlias<"bns $cc, $dst", (BCC 100, crrc:$cc, condbrtarget:$dst)>;
-
-def : InstAlias<"bltlr $cc", (BCLR 12, crrc:$cc)>;
-def : InstAlias<"bgtlr $cc", (BCLR 44, crrc:$cc)>;
-def : InstAlias<"beqlr $cc", (BCLR 76, crrc:$cc)>;
-def : InstAlias<"bunlr $cc", (BCLR 108, crrc:$cc)>;
-def : InstAlias<"bsolr $cc", (BCLR 108, crrc:$cc)>;
-def : InstAlias<"bgelr $cc", (BCLR 4, crrc:$cc)>;
-def : InstAlias<"bnllr $cc", (BCLR 4, crrc:$cc)>;
-def : InstAlias<"blelr $cc", (BCLR 36, crrc:$cc)>;
-def : InstAlias<"bnglr $cc", (BCLR 36, crrc:$cc)>;
-def : InstAlias<"bnelr $cc", (BCLR 68, crrc:$cc)>;
-def : InstAlias<"bnulr $cc", (BCLR 100, crrc:$cc)>;
-def : InstAlias<"bnslr $cc", (BCLR 100, crrc:$cc)>;
-
-def : InstAlias<"bltctr $cc", (BCCTR 12, crrc:$cc)>;
-def : InstAlias<"bgtctr $cc", (BCCTR 44, crrc:$cc)>;
-def : InstAlias<"beqctr $cc", (BCCTR 76, crrc:$cc)>;
-def : InstAlias<"bunctr $cc", (BCCTR 108, crrc:$cc)>;
-def : InstAlias<"bsoctr $cc", (BCCTR 108, crrc:$cc)>;
-def : InstAlias<"bgectr $cc", (BCCTR 4, crrc:$cc)>;
-def : InstAlias<"bnlctr $cc", (BCCTR 4, crrc:$cc)>;
-def : InstAlias<"blectr $cc", (BCCTR 36, crrc:$cc)>;
-def : InstAlias<"bngctr $cc", (BCCTR 36, crrc:$cc)>;
-def : InstAlias<"bnectr $cc", (BCCTR 68, crrc:$cc)>;
-def : InstAlias<"bnuctr $cc", (BCCTR 100, crrc:$cc)>;
-def : InstAlias<"bnsctr $cc", (BCCTR 100, crrc:$cc)>;
-
-def : InstAlias<"bltctrl $cc", (BCCTRL 12, crrc:$cc)>;
-def : InstAlias<"bgtctrl $cc", (BCCTRL 44, crrc:$cc)>;
-def : InstAlias<"beqctrl $cc", (BCCTRL 76, crrc:$cc)>;
-def : InstAlias<"bunctrl $cc", (BCCTRL 108, crrc:$cc)>;
-def : InstAlias<"bsoctrl $cc", (BCCTRL 108, crrc:$cc)>;
-def : InstAlias<"bgectrl $cc", (BCCTRL 4, crrc:$cc)>;
-def : InstAlias<"bnlctrl $cc", (BCCTRL 4, crrc:$cc)>;
-def : InstAlias<"blectrl $cc", (BCCTRL 36, crrc:$cc)>;
-def : InstAlias<"bngctrl $cc", (BCCTRL 36, crrc:$cc)>;
-def : InstAlias<"bnectrl $cc", (BCCTRL 68, crrc:$cc)>;
-def : InstAlias<"bnuctrl $cc", (BCCTRL 100, crrc:$cc)>;
-def : InstAlias<"bnsctrl $cc", (BCCTRL 100, crrc:$cc)>;
+def SRDIo : PPCAsmPseudo<"srdi. $rA, $rS, $n",
+                         (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDI : PPCAsmPseudo<"clrrdi $rA, $rS, $n",
+                          (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRRDIo : PPCAsmPseudo<"clrrdi. $rA, $rS, $n",
+                           (ins g8rc:$rA, g8rc:$rS, u6imm:$n)>;
+def CLRLSLDI : PPCAsmPseudo<"clrlsldi $rA, $rS, $b, $n",
+                            (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+def CLRLSLDIo : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
+                             (ins g8rc:$rA, g8rc:$rS, u6imm:$b, u6imm:$n)>;
+
+def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
+def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>;
+
+// These generic branch instruction forms are used for the assembler parser only.
+// Defs and Uses are conservative, since we don't know the BO value.
+let PPC970_Unit = 7 in {
+  let Defs = [CTR], Uses = [CTR, RM] in {
+    def gBC : BForm_3<16, 0, 0, (outs),
+                      (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+                      "bc $bo, $bi, $dst">;
+    def gBCA : BForm_3<16, 1, 0, (outs),
+                       (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+                       "bca $bo, $bi, $dst">;
+  }
+  let Defs = [LR, CTR], Uses = [CTR, RM] in {
+    def gBCL : BForm_3<16, 0, 1, (outs),
+                       (ins u5imm:$bo, crbitrc:$bi, condbrtarget:$dst),
+                       "bcl $bo, $bi, $dst">;
+    def gBCLA : BForm_3<16, 1, 1, (outs),
+                        (ins u5imm:$bo, crbitrc:$bi, abscondbrtarget:$dst),
+                        "bcla $bo, $bi, $dst">;
+  }
+  let Defs = [CTR], Uses = [CTR, LR, RM] in
+    def gBCLR : XLForm_2<19, 16, 0, (outs),
+                         (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                         "bclr $bo, $bi, $bh", BrB, []>;
+  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
+    def gBCLRL : XLForm_2<19, 16, 1, (outs),
+                          (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                          "bclrl $bo, $bi, $bh", BrB, []>;
+  let Defs = [CTR], Uses = [CTR, LR, RM] in
+    def gBCCTR : XLForm_2<19, 528, 0, (outs),
+                          (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                          "bcctr $bo, $bi, $bh", BrB, []>;
+  let Defs = [LR, CTR], Uses = [CTR, LR, RM] in
+    def gBCCTRL : XLForm_2<19, 528, 1, (outs),
+                           (ins u5imm:$bo, crbitrc:$bi, i32imm:$bh),
+                           "bcctrl $bo, $bi, $bh", BrB, []>;
+}
+def : InstAlias<"bclr $bo, $bi", (gBCLR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bclrl $bo, $bi", (gBCLRL u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctr $bo, $bi", (gBCCTR u5imm:$bo, crbitrc:$bi, 0)>;
+def : InstAlias<"bcctrl $bo, $bi", (gBCCTRL u5imm:$bo, crbitrc:$bi, 0)>;
+
+multiclass BranchSimpleMnemonic1<string name, string pm, int bo> {
+  def : InstAlias<"b"#name#pm#" $bi, $dst", (gBC bo, crbitrc:$bi, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"a"#pm#" $bi, $dst", (gBCA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"lr"#pm#" $bi", (gBCLR bo, crbitrc:$bi, 0)>;
+  def : InstAlias<"b"#name#"l"#pm#" $bi, $dst", (gBCL bo, crbitrc:$bi, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"la"#pm#" $bi, $dst", (gBCLA bo, crbitrc:$bi, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"lrl"#pm#" $bi", (gBCLRL bo, crbitrc:$bi, 0)>;
+}
+multiclass BranchSimpleMnemonic2<string name, string pm, int bo>
+  : BranchSimpleMnemonic1<name, pm, bo> {
+  def : InstAlias<"b"#name#"ctr"#pm#" $bi", (gBCCTR bo, crbitrc:$bi, 0)>;
+  def : InstAlias<"b"#name#"ctrl"#pm#" $bi", (gBCCTRL bo, crbitrc:$bi, 0)>;
+}
+defm : BranchSimpleMnemonic2<"t", "", 12>;
+defm : BranchSimpleMnemonic2<"f", "", 4>;
+defm : BranchSimpleMnemonic2<"t", "-", 14>;
+defm : BranchSimpleMnemonic2<"f", "-", 6>;
+defm : BranchSimpleMnemonic2<"t", "+", 15>;
+defm : BranchSimpleMnemonic2<"f", "+", 7>;
+defm : BranchSimpleMnemonic1<"dnzt", "", 8>;
+defm : BranchSimpleMnemonic1<"dnzf", "", 0>;
+defm : BranchSimpleMnemonic1<"dzt", "", 10>;
+defm : BranchSimpleMnemonic1<"dzf", "", 2>;
+
+multiclass BranchExtendedMnemonicPM<string name, string pm, int bibo> {
+  def : InstAlias<"b"#name#pm#" $cc, $dst",
+                  (BCC bibo, crrc:$cc, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#pm#" $dst",
+                  (BCC bibo, CR0, condbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"a"#pm#" $cc, $dst",
+                  (BCCA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"a"#pm#" $dst",
+                  (BCCA bibo, CR0, abscondbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"lr"#pm#" $cc",
+                  (BCLR bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"lr"#pm,
+                  (BCLR bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"ctr"#pm#" $cc",
+                  (BCCTR bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"ctr"#pm,
+                  (BCCTR bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"l"#pm#" $cc, $dst",
+                  (BCCL bibo, crrc:$cc, condbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"l"#pm#" $dst",
+                  (BCCL bibo, CR0, condbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"la"#pm#" $cc, $dst",
+                  (BCCLA bibo, crrc:$cc, abscondbrtarget:$dst)>;
+  def : InstAlias<"b"#name#"la"#pm#" $dst",
+                  (BCCLA bibo, CR0, abscondbrtarget:$dst)>;
+
+  def : InstAlias<"b"#name#"lrl"#pm#" $cc",
+                  (BCLRL bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"lrl"#pm,
+                  (BCLRL bibo, CR0)>;
+
+  def : InstAlias<"b"#name#"ctrl"#pm#" $cc",
+                  (BCCTRL bibo, crrc:$cc)>;
+  def : InstAlias<"b"#name#"ctrl"#pm,
+                  (BCCTRL bibo, CR0)>;
+}
+multiclass BranchExtendedMnemonic<string name, int bibo> {
+  defm : BranchExtendedMnemonicPM<name, "", bibo>;
+  defm : BranchExtendedMnemonicPM<name, "-", !add(bibo, 2)>;
+  defm : BranchExtendedMnemonicPM<name, "+", !add(bibo, 3)>;
+}
+defm : BranchExtendedMnemonic<"lt", 12>;
+defm : BranchExtendedMnemonic<"gt", 44>;
+defm : BranchExtendedMnemonic<"eq", 76>;
+defm : BranchExtendedMnemonic<"un", 108>;
+defm : BranchExtendedMnemonic<"so", 108>;
+defm : BranchExtendedMnemonic<"ge", 4>;
+defm : BranchExtendedMnemonic<"nl", 4>;
+defm : BranchExtendedMnemonic<"le", 36>;
+defm : BranchExtendedMnemonic<"ng", 36>;
+defm : BranchExtendedMnemonic<"ne", 68>;
+defm : BranchExtendedMnemonic<"nu", 100>;
+defm : BranchExtendedMnemonic<"ns", 100>;
+
+def : InstAlias<"cmpwi $rA, $imm", (CMPWI CR0, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpw $rA, $rB", (CMPW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmplwi $rA, $imm", (CMPLWI CR0, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmplw $rA, $rB", (CMPLW CR0, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpdi $rA, $imm", (CMPDI CR0, g8rc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmpd $rA, $rB", (CMPD CR0, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpldi $rA, $imm", (CMPLDI CR0, g8rc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpld $rA, $rB", (CMPLD CR0, g8rc:$rA, g8rc:$rB)>;
+
+def : InstAlias<"cmpi $bf, 0, $rA, $imm", (CMPWI crrc:$bf, gprc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmp $bf, 0, $rA, $rB", (CMPW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpli $bf, 0, $rA, $imm", (CMPLWI crrc:$bf, gprc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpl $bf, 0, $rA, $rB", (CMPLW crrc:$bf, gprc:$rA, gprc:$rB)>;
+def : InstAlias<"cmpi $bf, 1, $rA, $imm", (CMPDI crrc:$bf, g8rc:$rA, s16imm:$imm)>;
+def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm:$imm)>;
+def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+
+multiclass TrapExtendedMnemonic<string name, int to> {
+  def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>;
+  def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>;
+  def : InstAlias<"tw"#name#"i $rA, $imm", (TWI to, gprc:$rA, s16imm:$imm)>;
+  def : InstAlias<"tw"#name#" $rA, $rB", (TW to, gprc:$rA, gprc:$rB)>;
+}
+defm : TrapExtendedMnemonic<"lt", 16>;
+defm : TrapExtendedMnemonic<"le", 20>;
+defm : TrapExtendedMnemonic<"eq", 4>;
+defm : TrapExtendedMnemonic<"ge", 12>;
+defm : TrapExtendedMnemonic<"gt", 8>;
+defm : TrapExtendedMnemonic<"nl", 12>;
+defm : TrapExtendedMnemonic<"ne", 24>;
+defm : TrapExtendedMnemonic<"ng", 20>;
+defm : TrapExtendedMnemonic<"llt", 2>;
+defm : TrapExtendedMnemonic<"lle", 6>;
+defm : TrapExtendedMnemonic<"lge", 5>;
+defm : TrapExtendedMnemonic<"lgt", 1>;
+defm : TrapExtendedMnemonic<"lnl", 5>;
+defm : TrapExtendedMnemonic<"lng", 6>;
+defm : TrapExtendedMnemonic<"u", 31>;
 
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
index cfcd749..5e3a48d 100644
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -71,8 +71,13 @@ static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
 extern "C" void PPC32CompilationCallback();
 extern "C" void PPC64CompilationCallback();
 
-#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
-    !(defined(__ppc64__) || defined(__FreeBSD__))
+// The first clause of the preprocessor directive looks wrong, but it is
+// necessary when compiling this code on non-PowerPC hosts.
+#if (!defined(__ppc__) && !defined(__powerpc__)) || defined(__powerpc64__) || defined(__ppc64__)
+void PPC32CompilationCallback() {
+  llvm_unreachable("This is not a 32bit PowerPC, you can't execute this!");
+}
+#elif !defined(__ELF__)
 // CompilationCallback stub - We can't use a C function with inline assembly in
 // it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
 // write our own wrapper, which does things our way, so we have complete control
@@ -137,8 +142,8 @@ asm(
     "bctr\n"
     );
 
-#elif defined(__PPC__) && !defined(__ppc64__)
-// Linux & FreeBSD / PPC 32 support
+#else
+// ELF PPC 32 support
 
 // CompilationCallback stub - We can't use a C function with inline assembly in
 // it, because we the prolog/epilog inserted by GCC won't work for us.  Instead,
@@ -197,15 +202,14 @@ asm(
     "mtlr 0\n"
     "bctr\n"
     );
-#else
-void PPC32CompilationCallback() {
-  llvm_unreachable("This is not a power pc, you can't execute this!");
-}
 #endif
 
-#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
-    defined(__ppc64__)
-#ifdef __ELF__
+#if !defined(__powerpc64__) && !defined(__ppc64__)
+void PPC64CompilationCallback() {
+  llvm_unreachable("This is not a 64bit PowerPC, you can't execute this!");
+}
+#else
+#  ifdef __ELF__
 asm(
     ".text\n"
     ".align 2\n"
@@ -219,13 +223,13 @@ asm(
     ".align 4\n"
     ".type PPC64CompilationCallback,@function\n"
 ".L.PPC64CompilationCallback:\n"
-#else
+#  else
 asm(
     ".text\n"
     ".align 2\n"
     ".globl _PPC64CompilationCallback\n"
 "_PPC64CompilationCallback:\n"
-#endif
+#  endif
     // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
     // FIXME: need to save v[0-19] for altivec?
     // Set up a proper stack frame
@@ -258,12 +262,12 @@ asm(
     "ld   5, 280(1)\n" // stub's frame
     "ld   4, 16(5)\n"  // stub's lr
     "li   5, 1\n"      // 1 == 64 bit
-#ifdef __ELF__
+#  ifdef __ELF__
     "bl LLVMPPCCompilationCallback\n"
     "nop\n"
-#else
+#  else
     "bl _LLVMPPCCompilationCallback\n"
-#endif
+#  endif
     "mtctr 3\n"
     // Restore all int arg registers
     "ld 10, 272(1)\n"    "ld 9,  264(1)\n"
@@ -285,10 +289,6 @@ asm(
     // XXX: any special TOC handling in the ELF case for JIT?
     "bctr\n"
     );
-#else
-void PPC64CompilationCallback() {
-  llvm_unreachable("This is not a power pc, you can't execute this!");
-}
 #endif
 
 extern "C" {
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index f8cf3a5..f61c8bf 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "PPC.h"
+#include "MCTargetDesc/PPCMCExpr.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
@@ -68,7 +69,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
     if (MO.isGlobal()) {
       StubSym =
       MachineModuleInfoImpl::
-      StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+      StubValueTy(AP.getSymbol(MO.getGlobal()),
                   !MO.getGlobal()->hasInternalLinkage());
     } else {
       Name.erase(Name.end()-5, Name.end());
@@ -94,7 +95,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
     if (StubSym.getPointer() == 0) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym = MachineModuleInfoImpl::
-                   StubValueTy(AP.Mang->getSymbol(MO.getGlobal()),
+                   StubValueTy(AP.getSymbol(MO.getGlobal()),
                                !MO.getGlobal()->hasInternalLinkage());
     }
     return Sym;
@@ -111,31 +112,26 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
   unsigned access = MO.getTargetFlags() & PPCII::MO_ACCESS_MASK;
 
   switch (access) {
-    case PPCII::MO_HA16: RefKind = isDarwin ? 
-                           MCSymbolRefExpr::VK_PPC_DARWIN_HA16 : 
-                           MCSymbolRefExpr::VK_PPC_GAS_HA16; 
-                         break;
-    case PPCII::MO_LO16: RefKind = isDarwin ? 
-                           MCSymbolRefExpr::VK_PPC_DARWIN_LO16 : 
-                           MCSymbolRefExpr::VK_PPC_GAS_LO16; 
-                         break;
-    case PPCII::MO_TPREL16_HA: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_HA;
-                               break;
-    case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO;
-                               break;
-    case PPCII::MO_DTPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_DTPREL16_LO;
-                                break;
-    case PPCII::MO_TLSLD16_LO: RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO;
-                               break;
-    case PPCII::MO_TOC16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TOC16_LO;
-                             break;
-   }
+    case PPCII::MO_TPREL_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_TPREL_LO;
+      break;
+    case PPCII::MO_TPREL_HA:
+      RefKind = MCSymbolRefExpr::VK_PPC_TPREL_HA;
+      break;
+    case PPCII::MO_DTPREL_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_DTPREL_LO;
+      break;
+    case PPCII::MO_TLSLD_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_GOT_TLSLD_LO;
+      break;
+    case PPCII::MO_TOC_LO:
+      RefKind = MCSymbolRefExpr::VK_PPC_TOC_LO;
+      break;
+    case PPCII::MO_TLS:
+      RefKind = MCSymbolRefExpr::VK_PPC_TLS;
+      break;
+  }
 
-  // FIXME: This isn't right, but we don't have a good way to express this in
-  // the MC Level, see below.
-  if (MO.getTargetFlags() & PPCII::MO_PIC_FLAG)
-    RefKind = MCSymbolRefExpr::VK_None;
-  
   const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, RefKind, Ctx);
 
   if (!MO.isJTI() && MO.getOffset())
@@ -149,10 +145,18 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
     
     const MCExpr *PB = MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
     Expr = MCBinaryExpr::CreateSub(Expr, PB, Ctx);
-    // FIXME: We have no way to make the result be VK_PPC_LO16/VK_PPC_HA16,
-    // since it is not a symbol!
   }
-  
+
+  // Add ha16() / lo16() markers if required.
+  switch (access) {
+    case PPCII::MO_LO:
+      Expr = PPCMCExpr::CreateLo(Expr, isDarwin, Ctx);
+      break;
+    case PPCII::MO_HA:
+      Expr = PPCMCExpr::CreateHa(Expr, isDarwin, Ctx);
+      break;
+  }
+
   return MCOperand::CreateExpr(Expr);
 }
 
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 40d1f3a..33f843d 100644
--- a/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -32,6 +32,9 @@ class PPCFunctionInfo : public MachineFunctionInfo {
   ///
   int ReturnAddrSaveIndex;
 
+  /// Frame index where the old base pointer is stored.
+  int BasePointerSaveIndex;
+
   /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
   /// function.  This is only valid after the initial scan of the function by
   /// PEI.
@@ -93,6 +96,7 @@ public:
   explicit PPCFunctionInfo(MachineFunction &MF) 
     : FramePointerSaveIndex(0),
       ReturnAddrSaveIndex(0),
+      BasePointerSaveIndex(0),
       HasSpills(false),
       HasNonRISpills(false),
       SpillsCR(false),
@@ -113,6 +117,9 @@ public:
   int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
   void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
 
+  int getBasePointerSaveIndex() const { return BasePointerSaveIndex; }
+  void setBasePointerSaveIndex(int Idx) { BasePointerSaveIndex = Idx; }
+
   unsigned getMinReservedArea() const { return MinReservedArea; }
   void setMinReservedArea(unsigned size) { MinReservedArea = size; }
 
@@ -160,7 +167,7 @@ public:
   int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
   void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
 
-  const SmallVector<unsigned, 3> &
+  const SmallVectorImpl<unsigned> &
     getMustSaveCRs() const { return MustSaveCRs; }
   void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); }
 };
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 2be6324..19ccbfc 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -48,12 +48,19 @@
 
 using namespace llvm;
 
-PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
-                                 const TargetInstrInfo &tii)
+static cl::opt<bool>
+EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
+         cl::desc("Enable use of a base pointer for complex stack frames"));
+
+static cl::opt<bool>
+AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false),
+         cl::desc("Force the use of a base pointer in every function"));
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST)
   : PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
                        ST.isPPC64() ? 0 : 1,
                        ST.isPPC64() ? 0 : 1),
-    Subtarget(ST), TII(tii) {
+    Subtarget(ST) {
   ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
   ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
   ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
@@ -62,6 +69,7 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
   ImmToIdxMap[PPC::STH]  = PPC::STHX;   ImmToIdxMap[PPC::STW]  = PPC::STWX;
   ImmToIdxMap[PPC::STFS] = PPC::STFSX;  ImmToIdxMap[PPC::STFD] = PPC::STFDX;
   ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+  ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
 
   // 64-bit
   ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
@@ -92,32 +100,41 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
 const uint16_t*
 PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   if (Subtarget.isDarwinABI())
-    return Subtarget.isPPC64() ? CSR_Darwin64_SaveList :
-                                 CSR_Darwin32_SaveList;
-
-  return Subtarget.isPPC64() ? CSR_SVR464_SaveList : CSR_SVR432_SaveList;
+    return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                  CSR_Darwin64_Altivec_SaveList :
+                                  CSR_Darwin64_SaveList) :
+                                 (Subtarget.hasAltivec() ?
+                                  CSR_Darwin32_Altivec_SaveList :
+                                  CSR_Darwin32_SaveList);
+
+  return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                CSR_SVR464_Altivec_SaveList :
+                                CSR_SVR464_SaveList) :
+                               (Subtarget.hasAltivec() ?
+                                CSR_SVR432_Altivec_SaveList :
+                                CSR_SVR432_SaveList);
 }
 
 const uint32_t*
 PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   if (Subtarget.isDarwinABI())
-    return Subtarget.isPPC64() ? CSR_Darwin64_RegMask :
-                                 CSR_Darwin32_RegMask;
-
-  return Subtarget.isPPC64() ? CSR_SVR464_RegMask : CSR_SVR432_RegMask;
+    return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                  CSR_Darwin64_Altivec_RegMask :
+                                  CSR_Darwin64_RegMask) :
+                                 (Subtarget.hasAltivec() ?
+                                  CSR_Darwin32_Altivec_RegMask :
+                                  CSR_Darwin32_RegMask);
+
+  return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
+                                CSR_SVR464_Altivec_RegMask :
+                                CSR_SVR464_RegMask) :
+                               (Subtarget.hasAltivec() ?
+                                CSR_SVR432_Altivec_RegMask :
+                                CSR_SVR432_RegMask);
 }
 
 const uint32_t*
 PPCRegisterInfo::getNoPreservedMask() const {
-  // The naming here is inverted: The CSR_NoRegs_Altivec has the
-  // Altivec registers masked so that they're not saved and restored around
-  // instructions with this preserved mask.
-
-  if (!Subtarget.hasAltivec())
-    return CSR_NoRegs_Altivec_RegMask;
-
-  if (Subtarget.isDarwin())
-    return CSR_NoRegs_Darwin_RegMask;
   return CSR_NoRegs_RegMask;
 }
 
@@ -136,11 +153,24 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(PPC::FP);
   Reserved.set(PPC::FP8);
 
+  // The BP register is also not really a register, but is the representation
+  // of the base pointer register used by setjmp.
+  Reserved.set(PPC::BP);
+  Reserved.set(PPC::BP8);
+
+  // The counter registers must be reserved so that counter-based loops can
+  // be correctly formed (and the mtctr instructions are not DCE'd).
+  Reserved.set(PPC::CTR);
+  Reserved.set(PPC::CTR8);
+
   Reserved.set(PPC::R1);
   Reserved.set(PPC::LR);
   Reserved.set(PPC::LR8);
   Reserved.set(PPC::RM);
 
+  if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
+    Reserved.set(PPC::VRSAVE);
+
   // The SVR4 ABI reserves r2 and r13
   if (Subtarget.isSVR4ABI()) {
     Reserved.set(PPC::R2);  // System-reserved register
@@ -157,6 +187,9 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     if (PPCFI->needsFP(MF))
       Reserved.set(PPC::X31);
 
+    if (hasBasePointer(MF))
+      Reserved.set(PPC::X30);
+
     // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
     if (Subtarget.isSVR4ABI()) {
       Reserved.set(PPC::X2);
@@ -166,6 +199,15 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   if (PPCFI->needsFP(MF))
     Reserved.set(PPC::R31);
 
+  if (hasBasePointer(MF))
+    Reserved.set(PPC::R30);
+
+  // Reserve Altivec registers when Altivec is unavailable.
+  if (!Subtarget.hasAltivec())
+    for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
+         IE = PPC::VRRCRegClass.end(); I != IE; ++I)
+      Reserved.set(*I);
+
   return Reserved;
 }
 
@@ -214,6 +256,8 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
   MachineFunction &MF = *MBB.getParent();
   // Get the frame info.
   MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   // Determine whether 64-bit pointers are used.
   bool LP64 = Subtarget.isPPC64();
   DebugLoc dl = MI.getDebugLoc();
@@ -226,8 +270,8 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
   // Get stack alignments.
   unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
   unsigned MaxAlign = MFI->getMaxAlignment();
-  if (MaxAlign > TargetAlign)
-    report_fatal_error("Dynamic alloca with large aligns not supported");
+  assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
+         "Maximum call-frame size not sufficiently aligned");
 
   // Determine the previous frame's address.  If FrameSize can't be
   // represented as 16 bits or we need special alignment, then we load the
@@ -252,40 +296,62 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
       .addImm(0)
       .addReg(PPC::R1);
   }
-  
+
+  bool KillNegSizeReg = MI.getOperand(1).isKill();
+  unsigned NegSizeReg = MI.getOperand(1).getReg();
+
   // Grow the stack and update the stack pointer link, then determine the
   // address of new allocated space.
   if (LP64) {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
     BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
       .addReg(Reg, RegState::Kill)
       .addReg(PPC::X1)
-      .addReg(MI.getOperand(1).getReg());
-    if (!MI.getOperand(1).isKill())
-      BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
-        .addReg(PPC::X1)
-        .addImm(maxCallFrameSize);
-    else
-      // Implicitly kill the register.
-      BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
-        .addReg(PPC::X1)
-        .addImm(maxCallFrameSize)
-        .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+      .addReg(PPC::X1)
+      .addImm(maxCallFrameSize);
   } else {
+    if (MaxAlign > TargetAlign) {
+      unsigned UnalNegSizeReg = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+
+      // Unfortunately, there is no andi, only andi., and we can't insert that
+      // here because we might clobber cr0 while it is live.
+      BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg)
+        .addImm(~(MaxAlign-1));
+
+      unsigned NegSizeReg1 = NegSizeReg;
+      NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
+      BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg)
+        .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+        .addReg(NegSizeReg1, RegState::Kill);
+      KillNegSizeReg = true;
+    }
+
     BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
       .addReg(Reg, RegState::Kill)
       .addReg(PPC::R1)
-      .addReg(MI.getOperand(1).getReg());
-
-    if (!MI.getOperand(1).isKill())
-      BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
-        .addReg(PPC::R1)
-        .addImm(maxCallFrameSize);
-    else
-      // Implicitly kill the register.
-      BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
-        .addReg(PPC::R1)
-        .addImm(maxCallFrameSize)
-        .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+      .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+      .addReg(PPC::R1)
+      .addImm(maxCallFrameSize);
   }
   
   // Discard the DYNALLOC instruction.
@@ -307,6 +373,7 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -317,8 +384,8 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
   unsigned SrcReg = MI.getOperand(0).getReg();
 
   // We need to store the CR in the low 4-bits of the saved value. First, issue
-  // an MFCRpsued to save all of the CRBits and, if needed, kill the SrcReg.
-  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFCR8pseud : PPC::MFCRpseud), Reg)
+  // an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg.
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
           .addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
     
   // If the saved register wasn't CR0, shift the bits left so that they are in
@@ -350,6 +417,7 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   bool LP64 = Subtarget.isPPC64();
@@ -377,7 +445,7 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
              .addImm(31);
   }
 
-  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTCRF8 : PPC::MTCRF), DestReg)
+  BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), DestReg)
              .addReg(Reg, RegState::Kill);
 
   // Discard the pseudo instruction.
@@ -391,6 +459,7 @@ void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -415,6 +484,7 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
   // Get the instruction's basic block.
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   DebugLoc dl = MI.getDebugLoc();
 
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
@@ -454,9 +524,8 @@ PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
   return false;
 }
 
-// Figure out if the offset in the instruction is shifted right two bits. This
-// is true for instructions like "STD", which the machine implicitly adds two
-// low zeros to.
+// Figure out if the offset in the instruction must be a multiple of 4.
+// This is true for instructions like "STD".
 static bool usesIXAddr(const MachineInstr &MI) {
   unsigned OpC = MI.getOpcode();
 
@@ -464,6 +533,7 @@ static bool usesIXAddr(const MachineInstr &MI) {
   default:
     return false;
   case PPC::LWA:
+  case PPC::LWA_32:
   case PPC::LD:
   case PPC::STD:
     return true;
@@ -493,9 +563,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MachineBasicBlock &MBB = *MI.getParent();
   // Get the basic block's function.
   MachineFunction &MF = *MBB.getParent();
+  // Get the instruction info.
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   // Get the frame info.
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   DebugLoc dl = MI.getDebugLoc();
 
   unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
@@ -533,12 +604,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 
   // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
-
-  bool is64Bit = Subtarget.isPPC64();
-  MI.getOperand(FIOperandNum).ChangeToRegister(TFI->hasFP(MF) ?
-                                              (is64Bit ? PPC::X31 : PPC::R31) :
-                                                (is64Bit ? PPC::X1 : PPC::R1),
-                                              false);
+  MI.getOperand(FIOperandNum).ChangeToRegister(
+    FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false);
 
   // Figure out if the offset in the instruction is shifted right two bits.
   bool isIXAddr = usesIXAddr(MI);
@@ -549,10 +616,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
 
   // Now add the frame object offset to the offset from r1.
   int Offset = MFI->getObjectOffset(FrameIndex);
-  if (!isIXAddr)
-    Offset += MI.getOperand(OffsetOperandNo).getImm();
-  else
-    Offset += MI.getOperand(OffsetOperandNo).getImm() << 2;
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
 
   // If we're not using a Frame Pointer that has been set to the value of the
   // SP before having the stack size subtracted from it, then add the stack size
@@ -560,8 +624,10 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // Naked functions have stack size 0, although getStackSize may not reflect that
   // because we didn't call all the pieces that compute it for naked functions.
   if (!MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::Naked))
-    Offset += MFI->getStackSize();
+        hasAttribute(AttributeSet::FunctionIndex, Attribute::Naked)) {
+    if (!(hasBasePointer(MF) && FrameIndex < 0))
+      Offset += MFI->getStackSize();
+  }
 
   // If we can, encode the offset directly into the instruction.  If this is a
   // normal PPC "ri" instruction, any 16-bit value can be safely encoded.  If
@@ -569,11 +635,9 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // clear can be encoded.  This is extremely uncommon, because normally you
   // only "std" to a stack slot that is at least 4-byte aligned, but it can
   // happen in invalid code.
-  if (OpC == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
-      (!noImmForm &&
-       isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0))) {
-    if (isIXAddr)
-      Offset >>= 2;    // The actual encoded value has the low two bits zero.
+  assert(OpC != PPC::DBG_VALUE &&
+         "This should be handle in a target independent way");
+  if (!noImmForm && isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) {
     MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
     return;
   }
@@ -581,6 +645,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   // The offset doesn't fit into a single register, scavenge one to build the
   // offset in.
 
+  bool is64Bit = Subtarget.isPPC64();
   const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
   const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
   const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
@@ -626,12 +691,42 @@ unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
     return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
 }
 
-unsigned PPCRegisterInfo::getEHExceptionRegister() const {
-  return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3;
+unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
+  if (!hasBasePointer(MF))
+    return getFrameRegister(MF);
+
+  return Subtarget.isPPC64() ? PPC::X30 : PPC::R30;
 }
 
-unsigned PPCRegisterInfo::getEHHandlerRegister() const {
-  return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
+bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  if (!EnableBasePointer)
+    return false;
+  if (AlwaysBasePointer)
+    return true;
+
+  // If we need to realign the stack, then the stack pointer can no longer
+  // serve as an offset into the caller's stack space. As a result, we need a
+  // base pointer.
+  return needsStackRealignment(MF);
+}
+
+bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+    return false;
+
+  return true;
+}
+
+bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *F = MF.getFunction();
+  unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
+  bool requiresRealignment =
+    ((MFI->getMaxAlignment() > StackAlign) ||
+     F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                     Attribute::StackAlignment));
+
+  return requiresRealignment && canRealignStack(MF);
 }
 
 /// Returns true if the instruction's frame index
@@ -650,11 +745,7 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   }
 
   unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
-
-  if (!usesIXAddr(*MI))
-    Offset += MI->getOperand(OffsetOperandNo).getImm();
-  else
-    Offset += MI->getOperand(OffsetOperandNo).getImm() << 2;
+  Offset += MI->getOperand(OffsetOperandNo).getImm();
 
   // It's the load/store FI references that cause issues, as it can be difficult
   // to materialize the offset if it won't fit in the literal field. Estimate
@@ -711,9 +802,10 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
   if (Ins != MBB->end())
     DL = Ins->getDebugLoc();
 
+  const MachineFunction &MF = *MBB->getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   const MCInstrDesc &MCID = TII.get(ADDriOpc);
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  const MachineFunction &MF = *MBB->getParent();
   MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
 
   BuildMI(*MBB, Ins, DL, MCID, BaseReg)
@@ -734,17 +826,7 @@ PPCRegisterInfo::resolveFrameIndex(MachineBasicBlock::iterator I,
 
   MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false);
   unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
-
-  bool isIXAddr = usesIXAddr(MI);
-  if (!isIXAddr)
-    Offset += MI.getOperand(OffsetOperandNo).getImm();
-  else
-    Offset += MI.getOperand(OffsetOperandNo).getImm() << 2;
-
-  // Figure out if the offset in the instruction is shifted right two bits.
-  if (isIXAddr)
-    Offset >>= 2;    // The actual encoded value has the low two bits zero.
-
+  Offset += MI.getOperand(OffsetOperandNo).getImm();
   MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
 }
 
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 7a48b4b..dd3bb40 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -29,9 +29,8 @@ class Type;
 class PPCRegisterInfo : public PPCGenRegisterInfo {
   DenseMap<unsigned, unsigned> ImmToIdxMap;
   const PPCSubtarget &Subtarget;
-  const TargetInstrInfo &TII;
 public:
-  PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
+  PPCRegisterInfo(const PPCSubtarget &SubTarget);
   
   /// getPointerRegClass - Return the register class to use to hold pointers.
   /// This is used for addressing modes.
@@ -93,9 +92,11 @@ public:
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
 
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
+  // Base pointer (stack realignment) support.
+  unsigned getBaseRegister(const MachineFunction &MF) const;
+  bool hasBasePointer(const MachineFunction &MF) const;
+  bool canRealignStack(const MachineFunction &MF) const;
+  bool needsStackRealignment(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index 57a25f5..d566e2c 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -11,11 +11,11 @@
 //===----------------------------------------------------------------------===//
 
 let Namespace = "PPC" in {
-def sub_lt : SubRegIndex;
-def sub_gt : SubRegIndex;
-def sub_eq : SubRegIndex;
-def sub_un : SubRegIndex;
-def sub_32 : SubRegIndex;
+def sub_lt : SubRegIndex<1>;
+def sub_gt : SubRegIndex<1, 1>;
+def sub_eq : SubRegIndex<1, 2>;
+def sub_un : SubRegIndex<1, 3>;
+def sub_32 : SubRegIndex<32>;
 }
 
 
@@ -94,6 +94,10 @@ def ZERO8 : GP8<ZERO, "0">;
 def FP   : GPR<0 /* arbitrary */, "**FRAME POINTER**">;
 def FP8  : GP8<FP, "**FRAME POINTER**">;
 
+// Representations of the base pointer used by setjmp.
+def BP   : GPR<0 /* arbitrary */, "**BASE POINTER**">;
+def BP8  : GP8<BP, "**BASE POINTER**">;
+
 // Condition register bits
 def CR0LT : CRBIT< 0, "0">;
 def CR0GT : CRBIT< 1, "1">;
@@ -150,7 +154,7 @@ def CTR  : SPR<9, "ctr">, DwarfRegNum<[-2, 66]>;
 def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66, -2]>;
 
 // VRsave register
-def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[109]>;
+def VRSAVE: SPR<256, "vrsave">, DwarfRegNum<[109]>;
 
 // Carry bit.  In the architecture this is really bit 0 of the XER register
 // (which really is SPR register 1);  this is the only bit interesting to a
@@ -172,11 +176,11 @@ def RM: SPR<512, "**ROUNDING MODE**">;
 // then nonvolatiles in reverse order since stmw/lmw save from rN to r31
 def GPRC : RegisterClass<"PPC", [i32], 32, (add (sequence "R%u", 2, 12),
                                                 (sequence "R%u", 30, 13),
-                                                R31, R0, R1, FP)>;
+                                                R31, R0, R1, FP, BP)>;
 
 def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
                                                 (sequence "X%u", 30, 14),
-                                                X31, X13, X0, X1, FP8)>;
+                                                X31, X13, X0, X1, FP8, BP8)>;
 
 // For some instructions r0 is special (representing the value 0 instead of
 // the value in the r0 register), and we use these register subclasses to
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 660c0c3..92ba69c 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -108,6 +108,14 @@ def VecPerm      : InstrItinClass;
 def VecFPRound   : InstrItinClass;
 def VecVSL       : InstrItinClass;
 def VecVSR       : InstrItinClass;
+def SprMTMSRD    : InstrItinClass;
+def SprSLIE      : InstrItinClass;
+def SprSLBIE     : InstrItinClass;
+def SprSLBMTE    : InstrItinClass;
+def SprSLBMFEE   : InstrItinClass;
+def SprSLBIA     : InstrItinClass;
+def SprTLBIEL    : InstrItinClass;
+def SprTLBIE     : InstrItinClass;
 
 //===----------------------------------------------------------------------===//
 // Processor instruction itineraries.
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 8d5838e..1612cd2 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -14,39 +14,8 @@
 //===----------------------------------------------------------------------===//
 // Functional units on the PowerPC A2 chip sets
 //
-def IU0to3_0  : FuncUnit; // Fetch unit 1 to 4 slot 1
-def IU0to3_1  : FuncUnit; // Fetch unit 1 to 4 slot 2
-def IU0to3_2  : FuncUnit; // Fetch unit 1 to 4 slot 3
-def IU0to3_3  : FuncUnit; // Fetch unit 1 to 4 slot 4
-def IU4_0  : FuncUnit; // Instruction buffer slot 1
-def IU4_1  : FuncUnit; // Instruction buffer slot 2
-def IU4_2  : FuncUnit; // Instruction buffer slot 3
-def IU4_3  : FuncUnit; // Instruction buffer slot 4
-def IU4_4  : FuncUnit; // Instruction buffer slot 5
-def IU4_5  : FuncUnit; // Instruction buffer slot 6
-def IU4_6  : FuncUnit; // Instruction buffer slot 7
-def IU4_7  : FuncUnit; // Instruction buffer slot 8
-def IU5    : FuncUnit; // Dependency resolution
-def IU6    : FuncUnit; // Instruction issue
-def RF0    : FuncUnit;
-def XRF1   : FuncUnit;
-def XEX1   : FuncUnit; // Execution stage 1 for the XU pipeline
-def XEX2   : FuncUnit; // Execution stage 2 for the XU pipeline
-def XEX3   : FuncUnit; // Execution stage 3 for the XU pipeline
-def XEX4   : FuncUnit; // Execution stage 4 for the XU pipeline
-def XEX5   : FuncUnit; // Execution stage 5 for the XU pipeline
-def XEX6   : FuncUnit; // Execution stage 6 for the XU pipeline
-def FRF1   : FuncUnit;
-def FEX1   : FuncUnit; // Execution stage 1 for the FU pipeline
-def FEX2   : FuncUnit; // Execution stage 2 for the FU pipeline
-def FEX3   : FuncUnit; // Execution stage 3 for the FU pipeline
-def FEX4   : FuncUnit; // Execution stage 4 for the FU pipeline
-def FEX5   : FuncUnit; // Execution stage 5 for the FU pipeline
-def FEX6   : FuncUnit; // Execution stage 6 for the FU pipeline
-
-def CR_Bypass  : Bypass; // The bypass for condition regs.
-//def GPR_Bypass : Bypass; // The bypass for general-purpose regs.
-//def FPR_Bypass : Bypass; // The bypass for floating-point regs.
+def XU     : FuncUnit; // XU pipeline
+def FU     : FuncUnit; // FI pipeline
 
 //
 // This file defines the itinerary class data for the PPC A2 processor.
@@ -55,699 +24,119 @@ def CR_Bypass  : Bypass; // The bypass for condition regs.
 
 
 def PPCA2Itineraries : ProcessorItineraries<
-  [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3,
-   IU4_0, IU4_1, IU4_2, IU4_3, IU4_4, IU4_5, IU4_6, IU4_7,
-   IU5, IU6, RF0, XRF1, XEX1, XEX2, XEX3, XEX4, XEX5, XEX6,
-   FRF1, FEX1, FEX2, FEX3, FEX4, FEX5, FEX6],
-  [CR_Bypass, GPR_Bypass, FPR_Bypass], [
-  InstrItinData<IntSimple   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntGeneral  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntCompare  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntDivW     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<38, [XEX6]>],
-                              [53, 7, 7],
-                              [NoBypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMFFS     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMTFSB0   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7], 
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMulHW    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMulHWU   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntMulLI    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntRotate   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntRotateD  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntRotateDI , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,                              
-  InstrItinData<IntShift    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntTrapW    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7], 
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<IntTrapD    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7], 
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<BrB         , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<BrCR        , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
-  InstrItinData<BrMCR       , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, CR_Bypass, CR_Bypass]>,
-  InstrItinData<BrMCRX      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7, 7],
-                              [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStDCBA    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 11],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStDCBF    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 11],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStDCBI    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 11],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLoad    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLoadUpd , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [GPR_Bypass, GPR_Bypass]>,                              
-  InstrItinData<LdStLDU     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStStore   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStStoreUpd, [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStICBI    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSTFD    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<LdStSTFDU   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,                              
-  InstrItinData<LdStLFD     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLFDU    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7, 7],
-                              [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStLHA     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLHAU    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLMW     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [14, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStLWARX   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [26, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSTD     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,
-  InstrItinData<LdStSTDU    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [13, 7],
-                              [GPR_Bypass, GPR_Bypass]>,                              
-  InstrItinData<LdStSTDCX   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [26, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSTWCX   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<13, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [26, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<LdStSync    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<12, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>]>,
-  InstrItinData<SprISYNC    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
-  InstrItinData<SprMFSR     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [GPR_Bypass, NoBypass]>,
-  InstrItinData<SprMTMSR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMTSR     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprTLBSYNC  , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>]>,
-  InstrItinData<SprMFCR     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [10, 7], 
-                              [GPR_Bypass, CR_Bypass]>,
-  InstrItinData<SprMFMSR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [GPR_Bypass, NoBypass]>,
-  InstrItinData<SprMFSPR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMFTB     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMTSPR    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
-                              [15, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprMTSRIN   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprRFI      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<SprSC       , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
-                               InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
-                               InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
-                               InstrStage<1, [XEX5]>, InstrStage<14, [XEX6]>],
-                              [29, 7],
-                              [NoBypass, GPR_Bypass]>,
-  InstrItinData<FPGeneral   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7, 7],
-                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPAddSub    , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7, 7],
-                              [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPCompare   , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [13, 7, 7],
-                              [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPDivD      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<71, [FRF1], 0>,
-                               InstrStage<71, [FEX1], 0>,
-                                  InstrStage<71, [FEX2], 0>,
-                               InstrStage<71, [FEX3], 0>,
-                                  InstrStage<71, [FEX4], 0>,
-                               InstrStage<71, [FEX5], 0>,
-                                  InstrStage<71, [FEX6]>],
-                              [86, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPDivS      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<58, [FRF1], 0>,
-                               InstrStage<58, [FEX1], 0>,
-                                  InstrStage<58, [FEX2], 0>,
-                               InstrStage<58, [FEX3], 0>,
-                                  InstrStage<58, [FEX4], 0>,
-                               InstrStage<58, [FEX5], 0>,
-                                  InstrStage<58, [FEX6]>],
-                              [73, 7, 7],
-                              [NoBypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPSqrt      , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<68, [FRF1], 0>,
-                               InstrStage<68, [FEX1], 0>,
-                                  InstrStage<68, [FEX2], 0>,
-                               InstrStage<68, [FEX3], 0>,
-                                  InstrStage<68, [FEX4], 0>,
-                               InstrStage<68, [FEX5], 0>,
-                                  InstrStage<68, [FEX6]>],
-                              [86, 7], // FIXME: should be [86, 7] for double
-                                       // and [82, 7] for single. Likewise,
-                                       // the FEX? cycle count should be 68
-                                       // for double and 64 for single.
-                              [NoBypass, FPR_Bypass]>,
-  InstrItinData<FPFused     , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7, 7, 7],
-                              [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
-  InstrItinData<FPRes       , [InstrStage<4,
-                                 [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
-                               InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
-                                              IU4_4, IU4_5, IU4_6, IU4_7]>,
-                               InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
-                               InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
-                               InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
-                               InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
-                               InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
-                              [15, 7],
-                              [FPR_Bypass, FPR_Bypass]>
+  [XU, FU], [], [
+  InstrItinData<IntSimple   , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<IntGeneral  , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntDivW     , [InstrStage<1, [XU]>],
+                              [39, 1, 1]>,
+  InstrItinData<IntDivD     , [InstrStage<1, [XU]>],
+                              [71, 1, 1]>,
+  InstrItinData<IntMulHW    , [InstrStage<1, [XU]>],
+                              [5, 1, 1]>,
+  InstrItinData<IntMulHWU   , [InstrStage<1, [XU]>],
+                              [5, 1, 1]>,
+  InstrItinData<IntMulLI    , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntRotateD  , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntRotateDI , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntShift    , [InstrStage<1, [XU]>],
+                              [2, 1, 1]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [XU]>],
+                              [2, 1]>,
+  InstrItinData<IntTrapD    , [InstrStage<1, [XU]>],
+                              [2, 1]>,
+  InstrItinData<BrB         , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<BrCR        , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [XU]>],
+                              [5, 1, 1]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStDCBA    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStDCBF    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStDCBI    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStLoad    , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<LdStLoadUpd , [InstrStage<1, [XU]>],
+                              [6, 8, 1, 1]>,
+  InstrItinData<LdStLDU     , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<LdStStore   , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStStoreUpd, [InstrStage<1, [XU]>],
+                              [2, 1, 1, 1]>,
+  InstrItinData<LdStICBI,     [InstrStage<1, [XU]>],
+                              [16, 1, 1]>,
+  InstrItinData<LdStSTFD    , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStSTFDU   , [InstrStage<1, [XU]>],
+                              [2, 1, 1, 1]>,
+  InstrItinData<LdStLFD     , [InstrStage<1, [XU]>],
+                              [7, 1, 1]>,
+  InstrItinData<LdStLFDU    , [InstrStage<1, [XU]>],
+                              [7, 9, 1, 1]>,
+  InstrItinData<LdStLHA     , [InstrStage<1, [XU]>],
+                              [6, 1, 1]>,
+  InstrItinData<LdStLHAU    , [InstrStage<1, [XU]>],
+                              [6, 8, 1, 1]>,
+  InstrItinData<LdStLWARX   , [InstrStage<1, [XU]>],
+                              [82, 1, 1]>, // L2 latency
+  InstrItinData<LdStSTD     , [InstrStage<1, [XU]>],
+                              [1, 1, 1]>,
+  InstrItinData<LdStSTDU    , [InstrStage<1, [XU]>],
+                              [2, 1, 1, 1]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<1, [XU]>],
+                              [82, 1, 1]>, // L2 latency
+  InstrItinData<LdStSTWCX   , [InstrStage<1, [XU]>],
+                              [82, 1, 1]>, // L2 latency
+  InstrItinData<LdStSync    , [InstrStage<1, [XU]>],
+                              [6]>,
+  InstrItinData<SprISYNC    , [InstrStage<1, [XU]>],
+                              [16]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [XU]>],
+                              [16, 1]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [XU]>],
+                              [6, 1]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [XU]>],
+                              [4, 1]>,
+  InstrItinData<SprMFSPR    , [InstrStage<1, [XU]>],
+                              [6, 1]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [XU]>],
+                              [4, 1]>,
+  InstrItinData<SprMTSPR    , [InstrStage<1, [XU]>],
+                              [6, 1]>,
+  InstrItinData<SprRFI      , [InstrStage<1, [XU]>],
+                              [16]>,
+  InstrItinData<SprSC       , [InstrStage<1, [XU]>],
+                              [16]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FU]>],
+                              [6, 1, 1]>,
+  InstrItinData<FPAddSub    , [InstrStage<1, [FU]>],
+                              [6, 1, 1]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FU]>],
+                              [5, 1, 1]>,
+  InstrItinData<FPDivD      , [InstrStage<1, [FU]>],
+                              [72, 1, 1]>,
+  InstrItinData<FPDivS      , [InstrStage<1, [FU]>],
+                              [59, 1, 1]>,
+  InstrItinData<FPSqrt      , [InstrStage<1, [FU]>],
+                              [69, 1, 1]>,
+  InstrItinData<FPFused     , [InstrStage<1, [FU]>],
+                              [6, 1, 1, 1]>,
+  InstrItinData<FPRes       , [InstrStage<1, [FU]>],
+                              [6, 1]>
 ]>;
 
 // ===---------------------------------------------------------------------===//
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
index 9bb779a..c189b9e 100644
--- a/lib/Target/PowerPC/PPCScheduleE500mc.td
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -36,6 +36,8 @@ def CFX_0 : FuncUnit; // CFX pipeline
 def LSU_0 : FuncUnit; // LSU pipeline
 def FPU_0 : FuncUnit; // FPU pipeline
 
+def CR_Bypass : Bypass;
+
 def PPCE500mcItineraries : ProcessorItineraries<
   [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
   [CR_Bypass, GPR_Bypass, FPR_Bypass], [
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
index d7e11ac..7a24d20 100644
--- a/lib/Target/PowerPC/PPCScheduleE5500.td
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -39,6 +39,7 @@ def CFX_1 : FuncUnit; // CFX pipeline stage 1
 // def LSU_0 : FuncUnit; // LSU pipeline
 // def FPU_0 : FuncUnit; // FPU pipeline
 
+// def CR_Bypass : Bypass;
 
 def PPCE5500Itineraries : ProcessorItineraries<
   [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index a8f2b3f..7231ab1 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -14,7 +14,11 @@
 #include "PPCSubtarget.h"
 #include "PPC.h"
 #include "PPCRegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Function.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetMachine.h"
@@ -29,32 +33,70 @@ using namespace llvm;
 PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
                            const std::string &FS, bool is64Bit)
   : PPCGenSubtargetInfo(TT, CPU, FS)
-  , StackAlignment(16)
-  , DarwinDirective(PPC::DIR_NONE)
-  , HasMFOCRF(false)
-  , Has64BitSupport(false)
-  , Use64BitRegs(false)
   , IsPPC64(is64Bit)
-  , HasAltivec(false)
-  , HasQPX(false)
-  , HasFSQRT(false)
-  , HasFRE(false)
-  , HasFRES(false)
-  , HasFRSQRTE(false)
-  , HasFRSQRTES(false)
-  , HasRecipPrec(false)
-  , HasSTFIWX(false)
-  , HasLFIWAX(false)
-  , HasFPRND(false)
-  , HasFPCVT(false)
-  , HasISEL(false)
-  , HasPOPCNTD(false)
-  , HasLDBRX(false)
-  , IsBookE(false)
-  , HasLazyResolverStubs(false)
-  , IsJITCodeModel(false)
   , TargetTriple(TT) {
+  initializeEnvironment();
+  resetSubtargetFeatures(CPU, FS);
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void PPCSubtarget::SetJITMode() {
+  // JIT mode doesn't want lazy resolver stubs, it knows exactly where
+  // everything is.  This matters for PPC64, which codegens in PIC mode without
+  // stubs.
+  HasLazyResolverStubs = false;
+
+  // Calls to external functions need to use indirect calls
+  IsJITCodeModel = true;
+}
+
+void PPCSubtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+  AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                           "target-cpu");
+  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+                                          "target-features");
+  std::string CPU =
+    !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
+  std::string FS =
+    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+  if (!FS.empty()) {
+    initializeEnvironment();
+    resetSubtargetFeatures(CPU, FS);
+  }
+}
 
+void PPCSubtarget::initializeEnvironment() {
+  StackAlignment = 16;
+  DarwinDirective = PPC::DIR_NONE;
+  HasMFOCRF = false;
+  Has64BitSupport = false;
+  Use64BitRegs = false;
+  HasAltivec = false;
+  HasQPX = false;
+  HasFCPSGN = false;
+  HasFSQRT = false;
+  HasFRE = false;
+  HasFRES = false;
+  HasFRSQRTE = false;
+  HasFRSQRTES = false;
+  HasRecipPrec = false;
+  HasSTFIWX = false;
+  HasLFIWAX = false;
+  HasFPRND = false;
+  HasFPCVT = false;
+  HasISEL = false;
+  HasPOPCNTD = false;
+  HasLDBRX = false;
+  IsBookE = false;
+  DeprecatedMFTB = false;
+  DeprecatedDST = false;
+  HasLazyResolverStubs = false;
+  IsJITCodeModel = false;
+}
+
+void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   // Determine default and user specified characteristics
   std::string CPUName = CPU;
   if (CPUName.empty())
@@ -72,7 +114,7 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
   std::string FullFS = FS;
 
   // If we are generating code for ppc64, verify that options make sense.
-  if (is64Bit) {
+  if (IsPPC64) {
     Has64BitSupport = true;
     // Silently force 64-bit register use on ppc64.
     Use64BitRegs = true;
@@ -99,21 +141,11 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
   // is enabled because external functions will assume this alignment.
   if (hasQPX() || isBGQ())
     StackAlignment = 32;
-}
-
-/// SetJITMode - This is called to inform the subtarget info that we are
-/// producing code for the JIT.
-void PPCSubtarget::SetJITMode() {
-  // JIT mode doesn't want lazy resolver stubs, it knows exactly where
-  // everything is.  This matters for PPC64, which codegens in PIC mode without
-  // stubs.
-  HasLazyResolverStubs = false;
 
-  // Calls to external functions need to use indirect calls
-  IsJITCodeModel = true;
+  // Determine endianness.
+  IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
 }
 
-
 /// hasLazyResolverStub - Return true if accesses to the specified global have
 /// to go through a dyld lazy resolution stub.  This means that an extra load
 /// is required to get the address of the global.
@@ -135,14 +167,7 @@ bool PPCSubtarget::enablePostRAScheduler(
            CodeGenOpt::Level OptLevel,
            TargetSubtargetInfo::AntiDepBreakMode& Mode,
            RegClassVector& CriticalPathRCs) const {
-  // FIXME: It would be best to use TargetSubtargetInfo::ANTIDEP_ALL here,
-  // but we can't because we can't reassign the cr registers. There is a
-  // dependence between the cr register and the RLWINM instruction used
-  // to extract its value which the anti-dependency breaker can't currently
-  // see. Maybe we should make a late-expanded pseudo to encode this dependency.
-  // (the relevant code is in PPCDAGToDAGISel::SelectSETCC)
-
-  Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
+  Mode = TargetSubtargetInfo::ANTIDEP_ALL;
 
   CriticalPathRCs.clear();
 
@@ -151,9 +176,44 @@ bool PPCSubtarget::enablePostRAScheduler(
   else
     CriticalPathRCs.push_back(&PPC::GPRCRegClass);
     
-  CriticalPathRCs.push_back(&PPC::F8RCRegClass);
-  CriticalPathRCs.push_back(&PPC::VRRCRegClass);
-
   return OptLevel >= CodeGenOpt::Default;
 }
 
+// Embedded cores need aggressive scheduling.
+static bool needsAggressiveScheduling(unsigned Directive) {
+  switch (Directive) {
+  default: return false;
+  case PPC::DIR_440:
+  case PPC::DIR_A2:
+  case PPC::DIR_E500mc:
+  case PPC::DIR_E5500:
+    return true;
+  }
+}
+
+bool PPCSubtarget::enableMachineScheduler() const {
+  // Enable MI scheduling for the embedded cores.
+  // FIXME: Enable this for all cores (some additional modeling
+  // may be necessary).
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
+void PPCSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                       MachineInstr *begin,
+                                       MachineInstr *end,
+                                       unsigned NumRegionInstrs) const {
+  if (needsAggressiveScheduling(DarwinDirective)) {
+    Policy.OnlyTopDown = false;
+    Policy.OnlyBottomUp = false;
+  }
+
+  // Spilling is generally expensive on all PPC cores, so always enable
+  // register-pressure tracking.
+  Policy.ShouldTrackPressure = true;
+}
+
+bool PPCSubtarget::useAA() const {
+  // Use AA during code generation for the embedded cores.
+  return needsAggressiveScheduling(DarwinDirective);
+}
+
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 65b4d21..c863a6e 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -76,6 +76,8 @@ protected:
   bool IsPPC64;
   bool HasAltivec;
   bool HasQPX;
+  bool HasVSX;
+  bool HasFCPSGN;
   bool HasFSQRT;
   bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
   bool HasRecipPrec;
@@ -87,8 +89,11 @@ protected:
   bool HasPOPCNTD;
   bool HasLDBRX;
   bool IsBookE;
+  bool DeprecatedMFTB;
+  bool DeprecatedDST;
   bool HasLazyResolverStubs;
   bool IsJITCodeModel;
+  bool IsLittleEndian;
 
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
@@ -128,7 +133,7 @@ public:
     // documentation are wrong; these are correct (i.e. "what gcc does").
     if (isPPC64() && isSVR4ABI()) {
       if (TargetTriple.getOS() == llvm::Triple::FreeBSD)
-        return "E-p:64:64-f64:64:64-i64:64:64-f128:64:64-v128:128:128-n32:64";
+        return "E-p:64:64-f64:64:64-i64:64:64-v128:128:128-n32:64";
       else
         return "E-p:64:64-f64:64:64-i64:64:64-f128:128:128-v128:128:128-n32:64";
     }
@@ -137,6 +142,13 @@ public:
                      : "E-p:32:32-f64:64:64-i64:64:64-f128:64:128-n32";
   }
 
+  /// \brief Reset the features for the PowerPC target.
+  virtual void resetSubtargetFeatures(const MachineFunction *MF);
+private:
+  void initializeEnvironment();
+  void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+
+public:
   /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
   ///
   bool isPPC64() const { return IsPPC64; }
@@ -159,7 +171,11 @@ public:
   // isJITCodeModel - True if we're generating code for the JIT
   bool isJITCodeModel() const { return IsJITCodeModel; }
 
+  // isLittleEndian - True if generating little-endian code
+  bool isLittleEndian() const { return IsLittleEndian; }
+
   // Specific obvious features.
+  bool hasFCPSGN() const { return HasFCPSGN; }
   bool hasFSQRT() const { return HasFSQRT; }
   bool hasFRE() const { return HasFRE; }
   bool hasFRES() const { return HasFRES; }
@@ -177,6 +193,8 @@ public:
   bool hasPOPCNTD() const { return HasPOPCNTD; }
   bool hasLDBRX() const { return HasLDBRX; }
   bool isBookE() const { return IsBookE; }
+  bool isDeprecatedMFTB() const { return DeprecatedMFTB; }
+  bool isDeprecatedDST() const { return DeprecatedDST; }
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
@@ -194,6 +212,14 @@ public:
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                              TargetSubtargetInfo::AntiDepBreakMode& Mode,
                              RegClassVector& CriticalPathRCs) const;
+
+  // Scheduling customization.
+  bool enableMachineScheduler() const;
+  void overrideSchedPolicy(MachineSchedPolicy &Policy,
+                           MachineInstr *begin,
+                           MachineInstr *end,
+                           unsigned NumRegionInstrs) const;
+  bool useAA() const;
 };
 } // End llvm namespace
 
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 14dc794..9acefe5 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -30,6 +30,7 @@ extern "C" void LLVMInitializePowerPCTarget() {
   // Register the targets
   RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);
   RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
+  RegisterTargetMachine<PPC64TargetMachine> C(ThePPC64LETarget);
 }
 
 PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
@@ -48,6 +49,7 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
   // The binutils for the BG/P are too old for CFI.
   if (Subtarget.isBGP())
     setMCUseCFI(false);
+  initAsmInfo();
 }
 
 void PPC32TargetMachine::anchor() { }
@@ -90,7 +92,7 @@ public:
     return *getPPCTargetMachine().getSubtargetImpl();
   }
 
-  virtual bool addPreRegAlloc();
+  virtual bool addPreISel();
   virtual bool addILPOpts();
   virtual bool addInstSelector();
   virtual bool addPreSched2();
@@ -102,9 +104,9 @@ TargetPassConfig *PPCTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new PPCPassConfig(this, PM);
 }
 
-bool PPCPassConfig::addPreRegAlloc() {
+bool PPCPassConfig::addPreISel() {
   if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
-    addPass(createPPCCTRLoops());
+    addPass(createPPCCTRLoops(getPPCTargetMachine()));
 
   return false;
 }
@@ -121,6 +123,12 @@ bool PPCPassConfig::addILPOpts() {
 bool PPCPassConfig::addInstSelector() {
   // Install an instruction selector.
   addPass(createPPCISelDag(getPPCTargetMachine()));
+
+#ifndef NDEBUG
+  if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
+    addPass(createPPCCTRLoopsVerify());
+#endif
+
   return false;
 }
 
@@ -155,7 +163,7 @@ void PPCTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
   // Add first the target-independent BasicTTI pass, then our PPC pass. This
   // allows the PPC pass to delegate to the target independent layer when
   // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createBasicTargetTransformInfoPass(this));
   PM.add(createPPCTargetTransformInfoPass(this));
 }
 
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
new file mode 100644
index 0000000..ec1e606
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -0,0 +1,67 @@
+//===-- PPCTargetObjectFile.cpp - PPC Object Info -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetObjectFile.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Target/Mangler.h"
+
+using namespace llvm;
+
+void
+PPC64LinuxTargetObjectFile::
+Initialize(MCContext &Ctx, const TargetMachine &TM) {
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+  InitializeELF(TM.Options.UseInitArray);
+}
+
+const MCSection * PPC64LinuxTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+
+  const MCSection *DefaultSection = 
+    TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang, TM);
+
+  if (DefaultSection != ReadOnlySection)
+    return DefaultSection;
+
+  // Here override ReadOnlySection to DataRelROSection for PPC64 SVR4 ABI
+  // when we have a constant that contains global relocations.  This is
+  // necessary because of this ABI's handling of pointers to functions in
+  // a shared library.  The address of a function is actually the address
+  // of a function descriptor, which resides in the .opd section.  Generated
+  // code uses the descriptor directly rather than going via the GOT as some
+  // other ABIs do, which means that initialized function pointers must
+  // reference the descriptor.  The linker must convert copy relocs of
+  // pointers to functions in shared libraries into dynamic relocations,
+  // because of an ordering problem with initialization of copy relocs and
+  // PLT entries.  The dynamic relocation will be initialized by the dynamic
+  // linker, so we must use DataRelROSection instead of ReadOnlySection.
+  // For more information, see the description of ELIMINATE_COPY_RELOCS in
+  // GNU ld.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+
+  if (GVar && GVar->isConstant() &&
+      (GVar->getInitializer()->getRelocationInfo() ==
+       Constant::GlobalRelocations))
+    return DataRelROSection;
+
+  return DefaultSection;
+}
+
+const MCExpr *PPC64LinuxTargetObjectFile::
+getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  const MCExpr *Expr =
+    MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PPC_DTPREL, getContext());
+  return MCBinaryExpr::CreateAdd(Expr,
+                                 MCConstantExpr::Create(0x8000, getContext()),
+                                 getContext());
+}
+
diff --git a/lib/Target/PowerPC/PPCTargetObjectFile.h b/lib/Target/PowerPC/PPCTargetObjectFile.h
new file mode 100644
index 0000000..262c522
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetObjectFile.h
@@ -0,0 +1,35 @@
+//===-- PPCTargetObjectFile.h - PPC Object Info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PPC_TARGETOBJECTFILE_H
+#define LLVM_TARGET_PPC_TARGETOBJECTFILE_H
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  /// PPC64LinuxTargetObjectFile - This implementation is used for
+  /// 64-bit PowerPC Linux.
+  class PPC64LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
+
+    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    virtual const MCSection *
+    SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                           Mangler *Mang, const TargetMachine &TM) const;
+
+    /// \brief Describe a TLS variable address within debug info.
+    virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const;
+  };
+
+}  // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
new file mode 100644
index 0000000..e876be1
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -0,0 +1,23 @@
+//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETSTREAMER_H
+#define PPCTARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+class PPCTargetStreamer : public MCTargetStreamer {
+public:
+  virtual ~PPCTargetStreamer();
+  virtual void emitTCEntry(const MCSymbol &S) = 0;
+};
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2504ba7..8879630 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -77,6 +77,7 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
 
   /// @}
 
@@ -129,6 +130,14 @@ PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
   return PSK_Software;
 }
 
+void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+  if (ST->getDarwinDirective() == PPC::DIR_A2) {
+    // The A2 is in-order with a deep pipeline, and concatenation unrolling
+    // helps expose latency-hiding opportunities to the instruction scheduler.
+    UP.Partial = UP.Runtime = true;
+  }
+}
+
 unsigned PPCTTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasAltivec())
     return 0;
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
index fa44331..5727dbc 100644
--- a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -12,7 +12,7 @@
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
-Target llvm::ThePPC32Target, llvm::ThePPC64Target;
+Target llvm::ThePPC32Target, llvm::ThePPC64Target, llvm::ThePPC64LETarget;
 
 extern "C" void LLVMInitializePowerPCTargetInfo() { 
   RegisterTarget<Triple::ppc, /*HasJIT=*/true>
@@ -20,4 +20,7 @@ extern "C" void LLVMInitializePowerPCTargetInfo() {
 
   RegisterTarget<Triple::ppc64, /*HasJIT=*/true>
     Y(ThePPC64Target, "ppc64", "PowerPC 64");
+
+  RegisterTarget<Triple::ppc64le, /*HasJIT=*/true>
+    Z(ThePPC64LETarget, "ppc64le", "PowerPC 64 LE");
 }
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 9792bd8..025b28e 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -11,32 +11,47 @@
 #ifndef AMDGPU_H
 #define AMDGPU_H
 
-#include "AMDGPUTargetMachine.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
 
-class FunctionPass;
+class AMDGPUInstrPrinter;
 class AMDGPUTargetMachine;
+class FunctionPass;
+class MCAsmInfo;
+class raw_ostream;
+class Target;
+class TargetMachine;
 
 // R600 Passes
-FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
+FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
+FunctionPass *createR600TextureIntrinsicsReplacer();
 FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
 FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
+FunctionPass *createR600ClauseMergePass(TargetMachine &tm);
 FunctionPass *createR600Packetizer(TargetMachine &tm);
 FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
+FunctionPass *createAMDGPUCFGStructurizerPass(TargetMachine &tm);
 
 // SI Passes
+FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
+FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
 
 // Passes common to R600 and SI
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
-FunctionPass* createAMDGPUIndirectAddressingPass(TargetMachine &tm);
+FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
+
+/// \brief Creates an AMDGPU-specific Target Transformation Info pass.
+ImmutablePass *
+createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM);
+
+extern Target TheAMDGPUTarget;
 
 } // End namespace llvm
 
@@ -49,4 +64,47 @@ namespace ShaderType {
   };
 }
 
+/// OpenCL uses address spaces to differentiate between
+/// various memory regions on the hardware. On the CPU
+/// all of the address spaces point to the same memory,
+/// however on the GPU, each address space points to
+/// a seperate piece of memory that is unique from other
+/// memory locations.
+namespace AMDGPUAS {
+enum AddressSpaces {
+  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
+  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
+  CONSTANT_ADDRESS = 2, ///< Address space for constant memory
+  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
+  REGION_ADDRESS   = 4, ///< Address space for region memory.
+  ADDRESS_NONE     = 5, ///< Address space for unknown memory.
+  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
+  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
+
+  // Do not re-order the CONSTANT_BUFFER_* enums.  Several places depend on this
+  // order to be able to dynamically index a constant buffer, for example:
+  //
+  // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+  CONSTANT_BUFFER_0 = 8,
+  CONSTANT_BUFFER_1 = 9,
+  CONSTANT_BUFFER_2 = 10,
+  CONSTANT_BUFFER_3 = 11,
+  CONSTANT_BUFFER_4 = 12,
+  CONSTANT_BUFFER_5 = 13,
+  CONSTANT_BUFFER_6 = 14,
+  CONSTANT_BUFFER_7 = 15,
+  CONSTANT_BUFFER_8 = 16,
+  CONSTANT_BUFFER_9 = 17,
+  CONSTANT_BUFFER_10 = 18,
+  CONSTANT_BUFFER_11 = 19,
+  CONSTANT_BUFFER_12 = 20,
+  CONSTANT_BUFFER_13 = 21,
+  CONSTANT_BUFFER_14 = 22,
+  CONSTANT_BUFFER_15 = 23,
+  LAST_ADDRESS     = 24
+};
+
+} // namespace AMDGPUAS
+
 #endif // AMDGPU_H
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 1a26c77..182235b 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -10,6 +10,91 @@
 // Include AMDIL TD files
 include "AMDILBase.td"
 
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+// Debugging Features
+
+def FeatureDumpCode : SubtargetFeature <"DumpCode",
+        "DumpCode",
+        "true",
+        "Dump MachineInstrs in the CodeEmitter">;
+
+def FeatureIRStructurizer : SubtargetFeature <"disable-irstructurizer",
+        "EnableIRStructurizer",
+        "false",
+        "Disable IR Structurizer">;
+
+// Target features
+
+def FeatureIfCvt : SubtargetFeature <"disable-ifcvt",
+        "EnableIfCvt",
+        "false",
+        "Disable the if conversion pass">;
+
+def FeatureFP64     : SubtargetFeature<"fp64",
+        "FP64",
+        "true",
+        "Enable 64bit double precision operations">;
+
+def Feature64BitPtr : SubtargetFeature<"64BitPtr",
+        "Is64bit",
+        "true",
+        "Specify if 64bit addressing should be used.">;
+
+def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
+        "Is32on64bit",
+        "false",
+        "Specify if 64bit sized pointers with 32bit addressing should be used.">;
+
+def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
+        "R600ALUInst",
+        "false",
+        "Older version of ALU instructions encoding.">;
+
+def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
+        "HasVertexCache",
+        "true",
+        "Specify use of dedicated vertex cache.">;
+
+def FeatureCaymanISA : SubtargetFeature<"caymanISA",
+        "CaymanISA",
+        "true",
+        "Use Cayman ISA">;
+
+class SubtargetFeatureFetchLimit <string Value> :
+                          SubtargetFeature <"fetch"#Value,
+        "TexVTXClauseSize",
+        Value,
+        "Limit the maximum number of fetches in a clause to "#Value>;
+
+def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
+def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+
+class SubtargetFeatureGeneration <string Value,
+                                  list<SubtargetFeature> Implies> :
+        SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
+                          Value#" GPU generation", Implies>;
+
+def FeatureR600 : SubtargetFeatureGeneration<"R600",
+        [FeatureR600ALUInst, FeatureFetchLimit8]>;
+
+def FeatureR700 : SubtargetFeatureGeneration<"R700",
+        [FeatureFetchLimit16]>;
+
+def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
+        [FeatureFetchLimit16]>;
+
+def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
+        [FeatureFetchLimit16]>;
+
+def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
+        [Feature64BitPtr, FeatureFP64]>;
+
+def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
+        [Feature64BitPtr, FeatureFP64]>;
+//===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 4c35ecf..67bdba2 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -19,16 +19,17 @@
 
 #include "AMDGPUAsmPrinter.h"
 #include "AMDGPU.h"
-#include "SIDefines.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
 #include "R600Defines.h"
 #include "R600MachineFunctionInfo.h"
 #include "R600RegisterInfo.h"
+#include "SIDefines.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 
@@ -44,32 +45,60 @@ extern "C" void LLVMInitializeR600AsmPrinter() {
   TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass);
 }
 
+AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+    : AsmPrinter(TM, Streamer)
+{
+  DisasmEnabled = TM.getSubtarget<AMDGPUSubtarget>().dumpCode() &&
+                  ! Streamer.hasRawTextSupport();
+}
+
 /// We need to override this function so we can avoid
 /// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle.
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
-  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
-  if (STM.dumpCode()) {
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-    MF.dump();
-#endif
-  }
   SetupMachineFunction(MF);
   if (OutStreamer.hasRawTextSupport()) {
     OutStreamer.EmitRawText("@" + MF.getName() + ":");
   }
 
-  const MCSectionELF *ConfigSection = getObjFileLowering().getContext()
-                                              .getELFSection(".AMDGPU.config",
+  MCContext &Context = getObjFileLowering().getContext();
+  const MCSectionELF *ConfigSection = Context.getELFSection(".AMDGPU.config",
                                               ELF::SHT_PROGBITS, 0,
                                               SectionKind::getReadOnly());
   OutStreamer.SwitchSection(ConfigSection);
-  if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
+  if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
     EmitProgramInfoSI(MF);
   } else {
     EmitProgramInfoR600(MF);
   }
+
+  DisasmLines.clear();
+  HexLines.clear();
+  DisasmLineMaxLen = 0;
+
   OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
   EmitFunctionBody();
+
+  if (STM.dumpCode()) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    MF.dump();
+#endif
+
+    if (DisasmEnabled) {
+      OutStreamer.SwitchSection(Context.getELFSection(".AMDGPU.disasm",
+                                                  ELF::SHT_NOTE, 0,
+                                                  SectionKind::getReadOnly()));
+
+      for (size_t i = 0; i < DisasmLines.size(); ++i) {
+        std::string Comment(DisasmLineMaxLen - DisasmLines[i].size(), ' ');
+        Comment += " ; " + HexLines[i] + "\n";
+
+        OutStreamer.EmitBytes(StringRef(DisasmLines[i]));
+        OutStreamer.EmitBytes(StringRef(Comment));
+      }
+    }
+  }
+
   return false;
 }
 
@@ -105,7 +134,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
   }
 
   unsigned RsrcReg;
-  if (STM.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX) {
+  if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) {
     // Evergreen / Northern Islands
     switch (MFI->ShaderType) {
     default: // Fall through
@@ -130,9 +159,15 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
                            S_STACK_SIZE(MFI->StackSize), 4);
   OutStreamer.EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
   OutStreamer.EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+
+  if (MFI->ShaderType == ShaderType::COMPUTE) {
+    OutStreamer.EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
+    OutStreamer.EmitIntValue(RoundUpToAlignment(MFI->LDSSize, 4) >> 2, 4);
+  }
 }
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
+  const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
   unsigned MaxSGPR = 0;
   unsigned MaxVGPR = 0;
   bool VCCUsed = false;
@@ -148,7 +183,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
 
       unsigned numOperands = MI.getNumOperands();
       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
-        MachineOperand & MO = MI.getOperand(op_idx);
+        MachineOperand &MO = MI.getOperand(op_idx);
         unsigned maxUsed;
         unsigned width = 0;
         bool isSGPR = false;
@@ -162,8 +197,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
           VCCUsed = true;
           continue;
         }
+
         switch (reg) {
         default: break;
+        case AMDGPU::SCC:
         case AMDGPU::EXEC:
         case AMDGPU::M0:
           continue;
@@ -196,6 +233,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
         } else if (AMDGPU::VReg_256RegClass.contains(reg)) {
           isSGPR = false;
           width = 8;
+        } else if (AMDGPU::SReg_512RegClass.contains(reg)) {
+          isSGPR = true;
+          width = 16;
         } else if (AMDGPU::VReg_512RegClass.contains(reg)) {
           isSGPR = false;
           width = 16;
@@ -227,7 +267,25 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF) {
 
   OutStreamer.EmitIntValue(RsrcReg, 4);
   OutStreamer.EmitIntValue(S_00B028_VGPRS(MaxVGPR / 4) | S_00B028_SGPRS(MaxSGPR / 8), 4);
+
+  unsigned LDSAlignShift;
+  if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) {
+    // LDS is allocated in 64 dword blocks
+    LDSAlignShift = 8;
+  } else {
+    // LDS is allocated in 128 dword blocks
+    LDSAlignShift = 9;
+  }
+  unsigned LDSBlocks =
+          RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift;
+
+  if (MFI->ShaderType == ShaderType::COMPUTE) {
+    OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
+    OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4);
+  }
   if (MFI->ShaderType == ShaderType::PIXEL) {
+    OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
+    OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4);
     OutStreamer.EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
     OutStreamer.EmitIntValue(MFI->PSInputAddr, 4);
   }
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index f425ef4..05dc9bb 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===//
+//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,14 +16,15 @@
 #define AMDGPU_ASMPRINTER_H
 
 #include "llvm/CodeGen/AsmPrinter.h"
+#include <string>
+#include <vector>
 
 namespace llvm {
 
 class AMDGPUAsmPrinter : public AsmPrinter {
 
 public:
-  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) { }
+  explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -38,6 +39,11 @@ public:
 
   /// Implemented in AMDGPUMCInstLower.cpp
   virtual void EmitInstruction(const MachineInstr *MI);
+
+protected:
+  bool DisasmEnabled;
+  std::vector<std::string> DisasmLines, HexLines;
+  size_t DisasmLineMaxLen;
 };
 
 } // End anonymous llvm
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 9c30515..65cdb24 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -19,12 +19,13 @@ def CC_SI : CallingConv<[
 
   CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
-    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16
   ]>>>,
 
   CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
     [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
-    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR12, SGPR15 ]
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
   >>>,
 
   CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
@@ -34,15 +35,40 @@ def CC_SI : CallingConv<[
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
   ]>>>,
 
-  // This is the default for i64 values.
-  // XXX: We should change this once clang understands the CC_AMDGPU.
-  CCIfType<[i64], CCAssignToRegWithShadow<
-   [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
-   [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
-  >>
+  CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
+    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
+    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
+  >>>
+
+]>;
+
+// Calling convention for R600
+def CC_R600 : CallingConv<[
+  CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
+    T0_XYZW, T1_XYZW, T2_XYZW, T3_XYZW, T4_XYZW, T5_XYZW, T6_XYZW, T7_XYZW,
+    T8_XYZW, T9_XYZW, T10_XYZW, T11_XYZW, T12_XYZW, T13_XYZW, T14_XYZW, T15_XYZW,
+    T16_XYZW, T17_XYZW, T18_XYZW, T19_XYZW, T20_XYZW, T21_XYZW, T22_XYZW,
+    T23_XYZW, T24_XYZW, T25_XYZW, T26_XYZW, T27_XYZW, T28_XYZW, T29_XYZW,
+    T30_XYZW, T31_XYZW, T32_XYZW
+  ]>>>
+]>;
+
+// Calling convention for compute kernels
+def CC_AMDGPU_Kernel : CallingConv<[
+  CCCustom<"allocateStack">
 ]>;
 
 def CC_AMDGPU : CallingConv<[
-  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().device()"#
-       "->getGeneration() == AMDGPUDeviceInfo::HD7XXX", CCDelegateTo<CC_SI>>
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() >= "
+       "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+       "State.getMachineFunction().getInfo<SIMachineFunctionInfo>()->"#
+       "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>().getGeneration() < "
+       "AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+       "State.getMachineFunction().getInfo<R600MachineFunctionInfo>()->"
+       "ShaderType == ShaderType::COMPUTE", CCDelegateTo<CC_AMDGPU_Kernel>>,
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
+       ".getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_SI>>,
+  CCIf<"State.getTarget().getSubtarget<AMDGPUSubtarget>()"#
+       ".getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS", CCDelegateTo<CC_R600>>
 ]>;
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 815d6f7..40f14d2 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -78,27 +78,8 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
   int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
 
   for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
-    const AllocaInst *Alloca = MFI->getObjectAllocation(i);
-    unsigned ArrayElements;
-    const Type *AllocaType = Alloca->getAllocatedType();
-    const Type *ElementType;
-
-    if (AllocaType->isArrayTy()) {
-      ArrayElements = AllocaType->getArrayNumElements();
-      ElementType = AllocaType->getArrayElementType();
-    } else {
-      ArrayElements = 1;
-      ElementType = AllocaType;
-    }
-
-    unsigned VectorElements;
-    if (ElementType->isVectorTy()) {
-      VectorElements = ElementType->getVectorNumElements();
-    } else {
-      VectorElements = 1;
-    }
-
-    Offset += (VectorElements / getStackWidth(MF)) * ArrayElements;
+    unsigned Size = MFI->getObjectSize(i);
+    Offset += (Size / (getStackWidth(MF) * 4));
   }
   return Offset;
 }
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
new file mode 100644
index 0000000..a989135
--- /dev/null
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -0,0 +1,585 @@
+//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Defines an instruction selector for the AMDGPU target.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUISelLowering.h" // For AMDGPUISD
+#include "AMDGPURegisterInfo.h"
+#include "R600InstrInfo.h"
+#include "SIISelLowering.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include <list>
+#include <queue>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// AMDGPU specific code to select AMDGPU machine instructions for
+/// SelectionDAG operations.
+class AMDGPUDAGToDAGISel : public SelectionDAGISel {
+  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
+  // make the right decision when generating code for different targets.
+  const AMDGPUSubtarget &Subtarget;
+public:
+  AMDGPUDAGToDAGISel(TargetMachine &TM);
+  virtual ~AMDGPUDAGToDAGISel();
+
+  SDNode *Select(SDNode *N);
+  virtual const char *getPassName() const;
+  virtual void PostprocessISelDAG();
+
+private:
+  inline SDValue getSmallIPtrImm(unsigned Imm);
+  bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
+                   const R600InstrInfo *TII);
+  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+  bool FoldDotOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
+
+  // Complex pattern selectors
+  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
+  bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
+  SDValue SimplifyI24(SDValue &Op);
+  bool SelectI24(SDValue Addr, SDValue &Op);
+  bool SelectU24(SDValue Addr, SDValue &Op);
+
+  static bool checkType(const Value *ptr, unsigned int addrspace);
+
+  static bool isGlobalStore(const StoreSDNode *N);
+  static bool isPrivateStore(const StoreSDNode *N);
+  static bool isLocalStore(const StoreSDNode *N);
+  static bool isRegionStore(const StoreSDNode *N);
+
+  bool isCPLoad(const LoadSDNode *N) const;
+  bool isConstantLoad(const LoadSDNode *N, int cbID) const;
+  bool isGlobalLoad(const LoadSDNode *N) const;
+  bool isParamLoad(const LoadSDNode *N) const;
+  bool isPrivateLoad(const LoadSDNode *N) const;
+  bool isLocalLoad(const LoadSDNode *N) const;
+  bool isRegionLoad(const LoadSDNode *N) const;
+
+  const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
+  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
+  bool SelectGlobalValueVariableOffset(SDValue Addr,
+      SDValue &BaseReg, SDValue& Offset);
+  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
+  bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
+
+  // Include the pieces autogenerated from the target description.
+#include "AMDGPUGenDAGISel.inc"
+};
+}  // end anonymous namespace
+
+/// \brief This pass converts a legalized DAG into a AMDGPU-specific
+// DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
+                                       ) {
+  return new AMDGPUDAGToDAGISel(TM);
+}
+
+AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
+  : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
+}
+
+AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
+}
+
+/// \brief Determine the register class for \p OpNo
+/// \returns The register class of the virtual register that will be used for
+/// the given operand number \OpNo or NULL if the register class cannot be
+/// determined.
+const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
+                                                          unsigned OpNo) const {
+  if (!N->isMachineOpcode()) {
+    return NULL;
+  }
+  switch (N->getMachineOpcode()) {
+  default: {
+    const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
+    unsigned OpIdx = Desc.getNumDefs() + OpNo;
+    if (OpIdx >= Desc.getNumOperands())
+      return NULL;
+    int RegClass = Desc.OpInfo[OpIdx].RegClass;
+    if (RegClass == -1) {
+      return NULL;
+    }
+    return TM.getRegisterInfo()->getRegClass(RegClass);
+  }
+  case AMDGPU::REG_SEQUENCE: {
+    const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(
+                      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
+    unsigned SubRegIdx =
+            dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue();
+    return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
+  }
+  }
+}
+
+SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
+  return CurDAG->getTargetConstant(Imm, MVT::i32);
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRParam(
+    SDValue Addr, SDValue& R1, SDValue& R2) {
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i32);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+  return SelectADDRParam(Addr, R1, R2);
+}
+
+
+bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+
+  if (Addr.getOpcode() == ISD::FrameIndex) {
+    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    } else {
+      R1 = Addr;
+      R2 = CurDAG->getTargetConstant(0, MVT::i64);
+    }
+  } else if (Addr.getOpcode() == ISD::ADD) {
+    R1 = Addr.getOperand(0);
+    R2 = Addr.getOperand(1);
+  } else {
+    R1 = Addr;
+    R2 = CurDAG->getTargetConstant(0, MVT::i64);
+  }
+  return true;
+}
+
+SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
+  unsigned int Opc = N->getOpcode();
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
+    return NULL;   // Already selected.
+  }
+  switch (Opc) {
+  default: break;
+  case ISD::BUILD_VECTOR: {
+    unsigned RegClassID;
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    const AMDGPURegisterInfo *TRI =
+                   static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
+    const SIRegisterInfo *SIRI =
+                   static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+    EVT VT = N->getValueType(0);
+    unsigned NumVectorElts = VT.getVectorNumElements();
+    assert(VT.getVectorElementType().bitsEq(MVT::i32));
+    if (ST.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      bool UseVReg = true;
+      for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+                                                    U != E; ++U) {
+        if (!U->isMachineOpcode()) {
+          continue;
+        }
+        const TargetRegisterClass *RC = getOperandRegClass(*U, U.getOperandNo());
+        if (!RC) {
+          continue;
+        }
+        if (SIRI->isSGPRClass(RC)) {
+          UseVReg = false;
+        }
+      }
+      switch(NumVectorElts) {
+      case 1: RegClassID = UseVReg ? AMDGPU::VReg_32RegClassID :
+                                     AMDGPU::SReg_32RegClassID;
+        break;
+      case 2: RegClassID = UseVReg ? AMDGPU::VReg_64RegClassID :
+                                     AMDGPU::SReg_64RegClassID;
+        break;
+      case 4: RegClassID = UseVReg ? AMDGPU::VReg_128RegClassID :
+                                     AMDGPU::SReg_128RegClassID;
+        break;
+      case 8: RegClassID = UseVReg ? AMDGPU::VReg_256RegClassID :
+                                     AMDGPU::SReg_256RegClassID;
+        break;
+      case 16: RegClassID = UseVReg ? AMDGPU::VReg_512RegClassID :
+                                      AMDGPU::SReg_512RegClassID;
+        break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+      }
+    } else {
+      // BUILD_VECTOR was lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
+      // that adds a 128 bits reg copy when going through TwoAddressInstructions
+      // pass. We want to avoid 128 bits copies as much as possible because they
+      // can't be bundled by our scheduler.
+      switch(NumVectorElts) {
+      case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
+      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
+      default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
+      }
+    }
+
+    SDValue RegClass = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+
+    if (NumVectorElts == 1) {
+      return CurDAG->SelectNodeTo(N, AMDGPU::COPY_TO_REGCLASS,
+                                  VT.getVectorElementType(),
+                                  N->getOperand(0), RegClass);
+    }
+
+    assert(NumVectorElts <= 16 && "Vectors with more than 16 elements not "
+                                  "supported yet");
+    // 16 = Max Num Vector Elements
+    // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
+    // 1 = Vector Register Class
+    SDValue RegSeqArgs[16 * 2 + 1];
+
+    RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
+    bool IsRegSeq = true;
+    for (unsigned i = 0; i < N->getNumOperands(); i++) {
+      // XXX: Why is this here?
+      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
+        IsRegSeq = false;
+        break;
+      }
+      RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
+      RegSeqArgs[1 + (2 * i) + 1] =
+              CurDAG->getTargetConstant(TRI->getSubRegFromChannel(i), MVT::i32);
+    }
+    if (!IsRegSeq)
+      break;
+    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
+        RegSeqArgs, 2 * N->getNumOperands() + 1);
+  }
+  case ISD::BUILD_PAIR: {
+    SDValue RC, SubReg0, SubReg1;
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+      break;
+    }
+    if (N->getValueType(0) == MVT::i128) {
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
+    } else if (N->getValueType(0) == MVT::i64) {
+      RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
+      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
+    } else {
+      llvm_unreachable("Unhandled value type for BUILD_PAIR");
+    }
+    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
+                            N->getOperand(1), SubReg1 };
+    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
+                                  SDLoc(N), N->getValueType(0), Ops);
+  }
+  case AMDGPUISD::REGISTER_LOAD: {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      break;
+    SDValue Addr, Offset;
+
+    SelectADDRIndirect(N->getOperand(1), Addr, Offset);
+    const SDValue Ops[] = {
+      Addr,
+      Offset,
+      CurDAG->getTargetConstant(0, MVT::i32),
+      N->getOperand(0),
+    };
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterLoad, SDLoc(N),
+                                  CurDAG->getVTList(MVT::i32, MVT::i64, MVT::Other),
+                                  Ops);
+  }
+  case AMDGPUISD::REGISTER_STORE: {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      break;
+    SDValue Addr, Offset;
+    SelectADDRIndirect(N->getOperand(2), Addr, Offset);
+    const SDValue Ops[] = {
+      N->getOperand(1),
+      Addr,
+      Offset,
+      CurDAG->getTargetConstant(0, MVT::i32),
+      N->getOperand(0),
+    };
+    return CurDAG->getMachineNode(AMDGPU::SI_RegisterStorePseudo, SDLoc(N),
+                                        CurDAG->getVTList(MVT::Other),
+                                        Ops);
+  }
+  }
+  return SelectCode(N);
+}
+
+
+bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
+  if (!ptr) {
+    return false;
+  }
+  Type *ptrType = ptr->getType();
+  return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
+  return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
+}
+
+bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
+  if (CbId == -1) {
+    return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS);
+  }
+  return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
+  if (N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) {
+    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+        N->getMemoryVT().bitsLT(MVT::i32)) {
+      return true;
+    }
+  }
+  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
+  return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) const {
+  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) const {
+  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+    if (MMO) {
+      const Value *V = MMO->getValue();
+      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
+      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
+  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+    // Check to make sure we are not a constant pool load or a constant load
+    // that is marked as a private load
+    if (isCPLoad(N) || isConstantLoad(N, -1)) {
+      return false;
+    }
+  }
+  if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
+    return true;
+  }
+  return false;
+}
+
+const char *AMDGPUDAGToDAGISel::getPassName() const {
+  return "AMDGPU DAG->DAG Pattern Instruction Selection";
+}
+
+#ifdef DEBUGTMP
+#undef INT64_C
+#endif
+#undef DEBUGTMP
+
+//===----------------------------------------------------------------------===//
+// Complex Patterns
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
+    SDValue& IntPtr) {
+  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
+    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
+    SDValue& BaseReg, SDValue &Offset) {
+  if (!dyn_cast<ConstantSDNode>(Addr)) {
+    BaseReg = Addr;
+    Offset = CurDAG->getIntPtrConstant(0, true);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                           SDValue &Offset) {
+  ConstantSDNode * IMMOffset;
+
+  if (Addr.getOpcode() == ISD::ADD
+      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && isInt<16>(IMMOffset->getZExtValue())) {
+
+      Base = Addr.getOperand(0);
+      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+      return true;
+  // If the pointer address is constant, we can move it to the offset field.
+  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
+             && isInt<16>(IMMOffset->getZExtValue())) {
+    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
+                                  SDLoc(CurDAG->getEntryNode()),
+                                  AMDGPU::ZERO, MVT::i32);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+    return true;
+  }
+
+  // Default case, no offset
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  ConstantSDNode *C;
+
+  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
+    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
+            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+  } else {
+    Base = Addr;
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  }
+
+  return true;
+}
+
+SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) {
+  APInt Demanded = APInt(32, 0x00FFFFFF);
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true);
+  const TargetLowering *TLI = getTargetLowering();
+  if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) {
+    CurDAG->ReplaceAllUsesWith(Op, TLO.New);
+    CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode());
+    return SimplifyI24(TLO.New);
+  } else {
+    return  Op;
+  }
+}
+
+bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) {
+
+  assert(Op.getValueType() == MVT::i32);
+
+  if (CurDAG->ComputeNumSignBits(Op) == 9) {
+    I24 = SimplifyI24(Op);
+    return true;
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) {
+  APInt KnownZero;
+  APInt KnownOne;
+  CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);
+
+  assert (Op.getValueType() == MVT::i32);
+
+  // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than
+  // i32.  These smaller types are legal to use with the i24 instructions.
+  if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 ||
+       Op.getOpcode() == ISD::ANY_EXTEND ||
+       ISD::isEXTLoad(Op.getNode())) {
+    U24 = SimplifyI24(Op);
+    return true;
+  }
+  return false;
+}
+
+void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
+  const AMDGPUTargetLowering& Lowering =
+    (*(const AMDGPUTargetLowering*)getTargetLowering());
+  bool IsModified = false;
+  do {
+    IsModified = false;
+    // Go over all selected nodes and try to fold them a bit more
+    for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+         E = CurDAG->allnodes_end(); I != E; ++I) {
+
+      SDNode *Node = I;
+
+      MachineSDNode *MachineNode = dyn_cast<MachineSDNode>(I);
+      if (!MachineNode)
+        continue;
+
+      SDNode *ResNode = Lowering.PostISelFolding(MachineNode, *CurDAG);
+      if (ResNode != Node) {
+        ReplaceUses(Node, ResNode);
+        IsModified = true;
+      }
+    }
+    CurDAG->RemoveDeadNodes();
+  } while (IsModified);
+}
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..c4d75ff 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -14,16 +14,29 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUISelLowering.h"
+#include "AMDGPU.h"
+#include "AMDGPUFrameLowering.h"
 #include "AMDGPURegisterInfo.h"
-#include "AMDILIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
+#include "AMDILIntrinsicInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
 
 using namespace llvm;
+static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
+                      CCValAssign::LocInfo LocInfo,
+                      ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() / 8, ArgFlags.getOrigAlign());
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+
+  return true;
+}
 
 #include "AMDGPUGenCallingConv.inc"
 
@@ -45,26 +58,171 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::FABS,   MVT::f32, Legal);
   setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
   setOperationAction(ISD::FRINT,  MVT::f32, Legal);
+  setOperationAction(ISD::FROUND, MVT::f32, Legal);
+
+  // The hardware supports ROTR, but not ROTL
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
 
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::STORE, MVT::f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
 
+  setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::STORE, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
+
+  setOperationAction(ISD::STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+
+  // Custom lowering of vector stores is required for local address space
+  // stores.
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  // XXX: Native v2i32 local address space stores are possible, but not
+  // currently implemented.
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+
+  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
+  setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
+  setTruncStoreAction(MVT::v4i32, MVT::v4i8, Custom);
+  // XXX: This can be change to Custom, once ExpandVectorStores can
+  // handle 64-bit stores.
+  setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
+  setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
+
+  setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
+
+  setOperationAction(ISD::LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f32, Custom);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, Expand);
+
+  setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
+  setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
+
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
   setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
   setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
+  setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
+
+  static const MVT::SimpleValueType IntTypes[] = {
+    MVT::v2i32, MVT::v4i32
+  };
+  const size_t NumIntTypes = array_lengthof(IntTypes);
+
+  for (unsigned int x  = 0; x < NumIntTypes; ++x) {
+    MVT::SimpleValueType VT = IntTypes[x];
+    //Expand the following operations for the current type by default
+    setOperationAction(ISD::ADD,  VT, Expand);
+    setOperationAction(ISD::AND,  VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
+    setOperationAction(ISD::MUL,  VT, Expand);
+    setOperationAction(ISD::OR,   VT, Expand);
+    setOperationAction(ISD::SHL,  VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::SRL,  VT, Expand);
+    setOperationAction(ISD::SRA,  VT, Expand);
+    setOperationAction(ISD::SUB,  VT, Expand);
+    setOperationAction(ISD::UDIV, VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
+    setOperationAction(ISD::UREM, VT, Expand);
+    setOperationAction(ISD::VSELECT, VT, Expand);
+    setOperationAction(ISD::XOR,  VT, Expand);
+  }
+
+  static const MVT::SimpleValueType FloatTypes[] = {
+    MVT::v2f32, MVT::v4f32
+  };
+  const size_t NumFloatTypes = array_lengthof(FloatTypes);
+
+  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
+    MVT::SimpleValueType VT = FloatTypes[x];
+    setOperationAction(ISD::FABS, VT, Expand);
+    setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FDIV, VT, Expand);
+    setOperationAction(ISD::FFLOOR, VT, Expand);
+    setOperationAction(ISD::FMUL, VT, Expand);
+    setOperationAction(ISD::FRINT, VT, Expand);
+    setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FSUB, VT, Expand);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Target Information
+//===----------------------------------------------------------------------===//
+
+MVT AMDGPUTargetLowering::getVectorIdxTy() const {
+  return MVT::i32;
+}
+
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
+    return true;
+
+  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+
+  return ((LScalarSize <= CastScalarSize) ||
+          (CastScalarSize >= 32) ||
+          (LScalarSize < 32));
+}
+
+//===---------------------------------------------------------------------===//
+// Target Properties
+//===---------------------------------------------------------------------===//
+
+bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32;
+}
+
+bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32;
 }
 
 //===---------------------------------------------------------------------===//
@@ -83,7 +241,7 @@ SDValue AMDGPUTargetLowering::LowerReturn(
                                      bool isVarArg,
                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
                                      const SmallVectorImpl<SDValue> &OutVals,
-                                     DebugLoc DL, SelectionDAG &DAG) const {
+                                     SDLoc DL, SelectionDAG &DAG) const {
   return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
 }
 
@@ -105,16 +263,104 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   // AMDGPU DAG lowering
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
+  case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   }
   return Op;
 }
 
+SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
+                                                 SDValue Op,
+                                                 SelectionDAG &DAG) const {
+
+  const DataLayout *TD = getTargetMachine().getDataLayout();
+  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
+
+  assert(G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS);
+  // XXX: What does the value of G->getOffset() mean?
+  assert(G->getOffset() == 0 &&
+         "Do not know what to do with an non-zero offset");
+
+  const GlobalValue *GV = G->getGlobal();
+
+  unsigned Offset;
+  if (MFI->LocalMemoryObjects.count(GV) == 0) {
+    uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
+    Offset = MFI->LDSSize;
+    MFI->LocalMemoryObjects[GV] = Offset;
+    // XXX: Account for alignment?
+    MFI->LDSSize += Size;
+  } else {
+    Offset = MFI->LocalMemoryObjects[GV];
+  }
+
+  return DAG.getConstant(Offset, getPointerTy(G->getAddressSpace()));
+}
+
+void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &Args,
+                                         unsigned Start,
+                                         unsigned Count) const {
+  EVT VT = Op.getValueType();
+  for (unsigned i = Start, e = Start + Count; i != e; ++i) {
+    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
+                               VT.getVectorElementType(),
+                               Op, DAG.getConstant(i, MVT::i32)));
+  }
+}
+
+SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SmallVector<SDValue, 8> Args;
+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  ExtractVectorElements(A, DAG, Args, 0,
+                        A.getValueType().getVectorNumElements());
+  ExtractVectorElements(B, DAG, Args, 0,
+                        B.getValueType().getVectorNumElements());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     &Args[0], Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+
+  SmallVector<SDValue, 8> Args;
+  EVT VT = Op.getValueType();
+  unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+  ExtractVectorElements(Op.getOperand(0), DAG, Args, Start,
+                        VT.getVectorNumElements());
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
+                     &Args[0], Args.size());
+}
+
+SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
+                                              SelectionDAG &DAG) const {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const AMDGPUFrameLowering *TFL =
+   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
+
+  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
+  assert(FIN);
+
+  unsigned FrameIndex = FIN->getIndex();
+  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
+  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF),
+                         Op.getValueType());
+}
+
 SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     SelectionDAG &DAG) const {
   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
   switch (IntrinsicID) {
@@ -154,7 +400,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
     SelectionDAG &DAG) const {
 
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
                                               Op.getOperand(1));
@@ -166,7 +412,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
 /// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
 SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
     SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
                                 DAG.getConstantFP(1.0f, MVT::f32),
@@ -181,7 +427,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
 /// \brief Generate Min/Max node
 SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
     SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
   SDValue LHS = Op.getOperand(0);
@@ -238,11 +484,126 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
   return Op;
 }
 
+SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
+                                              SelectionDAG &DAG) const {
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+  EVT MemEltVT = Load->getMemoryVT().getVectorElementType();
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  EVT PtrVT = Load->getBasePtr().getValueType();
+  unsigned NumElts = Load->getMemoryVT().getVectorNumElements();
+  SmallVector<SDValue, 8> Loads;
+  SDLoc SL(Op);
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, Load->getBasePtr(),
+                    DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8), PtrVT));
+    Loads.push_back(DAG.getExtLoad(Load->getExtensionType(), SL, EltVT,
+                        Load->getChain(), Ptr,
+                        MachinePointerInfo(Load->getMemOperand()->getValue()),
+                        MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
+                        Load->getAlignment()));
+  }
+  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), &Loads[0],
+                     Loads.size());
+}
+
+SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  StoreSDNode *Store = dyn_cast<StoreSDNode>(Op);
+  EVT MemVT = Store->getMemoryVT();
+  unsigned MemBits = MemVT.getSizeInBits();
 
+  // Byte stores are really expensive, so if possible, try to pack
+  // 32-bit vector truncatating store into an i32 store.
+  // XXX: We could also handle optimize other vector bitwidths
+  if (!MemVT.isVector() || MemBits > 32) {
+    return SDValue();
+  }
+
+  SDLoc DL(Op);
+  const SDValue &Value = Store->getValue();
+  EVT VT = Value.getValueType();
+  const SDValue &Ptr = Store->getBasePtr();
+  EVT MemEltVT = MemVT.getVectorElementType();
+  unsigned MemEltBits = MemEltVT.getSizeInBits();
+  unsigned MemNumElements = MemVT.getVectorNumElements();
+  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
+  SDValue Mask;
+  switch(MemEltBits) {
+  case 8:
+    Mask = DAG.getConstant(0xFF, PackedVT);
+    break;
+  case 16:
+    Mask = DAG.getConstant(0xFFFF, PackedVT);
+    break;
+  default:
+    llvm_unreachable("Cannot lower this vector store");
+  }
+  SDValue PackedValue;
+  for (unsigned i = 0; i < MemNumElements; ++i) {
+    EVT ElemVT = VT.getVectorElementType();
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
+                              DAG.getConstant(i, MVT::i32));
+    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
+    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
+    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
+    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+    if (i == 0) {
+      PackedValue = Elt;
+    } else {
+      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+    }
+  }
+  return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
+                      MachinePointerInfo(Store->getMemOperand()->getValue()),
+                      Store->isVolatile(),  Store->isNonTemporal(),
+                      Store->getAlignment());
+}
+
+SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT MemEltVT = Store->getMemoryVT().getVectorElementType();
+  EVT EltVT = Store->getValue().getValueType().getVectorElementType();
+  EVT PtrVT = Store->getBasePtr().getValueType();
+  unsigned NumElts = Store->getMemoryVT().getVectorNumElements();
+  SDLoc SL(Op);
+
+  SmallVector<SDValue, 8> Chains;
+
+  for (unsigned i = 0, e = NumElts; i != e; ++i) {
+    SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+                              Store->getValue(), DAG.getConstant(i, MVT::i32));
+    SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT,
+                              Store->getBasePtr(),
+                            DAG.getConstant(i * (MemEltVT.getSizeInBits() / 8),
+                                            PtrVT));
+    Chains.push_back(DAG.getTruncStore(Store->getChain(), SL, Val, Ptr,
+                         MachinePointerInfo(Store->getMemOperand()->getValue()),
+                         MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
+                         Store->getAlignment()));
+  }
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
+}
+
+SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Result = AMDGPUTargetLowering::MergeVectorStore(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
+
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  if ((Store->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+       Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+      Store->getValue().getValueType().isVector()) {
+    return SplitVectorStore(Op, DAG);
+  }
+  return SDValue();
+}
 
 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
     SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
   SDValue Num = Op.getOperand(0);
@@ -295,13 +656,13 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
                                                  DAG.getConstant(-1, VT),
                                                  DAG.getConstant(0, VT),
-                                                 ISD::SETGE);
-  // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0)
-  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder,
-                                                  DAG.getConstant(0, VT),
+                                                 ISD::SETUGE);
+  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
+  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
+                                                  Num_S_Remainder,
                                                   DAG.getConstant(-1, VT),
                                                   DAG.getConstant(0, VT),
-                                                  ISD::SETGE);
+                                                  ISD::SETUGE);
   // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
   SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
                                                Remainder_GE_Zero);
@@ -345,10 +706,62 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
+SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  SDValue S0 = Op.getOperand(0);
+  SDLoc DL(Op);
+  if (Op.getValueType() != MVT::f32 || S0.getValueType() != MVT::i64)
+    return SDValue();
+
+  // f32 uint_to_fp i64
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(0, MVT::i32));
+  SDValue FloatLo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Lo);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, S0,
+                           DAG.getConstant(1, MVT::i32));
+  SDValue FloatHi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, Hi);
+  FloatHi = DAG.getNode(ISD::FMUL, DL, MVT::f32, FloatHi,
+                        DAG.getConstantFP(4294967296.0f, MVT::f32)); // 2^32
+  return DAG.getNode(ISD::FADD, DL, MVT::f32, FloatLo, FloatHi);
+
+}
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
 
+void AMDGPUTargetLowering::getOriginalFunctionArgs(
+                               SelectionDAG &DAG,
+                               const Function *F,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SmallVectorImpl<ISD::InputArg> &OrigIns) const {
+
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+    if (Ins[i].ArgVT == Ins[i].VT) {
+      OrigIns.push_back(Ins[i]);
+      continue;
+    }
+
+    EVT VT;
+    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
+      // Vector has been split into scalars.
+      VT = Ins[i].ArgVT.getVectorElementType();
+    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
+               Ins[i].ArgVT.getVectorElementType() !=
+               Ins[i].VT.getVectorElementType()) {
+      // Vector elements have been promoted
+      VT = Ins[i].ArgVT;
+    } else {
+      // Vector has been spilt into smaller vectors.
+      VT = Ins[i].VT;
+    }
+
+    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
+                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
+    OrigIns.push_back(Arg);
+  }
+}
+
 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const {
   if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) {
     return CFP->isExactlyValue(1.0);
@@ -410,5 +823,13 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
   NODE_NAME_CASE(REGISTER_STORE)
+  NODE_NAME_CASE(LOAD_CONSTANT)
+  NODE_NAME_CASE(LOAD_INPUT)
+  NODE_NAME_CASE(SAMPLE)
+  NODE_NAME_CASE(SAMPLEB)
+  NODE_NAME_CASE(SAMPLED)
+  NODE_NAME_CASE(SAMPLEL)
+  NODE_NAME_CASE(STORE_MSKOR)
+  NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
   }
 }
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index c2a79ea..2dfd3cf 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -20,12 +20,25 @@
 
 namespace llvm {
 
+class AMDGPUMachineFunction;
 class MachineRegisterInfo;
 
 class AMDGPUTargetLowering : public TargetLowering {
 private:
+  void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &Args,
+                             unsigned Start, unsigned Count) const;
+  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+  /// \brief Lower vector stores by merging the vector elements into an integer
+  /// of the same bitwidth.
+  SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
+  /// \brief Split a vector store into multiple scalar stores.
+  /// \returns The resulting chain. 
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
 
 protected:
 
@@ -33,23 +46,43 @@ protected:
   /// MachineFunction.
   ///
   /// \returns a RegisterSDNode representing Reg.
-  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
-                                                  unsigned Reg, EVT VT) const;
-
+  virtual SDValue CreateLiveInRegister(SelectionDAG &DAG,
+                                       const TargetRegisterClass *RC,
+                                       unsigned Reg, EVT VT) const;
+  SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
+                             SelectionDAG &DAG) const;
+  /// \brief Split a vector load into multiple scalar loads.
+  SDValue SplitVectorLoad(const SDValue &Op, SelectionDAG &DAG) const;
+  SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
+  /// The SelectionDAGBuilder will automatically promote function arguments
+  /// with illegal types.  However, this does not work for the AMDGPU targets
+  /// since the function arguments are stored in memory as these illegal types.
+  /// In order to handle this properly we need to get the origianl types sizes
+  /// from the LLVM IR Function and fixup the ISD:InputArg values before
+  /// passing them to AnalyzeFormalArguments()
+  void getOriginalFunctionArgs(SelectionDAG &DAG,
+                               const Function *F,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               SmallVectorImpl<ISD::InputArg> &OrigIns) const;
   void AnalyzeFormalArguments(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
 
 public:
   AMDGPUTargetLowering(TargetMachine &TM);
 
+  virtual bool isFAbsFree(EVT VT) const;
+  virtual bool isFNegFree(EVT VT) const;
+  virtual MVT getVectorIdxTy() const;
+  virtual bool isLoadBitCastBeneficial(EVT, EVT) const LLVM_OVERRIDE;
   virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
-                              DebugLoc DL, SelectionDAG &DAG) const;
+                              SDLoc DL, SelectionDAG &DAG) const;
   virtual SDValue LowerCall(CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
     CLI.Callee.dump();
@@ -115,10 +148,10 @@ enum {
   RET_FLAG,
   BRANCH_COND,
   // End AMDIL ISD Opcodes
-  BITALIGN,
-  BUFFER_STORE,
   DWORDADDR,
   FRACT,
+  COS_HW,
+  SIN_HW,
   FMAX,
   SMAX,
   UMAX,
@@ -126,10 +159,21 @@ enum {
   SMIN,
   UMIN,
   URECIP,
+  DOT4,
+  TEXTURE_FETCH,
   EXPORT,
   CONST_ADDRESS,
   REGISTER_LOAD,
   REGISTER_STORE,
+  LOAD_INPUT,
+  SAMPLE,
+  SAMPLEB,
+  SAMPLED,
+  SAMPLEL,
+  FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  STORE_MSKOR,
+  LOAD_CONSTANT,
+  TBUFFER_STORE_FORMAT,
   LAST_AMDGPU_ISD_NUMBER
 };
 
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
deleted file mode 100644
index ed6c8ec..0000000
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ /dev/null
@@ -1,343 +0,0 @@
-//===-- AMDGPUIndirectAddressing.cpp - Indirect Adressing Support ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Instructions can use indirect addressing to index the register file as if it
-/// were memory.  This pass lowers RegisterLoad and RegisterStore instructions
-/// to either a COPY or a MOV that uses indirect addressing.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "R600InstrInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-namespace {
-
-class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
-
-private:
-  static char ID;
-  const AMDGPUInstrInfo *TII;
-
-  bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
-
-public:
-  AMDGPUIndirectAddressingPass(TargetMachine &tm) :
-    MachineFunctionPass(ID),
-    TII(static_cast<const AMDGPUInstrInfo*>(tm.getInstrInfo()))
-    { }
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  const char *getPassName() const { return "R600 Handle indirect addressing"; }
-
-};
-
-} // End anonymous namespace
-
-char AMDGPUIndirectAddressingPass::ID = 0;
-
-FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
-  return new AMDGPUIndirectAddressingPass(tm);
-}
-
-bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-
-  int IndirectBegin = TII->getIndirectIndexBegin(MF);
-  int IndirectEnd = TII->getIndirectIndexEnd(MF);
-
-  if (IndirectBegin == -1) {
-    // No indirect addressing, we can skip this pass
-    assert(IndirectEnd == -1);
-    return false;
-  }
-
-  // The map keeps track of the indirect address that is represented by
-  // each virtual register. The key is the register and the value is the
-  // indirect address it uses.
-  std::map<unsigned, unsigned> RegisterAddressMap;
-
-  // First pass - Lower all of the RegisterStore instructions and track which
-  // registers are live.
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                      BB != BB_E; ++BB) {
-    // This map keeps track of the current live indirect registers.
-    // The key is the address and the value is the register
-    std::map<unsigned, unsigned> LiveAddressRegisterMap;
-    MachineBasicBlock &MBB = *BB;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-                               I != MBB.end(); I = Next) {
-      Next = llvm::next(I);
-      MachineInstr &MI = *I;
-
-      if (!TII->isRegisterStore(MI)) {
-        continue;
-      }
-
-      // Lower RegisterStore
-
-      unsigned RegIndex = MI.getOperand(2).getImm();
-      unsigned Channel = MI.getOperand(3).getImm();
-      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
-      const TargetRegisterClass *IndirectStoreRegClass =
-                   TII->getIndirectAddrStoreRegClass(MI.getOperand(0).getReg());
-
-      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
-        // Direct register access.
-        unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
-
-        BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY), DstReg)
-                .addOperand(MI.getOperand(0));
-
-        RegisterAddressMap[DstReg] = Address;
-        LiveAddressRegisterMap[Address] = DstReg;
-      } else {
-        // Indirect register access.
-        MachineInstrBuilder MOV = TII->buildIndirectWrite(BB, I,
-                                           MI.getOperand(0).getReg(), // Value
-                                           Address,
-                                           MI.getOperand(1).getReg()); // Offset
-        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
-          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
-          unsigned DstReg = MRI.createVirtualRegister(IndirectStoreRegClass);
-          MOV.addReg(DstReg, RegState::Define | RegState::Implicit);
-          RegisterAddressMap[DstReg] = Addr;
-          LiveAddressRegisterMap[Addr] = DstReg;
-        }
-      }
-      MI.eraseFromParent();
-    }
-
-    // Update the live-ins of the succesor blocks
-    for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
-                                          SuccEnd = MBB.succ_end();
-                                          SuccEnd != Succ; ++Succ) {
-      std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
-      for (Key = LiveAddressRegisterMap.begin(),
-           KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
-        (*Succ)->addLiveIn(Key->second);
-      }
-    }
-  }
-
-  // Second pass - Lower the RegisterLoad instructions
-  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
-                                                      BB != BB_E; ++BB) {
-    // Key is the address and the value is the register
-    std::map<unsigned, unsigned> LiveAddressRegisterMap;
-    MachineBasicBlock &MBB = *BB;
-
-    MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
-    while (LI != MBB.livein_end()) {
-      std::vector<unsigned> PhiRegisters;
-
-      // Make sure this live in is used for indirect addressing
-      if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
-        ++LI;
-        continue;
-      }
-
-      unsigned Address = RegisterAddressMap[*LI];
-      LiveAddressRegisterMap[Address] = *LI;
-      PhiRegisters.push_back(*LI);
-
-      // Check if there are other live in registers which map to the same
-      // indirect address.
-      for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
-                                              LE = MBB.livein_end();
-                                              LJ != LE; ++LJ) {
-        unsigned Reg = *LJ;
-        if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
-          continue;
-        }
-
-        if (RegisterAddressMap[Reg] == Address) {
-          PhiRegisters.push_back(Reg);
-        }
-      }
-
-      if (PhiRegisters.size() == 1) {
-        // We don't need to insert a Phi instruction, so we can just add the
-        // registers to the live list for the block.
-        LiveAddressRegisterMap[Address] = *LI;
-        MBB.removeLiveIn(*LI);
-      } else {
-        // We need to insert a PHI, because we have the same address being
-        // written in multiple predecessor blocks.
-        const TargetRegisterClass *PhiDstClass =
-                   TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
-        unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
-        MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
-                                          MBB.findDebugLoc(MBB.begin()),
-                                          TII->get(AMDGPU::PHI), PhiDstReg);
-
-        for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
-                                                   RE = PhiRegisters.end();
-                                                   RI != RE; ++RI) {
-          unsigned Reg = *RI;
-          MachineInstr *DefInst = MRI.getVRegDef(Reg);
-          assert(DefInst);
-          MachineBasicBlock *RegBlock = DefInst->getParent();
-          Phi.addReg(Reg);
-          Phi.addMBB(RegBlock);
-          MBB.removeLiveIn(Reg);
-        }
-        RegisterAddressMap[PhiDstReg] = Address;
-        LiveAddressRegisterMap[Address] = PhiDstReg;
-      }
-      LI = MBB.livein_begin();
-    }
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
-                               I != MBB.end(); I = Next) {
-      Next = llvm::next(I);
-      MachineInstr &MI = *I;
-
-      if (!TII->isRegisterLoad(MI)) {
-        if (MI.getOpcode() == AMDGPU::PHI) {
-          continue;
-        }
-        // Check for indirect register defs
-        for (unsigned OpIdx = 0, NumOperands = MI.getNumOperands();
-                                 OpIdx < NumOperands; ++OpIdx) {
-          MachineOperand &MO = MI.getOperand(OpIdx);
-          if (MO.isReg() && MO.isDef() &&
-              RegisterAddressMap.find(MO.getReg()) != RegisterAddressMap.end()) {
-            unsigned Reg = MO.getReg();
-            unsigned LiveAddress = RegisterAddressMap[Reg];
-            // Chain the live-ins
-            if (LiveAddressRegisterMap.find(LiveAddress) !=
-                                                     RegisterAddressMap.end()) {
-              MI.addOperand(MachineOperand::CreateReg(
-                                  LiveAddressRegisterMap[LiveAddress],
-                                  false, // isDef
-                                  true,  // isImp
-                                  true));  // isKill
-            }
-            LiveAddressRegisterMap[LiveAddress] = Reg;
-          }
-        }
-        continue;
-      }
-
-      const TargetRegisterClass *SuperIndirectRegClass =
-                                                TII->getSuperIndirectRegClass();
-      const TargetRegisterClass *IndirectLoadRegClass =
-                                             TII->getIndirectAddrLoadRegClass();
-      unsigned IndirectReg = MRI.createVirtualRegister(SuperIndirectRegClass);
-
-      unsigned RegIndex = MI.getOperand(2).getImm();
-      unsigned Channel = MI.getOperand(3).getImm();
-      unsigned Address = TII->calculateIndirectAddress(RegIndex, Channel);
-
-      if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
-        // Direct register access
-        unsigned Reg = LiveAddressRegisterMap[Address];
-        unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
-
-        if (regHasExplicitDef(MRI, Reg)) {
-          // If the register we are reading from has an explicit def, then that
-          // means it was written via a direct register access (i.e. COPY
-          // or other instruction that doesn't use indirect addressing).  In
-          // this case we know where the value has been stored, so we can just
-          // issue a copy.
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
-                  MI.getOperand(0).getReg())
-                  .addReg(Reg);
-        } else {
-          // If the register we are reading has an implicit def, then that
-          // means it was written by an indirect register access (i.e. An
-          // instruction that uses indirect addressing. 
-          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
-                   MI.getOperand(0).getReg())
-                   .addReg(AddrReg)
-                   .addReg(Reg, RegState::Implicit);
-        }
-      } else {
-        // Indirect register access
-
-        // Note on REQ_SEQUENCE instructons: You can't actually use the register
-        // it defines unless  you have an instruction that takes the defined
-        // register class as an operand.
-
-        MachineInstrBuilder Sequence = BuildMI(MBB, I, MBB.findDebugLoc(I),
-                                               TII->get(AMDGPU::REG_SEQUENCE),
-                                               IndirectReg);
-        for (int i = IndirectBegin; i <= IndirectEnd; ++i) {
-          unsigned Addr = TII->calculateIndirectAddress(i, Channel);
-          if (LiveAddressRegisterMap.find(Addr) == LiveAddressRegisterMap.end()) {
-            continue;
-          }
-          unsigned Reg = LiveAddressRegisterMap[Addr];
-
-          // We only need to use REG_SEQUENCE for explicit defs, since the
-          // register coalescer won't do anything with the implicit defs.
-          if (!regHasExplicitDef(MRI, Reg)) {
-            continue;
-          }
-
-          // Insert a REQ_SEQUENCE instruction to force the register allocator
-          // to allocate the virtual register to the correct physical register.
-          Sequence.addReg(LiveAddressRegisterMap[Addr]);
-          Sequence.addImm(TII->getRegisterInfo().getIndirectSubReg(Addr));
-        }
-        MachineInstrBuilder Mov = TII->buildIndirectRead(BB, I,
-                                           MI.getOperand(0).getReg(), // Value
-                                           Address,
-                                           MI.getOperand(1).getReg()); // Offset
-
-
-
-        Mov.addReg(IndirectReg, RegState::Implicit | RegState::Kill);
-        Mov.addReg(LiveAddressRegisterMap[Address], RegState::Implicit);
-
-      }
-      MI.eraseFromParent();
-    }
-  }
-  return false;
-}
-
-bool AMDGPUIndirectAddressingPass::regHasExplicitDef(MachineRegisterInfo &MRI,
-                                                  unsigned Reg) const {
-  MachineInstr *DefInstr = MRI.getVRegDef(Reg);
-
-  if (!DefInstr) {
-    return false;
-  }
-
-  if (DefInstr->getOpcode() == AMDGPU::PHI) {
-    bool Explicit = false;
-    for (MachineInstr::const_mop_iterator I = DefInstr->operands_begin(),
-                                          E = DefInstr->operands_end();
-                                          I != E; ++I) {
-      const MachineOperand &MO = *I;
-      if (!MO.isReg() || MO.isDef()) {
-        continue;
-      }
-
-      Explicit = Explicit || regHasExplicitDef(MRI, MO.getReg());
-    }
-    return Explicit;
-  }
-
-  return DefInstr->getOperand(0).isReg() &&
-         DefInstr->getOperand(0).getReg() == Reg;
-}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index 30f736c..4f7084b 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -16,19 +16,23 @@
 #include "AMDGPUInstrInfo.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUTargetMachine.h"
-#include "AMDIL.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
+#define GET_INSTRINFO_NAMED_OPS
 #define GET_INSTRMAP_INFO
 #include "AMDGPUGenInstrInfo.inc"
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void AMDGPUInstrInfo::anchor() {}
+
 AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm)
-  : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { }
+  : AMDGPUGenInstrInfo(-1,-1), RI(tm), TM(tm) { }
 
 const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const {
   return RI;
@@ -99,27 +103,6 @@ bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
   return false;
 }
 
-MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) {
-  MachineBasicBlock::iterator tmp = MBB->end();
-  if (!MBB->size()) {
-    return MBB->end();
-  }
-  while (--tmp) {
-    if (tmp->getOpcode() == AMDGPU::ENDLOOP
-        || tmp->getOpcode() == AMDGPU::ENDIF
-        || tmp->getOpcode() == AMDGPU::ELSE) {
-      if (tmp == MBB->begin()) {
-        return tmp;
-      } else {
-        continue;
-      }
-    }  else {
-      return ++tmp;
-    }
-  }
-  return MBB->end();
-}
-
 void
 AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MI,
@@ -139,6 +122,55 @@ AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
   assert(!"Not Implemented");
 }
 
+bool AMDGPUInstrInfo::expandPostRAPseudo (MachineBasicBlock::iterator MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+   int OffsetOpIdx =
+       AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::addr);
+   // addr is a custom operand with multiple MI operands, and only the
+   // first MI operand is given a name.
+  int RegOpIdx = OffsetOpIdx + 1;
+  int ChanOpIdx =
+      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::chan);
+
+  if (isRegisterLoad(*MI)) {
+    int DstOpIdx =
+        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+      buildMovInstr(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+                    getIndirectAddrRegClass()->getRegister(Address));
+    } else {
+      buildIndirectRead(MBB, MI, MI->getOperand(DstOpIdx).getReg(),
+                        Address, OffsetReg);
+    }
+  } else if (isRegisterStore(*MI)) {
+    int ValOpIdx =
+        AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::val);
+    AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+    unsigned RegIndex = MI->getOperand(RegOpIdx).getImm();
+    unsigned Channel = MI->getOperand(ChanOpIdx).getImm();
+    unsigned Address = calculateIndirectAddress(RegIndex, Channel);
+    unsigned OffsetReg = MI->getOperand(OffsetOpIdx).getReg();
+    if (OffsetReg == AMDGPU::INDIRECT_BASE_ADDR) {
+      buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
+                    MI->getOperand(ValOpIdx).getReg());
+    } else {
+      buildIndirectWrite(MBB, MI, MI->getOperand(ValOpIdx).getReg(),
+                         calculateIndirectAddress(RegIndex, Channel),
+                         OffsetReg);
+    }
+  } else {
+    return false;
+  }
+
+  MBB->erase(MI);
+  return true;
+}
+
+
 MachineInstr *
 AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr *MI,
@@ -244,6 +276,57 @@ bool AMDGPUInstrInfo::isRegisterLoad(const MachineInstr &MI) const {
   return get(MI.getOpcode()).TSFlags & AMDGPU_FLAG_REGISTER_LOAD;
 }
 
+int AMDGPUInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = -1;
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  if (MRI.livein_empty()) {
+    return 0;
+  }
+
+  const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
+  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
+                                            LE = MRI.livein_end();
+                                            LI != LE; ++LI) {
+    unsigned Reg = LI->first;
+    if (TargetRegisterInfo::isVirtualRegister(Reg) ||
+        !IndirectRC->contains(Reg))
+      continue;
+
+    unsigned RegIndex;
+    unsigned RegEnd;
+    for (RegIndex = 0, RegEnd = IndirectRC->getNumRegs(); RegIndex != RegEnd;
+                                                          ++RegIndex) {
+      if (IndirectRC->getRegister(RegIndex) == Reg)
+        break;
+    }
+    Offset = std::max(Offset, (int)RegIndex);
+  }
+
+  return Offset + 1;
+}
+
+int AMDGPUInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
+  int Offset = 0;
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Variable sized objects are not supported
+  assert(!MFI->hasVarSizedObjects());
+
+  if (MFI->getNumObjects() == 0) {
+    return -1;
+  }
+
+  Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
+
+  return getIndirectIndexBegin(MF) + Offset;
+}
+
 
 void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
     DebugLoc DL) const {
@@ -265,3 +348,12 @@ void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF,
     }
   }
 }
+
+int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
+  switch (Channels) {
+  default: return Opcode;
+  case 1: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_1);
+  case 2: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_2);
+  case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
+  }
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 3909e4e..ce5b58c 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -23,6 +23,7 @@
 
 #define GET_INSTRINFO_HEADER
 #define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
 #include "AMDGPUGenInstrInfo.inc"
 
 #define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT
@@ -42,6 +43,7 @@ private:
   const AMDGPURegisterInfo RI;
   bool getNextBranchInstr(MachineBasicBlock::iterator &iter,
                           MachineBasicBlock &MBB) const;
+  virtual void anchor();
 protected:
   TargetMachine &TM;
 public:
@@ -86,6 +88,8 @@ public:
                             unsigned DestReg, int FrameIndex,
                             const TargetRegisterClass *RC,
                             const TargetRegisterInfo *TRI) const;
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+
 
 protected:
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
@@ -96,6 +100,14 @@ protected:
                                       MachineInstr *MI,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr *LoadMI) const;
+  /// \returns the smallest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+
+  /// \returns the largest register index that will be accessed by an indirect
+  /// read or write or -1 if indirect addressing is not used by this program.
+  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+
 public:
   bool canFoldMemoryOperand(const MachineInstr *MI,
                             const SmallVectorImpl<unsigned> &Ops) const;
@@ -138,19 +150,9 @@ public:
 // Pure virtual funtions to be implemented by sub-classes.
 //===---------------------------------------------------------------------===//
 
-  virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                       int64_t Imm) const = 0;
   virtual unsigned getIEQOpcode() const = 0;
   virtual bool isMov(unsigned opcode) const = 0;
 
-  /// \returns the smallest register index that will be accessed by an indirect
-  /// read or write or -1 if indirect addressing is not used by this program.
-  virtual int getIndirectIndexBegin(const MachineFunction &MF) const = 0;
-
-  /// \returns the largest register index that will be accessed by an indirect
-  /// read or write or -1 if indirect addressing is not used by this program.
-  virtual int getIndirectIndexEnd(const MachineFunction &MF) const = 0;
-
   /// \brief Calculate the "Indirect Address" for the given \p RegIndex and
   ///        \p Channel
   ///
@@ -161,14 +163,9 @@ public:
   virtual unsigned calculateIndirectAddress(unsigned RegIndex,
                                             unsigned Channel) const = 0;
 
-  /// \returns The register class to be used for storing values to an
-  /// "Indirect Address" .
-  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
-                                                  unsigned SourceReg) const = 0;
-
-  /// \returns The register class to be used for loading values from
-  /// an "Indirect Address" .
-  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const = 0;
+  /// \returns The register class to be used for loading and storing values
+  /// from an "Indirect Address" .
+  virtual const TargetRegisterClass *getIndirectAddrRegClass() const = 0;
 
   /// \brief Build instruction(s) for an indirect register write.
   ///
@@ -186,18 +183,27 @@ public:
                                     unsigned ValueReg, unsigned Address,
                                     unsigned OffsetReg) const = 0;
 
-  /// \returns the register class whose sub registers are the set of all
-  /// possible registers that can be used for indirect addressing.
-  virtual const TargetRegisterClass *getSuperIndirectRegClass() const = 0;
-
 
   /// \brief Convert the AMDIL MachineInstr to a supported ISA
   /// MachineInstr
   virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
     DebugLoc DL) const;
 
+  /// \brief Build a MOV instruction.
+  virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator I,
+                                      unsigned DstReg, unsigned SrcReg) const = 0;
+
+  /// \brief Given a MIMG \p Opcode that writes all 4 channels, return the
+  /// equivalent opcode that writes \p Channels Channels.
+  int getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const;
+
 };
 
+namespace AMDGPU {
+  int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIndex);
+}  // End namespace AMDGPU
+
 } // End llvm namespace
 
 #define AMDGPU_FLAG_REGISTER_LOAD  (UINT64_C(1) << 63)
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index b66ae87..fccede0 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -23,12 +23,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
 // AMDGPU DAG Nodes
 //
 
-// out = ((a << 32) | b) >> c)
-//
-// Can be used to optimize rtol:
-// rotl(a, b) = bitalign(a, a, 32 - b)
-def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>;
-
 // This argument to this node is a dword address.
 def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
 
@@ -71,8 +65,6 @@ def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp,
 // e is rounding error
 def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
 
-def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>;
-
 def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
                           SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
                           [SDNPHasChain, SDNPMayLoad]>;
@@ -80,3 +72,17 @@ def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
 def AMDGPUregister_store : SDNode<"AMDGPUISD::REGISTER_STORE",
                            SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
                            [SDNPHasChain, SDNPMayStore]>;
+
+// MSKOR instructions are atomic memory instructions used mainly for storing
+// 8-bit and 16-bit values.  The definition is:
+//
+// MSKOR(dst, mask, src) MEM[dst] = ((MEM[dst] & ~mask) | src)
+//
+// src0: vec4(src, 0, 0, mask)
+// src1: dst - rat offset (aka pointer) in dwords  
+def AMDGPUstore_mskor : SDNode<"AMDGPUISD::STORE_MSKOR",
+                        SDTypeProfile<0, 2, []>,
+                        [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AMDGPUround : SDNode<"ISD::FROUND",
+                         SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index d2620b2..3c5375d 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -35,46 +35,75 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
 }
 
 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
+def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
 
-def COND_EQ : PatLeaf <
+//===----------------------------------------------------------------------===//
+// PatLeafs for floating-point comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_OEQ : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOEQ: case ISD::SETUEQ:
-                     case ISD::SETEQ: return true;}}}]
+  [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
 >;
 
-def COND_NE : PatLeaf <
+def COND_OGT : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETONE: case ISD::SETUNE:
-                     case ISD::SETNE: return true;}}}]
+  [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
 >;
-def COND_GT : PatLeaf <
+
+def COND_OGE : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOGT: case ISD::SETUGT:
-                     case ISD::SETGT: return true;}}}]
+  [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
 >;
 
-def COND_GE : PatLeaf <
+def COND_OLT : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOGE: case ISD::SETUGE:
-                     case ISD::SETGE: return true;}}}]
+  [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
 >;
 
-def COND_LT : PatLeaf <
+def COND_OLE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
+>;
+
+def COND_UNE : PatLeaf <
+  (cond),
+  [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
+>;
+
+def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
+def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for unsigned comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
+def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
+def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
+def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for signed comparisons
+//===----------------------------------------------------------------------===//
+
+def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
+def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
+def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
+def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+
+//===----------------------------------------------------------------------===//
+// PatLeafs for integer equality
+//===----------------------------------------------------------------------===//
+
+def COND_EQ : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOLT: case ISD::SETULT:
-                     case ISD::SETLT: return true;}}}]
+  [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
 >;
 
-def COND_LE : PatLeaf <
+def COND_NE : PatLeaf <
   (cond),
-  [{switch(N->get()){{default: return false;
-                     case ISD::SETOLE: case ISD::SETULE:
-                     case ISD::SETLE: return true;}}}]
+  [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
 >;
 
 def COND_NULL : PatLeaf <
@@ -86,15 +115,131 @@ def COND_NULL : PatLeaf <
 // Load/Store Pattern Fragments
 //===----------------------------------------------------------------------===//
 
-def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{
+def az_extload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  LoadSDNode *L = cast<LoadSDNode>(N);
+  return L->getExtensionType() == ISD::ZEXTLOAD ||
+         L->getExtensionType() == ISD::EXTLOAD;
+}]>;
+
+def az_extloadi8 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def az_extloadi8_global : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
     return isGlobalLoad(dyn_cast<LoadSDNode>(N));
 }]>;
 
+def sextloadi8_global : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def sextloadi8_constant : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi8_local : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi16 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def az_extloadi16_global : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_global : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def sextloadi16_constant : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def sextloadi16_local : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def az_extloadi32_global : PatFrag<(ops node:$ptr),
+                                   (az_extloadi32 node:$ptr), [{
+  return isGlobalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def az_extloadi32_constant : PatFrag<(ops node:$ptr),
+                                     (az_extloadi32 node:$ptr), [{
+  return isConstantLoad(dyn_cast<LoadSDNode>(N), -1);
+}]>;
+
+def truncstorei8_global : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei8 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_global : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei16 node:$val, node:$ptr), [{
+  return isGlobalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def local_store : PatFrag<(ops node:$val, node:$ptr),
+                             (store node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei8_local : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei8 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def truncstorei16_local : PatFrag<(ops node:$val, node:$ptr),
+                                  (truncstorei16 node:$val, node:$ptr), [{
+  return isLocalStore(dyn_cast<StoreSDNode>(N));
+}]>;
+
+def local_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+    return isLocalLoad(dyn_cast<LoadSDNode>(N));
+}]>;
+
+def atomic_load_add_local : PatFrag<(ops node:$ptr, node:$value),
+                                    (atomic_load_add node:$ptr, node:$value), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def atomic_load_sub_local : PatFrag<(ops node:$ptr, node:$value),
+                                    (atomic_load_sub node:$ptr, node:$value), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+}]>;
+
+def mskor_global : PatFrag<(ops node:$val, node:$ptr),
+                            (AMDGPUstore_mskor node:$val, node:$ptr), [{
+  return dyn_cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+}]>;
+
 class Constants {
 int TWO_PI = 0x40c90fdb;
 int PI = 0x40490fdb;
 int TWO_PI_INV = 0x3e22f983;
-int FP_UINT_MAX_PLUS_1 = 0x4f800000;	// 1 << 32 in floating point encoding
+int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
 }
 def CONST : Constants;
 
@@ -108,6 +253,9 @@ def FP_ONE : PatLeaf <
   [{return N->isExactlyValue(1.0);}]
 >;
 
+def U24 : ComplexPattern<i32, 1, "SelectU24", [], []>;
+def I24 : ComplexPattern<i32, 1, "SelectI24", [], []>;
+
 let isCodeGenOnly = 1, isPseudo = 1 in {
 
 let usesCustomInserter = 1  in {
@@ -137,6 +285,8 @@ class FNEG <RegisterClass rc> : AMDGPUShaderInst <
 
 multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
                     ComplexPattern addrPat> {
+let UseNamedOperandTable = 1 in {
+
   def RegisterLoad : AMDGPUShaderInst <
     (outs dstClass:$dst),
     (ins addrClass:$addr, i32imm:$chan),
@@ -155,6 +305,7 @@ multiclass RegisterLoadStore <RegisterClass dstClass, Operand addrClass,
     let isRegisterStore = 1;
   }
 }
+}
 
 } // End isCodeGenOnly = 1, isPseudo = 1
 
@@ -186,61 +337,12 @@ class Insert_Element <ValueType elem_type, ValueType vec_type,
   (INSERT_SUBREG $vec, $elem, sub_reg)
 >;
 
-// Vector Build pattern
-class Vector1_Build <ValueType vecType, ValueType elemType,
-                     RegisterClass rc> : Pat <
-  (vecType (build_vector elemType:$src)),
-  (vecType (COPY_TO_REGCLASS $src, rc))
->;
-
-class Vector2_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1)),
-  (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1)
->;
-
 class Vector4_Build <ValueType vecType, ValueType elemType> : Pat <
   (vecType (build_vector elemType:$x, elemType:$y, elemType:$z, elemType:$w)),
   (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
     (vecType (IMPLICIT_DEF)), $x, sub0), $y, sub1), $z, sub2), $w, sub3)
 >;
 
-class Vector8_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                              $sub2, sub2), $sub3, sub3),
-                              $sub4, sub4), $sub5, sub5),
-                              $sub6, sub6), $sub7, sub7)
->;
-
-class Vector16_Build <ValueType vecType, ValueType elemType> : Pat <
-  (vecType (build_vector elemType:$sub0, elemType:$sub1,
-                         elemType:$sub2, elemType:$sub3,
-                         elemType:$sub4, elemType:$sub5,
-                         elemType:$sub6, elemType:$sub7,
-                         elemType:$sub8, elemType:$sub9,
-                         elemType:$sub10, elemType:$sub11,
-                         elemType:$sub12, elemType:$sub13,
-                         elemType:$sub14, elemType:$sub15)),
-  (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG
-    (vecType (IMPLICIT_DEF)), $sub0, sub0), $sub1, sub1),
-                            $sub2, sub2), $sub3, sub3),
-                            $sub4, sub4), $sub5, sub5),
-                            $sub6, sub6), $sub7, sub7),
-                            $sub8, sub8), $sub9, sub9),
-                            $sub10, sub10), $sub11, sub11),
-                            $sub12, sub12), $sub13, sub13),
-                            $sub14, sub14), $sub15, sub15)
->;
-
 // XXX: Convert to new syntax and use COPY_TO_REG, once the DFAPacketizer
 // can handle COPY instructions.
 // bitconvert pattern
@@ -295,6 +397,22 @@ class BFEPattern <Instruction BFE> : Pat <
   (BFE $x, $y, $z)
 >;
 
+// rotr pattern
+class ROTRPattern <Instruction BIT_ALIGN> : Pat <
+  (rotr i32:$src0, i32:$src1),
+  (BIT_ALIGN $src0, $src0, $src1)
+>;
+
+// 24-bit arithmetic patterns
+def umul24 : PatFrag <(ops node:$x, node:$y), (mul node:$x, node:$y)>;
+
+/*
+class UMUL24Pattern <Instruction UMUL24> : Pat <
+  (mul U24:$x, U24:$y),
+  (UMUL24 $x, $y)
+>;
+*/
+
 include "R600Instructions.td"
 
 include "SIInstrInfo.td"
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index eecb25b..9f975bf 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -50,6 +50,8 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
+
+  def int_AMDGPU_barrier_local  : Intrinsic<[], [], []>;
 }
 
 let TargetPrefix = "TGSI", isTarget = 1 in {
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 1dc1c65..0ed598e 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -15,14 +15,19 @@
 
 #include "AMDGPUMCInstLower.h"
 #include "AMDGPUAsmPrinter.h"
+#include "InstPrinter/AMDGPUInstPrinter.h"
 #include "R600InstrInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include <algorithm>
 
 using namespace llvm;
 
@@ -69,15 +74,45 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     MachineBasicBlock::const_instr_iterator I = MI;
     ++I;
     while (I != MBB->end() && I->isInsideBundle()) {
-      MCInst MCBundleInst;
-      const MachineInstr *BundledInst = I;
-      MCInstLowering.lower(BundledInst, MCBundleInst);
-      OutStreamer.EmitInstruction(MCBundleInst);
+      EmitInstruction(I);
       ++I;
     }
   } else {
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
     OutStreamer.EmitInstruction(TmpInst);
+
+    if (DisasmEnabled) {
+      // Disassemble instruction/operands to text.
+      DisasmLines.resize(DisasmLines.size() + 1);
+      std::string &DisasmLine = DisasmLines.back();
+      raw_string_ostream DisasmStream(DisasmLine);
+
+      AMDGPUInstPrinter InstPrinter(*TM.getMCAsmInfo(), *TM.getInstrInfo(),
+                                    *TM.getRegisterInfo());
+      InstPrinter.printInst(&TmpInst, DisasmStream, StringRef());
+
+      // Disassemble instruction/operands to hex representation.
+      SmallVector<MCFixup, 4> Fixups;
+      SmallVector<char, 16> CodeBytes;
+      raw_svector_ostream CodeStream(CodeBytes);
+
+      MCObjectStreamer &ObjStreamer = (MCObjectStreamer &)OutStreamer;
+      MCCodeEmitter &InstEmitter = ObjStreamer.getAssembler().getEmitter();
+      InstEmitter.EncodeInstruction(TmpInst, CodeStream, Fixups);
+      CodeStream.flush();
+
+      HexLines.resize(HexLines.size() + 1);
+      std::string &HexLine = HexLines.back();
+      raw_string_ostream HexStream(HexLine);
+
+      for (size_t i = 0; i < CodeBytes.size(); i += 4) {
+        unsigned int CodeDWord = *(unsigned int *)&CodeBytes[i];
+        HexStream << format("%s%08X", (i > 0 ? " " : ""), CodeDWord);
+      }
+
+      DisasmStream.flush();
+      DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLine.size());
+    }
   }
 }
diff --git a/lib/Target/R600/AMDGPUMachineFunction.cpp b/lib/Target/R600/AMDGPUMachineFunction.cpp
index 0461025..14171f4 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.cpp
+++ b/lib/Target/R600/AMDGPUMachineFunction.cpp
@@ -2,14 +2,17 @@
 #include "AMDGPU.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
+using namespace llvm;
 
-namespace llvm {
+static const char *const ShaderTypeAttribute = "ShaderType";
 
-const char *AMDGPUMachineFunction::ShaderTypeAttribute = "ShaderType";
+// Pin the vtable to this file.
+void AMDGPUMachineFunction::anchor() {}
 
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
     MachineFunctionInfo() {
   ShaderType = ShaderType::COMPUTE;
+  LDSSize = 0;
   AttributeSet Set = MF.getFunction()->getAttributes();
   Attribute A = Set.getAttribute(AttributeSet::FunctionIndex,
                                  ShaderTypeAttribute);
@@ -20,5 +23,3 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
       llvm_unreachable("Can't parse shader type!");
   }
 }
-
-}
diff --git a/lib/Target/R600/AMDGPUMachineFunction.h b/lib/Target/R600/AMDGPUMachineFunction.h
index 21c8c51..fea0b39 100644
--- a/lib/Target/R600/AMDGPUMachineFunction.h
+++ b/lib/Target/R600/AMDGPUMachineFunction.h
@@ -14,15 +14,20 @@
 #define AMDGPUMACHINEFUNCTION_H
 
 #include "llvm/CodeGen/MachineFunction.h"
+#include <map>
 
 namespace llvm {
 
 class AMDGPUMachineFunction : public MachineFunctionInfo {
-private:
-  static const char *ShaderTypeAttribute;
+  virtual void anchor();
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
   unsigned ShaderType;
+  /// A map to keep track of local memory objects and their offsets within
+  /// the local memory space.
+  std::map<const GlobalValue *, unsigned> LocalMemoryObjects;
+  /// Number of bytes in the LDS that are being used.
+  unsigned LDSSize;
 };
 
 }
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index fe994d2..47617a7 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -17,11 +17,9 @@
 
 using namespace llvm;
 
-AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm,
-    const TargetInstrInfo &tii)
+AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm)
 : AMDGPUGenRegisterInfo(0),
-  TM(tm),
-  TII(tii)
+  TM(tm)
   { }
 
 //===----------------------------------------------------------------------===//
@@ -48,27 +46,21 @@ unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return 0;
 }
 
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
+  static const unsigned SubRegs[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
+    AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
+    AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
+    AMDGPU::sub15
+  };
+
+  assert (Channel < array_lengthof(SubRegs));
+  return SubRegs[Channel];
+}
+
 unsigned AMDGPURegisterInfo::getIndirectSubReg(unsigned IndirectIndex) const {
 
-  switch(IndirectIndex) {
-  case 0: return AMDGPU::sub0;
-  case 1: return AMDGPU::sub1;
-  case 2: return AMDGPU::sub2;
-  case 3: return AMDGPU::sub3;
-  case 4: return AMDGPU::sub4;
-  case 5: return AMDGPU::sub5;
-  case 6: return AMDGPU::sub6;
-  case 7: return AMDGPU::sub7;
-  case 8: return AMDGPU::sub8;
-  case 9: return AMDGPU::sub9;
-  case 10: return AMDGPU::sub10;
-  case 11: return AMDGPU::sub11;
-  case 12: return AMDGPU::sub12;
-  case 13: return AMDGPU::sub13;
-  case 14: return AMDGPU::sub14;
-  case 15: return AMDGPU::sub15;
-  default: llvm_unreachable("indirect index out of range");
-  }
+  return getSubRegFromChannel(IndirectIndex);
 }
 
 #define GET_REGINFO_TARGET_DESC
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 1fc88e7..688e1a0 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -30,10 +30,9 @@ class TargetInstrInfo;
 
 struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   TargetMachine &TM;
-  const TargetInstrInfo &TII;
   static const uint16_t CalleeSavedReg;
 
-  AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii);
+  AMDGPURegisterInfo(TargetMachine &tm);
 
   virtual BitVector getReservedRegs(const MachineFunction &MF) const {
     assert(!"Unimplemented");  return BitVector();
@@ -51,6 +50,14 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
     assert(!"Unimplemented"); return NULL;
   }
 
+  virtual unsigned getHWRegIndex(unsigned Reg) const {
+    assert(!"Unimplemented"); return 0;
+  }
+
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
   const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td
index b5aca03..835a146 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.td
+++ b/lib/Target/R600/AMDGPURegisterInfo.td
@@ -14,7 +14,8 @@
 let Namespace = "AMDGPU" in {
 
 foreach Index = 0-15 in {
-  def sub#Index : SubRegIndex;
+  // Indices are used in a variety of ways here, so don't set a size/offset.
+  def sub#Index : SubRegIndex<-1, -1>;
 }
 
 def INDIRECT_BASE_ADDR : Register <"INDIRECT_BASE_ADDR">;
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index a7e1d7b..061793a 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -25,8 +25,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
   AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) {
     InstrItins = getInstrItineraryForCPU(CPU);
 
-  memset(CapsOverride, 0, sizeof(*CapsOverride)
-      * AMDGPUDeviceInfo::MaxNumberCapabilities);
   // Default card
   StringRef GPU = CPU;
   Is64bit = false;
@@ -34,21 +32,16 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
   DefaultSize[1] = 1;
   DefaultSize[2] = 1;
   HasVertexCache = false;
+  TexVTXClauseSize = 0;
+  Gen = AMDGPUSubtarget::R600;
+  FP64 = false;
+  CaymanISA = false;
+  EnableIRStructurizer = true;
+  EnableIfCvt = true;
   ParseSubtargetFeatures(GPU, FS);
   DevName = GPU;
-  Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit);
 }
 
-AMDGPUSubtarget::~AMDGPUSubtarget() {
-  delete Device;
-}
-
-bool
-AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const {
-  assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities &&
-      "Caps index is out of bounds!");
-  return CapsOverride[caps];
-}
 bool
 AMDGPUSubtarget::is64bit() const  {
   return Is64bit;
@@ -57,6 +50,30 @@ bool
 AMDGPUSubtarget::hasVertexCache() const {
   return HasVertexCache;
 }
+short
+AMDGPUSubtarget::getTexVTXClauseSize() const {
+  return TexVTXClauseSize;
+}
+enum AMDGPUSubtarget::Generation
+AMDGPUSubtarget::getGeneration() const {
+  return Gen;
+}
+bool
+AMDGPUSubtarget::hasHWFP64() const {
+  return FP64;
+}
+bool
+AMDGPUSubtarget::hasCaymanISA() const {
+  return CaymanISA;
+}
+bool
+AMDGPUSubtarget::IsIRStructurizerEnabled() const {
+  return EnableIRStructurizer;
+}
+bool
+AMDGPUSubtarget::isIfCvtEnabled() const {
+  return EnableIfCvt;
+}
 bool
 AMDGPUSubtarget::isTargetELF() const {
   return false;
@@ -72,21 +89,32 @@ AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
 
 std::string
 AMDGPUSubtarget::getDataLayout() const {
-    if (!Device) {
-        return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16"
-                "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
-                "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
-                "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
-                "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64");
-    }
-    return Device->getDataLayout();
+  std::string DataLayout = std::string(
+   "e"
+   "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+   "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
+   "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+   "-n32:64"
+  );
+
+  if (hasHWFP64()) {
+    DataLayout.append("-f64:64:64");
+  }
+
+  if (is64bit()) {
+    DataLayout.append("-p:64:64:64");
+  } else {
+    DataLayout.append("-p:32:32:32");
+  }
+
+  if (Gen >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    DataLayout.append("-p3:32:32:32");
+  }
+
+  return DataLayout;
 }
 
 std::string
 AMDGPUSubtarget::getDeviceName() const {
   return DevName;
 }
-const AMDGPUDevice *
-AMDGPUSubtarget::device() const {
-  return Device;
-}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index b6501a4..4288d27 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -14,7 +14,7 @@
 
 #ifndef AMDGPUSUBTARGET_H
 #define AMDGPUSUBTARGET_H
-#include "AMDILDevice.h"
+#include "AMDGPU.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -27,9 +27,17 @@
 namespace llvm {
 
 class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
+public:
+  enum Generation {
+    R600 = 0,
+    R700,
+    EVERGREEN,
+    NORTHERN_ISLANDS,
+    SOUTHERN_ISLANDS,
+    SEA_ISLANDS
+  };
+
 private:
-  bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities];
-  const AMDGPUDevice *Device;
   size_t DefaultSize[3];
   std::string DevName;
   bool Is64bit;
@@ -37,23 +45,36 @@ private:
   bool DumpCode;
   bool R600ALUInst;
   bool HasVertexCache;
+  short TexVTXClauseSize;
+  enum Generation Gen;
+  bool FP64;
+  bool CaymanISA;
+  bool EnableIRStructurizer;
+  bool EnableIfCvt;
 
   InstrItineraryData InstrItins;
 
 public:
   AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
-  virtual ~AMDGPUSubtarget();
 
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
   virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  bool isOverride(AMDGPUDeviceInfo::Caps) const;
   bool is64bit() const;
   bool hasVertexCache() const;
+  short getTexVTXClauseSize() const;
+  enum Generation getGeneration() const;
+  bool hasHWFP64() const;
+  bool hasCaymanISA() const;
+  bool IsIRStructurizerEnabled() const;
+  bool isIfCvtEnabled() const;
+
+  virtual bool enableMachineScheduler() const {
+    return getGeneration() <= NORTHERN_ISLANDS;
+  }
 
   // Helper functions to simplify if statements
   bool isTargetELF() const;
-  const AMDGPUDevice* device() const;
   std::string getDataLayout() const;
   std::string getDeviceName() const;
   virtual size_t getDefaultSize(uint32_t dim) const;
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 31fbf32..bc4f5d7 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include <llvm/CodeGen/Passes.h>
 
+
 using namespace llvm;
 
 extern "C" void LLVMInitializeR600Target() {
@@ -59,17 +60,19 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT,
   Subtarget(TT, CPU, FS),
   Layout(Subtarget.getDataLayout()),
   FrameLowering(TargetFrameLowering::StackGrowsUp,
-      Subtarget.device()->getStackAlignment(), 0),
+                64 * 16 // Maximum stack alignment (long16)
+               , 0),
   IntrinsicInfo(this),
   InstrItins(&Subtarget.getInstrItineraryData()) {
   // TLInfo uses InstrInfo so it must be initialized after.
-  if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
-    InstrInfo = new R600InstrInfo(*this);
-    TLInfo = new R600TargetLowering(*this);
+  if (Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    InstrInfo.reset(new R600InstrInfo(*this));
+    TLInfo.reset(new R600TargetLowering(*this));
   } else {
-    InstrInfo = new SIInstrInfo(*this);
-    TLInfo = new SITargetLowering(*this);
+    InstrInfo.reset(new SIInstrInfo(*this));
+    TLInfo.reset(new SITargetLowering(*this));
   }
+  initAsmInfo();
 }
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() {
@@ -79,18 +82,20 @@ namespace {
 class AMDGPUPassConfig : public TargetPassConfig {
 public:
   AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {
-    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-    if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
-      enablePass(&MachineSchedulerID);
-      MachineSchedRegistry::setDefault(createR600MachineScheduler);
-    }
-  }
+    : TargetPassConfig(TM, PM) {}
 
   AMDGPUTargetMachine &getAMDGPUTargetMachine() const {
     return getTM<AMDGPUTargetMachine>();
   }
 
+  virtual ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const {
+    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+      return createR600MachineScheduler(C);
+    return 0;
+  }
+
   virtual bool addPreISel();
   virtual bool addInstSelector();
   virtual bool addPreRegAlloc();
@@ -104,53 +109,76 @@ TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new AMDGPUPassConfig(this, PM);
 }
 
+//===----------------------------------------------------------------------===//
+// AMDGPU Analysis Pass Setup
+//===----------------------------------------------------------------------===//
+
+void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our AMDGPU pass. This
+  // allows the AMDGPU pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createAMDGPUTargetTransformInfoPass(this));
+}
+
 bool
 AMDGPUPassConfig::addPreISel() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
-    addPass(createAMDGPUStructurizeCFGPass());
+  addPass(createFlattenCFGPass());
+  if (ST.IsIRStructurizerEnabled())
+    addPass(createStructurizeCFGPass());
+  if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    addPass(createSinkingPass());
+    addPass(createSITypeRewriter());
     addPass(createSIAnnotateControlFlowPass());
+  } else {
+    addPass(createR600TextureIntrinsicsReplacer());
   }
   return false;
 }
 
 bool AMDGPUPassConfig::addInstSelector() {
   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
-
-  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
-    // This callbacks this pass uses are not implemented yet on SI.
-    addPass(createAMDGPUIndirectAddressingPass(*TM));
-  }
   return false;
 }
 
 bool AMDGPUPassConfig::addPreRegAlloc() {
   addPass(createAMDGPUConvertToISAPass(*TM));
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
+
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    addPass(createR600VectorRegMerger(*TM));
+  } else {
+    addPass(createSIFixSGPRCopiesPass(*TM));
+  }
   return false;
 }
 
 bool AMDGPUPassConfig::addPostRegAlloc() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
 
-  if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
+  if (ST.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
     addPass(createSIInsertWaits(*TM));
   }
   return false;
 }
 
 bool AMDGPUPassConfig::addPreSched2() {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
 
-  addPass(&IfConverterID);
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    addPass(createR600EmitClauseMarkers(*TM));
+  if (ST.isIfCvtEnabled())
+    addPass(&IfConverterID);
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
+    addPass(createR600ClauseMergePass(*TM));
   return false;
 }
 
 bool AMDGPUPassConfig::addPreEmitPass() {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
-  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
-    addPass(createAMDGPUCFGPreparationPass(*TM));
+  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
     addPass(createAMDGPUCFGStructurizerPass(*TM));
-    addPass(createR600EmitClauseMarkers(*TM));
     addPass(createR600ExpandSpecialInstrsPass(*TM));
     addPass(&FinalizeMachineBundlesID);
     addPass(createR600Packetizer(*TM));
@@ -161,4 +189,3 @@ bool AMDGPUPassConfig::addPreEmitPass() {
 
   return false;
 }
-
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index 2afe787..f942614 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -25,44 +25,45 @@
 
 namespace llvm {
 
-MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT);
-
 class AMDGPUTargetMachine : public LLVMTargetMachine {
 
   AMDGPUSubtarget Subtarget;
   const DataLayout Layout;
   AMDGPUFrameLowering FrameLowering;
   AMDGPUIntrinsicInfo IntrinsicInfo;
-  const AMDGPUInstrInfo * InstrInfo;
-  AMDGPUTargetLowering * TLInfo;
-  const InstrItineraryData* InstrItins;
+  OwningPtr<AMDGPUInstrInfo> InstrInfo;
+  OwningPtr<AMDGPUTargetLowering> TLInfo;
+  const InstrItineraryData *InstrItins;
 
 public:
-   AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
-                       StringRef CPU,
-                       TargetOptions Options,
-                       Reloc::Model RM, CodeModel::Model CM,
-                       CodeGenOpt::Level OL);
-   ~AMDGPUTargetMachine();
-   virtual const AMDGPUFrameLowering* getFrameLowering() const {
-     return &FrameLowering;
-   }
-   virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const {
-     return &IntrinsicInfo;
-   }
-   virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;}
-   virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; }
-   virtual const AMDGPURegisterInfo *getRegisterInfo() const {
-      return &InstrInfo->getRegisterInfo();
-   }
-   virtual AMDGPUTargetLowering * getTargetLowering() const {
-      return TLInfo;
-   }
-   virtual const InstrItineraryData* getInstrItineraryData() const {
-      return InstrItins;
-   }
-   virtual const DataLayout* getDataLayout() const { return &Layout; }
-   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef FS,
+                      StringRef CPU, TargetOptions Options, Reloc::Model RM,
+                      CodeModel::Model CM, CodeGenOpt::Level OL);
+  ~AMDGPUTargetMachine();
+  virtual const AMDGPUFrameLowering *getFrameLowering() const {
+    return &FrameLowering;
+  }
+  virtual const AMDGPUIntrinsicInfo *getIntrinsicInfo() const {
+    return &IntrinsicInfo;
+  }
+  virtual const AMDGPUInstrInfo *getInstrInfo() const {
+    return InstrInfo.get();
+  }
+  virtual const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  virtual const AMDGPURegisterInfo *getRegisterInfo() const {
+    return &InstrInfo->getRegisterInfo();
+  }
+  virtual AMDGPUTargetLowering *getTargetLowering() const {
+    return TLInfo.get();
+  }
+  virtual const InstrItineraryData *getInstrItineraryData() const {
+    return InstrItins;
+  }
+  virtual const DataLayout *getDataLayout() const { return &Layout; }
+  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+  /// \brief Register R600 analysis passes with a pass manager.
+  virtual void addAnalysisPasses(PassManagerBase &PM);
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
new file mode 100644
index 0000000..8db319c
--- /dev/null
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -0,0 +1,90 @@
+//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// This file implements a TargetTransformInfo analysis pass specific to the
+// AMDGPU target machine. It uses the target's detailed information to provide
+// more precise answers to certain TTI queries, while letting the target
+// independent and default TTI implementations handle the rest.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "AMDGPUtti"
+#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeAMDGPUTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class AMDGPUTTI : public ImmutablePass, public TargetTransformInfo {
+  const AMDGPUTargetMachine *TM;
+  const AMDGPUSubtarget *ST;
+  const AMDGPUTargetLowering *TLI;
+
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+  AMDGPUTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  AMDGPUTTI(const AMDGPUTargetMachine *TM)
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+        TLI(TM->getTargetLowering()) {
+    initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() { pushTTIStack(this); }
+
+  virtual void finalizePass() { popTTIStack(); }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  /// Pass identification.
+  static char ID;
+
+  /// Provide necessary pointer adjustments for the two base classes.
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo *)this;
+    return this;
+  }
+
+  virtual bool hasBranchDivergence() const;
+
+  /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti",
+                   "AMDGPU Target Transform Info", true, true, false)
+char AMDGPUTTI::ID = 0;
+
+ImmutablePass *
+llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) {
+  return new AMDGPUTTI(TM);
+}
+
+bool AMDGPUTTI::hasBranchDivergence() const { return true; }
diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h
deleted file mode 100644
index 39ab664..0000000
--- a/lib/Target/R600/AMDIL.h
+++ /dev/null
@@ -1,121 +0,0 @@
-//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// This file contains the entry points for global functions defined in the LLVM
-/// AMDGPU back-end.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef AMDIL_H
-#define AMDIL_H
-
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Target/TargetMachine.h"
-
-#define ARENA_SEGMENT_RESERVED_UAVS 12
-#define DEFAULT_ARENA_UAV_ID 8
-#define DEFAULT_RAW_UAV_ID 7
-#define GLOBAL_RETURN_RAW_UAV_ID 11
-#define HW_MAX_NUM_CB 8
-#define MAX_NUM_UNIQUE_UAVS 8
-#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8
-#define OPENCL_MAX_READ_IMAGES 128
-#define OPENCL_MAX_WRITE_IMAGES 8
-#define OPENCL_MAX_SAMPLERS 16
-
-// The next two values can never be zero, as zero is the ID that is
-// used to assert against.
-#define DEFAULT_LDS_ID     1
-#define DEFAULT_GDS_ID     1
-#define DEFAULT_SCRATCH_ID 1
-#define DEFAULT_VEC_SLOTS  8
-
-#define OCL_DEVICE_RV710        0x0001
-#define OCL_DEVICE_RV730        0x0002
-#define OCL_DEVICE_RV770        0x0004
-#define OCL_DEVICE_CEDAR        0x0008
-#define OCL_DEVICE_REDWOOD      0x0010
-#define OCL_DEVICE_JUNIPER      0x0020
-#define OCL_DEVICE_CYPRESS      0x0040
-#define OCL_DEVICE_CAICOS       0x0080
-#define OCL_DEVICE_TURKS        0x0100
-#define OCL_DEVICE_BARTS        0x0200
-#define OCL_DEVICE_CAYMAN       0x0400
-#define OCL_DEVICE_ALL          0x3FFF
-
-/// The number of function ID's that are reserved for 
-/// internal compiler usage.
-const unsigned int RESERVED_FUNCS = 1024;
-
-namespace llvm {
-class AMDGPUInstrPrinter;
-class FunctionPass;
-class MCAsmInfo;
-class raw_ostream;
-class Target;
-class TargetMachine;
-
-// Instruction selection passes.
-FunctionPass*
-  createAMDGPUISelDag(TargetMachine &TM);
-FunctionPass*
-  createAMDGPUPeepholeOpt(TargetMachine &TM);
-
-// Pre emit passes.
-FunctionPass*
-  createAMDGPUCFGPreparationPass(TargetMachine &TM);
-FunctionPass*
-  createAMDGPUCFGStructurizerPass(TargetMachine &TM);
-
-extern Target TheAMDGPUTarget;
-} // end namespace llvm;
-
-// Include device information enumerations
-#include "AMDILDeviceInfo.h"
-
-namespace llvm {
-/// OpenCL uses address spaces to differentiate between
-/// various memory regions on the hardware. On the CPU
-/// all of the address spaces point to the same memory,
-/// however on the GPU, each address space points to
-/// a seperate piece of memory that is unique from other
-/// memory locations.
-namespace AMDGPUAS {
-enum AddressSpaces {
-  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
-  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
-  CONSTANT_ADDRESS = 2, ///< Address space for constant memory
-  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
-  REGION_ADDRESS   = 4, ///< Address space for region memory.
-  ADDRESS_NONE     = 5, ///< Address space for unknown memory.
-  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
-  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
-  CONSTANT_BUFFER_0 = 8,
-  CONSTANT_BUFFER_1 = 9,
-  CONSTANT_BUFFER_2 = 10,
-  CONSTANT_BUFFER_3 = 11,
-  CONSTANT_BUFFER_4 = 12,
-  CONSTANT_BUFFER_5 = 13,
-  CONSTANT_BUFFER_6 = 14,
-  CONSTANT_BUFFER_7 = 15,
-  CONSTANT_BUFFER_8 = 16,
-  CONSTANT_BUFFER_9 = 17,
-  CONSTANT_BUFFER_10 = 18,
-  CONSTANT_BUFFER_11 = 19,
-  CONSTANT_BUFFER_12 = 20,
-  CONSTANT_BUFFER_13 = 21,
-  CONSTANT_BUFFER_14 = 22,
-  CONSTANT_BUFFER_15 = 23,
-  LAST_ADDRESS     = 24
-};
-
-} // namespace AMDGPUAS
-
-} // end namespace llvm
-#endif // AMDIL_H
diff --git a/lib/Target/R600/AMDIL7XXDevice.cpp b/lib/Target/R600/AMDIL7XXDevice.cpp
deleted file mode 100644
index ea6ac34..0000000
--- a/lib/Target/R600/AMDIL7XXDevice.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDIL7XXDevice.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILDevice.h"
-
-using namespace llvm;
-
-AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) {
-  setCaps();
-  std::string name = mSTM->getDeviceName();
-  if (name == "rv710") {
-    DeviceFlag = OCL_DEVICE_RV710;
-  } else if (name == "rv730") {
-    DeviceFlag = OCL_DEVICE_RV730;
-  } else {
-    DeviceFlag = OCL_DEVICE_RV770;
-  }
-}
-
-AMDGPU7XXDevice::~AMDGPU7XXDevice() {
-}
-
-void AMDGPU7XXDevice::setCaps() {
-  mSWBits.set(AMDGPUDeviceInfo::LocalMem);
-}
-
-size_t AMDGPU7XXDevice::getMaxLDSSize() const {
-  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
-    return MAX_LDS_SIZE_700;
-  }
-  return 0;
-}
-
-size_t AMDGPU7XXDevice::getWavefrontSize() const {
-  return AMDGPUDevice::HalfWavefrontSize;
-}
-
-uint32_t AMDGPU7XXDevice::getGeneration() const {
-  return AMDGPUDeviceInfo::HD4XXX;
-}
-
-uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const {
-  switch (DeviceID) {
-  default:
-    assert(0 && "ID type passed in is unknown!");
-    break;
-  case GLOBAL_ID:
-  case CONSTANT_ID:
-  case RAW_UAV_ID:
-  case ARENA_UAV_ID:
-    break;
-  case LDS_ID:
-    if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
-      return DEFAULT_LDS_ID;
-    }
-    break;
-  case SCRATCH_ID:
-    if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
-      return DEFAULT_SCRATCH_ID;
-    }
-    break;
-  case GDS_ID:
-    assert(0 && "GDS UAV ID is not supported on this chip");
-    if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
-      return DEFAULT_GDS_ID;
-    }
-    break;
-  };
-
-  return 0;
-}
-
-uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const {
-  return 1;
-}
-
-AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) {
-  setCaps();
-}
-
-AMDGPU770Device::~AMDGPU770Device() {
-}
-
-void AMDGPU770Device::setCaps() {
-  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
-    mSWBits.set(AMDGPUDeviceInfo::FMA);
-    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
-  }
-  mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
-  mHWBits.reset(AMDGPUDeviceInfo::LongOps);
-  mSWBits.set(AMDGPUDeviceInfo::LongOps);
-  mSWBits.set(AMDGPUDeviceInfo::LocalMem);
-}
-
-size_t AMDGPU770Device::getWavefrontSize() const {
-  return AMDGPUDevice::WavefrontSize;
-}
-
-AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) {
-}
-
-AMDGPU710Device::~AMDGPU710Device() {
-}
-
-size_t AMDGPU710Device::getWavefrontSize() const {
-  return AMDGPUDevice::QuarterWavefrontSize;
-}
diff --git a/lib/Target/R600/AMDIL7XXDevice.h b/lib/Target/R600/AMDIL7XXDevice.h
deleted file mode 100644
index 1cf4ca4..0000000
--- a/lib/Target/R600/AMDIL7XXDevice.h
+++ /dev/null
@@ -1,72 +0,0 @@
-//==-- AMDIL7XXDevice.h - Define 7XX Device Device for AMDIL ---*- C++ -*--===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===----------------------------------------------------------------------===//
-#ifndef AMDIL7XXDEVICEIMPL_H
-#define AMDIL7XXDEVICEIMPL_H
-#include "AMDILDevice.h"
-
-namespace llvm {
-class AMDGPUSubtarget;
-
-//===----------------------------------------------------------------------===//
-// 7XX generation of devices and their respective sub classes
-//===----------------------------------------------------------------------===//
-
-/// \brief The AMDGPU7XXDevice class represents the generic 7XX device.
-///
-/// All 7XX devices are derived from this class. The AMDGPU7XX device will only
-/// support the minimal features that are required to be considered OpenCL 1.0
-/// compliant and nothing more.
-class AMDGPU7XXDevice : public AMDGPUDevice {
-public:
-  AMDGPU7XXDevice(AMDGPUSubtarget *ST);
-  virtual ~AMDGPU7XXDevice();
-  virtual size_t getMaxLDSSize() const;
-  virtual size_t getWavefrontSize() const;
-  virtual uint32_t getGeneration() const;
-  virtual uint32_t getResourceID(uint32_t DeviceID) const;
-  virtual uint32_t getMaxNumUAVs() const;
-
-protected:
-  virtual void setCaps();
-};
-
-/// \brief The AMDGPU770Device class represents the RV770 chip and it's
-/// derivative cards.
-///
-/// The difference between this device and the base class is this device device
-/// adds support for double precision and has a larger wavefront size.
-class AMDGPU770Device : public AMDGPU7XXDevice {
-public:
-  AMDGPU770Device(AMDGPUSubtarget *ST);
-  virtual ~AMDGPU770Device();
-  virtual size_t getWavefrontSize() const;
-private:
-  virtual void setCaps();
-};
-
-/// \brief The AMDGPU710Device class derives from the 7XX base class.
-///
-/// This class is a smaller derivative, so we need to overload some of the
-/// functions in order to correctly specify this information.
-class AMDGPU710Device : public AMDGPU7XXDevice {
-public:
-  AMDGPU710Device(AMDGPUSubtarget *ST);
-  virtual ~AMDGPU710Device();
-  virtual size_t getWavefrontSize() const;
-};
-
-} // namespace llvm
-#endif // AMDILDEVICEIMPL_H
diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td
index e221110..5dcd478 100644
--- a/lib/Target/R600/AMDILBase.td
+++ b/lib/Target/R600/AMDILBase.td
@@ -16,70 +16,6 @@ def ALU_NULL : FuncUnit;
 def NullALU : InstrItinClass;
 
 //===----------------------------------------------------------------------===//
-// AMDIL Subtarget features.
-//===----------------------------------------------------------------------===//
-def FeatureFP64     : SubtargetFeature<"fp64",
-        "CapsOverride[AMDGPUDeviceInfo::DoubleOps]",
-        "true",
-        "Enable 64bit double precision operations">;
-def FeatureByteAddress    : SubtargetFeature<"byte_addressable_store",
-        "CapsOverride[AMDGPUDeviceInfo::ByteStores]",
-        "true",
-        "Enable byte addressable stores">;
-def FeatureBarrierDetect : SubtargetFeature<"barrier_detect",
-        "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]",
-        "true",
-        "Enable duplicate barrier detection(HD5XXX or later).">;
-def FeatureImages : SubtargetFeature<"images",
-        "CapsOverride[AMDGPUDeviceInfo::Images]",
-        "true",
-        "Enable image functions">;
-def FeatureMultiUAV : SubtargetFeature<"multi_uav",
-        "CapsOverride[AMDGPUDeviceInfo::MultiUAV]",
-        "true",
-        "Generate multiple UAV code(HD5XXX family or later)">;
-def FeatureMacroDB : SubtargetFeature<"macrodb",
-        "CapsOverride[AMDGPUDeviceInfo::MacroDB]",
-        "true",
-        "Use internal macrodb, instead of macrodb in driver">;
-def FeatureNoAlias : SubtargetFeature<"noalias",
-        "CapsOverride[AMDGPUDeviceInfo::NoAlias]",
-        "true",
-        "assert that all kernel argument pointers are not aliased">;
-def FeatureNoInline : SubtargetFeature<"no-inline",
-        "CapsOverride[AMDGPUDeviceInfo::NoInline]",
-        "true",
-        "specify whether to not inline functions">;
-
-def Feature64BitPtr : SubtargetFeature<"64BitPtr",
-        "Is64bit",
-        "false",
-        "Specify if 64bit addressing should be used.">;
-
-def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr",
-        "Is32on64bit",
-        "false",
-        "Specify if 64bit sized pointers with 32bit addressing should be used.">;
-def FeatureDebug : SubtargetFeature<"debug",
-        "CapsOverride[AMDGPUDeviceInfo::Debug]",
-        "true",
-        "Debug mode is enabled, so disable hardware accelerated address spaces.">;
-def FeatureDumpCode : SubtargetFeature <"DumpCode",
-        "DumpCode",
-        "true",
-        "Dump MachineInstrs in the CodeEmitter">;
-
-def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst",
-        "R600ALUInst",
-        "false",
-        "Older version of ALU instructions encoding.">;
-
-def FeatureVertexCache : SubtargetFeature<"HasVertexCache",
-        "HasVertexCache",
-        "true",
-        "Specify use of dedicated vertex cache.">;
-
-//===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
 //===----------------------------------------------------------------------===//
 
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index b0cd0f9..507570f 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -8,14 +8,17 @@
 /// \file
 //==-----------------------------------------------------------------------===//
 
-#define DEBUGME 0
 #define DEBUG_TYPE "structcfg"
 
+#include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
-#include "AMDIL.h"
+#include "R600InstrInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/Analysis/DominatorInternals.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -28,9 +31,12 @@
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
 
+#define DEFAULT_VEC_SLOTS 8
+
 // TODO: move-begin.
 
 //===----------------------------------------------------------------------===//
@@ -43,12 +49,8 @@ STATISTIC(numSerialPatternMatch,    "CFGStructurizer number of serial pattern "
     "matched");
 STATISTIC(numIfPatternMatch,        "CFGStructurizer number of if pattern "
     "matched");
-STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break "
-    "pattern matched");
 STATISTIC(numLoopcontPatternMatch,  "CFGStructurizer number of loop-continue "
     "pattern matched");
-STATISTIC(numLoopPatternMatch,      "CFGStructurizer number of loop pattern "
-    "matched");
 STATISTIC(numClonedBlock,           "CFGStructurizer cloned blocks");
 STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
 
@@ -57,39 +59,29 @@ STATISTIC(numClonedInstr,           "CFGStructurizer cloned instructions");
 // Miscellaneous utility for CFGStructurizer.
 //
 //===----------------------------------------------------------------------===//
-namespace llvmCFGStruct {
+namespace {
 #define SHOWNEWINSTR(i) \
-  if (DEBUGME) errs() << "New instr: " << *i << "\n"
+  DEBUG(dbgs() << "New instr: " << *i << "\n");
 
 #define SHOWNEWBLK(b, msg) \
-if (DEBUGME) { \
-  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
-  errs() << "\n"; \
-}
+DEBUG( \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  dbgs() << "\n"; \
+);
 
 #define SHOWBLK_DETAIL(b, msg) \
-if (DEBUGME) { \
+DEBUG( \
   if (b) { \
-  errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
-  b->print(errs()); \
-  errs() << "\n"; \
+  dbgs() << msg << "BB" << b->getNumber() << "size " << b->size(); \
+  b->print(dbgs()); \
+  dbgs() << "\n"; \
   } \
-}
+);
 
 #define INVALIDSCCNUM -1
-#define INVALIDREGNUM 0
-
-template<class LoopinfoT>
-void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) {
-  for (typename LoopinfoT::iterator iter = LoopInfo.begin(),
-       iterEnd = LoopInfo.end();
-       iter != iterEnd; ++iter) {
-    (*iter)->print(OS, 0);
-  }
-}
 
 template<class NodeT>
-void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
+void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
   size_t sz = Src.size();
   for (size_t i = 0; i < sz/2; ++i) {
     NodeT *t = Src[i];
@@ -98,7 +90,7 @@ void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
   }
 }
 
-} //end namespace llvmCFGStruct
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 //
@@ -106,43 +98,17 @@ void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) {
 //
 //===----------------------------------------------------------------------===//
 
-namespace llvmCFGStruct {
-template<class PassT>
-struct CFGStructTraits {
-};
 
-template <class InstrT>
-class BlockInformation {
-public:
-  bool isRetired;
-  int  sccNum;
-  //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr;
-  //Instructions defining the corresponding successor.
-  BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {}
-};
+namespace {
 
-template <class BlockT, class InstrT, class RegiT>
-class LandInformation {
+class BlockInformation {
 public:
-  BlockT *landBlk;
-  std::set<RegiT> breakInitRegs;  //Registers that need to "reg = 0", before
-                                  //WHILELOOP(thisloop) init before entering
-                                  //thisloop.
-  std::set<RegiT> contInitRegs;   //Registers that need to "reg = 0", after
-                                  //WHILELOOP(thisloop) init after entering
-                                  //thisloop.
-  std::set<RegiT> endbranchInitRegs; //Init before entering this loop, at loop
-                                     //land block, branch cond on this reg.
-  std::set<RegiT> breakOnRegs;       //registers that need to "if (reg) break
-                                     //endif" after ENDLOOP(thisloop) break
-                                     //outerLoopOf(thisLoop).
-  std::set<RegiT> contOnRegs;       //registers that need to "if (reg) continue
-                                    //endif" after ENDLOOP(thisloop) continue on
-                                    //outerLoopOf(thisLoop).
-  LandInformation() : landBlk(NULL) {}
+  bool IsRetired;
+  int  SccNum;
+  BlockInformation() : IsRetired(false), SccNum(INVALIDSCCNUM) {}
 };
 
-} //end of namespace llvmCFGStruct
+} // end anonymous namespace
 
 //===----------------------------------------------------------------------===//
 //
@@ -150,1026 +116,1213 @@ public:
 //
 //===----------------------------------------------------------------------===//
 
-namespace llvmCFGStruct {
-// bixia TODO: port it to BasicBlock, not just MachineBasicBlock.
-template<class PassT>
-class  CFGStructurizer {
+namespace {
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
 public:
-  typedef enum {
+  typedef SmallVector<MachineBasicBlock *, 32> MBBVector;
+  typedef std::map<MachineBasicBlock *, BlockInformation *> MBBInfoMap;
+  typedef std::map<MachineLoop *, MachineBasicBlock *> LoopLandInfoMap;
+
+  enum PathToKind {
     Not_SinglePath = 0,
     SinglePath_InPath = 1,
     SinglePath_NotInPath = 2
-  } PathToKind;
+  };
 
-public:
-  typedef typename PassT::InstructionType         InstrT;
-  typedef typename PassT::FunctionType            FuncT;
-  typedef typename PassT::DominatortreeType       DomTreeT;
-  typedef typename PassT::PostDominatortreeType   PostDomTreeT;
-  typedef typename PassT::DomTreeNodeType         DomTreeNodeT;
-  typedef typename PassT::LoopinfoType            LoopInfoT;
-
-  typedef GraphTraits<FuncT *>                    FuncGTraits;
-  //typedef FuncGTraits::nodes_iterator BlockIterator;
-  typedef typename FuncT::iterator                BlockIterator;
-
-  typedef typename FuncGTraits::NodeType          BlockT;
-  typedef GraphTraits<BlockT *>                   BlockGTraits;
-  typedef GraphTraits<Inverse<BlockT *> >         InvBlockGTraits;
-  //typedef BlockGTraits::succ_iterator InstructionIterator;
-  typedef typename BlockT::iterator               InstrIterator;
-
-  typedef CFGStructTraits<PassT>                  CFGTraits;
-  typedef BlockInformation<InstrT>                BlockInfo;
-  typedef std::map<BlockT *, BlockInfo *>         BlockInfoMap;
-
-  typedef int                                     RegiT;
-  typedef typename PassT::LoopType                LoopT;
-  typedef LandInformation<BlockT, InstrT, RegiT>  LoopLandInfo;
-        typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap;
-        //landing info for loop break
-  typedef SmallVector<BlockT *, 32>               BlockTSmallerVector;
+  static char ID;
 
-public:
-  CFGStructurizer();
-  ~CFGStructurizer();
+  AMDGPUCFGStructurizer(TargetMachine &tm) :
+      MachineFunctionPass(ID), TM(tm),
+      TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
+      TRI(&TII->getRegisterInfo()) { }
+
+   const char *getPassName() const {
+    return "AMD IL Control Flow Graph structurizer Pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addPreserved<MachineFunctionAnalysis>();
+    AU.addRequired<MachineFunctionAnalysis>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+  }
 
   /// Perform the CFG structurization
-  bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+  bool run();
 
   /// Perform the CFG preparation
-  bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri);
+  /// This step will remove every unconditionnal/dead jump instructions and make
+  /// sure all loops have an exit block
+  bool prepare();
+
+  bool runOnMachineFunction(MachineFunction &MF) {
+    DEBUG(MF.dump(););
+    OrderedBlks.clear();
+    FuncRep = &MF;
+    MLI = &getAnalysis<MachineLoopInfo>();
+    DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
+    MDT = &getAnalysis<MachineDominatorTree>();
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
+    PDT = &getAnalysis<MachinePostDominatorTree>();
+    DEBUG(PDT->print(dbgs()););
+    prepare();
+    run();
+    DEBUG(MF.dump(););
+    return true;
+  }
 
-private:
-  void reversePredicateSetter(typename BlockT::iterator);
-  void   orderBlocks();
-  void   printOrderedBlocks(llvm::raw_ostream &OS);
-  int patternMatch(BlockT *CurBlock);
-  int patternMatchGroup(BlockT *CurBlock);
-
-  int serialPatternMatch(BlockT *CurBlock);
-  int ifPatternMatch(BlockT *CurBlock);
-  int switchPatternMatch(BlockT *CurBlock);
-  int loopendPatternMatch(BlockT *CurBlock);
-  int loopPatternMatch(BlockT *CurBlock);
-
-  int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
-  int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader);
-  //int loopWithoutBreak(BlockT *);
-
-  void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop,
-                        BlockT *ExitBlock, LoopT *exitLoop, BlockT *landBlock);
-  void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop,
-                           BlockT *ContBlock, LoopT *contLoop);
-  bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block);
-  int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                       BlockT *FalseBlock);
-  int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock,
-                          BlockT *FalseBlock);
-  int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                              BlockT *FalseBlock, BlockT **LandBlockPtr);
-  void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock,
-                                   BlockT *FalseBlock, BlockT *LandBlock,
-                                   bool Detail = false);
-  PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock,
-                          bool AllowSideEntry = true);
-  BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock,
-                        bool AllowSideEntry = true);
-  int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock);
-  void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock);
-
-  void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock,
-                            BlockT *TrueBlock, BlockT *FalseBlock,
-                            BlockT *LandBlock);
-  void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand);
-  void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock,
-                           BlockT *ExitLandBlock, RegiT SetReg);
-  void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock,
-                           RegiT SetReg);
-  BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep,
-                                std::set<BlockT*> &ExitBlockSet,
-                                BlockT *ExitLandBlk);
-  BlockT *addLoopEndbranchBlock(LoopT *LoopRep,
-                                BlockTSmallerVector &ExitingBlocks,
-                                BlockTSmallerVector &ExitBlocks);
-  BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep);
-  void removeUnconditionalBranch(BlockT *SrcBlock);
-  void removeRedundantConditionalBranch(BlockT *SrcBlock);
-  void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks);
-
-  void removeSuccessor(BlockT *SrcBlock);
-  BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock);
-  BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock);
-
-  void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock,
-                          InstrIterator InsertPos);
-
-  void recordSccnum(BlockT *SrcBlock, int SCCNum);
-  int getSCCNum(BlockT *srcBlk);
-
-  void retireBlock(BlockT *DstBlock, BlockT *SrcBlock);
-  bool isRetiredBlock(BlockT *SrcBlock);
-  bool isActiveLoophead(BlockT *CurBlock);
-  bool needMigrateBlock(BlockT *Block);
-
-  BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock,
-                              BlockTSmallerVector &exitBlocks,
-                              std::set<BlockT*> &ExitBlockSet);
-  void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL);
-  BlockT *getLoopLandBlock(LoopT *LoopRep);
-  LoopLandInfo *getLoopLandInfo(LoopT *LoopRep);
-
-  void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum);
-  void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum);
-
-  bool hasBackEdge(BlockT *curBlock);
-  unsigned getLoopDepth  (LoopT *LoopRep);
-  int countActiveBlock(
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart,
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd);
-    BlockT *findNearestCommonPostDom(std::set<BlockT *>&);
-  BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2);
+protected:
+  TargetMachine &TM;
+  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *PDT;
+  MachineLoopInfo *MLI;
+  const R600InstrInfo *TII;
+  const AMDGPURegisterInfo *TRI;
+
+  // PRINT FUNCTIONS
+  /// Print the ordered Blocks.
+  void printOrderedBlocks() const {
+    size_t i = 0;
+    for (MBBVector::const_iterator iterBlk = OrderedBlks.begin(),
+        iterBlkEnd = OrderedBlks.end(); iterBlk != iterBlkEnd; ++iterBlk, ++i) {
+      dbgs() << "BB" << (*iterBlk)->getNumber();
+      dbgs() << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+      if (i != 0 && i % 10 == 0) {
+        dbgs() << "\n";
+      } else {
+        dbgs() << " ";
+      }
+    }
+  }
+  static void PrintLoopinfo(const MachineLoopInfo &LoopInfo) {
+    for (MachineLoop::iterator iter = LoopInfo.begin(),
+         iterEnd = LoopInfo.end(); iter != iterEnd; ++iter) {
+      (*iter)->print(dbgs(), 0);
+    }
+  }
+
+  // UTILITY FUNCTIONS
+  int getSCCNum(MachineBasicBlock *MBB) const;
+  MachineBasicBlock *getLoopLandInfo(MachineLoop *LoopRep) const;
+  bool hasBackEdge(MachineBasicBlock *MBB) const;
+  static unsigned getLoopDepth(MachineLoop *LoopRep);
+  bool isRetiredBlock(MachineBasicBlock *MBB) const;
+  bool isActiveLoophead(MachineBasicBlock *MBB) const;
+  PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+      bool AllowSideEntry = true) const;
+  int countActiveBlock(MBBVector::const_iterator It,
+      MBBVector::const_iterator E) const;
+  bool needMigrateBlock(MachineBasicBlock *MBB) const;
+
+  // Utility Functions
+  void reversePredicateSetter(MachineBasicBlock::iterator I);
+  /// Compute the reversed DFS post order of Blocks
+  void orderBlocks(MachineFunction *MF);
+
+  // Function originaly from CFGStructTraits
+  void insertInstrEnd(MachineBasicBlock *MBB, int NewOpcode,
+      DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock *MBB, int NewOpcode,
+    DebugLoc DL = DebugLoc());
+  MachineInstr *insertInstrBefore(MachineBasicBlock::iterator I, int NewOpcode);
+  void insertCondBranchBefore(MachineBasicBlock::iterator I, int NewOpcode,
+      DebugLoc DL);
+  void insertCondBranchBefore(MachineBasicBlock *MBB,
+      MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+      DebugLoc DL);
+  void insertCondBranchEnd(MachineBasicBlock *MBB, int NewOpcode, int RegNum);
+  static int getBranchNzeroOpcode(int OldOpcode);
+  static int getBranchZeroOpcode(int OldOpcode);
+  static int getContinueNzeroOpcode(int OldOpcode);
+  static int getContinueZeroOpcode(int OldOpcode);
+  static MachineBasicBlock *getTrueBranch(MachineInstr *MI);
+  static void setTrueBranch(MachineInstr *MI, MachineBasicBlock *MBB);
+  static MachineBasicBlock *getFalseBranch(MachineBasicBlock *MBB,
+      MachineInstr *MI);
+  static bool isCondBranch(MachineInstr *MI);
+  static bool isUncondBranch(MachineInstr *MI);
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *MBB);
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *MBB);
+  /// The correct naming for this is getPossibleLoopendBlockBranchInstr.
+  ///
+  /// BB with backward-edge could have move instructions after the branch
+  /// instruction.  Such move instruction "belong to" the loop backward-edge.
+  MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getReturnInstr(MachineBasicBlock *MBB);
+  static MachineInstr *getContinueInstr(MachineBasicBlock *MBB);
+  static bool isReturnBlock(MachineBasicBlock *MBB);
+  static void cloneSuccessorList(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB) ;
+  static MachineBasicBlock *clone(MachineBasicBlock *MBB);
+  /// MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose
+  /// because the AMDGPU instruction is not recognized as terminator fix this
+  /// and retire this routine
+  void replaceInstrUseOfBlockWith(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *OldMBB, MachineBasicBlock *NewBlk);
+  static void wrapup(MachineBasicBlock *MBB);
+
+
+  int patternMatch(MachineBasicBlock *MBB);
+  int patternMatchGroup(MachineBasicBlock *MBB);
+  int serialPatternMatch(MachineBasicBlock *MBB);
+  int ifPatternMatch(MachineBasicBlock *MBB);
+  int loopendPatternMatch();
+  int mergeLoop(MachineLoop *LoopRep);
+  int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock *LoopHeader);
+
+  void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+      MachineLoop *ContLoop);
+  /// return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in
+  /// the same loop with LoopLandInfo without explicitly keeping track of
+  /// loopContBlks and loopBreakBlks, this is a method to get the information.
+  bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
+      MachineBasicBlock *Src2MBB);
+  int handleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
+  int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock **LandMBBPtr);
+  void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+      MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+      MachineBasicBlock *LandMBB, bool Detail = false);
+  int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+      MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
+  void mergeSerialBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *SrcMBB);
+
+  void mergeIfthenelseBlock(MachineInstr *BranchMI,
+      MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+      MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB);
+  void mergeLooplandBlock(MachineBasicBlock *DstMBB,
+      MachineBasicBlock *LandMBB);
+  void mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+      MachineBasicBlock *LandMBB);
+  void settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+      MachineBasicBlock *ContMBB);
+  /// normalizeInfiniteLoopExit change
+  ///   B1:
+  ///        uncond_br LoopHeader
+  ///
+  /// to
+  ///   B1:
+  ///        cond_br 1 LoopHeader dummyExit
+  /// and return the newly added dummy exit block
+  MachineBasicBlock *normalizeInfiniteLoopExit(MachineLoop *LoopRep);
+  void removeUnconditionalBranch(MachineBasicBlock *MBB);
+  /// Remove duplicate branches instructions in a block.
+  /// For instance
+  /// B0:
+  ///    cond_br X B1 B2
+  ///    cond_br X B1 B2
+  /// is transformed to
+  /// B0:
+  ///    cond_br X B1 B2
+  void removeRedundantConditionalBranch(MachineBasicBlock *MBB);
+  void addDummyExitBlock(SmallVectorImpl<MachineBasicBlock *> &RetMBB);
+  void removeSuccessor(MachineBasicBlock *MBB);
+  MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
+      MachineBasicBlock *PredMBB);
+  void migrateInstruction(MachineBasicBlock *SrcMBB,
+      MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
+  void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
+  void retireBlock(MachineBasicBlock *MBB);
+  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL);
+
+  MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
+  /// This is work around solution for findNearestCommonDominator not avaiable
+  /// to post dom a proper fix should go to Dominators.h.
+  MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
+      MachineBasicBlock *MBB2);
 
 private:
-  DomTreeT *domTree;
-  PostDomTreeT *postDomTree;
-  LoopInfoT *loopInfo;
-  PassT *passRep;
-  FuncT *funcRep;
-
-  BlockInfoMap blockInfoMap;
-  LoopLandInfoMap loopLandInfoMap;
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks;
-  const AMDGPURegisterInfo *TRI;
+  MBBInfoMap BlockInfoMap;
+  LoopLandInfoMap LLInfoMap;
+  std::map<MachineLoop *, bool> Visited;
+  MachineFunction *FuncRep;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> OrderedBlks;
+};
+
+int AMDGPUCFGStructurizer::getSCCNum(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return INVALIDSCCNUM;
+  return (*It).second->SccNum;
+}
 
-};  //template class CFGStructurizer
+MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
+    const {
+  LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
+  if (It == LLInfoMap.end())
+    return NULL;
+  return (*It).second;
+}
+
+bool AMDGPUCFGStructurizer::hasBackEdge(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  if (!LoopRep)
+    return false;
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  return MBB->isSuccessor(LoopHeader);
+}
+
+unsigned AMDGPUCFGStructurizer::getLoopDepth(MachineLoop *LoopRep) {
+  return LoopRep ? LoopRep->getLoopDepth() : 0;
+}
 
-template<class PassT> CFGStructurizer<PassT>::CFGStructurizer()
-  : domTree(NULL), postDomTree(NULL), loopInfo(NULL) {
+bool AMDGPUCFGStructurizer::isRetiredBlock(MachineBasicBlock *MBB) const {
+  MBBInfoMap::const_iterator It = BlockInfoMap.find(MBB);
+  if (It == BlockInfoMap.end())
+    return false;
+  return (*It).second->IsRetired;
 }
 
-template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() {
-  for (typename BlockInfoMap::iterator I = blockInfoMap.begin(),
-       E = blockInfoMap.end(); I != E; ++I) {
-    delete I->second;
+bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const {
+  MachineLoop *LoopRep = MLI->getLoopFor(MBB);
+  while (LoopRep && LoopRep->getHeader() == MBB) {
+    MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
+    if(!LoopLand)
+      return true;
+    if (!isRetiredBlock(LoopLand))
+      return true;
+    LoopRep = LoopRep->getParentLoop();
   }
+  return false;
+}
+AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
+    bool AllowSideEntry) const {
+  assert(DstMBB);
+  if (SrcMBB == DstMBB)
+    return SinglePath_InPath;
+  while (SrcMBB && SrcMBB->succ_size() == 1) {
+    SrcMBB = *SrcMBB->succ_begin();
+    if (SrcMBB == DstMBB)
+      return SinglePath_InPath;
+    if (!AllowSideEntry && SrcMBB->pred_size() > 1)
+      return Not_SinglePath;
+  }
+  if (SrcMBB && SrcMBB->succ_size()==0)
+    return SinglePath_NotInPath;
+  return Not_SinglePath;
 }
 
-template<class PassT>
-bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass,
-                                     const AMDGPURegisterInfo * tri) {
-  passRep = &pass;
-  funcRep = &func;
-  TRI = tri;
+int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
+    MBBVector::const_iterator E) const {
+  int Count = 0;
+  while (It != E) {
+    if (!isRetiredBlock(*It))
+      ++Count;
+    ++It;
+  }
+  return Count;
+}
 
-  bool changed = false;
+bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const {
+  unsigned BlockSizeThreshold = 30;
+  unsigned CloneInstrThreshold = 100;
+  bool MultiplePreds = MBB && (MBB->pred_size() > 1);
 
-  //FIXME: if not reducible flow graph, make it so ???
+  if(!MultiplePreds)
+    return false;
+  unsigned BlkSize = MBB->size();
+  return ((BlkSize > BlockSizeThreshold) &&
+      (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
+}
 
-  if (DEBUGME) {
-        errs() << "AMDGPUCFGStructurizer::prepare\n";
+void AMDGPUCFGStructurizer::reversePredicateSetter(
+    MachineBasicBlock::iterator I) {
+  while (I--) {
+    if (I->getOpcode() == AMDGPU::PRED_X) {
+      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
+      case OPCODE_IS_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO_INT);
+        return;
+      case OPCODE_IS_NOT_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO_INT);
+        return;
+      case OPCODE_IS_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_NOT_ZERO);
+        return;
+      case OPCODE_IS_NOT_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2)
+            .setImm(OPCODE_IS_ZERO);
+        return;
+      default:
+        llvm_unreachable("PRED_X Opcode invalid!");
+      }
+    }
   }
+}
+
+void AMDGPUCFGStructurizer::insertInstrEnd(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+ MachineInstr *MI = MBB->getParent()
+    ->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->push_back(MI);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(MI);
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(MachineBasicBlock *MBB,
+    int NewOpcode, DebugLoc DL) {
+  MachineInstr *MI =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DL);
+  if (MBB->begin() != MBB->end())
+    MBB->insert(MBB->begin(), MI);
+  else
+    MBB->push_back(MI);
+  SHOWNEWINSTR(MI);
+  return MI;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::insertInstrBefore(
+    MachineBasicBlock::iterator I, int NewOpcode) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineInstr *NewMBB =
+      MBB->getParent()->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->insert(I, NewMBB);
+  //assume the instruction doesn't take any reg operand ...
+  SHOWNEWINSTR(NewMBB);
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(
+    MachineBasicBlock::iterator I, int NewOpcode, DebugLoc DL) {
+  MachineInstr *OldMI = &(*I);
+  MachineBasicBlock *MBB = OldMI->getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewMI = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  MBB->insert(I, NewMI);
+  MachineInstrBuilder MIB(*MF, NewMI);
+  MIB.addReg(OldMI->getOperand(1).getReg(), false);
+  SHOWNEWINSTR(NewMI);
+  //erase later oldInstr->eraseFromParent();
+}
+
+void AMDGPUCFGStructurizer::insertCondBranchBefore(MachineBasicBlock *blk,
+    MachineBasicBlock::iterator I, int NewOpcode, int RegNum,
+    DebugLoc DL) {
+  MachineFunction *MF = blk->getParent();
+  MachineInstr *NewInstr = MF->CreateMachineInstr(TII->get(NewOpcode), DL);
+  //insert before
+  blk->insert(I, NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
 
-  loopInfo = CFGTraits::getLoopInfo(pass);
-  if (DEBUGME) {
-    errs() << "LoopInfo:\n";
-    PrintLoopinfo(*loopInfo, errs());
+void AMDGPUCFGStructurizer::insertCondBranchEnd(MachineBasicBlock *MBB,
+    int NewOpcode, int RegNum) {
+  MachineFunction *MF = MBB->getParent();
+  MachineInstr *NewInstr =
+    MF->CreateMachineInstr(TII->get(NewOpcode), DebugLoc());
+  MBB->push_back(NewInstr);
+  MachineInstrBuilder(*MF, NewInstr).addReg(RegNum, false);
+  SHOWNEWINSTR(NewInstr);
+}
+
+int AMDGPUCFGStructurizer::getBranchNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
+  default: llvm_unreachable("internal error");
   }
+  return -1;
+}
 
-  orderBlocks();
-  if (DEBUGME) {
-    errs() << "Ordered blocks:\n";
-    printOrderedBlocks(errs());
+int AMDGPUCFGStructurizer::getBranchZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
+  case AMDGPU::BRANCH_COND_i32:
+  case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
+  default: llvm_unreachable("internal error");
   }
+  return -1;
+}
 
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks;
-
-  for (typename LoopInfoT::iterator iter = loopInfo->begin(),
-       iterEnd = loopInfo->end();
-       iter != iterEnd; ++iter) {
-    LoopT* loopRep = (*iter);
-    BlockTSmallerVector exitingBlks;
-    loopRep->getExitingBlocks(exitingBlks);
-    
-    if (exitingBlks.size() == 0) {
-      BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep);
-      if (dummyExitBlk != NULL)
-        retBlks.push_back(dummyExitBlk);
-    }
+int AMDGPUCFGStructurizer::getContinueNzeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
+  default: llvm_unreachable("internal error");
+  };
+  return -1;
+}
+
+int AMDGPUCFGStructurizer::getContinueZeroOpcode(int OldOpcode) {
+  switch(OldOpcode) {
+  case AMDGPU::JUMP_COND:
+  case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+  default: llvm_unreachable("internal error");
   }
+  return -1;
+}
 
-  // Remove unconditional branch instr.
-  // Add dummy exit block iff there are multiple returns.
+MachineBasicBlock *AMDGPUCFGStructurizer::getTrueBranch(MachineInstr *MI) {
+  return MI->getOperand(0).getMBB();
+}
 
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-       iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end();
-       iterBlk != iterEndBlk;
-       ++iterBlk) {
-    BlockT *curBlk = *iterBlk;
-    removeUnconditionalBranch(curBlk);
-    removeRedundantConditionalBranch(curBlk);
-    if (CFGTraits::isReturnBlock(curBlk)) {
-      retBlks.push_back(curBlk);
+void AMDGPUCFGStructurizer::setTrueBranch(MachineInstr *MI,
+    MachineBasicBlock *MBB) {
+  MI->getOperand(0).setMBB(MBB);
+}
+
+MachineBasicBlock *
+AMDGPUCFGStructurizer::getFalseBranch(MachineBasicBlock *MBB,
+    MachineInstr *MI) {
+  assert(MBB->succ_size() == 2);
+  MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+  MachineBasicBlock::succ_iterator It = MBB->succ_begin();
+  MachineBasicBlock::succ_iterator Next = It;
+  ++Next;
+  return (*It == TrueBranch) ? *Next : *It;
+}
+
+bool AMDGPUCFGStructurizer::isCondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+    case AMDGPU::JUMP_COND:
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32: return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+bool AMDGPUCFGStructurizer::isUncondBranch(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::JUMP:
+  case AMDGPU::BRANCH:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
+DebugLoc AMDGPUCFGStructurizer::getLastDebugLocInBB(MachineBasicBlock *MBB) {
+  //get DebugLoc from the first MachineBasicBlock instruction with debug info
+  DebugLoc DL;
+  for (MachineBasicBlock::iterator It = MBB->begin(); It != MBB->end();
+      ++It) {
+    MachineInstr *instr = &(*It);
+    if (instr->getDebugLoc().isUnknown() == false)
+      DL = instr->getDebugLoc();
+  }
+  return DL;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  MachineInstr *MI = &*It;
+  if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
+    return MI;
+  return NULL;
+}
+
+MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
+    MachineBasicBlock *MBB) {
+  for (MachineBasicBlock::reverse_iterator It = MBB->rbegin(), E = MBB->rend();
+      It != E; ++It) {
+    // FIXME: Simplify
+    MachineInstr *MI = &*It;
+    if (MI) {
+      if (isCondBranch(MI) || isUncondBranch(MI))
+        return MI;
+      else if (!TII->isMov(MI->getOpcode()))
+        break;
     }
-    assert(curBlk->succ_size() <= 2);
-  } //for
+  }
+  return NULL;
+}
 
-  if (retBlks.size() >= 2) {
-    addDummyExitBlock(retBlks);
-    changed = true;
+MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It != MBB->rend()) {
+    MachineInstr *instr = &(*It);
+    if (instr->getOpcode() == AMDGPU::RETURN)
+      return instr;
   }
+  return NULL;
+}
 
-  return changed;
-} //CFGStructurizer::prepare
+MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
+  MachineBasicBlock::reverse_iterator It = MBB->rbegin();
+  if (It != MBB->rend()) {
+    MachineInstr *MI = &(*It);
+    if (MI->getOpcode() == AMDGPU::CONTINUE)
+      return MI;
+  }
+  return NULL;
+}
 
-template<class PassT>
-bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass,
-    const AMDGPURegisterInfo * tri) {
-  passRep = &pass;
-  funcRep = &func;
-  TRI = tri;
+bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
+  MachineInstr *MI = getReturnInstr(MBB);
+  bool IsReturn = (MBB->succ_size() == 0);
+  if (MI)
+    assert(IsReturn);
+  else if (IsReturn)
+    DEBUG(
+      dbgs() << "BB" << MBB->getNumber()
+             <<" is return block without RETURN instr\n";);
+  return  IsReturn;
+}
 
-  //Assume reducible CFG...
-  if (DEBUGME) {
-    errs() << "AMDGPUCFGStructurizer::run\n";
-    func.viewCFG();
+void AMDGPUCFGStructurizer::cloneSuccessorList(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  for (MachineBasicBlock::succ_iterator It = SrcMBB->succ_begin(),
+       iterEnd = SrcMBB->succ_end(); It != iterEnd; ++It)
+    DstMBB->addSuccessor(*It);  // *iter's predecessor is also taken care of
+}
+
+MachineBasicBlock *AMDGPUCFGStructurizer::clone(MachineBasicBlock *MBB) {
+  MachineFunction *Func = MBB->getParent();
+  MachineBasicBlock *NewMBB = Func->CreateMachineBasicBlock();
+  Func->push_back(NewMBB);  //insert to function
+  for (MachineBasicBlock::iterator It = MBB->begin(), E = MBB->end();
+      It != E; ++It) {
+    MachineInstr *MI = Func->CloneMachineInstr(It);
+    NewMBB->push_back(MI);
   }
+  return NewMBB;
+}
+
+void AMDGPUCFGStructurizer::replaceInstrUseOfBlockWith(
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *OldMBB,
+    MachineBasicBlock *NewBlk) {
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(SrcMBB);
+  if (BranchMI && isCondBranch(BranchMI) &&
+      getTrueBranch(BranchMI) == OldMBB)
+    setTrueBranch(BranchMI, NewBlk);
+}
+
+void AMDGPUCFGStructurizer::wrapup(MachineBasicBlock *MBB) {
+  assert((!MBB->getParent()->getJumpTableInfo()
+          || MBB->getParent()->getJumpTableInfo()->isEmpty())
+         && "found a jump table");
+
+   //collect continue right before endloop
+   SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> ContInstr;
+   MachineBasicBlock::iterator Pre = MBB->begin();
+   MachineBasicBlock::iterator E = MBB->end();
+   MachineBasicBlock::iterator It = Pre;
+   while (It != E) {
+     if (Pre->getOpcode() == AMDGPU::CONTINUE
+         && It->getOpcode() == AMDGPU::ENDLOOP)
+       ContInstr.push_back(Pre);
+     Pre = It;
+     ++It;
+   }
+
+   //delete continue right before endloop
+   for (unsigned i = 0; i < ContInstr.size(); ++i)
+      ContInstr[i]->eraseFromParent();
+
+   // TODO to fix up jump table so later phase won't be confused.  if
+   // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
+   // there isn't such an interface yet.  alternatively, replace all the other
+   // blocks in the jump table with the entryBlk //}
+
+}
+
+
+bool AMDGPUCFGStructurizer::prepare() {
+  bool Changed = false;
+
+  //FIXME: if not reducible flow graph, make it so ???
+
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::prepare\n";);
 
-  domTree = CFGTraits::getDominatorTree(pass);
-  if (DEBUGME) {
-    domTree->print(errs(), (const llvm::Module*)0);
+  orderBlocks(FuncRep);
+
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> RetBlks;
+
+  // Add an ExitBlk to loop that don't have one
+  for (MachineLoopInfo::iterator It = MLI->begin(),
+       E = MLI->end(); It != E; ++It) {
+    MachineLoop *LoopRep = (*It);
+    MBBVector ExitingMBBs;
+    LoopRep->getExitingBlocks(ExitingMBBs);
+
+    if (ExitingMBBs.size() == 0) {
+      MachineBasicBlock* DummyExitBlk = normalizeInfiniteLoopExit(LoopRep);
+      if (DummyExitBlk)
+        RetBlks.push_back(DummyExitBlk);
+    }
   }
 
-  postDomTree = CFGTraits::getPostDominatorTree(pass);
-  if (DEBUGME) {
-    postDomTree->print(errs());
+  // Remove unconditional branch instr.
+  // Add dummy exit block iff there are multiple returns.
+  for (SmallVectorImpl<MachineBasicBlock *>::const_iterator
+       It = OrderedBlks.begin(), E = OrderedBlks.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    removeUnconditionalBranch(MBB);
+    removeRedundantConditionalBranch(MBB);
+    if (isReturnBlock(MBB)) {
+      RetBlks.push_back(MBB);
+    }
+    assert(MBB->succ_size() <= 2);
   }
 
-  loopInfo = CFGTraits::getLoopInfo(pass);
-  if (DEBUGME) {
-    errs() << "LoopInfo:\n";
-    PrintLoopinfo(*loopInfo, errs());
+  if (RetBlks.size() >= 2) {
+    addDummyExitBlock(RetBlks);
+    Changed = true;
   }
 
-  orderBlocks();
+  return Changed;
+}
+
+bool AMDGPUCFGStructurizer::run() {
+
+  //Assume reducible CFG...
+  DEBUG(dbgs() << "AMDGPUCFGStructurizer::run\n";FuncRep->viewCFG(););
+
 #ifdef STRESSTEST
   //Use the worse block ordering to test the algorithm.
   ReverseVector(orderedBlks);
 #endif
 
-  if (DEBUGME) {
-    errs() << "Ordered blocks:\n";
-    printOrderedBlocks(errs());
-  }
-  int numIter = 0;
-  bool finish = false;
-  BlockT *curBlk;
-  bool makeProgress = false;
-  int numRemainedBlk = countActiveBlock(orderedBlks.begin(),
-                                        orderedBlks.end());
+  DEBUG(dbgs() << "Ordered blocks:\n"; printOrderedBlocks(););
+  int NumIter = 0;
+  bool Finish = false;
+  MachineBasicBlock *MBB;
+  bool MakeProgress = false;
+  int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
+                                        OrderedBlks.end());
 
   do {
-    ++numIter;
-    if (DEBUGME) {
-      errs() << "numIter = " << numIter
-             << ", numRemaintedBlk = " << numRemainedBlk << "\n";
-    }
-
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      iterBlk = orderedBlks.begin();
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      iterBlkEnd = orderedBlks.end();
-
-    typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      sccBeginIter = iterBlk;
-    BlockT *sccBeginBlk = NULL;
-    int sccNumBlk = 0;  // The number of active blocks, init to a
+    ++NumIter;
+    DEBUG(
+      dbgs() << "numIter = " << NumIter
+             << ", numRemaintedBlk = " << NumRemainedBlk << "\n";
+    );
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator It =
+        OrderedBlks.begin();
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator E =
+        OrderedBlks.end();
+
+    SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
+        It;
+    MachineBasicBlock *SccBeginMBB = NULL;
+    int SccNumBlk = 0;  // The number of active blocks, init to a
                         // maximum possible number.
-    int sccNumIter;     // Number of iteration in this SCC.
-
-    while (iterBlk != iterBlkEnd) {
-      curBlk = *iterBlk;
-
-      if (sccBeginBlk == NULL) {
-        sccBeginIter = iterBlk;
-        sccBeginBlk = curBlk;
-        sccNumIter = 0;
-        sccNumBlk = numRemainedBlk; // Init to maximum possible number.
-        if (DEBUGME) {
-              errs() << "start processing SCC" << getSCCNum(sccBeginBlk);
-              errs() << "\n";
-        }
+    int SccNumIter;     // Number of iteration in this SCC.
+
+    while (It != E) {
+      MBB = *It;
+
+      if (!SccBeginMBB) {
+        SccBeginIter = It;
+        SccBeginMBB = MBB;
+        SccNumIter = 0;
+        SccNumBlk = NumRemainedBlk; // Init to maximum possible number.
+        DEBUG(
+              dbgs() << "start processing SCC" << getSCCNum(SccBeginMBB);
+              dbgs() << "\n";
+        );
       }
 
-      if (!isRetiredBlock(curBlk)) {
-        patternMatch(curBlk);
-      }
+      if (!isRetiredBlock(MBB))
+        patternMatch(MBB);
 
-      ++iterBlk;
+      ++It;
 
-      bool contNextScc = true;
-      if (iterBlk == iterBlkEnd
-          || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) {
+      bool ContNextScc = true;
+      if (It == E
+          || getSCCNum(SccBeginMBB) != getSCCNum(*It)) {
         // Just finish one scc.
-        ++sccNumIter;
-        int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk);
-        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) {
-          if (DEBUGME) {
-            errs() << "Can't reduce SCC " << getSCCNum(curBlk)
-                   << ", sccNumIter = " << sccNumIter;
-            errs() << "doesn't make any progress\n";
-          }
-          contNextScc = true;
-        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) {
-          sccNumBlk = sccRemainedNumBlk;
-          iterBlk = sccBeginIter;
-          contNextScc = false;
-          if (DEBUGME) {
-            errs() << "repeat processing SCC" << getSCCNum(curBlk)
-                   << "sccNumIter = " << sccNumIter << "\n";
-            func.viewCFG();
-          }
+        ++SccNumIter;
+        int sccRemainedNumBlk = countActiveBlock(SccBeginIter, It);
+        if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= SccNumBlk) {
+          DEBUG(
+            dbgs() << "Can't reduce SCC " << getSCCNum(MBB)
+                   << ", sccNumIter = " << SccNumIter;
+            dbgs() << "doesn't make any progress\n";
+          );
+          ContNextScc = true;
+        } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < SccNumBlk) {
+          SccNumBlk = sccRemainedNumBlk;
+          It = SccBeginIter;
+          ContNextScc = false;
+          DEBUG(
+            dbgs() << "repeat processing SCC" << getSCCNum(MBB)
+                   << "sccNumIter = " << SccNumIter << "\n";
+            FuncRep->viewCFG();
+          );
         } else {
           // Finish the current scc.
-          contNextScc = true;
+          ContNextScc = true;
         }
       } else {
         // Continue on next component in the current scc.
-        contNextScc = false;
+        ContNextScc = false;
       }
 
-      if (contNextScc) {
-        sccBeginBlk = NULL;
-      }
+      if (ContNextScc)
+        SccBeginMBB = NULL;
     } //while, "one iteration" over the function.
 
-    BlockT *entryBlk = FuncGTraits::nodes_begin(&func);
-    if (entryBlk->succ_size() == 0) {
-      finish = true;
-      if (DEBUGME) {
-        errs() << "Reduce to one block\n";
-      }
+    MachineBasicBlock *EntryMBB =
+        GraphTraits<MachineFunction *>::nodes_begin(FuncRep);
+    if (EntryMBB->succ_size() == 0) {
+      Finish = true;
+      DEBUG(
+        dbgs() << "Reduce to one block\n";
+      );
     } else {
-      int newnumRemainedBlk
-        = countActiveBlock(orderedBlks.begin(), orderedBlks.end());
+      int NewnumRemainedBlk
+        = countActiveBlock(OrderedBlks.begin(), OrderedBlks.end());
       // consider cloned blocks ??
-      if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) {
-        makeProgress = true;
-        numRemainedBlk = newnumRemainedBlk;
+      if (NewnumRemainedBlk == 1 || NewnumRemainedBlk < NumRemainedBlk) {
+        MakeProgress = true;
+        NumRemainedBlk = NewnumRemainedBlk;
       } else {
-        makeProgress = false;
-        if (DEBUGME) {
-          errs() << "No progress\n";
-        }
+        MakeProgress = false;
+        DEBUG(
+          dbgs() << "No progress\n";
+        );
       }
     }
-  } while (!finish && makeProgress);
+  } while (!Finish && MakeProgress);
 
   // Misc wrap up to maintain the consistency of the Function representation.
-  CFGTraits::wrapup(FuncGTraits::nodes_begin(&func));
+  wrapup(GraphTraits<MachineFunction *>::nodes_begin(FuncRep));
 
   // Detach retired Block, release memory.
-  for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(),
-       iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
-    if ((*iterMap).second && (*iterMap).second->isRetired) {
-      assert(((*iterMap).first)->getNumber() != -1);
-      if (DEBUGME) {
-        errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n";
-      }
-      (*iterMap).first->eraseFromParent();  //Remove from the parent Function.
+  for (MBBInfoMap::iterator It = BlockInfoMap.begin(), E = BlockInfoMap.end();
+      It != E; ++It) {
+    if ((*It).second && (*It).second->IsRetired) {
+      assert(((*It).first)->getNumber() != -1);
+      DEBUG(
+        dbgs() << "Erase BB" << ((*It).first)->getNumber() << "\n";
+      );
+      (*It).first->eraseFromParent();  //Remove from the parent Function.
     }
-    delete (*iterMap).second;
+    delete (*It).second;
   }
-  blockInfoMap.clear();
+  BlockInfoMap.clear();
+  LLInfoMap.clear();
 
-  // clear loopLandInfoMap
-  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
-       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
-    delete (*iterMap).second;
-  }
-  loopLandInfoMap.clear();
+  DEBUG(
+    FuncRep->viewCFG();
+  );
 
-  if (DEBUGME) {
-    func.viewCFG();
-  }
-
-  if (!finish) {
-    assert(!"IRREDUCIBL_CF");
-  }
+  if (!Finish)
+    llvm_unreachable("IRREDUCIBL_CF");
 
   return true;
-} //CFGStructurizer::run
-
-/// Print the ordered Blocks.
-///
-template<class PassT>
-void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
-  size_t i = 0;
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
-      iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
-       iterBlk != iterBlkEnd;
-       ++iterBlk, ++i) {
-    os << "BB" << (*iterBlk)->getNumber();
-    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
-    if (i != 0 && i % 10 == 0) {
-      os << "\n";
-    } else {
-      os << " ";
-    }
-  }
-} //printOrderedBlocks
-
-/// Compute the reversed DFS post order of Blocks
-///
-template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
-  int sccNum = 0;
-  BlockT *bb;
-  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
-       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
-    std::vector<BlockT *> &sccNext = *sccIter;
-    for (typename std::vector<BlockT *>::const_iterator
-         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+}
+
+
+
+void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
+  int SccNum = 0;
+  MachineBasicBlock *MBB;
+  for (scc_iterator<MachineFunction *> It = scc_begin(MF), E = scc_end(MF);
+      It != E; ++It, ++SccNum) {
+    std::vector<MachineBasicBlock *> &SccNext = *It;
+    for (std::vector<MachineBasicBlock *>::const_iterator
+         blockIter = SccNext.begin(), blockEnd = SccNext.end();
          blockIter != blockEnd; ++blockIter) {
-      bb = *blockIter;
-      orderedBlks.push_back(bb);
-      recordSccnum(bb, sccNum);
+      MBB = *blockIter;
+      OrderedBlks.push_back(MBB);
+      recordSccnum(MBB, SccNum);
     }
   }
 
   //walk through all the block in func to check for unreachable
-  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
-       blockEnd1 = FuncGTraits::nodes_end(funcRep);
-       blockIter1 != blockEnd1; ++blockIter1) {
-    BlockT *bb = &(*blockIter1);
-    sccNum = getSCCNum(bb);
-    if (sccNum == INVALIDSCCNUM) {
-      errs() << "unreachable block BB" << bb->getNumber() << "\n";
-    }
+  typedef GraphTraits<MachineFunction *> GTM;
+  MachineFunction::iterator It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = &(*It);
+    SccNum = getSCCNum(MBB);
+    if (SccNum == INVALIDSCCNUM)
+      dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
   }
-} //orderBlocks
+}
 
-template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
-  int numMatch = 0;
-  int curMatch;
+int AMDGPUCFGStructurizer::patternMatch(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  int CurMatch;
 
-  if (DEBUGME) {
-        errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
-  }
+  DEBUG(
+        dbgs() << "Begin patternMatch BB" << MBB->getNumber() << "\n";
+  );
 
-  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
-    numMatch += curMatch;
-  }
+  while ((CurMatch = patternMatchGroup(MBB)) > 0)
+    NumMatch += CurMatch;
 
-  if (DEBUGME) {
-        errs() << "End patternMatch BB" << curBlk->getNumber()
-      << ", numMatch = " << numMatch << "\n";
-  }
+  DEBUG(
+        dbgs() << "End patternMatch BB" << MBB->getNumber()
+      << ", numMatch = " << NumMatch << "\n";
+  );
+
+  return NumMatch;
+}
+
+int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) {
+  int NumMatch = 0;
+  NumMatch += loopendPatternMatch();
+  NumMatch += serialPatternMatch(MBB);
+  NumMatch += ifPatternMatch(MBB);
+  return NumMatch;
+}
 
-  return numMatch;
-} //patternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
-  int numMatch = 0;
-  numMatch += serialPatternMatch(curBlk);
-  numMatch += ifPatternMatch(curBlk);
-  numMatch += loopendPatternMatch(curBlk);
-  numMatch += loopPatternMatch(curBlk);
-  return numMatch;
-}//patternMatchGroup
-
-template<class PassT>
-int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
-  if (curBlk->succ_size() != 1) {
+
+int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 1)
     return 0;
-  }
 
-  BlockT *childBlk = *curBlk->succ_begin();
-  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+  MachineBasicBlock *childBlk = *MBB->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
     return 0;
-  }
 
-  mergeSerialBlock(curBlk, childBlk);
+  mergeSerialBlock(MBB, childBlk);
   ++numSerialPatternMatch;
   return 1;
-} //serialPatternMatch
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
   //two edges
-  if (curBlk->succ_size() != 2) {
+  if (MBB->succ_size() != 2)
     return 0;
-  }
-
-  if (hasBackEdge(curBlk)) {
+  if (hasBackEdge(MBB))
     return 0;
-  }
-
-  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
-  if (branchInstr == NULL) {
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  if (!BranchMI)
     return 0;
-  }
 
-  assert(CFGTraits::isCondBranch(branchInstr));
+  assert(isCondBranch(BranchMI));
+  int NumMatch = 0;
 
-  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
-  BlockT *falseBlk = CFGTraits::getFalseBranch(curBlk, branchInstr);
-  BlockT *landBlk;
-  int cloned = 0;
+  MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
+  NumMatch += serialPatternMatch(TrueMBB);
+  NumMatch += ifPatternMatch(TrueMBB);
+  MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
+  NumMatch += serialPatternMatch(FalseMBB);
+  NumMatch += ifPatternMatch(FalseMBB);
+  MachineBasicBlock *LandBlk;
+  int Cloned = 0;
 
+  assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
   // TODO: Simplify
-  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
-    && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
-    landBlk = *trueBlk->succ_begin();
-  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
-    landBlk = NULL;
-  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
-    landBlk = falseBlk;
-    falseBlk = NULL;
-  } else if (falseBlk->succ_size() == 1
-             && *falseBlk->succ_begin() == trueBlk) {
-    landBlk = trueBlk;
-    trueBlk = NULL;
-  } else if (falseBlk->succ_size() == 1
-             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
-    landBlk = *falseBlk->succ_begin();
-  } else if (trueBlk->succ_size() == 1
-    && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
-    landBlk = *trueBlk->succ_begin();
+  if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 1
+    && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) {
+    // Diamond pattern
+    LandBlk = *TrueMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
+    // Triangle pattern, false is empty
+    LandBlk = FalseMBB;
+    FalseMBB = NULL;
+  } else if (FalseMBB->succ_size() == 1
+             && *FalseMBB->succ_begin() == TrueMBB) {
+    // Triangle pattern, true is empty
+    // We reverse the predicate to make a triangle, empty false pattern;
+    std::swap(TrueMBB, FalseMBB);
+    reversePredicateSetter(MBB->end());
+    LandBlk = FalseMBB;
+    FalseMBB = NULL;
+  } else if (FalseMBB->succ_size() == 1
+             && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
+    LandBlk = *FalseMBB->succ_begin();
+  } else if (TrueMBB->succ_size() == 1
+    && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
+    LandBlk = *TrueMBB->succ_begin();
   } else {
-    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+    return NumMatch + handleJumpintoIf(MBB, TrueMBB, FalseMBB);
   }
 
   // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but the
   // new BB created for landBlk==NULL may introduce new challenge to the
   // reduction process.
-  if (landBlk != NULL &&
-      ((trueBlk && trueBlk->pred_size() > 1)
-      || (falseBlk && falseBlk->pred_size() > 1))) {
-     cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  if (LandBlk &&
+      ((TrueMBB && TrueMBB->pred_size() > 1)
+      || (FalseMBB && FalseMBB->pred_size() > 1))) {
+     Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, &LandBlk);
   }
 
-  if (trueBlk && trueBlk->pred_size() > 1) {
-    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
-    ++cloned;
+  if (TrueMBB && TrueMBB->pred_size() > 1) {
+    TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
+    ++Cloned;
   }
 
-  if (falseBlk && falseBlk->pred_size() > 1) {
-    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
-    ++cloned;
+  if (FalseMBB && FalseMBB->pred_size() > 1) {
+    FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
+    ++Cloned;
   }
 
-  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+  mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
 
   ++numIfPatternMatch;
 
-  numClonedBlock += cloned;
-
-  return 1 + cloned;
-} //ifPatternMatch
+  numClonedBlock += Cloned;
 
-template<class PassT>
-int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
-  return 0;
-} //switchPatternMatch
+  return 1 + Cloned + NumMatch;
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  typename std::vector<LoopT *> nestedLoops;
-  while (loopRep) {
-    nestedLoops.push_back(loopRep);
-    loopRep = loopRep->getParentLoop();
+int AMDGPUCFGStructurizer::loopendPatternMatch() {
+  std::vector<MachineLoop *> NestedLoops;
+  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end();
+      It != E; ++It) {
+    df_iterator<MachineLoop *> LpIt = df_begin(*It),
+        LpE = df_end(*It);
+    for (; LpIt != LpE; ++LpIt)
+      NestedLoops.push_back(*LpIt);
   }
-
-  if (nestedLoops.size() == 0) {
+  if (NestedLoops.size() == 0)
     return 0;
-  }
 
   // Process nested loop outside->inside, so "continue" to a outside loop won't
   // be mistaken as "break" of the current loop.
-  int num = 0;
-  for (typename std::vector<LoopT *>::reverse_iterator
-       iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend();
-       iter != iterEnd; ++iter) {
-    loopRep = *iter;
-
-    if (getLoopLandBlock(loopRep) != NULL) {
+  int Num = 0;
+  for (std::vector<MachineLoop *>::reverse_iterator It = NestedLoops.rbegin(),
+      E = NestedLoops.rend(); It != E; ++It) {
+    MachineLoop *ExaminedLoop = *It;
+    if (ExaminedLoop->getNumBlocks() == 0 || Visited[ExaminedLoop])
       continue;
-    }
-
-    BlockT *loopHeader = loopRep->getHeader();
-
-    int numBreak = loopbreakPatternMatch(loopRep, loopHeader);
-
-    if (numBreak == -1) {
+    DEBUG(dbgs() << "Processing:\n"; ExaminedLoop->dump(););
+    int NumBreak = mergeLoop(ExaminedLoop);
+    if (NumBreak == -1)
       break;
-    }
-
-    int numCont = loopcontPatternMatch(loopRep, loopHeader);
-    num += numBreak + numCont;
-  }
-
-  return num;
-} //loopendPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) {
-  if (curBlk->succ_size() != 0) {
-    return 0;
-  }
-
-  int numLoop = 0;
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  while (loopRep && loopRep->getHeader() == curBlk) {
-    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
-    if (loopLand) {
-      BlockT *landBlk = loopLand->landBlk;
-      assert(landBlk);
-      if (!isRetiredBlock(landBlk)) {
-        mergeLooplandBlock(curBlk, loopLand);
-        ++numLoop;
-      }
-    }
-    loopRep = loopRep->getParentLoop();
-  }
-
-  numLoopPatternMatch += numLoop;
-
-  return numLoop;
-} //loopPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep,
-                                                  BlockT *loopHeader) {
-  BlockTSmallerVector exitingBlks;
-  loopRep->getExitingBlocks(exitingBlks);
-
-  if (DEBUGME) {
-    errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n";
-  }
-
-  if (exitingBlks.size() == 0) {
-    setLoopLandBlock(loopRep);
-    return 0;
-  }
-
-  // Compute the corresponding exitBlks and exit block set.
-  BlockTSmallerVector exitBlks;
-  std::set<BlockT *> exitBlkSet;
-  for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(),
-       iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) {
-    BlockT *exitingBlk = *iter;
-    BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
-    exitBlks.push_back(exitBlk);
-    exitBlkSet.insert(exitBlk);  //non-duplicate insert
-  }
-
-  assert(exitBlkSet.size() > 0);
-  assert(exitBlks.size() == exitingBlks.size());
-
-  if (DEBUGME) {
-    errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n";
+    Num += NumBreak;
   }
+  return Num;
+}
 
-  // Find exitLandBlk.
-  BlockT *exitLandBlk = NULL;
-  int numCloned = 0;
-  int numSerial = 0;
-
-  if (exitBlkSet.size() == 1) {
-    exitLandBlk = *exitBlkSet.begin();
-  } else {
-    exitLandBlk = findNearestCommonPostDom(exitBlkSet);
-
-    if (exitLandBlk == NULL) {
-      return -1;
-    }
-
-    bool allInPath = true;
-    bool allNotInPath = true;
-    for (typename std::set<BlockT*>::const_iterator
-         iter = exitBlkSet.begin(),
-         iterEnd = exitBlkSet.end();
-         iter != iterEnd; ++iter) {
-      BlockT *exitBlk = *iter;
-
-      PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true);
-      if (DEBUGME) {
-        errs() << "BB" << exitBlk->getNumber()
-               << " to BB" << exitLandBlk->getNumber() << " PathToKind="
-               << pathKind << "\n";
-      }
-
-      allInPath = allInPath && (pathKind == SinglePath_InPath);
-      allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath);
-
-      if (!allInPath && !allNotInPath) {
-        if (DEBUGME) {
-              errs() << "singlePath check fail\n";
-        }
-        return -1;
-      }
-    } // check all exit blocks
-
-    if (allNotInPath) {
-
-      // TODO: Simplify, maybe separate function?
-      LoopT *parentLoopRep = loopRep->getParentLoop();
-      BlockT *parentLoopHeader = NULL;
-      if (parentLoopRep)
-        parentLoopHeader = parentLoopRep->getHeader();
-
-      if (exitLandBlk == parentLoopHeader &&
-          (exitLandBlk = relocateLoopcontBlock(parentLoopRep,
-                                               loopRep,
-                                               exitBlkSet,
-                                               exitLandBlk)) != NULL) {
-        if (DEBUGME) {
-          errs() << "relocateLoopcontBlock success\n";
-        }
-      } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep,
-                                                      exitingBlks,
-                                                      exitBlks)) != NULL) {
-        if (DEBUGME) {
-          errs() << "insertEndbranchBlock success\n";
-        }
-      } else {
-        if (DEBUGME) {
-          errs() << "loop exit fail\n";
-        }
-        return -1;
-      }
-    }
-
-    // Handle side entry to exit path.
-    exitBlks.clear();
-    exitBlkSet.clear();
-    for (typename BlockTSmallerVector::iterator iterExiting =
-           exitingBlks.begin(),
-         iterExitingEnd = exitingBlks.end();
-         iterExiting != iterExitingEnd; ++iterExiting) {
-      BlockT *exitingBlk = *iterExiting;
-      BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk);
-      BlockT *newExitBlk = exitBlk;
-
-      if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) {
-        newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk);
-        ++numCloned;
-      }
-
-      numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk);
-
-      exitBlks.push_back(newExitBlk);
-      exitBlkSet.insert(newExitBlk);
-    }
-
-    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
-         iterExitEnd = exitBlks.end();
-         iterExit != iterExitEnd; ++iterExit) {
-      BlockT *exitBlk = *iterExit;
-      numSerial += serialPatternMatch(exitBlk);
-    }
-
-    for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(),
-         iterExitEnd = exitBlks.end();
-         iterExit != iterExitEnd; ++iterExit) {
-      BlockT *exitBlk = *iterExit;
-      if (exitBlk->pred_size() > 1) {
-        if (exitBlk != exitLandBlk) {
-          return -1;
-        }
-      } else {
-        if (exitBlk != exitLandBlk &&
-            (exitBlk->succ_size() != 1 ||
-            *exitBlk->succ_begin() != exitLandBlk)) {
-          return -1;
-        }
-      }
-    }
-  } // else
-
-  exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet);
-
-  // Fold break into the breaking block. Leverage across level breaks.
-  assert(exitingBlks.size() == exitBlks.size());
-  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
-       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
-       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
-    BlockT *exitBlk = *iterExit;
-    BlockT *exitingBlk = *iterExiting;
-    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
-    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
-    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
-  }
+int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MBBVector ExitingMBBs;
+  LoopRep->getExitingBlocks(ExitingMBBs);
+  assert(!ExitingMBBs.empty() && "Infinite Loop not supported");
+  DEBUG(dbgs() << "Loop has " << ExitingMBBs.size() << " exiting blocks\n";);
+  // We assume a single ExitBlk
+  MBBVector ExitBlks;
+  LoopRep->getExitBlocks(ExitBlks);
+  SmallPtrSet<MachineBasicBlock *, 2> ExitBlkSet;
+  for (unsigned i = 0, e = ExitBlks.size(); i < e; ++i)
+    ExitBlkSet.insert(ExitBlks[i]);
+  assert(ExitBlkSet.size() == 1);
+  MachineBasicBlock *ExitBlk = *ExitBlks.begin();
+  assert(ExitBlk && "Loop has several exit block");
+  MBBVector LatchBlks;
+  typedef GraphTraits<Inverse<MachineBasicBlock*> > InvMBBTraits;
+  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
+      PE = InvMBBTraits::child_end(LoopHeader);
+  for (; PI != PE; PI++) {
+    if (LoopRep->contains(*PI))
+      LatchBlks.push_back(*PI);
+  }
+
+  for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
+    mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
+  for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
+    settleLoopcontBlock(LatchBlks[i], LoopHeader);
+  int Match = 0;
+  do {
+    Match = 0;
+    Match += serialPatternMatch(LoopHeader);
+    Match += ifPatternMatch(LoopHeader);
+  } while (Match > 0);
+  mergeLooplandBlock(LoopHeader, ExitBlk);
+  MachineLoop *ParentLoop = LoopRep->getParentLoop();
+  if (ParentLoop)
+    MLI->changeLoopFor(LoopHeader, ParentLoop);
+  else
+    MLI->removeBlock(LoopHeader);
+  Visited[LoopRep] = true;
+  return 1;
+}
 
-  int numBreak = static_cast<int>(exitingBlks.size());
-  numLoopbreakPatternMatch += numBreak;
-  numClonedBlock += numCloned;
-  return numBreak + numSerial + numCloned;
-} //loopbreakPatternMatch
-
-template<class PassT>
-int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
-                                                 BlockT *loopHeader) {
-  int numCont = 0;
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
-  for (typename InvBlockGTraits::ChildIteratorType iter =
-       InvBlockGTraits::child_begin(loopHeader),
-       iterEnd = InvBlockGTraits::child_end(loopHeader);
-       iter != iterEnd; ++iter) {
-    BlockT *curBlk = *iter;
-    if (loopRep->contains(curBlk)) {
-      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
-                          loopHeader, loopRep);
-      contBlk.push_back(curBlk);
-      ++numCont;
+int AMDGPUCFGStructurizer::loopcontPatternMatch(MachineLoop *LoopRep,
+    MachineBasicBlock *LoopHeader) {
+  int NumCont = 0;
+  SmallVector<MachineBasicBlock *, DEFAULT_VEC_SLOTS> ContMBB;
+  typedef GraphTraits<Inverse<MachineBasicBlock *> > GTIM;
+  GTIM::ChildIteratorType It = GTIM::child_begin(LoopHeader),
+      E = GTIM::child_end(LoopHeader);
+  for (; It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (LoopRep->contains(MBB)) {
+      handleLoopcontBlock(MBB, MLI->getLoopFor(MBB),
+                          LoopHeader, LoopRep);
+      ContMBB.push_back(MBB);
+      ++NumCont;
     }
   }
 
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
-       iter = contBlk.begin(), iterEnd = contBlk.end();
-       iter != iterEnd; ++iter) {
-    (*iter)->removeSuccessor(loopHeader);
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = ContMBB.begin(),
+      E = ContMBB.end(); It != E; ++It) {
+    (*It)->removeSuccessor(LoopHeader);
   }
 
-  numLoopcontPatternMatch += numCont;
+  numLoopcontPatternMatch += NumCont;
 
-  return numCont;
-} //loopcontPatternMatch
+  return NumCont;
+}
 
 
-template<class PassT>
-bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
-                                                         BlockT *src2Blk) {
-  // return true iff src1Blk->succ_size() == 0 && src1Blk and src2Blk are in the
-  // same loop with LoopLandInfo without explicitly keeping track of
-  // loopContBlks and loopBreakBlks, this is a method to get the information.
-  //
-  if (src1Blk->succ_size() == 0) {
-    LoopT *loopRep = loopInfo->getLoopFor(src1Blk);
-    if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) {
-      LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-      if (theEntry != NULL) {
-        if (DEBUGME) {
-          errs() << "isLoopContBreakBlock yes src1 = BB"
-                 << src1Blk->getNumber()
-                 << " src2 = BB" << src2Blk->getNumber() << "\n";
-        }
+bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
+    MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
+  if (Src1MBB->succ_size() == 0) {
+    MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
+    if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
+      MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
+      if (TheEntry) {
+        DEBUG(
+          dbgs() << "isLoopContBreakBlock yes src1 = BB"
+                 << Src1MBB->getNumber()
+                 << " src2 = BB" << Src2MBB->getNumber() << "\n";
+        );
         return true;
       }
     }
   }
   return false;
-}  //isSameloopDetachedContbreak
-
-template<class PassT>
-int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk,
-                                             BlockT *trueBlk,
-                                             BlockT *falseBlk) {
-  int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk);
-  if (num == 0) {
-    if (DEBUGME) {
-      errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
-    }
-    num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk);
-  }
-  return num;
 }
 
-template<class PassT>
-int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk,
-                                                BlockT *trueBlk,
-                                                BlockT *falseBlk) {
-  int num = 0;
-  BlockT *downBlk;
-
-  //trueBlk could be the common post dominator
-  downBlk = trueBlk;
-
-  if (DEBUGME) {
-    errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber()
-           << " true = BB" << trueBlk->getNumber()
-           << ", numSucc=" << trueBlk->succ_size()
-           << " false = BB" << falseBlk->getNumber() << "\n";
+int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
+  if (Num == 0) {
+    DEBUG(
+      dbgs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n";
+    );
+    Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
   }
+  return Num;
+}
 
-  while (downBlk) {
-    if (DEBUGME) {
-      errs() << "check down = BB" << downBlk->getNumber();
-    }
-
-    if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) {
-      if (DEBUGME) {
-        errs() << " working\n";
-      }
-
-      num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk);
-      num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk);
+int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
+  int Num = 0;
+  MachineBasicBlock *DownBlk;
 
-      numClonedBlock += num;
-      num += serialPatternMatch(*headBlk->succ_begin());
-      num += serialPatternMatch(*(++headBlk->succ_begin()));
-      num += ifPatternMatch(headBlk);
-      assert(num > 0);
+  //trueBlk could be the common post dominator
+  DownBlk = TrueMBB;
+
+  DEBUG(
+    dbgs() << "handleJumpintoIfImp head = BB" << HeadMBB->getNumber()
+           << " true = BB" << TrueMBB->getNumber()
+           << ", numSucc=" << TrueMBB->succ_size()
+           << " false = BB" << FalseMBB->getNumber() << "\n";
+  );
+
+  while (DownBlk) {
+    DEBUG(
+      dbgs() << "check down = BB" << DownBlk->getNumber();
+    );
+
+    if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
+      DEBUG(
+        dbgs() << " working\n";
+      );
+
+      Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
+      Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
+
+      numClonedBlock += Num;
+      Num += serialPatternMatch(*HeadMBB->succ_begin());
+      Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
+      Num += ifPatternMatch(HeadMBB);
+      assert(Num > 0);
 
       break;
     }
-    if (DEBUGME) {
-      errs() << " not working\n";
-    }
-    downBlk = (downBlk->succ_size() == 1) ? (*downBlk->succ_begin()) : NULL;
+    DEBUG(
+      dbgs() << " not working\n";
+    );
+    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL;
   } // walk down the postDomTree
 
-  return num;
-} //handleJumpintoIf
-
-template<class PassT>
-void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
-                                                         BlockT *trueBlk,
-                                                         BlockT *falseBlk,
-                                                         BlockT *landBlk,
-                                                         bool detail) {
-  errs() << "head = BB" << headBlk->getNumber()
-         << " size = " << headBlk->size();
-  if (detail) {
-    errs() << "\n";
-    headBlk->print(errs());
-    errs() << "\n";
+  return Num;
+}
+
+void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
+    MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) {
+  dbgs() << "head = BB" << HeadMBB->getNumber()
+         << " size = " << HeadMBB->size();
+  if (Detail) {
+    dbgs() << "\n";
+    HeadMBB->print(dbgs());
+    dbgs() << "\n";
   }
 
-  if (trueBlk) {
-    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
-           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
-    if (detail) {
-      errs() << "\n";
-      trueBlk->print(errs());
-      errs() << "\n";
+  if (TrueMBB) {
+    dbgs() << ", true = BB" << TrueMBB->getNumber() << " size = "
+           << TrueMBB->size() << " numPred = " << TrueMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      TrueMBB->print(dbgs());
+      dbgs() << "\n";
     }
   }
-  if (falseBlk) {
-    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
-           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
-    if (detail) {
-      errs() << "\n";
-      falseBlk->print(errs());
-      errs() << "\n";
+  if (FalseMBB) {
+    dbgs() << ", false = BB" << FalseMBB->getNumber() << " size = "
+           << FalseMBB->size() << " numPred = " << FalseMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      FalseMBB->print(dbgs());
+      dbgs() << "\n";
     }
   }
-  if (landBlk) {
-    errs() << ", land = BB" << landBlk->getNumber() << " size = "
-           << landBlk->size() << " numPred = " << landBlk->pred_size();
-    if (detail) {
-      errs() << "\n";
-      landBlk->print(errs());
-      errs() << "\n";
+  if (LandMBB) {
+    dbgs() << ", land = BB" << LandMBB->getNumber() << " size = "
+           << LandMBB->size() << " numPred = " << LandMBB->pred_size();
+    if (Detail) {
+      dbgs() << "\n";
+      LandMBB->print(dbgs());
+      dbgs() << "\n";
     }
   }
 
-    errs() << "\n";
-} //showImproveSimpleJumpintoIf
+    dbgs() << "\n";
+}
 
-template<class PassT>
-int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
-                                                    BlockT *trueBlk,
-                                                    BlockT *falseBlk,
-                                                    BlockT **plandBlk) {
-  bool migrateTrue = false;
-  bool migrateFalse = false;
+int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
+    MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+    MachineBasicBlock **LandMBBPtr) {
+  bool MigrateTrue = false;
+  bool MigrateFalse = false;
 
-  BlockT *landBlk = *plandBlk;
+  MachineBasicBlock *LandBlk = *LandMBBPtr;
 
-  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
-         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+  assert((!TrueMBB || TrueMBB->succ_size() <= 1)
+         && (!FalseMBB || FalseMBB->succ_size() <= 1));
 
-  if (trueBlk == falseBlk) {
+  if (TrueMBB == FalseMBB)
     return 0;
-  }
 
-  migrateTrue = needMigrateBlock(trueBlk);
-  migrateFalse = needMigrateBlock(falseBlk);
+  MigrateTrue = needMigrateBlock(TrueMBB);
+  MigrateFalse = needMigrateBlock(FalseMBB);
 
-  if (!migrateTrue && !migrateFalse) {
+  if (!MigrateTrue && !MigrateFalse)
     return 0;
-  }
 
   // If we need to migrate either trueBlk and falseBlk, migrate the rest that
   // have more than one predecessors.  without doing this, its predecessor
   // rather than headBlk will have undefined value in initReg.
-  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
-    migrateTrue = true;
-  }
-  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
-    migrateFalse = true;
-  }
+  if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() > 1)
+    MigrateTrue = true;
+  if (!MigrateFalse && FalseMBB && FalseMBB->pred_size() > 1)
+    MigrateFalse = true;
 
-  if (DEBUGME) {
-    errs() << "before improveSimpleJumpintoIf: ";
-    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
-  }
+  DEBUG(
+    dbgs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
 
   // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
   //
@@ -1183,205 +1336,192 @@ int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
   // add initReg = initVal to headBlk
 
   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  unsigned initReg =
-    funcRep->getRegInfo().createVirtualRegister(I32RC);
-  if (!migrateTrue || !migrateFalse) {
-    int initVal = migrateTrue ? 0 : 1;
-    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  if (!MigrateTrue || !MigrateFalse) {
+    // XXX: We have an opportunity here to optimize the "branch into if" case
+    // here.  Branch into if looks like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \           |
+    // diamond_false        diamond_true
+    //             \      /
+    //               done
+    //
+    // The diamond_head block begins the "if" and the diamond_true block
+    // is the block being "branched into".
+    //
+    // If MigrateTrue is true, then TrueBB is the block being "branched into"
+    // and if MigrateFalse is true, then FalseBB is the block being
+    // "branched into"
+    // 
+    // Here is the pseudo code for how I think the optimization should work:
+    // 1. Insert MOV GPR0, 0 before the branch instruction in diamond_head.
+    // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
+    // 3. Move the branch instruction from diamond_head into its own basic
+    //    block (new_block).
+    // 4. Add an unconditional branch from diamond_head to new_block
+    // 5. Replace the branch instruction in branch_from with an unconditional
+    //    branch to new_block.  If branch_from has multiple predecessors, then
+    //    we need to replace the True/False block in the branch
+    //    instruction instead of replacing it.
+    // 6. Change the condition of the branch instruction in new_block from
+    //    COND to (COND || GPR0)
+    //
+    // In order insert these MOV instruction, we will need to use the
+    // RegisterScavenger.  Usually liveness stops being tracked during
+    // the late machine optimization passes, however if we implement
+    // bool TargetRegisterInfo::requiresRegisterScavenging(
+    //                                                const MachineFunction &MF)
+    // and have it return true, liveness will be tracked correctly 
+    // by generic optimization passes.  We will also need to make sure that
+    // all of our target-specific passes that run after regalloc and before
+    // the CFGStructurizer track liveness and we will need to modify this pass
+    // to correctly track liveness.
+    //
+    // After the above changes, the new CFG should look like this:
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //                       \     /
+    //                      new_block
+    //                      /      |
+    //         diamond_false        diamond_true
+    //                      \      /
+    //                        done
+    //
+    // Without this optimization, we are forced to duplicate the diamond_true
+    // block and we will end up with a CFG like this:
+    //
+    //                        entry
+    //                       /     |
+    //           diamond_head       branch_from
+    //             /      \                   |
+    // diamond_false        diamond_true      diamond_true (duplicate)
+    //             \      /                   |
+    //               done --------------------|
+    //
+    // Duplicating diamond_true can be very costly especially if it has a
+    // lot of instructions.
+    return 0;
   }
 
-  int numNewBlk = 0;
-
-  if (landBlk == NULL) {
-    landBlk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(landBlk);  //insert to function
-
-    if (trueBlk) {
-      trueBlk->addSuccessor(landBlk);
-    } else {
-      headBlk->addSuccessor(landBlk);
-    }
-
-    if (falseBlk) {
-      falseBlk->addSuccessor(landBlk);
-    } else {
-      headBlk->addSuccessor(landBlk);
-    }
-
-    numNewBlk ++;
-  }
+  int NumNewBlk = 0;
 
-  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+  bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
 
   //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
-  typename BlockT::iterator insertPos =
-    CFGTraits::getInstrPos
-    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
-
-  if (landBlkHasOtherPred) {
-    unsigned immReg =
-      funcRep->getRegInfo().createVirtualRegister(I32RC);
-    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
-    unsigned cmpResReg =
-      funcRep->getRegInfo().createVirtualRegister(I32RC);
-
-    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
-                                        initReg, immReg);
-    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
-                                      AMDGPU::IF_PREDICATE_SET, passRep,
-                                      cmpResReg, DebugLoc());
-  }
-
-  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
-                                    passRep, initReg, DebugLoc());
-
-  if (migrateTrue) {
-    migrateInstruction(trueBlk, landBlk, insertPos);
+  MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, AMDGPU::ENDIF);
+
+  if (LandBlkHasOtherPred) {
+    llvm_unreachable("Extra register needed to handle CFG");
+    unsigned CmpResReg =
+      HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+    llvm_unreachable("Extra compare instruction needed to handle CFG");
+    insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
+        CmpResReg, DebugLoc());
+  }
+
+  // XXX: We are running this after RA, so creating virtual registers will
+  // cause an assertion failure in the PostRA scheduling pass.
+  unsigned InitReg =
+    HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+  insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
+      DebugLoc());
+
+  if (MigrateTrue) {
+    migrateInstruction(TrueMBB, LandBlk, I);
     // need to uncondionally insert the assignment to ensure a path from its
     // predecessor rather than headBlk has valid value in initReg if
     // (initVal != 1).
-    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+    llvm_unreachable("Extra register needed to handle CFG");
   }
-  CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
+  insertInstrBefore(I, AMDGPU::ELSE);
 
-  if (migrateFalse) {
-    migrateInstruction(falseBlk, landBlk, insertPos);
+  if (MigrateFalse) {
+    migrateInstruction(FalseMBB, LandBlk, I);
     // need to uncondionally insert the assignment to ensure a path from its
     // predecessor rather than headBlk has valid value in initReg if
     // (initVal != 0)
-    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+    llvm_unreachable("Extra register needed to handle CFG");
   }
 
-  if (landBlkHasOtherPred) {
+  if (LandBlkHasOtherPred) {
     // add endif
-    CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
+    insertInstrBefore(I, AMDGPU::ENDIF);
 
     // put initReg = 2 to other predecessors of landBlk
-    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
-         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
-         ++predIter) {
-      BlockT *curBlk = *predIter;
-      if (curBlk != trueBlk && curBlk != falseBlk) {
-        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
-      }
-    } //for
-  }
-  if (DEBUGME) {
-    errs() << "result from improveSimpleJumpintoIf: ";
-    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
-  }
-
-  // update landBlk
-  *plandBlk = landBlk;
-
-  return numNewBlk;
-} //improveSimpleJumpintoIf
-
-template<class PassT>
-void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
-                                              LoopT *exitingLoop,
-                                             BlockT *exitBlk,
-                                              LoopT *exitLoop,
-                                             BlockT *landBlk) {
-  if (DEBUGME) {
-    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
-           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
-  }
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-
-  RegiT initReg = INVALIDREGNUM;
-  if (exitingLoop != exitLoop) {
-    initReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    assert(initReg != INVALIDREGNUM);
-    addLoopBreakInitReg(exitLoop, initReg);
-    while (exitingLoop != exitLoop && exitingLoop) {
-      addLoopBreakOnReg(exitingLoop, initReg);
-      exitingLoop = exitingLoop->getParentLoop();
+    for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
+         PE = LandBlk->pred_end(); PI != PE; ++PI) {
+      MachineBasicBlock *MBB = *PI;
+      if (MBB != TrueMBB && MBB != FalseMBB)
+        llvm_unreachable("Extra register needed to handle CFG");
     }
-    assert(exitingLoop == exitLoop);
   }
+  DEBUG(
+    dbgs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
+  );
 
-  mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg);
+  // update landBlk
+  *LandMBBPtr = LandBlk;
 
-} //handleLoopbreak
+  return NumNewBlk;
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk,
-                                                  LoopT *contingLoop,
-                                                 BlockT *contBlk,
-                                                  LoopT *contLoop) {
-  if (DEBUGME) {
-    errs() << "loopcontPattern cont = BB" << contingBlk->getNumber()
-           << " header = BB" << contBlk->getNumber() << "\n";
+void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
+    MachineLoop *ContLoop) {
+  DEBUG(dbgs() << "loopcontPattern cont = BB" << ContingMBB->getNumber()
+               << " header = BB" << ContMBB->getNumber() << "\n";
+        dbgs() << "Trying to continue loop-depth = "
+               << getLoopDepth(ContLoop)
+               << " from loop-depth = " << getLoopDepth(ContingLoop) << "\n";);
+  settleLoopcontBlock(ContingMBB, ContMBB);
+}
 
-    errs() << "Trying to continue loop-depth = "
-           << getLoopDepth(contLoop)
-           << " from loop-depth = " << getLoopDepth(contingLoop) << "\n";
-  }
+void AMDGPUCFGStructurizer::mergeSerialBlock(MachineBasicBlock *DstMBB,
+    MachineBasicBlock *SrcMBB) {
+  DEBUG(
+    dbgs() << "serialPattern BB" << DstMBB->getNumber()
+           << " <= BB" << SrcMBB->getNumber() << "\n";
+  );
+  DstMBB->splice(DstMBB->end(), SrcMBB, SrcMBB->begin(), SrcMBB->end());
 
-  RegiT initReg = INVALIDREGNUM;
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  if (contingLoop != contLoop) {
-    initReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    assert(initReg != INVALIDREGNUM);
-    addLoopContInitReg(contLoop, initReg);
-    while (contingLoop && contingLoop->getParentLoop() != contLoop) {
-      addLoopBreakOnReg(contingLoop, initReg);  //not addLoopContOnReg
-      contingLoop = contingLoop->getParentLoop();
-    }
-    assert(contingLoop && contingLoop->getParentLoop() == contLoop);
-    addLoopContOnReg(contingLoop, initReg);
-  }
+  DstMBB->removeSuccessor(SrcMBB);
+  cloneSuccessorList(DstMBB, SrcMBB);
 
-  settleLoopcontBlock(contingBlk, contBlk, initReg);
-} //handleLoopcontBlock
+  removeSuccessor(SrcMBB);
+  MLI->removeBlock(SrcMBB);
+  retireBlock(SrcMBB);
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) {
-  if (DEBUGME) {
-    errs() << "serialPattern BB" << dstBlk->getNumber()
-           << " <= BB" << srcBlk->getNumber() << "\n";
-  }
-  dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end());
-
-  dstBlk->removeSuccessor(srcBlk);
-  CFGTraits::cloneSuccessorList(dstBlk, srcBlk);
-
-  removeSuccessor(srcBlk);
-  retireBlock(dstBlk, srcBlk);
-} //mergeSerialBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
-                                                  BlockT *curBlk,
-                                                  BlockT *trueBlk,
-                                                  BlockT *falseBlk,
-                                                  BlockT *landBlk) {
-  if (DEBUGME) {
-    errs() << "ifPattern BB" << curBlk->getNumber();
-    errs() << "{  ";
-    if (trueBlk) {
-      errs() << "BB" << trueBlk->getNumber();
-    }
-    errs() << "  } else ";
-    errs() << "{  ";
-    if (falseBlk) {
-      errs() << "BB" << falseBlk->getNumber();
-    }
-    errs() << "  }\n ";
-    errs() << "landBlock: ";
-    if (landBlk == NULL) {
-      errs() << "NULL";
+void AMDGPUCFGStructurizer::mergeIfthenelseBlock(MachineInstr *BranchMI,
+    MachineBasicBlock *MBB, MachineBasicBlock *TrueMBB,
+    MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB) {
+  assert (TrueMBB);
+  DEBUG(
+    dbgs() << "ifPattern BB" << MBB->getNumber();
+    dbgs() << "{  ";
+    if (TrueMBB) {
+      dbgs() << "BB" << TrueMBB->getNumber();
+    }
+    dbgs() << "  } else ";
+    dbgs() << "{  ";
+    if (FalseMBB) {
+      dbgs() << "BB" << FalseMBB->getNumber();
+    }
+    dbgs() << "  }\n ";
+    dbgs() << "landBlock: ";
+    if (!LandMBB) {
+      dbgs() << "NULL";
     } else {
-      errs() << "BB" << landBlk->getNumber();
+      dbgs() << "BB" << LandMBB->getNumber();
     }
-    errs() << "\n";
-  }
+    dbgs() << "\n";
+  );
 
-  int oldOpcode = branchInstr->getOpcode();
-  DebugLoc branchDL = branchInstr->getDebugLoc();
+  int OldOpcode = BranchMI->getOpcode();
+  DebugLoc BranchDL = BranchMI->getDebugLoc();
 
 //    transform to
 //    if cond
@@ -1391,1661 +1531,374 @@ void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr,
 //    endif
 //    landBlk
 
-  typename BlockT::iterator branchInstrPos =
-    CFGTraits::getInstrPos(curBlk, branchInstr);
-  CFGTraits::insertCondBranchBefore(branchInstrPos,
-                                    CFGTraits::getBranchNzeroOpcode(oldOpcode),
-                                    passRep,
-                                    branchDL);
-
-  if (trueBlk) {
-    curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end());
-    curBlk->removeSuccessor(trueBlk);
-    if (landBlk && trueBlk->succ_size()!=0) {
-      trueBlk->removeSuccessor(landBlk);
-    }
-    retireBlock(curBlk, trueBlk);
-  }
-  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep);
-
-  if (falseBlk) {
-    curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(),
-                   falseBlk->end());
-    curBlk->removeSuccessor(falseBlk);
-    if (landBlk && falseBlk->succ_size() != 0) {
-      falseBlk->removeSuccessor(landBlk);
-    }
-    retireBlock(curBlk, falseBlk);
-  }
-  CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep);
-
-  branchInstr->eraseFromParent();
+  MachineBasicBlock::iterator I = BranchMI;
+  insertCondBranchBefore(I, getBranchNzeroOpcode(OldOpcode),
+      BranchDL);
 
-  if (landBlk && trueBlk && falseBlk) {
-    curBlk->addSuccessor(landBlk);
+  if (TrueMBB) {
+    MBB->splice(I, TrueMBB, TrueMBB->begin(), TrueMBB->end());
+    MBB->removeSuccessor(TrueMBB);
+    if (LandMBB && TrueMBB->succ_size()!=0)
+      TrueMBB->removeSuccessor(LandMBB);
+    retireBlock(TrueMBB);
+    MLI->removeBlock(TrueMBB);
   }
 
-} //mergeIfthenelseBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
-                                                LoopLandInfo *loopLand) {
-  BlockT *landBlk = loopLand->landBlk;
-
-  if (DEBUGME) {
-    errs() << "loopPattern header = BB" << dstBlk->getNumber()
-           << " land = BB" << landBlk->getNumber() << "\n";
+  if (FalseMBB) {
+    insertInstrBefore(I, AMDGPU::ELSE);
+    MBB->splice(I, FalseMBB, FalseMBB->begin(),
+                   FalseMBB->end());
+    MBB->removeSuccessor(FalseMBB);
+    if (LandMBB && FalseMBB->succ_size() != 0)
+      FalseMBB->removeSuccessor(LandMBB);
+    retireBlock(FalseMBB);
+    MLI->removeBlock(FalseMBB);
   }
+  insertInstrBefore(I, AMDGPU::ENDIF);
 
-  // Loop contInitRegs are init at the beginning of the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->contInitRegs.begin(),
-       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
+  BranchMI->eraseFromParent();
 
-  /* we last inserterd the DebugLoc in the
-   * BREAK_LOGICALZ_i32 or AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
-   * search for the DebugLoc in the that statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
-  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
-
-  CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
-  // Loop breakInitRegs are init before entering the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->breakInitRegs.begin(),
-       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
-  // Loop endbranchInitRegs are init before entering the loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->endbranchInitRegs.begin(),
-       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
-  }
+  if (LandMBB && TrueMBB && FalseMBB)
+    MBB->addSuccessor(LandMBB);
 
-  /* we last inserterd the DebugLoc in the continue statement in the current dstBlk
-   * search for the DebugLoc in the continue statement.
-   * if not found, we have to insert the empty/default DebugLoc */
-  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
-  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
-
-  CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
-  // Loop breakOnRegs are check after the ENDLOOP: break the loop outside this
-  // loop.
-  for (typename std::set<RegiT>::const_iterator iter =
-         loopLand->breakOnRegs.begin(),
-       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
-                                   *iter);
-  }
-
-  // Loop contOnRegs are check after the ENDLOOP: cont the loop outside this
-  // loop.
-  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
-       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
-    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
-                                   passRep, *iter);
-  }
-
-  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
-
-  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
-       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
-    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
-  }
-
-  removeSuccessor(landBlk);
-  retireBlock(dstBlk, landBlk);
-} //mergeLooplandBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
-  while (I--) {
-    if (I->getOpcode() == AMDGPU::PRED_X) {
-      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
-      case OPCODE_IS_ZERO_INT:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
-        return;
-      case OPCODE_IS_NOT_ZERO_INT:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
-        return;
-      case OPCODE_IS_ZERO:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
-        return;
-      case OPCODE_IS_NOT_ZERO:
-        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
-        return;
-      default:
-        assert(0 && "PRED_X Opcode invalid!");
-      }
-    }
-  }
 }
 
-template<class PassT>
-void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
-                                                 BlockT *exitBlk,
-                                                 BlockT *exitLandBlk,
-                                                 RegiT  setReg) {
-  if (DEBUGME) {
-    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
-           << " exit = BB" << exitBlk->getNumber()
-           << " land = BB" << exitLandBlk->getNumber() << "\n";
-  }
+void AMDGPUCFGStructurizer::mergeLooplandBlock(MachineBasicBlock *DstBlk,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopPattern header = BB" << DstBlk->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
 
-  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
-  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
-
-  DebugLoc DL = branchInstr->getDebugLoc();
-
-  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
-
-  //    transform exitingBlk to
-  //    if ( ) {
-  //       exitBlk (if exitBlk != exitLandBlk)
-  //       setReg = 1
-  //       break
-  //    }endif
-  //    successor = {orgSuccessor(exitingBlk) - exitBlk}
-
-  typename BlockT::iterator branchInstrPos =
-    CFGTraits::getInstrPos(exitingBlk, branchInstr);
-
-  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
-    //break_logical
+  insertInstrBefore(DstBlk, AMDGPU::WHILELOOP, DebugLoc());
+  insertInstrEnd(DstBlk, AMDGPU::ENDLOOP, DebugLoc());
+  DstBlk->addSuccessor(LandMBB);
+  DstBlk->removeSuccessor(DstBlk);
+}
 
-    if (trueBranch != exitBlk) {
-      reversePredicateSetter(branchInstrPos);
-    }
-    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
-  } else {
-    if (trueBranch != exitBlk) {
-      reversePredicateSetter(branchInstr);
-    }
-    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
-    if (exitBlk != exitLandBlk) {
-      //splice is insert-before ...
-      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
-                         exitBlk->end());
-    }
-    if (setReg != INVALIDREGNUM) {
-      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
-    }
-    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
-  } //if_logical
 
+void AMDGPUCFGStructurizer::mergeLoopbreakBlock(MachineBasicBlock *ExitingMBB,
+    MachineBasicBlock *LandMBB) {
+  DEBUG(dbgs() << "loopbreakPattern exiting = BB" << ExitingMBB->getNumber()
+               << " land = BB" << LandMBB->getNumber() << "\n";);
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(ExitingMBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DebugLoc DL = BranchMI->getDebugLoc();
+  MachineBasicBlock *TrueBranch = getTrueBranch(BranchMI);
+  MachineBasicBlock::iterator I = BranchMI;
+  if (TrueBranch != LandMBB)
+    reversePredicateSetter(I);
+  insertCondBranchBefore(ExitingMBB, I, AMDGPU::IF_PREDICATE_SET, AMDGPU::PREDICATE_BIT, DL);
+  insertInstrBefore(I, AMDGPU::BREAK);
+  insertInstrBefore(I, AMDGPU::ENDIF);
   //now branchInst can be erase safely
-  branchInstr->eraseFromParent();
-
+  BranchMI->eraseFromParent();
   //now take care of successors, retire blocks
-  exitingBlk->removeSuccessor(exitBlk);
-  if (exitBlk != exitLandBlk) {
-    //splice is insert-before ...
-    exitBlk->removeSuccessor(exitLandBlk);
-    retireBlock(exitingBlk, exitBlk);
-  }
-
-} //mergeLoopbreakBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
-                                                 BlockT *contBlk,
-                                                 RegiT   setReg) {
-  if (DEBUGME) {
-    errs() << "settleLoopcontBlock conting = BB"
-           << contingBlk->getNumber()
-           << ", cont = BB" << contBlk->getNumber() << "\n";
-  }
-
-  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
-  if (branchInstr) {
-    assert(CFGTraits::isCondBranch(branchInstr));
-    typename BlockT::iterator branchInstrPos =
-      CFGTraits::getInstrPos(contingBlk, branchInstr);
-    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
-    int oldOpcode = branchInstr->getOpcode();
-    DebugLoc DL = branchInstr->getDebugLoc();
-
-    //    transform contingBlk to
-    //     if () {
-    //          move instr after branchInstr
-    //          continue
-    //        or
-    //          setReg = 1
-    //          break
-    //     }endif
-    //     successor = {orgSuccessor(contingBlk) - loopHeader}
-
-    bool useContinueLogical = 
-      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
-
-    if (useContinueLogical == false) {
-      int branchOpcode =
-        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
-                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
-
-      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
-
-      if (setReg != INVALIDREGNUM) {
-        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
-        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
-      } else {
-        // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
-      }
+  ExitingMBB->removeSuccessor(LandMBB);
+}
 
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
+void AMDGPUCFGStructurizer::settleLoopcontBlock(MachineBasicBlock *ContingMBB,
+    MachineBasicBlock *ContMBB) {
+  DEBUG(dbgs() << "settleLoopcontBlock conting = BB"
+               << ContingMBB->getNumber()
+               << ", cont = BB" << ContMBB->getNumber() << "\n";);
+
+  MachineInstr *MI = getLoopendBlockBranchInstr(ContingMBB);
+  if (MI) {
+    assert(isCondBranch(MI));
+    MachineBasicBlock::iterator I = MI;
+    MachineBasicBlock *TrueBranch = getTrueBranch(MI);
+    int OldOpcode = MI->getOpcode();
+    DebugLoc DL = MI->getDebugLoc();
+
+    bool UseContinueLogical = ((&*ContingMBB->rbegin()) == MI);
+
+    if (UseContinueLogical == false) {
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getBranchNzeroOpcode(OldOpcode) :
+          getBranchZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
+      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
+      insertInstrEnd(ContingMBB, AMDGPU::CONTINUE, DL);
+      insertInstrEnd(ContingMBB, AMDGPU::ENDIF, DL);
     } else {
-      int branchOpcode =
-        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
-                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
-
-      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+      int BranchOpcode =
+          TrueBranch == ContMBB ? getContinueNzeroOpcode(OldOpcode) :
+          getContinueZeroOpcode(OldOpcode);
+      insertCondBranchBefore(I, BranchOpcode, DL);
     }
 
-    branchInstr->eraseFromParent();
+    MI->eraseFromParent();
   } else {
     // if we've arrived here then we've already erased the branch instruction
-    // travel back up the basic block to see the last reference of our debug location
-    // we've just inserted that reference here so it should be representative
-    if (setReg != INVALIDREGNUM) {
-      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
-      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
-    } else {
-      // insertEnd to ensure phi-moves, if exist, go before the continue-instr.
-      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
-    }
-  } //else
-
-} //settleLoopcontBlock
-
-// BBs in exitBlkSet are determined as in break-path for loopRep,
-// before we can put code for BBs as inside loop-body for loopRep
-// check whether those BBs are determined as cont-BB for parentLoopRep
-// earlier.
-// If so, generate a new BB newBlk
-//    (1) set newBlk common successor of BBs in exitBlkSet
-//    (2) change the continue-instr in BBs in exitBlkSet to break-instr
-//    (3) generate continue-instr in newBlk
-//
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
-                                              LoopT *loopRep,
-                                              std::set<BlockT *> &exitBlkSet,
-                                              BlockT *exitLandBlk) {
-  std::set<BlockT *> endBlkSet;
-
-
-
-  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
-       iterEnd = exitBlkSet.end();
-       iter != iterEnd; ++iter) {
-    BlockT *exitBlk = *iter;
-    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
-
-    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
-      return NULL;
-
-    endBlkSet.insert(endBlk);
-  }
-
-  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(newBlk);  //insert to function
-  CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
-  SHOWNEWBLK(newBlk, "New continue block: ");
-
-  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
-       iterEnd = endBlkSet.end();
-       iter != iterEnd; ++iter) {
-      BlockT *endBlk = *iter;
-      InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
-      if (contInstr) {
-        contInstr->eraseFromParent();
-      }
-      endBlk->addSuccessor(newBlk);
-      if (DEBUGME) {
-        errs() << "Add new continue Block to BB"
-               << endBlk->getNumber() << " successors\n";
-      }
-  }
-
-  return newBlk;
-} //relocateLoopcontBlock
-
-
-// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
-// LoopLandBlock. This BB branch on the loop endBranchInit register to the
-// pathes corresponding to the loop exiting branches.
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
-                                              BlockTSmallerVector &exitingBlks,
-                                              BlockTSmallerVector &exitBlks) {
-  const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-
-  RegiT endBranchReg = static_cast<int>
-    (funcRep->getRegInfo().createVirtualRegister(I32RC));
-  assert(endBranchReg >= 0);
-
-  // reg = 0 before entering the loop
-  addLoopEndbranchInitReg(loopRep, endBranchReg);
-
-  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
-  assert(numBlks >=2 && numBlks == exitBlks.size());
-
-  BlockT *preExitingBlk = exitingBlks[0];
-  BlockT *preExitBlk = exitBlks[0];
-  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(preBranchBlk);  //insert to function
-  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
-
-  BlockT *newLandBlk = preBranchBlk;
-
-      CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
-        newLandBlk);
-  preExitingBlk->removeSuccessor(preExitBlk);
-  preExitingBlk->addSuccessor(newLandBlk);
-
-  //it is redundant to add reg = 0 to exitingBlks[0]
-
-  // For 1..n th exiting path (the last iteration handles two pathes) create the
-  // branch to the previous path and the current path.
-  for (uint32_t i = 1; i < numBlks; ++i) {
-    BlockT *curExitingBlk = exitingBlks[i];
-    BlockT *curExitBlk = exitBlks[i];
-    BlockT *curBranchBlk;
-
-    if (i == numBlks - 1) {
-      curBranchBlk = curExitBlk;
-    } else {
-      curBranchBlk = funcRep->CreateMachineBasicBlock();
-      funcRep->push_back(curBranchBlk);  //insert to function
-      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
-    }
-
-    // Add reg = i to exitingBlks[i].
-    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
-                                       endBranchReg, i);
-
-    // Remove the edge (exitingBlks[i] exitBlks[i]) add new edge
-    // (exitingBlks[i], newLandBlk).
-    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
-                                          newLandBlk);
-    curExitingBlk->removeSuccessor(curExitBlk);
-    curExitingBlk->addSuccessor(newLandBlk);
-
-    // add to preBranchBlk the branch instruction:
-    // if (endBranchReg == preVal)
-    //    preExitBlk
-    // else
-    //    curBranchBlk
-    //
-    // preValReg = i - 1
-
-  DebugLoc DL;
-  RegiT preValReg = static_cast<int>
-    (funcRep->getRegInfo().createVirtualRegister(I32RC));
-
-  preBranchBlk->insert(preBranchBlk->begin(),
-                       tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
-                       i - 1));
-
-  // condResReg = (endBranchReg == preValReg)
-    RegiT condResReg = static_cast<int>
-      (funcRep->getRegInfo().createVirtualRegister(I32RC));
-    BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
-      .addReg(endBranchReg).addReg(preValReg);
-
-    BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
-      .addMBB(preExitBlk).addReg(condResReg);
-
-    preBranchBlk->addSuccessor(preExitBlk);
-    preBranchBlk->addSuccessor(curBranchBlk);
-
-    // Update preExitingBlk, preExitBlk, preBranchBlk.
-    preExitingBlk = curExitingBlk;
-    preExitBlk = curExitBlk;
-    preBranchBlk = curBranchBlk;
-
-  }  //end for 1 .. n blocks
-
-  return newLandBlk;
-} //addLoopEndbranchBlock
-
-template<class PassT>
-typename CFGStructurizer<PassT>::PathToKind
-CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
-                                     bool allowSideEntry) {
-  assert(dstBlk);
-
-  if (srcBlk == dstBlk) {
-    return SinglePath_InPath;
+    // travel back up the basic block to see the last reference of our debug
+    // location we've just inserted that reference here so it should be
+    // representative insertEnd to ensure phi-moves, if exist, go before the
+    // continue-instr.
+    insertInstrEnd(ContingMBB, AMDGPU::CONTINUE,
+        getLastDebugLocInBB(ContingMBB));
   }
+}
 
-  while (srcBlk && srcBlk->succ_size() == 1) {
-    srcBlk = *srcBlk->succ_begin();
-    if (srcBlk == dstBlk) {
-      return SinglePath_InPath;
-    }
-
-    if (!allowSideEntry && srcBlk->pred_size() > 1) {
-      return Not_SinglePath;
-    }
-  }
-
-  if (srcBlk && srcBlk->succ_size()==0) {
-    return SinglePath_NotInPath;
-  }
-
-  return Not_SinglePath;
-} //singlePathTo
-
-// If there is a single path from srcBlk to dstBlk, return the last block before
-// dstBlk If there is a single path from srcBlk->end without dstBlk, return the
-// last block in the path Otherwise, return NULL
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
-                                      bool allowSideEntry) {
-  assert(dstBlk);
-
-  if (srcBlk == dstBlk) {
-    return srcBlk;
-  }
-
-  if (srcBlk->succ_size() == 0) {
-    return srcBlk;
-  }
-
-  while (srcBlk && srcBlk->succ_size() == 1) {
-    BlockT *preBlk = srcBlk;
-
-    srcBlk = *srcBlk->succ_begin();
-    if (srcBlk == NULL) {
-      return preBlk;
-    }
-
-    if (!allowSideEntry && srcBlk->pred_size() > 1) {
-      return NULL;
-    }
-  }
-
-  if (srcBlk && srcBlk->succ_size()==0) {
-    return srcBlk;
-  }
-
-  return NULL;
-
-} //singlePathEnd
-
-template<class PassT>
-int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
-                                               BlockT *dstBlk) {
-  int cloned = 0;
-  assert(preBlk->isSuccessor(srcBlk));
-  while (srcBlk && srcBlk != dstBlk) {
-    assert(srcBlk->succ_size() == 1);
-    if (srcBlk->pred_size() > 1) {
-      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
-      ++cloned;
+int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
+    MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
+  int Cloned = 0;
+  assert(PreMBB->isSuccessor(SrcMBB));
+  while (SrcMBB && SrcMBB != DstMBB) {
+    assert(SrcMBB->succ_size() == 1);
+    if (SrcMBB->pred_size() > 1) {
+      SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
+      ++Cloned;
     }
 
-    preBlk = srcBlk;
-    srcBlk = *srcBlk->succ_begin();
+    PreMBB = SrcMBB;
+    SrcMBB = *SrcMBB->succ_begin();
   }
 
-  return cloned;
-} //cloneOnSideEntryTo
+  return Cloned;
+}
 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
-                                                 BlockT *predBlk) {
-  assert(predBlk->isSuccessor(curBlk) &&
+MachineBasicBlock *
+AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
+    MachineBasicBlock *PredMBB) {
+  assert(PredMBB->isSuccessor(MBB) &&
          "succBlk is not a prececessor of curBlk");
 
-  BlockT *cloneBlk = CFGTraits::clone(curBlk);  //clone instructions
-  CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk);
+  MachineBasicBlock *CloneMBB = clone(MBB);  //clone instructions
+  replaceInstrUseOfBlockWith(PredMBB, MBB, CloneMBB);
   //srcBlk, oldBlk, newBlk
 
-  predBlk->removeSuccessor(curBlk);
-  predBlk->addSuccessor(cloneBlk);
+  PredMBB->removeSuccessor(MBB);
+  PredMBB->addSuccessor(CloneMBB);
 
   // add all successor to cloneBlk
-  CFGTraits::cloneSuccessorList(cloneBlk, curBlk);
+  cloneSuccessorList(CloneMBB, MBB);
 
-  numClonedInstr += curBlk->size();
+  numClonedInstr += MBB->size();
 
-  if (DEBUGME) {
-    errs() << "Cloned block: " << "BB"
-           << curBlk->getNumber() << "size " << curBlk->size() << "\n";
-  }
+  DEBUG(
+    dbgs() << "Cloned block: " << "BB"
+           << MBB->getNumber() << "size " << MBB->size() << "\n";
+  );
 
-  SHOWNEWBLK(cloneBlk, "result of Cloned block: ");
+  SHOWNEWBLK(CloneMBB, "result of Cloned block: ");
 
-  return cloneBlk;
-} //cloneBlockForPredecessor
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep,
-                                               BlockT *exitingBlk) {
-  BlockT *exitBlk = NULL;
-
-  for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(),
-       iterSuccEnd = exitingBlk->succ_end();
-       iterSucc != iterSuccEnd; ++iterSucc) {
-    BlockT *curBlk = *iterSucc;
-    if (!loopRep->contains(curBlk)) {
-      assert(exitBlk == NULL);
-      exitBlk = curBlk;
-    }
-  }
-
-  assert(exitBlk != NULL);
-
-  return exitBlk;
-} //exitingBlock2ExitBlock
+  return CloneMBB;
+}
 
-template<class PassT>
-void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk,
-                                                BlockT *dstBlk,
-                                                InstrIterator insertPos) {
-  InstrIterator spliceEnd;
+void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
+    MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
+  MachineBasicBlock::iterator SpliceEnd;
   //look for the input branchinstr, not the AMDGPU branchinstr
-  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
-  if (branchInstr == NULL) {
-    if (DEBUGME) {
-      errs() << "migrateInstruction don't see branch instr\n" ;
-    }
-    spliceEnd = srcBlk->end();
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
+  if (!BranchMI) {
+    DEBUG(
+      dbgs() << "migrateInstruction don't see branch instr\n" ;
+    );
+    SpliceEnd = SrcMBB->end();
   } else {
-    if (DEBUGME) {
-      errs() << "migrateInstruction see branch instr\n" ;
-      branchInstr->dump();
-    }
-    spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr);
-  }
-  if (DEBUGME) {
-    errs() << "migrateInstruction before splice dstSize = " << dstBlk->size()
-      << "srcSize = " << srcBlk->size() << "\n";
+    DEBUG(
+      dbgs() << "migrateInstruction see branch instr\n" ;
+      BranchMI->dump();
+    );
+    SpliceEnd = BranchMI;
   }
+  DEBUG(
+    dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
+  );
 
   //splice insert before insertPos
-  dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd);
+  DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
 
-  if (DEBUGME) {
-    errs() << "migrateInstruction after splice dstSize = " << dstBlk->size()
-      << "srcSize = " << srcBlk->size() << "\n";
-  }
-} //migrateInstruction
+  DEBUG(
+    dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
+      << "srcSize = " << SrcMBB->size() << "\n";
+  );
+}
 
-// normalizeInfiniteLoopExit change
-//   B1:
-//        uncond_br LoopHeader
-//
-// to
-//   B1:
-//        cond_br 1 LoopHeader dummyExit
-// and return the newly added dummy exit block
-// 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) {
-  BlockT *loopHeader;
-  BlockT *loopLatch;
-  loopHeader = LoopRep->getHeader();
-  loopLatch = LoopRep->getLoopLatch();
-  BlockT *dummyExitBlk = NULL;
+MachineBasicBlock *
+AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
+  MachineBasicBlock *LoopHeader = LoopRep->getHeader();
+  MachineBasicBlock *LoopLatch = LoopRep->getLoopLatch();
   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
-  if (loopHeader!=NULL && loopLatch!=NULL) {
-    InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch);
-    if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) {
-      dummyExitBlk = funcRep->CreateMachineBasicBlock();
-      funcRep->push_back(dummyExitBlk);  //insert to function
-      SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
-
-      if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n";
-
-      typename BlockT::iterator insertPos =
-        CFGTraits::getInstrPos(loopLatch, branchInstr);
-      unsigned immReg =
-        funcRep->getRegInfo().createVirtualRegister(I32RC);
-      CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1);
-      InstrT *newInstr = 
-        CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep);
-      MachineInstrBuilder MIB(*funcRep, newInstr);
-      MIB.addMBB(loopHeader);
-      MIB.addReg(immReg, false);
-
-      SHOWNEWINSTR(newInstr);
-
-      branchInstr->eraseFromParent();
-      loopLatch->addSuccessor(dummyExitBlk);
-    }
-  }
 
-  return dummyExitBlk;
-} //normalizeInfiniteLoopExit
+  if (!LoopHeader || !LoopLatch)
+    return NULL;
+  MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
+  // Is LoopRep an infinite loop ?
+  if (!BranchMI || !isUncondBranch(BranchMI))
+    return NULL;
 
-template<class PassT>
-void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
-  InstrT *branchInstr;
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock to normalize infiniteLoop: ");
+  DEBUG(dbgs() << "Old branch instr: " << *BranchMI << "\n";);
+  MachineBasicBlock::iterator I = BranchMI;
+  unsigned ImmReg = FuncRep->getRegInfo().createVirtualRegister(I32RC);
+  llvm_unreachable("Extra register needed to handle CFG");
+  MachineInstr *NewMI = insertInstrBefore(I, AMDGPU::BRANCH_COND_i32);
+  MachineInstrBuilder MIB(*FuncRep, NewMI);
+  MIB.addMBB(LoopHeader);
+  MIB.addReg(ImmReg, false);
+  SHOWNEWINSTR(NewMI);
+  BranchMI->eraseFromParent();
+  LoopLatch->addSuccessor(DummyExitBlk);
+
+  return DummyExitBlk;
+}
+
+void AMDGPUCFGStructurizer::removeUnconditionalBranch(MachineBasicBlock *MBB) {
+  MachineInstr *BranchMI;
 
   // I saw two unconditional branch in one basic block in example
   // test_fc_do_while_or.c need to fix the upstream on this to remove the loop.
-  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
-          && CFGTraits::isUncondBranch(branchInstr)) {
-    if (DEBUGME) {
-          errs() << "Removing unconditional branch instruction" ;
-      branchInstr->dump();
-    }
-    branchInstr->eraseFromParent();
-  }
-} //removeUnconditionalBranch
-
-template<class PassT>
-void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
-  if (srcBlk->succ_size() == 2) {
-    BlockT *blk1 = *srcBlk->succ_begin();
-    BlockT *blk2 = *(++srcBlk->succ_begin());
-
-    if (blk1 == blk2) {
-      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
-      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
-      if (DEBUGME) {
-        errs() << "Removing unneeded conditional branch instruction" ;
-        branchInstr->dump();
-      }
-      branchInstr->eraseFromParent();
-      SHOWNEWBLK(blk1, "Removing redundant successor");
-      srcBlk->removeSuccessor(blk1);
-    }
-  }
-} //removeRedundantConditionalBranch
-
-template<class PassT>
-void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
-                                               DEFAULT_VEC_SLOTS> &retBlks) {
-  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
-  funcRep->push_back(dummyExitBlk);  //insert to function
-  CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
-
-  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
-         retBlks.begin(),
-       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
-    BlockT *curBlk = *iter;
-    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
-    if (curInstr) {
-      curInstr->eraseFromParent();
-    }
-    curBlk->addSuccessor(dummyExitBlk);
-    if (DEBUGME) {
-      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
-             << " successors\n";
-    }
-  } //for
-
-  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
-} //addDummyExitBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
-  while (srcBlk->succ_size()) {
-    srcBlk->removeSuccessor(*srcBlk->succ_begin());
+  while ((BranchMI = getLoopendBlockBranchInstr(MBB))
+          && isUncondBranch(BranchMI)) {
+    DEBUG(dbgs() << "Removing uncond branch instr"; BranchMI->dump(););
+    BranchMI->eraseFromParent();
   }
 }
 
-template<class PassT>
-void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
-  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+void AMDGPUCFGStructurizer::removeRedundantConditionalBranch(
+    MachineBasicBlock *MBB) {
+  if (MBB->succ_size() != 2)
+    return;
+  MachineBasicBlock *MBB1 = *MBB->succ_begin();
+  MachineBasicBlock *MBB2 = *llvm::next(MBB->succ_begin());
+  if (MBB1 != MBB2)
+    return;
+
+  MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
+  assert(BranchMI && isCondBranch(BranchMI));
+  DEBUG(dbgs() << "Removing unneeded cond branch instr"; BranchMI->dump(););
+  BranchMI->eraseFromParent();
+  SHOWNEWBLK(MBB1, "Removing redundant successor");
+  MBB->removeSuccessor(MBB1);
+}
 
-  if (srcBlkInfo == NULL) {
-    srcBlkInfo = new BlockInfo();
+void AMDGPUCFGStructurizer::addDummyExitBlock(
+    SmallVectorImpl<MachineBasicBlock*> &RetMBB) {
+  MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
+  FuncRep->push_back(DummyExitBlk);  //insert to function
+  insertInstrEnd(DummyExitBlk, AMDGPU::RETURN);
+
+  for (SmallVectorImpl<MachineBasicBlock *>::iterator It = RetMBB.begin(),
+       E = RetMBB.end(); It != E; ++It) {
+    MachineBasicBlock *MBB = *It;
+    MachineInstr *MI = getReturnInstr(MBB);
+    if (MI)
+      MI->eraseFromParent();
+    MBB->addSuccessor(DummyExitBlk);
+    DEBUG(
+      dbgs() << "Add dummyExitBlock to BB" << MBB->getNumber()
+             << " successors\n";
+    );
   }
+  SHOWNEWBLK(DummyExitBlk, "DummyExitBlock: ");
+}
 
-  srcBlkInfo->sccNum = sccNum;
+void AMDGPUCFGStructurizer::removeSuccessor(MachineBasicBlock *MBB) {
+  while (MBB->succ_size())
+    MBB->removeSuccessor(*MBB->succ_begin());
 }
 
-template<class PassT>
-int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
-  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
-  return srcBlkInfo ? srcBlkInfo->sccNum : INVALIDSCCNUM;
+void AMDGPUCFGStructurizer::recordSccnum(MachineBasicBlock *MBB,
+    int SccNum) {
+  BlockInformation *&srcBlkInfo = BlockInfoMap[MBB];
+  if (!srcBlkInfo)
+    srcBlkInfo = new BlockInformation();
+  srcBlkInfo->SccNum = SccNum;
 }
 
-template<class PassT>
-void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) {
-  if (DEBUGME) {
-        errs() << "Retiring BB" << srcBlk->getNumber() << "\n";
-  }
+void AMDGPUCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
+  DEBUG(
+        dbgs() << "Retiring BB" << MBB->getNumber() << "\n";
+  );
 
-  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+  BlockInformation *&SrcBlkInfo = BlockInfoMap[MBB];
 
-  if (srcBlkInfo == NULL) {
-    srcBlkInfo = new BlockInfo();
-  }
+  if (!SrcBlkInfo)
+    SrcBlkInfo = new BlockInformation();
 
-  srcBlkInfo->isRetired = true;
-  assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0
+  SrcBlkInfo->IsRetired = true;
+  assert(MBB->succ_size() == 0 && MBB->pred_size() == 0
          && "can't retire block yet");
 }
 
-template<class PassT>
-bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) {
-  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
-  return (srcBlkInfo && srcBlkInfo->isRetired);
-}
-
-template<class PassT>
-bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  while (loopRep && loopRep->getHeader() == curBlk) {
-    LoopLandInfo *loopLand = getLoopLandInfo(loopRep);
-
-    if(loopLand == NULL)
-      return true;
-
-    BlockT *landBlk = loopLand->landBlk;
-    assert(landBlk);
-    if (!isRetiredBlock(landBlk)) {
-      return true;
-    }
-
-    loopRep = loopRep->getParentLoop();
-  }
-
-  return false;
-} //isActiveLoophead
-
-template<class PassT>
-bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) {
-  const unsigned blockSizeThreshold = 30;
-  const unsigned cloneInstrThreshold = 100;
-
-  bool multiplePreds = blk && (blk->pred_size() > 1);
-
-  if(!multiplePreds)
-    return false;
-
-  unsigned blkSize = blk->size();
-  return ((blkSize > blockSizeThreshold)
-          && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold));
-} //needMigrateBlock
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk,
-                                            BlockTSmallerVector &exitBlks,
-                                            std::set<BlockT *> &exitBlkSet) {
-  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks;  //in exit path blocks
-
-  for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
-       predIterEnd = landBlk->pred_end();
-       predIter != predIterEnd; ++predIter) {
-    BlockT *curBlk = *predIter;
-    if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) {
-      inpathBlks.push_back(curBlk);
-    }
-  } //for
-
-  //if landBlk has predecessors that are not in the given loop,
-  //create a new block
-  BlockT *newLandBlk = landBlk;
-  if (inpathBlks.size() != landBlk->pred_size()) {
-    newLandBlk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(newLandBlk);  //insert to function
-    newLandBlk->addSuccessor(landBlk);
-    for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter =
-         inpathBlks.begin(),
-         iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) {
-      BlockT *curBlk = *iter;
-      CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk);
-      //srcBlk, oldBlk, newBlk
-      curBlk->removeSuccessor(landBlk);
-      curBlk->addSuccessor(newLandBlk);
-    }
-    for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) {
-      if (exitBlks[i] == landBlk) {
-        exitBlks[i] = newLandBlk;
-      }
-    }
-    SHOWNEWBLK(newLandBlk, "NewLandingBlock: ");
-  }
-
-  setLoopLandBlock(loopRep, newLandBlk);
-
-  return newLandBlk;
-} // recordLoopbreakLand
-
-template<class PassT>
-void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  assert(theEntry->landBlk == NULL);
-
-  if (blk == NULL) {
-    blk = funcRep->CreateMachineBasicBlock();
-    funcRep->push_back(blk);  //insert to function
-    SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: ");
-  }
-
-  theEntry->landBlk = blk;
-
-  if (DEBUGME) {
-    errs() << "setLoopLandBlock loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  landing-block = BB" << blk->getNumber() << "\n";
-  }
-} // setLoopLandBlock
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-
-  theEntry->breakOnRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopBreakOnReg loop-header = BB"
+void AMDGPUCFGStructurizer::setLoopLandBlock(MachineLoop *loopRep,
+    MachineBasicBlock *MBB) {
+  MachineBasicBlock *&TheEntry = LLInfoMap[loopRep];
+  if (!MBB) {
+    MBB = FuncRep->CreateMachineBasicBlock();
+    FuncRep->push_back(MBB);  //insert to function
+    SHOWNEWBLK(MBB, "DummyLandingBlock for loop without break: ");
+  }
+  TheEntry = MBB;
+  DEBUG(
+    dbgs() << "setLoopLandBlock loop-header = BB"
            << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopBreakOnReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->contOnRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopContOnReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopContOnReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->breakInitRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopBreakInitReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopBreakInitReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->contInitRegs.insert(regNum);
-
-  if (DEBUGME) {
-    errs() << "addLoopContInitReg loop-header = BB"
-           << loopRep->getHeader()->getNumber()
-           << "  regNum = " << regNum << "\n";
-  }
-} // addLoopContInitReg
-
-template<class PassT>
-void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep,
-                                                     RegiT regNum) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  if (theEntry == NULL) {
-    theEntry = new LoopLandInfo();
-  }
-  theEntry->endbranchInitRegs.insert(regNum);
-
-  if (DEBUGME) {
-        errs() << "addLoopEndbranchInitReg loop-header = BB"
-      << loopRep->getHeader()->getNumber()
-      << "  regNum = " << regNum << "\n";
-  }
-} // addLoopEndbranchInitReg
-
-template<class PassT>
-typename CFGStructurizer<PassT>::LoopLandInfo *
-CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  return theEntry;
-} // getLoopLandInfo
-
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) {
-  LoopLandInfo *&theEntry = loopLandInfoMap[loopRep];
-
-  return theEntry ? theEntry->landBlk : NULL;
-} // getLoopLandBlock
-
-
-template<class PassT>
-bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) {
-  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
-  if (loopRep == NULL)
-    return false;
-
-  BlockT *loopHeader = loopRep->getHeader();
-
-  return curBlk->isSuccessor(loopHeader);
-
-} //hasBackEdge
-
-template<class PassT>
-unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) {
-  return loopRep ? loopRep->getLoopDepth() : 0;
-} //getLoopDepth
-
-template<class PassT>
-int CFGStructurizer<PassT>::countActiveBlock
-(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
- typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
-  int count = 0;
-  while (iterStart != iterEnd) {
-    if (!isRetiredBlock(*iterStart)) {
-      ++count;
-    }
-    ++iterStart;
-  }
-
-  return count;
-} //countActiveBlock
+           << "  landing-block = BB" << MBB->getNumber() << "\n";
+  );
+}
 
-// This is work around solution for findNearestCommonDominator not avaiable to
-// post dom a proper fix should go to Dominators.h.
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
+    MachineBasicBlock *MBB2) {
 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT*
-CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
+  if (PDT->dominates(MBB1, MBB2))
+    return MBB1;
+  if (PDT->dominates(MBB2, MBB1))
+    return MBB2;
 
-  if (postDomTree->dominates(blk1, blk2)) {
-    return blk1;
-  }
-  if (postDomTree->dominates(blk2, blk1)) {
-    return blk2;
-  }
-
-  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
-  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
+  MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
+  MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
 
   // Handle newly cloned node.
-  if (node1 == NULL && blk1->succ_size() == 1) {
-    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
-  }
-  if (node2 == NULL && blk2->succ_size() == 1) {
-    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
-  }
+  if (!Node1 && MBB1->succ_size() == 1)
+    return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
+  if (!Node2 && MBB2->succ_size() == 1)
+    return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
 
-  if (node1 == NULL || node2 == NULL) {
+  if (!Node1 || !Node2)
     return NULL;
-  }
 
-  node1 = node1->getIDom();
-  while (node1) {
-    if (postDomTree->dominates(node1, node2)) {
-      return node1->getBlock();
-    }
-    node1 = node1->getIDom();
+  Node1 = Node1->getIDom();
+  while (Node1) {
+    if (PDT->dominates(Node1, Node2))
+      return Node1->getBlock();
+    Node1 = Node1->getIDom();
   }
 
   return NULL;
 }
 
-template<class PassT>
-typename CFGStructurizer<PassT>::BlockT *
-CFGStructurizer<PassT>::findNearestCommonPostDom
-(typename std::set<BlockT *> &blks) {
-  BlockT *commonDom;
-  typename std::set<BlockT *>::const_iterator iter = blks.begin();
-  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
-  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
-    BlockT *curBlk = *iter;
-    if (curBlk != commonDom) {
-      commonDom = findNearestCommonPostDom(curBlk, commonDom);
-    }
-  }
-
-  if (DEBUGME) {
-    errs() << "Common post dominator for exit blocks is ";
-    if (commonDom) {
-          errs() << "BB" << commonDom->getNumber() << "\n";
-    } else {
-      errs() << "NULL\n";
-    }
-  }
-
-  return commonDom;
-} //findNearestCommonPostDom
-
-} //end namespace llvm
-
-//todo: move-end
-
-
-//===----------------------------------------------------------------------===//
-//
-// CFGStructurizer for AMDGPU
-//
-//===----------------------------------------------------------------------===//
-
-
-using namespace llvmCFGStruct;
-
-namespace llvm {
-class AMDGPUCFGStructurizer : public MachineFunctionPass {
-public:
-  typedef MachineInstr              InstructionType;
-  typedef MachineFunction           FunctionType;
-  typedef MachineBasicBlock         BlockType;
-  typedef MachineLoopInfo           LoopinfoType;
-  typedef MachineDominatorTree      DominatortreeType;
-  typedef MachinePostDominatorTree  PostDominatortreeType;
-  typedef MachineDomTreeNode        DomTreeNodeType;
-  typedef MachineLoop               LoopType;
-
-protected:
-  TargetMachine &TM;
-  const TargetInstrInfo *TII;
-  const AMDGPURegisterInfo *TRI;
-
-public:
-  AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
-  const TargetInstrInfo *getTargetInstrInfo() const;
-
-private:
-
-};
-
-} //end of namespace llvm
-AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
-: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
-  TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) {
-}
-
-const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
-  return TII;
+MachineBasicBlock *
+AMDGPUCFGStructurizer::findNearestCommonPostDom(
+    std::set<MachineBasicBlock *> &MBBs) {
+  MachineBasicBlock *CommonDom;
+  std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
+  std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
+  for (CommonDom = *It; It != E && CommonDom; ++It) {
+    MachineBasicBlock *MBB = *It;
+    if (MBB != CommonDom)
+      CommonDom = findNearestCommonPostDom(MBB, CommonDom);
+  }
+
+  DEBUG(
+    dbgs() << "Common post dominator for exit blocks is ";
+    if (CommonDom)
+          dbgs() << "BB" << CommonDom->getNumber() << "\n";
+    else
+      dbgs() << "NULL\n";
+  );
+
+  return CommonDom;
 }
-//===----------------------------------------------------------------------===//
-//
-// CFGPrepare
-//
-//===----------------------------------------------------------------------===//
-
-
-using namespace llvmCFGStruct;
-
-namespace llvm {
-class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer {
-public:
-  static char ID;
 
-public:
-  AMDGPUCFGPrepare(TargetMachine &tm);
-
-  virtual const char *getPassName() const;
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-
-  bool runOnMachineFunction(MachineFunction &F);
+char AMDGPUCFGStructurizer::ID = 0;
 
-private:
-
-};
+} // end anonymous namespace
 
-char AMDGPUCFGPrepare::ID = 0;
-} //end of namespace llvm
-
-AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm)
-  : AMDGPUCFGStructurizer(ID, tm )  {
-}
-const char *AMDGPUCFGPrepare::getPassName() const {
-  return "AMD IL Control Flow Graph Preparation Pass";
-}
-
-void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<MachineFunctionAnalysis>();
-  AU.addRequired<MachineFunctionAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<MachinePostDominatorTree>();
-  AU.addRequired<MachineLoopInfo>();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGPerform
-//
-//===----------------------------------------------------------------------===//
-
-
-using namespace llvmCFGStruct;
-
-namespace llvm {
-class AMDGPUCFGPerform : public AMDGPUCFGStructurizer {
-public:
-  static char ID;
-
-public:
-  AMDGPUCFGPerform(TargetMachine &tm);
-  virtual const char *getPassName() const;
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const;
-  bool runOnMachineFunction(MachineFunction &F);
-
-private:
-
-};
-
-char AMDGPUCFGPerform::ID = 0;
-} //end of namespace llvm
-
-  AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm)
-: AMDGPUCFGStructurizer(ID, tm) {
-}
-
-const char *AMDGPUCFGPerform::getPassName() const {
-  return "AMD IL Control Flow Graph structurizer Pass";
-}
-
-void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved<MachineFunctionAnalysis>();
-  AU.addRequired<MachineFunctionAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addRequired<MachinePostDominatorTree>();
-  AU.addRequired<MachineLoopInfo>();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// CFGStructTraits<AMDGPUCFGStructurizer>
-//
-//===----------------------------------------------------------------------===//
-
-namespace llvmCFGStruct {
-// this class is tailor to the AMDGPU backend
-template<>
-struct CFGStructTraits<AMDGPUCFGStructurizer> {
-  typedef int RegiT;
-
-  static int getBranchNzeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32;
-    default:
-      assert(0 && "internal error");
-    }
-    return -1;
-  }
-
-  static int getBranchZeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET;
-    case AMDGPU::BRANCH_COND_i32:
-    case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32;
-    default:
-      assert(0 && "internal error");
-    }
-    return -1;
-  }
-
-  static int getContinueNzeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32;
-    default:
-      assert(0 && "internal error");
-    };
-    return -1;
-  }
-
-  static int getContinueZeroOpcode(int oldOpcode) {
-    switch(oldOpcode) {
-    case AMDGPU::JUMP_COND:
-    case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
-    default:
-      assert(0 && "internal error");
-    }
-    return -1;
-  }
-
-  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
-    return instr->getOperand(0).getMBB();
-  }
-
-  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
-    instr->getOperand(0).setMBB(blk);
-  }
-
-  static MachineBasicBlock *
-  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
-    assert(blk->succ_size() == 2);
-    MachineBasicBlock *trueBranch = getTrueBranch(instr);
-    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
-    MachineBasicBlock::succ_iterator iterNext = iter;
-    ++iterNext;
-
-    return (*iter == trueBranch) ? *iterNext : *iter;
-  }
-
-  static bool isCondBranch(MachineInstr *instr) {
-    switch (instr->getOpcode()) {
-      case AMDGPU::JUMP_COND:
-      case AMDGPU::BRANCH_COND_i32:
-      case AMDGPU::BRANCH_COND_f32:
-      break;
-    default:
-      return false;
-    }
-    return true;
-  }
-
-  static bool isUncondBranch(MachineInstr *instr) {
-    switch (instr->getOpcode()) {
-    case AMDGPU::JUMP:
-    case AMDGPU::BRANCH:
-      return true;
-    default:
-      return false;
-    }
-    return true;
-  }
-
-  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
-    //get DebugLoc from the first MachineBasicBlock instruction with debug info
-    DebugLoc DL;
-    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getDebugLoc().isUnknown() == false) {
-        DL = instr->getDebugLoc();
-      }
-    }
-    return DL;
-  }
-
-  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    MachineInstr *instr = &*iter;
-    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
-      return instr;
-    }
-    return NULL;
-  }
-
-  // The correct naming for this is getPossibleLoopendBlockBranchInstr.
-  //
-  // BB with backward-edge could have move instructions after the branch
-  // instruction.  Such move instruction "belong to" the loop backward-edge.
-  //
-  static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) {
-    const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>(
-                                  blk->getParent()->getTarget().getInstrInfo());
-
-    for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(),
-         iterEnd = blk->rend(); iter != iterEnd; ++iter) {
-      // FIXME: Simplify
-      MachineInstr *instr = &*iter;
-      if (instr) {
-        if (isCondBranch(instr) || isUncondBranch(instr)) {
-          return instr;
-        } else if (!TII->isMov(instr->getOpcode())) {
-          break;
-        }
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getReturnInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    if (iter != blk->rend()) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::RETURN) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getContinueInstr(MachineBasicBlock *blk) {
-    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
-    if (iter != blk->rend()) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::CONTINUE) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) {
-    for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) {
-      MachineInstr *instr = &(*iter);
-      if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) {
-        return instr;
-      }
-    }
-    return NULL;
-  }
-
-  static bool isReturnBlock(MachineBasicBlock *blk) {
-    MachineInstr *instr = getReturnInstr(blk);
-    bool isReturn = (blk->succ_size() == 0);
-    if (instr) {
-      assert(isReturn);
-    } else if (isReturn) {
-      if (DEBUGME) {
-        errs() << "BB" << blk->getNumber()
-               <<" is return block without RETURN instr\n";
-      }
-    }
-
-    return  isReturn;
-  }
-
-  static MachineBasicBlock::iterator
-  getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) {
-    assert(instr->getParent() == blk && "instruction doesn't belong to block");
-    MachineBasicBlock::iterator iter = blk->begin();
-    MachineBasicBlock::iterator iterEnd = blk->end();
-    while (&(*iter) != instr && iter != iterEnd) {
-      ++iter;
-    }
-
-    assert(iter != iterEnd);
-    return iter;
-  }//getInstrPos
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
-                                         AMDGPUCFGStructurizer *passRep) {
-    return insertInstrBefore(blk,newOpcode,passRep,DebugLoc());
-  } //insertInstrBefore
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode,
-                                         AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineInstr *newInstr =
-      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    MachineBasicBlock::iterator res;
-    if (blk->begin() != blk->end()) {
-      blk->insert(blk->begin(), newInstr);
-    } else {
-      blk->push_back(newInstr);
-    }
-
-    SHOWNEWINSTR(newInstr);
-
-    return newInstr;
-  } //insertInstrBefore
-
-  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
-                             AMDGPUCFGStructurizer *passRep) {
-    insertInstrEnd(blk,newOpcode,passRep,DebugLoc());
-  } //insertInstrEnd
-
-  static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode,
-                             AMDGPUCFGStructurizer *passRep, DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-   MachineInstr *newInstr = blk->getParent()
-      ->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    blk->push_back(newInstr);
-    //assume the instruction doesn't take any reg operand ...
-
-    SHOWNEWINSTR(newInstr);
-  } //insertInstrEnd
-
-  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
-                                         int newOpcode, 
-                                         AMDGPUCFGStructurizer *passRep) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineInstr *newInstr =
-      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
-                                           DebugLoc());
-
-    blk->insert(instrPos, newInstr);
-    //assume the instruction doesn't take any reg operand ...
-
-    SHOWNEWINSTR(newInstr);
-    return newInstr;
-  } //insertInstrBefore
-
-  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
-                                     int newOpcode,
-                                     AMDGPUCFGStructurizer *passRep,
-                                     DebugLoc DL) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    blk->insert(instrPos, newInstr);
-    MachineInstrBuilder MIB(*MF, newInstr);
-    MIB.addReg(oldInstr->getOperand(1).getReg(), false);
-
-    SHOWNEWINSTR(newInstr);
-    //erase later oldInstr->eraseFromParent();
-  } //insertCondBranchBefore
-
-  static void insertCondBranchBefore(MachineBasicBlock *blk,
-                                     MachineBasicBlock::iterator insertPos,
-                                     int newOpcode,
-                                     AMDGPUCFGStructurizer *passRep,
-                                     RegiT regNum,
-                                     DebugLoc DL) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineFunction *MF = blk->getParent();
-
-    MachineInstr *newInstr = MF->CreateMachineInstr(tii->get(newOpcode), DL);
-
-    //insert before
-    blk->insert(insertPos, newInstr);
-    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertCondBranchBefore
-
-  static void insertCondBranchEnd(MachineBasicBlock *blk,
-                                  int newOpcode,
-                                  AMDGPUCFGStructurizer *passRep,
-                                  RegiT regNum) {
-    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr =
-      MF->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
-
-    blk->push_back(newInstr);
-    MachineInstrBuilder(*MF, newInstr).addReg(regNum, false);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertCondBranchEnd
-
-
-  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
-                                      AMDGPUCFGStructurizer *passRep,
-                                      RegiT regNum, int regVal) {
-    MachineInstr *oldInstr = &(*instrPos);
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-    MachineBasicBlock *blk = oldInstr->getParent();
-    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
-                                                 regVal);
-    blk->insert(instrPos, newInstr);
-
-    SHOWNEWINSTR(newInstr);
-  } //insertAssignInstrBefore
-
-  static void insertAssignInstrBefore(MachineBasicBlock *blk,
-                                      AMDGPUCFGStructurizer *passRep,
-                                      RegiT regNum, int regVal) {
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-
-    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
-                                                 regVal);
-    if (blk->begin() != blk->end()) {
-      blk->insert(blk->begin(), newInstr);
-    } else {
-      blk->push_back(newInstr);
-    }
-
-    SHOWNEWINSTR(newInstr);
-
-  } //insertInstrBefore
-
-  static void insertCompareInstrBefore(MachineBasicBlock *blk,
-                                       MachineBasicBlock::iterator instrPos,
-                                       AMDGPUCFGStructurizer *passRep,
-                                       RegiT dstReg, RegiT src1Reg,
-                                       RegiT src2Reg) {
-    const AMDGPUInstrInfo *tii =
-             static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
-    MachineFunction *MF = blk->getParent();
-    MachineInstr *newInstr =
-      MF->CreateMachineInstr(tii->get(tii->getIEQOpcode()), DebugLoc());
-
-    MachineInstrBuilder MIB(*MF, newInstr);
-    MIB.addReg(dstReg, RegState::Define); //set target
-    MIB.addReg(src1Reg); //set src value
-    MIB.addReg(src2Reg); //set src value
-
-    blk->insert(instrPos, newInstr);
-    SHOWNEWINSTR(newInstr);
-
-  } //insertCompareInstrBefore
-
-  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
-                                 MachineBasicBlock *srcBlk) {
-    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
-         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
-      dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of
-    }
-  } //cloneSuccessorList
-
-  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
-    MachineFunction *func = srcBlk->getParent();
-    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
-    func->push_back(newBlk);  //insert to function
-    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
-         iterEnd = srcBlk->end();
-         iter != iterEnd; ++iter) {
-      MachineInstr *instr = func->CloneMachineInstr(iter);
-      newBlk->push_back(instr);
-    }
-    return newBlk;
-  }
-
-  //MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose because
-  //the AMDGPU instruction is not recognized as terminator fix this and retire
-  //this routine
-  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
-                                         MachineBasicBlock *oldBlk,
-                                         MachineBasicBlock *newBlk) {
-    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
-    if (branchInstr && isCondBranch(branchInstr) &&
-        getTrueBranch(branchInstr) == oldBlk) {
-      setTrueBranch(branchInstr, newBlk);
-    }
-  }
-
-  static void wrapup(MachineBasicBlock *entryBlk) {
-    assert((!entryBlk->getParent()->getJumpTableInfo()
-            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
-           && "found a jump table");
-
-     //collect continue right before endloop
-     SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
-     MachineBasicBlock::iterator pre = entryBlk->begin();
-     MachineBasicBlock::iterator iterEnd = entryBlk->end();
-     MachineBasicBlock::iterator iter = pre;
-     while (iter != iterEnd) {
-       if (pre->getOpcode() == AMDGPU::CONTINUE
-           && iter->getOpcode() == AMDGPU::ENDLOOP) {
-         contInstr.push_back(pre);
-       }
-       pre = iter;
-       ++iter;
-     } //end while
-
-     //delete continue right before endloop
-     for (unsigned i = 0; i < contInstr.size(); ++i) {
-        contInstr[i]->eraseFromParent();
-     }
-
-     // TODO to fix up jump table so later phase won't be confused.  if
-     // (jumpTableInfo->isEmpty() == false) { need to clean the jump table, but
-     // there isn't such an interface yet.  alternatively, replace all the other
-     // blocks in the jump table with the entryBlk //}
-
-  } //wrapup
-
-  static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachineDominatorTree>();
-  }
-
-  static MachinePostDominatorTree*
-  getPostDominatorTree(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachinePostDominatorTree>();
-  }
-
-  static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) {
-    return &pass.getAnalysis<MachineLoopInfo>();
-  }
-}; // template class CFGStructTraits
-} //end of namespace llvm
-
-// createAMDGPUCFGPreparationPass- Returns a pass
-FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm
-                                                 ) {
-  return new AMDGPUCFGPrepare(tm );
-}
-
-bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) {
-  return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func,
-                                                                        *this,
-                                                                        TRI);
-}
-
-// createAMDGPUCFGStructurizerPass- Returns a pass
-FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm
-                                                  ) {
-  return new AMDGPUCFGPerform(tm );
-}
 
-bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) {
-  return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func,
-                                                                    *this,
-                                                                    TRI);
+FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm) {
+  return new AMDGPUCFGStructurizer(tm);
 }
diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp
deleted file mode 100644
index db8e01e..0000000
--- a/lib/Target/R600/AMDILDevice.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILDevice.h"
-#include "AMDGPUSubtarget.h"
-
-using namespace llvm;
-// Default implementation for all of the classes.
-AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) {
-  mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
-  mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities);
-  setCaps();
-  DeviceFlag = OCL_DEVICE_ALL;
-}
-
-AMDGPUDevice::~AMDGPUDevice() {
-    mHWBits.clear();
-    mSWBits.clear();
-}
-
-size_t AMDGPUDevice::getMaxGDSSize() const {
-  return 0;
-}
-
-uint32_t 
-AMDGPUDevice::getDeviceFlag() const {
-  return DeviceFlag;
-}
-
-size_t AMDGPUDevice::getMaxNumCBs() const {
-  if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
-    return HW_MAX_NUM_CB;
-  }
-
-  return 0;
-}
-
-size_t AMDGPUDevice::getMaxCBSize() const {
-  if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) {
-    return MAX_CB_SIZE;
-  }
-
-  return 0;
-}
-
-size_t AMDGPUDevice::getMaxScratchSize() const {
-  return 65536;
-}
-
-uint32_t AMDGPUDevice::getStackAlignment() const {
-  return 16;
-}
-
-void AMDGPUDevice::setCaps() {
-  mSWBits.set(AMDGPUDeviceInfo::HalfOps);
-  mSWBits.set(AMDGPUDeviceInfo::ByteOps);
-  mSWBits.set(AMDGPUDeviceInfo::ShortOps);
-  mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
-  if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) {
-    mSWBits.set(AMDGPUDeviceInfo::NoInline);
-  }
-  if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) {
-    mSWBits.set(AMDGPUDeviceInfo::MacroDB);
-  }
-  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
-    mSWBits.set(AMDGPUDeviceInfo::ConstantMem);
-  } else {
-    mHWBits.set(AMDGPUDeviceInfo::ConstantMem);
-  }
-  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
-    mSWBits.set(AMDGPUDeviceInfo::PrivateMem);
-  } else {
-    mHWBits.set(AMDGPUDeviceInfo::PrivateMem);
-  }
-  if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) {
-    mSWBits.set(AMDGPUDeviceInfo::BarrierDetect);
-  }
-  mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
-  mSWBits.set(AMDGPUDeviceInfo::LongOps);
-}
-
-AMDGPUDeviceInfo::ExecutionMode
-AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const {
-  if (mHWBits[Caps]) {
-    assert(!mSWBits[Caps] && "Cannot set both SW and HW caps");
-    return AMDGPUDeviceInfo::Hardware;
-  }
-
-  if (mSWBits[Caps]) {
-    assert(!mHWBits[Caps] && "Cannot set both SW and HW caps");
-    return AMDGPUDeviceInfo::Software;
-  }
-
-  return AMDGPUDeviceInfo::Unsupported;
-
-}
-
-bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const {
-  return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported;
-}
-
-bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const {
-  return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware;
-}
-
-bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const {
-  return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software;
-}
-
-std::string
-AMDGPUDevice::getDataLayout() const {
-  std::string DataLayout = std::string(
-   "e"
-   "-p:32:32:32"
-   "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
-   "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128"
-   "-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048"
-   "-n32:64"
-  );
-
-  if (usesHardware(AMDGPUDeviceInfo::DoubleOps)) {
-    DataLayout.append("-f64:64:64");
-  }
-
-  return DataLayout;
-}
diff --git a/lib/Target/R600/AMDILDevice.h b/lib/Target/R600/AMDILDevice.h
deleted file mode 100644
index 97df98c..0000000
--- a/lib/Target/R600/AMDILDevice.h
+++ /dev/null
@@ -1,117 +0,0 @@
-//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface for the subtarget data classes.
-//
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===----------------------------------------------------------------------===//
-#ifndef AMDILDEVICEIMPL_H
-#define AMDILDEVICEIMPL_H
-#include "AMDIL.h"
-#include "llvm/ADT/BitVector.h"
-
-namespace llvm {
-  class AMDGPUSubtarget;
-  class MCStreamer;
-//===----------------------------------------------------------------------===//
-// Interface for data that is specific to a single device
-//===----------------------------------------------------------------------===//
-class AMDGPUDevice {
-public:
-  AMDGPUDevice(AMDGPUSubtarget *ST);
-  virtual ~AMDGPUDevice();
-
-  // Enum values for the various memory types.
-  enum {
-    RAW_UAV_ID   = 0,
-    ARENA_UAV_ID = 1,
-    LDS_ID       = 2,
-    GDS_ID       = 3,
-    SCRATCH_ID   = 4,
-    CONSTANT_ID  = 5,
-    GLOBAL_ID    = 6,
-    MAX_IDS      = 7
-  } IO_TYPE_IDS;
-
-  /// \returns The max LDS size that the hardware supports.  Size is in
-  /// bytes.
-  virtual size_t getMaxLDSSize() const = 0;
-
-  /// \returns The max GDS size that the hardware supports if the GDS is
-  /// supported by the hardware.  Size is in bytes.
-  virtual size_t getMaxGDSSize() const;
-
-  /// \returns The max number of hardware constant address spaces that
-  /// are supported by this device.
-  virtual size_t getMaxNumCBs() const;
-
-  /// \returns The max number of bytes a single hardware constant buffer
-  /// can support.  Size is in bytes.
-  virtual size_t getMaxCBSize() const;
-
-  /// \returns The max number of bytes allowed by the hardware scratch
-  /// buffer.  Size is in bytes.
-  virtual size_t getMaxScratchSize() const;
-
-  /// \brief Get the flag that corresponds to the device.
-  virtual uint32_t getDeviceFlag() const;
-
-  /// \returns The number of work-items that exist in a single hardware
-  /// wavefront.
-  virtual size_t getWavefrontSize() const = 0;
-
-  /// \brief Get the generational name of this specific device.
-  virtual uint32_t getGeneration() const = 0;
-
-  /// \brief Get the stack alignment of this specific device.
-  virtual uint32_t getStackAlignment() const;
-
-  /// \brief Get the resource ID for this specific device.
-  virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
-
-  /// \brief Get the max number of UAV's for this device.
-  virtual uint32_t getMaxNumUAVs() const = 0;
-
-
-  // API utilizing more detailed capabilities of each family of
-  // cards. If a capability is supported, then either usesHardware or
-  // usesSoftware returned true.  If usesHardware returned true, then
-  // usesSoftware must return false for the same capability.  Hardware
-  // execution means that the feature is done natively by the hardware
-  // and is not emulated by the softare.  Software execution means
-  // that the feature could be done in the hardware, but there is
-  // software that emulates it with possibly using the hardware for
-  // support since the hardware does not fully comply with OpenCL
-  // specs.
-
-  bool isSupported(AMDGPUDeviceInfo::Caps Mode) const;
-  bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const;
-  bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const;
-  virtual std::string getDataLayout() const;
-  static const unsigned int MAX_LDS_SIZE_700 = 16384;
-  static const unsigned int MAX_LDS_SIZE_800 = 32768;
-  static const unsigned int WavefrontSize = 64;
-  static const unsigned int HalfWavefrontSize = 32;
-  static const unsigned int QuarterWavefrontSize = 16;
-protected:
-  virtual void setCaps();
-  BitVector mHWBits;
-  llvm::BitVector mSWBits;
-  AMDGPUSubtarget *mSTM;
-  uint32_t DeviceFlag;
-private:
-  AMDGPUDeviceInfo::ExecutionMode
-  getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const;
-};
-
-} // namespace llvm
-#endif // AMDILDEVICEIMPL_H
diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp
deleted file mode 100644
index 126514b..0000000
--- a/lib/Target/R600/AMDILDeviceInfo.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Function that creates DeviceInfo from a device name and other information.
-//
-//==-----------------------------------------------------------------------===//
-#include "AMDILDevices.h"
-#include "AMDGPUSubtarget.h"
-
-using namespace llvm;
-namespace llvm {
-namespace AMDGPUDeviceInfo {
-
-AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
-                                AMDGPUSubtarget *ptr,
-                                bool is64bit, bool is64on32bit) {
-  if (deviceName.c_str()[2] == '7') {
-    switch (deviceName.c_str()[3]) {
-    case '1':
-      return new AMDGPU710Device(ptr);
-    case '7':
-      return new AMDGPU770Device(ptr);
-    default:
-      return new AMDGPU7XXDevice(ptr);
-    }
-  } else if (deviceName == "cypress") {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPUCypressDevice(ptr);
-  } else if (deviceName == "juniper") {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPUEvergreenDevice(ptr);
-  } else if (deviceName == "redwood" || deviceName == "sumo") {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPURedwoodDevice(ptr);
-  } else if (deviceName == "cedar") {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPUCedarDevice(ptr);
-  } else if (deviceName == "barts" || deviceName == "turks") {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPUNIDevice(ptr);
-  } else if (deviceName == "cayman") {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPUCaymanDevice(ptr);
-  } else if (deviceName == "caicos") {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPUNIDevice(ptr);
-  } else if (deviceName == "SI" ||
-             deviceName == "tahiti" || deviceName == "pitcairn" ||
-             deviceName == "verde"  || deviceName == "oland" ||
-	     deviceName == "hainan") {
-    return new AMDGPUSIDevice(ptr);
-  } else {
-#if DEBUG
-    assert(!is64bit && "This device does not support 64bit pointers!");
-    assert(!is64on32bit && "This device does not support 64bit"
-          " on 32bit pointers!");
-#endif
-    return new AMDGPU7XXDevice(ptr);
-  }
-}
-} // End namespace AMDGPUDeviceInfo
-} // End namespace llvm
diff --git a/lib/Target/R600/AMDILDeviceInfo.h b/lib/Target/R600/AMDILDeviceInfo.h
deleted file mode 100644
index 4b2c3a5..0000000
--- a/lib/Target/R600/AMDILDeviceInfo.h
+++ /dev/null
@@ -1,88 +0,0 @@
-//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#ifndef AMDILDEVICEINFO_H
-#define AMDILDEVICEINFO_H
-
-
-#include <string>
-
-namespace llvm {
-  class AMDGPUDevice;
-  class AMDGPUSubtarget;
-  namespace AMDGPUDeviceInfo {
-    /// Each Capabilities can be executed using a hardware instruction,
-    /// emulated with a sequence of software instructions, or not
-    /// supported at all.
-    enum ExecutionMode {
-      Unsupported = 0, ///< Unsupported feature on the card(Default value)
-       /// This is the execution mode that is set if the feature is emulated in
-       /// software.
-      Software,
-      /// This execution mode is set if the feature exists natively in hardware
-      Hardware
-    };
-
-    enum Caps {
-      HalfOps          = 0x1,  ///< Half float is supported or not.
-      DoubleOps        = 0x2,  ///< Double is supported or not.
-      ByteOps          = 0x3,  ///< Byte(char) is support or not.
-      ShortOps         = 0x4,  ///< Short is supported or not.
-      LongOps          = 0x5,  ///< Long is supported or not.
-      Images           = 0x6,  ///< Images are supported or not.
-      ByteStores       = 0x7,  ///< ByteStores available(!HD4XXX).
-      ConstantMem      = 0x8,  ///< Constant/CB memory.
-      LocalMem         = 0x9,  ///< Local/LDS memory.
-      PrivateMem       = 0xA,  ///< Scratch/Private/Stack memory.
-      RegionMem        = 0xB,  ///< OCL GDS Memory Extension.
-      FMA              = 0xC,  ///< Use HW FMA or SW FMA.
-      ArenaSegment     = 0xD,  ///< Use for Arena UAV per pointer 12-1023.
-      MultiUAV         = 0xE,  ///< Use for UAV per Pointer 0-7.
-      Reserved0        = 0xF,  ///< ReservedFlag
-      NoAlias          = 0x10, ///< Cached loads.
-      Signed24BitOps   = 0x11, ///< Peephole Optimization.
-      /// Debug mode implies that no hardware features or optimizations
-      /// are performned and that all memory access go through a single
-      /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX).
-      Debug            = 0x12,
-      CachedMem        = 0x13, ///< Cached mem is available or not.
-      BarrierDetect    = 0x14, ///< Detect duplicate barriers.
-      Reserved1        = 0x15, ///< Reserved flag
-      ByteLDSOps       = 0x16, ///< Flag to specify if byte LDS ops are available.
-      ArenaVectors     = 0x17, ///< Flag to specify if vector loads from arena work.
-      TmrReg           = 0x18, ///< Flag to specify if Tmr register is supported.
-      NoInline         = 0x19, ///< Flag to specify that no inlining should occur.
-      MacroDB          = 0x1A, ///< Flag to specify that backend handles macrodb.
-      HW64BitDivMod    = 0x1B, ///< Flag for backend to generate 64bit div/mod.
-      ArenaUAV         = 0x1C, ///< Flag to specify that arena uav is supported.
-      PrivateUAV       = 0x1D, ///< Flag to specify that private memory uses uav's.
-      /// If more capabilities are required, then
-      /// this number needs to be increased.
-      /// All capabilities must come before this
-      /// number.
-      MaxNumberCapabilities = 0x20
-    };
-    /// These have to be in order with the older generations
-    /// having the lower number enumerations.
-    enum Generation {
-      HD4XXX = 0, ///< 7XX based devices.
-      HD5XXX, ///< Evergreen based devices.
-      HD6XXX, ///< NI/Evergreen+ based devices.
-      HD7XXX, ///< Southern Islands based devices.
-      HDTEST, ///< Experimental feature testing device.
-      HDNUMGEN
-    };
-
-
-  AMDGPUDevice*
-    getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr,
-                      bool is64bit = false, bool is64on32bit = false);
-  } // namespace AMDILDeviceInfo
-} // namespace llvm
-#endif // AMDILDEVICEINFO_H
diff --git a/lib/Target/R600/AMDILDevices.h b/lib/Target/R600/AMDILDevices.h
deleted file mode 100644
index 636fa6d..0000000
--- a/lib/Target/R600/AMDILDevices.h
+++ /dev/null
@@ -1,19 +0,0 @@
-//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#ifndef AMDIL_DEVICES_H
-#define AMDIL_DEVICES_H
-// Include all of the device specific header files
-#include "AMDIL7XXDevice.h"
-#include "AMDILDevice.h"
-#include "AMDILEvergreenDevice.h"
-#include "AMDILNIDevice.h"
-#include "AMDILSIDevice.h"
-
-#endif // AMDIL_DEVICES_H
diff --git a/lib/Target/R600/AMDILEvergreenDevice.cpp b/lib/Target/R600/AMDILEvergreenDevice.cpp
deleted file mode 100644
index c5213a0..0000000
--- a/lib/Target/R600/AMDILEvergreenDevice.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILEvergreenDevice.h"
-
-using namespace llvm;
-
-AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST)
-: AMDGPUDevice(ST) {
-  setCaps();
-  std::string name = ST->getDeviceName();
-  if (name == "cedar") {
-    DeviceFlag = OCL_DEVICE_CEDAR;
-  } else if (name == "redwood") {
-    DeviceFlag = OCL_DEVICE_REDWOOD;
-  } else if (name == "cypress") {
-    DeviceFlag = OCL_DEVICE_CYPRESS;
-  } else {
-    DeviceFlag = OCL_DEVICE_JUNIPER;
-  }
-}
-
-AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() {
-}
-
-size_t AMDGPUEvergreenDevice::getMaxLDSSize() const {
-  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
-    return MAX_LDS_SIZE_800;
-  } else {
-    return 0;
-  }
-}
-size_t AMDGPUEvergreenDevice::getMaxGDSSize() const {
-  if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
-    return MAX_LDS_SIZE_800;
-  } else {
-    return 0;
-  }
-}
-uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const {
-  return 12;
-}
-
-uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const {
-  switch(id) {
-  default:
-    assert(0 && "ID type passed in is unknown!");
-    break;
-  case CONSTANT_ID:
-  case RAW_UAV_ID:
-    return GLOBAL_RETURN_RAW_UAV_ID;
-  case GLOBAL_ID:
-  case ARENA_UAV_ID:
-    return DEFAULT_ARENA_UAV_ID;
-  case LDS_ID:
-    if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
-      return DEFAULT_LDS_ID;
-    } else {
-      return DEFAULT_ARENA_UAV_ID;
-    }
-  case GDS_ID:
-    if (usesHardware(AMDGPUDeviceInfo::RegionMem)) {
-      return DEFAULT_GDS_ID;
-    } else {
-      return DEFAULT_ARENA_UAV_ID;
-    }
-  case SCRATCH_ID:
-    if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) {
-      return DEFAULT_SCRATCH_ID;
-    } else {
-      return DEFAULT_ARENA_UAV_ID;
-    }
-  };
-  return 0;
-}
-
-size_t AMDGPUEvergreenDevice::getWavefrontSize() const {
-  return AMDGPUDevice::WavefrontSize;
-}
-
-uint32_t AMDGPUEvergreenDevice::getGeneration() const {
-  return AMDGPUDeviceInfo::HD5XXX;
-}
-
-void AMDGPUEvergreenDevice::setCaps() {
-  mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
-  mHWBits.set(AMDGPUDeviceInfo::ArenaUAV);
-  mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod);
-  mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod);
-  mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
-  if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) {
-    mHWBits.set(AMDGPUDeviceInfo::ByteStores);
-  }
-  if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) {
-    mSWBits.set(AMDGPUDeviceInfo::LocalMem);
-    mSWBits.set(AMDGPUDeviceInfo::RegionMem);
-  } else {
-    mHWBits.set(AMDGPUDeviceInfo::LocalMem);
-    mHWBits.set(AMDGPUDeviceInfo::RegionMem);
-  }
-  mHWBits.set(AMDGPUDeviceInfo::Images);
-  if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) {
-    mHWBits.set(AMDGPUDeviceInfo::NoAlias);
-  }
-  mHWBits.set(AMDGPUDeviceInfo::CachedMem);
-  if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) {
-    mHWBits.set(AMDGPUDeviceInfo::MultiUAV);
-  }
-  mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps);
-  mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps);
-  mHWBits.set(AMDGPUDeviceInfo::ArenaVectors);
-  mHWBits.set(AMDGPUDeviceInfo::LongOps);
-  mSWBits.reset(AMDGPUDeviceInfo::LongOps);
-  mHWBits.set(AMDGPUDeviceInfo::TmrReg);
-}
-
-AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST)
-  : AMDGPUEvergreenDevice(ST) {
-  setCaps();
-}
-
-AMDGPUCypressDevice::~AMDGPUCypressDevice() {
-}
-
-void AMDGPUCypressDevice::setCaps() {
-  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
-    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
-    mHWBits.set(AMDGPUDeviceInfo::FMA);
-  }
-}
-
-
-AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST)
-  : AMDGPUEvergreenDevice(ST) {
-  setCaps();
-}
-
-AMDGPUCedarDevice::~AMDGPUCedarDevice() {
-}
-
-void AMDGPUCedarDevice::setCaps() {
-  mSWBits.set(AMDGPUDeviceInfo::FMA);
-}
-
-size_t AMDGPUCedarDevice::getWavefrontSize() const {
-  return AMDGPUDevice::QuarterWavefrontSize;
-}
-
-AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST)
-  : AMDGPUEvergreenDevice(ST) {
-  setCaps();
-}
-
-AMDGPURedwoodDevice::~AMDGPURedwoodDevice() {
-}
-
-void AMDGPURedwoodDevice::setCaps() {
-  mSWBits.set(AMDGPUDeviceInfo::FMA);
-}
-
-size_t AMDGPURedwoodDevice::getWavefrontSize() const {
-  return AMDGPUDevice::HalfWavefrontSize;
-}
diff --git a/lib/Target/R600/AMDILEvergreenDevice.h b/lib/Target/R600/AMDILEvergreenDevice.h
deleted file mode 100644
index ea90f77..0000000
--- a/lib/Target/R600/AMDILEvergreenDevice.h
+++ /dev/null
@@ -1,93 +0,0 @@
-//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===----------------------------------------------------------------------===//
-#ifndef AMDILEVERGREENDEVICE_H
-#define AMDILEVERGREENDEVICE_H
-#include "AMDGPUSubtarget.h"
-#include "AMDILDevice.h"
-
-namespace llvm {
-  class AMDGPUSubtarget;
-//===----------------------------------------------------------------------===//
-// Evergreen generation of devices and their respective sub classes
-//===----------------------------------------------------------------------===//
-
-
-/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen
-/// series of cards.
-///
-/// This class contains information required to differentiate
-/// the Evergreen device from the generic AMDGPUDevice. This device represents
-/// that capabilities of the 'Juniper' cards, also known as the HD57XX.
-class AMDGPUEvergreenDevice : public AMDGPUDevice {
-public:
-  AMDGPUEvergreenDevice(AMDGPUSubtarget *ST);
-  virtual ~AMDGPUEvergreenDevice();
-  virtual size_t getMaxLDSSize() const;
-  virtual size_t getMaxGDSSize() const;
-  virtual size_t getWavefrontSize() const;
-  virtual uint32_t getGeneration() const;
-  virtual uint32_t getMaxNumUAVs() const;
-  virtual uint32_t getResourceID(uint32_t) const;
-protected:
-  virtual void setCaps();
-};
-
-/// The AMDGPUCypressDevice is similiar to the AMDGPUEvergreenDevice, except it has
-/// support for double precision operations. This device is used to represent
-/// both the Cypress and Hemlock cards, which are commercially known as HD58XX
-/// and HD59XX cards.
-class AMDGPUCypressDevice : public AMDGPUEvergreenDevice {
-public:
-  AMDGPUCypressDevice(AMDGPUSubtarget *ST);
-  virtual ~AMDGPUCypressDevice();
-private:
-  virtual void setCaps();
-};
-
-
-/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based
-/// devices.
-///
-/// This class differs from the base AMDGPUEvergreenDevice in that the
-/// device is a ~quarter of the 'Juniper'. These are commercially known as the
-/// HD54XX and HD53XX series of cards.
-class AMDGPUCedarDevice : public AMDGPUEvergreenDevice {
-public:
-  AMDGPUCedarDevice(AMDGPUSubtarget *ST);
-  virtual ~AMDGPUCedarDevice();
-  virtual size_t getWavefrontSize() const;
-private:
-  virtual void setCaps();
-};
-
-/// \brief The AMDGPURedwoodDevice is the class the represents all of the 'Redwood' based
-/// devices.
-///
-/// This class differs from the base class, in that these devices are
-/// considered about half of a 'Juniper' device. These are commercially known as
-/// the HD55XX and HD56XX series of cards.
-class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice {
-public:
-  AMDGPURedwoodDevice(AMDGPUSubtarget *ST);
-  virtual ~AMDGPURedwoodDevice();
-  virtual size_t getWavefrontSize() const;
-private:
-  virtual void setCaps();
-};
-  
-} // namespace llvm
-#endif // AMDILEVERGREENDEVICE_H
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
deleted file mode 100644
index ba75a44..0000000
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ /dev/null
@@ -1,666 +0,0 @@
-//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Defines an instruction selector for the AMDGPU target.
-//
-//===----------------------------------------------------------------------===//
-#include "AMDGPUInstrInfo.h"
-#include "AMDGPUISelLowering.h" // For AMDGPUISD
-#include "AMDGPURegisterInfo.h"
-#include "AMDILDevices.h"
-#include "R600InstrInfo.h"
-#include "SIISelLowering.h"
-#include "llvm/ADT/ValueMap.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include <list>
-#include <queue>
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// Instruction Selector Implementation
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// AMDGPU specific code to select AMDGPU machine instructions for
-/// SelectionDAG operations.
-class AMDGPUDAGToDAGISel : public SelectionDAGISel {
-  // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
-  // make the right decision when generating code for different targets.
-  const AMDGPUSubtarget &Subtarget;
-public:
-  AMDGPUDAGToDAGISel(TargetMachine &TM);
-  virtual ~AMDGPUDAGToDAGISel();
-
-  SDNode *Select(SDNode *N);
-  virtual const char *getPassName() const;
-  virtual void PostprocessISelDAG();
-
-private:
-  inline SDValue getSmallIPtrImm(unsigned Imm);
-  bool FoldOperands(unsigned, const R600InstrInfo *, std::vector<SDValue> &);
-
-  // Complex pattern selectors
-  bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
-  bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
-  bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
-
-  static bool checkType(const Value *ptr, unsigned int addrspace);
-  static const Value *getBasePointerValue(const Value *V);
-
-  static bool isGlobalStore(const StoreSDNode *N);
-  static bool isPrivateStore(const StoreSDNode *N);
-  static bool isLocalStore(const StoreSDNode *N);
-  static bool isRegionStore(const StoreSDNode *N);
-
-  static bool isCPLoad(const LoadSDNode *N);
-  static bool isConstantLoad(const LoadSDNode *N, int cbID);
-  static bool isGlobalLoad(const LoadSDNode *N);
-  static bool isParamLoad(const LoadSDNode *N);
-  static bool isPrivateLoad(const LoadSDNode *N);
-  static bool isLocalLoad(const LoadSDNode *N);
-  static bool isRegionLoad(const LoadSDNode *N);
-
-  bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
-  bool SelectGlobalValueVariableOffset(SDValue Addr,
-      SDValue &BaseReg, SDValue& Offset);
-  bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
-  bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
-
-  // Include the pieces autogenerated from the target description.
-#include "AMDGPUGenDAGISel.inc"
-};
-}  // end anonymous namespace
-
-/// \brief This pass converts a legalized DAG into a AMDGPU-specific
-// DAG, ready for instruction scheduling.
-FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
-                                       ) {
-  return new AMDGPUDAGToDAGISel(TM);
-}
-
-AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM
-                                     )
-  : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) {
-}
-
-AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
-}
-
-SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
-  return CurDAG->getTargetConstant(Imm, MVT::i32);
-}
-
-bool AMDGPUDAGToDAGISel::SelectADDRParam(
-    SDValue Addr, SDValue& R1, SDValue& R2) {
-
-  if (Addr.getOpcode() == ISD::FrameIndex) {
-    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
-      R2 = CurDAG->getTargetConstant(0, MVT::i32);
-    } else {
-      R1 = Addr;
-      R2 = CurDAG->getTargetConstant(0, MVT::i32);
-    }
-  } else if (Addr.getOpcode() == ISD::ADD) {
-    R1 = Addr.getOperand(0);
-    R2 = Addr.getOperand(1);
-  } else {
-    R1 = Addr;
-    R2 = CurDAG->getTargetConstant(0, MVT::i32);
-  }
-  return true;
-}
-
-bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) {
-  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress) {
-    return false;
-  }
-  return SelectADDRParam(Addr, R1, R2);
-}
-
-
-bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) {
-  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress) {
-    return false;
-  }
-
-  if (Addr.getOpcode() == ISD::FrameIndex) {
-    if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-      R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64);
-      R2 = CurDAG->getTargetConstant(0, MVT::i64);
-    } else {
-      R1 = Addr;
-      R2 = CurDAG->getTargetConstant(0, MVT::i64);
-    }
-  } else if (Addr.getOpcode() == ISD::ADD) {
-    R1 = Addr.getOperand(0);
-    R2 = Addr.getOperand(1);
-  } else {
-    R1 = Addr;
-    R2 = CurDAG->getTargetConstant(0, MVT::i64);
-  }
-  return true;
-}
-
-SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
-  unsigned int Opc = N->getOpcode();
-  if (N->isMachineOpcode()) {
-    return NULL;   // Already selected.
-  }
-  switch (Opc) {
-  default: break;
-  case ISD::BUILD_VECTOR: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
-      break;
-    }
-    // BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
-    // that adds a 128 bits reg copy when going through TwoAddressInstructions
-    // pass. We want to avoid 128 bits copies as much as possible because they
-    // can't be bundled by our scheduler.
-    SDValue RegSeqArgs[9] = {
-      CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
-      SDValue(), CurDAG->getTargetConstant(AMDGPU::sub3, MVT::i32)
-    };
-    bool IsRegSeq = true;
-    for (unsigned i = 0; i < N->getNumOperands(); i++) {
-      if (dyn_cast<RegisterSDNode>(N->getOperand(i))) {
-        IsRegSeq = false;
-        break;
-      }
-      RegSeqArgs[2 * i + 1] = N->getOperand(i);
-    }
-    if (!IsRegSeq)
-      break;
-    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
-        RegSeqArgs, 2 * N->getNumOperands() + 1);
-  }
-  case ISD::BUILD_PAIR: {
-    SDValue RC, SubReg0, SubReg1;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
-      break;
-    }
-    if (N->getValueType(0) == MVT::i128) {
-      RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32);
-      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
-      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
-    } else if (N->getValueType(0) == MVT::i64) {
-      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
-      SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
-      SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
-    } else {
-      llvm_unreachable("Unhandled value type for BUILD_PAIR");
-    }
-    const SDValue Ops[] = { RC, N->getOperand(0), SubReg0,
-                            N->getOperand(1), SubReg1 };
-    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
-                                  N->getDebugLoc(), N->getValueType(0), Ops);
-  }
-
-  case ISD::ConstantFP:
-  case ISD::Constant: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-    // XXX: Custom immediate lowering not implemented yet.  Instead we use
-    // pseudo instructions defined in SIInstructions.td
-    if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
-      break;
-    }
-    const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
-
-    uint64_t ImmValue = 0;
-    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
-
-    if (N->getOpcode() == ISD::ConstantFP) {
-      // XXX: 64-bit Immediates not supported yet
-      assert(N->getValueType(0) != MVT::f64);
-
-      ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N);
-      APFloat Value = C->getValueAPF();
-      float FloatValue = Value.convertToFloat();
-      if (FloatValue == 0.0) {
-        ImmReg = AMDGPU::ZERO;
-      } else if (FloatValue == 0.5) {
-        ImmReg = AMDGPU::HALF;
-      } else if (FloatValue == 1.0) {
-        ImmReg = AMDGPU::ONE;
-      } else {
-        ImmValue = Value.bitcastToAPInt().getZExtValue();
-      }
-    } else {
-      // XXX: 64-bit Immediates not supported yet
-      assert(N->getValueType(0) != MVT::i64);
-
-      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
-      if (C->getZExtValue() == 0) {
-        ImmReg = AMDGPU::ZERO;
-      } else if (C->getZExtValue() == 1) {
-        ImmReg = AMDGPU::ONE_INT;
-      } else {
-        ImmValue = C->getZExtValue();
-      }
-    }
-
-    for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use);
-                              Use != SDNode::use_end(); Use = Next) {
-      Next = llvm::next(Use);
-      std::vector<SDValue> Ops;
-      for (unsigned i = 0; i < Use->getNumOperands(); ++i) {
-        Ops.push_back(Use->getOperand(i));
-      }
-
-      if (!Use->isMachineOpcode()) {
-          if (ImmReg == AMDGPU::ALU_LITERAL_X) {
-            // We can only use literal constants (e.g. AMDGPU::ZERO,
-            // AMDGPU::ONE, etc) in machine opcodes.
-            continue;
-          }
-      } else {
-        if (!TII->isALUInstr(Use->getMachineOpcode()) ||
-            (TII->get(Use->getMachineOpcode()).TSFlags &
-            R600_InstFlag::VECTOR)) {
-          continue;
-        }
-
-        int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM);
-        assert(ImmIdx != -1);
-
-        // subtract one from ImmIdx, because the DST operand is usually index
-        // 0 for MachineInstrs, but we have no DST in the Ops vector.
-        ImmIdx--;
-
-        // Check that we aren't already using an immediate.
-        // XXX: It's possible for an instruction to have more than one
-        // immediate operand, but this is not supported yet.
-        if (ImmReg == AMDGPU::ALU_LITERAL_X) {
-          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx));
-          assert(C);
-
-          if (C->getZExtValue() != 0) {
-            // This instruction is already using an immediate.
-            continue;
-          }
-
-          // Set the immediate value
-          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
-        }
-      }
-      // Set the immediate register
-      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
-
-      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
-    }
-    break;
-  }
-  }
-  SDNode *Result = SelectCode(N);
-
-  // Fold operands of selected node
-
-  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
-  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) {
-    const R600InstrInfo *TII =
-        static_cast<const R600InstrInfo*>(TM.getInstrInfo());
-    if (Result && Result->isMachineOpcode() &&
-        !(TII->get(Result->getMachineOpcode()).TSFlags & R600_InstFlag::VECTOR)
-        && TII->isALUInstr(Result->getMachineOpcode())) {
-      // Fold FNEG/FABS/CONST_ADDRESS
-      // TODO: Isel can generate multiple MachineInst, we need to recursively
-      // parse Result
-      bool IsModified = false;
-      do {
-        std::vector<SDValue> Ops;
-        for(SDNode::op_iterator I = Result->op_begin(), E = Result->op_end();
-            I != E; ++I)
-          Ops.push_back(*I);
-        IsModified = FoldOperands(Result->getMachineOpcode(), TII, Ops);
-        if (IsModified) {
-          Result = CurDAG->UpdateNodeOperands(Result, Ops.data(), Ops.size());
-        }
-      } while (IsModified);
-
-      // If node has a single use which is CLAMP_R600, folds it
-      if (Result->hasOneUse() && Result->isMachineOpcode()) {
-        SDNode *PotentialClamp = *Result->use_begin();
-        if (PotentialClamp->isMachineOpcode() &&
-            PotentialClamp->getMachineOpcode() == AMDGPU::CLAMP_R600) {
-          unsigned ClampIdx =
-            TII->getOperandIdx(Result->getMachineOpcode(), R600Operands::CLAMP);
-          std::vector<SDValue> Ops;
-          unsigned NumOp = Result->getNumOperands();
-          for (unsigned i = 0; i < NumOp; ++i) {
-            Ops.push_back(Result->getOperand(i));
-          }
-          Ops[ClampIdx - 1] = CurDAG->getTargetConstant(1, MVT::i32);
-          Result = CurDAG->SelectNodeTo(PotentialClamp,
-              Result->getMachineOpcode(), PotentialClamp->getVTList(),
-              Ops.data(), NumOp);
-        }
-      }
-    }
-  }
-
-  return Result;
-}
-
-bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
-    const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
-  int OperandIdx[] = {
-    TII->getOperandIdx(Opcode, R600Operands::SRC0),
-    TII->getOperandIdx(Opcode, R600Operands::SRC1),
-    TII->getOperandIdx(Opcode, R600Operands::SRC2)
-  };
-  int SelIdx[] = {
-    TII->getOperandIdx(Opcode, R600Operands::SRC0_SEL),
-    TII->getOperandIdx(Opcode, R600Operands::SRC1_SEL),
-    TII->getOperandIdx(Opcode, R600Operands::SRC2_SEL)
-  };
-  int NegIdx[] = {
-    TII->getOperandIdx(Opcode, R600Operands::SRC0_NEG),
-    TII->getOperandIdx(Opcode, R600Operands::SRC1_NEG),
-    TII->getOperandIdx(Opcode, R600Operands::SRC2_NEG)
-  };
-  int AbsIdx[] = {
-    TII->getOperandIdx(Opcode, R600Operands::SRC0_ABS),
-    TII->getOperandIdx(Opcode, R600Operands::SRC1_ABS),
-    -1
-  };
-
-  for (unsigned i = 0; i < 3; i++) {
-    if (OperandIdx[i] < 0)
-      return false;
-    SDValue Operand = Ops[OperandIdx[i] - 1];
-    switch (Operand.getOpcode()) {
-    case AMDGPUISD::CONST_ADDRESS: {
-      SDValue CstOffset;
-      if (Operand.getValueType().isVector() ||
-          !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset))
-        break;
-
-      // Gather others constants values
-      std::vector<unsigned> Consts;
-      for (unsigned j = 0; j < 3; j++) {
-        int SrcIdx = OperandIdx[j];
-        if (SrcIdx < 0)
-          break;
-        if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
-          if (Reg->getReg() == AMDGPU::ALU_CONST) {
-            ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
-            Consts.push_back(Cst->getZExtValue());
-          }
-        }
-      }
-
-      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
-      Consts.push_back(Cst->getZExtValue());
-      if (!TII->fitsConstReadLimitations(Consts))
-        break;
-
-      Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
-      Ops[SelIdx[i] - 1] = CstOffset;
-      return true;
-      }
-    case ISD::FNEG:
-      if (NegIdx[i] < 0)
-        break;
-      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
-      Ops[NegIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
-      return true;
-    case ISD::FABS:
-      if (AbsIdx[i] < 0)
-        break;
-      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
-      Ops[AbsIdx[i] - 1] = CurDAG->getTargetConstant(1, MVT::i32);
-      return true;
-    case ISD::BITCAST:
-      Ops[OperandIdx[i] - 1] = Operand.getOperand(0);
-      return true;
-    default:
-      break;
-    }
-  }
-  return false;
-}
-
-bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
-  if (!ptr) {
-    return false;
-  }
-  Type *ptrType = ptr->getType();
-  return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
-}
-
-const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) {
-  if (!V) {
-    return NULL;
-  }
-  const Value *ret = NULL;
-  ValueMap<const Value *, bool> ValueBitMap;
-  std::queue<const Value *, std::list<const Value *> > ValueQueue;
-  ValueQueue.push(V);
-  while (!ValueQueue.empty()) {
-    V = ValueQueue.front();
-    if (ValueBitMap.find(V) == ValueBitMap.end()) {
-      ValueBitMap[V] = true;
-      if (dyn_cast<Argument>(V) && dyn_cast<PointerType>(V->getType())) {
-        ret = V;
-        break;
-      } else if (dyn_cast<GlobalVariable>(V)) {
-        ret = V;
-        break;
-      } else if (dyn_cast<Constant>(V)) {
-        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
-        if (CE) {
-          ValueQueue.push(CE->getOperand(0));
-        }
-      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
-        ret = AI;
-        break;
-      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
-        uint32_t numOps = I->getNumOperands();
-        for (uint32_t x = 0; x < numOps; ++x) {
-          ValueQueue.push(I->getOperand(x));
-        }
-      } else {
-        assert(!"Found a Value that we didn't know how to handle!");
-      }
-    }
-    ValueQueue.pop();
-  }
-  return ret;
-}
-
-bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
-  return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
-          && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
-          && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
-}
-
-bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
-  if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) {
-    return true;
-  }
-  MachineMemOperand *MMO = N->getMemOperand();
-  const Value *V = MMO->getValue();
-  const Value *BV = getBasePointerValue(V);
-  if (MMO
-      && MMO->getValue()
-      && ((V && dyn_cast<GlobalValue>(V))
-          || (BV && dyn_cast<GlobalValue>(
-                        getBasePointerValue(MMO->getValue()))))) {
-    return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS);
-  } else {
-    return false;
-  }
-}
-
-bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
-}
-
-bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
-  MachineMemOperand *MMO = N->getMemOperand();
-  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
-    if (MMO) {
-      const Value *V = MMO->getValue();
-      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
-      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
-  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
-    // Check to make sure we are not a constant pool load or a constant load
-    // that is marked as a private load
-    if (isCPLoad(N) || isConstantLoad(N, -1)) {
-      return false;
-    }
-  }
-  if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
-    return true;
-  }
-  return false;
-}
-
-const char *AMDGPUDAGToDAGISel::getPassName() const {
-  return "AMDGPU DAG->DAG Pattern Instruction Selection";
-}
-
-#ifdef DEBUGTMP
-#undef INT64_C
-#endif
-#undef DEBUGTMP
-
-///==== AMDGPU Functions ====///
-
-bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
-    SDValue& IntPtr) {
-  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
-    IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
-    return true;
-  }
-  return false;
-}
-
-bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
-    SDValue& BaseReg, SDValue &Offset) {
-  if (!dyn_cast<ConstantSDNode>(Addr)) {
-    BaseReg = Addr;
-    Offset = CurDAG->getIntPtrConstant(0, true);
-    return true;
-  }
-  return false;
-}
-
-bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
-                                           SDValue &Offset) {
-  ConstantSDNode * IMMOffset;
-
-  if (Addr.getOpcode() == ISD::ADD
-      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
-      && isInt<16>(IMMOffset->getZExtValue())) {
-
-      Base = Addr.getOperand(0);
-      Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
-      return true;
-  // If the pointer address is constant, we can move it to the offset field.
-  } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr))
-             && isInt<16>(IMMOffset->getZExtValue())) {
-    Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
-                                  CurDAG->getEntryNode().getDebugLoc(),
-                                  AMDGPU::ZERO, MVT::i32);
-    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
-    return true;
-  }
-
-  // Default case, no offset
-  Base = Addr;
-  Offset = CurDAG->getTargetConstant(0, MVT::i32);
-  return true;
-}
-
-bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
-                                            SDValue &Offset) {
-  ConstantSDNode *C;
-
-  if ((C = dyn_cast<ConstantSDNode>(Addr))) {
-    Base = CurDAG->getRegister(AMDGPU::INDIRECT_BASE_ADDR, MVT::i32);
-    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
-  } else if ((Addr.getOpcode() == ISD::ADD || Addr.getOpcode() == ISD::OR) &&
-            (C = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
-    Base = Addr.getOperand(0);
-    Offset = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
-  } else {
-    Base = Addr;
-    Offset = CurDAG->getTargetConstant(0, MVT::i32);
-  }
-
-  return true;
-}
-
-void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
-
-  // Go over all selected nodes and try to fold them a bit more
-  const AMDGPUTargetLowering& Lowering = ((const AMDGPUTargetLowering&)TLI);
-  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
-       E = CurDAG->allnodes_end(); I != E; ++I) {
-
-    MachineSDNode *Node = dyn_cast<MachineSDNode>(I);
-    if (!Node)
-      continue;
-
-    SDNode *ResNode = Lowering.PostISelFolding(Node, *CurDAG);
-    if (ResNode != Node)
-      ReplaceUses(Node, ResNode);
-  }
-}
-
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index 922cac1..970787e 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -15,7 +15,6 @@
 #include "AMDGPUISelLowering.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "AMDILDevices.h"
 #include "AMDILIntrinsicInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -40,7 +39,7 @@ using namespace llvm;
 // TargetLowering Class Implementation Begins
 //===----------------------------------------------------------------------===//
 void AMDGPUTargetLowering::InitAMDILLowering() {
-  int types[] = {
+  static const int types[] = {
     (int)MVT::i8,
     (int)MVT::i16,
     (int)MVT::i32,
@@ -59,19 +58,19 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
     (int)MVT::v2i64
   };
 
-  int IntTypes[] = {
+  static const int IntTypes[] = {
     (int)MVT::i8,
     (int)MVT::i16,
     (int)MVT::i32,
     (int)MVT::i64
   };
 
-  int FloatTypes[] = {
+  static const int FloatTypes[] = {
     (int)MVT::f32,
     (int)MVT::f64
   };
 
-  int VectorTypes[] = {
+  static const int VectorTypes[] = {
     (int)MVT::v2i8,
     (int)MVT::v4i8,
     (int)MVT::v2i16,
@@ -83,10 +82,10 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
     (int)MVT::v2f64,
     (int)MVT::v2i64
   };
-  size_t NumTypes = sizeof(types) / sizeof(*types);
-  size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
-  size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
-  size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
+  const size_t NumTypes = array_lengthof(types);
+  const size_t NumFloatTypes = array_lengthof(FloatTypes);
+  const size_t NumIntTypes = array_lengthof(IntTypes);
+  const size_t NumVectorTypes = array_lengthof(VectorTypes);
 
   const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
   // These are the current register classes that are
@@ -138,8 +137,6 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
 
-    // GPU doesn't have a rotl, rotr, or byteswap instruction
-    setOperationAction(ISD::ROTR, VT, Expand);
     setOperationAction(ISD::BSWAP, VT, Expand);
 
     // GPU doesn't have any counting operators
@@ -158,21 +155,19 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
     setOperationAction(ISD::SELECT_CC, VT, Expand);
 
   }
-  if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
-    setOperationAction(ISD::MULHU, MVT::i64, Expand);
-    setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
-    setOperationAction(ISD::MULHS, MVT::i64, Expand);
-    setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
-    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
-    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
-    setOperationAction(ISD::Constant          , MVT::i64  , Legal);
-    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
-    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
-    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
-    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
-    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
-  }
-  if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
+  setOperationAction(ISD::MULHU, MVT::i64, Expand);
+  setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
+  setOperationAction(ISD::MULHS, MVT::i64, Expand);
+  setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
+  setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+  setOperationAction(ISD::SREM, MVT::v2i64, Expand);
+  setOperationAction(ISD::Constant          , MVT::i64  , Legal);
+  setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
+  setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
+  setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
+  setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
+  setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
+  if (STM.hasHWFP64()) {
     // we support loading/storing v2f64 but not operations on the type
     setOperationAction(ISD::FADD, MVT::v2f64, Expand);
     setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
@@ -331,7 +326,7 @@ SDValue
 AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
   SDValue Data = Op.getOperand(0);
   VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT DVT = Data.getValueType();
   EVT BVT = BaseType->getVT();
   unsigned baseBits = BVT.getScalarType().getSizeInBits();
@@ -387,7 +382,7 @@ AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   SDValue Result;
   Result = DAG.getNode(
       AMDGPUISD::BRANCH_COND,
-      Op.getDebugLoc(),
+      SDLoc(Op),
       Op.getValueType(),
       Chain, Jump, Cond);
   return Result;
@@ -395,7 +390,7 @@ AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT OVT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -476,7 +471,7 @@ AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT OVT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
@@ -547,7 +542,7 @@ AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT OVT = Op.getValueType();
   MVT INTTY = MVT::i32;
   if (OVT == MVT::v2i8) {
@@ -564,7 +559,7 @@ AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT OVT = Op.getValueType();
   MVT INTTY = MVT::i32;
   if (OVT == MVT::v2i16) {
@@ -581,7 +576,7 @@ AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue
 AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT OVT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
index 110f147..0f0c88d 100644
--- a/lib/Target/R600/AMDILInstrInfo.td
+++ b/lib/Target/R600/AMDILInstrInfo.td
@@ -10,63 +10,6 @@
 // This file describes the AMDIL instructions in TableGen format.
 //
 //===----------------------------------------------------------------------===//
-// AMDIL Instruction Predicate Definitions
-// Predicate that is set to true if the hardware supports double precision
-// divide
-def HasHWDDiv                 : Predicate<"Subtarget.device()"
-                           "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && "
-              "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
-
-// Predicate that is set to true if the hardware supports double, but not double
-// precision divide in hardware
-def HasSWDDiv             : Predicate<"Subtarget.device()"
-                           "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
-              "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
-
-// Predicate that is set to true if the hardware support 24bit signed
-// math ops. Otherwise a software expansion to 32bit math ops is used instead.
-def HasHWSign24Bit          : Predicate<"Subtarget.device()"
-                            "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">;
-
-// Predicate that is set to true if 64bit operations are supported or not
-def HasHW64Bit              : Predicate<"Subtarget.device()"
-                            "->usesHardware(AMDGPUDeviceInfo::LongOps)">;
-def HasSW64Bit              : Predicate<"Subtarget.device()"
-                            "->usesSoftware(AMDGPUDeviceInfo::LongOps)">;
-
-// Predicate that is set to true if the timer register is supported
-def HasTmrRegister          : Predicate<"Subtarget.device()"
-                            "->isSupported(AMDGPUDeviceInfo::TmrReg)">;
-// Predicate that is true if we are at least evergreen series
-def HasDeviceIDInst         : Predicate<"Subtarget.device()"
-                            "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">;
-
-// Predicate that is true if we have region address space.
-def hasRegionAS             : Predicate<"Subtarget.device()"
-                            "->usesHardware(AMDGPUDeviceInfo::RegionMem)">;
-
-// Predicate that is false if we don't have region address space.
-def noRegionAS             : Predicate<"!Subtarget.device()"
-                            "->isSupported(AMDGPUDeviceInfo::RegionMem)">;
-
-
-// Predicate that is set to true if 64bit Mul is supported in the IL or not
-def HasHW64Mul              : Predicate<"Subtarget.calVersion()" 
-                                          ">= CAL_VERSION_SC_139"
-                                          "&& Subtarget.device()"
-                                          "->getGeneration() >="
-                                          "AMDGPUDeviceInfo::HD5XXX">;
-def HasSW64Mul              : Predicate<"Subtarget.calVersion()" 
-                                          "< CAL_VERSION_SC_139">;
-// Predicate that is set to true if 64bit Div/Mod is supported in the IL or not
-def HasHW64DivMod           : Predicate<"Subtarget.device()"
-                            "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">;
-def HasSW64DivMod           : Predicate<"Subtarget.device()"
-                            "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">;
-
-// Predicate that is set to true if 64bit pointer are used.
-def Has64BitPtr             : Predicate<"Subtarget.is64bit()">;
-def Has32BitPtr             : Predicate<"!Subtarget.is64bit()">;
 //===--------------------------------------------------------------------===//
 // Custom Operands
 //===--------------------------------------------------------------------===//
@@ -175,15 +118,15 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
 // Multiclass Instruction formats
 //===--------------------------------------------------------------------===//
 // Multiclass that handles branch instructions
-multiclass BranchConditional<SDNode Op> {
+multiclass BranchConditional<SDNode Op, RegisterClass rci, RegisterClass rcf> {
     def _i32 : ILFormat<(outs),
-  (ins brtarget:$target, GPRI32:$src0),
+  (ins brtarget:$target, rci:$src0),
         "; i32 Pseudo branch instruction",
-  [(Op bb:$target, GPRI32:$src0)]>;
+  [(Op bb:$target, (i32 rci:$src0))]>;
     def _f32 : ILFormat<(outs),
-  (ins brtarget:$target, GPRF32:$src0),
+  (ins brtarget:$target, rcf:$src0),
         "; f32 Pseudo branch instruction",
-  [(Op bb:$target, GPRF32:$src0)]>;
+  [(Op bb:$target, (f32 rcf:$src0))]>;
 }
 
 // Only scalar types should generate flow control
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp
index 4ddb057..762ee39 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp
@@ -14,7 +14,6 @@
 
 #include "AMDILIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "AMDIL.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
@@ -50,6 +49,9 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
 
 unsigned int
 AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const  {
+  if (!StringRef(Name, Len).startswith("llvm."))
+    return 0; // All intrinsics start with 'llvm.'
+
 #define GET_FUNCTION_RECOGNIZER
 #include "AMDGPUGenIntrinsics.inc"
 #undef GET_FUNCTION_RECOGNIZER
diff --git a/lib/Target/R600/AMDILNIDevice.cpp b/lib/Target/R600/AMDILNIDevice.cpp
deleted file mode 100644
index 47c3f7f..0000000
--- a/lib/Target/R600/AMDILNIDevice.cpp
+++ /dev/null
@@ -1,65 +0,0 @@
-//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILNIDevice.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILEvergreenDevice.h"
-
-using namespace llvm;
-
-AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST)
-  : AMDGPUEvergreenDevice(ST) {
-  std::string name = ST->getDeviceName();
-  if (name == "caicos") {
-    DeviceFlag = OCL_DEVICE_CAICOS;
-  } else if (name == "turks") {
-    DeviceFlag = OCL_DEVICE_TURKS;
-  } else if (name == "cayman") {
-    DeviceFlag = OCL_DEVICE_CAYMAN;
-  } else {
-    DeviceFlag = OCL_DEVICE_BARTS;
-  }
-}
-AMDGPUNIDevice::~AMDGPUNIDevice() {
-}
-
-size_t
-AMDGPUNIDevice::getMaxLDSSize() const {
-  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
-    return MAX_LDS_SIZE_900;
-  } else {
-    return 0;
-  }
-}
-
-uint32_t
-AMDGPUNIDevice::getGeneration() const {
-  return AMDGPUDeviceInfo::HD6XXX;
-}
-
-
-AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST)
-  : AMDGPUNIDevice(ST) {
-  setCaps();
-}
-
-AMDGPUCaymanDevice::~AMDGPUCaymanDevice() {
-}
-
-void
-AMDGPUCaymanDevice::setCaps() {
-  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
-    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
-    mHWBits.set(AMDGPUDeviceInfo::FMA);
-  }
-  mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
-  mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps);
-  mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
-}
-
diff --git a/lib/Target/R600/AMDILNIDevice.h b/lib/Target/R600/AMDILNIDevice.h
deleted file mode 100644
index 24a6408..0000000
--- a/lib/Target/R600/AMDILNIDevice.h
+++ /dev/null
@@ -1,57 +0,0 @@
-//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===---------------------------------------------------------------------===//
-#ifndef AMDILNIDEVICE_H
-#define AMDILNIDEVICE_H
-#include "AMDGPUSubtarget.h"
-#include "AMDILEvergreenDevice.h"
-
-namespace llvm {
-
-class AMDGPUSubtarget;
-//===---------------------------------------------------------------------===//
-// NI generation of devices and their respective sub classes
-//===---------------------------------------------------------------------===//
-
-/// \brief The AMDGPUNIDevice is the base class for all Northern Island series of
-/// cards.
-///
-/// It is very similiar to the AMDGPUEvergreenDevice, with the major
-/// exception being differences in wavefront size and hardware capabilities.  The
-/// NI devices are all 64 wide wavefronts and also add support for signed 24 bit
-/// integer operations
-class AMDGPUNIDevice : public AMDGPUEvergreenDevice {
-public:
-  AMDGPUNIDevice(AMDGPUSubtarget*);
-  virtual ~AMDGPUNIDevice();
-  virtual size_t getMaxLDSSize() const;
-  virtual uint32_t getGeneration() const;
-};
-
-/// Just as the AMDGPUCypressDevice is the double capable version of the
-/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double capable version
-/// of the AMDGPUNIDevice.  The other major difference is that the Cayman Device
-/// has 4 wide ALU's, whereas the rest of the NI family is a 5 wide.
-class AMDGPUCaymanDevice: public AMDGPUNIDevice {
-public:
-  AMDGPUCaymanDevice(AMDGPUSubtarget*);
-  virtual ~AMDGPUCaymanDevice();
-private:
-  virtual void setCaps();
-};
-
-static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800;
-} // namespace llvm
-#endif // AMDILNIDEVICE_H
diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp
deleted file mode 100644
index 0d1de3d..0000000
--- a/lib/Target/R600/AMDILSIDevice.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-#include "AMDILSIDevice.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDILEvergreenDevice.h"
-#include "AMDILNIDevice.h"
-
-using namespace llvm;
-
-AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST)
-  : AMDGPUEvergreenDevice(ST) {
-}
-AMDGPUSIDevice::~AMDGPUSIDevice() {
-}
-
-size_t
-AMDGPUSIDevice::getMaxLDSSize() const {
-  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
-    return MAX_LDS_SIZE_900;
-  } else {
-    return 0;
-  }
-}
-
-uint32_t
-AMDGPUSIDevice::getGeneration() const {
-  return AMDGPUDeviceInfo::HD7XXX;
-}
-
-std::string
-AMDGPUSIDevice::getDataLayout() const {
-  return std::string(
-    "e"
-    "-p:64:64:64"
-    "-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64"
-    "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128"
-    "-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024"
-    "-v2048:2048:2048"
-    "-n32:64"
-  );
-}
diff --git a/lib/Target/R600/AMDILSIDevice.h b/lib/Target/R600/AMDILSIDevice.h
deleted file mode 100644
index 5b2cb25..0000000
--- a/lib/Target/R600/AMDILSIDevice.h
+++ /dev/null
@@ -1,39 +0,0 @@
-//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//==-----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief Interface for the subtarget data classes.
-///
-/// This file will define the interface that each generation needs to
-/// implement in order to correctly answer queries on the capabilities of the
-/// specific hardware.
-//===---------------------------------------------------------------------===//
-#ifndef AMDILSIDEVICE_H
-#define AMDILSIDEVICE_H
-#include "AMDILEvergreenDevice.h"
-
-namespace llvm {
-class AMDGPUSubtarget;
-//===---------------------------------------------------------------------===//
-// SI generation of devices and their respective sub classes
-//===---------------------------------------------------------------------===//
-
-/// \brief The AMDGPUSIDevice is the base class for all Southern Island series
-/// of cards.
-class AMDGPUSIDevice : public AMDGPUEvergreenDevice {
-public:
-  AMDGPUSIDevice(AMDGPUSubtarget*);
-  virtual ~AMDGPUSIDevice();
-  virtual size_t getMaxLDSSize() const;
-  virtual uint32_t getGeneration() const;
-  virtual std::string getDataLayout() const;
-};
-
-} // namespace llvm
-#endif // AMDILSIDEVICE_H
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 97f0a40..9f8f6a8 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -12,28 +12,22 @@ tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer)
 add_public_tablegen_target(AMDGPUCommonTableGen)
 
 add_llvm_target(R600CodeGen
-  AMDIL7XXDevice.cpp
   AMDILCFGStructurizer.cpp
-  AMDILDevice.cpp
-  AMDILDeviceInfo.cpp
-  AMDILEvergreenDevice.cpp
   AMDILIntrinsicInfo.cpp
-  AMDILISelDAGToDAG.cpp
   AMDILISelLowering.cpp
-  AMDILNIDevice.cpp
-  AMDILSIDevice.cpp
   AMDGPUAsmPrinter.cpp
   AMDGPUFrameLowering.cpp
-  AMDGPUIndirectAddressing.cpp
+  AMDGPUISelDAGToDAG.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUSubtarget.cpp
-  AMDGPUStructurizeCFG.cpp
   AMDGPUTargetMachine.cpp
+  AMDGPUTargetTransformInfo.cpp
   AMDGPUISelLowering.cpp
   AMDGPUConvertToISA.cpp
   AMDGPUInstrInfo.cpp
   AMDGPURegisterInfo.cpp
+  R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
   R600EmitClauseMarkers.cpp
   R600ExpandSpecialInstrs.cpp
@@ -41,18 +35,22 @@ add_llvm_target(R600CodeGen
   R600ISelLowering.cpp
   R600MachineFunctionInfo.cpp
   R600MachineScheduler.cpp
+  R600OptimizeVectorRegisters.cpp
   R600Packetizer.cpp
   R600RegisterInfo.cpp
+  R600TextureIntrinsicsReplacer.cpp
   SIAnnotateControlFlow.cpp
+  SIFixSGPRCopies.cpp
   SIInsertWaits.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
   SILowerControlFlow.cpp
   SIMachineFunctionInfo.cpp
   SIRegisterInfo.cpp
+  SITypeRewriter.cpp
   )
 
-add_dependencies(LLVMR600CodeGen intrinsics_gen)
+add_dependencies(LLVMR600CodeGen AMDGPUCommonTableGen intrinsics_gen)
 
 add_subdirectory(InstPrinter)
 add_subdirectory(TargetInfo)
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 303cdf2..99e1377 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -10,8 +10,8 @@
 
 #include "AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
 
 using namespace llvm;
 
@@ -23,6 +23,63 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   printAnnotation(OS, Annot);
 }
 
+void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
+  switch (reg) {
+  case AMDGPU::VCC:
+    O << "vcc";
+    return;
+  case AMDGPU::SCC:
+    O << "scc";
+    return;
+  case AMDGPU::EXEC:
+    O << "exec";
+    return;
+  case AMDGPU::M0:
+    O << "m0";
+    return;
+  default:
+    break;
+  }
+
+  // It's seems there's no way to use SIRegisterInfo here, and dealing with the
+  // giant enum of all the different shifted sets of registers is pretty
+  // unmanagable, so parse the name and reformat it to be prettier.
+  StringRef Name(getRegisterName(reg));
+
+  std::pair<StringRef, StringRef> Split = Name.split('_');
+  StringRef SubRegName = Split.first;
+  StringRef Rest = Split.second;
+
+  if (SubRegName.size() <= 4) { // Must at least be as long as "SGPR"/"VGPR".
+    O << Name;
+    return;
+  }
+
+  unsigned RegIndex;
+  StringRef RegIndexStr = SubRegName.drop_front(4);
+
+  if (RegIndexStr.getAsInteger(10, RegIndex)) {
+    O << Name;
+    return;
+  }
+
+  if (SubRegName.front() == 'V')
+    O << 'v';
+  else if (SubRegName.front() == 'S')
+    O << 's';
+  else {
+    O << Name;
+    return;
+  }
+
+  if (Rest.empty()) // Only 1 32-bit register
+    O << RegIndex;
+  else {
+    unsigned NumReg = Rest.count('_') + 2;
+    O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']';
+  }
+}
+
 void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                      raw_ostream &O) {
 
@@ -30,8 +87,12 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   if (Op.isReg()) {
     switch (Op.getReg()) {
     // This is the default predicate state, so we don't need to print it.
-    case AMDGPU::PRED_SEL_OFF: break;
-    default: O << getRegisterName(Op.getReg()); break;
+    case AMDGPU::PRED_SEL_OFF:
+      break;
+
+    default:
+      printRegOperand(Op.getReg(), O);
+      break;
     }
   } else if (Op.isImm()) {
     O << Op.getImm();
@@ -102,7 +163,7 @@ void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
 
 void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo,
                                   raw_ostream &O) {
-  printIfSet(MI, OpNo, O.indent(20 - O.GetNumBytesInBuffer()), "*", " ");
+  printIfSet(MI, OpNo, O.indent(25 - O.GetNumBytesInBuffer()), "*", " ");
 }
 
 void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo,
@@ -178,13 +239,13 @@ void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
   int BankSwizzle = MI->getOperand(OpNo).getImm();
   switch (BankSwizzle) {
   case 1:
-    O << "BS:VEC_021";
+    O << "BS:VEC_021/SCL_122";
     break;
   case 2:
-    O << "BS:VEC_120";
+    O << "BS:VEC_120/SCL_212";
     break;
   case 3:
-    O << "BS:VEC_102";
+    O << "BS:VEC_102/SCL_221";
     break;
   case 4:
     O << "BS:VEC_201";
@@ -198,6 +259,51 @@ void AMDGPUInstPrinter::printBankSwizzle(const MCInst *MI, unsigned OpNo,
   return;
 }
 
+void AMDGPUInstPrinter::printRSel(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  unsigned Sel = MI->getOperand(OpNo).getImm();
+  switch (Sel) {
+  case 0:
+    O << "X";
+    break;
+  case 1:
+    O << "Y";
+    break;
+  case 2:
+    O << "Z";
+    break;
+  case 3:
+    O << "W";
+    break;
+  case 4:
+    O << "0";
+    break;
+  case 5:
+    O << "1";
+    break;
+  case 7:
+    O << "_";
+    break;
+  default:
+    break;
+  }
+}
+
+void AMDGPUInstPrinter::printCT(const MCInst *MI, unsigned OpNo,
+                                  raw_ostream &O) {
+  unsigned CT = MI->getOperand(OpNo).getImm();
+  switch (CT) {
+  case 0:
+    O << "U";
+    break;
+  case 1:
+    O << "N";
+    break;
+  default:
+    break;
+  }
+}
+
 void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
                                     raw_ostream &O) {
   int KCacheMode = MI->getOperand(OpNo).getImm();
@@ -210,4 +316,21 @@ void AMDGPUInstPrinter::printKCache(const MCInst *MI, unsigned OpNo,
   }
 }
 
+void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  // Note: Mask values are taken from SIInsertWaits.cpp and not from ISA docs
+  // SIInsertWaits.cpp bits usage does not match ISA docs description but it
+  // works so it might be a misprint in docs.
+  unsigned SImm16 = MI->getOperand(OpNo).getImm();
+  unsigned Vmcnt = SImm16 & 0xF;
+  unsigned Expcnt = (SImm16 >> 4) & 0xF;
+  unsigned Lgkmcnt = (SImm16 >> 8) & 0xF;
+  if (Vmcnt != 0xF)
+    O << "vmcnt(" << Vmcnt << ") ";
+  if (Expcnt != 0x7)
+    O << "expcnt(" << Expcnt << ") ";
+  if (Lgkmcnt != 0x7)
+    O << "lgkmcnt(" << Lgkmcnt << ")";
+}
+
 #include "AMDGPUGenAsmWriter.inc"
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index c6fd053..77af942 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -32,6 +32,7 @@ public:
   virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
 
 private:
+  void printRegOperand(unsigned RegNo, raw_ostream &O);
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -49,7 +50,10 @@ private:
   void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printBankSwizzle(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRSel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printCT(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printKCache(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printWaitFlag(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index a3397f3..29d0acf 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -82,6 +82,8 @@ void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
 // ELFAMDGPUAsmBackend class
 //===----------------------------------------------------------------------===//
 
+namespace {
+
 class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
 public:
   ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { }
@@ -91,7 +93,11 @@ public:
   }
 };
 
-MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT,
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
                                            StringRef CPU) {
   return new ELFAMDGPUAsmBackend(T);
 }
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 2aae26a..4a8e1b0 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -11,7 +11,7 @@
 #include "AMDGPUMCAsmInfo.h"
 
 using namespace llvm;
-AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
+AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
   HasSingleParameterDotFile = false;
   WeakDefDirective = 0;
   //===------------------------------------------------------------------===//
@@ -21,7 +21,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
   HasStaticCtorDtorReferenceInStaticMode = false;
   LinkerRequiresNonEmptyDwarfLines = true;
   MaxInstLength = 16;
-  PCSymbol = "$";
   SeparatorString = "\n";
   CommentColumn = 40;
   CommentString = ";";
@@ -32,9 +31,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
   InlineAsmStart = ";#ASMSTART";
   InlineAsmEnd = ";#ASMEND";
   AssemblerDialect = 0;
-  AllowQuotesInName = false;
-  AllowNameToStartWithDigit = false;
-  AllowPeriodsInName = false;
 
   //===--- Data Emission Directives -------------------------------------===//
   ZeroDirective = ".zero";
@@ -56,13 +52,11 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
 
   //===--- Global Variable Emission Directives --------------------------===//
   GlobalDirective = ".global";
-  ExternDirective = ".extern";
   HasSetDirective = false;
   HasAggressiveSymbolFolding = true;
   COMMDirectiveAlignmentIsInBytes = false;
   HasDotTypeDotSizeDirective = false;
   HasNoDeadStrip = true;
-  HasSymbolResolver = false;
   WeakRefDirective = ".weakref\t";
   LinkOnceDirective = 0;
   //===--- Dwarf Emission Directives -----------------------------------===//
@@ -70,11 +64,6 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() {
   SupportsDebugInformation = true;
 }
 
-const char*
-AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const {
-  return 0;
-}
-
 const MCSection*
 AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
   return 0;
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
index 3ad0fa6..22afd63 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -17,13 +17,11 @@
 #include "llvm/MC/MCAsmInfo.h"
 namespace llvm {
 
-class Target;
 class StringRef;
 
 class AMDGPUMCAsmInfo : public MCAsmInfo {
 public:
-  explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT);
-  const char* getDataASDirective(unsigned int Size, unsigned int AS) const;
+  explicit AMDGPUMCAsmInfo(StringRef &TT);
   const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
 };
 } // namespace llvm
diff --git a/lib/Target/MBlaze/MBlazeSelectionDAGInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 6a115b2..521b3b3 100644
--- a/lib/Target/MBlaze/MBlazeSelectionDAGInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- MBlazeSelectionDAGInfo.cpp - MBlaze SelectionDAG Info -------------===//
+//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the MBlazeSelectionDAGInfo class.
+/// \file
+/// \brief CodeEmitter interface for R600 and SI codegen.
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mblaze-selectiondag-info"
-#include "MBlazeTargetMachine.h"
+#include "AMDGPUMCCodeEmitter.h"
+
 using namespace llvm;
 
-MBlazeSelectionDAGInfo::MBlazeSelectionDAGInfo(const MBlazeTargetMachine &TM)
-  : TargetSelectionDAGInfo(TM) {
-}
+// pin vtable to this file
+void AMDGPUMCCodeEmitter::anchor() {}
 
-MBlazeSelectionDAGInfo::~MBlazeSelectionDAGInfo() {
-}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
index cd3a7ce..d8cf64a 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -24,6 +24,7 @@ class MCInst;
 class MCOperand;
 
 class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+  virtual void anchor();
 public:
 
   uint64_t getBinaryCodeForInstr(const MCInst &MI,
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 61d70bb..a1bec28 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -88,7 +88,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     MCCodeEmitter *_Emitter,
                                     bool RelaxAll,
                                     bool NoExecStack) {
-  return createELFStreamer(Ctx, MAB, _OS, _Emitter, false, false);
+  return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, false, false);
 }
 
 extern "C" void LLVMInitializeR600TargetMC() {
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index abb0320..f6b3376 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -40,8 +40,8 @@ MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
                                      const MCSubtargetInfo &STI,
                                      MCContext &Ctx);
 
-MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT,
-                                     StringRef CPU);
+MCAsmBackend *createAMDGPUAsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
 
 MCObjectWriter *createAMDGPUELFObjectWriter(raw_ostream &OS);
 } // End llvm namespace
diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
index 3ccdf42..98f6925 100644
--- a/lib/Target/R600/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt
@@ -2,6 +2,7 @@
 add_llvm_library(LLVMR600Desc
   AMDGPUAsmBackend.cpp
   AMDGPUELFObjectWriter.cpp
+  AMDGPUMCCodeEmitter.cpp
   AMDGPUMCTargetDesc.cpp
   AMDGPUMCAsmInfo.cpp
   R600MCCodeEmitter.cpp
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index cb4cf0c..dd8df65 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -24,7 +24,6 @@
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/raw_ostream.h"
-#include <stdio.h>
 
 using namespace llvm;
 
@@ -81,21 +80,6 @@ enum FCInstr {
   FC_CONTINUE
 };
 
-enum TextureTypes {
-  TEXTURE_1D = 1,
-  TEXTURE_2D,
-  TEXTURE_3D,
-  TEXTURE_CUBE,
-  TEXTURE_RECT,
-  TEXTURE_SHADOW1D,
-  TEXTURE_SHADOW2D,
-  TEXTURE_SHADOWRECT,
-  TEXTURE_1D_ARRAY,
-  TEXTURE_2D_ARRAY,
-  TEXTURE_SHADOW1D_ARRAY,
-  TEXTURE_SHADOW2D_ARRAY
-};
-
 MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
                                            const MCRegisterInfo &MRI,
                                            const MCSubtargetInfo &STI) {
@@ -114,69 +98,37 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   } else if (IS_VTX(Desc)) {
     uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups);
     uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
-    InstWord2 |= 1 << 19;
+    if (!(STI.getFeatureBits() & AMDGPU::FeatureCaymanISA)) {
+      InstWord2 |= 1 << 19; // Mega-Fetch bit
+    }
 
     Emit(InstWord01, OS);
     Emit(InstWord2, OS);
-    Emit((u_int32_t) 0, OS);
+    Emit((uint32_t) 0, OS);
   } else if (IS_TEX(Desc)) {
-    unsigned Opcode = MI.getOpcode();
-    bool HasOffsets = (Opcode == AMDGPU::TEX_LD);
-    unsigned OpOffset = HasOffsets ? 3 : 0;
-    int64_t Sampler = MI.getOperand(OpOffset + 3).getImm();
-    int64_t TextureType = MI.getOperand(OpOffset + 4).getImm();
-
-    uint32_t SrcSelect[4] = {0, 1, 2, 3};
-    uint32_t Offsets[3] = {0, 0, 0};
-    uint64_t CoordType[4] = {1, 1, 1, 1};
-
-    if (HasOffsets)
-      for (unsigned i = 0; i < 3; i++) {
-        int SignedOffset = MI.getOperand(i + 2).getImm();
-        Offsets[i] = (SignedOffset & 0x1F);
-      }
-
-    if (TextureType == TEXTURE_RECT ||
-        TextureType == TEXTURE_SHADOWRECT) {
-      CoordType[ELEMENT_X] = 0;
-      CoordType[ELEMENT_Y] = 0;
-    }
-
-    if (TextureType == TEXTURE_1D_ARRAY ||
-        TextureType == TEXTURE_SHADOW1D_ARRAY) {
-      if (Opcode == AMDGPU::TEX_SAMPLE_C_L ||
-          Opcode == AMDGPU::TEX_SAMPLE_C_LB) {
-        CoordType[ELEMENT_Y] = 0;
-      } else {
-        CoordType[ELEMENT_Z] = 0;
-        SrcSelect[ELEMENT_Z] = ELEMENT_Y;
-      }
-    } else if (TextureType == TEXTURE_2D_ARRAY ||
-        TextureType == TEXTURE_SHADOW2D_ARRAY) {
-      CoordType[ELEMENT_Z] = 0;
-    }
-
-
-    if ((TextureType == TEXTURE_SHADOW1D ||
-        TextureType == TEXTURE_SHADOW2D ||
-        TextureType == TEXTURE_SHADOWRECT ||
-        TextureType == TEXTURE_SHADOW1D_ARRAY) &&
-        Opcode != AMDGPU::TEX_SAMPLE_C_L &&
-        Opcode != AMDGPU::TEX_SAMPLE_C_LB) {
-      SrcSelect[ELEMENT_W] = ELEMENT_Z;
-    }
-
-    uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups) |
-        CoordType[ELEMENT_X] << 60 | CoordType[ELEMENT_Y] << 61 |
-        CoordType[ELEMENT_Z] << 62 | CoordType[ELEMENT_W] << 63;
-    uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
-        SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
-        SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
-        Offsets[2] << 10;
-
-    Emit(Word01, OS);
-    Emit(Word2, OS);
-    Emit((u_int32_t) 0, OS);
+      int64_t Sampler = MI.getOperand(14).getImm();
+
+      int64_t SrcSelect[4] = {
+        MI.getOperand(2).getImm(),
+        MI.getOperand(3).getImm(),
+        MI.getOperand(4).getImm(),
+        MI.getOperand(5).getImm()
+      };
+      int64_t Offsets[3] = {
+        MI.getOperand(6).getImm() & 0x1F,
+        MI.getOperand(7).getImm() & 0x1F,
+        MI.getOperand(8).getImm() & 0x1F
+      };
+
+      uint64_t Word01 = getBinaryCodeForInstr(MI, Fixups);
+      uint32_t Word2 = Sampler << 15 | SrcSelect[ELEMENT_X] << 20 |
+          SrcSelect[ELEMENT_Y] << 23 | SrcSelect[ELEMENT_Z] << 26 |
+          SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
+          Offsets[2] << 10;
+
+      Emit(Word01, OS);
+      Emit(Word2, OS);
+      Emit((uint32_t) 0, OS);
   } else {
     uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
     if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index 0cbe919..ee190e4 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -10,39 +10,45 @@
 class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
 : Processor<Name, itin, Features>;
 def : Proc<"",           R600_VLIW5_Itin,
-    [FeatureR600ALUInst, FeatureVertexCache]>;
+    [FeatureR600, FeatureVertexCache]>;
 def : Proc<"r600",       R600_VLIW5_Itin,
-    [FeatureR600ALUInst , FeatureVertexCache]>;
+    [FeatureR600 , FeatureVertexCache]>;
 def : Proc<"rs880",      R600_VLIW5_Itin,
-    [FeatureR600ALUInst]>;
+    [FeatureR600]>;
 def : Proc<"rv670",      R600_VLIW5_Itin,
-    [FeatureR600ALUInst, FeatureFP64, FeatureVertexCache]>;
+    [FeatureR600, FeatureFP64, FeatureVertexCache]>;
 def : Proc<"rv710",      R600_VLIW5_Itin,
-    [FeatureVertexCache]>;
+    [FeatureR700, FeatureVertexCache]>;
 def : Proc<"rv730",      R600_VLIW5_Itin,
-    [FeatureVertexCache]>;
+    [FeatureR700, FeatureVertexCache]>;
 def : Proc<"rv770",      R600_VLIW5_Itin,
-    [FeatureFP64, FeatureVertexCache]>;
+    [FeatureR700, FeatureFP64, FeatureVertexCache]>;
 def : Proc<"cedar",      R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureVertexCache]>;
 def : Proc<"redwood",    R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureVertexCache]>;
 def : Proc<"sumo",       R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages]>;
+    [FeatureEvergreen]>;
 def : Proc<"juniper",    R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureVertexCache]>;
 def : Proc<"cypress",    R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages, FeatureFP64, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureFP64, FeatureVertexCache]>;
 def : Proc<"barts",      R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+    [FeatureNorthernIslands, FeatureVertexCache]>;
 def : Proc<"turks",      R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages, FeatureVertexCache]>;
+    [FeatureNorthernIslands, FeatureVertexCache]>;
 def : Proc<"caicos",     R600_VLIW5_Itin,
-    [FeatureByteAddress, FeatureImages]>;
+    [FeatureNorthernIslands]>;
 def : Proc<"cayman",     R600_VLIW4_Itin,
-    [FeatureByteAddress, FeatureImages, FeatureFP64]>;def : Proc<"SI",         SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"tahiti",     SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"pitcairn",   SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"verde",      SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"oland",      SI_Itin, [Feature64BitPtr, FeatureFP64]>;
-def : Proc<"hainan",     SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+    [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
+
+def : Proc<"SI",         SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"tahiti",     SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"pitcairn",   SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"verde",      SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"oland",      SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"hainan",     SI_Itin, [FeatureSouthernIslands]>;
+def : Proc<"bonaire",    SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"kabini",     SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"kaveri",     SI_Itin, [FeatureSeaIslands]>;
+def : Proc<"hawaii",     SI_Itin, [FeatureSeaIslands]>;
diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
new file mode 100644
index 0000000..33d2ca3
--- /dev/null
+++ b/lib/Target/R600/R600ClauseMergePass.cpp
@@ -0,0 +1,204 @@
+//===-- R600ClauseMergePass - Merge consecutive CF_ALU -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// R600EmitClauseMarker pass emits CFAlu instruction in a conservative maneer.
+/// This pass is merging consecutive CFAlus where applicable.
+/// It needs to be called after IfCvt for best results.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "r600mergeclause"
+#include "AMDGPU.h"
+#include "R600Defines.h"
+#include "R600InstrInfo.h"
+#include "R600MachineFunctionInfo.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+static bool isCFAlu(const MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  case AMDGPU::CF_ALU:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+class R600ClauseMergePass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const R600InstrInfo *TII;
+
+  unsigned getCFAluSize(const MachineInstr *MI) const;
+  bool isCFAluEnabled(const MachineInstr *MI) const;
+
+  /// IfCvt pass can generate "disabled" ALU clause marker that need to be
+  /// removed and their content affected to the previous alu clause.
+  /// This function parse instructions after CFAlu untill it find a disabled
+  /// CFAlu and merge the content, or an enabled CFAlu.
+  void cleanPotentialDisabledCFAlu(MachineInstr *CFAlu) const;
+
+  /// Check whether LatrCFAlu can be merged into RootCFAlu and do it if
+  /// it is the case.
+  bool mergeIfPossible(MachineInstr *RootCFAlu, const MachineInstr *LatrCFAlu)
+      const;
+
+public:
+  R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const;
+};
+
+char R600ClauseMergePass::ID = 0;
+
+unsigned R600ClauseMergePass::getCFAluSize(const MachineInstr *MI) const {
+  assert(isCFAlu(MI));
+  return MI->getOperand(
+      TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::COUNT)).getImm();
+}
+
+bool R600ClauseMergePass::isCFAluEnabled(const MachineInstr *MI) const {
+  assert(isCFAlu(MI));
+  return MI->getOperand(
+      TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::Enabled)).getImm();
+}
+
+void R600ClauseMergePass::cleanPotentialDisabledCFAlu(MachineInstr *CFAlu)
+    const {
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  MachineBasicBlock::iterator I = CFAlu, E = CFAlu->getParent()->end();
+  I++;
+  do {
+    while (I!= E && !isCFAlu(I))
+      I++;
+    if (I == E)
+      return;
+    MachineInstr *MI = I++;
+    if (isCFAluEnabled(MI))
+      break;
+    CFAlu->getOperand(CntIdx).setImm(getCFAluSize(CFAlu) + getCFAluSize(MI));
+    MI->eraseFromParent();
+  } while (I != E);
+}
+
+bool R600ClauseMergePass::mergeIfPossible(MachineInstr *RootCFAlu,
+                                          const MachineInstr *LatrCFAlu) const {
+  assert(isCFAlu(RootCFAlu) && isCFAlu(LatrCFAlu));
+  int CntIdx = TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::COUNT);
+  unsigned RootInstCount = getCFAluSize(RootCFAlu),
+      LaterInstCount = getCFAluSize(LatrCFAlu);
+  unsigned CumuledInsts = RootInstCount + LaterInstCount;
+  if (CumuledInsts >= TII->getMaxAlusPerClause()) {
+    DEBUG(dbgs() << "Excess inst counts\n");
+    return false;
+  }
+  if (RootCFAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+    return false;
+  // Is KCache Bank 0 compatible ?
+  int Mode0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE0);
+  int KBank0Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK0);
+  int KBank0LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR0);
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm() &&
+      RootCFAlu->getOperand(Mode0Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank0Idx).getImm() !=
+       RootCFAlu->getOperand(KBank0Idx).getImm() ||
+      LatrCFAlu->getOperand(KBank0LineIdx).getImm() !=
+      RootCFAlu->getOperand(KBank0LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  // Is KCache Bank 1 compatible ?
+  int Mode1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_MODE1);
+  int KBank1Idx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_BANK1);
+  int KBank1LineIdx =
+      TII->getOperandIdx(AMDGPU::CF_ALU, AMDGPU::OpName::KCACHE_ADDR1);
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm() &&
+      RootCFAlu->getOperand(Mode1Idx).getImm() &&
+      (LatrCFAlu->getOperand(KBank1Idx).getImm() !=
+      RootCFAlu->getOperand(KBank1Idx).getImm() ||
+      LatrCFAlu->getOperand(KBank1LineIdx).getImm() !=
+      RootCFAlu->getOperand(KBank1LineIdx).getImm())) {
+    DEBUG(dbgs() << "Wrong KC0\n");
+    return false;
+  }
+  if (LatrCFAlu->getOperand(Mode0Idx).getImm()) {
+    RootCFAlu->getOperand(Mode0Idx).setImm(
+        LatrCFAlu->getOperand(Mode0Idx).getImm());
+    RootCFAlu->getOperand(KBank0Idx).setImm(
+        LatrCFAlu->getOperand(KBank0Idx).getImm());
+    RootCFAlu->getOperand(KBank0LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank0LineIdx).getImm());
+  }
+  if (LatrCFAlu->getOperand(Mode1Idx).getImm()) {
+    RootCFAlu->getOperand(Mode1Idx).setImm(
+        LatrCFAlu->getOperand(Mode1Idx).getImm());
+    RootCFAlu->getOperand(KBank1Idx).setImm(
+        LatrCFAlu->getOperand(KBank1Idx).getImm());
+    RootCFAlu->getOperand(KBank1LineIdx).setImm(
+        LatrCFAlu->getOperand(KBank1LineIdx).getImm());
+  }
+  RootCFAlu->getOperand(CntIdx).setImm(CumuledInsts);
+  RootCFAlu->setDesc(TII->get(LatrCFAlu->getOpcode()));
+  return true;
+}
+
+bool R600ClauseMergePass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+                                                  BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    MachineBasicBlock::iterator I = MBB.begin(),  E = MBB.end();
+    MachineBasicBlock::iterator LatestCFAlu = E;
+    while (I != E) {
+      MachineInstr *MI = I++;
+      if ((!TII->canBeConsideredALU(MI) && !isCFAlu(MI)) ||
+          TII->mustBeLastInClause(MI->getOpcode()))
+        LatestCFAlu = E;
+      if (!isCFAlu(MI))
+        continue;
+      cleanPotentialDisabledCFAlu(MI);
+
+      if (LatestCFAlu != E && mergeIfPossible(LatestCFAlu, MI)) {
+        MI->eraseFromParent();
+      } else {
+        assert(MI->getOperand(8).getImm() && "CF ALU instruction disabled");
+        LatestCFAlu = MI;
+      }
+    }
+  }
+  return false;
+}
+
+const char *R600ClauseMergePass::getPassName() const {
+  return "R600 Merge Clause Markers Pass";
+}
+
+} // end anonymous namespace
+
+
+llvm::FunctionPass *llvm::createR600ClauseMergePass(TargetMachine &TM) {
+  return new R600ClauseMergePass(TM);
+}
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index ffe3414..ac3d8f6 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -14,8 +14,6 @@
 
 #define DEBUG_TYPE "r600cf"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
 #include "AMDGPU.h"
 #include "R600Defines.h"
 #include "R600InstrInfo.h"
@@ -24,8 +22,11 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
 
-namespace llvm {
+namespace {
 
 class R600ControlFlowFinalizer : public MachineFunctionPass {
 
@@ -48,7 +49,7 @@ private:
 
   static char ID;
   const R600InstrInfo *TII;
-  const R600RegisterInfo &TRI;
+  const R600RegisterInfo *TRI;
   unsigned MaxFetchInst;
   const AMDGPUSubtarget &ST;
 
@@ -64,7 +65,7 @@ private:
 
   const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
     unsigned Opcode = 0;
-    bool isEg = (ST.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX);
+    bool isEg = (ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
     switch (CFI) {
     case CF_TC:
       Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
@@ -97,7 +98,7 @@ private:
       Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
       break;
     case CF_END:
-      if (ST.device()->getDeviceFlag() == OCL_DEVICE_CAYMAN) {
+      if (ST.hasCaymanISA()) {
         Opcode = AMDGPU::CF_END_CM;
         break;
       }
@@ -109,28 +110,33 @@ private:
   }
 
   bool isCompatibleWithClause(const MachineInstr *MI,
-  std::set<unsigned> &DstRegs, std::set<unsigned> &SrcRegs) const {
+      std::set<unsigned> &DstRegs) const {
     unsigned DstMI, SrcMI;
     for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
         E = MI->operands_end(); I != E; ++I) {
       const MachineOperand &MO = *I;
       if (!MO.isReg())
         continue;
-      if (MO.isDef())
-        DstMI = MO.getReg();
+      if (MO.isDef()) {
+        unsigned Reg = MO.getReg();
+        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
+          DstMI = Reg;
+        else
+          DstMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+              &AMDGPU::R600_Reg128RegClass);
+      }
       if (MO.isUse()) {
         unsigned Reg = MO.getReg();
         if (AMDGPU::R600_Reg128RegClass.contains(Reg))
           SrcMI = Reg;
         else
-          SrcMI = TRI.getMatchingSuperReg(Reg,
-              TRI.getSubRegFromChannel(TRI.getHWRegChan(Reg)),
+          SrcMI = TRI->getMatchingSuperReg(Reg,
+              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
               &AMDGPU::R600_Reg128RegClass);
       }
     }
-    if ((DstRegs.find(SrcMI) == DstRegs.end()) &&
-        (SrcRegs.find(DstMI) == SrcRegs.end())) {
-      SrcRegs.insert(SrcMI);
+    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
       DstRegs.insert(DstMI);
       return true;
     } else
@@ -144,16 +150,16 @@ private:
     std::vector<MachineInstr *> ClauseContent;
     unsigned AluInstCount = 0;
     bool IsTex = TII->usesTextureCache(ClauseHead);
-    std::set<unsigned> DstRegs, SrcRegs;
+    std::set<unsigned> DstRegs;
     for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
       if (IsTrivialInst(I))
         continue;
-      if (AluInstCount > MaxFetchInst)
+      if (AluInstCount >= MaxFetchInst)
         break;
       if ((IsTex && !TII->usesTextureCache(I)) ||
           (!IsTex && !TII->usesVertexCache(I)))
         break;
-      if (!isCompatibleWithClause(I, DstRegs, SrcRegs))
+      if (!isCompatibleWithClause(I, DstRegs))
         break;
       AluInstCount ++;
       ClauseContent.push_back(I);
@@ -166,28 +172,26 @@ private:
   }
 
   void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
-    unsigned LiteralRegs[] = {
+    static const unsigned LiteralRegs[] = {
       AMDGPU::ALU_LITERAL_X,
       AMDGPU::ALU_LITERAL_Y,
       AMDGPU::ALU_LITERAL_Z,
       AMDGPU::ALU_LITERAL_W
     };
-    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
-      MachineOperand &MO = MI->getOperand(i);
-      if (!MO.isReg())
-        continue;
-      if (MO.getReg() != AMDGPU::ALU_LITERAL_X)
+    const SmallVector<std::pair<MachineOperand *, int64_t>, 3 > Srcs =
+        TII->getSrcs(MI);
+    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
+      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
         continue;
-      unsigned ImmIdx = TII->getOperandIdx(MI->getOpcode(), R600Operands::IMM);
-      int64_t Imm = MI->getOperand(ImmIdx).getImm();
+      int64_t Imm = Srcs[i].second;
       std::vector<int64_t>::iterator It =
           std::find(Lits.begin(), Lits.end(), Imm);
       if (It != Lits.end()) {
         unsigned Index = It - Lits.begin();
-        MO.setReg(LiteralRegs[Index]);
+        Srcs[i].first->setReg(LiteralRegs[Index]);
       } else {
         assert(Lits.size() < 4 && "Too many literals in Instruction Group");
-        MO.setReg(LiteralRegs[Lits.size()]);
+        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
         Lits.push_back(Imm);
       }
     }
@@ -252,6 +256,7 @@ private:
         ClauseContent.push_back(MILit);
       }
     }
+    assert(ClauseContent.size() < 128 && "ALU clause is too big");
     ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
     return ClauseFile(ClauseHead, ClauseContent);
   }
@@ -272,6 +277,7 @@ private:
   void
   EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
       unsigned &CfCount) {
+    Clause.first->getOperand(0).setImm(0);
     CounterPropagateAddr(Clause.first, CfCount);
     MachineBasicBlock *BB = Clause.first->getParent();
     BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
@@ -295,34 +301,35 @@ private:
   }
 
   unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
-    switch (ST.device()->getGeneration()) {
-    case AMDGPUDeviceInfo::HD4XXX:
+    switch (ST.getGeneration()) {
+    case AMDGPUSubtarget::R600:
+    case AMDGPUSubtarget::R700:
       if (hasPush)
         StackSubEntry += 2;
       break;
-    case AMDGPUDeviceInfo::HD5XXX:
+    case AMDGPUSubtarget::EVERGREEN:
       if (hasPush)
         StackSubEntry ++;
-    case AMDGPUDeviceInfo::HD6XXX:
+    case AMDGPUSubtarget::NORTHERN_ISLANDS:
       StackSubEntry += 2;
       break;
+    default: llvm_unreachable("Not a VLIW4/VLIW5 GPU");
     }
     return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
   }
 
 public:
   R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
-    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
-    TRI(TII->getRegisterInfo()),
+    TII (0), TRI(0),
     ST(tm.getSubtarget<AMDGPUSubtarget>()) {
       const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
-      if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
-        MaxFetchInst = 8;
-      else
-        MaxFetchInst = 16;
+      MaxFetchInst = ST.getTexVTXClauseSize();
   }
 
   virtual bool runOnMachineFunction(MachineFunction &MF) {
+    TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+    TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+
     unsigned MaxStack = 0;
     unsigned CurrentStack = 0;
     bool HasPush = false;
@@ -340,6 +347,9 @@ public:
         MaxStack = 1;
       }
       std::vector<ClauseFile> FetchClauses, AluClauses;
+      std::vector<MachineInstr *> LastAlu(1);
+      std::vector<MachineInstr *> ToPopAfter;
+      
       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
         if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
@@ -350,6 +360,10 @@ public:
         }
 
         MachineBasicBlock::iterator MI = I;
+        if (MI->getOpcode() != AMDGPU::ENDIF)
+          LastAlu.back() = 0;
+        if (MI->getOpcode() == AMDGPU::CF_ALU)
+          LastAlu.back() = MI;
         I++;
         switch (MI->getOpcode()) {
         case AMDGPU::CF_ALU_PUSH_BEFORE:
@@ -359,12 +373,6 @@ public:
         case AMDGPU::CF_ALU:
           I = MI;
           AluClauses.push_back(MakeALUClause(MBB, I));
-        case AMDGPU::EG_ExportBuf:
-        case AMDGPU::EG_ExportSwz:
-        case AMDGPU::R600_ExportBuf:
-        case AMDGPU::R600_ExportSwz:
-        case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
-        case AMDGPU::RAT_WRITE_CACHELESS_128_eg:
           DEBUG(dbgs() << CfCount << ":"; MI->dump(););
           CfCount++;
           break;
@@ -395,6 +403,7 @@ public:
           break;
         }
         case AMDGPU::IF_PREDICATE_SET: {
+          LastAlu.push_back(0);
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_JUMP))
               .addImm(0)
@@ -412,7 +421,7 @@ public:
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_ELSE))
               .addImm(0)
-              .addImm(1);
+              .addImm(0);
           DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
           IfThenElseStack.push_back(MIb);
           MI->eraseFromParent();
@@ -421,31 +430,31 @@ public:
         }
         case AMDGPU::ENDIF: {
           CurrentStack--;
+          if (LastAlu.back()) {
+            ToPopAfter.push_back(LastAlu.back());
+          } else {
+            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
+                getHWInstrDesc(CF_POP))
+                .addImm(CfCount + 1)
+                .addImm(1);
+            (void)MIb;
+            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+            CfCount++;
+          }
+          
           MachineInstr *IfOrElseInst = IfThenElseStack.back();
           IfThenElseStack.pop_back();
-          CounterPropagateAddr(IfOrElseInst, CfCount + 1);
-          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
-              getHWInstrDesc(CF_POP))
-              .addImm(CfCount + 1)
-              .addImm(1);
-          (void)MIb;
-          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
+          CounterPropagateAddr(IfOrElseInst, CfCount);
+          IfOrElseInst->getOperand(1).setImm(1);
+          LastAlu.pop_back();
           MI->eraseFromParent();
-          CfCount++;
           break;
         }
-        case AMDGPU::PREDICATED_BREAK: {
-          CurrentStack--;
-          CfCount += 3;
-          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_JUMP))
-              .addImm(CfCount)
-              .addImm(1);
+        case AMDGPU::BREAK: {
+          CfCount ++;
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_LOOP_BREAK))
               .addImm(0);
-          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_POP))
-              .addImm(CfCount)
-              .addImm(1);
           LoopStack.back().second.insert(MIb);
           MI->eraseFromParent();
           break;
@@ -473,9 +482,28 @@ public:
             EmitALUClause(I, AluClauses[i], CfCount);
         }
         default:
+          if (TII->isExport(MI->getOpcode())) {
+            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
+            CfCount++;
+          }
           break;
         }
       }
+      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
+        MachineInstr *Alu = ToPopAfter[i];
+        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
+            TII->get(AMDGPU::CF_ALU_POP_AFTER))
+            .addImm(Alu->getOperand(0).getImm())
+            .addImm(Alu->getOperand(1).getImm())
+            .addImm(Alu->getOperand(2).getImm())
+            .addImm(Alu->getOperand(3).getImm())
+            .addImm(Alu->getOperand(4).getImm())
+            .addImm(Alu->getOperand(5).getImm())
+            .addImm(Alu->getOperand(6).getImm())
+            .addImm(Alu->getOperand(7).getImm())
+            .addImm(Alu->getOperand(8).getImm());
+        Alu->eraseFromParent();
+      }
       MFI->StackSize = getHWStackSize(MaxStack, HasPush);
     }
 
@@ -489,7 +517,7 @@ public:
 
 char R600ControlFlowFinalizer::ID = 0;
 
-}
+} // end anonymous namespace
 
 
 llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 36bfb18..1781f2a 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -41,7 +41,12 @@ namespace R600_InstFlag {
     OP1 = (1 << 10),
     OP2 = (1 << 11),
     VTX_INST  = (1 << 12),
-    TEX_INST = (1 << 13)
+    TEX_INST = (1 << 13),
+    ALU_INST = (1 << 14),
+    LDS_1A = (1 << 15),
+    LDS_1A1D = (1 << 16),
+    IS_EXPORT = (1 << 17),
+    LDS_1A2D = (1 << 18)
   };
 }
 
@@ -57,47 +62,82 @@ namespace R600_InstFlag {
 #define IS_VTX(desc) ((desc).TSFlags & R600_InstFlag::VTX_INST)
 #define IS_TEX(desc) ((desc).TSFlags & R600_InstFlag::TEX_INST)
 
-namespace R600Operands {
-  enum Ops {
-    DST,
-    UPDATE_EXEC_MASK,
-    UPDATE_PREDICATE,
-    WRITE,
-    OMOD,
-    DST_REL,
-    CLAMP,
-    SRC0,
-    SRC0_NEG,
-    SRC0_REL,
-    SRC0_ABS,
-    SRC0_SEL,
-    SRC1,
-    SRC1_NEG,
-    SRC1_REL,
-    SRC1_ABS,
-    SRC1_SEL,
-    SRC2,
-    SRC2_NEG,
-    SRC2_REL,
-    SRC2_SEL,
-    LAST,
-    PRED_SEL,
-    IMM,
-    BANK_SWIZZLE,
-    COUNT
+namespace OpName {
+
+  enum VecOps {
+    UPDATE_EXEC_MASK_X,
+    UPDATE_PREDICATE_X,
+    WRITE_X,
+    OMOD_X,
+    DST_REL_X,
+    CLAMP_X,
+    SRC0_X,
+    SRC0_NEG_X,
+    SRC0_REL_X,
+    SRC0_ABS_X,
+    SRC0_SEL_X,
+    SRC1_X,
+    SRC1_NEG_X,
+    SRC1_REL_X,
+    SRC1_ABS_X,
+    SRC1_SEL_X,
+    PRED_SEL_X,
+    UPDATE_EXEC_MASK_Y,
+    UPDATE_PREDICATE_Y,
+    WRITE_Y,
+    OMOD_Y,
+    DST_REL_Y,
+    CLAMP_Y,
+    SRC0_Y,
+    SRC0_NEG_Y,
+    SRC0_REL_Y,
+    SRC0_ABS_Y,
+    SRC0_SEL_Y,
+    SRC1_Y,
+    SRC1_NEG_Y,
+    SRC1_REL_Y,
+    SRC1_ABS_Y,
+    SRC1_SEL_Y,
+    PRED_SEL_Y,
+    UPDATE_EXEC_MASK_Z,
+    UPDATE_PREDICATE_Z,
+    WRITE_Z,
+    OMOD_Z,
+    DST_REL_Z,
+    CLAMP_Z,
+    SRC0_Z,
+    SRC0_NEG_Z,
+    SRC0_REL_Z,
+    SRC0_ABS_Z,
+    SRC0_SEL_Z,
+    SRC1_Z,
+    SRC1_NEG_Z,
+    SRC1_REL_Z,
+    SRC1_ABS_Z,
+    SRC1_SEL_Z,
+    PRED_SEL_Z,
+    UPDATE_EXEC_MASK_W,
+    UPDATE_PREDICATE_W,
+    WRITE_W,
+    OMOD_W,
+    DST_REL_W,
+    CLAMP_W,
+    SRC0_W,
+    SRC0_NEG_W,
+    SRC0_REL_W,
+    SRC0_ABS_W,
+    SRC0_SEL_W,
+    SRC1_W,
+    SRC1_NEG_W,
+    SRC1_REL_W,
+    SRC1_ABS_W,
+    SRC1_SEL_W,
+    PRED_SEL_W,
+    IMM_0,
+    IMM_1,
+    VEC_COUNT
  };
 
-  const static int ALUOpTable[3][R600Operands::COUNT] = {
-//            W        C     S  S  S  S     S  S  S  S     S  S  S
-//            R  O  D  L  S  R  R  R  R  S  R  R  R  R  S  R  R  R  L  P
-//   D  U     I  M  R  A  R  C  C  C  C  R  C  C  C  C  R  C  C  C  A  R  I
-//   S  E  U  T  O  E  M  C  0  0  0  0  C  1  1  1  1  C  2  2  2  S  E  M  B
-//   T  M  P  E  D  L  P  0  N  R  A  S  1  N  R  A  S  2  N  R  S  T  D  M  S
-    {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1,-1,-1,-1,-1,-1,-1,-1,10,11,12,13},
-    {0, 1, 2, 3, 4 ,5 ,6 ,7, 8, 9,10,11,12,13,14,15,16,-1,-1,-1,-1,17,18,19,20},
-    {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18}
-  };
-
 }
 
 //===----------------------------------------------------------------------===//
@@ -126,4 +166,6 @@ namespace R600Operands {
 #define R_028878_SQ_PGM_RESOURCES_GS                 0x028878
 #define R_0288D4_SQ_PGM_RESOURCES_LS                 0x0288d4
 
+#define R_0288E8_SQ_LDS_ALLOC                        0x0288E8
+
 #endif // R600DEFINES_H_
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 3fdc678..1bbfd2b 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -23,21 +23,23 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
-namespace llvm {
+using namespace llvm;
+
+namespace {
 
 class R600EmitClauseMarkersPass : public MachineFunctionPass {
 
 private:
   static char ID;
   const R600InstrInfo *TII;
+  int Address;
 
   unsigned OccupiedDwords(MachineInstr *MI) const {
     switch (MI->getOpcode()) {
     case AMDGPU::INTERP_PAIR_XY:
     case AMDGPU::INTERP_PAIR_ZW:
     case AMDGPU::INTERP_VEC_LOAD:
-    case AMDGPU::DOT4_eg_pseudo:
-    case AMDGPU::DOT4_r600_pseudo:
+    case AMDGPU::DOT_4:
       return 4;
     case AMDGPU::KILL:
       return 0;
@@ -45,6 +47,11 @@ private:
       break;
     }
 
+    // These will be expanded to two ALU instructions in the
+    // ExpandSpecialInstructions pass.
+    if (TII->isLDSRetInstr(MI->getOpcode()))
+      return 2;
+
     if(TII->isVector(*MI) ||
         TII->isCubeOp(MI->getOpcode()) ||
         TII->isReductionOp(MI->getOpcode()))
@@ -71,8 +78,7 @@ private:
     case AMDGPU::INTERP_PAIR_ZW:
     case AMDGPU::INTERP_VEC_LOAD:
     case AMDGPU::COPY:
-    case AMDGPU::DOT4_eg_pseudo:
-    case AMDGPU::DOT4_r600_pseudo:
+    case AMDGPU::DOT_4:
       return true;
     default:
       return false;
@@ -83,37 +89,13 @@ private:
     switch (MI->getOpcode()) {
     case AMDGPU::KILL:
     case AMDGPU::RETURN:
+    case AMDGPU::IMPLICIT_DEF:
       return true;
     default:
       return false;
     }
   }
 
-  // Register Idx, then Const value
-  std::vector<std::pair<unsigned, unsigned> > ExtractConstRead(MachineInstr *MI)
-      const {
-    const R600Operands::Ops OpTable[3][2] = {
-      {R600Operands::SRC0, R600Operands::SRC0_SEL},
-      {R600Operands::SRC1, R600Operands::SRC1_SEL},
-      {R600Operands::SRC2, R600Operands::SRC2_SEL},
-    };
-    std::vector<std::pair<unsigned, unsigned> > Result;
-
-    if (!TII->isALUInstr(MI->getOpcode()))
-      return Result;
-    for (unsigned j = 0; j < 3; j++) {
-      int SrcIdx = TII->getOperandIdx(MI->getOpcode(), OpTable[j][0]);
-      if (SrcIdx < 0)
-        break;
-      if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
-        unsigned Const = MI->getOperand(
-            TII->getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
-        Result.push_back(std::pair<unsigned, unsigned>(SrcIdx, Const));
-      }
-    }
-    return Result;
-  }
-
   std::pair<unsigned, unsigned> getAccessedBankLine(unsigned Sel) const {
     // Sel is (512 + (kc_bank << 12) + ConstIndex) << 2
     // (See also R600ISelLowering.cpp)
@@ -129,11 +111,20 @@ private:
   }
 
   bool SubstituteKCacheBank(MachineInstr *MI,
-      std::vector<std::pair<unsigned, unsigned> > &CachedConsts) const {
+      std::vector<std::pair<unsigned, unsigned> > &CachedConsts,
+      bool UpdateInstr = true) const {
     std::vector<std::pair<unsigned, unsigned> > UsedKCache;
-    std::vector<std::pair<unsigned, unsigned> > Consts = ExtractConstRead(MI);
-    assert(TII->isALUInstr(MI->getOpcode()) && "Can't assign Const");
+
+    if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
+      return true;
+
+    const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
+        TII->getSrcs(MI);
+    assert((TII->isALUInstr(MI->getOpcode()) ||
+        MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
     for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
+      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+        continue;
       unsigned Sel = Consts[i].second;
       unsigned Chan = Sel & 3, Index = ((Sel >> 2) - 512) & 31;
       unsigned KCacheIndex = Index * 4 + Chan;
@@ -159,25 +150,77 @@ private:
       return false;
     }
 
-    for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
-      switch(UsedKCache[i].first) {
+    if (!UpdateInstr)
+      return true;
+
+    for (unsigned i = 0, j = 0, n = Consts.size(); i < n; ++i) {
+      if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
+        continue;
+      switch(UsedKCache[j].first) {
       case 0:
-        MI->getOperand(Consts[i].first).setReg(
-            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[i].second));
+        Consts[i].first->setReg(
+            AMDGPU::R600_KC0RegClass.getRegister(UsedKCache[j].second));
         break;
       case 1:
-        MI->getOperand(Consts[i].first).setReg(
-            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[i].second));
+        Consts[i].first->setReg(
+            AMDGPU::R600_KC1RegClass.getRegister(UsedKCache[j].second));
         break;
       default:
         llvm_unreachable("Wrong Cache Line");
       }
+      j++;
+    }
+    return true;
+  }
+
+  bool canClauseLocalKillFitInClause(
+                        unsigned AluInstCount,
+                        std::vector<std::pair<unsigned, unsigned> > KCacheBanks,
+                        MachineBasicBlock::iterator Def,
+                        MachineBasicBlock::iterator BBEnd) {
+    const R600RegisterInfo &TRI = TII->getRegisterInfo();
+    for (MachineInstr::const_mop_iterator
+           MOI = Def->operands_begin(),
+           MOE = Def->operands_end(); MOI != MOE; ++MOI) {
+      if (!MOI->isReg() || !MOI->isDef() ||
+          TRI.isPhysRegLiveAcrossClauses(MOI->getReg()))
+        continue;
+
+      // Def defines a clause local register, so check that its use will fit
+      // in the clause.
+      unsigned LastUseCount = 0;
+      for (MachineBasicBlock::iterator UseI = Def; UseI != BBEnd; ++UseI) {
+        AluInstCount += OccupiedDwords(UseI);
+        // Make sure we won't need to end the clause due to KCache limitations.
+        if (!SubstituteKCacheBank(UseI, KCacheBanks, false))
+          return false;
+
+        // We have reached the maximum instruction limit before finding the
+        // use that kills this register, so we cannot use this def in the
+        // current clause.
+        if (AluInstCount >= TII->getMaxAlusPerClause())
+          return false;
+
+        // Register kill flags have been cleared by the time we get to this
+        // pass, but it is safe to assume that all uses of this register
+        // occur in the same basic block as its definition, because
+        // it is illegal for the scheduler to schedule them in
+        // different blocks.
+        if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+          LastUseCount = AluInstCount;
+
+        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+          break;
+      }
+      if (LastUseCount)
+        return LastUseCount <= TII->getMaxAlusPerClause();
+      llvm_unreachable("Clause local register live at end of clause.");
     }
     return true;
   }
 
   MachineBasicBlock::iterator
-  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
     MachineBasicBlock::iterator ClauseHead = I;
     std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
     bool PushBeforeModifier = false;
@@ -190,39 +233,66 @@ private:
       if (AluInstCount > TII->getMaxAlusPerClause())
         break;
       if (I->getOpcode() == AMDGPU::PRED_X) {
+        // We put PRED_X in its own clause to ensure that ifcvt won't create
+        // clauses with more than 128 insts.
+        // IfCvt is indeed checking that "then" and "else" branches of an if
+        // statement have less than ~60 insts thus converted clauses can't be
+        // bigger than ~121 insts (predicate setter needs to be in the same
+        // clause as predicated alus).
+        if (AluInstCount > 0)
+          break;
         if (TII->getFlagOp(I).getImm() & MO_FLAG_PUSH)
           PushBeforeModifier = true;
         AluInstCount ++;
         continue;
       }
-      if (I->getOpcode() == AMDGPU::KILLGT) {
+      // XXX: GROUP_BARRIER instructions cannot be in the same ALU clause as:
+      //
+      // * KILL or INTERP instructions
+      // * Any instruction that sets UPDATE_EXEC_MASK or UPDATE_PRED bits
+      // * Uses waterfalling (i.e. INDEX_MODE = AR.X)
+      //
+      // XXX: These checks have not been implemented yet.
+      if (TII->mustBeLastInClause(I->getOpcode())) {
         I++;
         break;
       }
-      if (TII->isALUInstr(I->getOpcode()) &&
-          !SubstituteKCacheBank(I, KCacheBanks))
+
+      // If this instruction defines a clause local register, make sure
+      // its use can fit in this clause.
+      if (!canClauseLocalKillFitInClause(AluInstCount, KCacheBanks, I, E))
+        break;
+
+      if (!SubstituteKCacheBank(I, KCacheBanks))
         break;
       AluInstCount += OccupiedDwords(I);
     }
     unsigned Opcode = PushBeforeModifier ?
         AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
     BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
-        .addImm(0) // ADDR
+    // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
+    // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
+    // pass may assume that identical ALU clause starter at the beginning of a 
+    // true and false branch can be factorized which is not the case.
+        .addImm(Address++) // ADDR
         .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
         .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
         .addImm(KCacheBanks.empty()?0:2) // KM0
         .addImm((KCacheBanks.size() < 2)?0:2) // KM1
         .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
         .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
-        .addImm(AluInstCount); // COUNT
+        .addImm(AluInstCount) // COUNT
+        .addImm(1); // Enabled
     return I;
   }
 
 public:
   R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID),
-    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
+    TII(0), Address(0) { }
 
   virtual bool runOnMachineFunction(MachineFunction &MF) {
+    TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
+
     for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                     BB != BB_E; ++BB) {
       MachineBasicBlock &MBB = *BB;
@@ -246,7 +316,7 @@ public:
 
 char R600EmitClauseMarkersPass::ID = 0;
 
-}
+} // end anonymous namespace
 
 
 llvm::FunctionPass *llvm::createR600EmitClauseMarkers(TargetMachine &TM) {
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index f8c900f..aeee4aa 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -38,7 +38,7 @@ private:
 
 public:
   R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
-    TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { }
+    TII(0) { }
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -56,6 +56,7 @@ FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) {
 }
 
 bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
 
   const R600RegisterInfo &TRI = TII->getRegisterInfo();
 
@@ -67,6 +68,23 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
       MachineInstr &MI = *I;
       I = llvm::next(I);
 
+      // Expand LDS_*_RET instructions
+      if (TII->isLDSRetInstr(MI.getOpcode())) {
+        int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
+        assert(DstIdx != -1);
+        MachineOperand &DstOp = MI.getOperand(DstIdx);
+        MachineInstr *Mov = TII->buildMovInstr(&MBB, I,
+                                               DstOp.getReg(), AMDGPU::OQAP);
+        DstOp.setReg(AMDGPU::OQAP);
+        int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(),
+                                           AMDGPU::OpName::pred_sel);
+        // Copy the pred_sel bit
+        Mov->getOperand(MovPredSelIdx).setReg(
+            MI.getOperand(LDSPredSelIdx).getReg());
+      }
+
       switch (MI.getOpcode()) {
       default: break;
       // Expand PRED_X to one of the PRED_SET instructions.
@@ -81,28 +99,13 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
                                             AMDGPU::ZERO);             // src1
         TII->addFlag(PredSet, 0, MO_FLAG_MASK);
         if (Flags & MO_FLAG_PUSH) {
-          TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
+          TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1);
         } else {
-          TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1);
+          TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1);
         }
         MI.eraseFromParent();
         continue;
         }
-      case AMDGPU::BREAK: {
-        MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I,
-                                          AMDGPU::PRED_SETE_INT,
-                                          AMDGPU::PREDICATE_BIT,
-                                          AMDGPU::ZERO,
-                                          AMDGPU::ZERO);
-        TII->addFlag(PredSet, 0, MO_FLAG_MASK);
-        TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1);
-
-        BuildMI(MBB, I, MBB.findDebugLoc(I),
-                TII->get(AMDGPU::PREDICATED_BREAK))
-                .addReg(AMDGPU::PREDICATE_BIT);
-        MI.eraseFromParent();
-        continue;
-        }
 
       case AMDGPU::INTERP_PAIR_XY: {
         MachineInstr *BMI;
@@ -182,6 +185,45 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
         MI.eraseFromParent();
         continue;
         }
+      case AMDGPU::DOT_4: {
+
+        const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+        unsigned DstReg = MI.getOperand(0).getReg();
+        unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+
+        for (unsigned Chan = 0; Chan < 4; ++Chan) {
+          bool Mask = (Chan != TRI.getHWRegChan(DstReg));
+          unsigned SubDstReg =
+              AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+          MachineInstr *BMI =
+              TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
+          if (Chan > 0) {
+            BMI->bundleWithPred();
+          }
+          if (Mask) {
+            TII->addFlag(BMI, 0, MO_FLAG_MASK);
+          }
+          if (Chan != 3)
+            TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+          unsigned Opcode = BMI->getOpcode();
+          // While not strictly necessary from hw point of view, we force
+          // all src operands of a dot4 inst to belong to the same slot.
+          unsigned Src0 = BMI->getOperand(
+              TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+              .getReg();
+          unsigned Src1 = BMI->getOperand(
+              TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+              .getReg();
+          (void) Src0;
+          (void) Src1;
+          if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
+              (TRI.getEncodingValue(Src1) & 0xff) < 127)
+            assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
+        }
+        MI.eraseFromParent();
+        continue;
+      }
       }
 
       bool IsReduction = TII->isReductionOp(MI.getOpcode());
@@ -218,14 +260,14 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
       // T0_W = CUBE T1_Y, T1_Z
       for (unsigned Chan = 0; Chan < 4; Chan++) {
         unsigned DstReg = MI.getOperand(
-                            TII->getOperandIdx(MI, R600Operands::DST)).getReg();
+                            TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg();
         unsigned Src0 = MI.getOperand(
-                           TII->getOperandIdx(MI, R600Operands::SRC0)).getReg();
+                           TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg();
         unsigned Src1 = 0;
 
         // Determine the correct source registers
         if (!IsCube) {
-          int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1);
+          int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1);
           if (Src1Idx != -1) {
             Src1 = MI.getOperand(Src1Idx).getReg();
           }
@@ -268,12 +310,6 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
         case AMDGPU::CUBE_eg_pseudo:
           Opcode = AMDGPU::CUBE_eg_real;
           break;
-        case AMDGPU::DOT4_r600_pseudo:
-          Opcode = AMDGPU::DOT4_r600_real;
-          break;
-        case AMDGPU::DOT4_eg_pseudo:
-          Opcode = AMDGPU::DOT4_eg_real;
-          break;
         default:
           break;
         }
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 7252235..0fcb488 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -16,6 +16,7 @@
 #include "R600Defines.h"
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -27,41 +28,40 @@ using namespace llvm;
 
 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
     AMDGPUTargetLowering(TM),
-    TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) {
+    Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
   addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
   addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
   addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
+
   computeRegisterProperties();
 
-  setOperationAction(ISD::FADD, MVT::v4f32, Expand);
-  setOperationAction(ISD::FMUL, MVT::v4f32, Expand);
-  setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
-  setOperationAction(ISD::FSUB, MVT::v4f32, Expand);
-
-  setOperationAction(ISD::ADD,  MVT::v4i32, Expand);
-  setOperationAction(ISD::AND,  MVT::v4i32, Expand);
-  setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
-  setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
-  setOperationAction(ISD::MUL,  MVT::v2i32, Expand);
-  setOperationAction(ISD::MUL,  MVT::v4i32, Expand);
-  setOperationAction(ISD::OR, MVT::v4i32, Expand);
-  setOperationAction(ISD::OR, MVT::v2i32, Expand);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
-  setOperationAction(ISD::SHL, MVT::v4i32, Expand);
-  setOperationAction(ISD::SHL, MVT::v2i32, Expand);
-  setOperationAction(ISD::SRL, MVT::v4i32, Expand);
-  setOperationAction(ISD::SRL, MVT::v2i32, Expand);
-  setOperationAction(ISD::SRA, MVT::v4i32, Expand);
-  setOperationAction(ISD::SRA, MVT::v2i32, Expand);
-  setOperationAction(ISD::SUB, MVT::v4i32, Expand);
-  setOperationAction(ISD::SUB, MVT::v2i32, Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
-  setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
-  setOperationAction(ISD::UREM, MVT::v4i32, Expand);
+  // Set condition code actions
+  setCondCodeAction(ISD::SETO,   MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUO,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLT,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETLE,  MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+
+  setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
+
+  setOperationAction(ISD::FCOS, MVT::f32, Custom);
+  setOperationAction(ISD::FSIN, MVT::f32, Custom);
+
   setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
-  setOperationAction(ISD::XOR, MVT::v4i32, Expand);
-  setOperationAction(ISD::XOR, MVT::v2i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
 
   setOperationAction(ISD::BR_CC, MVT::i32, Expand);
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
@@ -72,8 +72,6 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
 
-  setOperationAction(ISD::ROTL, MVT::i32, Custom);
-
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
@@ -81,24 +79,33 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
 
-  setOperationAction(ISD::SELECT, MVT::i32, Custom);
-  setOperationAction(ISD::SELECT, MVT::f32, Custom);
-
-  setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
-  setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
 
   // Legalize loads and stores to the private address space.
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
-  setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+
+  // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
+  // spaces, so it is custom lowered to handle those where it isn't.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
+
   setOperationAction(ISD::STORE, MVT::i8, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i8, Custom);
+  setTruncStoreAction(MVT::i32, MVT::i16, Custom);
 
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
@@ -108,10 +115,13 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::FP_TO_SINT);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::SELECT_CC);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
-  setSchedulingPreference(Sched::VLIW);
+  setSchedulingPreference(Sched::Source);
 }
 
 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
@@ -119,9 +129,29 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   MachineFunction * MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock::iterator I = *MI;
+  const R600InstrInfo *TII =
+    static_cast<const R600InstrInfo*>(MF->getTarget().getInstrInfo());
 
   switch (MI->getOpcode()) {
-  default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+  default:
+    // Replace LDS_*_RET instruction that don't have any uses with the
+    // equivalent LDS_*_NORET instruction.
+    if (TII->isLDSRetInstr(MI->getOpcode())) {
+      int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+      assert(DstIdx != -1);
+      MachineInstrBuilder NewMI;
+      if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()))
+        return BB;
+
+      NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
+                      TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
+      for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
+        NewMI.addOperand(MI->getOperand(i));
+      }
+    } else {
+      return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
+    }
+    break;
   case AMDGPU::CLAMP_R600: {
     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
                                                    AMDGPU::MOV,
@@ -169,12 +199,13 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::CONST_COPY: {
     MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
         MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
-    TII->setImmOperand(NewMI, R600Operands::SRC0_SEL,
+    TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
         MI->getOperand(1).getImm());
     break;
   }
 
   case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+  case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
   case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
     unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
 
@@ -188,23 +219,99 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::TXD: {
     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
-
+    MachineOperand &RID = MI->getOperand(4);
+    MachineOperand &SID = MI->getOperand(5);
+    unsigned TextureId = MI->getOperand(6).getImm();
+    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+    switch (TextureId) {
+    case 5: // Rect
+      CTX = CTY = 0;
+      break;
+    case 6: // Shadow1D
+      SrcW = SrcZ;
+      break;
+    case 7: // Shadow2D
+      SrcW = SrcZ;
+      break;
+    case 8: // ShadowRect
+      CTX = CTY = 0;
+      SrcW = SrcZ;
+      break;
+    case 9: // 1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 10: // 2DArray
+      CTZ = 0;
+      break;
+    case 11: // Shadow1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 12: // Shadow2DArray
+      CTZ = 0;
+      break;
+    }
     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
             .addOperand(MI->getOperand(3))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
             .addOperand(MI->getOperand(2))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
             .addOperand(MI->getOperand(0))
             .addOperand(MI->getOperand(1))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW)
             .addReg(T0, RegState::Implicit)
             .addReg(T1, RegState::Implicit);
     break;
@@ -213,23 +320,100 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::TXD_SHADOW: {
     unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
     unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    MachineOperand &RID = MI->getOperand(4);
+    MachineOperand &SID = MI->getOperand(5);
+    unsigned TextureId = MI->getOperand(6).getImm();
+    unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
+    unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
+
+    switch (TextureId) {
+    case 5: // Rect
+      CTX = CTY = 0;
+      break;
+    case 6: // Shadow1D
+      SrcW = SrcZ;
+      break;
+    case 7: // Shadow2D
+      SrcW = SrcZ;
+      break;
+    case 8: // ShadowRect
+      CTX = CTY = 0;
+      SrcW = SrcZ;
+      break;
+    case 9: // 1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 10: // 2DArray
+      CTZ = 0;
+      break;
+    case 11: // Shadow1DArray
+      SrcZ = SrcY;
+      CTZ = 0;
+      break;
+    case 12: // Shadow2DArray
+      CTZ = 0;
+      break;
+    }
 
     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
             .addOperand(MI->getOperand(3))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
             .addOperand(MI->getOperand(2))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6));
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW);
     BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
             .addOperand(MI->getOperand(0))
             .addOperand(MI->getOperand(1))
-            .addOperand(MI->getOperand(4))
-            .addOperand(MI->getOperand(5))
-            .addOperand(MI->getOperand(6))
+            .addImm(SrcX)
+            .addImm(SrcY)
+            .addImm(SrcZ)
+            .addImm(SrcW)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(0)
+            .addImm(1)
+            .addImm(2)
+            .addImm(3)
+            .addOperand(RID)
+            .addOperand(SID)
+            .addImm(CTX)
+            .addImm(CTY)
+            .addImm(CTZ)
+            .addImm(CTW)
             .addReg(T0, RegState::Implicit)
             .addReg(T1, RegState::Implicit);
     break;
@@ -321,30 +505,27 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
 
-using namespace llvm::Intrinsic;
-using namespace llvm::AMDGPUIntrinsic;
-
 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
-  case ISD::ROTL: return LowerROTL(Op, DAG);
+  case ISD::FCOS:
+  case ISD::FSIN: return LowerTrig(Op, DAG);
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SELECT: return LowerSELECT(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
   case ISD::LOAD: return LowerLOAD(Op, DAG);
-  case ISD::FrameIndex: return LowerFrameIndex(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
   case ISD::INTRINSIC_VOID: {
     SDValue Chain = Op.getOperand(0);
     unsigned IntrinsicID =
                          cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
     switch (IntrinsicID) {
     case AMDGPUIntrinsic::AMDGPU_store_output: {
-      MachineFunction &MF = DAG.getMachineFunction();
-      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
       MFI->LiveOuts.push_back(Reg);
-      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
+      return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
     }
     case AMDGPUIntrinsic::R600_store_swizzle: {
       const SDValue Args[8] = {
@@ -357,7 +538,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
         DAG.getConstant(2, MVT::i32), // SWZ_Z
         DAG.getConstant(3, MVT::i32) // SWZ_W
       };
-      return DAG.getNode(AMDGPUISD::EXPORT, Op.getDebugLoc(), Op.getValueType(),
+      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
           Args, 8);
     }
 
@@ -371,13 +552,17 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
     unsigned IntrinsicID =
                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     EVT VT = Op.getValueType();
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
     switch(IntrinsicID) {
     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     case AMDGPUIntrinsic::R600_load_input: {
       int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
-      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      MRI.addLiveIn(Reg);
+      return DAG.getCopyFromReg(DAG.getEntryNode(),
+          SDLoc(DAG.getEntryNode()), Reg, VT);
     }
 
     case AMDGPUIntrinsic::R600_interp_input: {
@@ -385,66 +570,184 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
       int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
       MachineSDNode *interp;
       if (ijb < 0) {
+        const MachineFunction &MF = DAG.getMachineFunction();
+        const R600InstrInfo *TII =
+          static_cast<const R600InstrInfo*>(MF.getTarget().getInstrInfo());
         interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
             MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
         return DAG.getTargetExtractSubreg(
             TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
             DL, MVT::f32, SDValue(interp, 0));
       }
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
+      unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
+      MRI.addLiveIn(RegisterI);
+      MRI.addLiveIn(RegisterJ);
+      SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
+          SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
+      SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
+          SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
 
       if (slot % 4 < 2)
         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
-            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
-                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
-            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
-                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
+            RegisterJNode, RegisterINode);
       else
         interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
             MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
-            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
-                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1), MVT::f32),
-            CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
-                AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb), MVT::f32));
-
+            RegisterJNode, RegisterINode);
       return SDValue(interp, slot % 2);
     }
+    case AMDGPUIntrinsic::R600_interp_xy:
+    case AMDGPUIntrinsic::R600_interp_zw: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      MachineSDNode *interp;
+      SDValue RegisterINode = Op.getOperand(2);
+      SDValue RegisterJNode = Op.getOperand(3);
+
+      if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+            RegisterJNode, RegisterINode);
+      else
+        interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
+            MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
+            RegisterJNode, RegisterINode);
+      return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
+          SDValue(interp, 0), SDValue(interp, 1));
+    }
+    case AMDGPUIntrinsic::R600_tex:
+    case AMDGPUIntrinsic::R600_texc:
+    case AMDGPUIntrinsic::R600_txl:
+    case AMDGPUIntrinsic::R600_txlc:
+    case AMDGPUIntrinsic::R600_txb:
+    case AMDGPUIntrinsic::R600_txbc:
+    case AMDGPUIntrinsic::R600_txf:
+    case AMDGPUIntrinsic::R600_txq:
+    case AMDGPUIntrinsic::R600_ddx:
+    case AMDGPUIntrinsic::R600_ddy:
+    case AMDGPUIntrinsic::R600_ldptr: {
+      unsigned TextureOp;
+      switch (IntrinsicID) {
+      case AMDGPUIntrinsic::R600_tex:
+        TextureOp = 0;
+        break;
+      case AMDGPUIntrinsic::R600_texc:
+        TextureOp = 1;
+        break;
+      case AMDGPUIntrinsic::R600_txl:
+        TextureOp = 2;
+        break;
+      case AMDGPUIntrinsic::R600_txlc:
+        TextureOp = 3;
+        break;
+      case AMDGPUIntrinsic::R600_txb:
+        TextureOp = 4;
+        break;
+      case AMDGPUIntrinsic::R600_txbc:
+        TextureOp = 5;
+        break;
+      case AMDGPUIntrinsic::R600_txf:
+        TextureOp = 6;
+        break;
+      case AMDGPUIntrinsic::R600_txq:
+        TextureOp = 7;
+        break;
+      case AMDGPUIntrinsic::R600_ddx:
+        TextureOp = 8;
+        break;
+      case AMDGPUIntrinsic::R600_ddy:
+        TextureOp = 9;
+        break;
+      case AMDGPUIntrinsic::R600_ldptr:
+        TextureOp = 10;
+        break;
+      default:
+        llvm_unreachable("Unknow Texture Operation");
+      }
+
+      SDValue TexArgs[19] = {
+        DAG.getConstant(TextureOp, MVT::i32),
+        Op.getOperand(1),
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(1, MVT::i32),
+        DAG.getConstant(2, MVT::i32),
+        DAG.getConstant(3, MVT::i32),
+        Op.getOperand(2),
+        Op.getOperand(3),
+        Op.getOperand(4),
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(1, MVT::i32),
+        DAG.getConstant(2, MVT::i32),
+        DAG.getConstant(3, MVT::i32),
+        Op.getOperand(5),
+        Op.getOperand(6),
+        Op.getOperand(7),
+        Op.getOperand(8),
+        Op.getOperand(9),
+        Op.getOperand(10)
+      };
+      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
+    }
+    case AMDGPUIntrinsic::AMDGPU_dp4: {
+      SDValue Args[8] = {
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(0, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(0, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(1, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(1, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(2, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(2, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+          DAG.getConstant(3, MVT::i32)),
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+          DAG.getConstant(3, MVT::i32))
+      };
+      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
+    }
 
-    case r600_read_ngroups_x:
+    case Intrinsic::r600_read_ngroups_x:
       return LowerImplicitParameter(DAG, VT, DL, 0);
-    case r600_read_ngroups_y:
+    case Intrinsic::r600_read_ngroups_y:
       return LowerImplicitParameter(DAG, VT, DL, 1);
-    case r600_read_ngroups_z:
+    case Intrinsic::r600_read_ngroups_z:
       return LowerImplicitParameter(DAG, VT, DL, 2);
-    case r600_read_global_size_x:
+    case Intrinsic::r600_read_global_size_x:
       return LowerImplicitParameter(DAG, VT, DL, 3);
-    case r600_read_global_size_y:
+    case Intrinsic::r600_read_global_size_y:
       return LowerImplicitParameter(DAG, VT, DL, 4);
-    case r600_read_global_size_z:
+    case Intrinsic::r600_read_global_size_z:
       return LowerImplicitParameter(DAG, VT, DL, 5);
-    case r600_read_local_size_x:
+    case Intrinsic::r600_read_local_size_x:
       return LowerImplicitParameter(DAG, VT, DL, 6);
-    case r600_read_local_size_y:
+    case Intrinsic::r600_read_local_size_y:
       return LowerImplicitParameter(DAG, VT, DL, 7);
-    case r600_read_local_size_z:
+    case Intrinsic::r600_read_local_size_z:
       return LowerImplicitParameter(DAG, VT, DL, 8);
 
-    case r600_read_tgid_x:
+    case Intrinsic::r600_read_tgid_x:
       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                   AMDGPU::T1_X, VT);
-    case r600_read_tgid_y:
+    case Intrinsic::r600_read_tgid_y:
       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                   AMDGPU::T1_Y, VT);
-    case r600_read_tgid_z:
+    case Intrinsic::r600_read_tgid_z:
       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                   AMDGPU::T1_Z, VT);
-    case r600_read_tidig_x:
+    case Intrinsic::r600_read_tidig_x:
       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                   AMDGPU::T0_X, VT);
-    case r600_read_tidig_y:
+    case Intrinsic::r600_read_tidig_y:
       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                   AMDGPU::T0_Y, VT);
-    case r600_read_tidig_z:
+    case Intrinsic::r600_read_tidig_z:
       return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
                                   AMDGPU::T0_Z, VT);
     }
@@ -478,10 +781,41 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
   }
 }
 
+SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
+  // On hw >= R700, COS/SIN input must be between -1. and 1.
+  // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
+  EVT VT = Op.getValueType();
+  SDValue Arg = Op.getOperand(0);
+  SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
+      DAG.getNode(ISD::FADD, SDLoc(Op), VT,
+        DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
+          DAG.getConstantFP(0.15915494309, MVT::f32)),
+        DAG.getConstantFP(0.5, MVT::f32)));
+  unsigned TrigNode;
+  switch (Op.getOpcode()) {
+  case ISD::FCOS:
+    TrigNode = AMDGPUISD::COS_HW;
+    break;
+  case ISD::FSIN:
+    TrigNode = AMDGPUISD::SIN_HW;
+    break;
+  default:
+    llvm_unreachable("Wrong trig opcode");
+  }
+  SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
+      DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
+        DAG.getConstantFP(-0.5, MVT::f32)));
+  if (Gen >= AMDGPUSubtarget::R700)
+    return TrigVal;
+  // On R600 hw, COS/SIN input must be between -Pi and Pi.
+  return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
+      DAG.getConstantFP(3.14159265359, MVT::f32));
+}
+
 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(
       ISD::SETCC,
-      Op.getDebugLoc(),
+      SDLoc(Op),
       MVT::i1,
       Op, DAG.getConstantFP(0.0f, MVT::f32),
       DAG.getCondCode(ISD::SETNE)
@@ -489,11 +823,11 @@ SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
-                                                   DebugLoc DL,
+                                                   SDLoc DL,
                                                    unsigned DwordOffset) const {
   unsigned ByteOffset = DwordOffset * 4;
   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                      AMDGPUAS::PARAM_I_ADDRESS);
+                                      AMDGPUAS::CONSTANT_BUFFER_0);
 
   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
   assert(isInt<16>(ByteOffset));
@@ -504,32 +838,6 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                      false, false, false, 0);
 }
 
-SDValue R600TargetLowering::LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const {
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  const AMDGPUFrameLowering *TFL =
-   static_cast<const AMDGPUFrameLowering*>(getTargetMachine().getFrameLowering());
-
-  FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op);
-  assert(FIN);
-
-  unsigned FrameIndex = FIN->getIndex();
-  unsigned Offset = TFL->getFrameIndexOffset(MF, FrameIndex);
-  return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), MVT::i32);
-}
-
-SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  EVT VT = Op.getValueType();
-
-  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
-                     Op.getOperand(0),
-                     Op.getOperand(0),
-                     DAG.getNode(ISD::SUB, DL, VT,
-                                 DAG.getConstant(32, MVT::i32),
-                                 Op.getOperand(1)));
-}
-
 bool R600TargetLowering::isZero(SDValue Op) const {
   if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
     return Cst->isNullValue();
@@ -541,7 +849,7 @@ bool R600TargetLowering::isZero(SDValue Op) const {
 }
 
 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
   SDValue LHS = Op.getOperand(0);
@@ -560,16 +868,27 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
   //
   // SET* can match the following patterns:
   //
-  // select_cc f32, f32, -1,  0, cc_any
-  // select_cc f32, f32, 1.0f, 0.0f, cc_any
-  // select_cc i32, i32, -1,  0, cc_any
+  // select_cc f32, f32, -1,  0, cc_supported
+  // select_cc f32, f32, 1.0f, 0.0f, cc_supported
+  // select_cc i32, i32, -1,  0, cc_supported
   //
 
   // Move hardware True/False values to the correct operand.
+  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+  ISD::CondCode InverseCC =
+     ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
   if (isHWTrueValue(False) && isHWFalseValue(True)) {
-    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
-    std::swap(False, True);
-    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+    if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
+      std::swap(False, True);
+      CC = DAG.getCondCode(InverseCC);
+    } else {
+      ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
+      if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
+        std::swap(False, True);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(SwapInvCC);
+      }
+    }
   }
 
   if (isHWTrueValue(True) && isHWFalseValue(False) &&
@@ -582,14 +901,34 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
   //
   // CND* can match the following patterns:
   //
-  // select_cc f32, 0.0, f32, f32, cc_any
-  // select_cc f32, 0.0, i32, i32, cc_any
-  // select_cc i32, 0,   f32, f32, cc_any
-  // select_cc i32, 0,   i32, i32, cc_any
+  // select_cc f32, 0.0, f32, f32, cc_supported
+  // select_cc f32, 0.0, i32, i32, cc_supported
+  // select_cc i32, 0,   f32, f32, cc_supported
+  // select_cc i32, 0,   i32, i32, cc_supported
   //
-  if (isZero(LHS) || isZero(RHS)) {
-    SDValue Cond = (isZero(LHS) ? RHS : LHS);
-    SDValue Zero = (isZero(LHS) ? LHS : RHS);
+
+  // Try to move the zero value to the RHS
+  if (isZero(LHS)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    // Try swapping the operands
+    ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
+    if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(CCSwapped);
+    } else {
+      // Try inverting the conditon and then swapping the operands
+      ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
+      CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
+      if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
+        std::swap(True, False);
+        std::swap(LHS, RHS);
+        CC = DAG.getCondCode(CCSwapped);
+      }
+    }
+  }
+  if (isZero(RHS)) {
+    SDValue Cond = LHS;
+    SDValue Zero = RHS;
     ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
     if (CompareVT != VT) {
       // Bitcast True / False to the correct types.  This will end up being
@@ -599,20 +938,11 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
       True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
       False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
     }
-    if (isZero(LHS)) {
-      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
-    }
 
     switch (CCOpcode) {
     case ISD::SETONE:
     case ISD::SETUNE:
     case ISD::SETNE:
-    case ISD::SETULE:
-    case ISD::SETULT:
-    case ISD::SETOLE:
-    case ISD::SETOLT:
-    case ISD::SETLE:
-    case ISD::SETLT:
       CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
       Temp = True;
       True = False;
@@ -660,17 +990,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
       DAG.getCondCode(ISD::SETNE));
 }
 
-SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
-  return DAG.getNode(ISD::SELECT_CC,
-      Op.getDebugLoc(),
-      Op.getValueType(),
-      Op.getOperand(0),
-      DAG.getConstant(0, MVT::i32),
-      Op.getOperand(1),
-      Op.getOperand(2),
-      DAG.getCondCode(ISD::SETNE));
-}
-
 /// LLVM generates byte-addresed pointers.  For indirect addressing, we need to
 /// convert these pointers to a register index.  Each register holds
 /// 16 bytes, (4 x 32bit sub-register), but we need to take into account the
@@ -693,7 +1012,7 @@ SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
   default: llvm_unreachable("Invalid stack width");
   }
 
-  return DAG.getNode(ISD::SRL, Ptr.getDebugLoc(), Ptr.getValueType(), Ptr,
+  return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
                      DAG.getConstant(SRLPad, MVT::i32));
 }
 
@@ -727,25 +1046,65 @@ void R600TargetLowering::getStackAddress(unsigned StackWidth,
 }
 
 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
   SDValue Chain = Op.getOperand(0);
   SDValue Value = Op.getOperand(1);
   SDValue Ptr = Op.getOperand(2);
 
-  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
-      Ptr->getOpcode() != AMDGPUISD::DWORDADDR) {
-    // Convert pointer from byte address to dword address.
-    Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
-                      DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
-                                  Ptr, DAG.getConstant(2, MVT::i32)));
+  SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Result.getNode()) {
+    return Result;
+  }
 
-    if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
-      assert(!"Truncated and indexed stores not supported yet");
-    } else {
-      Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+  if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
+    if (StoreNode->isTruncatingStore()) {
+      EVT VT = Value.getValueType();
+      assert(VT.bitsLE(MVT::i32));
+      EVT MemVT = StoreNode->getMemoryVT();
+      SDValue MaskConstant;
+      if (MemVT == MVT::i8) {
+        MaskConstant = DAG.getConstant(0xFF, MVT::i32);
+      } else {
+        assert(MemVT == MVT::i16);
+        MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
+      }
+      SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
+                                      DAG.getConstant(2, MVT::i32));
+      SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
+                                      DAG.getConstant(0x00000003, VT));
+      SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
+      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
+                                   DAG.getConstant(3, VT));
+      SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
+      SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
+      // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
+      // vector instead.
+      SDValue Src[4] = {
+        ShiftedValue,
+        DAG.getConstant(0, MVT::i32),
+        DAG.getConstant(0, MVT::i32),
+        Mask
+      };
+      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
+      SDValue Args[3] = { Chain, Input, DWordAddr };
+      return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
+                                     Op->getVTList(), Args, 3, MemVT,
+                                     StoreNode->getMemOperand());
+    } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
+               Value.getValueType().bitsGE(MVT::i32)) {
+      // Convert pointer from byte address to dword address.
+      Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
+                        DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
+                                    Ptr, DAG.getConstant(2, MVT::i32)));
+
+      if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
+        assert(!"Truncated and indexed stores not supported yet");
+      } else {
+        Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
+      }
+      return Chain;
     }
-    return Chain;
   }
 
   EVT ValueVT = Value.getValueType();
@@ -789,7 +1148,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
     }
     Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
-    DAG.getTargetConstant(0, MVT::i32)); // Channel 
+    DAG.getTargetConstant(0, MVT::i32)); // Channel
   }
 
   return Chain;
@@ -839,18 +1198,28 @@ ConstantAddressBlock(unsigned AddressSpace) {
 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
 {
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
   SDValue Chain = Op.getOperand(0);
   SDValue Ptr = Op.getOperand(1);
   SDValue LoweredLoad;
 
+  if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
+    SDValue MergedValues[2] = {
+      SplitVectorLoad(Op, DAG),
+      Chain
+    };
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
+
   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
-  if (ConstantBlock > -1) {
+  if (ConstantBlock > -1 &&
+      ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
+       (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
     SDValue Result;
-    if (dyn_cast<ConstantExpr>(LoadNode->getSrcValue()) ||
-        dyn_cast<Constant>(LoadNode->getSrcValue()) ||
-        dyn_cast<ConstantSDNode>(Ptr)) {
+    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
+        isa<Constant>(LoadNode->getSrcValue()) ||
+        isa<ConstantSDNode>(Ptr)) {
       SDValue Slots[4];
       for (unsigned i = 0; i < 4; i++) {
         // We want Const position encoded with the following formula :
@@ -862,13 +1231,19 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
             DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
         Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
       }
-      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Slots, 4);
+      EVT NewVT = MVT::v4i32;
+      unsigned NumElements = 4;
+      if (VT.isVector()) {
+        NewVT = VT;
+        NumElements = VT.getVectorNumElements();
+      }
+      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
     } else {
       // non constant ptr cant be folded, keeps it as a v4f32 load
       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
           DAG.getConstant(LoadNode->getAddressSpace() -
-	                  AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
+                          AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
           );
     }
 
@@ -884,6 +1259,30 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
     return DAG.getMergeValues(MergedValues, 2, DL);
   }
 
+  // For most operations returning SDValue() will result in the node being
+  // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
+  // need to manually expand loads that may be legal in some address spaces and
+  // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
+  // compute shaders, since the data is sign extended when it is uploaded to the
+  // buffer. However SEXT loads from other address spaces are not supported, so
+  // we need to expand them here.
+  if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
+    EVT MemVT = LoadNode->getMemoryVT();
+    assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
+    SDValue ShiftAmount =
+          DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
+    SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
+                                  LoadNode->getPointerInfo(), MemVT,
+                                  LoadNode->isVolatile(),
+                                  LoadNode->isNonTemporal(),
+                                  LoadNode->getAlignment());
+    SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
+    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
+
+    SDValue MergedValues[2] = { Sra, Chain };
+    return DAG.getMergeValues(MergedValues, 2, DL);
+  }
+
   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
     return SDValue();
   }
@@ -941,42 +1340,158 @@ SDValue R600TargetLowering::LowerFormalArguments(
                                       CallingConv::ID CallConv,
                                       bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SDLoc DL, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
-  unsigned ParamOffsetBytes = 36;
-  Function::const_arg_iterator FuncArg =
-                            DAG.getMachineFunction().getFunction()->arg_begin();
-  for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) {
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned ShaderType = MF.getInfo<R600MachineFunctionInfo>()->ShaderType;
+
+  SmallVector<ISD::InputArg, 8> LocalIns;
+
+  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+                          LocalIns);
+
+  AnalyzeFormalArguments(CCInfo, LocalIns);
+
+  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
     EVT VT = Ins[i].VT;
-    Type *ArgType = FuncArg->getType();
-    unsigned ArgSizeInBits = ArgType->isPointerTy() ?
-                             32 : ArgType->getPrimitiveSizeInBits();
-    unsigned ArgBytes = ArgSizeInBits >> 3;
-    EVT ArgVT;
-    if (ArgSizeInBits < VT.getSizeInBits()) {
-      assert(!ArgType->isFloatTy() &&
-             "Extending floating point arguments not supported yet");
-      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
-    } else {
-      ArgVT = VT;
+    EVT MemVT = LocalIns[i].VT;
+
+    if (ShaderType != ShaderType::COMPUTE) {
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
+      SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+      InVals.push_back(Register);
+      continue;
     }
+
     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                                    AMDGPUAS::PARAM_I_ADDRESS);
-    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
-                                DAG.getConstant(ParamOffsetBytes, MVT::i32),
-                                       MachinePointerInfo(UndefValue::get(PtrTy)),
-                                       ArgVT, false, false, ArgBytes);
+                                                   AMDGPUAS::CONSTANT_BUFFER_0);
+
+    // The first 36 bytes of the input buffer contains information about
+    // thread group and global sizes.
+    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
+                                 DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
+                                 MachinePointerInfo(UndefValue::get(PtrTy)),
+                                 MemVT, false, false, 4);
+                                 // 4 is the prefered alignment for
+                                 // the CONSTANT memory space.
     InVals.push_back(Arg);
-    ParamOffsetBytes += ArgBytes;
   }
   return Chain;
 }
 
-EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
+EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
    if (!VT.isVector()) return MVT::i32;
    return VT.changeVectorElementTypeToInteger();
 }
 
+static SDValue
+CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
+                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
+  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+  assert(RemapSwizzle.empty());
+  SDValue NewBldVec[4] = {
+      VectorEntry.getOperand(0),
+      VectorEntry.getOperand(1),
+      VectorEntry.getOperand(2),
+      VectorEntry.getOperand(3)
+  };
+
+  for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+      // We mask write here to teach later passes that the ith element of this
+      // vector is undef. Thus we can use it to reduce 128 bits reg usage,
+      // break false dependencies and additionnaly make assembly easier to read.
+      RemapSwizzle[i] = 7; // SEL_MASK_WRITE
+    if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
+      if (C->isZero()) {
+        RemapSwizzle[i] = 4; // SEL_0
+        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+      } else if (C->isExactlyValue(1.0)) {
+        RemapSwizzle[i] = 5; // SEL_1
+        NewBldVec[i] = DAG.getUNDEF(MVT::f32);
+      }
+    }
+
+    if (NewBldVec[i].getOpcode() == ISD::UNDEF)
+      continue;
+    for (unsigned j = 0; j < i; j++) {
+      if (NewBldVec[i] == NewBldVec[j]) {
+        NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
+        RemapSwizzle[i] = j;
+        break;
+      }
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
+      VectorEntry.getValueType(), NewBldVec, 4);
+}
+
+static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
+                                DenseMap<unsigned, unsigned> &RemapSwizzle) {
+  assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
+  assert(RemapSwizzle.empty());
+  SDValue NewBldVec[4] = {
+      VectorEntry.getOperand(0),
+      VectorEntry.getOperand(1),
+      VectorEntry.getOperand(2),
+      VectorEntry.getOperand(3)
+  };
+  bool isUnmovable[4] = { false, false, false, false };
+  for (unsigned i = 0; i < 4; i++)
+    RemapSwizzle[i] = i;
+
+  for (unsigned i = 0; i < 4; i++) {
+    if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
+          ->getZExtValue();
+      if (i == Idx) {
+        isUnmovable[Idx] = true;
+        continue;
+      }
+      if (isUnmovable[Idx])
+        continue;
+      // Swap i and Idx
+      std::swap(NewBldVec[Idx], NewBldVec[i]);
+      std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
+      break;
+    }
+  }
+
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
+      VectorEntry.getValueType(), NewBldVec, 4);
+}
+
+
+SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
+SDValue Swz[4], SelectionDAG &DAG) const {
+  assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
+  // Old -> New swizzle values
+  DenseMap<unsigned, unsigned> SwizzleRemap;
+
+  BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
+    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
+  }
+
+  SwizzleRemap.clear();
+  BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
+    if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+      Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
+  }
+
+  return BuildVector;
+}
+
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Optimizations
 //===----------------------------------------------------------------------===//
@@ -990,7 +1505,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::FP_ROUND: {
       SDValue Arg = N->getOperand(0);
       if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
-        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
+        return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
                            Arg.getOperand(0));
       }
       break;
@@ -1015,7 +1530,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
       return SDValue();
     }
 
-    return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), N->getValueType(0),
+    return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
                            SelectCC.getOperand(0), // LHS
                            SelectCC.getOperand(1), // RHS
                            DAG.getConstant(-1, MVT::i32), // True
@@ -1024,6 +1539,61 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
 
     break;
   }
+
+  // insert_vector_elt (build_vector elt0, ... , eltN), NewEltIdx, idx
+  // => build_vector elt0, ... , NewEltIdx, ... , eltN
+  case ISD::INSERT_VECTOR_ELT: {
+    SDValue InVec = N->getOperand(0);
+    SDValue InVal = N->getOperand(1);
+    SDValue EltNo = N->getOperand(2);
+    SDLoc dl(N);
+
+    // If the inserted element is an UNDEF, just use the input vector.
+    if (InVal.getOpcode() == ISD::UNDEF)
+      return InVec;
+
+    EVT VT = InVec.getValueType();
+
+    // If we can't generate a legal BUILD_VECTOR, exit
+    if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
+      return SDValue();
+
+    // Check that we know which element is being inserted
+    if (!isa<ConstantSDNode>(EltNo))
+      return SDValue();
+    unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+    // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+    // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
+    // vector elements.
+    SmallVector<SDValue, 8> Ops;
+    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
+      Ops.append(InVec.getNode()->op_begin(),
+                 InVec.getNode()->op_end());
+    } else if (InVec.getOpcode() == ISD::UNDEF) {
+      unsigned NElts = VT.getVectorNumElements();
+      Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+    } else {
+      return SDValue();
+    }
+
+    // Insert the element
+    if (Elt < Ops.size()) {
+      // All the operands of BUILD_VECTOR must have the same type;
+      // we enforce that here.
+      EVT OpVT = Ops[0].getValueType();
+      if (InVal.getValueType() != OpVT)
+        InVal = OpVT.bitsGT(InVal.getValueType()) ?
+          DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
+          DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
+      Ops[Elt] = InVal;
+    }
+
+    // Return the new vector
+    return DAG.getNode(ISD::BUILD_VECTOR, dl,
+                       VT, &Ops[0], Ops.size());
+  }
+
   // Extract_vec (Build_vector) generated by custom lowering
   // also needs to be customly combined
   case ISD::EXTRACT_VECTOR_ELT: {
@@ -1038,7 +1608,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
         Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
       if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
         unsigned Element = Const->getZExtValue();
-        return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), N->getVTList(),
+        return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
             Arg->getOperand(0).getOperand(Element));
       }
     }
@@ -1073,25 +1643,25 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
       ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
       LHSCC = ISD::getSetCCInverse(LHSCC,
                                   LHS.getOperand(0).getValueType().isInteger());
-      return DAG.getSelectCC(N->getDebugLoc(),
-                             LHS.getOperand(0),
-                             LHS.getOperand(1),
-                             LHS.getOperand(2),
-                             LHS.getOperand(3),
-                             LHSCC);
+      if (DCI.isBeforeLegalizeOps() ||
+          isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
+        return DAG.getSelectCC(SDLoc(N),
+                               LHS.getOperand(0),
+                               LHS.getOperand(1),
+                               LHS.getOperand(2),
+                               LHS.getOperand(3),
+                               LHSCC);
+      break;
     }
     }
+    return SDValue();
   }
+
   case AMDGPUISD::EXPORT: {
     SDValue Arg = N->getOperand(1);
     if (Arg.getOpcode() != ISD::BUILD_VECTOR)
       break;
-    SDValue NewBldVec[4] = {
-        DAG.getUNDEF(MVT::f32),
-        DAG.getUNDEF(MVT::f32),
-        DAG.getUNDEF(MVT::f32),
-        DAG.getUNDEF(MVT::f32)
-      };
+
     SDValue NewArgs[8] = {
       N->getOperand(0), // Chain
       SDValue(),
@@ -1102,23 +1672,290 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
       N->getOperand(6), // SWZ_Z
       N->getOperand(7) // SWZ_W
     };
-    for (unsigned i = 0; i < Arg.getNumOperands(); i++) {
-      if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Arg.getOperand(i))) {
-        if (C->isZero()) {
-          NewArgs[4 + i] = DAG.getConstant(4, MVT::i32); // SEL_0
-        } else if (C->isExactlyValue(1.0)) {
-          NewArgs[4 + i] = DAG.getConstant(5, MVT::i32); // SEL_0
-        } else {
-          NewBldVec[i] = Arg.getOperand(i);
+    SDLoc DL(N);
+    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
+    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+  }
+  case AMDGPUISD::TEXTURE_FETCH: {
+    SDValue Arg = N->getOperand(1);
+    if (Arg.getOpcode() != ISD::BUILD_VECTOR)
+      break;
+
+    SDValue NewArgs[19] = {
+      N->getOperand(0),
+      N->getOperand(1),
+      N->getOperand(2),
+      N->getOperand(3),
+      N->getOperand(4),
+      N->getOperand(5),
+      N->getOperand(6),
+      N->getOperand(7),
+      N->getOperand(8),
+      N->getOperand(9),
+      N->getOperand(10),
+      N->getOperand(11),
+      N->getOperand(12),
+      N->getOperand(13),
+      N->getOperand(14),
+      N->getOperand(15),
+      N->getOperand(16),
+      N->getOperand(17),
+      N->getOperand(18),
+    };
+    NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
+    return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
+        NewArgs, 19);
+  }
+  }
+  return SDValue();
+}
+
+static bool
+FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
+            SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
+  const R600InstrInfo *TII =
+      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+  if (!Src.isMachineOpcode())
+    return false;
+  switch (Src.getMachineOpcode()) {
+  case AMDGPU::FNEG_R600:
+    if (!Neg.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Neg = DAG.getTargetConstant(1, MVT::i32);
+    return true;
+  case AMDGPU::FABS_R600:
+    if (!Abs.getNode())
+      return false;
+    Src = Src.getOperand(0);
+    Abs = DAG.getTargetConstant(1, MVT::i32);
+    return true;
+  case AMDGPU::CONST_COPY: {
+    unsigned Opcode = ParentNode->getMachineOpcode();
+    bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+
+    if (!Sel.getNode())
+      return false;
+
+    SDValue CstOffset = Src.getOperand(0);
+    if (ParentNode->getValueType(0).isVector())
+      return false;
+
+    // Gather constants values
+    int SrcIndices[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+    };
+    std::vector<unsigned> Consts;
+    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
+      int OtherSrcIdx = SrcIndices[i];
+      int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
+      if (OtherSrcIdx < 0 || OtherSelIdx < 0)
+        continue;
+      if (HasDst) {
+        OtherSrcIdx--;
+        OtherSelIdx--;
+      }
+      if (RegisterSDNode *Reg =
+          dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
+        if (Reg->getReg() == AMDGPU::ALU_CONST) {
+          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
+              ParentNode->getOperand(OtherSelIdx));
+          Consts.push_back(Cst->getZExtValue());
         }
+      }
+    }
+
+    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+    Consts.push_back(Cst->getZExtValue());
+    if (!TII->fitsConstReadLimitations(Consts)) {
+      return false;
+    }
+
+    Sel = CstOffset;
+    Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
+    return true;
+  }
+  case AMDGPU::MOV_IMM_I32:
+  case AMDGPU::MOV_IMM_F32: {
+    unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
+    uint64_t ImmValue = 0;
+
+
+    if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
+      ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+      float FloatValue = FPC->getValueAPF().convertToFloat();
+      if (FloatValue == 0.0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (FloatValue == 0.5) {
+        ImmReg = AMDGPU::HALF;
+      } else if (FloatValue == 1.0) {
+        ImmReg = AMDGPU::ONE;
+      } else {
+        ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
+      }
+    } else {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
+      uint64_t Value = C->getZExtValue();
+      if (Value == 0) {
+        ImmReg = AMDGPU::ZERO;
+      } else if (Value == 1) {
+        ImmReg = AMDGPU::ONE_INT;
       } else {
-        NewBldVec[i] = Arg.getOperand(i);
+        ImmValue = Value;
       }
     }
-    DebugLoc DL = N->getDebugLoc();
-    NewArgs[1] = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4f32, NewBldVec, 4);
-    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+
+    // Check that we aren't already using an immediate.
+    // XXX: It's possible for an instruction to have more than one
+    // immediate operand, but this is not supported yet.
+    if (ImmReg == AMDGPU::ALU_LITERAL_X) {
+      if (!Imm.getNode())
+        return false;
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
+      assert(C);
+      if (C->getZExtValue())
+        return false;
+      Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
+    }
+    Src = DAG.getRegister(ImmReg, MVT::i32);
+    return true;
   }
+  default:
+    return false;
   }
-  return SDValue();
+}
+
+
+/// \brief Fold the instructions after selecting them
+SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
+                                            SelectionDAG &DAG) const {
+  const R600InstrInfo *TII =
+      static_cast<const R600InstrInfo *>(DAG.getTarget().getInstrInfo());
+  if (!Node->isMachineOpcode())
+    return Node;
+  unsigned Opcode = Node->getMachineOpcode();
+  SDValue FakeOp;
+
+  std::vector<SDValue> Ops;
+  for(SDNode::op_iterator I = Node->op_begin(), E = Node->op_end();
+              I != E; ++I)
+          Ops.push_back(*I);
+
+  if (Opcode == AMDGPU::DOT_4) {
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
+        };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
+    };
+    for (unsigned i = 0; i < 8; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue &Abs = Ops[AbsIdx[i] - 1];
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      if (HasDst)
+        SelIdx--;
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::REG_SEQUENCE) {
+    for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
+      SDValue &Src = Ops[i];
+      if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  } else if (Opcode == AMDGPU::CLAMP_R600) {
+    SDValue Src = Node->getOperand(0);
+    if (!Src.isMachineOpcode() ||
+        !TII->hasInstrModifiers(Src.getMachineOpcode()))
+      return Node;
+    int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
+        AMDGPU::OpName::clamp);
+    if (ClampIdx < 0)
+      return Node;
+    std::vector<SDValue> Ops;
+    unsigned NumOp = Src.getNumOperands();
+    for(unsigned i = 0; i < NumOp; ++i)
+          Ops.push_back(Src.getOperand(i));
+    Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
+    return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
+        Node->getVTList(), Ops);
+  } else {
+    if (!TII->hasInstrModifiers(Opcode))
+      return Node;
+    int OperandIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
+    };
+    int NegIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
+    };
+    int AbsIdx[] = {
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
+      TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
+      -1
+    };
+    for (unsigned i = 0; i < 3; i++) {
+      if (OperandIdx[i] < 0)
+        return Node;
+      SDValue &Src = Ops[OperandIdx[i] - 1];
+      SDValue &Neg = Ops[NegIdx[i] - 1];
+      SDValue FakeAbs;
+      SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
+      bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
+      int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
+      int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
+      if (HasDst) {
+        SelIdx--;
+        ImmIdx--;
+      }
+      SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
+      SDValue &Imm = Ops[ImmIdx];
+      if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
+        return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
+    }
+  }
+
+  return Node;
 }
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 2c09acb..c10257e 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -36,37 +36,37 @@ public:
                                       CallingConv::ID CallConv,
                                       bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SDLoc DL, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const;
-  virtual EVT getSetCCResultType(EVT VT) const;
+  virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const;
 private:
-  const R600InstrInfo * TII;
-
+  unsigned Gen;
   /// Each OpenCL kernel has nine implicit parameters that are stored in the
   /// first nine dwords of a Vertex Buffer.  These implicit parameters are
   /// lowered to load instructions which retreive the values from the Vertex
   /// Buffer.
   SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
-                                 DebugLoc DL, unsigned DwordOffset) const;
+                                 SDLoc DL, unsigned DwordOffset) const;
 
   void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
       MachineRegisterInfo & MRI, unsigned dword_offset) const;
+  SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
 
   /// \brief Lower ROTL opcode to BITALIGN
   SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFrameIndex(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue stackPtrToRegIndex(SDValue Ptr, unsigned StackWidth,
                                           SelectionDAG &DAG) const;
   void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
                        unsigned &Channel, unsigned &PtrIncr) const;
   bool isZero(SDValue Op) const;
+  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
 };
 
 } // End namespace llvm;
diff --git a/lib/Target/R600/R600InstrFormats.td b/lib/Target/R600/R600InstrFormats.td
new file mode 100644
index 0000000..9428bab
--- /dev/null
+++ b/lib/Target/R600/R600InstrFormats.td
@@ -0,0 +1,492 @@
+//===-- R600InstrFormats.td - R600 Instruction Encodings ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// R600 Instruction format definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
+                InstrItinClass itin>
+    : AMDGPUInst <outs, ins, asm, pattern> {
+
+  field bits<64> Inst;
+  bit Trig = 0;
+  bit Op3 = 0;
+  bit isVector = 0;
+  bits<2> FlagOperandIdx = 0;
+  bit Op1 = 0;
+  bit Op2 = 0;
+  bit LDS_1A = 0;
+  bit LDS_1A1D = 0;
+  bit HasNativeOperands = 0;
+  bit VTXInst = 0;
+  bit TEXInst = 0;
+  bit ALUInst = 0;
+  bit IsExport = 0;
+  bit LDS_1A2D = 0;
+
+  let Namespace = "AMDGPU";
+  let OutOperandList = outs;
+  let InOperandList = ins;
+  let AsmString = asm;
+  let Pattern = pattern;
+  let Itinerary = itin;
+
+  let TSFlags{4} = Trig;
+  let TSFlags{5} = Op3;
+
+  // Vector instructions are instructions that must fill all slots in an
+  // instruction group
+  let TSFlags{6} = isVector;
+  let TSFlags{8-7} = FlagOperandIdx;
+  let TSFlags{9} = HasNativeOperands;
+  let TSFlags{10} = Op1;
+  let TSFlags{11} = Op2;
+  let TSFlags{12} = VTXInst;
+  let TSFlags{13} = TEXInst;
+  let TSFlags{14} = ALUInst;
+  let TSFlags{15} = LDS_1A;
+  let TSFlags{16} = LDS_1A1D;
+  let TSFlags{17} = IsExport;
+  let TSFlags{18} = LDS_1A2D;
+}
+
+//===----------------------------------------------------------------------===//
+// ALU instructions
+//===----------------------------------------------------------------------===//
+
+class R600_ALU_LDS_Word0 {
+  field bits<32> Word0;
+
+  bits<11> src0;
+  bits<1>  src0_rel;
+  bits<11> src1;
+  bits<1>  src1_rel;
+  bits<3>  index_mode = 0;
+  bits<2>  pred_sel;
+  bits<1>  last;
+
+  bits<9>  src0_sel  = src0{8-0};
+  bits<2>  src0_chan = src0{10-9};
+  bits<9>  src1_sel  = src1{8-0};
+  bits<2>  src1_chan = src1{10-9};
+
+  let Word0{8-0}   = src0_sel;
+  let Word0{9}     = src0_rel;
+  let Word0{11-10} = src0_chan;
+  let Word0{21-13} = src1_sel;
+  let Word0{22}    = src1_rel;
+  let Word0{24-23} = src1_chan;
+  let Word0{28-26} = index_mode;
+  let Word0{30-29} = pred_sel;
+  let Word0{31}    = last;
+}
+
+class R600ALU_Word0 : R600_ALU_LDS_Word0 {
+
+  bits<1>  src0_neg;
+  bits<1>  src1_neg;
+
+  let Word0{12}    = src0_neg;
+  let Word0{25}    = src1_neg;
+}
+
+class R600ALU_Word1 {
+  field bits<32> Word1;
+
+  bits<11> dst;
+  bits<3>  bank_swizzle;
+  bits<1>  dst_rel;
+  bits<1>  clamp;
+
+  bits<7>  dst_sel  = dst{6-0};
+  bits<2>  dst_chan = dst{10-9};
+
+  let Word1{20-18} = bank_swizzle;
+  let Word1{27-21} = dst_sel;
+  let Word1{28}    = dst_rel;
+  let Word1{30-29} = dst_chan;
+  let Word1{31}    = clamp;
+}
+
+class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
+
+  bits<1>  src0_abs;
+  bits<1>  src1_abs;
+  bits<1>  update_exec_mask;
+  bits<1>  update_pred;
+  bits<1>  write;
+  bits<2>  omod;
+
+  let Word1{0}     = src0_abs;
+  let Word1{1}     = src1_abs;
+  let Word1{2}     = update_exec_mask;
+  let Word1{3}     = update_pred;
+  let Word1{4}     = write;
+  let Word1{6-5}   = omod;
+  let Word1{17-7}  = alu_inst;
+}
+
+class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
+
+  bits<11> src2;
+  bits<1>  src2_rel;
+  bits<1>  src2_neg;
+
+  bits<9>  src2_sel = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{12}    = src2_neg;
+  let Word1{17-13} = alu_inst;
+}
+
+class R600LDS_Word1 {
+  field bits<32> Word1;
+
+  bits<11> src2;
+  bits<9>  src2_sel  = src2{8-0};
+  bits<2>  src2_chan = src2{10-9};
+  bits<1>  src2_rel;
+  // offset specifies the stride offset to the second set of data to be read
+  // from.  This is a dword offset.
+  bits<5>  alu_inst = 17; // OP3_INST_LDS_IDX_OP
+  bits<3>  bank_swizzle;
+  bits<6>  lds_op;
+  bits<2>  dst_chan = 0;
+
+  let Word1{8-0}   = src2_sel;
+  let Word1{9}     = src2_rel;
+  let Word1{11-10} = src2_chan;
+  let Word1{17-13} = alu_inst;
+  let Word1{20-18} = bank_swizzle;
+  let Word1{26-21} = lds_op;
+  let Word1{30-29} = dst_chan;
+}
+
+
+/*
+XXX: R600 subtarget uses a slightly different encoding than the other
+subtargets.  We currently handle this in R600MCCodeEmitter, but we may
+want to use these instruction classes in the future.
+
+class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
+
+  bits<1>  fog_merge;
+  bits<10> alu_inst;
+
+  let Inst{37}    = fog_merge;
+  let Inst{39-38} = omod;
+  let Inst{49-40} = alu_inst;
+}
+
+class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
+
+  bits<11> alu_inst;
+
+  let Inst{38-37} = omod;
+  let Inst{49-39} = alu_inst;
+}
+*/
+
+//===----------------------------------------------------------------------===//
+// Vertex Fetch instructions
+//===----------------------------------------------------------------------===//
+
+class VTX_WORD0 {
+  field bits<32> Word0;
+  bits<7> src_gpr;
+  bits<5> VC_INST;
+  bits<2> FETCH_TYPE;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> BUFFER_ID;
+  bits<1> SRC_REL;
+  bits<2> SRC_SEL_X;
+
+  let Word0{4-0}   = VC_INST;
+  let Word0{6-5}   = FETCH_TYPE;
+  let Word0{7}     = FETCH_WHOLE_QUAD;
+  let Word0{15-8}  = BUFFER_ID;
+  let Word0{22-16} = src_gpr;
+  let Word0{23}    = SRC_REL;
+  let Word0{25-24} = SRC_SEL_X;
+}
+
+class VTX_WORD0_eg : VTX_WORD0 {
+
+  bits<6> MEGA_FETCH_COUNT;
+
+  let Word0{31-26} = MEGA_FETCH_COUNT;
+}
+
+class VTX_WORD0_cm : VTX_WORD0 {
+
+  bits<2> SRC_SEL_Y;
+  bits<2> STRUCTURED_READ;
+  bits<1> LDS_REQ;
+  bits<1> COALESCED_READ;
+
+  let Word0{27-26} = SRC_SEL_Y;
+  let Word0{29-28} = STRUCTURED_READ;
+  let Word0{30}    = LDS_REQ;
+  let Word0{31}    = COALESCED_READ;
+}
+
+class VTX_WORD1_GPR {
+  field bits<32> Word1;
+  bits<7> dst_gpr;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<1> USE_CONST_FIELDS;
+  bits<6> DATA_FORMAT;
+  bits<2> NUM_FORMAT_ALL;
+  bits<1> FORMAT_COMP_ALL;
+  bits<1> SRF_MODE_ALL;
+
+  let Word1{6-0} = dst_gpr;
+  let Word1{7}    = DST_REL;
+  let Word1{8}    = 0; // Reserved
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{21}    = USE_CONST_FIELDS;
+  let Word1{27-22} = DATA_FORMAT;
+  let Word1{29-28} = NUM_FORMAT_ALL;
+  let Word1{30}    = FORMAT_COMP_ALL;
+  let Word1{31}    = SRF_MODE_ALL;
+}
+
+//===----------------------------------------------------------------------===//
+// Texture fetch instructions
+//===----------------------------------------------------------------------===//
+
+class TEX_WORD0 {
+  field bits<32> Word0;
+
+  bits<5> TEX_INST;
+  bits<2> INST_MOD;
+  bits<1> FETCH_WHOLE_QUAD;
+  bits<8> RESOURCE_ID;
+  bits<7> SRC_GPR;
+  bits<1> SRC_REL;
+  bits<1> ALT_CONST;
+  bits<2> RESOURCE_INDEX_MODE;
+  bits<2> SAMPLER_INDEX_MODE;
+
+  let Word0{4-0} = TEX_INST;
+  let Word0{6-5} = INST_MOD;
+  let Word0{7} = FETCH_WHOLE_QUAD;
+  let Word0{15-8} = RESOURCE_ID;
+  let Word0{22-16} = SRC_GPR;
+  let Word0{23} = SRC_REL;
+  let Word0{24} = ALT_CONST;
+  let Word0{26-25} = RESOURCE_INDEX_MODE;
+  let Word0{28-27} = SAMPLER_INDEX_MODE;
+}
+
+class TEX_WORD1 {
+  field bits<32> Word1;
+
+  bits<7> DST_GPR;
+  bits<1> DST_REL;
+  bits<3> DST_SEL_X;
+  bits<3> DST_SEL_Y;
+  bits<3> DST_SEL_Z;
+  bits<3> DST_SEL_W;
+  bits<7> LOD_BIAS;
+  bits<1> COORD_TYPE_X;
+  bits<1> COORD_TYPE_Y;
+  bits<1> COORD_TYPE_Z;
+  bits<1> COORD_TYPE_W;
+
+  let Word1{6-0} = DST_GPR;
+  let Word1{7} = DST_REL;
+  let Word1{11-9} = DST_SEL_X;
+  let Word1{14-12} = DST_SEL_Y;
+  let Word1{17-15} = DST_SEL_Z;
+  let Word1{20-18} = DST_SEL_W;
+  let Word1{27-21} = LOD_BIAS;
+  let Word1{28} = COORD_TYPE_X;
+  let Word1{29} = COORD_TYPE_Y;
+  let Word1{30} = COORD_TYPE_Z;
+  let Word1{31} = COORD_TYPE_W;
+}
+
+class TEX_WORD2 {
+  field bits<32> Word2;
+
+  bits<5> OFFSET_X;
+  bits<5> OFFSET_Y;
+  bits<5> OFFSET_Z;
+  bits<5> SAMPLER_ID;
+  bits<3> SRC_SEL_X;
+  bits<3> SRC_SEL_Y;
+  bits<3> SRC_SEL_Z;
+  bits<3> SRC_SEL_W;
+
+  let Word2{4-0} = OFFSET_X;
+  let Word2{9-5} = OFFSET_Y;
+  let Word2{14-10} = OFFSET_Z;
+  let Word2{19-15} = SAMPLER_ID;
+  let Word2{22-20} = SRC_SEL_X;
+  let Word2{25-23} = SRC_SEL_Y;
+  let Word2{28-26} = SRC_SEL_Z;
+  let Word2{31-29} = SRC_SEL_W;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Instructions
+//===----------------------------------------------------------------------===//
+
+class CF_WORD1_R600 {
+  field bits<32> Word1;
+
+  bits<3> POP_COUNT;
+  bits<5> CF_CONST;
+  bits<2> COND;
+  bits<3> COUNT;
+  bits<6> CALL_COUNT;
+  bits<1> COUNT_3;
+  bits<1> END_OF_PROGRAM;
+  bits<1> VALID_PIXEL_MODE;
+  bits<7> CF_INST;
+  bits<1> WHOLE_QUAD_MODE;
+  bits<1> BARRIER;
+
+  let Word1{2-0} = POP_COUNT;
+  let Word1{7-3} = CF_CONST;
+  let Word1{9-8} = COND;
+  let Word1{12-10} = COUNT;
+  let Word1{18-13} = CALL_COUNT;
+  let Word1{19} = COUNT_3;
+  let Word1{21} = END_OF_PROGRAM;
+  let Word1{22} = VALID_PIXEL_MODE;
+  let Word1{29-23} = CF_INST;
+  let Word1{30} = WHOLE_QUAD_MODE;
+  let Word1{31} = BARRIER;
+}
+
+class CF_WORD0_EG {
+  field bits<32> Word0;
+
+  bits<24> ADDR;
+  bits<3> JUMPTABLE_SEL;
+
+  let Word0{23-0} = ADDR;
+  let Word0{26-24} = JUMPTABLE_SEL;
+}
+
+class CF_WORD1_EG {
+  field bits<32> Word1;
+
+  bits<3> POP_COUNT;
+  bits<5> CF_CONST;
+  bits<2> COND;
+  bits<6> COUNT;
+  bits<1> VALID_PIXEL_MODE;
+  bits<1> END_OF_PROGRAM;
+  bits<8> CF_INST;
+  bits<1> BARRIER;
+
+  let Word1{2-0} = POP_COUNT;
+  let Word1{7-3} = CF_CONST;
+  let Word1{9-8} = COND;
+  let Word1{15-10} = COUNT;
+  let Word1{20} = VALID_PIXEL_MODE;
+  let Word1{21} = END_OF_PROGRAM;
+  let Word1{29-22} = CF_INST;
+  let Word1{31} = BARRIER;
+}
+
+class CF_ALU_WORD0 {
+  field bits<32> Word0;
+
+  bits<22> ADDR;
+  bits<4> KCACHE_BANK0;
+  bits<4> KCACHE_BANK1;
+  bits<2> KCACHE_MODE0;
+
+  let Word0{21-0} = ADDR;
+  let Word0{25-22} = KCACHE_BANK0;
+  let Word0{29-26} = KCACHE_BANK1;
+  let Word0{31-30} = KCACHE_MODE0;
+}
+
+class CF_ALU_WORD1 {
+  field bits<32> Word1;
+
+  bits<2> KCACHE_MODE1;
+  bits<8> KCACHE_ADDR0;
+  bits<8> KCACHE_ADDR1;
+  bits<7> COUNT;
+  bits<1> ALT_CONST;
+  bits<4> CF_INST;
+  bits<1> WHOLE_QUAD_MODE;
+  bits<1> BARRIER;
+
+  let Word1{1-0} = KCACHE_MODE1;
+  let Word1{9-2} = KCACHE_ADDR0;
+  let Word1{17-10} = KCACHE_ADDR1;
+  let Word1{24-18} = COUNT;
+  let Word1{25} = ALT_CONST;
+  let Word1{29-26} = CF_INST;
+  let Word1{30} = WHOLE_QUAD_MODE;
+  let Word1{31} = BARRIER;
+}
+
+class CF_ALLOC_EXPORT_WORD0_RAT {
+  field bits<32> Word0;
+
+  bits<4> rat_id;
+  bits<6> rat_inst;
+  bits<2> rim;
+  bits<2> type;
+  bits<7> rw_gpr;
+  bits<1> rw_rel;
+  bits<7> index_gpr;
+  bits<2> elem_size;
+
+  let Word0{3-0}   = rat_id;
+  let Word0{9-4}   = rat_inst;
+  let Word0{10}    = 0; // Reserved
+  let Word0{12-11} = rim;
+  let Word0{14-13} = type;
+  let Word0{21-15} = rw_gpr;
+  let Word0{22}    = rw_rel;
+  let Word0{29-23} = index_gpr;
+  let Word0{31-30} = elem_size;
+}
+
+class CF_ALLOC_EXPORT_WORD1_BUF {
+  field bits<32> Word1;
+
+  bits<12> array_size;
+  bits<4>  comp_mask;
+  bits<4>  burst_count;
+  bits<1>  vpm;
+  bits<1>  eop;
+  bits<8>  cf_inst;
+  bits<1>  mark;
+  bits<1>  barrier;
+
+  let Word1{11-0} = array_size;
+  let Word1{15-12} = comp_mask;
+  let Word1{19-16} = burst_count;
+  let Word1{20}    = vpm;
+  let Word1{21}    = eop;
+  let Word1{29-22} = cf_inst;
+  let Word1{30}    = mark;
+  let Word1{31}    = barrier;
+}
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 37150c4..c0827fc 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -19,18 +19,18 @@
 #include "R600Defines.h"
 #include "R600MachineFunctionInfo.h"
 #include "R600RegisterInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "AMDGPUGenDFAPacketizer.inc"
 
 using namespace llvm;
 
 R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
   : AMDGPUInstrInfo(tm),
-    RI(tm, *this),
+    RI(tm),
     ST(tm.getSubtarget<AMDGPUSubtarget>())
   { }
 
@@ -51,9 +51,17 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const {
-  if (AMDGPU::R600_Reg128RegClass.contains(DestReg)
-      && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
-    for (unsigned I = 0; I < 4; I++) {
+  unsigned VectorComponents = 0;
+  if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
+      AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
+    VectorComponents = 4;
+  } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
+            AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
+    VectorComponents = 2;
+  }
+
+  if (VectorComponents > 0) {
+    for (unsigned I = 0; I < VectorComponents; I++) {
       unsigned SubRegIndex = RI.getSubRegFromChannel(I);
       buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                               RI.getSubReg(DestReg, SubRegIndex),
@@ -62,28 +70,23 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                       RegState::Define | RegState::Implicit);
     }
   } else {
-
-    // We can't copy vec4 registers
-    assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg)
-           && !AMDGPU::R600_Reg128RegClass.contains(SrcReg));
-
     MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV,
                                                   DestReg, SrcReg);
-    NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0))
+    NewMI->getOperand(getOperandIdx(*NewMI, AMDGPU::OpName::src0))
                                     .setIsKill(KillSrc);
   }
 }
 
-MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF,
-                                             unsigned DstReg, int64_t Imm) const {
-  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc());
-  MachineInstrBuilder MIB(*MF, MI);
-  MIB.addReg(DstReg, RegState::Define);
-  MIB.addReg(AMDGPU::ALU_LITERAL_X);
-  MIB.addImm(Imm);
-  MIB.addReg(0); // PREDICATE_BIT
-
-  return MI;
+/// \returns true if \p MBBI can be moved into a new basic.
+bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI) const {
+  for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
+                                        E = MBBI->operands_end(); I != E; ++I) {
+    if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) &&
+        I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg()))
+      return false;
+  }
+  return true;
 }
 
 unsigned R600InstrInfo::getIEQOpcode() const {
@@ -114,12 +117,7 @@ bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const {
 }
 
 bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
-  switch(Opcode) {
-    default: return false;
-    case AMDGPU::DOT4_r600_pseudo:
-    case AMDGPU::DOT4_eg_pseudo:
-      return true;
-  }
+  return false;
 }
 
 bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
@@ -136,19 +134,73 @@ bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
 bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
   unsigned TargetFlags = get(Opcode).TSFlags;
 
+  return (TargetFlags & R600_InstFlag::ALU_INST);
+}
+
+bool R600InstrInfo::hasInstrModifiers(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
   return ((TargetFlags & R600_InstFlag::OP1) |
           (TargetFlags & R600_InstFlag::OP2) |
           (TargetFlags & R600_InstFlag::OP3));
 }
 
+bool R600InstrInfo::isLDSInstr(unsigned Opcode) const {
+  unsigned TargetFlags = get(Opcode).TSFlags;
+
+  return ((TargetFlags & R600_InstFlag::LDS_1A) |
+          (TargetFlags & R600_InstFlag::LDS_1A1D) |
+          (TargetFlags & R600_InstFlag::LDS_1A2D));
+}
+
+bool R600InstrInfo::isLDSNoRetInstr(unsigned Opcode) const {
+  return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) == -1;
+}
+
+bool R600InstrInfo::isLDSRetInstr(unsigned Opcode) const {
+  return isLDSInstr(Opcode) && getOperandIdx(Opcode, AMDGPU::OpName::dst) != -1;
+}
+
+bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const {
+  if (isALUInstr(MI->getOpcode()))
+    return true;
+  if (isVector(*MI) || isCubeOp(MI->getOpcode()))
+    return true;
+  switch (MI->getOpcode()) {
+  case AMDGPU::PRED_X:
+  case AMDGPU::INTERP_PAIR_XY:
+  case AMDGPU::INTERP_PAIR_ZW:
+  case AMDGPU::INTERP_VEC_LOAD:
+  case AMDGPU::COPY:
+  case AMDGPU::DOT_4:
+    return true;
+  default:
+    return false;
+  }
+}
+
 bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
-  return (get(Opcode).TSFlags & R600_InstFlag::TRANS_ONLY);
+  if (ST.hasCaymanISA())
+    return false;
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::TransALU);
 }
 
 bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
   return isTransOnly(MI->getOpcode());
 }
 
+bool R600InstrInfo::isVectorOnly(unsigned Opcode) const {
+  return (get(Opcode).getSchedClass() == AMDGPU::Sched::VecALU);
+}
+
+bool R600InstrInfo::isVectorOnly(const MachineInstr *MI) const {
+  return isVectorOnly(MI->getOpcode());
+}
+
+bool R600InstrInfo::isExport(unsigned Opcode) const {
+  return (get(Opcode).TSFlags & R600_InstFlag::IS_EXPORT);
+}
+
 bool R600InstrInfo::usesVertexCache(unsigned Opcode) const {
   return ST.hasVertexCache() && IS_VTX(get(Opcode));
 }
@@ -168,6 +220,380 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const {
          usesTextureCache(MI->getOpcode());
 }
 
+bool R600InstrInfo::mustBeLastInClause(unsigned Opcode) const {
+  switch (Opcode) {
+  case AMDGPU::KILLGT:
+  case AMDGPU::GROUP_BARRIER:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600InstrInfo::usesAddressRegister(MachineInstr *MI) const {
+  return  MI->findRegisterUseOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::definesAddressRegister(MachineInstr *MI) const {
+  return MI->findRegisterDefOperandIdx(AMDGPU::AR_X) != -1;
+}
+
+bool R600InstrInfo::readsLDSSrcReg(const MachineInstr *MI) const {
+  if (!isALUInstr(MI->getOpcode())) {
+    return false;
+  }
+  for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
+                                        E = MI->operands_end(); I != E; ++I) {
+    if (!I->isReg() || !I->isUse() ||
+        TargetRegisterInfo::isVirtualRegister(I->getReg()))
+      continue;
+
+    if (AMDGPU::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
+      return true;
+  }
+  return false;
+}
+
+int R600InstrInfo::getSrcIdx(unsigned Opcode, unsigned SrcNum) const {
+  static const unsigned OpTable[] = {
+    AMDGPU::OpName::src0,
+    AMDGPU::OpName::src1,
+    AMDGPU::OpName::src2
+  };
+
+  assert (SrcNum < 3);
+  return getOperandIdx(Opcode, OpTable[SrcNum]);
+}
+
+#define SRC_SEL_ROWS 11
+int R600InstrInfo::getSelIdx(unsigned Opcode, unsigned SrcIdx) const {
+  static const unsigned SrcSelTable[SRC_SEL_ROWS][2] = {
+    {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+    {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+    {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+    {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+    {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+    {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+    {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+    {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+    {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+    {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+    {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W}
+  };
+
+  for (unsigned i = 0; i < SRC_SEL_ROWS; ++i) {
+    if (getOperandIdx(Opcode, SrcSelTable[i][0]) == (int)SrcIdx) {
+      return getOperandIdx(Opcode, SrcSelTable[i][1]);
+    }
+  }
+  return -1;
+}
+#undef SRC_SEL_ROWS
+
+SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+R600InstrInfo::getSrcs(MachineInstr *MI) const {
+  SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
+
+  if (MI->getOpcode() == AMDGPU::DOT_4) {
+    static const unsigned OpTable[8][2] = {
+      {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
+      {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
+      {AMDGPU::OpName::src0_Z, AMDGPU::OpName::src0_sel_Z},
+      {AMDGPU::OpName::src0_W, AMDGPU::OpName::src0_sel_W},
+      {AMDGPU::OpName::src1_X, AMDGPU::OpName::src1_sel_X},
+      {AMDGPU::OpName::src1_Y, AMDGPU::OpName::src1_sel_Y},
+      {AMDGPU::OpName::src1_Z, AMDGPU::OpName::src1_sel_Z},
+      {AMDGPU::OpName::src1_W, AMDGPU::OpName::src1_sel_W},
+    };
+
+    for (unsigned j = 0; j < 8; j++) {
+      MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
+                                                        OpTable[j][0]));
+      unsigned Reg = MO.getReg();
+      if (Reg == AMDGPU::ALU_CONST) {
+        unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(),
+                                                    OpTable[j][1])).getImm();
+        Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+        continue;
+      }
+      
+    }
+    return Result;
+  }
+
+  static const unsigned OpTable[3][2] = {
+    {AMDGPU::OpName::src0, AMDGPU::OpName::src0_sel},
+    {AMDGPU::OpName::src1, AMDGPU::OpName::src1_sel},
+    {AMDGPU::OpName::src2, AMDGPU::OpName::src2_sel},
+  };
+
+  for (unsigned j = 0; j < 3; j++) {
+    int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+    if (SrcIdx < 0)
+      break;
+    MachineOperand &MO = MI->getOperand(SrcIdx);
+    unsigned Reg = MI->getOperand(SrcIdx).getReg();
+    if (Reg == AMDGPU::ALU_CONST) {
+      unsigned Sel = MI->getOperand(
+          getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
+      Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Sel));
+      continue;
+    }
+    if (Reg == AMDGPU::ALU_LITERAL_X) {
+      unsigned Imm = MI->getOperand(
+          getOperandIdx(MI->getOpcode(), AMDGPU::OpName::literal)).getImm();
+      Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, Imm));
+      continue;
+    }
+    Result.push_back(std::pair<MachineOperand *, int64_t>(&MO, 0));
+  }
+  return Result;
+}
+
+std::vector<std::pair<int, unsigned> >
+R600InstrInfo::ExtractSrcs(MachineInstr *MI,
+                           const DenseMap<unsigned, unsigned> &PV,
+                           unsigned &ConstCount) const {
+  ConstCount = 0;
+  const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs = getSrcs(MI);
+  const std::pair<int, unsigned> DummyPair(-1, 0);
+  std::vector<std::pair<int, unsigned> > Result;
+  unsigned i = 0;
+  for (unsigned n = Srcs.size(); i < n; ++i) {
+    unsigned Reg = Srcs[i].first->getReg();
+    unsigned Index = RI.getEncodingValue(Reg) & 0xff;
+    if (Reg == AMDGPU::OQAP) {
+      Result.push_back(std::pair<int, unsigned>(Index, 0));
+    }
+    if (PV.find(Reg) != PV.end()) {
+      // 255 is used to tells its a PS/PV reg
+      Result.push_back(std::pair<int, unsigned>(255, 0));
+      continue;
+    }
+    if (Index > 127) {
+      ConstCount++;
+      Result.push_back(DummyPair);
+      continue;
+    }
+    unsigned Chan = RI.getHWRegChan(Reg);
+    Result.push_back(std::pair<int, unsigned>(Index, Chan));
+  }
+  for (; i < 3; ++i)
+    Result.push_back(DummyPair);
+  return Result;
+}
+
+static std::vector<std::pair<int, unsigned> >
+Swizzle(std::vector<std::pair<int, unsigned> > Src,
+        R600InstrInfo::BankSwizzle Swz) {
+  if (Src[0] == Src[1])
+    Src[1].first = -1;
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210:
+    break;
+  case R600InstrInfo::ALU_VEC_021_SCL_122:
+    std::swap(Src[1], Src[2]);
+    break;
+  case R600InstrInfo::ALU_VEC_102_SCL_221:
+    std::swap(Src[0], Src[1]);
+    break;
+  case R600InstrInfo::ALU_VEC_120_SCL_212:
+    std::swap(Src[0], Src[1]);
+    std::swap(Src[0], Src[2]);
+    break;
+  case R600InstrInfo::ALU_VEC_201:
+    std::swap(Src[0], Src[2]);
+    std::swap(Src[0], Src[1]);
+    break;
+  case R600InstrInfo::ALU_VEC_210:
+    std::swap(Src[0], Src[2]);
+    break;
+  }
+  return Src;
+}
+
+static unsigned
+getTransSwizzle(R600InstrInfo::BankSwizzle Swz, unsigned Op) {
+  switch (Swz) {
+  case R600InstrInfo::ALU_VEC_012_SCL_210: {
+    unsigned Cycles[3] = { 2, 1, 0};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_021_SCL_122: {
+    unsigned Cycles[3] = { 1, 2, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_120_SCL_212: {
+    unsigned Cycles[3] = { 2, 1, 2};
+    return Cycles[Op];
+  }
+  case R600InstrInfo::ALU_VEC_102_SCL_221: {
+    unsigned Cycles[3] = { 2, 2, 1};
+    return Cycles[Op];
+  }
+  default:
+    llvm_unreachable("Wrong Swizzle for Trans Slot");
+    return 0;
+  }
+}
+
+/// returns how many MIs (whose inputs are represented by IGSrcs) can be packed
+/// in the same Instruction Group while meeting read port limitations given a
+/// Swz swizzle sequence.
+unsigned  R600InstrInfo::isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  int Vector[4][3];
+  memset(Vector, -1, sizeof(Vector));
+  for (unsigned i = 0, e = IGSrcs.size(); i < e; i++) {
+    const std::vector<std::pair<int, unsigned> > &Srcs =
+        Swizzle(IGSrcs[i], Swz[i]);
+    for (unsigned j = 0; j < 3; j++) {
+      const std::pair<int, unsigned> &Src = Srcs[j];
+      if (Src.first < 0 || Src.first == 255)
+        continue;
+      if (Src.first == GET_REG_INDEX(RI.getEncodingValue(AMDGPU::OQAP))) {
+        if (Swz[i] != R600InstrInfo::ALU_VEC_012_SCL_210 &&
+            Swz[i] != R600InstrInfo::ALU_VEC_021_SCL_122) {
+            // The value from output queue A (denoted by register OQAP) can
+            // only be fetched during the first cycle.
+            return false;
+        }
+        // OQAP does not count towards the normal read port restrictions
+        continue;
+      }
+      if (Vector[Src.second][j] < 0)
+        Vector[Src.second][j] = Src.first;
+      if (Vector[Src.second][j] != Src.first)
+        return i;
+    }
+  }
+  // Now check Trans Alu
+  for (unsigned i = 0, e = TransSrcs.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransSrcs[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (Src.first == 255)
+      continue;
+    if (Vector[Src.second][Cycle] < 0)
+      Vector[Src.second][Cycle] = Src.first;
+    if (Vector[Src.second][Cycle] != Src.first)
+      return IGSrcs.size() - 1;
+  }
+  return IGSrcs.size();
+}
+
+/// Given a swizzle sequence SwzCandidate and an index Idx, returns the next
+/// (in lexicographic term) swizzle sequence assuming that all swizzles after
+/// Idx can be skipped
+static bool
+NextPossibleSolution(
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    unsigned Idx) {
+  assert(Idx < SwzCandidate.size());
+  int ResetIdx = Idx;
+  while (ResetIdx > -1 && SwzCandidate[ResetIdx] == R600InstrInfo::ALU_VEC_210)
+    ResetIdx --;
+  for (unsigned i = ResetIdx + 1, e = SwzCandidate.size(); i < e; i++) {
+    SwzCandidate[i] = R600InstrInfo::ALU_VEC_012_SCL_210;
+  }
+  if (ResetIdx == -1)
+    return false;
+  int NextSwizzle = SwzCandidate[ResetIdx] + 1;
+  SwzCandidate[ResetIdx] = (R600InstrInfo::BankSwizzle)NextSwizzle;
+  return true;
+}
+
+/// Enumerate all possible Swizzle sequence to find one that can meet all
+/// read port requirements.
+bool R600InstrInfo::FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const {
+  unsigned ValidUpTo = 0;
+  do {
+    ValidUpTo = isLegalUpTo(IGSrcs, SwzCandidate, TransSrcs, TransSwz);
+    if (ValidUpTo == IGSrcs.size())
+      return true;
+  } while (NextPossibleSolution(SwzCandidate, ValidUpTo));
+  return false;
+}
+
+/// Instructions in Trans slot can't read gpr at cycle 0 if they also read
+/// a const, and can't read a gpr at cycle 1 if they read 2 const.
+static bool
+isConstCompatible(R600InstrInfo::BankSwizzle TransSwz,
+                  const std::vector<std::pair<int, unsigned> > &TransOps,
+                  unsigned ConstCount) {
+  // TransALU can't read 3 constants
+  if (ConstCount > 2)
+    return false;
+  for (unsigned i = 0, e = TransOps.size(); i < e; ++i) {
+    const std::pair<int, unsigned> &Src = TransOps[i];
+    unsigned Cycle = getTransSwizzle(TransSwz, i);
+    if (Src.first < 0)
+      continue;
+    if (ConstCount > 0 && Cycle == 0)
+      return false;
+    if (ConstCount > 1 && Cycle == 1)
+      return false;
+  }
+  return true;
+}
+
+bool
+R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
+                                       const DenseMap<unsigned, unsigned> &PV,
+                                       std::vector<BankSwizzle> &ValidSwizzle,
+                                       bool isLastAluTrans)
+    const {
+  //Todo : support shared src0 - src1 operand
+
+  std::vector<std::vector<std::pair<int, unsigned> > > IGSrcs;
+  ValidSwizzle.clear();
+  unsigned ConstCount;
+  BankSwizzle TransBS = ALU_VEC_012_SCL_210;
+  for (unsigned i = 0, e = IG.size(); i < e; ++i) {
+    IGSrcs.push_back(ExtractSrcs(IG[i], PV, ConstCount));
+    unsigned Op = getOperandIdx(IG[i]->getOpcode(),
+        AMDGPU::OpName::bank_swizzle);
+    ValidSwizzle.push_back( (R600InstrInfo::BankSwizzle)
+        IG[i]->getOperand(Op).getImm());
+  }
+  std::vector<std::pair<int, unsigned> > TransOps;
+  if (!isLastAluTrans)
+    return FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps, TransBS);
+
+  TransOps = IGSrcs.back();
+  IGSrcs.pop_back();
+  ValidSwizzle.pop_back();
+
+  static const R600InstrInfo::BankSwizzle TransSwz[] = {
+    ALU_VEC_012_SCL_210,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221
+  };
+  for (unsigned i = 0; i < 4; i++) {
+    TransBS = TransSwz[i];
+    if (!isConstCompatible(TransBS, TransOps, ConstCount))
+      continue;
+    bool Result = FindSwizzleForVectorSlot(IGSrcs, ValidSwizzle, TransOps,
+        TransBS);
+    if (Result) {
+      ValidSwizzle.push_back(TransBS);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+
 bool
 R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
     const {
@@ -194,37 +620,31 @@ R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
 }
 
 bool
-R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
+R600InstrInfo::fitsConstReadLimitations(const std::vector<MachineInstr *> &MIs)
+    const {
   std::vector<unsigned> Consts;
+  SmallSet<int64_t, 4> Literals;
   for (unsigned i = 0, n = MIs.size(); i < n; i++) {
-    const MachineInstr *MI = MIs[i];
-
-    const R600Operands::Ops OpTable[3][2] = {
-      {R600Operands::SRC0, R600Operands::SRC0_SEL},
-      {R600Operands::SRC1, R600Operands::SRC1_SEL},
-      {R600Operands::SRC2, R600Operands::SRC2_SEL},
-    };
-
+    MachineInstr *MI = MIs[i];
     if (!isALUInstr(MI->getOpcode()))
       continue;
 
-    for (unsigned j = 0; j < 3; j++) {
-      int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
-      if (SrcIdx < 0)
-        break;
-      unsigned Reg = MI->getOperand(SrcIdx).getReg();
-      if (Reg == AMDGPU::ALU_CONST) {
-        unsigned Const = MI->getOperand(
-            getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
-        Consts.push_back(Const);
-        continue;
-      }
-      if (AMDGPU::R600_KC0RegClass.contains(Reg) ||
-          AMDGPU::R600_KC1RegClass.contains(Reg)) {
-        unsigned Index = RI.getEncodingValue(Reg) & 0xff;
-        unsigned Chan = RI.getHWRegChan(Reg);
+    const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Srcs =
+        getSrcs(MI);
+
+    for (unsigned j = 0, e = Srcs.size(); j < e; j++) {
+      std::pair<MachineOperand *, unsigned> Src = Srcs[j];
+      if (Src.first->getReg() == AMDGPU::ALU_LITERAL_X)
+        Literals.insert(Src.second);
+      if (Literals.size() > 4)
+        return false;
+      if (Src.first->getReg() == AMDGPU::ALU_CONST)
+        Consts.push_back(Src.second);
+      if (AMDGPU::R600_KC0RegClass.contains(Src.first->getReg()) ||
+          AMDGPU::R600_KC1RegClass.contains(Src.first->getReg())) {
+        unsigned Index = RI.getEncodingValue(Src.first->getReg()) & 0xff;
+        unsigned Chan = RI.getHWRegChan(Src.first->getReg());
         Consts.push_back((Index << 2) | Chan);
-        continue;
       }
     }
   }
@@ -265,6 +685,11 @@ bool isJump(unsigned Opcode) {
   return Opcode == AMDGPU::JUMP || Opcode == AMDGPU::JUMP_COND;
 }
 
+static bool isBranch(unsigned Opcode) {
+  return Opcode == AMDGPU::BRANCH || Opcode == AMDGPU::BRANCH_COND_i32 ||
+      Opcode == AMDGPU::BRANCH_COND_f32;
+}
+
 bool
 R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                              MachineBasicBlock *&TBB,
@@ -283,6 +708,10 @@ R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       return false;
     --I;
   }
+  // AMDGPU::BRANCH* instructions are only available after isel and are not
+  // handled
+  if (isBranch(I->getOpcode()))
+    return true;
   if (!isJump(static_cast<MachineInstr *>(I)->getOpcode())) {
     return false;
   }
@@ -343,6 +772,17 @@ int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
   };
 }
 
+static
+MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
+  for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
+      It != E; ++It) {
+    if (It->getOpcode() == AMDGPU::CF_ALU ||
+        It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
+      return llvm::prior(It.base());
+  }
+  return MBB.end();
+}
+
 unsigned
 R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
                             MachineBasicBlock *TBB,
@@ -364,6 +804,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
       BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
              .addMBB(TBB)
              .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+      MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+      if (CfAlu == MBB.end())
+        return 1;
+      assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
+      CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
       return 1;
     }
   } else {
@@ -375,6 +820,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
             .addMBB(TBB)
             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
     BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      return 2;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
     return 2;
   }
 }
@@ -398,6 +848,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
     MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
     clearFlag(predSet, 0, MO_FLAG_PUSH);
     I->eraseFromParent();
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      break;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU));
     break;
   }
   case AMDGPU::JUMP:
@@ -418,6 +873,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
     MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
     clearFlag(predSet, 0, MO_FLAG_PUSH);
     I->eraseFromParent();
+    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
+    if (CfAlu == MBB.end())
+      break;
+    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
+    CfAlu->setDesc(get(AMDGPU::CF_ALU));
     break;
   }
   case AMDGPU::JUMP:
@@ -452,6 +912,15 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const {
 
   if (MI->getOpcode() == AMDGPU::KILLGT) {
     return false;
+  } else if (MI->getOpcode() == AMDGPU::CF_ALU) {
+    // If the clause start in the middle of MBB then the MBB has more
+    // than a single clause, unable to predicate several clauses.
+    if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI))
+      return false;
+    // TODO: We don't support KC merging atm
+    if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0)
+      return false;
+    return true;
   } else if (isVector(*MI)) {
     return false;
   } else {
@@ -547,6 +1016,25 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
                       const SmallVectorImpl<MachineOperand> &Pred) const {
   int PIdx = MI->findFirstPredOperandIdx();
 
+  if (MI->getOpcode() == AMDGPU::CF_ALU) {
+    MI->getOperand(8).setImm(0);
+    return true;
+  }
+
+  if (MI->getOpcode() == AMDGPU::DOT_4) {
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Z))
+        .setReg(Pred[2].getReg());
+    MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_W))
+        .setReg(Pred[2].getReg());
+    MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
+    MIB.addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit);
+    return true;
+  }
+
   if (PIdx != -1) {
     MachineOperand &PMO = MI->getOperand(PIdx);
     PMO.setReg(Pred[2].getReg());
@@ -558,6 +1046,10 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
   return false;
 }
 
+unsigned int R600InstrInfo::getPredicationCost(const MachineInstr *) const {
+  return 2;
+}
+
 unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
                                             const MachineInstr *MI,
                                             unsigned *PredCost) const {
@@ -566,67 +1058,25 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
   return 2;
 }
 
-int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  int Offset = 0;
-
-  if (MFI->getNumObjects() == 0) {
-    return -1;
-  }
-
-  if (MRI.livein_empty()) {
-    return 0;
-  }
-
-  for (MachineRegisterInfo::livein_iterator LI = MRI.livein_begin(),
-                                            LE = MRI.livein_end();
-                                            LI != LE; ++LI) {
-    Offset = std::max(Offset,
-                      GET_REG_INDEX(RI.getEncodingValue(LI->first)));
-  }
-
-  return Offset + 1;
-}
-
-int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
-  int Offset = 0;
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Variable sized objects are not supported
-  assert(!MFI->hasVarSizedObjects());
-
-  if (MFI->getNumObjects() == 0) {
-    return -1;
-  }
-
-  Offset = TM.getFrameLowering()->getFrameIndexOffset(MF, -1);
-
-  return getIndirectIndexBegin(MF) + Offset;
-}
-
-std::vector<unsigned> R600InstrInfo::getIndirectReservedRegs(
+void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
                                              const MachineFunction &MF) const {
   const AMDGPUFrameLowering *TFL =
                  static_cast<const AMDGPUFrameLowering*>(TM.getFrameLowering());
-  std::vector<unsigned> Regs;
 
   unsigned StackWidth = TFL->getStackWidth(MF);
   int End = getIndirectIndexEnd(MF);
 
-  if (End == -1) {
-    return Regs;
-  }
+  if (End == -1)
+    return;
 
   for (int Index = getIndirectIndexBegin(MF); Index <= End; ++Index) {
     unsigned SuperReg = AMDGPU::R600_Reg128RegClass.getRegister(Index);
-    Regs.push_back(SuperReg);
+    Reserved.set(SuperReg);
     for (unsigned Chan = 0; Chan < StackWidth; ++Chan) {
       unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister((4 * Index) + Chan);
-      Regs.push_back(Reg);
+      Reserved.set(Reg);
     }
   }
-  return Regs;
 }
 
 unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
@@ -636,13 +1086,8 @@ unsigned R600InstrInfo::calculateIndirectAddress(unsigned RegIndex,
   return RegIndex;
 }
 
-const TargetRegisterClass * R600InstrInfo::getIndirectAddrStoreRegClass(
-                                                     unsigned SourceReg) const {
-  return &AMDGPU::R600_TReg32RegClass;
-}
-
-const TargetRegisterClass *R600InstrInfo::getIndirectAddrLoadRegClass() const {
-  return &AMDGPU::TRegMemRegClass;
+const TargetRegisterClass *R600InstrInfo::getIndirectAddrRegClass() const {
+  return &AMDGPU::R600_TReg32_XRegClass;
 }
 
 MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
@@ -652,12 +1097,13 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
   unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
   MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
                                                AMDGPU::AR_X, OffsetReg);
-  setImmOperand(MOVA, R600Operands::WRITE, 0);
+  setImmOperand(MOVA, AMDGPU::OpName::write, 0);
 
   MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
                                       AddrReg, ValueReg)
-                                      .addReg(AMDGPU::AR_X, RegState::Implicit);
-  setImmOperand(Mov, R600Operands::DST_REL, 1);
+                                      .addReg(AMDGPU::AR_X,
+                                           RegState::Implicit | RegState::Kill);
+  setImmOperand(Mov, AMDGPU::OpName::dst_rel, 1);
   return Mov;
 }
 
@@ -669,20 +1115,17 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
   MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
                                                        AMDGPU::AR_X,
                                                        OffsetReg);
-  setImmOperand(MOVA, R600Operands::WRITE, 0);
+  setImmOperand(MOVA, AMDGPU::OpName::write, 0);
   MachineInstrBuilder Mov = buildDefaultInstruction(*MBB, I, AMDGPU::MOV,
                                       ValueReg,
                                       AddrReg)
-                                      .addReg(AMDGPU::AR_X, RegState::Implicit);
-  setImmOperand(Mov, R600Operands::SRC0_REL, 1);
+                                      .addReg(AMDGPU::AR_X,
+                                           RegState::Implicit | RegState::Kill);
+  setImmOperand(Mov, AMDGPU::OpName::src0_rel, 1);
 
   return Mov;
 }
 
-const TargetRegisterClass *R600InstrInfo::getSuperIndirectRegClass() const {
-  return &AMDGPU::IndirectRegRegClass;
-}
-
 unsigned R600InstrInfo::getMaxAlusPerClause() const {
   return 115;
 }
@@ -728,52 +1171,118 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
   return MIB;
 }
 
+#define OPERAND_CASE(Label) \
+  case Label: { \
+    static const unsigned Ops[] = \
+    { \
+      Label##_X, \
+      Label##_Y, \
+      Label##_Z, \
+      Label##_W \
+    }; \
+    return Ops[Slot]; \
+  }
+
+static unsigned getSlotedOps(unsigned  Op, unsigned Slot) {
+  switch (Op) {
+  OPERAND_CASE(AMDGPU::OpName::update_exec_mask)
+  OPERAND_CASE(AMDGPU::OpName::update_pred)
+  OPERAND_CASE(AMDGPU::OpName::write)
+  OPERAND_CASE(AMDGPU::OpName::omod)
+  OPERAND_CASE(AMDGPU::OpName::dst_rel)
+  OPERAND_CASE(AMDGPU::OpName::clamp)
+  OPERAND_CASE(AMDGPU::OpName::src0)
+  OPERAND_CASE(AMDGPU::OpName::src0_neg)
+  OPERAND_CASE(AMDGPU::OpName::src0_rel)
+  OPERAND_CASE(AMDGPU::OpName::src0_abs)
+  OPERAND_CASE(AMDGPU::OpName::src0_sel)
+  OPERAND_CASE(AMDGPU::OpName::src1)
+  OPERAND_CASE(AMDGPU::OpName::src1_neg)
+  OPERAND_CASE(AMDGPU::OpName::src1_rel)
+  OPERAND_CASE(AMDGPU::OpName::src1_abs)
+  OPERAND_CASE(AMDGPU::OpName::src1_sel)
+  OPERAND_CASE(AMDGPU::OpName::pred_sel)
+  default:
+    llvm_unreachable("Wrong Operand");
+  }
+}
+
+#undef OPERAND_CASE
+
+MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
+    MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
+    const {
+  assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+  unsigned Opcode;
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+  if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+    Opcode = AMDGPU::DOT4_r600;
+  else
+    Opcode = AMDGPU::DOT4_eg;
+  MachineBasicBlock::iterator I = MI;
+  MachineOperand &Src0 = MI->getOperand(
+      getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot)));
+  MachineOperand &Src1 = MI->getOperand(
+      getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot)));
+  MachineInstr *MIB = buildDefaultInstruction(
+      MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
+  static const unsigned  Operands[14] = {
+    AMDGPU::OpName::update_exec_mask,
+    AMDGPU::OpName::update_pred,
+    AMDGPU::OpName::write,
+    AMDGPU::OpName::omod,
+    AMDGPU::OpName::dst_rel,
+    AMDGPU::OpName::clamp,
+    AMDGPU::OpName::src0_neg,
+    AMDGPU::OpName::src0_rel,
+    AMDGPU::OpName::src0_abs,
+    AMDGPU::OpName::src0_sel,
+    AMDGPU::OpName::src1_neg,
+    AMDGPU::OpName::src1_rel,
+    AMDGPU::OpName::src1_abs,
+    AMDGPU::OpName::src1_sel,
+  };
+
+  MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
+      getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
+  MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+      .setReg(MO.getReg());
+
+  for (unsigned i = 0; i < 14; i++) {
+    MachineOperand &MO = MI->getOperand(
+        getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
+    assert (MO.isImm());
+    setImmOperand(MIB, Operands[i], MO.getImm());
+  }
+  MIB->getOperand(20).setImm(0);
+  return MIB;
+}
+
 MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
                                          MachineBasicBlock::iterator I,
                                          unsigned DstReg,
                                          uint64_t Imm) const {
   MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
                                                   AMDGPU::ALU_LITERAL_X);
-  setImmOperand(MovImm, R600Operands::IMM, Imm);
+  setImmOperand(MovImm, AMDGPU::OpName::literal, Imm);
   return MovImm;
 }
 
-int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
-                                 R600Operands::Ops Op) const {
-  return getOperandIdx(MI.getOpcode(), Op);
+MachineInstr *R600InstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned DstReg, unsigned SrcReg) const {
+  return buildDefaultInstruction(*MBB, I, AMDGPU::MOV, DstReg, SrcReg);
 }
 
-int R600InstrInfo::getOperandIdx(unsigned Opcode,
-                                 R600Operands::Ops Op) const {
-  unsigned TargetFlags = get(Opcode).TSFlags;
-  unsigned OpTableIdx;
-
-  if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
-    switch (Op) {
-    case R600Operands::DST: return 0;
-    case R600Operands::SRC0: return 1;
-    case R600Operands::SRC1: return 2;
-    case R600Operands::SRC2: return 3;
-    default:
-      assert(!"Unknown operand type for instruction");
-      return -1;
-    }
-  }
-
-  if (TargetFlags & R600_InstFlag::OP1) {
-    OpTableIdx = 0;
-  } else if (TargetFlags & R600_InstFlag::OP2) {
-    OpTableIdx = 1;
-  } else {
-    assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined "
-                                                 "for this instruction");
-    OpTableIdx = 2;
-  }
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI, unsigned Op) const {
+  return getOperandIdx(MI.getOpcode(), Op);
+}
 
-  return R600Operands::ALUOpTable[OpTableIdx][Op];
+int R600InstrInfo::getOperandIdx(unsigned Opcode, unsigned Op) const {
+  return AMDGPU::getNamedOperandIdx(Opcode, Op);
 }
 
-void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
+void R600InstrInfo::setImmOperand(MachineInstr *MI, unsigned Op,
                                   int64_t Imm) const {
   int Idx = getOperandIdx(*MI, Op);
   assert(Idx != -1 && "Operand not supported for this instruction.");
@@ -801,20 +1310,20 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
     bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3;
     switch (Flag) {
     case MO_FLAG_CLAMP:
-      FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP);
+      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::clamp);
       break;
     case MO_FLAG_MASK:
-      FlagIndex = getOperandIdx(*MI, R600Operands::WRITE);
+      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::write);
       break;
     case MO_FLAG_NOT_LAST:
     case MO_FLAG_LAST:
-      FlagIndex = getOperandIdx(*MI, R600Operands::LAST);
+      FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::last);
       break;
     case MO_FLAG_NEG:
       switch (SrcIdx) {
-      case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break;
-      case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break;
-      case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break;
+      case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_neg); break;
+      case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_neg); break;
+      case 2: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src2_neg); break;
       }
       break;
 
@@ -823,8 +1332,8 @@ MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
                        "instructions.");
       (void)IsOP3;
       switch (SrcIdx) {
-      case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break;
-      case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break;
+      case 0: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src0_abs); break;
+      case 1: FlagIndex = getOperandIdx(*MI, AMDGPU::OpName::src1_abs); break;
       }
       break;
 
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index babe4b8..13d9810 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -16,7 +16,6 @@
 #define R600INSTRUCTIONINFO_H_
 
 #include "AMDGPUInstrInfo.h"
-#include "AMDIL.h"
 #include "R600Defines.h"
 #include "R600RegisterInfo.h"
 #include <map>
@@ -36,8 +35,19 @@ namespace llvm {
   const AMDGPUSubtarget &ST;
 
   int getBranchInstr(const MachineOperand &op) const;
+  std::vector<std::pair<int, unsigned> >
+  ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
 
   public:
+  enum BankSwizzle {
+    ALU_VEC_012_SCL_210 = 0,
+    ALU_VEC_021_SCL_122,
+    ALU_VEC_120_SCL_212,
+    ALU_VEC_102_SCL_221,
+    ALU_VEC_201,
+    ALU_VEC_210
+  };
+
   explicit R600InstrInfo(AMDGPUTargetMachine &tm);
 
   const R600RegisterInfo &getRegisterInfo() const;
@@ -45,6 +55,8 @@ namespace llvm {
                            MachineBasicBlock::iterator MI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const;
+  bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI) const;
 
   bool isTrig(const MachineInstr &MI) const;
   bool isPlaceHolderOpcode(unsigned opcode) const;
@@ -53,25 +65,83 @@ namespace llvm {
 
   /// \returns true if this \p Opcode represents an ALU instruction.
   bool isALUInstr(unsigned Opcode) const;
+  bool hasInstrModifiers(unsigned Opcode) const;
+  bool isLDSInstr(unsigned Opcode) const;
+  bool isLDSNoRetInstr(unsigned Opcode) const;
+  bool isLDSRetInstr(unsigned Opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction or an
+  /// instruction that will be lowered in ExpandSpecialInstrs Pass.
+  bool canBeConsideredALU(const MachineInstr *MI) const;
 
   bool isTransOnly(unsigned Opcode) const;
   bool isTransOnly(const MachineInstr *MI) const;
+  bool isVectorOnly(unsigned Opcode) const;
+  bool isVectorOnly(const MachineInstr *MI) const;
+  bool isExport(unsigned Opcode) const;
 
   bool usesVertexCache(unsigned Opcode) const;
   bool usesVertexCache(const MachineInstr *MI) const;
   bool usesTextureCache(unsigned Opcode) const;
   bool usesTextureCache(const MachineInstr *MI) const;
 
+  bool mustBeLastInClause(unsigned Opcode) const;
+  bool usesAddressRegister(MachineInstr *MI) const;
+  bool definesAddressRegister(MachineInstr *MI) const;
+  bool readsLDSSrcReg(const MachineInstr *MI) const;
+
+  /// \returns The operand index for the given source number.  Legal values
+  /// for SrcNum are 0, 1, and 2.
+  int getSrcIdx(unsigned Opcode, unsigned SrcNum) const;
+  /// \returns The operand Index for the Sel operand given an index to one
+  /// of the instruction's src operands.
+  int getSelIdx(unsigned Opcode, unsigned SrcIdx) const;
+
+  /// \returns a pair for each src of an ALU instructions.
+  /// The first member of a pair is the register id.
+  /// If register is ALU_CONST, second member is SEL.
+  /// If register is ALU_LITERAL, second member is IMM.
+  /// Otherwise, second member value is undefined.
+  SmallVector<std::pair<MachineOperand *, int64_t>, 3>
+      getSrcs(MachineInstr *MI) const;
+
+  unsigned  isLegalUpTo(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    const std::vector<R600InstrInfo::BankSwizzle> &Swz,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  bool FindSwizzleForVectorSlot(
+    const std::vector<std::vector<std::pair<int, unsigned> > > &IGSrcs,
+    std::vector<R600InstrInfo::BankSwizzle> &SwzCandidate,
+    const std::vector<std::pair<int, unsigned> > &TransSrcs,
+    R600InstrInfo::BankSwizzle TransSwz) const;
+
+  /// Given the order VEC_012 < VEC_021 < VEC_120 < VEC_102 < VEC_201 < VEC_210
+  /// returns true and the first (in lexical order) BankSwizzle affectation
+  /// starting from the one already provided in the Instruction Group MIs that
+  /// fits Read Port limitations in BS if available. Otherwise returns false
+  /// and undefined content in BS.
+  /// isLastAluTrans should be set if the last Alu of MIs will be executed on
+  /// Trans ALU. In this case, ValidTSwizzle returns the BankSwizzle value to
+  /// apply to the last instruction.
+  /// PV holds GPR to PV registers in the Instruction Group MIs.
+  bool fitsReadPortLimitations(const std::vector<MachineInstr *> &MIs,
+                               const DenseMap<unsigned, unsigned> &PV,
+                               std::vector<BankSwizzle> &BS,
+                               bool isLastAluTrans) const;
+
+  /// An instruction group can only access 2 channel pair (either [XY] or [ZW])
+  /// from KCache bank on R700+. This function check if MI set in input meet
+  /// this limitations
+  bool fitsConstReadLimitations(const std::vector<MachineInstr *> &) const;
+  /// Same but using const index set instead of MI set.
   bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
-  bool canBundle(const std::vector<MachineInstr *> &) const;
 
   /// \breif Vector instructions are instructions that must fill all
   /// instruction slots within an instruction group.
   bool isVector(const MachineInstr &MI) const;
 
-  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                        int64_t Imm) const;
-
   virtual unsigned getIEQOpcode() const;
   virtual bool isMov(unsigned Opcode) const;
 
@@ -118,6 +188,8 @@ namespace llvm {
   bool PredicateInstruction(MachineInstr *MI,
                         const SmallVectorImpl<MachineOperand> &Pred) const;
 
+  unsigned int getPredicationCost(const MachineInstr *) const;
+
   unsigned int getInstrLatency(const InstrItineraryData *ItinData,
                                const MachineInstr *MI,
                                unsigned *PredCost = 0) const;
@@ -125,22 +197,14 @@ namespace llvm {
   virtual int getInstrLatency(const InstrItineraryData *ItinData,
                               SDNode *Node) const { return 1;}
 
-  /// \returns a list of all the registers that may be accesed using indirect
-  /// addressing.
-  std::vector<unsigned> getIndirectReservedRegs(const MachineFunction &MF) const;
-
-  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
-
-  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
-
+  /// \brief Reserve the registers that may be accesed using indirect addressing.
+  void reserveIndirectRegisters(BitVector &Reserved,
+                                const MachineFunction &MF) const;
 
   virtual unsigned calculateIndirectAddress(unsigned RegIndex,
                                             unsigned Channel) const;
 
-  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
-                                                      unsigned SourceReg) const;
-
-  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+  virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
 
   virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
                                   MachineBasicBlock::iterator I,
@@ -152,8 +216,6 @@ namespace llvm {
                                   unsigned ValueReg, unsigned Address,
                                   unsigned OffsetReg) const;
 
-  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
-
   unsigned getMaxAlusPerClause() const;
 
   ///buildDefaultInstruction - This function returns a MachineInstr with
@@ -170,23 +232,32 @@ namespace llvm {
                                               unsigned Src0Reg,
                                               unsigned Src1Reg = 0) const;
 
+  MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
+                                             MachineInstr *MI,
+                                             unsigned Slot,
+                                             unsigned DstReg) const;
+
   MachineInstr *buildMovImm(MachineBasicBlock &BB,
                                   MachineBasicBlock::iterator I,
                                   unsigned DstReg,
                                   uint64_t Imm) const;
 
+  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DstReg, unsigned SrcReg) const;
+
   /// \brief Get the index of Op in the MachineInstr.
   ///
   /// \returns -1 if the Instruction does not contain the specified \p Op.
-  int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
+  int getOperandIdx(const MachineInstr &MI, unsigned Op) const;
 
   /// \brief Get the index of \p Op for the given Opcode.
   ///
   /// \returns -1 if the Instruction does not contain the specified \p Op.
-  int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
+  int getOperandIdx(unsigned Opcode, unsigned Op) const;
 
   /// \brief Helper function for setting instruction flag values.
-  void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
+  void setImmOperand(MachineInstr *MI, unsigned Op, int64_t Imm) const;
 
   /// \returns true if this instruction has an operand for storing target flags.
   bool hasFlagOperand(const MachineInstr &MI) const;
@@ -208,6 +279,12 @@ namespace llvm {
   void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
 };
 
+namespace AMDGPU {
+
+int getLDSNoRetOp(uint16_t Opcode);
+
+} //End namespace AMDGPU
+
 } // End llvm namespace
 
 #endif // R600INSTRINFO_H_
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8f47523..0346e24 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -12,44 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 include "R600Intrinsics.td"
-
-class InstR600 <dag outs, dag ins, string asm, list<dag> pattern,
-                InstrItinClass itin>
-    : AMDGPUInst <outs, ins, asm, pattern> {
-
-  field bits<64> Inst;
-  bit TransOnly = 0;
-  bit Trig = 0;
-  bit Op3 = 0;
-  bit isVector = 0;
-  bits<2> FlagOperandIdx = 0;
-  bit Op1 = 0;
-  bit Op2 = 0;
-  bit HasNativeOperands = 0;
-  bit VTXInst = 0;
-  bit TEXInst = 0;
-
-  let Namespace = "AMDGPU";
-  let OutOperandList = outs;
-  let InOperandList = ins;
-  let AsmString = asm;
-  let Pattern = pattern;
-  let Itinerary = itin;
-
-  let TSFlags{0} = TransOnly;
-  let TSFlags{4} = Trig;
-  let TSFlags{5} = Op3;
-
-  // Vector instructions are instructions that must fill all slots in an
-  // instruction group
-  let TSFlags{6} = isVector;
-  let TSFlags{8-7} = FlagOperandIdx;
-  let TSFlags{9} = HasNativeOperands;
-  let TSFlags{10} = Op1;
-  let TSFlags{11} = Op2;
-  let TSFlags{12} = VTXInst;
-  let TSFlags{13} = TEXInst;
-}
+include "R600InstrFormats.td"
 
 class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> :
     InstR600 <outs, ins, asm, pattern, NullALU> {
@@ -96,6 +59,12 @@ def UP : InstFlag <"printUpdatePred">;
 // Once we start using the packetizer in this backend we should have this
 // default to 0.
 def LAST : InstFlag<"printLast", 1>;
+def RSel : Operand<i32> {
+  let PrintMethod = "printRSel";
+}
+def CT: Operand<i32> {
+  let PrintMethod = "printCT";
+}
 
 def FRAMEri : Operand<iPTR> {
   let MIOperandInfo = (ops R600_Reg32:$ptr, i32imm:$index);
@@ -106,237 +75,7 @@ def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>;
 def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>;
 def ADDRGA_CONST_OFFSET : ComplexPattern<i32, 1, "SelectGlobalValueConstantOffset", [], []>;
 def ADDRGA_VAR_OFFSET : ComplexPattern<i32, 2, "SelectGlobalValueVariableOffset", [], []>;
-def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
-
-class R600ALU_Word0 {
-  field bits<32> Word0;
-
-  bits<11> src0;
-  bits<1>  src0_neg;
-  bits<1>  src0_rel;
-  bits<11> src1;
-  bits<1>  src1_rel;
-  bits<1>  src1_neg;
-  bits<3>  index_mode = 0;
-  bits<2>  pred_sel;
-  bits<1>  last;
-
-  bits<9>  src0_sel  = src0{8-0};
-  bits<2>  src0_chan = src0{10-9};
-  bits<9>  src1_sel  = src1{8-0};
-  bits<2>  src1_chan = src1{10-9};
-
-  let Word0{8-0}   = src0_sel;
-  let Word0{9}     = src0_rel;
-  let Word0{11-10} = src0_chan;
-  let Word0{12}    = src0_neg;
-  let Word0{21-13} = src1_sel;
-  let Word0{22}    = src1_rel;
-  let Word0{24-23} = src1_chan;
-  let Word0{25}    = src1_neg;
-  let Word0{28-26} = index_mode;
-  let Word0{30-29} = pred_sel;
-  let Word0{31}    = last;
-}
-
-class R600ALU_Word1 {
-  field bits<32> Word1;
-
-  bits<11> dst;
-  bits<3>  bank_swizzle;
-  bits<1>  dst_rel;
-  bits<1>  clamp;
-
-  bits<7>  dst_sel  = dst{6-0};
-  bits<2>  dst_chan = dst{10-9};
-
-  let Word1{20-18} = bank_swizzle;
-  let Word1{27-21} = dst_sel;
-  let Word1{28}    = dst_rel;
-  let Word1{30-29} = dst_chan;
-  let Word1{31}    = clamp;
-}
-
-class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{
-
-  bits<1>  src0_abs;
-  bits<1>  src1_abs;
-  bits<1>  update_exec_mask;
-  bits<1>  update_pred;
-  bits<1>  write;
-  bits<2>  omod;
-
-  let Word1{0}     = src0_abs;
-  let Word1{1}     = src1_abs;
-  let Word1{2}     = update_exec_mask;
-  let Word1{3}     = update_pred;
-  let Word1{4}     = write;
-  let Word1{6-5}   = omod;
-  let Word1{17-7}  = alu_inst;
-}
 
-class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{
-
-  bits<11> src2;
-  bits<1>  src2_rel;
-  bits<1>  src2_neg;
-
-  bits<9>  src2_sel = src2{8-0};
-  bits<2>  src2_chan = src2{10-9};
-
-  let Word1{8-0}   = src2_sel;
-  let Word1{9}     = src2_rel;
-  let Word1{11-10} = src2_chan;
-  let Word1{12}    = src2_neg;
-  let Word1{17-13} = alu_inst;
-}
-
-class VTX_WORD0 {
-  field bits<32> Word0;
-  bits<7> SRC_GPR;
-  bits<5> VC_INST;
-  bits<2> FETCH_TYPE;
-  bits<1> FETCH_WHOLE_QUAD;
-  bits<8> BUFFER_ID;
-  bits<1> SRC_REL;
-  bits<2> SRC_SEL_X;
-  bits<6> MEGA_FETCH_COUNT;
-
-  let Word0{4-0}   = VC_INST;
-  let Word0{6-5}   = FETCH_TYPE;
-  let Word0{7}     = FETCH_WHOLE_QUAD;
-  let Word0{15-8}  = BUFFER_ID;
-  let Word0{22-16} = SRC_GPR;
-  let Word0{23}    = SRC_REL;
-  let Word0{25-24} = SRC_SEL_X;
-  let Word0{31-26} = MEGA_FETCH_COUNT;
-}
-
-class VTX_WORD1_GPR {
-  field bits<32> Word1;
-  bits<7> DST_GPR;
-  bits<1> DST_REL;
-  bits<3> DST_SEL_X;
-  bits<3> DST_SEL_Y;
-  bits<3> DST_SEL_Z;
-  bits<3> DST_SEL_W;
-  bits<1> USE_CONST_FIELDS;
-  bits<6> DATA_FORMAT;
-  bits<2> NUM_FORMAT_ALL;
-  bits<1> FORMAT_COMP_ALL;
-  bits<1> SRF_MODE_ALL;
-
-  let Word1{6-0} = DST_GPR;
-  let Word1{7}    = DST_REL;
-  let Word1{8}    = 0; // Reserved
-  let Word1{11-9} = DST_SEL_X;
-  let Word1{14-12} = DST_SEL_Y;
-  let Word1{17-15} = DST_SEL_Z;
-  let Word1{20-18} = DST_SEL_W;
-  let Word1{21}    = USE_CONST_FIELDS;
-  let Word1{27-22} = DATA_FORMAT;
-  let Word1{29-28} = NUM_FORMAT_ALL;
-  let Word1{30}    = FORMAT_COMP_ALL;
-  let Word1{31}    = SRF_MODE_ALL;
-}
-
-class TEX_WORD0 {
-  field bits<32> Word0;
-
-  bits<5> TEX_INST;
-  bits<2> INST_MOD;
-  bits<1> FETCH_WHOLE_QUAD;
-  bits<8> RESOURCE_ID;
-  bits<7> SRC_GPR;
-  bits<1> SRC_REL;
-  bits<1> ALT_CONST;
-  bits<2> RESOURCE_INDEX_MODE;
-  bits<2> SAMPLER_INDEX_MODE;
-
-  let Word0{4-0} = TEX_INST;
-  let Word0{6-5} = INST_MOD;
-  let Word0{7} = FETCH_WHOLE_QUAD;
-  let Word0{15-8} = RESOURCE_ID;
-  let Word0{22-16} = SRC_GPR;
-  let Word0{23} = SRC_REL;
-  let Word0{24} = ALT_CONST;
-  let Word0{26-25} = RESOURCE_INDEX_MODE;
-  let Word0{28-27} = SAMPLER_INDEX_MODE;
-}
-
-class TEX_WORD1 {
-  field bits<32> Word1;
-
-  bits<7> DST_GPR;
-  bits<1> DST_REL;
-  bits<3> DST_SEL_X;
-  bits<3> DST_SEL_Y;
-  bits<3> DST_SEL_Z;
-  bits<3> DST_SEL_W;
-  bits<7> LOD_BIAS;
-  bits<1> COORD_TYPE_X;
-  bits<1> COORD_TYPE_Y;
-  bits<1> COORD_TYPE_Z;
-  bits<1> COORD_TYPE_W;
-
-  let Word1{6-0} = DST_GPR;
-  let Word1{7} = DST_REL;
-  let Word1{11-9} = DST_SEL_X;
-  let Word1{14-12} = DST_SEL_Y;
-  let Word1{17-15} = DST_SEL_Z;
-  let Word1{20-18} = DST_SEL_W;
-  let Word1{27-21} = LOD_BIAS;
-  let Word1{28} = COORD_TYPE_X;
-  let Word1{29} = COORD_TYPE_Y;
-  let Word1{30} = COORD_TYPE_Z;
-  let Word1{31} = COORD_TYPE_W;
-}
-
-class TEX_WORD2 {
-  field bits<32> Word2;
-
-  bits<5> OFFSET_X;
-  bits<5> OFFSET_Y;
-  bits<5> OFFSET_Z;
-  bits<5> SAMPLER_ID;
-  bits<3> SRC_SEL_X;
-  bits<3> SRC_SEL_Y;
-  bits<3> SRC_SEL_Z;
-  bits<3> SRC_SEL_W;
-
-  let Word2{4-0} = OFFSET_X;
-  let Word2{9-5} = OFFSET_Y;
-  let Word2{14-10} = OFFSET_Z;
-  let Word2{19-15} = SAMPLER_ID;
-  let Word2{22-20} = SRC_SEL_X;
-  let Word2{25-23} = SRC_SEL_Y;
-  let Word2{28-26} = SRC_SEL_Z;
-  let Word2{31-29} = SRC_SEL_W;
-}
-
-/*
-XXX: R600 subtarget uses a slightly different encoding than the other
-subtargets.  We currently handle this in R600MCCodeEmitter, but we may
-want to use these instruction classes in the future.
-
-class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 {
-
-  bits<1>  fog_merge;
-  bits<10> alu_inst;
-
-  let Inst{37}    = fog_merge;
-  let Inst{39-38} = omod;
-  let Inst{49-40} = alu_inst;
-}
-
-class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 {
-
-  bits<11> alu_inst;
-
-  let Inst{38-37} = omod;
-  let Inst{49-39} = alu_inst;
-}
-*/
 
 def R600_Pred : PredicateOperand<i32, (ops R600_Predicate),
                                      (ops PRED_SEL_OFF)>;
@@ -358,7 +97,7 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
                    LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
                    BANK_SWIZZLE:$bank_swizzle),
               !strconcat("  ", opName,
-                   "$last$clamp $dst$write$dst_rel$omod, "
+                   "$clamp $last $dst$write$dst_rel$omod, "
                    "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
                    "$pred_sel $bank_swizzle"),
               pattern,
@@ -374,7 +113,9 @@ class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
   let update_pred = 0;
   let HasNativeOperands = 1;
   let Op1 = 1;
+  let ALUInst = 1;
   let DisableEncoding = "$literal";
+  let UseNamedOperandTable = 1;
 
   let Inst{31-0}  = Word0;
   let Inst{63-32} = Word1;
@@ -386,7 +127,7 @@ class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
               [(set R600_Reg32:$dst, (node R600_Reg32:$src0))]
 >;
 
-// If you add our change the operands for R600_2OP instructions, you must
+// If you add or change the operands for R600_2OP instructions, you must
 // also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
 // R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
 class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
@@ -399,7 +140,7 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
                LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
                BANK_SWIZZLE:$bank_swizzle),
           !strconcat("  ", opName,
-                "$last$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+                "$clamp $last $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
                 "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
                 "$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
                 "$pred_sel $bank_swizzle"),
@@ -410,7 +151,9 @@ class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
 
   let HasNativeOperands = 1;
   let Op2 = 1;
+  let ALUInst = 1;
   let DisableEncoding = "$literal";
+  let UseNamedOperandTable = 1;
 
   let Inst{31-0}  = Word0;
   let Inst{63-32} = Word1;
@@ -436,7 +179,7 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
                R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel, SEL:$src2_sel,
                LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal,
                BANK_SWIZZLE:$bank_swizzle),
-          !strconcat("  ", opName, "$last$clamp $dst$dst_rel, "
+          !strconcat("  ", opName, "$clamp $last $dst$dst_rel, "
                              "$src0_neg$src0$src0_rel, "
                              "$src1_neg$src1$src1_rel, "
                              "$src2_neg$src2$src2_rel, "
@@ -450,6 +193,8 @@ class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
   let HasNativeOperands = 1;
   let DisableEncoding = "$literal";
   let Op3 = 1;
+  let UseNamedOperandTable = 1;
+  let ALUInst = 1;
 
   let Inst{31-0}  = Word0;
   let Inst{63-32} = Word1;
@@ -463,38 +208,7 @@ class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
           pattern,
           itin>;
 
-class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
-                InstrItinClass itin = AnyALU> :
-  InstR600 <(outs R600_Reg128:$DST_GPR),
-          (ins R600_Reg128:$SRC_GPR, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID, i32imm:$textureTarget),
-          !strconcat(opName, "$DST_GPR, $SRC_GPR, $RESOURCE_ID, $SAMPLER_ID, $textureTarget"),
-          pattern,
-          itin>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
-    let Inst{31-0} = Word0;
-    let Inst{63-32} = Word1;
-
-    let TEX_INST = inst{4-0};
-    let SRC_REL = 0;
-    let DST_REL = 0;
-    let DST_SEL_X = 0;
-    let DST_SEL_Y = 1;
-    let DST_SEL_Z = 2;
-    let DST_SEL_W = 3;
-    let LOD_BIAS = 0;
-
-    let INST_MOD = 0;
-    let FETCH_WHOLE_QUAD = 0;
-    let ALT_CONST = 0;
-    let SAMPLER_INDEX_MODE = 0;
-    let RESOURCE_INDEX_MODE = 0;
-
-    let COORD_TYPE_X = 0;
-    let COORD_TYPE_Y = 0;
-    let COORD_TYPE_Z = 0;
-    let COORD_TYPE_W = 0;
-
-    let TEXInst = 1;
-  }
+
 
 } // End mayLoad = 1, mayStore = 0, hasSideEffects = 0
 
@@ -515,7 +229,7 @@ def TEX_RECT : PatLeaf<
 def TEX_ARRAY : PatLeaf<
   (imm),
   [{uint32_t TType = (uint32_t)N->getZExtValue();
-    return TType == 9 || TType == 10 || TType == 15 || TType == 16;
+    return TType == 9 || TType == 10 || TType == 16;
   }]
 >;
 
@@ -526,76 +240,115 @@ def TEX_SHADOW_ARRAY : PatLeaf<
   }]
 >;
 
-class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs,
-                 dag ins, string asm, list<dag> pattern> :
-    InstR600ISA <outs, ins, asm, pattern> {
-  bits<7>  RW_GPR;
-  bits<7>  INDEX_GPR;
-
-  bits<2>  RIM;
-  bits<2>  TYPE;
-  bits<1>  RW_REL;
-  bits<2>  ELEM_SIZE;
-
-  bits<12> ARRAY_SIZE;
-  bits<4>  COMP_MASK;
-  bits<4>  BURST_COUNT;
-  bits<1>  VPM;
-  bits<1>  eop;
-  bits<1>  MARK;
-  bits<1>  BARRIER;
-
-  // CF_ALLOC_EXPORT_WORD0_RAT
-  let Inst{3-0}   = rat_id;
-  let Inst{9-4}   = rat_inst;
-  let Inst{10}    = 0; // Reserved
-  let Inst{12-11} = RIM;
-  let Inst{14-13} = TYPE;
-  let Inst{21-15} = RW_GPR;
-  let Inst{22}    = RW_REL;
-  let Inst{29-23} = INDEX_GPR;
-  let Inst{31-30} = ELEM_SIZE;
-
-  // CF_ALLOC_EXPORT_WORD1_BUF
-  let Inst{43-32} = ARRAY_SIZE;
-  let Inst{47-44} = COMP_MASK;
-  let Inst{51-48} = BURST_COUNT;
-  let Inst{52}    = VPM;
-  let Inst{53}    = eop;
-  let Inst{61-54} = cf_inst;
-  let Inst{62}    = MARK;
-  let Inst{63}    = BARRIER;
+def TEX_MSAA : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 14;
+  }]
+>;
+
+def TEX_ARRAY_MSAA : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return TType == 15;
+  }]
+>;
+
+class EG_CF_RAT <bits <8> cfinst, bits <6> ratinst, bits<4> ratid, bits<4> mask,
+                 dag outs, dag ins, string asm, list<dag> pattern> :
+    InstR600ISA <outs, ins, asm, pattern>,
+    CF_ALLOC_EXPORT_WORD0_RAT, CF_ALLOC_EXPORT_WORD1_BUF  {
+
+  let rat_id = ratid;
+  let rat_inst = ratinst;
+  let rim         = 0;
+  // XXX: Have a separate instruction for non-indexed writes.
+  let type        = 1;
+  let rw_rel      = 0;
+  let elem_size   = 0;
+
+  let array_size  = 0;
+  let comp_mask   = mask;
+  let burst_count = 0;
+  let vpm         = 0;
+  let cf_inst = cfinst;
+  let mark        = 0;
+  let barrier     = 1;
+
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
+  let IsExport = 1;
+
+}
+
+class VTX_READ <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+    : InstR600ISA <outs, (ins MEMxi:$src_gpr), name, pattern>,
+      VTX_WORD1_GPR {
+
+  // Static fields
+  let DST_REL = 0;
+  // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
+  // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
+  // however, based on my testing if USE_CONST_FIELDS is set, then all
+  // these fields need to be set to 0.
+  let USE_CONST_FIELDS = 0;
+  let NUM_FORMAT_ALL = 1;
+  let FORMAT_COMP_ALL = 0;
+  let SRF_MODE_ALL = 0;
+
+  let Inst{63-32} = Word1;
+  // LLVM can only encode 64-bit instructions, so these fields are manually
+  // encoded in R600CodeEmitter
+  //
+  // bits<16> OFFSET;
+  // bits<2>  ENDIAN_SWAP = 0;
+  // bits<1>  CONST_BUF_NO_STRIDE = 0;
+  // bits<1>  MEGA_FETCH = 0;
+  // bits<1>  ALT_CONST = 0;
+  // bits<2>  BUFFER_INDEX_MODE = 0;
+
+  // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
+  // is done in R600CodeEmitter
+  //
+  // Inst{79-64} = OFFSET;
+  // Inst{81-80} = ENDIAN_SWAP;
+  // Inst{82}    = CONST_BUF_NO_STRIDE;
+  // Inst{83}    = MEGA_FETCH;
+  // Inst{84}    = ALT_CONST;
+  // Inst{86-85} = BUFFER_INDEX_MODE;
+  // Inst{95-86} = 0; Reserved
+
+  // VTX_WORD3 (Padding)
+  //
+  // Inst{127-96} = 0;
+
+  let VTXInst = 1;
 }
 
 class LoadParamFrag <PatFrag load_type> : PatFrag <
   (ops node:$ptr), (load_type node:$ptr),
-  [{ return isParamLoad(dyn_cast<LoadSDNode>(N)); }]
+  [{ return isConstantLoad(dyn_cast<LoadSDNode>(N), 0); }]
 >;
 
 def load_param : LoadParamFrag<load>;
-def load_param_zexti8 : LoadParamFrag<zextloadi8>;
-def load_param_zexti16 : LoadParamFrag<zextloadi16>;
-
-def isR600 : Predicate<"Subtarget.device()"
-                            "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">;
-def isR700 : Predicate<"Subtarget.device()"
-                            "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
-                            "Subtarget.device()->getDeviceFlag()"
-                            ">= OCL_DEVICE_RV710">;
+def load_param_exti8 : LoadParamFrag<az_extloadi8>;
+def load_param_exti16 : LoadParamFrag<az_extloadi16>;
+
+def isR600 : Predicate<"Subtarget.getGeneration() <= AMDGPUSubtarget::R700">;
+def isR700 : Predicate<"Subtarget.getGeneration() == AMDGPUSubtarget::R700">;
 def isEG : Predicate<
-  "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && "
-  "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && "
-  "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
+  "Subtarget.getGeneration() >= AMDGPUSubtarget::EVERGREEN && "
+  "Subtarget.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS && "
+  "!Subtarget.hasCaymanISA()">;
 
-def isCayman : Predicate<"Subtarget.device()"
-                            "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
-def isEGorCayman : Predicate<"Subtarget.device()"
-                            "->getGeneration() == AMDGPUDeviceInfo::HD5XXX"
-                            "|| Subtarget.device()->getGeneration() =="
-                            "AMDGPUDeviceInfo::HD6XXX">;
+def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
+def isEGorCayman : Predicate<"Subtarget.getGeneration() == "
+                             "AMDGPUSubtarget::EVERGREEN"
+                            "|| Subtarget.getGeneration() =="
+                            "AMDGPUSubtarget::NORTHERN_ISLANDS">;
 
 def isR600toCayman : Predicate<
-                     "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">;
+                     "Subtarget.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS">;
 
 //===----------------------------------------------------------------------===//
 // R600 SDNodes
@@ -603,13 +356,13 @@ def isR600toCayman : Predicate<
 
 def INTERP_PAIR_XY :  AMDGPUShaderInst <
   (outs R600_TReg32_X:$dst0, R600_TReg32_Y:$dst1),
-  (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
   "INTERP_PAIR_XY $src0 $src1 $src2 : $dst0 dst1",
   []>;
 
 def INTERP_PAIR_ZW :  AMDGPUShaderInst <
   (outs R600_TReg32_Z:$dst0, R600_TReg32_W:$dst1),
-  (ins i32imm:$src0, R600_Reg32:$src1, R600_Reg32:$src2),
+  (ins i32imm:$src0, R600_TReg32_Y:$src1, R600_TReg32_X:$src2),
   "INTERP_PAIR_ZW $src0 $src1 $src2 : $dst0 dst1",
   []>;
 
@@ -618,6 +371,44 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
   [SDNPVariadic]
 >;
 
+def DOT4 : SDNode<"AMDGPUISD::DOT4",
+  SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
+      SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
+      SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
+  []
+>;
+
+def COS_HW : SDNode<"AMDGPUISD::COS_HW",
+  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def SIN_HW : SDNode<"AMDGPUISD::SIN_HW",
+  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
+>;
+
+def TEXTURE_FETCH_Type : SDTypeProfile<1, 19, [SDTCisFP<0>]>;
+
+def TEXTURE_FETCH: SDNode<"AMDGPUISD::TEXTURE_FETCH", TEXTURE_FETCH_Type, []>;
+
+multiclass TexPattern<bits<32> TextureOp, Instruction inst, ValueType vt = v4f32> {
+def : Pat<(TEXTURE_FETCH (i32 TextureOp), vt:$SRC_GPR,
+          (i32 imm:$srcx), (i32 imm:$srcy), (i32 imm:$srcz), (i32 imm:$srcw),
+          (i32 imm:$offsetx), (i32 imm:$offsety), (i32 imm:$offsetz),
+          (i32 imm:$DST_SEL_X), (i32 imm:$DST_SEL_Y), (i32 imm:$DST_SEL_Z),
+          (i32 imm:$DST_SEL_W),
+          (i32 imm:$RESOURCE_ID), (i32 imm:$SAMPLER_ID),
+          (i32 imm:$COORD_TYPE_X), (i32 imm:$COORD_TYPE_Y), (i32 imm:$COORD_TYPE_Z),
+          (i32 imm:$COORD_TYPE_W)),
+          (inst R600_Reg128:$SRC_GPR,
+          imm:$srcx, imm:$srcy, imm:$srcz, imm:$srcw,
+          imm:$offsetx, imm:$offsety, imm:$offsetz,
+          imm:$DST_SEL_X, imm:$DST_SEL_Y, imm:$DST_SEL_Z,
+          imm:$DST_SEL_W,
+          imm:$RESOURCE_ID, imm:$SAMPLER_ID,
+          imm:$COORD_TYPE_X, imm:$COORD_TYPE_Y, imm:$COORD_TYPE_Z,
+          imm:$COORD_TYPE_W)>;
+}
+
 //===----------------------------------------------------------------------===//
 // Interpolation Instructions
 //===----------------------------------------------------------------------===//
@@ -626,7 +417,7 @@ def INTERP_VEC_LOAD :  AMDGPUShaderInst <
   (outs R600_Reg128:$dst),
   (ins i32imm:$src0),
   "INTERP_LOAD $src0 : $dst",
-  []>;
+  [(set R600_Reg128:$dst, (int_R600_interp_const imm:$src0))]>;
 
 def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
   let bank_swizzle = 5;
@@ -753,13 +544,14 @@ let usesCustomInserter = 1, isNotDuplicable = 1 in {
 class ExportSwzInst : InstR600ISA<(
     outs),
     (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase,
-    i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst,
+    RSel:$sw_x, RSel:$sw_y, RSel:$sw_z, RSel:$sw_w, i32imm:$inst,
     i32imm:$eop),
-    !strconcat("EXPORT", " $gpr"),
+    !strconcat("EXPORT", " $gpr.$sw_x$sw_y$sw_z$sw_w"),
     []>, ExportWord0, ExportSwzWord1 {
   let elem_size = 3;
   let Inst{31-0} = Word0;
   let Inst{63-32} = Word1;
+  let IsExport = 1;
 }
 
 } // End usesCustomInserter = 1
@@ -773,47 +565,13 @@ class ExportBufInst : InstR600ISA<(
   let elem_size = 0;
   let Inst{31-0} = Word0;
   let Inst{63-32} = Word1;
+  let IsExport = 1;
 }
 
 //===----------------------------------------------------------------------===//
 // Control Flow Instructions
 //===----------------------------------------------------------------------===//
 
-class CF_ALU_WORD0 {
-  field bits<32> Word0;
-
-  bits<22> ADDR;
-  bits<4> KCACHE_BANK0;
-  bits<4> KCACHE_BANK1;
-  bits<2> KCACHE_MODE0;
-
-  let Word0{21-0} = ADDR;
-  let Word0{25-22} = KCACHE_BANK0;
-  let Word0{29-26} = KCACHE_BANK1;
-  let Word0{31-30} = KCACHE_MODE0;
-}
-
-class CF_ALU_WORD1 {
-  field bits<32> Word1;
-
-  bits<2> KCACHE_MODE1;
-  bits<8> KCACHE_ADDR0;
-  bits<8> KCACHE_ADDR1;
-  bits<7> COUNT;
-  bits<1> ALT_CONST;
-  bits<4> CF_INST;
-  bits<1> WHOLE_QUAD_MODE;
-  bits<1> BARRIER;
-
-  let Word1{1-0} = KCACHE_MODE1;
-  let Word1{9-2} = KCACHE_ADDR0;
-  let Word1{17-10} = KCACHE_ADDR1;
-  let Word1{24-18} = COUNT;
-  let Word1{25} = ALT_CONST;
-  let Word1{29-26} = CF_INST;
-  let Word1{30} = WHOLE_QUAD_MODE;
-  let Word1{31} = BARRIER;
-}
 
 def KCACHE : InstFlag<"printKCache">;
 
@@ -821,7 +579,7 @@ class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
 (ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
 KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
 i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
-i32imm:$COUNT),
+i32imm:$COUNT, i32imm:$Enabled),
 !strconcat(OpName, " $COUNT, @$ADDR, "
 "KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
 [] >, CF_ALU_WORD0, CF_ALU_WORD1 {
@@ -831,6 +589,7 @@ i32imm:$COUNT),
   let ALT_CONST = 0;
   let WHOLE_QUAD_MODE = 0;
   let BARRIER = 1;
+  let UseNamedOperandTable = 1;
 
   let Inst{31-0} = Word0;
   let Inst{63-32} = Word1;
@@ -844,45 +603,19 @@ class CF_WORD0_R600 {
   let Word0 = ADDR;
 }
 
-class CF_WORD1_R600 {
-  field bits<32> Word1;
-
-  bits<3> POP_COUNT;
-  bits<5> CF_CONST;
-  bits<2> COND;
-  bits<3> COUNT;
-  bits<6> CALL_COUNT;
-  bits<1> COUNT_3;
-  bits<1> END_OF_PROGRAM;
-  bits<1> VALID_PIXEL_MODE;
-  bits<7> CF_INST;
-  bits<1> WHOLE_QUAD_MODE;
-  bits<1> BARRIER;
-
-  let Word1{2-0} = POP_COUNT;
-  let Word1{7-3} = CF_CONST;
-  let Word1{9-8} = COND;
-  let Word1{12-10} = COUNT;
-  let Word1{18-13} = CALL_COUNT;
-  let Word1{19} = COUNT_3;
-  let Word1{21} = END_OF_PROGRAM;
-  let Word1{22} = VALID_PIXEL_MODE;
-  let Word1{29-23} = CF_INST;
-  let Word1{30} = WHOLE_QUAD_MODE;
-  let Word1{31} = BARRIER;
-}
-
 class CF_CLAUSE_R600 <bits<7> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
 ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
   field bits<64> Inst;
+  bits<4> CNT;
 
   let CF_INST = inst;
   let BARRIER = 1;
   let CF_CONST = 0;
   let VALID_PIXEL_MODE = 0;
   let COND = 0;
+  let COUNT = CNT{2-0};
   let CALL_COUNT = 0;
-  let COUNT_3 = 0;
+  let COUNT_3 = CNT{3};
   let END_OF_PROGRAM = 0;
   let WHOLE_QUAD_MODE = 0;
 
@@ -890,38 +623,6 @@ ins, AsmPrint, [] >, CF_WORD0_R600, CF_WORD1_R600 {
   let Inst{63-32} = Word1;
 }
 
-class CF_WORD0_EG {
-  field bits<32> Word0;
-
-  bits<24> ADDR;
-  bits<3> JUMPTABLE_SEL;
-
-  let Word0{23-0} = ADDR;
-  let Word0{26-24} = JUMPTABLE_SEL;
-}
-
-class CF_WORD1_EG {
-  field bits<32> Word1;
-
-  bits<3> POP_COUNT;
-  bits<5> CF_CONST;
-  bits<2> COND;
-  bits<6> COUNT;
-  bits<1> VALID_PIXEL_MODE;
-  bits<1> END_OF_PROGRAM;
-  bits<8> CF_INST;
-  bits<1> BARRIER;
-
-  let Word1{2-0} = POP_COUNT;
-  let Word1{7-3} = CF_CONST;
-  let Word1{9-8} = COND;
-  let Word1{15-10} = COUNT;
-  let Word1{20} = VALID_PIXEL_MODE;
-  let Word1{21} = END_OF_PROGRAM;
-  let Word1{29-22} = CF_INST;
-  let Word1{31} = BARRIER;
-}
-
 class CF_CLAUSE_EG <bits<8> inst, dag ins, string AsmPrint> : AMDGPUInst <(outs),
 ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
   field bits<64> Inst;
@@ -940,6 +641,7 @@ ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
 
 def CF_ALU : ALU_CLAUSE<8, "ALU">;
 def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
+def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
 
 def FETCH_CLAUSE : AMDGPUInst <(outs),
 (ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
@@ -987,42 +689,42 @@ def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>;
 // XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics.
 def SETE : R600_2OP <
   0x08, "SETE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_EQ))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OEQ))]
 >;
 
 def SGT : R600_2OP <
   0x09, "SETGT",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GT))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGT))]
 >;
 
 def SGE : R600_2OP <
   0xA, "SETGE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_GE))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_OGE))]
 >;
 
 def SNE : R600_2OP <
   0xB, "SETNE",
-  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_NE))]
+  [(set f32:$dst, (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_UNE))]
 >;
 
 def SETE_DX10 : R600_2OP <
   0xC, "SETE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_EQ))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OEQ))]
 >;
 
 def SETGT_DX10 : R600_2OP <
   0xD, "SETGT_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GT))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGT))]
 >;
 
 def SETGE_DX10 : R600_2OP <
   0xE, "SETGE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_GE))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_OGE))]
 >;
 
 def SETNE_DX10 : R600_2OP <
   0xF, "SETNE_DX10",
-  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_NE))]
+  [(set i32:$dst, (selectcc f32:$src0, f32:$src1, -1, 0, COND_UNE))]
 >;
 
 def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
@@ -1120,104 +822,86 @@ def CNDE_INT : R600_3OP <
 
 def CNDGE_INT : R600_3OP <
   0x1E, "CNDGE_INT",
-  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GE))]
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGE))]
 >;
 
 def CNDGT_INT : R600_3OP <
   0x1D, "CNDGT_INT",
-  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_GT))]
+  [(set i32:$dst, (selectcc i32:$src0, 0, i32:$src1, i32:$src2, COND_SGT))]
 >;
 
 //===----------------------------------------------------------------------===//
 // Texture instructions
 //===----------------------------------------------------------------------===//
 
-def TEX_LD : R600_TEX <
-  0x03, "TEX_LD",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_txf v4f32:$SRC_GPR,
-      imm:$OFFSET_X, imm:$OFFSET_Y, imm:$OFFSET_Z, imm:$RESOURCE_ID,
-      imm:$SAMPLER_ID, imm:$textureTarget))]
-> {
-let AsmString = "TEX_LD $DST_GPR, $SRC_GPR, $OFFSET_X, $OFFSET_Y, $OFFSET_Z,"
-    "$RESOURCE_ID, $SAMPLER_ID, $textureTarget";
-let InOperandList = (ins R600_Reg128:$SRC_GPR, i32imm:$OFFSET_X,
-    i32imm:$OFFSET_Y, i32imm:$OFFSET_Z, i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
-    i32imm:$textureTarget);
-}
-
-def TEX_GET_TEXTURE_RESINFO : R600_TEX <
-  0x04, "TEX_GET_TEXTURE_RESINFO",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_txq v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
-
-def TEX_GET_GRADIENTS_H : R600_TEX <
-  0x07, "TEX_GET_GRADIENTS_H",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_ddx v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
-
-def TEX_GET_GRADIENTS_V : R600_TEX <
-  0x08, "TEX_GET_GRADIENTS_V",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_ddy v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
-
-def TEX_SET_GRADIENTS_H : R600_TEX <
-  0x0B, "TEX_SET_GRADIENTS_H",
-  []
->;
-
-def TEX_SET_GRADIENTS_V : R600_TEX <
-  0x0C, "TEX_SET_GRADIENTS_V",
-  []
->;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
 
-def TEX_SAMPLE : R600_TEX <
-  0x10, "TEX_SAMPLE",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
+class R600_TEX <bits<11> inst, string opName> :
+  InstR600 <(outs R600_Reg128:$DST_GPR),
+          (ins R600_Reg128:$SRC_GPR,
+          RSel:$srcx, RSel:$srcy, RSel:$srcz, RSel:$srcw,
+          i32imm:$offsetx, i32imm:$offsety, i32imm:$offsetz,
+          RSel:$DST_SEL_X, RSel:$DST_SEL_Y, RSel:$DST_SEL_Z, RSel:$DST_SEL_W,
+          i32imm:$RESOURCE_ID, i32imm:$SAMPLER_ID,
+          CT:$COORD_TYPE_X, CT:$COORD_TYPE_Y, CT:$COORD_TYPE_Z,
+          CT:$COORD_TYPE_W),
+          !strconcat(opName,
+          " $DST_GPR.$DST_SEL_X$DST_SEL_Y$DST_SEL_Z$DST_SEL_W, "
+          "$SRC_GPR.$srcx$srcy$srcz$srcw "
+          "RID:$RESOURCE_ID SID:$SAMPLER_ID "
+          "CT:$COORD_TYPE_X$COORD_TYPE_Y$COORD_TYPE_Z$COORD_TYPE_W"),
+          [],
+          NullALU>, TEX_WORD0, TEX_WORD1, TEX_WORD2 {
+  let Inst{31-0} = Word0;
+  let Inst{63-32} = Word1;
 
-def TEX_SAMPLE_C : R600_TEX <
-  0x18, "TEX_SAMPLE_C",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_tex v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
->;
+  let TEX_INST = inst{4-0};
+  let SRC_REL = 0;
+  let DST_REL = 0;
+  let LOD_BIAS = 0;
 
-def TEX_SAMPLE_L : R600_TEX <
-  0x11, "TEX_SAMPLE_L",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
+  let INST_MOD = 0;
+  let FETCH_WHOLE_QUAD = 0;
+  let ALT_CONST = 0;
+  let SAMPLER_INDEX_MODE = 0;
+  let RESOURCE_INDEX_MODE = 0;
 
-def TEX_SAMPLE_C_L : R600_TEX <
-  0x19, "TEX_SAMPLE_C_L",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_txl v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
->;
+  let TEXInst = 1;
+}
 
-def TEX_SAMPLE_LB : R600_TEX <
-  0x12, "TEX_SAMPLE_LB",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, imm:$textureTarget))]
->;
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
 
-def TEX_SAMPLE_C_LB : R600_TEX <
-  0x1A, "TEX_SAMPLE_C_LB",
-  [(set v4f32:$DST_GPR, (int_AMDGPU_txb v4f32:$SRC_GPR,
-      imm:$RESOURCE_ID, imm:$SAMPLER_ID, TEX_SHADOW:$textureTarget))]
->;
 
-def TEX_SAMPLE_G : R600_TEX <
-  0x14, "TEX_SAMPLE_G",
-  []
->;
 
-def TEX_SAMPLE_C_G : R600_TEX <
-  0x1C, "TEX_SAMPLE_C_G",
-  []
->;
+def TEX_SAMPLE : R600_TEX <0x10, "TEX_SAMPLE">;
+def TEX_SAMPLE_C : R600_TEX <0x18, "TEX_SAMPLE_C">;
+def TEX_SAMPLE_L : R600_TEX <0x11, "TEX_SAMPLE_L">;
+def TEX_SAMPLE_C_L : R600_TEX <0x19, "TEX_SAMPLE_C_L">;
+def TEX_SAMPLE_LB : R600_TEX <0x12, "TEX_SAMPLE_LB">;
+def TEX_SAMPLE_C_LB : R600_TEX <0x1A, "TEX_SAMPLE_C_LB">;
+def TEX_LD : R600_TEX <0x03, "TEX_LD">;
+def TEX_LDPTR : R600_TEX <0x03, "TEX_LDPTR"> {
+  let INST_MOD = 1;
+}
+def TEX_GET_TEXTURE_RESINFO : R600_TEX <0x04, "TEX_GET_TEXTURE_RESINFO">;
+def TEX_GET_GRADIENTS_H : R600_TEX <0x07, "TEX_GET_GRADIENTS_H">;
+def TEX_GET_GRADIENTS_V : R600_TEX <0x08, "TEX_GET_GRADIENTS_V">;
+def TEX_SET_GRADIENTS_H : R600_TEX <0x0B, "TEX_SET_GRADIENTS_H">;
+def TEX_SET_GRADIENTS_V : R600_TEX <0x0C, "TEX_SET_GRADIENTS_V">;
+def TEX_SAMPLE_G : R600_TEX <0x14, "TEX_SAMPLE_G">;
+def TEX_SAMPLE_C_G : R600_TEX <0x1C, "TEX_SAMPLE_C_G">;
+
+defm : TexPattern<0, TEX_SAMPLE>;
+defm : TexPattern<1, TEX_SAMPLE_C>;
+defm : TexPattern<2, TEX_SAMPLE_L>;
+defm : TexPattern<3, TEX_SAMPLE_C_L>;
+defm : TexPattern<4, TEX_SAMPLE_LB>;
+defm : TexPattern<5, TEX_SAMPLE_C_LB>;
+defm : TexPattern<6, TEX_LD, v4i32>;
+defm : TexPattern<7, TEX_GET_TEXTURE_RESINFO, v4i32>;
+defm : TexPattern<8, TEX_GET_GRADIENTS_H>;
+defm : TexPattern<9, TEX_GET_GRADIENTS_V>;
+defm : TexPattern<10, TEX_LDPTR, v4i32>;
 
 //===----------------------------------------------------------------------===//
 // Helper classes for common instructions
@@ -1240,41 +924,82 @@ class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
 
 class CNDE_Common <bits<5> inst> : R600_3OP <
   inst, "CNDE",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_EQ))]
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OEQ))]
 >;
 
 class CNDGT_Common <bits<5> inst> : R600_3OP <
   inst, "CNDGT",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GT))]
->;
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGT))]
+> {
+  let Itinerary = VecALU;
+}
 
 class CNDGE_Common <bits<5> inst> : R600_3OP <
   inst, "CNDGE",
-  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
->;
-
-multiclass DOT4_Common <bits<11> inst> {
+  [(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_OGE))]
+> {
+  let Itinerary = VecALU;
+}
+
+
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU"  in {
+class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
+// Slot X
+   UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
+   OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
+   R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
+   R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
+   R600_Pred:$pred_sel_X,
+// Slot Y
+   UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
+   OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
+   R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
+   R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
+   R600_Pred:$pred_sel_Y,
+// Slot Z
+   UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
+   OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
+   R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
+   R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
+   R600_Pred:$pred_sel_Z,
+// Slot W
+   UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
+   OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
+   R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
+   R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
+   R600_Pred:$pred_sel_W,
+   LITERAL:$literal0, LITERAL:$literal1),
+  "",
+  pattern,
+  AnyALU> {
 
-  def _pseudo : R600_REDUCTION <inst,
-    (ins R600_Reg128:$src0, R600_Reg128:$src1),
-    "DOT4 $dst $src0, $src1",
-    [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))]
-  >;
+  let UseNamedOperandTable = 1;
 
-  def _real : R600_2OP <inst, "DOT4", []>;
 }
+}
+
+def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
+  R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
+  R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
+  R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
+  R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
+
+
+class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>;
+
 
 let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
 multiclass CUBE_Common <bits<11> inst> {
 
   def _pseudo : InstR600 <
     (outs R600_Reg128:$dst),
-    (ins R600_Reg128:$src),
-    "CUBE $dst $src",
-    [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src))],
+    (ins R600_Reg128:$src0),
+    "CUBE $dst $src0",
+    [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
     VecALU
   > {
     let isPseudo = 1;
+    let UseNamedOperandTable = 1;
   }
 
   def _real : R600_2OP <inst, "CUBE", []>;
@@ -1284,35 +1009,30 @@ multiclass CUBE_Common <bits<11> inst> {
 class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "EXP_IEEE", fexp2
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "FLT_TO_INT", fp_to_sint
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "INT_TO_FLT", sint_to_fp
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "FLT_TO_UINT", fp_to_uint
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "UINT_TO_FLT", uint_to_fp
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
@@ -1323,7 +1043,6 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
 class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "LOG_IEEE", flog2
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
@@ -1333,75 +1052,68 @@ class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>;
 class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper <
   inst, "MULHI_INT", mulhs
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper <
   inst, "MULHI", mulhu
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper <
   inst, "MULLO_INT", mul
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []> {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP <
   inst, "RECIP_CLAMPED", []
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIP_IEEE_Common <bits<11> inst> : R600_1OP <
   inst, "RECIP_IEEE", [(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "RECIP_UINT", AMDGPUurecip
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
   inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
   inst, "RECIPSQRT_IEEE", []
 > {
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class SIN_Common <bits<11> inst> : R600_1OP <
-  inst, "SIN", []>{
+  inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
   let Trig = 1;
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
 class COS_Common <bits<11> inst> : R600_1OP <
-  inst, "COS", []> {
+  inst, "COS", [(set f32:$dst, (COS_HW f32:$src0))]> {
   let Trig = 1;
-  let TransOnly = 1;
   let Itinerary = TransALU;
 }
 
+def CLAMP_R600 :  CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
 //===----------------------------------------------------------------------===//
 // Helper patterns for complex intrinsics
 //===----------------------------------------------------------------------===//
@@ -1424,6 +1136,13 @@ class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ie
   (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x))
 >;
 
+// FROUND pattern
+class FROUNDPat<Instruction CNDGE> : Pat <
+  (AMDGPUround f32:$x),
+  (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x))
+>;
+
+
 //===----------------------------------------------------------------------===//
 // R600 / R700 Instructions
 //===----------------------------------------------------------------------===//
@@ -1436,7 +1155,7 @@ let Predicates = [isR600] in {
   def CNDE_r600 : CNDE_Common<0x18>;
   def CNDGT_r600 : CNDGT_Common<0x19>;
   def CNDGE_r600 : CNDGE_Common<0x1A>;
-  defm DOT4_r600 : DOT4_Common<0x50>;
+  def DOT4_r600 : DOT4_Common<0x50>;
   defm CUBE_r600 : CUBE_Common<0x52>;
   def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
   def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
@@ -1465,11 +1184,12 @@ let Predicates = [isR600] in {
   def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
 
   def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
+  def : FROUNDPat <CNDGE_r600>;
 
   def R600_ExportSwz : ExportSwzInst {
     let Word1{20-17} = 0; // BURST_COUNT
     let Word1{21} = eop;
-    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{22} = 0; // VALID_PIXEL_MODE
     let Word1{30-23} = inst;
     let Word1{31} = 1; // BARRIER
   }
@@ -1478,58 +1198,58 @@ let Predicates = [isR600] in {
   def R600_ExportBuf : ExportBufInst {
     let Word1{20-17} = 0; // BURST_COUNT
     let Word1{21} = eop;
-    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{22} = 0; // VALID_PIXEL_MODE
     let Word1{30-23} = inst;
     let Word1{31} = 1; // BARRIER
   }
   defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
 
-  def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$COUNT),
-  "TEX $COUNT @$ADDR"> {
+  def CF_TC_R600 : CF_CLAUSE_R600<1, (ins i32imm:$ADDR, i32imm:$CNT),
+  "TEX $CNT @$ADDR"> {
     let POP_COUNT = 0;
   }
-  def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$COUNT),
-  "VTX $COUNT @$ADDR"> {
+  def CF_VC_R600 : CF_CLAUSE_R600<2, (ins i32imm:$ADDR, i32imm:$CNT),
+  "VTX $CNT @$ADDR"> {
     let POP_COUNT = 0;
   }
   def WHILE_LOOP_R600 : CF_CLAUSE_R600<6, (ins i32imm:$ADDR),
   "LOOP_START_DX10 @$ADDR"> {
     let POP_COUNT = 0;
-    let COUNT = 0;
+    let CNT = 0;
   }
   def END_LOOP_R600 : CF_CLAUSE_R600<5, (ins i32imm:$ADDR), "END_LOOP @$ADDR"> {
     let POP_COUNT = 0;
-    let COUNT = 0;
+    let CNT = 0;
   }
   def LOOP_BREAK_R600 : CF_CLAUSE_R600<9, (ins i32imm:$ADDR),
   "LOOP_BREAK @$ADDR"> {
     let POP_COUNT = 0;
-    let COUNT = 0;
+    let CNT = 0;
   }
   def CF_CONTINUE_R600 : CF_CLAUSE_R600<8, (ins i32imm:$ADDR),
   "CONTINUE @$ADDR"> {
     let POP_COUNT = 0;
-    let COUNT = 0;
+    let CNT = 0;
   }
   def CF_JUMP_R600 : CF_CLAUSE_R600<10, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
   "JUMP @$ADDR POP:$POP_COUNT"> {
-    let COUNT = 0;
+    let CNT = 0;
   }
   def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
   "ELSE @$ADDR POP:$POP_COUNT"> {
-    let COUNT = 0;
+    let CNT = 0;
   }
   def CF_CALL_FS_R600 : CF_CLAUSE_R600<19, (ins), "CALL_FS"> {
     let ADDR = 0;
-    let COUNT = 0;
+    let CNT = 0;
     let POP_COUNT = 0;
   }
   def POP_R600 : CF_CLAUSE_R600<14, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
   "POP @$ADDR POP:$POP_COUNT"> {
-    let COUNT = 0;
+    let CNT = 0;
   }
   def CF_END_R600 : CF_CLAUSE_R600<0, (ins), "CF_END"> {
-    let COUNT = 0;
+    let CNT = 0;
     let POP_COUNT = 0;
     let ADDR = 0;
     let END_OF_PROGRAM = 1;
@@ -1537,18 +1257,6 @@ let Predicates = [isR600] in {
 
 }
 
-// Helper pattern for normalizing inputs to triginomic instructions for R700+
-// cards.
-class COS_PAT <InstR600 trig> : Pat<
-  (fcos f32:$src),
-  (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src))
->;
-
-class SIN_PAT <InstR600 trig> : Pat<
-  (fsin f32:$src),
-  (trig (MUL_IEEE (MOV_IMM_I32 CONST.TWO_PI_INV), $src))
->;
-
 //===----------------------------------------------------------------------===//
 // R700 Only instructions
 //===----------------------------------------------------------------------===//
@@ -1556,12 +1264,35 @@ class SIN_PAT <InstR600 trig> : Pat<
 let Predicates = [isR700] in {
   def SIN_r700 : SIN_Common<0x6E>;
   def COS_r700 : COS_Common<0x6F>;
+}
+
+//===----------------------------------------------------------------------===//
+// Evergreen / Cayman store instructions
+//===----------------------------------------------------------------------===//
 
-  // R700 normalizes inputs to SIN/COS the same as EG
-  def : SIN_PAT <SIN_r700>;
-  def : COS_PAT <COS_r700>;
+let Predicates = [isEGorCayman] in {
+
+class CF_MEM_RAT_CACHELESS <bits<6> rat_inst, bits<4> rat_id, bits<4> mask, dag ins,
+                           string name, list<dag> pattern>
+    : EG_CF_RAT <0x57, rat_inst, rat_id, mask, (outs), ins,
+                 "MEM_RAT_CACHELESS "#name, pattern>;
+
+class CF_MEM_RAT <bits<6> rat_inst, bits<4> rat_id, dag ins, string name,
+                  list<dag> pattern>
+    : EG_CF_RAT <0x56, rat_inst, rat_id, 0xf /* mask */, (outs), ins,
+                 "MEM_RAT "#name, pattern>;
+
+def RAT_MSKOR : CF_MEM_RAT <0x11, 0,
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
+  "MSKOR $rw_gpr.XW, $index_gpr",
+  [(mskor_global v4i32:$rw_gpr, i32:$index_gpr)]
+> {
+  let eop = 0;
 }
 
+} // End Predicates = [isEGorCayman]
+
+
 //===----------------------------------------------------------------------===//
 // Evergreen Only instructions
 //===----------------------------------------------------------------------===//
@@ -1585,9 +1316,179 @@ def SIN_eg : SIN_Common<0x8D>;
 def COS_eg : COS_Common<0x8E>;
 
 def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
-def : SIN_PAT <SIN_eg>;
-def : COS_PAT <COS_eg>;
 def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
+
+//===----------------------------------------------------------------------===//
+// Memory read/write instructions
+//===----------------------------------------------------------------------===//
+
+let usesCustomInserter = 1 in {
+
+// 32-bit store
+def RAT_WRITE_CACHELESS_32_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x1,
+  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr, $index_gpr, $eop",
+  [(global_store i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+// 64-bit store
+def RAT_WRITE_CACHELESS_64_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0x3,
+  (ins R600_Reg64:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr.XY, $index_gpr, $eop",
+  [(global_store v2i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+//128-bit store
+def RAT_WRITE_CACHELESS_128_eg : CF_MEM_RAT_CACHELESS <0x2, 0, 0xf,
+  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
+  "STORE_RAW $rw_gpr.XYZW, $index_gpr, $eop",
+  [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
+>;
+
+} // End usesCustomInserter = 1
+
+class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+    : VTX_WORD0_eg, VTX_READ<name, buffer_id, outs, pattern> {
+
+  // Static fields
+  let VC_INST = 0;
+  let FETCH_TYPE = 2;
+  let FETCH_WHOLE_QUAD = 0;
+  let BUFFER_ID = buffer_id;
+  let SRC_REL = 0;
+  // XXX: We can infer this field based on the SRC_GPR.  This would allow us
+  // to store vertex addresses in any channel, not just X.
+  let SRC_SEL_X = 0;
+
+  let Inst{31-0} = Word0;
+}
+
+class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 1;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 1; // FMT_8
+}
+
+class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+  let MEGA_FETCH_COUNT = 2;
+  let DST_SEL_X = 0;
+  let DST_SEL_Y = 7;   // Masked
+  let DST_SEL_Z = 7;   // Masked
+  let DST_SEL_W = 7;   // Masked
+  let DATA_FORMAT = 5; // FMT_16
+
+}
+
+class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 4;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 7;   // Masked
+  let DST_SEL_Z        = 7;   // Masked
+  let DST_SEL_W        = 7;   // Masked
+  let DATA_FORMAT      = 0xD; // COLOR_32
+
+  // This is not really necessary, but there were some GPU hangs that appeared
+  // to be caused by ALU instructions in the next instruction group that wrote
+  // to the $src_gpr registers of the VTX_READ.
+  // e.g.
+  // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
+  // %T2_X<def> = MOV %ZERO
+  //Adding this constraint prevents this from happening.
+  let Constraints = "$src_gpr.ptr = $dst_gpr";
+}
+
+class VTX_READ_64_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_64 $dst_gpr.XY, $src_gpr", buffer_id,
+                   (outs R600_Reg64:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 8;
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 7;
+  let DST_SEL_W        = 7;
+  let DATA_FORMAT      = 0x1D; // COLOR_32_32
+}
+
+class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_eg <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
+                   (outs R600_Reg128:$dst_gpr), pattern> {
+
+  let MEGA_FETCH_COUNT = 16;
+  let DST_SEL_X        =  0;
+  let DST_SEL_Y        =  1;
+  let DST_SEL_Z        =  2;
+  let DST_SEL_W        =  3;
+  let DATA_FORMAT      =  0x22; // COLOR_32_32_32_32
+
+  // XXX: Need to force VTX_READ_128 instructions to write to the same register
+  // that holds its buffer address to avoid potential hangs.  We can't use
+  // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
+  // registers are different sizes.
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
+  [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
+  [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
+  [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_64_eg : VTX_READ_64_eg <0,
+  [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
+  [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+
+// 8-bit reads
+def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
+  [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
+>;
+
+def VTX_READ_GLOBAL_16_eg : VTX_READ_16_eg <1,
+  [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
+>;
+
+// 32-bit reads
+def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
+  [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 64-bit reads
+def VTX_READ_GLOBAL_64_eg : VTX_READ_64_eg <1,
+  [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
+// 128-bit reads
+def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
+  [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
+>;
+
 } // End Predicates = [isEG]
 
 //===----------------------------------------------------------------------===//
@@ -1620,10 +1521,11 @@ let Predicates = [isEGorCayman] in {
   def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
   defm : BFIPatterns <BFI_INT_eg>;
 
-  def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT",
-    [(set i32:$dst, (AMDGPUbitalign i32:$src0, i32:$src1, i32:$src2))],
-    VecALU
+  def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
+    [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU
   >;
+  def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+  def : ROTRPattern <BIT_ALIGN_INT_eg>;
 
   def MULADD_eg : MULADD_Common<0x14>;
   def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
@@ -1635,7 +1537,10 @@ let Predicates = [isEGorCayman] in {
   def CNDGE_eg : CNDGE_Common<0x1B>;
   def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
   def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
-  defm DOT4_eg : DOT4_Common<0xBE>;
+  def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
+    [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU
+  >;
+  def DOT4_eg : DOT4_Common<0xBE>;
   defm CUBE_eg : CUBE_Common<0xC0>;
 
 let hasSideEffects = 1 in {
@@ -1646,6 +1551,7 @@ let hasSideEffects = 1 in {
 
   def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> {
     let Pattern = [];
+    let Itinerary = AnyALU;
   }
 
   def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>;
@@ -1656,6 +1562,165 @@ let hasSideEffects = 1 in {
 
   def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
 
+def GROUP_BARRIER : InstR600 <
+    (outs), (ins), "  GROUP_BARRIER", [(int_AMDGPU_barrier_local)], AnyALU>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <0x54> {
+
+  let dst = 0;
+  let dst_rel = 0;
+  let src0 = 0;
+  let src0_rel = 0;
+  let src0_neg = 0;
+  let src0_abs = 0;
+  let src1 = 0;
+  let src1_rel = 0;
+  let src1_neg = 0;
+  let src1_abs = 0;
+  let write = 0;
+  let omod = 0;
+  let clamp = 0;
+  let last = 1;
+  let bank_swizzle = 0;
+  let pred_sel = 0;
+  let update_exec_mask = 0;
+  let update_pred = 0;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+
+  let ALUInst = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// LDS Instructions
+//===----------------------------------------------------------------------===//
+class R600_LDS  <bits<6> op, dag outs, dag ins, string asm,
+                 list<dag> pattern = []> :
+
+    InstR600 <outs, ins, asm, pattern, XALU>,
+    R600_ALU_LDS_Word0,
+    R600LDS_Word1 {
+
+  bits<6>  offset = 0;
+  let lds_op = op;
+
+  let Word1{27} = offset{0};
+  let Word1{12} = offset{1};
+  let Word1{28} = offset{2};
+  let Word1{31} = offset{3};
+  let Word0{12} = offset{4};
+  let Word0{25} = offset{5};
+
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+
+  let ALUInst = 1;
+  let HasNativeOperands = 1;
+  let UseNamedOperandTable = 1;
+}
+
+class R600_LDS_1A <bits<6> lds_op, string name, list<dag> pattern> : R600_LDS <
+  lds_op,
+  (outs R600_Reg32:$dst),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last OQAP, $src0$src0_rel $pred_sel",
+  pattern
+  > {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src2 = 0;
+  let src2_rel = 0;
+
+  let Defs = [OQAP];
+  let usesCustomInserter = 1;
+  let LDS_1A = 1;
+  let DisableEncoding = "$dst";
+}
+
+class R600_LDS_1A1D <bits<6> lds_op, dag outs, string name, list<dag> pattern,
+                     string dst =""> :
+    R600_LDS <
+  lds_op, outs,
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       LAST:$last, R600_Pred:$pred_sel,
+       BANK_SWIZZLE:$bank_swizzle),
+  "  "#name#" $last "#dst#"$src0$src0_rel, $src1$src1_rel, $pred_sel",
+  pattern
+  > {
+
+  field string BaseOp;
+
+  let src2 = 0;
+  let src2_rel = 0;
+  let LDS_1A1D = 1;
+}
+
+class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op, (outs), name, pattern> {
+  let BaseOp = name;
+}
+
+class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS_1A1D <lds_op,  (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+
+  let BaseOp = name;
+  let usesCustomInserter = 1;
+  let DisableEncoding = "$dst";
+  let Defs = [OQAP];
+}
+
+class R600_LDS_1A2D <bits<6> lds_op, string name, list<dag> pattern> :
+    R600_LDS <
+  lds_op,
+  (outs),
+  (ins R600_Reg32:$src0, REL:$src0_rel, SEL:$src0_sel,
+       R600_Reg32:$src1, REL:$src1_rel, SEL:$src1_sel,
+       R600_Reg32:$src2, REL:$src2_rel, SEL:$src2_sel,
+       LAST:$last, R600_Pred:$pred_sel, BANK_SWIZZLE:$bank_swizzle),
+  "  "#name# "$last $src0$src0_rel, $src1$src1_rel, $src2$src2_rel, $pred_sel",
+  pattern> {
+  let LDS_1A2D = 1;
+}
+
+def LDS_ADD : R600_LDS_1A1D_NORET <0x0, "LDS_ADD", [] >;
+def LDS_SUB : R600_LDS_1A1D_NORET <0x1, "LDS_SUB", [] >;
+def LDS_WRITE : R600_LDS_1A1D_NORET <0xD, "LDS_WRITE",
+  [(local_store (i32 R600_Reg32:$src1), R600_Reg32:$src0)]
+>;
+def LDS_BYTE_WRITE : R600_LDS_1A1D_NORET<0x12, "LDS_BYTE_WRITE",
+  [(truncstorei8_local i32:$src1, i32:$src0)]
+>;
+def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
+  [(truncstorei16_local i32:$src1, i32:$src0)]
+>;
+def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
+  [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+>;
+def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
+  [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+>;
+def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
+  [(set (i32 R600_Reg32:$dst), (local_load R600_Reg32:$src0))]
+>;
+def LDS_BYTE_READ_RET : R600_LDS_1A <0x36, "LDS_BYTE_READ_RET",
+  [(set i32:$dst, (sextloadi8_local i32:$src0))]
+>;
+def LDS_UBYTE_READ_RET : R600_LDS_1A <0x37, "LDS_UBYTE_READ_RET",
+  [(set i32:$dst, (az_extloadi8_local i32:$src0))]
+>;
+def LDS_SHORT_READ_RET : R600_LDS_1A <0x38, "LDS_SHORT_READ_RET",
+  [(set i32:$dst, (sextloadi16_local i32:$src0))]
+>;
+def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
+  [(set i32:$dst, (az_extloadi16_local i32:$src0))]
+>;
+
   // TRUNC is used for the FLT_TO_INT instructions to work around a
   // perceived problem where the rounding modes are applied differently
   // depending on the instruction and the slot they are in.
@@ -1673,9 +1738,11 @@ let hasSideEffects = 1 in {
   // SHA-256 Patterns
   def : SHA256MaPattern <BFI_INT_eg, XOR_INT>;
 
+  def : FROUNDPat <CNDGE_eg>;
+
   def EG_ExportSwz : ExportSwzInst {
     let Word1{19-16} = 0; // BURST_COUNT
-    let Word1{20} = 1; // VALID_PIXEL_MODE
+    let Word1{20} = 0; // VALID_PIXEL_MODE
     let Word1{21} = eop;
     let Word1{29-22} = inst;
     let Word1{30} = 0; // MARK
@@ -1685,7 +1752,7 @@ let hasSideEffects = 1 in {
 
   def EG_ExportBuf : ExportBufInst {
     let Word1{19-16} = 0; // BURST_COUNT
-    let Word1{20} = 1; // VALID_PIXEL_MODE
+    let Word1{20} = 0; // VALID_PIXEL_MODE
     let Word1{21} = eop;
     let Word1{29-22} = inst;
     let Word1{30} = 0; // MARK
@@ -1744,48 +1811,78 @@ let hasSideEffects = 1 in {
     let END_OF_PROGRAM = 1;
   }
 
+} // End Predicates = [isEGorCayman]
+
 //===----------------------------------------------------------------------===//
-// Memory read/write instructions
+// Regist loads and stores - for indirect addressing
 //===----------------------------------------------------------------------===//
-let usesCustomInserter = 1 in {
 
-class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name,
-                              list<dag> pattern>
-    : EG_CF_RAT <0x57, 0x2, 0, (outs), ins,
-                 !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> {
-  let RIM         = 0;
-  // XXX: Have a separate instruction for non-indexed writes.
-  let TYPE        = 1;
-  let RW_REL      = 0;
-  let ELEM_SIZE   = 0;
+defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
 
-  let ARRAY_SIZE  = 0;
-  let COMP_MASK   = comp_mask;
-  let BURST_COUNT = 0;
-  let VPM         = 0;
-  let MARK        = 0;
-  let BARRIER     = 1;
-}
+//===----------------------------------------------------------------------===//
+// Cayman Instructions
+//===----------------------------------------------------------------------===//
 
-} // End usesCustomInserter = 1
+let Predicates = [isCayman] in {
 
-// 32-bit store
-def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg <
-  (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
-  0x1, "RAT_WRITE_CACHELESS_32_eg",
-  [(global_store i32:$rw_gpr, i32:$index_gpr)]
+def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
+  [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU
+>;
+def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
+  [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU
 >;
 
-//128-bit store
-def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg <
-  (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop),
-  0xf, "RAT_WRITE_CACHELESS_128",
-  [(global_store v4i32:$rw_gpr, i32:$index_gpr)]
+let isVector = 1 in {
+
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
+
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_cm : SIN_Common<0x8D>;
+def COS_cm : COS_Common<0x8E>;
+} // End isVector = 1
+
+def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
+
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+
+// RECIP_UINT emulation for Cayman
+// The multiplication scales from [0,1] to the unsigned integer range
+def : Pat <
+  (AMDGPUurecip i32:$src0),
+  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
+                            (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
 >;
 
-class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
-    : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern>,
-      VTX_WORD1_GPR, VTX_WORD0 {
+  def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
+    let ADDR = 0;
+    let POP_COUNT = 0;
+    let COUNT = 0;
+  }
+
+def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
+
+class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
+  CF_MEM_RAT_CACHELESS <0x14, 0, mask,
+                        (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
+                        "STORE_DWORD $rw_gpr, $index_gpr",
+                        [(global_store vt:$rw_gpr, i32:$index_gpr)]> {
+  let eop = 0; // This bit is not used on Cayman.
+}
+
+def RAT_STORE_DWORD32 : RAT_STORE_DWORD <R600_TReg32_X, i32, 0x1>;
+def RAT_STORE_DWORD64 : RAT_STORE_DWORD <R600_Reg64, v2i32, 0x3>;
+def RAT_STORE_DWORD128 : RAT_STORE_DWORD <R600_Reg128, v4i32, 0xf>;
+
+class VTX_READ_cm <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
+    : VTX_WORD0_cm, VTX_READ<name, buffer_id, outs, pattern> {
 
   // Static fields
   let VC_INST = 0;
@@ -1796,53 +1893,18 @@ class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern>
   // XXX: We can infer this field based on the SRC_GPR.  This would allow us
   // to store vertex addresses in any channel, not just X.
   let SRC_SEL_X = 0;
-  let DST_REL = 0;
-  // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL,
-  // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored,
-  // however, based on my testing if USE_CONST_FIELDS is set, then all
-  // these fields need to be set to 0.
-  let USE_CONST_FIELDS = 0;
-  let NUM_FORMAT_ALL = 1;
-  let FORMAT_COMP_ALL = 0;
-  let SRF_MODE_ALL = 0;
+  let SRC_SEL_Y = 0;
+  let STRUCTURED_READ = 0;
+  let LDS_REQ = 0;
+  let COALESCED_READ = 0;
 
   let Inst{31-0} = Word0;
-  let Inst{63-32} = Word1;
-  // LLVM can only encode 64-bit instructions, so these fields are manually
-  // encoded in R600CodeEmitter
-  //
-  // bits<16> OFFSET;
-  // bits<2>  ENDIAN_SWAP = 0;
-  // bits<1>  CONST_BUF_NO_STRIDE = 0;
-  // bits<1>  MEGA_FETCH = 0;
-  // bits<1>  ALT_CONST = 0;
-  // bits<2>  BUFFER_INDEX_MODE = 0;
-
-
-
-  // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding
-  // is done in R600CodeEmitter
-  //
-  // Inst{79-64} = OFFSET;
-  // Inst{81-80} = ENDIAN_SWAP;
-  // Inst{82}    = CONST_BUF_NO_STRIDE;
-  // Inst{83}    = MEGA_FETCH;
-  // Inst{84}    = ALT_CONST;
-  // Inst{86-85} = BUFFER_INDEX_MODE;
-  // Inst{95-86} = 0; Reserved
-
-  // VTX_WORD3 (Padding)
-  //
-  // Inst{127-96} = 0;
-
-  let VTXInst = 1;
 }
 
-class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst),
-                   pattern> {
+class VTX_READ_8_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_8 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
 
-  let MEGA_FETCH_COUNT = 1;
   let DST_SEL_X = 0;
   let DST_SEL_Y = 7;   // Masked
   let DST_SEL_Z = 7;   // Masked
@@ -1850,10 +1912,9 @@ class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern>
   let DATA_FORMAT = 1; // FMT_8
 }
 
-class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst),
-                    pattern> {
-  let MEGA_FETCH_COUNT = 2;
+class VTX_READ_16_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_16 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
   let DST_SEL_X = 0;
   let DST_SEL_Y = 7;   // Masked
   let DST_SEL_Z = 7;   // Masked
@@ -1862,11 +1923,10 @@ class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern>
 
 }
 
-class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst),
-                   pattern> {
+class VTX_READ_32_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_32 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_TReg32_X:$dst_gpr), pattern> {
 
-  let MEGA_FETCH_COUNT = 4;
   let DST_SEL_X        = 0;
   let DST_SEL_Y        = 7;   // Masked
   let DST_SEL_Z        = 7;   // Masked
@@ -1875,19 +1935,29 @@ class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern>
 
   // This is not really necessary, but there were some GPU hangs that appeared
   // to be caused by ALU instructions in the next instruction group that wrote
-  // to the $ptr registers of the VTX_READ.
+  // to the $src_gpr registers of the VTX_READ.
   // e.g.
   // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24
   // %T2_X<def> = MOV %ZERO
   //Adding this constraint prevents this from happening.
-  let Constraints = "$ptr.ptr = $dst";
+  let Constraints = "$src_gpr.ptr = $dst_gpr";
 }
 
-class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
-    : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst),
-                   pattern> {
+class VTX_READ_64_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_64 $dst_gpr, $src_gpr", buffer_id,
+                   (outs R600_Reg64:$dst_gpr), pattern> {
+
+  let DST_SEL_X        = 0;
+  let DST_SEL_Y        = 1;
+  let DST_SEL_Z        = 7;
+  let DST_SEL_W        = 7;
+  let DATA_FORMAT      = 0x1D; // COLOR_32_32
+}
+
+class VTX_READ_128_cm <bits<8> buffer_id, list<dag> pattern>
+    : VTX_READ_cm <"VTX_READ_128 $dst_gpr.XYZW, $src_gpr", buffer_id,
+                   (outs R600_Reg128:$dst_gpr), pattern> {
 
-  let MEGA_FETCH_COUNT = 16;
   let DST_SEL_X        =  0;
   let DST_SEL_Y        =  1;
   let DST_SEL_Z        =  2;
@@ -1896,28 +1966,31 @@ class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern>
 
   // XXX: Need to force VTX_READ_128 instructions to write to the same register
   // that holds its buffer address to avoid potential hangs.  We can't use
-  // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst
+  // the same constraint as VTX_READ_32_eg, because the $src_gpr.ptr and $dst
   // registers are different sizes.
 }
 
 //===----------------------------------------------------------------------===//
 // VTX Read from parameter memory space
 //===----------------------------------------------------------------------===//
+def VTX_READ_PARAM_8_cm : VTX_READ_8_cm <0,
+  [(set i32:$dst_gpr, (load_param_exti8 ADDRVTX_READ:$src_gpr))]
+>;
 
-def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
-  [(set i32:$dst, (load_param_zexti8 ADDRVTX_READ:$ptr))]
+def VTX_READ_PARAM_16_cm : VTX_READ_16_cm <0,
+  [(set i32:$dst_gpr, (load_param_exti16 ADDRVTX_READ:$src_gpr))]
 >;
 
-def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
-  [(set i32:$dst, (load_param_zexti16 ADDRVTX_READ:$ptr))]
+def VTX_READ_PARAM_32_cm : VTX_READ_32_cm <0,
+  [(set i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
 >;
 
-def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
-  [(set i32:$dst, (load_param ADDRVTX_READ:$ptr))]
+def VTX_READ_PARAM_64_cm : VTX_READ_64_cm <0,
+  [(set v2i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
 >;
 
-def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
-  [(set v4i32:$dst, (load_param ADDRVTX_READ:$ptr))]
+def VTX_READ_PARAM_128_cm : VTX_READ_128_cm <0,
+  [(set v4i32:$dst_gpr, (load_param ADDRVTX_READ:$src_gpr))]
 >;
 
 //===----------------------------------------------------------------------===//
@@ -1925,78 +1998,29 @@ def VTX_READ_PARAM_128_eg : VTX_READ_128_eg <0,
 //===----------------------------------------------------------------------===//
 
 // 8-bit reads
-def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
-  [(set i32:$dst, (zextloadi8_global ADDRVTX_READ:$ptr))]
+def VTX_READ_GLOBAL_8_cm : VTX_READ_8_cm <1,
+  [(set i32:$dst_gpr, (az_extloadi8_global ADDRVTX_READ:$src_gpr))]
 >;
 
-// 32-bit reads
-def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
-  [(set i32:$dst, (global_load ADDRVTX_READ:$ptr))]
+def VTX_READ_GLOBAL_16_cm : VTX_READ_16_cm <1,
+  [(set i32:$dst_gpr, (az_extloadi16_global ADDRVTX_READ:$src_gpr))]
 >;
 
-// 128-bit reads
-def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
-  [(set v4i32:$dst, (global_load ADDRVTX_READ:$ptr))]
+// 32-bit reads
+def VTX_READ_GLOBAL_32_cm : VTX_READ_32_cm <1,
+  [(set i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
 >;
 
-//===----------------------------------------------------------------------===//
-// Constant Loads
-// XXX: We are currently storing all constants in the global address space.
-//===----------------------------------------------------------------------===//
-
-def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
-  [(set i32:$dst, (constant_load ADDRVTX_READ:$ptr))]
+// 64-bit reads
+def VTX_READ_GLOBAL_64_cm : VTX_READ_64_cm <1,
+  [(set v2i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
 >;
 
-}
-
-//===----------------------------------------------------------------------===//
-// Regist loads and stores - for indirect addressing
-//===----------------------------------------------------------------------===//
-
-defm R600_ : RegisterLoadStore <R600_Reg32, FRAMEri, ADDRIndirect>;
-
-let Predicates = [isCayman] in {
-
-let isVector = 1 in {
-
-def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
-
-def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
-def MULHI_INT_cm : MULHI_INT_Common<0x90>;
-def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
-def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
-def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
-def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
-def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
-def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
-def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
-def SIN_cm : SIN_Common<0x8D>;
-def COS_cm : COS_Common<0x8E>;
-} // End isVector = 1
-
-def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
-def : SIN_PAT <SIN_cm>;
-def : COS_PAT <COS_cm>;
-
-defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
-
-// RECIP_UINT emulation for Cayman
-// The multiplication scales from [0,1] to the unsigned integer range
-def : Pat <
-  (AMDGPUurecip i32:$src0),
-  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
-                            (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
+// 128-bit reads
+def VTX_READ_GLOBAL_128_cm : VTX_READ_128_cm <1,
+  [(set v4i32:$dst_gpr, (global_load ADDRVTX_READ:$src_gpr))]
 >;
 
-  def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
-    let ADDR = 0;
-    let POP_COUNT = 0;
-    let COUNT = 0;
-  }
-
-def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
 } // End isCayman
 
 //===----------------------------------------------------------------------===//
@@ -2007,9 +2031,6 @@ def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
 def IF_PREDICATE_SET  : ILFormat<(outs), (ins GPRI32:$src),
   "IF_PREDICATE_SET $src", []>;
 
-def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
-  "PREDICATED_BREAK $src", []>;
-
 //===----------------------------------------------------------------------===//
 // Pseudo instructions
 //===----------------------------------------------------------------------===//
@@ -2083,10 +2104,6 @@ def TXD_SHADOW: InstR600 <
 } // End isPseudo = 1
 } // End usesCustomInserter = 1
 
-def CLAMP_R600 :  CLAMP <R600_Reg32>;
-def FABS_R600 : FABS<R600_Reg32>;
-def FNEG_R600 : FNEG<R600_Reg32>;
-
 //===---------------------------------------------------------------------===//
 // Return instruction
 //===---------------------------------------------------------------------===//
@@ -2117,7 +2134,7 @@ def CONST_COPY : Instruction {
 def TEX_VTX_CONSTBUF :
   InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "VTX_READ_eg $dst, $ptr",
       [(set v4i32:$dst, (CONST_ADDRESS ADDRGA_VAR_OFFSET:$ptr, (i32 imm:$BUFFER_ID)))]>,
-  VTX_WORD1_GPR, VTX_WORD0 {
+  VTX_WORD1_GPR, VTX_WORD0_eg {
 
   let VC_INST = 0;
   let FETCH_TYPE = 2;
@@ -2171,7 +2188,7 @@ def TEX_VTX_CONSTBUF :
 def TEX_VTX_TEXBUF:
   InstR600ISA <(outs R600_Reg128:$dst), (ins MEMxi:$ptr, i32imm:$BUFFER_ID), "TEX_VTX_EXPLICIT_READ $dst, $ptr",
       [(set v4f32:$dst, (int_R600_load_texbuf ADDRGA_VAR_OFFSET:$ptr, imm:$BUFFER_ID))]>,
-VTX_WORD1_GPR, VTX_WORD0 {
+VTX_WORD1_GPR, VTX_WORD0_eg {
 
 let VC_INST = 0;
 let FETCH_TYPE = 2;
@@ -2235,7 +2252,7 @@ let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
   def BRANCH : ILFormat<(outs), (ins brtarget:$target),
       "; Pseudo unconditional branch instruction",
       [(br bb:$target)]>;
-  defm BRANCH_COND : BranchConditional<IL_brcond>;
+  defm BRANCH_COND : BranchConditional<IL_brcond, R600_Reg32, R600_Reg32>;
 }
 
 //===---------------------------------------------------------------------===//
@@ -2306,7 +2323,7 @@ def : CND_INT_f32 <CNDGE_INT, SETGE>;
 
 //CNDGE_INT extra pattern
 def : Pat <
-  (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_GT),
+  (selectcc i32:$src0, -1, i32:$src1, i32:$src2, COND_SGT),
   (CNDGE_INT $src0, $src1, $src2)
 >;
 
@@ -2321,86 +2338,6 @@ def KIL : Pat <
   (MASK_WRITE (KILLGT (f32 ZERO), $src0))
 >;
 
-// SGT Reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LT),
-  (SGT $src1, $src0)
->;
-
-// SGE Reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, COND_LE),
-  (SGE $src1, $src0)
->;
-
-// SETGT_DX10 reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, COND_LT),
-  (SETGT_DX10 $src1, $src0)
->;
-
-// SETGE_DX10 reverse args
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, COND_LE),
-  (SETGE_DX10 $src1, $src0)
->;
-
-// SETGT_INT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETLT),
-  (SETGT_INT $src1, $src0)
->;
-
-// SETGE_INT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETLE),
-  (SETGE_INT $src1, $src0)
->;
-
-// SETGT_UINT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETULT),
-  (SETGT_UINT $src1, $src0)
->;
-
-// SETGE_UINT reverse args
-def : Pat <
-  (selectcc i32:$src0, i32:$src1, -1, 0, SETULE),
-  (SETGE_UINT $src1, $src0)
->;
-
-// The next two patterns are special cases for handling 'true if ordered' and
-// 'true if unordered' conditionals.  The assumption here is that the behavior of
-// SETE and SNE conforms to the Direct3D 10 rules for floating point values
-// described here:
-// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
-// We assume that  SETE returns false when one of the operands is NAN and
-// SNE returns true when on of the operands is NAN
-
-//SETE - 'true if ordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETO),
-  (SETE $src0, $src1)
->;
-
-//SETE_DX10 - 'true if ordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, SETO),
-  (SETE_DX10 $src0, $src1)
->;
-
-//SNE - 'true if unordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, FP_ONE, FP_ZERO, SETUO),
-  (SNE $src0, $src1)
->;
-
-//SETNE_DX10 - 'true if ordered'
-def : Pat <
-  (selectcc f32:$src0, f32:$src1, -1, 0, SETUO),
-  (SETNE_DX10 $src0, $src1)
->;
-
 def : Extract_Element <f32, v4f32, 0, sub0>;
 def : Extract_Element <f32, v4f32, 1, sub1>;
 def : Extract_Element <f32, v4f32, 2, sub2>;
@@ -2424,10 +2361,24 @@ def : Insert_Element <i32, v4i32, 3, sub3>;
 def : Vector4_Build <v4f32, f32>;
 def : Vector4_Build <v4i32, i32>;
 
+def : Extract_Element <f32, v2f32, 0, sub0>;
+def : Extract_Element <f32, v2f32, 1, sub1>;
+
+def : Insert_Element <f32, v2f32, 0, sub0>;
+def : Insert_Element <f32, v2f32, 1, sub1>;
+
+def : Extract_Element <i32, v2i32, 0, sub0>;
+def : Extract_Element <i32, v2i32, 1, sub1>;
+
+def : Insert_Element <i32, v2i32, 0, sub0>;
+def : Insert_Element <i32, v2i32, 1, sub1>;
+
 // bitconvert patterns
 
 def : BitConvert <i32, f32, R600_Reg32>;
 def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v2f32, v2i32, R600_Reg64>;
+def : BitConvert <v2i32, v2f32, R600_Reg64>;
 def : BitConvert <v4f32, v4i32, R600_Reg128>;
 def : BitConvert <v4i32, v4f32, R600_Reg128>;
 
@@ -2435,3 +2386,11 @@ def : BitConvert <v4i32, v4f32, R600_Reg128>;
 def : DwordAddrPat  <i32, R600_Reg32>;
 
 } // End isR600toCayman Predicate
+
+def getLDSNoRetOp : InstrMapping {
+  let FilterClass = "R600_LDS_1A1D";
+  let RowFields = ["BaseOp"];
+  let ColFields = ["DisableEncoding"];
+  let KeyCol = ["$dst"];
+  let ValueCols = [[""""]];
+}
diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td
index dc8980a..9681747 100644
--- a/lib/Target/R600/R600Intrinsics.td
+++ b/lib/Target/R600/R600Intrinsics.td
@@ -12,12 +12,56 @@
 //===----------------------------------------------------------------------===//
 
 let TargetPrefix = "R600", isTarget = 1 in {
+  class TextureIntrinsicFloatInput :
+    Intrinsic<[llvm_v4f32_ty], [
+      llvm_v4f32_ty, // Coord
+      llvm_i32_ty, // offset_x
+      llvm_i32_ty, // offset_y,
+      llvm_i32_ty, // offset_z,
+      llvm_i32_ty, // resource_id
+      llvm_i32_ty, // samplerid
+      llvm_i32_ty, // coord_type_x
+      llvm_i32_ty, // coord_type_y
+      llvm_i32_ty, // coord_type_z
+      llvm_i32_ty // coord_type_w
+    ], [IntrNoMem]>;
+  class TextureIntrinsicInt32Input :
+    Intrinsic<[llvm_v4i32_ty], [
+      llvm_v4i32_ty, // Coord
+      llvm_i32_ty, // offset_x
+      llvm_i32_ty, // offset_y,
+      llvm_i32_ty, // offset_z,
+      llvm_i32_ty, // resource_id
+      llvm_i32_ty, // samplerid
+      llvm_i32_ty, // coord_type_x
+      llvm_i32_ty, // coord_type_y
+      llvm_i32_ty, // coord_type_z
+      llvm_i32_ty // coord_type_w
+    ], [IntrNoMem]>;
+
   def int_R600_load_input :
     Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
   def int_R600_interp_input :
     Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_interp_const :
+    Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_R600_interp_xy :
+    Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+def int_R600_interp_zw :
+    Intrinsic<[llvm_v2f32_ty], [llvm_i32_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
   def int_R600_load_texbuf :
     Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_R600_tex : TextureIntrinsicFloatInput;
+  def int_R600_texc : TextureIntrinsicFloatInput;
+  def int_R600_txl : TextureIntrinsicFloatInput;
+  def int_R600_txlc : TextureIntrinsicFloatInput;
+  def int_R600_txb : TextureIntrinsicFloatInput;
+  def int_R600_txbc : TextureIntrinsicFloatInput;
+  def int_R600_txf : TextureIntrinsicInt32Input;
+  def int_R600_ldptr : TextureIntrinsicInt32Input;
+  def int_R600_txq : TextureIntrinsicInt32Input;
+  def int_R600_ddx : TextureIntrinsicFloatInput;
+  def int_R600_ddy : TextureIntrinsicFloatInput;
   def int_R600_store_swizzle :
     Intrinsic<[], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], []>;
   def int_R600_store_stream_output :
diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp
index 018b403..01105c6 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.cpp
+++ b/lib/Target/R600/R600MachineFunctionInfo.cpp
@@ -12,7 +12,9 @@
 
 using namespace llvm;
 
-R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
-  : AMDGPUMachineFunction(MF) { }
 
+// Pin the vtable to this file.
+void R600MachineFunctionInfo::anchor() {}
 
+R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF)
+  : AMDGPUMachineFunction(MF) { }
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index 70fddbb..c1bec0a 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -13,14 +13,15 @@
 #ifndef R600MACHINEFUNCTIONINFO_H
 #define R600MACHINEFUNCTIONINFO_H
 
+#include "AMDGPUMachineFunction.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/SelectionDAG.h"
-#include "AMDGPUMachineFunction.h"
 #include <vector>
 
 namespace llvm {
 
 class R600MachineFunctionInfo : public AMDGPUMachineFunction {
+  virtual void anchor();
 public:
   R600MachineFunctionInfo(const MachineFunction &MF);
   SmallVector<unsigned, 4> LiveOuts;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index a777142..da2a4d8 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -9,19 +9,17 @@
 //
 /// \file
 /// \brief R600 Machine Scheduler interface
-// TODO: Scheduling is optimised for VLIW4 arch, modify it to support TRANS slot
 //
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "misched"
 
 #include "R600MachineScheduler.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/raw_ostream.h"
-#include <set>
 
 using namespace llvm;
 
@@ -30,54 +28,80 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
   DAG = dag;
   TII = static_cast<const R600InstrInfo*>(DAG->TII);
   TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
+  VLIW5 = !DAG->MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
   MRI = &DAG->MRI;
-  Available[IDAlu]->clear();
-  Available[IDFetch]->clear();
-  Available[IDOther]->clear();
   CurInstKind = IDOther;
   CurEmitted = 0;
-  OccupedSlotsMask = 15;
+  OccupedSlotsMask = 31;
   InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
-
+  InstKindLimit[IDOther] = 32;
 
   const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
-  if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD5XXX) {
-    InstKindLimit[IDFetch] = 7; // 8 minus 1 for security
-  } else {
-    InstKindLimit[IDFetch] = 15; // 16 minus 1 for security
-  }
+  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
+  AluInstCount = 0;
+  FetchInstCount = 0;
 }
 
-void R600SchedStrategy::MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst)
+void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
+                                  std::vector<SUnit *> &QDst)
 {
-  if (QSrc->empty())
-    return;
-  for (ReadyQueue::iterator I = QSrc->begin(),
-      E = QSrc->end(); I != E; ++I) {
-    (*I)->NodeQueueId &= ~QSrc->getID();
-    QDst->push(*I);
-  }
-  QSrc->clear();
+  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
+  QSrc.clear();
+}
+
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+  assert (GPRCount && "GPRCount cannot be 0");
+  return 248 / GPRCount;
 }
 
 SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
   SUnit *SU = 0;
-  IsTopNode = true;
   NextInstKind = IDOther;
 
+  IsTopNode = false;
+
   // check if we might want to switch current clause type
-  bool AllowSwitchToAlu = (CurInstKind == IDOther) ||
-      (CurEmitted > InstKindLimit[CurInstKind]) ||
-      (Available[CurInstKind]->empty());
-  bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) &&
-      (!Available[IDFetch]->empty() || !Available[IDOther]->empty());
-
-  if ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
-      (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
+  bool AllowSwitchToAlu = (CurEmitted >= InstKindLimit[CurInstKind]) ||
+      (Available[CurInstKind].empty());
+  bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
+      (!Available[IDFetch].empty() || !Available[IDOther].empty());
+
+  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+    // We use the heuristic provided by AMD Accelerated Parallel Processing
+    // OpenCL Programming Guide :
+    // The approx. number of WF that allows TEX inst to hide ALU inst is :
+    // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+    float ALUFetchRationEstimate = 
+        (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+        (FetchInstCount + Available[IDFetch].size());
+    unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+    DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+    // We assume the local GPR requirements to be "dominated" by the requirement
+    // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+    // after TEX are indeed likely to consume or generate values from/for the
+    // TEX clause.
+    // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+    // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+    // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+    // (TODO : use RegisterPressure)
+    // If we are going too use too many GPR, we flush Fetch instruction to lower
+    // register pressure on 128 bits regs.
+    unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+    if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+      AllowSwitchFromAlu = true;
+  }
+
+  if (!SU && ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
+      (!AllowSwitchFromAlu && CurInstKind == IDAlu))) {
     // try to pick ALU
     SU = pickAlu();
+    if (!SU && !PhysicalRegCopy.empty()) {
+      SU = PhysicalRegCopy.front();
+      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
+    }
     if (SU) {
-      if (CurEmitted >  InstKindLimit[IDAlu])
+      if (CurEmitted >= InstKindLimit[IDAlu])
         CurEmitted = 0;
       NextInstKind = IDAlu;
     }
@@ -99,14 +123,10 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
 
   DEBUG(
       if (SU) {
-        dbgs() << "picked node: ";
+        dbgs() << " ** Pick node **\n";
         SU->dump(DAG);
       } else {
-        dbgs() << "NO NODE ";
-        for (int i = 0; i < IDLast; ++i) {
-          Available[i]->dump();
-          Pending[i]->dump();
-        }
+        dbgs() << "NO NODE \n";
         for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
           const SUnit &S = DAG->SUnits[i];
           if (!S.isScheduled)
@@ -119,19 +139,16 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
 }
 
 void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
-
-  DEBUG(dbgs() << "scheduled: ");
-  DEBUG(SU->dump(DAG));
-
   if (NextInstKind != CurInstKind) {
     DEBUG(dbgs() << "Instruction Type Switch\n");
     if (NextInstKind != IDAlu)
-      OccupedSlotsMask = 15;
+      OccupedSlotsMask |= 31;
     CurEmitted = 0;
     CurInstKind = NextInstKind;
   }
 
   if (CurInstKind == IDAlu) {
+    AluInstCount ++;
     switch (getAluKind(SU)) {
     case AluT_XYZW:
       CurEmitted += 4;
@@ -157,20 +174,37 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
 
   if (CurInstKind != IDFetch) {
     MoveUnits(Pending[IDFetch], Available[IDFetch]);
-  }
-  MoveUnits(Pending[IDOther], Available[IDOther]);
+  } else
+    FetchInstCount++;
 }
 
-void R600SchedStrategy::releaseTopNode(SUnit *SU) {
-  int IK = getInstKind(SU);
+static bool
+isPhysicalRegCopy(MachineInstr *MI) {
+  if (MI->getOpcode() != AMDGPU::COPY)
+    return false;
 
-  DEBUG(dbgs() << IK << " <= ");
-  DEBUG(SU->dump(DAG));
+  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
+}
 
-  Pending[IK]->push(SU);
+void R600SchedStrategy::releaseTopNode(SUnit *SU) {
+  DEBUG(dbgs() << "Top Releasing ";SU->dump(DAG););
 }
 
 void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
+  DEBUG(dbgs() << "Bottom Releasing ";SU->dump(DAG););
+  if (isPhysicalRegCopy(SU->getInstr())) {
+    PhysicalRegCopy.push_back(SU);
+    return;
+  }
+
+  int IK = getInstKind(SU);
+
+  // There is no export clause, we can schedule one as soon as its ready
+  if (IK == IDOther)
+    Available[IDOther].push_back(SU);
+  else
+    Pending[IK].push_back(SU);
+
 }
 
 bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
@@ -185,18 +219,19 @@ bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
 R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
   MachineInstr *MI = SU->getInstr();
 
+  if (TII->isTransOnly(MI))
+    return AluTrans;
+
     switch (MI->getOpcode()) {
+    case AMDGPU::PRED_X:
+      return AluPredX;
     case AMDGPU::INTERP_PAIR_XY:
     case AMDGPU::INTERP_PAIR_ZW:
     case AMDGPU::INTERP_VEC_LOAD:
+    case AMDGPU::DOT_4:
       return AluT_XYZW;
     case AMDGPU::COPY:
-      if (TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg())) {
-        // %vregX = COPY Tn_X is likely to be discarded in favor of an
-        // assignement of Tn_X to %vregX, don't considers it in scheduling
-        return AluDiscarded;
-      }
-      else if (MI->getOperand(1).isUndef()) {
+      if (MI->getOperand(1).isUndef()) {
         // MI will become a KILL, don't considers it in scheduling
         return AluDiscarded;
       }
@@ -205,10 +240,18 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
     }
 
     // Does the instruction take a whole IG ?
+    // XXX: Is it possible to add a helper function in R600InstrInfo that can
+    // be used here and in R600PacketizerList::isSoloInstruction() ?
     if(TII->isVector(*MI) ||
         TII->isCubeOp(MI->getOpcode()) ||
-        TII->isReductionOp(MI->getOpcode()))
+        TII->isReductionOp(MI->getOpcode()) ||
+        MI->getOpcode() == AMDGPU::GROUP_BARRIER) {
       return AluT_XYZW;
+    }
+
+    if (TII->isLDSInstr(MI->getOpcode())) {
+      return AluT_X;
+    }
 
     // Is the result already assigned to a channel ?
     unsigned DestSubReg = MI->getOperand(0).getSubReg();
@@ -239,6 +282,10 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
     if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
       return AluT_XYZW;
 
+    // LDS src registers cannot be used in the Trans slot.
+    if (TII->readsLDSSrcReg(MI))
+      return AluT_XYZW;
+
     return AluAny;
 
 }
@@ -246,57 +293,39 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
 int R600SchedStrategy::getInstKind(SUnit* SU) {
   int Opcode = SU->getInstr()->getOpcode();
 
+  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
+    return IDFetch;
+
   if (TII->isALUInstr(Opcode)) {
     return IDAlu;
   }
 
   switch (Opcode) {
+  case AMDGPU::PRED_X:
   case AMDGPU::COPY:
   case AMDGPU::CONST_COPY:
   case AMDGPU::INTERP_PAIR_XY:
   case AMDGPU::INTERP_PAIR_ZW:
   case AMDGPU::INTERP_VEC_LOAD:
-  case AMDGPU::DOT4_eg_pseudo:
-  case AMDGPU::DOT4_r600_pseudo:
+  case AMDGPU::DOT_4:
     return IDAlu;
-  case AMDGPU::TEX_VTX_CONSTBUF:
-  case AMDGPU::TEX_VTX_TEXBUF:
-  case AMDGPU::TEX_LD:
-  case AMDGPU::TEX_GET_TEXTURE_RESINFO:
-  case AMDGPU::TEX_GET_GRADIENTS_H:
-  case AMDGPU::TEX_GET_GRADIENTS_V:
-  case AMDGPU::TEX_SET_GRADIENTS_H:
-  case AMDGPU::TEX_SET_GRADIENTS_V:
-  case AMDGPU::TEX_SAMPLE:
-  case AMDGPU::TEX_SAMPLE_C:
-  case AMDGPU::TEX_SAMPLE_L:
-  case AMDGPU::TEX_SAMPLE_C_L:
-  case AMDGPU::TEX_SAMPLE_LB:
-  case AMDGPU::TEX_SAMPLE_C_LB:
-  case AMDGPU::TEX_SAMPLE_G:
-  case AMDGPU::TEX_SAMPLE_C_G:
-  case AMDGPU::TXD:
-  case AMDGPU::TXD_SHADOW:
-    return IDFetch;
   default:
-    DEBUG(
-        dbgs() << "other inst: ";
-        SU->dump(DAG);
-    );
     return IDOther;
   }
 }
 
-SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
+SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
   if (Q.empty())
     return NULL;
-  for (std::set<SUnit *, CompareSUnit>::iterator It = Q.begin(), E = Q.end();
+  for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
     SUnit *SU = *It;
     InstructionsGroupCandidate.push_back(SU->getInstr());
-    if (TII->canBundle(InstructionsGroupCandidate)) {
+    if (TII->fitsConstReadLimitations(InstructionsGroupCandidate)
+        && (!AnyALU || !TII->isVectorOnly(SU->getInstr()))
+    ) {
       InstructionsGroupCandidate.pop_back();
-      Q.erase(It);
+      Q.erase((It + 1).base());
       return SU;
     } else {
       InstructionsGroupCandidate.pop_back();
@@ -306,33 +335,37 @@ SUnit *R600SchedStrategy::PopInst(std::multiset<SUnit *, CompareSUnit> &Q) {
 }
 
 void R600SchedStrategy::LoadAlu() {
-  ReadyQueue *QSrc = Pending[IDAlu];
-  for (ReadyQueue::iterator I = QSrc->begin(),
-        E = QSrc->end(); I != E; ++I) {
-      (*I)->NodeQueueId &= ~QSrc->getID();
-      AluKind AK = getAluKind(*I);
-      AvailableAlus[AK].insert(*I);
-    }
-    QSrc->clear();
+  std::vector<SUnit *> &QSrc = Pending[IDAlu];
+  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
+    AluKind AK = getAluKind(QSrc[i]);
+    AvailableAlus[AK].push_back(QSrc[i]);
+  }
+  QSrc.clear();
 }
 
 void R600SchedStrategy::PrepareNextSlot() {
   DEBUG(dbgs() << "New Slot\n");
   assert (OccupedSlotsMask && "Slot wasn't filled");
   OccupedSlotsMask = 0;
+//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
+//    OccupedSlotsMask |= 16;
   InstructionsGroupCandidate.clear();
   LoadAlu();
 }
 
 void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
-  unsigned DestReg = MI->getOperand(0).getReg();
+  int DstIndex = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
+  if (DstIndex == -1) {
+    return;
+  }
+  unsigned DestReg = MI->getOperand(DstIndex).getReg();
   // PressureRegister crashes if an operand is def and used in the same inst
   // and we try to constraint its regclass
   for (MachineInstr::mop_iterator It = MI->operands_begin(),
       E = MI->operands_end(); It != E; ++It) {
     MachineOperand &MO = *It;
     if (MO.isReg() && !MO.isDef() &&
-        MO.getReg() == MI->getOperand(0).getReg())
+        MO.getReg() == DestReg)
       return;
   }
   // Constrains the regclass of DestReg to assign it to Slot
@@ -352,53 +385,60 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
   }
 }
 
-SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
+SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
   static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
-  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
-  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny]);
-  if (!UnslotedSU) {
+  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
+  if (SlotedSU)
     return SlotedSU;
-  } else if (!SlotedSU) {
+  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
+  if (UnslotedSU)
     AssignSlot(UnslotedSU->getInstr(), Slot);
-    return UnslotedSU;
-  } else {
-    //Determine which one to pick (the lesser one)
-    if (CompareSUnit()(SlotedSU, UnslotedSU)) {
-      AvailableAlus[AluAny].insert(UnslotedSU);
-      return SlotedSU;
-    } else {
-      AvailableAlus[IndexToID[Slot]].insert(SlotedSU);
-      AssignSlot(UnslotedSU->getInstr(), Slot);
-      return UnslotedSU;
-    }
-  }
+  return UnslotedSU;
 }
 
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
-  return Pending[IDAlu]->empty() && AvailableAlus[AluAny].empty() &&
-      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
-      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
-      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
+      AvailableAlus[AluPredX].size();
 }
 
 SUnit* R600SchedStrategy::pickAlu() {
-  while (!isAvailablesAluEmpty()) {
+  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
     if (!OccupedSlotsMask) {
+      // Bottom up scheduling : predX must comes first
+      if (!AvailableAlus[AluPredX].empty()) {
+        OccupedSlotsMask |= 31;
+        return PopInst(AvailableAlus[AluPredX], false);
+      }
       // Flush physical reg copies (RA will discard them)
       if (!AvailableAlus[AluDiscarded].empty()) {
-        OccupedSlotsMask = 15;
-        return PopInst(AvailableAlus[AluDiscarded]);
+        OccupedSlotsMask |= 31;
+        return PopInst(AvailableAlus[AluDiscarded], false);
       }
       // If there is a T_XYZW alu available, use it
       if (!AvailableAlus[AluT_XYZW].empty()) {
-        OccupedSlotsMask = 15;
-        return PopInst(AvailableAlus[AluT_XYZW]);
+        OccupedSlotsMask |= 15;
+        return PopInst(AvailableAlus[AluT_XYZW], false);
+      }
+    }
+    bool TransSlotOccuped = OccupedSlotsMask & 16;
+    if (!TransSlotOccuped && VLIW5) {
+      if (!AvailableAlus[AluTrans].empty()) {
+        OccupedSlotsMask |= 16;
+        return PopInst(AvailableAlus[AluTrans], false);
+      }
+      SUnit *SU = AttemptFillSlot(3, true);
+      if (SU) {
+        OccupedSlotsMask |= 16;
+        return SU;
       }
     }
-    for (unsigned Chan = 0; Chan < 4; ++Chan) {
+    for (int Chan = 3; Chan > -1; --Chan) {
       bool isOccupied = OccupedSlotsMask & (1 << Chan);
       if (!isOccupied) {
-        SUnit *SU = AttemptFillSlot(Chan);
+        SUnit *SU = AttemptFillSlot(Chan, false);
         if (SU) {
           OccupedSlotsMask |= (1 << Chan);
           InstructionsGroupCandidate.push_back(SU->getInstr());
@@ -413,14 +453,14 @@ SUnit* R600SchedStrategy::pickAlu() {
 
 SUnit* R600SchedStrategy::pickOther(int QID) {
   SUnit *SU = 0;
-  ReadyQueue *AQ = Available[QID];
+  std::vector<SUnit *> &AQ = Available[QID];
 
-  if (AQ->empty()) {
+  if (AQ.empty()) {
     MoveUnits(Pending[QID], AQ);
   }
-  if (!AQ->empty()) {
-    SU = *AQ->begin();
-    AQ->remove(AQ->begin());
+  if (!AQ.empty()) {
+    SU = AQ.back();
+    AQ.resize(AQ.size() - 1);
   }
   return SU;
 }
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index 3d0367f..97c8cde 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -16,21 +16,14 @@
 #define R600MACHINESCHEDULER_H_
 
 #include "R600InstrInfo.h"
+#include "llvm/ADT/PriorityQueue.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/ADT/PriorityQueue.h"
 
 using namespace llvm;
 
 namespace llvm {
 
-class CompareSUnit {
-public:
-  bool operator()(const SUnit *S1, const SUnit *S2) {
-    return S1->getDepth() > S2->getDepth();
-  }
-};
-
 class R600SchedStrategy : public MachineSchedStrategy {
 
   const ScheduleDAGMI *DAG;
@@ -38,12 +31,6 @@ class R600SchedStrategy : public MachineSchedStrategy {
   const R600RegisterInfo *TRI;
   MachineRegisterInfo *MRI;
 
-  enum InstQueue {
-    QAlu = 1,
-    QFetch = 2,
-    QOther = 4
-  };
-
   enum InstKind {
     IDAlu,
     IDFetch,
@@ -58,17 +45,23 @@ class R600SchedStrategy : public MachineSchedStrategy {
     AluT_Z,
     AluT_W,
     AluT_XYZW,
+    AluPredX,
+    AluTrans,
     AluDiscarded, // LLVM Instructions that are going to be eliminated
     AluLast
   };
 
-  ReadyQueue *Available[IDLast], *Pending[IDLast];
-  std::multiset<SUnit *, CompareSUnit> AvailableAlus[AluLast];
+  std::vector<SUnit *> Available[IDLast], Pending[IDLast];
+  std::vector<SUnit *> AvailableAlus[AluLast];
+  std::vector<SUnit *> PhysicalRegCopy;
 
   InstKind CurInstKind;
   int CurEmitted;
   InstKind NextInstKind;
 
+  unsigned AluInstCount;
+  unsigned FetchInstCount;
+
   int InstKindLimit[IDLast];
 
   int OccupedSlotsMask;
@@ -76,19 +69,9 @@ class R600SchedStrategy : public MachineSchedStrategy {
 public:
   R600SchedStrategy() :
     DAG(0), TII(0), TRI(0), MRI(0) {
-    Available[IDAlu] = new ReadyQueue(QAlu, "AAlu");
-    Available[IDFetch] = new ReadyQueue(QFetch, "AFetch");
-    Available[IDOther] = new ReadyQueue(QOther, "AOther");
-    Pending[IDAlu] = new ReadyQueue(QAlu<<4, "PAlu");
-    Pending[IDFetch] = new ReadyQueue(QFetch<<4, "PFetch");
-    Pending[IDOther] = new ReadyQueue(QOther<<4, "POther");
   }
 
   virtual ~R600SchedStrategy() {
-    for (unsigned I = 0; I < IDLast; ++I) {
-      delete Available[I];
-      delete Pending[I];
-    }
   }
 
   virtual void initialize(ScheduleDAGMI *dag);
@@ -99,20 +82,21 @@ public:
 
 private:
   std::vector<MachineInstr *> InstructionsGroupCandidate;
+  bool VLIW5;
 
   int getInstKind(SUnit *SU);
   bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
   AluKind getAluKind(SUnit *SU) const;
   void LoadAlu();
-  bool isAvailablesAluEmpty() const;
-  SUnit *AttemptFillSlot (unsigned Slot);
+  unsigned AvailablesAluCount() const;
+  SUnit *AttemptFillSlot (unsigned Slot, bool AnyAlu);
   void PrepareNextSlot();
-  SUnit *PopInst(std::multiset<SUnit *, CompareSUnit> &Q);
+  SUnit *PopInst(std::vector<SUnit*> &Q, bool AnyALU);
 
   void AssignSlot(MachineInstr *MI, unsigned Slot);
   SUnit* pickAlu();
   SUnit* pickOther(int QID);
-  void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
+  void MoveUnits(std::vector<SUnit *> &QSrc, std::vector<SUnit *> &QDst);
 };
 
 } // namespace llvm
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
new file mode 100644
index 0000000..cf719c0
--- /dev/null
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -0,0 +1,380 @@
+//===--------------------- R600MergeVectorRegisters.cpp -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass merges inputs of swizzeable instructions into vector sharing
+/// common data and/or have enough undef subreg using swizzle abilities.
+///
+/// For instance let's consider the following pseudo code :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
+///
+/// is turned into :
+/// vreg5<def> = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
+/// ...
+/// vreg7<def> = INSERT_SUBREG vreg4, sub3
+/// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
+///
+/// This allow regalloc to reduce register pressure for vector registers and
+/// to reduce MOV count.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "vec-merger"
+#include "llvm/Support/Debug.h"
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+static bool
+isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
+  for (MachineRegisterInfo::def_iterator It = MRI.def_begin(Reg),
+      E = MRI.def_end(); It != E; ++It) {
+    return (*It).isImplicitDef();
+  }
+  if (MRI.isReserved(Reg)) {
+    return false;
+  }
+  llvm_unreachable("Reg without a def");
+  return false;
+}
+
+class RegSeqInfo {
+public:
+  MachineInstr *Instr;
+  DenseMap<unsigned, unsigned> RegToChan;
+  std::vector<unsigned> UndefReg;
+  RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
+    assert (MI->getOpcode() == AMDGPU::REG_SEQUENCE);
+    for (unsigned i = 1, e = Instr->getNumOperands(); i < e; i+=2) {
+      MachineOperand &MO = Instr->getOperand(i);
+      unsigned Chan = Instr->getOperand(i + 1).getImm();
+      if (isImplicitlyDef(MRI, MO.getReg()))
+        UndefReg.push_back(Chan);
+      else
+        RegToChan[MO.getReg()] = Chan;
+    }
+  }
+  RegSeqInfo() {}
+
+  bool operator==(const RegSeqInfo &RSI) const {
+    return RSI.Instr == Instr;
+  }
+};
+
+class R600VectorRegMerger : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const R600InstrInfo *TII;
+  bool canSwizzle(const MachineInstr &) const;
+  bool areAllUsesSwizzeable(unsigned Reg) const;
+  void SwizzleInput(MachineInstr &,
+      const std::vector<std::pair<unsigned, unsigned> > &) const;
+  bool tryMergeVector(const RegSeqInfo *, RegSeqInfo *,
+      std::vector<std::pair<unsigned, unsigned> > &Remap) const;
+  bool tryMergeUsingCommonSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+      std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+  bool tryMergeUsingFreeSlot(RegSeqInfo &RSI, RegSeqInfo &CompatibleRSI,
+      std::vector<std::pair<unsigned, unsigned> > &RemapChan);
+  MachineInstr *RebuildVector(RegSeqInfo *MI,
+      const RegSeqInfo *BaseVec,
+      const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const;
+  void RemoveMI(MachineInstr *);
+  void trackRSI(const RegSeqInfo &RSI);
+
+  typedef DenseMap<unsigned, std::vector<MachineInstr *> > InstructionSetMap;
+  DenseMap<MachineInstr *, RegSeqInfo> PreviousRegSeq;
+  InstructionSetMap PreviousRegSeqByReg;
+  InstructionSetMap PreviousRegSeqByUndefCount;
+public:
+  static char ID;
+  R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
+  TII(0) { }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  const char *getPassName() const {
+    return "R600 Vector Registers Merge Pass";
+  }
+
+  bool runOnMachineFunction(MachineFunction &Fn);
+};
+
+char R600VectorRegMerger::ID = 0;
+
+bool R600VectorRegMerger::canSwizzle(const MachineInstr &MI)
+    const {
+  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+    return true;
+  switch (MI.getOpcode()) {
+  case AMDGPU::R600_ExportSwz:
+  case AMDGPU::EG_ExportSwz:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
+    RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned> > &Remap)
+    const {
+  unsigned CurrentUndexIdx = 0;
+  for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
+      E = ToMerge->RegToChan.end(); It != E; ++It) {
+    DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
+        Untouched->RegToChan.find((*It).first);
+    if (PosInUntouched != Untouched->RegToChan.end()) {
+      Remap.push_back(std::pair<unsigned, unsigned>
+          ((*It).second, (*PosInUntouched).second));
+      continue;
+    }
+    if (CurrentUndexIdx >= Untouched->UndefReg.size())
+      return false;
+    Remap.push_back(std::pair<unsigned, unsigned>
+        ((*It).second, Untouched->UndefReg[CurrentUndexIdx++]));
+  }
+
+  return true;
+}
+
+static
+unsigned getReassignedChan(
+    const std::vector<std::pair<unsigned, unsigned> > &RemapChan,
+    unsigned Chan) {
+  for (unsigned j = 0, je = RemapChan.size(); j < je; j++) {
+    if (RemapChan[j].first == Chan)
+      return RemapChan[j].second;
+  }
+  llvm_unreachable("Chan wasn't reassigned");
+}
+
+MachineInstr *R600VectorRegMerger::RebuildVector(
+    RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
+    const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+  unsigned Reg = RSI->Instr->getOperand(0).getReg();
+  MachineBasicBlock::iterator Pos = RSI->Instr;
+  MachineBasicBlock &MBB = *Pos->getParent();
+  DebugLoc DL = Pos->getDebugLoc();
+
+  unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+  DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+  std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
+  for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+      E = RSI->RegToChan.end(); It != E; ++It) {
+    unsigned DstReg = MRI->createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
+    unsigned SubReg = (*It).first;
+    unsigned Swizzle = (*It).second;
+    unsigned Chan = getReassignedChan(RemapChan, Swizzle);
+
+    MachineInstr *Tmp = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::INSERT_SUBREG),
+        DstReg)
+        .addReg(SrcVec)
+        .addReg(SubReg)
+        .addImm(Chan);
+    UpdatedRegToChan[SubReg] = Chan;
+    std::vector<unsigned>::iterator ChanPos =
+        std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan);
+    if (ChanPos != UpdatedUndef.end())
+      UpdatedUndef.erase(ChanPos);
+    assert(std::find(UpdatedUndef.begin(), UpdatedUndef.end(), Chan) ==
+               UpdatedUndef.end() &&
+           "UpdatedUndef shouldn't contain Chan more than once!");
+    DEBUG(dbgs() << "    ->"; Tmp->dump(););
+    (void)Tmp;
+    SrcVec = DstReg;
+  }
+  Pos = BuildMI(MBB, Pos, DL, TII->get(AMDGPU::COPY), Reg)
+      .addReg(SrcVec);
+  DEBUG(dbgs() << "    ->"; Pos->dump(););
+
+  DEBUG(dbgs() << "  Updating Swizzle:\n");
+  for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
+      E = MRI->use_end(); It != E; ++It) {
+    DEBUG(dbgs() << "    ";(*It).dump(); dbgs() << "    ->");
+    SwizzleInput(*It, RemapChan);
+    DEBUG((*It).dump());
+  }
+  RSI->Instr->eraseFromParent();
+
+  // Update RSI
+  RSI->Instr = Pos;
+  RSI->RegToChan = UpdatedRegToChan;
+  RSI->UndefReg = UpdatedUndef;
+
+  return Pos;
+}
+
+void R600VectorRegMerger::RemoveMI(MachineInstr *MI) {
+  for (InstructionSetMap::iterator It = PreviousRegSeqByReg.begin(),
+      E = PreviousRegSeqByReg.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+  }
+  for (InstructionSetMap::iterator It = PreviousRegSeqByUndefCount.begin(),
+      E = PreviousRegSeqByUndefCount.end(); It != E; ++It) {
+    std::vector<MachineInstr *> &MIs = (*It).second;
+    MIs.erase(std::find(MIs.begin(), MIs.end(), MI), MIs.end());
+  }
+}
+
+void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
+    const std::vector<std::pair<unsigned, unsigned> > &RemapChan) const {
+  unsigned Offset;
+  if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST)
+    Offset = 2;
+  else
+    Offset = 3;
+  for (unsigned i = 0; i < 4; i++) {
+    unsigned Swizzle = MI.getOperand(i + Offset).getImm() + 1;
+    for (unsigned j = 0, e = RemapChan.size(); j < e; j++) {
+      if (RemapChan[j].first == Swizzle) {
+        MI.getOperand(i + Offset).setImm(RemapChan[j].second - 1);
+        break;
+      }
+    }
+  }
+}
+
+bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+  for (MachineRegisterInfo::use_iterator It = MRI->use_begin(Reg),
+      E = MRI->use_end(); It != E; ++It) {
+    if (!canSwizzle(*It))
+      return false;
+  }
+  return true;
+}
+
+bool R600VectorRegMerger::tryMergeUsingCommonSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+  for (MachineInstr::mop_iterator MOp = RSI.Instr->operands_begin(),
+      MOE = RSI.Instr->operands_end(); MOp != MOE; ++MOp) {
+    if (!MOp->isReg())
+      continue;
+    if (PreviousRegSeqByReg[MOp->getReg()].empty())
+      continue;
+    std::vector<MachineInstr *> MIs = PreviousRegSeqByReg[MOp->getReg()];
+    for (unsigned i = 0, e = MIs.size(); i < e; i++) {
+      CompatibleRSI = PreviousRegSeq[MIs[i]];
+      if (RSI == CompatibleRSI)
+        continue;
+      if (tryMergeVector(&CompatibleRSI, &RSI, RemapChan))
+        return true;
+    }
+  }
+  return false;
+}
+
+bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
+    RegSeqInfo &CompatibleRSI,
+    std::vector<std::pair<unsigned, unsigned> > &RemapChan) {
+  unsigned NeededUndefs = 4 - RSI.UndefReg.size();
+  if (PreviousRegSeqByUndefCount[NeededUndefs].empty())
+    return false;
+  std::vector<MachineInstr *> &MIs =
+      PreviousRegSeqByUndefCount[NeededUndefs];
+  CompatibleRSI = PreviousRegSeq[MIs.back()];
+  tryMergeVector(&CompatibleRSI, &RSI, RemapChan);
+  return true;
+}
+
+void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
+  for (DenseMap<unsigned, unsigned>::const_iterator
+  It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
+    PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
+  }
+  PreviousRegSeqByUndefCount[RSI.UndefReg.size()].push_back(RSI.Instr);
+  PreviousRegSeq[RSI.Instr] = RSI;
+}
+
+bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
+  TII = static_cast<const R600InstrInfo *>(Fn.getTarget().getInstrInfo());
+  MRI = &(Fn.getRegInfo());
+  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
+       MBB != MBBe; ++MBB) {
+    MachineBasicBlock *MB = MBB;
+    PreviousRegSeq.clear();
+    PreviousRegSeqByReg.clear();
+    PreviousRegSeqByUndefCount.clear();
+
+    for (MachineBasicBlock::iterator MII = MB->begin(), MIIE = MB->end();
+         MII != MIIE; ++MII) {
+      MachineInstr *MI = MII;
+      if (MI->getOpcode() != AMDGPU::REG_SEQUENCE) {
+        if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
+          unsigned Reg = MI->getOperand(1).getReg();
+          for (MachineRegisterInfo::def_iterator It = MRI->def_begin(Reg),
+              E = MRI->def_end(); It != E; ++It) {
+            RemoveMI(&(*It));
+          }
+        }
+        continue;
+      }
+
+
+      RegSeqInfo RSI(*MRI, MI);
+
+      // All uses of MI are swizzeable ?
+      unsigned Reg = MI->getOperand(0).getReg();
+      if (!areAllUsesSwizzeable(Reg))
+        continue;
+
+      DEBUG (dbgs() << "Trying to optimize ";
+          MI->dump();
+      );
+
+      RegSeqInfo CandidateRSI;
+      std::vector<std::pair<unsigned, unsigned> > RemapChan;
+      DEBUG(dbgs() << "Using common slots...\n";);
+      if (tryMergeUsingCommonSlot(RSI, CandidateRSI, RemapChan)) {
+        // Remove CandidateRSI mapping
+        RemoveMI(CandidateRSI.Instr);
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        continue;
+      }
+      DEBUG(dbgs() << "Using free slots...\n";);
+      RemapChan.clear();
+      if (tryMergeUsingFreeSlot(RSI, CandidateRSI, RemapChan)) {
+        RemoveMI(CandidateRSI.Instr);
+        MII = RebuildVector(&RSI, &CandidateRSI, RemapChan);
+        trackRSI(RSI);
+        continue;
+      }
+      //Failed to merge
+      trackRSI(RSI);
+    }
+  }
+  return false;
+}
+
+}
+
+llvm::FunctionPass *llvm::createR600VectorRegMerger(TargetMachine &tm) {
+  return new R600VectorRegMerger(tm);
+}
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index cd7b7d0..cd9b6ea 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -14,22 +14,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef R600PACKETIZER_CPP
-#define R600PACKETIZER_CPP
-
 #define DEBUG_TYPE "packets"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "AMDGPU.h"
+#include "R600InstrInfo.h"
 #include "llvm/CodeGen/DFAPacketizer.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
-#include "AMDGPU.h"
-#include "R600InstrInfo.h"
+#include "llvm/Support/raw_ostream.h"
 
-namespace llvm {
+using namespace llvm;
+
+namespace {
 
 class R600Packetizer : public MachineFunctionPass {
 
@@ -59,15 +58,8 @@ class R600PacketizerList : public VLIWPacketizerList {
 private:
   const R600InstrInfo *TII;
   const R600RegisterInfo &TRI;
-
-  enum BankSwizzle {
-    ALU_VEC_012 = 0,
-    ALU_VEC_021,
-    ALU_VEC_120,
-    ALU_VEC_102,
-    ALU_VEC_201,
-    ALU_VEC_210
-  };
+  bool VLIW5;
+  bool ConsideredInstUsesAlreadyWrittenVectorElement;
 
   unsigned getSlot(const MachineInstr *MI) const {
     return TRI.getHWRegChan(MI->getOperand(0).getReg());
@@ -84,21 +76,35 @@ private:
     MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
     if (I->isBundle())
       BI++;
+    int LastDstChan = -1;
     do {
+      bool isTrans = false;
+      int BISlot = getSlot(BI);
+      if (LastDstChan >= BISlot)
+        isTrans = true;
+      LastDstChan = BISlot;
       if (TII->isPredicated(BI))
         continue;
-      if (TII->isTransOnly(BI))
+      int OperandIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::write);
+      if (OperandIdx > -1 && BI->getOperand(OperandIdx).getImm() == 0)
         continue;
-      int OperandIdx = TII->getOperandIdx(BI->getOpcode(), R600Operands::WRITE);
-      if (OperandIdx < 0)
+      int DstIdx = TII->getOperandIdx(BI->getOpcode(), AMDGPU::OpName::dst);
+      if (DstIdx == -1) {
         continue;
-      if (BI->getOperand(OperandIdx).getImm() == 0)
+      }
+      unsigned Dst = BI->getOperand(DstIdx).getReg();
+      if (isTrans || TII->isTransOnly(BI)) {
+        Result[Dst] = AMDGPU::PS;
         continue;
-      unsigned Dst = BI->getOperand(0).getReg();
-      if (BI->getOpcode() == AMDGPU::DOT4_r600_real) {
+      }
+      if (BI->getOpcode() == AMDGPU::DOT4_r600 ||
+          BI->getOpcode() == AMDGPU::DOT4_eg) {
         Result[Dst] = AMDGPU::PV_X;
         continue;
       }
+      if (Dst == AMDGPU::OQAP) {
+        continue;
+      }
       unsigned PVReg = 0;
       switch (TRI.getHWRegChan(Dst)) {
       case 0:
@@ -123,10 +129,10 @@ private:
 
   void substitutePV(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PVs)
       const {
-    R600Operands::Ops Ops[] = {
-      R600Operands::SRC0,
-      R600Operands::SRC1,
-      R600Operands::SRC2
+    unsigned Ops[] = {
+      AMDGPU::OpName::src0,
+      AMDGPU::OpName::src1,
+      AMDGPU::OpName::src2
     };
     for (unsigned i = 0; i < 3; i++) {
       int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
@@ -144,10 +150,14 @@ public:
                         MachineDominatorTree &MDT)
   : VLIWPacketizerList(MF, MLI, MDT, true),
     TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
-    TRI(TII->getRegisterInfo()) { }
+    TRI(TII->getRegisterInfo()) {
+    VLIW5 = !MF.getTarget().getSubtarget<AMDGPUSubtarget>().hasCaymanISA();
+  }
 
   // initPacketizerState - initialize some internal flags.
-  void initPacketizerState() { }
+  void initPacketizerState() {
+    ConsideredInstUsesAlreadyWrittenVectorElement = false;
+  }
 
   // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
   bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) {
@@ -161,9 +171,11 @@ public:
       return true;
     if (!TII->isALUInstr(MI->getOpcode()))
       return true;
-    if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TRANS_ONLY)
+    if (MI->getOpcode() == AMDGPU::GROUP_BARRIER)
       return true;
-    if (TII->isTransOnly(MI))
+    // XXX: This can be removed once the packetizer properly handles all the
+    // LDS instruction group restrictions.
+    if (TII->isLDSInstr(MI->getOpcode()))
       return true;
     return false;
   }
@@ -172,11 +184,11 @@ public:
   // together.
   bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
     MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
-    if (getSlot(MII) <= getSlot(MIJ))
-      return false;
+    if (getSlot(MII) == getSlot(MIJ))
+      ConsideredInstUsesAlreadyWrittenVectorElement = true;
     // Does MII and MIJ share the same pred_sel ?
-    int OpI = TII->getOperandIdx(MII->getOpcode(), R600Operands::PRED_SEL),
-        OpJ = TII->getOperandIdx(MIJ->getOpcode(), R600Operands::PRED_SEL);
+    int OpI = TII->getOperandIdx(MII->getOpcode(), AMDGPU::OpName::pred_sel),
+        OpJ = TII->getOperandIdx(MIJ->getOpcode(), AMDGPU::OpName::pred_sel);
     unsigned PredI = (OpI > -1)?MII->getOperand(OpI).getReg():0,
         PredJ = (OpJ > -1)?MIJ->getOperand(OpJ).getReg():0;
     if (PredI != PredJ)
@@ -194,6 +206,14 @@ public:
         return false;
       }
     }
+
+    bool ARDef = TII->definesAddressRegister(MII) ||
+                 TII->definesAddressRegister(MIJ);
+    bool ARUse = TII->usesAddressRegister(MII) ||
+                 TII->usesAddressRegister(MIJ);
+    if (ARDef && ARUse)
+      return false;
+
     return true;
   }
 
@@ -202,15 +222,34 @@ public:
   bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;}
 
   void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
-    unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600Operands::LAST);
+    unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
     MI->getOperand(LastOp).setImm(Bit);
   }
 
-  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+  bool isBundlableWithCurrentPMI(MachineInstr *MI,
+                                 const DenseMap<unsigned, unsigned> &PV,
+                                 std::vector<R600InstrInfo::BankSwizzle> &BS,
+                                 bool &isTransSlot) {
+    isTransSlot = TII->isTransOnly(MI);
+    assert (!isTransSlot || VLIW5);
+
+    // Is the dst reg sequence legal ?
+    if (!isTransSlot && !CurrentPacketMIs.empty()) {
+      if (getSlot(MI) <= getSlot(CurrentPacketMIs.back())) {
+        if (ConsideredInstUsesAlreadyWrittenVectorElement  &&
+            !TII->isVectorOnly(MI) && VLIW5) {
+          isTransSlot = true;
+          DEBUG(dbgs() << "Considering as Trans Inst :"; MI->dump(););
+        }
+        else
+          return false;
+      }
+    }
+
+    // Are the Constants limitations met ?
     CurrentPacketMIs.push_back(MI);
-    bool FitsConstLimits = TII->canBundle(CurrentPacketMIs);
-    DEBUG(
-      if (!FitsConstLimits) {
+    if (!TII->fitsConstReadLimitations(CurrentPacketMIs)) {
+      DEBUG(
         dbgs() << "Couldn't pack :\n";
         MI->dump();
         dbgs() << "with the following packets :\n";
@@ -219,12 +258,15 @@ public:
           dbgs() << "\n";
         }
         dbgs() << "because of Consts read limitations\n";
-      });
-    const DenseMap<unsigned, unsigned> &PV =
-        getPreviousVector(CurrentPacketMIs.front());
-    bool FitsReadPortLimits = fitsReadPortLimitation(CurrentPacketMIs, PV);
-    DEBUG(
-      if (!FitsReadPortLimits) {
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
+    }
+
+    // Is there a BankSwizzle set that meet Read Port limitations ?
+    if (!TII->fitsReadPortLimitations(CurrentPacketMIs,
+            PV, BS, isTransSlot)) {
+      DEBUG(
         dbgs() << "Couldn't pack :\n";
         MI->dump();
         dbgs() << "with the following packets :\n";
@@ -233,146 +275,50 @@ public:
           dbgs() << "\n";
         }
         dbgs() << "because of Read port limitations\n";
-      });
-    bool isBundlable = FitsConstLimits && FitsReadPortLimits;
-    CurrentPacketMIs.pop_back();
-    if (!isBundlable) {
-      endPacket(MI->getParent(), MI);
-      substitutePV(MI, getPreviousVector(MI));
-      return VLIWPacketizerList::addToPacket(MI);
-    }
-    if (!CurrentPacketMIs.empty())
-      setIsLastBit(CurrentPacketMIs.back(), 0);
-    substitutePV(MI, PV);
-    return VLIWPacketizerList::addToPacket(MI);
-  }
-private:
-  std::vector<std::pair<int, unsigned> >
-  ExtractSrcs(const MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV)
-      const {
-    R600Operands::Ops Ops[] = {
-      R600Operands::SRC0,
-      R600Operands::SRC1,
-      R600Operands::SRC2
-    };
-    std::vector<std::pair<int, unsigned> > Result;
-    for (unsigned i = 0; i < 3; i++) {
-      int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
-      if (OperandIdx < 0){
-        Result.push_back(std::pair<int, unsigned>(-1,0));
-        continue;
-      }
-      unsigned Src = MI->getOperand(OperandIdx).getReg();
-      if (PV.find(Src) != PV.end()) {
-        Result.push_back(std::pair<int, unsigned>(-1,0));
-        continue;
-      }
-      unsigned Reg = TRI.getEncodingValue(Src) & 0xff;
-      if (Reg > 127) {
-        Result.push_back(std::pair<int, unsigned>(-1,0));
-        continue;
-      }
-      unsigned Chan = TRI.getHWRegChan(Src);
-      Result.push_back(std::pair<int, unsigned>(Reg, Chan));
+      );
+      CurrentPacketMIs.pop_back();
+      return false;
     }
-    return Result;
-  }
 
-  std::vector<std::pair<int, unsigned> >
-  Swizzle(std::vector<std::pair<int, unsigned> > Src,
-  BankSwizzle Swz) const {
-    switch (Swz) {
-    case ALU_VEC_012:
-      break;
-    case ALU_VEC_021:
-      std::swap(Src[1], Src[2]);
-      break;
-    case ALU_VEC_102:
-      std::swap(Src[0], Src[1]);
-      break;
-    case ALU_VEC_120:
-      std::swap(Src[0], Src[1]);
-      std::swap(Src[0], Src[2]);
-      break;
-    case ALU_VEC_201:
-      std::swap(Src[0], Src[2]);
-      std::swap(Src[0], Src[1]);
-      break;
-    case ALU_VEC_210:
-      std::swap(Src[0], Src[2]);
-      break;
-    }
-    return Src;
-  }
+    // We cannot read LDS source registrs from the Trans slot.
+    if (isTransSlot && TII->readsLDSSrcReg(MI))
+      return false;
 
-  bool isLegal(const std::vector<MachineInstr *> &IG,
-      const std::vector<BankSwizzle> &Swz,
-      const DenseMap<unsigned, unsigned> &PV) const {
-    assert (Swz.size() == IG.size());
-    int Vector[4][3];
-    memset(Vector, -1, sizeof(Vector));
-    for (unsigned i = 0, e = IG.size(); i < e; i++) {
-      const std::vector<std::pair<int, unsigned> > &Srcs =
-          Swizzle(ExtractSrcs(IG[i], PV), Swz[i]);
-      for (unsigned j = 0; j < 3; j++) {
-        const std::pair<int, unsigned> &Src = Srcs[j];
-        if (Src.first < 0)
-          continue;
-        if (Vector[Src.second][j] < 0)
-          Vector[Src.second][j] = Src.first;
-        if (Vector[Src.second][j] != Src.first)
-          return false;
-      }
-    }
+    CurrentPacketMIs.pop_back();
     return true;
   }
 
-  bool recursiveFitsFPLimitation(
-  std::vector<MachineInstr *> IG,
-  const DenseMap<unsigned, unsigned> &PV,
-  std::vector<BankSwizzle> &SwzCandidate,
-  std::vector<MachineInstr *> CurrentlyChecked)
-      const {
-    if (!isLegal(CurrentlyChecked, SwzCandidate, PV))
-      return false;
-    if (IG.size() == CurrentlyChecked.size()) {
-      return true;
-    }
-    BankSwizzle AvailableSwizzle[] = {
-      ALU_VEC_012,
-      ALU_VEC_021,
-      ALU_VEC_120,
-      ALU_VEC_102,
-      ALU_VEC_201,
-      ALU_VEC_210
-    };
-    CurrentlyChecked.push_back(IG[CurrentlyChecked.size()]);
-    for (unsigned i = 0; i < 6; i++) {
-      SwzCandidate.push_back(AvailableSwizzle[i]);
-      if (recursiveFitsFPLimitation(IG, PV, SwzCandidate, CurrentlyChecked))
-        return true;
-      SwzCandidate.pop_back();
-    }
-    return false;
-  }
-
-  bool fitsReadPortLimitation(
-  std::vector<MachineInstr *> IG,
-  const DenseMap<unsigned, unsigned> &PV)
-      const {
-    //Todo : support shared src0 - src1 operand
-    std::vector<BankSwizzle> SwzCandidate;
-    bool Result = recursiveFitsFPLimitation(IG, PV, SwzCandidate,
-        std::vector<MachineInstr *>());
-    if (!Result)
-      return false;
-    for (unsigned i = 0, e = IG.size(); i < e; i++) {
-      MachineInstr *MI = IG[i];
+  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+    MachineBasicBlock::iterator FirstInBundle =
+        CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
+    const DenseMap<unsigned, unsigned> &PV =
+        getPreviousVector(FirstInBundle);
+    std::vector<R600InstrInfo::BankSwizzle> BS;
+    bool isTransSlot;
+
+    if (isBundlableWithCurrentPMI(MI, PV, BS, isTransSlot)) {
+      for (unsigned i = 0, e = CurrentPacketMIs.size(); i < e; i++) {
+        MachineInstr *MI = CurrentPacketMIs[i];
+        unsigned Op = TII->getOperandIdx(MI->getOpcode(),
+            AMDGPU::OpName::bank_swizzle);
+        MI->getOperand(Op).setImm(BS[i]);
+      }
       unsigned Op = TII->getOperandIdx(MI->getOpcode(),
-          R600Operands::BANK_SWIZZLE);
-      MI->getOperand(Op).setImm(SwzCandidate[i]);
+          AMDGPU::OpName::bank_swizzle);
+      MI->getOperand(Op).setImm(BS.back());
+      if (!CurrentPacketMIs.empty())
+        setIsLastBit(CurrentPacketMIs.back(), 0);
+      substitutePV(MI, PV);
+      MachineBasicBlock::iterator It = VLIWPacketizerList::addToPacket(MI);
+      if (isTransSlot) {
+        endPacket(llvm::next(It)->getParent(), llvm::next(It));
+      }
+      return It;
     }
-    return true;
+    endPacket(MI->getParent(), MI);
+    if (TII->isTransOnly(MI))
+      return MI;
+    return VLIWPacketizerList::addToPacket(MI);
   }
 };
 
@@ -402,7 +348,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
     MachineBasicBlock::iterator End = MBB->end();
     MachineBasicBlock::iterator MI = MBB->begin();
     while (MI != End) {
-      if (MI->isKill()) {
+      if (MI->isKill() || MI->getOpcode() == AMDGPU::IMPLICIT_DEF ||
+          (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
         MachineBasicBlock::iterator DeleteMI = MI;
         ++MI;
         MBB->erase(DeleteMI);
@@ -450,10 +397,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
 
 }
 
-}
+} // end anonymous namespace
 
 llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
   return new R600Packetizer(tm);
 }
-
-#endif // R600PACKETIZER_CPP
diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp
index bbd7995..f3bb88b 100644
--- a/lib/Target/R600/R600RegisterInfo.cpp
+++ b/lib/Target/R600/R600RegisterInfo.cpp
@@ -20,16 +20,16 @@
 
 using namespace llvm;
 
-R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm,
-    const TargetInstrInfo &tii)
-: AMDGPURegisterInfo(tm, tii),
-  TM(tm),
-  TII(tii)
-  { }
+R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm)
+: AMDGPURegisterInfo(tm),
+  TM(tm)
+  { RCW.RegWeight = 0; RCW.WeightLimit = 0;}
 
 BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
 
+  const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo());
+
   Reserved.set(AMDGPU::ZERO);
   Reserved.set(AMDGPU::HALF);
   Reserved.set(AMDGPU::ONE);
@@ -43,25 +43,15 @@ BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(AMDGPU::PRED_SEL_OFF);
   Reserved.set(AMDGPU::PRED_SEL_ZERO);
   Reserved.set(AMDGPU::PRED_SEL_ONE);
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
 
   for (TargetRegisterClass::iterator I = AMDGPU::R600_AddrRegClass.begin(),
                         E = AMDGPU::R600_AddrRegClass.end(); I != E; ++I) {
     Reserved.set(*I);
   }
 
-  for (TargetRegisterClass::iterator I = AMDGPU::TRegMemRegClass.begin(),
-                                     E = AMDGPU::TRegMemRegClass.end();
-                                     I !=  E; ++I) {
-    Reserved.set(*I);
-  }
+  TII->reserveIndirectRegisters(Reserved, MF);
 
-  const R600InstrInfo *RII = static_cast<const R600InstrInfo*>(&TII);
-  std::vector<unsigned> IndirectRegs = RII->getIndirectReservedRegs(MF);
-  for (std::vector<unsigned>::iterator I = IndirectRegs.begin(),
-                                       E = IndirectRegs.end();
-                                       I != E; ++I) {
-    Reserved.set(*I);
-  }
   return Reserved;
 }
 
@@ -79,6 +69,10 @@ unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
   return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
 }
 
+unsigned R600RegisterInfo::getHWRegIndex(unsigned Reg) const {
+  return GET_REG_INDEX(getEncodingValue(Reg));
+}
+
 const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
                                                                    MVT VT) const {
   switch(VT.SimpleTy) {
@@ -87,13 +81,20 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
   }
 }
 
-unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const {
-  switch (Channel) {
-    default: assert(!"Invalid channel index"); return 0;
-    case 0: return AMDGPU::sub0;
-    case 1: return AMDGPU::sub1;
-    case 2: return AMDGPU::sub2;
-    case 3: return AMDGPU::sub3;
-  }
+const RegClassWeight &R600RegisterInfo::getRegClassWeight(
+  const TargetRegisterClass *RC) const {
+  return RCW;
 }
 
+bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  switch (Reg) {
+  case AMDGPU::OQAP:
+  case AMDGPU::OQBP:
+  case AMDGPU::AR_X:
+    return false;
+  default:
+    return true;
+  }
+}
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index f9ca918..c74c49e 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -21,13 +21,12 @@
 namespace llvm {
 
 class R600TargetMachine;
-class TargetInstrInfo;
 
 struct R600RegisterInfo : public AMDGPURegisterInfo {
   AMDGPUTargetMachine &TM;
-  const TargetInstrInfo &TII;
+  RegClassWeight RCW;
 
-  R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+  R600RegisterInfo(AMDGPUTargetMachine &tm);
 
   virtual BitVector getReservedRegs(const MachineFunction &MF) const;
 
@@ -40,14 +39,16 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
   /// \brief get the HW encoding for a register's channel.
   unsigned getHWRegChan(unsigned reg) const;
 
+  virtual unsigned getHWRegIndex(unsigned Reg) const;
+
   /// \brief get the register class of the specified type to use in the
   /// CFGStructurizer
   virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
 
-  /// \returns the sub reg enum value for the given \p Channel
-  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
-  unsigned getSubRegFromChannel(unsigned Channel) const;
+  virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
 
+  // \returns true if \p Reg can be defined in one ALU caluse and used in another.
+  virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
index bfc546b..68bcd20 100644
--- a/lib/Target/R600/R600RegisterInfo.td
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -23,6 +23,14 @@ class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
   let HWEncoding = encoding;
 }
 
+class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sub0, sub1];
+  let HWEncoding = encoding;
+}
+
+
 foreach Index = 0-127 in {
   foreach Chan = [ "X", "Y", "Z", "W" ] in {
     // 32-bit Temporary Registers
@@ -31,26 +39,29 @@ foreach Index = 0-127 in {
     // Indirect addressing offset registers
     def Addr#Index#_#Chan : R600RegWithChan <"T("#Index#" + AR.x)."#Chan,
                                               Index, Chan>;
-    def TRegMem#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index,
-                                                Chan>;
   }
   // 128-bit Temporary Registers
-  def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW",
+  def T#Index#_XYZW : R600Reg_128 <"T"#Index#"",
                                    [!cast<Register>("T"#Index#"_X"),
                                     !cast<Register>("T"#Index#"_Y"),
                                     !cast<Register>("T"#Index#"_Z"),
                                     !cast<Register>("T"#Index#"_W")],
                                    Index>;
+
+  def T#Index#_XY : R600Reg_64 <"T"#Index#"",
+                                   [!cast<Register>("T"#Index#"_X"),
+                                    !cast<Register>("T"#Index#"_Y")],
+                                   Index>;
 }
 
 // KCACHE_BANK0
 foreach Index = 159-128 in {
   foreach Chan = [ "X", "Y", "Z", "W" ] in {
     // 32-bit Temporary Registers
-    def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#Index#"-128]."#Chan, Index, Chan>;
+    def KC0_#Index#_#Chan : R600RegWithChan <"KC0["#!add(Index,-128)#"]."#Chan, Index, Chan>;
   }
   // 128-bit Temporary Registers
-  def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#Index#"-128].XYZW",
+  def KC0_#Index#_XYZW : R600Reg_128 <"KC0["#!add(Index, -128)#"].XYZW",
                                  [!cast<Register>("KC0_"#Index#"_X"),
                                   !cast<Register>("KC0_"#Index#"_Y"),
                                   !cast<Register>("KC0_"#Index#"_Z"),
@@ -62,10 +73,10 @@ foreach Index = 159-128 in {
 foreach Index = 191-160 in {
   foreach Chan = [ "X", "Y", "Z", "W" ] in {
     // 32-bit Temporary Registers
-    def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#Index#"-160]."#Chan, Index, Chan>;
+    def KC1_#Index#_#Chan : R600RegWithChan <"KC1["#!add(Index,-160)#"]."#Chan, Index, Chan>;
   }
   // 128-bit Temporary Registers
-  def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#Index#"-160].XYZW",
+  def KC1_#Index#_XYZW : R600Reg_128 <"KC1["#!add(Index, -160)#"].XYZW",
                                  [!cast<Register>("KC1_"#Index#"_X"),
                                   !cast<Register>("KC1_"#Index#"_Y"),
                                   !cast<Register>("KC1_"#Index#"_Z"),
@@ -82,6 +93,12 @@ foreach Index = 448-480 in {
 
 // Special Registers
 
+def OQA : R600Reg<"OQA", 219>;
+def OQB : R600Reg<"OQB", 220>;
+def OQAP : R600Reg<"OQAP", 221>;
+def OQBP : R600Reg<"OQAP", 222>;
+def LDS_DIRECT_A : R600Reg<"LDS_DIRECT_A", 223>;
+def LDS_DIRECT_B : R600Reg<"LDS_DIRECT_B", 224>;
 def ZERO : R600Reg<"0.0", 248>;
 def ONE : R600Reg<"1.0", 249>;
 def NEG_ONE : R600Reg<"-1.0", 249>;
@@ -92,10 +109,11 @@ def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
 def ALU_LITERAL_Y : R600RegWithChan<"literal.y", 253, "Y">;
 def ALU_LITERAL_Z : R600RegWithChan<"literal.z", 253, "Z">;
 def ALU_LITERAL_W : R600RegWithChan<"literal.w", 253, "W">;
-def PV_X : R600RegWithChan<"PV.x", 254, "X">;
-def PV_Y : R600RegWithChan<"PV.y", 254, "Y">;
-def PV_Z : R600RegWithChan<"PV.z", 254, "Z">;
-def PV_W : R600RegWithChan<"PV.w", 254, "W">;
+def PV_X : R600RegWithChan<"PV.X", 254, "X">;
+def PV_Y : R600RegWithChan<"PV.Y", 254, "Y">;
+def PV_Z : R600RegWithChan<"PV.Z", 254, "Z">;
+def PV_W : R600RegWithChan<"PV.W", 254, "W">;
+def PS: R600Reg<"PS", 255>;
 def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
 def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
 def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>;
@@ -115,7 +133,8 @@ let isAllocatable = 0 in {
 // XXX: Only use the X channel, until we support wider stack widths
 def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
 
-} // End isAllocatable = 0
+def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
+  (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
 
 def R600_KC0_X : RegisterClass <"AMDGPU", [f32, i32], 32,
                               (add (sequence "KC0_%u_X", 128, 159))>;
@@ -149,6 +168,8 @@ def R600_KC1 : RegisterClass <"AMDGPU", [f32, i32], 32,
                                    (interleave R600_KC1_X, R600_KC1_Y,
                                                R600_KC1_Z, R600_KC1_W)>;
 
+} // End isAllocatable = 0
+
 def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32,
                                    (add (sequence "T%u_X", 0, 127), AR_X)>;
 
@@ -169,8 +190,9 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add
     R600_TReg32,
     R600_ArrayBase,
     R600_Addr,
+    R600_KC0, R600_KC1,
     ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF,
-    ALU_CONST, ALU_PARAM
+    ALU_CONST, ALU_PARAM, OQAP
     )>;
 
 def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add
@@ -184,32 +206,5 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
   let CopyCost = -1;
 }
 
-//===----------------------------------------------------------------------===//
-// Register classes for indirect addressing
-//===----------------------------------------------------------------------===//
-
-// Super register for all the Indirect Registers.  This register class is used
-// by the REG_SEQUENCE instruction to specify the registers to use for direct
-// reads / writes which may be written / read by an indirect address.
-class IndirectSuper<string n, list<Register> subregs> :
-    RegisterWithSubRegs<n, subregs> {
-  let Namespace = "AMDGPU";
-  let SubRegIndices =
- [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
-  sub8, sub9, sub10, sub11, sub12, sub13, sub14, sub15];
-}
-
-def IndirectSuperReg : IndirectSuper<"Indirect",
-  [TRegMem0_X, TRegMem1_X, TRegMem2_X, TRegMem3_X, TRegMem4_X, TRegMem5_X,
-   TRegMem6_X, TRegMem7_X, TRegMem8_X, TRegMem9_X, TRegMem10_X, TRegMem11_X,
-   TRegMem12_X, TRegMem13_X, TRegMem14_X, TRegMem15_X]
->;
-
-def IndirectReg : RegisterClass<"AMDGPU", [f32, i32], 32, (add IndirectSuperReg)>;
-
-// This register class defines the registers that are the storage units for
-// the "Indirect Addressing" pseudo memory space.
-// XXX: Only use the X channel, until we support wider stack widths
-def TRegMem : RegisterClass<"AMDGPU", [f32, i32], 32,
-  (add (sequence "TRegMem%u_X", 0, 16))
->;
+def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
+                                (add (sequence "T%u_XY", 0, 63))>;
diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td
index 78a460a..df62bf8 100644
--- a/lib/Target/R600/R600Schedule.td
+++ b/lib/Target/R600/R600Schedule.td
@@ -23,14 +23,16 @@ def TRANS : FuncUnit;
 def AnyALU : InstrItinClass;
 def VecALU : InstrItinClass;
 def TransALU : InstrItinClass;
+def XALU : InstrItinClass;
 
 def R600_VLIW5_Itin : ProcessorItineraries <
   [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
   [],
   [
     InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
-    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
     InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+    InstrItinData<XALU, [InstrStage<1, [ALU_X]>]>,
     InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
   ]
 >;
@@ -40,7 +42,7 @@ def R600_VLIW4_Itin : ProcessorItineraries <
   [],
   [
     InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
-    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_X, ALU_W]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
     InstrItinData<TransALU, [InstrStage<1, [ALU_NULL]>]>,
     InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
   ]
diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
new file mode 100644
index 0000000..3258894
--- /dev/null
+++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -0,0 +1,303 @@
+//===-- R600TextureIntrinsicsReplacer.cpp ---------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass translates tgsi-like texture intrinsics into R600 texture
+/// closer to hardware intrinsics.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+class R600TextureIntrinsicsReplacer :
+    public FunctionPass, public InstVisitor<R600TextureIntrinsicsReplacer> {
+  static char ID;
+
+  Module *Mod;
+  Type *FloatType;
+  Type *Int32Type;
+  Type *V4f32Type;
+  Type *V4i32Type;
+  FunctionType *TexSign;
+  FunctionType *TexQSign;
+
+  void getAdjustmentFromTextureTarget(unsigned TextureType, bool hasLOD,
+                                      unsigned SrcSelect[4], unsigned CT[4],
+                                      bool &useShadowVariant) {
+    enum TextureTypes {
+      TEXTURE_1D = 1,
+      TEXTURE_2D,
+      TEXTURE_3D,
+      TEXTURE_CUBE,
+      TEXTURE_RECT,
+      TEXTURE_SHADOW1D,
+      TEXTURE_SHADOW2D,
+      TEXTURE_SHADOWRECT,
+      TEXTURE_1D_ARRAY,
+      TEXTURE_2D_ARRAY,
+      TEXTURE_SHADOW1D_ARRAY,
+      TEXTURE_SHADOW2D_ARRAY,
+      TEXTURE_SHADOWCUBE,
+      TEXTURE_2D_MSAA,
+      TEXTURE_2D_ARRAY_MSAA,
+      TEXTURE_CUBE_ARRAY,
+      TEXTURE_SHADOWCUBE_ARRAY
+    };
+
+    switch (TextureType) {
+    case 0:
+      useShadowVariant = false;
+      return;
+    case TEXTURE_RECT:
+    case TEXTURE_1D:
+    case TEXTURE_2D:
+    case TEXTURE_3D:
+    case TEXTURE_CUBE:
+    case TEXTURE_1D_ARRAY:
+    case TEXTURE_2D_ARRAY:
+    case TEXTURE_CUBE_ARRAY:
+    case TEXTURE_2D_MSAA:
+    case TEXTURE_2D_ARRAY_MSAA:
+      useShadowVariant = false;
+      break;
+    case TEXTURE_SHADOW1D:
+    case TEXTURE_SHADOW2D:
+    case TEXTURE_SHADOWRECT:
+    case TEXTURE_SHADOW1D_ARRAY:
+    case TEXTURE_SHADOW2D_ARRAY:
+    case TEXTURE_SHADOWCUBE:
+    case TEXTURE_SHADOWCUBE_ARRAY:
+      useShadowVariant = true;
+      break;
+    default:
+      llvm_unreachable("Unknow Texture Type");
+    }
+
+    if (TextureType == TEXTURE_RECT ||
+        TextureType == TEXTURE_SHADOWRECT) {
+      CT[0] = 0;
+      CT[1] = 0;
+    }
+
+    if (TextureType == TEXTURE_CUBE_ARRAY ||
+        TextureType == TEXTURE_SHADOWCUBE_ARRAY)
+      CT[2] = 0;
+
+    if (TextureType == TEXTURE_1D_ARRAY ||
+        TextureType == TEXTURE_SHADOW1D_ARRAY) {
+      if (hasLOD && useShadowVariant) {
+        CT[1] = 0;
+      } else {
+        CT[2] = 0;
+        SrcSelect[2] = 1;
+      }
+    } else if (TextureType == TEXTURE_2D_ARRAY ||
+        TextureType == TEXTURE_SHADOW2D_ARRAY) {
+      CT[2] = 0;
+    }
+
+    if ((TextureType == TEXTURE_SHADOW1D ||
+        TextureType == TEXTURE_SHADOW2D ||
+        TextureType == TEXTURE_SHADOWRECT ||
+        TextureType == TEXTURE_SHADOW1D_ARRAY) &&
+        !(hasLOD && useShadowVariant))
+      SrcSelect[3] = 2;
+  }
+
+  void ReplaceCallInst(CallInst &I, FunctionType *FT, const char *Name,
+                       unsigned SrcSelect[4], Value *Offset[3], Value *Resource,
+                       Value *Sampler, unsigned CT[4], Value *Coord) {
+    IRBuilder<> Builder(&I);
+    Constant *Mask[] = {
+      ConstantInt::get(Int32Type, SrcSelect[0]),
+      ConstantInt::get(Int32Type, SrcSelect[1]),
+      ConstantInt::get(Int32Type, SrcSelect[2]),
+      ConstantInt::get(Int32Type, SrcSelect[3])
+    };
+    Value *SwizzleMask = ConstantVector::get(Mask);
+    Value *SwizzledCoord =
+        Builder.CreateShuffleVector(Coord, Coord, SwizzleMask);
+
+    Value *Args[] = {
+      SwizzledCoord,
+      Offset[0],
+      Offset[1],
+      Offset[2],
+      Resource,
+      Sampler,
+      ConstantInt::get(Int32Type, CT[0]),
+      ConstantInt::get(Int32Type, CT[1]),
+      ConstantInt::get(Int32Type, CT[2]),
+      ConstantInt::get(Int32Type, CT[3])
+    };
+
+    Function *F = Mod->getFunction(Name);
+    if (!F) {
+      F = Function::Create(FT, GlobalValue::ExternalLinkage, Name, Mod);
+      F->addFnAttr(Attribute::ReadNone);
+    }
+    I.replaceAllUsesWith(Builder.CreateCall(F, Args));
+    I.eraseFromParent();
+  }
+
+  void ReplaceTexIntrinsic(CallInst &I, bool hasLOD, FunctionType *FT,
+                           const char *VanillaInt,
+                           const char *ShadowInt) {
+    Value *Coord = I.getArgOperand(0);
+    Value *ResourceId = I.getArgOperand(1);
+    Value *SamplerId = I.getArgOperand(2);
+
+    unsigned TextureType =
+        dyn_cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+
+    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
+    unsigned CT[4] = {1, 1, 1, 1};
+    Value *Offset[3] = {
+      ConstantInt::get(Int32Type, 0),
+      ConstantInt::get(Int32Type, 0),
+      ConstantInt::get(Int32Type, 0)
+    };
+    bool useShadowVariant;
+
+    getAdjustmentFromTextureTarget(TextureType, hasLOD, SrcSelect, CT,
+                                   useShadowVariant);
+
+    ReplaceCallInst(I, FT, useShadowVariant?ShadowInt:VanillaInt, SrcSelect,
+                    Offset, ResourceId, SamplerId, CT, Coord);
+  }
+
+  void ReplaceTXF(CallInst &I) {
+    Value *Coord = I.getArgOperand(0);
+    Value *ResourceId = I.getArgOperand(4);
+    Value *SamplerId = I.getArgOperand(5);
+
+    unsigned TextureType =
+        dyn_cast<ConstantInt>(I.getArgOperand(6))->getZExtValue();
+
+    unsigned SrcSelect[4] = { 0, 1, 2, 3 };
+    unsigned CT[4] = {1, 1, 1, 1};
+    Value *Offset[3] = {
+      I.getArgOperand(1),
+      I.getArgOperand(2),
+      I.getArgOperand(3),
+    };
+    bool useShadowVariant;
+
+    getAdjustmentFromTextureTarget(TextureType, false, SrcSelect, CT,
+                                   useShadowVariant);
+
+    ReplaceCallInst(I, TexQSign, "llvm.R600.txf", SrcSelect,
+                    Offset, ResourceId, SamplerId, CT, Coord);
+  }
+
+public:
+  R600TextureIntrinsicsReplacer():
+    FunctionPass(ID) {
+  }
+
+  virtual bool doInitialization(Module &M) {
+    LLVMContext &Ctx = M.getContext();
+    Mod = &M;
+    FloatType = Type::getFloatTy(Ctx);
+    Int32Type = Type::getInt32Ty(Ctx);
+    V4f32Type = VectorType::get(FloatType, 4);
+    V4i32Type = VectorType::get(Int32Type, 4);
+    Type *ArgsType[] = {
+      V4f32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+    };
+    TexSign = FunctionType::get(V4f32Type, ArgsType, /*isVarArg=*/false);
+    Type *ArgsQType[] = {
+      V4i32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+      Int32Type,
+    };
+    TexQSign = FunctionType::get(V4f32Type, ArgsQType, /*isVarArg=*/false);
+    return false;
+  }
+
+  virtual bool runOnFunction(Function &F) {
+    visit(F);
+    return false;
+  }
+
+  virtual const char *getPassName() const {
+    return "R600 Texture Intrinsics Replacer";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+  }
+
+  void visitCallInst(CallInst &I) {
+    if (!I.getCalledFunction())
+      return;
+
+    StringRef Name = I.getCalledFunction()->getName();
+    if (Name == "llvm.AMDGPU.tex") {
+      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.tex", "llvm.R600.texc");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txl") {
+      ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txl", "llvm.R600.txlc");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txb") {
+      ReplaceTexIntrinsic(I, true, TexSign, "llvm.R600.txb", "llvm.R600.txbc");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txf") {
+      ReplaceTXF(I);
+      return;
+    }
+    if (Name == "llvm.AMDGPU.txq") {
+      ReplaceTexIntrinsic(I, false, TexQSign, "llvm.R600.txq", "llvm.R600.txq");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.ddx") {
+      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddx", "llvm.R600.ddx");
+      return;
+    }
+    if (Name == "llvm.AMDGPU.ddy") {
+      ReplaceTexIntrinsic(I, false, TexSign, "llvm.R600.ddy", "llvm.R600.ddy");
+      return;
+    }
+  }
+
+};
+
+char R600TextureIntrinsicsReplacer::ID = 0;
+
+}
+
+FunctionPass *llvm::createR600TextureIntrinsicsReplacer() {
+  return new R600TextureIntrinsicsReplacer();
+}
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index 2477e2a..6bbdf59 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -15,6 +15,8 @@
 #include "AMDGPU.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -29,13 +31,13 @@ typedef std::pair<BasicBlock *, Value *> StackEntry;
 typedef SmallVector<StackEntry, 16> StackVector;
 
 // Intrinsic names the control flow is annotated with
-static const char *IfIntrinsic = "llvm.SI.if";
-static const char *ElseIntrinsic = "llvm.SI.else";
-static const char *BreakIntrinsic = "llvm.SI.break";
-static const char *IfBreakIntrinsic = "llvm.SI.if.break";
-static const char *ElseBreakIntrinsic = "llvm.SI.else.break";
-static const char *LoopIntrinsic = "llvm.SI.loop";
-static const char *EndCfIntrinsic = "llvm.SI.end.cf";
+static const char *const IfIntrinsic = "llvm.SI.if";
+static const char *const ElseIntrinsic = "llvm.SI.else";
+static const char *const BreakIntrinsic = "llvm.SI.break";
+static const char *const IfBreakIntrinsic = "llvm.SI.if.break";
+static const char *const ElseBreakIntrinsic = "llvm.SI.else.break";
+static const char *const LoopIntrinsic = "llvm.SI.loop";
+static const char *const EndCfIntrinsic = "llvm.SI.end.cf";
 
 class SIAnnotateControlFlow : public FunctionPass {
 
diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h
index 716b093..2cbce28 100644
--- a/lib/Target/R600/SIDefines.h
+++ b/lib/Target/R600/SIDefines.h
@@ -11,12 +11,28 @@
 #ifndef SIDEFINES_H_
 #define SIDEFINES_H_
 
+namespace SIInstrFlags {
+enum {
+  MIMG = 1 << 3,
+  SMRD = 1 << 4,
+  VOP1 = 1 << 5,
+  VOP2 = 1 << 6,
+  VOP3 = 1 << 7,
+  VOPC = 1 << 8,
+  SALU = 1 << 9
+};
+}
+
 #define R_00B028_SPI_SHADER_PGM_RSRC1_PS                                0x00B028
+#define R_00B02C_SPI_SHADER_PGM_RSRC2_PS                                0x00B02C
+#define   S_00B02C_EXTRA_LDS_SIZE(x)                                  (((x) & 0xFF) << 8)
 #define R_00B128_SPI_SHADER_PGM_RSRC1_VS                                0x00B128
 #define R_00B228_SPI_SHADER_PGM_RSRC1_GS                                0x00B228
 #define R_00B848_COMPUTE_PGM_RSRC1                                      0x00B848
 #define   S_00B028_VGPRS(x)                                           (((x) & 0x3F) << 0)
 #define   S_00B028_SGPRS(x)                                           (((x) & 0x0F) << 6)
+#define R_00B84C_COMPUTE_PGM_RSRC2                                      0x00B84C
+#define   S_00B84C_LDS_SIZE(x)                                        (((x) & 0x1FF) << 15)
 #define R_0286CC_SPI_PS_INPUT_ENA                                       0x0286CC
 
 #endif // SIDEFINES_H_
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
new file mode 100644
index 0000000..3370c79
--- /dev/null
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -0,0 +1,263 @@
+//===-- SIFixSGPRCopies.cpp - Remove potential VGPR => SGPR copies --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Copies from VGPR to SGPR registers are illegal and the register coalescer
+/// will sometimes generate these illegal copies in situations like this:
+///
+///  Register Class <vsrc> is the union of <vgpr> and <sgpr>
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///   %vreg1 <vsrc> = COPY %vreg0 <sgpr>
+///    ...
+///    BRANCH %cond BB1, BB2
+///  BB1:
+///    %vreg2 <vgpr> = VECTOR_INST
+///    %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+///  BB2:
+///    %vreg4 <vsrc> = PHI %vreg1 <vsrc>, <BB#0>, %vreg3 <vrsc>, <BB#1>
+///    %vreg5 <vgpr> = VECTOR_INST %vreg4 <vsrc>
+///
+///
+/// The coalescer will begin at BB0 and eliminate its copy, then the resulting
+/// code will look like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///    ...
+///    BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <vsrc> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <vsrc>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now that the result of the PHI instruction is an SGPR, the register
+/// allocator is now forced to constrain the register class of %vreg3 to
+/// <sgpr> so we end up with final code like this:
+///
+/// BB0:
+///   %vreg0 <sgpr> = SCALAR_INST
+///    ...
+///    BRANCH %cond BB1, BB2
+/// BB1:
+///   %vreg2 <vgpr> = VECTOR_INST
+///   %vreg3 <sgpr> = COPY %vreg2 <vgpr>
+/// BB2:
+///   %vreg4 <sgpr> = PHI %vreg0 <sgpr>, <BB#0>, %vreg3 <sgpr>, <BB#1>
+///   %vreg5 <vgpr> = VECTOR_INST %vreg4 <sgpr>
+///
+/// Now this code contains an illegal copy from a VGPR to an SGPR.
+///
+/// In order to avoid this problem, this pass searches for PHI instructions
+/// which define a <vsrc> register and constrains its definition class to
+/// <vgpr> if the user of the PHI's definition register is a vector instruction.
+/// If the PHI's definition class is constrained to <vgpr> then the coalescer
+/// will be unable to perform the COPY removal from the above example  which
+/// ultimately led to the creation of an illegal COPY.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sgpr-copies"
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SIFixSGPRCopies : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const TargetRegisterClass *inferRegClassFromUses(const SIRegisterInfo *TRI,
+                                           const MachineRegisterInfo &MRI,
+                                           unsigned Reg,
+                                           unsigned SubReg) const;
+  const TargetRegisterClass *inferRegClassFromDef(const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const;
+  bool isVGPRToSGPRCopy(const MachineInstr &Copy, const SIRegisterInfo *TRI,
+                        const MachineRegisterInfo &MRI) const;
+
+public:
+  SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI Fix SGPR copies";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SIFixSGPRCopies::ID = 0;
+
+FunctionPass *llvm::createSIFixSGPRCopiesPass(TargetMachine &tm) {
+  return new SIFixSGPRCopies(tm);
+}
+
+static bool hasVGPROperands(const MachineInstr &MI, const SIRegisterInfo *TRI) {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    if (!MI.getOperand(i).isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+      continue;
+
+    if (TRI->hasVGPRs(MRI.getRegClass(MI.getOperand(i).getReg())))
+      return true;
+  }
+  return false;
+}
+
+/// This functions walks the use list of Reg until it finds an Instruction
+/// that isn't a COPY returns the register class of that instruction.
+/// \return The register defined by the first non-COPY instruction.
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromUses(
+                                                 const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const {
+  // The Reg parameter to the function must always be defined by either a PHI
+  // or a COPY, therefore it cannot be a physical register.
+  assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+         "Reg cannot be a physical register");
+
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  RC = TRI->getSubRegClass(RC, SubReg);
+  for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
+                                         E = MRI.use_end(); I != E; ++I) {
+    switch (I->getOpcode()) {
+    case AMDGPU::COPY:
+      RC = TRI->getCommonSubClass(RC, inferRegClassFromUses(TRI, MRI,
+                                  I->getOperand(0).getReg(),
+                                  I->getOperand(0).getSubReg()));
+      break;
+    }
+  }
+
+  return RC;
+}
+
+const TargetRegisterClass *SIFixSGPRCopies::inferRegClassFromDef(
+                                                 const SIRegisterInfo *TRI,
+                                                 const MachineRegisterInfo &MRI,
+                                                 unsigned Reg,
+                                                 unsigned SubReg) const {
+  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+    const TargetRegisterClass *RC = TRI->getPhysRegClass(Reg);
+    return TRI->getSubRegClass(RC, SubReg);
+  }
+  MachineInstr *Def = MRI.getVRegDef(Reg);
+  if (Def->getOpcode() != AMDGPU::COPY) {
+    return TRI->getSubRegClass(MRI.getRegClass(Reg), SubReg);
+  }
+
+  return inferRegClassFromDef(TRI, MRI, Def->getOperand(1).getReg(),
+                                   Def->getOperand(1).getSubReg());
+}
+
+bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
+                                      const SIRegisterInfo *TRI,
+                                      const MachineRegisterInfo &MRI) const {
+
+  unsigned DstReg = Copy.getOperand(0).getReg();
+  unsigned SrcReg = Copy.getOperand(1).getReg();
+  unsigned SrcSubReg = Copy.getOperand(1).getSubReg();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+  const TargetRegisterClass *SrcRC;
+
+  if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+      DstRC == &AMDGPU::M0RegRegClass)
+    return false;
+
+  SrcRC = inferRegClassFromDef(TRI, MRI, SrcReg, SrcSubReg);
+  return TRI->isSGPRClass(DstRC) && TRI->hasVGPRs(SrcRC);
+}
+
+bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      MF.getTarget().getInstrInfo());
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+                                                      I != E; ++I) {
+      MachineInstr &MI = *I;
+      if (MI.getOpcode() == AMDGPU::COPY && isVGPRToSGPRCopy(MI, TRI, MRI)) {
+        DEBUG(dbgs() << "Fixing VGPR -> SGPR copy:\n");
+        DEBUG(MI.print(dbgs()));
+        TII->moveToVALU(MI);
+
+      }
+
+      switch (MI.getOpcode()) {
+      default: continue;
+      case AMDGPU::PHI: {
+        DEBUG(dbgs() << " Fixing PHI:\n");
+        DEBUG(MI.print(dbgs()));
+
+        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+          unsigned Reg = MI.getOperand(i).getReg();
+          const TargetRegisterClass *RC = inferRegClassFromDef(TRI, MRI, Reg,
+                                                  MI.getOperand(0).getSubReg());
+          MRI.constrainRegClass(Reg, RC);
+        }
+        unsigned Reg = MI.getOperand(0).getReg();
+        const TargetRegisterClass *RC = inferRegClassFromUses(TRI, MRI, Reg,
+                                                  MI.getOperand(0).getSubReg());
+        if (TRI->getCommonSubClass(RC, &AMDGPU::VReg_32RegClass)) {
+          MRI.constrainRegClass(Reg, &AMDGPU::VReg_32RegClass);
+        }
+
+        if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
+          break;
+
+        // If a PHI node defines an SGPR and any of its operands are VGPRs,
+        // then we need to move it to the VALU.
+        for (unsigned i = 1; i < MI.getNumOperands(); i+=2) {
+          unsigned Reg = MI.getOperand(i).getReg();
+          if (TRI->hasVGPRs(MRI.getRegClass(Reg))) {
+            TII->moveToVALU(MI);
+            break;
+          }
+        }
+
+        break;
+      }
+      case AMDGPU::REG_SEQUENCE: {
+        if (TRI->hasVGPRs(TII->getOpRegClass(MI, 0)) ||
+            !hasVGPROperands(MI, TRI))
+          continue;
+
+        DEBUG(dbgs() << "Fixing REG_SEQUENCE:\n");
+        DEBUG(MI.print(dbgs()));
+
+        TII->moveToVALU(MI);
+        break;
+      }
+      }
+    }
+  }
+  return false;
+}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 6bd82a5..d5d2b68 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -13,39 +13,36 @@
 //===----------------------------------------------------------------------===//
 
 #include "SIISelLowering.h"
-#include "AMDIL.h"
 #include "AMDGPU.h"
 #include "AMDILIntrinsicInfo.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
-#include "llvm/IR/Function.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/Function.h"
+
+const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
 
 using namespace llvm;
 
 SITargetLowering::SITargetLowering(TargetMachine &TM) :
-    AMDGPUTargetLowering(TM),
-    TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())),
-    TRI(TM.getRegisterInfo()) {
+    AMDGPUTargetLowering(TM) {
 
   addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
-  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
 
-  addRegisterClass(MVT::v16i8, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
 
-  addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
 
-  addRegisterClass(MVT::v1i32, &AMDGPU::VReg_32RegClass);
-
-  addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
-  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
 
   addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
@@ -59,6 +56,21 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   computeRegisterProperties();
 
+  // Condition Codes
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETULE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
   setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
@@ -66,14 +78,66 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::ADD, MVT::i64, Legal);
   setOperationAction(ISD::ADD, MVT::i32, Legal);
+  setOperationAction(ISD::ADDC, MVT::i32, Legal);
+  setOperationAction(ISD::ADDE, MVT::i32, Legal);
+
+  setOperationAction(ISD::BITCAST, MVT::i128, Legal);
+
+  // We need to custom lower vector stores from local memory
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
+
+  setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v16i32, Custom);
+
+  // We need to custom lower loads/stores from private memory
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::i64, Custom);
+  setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i64, Custom);
+  setOperationAction(ISD::STORE, MVT::i128, Custom);
+  setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+
 
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
 
   setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
 
-  setOperationAction(ISD::STORE, MVT::i32, Custom);
-  setOperationAction(ISD::STORE, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::v2i1, Expand);
+  setOperationAction(ISD::SETCC, MVT::v4i1, Expand);
+
+  setOperationAction(ISD::ANY_EXTEND, MVT::i64, Custom);
+  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
+  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
+
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
+  setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
+  setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
 
   setTargetDAGCombine(ISD::SELECT_CC);
 
@@ -82,12 +146,45 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setSchedulingPreference(Sched::RegPressure);
 }
 
+//===----------------------------------------------------------------------===//
+// TargetLowering queries
+//===----------------------------------------------------------------------===//
+
+bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT  VT,
+                                                     bool *IsFast) const {
+  // XXX: This depends on the address space and also we may want to revist
+  // the alignment values we specify in the DataLayout.
+  if (!VT.isSimple() || VT == MVT::Other)
+    return false;
+  return VT.bitsGT(MVT::i32);
+}
+
+bool SITargetLowering::shouldSplitVectorElementType(EVT VT) const {
+  return VT.bitsLE(MVT::i16);
+}
+
+SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                         SDLoc DL, SDValue Chain,
+                                         unsigned Offset) const {
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                            AMDGPUAS::CONSTANT_ADDRESS);
+  SDValue BasePtr =  DAG.getCopyFromReg(Chain, DL,
+                           MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
+  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                             DAG.getConstant(Offset, MVT::i64));
+  return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
+                            MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
+                            false, false, MemVT.getSizeInBits() >> 3);
+
+}
+
 SDValue SITargetLowering::LowerFormalArguments(
                                       SDValue Chain,
                                       CallingConv::ID CallConv,
                                       bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      DebugLoc DL, SelectionDAG &DAG,
+                                      SDLoc DL, SelectionDAG &DAG,
                                       SmallVectorImpl<SDValue> &InVals) const {
 
   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
@@ -103,9 +200,10 @@ SDValue SITargetLowering::LowerFormalArguments(
 
   for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
     const ISD::InputArg &Arg = Ins[i];
-   
-    // First check if it's a PS input addr 
-    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg()) {
+
+    // First check if it's a PS input addr
+    if (Info->ShaderType == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
+        !Arg.Flags.isByVal()) {
 
       assert((PSInputNum <= 15) && "Too many PS inputs!");
 
@@ -120,7 +218,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     }
 
     // Second split vertices into their elements
-    if (Arg.VT.isVector()) {
+    if (Info->ShaderType != ShaderType::COMPUTE && Arg.VT.isVector()) {
       ISD::InputArg NewArg = Arg;
       NewArg.Flags.setSplit();
       NewArg.VT = Arg.VT.getVectorElementType();
@@ -136,7 +234,7 @@ SDValue SITargetLowering::LowerFormalArguments(
         NewArg.PartOffset += NewArg.VT.getStoreSize();
       }
 
-    } else {
+    } else if (Info->ShaderType != ShaderType::COMPUTE) {
       Splits.push_back(Arg);
     }
   }
@@ -152,20 +250,44 @@ SDValue SITargetLowering::LowerFormalArguments(
     CCInfo.AllocateReg(AMDGPU::VGPR1);
   }
 
+  // The pointer to the list of arguments is stored in SGPR0, SGPR1
+  if (Info->ShaderType == ShaderType::COMPUTE) {
+    CCInfo.AllocateReg(AMDGPU::SGPR0);
+    CCInfo.AllocateReg(AMDGPU::SGPR1);
+    MF.addLiveIn(AMDGPU::SGPR0_SGPR1, &AMDGPU::SReg_64RegClass);
+  }
+
+  if (Info->ShaderType == ShaderType::COMPUTE) {
+    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
+                            Splits);
+  }
+
   AnalyzeFormalArguments(CCInfo, Splits);
 
   for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
 
+    const ISD::InputArg &Arg = Ins[i];
     if (Skipped & (1 << i)) {
-      InVals.push_back(SDValue());
+      InVals.push_back(DAG.getUNDEF(Arg.VT));
       continue;
     }
 
     CCValAssign &VA = ArgLocs[ArgIdx++];
+    EVT VT = VA.getLocVT();
+
+    if (VA.isMemLoc()) {
+      VT = Ins[i].VT;
+      EVT MemVT = Splits[i].VT;
+      // The first 36 bytes of the input buffer contains information about
+      // thread group and global sizes.
+      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
+                                   36 + VA.getLocMemOffset());
+      InVals.push_back(Arg);
+      continue;
+    }
     assert(VA.isRegLoc() && "Parameter must be in a register!");
 
     unsigned Reg = VA.getLocReg();
-    MVT VT = VA.getLocVT();
 
     if (VT == MVT::i64) {
       // For now assume it is a pointer
@@ -181,7 +303,6 @@ SDValue SITargetLowering::LowerFormalArguments(
     Reg = MF.addLiveIn(Reg, RC);
     SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
 
-    const ISD::InputArg &Arg = Ins[i];
     if (Arg.VT.isVector()) {
 
       // Build a vector from the registers
@@ -200,7 +321,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       NumElements = Arg.VT.getVectorNumElements() - NumElements;
       for (unsigned j = 0; j != NumElements; ++j)
         Regs.push_back(DAG.getUNDEF(VT));
- 
+
       InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
                                    Regs.data(), Regs.size()));
       continue;
@@ -214,36 +335,274 @@ SDValue SITargetLowering::LowerFormalArguments(
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
 
+  MachineBasicBlock::iterator I = *MI;
+
   switch (MI->getOpcode()) {
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH: return BB;
+  case AMDGPU::SI_ADDR64_RSRC: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    unsigned SuperReg = MI->getOperand(0).getReg();
+    unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    unsigned SubRegHiHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    unsigned SubRegHiLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), SubRegLo)
+            .addOperand(MI->getOperand(1));
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiLo)
+            .addImm(0);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32), SubRegHiHi)
+            .addImm(RSRC_DATA_FORMAT >> 32);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SubRegHi)
+            .addReg(SubRegHiLo)
+            .addImm(AMDGPU::sub0)
+            .addReg(SubRegHiHi)
+            .addImm(AMDGPU::sub1);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::REG_SEQUENCE), SuperReg)
+            .addReg(SubRegLo)
+            .addImm(AMDGPU::sub0_sub1)
+            .addReg(SubRegHi)
+            .addImm(AMDGPU::sub2_sub3);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::V_SUB_F64: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(MI->getOperand(2).getReg())
+            .addImm(0)  /* src2 */
+            .addImm(0)  /* ABS */
+            .addImm(0)  /* CLAMP */
+            .addImm(0)  /* OMOD */
+            .addImm(2); /* NEG */
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::SI_RegisterStorePseudo: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+    MachineInstrBuilder MIB =
+        BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
+                Reg);
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i)
+      MIB.addOperand(MI->getOperand(i));
+
+    MI->eraseFromParent();
+  }
   }
   return BB;
 }
 
-EVT SITargetLowering::getSetCCResultType(EVT VT) const {
-  return MVT::i1;
+EVT SITargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector()) {
+    return MVT::i1;
+  }
+  return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
 }
 
 MVT SITargetLowering::getScalarShiftAmountTy(EVT VT) const {
   return MVT::i32;
 }
 
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+    return false; /* There is V_MAD_F32 for f32 */
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
 
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::ADD: return LowerADD(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::LOAD: {
+    LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
+    if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+         Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+        Op.getValueType().isVector()) {
+      SDValue MergedValues[2] = {
+        SplitVectorLoad(Op, DAG),
+        Load->getChain()
+      };
+      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
+    } else {
+      return LowerLOAD(Op, DAG);
+    }
+  }
+
   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
   case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::ANY_EXTEND: // Fall-through
+  case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
+  case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID =
+                         cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    EVT VT = Op.getValueType();
+    SDLoc DL(Op);
+    //XXX: Hardcoded we only use two to store the pointer to the parameters.
+    unsigned NumUserSGPRs = 2;
+    switch (IntrinsicID) {
+    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    case Intrinsic::r600_read_ngroups_x:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
+    case Intrinsic::r600_read_ngroups_y:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
+    case Intrinsic::r600_read_ngroups_z:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
+    case Intrinsic::r600_read_global_size_x:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
+    case Intrinsic::r600_read_global_size_y:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
+    case Intrinsic::r600_read_global_size_z:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
+    case Intrinsic::r600_read_local_size_x:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
+    case Intrinsic::r600_read_local_size_y:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
+    case Intrinsic::r600_read_local_size_z:
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
+    case Intrinsic::r600_read_tgid_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
+    case Intrinsic::r600_read_tgid_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 1), VT);
+    case Intrinsic::r600_read_tgid_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
+                     AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 2), VT);
+    case Intrinsic::r600_read_tidig_x:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR0, VT);
+    case Intrinsic::r600_read_tidig_y:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR1, VT);
+    case Intrinsic::r600_read_tidig_z:
+      return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass,
+                                  AMDGPU::VGPR2, VT);
+    case AMDGPUIntrinsic::SI_load_const: {
+      SDValue Ops [] = {
+        ResourceDescriptorToi128(Op.getOperand(1), DAG),
+        Op.getOperand(2)
+      };
+
+      MachineMemOperand *MMO = MF.getMachineMemOperand(
+          MachinePointerInfo(),
+          MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
+          VT.getSizeInBits() / 8, 4);
+      return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
+                                     Op->getVTList(), Ops, 2, VT, MMO);
+    }
+    case AMDGPUIntrinsic::SI_sample:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampleb:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB, Op, DAG);
+    case AMDGPUIntrinsic::SI_sampled:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLED, Op, DAG);
+    case AMDGPUIntrinsic::SI_samplel:
+      return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
+    case AMDGPUIntrinsic::SI_vs_load_input:
+      return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
+                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
+                         Op.getOperand(2),
+                         Op.getOperand(3));
+    }
+  }
+
+  case ISD::INTRINSIC_VOID:
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    switch (IntrinsicID) {
+      case AMDGPUIntrinsic::SI_tbuffer_store: {
+        SDLoc DL(Op);
+        SDValue Ops [] = {
+          Chain,
+          ResourceDescriptorToi128(Op.getOperand(2), DAG),
+          Op.getOperand(3),
+          Op.getOperand(4),
+          Op.getOperand(5),
+          Op.getOperand(6),
+          Op.getOperand(7),
+          Op.getOperand(8),
+          Op.getOperand(9),
+          Op.getOperand(10),
+          Op.getOperand(11),
+          Op.getOperand(12),
+          Op.getOperand(13),
+          Op.getOperand(14)
+        };
+        EVT VT = Op.getOperand(3).getValueType();
+
+        MachineMemOperand *MMO = MF.getMachineMemOperand(
+            MachinePointerInfo(),
+            MachineMemOperand::MOStore,
+            VT.getSizeInBits() / 8, 4);
+        return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
+                                       Op->getVTList(), Ops,
+                                       sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+      }
+      default:
+        break;
+    }
   }
   return SDValue();
 }
 
+SDValue SITargetLowering::LowerADD(SDValue Op,
+                                   SelectionDAG &DAG) const {
+  if (Op.getValueType() != MVT::i64)
+    return SDValue();
+
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  SDValue One = DAG.getConstant(1, MVT::i32);
+
+  SDValue Lo0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, Zero);
+  SDValue Hi0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, LHS, One);
+
+  SDValue Lo1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, Zero);
+  SDValue Hi1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, RHS, One);
+
+  SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Glue);
+
+  SDValue AddLo = DAG.getNode(ISD::ADDC, DL, VTList, Lo0, Lo1);
+  SDValue Carry = AddLo.getValue(1);
+  SDValue AddHi = DAG.getNode(ISD::ADDE, DL, VTList, Hi0, Hi1, Carry);
+
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddLo, AddHi.getValue(0));
+}
+
 /// \brief Helper function for LowerBRCOND
 static SDNode *findUser(SDValue Value, unsigned Opcode) {
 
@@ -265,7 +624,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
 SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
                                       SelectionDAG &DAG) const {
 
-  DebugLoc DL = BRCOND.getDebugLoc();
+  SDLoc DL(BRCOND);
 
   SDNode *Intr = BRCOND.getOperand(1).getNode();
   SDValue Target = BRCOND.getOperand(2);
@@ -338,30 +697,51 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
-#define RSRC_DATA_FORMAT 0xf00000000000
-
-SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
-  SDValue Chain = Op.getOperand(0);
-  SDValue Value = Op.getOperand(1);
-  SDValue VirtualAddress = Op.getOperand(2);
-  DebugLoc DL = Op.getDebugLoc();
+SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  LoadSDNode *Load = cast<LoadSDNode>(Op);
 
-  if (StoreNode->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS) {
+  if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
     return SDValue();
-  }
 
-  SDValue SrcSrc = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
-                               DAG.getConstant(0, MVT::i64),
-			       DAG.getConstant(RSRC_DATA_FORMAT, MVT::i64));
+  SDValue TruncPtr = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                                 Load->getBasePtr(), DAG.getConstant(0, MVT::i32));
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                            DAG.getConstant(2, MVT::i32));
+
+  SDValue Ret = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op.getValueType(),
+                            Load->getChain(), Ptr,
+                            DAG.getTargetConstant(0, MVT::i32),
+                            Op.getOperand(2));
+  SDValue MergedValues[2] = {
+    Ret,
+    Load->getChain()
+  };
+  return DAG.getMergeValues(MergedValues, 2, DL);
+
+}
+
+SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
+                                             SelectionDAG &DAG) const {
 
-  SDValue Ops[2];
-  Ops[0] = DAG.getNode(AMDGPUISD::BUFFER_STORE, DL, MVT::Other, Chain,
-                       Value, SrcSrc, VirtualAddress);
-  Ops[1] = Chain;
+  if (Op.getValueType() == MVT::i128) {
+    return Op;
+  }
 
-  return DAG.getMergeValues(Ops, 2, DL);
+  assert(Op.getOpcode() == ISD::UNDEF);
 
+  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
+                     DAG.getConstant(0, MVT::i64),
+                     DAG.getConstant(0, MVT::i64));
+}
+
+SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
+                                               const SDValue &Op,
+                                               SelectionDAG &DAG) const {
+  return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
+                     Op.getOperand(2),
+                     ResourceDescriptorToi128(Op.getOperand(3), DAG),
+                     Op.getOperand(4));
 }
 
 SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -371,7 +751,7 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue False = Op.getOperand(3);
   SDValue CC = Op.getOperand(4);
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // Possible Min/Max pattern
   SDValue MinMax = LowerMinMax(Op, DAG);
@@ -383,6 +763,84 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
 }
 
+SDValue SITargetLowering::LowerSIGN_EXTEND(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  if (VT != MVT::i64) {
+    return SDValue();
+  }
+
+  SDValue Hi = DAG.getNode(ISD::SRA, DL, MVT::i32, Op.getOperand(0),
+                                                 DAG.getConstant(31, MVT::i32));
+
+  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0), Hi);
+}
+
+SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT VT = Store->getMemoryVT();
+
+  SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
+  if (Ret.getNode())
+    return Ret;
+
+  if (VT.isVector() && VT.getVectorNumElements() >= 8)
+      return SplitVectorStore(Op, DAG);
+
+  if (Store->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+    return SDValue();
+
+  SDValue TruncPtr = DAG.getZExtOrTrunc(Store->getBasePtr(), DL, MVT::i32);
+  SDValue Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, TruncPtr,
+                            DAG.getConstant(2, MVT::i32));
+  SDValue Chain = Store->getChain();
+  SmallVector<SDValue, 8> Values;
+
+  if (VT == MVT::i64) {
+    for (unsigned i = 0; i < 2; ++i) {
+      Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                       Store->getValue(), DAG.getConstant(i, MVT::i32)));
+    }
+  } else if (VT == MVT::i128) {
+    for (unsigned i = 0; i < 2; ++i) {
+      for (unsigned j = 0; j < 2; ++j) {
+        Values.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+                           DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
+                           Store->getValue(), DAG.getConstant(i, MVT::i32)),
+                         DAG.getConstant(j, MVT::i32)));
+      }
+    }
+  } else {
+    Values.push_back(Store->getValue());
+  }
+
+  for (unsigned i = 0; i < Values.size(); ++i) {
+    SDValue PartPtr = DAG.getNode(ISD::ADD, DL, MVT::i32,
+                                  Ptr, DAG.getConstant(i, MVT::i32));
+    Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
+                        Chain, Values[i], PartPtr,
+                        DAG.getTargetConstant(0, MVT::i32));
+  }
+  return Chain;
+}
+
+
+SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+
+  if (VT != MVT::i64) {
+    return SDValue();
+  }
+
+  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
+                                              DAG.getConstant(0, MVT::i32));
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG optimizations
 //===----------------------------------------------------------------------===//
@@ -390,13 +848,12 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   EVT VT = N->getValueType(0);
 
   switch (N->getOpcode()) {
     default: break;
     case ISD::SELECT_CC: {
-      N->dump();
       ConstantSDNode *True, *False;
       // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
       if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2)))
@@ -433,13 +890,13 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
-/// \brief Test if RegClass is one of the VSrc classes 
+/// \brief Test if RegClass is one of the VSrc classes
 static bool isVSrc(unsigned RegClass) {
   return AMDGPU::VSrc_32RegClassID == RegClass ||
          AMDGPU::VSrc_64RegClassID == RegClass;
 }
 
-/// \brief Test if RegClass is one of the SSrc classes 
+/// \brief Test if RegClass is one of the SSrc classes
 static bool isSSrc(unsigned RegClass) {
   return AMDGPU::SSrc_32RegClassID == RegClass ||
          AMDGPU::SSrc_64RegClassID == RegClass;
@@ -481,6 +938,8 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
                                bool &ScalarSlotUsed) const {
 
   MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
   if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
     return false;
 
@@ -512,30 +971,67 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
   return false;
 }
 
+const TargetRegisterClass *SITargetLowering::getRegClassForNode(
+                                   SelectionDAG &DAG, const SDValue &Op) const {
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+  if (!Op->isMachineOpcode()) {
+    switch(Op->getOpcode()) {
+    case ISD::CopyFromReg: {
+      MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+      unsigned Reg = cast<RegisterSDNode>(Op->getOperand(1))->getReg();
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+        return MRI.getRegClass(Reg);
+      }
+      return TRI.getPhysRegClass(Reg);
+    }
+    default:  return NULL;
+    }
+  }
+  const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
+  int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
+  if (OpClassID != -1) {
+    return TRI.getRegClass(OpClassID);
+  }
+  switch(Op.getMachineOpcode()) {
+  case AMDGPU::COPY_TO_REGCLASS:
+    // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
+    OpClassID = cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue();
+
+    // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
+    // class, then the register class for the value could be either a
+    // VReg or and SReg.  In order to get a more accurate
+    if (OpClassID == AMDGPU::VSrc_32RegClassID ||
+        OpClassID == AMDGPU::VSrc_64RegClassID) {
+      return getRegClassForNode(DAG, Op.getOperand(0));
+    }
+    return TRI.getRegClass(OpClassID);
+  case AMDGPU::EXTRACT_SUBREG: {
+    int SubIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    const TargetRegisterClass *SuperClass =
+      getRegClassForNode(DAG, Op.getOperand(0));
+    return TRI.getSubClassWithSubReg(SuperClass, SubIdx);
+  }
+  case AMDGPU::REG_SEQUENCE:
+    // Operand 0 is the register class id for REG_SEQUENCE instructions.
+    return TRI.getRegClass(
+      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue());
+  default:
+    return getRegClassFor(Op.getSimpleValueType());
+  }
+}
+
 /// \brief Does "Op" fit into register class "RegClass" ?
-bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, SDValue &Op,
+bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                     unsigned RegClass) const {
-
-  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); 
-  SDNode *Node = Op.getNode();
-
-  const TargetRegisterClass *OpClass;
-  if (MachineSDNode *MN = dyn_cast<MachineSDNode>(Node)) {
-    const MCInstrDesc &Desc = TII->get(MN->getMachineOpcode());
-    int OpClassID = Desc.OpInfo[Op.getResNo()].RegClass;
-    if (OpClassID == -1)
-      OpClass = getRegClassFor(Op.getSimpleValueType());
-    else
-      OpClass = TRI->getRegClass(OpClassID);
-
-  } else if (Node->getOpcode() == ISD::CopyFromReg) {
-    RegisterSDNode *Reg = cast<RegisterSDNode>(Node->getOperand(1).getNode());
-    OpClass = MRI.getRegClass(Reg->getReg());
-
-  } else
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
+  if (!RC) {
     return false;
-
-  return TRI->getRegClass(RegClass)->hasSubClassEq(OpClass);
+  }
+  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
 }
 
 /// \brief Make sure that we don't exeed the number of allowed scalars
@@ -561,20 +1057,33 @@ void SITargetLowering::ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
     return;
   }
 
-  // This is a conservative aproach, it is possible that we can't determine
-  // the correct register class and copy too often, but better save than sorry.
+  // This is a conservative aproach. It is possible that we can't determine the
+  // correct register class and copy too often, but better safe than sorry.
   SDValue RC = DAG.getTargetConstant(RegClass, MVT::i32);
-  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DebugLoc(),
+  SDNode *Node = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS, SDLoc(),
                                     Operand.getValueType(), Operand, RC);
   Operand = SDValue(Node, 0);
 }
 
+/// \returns true if \p Node's operands are different from the SDValue list
+/// \p Ops
+static bool isNodeChanged(const SDNode *Node, const std::vector<SDValue> &Ops) {
+  for (unsigned i = 0, e = Node->getNumOperands(); i < e; ++i) {
+    if (Ops[i].getNode() != Node->getOperand(i).getNode()) {
+      return true;
+    }
+  }
+  return false;
+}
+
 /// \brief Try to fold the Nodes operands into the Node
 SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
                                        SelectionDAG &DAG) const {
 
   // Original encoding (either e32 or e64)
   int Opcode = Node->getMachineOpcode();
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
   const MCInstrDesc *Desc = &TII->get(Opcode);
 
   unsigned NumDefs = Desc->getNumDefs();
@@ -700,13 +1209,19 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
   for (unsigned i = NumOps - NumDefs, e = Node->getNumOperands(); i < e; ++i)
     Ops.push_back(Node->getOperand(i));
 
+  // Nodes that have a glue result are not CSE'd by getMachineNode(), so in
+  // this case a brand new node is always be created, even if the operands
+  // are the same as before.  So, manually check if anything has been changed.
+  if (Desc->Opcode == Opcode && !isNodeChanged(Node, Ops)) {
+    return Node;
+  }
+
   // Create a complete new instruction
-  return DAG.getMachineNode(Desc->Opcode, Node->getDebugLoc(),
-                            Node->getVTList(), Ops);
+  return DAG.getMachineNode(Desc->Opcode, SDLoc(Node), Node->getVTList(), Ops);
 }
 
 /// \brief Helper function for adjustWritemask
-unsigned SubIdx2Lane(unsigned Idx) {
+static unsigned SubIdx2Lane(unsigned Idx) {
   switch (Idx) {
   default: return 0;
   case AMDGPU::sub0: return 0;
@@ -720,7 +1235,9 @@ unsigned SubIdx2Lane(unsigned Idx) {
 void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                        SelectionDAG &DAG) const {
   SDNode *Users[4] = { };
-  unsigned Writemask = 0, Lane = 0;
+  unsigned Lane = 0;
+  unsigned OldDmask = Node->getConstantOperandVal(0);
+  unsigned NewDmask = 0;
 
   // Try to figure out the used register components
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -731,32 +1248,45 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
       return;
 
+    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+    // Note that subregs are packed, i.e. Lane==0 is the first bit set
+    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+    // set, etc.
     Lane = SubIdx2Lane(I->getConstantOperandVal(1));
 
+    // Set which texture component corresponds to the lane.
+    unsigned Comp;
+    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+      assert(Dmask);
+      Comp = countTrailingZeros(Dmask);
+      Dmask &= ~(1 << Comp);
+    }
+
     // Abort if we have more than one user per component
     if (Users[Lane])
       return;
 
     Users[Lane] = *I;
-    Writemask |= 1 << Lane;
+    NewDmask |= 1 << Comp;
   }
 
-  // Abort if all components are used
-  if (Writemask == 0xf)
+  // Abort if there's no change
+  if (NewDmask == OldDmask)
     return;
 
   // Adjust the writemask in the node
   std::vector<SDValue> Ops;
-  Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
+  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
   for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
     Ops.push_back(Node->getOperand(i));
   Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
 
   // If we only got one lane, replace it with a copy
-  if (Writemask == (1U << Lane)) {
+  // (if NewDmask has only one bit set...)
+  if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
     SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
     SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
-                                      DebugLoc(), Users[Lane]->getValueType(0),
+                                      SDLoc(), Users[Lane]->getValueType(0),
                                       SDValue(Node, 0), RC);
     DAG.ReplaceAllUsesWith(Users[Lane], Copy);
     return;
@@ -784,8 +1314,11 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
 /// \brief Fold the instructions after slecting them
 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                           SelectionDAG &DAG) const {
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  Node = AdjustRegClass(Node, DAG);
 
-  if (AMDGPU::isMIMG(Node->getMachineOpcode()) != -1)
+  if (TII->isMIMG(Node->getMachineOpcode()))
     adjustWritemask(Node, DAG);
 
   return foldOperands(Node, DAG);
@@ -795,7 +1328,9 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
 /// bits set in the writemask
 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                      SDNode *Node) const {
-  if (AMDGPU::isMIMG(MI->getOpcode()) == -1)
+  const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  if (!TII->isMIMG(MI->getOpcode()))
     return;
 
   unsigned VReg = MI->getOperand(0).getReg();
@@ -812,6 +1347,53 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
   case 3:  RC = &AMDGPU::VReg_96RegClass; break;
   }
 
+  unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
+  MI->setDesc(TII->get(NewOpcode));
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   MRI.setRegClass(VReg, RC);
 }
+
+MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
+                                                SelectionDAG &DAG) const {
+
+  SDLoc DL(N);
+  unsigned NewOpcode = N->getMachineOpcode();
+
+  switch (N->getMachineOpcode()) {
+  default: return N;
+  case AMDGPU::S_LOAD_DWORD_IMM:
+    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+    // Fall-through
+  case AMDGPU::S_LOAD_DWORDX2_SGPR:
+    if (NewOpcode == N->getMachineOpcode()) {
+      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+    }
+    // Fall-through
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
+    if (NewOpcode == N->getMachineOpcode()) {
+      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+    }
+    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
+      return N;
+    }
+    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));
+    SDValue Ops[] = {
+      SDValue(DAG.getMachineNode(AMDGPU::SI_ADDR64_RSRC, DL, MVT::i128,
+                                 DAG.getConstant(0, MVT::i64)), 0),
+      N->getOperand(0),
+      DAG.getConstant(Offset->getSExtValue() << 2, MVT::i32)
+    };
+    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
+  }
+  }
+}
+
+SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
+                                               const TargetRegisterClass *RC,
+                                               unsigned Reg, EVT VT) const {
+  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);
+
+  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
+                            cast<RegisterSDNode>(VReg)->getReg(), VT);
+}
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index de637be..9933ece 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -21,35 +21,48 @@
 namespace llvm {
 
 class SITargetLowering : public AMDGPUTargetLowering {
-  const SIInstrInfo * TII;
-  const TargetRegisterInfo * TRI;
-
-  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
+                         SDValue Chain, unsigned Offset) const;
+  SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
+                               SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const;
   bool foldImm(SDValue &Operand, int32_t &Immediate,
                bool &ScalarSlotUsed) const;
-  bool fitsRegClass(SelectionDAG &DAG, SDValue &Op, unsigned RegClass) const;
-  void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand, 
+  const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
+                                                const SDValue &Op) const;
+  bool fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
+                    unsigned RegClass) const;
+  void ensureSRegLimit(SelectionDAG &DAG, SDValue &Operand,
                        unsigned RegClass, bool &ScalarSlotUsed) const;
 
   SDNode *foldOperands(MachineSDNode *N, SelectionDAG &DAG) const;
   void adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
+  MachineSDNode *AdjustRegClass(MachineSDNode *N, SelectionDAG &DAG) const;
 
 public:
   SITargetLowering(TargetMachine &tm);
+  bool allowsUnalignedMemoryAccesses(EVT  VT, bool *IsFast) const;
+  virtual bool shouldSplitVectorElementType(EVT VT) const;
 
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
-                               DebugLoc DL, SelectionDAG &DAG,
+                               SDLoc DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const;
 
   virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                               MachineBasicBlock * BB) const;
-  virtual EVT getSetCCResultType(EVT VT) const;
+  virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
   virtual MVT getScalarShiftAmountTy(EVT VT) const;
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
   virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
   virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
@@ -57,6 +70,8 @@ public:
                                              SDNode *Node) const;
 
   int32_t analyzeImmediate(const SDNode *N) const;
+  SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
+                               unsigned Reg, EVT VT) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 98bd3db..7ef662e 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -47,7 +47,7 @@ class SIInsertWaits : public MachineFunctionPass {
 private:
   static char ID;
   const SIInstrInfo *TII;
-  const SIRegisterInfo &TRI;
+  const SIRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
 
   /// \brief Constant hardware limits
@@ -97,8 +97,9 @@ private:
 public:
   SIInsertWaits(TargetMachine &tm) :
     MachineFunctionPass(ID),
-    TII(static_cast<const SIInstrInfo*>(tm.getInstrInfo())),
-    TRI(TII->getRegisterInfo()) { }
+    TII(0),
+    TRI(0),
+    ExpInstrTypesSeen(0) { }
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -133,12 +134,19 @@ Counters SIInsertWaits::getHwCounts(MachineInstr &MI) {
   // LGKM may uses larger values
   if (TSFlags & SIInstrFlags::LGKM_CNT) {
 
-    MachineOperand &Op = MI.getOperand(0);
-    assert(Op.isReg() && "First LGKM operand must be a register!");
+    if (TII->isSMRD(MI.getOpcode())) {
 
-    unsigned Reg = Op.getReg();
-    unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
-    Result.Named.LGKM = Size > 4 ? 2 : 1;
+      MachineOperand &Op = MI.getOperand(0);
+      assert(Op.isReg() && "First LGKM operand must be a register!");
+
+      unsigned Reg = Op.getReg();
+      unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+      Result.Named.LGKM = Size > 4 ? 2 : 1;
+
+    } else {
+      // DS
+      Result.Named.LGKM = 1;
+    }
 
   } else {
     Result.Named.LGKM = 0;
@@ -178,16 +186,16 @@ bool SIInsertWaits::isOpRelevant(MachineOperand &Op) {
 
 RegInterval SIInsertWaits::getRegInterval(MachineOperand &Op) {
 
-  if (!Op.isReg())
+  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()))
     return std::make_pair(0, 0);
 
   unsigned Reg = Op.getReg();
-  unsigned Size = TRI.getMinimalPhysRegClass(Reg)->getSize();
+  unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
 
   assert(Size >= 4);
 
   RegInterval Result;
-  Result.first = TRI.getEncodingValue(Reg);
+  Result.first = TRI->getEncodingValue(Reg);
   Result.second = Result.first + Size / 4;
 
   return Result;
@@ -328,9 +336,11 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
 }
 
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
-
   bool Changes = false;
 
+  TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
+  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
+
   MRI = &MF.getRegInfo();
 
   WaitedOn = ZeroCounts;
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index f737ddd..53ebaaf 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -17,10 +17,24 @@ class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
   field bits<1> LGKM_CNT = 0;
+  field bits<1> MIMG = 0;
+  field bits<1> SMRD = 0;
+  field bits<1> VOP1 = 0;
+  field bits<1> VOP2 = 0;
+  field bits<1> VOP3 = 0;
+  field bits<1> VOPC = 0;
+  field bits<1> SALU = 0;
 
   let TSFlags{0} = VM_CNT;
   let TSFlags{1} = EXP_CNT;
   let TSFlags{2} = LGKM_CNT;
+  let TSFlags{3} = MIMG;
+  let TSFlags{4} = SMRD;
+  let TSFlags{5} = VOP1;
+  let TSFlags{6} = VOP2;
+  let TSFlags{7} = VOP3;
+  let TSFlags{8} = VOPC;
+  let TSFlags{9} = SALU;
 }
 
 class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> :
@@ -55,6 +69,7 @@ class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -73,6 +88,7 @@ class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -90,6 +106,7 @@ class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -106,6 +123,7 @@ class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
@@ -123,6 +141,7 @@ class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 <
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let SALU = 1;
 }
 
 class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
@@ -140,6 +159,7 @@ class SMRD <bits<5> op, bits<1> imm, dag outs, dag ins, string asm,
   let Inst{31-27} = 0x18; //encoding
 
   let LGKM_CNT = 1;
+  let SMRD = 1;
 }
 
 //===----------------------------------------------------------------------===//
@@ -162,6 +182,8 @@ class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP1 = 1;
 }
 
 class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -180,60 +202,66 @@ class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP2 = 1;
 }
 
 class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64 <outs, ins, asm, pattern> {
 
-  bits<8> VDST;
-  bits<9> SRC0;
-  bits<9> SRC1;
-  bits<9> SRC2;
-  bits<3> ABS; 
-  bits<1> CLAMP;
-  bits<2> OMOD;
-  bits<3> NEG;
-
-  let Inst{7-0} = VDST;
-  let Inst{10-8} = ABS;
-  let Inst{11} = CLAMP;
+  bits<8> dst;
+  bits<9> src0;
+  bits<9> src1;
+  bits<9> src2;
+  bits<3> abs;
+  bits<1> clamp;
+  bits<2> omod;
+  bits<3> neg;
+
+  let Inst{7-0} = dst;
+  let Inst{10-8} = abs;
+  let Inst{11} = clamp;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = SRC0;
-  let Inst{49-41} = SRC1;
-  let Inst{58-50} = SRC2;
-  let Inst{60-59} = OMOD;
-  let Inst{63-61} = NEG;
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{63-61} = neg;
   
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP3 = 1;
 }
 
 class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64 <outs, ins, asm, pattern> {
 
-  bits<8> VDST;
-  bits<9> SRC0;
-  bits<9> SRC1;
-  bits<9> SRC2;
-  bits<7> SDST;
-  bits<2> OMOD;
-  bits<3> NEG;
+  bits<8> dst;
+  bits<9> src0;
+  bits<9> src1;
+  bits<9> src2;
+  bits<7> sdst;
+  bits<2> omod;
+  bits<3> neg;
 
-  let Inst{7-0} = VDST;
-  let Inst{14-8} = SDST;
+  let Inst{7-0} = dst;
+  let Inst{14-8} = sdst;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
-  let Inst{40-32} = SRC0;
-  let Inst{49-41} = SRC1;
-  let Inst{58-50} = SRC2;
-  let Inst{60-59} = OMOD;
-  let Inst{63-61} = NEG;
+  let Inst{40-32} = src0;
+  let Inst{49-41} = src1;
+  let Inst{58-50} = src2;
+  let Inst{60-59} = omod;
+  let Inst{63-61} = neg;
 
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let UseNamedOperandTable = 1;
+  let VOP3 = 1;
 }
 
 class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
@@ -251,6 +279,7 @@ class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> :
   let mayLoad = 0;
   let mayStore = 0;
   let hasSideEffects = 0;
+  let VOPC = 1;
 }
 
 class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
@@ -281,6 +310,30 @@ class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> :
 
 let Uses = [EXEC] in {
 
+class DS <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> :
+    Enc64 <outs, ins, asm, pattern> {
+
+  bits<8> vdst;
+  bits<1> gds;
+  bits<8> addr;
+  bits<8> data0;
+  bits<8> data1;
+  bits<8> offset0;
+  bits<8> offset1;
+
+  let Inst{7-0} = offset0;
+  let Inst{15-8} = offset1;
+  let Inst{17} = gds;
+  let Inst{25-18} = op;
+  let Inst{31-26} = 0x36; //encoding
+  let Inst{39-32} = addr;
+  let Inst{47-40} = data0;
+  let Inst{55-48} = data1;
+  let Inst{63-56} = vdst;
+
+  let LGKM_CNT = 1;
+}
+
 class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64<outs, ins, asm, pattern> {
 
@@ -390,6 +443,7 @@ class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> :
 
   let VM_CNT = 1;
   let EXP_CNT = 1;
+  let MIMG = 1;
 }
 
 def EXP : Enc64<
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 9a04c60..ab55c1b 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -15,22 +15,26 @@
 
 #include "SIInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "SIDefines.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCInstrDesc.h"
-#include <stdio.h>
 
 using namespace llvm;
 
 SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm)
   : AMDGPUInstrInfo(tm),
-    RI(tm, *this)
+    RI(tm)
     { }
 
 const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const {
   return RI;
 }
 
+//===----------------------------------------------------------------------===//
+// TargetInstrInfo callbacks
+//===----------------------------------------------------------------------===//
+
 void
 SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                          MachineBasicBlock::iterator MI, DebugLoc DL,
@@ -42,27 +46,27 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   // never be necessary.
   assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
 
-  const int16_t Sub0_15[] = {
+  static const int16_t Sub0_15[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
     AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
     AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15, 0
   };
 
-  const int16_t Sub0_7[] = {
+  static const int16_t Sub0_7[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
     AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, 0
   };
 
-  const int16_t Sub0_3[] = {
+  static const int16_t Sub0_3[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, 0
   };
 
-  const int16_t Sub0_2[] = {
+  static const int16_t Sub0_2[] = {
     AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, 0
   };
 
-  const int16_t Sub0_1[] = {
+  static const int16_t Sub0_1[] = {
     AMDGPU::sub0, AMDGPU::sub1, 0
   };
 
@@ -118,14 +122,14 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_32RegClass.contains(SrcReg));
+           AMDGPU::SReg_32RegClass.contains(SrcReg));
     BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
     return;
 
   } else if (AMDGPU::VReg_64RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_64RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_64RegClass.contains(SrcReg));
+           AMDGPU::SReg_64RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_1;
 
@@ -136,19 +140,19 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   } else if (AMDGPU::VReg_128RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_128RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_128RegClass.contains(SrcReg));
+           AMDGPU::SReg_128RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_3;
 
   } else if (AMDGPU::VReg_256RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_256RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_256RegClass.contains(SrcReg));
+           AMDGPU::SReg_256RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_7;
 
   } else if (AMDGPU::VReg_512RegClass.contains(DestReg)) {
     assert(AMDGPU::VReg_512RegClass.contains(SrcReg) ||
-	   AMDGPU::SReg_512RegClass.contains(SrcReg));
+           AMDGPU::SReg_512RegClass.contains(SrcReg));
     Opcode = AMDGPU::V_MOV_B32_e32;
     SubIndices = Sub0_15;
 
@@ -168,7 +172,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 }
 
 unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
-
   int NewOpc;
 
   // Try to map original to commuted opcode
@@ -185,11 +188,36 @@ unsigned SIInstrInfo::commuteOpcode(unsigned Opcode) const {
 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                                               bool NewMI) const {
 
-  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg() ||
-      !MI->getOperand(2).isReg())
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
     return 0;
 
-  MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+  // Cannot commute VOP2 if src0 is SGPR.
+  if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
+      RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
+   return 0;
+
+  if (!MI->getOperand(2).isReg()) {
+    // XXX: Commute instructions with FPImm operands
+    if (NewMI || MI->getOperand(2).isFPImm() ||
+       (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
+      return 0;
+    }
+
+    // XXX: Commute VOP3 instructions with abs and neg set.
+    if (isVOP3(MI->getOpcode()) &&
+        (MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                        AMDGPU::OpName::abs)).getImm() ||
+         MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                        AMDGPU::OpName::neg)).getImm()))
+      return 0;
+
+    unsigned Reg = MI->getOperand(1).getReg();
+    MI->getOperand(1).ChangeToImmediate(MI->getOperand(2).getImm());
+    MI->getOperand(2).ChangeToRegister(Reg, false);
+  } else {
+    MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
+  }
 
   if (MI)
     MI->setDesc(get(commuteOpcode(MI->getOpcode())));
@@ -197,15 +225,12 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
   return MI;
 }
 
-MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                           int64_t Imm) const {
-  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_B32_e32), DebugLoc());
-  MachineInstrBuilder MIB(*MF, MI);
-  MIB.addReg(DstReg, RegState::Define);
-  MIB.addImm(Imm);
-
-  return MI;
-
+MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         unsigned SrcReg) const {
+  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
+                 DstReg) .addReg(SrcReg);
 }
 
 bool SIInstrInfo::isMov(unsigned Opcode) const {
@@ -224,32 +249,397 @@ SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   return RC != &AMDGPU::EXECRegRegClass;
 }
 
-//===----------------------------------------------------------------------===//
-// Indirect addressing callbacks
-//===----------------------------------------------------------------------===//
+int SIInstrInfo::isMIMG(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::MIMG;
+}
 
-unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
-                                                 unsigned Channel) const {
-  assert(Channel == 0);
-  return RegIndex;
+int SIInstrInfo::isSMRD(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::SMRD;
+}
+
+bool SIInstrInfo::isVOP1(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP1;
+}
+
+bool SIInstrInfo::isVOP2(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP2;
+}
+
+bool SIInstrInfo::isVOP3(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOP3;
+}
+
+bool SIInstrInfo::isVOPC(uint16_t Opcode) const {
+  return get(Opcode).TSFlags & SIInstrFlags::VOPC;
+}
+
+bool SIInstrInfo::isSALUInstr(const MachineInstr &MI) const {
+  return get(MI.getOpcode()).TSFlags & SIInstrFlags::SALU;
+}
+
+bool SIInstrInfo::isInlineConstant(const MachineOperand &MO) const {
+  if(MO.isImm()) {
+    return MO.getImm() >= -16 && MO.getImm() <= 64;
+  }
+  if (MO.isFPImm()) {
+    return MO.getFPImm()->isExactlyValue(0.0)  ||
+           MO.getFPImm()->isExactlyValue(0.5)  ||
+           MO.getFPImm()->isExactlyValue(-0.5) ||
+           MO.getFPImm()->isExactlyValue(1.0)  ||
+           MO.getFPImm()->isExactlyValue(-1.0) ||
+           MO.getFPImm()->isExactlyValue(2.0)  ||
+           MO.getFPImm()->isExactlyValue(-2.0) ||
+           MO.getFPImm()->isExactlyValue(4.0)  ||
+           MO.getFPImm()->isExactlyValue(-4.0);
+  }
+  return false;
+}
+
+bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO) const {
+  return (MO.isImm() || MO.isFPImm()) && !isInlineConstant(MO);
+}
+
+bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
+                                    StringRef &ErrInfo) const {
+  uint16_t Opcode = MI->getOpcode();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+  // Verify VOP*
+  if (isVOP1(Opcode) || isVOP2(Opcode) || isVOP3(Opcode) || isVOPC(Opcode)) {
+    unsigned ConstantBusCount = 0;
+    unsigned SGPRUsed = AMDGPU::NoRegister;
+    for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isUse() &&
+          !TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+
+        // EXEC register uses the constant bus.
+        if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC)
+          ++ConstantBusCount;
+
+        // SGPRs use the constant bus
+        if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
+            (!MO.isImplicit() &&
+            (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+            AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
+          if (SGPRUsed != MO.getReg()) {
+            ++ConstantBusCount;
+            SGPRUsed = MO.getReg();
+          }
+        }
+      }
+      // Literal constants use the constant bus.
+      if (isLiteralConstant(MO))
+        ++ConstantBusCount;
+    }
+    if (ConstantBusCount > 1) {
+      ErrInfo = "VOP* instruction uses the constant bus more than once";
+      return false;
+    }
+  }
+
+  // Verify SRC1 for VOP2 and VOPC
+  if (Src1Idx != -1 && (isVOP2(Opcode) || isVOPC(Opcode))) {
+    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
+    if (Src1.isImm() || Src1.isFPImm()) {
+      ErrInfo = "VOP[2C] src1 cannot be an immediate.";
+      return false;
+    }
+  }
+
+  // Verify VOP3
+  if (isVOP3(Opcode)) {
+    if (Src0Idx != -1 && isLiteralConstant(MI->getOperand(Src0Idx))) {
+      ErrInfo = "VOP3 src0 cannot be a literal constant.";
+      return false;
+    }
+    if (Src1Idx != -1 && isLiteralConstant(MI->getOperand(Src1Idx))) {
+      ErrInfo = "VOP3 src1 cannot be a literal constant.";
+      return false;
+    }
+    if (Src2Idx != -1 && isLiteralConstant(MI->getOperand(Src2Idx))) {
+      ErrInfo = "VOP3 src2 cannot be a literal constant.";
+      return false;
+    }
+  }
+  return true;
+}
+
+unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default: return AMDGPU::INSTRUCTION_LIST_END;
+  case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
+  case AMDGPU::COPY: return AMDGPU::COPY;
+  case AMDGPU::PHI: return AMDGPU::PHI;
+  case AMDGPU::S_ADD_I32: return AMDGPU::V_ADD_I32_e32;
+  case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
+  case AMDGPU::S_SUB_I32: return AMDGPU::V_SUB_I32_e32;
+  case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
+  case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
+  case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64;
+  case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
+  case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
+  case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
+  case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+  }
+}
+
+bool SIInstrInfo::isSALUOpSupportedOnVALU(const MachineInstr &MI) const {
+  return getVALUOp(MI) != AMDGPU::INSTRUCTION_LIST_END;
+}
+
+const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
+                                                      unsigned OpNo) const {
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &Desc = get(MI.getOpcode());
+  if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
+      Desc.OpInfo[OpNo].RegClass == -1)
+    return MRI.getRegClass(MI.getOperand(OpNo).getReg());
+
+  unsigned RCID = Desc.OpInfo[OpNo].RegClass;
+  return RI.getRegClass(RCID);
+}
+
+bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::REG_SEQUENCE:
+    return RI.hasVGPRs(getOpRegClass(MI, 0));
+  default:
+    return RI.hasVGPRs(getOpRegClass(MI, OpNo));
+  }
 }
 
+void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
+  MachineBasicBlock::iterator I = MI;
+  MachineOperand &MO = MI->getOperand(OpIdx);
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
+  const TargetRegisterClass *RC = RI.getRegClass(RCID);
+  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+  if (MO.isReg()) {
+    Opcode = AMDGPU::COPY;
+  } else if (RI.isSGPRClass(RC)) {
+    Opcode = AMDGPU::S_MOV_B32;
+  }
+
+  const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
+  unsigned Reg = MRI.createVirtualRegister(VRC);
+  BuildMI(*MI->getParent(), I, MI->getParent()->findDebugLoc(I), get(Opcode),
+          Reg).addOperand(MO);
+  MO.ChangeToRegister(Reg, false);
+}
+
+void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
+  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src0);
+  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src1);
+  int Src2Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                           AMDGPU::OpName::src2);
+
+  // Legalize VOP2
+  if (isVOP2(MI->getOpcode()) && Src1Idx != -1) {
+    MachineOperand &Src0 = MI->getOperand(Src0Idx);
+    MachineOperand &Src1 = MI->getOperand(Src1Idx);
+
+    // If the instruction implicitly reads VCC, we can't have any SGPR operands,
+    // so move any.
+    bool ReadsVCC = MI->readsRegister(AMDGPU::VCC, &RI);
+    if (ReadsVCC && Src0.isReg() &&
+        RI.isSGPRClass(MRI.getRegClass(Src0.getReg()))) {
+      legalizeOpWithMove(MI, Src0Idx);
+      return;
+    }
+
+    if (ReadsVCC && Src1.isReg() &&
+        RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
+      legalizeOpWithMove(MI, Src1Idx);
+      return;
+    }
+
+    // Legalize VOP2 instructions where src1 is not a VGPR. An SGPR input must
+    // be the first operand, and there can only be one.
+    if (Src1.isImm() || Src1.isFPImm() ||
+        (Src1.isReg() && RI.isSGPRClass(MRI.getRegClass(Src1.getReg())))) {
+      if (MI->isCommutable()) {
+        if (commuteInstruction(MI))
+          return;
+      }
+      legalizeOpWithMove(MI, Src1Idx);
+    }
+  }
+
+  // XXX - Do any VOP3 instructions read VCC?
+  // Legalize VOP3
+  if (isVOP3(MI->getOpcode())) {
+    int VOP3Idx[3] = {Src0Idx, Src1Idx, Src2Idx};
+    unsigned SGPRReg = AMDGPU::NoRegister;
+    for (unsigned i = 0; i < 3; ++i) {
+      int Idx = VOP3Idx[i];
+      if (Idx == -1)
+        continue;
+      MachineOperand &MO = MI->getOperand(Idx);
+
+      if (MO.isReg()) {
+        if (!RI.isSGPRClass(MRI.getRegClass(MO.getReg())))
+          continue; // VGPRs are legal
+
+        assert(MO.getReg() != AMDGPU::SCC && "SCC operand to VOP3 instruction");
+
+        if (SGPRReg == AMDGPU::NoRegister || SGPRReg == MO.getReg()) {
+          SGPRReg = MO.getReg();
+          // We can use one SGPR in each VOP3 instruction.
+          continue;
+        }
+      } else if (!isLiteralConstant(MO)) {
+        // If it is not a register and not a literal constant, then it must be
+        // an inline constant which is always legal.
+        continue;
+      }
+      // If we make it this far, then the operand is not legal and we must
+      // legalize it.
+      legalizeOpWithMove(MI, Idx);
+    }
+  }
+
+  // Legalize REG_SEQUENCE
+  // The register class of the operands much be the same type as the register
+  // class of the output.
+  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+    const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL;
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      const TargetRegisterClass *OpRC =
+              MRI.getRegClass(MI->getOperand(i).getReg());
+      if (RI.hasVGPRs(OpRC)) {
+        VRC = OpRC;
+      } else {
+        SRC = OpRC;
+      }
+    }
+
+    // If any of the operands are VGPR registers, then they all most be
+    // otherwise we will create illegal VGPR->SGPR copies when legalizing
+    // them.
+    if (VRC || !RI.isSGPRClass(getOpRegClass(*MI, 0))) {
+      if (!VRC) {
+        assert(SRC);
+        VRC = RI.getEquivalentVGPRClass(SRC);
+      }
+      RC = VRC;
+    } else {
+      RC = SRC;
+    }
 
-int SIInstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
-  llvm_unreachable("Unimplemented");
+    // Update all the operands so they have the same type.
+    for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
+      if (!MI->getOperand(i).isReg() ||
+          !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
+        continue;
+      unsigned DstReg = MRI.createVirtualRegister(RC);
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+              get(AMDGPU::COPY), DstReg)
+              .addOperand(MI->getOperand(i));
+      MI->getOperand(i).setReg(DstReg);
+    }
+  }
 }
 
-int SIInstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
-  llvm_unreachable("Unimplemented");
+void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
+  SmallVector<MachineInstr *, 128> Worklist;
+  Worklist.push_back(&TopInst);
+
+  while (!Worklist.empty()) {
+    MachineInstr *Inst = Worklist.pop_back_val();
+    unsigned NewOpcode = getVALUOp(*Inst);
+    if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
+      continue;
+
+    MachineRegisterInfo &MRI = Inst->getParent()->getParent()->getRegInfo();
+
+    // Use the new VALU Opcode.
+    const MCInstrDesc &NewDesc = get(NewOpcode);
+    Inst->setDesc(NewDesc);
+
+    // Remove any references to SCC. Vector instructions can't read from it, and
+    // We're just about to add the implicit use / defs of VCC, and we don't want
+    // both.
+    for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) {
+      MachineOperand &Op = Inst->getOperand(i);
+      if (Op.isReg() && Op.getReg() == AMDGPU::SCC)
+        Inst->RemoveOperand(i);
+    }
+
+    // Add the implict and explicit register definitions.
+    if (NewDesc.ImplicitUses) {
+      for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
+        unsigned Reg = NewDesc.ImplicitUses[i];
+        Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+      }
+    }
+
+    if (NewDesc.ImplicitDefs) {
+      for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
+        unsigned Reg = NewDesc.ImplicitDefs[i];
+        Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
+      }
+    }
+
+    legalizeOperands(Inst);
+
+    // Update the destination register class.
+    const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
+
+    switch (Inst->getOpcode()) {
+      // For target instructions, getOpRegClass just returns the virtual
+      // register class associated with the operand, so we need to find an
+      // equivalent VGPR register class in order to move the instruction to the
+      // VALU.
+    case AMDGPU::COPY:
+    case AMDGPU::PHI:
+    case AMDGPU::REG_SEQUENCE:
+      if (RI.hasVGPRs(NewDstRC))
+        continue;
+      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+      if (!NewDstRC)
+        continue;
+      break;
+    default:
+      break;
+    }
+
+    unsigned DstReg = Inst->getOperand(0).getReg();
+    unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
+    MRI.replaceRegWith(DstReg, NewDstReg);
+
+    for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
+           E = MRI.use_end(); I != E; ++I) {
+      MachineInstr &UseMI = *I;
+      if (!canReadVGPR(UseMI, I.getOperandNo())) {
+        Worklist.push_back(&UseMI);
+      }
+    }
+  }
 }
 
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrStoreRegClass(
-                                                     unsigned SourceReg) const {
-  llvm_unreachable("Unimplemented");
+//===----------------------------------------------------------------------===//
+// Indirect addressing callbacks
+//===----------------------------------------------------------------------===//
+
+unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex,
+                                                 unsigned Channel) const {
+  assert(Channel == 0);
+  return RegIndex;
 }
 
-const TargetRegisterClass *SIInstrInfo::getIndirectAddrLoadRegClass() const {
-  llvm_unreachable("Unimplemented");
+const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const {
+  return &AMDGPU::VReg_32RegClass;
 }
 
 MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
@@ -257,7 +647,17 @@ MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
                                    MachineBasicBlock::iterator I,
                                    unsigned ValueReg,
                                    unsigned Address, unsigned OffsetReg) const {
-  llvm_unreachable("Unimplemented");
+  const DebugLoc &DL = MBB->findDebugLoc(I);
+  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+                                      getIndirectIndexBegin(*MBB->getParent()));
+
+  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1))
+          .addReg(IndirectBaseReg, RegState::Define)
+          .addOperand(I->getOperand(0))
+          .addReg(IndirectBaseReg)
+          .addReg(OffsetReg)
+          .addImm(0)
+          .addReg(ValueReg);
 }
 
 MachineInstrBuilder SIInstrInfo::buildIndirectRead(
@@ -265,9 +665,43 @@ MachineInstrBuilder SIInstrInfo::buildIndirectRead(
                                    MachineBasicBlock::iterator I,
                                    unsigned ValueReg,
                                    unsigned Address, unsigned OffsetReg) const {
-  llvm_unreachable("Unimplemented");
+  const DebugLoc &DL = MBB->findDebugLoc(I);
+  unsigned IndirectBaseReg = AMDGPU::VReg_32RegClass.getRegister(
+                                      getIndirectIndexBegin(*MBB->getParent()));
+
+  return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC))
+          .addOperand(I->getOperand(0))
+          .addOperand(I->getOperand(1))
+          .addReg(IndirectBaseReg)
+          .addReg(OffsetReg)
+          .addImm(0);
+
 }
 
-const TargetRegisterClass *SIInstrInfo::getSuperIndirectRegClass() const {
-  llvm_unreachable("Unimplemented");
+void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved,
+                                            const MachineFunction &MF) const {
+  int End = getIndirectIndexEnd(MF);
+  int Begin = getIndirectIndexBegin(MF);
+
+  if (End == -1)
+    return;
+
+
+  for (int Index = Begin; Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_32RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 1); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 2); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 3); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 7); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index));
+
+  for (int Index = std::max(0, Begin - 15); Index <= End; ++Index)
+    Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index));
 }
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index 87eff4d..4af6348 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -1,4 +1,4 @@
-//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
+//===-- SIInstrInfo.h - SI Instruction Info Interface -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -25,6 +25,14 @@ class SIInstrInfo : public AMDGPUInstrInfo {
 private:
   const SIRegisterInfo RI;
 
+  MachineInstrBuilder buildIndirectIndexLoop(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator I,
+                                             unsigned OffsetVGPR,
+                                             unsigned MovRelOp,
+                                             unsigned Dst,
+                                             unsigned Src0) const;
+  // If you add or remove instructions from this function, you will
+
 public:
   explicit SIInstrInfo(AMDGPUTargetMachine &tm);
 
@@ -40,25 +48,65 @@ public:
   virtual MachineInstr *commuteInstruction(MachineInstr *MI,
                                            bool NewMI=false) const;
 
-  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
-                                        int64_t Imm) const;
-
   virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;}
+  MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DstReg, unsigned SrcReg) const;
   virtual bool isMov(unsigned Opcode) const;
 
   virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
-
-  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
-
-  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+  int isMIMG(uint16_t Opcode) const;
+  int isSMRD(uint16_t Opcode) const;
+  bool isVOP1(uint16_t Opcode) const;
+  bool isVOP2(uint16_t Opcode) const;
+  bool isVOP3(uint16_t Opcode) const;
+  bool isVOPC(uint16_t Opcode) const;
+  bool isInlineConstant(const MachineOperand &MO) const;
+  bool isLiteralConstant(const MachineOperand &MO) const;
+
+  virtual bool verifyInstruction(const MachineInstr *MI,
+                                 StringRef &ErrInfo) const;
+
+  bool isSALUInstr(const MachineInstr &MI) const;
+  static unsigned getVALUOp(const MachineInstr &MI);
+  bool isSALUOpSupportedOnVALU(const MachineInstr &MI) const;
+
+  /// \brief Return the correct register class for \p OpNo.  For target-specific
+  /// instructions, this will return the register class that has been defined
+  /// in tablegen.  For generic instructions, like REG_SEQUENCE it will return
+  /// the register class of its machine operand.
+  /// to infer the correct register class base on the other operands.
+  const TargetRegisterClass *getOpRegClass(const MachineInstr &MI,
+                                           unsigned OpNo) const;\
+
+  /// \returns true if it is legal for the operand at index \p OpNo
+  /// to read a VGPR.
+  bool canReadVGPR(const MachineInstr &MI, unsigned OpNo) const;
+
+  /// \brief Legalize the \p OpIndex operand of this instruction by inserting
+  /// a MOV.  For example:
+  /// ADD_I32_e32 VGPR0, 15
+  /// to
+  /// MOV VGPR1, 15
+  /// ADD_I32_e32 VGPR0, VGPR1
+  ///
+  /// If the operand being legalized is a register, then a COPY will be used
+  /// instead of MOV.
+  void legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const;
+
+  /// \brief Legalize all operands in this instruction.  This function may
+  /// create new instruction and insert them before \p MI.
+  void legalizeOperands(MachineInstr *MI) const;
+
+  /// \brief Replace this instruction's opcode with the equivalent VALU
+  /// opcode.  This function will also move the users of \p MI to the
+  /// VALU if necessary.
+  void moveToVALU(MachineInstr &MI) const;
 
   virtual unsigned calculateIndirectAddress(unsigned RegIndex,
                                             unsigned Channel) const;
 
-  virtual const TargetRegisterClass *getIndirectAddrStoreRegClass(
-                                                      unsigned SourceReg) const;
-
-  virtual const TargetRegisterClass *getIndirectAddrLoadRegClass() const;
+  virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
 
   virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
                                                  MachineBasicBlock::iterator I,
@@ -71,16 +119,18 @@ public:
                                                 unsigned ValueReg,
                                                 unsigned Address,
                                                 unsigned OffsetReg) const;
+  void reserveIndirectRegisters(BitVector &Reserved,
+                                const MachineFunction &MF) const;
 
-  virtual const TargetRegisterClass *getSuperIndirectRegClass() const;
-  };
+  void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
+              unsigned SavReg, unsigned IndexReg) const;
+};
 
 namespace AMDGPU {
 
   int getVOPe64(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
-  int isMIMG(uint16_t Opcode);
 
 } // End namespace AMDGPU
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index c8aecb7..4cd0daa 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -16,19 +16,64 @@ def SIadd64bit32bit : SDNode<"ISD::ADD",
   SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
 >;
 
+def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
+  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>,
+                      [SDNPMayLoad, SDNPMemOperand]
+>;
+
+def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
+  SDTypeProfile<0, 13,
+    [SDTCisVT<0, i128>,   // rsrc(SGPR)
+     SDTCisVT<1, iAny>,   // vdata(VGPR)
+     SDTCisVT<2, i32>,    // num_channels(imm)
+     SDTCisVT<3, i32>,    // vaddr(VGPR)
+     SDTCisVT<4, i32>,    // soffset(SGPR)
+     SDTCisVT<5, i32>,    // inst_offset(imm)
+     SDTCisVT<6, i32>,    // dfmt(imm)
+     SDTCisVT<7, i32>,    // nfmt(imm)
+     SDTCisVT<8, i32>,    // offen(imm)
+     SDTCisVT<9, i32>,    // idxen(imm)
+     SDTCisVT<10, i32>,   // glc(imm)
+     SDTCisVT<11, i32>,   // slc(imm)
+     SDTCisVT<12, i32>    // tfe(imm)
+    ]>,
+  [SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
+def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
+  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>,
+                       SDTCisVT<3, i32>]>
+>;
+
+class SDSample<string opcode> : SDNode <opcode,
+  SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
+                       SDTCisVT<3, i128>, SDTCisVT<4, i32>]>
+>;
+
+def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
+def SIsampleb : SDSample<"AMDGPUISD::SAMPLEB">;
+def SIsampled : SDSample<"AMDGPUISD::SAMPLED">;
+def SIsamplel : SDSample<"AMDGPUISD::SAMPLEL">;
+
 // Transformation function, extract the lower 32bit of a 64bit immediate
 def LO32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() & 0xffffffff, MVT::i32);
 }]>;
 
+def LO32f : SDNodeXForm<fpimm, [{
+  APInt V = N->getValueAPF().bitcastToAPInt().trunc(32);
+  return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32);
+}]>;
+
 // Transformation function, extract the upper 32bit of a 64bit immediate
 def HI32 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() >> 32, MVT::i32);
 }]>;
 
-def SIbuffer_store : SDNode<"AMDGPUISD::BUFFER_STORE",
-                           SDTypeProfile<0, 3, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
-                           [SDNPHasChain, SDNPMayStore]>;
+def HI32f : SDNodeXForm<fpimm, [{
+  APInt V = N->getValueAPF().bitcastToAPInt().lshr(32).trunc(32);
+  return CurDAG->getTargetConstantFP(APFloat(APFloat::IEEEsingle, V), MVT::f32);
+}]>;
 
 def IMM8bitDWORD : ImmLeaf <
   i32, [{
@@ -39,15 +84,47 @@ def IMM8bitDWORD : ImmLeaf <
   }]>
 >;
 
-def IMM12bit : ImmLeaf <
-  i16,
-  [{return isUInt<12>(Imm);}]
+def as_i1imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i1);
+}]>;
+
+def as_i8imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i8);
+}]>;
+
+def as_i16imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), MVT::i16);
+}]>;
+
+def IMM12bit : PatLeaf <(imm),
+  [{return isUInt<12>(N->getZExtValue());}]
 >;
 
 class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
-  return ((const SITargetLowering &)TLI).analyzeImmediate(N) == 0;
+  return
+    (*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0;
+}]>;
+
+class SGPRImm <dag frag> : PatLeaf<frag, [{
+  if (TM.getSubtarget<AMDGPUSubtarget>().getGeneration() <
+      AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+    return false;
+  }
+  const SIRegisterInfo *SIRI =
+                       static_cast<const SIRegisterInfo*>(TM.getRegisterInfo());
+  for (SDNode::use_iterator U = N->use_begin(), E = SDNode::use_end();
+                                                U != E; ++U) {
+    if (SIRI->isSGPRClass(getOperandRegClass(*U, U.getOperandNo()))) {
+      return true;
+    }
+  }
+  return false;
 }]>;
 
+def FRAMEri64 : Operand<iPTR> {
+  let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index);
+}
+
 //===----------------------------------------------------------------------===//
 // SI assembler operands
 //===----------------------------------------------------------------------===//
@@ -99,6 +176,11 @@ class SOP2_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
   opName#" $dst, $src0, $src1", pattern
 >;
 
+class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
+  op, (outs SReg_64:$dst), (ins SSrc_64:$src0, SSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>;
+
 class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC <
   op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
   opName#" $dst, $src0, $src1", pattern
@@ -163,8 +245,8 @@ multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
          i32imm:$omod, i32imm:$neg),
     opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", []
   >, VOP <opName> {
-    let SRC1 = SIOperand.ZERO;
-    let SRC2 = SIOperand.ZERO;
+    let src1 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
   }
 }
 
@@ -174,6 +256,12 @@ multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern>
 multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern>
   : VOP1_Helper <op, VReg_64, VSrc_64, opName, pattern>;
 
+multiclass VOP1_32_64 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_32, VSrc_64, opName, pattern>;
+
+multiclass VOP1_64_32 <bits<8> op, string opName, list<dag> pattern>
+  : VOP1_Helper <op, VReg_64, VSrc_32, opName, pattern>;
+
 multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
                         string opName, list<dag> pattern, string revOp> {
   def _e32 : VOP2 <
@@ -189,7 +277,7 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
          i32imm:$omod, i32imm:$neg),
     opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
   >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
-    let SRC2 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
   }
 }
 
@@ -217,11 +305,11 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
          i32imm:$omod, i32imm:$neg),
     opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
   >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
-    let SRC2 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
     /* the VOP2 variant puts the carry out into VCC, the VOP3 variant
        can write it into any SGPR. We currently don't use the carry out,
        so for now hardcode it to VCC as well */
-    let SDST = SIOperand.VCC;
+    let sdst = SIOperand.VCC;
   }
 }
 
@@ -244,7 +332,7 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
       [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
     )
   >, VOP <opName> {
-    let SRC2 = SIOperand.ZERO;
+    let src2 = SIOperand.ZERO;
   }
 }
 
@@ -263,6 +351,19 @@ class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
   opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
 >, VOP <opName>;
 
+class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
+  op, (outs VReg_64:$dst),
+  (ins VSrc_64:$src0, VSrc_32:$src1),
+  opName#" $dst, $src0, $src1", pattern
+>, VOP <opName> {
+
+  let src2 = SIOperand.ZERO;
+  let abs = 0;
+  let clamp = 0;
+  let omod = 0;
+  let neg = 0;
+}
+
 class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
   op, (outs VReg_64:$dst),
   (ins VSrc_64:$src0, VSrc_64:$src1, VSrc_64:$src2,
@@ -274,6 +375,41 @@ class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
 // Vector I/O classes
 //===----------------------------------------------------------------------===//
 
+class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+  op,
+  (outs regClass:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
+       i8imm:$offset0, i8imm:$offset1),
+  asm#" $vdst, $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
+  []> {
+  let mayLoad = 1;
+  let mayStore = 0;
+}
+
+class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
+  op,
+  (outs),
+  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, VReg_32:$data1,
+       i8imm:$offset0, i8imm:$offset1),
+  asm#" $gds, $addr, $data0, $data1, $offset0, $offset1, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 0;
+  let vdst = 0;
+}
+
+class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS <
+  op,
+  (outs rc:$vdst),
+  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i8imm:$offset0,
+       i8imm:$offset1),
+  asm#" $gds, $vdst, $addr, $data0, $offset0, $offset1, [M0]",
+  []> {
+  let mayStore = 1;
+  let mayLoad = 1;
+  let data1 = 0;
+}
+
 class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
   op,
   (outs),
@@ -287,31 +423,41 @@ class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBU
   let mayLoad = 0;
 }
 
-class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF <
-  op,
-  (outs regClass:$vdata),
-  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
-       i1imm:$lds, VReg_32:$vaddr, SReg_128:$srsrc, i1imm:$slc,
-       i1imm:$tfe, SSrc_32:$soffset),
-  asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, "
-     #"$lds, $vaddr, $srsrc, $slc, $tfe, $soffset",
-  []> {
-  let mayLoad = 1;
-  let mayStore = 0;
+multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
+
+  let glc = 0, lds = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */,
+                                          mayLoad = 1 in {
+
+  let offen = 1, idxen = 0, addr64 = 0, offset = 0 in {
+    def _OFFEN  : MUBUF <op, (outs regClass:$vdata),
+                         (ins SReg_128:$srsrc, VReg_32:$vaddr),
+                         asm#" $vdata, $srsrc + $vaddr", []>;
+  }
+
+  let offen = 0, idxen = 1, addr64 = 0 in {
+    def _IDXEN  : MUBUF <op, (outs regClass:$vdata),
+                         (ins SReg_128:$srsrc, VReg_32:$vaddr, i16imm:$offset),
+                         asm#" $vdata, $srsrc[$vaddr] + $offset", []>;
+  }
+
+  let offen = 0, idxen = 0, addr64 = 1 in {
+    def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
+                         (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+                         asm#" $vdata, $srsrc + $vaddr + $offset", []>;
+  }
+  }
 }
 
-class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass,
-                         ValueType VT> :
-    MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr),
-          name#" $vdata, $srsrc + $vaddr",
-          [(SIbuffer_store (VT vdataClass:$vdata), (i128 SReg_128:$srsrc),
-                                                    (i64 VReg_64:$vaddr))]> {
+class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
+    MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
+                            i16imm:$offset),
+          name#" $vdata, $srsrc + $vaddr + $offset",
+         []> {
 
   let mayLoad = 0;
   let mayStore = 1;
 
   // Encoding
-  let offset = 0;
   let offen = 0;
   let idxen = 0;
   let glc = 0;
@@ -335,11 +481,18 @@ class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF
   let mayStore = 0;
 }
 
-class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
+class MIMG_Mask <string op, int channels> {
+  string Op = op;
+  int Channels = channels;
+}
+
+class MIMG_NoSampler_Helper <bits<7> op, string asm,
+                             RegisterClass dst_rc,
+                             RegisterClass src_rc> : MIMG <
   op,
-  (outs VReg_128:$vdata),
+  (outs dst_rc:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
-       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
        SReg_256:$srsrc),
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc",
@@ -350,11 +503,31 @@ class MIMG_NoSampler_Helper <bits<7> op, string asm> : MIMG <
   let hasPostISelHook = 1;
 }
 
-class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
+multiclass MIMG_NoSampler_Src_Helper <bits<7> op, string asm,
+                                      RegisterClass dst_rc,
+                                      int channels> {
+  def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+}
+
+multiclass MIMG_NoSampler <bits<7> op, string asm> {
+  defm _V1 : MIMG_NoSampler_Src_Helper <op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_NoSampler_Src_Helper <op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 4>;
+}
+
+class MIMG_Sampler_Helper <bits<7> op, string asm,
+                           RegisterClass dst_rc,
+                           RegisterClass src_rc> : MIMG <
   op,
-  (outs VReg_128:$vdata),
+  (outs dst_rc:$vdata),
   (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128,
-       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, unknown:$vaddr,
+       i1imm:$tfe, i1imm:$lwe, i1imm:$slc, src_rc:$vaddr,
        SReg_256:$srsrc, SReg_128:$ssamp),
   asm#" $vdata, $dmask, $unorm, $glc, $da, $r128,"
      #" $tfe, $lwe, $slc, $vaddr, $srsrc, $ssamp",
@@ -364,6 +537,28 @@ class MIMG_Sampler_Helper <bits<7> op, string asm> : MIMG <
   let hasPostISelHook = 1;
 }
 
+multiclass MIMG_Sampler_Src_Helper <bits<7> op, string asm,
+                                    RegisterClass dst_rc,
+                                    int channels> {
+  def _V1 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_32>,
+            MIMG_Mask<asm#"_V1", channels>;
+  def _V2 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_64>,
+            MIMG_Mask<asm#"_V2", channels>;
+  def _V4 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_128>,
+            MIMG_Mask<asm#"_V4", channels>;
+  def _V8 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_256>,
+            MIMG_Mask<asm#"_V8", channels>;
+  def _V16 : MIMG_Sampler_Helper <op, asm, dst_rc, VReg_512>,
+            MIMG_Mask<asm#"_V16", channels>;
+}
+
+multiclass MIMG_Sampler <bits<7> op, string asm> {
+  defm _V1 : MIMG_Sampler_Src_Helper<op, asm, VReg_32, 1>;
+  defm _V2 : MIMG_Sampler_Src_Helper<op, asm, VReg_64, 2>;
+  defm _V3 : MIMG_Sampler_Src_Helper<op, asm, VReg_96, 3>;
+  defm _V4 : MIMG_Sampler_Src_Helper<op, asm, VReg_128, 4>;
+}
+
 //===----------------------------------------------------------------------===//
 // Vector instruction mappings
 //===----------------------------------------------------------------------===//
@@ -386,6 +581,14 @@ def getCommuteRev : InstrMapping {
   let ValueCols = [["0"]];
 }
 
+def getMaskedMIMGOp : InstrMapping {
+  let FilterClass = "MIMG_Mask";
+  let RowFields = ["Op"];
+  let ColFields = ["Channels"];
+  let KeyCol = ["4"];
+  let ValueCols = [["1"], ["2"], ["3"] ];
+}
+
 // Maps an commuted opcode to its original version
 def getCommuteOrig : InstrMapping {
   let FilterClass = "VOP2_REV";
@@ -395,13 +598,4 @@ def getCommuteOrig : InstrMapping {
   let ValueCols = [["1"]];
 }
 
-// Test if the supplied opcode is an MIMG instruction
-def isMIMG : InstrMapping {
-  let FilterClass = "MIMG";
-  let RowFields = ["Inst"];
-  let ColFields = ["Size"];
-  let KeyCol = ["8"];
-  let ValueCols = [["8"]];
-}
-
 include "SIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 0d50c5d..76f05eb 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -22,8 +22,10 @@ def InterpSlot : Operand<i32> {
   let PrintMethod = "printInterpSlot";
 }
 
-def isSI : Predicate<"Subtarget.device()"
-                            "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">;
+def isSI : Predicate<"Subtarget.getGeneration() "
+                      ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">;
+
+def WAIT_FLAG : InstFlag<"printWaitFlag">;
 
 let Predicates = [isSI] in {
 
@@ -126,8 +128,11 @@ def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>;
 def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>;
 } // End isCompare = 1
 
-def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
-def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+let Defs = [SCC], isCommutable = 1 in {
+  def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>;
+  def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>;
+}
+
 //def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>;
 def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>;
 def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>;
@@ -138,19 +143,19 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
 let isCompare = 1 in {
 
 defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
-defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_LT>;
-defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_EQ>;
-defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_LE>;
-defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_GT>;
-defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", f32, COND_NE>;
-defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_GE>;
-defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32">;
-defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32">;
+defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", f32, COND_OLT>;
+defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", f32, COND_OEQ>;
+defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", f32, COND_OLE>;
+defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", f32, COND_OGT>;
+defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32">;
+defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", f32, COND_OGE>;
+defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", f32, COND_O>;
+defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", f32, COND_UO>;
 defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32">;
 defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32">;
 defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32">;
 defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32">;
-defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_NE>;
+defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", f32, COND_UNE>;
 defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32">;
 defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32">;
 
@@ -176,19 +181,19 @@ defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64">;
-defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64">;
-defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64">;
-defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64">;
-defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64">;
+defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", f64, COND_OLT>;
+defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", f64, COND_OEQ>;
+defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", f64, COND_OLE>;
+defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", f64, COND_OGT>;
 defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64">;
-defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64">;
-defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64">;
-defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64">;
+defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", f64, COND_OGE>;
+defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", f64, COND_O>;
+defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", f64, COND_UO>;
 defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64">;
 defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64">;
 defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64">;
 defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64">;
-defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64">;
+defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", f64, COND_UNE>;
 defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64">;
 defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64">;
 
@@ -290,12 +295,12 @@ defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32">;
-defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_LT>;
+defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", i32, COND_SLT>;
 defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", i32, COND_EQ>;
-defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_LE>;
-defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_GT>;
+defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", i32, COND_SLE>;
+defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", i32, COND_SGT>;
 defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", i32, COND_NE>;
-defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_GE>;
+defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", i32, COND_SGE>;
 defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -312,12 +317,12 @@ defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64">;
-defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64">;
-defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64">;
-defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64">;
-defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64">;
-defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64">;
-defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64">;
+defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", i64, COND_SLT>;
+defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", i64, COND_EQ>;
+defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", i64, COND_SLE>;
+defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", i64, COND_SGT>;
+defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", i64, COND_NE>;
+defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", i64, COND_SGE>;
 defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -334,12 +339,12 @@ defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32">;
-defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32">;
-defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32">;
-defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32">;
-defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32">;
-defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32">;
-defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32">;
+defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", i32, COND_ULT>;
+defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, "V_CMP_EQ_U32", i32, COND_EQ>;
+defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", i32, COND_ULE>;
+defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", i32, COND_UGT>;
+defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", i32, COND_NE>;
+defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", i32, COND_UGE>;
 defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -356,12 +361,12 @@ defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32">;
 } // End hasSideEffects = 1, Defs = [EXEC]
 
 defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64">;
-defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64">;
-defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64">;
-defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64">;
-defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64">;
-defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64">;
-defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64">;
+defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", i64, COND_ULT>;
+defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", i64, COND_EQ>;
+defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", i64, COND_ULE>;
+defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", i64, COND_UGT>;
+defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", i64, COND_NE>;
+defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", i64, COND_UGE>;
 defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64">;
 
 let hasSideEffects = 1, Defs = [EXEC] in {
@@ -391,32 +396,52 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
 
 } // End isCompare = 1
 
+def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
+def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
+def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
+def DS_WRITE_B8 : DS_Store_Helper <0x00000001e, "DS_WRITE_B8", VReg_32>;
+def DS_WRITE_B16 : DS_Store_Helper <0x00000001f, "DS_WRITE_B16", VReg_32>;
+def DS_READ_B32 : DS_Load_Helper <0x00000036, "DS_READ_B32", VReg_32>;
+def DS_READ_I8 : DS_Load_Helper <0x00000039, "DS_READ_I8", VReg_32>;
+def DS_READ_U8 : DS_Load_Helper <0x0000003a, "DS_READ_U8", VReg_32>;
+def DS_READ_I16 : DS_Load_Helper <0x0000003b, "DS_READ_I16", VReg_32>;
+def DS_READ_U16 : DS_Load_Helper <0x0000003c, "DS_READ_U16", VReg_32>;
+
 //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
 //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
 //def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>;
-def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>;
 //def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>;
 //def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>;
 //def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>;
 //def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>;
-//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>;
-//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>;
-//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>;
-//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>;
-def BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
-def BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
-def BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
-//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>;
-//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, "BUFFER_STORE_SHORT", []>;
+defm BUFFER_LOAD_UBYTE : MUBUF_Load_Helper <0x00000008, "BUFFER_LOAD_UBYTE", VReg_32>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Load_Helper <0x00000009, "BUFFER_LOAD_SBYTE", VReg_32>;
+defm BUFFER_LOAD_USHORT : MUBUF_Load_Helper <0x0000000a, "BUFFER_LOAD_USHORT", VReg_32>;
+defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <0x0000000b, "BUFFER_LOAD_SSHORT", VReg_32>;
+defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <0x0000000c, "BUFFER_LOAD_DWORD", VReg_32>;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <0x0000000d, "BUFFER_LOAD_DWORDX2", VReg_64>;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <0x0000000e, "BUFFER_LOAD_DWORDX4", VReg_128>;
+
+def BUFFER_STORE_BYTE : MUBUF_Store_Helper <
+  0x00000018, "BUFFER_STORE_BYTE", VReg_32
+>;
+
+def BUFFER_STORE_SHORT : MUBUF_Store_Helper <
+  0x0000001a, "BUFFER_STORE_SHORT", VReg_32
+>;
 
 def BUFFER_STORE_DWORD : MUBUF_Store_Helper <
-  0x0000001c, "BUFFER_STORE_DWORD", VReg_32, i32
+  0x0000001c, "BUFFER_STORE_DWORD", VReg_32
 >;
 
 def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
-  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64, i64
+  0x0000001d, "BUFFER_STORE_DWORDX2", VReg_64
+>;
+
+def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
+  0x0000001e, "BUFFER_STORE_DWORDX4", VReg_128
 >;
-//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>;
 //def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>;
 //def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>;
 //def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>;
@@ -457,21 +482,24 @@ def BUFFER_STORE_DWORDX2 : MUBUF_Store_Helper <
 //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
 //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
 def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>;
-//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>;
-//def TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>;
-//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>;
-//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>;
+def TBUFFER_STORE_FORMAT_X : MTBUF_Store_Helper <0x00000004, "TBUFFER_STORE_FORMAT_X", VReg_32>;
+def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FORMAT_XY", VReg_64>;
+def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
+def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
 
 let mayLoad = 1 in {
 
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SReg_32>;
+// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SGPR_32 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
 defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
 defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
 defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
 defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
 
 defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
-  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SReg_32
+  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
 >;
 
 defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
@@ -494,8 +522,8 @@ defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
 
 //def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
 //def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
-//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>;
-def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
+defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
 //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
 //def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>;
 //def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>;
@@ -504,7 +532,7 @@ def IMAGE_LOAD_MIP : MIMG_NoSampler_Helper <0x00000001, "IMAGE_LOAD_MIP">;
 //def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>;
 //def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>;
 //def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>;
-def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <0x0000000e, "IMAGE_GET_RESINFO">;
 //def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>;
 //def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>;
 //def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>;
@@ -522,20 +550,20 @@ def IMAGE_GET_RESINFO : MIMG_NoSampler_Helper <0x0000000e, "IMAGE_GET_RESINFO">;
 //def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>;
 //def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>;
 //def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>;
-def IMAGE_SAMPLE : MIMG_Sampler_Helper <0x00000020, "IMAGE_SAMPLE">; 
+defm IMAGE_SAMPLE : MIMG_Sampler <0x00000020, "IMAGE_SAMPLE">;
 //def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>;
-def IMAGE_SAMPLE_D : MIMG_Sampler_Helper <0x00000022, "IMAGE_SAMPLE_D">;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, "IMAGE_SAMPLE_D">;
 //def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>;
-def IMAGE_SAMPLE_L : MIMG_Sampler_Helper <0x00000024, "IMAGE_SAMPLE_L">;
-def IMAGE_SAMPLE_B : MIMG_Sampler_Helper <0x00000025, "IMAGE_SAMPLE_B">;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, "IMAGE_SAMPLE_L">;
+defm IMAGE_SAMPLE_B : MIMG_Sampler <0x00000025, "IMAGE_SAMPLE_B">;
 //def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>;
 //def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>;
-def IMAGE_SAMPLE_C : MIMG_Sampler_Helper <0x00000028, "IMAGE_SAMPLE_C">;
+defm IMAGE_SAMPLE_C : MIMG_Sampler <0x00000028, "IMAGE_SAMPLE_C">;
 //def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>;
-//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, "IMAGE_SAMPLE_C_D">;
 //def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>;
-def IMAGE_SAMPLE_C_L : MIMG_Sampler_Helper <0x0000002c, "IMAGE_SAMPLE_C_L">;
-def IMAGE_SAMPLE_C_B : MIMG_Sampler_Helper <0x0000002d, "IMAGE_SAMPLE_C_B">;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, "IMAGE_SAMPLE_C_L">;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
 //def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>;
 //def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>;
 //def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>;
@@ -597,15 +625,21 @@ defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
 } // End neverHasSideEffects = 1, isMoveImm = 1
 
 defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>;
-//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>;
-//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>;
+defm V_CVT_I32_F64 : VOP1_32_64 <0x00000003, "V_CVT_I32_F64",
+  [(set i32:$dst, (fp_to_sint f64:$src0))]
+>;
+defm V_CVT_F64_I32 : VOP1_64_32 <0x00000004, "V_CVT_F64_I32",
+  [(set f64:$dst, (sint_to_fp i32:$src0))]
+>;
 defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32",
   [(set f32:$dst, (sint_to_fp i32:$src0))]
 >;
 defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32",
   [(set f32:$dst, (uint_to_fp i32:$src0))]
 >;
-defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>;
+defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32",
+  [(set i32:$dst, (fp_to_uint f32:$src0))]
+>;
 defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32",
   [(set i32:$dst, (fp_to_sint f32:$src0))]
 >;
@@ -615,8 +649,12 @@ defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>;
 //defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>;
 //defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>;
 //defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>;
-//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>;
-//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>;
+defm V_CVT_F32_F64 : VOP1_32_64 <0x0000000f, "V_CVT_F32_F64",
+  [(set f32:$dst, (fround f64:$src0))]
+>;
+defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
+  [(set f64:$dst, (fextend f32:$src0))]
+>;
 //defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>;
 //defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
 //defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
@@ -657,12 +695,18 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 <
   [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
 >;
 defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>;
-defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>;
+defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
+  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+>;
 defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
 defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>;
 defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
-defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>;
-defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>;
+defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
+  [(set f32:$dst, (fsqrt f32:$src0))]
+>;
+defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64",
+  [(set f64:$dst, (fsqrt f64:$src0))]
+>;
 defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>;
 defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>;
 defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>;
@@ -768,9 +812,18 @@ def S_CBRANCH_EXECNZ : SOPP <
 } // End isBranch = 1
 } // End isTerminator = 1
 
-//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>;
 let hasSideEffects = 1 in {
-def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16",
+def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
+  [(int_AMDGPU_barrier_local)]
+> {
+  let SIMM16 = 0;
+  let isBarrier = 1;
+  let hasCtrlDep = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
   []
 >;
 } // End hasSideEffects
@@ -806,6 +859,23 @@ def : Pat <
   (V_CNDMASK_B32_e64 $src0, $src1, $src2)
 >;
 
+def : Pat <
+  (i32 (trunc i64:$val)),
+  (EXTRACT_SUBREG $val, sub0)
+>;
+
+//use two V_CNDMASK_B32_e64 instructions for f64
+def : Pat <
+  (f64 (select i1:$src2, f64:$src1, f64:$src0)),
+  (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+  (V_CNDMASK_B32_e64 (EXTRACT_SUBREG $src0, sub0),
+                     (EXTRACT_SUBREG $src1, sub0),
+                     $src2), sub0),
+  (V_CNDMASK_B32_e64 (EXTRACT_SUBREG $src0, sub1),
+                     (EXTRACT_SUBREG $src1, sub1),
+                     $src2), sub1)
+>;
+
 defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>;
 defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>;
 
@@ -833,14 +903,16 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
   [(set f32:$dst, (fmul f32:$src0, f32:$src1))]
 >;
 
-} // End isCommutable = 1
 
-//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>;
+defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
+  [(set i32:$dst, (mul I24:$src0, I24:$src1))]
+>;
 //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
-//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>;
+defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
+  [(set i32:$dst, (mul U24:$src0, U24:$src1))]
+>;
 //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
 
-let isCommutable = 1 in {
 
 defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32",
   [(set f32:$dst, (AMDGPUfmin f32:$src0, f32:$src1))]
@@ -875,9 +947,13 @@ defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
 >;
 defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
 
+let hasPostISelHook = 1 in {
+
 defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
   [(set i32:$dst, (shl i32:$src0, i32:$src1))]
 >;
+
+}
 defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
 
 defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
@@ -897,20 +973,17 @@ defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>;
 defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>;
 defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>;
 //defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>;
-//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
-//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
+defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>;
+defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
 
 let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
-  [(set i32:$dst, (add (i32 VSrc_32:$src0), (i32 VReg_32:$src1)))]
->;
-
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
-  [(set i32:$dst, (sub i32:$src0, i32:$src1))]
->;
+// No patterns so that the scalar instructions are always selected.
+// The scalar versions will be replaced with vector when needed later.
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", []>;
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", []>;
 defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], "V_SUB_I32">;
 
-let Uses = [VCC] in { // Carry-out comes from VCC
+let Uses = [VCC] in { // Carry-in comes from VCC
 defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", []>;
 defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", []>;
 defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], "V_SUBB_U32">;
@@ -948,8 +1021,12 @@ let neverHasSideEffects = 1 in {
 
 def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
 def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
-//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>;
-//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>;
+def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
+  [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))]
+>;
+def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
+  [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))]
+>;
 
 } // End neverHasSideEffects
 def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
@@ -960,10 +1037,16 @@ def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>;
 def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>;
 def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>;
 defm : BFIPatterns <V_BFI_B32>;
-def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>;
-def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>;
+def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
+  [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
+>;
+def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
+  [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
+>;
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
 def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
 def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
 def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
 ////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
@@ -982,13 +1065,36 @@ def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
 ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
 def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
 def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
-def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>;
-def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>;
-def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>;
+
+def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
+  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def V_LSHR_B64 : VOP3_64_Shift <0x00000162, "V_LSHR_B64",
+  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def V_ASHR_I64 : VOP3_64_Shift <0x00000163, "V_ASHR_I64",
+  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+let isCommutable = 1 in {
+
 def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>;
 def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>;
 def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>;
 def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
+
+} // isCommutable = 1
+
+def : Pat <
+  (fadd f64:$src0, f64:$src1),
+  (V_ADD_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+  (fmul f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, $src1, (i64 0))
+>;
+
 def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
 
 let isCommutable = 1 in {
@@ -1023,12 +1129,31 @@ def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
 //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
 //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
 def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
 def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+  [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+>;
+} // End isCommutable = 1
+
 def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>;
-def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>;
-def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+  [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+  [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+  [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End Uses = [SCC]
+} // End Defs = [SCC]
+
 def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>;
 def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>;
 def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>;
@@ -1060,7 +1185,9 @@ def : Pat <
   (S_OR_B64 $src0, $src1)
 >;
 def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>;
-def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>;
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+  [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+>;
 def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
 def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
 def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
@@ -1071,12 +1198,31 @@ def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
 def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
 def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
 def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>;
-def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>;
-def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>;
-def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>;
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+} // End AddedComplexity = 1
+
 def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
 def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
 def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
@@ -1096,7 +1242,7 @@ def LOAD_CONST : AMDGPUShaderInst <
   [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
 >;
 
-// SI Psuedo instructions. These are used by the CFG structurizer pass
+// SI pseudo instructions. These are used by the CFG structurizer pass
 // and should be lowered to ISA instructions prior to codegen.
 
 let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
@@ -1169,6 +1315,36 @@ def SI_KILL : InstSI <
 
 let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
 
+//defm SI_ : RegisterLoadStore <VReg_32, FRAMEri64, ADDRIndirect>;
+
+let UseNamedOperandTable = 1 in {
+
+def SI_RegisterLoad : AMDGPUShaderInst <
+  (outs VReg_32:$dst, SReg_64:$temp),
+  (ins FRAMEri64:$addr, i32imm:$chan),
+  "", []
+> {
+  let isRegisterLoad = 1;
+  let mayLoad = 1;
+}
+
+class SIRegStore<dag outs> : AMDGPUShaderInst <
+  outs,
+  (ins VReg_32:$val, FRAMEri64:$addr, i32imm:$chan),
+  "", []
+> {
+  let isRegisterStore = 1;
+  let mayStore = 1;
+}
+
+let usesCustomInserter = 1 in {
+def SI_RegisterStorePseudo : SIRegStore<(outs)>;
+} // End usesCustomInserter = 1
+def SI_RegisterStore : SIRegStore<(outs SReg_64:$temp)>;
+
+
+} // End UseNamedOperandTable = 1
+
 def SI_INDIRECT_SRC : InstSI <
   (outs VReg_32:$dst, SReg_64:$temp),
   (ins unknown:$src, VSrc_32:$idx, i32imm:$off),
@@ -1185,6 +1361,7 @@ class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
   let Constraints = "$src = $dst";
 }
 
+def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VReg_32>;
 def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
 def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
 def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
@@ -1192,6 +1369,25 @@ def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
 
 } // Uses = [EXEC,VCC,M0], Defs = [EXEC,VCC,M0]
 
+let usesCustomInserter = 1 in {
+
+// This pseudo instruction takes a pointer as input and outputs a resource
+// constant that can be used with the ADDR64 MUBUF instructions.
+def SI_ADDR64_RSRC : InstSI <
+  (outs SReg_128:$srsrc),
+  (ins SReg_64:$ptr),
+  "", []
+>;
+
+def V_SUB_F64 : InstSI <
+  (outs VReg_64:$dst),
+  (ins VReg_64:$src0, VReg_64:$src1),
+  "V_SUB_F64 $dst, $src0, $src1",
+  []
+>;
+
+} // end usesCustomInserter
+
 } // end IsCodeGenOnly, isPseudo
 
 def : Pat<
@@ -1206,10 +1402,8 @@ def : Pat <
 
 /* int_SI_vs_load_input */
 def : Pat<
-  (int_SI_vs_load_input v16i8:$tlst, IMM12bit:$attr_offset,
-                        i32:$buf_idx_vgpr),
-  (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0,
-                           $buf_idx_vgpr, $tlst, 0, 0, 0)
+  (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+  (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset)
 >;
 
 /* int_SI_export */
@@ -1220,66 +1414,94 @@ def : Pat <
        $src0, $src1, $src2, $src3)
 >;
 
+def : Pat <
+  (f64 (fsub f64:$src0, f64:$src1)),
+  (V_SUB_F64 $src0, $src1)
+>;
+
 /********** ======================= **********/
 /********** Image sampling patterns **********/
 /********** ======================= **********/
 
-/* int_SI_sample for simple 1D texture lookup */
+/* SIsample for simple 1D texture lookup */
 def : Pat <
-  (int_SI_sample v1i32:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
-  (IMAGE_SAMPLE 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
+  (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm),
+  (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SamplePattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, imm),
+class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleRectPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_RECT),
+class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT),
     (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleArrayPattern<Intrinsic name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_ARRAY),
+class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleShadowPattern<Intrinsic name, MIMG opcode,
+class SampleShadowPattern<SDNode name, MIMG opcode,
                           ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW),
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-class SampleShadowArrayPattern<Intrinsic name, MIMG opcode,
+class SampleShadowArrayPattern<SDNode name, MIMG opcode,
                                ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, v16i8:$sampler, TEX_SHADOW_ARRAY),
+    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
-/* int_SI_sample* for texture lookups consuming more address parameters */
-multiclass SamplePatterns<ValueType addr_type> {
-  def : SamplePattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleRectPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleArrayPattern <int_SI_sample, IMAGE_SAMPLE, addr_type>;
-  def : SampleShadowPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sample, IMAGE_SAMPLE_C, addr_type>;
-
-  def : SamplePattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
-  def : SampleArrayPattern <int_SI_samplel, IMAGE_SAMPLE_L, addr_type>;
-  def : SampleShadowPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_samplel, IMAGE_SAMPLE_C_L, addr_type>;
-
-  def : SamplePattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
-  def : SampleArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_B, addr_type>;
-  def : SampleShadowPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
-  def : SampleShadowArrayPattern <int_SI_sampleb, IMAGE_SAMPLE_C_B, addr_type>;
+/* SIsample* for texture lookups consuming more address parameters */
+multiclass SamplePatterns<MIMG sample, MIMG sample_c, MIMG sample_l,
+                          MIMG sample_c_l, MIMG sample_b, MIMG sample_c_b,
+MIMG sample_d, MIMG sample_c_d, ValueType addr_type> {
+  def : SamplePattern <SIsample, sample, addr_type>;
+  def : SampleRectPattern <SIsample, sample, addr_type>;
+  def : SampleArrayPattern <SIsample, sample, addr_type>;
+  def : SampleShadowPattern <SIsample, sample_c, addr_type>;
+  def : SampleShadowArrayPattern <SIsample, sample_c, addr_type>;
+
+  def : SamplePattern <SIsamplel, sample_l, addr_type>;
+  def : SampleArrayPattern <SIsamplel, sample_l, addr_type>;
+  def : SampleShadowPattern <SIsamplel, sample_c_l, addr_type>;
+  def : SampleShadowArrayPattern <SIsamplel, sample_c_l, addr_type>;
+
+  def : SamplePattern <SIsampleb, sample_b, addr_type>;
+  def : SampleArrayPattern <SIsampleb, sample_b, addr_type>;
+  def : SampleShadowPattern <SIsampleb, sample_c_b, addr_type>;
+  def : SampleShadowArrayPattern <SIsampleb, sample_c_b, addr_type>;
+
+  def : SamplePattern <SIsampled, sample_d, addr_type>;
+  def : SampleArrayPattern <SIsampled, sample_d, addr_type>;
+  def : SampleShadowPattern <SIsampled, sample_c_d, addr_type>;
+  def : SampleShadowArrayPattern <SIsampled, sample_c_d, addr_type>;
 }
 
-defm : SamplePatterns<v2i32>;
-defm : SamplePatterns<v4i32>;
-defm : SamplePatterns<v8i32>;
-defm : SamplePatterns<v16i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V2, IMAGE_SAMPLE_C_V4_V2,
+                      IMAGE_SAMPLE_L_V4_V2, IMAGE_SAMPLE_C_L_V4_V2,
+                      IMAGE_SAMPLE_B_V4_V2, IMAGE_SAMPLE_C_B_V4_V2,
+                      IMAGE_SAMPLE_D_V4_V2, IMAGE_SAMPLE_C_D_V4_V2,
+                      v2i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V4, IMAGE_SAMPLE_C_V4_V4,
+                      IMAGE_SAMPLE_L_V4_V4, IMAGE_SAMPLE_C_L_V4_V4,
+                      IMAGE_SAMPLE_B_V4_V4, IMAGE_SAMPLE_C_B_V4_V4,
+                      IMAGE_SAMPLE_D_V4_V4, IMAGE_SAMPLE_C_D_V4_V4,
+                      v4i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V8, IMAGE_SAMPLE_C_V4_V8,
+                      IMAGE_SAMPLE_L_V4_V8, IMAGE_SAMPLE_C_L_V4_V8,
+                      IMAGE_SAMPLE_B_V4_V8, IMAGE_SAMPLE_C_B_V4_V8,
+                      IMAGE_SAMPLE_D_V4_V8, IMAGE_SAMPLE_C_D_V4_V8,
+                      v8i32>;
+defm : SamplePatterns<IMAGE_SAMPLE_V4_V16, IMAGE_SAMPLE_C_V4_V16,
+                      IMAGE_SAMPLE_L_V4_V16, IMAGE_SAMPLE_C_L_V4_V16,
+                      IMAGE_SAMPLE_B_V4_V16, IMAGE_SAMPLE_C_B_V4_V16,
+                      IMAGE_SAMPLE_D_V4_V16, IMAGE_SAMPLE_C_D_V4_V16,
+                      v16i32>;
 
 /* int_SI_imageload for texture fetches consuming varying address parameters */
 class ImageLoadPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
@@ -1292,23 +1514,46 @@ class ImageLoadArrayPattern<Intrinsic name, MIMG opcode, ValueType addr_type> :
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
 >;
 
-multiclass ImageLoadPatterns<ValueType addr_type> {
-  def : ImageLoadPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
-  def : ImageLoadArrayPattern <int_SI_imageload, IMAGE_LOAD_MIP, addr_type>;
+class ImageLoadMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, TEX_MSAA),
+    (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+class ImageLoadArrayMSAAPattern<Intrinsic name, MIMG opcode, ValueType addr_type> : Pat <
+    (name addr_type:$addr, v32i8:$rsrc, TEX_ARRAY_MSAA),
+    (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc)
+>;
+
+multiclass ImageLoadPatterns<MIMG opcode, ValueType addr_type> {
+  def : ImageLoadPattern <int_SI_imageload, opcode, addr_type>;
+  def : ImageLoadArrayPattern <int_SI_imageload, opcode, addr_type>;
+}
+
+multiclass ImageLoadMSAAPatterns<MIMG opcode, ValueType addr_type> {
+  def : ImageLoadMSAAPattern <int_SI_imageload, opcode, addr_type>;
+  def : ImageLoadArrayMSAAPattern <int_SI_imageload, opcode, addr_type>;
 }
 
-defm : ImageLoadPatterns<v2i32>;
-defm : ImageLoadPatterns<v4i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V2, v2i32>;
+defm : ImageLoadPatterns<IMAGE_LOAD_MIP_V4_V4, v4i32>;
+
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V2, v2i32>;
+defm : ImageLoadMSAAPatterns<IMAGE_LOAD_V4_V4, v4i32>;
 
 /* Image resource information */
 def : Pat <
   (int_SI_resinfo i32:$mipid, v32i8:$rsrc, imm),
-  (IMAGE_GET_RESINFO 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
 >;
 
 def : Pat <
   (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY),
-  (IMAGE_GET_RESINFO 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
+>;
+
+def : Pat <
+  (int_SI_resinfo i32:$mipid, v32i8:$rsrc, TEX_ARRAY_MSAA),
+  (IMAGE_GET_RESINFO_V4_V1 0xf, 0, 0, 1, 0, 0, 0, 0, (V_MOV_B32_e32 $mipid), $rsrc)
 >;
 
 /********** ============================================ **********/
@@ -1379,22 +1624,30 @@ foreach Index = 0-15 in {
   >;
 }
 
-def : Vector1_Build <v1i32, i32, VReg_32>;
-def : Vector2_Build <v2i32, i32>;
-def : Vector2_Build <v2f32, f32>;
-def : Vector4_Build <v4i32, i32>;
-def : Vector4_Build <v4f32, f32>;
-def : Vector8_Build <v8i32, i32>;
-def : Vector8_Build <v8f32, f32>;
-def : Vector16_Build <v16i32, i32>;
-def : Vector16_Build <v16f32, f32>;
-
 def : BitConvert <i32, f32, SReg_32>;
 def : BitConvert <i32, f32, VReg_32>;
 
 def : BitConvert <f32, i32, SReg_32>;
 def : BitConvert <f32, i32, VReg_32>;
 
+def : BitConvert <i64, f64, VReg_64>;
+
+def : BitConvert <f64, i64, VReg_64>;
+
+def : BitConvert <v2f32, v2i32, VReg_64>;
+def : BitConvert <v2i32, v2f32, VReg_64>;
+def : BitConvert <v2i32, i64, VReg_64>;
+
+def : BitConvert <v4f32, v4i32, VReg_128>;
+def : BitConvert <v4i32, v4f32, VReg_128>;
+def : BitConvert <v4i32, i128,  VReg_128>;
+def : BitConvert <i128, v4i32,  VReg_128>;
+
+def : BitConvert <v8i32, v32i8, SReg_256>;
+def : BitConvert <v32i8, v8i32, SReg_256>;
+def : BitConvert <v8i32, v32i8, VReg_256>;
+def : BitConvert <v32i8, v8i32, VReg_256>;
+
 /********** =================== **********/
 /********** Src & Dst modifiers **********/
 /********** =================== **********/
@@ -1422,6 +1675,16 @@ def : Pat <
 /********** ================== **********/
 
 def : Pat <
+  (SGPRImm<(i32 imm)>:$imm),
+  (S_MOV_B32 imm:$imm)
+>;
+
+def : Pat <
+  (SGPRImm<(f32 fpimm)>:$imm),
+  (S_MOV_B32 fpimm:$imm)
+>;
+
+def : Pat <
   (i32 imm:$imm),
   (V_MOV_B32_e32 imm:$imm)
 >;
@@ -1449,6 +1712,13 @@ def : Pat <
     (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1)
 >;
 
+def : Pat <
+  (f64 fpimm:$imm),
+  (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+    (V_MOV_B32_e32 (f32 (LO32f fpimm:$imm))), sub0),
+    (V_MOV_B32_e32 (f32 (HI32f fpimm:$imm))), sub1)
+>;
+
 /********** ===================== **********/
 /********** Interpolation Paterns **********/
 /********** ===================== **********/
@@ -1483,6 +1753,11 @@ def : Pat<
   (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
 >;
 
+def : Pat<
+  (fdiv f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
+>;
+
 def : Pat <
   (fcos f32:$src0),
   (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
@@ -1521,20 +1796,20 @@ def : Pat <
 
 // 1. Offset as 8bit DWORD immediate
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, IMM8bitDWORD:$offset),
+  (SIload_constant i128:$sbase, IMM8bitDWORD:$offset),
   (S_BUFFER_LOAD_DWORD_IMM $sbase, IMM8bitDWORD:$offset)
 >;
 
 // 2. Offset loaded in an 32bit SGPR
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, imm:$offset),
+  (SIload_constant i128:$sbase, imm:$offset),
   (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
 >;
 
 // 3. Offset in an 32Bit VGPR
 def : Pat <
-  (int_SI_load_const v16i8:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD 0, 1, 0, 0, 0, 0, $voff, $sbase, 0, 0, 0)
+  (SIload_constant i128:$sbase, i32:$voff),
+  (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff)
 >;
 
 // The multiplication scales from [0,1] to the unsigned integer range
@@ -1545,6 +1820,12 @@ def : Pat <
                    (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
 >;
 
+def : Pat <
+  (int_SI_tid),
+  (V_MBCNT_HI_U32_B32_e32 0xffffffff,
+                          (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0, 0, 0))
+>;
+
 /********** ================== **********/
 /**********   VOP3 Patterns    **********/
 /********** ================== **********/
@@ -1554,6 +1835,40 @@ def : Pat <
   (V_MAD_F32 $src0, $src1, $src2)
 >;
 
+/********** ======================= **********/
+/**********   Load/Store Patterns   **********/
+/********** ======================= **********/
+
+class DSReadPat <DS inst, ValueType vt, PatFrag frag> : Pat <
+  (frag i32:$src0),
+  (vt (inst 0, $src0, $src0, $src0, 0, 0))
+>;
+
+def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local>;
+def : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local>;
+def : DSReadPat <DS_READ_I16, i32, sextloadi16_local>;
+def : DSReadPat <DS_READ_U16, i32, az_extloadi16_local>;
+def : DSReadPat <DS_READ_B32, i32, local_load>;
+def : Pat <
+    (local_load i32:$src0),
+    (i32 (DS_READ_B32 0, $src0, $src0, $src0, 0, 0))
+>;
+
+class DSWritePat <DS inst, ValueType vt, PatFrag frag> : Pat <
+  (frag i32:$src1, i32:$src0),
+  (inst 0, $src0, $src1, $src1, 0, 0)
+>;
+
+def : DSWritePat <DS_WRITE_B8, i32, truncstorei8_local>;
+def : DSWritePat <DS_WRITE_B16, i32, truncstorei16_local>;
+def : DSWritePat <DS_WRITE_B32, i32, local_store>;
+
+def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
+           (DS_ADD_U32_RTN 0, $ptr, $val, 0, 0)>;
+
+def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
+           (DS_SUB_U32_RTN 0, $ptr, $val, 0, 0)>;
+
 /********** ================== **********/
 /**********   SMRD Patterns    **********/
 /********** ================== **********/
@@ -1581,8 +1896,100 @@ multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
 
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
 defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v16i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
 defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+
+//===----------------------------------------------------------------------===//
+// MUBUF Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
+                              PatFrag global_ld, PatFrag constant_ld> {
+  def : Pat <
+    (vt (global_ld (add i64:$ptr, (i64 IMM12bit:$offset)))),
+    (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, (as_i16imm $offset))
+  >;
+
+  def : Pat <
+    (vt (global_ld i64:$ptr)),
+    (Instr_ADDR64 (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+  >;
+
+  def : Pat <
+     (vt (global_ld (add i64:$ptr, i64:$offset))),
+     (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+  >;
+
+  def : Pat <
+     (vt (constant_ld (add i64:$ptr, i64:$offset))),
+     (Instr_ADDR64 (SI_ADDR64_RSRC $ptr), $offset, 0)
+  >;
+}
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32,
+                          sextloadi8_global, sextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32,
+                          az_extloadi8_global, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32,
+                          sextloadi16_global, sextloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32,
+                          az_extloadi16_global, az_extloadi16_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORD_ADDR64, i32,
+                          global_load, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
+                          global_load, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
+                          az_extloadi32_global, az_extloadi32_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, v2i32,
+                          global_load, constant_load>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX4_ADDR64, v4i32,
+                          global_load, constant_load>;
+
+multiclass MUBUFStore_Pattern <MUBUF Instr, ValueType vt, PatFrag st> {
+
+  def : Pat <
+    (st vt:$value, i64:$ptr),
+    (Instr $value, (SI_ADDR64_RSRC (i64 0)), $ptr, 0)
+  >;
+
+  def : Pat <
+    (st vt:$value, (add i64:$ptr, i64:$offset)),
+    (Instr $value, (SI_ADDR64_RSRC $ptr), $offset, 0)
+   >;
+}
+
+defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE, i32, truncstorei8_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT, i32, truncstorei16_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORD, i32, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, i64, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX2, v2i32, global_store>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_DWORDX4, v4i32, global_store>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Patterns
+//===----------------------------------------------------------------------===//
+
+// TBUFFER_STORE_FORMAT_*, addr64=0
+class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat<
+  (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+                   i32:$soffset, imm:$inst_offset, imm:$dfmt,
+                   imm:$nfmt, imm:$offen, imm:$idxen,
+                   imm:$glc, imm:$slc, imm:$tfe),
+  (opcode
+    $vdata, (as_i16imm $inst_offset), (as_i1imm $offen), (as_i1imm $idxen),
+    (as_i1imm $glc), 0, (as_i8imm $dfmt), (as_i8imm $nfmt), $vaddr, $rsrc,
+    (as_i1imm $slc), (as_i1imm $tfe), $soffset)
+>;
+
+def : MTBUF_StoreResource <i32, 1, TBUFFER_STORE_FORMAT_X>;
+def : MTBUF_StoreResource <v2i32, 2, TBUFFER_STORE_FORMAT_XY>;
+def : MTBUF_StoreResource <v4i32, 3, TBUFFER_STORE_FORMAT_XYZ>;
+def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
 
 /********** ====================== **********/
 /**********   Indirect adressing   **********/
@@ -1592,25 +1999,25 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, SI_INDIRECT_DST IndDst> {
 
   // 1. Extract with offset
   def : Pat<
-    (vector_extract vt:$vec, (i64 (zext (add i32:$idx, imm:$off)))),
+    (vector_extract vt:$vec, (add i32:$idx, imm:$off)),
     (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
   >;
 
   // 2. Extract without offset
   def : Pat<
-    (vector_extract vt:$vec, (i64 (zext i32:$idx))),
+    (vector_extract vt:$vec, i32:$idx),
     (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
   >;
 
   // 3. Insert with offset
   def : Pat<
-    (vector_insert vt:$vec, f32:$val, (i64 (zext (add i32:$idx, imm:$off)))),
+    (vector_insert vt:$vec, f32:$val, (add i32:$idx, imm:$off)),
     (IndDst (IMPLICIT_DEF), $vec, $idx, imm:$off, $val)
   >;
 
   // 4. Insert without offset
   def : Pat<
-    (vector_insert vt:$vec, f32:$val, (i64 (zext i32:$idx))),
+    (vector_insert vt:$vec, f32:$val, i32:$idx),
     (IndDst (IMPLICIT_DEF), $vec, $idx, 0, $val)
   >;
 }
@@ -1634,6 +2041,37 @@ def : Pat<
   (V_CMP_U_F32_e64 $src0, $src1)
 >;
 
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (i64 (trunc i128:$x)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (i32 (EXTRACT_SUBREG $x, sub0)), sub0),
+    (i32 (EXTRACT_SUBREG $x, sub1)), sub1)
+>;
+
+def : Pat <
+  (i32 (trunc i64:$a)),
+  (EXTRACT_SUBREG $a, sub0)
+>;
+
+// V_ADD_I32_e32/S_ADD_I32 produces carry in VCC/SCC. For the vector
+// case, the sgpr-copies pass will fix this to use the vector version.
+def : Pat <
+  (i32 (addc i32:$src0, i32:$src1)),
+  (S_ADD_I32 $src0, $src1)
+>;
+
+def : Pat <
+  (or i64:$a, i64:$b),
+  (INSERT_SUBREG
+    (INSERT_SUBREG (IMPLICIT_DEF),
+      (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub0), (EXTRACT_SUBREG $b, sub0)), sub0),
+    (V_OR_B32_e32 (EXTRACT_SUBREG $a, sub1), (EXTRACT_SUBREG $b, sub1)), sub1)
+>;
+
 //============================================================================//
 // Miscellaneous Optimization Patterns
 //============================================================================//
diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td
index 224cd2f..7fcc964 100644
--- a/lib/Target/R600/SIIntrinsics.td
+++ b/lib/Target/R600/SIIntrinsics.td
@@ -14,15 +14,35 @@
 
 let TargetPrefix = "SI", isTarget = 1 in {
 
+  def int_SI_tid : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
   def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
   def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
-  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v16i8_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
 
-  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem]>;
+  // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
+  def int_SI_tbuffer_store : Intrinsic <
+    [],
+    [llvm_anyint_ty, // rsrc(SGPR)
+     llvm_anyint_ty, // vdata(VGPR), overloaded for types i32, v2i32, v4i32
+     llvm_i32_ty,    // num_channels(imm), selects opcode suffix: 1=X, 2=XY, 3=XYZ, 4=XYZW
+     llvm_i32_ty,    // vaddr(VGPR)
+     llvm_i32_ty,    // soffset(SGPR)
+     llvm_i32_ty,    // inst_offset(imm)
+     llvm_i32_ty,    // dfmt(imm)
+     llvm_i32_ty,    // nfmt(imm)
+     llvm_i32_ty,    // offen(imm)
+     llvm_i32_ty,    // idxen(imm)
+     llvm_i32_ty,    // glc(imm)
+     llvm_i32_ty,    // slc(imm)
+     llvm_i32_ty],   // tfe(imm)
+    []>;
+
+  class Sample : Intrinsic <[llvm_v4f32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
 
   def int_SI_sample : Sample;
   def int_SI_sampleb : Sample;
+  def int_SI_sampled : Sample;
   def int_SI_samplel : Sample;
 
   def int_SI_imageload : Intrinsic <[llvm_v4i32_ty], [llvm_anyvector_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index 2b60eb9..958763d 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -91,8 +91,7 @@ private:
 
 public:
   SILowerControlFlowPass(TargetMachine &tm) :
-    MachineFunctionPass(ID), TRI(tm.getRegisterInfo()),
-    TII(tm.getInstrInfo()) { }
+    MachineFunctionPass(ID), TRI(0), TII(0) { }
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
@@ -378,10 +377,13 @@ void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
   unsigned Dst = MI.getOperand(0).getReg();
   unsigned Vec = MI.getOperand(2).getReg();
   unsigned Off = MI.getOperand(4).getImm();
+  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
+  if (!SubReg)
+    SubReg = Vec;
 
-  MachineInstr *MovRel = 
+  MachineInstr *MovRel =
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
-            .addReg(TRI->getSubReg(Vec, AMDGPU::sub0) + Off)
+            .addReg(SubReg + Off)
             .addReg(AMDGPU::M0, RegState::Implicit)
             .addReg(Vec, RegState::Implicit);
 
@@ -396,10 +398,13 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
   unsigned Dst = MI.getOperand(0).getReg();
   unsigned Off = MI.getOperand(4).getImm();
   unsigned Val = MI.getOperand(5).getReg();
+  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
+  if (!SubReg)
+    SubReg = Dst;
 
   MachineInstr *MovRel = 
     BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
-            .addReg(TRI->getSubReg(Dst, AMDGPU::sub0) + Off, RegState::Define)
+            .addReg(SubReg + Off, RegState::Define)
             .addReg(Val)
             .addReg(AMDGPU::M0, RegState::Implicit)
             .addReg(Dst, RegState::Implicit);
@@ -408,8 +413,12 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
 }
 
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  TRI = MF.getTarget().getRegisterInfo();
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
+  bool NeedM0 = false;
   bool NeedWQM = false;
   unsigned Depth = 0;
 
@@ -474,6 +483,7 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
           IndirectSrc(MI);
           break;
 
+        case AMDGPU::SI_INDIRECT_DST_V1:
         case AMDGPU::SI_INDIRECT_DST_V2:
         case AMDGPU::SI_INDIRECT_DST_V4:
         case AMDGPU::SI_INDIRECT_DST_V8:
@@ -481,6 +491,14 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
           IndirectDst(MI);
           break;
 
+        case AMDGPU::DS_READ_B32:
+          NeedWQM = true;
+          // Fall through
+        case AMDGPU::DS_WRITE_B32:
+        case AMDGPU::DS_ADD_U32_RTN:
+          NeedM0 = true;
+          break;
+
         case AMDGPU::V_INTERP_P1_F32:
         case AMDGPU::V_INTERP_P2_F32:
         case AMDGPU::V_INTERP_MOV_F32:
@@ -491,7 +509,15 @@ bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
     }
   }
 
-  if (NeedWQM) {
+  if (NeedM0) {
+    MachineBasicBlock &MBB = MF.front();
+    // Initialize M0 to a value that won't cause LDS access to be discarded
+    // due to offset clamping
+    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+            AMDGPU::M0).addImm(0xffffffff);
+  }
+
+  if (NeedWQM && MFI->ShaderType != ShaderType::COMPUTE) {
     MachineBasicBlock &MBB = MF.front();
     BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
             AMDGPU::EXEC).addReg(AMDGPU::EXEC);
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index ee0e307..071f9fa 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -13,6 +13,10 @@
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void SIMachineFunctionInfo::anchor() {}
+
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     PSInputAddr(0) { }
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 6da9f7f..2f1961c 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -22,6 +22,7 @@ namespace llvm {
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public AMDGPUMachineFunction {
+  virtual void anchor();
 public:
   SIMachineFunctionInfo(const MachineFunction &MF);
   unsigned PSInputAddr;
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 99278ae..ed0bbaf 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -15,18 +15,21 @@
 
 #include "SIRegisterInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "SIInstrInfo.h"
 
 using namespace llvm;
 
-SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm,
-    const TargetInstrInfo &tii)
-: AMDGPURegisterInfo(tm, tii),
-  TM(tm),
-  TII(tii)
+SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm)
+: AMDGPURegisterInfo(tm),
+  TM(tm)
   { }
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
+  Reserved.set(AMDGPU::EXEC);
+  Reserved.set(AMDGPU::INDIRECT_BASE_ADDR);
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo*>(TM.getInstrInfo());
+  TII->reserveIndirectRegisters(Reserved, MF);
   return Reserved;
 }
 
@@ -51,3 +54,78 @@ const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass(
     case MVT::i32: return &AMDGPU::VReg_32RegClass;
   }
 }
+
+unsigned SIRegisterInfo::getHWRegIndex(unsigned Reg) const {
+  return getEncodingValue(Reg);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
+  assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+
+  const TargetRegisterClass *BaseClasses[] = {
+    &AMDGPU::VReg_32RegClass,
+    &AMDGPU::SReg_32RegClass,
+    &AMDGPU::VReg_64RegClass,
+    &AMDGPU::SReg_64RegClass,
+    &AMDGPU::SReg_128RegClass,
+    &AMDGPU::SReg_256RegClass
+  };
+
+  for (unsigned i = 0, e = sizeof(BaseClasses) /
+                           sizeof(const TargetRegisterClass*); i != e; ++i) {
+    if (BaseClasses[i]->contains(Reg)) {
+      return BaseClasses[i];
+    }
+  }
+  return NULL;
+}
+
+bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
+  if (!RC) {
+    return false;
+  }
+  return !hasVGPRs(RC);
+}
+
+bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
+  return getCommonSubClass(&AMDGPU::VReg_32RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) ||
+         getCommonSubClass(&AMDGPU::VReg_512RegClass, RC);
+}
+
+const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
+                                         const TargetRegisterClass *SRC) const {
+    if (hasVGPRs(SRC)) {
+      return SRC;
+    } else if (SRC == &AMDGPU::SCCRegRegClass) {
+      return &AMDGPU::VCCRegRegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_32RegClass)) {
+      return &AMDGPU::VReg_32RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SGPR_64RegClass)) {
+      return &AMDGPU::VReg_64RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_128RegClass)) {
+      return &AMDGPU::VReg_128RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_256RegClass)) {
+      return &AMDGPU::VReg_256RegClass;
+    } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
+      return &AMDGPU::VReg_512RegClass;
+    }
+    return NULL;
+}
+
+const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
+                         const TargetRegisterClass *RC, unsigned SubIdx) const {
+  if (SubIdx == AMDGPU::NoSubRegister)
+    return RC;
+
+  // If this register has a sub-register, we can safely assume it is a 32-bit
+  // register, becuase all of SI's sub-registers are 32-bit.
+  if (isSGPRClass(RC)) {
+    return &AMDGPU::SGPR_32RegClass;
+  } else {
+    return &AMDGPU::VGPR_32RegClass;
+  }
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index caec228..8148f7f 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -21,13 +21,11 @@
 namespace llvm {
 
 class AMDGPUTargetMachine;
-class TargetInstrInfo;
 
 struct SIRegisterInfo : public AMDGPURegisterInfo {
   AMDGPUTargetMachine &TM;
-  const TargetInstrInfo &TII;
 
-  SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+  SIRegisterInfo(AMDGPUTargetMachine &tm);
 
   virtual BitVector getReservedRegs(const MachineFunction &MF) const;
 
@@ -43,6 +41,28 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   /// \brief get the register class of the specified type to use in the
   /// CFGStructurizer
   virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+
+  virtual unsigned getHWRegIndex(unsigned Reg) const;
+
+  /// \brief Return the 'base' register class for this register.
+  /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
+  const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+
+  /// \returns true if this class contains only SGPR registers
+  bool isSGPRClass(const TargetRegisterClass *RC) const;
+
+  /// \returns true if this class contains VGPR registers.
+  bool hasVGPRs(const TargetRegisterClass *RC) const;
+
+  /// \returns A VGPR reg class with the same width as \p SRC
+  const TargetRegisterClass *getEquivalentVGPRClass(
+                                          const TargetRegisterClass *SRC) const;
+
+  /// \returns The register class that is used for a sub-register of \p RC for
+  /// the given \p SubIdx.  If \p SubIdx equals NoSubRegister, \p RC will
+  /// be returned.
+  const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
+                                            unsigned SubIdx) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 244d4c00..49bdbc9 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -43,7 +43,7 @@ def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
                             (add (sequence "SGPR%u", 0, 101))>;
 
 // SGPR 64-bit registers
-def SGPR_64 : RegisterTuples<[sub0, sub1],
+def SGPR_64Regs : RegisterTuples<[sub0, sub1],
                              [(add (decimate (trunc SGPR_32, 101), 2)),
                               (add (decimate (shl SGPR_32, 1), 2))]>;
 
@@ -153,15 +153,17 @@ def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32,
   (add SGPR_32, M0Reg)
 >;
 
-def SReg_64 : RegisterClass<"AMDGPU", [i64, i1], 64,
-  (add SGPR_64, VCCReg, EXECReg)
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64], 64, (add SGPR_64Regs)>;
+
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
+  (add SGPR_64Regs, VCCReg, EXECReg)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [v16i8, i128], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>;
 
-def SReg_256 : RegisterClass<"AMDGPU", [v32i8], 256, (add SGPR_256)>;
+def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
 
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8], 512, (add SGPR_512)>;
+def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 512, (add SGPR_512)>;
 
 // Register class for all vector registers (VGPRs + Interploation Registers)
 def VReg_32 : RegisterClass<"AMDGPU", [i32, f32, v1i32], 32, (add VGPR_32)>;
@@ -172,9 +174,9 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
   let Size = 96;
 }
 
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>;
 
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 256, (add VGPR_256)>;
+def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
 
 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
 
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
new file mode 100644
index 0000000..f194d8b
--- /dev/null
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -0,0 +1,162 @@
+//===-- SITypeRewriter.cpp - Remove unwanted types ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass removes performs the following type substitution on all
+/// non-compute shaders:
+///
+/// v16i8 => i128
+///   - v16i8 is used for constant memory resource descriptors.  This type is
+///      legal for some compute APIs, and we don't want to declare it as legal
+///      in the backend, because we want the legalizer to expand all v16i8
+///      operations.
+/// v1* => *
+///   - Having v1* types complicates the legalizer and we can easily replace
+///   - them with the element type.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InstVisitor.h"
+
+using namespace llvm;
+
+namespace {
+
+class SITypeRewriter : public FunctionPass,
+                       public InstVisitor<SITypeRewriter> {
+
+  static char ID;
+  Module *Mod;
+  Type *v16i8;
+  Type *i128;
+
+public:
+  SITypeRewriter() : FunctionPass(ID) { }
+  virtual bool doInitialization(Module &M);
+  virtual bool runOnFunction(Function &F);
+  virtual const char *getPassName() const {
+    return "SI Type Rewriter";
+  }
+  void visitLoadInst(LoadInst &I);
+  void visitCallInst(CallInst &I);
+  void visitBitCast(BitCastInst &I);
+};
+
+} // End anonymous namespace
+
+char SITypeRewriter::ID = 0;
+
+bool SITypeRewriter::doInitialization(Module &M) {
+  Mod = &M;
+  v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
+  i128 = Type::getIntNTy(M.getContext(), 128);
+  return false;
+}
+
+bool SITypeRewriter::runOnFunction(Function &F) {
+  AttributeSet Set = F.getAttributes();
+  Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, "ShaderType");
+
+  unsigned ShaderType = ShaderType::COMPUTE;
+  if (A.isStringAttribute()) {
+    StringRef Str = A.getValueAsString();
+    Str.getAsInteger(0, ShaderType);
+  }
+  if (ShaderType != ShaderType::COMPUTE) {
+    visit(F);
+  }
+
+  visit(F);
+
+  return false;
+}
+
+void SITypeRewriter::visitLoadInst(LoadInst &I) {
+  Value *Ptr = I.getPointerOperand();
+  Type *PtrTy = Ptr->getType();
+  Type *ElemTy = PtrTy->getPointerElementType();
+  IRBuilder<> Builder(&I);
+  if (ElemTy == v16i8)  {
+    Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2));
+    LoadInst *Load = Builder.CreateLoad(BitCast);
+    SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
+    I.getAllMetadataOtherThanDebugLoc(MD);
+    for (unsigned i = 0, e = MD.size(); i != e; ++i) {
+      Load->setMetadata(MD[i].first, MD[i].second);
+    }
+    Value *BitCastLoad = Builder.CreateBitCast(Load, I.getType());
+    I.replaceAllUsesWith(BitCastLoad);
+    I.eraseFromParent();
+  }
+}
+
+void SITypeRewriter::visitCallInst(CallInst &I) {
+  IRBuilder<> Builder(&I);
+  SmallVector <Value*, 8> Args;
+  SmallVector <Type*, 8> Types;
+  bool NeedToReplace = false;
+  Function *F = I.getCalledFunction();
+  std::string Name = F->getName().str();
+  for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+    Value *Arg = I.getArgOperand(i);
+    if (Arg->getType() == v16i8) {
+      Args.push_back(Builder.CreateBitCast(Arg, i128));
+      Types.push_back(i128);
+      NeedToReplace = true;
+      Name = Name + ".i128";
+    } else if (Arg->getType()->isVectorTy() &&
+               Arg->getType()->getVectorNumElements() == 1 &&
+               Arg->getType()->getVectorElementType() ==
+                                              Type::getInt32Ty(I.getContext())){
+      Type *ElementTy = Arg->getType()->getVectorElementType();
+      std::string TypeName = "i32";
+      InsertElementInst *Def = dyn_cast<InsertElementInst>(Arg);
+      assert(Def);
+      Args.push_back(Def->getOperand(1));
+      Types.push_back(ElementTy);
+      std::string VecTypeName = "v1" + TypeName;
+      Name = Name.replace(Name.find(VecTypeName), VecTypeName.length(), TypeName);
+      NeedToReplace = true;
+    } else {
+      Args.push_back(Arg);
+      Types.push_back(Arg->getType());
+    }
+  }
+
+  if (!NeedToReplace) {
+    return;
+  }
+  Function *NewF = Mod->getFunction(Name);
+  if (!NewF) {
+    NewF = Function::Create(FunctionType::get(F->getReturnType(), Types, false), GlobalValue::ExternalLinkage, Name, Mod);
+    NewF->setAttributes(F->getAttributes());
+  }
+  I.replaceAllUsesWith(Builder.CreateCall(NewF, Args));
+  I.eraseFromParent();
+}
+
+void SITypeRewriter::visitBitCast(BitCastInst &I) {
+  IRBuilder<> Builder(&I);
+  if (I.getDestTy() != i128) {
+    return;
+  }
+
+  if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
+    if (Op->getSrcTy() == i128) {
+      I.replaceAllUsesWith(Op->getOperand(0));
+      I.eraseFromParent();
+    }
+  }
+}
+
+FunctionPass *llvm::createSITypeRewriter() {
+  return new SITypeRewriter();
+}
diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
index 46b1f18..f437564 100644
--- a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
 #include "llvm/Support/TargetRegistry.h"
 
 using namespace llvm;
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
index efb10db..6339394 100644
--- a/lib/Target/Sparc/CMakeLists.txt
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -2,6 +2,7 @@ set(LLVM_TARGET_DEFINITIONS Sparc.td)
 
 tablegen(LLVM SparcGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM SparcGenInstrInfo.inc -gen-instr-info)
+tablegen(LLVM SparcGenCodeEmitter.inc -gen-emitter)
 tablegen(LLVM SparcGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM SparcGenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM SparcGenSubtargetInfo.inc -gen-subtarget)
@@ -10,7 +11,6 @@ add_public_tablegen_target(SparcCommonTableGen)
 
 add_llvm_target(SparcCodeGen
   DelaySlotFiller.cpp
-  FPMover.cpp
   SparcAsmPrinter.cpp
   SparcInstrInfo.cpp
   SparcISelDAGToDAG.cpp
@@ -21,9 +21,11 @@ add_llvm_target(SparcCodeGen
   SparcSubtarget.cpp
   SparcTargetMachine.cpp
   SparcSelectionDAGInfo.cpp
+  SparcJITInfo.cpp
+  SparcCodeEmitter.cpp
   )
 
-add_dependencies(LLVMSparcCodeGen intrinsics_gen)
+add_dependencies(LLVMSparcCodeGen SparcCommonTableGen intrinsics_gen)
 
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 6123773..9a0466a 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -14,6 +14,7 @@
 
 #define DEBUG_TYPE "delay-slot-filler"
 #include "Sparc.h"
+#include "SparcSubtarget.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -39,11 +40,13 @@ namespace {
     /// layout, etc.
     ///
     TargetMachine &TM;
-    const TargetInstrInfo *TII;
+    const SparcSubtarget *Subtarget;
 
     static char ID;
-    Filler(TargetMachine &tm) 
-      : MachineFunctionPass(ID), TM(tm), TII(tm.getInstrInfo()) { }
+    Filler(TargetMachine &tm)
+      : MachineFunctionPass(ID), TM(tm),
+        Subtarget(&TM.getSubtarget<SparcSubtarget>()) {
+    }
 
     virtual const char *getPassName() const {
       return "SPARC Delay Slot Filler";
@@ -61,8 +64,9 @@ namespace {
     bool isDelayFiller(MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator candidate);
 
-    void insertCallUses(MachineBasicBlock::iterator MI,
-                        SmallSet<unsigned, 32>& RegUses);
+    void insertCallDefsUses(MachineBasicBlock::iterator MI,
+                            SmallSet<unsigned, 32>& RegDefs,
+                            SmallSet<unsigned, 32>& RegUses);
 
     void insertDefsUses(MachineBasicBlock::iterator MI,
                         SmallSet<unsigned, 32>& RegDefs,
@@ -81,6 +85,9 @@ namespace {
 
     bool needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize);
 
+    bool tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI);
+
   };
   char Filler::ID = 0;
 } // end of anonymous namespace
@@ -99,29 +106,54 @@ FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
 bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   bool Changed = false;
 
-  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
-    if (I->hasDelaySlot()) {
-      MachineBasicBlock::iterator D = MBB.end();
-      MachineBasicBlock::iterator J = I;
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    MachineBasicBlock::iterator MI = I;
+    ++I;
 
-      if (!DisableDelaySlotFiller)
-        D = findDelayInstr(MBB, I);
+    // If MI is restore, try combining it with previous inst.
+    if (!DisableDelaySlotFiller &&
+        (MI->getOpcode() == SP::RESTORErr
+         || MI->getOpcode() == SP::RESTOREri)) {
+      Changed |= tryCombineRestoreWithPrevInst(MBB, MI);
+      continue;
+    }
 
-      ++FilledSlots;
+    if (!Subtarget->isV9() &&
+        (MI->getOpcode() == SP::FCMPS || MI->getOpcode() == SP::FCMPD
+         || MI->getOpcode() == SP::FCMPQ)) {
+      BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
       Changed = true;
+      continue;
+    }
+
+    // If MI has no delay slot, skip.
+    if (!MI->hasDelaySlot())
+      continue;
+
+    MachineBasicBlock::iterator D = MBB.end();
+
+    if (!DisableDelaySlotFiller)
+      D = findDelayInstr(MBB, MI);
+
+    ++FilledSlots;
+    Changed = true;
+
+    if (D == MBB.end())
+      BuildMI(MBB, I, MI->getDebugLoc(), TII->get(SP::NOP));
+    else
+      MBB.splice(I, &MBB, D);
 
-      if (D == MBB.end())
-        BuildMI(MBB, ++J, I->getDebugLoc(), TII->get(SP::NOP));
-      else
-        MBB.splice(++J, &MBB, D);
-      unsigned structSize = 0;
-      if (needsUnimp(I, structSize)) {
-        MachineBasicBlock::iterator J = I;
-        ++J; //skip the delay filler.
-        BuildMI(MBB, ++J, I->getDebugLoc(),
-                TII->get(SP::UNIMP)).addImm(structSize);
-      }
+    unsigned structSize = 0;
+    if (needsUnimp(MI, structSize)) {
+      MachineBasicBlock::iterator J = MI;
+      ++J; // skip the delay filler.
+      assert (J != MBB.end() && "MI needs a delay instruction.");
+      BuildMI(MBB, ++J, MI->getDebugLoc(),
+              TII->get(SP::UNIMP)).addImm(structSize);
     }
+  }
   return Changed;
 }
 
@@ -134,28 +166,34 @@ Filler::findDelayInstr(MachineBasicBlock &MBB,
   bool sawLoad = false;
   bool sawStore = false;
 
-  MachineBasicBlock::iterator I = slot;
+  if (slot == MBB.begin())
+    return MBB.end();
 
-  if (slot->getOpcode() == SP::RET)
+  if (slot->getOpcode() == SP::RET || slot->getOpcode() == SP::TLS_CALL)
     return MBB.end();
 
   if (slot->getOpcode() == SP::RETL) {
-    --I;
-    if (I->getOpcode() != SP::RESTORErr)
-      return MBB.end();
-    //change retl to ret
-    slot->setDesc(TII->get(SP::RET));
-    return I;
+    MachineBasicBlock::iterator J = slot;
+    --J;
+
+    if (J->getOpcode() == SP::RESTORErr
+        || J->getOpcode() == SP::RESTOREri) {
+      // change retl to ret.
+      slot->setDesc(TM.getInstrInfo()->get(SP::RET));
+      return J;
+    }
   }
 
-  //Call's delay filler can def some of call's uses.
+  // Call's delay filler can def some of call's uses.
   if (slot->isCall())
-    insertCallUses(slot, RegUses);
+    insertCallDefsUses(slot, RegDefs, RegUses);
   else
     insertDefsUses(slot, RegDefs, RegUses);
 
   bool done = false;
 
+  MachineBasicBlock::iterator I = slot;
+
   while (!done) {
     done = (I == MBB.begin());
 
@@ -216,12 +254,12 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
     unsigned Reg = MO.getReg();
 
     if (MO.isDef()) {
-      //check whether Reg is defined or used before delay slot.
+      // check whether Reg is defined or used before delay slot.
       if (IsRegInSet(RegDefs, Reg) || IsRegInSet(RegUses, Reg))
         return true;
     }
     if (MO.isUse()) {
-      //check whether Reg is defined before delay slot.
+      // check whether Reg is defined before delay slot.
       if (IsRegInSet(RegDefs, Reg))
         return true;
     }
@@ -230,9 +268,12 @@ bool Filler::delayHasHazard(MachineBasicBlock::iterator candidate,
 }
 
 
-void Filler::insertCallUses(MachineBasicBlock::iterator MI,
-                            SmallSet<unsigned, 32>& RegUses)
+void Filler::insertCallDefsUses(MachineBasicBlock::iterator MI,
+                                SmallSet<unsigned, 32>& RegDefs,
+                                SmallSet<unsigned, 32>& RegUses)
 {
+  // Call defines o7, which is visible to the instruction in delay slot.
+  RegDefs.insert(SP::O7);
 
   switch(MI->getOpcode()) {
   default: llvm_unreachable("Unknown opcode.");
@@ -255,7 +296,7 @@ void Filler::insertCallUses(MachineBasicBlock::iterator MI,
   }
 }
 
-//Insert Defs and Uses of MI into the sets RegDefs and RegUses.
+// Insert Defs and Uses of MI into the sets RegDefs and RegUses.
 void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
                             SmallSet<unsigned, 32>& RegDefs,
                             SmallSet<unsigned, 32>& RegUses)
@@ -270,13 +311,17 @@ void Filler::insertDefsUses(MachineBasicBlock::iterator MI,
       continue;
     if (MO.isDef())
       RegDefs.insert(Reg);
-    if (MO.isUse())
+    if (MO.isUse()) {
+      // Implicit register uses of retl are return values and
+      // retl does not use them.
+      if (MO.isImplicit() && MI->getOpcode() == SP::RETL)
+        continue;
       RegUses.insert(Reg);
-
+    }
   }
 }
 
-//returns true if the Reg or its alias is in the RegSet.
+// returns true if the Reg or its alias is in the RegSet.
 bool Filler::IsRegInSet(SmallSet<unsigned, 32>& RegSet, unsigned Reg)
 {
   // Check Reg and all aliased Registers.
@@ -310,6 +355,7 @@ bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
   case SP::CALL: structSizeOpNum = 1; break;
   case SP::JMPLrr:
   case SP::JMPLri: structSizeOpNum = 2; break;
+  case SP::TLS_CALL: return false;
   }
 
   const MachineOperand &MO = I->getOperand(structSizeOpNum);
@@ -318,3 +364,142 @@ bool Filler::needsUnimp(MachineBasicBlock::iterator I, unsigned &StructSize)
   StructSize = MO.getImm();
   return true;
 }
+
+static bool combineRestoreADD(MachineBasicBlock::iterator RestoreMI,
+                              MachineBasicBlock::iterator AddMI,
+                              const TargetInstrInfo *TII)
+{
+  // Before:  add  <op0>, <op1>, %i[0-7]
+  //          restore %g0, %g0, %i[0-7]
+  //
+  // After :  restore <op0>, <op1>, %o[0-7]
+
+  unsigned reg = AddMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  // Erase RESTORE.
+  RestoreMI->eraseFromParent();
+
+  // Change ADD to RESTORE.
+  AddMI->setDesc(TII->get((AddMI->getOpcode() == SP::ADDrr)
+                          ? SP::RESTORErr
+                          : SP::RESTOREri));
+
+  // Map the destination register.
+  AddMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+static bool combineRestoreOR(MachineBasicBlock::iterator RestoreMI,
+                             MachineBasicBlock::iterator OrMI,
+                             const TargetInstrInfo *TII)
+{
+  // Before:  or  <op0>, <op1>, %i[0-7]
+  //          restore %g0, %g0, %i[0-7]
+  //    and <op0> or <op1> is zero,
+  //
+  // After :  restore <op0>, <op1>, %o[0-7]
+
+  unsigned reg = OrMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  // check whether it is a copy.
+  if (OrMI->getOpcode() == SP::ORrr
+      && OrMI->getOperand(1).getReg() != SP::G0
+      && OrMI->getOperand(2).getReg() != SP::G0)
+    return false;
+
+  if (OrMI->getOpcode() == SP::ORri
+      && OrMI->getOperand(1).getReg() != SP::G0
+      && (!OrMI->getOperand(2).isImm() || OrMI->getOperand(2).getImm() != 0))
+    return false;
+
+  // Erase RESTORE.
+  RestoreMI->eraseFromParent();
+
+  // Change OR to RESTORE.
+  OrMI->setDesc(TII->get((OrMI->getOpcode() == SP::ORrr)
+                         ? SP::RESTORErr
+                         : SP::RESTOREri));
+
+  // Map the destination register.
+  OrMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+
+  return true;
+}
+
+static bool combineRestoreSETHIi(MachineBasicBlock::iterator RestoreMI,
+                                 MachineBasicBlock::iterator SetHiMI,
+                                 const TargetInstrInfo *TII)
+{
+  // Before:  sethi imm3, %i[0-7]
+  //          restore %g0, %g0, %g0
+  //
+  // After :  restore %g0, (imm3<<10), %o[0-7]
+
+  unsigned reg = SetHiMI->getOperand(0).getReg();
+  if (reg < SP::I0 || reg > SP::I7)
+    return false;
+
+  if (!SetHiMI->getOperand(1).isImm())
+    return false;
+
+  int64_t imm = SetHiMI->getOperand(1).getImm();
+
+  // Is it a 3 bit immediate?
+  if (!isInt<3>(imm))
+    return false;
+
+  // Make it a 13 bit immediate.
+  imm = (imm << 10) & 0x1FFF;
+
+  assert(RestoreMI->getOpcode() == SP::RESTORErr);
+
+  RestoreMI->setDesc(TII->get(SP::RESTOREri));
+
+  RestoreMI->getOperand(0).setReg(reg - SP::I0 + SP::O0);
+  RestoreMI->getOperand(1).setReg(SP::G0);
+  RestoreMI->getOperand(2).ChangeToImmediate(imm);
+
+
+  // Erase the original SETHI.
+  SetHiMI->eraseFromParent();
+
+  return true;
+}
+
+bool Filler::tryCombineRestoreWithPrevInst(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI)
+{
+  // No previous instruction.
+  if (MBBI == MBB.begin())
+    return false;
+
+  // assert that MBBI is a "restore %g0, %g0, %g0".
+  assert(MBBI->getOpcode() == SP::RESTORErr
+         && MBBI->getOperand(0).getReg() == SP::G0
+         && MBBI->getOperand(1).getReg() == SP::G0
+         && MBBI->getOperand(2).getReg() == SP::G0);
+
+  MachineBasicBlock::iterator PrevInst = MBBI; --PrevInst;
+
+  // It cannot combine with a delay filler.
+  if (isDelayFiller(MBB, PrevInst))
+    return false;
+
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+
+  switch (PrevInst->getOpcode()) {
+  default: break;
+  case SP::ADDrr:
+  case SP::ADDri: return combineRestoreADD(MBBI, PrevInst, TII); break;
+  case SP::ORrr:
+  case SP::ORri:  return combineRestoreOR(MBBI, PrevInst, TII); break;
+  case SP::SETHIi: return combineRestoreSETHIi(MBBI, PrevInst, TII); break;
+  }
+  // It cannot combine with the previous instruction.
+  return false;
+}
diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp
deleted file mode 100644
index 1325b98..0000000
--- a/lib/Target/Sparc/FPMover.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===-- FPMover.cpp - Sparc double-precision floating point move fixer ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Expand FpMOVD/FpABSD/FpNEGD instructions into their single-precision pieces.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "fpmover"
-#include "Sparc.h"
-#include "SparcSubtarget.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-STATISTIC(NumFpDs , "Number of instructions translated");
-STATISTIC(NoopFpDs, "Number of noop instructions removed");
-
-namespace {
-  struct FPMover : public MachineFunctionPass {
-    /// Target machine description which we query for reg. names, data
-    /// layout, etc.
-    ///
-    TargetMachine &TM;
-    
-    static char ID;
-    explicit FPMover(TargetMachine &tm) 
-      : MachineFunctionPass(ID), TM(tm) { }
-
-    virtual const char *getPassName() const {
-      return "Sparc Double-FP Move Fixer";
-    }
-
-    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
-    bool runOnMachineFunction(MachineFunction &F);
-  };
-  char FPMover::ID = 0;
-} // end of anonymous namespace
-
-/// createSparcFPMoverPass - Returns a pass that turns FpMOVD
-/// instructions into FMOVS instructions
-///
-FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) {
-  return new FPMover(tm);
-}
-
-/// getDoubleRegPair - Given a DFP register, return the even and odd FP
-/// registers that correspond to it.
-static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg,
-                             unsigned &OddReg) {
-  static const uint16_t EvenHalvesOfPairs[] = {
-    SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14,
-    SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30
-  };
-  static const uint16_t OddHalvesOfPairs[] = {
-    SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15,
-    SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31
-  };
-  static const uint16_t DoubleRegsInOrder[] = {
-    SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8,
-    SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15
-  };
-  for (unsigned i = 0; i < array_lengthof(DoubleRegsInOrder); ++i)
-    if (DoubleRegsInOrder[i] == DoubleReg) {
-      EvenReg = EvenHalvesOfPairs[i];
-      OddReg = OddHalvesOfPairs[i];
-      return;
-    }
-  llvm_unreachable("Can't find reg");
-}
-
-/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB.
-///
-bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
-  bool Changed = false;
-  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
-    MachineInstr *MI = I++;
-    DebugLoc dl = MI->getDebugLoc();
-    if (MI->getOpcode() == SP::FpMOVD || MI->getOpcode() == SP::FpABSD ||
-        MI->getOpcode() == SP::FpNEGD) {
-      Changed = true;
-      unsigned DestDReg = MI->getOperand(0).getReg();
-      unsigned SrcDReg  = MI->getOperand(1).getReg();
-      if (DestDReg == SrcDReg && MI->getOpcode() == SP::FpMOVD) {
-        MBB.erase(MI);   // Eliminate the noop copy.
-        ++NoopFpDs;
-        continue;
-      }
-      
-      unsigned EvenSrcReg = 0, OddSrcReg = 0, EvenDestReg = 0, OddDestReg = 0;
-      getDoubleRegPair(DestDReg, EvenDestReg, OddDestReg);
-      getDoubleRegPair(SrcDReg, EvenSrcReg, OddSrcReg);
-
-      const TargetInstrInfo *TII = TM.getInstrInfo();
-      if (MI->getOpcode() == SP::FpMOVD)
-        MI->setDesc(TII->get(SP::FMOVS));
-      else if (MI->getOpcode() == SP::FpNEGD)
-        MI->setDesc(TII->get(SP::FNEGS));
-      else if (MI->getOpcode() == SP::FpABSD)
-        MI->setDesc(TII->get(SP::FABSS));
-      else
-        llvm_unreachable("Unknown opcode!");
-        
-      MI->getOperand(0).setReg(EvenDestReg);
-      MI->getOperand(1).setReg(EvenSrcReg);
-      DEBUG(errs() << "FPMover: the modified instr is: " << *MI);
-      // Insert copy for the other half of the double.
-      if (DestDReg != SrcDReg) {
-        MI = BuildMI(MBB, I, dl, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg)
-          .addReg(OddSrcReg);
-        DEBUG(errs() << "FPMover: the inserted instr is: " << *MI);
-      }
-      ++NumFpDs;
-    }
-  }
-  return Changed;
-}
-
-bool FPMover::runOnMachineFunction(MachineFunction &F) {
-  // If the target has V9 instructions, the fp-mover pseudos will never be
-  // emitted.  Avoid a scan of the instructions to improve compile time.
-  if (TM.getSubtarget<SparcSubtarget>().isV9())
-    return false;
-  
-  bool Changed = false;
-  for (MachineFunction::iterator FI = F.begin(), FE = F.end();
-       FI != FE; ++FI)
-    Changed |= runOnMachineBasicBlock(*FI);
-  return Changed;
-}
diff --git a/lib/Target/Sparc/LLVMBuild.txt b/lib/Target/Sparc/LLVMBuild.txt
index fe20d2f..fd8e5d9 100644
--- a/lib/Target/Sparc/LLVMBuild.txt
+++ b/lib/Target/Sparc/LLVMBuild.txt
@@ -23,10 +23,12 @@ type = TargetGroup
 name = Sparc
 parent = Target
 has_asmprinter = 1
+has_jit = 1
 
 [component_1]
 type = Library
 name = SparcCodeGen
 parent = Sparc
-required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SparcDesc SparcInfo Support Target
+required_libraries = AsmPrinter CodeGen Core MC SelectionDAG SparcDesc
+                     SparcInfo Support Target
 add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
index aac0e8d..f3caeaa 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcBaseInfo.h
@@ -53,7 +53,27 @@ enum TOF {
 
   // Extract bits 41-32 of an address.
   // Assembler: %hm(addr)
-  MO_HM
+  MO_HM,
+
+  // TargetFlags for Thread Local Storage.
+  MO_TLS_GD_HI22,
+  MO_TLS_GD_LO10,
+  MO_TLS_GD_ADD,
+  MO_TLS_GD_CALL,
+  MO_TLS_LDM_HI22,
+  MO_TLS_LDM_LO10,
+  MO_TLS_LDM_ADD,
+  MO_TLS_LDM_CALL,
+  MO_TLS_LDO_HIX22,
+  MO_TLS_LDO_LOX10,
+  MO_TLS_LDO_ADD,
+  MO_TLS_IE_HI22,
+  MO_TLS_IE_LO10,
+  MO_TLS_IE_LD,
+  MO_TLS_IE_LDX,
+  MO_TLS_IE_ADD,
+  MO_TLS_LE_HIX22,
+  MO_TLS_LE_LOX10
 };
 
 } // end namespace SPII
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 3d4bfdc..baac36b 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -18,26 +18,29 @@ using namespace llvm;
 
 void SparcELFMCAsmInfo::anchor() { }
 
-SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Target &T, StringRef TT) {
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
   IsLittleEndian = false;
   Triple TheTriple(TT);
-  if (TheTriple.getArch() == Triple::sparcv9) {
+  bool isV9 = (TheTriple.getArch() == Triple::sparcv9);
+
+  if (isV9) {
     PointerSize = CalleeSaveStackSlotSize = 8;
   }
 
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
-  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  // .xword is only supported by V9.
+  Data64bitsDirective = (isV9) ? "\t.xword\t" : 0;
   ZeroDirective = "\t.skip\t";
   CommentString = "!";
   HasLEB128 = true;
   SupportsDebugInformation = true;
-  
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+
   SunStyleELFSectionSwitchSyntax = true;
   UsesELFSectionDirectiveForBSS = true;
 
-  WeakRefDirective = "\t.weak\t";
-
   PrivateGlobalPrefix = ".L";
 }
 
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index f0e1354..1e58e37 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -14,16 +14,15 @@
 #ifndef SPARCTARGETASMINFO_H
 #define SPARCTARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
-  class Target;
 
-  class SparcELFMCAsmInfo : public MCAsmInfo {
+  class SparcELFMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
-    explicit SparcELFMCAsmInfo(const Target &T, StringRef TT);
+    explicit SparcELFMCAsmInfo(StringRef TT);
   };
 
 } // namespace llvm
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
index 4b81ada..c171db7 100644
--- a/lib/Target/Sparc/Makefile
+++ b/lib/Target/Sparc/Makefile
@@ -14,7 +14,8 @@ TARGET = Sparc
 # Make sure that tblgen is run, first thing.
 BUILT_SOURCES = SparcGenRegisterInfo.inc SparcGenInstrInfo.inc \
 		SparcGenAsmWriter.inc SparcGenDAGISel.inc \
-		SparcGenSubtargetInfo.inc SparcGenCallingConv.inc
+		SparcGenSubtargetInfo.inc SparcGenCallingConv.inc \
+		SparcGenCodeEmitter.inc
 
 DIRS = TargetInfo MCTargetDesc
 
diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt
index b4991fe..34e68cf 100644
--- a/lib/Target/Sparc/README.txt
+++ b/lib/Target/Sparc/README.txt
@@ -38,7 +38,7 @@ t1:
 
 1) should be replaced with a brz in V9 mode.
 
-* Same as above, but emit conditional move on register zero (p192) in V9 
+* Same as above, but emit conditional move on register zero (p192) in V9
   mode.  Testcase:
 
 int %t1(int %a, int %b) {
@@ -47,13 +47,15 @@ int %t1(int %a, int %b) {
         ret int %D
 }
 
-* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling 
+* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling
   with the Y register, if they are faster.
 
 * Codegen bswap(load)/store(bswap) -> load/store ASI
 
-* Implement frame pointer elimination, e.g. eliminate save/restore for 
+* Implement frame pointer elimination, e.g. eliminate save/restore for
   leaf fns.
 * Fill delay slots
 
 * Implement JIT support
+
+* Use %g0 directly to materialize 0. No instruction is required.
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
index ce6ae17..f44b604 100644
--- a/lib/Target/Sparc/Sparc.h
+++ b/lib/Target/Sparc/Sparc.h
@@ -26,7 +26,8 @@ namespace llvm {
 
   FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
   FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
-  FunctionPass *createSparcFPMoverPass(TargetMachine &TM);
+  FunctionPass *createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+                                              JITCodeEmitter &JCE);
 
 } // end namespace llvm;
 
@@ -51,7 +52,7 @@ namespace llvm {
       ICC_NEG =  6   ,  // Negative
       ICC_VC  = 15   ,  // Overflow Clear
       ICC_VS  =  7   ,  // Overflow Set
-      
+
       //FCC_A   =  8+16,  // Always
       //FCC_N   =  0+16,  // Never
       FCC_U   =  7+16,  // Unordered
@@ -70,7 +71,7 @@ namespace llvm {
       FCC_O   = 15+16   // Ordered
     };
   }
-  
+
   inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
     switch (CC) {
     case SPCC::ICC_NE:  return "ne";
@@ -104,5 +105,22 @@ namespace llvm {
     }
     llvm_unreachable("Invalid cond code");
   }
+
+  inline static unsigned HI22(int64_t imm) {
+    return (unsigned)((imm >> 10) & ((1 << 22)-1));
+  }
+
+  inline static unsigned LO10(int64_t imm) {
+    return (unsigned)(imm & 0x3FF);
+  }
+
+  inline static unsigned HIX22(int64_t imm) {
+    return HI22(~imm);
+  }
+
+  inline static unsigned LOX10(int64_t imm) {
+    return ~LO10(~imm);
+  }
+
 }  // end namespace llvm
 #endif
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
index 611f8e8..0df48f6 100644
--- a/lib/Target/Sparc/Sparc.td
+++ b/lib/Target/Sparc/Sparc.td
@@ -19,7 +19,7 @@ include "llvm/Target/Target.td"
 //===----------------------------------------------------------------------===//
 // SPARC Subtarget features.
 //
- 
+
 def FeatureV9
   : SubtargetFeature<"v9", "IsV9", "true",
                      "Enable SPARC-V9 instructions">;
@@ -30,6 +30,10 @@ def FeatureVIS
   : SubtargetFeature<"vis", "IsVIS", "true",
                      "Enable UltraSPARC Visual Instruction Set extensions">;
 
+def FeatureHardQuad
+  : SubtargetFeature<"hard-quad-float", "HasHardQuad", "true",
+                     "Enable quad-word floating point instructions">;
+
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 108eb90..d06c894 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -43,6 +44,7 @@ namespace {
                          const char *Modifier = 0);
     void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
 
+    virtual void EmitFunctionBodyStart();
     virtual void EmitInstruction(const MachineInstr *MI) {
       SmallString<128> Str;
       raw_svector_ostream OS(Str);
@@ -60,16 +62,38 @@ namespace {
                                raw_ostream &O);
 
     bool printGetPCX(const MachineInstr *MI, unsigned OpNo, raw_ostream &OS);
-    
+
     virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
                        const;
+    void EmitGlobalRegisterDecl(unsigned reg) {
+      SmallString<128> Str;
+      raw_svector_ostream OS(Str);
+      OS << "\t.register "
+         << "%" << StringRef(getRegisterName(reg)).lower()
+         << ", "
+         << ((reg == SP::G6 || reg == SP::G7)? "#ignore" : "#scratch");
+      OutStreamer.EmitRawText(OS.str());
+    }
 
-    virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
   };
 } // end of anonymous namespace
 
 #include "SparcGenAsmWriter.inc"
 
+void SparcAsmPrinter::EmitFunctionBodyStart() {
+  if (!TM.getSubtarget<SparcSubtarget>().is64Bit())
+    return;
+
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const unsigned globalRegs[] = { SP::G2, SP::G3, SP::G6, SP::G7, 0 };
+  for (unsigned i = 0; globalRegs[i] != 0; ++i) {
+    unsigned reg = globalRegs[i];
+    if (MRI.use_empty(reg))
+      continue;
+    EmitGlobalRegisterDecl(reg);
+  }
+}
+
 void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
                                    raw_ostream &O) {
   const MachineOperand &MO = MI->getOperand (opNum);
@@ -81,11 +105,37 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
       assert(TF == SPII::MO_NO_FLAG &&
              "Cannot handle target flags on call address");
     else if (MI->getOpcode() == SP::SETHIi)
-      assert((TF == SPII::MO_HI || TF == SPII::MO_H44 || TF == SPII::MO_HH) &&
+      assert((TF == SPII::MO_HI || TF == SPII::MO_H44 || TF == SPII::MO_HH
+              || TF == SPII::MO_TLS_GD_HI22
+              || TF == SPII::MO_TLS_LDM_HI22
+              || TF == SPII::MO_TLS_LDO_HIX22
+              || TF == SPII::MO_TLS_IE_HI22
+              || TF == SPII::MO_TLS_LE_HIX22) &&
              "Invalid target flags for address operand on sethi");
+    else if (MI->getOpcode() == SP::TLS_CALL)
+      assert((TF == SPII::MO_NO_FLAG
+              || TF == SPII::MO_TLS_GD_CALL
+              || TF == SPII::MO_TLS_LDM_CALL) &&
+             "Cannot handle target flags on tls call address");
+    else if (MI->getOpcode() == SP::TLS_ADDrr)
+      assert((TF == SPII::MO_TLS_GD_ADD || TF == SPII::MO_TLS_LDM_ADD
+              || TF == SPII::MO_TLS_LDO_ADD || TF == SPII::MO_TLS_IE_ADD) &&
+             "Cannot handle target flags on add for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDrr)
+      assert(TF == SPII::MO_TLS_IE_LD &&
+             "Cannot handle target flags on ld for TLS");
+    else if (MI->getOpcode() == SP::TLS_LDXrr)
+      assert(TF == SPII::MO_TLS_IE_LDX &&
+             "Cannot handle target flags on ldx for TLS");
+    else if (MI->getOpcode() == SP::XORri)
+      assert((TF == SPII::MO_TLS_LDO_LOX10 || TF == SPII::MO_TLS_LE_LOX10) &&
+             "Cannot handle target flags on xor for TLS");
     else
-      assert((TF == SPII::MO_LO || TF == SPII::MO_M44 || TF == SPII::MO_L44 ||
-              TF == SPII::MO_HM) &&
+      assert((TF == SPII::MO_LO || TF == SPII::MO_M44 || TF == SPII::MO_L44
+              || TF == SPII::MO_HM
+              || TF == SPII::MO_TLS_GD_LO10
+              || TF == SPII::MO_TLS_LDM_LO10
+              || TF == SPII::MO_TLS_IE_LO10 ) &&
              "Invalid target flags for small address operand");
   }
 #endif
@@ -104,6 +154,24 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
   case SPII::MO_L44: O << "%l44("; break;
   case SPII::MO_HH:  O << "%hh(";  break;
   case SPII::MO_HM:  O << "%hm(";  break;
+  case SPII::MO_TLS_GD_HI22:   O << "%tgd_hi22(";   break;
+  case SPII::MO_TLS_GD_LO10:   O << "%tgd_lo10(";   break;
+  case SPII::MO_TLS_GD_ADD:    O << "%tgd_add(";    break;
+  case SPII::MO_TLS_GD_CALL:   O << "%tgd_call(";   break;
+  case SPII::MO_TLS_LDM_HI22:  O << "%tldm_hi22(";  break;
+  case SPII::MO_TLS_LDM_LO10:  O << "%tldm_lo10(";  break;
+  case SPII::MO_TLS_LDM_ADD:   O << "%tldm_add(";   break;
+  case SPII::MO_TLS_LDM_CALL:  O << "%tldm_call(";  break;
+  case SPII::MO_TLS_LDO_HIX22: O << "%tldo_hix22("; break;
+  case SPII::MO_TLS_LDO_LOX10: O << "%tldo_lox10("; break;
+  case SPII::MO_TLS_LDO_ADD:   O << "%tldo_add(";   break;
+  case SPII::MO_TLS_IE_HI22:   O << "%tie_hi22(";   break;
+  case SPII::MO_TLS_IE_LO10:   O << "%tie_lo10(";   break;
+  case SPII::MO_TLS_IE_LD:     O << "%tie_ld(";     break;
+  case SPII::MO_TLS_IE_LDX:    O << "%tie_ldx(";    break;
+  case SPII::MO_TLS_IE_ADD:    O << "%tie_add(";    break;
+  case SPII::MO_TLS_LE_HIX22:  O << "%tle_hix22(";  break;
+  case SPII::MO_TLS_LE_LOX10:  O << "%tle_lox10(";   break;
   }
 
   switch (MO.getType()) {
@@ -118,7 +186,10 @@ void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
     O << *MO.getMBB()->getSymbol();
     return;
   case MachineOperand::MO_GlobalAddress:
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_BlockAddress:
+    O <<  GetBlockAddressSymbol(MO.getBlockAddress())->getName();
     break;
   case MachineOperand::MO_ExternalSymbol:
     O << MO.getSymbolName();
@@ -164,7 +235,7 @@ bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum,
   case MachineOperand::MO_Register:
     assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
            "Operand is not a physical register ");
-    assert(MO.getReg() != SP::O7 && 
+    assert(MO.getReg() != SP::O7 &&
            "%o7 is assigned as destination for getpcx!");
     operand = "%" + StringRef(getRegisterName(MO.getReg())).lower();
     break;
@@ -177,15 +248,15 @@ bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum,
   O << "\tcall\t.LLGETPC" << mfNum << '_' << bbNum << '\n' ;
 
   O << "\t  sethi\t"
-    << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum 
+    << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
     << ")), "  << operand << '\n' ;
 
   O << ".LLGETPC" << mfNum << '_' << bbNum << ":\n" ;
-  O << "\tor\t" << operand  
+  O << "\tor\t" << operand
     << ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << mfNum << '_' << bbNum
     << ")), " << operand << '\n';
-  O << "\tadd\t" << operand << ", %o7, " << operand << '\n'; 
-  
+  O << "\tadd\t" << operand << ", %o7, " << operand << '\n';
+
   return true;
 }
 
@@ -243,19 +314,19 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
   // then nothing falls through to it.
   if (MBB->isLandingPad() || MBB->pred_empty())
     return false;
-  
+
   // If there isn't exactly one predecessor, it can't be a fall through.
   MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI;
   ++PI2;
   if (PI2 != MBB->pred_end())
     return false;
-  
+
   // The predecessor has to be immediately before this block.
   const MachineBasicBlock *Pred = *PI;
-  
+
   if (!Pred->isLayoutSuccessor(MBB))
     return false;
-  
+
   // Check if the last terminator is an unconditional branch.
   MachineBasicBlock::const_iterator I = Pred->end();
   while (I != Pred->begin() && !(--I)->isTerminator())
@@ -263,17 +334,8 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
   return I == Pred->end() || !I->isBarrier();
 }
 
-MachineLocation SparcAsmPrinter::
-getDebugValueLocation(const MachineInstr *MI) const {
-  assert(MI->getNumOperands() == 4 && "Invalid number of operands!");
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
-         "Unexpected MachineOperand types");
-  return MachineLocation(MI->getOperand(0).getReg(),
-                         MI->getOperand(1).getImm());
-}
-
 // Force static initialization.
-extern "C" void LLVMInitializeSparcAsmPrinter() { 
+extern "C" void LLVMInitializeSparcAsmPrinter() {
   RegisterAsmPrinter<SparcAsmPrinter> X(TheSparcTarget);
   RegisterAsmPrinter<SparcAsmPrinter> Y(TheSparcV9Target);
 }
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
index 54784e0..acd4ec2 100644
--- a/lib/Target/Sparc/SparcCallingConv.td
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -16,7 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 def CC_Sparc32 : CallingConv<[
-  //Custom assign SRet to [sp+64].
+  // Custom assign SRet to [sp+64].
   CCIfSRet<CCCustom<"CC_Sparc_Assign_SRet">>,
   // i32 f32 arguments get passed in integer registers if there is space.
   CCIfType<[i32, f32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
@@ -117,3 +117,14 @@ def CC_Sparc64 : CallingConv<[
   // arguments whether they are passed in registers or not.
   CCCustom<"CC_Sparc64_Full">
 ]>;
+
+// Callee-saved registers are handled by the register window mechanism.
+def CSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add (sequence "I%u", 0, 7),
+                            (sequence "L%u", 0, 7));
+}
+
+// Callee-saved registers for calls with ReturnsTwice attribute.
+def RTCSR : CalleeSavedRegs<(add)> {
+  let OtherPreserved = (add I6, I7);
+}
diff --git a/lib/Target/Sparc/SparcCodeEmitter.cpp b/lib/Target/Sparc/SparcCodeEmitter.cpp
new file mode 100644
index 0000000..9bfe31fe
--- /dev/null
+++ b/lib/Target/Sparc/SparcCodeEmitter.cpp
@@ -0,0 +1,245 @@
+//===-- Sparc/SparcCodeEmitter.cpp - Convert Sparc Code to Machine Code ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Sparc machine instructions
+// into relocatable machine code.
+//
+//===---------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "Sparc.h"
+#include "MCTargetDesc/SparcBaseInfo.h"
+#include "SparcRelocations.h"
+#include "SparcTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+class SparcCodeEmitter : public MachineFunctionPass {
+  SparcJITInfo *JTI;
+  const SparcInstrInfo *II;
+  const DataLayout *TD;
+  const SparcSubtarget *Subtarget;
+  TargetMachine &TM;
+  JITCodeEmitter &MCE;
+  const std::vector<MachineConstantPoolEntry> *MCPEs;
+  bool IsPIC;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<MachineModuleInfo> ();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+public:
+  SparcCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+    : MachineFunctionPass(ID), JTI(0), II(0), TD(0),
+      TM(tm), MCE(mce), MCPEs(0),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+  bool runOnMachineFunction(MachineFunction &MF);
+
+  virtual const char *getPassName() const {
+    return "Sparc Machine Code Emitter";
+  }
+
+  /// getBinaryCodeForInstr - This function, generated by the
+  /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+  /// machine instructions.
+  uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const;
+
+  void emitInstruction(MachineBasicBlock::instr_iterator MI,
+                       MachineBasicBlock &MBB);
+
+private:
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MachineInstr &MI,
+                             const MachineOperand &MO) const;
+
+  void emitWord(unsigned Word);
+
+  unsigned getRelocation(const MachineInstr &MI,
+                         const MachineOperand &MO) const;
+
+  void emitGlobalAddress(const GlobalValue *GV, unsigned Reloc) const;
+  void emitExternalSymbolAddress(const char *ES, unsigned Reloc) const;
+  void emitConstPoolAddress(unsigned CPI, unsigned Reloc) const;
+  void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc) const;
+};
+}  // end anonymous namespace.
+
+char SparcCodeEmitter::ID = 0;
+
+bool SparcCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  SparcTargetMachine &Target = static_cast<SparcTargetMachine &>(
+                                const_cast<TargetMachine &>(MF.getTarget()));
+
+  JTI = Target.getJITInfo();
+  II = Target.getInstrInfo();
+  TD = Target.getDataLayout();
+  Subtarget = &TM.getSubtarget<SparcSubtarget> ();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  JTI->Initialize(MF, IsPIC);
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
+
+  do {
+    DEBUG(errs() << "JITTing function '"
+        << MF.getName() << "'\n");
+    MCE.startFunction(MF);
+
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+        MBB != E; ++MBB){
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+           E = MBB->instr_end(); I != E;)
+        emitInstruction(*I++, *MBB);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void SparcCodeEmitter::emitInstruction(MachineBasicBlock::instr_iterator MI,
+                                      MachineBasicBlock &MBB) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << *MI);
+
+  MCE.processDebugLoc(MI->getDebugLoc(), true);
+
+  ++NumEmitted;
+
+  switch (MI->getOpcode()) {
+  default: {
+    emitWord(getBinaryCodeForInstr(*MI));
+    break;
+  }
+  case TargetOpcode::INLINEASM: {
+    // We allow inline assembler nodes with empty bodies - they can
+    // implicitly define registers, which is ok for JIT.
+    if (MI->getOperand(0).getSymbolName()[0]) {
+      report_fatal_error("JIT does not support inline asm!");
+    }
+    break;
+  }
+  case TargetOpcode::PROLOG_LABEL:
+  case TargetOpcode::EH_LABEL: {
+    MCE.emitLabel(MI->getOperand(0).getMCSymbol());
+    break;
+  }
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL: {
+    // Do nothing.
+    break;
+  }
+  case SP::GETPCX: {
+    report_fatal_error("JIT does not support pseudo instruction GETPCX yet!");
+    break;
+  }
+  }
+
+  MCE.processDebugLoc(MI->getDebugLoc(), false);
+}
+
+void SparcCodeEmitter::emitWord(unsigned Word) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Word) << "\n");
+  MCE.emitWordBE(Word);
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned SparcCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                             const MachineOperand &MO) const {
+  if (MO.isReg())
+    return TM.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO));
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
+  else if (MO.isCPI())
+    emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), getRelocation(MI, MO));
+  else
+    llvm_unreachable("Unable to encode MachineOperand!");
+  return 0;
+}
+unsigned SparcCodeEmitter::getRelocation(const MachineInstr &MI,
+                                         const MachineOperand &MO) const {
+
+  unsigned TF = MO.getTargetFlags();
+  switch (TF) {
+  default:
+  case SPII::MO_NO_FLAG: break;
+  case SPII::MO_LO: return SP::reloc_sparc_lo;
+  case SPII::MO_HI: return SP::reloc_sparc_hi;
+  case SPII::MO_H44:
+  case SPII::MO_M44:
+  case SPII::MO_L44:
+  case SPII::MO_HH:
+  case SPII::MO_HM: assert(0 && "FIXME: Implement Medium/Large code model.");
+  }
+
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default: break;
+  case SP::CALL:    return SP::reloc_sparc_pc30;
+  case SP::BA:
+  case SP::BCOND:
+  case SP::FBCOND:  return SP::reloc_sparc_pc22;
+  case SP::BPXCC:   return SP::reloc_sparc_pc19;
+  }
+  llvm_unreachable("unknown reloc!");
+}
+
+void SparcCodeEmitter::emitGlobalAddress(const GlobalValue *GV,
+                                        unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                             const_cast<GlobalValue *>(GV), 0,
+                                             true));
+}
+
+void SparcCodeEmitter::
+emitExternalSymbolAddress(const char *ES, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, 0, 0));
+}
+
+void SparcCodeEmitter::
+emitConstPoolAddress(unsigned CPI, unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, false));
+}
+
+void SparcCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                            unsigned Reloc) const {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB));
+}
+
+
+/// createSparcJITCodeEmitterPass - Return a pass that emits the collected Sparc
+/// code to the specified MCE object.
+FunctionPass *llvm::createSparcJITCodeEmitterPass(SparcTargetMachine &TM,
+                                                 JITCodeEmitter &JCE) {
+  return new SparcCodeEmitter(TM, JCE);
+}
+
+#include "SparcGenCodeEmitter.inc"
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 7874240..c75998a 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -26,7 +26,61 @@
 
 using namespace llvm;
 
+static cl::opt<bool>
+DisableLeafProc("disable-sparc-leaf-proc",
+                cl::init(false),
+                cl::desc("Disable Sparc leaf procedure optimization."),
+                cl::Hidden);
+
+
+void SparcFrameLowering::emitSPAdjustment(MachineFunction &MF,
+                                          MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MBBI,
+                                          int NumBytes,
+                                          unsigned ADDrr,
+                                          unsigned ADDri) const {
+
+  DebugLoc dl = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
+  const SparcInstrInfo &TII =
+    *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+
+  if (NumBytes >= -4096 && NumBytes < 4096) {
+    BuildMI(MBB, MBBI, dl, TII.get(ADDri), SP::O6)
+      .addReg(SP::O6).addImm(NumBytes);
+    return;
+  }
+
+  // Emit this the hard way.  This clobbers G1 which we always know is
+  // available here.
+  if (NumBytes >= 0) {
+    // Emit nonnegative numbers with sethi + or.
+    // sethi %hi(NumBytes), %g1
+    // or %g1, %lo(NumBytes), %g1
+    // add %sp, %g1, %sp
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+      .addReg(SP::G1).addImm(LO10(NumBytes));
+    BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+      .addReg(SP::O6).addReg(SP::G1);
+    return ;
+  }
+
+  // Emit negative numbers with sethi + xor.
+  // sethi %hix(NumBytes), %g1
+  // xor %g1, %lox(NumBytes), %g1
+  // add %sp, %g1, %sp
+  BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(NumBytes));
+  BuildMI(MBB, MBBI, dl, TII.get(ADDrr), SP::O6)
+    .addReg(SP::O6).addReg(SP::G1);
+}
+
 void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+
   MachineBasicBlock &MBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   const SparcInstrInfo &TII =
@@ -37,43 +91,36 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
   // Get the number of bytes to allocate from the FrameInfo
   int NumBytes = (int) MFI->getStackSize();
 
-  if (SubTarget.is64Bit()) {
-    // All 64-bit stack frames must be 16-byte aligned, and must reserve space
-    // for spilling the 16 window registers at %sp+BIAS..%sp+BIAS+128.
-    NumBytes += 128;
-    // Frames with calls must also reserve space for 6 outgoing arguments
-    // whether they are used or not. LowerCall_64 takes care of that.
-    assert(NumBytes % 16 == 0 && "Stack size not 16-byte aligned");
-  } else {
-    // Emit the correct save instruction based on the number of bytes in
-    // the frame. Minimum stack frame size according to V8 ABI is:
-    //   16 words for register window spill
-    //    1 word for address of returned aggregate-value
-    // +  6 words for passing parameters on the stack
-    // ----------
-    //   23 words * 4 bytes per word = 92 bytes
-    NumBytes += 92;
-
-    // Round up to next doubleword boundary -- a double-word boundary
-    // is required by the ABI.
-    NumBytes = RoundUpToAlignment(NumBytes, 8);
+  unsigned SAVEri = SP::SAVEri;
+  unsigned SAVErr = SP::SAVErr;
+  if (FuncInfo->isLeafProc()) {
+    if (NumBytes == 0)
+      return;
+    SAVEri = SP::ADDri;
+    SAVErr = SP::ADDrr;
   }
-  NumBytes = -NumBytes;
+  NumBytes = - SubTarget.getAdjustedFrameSize(NumBytes);
+  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SAVErr, SAVEri);
 
-  if (NumBytes >= -4096) {
-    BuildMI(MBB, MBBI, dl, TII.get(SP::SAVEri), SP::O6)
-      .addReg(SP::O6).addImm(NumBytes);
-  } else {
-    // Emit this the hard way.  This clobbers G1 which we always know is
-    // available here.
-    unsigned OffHi = (unsigned)NumBytes >> 10U;
-    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
-    // Emit G1 = G1 + I6
-    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
-      .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
-    BuildMI(MBB, MBBI, dl, TII.get(SP::SAVErr), SP::O6)
-      .addReg(SP::O6).addReg(SP::G1);
-  }
+  MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+  MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+  BuildMI(MBB, MBBI, dl, TII.get(SP::PROLOG_LABEL)).addSym(FrameLabel);
+
+  unsigned regFP = MRI->getDwarfRegNum(SP::I6, true);
+
+  // Emit ".cfi_def_cfa_register 30".
+  MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
+                                                          regFP));
+  // Emit ".cfi_window_save".
+  MMI.addFrameInst(MCCFIInstruction::createWindowSave(FrameLabel));
+
+  unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true);
+  unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true);
+  // Emit ".cfi_register 15, 31".
+  MMI.addFrameInst(MCCFIInstruction::createRegister(FrameLabel,
+                                                    regOutRA,
+                                                    regInRA));
 }
 
 void SparcFrameLowering::
@@ -81,15 +128,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I) const {
   if (!hasReservedCallFrame(MF)) {
     MachineInstr &MI = *I;
-    DebugLoc DL = MI.getDebugLoc();
     int Size = MI.getOperand(0).getImm();
     if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
       Size = -Size;
-    const SparcInstrInfo &TII =
-      *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
+
     if (Size)
-      BuildMI(MBB, I, DL, TII.get(SP::ADDri), SP::O6).addReg(SP::O6)
-        .addImm(Size);
+      emitSPAdjustment(MF, MBB, I, Size, SP::ADDrr, SP::ADDri);
   }
   MBB.erase(I);
 }
@@ -97,12 +141,112 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
 
 void SparcFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   const SparcInstrInfo &TII =
     *static_cast<const SparcInstrInfo*>(MF.getTarget().getInstrInfo());
   DebugLoc dl = MBBI->getDebugLoc();
   assert(MBBI->getOpcode() == SP::RETL &&
          "Can only put epilog before 'retl' instruction!");
-  BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
-    .addReg(SP::G0);
+  if (!FuncInfo->isLeafProc()) {
+    BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+      .addReg(SP::G0);
+    return;
+  }
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  int NumBytes = (int) MFI->getStackSize();
+  if (NumBytes == 0)
+    return;
+
+  NumBytes = SubTarget.getAdjustedFrameSize(NumBytes);
+  emitSPAdjustment(MF, MBB, MBBI, NumBytes, SP::ADDrr, SP::ADDri);
+}
+
+bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  // Reserve call frame if there are no variable sized objects on the stack.
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MF.getTarget().Options.DisableFramePointerElim(MF) ||
+    MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+
+static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
+{
+
+  for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
+    if (MRI->isPhysRegUsed(reg))
+      return false;
+
+  for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
+    if (MRI->isPhysRegUsed(reg))
+      return false;
+
+  return true;
+}
+
+bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
+{
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineFrameInfo    *MFI = MF.getFrameInfo();
+
+  return !(MFI->hasCalls()              // has calls
+           || MRI.isPhysRegUsed(SP::L0) // Too many registers needed
+           || MRI.isPhysRegUsed(SP::O6) // %SP is used
+           || hasFP(MF));               // need %FP
+}
+
+void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Remap %i[0-7] to %o[0-7].
+  for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+    if (!MRI.isPhysRegUsed(reg))
+      continue;
+    unsigned mapped_reg = (reg - SP::I0 + SP::O0);
+    assert(!MRI.isPhysRegUsed(mapped_reg));
+
+    // Replace I register with O register.
+    MRI.replaceRegWith(reg, mapped_reg);
+
+    // Mark the reg unused.
+    MRI.setPhysRegUnused(reg);
+  }
+
+  // Rewrite MBB's Live-ins.
+  for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+       MBB != E; ++MBB) {
+    for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
+      if (!MBB->isLiveIn(reg))
+        continue;
+      MBB->removeLiveIn(reg);
+      MBB->addLiveIn(reg - SP::I0 + SP::O0);
+    }
+  }
+
+  assert(verifyLeafProcRegUse(&MRI));
+#ifdef XDEBUG
+  MF.verify(0, "After LeafProc Remapping");
+#endif
+}
+
+void SparcFrameLowering::processFunctionBeforeCalleeSavedScan
+                  (MachineFunction &MF, RegScavenger *RS) const {
+
+  if (!DisableLeafProc && isLeafProc(MF)) {
+    SparcMachineFunctionInfo *MFI = MF.getInfo<SparcMachineFunctionInfo>();
+    MFI->setLeafProc(true);
+
+    remapRegsForLeafProc(MF);
+  }
+
 }
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index c375662..072fde3 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -38,7 +38,25 @@ public:
                                      MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I) const;
 
-  bool hasFP(const MachineFunction &MF) const { return false; }
+  bool hasReservedCallFrame(const MachineFunction &MF) const;
+  bool hasFP(const MachineFunction &MF) const;
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+
+private:
+  // Remap input registers to output registers for leaf procedure.
+  void remapRegsForLeafProc(MachineFunction &MF) const;
+
+  // Returns true if MF is a leaf procedure.
+  bool isLeafProc(MachineFunction &MF) const;
+
+
+  // Emits code for adjusting SP in function prologue/epilogue.
+  void emitSPAdjustment(MachineFunction &MF,
+                        MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        int NumBytes, unsigned ADDrr, unsigned ADDri) const;
+
 };
 
 } // End llvm namespace
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index a709685..b012bfd 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -33,7 +33,7 @@ class SparcDAGToDAGISel : public SelectionDAGISel {
   /// Subtarget - Keep a pointer to the Sparc Subtarget around so that we can
   /// make the right decision when generating code for different targets.
   const SparcSubtarget &Subtarget;
-  SparcTargetMachine& TM;
+  SparcTargetMachine &TM;
 public:
   explicit SparcDAGToDAGISel(SparcTargetMachine &tm)
     : SelectionDAGISel(tm),
@@ -67,18 +67,21 @@ private:
 
 SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
   unsigned GlobalBaseReg = TM.getInstrInfo()->getGlobalBaseReg(MF);
-  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+  return CurDAG->getRegister(GlobalBaseReg,
+                             getTargetLowering()->getPointerTy()).getNode();
 }
 
 bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
                                      SDValue &Base, SDValue &Offset) {
   if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
-    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), TLI.getPointerTy());
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
+                                       getTargetLowering()->getPointerTy());
     Offset = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
     return false;  // direct calls.
 
   if (Addr.getOpcode() == ISD::ADD) {
@@ -88,7 +91,7 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
                 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
           // Constant offset from frame ref.
           Base = CurDAG->getTargetFrameIndex(FIN->getIndex(),
-                                             TLI.getPointerTy());
+                                           getTargetLowering()->getPointerTy());
         } else {
           Base = Addr.getOperand(0);
         }
@@ -115,7 +118,8 @@ bool SparcDAGToDAGISel::SelectADDRri(SDValue Addr,
 bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
   if (Addr.getOpcode() == ISD::FrameIndex) return false;
   if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
-      Addr.getOpcode() == ISD::TargetGlobalAddress)
+      Addr.getOpcode() == ISD::TargetGlobalAddress ||
+      Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
     return false;  // direct calls.
 
   if (Addr.getOpcode() == ISD::ADD) {
@@ -131,14 +135,16 @@ bool SparcDAGToDAGISel::SelectADDRrr(SDValue Addr, SDValue &R1, SDValue &R2) {
   }
 
   R1 = Addr;
-  R2 = CurDAG->getRegister(SP::G0, TLI.getPointerTy());
+  R2 = CurDAG->getRegister(SP::G0, getTargetLowering()->getPointerTy());
   return true;
 }
 
 SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
-  if (N->isMachineOpcode())
+  SDLoc dl(N);
+  if (N->isMachineOpcode()) {
+    N->setNodeId(-1);
     return NULL;   // Already selected.
+  }
 
   switch (N->getOpcode()) {
   default: break;
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 3863e2c..64625f7 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -14,6 +14,7 @@
 
 #include "SparcISelLowering.h"
 #include "SparcMachineFunctionInfo.h"
+#include "SparcRegisterInfo.h"
 #include "SparcTargetMachine.h"
 #include "MCTargetDesc/SparcBaseInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
@@ -40,7 +41,7 @@ static bool CC_Sparc_Assign_SRet(unsigned &ValNo, MVT &ValVT,
 {
   assert (ArgFlags.isSRet());
 
-  //Assign SRet argument
+  // Assign SRet argument.
   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
                                          0,
                                          LocVT, LocInfo));
@@ -54,18 +55,18 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
   static const uint16_t RegList[] = {
     SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
   };
-  //Try to get first reg
+  // Try to get first reg.
   if (unsigned Reg = State.AllocateReg(RegList, 6)) {
     State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   } else {
-    //Assign whole thing in stack
+    // Assign whole thing in stack.
     State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
                                            State.AllocateStack(8,4),
                                            LocVT, LocInfo));
     return true;
   }
 
-  //Try to get second reg
+  // Try to get second reg.
   if (unsigned Reg = State.AllocateReg(RegList, 6))
     State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
   else
@@ -164,7 +165,7 @@ SparcTargetLowering::LowerReturn(SDValue Chain,
                                  CallingConv::ID CallConv, bool IsVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  const SmallVectorImpl<SDValue> &OutVals,
-                                 DebugLoc DL, SelectionDAG &DAG) const {
+                                 SDLoc DL, SelectionDAG &DAG) const {
   if (Subtarget->is64Bit())
     return LowerReturn_64(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
   return LowerReturn_32(Chain, CallConv, IsVarArg, Outs, OutVals, DL, DAG);
@@ -175,7 +176,7 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
                                     CallingConv::ID CallConv, bool IsVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
-                                    DebugLoc DL, SelectionDAG &DAG) const {
+                                    SDLoc DL, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
 
   // CCValAssign - represent the assignment of the return value to locations.
@@ -206,7 +207,7 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
-  unsigned RetAddrOffset = 8; //Call Inst + Delay Slot
+  unsigned RetAddrOffset = 8; // Call Inst + Delay Slot
   // If the function returns a struct, copy the SRetReturnReg to I0
   if (MF.getFunction()->hasStructRetAttr()) {
     SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
@@ -238,7 +239,7 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain,
                                     CallingConv::ID CallConv, bool IsVarArg,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
-                                    DebugLoc DL, SelectionDAG &DAG) const {
+                                    SDLoc DL, SelectionDAG &DAG) const {
   // CCValAssign - represent the assignment of the return value to locations.
   SmallVector<CCValAssign, 16> RVLocs;
 
@@ -314,7 +315,7 @@ LowerFormalArguments(SDValue Chain,
                      CallingConv::ID CallConv,
                      bool IsVarArg,
                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                     DebugLoc DL,
+                     SDLoc DL,
                      SelectionDAG &DAG,
                      SmallVectorImpl<SDValue> &InVals) const {
   if (Subtarget->is64Bit())
@@ -332,7 +333,7 @@ LowerFormalArguments_32(SDValue Chain,
                         CallingConv::ID CallConv,
                         bool isVarArg,
                         const SmallVectorImpl<ISD::InputArg> &Ins,
-                        DebugLoc dl,
+                        SDLoc dl,
                         SelectionDAG &DAG,
                         SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -351,7 +352,7 @@ LowerFormalArguments_32(SDValue Chain,
     CCValAssign &VA = ArgLocs[i];
 
     if (i == 0  && Ins[i].Flags.isSRet()) {
-      //Get SRet from [%fp+64]
+      // Get SRet from [%fp+64].
       int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, 64, true);
       SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
       SDValue Arg = DAG.getLoad(MVT::i32, dl, Chain, FIPtr,
@@ -410,7 +411,7 @@ LowerFormalArguments_32(SDValue Chain,
 
     if (VA.needsCustom()) {
       assert(VA.getValVT() == MVT::f64);
-      //If it is double-word aligned, just load.
+      // If it is double-word aligned, just load.
       if (Offset % 8 == 0) {
         int FI = MF.getFrameInfo()->CreateFixedObject(8,
                                                       Offset,
@@ -470,7 +471,7 @@ LowerFormalArguments_32(SDValue Chain,
   }
 
   if (MF.getFunction()->hasStructRetAttr()) {
-    //Copy the SRet Argument to SRetReturnReg
+    // Copy the SRet Argument to SRetReturnReg.
     SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
     unsigned Reg = SFI->getSRetReturnReg();
     if (!Reg) {
@@ -532,7 +533,7 @@ LowerFormalArguments_64(SDValue Chain,
                         CallingConv::ID CallConv,
                         bool IsVarArg,
                         const SmallVectorImpl<ISD::InputArg> &Ins,
-                        DebugLoc DL,
+                        SDLoc DL,
                         SelectionDAG &DAG,
                         SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -648,15 +649,36 @@ SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   return LowerCall_32(CLI, InVals);
 }
 
+static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
+                                     ImmutableCallSite *CS) {
+  if (CS)
+    return CS->hasFnAttr(Attribute::ReturnsTwice);
+
+  const Function *CalleeFn = 0;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    CalleeFn = dyn_cast<Function>(G->getGlobal());
+  } else if (ExternalSymbolSDNode *E =
+             dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const Function *Fn = DAG.getMachineFunction().getFunction();
+    const Module *M = Fn->getParent();
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+  }
+
+  if (!CalleeFn)
+    return false;
+  return CalleeFn->hasFnAttribute(Attribute::ReturnsTwice);
+}
+
 // Lower a call for the 32-bit ABI.
 SDValue
 SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
                                   SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
@@ -680,7 +702,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
 
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
 
-  //Create local copies for byval args.
+  // Create local copies for byval args.
   SmallVector<SDValue, 8> ByValArgs;
   for (unsigned i = 0,  e = Outs.size(); i != e; ++i) {
     ISD::ArgFlagsTy Flags = Outs[i].Flags;
@@ -696,13 +718,14 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
     SDValue SizeNode = DAG.getConstant(Size, MVT::i32);
 
     Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
-                          false,        //isVolatile,
-                          (Size <= 32), //AlwaysInline if size <= 32
+                          false,        // isVolatile,
+                          (Size <= 32), // AlwaysInline if size <= 32
                           MachinePointerInfo(), MachinePointerInfo());
     ByValArgs.push_back(FIPtr);
   }
 
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                               dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   SmallVector<SDValue, 8> MemOpChains;
@@ -718,7 +741,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
 
     ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
 
-    //Use local copy if it is a byval arg.
+    // Use local copy if it is a byval arg.
     if (Flags.isByVal())
       Arg = ByValArgs[byvalArgIdx++];
 
@@ -758,7 +781,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
 
       if (VA.isMemLoc()) {
         unsigned Offset = VA.getLocMemOffset() + StackOffset;
-        //if it is double-word aligned, just store.
+        // if it is double-word aligned, just store.
         if (Offset % 8 == 0) {
           SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
           SDValue PtrOff = DAG.getIntPtrConstant(Offset);
@@ -791,7 +814,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
         if (NextVA.isRegLoc()) {
           RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Lo));
         } else {
-          //Store the low part in stack.
+          // Store the low part in stack.
           unsigned Offset = NextVA.getLocMemOffset() + StackOffset;
           SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
           SDValue PtrOff = DAG.getIntPtrConstant(Offset);
@@ -860,6 +883,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   }
 
   unsigned SRetArgSize = (hasStructRetAttr)? getSRetArgSize(DAG, Callee):0;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
 
   // If the callee is a GlobalAddress node (quite common, every direct call is)
   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
@@ -879,6 +903,16 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
     Ops.push_back(DAG.getRegister(toCallerWindow(RegsToPass[i].first),
                                   RegsToPass[i].second.getValueType()));
+
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI =
+    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+  const uint32_t *Mask = ((hasReturnsTwice)
+                          ? TRI->getRTCallPreservedMask(CallConv)
+                          : TRI->getCallPreservedMask(CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
@@ -886,7 +920,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   InFlag = Chain.getValue(1);
 
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
-                             DAG.getIntPtrConstant(0, true), InFlag);
+                             DAG.getIntPtrConstant(0, true), InFlag, dl);
   InFlag = Chain.getValue(1);
 
   // Assign locations to each value returned by this call.
@@ -907,6 +941,23 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   return Chain;
 }
 
+// This functions returns true if CalleeName is a ABI function that returns
+// a long double (fp128).
+static bool isFP128ABICall(const char *CalleeName)
+{
+  static const char *const ABICalls[] =
+    {  "_Q_add", "_Q_sub", "_Q_mul", "_Q_div",
+       "_Q_sqrt", "_Q_neg",
+       "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq",
+       "_Q_lltoq", "_Q_ulltoq",
+       0
+    };
+  for (const char * const *I = ABICalls; *I != 0; ++I)
+    if (strcmp(CalleeName, *I) == 0)
+      return true;
+  return false;
+}
+
 unsigned
 SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
 {
@@ -917,7 +968,10 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
              dyn_cast<ExternalSymbolSDNode>(Callee)) {
     const Function *Fn = DAG.getMachineFunction().getFunction();
     const Module *M = Fn->getParent();
-    CalleeFn = M->getFunction(E->getSymbol());
+    const char *CalleeName = E->getSymbol();
+    CalleeFn = M->getFunction(CalleeName);
+    if (!CalleeFn && isFP128ABICall(CalleeName))
+      return 16; // Return sizeof(fp128)
   }
 
   if (!CalleeFn)
@@ -979,9 +1033,12 @@ SDValue
 SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
                                   SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
-  DebugLoc DL = CLI.DL;
+  SDLoc DL = CLI.DL;
   SDValue Chain = CLI.Chain;
 
+  // Sparc target does not yet support tail call optimization.
+  CLI.IsTailCall = false;
+
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
@@ -1004,7 +1061,8 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
   // Adjust the stack pointer to make room for the arguments.
   // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
   // with more than 6 arguments.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                               DL);
 
   // Collect the set of registers to pass to the function and their values.
   // This will be emitted as a sequence of CopyToReg nodes glued to the call
@@ -1097,6 +1155,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
   // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
   // Likewise ExternalSymbol -> TargetExternalSymbol.
   SDValue Callee = CLI.Callee;
+  bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
     Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, getPointerTy());
   else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
@@ -1110,6 +1169,15 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                   RegsToPass[i].second.getValueType()));
 
+  // Add a register mask operand representing the call-preserved registers.
+  const SparcRegisterInfo *TRI =
+    ((const SparcTargetMachine&)getTargetMachine()).getRegisterInfo();
+  const uint32_t *Mask = ((hasReturnsTwice)
+                          ? TRI->getRTCallPreservedMask(CLI.CallConv)
+                          : TRI->getCallPreservedMask(CLI.CallConv));
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   // Make sure the CopyToReg nodes are glued to the call instruction which
   // consumes the registers.
   if (InGlue.getNode())
@@ -1122,7 +1190,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
 
   // Revert the stack pointer immediately after the call.
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
-                             DAG.getIntPtrConstant(0, true), InGlue);
+                             DAG.getIntPtrConstant(0, true), InGlue, DL);
   InGlue = Chain.getValue(1);
 
   // Now extract the return values. This is more or less the same as
@@ -1242,20 +1310,27 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   addRegisterClass(MVT::i32, &SP::IntRegsRegClass);
   addRegisterClass(MVT::f32, &SP::FPRegsRegClass);
   addRegisterClass(MVT::f64, &SP::DFPRegsRegClass);
+  addRegisterClass(MVT::f128, &SP::QFPRegsRegClass);
   if (Subtarget->is64Bit())
     addRegisterClass(MVT::i64, &SP::I64RegsRegClass);
 
   // Turn FP extload into load/fextend
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+
   // Sparc doesn't have i1 sign extending load
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
   // Turn FP truncstore into trunc + store.
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
 
   // Custom legalize GlobalAddress nodes into LO/HI parts.
   setOperationAction(ISD::GlobalAddress, getPointerTy(), Custom);
   setOperationAction(ISD::GlobalTLSAddress, getPointerTy(), Custom);
   setOperationAction(ISD::ConstantPool, getPointerTy(), Custom);
+  setOperationAction(ISD::BlockAddress, getPointerTy(), Custom);
 
   // Sparc doesn't have sext_inreg, replace them with shl/sra
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
@@ -1268,13 +1343,25 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
   setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
 
+  // ... nor does SparcV9.
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UREM, MVT::i64, Expand);
+    setOperationAction(ISD::SREM, MVT::i64, Expand);
+    setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  }
+
   // Custom expand fp<->sint
   setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
   setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
 
-  // Expand fp<->uint
-  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+  // Custom Expand fp<->uint
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 
   setOperationAction(ISD::BITCAST, MVT::f32, Expand);
   setOperationAction(ISD::BITCAST, MVT::i32, Expand);
@@ -1283,9 +1370,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::SELECT, MVT::i32, Expand);
   setOperationAction(ISD::SELECT, MVT::f32, Expand);
   setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
   setOperationAction(ISD::SETCC, MVT::i32, Expand);
   setOperationAction(ISD::SETCC, MVT::f32, Expand);
   setOperationAction(ISD::SETCC, MVT::f64, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Expand);
 
   // Sparc doesn't have BRCOND either, it has BR_CC.
   setOperationAction(ISD::BRCOND, MVT::Other, Expand);
@@ -1294,20 +1384,51 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
 
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
 
   if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ADDC, MVT::i64, Custom);
+    setOperationAction(ISD::ADDE, MVT::i64, Custom);
+    setOperationAction(ISD::SUBC, MVT::i64, Custom);
+    setOperationAction(ISD::SUBE, MVT::i64, Custom);
+    setOperationAction(ISD::BITCAST, MVT::f64, Expand);
+    setOperationAction(ISD::BITCAST, MVT::i64, Expand);
+    setOperationAction(ISD::SELECT, MVT::i64, Expand);
+    setOperationAction(ISD::SETCC, MVT::i64, Expand);
     setOperationAction(ISD::BR_CC, MVT::i64, Custom);
     setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
+
+    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+    setOperationAction(ISD::CTTZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ , MVT::i64, Expand);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
+    setOperationAction(ISD::BSWAP, MVT::i64, Expand);
+    setOperationAction(ISD::ROTL , MVT::i64, Expand);
+    setOperationAction(ISD::ROTR , MVT::i64, Expand);
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
   }
 
   // FIXME: There are instructions available for ATOMIC_FENCE
   // on SparcV8 and later.
   setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
 
+  if (!Subtarget->isV9()) {
+    // SparcV8 does not have FNEGD and FABSD.
+    setOperationAction(ISD::FNEG, MVT::f64, Custom);
+    setOperationAction(ISD::FABS, MVT::f64, Custom);
+  }
+
+  setOperationAction(ISD::FSIN , MVT::f128, Expand);
+  setOperationAction(ISD::FCOS , MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FREM , MVT::f128, Expand);
+  setOperationAction(ISD::FMA  , MVT::f128, Expand);
   setOperationAction(ISD::FSIN , MVT::f64, Expand);
   setOperationAction(ISD::FCOS , MVT::f64, Expand);
   setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
@@ -1326,8 +1447,10 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::ROTL , MVT::i32, Expand);
   setOperationAction(ISD::ROTR , MVT::i32, Expand);
   setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f128, Expand);
   setOperationAction(ISD::FPOW , MVT::f64, Expand);
   setOperationAction(ISD::FPOW , MVT::f32, Expand);
 
@@ -1339,7 +1462,12 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
 
-  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU,     MVT::i64, Expand);
+    setOperationAction(ISD::MULHS,     MVT::i64, Expand);
+  }
 
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
@@ -1353,14 +1481,100 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
   setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
 
-  // No debug info support yet.
-  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  setExceptionPointerRegister(SP::I0);
+  setExceptionSelectorRegister(SP::I1);
 
   setStackPointerRegisterToSaveRestore(SP::O6);
 
-  if (TM.getSubtarget<SparcSubtarget>().isV9())
+  if (Subtarget->isV9())
     setOperationAction(ISD::CTPOP, MVT::i32, Legal);
 
+  if (Subtarget->isV9() && Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::LOAD, MVT::f128, Legal);
+    setOperationAction(ISD::STORE, MVT::f128, Legal);
+  } else {
+    setOperationAction(ISD::LOAD, MVT::f128, Custom);
+    setOperationAction(ISD::STORE, MVT::f128, Custom);
+  }
+
+  if (Subtarget->hasHardQuad()) {
+    setOperationAction(ISD::FADD,  MVT::f128, Legal);
+    setOperationAction(ISD::FSUB,  MVT::f128, Legal);
+    setOperationAction(ISD::FMUL,  MVT::f128, Legal);
+    setOperationAction(ISD::FDIV,  MVT::f128, Legal);
+    setOperationAction(ISD::FSQRT, MVT::f128, Legal);
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Legal);
+    if (Subtarget->isV9()) {
+      setOperationAction(ISD::FNEG, MVT::f128, Legal);
+      setOperationAction(ISD::FABS, MVT::f128, Legal);
+    } else {
+      setOperationAction(ISD::FNEG, MVT::f128, Custom);
+      setOperationAction(ISD::FABS, MVT::f128, Custom);
+    }
+
+    if (!Subtarget->is64Bit()) {
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+    }
+
+  } else {
+    // Custom legalize f128 operations.
+
+    setOperationAction(ISD::FADD,  MVT::f128, Custom);
+    setOperationAction(ISD::FSUB,  MVT::f128, Custom);
+    setOperationAction(ISD::FMUL,  MVT::f128, Custom);
+    setOperationAction(ISD::FDIV,  MVT::f128, Custom);
+    setOperationAction(ISD::FSQRT, MVT::f128, Custom);
+    setOperationAction(ISD::FNEG,  MVT::f128, Custom);
+    setOperationAction(ISD::FABS,  MVT::f128, Custom);
+
+    setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
+    setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
+
+    // Setup Runtime library names.
+    if (Subtarget->is64Bit()) {
+      setLibcallName(RTLIB::ADD_F128,  "_Qp_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Qp_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Qp_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Qp_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Qp_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Qp_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Qp_qtoui");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Qp_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Qp_uitoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Qp_qtox");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Qp_qtoux");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Qp_xtoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Qp_uxtoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Qp_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Qp_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Qp_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Qp_qtod");
+    } else {
+      setLibcallName(RTLIB::ADD_F128,  "_Q_add");
+      setLibcallName(RTLIB::SUB_F128,  "_Q_sub");
+      setLibcallName(RTLIB::MUL_F128,  "_Q_mul");
+      setLibcallName(RTLIB::DIV_F128,  "_Q_div");
+      setLibcallName(RTLIB::SQRT_F128, "_Q_sqrt");
+      setLibcallName(RTLIB::FPTOSINT_F128_I32, "_Q_qtoi");
+      setLibcallName(RTLIB::FPTOUINT_F128_I32, "_Q_qtou");
+      setLibcallName(RTLIB::SINTTOFP_I32_F128, "_Q_itoq");
+      setLibcallName(RTLIB::UINTTOFP_I32_F128, "_Q_utoq");
+      setLibcallName(RTLIB::FPTOSINT_F128_I64, "_Q_qtoll");
+      setLibcallName(RTLIB::FPTOUINT_F128_I64, "_Q_qtoull");
+      setLibcallName(RTLIB::SINTTOFP_I64_F128, "_Q_lltoq");
+      setLibcallName(RTLIB::UINTTOFP_I64_F128, "_Q_ulltoq");
+      setLibcallName(RTLIB::FPEXT_F32_F128, "_Q_stoq");
+      setLibcallName(RTLIB::FPEXT_F64_F128, "_Q_dtoq");
+      setLibcallName(RTLIB::FPROUND_F128_F32, "_Q_qtos");
+      setLibcallName(RTLIB::FPROUND_F128_F64, "_Q_qtod");
+    }
+  }
+
   setMinFunctionAlignment(2);
 
   computeRegisterProperties();
@@ -1381,21 +1595,33 @@ const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case SPISD::Lo:         return "SPISD::Lo";
   case SPISD::FTOI:       return "SPISD::FTOI";
   case SPISD::ITOF:       return "SPISD::ITOF";
+  case SPISD::FTOX:       return "SPISD::FTOX";
+  case SPISD::XTOF:       return "SPISD::XTOF";
   case SPISD::CALL:       return "SPISD::CALL";
   case SPISD::RET_FLAG:   return "SPISD::RET_FLAG";
   case SPISD::GLOBAL_BASE_REG: return "SPISD::GLOBAL_BASE_REG";
   case SPISD::FLUSHW:     return "SPISD::FLUSHW";
+  case SPISD::TLS_ADD:    return "SPISD::TLS_ADD";
+  case SPISD::TLS_LD:     return "SPISD::TLS_LD";
+  case SPISD::TLS_CALL:   return "SPISD::TLS_CALL";
   }
 }
 
+EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
 /// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
 /// be zero. Op is expected to be a target specific node. Used by DAG
 /// combiner.
-void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                         APInt &KnownZero,
-                                                         APInt &KnownOne,
-                                                         const SelectionDAG &DAG,
-                                                         unsigned Depth) const {
+void SparcTargetLowering::computeMaskedBitsForTargetNode
+                                (const SDValue Op,
+                                 APInt &KnownZero,
+                                 APInt &KnownOne,
+                                 const SelectionDAG &DAG,
+                                 unsigned Depth) const {
   APInt KnownZero2, KnownOne2;
   KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
 
@@ -1444,7 +1670,7 @@ SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
                                              SelectionDAG &DAG) const {
   if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
     return DAG.getTargetGlobalAddress(GA->getGlobal(),
-                                      GA->getDebugLoc(),
+                                      SDLoc(GA),
                                       GA->getValueType(0),
                                       GA->getOffset(), TF);
 
@@ -1454,6 +1680,12 @@ SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
                                      CP->getAlignment(),
                                      CP->getOffset(), TF);
 
+  if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
+    return DAG.getTargetBlockAddress(BA->getBlockAddress(),
+                                     Op.getValueType(),
+                                     0,
+                                     TF);
+
   if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
     return DAG.getTargetExternalSymbol(ES->getSymbol(),
                                        ES->getValueType(0), TF);
@@ -1466,7 +1698,7 @@ SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
 SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
                                           unsigned HiTF, unsigned LoTF,
                                           SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue Hi = DAG.getNode(SPISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
   SDValue Lo = DAG.getNode(SPISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
@@ -1476,7 +1708,7 @@ SDValue SparcTargetLowering::makeHiLoPair(SDValue Op,
 // Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
 // or ExternalSymbol SDNode.
 SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   EVT VT = getPointerTy();
 
   // Handle PIC mode first.
@@ -1485,6 +1717,10 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
     SDValue HiLo = makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
     SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, VT);
     SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, VT, GlobalBase, HiLo);
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setHasCalls(true);
     return DAG.getLoad(VT, DL, DAG.getEntryNode(), AbsAddr,
                        MachinePointerInfo::getGOT(), false, false, false, 0);
   }
@@ -1493,6 +1729,7 @@ SDValue SparcTargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
   switch(getTargetMachine().getCodeModel()) {
   default:
     llvm_unreachable("Unsupported absolute code model");
+  case CodeModel::JITDefault:
   case CodeModel::Small:
     // abs32.
     return makeHiLoPair(Op, SPII::MO_HI, SPII::MO_LO, DAG);
@@ -1524,29 +1761,441 @@ SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
   return makeAddress(Op, DAG);
 }
 
-static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
-  // Convert the fp value to integer in an FP register.
-  assert(Op.getValueType() == MVT::i32);
-  Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
-  return DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+SDValue SparcTargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  return makeAddress(Op, DAG);
+}
+
+SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  SDLoc DL(GA);
+  const GlobalValue *GV = GA->getGlobal();
+  EVT PtrVT = getPointerTy();
+
+  TLSModel::Model model = getTargetMachine().getTLSModel(GV);
+
+  if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) {
+    unsigned HiTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_HI22
+                     : SPII::MO_TLS_LDM_HI22);
+    unsigned LoTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_LO10
+                     : SPII::MO_TLS_LDM_LO10);
+    unsigned addTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_ADD
+                      : SPII::MO_TLS_LDM_ADD);
+    unsigned callTF = ((model == TLSModel::GeneralDynamic)? SPII::MO_TLS_GD_CALL
+                       : SPII::MO_TLS_LDM_CALL);
+
+    SDValue HiLo = makeHiLoPair(Op, HiTF, LoTF, DAG);
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+    SDValue Argument = DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Base, HiLo,
+                               withTargetFlags(Op, addTF, DAG));
+
+    SDValue Chain = DAG.getEntryNode();
+    SDValue InFlag;
+
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, true), DL);
+    Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag);
+    InFlag = Chain.getValue(1);
+    SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT);
+    SDValue Symbol = withTargetFlags(Op, callTF, DAG);
+
+    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+    Ops.push_back(Symbol);
+    Ops.push_back(DAG.getRegister(SP::O0, PtrVT));
+    const uint32_t *Mask = getTargetMachine()
+      .getRegisterInfo()->getCallPreservedMask(CallingConv::C);
+    assert(Mask && "Missing call preserved mask for calling convention");
+    Ops.push_back(DAG.getRegisterMask(Mask));
+    Ops.push_back(InFlag);
+    Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, &Ops[0], Ops.size());
+    InFlag = Chain.getValue(1);
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
+    SDValue Ret = DAG.getCopyFromReg(Chain, DL, SP::O0, PtrVT, InFlag);
+
+    if (model != TLSModel::LocalDynamic)
+      return Ret;
+
+    SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                             withTargetFlags(Op, SPII::MO_TLS_LDO_HIX22, DAG));
+    SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                             withTargetFlags(Op, SPII::MO_TLS_LDO_LOX10, DAG));
+    HiLo =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT, Ret, HiLo,
+                       withTargetFlags(Op, SPII::MO_TLS_LDO_ADD, DAG));
+  }
+
+  if (model == TLSModel::InitialExec) {
+    unsigned ldTF     = ((PtrVT == MVT::i64)? SPII::MO_TLS_IE_LDX
+                         : SPII::MO_TLS_IE_LD);
+
+    SDValue Base = DAG.getNode(SPISD::GLOBAL_BASE_REG, DL, PtrVT);
+
+    // GLOBAL_BASE_REG codegen'ed with call. Inform MFI that this
+    // function has calls.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    MFI->setHasCalls(true);
+
+    SDValue TGA = makeHiLoPair(Op,
+                               SPII::MO_TLS_IE_HI22, SPII::MO_TLS_IE_LO10, DAG);
+    SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base, TGA);
+    SDValue Offset = DAG.getNode(SPISD::TLS_LD,
+                                 DL, PtrVT, Ptr,
+                                 withTargetFlags(Op, ldTF, DAG));
+    return DAG.getNode(SPISD::TLS_ADD, DL, PtrVT,
+                       DAG.getRegister(SP::G7, PtrVT), Offset,
+                       withTargetFlags(Op, SPII::MO_TLS_IE_ADD, DAG));
+  }
+
+  assert(model == TLSModel::LocalExec);
+  SDValue Hi = DAG.getNode(SPISD::Hi, DL, PtrVT,
+                           withTargetFlags(Op, SPII::MO_TLS_LE_HIX22, DAG));
+  SDValue Lo = DAG.getNode(SPISD::Lo, DL, PtrVT,
+                           withTargetFlags(Op, SPII::MO_TLS_LE_LOX10, DAG));
+  SDValue Offset =  DAG.getNode(ISD::XOR, DL, PtrVT, Hi, Lo);
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getRegister(SP::G7, PtrVT), Offset);
+}
+
+SDValue
+SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+                                          SDValue Arg, SDLoc DL,
+                                          SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+  ArgListEntry Entry;
+  Entry.Node = Arg;
+  Entry.Ty   = ArgTy;
+
+  if (ArgTy->isFP128Ty()) {
+    // Create a stack object and pass the pointer to the library function.
+    int FI = MFI->CreateStackObject(16, 8, false);
+    SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy());
+    Chain = DAG.getStore(Chain,
+                         DL,
+                         Entry.Node,
+                         FIPtr,
+                         MachinePointerInfo(),
+                         false,
+                         false,
+                         8);
+
+    Entry.Node = FIPtr;
+    Entry.Ty   = PointerType::getUnqual(ArgTy);
+  }
+  Args.push_back(Entry);
+  return Chain;
+}
+
+SDValue
+SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                                 const char *LibFuncName,
+                                 unsigned numArgs) const {
+
+  ArgListTy Args;
+
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+
+  SDValue Callee = DAG.getExternalSymbol(LibFuncName, getPointerTy());
+  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
+  Type *RetTyABI = RetTy;
+  SDValue Chain = DAG.getEntryNode();
+  SDValue RetPtr;
+
+  if (RetTy->isFP128Ty()) {
+    // Create a Stack Object to receive the return value of type f128.
+    ArgListEntry Entry;
+    int RetFI = MFI->CreateStackObject(16, 8, false);
+    RetPtr = DAG.getFrameIndex(RetFI, getPointerTy());
+    Entry.Node = RetPtr;
+    Entry.Ty   = PointerType::getUnqual(RetTy);
+    if (!Subtarget->is64Bit())
+      Entry.isSRet = true;
+    Entry.isReturned = false;
+    Args.push_back(Entry);
+    RetTyABI = Type::getVoidTy(*DAG.getContext());
+  }
+
+  assert(Op->getNumOperands() >= numArgs && "Not enough operands!");
+  for (unsigned i = 0, e = numArgs; i != e; ++i) {
+    Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG);
+  }
+  TargetLowering::
+    CallLoweringInfo CLI(Chain,
+                         RetTyABI,
+                         false, false, false, false,
+                         0, CallingConv::C,
+                         false, false, true,
+                         Callee, Args, DAG, SDLoc(Op));
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // chain is in second result.
+  if (RetTyABI == RetTy)
+    return CallInfo.first;
+
+  assert (RetTy->isFP128Ty() && "Unexpected return type!");
+
+  Chain = CallInfo.second;
+
+  // Load RetPtr to get the return value.
+  return DAG.getLoad(Op.getValueType(),
+                     SDLoc(Op),
+                     Chain,
+                     RetPtr,
+                     MachinePointerInfo(),
+                     false, false, false, 8);
+}
+
+SDValue
+SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
+                                      unsigned &SPCC,
+                                      SDLoc DL,
+                                      SelectionDAG &DAG) const {
+
+  const char *LibCall = 0;
+  bool is64Bit = Subtarget->is64Bit();
+  switch(SPCC) {
+  default: llvm_unreachable("Unhandled conditional code!");
+  case SPCC::FCC_E  : LibCall = is64Bit? "_Qp_feq" : "_Q_feq"; break;
+  case SPCC::FCC_NE : LibCall = is64Bit? "_Qp_fne" : "_Q_fne"; break;
+  case SPCC::FCC_L  : LibCall = is64Bit? "_Qp_flt" : "_Q_flt"; break;
+  case SPCC::FCC_G  : LibCall = is64Bit? "_Qp_fgt" : "_Q_fgt"; break;
+  case SPCC::FCC_LE : LibCall = is64Bit? "_Qp_fle" : "_Q_fle"; break;
+  case SPCC::FCC_GE : LibCall = is64Bit? "_Qp_fge" : "_Q_fge"; break;
+  case SPCC::FCC_UL :
+  case SPCC::FCC_ULE:
+  case SPCC::FCC_UG :
+  case SPCC::FCC_UGE:
+  case SPCC::FCC_U  :
+  case SPCC::FCC_O  :
+  case SPCC::FCC_LG :
+  case SPCC::FCC_UE : LibCall = is64Bit? "_Qp_cmp" : "_Q_cmp"; break;
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(LibCall, getPointerTy());
+  Type *RetTy = Type::getInt32Ty(*DAG.getContext());
+  ArgListTy Args;
+  SDValue Chain = DAG.getEntryNode();
+  Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
+  Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);
+
+  TargetLowering::
+    CallLoweringInfo CLI(Chain,
+                         RetTy,
+                         false, false, false, false,
+                         0, CallingConv::C,
+                         false, false, true,
+                         Callee, Args, DAG, DL);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  // result is in first, and chain is in second result.
+  SDValue Result =  CallInfo.first;
+
+  switch(SPCC) {
+  default: {
+    SDValue RHS = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UL : {
+    SDValue Mask   = DAG.getTargetConstant(1, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_ULE: {
+    SDValue RHS = DAG.getTargetConstant(2, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UG :  {
+    SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+    SPCC = SPCC::ICC_G;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UGE: {
+    SDValue RHS = DAG.getTargetConstant(1, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+
+  case SPCC::FCC_U  :  {
+    SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_O  :  {
+    SDValue RHS = DAG.getTargetConstant(3, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_LG :  {
+    SDValue Mask   = DAG.getTargetConstant(3, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_NE;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  case SPCC::FCC_UE : {
+    SDValue Mask   = DAG.getTargetConstant(3, Result.getValueType());
+    Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
+    SDValue RHS    = DAG.getTargetConstant(0, Result.getValueType());
+    SPCC = SPCC::ICC_E;
+    return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
+  }
+  }
+}
+
+static SDValue
+LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
+                   const SparcTargetLowering &TLI) {
+
+  if (Op.getOperand(0).getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F64_F128), 1);
+
+  if (Op.getOperand(0).getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPEXT_F32_F128), 1);
+
+  llvm_unreachable("fpextend with non-float operand!");
+  return SDValue(0, 0);
 }
 
-static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
-  assert(Op.getOperand(0).getValueType() == MVT::i32);
-  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, MVT::f32, Op.getOperand(0));
-  // Convert the int value to FP in an FP register.
-  return DAG.getNode(SPISD::ITOF, dl, Op.getValueType(), Tmp);
+static SDValue
+LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
+                  const SparcTargetLowering &TLI) {
+  // FP_ROUND on f64 and f32 are legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128)
+    return Op;
+
+  if (Op.getValueType() == MVT::f64)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F64), 1);
+  if (Op.getValueType() == MVT::f32)
+    return TLI.LowerF128Op(Op, DAG,
+                           TLI.getLibcallName(RTLIB::FPROUND_F128_F32), 1);
+
+  llvm_unreachable("fpround to non-float!");
+  return SDValue(0, 0);
 }
 
-static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  // Expand f128 operations to fp128 abi calls.
+  if (Op.getOperand(0).getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(VT))) {
+    const char *libName = TLI.getLibcallName(VT == MVT::i32
+                                             ? RTLIB::FPTOSINT_F128_I32
+                                             : RTLIB::FPTOSINT_F128_I64);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the resulting type is illegal.
+  if (!TLI.isTypeLegal(VT))
+    return SDValue(0, 0);
+
+  // Otherwise, Convert the fp value to integer in an FP register.
+  if (VT == MVT::i32)
+    Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+  else
+    Op = DAG.getNode(SPISD::FTOX, dl, MVT::f64, Op.getOperand(0));
+
+  return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || (OpVT == MVT::i64));
+
+  EVT floatVT = (OpVT == MVT::i32) ? MVT::f32 : MVT::f64;
+
+  // Expand f128 operations to fp128 ABI calls.
+  if (Op.getValueType() == MVT::f128
+      && (!hasHardQuad || !TLI.isTypeLegal(OpVT))) {
+    const char *libName = TLI.getLibcallName(OpVT == MVT::i32
+                                             ? RTLIB::SINTTOFP_I32_F128
+                                             : RTLIB::SINTTOFP_I64_F128);
+    return TLI.LowerF128Op(Op, DAG, libName, 1);
+  }
+
+  // Expand if the operand type is illegal.
+  if (!TLI.isTypeLegal(OpVT))
+    return SDValue(0, 0);
+
+  // Otherwise, Convert the int value to FP in an FP register.
+  SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
+  unsigned opcode = (OpVT == MVT::i32)? SPISD::ITOF : SPISD::XTOF;
+  return DAG.getNode(opcode, dl, Op.getValueType(), Tmp);
+}
+
+static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the resulting type is legal.
+  if (Op.getOperand(0).getValueType() != MVT::f128 ||
+      (hasHardQuad && TLI.isTypeLegal(VT)))
+    return SDValue(0, 0);
+
+  assert(VT == MVT::i32 || VT == MVT::i64);
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(VT == MVT::i32
+                                            ? RTLIB::FPTOUINT_F128_I32
+                                            : RTLIB::FPTOUINT_F128_I64),
+                         1);
+}
+
+static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI,
+                               bool hasHardQuad) {
+  SDLoc dl(Op);
+  EVT OpVT = Op.getOperand(0).getValueType();
+  assert(OpVT == MVT::i32 || OpVT == MVT::i64);
+
+  // Expand if it does not involve f128 or the target has support for
+  // quad floating point instructions and the operand type is legal.
+  if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
+    return SDValue(0, 0);
+
+  return TLI.LowerF128Op(Op, DAG,
+                         TLI.getLibcallName(OpVT == MVT::i32
+                                            ? RTLIB::UINTTOFP_I32_F128
+                                            : RTLIB::UINTTOFP_I64_F128),
+                         1);
+}
+
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG,
+                          const SparcTargetLowering &TLI,
+                          bool hasHardQuad) {
   SDValue Chain = Op.getOperand(0);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
   SDValue LHS = Op.getOperand(2);
   SDValue RHS = Op.getOperand(3);
   SDValue Dest = Op.getOperand(4);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Opc, SPCC = ~0U;
 
   // If this is a br_cc of a "setcc", and if the setcc got lowered into
@@ -1556,28 +2205,34 @@ static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
   // Get the condition flag.
   SDValue CompareFlag;
   if (LHS.getValueType().isInteger()) {
-    EVT VTs[] = { LHS.getValueType(), MVT::Glue };
-    SDValue Ops[2] = { LHS, RHS };
-    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
     if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
     // 32-bit compares use the icc flags, 64-bit uses the xcc flags.
     Opc = LHS.getValueType() == MVT::i32 ? SPISD::BRICC : SPISD::BRXCC;
   } else {
-    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
-    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
-    Opc = SPISD::BRFCC;
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::BRICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      Opc = SPISD::BRFCC;
+    }
   }
   return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
                      DAG.getConstant(SPCC, MVT::i32), CompareFlag);
 }
 
-static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+                              const SparcTargetLowering &TLI,
+                              bool hasHardQuad) {
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   SDValue TrueVal = Op.getOperand(2);
   SDValue FalseVal = Op.getOperand(3);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Opc, SPCC = ~0U;
 
   // If this is a select_cc of a "setcc", and if the setcc got lowered into
@@ -1586,17 +2241,20 @@ static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
 
   SDValue CompareFlag;
   if (LHS.getValueType().isInteger()) {
-    // subcc returns a value
-    EVT VTs[] = { LHS.getValueType(), MVT::Glue };
-    SDValue Ops[2] = { LHS, RHS };
-    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, MVT::Glue, LHS, RHS);
     Opc = LHS.getValueType() == MVT::i32 ?
           SPISD::SELECT_ICC : SPISD::SELECT_XCC;
     if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
   } else {
-    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
-    Opc = SPISD::SELECT_FCC;
-    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    if (!hasHardQuad && LHS.getValueType() == MVT::f128) {
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+      CompareFlag = TLI.LowerF128Compare(LHS, RHS, SPCC, dl, DAG);
+      Opc = SPISD::SELECT_ICC;
+    } else {
+      CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Glue, LHS, RHS);
+      Opc = SPISD::SELECT_FCC;
+      if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    }
   }
   return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
                      DAG.getConstant(SPCC, MVT::i32), CompareFlag);
@@ -1607,9 +2265,12 @@ static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
   MachineFunction &MF = DAG.getMachineFunction();
   SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
 
+  // Need frame address to find the address of VarArgsFrameIndex.
+  MF.getFrameInfo()->setFrameAddressIsTaken(true);
+
   // vastart just stores the address of the VarArgsFrameIndex slot into the
   // memory location argument.
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue Offset =
     DAG.getNode(ISD::ADD, DL, TLI.getPointerTy(),
                 DAG.getRegister(SP::I6, TLI.getPointerTy()),
@@ -1626,7 +2287,7 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
   SDValue VAListPtr = Node->getOperand(1);
   EVT PtrVT = VAListPtr.getValueType();
   const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   SDValue VAList = DAG.getLoad(PtrVT, DL, InChain, VAListPtr,
                                MachinePointerInfo(SV), false, false, false, 0);
   // Increment the pointer, VAList, to the next vaarg.
@@ -1642,27 +2303,32 @@ static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
                      std::min(PtrVT.getSizeInBits(), VT.getSizeInBits())/8);
 }
 
-static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+                                       const SparcSubtarget *Subtarget) {
   SDValue Chain = Op.getOperand(0);  // Legalize the chain.
   SDValue Size  = Op.getOperand(1);  // Legalize the size.
-  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Size->getValueType(0);
+  SDLoc dl(Op);
 
   unsigned SPReg = SP::O6;
-  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
-  SDValue NewSP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size); // Value
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+  SDValue NewSP = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
   Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP);    // Output chain
 
   // The resultant pointer is actually 16 words from the bottom of the stack,
   // to provide a register spill area.
-  SDValue NewVal = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
-                                 DAG.getConstant(96, MVT::i32));
+  unsigned regSpillArea = Subtarget->is64Bit() ? 128 : 96;
+  regSpillArea += Subtarget->getStackPointerBias();
+
+  SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP,
+                               DAG.getConstant(regSpillArea, VT));
   SDValue Ops[2] = { NewVal, Chain };
   return DAG.getMergeValues(Ops, 2, dl);
 }
 
 
 static SDValue getFLUSHW(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Chain = DAG.getNode(SPISD::FLUSHW,
                               dl, MVT::Other, DAG.getEntryNode());
   return Chain;
@@ -1673,7 +2339,7 @@ static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
   MFI->setFrameAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned FrameReg = SP::I6;
 
   uint64_t depth = Op.getConstantOperandVal(0);
@@ -1699,20 +2365,25 @@ static SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
   return FrameAddr;
 }
 
-static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG,
+                               const SparcTargetLowering &TLI) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
   MFI->setReturnAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
-  unsigned RetReg = SP::I7;
-
+  SDLoc dl(Op);
   uint64_t depth = Op.getConstantOperandVal(0);
 
   SDValue RetAddr;
-  if (depth == 0)
+  if (depth == 0) {
+    unsigned RetReg = MF.addLiveIn(SP::I7,
+                                   TLI.getRegClassFor(TLI.getPointerTy()));
     RetAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, RetReg, VT);
-  else {
+  } else {
+    // Need frame address to find return address of the caller.
+    MFI->setFrameAddressIsTaken(true);
+
     // flush first to make sure the windowed registers' values are in stack
     SDValue Chain = getFLUSHW(Op, DAG);
     RetAddr = DAG.getCopyFromReg(Chain, dl, SP::I6, VT);
@@ -1731,23 +2402,272 @@ static SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
   return RetAddr;
 }
 
+static SDValue LowerF64Op(SDValue Op, SelectionDAG &DAG, unsigned opcode)
+{
+  SDLoc dl(Op);
+
+  assert(Op.getValueType() == MVT::f64 && "LowerF64Op called on non-double!");
+  assert(opcode == ISD::FNEG || opcode == ISD::FABS);
+
+  // Lower fneg/fabs on f64 to fneg/fabs on f32.
+  // fneg f64 => fneg f32:sub_even, fmov f32:sub_odd.
+  // fabs f64 => fabs f32:sub_even, fmov f32:sub_odd.
+
+  SDValue SrcReg64 = Op.getOperand(0);
+  SDValue Hi32 = DAG.getTargetExtractSubreg(SP::sub_even, dl, MVT::f32,
+                                            SrcReg64);
+  SDValue Lo32 = DAG.getTargetExtractSubreg(SP::sub_odd, dl, MVT::f32,
+                                            SrcReg64);
+
+  Hi32 = DAG.getNode(opcode, dl, MVT::f32, Hi32);
+
+  SDValue DstReg64 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                dl, MVT::f64), 0);
+  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_even, dl, MVT::f64,
+                                       DstReg64, Hi32);
+  DstReg64 = DAG.getTargetInsertSubreg(SP::sub_odd, dl, MVT::f64,
+                                       DstReg64, Lo32);
+  return DstReg64;
+}
+
+// Lower a f128 load into two f64 loads.
+static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
+{
+  SDLoc dl(Op);
+  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(Op.getNode());
+  assert(LdNode && LdNode->getOffset().getOpcode() == ISD::UNDEF
+         && "Unexpected node type");
+
+  unsigned alignment = LdNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue Hi64 = DAG.getLoad(MVT::f64,
+                             dl,
+                             LdNode->getChain(),
+                             LdNode->getBasePtr(),
+                             LdNode->getPointerInfo(),
+                             false, false, false, alignment);
+  EVT addrVT = LdNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              LdNode->getBasePtr(),
+                              DAG.getConstant(8, addrVT));
+  SDValue Lo64 = DAG.getLoad(MVT::f64,
+                             dl,
+                             LdNode->getChain(),
+                             LoPtr,
+                             LdNode->getPointerInfo(),
+                             false, false, false, alignment);
+
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+  SDNode *InFP128 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                       dl, MVT::f128);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Hi64,
+                               SubRegEven);
+  InFP128 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+                               MVT::f128,
+                               SDValue(InFP128, 0),
+                               Lo64,
+                               SubRegOdd);
+  SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
+                           SDValue(Lo64.getNode(), 1) };
+  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                                 &OutChains[0], 2);
+  SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+// Lower a f128 store into two f64 stores.
+static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  StoreSDNode *StNode = dyn_cast<StoreSDNode>(Op.getNode());
+  assert(StNode && StNode->getOffset().getOpcode() == ISD::UNDEF
+         && "Unexpected node type");
+  SDValue SubRegEven = DAG.getTargetConstant(SP::sub_even64, MVT::i32);
+  SDValue SubRegOdd  = DAG.getTargetConstant(SP::sub_odd64, MVT::i32);
+
+  SDNode *Hi64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegEven);
+  SDNode *Lo64 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                    dl,
+                                    MVT::f64,
+                                    StNode->getValue(),
+                                    SubRegOdd);
+
+  unsigned alignment = StNode->getAlignment();
+  if (alignment > 8)
+    alignment = 8;
+
+  SDValue OutChains[2];
+  OutChains[0] = DAG.getStore(StNode->getChain(),
+                              dl,
+                              SDValue(Hi64, 0),
+                              StNode->getBasePtr(),
+                              MachinePointerInfo(),
+                              false, false, alignment);
+  EVT addrVT = StNode->getBasePtr().getValueType();
+  SDValue LoPtr = DAG.getNode(ISD::ADD, dl, addrVT,
+                              StNode->getBasePtr(),
+                              DAG.getConstant(8, addrVT));
+  OutChains[1] = DAG.getStore(StNode->getChain(),
+                             dl,
+                             SDValue(Lo64, 0),
+                             LoPtr,
+                             MachinePointerInfo(),
+                             false, false, alignment);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &OutChains[0], 2);
+}
+
+static SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG,
+                         const SparcTargetLowering &TLI,
+                         bool is64Bit) {
+  if (Op.getValueType() == MVT::f64)
+    return LowerF64Op(Op, DAG, ISD::FNEG);
+  if (Op.getValueType() == MVT::f128)
+    return TLI.LowerF128Op(Op, DAG, ((is64Bit) ? "_Qp_neg" : "_Q_neg"), 1);
+  return Op;
+}
+
+static SDValue LowerFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
+  if (Op.getValueType() == MVT::f64)
+    return LowerF64Op(Op, DAG, ISD::FABS);
+  if (Op.getValueType() != MVT::f128)
+    return Op;
+
+  // Lower fabs on f128 to fabs on f64
+  // fabs f128 => fabs f64:sub_even64, fmov f64:sub_odd64
+
+  SDLoc dl(Op);
+  SDValue SrcReg128 = Op.getOperand(0);
+  SDValue Hi64 = DAG.getTargetExtractSubreg(SP::sub_even64, dl, MVT::f64,
+                                            SrcReg128);
+  SDValue Lo64 = DAG.getTargetExtractSubreg(SP::sub_odd64, dl, MVT::f64,
+                                            SrcReg128);
+  if (isV9)
+    Hi64 = DAG.getNode(Op.getOpcode(), dl, MVT::f64, Hi64);
+  else
+    Hi64 = LowerF64Op(Hi64, DAG, ISD::FABS);
+
+  SDValue DstReg128 = SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                                 dl, MVT::f128), 0);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_even64, dl, MVT::f128,
+                                        DstReg128, Hi64);
+  DstReg128 = DAG.getTargetInsertSubreg(SP::sub_odd64, dl, MVT::f128,
+                                        DstReg128, Lo64);
+  return DstReg128;
+}
+
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+
+  if (Op.getValueType() != MVT::i64)
+    return Op;
+
+  SDLoc dl(Op);
+  SDValue Src1 = Op.getOperand(0);
+  SDValue Src1Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1);
+  SDValue Src1Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src1,
+                               DAG.getConstant(32, MVT::i64));
+  Src1Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src1Hi);
+
+  SDValue Src2 = Op.getOperand(1);
+  SDValue Src2Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2);
+  SDValue Src2Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Src2,
+                               DAG.getConstant(32, MVT::i64));
+  Src2Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src2Hi);
+
+
+  bool hasChain = false;
+  unsigned hiOpc = Op.getOpcode();
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Invalid opcode");
+  case ISD::ADDC: hiOpc = ISD::ADDE; break;
+  case ISD::ADDE: hasChain = true; break;
+  case ISD::SUBC: hiOpc = ISD::SUBE; break;
+  case ISD::SUBE: hasChain = true; break;
+  }
+  SDValue Lo;
+  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Glue);
+  if (hasChain) {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo,
+                     Op.getOperand(2));
+  } else {
+    Lo = DAG.getNode(Op.getOpcode(), dl, VTs, Src1Lo, Src2Lo);
+  }
+  SDValue Hi = DAG.getNode(hiOpc, dl, VTs, Src1Hi, Src2Hi, Lo.getValue(1));
+  SDValue Carry = Hi.getValue(1);
+
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Lo);
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Hi);
+  Hi = DAG.getNode(ISD::SHL, dl, MVT::i64, Hi,
+                   DAG.getConstant(32, MVT::i64));
+
+  SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
+  SDValue Ops[2] = { Dst, Carry };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
 SDValue SparcTargetLowering::
 LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+
+  bool hasHardQuad = Subtarget->hasHardQuad();
+  bool is64Bit     = Subtarget->is64Bit();
+  bool isV9        = Subtarget->isV9();
+
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Should not custom lower this!");
-  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG, *this);
   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
-  case ISD::GlobalTLSAddress:
-    llvm_unreachable("TLS not implemented for Sparc.");
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
   case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
-  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
-  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
-  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
-  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG, *this,
+                                                       hasHardQuad);
+  case ISD::BR_CC:              return LowerBR_CC(Op, DAG, *this,
+                                                  hasHardQuad);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG, *this,
+                                                      hasHardQuad);
   case ISD::VASTART:            return LowerVASTART(Op, DAG, *this);
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
-  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG,
+                                                               Subtarget);
+
+  case ISD::LOAD:               return LowerF128Load(Op, DAG);
+  case ISD::STORE:              return LowerF128Store(Op, DAG);
+  case ISD::FADD:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::ADD_F128), 2);
+  case ISD::FSUB:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SUB_F128), 2);
+  case ISD::FMUL:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::MUL_F128), 2);
+  case ISD::FDIV:               return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::DIV_F128), 2);
+  case ISD::FSQRT:              return LowerF128Op(Op, DAG,
+                                       getLibcallName(RTLIB::SQRT_F128),1);
+  case ISD::FNEG:               return LowerFNEG(Op, DAG, *this, is64Bit);
+  case ISD::FABS:               return LowerFABS(Op, DAG, isV9);
+  case ISD::FP_EXTEND:          return LowerF128_FPEXTEND(Op, DAG, *this);
+  case ISD::FP_ROUND:           return LowerF128_FPROUND(Op, DAG, *this);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   }
 }
 
@@ -1764,11 +2684,13 @@ SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case SP::SELECT_CC_Int_ICC:
   case SP::SELECT_CC_FP_ICC:
   case SP::SELECT_CC_DFP_ICC:
+  case SP::SELECT_CC_QFP_ICC:
     BROpcode = SP::BCOND;
     break;
   case SP::SELECT_CC_Int_FCC:
   case SP::SELECT_CC_FP_FCC:
   case SP::SELECT_CC_DFP_FCC:
+  case SP::SELECT_CC_QFP_FCC:
     BROpcode = SP::FBCOND;
     break;
   }
@@ -1847,7 +2769,7 @@ SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
 
 std::pair<unsigned, const TargetRegisterClass*>
 SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                  EVT VT) const {
+                                                  MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     case 'r':
@@ -1863,3 +2785,50 @@ SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The Sparc target isn't yet aware of offsets.
   return false;
 }
+
+void SparcTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>& Results,
+                                             SelectionDAG &DAG) const {
+
+  SDLoc dl(N);
+
+  RTLIB::Libcall libCall = RTLIB::UNKNOWN_LIBCALL;
+
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getOperand(0).getValueType() != MVT::f128
+        || N->getValueType(0) != MVT::i64)
+      return;
+    libCall = ((N->getOpcode() == ISD::FP_TO_SINT)
+               ? RTLIB::FPTOSINT_F128_I64
+               : RTLIB::FPTOUINT_F128_I64);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    // Custom lower only if it involves f128 or i64.
+    if (N->getValueType(0) != MVT::f128
+        || N->getOperand(0).getValueType() != MVT::i64)
+      return;
+
+    libCall = ((N->getOpcode() == ISD::SINT_TO_FP)
+               ? RTLIB::SINTTOFP_I64_F128
+               : RTLIB::UINTTOFP_I64_F128);
+
+    Results.push_back(LowerF128Op(SDValue(N, 0),
+                                  DAG,
+                                  getLibcallName(libCall),
+                                  1));
+    return;
+  }
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index fd706be..2659fc8 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -37,11 +37,17 @@ namespace llvm {
 
       FTOI,        // FP to Int within a FP register.
       ITOF,        // Int to FP within a FP register.
+      FTOX,        // FP to Int64 within a FP register.
+      XTOF,        // Int64 to FP within a FP register.
 
       CALL,        // A call instruction.
       RET_FLAG,    // Return with a flag operand.
-      GLOBAL_BASE_REG, // Global base reg for PIC
-      FLUSHW       // FLUSH register windows to stack
+      GLOBAL_BASE_REG, // Global base reg for PIC.
+      FLUSHW,      // FLUSH register windows to stack.
+
+      TLS_ADD,     // For Thread Local Storage (TLS).
+      TLS_LD,
+      TLS_CALL
     };
   }
 
@@ -68,29 +74,32 @@ namespace llvm {
 
     ConstraintType getConstraintType(const std::string &Constraint) const;
     std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+    getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
 
     virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
     virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
                            bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerFormalArguments_32(SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerFormalArguments_64(SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const;
 
     virtual SDValue
@@ -106,26 +115,49 @@ namespace llvm {
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const;
     SDValue LowerReturn_32(SDValue Chain,
                            CallingConv::ID CallConv, bool IsVarArg,
                            const SmallVectorImpl<ISD::OutputArg> &Outs,
                            const SmallVectorImpl<SDValue> &OutVals,
-                           DebugLoc DL, SelectionDAG &DAG) const;
+                           SDLoc DL, SelectionDAG &DAG) const;
     SDValue LowerReturn_64(SDValue Chain,
                            CallingConv::ID CallConv, bool IsVarArg,
                            const SmallVectorImpl<ISD::OutputArg> &Outs,
                            const SmallVectorImpl<SDValue> &OutVals,
-                           DebugLoc DL, SelectionDAG &DAG) const;
+                           SDLoc DL, SelectionDAG &DAG) const;
 
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
 
     unsigned getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const;
     SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
     SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
                          SelectionDAG &DAG) const;
     SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+
+    SDValue LowerF128_LibCallArg(SDValue Chain, ArgListTy &Args,
+                                 SDValue Arg, SDLoc DL,
+                                 SelectionDAG &DAG) const;
+    SDValue LowerF128Op(SDValue Op, SelectionDAG &DAG,
+                        const char *LibFuncName,
+                        unsigned numArgs) const;
+    SDValue LowerF128Compare(SDValue LHS, SDValue RHS,
+                             unsigned &SPCC,
+                             SDLoc DL,
+                             SelectionDAG &DAG) const;
+
+    bool ShouldShrinkFPConstant(EVT VT) const {
+      // Do not shrink FP constpool if VT == MVT::f128.
+      // (ldd, call _Q_fdtoq) is more expensive than two ldds.
+      return VT != MVT::f128;
+    }
+
+    virtual void ReplaceNodeResults(SDNode *N,
+                                    SmallVectorImpl<SDValue>& Results,
+                                    SelectionDAG &DAG) const;
   };
 } // end namespace llvm
 
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index 91805f9..8656de5 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -59,10 +59,6 @@ defm SRAX : F3_S<"srax", 0b100111, 1, sra, i64, I64Regs>;
 // preferable to use a constant pool load instead, depending on the
 // microarchitecture.
 
-// The %g0 register is constant 0.
-// This is useful for stx %g0, [...], for example.
-def : Pat<(i64 0), (i64 G0)>, Requires<[Is64Bit]>;
-
 // Single-instruction patterns.
 
 // The ALU instructions want their simm13 operands as i32 immediates.
@@ -157,14 +153,10 @@ def : Pat<(xor i64:$a, (not i64:$b)), (XNORrr $a, $b)>;
 def : Pat<(add i64:$a, i64:$b), (ADDrr $a, $b)>;
 def : Pat<(sub i64:$a, i64:$b), (SUBrr $a, $b)>;
 
-// Add/sub with carry were renamed to addc/subc in SPARC v9.
-def : Pat<(adde i64:$a, i64:$b), (ADDXrr $a, $b)>;
-def : Pat<(sube i64:$a, i64:$b), (SUBXrr $a, $b)>;
-
-def : Pat<(addc i64:$a, i64:$b), (ADDCCrr $a, $b)>;
-def : Pat<(subc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
+def : Pat<(SPcmpicc i64:$a, i64:$b), (CMPrr $a, $b)>;
 
-def : Pat<(SPcmpicc i64:$a, i64:$b), (SUBCCrr $a, $b)>;
+def : Pat<(tlsadd i64:$a, i64:$b, tglobaltlsaddr:$sym),
+          (TLS_ADDrr $a, $b, $sym)>;
 
 // Register-immediate instructions.
 
@@ -175,7 +167,15 @@ def : Pat<(xor i64:$a, (i64 simm13:$b)), (XORri $a, (as_i32imm $b))>;
 def : Pat<(add i64:$a, (i64 simm13:$b)), (ADDri $a, (as_i32imm $b))>;
 def : Pat<(sub i64:$a, (i64 simm13:$b)), (SUBri $a, (as_i32imm $b))>;
 
-def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (SUBCCri $a, (as_i32imm $b))>;
+def : Pat<(SPcmpicc i64:$a, (i64 simm13:$b)), (CMPri $a, (as_i32imm $b))>;
+
+def : Pat<(ctpop i64:$src), (POPCrr $src)>;
+
+// "LEA" form of add
+def LEAX_ADDri : F3_2<2, 0b000000,
+                     (outs I64Regs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
 
 } // Predicates = [Is64Bit]
 
@@ -241,8 +241,19 @@ def LDXri  : F3_2<3, 0b001011,
                   (outs I64Regs:$dst), (ins MEMri:$addr),
                   "ldx [$addr], $dst",
                   [(set i64:$dst, (load ADDRri:$addr))]>;
+let mayLoad = 1 in
+  def TLS_LDXrr : F3_1<3, 0b001011,
+                       (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                       "ldx [$addr], $dst, $sym",
+                       [(set i64:$dst,
+                           (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
 
 // Extending loads to i64.
+def : Pat<(i64 (zextloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (zextloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
+def : Pat<(i64 (extloadi1 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
+
 def : Pat<(i64 (zextloadi8 ADDRrr:$addr)), (LDUBrr ADDRrr:$addr)>;
 def : Pat<(i64 (zextloadi8 ADDRri:$addr)), (LDUBri ADDRri:$addr)>;
 def : Pat<(i64 (extloadi8 ADDRrr:$addr)),  (LDUBrr ADDRrr:$addr)>;
@@ -290,6 +301,10 @@ def : Pat<(truncstorei16 i64:$src, ADDRri:$addr), (STHri ADDRri:$addr, $src)>;
 def : Pat<(truncstorei32 i64:$src, ADDRrr:$addr), (STrr  ADDRrr:$addr, $src)>;
 def : Pat<(truncstorei32 i64:$src, ADDRri:$addr), (STri  ADDRri:$addr, $src)>;
 
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i64 0), ADDRrr:$dst), (STXrr ADDRrr:$dst, (i64 G0))>;
+def : Pat<(store (i64 0), ADDRri:$dst), (STXri ADDRri:$dst, (i64 G0))>;
+
 } // Predicates = [Is64Bit]
 
 
@@ -307,9 +322,9 @@ def : Pat<(truncstorei32 i64:$src, ADDRri:$addr), (STri  ADDRri:$addr, $src)>;
 let Predicates = [Is64Bit] in {
 
 let Uses = [ICC] in
-def BPXCC : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
-                     "bp$cc %xcc, $dst",
-                     [(SPbrxcc bb:$dst, imm:$cc)]>;
+def BPXCC : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                     "b$cond %xcc, $imm22",
+                     [(SPbrxcc bb:$imm22, imm:$cond)]>;
 
 // Conditional moves on %xcc.
 let Uses = [ICC], Constraints = "$f = $rd" in {
@@ -322,12 +337,68 @@ def MOVXCCri : Pseudo<(outs IntRegs:$rd),
                       (ins i32imm:$i, IntRegs:$f, CCOp:$cond),
                       "mov$cond %xcc, $i, $rd",
                       [(set i32:$rd,
-                       (SPselecticc simm11:$i, i32:$f, imm:$cond))]>;
+                       (SPselectxcc simm11:$i, i32:$f, imm:$cond))]>;
+def FMOVS_XCC : Pseudo<(outs FPRegs:$rd),
+                      (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+                      "fmovs$cond %xcc, $rs2, $rd",
+                      [(set f32:$rd,
+                       (SPselectxcc f32:$rs2, f32:$f, imm:$cond))]>;
+def FMOVD_XCC : Pseudo<(outs DFPRegs:$rd),
+                      (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+                      "fmovd$cond %xcc, $rs2, $rd",
+                      [(set f64:$rd,
+                       (SPselectxcc f64:$rs2, f64:$f, imm:$cond))]>;
 } // Uses, Constraints
 
+//===----------------------------------------------------------------------===//
+// 64-bit Floating Point Conversions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [Is64Bit] in {
+
+def FXTOS : F3_3u<2, 0b110100, 0b010000100,
+                 (outs FPRegs:$dst), (ins DFPRegs:$src),
+                 "fxtos $src, $dst",
+                 [(set FPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+def FXTOD : F3_3u<2, 0b110100, 0b010001000,
+                 (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                 "fxtod $src, $dst",
+                 [(set DFPRegs:$dst, (SPxtof DFPRegs:$src))]>;
+def FXTOQ : F3_3u<2, 0b110100, 0b010001100,
+                 (outs QFPRegs:$dst), (ins DFPRegs:$src),
+                 "fxtoq $src, $dst",
+                 [(set QFPRegs:$dst, (SPxtof DFPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
+
+def FSTOX : F3_3u<2, 0b110100, 0b010000001,
+                 (outs DFPRegs:$dst), (ins FPRegs:$src),
+                 "fstox $src, $dst",
+                 [(set DFPRegs:$dst, (SPftox FPRegs:$src))]>;
+def FDTOX : F3_3u<2, 0b110100, 0b010000010,
+                 (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtox $src, $dst",
+                 [(set DFPRegs:$dst, (SPftox DFPRegs:$src))]>;
+def FQTOX : F3_3u<2, 0b110100, 0b010000011,
+                 (outs DFPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtox $src, $dst",
+                 [(set DFPRegs:$dst, (SPftox QFPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
+
+} // Predicates = [Is64Bit]
+
 def : Pat<(SPselectxcc i64:$t, i64:$f, imm:$cond),
           (MOVXCCrr $t, $f, imm:$cond)>;
 def : Pat<(SPselectxcc (i64 simm11:$t), i64:$f, imm:$cond),
           (MOVXCCri (as_i32imm $t), $f, imm:$cond)>;
 
+def : Pat<(SPselecticc i64:$t, i64:$f, imm:$cond),
+          (MOVICCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselecticc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVICCri (as_i32imm $t), $f, imm:$cond)>;
+
+def : Pat<(SPselectfcc i64:$t, i64:$f, imm:$cond),
+          (MOVFCCrr $t, $f, imm:$cond)>;
+def : Pat<(SPselectfcc (i64 simm11:$t), i64:$f, imm:$cond),
+          (MOVFCCri (as_i32imm $t), $f, imm:$cond)>;
+
 } // Predicates = [Is64Bit]
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
index e7fde08..afa2874 100644
--- a/lib/Target/Sparc/SparcInstrFormats.td
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -7,14 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction {
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern>
+          : Instruction {
   field bits<32> Inst;
 
   let Namespace = "SP";
 
   bits<2> op;
   let Inst{31-30} = op;               // Top two bits are the 'op' field
-  
+
   dag OutOperandList = outs;
   dag InOperandList = ins;
   let AsmString   = asmstr;
@@ -46,12 +47,11 @@ class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{29-25} = rd;
 }
 
-class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr, 
+class F2_2<bits<3> op2Val, dag outs, dag ins, string asmstr,
            list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
   bits<4>   cond;
   bit       annul = 0;     // currently unused
 
-  let cond        = condVal;
   let op2         = op2Val;
 
   let Inst{29}    = annul;
@@ -88,7 +88,7 @@ class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
   let Inst{4-0}  = rs2;
 }
 
-class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins, 
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
            string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
   bits<13> simm13;
 
@@ -111,6 +111,32 @@ class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
   let Inst{4-0}  = rs2;
 }
 
+// floating-point unary operations.
+class F3_3u<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+  let rs1        = 0;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+// floating-point compares.
+class F3_3c<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+  let rd         = 0;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
 // Shift by register rs2.
 class F3_Sr<bits<2> opVal, bits<6> op3val, bit xVal, dag outs, dag ins,
             string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
@@ -149,3 +175,59 @@ multiclass F3_S<string OpcStr, bits<6> Op3Val, bit XVal, SDNode OpNode,
                  !strconcat(OpcStr, " $rs, $shcnt, $rd"),
                  [(set VT:$rd, (OpNode VT:$rs, (i32 imm:$shcnt)))]>;
 }
+
+class F4<bits<6> op3, dag outs, dag ins, string asmstr, list<dag> pattern>
+      : InstSP<outs, ins, asmstr, pattern> {
+  bits<5> rd;
+
+  let op          = 2;
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+}
+
+
+class F4_1<bits<6> op3, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+
+  bits<3> cc;
+  bits<4> cond;
+  bits<5> rs2;
+
+  let Inst{4-0}   = rs2;
+  let Inst{11}    = cc{0};
+  let Inst{12}    = cc{1};
+  let Inst{13}    = 0;
+  let Inst{17-14} = cond;
+  let Inst{18}    = cc{2};
+
+}
+
+class F4_2<bits<6> op3, dag outs, dag ins,
+            string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+  bits<3>  cc;
+  bits<4>  cond;
+  bits<11> simm11;
+
+  let Inst{10-0}  = simm11;
+  let Inst{11}    = cc{0};
+  let Inst{12}    = cc{1};
+  let Inst{13}    = 1;
+  let Inst{17-14} = cond;
+  let Inst{18}    = cc{2};
+}
+
+class F4_3<bits<6> op3, bits<6> opf_low, dag outs, dag ins,
+           string asmstr, list<dag> pattern>
+      : F4<op3, outs, ins, asmstr, pattern> {
+  bits<4> cond;
+  bits<3> opf_cc;
+  bits<5> rs2;
+
+  let Inst{18}     = 0;
+  let Inst{17-14}  = cond;
+  let Inst{13-11}  = opf_cc;
+  let Inst{10-5}   = opf_low;
+  let Inst{4-0}    = rs2;
+}
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index 39d7329..c10b5b3 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -17,19 +17,25 @@
 #include "SparcSubtarget.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "SparcGenInstrInfo.inc"
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void SparcInstrInfo::anchor() {}
+
 SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
   : SparcGenInstrInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
-    RI(ST, *this), Subtarget(ST) {
+    RI(ST), Subtarget(ST) {
 }
 
 /// isLoadFromStackSlot - If the specified machine instruction is a direct
@@ -40,8 +46,10 @@ SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
 unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
                                              int &FrameIndex) const {
   if (MI->getOpcode() == SP::LDri ||
+      MI->getOpcode() == SP::LDXri ||
       MI->getOpcode() == SP::LDFri ||
-      MI->getOpcode() == SP::LDDFri) {
+      MI->getOpcode() == SP::LDDFri ||
+      MI->getOpcode() == SP::LDQFri) {
     if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
         MI->getOperand(2).getImm() == 0) {
       FrameIndex = MI->getOperand(1).getIndex();
@@ -59,8 +67,10 @@ unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
 unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
                                             int &FrameIndex) const {
   if (MI->getOpcode() == SP::STri ||
+      MI->getOpcode() == SP::STXri ||
       MI->getOpcode() == SP::STFri ||
-      MI->getOpcode() == SP::STDFri) {
+      MI->getOpcode() == SP::STDFri ||
+      MI->getOpcode() == SP::STQFri) {
     if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
         MI->getOperand(1).getImm() == 0) {
       FrameIndex = MI->getOperand(0).getIndex();
@@ -96,14 +106,14 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
 
   case SPCC::FCC_U:    return SPCC::FCC_O;
   case SPCC::FCC_O:    return SPCC::FCC_U;
-  case SPCC::FCC_G:    return SPCC::FCC_LE;
-  case SPCC::FCC_LE:   return SPCC::FCC_G;
-  case SPCC::FCC_UG:   return SPCC::FCC_ULE;
-  case SPCC::FCC_ULE:  return SPCC::FCC_UG;
-  case SPCC::FCC_L:    return SPCC::FCC_GE;
-  case SPCC::FCC_GE:   return SPCC::FCC_L;
-  case SPCC::FCC_UL:   return SPCC::FCC_UGE;
-  case SPCC::FCC_UGE:  return SPCC::FCC_UL;
+  case SPCC::FCC_G:    return SPCC::FCC_ULE;
+  case SPCC::FCC_LE:   return SPCC::FCC_UG;
+  case SPCC::FCC_UG:   return SPCC::FCC_LE;
+  case SPCC::FCC_ULE:  return SPCC::FCC_G;
+  case SPCC::FCC_L:    return SPCC::FCC_UGE;
+  case SPCC::FCC_GE:   return SPCC::FCC_UL;
+  case SPCC::FCC_UL:   return SPCC::FCC_GE;
+  case SPCC::FCC_UGE:  return SPCC::FCC_L;
   case SPCC::FCC_LG:   return SPCC::FCC_UE;
   case SPCC::FCC_UE:   return SPCC::FCC_LG;
   case SPCC::FCC_NE:   return SPCC::FCC_E;
@@ -112,18 +122,6 @@ static SPCC::CondCodes GetOppositeBranchCondition(SPCC::CondCodes CC)
   llvm_unreachable("Invalid cond code");
 }
 
-MachineInstr *
-SparcInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                         int FrameIx,
-                                         uint64_t Offset,
-                                         const MDNode *MDPtr,
-                                         DebugLoc dl) const {
-  MachineInstrBuilder MIB = BuildMI(MF, dl, get(SP::DBG_VALUE))
-    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
-
 bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                    MachineBasicBlock *&TBB,
                                    MachineBasicBlock *&FBB,
@@ -139,15 +137,15 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
     if (I->isDebugValue())
       continue;
 
-    //When we see a non-terminator, we are done
+    // When we see a non-terminator, we are done.
     if (!isUnpredicatedTerminator(I))
       break;
 
-    //Terminator is not a branch
+    // Terminator is not a branch.
     if (!I->isBranch())
       return true;
 
-    //Handle Unconditional branches
+    // Handle Unconditional branches.
     if (I->getOpcode() == SP::BA) {
       UnCondBrIter = I;
 
@@ -176,7 +174,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
     unsigned Opcode = I->getOpcode();
     if (Opcode != SP::BCOND && Opcode != SP::FBCOND)
-      return true; //Unknown Opcode
+      return true; // Unknown Opcode.
 
     SPCC::CondCodes BranchCode = (SPCC::CondCodes)I->getOperand(1).getImm();
 
@@ -185,7 +183,7 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       if (AllowModify && UnCondBrIter != MBB.end() &&
           MBB.isLayoutSuccessor(TargetBB)) {
 
-        //Transform the code
+        // Transform the code
         //
         //    brCC L1
         //    ba L2
@@ -219,8 +217,8 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       Cond.push_back(MachineOperand::CreateImm(BranchCode));
       continue;
     }
-    //FIXME: Handle subsequent conditional branches
-    //For now, we can't handle multiple conditional branches
+    // FIXME: Handle subsequent conditional branches.
+    // For now, we can't handle multiple conditional branches.
     return true;
   }
   return false;
@@ -241,7 +239,7 @@ SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
     return 1;
   }
 
-  //Conditional branch
+  // Conditional branch
   unsigned CC = Cond[0].getImm();
 
   if (IsIntegerCC(CC))
@@ -281,17 +279,69 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I, DebugLoc DL,
                                  unsigned DestReg, unsigned SrcReg,
                                  bool KillSrc) const {
+  unsigned numSubRegs = 0;
+  unsigned movOpc     = 0;
+  const unsigned *subRegIdx = 0;
+
+  const unsigned DFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd };
+  const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
+  const unsigned QFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd,
+                                          SP::sub_odd64_then_sub_even,
+                                          SP::sub_odd64_then_sub_odd };
+
   if (SP::IntRegsRegClass.contains(DestReg, SrcReg))
     BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0)
       .addReg(SrcReg, getKillRegState(KillSrc));
   else if (SP::FPRegsRegClass.contains(DestReg, SrcReg))
     BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg)
       .addReg(SrcReg, getKillRegState(KillSrc));
-  else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg))
-    BuildMI(MBB, I, DL, get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD), DestReg)
-      .addReg(SrcReg, getKillRegState(KillSrc));
-  else
+  else if (SP::DFPRegsRegClass.contains(DestReg, SrcReg)) {
+    if (Subtarget.isV9()) {
+      BuildMI(MBB, I, DL, get(SP::FMOVD), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      // Use two FMOVS instructions.
+      subRegIdx  = DFP_FP_SubRegsIdx;
+      numSubRegs = 2;
+      movOpc     = SP::FMOVS;
+    }
+  } else if (SP::QFPRegsRegClass.contains(DestReg, SrcReg)) {
+    if (Subtarget.isV9()) {
+      if (Subtarget.hasHardQuad()) {
+        BuildMI(MBB, I, DL, get(SP::FMOVQ), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+      } else {
+        // Use two FMOVD instructions.
+        subRegIdx  = QFP_DFP_SubRegsIdx;
+        numSubRegs = 2;
+        movOpc     = SP::FMOVD;
+      }
+    } else {
+      // Use four FMOVS instructions.
+      subRegIdx  = QFP_FP_SubRegsIdx;
+      numSubRegs = 4;
+      movOpc     = SP::FMOVS;
+    }
+  } else
     llvm_unreachable("Impossible reg-to-reg copy");
+
+  if (numSubRegs == 0 || subRegIdx == 0 || movOpc == 0)
+    return;
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  MachineInstr *MovMI = 0;
+
+  for (unsigned i = 0; i != numSubRegs; ++i) {
+    unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
+    unsigned Src = TRI->getSubReg(SrcReg,  subRegIdx[i]);
+    assert(Dst && Src && "Bad sub-register");
+
+    MovMI = BuildMI(MBB, I, DL, get(movOpc), Dst).addReg(Src);
+  }
+  // Add implicit super-register defs and kills to the last MovMI.
+  MovMI->addRegisterDefined(DestReg, TRI);
+  if (KillSrc)
+    MovMI->addRegisterKilled(SrcReg, TRI);
 }
 
 void SparcInstrInfo::
@@ -302,16 +352,32 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
 
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = *MF->getFrameInfo();
+  MachineMemOperand *MMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOStore,
+                             MFI.getObjectSize(FI),
+                             MFI.getObjectAlignment(FI));
+
   // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
-  if (RC == &SP::IntRegsRegClass)
+ if (RC == &SP::I64RegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::STXri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &SP::IntRegsRegClass)
     BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
-      .addReg(SrcReg, getKillRegState(isKill));
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
   else if (RC == &SP::FPRegsRegClass)
     BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
-      .addReg(SrcReg,  getKillRegState(isKill));
-  else if (RC == &SP::DFPRegsRegClass)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
     BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
-      .addReg(SrcReg,  getKillRegState(isKill));
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use STQFri irrespective of its legality. If STQ is not legal, it will be
+    // lowered into two STDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::STQFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill)).addMemOperand(MMO);
   else
     llvm_unreachable("Can't store this register to stack slot");
 }
@@ -324,12 +390,31 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
   DebugLoc DL;
   if (I != MBB.end()) DL = I->getDebugLoc();
 
-  if (RC == &SP::IntRegsRegClass)
-    BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0);
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = *MF->getFrameInfo();
+  MachineMemOperand *MMO =
+    MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI),
+                             MachineMemOperand::MOLoad,
+                             MFI.getObjectSize(FI),
+                             MFI.getObjectAlignment(FI));
+
+  if (RC == &SP::I64RegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (RC == &SP::IntRegsRegClass)
+    BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
   else if (RC == &SP::FPRegsRegClass)
-    BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0);
-  else if (RC == &SP::DFPRegsRegClass)
-    BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0);
+    BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (SP::DFPRegsRegClass.hasSubClassEq(RC))
+    BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
+  else if (SP::QFPRegsRegClass.hasSubClassEq(RC))
+    // Use LDQFri irrespective of its legality. If LDQ is not legal, it will be
+    // lowered into two LDDs in eliminateFrameIndex.
+    BuildMI(MBB, I, DL, get(SP::LDQFri), DestReg).addFrameIndex(FI).addImm(0)
+      .addMemOperand(MMO);
   else
     llvm_unreachable("Can't load this register from stack slot");
 }
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index 204f698..a86cbcb 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -37,6 +37,7 @@ namespace SPII {
 class SparcInstrInfo : public SparcGenInstrInfo {
   const SparcRegisterInfo RI;
   const SparcSubtarget& Subtarget;
+  virtual void anchor();
 public:
   explicit SparcInstrInfo(SparcSubtarget &ST);
 
@@ -53,7 +54,7 @@ public:
   /// any side effects other than loading from the stack slot.
   virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
                                        int &FrameIndex) const;
-  
+
   /// isStoreToStackSlot - If the specified machine instruction is a direct
   /// store to a stack slot, return the virtual or physical register number of
   /// the source reg along with the FrameIndex of the loaded stack slot.  If
@@ -62,14 +63,6 @@ public:
   virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
                                       int &FrameIndex) const;
 
-  /// emitFrameIndexDebugValue - Emit a target-dependent form of
-  /// DBG_VALUE encoding the address of a frame index.
-  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
-                                                 int FrameIx,
-                                                 uint64_t Offset,
-                                                 const MDNode *MDPtr,
-                                                 DebugLoc dl) const;
-
   virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                              MachineBasicBlock *&FBB,
                              SmallVectorImpl<MachineOperand> &Cond,
@@ -86,7 +79,7 @@ public:
                            MachineBasicBlock::iterator I, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const;
-  
+
   virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    unsigned SrcReg, bool isKill, int FrameIndex,
@@ -98,7 +91,7 @@ public:
                                     unsigned DestReg, int FrameIndex,
                                     const TargetRegisterClass *RC,
                                     const TargetRegisterInfo *TRI) const;
-  
+
   unsigned getGlobalBaseReg(MachineFunction *MF) const;
 };
 
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index baefb06..ef7a114 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -39,6 +39,10 @@ def HasNoV9 : Predicate<"!Subtarget.isV9()">;
 // HasVIS - This is true when the target processor has VIS extensions.
 def HasVIS : Predicate<"Subtarget.isVIS()">;
 
+// HasHardQuad - This is true when the target processor supports quad floating
+// point instructions.
+def HasHardQuad : Predicate<"Subtarget.hasHardQuad()">;
+
 // UseDeprecatedInsts - This predicate is true when the target processor is a
 // V8, or when it is V9 but the V8 deprecated instructions are efficient enough
 // to use when appropriate.  In either of these cases, the instruction selector
@@ -81,6 +85,8 @@ def MEMri : Operand<iPTR> {
   let MIOperandInfo = (ops ptr_rc, i32imm);
 }
 
+def TLSSym : Operand<iPTR>;
+
 // Branch targets have OtherVT type.
 def brtarget : Operand<OtherVT>;
 def calltarget : Operand<i32>;
@@ -89,9 +95,11 @@ def calltarget : Operand<i32>;
 let PrintMethod = "printCCOperand" in
   def CCOp : Operand<i32>;
 
-def SDTSPcmpfcc : 
+def SDTSPcmpicc :
+SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPcmpfcc :
 SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
-def SDTSPbrcc : 
+def SDTSPbrcc :
 SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
 def SDTSPselectcc :
 SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
@@ -99,8 +107,17 @@ def SDTSPFTOI :
 SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
 def SDTSPITOF :
 SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDTSPFTOX :
+SDTypeProfile<1, 1, [SDTCisVT<0, f64>, SDTCisFP<1>]>;
+def SDTSPXTOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f64>]>;
+
+def SDTSPtlsadd :
+SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDTSPtlsld :
+SDTypeProfile<1, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
 
-def SPcmpicc : SDNode<"SPISD::CMPICC", SDTIntBinOp, [SDNPOutGlue]>;
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTSPcmpicc, [SDNPOutGlue]>;
 def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutGlue]>;
 def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
 def SPbrxcc : SDNode<"SPISD::BRXCC", SDTSPbrcc, [SDNPHasChain, SDNPInGlue]>;
@@ -111,6 +128,8 @@ def SPlo    : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
 
 def SPftoi  : SDNode<"SPISD::FTOI", SDTSPFTOI>;
 def SPitof  : SDNode<"SPISD::ITOF", SDTSPITOF>;
+def SPftox  : SDNode<"SPISD::FTOX", SDTSPFTOX>;
+def SPxtof  : SDNode<"SPISD::XTOF", SDTSPXTOF>;
 
 def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInGlue]>;
 def SPselectxcc : SDNode<"SPISD::SELECT_XCC", SDTSPselectcc, [SDNPInGlue]>;
@@ -138,6 +157,12 @@ def retflag       : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
 def flushw        : SDNode<"SPISD::FLUSHW", SDTNone,
                            [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
 
+def tlsadd        : SDNode<"SPISD::TLS_ADD", SDTSPtlsadd>;
+def tlsld         : SDNode<"SPISD::TLS_LD",  SDTSPtlsld>;
+def tlscall       : SDNode<"SPISD::TLS_CALL", SDT_SPCall,
+                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                             SDNPVariadic]>;
+
 def getPCX        : Operand<i32> {
   let PrintMethod = "printGetPCX";
 }
@@ -186,7 +211,7 @@ def FCC_O   : FCC_VAL<29>;  // Ordered
 
 /// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
 multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
-  def rr  : F3_1<2, Op3Val, 
+  def rr  : F3_1<2, Op3Val,
                  (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
                  !strconcat(OpcStr, " $b, $c, $dst"),
                  [(set i32:$dst, (OpNode i32:$b, i32:$c))]>;
@@ -199,7 +224,7 @@ multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
 /// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
 /// pattern.
 multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
-  def rr  : F3_1<2, Op3Val, 
+  def rr  : F3_1<2, Op3Val,
                  (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
                  !strconcat(OpcStr, " $b, $c, $dst"), []>;
   def ri  : F3_2<2, Op3Val,
@@ -240,27 +265,15 @@ let hasSideEffects = 1, mayStore = 1 in {
                    [(flushw)]>;
 }
 
-def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
-                "unimp $val", []>;
-
-// FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the 
-// fpmover pass.
-let Predicates = [HasNoV9] in {  // Only emit these in V8 mode.
-  def FpMOVD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
-                      "!FpMOVD $src, $dst", []>;
-  def FpNEGD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
-                      "!FpNEGD $src, $dst",
-                      [(set f64:$dst, (fneg f64:$src))]>;
-  def FpABSD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
-                      "!FpABSD $src, $dst",
-                      [(set f64:$dst, (fabs f64:$src))]>;
-}
+let rd = 0 in
+  def UNIMP : F2_1<0b000, (outs), (ins i32imm:$val),
+                  "unimp $val", []>;
 
 // SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
 // instruction selection into a branch sequence.  This has to handle all
 // permutations of selection between i32/f32/f64 on ICC and FCC.
-  // Expanded after instruction selection.
-let Uses = [ICC], usesCustomInserter = 1 in { 
+// Expanded after instruction selection.
+let Uses = [ICC], usesCustomInserter = 1 in {
   def SELECT_CC_Int_ICC
    : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
             "; SELECT_CC_Int_ICC PSEUDO!",
@@ -274,6 +287,11 @@ let Uses = [ICC], usesCustomInserter = 1 in {
    : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
             "; SELECT_CC_DFP_ICC PSEUDO!",
             [(set f64:$dst, (SPselecticc f64:$T, f64:$F, imm:$Cond))]>;
+
+  def SELECT_CC_QFP_ICC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_ICC PSEUDO!",
+            [(set f128:$dst, (SPselecticc f128:$T, f128:$F, imm:$Cond))]>;
 }
 
 let usesCustomInserter = 1, Uses = [FCC] in {
@@ -291,17 +309,21 @@ let usesCustomInserter = 1, Uses = [FCC] in {
    : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
             "; SELECT_CC_DFP_FCC PSEUDO!",
             [(set f64:$dst, (SPselectfcc f64:$T, f64:$F, imm:$Cond))]>;
+  def SELECT_CC_QFP_FCC
+   : Pseudo<(outs QFPRegs:$dst), (ins QFPRegs:$T, QFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_QFP_FCC PSEUDO!",
+            [(set f128:$dst, (SPselectfcc f128:$T, f128:$F, imm:$Cond))]>;
 }
 
 
 // Section A.3 - Synthetic Instructions, p. 85
 // special cases of JMPL:
 let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in {
-  let rd = O7.Num, rs1 = G0.Num in
+  let rd = 0, rs1 = 15 in
     def RETL: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
                    "jmp %o7+$val", [(retflag simm13:$val)]>;
 
-  let rd = I7.Num, rs1 = G0.Num in
+  let rd = 0, rs1 = 31 in
     def RET: F3_2<2, 0b111000, (outs), (ins i32imm:$val),
                   "jmp %i7+$val", []>;
 }
@@ -365,56 +387,76 @@ def LDDFri : F3_2<3, 0b100011,
                   (outs DFPRegs:$dst), (ins MEMri:$addr),
                   "ldd [$addr], $dst",
                   [(set f64:$dst, (load ADDRri:$addr))]>;
+def LDQFrr : F3_1<3, 0b100010,
+                  (outs QFPRegs:$dst), (ins MEMrr:$addr),
+                  "ldq [$addr], $dst",
+                  [(set f128:$dst, (load ADDRrr:$addr))]>,
+                  Requires<[HasV9, HasHardQuad]>;
+def LDQFri : F3_2<3, 0b100010,
+                  (outs QFPRegs:$dst), (ins MEMri:$addr),
+                  "ldq [$addr], $dst",
+                  [(set f128:$dst, (load ADDRri:$addr))]>,
+                  Requires<[HasV9, HasHardQuad]>;
 
 // Section B.4 - Store Integer Instructions, p. 95
 def STBrr : F3_1<3, 0b000101,
-                 (outs), (ins MEMrr:$addr, IntRegs:$src),
-                 "stb $src, [$addr]",
-                 [(truncstorei8 i32:$src, ADDRrr:$addr)]>;
+                 (outs), (ins MEMrr:$addr, IntRegs:$rd),
+                 "stb $rd, [$addr]",
+                 [(truncstorei8 i32:$rd, ADDRrr:$addr)]>;
 def STBri : F3_2<3, 0b000101,
-                 (outs), (ins MEMri:$addr, IntRegs:$src),
-                 "stb $src, [$addr]",
-                 [(truncstorei8 i32:$src, ADDRri:$addr)]>;
+                 (outs), (ins MEMri:$addr, IntRegs:$rd),
+                 "stb $rd, [$addr]",
+                 [(truncstorei8 i32:$rd, ADDRri:$addr)]>;
 def STHrr : F3_1<3, 0b000110,
-                 (outs), (ins MEMrr:$addr, IntRegs:$src),
-                 "sth $src, [$addr]",
-                 [(truncstorei16 i32:$src, ADDRrr:$addr)]>;
+                 (outs), (ins MEMrr:$addr, IntRegs:$rd),
+                 "sth $rd, [$addr]",
+                 [(truncstorei16 i32:$rd, ADDRrr:$addr)]>;
 def STHri : F3_2<3, 0b000110,
-                 (outs), (ins MEMri:$addr, IntRegs:$src),
-                 "sth $src, [$addr]",
-                 [(truncstorei16 i32:$src, ADDRri:$addr)]>;
+                 (outs), (ins MEMri:$addr, IntRegs:$rd),
+                 "sth $rd, [$addr]",
+                 [(truncstorei16 i32:$rd, ADDRri:$addr)]>;
 def STrr  : F3_1<3, 0b000100,
-                 (outs), (ins MEMrr:$addr, IntRegs:$src),
-                 "st $src, [$addr]",
-                 [(store i32:$src, ADDRrr:$addr)]>;
+                 (outs), (ins MEMrr:$addr, IntRegs:$rd),
+                 "st $rd, [$addr]",
+                 [(store i32:$rd, ADDRrr:$addr)]>;
 def STri  : F3_2<3, 0b000100,
-                 (outs), (ins MEMri:$addr, IntRegs:$src),
-                 "st $src, [$addr]",
-                 [(store i32:$src, ADDRri:$addr)]>;
+                 (outs), (ins MEMri:$addr, IntRegs:$rd),
+                 "st $rd, [$addr]",
+                 [(store i32:$rd, ADDRri:$addr)]>;
 
 // Section B.5 - Store Floating-point Instructions, p. 97
 def STFrr   : F3_1<3, 0b100100,
-                   (outs), (ins MEMrr:$addr, FPRegs:$src),
-                   "st $src, [$addr]",
-                   [(store f32:$src, ADDRrr:$addr)]>;
+                   (outs), (ins MEMrr:$addr, FPRegs:$rd),
+                   "st $rd, [$addr]",
+                   [(store f32:$rd, ADDRrr:$addr)]>;
 def STFri   : F3_2<3, 0b100100,
-                   (outs), (ins MEMri:$addr, FPRegs:$src),
-                   "st $src, [$addr]",
-                   [(store f32:$src, ADDRri:$addr)]>;
+                   (outs), (ins MEMri:$addr, FPRegs:$rd),
+                   "st $rd, [$addr]",
+                   [(store f32:$rd, ADDRri:$addr)]>;
 def STDFrr  : F3_1<3, 0b100111,
-                   (outs), (ins MEMrr:$addr, DFPRegs:$src),
-                   "std  $src, [$addr]",
-                   [(store f64:$src, ADDRrr:$addr)]>;
+                   (outs), (ins MEMrr:$addr, DFPRegs:$rd),
+                   "std  $rd, [$addr]",
+                   [(store f64:$rd, ADDRrr:$addr)]>;
 def STDFri  : F3_2<3, 0b100111,
-                   (outs), (ins MEMri:$addr, DFPRegs:$src),
-                   "std $src, [$addr]",
-                   [(store f64:$src, ADDRri:$addr)]>;
+                   (outs), (ins MEMri:$addr, DFPRegs:$rd),
+                   "std $rd, [$addr]",
+                   [(store f64:$rd, ADDRri:$addr)]>;
+def STQFrr  : F3_1<3, 0b100110,
+                   (outs), (ins MEMrr:$addr, QFPRegs:$rd),
+                   "stq  $rd, [$addr]",
+                   [(store f128:$rd, ADDRrr:$addr)]>,
+                   Requires<[HasV9, HasHardQuad]>;
+def STQFri  : F3_2<3, 0b100110,
+                   (outs), (ins MEMri:$addr, QFPRegs:$rd),
+                   "stq $rd, [$addr]",
+                   [(store f128:$rd, ADDRri:$addr)]>,
+                   Requires<[HasV9, HasHardQuad]>;
 
 // Section B.9 - SETHI Instruction, p. 104
 def SETHIi: F2_1<0b100,
-                 (outs IntRegs:$dst), (ins i32imm:$src),
-                 "sethi $src, $dst",
-                 [(set i32:$dst, SETHIimm:$src)]>;
+                 (outs IntRegs:$rd), (ins i32imm:$imm22),
+                 "sethi $imm22, $rd",
+                 [(set i32:$rd, SETHIimm:$imm22)]>;
 
 // Section B.10 - NOP Instruction, p. 105
 // (It's a special case of SETHI)
@@ -460,27 +502,39 @@ defm SRA : F3_12<"sra", 0b100111, sra>;
 defm ADD   : F3_12<"add", 0b000000, add>;
 
 // "LEA" forms of add (patterns to make tblgen happy)
-def LEA_ADDri   : F3_2<2, 0b000000,
-                   (outs IntRegs:$dst), (ins MEMri:$addr),
-                   "add ${addr:arith}, $dst",
-                   [(set i32:$dst, ADDRri:$addr)]>;
+let Predicates = [Is32Bit] in
+  def LEA_ADDri   : F3_2<2, 0b000000,
+                     (outs IntRegs:$dst), (ins MEMri:$addr),
+                     "add ${addr:arith}, $dst",
+                     [(set iPTR:$dst, ADDRri:$addr)]>;
 
-let Defs = [ICC] in                   
+let Defs = [ICC] in
   defm ADDCC  : F3_12<"addcc", 0b010000, addc>;
 
-let Uses = [ICC] in
-  defm ADDX  : F3_12<"addx", 0b001000, adde>;
+let Uses = [ICC], Defs = [ICC] in
+  defm ADDX  : F3_12<"addxcc", 0b011000, adde>;
 
 // Section B.15 - Subtract Instructions, p. 110
 defm SUB    : F3_12  <"sub"  , 0b000100, sub>;
-let Uses = [ICC] in 
-  defm SUBX   : F3_12  <"subx" , 0b001100, sube>;
-
-let Defs = [ICC] in 
-  defm SUBCC  : F3_12  <"subcc", 0b010100, SPcmpicc>;
+let Uses = [ICC], Defs = [ICC] in
+  defm SUBX   : F3_12  <"subxcc" , 0b011100, sube>;
+
+let Defs = [ICC] in
+  defm SUBCC  : F3_12  <"subcc", 0b010100, subc>;
+
+let Defs = [ICC], rd = 0 in {
+  def CMPrr   : F3_1<2, 0b010100,
+                     (outs), (ins IntRegs:$b, IntRegs:$c),
+                     "cmp $b, $c",
+                     [(SPcmpicc i32:$b, i32:$c)]>;
+  def CMPri   : F3_2<2, 0b010100,
+                     (outs), (ins IntRegs:$b, i32imm:$c),
+                     "cmp $b, $c",
+                     [(SPcmpicc i32:$b, (i32 simm13:$c))]>;
+}
 
 let Uses = [ICC], Defs = [ICC] in
-  def SUBXCCrr: F3_1<2, 0b011100, 
+  def SUBXCCrr: F3_1<2, 0b011100,
                 (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
                 "subxcc $b, $c, $dst", []>;
 
@@ -503,76 +557,91 @@ defm RESTORE : F3_12np<"restore", 0b111101>;
 
 // Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
 
+// unconditional branch class.
+class BranchAlways<dag ins, string asmstr, list<dag> pattern>
+  : F2_2<0b010, (outs), ins, asmstr, pattern> {
+  let isBranch     = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+  let isBarrier    = 1;
+}
+
+let cond = 8 in
+  def BA : BranchAlways<(ins brtarget:$imm22), "ba $imm22", [(br bb:$imm22)]>;
+
 // conditional branch class:
-class BranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
- : F2_2<cc, 0b010, (outs), ins, asmstr, pattern> {
+class BranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b010, (outs), ins, asmstr, pattern> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
 }
 
-let isBarrier = 1 in
-  def BA   : BranchSP<0b1000, (ins brtarget:$dst),
-                      "ba $dst",
-                      [(br bb:$dst)]>;
+// Indirect branch instructions.
+let isTerminator = 1, isBarrier = 1,
+     hasDelaySlot = 1, isBranch =1,
+     isIndirectBranch = 1, rd = 0 in {
+  def BINDrr  : F3_1<2, 0b111000,
+                   (outs), (ins MEMrr:$ptr),
+                   "jmp $ptr",
+                   [(brind ADDRrr:$ptr)]>;
+  def BINDri  : F3_2<2, 0b111000,
+                   (outs), (ins MEMri:$ptr),
+                   "jmp $ptr",
+                   [(brind ADDRri:$ptr)]>;
+}
 
-// FIXME: the encoding for the JIT should look at the condition field.
 let Uses = [ICC] in
-  def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
-                         "b$cc $dst",
-                        [(SPbricc bb:$dst, imm:$cc)]>;
-
+  def BCOND : BranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                         "b$cond $imm22",
+                        [(SPbricc bb:$imm22, imm:$cond)]>;
 
 // Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
 
 // floating-point conditional branch class:
-class FPBranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
- : F2_2<cc, 0b110, (outs), ins, asmstr, pattern> {
+class FPBranchSP<dag ins, string asmstr, list<dag> pattern>
+ : F2_2<0b110, (outs), ins, asmstr, pattern> {
   let isBranch = 1;
   let isTerminator = 1;
   let hasDelaySlot = 1;
 }
 
-// FIXME: the encoding for the JIT should look at the condition field.
 let Uses = [FCC] in
-  def FBCOND  : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
-                              "fb$cc $dst",
-                              [(SPbrfcc bb:$dst, imm:$cc)]>;
+  def FBCOND  : FPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
+                              "fb$cond $imm22",
+                              [(SPbrfcc bb:$imm22, imm:$cond)]>;
 
 
 // Section B.24 - Call and Link Instruction, p. 125
 // This is the only Format 1 instruction
 let Uses = [O6],
-    hasDelaySlot = 1, isCall = 1,
-    Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7,
-    D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15,
-        ICC, FCC, Y] in {
+    hasDelaySlot = 1, isCall = 1 in {
   def CALL : InstSP<(outs), (ins calltarget:$dst, variable_ops),
                     "call $dst", []> {
     bits<30> disp;
     let op = 1;
     let Inst{29-0} = disp;
   }
-  
+
   // indirect calls
   def JMPLrr : F3_1<2, 0b111000,
                     (outs), (ins MEMrr:$ptr, variable_ops),
                     "call $ptr",
-                    [(call ADDRrr:$ptr)]>;
+                    [(call ADDRrr:$ptr)]> { let rd = 15; }
   def JMPLri : F3_2<2, 0b111000,
                     (outs), (ins MEMri:$ptr, variable_ops),
                     "call $ptr",
-                    [(call ADDRri:$ptr)]>;
+                    [(call ADDRri:$ptr)]> { let rd = 15; }
 }
 
 // Section B.28 - Read State Register Instructions
-let Uses = [Y] in 
+let Uses = [Y], rs1 = 0, rs2 = 0 in
   def RDY : F3_1<2, 0b101000,
                  (outs IntRegs:$dst), (ins),
                  "rd %y, $dst", []>;
 
 // Section B.29 - Write State Register Instructions
-let Defs = [Y] in {
+let Defs = [Y], rd = 0 in {
   def WRYrr : F3_1<2, 0b110000,
                    (outs), (ins IntRegs:$b, IntRegs:$c),
                    "wr $b, $c, %y", []>;
@@ -581,58 +650,93 @@ let Defs = [Y] in {
                    "wr $b, $c, %y", []>;
 }
 // Convert Integer to Floating-point Instructions, p. 141
-def FITOS : F3_3<2, 0b110100, 0b011000100,
+def FITOS : F3_3u<2, 0b110100, 0b011000100,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fitos $src, $dst",
                  [(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
-def FITOD : F3_3<2, 0b110100, 0b011001000, 
+def FITOD : F3_3u<2, 0b110100, 0b011001000,
                  (outs DFPRegs:$dst), (ins FPRegs:$src),
                  "fitod $src, $dst",
                  [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+def FITOQ : F3_3u<2, 0b110100, 0b011001100,
+                 (outs QFPRegs:$dst), (ins FPRegs:$src),
+                 "fitoq $src, $dst",
+                 [(set QFPRegs:$dst, (SPitof FPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
 
 // Convert Floating-point to Integer Instructions, p. 142
-def FSTOI : F3_3<2, 0b110100, 0b011010001,
+def FSTOI : F3_3u<2, 0b110100, 0b011010001,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fstoi $src, $dst",
                  [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
-def FDTOI : F3_3<2, 0b110100, 0b011010010,
+def FDTOI : F3_3u<2, 0b110100, 0b011010010,
                  (outs FPRegs:$dst), (ins DFPRegs:$src),
                  "fdtoi $src, $dst",
                  [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+def FQTOI : F3_3u<2, 0b110100, 0b011010011,
+                 (outs FPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi QFPRegs:$src))]>,
+                 Requires<[HasHardQuad]>;
 
 // Convert between Floating-point Formats Instructions, p. 143
-def FSTOD : F3_3<2, 0b110100, 0b011001001, 
+def FSTOD : F3_3u<2, 0b110100, 0b011001001,
                  (outs DFPRegs:$dst), (ins FPRegs:$src),
                  "fstod $src, $dst",
                  [(set f64:$dst, (fextend f32:$src))]>;
-def FDTOS : F3_3<2, 0b110100, 0b011000110,
+def FSTOQ : F3_3u<2, 0b110100, 0b011001101,
+                 (outs QFPRegs:$dst), (ins FPRegs:$src),
+                 "fstoq $src, $dst",
+                 [(set f128:$dst, (fextend f32:$src))]>,
+                 Requires<[HasHardQuad]>;
+def FDTOS : F3_3u<2, 0b110100, 0b011000110,
                  (outs FPRegs:$dst), (ins DFPRegs:$src),
                  "fdtos $src, $dst",
                  [(set f32:$dst, (fround f64:$src))]>;
+def FDTOQ : F3_3u<2, 0b110100, 0b01101110,
+                 (outs QFPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtoq $src, $dst",
+                 [(set f128:$dst, (fextend f64:$src))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOS : F3_3u<2, 0b110100, 0b011000111,
+                 (outs FPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtos $src, $dst",
+                 [(set f32:$dst, (fround f128:$src))]>,
+                 Requires<[HasHardQuad]>;
+def FQTOD : F3_3u<2, 0b110100, 0b011001011,
+                 (outs DFPRegs:$dst), (ins QFPRegs:$src),
+                 "fqtod $src, $dst",
+                 [(set f64:$dst, (fround f128:$src))]>,
+                 Requires<[HasHardQuad]>;
 
 // Floating-point Move Instructions, p. 144
-def FMOVS : F3_3<2, 0b110100, 0b000000001,
+def FMOVS : F3_3u<2, 0b110100, 0b000000001,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fmovs $src, $dst", []>;
-def FNEGS : F3_3<2, 0b110100, 0b000000101, 
+def FNEGS : F3_3u<2, 0b110100, 0b000000101,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fnegs $src, $dst",
                  [(set f32:$dst, (fneg f32:$src))]>;
-def FABSS : F3_3<2, 0b110100, 0b000001001, 
+def FABSS : F3_3u<2, 0b110100, 0b000001001,
                  (outs FPRegs:$dst), (ins FPRegs:$src),
                  "fabss $src, $dst",
                  [(set f32:$dst, (fabs f32:$src))]>;
 
 
 // Floating-point Square Root Instructions, p.145
-def FSQRTS : F3_3<2, 0b110100, 0b000101001, 
+def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
                   (outs FPRegs:$dst), (ins FPRegs:$src),
                   "fsqrts $src, $dst",
                   [(set f32:$dst, (fsqrt f32:$src))]>;
-def FSQRTD : F3_3<2, 0b110100, 0b000101010, 
+def FSQRTD : F3_3u<2, 0b110100, 0b000101010,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
                   "fsqrtd $src, $dst",
                   [(set f64:$dst, (fsqrt f64:$src))]>;
+def FSQRTQ : F3_3u<2, 0b110100, 0b000101011,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                  "fsqrtq $src, $dst",
+                  [(set f128:$dst, (fsqrt f128:$src))]>,
+                  Requires<[HasHardQuad]>;
 
 
 
@@ -645,6 +749,12 @@ def FADDD  : F3_3<2, 0b110100, 0b001000010,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                   "faddd $src1, $src2, $dst",
                   [(set f64:$dst, (fadd f64:$src1, f64:$src2))]>;
+def FADDQ  : F3_3<2, 0b110100, 0b001000011,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                  "faddq $src1, $src2, $dst",
+                  [(set f128:$dst, (fadd f128:$src1, f128:$src2))]>,
+                  Requires<[HasHardQuad]>;
+
 def FSUBS  : F3_3<2, 0b110100, 0b001000101,
                   (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
                   "fsubs $src1, $src2, $dst",
@@ -653,6 +763,12 @@ def FSUBD  : F3_3<2, 0b110100, 0b001000110,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                   "fsubd $src1, $src2, $dst",
                   [(set f64:$dst, (fsub f64:$src1, f64:$src2))]>;
+def FSUBQ  : F3_3<2, 0b110100, 0b001000111,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                  "fsubq $src1, $src2, $dst",
+                  [(set f128:$dst, (fsub f128:$src1, f128:$src2))]>,
+                  Requires<[HasHardQuad]>;
+
 
 // Floating-point Multiply and Divide Instructions, p. 147
 def FMULS  : F3_3<2, 0b110100, 0b001001001,
@@ -663,11 +779,24 @@ def FMULD  : F3_3<2, 0b110100, 0b001001010,
                   (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                   "fmuld $src1, $src2, $dst",
                   [(set f64:$dst, (fmul f64:$src1, f64:$src2))]>;
+def FMULQ  : F3_3<2, 0b110100, 0b001001011,
+                  (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                  "fmulq $src1, $src2, $dst",
+                  [(set f128:$dst, (fmul f128:$src1, f128:$src2))]>,
+                  Requires<[HasHardQuad]>;
+
 def FSMULD : F3_3<2, 0b110100, 0b001101001,
                   (outs DFPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
                   "fsmuld $src1, $src2, $dst",
                   [(set f64:$dst, (fmul (fextend f32:$src1),
                                         (fextend f32:$src2)))]>;
+def FDMULQ : F3_3<2, 0b110100, 0b001101110,
+                  (outs QFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "fdmulq $src1, $src2, $dst",
+                  [(set f128:$dst, (fmul (fextend f64:$src1),
+                                         (fextend f64:$src2)))]>,
+                  Requires<[HasHardQuad]>;
+
 def FDIVS  : F3_3<2, 0b110100, 0b001001101,
                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
                  "fdivs $src1, $src2, $dst",
@@ -676,21 +805,61 @@ def FDIVD  : F3_3<2, 0b110100, 0b001001110,
                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
                  "fdivd $src1, $src2, $dst",
                  [(set f64:$dst, (fdiv f64:$src1, f64:$src2))]>;
+def FDIVQ  : F3_3<2, 0b110100, 0b001001111,
+                 (outs QFPRegs:$dst), (ins QFPRegs:$src1, QFPRegs:$src2),
+                 "fdivq $src1, $src2, $dst",
+                 [(set f128:$dst, (fdiv f128:$src1, f128:$src2))]>,
+                 Requires<[HasHardQuad]>;
 
 // Floating-point Compare Instructions, p. 148
 // Note: the 2nd template arg is different for these guys.
 // Note 2: the result of a FCMP is not available until the 2nd cycle
-// after the instr is retired, but there is no interlock. This behavior
-// is modelled with a forced noop after the instruction.
+// after the instr is retired, but there is no interlock in Sparc V8.
+// This behavior is modeled with a forced noop after the instruction in
+// DelaySlotFiller.
+
 let Defs = [FCC] in {
-  def FCMPS  : F3_3<2, 0b110101, 0b001010001,
+  def FCMPS  : F3_3c<2, 0b110101, 0b001010001,
                    (outs), (ins FPRegs:$src1, FPRegs:$src2),
-                   "fcmps $src1, $src2\n\tnop",
+                   "fcmps $src1, $src2",
                    [(SPcmpfcc f32:$src1, f32:$src2)]>;
-  def FCMPD  : F3_3<2, 0b110101, 0b001010010,
+  def FCMPD  : F3_3c<2, 0b110101, 0b001010010,
                    (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
-                   "fcmpd $src1, $src2\n\tnop",
+                   "fcmpd $src1, $src2",
                    [(SPcmpfcc f64:$src1, f64:$src2)]>;
+  def FCMPQ  : F3_3c<2, 0b110101, 0b001010011,
+                   (outs), (ins QFPRegs:$src1, QFPRegs:$src2),
+                   "fcmpq $src1, $src2",
+                   [(SPcmpfcc f128:$src1, f128:$src2)]>,
+                   Requires<[HasHardQuad]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions for Thread Local Storage(TLS).
+//===----------------------------------------------------------------------===//
+
+def TLS_ADDrr : F3_1<2, 0b000000,
+                    (outs IntRegs:$rd),
+                    (ins IntRegs:$rs1, IntRegs:$rs2, TLSSym:$sym),
+                    "add $rs1, $rs2, $rd, $sym",
+                    [(set i32:$rd,
+                        (tlsadd i32:$rs1, i32:$rs2, tglobaltlsaddr:$sym))]>;
+
+let mayLoad = 1 in
+  def TLS_LDrr : F3_1<3, 0b000000,
+                      (outs IntRegs:$dst), (ins MEMrr:$addr, TLSSym:$sym),
+                      "ld [$addr], $dst, $sym",
+                      [(set i32:$dst,
+                          (tlsld ADDRrr:$addr, tglobaltlsaddr:$sym))]>;
+
+let Uses = [O6], isCall = 1, hasDelaySlot = 1 in
+  def TLS_CALL : InstSP<(outs),
+                        (ins calltarget:$disp, TLSSym:$sym, variable_ops),
+                        "call $disp, $sym",
+                        [(tlscall texternalsym:$disp, tglobaltlsaddr:$sym)]> {
+  bits<30> disp;
+  let op = 1;
+  let Inst{29-0} = disp;
 }
 
 //===----------------------------------------------------------------------===//
@@ -698,76 +867,110 @@ let Defs = [FCC] in {
 //===----------------------------------------------------------------------===//
 
 // V9 Conditional Moves.
-let Predicates = [HasV9], Constraints = "$T = $dst" in {
+let Predicates = [HasV9], Constraints = "$f = $rd" in {
   // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
-  // FIXME: Add instruction encodings for the JIT some day.
-  let Uses = [ICC] in {
+  let Uses = [ICC], cc = 0b100 in {
     def MOVICCrr
-      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
-               "mov$cc %icc, $F, $dst",
-               [(set i32:$dst, (SPselecticc i32:$F, i32:$T, imm:$cc))]>;
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $rs2, $rd",
+             [(set i32:$rd, (SPselecticc i32:$rs2, i32:$f, imm:$cond))]>;
+
     def MOVICCri
-      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
-               "mov$cc %icc, $F, $dst",
-               [(set i32:$dst, (SPselecticc simm11:$F, i32:$T, imm:$cc))]>;
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %icc, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselecticc simm11:$simm11, i32:$f, imm:$cond))]>;
   }
 
-  let Uses = [FCC] in {
+  let Uses = [FCC], cc = 0b000 in {
     def MOVFCCrr
-      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
-               "mov$cc %fcc0, $F, $dst",
-               [(set i32:$dst, (SPselectfcc i32:$F, i32:$T, imm:$cc))]>;
+      : F4_1<0b101100, (outs IntRegs:$rd),
+             (ins IntRegs:$rs2, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $rs2, $rd",
+             [(set i32:$rd, (SPselectfcc i32:$rs2, i32:$f, imm:$cond))]>;
     def MOVFCCri
-      : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
-               "mov$cc %fcc0, $F, $dst",
-               [(set i32:$dst, (SPselectfcc simm11:$F, i32:$T, imm:$cc))]>;
+      : F4_2<0b101100, (outs IntRegs:$rd),
+             (ins i32imm:$simm11, IntRegs:$f, CCOp:$cond),
+             "mov$cond %fcc0, $simm11, $rd",
+             [(set i32:$rd,
+                    (SPselectfcc simm11:$simm11, i32:$f, imm:$cond))]>;
   }
 
-  let Uses = [ICC] in {
+  let Uses = [ICC], opf_cc = 0b100 in {
     def FMOVS_ICC
-      : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
-               "fmovs$cc %icc, $F, $dst",
-               [(set f32:$dst,
-                           (SPselecticc f32:$F, f32:$T, imm:$cc))]>;
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %icc, $rs2, $rd",
+             [(set f32:$rd, (SPselecticc f32:$rs2, f32:$f, imm:$cond))]>;
     def FMOVD_ICC
-      : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
-               "fmovd$cc %icc, $F, $dst",
-               [(set f64:$dst, (SPselecticc f64:$F, f64:$T, imm:$cc))]>;
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+               (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+               "fmovd$cond %icc, $rs2, $rd",
+               [(set f64:$rd, (SPselecticc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_ICC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+               (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+               "fmovd$cond %icc, $rs2, $rd",
+               [(set f128:$rd, (SPselecticc f128:$rs2, f128:$f, imm:$cond))]>;
   }
 
-  let Uses = [FCC] in {
+  let Uses = [FCC], opf_cc = 0b000 in {
     def FMOVS_FCC
-      : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
-               "fmovs$cc %fcc0, $F, $dst",
-               [(set f32:$dst, (SPselectfcc f32:$F, f32:$T, imm:$cc))]>;
+      : F4_3<0b110101, 0b000001, (outs FPRegs:$rd),
+             (ins FPRegs:$rs2, FPRegs:$f, CCOp:$cond),
+             "fmovs$cond %fcc0, $rs2, $rd",
+             [(set f32:$rd, (SPselectfcc f32:$rs2, f32:$f, imm:$cond))]>;
     def FMOVD_FCC
-      : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
-               "fmovd$cc %fcc0, $F, $dst",
-               [(set f64:$dst, (SPselectfcc f64:$F, f64:$T, imm:$cc))]>;
+      : F4_3<0b110101, 0b000010, (outs DFPRegs:$rd),
+             (ins DFPRegs:$rs2, DFPRegs:$f, CCOp:$cond),
+             "fmovd$cond %fcc0, $rs2, $rd",
+             [(set f64:$rd, (SPselectfcc f64:$rs2, f64:$f, imm:$cond))]>;
+    def FMOVQ_FCC
+      : F4_3<0b110101, 0b000011, (outs QFPRegs:$rd),
+             (ins QFPRegs:$rs2, QFPRegs:$f, CCOp:$cond),
+             "fmovd$cond %fcc0, $rs2, $rd",
+             [(set f128:$rd, (SPselectfcc f128:$rs2, f128:$f, imm:$cond))]>;
   }
 
 }
 
 // Floating-Point Move Instructions, p. 164 of the V9 manual.
 let Predicates = [HasV9] in {
-  def FMOVD : F3_3<2, 0b110100, 0b000000010,
+  def FMOVD : F3_3u<2, 0b110100, 0b000000010,
                    (outs DFPRegs:$dst), (ins DFPRegs:$src),
                    "fmovd $src, $dst", []>;
-  def FNEGD : F3_3<2, 0b110100, 0b000000110, 
+  def FMOVQ : F3_3u<2, 0b110100, 0b000000011,
+                   (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                   "fmovq $src, $dst", []>,
+                   Requires<[HasHardQuad]>;
+  def FNEGD : F3_3u<2, 0b110100, 0b000000110,
                    (outs DFPRegs:$dst), (ins DFPRegs:$src),
                    "fnegd $src, $dst",
                    [(set f64:$dst, (fneg f64:$src))]>;
-  def FABSD : F3_3<2, 0b110100, 0b000001010, 
+  def FNEGQ : F3_3u<2, 0b110100, 0b000000111,
+                   (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                   "fnegq $src, $dst",
+                   [(set f128:$dst, (fneg f128:$src))]>,
+                   Requires<[HasHardQuad]>;
+  def FABSD : F3_3u<2, 0b110100, 0b000001010,
                    (outs DFPRegs:$dst), (ins DFPRegs:$src),
                    "fabsd $src, $dst",
                    [(set f64:$dst, (fabs f64:$src))]>;
+  def FABSQ : F3_3u<2, 0b110100, 0b000001011,
+                   (outs QFPRegs:$dst), (ins QFPRegs:$src),
+                   "fabsq $src, $dst",
+                   [(set f128:$dst, (fabs f128:$src))]>,
+                   Requires<[HasHardQuad]>;
 }
 
 // POPCrr - This does a ctpop of a 64-bit register.  As such, we have to clear
 // the top 32-bits before using it.  To do this clearing, we use a SLLri X,0.
-def POPCrr : F3_1<2, 0b101110, 
-                  (outs IntRegs:$dst), (ins IntRegs:$src),
-                  "popc $src, $dst", []>, Requires<[HasV9]>;
+let rs1 = 0 in
+  def POPCrr : F3_1<2, 0b101110,
+                    (outs IntRegs:$dst), (ins IntRegs:$src),
+                    "popc $src, $dst", []>, Requires<[HasV9]>;
 def : Pat<(ctpop i32:$src),
           (POPCrr (SLLri $src, 0))>;
 
@@ -782,11 +985,6 @@ def : Pat<(i32 simm13:$val),
 def : Pat<(i32 imm:$val),
           (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
 
-// subc
-def : Pat<(subc i32:$b, i32:$c),
-          (SUBCCrr $b, $c)>;
-def : Pat<(subc i32:$b, simm13:$val),
-          (SUBCCri $b, imm:$val)>;
 
 // Global addresses, constant pool entries
 def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
@@ -794,11 +992,25 @@ def : Pat<(SPlo tglobaladdr:$in), (ORri (i32 G0), tglobaladdr:$in)>;
 def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
 def : Pat<(SPlo tconstpool:$in), (ORri (i32 G0), tconstpool:$in)>;
 
+// GlobalTLS addresses
+def : Pat<(SPhi tglobaltlsaddr:$in), (SETHIi tglobaltlsaddr:$in)>;
+def : Pat<(SPlo tglobaltlsaddr:$in), (ORri (i32 G0), tglobaltlsaddr:$in)>;
+def : Pat<(add (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (ADDri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+def : Pat<(xor (SPhi tglobaltlsaddr:$in1), (SPlo tglobaltlsaddr:$in2)),
+          (XORri (SETHIi tglobaltlsaddr:$in1), (tglobaltlsaddr:$in2))>;
+
+// Blockaddress
+def : Pat<(SPhi tblockaddress:$in), (SETHIi tblockaddress:$in)>;
+def : Pat<(SPlo tblockaddress:$in), (ORri (i32 G0), tblockaddress:$in)>;
+
 // Add reg, lo.  This is used when taking the addr of a global/constpool entry.
 def : Pat<(add iPTR:$r, (SPlo tglobaladdr:$in)), (ADDri $r, tglobaladdr:$in)>;
 def : Pat<(add iPTR:$r, (SPlo tconstpool:$in)),  (ADDri $r, tconstpool:$in)>;
+def : Pat<(add iPTR:$r, (SPlo tblockaddress:$in)),
+                        (ADDri $r, tblockaddress:$in)>;
 
-// Calls: 
+// Calls:
 def : Pat<(call tglobaladdr:$dst),
           (CALL tglobaladdr:$dst)>;
 def : Pat<(call texternalsym:$dst),
@@ -816,4 +1028,8 @@ def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
 def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
 def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
 
+// store 0, addr -> store %g0, addr
+def : Pat<(store (i32 0), ADDRrr:$dst), (STrr ADDRrr:$dst, (i32 G0))>;
+def : Pat<(store (i32 0), ADDRri:$dst), (STri ADDRri:$dst, (i32 G0))>;
+
 include "SparcInstr64Bit.td"
diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp
new file mode 100644
index 0000000..6493c7d
--- /dev/null
+++ b/lib/Target/Sparc/SparcJITInfo.cpp
@@ -0,0 +1,165 @@
+//===-- SparcJITInfo.cpp - Implement the Sparc JIT Interface --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Sparc target.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "jit"
+#include "SparcJITInfo.h"
+#include "SparcRelocations.h"
+
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Memory.h"
+
+using namespace llvm;
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+extern "C" void SparcCompilationCallback();
+
+extern "C" {
+#if defined (__sparc__)
+  asm(
+      ".text\n"
+      "\t.align 4\n"
+      "\t.global SparcCompilationCallback\n"
+      "\t.type SparcCompilationCallback, #function\n"
+      "SparcCompilationCallback:\n"
+      // Save current register window.
+      "\tsave %sp, -192, %sp\n"
+      // stubaddr+4 is in %g1.
+      "\tcall SparcCompilationCallbackC\n"
+      "\t  sub %g1, 4, %o0\n"
+      // restore original register window and
+      // copy %o0 to %g1
+      "\t  restore %o0, 0, %g1\n"
+      // call the new stub
+      "\tjmp %g1\n"
+      "\t  nop\n"
+      "\t.size   SparcCompilationCallback, .-SparcCompilationCallback"
+      );
+
+#else
+  void SparcCompilationCallback() {
+    llvm_unreachable(
+      "Cannot call SparcCompilationCallback() on a non-sparc arch!");
+  }
+#endif
+}
+
+#define HI(Val) (((unsigned)(Val)) >> 10)
+#define LO(Val) (((unsigned)(Val)) & 0x3FF)
+
+#define SETHI_INST(imm, rd)    (0x01000000 | ((rd) << 25) | ((imm) & 0x3FFFFF))
+#define JMP_INST(rs1, imm, rd) (0x80000000 | ((rd) << 25) | (0x38 << 19) \
+                                | ((rs1) << 14) | (1 << 13) | ((imm) & 0x1FFF))
+#define NOP_INST               SETHI_INST(0, 0)
+
+extern "C" void *SparcCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t) JITCompilerFunction((void*) StubAddr);
+
+  // Rewrite the function stub so that we don't end up here every time we
+  // execute the call. We're replacing the first three instructions of the
+  // stub with code that jumps to the compiled function:
+  //   sethi %hi(NewVal), %g1
+  //   jmp %g1+%lo(NewVal)
+  //   nop
+
+  *(intptr_t *)(StubAddr)      = SETHI_INST(HI(NewVal), 1);
+  *(intptr_t *)(StubAddr + 4)  = JMP_INST(1, LO(NewVal), 0);
+  *(intptr_t *)(StubAddr + 8)  = NOP_INST;
+
+  sys::Memory::InvalidateInstructionCache((void*) StubAddr, 12);
+  return (void*)StubAddr;
+}
+
+void SparcJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  assert(0 && "FIXME: Implement SparcJITInfo::replaceMachineCodeForFunction");
+}
+
+
+TargetJITInfo::StubLayout SparcJITInfo::getStubLayout() {
+  // The stub contains 3 4-byte instructions, aligned at 4 bytes. See
+  // emitFunctionStub for details.
+
+  StubLayout Result = { 3*4, 4 };
+  return Result;
+}
+
+void *SparcJITInfo::emitFunctionStub(const Function *F, void *Fn,
+                                     JITCodeEmitter &JCE)
+{
+  JCE.emitAlignment(4);
+  void *Addr = (void*) (JCE.getCurrentPCValue());
+  if (!sys::Memory::setRangeWritable(Addr, 12))
+    llvm_unreachable("ERROR: Unable to mark stub writable.");
+
+  intptr_t EmittedAddr;
+  if (Fn != (void*)(intptr_t)SparcCompilationCallback)
+    EmittedAddr = (intptr_t)Fn;
+  else
+    EmittedAddr = (intptr_t)SparcCompilationCallback;
+
+  // sethi %hi(EmittedAddr), %g1
+  // jmp   %g1+%lo(EmittedAddr), %g1
+  // nop
+
+  JCE.emitWordBE(SETHI_INST(HI(EmittedAddr), 1));
+  JCE.emitWordBE(JMP_INST(1, LO(EmittedAddr), 1));
+  JCE.emitWordBE(NOP_INST);
+
+  sys::Memory::InvalidateInstructionCache(Addr, 12);
+  if (!sys::Memory::setRangeExecutable(Addr, 12))
+    llvm_unreachable("ERROR: Unable to mark stub executable.");
+
+  return Addr;
+}
+
+TargetJITInfo::LazyResolverFn
+SparcJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return SparcCompilationCallback;
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+void SparcJITInfo::relocate(void *Function, MachineRelocation *MR,
+                            unsigned NumRelocs, unsigned char *GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*) Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t) MR->getResultPointer();
+
+    switch ((SP::RelocationType) MR->getRelocationType()) {
+    case SP::reloc_sparc_hi:
+      ResultPtr = (ResultPtr >> 10) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_lo:
+      ResultPtr = (ResultPtr & 0x3ff);
+      break;
+
+    case SP::reloc_sparc_pc30:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffffff;
+      break;
+
+    case SP::reloc_sparc_pc22:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x3fffff;
+      break;
+
+    case SP::reloc_sparc_pc19:
+      ResultPtr = ((ResultPtr - (intptr_t)RelocPos) >> 2) & 0x7ffff;
+      break;
+    }
+    *((unsigned*) RelocPos) |= (unsigned) ResultPtr;
+  }
+}
diff --git a/lib/Target/Sparc/SparcJITInfo.h b/lib/Target/Sparc/SparcJITInfo.h
new file mode 100644
index 0000000..9c6e488
--- /dev/null
+++ b/lib/Target/Sparc/SparcJITInfo.h
@@ -0,0 +1,67 @@
+//==- SparcJITInfo.h - Sparc Implementation of the JIT Interface -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCJITINFO_H
+#define SPARCJITINFO_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+class SparcTargetMachine;
+
+class SparcJITInfo : public TargetJITInfo {
+
+  bool IsPIC;
+
+  public:
+  explicit SparcJITInfo()
+    :  IsPIC(false) {}
+
+  /// replaceMachineCodeForFunction - Make it so that calling the function
+  /// whose machine code is at OLD turns into a call to NEW, perhaps by
+  /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+  /// code.
+  ///
+  virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+  // getStubLayout - Returns the size and alignment of the largest call stub
+  // on Sparc.
+  virtual StubLayout getStubLayout();
+
+
+  /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+  /// small native function that simply calls the function at the specified
+  /// address.
+  virtual void *emitFunctionStub(const Function *F, void *Fn,
+                                 JITCodeEmitter &JCE);
+
+  /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+  virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+  /// relocate - Before the JIT can run a block of code that has been emitted,
+  /// it must rewrite the code to contain the actual addresses of any
+  /// referenced global symbols.
+  virtual void relocate(void *Function, MachineRelocation *MR,
+                        unsigned NumRelocs, unsigned char *GOTBase);
+
+  /// Initialize - Initialize internal stage for the function being JITted.
+  void Initialize(const MachineFunction &MF, bool isPIC) {
+    IsPIC = isPIC;
+  }
+
+};
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h
index 90c27a4..3783c16 100644
--- a/lib/Target/Sparc/SparcMachineFunctionInfo.h
+++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -28,11 +28,16 @@ namespace llvm {
     /// SRetReturnReg - Holds the virtual register into which the sret
     /// argument is passed.
     unsigned SRetReturnReg;
+
+    /// IsLeafProc - True if the function is a leaf procedure.
+    bool IsLeafProc;
   public:
     SparcMachineFunctionInfo()
-      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0) {}
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+        IsLeafProc(false) {}
     explicit SparcMachineFunctionInfo(MachineFunction &MF)
-      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0) {}
+      : GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
+        IsLeafProc(false) {}
 
     unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
     void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
@@ -42,6 +47,9 @@ namespace llvm {
 
     unsigned getSRetReturnReg() const { return SRetReturnReg; }
     void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+    void setLeafProc(bool rhs) { IsLeafProc = rhs; }
+    bool isLeafProc() const { return IsLeafProc; }
   };
 }
 
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index 3af4c61..c98613a 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -13,6 +13,7 @@
 
 #include "SparcRegisterInfo.h"
 #include "Sparc.h"
+#include "SparcMachineFunctionInfo.h"
 #include "SparcSubtarget.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/STLExtras.h"
@@ -20,6 +21,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
@@ -28,31 +30,59 @@
 
 using namespace llvm;
 
-SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
-                                     const TargetInstrInfo &tii)
-  : SparcGenRegisterInfo(SP::I7), Subtarget(st), TII(tii) {
+static cl::opt<bool>
+ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false),
+                    cl::desc("Reserve application registers (%g2-%g4)"));
+
+SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st)
+  : SparcGenRegisterInfo(SP::I7), Subtarget(st) {
 }
 
 const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
                                                                          const {
-  static const uint16_t CalleeSavedRegs[] = { 0 };
-  return CalleeSavedRegs;
+  return CSR_SaveList;
+}
+
+const uint32_t*
+SparcRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  return CSR_RegMask;
+}
+
+const uint32_t*
+SparcRegisterInfo::getRTCallPreservedMask(CallingConv::ID CC) const {
+  return RTCSR_RegMask;
 }
 
 BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   // FIXME: G1 reserved for now for large imm generation by frame code.
   Reserved.set(SP::G1);
-  Reserved.set(SP::G2);
-  Reserved.set(SP::G3);
-  Reserved.set(SP::G4);
+
+  // G1-G4 can be used in applications.
+  if (ReserveAppRegisters) {
+    Reserved.set(SP::G2);
+    Reserved.set(SP::G3);
+    Reserved.set(SP::G4);
+  }
+  // G5 is not reserved in 64 bit mode.
+  if (!Subtarget.is64Bit())
+    Reserved.set(SP::G5);
+
   Reserved.set(SP::O6);
   Reserved.set(SP::I6);
   Reserved.set(SP::I7);
   Reserved.set(SP::G0);
-  Reserved.set(SP::G5);
   Reserved.set(SP::G6);
   Reserved.set(SP::G7);
+
+  // Unaliased double registers are not available in non-V9 targets.
+  if (!Subtarget.isV9()) {
+    for (unsigned n = 0; n != 16; ++n) {
+      for (MCRegAliasIterator AI(SP::D16 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
+
   return Reserved;
 }
 
@@ -62,6 +92,62 @@ SparcRegisterInfo::getPointerRegClass(const MachineFunction &MF,
   return Subtarget.is64Bit() ? &SP::I64RegsRegClass : &SP::IntRegsRegClass;
 }
 
+static void replaceFI(MachineFunction &MF,
+                      MachineBasicBlock::iterator II,
+                      MachineInstr &MI,
+                      DebugLoc dl,
+                      unsigned FIOperandNum, int Offset,
+                      unsigned FramePtr)
+{
+  // Replace frame index with a frame pointer reference.
+  if (Offset >= -4096 && Offset <= 4095) {
+    // If the offset is small enough to fit in the immediate field, directly
+    // encode it.
+    MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+
+  // FIXME: it would be better to scavenge a register here instead of
+  // reserving G1 all of the time.
+  if (Offset >= 0) {
+    // Emit nonnegaive immediates with sethi + or.
+    // sethi %hi(Offset), %g1
+    // add %g1, %fp, %g1
+    // Insert G1+%lo(offset) into the user.
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+      .addImm(HI22(Offset));
+
+
+    // Emit G1 = G1 + I6
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+      .addReg(FramePtr);
+    // Insert: G1+%lo(offset) into the user.
+    MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(LO10(Offset));
+    return;
+  }
+
+  // Emit Negative numbers with sethi + xor
+  // sethi %hix(Offset), %g1
+  // xor  %g1, %lox(offset), %g1
+  // add %g1, %fp, %g1
+  // Insert: G1 + 0 into the user.
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1)
+    .addImm(HIX22(Offset));
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::XORri), SP::G1)
+    .addReg(SP::G1).addImm(LOX10(Offset));
+
+  BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+    .addReg(FramePtr);
+  // Insert: G1+%lo(offset) into the user.
+  MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
+  MI.getOperand(FIOperandNum + 1).ChangeToImmediate(0);
+}
+
+
 void
 SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                        int SPAdj, unsigned FIOperandNum,
@@ -77,35 +163,49 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int64_t Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
                    MI.getOperand(FIOperandNum + 1).getImm() +
                    Subtarget.getStackPointerBias();
+  SparcMachineFunctionInfo *FuncInfo = MF.getInfo<SparcMachineFunctionInfo>();
+  unsigned FramePtr = SP::I6;
+  if (FuncInfo->isLeafProc()) {
+    // Use %sp and adjust offset if needed.
+    FramePtr = SP::O6;
+    int stackSize = MF.getFrameInfo()->getStackSize();
+    Offset += (stackSize) ? Subtarget.getAdjustedFrameSize(stackSize) : 0 ;
+  }
 
-  // Replace frame index with a frame pointer reference.
-  if (Offset >= -4096 && Offset <= 4095) {
-    // If the offset is small enough to fit in the immediate field, directly
-    // encode it.
-    MI.getOperand(FIOperandNum).ChangeToRegister(SP::I6, false);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
-  } else {
-    // Otherwise, emit a G1 = SETHI %hi(offset).  FIXME: it would be better to 
-    // scavenge a register here instead of reserving G1 all of the time.
-    unsigned OffHi = (unsigned)Offset >> 10U;
-    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
-    // Emit G1 = G1 + I6
-    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
-      .addReg(SP::I6);
-    // Insert: G1+%lo(offset) into the user.
-    MI.getOperand(FIOperandNum).ChangeToRegister(SP::G1, false);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset & ((1 << 10)-1));
+  if (!Subtarget.isV9() || !Subtarget.hasHardQuad()) {
+    if (MI.getOpcode() == SP::STQFri) {
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      unsigned SrcReg = MI.getOperand(2).getReg();
+      unsigned SrcEvenReg = getSubReg(SrcReg, SP::sub_even64);
+      unsigned SrcOddReg  = getSubReg(SrcReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::STDFri))
+        .addReg(FramePtr).addImm(0).addReg(SrcEvenReg);
+      replaceFI(MF, II, *StMI, dl, 0, Offset, FramePtr);
+      MI.setDesc(TII.get(SP::STDFri));
+      MI.getOperand(2).setReg(SrcOddReg);
+      Offset += 8;
+    } else if (MI.getOpcode() == SP::LDQFri) {
+      const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+      unsigned DestReg     = MI.getOperand(0).getReg();
+      unsigned DestEvenReg = getSubReg(DestReg, SP::sub_even64);
+      unsigned DestOddReg  = getSubReg(DestReg, SP::sub_odd64);
+      MachineInstr *StMI =
+        BuildMI(*MI.getParent(), II, dl, TII.get(SP::LDDFri), DestEvenReg)
+        .addReg(FramePtr).addImm(0);
+      replaceFI(MF, II, *StMI, dl, 1, Offset, FramePtr);
+
+      MI.setDesc(TII.get(SP::LDDFri));
+      MI.getOperand(0).setReg(DestOddReg);
+      Offset += 8;
+    }
   }
+
+  replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FramePtr);
+
 }
 
 unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return SP::I6;
 }
 
-unsigned SparcRegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned SparcRegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index f91df53..00b5a98 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -27,12 +27,14 @@ class Type;
 
 struct SparcRegisterInfo : public SparcGenRegisterInfo {
   SparcSubtarget &Subtarget;
-  const TargetInstrInfo &TII;
 
-  SparcRegisterInfo(SparcSubtarget &st, const TargetInstrInfo &tii);
+  SparcRegisterInfo(SparcSubtarget &st);
 
   /// Code Generation virtual methods...
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const uint32_t* getCallPreservedMask(CallingConv::ID CC) const;
+
+  const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
 
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
@@ -48,10 +50,6 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
 
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const;
-
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
index 497e7c5..2a575c0 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -8,11 +8,11 @@
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-//  Declarations that describe the Sparc register file 
+//  Declarations that describe the Sparc register file
 //===----------------------------------------------------------------------===//
 
-class SparcReg<string n> : Register<n> {
-  field bits<5> Num;
+class SparcReg<bits<16> Enc, string n> : Register<n> {
+  let HWEncoding = Enc;
   let Namespace = "SP";
 }
 
@@ -21,27 +21,33 @@ class SparcCtrlReg<string n>: Register<n> {
 }
 
 let Namespace = "SP" in {
-def sub_even : SubRegIndex;
-def sub_odd  : SubRegIndex;
+def sub_even : SubRegIndex<32>;
+def sub_odd  : SubRegIndex<32, 32>;
+def sub_even64 : SubRegIndex<64>;
+def sub_odd64  : SubRegIndex<64, 64>;
 }
 
 // Registers are identified with 5-bit ID numbers.
 // Ri - 32-bit integer registers
-class Ri<bits<5> num, string n> : SparcReg<n> {
-  let Num = num;
-}
+class Ri<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
 // Rf - 32-bit floating-point registers
-class Rf<bits<5> num, string n> : SparcReg<n> {
-  let Num = num;
-}
+class Rf<bits<16> Enc, string n> : SparcReg<Enc, n>;
+
 // Rd - Slots in the FP register file for 64-bit floating-point values.
-class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
-  let Num = num;
+class Rd<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
   let SubRegs = subregs;
   let SubRegIndices = [sub_even, sub_odd];
   let CoveredBySubRegs = 1;
 }
 
+// Rq - Slots in the FP register file for 128-bit floating-point values.
+class Rq<bits<16> Enc, string n, list<Register> subregs> : SparcReg<Enc, n> {
+  let SubRegs = subregs;
+  let SubRegIndices = [sub_even64, sub_odd64];
+  let CoveredBySubRegs = 1;
+}
+
 // Control Registers
 def ICC : SparcCtrlReg<"ICC">; // This represents icc and xcc in 64-bit code.
 def FCC : SparcCtrlReg<"FCC">;
@@ -52,68 +58,68 @@ def Y : SparcCtrlReg<"Y">;
 // Integer registers
 def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
 def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
-def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>; 
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>;
 def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
 def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
-def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>; 
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>;
 def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
 def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
 def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
 def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
-def O2 : Ri<10, "O2">, DwarfRegNum<[10]>; 
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>;
 def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
 def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
-def O5 : Ri<13, "O5">, DwarfRegNum<[13]>; 
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>;
 def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
 def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
 def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
 def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
-def L2 : Ri<18, "L2">, DwarfRegNum<[18]>; 
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>;
 def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
 def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
-def L5 : Ri<21, "L5">, DwarfRegNum<[21]>; 
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>;
 def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
 def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
 def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
 def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
-def I2 : Ri<26, "I2">, DwarfRegNum<[26]>; 
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>;
 def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
 def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
-def I5 : Ri<29, "I5">, DwarfRegNum<[29]>; 
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>;
 def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
 def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
 
 // Floating-point registers
 def F0  : Rf< 0,  "F0">, DwarfRegNum<[32]>;
 def F1  : Rf< 1,  "F1">, DwarfRegNum<[33]>;
-def F2  : Rf< 2,  "F2">, DwarfRegNum<[34]>; 
+def F2  : Rf< 2,  "F2">, DwarfRegNum<[34]>;
 def F3  : Rf< 3,  "F3">, DwarfRegNum<[35]>;
 def F4  : Rf< 4,  "F4">, DwarfRegNum<[36]>;
-def F5  : Rf< 5,  "F5">, DwarfRegNum<[37]>; 
+def F5  : Rf< 5,  "F5">, DwarfRegNum<[37]>;
 def F6  : Rf< 6,  "F6">, DwarfRegNum<[38]>;
 def F7  : Rf< 7,  "F7">, DwarfRegNum<[39]>;
-def F8  : Rf< 8,  "F8">, DwarfRegNum<[40]>; 
+def F8  : Rf< 8,  "F8">, DwarfRegNum<[40]>;
 def F9  : Rf< 9,  "F9">, DwarfRegNum<[41]>;
 def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
-def F11 : Rf<11, "F11">, DwarfRegNum<[43]>; 
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>;
 def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
 def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
-def F14 : Rf<14, "F14">, DwarfRegNum<[46]>; 
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>;
 def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
 def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
-def F17 : Rf<17, "F17">, DwarfRegNum<[49]>; 
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>;
 def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
 def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
-def F20 : Rf<20, "F20">, DwarfRegNum<[52]>; 
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>;
 def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
 def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
 def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
 def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
 def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
-def F26 : Rf<26, "F26">, DwarfRegNum<[58]>; 
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>;
 def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
 def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
-def F29 : Rf<29, "F29">, DwarfRegNum<[61]>; 
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>;
 def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
 def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
 
@@ -135,6 +141,43 @@ def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[85]>;
 def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[86]>;
 def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
 
+// Unaliased double precision floating point registers.
+// FIXME: Define DwarfRegNum for these registers.
+def D16 : SparcReg< 1, "F32">;
+def D17 : SparcReg< 3, "F34">;
+def D18 : SparcReg< 5, "F36">;
+def D19 : SparcReg< 7, "F38">;
+def D20 : SparcReg< 9, "F40">;
+def D21 : SparcReg<11, "F42">;
+def D22 : SparcReg<13, "F44">;
+def D23 : SparcReg<15, "F46">;
+def D24 : SparcReg<17, "F48">;
+def D25 : SparcReg<19, "F50">;
+def D26 : SparcReg<21, "F52">;
+def D27 : SparcReg<23, "F54">;
+def D28 : SparcReg<25, "F56">;
+def D29 : SparcReg<27, "F58">;
+def D30 : SparcReg<29, "F60">;
+def D31 : SparcReg<31, "F62">;
+
+// Aliases of the F* registers used to hold 128-bit for values (long doubles).
+def Q0  : Rq< 0,  "F0", [D0,   D1]>;
+def Q1  : Rq< 4,  "F4", [D2,   D3]>;
+def Q2  : Rq< 8,  "F8", [D4,   D5]>;
+def Q3  : Rq<12, "F12", [D6,   D7]>;
+def Q4  : Rq<16, "F16", [D8,   D9]>;
+def Q5  : Rq<20, "F20", [D10, D11]>;
+def Q6  : Rq<24, "F24", [D12, D13]>;
+def Q7  : Rq<28, "F28", [D14, D15]>;
+def Q8  : Rq< 1, "F32", [D16, D17]>;
+def Q9  : Rq< 5, "F36", [D18, D19]>;
+def Q10 : Rq< 9, "F40", [D20, D21]>;
+def Q11 : Rq<13, "F44", [D22, D23]>;
+def Q12 : Rq<17, "F48", [D24, D25]>;
+def Q13 : Rq<21, "F52", [D26, D27]>;
+def Q14 : Rq<25, "F56", [D28, D29]>;
+def Q15 : Rq<29, "F60", [D30, D31]>;
+
 // Register classes.
 //
 // FIXME: the register order should be defined in terms of the preferred
@@ -144,19 +187,10 @@ def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[87]>;
 // register class for that. The i64 type is included here to allow i64 patterns
 // using the integer instructions.
 def IntRegs : RegisterClass<"SP", [i32, i64], 32,
-                            (add L0, L1, L2, L3, L4, L5, L6,
-                                 L7, I0, I1, I2, I3, I4, I5,
-                                 O0, O1, O2, O3, O4, O5, O7,
-                                 G1,
-                                 // Non-allocatable regs:
-                                 G2, G3, G4, // FIXME: OK for use only in
-                                             // applications, not libraries.
-                                 O6, // stack ptr
-                                 I6, // frame ptr
-                                 I7, // return address
-                                 G0, // constant zero
-                                 G5, G6, G7 // reserved for kernel
-                                 )>;
+                            (add (sequence "I%u", 0, 7),
+                                 (sequence "G%u", 0, 7),
+                                 (sequence "L%u", 0, 7),
+                                 (sequence "O%u", 0, 7))>;
 
 // Register class for 64-bit mode, with a 64-bit spill slot size.
 // These are the same as the 32-bit registers, so TableGen will consider this
@@ -167,4 +201,6 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>;
 // Floating point register classes.
 def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>;
 
-def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>;
+def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>;
+
+def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>;
diff --git a/lib/Target/Sparc/SparcRelocations.h b/lib/Target/Sparc/SparcRelocations.h
new file mode 100644
index 0000000..388cfe7
--- /dev/null
+++ b/lib/Target/Sparc/SparcRelocations.h
@@ -0,0 +1,41 @@
+//===-- SparcRelocations.h - Sparc Code Relocations -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Sparc target-specific relocation types
+// (for relocation-model=static).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_RELOCATIONS_H
+#define SPARC_RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace SP {
+    enum RelocationType {
+      // reloc_sparc_hi - upper 22 bits
+      reloc_sparc_hi = 1,
+
+      // reloc_sparc_lo - lower 10 bits
+      reloc_sparc_lo = 2,
+
+      // reloc_sparc_pc30 - pc rel. 30 bits for call
+      reloc_sparc_pc30 = 3,
+
+     // reloc_sparc_pc22 - pc rel. 22 bits for branch
+      reloc_sparc_pc22 = 4,
+
+      // reloc_sparc_pc22 - pc rel. 19 bits for branch with icc/xcc
+      reloc_sparc_pc19 = 5
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index e5b2aeb..7d09d0e 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -13,6 +13,7 @@
 
 #include "SparcSubtarget.h"
 #include "Sparc.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
@@ -29,8 +30,9 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
   IsV9(false),
   V8DeprecatedInsts(false),
   IsVIS(false),
-  Is64Bit(is64Bit) {
-  
+  Is64Bit(is64Bit),
+  HasHardQuad(false) {
+
   // Determine default and user specified characteristics
   std::string CPUName = CPU;
   if (CPUName.empty()) {
@@ -44,3 +46,30 @@ SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
   // Parse features string.
   ParseSubtargetFeatures(CPUName, FS);
 }
+
+
+int SparcSubtarget::getAdjustedFrameSize(int frameSize) const {
+
+  if (is64Bit()) {
+    // All 64-bit stack frames must be 16-byte aligned, and must reserve space
+    // for spilling the 16 window registers at %sp+BIAS..%sp+BIAS+128.
+    frameSize += 128;
+    // Frames with calls must also reserve space for 6 outgoing arguments
+    // whether they are used or not. LowerCall_64 takes care of that.
+    assert(frameSize % 16 == 0 && "Stack size not 16-byte aligned");
+  } else {
+    // Emit the correct save instruction based on the number of bytes in
+    // the frame. Minimum stack frame size according to V8 ABI is:
+    //   16 words for register window spill
+    //    1 word for address of returned aggregate-value
+    // +  6 words for passing parameters on the stack
+    // ----------
+    //   23 words * 4 bytes per word = 92 bytes
+    frameSize += 92;
+
+    // Round up to next doubleword boundary -- a double-word boundary
+    // is required by the ABI.
+    frameSize = RoundUpToAlignment(frameSize, 8);
+  }
+  return frameSize;
+}
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
index b94dd11..0f81cc9 100644
--- a/lib/Target/Sparc/SparcSubtarget.h
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -29,7 +29,8 @@ class SparcSubtarget : public SparcGenSubtargetInfo {
   bool V8DeprecatedInsts;
   bool IsVIS;
   bool Is64Bit;
-  
+  bool HasHardQuad;
+
 public:
   SparcSubtarget(const std::string &TT, const std::string &CPU,
                  const std::string &FS, bool is64bit);
@@ -37,11 +38,12 @@ public:
   bool isV9() const { return IsV9; }
   bool isVIS() const { return IsVIS; }
   bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
-  
-  /// ParseSubtargetFeatures - Parses features string setting specified 
+  bool hasHardQuad() const { return HasHardQuad; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-  
+
   bool is64Bit() const { return Is64Bit; }
   std::string getDataLayout() const {
     const char *p;
@@ -58,6 +60,12 @@ public:
   int64_t getStackPointerBias() const {
     return is64Bit() ? 2047 : 0;
   }
+
+  /// Given a actual stack size as determined by FrameInfo, this function
+  /// returns adjusted framesize which includes space for register window
+  /// spills and arguments.
+  int getAdjustedFrameSize(int stackSize) const;
+
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 60bceb7..0f93674 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -37,6 +37,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
     InstrInfo(Subtarget),
     TLInfo(*this), TSInfo(*this),
     FrameLowering(Subtarget) {
+  initAsmInfo();
 }
 
 namespace {
@@ -64,11 +65,17 @@ bool SparcPassConfig::addInstSelector() {
   return false;
 }
 
+bool SparcTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                        JITCodeEmitter &JCE) {
+  // Machine code emitter pass for Sparc.
+  PM.add(createSparcJITCodeEmitterPass(*this, JCE));
+  return false;
+}
+
 /// addPreEmitPass - This pass may be implemented by targets that want to run
 /// passes immediately before machine code is emitted.  This should return
 /// true if -print-machineinstrs should print out the code after the passes.
 bool SparcPassConfig::addPreEmitPass(){
-  addPass(createSparcFPMoverPass(getSparcTargetMachine()));
   addPass(createSparcDelaySlotFillerPass(getSparcTargetMachine()));
   return true;
 }
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 081075d..8c9bcd3 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -17,6 +17,7 @@
 #include "SparcFrameLowering.h"
 #include "SparcISelLowering.h"
 #include "SparcInstrInfo.h"
+#include "SparcJITInfo.h"
 #include "SparcSelectionDAGInfo.h"
 #include "SparcSubtarget.h"
 #include "llvm/IR/DataLayout.h"
@@ -32,6 +33,7 @@ class SparcTargetMachine : public LLVMTargetMachine {
   SparcTargetLowering TLInfo;
   SparcSelectionDAGInfo TSInfo;
   SparcFrameLowering FrameLowering;
+  SparcJITInfo JITInfo;
 public:
   SparcTargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -52,10 +54,14 @@ public:
   virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const {
     return &TSInfo;
   }
+  virtual SparcJITInfo *getJITInfo() {
+    return &JITInfo;
+  }
   virtual const DataLayout       *getDataLayout() const { return &DL; }
 
   // Pass Pipeline Configuration
   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
 };
 
 /// SparcV8TargetMachine - Sparc 32-bit target machine
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
index bb71463..4eea163 100644
--- a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -15,7 +15,9 @@ using namespace llvm;
 Target llvm::TheSparcTarget;
 Target llvm::TheSparcV9Target;
 
-extern "C" void LLVMInitializeSparcTargetInfo() { 
-  RegisterTarget<Triple::sparc> X(TheSparcTarget, "sparc", "Sparc");
-  RegisterTarget<Triple::sparcv9> Y(TheSparcV9Target, "sparcv9", "Sparc V9");
+extern "C" void LLVMInitializeSparcTargetInfo() {
+  RegisterTarget<Triple::sparc, /*HasJIT=*/ true>
+    X(TheSparcTarget, "sparc", "Sparc");
+  RegisterTarget<Triple::sparcv9, /*HasJIT=*/ true>
+    Y(TheSparcV9Target, "sparcv9", "Sparc V9");
 }
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index c7725a1..763f40c 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -8,6 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -28,21 +30,29 @@ static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
 }
 
 namespace {
+enum RegisterKind {
+  GR32Reg,
+  GRH32Reg,
+  GR64Reg,
+  GR128Reg,
+  ADDR32Reg,
+  ADDR64Reg,
+  FP32Reg,
+  FP64Reg,
+  FP128Reg
+};
+
+enum MemoryKind {
+  BDMem,
+  BDXMem,
+  BDLMem
+};
+
 class SystemZOperand : public MCParsedAsmOperand {
 public:
-  enum RegisterKind {
-    GR32Reg,
-    GR64Reg,
-    GR128Reg,
-    ADDR32Reg,
-    ADDR64Reg,
-    FP32Reg,
-    FP64Reg,
-    FP128Reg
-  };
-
 private:
   enum OperandKind {
+    KindInvalid,
     KindToken,
     KindReg,
     KindAccessReg,
@@ -59,7 +69,15 @@ private:
     unsigned Length;
   };
 
-  // LLVM register Num, which has kind Kind.
+  // LLVM register Num, which has kind Kind.  In some ways it might be
+  // easier for this class to have a register bank (general, floating-point
+  // or access) and a raw register number (0-15).  This would postpone the
+  // interpretation of the operand to the add*() methods and avoid the need
+  // for context-dependent parsing.  However, we do things the current way
+  // because of the virtual getReg() method, which needs to distinguish
+  // between (say) %r0 used as a single register and %r0 used as a pair.
+  // Context-dependent parsing can also give us slightly better error
+  // messages when invalid pairs like %r1 are used.
   struct RegOp {
     RegisterKind Kind;
     unsigned Num;
@@ -67,12 +85,15 @@ private:
 
   // Base + Disp + Index, where Base and Index are LLVM registers or 0.
   // RegKind says what type the registers have (ADDR32Reg or ADDR64Reg).
+  // Length is the operand length for D(L,B)-style operands, otherwise
+  // it is null.
   struct MemOp {
     unsigned Base : 8;
     unsigned Index : 8;
     unsigned RegKind : 8;
     unsigned Unused : 8;
     const MCExpr *Disp;
+    const MCExpr *Length;
   };
 
   union {
@@ -99,6 +120,9 @@ private:
 
 public:
   // Create particular kinds of operand.
+  static SystemZOperand *createInvalid(SMLoc StartLoc, SMLoc EndLoc) {
+    return new SystemZOperand(KindInvalid, StartLoc, EndLoc);
+  }
   static SystemZOperand *createToken(StringRef Str, SMLoc Loc) {
     SystemZOperand *Op = new SystemZOperand(KindToken, Loc, Loc);
     Op->Token.Data = Str.data();
@@ -126,12 +150,14 @@ public:
   }
   static SystemZOperand *createMem(RegisterKind RegKind, unsigned Base,
                                    const MCExpr *Disp, unsigned Index,
-                                   SMLoc StartLoc, SMLoc EndLoc) {
+                                   const MCExpr *Length, SMLoc StartLoc,
+                                   SMLoc EndLoc) {
     SystemZOperand *Op = new SystemZOperand(KindMem, StartLoc, EndLoc);
     Op->Mem.RegKind = RegKind;
     Op->Mem.Base = Base;
     Op->Mem.Index = Index;
     Op->Mem.Disp = Disp;
+    Op->Mem.Length = Length;
     return Op;
   }
 
@@ -178,16 +204,20 @@ public:
   virtual bool isMem() const LLVM_OVERRIDE {
     return Kind == KindMem;
   }
-  bool isMem(RegisterKind RegKind, bool HasIndex) const {
+  bool isMem(RegisterKind RegKind, MemoryKind MemKind) const {
     return (Kind == KindMem &&
             Mem.RegKind == RegKind &&
-            (HasIndex || !Mem.Index));
+            (MemKind == BDXMem || !Mem.Index) &&
+            (MemKind == BDLMem) == (Mem.Length != 0));
+  }
+  bool isMemDisp12(RegisterKind RegKind, MemoryKind MemKind) const {
+    return isMem(RegKind, MemKind) && inRange(Mem.Disp, 0, 0xfff);
   }
-  bool isMemDisp12(RegisterKind RegKind, bool HasIndex) const {
-    return isMem(RegKind, HasIndex) && inRange(Mem.Disp, 0, 0xfff);
+  bool isMemDisp20(RegisterKind RegKind, MemoryKind MemKind) const {
+    return isMem(RegKind, MemKind) && inRange(Mem.Disp, -524288, 524287);
   }
-  bool isMemDisp20(RegisterKind RegKind, bool HasIndex) const {
-    return isMem(RegKind, HasIndex) && inRange(Mem.Disp, -524288, 524287);
+  bool isMemDisp12Len8(RegisterKind RegKind) const {
+    return isMemDisp12(RegKind, BDLMem) && inRange(Mem.Length, 1, 0x100);
   }
 
   // Override MCParsedAsmOperand.
@@ -223,9 +253,18 @@ public:
     addExpr(Inst, Mem.Disp);
     Inst.addOperand(MCOperand::CreateReg(Mem.Index));
   }
+  void addBDLAddrOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 3 && "Invalid number of operands");
+    assert(Kind == KindMem && "Invalid operand type");
+    Inst.addOperand(MCOperand::CreateReg(Mem.Base));
+    addExpr(Inst, Mem.Disp);
+    addExpr(Inst, Mem.Length);
+  }
 
   // Used by the TableGen code to check for particular operand types.
   bool isGR32() const { return isReg(GR32Reg); }
+  bool isGRH32() const { return isReg(GRH32Reg); }
+  bool isGRX32() const { return false; }
   bool isGR64() const { return isReg(GR64Reg); }
   bool isGR128() const { return isReg(GR128Reg); }
   bool isADDR32() const { return isReg(ADDR32Reg); }
@@ -234,12 +273,13 @@ public:
   bool isFP32() const { return isReg(FP32Reg); }
   bool isFP64() const { return isReg(FP64Reg); }
   bool isFP128() const { return isReg(FP128Reg); }
-  bool isBDAddr32Disp12() const { return isMemDisp12(ADDR32Reg, false); }
-  bool isBDAddr32Disp20() const { return isMemDisp20(ADDR32Reg, false); }
-  bool isBDAddr64Disp12() const { return isMemDisp12(ADDR64Reg, false); }
-  bool isBDAddr64Disp20() const { return isMemDisp20(ADDR64Reg, false); }
-  bool isBDXAddr64Disp12() const { return isMemDisp12(ADDR64Reg, true); }
-  bool isBDXAddr64Disp20() const { return isMemDisp20(ADDR64Reg, true); }
+  bool isBDAddr32Disp12() const { return isMemDisp12(ADDR32Reg, BDMem); }
+  bool isBDAddr32Disp20() const { return isMemDisp20(ADDR32Reg, BDMem); }
+  bool isBDAddr64Disp12() const { return isMemDisp12(ADDR64Reg, BDMem); }
+  bool isBDAddr64Disp20() const { return isMemDisp20(ADDR64Reg, BDMem); }
+  bool isBDXAddr64Disp12() const { return isMemDisp12(ADDR64Reg, BDXMem); }
+  bool isBDXAddr64Disp20() const { return isMemDisp20(ADDR64Reg, BDXMem); }
+  bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
   bool isU4Imm() const { return isImm(0, 15); }
   bool isU6Imm() const { return isImm(0, 63); }
   bool isU8Imm() const { return isImm(0, 255); }
@@ -250,46 +290,6 @@ public:
   bool isS32Imm() const { return isImm(-(1LL << 31), (1LL << 31) - 1); }
 };
 
-// Maps of asm register numbers to LLVM register numbers, with 0 indicating
-// an invalid register.  We don't use register class directly because that
-// specifies the allocation order.
-static const unsigned GR32Regs[] = {
-  SystemZ::R0W, SystemZ::R1W, SystemZ::R2W, SystemZ::R3W,
-  SystemZ::R4W, SystemZ::R5W, SystemZ::R6W, SystemZ::R7W,
-  SystemZ::R8W, SystemZ::R9W, SystemZ::R10W, SystemZ::R11W,
-  SystemZ::R12W, SystemZ::R13W, SystemZ::R14W, SystemZ::R15W
-};
-static const unsigned GR64Regs[] = {
-  SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
-  SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
-  SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
-  SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D
-};
-static const unsigned GR128Regs[] = {
-  SystemZ::R0Q, 0, SystemZ::R2Q, 0,
-  SystemZ::R4Q, 0, SystemZ::R6Q, 0,
-  SystemZ::R8Q, 0, SystemZ::R10Q, 0,
-  SystemZ::R12Q, 0, SystemZ::R14Q, 0
-};
-static const unsigned FP32Regs[] = {
-  SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
-  SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
-  SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
-  SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S
-};
-static const unsigned FP64Regs[] = {
-  SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
-  SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
-  SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
-  SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D
-};
-static const unsigned FP128Regs[] = {
-  SystemZ::F0Q, SystemZ::F1Q, 0, 0,
-  SystemZ::F4Q, SystemZ::F5Q, 0, 0,
-  SystemZ::F8Q, SystemZ::F9Q, 0, 0,
-  SystemZ::F12Q, SystemZ::F13Q, 0, 0
-};
-
 class SystemZAsmParser : public MCTargetAsmParser {
 #define GET_ASSEMBLER_HEADER
 #include "SystemZGenAsmMatcher.inc"
@@ -297,35 +297,42 @@ class SystemZAsmParser : public MCTargetAsmParser {
 private:
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  enum RegisterGroup {
+    RegGR,
+    RegFP,
+    RegAccess
+  };
   struct Register {
-    char Prefix;
-    unsigned Number;
+    RegisterGroup Group;
+    unsigned Num;
     SMLoc StartLoc, EndLoc;
   };
 
   bool parseRegister(Register &Reg);
 
-  OperandMatchResultTy
-  parseRegister(Register &Reg, char Prefix, const unsigned *Regs,
-                bool IsAddress = false);
+  bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs,
+                     bool IsAddress = false);
 
   OperandMatchResultTy
   parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                char Prefix, const unsigned *Regs,
-                SystemZOperand::RegisterKind Kind,
-                bool IsAddress = false);
+                RegisterGroup Group, const unsigned *Regs, RegisterKind Kind);
+
+  bool parseAddress(unsigned &Base, const MCExpr *&Disp,
+                    unsigned &Index, const MCExpr *&Length,
+                    const unsigned *Regs, RegisterKind RegKind);
 
   OperandMatchResultTy
   parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-               const unsigned *Regs, SystemZOperand::RegisterKind RegKind,
-               bool HasIndex);
+               const unsigned *Regs, RegisterKind RegKind,
+               MemoryKind MemKind);
 
   bool parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                     StringRef Mnemonic);
 
 public:
-  SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser(), STI(sti), Parser(parser) {
+  SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+                   const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(sti), Parser(parser) {
     MCAsmParserExtension::Initialize(Parser);
 
     // Initialize the set of available features.
@@ -349,25 +356,31 @@ public:
   // Used by the TableGen code to parse particular operand types.
   OperandMatchResultTy
   parseGR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'r', GR32Regs, SystemZOperand::GR32Reg);
+    return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
+  }
+  OperandMatchResultTy
+  parseGRH32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
+  }
+  OperandMatchResultTy
+  parseGRX32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    llvm_unreachable("GRX32 should only be used for pseudo instructions");
   }
   OperandMatchResultTy
   parseGR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'r', GR64Regs, SystemZOperand::GR64Reg);
+    return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
   }
   OperandMatchResultTy
   parseGR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'r', GR128Regs, SystemZOperand::GR128Reg);
+    return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg);
   }
   OperandMatchResultTy
   parseADDR32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'r', GR32Regs, SystemZOperand::ADDR32Reg,
-                         true);
+    return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg);
   }
   OperandMatchResultTy
   parseADDR64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'r', GR64Regs, SystemZOperand::ADDR64Reg,
-                         true);
+    return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg);
   }
   OperandMatchResultTy
   parseADDR128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
@@ -375,30 +388,45 @@ public:
   }
   OperandMatchResultTy
   parseFP32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'f', FP32Regs, SystemZOperand::FP32Reg);
+    return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg);
   }
   OperandMatchResultTy
   parseFP64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'f', FP64Regs, SystemZOperand::FP64Reg);
+    return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg);
   }
   OperandMatchResultTy
   parseFP128(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseRegister(Operands, 'f', FP128Regs, SystemZOperand::FP128Reg);
+    return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg);
   }
   OperandMatchResultTy
   parseBDAddr32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseAddress(Operands, GR32Regs, SystemZOperand::ADDR32Reg, false);
+    return parseAddress(Operands, SystemZMC::GR32Regs, ADDR32Reg, BDMem);
   }
   OperandMatchResultTy
   parseBDAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseAddress(Operands, GR64Regs, SystemZOperand::ADDR64Reg, false);
+    return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDMem);
   }
   OperandMatchResultTy
   parseBDXAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return parseAddress(Operands, GR64Regs, SystemZOperand::ADDR64Reg, true);
+    return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDXMem);
+  }
+  OperandMatchResultTy
+  parseBDLAddr64(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    return parseAddress(Operands, SystemZMC::GR64Regs, ADDR64Reg, BDLMem);
   }
   OperandMatchResultTy
   parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  OperandMatchResultTy
+  parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+             int64_t MinVal, int64_t MaxVal);
+  OperandMatchResultTy
+  parsePCRel16(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    return parsePCRel(Operands, -(1LL << 16), (1LL << 16) - 1);
+  }
+  OperandMatchResultTy
+  parsePCRel32(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1);
+  }
 };
 }
 
@@ -417,122 +445,160 @@ bool SystemZAsmParser::parseRegister(Register &Reg) {
 
   // Eat the % prefix.
   if (Parser.getTok().isNot(AsmToken::Percent))
-    return true;
+    return Error(Parser.getTok().getLoc(), "register expected");
   Parser.Lex();
 
   // Expect a register name.
   if (Parser.getTok().isNot(AsmToken::Identifier))
-    return true;
+    return Error(Reg.StartLoc, "invalid register");
 
-  // Check the prefix.
+  // Check that there's a prefix.
   StringRef Name = Parser.getTok().getString();
   if (Name.size() < 2)
-    return true;
-  Reg.Prefix = Name[0];
+    return Error(Reg.StartLoc, "invalid register");
+  char Prefix = Name[0];
 
   // Treat the rest of the register name as a register number.
-  if (Name.substr(1).getAsInteger(10, Reg.Number))
-    return true;
+  if (Name.substr(1).getAsInteger(10, Reg.Num))
+    return Error(Reg.StartLoc, "invalid register");
+
+  // Look for valid combinations of prefix and number.
+  if (Prefix == 'r' && Reg.Num < 16)
+    Reg.Group = RegGR;
+  else if (Prefix == 'f' && Reg.Num < 16)
+    Reg.Group = RegFP;
+  else if (Prefix == 'a' && Reg.Num < 16)
+    Reg.Group = RegAccess;
+  else
+    return Error(Reg.StartLoc, "invalid register");
 
   Reg.EndLoc = Parser.getTok().getLoc();
   Parser.Lex();
   return false;
 }
 
-// Parse a register with prefix Prefix and convert it to LLVM numbering.
-// Regs maps asm register numbers to LLVM register numbers, with zero
-// entries indicating an invalid register.  IsAddress says whether the
-// register appears in an address context.
-SystemZAsmParser::OperandMatchResultTy
-SystemZAsmParser::parseRegister(Register &Reg, char Prefix,
-                                const unsigned *Regs, bool IsAddress) {
+// Parse a register of group Group.  If Regs is nonnull, use it to map
+// the raw register number to LLVM numbering, with zero entries indicating
+// an invalid register.  IsAddress says whether the register appears in an
+// address context.
+bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
+                                     const unsigned *Regs, bool IsAddress) {
   if (parseRegister(Reg))
-    return MatchOperand_NoMatch;
-  if (Reg.Prefix != Prefix || Reg.Number > 15 || Regs[Reg.Number] == 0) {
-    Error(Reg.StartLoc, "invalid register");
-    return MatchOperand_ParseFail;
-  }
-  if (Reg.Number == 0 && IsAddress) {
-    Error(Reg.StartLoc, "%r0 used in an address");
-    return MatchOperand_ParseFail;
-  }
-  Reg.Number = Regs[Reg.Number];
-  return MatchOperand_Success;
+    return true;
+  if (Reg.Group != Group)
+    return Error(Reg.StartLoc, "invalid operand for instruction");
+  if (Regs && Regs[Reg.Num] == 0)
+    return Error(Reg.StartLoc, "invalid register pair");
+  if (Reg.Num == 0 && IsAddress)
+    return Error(Reg.StartLoc, "%r0 used in an address");
+  if (Regs)
+    Reg.Num = Regs[Reg.Num];
+  return false;
 }
 
-// Parse a register and add it to Operands.  Prefix is 'r' for GPRs,
-// 'f' for FPRs, etc.  Regs maps asm register numbers to LLVM register numbers,
-// with zero entries indicating an invalid register.  Kind is the type of
-// register represented by Regs and IsAddress says whether the register is
-// being parsed in an address context, meaning that %r0 evaluates as 0.
+// Parse a register and add it to Operands.  The other arguments are as above.
 SystemZAsmParser::OperandMatchResultTy
 SystemZAsmParser::parseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                                char Prefix, const unsigned *Regs,
-                                SystemZOperand::RegisterKind Kind,
-                                bool IsAddress) {
+                                RegisterGroup Group, const unsigned *Regs,
+                                RegisterKind Kind) {
+  if (Parser.getTok().isNot(AsmToken::Percent))
+    return MatchOperand_NoMatch;
+
   Register Reg;
-  OperandMatchResultTy Result = parseRegister(Reg, Prefix, Regs, IsAddress);
-  if (Result == MatchOperand_Success)
-    Operands.push_back(SystemZOperand::createReg(Kind, Reg.Number,
-                                                 Reg.StartLoc, Reg.EndLoc));
-  return Result;
-}
+  bool IsAddress = (Kind == ADDR32Reg || Kind == ADDR64Reg);
+  if (parseRegister(Reg, Group, Regs, IsAddress))
+    return MatchOperand_ParseFail;
 
-// Parse a memory operand and add it to Operands.  Regs maps asm register
-// numbers to LLVM address registers and RegKind says what kind of address
-// register we're using (ADDR32Reg or ADDR64Reg).  HasIndex says whether
-// the address allows index registers.
-SystemZAsmParser::OperandMatchResultTy
-SystemZAsmParser::parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                               const unsigned *Regs,
-                               SystemZOperand::RegisterKind RegKind,
-                               bool HasIndex) {
-  SMLoc StartLoc = Parser.getTok().getLoc();
+  Operands.push_back(SystemZOperand::createReg(Kind, Reg.Num,
+                                               Reg.StartLoc, Reg.EndLoc));
+  return MatchOperand_Success;
+}
 
+// Parse a memory operand into Base, Disp, Index and Length.
+// Regs maps asm register numbers to LLVM register numbers and RegKind
+// says what kind of address register we're using (ADDR32Reg or ADDR64Reg).
+bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp,
+                                    unsigned &Index, const MCExpr *&Length,
+                                    const unsigned *Regs,
+                                    RegisterKind RegKind) {
   // Parse the displacement, which must always be present.
-  const MCExpr *Disp;
   if (getParser().parseExpression(Disp))
-    return MatchOperand_NoMatch;
+    return true;
 
   // Parse the optional base and index.
-  unsigned Index = 0;
-  unsigned Base = 0;
+  Index = 0;
+  Base = 0;
+  Length = 0;
   if (getLexer().is(AsmToken::LParen)) {
     Parser.Lex();
 
-    // Parse the first register.
-    Register Reg;
-    OperandMatchResultTy Result = parseRegister(Reg, 'r', GR64Regs, true);
-    if (Result != MatchOperand_Success)
-      return Result;
+    if (getLexer().is(AsmToken::Percent)) {
+      // Parse the first register and decide whether it's a base or an index.
+      Register Reg;
+      if (parseRegister(Reg, RegGR, Regs, RegKind))
+        return true;
+      if (getLexer().is(AsmToken::Comma))
+        Index = Reg.Num;
+      else
+        Base = Reg.Num;
+    } else {
+      // Parse the length.
+      if (getParser().parseExpression(Length))
+        return true;
+    }
 
-    // Check whether there's a second register.  If so, the one that we
-    // just parsed was the index.
+    // Check whether there's a second register.  It's the base if so.
     if (getLexer().is(AsmToken::Comma)) {
       Parser.Lex();
-
-      if (!HasIndex) {
-        Error(Reg.StartLoc, "invalid use of indexed addressing");
-        return MatchOperand_ParseFail;
-      }
-
-      Index = Reg.Number;
-      Result = parseRegister(Reg, 'r', GR64Regs, true);
-      if (Result != MatchOperand_Success)
-        return Result;
+      Register Reg;
+      if (parseRegister(Reg, RegGR, Regs, RegKind))
+        return true;
+      Base = Reg.Num;
     }
-    Base = Reg.Number;
 
     // Consume the closing bracket.
     if (getLexer().isNot(AsmToken::RParen))
-      return MatchOperand_NoMatch;
+      return Error(Parser.getTok().getLoc(), "unexpected token in address");
     Parser.Lex();
   }
+  return false;
+}
+
+// Parse a memory operand and add it to Operands.  The other arguments
+// are as above.
+SystemZAsmParser::OperandMatchResultTy
+SystemZAsmParser::parseAddress(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                               const unsigned *Regs, RegisterKind RegKind,
+                               MemoryKind MemKind) {
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  unsigned Base, Index;
+  const MCExpr *Disp;
+  const MCExpr *Length;
+  if (parseAddress(Base, Disp, Index, Length, Regs, RegKind))
+    return MatchOperand_ParseFail;
+
+  if (Index && MemKind != BDXMem)
+    {
+      Error(StartLoc, "invalid use of indexed addressing");
+      return MatchOperand_ParseFail;
+    }
+
+  if (Length && MemKind != BDLMem)
+    {
+      Error(StartLoc, "invalid use of length addressing");
+      return MatchOperand_ParseFail;
+    }
+
+  if (!Length && MemKind == BDLMem)
+    {
+      Error(StartLoc, "missing length in address");
+      return MatchOperand_ParseFail;
+    }
 
   SMLoc EndLoc =
     SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   Operands.push_back(SystemZOperand::createMem(RegKind, Base, Disp, Index,
-                                               StartLoc, EndLoc));
+                                               Length, StartLoc, EndLoc));
   return MatchOperand_Success;
 }
 
@@ -544,13 +610,14 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
                                      SMLoc &EndLoc) {
   Register Reg;
   if (parseRegister(Reg))
-    return Error(Reg.StartLoc, "register expected");
-  if (Reg.Prefix == 'r' && Reg.Number < 16)
-    RegNo = GR64Regs[Reg.Number];
-  else if (Reg.Prefix == 'f' && Reg.Number < 16)
-    RegNo = FP64Regs[Reg.Number];
+    return true;
+  if (Reg.Group == RegGR)
+    RegNo = SystemZMC::GR64Regs[Reg.Num];
+  else if (Reg.Group == RegFP)
+    RegNo = SystemZMC::FP64Regs[Reg.Num];
   else
-    return Error(Reg.StartLoc, "invalid register");
+    // FIXME: Access registers aren't modelled as LLVM registers yet.
+    return Error(Reg.StartLoc, "invalid operand for instruction");
   StartLoc = Reg.StartLoc;
   EndLoc = Reg.EndLoc;
   return false;
@@ -604,15 +671,33 @@ parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
   if (ResTy == MatchOperand_ParseFail)
     return true;
 
-  // The only other type of operand is an immediate.
-  const MCExpr *Expr;
+  // Check for a register.  All real register operands should have used
+  // a context-dependent parse routine, which gives the required register
+  // class.  The code is here to mop up other cases, like those where
+  // the instruction isn't recognized.
+  if (Parser.getTok().is(AsmToken::Percent)) {
+    Register Reg;
+    if (parseRegister(Reg))
+      return true;
+    Operands.push_back(SystemZOperand::createInvalid(Reg.StartLoc, Reg.EndLoc));
+    return false;
+  }
+
+  // The only other type of operand is an immediate or address.  As above,
+  // real address operands should have used a context-dependent parse routine,
+  // so we treat any plain expression as an immediate.
   SMLoc StartLoc = Parser.getTok().getLoc();
-  if (getParser().parseExpression(Expr))
+  unsigned Base, Index;
+  const MCExpr *Expr, *Length;
+  if (parseAddress(Base, Expr, Index, Length, SystemZMC::GR64Regs, ADDR64Reg))
     return true;
 
   SMLoc EndLoc =
     SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-  Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
+  if (Base || Index || Length)
+    Operands.push_back(SystemZOperand::createInvalid(StartLoc, EndLoc));
+  else
+    Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
   return false;
 }
 
@@ -671,15 +756,47 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 
 SystemZAsmParser::OperandMatchResultTy SystemZAsmParser::
 parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  Register Reg;
-  if (parseRegister(Reg))
+  if (Parser.getTok().isNot(AsmToken::Percent))
     return MatchOperand_NoMatch;
-  if (Reg.Prefix != 'a' || Reg.Number > 15) {
-    Error(Reg.StartLoc, "invalid register");
+
+  Register Reg;
+  if (parseRegister(Reg, RegAccess, 0))
     return MatchOperand_ParseFail;
+
+  Operands.push_back(SystemZOperand::createAccessReg(Reg.Num,
+                                                     Reg.StartLoc,
+                                                     Reg.EndLoc));
+  return MatchOperand_Success;
+}
+
+SystemZAsmParser::OperandMatchResultTy SystemZAsmParser::
+parsePCRel(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+           int64_t MinVal, int64_t MaxVal) {
+  MCContext &Ctx = getContext();
+  MCStreamer &Out = getStreamer();
+  const MCExpr *Expr;
+  SMLoc StartLoc = Parser.getTok().getLoc();
+  if (getParser().parseExpression(Expr))
+    return MatchOperand_NoMatch;
+
+  // For consistency with the GNU assembler, treat immediates as offsets
+  // from ".".
+  if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) {
+    int64_t Value = CE->getValue();
+    if ((Value & 1) || Value < MinVal || Value > MaxVal) {
+      Error(StartLoc, "offset out of range");
+      return MatchOperand_ParseFail;
+    }
+    MCSymbol *Sym = Ctx.CreateTempSymbol();
+    Out.EmitLabel(Sym);
+    const MCExpr *Base = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+                                                 Ctx);
+    Expr = Value == 0 ? Base : MCBinaryExpr::CreateAdd(Base, Expr, Ctx);
   }
-  Operands.push_back(SystemZOperand::createAccessReg(Reg.Number,
-                                                     Reg.StartLoc, Reg.EndLoc));
+
+  SMLoc EndLoc =
+    SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+  Operands.push_back(SystemZOperand::createImm(Expr, StartLoc, EndLoc));
   return MatchOperand_Success;
 }
 
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
index 67b17fc..d21c0a8 100644
--- a/lib/Target/SystemZ/CMakeLists.txt
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -4,6 +4,7 @@ tablegen(LLVM SystemZGenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM SystemZGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM SystemZGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM SystemZGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM SystemZGenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM SystemZGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
 tablegen(LLVM SystemZGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM SystemZGenRegisterInfo.inc -gen-register-info)
@@ -14,19 +15,25 @@ add_llvm_target(SystemZCodeGen
   SystemZAsmPrinter.cpp
   SystemZCallingConv.cpp
   SystemZConstantPoolValue.cpp
+  SystemZElimCompare.cpp
   SystemZFrameLowering.cpp
   SystemZISelDAGToDAG.cpp
   SystemZISelLowering.cpp
   SystemZInstrInfo.cpp
+  SystemZLongBranch.cpp
+  SystemZMachineFunctionInfo.cpp
   SystemZMCInstLower.cpp
   SystemZRegisterInfo.cpp
+  SystemZSelectionDAGInfo.cpp
+  SystemZShortenInst.cpp
   SystemZSubtarget.cpp
   SystemZTargetMachine.cpp
   )
 
-add_dependencies(LLVMSystemZCodeGen intrinsics_gen)
+add_dependencies(LLVMSystemZCodeGen SystemZCommonTableGen intrinsics_gen)
 
 add_subdirectory(AsmParser)
+add_subdirectory(Disassembler)
 add_subdirectory(InstPrinter)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/SystemZ/Disassembler/CMakeLists.txt b/lib/Target/SystemZ/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..5bc1859
--- /dev/null
+++ b/lib/Target/SystemZ/Disassembler/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZDisassembler
+  SystemZDisassembler.cpp
+  )
+
+add_dependencies(LLVMSystemZDisassembler SystemZCommonTableGen)
diff --git a/lib/Target/MBlaze/Disassembler/LLVMBuild.txt b/lib/Target/SystemZ/Disassembler/LLVMBuild.txt
index 28dd9dc..c3081f5 100644
--- a/lib/Target/MBlaze/Disassembler/LLVMBuild.txt
+++ b/lib/Target/SystemZ/Disassembler/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/MBlaze/Disassembler/LLVMBuild.txt -----------*- Conf -*--===;
+;===-- ./lib/Target/SystemZ/Disassembler/LLVMBuild.txt ---------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -17,7 +17,7 @@
 
 [component_0]
 type = Library
-name = MBlazeDisassembler
-parent = MBlaze
-required_libraries = MBlazeDesc MBlazeInfo MC Support
-add_to_library_groups = MBlaze
+name = SystemZDisassembler
+parent = SystemZ
+required_libraries = MC Support SystemZDesc SystemZInfo
+add_to_library_groups = SystemZ
diff --git a/lib/Target/MBlaze/MCTargetDesc/Makefile b/lib/Target/SystemZ/Disassembler/Makefile
index 71075ff..efc4cc8 100644
--- a/lib/Target/MBlaze/MCTargetDesc/Makefile
+++ b/lib/Target/SystemZ/Disassembler/Makefile
@@ -1,4 +1,4 @@
-##===- lib/Target/MBlaze/TargetDesc/Makefile ---------------*- Makefile -*-===##
+##===-- lib/Target/SystemZ/Disassembler/Makefile -----------*- Makefile -*-===##
 #
 #                     The LLVM Compiler Infrastructure
 #
@@ -8,9 +8,9 @@
 ##===----------------------------------------------------------------------===##
 
 LEVEL = ../../../..
-LIBRARYNAME = LLVMMBlazeDesc
+LIBRARYNAME = LLVMSystemZDisassembler
 
-# Hack: we need to include 'main' target directory to grab private headers
+# Hack: we need to include 'main' x86 target directory to grab private headers
 CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
new file mode 100644
index 0000000..fc3c38d
--- /dev/null
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -0,0 +1,323 @@
+//===-- SystemZDisassembler.cpp - Disassembler for SystemZ ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+class SystemZDisassembler : public MCDisassembler {
+public:
+  SystemZDisassembler(const MCSubtargetInfo &STI)
+    : MCDisassembler(STI) {}
+  virtual ~SystemZDisassembler() {}
+
+  // Override MCDisassembler.
+  virtual DecodeStatus getInstruction(MCInst &instr,
+                                      uint64_t &size,
+                                      const MemoryObject &region,
+                                      uint64_t address,
+                                      raw_ostream &vStream,
+                                      raw_ostream &cStream) const LLVM_OVERRIDE;
+};
+} // end anonymous namespace
+
+static MCDisassembler *createSystemZDisassembler(const Target &T,
+                                                 const MCSubtargetInfo &STI) {
+  return new SystemZDisassembler(STI);
+}
+
+extern "C" void LLVMInitializeSystemZDisassembler() {
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheSystemZTarget,
+                                         createSystemZDisassembler);
+}
+
+static DecodeStatus decodeRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                        const unsigned *Regs) {
+  assert(RegNo < 16 && "Invalid register");
+  RegNo = Regs[RegNo];
+  if (RegNo == 0)
+    return MCDisassembler::Fail;
+  Inst.addOperand(MCOperand::CreateReg(RegNo));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeGR32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus DecodeGRH32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GRH32Regs);
+}
+
+static DecodeStatus DecodeGR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus DecodeGR128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR128Regs);
+}
+
+static DecodeStatus DecodeADDR64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus DecodeFP32BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP32Regs);
+}
+
+static DecodeStatus DecodeFP64BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                               uint64_t Address,
+                                               const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP64Regs);
+}
+
+static DecodeStatus DecodeFP128BitRegisterClass(MCInst &Inst, uint64_t RegNo,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeRegisterClass(Inst, RegNo, SystemZMC::FP128Regs);
+}
+
+template<unsigned N>
+static DecodeStatus decodeUImmOperand(MCInst &Inst, uint64_t Imm) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::CreateImm(Imm));
+  return MCDisassembler::Success;
+}
+
+template<unsigned N>
+static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm) {
+  assert(isUInt<N>(Imm) && "Invalid immediate");
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeAccessRegOperand(MCInst &Inst, uint64_t Imm,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU4ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<4>(Inst, Imm);
+}
+
+static DecodeStatus decodeU6ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<6>(Inst, Imm);
+}
+
+static DecodeStatus decodeU8ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeU16ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeU32ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeUImmOperand<32>(Inst, Imm);
+}
+
+static DecodeStatus decodeS8ImmOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<8>(Inst, Imm);
+}
+
+static DecodeStatus decodeS16ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<16>(Inst, Imm);
+}
+
+static DecodeStatus decodeS32ImmOperand(MCInst &Inst, uint64_t Imm,
+                                        uint64_t Address, const void *Decoder) {
+  return decodeSImmOperand<32>(Inst, Imm);
+}
+
+template<unsigned N>
+static DecodeStatus decodePCDBLOperand(MCInst &Inst, uint64_t Imm,
+                                       uint64_t Address) {
+  assert(isUInt<N>(Imm) && "Invalid PC-relative offset");
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<N>(Imm) * 2 + Address));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodePC16DBLOperand(MCInst &Inst, uint64_t Imm,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  return decodePCDBLOperand<16>(Inst, Imm, Address);
+}
+
+static DecodeStatus decodePC32DBLOperand(MCInst &Inst, uint64_t Imm,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  return decodePCDBLOperand<32>(Inst, Imm, Address);
+}
+
+static DecodeStatus decodeBDAddr12Operand(MCInst &Inst, uint64_t Field,
+                                          const unsigned *Regs) {
+  uint64_t Base = Field >> 12;
+  uint64_t Disp = Field & 0xfff;
+  assert(Base < 16 && "Invalid BDAddr12");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr20Operand(MCInst &Inst, uint64_t Field,
+                                          const unsigned *Regs) {
+  uint64_t Base = Field >> 20;
+  uint64_t Disp = ((Field << 12) & 0xff000) | ((Field >> 8) & 0xfff);
+  assert(Base < 16 && "Invalid BDAddr20");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<20>(Disp)));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr12Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Index = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Index < 16 && "Invalid BDXAddr12");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  Inst.addOperand(MCOperand::CreateReg(Index == 0 ? 0 : Regs[Index]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field,
+                                           const unsigned *Regs) {
+  uint64_t Index = Field >> 24;
+  uint64_t Base = (Field >> 20) & 0xf;
+  uint64_t Disp = ((Field & 0xfff00) >> 8) | ((Field & 0xff) << 12);
+  assert(Index < 16 && "Invalid BDXAddr20");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(SignExtend64<20>(Disp)));
+  Inst.addOperand(MCOperand::CreateReg(Index == 0 ? 0 : Regs[Index]));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field,
+                                               const unsigned *Regs) {
+  uint64_t Length = Field >> 16;
+  uint64_t Base = (Field >> 12) & 0xf;
+  uint64_t Disp = Field & 0xfff;
+  assert(Length < 256 && "Invalid BDLAddr12Len8");
+  Inst.addOperand(MCOperand::CreateReg(Base == 0 ? 0 : Regs[Base]));
+  Inst.addOperand(MCOperand::CreateImm(Disp));
+  Inst.addOperand(MCOperand::CreateImm(Length + 1));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeBDAddr32Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr32Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR32Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                uint64_t Address,
+                                                const void *Decoder) {
+  return decodeBDAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp12Operand(MCInst &Inst, uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDXAddr12Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field,
+                                                 uint64_t Address,
+                                                 const void *Decoder) {
+  return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst,
+                                                     uint64_t Field,
+                                                     uint64_t Address,
+                                                     const void *Decoder) {
+  return decodeBDLAddr12Len8Operand(Inst, Field, SystemZMC::GR64Regs);
+}
+
+#include "SystemZGenDisassemblerTables.inc"
+
+DecodeStatus SystemZDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
+                                                 const MemoryObject &Region,
+                                                 uint64_t Address,
+                                                 raw_ostream &os,
+                                                 raw_ostream &cs) const {
+  // Get the first two bytes of the instruction.
+  uint8_t Bytes[6];
+  Size = 0;
+  if (Region.readBytes(Address, 2, Bytes) == -1)
+    return MCDisassembler::Fail;
+
+  // The top 2 bits of the first byte specify the size.
+  const uint8_t *Table;
+  if (Bytes[0] < 0x40) {
+    Size = 2;
+    Table = DecoderTable16;
+  } else if (Bytes[0] < 0xc0) {
+    Size = 4;
+    Table = DecoderTable32;
+  } else {
+    Size = 6;
+    Table = DecoderTable48;
+  }
+
+  // Read any remaining bytes.
+  if (Size > 2 && Region.readBytes(Address + 2, Size - 2, Bytes + 2) == -1)
+    return MCDisassembler::Fail;
+
+  // Construct the instruction.
+  uint64_t Inst = 0;
+  for (uint64_t I = 0; I < Size; ++I)
+    Inst = (Inst << 8) | Bytes[I];
+
+  return decodeInstruction(Table, MI, Inst, Address, this, STI);
+}
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index d73cf49..e1e64d3 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -114,10 +114,14 @@ void SystemZInstPrinter::printAccessRegOperand(const MCInst *MI, int OpNum,
   O << "%a" << (unsigned int)Value;
 }
 
-void SystemZInstPrinter::printCallOperand(const MCInst *MI, int OpNum,
-                                          raw_ostream &O) {
-  printOperand(MI, OpNum, O);
-  O << "@PLT";
+void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    O << "0x";
+    O.write_hex(MO.getImm());
+  } else
+    O << *MO.getExpr();
 }
 
 void SystemZInstPrinter::printOperand(const MCInst *MI, int OpNum,
@@ -138,6 +142,17 @@ void SystemZInstPrinter::printBDXAddrOperand(const MCInst *MI, int OpNum,
                MI->getOperand(OpNum + 2).getReg(), O);
 }
 
+void SystemZInstPrinter::printBDLAddrOperand(const MCInst *MI, int OpNum,
+                                             raw_ostream &O) {
+  unsigned Base = MI->getOperand(OpNum).getReg();
+  uint64_t Disp = MI->getOperand(OpNum + 1).getImm();
+  uint64_t Length = MI->getOperand(OpNum + 2).getImm();
+  O << Disp << '(' << Length;
+  if (Base)
+    O << ",%" << getRegisterName(Base);
+  O << ')';
+}
+
 void SystemZInstPrinter::printCond4Operand(const MCInst *MI, int OpNum,
                                            raw_ostream &O) {
   static const char *const CondNames[] = {
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index b82e79d..734ecf0 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -48,6 +48,7 @@ private:
   void printOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printU4ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printU6ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printS8ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
@@ -56,7 +57,7 @@ private:
   void printU16ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printS32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
-  void printCallOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+  void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
   void printAccessRegOperand(const MCInst *MI, int OpNum, raw_ostream &O);
 
   // Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt
index aba0de2..95e657f 100644
--- a/lib/Target/SystemZ/LLVMBuild.txt
+++ b/lib/Target/SystemZ/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = AsmParser InstPrinter MCTargetDesc TargetInfo
+subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
 
 [component_0]
 type = TargetGroup
@@ -24,6 +24,7 @@ name = SystemZ
 parent = Target
 has_asmparser = 1
 has_asmprinter = 1
+has_disassembler = 1
 has_jit = 1
 
 [component_1]
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index e901c6c..26a8fae 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -35,16 +35,6 @@ static uint64_t extractBitsForFixup(MCFixupKind Kind, uint64_t Value) {
   llvm_unreachable("Unknown fixup kind!");
 }
 
-// If Opcode can be relaxed, return the relaxed form, otherwise return 0.
-static unsigned getRelaxedOpcode(unsigned Opcode) {
-  switch (Opcode) {
-  case SystemZ::BRC:  return SystemZ::BRCL;
-  case SystemZ::J:    return SystemZ::JG;
-  case SystemZ::BRAS: return SystemZ::BRASL;
-  }
-  return 0;
-}
-
 namespace {
 class SystemZMCAsmBackend : public MCAsmBackend {
   uint8_t OSABI;
@@ -60,14 +50,20 @@ public:
     LLVM_OVERRIDE;
   virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
                           uint64_t Value) const LLVM_OVERRIDE;
-  virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE;
+  virtual bool mayNeedRelaxation(const MCInst &Inst) const LLVM_OVERRIDE {
+    return false;
+  }
   virtual bool fixupNeedsRelaxation(const MCFixup &Fixup,
                                     uint64_t Value,
                                     const MCRelaxableFragment *Fragment,
                                     const MCAsmLayout &Layout) const
-    LLVM_OVERRIDE;
+    LLVM_OVERRIDE {
+    return false;
+  }
   virtual void relaxInstruction(const MCInst &Inst,
-                                MCInst &Res) const LLVM_OVERRIDE;
+                                MCInst &Res) const LLVM_OVERRIDE {
+    llvm_unreachable("SystemZ does do not have assembler relaxation");
+  }
   virtual bool writeNopData(uint64_t Count,
                             MCObjectWriter *OW) const LLVM_OVERRIDE;
   virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const
@@ -115,28 +111,6 @@ void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
   }
 }
 
-bool SystemZMCAsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
-  return getRelaxedOpcode(Inst.getOpcode()) != 0;
-}
-
-bool
-SystemZMCAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
-                                          uint64_t Value,
-                                          const MCRelaxableFragment *Fragment,
-                                          const MCAsmLayout &Layout) const {
-  // At the moment we just need to relax 16-bit fields to wider fields.
-  Value = extractBitsForFixup(Fixup.getKind(), Value);
-  return (int16_t)Value != (int64_t)Value;
-}
-
-void SystemZMCAsmBackend::relaxInstruction(const MCInst &Inst,
-                                           MCInst &Res) const {
-  unsigned Opcode = getRelaxedOpcode(Inst.getOpcode());
-  assert(Opcode && "Unexpected insn to relax");
-  Res = Inst;
-  Res.setOpcode(Opcode);
-}
-
 bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
                                        MCObjectWriter *OW) const {
   for (uint64_t I = 0; I != Count; ++I)
@@ -144,8 +118,9 @@ bool SystemZMCAsmBackend::writeNopData(uint64_t Count,
   return true;
 }
 
-MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T, StringRef TT,
-                                              StringRef CPU) {
+MCAsmBackend *llvm::createSystemZMCAsmBackend(const Target &T,
+                                              const MCRegisterInfo &MRI,
+                                              StringRef TT, StringRef CPU) {
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
   return new SystemZMCAsmBackend(OSABI);
 }
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index c96a0d4..965c41e 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -13,16 +13,14 @@
 
 using namespace llvm;
 
-SystemZMCAsmInfo::SystemZMCAsmInfo(const Target &T, StringRef TT) {
+SystemZMCAsmInfo::SystemZMCAsmInfo(StringRef TT) {
   PointerSize = 8;
   CalleeSaveStackSlotSize = 8;
   IsLittleEndian = false;
 
   CommentString = "#";
-  PCSymbol = ".";
   GlobalPrefix = "";
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective = "\t.weak\t";
   ZeroDirective = "\t.space\t";
   Data64bitsDirective = "\t.quad\t";
   UsesELFSectionDirectiveForBSS = true;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
index bac1bca..b9ac92a 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.h
@@ -10,16 +10,15 @@
 #ifndef SystemZTARGETASMINFO_H
 #define SystemZTARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
-class Target;
 class StringRef;
 
-class SystemZMCAsmInfo : public MCAsmInfo {
+class SystemZMCAsmInfo : public MCAsmInfoELF {
 public:
-  explicit SystemZMCAsmInfo(const Target &T, StringRef TT);
+  explicit SystemZMCAsmInfo(StringRef TT);
 
   // Override MCAsmInfo;
   virtual const MCSection *getNonexecutableStackSection(MCContext &Ctx) const
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index ea2250f..f07ea7b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -45,33 +45,40 @@ private:
 
   // Called by the TableGen code to get the binary encoding of operand
   // MO in MI.  Fixups is the list of fixups against MI.
-  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                              SmallVectorImpl<MCFixup> &Fixups) const;
 
+  // Called by the TableGen code to get the binary encoding of an address.
+  // The index or length, if any, is encoded first, followed by the base,
+  // followed by the displacement.  In a 20-bit displacement,
+  // the low 12 bits are encoded before the high 8 bits.
+  uint64_t getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+  uint64_t getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                               SmallVectorImpl<MCFixup> &Fixups) const;
+  uint64_t getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+  uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                                SmallVectorImpl<MCFixup> &Fixups) const;
+  uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+                                    SmallVectorImpl<MCFixup> &Fixups) const;
+
   // Operand OpNum of MI needs a PC-relative fixup of kind Kind at
   // Offset bytes from the start of MI.  Add the fixup to Fixups
   // and return the in-place addend, which since we're a RELA target
   // is always 0.
-  unsigned getPCRelEncoding(const MCInst &MI, unsigned int OpNum,
+  uint64_t getPCRelEncoding(const MCInst &MI, unsigned OpNum,
                             SmallVectorImpl<MCFixup> &Fixups,
                             unsigned Kind, int64_t Offset) const;
 
-  unsigned getPC16DBLEncoding(const MCInst &MI, unsigned int OpNum,
+  uint64_t getPC16DBLEncoding(const MCInst &MI, unsigned OpNum,
                               SmallVectorImpl<MCFixup> &Fixups) const {
     return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC16DBL, 2);
   }
-  unsigned getPC32DBLEncoding(const MCInst &MI, unsigned int OpNum,
+  uint64_t getPC32DBLEncoding(const MCInst &MI, unsigned OpNum,
                               SmallVectorImpl<MCFixup> &Fixups) const {
     return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PC32DBL, 2);
   }
-  unsigned getPLT16DBLEncoding(const MCInst &MI, unsigned int OpNum,
-                               SmallVectorImpl<MCFixup> &Fixups) const {
-    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT16DBL, 2);
-  }
-  unsigned getPLT32DBLEncoding(const MCInst &MI, unsigned int OpNum,
-                               SmallVectorImpl<MCFixup> &Fixups) const {
-    return getPCRelEncoding(MI, OpNum, Fixups, SystemZ::FK_390_PLT32DBL, 2);
-  }
 };
 }
 
@@ -95,34 +102,83 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   }
 }
 
-unsigned SystemZMCCodeEmitter::
+uint64_t SystemZMCCodeEmitter::
 getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                   SmallVectorImpl<MCFixup> &Fixups) const {
   if (MO.isReg())
-    return Ctx.getRegisterInfo().getEncodingValue(MO.getReg());
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
   if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
+    return static_cast<uint64_t>(MO.getImm());
   llvm_unreachable("Unexpected operand type!");
 }
 
-unsigned
-SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned int OpNum,
+uint64_t SystemZMCCodeEmitter::
+getBDAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp));
+  return (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                    SmallVectorImpl<MCFixup> &Fixups) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+  assert(isUInt<4>(Base) && isInt<20>(Disp));
+  return (Base << 20) | ((Disp & 0xfff) << 8) | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr12Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+  uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups);
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Index));
+  return (Index << 16) | (Base << 12) | Disp;
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum,
+                     SmallVectorImpl<MCFixup> &Fixups) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+  uint64_t Index = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups);
+  assert(isUInt<4>(Base) && isInt<20>(Disp) && isUInt<4>(Index));
+  return (Index << 24) | (Base << 20) | ((Disp & 0xfff) << 8)
+    | ((Disp & 0xff000) >> 12);
+}
+
+uint64_t SystemZMCCodeEmitter::
+getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+  uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups);
+  uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups);
+  uint64_t Len  = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups) - 1;
+  assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<8>(Len));
+  return (Len << 16) | (Base << 12) | Disp;
+}
+
+uint64_t
+SystemZMCCodeEmitter::getPCRelEncoding(const MCInst &MI, unsigned OpNum,
                                        SmallVectorImpl<MCFixup> &Fixups,
                                        unsigned Kind, int64_t Offset) const {
   const MCOperand &MO = MI.getOperand(OpNum);
-  // For compatibility with the GNU assembler, treat constant operands as
-  // unadjusted PC-relative offsets.
+  const MCExpr *Expr;
   if (MO.isImm())
-    return MO.getImm() / 2;
-
-  const MCExpr *Expr = MO.getExpr();
-  if (Offset) {
-    // The operand value is relative to the start of MI, but the fixup
-    // is relative to the operand field itself, which is Offset bytes
-    // into MI.  Add Offset to the relocation value to cancel out
-    // this difference.
-    const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
-    Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+    Expr = MCConstantExpr::Create(MO.getImm() + Offset, Ctx);
+  else {
+    Expr = MO.getExpr();
+    if (Offset) {
+      // The operand value is relative to the start of MI, but the fixup
+      // is relative to the operand field itself, which is Offset bytes
+      // into MI.  Add Offset to the relocation value to cancel out
+      // this difference.
+      const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+      Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+    }
   }
   Fixups.push_back(MCFixup::Create(Offset, Expr, (MCFixupKind)Kind));
   return 0;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 49a7f47..9e1296b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -27,11 +27,80 @@
 
 using namespace llvm;
 
-static MCAsmInfo *createSystemZMCAsmInfo(const Target &T, StringRef TT) {
-  MCAsmInfo *MAI = new SystemZMCAsmInfo(T, TT);
-  MachineLocation FPDst(MachineLocation::VirtualFP);
-  MachineLocation FPSrc(SystemZ::R15D, -SystemZMC::CFAOffsetFromInitialSP);
-  MAI->addInitialFrameState(0, FPDst, FPSrc);
+const unsigned SystemZMC::GR32Regs[16] = {
+  SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
+  SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
+  SystemZ::R8L, SystemZ::R9L, SystemZ::R10L, SystemZ::R11L,
+  SystemZ::R12L, SystemZ::R13L, SystemZ::R14L, SystemZ::R15L
+};
+
+const unsigned SystemZMC::GRH32Regs[16] = {
+  SystemZ::R0H, SystemZ::R1H, SystemZ::R2H, SystemZ::R3H,
+  SystemZ::R4H, SystemZ::R5H, SystemZ::R6H, SystemZ::R7H,
+  SystemZ::R8H, SystemZ::R9H, SystemZ::R10H, SystemZ::R11H,
+  SystemZ::R12H, SystemZ::R13H, SystemZ::R14H, SystemZ::R15H
+};
+
+const unsigned SystemZMC::GR64Regs[16] = {
+  SystemZ::R0D, SystemZ::R1D, SystemZ::R2D, SystemZ::R3D,
+  SystemZ::R4D, SystemZ::R5D, SystemZ::R6D, SystemZ::R7D,
+  SystemZ::R8D, SystemZ::R9D, SystemZ::R10D, SystemZ::R11D,
+  SystemZ::R12D, SystemZ::R13D, SystemZ::R14D, SystemZ::R15D
+};
+
+const unsigned SystemZMC::GR128Regs[16] = {
+  SystemZ::R0Q, 0, SystemZ::R2Q, 0,
+  SystemZ::R4Q, 0, SystemZ::R6Q, 0,
+  SystemZ::R8Q, 0, SystemZ::R10Q, 0,
+  SystemZ::R12Q, 0, SystemZ::R14Q, 0
+};
+
+const unsigned SystemZMC::FP32Regs[16] = {
+  SystemZ::F0S, SystemZ::F1S, SystemZ::F2S, SystemZ::F3S,
+  SystemZ::F4S, SystemZ::F5S, SystemZ::F6S, SystemZ::F7S,
+  SystemZ::F8S, SystemZ::F9S, SystemZ::F10S, SystemZ::F11S,
+  SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S
+};
+
+const unsigned SystemZMC::FP64Regs[16] = {
+  SystemZ::F0D, SystemZ::F1D, SystemZ::F2D, SystemZ::F3D,
+  SystemZ::F4D, SystemZ::F5D, SystemZ::F6D, SystemZ::F7D,
+  SystemZ::F8D, SystemZ::F9D, SystemZ::F10D, SystemZ::F11D,
+  SystemZ::F12D, SystemZ::F13D, SystemZ::F14D, SystemZ::F15D
+};
+
+const unsigned SystemZMC::FP128Regs[16] = {
+  SystemZ::F0Q, SystemZ::F1Q, 0, 0,
+  SystemZ::F4Q, SystemZ::F5Q, 0, 0,
+  SystemZ::F8Q, SystemZ::F9Q, 0, 0,
+  SystemZ::F12Q, SystemZ::F13Q, 0, 0
+};
+
+unsigned SystemZMC::getFirstReg(unsigned Reg) {
+  static unsigned Map[SystemZ::NUM_TARGET_REGS];
+  static bool Initialized = false;
+  if (!Initialized) {
+    for (unsigned I = 0; I < 16; ++I) {
+      Map[GR32Regs[I]] = I;
+      Map[GRH32Regs[I]] = I;
+      Map[GR64Regs[I]] = I;
+      Map[GR128Regs[I]] = I;
+      Map[FP32Regs[I]] = I;
+      Map[FP64Regs[I]] = I;
+      Map[FP128Regs[I]] = I;
+    }
+  }
+  assert(Reg < SystemZ::NUM_TARGET_REGS);
+  return Map[Reg];
+}
+
+static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
+                                         StringRef TT) {
+  MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
+  MCCFIInstruction Inst =
+      MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(SystemZ::R15D, true),
+                                     SystemZMC::CFAOffsetFromInitialSP);
+  MAI->addInitialFrameState(Inst);
   return MAI;
 }
 
@@ -118,7 +187,7 @@ static MCStreamer *createSystemZMCObjectStreamer(const Target &T, StringRef TT,
                                                  MCCodeEmitter *Emitter,
                                                  bool RelaxAll,
                                                  bool NoExecStack) {
-  return createELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
+  return createELFStreamer(Ctx, 0, MAB, OS, Emitter, RelaxAll, NoExecStack);
 }
 
 extern "C" void LLVMInitializeSystemZTargetMC() {
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
index 229912f..97e325b 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.h
@@ -34,6 +34,39 @@ namespace SystemZMC {
 
   // The offset of the DWARF CFA from the incoming stack pointer.
   const int64_t CFAOffsetFromInitialSP = CallFrameSize;
+
+  // Maps of asm register numbers to LLVM register numbers, with 0 indicating
+  // an invalid register.  In principle we could use 32-bit and 64-bit register
+  // classes directly, provided that we relegated the GPR allocation order
+  // in SystemZRegisterInfo.td to an AltOrder and left the default order
+  // as %r0-%r15.  It seems better to provide the same interface for
+  // all classes though.
+  extern const unsigned GR32Regs[16];
+  extern const unsigned GRH32Regs[16];
+  extern const unsigned GR64Regs[16];
+  extern const unsigned GR128Regs[16];
+  extern const unsigned FP32Regs[16];
+  extern const unsigned FP64Regs[16];
+  extern const unsigned FP128Regs[16];
+
+  // Return the 0-based number of the first architectural register that
+  // contains the given LLVM register.   E.g. R1D -> 1.
+  unsigned getFirstReg(unsigned Reg);
+
+  // Return the given register as a GR64.
+  inline unsigned getRegAsGR64(unsigned Reg) {
+    return GR64Regs[getFirstReg(Reg)];
+  }
+
+  // Return the given register as a low GR32.
+  inline unsigned getRegAsGR32(unsigned Reg) {
+    return GR32Regs[getFirstReg(Reg)];
+  }
+
+  // Return the given register as a high GR32.
+  inline unsigned getRegAsGRH32(unsigned Reg) {
+    return GRH32Regs[getFirstReg(Reg)];
+  }
 }
 
 MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
@@ -41,8 +74,9 @@ MCCodeEmitter *createSystemZMCCodeEmitter(const MCInstrInfo &MCII,
                                           const MCSubtargetInfo &STI,
                                           MCContext &Ctx);
 
-MCAsmBackend *createSystemZMCAsmBackend(const Target &T, StringRef TT,
-                                        StringRef CPU);
+MCAsmBackend *createSystemZMCAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI,
+                                        StringRef TT, StringRef CPU);
 
 MCObjectWriter *createSystemZObjectWriter(raw_ostream &OS, uint8_t OSABI);
 } // end namespace llvm
diff --git a/lib/Target/SystemZ/Makefile b/lib/Target/SystemZ/Makefile
index c992584..445725b 100644
--- a/lib/Target/SystemZ/Makefile
+++ b/lib/Target/SystemZ/Makefile
@@ -16,13 +16,14 @@ BUILT_SOURCES = SystemZGenRegisterInfo.inc \
 		SystemZGenAsmWriter.inc \
 		SystemZGenAsmMatcher.inc \
 		SystemZGenCodeEmitter.inc \
+		SystemZGenDisassemblerTables.inc \
 		SystemZGenInstrInfo.inc \
 		SystemZGenDAGISel.inc \
 		SystemZGenSubtargetInfo.inc \
 		SystemZGenCallingConv.inc \
 		SystemZGenMCCodeEmitter.inc
 
-DIRS = InstPrinter AsmParser TargetInfo MCTargetDesc
+DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc
 
 include $(LEVEL)/Makefile.common
 
diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt
index d1f56a4..afa6cf0 100644
--- a/lib/Target/SystemZ/README.txt
+++ b/lib/Target/SystemZ/README.txt
@@ -29,7 +29,7 @@ to load 103.  This seems to be a general target-independent problem.
 
 --
 
-The tuning of the choice between Load Address (LA) and addition in
+The tuning of the choice between LOAD ADDRESS (LA) and addition in
 SystemZISelDAGToDAG.cpp is suspect.  It should be tweaked based on
 performance measurements.
 
@@ -39,22 +39,35 @@ There is no scheduling support.
 
 --
 
-We don't use the Branch on Count or Branch on Index families of instruction.
+We don't use the BRANCH ON INDEX instructions.
 
 --
 
-We don't use the condition code results of anything except comparisons.
+We might want to use BRANCH ON CONDITION for conditional indirect calls
+and conditional returns.
 
-Implementing this may need something more finely grained than the z_cmp
-and z_ucmp that we have now.  It might (or might not) also be useful to
-have a mask of "don't care" values in conditional branches.  For example,
-integer comparisons never set CC to 3, so the bottom bit of the CC mask
-isn't particularly relevant.  JNLH and JE are equally good for testing
-equality after an integer comparison, etc.
+--
+
+We don't use the TEST DATA CLASS instructions.
+
+--
+
+We could use the generic floating-point forms of LOAD COMPLEMENT,
+LOAD NEGATIVE and LOAD POSITIVE in cases where we don't need the
+condition codes.  For example, we could use LCDFR instead of LCDBR.
 
 --
 
-We don't optimize string and block memory operations.
+We only use MVC, XC and CLC for constant-length block operations.
+We could extend them to variable-length operations too,
+using EXECUTE RELATIVE LONG.
+
+MVCIN, MVCLE and CLCLE may be worthwhile too.
+
+--
+
+We don't use CUSE or the TRANSLATE family of instructions for string
+operations.  The TRANSLATE ones are probably more difficult to exploit.
 
 --
 
@@ -63,9 +76,21 @@ conventions require f128s to be returned by invisible reference.
 
 --
 
-DAGCombiner can detect integer absolute, but there's not yet an associated
-ISD opcode.  We could add one and implement it using Load Positive.
-Negated absolutes could use Load Negative.
+ADD LOGICAL WITH SIGNED IMMEDIATE could be useful when we need to
+produce a carry.  SUBTRACT LOGICAL IMMEDIATE could be useful when we
+need to produce a borrow.  (Note that there are no memory forms of
+ADD LOGICAL WITH CARRY and SUBTRACT LOGICAL WITH BORROW, so the high
+part of 128-bit memory operations would probably need to be done
+via a register.)
+
+--
+
+We don't use the halfword forms of LOAD REVERSED and STORE REVERSED
+(LRVH and STRVH).
+
+--
+
+We don't use ICM or STCM.
 
 --
 
@@ -142,5 +167,15 @@ See CodeGen/SystemZ/alloca-01.ll for an example.
 --
 
 Atomic loads and stores use the default compare-and-swap based implementation.
-This is probably much too conservative in practice, and the overhead is
-especially bad for 8- and 16-bit accesses.
+This is much too conservative in practice, since the architecture guarantees
+that 1-, 2-, 4- and 8-byte loads and stores to aligned addresses are
+inherently atomic.
+
+--
+
+If needed, we can support 16-byte atomics using LPQ, STPQ and CSDG.
+
+--
+
+We might want to model all access registers and use them to spill
+32-bit values.
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
index b811cbe..dcebbad 100644
--- a/lib/Target/SystemZ/SystemZ.h
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -30,16 +30,51 @@ namespace llvm {
     const unsigned CCMASK_3 = 1 << 0;
     const unsigned CCMASK_ANY = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
 
-    // Condition-code mask assignments for floating-point comparisons.
+    // Condition-code mask assignments for integer and floating-point
+    // comparisons.
     const unsigned CCMASK_CMP_EQ = CCMASK_0;
     const unsigned CCMASK_CMP_LT = CCMASK_1;
     const unsigned CCMASK_CMP_GT = CCMASK_2;
-    const unsigned CCMASK_CMP_UO = CCMASK_3;
     const unsigned CCMASK_CMP_NE = CCMASK_CMP_LT | CCMASK_CMP_GT;
     const unsigned CCMASK_CMP_LE = CCMASK_CMP_EQ | CCMASK_CMP_LT;
     const unsigned CCMASK_CMP_GE = CCMASK_CMP_EQ | CCMASK_CMP_GT;
+
+    // Condition-code mask assignments for floating-point comparisons only.
+    const unsigned CCMASK_CMP_UO = CCMASK_3;
     const unsigned CCMASK_CMP_O  = CCMASK_ANY ^ CCMASK_CMP_UO;
 
+    // All condition-code values produced by comparisons.
+    const unsigned CCMASK_ICMP = CCMASK_0 | CCMASK_1 | CCMASK_2;
+    const unsigned CCMASK_FCMP = CCMASK_0 | CCMASK_1 | CCMASK_2 | CCMASK_3;
+
+    // Condition-code mask assignments for CS.
+    const unsigned CCMASK_CS_EQ = CCMASK_0;
+    const unsigned CCMASK_CS_NE = CCMASK_1;
+    const unsigned CCMASK_CS    = CCMASK_0 | CCMASK_1;
+
+    // Condition-code mask assignments for a completed SRST loop.
+    const unsigned CCMASK_SRST_FOUND    = CCMASK_1;
+    const unsigned CCMASK_SRST_NOTFOUND = CCMASK_2;
+    const unsigned CCMASK_SRST          = CCMASK_1 | CCMASK_2;
+
+    // Condition-code mask assignments for TEST UNDER MASK.
+    const unsigned CCMASK_TM_ALL_0       = CCMASK_0;
+    const unsigned CCMASK_TM_MIXED_MSB_0 = CCMASK_1;
+    const unsigned CCMASK_TM_MIXED_MSB_1 = CCMASK_2;
+    const unsigned CCMASK_TM_ALL_1       = CCMASK_3;
+    const unsigned CCMASK_TM_SOME_0      = CCMASK_TM_ALL_1 ^ CCMASK_ANY;
+    const unsigned CCMASK_TM_SOME_1      = CCMASK_TM_ALL_0 ^ CCMASK_ANY;
+    const unsigned CCMASK_TM_MSB_0       = CCMASK_0 | CCMASK_1;
+    const unsigned CCMASK_TM_MSB_1       = CCMASK_2 | CCMASK_3;
+    const unsigned CCMASK_TM             = CCMASK_ANY;
+
+    // The position of the low CC bit in an IPM result.
+    const unsigned IPM_CC = 28;
+
+    // Mask assignments for PFD.
+    const unsigned PFD_READ  = 1;
+    const unsigned PFD_WRITE = 2;
+
     // Return true if Val fits an LLILL operand.
     static inline bool isImmLL(uint64_t Val) {
       return (Val & ~0x000000000000ffffULL) == 0;
@@ -73,5 +108,8 @@ namespace llvm {
 
   FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
                                      CodeGenOpt::Level OptLevel);
+  FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
+  FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
+  FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
 } // end namespace llvm;
 #endif
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index e03c32f..abf5c8e 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -14,13 +14,10 @@
 include "llvm/Target/Target.td"
 
 //===----------------------------------------------------------------------===//
-// SystemZ supported processors
+// SystemZ supported processors and features
 //===----------------------------------------------------------------------===//
 
-class Proc<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
-
-def : Proc<"z10", []>;
+include "SystemZProcessors.td"
 
 //===----------------------------------------------------------------------===//
 // Register file description
diff --git a/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 1e15ab1..75cbda4 100644
--- a/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -19,16 +19,142 @@
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Target/Mangler.h"
 
 using namespace llvm;
 
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GR32s.
+static MCInst lowerRILow(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// GR64 register operands turned into GRH32s.
+static MCInst lowerRIHigh(const MachineInstr *MI, unsigned Opcode) {
+  if (MI->isCompare())
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(1).getImm());
+  else
+    return MCInstBuilder(Opcode)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(1).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+}
+
+// Return an RI instruction like MI with opcode Opcode, but with the
+// R2 register turned into a GR64.
+static MCInst lowerRIEfLow(const MachineInstr *MI, unsigned Opcode) {
+  return MCInstBuilder(Opcode)
+    .addReg(MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(1).getReg())
+    .addReg(SystemZMC::getRegAsGR64(MI->getOperand(2).getReg()))
+    .addImm(MI->getOperand(3).getImm())
+    .addImm(MI->getOperand(4).getImm())
+    .addImm(MI->getOperand(5).getImm());
+}
+
 void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+  SystemZMCInstLower Lower(MF->getContext(), *this);
   MCInst LoweredMI;
-  Lower.lower(MI, LoweredMI);
+  switch (MI->getOpcode()) {
+  case SystemZ::Return:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R14D);
+    break;
+
+  case SystemZ::CallBRASL:
+    LoweredMI = MCInstBuilder(SystemZ::BRASL)
+      .addReg(SystemZ::R14D)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBASR:
+    LoweredMI = MCInstBuilder(SystemZ::BASR)
+      .addReg(SystemZ::R14D)
+      .addReg(MI->getOperand(0).getReg());
+    break;
+
+  case SystemZ::CallJG:
+    LoweredMI = MCInstBuilder(SystemZ::JG)
+      .addExpr(Lower.getExpr(MI->getOperand(0), MCSymbolRefExpr::VK_PLT));
+    break;
+
+  case SystemZ::CallBR:
+    LoweredMI = MCInstBuilder(SystemZ::BR).addReg(SystemZ::R1D);
+    break;
+
+  case SystemZ::IILF64:
+    LoweredMI = MCInstBuilder(SystemZ::IILF)
+      .addReg(SystemZMC::getRegAsGR32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::IIHF64:
+    LoweredMI = MCInstBuilder(SystemZ::IIHF)
+      .addReg(SystemZMC::getRegAsGRH32(MI->getOperand(0).getReg()))
+      .addImm(MI->getOperand(2).getImm());
+    break;
+
+  case SystemZ::RISBHH:
+  case SystemZ::RISBHL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBHG);
+    break;
+
+  case SystemZ::RISBLH:
+  case SystemZ::RISBLL:
+    LoweredMI = lowerRIEfLow(MI, SystemZ::RISBLG);
+    break;
+
+#define LOWER_LOW(NAME)                                                 \
+  case SystemZ::NAME##64: LoweredMI = lowerRILow(MI, SystemZ::NAME); break
+
+  LOWER_LOW(IILL);
+  LOWER_LOW(IILH);
+  LOWER_LOW(TMLL);
+  LOWER_LOW(TMLH);
+  LOWER_LOW(NILL);
+  LOWER_LOW(NILH);
+  LOWER_LOW(NILF);
+  LOWER_LOW(OILL);
+  LOWER_LOW(OILH);
+  LOWER_LOW(OILF);
+  LOWER_LOW(XILF);
+
+#undef LOWER_LOW
+
+#define LOWER_HIGH(NAME) \
+  case SystemZ::NAME##64: LoweredMI = lowerRIHigh(MI, SystemZ::NAME); break
+
+  LOWER_HIGH(IIHL);
+  LOWER_HIGH(IIHH);
+  LOWER_HIGH(TMHL);
+  LOWER_HIGH(TMHH);
+  LOWER_HIGH(NIHL);
+  LOWER_HIGH(NIHH);
+  LOWER_HIGH(NIHF);
+  LOWER_HIGH(OIHL);
+  LOWER_HIGH(OIHH);
+  LOWER_HIGH(OIHF);
+  LOWER_HIGH(XIHF);
+
+#undef LOWER_HIGH
+
+  default:
+    Lower.lower(MI, LoweredMI);
+    break;
+  }
   OutStreamer.EmitInstruction(LoweredMI);
 }
 
@@ -48,7 +174,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
     static_cast<SystemZConstantPoolValue*>(MCPV);
 
   const MCExpr *Expr =
-    MCSymbolRefExpr::Create(Mang->getSymbol(ZCPV->getGlobalValue()),
+    MCSymbolRefExpr::Create(getSymbol(ZCPV->getGlobalValue()),
                             getModifierVariantKind(ZCPV->getModifier()),
                             OutContext);
   uint64_t Size = TM.getDataLayout()->getTypeAllocSize(ZCPV->getType());
@@ -66,7 +192,7 @@ bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
       return true;
     OS << -int64_t(MI->getOperand(OpNo).getImm());
   } else {
-    SystemZMCInstLower Lower(Mang, MF->getContext(), *this);
+    SystemZMCInstLower Lower(MF->getContext(), *this);
     MCOperand MO(Lower.lowerOperand(MI->getOperand(OpNo)));
     SystemZInstPrinter::printOperand(MO, OS);
   }
@@ -100,7 +226,7 @@ void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
       for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
         OutStreamer.EmitLabel(Stubs[i].first);
         OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
-                                    TD->getPointerSize(0), 0);
+                                    TD->getPointerSize(0));
       }
       Stubs.clear();
     }
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
index c2d727f..c4f641e 100644
--- a/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -23,7 +23,7 @@ def RetCC_SystemZ : CallingConv<[
   // call-clobbered argument registers available for code that doesn't
   // care about the ABI.  (R6 is an argument register too, but is
   // call-saved and therefore not suitable for return values.)
-  CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W]>>,
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L]>>,
   CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
 
   // ABI-complaint code returns float and double in F0.  Make the
@@ -53,7 +53,7 @@ def CC_SystemZ : CallingConv<[
 
   // The first 5 integer arguments are passed in R2-R6.  Note that R6
   // is call-saved.
-  CCIfType<[i32], CCAssignToReg<[R2W, R3W, R4W, R5W, R6W]>>,
+  CCIfType<[i32], CCAssignToReg<[R2L, R3L, R4L, R5L, R6L]>>,
   CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
 
   // The first 4 float and double arguments are passed in even registers F0-F6.
diff --git a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index e9c4f6d..6c70811 100644
--- a/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -39,7 +39,7 @@ unsigned SystemZConstantPoolValue::getRelocationInfo() const {
 int SystemZConstantPoolValue::
 getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
   unsigned AlignMask = Alignment - 1;
-  const std::vector<MachineConstantPoolEntry> Constants = CP->getConstants();
+  const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
   for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
     if (Constants[I].isMachineConstantPoolEntry() &&
         (Constants[I].getAlignment() & AlignMask) == 0) {
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
new file mode 100644
index 0000000..b8a77db
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -0,0 +1,471 @@
+//===-- SystemZElimCompare.cpp - Eliminate comparison instructions --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass:
+// (1) tries to remove compares if CC already contains the required information
+// (2) fuses compares and branches into COMPARE AND BRANCH instructions
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-elim-compare"
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(BranchOnCounts, "Number of branch-on-count instructions");
+STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
+STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
+
+namespace {
+  // Represents the references to a particular register in one or more
+  // instructions.
+  struct Reference {
+    Reference()
+      : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {}
+
+    Reference &operator|=(const Reference &Other) {
+      Def |= Other.Def;
+      IndirectDef |= Other.IndirectDef;
+      Use |= Other.Use;
+      IndirectUse |= Other.IndirectUse;
+      return *this;
+    }
+
+    operator bool() const { return Def || Use; }
+
+    // True if the register is defined or used in some form, either directly or
+    // via a sub- or super-register.
+    bool Def;
+    bool Use;
+
+    // True if the register is defined or used indirectly, by a sub- or
+    // super-register.
+    bool IndirectDef;
+    bool IndirectUse;
+  };
+
+  class SystemZElimCompare : public MachineFunctionPass {
+  public:
+    static char ID;
+    SystemZElimCompare(const SystemZTargetMachine &tm)
+      : MachineFunctionPass(ID), TII(0), TRI(0) {}
+
+    virtual const char *getPassName() const {
+      return "SystemZ Comparison Elimination";
+    }
+
+    bool processBlock(MachineBasicBlock *MBB);
+    bool runOnMachineFunction(MachineFunction &F);
+
+  private:
+    Reference getRegReferences(MachineInstr *MI, unsigned Reg);
+    bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+                       SmallVectorImpl<MachineInstr *> &CCUsers);
+    bool convertToLoadAndTest(MachineInstr *MI);
+    bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
+                               SmallVectorImpl<MachineInstr *> &CCUsers);
+    bool optimizeCompareZero(MachineInstr *Compare,
+                             SmallVectorImpl<MachineInstr *> &CCUsers);
+    bool fuseCompareAndBranch(MachineInstr *Compare,
+                              SmallVectorImpl<MachineInstr *> &CCUsers);
+
+    const SystemZInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+  };
+
+  char SystemZElimCompare::ID = 0;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createSystemZElimComparePass(SystemZTargetMachine &TM) {
+  return new SystemZElimCompare(TM);
+}
+
+// Return true if CC is live out of MBB.
+static bool isCCLiveOut(MachineBasicBlock *MBB) {
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI)
+    if ((*SI)->isLiveIn(SystemZ::CC))
+      return true;
+  return false;
+}
+
+// Return true if any CC result of MI would reflect the value of subreg
+// SubReg of Reg.
+static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) {
+  if (MI->getNumOperands() > 0 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(0).isDef() &&
+      MI->getOperand(0).getReg() == Reg &&
+      MI->getOperand(0).getSubReg() == SubReg)
+    return true;
+
+  switch (MI->getOpcode()) {
+  case SystemZ::LR:
+  case SystemZ::LGR:
+  case SystemZ::LGFR:
+  case SystemZ::LTR:
+  case SystemZ::LTGR:
+  case SystemZ::LTGFR:
+  case SystemZ::LER:
+  case SystemZ::LDR:
+  case SystemZ::LXR:
+  case SystemZ::LTEBR:
+  case SystemZ::LTDBR:
+  case SystemZ::LTXBR:
+    if (MI->getOperand(1).getReg() == Reg &&
+        MI->getOperand(1).getSubReg() == SubReg)
+      return true;
+  }
+
+  return false;
+}
+
+// Describe the references to Reg in MI, including sub- and super-registers.
+Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
+  Reference Ref;
+  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    if (MO.isReg()) {
+      if (unsigned MOReg = MO.getReg()) {
+        if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) {
+          if (MO.isUse()) {
+            Ref.Use = true;
+            Ref.IndirectUse |= (MOReg != Reg);
+          }
+          if (MO.isDef()) {
+            Ref.Def = true;
+            Ref.IndirectDef |= (MOReg != Reg);
+          }
+        }
+      }
+    }
+  }
+  return Ref;
+}
+
+// Compare compares the result of MI against zero.  If MI is an addition
+// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
+// and convert the branch to a BRCT(G).  Return true on success.
+bool
+SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+                                  SmallVectorImpl<MachineInstr *> &CCUsers) {
+  // Check whether we have an addition of -1.
+  unsigned Opcode = MI->getOpcode();
+  unsigned BRCT;
+  if (Opcode == SystemZ::AHI)
+    BRCT = SystemZ::BRCT;
+  else if (Opcode == SystemZ::AGHI)
+    BRCT = SystemZ::BRCTG;
+  else
+    return false;
+  if (MI->getOperand(2).getImm() != -1)
+    return false;
+
+  // Check whether we have a single JLH.
+  if (CCUsers.size() != 1)
+    return false;
+  MachineInstr *Branch = CCUsers[0];
+  if (Branch->getOpcode() != SystemZ::BRC ||
+      Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP ||
+      Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_NE)
+    return false;
+
+  // We already know that there are no references to the register between
+  // MI and Compare.  Make sure that there are also no references between
+  // Compare and Branch.
+  unsigned SrcReg = Compare->getOperand(0).getReg();
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    if (getRegReferences(MBBI, SrcReg))
+      return false;
+
+  // The transformation is OK.  Rebuild Branch as a BRCT(G).
+  MachineOperand Target(Branch->getOperand(2));
+  Branch->RemoveOperand(2);
+  Branch->RemoveOperand(1);
+  Branch->RemoveOperand(0);
+  Branch->setDesc(TII->get(BRCT));
+  MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1))
+    .addOperand(Target)
+    .addReg(SystemZ::CC, RegState::ImplicitDefine);
+  MI->removeFromParent();
+  return true;
+}
+
+// If MI is a load instruction, try to convert it into a LOAD AND TEST.
+// Return true on success.
+bool SystemZElimCompare::convertToLoadAndTest(MachineInstr *MI) {
+  unsigned Opcode = TII->getLoadAndTest(MI->getOpcode());
+  if (!Opcode)
+    return false;
+
+  MI->setDesc(TII->get(Opcode));
+  MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+    .addReg(SystemZ::CC, RegState::ImplicitDefine);
+  return true;
+}
+
+// The CC users in CCUsers are testing the result of a comparison of some
+// value X against zero and we know that any CC value produced by MI
+// would also reflect the value of X.  Try to adjust CCUsers so that
+// they test the result of MI directly, returning true on success.
+// Leave everything unchanged on failure.
+bool SystemZElimCompare::
+adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
+                      SmallVectorImpl<MachineInstr *> &CCUsers) {
+  int Opcode = MI->getOpcode();
+  const MCInstrDesc &Desc = TII->get(Opcode);
+  unsigned MIFlags = Desc.TSFlags;
+
+  // See which compare-style condition codes are available.
+  unsigned ReusableCCMask = SystemZII::getCompareZeroCCMask(MIFlags);
+
+  // For unsigned comparisons with zero, only equality makes sense.
+  unsigned CompareFlags = Compare->getDesc().TSFlags;
+  if (CompareFlags & SystemZII::IsLogical)
+    ReusableCCMask &= SystemZ::CCMASK_CMP_EQ;
+
+  if (ReusableCCMask == 0)
+    return false;
+
+  unsigned CCValues = SystemZII::getCCValues(MIFlags);
+  assert((ReusableCCMask & ~CCValues) == 0 && "Invalid CCValues");
+
+  // Now check whether these flags are enough for all users.
+  SmallVector<MachineOperand *, 4> AlterMasks;
+  for (unsigned int I = 0, E = CCUsers.size(); I != E; ++I) {
+    MachineInstr *MI = CCUsers[I];
+
+    // Fail if this isn't a use of CC that we understand.
+    unsigned Flags = MI->getDesc().TSFlags;
+    unsigned FirstOpNum;
+    if (Flags & SystemZII::CCMaskFirst)
+      FirstOpNum = 0;
+    else if (Flags & SystemZII::CCMaskLast)
+      FirstOpNum = MI->getNumExplicitOperands() - 2;
+    else
+      return false;
+
+    // Check whether the instruction predicate treats all CC values
+    // outside of ReusableCCMask in the same way.  In that case it
+    // doesn't matter what those CC values mean.
+    unsigned CCValid = MI->getOperand(FirstOpNum).getImm();
+    unsigned CCMask = MI->getOperand(FirstOpNum + 1).getImm();
+    unsigned OutValid = ~ReusableCCMask & CCValid;
+    unsigned OutMask = ~ReusableCCMask & CCMask;
+    if (OutMask != 0 && OutMask != OutValid)
+      return false;
+
+    AlterMasks.push_back(&MI->getOperand(FirstOpNum));
+    AlterMasks.push_back(&MI->getOperand(FirstOpNum + 1));
+  }
+
+  // All users are OK.  Adjust the masks for MI.
+  for (unsigned I = 0, E = AlterMasks.size(); I != E; I += 2) {
+    AlterMasks[I]->setImm(CCValues);
+    unsigned CCMask = AlterMasks[I + 1]->getImm();
+    if (CCMask & ~ReusableCCMask)
+      AlterMasks[I + 1]->setImm((CCMask & ReusableCCMask) |
+                                (CCValues & ~ReusableCCMask));
+  }
+
+  // CC is now live after MI.
+  int CCDef = MI->findRegisterDefOperandIdx(SystemZ::CC, false, true, TRI);
+  assert(CCDef >= 0 && "Couldn't find CC set");
+  MI->getOperand(CCDef).setIsDead(false);
+
+  // Clear any intervening kills of CC.
+  MachineBasicBlock::iterator MBBI = MI, MBBE = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    MBBI->clearRegisterKills(SystemZ::CC, TRI);
+
+  return true;
+}
+
+// Return true if Compare is a comparison against zero.
+static bool isCompareZero(MachineInstr *Compare) {
+  switch (Compare->getOpcode()) {
+  case SystemZ::LTEBRCompare:
+  case SystemZ::LTDBRCompare:
+  case SystemZ::LTXBRCompare:
+    return true;
+
+  default:
+    return (Compare->getNumExplicitOperands() == 2 &&
+            Compare->getOperand(1).isImm() &&
+            Compare->getOperand(1).getImm() == 0);
+  }
+}
+
+// Try to optimize cases where comparison instruction Compare is testing
+// a value against zero.  Return true on success and if Compare should be
+// deleted as dead.  CCUsers is the list of instructions that use the CC
+// value produced by Compare.
+bool SystemZElimCompare::
+optimizeCompareZero(MachineInstr *Compare,
+                    SmallVectorImpl<MachineInstr *> &CCUsers) {
+  if (!isCompareZero(Compare))
+    return false;
+
+  // Search back for CC results that are based on the first operand.
+  unsigned SrcReg = Compare->getOperand(0).getReg();
+  unsigned SrcSubReg = Compare->getOperand(0).getSubReg();
+  MachineBasicBlock *MBB = Compare->getParent();
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB->begin();
+  Reference CCRefs;
+  Reference SrcRefs;
+  while (MBBI != MBBE) {
+    --MBBI;
+    MachineInstr *MI = MBBI;
+    if (resultTests(MI, SrcReg, SrcSubReg)) {
+      // Try to remove both MI and Compare by converting a branch to BRCT(G).
+      // We don't care in this case whether CC is modified between MI and
+      // Compare.
+      if (!CCRefs.Use && !SrcRefs && convertToBRCT(MI, Compare, CCUsers)) {
+        BranchOnCounts += 1;
+        return true;
+      }
+      // Try to eliminate Compare by reusing a CC result from MI.
+      if ((!CCRefs && convertToLoadAndTest(MI)) ||
+          (!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) {
+        EliminatedComparisons += 1;
+        return true;
+      }
+    }
+    SrcRefs |= getRegReferences(MI, SrcReg);
+    if (SrcRefs.Def)
+      return false;
+    CCRefs |= getRegReferences(MI, SystemZ::CC);
+    if (CCRefs.Use && CCRefs.Def)
+      return false;
+  }
+  return false;
+}
+
+// Try to fuse comparison instruction Compare into a later branch.
+// Return true on success and if Compare is therefore redundant.
+bool SystemZElimCompare::
+fuseCompareAndBranch(MachineInstr *Compare,
+                     SmallVectorImpl<MachineInstr *> &CCUsers) {
+  // See whether we have a comparison that can be fused.
+  unsigned FusedOpcode = TII->getCompareAndBranch(Compare->getOpcode(),
+                                                  Compare);
+  if (!FusedOpcode)
+    return false;
+
+  // See whether we have a single branch with which to fuse.
+  if (CCUsers.size() != 1)
+    return false;
+  MachineInstr *Branch = CCUsers[0];
+  if (Branch->getOpcode() != SystemZ::BRC)
+    return false;
+
+  // Make sure that the operands are available at the branch.
+  unsigned SrcReg = Compare->getOperand(0).getReg();
+  unsigned SrcReg2 = (Compare->getOperand(1).isReg() ?
+                      Compare->getOperand(1).getReg() : 0);
+  MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+  for (++MBBI; MBBI != MBBE; ++MBBI)
+    if (MBBI->modifiesRegister(SrcReg, TRI) ||
+        (SrcReg2 && MBBI->modifiesRegister(SrcReg2, TRI)))
+      return false;
+
+  // Read the branch mask and target.
+  MachineOperand CCMask(MBBI->getOperand(1));
+  MachineOperand Target(MBBI->getOperand(2));
+  assert((CCMask.getImm() & ~SystemZ::CCMASK_ICMP) == 0 &&
+         "Invalid condition-code mask for integer comparison");
+
+  // Clear out all current operands.
+  int CCUse = MBBI->findRegisterUseOperandIdx(SystemZ::CC, false, TRI);
+  assert(CCUse >= 0 && "BRC must use CC");
+  Branch->RemoveOperand(CCUse);
+  Branch->RemoveOperand(2);
+  Branch->RemoveOperand(1);
+  Branch->RemoveOperand(0);
+
+  // Rebuild Branch as a fused compare and branch.
+  Branch->setDesc(TII->get(FusedOpcode));
+  MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
+    .addOperand(Compare->getOperand(0))
+    .addOperand(Compare->getOperand(1))
+    .addOperand(CCMask)
+    .addOperand(Target)
+    .addReg(SystemZ::CC, RegState::ImplicitDefine);
+
+  // Clear any intervening kills of SrcReg and SrcReg2.
+  MBBI = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MBBI->clearRegisterKills(SrcReg, TRI);
+    if (SrcReg2)
+      MBBI->clearRegisterKills(SrcReg2, TRI);
+  }
+  FusedComparisons += 1;
+  return true;
+}
+
+// Process all comparison instructions in MBB.  Return true if something
+// changed.
+bool SystemZElimCompare::processBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
+  // Walk backwards through the block looking for comparisons, recording
+  // all CC users as we go.  The subroutines can delete Compare and
+  // instructions before it.
+  bool CompleteCCUsers = !isCCLiveOut(MBB);
+  SmallVector<MachineInstr *, 4> CCUsers;
+  MachineBasicBlock::iterator MBBI = MBB->end();
+  while (MBBI != MBB->begin()) {
+    MachineInstr *MI = --MBBI;
+    if (CompleteCCUsers &&
+        MI->isCompare() &&
+        (optimizeCompareZero(MI, CCUsers) ||
+         fuseCompareAndBranch(MI, CCUsers))) {
+      ++MBBI;
+      MI->removeFromParent();
+      Changed = true;
+      CCUsers.clear();
+      CompleteCCUsers = true;
+      continue;
+    }
+
+    Reference CCRefs(getRegReferences(MI, SystemZ::CC));
+    if (CCRefs.Def) {
+      CCUsers.clear();
+      CompleteCCUsers = !CCRefs.IndirectDef;
+    }
+    if (CompleteCCUsers && CCRefs.Use)
+      CCUsers.push_back(MI);
+  }
+  return Changed;
+}
+
+bool SystemZElimCompare::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  TRI = &TII->getRegisterInfo();
+
+  bool Changed = false;
+  for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
+       MFI != MFE; ++MFI)
+    Changed |= processBlock(MFI);
+
+  return Changed;
+}
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index fda33de..acfb491 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -14,19 +14,15 @@
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/IR/Function.h"
 
 using namespace llvm;
 
-SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm,
-                                           const SystemZSubtarget &sti)
-  : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
-                        -SystemZMC::CallFrameSize),
-    TM(tm),
-    STI(sti) {
+namespace {
   // The ABI-defined register save slots, relative to the incoming stack
   // pointer.
-  static const unsigned SpillOffsetTable[][2] = {
+  static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
     { SystemZ::R2D,  0x10 },
     { SystemZ::R3D,  0x18 },
     { SystemZ::R4D,  0x20 },
@@ -46,11 +42,23 @@ SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm,
     { SystemZ::F4D,  0x90 },
     { SystemZ::F6D,  0x98 }
   };
+}
 
+SystemZFrameLowering::SystemZFrameLowering(const SystemZTargetMachine &tm,
+                                           const SystemZSubtarget &sti)
+  : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 8,
+                        -SystemZMC::CallFrameSize, 8),
+    TM(tm), STI(sti) {
   // Create a mapping from register number to save slot offset.
   RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
   for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
-    RegSpillOffsets[SpillOffsetTable[I][0]] = SpillOffsetTable[I][1];
+    RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
+}
+
+const TargetFrameLowering::SpillSlot *
+SystemZFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+  NumEntries = array_lengthof(SpillOffsetTable);
+  return SpillOffsetTable;
 }
 
 void SystemZFrameLowering::
@@ -103,7 +111,7 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
                         const SystemZTargetMachine &TM,
                         unsigned GPR64, bool IsImplicit) {
   const SystemZRegisterInfo *RI = TM.getRegisterInfo();
-  unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_32bit);
+  unsigned GPR32 = RI->getSubReg(GPR64, SystemZ::subreg_l32);
   bool IsLive = MBB.isLiveIn(GPR64) || MBB.isLiveIn(GPR32);
   if (!IsLive || !IsImplicit) {
     MIB.addReg(GPR64, getImplRegState(IsImplicit) | getKillRegState(!IsLive));
@@ -127,14 +135,12 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
   // Scan the call-saved GPRs and find the bounds of the register spill area.
-  unsigned SavedGPRFrameSize = 0;
   unsigned LowGPR = 0;
   unsigned HighGPR = SystemZ::R15D;
   unsigned StartOffset = -1U;
   for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
     unsigned Reg = CSI[I].getReg();
     if (SystemZ::GR64BitRegClass.contains(Reg)) {
-      SavedGPRFrameSize += 8;
       unsigned Offset = RegSpillOffsets[Reg];
       assert(Offset && "Unexpected GPR save");
       if (StartOffset > Offset) {
@@ -144,9 +150,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
     }
   }
 
-  // Save information about the range and location of the call-saved
-  // registers, for use by the epilogue inserter.
-  ZFI->setSavedGPRFrameSize(SavedGPRFrameSize);
+  // Save the range of call-saved registers, for use by the epilogue inserter.
   ZFI->setLowSavedGPR(LowGPR);
   ZFI->setHighSavedGPR(HighGPR);
 
@@ -260,6 +264,22 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
   return true;
 }
 
+void SystemZFrameLowering::
+processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                    RegScavenger *RS) const {
+  MachineFrameInfo *MFFrame = MF.getFrameInfo();
+  uint64_t MaxReach = (MFFrame->estimateStackSize(MF) +
+                       SystemZMC::CallFrameSize * 2);
+  if (!isUInt<12>(MaxReach)) {
+    // We may need register scavenging slots if some parts of the frame
+    // are outside the reach of an unsigned 12-bit displacement.
+    // Create 2 for the case where both addresses in an MVC are
+    // out of range.
+    RS->addScavengingFrameIndex(MFFrame->CreateStackObject(8, 8, false));
+    RS->addScavengingFrameIndex(MFFrame->CreateStackObject(8, 8, false));
+  }
+}
+
 // Emit instructions before MBBI (in MBB) to add NumBytes to Reg.
 static void emitIncrement(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &MBBI,
@@ -283,7 +303,7 @@ static void emitIncrement(MachineBasicBlock &MBB,
     }
     MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII->get(Opcode), Reg)
       .addReg(Reg).addImm(ThisVal);
-    // The PSW implicit def is dead.
+    // The CC implicit def is dead.
     MI->getOperand(3).setIsDead();
     NumBytes -= ThisVal;
   }
@@ -297,7 +317,7 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineModuleInfo &MMI = MF.getMMI();
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
   const std::vector<CalleeSavedInfo> &CSI = MFFrame->getCalleeSavedInfo();
   bool HasFP = hasFP(MF);
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -321,9 +341,8 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned Reg = I->getReg();
       if (SystemZ::GR64BitRegClass.contains(Reg)) {
         int64_t Offset = SPOffsetFromCFA + RegSpillOffsets[Reg];
-        MachineLocation StackSlot(MachineLocation::VirtualFP, Offset);
-        MachineLocation RegValue(Reg);
-        Moves.push_back(MachineMove(GPRSaveLabel, StackSlot, RegValue));
+        MMI.addFrameInst(MCCFIInstruction::createOffset(
+            GPRSaveLabel, MRI->getDwarfRegNum(Reg, true), Offset));
       }
     }
   }
@@ -338,9 +357,8 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
     MCSymbol *AdjustSPLabel = MMI.getContext().CreateTempSymbol();
     BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
       .addSym(AdjustSPLabel);
-    MachineLocation FPDest(MachineLocation::VirtualFP);
-    MachineLocation FPSrc(MachineLocation::VirtualFP, SPOffsetFromCFA + Delta);
-    Moves.push_back(MachineMove(AdjustSPLabel, FPDest, FPSrc));
+    MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+        AdjustSPLabel, SPOffsetFromCFA + Delta));
     SPOffsetFromCFA += Delta;
   }
 
@@ -353,9 +371,9 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
     MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol();
     BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::PROLOG_LABEL))
       .addSym(SetFPLabel);
-    MachineLocation HardFP(SystemZ::R11D);
-    MachineLocation VirtualFP(MachineLocation::VirtualFP);
-    Moves.push_back(MachineMove(SetFPLabel, HardFP, VirtualFP));
+    unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
+    MMI.addFrameInst(
+        MCCFIInstruction::createDefCfaRegister(SetFPLabel, HardFP));
 
     // Mark the FramePtr as live at the beginning of every block except
     // the entry block.  (We'll have marked R11 as live on entry when
@@ -381,12 +399,10 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF) const {
       // Add CFI for the this save.
       if (!FPRSaveLabel)
         FPRSaveLabel = MMI.getContext().CreateTempSymbol();
-      unsigned Reg = I->getReg();
+      unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true);
       int64_t Offset = getFrameIndexOffset(MF, I->getFrameIdx());
-      MachineLocation Slot(MachineLocation::VirtualFP,
-                           SPOffsetFromCFA + Offset);
-      MachineLocation RegValue(Reg);
-      Moves.push_back(MachineMove(FPRSaveLabel, Slot, RegValue));
+      MMI.addFrameInst(MCCFIInstruction::createOffset(
+          FPRSaveLabel, Reg, SPOffsetFromCFA + Offset));
     }
   }
   // Complete the CFI for the FPR saves, modelling them as taking effect
@@ -404,8 +420,7 @@ void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
   SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
 
   // Skip the return instruction.
-  assert(MBBI->getOpcode() == SystemZ::RET &&
-         "Can only insert epilogue into returning blocks");
+  assert(MBBI->isReturn() && "Can only insert epilogue into returning blocks");
 
   uint64_t StackSize = getAllocatedStackSize(MF);
   if (ZFI->getLowSavedGPR()) {
@@ -453,11 +468,6 @@ int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
   // offset is therefore negative.
   int64_t Offset = (MFFrame->getObjectOffset(FI) +
                     MFFrame->getOffsetAdjustment());
-  if (FI >= 0)
-    // Non-fixed objects are allocated below the incoming stack pointer.
-    // Account for the space at the top of the frame that we choose not
-    // to allocate.
-    Offset += getUnallocatedTopBytes(MF);
 
   // Make the offset relative to the incoming stack pointer.
   Offset -= getOffsetOfLocalArea();
@@ -469,23 +479,12 @@ int SystemZFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
 }
 
 uint64_t SystemZFrameLowering::
-getUnallocatedTopBytes(const MachineFunction &MF) const {
-  return MF.getInfo<SystemZMachineFunctionInfo>()->getSavedGPRFrameSize();
-}
-
-uint64_t SystemZFrameLowering::
 getAllocatedStackSize(const MachineFunction &MF) const {
   const MachineFrameInfo *MFFrame = MF.getFrameInfo();
 
   // Start with the size of the local variables and spill slots.
   uint64_t StackSize = MFFrame->getStackSize();
 
-  // Remove any bytes that we choose not to allocate.
-  StackSize -= getUnallocatedTopBytes(MF);
-
-  // Include space for an emergency spill slot, if one might be needed.
-  StackSize += getEmergencySpillSlotSize(MF);
-
   // We need to allocate the ABI-defined 160-byte base area whenever
   // we allocate stack space for our own use and whenever we call another
   // function.
@@ -495,19 +494,6 @@ getAllocatedStackSize(const MachineFunction &MF) const {
   return StackSize;
 }
 
-unsigned SystemZFrameLowering::
-getEmergencySpillSlotSize(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFFrame = MF.getFrameInfo();
-  uint64_t MaxReach = MFFrame->getStackSize() + SystemZMC::CallFrameSize * 2;
-  return isUInt<12>(MaxReach) ? 0 : 8;
-}
-
-unsigned SystemZFrameLowering::
-getEmergencySpillSlotOffset(const MachineFunction &MF) const {
-  assert(getEmergencySpillSlotSize(MF) && "No emergency spill slot");
-  return SystemZMC::CallFrameSize;
-}
-
 bool
 SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
   // The ABI requires us to allocate 160 bytes of stack space for the callee,
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.h b/lib/Target/SystemZ/SystemZFrameLowering.h
index 5ca049c..9b0a1d5 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -29,7 +29,10 @@ public:
   SystemZFrameLowering(const SystemZTargetMachine &tm,
                        const SystemZSubtarget &sti);
 
-  // Override FrameLowering.
+  // Override TargetFrameLowering.
+  virtual bool isFPCloseToIncomingSP() const LLVM_OVERRIDE { return false; }
+  virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const
+    LLVM_OVERRIDE;
   virtual void
     processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                          RegScavenger *RS) const LLVM_OVERRIDE;
@@ -45,6 +48,8 @@ public:
                                 const std::vector<CalleeSavedInfo> &CSI,
                                 const TargetRegisterInfo *TRI) const
     LLVM_OVERRIDE;
+  virtual void processFunctionBeforeFrameFinalized(MachineFunction &MF,
+                                                   RegScavenger *RS) const;
   virtual void emitPrologue(MachineFunction &MF) const LLVM_OVERRIDE;
   virtual void emitEpilogue(MachineFunction &MF,
                             MachineBasicBlock &MBB) const LLVM_OVERRIDE;
@@ -59,29 +64,9 @@ public:
                                 MachineBasicBlock::iterator MI) const
     LLVM_OVERRIDE;
 
-  // The target-independent code automatically allocates save slots for
-  // call-saved GPRs.  However, we don't need those slots for SystemZ,
-  // because the ABI sets aside GPR save slots in the caller-allocated part
-  // of the frame.  Since the target-independent code puts this unneeded
-  // area at the top of the callee-allocated part of frame, we choose not
-  // to allocate it and adjust the offsets accordingly.  Return the
-  // size of this unallocated area.
-  // FIXME: seems a bit hackish.
-  uint64_t getUnallocatedTopBytes(const MachineFunction &MF) const;
-
   // Return the number of bytes in the callee-allocated part of the frame.
   uint64_t getAllocatedStackSize(const MachineFunction &MF) const;
 
-  // Return the number of frame bytes that should be reserved for
-  // an emergency spill slot, for use by the register scaveneger.
-  // Return 0 if register scaveging won't be needed.
-  unsigned getEmergencySpillSlotSize(const MachineFunction &MF) const;
-
-  // Return the offset from the frame pointer of the emergency spill slot,
-  // which always fits within a 12-bit unsigned displacement field.
-  // Only valid if getEmergencySpillSlotSize(MF) returns nonzero.
-  unsigned getEmergencySpillSlotOffset(const MachineFunction &MF) const;
-
   // Return the byte offset from the incoming stack pointer of Reg's
   // ABI-defined save slot.  Return 0 if no slot is defined for Reg.
   unsigned getRegSpillOffset(unsigned Reg) const {
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index d436ba9..f4a2773 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZTargetMachine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -91,44 +92,90 @@ struct SystemZAddressingMode {
   }
 };
 
+// Return a mask with Count low bits set.
+static uint64_t allOnes(unsigned int Count) {
+  return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
+}
+
+// Represents operands 2 to 5 of the ROTATE AND ... SELECTED BITS operation
+// given by Opcode.  The operands are: Input (R2), Start (I3), End (I4) and
+// Rotate (I5).  The combined operand value is effectively:
+//
+//   (or (rotl Input, Rotate), ~Mask)
+//
+// for RNSBG and:
+//
+//   (and (rotl Input, Rotate), Mask)
+//
+// otherwise.  The output value has BitSize bits, although Input may be
+// narrower (in which case the upper bits are don't care).
+struct RxSBGOperands {
+  RxSBGOperands(unsigned Op, SDValue N)
+    : Opcode(Op), BitSize(N.getValueType().getSizeInBits()),
+      Mask(allOnes(BitSize)), Input(N), Start(64 - BitSize), End(63),
+      Rotate(0) {}
+
+  unsigned Opcode;
+  unsigned BitSize;
+  uint64_t Mask;
+  SDValue Input;
+  unsigned Start;
+  unsigned End;
+  unsigned Rotate;
+};
+
 class SystemZDAGToDAGISel : public SelectionDAGISel {
   const SystemZTargetLowering &Lowering;
   const SystemZSubtarget &Subtarget;
 
   // Used by SystemZOperands.td to create integer constants.
-  inline SDValue getImm(const SDNode *Node, uint64_t Imm) {
+  inline SDValue getImm(const SDNode *Node, uint64_t Imm) const {
     return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
   }
 
+  const SystemZTargetMachine &getTargetMachine() const {
+    return static_cast<const SystemZTargetMachine &>(TM);
+  }
+
+  const SystemZInstrInfo *getInstrInfo() const {
+    return getTargetMachine().getInstrInfo();
+  }
+
   // Try to fold more of the base or index of AM into AM, where IsBase
   // selects between the base and index.
-  bool expandAddress(SystemZAddressingMode &AM, bool IsBase);
+  bool expandAddress(SystemZAddressingMode &AM, bool IsBase) const;
 
   // Try to describe N in AM, returning true on success.
-  bool selectAddress(SDValue N, SystemZAddressingMode &AM);
+  bool selectAddress(SDValue N, SystemZAddressingMode &AM) const;
 
   // Extract individual target operands from matched address AM.
   void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
-                          SDValue &Base, SDValue &Disp);
+                          SDValue &Base, SDValue &Disp) const;
   void getAddressOperands(const SystemZAddressingMode &AM, EVT VT,
-                          SDValue &Base, SDValue &Disp, SDValue &Index);
+                          SDValue &Base, SDValue &Disp, SDValue &Index) const;
 
   // Try to match Addr as a FormBD address with displacement type DR.
   // Return true on success, storing the base and displacement in
   // Base and Disp respectively.
   bool selectBDAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
-                    SDValue &Base, SDValue &Disp);
+                    SDValue &Base, SDValue &Disp) const;
+
+  // Try to match Addr as a FormBDX address with displacement type DR.
+  // Return true on success and if the result had no index.  Store the
+  // base and displacement in Base and Disp respectively.
+  bool selectMVIAddr(SystemZAddressingMode::DispRange DR, SDValue Addr,
+                     SDValue &Base, SDValue &Disp) const;
 
   // Try to match Addr as a FormBDX* address of form Form with
   // displacement type DR.  Return true on success, storing the base,
   // displacement and index in Base, Disp and Index respectively.
   bool selectBDXAddr(SystemZAddressingMode::AddrForm Form,
                      SystemZAddressingMode::DispRange DR, SDValue Addr,
-                     SDValue &Base, SDValue &Disp, SDValue &Index);
+                     SDValue &Base, SDValue &Disp, SDValue &Index) const;
 
   // PC-relative address matching routines used by SystemZOperands.td.
-  bool selectPCRelAddress(SDValue Addr, SDValue &Target) {
-    if (Addr.getOpcode() == SystemZISD::PCREL_WRAPPER) {
+  bool selectPCRelAddress(SDValue Addr, SDValue &Target) const {
+    if (SystemZISD::isPCREL(Addr.getOpcode())) {
       Target = Addr.getOperand(0);
       return true;
     }
@@ -136,69 +183,104 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   }
 
   // BD matching routines used by SystemZOperands.td.
-  bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp12Only, Addr, Base, Disp);
   }
-  bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
   }
-  bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp20Only, Addr, Base, Disp);
   }
-  bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) {
+  bool selectBDAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
     return selectBDAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
   }
 
+  // MVI matching routines used by SystemZOperands.td.
+  bool selectMVIAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp12Pair, Addr, Base, Disp);
+  }
+  bool selectMVIAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp) const {
+    return selectMVIAddr(SystemZAddressingMode::Disp20Pair, Addr, Base, Disp);
+  }
+
   // BDX matching routines used by SystemZOperands.td.
   bool selectBDXAddr12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp12Only,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp12Pair,
                          Addr, Base, Disp, Index);
   }
   bool selectDynAlloc12Only(SDValue Addr, SDValue &Base, SDValue &Disp,
-                            SDValue &Index) {
+                            SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXDynAlloc,
                          SystemZAddressingMode::Disp12Only,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr20Only(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp20Only,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr20Only128(SDValue Addr, SDValue &Base, SDValue &Disp,
-                              SDValue &Index) {
+                              SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp20Only128,
                          Addr, Base, Disp, Index);
   }
   bool selectBDXAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                           SDValue &Index) {
+                           SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXNormal,
                          SystemZAddressingMode::Disp20Pair,
                          Addr, Base, Disp, Index);
   }
   bool selectLAAddr12Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                          SDValue &Index) {
+                          SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
                          SystemZAddressingMode::Disp12Pair,
                          Addr, Base, Disp, Index);
   }
   bool selectLAAddr20Pair(SDValue Addr, SDValue &Base, SDValue &Disp,
-                          SDValue &Index) {
+                          SDValue &Index) const {
     return selectBDXAddr(SystemZAddressingMode::FormBDXLA,
                          SystemZAddressingMode::Disp20Pair,
                          Addr, Base, Disp, Index);
   }
 
+  // Check whether (or Op (and X InsertMask)) is effectively an insertion
+  // of X into bits InsertMask of some Y != Op.  Return true if so and
+  // set Op to that Y.
+  bool detectOrAndInsertion(SDValue &Op, uint64_t InsertMask) const;
+
+  // Try to update RxSBG so that only the bits of RxSBG.Input in Mask are used.
+  // Return true on success.
+  bool refineRxSBGMask(RxSBGOperands &RxSBG, uint64_t Mask) const;
+
+  // Try to fold some of RxSBG.Input into other fields of RxSBG.
+  // Return true on success.
+  bool expandRxSBG(RxSBGOperands &RxSBG) const;
+
+  // Return an undefined value of type VT.
+  SDValue getUNDEF(SDLoc DL, EVT VT) const;
+
+  // Convert N to VT, if it isn't already.
+  SDValue convertTo(SDLoc DL, EVT VT, SDValue N) const;
+
+  // Try to implement AND or shift node N using RISBG with the zero flag set.
+  // Return the selected node on success, otherwise return null.
+  SDNode *tryRISBGZero(SDNode *N);
+
+  // Try to use RISBG or Opcode to implement OR or XOR node N.
+  // Return the selected node on success, otherwise return null.
+  SDNode *tryRxSBG(SDNode *N, unsigned Opcode);
+
   // If Op0 is null, then Node is a constant that can be loaded using:
   //
   //   (Opcode UpperVal LowerVal)
@@ -209,6 +291,26 @@ class SystemZDAGToDAGISel : public SelectionDAGISel {
   SDNode *splitLargeImmediate(unsigned Opcode, SDNode *Node, SDValue Op0,
                               uint64_t UpperVal, uint64_t LowerVal);
 
+  // Return true if Load and Store are loads and stores of the same size
+  // and are guaranteed not to overlap.  Such operations can be implemented
+  // using block (SS-format) instructions.
+  //
+  // Partial overlap would lead to incorrect code, since the block operations
+  // are logically bytewise, even though they have a fast path for the
+  // non-overlapping case.  We also need to avoid full overlap (i.e. two
+  // addresses that might be equal at run time) because although that case
+  // would be handled correctly, it might be implemented by millicode.
+  bool canUseBlockOperation(StoreSDNode *Store, LoadSDNode *Load) const;
+
+  // N is a (store (load Y), X) pattern.  Return true if it can use an MVC
+  // from Y to X.
+  bool storeLoadCanUseMVC(SDNode *N) const;
+
+  // N is a (store (op (load A[0]), (load A[1])), X) pattern.  Return true
+  // if A[1 - I] == X and if N can use a block operation like NC from A[I]
+  // to X.
+  bool storeLoadCanUseBlockBinary(SDNode *N, unsigned I) const;
+
 public:
   SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(TM, OptLevel),
@@ -294,9 +396,9 @@ static bool expandIndex(SystemZAddressingMode &AM, SDValue Base,
 // The base or index of AM is equivalent to Op0 + Op1, where IsBase selects
 // between the base and index.  Try to fold Op1 into AM's displacement.
 static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
-                       SDValue Op0, ConstantSDNode *Op1) {
+                       SDValue Op0, uint64_t Op1) {
   // First try adjusting the displacement.
-  int64_t TestDisp = AM.Disp + Op1->getSExtValue();
+  int64_t TestDisp = AM.Disp + Op1;
   if (selectDisp(AM.DR, TestDisp)) {
     changeComponent(AM, IsBase, Op0);
     AM.Disp = TestDisp;
@@ -309,7 +411,7 @@ static bool expandDisp(SystemZAddressingMode &AM, bool IsBase,
 }
 
 bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
-                                        bool IsBase) {
+                                        bool IsBase) const {
   SDValue N = IsBase ? AM.Base : AM.Index;
   unsigned Opcode = N.getOpcode();
   if (Opcode == ISD::TRUNCATE) {
@@ -329,13 +431,23 @@ bool SystemZDAGToDAGISel::expandAddress(SystemZAddressingMode &AM,
       return expandAdjDynAlloc(AM, IsBase, Op0);
 
     if (Op0Code == ISD::Constant)
-      return expandDisp(AM, IsBase, Op1, cast<ConstantSDNode>(Op0));
+      return expandDisp(AM, IsBase, Op1,
+                        cast<ConstantSDNode>(Op0)->getSExtValue());
     if (Op1Code == ISD::Constant)
-      return expandDisp(AM, IsBase, Op0, cast<ConstantSDNode>(Op1));
+      return expandDisp(AM, IsBase, Op0,
+                        cast<ConstantSDNode>(Op1)->getSExtValue());
 
     if (IsBase && expandIndex(AM, Op0, Op1))
       return true;
   }
+  if (Opcode == SystemZISD::PCREL_OFFSET) {
+    SDValue Full = N.getOperand(0);
+    SDValue Base = N.getOperand(1);
+    SDValue Anchor = Base.getOperand(0);
+    uint64_t Offset = (cast<GlobalAddressSDNode>(Full)->getOffset() -
+                       cast<GlobalAddressSDNode>(Anchor)->getOffset());
+    return expandDisp(AM, IsBase, Base, Offset);
+  }
   return false;
 }
 
@@ -414,14 +526,15 @@ static bool shouldUseLA(SDNode *Base, int64_t Disp, SDNode *Index) {
 
 // Return true if Addr is suitable for AM, updating AM if so.
 bool SystemZDAGToDAGISel::selectAddress(SDValue Addr,
-                                        SystemZAddressingMode &AM) {
+                                        SystemZAddressingMode &AM) const {
   // Start out assuming that the address will need to be loaded separately,
   // then try to extend it as much as we can.
   AM.Base = Addr;
 
   // First try treating the address as a constant.
   if (Addr.getOpcode() == ISD::Constant &&
-      expandDisp(AM, true, SDValue(), cast<ConstantSDNode>(Addr)))
+      expandDisp(AM, true, SDValue(),
+                 cast<ConstantSDNode>(Addr)->getSExtValue()))
     ;
   else
     // Otherwise try expanding each component.
@@ -461,7 +574,7 @@ static void insertDAGNode(SelectionDAG *DAG, SDNode *Pos, SDValue N) {
 
 void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
                                              EVT VT, SDValue &Base,
-                                             SDValue &Disp) {
+                                             SDValue &Disp) const {
   Base = AM.Base;
   if (!Base.getNode())
     // Register 0 means "no base".  This is mostly useful for shifts.
@@ -474,7 +587,7 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
     // Truncate values from i64 to i32, for shifts.
     assert(VT == MVT::i32 && Base.getValueType() == MVT::i64 &&
            "Unexpected truncation");
-    DebugLoc DL = Base.getDebugLoc();
+    SDLoc DL(Base);
     SDValue Trunc = CurDAG->getNode(ISD::TRUNCATE, DL, VT, Base);
     insertDAGNode(CurDAG, Base.getNode(), Trunc);
     Base = Trunc;
@@ -486,7 +599,8 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
 
 void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
                                              EVT VT, SDValue &Base,
-                                             SDValue &Disp, SDValue &Index) {
+                                             SDValue &Disp,
+                                             SDValue &Index) const {
   getAddressOperands(AM, VT, Base, Disp);
 
   Index = AM.Index;
@@ -497,7 +611,7 @@ void SystemZDAGToDAGISel::getAddressOperands(const SystemZAddressingMode &AM,
 
 bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
                                        SDValue Addr, SDValue &Base,
-                                       SDValue &Disp) {
+                                       SDValue &Disp) const {
   SystemZAddressingMode AM(SystemZAddressingMode::FormBD, DR);
   if (!selectAddress(Addr, AM))
     return false;
@@ -506,10 +620,21 @@ bool SystemZDAGToDAGISel::selectBDAddr(SystemZAddressingMode::DispRange DR,
   return true;
 }
 
+bool SystemZDAGToDAGISel::selectMVIAddr(SystemZAddressingMode::DispRange DR,
+                                        SDValue Addr, SDValue &Base,
+                                        SDValue &Disp) const {
+  SystemZAddressingMode AM(SystemZAddressingMode::FormBDXNormal, DR);
+  if (!selectAddress(Addr, AM) || AM.Index.getNode())
+    return false;
+
+  getAddressOperands(AM, Addr.getValueType(), Base, Disp);
+  return true;
+}
+
 bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
                                         SystemZAddressingMode::DispRange DR,
                                         SDValue Addr, SDValue &Base,
-                                        SDValue &Disp, SDValue &Index) {
+                                        SDValue &Disp, SDValue &Index) const {
   SystemZAddressingMode AM(Form, DR);
   if (!selectAddress(Addr, AM))
     return false;
@@ -518,11 +643,317 @@ bool SystemZDAGToDAGISel::selectBDXAddr(SystemZAddressingMode::AddrForm Form,
   return true;
 }
 
+bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
+                                               uint64_t InsertMask) const {
+  // We're only interested in cases where the insertion is into some operand
+  // of Op, rather than into Op itself.  The only useful case is an AND.
+  if (Op.getOpcode() != ISD::AND)
+    return false;
+
+  // We need a constant mask.
+  ConstantSDNode *MaskNode =
+    dyn_cast<ConstantSDNode>(Op.getOperand(1).getNode());
+  if (!MaskNode)
+    return false;
+
+  // It's not an insertion of Op.getOperand(0) if the two masks overlap.
+  uint64_t AndMask = MaskNode->getZExtValue();
+  if (InsertMask & AndMask)
+    return false;
+
+  // It's only an insertion if all bits are covered or are known to be zero.
+  // The inner check covers all cases but is more expensive.
+  uint64_t Used = allOnes(Op.getValueType().getSizeInBits());
+  if (Used != (AndMask | InsertMask)) {
+    APInt KnownZero, KnownOne;
+    CurDAG->ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne);
+    if (Used != (AndMask | InsertMask | KnownZero.getZExtValue()))
+      return false;
+  }
+
+  Op = Op.getOperand(0);
+  return true;
+}
+
+bool SystemZDAGToDAGISel::refineRxSBGMask(RxSBGOperands &RxSBG,
+                                          uint64_t Mask) const {
+  const SystemZInstrInfo *TII = getInstrInfo();
+  if (RxSBG.Rotate != 0)
+    Mask = (Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate));
+  Mask &= RxSBG.Mask;
+  if (TII->isRxSBGMask(Mask, RxSBG.BitSize, RxSBG.Start, RxSBG.End)) {
+    RxSBG.Mask = Mask;
+    return true;
+  }
+  return false;
+}
+
+// Return true if any bits of (RxSBG.Input & Mask) are significant.
+static bool maskMatters(RxSBGOperands &RxSBG, uint64_t Mask) {
+  // Rotate the mask in the same way as RxSBG.Input is rotated.
+  if (RxSBG.Rotate != 0)
+    Mask = ((Mask << RxSBG.Rotate) | (Mask >> (64 - RxSBG.Rotate)));
+  return (Mask & RxSBG.Mask) != 0;
+}
+
+bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
+  SDValue N = RxSBG.Input;
+  unsigned Opcode = N.getOpcode();
+  switch (Opcode) {
+  case ISD::AND: {
+    if (RxSBG.Opcode == SystemZ::RNSBG)
+      return false;
+
+    ConstantSDNode *MaskNode =
+      dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!MaskNode)
+      return false;
+
+    SDValue Input = N.getOperand(0);
+    uint64_t Mask = MaskNode->getZExtValue();
+    if (!refineRxSBGMask(RxSBG, Mask)) {
+      // If some bits of Input are already known zeros, those bits will have
+      // been removed from the mask.  See if adding them back in makes the
+      // mask suitable.
+      APInt KnownZero, KnownOne;
+      CurDAG->ComputeMaskedBits(Input, KnownZero, KnownOne);
+      Mask |= KnownZero.getZExtValue();
+      if (!refineRxSBGMask(RxSBG, Mask))
+        return false;
+    }
+    RxSBG.Input = Input;
+    return true;
+  }
+
+  case ISD::OR: {
+    if (RxSBG.Opcode != SystemZ::RNSBG)
+      return false;
+
+    ConstantSDNode *MaskNode =
+      dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!MaskNode)
+      return false;
+
+    SDValue Input = N.getOperand(0);
+    uint64_t Mask = ~MaskNode->getZExtValue();
+    if (!refineRxSBGMask(RxSBG, Mask)) {
+      // If some bits of Input are already known ones, those bits will have
+      // been removed from the mask.  See if adding them back in makes the
+      // mask suitable.
+      APInt KnownZero, KnownOne;
+      CurDAG->ComputeMaskedBits(Input, KnownZero, KnownOne);
+      Mask &= ~KnownOne.getZExtValue();
+      if (!refineRxSBGMask(RxSBG, Mask))
+        return false;
+    }
+    RxSBG.Input = Input;
+    return true;
+  }
+
+  case ISD::ROTL: {
+    // Any 64-bit rotate left can be merged into the RxSBG.
+    if (RxSBG.BitSize != 64 || N.getValueType() != MVT::i64)
+      return false;
+    ConstantSDNode *CountNode
+      = dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    RxSBG.Rotate = (RxSBG.Rotate + CountNode->getZExtValue()) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+      
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND: {
+    // Check that the extension bits are don't-care (i.e. are masked out
+    // by the final mask).
+    unsigned InnerBitSize = N.getOperand(0).getValueType().getSizeInBits();
+    if (maskMatters(RxSBG, allOnes(RxSBG.BitSize) - allOnes(InnerBitSize)))
+      return false;
+
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
+  case ISD::SHL: {
+    ConstantSDNode *CountNode =
+      dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    uint64_t Count = CountNode->getZExtValue();
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    if (Count < 1 || Count >= BitSize)
+      return false;
+
+    if (RxSBG.Opcode == SystemZ::RNSBG) {
+      // Treat (shl X, count) as (rotl X, size-count) as long as the bottom
+      // count bits from RxSBG.Input are ignored.
+      if (maskMatters(RxSBG, allOnes(Count)))
+        return false;
+    } else {
+      // Treat (shl X, count) as (and (rotl X, count), ~0<<count).
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count) << Count))
+        return false;
+    }
+
+    RxSBG.Rotate = (RxSBG.Rotate + Count) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+
+  case ISD::SRL:
+  case ISD::SRA: {
+    ConstantSDNode *CountNode =
+      dyn_cast<ConstantSDNode>(N.getOperand(1).getNode());
+    if (!CountNode)
+      return false;
+
+    uint64_t Count = CountNode->getZExtValue();
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    if (Count < 1 || Count >= BitSize)
+      return false;
+
+    if (RxSBG.Opcode == SystemZ::RNSBG || Opcode == ISD::SRA) {
+      // Treat (srl|sra X, count) as (rotl X, size-count) as long as the top
+      // count bits from RxSBG.Input are ignored.
+      if (maskMatters(RxSBG, allOnes(Count) << (BitSize - Count)))
+        return false;
+    } else {
+      // Treat (srl X, count), mask) as (and (rotl X, size-count), ~0>>count),
+      // which is similar to SLL above.
+      if (!refineRxSBGMask(RxSBG, allOnes(BitSize - Count)))
+        return false;
+    }
+
+    RxSBG.Rotate = (RxSBG.Rotate - Count) & 63;
+    RxSBG.Input = N.getOperand(0);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+SDValue SystemZDAGToDAGISel::getUNDEF(SDLoc DL, EVT VT) const {
+  SDNode *N = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+  return SDValue(N, 0);
+}
+
+SDValue SystemZDAGToDAGISel::convertTo(SDLoc DL, EVT VT, SDValue N) const {
+  if (N.getValueType() == MVT::i32 && VT == MVT::i64)
+    return CurDAG->getTargetInsertSubreg(SystemZ::subreg_l32,
+                                         DL, VT, getUNDEF(DL, MVT::i64), N);
+  if (N.getValueType() == MVT::i64 && VT == MVT::i32)
+    return CurDAG->getTargetExtractSubreg(SystemZ::subreg_l32, DL, VT, N);
+  assert(N.getValueType() == VT && "Unexpected value types");
+  return N;
+}
+
+SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  RxSBGOperands RISBG(SystemZ::RISBG, SDValue(N, 0));
+  unsigned Count = 0;
+  while (expandRxSBG(RISBG))
+    if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND)
+      Count += 1;
+  if (Count == 0)
+    return 0;
+  if (Count == 1) {
+    // Prefer to use normal shift instructions over RISBG, since they can handle
+    // all cases and are sometimes shorter.
+    if (N->getOpcode() != ISD::AND)
+      return 0;
+
+    // Prefer register extensions like LLC over RISBG.  Also prefer to start
+    // out with normal ANDs if one instruction would be enough.  We can convert
+    // these ANDs into an RISBG later if a three-address instruction is useful.
+    if (VT == MVT::i32 ||
+        RISBG.Mask == 0xff ||
+        RISBG.Mask == 0xffff ||
+        SystemZ::isImmLF(~RISBG.Mask) ||
+        SystemZ::isImmHF(~RISBG.Mask)) {
+      // Force the new mask into the DAG, since it may include known-one bits.
+      ConstantSDNode *MaskN = cast<ConstantSDNode>(N->getOperand(1).getNode());
+      if (MaskN->getZExtValue() != RISBG.Mask) {
+        SDValue NewMask = CurDAG->getConstant(RISBG.Mask, VT);
+        N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewMask);
+        return SelectCode(N);
+      }
+      return 0;
+    }
+  }  
+
+  unsigned Opcode = SystemZ::RISBG;
+  EVT OpcodeVT = MVT::i64;
+  if (VT == MVT::i32 && Subtarget.hasHighWord()) {
+    Opcode = SystemZ::RISBMux;
+    OpcodeVT = MVT::i32;
+    RISBG.Start &= 31;
+    RISBG.End &= 31;
+  }
+  SDValue Ops[5] = {
+    getUNDEF(SDLoc(N), OpcodeVT),
+    convertTo(SDLoc(N), OpcodeVT, RISBG.Input),
+    CurDAG->getTargetConstant(RISBG.Start, MVT::i32),
+    CurDAG->getTargetConstant(RISBG.End | 128, MVT::i32),
+    CurDAG->getTargetConstant(RISBG.Rotate, MVT::i32)
+  };
+  N = CurDAG->getMachineNode(Opcode, SDLoc(N), OpcodeVT, Ops);
+  return convertTo(SDLoc(N), VT, SDValue(N, 0)).getNode();
+}
+
+SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
+  // Try treating each operand of N as the second operand of the RxSBG
+  // and see which goes deepest.
+  RxSBGOperands RxSBG[] = {
+    RxSBGOperands(Opcode, N->getOperand(0)),
+    RxSBGOperands(Opcode, N->getOperand(1))
+  };
+  unsigned Count[] = { 0, 0 };
+  for (unsigned I = 0; I < 2; ++I)
+    while (expandRxSBG(RxSBG[I]))
+      if (RxSBG[I].Input.getOpcode() != ISD::ANY_EXTEND)
+        Count[I] += 1;
+
+  // Do nothing if neither operand is suitable.
+  if (Count[0] == 0 && Count[1] == 0)
+    return 0;
+
+  // Pick the deepest second operand.
+  unsigned I = Count[0] > Count[1] ? 0 : 1;
+  SDValue Op0 = N->getOperand(I ^ 1);
+
+  // Prefer IC for character insertions from memory.
+  if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0)
+    if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
+      if (Load->getMemoryVT() == MVT::i8)
+        return 0;
+
+  // See whether we can avoid an AND in the first operand by converting
+  // ROSBG to RISBG.
+  if (Opcode == SystemZ::ROSBG && detectOrAndInsertion(Op0, RxSBG[I].Mask))
+    Opcode = SystemZ::RISBG;
+           
+  EVT VT = N->getValueType(0);
+  SDValue Ops[5] = {
+    convertTo(SDLoc(N), MVT::i64, Op0),
+    convertTo(SDLoc(N), MVT::i64, RxSBG[I].Input),
+    CurDAG->getTargetConstant(RxSBG[I].Start, MVT::i32),
+    CurDAG->getTargetConstant(RxSBG[I].End, MVT::i32),
+    CurDAG->getTargetConstant(RxSBG[I].Rotate, MVT::i32)
+  };
+  N = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, Ops);
+  return convertTo(SDLoc(N), VT, SDValue(N, 0)).getNode();
+}
+
 SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
                                                  SDValue Op0, uint64_t UpperVal,
                                                  uint64_t LowerVal) {
   EVT VT = Node->getValueType(0);
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   SDValue Upper = CurDAG->getConstant(UpperVal, VT);
   if (Op0.getNode())
     Upper = CurDAG->getNode(Opcode, DL, VT, Op0, Upper);
@@ -533,6 +964,64 @@ SDNode *SystemZDAGToDAGISel::splitLargeImmediate(unsigned Opcode, SDNode *Node,
   return Or.getNode();
 }
 
+bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
+                                               LoadSDNode *Load) const {
+  // Check that the two memory operands have the same size.
+  if (Load->getMemoryVT() != Store->getMemoryVT())
+    return false;
+
+  // Volatility stops an access from being decomposed.
+  if (Load->isVolatile() || Store->isVolatile())
+    return false;
+
+  // There's no chance of overlap if the load is invariant.
+  if (Load->isInvariant())
+    return true;
+
+  // Otherwise we need to check whether there's an alias.
+  const Value *V1 = Load->getSrcValue();
+  const Value *V2 = Store->getSrcValue();
+  if (!V1 || !V2)
+    return false;
+
+  // Reject equality.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
+  int64_t End1 = Load->getSrcValueOffset() + Size;
+  int64_t End2 = Store->getSrcValueOffset() + Size;
+  if (V1 == V2 && End1 == End2)
+    return false;
+
+  return !AA->alias(AliasAnalysis::Location(V1, End1, Load->getTBAAInfo()),
+                    AliasAnalysis::Location(V2, End2, Store->getTBAAInfo()));
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseMVC(SDNode *N) const {
+  StoreSDNode *Store = cast<StoreSDNode>(N);
+  LoadSDNode *Load = cast<LoadSDNode>(Store->getValue());
+
+  // Prefer not to use MVC if either address can use ... RELATIVE LONG
+  // instructions.
+  uint64_t Size = Load->getMemoryVT().getStoreSize();
+  if (Size > 1 && Size <= 8) {
+    // Prefer LHRL, LRL and LGRL.
+    if (SystemZISD::isPCREL(Load->getBasePtr().getOpcode()))
+      return false;
+    // Prefer STHRL, STRL and STGRL.
+    if (SystemZISD::isPCREL(Store->getBasePtr().getOpcode()))
+      return false;
+  }
+
+  return canUseBlockOperation(Store, Load);
+}
+
+bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
+                                                     unsigned I) const {
+  StoreSDNode *StoreA = cast<StoreSDNode>(N);
+  LoadSDNode *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
+  LoadSDNode *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
+  return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
+}
+
 SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
   // Dump information about the Node being selected
   DEBUG(errs() << "Selecting: "; Node->dump(CurDAG); errs() << "\n");
@@ -540,16 +1029,26 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
     DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
+    Node->setNodeId(-1);
     return 0;
   }
 
   unsigned Opcode = Node->getOpcode();
+  SDNode *ResNode = 0;
   switch (Opcode) {
   case ISD::OR:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      ResNode = tryRxSBG(Node, SystemZ::ROSBG);
+    goto or_xor;
+
   case ISD::XOR:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      ResNode = tryRxSBG(Node, SystemZ::RXSBG);
+    // Fall through.
+  or_xor:
     // If this is a 64-bit operation in which both 32-bit halves are nonzero,
     // split the operation into two.
-    if (Node->getValueType(0) == MVT::i64)
+    if (!ResNode && Node->getValueType(0) == MVT::i64)
       if (ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
         uint64_t Val = Op1->getZExtValue();
         if (!SystemZ::isImmLF(Val) && !SystemZ::isImmHF(Val))
@@ -558,6 +1057,17 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
       }
     break;
 
+  case ISD::AND:
+    if (Node->getOperand(1).getOpcode() != ISD::Constant)
+      ResNode = tryRxSBG(Node, SystemZ::RNSBG);
+    // Fall through.
+  case ISD::ROTL:
+  case ISD::SHL:
+  case ISD::SRL:
+    if (!ResNode)
+      ResNode = tryRISBGZero(Node);
+    break;
+
   case ISD::Constant:
     // If this is a 64-bit constant that is out of the range of LLILF,
     // LLIHF and LGFI, split it into two 32-bit pieces.
@@ -582,10 +1092,32 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
       }
     }
     break;
+
+  case SystemZISD::SELECT_CCMASK: {
+    SDValue Op0 = Node->getOperand(0);
+    SDValue Op1 = Node->getOperand(1);
+    // Prefer to put any load first, so that it can be matched as a
+    // conditional load.
+    if (Op1.getOpcode() == ISD::LOAD && Op0.getOpcode() != ISD::LOAD) {
+      SDValue CCValid = Node->getOperand(2);
+      SDValue CCMask = Node->getOperand(3);
+      uint64_t ConstCCValid =
+        cast<ConstantSDNode>(CCValid.getNode())->getZExtValue();
+      uint64_t ConstCCMask =
+        cast<ConstantSDNode>(CCMask.getNode())->getZExtValue();
+      // Invert the condition.
+      CCMask = CurDAG->getConstant(ConstCCValid ^ ConstCCMask,
+                                   CCMask.getValueType());
+      SDValue Op4 = Node->getOperand(4);
+      Node = CurDAG->UpdateNodeOperands(Node, Op1, Op0, CCValid, CCMask, Op4);
+    }
+    break;
+  }
   }
 
   // Select the default instruction
-  SDNode *ResNode = SelectCode(Node);
+  if (!ResNode)
+    ResNode = SelectCode(Node);
 
   DEBUG(errs() << "=> ";
         if (ResNode == NULL || ResNode == Node)
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index eb21b31..f6e1853 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -23,8 +23,23 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 
+#include <cctype>
+
 using namespace llvm;
 
+namespace {
+// Represents a sequence for extracting a 0/1 value from an IPM result:
+// (((X ^ XORValue) + AddValue) >> Bit)
+struct IPMConversion {
+  IPMConversion(unsigned xorValue, int64_t addValue, unsigned bit)
+    : XORValue(xorValue), AddValue(addValue), Bit(bit) {}
+
+  int64_t XORValue;
+  int64_t AddValue;
+  unsigned Bit;
+};
+}
+
 // Classify VT as either 32 or 64 bit.
 static bool is32Bit(EVT VT) {
   switch (VT.getSimpleVT().SimpleTy) {
@@ -51,7 +66,10 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   MVT PtrVT = getPointerTy();
 
   // Set up the register classes.
-  addRegisterClass(MVT::i32,  &SystemZ::GR32BitRegClass);
+  if (Subtarget.hasHighWord())
+    addRegisterClass(MVT::i32, &SystemZ::GRX32BitRegClass);
+  else
+    addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
   addRegisterClass(MVT::i64,  &SystemZ::GR64BitRegClass);
   addRegisterClass(MVT::f32,  &SystemZ::FP32BitRegClass);
   addRegisterClass(MVT::f64,  &SystemZ::FP64BitRegClass);
@@ -67,7 +85,7 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
 
   // TODO: It may be better to default to latency-oriented scheduling, however
   // LLVM's current latency-oriented scheduler can't handle physreg definitions
-  // such as SystemZ has with PSW, so set this to the register-pressure
+  // such as SystemZ has with CC, so set this to the register-pressure
   // scheduler, because it can.
   setSchedulingPreference(Sched::RegPressure);
 
@@ -83,8 +101,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
        ++I) {
     MVT VT = MVT::SimpleValueType(I);
     if (isTypeLegal(VT)) {
-      // Expand SETCC(X, Y, COND) into SELECT_CC(X, Y, 1, 0, COND).
-      setOperationAction(ISD::SETCC, VT, Expand);
+      // Lower SET_CC into an IPM-based sequence.
+      setOperationAction(ISD::SETCC, VT, Custom);
 
       // Expand SELECT(C, A, B) into SELECT_CC(X, 0, A, B, NE).
       setOperationAction(ISD::SELECT, VT, Expand);
@@ -128,9 +146,11 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
       setOperationAction(ISD::ROTR,            VT, Expand);
 
-      // Use *MUL_LOHI where possible and a wider multiplication otherwise.
+      // Use *MUL_LOHI where possible instead of MULH*.
       setOperationAction(ISD::MULHS, VT, Expand);
       setOperationAction(ISD::MULHU, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Custom);
+      setOperationAction(ISD::UMUL_LOHI, VT, Custom);
 
       // We have instructions for signed but not unsigned FP conversion.
       setOperationAction(ISD::FP_TO_UINT, VT, Expand);
@@ -165,14 +185,6 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   // Give LowerOperation the chance to replace 64-bit ORs with subregs.
   setOperationAction(ISD::OR, MVT::i64, Custom);
 
-  // The architecture has 32-bit SMUL_LOHI and UMUL_LOHI (MR and MLR),
-  // but they aren't really worth using.  There is no 64-bit SMUL_LOHI,
-  // but there is a 64-bit UMUL_LOHI: MLGR.
-  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);
-
   // FIXME: Can we support these natively?
   setOperationAction(ISD::SRL_PARTS, MVT::i64, Expand);
   setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
@@ -200,10 +212,8 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   setOperationAction(ISD::STACKSAVE,    MVT::Other, Custom);
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
 
-  // Expand these using getExceptionSelectorRegister() and
-  // getExceptionPointerRegister().
-  setOperationAction(ISD::EXCEPTIONADDR, PtrVT, Expand);
-  setOperationAction(ISD::EHSELECTION,   PtrVT, Expand);
+  // Handle prefetches with PFD or PFDRL.
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
   // Handle floating-point types.
   for (unsigned I = MVT::FIRST_FP_VALUETYPE;
@@ -214,6 +224,15 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
       // We can use FI for FRINT.
       setOperationAction(ISD::FRINT, VT, Legal);
 
+      // We can use the extended form of FI for other rounding operations.
+      if (Subtarget.hasFPExtension()) {
+        setOperationAction(ISD::FNEARBYINT, VT, Legal);
+        setOperationAction(ISD::FFLOOR, VT, Legal);
+        setOperationAction(ISD::FCEIL, VT, Legal);
+        setOperationAction(ISD::FTRUNC, VT, Legal);
+        setOperationAction(ISD::FROUND, VT, Legal);
+      }
+
       // No special instructions for these.
       setOperationAction(ISD::FSIN, VT, Expand);
       setOperationAction(ISD::FCOS, VT, Expand);
@@ -246,6 +265,43 @@ SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm)
   setOperationAction(ISD::VASTART, MVT::Other, Custom);
   setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
   setOperationAction(ISD::VAEND,   MVT::Other, Expand);
+
+  // We want to use MVC in preference to even a single load/store pair.
+  MaxStoresPerMemcpy = 0;
+  MaxStoresPerMemcpyOptSize = 0;
+
+  // The main memset sequence is a byte store followed by an MVC.
+  // Two STC or MV..I stores win over that, but the kind of fused stores
+  // generated by target-independent code don't when the byte value is
+  // variable.  E.g.  "STC <reg>;MHI <reg>,257;STH <reg>" is not better
+  // than "STC;MVC".  Handle the choice in target-specific code instead.
+  MaxStoresPerMemset = 0;
+  MaxStoresPerMemsetOptSize = 0;
+}
+
+EVT SystemZTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  case MVT::f128:
+    return false;
+  default:
+    break;
+  }
+
+  return false;
 }
 
 bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
@@ -253,6 +309,47 @@ bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
   return Imm.isZero() || Imm.isNegZero();
 }
 
+bool SystemZTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
+                                                          bool *Fast) const {
+  // Unaligned accesses should never be slower than the expanded version.
+  // We check specifically for aligned accesses in the few cases where
+  // they are required.
+  if (Fast)
+    *Fast = true;
+  return true;
+}
+  
+bool SystemZTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                  Type *Ty) const {
+  // Punt on globals for now, although they can be used in limited
+  // RELATIVE LONG cases.
+  if (AM.BaseGV)
+    return false;
+
+  // Require a 20-bit signed offset.
+  if (!isInt<20>(AM.BaseOffs))
+    return false;
+
+  // Indexing is OK but no scale factor can be applied.
+  return AM.Scale == 0 || AM.Scale == 1;
+}
+
+bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
+  if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
+    return false;
+  unsigned FromBits = FromType->getPrimitiveSizeInBits();
+  unsigned ToBits = ToType->getPrimitiveSizeInBits();
+  return FromBits > ToBits;
+}
+
+bool SystemZTargetLowering::isTruncateFree(EVT FromVT, EVT ToVT) const {
+  if (!FromVT.isInteger() || !ToVT.isInteger())
+    return false;
+  unsigned FromBits = FromVT.getSizeInBits();
+  unsigned ToBits = ToVT.getSizeInBits();
+  return FromBits > ToBits;
+}
+
 //===----------------------------------------------------------------------===//
 // Inline asm support
 //===----------------------------------------------------------------------===//
@@ -264,6 +361,7 @@ SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
     case 'a': // Address register
     case 'd': // Data register (equivalent to 'r')
     case 'f': // Floating-point register
+    case 'h': // High-part register
     case 'r': // General-purpose register
       return C_RegisterClass;
 
@@ -306,6 +404,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
 
   case 'a': // Address register
   case 'd': // Data register (equivalent to 'r')
+  case 'h': // High-part register
   case 'r': // General-purpose register
     if (CallOperandVal->getType()->isIntegerTy())
       weight = CW_Register;
@@ -349,8 +448,24 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
   return weight;
 }
 
+// Parse a "{tNNN}" register constraint for which the register type "t"
+// has already been verified.  MC is the class associated with "t" and
+// Map maps 0-based register numbers to LLVM register numbers.
+static std::pair<unsigned, const TargetRegisterClass *>
+parseRegisterNumber(const std::string &Constraint,
+                    const TargetRegisterClass *RC, const unsigned *Map) {
+  assert(*(Constraint.end()-1) == '}' && "Missing '}'");
+  if (isdigit(Constraint[2])) {
+    std::string Suffix(Constraint.data() + 2, Constraint.size() - 2);
+    unsigned Index = atoi(Suffix.c_str());
+    if (Index < 16 && Map[Index])
+      return std::make_pair(Map[Index], RC);
+  }
+  return std::make_pair(0u, static_cast<TargetRegisterClass*>(0));
+}
+
 std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC Constraint Letters
     switch (Constraint[0]) {
@@ -370,6 +485,9 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
         return std::make_pair(0U, &SystemZ::ADDR128BitRegClass);
       return std::make_pair(0U, &SystemZ::ADDR32BitRegClass);
 
+    case 'h': // High-part register (an LLVM extension)
+      return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
+
     case 'f': // Floating-point register
       if (VT == MVT::f64)
         return std::make_pair(0U, &SystemZ::FP64BitRegClass);
@@ -378,6 +496,32 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
       return std::make_pair(0U, &SystemZ::FP32BitRegClass);
     }
   }
+  if (Constraint[0] == '{') {
+    // We need to override the default register parsing for GPRs and FPRs
+    // because the interpretation depends on VT.  The internal names of
+    // the registers are also different from the external names
+    // (F0D and F0S instead of F0, etc.).
+    if (Constraint[1] == 'r') {
+      if (VT == MVT::i32)
+        return parseRegisterNumber(Constraint, &SystemZ::GR32BitRegClass,
+                                   SystemZMC::GR32Regs);
+      if (VT == MVT::i128)
+        return parseRegisterNumber(Constraint, &SystemZ::GR128BitRegClass,
+                                   SystemZMC::GR128Regs);
+      return parseRegisterNumber(Constraint, &SystemZ::GR64BitRegClass,
+                                 SystemZMC::GR64Regs);
+    }
+    if (Constraint[1] == 'f') {
+      if (VT == MVT::f32)
+        return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
+                                   SystemZMC::FP32Regs);
+      if (VT == MVT::f128)
+        return parseRegisterNumber(Constraint, &SystemZ::FP128BitRegClass,
+                                   SystemZMC::FP128Regs);
+      return parseRegisterNumber(Constraint, &SystemZ::FP64BitRegClass,
+                                 SystemZMC::FP64Regs);
+    }
+  }
   return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
 }
 
@@ -433,10 +577,21 @@ LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
 
 #include "SystemZGenCallingConv.inc"
 
+bool SystemZTargetLowering::allowTruncateForTailCall(Type *FromType,
+                                                     Type *ToType) const {
+  return isTruncateFree(FromType, ToType);
+}
+
+bool SystemZTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
+  return true;
+}
+
 // Value is a value that has been passed to us in the location described by VA
 // (and so has type VA.getLocVT()).  Convert Value to VA.getValVT(), chaining
 // any loads onto Chain.
-static SDValue convertLocVTToValVT(SelectionDAG &DAG, DebugLoc DL,
+static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDLoc DL,
                                    CCValAssign &VA, SDValue Chain,
                                    SDValue Value) {
   // If the argument has been promoted from a smaller type, insert an
@@ -461,7 +616,7 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, DebugLoc DL,
 // Value is a value of type VA.getValVT() that we need to copy into
 // the location described by VA.  Return a copy of Value converted to
 // VA.getValVT().  The caller is responsible for handling indirect values.
-static SDValue convertValVTToLocVT(SelectionDAG &DAG, DebugLoc DL,
+static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDLoc DL,
                                    CCValAssign &VA, SDValue Value) {
   switch (VA.getLocInfo()) {
   case CCValAssign::SExt:
@@ -480,7 +635,7 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, DebugLoc DL,
 SDValue SystemZTargetLowering::
 LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                     DebugLoc DL, SelectionDAG &DAG,
+                     SDLoc DL, SelectionDAG &DAG,
                      SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -595,35 +750,56 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
   return Chain;
 }
 
+static bool canUseSiblingCall(CCState ArgCCInfo,
+                              SmallVectorImpl<CCValAssign> &ArgLocs) {
+  // Punt if there are any indirect or stack arguments, or if the call
+  // needs the call-saved argument register R6.
+  for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+    CCValAssign &VA = ArgLocs[I];
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      return false;
+    if (!VA.isRegLoc())
+      return false;
+    unsigned Reg = VA.getLocReg();
+    if (Reg == SystemZ::R6H || Reg == SystemZ::R6L || Reg == SystemZ::R6D)
+      return false;
+  }
+  return true;
+}
+
 SDValue
 SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                  SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG = CLI.DAG;
-  DebugLoc &DL = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDLoc &DL = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
   SDValue Chain = CLI.Chain;
   SDValue Callee = CLI.Callee;
-  bool &isTailCall = CLI.IsTailCall;
+  bool &IsTailCall = CLI.IsTailCall;
   CallingConv::ID CallConv = CLI.CallConv;
   bool IsVarArg = CLI.IsVarArg;
   MachineFunction &MF = DAG.getMachineFunction();
   EVT PtrVT = getPointerTy();
 
-  // SystemZ target does not yet support tail call optimization.
-  isTailCall = false;
-
   // Analyze the operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState ArgCCInfo(CallConv, IsVarArg, MF, TM, ArgLocs, *DAG.getContext());
   ArgCCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
 
+  // We don't support GuaranteedTailCallOpt, only automatically-detected
+  // sibling calls.
+  if (IsTailCall && !canUseSiblingCall(ArgCCInfo, ArgLocs))
+    IsTailCall = false;
+
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = ArgCCInfo.getNextStackOffset();
 
   // Mark the start of the call.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true));
+  if (!IsTailCall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes, PtrVT, true),
+                                 DL);
 
   // Copy argument values to their designated locations.
   SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass;
@@ -672,22 +848,27 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                         &MemOpChains[0], MemOpChains.size());
 
-  // Build a sequence of copy-to-reg nodes, chained and glued together.
-  SDValue Glue;
-  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
-    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
-                             RegsToPass[I].second, Glue);
-    Glue = Chain.getValue(1);
-  }
-
   // Accept direct calls by converting symbolic call addresses to the
-  // associated Target* opcodes.
+  // associated Target* opcodes.  Force %r1 to be used for indirect
+  // tail calls.
+  SDValue Glue;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT);
     Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
   } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
     Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT);
     Callee = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Callee);
+  } else if (IsTailCall) {
+    Chain = DAG.getCopyToReg(Chain, DL, SystemZ::R1D, Callee, Glue);
+    Glue = Chain.getValue(1);
+    Callee = DAG.getRegister(SystemZ::R1D, Callee.getValueType());
+  }
+
+  // Build a sequence of copy-to-reg nodes, chained and glued together.
+  for (unsigned I = 0, E = RegsToPass.size(); I != E; ++I) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[I].first,
+                             RegsToPass[I].second, Glue);
+    Glue = Chain.getValue(1);
   }
 
   // The first call operand is the chain and the second is the target address.
@@ -707,6 +888,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Emit the call.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  if (IsTailCall)
+    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, &Ops[0], Ops.size());
   Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
   Glue = Chain.getValue(1);
 
@@ -714,7 +897,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   Chain = DAG.getCALLSEQ_END(Chain,
                              DAG.getConstant(NumBytes, PtrVT, true),
                              DAG.getConstant(0, PtrVT, true),
-                             Glue);
+                             Glue, DL);
   Glue = Chain.getValue(1);
 
   // Assign locations to each value returned by this call.
@@ -745,7 +928,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
                                    CallingConv::ID CallConv, bool IsVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
-                                   DebugLoc DL, SelectionDAG &DAG) const {
+                                   SDLoc DL, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
 
   // Assign locations to each returned value.
@@ -815,6 +998,96 @@ static unsigned CCMaskForCondCode(ISD::CondCode CC) {
 #undef CONV
 }
 
+// Return a sequence for getting a 1 from an IPM result when CC has a
+// value in CCMask and a 0 when CC has a value in CCValid & ~CCMask.
+// The handling of CC values outside CCValid doesn't matter.
+static IPMConversion getIPMConversion(unsigned CCValid, unsigned CCMask) {
+  // Deal with cases where the result can be taken directly from a bit
+  // of the IPM result.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_2 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, 0, SystemZ::IPM_CC + 1);
+
+  // Deal with cases where we can add a value to force the sign bit
+  // to contain the right value.  Putting the bit in 31 means we can
+  // use SRL rather than RISBG(L), and also makes it easier to get a
+  // 0/-1 value, so it has priority over the other tests below.
+  //
+  // These sequences rely on the fact that the upper two bits of the
+  // IPM result are zero.
+  uint64_t TopBit = uint64_t(1) << 31;
+  if (CCMask == (CCValid & SystemZ::CCMASK_0))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_1)))
+    return IPMConversion(0, -(2 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2)))
+    return IPMConversion(0, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_3))
+    return IPMConversion(0, TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(0, TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  // Next try inverting the value and testing a bit.  0/1 could be
+  // handled this way too, but we dealt with that case above.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_2)))
+    return IPMConversion(-1, 0, SystemZ::IPM_CC);
+
+  // Handle cases where adding a value forces a non-sign bit to contain
+  // the right value.
+  if (CCMask == (CCValid & (SystemZ::CCMASK_1 | SystemZ::CCMASK_2)))
+    return IPMConversion(0, 1 << SystemZ::IPM_CC, SystemZ::IPM_CC + 1);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0 | SystemZ::CCMASK_3)))
+    return IPMConversion(0, -(1 << SystemZ::IPM_CC), SystemZ::IPM_CC + 1);
+
+  // The remaing cases are 1, 2, 0/1/3 and 0/2/3.  All these are
+  // can be done by inverting the low CC bit and applying one of the
+  // sign-based extractions above.
+  if (CCMask == (CCValid & SystemZ::CCMASK_1))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(1 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & SystemZ::CCMASK_2))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_1
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC, -(3 << SystemZ::IPM_CC), 31);
+  if (CCMask == (CCValid & (SystemZ::CCMASK_0
+                            | SystemZ::CCMASK_2
+                            | SystemZ::CCMASK_3)))
+    return IPMConversion(1 << SystemZ::IPM_CC,
+                         TopBit - (1 << SystemZ::IPM_CC), 31);
+
+  llvm_unreachable("Unexpected CC combination");
+}
+
+// If a comparison described by IsUnsigned, CCMask, CmpOp0 and CmpOp1
+// can be converted to a comparison against zero, adjust the operands
+// as necessary.
+static void adjustZeroCmp(SelectionDAG &DAG, bool &IsUnsigned,
+                          SDValue &CmpOp0, SDValue &CmpOp1,
+                          unsigned &CCMask) {
+  if (IsUnsigned)
+    return;
+
+  ConstantSDNode *ConstOp1 = dyn_cast<ConstantSDNode>(CmpOp1.getNode());
+  if (!ConstOp1)
+    return;
+
+  int64_t Value = ConstOp1->getSExtValue();
+  if ((Value == -1 && CCMask == SystemZ::CCMASK_CMP_GT) ||
+      (Value == -1 && CCMask == SystemZ::CCMASK_CMP_LE) ||
+      (Value == 1 && CCMask == SystemZ::CCMASK_CMP_LT) ||
+      (Value == 1 && CCMask == SystemZ::CCMASK_CMP_GE)) {
+    CCMask ^= SystemZ::CCMASK_CMP_EQ;
+    CmpOp1 = DAG.getConstant(0, CmpOp1.getValueType());
+  }
+}
+
 // If a comparison described by IsUnsigned, CCMask, CmpOp0 and CmpOp1
 // is suitable for CLI(Y), CHHSI or CLHHSI, adjust the operands as necessary.
 static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
@@ -840,7 +1113,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
   uint64_t Mask = (1 << NumBits) - 1;
   if (Load->getExtensionType() == ISD::SEXTLOAD) {
     int64_t SignedValue = Constant->getSExtValue();
-    if (uint64_t(SignedValue) + (1 << (NumBits - 1)) > Mask)
+    if (uint64_t(SignedValue) + (1ULL << (NumBits - 1)) > Mask)
       return;
     // Unsigned comparison between two sign-extended values is equivalent
     // to unsigned comparison between two zero-extended values.
@@ -859,7 +1132,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
       if (Value == 0 && CCMask == SystemZ::CCMASK_CMP_LT)
         // Test whether the high bit of the byte is set.
         Value = 127, CCMask = SystemZ::CCMASK_CMP_GT, IsUnsigned = true;
-      else if (SignedValue == -1 && CCMask == SystemZ::CCMASK_CMP_GT)
+      else if (Value == 0 && CCMask == SystemZ::CCMASK_CMP_GE)
         // Test whether the high bit of the byte is clear.
         Value = 128, CCMask = SystemZ::CCMASK_CMP_LT, IsUnsigned = true;
       else
@@ -879,7 +1152,7 @@ static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
   ISD::LoadExtType ExtType = IsUnsigned ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
   if (CmpOp0.getValueType() != MVT::i32 ||
       Load->getExtensionType() != ExtType)
-    CmpOp0 = DAG.getExtLoad(ExtType, Load->getDebugLoc(), MVT::i32,
+    CmpOp0 = DAG.getExtLoad(ExtType, SDLoc(Load), MVT::i32,
                             Load->getChain(), Load->getBasePtr(),
                             Load->getPointerInfo(), Load->getMemoryVT(),
                             Load->isVolatile(), Load->isNonTemporal(),
@@ -891,67 +1164,309 @@ static void adjustSubwordCmp(SelectionDAG &DAG, bool &IsUnsigned,
     CmpOp1 = DAG.getConstant(Value, MVT::i32);
 }
 
-// Return true if a comparison described by CCMask, CmpOp0 and CmpOp1
-// is an equality comparison that is better implemented using unsigned
-// rather than signed comparison instructions.
-static bool preferUnsignedComparison(SelectionDAG &DAG, SDValue CmpOp0,
-                                     SDValue CmpOp1, unsigned CCMask) {
-  // The test must be for equality or inequality.
-  if (CCMask != SystemZ::CCMASK_CMP_EQ && CCMask != SystemZ::CCMASK_CMP_NE)
+// Return true if Op is either an unextended load, or a load suitable
+// for integer register-memory comparisons of type ICmpType.
+static bool isNaturalMemoryOperand(SDValue Op, unsigned ICmpType) {
+  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op.getNode());
+  if (Load) {
+    // There are no instructions to compare a register with a memory byte.
+    if (Load->getMemoryVT() == MVT::i8)
+      return false;
+    // Otherwise decide on extension type.
+    switch (Load->getExtensionType()) {
+    case ISD::NON_EXTLOAD:
+      return true;
+    case ISD::SEXTLOAD:
+      return ICmpType != SystemZICMP::UnsignedOnly;
+    case ISD::ZEXTLOAD:
+      return ICmpType != SystemZICMP::SignedOnly;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+// Return true if it is better to swap comparison operands Op0 and Op1.
+// ICmpType is the type of an integer comparison.
+static bool shouldSwapCmpOperands(SDValue Op0, SDValue Op1,
+                                  unsigned ICmpType) {
+  // Leave f128 comparisons alone, since they have no memory forms.
+  if (Op0.getValueType() == MVT::f128)
     return false;
 
-  if (CmpOp1.getOpcode() == ISD::Constant) {
-    uint64_t Value = cast<ConstantSDNode>(CmpOp1)->getSExtValue();
+  // Always keep a floating-point constant second, since comparisons with
+  // zero can use LOAD TEST and comparisons with other constants make a
+  // natural memory operand.
+  if (isa<ConstantFPSDNode>(Op1))
+    return false;
 
-    // If we're comparing with memory, prefer unsigned comparisons for
-    // values that are in the unsigned 16-bit range but not the signed
-    // 16-bit range.  We want to use CLFHSI and CLGHSI.
-    if (CmpOp0.hasOneUse() &&
-        ISD::isNormalLoad(CmpOp0.getNode()) &&
-        (Value >= 32768 && Value < 65536))
-      return true;
+  // Never swap comparisons with zero since there are many ways to optimize
+  // those later.
+  ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1);
+  if (COp1 && COp1->getZExtValue() == 0)
+    return false;
 
-    // Use unsigned comparisons for values that are in the CLGFI range
-    // but not in the CGFI range.
-    if (CmpOp0.getValueType() == MVT::i64 && (Value >> 31) == 1)
+  // Look for cases where Cmp0 is a single-use load and Cmp1 isn't.
+  // In that case we generally prefer the memory to be second.
+  if ((isNaturalMemoryOperand(Op0, ICmpType) && Op0.hasOneUse()) &&
+      !(isNaturalMemoryOperand(Op1, ICmpType) && Op1.hasOneUse())) {
+    // The only exceptions are when the second operand is a constant and
+    // we can use things like CHHSI.
+    if (!COp1)
       return true;
+    // The unsigned memory-immediate instructions can handle 16-bit
+    // unsigned integers.
+    if (ICmpType != SystemZICMP::SignedOnly &&
+        isUInt<16>(COp1->getZExtValue()))
+      return false;
+    // The signed memory-immediate instructions can handle 16-bit
+    // signed integers.
+    if (ICmpType != SystemZICMP::UnsignedOnly &&
+        isInt<16>(COp1->getSExtValue()))
+      return false;
+    return true;
+  }
+  return false;
+}
+
+// Return true if shift operation N has an in-range constant shift value.
+// Store it in ShiftVal if so.
+static bool isSimpleShift(SDValue N, unsigned &ShiftVal) {
+  ConstantSDNode *Shift = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!Shift)
+    return false;
 
+  uint64_t Amount = Shift->getZExtValue();
+  if (Amount >= N.getValueType().getSizeInBits())
     return false;
+
+  ShiftVal = Amount;
+  return true;
+}
+
+// Check whether an AND with Mask is suitable for a TEST UNDER MASK
+// instruction and whether the CC value is descriptive enough to handle
+// a comparison of type Opcode between the AND result and CmpVal.
+// CCMask says which comparison result is being tested and BitSize is
+// the number of bits in the operands.  If TEST UNDER MASK can be used,
+// return the corresponding CC mask, otherwise return 0.
+static unsigned getTestUnderMaskCond(unsigned BitSize, unsigned CCMask,
+                                     uint64_t Mask, uint64_t CmpVal,
+                                     unsigned ICmpType) {
+  assert(Mask != 0 && "ANDs with zero should have been removed by now");
+
+  // Check whether the mask is suitable for TMHH, TMHL, TMLH or TMLL.
+  if (!SystemZ::isImmLL(Mask) && !SystemZ::isImmLH(Mask) &&
+      !SystemZ::isImmHL(Mask) && !SystemZ::isImmHH(Mask))
+    return 0;
+
+  // Work out the masks for the lowest and highest bits.
+  unsigned HighShift = 63 - countLeadingZeros(Mask);
+  uint64_t High = uint64_t(1) << HighShift;
+  uint64_t Low = uint64_t(1) << countTrailingZeros(Mask);
+
+  // Signed ordered comparisons are effectively unsigned if the sign
+  // bit is dropped.
+  bool EffectivelyUnsigned = (ICmpType != SystemZICMP::SignedOnly);
+
+  // Check for equality comparisons with 0, or the equivalent.
+  if (CmpVal == 0) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal <= Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_SOME_1;
+  }
+  if (EffectivelyUnsigned && CmpVal < Low) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_ALL_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_SOME_1;
   }
 
-  // Prefer CL for zero-extended loads.
-  if (CmpOp1.getOpcode() == ISD::ZERO_EXTEND ||
-      ISD::isZEXTLoad(CmpOp1.getNode()))
-    return true;
+  // Check for equality comparisons with the mask, or the equivalent.
+  if (CmpVal == Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal >= Mask - Low && CmpVal < Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - Low && CmpVal <= Mask) {
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_ALL_1;
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_SOME_0;
+  }
 
-  // ...and for "in-register" zero extensions.
-  if (CmpOp1.getOpcode() == ISD::AND && CmpOp1.getValueType() == MVT::i64) {
-    SDValue Mask = CmpOp1.getOperand(1);
-    if (Mask.getOpcode() == ISD::Constant &&
-        cast<ConstantSDNode>(Mask)->getZExtValue() == 0xffffffff)
-      return true;
+  // Check for ordered comparisons with the top bit.
+  if (EffectivelyUnsigned && CmpVal >= Mask - High && CmpVal < High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LE)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GT)
+      return SystemZ::CCMASK_TM_MSB_1;
+  }
+  if (EffectivelyUnsigned && CmpVal > Mask - High && CmpVal <= High) {
+    if (CCMask == SystemZ::CCMASK_CMP_LT)
+      return SystemZ::CCMASK_TM_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_GE)
+      return SystemZ::CCMASK_TM_MSB_1;
   }
 
-  return false;
+  // If there are just two bits, we can do equality checks for Low and High
+  // as well.
+  if (Mask == Low + High) {
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == Low)
+      return SystemZ::CCMASK_TM_MIXED_MSB_0 ^ SystemZ::CCMASK_ANY;
+    if (CCMask == SystemZ::CCMASK_CMP_EQ && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1;
+    if (CCMask == SystemZ::CCMASK_CMP_NE && CmpVal == High)
+      return SystemZ::CCMASK_TM_MIXED_MSB_1 ^ SystemZ::CCMASK_ANY;
+  }
+
+  // Looks like we've exhausted our options.
+  return 0;
+}
+
+// See whether the comparison (Opcode CmpOp0, CmpOp1, ICmpType) can be
+// implemented as a TEST UNDER MASK instruction when the condition being
+// tested is as described by CCValid and CCMask.  Update the arguments
+// with the TM version if so.
+static void adjustForTestUnderMask(SelectionDAG &DAG, unsigned &Opcode,
+                                   SDValue &CmpOp0, SDValue &CmpOp1,
+                                   unsigned &CCValid, unsigned &CCMask,
+                                   unsigned &ICmpType) {
+  // Check that we have a comparison with a constant.
+  ConstantSDNode *ConstCmpOp1 = dyn_cast<ConstantSDNode>(CmpOp1);
+  if (!ConstCmpOp1)
+    return;
+  uint64_t CmpVal = ConstCmpOp1->getZExtValue();
+
+  // Check whether the nonconstant input is an AND with a constant mask.
+  if (CmpOp0.getOpcode() != ISD::AND)
+    return;
+  SDValue AndOp0 = CmpOp0.getOperand(0);
+  SDValue AndOp1 = CmpOp0.getOperand(1);
+  ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(AndOp1.getNode());
+  if (!Mask)
+    return;
+  uint64_t MaskVal = Mask->getZExtValue();
+
+  // Check whether the combination of mask, comparison value and comparison
+  // type are suitable.
+  unsigned BitSize = CmpOp0.getValueType().getSizeInBits();
+  unsigned NewCCMask, ShiftVal;
+  if (ICmpType != SystemZICMP::SignedOnly &&
+      AndOp0.getOpcode() == ISD::SHL &&
+      isSimpleShift(AndOp0, ShiftVal) &&
+      (NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal >> ShiftVal,
+                                        CmpVal >> ShiftVal,
+                                        SystemZICMP::Any))) {
+    AndOp0 = AndOp0.getOperand(0);
+    AndOp1 = DAG.getConstant(MaskVal >> ShiftVal, AndOp0.getValueType());
+  } else if (ICmpType != SystemZICMP::SignedOnly &&
+             AndOp0.getOpcode() == ISD::SRL &&
+             isSimpleShift(AndOp0, ShiftVal) &&
+             (NewCCMask = getTestUnderMaskCond(BitSize, CCMask,
+                                               MaskVal << ShiftVal,
+                                               CmpVal << ShiftVal,
+                                               SystemZICMP::UnsignedOnly))) {
+    AndOp0 = AndOp0.getOperand(0);
+    AndOp1 = DAG.getConstant(MaskVal << ShiftVal, AndOp0.getValueType());
+  } else {
+    NewCCMask = getTestUnderMaskCond(BitSize, CCMask, MaskVal, CmpVal,
+                                     ICmpType);
+    if (!NewCCMask)
+      return;
+  }
+
+  // Go ahead and make the change.
+  Opcode = SystemZISD::TM;
+  CmpOp0 = AndOp0;
+  CmpOp1 = AndOp1;
+  ICmpType = (bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_0) !=
+              bool(NewCCMask & SystemZ::CCMASK_TM_MIXED_MSB_1));
+  CCValid = SystemZ::CCMASK_TM;
+  CCMask = NewCCMask;
 }
 
-// Return a target node that compares CmpOp0 and CmpOp1.  Set CCMask to the
-// 4-bit condition-code mask for CC.
-static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
-                       ISD::CondCode CC, unsigned &CCMask) {
+// Return a target node that compares CmpOp0 with CmpOp1 and stores a
+// 2-bit result in CC.  Set CCValid to the CCMASK_* of all possible
+// 2-bit results and CCMask to the subset of those results that are
+// associated with Cond.
+static SDValue emitCmp(const SystemZTargetMachine &TM, SelectionDAG &DAG,
+                       SDLoc DL, SDValue CmpOp0, SDValue CmpOp1,
+                       ISD::CondCode Cond, unsigned &CCValid,
+                       unsigned &CCMask) {
   bool IsUnsigned = false;
-  CCMask = CCMaskForCondCode(CC);
-  if (!CmpOp0.getValueType().isFloatingPoint()) {
+  CCMask = CCMaskForCondCode(Cond);
+  unsigned Opcode, ICmpType = 0;
+  if (CmpOp0.getValueType().isFloatingPoint()) {
+    CCValid = SystemZ::CCMASK_FCMP;
+    Opcode = SystemZISD::FCMP;
+  } else {
     IsUnsigned = CCMask & SystemZ::CCMASK_CMP_UO;
-    CCMask &= ~SystemZ::CCMASK_CMP_UO;
+    CCValid = SystemZ::CCMASK_ICMP;
+    CCMask &= CCValid;
+    adjustZeroCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
     adjustSubwordCmp(DAG, IsUnsigned, CmpOp0, CmpOp1, CCMask);
-    if (preferUnsignedComparison(DAG, CmpOp0, CmpOp1, CCMask))
-      IsUnsigned = true;
+    Opcode = SystemZISD::ICMP;
+    // Choose the type of comparison.  Equality and inequality tests can
+    // use either signed or unsigned comparisons.  The choice also doesn't
+    // matter if both sign bits are known to be clear.  In those cases we
+    // want to give the main isel code the freedom to choose whichever
+    // form fits best.
+    if (CCMask == SystemZ::CCMASK_CMP_EQ ||
+        CCMask == SystemZ::CCMASK_CMP_NE ||
+        (DAG.SignBitIsZero(CmpOp0) && DAG.SignBitIsZero(CmpOp1)))
+      ICmpType = SystemZICMP::Any;
+    else if (IsUnsigned)
+      ICmpType = SystemZICMP::UnsignedOnly;
+    else
+      ICmpType = SystemZICMP::SignedOnly;
   }
 
-  DebugLoc DL = CmpOp0.getDebugLoc();
-  return DAG.getNode((IsUnsigned ? SystemZISD::UCMP : SystemZISD::CMP),
-                     DL, MVT::Glue, CmpOp0, CmpOp1);
+  if (shouldSwapCmpOperands(CmpOp0, CmpOp1, ICmpType)) {
+    std::swap(CmpOp0, CmpOp1);
+    CCMask = ((CCMask & SystemZ::CCMASK_CMP_EQ) |
+              (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
+              (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+              (CCMask & SystemZ::CCMASK_CMP_UO));
+  }
+
+  adjustForTestUnderMask(DAG, Opcode, CmpOp0, CmpOp1, CCValid, CCMask,
+                         ICmpType);
+  if (Opcode == SystemZISD::ICMP || Opcode == SystemZISD::TM)
+    return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1,
+                       DAG.getConstant(ICmpType, MVT::i32));
+  return DAG.getNode(Opcode, DL, MVT::Glue, CmpOp0, CmpOp1);
+}
+
+// Implement a 32-bit *MUL_LOHI operation by extending both operands to
+// 64 bits.  Extend is the extension type to use.  Store the high part
+// in Hi and the low part in Lo.
+static void lowerMUL_LOHI32(SelectionDAG &DAG, SDLoc DL,
+                            unsigned Extend, SDValue Op0, SDValue Op1,
+                            SDValue &Hi, SDValue &Lo) {
+  Op0 = DAG.getNode(Extend, DL, MVT::i64, Op0);
+  Op1 = DAG.getNode(Extend, DL, MVT::i64, Op1);
+  SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, Op0, Op1);
+  Hi = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, DAG.getConstant(32, MVT::i64));
+  Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Hi);
+  Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
 }
 
 // Lower a binary operation that produces two VT results, one in each
@@ -959,7 +1474,7 @@ static SDValue emitCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
 // Extend extends Op0 to a GR128, and Opcode performs the GR128 operation
 // on the extended Op0 and (unextended) Op1.  Store the even register result
 // in Even and the odd register result in Odd.
-static void lowerGR128Binary(SelectionDAG &DAG, DebugLoc DL, EVT VT,
+static void lowerGR128Binary(SelectionDAG &DAG, SDLoc DL, EVT VT,
                              unsigned Extend, unsigned Opcode,
                              SDValue Op0, SDValue Op1,
                              SDValue &Even, SDValue &Odd) {
@@ -967,14 +1482,38 @@ static void lowerGR128Binary(SelectionDAG &DAG, DebugLoc DL, EVT VT,
   SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped,
                                SDValue(In128, 0), Op1);
   bool Is32Bit = is32Bit(VT);
-  SDValue SubReg0 = DAG.getTargetConstant(SystemZ::even128(Is32Bit), VT);
-  SDValue SubReg1 = DAG.getTargetConstant(SystemZ::odd128(Is32Bit), VT);
-  SDNode *Reg0 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                    VT, Result, SubReg0);
-  SDNode *Reg1 = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                    VT, Result, SubReg1);
-  Even = SDValue(Reg0, 0);
-  Odd = SDValue(Reg1, 0);
+  Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result);
+  Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result);
+}
+
+SDValue SystemZTargetLowering::lowerSETCC(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDValue CmpOp0   = Op.getOperand(0);
+  SDValue CmpOp1   = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDLoc DL(Op);
+
+  unsigned CCValid, CCMask;
+  SDValue Glue = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
+
+  IPMConversion Conversion = getIPMConversion(CCValid, CCMask);
+  SDValue Result = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+
+  if (Conversion.XORValue)
+    Result = DAG.getNode(ISD::XOR, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.XORValue, MVT::i32));
+
+  if (Conversion.AddValue)
+    Result = DAG.getNode(ISD::ADD, DL, MVT::i32, Result,
+                         DAG.getConstant(Conversion.AddValue, MVT::i32));
+
+  // The SHR/AND sequence should get optimized to an RISBG.
+  Result = DAG.getNode(ISD::SRL, DL, MVT::i32, Result,
+                       DAG.getConstant(Conversion.Bit, MVT::i32));
+  if (Conversion.Bit != 31)
+    Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+                         DAG.getConstant(1, MVT::i32));
+  return Result;
 }
 
 SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
@@ -983,12 +1522,13 @@ SDValue SystemZTargetLowering::lowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue CmpOp0   = Op.getOperand(2);
   SDValue CmpOp1   = Op.getOperand(3);
   SDValue Dest     = Op.getOperand(4);
-  DebugLoc DL      = Op.getDebugLoc();
+  SDLoc DL(Op);
 
-  unsigned CCMask;
-  SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCMask);
+  unsigned CCValid, CCMask;
+  SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
   return DAG.getNode(SystemZISD::BR_CCMASK, DL, Op.getValueType(),
-                     Chain, DAG.getConstant(CCMask, MVT::i32), Dest, Flags);
+                     Chain, DAG.getConstant(CCValid, MVT::i32),
+                     DAG.getConstant(CCMask, MVT::i32), Dest, Flags);
 }
 
 SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
@@ -998,14 +1538,15 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
   SDValue TrueOp   = Op.getOperand(2);
   SDValue FalseOp  = Op.getOperand(3);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-  DebugLoc DL      = Op.getDebugLoc();
+  SDLoc DL(Op);
 
-  unsigned CCMask;
-  SDValue Flags = emitCmp(DAG, CmpOp0, CmpOp1, CC, CCMask);
+  unsigned CCValid, CCMask;
+  SDValue Flags = emitCmp(TM, DAG, DL, CmpOp0, CmpOp1, CC, CCValid, CCMask);
 
-  SmallVector<SDValue, 4> Ops;
+  SmallVector<SDValue, 5> Ops;
   Ops.push_back(TrueOp);
   Ops.push_back(FalseOp);
+  Ops.push_back(DAG.getConstant(CCValid, MVT::i32));
   Ops.push_back(DAG.getConstant(CCMask, MVT::i32));
   Ops.push_back(Flags);
 
@@ -1015,7 +1556,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
 
 SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
                                                   SelectionDAG &DAG) const {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   const GlobalValue *GV = Node->getGlobal();
   int64_t Offset = Node->getOffset();
   EVT PtrVT = getPointerTy();
@@ -1024,18 +1565,18 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
 
   SDValue Result;
   if (Subtarget.isPC32DBLSymbol(GV, RM, CM)) {
-    // Make sure that the offset is aligned to a halfword.  If it isn't,
-    // create an "anchor" at the previous 12-bit boundary.
-    // FIXME check whether there is a better way of handling this.
-    if (Offset & 1) {
-      Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT,
-                                          Offset & ~uint64_t(0xfff));
-      Offset &= 0xfff;
-    } else {
-      Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Offset);
+    // Assign anchors at 1<<12 byte boundaries.
+    uint64_t Anchor = Offset & ~uint64_t(0xfff);
+    Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor);
+    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
+
+    // The offset can be folded into the address if it is aligned to a halfword.
+    Offset -= Anchor;
+    if (Offset != 0 && (Offset & 1) == 0) {
+      SDValue Full = DAG.getTargetGlobalAddress(GV, DL, PtrVT, Anchor + Offset);
+      Result = DAG.getNode(SystemZISD::PCREL_OFFSET, DL, PtrVT, Full, Result);
       Offset = 0;
     }
-    Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
   } else {
     Result = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, SystemZII::MO_GOT);
     Result = DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
@@ -1054,7 +1595,7 @@ SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
 
 SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
 						     SelectionDAG &DAG) const {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   const GlobalValue *GV = Node->getGlobal();
   EVT PtrVT = getPointerTy();
   TLSModel::Model model = TM.getTLSModel(GV);
@@ -1093,7 +1634,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
 
 SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
                                                  SelectionDAG &DAG) const {
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   const BlockAddress *BA = Node->getBlockAddress();
   int64_t Offset = Node->getOffset();
   EVT PtrVT = getPointerTy();
@@ -1105,7 +1646,7 @@ SDValue SystemZTargetLowering::lowerBlockAddress(BlockAddressSDNode *Node,
 
 SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
                                               SelectionDAG &DAG) const {
-  DebugLoc DL = JT->getDebugLoc();
+  SDLoc DL(JT);
   EVT PtrVT = getPointerTy();
   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
 
@@ -1115,7 +1656,7 @@ SDValue SystemZTargetLowering::lowerJumpTable(JumpTableSDNode *JT,
 
 SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
                                                  SelectionDAG &DAG) const {
-  DebugLoc DL = CP->getDebugLoc();
+  SDLoc DL(CP);
   EVT PtrVT = getPointerTy();
 
   SDValue Result;
@@ -1132,29 +1673,38 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
 
 SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
                                             SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue In = Op.getOperand(0);
   EVT InVT = In.getValueType();
   EVT ResVT = Op.getValueType();
 
-  SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
-  SDValue Shift32 = DAG.getConstant(32, MVT::i64);
   if (InVT == MVT::i32 && ResVT == MVT::f32) {
-    SDValue In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
-    SDValue Shift = DAG.getNode(ISD::SHL, DL, MVT::i64, In64, Shift32);
-    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Shift);
-    SDNode *Out = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                     MVT::f32, Out64, SubReg32);
-    return SDValue(Out, 0);
+    SDValue In64;
+    if (Subtarget.hasHighWord()) {
+      SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
+                                       MVT::i64);
+      In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+                                       MVT::i64, SDValue(U64, 0), In);
+    } else {
+      In64 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, In);
+      In64 = DAG.getNode(ISD::SHL, DL, MVT::i64, In64,
+                         DAG.getConstant(32, MVT::i64));
+    }
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::f64, In64);
+    return DAG.getTargetExtractSubreg(SystemZ::subreg_h32,
+                                      DL, MVT::f32, Out64);
   }
   if (InVT == MVT::f32 && ResVT == MVT::i32) {
     SDNode *U64 = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::f64);
-    SDNode *In64 = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
-                                      MVT::f64, SDValue(U64, 0), In, SubReg32);
-    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, SDValue(In64, 0));
-    SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64, Shift32);
-    SDValue Out = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
-    return Out;
+    SDValue In64 = DAG.getTargetInsertSubreg(SystemZ::subreg_h32, DL,
+                                             MVT::f64, SDValue(U64, 0), In);
+    SDValue Out64 = DAG.getNode(ISD::BITCAST, DL, MVT::i64, In64);
+    if (Subtarget.hasHighWord())
+      return DAG.getTargetExtractSubreg(SystemZ::subreg_h32, DL,
+                                        MVT::i32, Out64);
+    SDValue Shift = DAG.getNode(ISD::SRL, DL, MVT::i64, Out64,
+                                DAG.getConstant(32, MVT::i64));
+    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Shift);
   }
   llvm_unreachable("Unexpected bitcast combination");
 }
@@ -1169,7 +1719,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
   SDValue Chain   = Op.getOperand(0);
   SDValue Addr    = Op.getOperand(1);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  DebugLoc DL     = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // The initial values of each field.
   const unsigned NumFields = 4;
@@ -1203,7 +1753,7 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
   SDValue SrcPtr     = Op.getOperand(2);
   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-  DebugLoc DL        = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32),
                        /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
@@ -1214,7 +1764,7 @@ SDValue SystemZTargetLowering::
 lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Size  = Op.getOperand(1);
-  DebugLoc DL   = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   unsigned SPReg = getStackPointerRegisterToSaveRestore();
 
@@ -1237,18 +1787,64 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
-SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
                                               SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
-  assert(!is32Bit(VT) && "Only support 64-bit UMUL_LOHI");
+  SDLoc DL(Op);
+  SDValue Ops[2];
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else {
+    // Do a full 128-bit multiplication based on UMUL_LOHI64:
+    //
+    //   (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64)
+    //
+    // but using the fact that the upper halves are either all zeros
+    // or all ones:
+    //
+    //   (ll * rl) - ((lh & rl) << 64) - ((ll & rh) << 64)
+    //
+    // and grouping the right terms together since they are quicker than the
+    // multiplication:
+    //
+    //   (ll * rl) - (((lh & rl) + (ll & rh)) << 64)
+    SDValue C63 = DAG.getConstant(63, MVT::i64);
+    SDValue LL = Op.getOperand(0);
+    SDValue RL = Op.getOperand(1);
+    SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63);
+    SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63);
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  SMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     LL, RL, Ops[1], Ops[0]);
+    SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH);
+    SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL);
+    SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
+    Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
+  }
+  return DAG.getMergeValues(Ops, 2, DL);
+}
 
-  // UMUL_LOHI64 returns the low result in the odd register and the high
-  // result in the even register.  UMUL_LOHI is defined to return the
-  // low half first, so the results are in reverse order.
+SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
   SDValue Ops[2];
-  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
-                   Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
+  if (is32Bit(VT))
+    // Just do a normal 64-bit multiplication and extract the results.
+    // We define this so that it can be used for constant division.
+    lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0),
+                    Op.getOperand(1), Ops[1], Ops[0]);
+  else
+    // UMUL_LOHI64 returns the low result in the odd register and the high
+    // result in the even register.  UMUL_LOHI is defined to return the
+    // low half first, so the results are in reverse order.
+    lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
+                     Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
   return DAG.getMergeValues(Ops, 2, DL);
 }
 
@@ -1257,19 +1853,24 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
+  unsigned Opcode;
 
   // We use DSGF for 32-bit division.
   if (is32Bit(VT)) {
     Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0);
-    Op1 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op1);
-  }
+    Opcode = SystemZISD::SDIVREM32;
+  } else if (DAG.ComputeNumSignBits(Op1) > 32) {
+    Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1);
+    Opcode = SystemZISD::SDIVREM32;
+  } else    
+    Opcode = SystemZISD::SDIVREM64;
 
   // DSG(F) takes a 64-bit dividend, so the even register in the GR128
   // input is "don't care".  The instruction returns the remainder in
   // the even register and the quotient in the odd register.
   SDValue Ops[2];
-  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::SDIVREM64,
+  lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode,
                    Op0, Op1, Ops[1], Ops[0]);
   return DAG.getMergeValues(Ops, 2, DL);
 }
@@ -1277,7 +1878,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
 SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
                                             SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // DL(G) uses a double-width dividend, so we need to clear the even
   // register in the GR128 input.  The instruction returns the remainder
@@ -1332,22 +1933,20 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
   // high 32 bits and just masks out low bits.  We can skip it if so.
   if (HighOp.getOpcode() == ISD::AND &&
       HighOp.getOperand(1).getOpcode() == ISD::Constant) {
-    ConstantSDNode *MaskNode = cast<ConstantSDNode>(HighOp.getOperand(1));
-    uint64_t Mask = MaskNode->getZExtValue() | Masks[High];
-    if ((Mask >> 32) == 0xffffffff)
-      HighOp = HighOp.getOperand(0);
+    SDValue HighOp0 = HighOp.getOperand(0);
+    uint64_t Mask = cast<ConstantSDNode>(HighOp.getOperand(1))->getZExtValue();
+    if (DAG.MaskedValueIsZero(HighOp0, APInt(64, ~(Mask | 0xffffffff))))
+      HighOp = HighOp0;
   }
 
   // Take advantage of the fact that all GR32 operations only change the
   // low 32 bits by truncating Low to an i32 and inserting it directly
   // using a subreg.  The interesting cases are those where the truncation
   // can be folded.
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue Low32 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, LowOp);
-  SDValue SubReg32 = DAG.getTargetConstant(SystemZ::subreg_32bit, MVT::i64);
-  SDNode *Result = DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
-                                      MVT::i64, HighOp, Low32, SubReg32);
-  return SDValue(Result, 0);
+  return DAG.getTargetInsertSubreg(SystemZ::subreg_l32, DL,
+                                   MVT::i64, HighOp, Low32);
 }
 
 // Op is an 8-, 16-bit or 32-bit ATOMIC_LOAD_* operation.  Lower the first
@@ -1368,7 +1967,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD(SDValue Op,
   SDValue Addr = Node->getBasePtr();
   SDValue Src2 = Node->getVal();
   MachineMemOperand *MMO = Node->getMemOperand();
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   EVT PtrVT = Addr.getValueType();
 
   // Convert atomic subtracts of constants into additions.
@@ -1442,7 +2041,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
   SDValue CmpVal = Node->getOperand(2);
   SDValue SwapVal = Node->getOperand(3);
   MachineMemOperand *MMO = Node->getMemOperand();
-  DebugLoc DL = Node->getDebugLoc();
+  SDLoc DL(Node);
   EVT PtrVT = Addr.getValueType();
 
   // Get the address of the containing word.
@@ -1474,7 +2073,7 @@ SDValue SystemZTargetLowering::lowerSTACKSAVE(SDValue Op,
                                               SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
-  return DAG.getCopyFromReg(Op.getOperand(0), Op.getDebugLoc(),
+  return DAG.getCopyFromReg(Op.getOperand(0), SDLoc(Op),
                             SystemZ::R15D, Op.getValueType());
 }
 
@@ -1482,10 +2081,30 @@ SDValue SystemZTargetLowering::lowerSTACKRESTORE(SDValue Op,
                                                  SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MF.getInfo<SystemZMachineFunctionInfo>()->setManipulatesSP(true);
-  return DAG.getCopyToReg(Op.getOperand(0), Op.getDebugLoc(),
+  return DAG.getCopyToReg(Op.getOperand(0), SDLoc(Op),
                           SystemZ::R15D, Op.getOperand(1));
 }
 
+SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  bool IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+  if (!IsData)
+    // Just preserve the chain.
+    return Op.getOperand(0);
+
+  bool IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Code = IsWrite ? SystemZ::PFD_WRITE : SystemZ::PFD_READ;
+  MemIntrinsicSDNode *Node = cast<MemIntrinsicSDNode>(Op.getNode());
+  SDValue Ops[] = {
+    Op.getOperand(0),
+    DAG.getConstant(Code, MVT::i32),
+    Op.getOperand(1)
+  };
+  return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, SDLoc(Op),
+                                 Node->getVTList(), Ops, array_lengthof(Ops),
+                                 Node->getMemoryVT(), Node->getMemOperand());
+}
+
 SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
@@ -1493,6 +2112,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerBR_CC(Op, DAG);
   case ISD::SELECT_CC:
     return lowerSELECT_CC(Op, DAG);
+  case ISD::SETCC:
+    return lowerSETCC(Op, DAG);
   case ISD::GlobalAddress:
     return lowerGlobalAddress(cast<GlobalAddressSDNode>(Op), DAG);
   case ISD::GlobalTLSAddress:
@@ -1511,6 +2132,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerVACOPY(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return lowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::SMUL_LOHI:
+    return lowerSMUL_LOHI(Op, DAG);
   case ISD::UMUL_LOHI:
     return lowerUMUL_LOHI(Op, DAG);
   case ISD::SDIVREM:
@@ -1547,6 +2170,8 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
     return lowerSTACKSAVE(Op, DAG);
   case ISD::STACKRESTORE:
     return lowerSTACKRESTORE(Op, DAG);
+  case ISD::PREFETCH:
+    return lowerPREFETCH(Op, DAG);
   default:
     llvm_unreachable("Unexpected node to lower");
   }
@@ -1557,9 +2182,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
     OPCODE(RET_FLAG);
     OPCODE(CALL);
+    OPCODE(SIBCALL);
     OPCODE(PCREL_WRAPPER);
-    OPCODE(CMP);
-    OPCODE(UCMP);
+    OPCODE(PCREL_OFFSET);
+    OPCODE(ICMP);
+    OPCODE(FCMP);
+    OPCODE(TM);
     OPCODE(BR_CCMASK);
     OPCODE(SELECT_CCMASK);
     OPCODE(ADJDYNALLOC);
@@ -1568,6 +2196,20 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(SDIVREM64);
     OPCODE(UDIVREM32);
     OPCODE(UDIVREM64);
+    OPCODE(MVC);
+    OPCODE(MVC_LOOP);
+    OPCODE(NC);
+    OPCODE(NC_LOOP);
+    OPCODE(OC);
+    OPCODE(OC_LOOP);
+    OPCODE(XC);
+    OPCODE(XC_LOOP);
+    OPCODE(CLC);
+    OPCODE(CLC_LOOP);
+    OPCODE(STRCMP);
+    OPCODE(STPCPY);
+    OPCODE(SEARCH_STRING);
+    OPCODE(IPM);
     OPCODE(ATOMIC_SWAPW);
     OPCODE(ATOMIC_LOADW_ADD);
     OPCODE(ATOMIC_LOADW_SUB);
@@ -1580,6 +2222,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(ATOMIC_LOADW_UMIN);
     OPCODE(ATOMIC_LOADW_UMAX);
     OPCODE(ATOMIC_CMP_SWAPW);
+    OPCODE(PREFETCH);
   }
   return NULL;
 #undef OPCODE
@@ -1609,6 +2252,31 @@ static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
   return NewMBB;
 }
 
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
+                                           MachineBasicBlock *MBB) {
+  MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+  NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+  NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  return NewMBB;
+}
+
+// Force base value Base into a register before MI.  Return the register.
+static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
+                         const SystemZInstrInfo *TII) {
+  if (Base.isReg())
+    return Base.getReg();
+
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+  BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg)
+    .addOperand(Base).addImm(0).addReg(0);
+  return Reg;
+}
+
 // Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
 MachineBasicBlock *
 SystemZTargetLowering::emitSelect(MachineInstr *MI,
@@ -1618,21 +2286,20 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
   unsigned DestReg  = MI->getOperand(0).getReg();
   unsigned TrueReg  = MI->getOperand(1).getReg();
   unsigned FalseReg = MI->getOperand(2).getReg();
-  unsigned CCMask   = MI->getOperand(3).getImm();
+  unsigned CCValid  = MI->getOperand(3).getImm();
+  unsigned CCMask   = MI->getOperand(4).getImm();
   DebugLoc DL       = MI->getDebugLoc();
 
   MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *JoinMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
   MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
 
   //  StartMBB:
-  //   ...
-  //   TrueVal = ...
-  //   cmpTY ccX, r1, r2
-  //   jCC JoinMBB
+  //   BRC CCMask, JoinMBB
   //   # fallthrough to FalseMBB
   MBB = StartMBB;
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(CCMask).addMBB(JoinMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
   MBB->addSuccessor(JoinMBB);
   MBB->addSuccessor(FalseMBB);
 
@@ -1645,7 +2312,7 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
   //   %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
   //  ...
   MBB = JoinMBB;
-  BuildMI(*MBB, MBB->begin(), DL, TII->get(SystemZ::PHI), DestReg)
+  BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
     .addReg(TrueReg).addMBB(StartMBB)
     .addReg(FalseReg).addMBB(FalseMBB);
 
@@ -1653,6 +2320,69 @@ SystemZTargetLowering::emitSelect(MachineInstr *MI,
   return JoinMBB;
 }
 
+// Implement EmitInstrWithCustomInserter for pseudo CondStore* instruction MI.
+// StoreOpcode is the store to use and Invert says whether the store should
+// happen when the condition is false rather than true.  If a STORE ON
+// CONDITION is available, STOCOpcode is its opcode, otherwise it is 0.
+MachineBasicBlock *
+SystemZTargetLowering::emitCondStore(MachineInstr *MI,
+                                     MachineBasicBlock *MBB,
+                                     unsigned StoreOpcode, unsigned STOCOpcode,
+                                     bool Invert) const {
+  const SystemZInstrInfo *TII = TM.getInstrInfo();
+
+  unsigned SrcReg     = MI->getOperand(0).getReg();
+  MachineOperand Base = MI->getOperand(1);
+  int64_t Disp        = MI->getOperand(2).getImm();
+  unsigned IndexReg   = MI->getOperand(3).getReg();
+  unsigned CCValid    = MI->getOperand(4).getImm();
+  unsigned CCMask     = MI->getOperand(5).getImm();
+  DebugLoc DL         = MI->getDebugLoc();
+
+  StoreOpcode = TII->getOpcodeForOffset(StoreOpcode, Disp);
+
+  // Use STOCOpcode if possible.  We could use different store patterns in
+  // order to avoid matching the index register, but the performance trade-offs
+  // might be more complicated in that case.
+  if (STOCOpcode && !IndexReg && TM.getSubtargetImpl()->hasLoadStoreOnCond()) {
+    if (Invert)
+      CCMask ^= CCValid;
+    BuildMI(*MBB, MI, DL, TII->get(STOCOpcode))
+      .addReg(SrcReg).addOperand(Base).addImm(Disp)
+      .addImm(CCValid).addImm(CCMask);
+    MI->eraseFromParent();
+    return MBB;
+  }
+
+  // Get the condition needed to branch around the store.
+  if (!Invert)
+    CCMask ^= CCValid;
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *JoinMBB  = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   BRC CCMask, JoinMBB
+  //   # fallthrough to FalseMBB
+  MBB = StartMBB;
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(JoinMBB);
+  MBB->addSuccessor(JoinMBB);
+  MBB->addSuccessor(FalseMBB);
+
+  //  FalseMBB:
+  //   store %SrcReg, %Disp(%Index,%Base)
+  //   # fallthrough to JoinMBB
+  MBB = FalseMBB;
+  BuildMI(MBB, DL, TII->get(StoreOpcode))
+    .addReg(SrcReg).addOperand(Base).addImm(Disp).addReg(IndexReg);
+  MBB->addSuccessor(JoinMBB);
+
+  MI->eraseFromParent();
+  return JoinMBB;
+}
+
 // Implement EmitInstrWithCustomInserter for pseudo ATOMIC_LOAD{,W}_*
 // or ATOMIC_SWAP{,W} instruction MI.  BinOpcode is the instruction that
 // performs the binary operation elided by "*", or 0 for ATOMIC_SWAP{,W}.
@@ -1669,7 +2399,6 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
   const SystemZInstrInfo *TII = TM.getInstrInfo();
   MachineFunction &MF = *MBB->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned MaskNE = CCMaskForCondCode(ISD::SETNE);
   bool IsSubWord = (BitSize < 32);
 
   // Extract the operands.  Base can be a register or a frame index.
@@ -1706,7 +2435,7 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
 
   // Insert a basic block for the main loop.
   MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *DoneMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
   MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
 
   //  StartMBB:
@@ -1740,11 +2469,11 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
       .addReg(RotatedOldVal).addOperand(Src2);
     if (BitSize < 32)
       // XILF with the upper BitSize bits set.
-      BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
         .addReg(Tmp).addImm(uint32_t(~0 << (32 - BitSize)));
     else if (BitSize == 32)
       // XILF with every bit set.
-      BuildMI(MBB, DL, TII->get(SystemZ::XILF32), RotatedNewVal)
+      BuildMI(MBB, DL, TII->get(SystemZ::XILF), RotatedNewVal)
         .addReg(Tmp).addImm(~uint32_t(0));
     else {
       // Use LCGR and add -1 to the result, which is more compact than
@@ -1769,7 +2498,8 @@ SystemZTargetLowering::emitAtomicLoadBinary(MachineInstr *MI,
       .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
   BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
     .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
   MBB->addSuccessor(LoopMBB);
   MBB->addSuccessor(DoneMBB);
 
@@ -1792,7 +2522,6 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
   const SystemZInstrInfo *TII = TM.getInstrInfo();
   MachineFunction &MF = *MBB->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned MaskNE = CCMaskForCondCode(ISD::SETNE);
   bool IsSubWord = (BitSize < 32);
 
   // Extract the operands.  Base can be a register or a frame index.
@@ -1828,7 +2557,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
 
   // Insert 3 basic blocks for the loop.
   MachineBasicBlock *StartMBB  = MBB;
-  MachineBasicBlock *DoneMBB   = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB   = splitBlockBefore(MI, MBB);
   MachineBasicBlock *LoopMBB   = emitBlockAfter(StartMBB);
   MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
   MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
@@ -1846,7 +2575,7 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
   //   %OldVal        = phi [ %OrigVal, StartMBB ], [ %Dest, UpdateMBB ]
   //   %RotatedOldVal = RLL %OldVal, 0(%BitShift)
   //   CompareOpcode %RotatedOldVal, %Src2
-  //   BRCL KeepOldMask, UpdateMBB
+  //   BRC KeepOldMask, UpdateMBB
   MBB = LoopMBB;
   BuildMI(MBB, DL, TII->get(SystemZ::PHI), OldVal)
     .addReg(OrigVal).addMBB(StartMBB)
@@ -1856,8 +2585,8 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
       .addReg(OldVal).addReg(BitShift).addImm(0);
   BuildMI(MBB, DL, TII->get(CompareOpcode))
     .addReg(RotatedOldVal).addReg(Src2);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL))
-    .addImm(KeepOldMask).addMBB(UpdateMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ICMP).addImm(KeepOldMask).addMBB(UpdateMBB);
   MBB->addSuccessor(UpdateMBB);
   MBB->addSuccessor(UseAltMBB);
 
@@ -1887,7 +2616,8 @@ SystemZTargetLowering::emitAtomicLoadMinMax(MachineInstr *MI,
       .addReg(RotatedNewVal).addReg(NegBitShift).addImm(0);
   BuildMI(MBB, DL, TII->get(CSOpcode), Dest)
     .addReg(OldVal).addReg(NewVal).addOperand(Base).addImm(Disp);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
   MBB->addSuccessor(LoopMBB);
   MBB->addSuccessor(DoneMBB);
 
@@ -1903,7 +2633,6 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
   const SystemZInstrInfo *TII = TM.getInstrInfo();
   MachineFunction &MF = *MBB->getParent();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned MaskNE = CCMaskForCondCode(ISD::SETNE);
 
   // Extract the operands.  Base can be a register or a frame index.
   unsigned Dest        = MI->getOperand(0).getReg();
@@ -1935,7 +2664,7 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
 
   // Insert 2 basic blocks for the loop.
   MachineBasicBlock *StartMBB = MBB;
-  MachineBasicBlock *DoneMBB  = splitBlockAfter(MI, MBB);
+  MachineBasicBlock *DoneMBB  = splitBlockBefore(MI, MBB);
   MachineBasicBlock *LoopMBB  = emitBlockAfter(StartMBB);
   MachineBasicBlock *SetMBB   = emitBlockAfter(LoopMBB);
 
@@ -1978,7 +2707,9 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
     .addReg(CmpVal).addReg(Dest).addImm(32).addImm(63 - BitSize).addImm(0);
   BuildMI(MBB, DL, TII->get(SystemZ::CR))
     .addReg(Dest).addReg(RetryCmpVal);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(DoneMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addImm(SystemZ::CCMASK_CMP_NE).addMBB(DoneMBB);
   MBB->addSuccessor(DoneMBB);
   MBB->addSuccessor(SetMBB);
 
@@ -1998,7 +2729,8 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
     .addReg(RetrySwapVal).addReg(NegBitShift).addImm(-BitSize);
   BuildMI(MBB, DL, TII->get(CSOpcode), RetryOldVal)
     .addReg(OldVal).addReg(StoreVal).addOperand(Base).addImm(Disp);
-  BuildMI(MBB, DL, TII->get(SystemZ::BRCL)).addImm(MaskNE).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_CS).addImm(SystemZ::CCMASK_CS_NE).addMBB(LoopMBB);
   MBB->addSuccessor(LoopMBB);
   MBB->addSuccessor(DoneMBB);
 
@@ -2008,8 +2740,8 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr *MI,
 
 // Emit an extension from a GR32 or GR64 to a GR128.  ClearEven is true
 // if the high register of the GR128 value must be cleared or false if
-// it's "don't care".  SubReg is subreg_odd32 when extending a GR32
-// and subreg_odd when extending a GR64.
+// it's "don't care".  SubReg is subreg_l32 when extending a GR32
+// and subreg_l64 when extending a GR64.
 MachineBasicBlock *
 SystemZTargetLowering::emitExt128(MachineInstr *MI,
                                   MachineBasicBlock *MBB,
@@ -2031,7 +2763,7 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
     BuildMI(*MBB, MI, DL, TII->get(SystemZ::LLILL), Zero64)
       .addImm(0);
     BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewIn128)
-      .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_high);
+      .addReg(In128).addReg(Zero64).addImm(SystemZ::subreg_h64);
     In128 = NewIn128;
   }
   BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest)
@@ -2041,9 +2773,238 @@ SystemZTargetLowering::emitExt128(MachineInstr *MI,
   return MBB;
 }
 
+MachineBasicBlock *
+SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const {
+  const SystemZInstrInfo *TII = TM.getInstrInfo();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  MachineOperand DestBase = earlyUseOperand(MI->getOperand(0));
+  uint64_t       DestDisp = MI->getOperand(1).getImm();
+  MachineOperand SrcBase  = earlyUseOperand(MI->getOperand(2));
+  uint64_t       SrcDisp  = MI->getOperand(3).getImm();
+  uint64_t       Length   = MI->getOperand(4).getImm();
+
+  // When generating more than one CLC, all but the last will need to
+  // branch to the end when a difference is found.
+  MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
+                               splitBlockAfter(MI, MBB) : 0);
+
+  // Check for the loop form, in which operand 5 is the trip count.
+  if (MI->getNumExplicitOperands() > 5) {
+    bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+    uint64_t StartCountReg = MI->getOperand(5).getReg();
+    uint64_t StartSrcReg   = forceReg(MI, SrcBase, TII);
+    uint64_t StartDestReg  = (HaveSingleBase ? StartSrcReg :
+                              forceReg(MI, DestBase, TII));
+
+    const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+    uint64_t ThisSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+                            MRI.createVirtualRegister(RC));
+    uint64_t NextSrcReg  = MRI.createVirtualRegister(RC);
+    uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+                            MRI.createVirtualRegister(RC));
+
+    RC = &SystemZ::GR64BitRegClass;
+    uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+    uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+    MachineBasicBlock *StartMBB = MBB;
+    MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+    MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+    MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
+
+    //  StartMBB:
+    //   # fall through to LoopMMB
+    MBB->addSuccessor(LoopMBB);
+
+    //  LoopMBB:
+    //   %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+    //                      [ %NextDestReg, NextMBB ]
+    //   %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+    //                     [ %NextSrcReg, NextMBB ]
+    //   %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+    //                       [ %NextCountReg, NextMBB ]
+    //   ( PFD 2, 768+DestDisp(%ThisDestReg) )
+    //   Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+    //   ( JLH EndMBB )
+    //
+    // The prefetch is used only for MVC.  The JLH is used only for CLC.
+    MBB = LoopMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+      .addReg(StartDestReg).addMBB(StartMBB)
+      .addReg(NextDestReg).addMBB(NextMBB);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+        .addReg(StartSrcReg).addMBB(StartMBB)
+        .addReg(NextSrcReg).addMBB(NextMBB);
+    BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+      .addReg(StartCountReg).addMBB(StartMBB)
+      .addReg(NextCountReg).addMBB(NextMBB);
+    if (Opcode == SystemZ::MVC)
+      BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+        .addImm(SystemZ::PFD_WRITE)
+        .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+    BuildMI(MBB, DL, TII->get(Opcode))
+      .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+      .addReg(ThisSrcReg).addImm(SrcDisp);
+    if (EndMBB) {
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+    }
+
+    // NextMBB:
+    //   %NextDestReg = LA 256(%ThisDestReg)
+    //   %NextSrcReg = LA 256(%ThisSrcReg)
+    //   %NextCountReg = AGHI %ThisCountReg, -1
+    //   CGHI %NextCountReg, 0
+    //   JLH LoopMBB
+    //   # fall through to DoneMMB
+    //
+    // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+    MBB = NextMBB;
+
+    BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+      .addReg(ThisDestReg).addImm(256).addReg(0);
+    if (!HaveSingleBase)
+      BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+        .addReg(ThisSrcReg).addImm(256).addReg(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+      .addReg(ThisCountReg).addImm(-1);
+    BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+      .addReg(NextCountReg).addImm(0);
+    BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+      .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+      .addMBB(LoopMBB);
+    MBB->addSuccessor(LoopMBB);
+    MBB->addSuccessor(DoneMBB);
+
+    DestBase = MachineOperand::CreateReg(NextDestReg, false);
+    SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+    Length &= 255;
+    MBB = DoneMBB;
+  }
+  // Handle any remaining bytes with straight-line code.
+  while (Length > 0) {
+    uint64_t ThisLength = std::min(Length, uint64_t(256));
+    // The previous iteration might have created out-of-range displacements.
+    // Apply them using LAY if so.
+    if (!isUInt<12>(DestDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(DestBase).addImm(DestDisp).addReg(0);
+      DestBase = MachineOperand::CreateReg(Reg, false);
+      DestDisp = 0;
+    }
+    if (!isUInt<12>(SrcDisp)) {
+      unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+        .addOperand(SrcBase).addImm(SrcDisp).addReg(0);
+      SrcBase = MachineOperand::CreateReg(Reg, false);
+      SrcDisp = 0;
+    }
+    BuildMI(*MBB, MI, DL, TII->get(Opcode))
+      .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+      .addOperand(SrcBase).addImm(SrcDisp);
+    DestDisp += ThisLength;
+    SrcDisp += ThisLength;
+    Length -= ThisLength;
+    // If there's another CLC to go, branch to the end if a difference
+    // was found.
+    if (EndMBB && Length > 0) {
+      MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
+      BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+        .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+        .addMBB(EndMBB);
+      MBB->addSuccessor(EndMBB);
+      MBB->addSuccessor(NextMBB);
+      MBB = NextMBB;
+    }
+  }
+  if (EndMBB) {
+    MBB->addSuccessor(EndMBB);
+    MBB = EndMBB;
+    MBB->addLiveIn(SystemZ::CC);
+  }
+
+  MI->eraseFromParent();
+  return MBB;
+}
+
+// Decompose string pseudo-instruction MI into a loop that continually performs
+// Opcode until CC != 3.
+MachineBasicBlock *
+SystemZTargetLowering::emitStringWrapper(MachineInstr *MI,
+                                         MachineBasicBlock *MBB,
+                                         unsigned Opcode) const {
+  const SystemZInstrInfo *TII = TM.getInstrInfo();
+  MachineFunction &MF = *MBB->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+
+  uint64_t End1Reg   = MI->getOperand(0).getReg();
+  uint64_t Start1Reg = MI->getOperand(1).getReg();
+  uint64_t Start2Reg = MI->getOperand(2).getReg();
+  uint64_t CharReg   = MI->getOperand(3).getReg();
+
+  const TargetRegisterClass *RC = &SystemZ::GR64BitRegClass;
+  uint64_t This1Reg = MRI.createVirtualRegister(RC);
+  uint64_t This2Reg = MRI.createVirtualRegister(RC);
+  uint64_t End2Reg  = MRI.createVirtualRegister(RC);
+
+  MachineBasicBlock *StartMBB = MBB;
+  MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+  MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+  //  StartMBB:
+  //   # fall through to LoopMMB
+  MBB->addSuccessor(LoopMBB);
+
+  //  LoopMBB:
+  //   %This1Reg = phi [ %Start1Reg, StartMBB ], [ %End1Reg, LoopMBB ]
+  //   %This2Reg = phi [ %Start2Reg, StartMBB ], [ %End2Reg, LoopMBB ]
+  //   R0L = %CharReg
+  //   %End1Reg, %End2Reg = CLST %This1Reg, %This2Reg -- uses R0L
+  //   JO LoopMBB
+  //   # fall through to DoneMMB
+  //
+  // The load of R0L can be hoisted by post-RA LICM.
+  MBB = LoopMBB;
+
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This1Reg)
+    .addReg(Start1Reg).addMBB(StartMBB)
+    .addReg(End1Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(SystemZ::PHI), This2Reg)
+    .addReg(Start2Reg).addMBB(StartMBB)
+    .addReg(End2Reg).addMBB(LoopMBB);
+  BuildMI(MBB, DL, TII->get(TargetOpcode::COPY), SystemZ::R0L).addReg(CharReg);
+  BuildMI(MBB, DL, TII->get(Opcode))
+    .addReg(End1Reg, RegState::Define).addReg(End2Reg, RegState::Define)
+    .addReg(This1Reg).addReg(This2Reg);
+  BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+    .addImm(SystemZ::CCMASK_ANY).addImm(SystemZ::CCMASK_3).addMBB(LoopMBB);
+  MBB->addSuccessor(LoopMBB);
+  MBB->addSuccessor(DoneMBB);
+
+  DoneMBB->addLiveIn(SystemZ::CC);
+
+  MI->eraseFromParent();
+  return DoneMBB;
+}
+
 MachineBasicBlock *SystemZTargetLowering::
 EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
   switch (MI->getOpcode()) {
+  case SystemZ::Select32Mux:
   case SystemZ::Select32:
   case SystemZ::SelectF32:
   case SystemZ::Select64:
@@ -2051,12 +3012,45 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
   case SystemZ::SelectF128:
     return emitSelect(MI, MBB);
 
+  case SystemZ::CondStore8Mux:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, false);
+  case SystemZ::CondStore8MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STCMux, 0, true);
+  case SystemZ::CondStore16Mux:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, false);
+  case SystemZ::CondStore16MuxInv:
+    return emitCondStore(MI, MBB, SystemZ::STHMux, 0, true);
+  case SystemZ::CondStore8:
+    return emitCondStore(MI, MBB, SystemZ::STC, 0, false);
+  case SystemZ::CondStore8Inv:
+    return emitCondStore(MI, MBB, SystemZ::STC, 0, true);
+  case SystemZ::CondStore16:
+    return emitCondStore(MI, MBB, SystemZ::STH, 0, false);
+  case SystemZ::CondStore16Inv:
+    return emitCondStore(MI, MBB, SystemZ::STH, 0, true);
+  case SystemZ::CondStore32:
+    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, false);
+  case SystemZ::CondStore32Inv:
+    return emitCondStore(MI, MBB, SystemZ::ST, SystemZ::STOC, true);
+  case SystemZ::CondStore64:
+    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, false);
+  case SystemZ::CondStore64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STG, SystemZ::STOCG, true);
+  case SystemZ::CondStoreF32:
+    return emitCondStore(MI, MBB, SystemZ::STE, 0, false);
+  case SystemZ::CondStoreF32Inv:
+    return emitCondStore(MI, MBB, SystemZ::STE, 0, true);
+  case SystemZ::CondStoreF64:
+    return emitCondStore(MI, MBB, SystemZ::STD, 0, false);
+  case SystemZ::CondStoreF64Inv:
+    return emitCondStore(MI, MBB, SystemZ::STD, 0, true);
+
   case SystemZ::AEXT128_64:
-    return emitExt128(MI, MBB, false, SystemZ::subreg_low);
+    return emitExt128(MI, MBB, false, SystemZ::subreg_l64);
   case SystemZ::ZEXT128_32:
-    return emitExt128(MI, MBB, true, SystemZ::subreg_low32);
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l32);
   case SystemZ::ZEXT128_64:
-    return emitExt128(MI, MBB, true, SystemZ::subreg_low);
+    return emitExt128(MI, MBB, true, SystemZ::subreg_l64);
 
   case SystemZ::ATOMIC_SWAPW:
     return emitAtomicLoadBinary(MI, MBB, 0, 0);
@@ -2092,98 +3086,98 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
   case SystemZ::ATOMIC_LOADW_NR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0);
   case SystemZ::ATOMIC_LOADW_NILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0);
   case SystemZ::ATOMIC_LOAD_NR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32);
-  case SystemZ::ATOMIC_LOAD_NILL32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32);
-  case SystemZ::ATOMIC_LOAD_NILH32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32);
-  case SystemZ::ATOMIC_LOAD_NILF32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32);
-  case SystemZ::ATOMIC_LOAD_NGR:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
   case SystemZ::ATOMIC_LOAD_NILL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32);
   case SystemZ::ATOMIC_LOAD_NILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64);
-  case SystemZ::ATOMIC_LOAD_NIHL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64);
-  case SystemZ::ATOMIC_LOAD_NIHH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32);
   case SystemZ::ATOMIC_LOAD_NILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64);
-  case SystemZ::ATOMIC_LOAD_NIHF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32);
+  case SystemZ::ATOMIC_LOAD_NGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64);
+  case SystemZ::ATOMIC_LOAD_NILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64);
+  case SystemZ::ATOMIC_LOAD_NILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_NILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64);
+  case SystemZ::ATOMIC_LOAD_NIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64);
 
   case SystemZ::ATOMIC_LOADW_OR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 0);
   case SystemZ::ATOMIC_LOADW_OILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 0);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 0);
   case SystemZ::ATOMIC_LOAD_OR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::OR, 32);
-  case SystemZ::ATOMIC_LOAD_OILL32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL32, 32);
-  case SystemZ::ATOMIC_LOAD_OILH32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH32, 32);
-  case SystemZ::ATOMIC_LOAD_OILF32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF32, 32);
-  case SystemZ::ATOMIC_LOAD_OGR:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
   case SystemZ::ATOMIC_LOAD_OILL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL, 32);
   case SystemZ::ATOMIC_LOAD_OILH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 64);
-  case SystemZ::ATOMIC_LOAD_OIHL:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL, 64);
-  case SystemZ::ATOMIC_LOAD_OIHH:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH, 32);
   case SystemZ::ATOMIC_LOAD_OILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 64);
-  case SystemZ::ATOMIC_LOAD_OIHF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF, 64);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF, 32);
+  case SystemZ::ATOMIC_LOAD_OGR:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OGR, 64);
+  case SystemZ::ATOMIC_LOAD_OILL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILL64, 64);
+  case SystemZ::ATOMIC_LOAD_OILH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILH64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHL64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHL64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHH64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHH64, 64);
+  case SystemZ::ATOMIC_LOAD_OILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OILF64, 64);
+  case SystemZ::ATOMIC_LOAD_OIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::OIHF64, 64);
 
   case SystemZ::ATOMIC_LOADW_XR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 0);
   case SystemZ::ATOMIC_LOADW_XILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 0);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 0);
   case SystemZ::ATOMIC_LOAD_XR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::XR, 32);
-  case SystemZ::ATOMIC_LOAD_XILF32:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF32, 32);
+  case SystemZ::ATOMIC_LOAD_XILF:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 32);
   case SystemZ::ATOMIC_LOAD_XGR:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::XGR, 64);
-  case SystemZ::ATOMIC_LOAD_XILF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF, 64);
-  case SystemZ::ATOMIC_LOAD_XIHF:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF, 64);
+  case SystemZ::ATOMIC_LOAD_XILF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XILF64, 64);
+  case SystemZ::ATOMIC_LOAD_XIHF64:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::XIHF64, 64);
 
   case SystemZ::ATOMIC_LOADW_NRi:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 0, true);
   case SystemZ::ATOMIC_LOADW_NILHi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 0, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 0, true);
   case SystemZ::ATOMIC_LOAD_NRi:
     return emitAtomicLoadBinary(MI, MBB, SystemZ::NR, 32, true);
-  case SystemZ::ATOMIC_LOAD_NILL32i:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL32, 32, true);
-  case SystemZ::ATOMIC_LOAD_NILH32i:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH32, 32, true);
-  case SystemZ::ATOMIC_LOAD_NILF32i:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF32, 32, true);
-  case SystemZ::ATOMIC_LOAD_NGRi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
   case SystemZ::ATOMIC_LOAD_NILLi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 64, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL, 32, true);
   case SystemZ::ATOMIC_LOAD_NILHi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 64, true);
-  case SystemZ::ATOMIC_LOAD_NIHLi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL, 64, true);
-  case SystemZ::ATOMIC_LOAD_NIHHi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH, 64, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH, 32, true);
   case SystemZ::ATOMIC_LOAD_NILFi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 64, true);
-  case SystemZ::ATOMIC_LOAD_NIHFi:
-    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF, 64, true);
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF, 32, true);
+  case SystemZ::ATOMIC_LOAD_NGRi:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NGR, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHL64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHL64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHH64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHH64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NILF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NILF64, 64, true);
+  case SystemZ::ATOMIC_LOAD_NIHF64i:
+    return emitAtomicLoadBinary(MI, MBB, SystemZ::NIHF64, 64, true);
 
   case SystemZ::ATOMIC_LOADW_MIN:
     return emitAtomicLoadMinMax(MI, MBB, SystemZ::CR,
@@ -2227,6 +3221,27 @@ EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const {
 
   case SystemZ::ATOMIC_CMP_SWAPW:
     return emitAtomicCmpSwapW(MI, MBB);
+  case SystemZ::MVCSequence:
+  case SystemZ::MVCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
+  case SystemZ::NCSequence:
+  case SystemZ::NCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::NC);
+  case SystemZ::OCSequence:
+  case SystemZ::OCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::OC);
+  case SystemZ::XCSequence:
+  case SystemZ::XCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::XC);
+  case SystemZ::CLCSequence:
+  case SystemZ::CLCLoop:
+    return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
+  case SystemZ::CLSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::CLST);
+  case SystemZ::MVSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::MVST);
+  case SystemZ::SRSTLoop:
+    return emitStringWrapper(MI, MBB, SystemZ::SRST);
   default:
     llvm_unreachable("Unexpected instr type to insert");
   }
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index eea820c..c6dcca6 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -16,6 +16,7 @@
 #define LLVM_TARGET_SystemZ_ISELLOWERING_H
 
 #include "SystemZ.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/Target/TargetLowering.h"
 
@@ -31,17 +32,32 @@ namespace SystemZISD {
     // is the target address.  The arguments start at operand 2.
     // There is an optional glue operand at the end.
     CALL,
+    SIBCALL,
 
     // Wraps a TargetGlobalAddress that should be loaded using PC-relative
     // accesses (LARL).  Operand 0 is the address.
     PCREL_WRAPPER,
 
-    // Signed integer and floating-point comparisons.  The operands are the
-    // two values to compare.
-    CMP,
+    // Used in cases where an offset is applied to a TargetGlobalAddress.
+    // Operand 0 is the full TargetGlobalAddress and operand 1 is a
+    // PCREL_WRAPPER for an anchor point.  This is used so that we can
+    // cheaply refer to either the full address or the anchor point
+    // as a register base.
+    PCREL_OFFSET,
 
-    // Likewise unsigned integer comparison.
-    UCMP,
+    // Integer comparisons.  There are three operands: the two values
+    // to compare, and an integer of type SystemZICMP.
+    ICMP,
+
+    // Floating-point comparisons.  The two operands are the values to compare.
+    FCMP,
+
+    // Test under mask.  The first operand is ANDed with the second operand
+    // and the condition codes are set on the result.  The third operand is
+    // a boolean that is true if the condition codes need to distinguish
+    // between CCMASK_TM_MIXED_MSB_0 and CCMASK_TM_MIXED_MSB_1 (which the
+    // register forms do but the memory forms don't).
+    TM,
 
     // Branches if a condition is true.  Operand 0 is the chain operand;
     // operand 1 is the 4-bit condition-code mask, with bit N in
@@ -67,10 +83,55 @@ namespace SystemZISD {
     // first input operands are GR128s.  The trailing numbers are the
     // widths of the second operand in bits.
     UMUL_LOHI64,
+    SDIVREM32,
     SDIVREM64,
     UDIVREM32,
     UDIVREM64,
 
+    // Use a series of MVCs to copy bytes from one memory location to another.
+    // The operands are:
+    // - the target address
+    // - the source address
+    // - the constant length
+    //
+    // This isn't a memory opcode because we'd need to attach two
+    // MachineMemOperands rather than one.
+    MVC,
+
+    // Like MVC, but implemented as a loop that handles X*256 bytes
+    // followed by straight-line code to handle the rest (if any).
+    // The value of X is passed as an additional operand.
+    MVC_LOOP,
+
+    // Similar to MVC and MVC_LOOP, but for logic operations (AND, OR, XOR).
+    NC,
+    NC_LOOP,
+    OC,
+    OC_LOOP,
+    XC,
+    XC_LOOP,
+
+    // Use CLC to compare two blocks of memory, with the same comments
+    // as for MVC and MVC_LOOP.
+    CLC,
+    CLC_LOOP,
+
+    // Use an MVST-based sequence to implement stpcpy().
+    STPCPY,
+
+    // Use a CLST-based sequence to implement strcmp().  The two input operands
+    // are the addresses of the strings to compare.
+    STRCMP,
+
+    // Use an SRST-based sequence to search a block of memory.  The first
+    // operand is the end address, the second is the start, and the third
+    // is the character to search for.  CC is set to 1 on success and 2
+    // on failure.
+    SEARCH_STRING,
+
+    // Store the CC value in bits 29 and 28 of an integer.
+    IPM,
+
     // Wrappers around the inner loop of an 8- or 16-bit ATOMIC_SWAP or
     // ATOMIC_LOAD_<op>.
     //
@@ -102,7 +163,27 @@ namespace SystemZISD {
     //            operand into the high bits
     // Operand 4: the negative of operand 2, for rotating the other way
     // Operand 5: the width of the field in bits (8 or 16)
-    ATOMIC_CMP_SWAPW
+    ATOMIC_CMP_SWAPW,
+
+    // Prefetch from the second operand using the 4-bit control code in
+    // the first operand.  The code is 1 for a load prefetch and 2 for
+    // a store prefetch.
+    PREFETCH
+  };
+
+  // Return true if OPCODE is some kind of PC-relative address.
+  inline bool isPCREL(unsigned Opcode) {
+    return Opcode == PCREL_WRAPPER || Opcode == PCREL_OFFSET;
+  }
+}
+
+namespace SystemZICMP {
+  // Describes whether an integer comparison needs to be signed or unsigned,
+  // or whether either type is OK.
+  enum {
+    Any,
+    UnsignedOnly,
+    SignedOnly
   };
 }
 
@@ -117,17 +198,19 @@ public:
   virtual MVT getScalarShiftAmountTy(EVT LHSTy) const LLVM_OVERRIDE {
     return MVT::i32;
   }
-  virtual EVT getSetCCResultType(EVT VT) const {
-    return MVT::i32;
-  }
-  virtual bool isFMAFasterThanMulAndAdd(EVT) const LLVM_OVERRIDE {
-    return true;
-  }
-  virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+  virtual EVT getSetCCResultType(LLVMContext &, EVT) const LLVM_OVERRIDE;
+  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const LLVM_OVERRIDE;
+  virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const LLVM_OVERRIDE;
+  virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const
+     LLVM_OVERRIDE;
+  virtual bool allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const
+    LLVM_OVERRIDE;
+  virtual bool isTruncateFree(Type *, Type *) const LLVM_OVERRIDE;
+  virtual bool isTruncateFree(EVT, EVT) const LLVM_OVERRIDE;
   virtual const char *getTargetNodeName(unsigned Opcode) const LLVM_OVERRIDE;
   virtual std::pair<unsigned, const TargetRegisterClass *>
     getRegForInlineAsmConstraint(const std::string &Constraint,
-                                 EVT VT) const LLVM_OVERRIDE;
+                                 MVT VT) const LLVM_OVERRIDE;
   virtual TargetLowering::ConstraintType
     getConstraintType(const std::string &Constraint) const LLVM_OVERRIDE;
   virtual TargetLowering::ConstraintWeight
@@ -143,11 +226,13 @@ public:
                                 MachineBasicBlock *BB) const LLVM_OVERRIDE;
   virtual SDValue LowerOperation(SDValue Op,
                                  SelectionDAG &DAG) const LLVM_OVERRIDE;
+  virtual bool allowTruncateForTailCall(Type *, Type *) const LLVM_OVERRIDE;
+  virtual bool mayBeEmittedAsTailCall(CallInst *CI) const LLVM_OVERRIDE;
   virtual SDValue
     LowerFormalArguments(SDValue Chain,
                          CallingConv::ID CallConv, bool isVarArg,
                          const SmallVectorImpl<ISD::InputArg> &Ins,
-                         DebugLoc DL, SelectionDAG &DAG,
+                         SDLoc DL, SelectionDAG &DAG,
                          SmallVectorImpl<SDValue> &InVals) const LLVM_OVERRIDE;
   virtual SDValue
     LowerCall(CallLoweringInfo &CLI,
@@ -158,13 +243,14 @@ public:
                 CallingConv::ID CallConv, bool IsVarArg,
                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                 const SmallVectorImpl<SDValue> &OutVals,
-                DebugLoc DL, SelectionDAG &DAG) const LLVM_OVERRIDE;
+                SDLoc DL, SelectionDAG &DAG) const LLVM_OVERRIDE;
 
 private:
   const SystemZSubtarget &Subtarget;
   const SystemZTargetMachine &TM;
 
   // Implement LowerOperation for individual opcodes.
+  SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGlobalAddress(GlobalAddressSDNode *Node,
@@ -178,6 +264,7 @@ private:
   SDValue lowerVASTART(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
@@ -188,10 +275,24 @@ private:
   SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+
+  // If the last instruction before MBBI in MBB was some form of COMPARE,
+  // try to replace it with a COMPARE AND BRANCH just before MBBI.
+  // CCMask and Target are the BRC-like operands for the branch.
+  // Return true if the change was made.
+  bool convertPrevCompareToBranch(MachineBasicBlock *MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  unsigned CCMask,
+                                  MachineBasicBlock *Target) const;
 
   // Implement EmitInstrWithCustomInserter for individual operation types.
   MachineBasicBlock *emitSelect(MachineInstr *MI,
                                 MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitCondStore(MachineInstr *MI,
+                                   MachineBasicBlock *BB,
+                                   unsigned StoreOpcode, unsigned STOCOpcode,
+                                   bool Invert) const;
   MachineBasicBlock *emitExt128(MachineInstr *MI,
                                 MachineBasicBlock *MBB,
                                 bool ClearEven, unsigned SubReg) const;
@@ -206,6 +307,12 @@ private:
                                           unsigned BitSize) const;
   MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr *MI,
                                         MachineBasicBlock *BB) const;
+  MachineBasicBlock *emitMemMemWrapper(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
+  MachineBasicBlock *emitStringWrapper(MachineInstr *MI,
+                                       MachineBasicBlock *BB,
+                                       unsigned Opcode) const;
 };
 } // end namespace llvm
 
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 7c9f0e6..6080046 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-// Control-flow instructions
+// Select instructions
 //===----------------------------------------------------------------------===//
 
 // C's ?: operator for floating-point operands.
@@ -16,6 +16,11 @@ def SelectF32  : SelectWrapper<FP32>;
 def SelectF64  : SelectWrapper<FP64>;
 def SelectF128 : SelectWrapper<FP128>;
 
+defm CondStoreF32 : CondStores<FP32, nonvolatile_store,
+                               nonvolatile_load, bdxaddr20only>;
+defm CondStoreF64 : CondStores<FP64, nonvolatile_store,
+                               nonvolatile_load, bdxaddr20only>;
+
 //===----------------------------------------------------------------------===//
 // Move instructions
 //===----------------------------------------------------------------------===//
@@ -29,57 +34,69 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
 
 // Moves between two floating-point registers.
 let neverHasSideEffects = 1 in {
-  def LER : UnaryRR <"ler", 0x38,   null_frag, FP32,  FP32>;
-  def LDR : UnaryRR <"ldr", 0x28,   null_frag, FP64,  FP64>;
-  def LXR : UnaryRRE<"lxr", 0xB365, null_frag, FP128, FP128>;
+  def LER : UnaryRR <"le", 0x38,   null_frag, FP32,  FP32>;
+  def LDR : UnaryRR <"ld", 0x28,   null_frag, FP64,  FP64>;
+  def LXR : UnaryRRE<"lx", 0xB365, null_frag, FP128, FP128>;
+}
+
+// Moves between two floating-point registers that also set the condition
+// codes.
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  defm LTEBR : LoadAndTestRRE<"lteb", 0xB302, FP32>;
+  defm LTDBR : LoadAndTestRRE<"ltdb", 0xB312, FP64>;
+  defm LTXBR : LoadAndTestRRE<"ltxb", 0xB342, FP128>;
 }
+def : CompareZeroFP<LTEBRCompare, FP32>;
+def : CompareZeroFP<LTDBRCompare, FP64>;
+def : CompareZeroFP<LTXBRCompare, FP128>;
 
 // Moves between 64-bit integer and floating-point registers.
-def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>;
-def LDGR : UnaryRRE<"ldgr", 0xB3C1, bitconvert, FP64, GR64>;
+def LGDR : UnaryRRE<"lgd", 0xB3CD, bitconvert, GR64, FP64>;
+def LDGR : UnaryRRE<"ldg", 0xB3C1, bitconvert, FP64, GR64>;
 
 // fcopysign with an FP32 result.
 let isCodeGenOnly = 1 in {
-  def CPSDRss : BinaryRevRRF<"cpsdr", 0xB372, fcopysign, FP32, FP32>;
-  def CPSDRsd : BinaryRevRRF<"cpsdr", 0xB372, fcopysign, FP32, FP64>;
+  def CPSDRss : BinaryRRF<"cpsd", 0xB372, fcopysign, FP32, FP32>;
+  def CPSDRsd : BinaryRRF<"cpsd", 0xB372, fcopysign, FP32, FP64>;
 }
 
-// The sign of an FP128 is in the high register.  Give the CPSDRsd
-// operands in R1, R2, R3 order.
+// The sign of an FP128 is in the high register.
 def : Pat<(fcopysign FP32:$src1, FP128:$src2),
-          (CPSDRsd (EXTRACT_SUBREG FP128:$src2, subreg_high), FP32:$src1)>;
+          (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
 
 // fcopysign with an FP64 result.
 let isCodeGenOnly = 1 in
-  def CPSDRds : BinaryRevRRF<"cpsdr", 0xB372, fcopysign, FP64, FP32>;
-def CPSDRdd : BinaryRevRRF<"cpsdr", 0xB372, fcopysign, FP64, FP64>;
+  def CPSDRds : BinaryRRF<"cpsd", 0xB372, fcopysign, FP64, FP32>;
+def CPSDRdd : BinaryRRF<"cpsd", 0xB372, fcopysign, FP64, FP64>;
 
-// The sign of an FP128 is in the high register.  Give the CPSDRdd
-// operands in R1, R2, R3 order.
+// The sign of an FP128 is in the high register.
 def : Pat<(fcopysign FP64:$src1, FP128:$src2),
-          (CPSDRdd (EXTRACT_SUBREG FP128:$src2, subreg_high), FP64:$src1)>;
+          (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
 
 // fcopysign with an FP128 result.  Use "upper" as the high half and leave
 // the low half as-is.
 class CopySign128<RegisterOperand cls, dag upper>
   : Pat<(fcopysign FP128:$src1, cls:$src2),
-        (INSERT_SUBREG FP128:$src1, upper, subreg_high)>;
+        (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>;
 
-// Give the CPSDR* operands in R1, R2, R3 order.
-def : CopySign128<FP32,  (CPSDRds FP32:$src2,
-                                  (EXTRACT_SUBREG FP128:$src1, subreg_high))>;
-def : CopySign128<FP64,  (CPSDRdd FP64:$src2,
-                                  (EXTRACT_SUBREG FP128:$src1, subreg_high))>;
-def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src2, subreg_high),
-                                  (EXTRACT_SUBREG FP128:$src1, subreg_high))>;
+def : CopySign128<FP32,  (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  FP32:$src2)>;
+def : CopySign128<FP64,  (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  FP64:$src2)>;
+def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64),
+                                  (EXTRACT_SUBREG FP128:$src2, subreg_h64))>;
+
+defm LoadStoreF32  : MVCLoadStore<load, f32,  MVCSequence, 4>;
+defm LoadStoreF64  : MVCLoadStore<load, f64,  MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, f128, MVCSequence, 16>;
 
 //===----------------------------------------------------------------------===//
 // Load instructions
 //===----------------------------------------------------------------------===//
 
 let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
-  defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32>;
-  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64>;
+  defm LE : UnaryRXPair<"le", 0x78, 0xED64, load, FP32, 4>;
+  defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -94,8 +111,8 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
 //===----------------------------------------------------------------------===//
 
 let SimpleBDXStore = 1 in {
-  defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32>;
-  defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64>;
+  defm STE : StoreRXPair<"ste", 0x70, 0xED66, store, FP32, 4>;
+  defm STD : StoreRXPair<"std", 0x60, 0xED67, store, FP64, 8>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -112,201 +129,232 @@ let SimpleBDXStore = 1 in {
 // Convert floating-point values to narrower representations, rounding
 // according to the current mode.  The destination of LEXBR and LDXBR
 // is a 128-bit value, but only the first register of the pair is used.
-def LEDBR : UnaryRRE<"ledbr", 0xB344, fround,    FP32,  FP64>;
-def LEXBR : UnaryRRE<"lexbr", 0xB346, null_frag, FP128, FP128>;
-def LDXBR : UnaryRRE<"ldxbr", 0xB345, null_frag, FP128, FP128>;
+def LEDBR : UnaryRRE<"ledb", 0xB344, fround,    FP32,  FP64>;
+def LEXBR : UnaryRRE<"lexb", 0xB346, null_frag, FP128, FP128>;
+def LDXBR : UnaryRRE<"ldxb", 0xB345, null_frag, FP128, FP128>;
 
 def : Pat<(f32 (fround FP128:$src)),
-          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_32bit)>;
+          (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hh32)>;
 def : Pat<(f64 (fround FP128:$src)),
-          (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_high)>;
+          (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>;
 
 // Extend register floating-point values to wider representations.
-def LDEBR : UnaryRRE<"ldebr", 0xB304, fextend, FP64,  FP32>;
-def LXEBR : UnaryRRE<"lxebr", 0xB306, fextend, FP128, FP32>;
-def LXDBR : UnaryRRE<"lxdbr", 0xB305, fextend, FP128, FP64>;
+def LDEBR : UnaryRRE<"ldeb", 0xB304, fextend, FP64,  FP32>;
+def LXEBR : UnaryRRE<"lxeb", 0xB306, fextend, FP128, FP32>;
+def LXDBR : UnaryRRE<"lxdb", 0xB305, fextend, FP128, FP64>;
 
 // Extend memory floating-point values to wider representations.
-def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64>;
-def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128>;
-def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128>;
+def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64,  4>;
+def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>;
+def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>;
 
 // Convert a signed integer register value to a floating-point one.
-let Defs = [PSW] in {
-  def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32,  GR32>;
-  def CDFBR : UnaryRRE<"cdfbr", 0xB395, sint_to_fp, FP64,  GR32>;
-  def CXFBR : UnaryRRE<"cxfbr", 0xB396, sint_to_fp, FP128, GR32>;
-
-  def CEGBR : UnaryRRE<"cegbr", 0xB3A4, sint_to_fp, FP32,  GR64>;
-  def CDGBR : UnaryRRE<"cdgbr", 0xB3A5, sint_to_fp, FP64,  GR64>;
-  def CXGBR : UnaryRRE<"cxgbr", 0xB3A6, sint_to_fp, FP128, GR64>;
-}
+def CEFBR : UnaryRRE<"cefb", 0xB394, sint_to_fp, FP32,  GR32>;
+def CDFBR : UnaryRRE<"cdfb", 0xB395, sint_to_fp, FP64,  GR32>;
+def CXFBR : UnaryRRE<"cxfb", 0xB396, sint_to_fp, FP128, GR32>;
+
+def CEGBR : UnaryRRE<"cegb", 0xB3A4, sint_to_fp, FP32,  GR64>;
+def CDGBR : UnaryRRE<"cdgb", 0xB3A5, sint_to_fp, FP64,  GR64>;
+def CXGBR : UnaryRRE<"cxgb", 0xB3A6, sint_to_fp, FP128, GR64>;
 
 // Convert a floating-point register value to a signed integer value,
 // with the second operand (modifier M3) specifying the rounding mode.
-let Defs = [PSW] in {
-  def CFEBR : UnaryRRF<"cfebr", 0xB398, GR32, FP32>;
-  def CFDBR : UnaryRRF<"cfdbr", 0xB399, GR32, FP64>;
-  def CFXBR : UnaryRRF<"cfxbr", 0xB39A, GR32, FP128>;
-
-  def CGEBR : UnaryRRF<"cgebr", 0xB3A8, GR64, FP32>;
-  def CGDBR : UnaryRRF<"cgdbr", 0xB3A9, GR64, FP64>;
-  def CGXBR : UnaryRRF<"cgxbr", 0xB3AA, GR64, FP128>;
+let Defs = [CC] in {
+  def CFEBR : UnaryRRF<"cfeb", 0xB398, GR32, FP32>;
+  def CFDBR : UnaryRRF<"cfdb", 0xB399, GR32, FP64>;
+  def CFXBR : UnaryRRF<"cfxb", 0xB39A, GR32, FP128>;
+
+  def CGEBR : UnaryRRF<"cgeb", 0xB3A8, GR64, FP32>;
+  def CGDBR : UnaryRRF<"cgdb", 0xB3A9, GR64, FP64>;
+  def CGXBR : UnaryRRF<"cgxb", 0xB3AA, GR64, FP128>;
 }
 
 // fp_to_sint always rounds towards zero, which is modifier value 5.
-def : Pat<(i32 (fp_to_sint FP32:$src)),  (CFEBR FP32:$src,  5)>;
-def : Pat<(i32 (fp_to_sint FP64:$src)),  (CFDBR FP64:$src,  5)>;
-def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR FP128:$src, 5)>;
+def : Pat<(i32 (fp_to_sint FP32:$src)),  (CFEBR 5, FP32:$src)>;
+def : Pat<(i32 (fp_to_sint FP64:$src)),  (CFDBR 5, FP64:$src)>;
+def : Pat<(i32 (fp_to_sint FP128:$src)), (CFXBR 5, FP128:$src)>;
 
-def : Pat<(i64 (fp_to_sint FP32:$src)),  (CGEBR FP32:$src,  5)>;
-def : Pat<(i64 (fp_to_sint FP64:$src)),  (CGDBR FP64:$src,  5)>;
-def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR FP128:$src, 5)>;
+def : Pat<(i64 (fp_to_sint FP32:$src)),  (CGEBR 5, FP32:$src)>;
+def : Pat<(i64 (fp_to_sint FP64:$src)),  (CGDBR 5, FP64:$src)>;
+def : Pat<(i64 (fp_to_sint FP128:$src)), (CGXBR 5, FP128:$src)>;
 
 //===----------------------------------------------------------------------===//
 // Unary arithmetic
 //===----------------------------------------------------------------------===//
 
 // Negation (Load Complement).
-let Defs = [PSW] in {
-  def LCEBR : UnaryRRE<"lcebr", 0xB303, fneg, FP32,  FP32>;
-  def LCDBR : UnaryRRE<"lcdbr", 0xB313, fneg, FP64,  FP64>;
-  def LCXBR : UnaryRRE<"lcxbr", 0xB343, fneg, FP128, FP128>;
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LCEBR : UnaryRRE<"lceb", 0xB303, fneg, FP32,  FP32>;
+  def LCDBR : UnaryRRE<"lcdb", 0xB313, fneg, FP64,  FP64>;
+  def LCXBR : UnaryRRE<"lcxb", 0xB343, fneg, FP128, FP128>;
 }
 
 // Absolute value (Load Positive).
-let Defs = [PSW] in {
-  def LPEBR : UnaryRRE<"lpebr", 0xB300, fabs, FP32,  FP32>;
-  def LPDBR : UnaryRRE<"lpdbr", 0xB310, fabs, FP64,  FP64>;
-  def LPXBR : UnaryRRE<"lpxbr", 0xB340, fabs, FP128, FP128>;
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LPEBR : UnaryRRE<"lpeb", 0xB300, fabs, FP32,  FP32>;
+  def LPDBR : UnaryRRE<"lpdb", 0xB310, fabs, FP64,  FP64>;
+  def LPXBR : UnaryRRE<"lpxb", 0xB340, fabs, FP128, FP128>;
 }
 
 // Negative absolute value (Load Negative).
-let Defs = [PSW] in {
-  def LNEBR : UnaryRRE<"lnebr", 0xB301, fnabs, FP32,  FP32>;
-  def LNDBR : UnaryRRE<"lndbr", 0xB311, fnabs, FP64,  FP64>;
-  def LNXBR : UnaryRRE<"lnxbr", 0xB341, fnabs, FP128, FP128>;
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def LNEBR : UnaryRRE<"lneb", 0xB301, fnabs, FP32,  FP32>;
+  def LNDBR : UnaryRRE<"lndb", 0xB311, fnabs, FP64,  FP64>;
+  def LNXBR : UnaryRRE<"lnxb", 0xB341, fnabs, FP128, FP128>;
 }
 
 // Square root.
-def SQEBR : UnaryRRE<"sqebr", 0xB314, fsqrt, FP32,  FP32>;
-def SQDBR : UnaryRRE<"sqdbr", 0xB315, fsqrt, FP64,  FP64>;
-def SQXBR : UnaryRRE<"sqxbr", 0xB316, fsqrt, FP128, FP128>;
+def SQEBR : UnaryRRE<"sqeb", 0xB314, fsqrt, FP32,  FP32>;
+def SQDBR : UnaryRRE<"sqdb", 0xB315, fsqrt, FP64,  FP64>;
+def SQXBR : UnaryRRE<"sqxb", 0xB316, fsqrt, FP128, FP128>;
 
-def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32>;
-def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64>;
+def SQEB : UnaryRXE<"sqeb", 0xED14, loadu<fsqrt>, FP32, 4>;
+def SQDB : UnaryRXE<"sqdb", 0xED15, loadu<fsqrt>, FP64, 8>;
 
 // Round to an integer, with the second operand (modifier M3) specifying
-// the rounding mode.
-//
-// These forms always check for inexact conditions.  z196 added versions
-// that allow this to suppressed (as for fnearbyint), but we don't yet
-// support -march=z196.
-let Defs = [PSW] in {
-  def FIEBR : UnaryRRF<"fiebr", 0xB357, FP32,  FP32>;
-  def FIDBR : UnaryRRF<"fidbr", 0xB35F, FP64,  FP64>;
-  def FIXBR : UnaryRRF<"fixbr", 0xB347, FP128, FP128>;
-}
+// the rounding mode.  These forms always check for inexact conditions.
+def FIEBR : UnaryRRF<"fieb", 0xB357, FP32,  FP32>;
+def FIDBR : UnaryRRF<"fidb", 0xB35F, FP64,  FP64>;
+def FIXBR : UnaryRRF<"fixb", 0xB347, FP128, FP128>;
+
+// Extended forms of the previous three instructions.  M4 can be set to 4
+// to suppress detection of inexact conditions.
+def FIEBRA : UnaryRRF4<"fiebra", 0xB357, FP32,  FP32>,
+             Requires<[FeatureFPExtension]>;
+def FIDBRA : UnaryRRF4<"fidbra", 0xB35F, FP64,  FP64>,
+             Requires<[FeatureFPExtension]>;
+def FIXBRA : UnaryRRF4<"fixbra", 0xB347, FP128, FP128>,
+             Requires<[FeatureFPExtension]>;
 
 // frint rounds according to the current mode (modifier 0) and detects
 // inexact conditions.
-def : Pat<(frint FP32:$src),  (FIEBR FP32:$src,  0)>;
-def : Pat<(frint FP64:$src),  (FIDBR FP64:$src,  0)>;
-def : Pat<(frint FP128:$src), (FIXBR FP128:$src, 0)>;
+def : Pat<(frint FP32:$src),  (FIEBR 0, FP32:$src)>;
+def : Pat<(frint FP64:$src),  (FIDBR 0, FP64:$src)>;
+def : Pat<(frint FP128:$src), (FIXBR 0, FP128:$src)>;
+
+let Predicates = [FeatureFPExtension] in {
+  // fnearbyint is like frint but does not detect inexact conditions.
+  def : Pat<(fnearbyint FP32:$src),  (FIEBRA 0, FP32:$src,  4)>;
+  def : Pat<(fnearbyint FP64:$src),  (FIDBRA 0, FP64:$src,  4)>;
+  def : Pat<(fnearbyint FP128:$src), (FIXBRA 0, FP128:$src, 4)>;
+
+  // floor is no longer allowed to raise an inexact condition,
+  // so restrict it to the cases where the condition can be suppressed.
+  // Mode 7 is round towards -inf.
+  def : Pat<(ffloor FP32:$src),  (FIEBRA 7, FP32:$src,  4)>;
+  def : Pat<(ffloor FP64:$src),  (FIDBRA 7, FP64:$src,  4)>;
+  def : Pat<(ffloor FP128:$src), (FIXBRA 7, FP128:$src, 4)>;
+
+  // Same idea for ceil, where mode 6 is round towards +inf.
+  def : Pat<(fceil FP32:$src),  (FIEBRA 6, FP32:$src,  4)>;
+  def : Pat<(fceil FP64:$src),  (FIDBRA 6, FP64:$src,  4)>;
+  def : Pat<(fceil FP128:$src), (FIXBRA 6, FP128:$src, 4)>;
+
+  // Same idea for trunc, where mode 5 is round towards zero.
+  def : Pat<(ftrunc FP32:$src),  (FIEBRA 5, FP32:$src,  4)>;
+  def : Pat<(ftrunc FP64:$src),  (FIDBRA 5, FP64:$src,  4)>;
+  def : Pat<(ftrunc FP128:$src), (FIXBRA 5, FP128:$src, 4)>;
+
+  // Same idea for round, where mode 1 is round towards nearest with
+  // ties away from zero.
+  def : Pat<(frnd FP32:$src),  (FIEBRA 1, FP32:$src,  4)>;
+  def : Pat<(frnd FP64:$src),  (FIDBRA 1, FP64:$src,  4)>;
+  def : Pat<(frnd FP128:$src), (FIXBRA 1, FP128:$src, 4)>;
+}
 
 //===----------------------------------------------------------------------===//
 // Binary arithmetic
 //===----------------------------------------------------------------------===//
 
 // Addition.
-let Defs = [PSW] in {
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
   let isCommutable = 1 in {
-    def AEBR : BinaryRRE<"aebr", 0xB30A, fadd, FP32,  FP32>;
-    def ADBR : BinaryRRE<"adbr", 0xB31A, fadd, FP64,  FP64>;
-    def AXBR : BinaryRRE<"axbr", 0xB34A, fadd, FP128, FP128>;
+    def AEBR : BinaryRRE<"aeb", 0xB30A, fadd, FP32,  FP32>;
+    def ADBR : BinaryRRE<"adb", 0xB31A, fadd, FP64,  FP64>;
+    def AXBR : BinaryRRE<"axb", 0xB34A, fadd, FP128, FP128>;
   }
-  def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load>;
-  def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load>;
+  def AEB : BinaryRXE<"aeb", 0xED0A, fadd, FP32, load, 4>;
+  def ADB : BinaryRXE<"adb", 0xED1A, fadd, FP64, load, 8>;
 }
 
 // Subtraction.
-let Defs = [PSW] in {
-  def SEBR : BinaryRRE<"sebr", 0xB30B, fsub, FP32,  FP32>;
-  def SDBR : BinaryRRE<"sdbr", 0xB31B, fsub, FP64,  FP64>;
-  def SXBR : BinaryRRE<"sxbr", 0xB34B, fsub, FP128, FP128>;
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0xF in {
+  def SEBR : BinaryRRE<"seb", 0xB30B, fsub, FP32,  FP32>;
+  def SDBR : BinaryRRE<"sdb", 0xB31B, fsub, FP64,  FP64>;
+  def SXBR : BinaryRRE<"sxb", 0xB34B, fsub, FP128, FP128>;
 
-  def SEB : BinaryRXE<"seb",  0xED0B, fsub, FP32, load>;
-  def SDB : BinaryRXE<"sdb",  0xED1B, fsub, FP64, load>;
+  def SEB : BinaryRXE<"seb",  0xED0B, fsub, FP32, load, 4>;
+  def SDB : BinaryRXE<"sdb",  0xED1B, fsub, FP64, load, 8>;
 }
 
 // Multiplication.
 let isCommutable = 1 in {
-  def MEEBR : BinaryRRE<"meebr", 0xB317, fmul, FP32,  FP32>;
-  def MDBR  : BinaryRRE<"mdbr",  0xB31C, fmul, FP64,  FP64>;
-  def MXBR  : BinaryRRE<"mxbr",  0xB34C, fmul, FP128, FP128>;
+  def MEEBR : BinaryRRE<"meeb", 0xB317, fmul, FP32,  FP32>;
+  def MDBR  : BinaryRRE<"mdb",  0xB31C, fmul, FP64,  FP64>;
+  def MXBR  : BinaryRRE<"mxb",  0xB34C, fmul, FP128, FP128>;
 }
-def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load>;
-def MDB  : BinaryRXE<"mdb",  0xED1C, fmul, FP64, load>;
+def MEEB : BinaryRXE<"meeb", 0xED17, fmul, FP32, load, 4>;
+def MDB  : BinaryRXE<"mdb",  0xED1C, fmul, FP64, load, 8>;
 
 // f64 multiplication of two FP32 registers.
-def MDEBR : BinaryRRE<"mdebr", 0xB30C, null_frag, FP64, FP32>;
+def MDEBR : BinaryRRE<"mdeb", 0xB30C, null_frag, FP64, FP32>;
 def : Pat<(fmul (f64 (fextend FP32:$src1)), (f64 (fextend FP32:$src2))),
           (MDEBR (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                FP32:$src1, subreg_32bit), FP32:$src2)>;
+                                FP32:$src1, subreg_h32), FP32:$src2)>;
 
 // f64 multiplication of an FP32 register and an f32 memory.
-def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load>;
+def MDEB : BinaryRXE<"mdeb", 0xED0C, null_frag, FP64, load, 4>;
 def : Pat<(fmul (f64 (fextend FP32:$src1)),
                 (f64 (extloadf32 bdxaddr12only:$addr))),
-          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_32bit),
+          (MDEB (INSERT_SUBREG (f64 (IMPLICIT_DEF)), FP32:$src1, subreg_h32),
                 bdxaddr12only:$addr)>;
 
 // f128 multiplication of two FP64 registers.
-def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>;
+def MXDBR : BinaryRRE<"mxdb", 0xB307, null_frag, FP128, FP64>;
 def : Pat<(fmul (f128 (fextend FP64:$src1)), (f128 (fextend FP64:$src2))),
           (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)),
-                                FP64:$src1, subreg_high), FP64:$src2)>;
+                                FP64:$src1, subreg_h64), FP64:$src2)>;
 
 // f128 multiplication of an FP64 register and an f64 memory.
-def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load>;
+def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>;
 def : Pat<(fmul (f128 (fextend FP64:$src1)),
                 (f128 (extloadf64 bdxaddr12only:$addr))),
-          (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_high),
+          (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64),
                 bdxaddr12only:$addr)>;
 
 // Fused multiply-add.
-def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32>;
-def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64>;
+def MAEBR : TernaryRRD<"maeb", 0xB30E, z_fma, FP32>;
+def MADBR : TernaryRRD<"madb", 0xB31E, z_fma, FP64>;
 
-def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load>;
-def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load>;
+def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load, 4>;
+def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load, 8>;
 
 // Fused multiply-subtract.
-def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32>;
-def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64>;
+def MSEBR : TernaryRRD<"mseb", 0xB30F, z_fms, FP32>;
+def MSDBR : TernaryRRD<"msdb", 0xB31F, z_fms, FP64>;
 
-def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load>;
-def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load>;
+def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load, 4>;
+def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load, 8>;
 
 // Division.
-def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32,  FP32>;
-def DDBR : BinaryRRE<"ddbr", 0xB31D, fdiv, FP64,  FP64>;
-def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>;
+def DEBR : BinaryRRE<"deb", 0xB30D, fdiv, FP32,  FP32>;
+def DDBR : BinaryRRE<"ddb", 0xB31D, fdiv, FP64,  FP64>;
+def DXBR : BinaryRRE<"dxb", 0xB34D, fdiv, FP128, FP128>;
 
-def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load>;
-def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load>;
+def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>;
+def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>;
 
 //===----------------------------------------------------------------------===//
 // Comparisons
 //===----------------------------------------------------------------------===//
 
-let Defs = [PSW] in {
-  def CEBR : CompareRRE<"cebr", 0xB309, z_cmp, FP32,  FP32>;
-  def CDBR : CompareRRE<"cdbr", 0xB319, z_cmp, FP64,  FP64>;
-  def CXBR : CompareRRE<"cxbr", 0xB349, z_cmp, FP128, FP128>;
+let Defs = [CC], CCValues = 0xF in {
+  def CEBR : CompareRRE<"ceb", 0xB309, z_fcmp, FP32,  FP32>;
+  def CDBR : CompareRRE<"cdb", 0xB319, z_fcmp, FP64,  FP64>;
+  def CXBR : CompareRRE<"cxb", 0xB349, z_fcmp, FP128, FP128>;
 
-  def CEB : CompareRXE<"ceb", 0xED09, z_cmp, FP32, load>;
-  def CDB : CompareRXE<"cdb", 0xED19, z_cmp, FP64, load>;
+  def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>;
+  def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index b32b7eb..a8efe16 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -21,12 +21,24 @@ class InstSystemZ<int size, dag outs, dag ins, string asmstr,
   let Pattern = pattern;
   let AsmString = asmstr;
 
-  // Used to identify a group of related instructions, such as ST and STY.
-  string Function = "";
-
-  // "12" for an instruction that has a ...Y equivalent, "20" for that
-  // ...Y equivalent.
-  string PairType = "none";
+  // Some instructions come in pairs, one having a 12-bit displacement
+  // and the other having a 20-bit displacement.  Both instructions in
+  // the pair have the same DispKey and their DispSizes are "12" and "20"
+  // respectively.
+  string DispKey = "";
+  string DispSize = "none";
+
+  // Many register-based <INSN>R instructions have a memory-based <INSN>
+  // counterpart.  OpKey uniquely identifies <INSN>, while OpType is
+  // "reg" for <INSN>R and "mem" for <INSN>.
+  string OpKey = "";
+  string OpType = "none";
+
+  // Many distinct-operands instructions have older 2-operand equivalents.
+  // NumOpsKey uniquely identifies one of these 2-operand and 3-operand pairs,
+  // with NumOpsValue being "2" or "3" as appropriate.
+  string NumOpsKey = "";
+  string NumOpsValue = "none";
 
   // True if this instruction is a simple D(X,B) load of a register
   // (with no sign or zero extension).
@@ -46,11 +58,40 @@ class InstSystemZ<int size, dag outs, dag ins, string asmstr,
   // operations.
   bit Is128Bit = 0;
 
-  let TSFlags{0} = SimpleBDXLoad;
-  let TSFlags{1} = SimpleBDXStore;
-  let TSFlags{2} = Has20BitOffset;
-  let TSFlags{3} = HasIndex;
-  let TSFlags{4} = Is128Bit;
+  // The access size of all memory operands in bytes, or 0 if not known.
+  bits<5> AccessBytes = 0;
+
+  // If the instruction sets CC to a useful value, this gives the mask
+  // of all possible CC results.  The mask has the same form as
+  // SystemZ::CCMASK_*.
+  bits<4> CCValues = 0;
+
+  // The subset of CCValues that have the same meaning as they would after
+  // a comparison of the first operand against zero.
+  bits<4> CompareZeroCCMask = 0;
+
+  // True if the instruction is conditional and if the CC mask operand
+  // comes first (as for BRC, etc.).
+  bit CCMaskFirst = 0;
+
+  // Similar, but true if the CC mask operand comes last (as for LOC, etc.).
+  bit CCMaskLast = 0;
+
+  // True if the instruction is the "logical" rather than "arithmetic" form,
+  // in cases where a distinction exists.
+  bit IsLogical = 0;
+
+  let TSFlags{0}     = SimpleBDXLoad;
+  let TSFlags{1}     = SimpleBDXStore;
+  let TSFlags{2}     = Has20BitOffset;
+  let TSFlags{3}     = HasIndex;
+  let TSFlags{4}     = Is128Bit;
+  let TSFlags{9-5}   = AccessBytes;
+  let TSFlags{13-10} = CCValues;
+  let TSFlags{17-14} = CompareZeroCCMask;
+  let TSFlags{18}    = CCMaskFirst;
+  let TSFlags{19}    = CCMaskLast;
+  let TSFlags{20}    = IsLogical;
 }
 
 //===----------------------------------------------------------------------===//
@@ -61,8 +102,8 @@ class InstSystemZ<int size, dag outs, dag ins, string asmstr,
 // displacement.
 def getDisp12Opcode : InstrMapping {
   let FilterClass = "InstSystemZ";
-  let RowFields = ["Function"];
-  let ColFields = ["PairType"];
+  let RowFields = ["DispKey"];
+  let ColFields = ["DispSize"];
   let KeyCol = ["20"];
   let ValueCols = [["12"]];
 }
@@ -70,37 +111,54 @@ def getDisp12Opcode : InstrMapping {
 // Return the version of an instruction that has a signed 20-bit displacement.
 def getDisp20Opcode : InstrMapping {
   let FilterClass = "InstSystemZ";
-  let RowFields = ["Function"];
-  let ColFields = ["PairType"];
+  let RowFields = ["DispKey"];
+  let ColFields = ["DispSize"];
   let KeyCol = ["12"];
   let ValueCols = [["20"]];
 }
 
+// Return the memory form of a register instruction.
+def getMemOpcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["OpKey"];
+  let ColFields = ["OpType"];
+  let KeyCol = ["reg"];
+  let ValueCols = [["mem"]];
+}
+
+// Return the 3-operand form of a 2-operand instruction.
+def getThreeOperandOpcode : InstrMapping {
+  let FilterClass = "InstSystemZ";
+  let RowFields = ["NumOpsKey"];
+  let ColFields = ["NumOpsValue"];
+  let KeyCol = ["2"];
+  let ValueCols = [["3"]];
+}
+
 //===----------------------------------------------------------------------===//
 // Instruction formats
 //===----------------------------------------------------------------------===//
 //
 // Formats are specified using operand field declarations of the form:
 //
-//   bits<4> Rn : register input or output for operand n
-//   bits<m> In : immediate value of width m for operand n
-//   bits<4> Bn : base register for address operand n
-//   bits<m> Dn : displacement value of width m for address operand n
-//   bits<4> Xn : index register for address operand n
-//   bits<4> Mn : mode value for operand n
+//   bits<4> Rn   : register input or output for operand n
+//   bits<m> In   : immediate value of width m for operand n
+//   bits<4> BDn  : address operand n, which has a base and a displacement
+//   bits<m> XBDn : address operand n, which has an index, a base and a
+//                  displacement
+//   bits<4> Xn   : index register for address operand n
+//   bits<4> Mn   : mode value for operand n
 //
-// The operand numbers ("n" in the list above) follow the architecture manual,
-// but the fields are always declared in assembly order, so there are some
-// cases where operand "2" comes after operand "3".  For address operands,
-// the base register field is declared first, followed by the displacement,
-// followed by the index (if any).  This matches the bdaddr* and bdxaddr*
-// orders.
+// The operand numbers ("n" in the list above) follow the architecture manual.
+// Assembly operands sometimes have a different order; in particular, R3 often
+// is often written between operands 1 and 2.
 //
 //===----------------------------------------------------------------------===//
 
 class InstRI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
+  field bits<32> SoftFail = 0;
 
   bits<4> R1;
   bits<16> I2;
@@ -111,9 +169,64 @@ class InstRI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{15-0}  = I2;
 }
 
+class InstRIEb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> M3;
+  bits<16> RI4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R2;
+  let Inst{31-16} = RI4;
+  let Inst{15-12} = M3;
+  let Inst{11-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<8> I2;
+  bits<4> M3;
+  bits<16> RI4;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = M3;
+  let Inst{31-16} = RI4;
+  let Inst{15-8}  = I2;
+  let Inst{7-0}   = op{7-0};
+}
+
+class InstRIEd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R3;
+  bits<16> I2;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-36} = R1;
+  let Inst{35-32} = R3;
+  let Inst{31-16} = I2;
+  let Inst{15-8}  = 0;
+  let Inst{7-0}   = op{7-0};
+}
+
 class InstRIEf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R2;
@@ -133,6 +246,7 @@ class InstRIEf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRIL<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   bits<4> R1;
   bits<32> I2;
@@ -146,6 +260,7 @@ class InstRIL<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRR<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<2, outs, ins, asmstr, pattern> {
   field bits<16> Inst;
+  field bits<16> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R2;
@@ -158,6 +273,7 @@ class InstRR<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRRD<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
+  field bits<32> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R3;
@@ -173,6 +289,7 @@ class InstRRD<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRRE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
+  field bits<32> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R2;
@@ -186,14 +303,16 @@ class InstRRE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRRF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
+  field bits<32> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R2;
   bits<4> R3;
+  bits<4> R4;
 
   let Inst{31-16} = op;
   let Inst{15-12} = R3;
-  let Inst{11-8}  = 0;
+  let Inst{11-8}  = R4;
   let Inst{7-4}   = R1;
   let Inst{3-0}   = R2;
 }
@@ -201,17 +320,14 @@ class InstRRF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRX<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
+  field bits<32> SoftFail = 0;
 
   bits<4> R1;
-  bits<4> B2;
-  bits<12> D2;
-  bits<4> X2;
+  bits<20> XBD2;
 
   let Inst{31-24} = op;
   let Inst{23-20} = R1;
-  let Inst{19-16} = X2;
-  let Inst{15-12} = B2;
-  let Inst{11-0}  = D2;
+  let Inst{19-0}  = XBD2;
 
   let HasIndex = 1;
 }
@@ -219,17 +335,14 @@ class InstRX<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRXE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   bits<4> R1;
-  bits<4> B2;
-  bits<12> D2;
-  bits<4> X2;
+  bits<20> XBD2;
 
   let Inst{47-40} = op{15-8};
   let Inst{39-36} = R1;
-  let Inst{35-32} = X2;
-  let Inst{31-28} = B2;
-  let Inst{27-16} = D2;
+  let Inst{35-16} = XBD2;
   let Inst{15-8}  = 0;
   let Inst{7-0}   = op{7-0};
 
@@ -239,18 +352,15 @@ class InstRXE<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRXF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R3;
-  bits<4> B2;
-  bits<12> D2;
-  bits<4> X2;
+  bits<20> XBD2;
 
   let Inst{47-40} = op{15-8};
   let Inst{39-36} = R3;
-  let Inst{35-32} = X2;
-  let Inst{31-28} = B2;
-  let Inst{27-16} = D2;
+  let Inst{35-16} = XBD2;
   let Inst{15-12} = R1;
   let Inst{11-8}  = 0;
   let Inst{7-0}   = op{7-0};
@@ -261,18 +371,14 @@ class InstRXF<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRXY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   bits<4> R1;
-  bits<4> B2;
-  bits<20> D2;
-  bits<4> X2;
+  bits<28> XBD2;
 
   let Inst{47-40} = op{15-8};
   let Inst{39-36} = R1;
-  let Inst{35-32} = X2;
-  let Inst{31-28} = B2;
-  let Inst{27-16} = D2{11-0};
-  let Inst{15-8}  = D2{19-12};
+  let Inst{35-8}  = XBD2;
   let Inst{7-0}   = op{7-0};
 
   let Has20BitOffset = 1;
@@ -282,34 +388,31 @@ class InstRXY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstRS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
+  field bits<32> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R3;
-  bits<4> B2;
-  bits<12> D2;
+  bits<16> BD2;
 
   let Inst{31-24} = op;
   let Inst{23-20} = R1;
   let Inst{19-16} = R3;
-  let Inst{15-12} = B2;
-  let Inst{11-0}  = D2;
+  let Inst{15-0}  = BD2;
 }
 
 class InstRSY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
   bits<4> R1;
   bits<4> R3;
-  bits<4> B2;
-  bits<20> D2;
+  bits<24> BD2;
 
   let Inst{47-40} = op{15-8};
   let Inst{39-36} = R1;
   let Inst{35-32} = R3;
-  let Inst{31-28} = B2;
-  let Inst{27-16} = D2{11-0};
-  let Inst{15-8}  = D2{19-12};
+  let Inst{31-8}  = BD2;
   let Inst{7-0}   = op{7-0};
 
   let Has20BitOffset = 1;
@@ -318,60 +421,77 @@ class InstRSY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 class InstSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
+  field bits<32> SoftFail = 0;
 
-  bits<4> B1;
-  bits<12> D1;
+  bits<16> BD1;
   bits<8> I2;
 
   let Inst{31-24} = op;
   let Inst{23-16} = I2;
-  let Inst{15-12} = B1;
-  let Inst{11-0}  = D1;
+  let Inst{15-0}  = BD1;
 }
 
 class InstSIL<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
-  bits<4> B1;
-  bits<12> D1;
+  bits<16> BD1;
   bits<16> I2;
 
   let Inst{47-32} = op;
-  let Inst{31-28} = B1;
-  let Inst{27-16} = D1;
+  let Inst{31-16} = BD1;
   let Inst{15-0}  = I2;
 }
 
 class InstSIY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
+  field bits<48> SoftFail = 0;
 
-  bits<4> B1;
-  bits<20> D1;
+  bits<24> BD1;
   bits<8> I2;
 
   let Inst{47-40} = op{15-8};
   let Inst{39-32} = I2;
-  let Inst{31-28} = B1;
-  let Inst{27-16} = D1{11-0};
-  let Inst{15-8}  = D1{19-12};
+  let Inst{31-8}  = BD1;
   let Inst{7-0}   = op{7-0};
 
   let Has20BitOffset = 1;
 }
 
+class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<24> BDL1;
+  bits<16> BD2;
+
+  let Inst{47-40} = op;
+  let Inst{39-16} = BDL1;
+  let Inst{15-0}  = BD2;
+}
+
 //===----------------------------------------------------------------------===//
 // Instruction definitions with semantics
 //===----------------------------------------------------------------------===//
 //
-// These classes have the form <Category><Format>, where <Format> is one
+// These classes have the form [Cond]<Category><Format>, where <Format> is one
 // of the formats defined above and where <Category> describes the inputs
-// and outputs.  <Category> can be one of:
+// and outputs.  "Cond" is used if the instruction is conditional,
+// in which case the 4-bit condition-code mask is added as a final operand.
+// <Category> can be one of:
 //
 //   Inherent:
 //     One register output operand and no input operands.
 //
+//   BranchUnary:
+//     One register output operand, one register input operand and
+//     one branch displacement.  The instructions stores a modified
+//     form of the source register in the destination register and
+//     branches on the result.
+//
 //   Store:
 //     One register or immediate input operand and one address input operand.
 //     The instruction stores the first operand to the address.
@@ -420,6 +540,10 @@ class InstSIY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 //     One output operand and five input operands.  The first two operands
 //     are registers and the other three are immediates.
 //
+//   Prefetch:
+//     One 4-bit immediate operand and one address operand.  The immediate
+//     operand is 1 for a load prefetch and 2 for a store prefetch.
+//
 // The format determines which input operands are tied to output operands,
 // and also determines the shape of any address operand.
 //
@@ -432,23 +556,32 @@ class InstSIY<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 
 class InherentRRE<string mnemonic, bits<16> opcode, RegisterOperand cls,
                   dag src>
-  : InstRRE<opcode, (outs cls:$dst), (ins),
-            mnemonic#"\t$dst",
-            [(set cls:$dst, src)]> {
+  : InstRRE<opcode, (outs cls:$R1), (ins),
+            mnemonic#"\t$R1",
+            [(set cls:$R1, src)]> {
   let R2 = 0;
 }
 
+class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
+  : InstRI<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$I2),
+           mnemonic##"\t$R1, $I2", []> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
 class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
-  : InstRSY<opcode, (outs cls:$dst1, cls:$dst2), (ins bdaddr20only:$addr),
-            mnemonic#"\t$dst1, $dst2, $addr", []> {
+  : InstRSY<opcode, (outs cls:$R1, cls:$R3), (ins bdaddr20only:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2", []> {
   let mayLoad = 1;
 }
 
 class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                  RegisterOperand cls>
-  : InstRIL<opcode, (outs), (ins cls:$src, pcrel32:$addr),
-            mnemonic#"\t$src, $addr",
-            [(operator cls:$src, pcrel32:$addr)]> {
+  : InstRIL<opcode, (outs), (ins cls:$R1, pcrel32:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(operator cls:$R1, pcrel32:$I2)]> {
   let mayStore = 1;
   // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
   // However, BDXs have two extra operands and are therefore 6 units more
@@ -457,105 +590,206 @@ class StoreRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
 }
 
 class StoreRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
-              RegisterOperand cls, AddressingMode mode = bdxaddr12only>
-  : InstRX<opcode, (outs), (ins cls:$src, mode:$addr),
-           mnemonic#"\t$src, $addr",
-           [(operator cls:$src, mode:$addr)]> {
+              RegisterOperand cls, bits<5> bytes,
+              AddressingMode mode = bdxaddr12only>
+  : InstRX<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(operator cls:$R1, mode:$XBD2)]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
   let mayStore = 1;
+  let AccessBytes = bytes;
 }
 
 class StoreRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-               RegisterOperand cls, AddressingMode mode = bdxaddr20only>
-  : InstRXY<opcode, (outs), (ins cls:$src, mode:$addr),
-            mnemonic#"\t$src, $addr",
-            [(operator cls:$src, mode:$addr)]> {
+               RegisterOperand cls, bits<5> bytes,
+               AddressingMode mode = bdxaddr20only>
+  : InstRXY<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, mode:$XBD2)]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
   let mayStore = 1;
+  let AccessBytes = bytes;
 }
 
 multiclass StoreRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
-                       SDPatternOperator operator, RegisterOperand cls> {
-  let Function = mnemonic ## #cls in {
-    let PairType = "12" in
-      def "" : StoreRX<mnemonic, rxOpcode, operator, cls, bdxaddr12pair>;
-    let PairType = "20" in
-      def Y  : StoreRXY<mnemonic#"y", rxyOpcode, operator, cls, bdxaddr20pair>;
+                       SDPatternOperator operator, RegisterOperand cls,
+                       bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : StoreRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : StoreRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+                        bdxaddr20pair>;
   }
 }
 
 class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
-  : InstRSY<opcode, (outs), (ins cls:$from, cls:$to, bdaddr20only:$addr),
-            mnemonic#"\t$from, $to, $addr", []> {
+  : InstRSY<opcode, (outs), (ins cls:$R1, cls:$R3, bdaddr20only:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2", []> {
   let mayStore = 1;
 }
 
+// StoreSI* instructions are used to store an integer to memory, but the
+// addresses are more restricted than for normal stores.  If we are in the
+// situation of having to force either the address into a register or the
+// constant into a register, it's usually better to do the latter.
+// We therefore match the address in the same way as a normal store and
+// only use the StoreSI* instruction if the matched address is suitable.
 class StoreSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
-              Immediate imm, AddressingMode mode = bdaddr12only>
-  : InstSI<opcode, (outs), (ins mode:$addr, imm:$src),
-           mnemonic#"\t$addr, $src",
-           [(operator imm:$src, mode:$addr)]> {
+              Immediate imm>
+  : InstSI<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(operator imm:$I2, mviaddr12pair:$BD1)]> {
   let mayStore = 1;
 }
 
 class StoreSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-               Immediate imm, AddressingMode mode = bdaddr20only>
-  : InstSIY<opcode, (outs), (ins mode:$addr, imm:$src),
-            mnemonic#"\t$addr, $src",
-            [(operator imm:$src, mode:$addr)]> {
+               Immediate imm>
+  : InstSIY<opcode, (outs), (ins mviaddr20pair:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator imm:$I2, mviaddr20pair:$BD1)]> {
   let mayStore = 1;
 }
 
 class StoreSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                Immediate imm>
-  : InstSIL<opcode, (outs), (ins bdaddr12only:$addr, imm:$src),
-            mnemonic#"\t$addr, $src",
-            [(operator imm:$src, bdaddr12only:$addr)]> {
+  : InstSIL<opcode, (outs), (ins mviaddr12pair:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator imm:$I2, mviaddr12pair:$BD1)]> {
   let mayStore = 1;
 }
 
 multiclass StoreSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
                        SDPatternOperator operator, Immediate imm> {
-  let Function = mnemonic in {
-    let PairType = "12" in
-      def "" : StoreSI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
-    let PairType = "20" in
-      def Y  : StoreSIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
+  let DispKey = mnemonic in {
+    let DispSize = "12" in
+      def "" : StoreSI<mnemonic, siOpcode, operator, imm>;
+    let DispSize = "20" in
+      def Y  : StoreSIY<mnemonic#"y", siyOpcode, operator, imm>;
   }
 }
 
+class CondStoreRSY<string mnemonic, bits<16> opcode,
+                   RegisterOperand cls, bits<5> bytes,
+                   AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, cond4:$valid, cond4:$R3),
+            mnemonic#"$R3\t$R1, $BD2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like CondStoreRSY, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondStoreRSY<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls, bits<5> bytes,
+                      AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2, uimm8zx4:$R3),
+            mnemonic#"\t$R1, $BD2, $R3", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+}
+
+// Like CondStoreRSY, but with a fixed CC mask.
+class FixedCondStoreRSY<string mnemonic, bits<16> opcode,
+                        RegisterOperand cls, bits<4> ccmask, bits<5> bytes,
+                        AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs), (ins cls:$R1, mode:$BD2),
+            mnemonic#"\t$R1, $BD2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayStore = 1;
+  let AccessBytes = bytes;
+  let R3 = ccmask;
+}
+
 class UnaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
               RegisterOperand cls1, RegisterOperand cls2>
-  : InstRR<opcode, (outs cls1:$dst), (ins cls2:$src),
-           mnemonic#"\t$dst, $src",
-           [(set cls1:$dst, (operator cls2:$src))]>;
+  : InstRR<opcode, (outs cls1:$R1), (ins cls2:$R2),
+           mnemonic#"r\t$R1, $R2",
+           [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+}
 
 class UnaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                RegisterOperand cls1, RegisterOperand cls2>
-  : InstRRE<opcode, (outs cls1:$dst), (ins cls2:$src),
-            mnemonic#"\t$dst, $src",
-            [(set cls1:$dst, (operator cls2:$src))]>;
+  : InstRRE<opcode, (outs cls1:$R1), (ins cls2:$R2),
+            mnemonic#"r\t$R1, $R2",
+            [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+}
 
 class UnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
                RegisterOperand cls2>
-  : InstRRF<opcode, (outs cls1:$dst), (ins cls2:$src, uimm8zx4:$mode),
-            mnemonic#"\t$dst, $mode, $src", []>;
+  : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2),
+            mnemonic#"r\t$R1, $R3, $R2", []> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let R4 = 0;
+}
+
+class UnaryRRF4<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins uimm8zx4:$R3, cls2:$R2, uimm8zx4:$R4),
+            mnemonic#"\t$R1, $R3, $R2, $R4", []>;
+
+// These instructions are generated by if conversion.  The old value of R1
+// is added as an implicit use.
+class CondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                   RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls2:$R2, cond4:$valid, cond4:$R3),
+            mnemonic#"r$R3\t$R1, $R2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let CCMaskLast = 1;
+  let R4 = 0;
+}
+
+// Like CondUnaryRRF, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                      RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2, uimm8zx4:$R3),
+            mnemonic#"r\t$R1, $R2, $R3", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let R4 = 0;
+}
+
+// Like CondUnaryRRF, but with a fixed CC mask.
+class FixedCondUnaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                        RegisterOperand cls2, bits<4> ccmask>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+            mnemonic#"\t$R1, $R2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let R3 = ccmask;
+  let R4 = 0;
+}
 
 class UnaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
               RegisterOperand cls, Immediate imm>
-  : InstRI<opcode, (outs cls:$dst), (ins imm:$src),
-           mnemonic#"\t$dst, $src",
-           [(set cls:$dst, (operator imm:$src))]>;
+  : InstRI<opcode, (outs cls:$R1), (ins imm:$I2),
+           mnemonic#"\t$R1, $I2",
+           [(set cls:$R1, (operator imm:$I2))]>;
 
 class UnaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                RegisterOperand cls, Immediate imm>
-  : InstRIL<opcode, (outs cls:$dst), (ins imm:$src),
-            mnemonic#"\t$dst, $src",
-            [(set cls:$dst, (operator imm:$src))]>;
+  : InstRIL<opcode, (outs cls:$R1), (ins imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator imm:$I2))]>;
 
 class UnaryRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                  RegisterOperand cls>
-  : InstRIL<opcode, (outs cls:$dst), (ins pcrel32:$addr),
-            mnemonic#"\t$dst, $addr",
-            [(set cls:$dst, (operator pcrel32:$addr))]> {
+  : InstRIL<opcode, (outs cls:$R1), (ins pcrel32:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator pcrel32:$I2))]> {
   let mayLoad = 1;
   // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
   // However, BDXs have two extra operands and are therefore 6 units more
@@ -563,148 +797,267 @@ class UnaryRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
   let AddedComplexity = 7;
 }
 
+class CondUnaryRSY<string mnemonic, bits<16> opcode,
+                   SDPatternOperator operator, RegisterOperand cls,
+                   bits<5> bytes, AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1),
+            (ins cls:$R1src, mode:$BD2, cond4:$valid, cond4:$R3),
+            mnemonic#"$R3\t$R1, $BD2",
+            [(set cls:$R1,
+                  (z_select_ccmask (load bdaddr20only:$BD2), cls:$R1src,
+                                   cond4:$valid, cond4:$R3))]>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let CCMaskLast = 1;
+}
+
+// Like CondUnaryRSY, but used for the raw assembly form.  The condition-code
+// mask is the third operand rather than being part of the mnemonic.
+class AsmCondUnaryRSY<string mnemonic, bits<16> opcode,
+                      RegisterOperand cls, bits<5> bytes,
+                      AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2, uimm8zx4:$R3),
+            mnemonic#"\t$R1, $BD2, $R3", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+// Like CondUnaryRSY, but with a fixed CC mask.
+class FixedCondUnaryRSY<string mnemonic, bits<16> opcode,
+                        RegisterOperand cls, bits<4> ccmask, bits<5> bytes,
+                        AddressingMode mode = bdaddr20only>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$BD2),
+            mnemonic#"\t$R1, $BD2", []>,
+    Requires<[FeatureLoadStoreOnCond]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+  let R3 = ccmask;
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
 class UnaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
-              RegisterOperand cls, AddressingMode mode = bdxaddr12only>
-  : InstRX<opcode, (outs cls:$dst), (ins mode:$addr),
-           mnemonic#"\t$dst, $addr",
-           [(set cls:$dst, (operator mode:$addr))]> {
+              RegisterOperand cls, bits<5> bytes,
+              AddressingMode mode = bdxaddr12only>
+  : InstRX<opcode, (outs cls:$R1), (ins mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 class UnaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-               RegisterOperand cls>
-  : InstRXE<opcode, (outs cls:$dst), (ins bdxaddr12only:$addr),
-            mnemonic#"\t$dst, $addr",
-            [(set cls:$dst, (operator bdxaddr12only:$addr))]> {
+               RegisterOperand cls, bits<5> bytes>
+  : InstRXE<opcode, (outs cls:$R1), (ins bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator bdxaddr12only:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 class UnaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-               RegisterOperand cls, AddressingMode mode = bdxaddr20only>
-  : InstRXY<opcode, (outs cls:$dst), (ins mode:$addr),
-            mnemonic#"\t$dst, $addr",
-            [(set cls:$dst, (operator mode:$addr))]> {
+               RegisterOperand cls, bits<5> bytes,
+               AddressingMode mode = bdxaddr20only>
+  : InstRXY<opcode, (outs cls:$R1), (ins mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
-                       SDPatternOperator operator, RegisterOperand cls> {
-  let Function = mnemonic ## #cls in {
-    let PairType = "12" in
-      def "" : UnaryRX<mnemonic, rxOpcode, operator, cls, bdxaddr12pair>;
-    let PairType = "20" in
-      def Y  : UnaryRXY<mnemonic#"y", rxyOpcode, operator, cls, bdxaddr20pair>;
+                       SDPatternOperator operator, RegisterOperand cls,
+                       bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : UnaryRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : UnaryRXY<mnemonic#"y", rxyOpcode, operator, cls, bytes,
+                        bdxaddr20pair>;
   }
 }
 
 class BinaryRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                RegisterOperand cls1, RegisterOperand cls2>
-  : InstRR<opcode, (outs cls1:$dst), (ins cls1:$src1, cls2:$src2),
-           mnemonic#"\t$dst, $src2",
-           [(set cls1:$dst, (operator cls1:$src1, cls2:$src2))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRR<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+           mnemonic#"r\t$R1, $R2",
+           [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
 }
 
 class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                 RegisterOperand cls1, RegisterOperand cls2>
-  : InstRRE<opcode, (outs cls1:$dst), (ins cls1:$src1, cls2:$src2),
-            mnemonic#"\t$dst, $src2",
-            [(set cls1:$dst, (operator cls1:$src1, cls2:$src2))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRRE<opcode, (outs cls1:$R1), (ins cls1:$R1src, cls2:$R2),
+            mnemonic#"r\t$R1, $R2",
+            [(set cls1:$R1, (operator cls1:$R1src, cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
 }
 
-// Here the assembly and dag operands are in natural order,
-// but the first input operand maps to R3 and the second to R2.
-// This is used for "CPSDR R1, R3, R2", which is equivalent to
-// R1 = copysign (R3, R2).
-//
-// Direct uses of the instruction must pass operands in encoding order --
-// R1, R2, R3 -- so they must pass the source operands in reverse order.
-class BinaryRevRRF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-                   RegisterOperand cls1, RegisterOperand cls2>
-  : InstRRF<opcode, (outs cls1:$dst), (ins cls2:$src2, cls1:$src1),
-            mnemonic#"\t$dst, $src1, $src2",
-            [(set cls1:$dst, (operator cls1:$src1, cls2:$src2))]>;
+class BinaryRRF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R3, cls2:$R2),
+            mnemonic#"r\t$R1, $R3, $R2",
+            [(set cls1:$R1, (operator cls1:$R3, cls2:$R2))]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let R4 = 0;
+}
+
+class BinaryRRFK<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRF<opcode, (outs cls1:$R1), (ins cls1:$R2, cls2:$R3),
+            mnemonic#"rk\t$R1, $R2, $R3",
+            [(set cls1:$R1, (operator cls1:$R2, cls2:$R3))]> {
+  let R4 = 0;
+}
+
+multiclass BinaryRRAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls1,
+                        RegisterOperand cls2> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRRFK<mnemonic, opcode2, null_frag, cls1, cls2>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRR<mnemonic, opcode1, operator, cls1, cls2>;
+  }
+}
+
+multiclass BinaryRREAndK<string mnemonic, bits<16> opcode1, bits<16> opcode2,
+                         SDPatternOperator operator, RegisterOperand cls1,
+                         RegisterOperand cls2> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRRFK<mnemonic, opcode2, null_frag, cls1, cls2>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRRE<mnemonic, opcode1, operator, cls1, cls2>;
+  }
+}
 
 class BinaryRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                RegisterOperand cls, Immediate imm>
-  : InstRI<opcode, (outs cls:$dst), (ins cls:$src1, imm:$src2),
-           mnemonic#"\t$dst, $src2",
-           [(set cls:$dst, (operator cls:$src1, imm:$src2))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRI<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+           mnemonic#"\t$R1, $I2",
+           [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class BinaryRIE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls, Immediate imm>
+  : InstRIEd<opcode, (outs cls:$R1), (ins cls:$R3, imm:$I2),
+             mnemonic#"\t$R1, $R3, $I2",
+             [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
+                        SDPatternOperator operator, RegisterOperand cls,
+                        Immediate imm> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K : BinaryRIE<mnemonic##"k", opcode2, null_frag, cls, imm>,
+              Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRI<mnemonic, opcode1, operator, cls, imm>;
+  }
 }
 
 class BinaryRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                 RegisterOperand cls, Immediate imm>
-  : InstRIL<opcode, (outs cls:$dst), (ins cls:$src1, imm:$src2),
-            mnemonic#"\t$dst, $src2",
-            [(set cls:$dst, (operator cls:$src1, imm:$src2))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRIL<opcode, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
 }
 
 class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
-               RegisterOperand cls, SDPatternOperator load,
+               RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
                AddressingMode mode = bdxaddr12only>
-  : InstRX<opcode, (outs cls:$dst), (ins cls:$src1, mode:$src2),
-           mnemonic#"\t$dst, $src2",
-           [(set cls:$dst, (operator cls:$src1, (load mode:$src2)))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRX<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-                  RegisterOperand cls, SDPatternOperator load>
-  : InstRXE<opcode, (outs cls:$dst), (ins cls:$src1, bdxaddr12only:$src2),
-            mnemonic#"\t$dst, $src2",
-            [(set cls:$dst, (operator cls:$src1,
-                                      (load bdxaddr12only:$src2)))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+                  RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src,
+                                     (load bdxaddr12only:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-                RegisterOperand cls, SDPatternOperator load,
+                RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
                 AddressingMode mode = bdxaddr20only>
-  : InstRXY<opcode, (outs cls:$dst), (ins cls:$src1, mode:$src2),
-            mnemonic#"\t$dst, $src2",
-            [(set cls:$dst, (operator cls:$src1, (load mode:$src2)))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRXY<opcode, (outs cls:$R1), (ins cls:$R1src, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src, (load mode:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 multiclass BinaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
                         SDPatternOperator operator, RegisterOperand cls,
-                        SDPatternOperator load> {
-  let Function = mnemonic ## #cls in {
-    let PairType = "12" in
-      def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bdxaddr12pair>;
-    let PairType = "20" in
-      def Y  : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load,
+                        SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
+      def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
+                        bdxaddr12pair>;
+    let DispSize = "20" in
+      def Y  : BinaryRXY<mnemonic#"y", rxyOpcode, operator, cls, load, bytes,
                          bdxaddr20pair>;
   }
 }
 
 class BinarySI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                Operand imm, AddressingMode mode = bdaddr12only>
-  : InstSI<opcode, (outs), (ins mode:$addr, imm:$src),
-           mnemonic#"\t$addr, $src",
-           [(store (operator (load mode:$addr), imm:$src), mode:$addr)]> {
+  : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
   let mayLoad = 1;
   let mayStore = 1;
 }
 
 class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                 Operand imm, AddressingMode mode = bdaddr20only>
-  : InstSIY<opcode, (outs), (ins mode:$addr, imm:$src),
-            mnemonic#"\t$addr, $src",
-            [(store (operator (load mode:$addr), imm:$src), mode:$addr)]> {
+  : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(store (operator (load mode:$BD1), imm:$I2), mode:$BD1)]> {
   let mayLoad = 1;
   let mayStore = 1;
 }
@@ -712,59 +1065,83 @@ class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
 multiclass BinarySIPair<string mnemonic, bits<8> siOpcode,
                         bits<16> siyOpcode, SDPatternOperator operator,
                         Operand imm> {
-  let Function = mnemonic ## #cls in {
-    let PairType = "12" in
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
       def "" : BinarySI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
-    let PairType = "20" in
+    let DispSize = "20" in
       def Y  : BinarySIY<mnemonic#"y", siyOpcode, operator, imm, bdaddr20pair>;
   }
 }
 
 class ShiftRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
-              RegisterOperand cls, AddressingMode mode>
-  : InstRS<opcode, (outs cls:$dst), (ins cls:$src1, mode:$src2),
-           mnemonic#"\t$dst, $src2",
-           [(set cls:$dst, (operator cls:$src1, mode:$src2))]> {
+              RegisterOperand cls>
+  : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, shift12only:$BD2),
+           mnemonic#"\t$R1, $BD2",
+           [(set cls:$R1, (operator cls:$R1src, shift12only:$BD2))]> {
   let R3 = 0;
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
 }
 
 class ShiftRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-               RegisterOperand cls, AddressingMode mode>
-  : InstRSY<opcode, (outs cls:$dst), (ins cls:$src1, mode:$src2),
-            mnemonic#"\t$dst, $src1, $src2",
-            [(set cls:$dst, (operator cls:$src1, mode:$src2))]>;
+               RegisterOperand cls>
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R3, shift20only:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2",
+            [(set cls:$R1, (operator cls:$R3, shift20only:$BD2))]>;
+
+multiclass ShiftRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
+                       SDPatternOperator operator, RegisterOperand cls> {
+  let NumOpsKey = mnemonic in {
+    let NumOpsValue = "3" in
+      def K  : ShiftRSY<mnemonic##"k", opcode2, null_frag, cls>,
+               Requires<[FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : ShiftRS<mnemonic, opcode1, operator, cls>;
+  }
+}
 
 class CompareRR<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                 RegisterOperand cls1, RegisterOperand cls2>
-  : InstRR<opcode, (outs), (ins cls1:$src1, cls2:$src2),
-           mnemonic#"\t$src1, $src2",
-           [(operator cls1:$src1, cls2:$src2)]>;
+  : InstRR<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+           mnemonic#"r\t$R1, $R2",
+           [(operator cls1:$R1, cls2:$R2)]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let isCompare = 1;
+}
 
 class CompareRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                  RegisterOperand cls1, RegisterOperand cls2>
-  : InstRRE<opcode, (outs), (ins cls1:$src1, cls2:$src2),
-            mnemonic#"\t$src1, $src2",
-            [(operator cls1:$src1, cls2:$src2)]>;
+  : InstRRE<opcode, (outs), (ins cls1:$R1, cls2:$R2),
+            mnemonic#"r\t$R1, $R2",
+            [(operator cls1:$R1, cls2:$R2)]> {
+  let OpKey = mnemonic ## cls1;
+  let OpType = "reg";
+  let isCompare = 1;
+}
 
 class CompareRI<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                 RegisterOperand cls, Immediate imm>
-  : InstRI<opcode, (outs), (ins cls:$src1, imm:$src2),
-           mnemonic#"\t$src1, $src2",
-           [(operator cls:$src1, imm:$src2)]>;
+  : InstRI<opcode, (outs), (ins cls:$R1, imm:$I2),
+           mnemonic#"\t$R1, $I2",
+           [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
 
 class CompareRIL<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                  RegisterOperand cls, Immediate imm>
-  : InstRIL<opcode, (outs), (ins cls:$src1, imm:$src2),
-            mnemonic#"\t$src1, $src2",
-            [(operator cls:$src1, imm:$src2)]>;
+  : InstRIL<opcode, (outs), (ins cls:$R1, imm:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
 
 class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
                    RegisterOperand cls, SDPatternOperator load>
-  : InstRIL<opcode, (outs), (ins cls:$src1, pcrel32:$src2),
-            mnemonic#"\t$src1, $src2",
-            [(operator cls:$src1, (load pcrel32:$src2))]> {
+  : InstRIL<opcode, (outs), (ins cls:$R1, pcrel32:$I2),
+            mnemonic#"\t$R1, $I2",
+            [(operator cls:$R1, (load pcrel32:$I2))]> {
+  let isCompare = 1;
   let mayLoad = 1;
   // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
   // However, BDXs have two extra operands and are therefore 6 units more
@@ -773,77 +1150,92 @@ class CompareRILPC<string mnemonic, bits<12> opcode, SDPatternOperator operator,
 }
 
 class CompareRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
-                RegisterOperand cls, SDPatternOperator load,
+                RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
                 AddressingMode mode = bdxaddr12only>
-  : InstRX<opcode, (outs), (ins cls:$src1, mode:$src2),
-           mnemonic#"\t$src1, $src2",
-           [(operator cls:$src1, (load mode:$src2))]> {
+  : InstRX<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+           mnemonic#"\t$R1, $XBD2",
+           [(operator cls:$R1, (load mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let isCompare = 1;
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 class CompareRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-                 RegisterOperand cls, SDPatternOperator load>
-  : InstRXE<opcode, (outs), (ins cls:$src1, bdxaddr12only:$src2),
-            mnemonic#"\t$src1, $src2",
-            [(operator cls:$src1, (load bdxaddr12only:$src2))]> {
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, (load bdxaddr12only:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let isCompare = 1;
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-                 RegisterOperand cls, SDPatternOperator load,
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
                  AddressingMode mode = bdxaddr20only>
-  : InstRXY<opcode, (outs), (ins cls:$src1, mode:$src2),
-            mnemonic#"\t$src1, $src2",
-            [(operator cls:$src1, (load mode:$src2))]> {
+  : InstRXY<opcode, (outs), (ins cls:$R1, mode:$XBD2),
+            mnemonic#"\t$R1, $XBD2",
+            [(operator cls:$R1, (load mode:$XBD2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let isCompare = 1;
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
                          SDPatternOperator operator, RegisterOperand cls,
-                         SDPatternOperator load> {
-  let Function = mnemonic ## #cls in {
-    let PairType = "12" in
+                         SDPatternOperator load, bits<5> bytes> {
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
       def "" : CompareRX<mnemonic, rxOpcode, operator, cls,
-                         load, bdxaddr12pair>;
-    let PairType = "20" in
+                         load, bytes, bdxaddr12pair>;
+    let DispSize = "20" in
       def Y  : CompareRXY<mnemonic#"y", rxyOpcode, operator, cls,
-                          load, bdxaddr20pair>;
+                          load, bytes, bdxaddr20pair>;
   }
 }
 
 class CompareSI<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                 SDPatternOperator load, Immediate imm,
                 AddressingMode mode = bdaddr12only>
-  : InstSI<opcode, (outs), (ins mode:$addr, imm:$src),
-           mnemonic#"\t$addr, $src",
-           [(operator (load mode:$addr), imm:$src)]> {
+  : InstSI<opcode, (outs), (ins mode:$BD1, imm:$I2),
+           mnemonic#"\t$BD1, $I2",
+           [(operator (load mode:$BD1), imm:$I2)]> {
+  let isCompare = 1;
   let mayLoad = 1;
 }
 
 class CompareSIL<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                  SDPatternOperator load, Immediate imm>
-  : InstSIL<opcode, (outs), (ins bdaddr12only:$addr, imm:$src),
-            mnemonic#"\t$addr, $src",
-            [(operator (load bdaddr12only:$addr), imm:$src)]> {
+  : InstSIL<opcode, (outs), (ins bdaddr12only:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator (load bdaddr12only:$BD1), imm:$I2)]> {
+  let isCompare = 1;
   let mayLoad = 1;
 }
 
 class CompareSIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                  SDPatternOperator load, Immediate imm,
                  AddressingMode mode = bdaddr20only>
-  : InstSIY<opcode, (outs), (ins mode:$addr, imm:$src),
-            mnemonic#"\t$addr, $src",
-            [(operator (load mode:$addr), imm:$src)]> {
+  : InstSIY<opcode, (outs), (ins mode:$BD1, imm:$I2),
+            mnemonic#"\t$BD1, $I2",
+            [(operator (load mode:$BD1), imm:$I2)]> {
+  let isCompare = 1;
   let mayLoad = 1;
 }
 
 multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
                          SDPatternOperator operator, SDPatternOperator load,
                          Immediate imm> {
-  let Function = mnemonic in {
-    let PairType = "12" in
+  let DispKey = mnemonic in {
+    let DispSize = "12" in
       def "" : CompareSI<mnemonic, siOpcode, operator, load, imm, bdaddr12pair>;
-    let PairType = "20" in
+    let DispSize = "20" in
       def Y  : CompareSIY<mnemonic#"y", siyOpcode, operator, load, imm,
                           bdaddr20pair>;
   }
@@ -851,65 +1243,94 @@ multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
 
 class TernaryRRD<string mnemonic, bits<16> opcode,
                  SDPatternOperator operator, RegisterOperand cls>
-  : InstRRD<opcode, (outs cls:$dst), (ins cls:$src1, cls:$src2, cls:$src3),
-            mnemonic#"\t$dst, $src2, $src3",
-            [(set cls:$dst, (operator cls:$src1, cls:$src2, cls:$src3))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRRD<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, cls:$R2),
+            mnemonic#"r\t$R1, $R3, $R2",
+            [(set cls:$R1, (operator cls:$R1src, cls:$R3, cls:$R2))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "reg";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
 }
 
 class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-                 RegisterOperand cls, SDPatternOperator load>
-  : InstRXF<opcode, (outs cls:$dst),
-            (ins cls:$src1, cls:$src2, bdxaddr12only:$src3),
-            mnemonic#"\t$dst, $src2, $src3",
-            [(set cls:$dst, (operator cls:$src1, cls:$src2,
-                                      (load bdxaddr12only:$src3)))]> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
+  : InstRXF<opcode, (outs cls:$R1),
+            (ins cls:$R1src, cls:$R3, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $R3, $XBD2",
+            [(set cls:$R1, (operator cls:$R1src, cls:$R3,
+                                     (load bdxaddr12only:$XBD2)))]> {
+  let OpKey = mnemonic ## cls;
+  let OpType = "mem";
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
   let mayLoad = 1;
+  let AccessBytes = bytes;
 }
 
 class CmpSwapRS<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                 RegisterOperand cls, AddressingMode mode = bdaddr12only>
-  : InstRS<opcode, (outs cls:$dst), (ins cls:$old, cls:$new, mode:$ptr),
-           mnemonic#"\t$dst, $new, $ptr",
-           [(set cls:$dst, (operator mode:$ptr, cls:$old, cls:$new))]> {
-  let Constraints = "$old = $dst";
-  let DisableEncoding = "$old";
+  : InstRS<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+           mnemonic#"\t$R1, $R3, $BD2",
+           [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let mayStore = 1;
 }
 
 class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                  RegisterOperand cls, AddressingMode mode = bdaddr20only>
-  : InstRSY<opcode, (outs cls:$dst), (ins cls:$old, cls:$new, mode:$ptr),
-            mnemonic#"\t$dst, $new, $ptr",
-            [(set cls:$dst, (operator mode:$ptr, cls:$old, cls:$new))]> {
-  let Constraints = "$old = $dst";
-  let DisableEncoding = "$old";
+  : InstRSY<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, mode:$BD2),
+            mnemonic#"\t$R1, $R3, $BD2",
+            [(set cls:$R1, (operator mode:$BD2, cls:$R1src, cls:$R3))]> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
   let mayLoad = 1;
   let mayStore = 1;
 }
 
 multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
                          SDPatternOperator operator, RegisterOperand cls> {
-  let Function = mnemonic ## #cls in {
-    let PairType = "12" in
+  let DispKey = mnemonic ## #cls in {
+    let DispSize = "12" in
       def "" : CmpSwapRS<mnemonic, rsOpcode, operator, cls, bdaddr12pair>;
-    let PairType = "20" in
+    let DispSize = "20" in
       def Y  : CmpSwapRSY<mnemonic#"y", rsyOpcode, operator, cls, bdaddr20pair>;
   }
 }
 
 class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
                        RegisterOperand cls2>
-  : InstRIEf<opcode, (outs cls1:$dst),
-             (ins cls1:$src1, cls2:$src2,
-                  uimm8zx6:$imm1, uimm8zx6:$imm2, uimm8zx6:$imm3),
-             mnemonic#"\t$dst, $src2, $imm1, $imm2, $imm3", []> {
-  let Constraints = "$src1 = $dst";
-  let DisableEncoding = "$src1";
+  : InstRIEf<opcode, (outs cls1:$R1),
+             (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5),
+             mnemonic#"\t$R1, $R2, $I3, $I4, $I5", []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
+class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
+  : InstRXY<opcode, (outs), (ins uimm8zx4:$R1, bdxaddr20only:$XBD2),
+            mnemonic##"\t$R1, $XBD2",
+            [(operator uimm8zx4:$R1, bdxaddr20only:$XBD2)]>;
+
+class PrefetchRILPC<string mnemonic, bits<12> opcode,
+                    SDPatternOperator operator>
+  : InstRIL<opcode, (outs), (ins uimm8zx4:$R1, pcrel32:$I2),
+            mnemonic##"\t$R1, $I2",
+            [(operator uimm8zx4:$R1, pcrel32:$I2)]> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+// A floating-point load-and test operation.  Create both a normal unary
+// operation and one that acts as a comparison against zero.
+multiclass LoadAndTestRRE<string mnemonic, bits<16> opcode,
+                          RegisterOperand cls> {
+  def "" : UnaryRRE<mnemonic, opcode, null_frag, cls, cls>;
+  let isCodeGenOnly = 1 in
+    def Compare : CompareRRE<mnemonic, opcode, null_frag, cls, cls>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -928,17 +1349,130 @@ class Pseudo<dag outs, dag ins, list<dag> pattern>
   let isCodeGenOnly = 1;
 }
 
+// Like UnaryRI, but expanded after RA depending on the choice of register.
+class UnaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins imm:$I2),
+           [(set cls:$R1, (operator imm:$I2))]>;
+
+// Like UnaryRXY, but expanded after RA depending on the choice of register.
+class UnaryRXYPseudo<string key, SDPatternOperator operator,
+                     RegisterOperand cls, bits<5> bytes,
+                     AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs cls:$R1), (ins mode:$XBD2),
+           [(set cls:$R1, (operator mode:$XBD2))]> {
+  let OpKey = key ## cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like UnaryRR, but expanded after RA depending on the choice of registers.
+class UnaryRRPseudo<string key, SDPatternOperator operator,
+                    RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1), (ins cls2:$R2),
+           [(set cls1:$R1, (operator cls2:$R2))]> {
+  let OpKey = key ## cls1;
+  let OpType = "reg";
+}
+
+// Like BinaryRI, but expanded after RA depending on the choice of register.
+class BinaryRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R1src, imm:$I2),
+           [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// Like BinaryRIE, but expanded after RA depending on the choice of register.
+class BinaryRIEPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs cls:$R1), (ins cls:$R3, imm:$I2),
+           [(set cls:$R1, (operator cls:$R3, imm:$I2))]>;
+
+// Like BinaryRIAndK, but expanded after RA depending on the choice of register.
+multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
+                              RegisterOperand cls, Immediate imm> {
+  let NumOpsKey = key in {
+    let NumOpsValue = "3" in
+      def K : BinaryRIEPseudo<null_frag, cls, imm>,
+              Requires<[FeatureHighWord, FeatureDistinctOps]>;
+    let NumOpsValue = "2", isConvertibleToThreeAddress = 1 in
+      def "" : BinaryRIPseudo<operator, cls, imm>,
+               Requires<[FeatureHighWord]>;
+  }
+}
+
+// Like CompareRI, but expanded after RA depending on the choice of register.
+class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
+                      Immediate imm>
+  : Pseudo<(outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]>;
+
+// Like CompareRXY, but expanded after RA depending on the choice of register.
+class CompareRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                       SDPatternOperator load, bits<5> bytes,
+                       AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, (load mode:$XBD2))]> {
+  let mayLoad = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like StoreRXY, but expanded after RA depending on the choice of register.
+class StoreRXYPseudo<SDPatternOperator operator, RegisterOperand cls,
+                     bits<5> bytes, AddressingMode mode = bdxaddr20only>
+  : Pseudo<(outs), (ins cls:$R1, mode:$XBD2),
+           [(operator cls:$R1, mode:$XBD2)]> {
+  let mayStore = 1;
+  let Has20BitOffset = 1;
+  let HasIndex = 1;
+  let AccessBytes = bytes;
+}
+
+// Like RotateSelectRIEf, but expanded after RA depending on the choice
+// of registers.
+class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2>
+  : Pseudo<(outs cls1:$R1),
+           (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5),
+           []> {
+  let Constraints = "$R1 = $R1src";
+  let DisableEncoding = "$R1src";
+}
+
 // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is
 // the value of the PSW's 2-bit condition code field.
 class SelectWrapper<RegisterOperand cls>
-  : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, i8imm:$cc),
-           [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, imm:$cc))]> {
+  : Pseudo<(outs cls:$dst),
+           (ins cls:$src1, cls:$src2, uimm8zx4:$valid, uimm8zx4:$cc),
+           [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2,
+                                            uimm8zx4:$valid, uimm8zx4:$cc))]> {
   let usesCustomInserter = 1;
   // Although the instructions used by these nodes do not in themselves
-  // change the PSW, the insertion requires new blocks, and the PSW cannot
-  // be live across them.
-  let Defs = [PSW];
-  let Uses = [PSW];
+  // change CC, the insertion requires new blocks, and CC cannot be live
+  // across them.
+  let Defs = [CC];
+  let Uses = [CC];
+}
+
+// Stores $new to $addr if $cc is true ("" case) or false (Inv case).
+multiclass CondStores<RegisterOperand cls, SDPatternOperator store,
+                      SDPatternOperator load, AddressingMode mode> {
+  let Defs = [CC], Uses = [CC], usesCustomInserter = 1 in {
+    def "" : Pseudo<(outs),
+                    (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc),
+                    [(store (z_select_ccmask cls:$new, (load mode:$addr),
+                                             uimm8zx4:$valid, uimm8zx4:$cc),
+                            mode:$addr)]>;
+    def Inv : Pseudo<(outs),
+                     (ins cls:$new, mode:$addr, uimm8zx4:$valid, uimm8zx4:$cc),
+                     [(store (z_select_ccmask (load mode:$addr), cls:$new,
+                                              uimm8zx4:$valid, uimm8zx4:$cc),
+                              mode:$addr)]>;
+  }
 }
 
 // OPERATOR is ATOMIC_SWAP or an ATOMIC_LOAD_* operation.  PAT and OPERAND
@@ -947,7 +1481,7 @@ class AtomicLoadBinary<SDPatternOperator operator, RegisterOperand cls,
                        dag pat, DAGOperand operand>
   : Pseudo<(outs cls:$dst), (ins bdaddr20only:$ptr, operand:$src2),
            [(set cls:$dst, (operator bdaddr20only:$ptr, pat))]> {
-  let Defs = [PSW];
+  let Defs = [CC];
   let Has20BitOffset = 1;
   let mayLoad = 1;
   let mayStore = 1;
@@ -973,7 +1507,7 @@ class AtomicLoadWBinary<SDPatternOperator operator, dag pat,
                 ADDR32:$negbitshift, uimm32:$bitsize),
            [(set GR32:$dst, (operator bdaddr20only:$ptr, pat, ADDR32:$bitshift,
                                       ADDR32:$negbitshift, uimm32:$bitsize))]> {
-  let Defs = [PSW];
+  let Defs = [CC];
   let Has20BitOffset = 1;
   let mayLoad = 1;
   let mayStore = 1;
@@ -985,3 +1519,85 @@ class AtomicLoadWBinaryReg<SDPatternOperator operator>
   : AtomicLoadWBinary<operator, (i32 GR32:$src2), GR32>;
 class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
   : AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
+
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
+multiclass MemorySS<string mnemonic, bits<8> opcode,
+                    SDPatternOperator sequence, SDPatternOperator loop> {
+  def "" : InstSS<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
+                                       bdaddr12only:$BD2),
+                  mnemonic##"\t$BDL1, $BD2", []>;
+  let usesCustomInserter = 1 in {
+    def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                       imm64:$length),
+                           [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+                                      imm64:$length)]>;
+    def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+                                   imm64:$length, GR64:$count256),
+                      [(loop bdaddr12only:$dest, bdaddr12only:$src,
+                             imm64:$length, GR64:$count256)]>;
+  }
+}
+
+// Define an instruction that operates on two strings, both terminated
+// by the character in R0.  The instruction processes a CPU-determinated
+// number of bytes at a time and sets CC to 3 if the instruction needs
+// to be repeated.  Also define a pseudo instruction that represents
+// the full loop (the main instruction plus the branch on CC==3).
+multiclass StringRRE<string mnemonic, bits<16> opcode,
+                     SDPatternOperator operator> {
+  def "" : InstRRE<opcode, (outs GR64:$R1, GR64:$R2),
+                   (ins GR64:$R1src, GR64:$R2src),
+                   mnemonic#"\t$R1, $R2", []> {
+    let Constraints = "$R1 = $R1src, $R2 = $R2src";
+    let DisableEncoding = "$R1src, $R2src";
+  }
+  let usesCustomInserter = 1 in
+    def Loop : Pseudo<(outs GR64:$end),
+                      (ins GR64:$start1, GR64:$start2, GR32:$char),
+                      [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
+                                                 GR32:$char))]>;
+}
+
+// A pseudo instruction that is a direct alias of a real instruction.
+// These aliases are used in cases where a particular register operand is
+// fixed or where the same instruction is used with different register sizes.
+// The size parameter is the size in bytes of the associated real instruction.
+class Alias<int size, dag outs, dag ins, list<dag> pattern>
+  : InstSystemZ<size, outs, ins, "", pattern> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+}
+
+// An alias of a BinaryRI, but with different register sizes.
+class BinaryAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                    Immediate imm>
+  : Alias<4, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a BinaryRIL, but with different register sizes.
+class BinaryAliasRIL<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<6, (outs cls:$R1), (ins cls:$R1src, imm:$I2),
+          [(set cls:$R1, (operator cls:$R1src, imm:$I2))]> {
+  let Constraints = "$R1 = $R1src";
+}
+
+// An alias of a CompareRI, but with different register sizes.
+class CompareAliasRI<SDPatternOperator operator, RegisterOperand cls,
+                     Immediate imm>
+  : Alias<4, (outs), (ins cls:$R1, imm:$I2), [(operator cls:$R1, imm:$I2)]> {
+  let isCompare = 1;
+}
+
+// An alias of a RotateSelectRIEf, but with different register sizes.
+class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
+  : Alias<6, (outs cls1:$R1),
+          (ins cls1:$R1src, cls2:$R2, uimm8:$I3, uimm8:$I4, uimm8zx6:$I5), []> {
+  let Constraints = "$R1 = $R1src";
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 0718c83..acfeed8 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -12,17 +12,37 @@
 //===----------------------------------------------------------------------===//
 
 #include "SystemZInstrInfo.h"
+#include "SystemZTargetMachine.h"
 #include "SystemZInstrBuilder.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRMAP_INFO
 #include "SystemZGenInstrInfo.inc"
 
 using namespace llvm;
 
+// Return a mask with Count low bits set.
+static uint64_t allOnes(unsigned int Count) {
+  return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
+}
+
+// Reg should be a 32-bit GPR.  Return true if it is a high register rather
+// than a low register.
+static bool isHighReg(unsigned int Reg) {
+  if (SystemZ::GRH32BitRegClass.contains(Reg))
+    return true;
+  assert(SystemZ::GR32BitRegClass.contains(Reg) && "Invalid GRX32");
+  return false;
+}
+
+// Pin the vtable to this file.
+void SystemZInstrInfo::anchor() {}
+
 SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm)
   : SystemZGenInstrInfo(SystemZ::ADJCALLSTACKDOWN, SystemZ::ADJCALLSTACKUP),
-    RI(tm, *this) {
+    RI(tm), TM(tm) {
 }
 
 // MI is a 128-bit load or store.  Split it into two 64-bit loads or stores,
@@ -40,8 +60,8 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
   // Set up the two 64-bit registers.
   MachineOperand &HighRegOp = EarlierMI->getOperand(0);
   MachineOperand &LowRegOp = MI->getOperand(0);
-  HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_high));
-  LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_low));
+  HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
+  LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
 
   // The address in the first (high) instruction is already correct.
   // Adjust the offset in the second (low) instruction.
@@ -74,12 +94,104 @@ void SystemZInstrInfo::splitAdjDynAlloc(MachineBasicBlock::iterator MI) const {
   OffsetMO.setImm(Offset);
 }
 
+// MI is an RI-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.  ConvertHigh is true if LowOpcode takes a signed operand
+// and HighOpcode takes an unsigned 32-bit operand.  In those cases,
+// MI has the same kind of operand as LowOpcode, so needs to be converted
+// if HighOpcode is used.
+void SystemZInstrInfo::expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                      unsigned HighOpcode,
+                                      bool ConvertHigh) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  bool IsHigh = isHighReg(Reg);
+  MI->setDesc(get(IsHigh ? HighOpcode : LowOpcode));
+  if (IsHigh && ConvertHigh)
+    MI->getOperand(1).setImm(uint32_t(MI->getOperand(1).getImm()));
+}
+
+// MI is a three-operand RIE-style pseudo instruction.  Replace it with
+// LowOpcode3 if the registers are both low GR32s, otherwise use a move
+// followed by HighOpcode or LowOpcode, depending on whether the target
+// is a high or low GR32.
+void SystemZInstrInfo::expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                       unsigned LowOpcodeK,
+                                       unsigned HighOpcode) const {
+  unsigned DestReg = MI->getOperand(0).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (!DestIsHigh && !SrcIsHigh)
+    MI->setDesc(get(LowOpcodeK));
+  else {
+    emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+                  DestReg, SrcReg, SystemZ::LR, 32,
+                  MI->getOperand(1).isKill());
+    MI->setDesc(get(DestIsHigh ? HighOpcode : LowOpcode));
+    MI->getOperand(1).setReg(DestReg);
+  }
+}
+
+// MI is an RXY-style pseudo instruction.  Replace it with LowOpcode
+// if the first operand is a low GR32 and HighOpcode if the first operand
+// is a high GR32.
+void SystemZInstrInfo::expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                       unsigned HighOpcode) const {
+  unsigned Reg = MI->getOperand(0).getReg();
+  unsigned Opcode = getOpcodeForOffset(isHighReg(Reg) ? HighOpcode : LowOpcode,
+                                       MI->getOperand(2).getImm());
+  MI->setDesc(get(Opcode));
+}
+
+// MI is an RR-style pseudo instruction that zero-extends the low Size bits
+// of one GRX32 into another.  Replace it with LowOpcode if both operands
+// are low registers, otherwise use RISB[LH]G.
+void SystemZInstrInfo::expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+                                        unsigned Size) const {
+  emitGRX32Move(*MI->getParent(), MI, MI->getDebugLoc(),
+                MI->getOperand(0).getReg(), MI->getOperand(1).getReg(),
+                LowOpcode, Size, MI->getOperand(1).isKill());
+  MI->eraseFromParent();
+}
+
+// Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR
+// DestReg before MBBI in MBB.  Use LowLowOpcode when both DestReg and SrcReg
+// are low registers, otherwise use RISB[LH]G.  Size is the number of bits
+// taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
+// KillSrc is true if this move is the last use of SrcReg.
+void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     DebugLoc DL, unsigned DestReg,
+                                     unsigned SrcReg, unsigned LowLowOpcode,
+                                     unsigned Size, bool KillSrc) const {
+  unsigned Opcode;
+  bool DestIsHigh = isHighReg(DestReg);
+  bool SrcIsHigh = isHighReg(SrcReg);
+  if (DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBHH;
+  else if (DestIsHigh && !SrcIsHigh)
+    Opcode = SystemZ::RISBHL;
+  else if (!DestIsHigh && SrcIsHigh)
+    Opcode = SystemZ::RISBLH;
+  else {
+    BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+      .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
+  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+    .addReg(DestReg, RegState::Undef)
+    .addReg(SrcReg, getKillRegState(KillSrc))
+    .addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
+}
+
 // If MI is a simple load or store for a frame object, return the register
 // it loads or stores and set FrameIndex to the index of the frame object.
 // Return 0 otherwise.
 //
 // Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
-static int isSimpleMove(const MachineInstr *MI, int &FrameIndex, int Flag) {
+static int isSimpleMove(const MachineInstr *MI, int &FrameIndex,
+                        unsigned Flag) {
   const MCInstrDesc &MCID = MI->getDesc();
   if ((MCID.TSFlags & Flag) &&
       MI->getOperand(1).isFI() &&
@@ -101,6 +213,31 @@ unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
   return isSimpleMove(MI, FrameIndex, SystemZII::SimpleBDXStore);
 }
 
+bool SystemZInstrInfo::isStackSlotCopy(const MachineInstr *MI,
+                                       int &DestFrameIndex,
+                                       int &SrcFrameIndex) const {
+  // Check for MVC 0(Length,FI1),0(FI2)
+  const MachineFrameInfo *MFI = MI->getParent()->getParent()->getFrameInfo();
+  if (MI->getOpcode() != SystemZ::MVC ||
+      !MI->getOperand(0).isFI() ||
+      MI->getOperand(1).getImm() != 0 ||
+      !MI->getOperand(3).isFI() ||
+      MI->getOperand(4).getImm() != 0)
+    return false;
+
+  // Check that Length covers the full slots.
+  int64_t Length = MI->getOperand(2).getImm();
+  unsigned FI1 = MI->getOperand(0).getIndex();
+  unsigned FI2 = MI->getOperand(3).getIndex();
+  if (MFI->getObjectSize(FI1) != Length ||
+      MFI->getObjectSize(FI2) != Length)
+    return false;
+
+  DestFrameIndex = FI1;
+  SrcFrameIndex = FI2;
+  return true;
+}
+
 bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                      MachineBasicBlock *&TBB,
                                      MachineBasicBlock *&FBB,
@@ -123,19 +260,22 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
     // A terminator that isn't a branch can't easily be handled by this
     // analysis.
-    unsigned ThisCond;
-    const MachineOperand *ThisTarget;
-    if (!isBranch(I, ThisCond, ThisTarget))
+    if (!I->isBranch())
       return true;
 
     // Can't handle indirect branches.
-    if (!ThisTarget->isMBB())
+    SystemZII::Branch Branch(getBranchInfo(I));
+    if (!Branch.Target->isMBB())
+      return true;
+
+    // Punt on compound branches.
+    if (Branch.Type != SystemZII::BranchNormal)
       return true;
 
-    if (ThisCond == SystemZ::CCMASK_ANY) {
+    if (Branch.CCMask == SystemZ::CCMASK_ANY) {
       // Handle unconditional branches.
       if (!AllowModify) {
-        TBB = ThisTarget->getMBB();
+        TBB = Branch.Target->getMBB();
         continue;
       }
 
@@ -147,7 +287,7 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       FBB = 0;
 
       // Delete the JMP if it's equivalent to a fall-through.
-      if (MBB.isLayoutSuccessor(ThisTarget->getMBB())) {
+      if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) {
         TBB = 0;
         I->eraseFromParent();
         I = MBB.end();
@@ -155,7 +295,7 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       }
 
       // TBB is used to indicate the unconditinal destination.
-      TBB = ThisTarget->getMBB();
+      TBB = Branch.Target->getMBB();
       continue;
     }
 
@@ -163,26 +303,28 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
     if (Cond.empty()) {
       // FIXME: add X86-style branch swap
       FBB = TBB;
-      TBB = ThisTarget->getMBB();
-      Cond.push_back(MachineOperand::CreateImm(ThisCond));
+      TBB = Branch.Target->getMBB();
+      Cond.push_back(MachineOperand::CreateImm(Branch.CCValid));
+      Cond.push_back(MachineOperand::CreateImm(Branch.CCMask));
       continue;
     }
 
     // Handle subsequent conditional branches.
-    assert(Cond.size() == 1);
-    assert(TBB);
+    assert(Cond.size() == 2 && TBB && "Should have seen a conditional branch");
 
     // Only handle the case where all conditional branches branch to the same
     // destination.
-    if (TBB != ThisTarget->getMBB())
+    if (TBB != Branch.Target->getMBB())
       return true;
 
     // If the conditions are the same, we can leave them alone.
-    unsigned OldCond = Cond[0].getImm();
-    if (OldCond == ThisCond)
+    unsigned OldCCValid = Cond[0].getImm();
+    unsigned OldCCMask = Cond[1].getImm();
+    if (OldCCValid == Branch.CCValid && OldCCMask == Branch.CCMask)
       continue;
 
     // FIXME: Try combining conditions like X86 does.  Should be easy on Z!
+    return false;
   }
 
   return false;
@@ -197,11 +339,9 @@ unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
     --I;
     if (I->isDebugValue())
       continue;
-    unsigned Cond;
-    const MachineOperand *Target;
-    if (!isBranch(I, Cond, Target))
+    if (!I->isBranch())
       break;
-    if (!Target->isMBB())
+    if (!getBranchInfo(I).Target->isMBB())
       break;
     // Remove the branch.
     I->eraseFromParent();
@@ -212,6 +352,13 @@ unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   return Count;
 }
 
+bool SystemZInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid condition");
+  Cond[1].setImm(Cond[1].getImm() ^ Cond[0].getImm());
+  return false;
+}
+
 unsigned
 SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                MachineBasicBlock *FBB,
@@ -223,30 +370,185 @@ SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
 
   // Shouldn't be a fall through.
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
-  assert((Cond.size() == 1 || Cond.size() == 0) &&
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
          "SystemZ branch conditions have one component!");
 
   if (Cond.empty()) {
     // Unconditional branch?
     assert(!FBB && "Unconditional branch with multiple successors!");
-    BuildMI(&MBB, DL, get(SystemZ::JG)).addMBB(TBB);
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(TBB);
     return 1;
   }
 
   // Conditional branch.
   unsigned Count = 0;
-  unsigned CC = Cond[0].getImm();
-  BuildMI(&MBB, DL, get(SystemZ::BRCL)).addImm(CC).addMBB(TBB);
+  unsigned CCValid = Cond[0].getImm();
+  unsigned CCMask = Cond[1].getImm();
+  BuildMI(&MBB, DL, get(SystemZ::BRC))
+    .addImm(CCValid).addImm(CCMask).addMBB(TBB);
   ++Count;
 
   if (FBB) {
     // Two-way Conditional branch. Insert the second branch.
-    BuildMI(&MBB, DL, get(SystemZ::JG)).addMBB(FBB);
+    BuildMI(&MBB, DL, get(SystemZ::J)).addMBB(FBB);
     ++Count;
   }
   return Count;
 }
 
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
+                                      unsigned &SrcReg, unsigned &SrcReg2,
+                                      int &Mask, int &Value) const {
+  assert(MI->isCompare() && "Caller should have checked for a comparison");
+
+  if (MI->getNumExplicitOperands() == 2 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(1).isImm()) {
+    SrcReg = MI->getOperand(0).getReg();
+    SrcReg2 = 0;
+    Value = MI->getOperand(1).getImm();
+    Mask = ~0;
+    return true;
+  }
+
+  return false;
+}
+
+// If Reg is a virtual register, return its definition, otherwise return null.
+static MachineInstr *getDef(unsigned Reg,
+                            const MachineRegisterInfo *MRI) {
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return 0;
+  return MRI->getUniqueVRegDef(Reg);
+}
+
+// Return true if MI is a shift of type Opcode by Imm bits.
+static bool isShift(MachineInstr *MI, int Opcode, int64_t Imm) {
+  return (MI->getOpcode() == Opcode &&
+          !MI->getOperand(2).getReg() &&
+          MI->getOperand(3).getImm() == Imm);
+}
+
+// If the destination of MI has no uses, delete it as dead.
+static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
+  if (MRI->use_nodbg_empty(MI->getOperand(0).getReg()))
+    MI->eraseFromParent();
+}
+
+// Compare compares SrcReg against zero.  Check whether SrcReg contains
+// the result of an IPM sequence whose input CC survives until Compare,
+// and whether Compare is therefore redundant.  Delete it and return
+// true if so.
+static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
+                                  const MachineRegisterInfo *MRI,
+                                  const TargetRegisterInfo *TRI) {
+  MachineInstr *LGFR = 0;
+  MachineInstr *RLL = getDef(SrcReg, MRI);
+  if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
+    LGFR = RLL;
+    RLL = getDef(LGFR->getOperand(1).getReg(), MRI);
+  }
+  if (!RLL || !isShift(RLL, SystemZ::RLL, 31))
+    return false;
+
+  MachineInstr *SRL = getDef(RLL->getOperand(1).getReg(), MRI);
+  if (!SRL || !isShift(SRL, SystemZ::SRL, SystemZ::IPM_CC))
+    return false;
+
+  MachineInstr *IPM = getDef(SRL->getOperand(1).getReg(), MRI);
+  if (!IPM || IPM->getOpcode() != SystemZ::IPM)
+    return false;
+
+  // Check that there are no assignments to CC between the IPM and Compare,
+  if (IPM->getParent() != Compare->getParent())
+    return false;
+  MachineBasicBlock::iterator MBBI = IPM, MBBE = Compare;
+  for (++MBBI; MBBI != MBBE; ++MBBI) {
+    MachineInstr *MI = MBBI;
+    if (MI->modifiesRegister(SystemZ::CC, TRI))
+      return false;
+  }
+
+  Compare->eraseFromParent();
+  if (LGFR)
+    eraseIfDead(LGFR, MRI);
+  eraseIfDead(RLL, MRI);
+  eraseIfDead(SRL, MRI);
+  eraseIfDead(IPM, MRI);
+
+  return true;
+}
+
+bool
+SystemZInstrInfo::optimizeCompareInstr(MachineInstr *Compare,
+                                       unsigned SrcReg, unsigned SrcReg2,
+                                       int Mask, int Value,
+                                       const MachineRegisterInfo *MRI) const {
+  assert(!SrcReg2 && "Only optimizing constant comparisons so far");
+  bool IsLogical = (Compare->getDesc().TSFlags & SystemZII::IsLogical) != 0;
+  if (Value == 0 &&
+      !IsLogical &&
+      removeIPMBasedCompare(Compare, SrcReg, MRI, TM.getRegisterInfo()))
+    return true;
+  return false;
+}
+
+// If Opcode is a move that has a conditional variant, return that variant,
+// otherwise return 0.
+static unsigned getConditionalMove(unsigned Opcode) {
+  switch (Opcode) {
+  case SystemZ::LR:  return SystemZ::LOCR;
+  case SystemZ::LGR: return SystemZ::LOCGR;
+  default:           return 0;
+  }
+}
+
+bool SystemZInstrInfo::isPredicable(MachineInstr *MI) const {
+  unsigned Opcode = MI->getOpcode();
+  if (TM.getSubtargetImpl()->hasLoadStoreOnCond() &&
+      getConditionalMove(Opcode))
+    return true;
+  return false;
+}
+
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &MBB,
+                    unsigned NumCycles, unsigned ExtraPredCycles,
+                    const BranchProbability &Probability) const {
+  // For now only convert single instructions.
+  return NumCycles == 1;
+}
+
+bool SystemZInstrInfo::
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                    unsigned NumCyclesT, unsigned ExtraPredCyclesT,
+                    MachineBasicBlock &FMBB,
+                    unsigned NumCyclesF, unsigned ExtraPredCyclesF,
+                    const BranchProbability &Probability) const {
+  // For now avoid converting mutually-exclusive cases.
+  return false;
+}
+
+bool SystemZInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  assert(Pred.size() == 2 && "Invalid condition");
+  unsigned CCValid = Pred[0].getImm();
+  unsigned CCMask = Pred[1].getImm();
+  assert(CCMask > 0 && CCMask < 15 && "Invalid predicate");
+  unsigned Opcode = MI->getOpcode();
+  if (TM.getSubtargetImpl()->hasLoadStoreOnCond()) {
+    if (unsigned CondOpcode = getConditionalMove(Opcode)) {
+      MI->setDesc(get(CondOpcode));
+      MachineInstrBuilder(*MI->getParent()->getParent(), MI)
+        .addImm(CCValid).addImm(CCMask)
+        .addReg(SystemZ::CC, RegState::Implicit);;
+      return true;
+    }
+  }
+  return false;
+}
+
 void
 SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 			      MachineBasicBlock::iterator MBBI, DebugLoc DL,
@@ -254,18 +556,21 @@ SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 			      bool KillSrc) const {
   // Split 128-bit GPR moves into two 64-bit moves.  This handles ADDR128 too.
   if (SystemZ::GR128BitRegClass.contains(DestReg, SrcReg)) {
-    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_high),
-                RI.getSubReg(SrcReg, SystemZ::subreg_high), KillSrc);
-    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_low),
-                RI.getSubReg(SrcReg, SystemZ::subreg_low), KillSrc);
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_h64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_h64), KillSrc);
+    copyPhysReg(MBB, MBBI, DL, RI.getSubReg(DestReg, SystemZ::subreg_l64),
+                RI.getSubReg(SrcReg, SystemZ::subreg_l64), KillSrc);
+    return;
+  }
+
+  if (SystemZ::GRX32BitRegClass.contains(DestReg, SrcReg)) {
+    emitGRX32Move(MBB, MBBI, DL, DestReg, SrcReg, SystemZ::LR, 32, KillSrc);
     return;
   }
 
   // Everything else needs only one instruction.
   unsigned Opcode;
-  if (SystemZ::GR32BitRegClass.contains(DestReg, SrcReg))
-    Opcode = SystemZ::LR;
-  else if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
+  if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::LGR;
   else if (SystemZ::FP32BitRegClass.contains(DestReg, SrcReg))
     Opcode = SystemZ::LER;
@@ -313,6 +618,256 @@ SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                     FrameIdx);
 }
 
+// Return true if MI is a simple load or store with a 12-bit displacement
+// and no index.  Flag is SimpleBDXLoad for loads and SimpleBDXStore for stores.
+static bool isSimpleBD12Move(const MachineInstr *MI, unsigned Flag) {
+  const MCInstrDesc &MCID = MI->getDesc();
+  return ((MCID.TSFlags & Flag) &&
+          isUInt<12>(MI->getOperand(2).getImm()) &&
+          MI->getOperand(3).getReg() == 0);
+}
+
+namespace {
+  struct LogicOp {
+    LogicOp() : RegSize(0), ImmLSB(0), ImmSize(0) {}
+    LogicOp(unsigned regSize, unsigned immLSB, unsigned immSize)
+      : RegSize(regSize), ImmLSB(immLSB), ImmSize(immSize) {}
+
+    operator bool() const { return RegSize; }
+
+    unsigned RegSize, ImmLSB, ImmSize;
+  };
+}
+
+static LogicOp interpretAndImmediate(unsigned Opcode) {
+  switch (Opcode) {
+  case SystemZ::NILMux: return LogicOp(32,  0, 16);
+  case SystemZ::NIHMux: return LogicOp(32, 16, 16);
+  case SystemZ::NILL64: return LogicOp(64,  0, 16);
+  case SystemZ::NILH64: return LogicOp(64, 16, 16);
+  case SystemZ::NIHL64: return LogicOp(64, 32, 16);
+  case SystemZ::NIHH64: return LogicOp(64, 48, 16);
+  case SystemZ::NIFMux: return LogicOp(32,  0, 32);
+  case SystemZ::NILF64: return LogicOp(64,  0, 32);
+  case SystemZ::NIHF64: return LogicOp(64, 32, 32);
+  default:              return LogicOp();
+  }
+}
+
+// Used to return from convertToThreeAddress after replacing two-address
+// instruction OldMI with three-address instruction NewMI.
+static MachineInstr *finishConvertToThreeAddress(MachineInstr *OldMI,
+                                                 MachineInstr *NewMI,
+                                                 LiveVariables *LV) {
+  if (LV) {
+    unsigned NumOps = OldMI->getNumOperands();
+    for (unsigned I = 1; I < NumOps; ++I) {
+      MachineOperand &Op = OldMI->getOperand(I);
+      if (Op.isReg() && Op.isKill())
+        LV->replaceKillInstruction(Op.getReg(), OldMI, NewMI);
+    }
+  }
+  return NewMI;
+}
+
+MachineInstr *
+SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                        MachineBasicBlock::iterator &MBBI,
+                                        LiveVariables *LV) const {
+  MachineInstr *MI = MBBI;
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+  unsigned Opcode = MI->getOpcode();
+  unsigned NumOps = MI->getNumOperands();
+
+  // Try to convert something like SLL into SLLK, if supported.
+  // We prefer to keep the two-operand form where possible both
+  // because it tends to be shorter and because some instructions
+  // have memory forms that can be used during spilling.
+  if (TM.getSubtargetImpl()->hasDistinctOps()) {
+    MachineOperand &Dest = MI->getOperand(0);
+    MachineOperand &Src = MI->getOperand(1);
+    unsigned DestReg = Dest.getReg();
+    unsigned SrcReg = Src.getReg();
+    // AHIMux is only really a three-operand instruction when both operands
+    // are low registers.  Try to constrain both operands to be low if
+    // possible.
+    if (Opcode == SystemZ::AHIMux &&
+        TargetRegisterInfo::isVirtualRegister(DestReg) &&
+        TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+        MRI.getRegClass(DestReg)->contains(SystemZ::R1L) &&
+        MRI.getRegClass(SrcReg)->contains(SystemZ::R1L)) {
+      MRI.constrainRegClass(DestReg, &SystemZ::GR32BitRegClass);
+      MRI.constrainRegClass(SrcReg, &SystemZ::GR32BitRegClass);
+    }
+    int ThreeOperandOpcode = SystemZ::getThreeOperandOpcode(Opcode);
+    if (ThreeOperandOpcode >= 0) {
+      MachineInstrBuilder MIB =
+        BuildMI(*MBB, MBBI, MI->getDebugLoc(), get(ThreeOperandOpcode))
+        .addOperand(Dest);
+      // Keep the kill state, but drop the tied flag.
+      MIB.addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg());
+      // Keep the remaining operands as-is.
+      for (unsigned I = 2; I < NumOps; ++I)
+        MIB.addOperand(MI->getOperand(I));
+      return finishConvertToThreeAddress(MI, MIB, LV);
+    }
+  }
+
+  // Try to convert an AND into an RISBG-type instruction.
+  if (LogicOp And = interpretAndImmediate(Opcode)) {
+    uint64_t Imm = MI->getOperand(2).getImm() << And.ImmLSB;
+    // AND IMMEDIATE leaves the other bits of the register unchanged.
+    Imm |= allOnes(And.RegSize) & ~(allOnes(And.ImmSize) << And.ImmLSB);
+    unsigned Start, End;
+    if (isRxSBGMask(Imm, And.RegSize, Start, End)) {
+      unsigned NewOpcode;
+      if (And.RegSize == 64)
+        NewOpcode = SystemZ::RISBG;
+      else {
+        NewOpcode = SystemZ::RISBMux;
+        Start &= 31;
+        End &= 31;
+      }
+      MachineOperand &Dest = MI->getOperand(0);
+      MachineOperand &Src = MI->getOperand(1);
+      MachineInstrBuilder MIB =
+        BuildMI(*MBB, MI, MI->getDebugLoc(), get(NewOpcode))
+        .addOperand(Dest).addReg(0)
+        .addReg(Src.getReg(), getKillRegState(Src.isKill()), Src.getSubReg())
+        .addImm(Start).addImm(End + 128).addImm(0);
+      return finishConvertToThreeAddress(MI, MIB, LV);
+    }
+  }
+  return 0;
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                        MachineInstr *MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        int FrameIndex) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Size = MFI->getObjectSize(FrameIndex);
+  unsigned Opcode = MI->getOpcode();
+
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    if ((Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+        isInt<8>(MI->getOperand(2).getImm()) &&
+        !MI->getOperand(3).getReg()) {
+      // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+      return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::AGSI))
+        .addFrameIndex(FrameIndex).addImm(0)
+        .addImm(MI->getOperand(2).getImm());
+    }
+    return 0;
+  }
+
+  // All other cases require a single operand.
+  if (Ops.size() != 1)
+    return 0;
+
+  unsigned OpNum = Ops[0];
+  assert(Size == MF.getRegInfo()
+         .getRegClass(MI->getOperand(OpNum).getReg())->getSize() &&
+         "Invalid size combination");
+
+  if ((Opcode == SystemZ::AHI || Opcode == SystemZ::AGHI) &&
+      OpNum == 0 &&
+      isInt<8>(MI->getOperand(2).getImm())) {
+    // A(G)HI %reg, CONST -> A(G)SI %mem, CONST
+    Opcode = (Opcode == SystemZ::AHI ? SystemZ::ASI : SystemZ::AGSI);
+    return BuildMI(MF, MI->getDebugLoc(), get(Opcode))
+      .addFrameIndex(FrameIndex).addImm(0)
+      .addImm(MI->getOperand(2).getImm());
+  }
+
+  if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
+    bool Op0IsGPR = (Opcode == SystemZ::LGDR);
+    bool Op1IsGPR = (Opcode == SystemZ::LDGR);
+    // If we're spilling the destination of an LDGR or LGDR, store the
+    // source register instead.
+    if (OpNum == 0) {
+      unsigned StoreOpcode = Op1IsGPR ? SystemZ::STG : SystemZ::STD;
+      return BuildMI(MF, MI->getDebugLoc(), get(StoreOpcode))
+        .addOperand(MI->getOperand(1)).addFrameIndex(FrameIndex)
+        .addImm(0).addReg(0);
+    }
+    // If we're spilling the source of an LDGR or LGDR, load the
+    // destination register instead.
+    if (OpNum == 1) {
+      unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
+      unsigned Dest = MI->getOperand(0).getReg();
+      return BuildMI(MF, MI->getDebugLoc(), get(LoadOpcode), Dest)
+        .addFrameIndex(FrameIndex).addImm(0).addReg(0);
+    }
+  }
+
+  // Look for cases where the source of a simple store or the destination
+  // of a simple load is being spilled.  Try to use MVC instead.
+  //
+  // Although MVC is in practice a fast choice in these cases, it is still
+  // logically a bytewise copy.  This means that we cannot use it if the
+  // load or store is volatile.  We also wouldn't be able to use MVC if
+  // the two memories partially overlap, but that case cannot occur here,
+  // because we know that one of the memories is a full frame index.
+  //
+  // For performance reasons, we also want to avoid using MVC if the addresses
+  // might be equal.  We don't worry about that case here, because spill slot
+  // coloring happens later, and because we have special code to remove
+  // MVCs that turn out to be redundant.
+  if (OpNum == 0 && MI->hasOneMemOperand()) {
+    MachineMemOperand *MMO = *MI->memoperands_begin();
+    if (MMO->getSize() == Size && !MMO->isVolatile()) {
+      // Handle conversion of loads.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXLoad)) {
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addFrameIndex(FrameIndex).addImm(0).addImm(Size)
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addMemOperand(MMO);
+      }
+      // Handle conversion of stores.
+      if (isSimpleBD12Move(MI, SystemZII::SimpleBDXStore)) {
+        return BuildMI(MF, MI->getDebugLoc(), get(SystemZ::MVC))
+          .addOperand(MI->getOperand(1)).addImm(MI->getOperand(2).getImm())
+          .addImm(Size).addFrameIndex(FrameIndex).addImm(0)
+          .addMemOperand(MMO);
+      }
+    }
+  }
+
+  // If the spilled operand is the final one, try to change <INSN>R
+  // into <INSN>.
+  int MemOpcode = SystemZ::getMemOpcode(Opcode);
+  if (MemOpcode >= 0) {
+    unsigned NumOps = MI->getNumExplicitOperands();
+    if (OpNum == NumOps - 1) {
+      const MCInstrDesc &MemDesc = get(MemOpcode);
+      uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
+      assert(AccessBytes != 0 && "Size of access should be known");
+      assert(AccessBytes <= Size && "Access outside the frame index");
+      uint64_t Offset = Size - AccessBytes;
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(MemOpcode));
+      for (unsigned I = 0; I < OpNum; ++I)
+        MIB.addOperand(MI->getOperand(I));
+      MIB.addFrameIndex(FrameIndex).addImm(Offset);
+      if (MemDesc.TSFlags & SystemZII::HasIndex)
+        MIB.addReg(0);
+      return MIB;
+    }
+  }
+
+  return 0;
+}
+
+MachineInstr *
+SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        MachineInstr* LoadMI) const {
+  return 0;
+}
+
 bool
 SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   switch (MI->getOpcode()) {
@@ -332,6 +887,138 @@ SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     splitMove(MI, SystemZ::STD);
     return true;
 
+  case SystemZ::LBMux:
+    expandRXYPseudo(MI, SystemZ::LB, SystemZ::LBH);
+    return true;
+
+  case SystemZ::LHMux:
+    expandRXYPseudo(MI, SystemZ::LH, SystemZ::LHH);
+    return true;
+
+  case SystemZ::LLCRMux:
+    expandZExtPseudo(MI, SystemZ::LLCR, 8);
+    return true;
+
+  case SystemZ::LLHRMux:
+    expandZExtPseudo(MI, SystemZ::LLHR, 16);
+    return true;
+
+  case SystemZ::LLCMux:
+    expandRXYPseudo(MI, SystemZ::LLC, SystemZ::LLCH);
+    return true;
+
+  case SystemZ::LLHMux:
+    expandRXYPseudo(MI, SystemZ::LLH, SystemZ::LLHH);
+    return true;
+
+  case SystemZ::LMux:
+    expandRXYPseudo(MI, SystemZ::L, SystemZ::LFH);
+    return true;
+
+  case SystemZ::STCMux:
+    expandRXYPseudo(MI, SystemZ::STC, SystemZ::STCH);
+    return true;
+
+  case SystemZ::STHMux:
+    expandRXYPseudo(MI, SystemZ::STH, SystemZ::STHH);
+    return true;
+
+  case SystemZ::STMux:
+    expandRXYPseudo(MI, SystemZ::ST, SystemZ::STFH);
+    return true;
+
+  case SystemZ::LHIMux:
+    expandRIPseudo(MI, SystemZ::LHI, SystemZ::IIHF, true);
+    return true;
+
+  case SystemZ::IIFMux:
+    expandRIPseudo(MI, SystemZ::IILF, SystemZ::IIHF, false);
+    return true;
+
+  case SystemZ::IILMux:
+    expandRIPseudo(MI, SystemZ::IILL, SystemZ::IIHL, false);
+    return true;
+
+  case SystemZ::IIHMux:
+    expandRIPseudo(MI, SystemZ::IILH, SystemZ::IIHH, false);
+    return true;
+
+  case SystemZ::NIFMux:
+    expandRIPseudo(MI, SystemZ::NILF, SystemZ::NIHF, false);
+    return true;
+
+  case SystemZ::NILMux:
+    expandRIPseudo(MI, SystemZ::NILL, SystemZ::NIHL, false);
+    return true;
+
+  case SystemZ::NIHMux:
+    expandRIPseudo(MI, SystemZ::NILH, SystemZ::NIHH, false);
+    return true;
+
+  case SystemZ::OIFMux:
+    expandRIPseudo(MI, SystemZ::OILF, SystemZ::OIHF, false);
+    return true;
+
+  case SystemZ::OILMux:
+    expandRIPseudo(MI, SystemZ::OILL, SystemZ::OIHL, false);
+    return true;
+
+  case SystemZ::OIHMux:
+    expandRIPseudo(MI, SystemZ::OILH, SystemZ::OIHH, false);
+    return true;
+
+  case SystemZ::XIFMux:
+    expandRIPseudo(MI, SystemZ::XILF, SystemZ::XIHF, false);
+    return true;
+
+  case SystemZ::TMLMux:
+    expandRIPseudo(MI, SystemZ::TMLL, SystemZ::TMHL, false);
+    return true;
+
+  case SystemZ::TMHMux:
+    expandRIPseudo(MI, SystemZ::TMLH, SystemZ::TMHH, false);
+    return true;
+
+  case SystemZ::AHIMux:
+    expandRIPseudo(MI, SystemZ::AHI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::AHIMuxK:
+    expandRIEPseudo(MI, SystemZ::AHI, SystemZ::AHIK, SystemZ::AIH);
+    return true;
+
+  case SystemZ::AFIMux:
+    expandRIPseudo(MI, SystemZ::AFI, SystemZ::AIH, false);
+    return true;
+
+  case SystemZ::CFIMux:
+    expandRIPseudo(MI, SystemZ::CFI, SystemZ::CIH, false);
+    return true;
+
+  case SystemZ::CLFIMux:
+    expandRIPseudo(MI, SystemZ::CLFI, SystemZ::CLIH, false);
+    return true;
+
+  case SystemZ::CMux:
+    expandRXYPseudo(MI, SystemZ::C, SystemZ::CHF);
+    return true;
+
+  case SystemZ::CLMux:
+    expandRXYPseudo(MI, SystemZ::CL, SystemZ::CLHF);
+    return true;
+
+  case SystemZ::RISBMux: {
+    bool DestIsHigh = isHighReg(MI->getOperand(0).getReg());
+    bool SrcIsHigh = isHighReg(MI->getOperand(2).getReg());
+    if (SrcIsHigh == DestIsHigh)
+      MI->setDesc(get(DestIsHigh ? SystemZ::RISBHH : SystemZ::RISBLL));
+    else {
+      MI->setDesc(get(DestIsHigh ? SystemZ::RISBHL : SystemZ::RISBLH));
+      MI->getOperand(5).setImm(MI->getOperand(5).getImm() ^ 32);
+    }
+    return true;
+  }
+
   case SystemZ::ADJDYNALLOC:
     splitAdjDynAlloc(MI);
     return true;
@@ -341,32 +1028,60 @@ SystemZInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   }
 }
 
-bool SystemZInstrInfo::
-ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
-  assert(Cond.size() == 1 && "Invalid branch condition!");
-  Cond[0].setImm(Cond[0].getImm() ^ SystemZ::CCMASK_ANY);
-  return false;
+uint64_t SystemZInstrInfo::getInstSizeInBytes(const MachineInstr *MI) const {
+  if (MI->getOpcode() == TargetOpcode::INLINEASM) {
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  return MI->getDesc().getSize();
 }
 
-bool SystemZInstrInfo::isBranch(const MachineInstr *MI, unsigned &Cond,
-                                const MachineOperand *&Target) const {
+SystemZII::Branch
+SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
   switch (MI->getOpcode()) {
   case SystemZ::BR:
   case SystemZ::J:
   case SystemZ::JG:
-    Cond = SystemZ::CCMASK_ANY;
-    Target = &MI->getOperand(0);
-    return true;
+    return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY,
+                             SystemZ::CCMASK_ANY, &MI->getOperand(0));
 
   case SystemZ::BRC:
   case SystemZ::BRCL:
-    Cond = MI->getOperand(0).getImm();
-    Target = &MI->getOperand(1);
-    return true;
+    return SystemZII::Branch(SystemZII::BranchNormal,
+                             MI->getOperand(0).getImm(),
+                             MI->getOperand(1).getImm(), &MI->getOperand(2));
+
+  case SystemZ::BRCT:
+    return SystemZII::Branch(SystemZII::BranchCT, SystemZ::CCMASK_ICMP,
+                             SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+
+  case SystemZ::BRCTG:
+    return SystemZII::Branch(SystemZII::BranchCTG, SystemZ::CCMASK_ICMP,
+                             SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+
+  case SystemZ::CIJ:
+  case SystemZ::CRJ:
+    return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
+  case SystemZ::CLIJ:
+  case SystemZ::CLRJ:
+    return SystemZII::Branch(SystemZII::BranchCL, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
+  case SystemZ::CGIJ:
+  case SystemZ::CGRJ:
+    return SystemZII::Branch(SystemZII::BranchCG, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
+
+  case SystemZ::CLGIJ:
+  case SystemZ::CLGRJ:
+    return SystemZII::Branch(SystemZII::BranchCLG, SystemZ::CCMASK_ICMP,
+                             MI->getOperand(2).getImm(), &MI->getOperand(3));
 
   default:
-    assert(!MI->getDesc().isBranch() && "Unknown branch opcode");
-    return false;
+    llvm_unreachable("Unrecognized branch opcode");
   }
 }
 
@@ -375,7 +1090,13 @@ void SystemZInstrInfo::getLoadStoreOpcodes(const TargetRegisterClass *RC,
                                            unsigned &StoreOpcode) const {
   if (RC == &SystemZ::GR32BitRegClass || RC == &SystemZ::ADDR32BitRegClass) {
     LoadOpcode = SystemZ::L;
-    StoreOpcode = SystemZ::ST32;
+    StoreOpcode = SystemZ::ST;
+  } else if (RC == &SystemZ::GRH32BitRegClass) {
+    LoadOpcode = SystemZ::LFH;
+    StoreOpcode = SystemZ::STFH;
+  } else if (RC == &SystemZ::GRX32BitRegClass) {
+    LoadOpcode = SystemZ::LMux;
+    StoreOpcode = SystemZ::STMux;
   } else if (RC == &SystemZ::GR64BitRegClass ||
              RC == &SystemZ::ADDR64BitRegClass) {
     LoadOpcode = SystemZ::LG;
@@ -424,6 +1145,88 @@ unsigned SystemZInstrInfo::getOpcodeForOffset(unsigned Opcode,
   return 0;
 }
 
+unsigned SystemZInstrInfo::getLoadAndTest(unsigned Opcode) const {
+  switch (Opcode) {
+  case SystemZ::L:    return SystemZ::LT;
+  case SystemZ::LY:   return SystemZ::LT;
+  case SystemZ::LG:   return SystemZ::LTG;
+  case SystemZ::LGF:  return SystemZ::LTGF;
+  case SystemZ::LR:   return SystemZ::LTR;
+  case SystemZ::LGFR: return SystemZ::LTGFR;
+  case SystemZ::LGR:  return SystemZ::LTGR;
+  case SystemZ::LER:  return SystemZ::LTEBR;
+  case SystemZ::LDR:  return SystemZ::LTDBR;
+  case SystemZ::LXR:  return SystemZ::LTXBR;
+  default:            return 0;
+  }
+}
+
+// Return true if Mask matches the regexp 0*1+0*, given that zero masks
+// have already been filtered out.  Store the first set bit in LSB and
+// the number of set bits in Length if so.
+static bool isStringOfOnes(uint64_t Mask, unsigned &LSB, unsigned &Length) {
+  unsigned First = findFirstSet(Mask);
+  uint64_t Top = (Mask >> First) + 1;
+  if ((Top & -Top) == Top) {
+    LSB = First;
+    Length = findFirstSet(Top);
+    return true;
+  }
+  return false;
+}
+
+bool SystemZInstrInfo::isRxSBGMask(uint64_t Mask, unsigned BitSize,
+                                   unsigned &Start, unsigned &End) const {
+  // Reject trivial all-zero masks.
+  if (Mask == 0)
+    return false;
+
+  // Handle the 1+0+ or 0+1+0* cases.  Start then specifies the index of
+  // the msb and End specifies the index of the lsb.
+  unsigned LSB, Length;
+  if (isStringOfOnes(Mask, LSB, Length)) {
+    Start = 63 - (LSB + Length - 1);
+    End = 63 - LSB;
+    return true;
+  }
+
+  // Handle the wrap-around 1+0+1+ cases.  Start then specifies the msb
+  // of the low 1s and End specifies the lsb of the high 1s.
+  if (isStringOfOnes(Mask ^ allOnes(BitSize), LSB, Length)) {
+    assert(LSB > 0 && "Bottom bit must be set");
+    assert(LSB + Length < BitSize && "Top bit must be set");
+    Start = 63 - (LSB - 1);
+    End = 63 - (LSB + Length);
+    return true;
+  }
+
+  return false;
+}
+
+unsigned SystemZInstrInfo::getCompareAndBranch(unsigned Opcode,
+                                               const MachineInstr *MI) const {
+  switch (Opcode) {
+  case SystemZ::CR:
+    return SystemZ::CRJ;
+  case SystemZ::CGR:
+    return SystemZ::CGRJ;
+  case SystemZ::CHI:
+    return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CIJ : 0;
+  case SystemZ::CGHI:
+    return MI && isInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CGIJ : 0;
+  case SystemZ::CLR:
+    return SystemZ::CLRJ;
+  case SystemZ::CLGR:
+    return SystemZ::CLGRJ;
+  case SystemZ::CLFI:
+    return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLIJ : 0;
+  case SystemZ::CLGFI:
+    return MI && isUInt<8>(MI->getOperand(1).getImm()) ? SystemZ::CLGIJ : 0;
+  default:
+    return 0;
+  }
+}
+
 void SystemZInstrInfo::loadImmediate(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator MBBI,
                                      unsigned Reg, uint64_t Value) const {
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 0fc4761..be4c8fe 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -28,12 +28,31 @@ class SystemZTargetMachine;
 namespace SystemZII {
   enum {
     // See comments in SystemZInstrFormats.td.
-    SimpleBDXLoad  = (1 << 0),
-    SimpleBDXStore = (1 << 1),
-    Has20BitOffset = (1 << 2),
-    HasIndex       = (1 << 3),
-    Is128Bit       = (1 << 4)
+    SimpleBDXLoad          = (1 << 0),
+    SimpleBDXStore         = (1 << 1),
+    Has20BitOffset         = (1 << 2),
+    HasIndex               = (1 << 3),
+    Is128Bit               = (1 << 4),
+    AccessSizeMask         = (31 << 5),
+    AccessSizeShift        = 5,
+    CCValuesMask           = (15 << 10),
+    CCValuesShift          = 10,
+    CompareZeroCCMaskMask  = (15 << 14),
+    CompareZeroCCMaskShift = 14,
+    CCMaskFirst            = (1 << 18),
+    CCMaskLast             = (1 << 19),
+    IsLogical              = (1 << 20)
   };
+  static inline unsigned getAccessSize(unsigned int Flags) {
+    return (Flags & AccessSizeMask) >> AccessSizeShift;
+  }
+  static inline unsigned getCCValues(unsigned int Flags) {
+    return (Flags & CCValuesMask) >> CCValuesShift;
+  }
+  static inline unsigned getCompareZeroCCMask(unsigned int Flags) {
+    return (Flags & CompareZeroCCMaskMask) >> CompareZeroCCMaskShift;
+  }
+
   // SystemZ MachineOperand target flags.
   enum {
     // Masks out the bits for the access model.
@@ -42,14 +61,74 @@ namespace SystemZII {
     // @GOT (aka @GOTENT)
     MO_GOT = (1 << 0)
   };
+  // Classifies a branch.
+  enum BranchType {
+    // An instruction that branches on the current value of CC.
+    BranchNormal,
+
+    // An instruction that peforms a 32-bit signed comparison and branches
+    // on the result.
+    BranchC,
+
+    // An instruction that peforms a 32-bit unsigned comparison and branches
+    // on the result.
+    BranchCL,
+
+    // An instruction that peforms a 64-bit signed comparison and branches
+    // on the result.
+    BranchCG,
+
+    // An instruction that peforms a 64-bit unsigned comparison and branches
+    // on the result.
+    BranchCLG,
+
+    // An instruction that decrements a 32-bit register and branches if
+    // the result is nonzero.
+    BranchCT,
+
+    // An instruction that decrements a 64-bit register and branches if
+    // the result is nonzero.
+    BranchCTG
+  };
+  // Information about a branch instruction.
+  struct Branch {
+    // The type of the branch.
+    BranchType Type;
+
+    // CCMASK_<N> is set if CC might be equal to N.
+    unsigned CCValid;
+
+    // CCMASK_<N> is set if the branch should be taken when CC == N.
+    unsigned CCMask;
+
+    // The target of the branch.
+    const MachineOperand *Target;
+
+    Branch(BranchType type, unsigned ccValid, unsigned ccMask,
+           const MachineOperand *target)
+      : Type(type), CCValid(ccValid), CCMask(ccMask), Target(target) {}
+  };
 }
 
 class SystemZInstrInfo : public SystemZGenInstrInfo {
   const SystemZRegisterInfo RI;
+  SystemZTargetMachine &TM;
 
   void splitMove(MachineBasicBlock::iterator MI, unsigned NewOpcode) const;
   void splitAdjDynAlloc(MachineBasicBlock::iterator MI) const;
-
+  void expandRIPseudo(MachineInstr *MI, unsigned LowOpcode,
+                      unsigned HighOpcode, bool ConvertHigh) const;
+  void expandRIEPseudo(MachineInstr *MI, unsigned LowOpcode,
+                       unsigned LowOpcodeK, unsigned HighOpcode) const;
+  void expandRXYPseudo(MachineInstr *MI, unsigned LowOpcode,
+                       unsigned HighOpcode) const;
+  void expandZExtPseudo(MachineInstr *MI, unsigned LowOpcode,
+                        unsigned Size) const;
+  void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                     unsigned LowLowOpcode, unsigned Size, bool KillSrc) const;
+  virtual void anchor();
+  
 public:
   explicit SystemZInstrInfo(SystemZTargetMachine &TM);
 
@@ -58,6 +137,8 @@ public:
                                        int &FrameIndex) const LLVM_OVERRIDE;
   virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
                                       int &FrameIndex) const LLVM_OVERRIDE;
+  virtual bool isStackSlotCopy(const MachineInstr *MI, int &DestFrameIndex,
+                               int &SrcFrameIndex) const LLVM_OVERRIDE;
   virtual bool AnalyzeBranch(MachineBasicBlock &MBB,
                              MachineBasicBlock *&TBB,
                              MachineBasicBlock *&FBB,
@@ -68,6 +149,29 @@ public:
                                 MachineBasicBlock *FBB,
                                 const SmallVectorImpl<MachineOperand> &Cond,
                                 DebugLoc DL) const LLVM_OVERRIDE;
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &Mask, int &Value) const
+    LLVM_OVERRIDE;
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const LLVM_OVERRIDE;
+  virtual bool isPredicable(MachineInstr *MI) const LLVM_OVERRIDE;
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                                   unsigned ExtraPredCycles,
+                                   const BranchProbability &Probability) const
+    LLVM_OVERRIDE;
+  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                                   unsigned NumCyclesT,
+                                   unsigned ExtraPredCyclesT,
+                                   MachineBasicBlock &FMBB,
+                                   unsigned NumCyclesF,
+                                   unsigned ExtraPredCyclesF,
+                                   const BranchProbability &Probability) const
+    LLVM_OVERRIDE;
+  virtual bool
+    PredicateInstruction(MachineInstr *MI,
+                         const SmallVectorImpl<MachineOperand> &Pred) const
+    LLVM_OVERRIDE;
   virtual void copyPhysReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, DebugLoc DL,
                            unsigned DestReg, unsigned SrcReg,
@@ -84,6 +188,18 @@ public:
                          unsigned DestReg, int FrameIdx,
                          const TargetRegisterClass *RC,
                          const TargetRegisterInfo *TRI) const LLVM_OVERRIDE;
+  virtual MachineInstr *
+    convertToThreeAddress(MachineFunction::iterator &MFI,
+                          MachineBasicBlock::iterator &MBBI,
+                          LiveVariables *LV) const;
+  virtual MachineInstr *
+    foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                          const SmallVectorImpl<unsigned> &Ops,
+                          int FrameIndex) const;
+  virtual MachineInstr *
+    foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
+                          const SmallVectorImpl<unsigned> &Ops,
+                          MachineInstr* LoadMI) const;
   virtual bool
     expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const LLVM_OVERRIDE;
   virtual bool
@@ -93,13 +209,15 @@ public:
   // Return the SystemZRegisterInfo, which this class owns.
   const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
 
+  // Return the size in bytes of MI.
+  uint64_t getInstSizeInBytes(const MachineInstr *MI) const;
+
   // Return true if MI is a conditional or unconditional branch.
   // When returning true, set Cond to the mask of condition-code
   // values on which the instruction will branch, and set Target
   // to the operand that contains the branch target.  This target
   // can be a register or a basic block.
-  bool isBranch(const MachineInstr *MI, unsigned &Cond,
-                const MachineOperand *&Target) const;
+  SystemZII::Branch getBranchInfo(const MachineInstr *MI) const;
 
   // Get the load and store opcodes for a given register class.
   void getLoadStoreOpcodes(const TargetRegisterClass *RC,
@@ -112,6 +230,22 @@ public:
   // exists.
   unsigned getOpcodeForOffset(unsigned Opcode, int64_t Offset) const;
 
+  // If Opcode is a load instruction that has a LOAD AND TEST form,
+  // return the opcode for the testing form, otherwise return 0.
+  unsigned getLoadAndTest(unsigned Opcode) const;
+
+  // Return true if ROTATE AND ... SELECTED BITS can be used to select bits
+  // Mask of the R2 operand, given that only the low BitSize bits of Mask are
+  // significant.  Set Start and End to the I3 and I4 operands if so.
+  bool isRxSBGMask(uint64_t Mask, unsigned BitSize,
+                   unsigned &Start, unsigned &End) const;
+
+  // If Opcode is a COMPARE opcode for which an associated COMPARE AND
+  // BRANCH exists, return the opcode for the latter, otherwise return 0.
+  // MI, if nonnull, is the compare instruction.
+  unsigned getCompareAndBranch(unsigned Opcode,
+                               const MachineInstr *MI = 0) const;
+
   // Emit code before MBBI in MI to move immediate value Value into
   // physical register Reg.
   void loadImmediate(MachineBasicBlock &MBB,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
index 7ffa382..6524e44 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -32,77 +32,202 @@ let neverHasSideEffects = 1 in {
 // Control flow instructions
 //===----------------------------------------------------------------------===//
 
-// A return instruction.  R1 is the condition-code mask (all 1s)
-// and R2 is the target address, which is always stored in %r14.
-let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1,
-    R1 = 15, R2 = 14, isCodeGenOnly = 1 in {
-  def RET : InstRR<0x07, (outs), (ins), "br\t%r14", [(z_retflag)]>;
-}
+// A return instruction (br %r14).
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in
+  def Return : Alias<2, (outs), (ins), [(z_retflag)]>;
 
 // Unconditional branches.  R1 is the condition-code mask (all 1s).
 let isBranch = 1, isTerminator = 1, isBarrier = 1, R1 = 15 in {
   let isIndirectBranch = 1 in
-    def BR : InstRR<0x07, (outs), (ins ADDR64:$dst),
-                    "br\t$dst", [(brind ADDR64:$dst)]>;
+    def BR : InstRR<0x07, (outs), (ins ADDR64:$R2),
+                    "br\t$R2", [(brind ADDR64:$R2)]>;
 
-  // An assembler extended mnemonic for BRC.  Use a separate instruction for
-  // the asm parser, so that we don't relax Js to external symbols into JGs.
-  let isCodeGenOnly = 1 in
-    def J : InstRI<0xA74, (outs), (ins brtarget16:$dst), "j\t$dst", []>;
-  let isAsmParserOnly = 1 in
-    def AsmJ : InstRI<0xA74, (outs), (ins brtarget16:$dst), "j\t$dst", []>;
+  // An assembler extended mnemonic for BRC.
+  def J : InstRI<0xA74, (outs), (ins brtarget16:$I2), "j\t$I2",
+                 [(br bb:$I2)]>;
 
   // An assembler extended mnemonic for BRCL.  (The extension is "G"
   // rather than "L" because "JL" is "Jump if Less".)
-  def JG : InstRIL<0xC04, (outs), (ins brtarget32:$dst),
-                   "jg\t$dst", [(br bb:$dst)]>;
+  def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2), "jg\t$I2", []>;
 }
 
 // Conditional branches.  It's easier for LLVM to handle these branches
 // in their raw BRC/BRCL form, with the 4-bit condition-code mask being
 // the first operand.  It seems friendlier to use mnemonic forms like
 // JE and JLH when writing out the assembly though.
-multiclass CondBranches<Operand imm, string short, string long> {
-  let isBranch = 1, isTerminator = 1, Uses = [PSW] in {
-    def "" : InstRI<0xA74, (outs), (ins imm:$cond, brtarget16:$dst), short, []>;
-    def L  : InstRIL<0xC04, (outs), (ins imm:$cond, brtarget32:$dst), long, []>;
+let isBranch = 1, isTerminator = 1, Uses = [CC] in {
+  let isCodeGenOnly = 1, CCMaskFirst = 1 in {
+    def BRC : InstRI<0xA74, (outs), (ins cond4:$valid, cond4:$R1,
+                                         brtarget16:$I2), "j$R1\t$I2",
+                     [(z_br_ccmask cond4:$valid, cond4:$R1, bb:$I2)]>;
+    def BRCL : InstRIL<0xC04, (outs), (ins cond4:$valid, cond4:$R1,
+                                           brtarget32:$I2), "jg$R1\t$I2", []>;
+  }
+  def AsmBRC : InstRI<0xA74, (outs), (ins uimm8zx4:$R1, brtarget16:$I2),
+                      "brc\t$R1, $I2", []>;
+  def AsmBRCL : InstRIL<0xC04, (outs), (ins uimm8zx4:$R1, brtarget32:$I2),
+                        "brcl\t$R1, $I2", []>;
+  def AsmBCR : InstRR<0x07, (outs), (ins uimm8zx4:$R1, GR64:$R2),
+                      "bcr\t$R1, $R2", []>;
+}
+
+// Fused compare-and-branch instructions.  As for normal branches,
+// we handle these instructions internally in their raw CRJ-like form,
+// but use assembly macros like CRJE when writing them out.
+//
+// These instructions do not use or clobber the condition codes.
+// We nevertheless pretend that they clobber CC, so that we can lower
+// them to separate comparisons and BRCLs if the branch ends up being
+// out of range.
+multiclass CompareBranches<Operand ccmask, string pos1, string pos2> {
+  let isBranch = 1, isTerminator = 1, Defs = [CC] in {
+    def RJ  : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "crj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def GRJ : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "cgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def IJ  : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "cij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def GIJ : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2, ccmask:$M3,
+                                            brtarget16:$RI4),
+                       "cgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def LRJ  : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def LGRJ : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clgrj"##pos1##"\t$R1, $R2, "##pos2##"$RI4", []>;
+    def LIJ  : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
+    def LGIJ : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2, ccmask:$M3,
+                                             brtarget16:$RI4),
+                        "clgij"##pos1##"\t$R1, $I2, "##pos2##"$RI4", []>;
   }
 }
 let isCodeGenOnly = 1 in
-  defm BRC : CondBranches<cond4, "j$cond\t$dst", "jg$cond\t$dst">;
-let isAsmParserOnly = 1 in
-  defm AsmBRC : CondBranches<uimm8zx4, "brc\t$cond, $dst", "brcl\t$cond, $dst">;
-
-def : Pat<(z_br_ccmask cond4:$cond, bb:$dst), (BRCL cond4:$cond, bb:$dst)>;
-
-// Define AsmParser mnemonics for each condition code.
-multiclass CondExtendedMnemonic<bits<4> Cond, string name> {
-  let R1 = Cond in {
-    def "" : InstRI<0xA74, (outs), (ins brtarget16:$dst),
-                    "j"##name##"\t$dst", []>;
-    def L  : InstRIL<0xC04, (outs), (ins brtarget32:$dst),
-                    "jg"##name##"\t$dst", []>;
+  defm C : CompareBranches<cond4, "$M3", "">;
+defm AsmC : CompareBranches<uimm8zx4, "", "$M3, ">;
+
+// Define AsmParser mnemonics for each general condition-code mask
+// (integer or floating-point)
+multiclass CondExtendedMnemonic<bits<4> ccmask, string name> {
+  let R1 = ccmask in {
+    def J : InstRI<0xA74, (outs), (ins brtarget16:$I2),
+                   "j"##name##"\t$I2", []>;
+    def JG : InstRIL<0xC04, (outs), (ins brtarget32:$I2),
+                     "jg"##name##"\t$I2", []>;
+    def BR : InstRR<0x07, (outs), (ins ADDR64:$R2), "b"##name##"r\t$R2", []>;
+  }
+  def LOCR  : FixedCondUnaryRRF<"locr"##name,  0xB9F2, GR32, GR32, ccmask>;
+  def LOCGR : FixedCondUnaryRRF<"locgr"##name, 0xB9E2, GR64, GR64, ccmask>;
+  def LOC   : FixedCondUnaryRSY<"loc"##name,   0xEBF2, GR32, ccmask, 4>;
+  def LOCG  : FixedCondUnaryRSY<"locg"##name,  0xEBE2, GR64, ccmask, 8>;
+  def STOC  : FixedCondStoreRSY<"stoc"##name,  0xEBF3, GR32, ccmask, 4>;
+  def STOCG : FixedCondStoreRSY<"stocg"##name, 0xEBE3, GR64, ccmask, 8>;
+}
+defm AsmO   : CondExtendedMnemonic<1,  "o">;
+defm AsmH   : CondExtendedMnemonic<2,  "h">;
+defm AsmNLE : CondExtendedMnemonic<3,  "nle">;
+defm AsmL   : CondExtendedMnemonic<4,  "l">;
+defm AsmNHE : CondExtendedMnemonic<5,  "nhe">;
+defm AsmLH  : CondExtendedMnemonic<6,  "lh">;
+defm AsmNE  : CondExtendedMnemonic<7,  "ne">;
+defm AsmE   : CondExtendedMnemonic<8,  "e">;
+defm AsmNLH : CondExtendedMnemonic<9,  "nlh">;
+defm AsmHE  : CondExtendedMnemonic<10, "he">;
+defm AsmNL  : CondExtendedMnemonic<11, "nl">;
+defm AsmLE  : CondExtendedMnemonic<12, "le">;
+defm AsmNH  : CondExtendedMnemonic<13, "nh">;
+defm AsmNO  : CondExtendedMnemonic<14, "no">;
+
+// Define AsmParser mnemonics for each integer condition-code mask.
+// This is like the list above, except that condition 3 is not possible
+// and that the low bit of the mask is therefore always 0.  This means
+// that each condition has two names.  Conditions "o" and "no" are not used.
+//
+// We don't make one of the two names an alias of the other because
+// we need the custom parsing routines to select the correct register class.
+multiclass IntCondExtendedMnemonicA<bits<4> ccmask, string name> {
+  let M3 = ccmask in {
+    def CR  : InstRIEb<0xEC76, (outs), (ins GR32:$R1, GR32:$R2,
+                                            brtarget16:$RI4),
+                       "crj"##name##"\t$R1, $R2, $RI4", []>;
+    def CGR : InstRIEb<0xEC64, (outs), (ins GR64:$R1, GR64:$R2,
+                                            brtarget16:$RI4),
+                       "cgrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CI  : InstRIEc<0xEC7E, (outs), (ins GR32:$R1, imm32sx8:$I2,
+                                            brtarget16:$RI4),
+                       "cij"##name##"\t$R1, $I2, $RI4", []>;
+    def CGI : InstRIEc<0xEC7C, (outs), (ins GR64:$R1, imm64sx8:$I2,
+                                            brtarget16:$RI4),
+                       "cgij"##name##"\t$R1, $I2, $RI4", []>;
+    def CLR  : InstRIEb<0xEC77, (outs), (ins GR32:$R1, GR32:$R2,
+                                            brtarget16:$RI4),
+                        "clrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CLGR : InstRIEb<0xEC65, (outs), (ins GR64:$R1, GR64:$R2,
+                                             brtarget16:$RI4),
+                        "clgrj"##name##"\t$R1, $R2, $RI4", []>;
+    def CLI  : InstRIEc<0xEC7F, (outs), (ins GR32:$R1, imm32zx8:$I2,
+                                             brtarget16:$RI4),
+                        "clij"##name##"\t$R1, $I2, $RI4", []>;
+    def CLGI : InstRIEc<0xEC7D, (outs), (ins GR64:$R1, imm64zx8:$I2,
+                                             brtarget16:$RI4),
+                        "clgij"##name##"\t$R1, $I2, $RI4", []>;
   }
 }
-let isAsmParserOnly = 1 in {
-  defm AsmJO   : CondExtendedMnemonic<1,  "o">;
-  defm AsmJH   : CondExtendedMnemonic<2,  "h">;
-  defm AsmJNLE : CondExtendedMnemonic<3,  "nle">;
-  defm AsmJL   : CondExtendedMnemonic<4,  "l">;
-  defm AsmJNHE : CondExtendedMnemonic<5,  "nhe">;
-  defm AsmJLH  : CondExtendedMnemonic<6,  "lh">;
-  defm AsmJNE  : CondExtendedMnemonic<7,  "ne">;
-  defm AsmJE   : CondExtendedMnemonic<8,  "e">;
-  defm AsmJNLH : CondExtendedMnemonic<9,  "nlh">;
-  defm AsmJHE  : CondExtendedMnemonic<10, "he">;
-  defm AsmJNL  : CondExtendedMnemonic<11, "nl">;
-  defm AsmJLE  : CondExtendedMnemonic<12, "le">;
-  defm AsmJNH  : CondExtendedMnemonic<13, "nh">;
-  defm AsmJNO  : CondExtendedMnemonic<14, "no">;
+multiclass IntCondExtendedMnemonic<bits<4> ccmask, string name1, string name2>
+  : IntCondExtendedMnemonicA<ccmask, name1> {
+  let isAsmParserOnly = 1 in
+    defm Alt : IntCondExtendedMnemonicA<ccmask, name2>;
+}
+defm AsmJH   : IntCondExtendedMnemonic<2,  "h",  "nle">;
+defm AsmJL   : IntCondExtendedMnemonic<4,  "l",  "nhe">;
+defm AsmJLH  : IntCondExtendedMnemonic<6,  "lh", "ne">;
+defm AsmJE   : IntCondExtendedMnemonic<8,  "e",  "nlh">;
+defm AsmJHE  : IntCondExtendedMnemonic<10, "he", "nl">;
+defm AsmJLE  : IntCondExtendedMnemonic<12, "le", "nh">;
+
+// Decrement a register and branch if it is nonzero.  These don't clobber CC,
+// but we might need to split long branches into sequences that do.
+let Defs = [CC] in {
+  def BRCT  : BranchUnaryRI<"brct",  0xA76, GR32>;
+  def BRCTG : BranchUnaryRI<"brctg", 0xA77, GR64>;
 }
 
-def Select32 : SelectWrapper<GR32>;
-def Select64 : SelectWrapper<GR64>;
+//===----------------------------------------------------------------------===//
+// Select instructions
+//===----------------------------------------------------------------------===//
+
+def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>;
+def Select32    : SelectWrapper<GR32>;
+def Select64    : SelectWrapper<GR64>;
+
+// We don't define 32-bit Mux stores because the low-only STOC should
+// always be used if possible.
+defm CondStore8Mux  : CondStores<GRX32, nonvolatile_truncstorei8,
+                                 nonvolatile_anyextloadi8, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore16Mux : CondStores<GRX32, nonvolatile_truncstorei16,
+                                 nonvolatile_anyextloadi16, bdxaddr20only>,
+                      Requires<[FeatureHighWord]>;
+defm CondStore8     : CondStores<GR32, nonvolatile_truncstorei8,
+                                 nonvolatile_anyextloadi8, bdxaddr20only>;
+defm CondStore16    : CondStores<GR32, nonvolatile_truncstorei16,
+                                 nonvolatile_anyextloadi16, bdxaddr20only>;
+defm CondStore32    : CondStores<GR32, nonvolatile_store,
+                                 nonvolatile_load, bdxaddr20only>;
+
+defm : CondStores64<CondStore8, CondStore8Inv, nonvolatile_truncstorei8,
+                    nonvolatile_anyextloadi8, bdxaddr20only>;
+defm : CondStores64<CondStore16, CondStore16Inv, nonvolatile_truncstorei16,
+                    nonvolatile_anyextloadi16, bdxaddr20only>;
+defm : CondStores64<CondStore32, CondStore32Inv, nonvolatile_truncstorei32,
+                    nonvolatile_anyextloadi32, bdxaddr20only>;
+defm CondStore64 : CondStores<GR64, nonvolatile_store,
+                              nonvolatile_load, bdxaddr20only>;
 
 //===----------------------------------------------------------------------===//
 // Call instructions
@@ -110,26 +235,30 @@ def Select64 : SelectWrapper<GR64>;
 
 // The definitions here are for the call-clobbered registers.
 let isCall = 1, Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D,
-                        F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D],
-    R1 = 14, isCodeGenOnly = 1 in {
-  def BRAS  : InstRI<0xA75, (outs), (ins pcrel16call:$dst, variable_ops),
-                     "bras\t%r14, $dst", []>;
-  def BRASL : InstRIL<0xC05, (outs), (ins pcrel32call:$dst, variable_ops),
-                      "brasl\t%r14, $dst", [(z_call pcrel32call:$dst)]>;
-  def BASR  : InstRR<0x0D, (outs), (ins ADDR64:$dst, variable_ops),
-                     "basr\t%r14, $dst", [(z_call ADDR64:$dst)]>;
+                        F0D, F1D, F2D, F3D, F4D, F5D, F6D, F7D, CC] in {
+  def CallBRASL : Alias<6, (outs), (ins pcrel32:$I2, variable_ops),
+                        [(z_call pcrel32:$I2)]>;
+  def CallBASR  : Alias<2, (outs), (ins ADDR64:$R2, variable_ops),
+                        [(z_call ADDR64:$R2)]>;
+}
+
+// Sibling calls.  Indirect sibling calls must be via R1, since R2 upwards
+// are argument registers and since branching to R0 is a no-op.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
+  def CallJG : Alias<6, (outs), (ins pcrel32:$I2),
+                     [(z_sibcall pcrel32:$I2)]>;
+  let Uses = [R1D] in
+    def CallBR : Alias<2, (outs), (ins), [(z_sibcall R1D)]>;
 }
 
 // Define the general form of the call instructions for the asm parser.
 // These instructions don't hard-code %r14 as the return address register.
-let isAsmParserOnly = 1 in {
-  def AsmBRAS  : InstRI<0xA75, (outs), (ins GR64:$save, brtarget16:$dst),
-                        "bras\t$save, $dst", []>;
-  def AsmBRASL : InstRIL<0xC05, (outs), (ins GR64:$save, brtarget32:$dst),
-                        "brasl\t$save, $dst", []>;
-  def AsmBASR  : InstRR<0x0D, (outs), (ins GR64:$save, ADDR64:$dst),
-                        "basr\t$save, $dst", []>;
-}
+def BRAS  : InstRI<0xA75, (outs), (ins GR64:$R1, brtarget16:$I2),
+                   "bras\t$R1, $I2", []>;
+def BRASL : InstRIL<0xC05, (outs), (ins GR64:$R1, brtarget32:$I2),
+                    "brasl\t$R1, $I2", []>;
+def BASR  : InstRR<0x0D, (outs), (ins GR64:$R1, ADDR64:$R2),
+                   "basr\t$R1, $R2", []>;
 
 //===----------------------------------------------------------------------===//
 // Move instructions
@@ -137,13 +266,34 @@ let isAsmParserOnly = 1 in {
 
 // Register moves.
 let neverHasSideEffects = 1 in {
-  def LR  : UnaryRR <"lr",  0x18,   null_frag, GR32, GR32>;
-  def LGR : UnaryRRE<"lgr", 0xB904, null_frag, GR64, GR64>;
+  // Expands to LR, RISBHG or RISBLG, depending on the choice of registers.
+  def LRMux : UnaryRRPseudo<"l", null_frag, GRX32, GRX32>,
+              Requires<[FeatureHighWord]>;
+  def LR  : UnaryRR <"l",  0x18,   null_frag, GR32, GR32>;
+  def LGR : UnaryRRE<"lg", 0xB904, null_frag, GR64, GR64>;
+}
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  def LTR  : UnaryRR <"lt",  0x12,   null_frag, GR32, GR32>;
+  def LTGR : UnaryRRE<"ltg", 0xB902, null_frag, GR64, GR64>;
+}
+
+// Move on condition.
+let isCodeGenOnly = 1, Uses = [CC] in {
+  def LOCR  : CondUnaryRRF<"loc",  0xB9F2, GR32, GR32>;
+  def LOCGR : CondUnaryRRF<"locg", 0xB9E2, GR64, GR64>;
+}
+let Uses = [CC] in {
+  def AsmLOCR  : AsmCondUnaryRRF<"loc",  0xB9F2, GR32, GR32>;
+  def AsmLOCGR : AsmCondUnaryRRF<"locg", 0xB9E2, GR64, GR64>;
 }
 
 // Immediate moves.
-let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
-  // 16-bit sign-extended immediates.
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    isReMaterializable = 1 in {
+  // 16-bit sign-extended immediates.  LHIMux expands to LHI or IIHF,
+  // deopending on the choice of register.
+  def LHIMux : UnaryRIPseudo<bitconvert, GRX32, imm32sx16>,
+               Requires<[FeatureHighWord]>;
   def LHI  : UnaryRI<"lhi",  0xA78, bitconvert, GR32, imm32sx16>;
   def LGHI : UnaryRI<"lghi", 0xA79, bitconvert, GR64, imm64sx16>;
 
@@ -161,11 +311,13 @@ let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
 
 // Register loads.
 let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
-  defm L   : UnaryRXPair<"l", 0x58, 0xE358, load, GR32>;
-  def  LRL : UnaryRILPC<"lrl", 0xC4D, aligned_load, GR32>;
-
-  def LG   : UnaryRXY<"lg", 0xE304, load, GR64>;
-  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+  // Expands to L, LY or LFH, depending on the choice of register.
+  def LMux : UnaryRXYPseudo<"l", load, GRX32, 4>,
+             Requires<[FeatureHighWord]>;
+  defm L : UnaryRXPair<"l", 0x58, 0xE358, load, GR32, 4>;
+  def LFH : UnaryRXY<"lfh", 0xE3CA, load, GRH32, 4>,
+            Requires<[FeatureHighWord]>;
+  def LG : UnaryRXY<"lg", 0xE304, load, GR64, 8>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -174,16 +326,35 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
                       [(set GR128:$dst, (load bdxaddr20only128:$src))]>;
   }
 }
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  def LT  : UnaryRXY<"lt",  0xE312, load, GR32, 4>;
+  def LTG : UnaryRXY<"ltg", 0xE302, load, GR64, 8>;
+}
+
+let canFoldAsLoad = 1 in {
+  def LRL  : UnaryRILPC<"lrl",  0xC4D, aligned_load, GR32>;
+  def LGRL : UnaryRILPC<"lgrl", 0xC48, aligned_load, GR64>;
+}
+
+// Load on condition.
+let isCodeGenOnly = 1, Uses = [CC] in {
+  def LOC  : CondUnaryRSY<"loc",  0xEBF2, nonvolatile_load, GR32, 4>;
+  def LOCG : CondUnaryRSY<"locg", 0xEBE2, nonvolatile_load, GR64, 8>;
+}
+let Uses = [CC] in {
+  def AsmLOC  : AsmCondUnaryRSY<"loc",  0xEBF2, GR32, 4>;
+  def AsmLOCG : AsmCondUnaryRSY<"locg", 0xEBE2, GR64, 8>;
+}
 
 // Register stores.
 let SimpleBDXStore = 1 in {
-  let isCodeGenOnly = 1 in {
-    defm ST32   : StoreRXPair<"st", 0x50, 0xE350, store, GR32>;
-    def  STRL32 : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
-  }
-
-  def STG   : StoreRXY<"stg", 0xE324, store, GR64>;
-  def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+  // Expands to ST, STY or STFH, depending on the choice of register.
+  def STMux : StoreRXYPseudo<store, GRX32, 4>,
+              Requires<[FeatureHighWord]>;
+  defm ST : StoreRXPair<"st", 0x50, 0xE350, store, GR32, 4>;
+  def STFH : StoreRXY<"stfh", 0xE3CB, store, GRH32, 4>,
+             Requires<[FeatureHighWord]>;
+  def STG : StoreRXY<"stg", 0xE324, store, GR64, 8>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -192,6 +363,18 @@ let SimpleBDXStore = 1 in {
                        [(store GR128:$src, bdxaddr20only128:$dst)]>;
   }
 }
+def STRL  : StoreRILPC<"strl", 0xC4F, aligned_store, GR32>;
+def STGRL : StoreRILPC<"stgrl", 0xC4B, aligned_store, GR64>;
+
+// Store on condition.
+let isCodeGenOnly = 1, Uses = [CC] in {
+  def STOC  : CondStoreRSY<"stoc",  0xEBF3, GR32, 4>;
+  def STOCG : CondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
+}
+let Uses = [CC] in {
+  def AsmSTOC  : AsmCondStoreRSY<"stoc",  0xEBF3, GR32, 4>;
+  def AsmSTOCG : AsmCondStoreRSY<"stocg", 0xEBE3, GR64, 8>;
+}
 
 // 8-bit immediate stores to 8-bit fields.
 defm MVI : StoreSIPair<"mvi", 0x92, 0xEB52, truncstorei8, imm32zx8trunc>;
@@ -201,50 +384,70 @@ def MVHHI : StoreSIL<"mvhhi", 0xE544, truncstorei16, imm32sx16trunc>;
 def MVHI  : StoreSIL<"mvhi",  0xE54C, store,         imm32sx16>;
 def MVGHI : StoreSIL<"mvghi", 0xE548, store,         imm64sx16>;
 
+// Memory-to-memory moves.
+let mayLoad = 1, mayStore = 1 in
+  defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
+
+// String moves.
+let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0L] in
+  defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
+
 //===----------------------------------------------------------------------===//
 // Sign extensions
 //===----------------------------------------------------------------------===//
+//
+// Note that putting these before zero extensions mean that we will prefer
+// them for anyextload*.  There's not really much to choose between the two
+// either way, but signed-extending loads have a short LH and a long LHY,
+// while zero-extending loads have only the long LLH.
+//
+//===----------------------------------------------------------------------===//
 
 // 32-bit extensions from registers.
 let neverHasSideEffects = 1 in {
-  def LBR : UnaryRRE<"lbr", 0xB926, sext8,  GR32, GR32>;
-  def LHR : UnaryRRE<"lhr", 0xB927, sext16, GR32, GR32>;
+  def LBR : UnaryRRE<"lb", 0xB926, sext8,  GR32, GR32>;
+  def LHR : UnaryRRE<"lh", 0xB927, sext16, GR32, GR32>;
 }
 
 // 64-bit extensions from registers.
 let neverHasSideEffects = 1 in {
-  def LGBR : UnaryRRE<"lgbr", 0xB906, sext8,  GR64, GR64>;
-  def LGHR : UnaryRRE<"lghr", 0xB907, sext16, GR64, GR64>;
-  def LGFR : UnaryRRE<"lgfr", 0xB914, sext32, GR64, GR32>;
+  def LGBR : UnaryRRE<"lgb", 0xB906, sext8,  GR64, GR64>;
+  def LGHR : UnaryRRE<"lgh", 0xB907, sext16, GR64, GR64>;
+  def LGFR : UnaryRRE<"lgf", 0xB914, sext32, GR64, GR32>;
 }
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+  def LTGFR : UnaryRRE<"ltgf", 0xB912, null_frag, GR64, GR64>;
 
 // Match 32-to-64-bit sign extensions in which the source is already
 // in a 64-bit register.
 def : Pat<(sext_inreg GR64:$src, i32),
-          (LGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
-
-// 32-bit extensions from memory.
-def  LB   : UnaryRXY<"lb", 0xE376, sextloadi8, GR32>;
-defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, sextloadi16, GR32>;
-def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_sextloadi16, GR32>;
+          (LGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory.  LBMux expands to LB or LBH,
+// depending on the choice of register.
+def LBMux : UnaryRXYPseudo<"lb", asextloadi8, GRX32, 1>,
+            Requires<[FeatureHighWord]>;
+def LB  : UnaryRXY<"lb", 0xE376, asextloadi8, GR32, 1>;
+def LBH : UnaryRXY<"lbh", 0xE3C0, asextloadi8, GRH32, 1>,
+          Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LHMux expands to LH or LHH,
+// depending on the choice of register.
+def LHMux : UnaryRXYPseudo<"lh", asextloadi16, GRX32, 2>,
+            Requires<[FeatureHighWord]>;
+defm LH   : UnaryRXPair<"lh", 0x48, 0xE378, asextloadi16, GR32, 2>;
+def  LHH  : UnaryRXY<"lhh", 0xE3C4, asextloadi16, GRH32, 2>,
+            Requires<[FeatureHighWord]>;
+def  LHRL : UnaryRILPC<"lhrl", 0xC45, aligned_asextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LGB   : UnaryRXY<"lgb", 0xE377, sextloadi8,  GR64>;
-def LGH   : UnaryRXY<"lgh", 0xE315, sextloadi16, GR64>;
-def LGF   : UnaryRXY<"lgf", 0xE314, sextloadi32, GR64>;
-def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_sextloadi16, GR64>;
-def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_sextloadi32, GR64>;
-
-// If the sign of a load-extend operation doesn't matter, use the signed ones.
-// There's not really much to choose between the sign and zero extensions,
-// but LH is more compact than LLH for small offsets.
-def : Pat<(i32 (extloadi8  bdxaddr20only:$src)), (LB  bdxaddr20only:$src)>;
-def : Pat<(i32 (extloadi16 bdxaddr12pair:$src)), (LH  bdxaddr12pair:$src)>;
-def : Pat<(i32 (extloadi16 bdxaddr20pair:$src)), (LHY bdxaddr20pair:$src)>;
-
-def : Pat<(i64 (extloadi8  bdxaddr20only:$src)), (LGB bdxaddr20only:$src)>;
-def : Pat<(i64 (extloadi16 bdxaddr20only:$src)), (LGH bdxaddr20only:$src)>;
-def : Pat<(i64 (extloadi32 bdxaddr20only:$src)), (LGF bdxaddr20only:$src)>;
+def LGB   : UnaryRXY<"lgb", 0xE377, asextloadi8,  GR64, 1>;
+def LGH   : UnaryRXY<"lgh", 0xE315, asextloadi16, GR64, 2>;
+def LGF   : UnaryRXY<"lgf", 0xE314, asextloadi32, GR64, 4>;
+def LGHRL : UnaryRILPC<"lghrl", 0xC44, aligned_asextloadi16, GR64>;
+def LGFRL : UnaryRILPC<"lgfrl", 0xC4C, aligned_asextloadi32, GR64>;
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in
+  def LTGF : UnaryRXY<"ltgf", 0xE332, asextloadi32, GR64, 4>;
 
 //===----------------------------------------------------------------------===//
 // Zero extensions
@@ -252,33 +455,51 @@ def : Pat<(i64 (extloadi32 bdxaddr20only:$src)), (LGF bdxaddr20only:$src)>;
 
 // 32-bit extensions from registers.
 let neverHasSideEffects = 1 in {
-  def LLCR : UnaryRRE<"llcr", 0xB994, zext8,  GR32, GR32>;
-  def LLHR : UnaryRRE<"llhr", 0xB995, zext16, GR32, GR32>;
+  // Expands to LLCR or RISB[LH]G, depending on the choice of registers.
+  def LLCRMux : UnaryRRPseudo<"llc", zext8, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLCR    : UnaryRRE<"llc", 0xB994, zext8,  GR32, GR32>;
+  // Expands to LLHR or RISB[LH]G, depending on the choice of registers.
+  def LLHRMux : UnaryRRPseudo<"llh", zext16, GRX32, GRX32>,
+                Requires<[FeatureHighWord]>;
+  def LLHR    : UnaryRRE<"llh", 0xB995, zext16, GR32, GR32>;
 }
 
 // 64-bit extensions from registers.
 let neverHasSideEffects = 1 in {
-  def LLGCR : UnaryRRE<"llgcr", 0xB984, zext8,  GR64, GR64>;
-  def LLGHR : UnaryRRE<"llghr", 0xB985, zext16, GR64, GR64>;
-  def LLGFR : UnaryRRE<"llgfr", 0xB916, zext32, GR64, GR32>;
+  def LLGCR : UnaryRRE<"llgc", 0xB984, zext8,  GR64, GR64>;
+  def LLGHR : UnaryRRE<"llgh", 0xB985, zext16, GR64, GR64>;
+  def LLGFR : UnaryRRE<"llgf", 0xB916, zext32, GR64, GR32>;
 }
 
 // Match 32-to-64-bit zero extensions in which the source is already
 // in a 64-bit register.
 def : Pat<(and GR64:$src, 0xffffffff),
-          (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
-
-// 32-bit extensions from memory.
-def LLC   : UnaryRXY<"llc", 0xE394, zextloadi8,  GR32>;
-def LLH   : UnaryRXY<"llh", 0xE395, zextloadi16, GR32>;
-def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_zextloadi16, GR32>;
+          (LLGFR (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
+
+// 32-bit extensions from 8-bit memory.  LLCMux expands to LLC or LLCH,
+// depending on the choice of register.
+def LLCMux : UnaryRXYPseudo<"llc", azextloadi8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+def LLC  : UnaryRXY<"llc", 0xE394, azextloadi8, GR32, 1>;
+def LLCH : UnaryRXY<"llch", 0xE3C2, azextloadi8, GR32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// 32-bit extensions from 16-bit memory.  LLHMux expands to LLH or LLHH,
+// depending on the choice of register.
+def LLHMux : UnaryRXYPseudo<"llh", azextloadi16, GRX32, 2>,
+             Requires<[FeatureHighWord]>;
+def LLH   : UnaryRXY<"llh", 0xE395, azextloadi16, GR32, 2>;
+def LLHH  : UnaryRXY<"llhh", 0xE3C6, azextloadi16, GR32, 2>,
+            Requires<[FeatureHighWord]>;
+def LLHRL : UnaryRILPC<"llhrl", 0xC42, aligned_azextloadi16, GR32>;
 
 // 64-bit extensions from memory.
-def LLGC   : UnaryRXY<"llgc", 0xE390, zextloadi8,  GR64>;
-def LLGH   : UnaryRXY<"llgh", 0xE391, zextloadi16, GR64>;
-def LLGF   : UnaryRXY<"llgf", 0xE316, zextloadi32, GR64>;
-def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_zextloadi16, GR64>;
-def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_zextloadi32, GR64>;
+def LLGC   : UnaryRXY<"llgc", 0xE390, azextloadi8,  GR64, 1>;
+def LLGH   : UnaryRXY<"llgh", 0xE391, azextloadi16, GR64, 2>;
+def LLGF   : UnaryRXY<"llgf", 0xE316, azextloadi32, GR64, 4>;
+def LLGHRL : UnaryRILPC<"llghrl", 0xC46, aligned_azextloadi16, GR64>;
+def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_azextloadi32, GR64>;
 
 //===----------------------------------------------------------------------===//
 // Truncations
@@ -286,21 +507,31 @@ def LLGFRL : UnaryRILPC<"llgfrl", 0xC4E, aligned_zextloadi32, GR64>;
 
 // Truncations of 64-bit registers to 32-bit registers.
 def : Pat<(i32 (trunc GR64:$src)),
-          (EXTRACT_SUBREG GR64:$src, subreg_32bit)>;
-
-// Truncations of 32-bit registers to memory.
-let isCodeGenOnly = 1 in {
-  defm STC32   : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8,  GR32>;
-  defm STH32   : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32>;
-  def  STHRL32 : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
-}
+          (EXTRACT_SUBREG GR64:$src, subreg_l32)>;
+
+// Truncations of 32-bit registers to 8-bit memory.  STCMux expands to
+// STC, STCY or STCH, depending on the choice of register.
+def STCMux : StoreRXYPseudo<truncstorei8, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STC : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8, GR32, 1>;
+def STCH : StoreRXY<"stch", 0xE3C3, truncstorei8, GRH32, 1>,
+           Requires<[FeatureHighWord]>;
+
+// Truncations of 32-bit registers to 16-bit memory.  STHMux expands to
+// STH, STHY or STHH, depending on the choice of register.
+def STHMux : StoreRXYPseudo<truncstorei16, GRX32, 1>,
+             Requires<[FeatureHighWord]>;
+defm STH : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR32, 2>;
+def STHH : StoreRXY<"sthh", 0xE3C7, truncstorei16, GRH32, 2>,
+           Requires<[FeatureHighWord]>;
+def STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR32>;
 
 // Truncations of 64-bit registers to memory.
-defm STC   : StoreRXPair<"stc", 0x42, 0xE372, truncstorei8,  GR64>;
-defm STH   : StoreRXPair<"sth", 0x40, 0xE370, truncstorei16, GR64>;
-def  STHRL : StoreRILPC<"sthrl", 0xC47, aligned_truncstorei16, GR64>;
-defm ST    : StoreRXPair<"st", 0x50, 0xE350, truncstorei32, GR64>;
-def  STRL  : StoreRILPC<"strl", 0xC4F, aligned_truncstorei32, GR64>;
+defm : StoreGR64Pair<STC, STCY, truncstorei8>;
+defm : StoreGR64Pair<STH, STHY, truncstorei16>;
+def  : StoreGR64PC<STHRL, aligned_truncstorei16>;
+defm : StoreGR64Pair<ST, STY, truncstorei32>;
+def  : StoreGR64PC<STRL, aligned_truncstorei32>;
 
 //===----------------------------------------------------------------------===//
 // Multi-register moves
@@ -318,50 +549,77 @@ def STMG : StoreMultipleRSY<"stmg", 0xEB24, GR64>;
 
 // Byte-swapping register moves.
 let neverHasSideEffects = 1 in {
-  def LRVR  : UnaryRRE<"lrvr",  0xB91F, bswap, GR32, GR32>;
-  def LRVGR : UnaryRRE<"lrvgr", 0xB90F, bswap, GR64, GR64>;
+  def LRVR  : UnaryRRE<"lrv",  0xB91F, bswap, GR32, GR32>;
+  def LRVGR : UnaryRRE<"lrvg", 0xB90F, bswap, GR64, GR64>;
 }
 
-// Byte-swapping loads.
-def LRV  : UnaryRXY<"lrv",  0xE31E, loadu<bswap>, GR32>;
-def LRVG : UnaryRXY<"lrvg", 0xE30F, loadu<bswap>, GR64>;
+// Byte-swapping loads.  Unlike normal loads, these instructions are
+// allowed to access storage more than once.
+def LRV  : UnaryRXY<"lrv",  0xE31E, loadu<bswap, nonvolatile_load>, GR32, 4>;
+def LRVG : UnaryRXY<"lrvg", 0xE30F, loadu<bswap, nonvolatile_load>, GR64, 8>;
 
-// Byte-swapping stores.
-def STRV  : StoreRXY<"strv",  0xE33E, storeu<bswap>, GR32>;
-def STRVG : StoreRXY<"strvg", 0xE32F, storeu<bswap>, GR64>;
+// Likewise byte-swapping stores.
+def STRV  : StoreRXY<"strv", 0xE33E, storeu<bswap, nonvolatile_store>, GR32, 4>;
+def STRVG : StoreRXY<"strvg", 0xE32F, storeu<bswap, nonvolatile_store>,
+                     GR64, 8>;
 
 //===----------------------------------------------------------------------===//
 // Load address instructions
 //===----------------------------------------------------------------------===//
 
 // Load BDX-style addresses.
-let neverHasSideEffects = 1, Function = "la" in {
-  let PairType = "12" in
-    def LA : InstRX<0x41, (outs GR64:$dst), (ins laaddr12pair:$src),
-                    "la\t$dst, $src",
-                    [(set GR64:$dst, laaddr12pair:$src)]>;
-  let PairType = "20" in
-    def LAY : InstRXY<0xE371, (outs GR64:$dst), (ins laaddr20pair:$src),
-                      "lay\t$dst, $src",
-                      [(set GR64:$dst, laaddr20pair:$src)]>;
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+    DispKey = "la" in {
+  let DispSize = "12" in
+    def LA : InstRX<0x41, (outs GR64:$R1), (ins laaddr12pair:$XBD2),
+                    "la\t$R1, $XBD2",
+                    [(set GR64:$R1, laaddr12pair:$XBD2)]>;
+  let DispSize = "20" in
+    def LAY : InstRXY<0xE371, (outs GR64:$R1), (ins laaddr20pair:$XBD2),
+                      "lay\t$R1, $XBD2",
+                      [(set GR64:$R1, laaddr20pair:$XBD2)]>;
 }
 
 // Load a PC-relative address.  There's no version of this instruction
 // with a 16-bit offset, so there's no relaxation.
-let neverHasSideEffects = 1 in {
-  def LARL : InstRIL<0xC00, (outs GR64:$dst), (ins pcrel32:$src),
-                     "larl\t$dst, $src",
-                     [(set GR64:$dst, pcrel32:$src)]>;
+let neverHasSideEffects = 1, isAsCheapAsAMove = 1, isMoveImm = 1,
+    isReMaterializable = 1 in {
+  def LARL : InstRIL<0xC00, (outs GR64:$R1), (ins pcrel32:$I2),
+                     "larl\t$R1, $I2",
+                     [(set GR64:$R1, pcrel32:$I2)]>;
 }
 
 //===----------------------------------------------------------------------===//
-// Negation
+// Absolute and Negation
 //===----------------------------------------------------------------------===//
 
-let Defs = [PSW] in {
-  def LCR   : UnaryRR <"lcr",   0x13,   ineg,      GR32, GR32>;
-  def LCGR  : UnaryRRE<"lcgr",  0xB903, ineg,      GR64, GR64>;
-  def LCGFR : UnaryRRE<"lcgfr", 0xB913, null_frag, GR64, GR32>;
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LPR  : UnaryRR <"lp",  0x10,   z_iabs32, GR32, GR32>;
+    def LPGR : UnaryRRE<"lpg", 0xB900, z_iabs64, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LPGFR : UnaryRRE<"lpgf", 0xB910, null_frag, GR64, GR32>;
+}
+defm : SXU<z_iabs64, LPGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LNR  : UnaryRR <"ln",  0x11,   z_inegabs32, GR32, GR32>;
+    def LNGR : UnaryRRE<"lng", 0xB901, z_inegabs64, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LNGFR : UnaryRRE<"lngf", 0xB911, null_frag, GR64, GR32>;
+}
+defm : SXU<z_inegabs64, LNGFR>;
+
+let Defs = [CC] in {
+  let CCValues = 0xF, CompareZeroCCMask = 0x8 in {
+    def LCR  : UnaryRR <"lc",  0x13,   ineg, GR32, GR32>;
+    def LCGR : UnaryRRE<"lcg", 0xB903, ineg, GR64, GR64>;
+  }
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def LCGFR : UnaryRRE<"lcgf", 0xB913, null_frag, GR64, GR32>;
 }
 defm : SXU<ineg, LCGFR>;
 
@@ -370,69 +628,83 @@ defm : SXU<ineg, LCGFR>;
 //===----------------------------------------------------------------------===//
 
 let isCodeGenOnly = 1 in
-  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, zextloadi8>;
-defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, zextloadi8>;
+  defm IC32 : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR32, azextloadi8, 1>;
+defm IC : BinaryRXPair<"ic", 0x43, 0xE373, inserti8, GR64, azextloadi8, 1>;
 
-defm : InsertMem<"inserti8", IC32,  GR32, zextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", IC32Y, GR32, zextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC32,  GR32, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>;
 
-defm : InsertMem<"inserti8", IC,  GR64, zextloadi8, bdxaddr12pair>;
-defm : InsertMem<"inserti8", ICY, GR64, zextloadi8, bdxaddr20pair>;
+defm : InsertMem<"inserti8", IC,  GR64, azextloadi8, bdxaddr12pair>;
+defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>;
 
 // Insertions of a 16-bit immediate, leaving other bits unaffected.
 // We don't have or_as_insert equivalents of these operations because
 // OI is available instead.
-let isCodeGenOnly = 1 in {
-  def IILL32 : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
-  def IILH32 : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
-}
-def IILL : BinaryRI<"iill", 0xA53, insertll, GR64, imm64ll16>;
-def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR64, imm64lh16>;
-def IIHL : BinaryRI<"iihl", 0xA51, inserthl, GR64, imm64hl16>;
-def IIHH : BinaryRI<"iihh", 0xA50, inserthh, GR64, imm64hh16>;
+//
+// IIxMux expands to II[LH]x, depending on the choice of register.
+def IILMux : BinaryRIPseudo<insertll, GRX32, imm32ll16>,
+             Requires<[FeatureHighWord]>;
+def IIHMux : BinaryRIPseudo<insertlh, GRX32, imm32lh16>,
+             Requires<[FeatureHighWord]>;
+def IILL : BinaryRI<"iill", 0xA53, insertll, GR32, imm32ll16>;
+def IILH : BinaryRI<"iilh", 0xA52, insertlh, GR32, imm32lh16>;
+def IIHL : BinaryRI<"iihl", 0xA51, insertll, GRH32, imm32ll16>;
+def IIHH : BinaryRI<"iihh", 0xA50, insertlh, GRH32, imm32lh16>;
+def IILL64 : BinaryAliasRI<insertll, GR64, imm64ll16>;
+def IILH64 : BinaryAliasRI<insertlh, GR64, imm64lh16>;
+def IIHL64 : BinaryAliasRI<inserthl, GR64, imm64hl16>;
+def IIHH64 : BinaryAliasRI<inserthh, GR64, imm64hh16>;
 
 // ...likewise for 32-bit immediates.  For GR32s this is a general
 // full-width move.  (We use IILF rather than something like LLILF
 // for 32-bit moves because IILF leaves the upper 32 bits of the
 // GR64 unchanged.)
-let isCodeGenOnly = 1 in {
-  def IILF32 : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+let isAsCheapAsAMove = 1, isMoveImm = 1, isReMaterializable = 1 in {
+  def IIFMux : UnaryRIPseudo<bitconvert, GRX32, uimm32>,
+               Requires<[FeatureHighWord]>;
+  def IILF : UnaryRIL<"iilf", 0xC09, bitconvert, GR32, uimm32>;
+  def IIHF : UnaryRIL<"iihf", 0xC08, bitconvert, GRH32, uimm32>;
 }
-def IILF : BinaryRIL<"iilf", 0xC09, insertlf, GR64, imm64lf32>;
-def IIHF : BinaryRIL<"iihf", 0xC08, inserthf, GR64, imm64hf32>;
+def IILF64 : BinaryAliasRIL<insertlf, GR64, imm64lf32>;
+def IIHF64 : BinaryAliasRIL<inserthf, GR64, imm64hf32>;
 
 // An alternative model of inserthf, with the first operand being
 // a zero-extended value.
 def : Pat<(or (zext32 GR32:$src), imm64hf32:$imm),
-          (IIHF (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit),
-                imm64hf32:$imm)>;
+          (IIHF64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32),
+                  imm64hf32:$imm)>;
 
 //===----------------------------------------------------------------------===//
 // Addition
 //===----------------------------------------------------------------------===//
 
 // Plain addition.
-let Defs = [PSW] in {
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
   // Addition of a register.
   let isCommutable = 1 in {
-    def AR  : BinaryRR <"ar",  0x1A,   add, GR32, GR32>;
-    def AGR : BinaryRRE<"agr", 0xB908, add, GR64, GR64>;
+    defm AR : BinaryRRAndK<"a", 0x1A, 0xB9F8, add, GR32, GR32>;
+    defm AGR : BinaryRREAndK<"ag", 0xB908, 0xB9E8, add, GR64, GR64>;
   }
-  def AGFR : BinaryRRE<"agfr", 0xB918, null_frag, GR64, GR32>;
+  def AGFR : BinaryRRE<"agf", 0xB918, null_frag, GR64, GR32>;
 
   // Addition of signed 16-bit immediates.
-  def AHI  : BinaryRI<"ahi",  0xA7A, add, GR32, imm32sx16>;
-  def AGHI : BinaryRI<"aghi", 0xA7B, add, GR64, imm64sx16>;
+  defm AHIMux : BinaryRIAndKPseudo<"ahimux", add, GRX32, imm32sx16>;
+  defm AHI  : BinaryRIAndK<"ahi",  0xA7A, 0xECD8, add, GR32, imm32sx16>;
+  defm AGHI : BinaryRIAndK<"aghi", 0xA7B, 0xECD9, add, GR64, imm64sx16>;
 
   // Addition of signed 32-bit immediates.
+  def AFIMux : BinaryRIPseudo<add, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
   def AFI  : BinaryRIL<"afi",  0xC29, add, GR32, simm32>;
+  def AIH  : BinaryRIL<"aih",  0xCC8, add, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
   def AGFI : BinaryRIL<"agfi", 0xC28, add, GR64, imm64sx32>;
 
   // Addition of memory.
-  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, sextloadi16>;
-  defm A   : BinaryRXPair<"a",  0x5A, 0xE35A, add, GR32, load>;
-  def  AGF : BinaryRXY<"agf", 0xE318, add, GR64, sextloadi32>;
-  def  AG  : BinaryRXY<"ag",  0xE308, add, GR64, load>;
+  defm AH  : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>;
+  defm A   : BinaryRXPair<"a",  0x5A, 0xE35A, add, GR32, load, 4>;
+  def  AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>;
+  def  AG  : BinaryRXY<"ag",  0xE308, add, GR64, load, 8>;
 
   // Addition to memory.
   def ASI  : BinarySIY<"asi",  0xEB6A, add, imm32sx8>;
@@ -441,34 +713,40 @@ let Defs = [PSW] in {
 defm : SXB<add, GR64, AGFR>;
 
 // Addition producing a carry.
-let Defs = [PSW] in {
+let Defs = [CC] in {
   // Addition of a register.
   let isCommutable = 1 in {
-    def ALR  : BinaryRR <"alr",  0x1E,   addc, GR32, GR32>;
-    def ALGR : BinaryRRE<"algr", 0xB90A, addc, GR64, GR64>;
+    defm ALR : BinaryRRAndK<"al", 0x1E, 0xB9FA, addc, GR32, GR32>;
+    defm ALGR : BinaryRREAndK<"alg", 0xB90A, 0xB9EA, addc, GR64, GR64>;
   }
-  def ALGFR : BinaryRRE<"algfr", 0xB91A, null_frag, GR64, GR32>;
+  def ALGFR : BinaryRRE<"algf", 0xB91A, null_frag, GR64, GR32>;
+
+  // Addition of signed 16-bit immediates.
+  def ALHSIK  : BinaryRIE<"alhsik",  0xECDA, addc, GR32, imm32sx16>,
+                Requires<[FeatureDistinctOps]>;
+  def ALGHSIK : BinaryRIE<"alghsik", 0xECDB, addc, GR64, imm64sx16>,
+                Requires<[FeatureDistinctOps]>;
 
   // Addition of unsigned 32-bit immediates.
   def ALFI  : BinaryRIL<"alfi",  0xC2B, addc, GR32, uimm32>;
   def ALGFI : BinaryRIL<"algfi", 0xC2A, addc, GR64, imm64zx32>;
 
   // Addition of memory.
-  defm AL   : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load>;
-  def  ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, zextloadi32>;
-  def  ALG  : BinaryRXY<"alg",  0xE30A, addc, GR64, load>;
+  defm AL   : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>;
+  def  ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>;
+  def  ALG  : BinaryRXY<"alg",  0xE30A, addc, GR64, load, 8>;
 }
 defm : ZXB<addc, GR64, ALGFR>;
 
 // Addition producing and using a carry.
-let Defs = [PSW], Uses = [PSW] in {
+let Defs = [CC], Uses = [CC] in {
   // Addition of a register.
-  def ALCR  : BinaryRRE<"alcr",  0xB998, adde, GR32, GR32>;
-  def ALCGR : BinaryRRE<"alcgr", 0xB988, adde, GR64, GR64>;
+  def ALCR  : BinaryRRE<"alc",  0xB998, adde, GR32, GR32>;
+  def ALCGR : BinaryRRE<"alcg", 0xB988, adde, GR64, GR64>;
 
   // Addition of memory.
-  def ALC  : BinaryRXY<"alc",  0xE398, adde, GR32, load>;
-  def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load>;
+  def ALC  : BinaryRXY<"alc",  0xE398, adde, GR32, load, 4>;
+  def ALCG : BinaryRXY<"alcg", 0xE388, adde, GR64, load, 8>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -477,25 +755,26 @@ let Defs = [PSW], Uses = [PSW] in {
 
 // Plain substraction.  Although immediate forms exist, we use the
 // add-immediate instruction instead.
-let Defs = [PSW] in {
+let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in {
   // Subtraction of a register.
-  def SR   : BinaryRR <"sr",   0x1B,   sub,       GR32, GR32>;
-  def SGFR : BinaryRRE<"sgfr", 0xB919, null_frag, GR64, GR32>;
-  def SGR  : BinaryRRE<"sgr",  0xB909, sub,       GR64, GR64>;
+  defm SR : BinaryRRAndK<"s", 0x1B, 0xB9F9, sub, GR32, GR32>;
+  def SGFR : BinaryRRE<"sgf", 0xB919, null_frag, GR64, GR32>;
+  defm SGR : BinaryRREAndK<"sg", 0xB909, 0xB9E9, sub, GR64, GR64>;
 
   // Subtraction of memory.
-  defm S   : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load>;
-  def  SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, sextloadi32>;
-  def  SG  : BinaryRXY<"sg",  0xE309, sub, GR64, load>;
+  defm SH  : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>;
+  defm S   : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>;
+  def  SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>;
+  def  SG  : BinaryRXY<"sg",  0xE309, sub, GR64, load, 8>;
 }
 defm : SXB<sub, GR64, SGFR>;
 
 // Subtraction producing a carry.
-let Defs = [PSW] in {
+let Defs = [CC] in {
   // Subtraction of a register.
-  def SLR   : BinaryRR <"slr",   0x1F,   subc,      GR32, GR32>;
-  def SLGFR : BinaryRRE<"slgfr", 0xB91B, null_frag, GR64, GR32>;
-  def SLGR  : BinaryRRE<"slgr",  0xB90B, subc,      GR64, GR64>;
+  defm SLR : BinaryRRAndK<"sl", 0x1F, 0xB9FB, subc, GR32, GR32>;
+  def SLGFR : BinaryRRE<"slgf", 0xB91B, null_frag, GR64, GR32>;
+  defm SLGR : BinaryRREAndK<"slg", 0xB90B, 0xB9EB, subc, GR64, GR64>;
 
   // Subtraction of unsigned 32-bit immediates.  These don't match
   // subc because we prefer addc for constants.
@@ -503,56 +782,78 @@ let Defs = [PSW] in {
   def SLGFI : BinaryRIL<"slgfi", 0xC24, null_frag, GR64, imm64zx32>;
 
   // Subtraction of memory.
-  defm SL   : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load>;
-  def  SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, zextloadi32>;
-  def  SLG  : BinaryRXY<"slg",  0xE30B, subc, GR64, load>;
+  defm SL   : BinaryRXPair<"sl", 0x5F, 0xE35F, subc, GR32, load, 4>;
+  def  SLGF : BinaryRXY<"slgf", 0xE31B, subc, GR64, azextloadi32, 4>;
+  def  SLG  : BinaryRXY<"slg",  0xE30B, subc, GR64, load, 8>;
 }
 defm : ZXB<subc, GR64, SLGFR>;
 
 // Subtraction producing and using a carry.
-let Defs = [PSW], Uses = [PSW] in {
+let Defs = [CC], Uses = [CC] in {
   // Subtraction of a register.
-  def SLBR  : BinaryRRE<"slbr",  0xB999, sube, GR32, GR32>;
-  def SLGBR : BinaryRRE<"slbgr", 0xB989, sube, GR64, GR64>;
+  def SLBR  : BinaryRRE<"slb",  0xB999, sube, GR32, GR32>;
+  def SLGBR : BinaryRRE<"slbg", 0xB989, sube, GR64, GR64>;
 
   // Subtraction of memory.
-  def SLB  : BinaryRXY<"slb",  0xE399, sube, GR32, load>;
-  def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load>;
+  def SLB  : BinaryRXY<"slb",  0xE399, sube, GR32, load, 4>;
+  def SLBG : BinaryRXY<"slbg", 0xE389, sube, GR64, load, 8>;
 }
 
 //===----------------------------------------------------------------------===//
 // AND
 //===----------------------------------------------------------------------===//
 
-let Defs = [PSW] in {
+let Defs = [CC] in {
   // ANDs of a register.
-  let isCommutable = 1 in {
-    def NR  : BinaryRR <"nr",  0x14,   and, GR32, GR32>;
-    def NGR : BinaryRRE<"ngr", 0xB980, and, GR64, GR64>;
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm NR : BinaryRRAndK<"n", 0x14, 0xB9F4, and, GR32, GR32>;
+    defm NGR : BinaryRREAndK<"ng", 0xB980, 0xB9E4, and, GR64, GR64>;
   }
 
-  // ANDs of a 16-bit immediate, leaving other bits unaffected.
-  let isCodeGenOnly = 1 in {
-    def NILL32 : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
-    def NILH32 : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+  let isConvertibleToThreeAddress = 1 in {
+    // ANDs of a 16-bit immediate, leaving other bits unaffected.
+    // The CC result only reflects the 16-bit field, not the full register.
+    //
+    // NIxMux expands to NI[LH]x, depending on the choice of register.
+    def NILMux : BinaryRIPseudo<and, GRX32, imm32ll16c>,
+                 Requires<[FeatureHighWord]>;
+    def NIHMux : BinaryRIPseudo<and, GRX32, imm32lh16c>,
+                 Requires<[FeatureHighWord]>;
+    def NILL : BinaryRI<"nill", 0xA57, and, GR32, imm32ll16c>;
+    def NILH : BinaryRI<"nilh", 0xA56, and, GR32, imm32lh16c>;
+    def NIHL : BinaryRI<"nihl", 0xA55, and, GRH32, imm32ll16c>;
+    def NIHH : BinaryRI<"nihh", 0xA54, and, GRH32, imm32lh16c>;
+    def NILL64 : BinaryAliasRI<and, GR64, imm64ll16c>;
+    def NILH64 : BinaryAliasRI<and, GR64, imm64lh16c>;
+    def NIHL64 : BinaryAliasRI<and, GR64, imm64hl16c>;
+    def NIHH64 : BinaryAliasRI<and, GR64, imm64hh16c>;
+
+    // ANDs of a 32-bit immediate, leaving other bits unaffected.
+    // The CC result only reflects the 32-bit field, which means we can
+    // use it as a zero indicator for i32 operations but not otherwise.
+    let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+      // Expands to NILF or NIHF, depending on the choice of register.
+      def NIFMux : BinaryRIPseudo<and, GRX32, uimm32>,
+                   Requires<[FeatureHighWord]>;
+      def NILF : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
+      def NIHF : BinaryRIL<"nihf", 0xC0A, and, GRH32, uimm32>;
+    }
+    def NILF64 : BinaryAliasRIL<and, GR64, imm64lf32c>;
+    def NIHF64 : BinaryAliasRIL<and, GR64, imm64hf32c>;
   }
-  def NILL : BinaryRI<"nill", 0xA57, and, GR64, imm64ll16c>;
-  def NILH : BinaryRI<"nilh", 0xA56, and, GR64, imm64lh16c>;
-  def NIHL : BinaryRI<"nihl", 0xA55, and, GR64, imm64hl16c>;
-  def NIHH : BinaryRI<"nihh", 0xA54, and, GR64, imm64hh16c>;
-
-  // ANDs of a 32-bit immediate, leaving other bits unaffected.
-  let isCodeGenOnly = 1 in
-    def NILF32 : BinaryRIL<"nilf", 0xC0B, and, GR32, uimm32>;
-  def NILF : BinaryRIL<"nilf", 0xC0B, and, GR64, imm64lf32c>;
-  def NIHF : BinaryRIL<"nihf", 0xC0A, and, GR64, imm64hf32c>;
 
   // ANDs of memory.
-  defm N  : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load>;
-  def  NG : BinaryRXY<"ng", 0xE380, and, GR64, load>;
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm N  : BinaryRXPair<"n", 0x54, 0xE354, and, GR32, load, 4>;
+    def  NG : BinaryRXY<"ng", 0xE380, and, GR64, load, 8>; 
+  }
 
   // AND to memory
   defm NI : BinarySIPair<"ni", 0x94, 0xEB54, null_frag, uimm8>;
+
+  // Block AND.
+  let mayLoad = 1, mayStore = 1 in
+    defm NC : MemorySS<"nc", 0xD4, z_nc, z_nc_loop>;
 }
 defm : RMWIByte<and, bdaddr12pair, NI>;
 defm : RMWIByte<and, bdaddr20pair, NIY>;
@@ -561,35 +862,55 @@ defm : RMWIByte<and, bdaddr20pair, NIY>;
 // OR
 //===----------------------------------------------------------------------===//
 
-let Defs = [PSW] in {
+let Defs = [CC] in {
   // ORs of a register.
-  let isCommutable = 1 in {
-    def OR  : BinaryRR <"or",  0x16,   or, GR32, GR32>;
-    def OGR : BinaryRRE<"ogr", 0xB981, or, GR64, GR64>;
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm OR : BinaryRRAndK<"o", 0x16, 0xB9F6, or, GR32, GR32>;
+    defm OGR : BinaryRREAndK<"og", 0xB981, 0xB9E6, or, GR64, GR64>;
   }
 
   // ORs of a 16-bit immediate, leaving other bits unaffected.
-  let isCodeGenOnly = 1 in {
-    def OILL32 : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
-    def OILH32 : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
-  }
-  def OILL : BinaryRI<"oill", 0xA5B, or, GR64, imm64ll16>;
-  def OILH : BinaryRI<"oilh", 0xA5A, or, GR64, imm64lh16>;
-  def OIHL : BinaryRI<"oihl", 0xA59, or, GR64, imm64hl16>;
-  def OIHH : BinaryRI<"oihh", 0xA58, or, GR64, imm64hh16>;
+  // The CC result only reflects the 16-bit field, not the full register.
+  //
+  // OIxMux expands to OI[LH]x, depending on the choice of register.
+  def OILMux : BinaryRIPseudo<or, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def OIHMux : BinaryRIPseudo<or, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def OILL : BinaryRI<"oill", 0xA5B, or, GR32, imm32ll16>;
+  def OILH : BinaryRI<"oilh", 0xA5A, or, GR32, imm32lh16>;
+  def OIHL : BinaryRI<"oihl", 0xA59, or, GRH32, imm32ll16>;
+  def OIHH : BinaryRI<"oihh", 0xA58, or, GRH32, imm32lh16>;
+  def OILL64 : BinaryAliasRI<or, GR64, imm64ll16>;
+  def OILH64 : BinaryAliasRI<or, GR64, imm64lh16>;
+  def OIHL64 : BinaryAliasRI<or, GR64, imm64hl16>;
+  def OIHH64 : BinaryAliasRI<or, GR64, imm64hh16>;
 
   // ORs of a 32-bit immediate, leaving other bits unaffected.
-  let isCodeGenOnly = 1 in
-    def OILF32 : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
-  def OILF : BinaryRIL<"oilf", 0xC0D, or, GR64, imm64lf32>;
-  def OIHF : BinaryRIL<"oihf", 0xC0C, or, GR64, imm64hf32>;
+  // The CC result only reflects the 32-bit field, which means we can
+  // use it as a zero indicator for i32 operations but not otherwise.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to OILF or OIHF, depending on the choice of register.
+    def OIFMux : BinaryRIPseudo<or, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def OILF : BinaryRIL<"oilf", 0xC0D, or, GR32, uimm32>;
+    def OIHF : BinaryRIL<"oihf", 0xC0C, or, GRH32, uimm32>;
+  }
+  def OILF64 : BinaryAliasRIL<or, GR64, imm64lf32>;
+  def OIHF64 : BinaryAliasRIL<or, GR64, imm64hf32>;
 
   // ORs of memory.
-  defm O  : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load>;
-  def  OG : BinaryRXY<"og", 0xE381, or, GR64, load>;
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm O  : BinaryRXPair<"o", 0x56, 0xE356, or, GR32, load, 4>;
+    def  OG : BinaryRXY<"og", 0xE381, or, GR64, load, 8>;
+  }
 
   // OR to memory
   defm OI : BinarySIPair<"oi", 0x96, 0xEB56, null_frag, uimm8>;
+
+  // Block OR.
+  let mayLoad = 1, mayStore = 1 in
+    defm OC : MemorySS<"oc", 0xD6, z_oc, z_oc_loop>;
 }
 defm : RMWIByte<or, bdaddr12pair, OI>;
 defm : RMWIByte<or, bdaddr20pair, OIY>;
@@ -598,25 +919,38 @@ defm : RMWIByte<or, bdaddr20pair, OIY>;
 // XOR
 //===----------------------------------------------------------------------===//
 
-let Defs = [PSW] in {
+let Defs = [CC] in {
   // XORs of a register.
-  let isCommutable = 1 in {
-    def XR  : BinaryRR <"xr",  0x17,   xor, GR32, GR32>;
-    def XGR : BinaryRRE<"xgr", 0xB982, xor, GR64, GR64>;
+  let isCommutable = 1, CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm XR : BinaryRRAndK<"x", 0x17, 0xB9F7, xor, GR32, GR32>;
+    defm XGR : BinaryRREAndK<"xg", 0xB982, 0xB9E7, xor, GR64, GR64>;
   }
 
   // XORs of a 32-bit immediate, leaving other bits unaffected.
-  let isCodeGenOnly = 1 in
-    def XILF32 : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
-  def XILF : BinaryRIL<"xilf", 0xC07, xor, GR64, imm64lf32>;
-  def XIHF : BinaryRIL<"xihf", 0xC06, xor, GR64, imm64hf32>;
+  // The CC result only reflects the 32-bit field, which means we can
+  // use it as a zero indicator for i32 operations but not otherwise.
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    // Expands to XILF or XIHF, depending on the choice of register.
+    def XIFMux : BinaryRIPseudo<xor, GRX32, uimm32>,
+                 Requires<[FeatureHighWord]>;
+    def XILF : BinaryRIL<"xilf", 0xC07, xor, GR32, uimm32>;
+    def XIHF : BinaryRIL<"xihf", 0xC06, xor, GRH32, uimm32>;
+  }
+  def XILF64 : BinaryAliasRIL<xor, GR64, imm64lf32>;
+  def XIHF64 : BinaryAliasRIL<xor, GR64, imm64hf32>;
 
   // XORs of memory.
-  defm X  : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load>;
-  def  XG : BinaryRXY<"xg", 0xE382, xor, GR64, load>;
+  let CCValues = 0xC, CompareZeroCCMask = 0x8 in {
+    defm X  : BinaryRXPair<"x",0x57, 0xE357, xor, GR32, load, 4>;
+    def  XG : BinaryRXY<"xg", 0xE382, xor, GR64, load, 8>;
+  }
 
   // XOR to memory
   defm XI : BinarySIPair<"xi", 0x97, 0xEB57, null_frag, uimm8>;
+
+  // Block XOR.
+  let mayLoad = 1, mayStore = 1 in
+    defm XC : MemorySS<"xc", 0xD7, z_xc, z_xc_loop>;
 }
 defm : RMWIByte<xor, bdaddr12pair, XI>;
 defm : RMWIByte<xor, bdaddr20pair, XIY>;
@@ -627,10 +961,10 @@ defm : RMWIByte<xor, bdaddr20pair, XIY>;
 
 // Multiplication of a register.
 let isCommutable = 1 in {
-  def MSR  : BinaryRRE<"msr",  0xB252, mul, GR32, GR32>;
-  def MSGR : BinaryRRE<"msgr", 0xB90C, mul, GR64, GR64>;
+  def MSR  : BinaryRRE<"ms",  0xB252, mul, GR32, GR32>;
+  def MSGR : BinaryRRE<"msg", 0xB90C, mul, GR64, GR64>;
 }
-def MSGFR : BinaryRRE<"msgfr", 0xB91C, null_frag, GR64, GR32>;
+def MSGFR : BinaryRRE<"msgf", 0xB91C, null_frag, GR64, GR32>;
 defm : SXB<mul, GR64, MSGFR>;
 
 // Multiplication of a signed 16-bit immediate.
@@ -642,33 +976,32 @@ def MSFI  : BinaryRIL<"msfi",  0xC21, mul, GR32, simm32>;
 def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>;
 
 // Multiplication of memory.
-defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, sextloadi16>;
-defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load>;
-def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, sextloadi32>;
-def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, load>;
+defm MH   : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>;
+defm MS   : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>;
+def  MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>;
+def  MSG  : BinaryRXY<"msg",  0xE30C, mul, GR64, load, 8>;
 
 // Multiplication of a register, producing two results.
-def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>;
+def MLGR : BinaryRRE<"mlg", 0xB986, z_umul_lohi64, GR128, GR64>;
 
 // Multiplication of memory, producing two results.
-def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load>;
+def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>;
 
 //===----------------------------------------------------------------------===//
 // Division and remainder
 //===----------------------------------------------------------------------===//
 
 // Division and remainder, from registers.
-def DSGFR : BinaryRRE<"dsgfr", 0xB91D, null_frag,   GR128, GR32>;
-def DSGR  : BinaryRRE<"dsgr",  0xB90D, z_sdivrem64, GR128, GR64>;
-def DLR   : BinaryRRE<"dlr",   0xB997, z_udivrem32, GR128, GR32>;
-def DLGR  : BinaryRRE<"dlgr",  0xB987, z_udivrem64, GR128, GR64>;
-defm : SXB<z_sdivrem64, GR128, DSGFR>;
+def DSGFR : BinaryRRE<"dsgf", 0xB91D, z_sdivrem32, GR128, GR32>;
+def DSGR  : BinaryRRE<"dsg",  0xB90D, z_sdivrem64, GR128, GR64>;
+def DLR   : BinaryRRE<"dl",   0xB997, z_udivrem32, GR128, GR32>;
+def DLGR  : BinaryRRE<"dlg",  0xB987, z_udivrem64, GR128, GR64>;
 
 // Division and remainder, from memory.
-def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem64, GR128, sextloadi32>;
-def DSG  : BinaryRXY<"dsg",  0xE30D, z_sdivrem64, GR128, load>;
-def DL   : BinaryRXY<"dl",   0xE397, z_udivrem32, GR128, load>;
-def DLG  : BinaryRXY<"dlg",  0xE387, z_udivrem64, GR128, load>;
+def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>;
+def DSG  : BinaryRXY<"dsg",  0xE30D, z_sdivrem64, GR128, load, 8>;
+def DL   : BinaryRXY<"dl",   0xE397, z_udivrem32, GR128, load, 4>;
+def DLG  : BinaryRXY<"dlg",  0xE387, z_udivrem64, GR128, load, 8>;
 
 //===----------------------------------------------------------------------===//
 // Shifts
@@ -676,111 +1009,188 @@ def DLG  : BinaryRXY<"dlg",  0xE387, z_udivrem64, GR128, load>;
 
 // Shift left.
 let neverHasSideEffects = 1 in {
-  def SLL  : ShiftRS <"sll",  0x89,   shl, GR32, shift12only>;
-  def SLLG : ShiftRSY<"sllg", 0xEB0D, shl, GR64, shift20only>;
+  defm SLL : ShiftRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>;
+  def SLLG : ShiftRSY<"sllg", 0xEB0D, shl, GR64>;
 }
 
 // Logical shift right.
 let neverHasSideEffects = 1 in {
-  def SRL  : ShiftRS <"srl",  0x88,   srl, GR32, shift12only>;
-  def SRLG : ShiftRSY<"srlg", 0xEB0C, srl, GR64, shift20only>;
+  defm SRL : ShiftRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>;
+  def SRLG : ShiftRSY<"srlg", 0xEB0C, srl, GR64>;
 }
 
 // Arithmetic shift right.
-let Defs = [PSW] in {
-  def SRA  : ShiftRS <"sra",  0x8A,   sra, GR32, shift12only>;
-  def SRAG : ShiftRSY<"srag", 0xEB0A, sra, GR64, shift20only>;
+let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in {
+  defm SRA : ShiftRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>;
+  def SRAG : ShiftRSY<"srag", 0xEB0A, sra, GR64>;
 }
 
 // Rotate left.
 let neverHasSideEffects = 1 in {
-  def RLL  : ShiftRSY<"rll",  0xEB1D, rotl, GR32, shift20only>;
-  def RLLG : ShiftRSY<"rllg", 0xEB1C, rotl, GR64, shift20only>;
+  def RLL  : ShiftRSY<"rll",  0xEB1D, rotl, GR32>;
+  def RLLG : ShiftRSY<"rllg", 0xEB1C, rotl, GR64>;
 }
 
 // Rotate second operand left and inserted selected bits into first operand.
 // These can act like 32-bit operands provided that the constant start and
-// end bits (operands 2 and 3) are in the range [32, 64)
-let Defs = [PSW] in {
+// end bits (operands 2 and 3) are in the range [32, 64).
+let Defs = [CC] in {
   let isCodeGenOnly = 1 in
-    def RISBG32 : RotateSelectRIEf<"risbg",  0xEC55, GR32, GR32>;
-  def RISBG : RotateSelectRIEf<"risbg",  0xEC55, GR64, GR64>;
+    def RISBG32 : RotateSelectRIEf<"risbg", 0xEC55, GR32, GR32>;
+  let CCValues = 0xE, CompareZeroCCMask = 0xE in
+    def RISBG : RotateSelectRIEf<"risbg", 0xEC55, GR64, GR64>;
+}
+
+// Forms of RISBG that only affect one word of the destination register.
+// They do not set CC.
+def RISBMux : RotateSelectRIEfPseudo<GRX32, GRX32>, Requires<[FeatureHighWord]>;
+def RISBLL  : RotateSelectAliasRIEf<GR32,  GR32>,  Requires<[FeatureHighWord]>;
+def RISBLH  : RotateSelectAliasRIEf<GR32,  GRH32>, Requires<[FeatureHighWord]>;
+def RISBHL  : RotateSelectAliasRIEf<GRH32, GR32>,  Requires<[FeatureHighWord]>;
+def RISBHH  : RotateSelectAliasRIEf<GRH32, GRH32>, Requires<[FeatureHighWord]>;
+def RISBLG  : RotateSelectRIEf<"risblg", 0xEC51, GR32, GR64>,
+              Requires<[FeatureHighWord]>;
+def RISBHG  : RotateSelectRIEf<"risbhg", 0xEC5D, GRH32, GR64>,
+              Requires<[FeatureHighWord]>;
+
+// Rotate second operand left and perform a logical operation with selected
+// bits of the first operand.  The CC result only describes the selected bits,
+// so isn't useful for a full comparison against zero.
+let Defs = [CC] in {
+  def RNSBG : RotateSelectRIEf<"rnsbg", 0xEC54, GR64, GR64>;
+  def ROSBG : RotateSelectRIEf<"rosbg", 0xEC56, GR64, GR64>;
+  def RXSBG : RotateSelectRIEf<"rxsbg", 0xEC57, GR64, GR64>;
 }
 
 //===----------------------------------------------------------------------===//
 // Comparison
 //===----------------------------------------------------------------------===//
 
-// Signed comparisons.
-let Defs = [PSW] in {
+// Signed comparisons.  We put these before the unsigned comparisons because
+// some of the signed forms have COMPARE AND BRANCH equivalents whereas none
+// of the unsigned forms do.
+let Defs = [CC], CCValues = 0xE in {
   // Comparison with a register.
-  def CR   : CompareRR <"cr",   0x19,   z_cmp,     GR32, GR32>;
-  def CGFR : CompareRRE<"cgfr", 0xB930, null_frag, GR64, GR32>;
-  def CGR  : CompareRRE<"cgr",  0xB920, z_cmp,     GR64, GR64>;
+  def CR   : CompareRR <"c",   0x19,   z_scmp,    GR32, GR32>;
+  def CGFR : CompareRRE<"cgf", 0xB930, null_frag, GR64, GR32>;
+  def CGR  : CompareRRE<"cg",  0xB920, z_scmp,    GR64, GR64>;
 
   // Comparison with a signed 16-bit immediate.
-  def CHI  : CompareRI<"chi",  0xA7E, z_cmp, GR32, imm32sx16>;
-  def CGHI : CompareRI<"cghi", 0xA7F, z_cmp, GR64, imm64sx16>;
-
-  // Comparison with a signed 32-bit immediate.
-  def CFI  : CompareRIL<"cfi",  0xC2D, z_cmp, GR32, simm32>;
-  def CGFI : CompareRIL<"cgfi", 0xC2C, z_cmp, GR64, imm64sx32>;
+  def CHI  : CompareRI<"chi",  0xA7E, z_scmp, GR32, imm32sx16>;
+  def CGHI : CompareRI<"cghi", 0xA7F, z_scmp, GR64, imm64sx16>;
+
+  // Comparison with a signed 32-bit immediate.  CFIMux expands to CFI or CIH,
+  // depending on the choice of register.
+  def CFIMux : CompareRIPseudo<z_scmp, GRX32, simm32>,
+               Requires<[FeatureHighWord]>;
+  def CFI  : CompareRIL<"cfi",  0xC2D, z_scmp, GR32, simm32>;
+  def CIH  : CompareRIL<"cih",  0xCCD, z_scmp, GRH32, simm32>,
+             Requires<[FeatureHighWord]>;
+  def CGFI : CompareRIL<"cgfi", 0xC2C, z_scmp, GR64, imm64sx32>;
 
   // Comparison with memory.
-  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_cmp, GR32, sextloadi16>;
-  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_cmp, GR32, load>;
-  def  CGH   : CompareRXY<"cgh", 0xE334, z_cmp, GR64, sextloadi16>;
-  def  CGF   : CompareRXY<"cgf", 0xE330, z_cmp, GR64, sextloadi32>;
-  def  CG    : CompareRXY<"cg",  0xE320, z_cmp, GR64, load>;
-  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_cmp, GR32, aligned_sextloadi16>;
-  def  CRL   : CompareRILPC<"crl",   0xC6D, z_cmp, GR32, aligned_load>;
-  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_cmp, GR64, aligned_sextloadi16>;
-  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_cmp, GR64, aligned_sextloadi32>;
-  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_cmp, GR64, aligned_load>;
+  defm CH    : CompareRXPair<"ch", 0x49, 0xE379, z_scmp, GR32, asextloadi16, 2>;
+  def  CMux  : CompareRXYPseudo<z_scmp, GRX32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  defm C     : CompareRXPair<"c",  0x59, 0xE359, z_scmp, GR32, load, 4>;
+  def  CHF   : CompareRXY<"chf", 0xE3CD, z_scmp, GRH32, load, 4>,
+               Requires<[FeatureHighWord]>;
+  def  CGH   : CompareRXY<"cgh", 0xE334, z_scmp, GR64, asextloadi16, 2>;
+  def  CGF   : CompareRXY<"cgf", 0xE330, z_scmp, GR64, asextloadi32, 4>;
+  def  CG    : CompareRXY<"cg",  0xE320, z_scmp, GR64, load, 8>;
+  def  CHRL  : CompareRILPC<"chrl",  0xC65, z_scmp, GR32, aligned_asextloadi16>;
+  def  CRL   : CompareRILPC<"crl",   0xC6D, z_scmp, GR32, aligned_load>;
+  def  CGHRL : CompareRILPC<"cghrl", 0xC64, z_scmp, GR64, aligned_asextloadi16>;
+  def  CGFRL : CompareRILPC<"cgfrl", 0xC6C, z_scmp, GR64, aligned_asextloadi32>;
+  def  CGRL  : CompareRILPC<"cgrl",  0xC68, z_scmp, GR64, aligned_load>;
 
   // Comparison between memory and a signed 16-bit immediate.
-  def CHHSI : CompareSIL<"chhsi", 0xE554, z_cmp, sextloadi16, imm32sx16>;
-  def CHSI  : CompareSIL<"chsi",  0xE55C, z_cmp, load,        imm32sx16>;
-  def CGHSI : CompareSIL<"cghsi", 0xE558, z_cmp, load,        imm64sx16>;
+  def CHHSI : CompareSIL<"chhsi", 0xE554, z_scmp, asextloadi16, imm32sx16>;
+  def CHSI  : CompareSIL<"chsi",  0xE55C, z_scmp, load, imm32sx16>;
+  def CGHSI : CompareSIL<"cghsi", 0xE558, z_scmp, load, imm64sx16>;
 }
-defm : SXB<z_cmp, GR64, CGFR>;
+defm : SXB<z_scmp, GR64, CGFR>;
 
 // Unsigned comparisons.
-let Defs = [PSW] in {
+let Defs = [CC], CCValues = 0xE, IsLogical = 1 in {
   // Comparison with a register.
-  def CLR   : CompareRR <"clr",   0x15,   z_ucmp,    GR32, GR32>;
-  def CLGFR : CompareRRE<"clgfr", 0xB931, null_frag, GR64, GR32>;
-  def CLGR  : CompareRRE<"clgr",  0xB921, z_ucmp,    GR64, GR64>;
-
-  // Comparison with a signed 32-bit immediate.
+  def CLR   : CompareRR <"cl",   0x15,   z_ucmp,    GR32, GR32>;
+  def CLGFR : CompareRRE<"clgf", 0xB931, null_frag, GR64, GR32>;
+  def CLGR  : CompareRRE<"clg",  0xB921, z_ucmp,    GR64, GR64>;
+
+  // Comparison with an unsigned 32-bit immediate.  CLFIMux expands to CLFI
+  // or CLIH, depending on the choice of register.
+  def CLFIMux : CompareRIPseudo<z_ucmp, GRX32, uimm32>,
+                Requires<[FeatureHighWord]>;
   def CLFI  : CompareRIL<"clfi",  0xC2F, z_ucmp, GR32, uimm32>;
+  def CLIH  : CompareRIL<"clih",  0xCCF, z_ucmp, GR32, uimm32>,
+              Requires<[FeatureHighWord]>;
   def CLGFI : CompareRIL<"clgfi", 0xC2E, z_ucmp, GR64, imm64zx32>;
 
   // Comparison with memory.
-  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load>;
-  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, zextloadi32>;
-  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, load>;
+  def  CLMux  : CompareRXYPseudo<z_ucmp, GRX32, load, 4>,
+                Requires<[FeatureHighWord]>;
+  defm CL     : CompareRXPair<"cl", 0x55, 0xE355, z_ucmp, GR32, load, 4>;
+  def  CLHF   : CompareRXY<"clhf", 0xE3CF, z_ucmp, GRH32, load, 4>,
+                Requires<[FeatureHighWord]>;
+  def  CLGF   : CompareRXY<"clgf", 0xE331, z_ucmp, GR64, azextloadi32, 4>;
+  def  CLG    : CompareRXY<"clg",  0xE321, z_ucmp, GR64, load, 8>;
   def  CLHRL  : CompareRILPC<"clhrl",  0xC67, z_ucmp, GR32,
-                             aligned_zextloadi16>;
+                             aligned_azextloadi16>;
   def  CLRL   : CompareRILPC<"clrl",   0xC6F, z_ucmp, GR32,
                              aligned_load>;
   def  CLGHRL : CompareRILPC<"clghrl", 0xC66, z_ucmp, GR64,
-                             aligned_zextloadi16>;
+                             aligned_azextloadi16>;
   def  CLGFRL : CompareRILPC<"clgfrl", 0xC6E, z_ucmp, GR64,
-                             aligned_zextloadi32>;
+                             aligned_azextloadi32>;
   def  CLGRL  : CompareRILPC<"clgrl",  0xC6A, z_ucmp, GR64,
                              aligned_load>;
 
   // Comparison between memory and an unsigned 8-bit immediate.
-  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, zextloadi8, imm32zx8>;
+  defm CLI : CompareSIPair<"cli", 0x95, 0xEB55, z_ucmp, azextloadi8, imm32zx8>;
 
   // Comparison between memory and an unsigned 16-bit immediate.
-  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, zextloadi16, imm32zx16>;
-  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load,        imm32zx16>;
-  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load,        imm64zx16>;
+  def CLHHSI : CompareSIL<"clhhsi", 0xE555, z_ucmp, azextloadi16, imm32zx16>;
+  def CLFHSI : CompareSIL<"clfhsi", 0xE55D, z_ucmp, load, imm32zx16>;
+  def CLGHSI : CompareSIL<"clghsi", 0xE559, z_ucmp, load, imm64zx16>;
 }
 defm : ZXB<z_ucmp, GR64, CLGFR>;
 
+// Memory-to-memory comparison.
+let mayLoad = 1, Defs = [CC] in
+  defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
+
+// String comparison.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+  defm CLST : StringRRE<"clst", 0xB25D, z_strcmp>;
+
+// Test under mask.
+let Defs = [CC] in {
+  // TMxMux expands to TM[LH]x, depending on the choice of register.
+  def TMLMux : CompareRIPseudo<z_tm_reg, GRX32, imm32ll16>,
+               Requires<[FeatureHighWord]>;
+  def TMHMux : CompareRIPseudo<z_tm_reg, GRX32, imm32lh16>,
+               Requires<[FeatureHighWord]>;
+  def TMLL : CompareRI<"tmll", 0xA71, z_tm_reg, GR32, imm32ll16>;
+  def TMLH : CompareRI<"tmlh", 0xA70, z_tm_reg, GR32, imm32lh16>;
+  def TMHL : CompareRI<"tmhl", 0xA73, z_tm_reg, GRH32, imm32ll16>;
+  def TMHH : CompareRI<"tmhh", 0xA72, z_tm_reg, GRH32, imm32lh16>;
+
+  def TMLL64 : CompareAliasRI<z_tm_reg, GR64, imm64ll16>;
+  def TMLH64 : CompareAliasRI<z_tm_reg, GR64, imm64lh16>;
+  def TMHL64 : CompareAliasRI<z_tm_reg, GR64, imm64hl16>;
+  def TMHH64 : CompareAliasRI<z_tm_reg, GR64, imm64hh16>;
+
+  defm TM : CompareSIPair<"tm", 0x91, 0xEB51, z_tm_mem, anyextloadi8, imm32zx8>;
+}
+
+//===----------------------------------------------------------------------===//
+// Prefetch
+//===----------------------------------------------------------------------===//
+
+def PFD : PrefetchRXY<"pfd", 0xE336, z_prefetch>;
+def PFDRL : PrefetchRILPC<"pfdrl", 0xC62, z_prefetch>;
+
 //===----------------------------------------------------------------------===//
 // Atomic operations
 //===----------------------------------------------------------------------===//
@@ -805,60 +1215,60 @@ def ATOMIC_LOAD_SGR     : AtomicLoadBinaryReg64<atomic_load_sub_64>;
 def ATOMIC_LOADW_NR     : AtomicLoadWBinaryReg<z_atomic_loadw_and>;
 def ATOMIC_LOADW_NILH   : AtomicLoadWBinaryImm<z_atomic_loadw_and, imm32lh16c>;
 def ATOMIC_LOAD_NR      : AtomicLoadBinaryReg32<atomic_load_and_32>;
-def ATOMIC_LOAD_NILL32  : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
-def ATOMIC_LOAD_NILH32  : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
-def ATOMIC_LOAD_NILF32  : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
+def ATOMIC_LOAD_NILL    : AtomicLoadBinaryImm32<atomic_load_and_32, imm32ll16c>;
+def ATOMIC_LOAD_NILH    : AtomicLoadBinaryImm32<atomic_load_and_32, imm32lh16c>;
+def ATOMIC_LOAD_NILF    : AtomicLoadBinaryImm32<atomic_load_and_32, uimm32>;
 def ATOMIC_LOAD_NGR     : AtomicLoadBinaryReg64<atomic_load_and_64>;
-def ATOMIC_LOAD_NILL    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
-def ATOMIC_LOAD_NILH    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
-def ATOMIC_LOAD_NIHL    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
-def ATOMIC_LOAD_NIHH    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
-def ATOMIC_LOAD_NILF    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
-def ATOMIC_LOAD_NIHF    : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
+def ATOMIC_LOAD_NILL64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64ll16c>;
+def ATOMIC_LOAD_NILH64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lh16c>;
+def ATOMIC_LOAD_NIHL64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hl16c>;
+def ATOMIC_LOAD_NIHH64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hh16c>;
+def ATOMIC_LOAD_NILF64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64lf32c>;
+def ATOMIC_LOAD_NIHF64  : AtomicLoadBinaryImm64<atomic_load_and_64, imm64hf32c>;
 
 def ATOMIC_LOADW_OR     : AtomicLoadWBinaryReg<z_atomic_loadw_or>;
 def ATOMIC_LOADW_OILH   : AtomicLoadWBinaryImm<z_atomic_loadw_or, imm32lh16>;
 def ATOMIC_LOAD_OR      : AtomicLoadBinaryReg32<atomic_load_or_32>;
-def ATOMIC_LOAD_OILL32  : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
-def ATOMIC_LOAD_OILH32  : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
-def ATOMIC_LOAD_OILF32  : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
+def ATOMIC_LOAD_OILL    : AtomicLoadBinaryImm32<atomic_load_or_32, imm32ll16>;
+def ATOMIC_LOAD_OILH    : AtomicLoadBinaryImm32<atomic_load_or_32, imm32lh16>;
+def ATOMIC_LOAD_OILF    : AtomicLoadBinaryImm32<atomic_load_or_32, uimm32>;
 def ATOMIC_LOAD_OGR     : AtomicLoadBinaryReg64<atomic_load_or_64>;
-def ATOMIC_LOAD_OILL    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
-def ATOMIC_LOAD_OILH    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
-def ATOMIC_LOAD_OIHL    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
-def ATOMIC_LOAD_OIHH    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
-def ATOMIC_LOAD_OILF    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
-def ATOMIC_LOAD_OIHF    : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
+def ATOMIC_LOAD_OILL64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64ll16>;
+def ATOMIC_LOAD_OILH64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lh16>;
+def ATOMIC_LOAD_OIHL64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hl16>;
+def ATOMIC_LOAD_OIHH64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hh16>;
+def ATOMIC_LOAD_OILF64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64lf32>;
+def ATOMIC_LOAD_OIHF64  : AtomicLoadBinaryImm64<atomic_load_or_64, imm64hf32>;
 
 def ATOMIC_LOADW_XR     : AtomicLoadWBinaryReg<z_atomic_loadw_xor>;
 def ATOMIC_LOADW_XILF   : AtomicLoadWBinaryImm<z_atomic_loadw_xor, uimm32>;
 def ATOMIC_LOAD_XR      : AtomicLoadBinaryReg32<atomic_load_xor_32>;
-def ATOMIC_LOAD_XILF32  : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
+def ATOMIC_LOAD_XILF    : AtomicLoadBinaryImm32<atomic_load_xor_32, uimm32>;
 def ATOMIC_LOAD_XGR     : AtomicLoadBinaryReg64<atomic_load_xor_64>;
-def ATOMIC_LOAD_XILF    : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
-def ATOMIC_LOAD_XIHF    : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
+def ATOMIC_LOAD_XILF64  : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64lf32>;
+def ATOMIC_LOAD_XIHF64  : AtomicLoadBinaryImm64<atomic_load_xor_64, imm64hf32>;
 
 def ATOMIC_LOADW_NRi    : AtomicLoadWBinaryReg<z_atomic_loadw_nand>;
 def ATOMIC_LOADW_NILHi  : AtomicLoadWBinaryImm<z_atomic_loadw_nand,
                                                imm32lh16c>;
 def ATOMIC_LOAD_NRi     : AtomicLoadBinaryReg32<atomic_load_nand_32>;
-def ATOMIC_LOAD_NILL32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+def ATOMIC_LOAD_NILLi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
                                                 imm32ll16c>;
-def ATOMIC_LOAD_NILH32i : AtomicLoadBinaryImm32<atomic_load_nand_32,
+def ATOMIC_LOAD_NILHi   : AtomicLoadBinaryImm32<atomic_load_nand_32,
                                                 imm32lh16c>;
-def ATOMIC_LOAD_NILF32i : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
+def ATOMIC_LOAD_NILFi   : AtomicLoadBinaryImm32<atomic_load_nand_32, uimm32>;
 def ATOMIC_LOAD_NGRi    : AtomicLoadBinaryReg64<atomic_load_nand_64>;
-def ATOMIC_LOAD_NILLi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64ll16c>;
-def ATOMIC_LOAD_NILHi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64lh16c>;
-def ATOMIC_LOAD_NIHLi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHL64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64hl16c>;
-def ATOMIC_LOAD_NIHHi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHH64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64hh16c>;
-def ATOMIC_LOAD_NILFi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NILF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64lf32c>;
-def ATOMIC_LOAD_NIHFi   : AtomicLoadBinaryImm64<atomic_load_nand_64,
+def ATOMIC_LOAD_NIHF64i : AtomicLoadBinaryImm64<atomic_load_nand_64,
                                                 imm64hf32c>;
 
 def ATOMIC_LOADW_MIN    : AtomicLoadWBinaryReg<z_atomic_loadw_min>;
@@ -885,13 +1295,13 @@ def ATOMIC_CMP_SWAPW
                  (z_atomic_cmp_swapw bdaddr20only:$addr, GR32:$cmp, GR32:$swap,
                                      ADDR32:$bitshift, ADDR32:$negbitshift,
                                      uimm32:$bitsize))]> {
-  let Defs = [PSW];
+  let Defs = [CC];
   let mayLoad = 1;
   let mayStore = 1;
   let usesCustomInserter = 1;
 }
 
-let Defs = [PSW] in {
+let Defs = [CC] in {
   defm CS  : CmpSwapRSPair<"cs", 0xBA, 0xEB14, atomic_cmp_swap_32, GR32>;
   def  CSG : CmpSwapRSY<"csg", 0xEB30, atomic_cmp_swap_64, GR64>;
 }
@@ -900,34 +1310,30 @@ let Defs = [PSW] in {
 // Miscellaneous Instructions.
 //===----------------------------------------------------------------------===//
 
+// Extract CC into bits 29 and 28 of a register.
+let Uses = [CC] in
+  def IPM : InherentRRE<"ipm", 0xB222, GR32, (z_ipm)>;
+
 // Read a 32-bit access register into a GR32.  As with all GR32 operations,
 // the upper 32 bits of the enclosing GR64 remain unchanged, which is useful
 // when a 64-bit address is stored in a pair of access registers.
-def EAR : InstRRE<0xB24F, (outs GR32:$dst), (ins access_reg:$src),
-                  "ear\t$dst, $src",
-                  [(set GR32:$dst, (z_extract_access access_reg:$src))]>;
+def EAR : InstRRE<0xB24F, (outs GR32:$R1), (ins access_reg:$R2),
+                  "ear\t$R1, $R2",
+                  [(set GR32:$R1, (z_extract_access access_reg:$R2))]>;
 
 // Find leftmost one, AKA count leading zeros.  The instruction actually
 // returns a pair of GR64s, the first giving the number of leading zeros
 // and the second giving a copy of the source with the leftmost one bit
 // cleared.  We only use the first result here.
-let Defs = [PSW] in {
-  def FLOGR : UnaryRRE<"flogr", 0xB983, null_frag, GR128, GR64>;
+let Defs = [CC] in {
+  def FLOGR : UnaryRRE<"flog", 0xB983, null_frag, GR128, GR64>;
 }
 def : Pat<(ctlz GR64:$src),
-          (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_high)>;
+          (EXTRACT_SUBREG (FLOGR GR64:$src), subreg_h64)>;
 
 // Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext.
 def : Pat<(i64 (anyext GR32:$src)),
-          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit)>;
-
-// There are no 32-bit equivalents of LLILL and LLILH, so use a full
-// 64-bit move followed by a subreg.  This preserves the invariant that
-// all GR32 operations only modify the low 32 bits.
-def : Pat<(i32 imm32ll16:$src),
-          (EXTRACT_SUBREG (LLILL (LL16 imm:$src)), subreg_32bit)>;
-def : Pat<(i32 imm32lh16:$src),
-          (EXTRACT_SUBREG (LLILH (LH16 imm:$src)), subreg_32bit)>;
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>;
 
 // Extend GR32s and GR64s to GR128s.
 let usesCustomInserter = 1 in {
@@ -936,6 +1342,10 @@ let usesCustomInserter = 1 in {
   def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>;
 }
 
+// Search a block of memory for a character.
+let mayLoad = 1, Defs = [CC], Uses = [R0L] in
+  defm SRST : StringRRE<"srst", 0xb25e, z_search_string>;
+
 //===----------------------------------------------------------------------===//
 // Peepholes.
 //===----------------------------------------------------------------------===//
@@ -944,12 +1354,40 @@ let usesCustomInserter = 1 in {
 defm : ZXB<add, GR64, ALGFR>;
 def  : Pat<(add GR64:$src1, imm64zx32:$src2),
            (ALGFI GR64:$src1, imm64zx32:$src2)>;
-def  : Pat<(add GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+def  : Pat<(add GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
            (ALGF GR64:$src1, bdxaddr20only:$addr)>;
 
 // Use SL* for GR64 subtractions of unsigned 32-bit values.
 defm : ZXB<sub, GR64, SLGFR>;
 def  : Pat<(add GR64:$src1, imm64zx32n:$src2),
            (SLGFI GR64:$src1, imm64zx32n:$src2)>;
-def  : Pat<(sub GR64:$src1, (zextloadi32 bdxaddr20only:$addr)),
+def  : Pat<(sub GR64:$src1, (azextloadi32 bdxaddr20only:$addr)),
            (SLGF GR64:$src1, bdxaddr20only:$addr)>;
+
+// Optimize sign-extended 1/0 selects to -1/0 selects.  This is important
+// for vector legalization.
+def : Pat<(sra (shl (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid, uimm8zx4:$cc)),
+                         (i32 31)),
+                    (i32 31)),
+          (Select32 (LHI -1), (LHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>;
+def : Pat<(sra (shl (i64 (anyext (i32 (z_select_ccmask 1, 0, uimm8zx4:$valid,
+                                                       uimm8zx4:$cc)))),
+                    (i32 63)),
+               (i32 63)),
+          (Select64 (LGHI -1), (LGHI 0), uimm8zx4:$valid, uimm8zx4:$cc)>;
+
+// Peepholes for turning scalar operations into block operations.
+defm : BlockLoadStore<anyextloadi8, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<load, i32, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<anyextloadi8, i64, MVCSequence, NCSequence,
+                      OCSequence, XCSequence, 1>;
+defm : BlockLoadStore<anyextloadi16, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 2>;
+defm : BlockLoadStore<anyextloadi32, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 4>;
+defm : BlockLoadStore<load, i64, MVCSequence, NCSequence, OCSequence,
+                      XCSequence, 8>;
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
new file mode 100644
index 0000000..ba027d4
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -0,0 +1,462 @@
+//===-- SystemZLongBranch.cpp - Branch lengthening for SystemZ ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass makes sure that all branches are in range.  There are several ways
+// in which this could be done.  One aggressive approach is to assume that all
+// branches are in range and successively replace those that turn out not
+// to be in range with a longer form (branch relaxation).  A simple
+// implementation is to continually walk through the function relaxing
+// branches until no more changes are needed and a fixed point is reached.
+// However, in the pathological worst case, this implementation is
+// quadratic in the number of blocks; relaxing branch N can make branch N-1
+// go out of range, which in turn can make branch N-2 go out of range,
+// and so on.
+//
+// An alternative approach is to assume that all branches must be
+// converted to their long forms, then reinstate the short forms of
+// branches that, even under this pessimistic assumption, turn out to be
+// in range (branch shortening).  This too can be implemented as a function
+// walk that is repeated until a fixed point is reached.  In general,
+// the result of shortening is not as good as that of relaxation, and
+// shortening is also quadratic in the worst case; shortening branch N
+// can bring branch N-1 in range of the short form, which in turn can do
+// the same for branch N-2, and so on.  The main advantage of shortening
+// is that each walk through the function produces valid code, so it is
+// possible to stop at any point after the first walk.  The quadraticness
+// could therefore be handled with a maximum pass count, although the
+// question then becomes: what maximum count should be used?
+//
+// On SystemZ, long branches are only needed for functions bigger than 64k,
+// which are relatively rare to begin with, and the long branch sequences
+// are actually relatively cheap.  It therefore doesn't seem worth spending
+// much compilation time on the problem.  Instead, the approach we take is:
+//
+// (1) Work out the address that each block would have if no branches
+//     need relaxing.  Exit the pass early if all branches are in range
+//     according to this assumption.
+//
+// (2) Work out the address that each block would have if all branches
+//     need relaxing.
+//
+// (3) Walk through the block calculating the final address of each instruction
+//     and relaxing those that need to be relaxed.  For backward branches,
+//     this check uses the final address of the target block, as calculated
+//     earlier in the walk.  For forward branches, this check uses the
+//     address of the target block that was calculated in (2).  Both checks
+//     give a conservatively-correct range.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-long-branch"
+
+#include "SystemZTargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+STATISTIC(LongBranches, "Number of long branches.");
+
+namespace {
+  // Represents positional information about a basic block.
+  struct MBBInfo {
+    // The address that we currently assume the block has.
+    uint64_t Address;
+
+    // The size of the block in bytes, excluding terminators.
+    // This value never changes.
+    uint64_t Size;
+
+    // The minimum alignment of the block, as a log2 value.
+    // This value never changes.
+    unsigned Alignment;
+
+    // The number of terminators in this block.  This value never changes.
+    unsigned NumTerminators;
+
+    MBBInfo()
+      : Address(0), Size(0), Alignment(0), NumTerminators(0) {} 
+  };
+
+  // Represents the state of a block terminator.
+  struct TerminatorInfo {
+    // If this terminator is a relaxable branch, this points to the branch
+    // instruction, otherwise it is null.
+    MachineInstr *Branch;
+
+    // The address that we currently assume the terminator has.
+    uint64_t Address;
+
+    // The current size of the terminator in bytes.
+    uint64_t Size;
+
+    // If Branch is nonnull, this is the number of the target block,
+    // otherwise it is unused.
+    unsigned TargetBlock;
+
+    // If Branch is nonnull, this is the length of the longest relaxed form,
+    // otherwise it is zero.
+    unsigned ExtraRelaxSize;
+
+    TerminatorInfo() : Branch(0), Size(0), TargetBlock(0), ExtraRelaxSize(0) {}
+  };
+
+  // Used to keep track of the current position while iterating over the blocks.
+  struct BlockPosition {
+    // The address that we assume this position has.
+    uint64_t Address;
+
+    // The number of low bits in Address that are known to be the same
+    // as the runtime address.
+    unsigned KnownBits;
+
+    BlockPosition(unsigned InitialAlignment)
+      : Address(0), KnownBits(InitialAlignment) {}
+  };
+
+  class SystemZLongBranch : public MachineFunctionPass {
+  public:
+    static char ID;
+    SystemZLongBranch(const SystemZTargetMachine &tm)
+      : MachineFunctionPass(ID), TII(0) {}
+
+    virtual const char *getPassName() const {
+      return "SystemZ Long Branch";
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+
+  private:
+    void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
+    void skipTerminator(BlockPosition &Position, TerminatorInfo &Terminator,
+                        bool AssumeRelaxed);
+    TerminatorInfo describeTerminator(MachineInstr *MI);
+    uint64_t initMBBInfo();
+    bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
+    bool mustRelaxABranch();
+    void setWorstCaseAddresses();
+    void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode);
+    void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
+    void relaxBranch(TerminatorInfo &Terminator);
+    void relaxBranches();
+
+    const SystemZInstrInfo *TII;
+    MachineFunction *MF;
+    SmallVector<MBBInfo, 16> MBBs;
+    SmallVector<TerminatorInfo, 16> Terminators;
+  };
+
+  char SystemZLongBranch::ID = 0;
+
+  const uint64_t MaxBackwardRange = 0x10000;
+  const uint64_t MaxForwardRange = 0xfffe;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createSystemZLongBranchPass(SystemZTargetMachine &TM) {
+  return new SystemZLongBranch(TM);
+}
+
+// Position describes the state immediately before Block.  Update Block
+// accordingly and move Position to the end of the block's non-terminator
+// instructions.
+void SystemZLongBranch::skipNonTerminators(BlockPosition &Position,
+                                           MBBInfo &Block) {
+  if (Block.Alignment > Position.KnownBits) {
+    // When calculating the address of Block, we need to conservatively
+    // assume that Block had the worst possible misalignment.
+    Position.Address += ((uint64_t(1) << Block.Alignment) -
+                         (uint64_t(1) << Position.KnownBits));
+    Position.KnownBits = Block.Alignment;
+  }
+
+  // Align the addresses.
+  uint64_t AlignMask = (uint64_t(1) << Block.Alignment) - 1;
+  Position.Address = (Position.Address + AlignMask) & ~AlignMask;
+
+  // Record the block's position.
+  Block.Address = Position.Address;
+
+  // Move past the non-terminators in the block.
+  Position.Address += Block.Size;
+}
+
+// Position describes the state immediately before Terminator.
+// Update Terminator accordingly and move Position past it.
+// Assume that Terminator will be relaxed if AssumeRelaxed.
+void SystemZLongBranch::skipTerminator(BlockPosition &Position,
+                                       TerminatorInfo &Terminator,
+                                       bool AssumeRelaxed) {
+  Terminator.Address = Position.Address;
+  Position.Address += Terminator.Size;
+  if (AssumeRelaxed)
+    Position.Address += Terminator.ExtraRelaxSize;
+}
+
+// Return a description of terminator instruction MI.
+TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
+  TerminatorInfo Terminator;
+  Terminator.Size = TII->getInstSizeInBytes(MI);
+  if (MI->isConditionalBranch() || MI->isUnconditionalBranch()) {
+    switch (MI->getOpcode()) {
+    case SystemZ::J:
+      // Relaxes to JG, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::BRC:
+      // Relaxes to BRCL, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::BRCT:
+    case SystemZ::BRCTG:
+      // Relaxes to A(G)HI and BRCL, which is 6 bytes longer.
+      Terminator.ExtraRelaxSize = 6;
+      break;
+    case SystemZ::CRJ:
+    case SystemZ::CLRJ:
+      // Relaxes to a C(L)R/BRCL sequence, which is 2 bytes longer.
+      Terminator.ExtraRelaxSize = 2;
+      break;
+    case SystemZ::CGRJ:
+    case SystemZ::CLGRJ:
+      // Relaxes to a C(L)GR/BRCL sequence, which is 4 bytes longer.
+      Terminator.ExtraRelaxSize = 4;
+      break;
+    case SystemZ::CIJ:
+    case SystemZ::CGIJ:
+      // Relaxes to a C(G)HI/BRCL sequence, which is 4 bytes longer.
+      Terminator.ExtraRelaxSize = 4;
+      break;
+    case SystemZ::CLIJ:
+    case SystemZ::CLGIJ:
+      // Relaxes to a CL(G)FI/BRCL sequence, which is 6 bytes longer.
+      Terminator.ExtraRelaxSize = 6;
+      break;
+    default:
+      llvm_unreachable("Unrecognized branch instruction");
+    }
+    Terminator.Branch = MI;
+    Terminator.TargetBlock =
+      TII->getBranchInfo(MI).Target->getMBB()->getNumber();
+  }
+  return Terminator;
+}
+
+// Fill MBBs and Terminators, setting the addresses on the assumption
+// that no branches need relaxation.  Return the size of the function under
+// this assumption.
+uint64_t SystemZLongBranch::initMBBInfo() {
+  MF->RenumberBlocks();
+  unsigned NumBlocks = MF->size();
+
+  MBBs.clear();
+  MBBs.resize(NumBlocks);
+
+  Terminators.clear();
+  Terminators.reserve(NumBlocks);
+
+  BlockPosition Position(MF->getAlignment());
+  for (unsigned I = 0; I < NumBlocks; ++I) {
+    MachineBasicBlock *MBB = MF->getBlockNumbered(I);
+    MBBInfo &Block = MBBs[I];
+
+    // Record the alignment, for quick access.
+    Block.Alignment = MBB->getAlignment();
+
+    // Calculate the size of the fixed part of the block.
+    MachineBasicBlock::iterator MI = MBB->begin();
+    MachineBasicBlock::iterator End = MBB->end();
+    while (MI != End && !MI->isTerminator()) {
+      Block.Size += TII->getInstSizeInBytes(MI);
+      ++MI;
+    }
+    skipNonTerminators(Position, Block);
+
+    // Add the terminators.
+    while (MI != End) {
+      if (!MI->isDebugValue()) {
+        assert(MI->isTerminator() && "Terminator followed by non-terminator");
+        Terminators.push_back(describeTerminator(MI));
+        skipTerminator(Position, Terminators.back(), false);
+        ++Block.NumTerminators;
+      }
+      ++MI;
+    }
+  }
+
+  return Position.Address;
+}
+
+// Return true if, under current assumptions, Terminator would need to be
+// relaxed if it were placed at address Address.
+bool SystemZLongBranch::mustRelaxBranch(const TerminatorInfo &Terminator,
+                                        uint64_t Address) {
+  if (!Terminator.Branch)
+    return false;
+
+  const MBBInfo &Target = MBBs[Terminator.TargetBlock];
+  if (Address >= Target.Address) {
+    if (Address - Target.Address <= MaxBackwardRange)
+      return false;
+  } else {
+    if (Target.Address - Address <= MaxForwardRange)
+      return false;
+  }
+
+  return true;
+}
+
+// Return true if, under current assumptions, any terminator needs
+// to be relaxed.
+bool SystemZLongBranch::mustRelaxABranch() {
+  for (SmallVectorImpl<TerminatorInfo>::iterator TI = Terminators.begin(),
+         TE = Terminators.end(); TI != TE; ++TI)
+    if (mustRelaxBranch(*TI, TI->Address))
+      return true;
+  return false;
+}
+
+// Set the address of each block on the assumption that all branches
+// must be long.
+void SystemZLongBranch::setWorstCaseAddresses() {
+  SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+  BlockPosition Position(MF->getAlignment());
+  for (SmallVectorImpl<MBBInfo>::iterator BI = MBBs.begin(), BE = MBBs.end();
+       BI != BE; ++BI) {
+    skipNonTerminators(Position, *BI);
+    for (unsigned BTI = 0, BTE = BI->NumTerminators; BTI != BTE; ++BTI) {
+      skipTerminator(Position, *TI, true);
+      ++TI;
+    }
+  }
+}
+
+// Split BRANCH ON COUNT MI into the addition given by AddOpcode followed
+// by a BRCL on the result.
+void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
+                                           unsigned AddOpcode) {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(AddOpcode))
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1))
+    .addImm(-1);
+  MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addImm(SystemZ::CCMASK_CMP_NE)
+    .addOperand(MI->getOperand(2));
+  // The implicit use of CC is a killing use.
+  BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+  MI->eraseFromParent();
+}
+
+// Split MI into the comparison given by CompareOpcode followed
+// a BRCL on the result.
+void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
+                                           unsigned CompareOpcode) {
+  MachineBasicBlock *MBB = MI->getParent();
+  DebugLoc DL = MI->getDebugLoc();
+  BuildMI(*MBB, MI, DL, TII->get(CompareOpcode))
+    .addOperand(MI->getOperand(0))
+    .addOperand(MI->getOperand(1));
+  MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+    .addImm(SystemZ::CCMASK_ICMP)
+    .addOperand(MI->getOperand(2))
+    .addOperand(MI->getOperand(3));
+  // The implicit use of CC is a killing use.
+  BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+  MI->eraseFromParent();
+}
+
+// Relax the branch described by Terminator.
+void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
+  MachineInstr *Branch = Terminator.Branch;
+  switch (Branch->getOpcode()) {
+  case SystemZ::J:
+    Branch->setDesc(TII->get(SystemZ::JG));
+    break;
+  case SystemZ::BRC:
+    Branch->setDesc(TII->get(SystemZ::BRCL));
+    break;
+  case SystemZ::BRCT:
+    splitBranchOnCount(Branch, SystemZ::AHI);
+    break;
+  case SystemZ::BRCTG:
+    splitBranchOnCount(Branch, SystemZ::AGHI);
+    break;
+  case SystemZ::CRJ:
+    splitCompareBranch(Branch, SystemZ::CR);
+    break;
+  case SystemZ::CGRJ:
+    splitCompareBranch(Branch, SystemZ::CGR);
+    break;
+  case SystemZ::CIJ:
+    splitCompareBranch(Branch, SystemZ::CHI);
+    break;
+  case SystemZ::CGIJ:
+    splitCompareBranch(Branch, SystemZ::CGHI);
+    break;
+  case SystemZ::CLRJ:
+    splitCompareBranch(Branch, SystemZ::CLR);
+    break;
+  case SystemZ::CLGRJ:
+    splitCompareBranch(Branch, SystemZ::CLGR);
+    break;
+  case SystemZ::CLIJ:
+    splitCompareBranch(Branch, SystemZ::CLFI);
+    break;
+  case SystemZ::CLGIJ:
+    splitCompareBranch(Branch, SystemZ::CLGFI);
+    break;
+  default:
+    llvm_unreachable("Unrecognized branch");
+  }
+
+  Terminator.Size += Terminator.ExtraRelaxSize;
+  Terminator.ExtraRelaxSize = 0;
+  Terminator.Branch = 0;
+
+  ++LongBranches;
+}
+
+// Run a shortening pass and relax any branches that need to be relaxed.
+void SystemZLongBranch::relaxBranches() {
+  SmallVector<TerminatorInfo, 16>::iterator TI = Terminators.begin();
+  BlockPosition Position(MF->getAlignment());
+  for (SmallVectorImpl<MBBInfo>::iterator BI = MBBs.begin(), BE = MBBs.end();
+       BI != BE; ++BI) {
+    skipNonTerminators(Position, *BI);
+    for (unsigned BTI = 0, BTE = BI->NumTerminators; BTI != BTE; ++BTI) {
+      assert(Position.Address <= TI->Address &&
+             "Addresses shouldn't go forwards");
+      if (mustRelaxBranch(*TI, Position.Address))
+        relaxBranch(*TI);
+      skipTerminator(Position, *TI, false);
+      ++TI;
+    }
+  }
+}
+
+bool SystemZLongBranch::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+  MF = &F;
+  uint64_t Size = initMBBInfo();
+  if (Size <= MaxForwardRange || !mustRelaxABranch())
+    return false;
+
+  setWorstCaseAddresses();
+  relaxBranches();
+  return true;
+}
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.cpp b/lib/Target/SystemZ/SystemZMCInstLower.cpp
index 5d83321..ff9a6c0 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.cpp
+++ b/lib/Target/SystemZ/SystemZMCInstLower.cpp
@@ -15,20 +15,6 @@
 
 using namespace llvm;
 
-// Where relaxable pairs of reloc-generating instructions exist,
-// we tend to use the longest form by default, since that produces
-// correct assembly in cases where no relaxation is performed.
-// If Opcode is one such instruction, return the opcode for the
-// shortest possible form instead, otherwise return Opcode itself.
-static unsigned getShortenedInstr(unsigned Opcode) {
-  switch (Opcode) {
-  case SystemZ::BRCL:  return SystemZ::BRC;
-  case SystemZ::JG:    return SystemZ::J;
-  case SystemZ::BRASL: return SystemZ::BRAS;
-  }
-  return Opcode;
-}
-
 // Return the VK_* enumeration for MachineOperand target flags Flags.
 static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
   switch (Flags & SystemZII::MO_SYMBOL_MODIFIER) {
@@ -40,77 +26,75 @@ static MCSymbolRefExpr::VariantKind getVariantKind(unsigned Flags) {
   llvm_unreachable("Unrecognised MO_ACCESS_MODEL");
 }
 
-SystemZMCInstLower::SystemZMCInstLower(Mangler *mang, MCContext &ctx,
+SystemZMCInstLower::SystemZMCInstLower(MCContext &ctx,
                                        SystemZAsmPrinter &asmprinter)
-  : Mang(mang), Ctx(ctx), AsmPrinter(asmprinter) {}
+  : Ctx(ctx), AsmPrinter(asmprinter) {}
 
-MCOperand SystemZMCInstLower::lowerSymbolOperand(const MachineOperand &MO,
-                                                 const MCSymbol *Symbol,
-                                                 int64_t Offset) const {
-  MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
-  if (Offset) {
-    const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
-    Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+const MCExpr *
+SystemZMCInstLower::getExpr(const MachineOperand &MO,
+                            MCSymbolRefExpr::VariantKind Kind) const {
+  const MCSymbol *Symbol;
+  bool HasOffset = true;
+  switch (MO.getType()) {
+  case MachineOperand::MO_MachineBasicBlock:
+    Symbol = MO.getMBB()->getSymbol();
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_GlobalAddress:
+    Symbol = AsmPrinter.getSymbol(MO.getGlobal());
+    break;
+
+  case MachineOperand::MO_ExternalSymbol:
+    Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+
+  case MachineOperand::MO_JumpTableIndex:
+    Symbol = AsmPrinter.GetJTISymbol(MO.getIndex());
+    HasOffset = false;
+    break;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    Symbol = AsmPrinter.GetCPISymbol(MO.getIndex());
+    break;
+
+  case MachineOperand::MO_BlockAddress:
+    Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+
+  default:
+    llvm_unreachable("unknown operand type");
   }
-  return MCOperand::CreateExpr(Expr);
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Symbol, Kind, Ctx);
+  if (HasOffset)
+    if (int64_t Offset = MO.getOffset()) {
+      const MCExpr *OffsetExpr = MCConstantExpr::Create(Offset, Ctx);
+      Expr = MCBinaryExpr::CreateAdd(Expr, OffsetExpr, Ctx);
+    }
+  return Expr;
 }
 
 MCOperand SystemZMCInstLower::lowerOperand(const MachineOperand &MO) const {
   switch (MO.getType()) {
-  default:
-    llvm_unreachable("unknown operand type");
-
   case MachineOperand::MO_Register:
-    // Ignore all implicit register operands.
-    if (MO.isImplicit())
-      return MCOperand();
     return MCOperand::CreateReg(MO.getReg());
 
   case MachineOperand::MO_Immediate:
     return MCOperand::CreateImm(MO.getImm());
 
-  case MachineOperand::MO_MachineBasicBlock:
-    return lowerSymbolOperand(MO, MO.getMBB()->getSymbol(),
-                              /* MO has no offset field */0);
-
-  case MachineOperand::MO_GlobalAddress:
-    return lowerSymbolOperand(MO, Mang->getSymbol(MO.getGlobal()),
-                              MO.getOffset());
-
-  case MachineOperand::MO_ExternalSymbol: {
-    StringRef Name = MO.getSymbolName();
-    return lowerSymbolOperand(MO, AsmPrinter.GetExternalSymbolSymbol(Name),
-                              MO.getOffset());
-  }
-
-  case MachineOperand::MO_JumpTableIndex:
-    return lowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()),
-                              /* MO has no offset field */0);
-
-  case MachineOperand::MO_ConstantPoolIndex:
-    return lowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()),
-                              MO.getOffset());
-
-  case MachineOperand::MO_BlockAddress: {
-    const BlockAddress *BA = MO.getBlockAddress();
-    return lowerSymbolOperand(MO, AsmPrinter.GetBlockAddressSymbol(BA),
-                              MO.getOffset());
+  default: {
+    MCSymbolRefExpr::VariantKind Kind = getVariantKind(MO.getTargetFlags());
+    return MCOperand::CreateExpr(getExpr(MO, Kind));
   }
   }
 }
 
 void SystemZMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
-  unsigned Opcode = MI->getOpcode();
-  // When emitting binary code, start with the shortest form of an instruction
-  // and then relax it where necessary.
-  if (!AsmPrinter.OutStreamer.hasRawTextSupport())
-    Opcode = getShortenedInstr(Opcode);
-  OutMI.setOpcode(Opcode);
+  OutMI.setOpcode(MI->getOpcode());
   for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
     const MachineOperand &MO = MI->getOperand(I);
-    MCOperand MCOp = lowerOperand(MO);
-    if (MCOp.isValid())
-      OutMI.addOperand(MCOp);
+    // Ignore all implicit register operands.
+    if (!MO.isReg() || !MO.isImplicit())
+      OutMI.addOperand(lowerOperand(MO));
   }
 }
diff --git a/lib/Target/SystemZ/SystemZMCInstLower.h b/lib/Target/SystemZ/SystemZMCInstLower.h
index afa72f3..f6d5ac8 100644
--- a/lib/Target/SystemZ/SystemZMCInstLower.h
+++ b/lib/Target/SystemZ/SystemZMCInstLower.h
@@ -10,37 +10,34 @@
 #ifndef LLVM_SYSTEMZMCINSTLOWER_H
 #define LLVM_SYSTEMZMCINSTLOWER_H
 
+#include "llvm/MC/MCExpr.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
-class MCContext;
 class MCInst;
 class MCOperand;
-class MCSymbol;
 class MachineInstr;
 class MachineOperand;
 class Mangler;
 class SystemZAsmPrinter;
 
 class LLVM_LIBRARY_VISIBILITY SystemZMCInstLower {
-  Mangler *Mang;
   MCContext &Ctx;
   SystemZAsmPrinter &AsmPrinter;
 
 public:
-  SystemZMCInstLower(Mangler *mang, MCContext &ctx,
-                     SystemZAsmPrinter &asmPrinter);
+  SystemZMCInstLower(MCContext &ctx, SystemZAsmPrinter &asmPrinter);
 
   // Lower MachineInstr MI to MCInst OutMI.
   void lower(const MachineInstr *MI, MCInst &OutMI) const;
 
-  // Return an MCOperand for MO.  Return an empty operand if MO is implicit.
+  // Return an MCOperand for MO.
   MCOperand lowerOperand(const MachineOperand& MO) const;
 
-  // Return an MCOperand for MO, given that it equals Symbol + Offset.
-  MCOperand lowerSymbolOperand(const MachineOperand &MO,
-                               const MCSymbol *Symbol, int64_t Offset) const;
+  // Return an MCExpr for symbolic operand MO with variant kind Kind.
+  const MCExpr *getExpr(const MachineOperand &MO,
+                        MCSymbolRefExpr::VariantKind Kind) const;
 };
 } // end namespace llvm
 
diff --git a/lib/Target/NVPTX/NVPTXNumRegisters.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
index a95c16b..00572d0 100644
--- a/lib/Target/NVPTX/NVPTXNumRegisters.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.cpp
@@ -1,5 +1,4 @@
-
-//===-- NVPTXNumRegisters.h - PTX Register Info ---------------------------===//
+//== SystemZMachineFuctionInfo.cpp - SystemZ machine function info-*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,9 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef NVPTX_NUM_REGISTERS_H
-#define NVPTX_NUM_REGISTERS_H
+#include "SystemZMachineFunctionInfo.h"
+
+using namespace llvm;
+
 
-namespace llvm { const unsigned NVPTXNumRegisters = 396; }
+// pin vtable to this file
+void SystemZMachineFunctionInfo::anchor() {}
 
-#endif
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index 1dc05a7e..845291f 100644
--- a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -15,7 +15,7 @@
 namespace llvm {
 
 class SystemZMachineFunctionInfo : public MachineFunctionInfo {
-  unsigned SavedGPRFrameSize;
+  virtual void anchor();
   unsigned LowSavedGPR;
   unsigned HighSavedGPR;
   unsigned VarArgsFirstGPR;
@@ -26,14 +26,8 @@ class SystemZMachineFunctionInfo : public MachineFunctionInfo {
 
 public:
   explicit SystemZMachineFunctionInfo(MachineFunction &MF)
-    : SavedGPRFrameSize(0), LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0),
-      VarArgsFirstFPR(0), VarArgsFrameIndex(0), RegSaveFrameIndex(0),
-      ManipulatesSP(false) {}
-
-  // Get and set the number of bytes allocated by generic code to store
-  // call-saved GPRs.
-  unsigned getSavedGPRFrameSize() const { return SavedGPRFrameSize; }
-  void setSavedGPRFrameSize(unsigned bytes) { SavedGPRFrameSize = bytes; }
+    : LowSavedGPR(0), HighSavedGPR(0), VarArgsFirstGPR(0), VarArgsFirstFPR(0),
+      VarArgsFrameIndex(0), RegSaveFrameIndex(0), ManipulatesSP(false) {}
 
   // Get and set the first call-saved GPR that should be saved and restored
   // by this function.  This is 0 if no GPRs need to be saved or restored.
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
index 0abc3f7..3ad146c 100644
--- a/lib/Target/SystemZ/SystemZOperands.td
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -24,57 +24,93 @@ class ImmediateAsmOperand<string name>
 class Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop>
   : PatLeaf<(vt imm), pred, xform>, Operand<vt> {
   let PrintMethod = "print"##asmop##"Operand";
+  let DecoderMethod = "decode"##asmop##"Operand";
   let ParserMatchClass = !cast<AsmOperandClass>(asmop);
 }
 
+// Constructs an asm operand for a PC-relative address.  SIZE says how
+// many bits there are.
+class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"##size> {
+  let PredicateMethod = "isImm";
+  let ParserMethod = "parsePCRel"##size;
+}
+
+// Constructs an operand for a PC-relative address with address type VT.
+// ASMOP is the associated asm operand.
+class PCRelOperand<ValueType vt, AsmOperandClass asmop> : Operand<vt> {
+  let PrintMethod = "printPCRelOperand";
+  let ParserMatchClass = asmop;
+}
+
 // Constructs both a DAG pattern and instruction operand for a PC-relative
-// address with address size VT.  SELF is the name of the operand.
-class PCRelAddress<ValueType vt, string self>
-  : ComplexPattern<vt, 1, "selectPCRelAddress", [z_pcrel_wrapper]>,
-    Operand<vt> {
+// address with address size VT.  SELF is the name of the operand and
+// ASMOP is the associated asm operand.
+class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
+  : ComplexPattern<vt, 1, "selectPCRelAddress",
+                   [z_pcrel_wrapper, z_pcrel_offset]>,
+    PCRelOperand<vt, asmop> {
   let MIOperandInfo = (ops !cast<Operand>(self));
 }
 
 // Constructs an AsmOperandClass for addressing mode FORMAT, treating the
 // registers as having BITSIZE bits and displacements as having DISPSIZE bits.
-class AddressAsmOperand<string format, string bitsize, string dispsize>
+// LENGTH is "LenN" for addresses with an N-bit length field, otherwise it
+// is "".
+class AddressAsmOperand<string format, string bitsize, string dispsize,
+                        string length = "">
   : AsmOperandClass {
-  let Name = format##bitsize##"Disp"##dispsize;
+  let Name = format##bitsize##"Disp"##dispsize##length;
   let ParserMethod = "parse"##format##bitsize;
   let RenderMethod = "add"##format##"Operands";
 }
 
 // Constructs both a DAG pattern and instruction operand for an addressing mode.
-// The mode is selected by custom code in selectTYPE...SUFFIX().  The address
-// registers have BITSIZE bits and displacements have DISPSIZE bits.  NUMOPS is
-// the number of operands that make up an address and OPERANDS lists the types
-// of those operands using (ops ...).  FORMAT is the type of addressing mode,
-// which needs to match the names used in AddressAsmOperand.
-class AddressingMode<string type, string bitsize, string dispsize,
-                     string suffix, int numops, string format, dag operands>
+// FORMAT, BITSIZE, DISPSIZE and LENGTH are the parameters to an associated
+// AddressAsmOperand.  OPERANDS is a list of NUMOPS individual operands
+// (base register, displacement, etc.).  SELTYPE is the type of the memory
+// operand for selection purposes; sometimes we want different selection
+// choices for the same underlying addressing mode.  SUFFIX is similarly
+// a suffix appended to the displacement for selection purposes;
+// e.g. we want to reject small 20-bit displacements if a 12-bit form
+// also exists, but we want to accept them otherwise.
+class AddressingMode<string seltype, string bitsize, string dispsize,
+                     string suffix, string length, int numops, string format,
+                     dag operands>
   : ComplexPattern<!cast<ValueType>("i"##bitsize), numops,
-                   "select"##type##dispsize##suffix,
+                   "select"##seltype##dispsize##suffix##length,
                    [add, sub, or, frameindex, z_adjdynalloc]>,
     Operand<!cast<ValueType>("i"##bitsize)> {
   let PrintMethod = "print"##format##"Operand";
+  let EncoderMethod = "get"##format##dispsize##length##"Encoding";
+  let DecoderMethod =
+    "decode"##format##bitsize##"Disp"##dispsize##length##"Operand";
   let MIOperandInfo = operands;
   let ParserMatchClass =
-    !cast<AddressAsmOperand>(format##bitsize##"Disp"##dispsize);
+    !cast<AddressAsmOperand>(format##bitsize##"Disp"##dispsize##length);
 }
 
 // An addressing mode with a base and displacement but no index.
 class BDMode<string type, string bitsize, string dispsize, string suffix>
-  : AddressingMode<type, bitsize, dispsize, suffix, 2, "BDAddr",
+  : AddressingMode<type, bitsize, dispsize, suffix, "", 2, "BDAddr",
                    (ops !cast<RegisterOperand>("ADDR"##bitsize),
                         !cast<Immediate>("disp"##dispsize##"imm"##bitsize))>;
 
 // An addressing mode with a base, displacement and index.
 class BDXMode<string type, string bitsize, string dispsize, string suffix>
-  : AddressingMode<type, bitsize, dispsize, suffix, 3, "BDXAddr",
+  : AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDXAddr",
                    (ops !cast<RegisterOperand>("ADDR"##bitsize),
                         !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
                         !cast<RegisterOperand>("ADDR"##bitsize))>;
 
+// A BDMode paired with an immediate length operand of LENSIZE bits.
+class BDLMode<string type, string bitsize, string dispsize, string suffix,
+              string lensize>
+  : AddressingMode<type, bitsize, dispsize, suffix, "Len"##lensize, 3,
+                   "BDLAddr",
+                   (ops !cast<RegisterOperand>("ADDR"##bitsize),
+                        !cast<Immediate>("disp"##dispsize##"imm"##bitsize),
+                        !cast<Immediate>("imm"##bitsize))>;
+
 //===----------------------------------------------------------------------===//
 // Extracting immediate operands from nodes
 // These all create MVT::i64 nodes to ensure the value is not sign-extended
@@ -298,6 +334,10 @@ def imm64sx8 : Immediate<i64, [{
   return isInt<8>(N->getSExtValue());
 }], SIMM8, "S8Imm">;
 
+def imm64zx8 : Immediate<i64, [{
+  return isUInt<8>(N->getSExtValue());
+}], UIMM8, "U8Imm">;
+
 def imm64sx16 : Immediate<i64, [{
   return isInt<16>(N->getSExtValue());
 }], SIMM16, "S16Imm">;
@@ -318,7 +358,7 @@ def imm64zx32n : Immediate<i64, [{
   return isUInt<32>(-N->getSExtValue());
 }], NEGIMM32, "U32Imm">;
 
-def imm64 : ImmLeaf<i64, [{}]>;
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
 
 //===----------------------------------------------------------------------===//
 // Floating-point immediates
@@ -334,30 +374,26 @@ def fpimmneg0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(-0.0); }]>;
 // Symbolic address operands
 //===----------------------------------------------------------------------===//
 
+// PC-relative asm operands.
+def PCRel16 : PCRelAsmOperand<"16">;
+def PCRel32 : PCRelAsmOperand<"32">;
+
 // PC-relative offsets of a basic block.  The offset is sign-extended
 // and multiplied by 2.
-def brtarget16 : Operand<OtherVT> {
+def brtarget16 : PCRelOperand<OtherVT, PCRel16> {
   let EncoderMethod = "getPC16DBLEncoding";
+  let DecoderMethod = "decodePC16DBLOperand";
 }
-def brtarget32 : Operand<OtherVT> {
+def brtarget32 : PCRelOperand<OtherVT, PCRel32> {
   let EncoderMethod = "getPC32DBLEncoding";
+  let DecoderMethod = "decodePC32DBLOperand";
 }
 
 // A PC-relative offset of a global value.  The offset is sign-extended
 // and multiplied by 2.
-def pcrel32 : PCRelAddress<i64, "pcrel32"> {
+def pcrel32 : PCRelAddress<i64, "pcrel32", PCRel32> {
   let EncoderMethod = "getPC32DBLEncoding";
-}
-
-// A PC-relative offset of a global value when the value is used as a
-// call target.  The offset is sign-extended and multiplied by 2.
-def pcrel16call : PCRelAddress<i64, "pcrel16call"> {
-  let PrintMethod = "printCallOperand";
-  let EncoderMethod = "getPLT16DBLEncoding";
-}
-def pcrel32call : PCRelAddress<i64, "pcrel32call"> {
-  let PrintMethod = "printCallOperand";
-  let EncoderMethod = "getPLT32DBLEncoding";
+  let DecoderMethod = "decodePC32DBLOperand";
 }
 
 //===----------------------------------------------------------------------===//
@@ -372,22 +408,25 @@ def disp12imm64 : Operand<i64>;
 def disp20imm32 : Operand<i32>;
 def disp20imm64 : Operand<i64>;
 
-def BDAddr32Disp12  : AddressAsmOperand<"BDAddr",  "32", "12">;
-def BDAddr32Disp20  : AddressAsmOperand<"BDAddr",  "32", "20">;
-def BDAddr64Disp12  : AddressAsmOperand<"BDAddr",  "64", "12">;
-def BDAddr64Disp20  : AddressAsmOperand<"BDAddr",  "64", "20">;
-def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">;
-def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
+def BDAddr32Disp12      : AddressAsmOperand<"BDAddr",   "32", "12">;
+def BDAddr32Disp20      : AddressAsmOperand<"BDAddr",   "32", "20">;
+def BDAddr64Disp12      : AddressAsmOperand<"BDAddr",   "64", "12">;
+def BDAddr64Disp20      : AddressAsmOperand<"BDAddr",   "64", "20">;
+def BDXAddr64Disp12     : AddressAsmOperand<"BDXAddr",  "64", "12">;
+def BDXAddr64Disp20     : AddressAsmOperand<"BDXAddr",  "64", "20">;
+def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr",  "64", "12", "Len8">;
 
 // DAG patterns and operands for addressing modes.  Each mode has
-// the form <type><range><group> where:
+// the form <type><range><group>[<len>] where:
 //
 // <type> is one of:
 //   shift    : base + displacement (32-bit)
 //   bdaddr   : base + displacement
+//   mviaddr  : like bdaddr, but reject cases with a natural index
 //   bdxaddr  : base + displacement + index
 //   laaddr   : like bdxaddr, but used for Load Address operations
 //   dynalloc : base + displacement + index + ADJDYNALLOC
+//   bdladdr  : base + displacement with a length field
 //
 // <range> is one of:
 //   12       : the displacement is an unsigned 12-bit value
@@ -398,20 +437,28 @@ def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">;
 //              range value (12 or 20)
 //   only     : used when there is no equivalent instruction with the opposite
 //              range value
-def shift12only      : BDMode <"BDAddr",   "32", "12", "Only">;
-def shift20only      : BDMode <"BDAddr",   "32", "20", "Only">;
-def bdaddr12only     : BDMode <"BDAddr",   "64", "12", "Only">;
-def bdaddr12pair     : BDMode <"BDAddr",   "64", "12", "Pair">;
-def bdaddr20only     : BDMode <"BDAddr",   "64", "20", "Only">;
-def bdaddr20pair     : BDMode <"BDAddr",   "64", "20", "Pair">;
-def bdxaddr12only    : BDXMode<"BDXAddr",  "64", "12", "Only">;
-def bdxaddr12pair    : BDXMode<"BDXAddr",  "64", "12", "Pair">;
-def bdxaddr20only    : BDXMode<"BDXAddr",  "64", "20", "Only">;
-def bdxaddr20only128 : BDXMode<"BDXAddr",  "64", "20", "Only128">;
-def bdxaddr20pair    : BDXMode<"BDXAddr",  "64", "20", "Pair">;
-def dynalloc12only   : BDXMode<"DynAlloc", "64", "12", "Only">;
-def laaddr12pair     : BDXMode<"LAAddr",   "64", "12", "Pair">;
-def laaddr20pair     : BDXMode<"LAAddr",   "64", "20", "Pair">;
+//
+// <len> is one of:
+//
+//   <empty>  : there is no length field
+//   len8     : the length field is 8 bits, with a range of [1, 0x100].
+def shift12only       : BDMode <"BDAddr",   "32", "12", "Only">;
+def shift20only       : BDMode <"BDAddr",   "32", "20", "Only">;
+def bdaddr12only      : BDMode <"BDAddr",   "64", "12", "Only">;
+def bdaddr12pair      : BDMode <"BDAddr",   "64", "12", "Pair">;
+def bdaddr20only      : BDMode <"BDAddr",   "64", "20", "Only">;
+def bdaddr20pair      : BDMode <"BDAddr",   "64", "20", "Pair">;
+def mviaddr12pair     : BDMode <"MVIAddr",  "64", "12", "Pair">;
+def mviaddr20pair     : BDMode <"MVIAddr",  "64", "20", "Pair">;
+def bdxaddr12only     : BDXMode<"BDXAddr",  "64", "12", "Only">;
+def bdxaddr12pair     : BDXMode<"BDXAddr",  "64", "12", "Pair">;
+def bdxaddr20only     : BDXMode<"BDXAddr",  "64", "20", "Only">;
+def bdxaddr20only128  : BDXMode<"BDXAddr",  "64", "20", "Only128">;
+def bdxaddr20pair     : BDXMode<"BDXAddr",  "64", "20", "Pair">;
+def dynalloc12only    : BDXMode<"DynAlloc", "64", "12", "Only">;
+def laaddr12pair      : BDXMode<"LAAddr",   "64", "12", "Pair">;
+def laaddr20pair      : BDXMode<"LAAddr",   "64", "20", "Pair">;
+def bdladdr12onlylen8 : BDLMode<"BDLAddr",  "64", "12", "Only", "8">;
 
 //===----------------------------------------------------------------------===//
 // Miscellaneous
diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td
index 8c4df56..31cabaa 100644
--- a/lib/Target/SystemZ/SystemZOperators.td
+++ b/lib/Target/SystemZ/SystemZOperators.td
@@ -15,16 +15,25 @@ def SDT_CallSeqEnd          : SDCallSeqEnd<[SDTCisVT<0, i64>,
                                             SDTCisVT<1, i64>]>;
 def SDT_ZCall               : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
 def SDT_ZCmp                : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
-def SDT_ZBRCCMask           : SDTypeProfile<0, 2,
+def SDT_ZICmp               : SDTypeProfile<0, 3,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisVT<2, i32>]>;
+def SDT_ZBRCCMask           : SDTypeProfile<0, 3,
                                             [SDTCisVT<0, i8>,
-                                             SDTCisVT<1, OtherVT>]>;
-def SDT_ZSelectCCMask       : SDTypeProfile<1, 3,
+                                             SDTCisVT<1, i8>,
+                                             SDTCisVT<2, OtherVT>]>;
+def SDT_ZSelectCCMask       : SDTypeProfile<1, 4,
                                             [SDTCisSameAs<0, 1>,
                                              SDTCisSameAs<1, 2>,
-                                             SDTCisVT<3, i8>]>;
+                                             SDTCisVT<3, i8>,
+                                             SDTCisVT<4, i8>]>;
 def SDT_ZWrapPtr            : SDTypeProfile<1, 1,
                                             [SDTCisSameAs<0, 1>,
                                              SDTCisPtrTy<0>]>;
+def SDT_ZWrapOffset         : SDTypeProfile<1, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisPtrTy<0>]>;
 def SDT_ZAdjDynAlloc        : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
 def SDT_ZExtractAccess      : SDTypeProfile<1, 1,
                                             [SDTCisVT<0, i32>,
@@ -52,6 +61,24 @@ def SDT_ZAtomicCmpSwapW     : SDTypeProfile<1, 6,
                                              SDTCisVT<4, i32>,
                                              SDTCisVT<5, i32>,
                                              SDTCisVT<6, i32>]>;
+def SDT_ZMemMemLength       : SDTypeProfile<0, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop         : SDTypeProfile<0, 4,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisVT<2, i64>,
+                                             SDTCisVT<3, i64>]>;
+def SDT_ZString             : SDTypeProfile<1, 3,
+                                            [SDTCisPtrTy<0>,
+                                             SDTCisPtrTy<1>,
+                                             SDTCisPtrTy<2>,
+                                             SDTCisVT<3, i32>]>;
+def SDT_ZI32Intrinsic       : SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>;
+def SDT_ZPrefetch           : SDTypeProfile<0, 2,
+                                            [SDTCisVT<0, i8>,
+                                             SDTCisPtrTy<1>]>;
 
 //===----------------------------------------------------------------------===//
 // Node definitions
@@ -70,9 +97,15 @@ def z_retflag           : SDNode<"SystemZISD::RET_FLAG", SDTNone,
 def z_call              : SDNode<"SystemZISD::CALL", SDT_ZCall,
                                  [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
                                   SDNPVariadic]>;
+def z_sibcall           : SDNode<"SystemZISD::SIBCALL", SDT_ZCall,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
+                                  SDNPVariadic]>;
 def z_pcrel_wrapper     : SDNode<"SystemZISD::PCREL_WRAPPER", SDT_ZWrapPtr, []>;
-def z_cmp               : SDNode<"SystemZISD::CMP", SDT_ZCmp, [SDNPOutGlue]>;
-def z_ucmp              : SDNode<"SystemZISD::UCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_pcrel_offset      : SDNode<"SystemZISD::PCREL_OFFSET",
+                                 SDT_ZWrapOffset, []>;
+def z_icmp              : SDNode<"SystemZISD::ICMP", SDT_ZICmp, [SDNPOutGlue]>;
+def z_fcmp              : SDNode<"SystemZISD::FCMP", SDT_ZCmp, [SDNPOutGlue]>;
+def z_tm                : SDNode<"SystemZISD::TM", SDT_ZICmp, [SDNPOutGlue]>;
 def z_br_ccmask         : SDNode<"SystemZISD::BR_CCMASK", SDT_ZBRCCMask,
                                  [SDNPHasChain, SDNPInGlue]>;
 def z_select_ccmask     : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask,
@@ -81,6 +114,7 @@ def z_adjdynalloc       : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
 def z_extract_access    : SDNode<"SystemZISD::EXTRACT_ACCESS",
                                  SDT_ZExtractAccess>;
 def z_umul_lohi64       : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>;
+def z_sdivrem32         : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>;
 def z_sdivrem64         : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>;
 def z_udivrem32         : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>;
 def z_udivrem64         : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>;
@@ -102,10 +136,56 @@ def z_atomic_loadw_umin : AtomicWOp<"ATOMIC_LOADW_UMIN">;
 def z_atomic_loadw_umax : AtomicWOp<"ATOMIC_LOADW_UMAX">;
 def z_atomic_cmp_swapw  : AtomicWOp<"ATOMIC_CMP_SWAPW", SDT_ZAtomicCmpSwapW>;
 
+def z_mvc               : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop          : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc                : SDNode<"SystemZISD::NC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_nc_loop           : SDNode<"SystemZISD::NC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc                : SDNode<"SystemZISD::OC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_oc_loop           : SDNode<"SystemZISD::OC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc                : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_xc_loop           : SDNode<"SystemZISD::XC_LOOP", SDT_ZMemMemLoop,
+                                  [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_clc               : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop          : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_strcmp            : SDNode<"SystemZISD::STRCMP", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_stpcpy            : SDNode<"SystemZISD::STPCPY", SDT_ZString,
+                                 [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_search_string     : SDNode<"SystemZISD::SEARCH_STRING", SDT_ZString,
+                                 [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_ipm               : SDNode<"SystemZISD::IPM", SDT_ZI32Intrinsic,
+                                 [SDNPInGlue]>;
+def z_prefetch          : SDNode<"SystemZISD::PREFETCH", SDT_ZPrefetch,
+                                 [SDNPHasChain, SDNPMayLoad, SDNPMayStore,
+                                  SDNPMemOperand]>;
+
 //===----------------------------------------------------------------------===//
 // Pattern fragments
 //===----------------------------------------------------------------------===//
 
+// Signed and unsigned comparisons.
+def z_scmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::UnsignedOnly;
+}]>;
+def z_ucmp : PatFrag<(ops node:$a, node:$b), (z_icmp node:$a, node:$b, imm), [{
+  unsigned Type = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+  return Type != SystemZICMP::SignedOnly;
+}]>;
+
+// Register- and memory-based TEST UNDER MASK.
+def z_tm_reg : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, imm)>;
+def z_tm_mem : PatFrag<(ops node:$a, node:$b), (z_tm node:$a, node:$b, 0)>;
+
 // Register sign-extend operations.  Sub-32-bit values are represented as i32s.
 def sext8  : PatFrag<(ops node:$src), (sext_inreg node:$src, i8)>;
 def sext16 : PatFrag<(ops node:$src), (sext_inreg node:$src, i16)>;
@@ -120,17 +200,61 @@ def zext32 : PatFrag<(ops node:$src), (zext (i32 node:$src))>;
 def loadf32 : PatFrag<(ops node:$src), (f32 (load node:$src))>;
 def loadf64 : PatFrag<(ops node:$src), (f64 (load node:$src))>;
 
+// Extending loads in which the extension type can be signed.
+def asextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::SEXTLOAD;
+}]>;
+def asextloadi8 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def asextloadi16 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def asextloadi32 : PatFrag<(ops node:$ptr), (asextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type can be unsigned.
+def azextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  unsigned Type = cast<LoadSDNode>(N)->getExtensionType();
+  return Type == ISD::EXTLOAD || Type == ISD::ZEXTLOAD;
+}]>;
+def azextloadi8 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def azextloadi16 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def azextloadi32 : PatFrag<(ops node:$ptr), (azextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+// Extending loads in which the extension type doesn't matter.
+def anyextload : PatFrag<(ops node:$ptr), (unindexedload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getExtensionType() != ISD::NON_EXTLOAD;
+}]>;
+def anyextloadi8 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+def anyextloadi16 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+def anyextloadi32 : PatFrag<(ops node:$ptr), (anyextload node:$ptr), [{
+  return cast<LoadSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
 // Aligned loads.
 class AlignedLoad<SDPatternOperator load>
   : PatFrag<(ops node:$addr), (load node:$addr), [{
   LoadSDNode *Load = cast<LoadSDNode>(N);
   return Load->getAlignment() >= Load->getMemoryVT().getStoreSize();
 }]>;
-def aligned_load        : AlignedLoad<load>;
-def aligned_sextloadi16 : AlignedLoad<sextloadi16>;
-def aligned_sextloadi32 : AlignedLoad<sextloadi32>;
-def aligned_zextloadi16 : AlignedLoad<zextloadi16>;
-def aligned_zextloadi32 : AlignedLoad<zextloadi32>;
+def aligned_load         : AlignedLoad<load>;
+def aligned_asextloadi16 : AlignedLoad<asextloadi16>;
+def aligned_asextloadi32 : AlignedLoad<asextloadi32>;
+def aligned_azextloadi16 : AlignedLoad<azextloadi16>;
+def aligned_azextloadi32 : AlignedLoad<azextloadi32>;
 
 // Aligned stores.
 class AlignedStore<SDPatternOperator store>
@@ -142,6 +266,54 @@ def aligned_store         : AlignedStore<store>;
 def aligned_truncstorei16 : AlignedStore<truncstorei16>;
 def aligned_truncstorei32 : AlignedStore<truncstorei32>;
 
+// Non-volatile loads.  Used for instructions that might access the storage
+// location multiple times.
+class NonvolatileLoad<SDPatternOperator load>
+  : PatFrag<(ops node:$addr), (load node:$addr), [{
+  LoadSDNode *Load = cast<LoadSDNode>(N);
+  return !Load->isVolatile();
+}]>;
+def nonvolatile_load          : NonvolatileLoad<load>;
+def nonvolatile_anyextloadi8  : NonvolatileLoad<anyextloadi8>;
+def nonvolatile_anyextloadi16 : NonvolatileLoad<anyextloadi16>;
+def nonvolatile_anyextloadi32 : NonvolatileLoad<anyextloadi32>;
+
+// Non-volatile stores.
+class NonvolatileStore<SDPatternOperator store>
+  : PatFrag<(ops node:$src, node:$addr), (store node:$src, node:$addr), [{
+  StoreSDNode *Store = cast<StoreSDNode>(N);
+  return !Store->isVolatile();
+}]>;
+def nonvolatile_store         : NonvolatileStore<store>;
+def nonvolatile_truncstorei8  : NonvolatileStore<truncstorei8>;
+def nonvolatile_truncstorei16 : NonvolatileStore<truncstorei16>;
+def nonvolatile_truncstorei32 : NonvolatileStore<truncstorei32>;
+
+// A store of a load that can be implemented using MVC.
+def mvc_store : PatFrag<(ops node:$value, node:$addr),
+                        (unindexedstore node:$value, node:$addr),
+                        [{ return storeLoadCanUseMVC(N); }]>;
+
+// Binary read-modify-write operations on memory in which the other
+// operand is also memory and for which block operations like NC can
+// be used.  There are two patterns for each operator, depending on
+// which operand contains the "other" load.
+multiclass block_op<SDPatternOperator operator> {
+  def "1" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator node:$value,
+                                              (unindexedload node:$addr)),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 0); }]>;
+  def "2" : PatFrag<(ops node:$value, node:$addr),
+                    (unindexedstore (operator (unindexedload node:$addr),
+                                              node:$value),
+                                    node:$addr),
+                    [{ return storeLoadCanUseBlockBinary(N, 1); }]>;
+}
+defm block_and : block_op<and>;
+defm block_or  : block_op<or>;
+defm block_xor : block_op<xor>;
+
 // Insertions.
 def inserti8 : PatFrag<(ops node:$src1, node:$src2),
                        (or (and node:$src1, -256), node:$src2)>;
@@ -174,6 +346,16 @@ def or_as_revinserti8 : PatFrag<(ops node:$src1, node:$src2),
                                    APInt::getLowBitsSet(BitWidth, 8));
 }]>;
 
+// Integer absolute, matching the canonical form generated by DAGCombiner.
+def z_iabs32 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 31))),
+                            (sra node:$src, (i32 31)))>;
+def z_iabs64 : PatFrag<(ops node:$src),
+                       (xor (add node:$src, (sra node:$src, (i32 63))),
+                            (sra node:$src, (i32 63)))>;
+def z_inegabs32 : PatFrag<(ops node:$src), (ineg (z_iabs32 node:$src))>;
+def z_inegabs64 : PatFrag<(ops node:$src), (ineg (z_iabs64 node:$src))>;
+
 // Fused multiply-add and multiply-subtract, but with the order of the
 // operands matching SystemZ's MA and MS instructions.
 def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -186,11 +368,11 @@ def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
 
 // Create a unary operator that loads from memory and then performs
 // the given operation on it.
-class loadu<SDPatternOperator operator>
+class loadu<SDPatternOperator operator, SDPatternOperator load = load>
   : PatFrag<(ops node:$addr), (operator (load node:$addr))>;
 
 // Create a store operator that performs the given unary operation
 // on the value before storing it.
-class storeu<SDPatternOperator operator>
+class storeu<SDPatternOperator operator, SDPatternOperator store = store>
   : PatFrag<(ops node:$value, node:$addr),
             (store (operator node:$value), node:$addr)>;
diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td
index 3689f74..7706351 100644
--- a/lib/Target/SystemZ/SystemZPatterns.td
+++ b/lib/Target/SystemZ/SystemZPatterns.td
@@ -13,7 +13,7 @@ multiclass SXU<SDPatternOperator operator, Instruction insn> {
   def : Pat<(operator (sext (i32 GR32:$src))),
             (insn GR32:$src)>;
   def : Pat<(operator (sext_inreg GR64:$src, i32)),
-            (insn (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+            (insn (EXTRACT_SUBREG GR64:$src, subreg_l32))>;
 }
 
 // Record that INSN performs a 64-bit version of binary operator OPERATOR
@@ -24,7 +24,7 @@ multiclass SXB<SDPatternOperator operator, RegisterOperand cls,
   def : Pat<(operator cls:$src1, (sext GR32:$src2)),
             (insn cls:$src1, GR32:$src2)>;
   def : Pat<(operator cls:$src1, (sext_inreg GR64:$src2, i32)),
-            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
 }
 
 // Like SXB, but for zero extension.
@@ -33,7 +33,7 @@ multiclass ZXB<SDPatternOperator operator, RegisterOperand cls,
   def : Pat<(operator cls:$src1, (zext GR32:$src2)),
             (insn cls:$src1, GR32:$src2)>;
   def : Pat<(operator cls:$src1, (and GR64:$src2, 0xffffffff)),
-            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_32bit))>;
+            (insn cls:$src1, (EXTRACT_SUBREG GR64:$src2, subreg_l32))>;
 }
 
 // Record that INSN performs a binary read-modify-write operation,
@@ -50,12 +50,8 @@ class RMWI<SDPatternOperator load, SDPatternOperator operator,
 // memory location.  IMM is the type of the second operand.
 multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
                     Instruction insn> {
-  def : RMWI<zextloadi8, operator, truncstorei8, mode, imm32, insn>;
-  def : RMWI<zextloadi8, operator, truncstorei8, mode, imm64, insn>;
-  def : RMWI<sextloadi8, operator, truncstorei8, mode, imm32, insn>;
-  def : RMWI<sextloadi8, operator, truncstorei8, mode, imm64, insn>;
-  def : RMWI<extloadi8, operator, truncstorei8, mode, imm32, insn>;
-  def : RMWI<extloadi8, operator, truncstorei8, mode, imm64, insn>;
+  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm32, insn>;
+  def : RMWI<anyextloadi8, operator, truncstorei8, mode, imm64, insn>;
 }
 
 // Record that INSN performs insertion TYPE into a register of class CLS.
@@ -69,3 +65,88 @@ multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
               (load mode:$src2), cls:$src1),
             (insn cls:$src1, mode:$src2)>;
 }
+
+// INSN stores the low 32 bits of a GPR to a memory with addressing mode MODE.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64<Instruction insn, SDPatternOperator operator,
+                AddressingMode mode>
+  : Pat<(operator GR64:$R1, mode:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), mode:$XBD2)>;
+
+// INSN and INSNY are an RX/RXY pair of instructions that store the low
+// 32 bits of a GPR to memory.  Record that they are equivalent to using
+// OPERATOR to store a GR64.
+multiclass StoreGR64Pair<Instruction insn, Instruction insny,
+                         SDPatternOperator operator> {
+  def : StoreGR64<insn, operator, bdxaddr12pair>;
+  def : StoreGR64<insny, operator, bdxaddr20pair>;
+}
+
+// INSN stores the low 32 bits of a GPR using PC-relative addressing.
+// Record that it is equivalent to using OPERATOR to store a GR64.
+class StoreGR64PC<Instruction insn, SDPatternOperator operator>
+  : Pat<(operator GR64:$R1, pcrel32:$XBD2),
+        (insn (EXTRACT_SUBREG GR64:$R1, subreg_l32), pcrel32:$XBD2)> {
+  // We want PC-relative addresses to be tried ahead of BD and BDX addresses.
+  // However, BDXs have two extra operands and are therefore 6 units more
+  // complex.
+  let AddedComplexity = 7;
+}
+
+// INSN and INSNINV conditionally store the low 32 bits of a GPR to memory,
+// with INSN storing when the condition is true and INSNINV storing when the
+// condition is false.  Record that they are equivalent to a LOAD/select/STORE
+// sequence for GR64s.
+multiclass CondStores64<Instruction insn, Instruction insninv,
+                        SDPatternOperator store, SDPatternOperator load,
+                        AddressingMode mode> {
+  def : Pat<(store (z_select_ccmask GR64:$new, (load mode:$addr),
+                                    uimm8zx4:$valid, uimm8zx4:$cc),
+                   mode:$addr),
+            (insn (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                  uimm8zx4:$valid, uimm8zx4:$cc)>;
+  def : Pat<(store (z_select_ccmask (load mode:$addr), GR64:$new,
+                                    uimm8zx4:$valid, uimm8zx4:$cc),
+                   mode:$addr),
+            (insninv (EXTRACT_SUBREG GR64:$new, subreg_l32), mode:$addr,
+                     uimm8zx4:$valid, uimm8zx4:$cc)>;
+}
+
+// Try to use MVC instruction INSN for a load of type LOAD followed by a store
+// of the same size.  VT is the type of the intermediate (legalized) value and
+// LENGTH is the number of bytes loaded by LOAD.
+multiclass MVCLoadStore<SDPatternOperator load, ValueType vt, Instruction insn,
+                        bits<5> length> {
+  def : Pat<(mvc_store (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+            (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// Use NC-like instruction INSN for block_op operation OPERATOR.
+// The other operand is a load of type LOAD, which accesses LENGTH bytes.
+// VT is the intermediate legalized type in which the binary operation
+// is actually done.
+multiclass BinaryLoadStore<SDPatternOperator operator, SDPatternOperator load,
+                           ValueType vt, Instruction insn, bits<5> length> {
+  def : Pat<(operator (vt (load bdaddr12only:$src)), bdaddr12only:$dest),
+            (insn bdaddr12only:$dest, bdaddr12only:$src, length)>;
+}
+
+// A convenient way of generating all block peepholes for a particular
+// LOAD/VT/LENGTH combination.
+multiclass BlockLoadStore<SDPatternOperator load, ValueType vt,
+                          Instruction mvc, Instruction nc, Instruction oc,
+                          Instruction xc, bits<5> length> {
+  defm : MVCLoadStore<load, vt, mvc, length>;
+  defm : BinaryLoadStore<block_and1, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_and2, load, vt, nc, length>;
+  defm : BinaryLoadStore<block_or1,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_or2,  load, vt, oc, length>;
+  defm : BinaryLoadStore<block_xor1, load, vt, xc, length>;
+  defm : BinaryLoadStore<block_xor2, load, vt, xc, length>;
+}
+
+// Record that INSN is a LOAD AND TEST that can be used to compare
+// registers in CLS against zero.  The instruction has separate R1 and R2
+// operands, but they must be the same when the instruction is used like this.
+class CompareZeroFP<Instruction insn, RegisterOperand cls>
+  : Pat<(z_fcmp cls:$reg, (fpimm0)), (insn cls:$reg, cls:$reg)>;
diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td
new file mode 100644
index 0000000..f241fb0
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZProcessors.td
@@ -0,0 +1,46 @@
+//===-- SystemZ.td - SystemZ processors and features ---------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Processor and feature definitions.
+//
+//===----------------------------------------------------------------------===//
+
+class SystemZFeature<string extname, string intname, string desc>
+  : Predicate<"Subtarget.has"##intname##"()">,
+    AssemblerPredicate<"Feature"##intname, extname>,
+    SubtargetFeature<extname, "Has"##intname, "true", desc>;
+
+def FeatureDistinctOps : SystemZFeature<
+  "distinct-ops", "DistinctOps",
+  "Assume that the distinct-operands facility is installed"
+>;
+
+def FeatureLoadStoreOnCond : SystemZFeature<
+  "load-store-on-cond", "LoadStoreOnCond",
+  "Assume that the load/store-on-condition facility is installed"
+>;
+
+def FeatureHighWord : SystemZFeature<
+  "high-word", "HighWord",
+  "Assume that the high-word facility is installed"
+>;
+
+def FeatureFPExtension : SystemZFeature<
+  "fp-extension", "FPExtension",
+  "Assume that the floating-point extension facility is installed"
+>;
+
+def : Processor<"generic", NoItineraries, []>;
+def : Processor<"z10", NoItineraries, []>;
+def : Processor<"z196", NoItineraries,
+                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+                 FeatureFPExtension]>;
+def : Processor<"zEC12", NoItineraries,
+                [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
+                 FeatureFPExtension]>;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index a0ae7ed..b61ae88 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -17,9 +17,8 @@
 
 using namespace llvm;
 
-SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm,
-                                         const SystemZInstrInfo &tii)
-  : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm), TII(tii) {}
+SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm)
+  : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm) {}
 
 const uint16_t*
 SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
@@ -43,41 +42,19 @@ SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   if (TFI->hasFP(MF)) {
     // R11D is the frame pointer.  Reserve all aliases.
     Reserved.set(SystemZ::R11D);
-    Reserved.set(SystemZ::R11W);
+    Reserved.set(SystemZ::R11L);
+    Reserved.set(SystemZ::R11H);
     Reserved.set(SystemZ::R10Q);
   }
 
   // R15D is the stack pointer.  Reserve all aliases.
   Reserved.set(SystemZ::R15D);
-  Reserved.set(SystemZ::R15W);
+  Reserved.set(SystemZ::R15L);
+  Reserved.set(SystemZ::R15H);
   Reserved.set(SystemZ::R14Q);
   return Reserved;
 }
 
-bool
-SystemZRegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
-					   MachineBasicBlock::iterator SaveMBBI,
-					   MachineBasicBlock::iterator &UseMBBI,
-					   const TargetRegisterClass *RC,
-					   unsigned Reg) const {
-  MachineFunction &MF = *MBB.getParent();
-  const SystemZFrameLowering *TFI =
-    static_cast<const SystemZFrameLowering *>(TM.getFrameLowering());
-  unsigned Base = getFrameRegister(MF);
-  uint64_t Offset = TFI->getEmergencySpillSlotOffset(MF);
-  DebugLoc DL;
-
-  unsigned LoadOpcode, StoreOpcode;
-  TII.getLoadStoreOpcodes(RC, LoadOpcode, StoreOpcode);
-
-  // The offset must always be in range of a 12-bit unsigned displacement.
-  BuildMI(MBB, SaveMBBI, DL, TII.get(StoreOpcode))
-    .addReg(Reg, RegState::Kill).addReg(Base).addImm(Offset).addReg(0);
-  BuildMI(MBB, UseMBBI, DL, TII.get(LoadOpcode), Reg)
-    .addReg(Base).addImm(Offset).addReg(0);
-  return true;
-}
-
 void
 SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                          int SPAdj, unsigned FIOperandNum,
@@ -86,6 +63,8 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
 
   MachineBasicBlock &MBB = *MI->getParent();
   MachineFunction &MF = *MBB.getParent();
+  const SystemZInstrInfo &TII =
+    *static_cast<const SystemZInstrInfo*>(TM.getInstrInfo());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   DebugLoc DL = MI->getDebugLoc();
 
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 91a70de..13f45fa 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -22,10 +22,10 @@ namespace SystemZ {
   // Return the subreg to use for referring to the even and odd registers
   // in a GR128 pair.  Is32Bit says whether we want a GR32 or GR64.
   inline unsigned even128(bool Is32bit) {
-    return Is32bit ? subreg_32bit : subreg_high;
+    return Is32bit ? subreg_hl32 : subreg_h64;
   }
   inline unsigned odd128(bool Is32bit) {
-    return Is32bit ? subreg_low32 : subreg_low;
+    return Is32bit ? subreg_l32 : subreg_l64;
   }
 }
 
@@ -35,10 +35,9 @@ class SystemZInstrInfo;
 struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
 private:
   SystemZTargetMachine &TM;
-  const SystemZInstrInfo &TII;
 
 public:
-  SystemZRegisterInfo(SystemZTargetMachine &tm, const SystemZInstrInfo &tii);
+  SystemZRegisterInfo(SystemZTargetMachine &tm);
 
   // Override TargetRegisterInfo.h.
   virtual bool requiresRegisterScavenging(const MachineFunction &MF) const
@@ -49,15 +48,14 @@ public:
     LLVM_OVERRIDE {
     return true;
   }
+  virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const
+    LLVM_OVERRIDE {
+    return true;
+  }
   virtual const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0)
     const LLVM_OVERRIDE;
   virtual BitVector getReservedRegs(const MachineFunction &MF)
     const LLVM_OVERRIDE;
-  virtual bool saveScavengerRegister(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator SaveMBBI,
-                                     MachineBasicBlock::iterator &UseMBBI,
-                                     const TargetRegisterClass *RC,
-                                     unsigned Reg) const LLVM_OVERRIDE;
   virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI,
                                    int SPAdj, unsigned FIOperandNum,
                                    RegScavenger *RS) const LLVM_OVERRIDE;
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
index bd1b563..93d7c83 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -21,10 +21,12 @@ class SystemZRegWithSubregs<string n, list<Register> subregs>
 }
 
 let Namespace = "SystemZ" in {
-def subreg_32bit  : SubRegIndex; // could also be known as "subreg_high32"
-def subreg_high   : SubRegIndex;
-def subreg_low    : SubRegIndex;
-def subreg_low32  : SubRegIndex<[subreg_low, subreg_32bit]>;
+def subreg_l32   : SubRegIndex<32, 0>;  // Also acts as subreg_ll32.
+def subreg_h32   : SubRegIndex<32, 32>; // Also acts as subreg_lh32.
+def subreg_l64   : SubRegIndex<64, 0>;
+def subreg_h64   : SubRegIndex<64, 64>;
+def subreg_hh32  : ComposedSubRegIndex<subreg_h64, subreg_h32>;
+def subreg_hl32  : ComposedSubRegIndex<subreg_h64, subreg_l32>;
 }
 
 // Define a register class that contains values of type TYPE and an
@@ -54,36 +56,49 @@ class GPR32<bits<16> num, string n> : SystemZReg<n> {
 }
 
 // One of the 16 64-bit general-purpose registers.
-class GPR64<bits<16> num, string n, GPR32 low>
- : SystemZRegWithSubregs<n, [low]> {
+class GPR64<bits<16> num, string n, GPR32 low, GPR32 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_32bit];
+  let SubRegIndices = [subreg_l32, subreg_h32];
 }
 
 // 8 even-odd pairs of GPR64s.
-class GPR128<bits<16> num, string n, GPR64 high, GPR64 low>
- : SystemZRegWithSubregs<n, [high, low]> {
+class GPR128<bits<16> num, string n, GPR64 low, GPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_high, subreg_low];
+  let SubRegIndices = [subreg_l64, subreg_h64];
 }
 
 // General-purpose registers
 foreach I = 0-15 in {
-  def R#I#W : GPR32<I, "r"#I>;
-  def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"W")>, DwarfRegNum<[I]>;
+  def R#I#L : GPR32<I, "r"#I>;
+  def R#I#H : GPR32<I, "r"#I>;
+  def R#I#D : GPR64<I, "r"#I, !cast<GPR32>("R"#I#"L"), !cast<GPR32>("R"#I#"H")>,
+                    DwarfRegNum<[I]>;
 }
 
 foreach I = [0, 2, 4, 6, 8, 10, 12, 14] in {
-  def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#I#"D"),
-                     !cast<GPR64>("R"#!add(I, 1)#"D")>;
+  def R#I#Q : GPR128<I, "r"#I, !cast<GPR64>("R"#!add(I, 1)#"D"),
+                     !cast<GPR64>("R"#I#"D")>;
 }
 
 /// Allocate the callee-saved R6-R13 backwards. That way they can be saved
 /// together with R14 and R15 in one prolog instruction.
-defm GR32 : SystemZRegClass<"GR32", i32, 32, (add (sequence "R%uW",  0, 5),
-                                                  (sequence "R%uW", 15, 6))>;
-defm GR64 : SystemZRegClass<"GR64", i64, 64, (add (sequence "R%uD",  0, 5),
-                                                  (sequence "R%uD", 15, 6))>;
+defm GR32  : SystemZRegClass<"GR32",  i32, 32, (add (sequence "R%uL",  0, 5),
+                                                    (sequence "R%uL", 15, 6))>;
+defm GRH32 : SystemZRegClass<"GRH32", i32, 32, (add (sequence "R%uH",  0, 5),
+                                                    (sequence "R%uH", 15, 6))>;
+defm GR64  : SystemZRegClass<"GR64",  i64, 64, (add (sequence "R%uD",  0, 5),
+                                                    (sequence "R%uD", 15, 6))>;
+
+// Combine the low and high GR32s into a single class.  This can only be
+// used for virtual registers if the high-word facility is available.
+defm GRX32 : SystemZRegClass<"GRX32", i32, 32,
+                             (add (sequence "R%uL",  0, 5),
+                                  (sequence "R%uH",  0, 5),
+                                  R15L, R15H, R14L, R14H, R13L, R13H,
+                                  R12L, R12H, R11L, R11H, R10L, R10H,
+                                  R9L, R9H, R8L, R8H, R7L, R7H, R6L, R6H)>;
 
 // The architecture doesn't really have any i128 support, so model the
 // register pairs as untyped instead.
@@ -93,7 +108,7 @@ defm GR128 : SystemZRegClass<"GR128", untyped, 128, (add R0Q, R2Q, R4Q,
 
 // Base and index registers.  Everything except R0, which in an address
 // context evaluates as 0.
-defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0W)>;
+defm ADDR32 : SystemZRegClass<"ADDR32", i32, 32, (sub GR32Bit, R0L)>;
 defm ADDR64 : SystemZRegClass<"ADDR64", i64, 64, (sub GR64Bit, R0D)>;
 
 // Not used directly, but needs to exist for ADDR32 and ADDR64 subregs
@@ -113,14 +128,14 @@ class FPR32<bits<16> num, string n> : SystemZReg<n> {
 class FPR64<bits<16> num, string n, FPR32 low>
  : SystemZRegWithSubregs<n, [low]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_32bit];
+  let SubRegIndices = [subreg_h32];
 }
 
 // 8 pairs of FPR64s, with a one-register gap inbetween.
-class FPR128<bits<16> num, string n, FPR64 high, FPR64 low>
- : SystemZRegWithSubregs<n, [high, low]> {
+class FPR128<bits<16> num, string n, FPR64 low, FPR64 high>
+ : SystemZRegWithSubregs<n, [low, high]> {
   let HWEncoding = num;
-  let SubRegIndices = [subreg_high, subreg_low];
+  let SubRegIndices = [subreg_l64, subreg_h64];
 }
 
 // Floating-point registers
@@ -131,8 +146,8 @@ foreach I = 0-15 in {
 }
 
 foreach I = [0, 1, 4, 5, 8, 9, 12, 13] in {
-  def F#I#Q  : FPR128<I, "f"#I, !cast<FPR64>("F"#I#"D"),
-                     !cast<FPR64>("F"#!add(I, 2)#"D")>;
+  def F#I#Q  : FPR128<I, "f"#I, !cast<FPR64>("F"#!add(I, 2)#"D"),
+                     !cast<FPR64>("F"#I#"D")>;
 }
 
 // There's no store-multiple instruction for FPRs, so we're not fussy
@@ -146,5 +161,7 @@ defm FP128 : SystemZRegClass<"FP128", f128, 128, (add F0Q, F1Q, F4Q, F5Q,
 // Other registers
 //===----------------------------------------------------------------------===//
 
-// Status register
-def PSW : SystemZReg<"psw">;
+// The 2-bit condition code field of the PSW.  Every register named in an
+// inline asm needs a class associated with it.
+def CC : SystemZReg<"cc">;
+def CCRegs : RegisterClass<"SystemZ", [i32], 32, (add CC)>;
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
new file mode 100644
index 0000000..c7ebb5d
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -0,0 +1,293 @@
+//===-- SystemZSelectionDAGInfo.cpp - SystemZ SelectionDAG Info -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZSelectionDAGInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-selectiondag-info"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+using namespace llvm;
+
+SystemZSelectionDAGInfo::
+SystemZSelectionDAGInfo(const SystemZTargetMachine &TM)
+  : TargetSelectionDAGInfo(TM) {
+}
+
+SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
+}
+
+// Decide whether it is best to use a loop or straight-line code for
+// a block operation of Size bytes with source address Src and destination
+// address Dest.  Sequence is the opcode to use for straight-line code
+// (such as MVC) and Loop is the opcode to use for loops (such as MVC_LOOP).
+// Return the chain for the completed operation.
+static SDValue emitMemMem(SelectionDAG &DAG, SDLoc DL, unsigned Sequence,
+                          unsigned Loop, SDValue Chain, SDValue Dst,
+                          SDValue Src, uint64_t Size) {
+  EVT PtrVT = Src.getValueType();
+  // The heuristic we use is to prefer loops for anything that would
+  // require 7 or more MVCs.  With these kinds of sizes there isn't
+  // much to choose between straight-line code and looping code,
+  // since the time will be dominated by the MVCs themselves.
+  // However, the loop has 4 or 5 instructions (depending on whether
+  // the base addresses can be proved equal), so there doesn't seem
+  // much point using a loop for 5 * 256 bytes or fewer.  Anything in
+  // the range (5 * 256, 6 * 256) will need another instruction after
+  // the loop, so it doesn't seem worth using a loop then either.
+  // The next value up, 6 * 256, can be implemented in the same
+  // number of straight-line MVCs as 6 * 256 - 1.
+  if (Size > 6 * 256)
+    return DAG.getNode(Loop, DL, MVT::Other, Chain, Dst, Src,
+                       DAG.getConstant(Size, PtrVT),
+                       DAG.getConstant(Size / 256, PtrVT));
+  return DAG.getNode(Sequence, DL, MVT::Other, Chain, Dst, Src,
+                     DAG.getConstant(Size, PtrVT));
+}
+
+SDValue SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
+                        bool IsVolatile, bool AlwaysInline,
+                        MachinePointerInfo DstPtrInfo,
+                        MachinePointerInfo SrcPtrInfo) const {
+  if (IsVolatile)
+    return SDValue();
+
+  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size))
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, Dst, Src, CSize->getZExtValue());
+  return SDValue();
+}
+
+// Handle a memset of 1, 2, 4 or 8 bytes with the operands given by
+// Chain, Dst, ByteVal and Size.  These cases are expected to use
+// MVI, MVHHI, MVHI and MVGHI respectively.
+static SDValue memsetStore(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                           SDValue Dst, uint64_t ByteVal, uint64_t Size,
+                           unsigned Align,
+                           MachinePointerInfo DstPtrInfo) {
+  uint64_t StoreVal = ByteVal;
+  for (unsigned I = 1; I < Size; ++I)
+    StoreVal |= ByteVal << (I * 8);
+  return DAG.getStore(Chain, DL,
+                      DAG.getConstant(StoreVal, MVT::getIntegerVT(Size * 8)),
+                      Dst, DstPtrInfo, false, false, Align);
+}
+
+SDValue SystemZSelectionDAGInfo::
+EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Dst, SDValue Byte, SDValue Size,
+                        unsigned Align, bool IsVolatile,
+                        MachinePointerInfo DstPtrInfo) const {
+  EVT PtrVT = Dst.getValueType();
+
+  if (IsVolatile)
+    return SDValue();
+
+  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    if (Bytes == 0)
+      return SDValue();
+    if (ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte)) {
+      // Handle cases that can be done using at most two of
+      // MVI, MVHI, MVHHI and MVGHI.  The latter two can only be
+      // used if ByteVal is all zeros or all ones; in other casees,
+      // we can move at most 2 halfwords.
+      uint64_t ByteVal = CByte->getZExtValue();
+      if (ByteVal == 0 || ByteVal == 255 ?
+          Bytes <= 16 && CountPopulation_64(Bytes) <= 2 :
+          Bytes <= 4) {
+        unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
+        unsigned Size2 = Bytes - Size1;
+        SDValue Chain1 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size1,
+                                     Align, DstPtrInfo);
+        if (Size2 == 0)
+          return Chain1;
+        Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                          DAG.getConstant(Size1, PtrVT));
+        DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
+        SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
+                                     std::min(Align, Size1), DstPtrInfo);
+        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+      }
+    } else {
+      // Handle one and two bytes using STC.
+      if (Bytes <= 2) {
+        SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+                                      false, false, Align);
+        if (Bytes == 1)
+          return Chain1;
+        SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
+        SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
+                                      DstPtrInfo.getWithOffset(1),
+                                      false, false, 1);
+        return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
+      }
+    }
+    assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
+
+    // Handle the special case of a memset of 0, which can use XC.
+    ConstantSDNode *CByte = dyn_cast<ConstantSDNode>(Byte);
+    if (CByte && CByte->getZExtValue() == 0)
+      return emitMemMem(DAG, DL, SystemZISD::XC, SystemZISD::XC_LOOP,
+                        Chain, Dst, Dst, Bytes);
+
+    // Copy the byte to the first location and then use MVC to copy
+    // it to the rest.
+    Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+                         false, false, Align);
+    SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+                                   DAG.getConstant(1, PtrVT));
+    return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
+                      Chain, DstPlus1, Dst, Bytes - 1);
+  }
+  return SDValue();
+}
+
+// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
+// deciding whether to use a loop or straight-line code.
+static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                       SDValue Src1, SDValue Src2, uint64_t Size) {
+  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+  EVT PtrVT = Src1.getValueType();
+  // A two-CLC sequence is a clear win over a loop, not least because it
+  // needs only one branch.  A three-CLC sequence needs the same number
+  // of branches as a loop (i.e. 2), but is shorter.  That brings us to
+  // lengths greater than 768 bytes.  It seems relatively likely that
+  // a difference will be found within the first 768 bytes, so we just
+  // optimize for the smallest number of branch instructions, in order
+  // to avoid polluting the prediction buffer too much.  A loop only ever
+  // needs 2 branches, whereas a straight-line sequence would need 3 or more.
+  if (Size > 3 * 256)
+    return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
+                       DAG.getConstant(Size, PtrVT),
+                       DAG.getConstant(Size / 256, PtrVT));
+  return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
+                     DAG.getConstant(Size, PtrVT));
+}
+
+// Convert the current CC value into an integer that is 0 if CC == 0,
+// less than zero if CC == 1 and greater than zero if CC >= 2.
+// The sequence starts with IPM, which puts CC into bits 29 and 28
+// of an integer and clears bits 30 and 31.
+static SDValue addIPMSequence(SDLoc DL, SDValue Glue, SelectionDAG &DAG) {
+  SDValue IPM = DAG.getNode(SystemZISD::IPM, DL, MVT::i32, Glue);
+  SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i32, IPM,
+                            DAG.getConstant(SystemZ::IPM_CC, MVT::i32));
+  SDValue ROTL = DAG.getNode(ISD::ROTL, DL, MVT::i32, SRL,
+                             DAG.getConstant(31, MVT::i32));
+  return ROTL;
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src1, SDValue Src2, SDValue Size,
+                        MachinePointerInfo Op1PtrInfo,
+                        MachinePointerInfo Op2PtrInfo) const {
+  if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
+    uint64_t Bytes = CSize->getZExtValue();
+    assert(Bytes > 0 && "Caller should have handled 0-size case");
+    Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
+    SDValue Glue = Chain.getValue(1);
+    return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+  }
+  return std::make_pair(SDValue(), SDValue());
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src, SDValue Char, SDValue Length,
+                        MachinePointerInfo SrcPtrInfo) const {
+  // Use SRST to find the character.  End is its address on success.
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  Length = DAG.getZExtOrTrunc(Length, DL, PtrVT);
+  Char = DAG.getZExtOrTrunc(Char, DL, MVT::i32);
+  Char = DAG.getNode(ISD::AND, DL, MVT::i32, Char,
+                     DAG.getConstant(255, MVT::i32));
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, Length);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, Char);
+  Chain = End.getValue(1);
+  SDValue Glue = End.getValue(2);
+
+  // Now select between End and null, depending on whether the character
+  // was found.
+  SmallVector<SDValue, 5> Ops;
+  Ops.push_back(End);
+  Ops.push_back(DAG.getConstant(0, PtrVT));
+  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST, MVT::i32));
+  Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32));
+  Ops.push_back(Glue);
+  VTs = DAG.getVTList(PtrVT, MVT::Glue);
+  End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size());
+  return std::make_pair(End, Chain);
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Dest, SDValue Src,
+                        MachinePointerInfo DestPtrInfo,
+                        MachinePointerInfo SrcPtrInfo, bool isStpcpy) const {
+  SDVTList VTs = DAG.getVTList(Dest.getValueType(), MVT::Other);
+  SDValue EndDest = DAG.getNode(SystemZISD::STPCPY, DL, VTs, Chain, Dest, Src,
+                                DAG.getConstant(0, MVT::i32));
+  return std::make_pair(isStpcpy ? EndDest : Dest, EndDest.getValue(1));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src1, SDValue Src2,
+                        MachinePointerInfo Op1PtrInfo,
+                        MachinePointerInfo Op2PtrInfo) const {
+  SDVTList VTs = DAG.getVTList(Src1.getValueType(), MVT::Other, MVT::Glue);
+  SDValue Unused = DAG.getNode(SystemZISD::STRCMP, DL, VTs, Chain, Src1, Src2,
+                               DAG.getConstant(0, MVT::i32));
+  Chain = Unused.getValue(1);
+  SDValue Glue = Chain.getValue(2);
+  return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
+}
+
+// Search from Src for a null character, stopping once Src reaches Limit.
+// Return a pair of values, the first being the number of nonnull characters
+// and the second being the out chain.
+//
+// This can be used for strlen by setting Limit to 0.
+static std::pair<SDValue, SDValue> getBoundedStrlen(SelectionDAG &DAG, SDLoc DL,
+                                                    SDValue Chain, SDValue Src,
+                                                    SDValue Limit) {
+  EVT PtrVT = Src.getValueType();
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other, MVT::Glue);
+  SDValue End = DAG.getNode(SystemZISD::SEARCH_STRING, DL, VTs, Chain,
+                            Limit, Src, DAG.getConstant(0, MVT::i32));
+  Chain = End.getValue(1);
+  SDValue Len = DAG.getNode(ISD::SUB, DL, PtrVT, End, Src);
+  return std::make_pair(Len, Chain);
+}    
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                        SDValue Src, MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  return getBoundedStrlen(DAG, DL, Chain, Src, DAG.getConstant(0, PtrVT));
+}
+
+std::pair<SDValue, SDValue> SystemZSelectionDAGInfo::
+EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                         SDValue Src, SDValue MaxLength,
+                         MachinePointerInfo SrcPtrInfo) const {
+  EVT PtrVT = Src.getValueType();
+  MaxLength = DAG.getZExtOrTrunc(MaxLength, DL, PtrVT);
+  SDValue Limit = DAG.getNode(ISD::ADD, DL, PtrVT, Src, MaxLength);
+  return getBoundedStrlen(DAG, DL, Chain, Src, Limit);
+}
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
new file mode 100644
index 0000000..281d1e2
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -0,0 +1,80 @@
+//===-- SystemZSelectionDAGInfo.h - SystemZ SelectionDAG Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the SystemZ subclass for TargetSelectionDAGInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZSELECTIONDAGINFO_H
+#define SYSTEMZSELECTIONDAGINFO_H
+
+#include "llvm/Target/TargetSelectionDAGInfo.h"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+class SystemZSelectionDAGInfo : public TargetSelectionDAGInfo {
+public:
+  explicit SystemZSelectionDAGInfo(const SystemZTargetMachine &TM);
+  ~SystemZSelectionDAGInfo();
+
+  virtual
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                                  SDValue Dst, SDValue Src,
+                                  SDValue Size, unsigned Align,
+                                  bool IsVolatile, bool AlwaysInline,
+                                  MachinePointerInfo DstPtrInfo,
+                                  MachinePointerInfo SrcPtrInfo) const
+    LLVM_OVERRIDE;
+
+  virtual SDValue
+  EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL,
+                          SDValue Chain, SDValue Dst, SDValue Byte,
+                          SDValue Size, unsigned Align, bool IsVolatile,
+                          MachinePointerInfo DstPtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2, SDValue Size,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src, SDValue Char, SDValue Length,
+                          MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Dest, SDValue Src,
+                          MachinePointerInfo DestPtrInfo,
+                          MachinePointerInfo SrcPtrInfo,
+                          bool isStpcpy) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src1, SDValue Src2,
+                          MachinePointerInfo Op1PtrInfo,
+                          MachinePointerInfo Op2PtrInfo) const LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                          SDValue Src, MachinePointerInfo SrcPtrInfo) const
+    LLVM_OVERRIDE;
+
+  virtual std::pair<SDValue, SDValue>
+  EmitTargetCodeForStrnlen(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+                           SDValue Src, SDValue MaxLength,
+                           MachinePointerInfo SrcPtrInfo) const LLVM_OVERRIDE;
+};
+
+}
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
new file mode 100644
index 0000000..537a545
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -0,0 +1,163 @@
+//===-- SystemZShortenInst.cpp - Instruction-shortening pass --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to replace instructions with shorter forms.  For example,
+// IILF can be replaced with LLILL or LLILH if the constant fits and if the
+// other 32 bits of the GR64 destination are not live.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-shorten-inst"
+
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+using namespace llvm;
+
+namespace {
+  class SystemZShortenInst : public MachineFunctionPass {
+  public:
+    static char ID;
+    SystemZShortenInst(const SystemZTargetMachine &tm);
+
+    virtual const char *getPassName() const {
+      return "SystemZ Instruction Shortening";
+    }
+
+    bool processBlock(MachineBasicBlock *MBB);
+    bool runOnMachineFunction(MachineFunction &F);
+
+  private:
+    bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
+                    unsigned LLIxL, unsigned LLIxH);
+
+    const SystemZInstrInfo *TII;
+
+    // LowGPRs[I] has bit N set if LLVM register I includes the low
+    // word of GPR N.  HighGPRs is the same for the high word.
+    unsigned LowGPRs[SystemZ::NUM_TARGET_REGS];
+    unsigned HighGPRs[SystemZ::NUM_TARGET_REGS];
+  };
+
+  char SystemZShortenInst::ID = 0;
+} // end of anonymous namespace
+
+FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
+  return new SystemZShortenInst(TM);
+}
+
+SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
+  : MachineFunctionPass(ID), TII(0), LowGPRs(), HighGPRs() {
+  // Set up LowGPRs and HighGPRs.
+  for (unsigned I = 0; I < 16; ++I) {
+    LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I;
+    LowGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+    HighGPRs[SystemZMC::GRH32Regs[I]] |= 1 << I;
+    HighGPRs[SystemZMC::GR64Regs[I]] |= 1 << I;
+    if (unsigned GR128 = SystemZMC::GR128Regs[I]) {
+      LowGPRs[GR128] |= 3 << I;
+      HighGPRs[GR128] |= 3 << I;
+    }
+  }
+}
+
+// MI loads one word of a GPR using an IIxF instruction and LLIxL and LLIxH
+// are the halfword immediate loads for the same word.  Try to use one of them
+// instead of IIxF.  If MI loads the high word, GPRMap[X] is the set of high
+// words referenced by LLVM register X while LiveOther is the mask of low
+// words that are currently live, and vice versa.
+bool SystemZShortenInst::shortenIIF(MachineInstr &MI, unsigned *GPRMap,
+                                    unsigned LiveOther, unsigned LLIxL,
+                                    unsigned LLIxH) {
+  unsigned Reg = MI.getOperand(0).getReg();
+  assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+  unsigned GPRs = GPRMap[Reg];
+  assert(GPRs != 0 && "Register must be a GPR");
+  if (GPRs & LiveOther)
+    return false;
+
+  uint64_t Imm = MI.getOperand(1).getImm();
+  if (SystemZ::isImmLL(Imm)) {
+    MI.setDesc(TII->get(LLIxL));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    return true;
+  }
+  if (SystemZ::isImmLH(Imm)) {
+    MI.setDesc(TII->get(LLIxH));
+    MI.getOperand(0).setReg(SystemZMC::getRegAsGR64(Reg));
+    MI.getOperand(1).setImm(Imm >> 16);
+    return true;
+  }
+  return false;
+}
+
+// Process all instructions in MBB.  Return true if something changed.
+bool SystemZShortenInst::processBlock(MachineBasicBlock *MBB) {
+  bool Changed = false;
+
+  // Work out which words are live on exit from the block.
+  unsigned LiveLow = 0;
+  unsigned LiveHigh = 0;
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+         SE = MBB->succ_end(); SI != SE; ++SI) {
+    for (MachineBasicBlock::livein_iterator LI = (*SI)->livein_begin(),
+           LE = (*SI)->livein_end(); LI != LE; ++LI) {
+      unsigned Reg = *LI;
+      assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+      LiveLow |= LowGPRs[Reg];
+      LiveHigh |= HighGPRs[Reg];
+    }
+  }
+
+  // Iterate backwards through the block looking for instructions to change.
+  for (MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin(),
+         MBBE = MBB->rend(); MBBI != MBBE; ++MBBI) {
+    MachineInstr &MI = *MBBI;
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == SystemZ::IILF)
+      Changed |= shortenIIF(MI, LowGPRs, LiveHigh, SystemZ::LLILL,
+                            SystemZ::LLILH);
+    else if (Opcode == SystemZ::IIHF)
+      Changed |= shortenIIF(MI, HighGPRs, LiveLow, SystemZ::LLIHL,
+                            SystemZ::LLIHH);
+    unsigned UsedLow = 0;
+    unsigned UsedHigh = 0;
+    for (MachineInstr::mop_iterator MOI = MI.operands_begin(),
+           MOE = MI.operands_end(); MOI != MOE; ++MOI) {
+      MachineOperand &MO = *MOI;
+      if (MO.isReg()) {
+        if (unsigned Reg = MO.getReg()) {
+          assert(Reg < SystemZ::NUM_TARGET_REGS && "Invalid register number");
+          if (MO.isDef()) {
+            LiveLow &= ~LowGPRs[Reg];
+            LiveHigh &= ~HighGPRs[Reg];
+          } else if (!MO.isUndef()) {
+            UsedLow |= LowGPRs[Reg];
+            UsedHigh |= HighGPRs[Reg];
+          }
+        }
+      }
+    }
+    LiveLow |= UsedLow;
+    LiveHigh |= UsedHigh;
+  }
+
+  return Changed;
+}
+
+bool SystemZShortenInst::runOnMachineFunction(MachineFunction &F) {
+  TII = static_cast<const SystemZInstrInfo *>(F.getTarget().getInstrInfo());
+
+  bool Changed = false;
+  for (MachineFunction::iterator MFI = F.begin(), MFE = F.end();
+       MFI != MFE; ++MFI)
+    Changed |= processBlock(MFI);
+
+  return Changed;
+}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index cfd3324..3971d5e 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -9,6 +9,8 @@
 
 #include "SystemZSubtarget.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Host.h"
+#include "MCTargetDesc/SystemZMCTargetDesc.h"
 
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
@@ -16,13 +18,22 @@
 
 using namespace llvm;
 
+// Pin the vtabel to this file.
+void SystemZSubtarget::anchor() {}
+
 SystemZSubtarget::SystemZSubtarget(const std::string &TT,
                                    const std::string &CPU,
                                    const std::string &FS)
-  : SystemZGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT) {
+  : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
+    HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
+    TargetTriple(TT) {
   std::string CPUName = CPU;
   if (CPUName.empty())
-    CPUName = "z10";
+    CPUName = "generic";
+#if defined(__linux__) && defined(__s390x__)
+  if (CPUName == "generic")
+    CPUName = sys::getHostCPUName();
+#endif
 
   // Parse features string.
   ParseSubtargetFeatures(CPUName, FS);
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 8d4d450..5817491 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -26,6 +26,13 @@ class GlobalValue;
 class StringRef;
 
 class SystemZSubtarget : public SystemZGenSubtargetInfo {
+  virtual void anchor();
+protected:
+  bool HasDistinctOps;
+  bool HasLoadStoreOnCond;
+  bool HasHighWord;
+  bool HasFPExtension;
+
 private:
   Triple TargetTriple;
 
@@ -33,9 +40,24 @@ public:
   SystemZSubtarget(const std::string &TT, const std::string &CPU,
                    const std::string &FS);
 
+  // This is important for reducing register pressure in vector code.
+  virtual bool useAA() const LLVM_OVERRIDE { return true; }
+
   // Automatically generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
+  // Return true if the target has the distinct-operands facility.
+  bool hasDistinctOps() const { return HasDistinctOps; }
+
+  // Return true if the target has the load/store-on-condition facility.
+  bool hasLoadStoreOnCond() const { return HasLoadStoreOnCond; }
+
+  // Return true if the target has the high-word facility.
+  bool hasHighWord() const { return HasHighWord; }
+
+  // Return true if the target has the floating-point extension facility.
+  bool hasFPExtension() const { return HasFPExtension; }
+
   // Return true if GV can be accessed using LARL for reloc model RM
   // and code model CM.
   bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 8c4c456..dee92e9 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -10,6 +10,7 @@
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
 
 using namespace llvm;
 
@@ -33,6 +34,7 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, StringRef TT,
        "-f32:32-f64:64-f128:64-a0:8:16-n32:64"),
     InstrInfo(*this), TLInfo(*this), TSInfo(*this),
     FrameLowering(*this, Subtarget) {
+  initAsmInfo();
 }
 
 namespace {
@@ -46,15 +48,61 @@ public:
     return getTM<SystemZTargetMachine>();
   }
 
-  virtual bool addInstSelector();
+  virtual void addIRPasses() LLVM_OVERRIDE;
+  virtual bool addInstSelector() LLVM_OVERRIDE;
+  virtual bool addPreSched2() LLVM_OVERRIDE;
+  virtual bool addPreEmitPass() LLVM_OVERRIDE;
 };
 } // end anonymous namespace
 
+void SystemZPassConfig::addIRPasses() {
+  TargetPassConfig::addIRPasses();
+  addPass(createPartiallyInlineLibCallsPass());
+}
+
 bool SystemZPassConfig::addInstSelector() {
   addPass(createSystemZISelDag(getSystemZTargetMachine(), getOptLevel()));
   return false;
 }
 
+bool SystemZPassConfig::addPreSched2() {
+  if (getSystemZTargetMachine().getSubtargetImpl()->hasLoadStoreOnCond())
+    addPass(&IfConverterID);
+  return true;
+}
+
+bool SystemZPassConfig::addPreEmitPass() {
+  // We eliminate comparisons here rather than earlier because some
+  // transformations can change the set of available CC values and we
+  // generally want those transformations to have priority.  This is
+  // especially true in the commonest case where the result of the comparison
+  // is used by a single in-range branch instruction, since we will then
+  // be able to fuse the compare and the branch instead.
+  //
+  // For example, two-address NILF can sometimes be converted into
+  // three-address RISBLG.  NILF produces a CC value that indicates whether
+  // the low word is zero, but RISBLG does not modify CC at all.  On the
+  // other hand, 64-bit ANDs like NILL can sometimes be converted to RISBG.
+  // The CC value produced by NILL isn't useful for our purposes, but the
+  // value produced by RISBG can be used for any comparison with zero
+  // (not just equality).  So there are some transformations that lose
+  // CC values (while still being worthwhile) and others that happen to make
+  // the CC result more useful than it was originally.
+  //
+  // Another reason is that we only want to use BRANCH ON COUNT in cases
+  // where we know that the count register is not going to be spilled.
+  //
+  // Doing it so late makes it more likely that a register will be reused
+  // between the comparison and the branch, but it isn't clear whether
+  // preventing that would be a win or not.
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZElimComparePass(getSystemZTargetMachine()));
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createSystemZShortenInstPass(getSystemZTargetMachine()));
+  addPass(createSystemZLongBranchPass(getSystemZTargetMachine()));
+  return true;
+}
+
 TargetPassConfig *SystemZTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new SystemZPassConfig(this, PM);
 }
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
index 98614e7..a99a98e 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -20,10 +20,10 @@
 #include "SystemZInstrInfo.h"
 #include "SystemZRegisterInfo.h"
 #include "SystemZSubtarget.h"
+#include "SystemZSelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSelectionDAGInfo.h"
 
 namespace llvm {
 
@@ -32,7 +32,7 @@ class SystemZTargetMachine : public LLVMTargetMachine {
   const DataLayout        DL;
   SystemZInstrInfo        InstrInfo;
   SystemZTargetLowering   TLInfo;
-  TargetSelectionDAGInfo  TSInfo;
+  SystemZSelectionDAGInfo TSInfo;
   SystemZFrameLowering    FrameLowering;
 
 public:
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index 3d92f29..2190198 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -88,6 +88,14 @@ LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
   return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
 }
 
+LLVMTypeRef LLVMIntPtrTypeInContext(LLVMContextRef C, LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C)));
+}
+
+LLVMTypeRef LLVMIntPtrTypeForASInContext(LLVMContextRef C, LLVMTargetDataRef TD, unsigned AS) {
+  return wrap(unwrap(TD)->getIntPtrType(*unwrap(C), AS));
+}
+
 unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
   return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
 }
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index ee88ce7..3e68fe1 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -27,7 +27,9 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "_IO_getc",
     "_IO_putc",
     "_ZdaPv",
+    "_ZdaPvRKSt9nothrow_t",
     "_ZdlPv",
+    "_ZdlPvRKSt9nothrow_t",
     "_Znaj",
     "_ZnajRKSt9nothrow_t",
     "_Znam",
@@ -36,6 +38,8 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "_ZnwjRKSt9nothrow_t",
     "_Znwm",
     "_ZnwmRKSt9nothrow_t",
+    "__cospi",
+    "__cospif",
     "__cxa_atexit",
     "__cxa_guard_abort",
     "__cxa_guard_acquire",
@@ -43,6 +47,13 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "__isoc99_scanf",
     "__isoc99_sscanf",
     "__memcpy_chk",
+    "__sincospi_stret",
+    "__sincospi_stretf",
+    "__sinpi",
+    "__sinpif",
+    "__sqrt_finite",
+    "__sqrtf_finite",
+    "__sqrtl_finite",
     "__strdup",
     "__strndup",
     "__strtok_r",
@@ -165,6 +176,7 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "getlogin_r",
     "getpwnam",
     "gets",
+    "gettimeofday",
     "htonl",
     "htons",
     "iprintf",
@@ -325,6 +337,24 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
     "write"
   };
 
+static bool hasSinCosPiStret(const Triple &T) {
+  // Only Darwin variants have _stret versions of combined trig functions.
+  if (!T.isMacOSX() && T.getOS() != Triple::IOS)
+    return false;
+
+  // The ABI is rather complicated on x86, so don't do anything special there.
+  if (T.getArch() == Triple::x86)
+    return false;
+
+  if (T.isMacOSX() && T.isMacOSXVersionLT(10, 9))
+    return false;
+
+  if (T.getOS() == Triple::IOS && T.isOSVersionLT(7, 0))
+    return false;
+
+  return true;
+}
+
 /// initialize - Initialize the set of available library functions based on the
 /// specified target triple.  This should be carefully written so that a missing
 /// target triple gets a sane set of defaults.
@@ -344,13 +374,22 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
   if (T.isMacOSX()) {
     if (T.isMacOSXVersionLT(10, 5))
       TLI.setUnavailable(LibFunc::memset_pattern16);
-  } else if (T.getOS() == Triple::IOS) {
+  } else if (T.isiOS()) {
     if (T.isOSVersionLT(3, 0))
       TLI.setUnavailable(LibFunc::memset_pattern16);
   } else {
     TLI.setUnavailable(LibFunc::memset_pattern16);
   }
 
+  if (!hasSinCosPiStret(T)) {
+    TLI.setUnavailable(LibFunc::sinpi);
+    TLI.setUnavailable(LibFunc::sinpif);
+    TLI.setUnavailable(LibFunc::cospi);
+    TLI.setUnavailable(LibFunc::cospif);
+    TLI.setUnavailable(LibFunc::sincospi_stret);
+    TLI.setUnavailable(LibFunc::sincospi_stretf);
+  }
+
   if (T.isMacOSX() && T.getArch() == Triple::x86 &&
       !T.isMacOSXVersionLT(10, 7)) {
     // x86-32 OSX has a scheme where fwrite and fputs (and some other functions
@@ -487,6 +526,7 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc::getitimer);
     TLI.setUnavailable(LibFunc::getlogin_r);
     TLI.setUnavailable(LibFunc::getpwnam);
+    TLI.setUnavailable(LibFunc::gettimeofday);
     TLI.setUnavailable(LibFunc::htonl);
     TLI.setUnavailable(LibFunc::htons);
     TLI.setUnavailable(LibFunc::lchown);
@@ -555,7 +595,7 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
   }
 
   // The following functions are available on at least Linux:
-  if (T.getOS() != Triple::Linux) {
+  if (!T.isOSLinux()) {
     TLI.setUnavailable(LibFunc::dunder_strdup);
     TLI.setUnavailable(LibFunc::dunder_strtok_r);
     TLI.setUnavailable(LibFunc::dunder_isoc99_scanf);
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index f5121e3..7b8d110 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -97,10 +97,20 @@ static bool IsNullTerminatedString(const Constant *C) {
   return false;
 }
 
+/// Return the MCSymbol for the specified global value.  This
+/// symbol is the main label that is the address of the global.
+MCSymbol *TargetLoweringObjectFile::getSymbol(Mangler &M, 
+                                              const GlobalValue *GV) const {
+  SmallString<60> NameStr;
+  M.getNameWithPrefix(NameStr, GV, false);
+  return Ctx->GetOrCreateSymbol(NameStr.str());
+}
+
+
 MCSymbol *TargetLoweringObjectFile::
 getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
                         MachineModuleInfo *MMI) const {
-  return Mang->getSymbol(GV);
+  return getSymbol(*Mang, GV);
 }
 
 void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
@@ -293,7 +303,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
                         MachineModuleInfo *MMI, unsigned Encoding,
                         MCStreamer &Streamer) const {
   const MCSymbolRefExpr *Ref =
-    MCSymbolRefExpr::Create(Mang->getSymbol(GV), getContext());
+    MCSymbolRefExpr::Create(getSymbol(*Mang, GV), getContext());
 
   return getTTypeReference(Ref, Encoding, Streamer);
 }
@@ -317,3 +327,9 @@ getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
   }
   }
 }
+
+const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
+  // FIXME: It's not clear what, if any, default this should have - perhaps a
+  // null return could mean 'no location' & we should just do that here.
+  return MCSymbolRefExpr::Create(Sym, *Ctx);
+}
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index e728251..cb42e83 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -78,7 +78,6 @@ void TargetMachine::resetTargetOptions(const MachineFunction *MF) const {
   } while (0)
 
   RESET_OPTION(NoFramePointerElim, "no-frame-pointer-elim");
-  RESET_OPTION(NoFramePointerElimNonLeaf, "no-frame-pointer-elim-non-leaf");
   RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
   RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
   RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
@@ -165,6 +164,11 @@ CodeGenOpt::Level TargetMachine::getOptLevel() const {
   return CodeGenInfo->getOptLevel();
 }
 
+void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
+  if (CodeGenInfo)
+    CodeGenInfo->setOptLevel(Level);
+}
+
 bool TargetMachine::getAsmVerbosityDefault() {
   return AsmVerbosityDefault;
 }
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index 01d12e8..3d5f827 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Target/TargetMachine.h"
 #include <cassert>
 #include <cstdlib>
@@ -60,13 +61,44 @@ inline LLVMTargetRef wrap(const Target * P) {
 }
 
 LLVMTargetRef LLVMGetFirstTarget() {
-   const Target* target = &*TargetRegistry::begin();
-   return wrap(target);
+  if(TargetRegistry::begin() == TargetRegistry::end()) {
+    return NULL;
+  }
+
+  const Target* target = &*TargetRegistry::begin();
+  return wrap(target);
 }
 LLVMTargetRef LLVMGetNextTarget(LLVMTargetRef T) {
   return wrap(unwrap(T)->getNext());
 }
 
+LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
+  StringRef NameRef = Name;
+  for (TargetRegistry::iterator IT = TargetRegistry::begin(),
+                                IE = TargetRegistry::end(); IT != IE; ++IT) {
+    if (IT->getName() == NameRef)
+      return wrap(&*IT);
+  }
+  
+  return NULL;
+}
+
+LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
+                                 char **ErrorMessage) {
+  std::string Error;
+  
+  *T = wrap(TargetRegistry::lookupTarget(TripleStr, Error));
+  
+  if (!*T) {
+    if (ErrorMessage)
+      *ErrorMessage = strdup(Error.c_str());
+
+    return 1;
+  }
+  
+  return 0;
+}
+
 const char * LLVMGetTargetName(LLVMTargetRef T) {
   return unwrap(T)->getName();
 }
@@ -87,9 +119,10 @@ LLVMBool LLVMTargetHasAsmBackend(LLVMTargetRef T) {
   return unwrap(T)->hasMCAsmBackend();
 }
 
-LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T, char* Triple,
-  char* CPU, char* Features, LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
-  LLVMCodeModel CodeModel) {
+LLVMTargetMachineRef LLVMCreateTargetMachine(LLVMTargetRef T,
+        const char* Triple, const char* CPU, const char* Features,
+        LLVMCodeGenOptLevel Level, LLVMRelocMode Reloc,
+        LLVMCodeModel CodeModel) {
   Reloc::Model RM;
   switch (Reloc){
     case LLVMRelocStatic:
@@ -158,6 +191,11 @@ LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
   return wrap(unwrap(T)->getDataLayout());
 }
 
+void LLVMSetTargetMachineAsmVerbosity(LLVMTargetMachineRef T,
+                                      LLVMBool VerboseAsm) {
+  unwrap(T)->setAsmVerbosityDefault(VerboseAsm);
+}
+
 static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
   formatted_raw_ostream &OS, LLVMCodeGenFileType codegen, char **ErrorMessage) {
   TargetMachine* TM = unwrap(T);
@@ -200,12 +238,12 @@ static LLVMBool LLVMTargetMachineEmit(LLVMTargetMachineRef T, LLVMModuleRef M,
 LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
   char* Filename, LLVMCodeGenFileType codegen, char** ErrorMessage) {
   std::string error;
-  raw_fd_ostream dest(Filename, error, raw_fd_ostream::F_Binary);
-  formatted_raw_ostream destf(dest);
+  raw_fd_ostream dest(Filename, error, sys::fs::F_Binary);
   if (!error.empty()) {
     *ErrorMessage = strdup(error.c_str());
     return true;
   }
+  formatted_raw_ostream destf(dest);
   bool Result = LLVMTargetMachineEmit(T, M, destf, codegen, ErrorMessage);
   dest.flush();
   return Result;
@@ -225,3 +263,7 @@ LLVMBool LLVMTargetMachineEmitToMemoryBuffer(LLVMTargetMachineRef T,
                                                      Data.length(), "");
   return Result;
 }
+
+char *LLVMGetDefaultTargetTriple(void) {
+  return strdup(sys::getDefaultTargetTriple().c_str());
+}
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index af0cef6..10e8db5 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/ADT/SmallVector.h"
 using namespace llvm;
@@ -22,6 +23,21 @@ TargetSubtargetInfo::TargetSubtargetInfo() {}
 
 TargetSubtargetInfo::~TargetSubtargetInfo() {}
 
+// Temporary option to compare overall performance change when moving from the
+// SD scheduler to the MachineScheduler pass pipeline. It should be removed
+// before 3.4. The normal way to enable/disable the MachineScheduling pass
+// itself is by using -enable-misched. For targets that already use MI sched
+// (via MySubTarget::enableMachineScheduler()) -misched-bench=false negates the
+// subtarget hook.
+static cl::opt<bool> BenchMachineSched("misched-bench", cl::Hidden,
+    cl::desc("Migrate from the target's default SD scheduler to MI scheduler"));
+
+bool TargetSubtargetInfo::useMachineScheduler() const {
+  if (BenchMachineSched.getNumOccurrences())
+    return BenchMachineSched;
+  return enableMachineScheduler();
+}
+
 bool TargetSubtargetInfo::enableMachineScheduler() const {
   return false;
 }
@@ -35,3 +51,6 @@ bool TargetSubtargetInfo::enablePostRAScheduler(
   return false;
 }
 
+bool TargetSubtargetInfo::useAA() const {
+  return false;
+}
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 68908ab..bc8f367 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -9,6 +9,7 @@
 
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -79,7 +80,7 @@ private:
       PostfixStack.push_back(std::make_pair(Op, Val));
     }
     
-    void popOperator() { InfixOperatorStack.pop_back_val(); }
+    void popOperator() { InfixOperatorStack.pop_back(); }
     void pushOperator(InfixCalculatorTok Op) {
       // Push the new operator if the stack is empty.
       if (InfixOperatorStack.empty()) {
@@ -117,12 +118,12 @@ private:
         
         if (StackOp == IC_RPAREN) {
           ++ParenCount;
-          InfixOperatorStack.pop_back_val();
+          InfixOperatorStack.pop_back();
         } else if (StackOp == IC_LPAREN) {
           --ParenCount;
-          InfixOperatorStack.pop_back_val();
+          InfixOperatorStack.pop_back();
         } else {
-          InfixOperatorStack.pop_back_val();
+          InfixOperatorStack.pop_back();
           PostfixStack.push_back(std::make_pair(StackOp, 0));
         }
       }
@@ -219,7 +220,9 @@ private:
     const MCExpr *getSym() { return Sym; }
     StringRef getSymName() { return SymName; }
     int64_t getImm() { return Imm + IC.execute(); }
-    bool isValidEndState() { return State == IES_RBRAC; }
+    bool isValidEndState() {
+      return State == IES_RBRAC || State == IES_INTEGER;
+    }
     bool getStopOnLBrac() { return StopOnLBrac; }
     bool getAddImmPrefix() { return AddImmPrefix; }
     bool hadError() { return State == IES_ERROR; }
@@ -492,16 +495,17 @@ private:
   X86Operand *ParseATTOperand();
   X86Operand *ParseIntelOperand();
   X86Operand *ParseIntelOffsetOfOperator();
-  X86Operand *ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
+  bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
   X86Operand *ParseIntelOperator(unsigned OpKind);
-  X86Operand *ParseIntelMemOperand(unsigned SegReg, int64_t ImmDisp,
-                                   SMLoc StartLoc);
-  X86Operand *ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
+  X86Operand *ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+  X86Operand *ParseIntelMemOperand(int64_t ImmDisp, SMLoc StartLoc,
+                                   unsigned Size);
+  bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
   X86Operand *ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
                                        int64_t ImmDisp, unsigned Size);
-  X86Operand *ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
-                                   InlineAsmIdentifierInfo &Info,
-                                   bool IsUnevaluatedOperand, SMLoc &End);
+  bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
+                            InlineAsmIdentifierInfo &Info,
+                            bool IsUnevaluatedOperand, SMLoc &End);
 
   X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
 
@@ -552,8 +556,9 @@ private:
   /// }
 
 public:
-  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
-    : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+  X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
+               const MCInstrInfo &MII)
+      : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -811,6 +816,9 @@ struct X86Operand : public MCParsedAsmOperand {
   bool isMem256() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 256);
   }
+  bool isMem512() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 512);
+  }
 
   bool isMemVX32() const {
     return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
@@ -828,14 +836,45 @@ struct X86Operand : public MCParsedAsmOperand {
     return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
       getMemIndexReg() >= X86::YMM0 && getMemIndexReg() <= X86::YMM15;
   }
+  bool isMemVZ32() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 32) &&
+      getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+  }
+  bool isMemVZ64() const {
+    return Kind == Memory && (!Mem.Size || Mem.Size == 64) &&
+      getMemIndexReg() >= X86::ZMM0 && getMemIndexReg() <= X86::ZMM31;
+  }
 
   bool isAbsMem() const {
     return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
       !getMemIndexReg() && getMemScale() == 1;
   }
 
+  bool isMemOffs8() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 8);
+  }
+  bool isMemOffs16() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 16);
+  }
+  bool isMemOffs32() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 32);
+  }
+  bool isMemOffs64() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1 && (!Mem.Size || Mem.Size == 64);
+  }
+
   bool isReg() const { return Kind == Register; }
 
+  bool isGR32orGR64() const {
+    return Kind == Register &&
+      (X86MCRegisterClasses[X86::GR32RegClassID].contains(getReg()) ||
+      X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
+  }
+
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediates when possible.
     if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
@@ -849,43 +888,40 @@ struct X86Operand : public MCParsedAsmOperand {
     Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
-  void addImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    addExpr(Inst, getImm());
+  static unsigned getGR32FromGR64(unsigned RegNo) {
+    switch (RegNo) {
+    default: llvm_unreachable("Unexpected register");
+    case X86::RAX: return X86::EAX;
+    case X86::RCX: return X86::ECX;
+    case X86::RDX: return X86::EDX;
+    case X86::RBX: return X86::EBX;
+    case X86::RBP: return X86::EBP;
+    case X86::RSP: return X86::ESP;
+    case X86::RSI: return X86::ESI;
+    case X86::RDI: return X86::EDI;
+    case X86::R8: return X86::R8D;
+    case X86::R9: return X86::R9D;
+    case X86::R10: return X86::R10D;
+    case X86::R11: return X86::R11D;
+    case X86::R12: return X86::R12D;
+    case X86::R13: return X86::R13D;
+    case X86::R14: return X86::R14D;
+    case X86::R15: return X86::R15D;
+    case X86::RIP: return X86::EIP;
+    }
   }
 
-  void addMem8Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem16Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem32Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem64Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem80Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem128Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMem256Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVX32Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVY32Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
-  }
-  void addMemVX64Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
+  void addGR32orGR64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned RegNo = getReg();
+    if (X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo))
+      RegNo = getGR32FromGR64(RegNo);
+    Inst.addOperand(MCOperand::CreateReg(RegNo));
   }
-  void addMemVY64Operands(MCInst &Inst, unsigned N) const {
-    addMemOperands(Inst, N);
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    addExpr(Inst, getImm());
   }
 
   void addMemOperands(MCInst &Inst, unsigned N) const {
@@ -906,6 +942,15 @@ struct X86Operand : public MCParsedAsmOperand {
       Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
   }
 
+  void addMemOffsOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    // Add as immediates when possible.
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getMemDisp()))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+  }
+
   static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
     SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
     X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
@@ -1194,6 +1239,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
         }
       }
       assert (Found && "Unable to rewrite ImmDisp.");
+      (void)Found;
     } else {
       // We have a symbolic and an immediate displacement, but no displacement
       // before the bracketed expression.  Put the immediate displacement
@@ -1223,8 +1269,7 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> *AsmRewrites,
   }
 }
 
-X86Operand *
-X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
+bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
   const AsmToken &Tok = Parser.getTok();
 
   bool Done = false;
@@ -1246,7 +1291,7 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
         Done = true;
         break;
       }
-      return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+      return Error(Tok.getLoc(), "unknown token in expression");
     }
     case AsmToken::EndOfStatement: {
       Done = true;
@@ -1265,18 +1310,18 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
       } else {
         if (!isParsingInlineAsm()) {
           if (getParser().parsePrimaryExpr(Val, End))
-            return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+            return Error(Tok.getLoc(), "Unexpected identifier!");
         } else {
           InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
-          if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                                     /*Unevaluated*/ false, End))
-            return Err;
+          if (ParseIntelIdentifier(Val, Identifier, Info,
+                                   /*Unevaluated=*/false, End))
+            return true;
         }
         SM.onIdentifierExpr(Val, Identifier);
         UpdateLocLex = false;
         break;
       }
-      return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+      return Error(Tok.getLoc(), "Unexpected identifier!");
     }
     case AsmToken::Integer:
       if (isParsingInlineAsm() && SM.getAddImmPrefix())
@@ -1294,14 +1339,14 @@ X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
     case AsmToken::RParen:  SM.onRParen(); break;
     }
     if (SM.hadError())
-      return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+      return Error(Tok.getLoc(), "unknown token in expression");
 
     if (!Done && UpdateLocLex) {
       End = Tok.getLoc();
       Parser.Lex(); // Consume the token.
     }
   }
-  return 0;
+  return false;
 }
 
 X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
@@ -1318,8 +1363,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   // may have already parsed an immediate displacement before the bracketed
   // expression.
   IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
-  if (X86Operand *Err = ParseIntelExpression(SM, End))
-    return Err;
+  if (ParseIntelExpression(SM, End))
+    return 0;
 
   const MCExpr *Disp;
   if (const MCExpr *Sym = SM.getSym()) {
@@ -1337,8 +1382,8 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   // Parse the dot operator (e.g., [ebx].foo.bar).
   if (Tok.getString().startswith(".")) {
     const MCExpr *NewDisp;
-    if (X86Operand *Err = ParseIntelDotOperator(Disp, NewDisp))
-      return Err;
+    if (ParseIntelDotOperator(Disp, NewDisp))
+      return 0;
     
     End = Tok.getEndLoc();
     Parser.Lex();  // Eat the field.
@@ -1366,11 +1411,10 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
 }
 
 // Inline assembly may use variable names with namespace alias qualifiers.
-X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
-                                               StringRef &Identifier,
-                                               InlineAsmIdentifierInfo &Info,
-                                               bool IsUnevaluatedOperand,
-                                               SMLoc &End) {
+bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
+                                        StringRef &Identifier,
+                                        InlineAsmIdentifierInfo &Info,
+                                        bool IsUnevaluatedOperand, SMLoc &End) {
   assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
   Val = 0;
 
@@ -1395,68 +1439,89 @@ X86Operand *X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
   MCSymbol *Sym = getContext().GetOrCreateSymbol(Identifier);
   MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
   Val = MCSymbolRefExpr::Create(Sym, Variant, getParser().getContext());
-  return 0;
+  return false;
 }
 
-/// ParseIntelMemOperand - Parse intel style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg,
-                                               int64_t ImmDisp,
-                                               SMLoc Start) {
-  const AsmToken &Tok = Parser.getTok();
-  SMLoc End;
-
-  unsigned Size = getIntelMemOperandSize(Tok.getString());
-  if (Size) {
-    Parser.Lex(); // Eat operand size (e.g., byte, word).
-    if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
-      return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
-    Parser.Lex(); // Eat ptr.
-  }
-
-  // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+/// \brief Parse intel style segment override.
+X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
+                                                    SMLoc Start,
+                                                    unsigned Size) {
+  assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
+  const AsmToken &Tok = Parser.getTok(); // Eat colon.
+  if (Tok.isNot(AsmToken::Colon))
+    return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
+  Parser.Lex(); // Eat ':'
+
+  int64_t ImmDisp = 0;
   if (getLexer().is(AsmToken::Integer)) {
+    ImmDisp = Tok.getIntVal();
+    AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
+
     if (isParsingInlineAsm())
-      InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_ImmPrefix,
-                                                  Tok.getLoc()));
-    int64_t ImmDisp = Tok.getIntVal();
-    Parser.Lex(); // Eat the integer.
-    if (getLexer().isNot(AsmToken::LBrac))
-      return ErrorOperand(Start, "Expected '[' token!");
-    return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+      InstInfo->AsmRewrites->push_back(
+          AsmRewrite(AOK_ImmPrefix, ImmDispToken.getLoc()));
+
+    if (getLexer().isNot(AsmToken::LBrac)) {
+      // An immediate following a 'segment register', 'colon' token sequence can
+      // be followed by a bracketed expression.  If it isn't we know we have our
+      // final segment override.
+      const MCExpr *Disp = MCConstantExpr::Create(ImmDisp, getContext());
+      return X86Operand::CreateMem(SegReg, Disp, /*BaseReg=*/0, /*IndexReg=*/0,
+                                   /*Scale=*/1, Start, ImmDispToken.getEndLoc(),
+                                   Size);
+    }
   }
 
   if (getLexer().is(AsmToken::LBrac))
     return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
 
-  if (!ParseRegister(SegReg, Start, End)) {
-    // Handel SegReg : [ ... ]
-    if (getLexer().isNot(AsmToken::Colon))
-      return ErrorOperand(Start, "Expected ':' token!");
-    Parser.Lex(); // Eat :
-    if (getLexer().isNot(AsmToken::LBrac))
-      return ErrorOperand(Start, "Expected '[' token!");
-    return ParseIntelBracExpression(SegReg, Start, ImmDisp, Size);
+  const MCExpr *Val;
+  SMLoc End;
+  if (!isParsingInlineAsm()) {
+    if (getParser().parsePrimaryExpr(Val, End))
+      return ErrorOperand(Tok.getLoc(), "unknown token in expression");
+
+    return X86Operand::CreateMem(Val, Start, End, Size);
   }
 
+  InlineAsmIdentifierInfo Info;
+  StringRef Identifier = Tok.getString();
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return 0;
+  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+                               /*Scale=*/1, Start, End, Size, Identifier, Info);
+}
+
+/// ParseIntelMemOperand - Parse intel style memory operand.
+X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
+                                               unsigned Size) {
+  const AsmToken &Tok = Parser.getTok();
+  SMLoc End;
+
+  // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
+  if (getLexer().is(AsmToken::LBrac))
+    return ParseIntelBracExpression(/*SegReg=*/0, Start, ImmDisp, Size);
+
   const MCExpr *Val;
   if (!isParsingInlineAsm()) {
     if (getParser().parsePrimaryExpr(Val, End))
-      return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+      return ErrorOperand(Tok.getLoc(), "unknown token in expression");
 
     return X86Operand::CreateMem(Val, Start, End, Size);
   }
 
   InlineAsmIdentifierInfo Info;
   StringRef Identifier = Tok.getString();
-  if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                             /*Unevaluated*/ false, End))
-    return Err;
-  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return 0;
+  return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
                                /*Scale=*/1, Start, End, Size, Identifier, Info);
 }
 
 /// Parse the '.' operator.
-X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
                                                 const MCExpr *&NewDisp) {
   const AsmToken &Tok = Parser.getTok();
   int64_t OrigDispVal, DotDispVal;
@@ -1465,7 +1530,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
   if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
     OrigDispVal = OrigDisp->getValue();
   else
-    return ErrorOperand(Tok.getLoc(), "Non-constant offsets are not supported!");
+    return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
 
   // Drop the '.'.
   StringRef DotDispStr = Tok.getString().drop_front(1);
@@ -1480,10 +1545,10 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
     std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
     if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
                                            DotDisp))
-      return ErrorOperand(Tok.getLoc(), "Unable to lookup field reference!");
+      return Error(Tok.getLoc(), "Unable to lookup field reference!");
     DotDispVal = DotDisp;
   } else
-    return ErrorOperand(Tok.getLoc(), "Unexpected token type!");
+    return Error(Tok.getLoc(), "Unexpected token type!");
 
   if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
     SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
@@ -1494,7 +1559,7 @@ X86Operand *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
   }
 
   NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
-  return 0;
+  return false;
 }
 
 /// Parse the 'offset' operator.  This operator is used to specify the
@@ -1508,9 +1573,9 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
   InlineAsmIdentifierInfo Info;
   SMLoc Start = Tok.getLoc(), End;
   StringRef Identifier = Tok.getString();
-  if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                             /*Unevaluated*/ false, End))
-    return Err;
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/false, End))
+    return 0;
 
   // Don't emit the offset operator.
   InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -1544,9 +1609,12 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   InlineAsmIdentifierInfo Info;
   SMLoc Start = Tok.getLoc(), End;
   StringRef Identifier = Tok.getString();
-  if (X86Operand *Err = ParseIntelIdentifier(Val, Identifier, Info,
-                                             /*Unevaluated*/ true, End))
-    return Err;
+  if (ParseIntelIdentifier(Val, Identifier, Info,
+                           /*Unevaluated=*/true, End))
+    return 0;
+
+  if (!Info.OpDecl)
+    return ErrorOperand(Start, "unable to lookup expression");
 
   unsigned CVal = 0;
   switch(OpKind) {
@@ -1567,7 +1635,7 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
 
 X86Operand *X86AsmParser::ParseIntelOperand() {
   const AsmToken &Tok = Parser.getTok();
-  SMLoc Start = Tok.getLoc(), End;
+  SMLoc Start, End;
 
   // Offset, length, type and size operators.
   if (isParsingInlineAsm()) {
@@ -1582,14 +1650,23 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
       return ParseIntelOperator(IOK_TYPE);
   }
 
+  unsigned Size = getIntelMemOperandSize(Tok.getString());
+  if (Size) {
+    Parser.Lex(); // Eat operand size (e.g., byte, word).
+    if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
+      return ErrorOperand(Start, "Expected 'PTR' or 'ptr' token!");
+    Parser.Lex(); // Eat ptr.
+  }
+  Start = Tok.getLoc();
+
   // Immediate.
   if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Minus) ||
       getLexer().is(AsmToken::LParen)) {    
     AsmToken StartTok = Tok;
     IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
                              /*AddImmPrefix=*/false);
-    if (X86Operand *Err = ParseIntelExpression(SM, End))
-      return Err;
+    if (ParseIntelExpression(SM, End))
+      return 0;
 
     int64_t Imm = SM.getImm();
     if (isParsingInlineAsm()) {
@@ -1613,23 +1690,22 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
                           "before bracketed expr.");
 
     // Parse ImmDisp [ BaseReg + Scale*IndexReg + Disp ].
-    return ParseIntelMemOperand(/*SegReg=*/0, Imm, Start);
+    return ParseIntelMemOperand(Imm, Start, Size);
   }
 
   // Register.
   unsigned RegNo = 0;
   if (!ParseRegister(RegNo, Start, End)) {
     // If this is a segment register followed by a ':', then this is the start
-    // of a memory reference, otherwise this is a normal register reference.
+    // of a segment override, otherwise this is a normal register reference.
     if (getLexer().isNot(AsmToken::Colon))
       return X86Operand::CreateReg(RegNo, Start, End);
 
-    getParser().Lex(); // Eat the colon.
-    return ParseIntelMemOperand(/*SegReg=*/RegNo, /*Disp=*/0, Start);
+    return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
   }
 
   // Memory operand.
-  return ParseIntelMemOperand(/*SegReg=*/0, /*Disp=*/0, Start);
+  return ParseIntelMemOperand(/*Disp=*/0, Start, Size);
 }
 
 X86Operand *X86AsmParser::ParseATTOperand() {
@@ -1941,6 +2017,47 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
       }
     }
 
+    if (STI.getFeatureBits() & X86::FeatureAVX512) {
+      // Parse mask register {%k1}
+      if (getLexer().is(AsmToken::LCurly)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(X86Operand::CreateToken("{", Loc));
+        Parser.Lex();  // Eat the {
+        if (X86Operand *Op = ParseOperand()) {
+          Operands.push_back(Op);
+          if (!getLexer().is(AsmToken::RCurly)) {
+            SMLoc Loc = getLexer().getLoc();
+            Parser.eatToEndOfStatement();
+            return Error(Loc, "Expected } at this point");
+          }
+          Loc = Parser.getTok().getLoc();
+          Operands.push_back(X86Operand::CreateToken("}", Loc));
+          Parser.Lex();  // Eat the }
+        } else {
+          Parser.eatToEndOfStatement();
+          return true;
+        }
+      }
+      // Parse "zeroing non-masked" semantic {z}
+      if (getLexer().is(AsmToken::LCurly)) {
+        SMLoc Loc = Parser.getTok().getLoc();
+        Operands.push_back(X86Operand::CreateToken("{z}", Loc));
+        Parser.Lex();  // Eat the {
+        if (!getLexer().is(AsmToken::Identifier) || getLexer().getTok().getIdentifier() != "z") {
+          SMLoc Loc = getLexer().getLoc();
+          Parser.eatToEndOfStatement();
+          return Error(Loc, "Expected z at this point");
+        }
+        Parser.Lex();  // Eat the z
+        if (!getLexer().is(AsmToken::RCurly)) {
+            SMLoc Loc = getLexer().getLoc();
+            Parser.eatToEndOfStatement();
+            return Error(Loc, "Expected } at this point");
+        }
+        Parser.Lex();  // Eat the }
+      }
+    }
+
     if (getLexer().isNot(AsmToken::EndOfStatement)) {
       SMLoc Loc = getLexer().getLoc();
       Parser.eatToEndOfStatement();
@@ -2192,6 +2309,55 @@ processInstruction(MCInst &Inst,
   case X86::SBB16i16: return convert16i16to16ri8(Inst, X86::SBB16ri8);
   case X86::SBB32i32: return convert32i32to32ri8(Inst, X86::SBB32ri8);
   case X86::SBB64i32: return convert64i32to64ri8(Inst, X86::SBB64ri8);
+  case X86::VMOVAPDrr:
+  case X86::VMOVAPDYrr:
+  case X86::VMOVAPSrr:
+  case X86::VMOVAPSYrr:
+  case X86::VMOVDQArr:
+  case X86::VMOVDQAYrr:
+  case X86::VMOVDQUrr:
+  case X86::VMOVDQUYrr:
+  case X86::VMOVUPDrr:
+  case X86::VMOVUPDYrr:
+  case X86::VMOVUPSrr:
+  case X86::VMOVUPSYrr: {
+    if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+        !X86II::isX86_64ExtendedReg(Inst.getOperand(1).getReg()))
+      return false;
+
+    unsigned NewOpc;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VMOVAPDrr:  NewOpc = X86::VMOVAPDrr_REV;  break;
+    case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
+    case X86::VMOVAPSrr:  NewOpc = X86::VMOVAPSrr_REV;  break;
+    case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
+    case X86::VMOVDQArr:  NewOpc = X86::VMOVDQArr_REV;  break;
+    case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
+    case X86::VMOVDQUrr:  NewOpc = X86::VMOVDQUrr_REV;  break;
+    case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
+    case X86::VMOVUPDrr:  NewOpc = X86::VMOVUPDrr_REV;  break;
+    case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
+    case X86::VMOVUPSrr:  NewOpc = X86::VMOVUPSrr_REV;  break;
+    case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
+    }
+    Inst.setOpcode(NewOpc);
+    return true;
+  }
+  case X86::VMOVSDrr:
+  case X86::VMOVSSrr: {
+    if (X86II::isX86_64ExtendedReg(Inst.getOperand(0).getReg()) ||
+        !X86II::isX86_64ExtendedReg(Inst.getOperand(2).getReg()))
+      return false;
+    unsigned NewOpc;
+    switch (Inst.getOpcode()) {
+    default: llvm_unreachable("Invalid opcode");
+    case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV;   break;
+    case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV;   break;
+    }
+    Inst.setOpcode(NewOpc);
+    return true;
+  }
   }
 }
 
@@ -2306,25 +2472,25 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   unsigned Match1, Match2, Match3, Match4;
 
   Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                isParsingIntelSyntax());
+                                MatchingInlineAsm, isParsingIntelSyntax());
   // If this returned as a missing feature failure, remember that.
   if (Match1 == Match_MissingFeature)
     ErrorInfoMissingFeature = ErrorInfoIgnore;
   Tmp[Base.size()] = Suffixes[1];
   Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                isParsingIntelSyntax());
+                                MatchingInlineAsm, isParsingIntelSyntax());
   // If this returned as a missing feature failure, remember that.
   if (Match2 == Match_MissingFeature)
     ErrorInfoMissingFeature = ErrorInfoIgnore;
   Tmp[Base.size()] = Suffixes[2];
   Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                isParsingIntelSyntax());
+                                MatchingInlineAsm, isParsingIntelSyntax());
   // If this returned as a missing feature failure, remember that.
   if (Match3 == Match_MissingFeature)
     ErrorInfoMissingFeature = ErrorInfoIgnore;
   Tmp[Base.size()] = Suffixes[3];
   Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
-                                isParsingIntelSyntax());
+                                MatchingInlineAsm, isParsingIntelSyntax());
   // If this returned as a missing feature failure, remember that.
   if (Match4 == Match_MissingFeature)
     ErrorInfoMissingFeature = ErrorInfoIgnore;
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 7cb71f0..7e20151 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -53,7 +53,7 @@ endif()
 
 add_llvm_target(X86CodeGen ${sources})
 
-add_dependencies(LLVMX86CodeGen intrinsics_gen)
+add_dependencies(LLVMX86CodeGen X86CommonTableGen intrinsics_gen)
 
 add_subdirectory(AsmParser)
 add_subdirectory(Disassembler)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ca6f80c..903e36c 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -190,94 +190,8 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
                                      uint64_t Address, uint64_t Offset,
                                      uint64_t Width, MCInst &MI, 
                                      const MCDisassembler *Dis) {  
-  LLVMOpInfoCallback getOpInfo = Dis->getLLVMOpInfoCallback();
-  struct LLVMOpInfo1 SymbolicOp;
-  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
-  SymbolicOp.Value = Value;
-  void *DisInfo = Dis->getDisInfoBlock();
-
-  if (!getOpInfo ||
-      !getOpInfo(DisInfo, Address, Offset, Width, 1, &SymbolicOp)) {
-    // Clear SymbolicOp.Value from above and also all other fields.
-    memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
-    LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
-    if (!SymbolLookUp)
-      return false;
-    uint64_t ReferenceType;
-    if (isBranch)
-       ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
-    else
-       ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
-    const char *ReferenceName;
-    const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
-                                    &ReferenceName);
-    if (Name) {
-      SymbolicOp.AddSymbol.Name = Name;
-      SymbolicOp.AddSymbol.Present = true;
-    }
-    // For branches always create an MCExpr so it gets printed as hex address.
-    else if (isBranch) {
-      SymbolicOp.Value = Value;
-    }
-    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
-      (*Dis->CommentStream) << "symbol stub for: " << ReferenceName;
-    if (!Name && !isBranch)
-      return false;
-  }
-
-  MCContext *Ctx = Dis->getMCContext();
-  const MCExpr *Add = NULL;
-  if (SymbolicOp.AddSymbol.Present) {
-    if (SymbolicOp.AddSymbol.Name) {
-      StringRef Name(SymbolicOp.AddSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      Add = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Add = MCConstantExpr::Create((int)SymbolicOp.AddSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Sub = NULL;
-  if (SymbolicOp.SubtractSymbol.Present) {
-      if (SymbolicOp.SubtractSymbol.Name) {
-      StringRef Name(SymbolicOp.SubtractSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Sub = MCConstantExpr::Create((int)SymbolicOp.SubtractSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Off = NULL;
-  if (SymbolicOp.Value != 0)
-    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
-  const MCExpr *Expr;
-  if (Sub) {
-    const MCExpr *LHS;
-    if (Add)
-      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
-    else
-      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
-    else
-      Expr = LHS;
-  } else if (Add) {
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
-    else
-      Expr = Add;
-  } else {
-    if (Off != 0)
-      Expr = Off;
-    else
-      Expr = MCConstantExpr::Create(0, *Ctx);
-  }
-
-  MI.addOperand(MCOperand::CreateExpr(Expr));
-
-  return true;
+  return Dis->tryAddingSymbolicOperand(MI, Value, Address, isBranch,
+                                       Offset, Width);
 }
 
 /// tryAddingPcLoadReferenceComment - trys to add a comment as to what is being
@@ -290,15 +204,7 @@ static bool tryAddingSymbolicOperand(int64_t Value, bool isBranch,
 static void tryAddingPcLoadReferenceComment(uint64_t Address, uint64_t Value,
                                             const void *Decoder) {
   const MCDisassembler *Dis = static_cast<const MCDisassembler*>(Decoder);
-  LLVMSymbolLookupCallback SymbolLookUp = Dis->getLLVMSymbolLookupCallback();
-  if (SymbolLookUp) {
-    void *DisInfo = Dis->getDisInfoBlock();
-    uint64_t ReferenceType = LLVMDisassembler_ReferenceType_In_PCrel_Load;
-    const char *ReferenceName;
-    (void)SymbolLookUp(DisInfo, Value, &ReferenceType, Address, &ReferenceName);
-    if(ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
-      (*Dis->CommentStream) << "literal pool for: " << ReferenceName;
-  }
+  Dis->tryAddingPcLoadReferenceComment(Value, Address);
 }
 
 /// translateImmediate  - Appends an immediate operand to an MCInst.
@@ -325,16 +231,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
     default:
       break;
     case 1:
-      type = TYPE_MOFFS8;
+      if(immediate & 0x80)
+        immediate |= ~(0xffull);
       break;
     case 2:
-      type = TYPE_MOFFS16;
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
       break;
     case 4:
-      type = TYPE_MOFFS32;
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
       break;
     case 8:
-      type = TYPE_MOFFS64;
       break;
     }
   }
@@ -357,16 +265,18 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
           Opcode != X86::VMPSADBWrri && Opcode != X86::VDPPSYrri &&
           Opcode != X86::VDPPSYrmi && Opcode != X86::VDPPDrri &&
           Opcode != X86::VINSERTPSrr)
-        type = TYPE_MOFFS8;
+        if(immediate & 0x80)
+          immediate |= ~(0xffull);
       break;
     case ENCODING_IW:
-      type = TYPE_MOFFS16;
+      if(immediate & 0x8000)
+        immediate |= ~(0xffffull);
       break;
     case ENCODING_ID:
-      type = TYPE_MOFFS32;
+      if(immediate & 0x80000000)
+        immediate |= ~(0xffffffffull);
       break;
     case ENCODING_IO:
-      type = TYPE_MOFFS64;
       break;
     }
   }
@@ -380,33 +290,27 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
   case TYPE_XMM256:
     mcInst.addOperand(MCOperand::CreateReg(X86::YMM0 + (immediate >> 4)));
     return;
+  case TYPE_XMM512:
+    mcInst.addOperand(MCOperand::CreateReg(X86::ZMM0 + (immediate >> 4)));
+    return;
   case TYPE_REL8:
     isBranch = true;
     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
-    // fall through to sign extend the immediate if needed.
-  case TYPE_MOFFS8:
     if(immediate & 0x80)
       immediate |= ~(0xffull);
     break;
-  case TYPE_MOFFS16:
-    if(immediate & 0x8000)
-      immediate |= ~(0xffffull);
-    break;
   case TYPE_REL32:
   case TYPE_REL64:
     isBranch = true;
     pcrel = insn.startLocation + insn.immediateOffset + insn.immediateSize;
-    // fall through to sign extend the immediate if needed.
-  case TYPE_MOFFS32:
     if(immediate & 0x80000000)
       immediate |= ~(0xffffffffull);
     break;
-  case TYPE_MOFFS64:
   default:
     // operand is 64 bits wide.  Do nothing.
     break;
   }
-    
+
   if(!tryAddingSymbolicOperand(immediate + pcrel, isBranch, insn.startLocation,
                                insn.immediateOffset, insn.immediateSize,
                                mcInst, Dis))
@@ -537,6 +441,7 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
       EA_BASES_64BIT
       REGS_XMM
       REGS_YMM
+      REGS_ZMM
 #undef ENTRY
       }
     } else {
@@ -659,6 +564,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
   case TYPE_XMM64:
   case TYPE_XMM128:
   case TYPE_XMM256:
+  case TYPE_XMM512:
   case TYPE_DEBUGREG:
   case TYPE_CONTROLREG:
     return translateRMRegister(mcInst, insn);
@@ -777,6 +683,15 @@ static bool translateInstruction(MCInst &mcInst,
   }
   
   mcInst.setOpcode(insn.instructionID);
+  // If when reading the prefix bytes we determined the overlapping 0xf2 or 0xf3
+  // prefix bytes should be disassembled as xrelease and xacquire then set the
+  // opcode to those instead of the rep and repne opcodes.
+  if (insn.xAcquireRelease) {
+    if(mcInst.getOpcode() == X86::REP_PREFIX)
+      mcInst.setOpcode(X86::XRELEASE_PREFIX);
+    else if(mcInst.getOpcode() == X86::REPNE_PREFIX)
+      mcInst.setOpcode(X86::XACQUIRE_PREFIX);
+  }
   
   int index;
   
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index e40edba..c81a857 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -25,8 +25,6 @@
 #define TRUE  1
 #define FALSE 0
 
-typedef int8_t bool;
-
 #ifndef NDEBUG
 #define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
 #else
@@ -81,6 +79,15 @@ static int modRMRequired(OpcodeType type,
   case THREEBYTE_A7:
     decision = &THREEBYTEA7_SYM;
     break;
+  case XOP8_MAP:
+    decision = &XOP8_MAP_SYM;
+    break;
+  case XOP9_MAP:
+    decision = &XOP9_MAP_SYM;
+    break;
+  case XOPA_MAP:
+    decision = &XOPA_MAP_SYM;
+    break;
   }
 
   return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
@@ -122,6 +129,15 @@ static InstrUID decode(OpcodeType type,
   case THREEBYTE_A7:
     dec = &THREEBYTEA7_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
     break;
+  case XOP8_MAP:
+    dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOP9_MAP:
+    dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case XOPA_MAP:
+    dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
   }
 
   switch (dec->modrm_type) {
@@ -305,6 +321,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
   BOOL prefixGroups[4] = { FALSE };
   uint64_t prefixLocation;
   uint8_t byte = 0;
+  uint8_t nextByte;
 
   BOOL hasAdSize = FALSE;
   BOOL hasOpSize = FALSE;
@@ -314,20 +331,42 @@ static int readPrefixes(struct InternalInstruction* insn) {
   while (isPrefix) {
     prefixLocation = insn->readerCursor;
 
+    /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
     if (consumeByte(insn, &byte))
-      return -1;
+      break;
 
     /*
      * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
      * break and let it be disassembled as a normal "instruction".
      */
+    if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+      break;
+
     if (insn->readerCursor - 1 == insn->startLocation
-        && (byte == 0xf0 || byte == 0xf2 || byte == 0xf3)) {
-      uint8_t nextByte;
-      if (byte == 0xf0)
-        break;
-      if (lookAtByte(insn, &nextByte))
-        return -1;
+        && (byte == 0xf2 || byte == 0xf3)
+        && !lookAtByte(insn, &nextByte))
+    {
+      /*
+       * If the byte is 0xf2 or 0xf3, and any of the following conditions are
+       * met:
+       * - it is followed by a LOCK (0xf0) prefix
+       * - it is followed by an xchg instruction
+       * then it should be disassembled as a xacquire/xrelease not repne/rep.
+       */
+      if ((byte == 0xf2 || byte == 0xf3) &&
+          ((nextByte == 0xf0) |
+          ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+        insn->xAcquireRelease = TRUE;
+      /*
+       * Also if the byte is 0xf3, and the following condition is met:
+       * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+       *                       "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+       * then it should be disassembled as an xrelease not rep.
+       */
+      if (byte == 0xf3 &&
+          (nextByte == 0x88 || nextByte == 0x89 ||
+           nextByte == 0xc6 || nextByte == 0xc7))
+        insn->xAcquireRelease = TRUE;
       if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
         if (consumeByte(insn, &nextByte))
           return -1;
@@ -405,7 +444,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       dbgprintf(insn, "Found prefix 0x%hhx", byte);
   }
 
-  insn->vexSize = 0;
+  insn->vexXopType = TYPE_NO_VEX_XOP;
 
   if (byte == 0xc4) {
     uint8_t byte1;
@@ -416,7 +455,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
     }
 
     if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
-      insn->vexSize = 3;
+      insn->vexXopType = TYPE_VEX_3B;
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
     }
     else {
@@ -424,22 +463,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
       insn->necessaryPrefixLocation = insn->readerCursor - 1;
     }
 
-    if (insn->vexSize == 3) {
-      insn->vexPrefix[0] = byte;
-      consumeByte(insn, &insn->vexPrefix[1]);
-      consumeByte(insn, &insn->vexPrefix[2]);
+    if (insn->vexXopType == TYPE_VEX_3B) {
+      insn->vexXopPrefix[0] = byte;
+      consumeByte(insn, &insn->vexXopPrefix[1]);
+      consumeByte(insn, &insn->vexXopPrefix[2]);
 
       /* We simulate the REX prefix for simplicity's sake */
 
       if (insn->mode == MODE_64BIT) {
         insn->rexPrefix = 0x40
-                        | (wFromVEX3of3(insn->vexPrefix[2]) << 3)
-                        | (rFromVEX2of3(insn->vexPrefix[1]) << 2)
-                        | (xFromVEX2of3(insn->vexPrefix[1]) << 1)
-                        | (bFromVEX2of3(insn->vexPrefix[1]) << 0);
+                        | (wFromVEX3of3(insn->vexXopPrefix[2]) << 3)
+                        | (rFromVEX2of3(insn->vexXopPrefix[1]) << 2)
+                        | (xFromVEX2of3(insn->vexXopPrefix[1]) << 1)
+                        | (bFromVEX2of3(insn->vexXopPrefix[1]) << 0);
       }
 
-      switch (ppFromVEX3of3(insn->vexPrefix[2]))
+      switch (ppFromVEX3of3(insn->vexXopPrefix[2]))
       {
       default:
         break;
@@ -448,7 +487,9 @@ static int readPrefixes(struct InternalInstruction* insn) {
         break;
       }
 
-      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1], insn->vexPrefix[2]);
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vexXopPrefix[0], insn->vexXopPrefix[1],
+                insn->vexXopPrefix[2]);
     }
   }
   else if (byte == 0xc5) {
@@ -460,22 +501,22 @@ static int readPrefixes(struct InternalInstruction* insn) {
     }
 
     if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
-      insn->vexSize = 2;
+      insn->vexXopType = TYPE_VEX_2B;
     }
     else {
       unconsumeByte(insn);
     }
 
-    if (insn->vexSize == 2) {
-      insn->vexPrefix[0] = byte;
-      consumeByte(insn, &insn->vexPrefix[1]);
+    if (insn->vexXopType == TYPE_VEX_2B) {
+      insn->vexXopPrefix[0] = byte;
+      consumeByte(insn, &insn->vexXopPrefix[1]);
 
       if (insn->mode == MODE_64BIT) {
         insn->rexPrefix = 0x40
-                        | (rFromVEX2of2(insn->vexPrefix[1]) << 2);
+                        | (rFromVEX2of2(insn->vexXopPrefix[1]) << 2);
       }
 
-      switch (ppFromVEX2of2(insn->vexPrefix[1]))
+      switch (ppFromVEX2of2(insn->vexXopPrefix[1]))
       {
       default:
         break;
@@ -484,7 +525,53 @@ static int readPrefixes(struct InternalInstruction* insn) {
         break;
       }
 
-      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexPrefix[0], insn->vexPrefix[1]);
+      dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx", insn->vexXopPrefix[0], insn->vexXopPrefix[1]);
+    }
+  }
+  else if (byte == 0x8f) {
+    uint8_t byte1;
+
+    if (lookAtByte(insn, &byte1)) {
+      dbgprintf(insn, "Couldn't read second byte of XOP");
+      return -1;
+    }
+
+    if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+      insn->vexXopType = TYPE_XOP;
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+    else {
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+
+    if (insn->vexXopType == TYPE_XOP) {
+      insn->vexXopPrefix[0] = byte;
+      consumeByte(insn, &insn->vexXopPrefix[1]);
+      consumeByte(insn, &insn->vexXopPrefix[2]);
+
+      /* We simulate the REX prefix for simplicity's sake */
+
+      if (insn->mode == MODE_64BIT) {
+        insn->rexPrefix = 0x40
+                        | (wFromXOP3of3(insn->vexXopPrefix[2]) << 3)
+                        | (rFromXOP2of3(insn->vexXopPrefix[1]) << 2)
+                        | (xFromXOP2of3(insn->vexXopPrefix[1]) << 1)
+                        | (bFromXOP2of3(insn->vexXopPrefix[1]) << 0);
+      }
+
+      switch (ppFromXOP3of3(insn->vexXopPrefix[2]))
+      {
+      default:
+        break;
+      case VEX_PREFIX_66:
+        hasOpSize = TRUE;
+        break;
+      }
+
+      dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+                insn->vexXopPrefix[0], insn->vexXopPrefix[1],
+                insn->vexXopPrefix[2]);
     }
   }
   else {
@@ -559,37 +646,49 @@ static int readOpcode(struct InternalInstruction* insn) {
 
   insn->opcodeType = ONEBYTE;
 
-  if (insn->vexSize == 3)
+  if (insn->vexXopType == TYPE_VEX_3B)
   {
-    switch (mmmmmFromVEX2of3(insn->vexPrefix[1]))
+    switch (mmmmmFromVEX2of3(insn->vexXopPrefix[1]))
     {
     default:
-      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)", mmmmmFromVEX2of3(insn->vexPrefix[1]));
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
       return -1;
-    case 0:
-      break;
     case VEX_LOB_0F:
-      insn->twoByteEscape = 0x0f;
       insn->opcodeType = TWOBYTE;
       return consumeByte(insn, &insn->opcode);
     case VEX_LOB_0F38:
-      insn->twoByteEscape = 0x0f;
-      insn->threeByteEscape = 0x38;
       insn->opcodeType = THREEBYTE_38;
       return consumeByte(insn, &insn->opcode);
     case VEX_LOB_0F3A:
-      insn->twoByteEscape = 0x0f;
-      insn->threeByteEscape = 0x3a;
       insn->opcodeType = THREEBYTE_3A;
       return consumeByte(insn, &insn->opcode);
     }
   }
-  else if (insn->vexSize == 2)
+  else if (insn->vexXopType == TYPE_VEX_2B)
   {
-    insn->twoByteEscape = 0x0f;
     insn->opcodeType = TWOBYTE;
     return consumeByte(insn, &insn->opcode);
   }
+  else if (insn->vexXopType == TYPE_XOP)
+  {
+    switch (mmmmmFromXOP2of3(insn->vexXopPrefix[1]))
+    {
+    default:
+      dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
+                mmmmmFromVEX2of3(insn->vexXopPrefix[1]));
+      return -1;
+    case XOP_MAP_SELECT_8:
+      insn->opcodeType = XOP8_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_9:
+      insn->opcodeType = XOP9_MAP;
+      return consumeByte(insn, &insn->opcode);
+    case XOP_MAP_SELECT_A:
+      insn->opcodeType = XOPA_MAP;
+      return consumeByte(insn, &insn->opcode);
+    }
+  }
 
   if (consumeByte(insn, &current))
     return -1;
@@ -597,16 +696,12 @@ static int readOpcode(struct InternalInstruction* insn) {
   if (current == 0x0f) {
     dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
 
-    insn->twoByteEscape = current;
-
     if (consumeByte(insn, &current))
       return -1;
 
     if (current == 0x38) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -614,8 +709,6 @@ static int readOpcode(struct InternalInstruction* insn) {
     } else if (current == 0x3a) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -623,8 +716,6 @@ static int readOpcode(struct InternalInstruction* insn) {
     } else if (current == 0xa6) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -632,8 +723,6 @@ static int readOpcode(struct InternalInstruction* insn) {
     } else if (current == 0xa7) {
       dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
 
-      insn->threeByteEscape = current;
-
       if (consumeByte(insn, &current))
         return -1;
 
@@ -747,11 +836,27 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
   if (insn->mode == MODE_64BIT)
     attrMask |= ATTR_64BIT;
 
-  if (insn->vexSize) {
+  if (insn->vexXopType != TYPE_NO_VEX_XOP) {
     attrMask |= ATTR_VEX;
 
-    if (insn->vexSize == 3) {
-      switch (ppFromVEX3of3(insn->vexPrefix[2])) {
+    if (insn->vexXopType == TYPE_VEX_3B) {
+      switch (ppFromVEX3of3(insn->vexXopPrefix[2])) {
+      case VEX_PREFIX_66:
+        attrMask |= ATTR_OPSIZE;
+        break;
+      case VEX_PREFIX_F3:
+        attrMask |= ATTR_XS;
+        break;
+      case VEX_PREFIX_F2:
+        attrMask |= ATTR_XD;
+        break;
+      }
+
+      if (lFromVEX3of3(insn->vexXopPrefix[2]))
+        attrMask |= ATTR_VEXL;
+    }
+    else if (insn->vexXopType == TYPE_VEX_2B) {
+      switch (ppFromVEX2of2(insn->vexXopPrefix[1])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
         break;
@@ -763,11 +868,11 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
         break;
       }
 
-      if (lFromVEX3of3(insn->vexPrefix[2]))
+      if (lFromVEX2of2(insn->vexXopPrefix[1]))
         attrMask |= ATTR_VEXL;
     }
-    else if (insn->vexSize == 2) {
-      switch (ppFromVEX2of2(insn->vexPrefix[1])) {
+    else if (insn->vexXopType == TYPE_XOP) {
+      switch (ppFromXOP3of3(insn->vexXopPrefix[2])) {
       case VEX_PREFIX_66:
         attrMask |= ATTR_OPSIZE;
         break;
@@ -779,7 +884,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
         break;
       }
 
-      if (lFromVEX2of2(insn->vexPrefix[1]))
+      if (lFromXOP3of3(insn->vexXopPrefix[2]))
         attrMask |= ATTR_VEXL;
     }
     else {
@@ -805,42 +910,6 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
 
   /* The following clauses compensate for limitations of the tables. */
 
-  if ((attrMask & ATTR_VEXL) && (attrMask & ATTR_REXW) &&
-      !(attrMask & ATTR_OPSIZE)) {
-    /*
-     * Some VEX instructions ignore the L-bit, but use the W-bit. Normally L-bit
-     * has precedence since there are no L-bit with W-bit entries in the tables.
-     * So if the L-bit isn't significant we should use the W-bit instead.
-     * We only need to do this if the instruction doesn't specify OpSize since
-     * there is a VEX_L_W_OPSIZE table.
-     */
-
-    const struct InstructionSpecifier *spec;
-    uint16_t instructionIDWithWBit;
-    const struct InstructionSpecifier *specWithWBit;
-
-    spec = specifierForUID(instructionID);
-
-    if (getIDWithAttrMask(&instructionIDWithWBit,
-                          insn,
-                          (attrMask & (~ATTR_VEXL)) | ATTR_REXW)) {
-      insn->instructionID = instructionID;
-      insn->spec = spec;
-      return 0;
-    }
-
-    specWithWBit = specifierForUID(instructionIDWithWBit);
-
-    if (instructionID != instructionIDWithWBit) {
-      insn->instructionID = instructionIDWithWBit;
-      insn->spec = specWithWBit;
-    } else {
-      insn->instructionID = instructionID;
-      insn->spec = spec;
-    }
-    return 0;
-  }
-
   if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
     /*
      * The instruction tables make no distinction between instructions that
@@ -1234,6 +1303,8 @@ static int readModRM(struct InternalInstruction* insn) {
       return prefix##_EAX + index;                        \
     case TYPE_R64:                                        \
       return prefix##_RAX + index;                        \
+    case TYPE_XMM512:                                     \
+      return prefix##_ZMM0 + index;                       \
     case TYPE_XMM256:                                     \
       return prefix##_YMM0 + index;                       \
     case TYPE_XMM128:                                     \
@@ -1479,10 +1550,12 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
 static int readVVVV(struct InternalInstruction* insn) {
   dbgprintf(insn, "readVVVV()");
 
-  if (insn->vexSize == 3)
-    insn->vvvv = vvvvFromVEX3of3(insn->vexPrefix[2]);
-  else if (insn->vexSize == 2)
-    insn->vvvv = vvvvFromVEX2of2(insn->vexPrefix[1]);
+  if (insn->vexXopType == TYPE_VEX_3B)
+    insn->vvvv = vvvvFromVEX3of3(insn->vexXopPrefix[2]);
+  else if (insn->vexXopType == TYPE_VEX_2B)
+    insn->vvvv = vvvvFromVEX2of2(insn->vexXopPrefix[1]);
+  else if (insn->vexXopType == TYPE_XOP)
+    insn->vvvv = vvvvFromXOP3of3(insn->vexXopPrefix[2]);
   else
     return -1;
 
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 407ead3..6d03d5c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -59,6 +59,15 @@ extern "C" {
 #define lFromVEX2of2(vex)       (((vex) & 0x4) >> 2)
 #define ppFromVEX2of2(vex)      ((vex) & 0x3)
 
+#define rFromXOP2of3(xop)       (((~(xop)) & 0x80) >> 7)
+#define xFromXOP2of3(xop)       (((~(xop)) & 0x40) >> 6)
+#define bFromXOP2of3(xop)       (((~(xop)) & 0x20) >> 5)
+#define mmmmmFromXOP2of3(xop)   ((xop) & 0x1f)
+#define wFromXOP3of3(xop)       (((xop) & 0x80) >> 7)
+#define vvvvFromXOP3of3(vex)    (((~(vex)) & 0x78) >> 3)
+#define lFromXOP3of3(xop)       (((xop) & 0x4) >> 2)
+#define ppFromXOP3of3(xop)      ((xop) & 0x3)
+
 /*
  * These enums represent Intel registers for use by the decoder.
  */
@@ -219,7 +228,23 @@ extern "C" {
   ENTRY(XMM12)    \
   ENTRY(XMM13)    \
   ENTRY(XMM14)    \
-  ENTRY(XMM15)
+  ENTRY(XMM15)    \
+  ENTRY(XMM16)    \
+  ENTRY(XMM17)    \
+  ENTRY(XMM18)    \
+  ENTRY(XMM19)    \
+  ENTRY(XMM20)    \
+  ENTRY(XMM21)    \
+  ENTRY(XMM22)    \
+  ENTRY(XMM23)    \
+  ENTRY(XMM24)    \
+  ENTRY(XMM25)    \
+  ENTRY(XMM26)    \
+  ENTRY(XMM27)    \
+  ENTRY(XMM28)    \
+  ENTRY(XMM29)    \
+  ENTRY(XMM30)    \
+  ENTRY(XMM31)
 
 #define REGS_YMM  \
   ENTRY(YMM0)     \
@@ -237,7 +262,57 @@ extern "C" {
   ENTRY(YMM12)    \
   ENTRY(YMM13)    \
   ENTRY(YMM14)    \
-  ENTRY(YMM15)
+  ENTRY(YMM15)    \
+  ENTRY(YMM16)    \
+  ENTRY(YMM17)    \
+  ENTRY(YMM18)    \
+  ENTRY(YMM19)    \
+  ENTRY(YMM20)    \
+  ENTRY(YMM21)    \
+  ENTRY(YMM22)    \
+  ENTRY(YMM23)    \
+  ENTRY(YMM24)    \
+  ENTRY(YMM25)    \
+  ENTRY(YMM26)    \
+  ENTRY(YMM27)    \
+  ENTRY(YMM28)    \
+  ENTRY(YMM29)    \
+  ENTRY(YMM30)    \
+  ENTRY(YMM31)
+
+#define REGS_ZMM  \
+  ENTRY(ZMM0)     \
+  ENTRY(ZMM1)     \
+  ENTRY(ZMM2)     \
+  ENTRY(ZMM3)     \
+  ENTRY(ZMM4)     \
+  ENTRY(ZMM5)     \
+  ENTRY(ZMM6)     \
+  ENTRY(ZMM7)     \
+  ENTRY(ZMM8)     \
+  ENTRY(ZMM9)     \
+  ENTRY(ZMM10)    \
+  ENTRY(ZMM11)    \
+  ENTRY(ZMM12)    \
+  ENTRY(ZMM13)    \
+  ENTRY(ZMM14)    \
+  ENTRY(ZMM15)    \
+  ENTRY(ZMM16)    \
+  ENTRY(ZMM17)    \
+  ENTRY(ZMM18)    \
+  ENTRY(ZMM19)    \
+  ENTRY(ZMM20)    \
+  ENTRY(ZMM21)    \
+  ENTRY(ZMM22)    \
+  ENTRY(ZMM23)    \
+  ENTRY(ZMM24)    \
+  ENTRY(ZMM25)    \
+  ENTRY(ZMM26)    \
+  ENTRY(ZMM27)    \
+  ENTRY(ZMM28)    \
+  ENTRY(ZMM29)    \
+  ENTRY(ZMM30)    \
+  ENTRY(ZMM31)
 
 #define REGS_SEGMENT \
   ENTRY(ES)          \
@@ -285,6 +360,7 @@ extern "C" {
   REGS_MMX            \
   REGS_XMM            \
   REGS_YMM            \
+  REGS_ZMM            \
   REGS_SEGMENT        \
   REGS_DEBUG          \
   REGS_CONTROL        \
@@ -319,6 +395,7 @@ typedef enum {
   ALL_EA_BASES
   REGS_XMM
   REGS_YMM
+  REGS_ZMM
 #undef ENTRY
   SIB_INDEX_max
 } SIBIndex;
@@ -379,6 +456,12 @@ typedef enum {
   VEX_LOB_0F3A = 0x3
 } VEXLeadingOpcodeByte;
 
+typedef enum {
+  XOP_MAP_SELECT_8 = 0x8,
+  XOP_MAP_SELECT_9 = 0x9,
+  XOP_MAP_SELECT_A = 0xA
+} XOPMapSelect;
+
 /*
  * VEXPrefixCode - Possible values for the VEX.pp field
  */
@@ -390,6 +473,13 @@ typedef enum {
   VEX_PREFIX_F2 = 0x3
 } VEXPrefixCode;
 
+typedef enum {
+  TYPE_NO_VEX_XOP = 0x0,
+  TYPE_VEX_2B = 0x1,
+  TYPE_VEX_3B = 0x2,
+  TYPE_XOP = 0x3
+} VEXXOPType;
+
 typedef uint8_t BOOL;
 
 /*
@@ -446,10 +536,10 @@ struct InternalInstruction {
   uint8_t prefixPresent[0x100];
   /* contains the location (for use with the reader) of the prefix byte */
   uint64_t prefixLocations[0x100];
-  /* The value of the VEX prefix, if present */
-  uint8_t vexPrefix[3];
+  /* The value of the VEX/XOP prefix, if present */
+  uint8_t vexXopPrefix[3];
   /* The length of the VEX prefix (0 if not present) */
-  uint8_t vexSize;
+  VEXXOPType vexXopType;
   /* The value of the REX prefix, if present */
   uint8_t rexPrefix;
   /* The location where a mandatory prefix would have to be (i.e., right before
@@ -457,6 +547,8 @@ struct InternalInstruction {
   uint64_t necessaryPrefixLocation;
   /* The segment override type */
   SegmentOverride segmentOverride;
+  /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */
+  BOOL xAcquireRelease;
 
   /* Sizes of various critical pieces of data, in bytes */
   uint8_t registerSize;
@@ -471,10 +563,6 @@ struct InternalInstruction {
 
   /* opcode state */
 
-  /* The value of the two-byte escape prefix (usually 0x0f) */
-  uint8_t twoByteEscape;
-  /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
-  uint8_t threeByteEscape;
   /* The last byte of the opcode, not counting any ModR/M extension */
   uint8_t opcode;
   /* The ModR/M byte of the instruction, if it is an opcode extension */
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 23dfe4b..dd1719c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -32,6 +32,9 @@
 #define THREEBYTE3A_SYM   x86DisassemblerThreeByte3AOpcodes
 #define THREEBYTEA6_SYM   x86DisassemblerThreeByteA6Opcodes
 #define THREEBYTEA7_SYM   x86DisassemblerThreeByteA7Opcodes
+#define XOP8_MAP_SYM      x86DisassemblerXOP8Opcodes
+#define XOP9_MAP_SYM      x86DisassemblerXOP9Opcodes
+#define XOPA_MAP_SYM      x86DisassemblerXOPAOpcodes
 
 #define INSTRUCTIONS_STR  "x86DisassemblerInstrSpecifiers"
 #define CONTEXTS_STR      "x86DisassemblerContexts"
@@ -41,6 +44,9 @@
 #define THREEBYTE3A_STR   "x86DisassemblerThreeByte3AOpcodes"
 #define THREEBYTEA6_STR   "x86DisassemblerThreeByteA6Opcodes"
 #define THREEBYTEA7_STR   "x86DisassemblerThreeByteA7Opcodes"
+#define XOP8_MAP_STR      "x86DisassemblerXOP8Opcodes"
+#define XOP9_MAP_STR      "x86DisassemblerXOP9Opcodes"
+#define XOPA_MAP_STR      "x86DisassemblerXOPAOpcodes"
 
 /*
  * Attributes of an instruction that must be known before the opcode can be
@@ -116,8 +122,154 @@ enum attributeBits {
   ENUM_ENTRY(IC_VEX_L_XS,           4,  "requires VEX and the L and XS prefix")\
   ENUM_ENTRY(IC_VEX_L_XD,           4,  "requires VEX and the L and XD prefix")\
   ENUM_ENTRY(IC_VEX_L_OPSIZE,       4,  "requires VEX, L, and OpSize")         \
-  ENUM_ENTRY(IC_VEX_L_W_OPSIZE,     5,  "requires VEX, L, W and OpSize")
-
+  ENUM_ENTRY(IC_VEX_L_W,            4,  "requires VEX, L and W")               \
+  ENUM_ENTRY(IC_VEX_L_W_XS,         5,  "requires VEX, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_XD,         5,  "requires VEX, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_VEX_L_W_OPSIZE,     5,  "requires VEX, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX,               1,  "requires an EVEX prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS,            2,  "requires EVEX and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD,            2,  "requires EVEX and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE,        2,  "requires EVEX and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W,             3,  "requires EVEX and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS,          4,  "requires EVEX, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD,          4,  "requires EVEX, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE,      4,  "requires EVEX, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L,             3,  "requires EVEX and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS,          4,  "requires EVEX and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD,          4,  "requires EVEX and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE,      4,  "requires EVEX, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W,           3,  "requires EVEX, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS,        4,  "requires EVEX, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD,        4,  "requires EVEX, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE,    4,  "requires EVEX, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2,            3,  "requires EVEX and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS,         4,  "requires EVEX and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD,         4,  "requires EVEX and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE,     4,  "requires EVEX, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W,          3,  "requires EVEX, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS,       4,  "requires EVEX, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD,       4,  "requires EVEX, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE,   4,  "requires EVEX, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_K,             1,  "requires an EVEX_K prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_K,          2,  "requires EVEX_K and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_K,          2,  "requires EVEX_K and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_K,      2,  "requires EVEX_K and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_K,           3,  "requires EVEX_K and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_K,        4,  "requires EVEX_K, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_K,        4,  "requires EVEX_K, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_K,    4,  "requires EVEX_K, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_K,           3,  "requires EVEX_K and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_K,        4,  "requires EVEX_K and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_K,        4,  "requires EVEX_K and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_K,    4,  "requires EVEX_K, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_K,         3,  "requires EVEX_K, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_K,      4,  "requires EVEX_K, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_K,      4,  "requires EVEX_K, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K,  4,  "requires EVEX_K, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_K,          3,  "requires EVEX_K and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_K,       4,  "requires EVEX_K and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_K,       4,  "requires EVEX_K and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K,   4,  "requires EVEX_K, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_K,        3,  "requires EVEX_K, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_K,     4,  "requires EVEX_K, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_K,     4,  "requires EVEX_K, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K, 4,  "requires EVEX_K, L2, W and OpSize")     \
+  ENUM_ENTRY(IC_EVEX_B,             1,  "requires an EVEX_B prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_B,          2,  "requires EVEX_B and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_B,          2,  "requires EVEX_B and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_B,      2,  "requires EVEX_B and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_B,           3,  "requires EVEX_B and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_B,        4,  "requires EVEX_B, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_B,        4,  "requires EVEX_B, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_B,    4,  "requires EVEX_B, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_B,           3,  "requires EVEX_B and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_B,        4,  "requires EVEX_B and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_B,        4,  "requires EVEX_B and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_B,    4,  "requires EVEX_B, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_B,         3,  "requires EVEX_B, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_B,      4,  "requires EVEX_B, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_B,      4,  "requires EVEX_B, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_B,  4,  "requires EVEX_B, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_B,          3,  "requires EVEX_B and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_B,       4,  "requires EVEX_B and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_B,       4,  "requires EVEX_B and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_B,   4,  "requires EVEX_B, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_B,        3,  "requires EVEX_B, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_B,     4,  "requires EVEX_B, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_B,     4,  "requires EVEX_B, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_B, 4,  "requires EVEX_B, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_K_B,             1,  "requires EVEX_B and EVEX_K prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_K_B,          2,  "requires EVEX_B, EVEX_K and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_K_B,          2,  "requires EVEX_B, EVEX_K and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_K_B,      2,  "requires EVEX_B, EVEX_K and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_K_B,           3,  "requires EVEX_B, EVEX_K and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_K_B,        4,  "requires EVEX_B, EVEX_K, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_K_B,        4,  "requires EVEX_B, EVEX_K, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_K_B,    4,  "requires EVEX_B, EVEX_K, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_K_B,           3,  "requires EVEX_B, EVEX_K and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_K_B,        4,  "requires EVEX_B, EVEX_K and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_K_B,        4,  "requires EVEX_B, EVEX_K and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_K_B,    4,  "requires EVEX_B, EVEX_K, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_K_B,         3,  "requires EVEX_B, EVEX_K, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_K_B,      4,  "requires EVEX_B, EVEX_K, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_K_B,      4,  "requires EVEX_B, EVEX_K, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_K_B,  4,  "requires EVEX_B, EVEX_K, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_K_B,          3,  "requires EVEX_B, EVEX_K and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_K_B,       4,  "requires EVEX_B, EVEX_K and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_K_B,       4,  "requires EVEX_B, EVEX_K and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_K_B,   4,  "requires EVEX_B, EVEX_K, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_K_B,        3,  "requires EVEX_B, EVEX_K, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_K_B,     4,  "requires EVEX_B, EVEX_K, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_K_B,     4,  "requires EVEX_B, EVEX_K, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_K_B, 4,  "requires EVEX_B, EVEX_K, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ_B,             1,  "requires EVEX_B and EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ_B,          2,  "requires EVEX_B, EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ_B,          2,  "requires EVEX_B, EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ_B,      2,  "requires EVEX_B, EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ_B,           3,  "requires EVEX_B, EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ_B,        4,  "requires EVEX_B, EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ_B,        4,  "requires EVEX_B, EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ_B,    4,  "requires EVEX_B, EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ_B,           3,  "requires EVEX_B, EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ_B,        4,  "requires EVEX_B, EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ_B,    4,  "requires EVEX_B, EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ_B,         3,  "requires EVEX_B, EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ_B,      4,  "requires EVEX_B, EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ_B,  4,  "requires EVEX_B, EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ_B,          3,  "requires EVEX_B, EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ_B,       4,  "requires EVEX_B, EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ_B,   4,  "requires EVEX_B, EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ_B,        3,  "requires EVEX_B, EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ_B,     4,  "requires EVEX_B, EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ_B, 4,  "requires EVEX_B, EVEX_KZ, L2, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_KZ,             1,  "requires an EVEX_KZ prefix")             \
+  ENUM_ENTRY(IC_EVEX_XS_KZ,          2,  "requires EVEX_KZ and the XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_XD_KZ,          2,  "requires EVEX_KZ and the XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_OPSIZE_KZ,      2,  "requires EVEX_KZ and the OpSize prefix") \
+  ENUM_ENTRY(IC_EVEX_W_KZ,           3,  "requires EVEX_KZ and the W prefix")      \
+  ENUM_ENTRY(IC_EVEX_W_XS_KZ,        4,  "requires EVEX_KZ, W, and XS prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_XD_KZ,        4,  "requires EVEX_KZ, W, and XD prefix")     \
+  ENUM_ENTRY(IC_EVEX_W_OPSIZE_KZ,    4,  "requires EVEX_KZ, W, and OpSize")        \
+  ENUM_ENTRY(IC_EVEX_L_KZ,           3,  "requires EVEX_KZ and the L prefix")       \
+  ENUM_ENTRY(IC_EVEX_L_XS_KZ,        4,  "requires EVEX_KZ and the L and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L_XD_KZ,        4,  "requires EVEX_KZ and the L and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L_OPSIZE_KZ,    4,  "requires EVEX_KZ, L, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L_W_KZ,         3,  "requires EVEX_KZ, L and W")               \
+  ENUM_ENTRY(IC_EVEX_L_W_XS_KZ,      4,  "requires EVEX_KZ, L, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_XD_KZ,      4,  "requires EVEX_KZ, L, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L_W_OPSIZE_KZ,  4,  "requires EVEX_KZ, L, W and OpSize")       \
+  ENUM_ENTRY(IC_EVEX_L2_KZ,          3,  "requires EVEX_KZ and the L2 prefix")       \
+  ENUM_ENTRY(IC_EVEX_L2_XS_KZ,       4,  "requires EVEX_KZ and the L2 and XS prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_XD_KZ,       4,  "requires EVEX_KZ and the L2 and XD prefix")\
+  ENUM_ENTRY(IC_EVEX_L2_OPSIZE_KZ,   4,  "requires EVEX_KZ, L2, and OpSize")         \
+  ENUM_ENTRY(IC_EVEX_L2_W_KZ,        3,  "requires EVEX_KZ, L2 and W")               \
+  ENUM_ENTRY(IC_EVEX_L2_W_XS_KZ,     4,  "requires EVEX_KZ, L2, W and XS prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_XD_KZ,     4,  "requires EVEX_KZ, L2, W and XD prefix")    \
+  ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4,  "requires EVEX_KZ, L2, W and OpSize")     
 
 #define ENUM_ENTRY(n, r, d) n,
 typedef enum {
@@ -136,7 +288,10 @@ typedef enum {
   THREEBYTE_38  = 2,
   THREEBYTE_3A  = 3,
   THREEBYTE_A6  = 4,
-  THREEBYTE_A7  = 5
+  THREEBYTE_A7  = 5,
+  XOP8_MAP      = 6,
+  XOP9_MAP      = 7,
+  XOPA_MAP      = 8
 } OpcodeType;
 
 /*
@@ -224,6 +379,7 @@ struct ContextDecision {
   ENUM_ENTRY(ENCODING_REG,    "Register operand in ModR/M byte.")              \
   ENUM_ENTRY(ENCODING_RM,     "R/M operand in ModR/M byte.")                   \
   ENUM_ENTRY(ENCODING_VVVV,   "Register operand in VEX.vvvv byte.")            \
+  ENUM_ENTRY(ENCODING_WRITEMASK, "Register operand in EVEX.aaa byte.")         \
   ENUM_ENTRY(ENCODING_CB,     "1-byte code offset (possible new CS value)")    \
   ENUM_ENTRY(ENCODING_CW,     "2-byte")                                        \
   ENUM_ENTRY(ENCODING_CD,     "4-byte")                                        \
@@ -321,6 +477,9 @@ struct ContextDecision {
   ENUM_ENTRY(TYPE_XMM64,      "8-byte")                                        \
   ENUM_ENTRY(TYPE_XMM128,     "16-byte")                                       \
   ENUM_ENTRY(TYPE_XMM256,     "32-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM512,     "64-byte")                                       \
+  ENUM_ENTRY(TYPE_VK8,        "8-bit")                                         \
+  ENUM_ENTRY(TYPE_VK16,       "16-bit")                                        \
   ENUM_ENTRY(TYPE_XMM0,       "Implicit use of XMM0")                          \
   ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
   ENUM_ENTRY(TYPE_DEBUGREG,   "Debug register operand")                        \
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index e357710..4439311 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -50,7 +50,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   // Try to print any aliases first.
   if (!printAliasInstr(MI, OS))
     printInstruction(MI, OS);
-  
+
   // Next always print the annotation.
   printAnnotation(OS, Annot);
 
@@ -139,8 +139,7 @@ void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
     const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
     int64_t Address;
     if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
-      O << "0x";
-      O.write_hex(Address);
+      O << formatHex((uint64_t)Address);
     }
     else {
       // Otherwise, just print the expression.
@@ -159,10 +158,10 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     O << markup("<imm:")
       << '$' << formatImm((int64_t)Op.getImm())
       << markup(">");
-    
+
     if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
       *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
-    
+
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     O << markup("<imm:")
@@ -177,7 +176,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
   const MCOperand &IndexReg = MI->getOperand(Op+2);
   const MCOperand &DispSpec = MI->getOperand(Op+3);
   const MCOperand &SegReg = MI->getOperand(Op+4);
-  
+
   O << markup("<mem:");
 
   // If this has a segment register, print it.
@@ -185,7 +184,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
     printOperand(MI, Op+4, O);
     O << ':';
   }
-  
+
   if (DispSpec.isImm()) {
     int64_t DispVal = DispSpec.getImm();
     if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
@@ -194,21 +193,21 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
     assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
     O << *DispSpec.getExpr();
   }
-  
+
   if (IndexReg.getReg() || BaseReg.getReg()) {
     O << '(';
     if (BaseReg.getReg())
       printOperand(MI, Op, O);
-    
+
     if (IndexReg.getReg()) {
       O << ',';
       printOperand(MI, Op+2, O);
       unsigned ScaleVal = MI->getOperand(Op+1).getImm();
       if (ScaleVal != 1) {
         O << ','
-	  << markup("<imm:")
+          << markup("<imm:")
           << ScaleVal // never printed in hex.
-	  << markup(">");
+          << markup(">");
       }
     }
     O << ')';
@@ -216,3 +215,19 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
 
   O << markup(">");
 }
+
+void X86ATTInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                       raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+
+  O << markup("<mem:");
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    O << *DispSpec.getExpr();
+  }
+
+  O << markup(">");
+}
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 8e09183..a8fab72 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -42,7 +42,8 @@ public:
   void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
   void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
   void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
-  
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
@@ -65,6 +66,9 @@ public:
   void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
   void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
@@ -80,6 +84,22 @@ public:
   void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
     printMemReference(MI, OpNo, O);
   }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemReference(MI, OpNo, O);
+  }
+
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printMemOffset(MI, OpNo, O);
+  }
 };
   
 }
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 141f4a4..e7e7b15 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -119,7 +119,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
                                         raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isImm())
-    O << Op.getImm();
+    O << formatImm(Op.getImm());
   else {
     assert(Op.isExpr() && "unknown pcrel immediate operand");
     // If a symbolic branch target was added as a constant expression then print
@@ -127,8 +127,7 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
     const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
     int64_t Address;
     if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
-      O << "0x";
-      O.write_hex(Address);
+      O << formatHex((uint64_t)Address);
     }
     else {
       // Otherwise, just print the expression.
@@ -137,18 +136,13 @@ void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
   }
 }
 
-static void PrintRegName(raw_ostream &O, StringRef RegName) {
-  for (unsigned i = 0, e = RegName.size(); i != e; ++i)
-    O << (char)toupper(RegName[i]);
-}
-
 void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                        raw_ostream &O) {
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg()) {
-    PrintRegName(O, getRegisterName(Op.getReg()));
+    printRegName(O, Op.getReg());
   } else if (Op.isImm()) {
-    O << Op.getImm();
+    O << formatImm((int64_t)Op.getImm());
   } else {
     assert(Op.isExpr() && "unknown operand kind in printOperand");
     O << *Op.getExpr();
@@ -200,9 +194,25 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
           DispVal = -DispVal;
         }
       }
-      O << DispVal;
+      O << formatImm(DispVal);
     }
   }
   
   O << ']';
 }
+
+void X86IntelInstPrinter::printMemOffset(const MCInst *MI, unsigned Op,
+                                         raw_ostream &O) {
+  const MCOperand &DispSpec = MI->getOperand(Op);
+
+  O << '[';
+
+  if (DispSpec.isImm()) {
+    O << formatImm(DispSpec.getImm());
+  } else {
+    assert(DispSpec.isExpr() && "non-immediate displacement?");
+    O << *DispSpec.getExpr();
+  }
+
+  O << ']';
+}
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index bb769eb..590bf68 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -39,56 +39,82 @@ public:
   void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
   void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
   void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  
+  void printMemOffset(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+
   void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "OPAQUE PTR ";
+    O << "opaque ptr ";
     printMemReference(MI, OpNo, O);
   }
   
   void printi8mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "BYTE PTR ";
+    O << "byte ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printi16mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "WORD PTR ";
+    O << "word ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printi32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "DWORD PTR ";
+    O << "dword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printi64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "QWORD PTR ";
+    O << "qword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printi128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "XMMWORD PTR ";
+    O << "xmmword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printi256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "YMMWORD PTR ";
+    O << "ymmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printi512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "zmmword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printf32mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "DWORD PTR ";
+    O << "dword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printf64mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "QWORD PTR ";
+    O << "qword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printf80mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "XWORD PTR ";
+    O << "xword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printf128mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "XMMWORD PTR ";
+    O << "xmmword ptr ";
     printMemReference(MI, OpNo, O);
   }
   void printf256mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
-    O << "YMMWORD PTR ";
+    O << "ymmword ptr ";
+    printMemReference(MI, OpNo, O);
+  }
+  void printf512mem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "zmmword ptr ";
     printMemReference(MI, OpNo, O);
   }
+
+  void printMemOffs8(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "byte ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs16(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "word ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs32(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "dword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
+  void printMemOffs64(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    O << "qword ptr ";
+    printMemOffset(MI, OpNo, O);
+  }
 };
   
 }
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 1c240e5..2eb5f25 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -6,6 +6,8 @@ add_llvm_library(LLVMX86Desc
   X86MachObjectWriter.cpp
   X86ELFObjectWriter.cpp
   X86WinCOFFObjectWriter.cpp
+  X86MachORelocationInfo.cpp
+  X86ELFRelocationInfo.cpp
   )
 
 add_dependencies(LLVMX86Desc X86CommonTableGen)
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 598ddee..f8e359b 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -9,6 +9,7 @@
 
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86FixupKinds.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCELFObjectWriter.h"
@@ -19,10 +20,10 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
@@ -67,9 +68,16 @@ public:
 
 class X86AsmBackend : public MCAsmBackend {
   StringRef CPU;
+  bool HasNopl;
 public:
   X86AsmBackend(const Target &T, StringRef _CPU)
-    : MCAsmBackend(), CPU(_CPU) {}
+    : MCAsmBackend(), CPU(_CPU) {
+    HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
+              CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
+              CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
+              CPU != "geode" && CPU != "winchip-c6" && CPU != "winchip2" &&
+              CPU != "c3" && CPU != "c3-2";
+  }
 
   unsigned getNumFixupKinds() const {
     return X86::NumTargetFixupKinds;
@@ -308,8 +316,8 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 
   // This CPU doesnt support long nops. If needed add more.
   // FIXME: Can we get this from the subtarget somehow?
-  if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
-      CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+  // FIXME: We could generated something better than plain 0x90.
+  if (!HasNopl) {
     for (uint64_t i = 0; i < Count; ++i)
       OW->Write8(0x90);
     return true;
@@ -334,6 +342,7 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 /* *** */
 
 namespace {
+
 class ELFX86AsmBackend : public X86AsmBackend {
 public:
   uint8_t OSABI;
@@ -382,35 +391,368 @@ public:
   }
 };
 
+namespace CU {
+
+  /// Compact unwind encoding values.
+  enum CompactUnwindEncodings {
+    /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
+    /// the return address, then [RE]SP is moved to [RE]BP.
+    UNWIND_MODE_BP_FRAME                   = 0x01000000,
+
+    /// A frameless function with a small constant stack size.
+    UNWIND_MODE_STACK_IMMD                 = 0x02000000,
+
+    /// A frameless function with a large constant stack size.
+    UNWIND_MODE_STACK_IND                  = 0x03000000,
+
+    /// No compact unwind encoding is available.
+    UNWIND_MODE_DWARF                      = 0x04000000,
+
+    /// Mask for encoding the frame registers.
+    UNWIND_BP_FRAME_REGISTERS              = 0x00007FFF,
+
+    /// Mask for encoding the frameless registers.
+    UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
+  };
+
+} // end CU namespace
+
 class DarwinX86AsmBackend : public X86AsmBackend {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Number of registers that can be saved in a compact unwind encoding.
+  enum { CU_NUM_SAVED_REGS = 6 };
+
+  mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+  bool Is64Bit;
+
+  unsigned OffsetSize;                   ///< Offset of a "push" instruction.
+  unsigned PushInstrSize;                ///< Size of a "push" instruction.
+  unsigned MoveInstrSize;                ///< Size of a "move" instruction.
+  unsigned StackDivide;                  ///< Amount to adjust stack stize by.
+protected:
+  /// \brief Implementation of algorithm to generate the compact unwind encoding
+  /// for the CFI instructions.
+  uint32_t
+  generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+    if (Instrs.empty()) return 0;
+
+    // Reset the saved registers.
+    unsigned SavedRegIdx = 0;
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+
+    bool HasFP = false;
+
+    // Encode that we are using EBP/RBP as the frame pointer.
+    uint32_t CompactUnwindEncoding = 0;
+
+    unsigned SubtractInstrIdx = Is64Bit ? 3 : 2;
+    unsigned InstrOffset = 0;
+    unsigned StackAdjust = 0;
+    unsigned StackSize = 0;
+    unsigned PrevStackSize = 0;
+    unsigned NumDefCFAOffsets = 0;
+
+    for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+      const MCCFIInstruction &Inst = Instrs[i];
+
+      switch (Inst.getOperation()) {
+      default:
+        // Any other CFI directives indicate a frame that we aren't prepared
+        // to represent via compact unwind, so just bail out.
+        return 0;
+      case MCCFIInstruction::OpDefCfaRegister: {
+        // Defines a frame pointer. E.g.
+        //
+        //     movq %rsp, %rbp
+        //  L0:
+        //     .cfi_def_cfa_register %rbp
+        //
+        HasFP = true;
+        assert(MRI.getLLVMRegNum(Inst.getRegister(), true) ==
+               (Is64Bit ? X86::RBP : X86::EBP) && "Invalid frame pointer!");
+
+        // Reset the counts.
+        memset(SavedRegs, 0, sizeof(SavedRegs));
+        StackAdjust = 0;
+        SavedRegIdx = 0;
+        InstrOffset += MoveInstrSize;
+        break;
+      }
+      case MCCFIInstruction::OpDefCfaOffset: {
+        // Defines a new offset for the CFA. E.g.
+        //
+        //  With frame:
+        //  
+        //     pushq %rbp
+        //  L0:
+        //     .cfi_def_cfa_offset 16
+        //
+        //  Without frame:
+        //
+        //     subq $72, %rsp
+        //  L0:
+        //     .cfi_def_cfa_offset 80
+        //
+        PrevStackSize = StackSize;
+        StackSize = std::abs(Inst.getOffset()) / StackDivide;
+        ++NumDefCFAOffsets;
+        break;
+      }
+      case MCCFIInstruction::OpOffset: {
+        // Defines a "push" of a callee-saved register. E.g.
+        //
+        //     pushq %r15
+        //     pushq %r14
+        //     pushq %rbx
+        //  L0:
+        //     subq $120, %rsp
+        //  L1:
+        //     .cfi_offset %rbx, -40
+        //     .cfi_offset %r14, -32
+        //     .cfi_offset %r15, -24
+        //
+        if (SavedRegIdx == CU_NUM_SAVED_REGS)
+          // If there are too many saved registers, we cannot use a compact
+          // unwind encoding.
+          return CU::UNWIND_MODE_DWARF;
+
+        unsigned Reg = MRI.getLLVMRegNum(Inst.getRegister(), true);
+        SavedRegs[SavedRegIdx++] = Reg;
+        StackAdjust += OffsetSize;
+        InstrOffset += PushInstrSize;
+        break;
+      }
+      }
+    }
+
+    StackAdjust /= StackDivide;
+
+    if (HasFP) {
+      if ((StackAdjust & 0xFF) != StackAdjust)
+        // Offset was too big for a compact unwind encoding.
+        return CU::UNWIND_MODE_DWARF;
+
+      // Get the encoding of the saved registers when we have a frame pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame();
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
+      CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
+      CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
+    } else {
+      // If the amount of the stack allocation is the size of a register, then
+      // we "push" the RAX/EAX register onto the stack instead of adjusting the
+      // stack pointer with a SUB instruction. We don't support the push of the
+      // RAX/EAX register with compact unwind. So we check for that situation
+      // here.
+      if ((NumDefCFAOffsets == SavedRegIdx + 1 &&
+           StackSize - PrevStackSize == 1) ||
+          (Instrs.size() == 1 && NumDefCFAOffsets == 1 && StackSize == 2))
+        return CU::UNWIND_MODE_DWARF;
+
+      SubtractInstrIdx += InstrOffset;
+      ++StackAdjust;
+
+      if ((StackSize & 0xFF) == StackSize) {
+        // Frameless stack with a small stack size.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
+
+        // Encode the stack size.
+        CompactUnwindEncoding |= (StackSize & 0xFF) << 16;
+      } else {
+        if ((StackAdjust & 0x7) != StackAdjust)
+          // The extra stack adjustments are too big for us to handle.
+          return CU::UNWIND_MODE_DWARF;
+
+        // Frameless stack with an offset too large for us to encode compactly.
+        CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
+
+        // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
+        // instruction.
+        CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
+
+        // Encode any extra stack stack adjustments (done via push
+        // instructions).
+        CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
+      }
+
+      // Encode the number of registers saved. (Reverse the list first.)
+      std::reverse(&SavedRegs[0], &SavedRegs[SavedRegIdx]);
+      CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
+
+      // Get the encoding of the saved registers when we don't have a frame
+      // pointer.
+      uint32_t RegEnc = encodeCompactUnwindRegistersWithoutFrame(SavedRegIdx);
+      if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
+
+      // Encode the register encoding.
+      CompactUnwindEncoding |=
+        RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
+    }
+
+    return CompactUnwindEncoding;
+  }
+
+private:
+  /// \brief Get the compact unwind number for a given register. The number
+  /// corresponds to the enum lists in compact_unwind_encoding.h.
+  int getCompactUnwindRegNum(unsigned Reg) const {
+    static const uint16_t CU32BitRegs[7] = {
+      X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+    };
+    static const uint16_t CU64BitRegs[] = {
+      X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+    };
+    const uint16_t *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+    for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+      if (*CURegs == Reg)
+        return Idx;
+
+    return -1;
+  }
+
+  /// \brief Return the registers encoded for a compact encoding with a frame
+  /// pointer.
+  uint32_t encodeCompactUnwindRegistersWithFrame() const {
+    // Encode the registers in the order they were saved --- 3-bits per
+    // register. The list of saved registers is assumed to be in reverse
+    // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+    uint32_t RegEnc = 0;
+    for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+      unsigned Reg = SavedRegs[i];
+      if (Reg == 0) break;
+
+      int CURegNum = getCompactUnwindRegNum(Reg);
+      if (CURegNum == -1) return ~0U;
+
+      // Encode the 3-bit register number in order, skipping over 3-bits for
+      // each register.
+      RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+    }
+
+    assert((RegEnc & 0x3FFFF) == RegEnc &&
+           "Invalid compact register encoding!");
+    return RegEnc;
+  }
+
+  /// \brief Create the permutation encoding used with frameless stacks. It is
+  /// passed the number of registers to be saved and an array of the registers
+  /// saved.
+  uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+    // The saved registers are numbered from 1 to 6. In order to encode the
+    // order in which they were saved, we re-number them according to their
+    // place in the register order. The re-numbering is relative to the last
+    // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+    // that order:
+    //
+    //    Orig  Re-Num
+    //    ----  ------
+    //     6       6
+    //     2       2
+    //     4       3
+    //     5       3
+    //
+    for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
+      int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+      if (CUReg == -1) return ~0U;
+      SavedRegs[i] = CUReg;
+    }
+
+    // Reverse the list.
+    std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+    uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+    for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+      unsigned Countless = 0;
+      for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+        if (SavedRegs[j] < SavedRegs[i])
+          ++Countless;
+
+      RenumRegs[i] = SavedRegs[i] - Countless - 1;
+    }
+
+    // Take the renumbered values and encode them into a 10-bit number.
+    uint32_t permutationEncoding = 0;
+    switch (RegCount) {
+    case 6:
+      permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+                             + 6 * RenumRegs[2] +  2 * RenumRegs[3]
+                             +     RenumRegs[4];
+      break;
+    case 5:
+      permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+                             + 6 * RenumRegs[3] +  2 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 4:
+      permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
+                             + 3 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 3:
+      permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
+                             +     RenumRegs[5];
+      break;
+    case 2:
+      permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
+      break;
+    case 1:
+      permutationEncoding |=       RenumRegs[5];
+      break;
+    }
+
+    assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+           "Invalid compact register encoding!");
+    return permutationEncoding;
+  }
+
 public:
-  DarwinX86AsmBackend(const Target &T, StringRef CPU)
-    : X86AsmBackend(T, CPU) { }
+  DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI, StringRef CPU,
+                      bool Is64Bit)
+    : X86AsmBackend(T, CPU), MRI(MRI), Is64Bit(Is64Bit) {
+    memset(SavedRegs, 0, sizeof(SavedRegs));
+    OffsetSize = Is64Bit ? 8 : 4;
+    MoveInstrSize = Is64Bit ? 3 : 2;
+    StackDivide = Is64Bit ? 8 : 4;
+    PushInstrSize = 1;
+  }
 };
 
 class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
+  bool SupportsCU;
 public:
-  DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
-    : DarwinX86AsmBackend(T, CPU) {}
+  DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU, bool SupportsCU)
+    : DarwinX86AsmBackend(T, MRI, CPU, false), SupportsCU(SupportsCU) {}
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
     return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
-                                     object::mach::CTM_i386,
-                                     object::mach::CSX86_ALL);
+                                     MachO::CPU_TYPE_I386,
+                                     MachO::CPU_SUBTYPE_I386_ALL);
+  }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  virtual uint32_t
+  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+    return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
   }
 };
 
 class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
+  bool SupportsCU;
+  const MachO::CPUSubTypeX86 Subtype;
 public:
-  DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
-    : DarwinX86AsmBackend(T, CPU) {
+  DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                         StringRef CPU, bool SupportsCU,
+                         MachO::CPUSubTypeX86 st)
+    : DarwinX86AsmBackend(T, MRI, CPU, true), SupportsCU(SupportsCU),
+      Subtype(st) {
     HasReliableSymbolDifference = true;
   }
 
   MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
     return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
-                                     object::mach::CTM_x86_64,
-                                     object::mach::CSX86_ALL);
+                                     MachO::CPU_TYPE_X86_64, Subtype);
   }
 
   virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
@@ -445,15 +787,26 @@ public:
       return false;
     }
   }
+
+  /// \brief Generate the compact unwind encoding for the CFI instructions.
+  virtual uint32_t
+  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const {
+    return SupportsCU ? generateCompactUnwindEncodingImpl(Instrs) : 0;
+  }
 };
 
 } // end anonymous namespace
 
-MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
+                                           StringRef CPU) {
   Triple TheTriple(TT);
 
   if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_32AsmBackend(T, CPU);
+    return new DarwinX86_32AsmBackend(T, MRI, CPU,
+                                      TheTriple.isMacOSX() &&
+                                      !TheTriple.isMacOSXVersionLT(10, 7));
 
   if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
     return new WindowsX86AsmBackend(T, false, CPU);
@@ -462,11 +815,21 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, String
   return new ELFX86_32AsmBackend(T, OSABI, CPU);
 }
 
-MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
+                                           const MCRegisterInfo &MRI,
+                                           StringRef TT,
+                                           StringRef CPU) {
   Triple TheTriple(TT);
 
-  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
-    return new DarwinX86_64AsmBackend(T, CPU);
+  if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO) {
+    MachO::CPUSubTypeX86 CS =
+        StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
+            .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
+            .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
+    return new DarwinX86_64AsmBackend(T, MRI, CPU,
+                                      TheTriple.isMacOSX() &&
+                                      !TheTriple.isMacOSXVersionLT(10, 7), CS);
+  }
 
   if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
     return new WindowsX86AsmBackend(T, true, CPU);
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index d8f7278..1ef9814 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -354,6 +354,9 @@ namespace X86II {
     // XOP9 - Prefix to exclude use of imm byte.
     XOP9 = 21 << Op0Shift,
 
+    // XOPA - Prefix to encode 0xA in VEX.MMMM of XOP instructions.
+    XOPA = 22 << Op0Shift,
+
     //===------------------------------------------------------------------===//
     // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
     // They are used to specify GPRs and SSE registers, 64-bit operand size,
@@ -462,20 +465,54 @@ namespace X86II {
     // prefix. Usually used for scalar instructions. Needed by disassembler.
     VEX_LIG     = 1U << 6,
 
+    // TODO: we should combine VEX_L and VEX_LIG together to form a 2-bit field
+    // with following encoding:
+    // - 00 V128
+    // - 01 V256
+    // - 10 V512
+    // - 11 LIG (but, in insn encoding, leave VEX.L and EVEX.L in zeros.
+    // this will save 1 tsflag bit
+
+    // VEX_EVEX - Specifies that this instruction use EVEX form which provides
+    // syntax support up to 32 512-bit register operands and up to 7 16-bit
+    // mask operands as well as source operand data swizzling/memory operand
+    // conversion, eviction hint, and rounding mode.
+    EVEX        = 1U << 7,
+
+    // EVEX_K - Set if this instruction requires masking
+    EVEX_K      = 1U << 8,
+
+    // EVEX_Z - Set if this instruction has EVEX.Z field set.
+    EVEX_Z      = 1U << 9,
+
+    // EVEX_L2 - Set if this instruction has EVEX.L' field set.
+    EVEX_L2     = 1U << 10,
+
+    // EVEX_B - Set if this instruction has EVEX.B field set.
+    EVEX_B      = 1U << 11,
+
+    // EVEX_CD8E - compressed disp8 form, element-size
+    EVEX_CD8EShift = VEXShift + 12,
+    EVEX_CD8EMask = 3,
+
+    // EVEX_CD8V - compressed disp8 form, vector-width
+    EVEX_CD8VShift = EVEX_CD8EShift + 2,
+    EVEX_CD8VMask = 7,
+
     /// Has3DNow0F0FOpcode - This flag indicates that the instruction uses the
     /// wacky 0x0F 0x0F prefix for 3DNow! instructions.  The manual documents
     /// this as having a 0x0F prefix with a 0x0F opcode, and each instruction
     /// storing a classifier in the imm8 field.  To simplify our implementation,
     /// we handle this by storeing the classifier in the opcode field and using
     /// this flag to indicate that the encoder should do the wacky 3DNow! thing.
-    Has3DNow0F0FOpcode = 1U << 7,
+    Has3DNow0F0FOpcode = 1U << 17,
 
     /// MemOp4 - Used to indicate swapping of operand 3 and 4 to be encoded in
     /// ModRM or I8IMM. This is used for FMA4 and XOP instructions.
-    MemOp4 = 1U << 8,
+    MemOp4 = 1U << 18,
 
     /// XOP - Opcode prefix used by XOP instructions.
-    XOP = 1U << 9
+    XOP = 1U << 19
 
   };
 
@@ -533,12 +570,19 @@ namespace X86II {
     unsigned CurOp = 0;
     if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
       ++CurOp;
-    else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
-      assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+    else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+             Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+      // Special case for AVX-512 GATHER with 2 TIED_TO operands
+      // Skip the first 2 operands: dst, mask_wb
+      CurOp += 2;
+    else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+             Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
       // Special case for GATHER with 2 TIED_TO operands
       // Skip the first 2 operands: dst, mask_wb
       CurOp += 2;
-    }
+    else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+      // SCATTER
+      ++CurOp;
     return CurOp;
   }
 
@@ -569,12 +613,15 @@ namespace X86II {
     case X86II::MRMSrcMem: {
       bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
       bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+      bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+      bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
       unsigned FirstMemOp = 1;
       if (HasVEX_4V)
         ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
       if (HasMemOp4)
         ++FirstMemOp;// Skip the register source (which is encoded in I8IMM).
-
+      if (HasEVEX_K)
+        ++FirstMemOp;// Skip the mask register
       // FIXME: Maybe lea should have its own form?  This is a horrible hack.
       //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
       //    Opcode == X86::LEA16r || Opcode == X86::LEA32r)
@@ -611,6 +658,14 @@ namespace X86II {
   /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
   /// higher) register?  e.g. r8, xmm8, xmm13, etc.
   inline bool isX86_64ExtendedReg(unsigned RegNo) {
+    if ((RegNo > X86::XMM7 && RegNo <= X86::XMM15) ||
+        (RegNo > X86::XMM23 && RegNo <= X86::XMM31) ||
+        (RegNo > X86::YMM7 && RegNo <= X86::YMM15) ||
+        (RegNo > X86::YMM23 && RegNo <= X86::YMM31) ||
+        (RegNo > X86::ZMM7 && RegNo <= X86::ZMM15) ||
+        (RegNo > X86::ZMM23 && RegNo <= X86::ZMM31))
+      return true;
+
     switch (RegNo) {
     default: break;
     case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
@@ -621,16 +676,21 @@ namespace X86II {
     case X86::R12W:  case X86::R13W:  case X86::R14W:  case X86::R15W:
     case X86::R8B:   case X86::R9B:   case X86::R10B:  case X86::R11B:
     case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
-    case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
-    case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
-    case X86::YMM8:  case X86::YMM9:  case X86::YMM10: case X86::YMM11:
-    case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
     case X86::CR8:   case X86::CR9:   case X86::CR10:  case X86::CR11:
     case X86::CR12:  case X86::CR13:  case X86::CR14:  case X86::CR15:
         return true;
     }
     return false;
   }
+
+  /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher)
+  /// registers? e.g. zmm21, etc.
+  static inline bool is32ExtendedReg(unsigned RegNo) {
+    return ((RegNo > X86::XMM15 && RegNo <= X86::XMM31) ||
+            (RegNo > X86::YMM15 && RegNo <= X86::YMM31) ||
+            (RegNo > X86::ZMM15 && RegNo <= X86::ZMM31));
+  }
+
   
   inline bool isX86_64NonExtLowByteReg(unsigned reg) {
     return (reg == X86::SPL || reg == X86::BPL ||
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index de80dd8..3ddd865 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -101,7 +101,27 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
     } else {
       switch ((unsigned)Fixup.getKind()) {
       default: llvm_unreachable("invalid fixup kind!");
-      case FK_Data_8: Type = ELF::R_X86_64_64; break;
+      case FK_Data_8:
+        switch (Modifier) {
+        default:
+          llvm_unreachable("Unimplemented");
+        case MCSymbolRefExpr::VK_None:
+          Type = ELF::R_X86_64_64;
+          break;
+        case MCSymbolRefExpr::VK_GOT:
+          Type = ELF::R_X86_64_GOT64;
+          break;
+        case MCSymbolRefExpr::VK_GOTOFF:
+          Type = ELF::R_X86_64_GOTOFF64;
+          break;
+        case MCSymbolRefExpr::VK_TPOFF:
+          Type = ELF::R_X86_64_TPOFF64;
+          break;
+        case MCSymbolRefExpr::VK_DTPOFF:
+          Type = ELF::R_X86_64_DTPOFF64;
+          break;
+        }
+        break;
       case X86::reloc_signed_4byte:
         switch (Modifier) {
         default:
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
new file mode 100644
index 0000000..a3eb4fb
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -0,0 +1,135 @@
+//===-- X86ELFRelocationInfo.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Support/ELF.h"
+
+using namespace llvm;
+using namespace object;
+using namespace ELF;
+
+namespace {
+class X86_64ELFRelocationInfo : public MCRelocationInfo {
+public:
+  X86_64ELFRelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  const MCExpr *createExprForRelocation(RelocationRef Rel) {
+    uint64_t RelType; Rel.getType(RelType);
+    symbol_iterator SymI = Rel.getSymbol();
+
+    StringRef SymName; SymI->getName(SymName);
+    uint64_t  SymAddr; SymI->getAddress(SymAddr);
+    uint64_t  SymSize; SymI->getSize(SymSize);
+    int64_t  Addend;  getELFRelocationAddend(Rel, Addend);
+
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+    // FIXME: check that the value is actually the same.
+    if (Sym->isVariable() == false)
+      Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+
+    const MCExpr *Expr = 0;
+    // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
+    bool hasAddend = false;
+
+    // The AMD64 SysV ABI says:
+    // A: the addend used to compute the value of the relocatable field.
+    // B: the base address at which a shared object has been loaded into memory
+    //    during execution. Generally, a shared object is built with a 0 base
+    //    virtual address, but the execution address will be different.
+    // G: the offset into the global offset table at which the relocation
+    //    entry's symbol will reside during execution.
+    // GOT: the address of the global offset table.
+    // L: the place (section offset or address) of the Procedure Linkage Table
+    //    entry for a symbol.
+    // P: the place (section offset or address) of the storage unit being
+    //    relocated (computed using r_offset).
+    // S: the value of the symbol whose index resides in the relocation entry.
+    // Z: the size of the symbol whose index resides in the relocation entry.
+
+    switch(RelType) {
+    case R_X86_64_NONE:
+    case R_X86_64_COPY:
+      // none
+      break;
+    case R_X86_64_64:
+    case R_X86_64_16:
+    case R_X86_64_8:
+      // S + A
+    case R_X86_64_32:
+    case R_X86_64_32S:
+      // S + A (We don't care about the result not fitting in 32 bits.)
+    case R_X86_64_PC32:
+    case R_X86_64_PC16:
+    case R_X86_64_PC8:
+    case R_X86_64_PC64:
+      // S + A - P (P/pcrel is implicit)
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    case R_X86_64_GOT32:
+    case R_X86_64_GOT64:
+    case R_X86_64_GOTPC32:
+    case R_X86_64_GOTPC64:
+    case R_X86_64_GOTPLT64:
+      // G + A
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Ctx);
+      break;
+    case R_X86_64_PLT32:
+      // L + A - P -> S@PLT + A
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_PLT, Ctx);
+      break;
+    case R_X86_64_GLOB_DAT:
+    case R_X86_64_JUMP_SLOT:
+      // S
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    case R_X86_64_GOTPCREL:
+    case R_X86_64_GOTPCREL64:
+      // G + GOT + A - P -> S@GOTPCREL + A
+      hasAddend = true;
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+      break;
+    case R_X86_64_GOTOFF64:
+      // S + A - GOT
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
+      break;
+    case R_X86_64_PLTOFF64:
+      // L + A - GOT
+      break;
+    case R_X86_64_SIZE32:
+    case R_X86_64_SIZE64:
+      // Z + A
+      Expr = MCConstantExpr::Create(SymSize, Ctx);
+      break;
+    default:
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    }
+    if (Expr && hasAddend && Addend != 0)
+      Expr = MCBinaryExpr::CreateAdd(Expr,
+                                     MCConstantExpr::Create(Addend, Ctx),
+                                     Ctx);
+    return Expr;
+  }
+};
+} // End unnamed namespace
+
+/// createX86ELFRelocationInfo - Construct an X86 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64ELFRelocationInfo(MCContext &Ctx) {
+  // We only handle x86-64 for now.
+  return new X86_64ELFRelocationInfo(Ctx);
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 7815ae9..3861e1c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -59,10 +59,8 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
   // for .S files on other systems.  Perhaps this is because the file system
   // wasn't always case preserving or something.
   CommentString = "##";
-  PCSymbol = ".";
 
   SupportsDebugInformation = true;
-  DwarfUsesInlineInfoSection = true;
   UseDataRegionDirectives = MarkedJTDataRegions;
 
   // Exceptions handling
@@ -92,8 +90,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   TextAlignFillValue = 0x90;
 
   PrivateGlobalPrefix = ".L";
-  WeakRefDirective = "\t.weak\t";
-  PCSymbol = ".";
 
   // Set up DWARF directives
   HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
@@ -139,6 +135,8 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
   AssemblerDialect = AsmWriterFlavor;
 
   TextAlignFillValue = 0x90;
+
+  AllowAtInName = true;
 }
 
 void X86MCAsmInfoGNUCOFF::anchor() { }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index b6b70fd..80979dd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmInfoCOFF.h"
 #include "llvm/MC/MCAsmInfoDarwin.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class Triple;
@@ -35,7 +36,7 @@ namespace llvm {
                                 MCStreamer &Streamer) const;
   };
 
-  class X86ELFMCAsmInfo : public MCAsmInfo {
+  class X86ELFMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
     explicit X86ELFMCAsmInfo(const Triple &Triple);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 016af71..7952607 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -53,7 +53,7 @@ public:
   }
 
   unsigned GetX86RegNum(const MCOperand &MO) const {
-    return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7;
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
   }
 
   // On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
@@ -77,6 +77,14 @@ public:
     return (~SrcRegNum) & 0xf;
   }
 
+  unsigned char getWriteMaskRegisterEncoding(const MCInst &MI,
+                                             unsigned OpNum) const {
+    assert(X86::K0 != MI.getOperand(OpNum).getReg() &&
+           "Invalid mask register as write-mask!");
+    unsigned MaskRegNum = GetX86RegNum(MI.getOperand(OpNum));
+    return MaskRegNum;
+  }
+
   void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
     OS << (char)C;
     ++CurByte;
@@ -152,6 +160,52 @@ static bool isDisp8(int Value) {
   return Value == (signed char)Value;
 }
 
+/// isCDisp8 - Return true if this signed displacement fits in a 8-bit
+/// compressed dispacement field.
+static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+  assert(((TSFlags >> X86II::VEXShift) & X86II::EVEX) &&
+         "Compressed 8-bit displacement is only valid for EVEX inst.");
+
+  unsigned CD8E = (TSFlags >> X86II::EVEX_CD8EShift) & X86II::EVEX_CD8EMask;
+  unsigned CD8V = (TSFlags >> X86II::EVEX_CD8VShift) & X86II::EVEX_CD8VMask;
+
+  if (CD8V == 0 && CD8E == 0) {
+    CValue = Value;
+    return isDisp8(Value);
+  }
+  
+  unsigned MemObjSize = 1U << CD8E;
+  if (CD8V & 4) {
+    // Fixed vector length
+    MemObjSize *= 1U << (CD8V & 0x3);
+  } else {
+    // Modified vector length
+    bool EVEX_b = (TSFlags >> X86II::VEXShift) & X86II::EVEX_B;
+    if (!EVEX_b) {
+      unsigned EVEX_LL = ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) ? 1 : 0;
+      EVEX_LL += ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2) ? 2 : 0;
+      assert(EVEX_LL < 3 && "");
+
+      unsigned NumElems = (1U << (EVEX_LL + 4)) / MemObjSize;
+      NumElems /= 1U << (CD8V & 0x3);
+
+      MemObjSize *= NumElems;
+    }
+  }
+
+  unsigned MemObjMask = MemObjSize - 1;
+  assert((MemObjSize & MemObjMask) == 0 && "Invalid memory object size.");
+
+  if (Value & MemObjMask) // Unaligned offset
+    return false;
+  Value /= MemObjSize;
+  bool Ret = (Value == (signed char)Value);
+
+  if (Ret)
+    CValue = Value;
+  return Ret;
+}
+
 /// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
 /// in an instruction with the specified TSFlags.
 static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
@@ -318,6 +372,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
   const MCOperand &Scale    = MI.getOperand(Op+X86::AddrScaleAmt);
   const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
   unsigned BaseReg = Base.getReg();
+  bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
 
   // Handle %rip relative addressing.
   if (BaseReg == X86::RIP) {    // [disp32+RIP] in X86-64 mode
@@ -378,10 +433,21 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
     }
 
     // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
-    if (Disp.isImm() && isDisp8(Disp.getImm())) {
-      EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
-      EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
-      return;
+    if (Disp.isImm()) {
+      if (!HasEVEX && isDisp8(Disp.getImm())) {
+        EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+        return;
+      }
+      // Try EVEX compressed 8-bit displacement first; if failed, fall back to
+      // 32-bit displacement.
+      int CDisp8 = 0;
+      if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+        EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+        EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+                      CDisp8 - Disp.getImm());
+        return;
+      }
     }
 
     // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
@@ -397,6 +463,8 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
 
   bool ForceDisp32 = false;
   bool ForceDisp8  = false;
+  int CDisp8 = 0;
+  int ImmOffset = 0;
   if (BaseReg == 0) {
     // If there is no base register, we emit the special case SIB byte with
     // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
@@ -412,10 +480,15 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
              BaseRegNo != N86::EBP) {
     // Emit no displacement ModR/M byte
     EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
-  } else if (isDisp8(Disp.getImm())) {
+  } else if (!HasEVEX && isDisp8(Disp.getImm())) {
     // Emit the disp8 encoding.
     EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
     ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+    ImmOffset = CDisp8 - Disp.getImm();
   } else {
     // Emit the normal disp32 encoding.
     EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
@@ -445,7 +518,7 @@ void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
 
   // Do we need to output a displacement?
   if (ForceDisp8)
-    EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+    EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
   else if (ForceDisp32 || Disp.getImm() != 0)
     EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
                   CurByte, OS, Fixups);
@@ -457,6 +530,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
                                            int MemOperand, const MCInst &MI,
                                            const MCInstrDesc &Desc,
                                            raw_ostream &OS) const {
+  bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+  bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
   bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
   bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3;
   bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
@@ -468,6 +543,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   //  0: Same as REX_R=1 (64 bit mode only)
   //
   unsigned char VEX_R = 0x1;
+  unsigned char EVEX_R2 = 0x1;
 
   // VEX_X: equivalent to REX.X, only used when a
   // register is used for index in SIB Byte.
@@ -488,7 +564,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   unsigned char VEX_W = 0;
 
   // XOP: Use XOP prefix byte 0x8f instead of VEX.
-  unsigned char XOP = 0;
+  bool XOP = false;
 
   // VEX_5M (VEX m-mmmmm field):
   //
@@ -498,12 +574,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   //  0b00011: implied 0F 3A leading opcode bytes
   //  0b00100-0b11111: Reserved for future use
   //  0b01000: XOP map select - 08h instructions with imm byte
-  //  0b10001: XOP map select - 09h instructions with no imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
   unsigned char VEX_5M = 0x1;
 
   // VEX_4V (VEX vvvv field): a register specifier
   // (in 1's complement form) or 1111 if unused.
   unsigned char VEX_4V = 0xf;
+  unsigned char EVEX_V2 = 0x1;
 
   // VEX_L (Vector Length):
   //
@@ -511,6 +589,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   //  1: 256-bit vector
   //
   unsigned char VEX_L = 0;
+  unsigned char EVEX_L2 = 0;
 
   // VEX_PP: opcode extension providing equivalent
   // functionality of a SIMD prefix
@@ -522,6 +601,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   //
   unsigned char VEX_PP = 0;
 
+  // EVEX_U
+  unsigned char EVEX_U = 1; // Always '1' so far
+
+  // EVEX_z
+  unsigned char EVEX_z = 0;
+
+  // EVEX_b
+  unsigned char EVEX_b = 0;
+
+  // EVEX_aaa
+  unsigned char EVEX_aaa = 0;
+
   // Encode the operand size opcode prefix as needed.
   if (TSFlags & X86II::OpSize)
     VEX_PP = 0x01;
@@ -530,10 +621,18 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     VEX_W = 1;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
-    XOP = 1;
+    XOP = true;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
     VEX_L = 1;
+  if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_L2))
+    EVEX_L2 = 1;
+
+  if (HasEVEX_K && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_Z))
+    EVEX_z = 1;
+
+  if (HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_B))
+    EVEX_b = 1;
 
   switch (TSFlags & X86II::Op0Mask) {
   default: llvm_unreachable("Invalid prefix!");
@@ -567,11 +666,11 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::XOP9:
     VEX_5M = 0x9;
     break;
-  case X86II::A6:  // Bypass: Not used by VEX
-  case X86II::A7:  // Bypass: Not used by VEX
-  case X86II::TB:  // Bypass: Not used by VEX
-  case 0:
-    break;  // No prefix!
+  case X86II::XOPA:
+    VEX_5M = 0xA;
+    break;
+  case X86II::TB: // VEX_5M/VEX_PP already correct
+    break;
   }
 
 
@@ -580,12 +679,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   unsigned CurOp = 0;
   if (NumOps > 1 && Desc.getOperandConstraint(1, MCOI::TIED_TO) == 0)
     ++CurOp;
-  else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0) {
-    assert(Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1);
+  else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+           Desc.getOperandConstraint(3, MCOI::TIED_TO) == 1)
+    // Special case for AVX-512 GATHER with 2 TIED_TO operands
+    // Skip the first 2 operands: dst, mask_wb
+    CurOp += 2;
+  else if (NumOps > 3 && Desc.getOperandConstraint(2, MCOI::TIED_TO) == 0 &&
+           Desc.getOperandConstraint(NumOps - 1, MCOI::TIED_TO) == 1)
     // Special case for GATHER with 2 TIED_TO operands
     // Skip the first 2 operands: dst, mask_wb
     CurOp += 2;
-  }
+  else if (NumOps > 2 && Desc.getOperandConstraint(NumOps - 2, MCOI::TIED_TO) == 0)
+    // SCATTER
+    ++CurOp;
 
   switch (TSFlags & X86II::FormMask) {
   case X86II::MRMInitReg: llvm_unreachable("FIXME: Remove this!");
@@ -595,18 +701,35 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     //  MemAddr, src1(VEX_4V), src2(ModR/M)
     //  MemAddr, src1(ModR/M), imm8
     //
-    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrBaseReg).getReg()))
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand + 
+                                                 X86::AddrBaseReg).getReg()))
       VEX_B = 0x0;
-    if (X86II::isX86_64ExtendedReg(MI.getOperand(X86::AddrIndexReg).getReg()))
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(MemOperand +
+                                                 X86::AddrIndexReg).getReg()))
       VEX_X = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+                                          X86::AddrIndexReg).getReg()))
+      EVEX_V2 = 0x0;
+
+    CurOp += X86::AddrNumOperands;
 
-    CurOp = X86::AddrNumOperands;
-    if (HasVEX_4V)
-      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
 
     const MCOperand &MO = MI.getOperand(CurOp);
-    if (MO.isReg() && X86II::isX86_64ExtendedReg(MO.getReg()))
-      VEX_R = 0x0;
+    if (MO.isReg()) {
+      if (X86II::isX86_64ExtendedReg(MO.getReg()))
+        VEX_R = 0x0;
+      if (HasEVEX && X86II::is32ExtendedReg(MO.getReg()))
+        EVEX_R2 = 0x0;
+    }
     break;
   }
   case X86II::MRMSrcMem:
@@ -619,11 +742,21 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     //  FMA4:
     //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
     //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
-    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp++).getReg()))
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_R = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      EVEX_R2 = 0x0;
+    CurOp++;
+
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
 
-    if (HasVEX_4V)
+    if (HasVEX_4V) {
       VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
 
     if (X86II::isX86_64ExtendedReg(
                MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -631,6 +764,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     if (X86II::isX86_64ExtendedReg(
                MI.getOperand(MemOperand+X86::AddrIndexReg).getReg()))
       VEX_X = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(MemOperand +
+                                          X86::AddrIndexReg).getReg()))
+      EVEX_V2 = 0x0;
 
     if (HasVEX_4VOp3)
       // Instruction format for 4VOp3:
@@ -647,8 +783,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     // MRM[0-9]m instructions forms:
     //  MemAddr
     //  src1(VEX_4V), MemAddr
-    if (HasVEX_4V)
-      VEX_4V = getVEXRegisterEncoding(MI, 0);
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
+
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
 
     if (X86II::isX86_64ExtendedReg(
                MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -669,16 +812,27 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
     if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_R = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      EVEX_R2 = 0x0;
     CurOp++;
 
-    if (HasVEX_4V)
-      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
 
     if (HasMemOp4) // Skip second register source (encoded in I8IMM)
       CurOp++;
 
     if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_B = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_X = 0x0;
     CurOp++;
     if (HasVEX_4VOp3)
       VEX_4V = getVEXRegisterEncoding(MI, CurOp);
@@ -690,13 +844,24 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
     //  dst(ModR/M), src1(VEX_4V), src2(ModR/M)
     if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_B = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_X = 0x0;
     CurOp++;
 
-    if (HasVEX_4V)
-      VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+        EVEX_V2 = 0x0;
+      CurOp++;
+    }
 
     if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_R = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      EVEX_R2 = 0x0;
     break;
   case X86II::MRM0r: case X86II::MRM1r:
   case X86II::MRM2r: case X86II::MRM3r:
@@ -704,9 +869,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   case X86II::MRM6r: case X86II::MRM7r:
     // MRM0r-MRM7r instructions forms:
     //  dst(VEX_4V), src(ModR/M), imm8
-    VEX_4V = getVEXRegisterEncoding(MI, 0);
-    if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+    if (HasVEX_4V) {
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+          EVEX_V2 = 0x0;
+      CurOp++;
+    }    
+    if (HasEVEX_K)
+      EVEX_aaa = getWriteMaskRegisterEncoding(MI, CurOp++);
+
+    if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
       VEX_B = 0x0;
+    if (HasEVEX && X86II::is32ExtendedReg(MI.getOperand(CurOp).getReg()))
+      VEX_X = 0x0;
     break;
   default: // RawFrm
     break;
@@ -715,29 +890,58 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
   // Emit segment override opcode prefix as needed.
   EmitSegmentOverridePrefix(TSFlags, CurByte, MemOperand, MI, OS);
 
-  // VEX opcode prefix can have 2 or 3 bytes
-  //
-  //  3 bytes:
-  //    +-----+ +--------------+ +-------------------+
-  //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
-  //    +-----+ +--------------+ +-------------------+
-  //  2 bytes:
-  //    +-----+ +-------------------+
-  //    | C5h | | R | vvvv | L | pp |
-  //    +-----+ +-------------------+
-  //
-  unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
+  if (!HasEVEX) {
+    // VEX opcode prefix can have 2 or 3 bytes
+    //
+    //  3 bytes:
+    //    +-----+ +--------------+ +-------------------+
+    //    | C4h | | RXB | m-mmmm | | W | vvvv | L | pp |
+    //    +-----+ +--------------+ +-------------------+
+    //  2 bytes:
+    //    +-----+ +-------------------+
+    //    | C5h | | R | vvvv | L | pp |
+    //    +-----+ +-------------------+
+    //
+    unsigned char LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
 
-  if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
-    EmitByte(0xC5, CurByte, OS);
-    EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
-    return;
-  }
+    if (VEX_B && VEX_X && !VEX_W && !XOP && (VEX_5M == 1)) { // 2 byte VEX prefix
+      EmitByte(0xC5, CurByte, OS);
+      EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+      return;
+    }
 
-  // 3 byte VEX prefix
-  EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
-  EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
-  EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+    // 3 byte VEX prefix
+    EmitByte(XOP ? 0x8F : 0xC4, CurByte, OS);
+    EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+    EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+  } else {
+    // EVEX opcode prefix can have 4 bytes
+    //
+    // +-----+ +--------------+ +-------------------+ +------------------------+
+    // | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
+    // +-----+ +--------------+ +-------------------+ +------------------------+
+    assert((VEX_5M & 0x3) == VEX_5M
+           && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+    VEX_5M &= 0x3;
+
+    EmitByte(0x62, CurByte, OS);
+    EmitByte((VEX_R   << 7) |
+             (VEX_X   << 6) |
+             (VEX_B   << 5) |
+             (EVEX_R2 << 4) |
+             VEX_5M, CurByte, OS);
+    EmitByte((VEX_W   << 7) |
+             (VEX_4V  << 3) |
+             (EVEX_U  << 2) |
+             VEX_PP, CurByte, OS);
+    EmitByte((EVEX_z  << 7) |
+             (EVEX_L2 << 6) |
+             (VEX_L   << 5) |
+             (EVEX_b  << 4) |
+             (EVEX_V2 << 3) |
+             EVEX_aaa, CurByte, OS);
+  }
 }
 
 /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
@@ -1007,6 +1211,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
   const unsigned MemOp4_I8IMMOperand = 2;
 
+  // It uses the EVEX.aaa field?
+  bool HasEVEX = (TSFlags >> X86II::VEXShift) & X86II::EVEX;
+  bool HasEVEX_K = HasEVEX && ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+
   // Determine where the memory operand starts, if present.
   int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode);
   if (MemoryOperand != -1) MemoryOperand += CurOp;
@@ -1057,6 +1265,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     EmitByte(BaseOpcode, CurByte, OS);
     SrcRegNum = CurOp + 1;
 
+    if (HasEVEX_K) // Skip writemask
+      SrcRegNum++;
+
     if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
       ++SrcRegNum;
 
@@ -1069,6 +1280,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     EmitByte(BaseOpcode, CurByte, OS);
     SrcRegNum = CurOp + X86::AddrNumOperands;
 
+    if (HasEVEX_K) // Skip writemask
+      SrcRegNum++;
+
     if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
       ++SrcRegNum;
 
@@ -1082,6 +1296,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
     EmitByte(BaseOpcode, CurByte, OS);
     SrcRegNum = CurOp + 1;
 
+    if (HasEVEX_K) // Skip writemask
+      SrcRegNum++;
+
     if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
       ++SrcRegNum;
 
@@ -1100,6 +1317,12 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   case X86II::MRMSrcMem: {
     int AddrOperands = X86::AddrNumOperands;
     unsigned FirstMemOp = CurOp+1;
+
+    if (HasEVEX_K) { // Skip writemask
+      ++AddrOperands;
+      ++FirstMemOp;
+    }
+
     if (HasVEX_4V) {
       ++AddrOperands;
       ++FirstMemOp;  // Skip the register source (which is encoded in VEX_VVVV).
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 5e84530..1cbdafd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -263,7 +263,7 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
   return X;
 }
 
-static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
+static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   Triple TheTriple(TT);
   bool is64Bit = TheTriple.getArch() == Triple::x86_64;
 
@@ -290,14 +290,16 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
   int stackGrowth = is64Bit ? -8 : -4;
 
   // Initial state of the frame pointer is esp+stackGrowth.
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
-  MAI->addInitialFrameState(0, Dst, Src);
+  unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+      0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+  MAI->addInitialFrameState(Inst);
 
   // Add return address to move list
-  MachineLocation CSDst(is64Bit ? X86::RSP : X86::ESP, stackGrowth);
-  MachineLocation CSSrc(is64Bit ? X86::RIP : X86::EIP);
-  MAI->addInitialFrameState(0, CSDst, CSSrc);
+  unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
+  MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
+      0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+  MAI->addInitialFrameState(Inst2);
 
   return MAI;
 }
@@ -366,7 +368,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
   if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
     return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
 
-  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+  return createELFStreamer(Ctx, 0, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
 }
 
 static MCInstPrinter *createX86MCInstPrinter(const Target &T,
@@ -382,6 +384,17 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T,
   return 0;
 }
 
+static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT,
+                                                   MCContext &Ctx) {
+  Triple TheTriple(TT);
+  if (TheTriple.isEnvironmentMachO() && TheTriple.getArch() == Triple::x86_64)
+    return createX86_64MachORelocationInfo(Ctx);
+  else if (TheTriple.isOSBinFormatELF())
+    return createX86_64ELFRelocationInfo(Ctx);
+  // Default to the stock relocation info.
+  return llvm::createMCRelocationInfo(TT, Ctx);
+}
+
 static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
   return new MCInstrAnalysis(Info);
 }
@@ -439,4 +452,10 @@ extern "C" void LLVMInitializeX86TargetMC() {
                                         createX86MCInstPrinter);
   TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,
                                         createX86MCInstPrinter);
+
+  // Register the MC relocation info.
+  TargetRegistry::RegisterMCRelocationInfo(TheX86_32Target,
+                                           createX86MCRelocationInfo);
+  TargetRegistry::RegisterMCRelocationInfo(TheX86_64Target,
+                                           createX86MCRelocationInfo);
 }
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 981aa1a..41ae435 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -25,6 +25,7 @@ class MCInstrInfo;
 class MCObjectWriter;
 class MCRegisterInfo;
 class MCSubtargetInfo;
+class MCRelocationInfo;
 class Target;
 class StringRef;
 class raw_ostream;
@@ -78,8 +79,10 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
                                       const MCSubtargetInfo &STI,
                                       MCContext &Ctx);
 
-MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
-MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+                                     StringRef TT, StringRef CPU);
 
 /// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
 MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
@@ -94,6 +97,12 @@ MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
                                          uint16_t EMachine);
 /// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
 MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
+
+/// createX86_64MachORelocationInfo - Construct X86-64 Mach-O relocation info.
+MCRelocationInfo *createX86_64MachORelocationInfo(MCContext &Ctx);
+
+/// createX86_64ELFORelocationInfo - Construct X86-64 ELF relocation info.
+MCRelocationInfo *createX86_64ELFRelocationInfo(MCContext &Ctx);
 } // End llvm namespace
 
 
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
new file mode 100644
index 0000000..209b1d0e
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -0,0 +1,116 @@
+//===-- X86MachORelocationInfo.cpp ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCRelocationInfo.h"
+#include "llvm/Object/MachO.h"
+
+using namespace llvm;
+using namespace object;
+using namespace MachO;
+
+namespace {
+class X86_64MachORelocationInfo : public MCRelocationInfo {
+public:
+  X86_64MachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
+
+  const MCExpr *createExprForRelocation(RelocationRef Rel) {
+    const MachOObjectFile *Obj = cast<MachOObjectFile>(Rel.getObjectFile());
+
+    uint64_t RelType; Rel.getType(RelType);
+    symbol_iterator SymI = Rel.getSymbol();
+
+    StringRef SymName; SymI->getName(SymName);
+    uint64_t  SymAddr; SymI->getAddress(SymAddr);
+
+    any_relocation_info RE = Obj->getRelocation(Rel.getRawDataRefImpl());
+    bool isPCRel = Obj->getAnyRelocationPCRel(RE);
+
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(SymName);
+    // FIXME: check that the value is actually the same.
+    if (Sym->isVariable() == false)
+      Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
+    const MCExpr *Expr = 0;
+
+    switch(RelType) {
+    case X86_64_RELOC_TLV:
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
+      break;
+    case X86_64_RELOC_SIGNED_4:
+      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+                                     MCConstantExpr::Create(4, Ctx),
+                                     Ctx);
+      break;
+    case X86_64_RELOC_SIGNED_2:
+      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+                                     MCConstantExpr::Create(2, Ctx),
+                                     Ctx);
+      break;
+    case X86_64_RELOC_SIGNED_1:
+      Expr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(Sym, Ctx),
+                                     MCConstantExpr::Create(1, Ctx),
+                                     Ctx);
+      break;
+    case X86_64_RELOC_GOT_LOAD:
+      Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, Ctx);
+      break;
+    case X86_64_RELOC_GOT:
+      Expr = MCSymbolRefExpr::Create(Sym, isPCRel ?
+                                     MCSymbolRefExpr::VK_GOTPCREL :
+                                     MCSymbolRefExpr::VK_GOT,
+                                     Ctx);
+      break;
+    case X86_64_RELOC_SUBTRACTOR:
+      {
+        RelocationRef RelNext;
+        Obj->getRelocationNext(Rel.getRawDataRefImpl(), RelNext);
+        any_relocation_info RENext = Obj->getRelocation(RelNext.getRawDataRefImpl());
+
+        // X86_64_SUBTRACTOR must be followed by a relocation of type
+        // X86_64_RELOC_UNSIGNED.
+        // NOTE: Scattered relocations don't exist on x86_64.
+        unsigned RType = Obj->getAnyRelocationType(RENext);
+        if (RType != X86_64_RELOC_UNSIGNED)
+          report_fatal_error("Expected X86_64_RELOC_UNSIGNED after "
+                             "X86_64_RELOC_SUBTRACTOR.");
+
+        const MCExpr *LHS = MCSymbolRefExpr::Create(Sym, Ctx);
+
+        symbol_iterator RSymI = RelNext.getSymbol();
+        uint64_t RSymAddr;
+        RSymI->getAddress(RSymAddr);
+        StringRef RSymName;
+        RSymI->getName(RSymName);
+
+        MCSymbol *RSym = Ctx.GetOrCreateSymbol(RSymName);
+        if (RSym->isVariable() == false)
+          RSym->setVariableValue(MCConstantExpr::Create(RSymAddr, Ctx));
+
+        const MCExpr *RHS = MCSymbolRefExpr::Create(RSym, Ctx);
+
+        Expr = MCBinaryExpr::CreateSub(LHS, RHS, Ctx);
+        break;
+      }
+    default:
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+      break;
+    }
+    return Expr;
+  }
+};
+} // End unnamed namespace
+
+/// createX86_64MachORelocationInfo - Construct an X86-64 Mach-O RelocationInfo.
+MCRelocationInfo *llvm::createX86_64MachORelocationInfo(MCContext &Ctx) {
+  return new X86_64MachORelocationInfo(Ctx);
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 64f005c..eb7c0b1 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -16,12 +16,11 @@
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Object/MachOFormat.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MachO.h"
 
 using namespace llvm;
-using namespace llvm::object;
 
 namespace {
 class X86MachObjectWriter : public MCMachObjectTargetWriter {
@@ -132,7 +131,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
 
   if (Target.isAbsolute()) { // constant
     // SymbolNum of 0 indicates the absolute section.
-    Type = macho::RIT_X86_64_Unsigned;
+    Type = MachO::X86_64_RELOC_UNSIGNED;
     Index = 0;
 
     // FIXME: I believe this is broken, I don't think the linker can understand
@@ -141,26 +140,31 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     // is to use an absolute symbol (which we don't support yet).
     if (IsPCRel) {
       IsExtern = 1;
-      Type = macho::RIT_X86_64_Branch;
+      Type = MachO::X86_64_RELOC_BRANCH;
     }
   } else if (Target.getSymB()) { // A - B + constant
     const MCSymbol *A = &Target.getSymA()->getSymbol();
+    if (A->isTemporary())
+      A = &A->AliasedSymbol();
     MCSymbolData &A_SD = Asm.getSymbolData(*A);
     const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
 
     const MCSymbol *B = &Target.getSymB()->getSymbol();
+    if (B->isTemporary())
+      B = &B->AliasedSymbol();
     MCSymbolData &B_SD = Asm.getSymbolData(*B);
     const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
 
     // Neither symbol can be modified.
     if (Target.getSymA()->getKind() != MCSymbolRefExpr::VK_None ||
         Target.getSymB()->getKind() != MCSymbolRefExpr::VK_None)
-      report_fatal_error("unsupported relocation of modified symbol");
+      report_fatal_error("unsupported relocation of modified symbol", false);
 
     // We don't support PCrel relocations of differences. Darwin 'as' doesn't
     // implement most of these correctly.
     if (IsPCRel)
-      report_fatal_error("unsupported pc-relative relocation of difference");
+      report_fatal_error("unsupported pc-relative relocation of difference",
+                         false);
 
     // The support for the situation where one or both of the symbols would
     // require a local relocation is handled just like if the symbols were
@@ -173,7 +177,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     // single SIGNED relocation); reject it for now.  Except the case where both
     // symbols don't have a base, equal but both NULL.
     if (A_Base == B_Base && A_Base)
-      report_fatal_error("unsupported relocation with identical base");
+      report_fatal_error("unsupported relocation with identical base", false);
+
+    // A subtraction expression where both symbols are undefined is a
+    // non-relocatable expression.
+    if (A->isUndefined() && B->isUndefined())
+      report_fatal_error("unsupported relocation with subtraction expression",
+                         false);
 
     Value += Writer->getSymbolAddress(&A_SD, Layout) -
       (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
@@ -188,15 +198,15 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
       Index = A_SD.getFragment()->getParent()->getOrdinal() + 1;
       IsExtern = 0;
     }
-    Type = macho::RIT_X86_64_Unsigned;
-
-    macho::RelocationEntry MRE;
-    MRE.Word0 = FixupOffset;
-    MRE.Word1 = ((Index     <<  0) |
-                 (IsPCRel   << 24) |
-                 (Log2Size  << 25) |
-                 (IsExtern  << 27) |
-                 (Type      << 28));
+    Type = MachO::X86_64_RELOC_UNSIGNED;
+
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = FixupOffset;
+    MRE.r_word1 = ((Index     <<  0) |
+                   (IsPCRel   << 24) |
+                   (Log2Size  << 25) |
+                   (IsExtern  << 27) |
+                   (Type      << 28));
     Writer->addRelocation(Fragment->getParent(), MRE);
 
     if (B_Base) {
@@ -207,7 +217,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
       Index = B_SD.getFragment()->getParent()->getOrdinal() + 1;
       IsExtern = 0;
     }
-    Type = macho::RIT_X86_64_Subtractor;
+    Type = MachO::X86_64_RELOC_SUBTRACTOR;
   } else {
     const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
     MCSymbolData &SD = Asm.getSymbolData(*Symbol);
@@ -252,11 +262,11 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
         return;
       } else {
         report_fatal_error("unsupported relocation of variable '" +
-                           Symbol->getName() + "'");
+                           Symbol->getName() + "'", false);
       }
     } else {
       report_fatal_error("unsupported relocation of undefined symbol '" +
-                         Symbol->getName() + "'");
+                         Symbol->getName() + "'", false);
     }
 
     MCSymbolRefExpr::VariantKind Modifier = Target.getSymA()->getKind();
@@ -267,15 +277,16 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
           // rewrite the movq to an leaq at link time if the symbol ends up in
           // the same linkage unit.
           if (unsigned(Fixup.getKind()) == X86::reloc_riprel_4byte_movq_load)
-            Type = macho::RIT_X86_64_GOTLoad;
+            Type = MachO::X86_64_RELOC_GOT_LOAD;
           else
-            Type = macho::RIT_X86_64_GOT;
+            Type = MachO::X86_64_RELOC_GOT;
         }  else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
-          Type = macho::RIT_X86_64_TLV;
+          Type = MachO::X86_64_RELOC_TLV;
         }  else if (Modifier != MCSymbolRefExpr::VK_None) {
-          report_fatal_error("unsupported symbol modifier in relocation");
+          report_fatal_error("unsupported symbol modifier in relocation",
+                             false);
         } else {
-          Type = macho::RIT_X86_64_Signed;
+          Type = MachO::X86_64_RELOC_SIGNED;
 
           // The Darwin x86_64 relocation format has a problem where it cannot
           // encode an address (L<foo> + <constant>) which is outside the atom
@@ -292,34 +303,40 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
           // (the additional bias), but instead appear to just look at the final
           // offset.
           switch (-(Target.getConstant() + (1LL << Log2Size))) {
-          case 1: Type = macho::RIT_X86_64_Signed1; break;
-          case 2: Type = macho::RIT_X86_64_Signed2; break;
-          case 4: Type = macho::RIT_X86_64_Signed4; break;
+          case 1: Type = MachO::X86_64_RELOC_SIGNED_1; break;
+          case 2: Type = MachO::X86_64_RELOC_SIGNED_2; break;
+          case 4: Type = MachO::X86_64_RELOC_SIGNED_4; break;
           }
         }
       } else {
         if (Modifier != MCSymbolRefExpr::VK_None)
           report_fatal_error("unsupported symbol modifier in branch "
-                             "relocation");
+                             "relocation", false);
 
-        Type = macho::RIT_X86_64_Branch;
+        Type = MachO::X86_64_RELOC_BRANCH;
       }
     } else {
       if (Modifier == MCSymbolRefExpr::VK_GOT) {
-        Type = macho::RIT_X86_64_GOT;
+        Type = MachO::X86_64_RELOC_GOT;
       } else if (Modifier == MCSymbolRefExpr::VK_GOTPCREL) {
         // GOTPCREL is allowed as a modifier on non-PCrel instructions, in which
         // case all we do is set the PCrel bit in the relocation entry; this is
         // used with exception handling, for example. The source is required to
         // include any necessary offset directly.
-        Type = macho::RIT_X86_64_GOT;
+        Type = MachO::X86_64_RELOC_GOT;
         IsPCRel = 1;
       } else if (Modifier == MCSymbolRefExpr::VK_TLVP) {
-        report_fatal_error("TLVP symbol modifier should have been rip-rel");
+        report_fatal_error("TLVP symbol modifier should have been rip-rel",
+                           false);
       } else if (Modifier != MCSymbolRefExpr::VK_None)
-        report_fatal_error("unsupported symbol modifier in relocation");
-      else
-        Type = macho::RIT_X86_64_Unsigned;
+        report_fatal_error("unsupported symbol modifier in relocation", false);
+      else {
+        Type = MachO::X86_64_RELOC_UNSIGNED;
+        unsigned Kind = Fixup.getKind();
+        if (Kind == X86::reloc_signed_4byte)
+          report_fatal_error("32-bit absolute addressing is not supported in "
+                             "64-bit mode", false);
+      }
     }
   }
 
@@ -327,13 +344,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
   FixedValue = Value;
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -347,7 +364,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
                                                     uint64_t &FixedValue) {
   uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
   unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, Fixup.getKind());
-  unsigned Type = macho::RIT_Vanilla;
+  unsigned Type = MachO::GENERIC_RELOC_VANILLA;
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
@@ -355,7 +372,8 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
 
   if (!A_SD->getFragment())
     report_fatal_error("symbol '" + A->getName() +
-                       "' can not be undefined in a subtraction expression");
+                       "' can not be undefined in a subtraction expression",
+                       false);
 
   uint32_t Value = Writer->getSymbolAddress(A_SD, Layout);
   uint64_t SecAddr = Writer->getSectionAddress(A_SD->getFragment()->getParent());
@@ -367,22 +385,23 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
 
     if (!B_SD->getFragment())
       report_fatal_error("symbol '" + B->getSymbol().getName() +
-                         "' can not be undefined in a subtraction expression");
+                         "' can not be undefined in a subtraction expression",
+                         false);
 
     // Select the appropriate difference relocation type.
     //
     // Note that there is no longer any semantic difference between these two
     // relocation types from the linkers point of view, this is done solely for
     // pedantic compatibility with 'as'.
-    Type = A_SD->isExternal() ? (unsigned)macho::RIT_Difference :
-      (unsigned)macho::RIT_Generic_LocalDifference;
+    Type = A_SD->isExternal() ? (unsigned)MachO::GENERIC_RELOC_SECTDIFF :
+      (unsigned)MachO::GENERIC_RELOC_LOCAL_SECTDIFF;
     Value2 = Writer->getSymbolAddress(B_SD, Layout);
     FixedValue -= Writer->getSectionAddress(B_SD->getFragment()->getParent());
   }
 
   // Relocations are written out in reverse order, so the PAIR comes first.
-  if (Type == macho::RIT_Difference ||
-      Type == macho::RIT_Generic_LocalDifference) {
+  if (Type == MachO::GENERIC_RELOC_SECTDIFF ||
+      Type == MachO::GENERIC_RELOC_LOCAL_SECTDIFF) {
     // If the offset is too large to fit in a scattered relocation,
     // we're hosed. It's an unfortunate limitation of the MachO format.
     if (FixupOffset > 0xffffff) {
@@ -396,13 +415,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
       llvm_unreachable("fatal error returned?!");
     }
 
-    macho::RelocationEntry MRE;
-    MRE.Word0 = ((0         <<  0) |
-                 (macho::RIT_Pair  << 24) |
-                 (Log2Size  << 28) |
-                 (IsPCRel   << 30) |
-                 macho::RF_Scattered);
-    MRE.Word1 = Value2;
+    MachO::any_relocation_info MRE;
+    MRE.r_word0 = ((0                         <<  0) | // r_address
+                   (MachO::GENERIC_RELOC_PAIR << 24) | // r_type
+                   (Log2Size                  << 28) |
+                   (IsPCRel                   << 30) |
+                   MachO::R_SCATTERED);
+    MRE.r_word1 = Value2;
     Writer->addRelocation(Fragment->getParent(), MRE);
   } else {
     // If the offset is more than 24-bits, it won't fit in a scattered
@@ -416,13 +435,13 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
       return false;
   }
 
-  macho::RelocationEntry MRE;
-  MRE.Word0 = ((FixupOffset <<  0) |
-               (Type        << 24) |
-               (Log2Size    << 28) |
-               (IsPCRel     << 30) |
-               macho::RF_Scattered);
-  MRE.Word1 = Value;
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = ((FixupOffset <<  0) |
+                 (Type        << 24) |
+                 (Log2Size    << 28) |
+                 (IsPCRel     << 30) |
+                 MachO::R_SCATTERED);
+  MRE.r_word1 = Value;
   Writer->addRelocation(Fragment->getParent(), MRE);
   return true;
 }
@@ -464,13 +483,13 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
   }
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = Value;
-  MRE.Word1 = ((Index                  <<  0) |
-               (IsPCRel                << 24) |
-               (Log2Size               << 25) |
-               (1                      << 27) | // Extern
-               (macho::RIT_Generic_TLV << 28)); // Type
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = Value;
+  MRE.r_word1 = ((Index                    <<  0) |
+                 (IsPCRel                  << 24) |
+                 (Log2Size                 << 25) |
+                 (1                        << 27) | // r_extern
+                 (MachO::GENERIC_RELOC_TLV << 28)); // r_type
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
@@ -530,7 +549,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
     //
     // FIXME: Currently, these are never generated (see code below). I cannot
     // find a case where they are actually emitted.
-    Type = macho::RIT_Vanilla;
+    Type = MachO::GENERIC_RELOC_VANILLA;
   } else {
     // Resolve constant variables.
     if (SD->getSymbol().isVariable()) {
@@ -561,17 +580,17 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
     if (IsPCRel)
       FixedValue -= Writer->getSectionAddress(Fragment->getParent());
 
-    Type = macho::RIT_Vanilla;
+    Type = MachO::GENERIC_RELOC_VANILLA;
   }
 
   // struct relocation_info (8 bytes)
-  macho::RelocationEntry MRE;
-  MRE.Word0 = FixupOffset;
-  MRE.Word1 = ((Index     <<  0) |
-               (IsPCRel   << 24) |
-               (Log2Size  << 25) |
-               (IsExtern  << 27) |
-               (Type      << 28));
+  MachO::any_relocation_info MRE;
+  MRE.r_word0 = FixupOffset;
+  MRE.r_word1 = ((Index     <<  0) |
+                 (IsPCRel   << 24) |
+                 (Log2Size  << 25) |
+                 (IsExtern  << 27) |
+                 (Type      << 28));
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index ed64a32..6da4142 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -27,7 +27,7 @@ namespace {
 
   public:
     X86WinCOFFObjectWriter(bool Is64Bit_);
-    ~X86WinCOFFObjectWriter();
+    virtual ~X86WinCOFFObjectWriter();
 
     virtual unsigned getRelocType(const MCValue &Target,
                                   const MCFixup &Fixup,
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 496b704..adfa7fa 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -517,37 +517,6 @@ to <2 x i64> ops being so bad.
 
 //===---------------------------------------------------------------------===//
 
-'select' on vectors and scalars could be a whole lot better.  We currently 
-lower them to conditional branches.  On x86-64 for example, we compile this:
-
-double test(double a, double b, double c, double d) { return a<b ? c : d; }
-
-to:
-
-_test:
-	ucomisd	%xmm0, %xmm1
-	ja	LBB1_2	# entry
-LBB1_1:	# entry
-	movapd	%xmm3, %xmm2
-LBB1_2:	# entry
-	movapd	%xmm2, %xmm0
-	ret
-
-instead of:
-
-_test:
-	cmpltsd	%xmm1, %xmm0
-	andpd	%xmm0, %xmm2
-	andnpd	%xmm3, %xmm0
-	orpd	%xmm2, %xmm0
-	ret
-
-For unpredictable branches, the later is much more efficient.  This should
-just be a matter of having scalar sse map to SELECT_CC and custom expanding
-or iseling it.
-
-//===---------------------------------------------------------------------===//
-
 LLVM currently generates stack realignment code, when it is not necessary
 needed. The problem is that we need to know about stack alignment too early,
 before RA runs.
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index c865500..65c5552 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -50,10 +50,10 @@ def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
 def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
                                       "Enable SSSE3 instructions",
                                       [FeatureSSE3]>;
-def FeatureSSE41   : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+def FeatureSSE41   : SubtargetFeature<"sse4.1", "X86SSELevel", "SSE41",
                                       "Enable SSE 4.1 instructions",
                                       [FeatureSSSE3]>;
-def FeatureSSE42   : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+def FeatureSSE42   : SubtargetFeature<"sse4.2", "X86SSELevel", "SSE42",
                                       "Enable SSE 4.2 instructions",
                                       [FeatureSSE41]>;
 def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
@@ -68,7 +68,7 @@ def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
 def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
                                       "Support 64-bit instructions",
                                       [FeatureCMOV]>;
-def FeatureCMPXCHG16B : SubtargetFeature<"cmpxchg16b", "HasCmpxchg16b", "true",
+def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
                                       "64-bit with cmpxchg16b",
                                       [Feature64Bit]>;
 def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
@@ -86,6 +86,19 @@ def FeatureAVX     : SubtargetFeature<"avx", "X86SSELevel", "AVX",
 def FeatureAVX2    : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
                                       "Enable AVX2 instructions",
                                       [FeatureAVX]>;
+def FeatureAVX512   : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
+                                      "Enable AVX-512 instructions",
+                                      [FeatureAVX2]>;
+def FeatureERI      : SubtargetFeature<"avx512er", "HasERI", "true",
+                      "Enable AVX-512 Exponential and Reciprocal Instructions",
+                                      [FeatureAVX512]>;
+def FeatureCDI      : SubtargetFeature<"avx512cd", "HasCDI", "true",
+                      "Enable AVX-512 Conflict Detection Instructions",
+                                      [FeatureAVX512]>;
+def FeaturePFI      : SubtargetFeature<"avx512pf", "HasPFI", "true",
+                      "Enable AVX-512 PreFetch Instructions",
+                                      [FeatureAVX512]>;
+
 def FeaturePCLMUL  : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
                          "Enable packed carry-less multiplication instructions",
                                [FeatureSSE2]>;
@@ -104,12 +117,15 @@ def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
 def FeatureAES     : SubtargetFeature<"aes", "HasAES", "true",
                                       "Enable AES instructions",
                                       [FeatureSSE2]>;
+def FeatureTBM     : SubtargetFeature<"tbm", "HasTBM", "true",
+                                      "Enable TBM instructions">;
 def FeatureMOVBE   : SubtargetFeature<"movbe", "HasMOVBE", "true",
                                       "Support MOVBE instruction">;
-def FeatureRDRAND  : SubtargetFeature<"rdrand", "HasRDRAND", "true",
+def FeatureRDRAND  : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
                                       "Support RDRAND instruction">;
 def FeatureF16C    : SubtargetFeature<"f16c", "HasF16C", "true",
-                       "Support 16-bit floating point conversion instructions">;
+                       "Support 16-bit floating point conversion instructions",
+                       [FeatureAVX]>;
 def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
                                        "Support FS/GS Base instructions">;
 def FeatureLZCNT   : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
@@ -124,6 +140,9 @@ def FeatureHLE     : SubtargetFeature<"hle", "HasHLE", "true",
                                       "Support HLE">;
 def FeatureADX     : SubtargetFeature<"adx", "HasADX", "true",
                                       "Support ADX instructions">;
+def FeatureSHA     : SubtargetFeature<"sha", "HasSHA", "true",
+                                      "Enable SHA instructions",
+                                      [FeatureSSE2]>;
 def FeaturePRFCHW  : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
                                       "Support PRFCHW instructions">;
 def FeatureRDSEED  : SubtargetFeature<"rdseed", "HasRDSEED", "true",
@@ -150,6 +169,8 @@ include "X86Schedule.td"
 
 def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
                     "Intel Atom processors">;
+def ProcIntelSLM  : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
+                    "Intel Silvermont processors">;
 
 class Proc<string Name, list<SubtargetFeature> Features>
  : ProcessorModel<Name, GenericModel, Features>;
@@ -193,6 +214,14 @@ def : ProcessorModel<"atom", AtomModel,
                       FeatureLEAUsesAG,
                       FeaturePadShortFunctions]>;
 
+// Atom Silvermont.
+def : ProcessorModel<"slm",  SLMModel, [ProcIntelSLM,
+                               FeatureSSE42, FeatureCMPXCHG16B,
+                               FeatureMOVBE, FeaturePOPCNT,
+                               FeaturePCLMUL, FeatureAES,
+                               FeatureCallRegIndirect,
+                               FeaturePRFCHW,
+                               FeatureSlowBTMem]>;
 // "Arrandale" along with corei3 and corei5
 def : ProcessorModel<"corei7", SandyBridgeModel,
                      [FeatureSSE42, FeatureCMPXCHG16B, FeatureSlowBTMem,
@@ -227,6 +256,15 @@ def : ProcessorModel<"core-avx2", HaswellModel,
                       FeatureBMI, FeatureBMI2, FeatureFMA, FeatureRTM,
                       FeatureHLE]>;
 
+// KNL
+// FIXME: define KNL model
+def : ProcessorModel<"knl", HaswellModel,
+                     [FeatureAVX512, FeatureERI, FeatureCDI, FeaturePFI,
+                      FeatureCMPXCHG16B, FeatureFastUAMem, FeaturePOPCNT,
+                      FeatureAES, FeaturePCLMUL, FeatureRDRAND, FeatureF16C,
+                      FeatureFSGSBase, FeatureMOVBE, FeatureLZCNT, FeatureBMI,
+                      FeatureBMI2, FeatureFMA, FeatureRTM, FeatureHLE]>;
+
 def : Proc<"k6",              [FeatureMMX]>;
 def : Proc<"k6-2",            [Feature3DNow]>;
 def : Proc<"k6-3",            [Feature3DNow]>;
@@ -254,21 +292,30 @@ def : Proc<"amdfam10",        [FeatureSSE4A,
                                FeaturePOPCNT, FeatureSlowBTMem]>;
 // Bobcat
 def : Proc<"btver1",          [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
-                               FeatureLZCNT, FeaturePOPCNT]>;
+                               FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
 // Jaguar
 def : Proc<"btver2",          [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
-                               FeatureAES, FeaturePCLMUL, FeatureBMI,
-                               FeatureF16C, FeatureMOVBE, FeatureLZCNT,
-                               FeaturePOPCNT]>;
+                               FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
+                               FeatureBMI, FeatureF16C, FeatureMOVBE,
+                               FeatureLZCNT, FeaturePOPCNT]>;
 // Bulldozer
 def : Proc<"bdver1",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
-                               FeatureAES, FeaturePCLMUL,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
                                FeatureLZCNT, FeaturePOPCNT]>;
 // Piledriver
 def : Proc<"bdver2",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
-                               FeatureAES, FeaturePCLMUL,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
+                               FeatureF16C, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureBMI,  FeatureTBM,
+                               FeatureFMA]>;
+
+// Steamroller
+def : Proc<"bdver3",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
+                               FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
                                FeatureF16C, FeatureLZCNT,
-                               FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+                               FeaturePOPCNT, FeatureBMI,  FeatureTBM,
+                               FeatureFMA, FeatureFSGSBase]>;
+
 def : Proc<"geode",           [Feature3DNowA]>;
 
 def : Proc<"winchip-c6",      [FeatureMMX]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index 6b228b0..1258441 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -96,7 +96,7 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
              MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
       GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
     else
-      GVSym = Mang->getSymbol(GV);
+      GVSym = getSymbol(GV);
 
     // Handle dllimport linkage.
     if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
@@ -109,21 +109,21 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
         MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
       if (StubSym.getPointer() == 0)
         StubSym = MachineModuleInfoImpl::
-          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+          StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
     } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
       MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
       MachineModuleInfoImpl::StubValueTy &StubSym =
         MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
       if (StubSym.getPointer() == 0)
         StubSym = MachineModuleInfoImpl::
-          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+          StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
     } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
       MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
       MachineModuleInfoImpl::StubValueTy &StubSym =
         MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
       if (StubSym.getPointer() == 0)
         StubSym = MachineModuleInfoImpl::
-          StubValueTy(Mang->getSymbol(GV), !GV->hasInternalLinkage());
+          StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
     }
 
     // If the name begins with a dollar-sign, enclose it in parens.  We do this
@@ -333,21 +333,21 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
   const MachineOperand &IndexReg = MI->getOperand(Op+2);
   const MachineOperand &DispSpec = MI->getOperand(Op+3);
   const MachineOperand &SegReg   = MI->getOperand(Op+4);
-  
+
   // If this has a segment register, print it.
   if (SegReg.getReg()) {
     printOperand(MI, Op+4, O, Modifier, AsmVariant);
     O << ':';
   }
-  
+
   O << '[';
-  
+
   bool NeedPlus = false;
   if (BaseReg.getReg()) {
     printOperand(MI, Op, O, Modifier, AsmVariant);
     NeedPlus = true;
   }
-  
+
   if (IndexReg.getReg()) {
     if (NeedPlus) O << " + ";
     if (ScaleVal != 1)
@@ -394,6 +394,7 @@ bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
     Reg = getX86SubSuperRegister(Reg, MVT::i32);
     break;
   case 'q': // Print DImode register
+    // FIXME: gcc will actually print e instead of r for 32-bit.
     Reg = getX86SubSuperRegister(Reg, MVT::i64);
     break;
   }
@@ -518,6 +519,27 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
   if (Subtarget->isTargetEnvMacho())
     OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
+
+  if (Subtarget->isTargetCOFF()) {
+    // Emit an absolute @feat.00 symbol.  This appears to be some kind of
+    // compiler features bitfield read by link.exe.
+    if (!Subtarget->is64Bit()) {
+      MCSymbol *S = MMI->getContext().GetOrCreateSymbol(StringRef("@feat.00"));
+      OutStreamer.BeginCOFFSymbolDef(S);
+      OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+      OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_NULL);
+      OutStreamer.EndCOFFSymbolDef();
+      // According to the PE-COFF spec, the LSB of this value marks the object
+      // for "registered SEH".  This means that all SEH handler entry points
+      // must be registered in .sxdata.  Use of any unregistered handlers will
+      // cause the process to terminate immediately.  LLVM does not know how to
+      // register any SEH handlers, so its object files should be safe.
+      S->setAbsolute();
+      OutStreamer.EmitSymbolAttribute(S, MCSA_Global);
+      OutStreamer.EmitAssignment(
+          S, MCConstantExpr::Create(int64_t(1), MMI->getContext()));
+    }
+  }
 }
 
 
@@ -606,6 +628,8 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
       OutStreamer.AddBlankLine();
     }
 
+    SM.serializeToStackMapSection();
+
     // Funny Darwin hack: This flag tells the linker that no global symbols
     // contain code that falls through to other global symbols (e.g. the obvious
     // implementation of multiple entry points).  If this doesn't occur, the
@@ -645,12 +669,12 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
 
     for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
       if (I->hasDLLExportLinkage())
-        DLLExportedFns.push_back(Mang->getSymbol(I));
+        DLLExportedFns.push_back(getSymbol(I));
 
     for (Module::const_global_iterator I = M.global_begin(),
            E = M.global_end(); I != E; ++I)
       if (I->hasDLLExportLinkage())
-        DLLExportedGlobals.push_back(Mang->getSymbol(I));
+        DLLExportedGlobals.push_back(getSymbol(I));
 
     // Output linker support code for dllexported globals on windows.
     if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
@@ -702,48 +726,6 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
   }
 }
 
-MachineLocation
-X86AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
-  MachineLocation Location;
-  assert (MI->getNumOperands() == 7 && "Invalid no. of machine operands!");
-  // Frame address.  Currently handles register +- offset only.
-
-  if (MI->getOperand(0).isReg() && MI->getOperand(3).isImm())
-    Location.set(MI->getOperand(0).getReg(), MI->getOperand(3).getImm());
-  else {
-    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
-  }
-  return Location;
-}
-
-void X86AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
-                                           raw_ostream &O) {
-  // Only the target-dependent form of DBG_VALUE should get here.
-  // Referencing the offset and metadata as NOps-2 and NOps-1 is
-  // probably portable to other targets; frame pointer location is not.
-  unsigned NOps = MI->getNumOperands();
-  assert(NOps==7);
-  O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-  // cast away const; DIetc do not take const operands for some reason.
-  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
-  if (V.getContext().isSubprogram())
-    O << DISubprogram(V.getContext()).getDisplayName() << ":";
-  O << V.getName();
-  O << " <- ";
-  // Frame address.  Currently handles register +- offset only.
-  O << '[';
-  if (MI->getOperand(0).isReg() && MI->getOperand(0).getReg())
-    printOperand(MI, 0, O);
-  else
-    O << "undef";
-  O << '+'; printOperand(MI, 3, O);
-  O << ']';
-  O << "+";
-  printOperand(MI, NOps-2, O);
-}
-
-
-
 //===----------------------------------------------------------------------===//
 // Target Registry Stuff
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index bc7496b..24a768b 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -16,6 +16,7 @@
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
@@ -24,9 +25,21 @@ class MCStreamer;
 
 class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   const X86Subtarget *Subtarget;
+  StackMaps SM;
+
+  // Parses operands of PATCHPOINT and STACKMAP to produce stack map Location
+  // structures. Returns a result location and an iterator to the operand
+  // immediately following the operands consumed.
+  //
+  // This method is implemented in X86MCInstLower.cpp.
+  static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+    stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
+                          MachineInstr::const_mop_iterator MOE,
+                          const TargetMachine &TM);
+
  public:
   explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) {
+    : AsmPrinter(TM, Streamer), SM(*this, stackmapOperandParser) {
     Subtarget = &TM.getSubtarget<X86Subtarget>();
   }
 
@@ -67,11 +80,6 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
                               unsigned AsmVariant = 1);
 
   virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
-
-  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-
-  virtual MachineLocation
-    getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
new file mode 100644
index 0000000..e76f9fd
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.h
@@ -0,0 +1,35 @@
+//=== X86CallingConv.h - X86 Custom Calling Convention Routines -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom routines for the X86 Calling Convention that
+// aren't done by tablegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86CALLINGCONV_H
+#define X86CALLINGCONV_H
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/IR/CallingConv.h"
+
+namespace llvm {
+
+inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
+                                CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
+                                CCState &) {
+  llvm_unreachable("The AnyReg calling convention is only supported by the " \
+                   "stackmap and patchpoint intrinsics.");
+  // gracefully fallback to X86 C calling convention on Release builds.
+  return false;
+}
+
+} // End llvm namespace
+
+#endif
+
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 9eafbd5..a78b5c0 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -49,6 +49,12 @@ def RetCC_X86Common : CallingConv<[
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
             CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
 
+  // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3
+  // can only be used by ABI non-compliant code. This vector type is only
+  // supported while using the AVX-512 target feature.
+  CCIfType<[v16i32, v8i64, v16f32, v8f64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
   // MMX vector types are always returned in MM0. If the target doesn't have
   // MM0, it doesn't support these vector types.
   CCIfType<[x86mmx], CCAssignToReg<[MM0]>>,
@@ -99,6 +105,10 @@ def RetCC_Intel_OCL_BI : CallingConv<[
   CCIfType<[v8f32, v4f64, v8i32, v4i64],
             CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
 
+  // 512-bit FP vectors
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
+
   // i32, i64 in the standard way
   CCDelegateTo<RetCC_X86Common>
 ]>;
@@ -141,6 +151,26 @@ def RetCC_X86_64_HiPE : CallingConv<[
   CCIfType<[i64], CCAssignToReg<[R15, RBP, RAX, RDX]>>
 ]>;
 
+// X86-64 WebKit_JS return-value convention.
+def RetCC_X86_64_WebKit_JS : CallingConv<[
+  // Promote all types to i64
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // Return: RAX
+  CCIfType<[i64], CCAssignToReg<[RAX]>>
+]>;
+
+// X86-64 AnyReg return-value convention. No explicit register is specified for
+// the return-value. The register allocator is allowed and expected to choose
+// any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def RetCC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
 // This is the root return-value convention for the X86-32 backend.
 def RetCC_X86_32 : CallingConv<[
   // If FastCC, use RetCC_X86_32_Fast.
@@ -156,6 +186,15 @@ def RetCC_X86_32 : CallingConv<[
 def RetCC_X86_64 : CallingConv<[
   // HiPE uses RetCC_X86_64_HiPE
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_64_HiPE>>,
+
+  // Handle JavaScript calls.
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<RetCC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_X86_64_AnyReg>>,
+
+  // Handle explicit CC selection
+  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
+  CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
+
   // Mingw64 and native Win64 use Win64 CC
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
 
@@ -208,10 +247,15 @@ def CC_X86_64_C : CallingConv<[
   // fixed arguments to vararg functions are supposed to be passed in
   // registers.  Actually modeling that would be a lot of work, though.
   CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-                          CCIfSubtarget<"hasAVX()",
+                          CCIfSubtarget<"hasFp256()",
                           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3,
                                          YMM4, YMM5, YMM6, YMM7]>>>>,
 
+  // The first 8 512-bit vector arguments are passed in ZMM registers.
+  CCIfNotVarArg<CCIfType<[v16i32, v8i64, v16f32, v8f64],
+            CCIfSubtarget<"hasAVX512()",
+            CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7]>>>>,
+
   // Integer/FP values get stored in stack slots that are 8 bytes in size and
   // 8-byte aligned if there are no more registers to hold them.
   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
@@ -225,7 +269,11 @@ def CC_X86_64_C : CallingConv<[
 
   // 256-bit vectors get 32-byte stack slots that are 32-byte aligned.
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToStack<32, 32>>
+           CCAssignToStack<32, 32>>,
+
+  // 512-bit vectors get 64-byte stack slots that are 64-byte aligned.
+  CCIfType<[v16i32, v8i64, v16f32, v8f64],
+           CCAssignToStack<64, 64>>
 ]>;
 
 // Calling convention used on Win64
@@ -246,16 +294,19 @@ def CC_X86_Win64_C : CallingConv<[
   // 256 bit vectors are passed by pointer
   CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
 
+  // 512 bit vectors are passed by pointer
+  CCIfType<[v16i32, v16f32, v8f64, v8i64], CCPassIndirect<i64>>,
+
   // The first 4 MMX vector arguments are passed in GPRs.
   CCIfType<[x86mmx], CCBitConvertToType<i64>>,
 
   // The first 4 integer arguments are passed in integer registers.
   CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
                                           [XMM0, XMM1, XMM2, XMM3]>>,
-  
+
   // Do not pass the sret argument in RCX, the Win64 thiscall calling
-  // convention requires "this" to be passed in RCX.                                        
-  CCIfCC<"CallingConv::X86_ThisCall", 
+  // convention requires "this" to be passed in RCX.
+  CCIfCC<"CallingConv::X86_ThisCall",
     CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[RDX , R8  , R9  ],
                                                      [XMM1, XMM2, XMM3]>>>>,
 
@@ -302,6 +353,25 @@ def CC_X86_64_HiPE : CallingConv<[
   CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
 ]>;
 
+def CC_X86_64_WebKit_JS : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer/FP values are always stored in stack slots that are 8 bytes in size
+  // and 8-byte aligned.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+// No explicit register is specified for the AnyReg calling convention. The
+// register allocator may assign the arguments to any free register.
+//
+// This calling convention is currently only supported by the stackmap and
+// patchpoint intrinsics. All other uses will result in an assert on Debug
+// builds. On Release builds we fallback to the X86 C calling convention.
+def CC_X86_64_AnyReg : CallingConv<[
+  CCCustom<"CC_X86_AnyReg_Error">
+]>;
+
 //===----------------------------------------------------------------------===//
 // X86 C Calling Convention
 //===----------------------------------------------------------------------===//
@@ -327,7 +397,7 @@ def CC_X86_32_Common : CallingConv<[
   // Integer/Float values get stored in stack slots that are 4 bytes in
   // size and 4-byte aligned.
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
-  
+
   // Doubles get 8-byte slots that are 4-byte aligned.
   CCIfType<[f64], CCAssignToStack<8, 4>>,
 
@@ -340,7 +410,7 @@ def CC_X86_32_Common : CallingConv<[
 
   // The first 4 AVX 256-bit vector arguments are passed in YMM registers.
   CCIfNotVarArg<CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-                CCIfSubtarget<"hasAVX()",
+                CCIfSubtarget<"hasFp256()",
                 CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>>>,
 
   // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
@@ -464,6 +534,10 @@ def CC_Intel_OCL_BI : CallingConv<[
   CCIfType<[v8f32, v4f64, v8i32, v4i64],
            CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
 
+  // The 512-bit vector arguments are passed in ZMM registers.
+  CCIfType<[v16f32, v8f64, v16i32, v8i64],
+           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3]>>,
+
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfSubtarget<"is64Bit()",       CCDelegateTo<CC_X86_64_C>>,
   CCDelegateTo<CC_X86_32_C>
@@ -489,6 +563,10 @@ def CC_X86_32 : CallingConv<[
 def CC_X86_64 : CallingConv<[
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_64_GHC>>,
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>,
+  CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>,
+  CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>,
+  CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>,
+  CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
 
   // Mingw64 and native Win64 use Win64 CC
   CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
@@ -525,9 +603,13 @@ def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
 
 // Standard C + YMM6-15
 def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
-                                                  R13, R14, R15, 
+                                                  R13, R14, R15,
                                                   (sequence "YMM%u", 6, 15))>;
 
+def CSR_Win64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI,
+                                                     R12, R13, R14, R15,
+                                                     (sequence "ZMM%u", 6, 21),
+                                                     K4, K5, K6, K7)>;
 //Standard C + XMM 8-15
 def CSR_64_Intel_OCL_BI       : CalleeSavedRegs<(add CSR_64,
                                                  (sequence "XMM%u", 8, 15))>;
@@ -535,3 +617,7 @@ def CSR_64_Intel_OCL_BI       : CalleeSavedRegs<(add CSR_64,
 //Standard C + YMM 8-15
 def CSR_64_Intel_OCL_BI_AVX    : CalleeSavedRegs<(add CSR_64,
                                                   (sequence "YMM%u", 8, 15))>;
+
+def CSR_64_Intel_OCL_BI_AVX512    : CalleeSavedRegs<(add CSR_64,
+                                                  (sequence "ZMM%u", 16, 31),
+                                                  K4, K5, K6, K7)>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 8fea6ed..14385ed 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -53,13 +53,8 @@ namespace {
     static char ID;
     explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
       : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
-      MCE(mce), PICBaseOffset(0), Is64BitMode(false),
-      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
-    Emitter(X86TargetMachine &tm, CodeEmitter &mce,
-            const X86InstrInfo &ii, const DataLayout &td, bool is64)
-      : MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
-      MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
-      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+        MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
 
     bool runOnMachineFunction(MachineFunction &MF);
 
@@ -845,7 +840,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
   unsigned char VEX_W = 0;
 
   // XOP: Use XOP prefix byte 0x8f instead of VEX.
-  unsigned char XOP = 0;
+  bool XOP = false;
 
   // VEX_5M (VEX m-mmmmm field):
   //
@@ -855,7 +850,8 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
   //  0b00011: implied 0F 3A leading opcode bytes
   //  0b00100-0b11111: Reserved for future use
   //  0b01000: XOP map select - 08h instructions with imm byte
-  //  0b10001: XOP map select - 09h instructions with no imm byte
+  //  0b01001: XOP map select - 09h instructions with no imm byte
+  //  0b01010: XOP map select - 0Ah instructions with imm dword
   unsigned char VEX_5M = 0x1;
 
   // VEX_4V (VEX vvvv field): a register specifier
@@ -887,7 +883,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
     VEX_W = 1;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::XOP)
-    XOP = 1;
+    XOP = true;
 
   if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L)
     VEX_L = 1;
@@ -924,11 +920,11 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
     case X86II::XOP9:
       VEX_5M = 0x9;
       break;
-    case X86II::A6:  // Bypass: Not used by VEX
-    case X86II::A7:  // Bypass: Not used by VEX
-    case X86II::TB:  // Bypass: Not used by VEX
-    case 0:
-      break;  // No prefix!
+    case X86II::XOPA:
+      VEX_5M = 0xA;
+      break;
+    case X86II::TB: // VEX_5M/VEX_PP already correct
+      break;
   }
 
 
@@ -987,11 +983,14 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
       //  FMA4:
       //  dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM)
       //  dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M),
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
         VEX_R = 0x0;
+      CurOp++;
 
-      if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, 1);
+      if (HasVEX_4V) {
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+        CurOp++;
+      }
 
       if (X86II::isX86_64ExtendedReg(
                           MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -1001,7 +1000,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
         VEX_X = 0x0;
 
       if (HasVEX_4VOp3)
-        VEX_4V = getVEXRegisterEncoding(MI, X86::AddrNumOperands+1);
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp+X86::AddrNumOperands);
       break;
     case X86II::MRM0m: case X86II::MRM1m:
     case X86II::MRM2m: case X86II::MRM3m:
@@ -1011,7 +1010,7 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
       //  MemAddr
       //  src1(VEX_4V), MemAddr
       if (HasVEX_4V)
-        VEX_4V = getVEXRegisterEncoding(MI, 0);
+        VEX_4V = getVEXRegisterEncoding(MI, CurOp++);
 
       if (X86II::isX86_64ExtendedReg(
                           MI.getOperand(MemOperand+X86::AddrBaseReg).getReg()))
@@ -1064,8 +1063,10 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
     case X86II::MRM6r: case X86II::MRM7r:
       // MRM0r-MRM7r instructions forms:
       //  dst(VEX_4V), src(ModR/M), imm8
-      VEX_4V = getVEXRegisterEncoding(MI, 0);
-      if (X86II::isX86_64ExtendedReg(MI.getOperand(1).getReg()))
+      VEX_4V = getVEXRegisterEncoding(MI, CurOp);
+      CurOp++;
+
+      if (X86II::isX86_64ExtendedReg(MI.getOperand(CurOp).getReg()))
         VEX_B = 0x0;
       break;
     default: // RawFrm
@@ -1270,7 +1271,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
 
     unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
       : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
-    if (Opcode == X86::MOV64ri64i32)
+    if (Opcode == X86::MOV32ri64)
       rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
     // This should not occur on Darwin for relocatable objects.
     if (Opcode == X86::MOV64ri)
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index cf44bd0..97f96ab 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -14,6 +14,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86.h"
+#include "X86CallingConv.h"
 #include "X86ISelLowering.h"
 #include "X86InstrBuilder.h"
 #include "X86RegisterInfo.h"
@@ -45,10 +46,6 @@ class X86FastISel : public FastISel {
   /// make the right decision when generating code for different targets.
   const X86Subtarget *Subtarget;
 
-  /// RegInfo - X86 register info.
-  ///
-  const X86RegisterInfo *RegInfo;
-
   /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
   /// floating point ops.
   /// When SSE is available, use it for f32 operations.
@@ -63,7 +60,6 @@ public:
     Subtarget = &TM.getSubtarget<X86Subtarget>();
     X86ScalarSSEf64 = Subtarget->hasSSE2();
     X86ScalarSSEf32 = Subtarget->hasSSE1();
-    RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
   }
 
   virtual bool TargetSelectInstruction(const Instruction *I);
@@ -84,8 +80,10 @@ private:
 
   bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
 
-  bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM);
-  bool X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM);
+  bool X86FastEmitStore(EVT VT, const Value *Val, const X86AddressMode &AM,
+                        bool Aligned = false);
+  bool X86FastEmitStore(EVT VT, unsigned ValReg, const X86AddressMode &AM,
+                        bool Aligned = false);
 
   bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
                          unsigned &ResultReg);
@@ -128,6 +126,8 @@ private:
     return static_cast<const X86TargetMachine *>(&TM);
   }
 
+  bool handleConstantAddresses(const Value *V, X86AddressMode &AM);
+
   unsigned TargetMaterializeConstant(const Constant *C);
 
   unsigned TargetMaterializeAlloca(const AllocaInst *C);
@@ -238,7 +238,8 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
 /// and a displacement offset, or a GlobalAddress,
 /// i.e. V. Return true if it is possible.
 bool
-X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
+X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg,
+                              const X86AddressMode &AM, bool Aligned) {
   // Get opcode and regclass of the output for the given store instruction.
   unsigned Opc = 0;
   switch (VT.getSimpleVT().SimpleTy) {
@@ -248,8 +249,8 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
     // Mask out all but lowest bit.
     unsigned AndResult = createResultReg(&X86::GR8RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-            TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
-    Val = AndResult;
+            TII.get(X86::AND8ri), AndResult).addReg(ValReg).addImm(1);
+    ValReg = AndResult;
   }
   // FALLTHROUGH, handling i1 as i8.
   case MVT::i8:  Opc = X86::MOV8mr;  break;
@@ -265,26 +266,35 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) {
           (Subtarget->hasAVX() ? X86::VMOVSDmr : X86::MOVSDmr) : X86::ST_Fp64m;
     break;
   case MVT::v4f32:
-    Opc = X86::MOVAPSmr;
+    if (Aligned)
+      Opc = Subtarget->hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
+    else
+      Opc = Subtarget->hasAVX() ? X86::VMOVUPSmr : X86::MOVUPSmr;
     break;
   case MVT::v2f64:
-    Opc = X86::MOVAPDmr;
+    if (Aligned)
+      Opc = Subtarget->hasAVX() ? X86::VMOVAPDmr : X86::MOVAPDmr;
+    else
+      Opc = Subtarget->hasAVX() ? X86::VMOVUPDmr : X86::MOVUPDmr;
     break;
   case MVT::v4i32:
   case MVT::v2i64:
   case MVT::v8i16:
   case MVT::v16i8:
-    Opc = X86::MOVDQAmr;
+    if (Aligned)
+      Opc = Subtarget->hasAVX() ? X86::VMOVDQAmr : X86::MOVDQAmr;
+    else
+      Opc = Subtarget->hasAVX() ? X86::VMOVDQUmr : X86::MOVDQUmr;
     break;
   }
 
   addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
-                         DL, TII.get(Opc)), AM).addReg(Val);
+                         DL, TII.get(Opc)), AM).addReg(ValReg);
   return true;
 }
 
 bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
-                                   const X86AddressMode &AM) {
+                                   const X86AddressMode &AM, bool Aligned) {
   // Handle 'null' like i32/i64 0.
   if (isa<ConstantPointerNull>(Val))
     Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
@@ -319,7 +329,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
   if (ValReg == 0)
     return false;
 
-  return X86FastEmitStore(VT, ValReg, AM);
+  return X86FastEmitStore(VT, ValReg, AM, Aligned);
 }
 
 /// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
@@ -337,9 +347,126 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
   return true;
 }
 
+bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
+  // Handle constant address.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // Can't handle TLS yet.
+    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
+    // it works...).
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      if (const GlobalVariable *GVar =
+            dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
+        if (GVar->isThreadLocal())
+          return false;
+
+    // RIP-relative addresses can't have additional register operands, so if
+    // we've already folded stuff into the addressing mode, just force the
+    // global value into its own register, which we can use as the basereg.
+    if (!Subtarget->isPICStyleRIPRel() ||
+        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
+      // Okay, we've committed to selecting this global. Set up the address.
+      AM.GV = GV;
+
+      // Allow the subtarget to classify the global.
+      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+      // If this reference is relative to the pic base, set it now.
+      if (isGlobalRelativeToPICBase(GVFlags)) {
+        // FIXME: How do we know Base.Reg is free??
+        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
+      }
+
+      // Unless the ABI requires an extra load, return a direct reference to
+      // the global.
+      if (!isGlobalStubReference(GVFlags)) {
+        if (Subtarget->isPICStyleRIPRel()) {
+          // Use rip-relative addressing if we can.  Above we verified that the
+          // base and index registers are unused.
+          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+          AM.Base.Reg = X86::RIP;
+        }
+        AM.GVOpFlags = GVFlags;
+        return true;
+      }
+
+      // Ok, we need to do a load from a stub.  If we've already loaded from
+      // this stub, reuse the loaded pointer, otherwise emit the load now.
+      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+      unsigned LoadReg;
+      if (I != LocalValueMap.end() && I->second != 0) {
+        LoadReg = I->second;
+      } else {
+        // Issue load from stub.
+        unsigned Opc = 0;
+        const TargetRegisterClass *RC = NULL;
+        X86AddressMode StubAM;
+        StubAM.Base.Reg = AM.Base.Reg;
+        StubAM.GV = GV;
+        StubAM.GVOpFlags = GVFlags;
+
+        // Prepare for inserting code in the local-value area.
+        SavePoint SaveInsertPt = enterLocalValueArea();
+
+        if (TLI.getPointerTy() == MVT::i64) {
+          Opc = X86::MOV64rm;
+          RC  = &X86::GR64RegClass;
+
+          if (Subtarget->isPICStyleRIPRel())
+            StubAM.Base.Reg = X86::RIP;
+        } else {
+          Opc = X86::MOV32rm;
+          RC  = &X86::GR32RegClass;
+        }
+
+        LoadReg = createResultReg(RC);
+        MachineInstrBuilder LoadMI =
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
+        addFullAddress(LoadMI, StubAM);
+
+        // Ok, back to normal mode.
+        leaveLocalValueArea(SaveInsertPt);
+
+        // Prevent loading GV stub multiple times in same MBB.
+        LocalValueMap[V] = LoadReg;
+      }
+
+      // Now construct the final address. Note that the Disp, Scale,
+      // and Index values may already be set here.
+      AM.Base.Reg = LoadReg;
+      AM.GV = 0;
+      return true;
+    }
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
 /// X86SelectAddress - Attempt to fill in an address from the given value.
 ///
 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
+  SmallVector<const Value *, 32> GEPs;
+redo_gep:
   const User *U = NULL;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(V)) {
@@ -434,13 +561,8 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
           Disp += CI->getSExtValue() * S;
           break;
         }
-        if (isa<AddOperator>(Op) &&
-            (!isa<Instruction>(Op) ||
-             FuncInfo.MBBMap[cast<Instruction>(Op)->getParent()]
-               == FuncInfo.MBB) &&
-            isa<ConstantInt>(cast<AddOperator>(Op)->getOperand(1))) {
-          // An add (in the same block) with a constant operand. Fold the
-          // constant.
+        if (canFoldAddIntoGEP(U, Op)) {
+          // A compatible add with a constant operand. Fold the constant.
           ConstantInt *CI =
             cast<ConstantInt>(cast<AddOperator>(Op)->getOperand(1));
           Disp += CI->getSExtValue() * S;
@@ -462,139 +584,43 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
         goto unsupported_gep;
       }
     }
+
     // Check for displacement overflow.
     if (!isInt<32>(Disp))
       break;
-    // Ok, the GEP indices were covered by constant-offset and scaled-index
-    // addressing. Update the address state and move on to examining the base.
+
     AM.IndexReg = IndexReg;
     AM.Scale = Scale;
     AM.Disp = (uint32_t)Disp;
-    if (X86SelectAddress(U->getOperand(0), AM))
+    GEPs.push_back(V);
+
+    if (const GetElementPtrInst *GEP =
+          dyn_cast<GetElementPtrInst>(U->getOperand(0))) {
+      // Ok, the GEP indices were covered by constant-offset and scaled-index
+      // addressing. Update the address state and move on to examining the base.
+      V = GEP;
+      goto redo_gep;
+    } else if (X86SelectAddress(U->getOperand(0), AM)) {
       return true;
+    }
 
     // If we couldn't merge the gep value into this addr mode, revert back to
     // our address and just match the value instead of completely failing.
     AM = SavedAM;
-    break;
-  unsupported_gep:
-    // Ok, the GEP indices weren't all covered.
-    break;
-  }
-  }
-
-  // Handle constant address.
-  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
-    // Can't handle alternate code models yet.
-    if (TM.getCodeModel() != CodeModel::Small)
-      return false;
 
-    // Can't handle TLS yet.
-    if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
-      if (GVar->isThreadLocal())
-        return false;
-
-    // Can't handle TLS yet, part 2 (this is slightly crazy, but this is how
-    // it works...).
-    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      if (const GlobalVariable *GVar =
-            dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal(false)))
-        if (GVar->isThreadLocal())
-          return false;
-
-    // RIP-relative addresses can't have additional register operands, so if
-    // we've already folded stuff into the addressing mode, just force the
-    // global value into its own register, which we can use as the basereg.
-    if (!Subtarget->isPICStyleRIPRel() ||
-        (AM.Base.Reg == 0 && AM.IndexReg == 0)) {
-      // Okay, we've committed to selecting this global. Set up the address.
-      AM.GV = GV;
-
-      // Allow the subtarget to classify the global.
-      unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
-
-      // If this reference is relative to the pic base, set it now.
-      if (isGlobalRelativeToPICBase(GVFlags)) {
-        // FIXME: How do we know Base.Reg is free??
-        AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(FuncInfo.MF);
-      }
-
-      // Unless the ABI requires an extra load, return a direct reference to
-      // the global.
-      if (!isGlobalStubReference(GVFlags)) {
-        if (Subtarget->isPICStyleRIPRel()) {
-          // Use rip-relative addressing if we can.  Above we verified that the
-          // base and index registers are unused.
-          assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
-          AM.Base.Reg = X86::RIP;
-        }
-        AM.GVOpFlags = GVFlags;
+    for (SmallVectorImpl<const Value *>::reverse_iterator
+           I = GEPs.rbegin(), E = GEPs.rend(); I != E; ++I)
+      if (handleConstantAddresses(*I, AM))
         return true;
-      }
 
-      // Ok, we need to do a load from a stub.  If we've already loaded from
-      // this stub, reuse the loaded pointer, otherwise emit the load now.
-      DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
-      unsigned LoadReg;
-      if (I != LocalValueMap.end() && I->second != 0) {
-        LoadReg = I->second;
-      } else {
-        // Issue load from stub.
-        unsigned Opc = 0;
-        const TargetRegisterClass *RC = NULL;
-        X86AddressMode StubAM;
-        StubAM.Base.Reg = AM.Base.Reg;
-        StubAM.GV = GV;
-        StubAM.GVOpFlags = GVFlags;
-
-        // Prepare for inserting code in the local-value area.
-        SavePoint SaveInsertPt = enterLocalValueArea();
-
-        if (TLI.getPointerTy() == MVT::i64) {
-          Opc = X86::MOV64rm;
-          RC  = &X86::GR64RegClass;
-
-          if (Subtarget->isPICStyleRIPRel())
-            StubAM.Base.Reg = X86::RIP;
-        } else {
-          Opc = X86::MOV32rm;
-          RC  = &X86::GR32RegClass;
-        }
-
-        LoadReg = createResultReg(RC);
-        MachineInstrBuilder LoadMI =
-          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), LoadReg);
-        addFullAddress(LoadMI, StubAM);
-
-        // Ok, back to normal mode.
-        leaveLocalValueArea(SaveInsertPt);
-
-        // Prevent loading GV stub multiple times in same MBB.
-        LocalValueMap[V] = LoadReg;
-      }
-
-      // Now construct the final address. Note that the Disp, Scale,
-      // and Index values may already be set here.
-      AM.Base.Reg = LoadReg;
-      AM.GV = 0;
-      return true;
-    }
+    return false;
+  unsupported_gep:
+    // Ok, the GEP indices weren't all covered.
+    break;
   }
-
-  // If all else fails, try to materialize the value in a register.
-  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
-    if (AM.Base.Reg == 0) {
-      AM.Base.Reg = getRegForValue(V);
-      return AM.Base.Reg != 0;
-    }
-    if (AM.IndexReg == 0) {
-      assert(AM.Scale == 1 && "Scale with no index!");
-      AM.IndexReg = getRegForValue(V);
-      return AM.IndexReg != 0;
-    }
   }
 
-  return false;
+  return handleConstantAddresses(V, AM);
 }
 
 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
@@ -602,9 +628,35 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
   const User *U = NULL;
   unsigned Opcode = Instruction::UserOp1;
-  if (const Instruction *I = dyn_cast<Instruction>(V)) {
+  const Instruction *I = dyn_cast<Instruction>(V);
+  // Record if the value is defined in the same basic block.
+  //
+  // This information is crucial to know whether or not folding an
+  // operand is valid.
+  // Indeed, FastISel generates or reuses a virtual register for all
+  // operands of all instructions it selects. Obviously, the definition and
+  // its uses must use the same virtual register otherwise the produced
+  // code is incorrect.
+  // Before instruction selection, FunctionLoweringInfo::set sets the virtual
+  // registers for values that are alive across basic blocks. This ensures
+  // that the values are consistently set between across basic block, even
+  // if different instruction selection mechanisms are used (e.g., a mix of
+  // SDISel and FastISel).
+  // For values local to a basic block, the instruction selection process
+  // generates these virtual registers with whatever method is appropriate
+  // for its needs. In particular, FastISel and SDISel do not share the way
+  // local virtual registers are set.
+  // Therefore, this is impossible (or at least unsafe) to share values
+  // between basic blocks unless they use the same instruction selection
+  // method, which is not guarantee for X86.
+  // Moreover, things like hasOneUse could not be used accurately, if we
+  // allow to reference values across basic blocks whereas they are not
+  // alive across basic blocks initially.
+  bool InMBB = true;
+  if (I) {
     Opcode = I->getOpcode();
     U = I;
+    InMBB = I->getParent() == FuncInfo.MBB->getBasicBlock();
   } else if (const ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
     Opcode = C->getOpcode();
     U = C;
@@ -613,18 +665,22 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
   switch (Opcode) {
   default: break;
   case Instruction::BitCast:
-    // Look past bitcasts.
-    return X86SelectCallAddress(U->getOperand(0), AM);
+    // Look past bitcasts if its operand is in the same BB.
+    if (InMBB)
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
 
   case Instruction::IntToPtr:
-    // Look past no-op inttoptrs.
-    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+    // Look past no-op inttoptrs if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
       return X86SelectCallAddress(U->getOperand(0), AM);
     break;
 
   case Instruction::PtrToInt:
-    // Look past no-op ptrtoints.
-    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+    // Look past no-op ptrtoints if its operand is in the same BB.
+    if (InMBB &&
+        TLI.getValueType(U->getType()) == TLI.getPointerTy())
       return X86SelectCallAddress(U->getOperand(0), AM);
     break;
   }
@@ -693,6 +749,10 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
   if (S->isAtomic())
     return false;
 
+  unsigned SABIAlignment =
+    TD.getABITypeAlignment(S->getValueOperand()->getType());
+  bool Aligned = S->getAlignment() == 0 || S->getAlignment() >= SABIAlignment;
+
   MVT VT;
   if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
     return false;
@@ -701,7 +761,7 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
   if (!X86SelectAddress(I->getOperand(1), AM))
     return false;
 
-  return X86FastEmitStore(VT, I->getOperand(0), AM);
+  return X86FastEmitStore(VT, I->getOperand(0), AM, Aligned);
 }
 
 /// X86SelectRet - Select and emit code to implement ret instructions.
@@ -717,10 +777,11 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
   CallingConv::ID CC = F.getCallingConv();
   if (CC != CallingConv::C &&
       CC != CallingConv::Fast &&
-      CC != CallingConv::X86_FastCall)
+      CC != CallingConv::X86_FastCall &&
+      CC != CallingConv::X86_64_SysV)
     return false;
 
-  if (Subtarget->isTargetWin64())
+  if (Subtarget->isCallingConvWin64(CC))
     return false;
 
   // Don't handle popping bytes on return for now.
@@ -1005,10 +1066,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
 }
 
 bool X86FastISel::X86SelectZExt(const Instruction *I) {
-  // Handle zero-extension from i1 to i8, which is common.
-  if (!I->getOperand(0)->getType()->isIntegerTy(1))
-    return false;
-
   EVT DstVT = TLI.getValueType(I->getType());
   if (!TLI.isTypeLegal(DstVT))
     return false;
@@ -1017,12 +1074,37 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
   if (ResultReg == 0)
     return false;
 
-  // Set the high bits to zero.
-  ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
-  if (ResultReg == 0)
-    return false;
+  // Handle zero-extension from i1 to i8, which is common.
+  MVT SrcVT = TLI.getSimpleValueType(I->getOperand(0)->getType());
+  if (SrcVT.SimpleTy == MVT::i1) {
+    // Set the high bits to zero.
+    ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
+    SrcVT = MVT::i8;
+
+    if (ResultReg == 0)
+      return false;
+  }
+
+  if (DstVT == MVT::i64) {
+    // Handle extension to 64-bits via sub-register shenanigans.
+    unsigned MovInst;
+
+    switch (SrcVT.SimpleTy) {
+    case MVT::i8:  MovInst = X86::MOVZX32rr8;  break;
+    case MVT::i16: MovInst = X86::MOVZX32rr16; break;
+    case MVT::i32: MovInst = X86::MOV32rr;     break;
+    default: llvm_unreachable("Unexpected zext to i64 source type");
+    }
+
+    unsigned Result32 = createResultReg(&X86::GR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(MovInst), Result32)
+      .addReg(ResultReg);
 
-  if (DstVT != MVT::i8) {
+    ResultReg = createResultReg(&X86::GR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::SUBREG_TO_REG),
+            ResultReg)
+      .addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+  } else if (DstVT != MVT::i8) {
     ResultReg = FastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
                            ResultReg, /*Kill=*/true);
     if (ResultReg == 0)
@@ -1273,8 +1355,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
     { &X86::GR16RegClass, X86::AX,  X86::DX, {
         { X86::IDIV16r, X86::CWD,     Copy,            X86::AX,  S }, // SDiv
         { X86::IDIV16r, X86::CWD,     Copy,            X86::DX,  S }, // SRem
-        { X86::DIV16r,  X86::MOV16r0, Copy,            X86::AX,  U }, // UDiv
-        { X86::DIV16r,  X86::MOV16r0, Copy,            X86::DX,  U }, // URem
+        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::AX,  U }, // UDiv
+        { X86::DIV16r,  X86::MOV32r0, Copy,            X86::DX,  U }, // URem
       }
     }, // i16
     { &X86::GR32RegClass, X86::EAX, X86::EDX, {
@@ -1287,8 +1369,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
     { &X86::GR64RegClass, X86::RAX, X86::RDX, {
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RAX, S }, // SDiv
         { X86::IDIV64r, X86::CQO,     Copy,            X86::RDX, S }, // SRem
-        { X86::DIV64r,  X86::MOV64r0, Copy,            X86::RAX, U }, // UDiv
-        { X86::DIV64r,  X86::MOV64r0, Copy,            X86::RDX, U }, // URem
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RAX, U }, // UDiv
+        { X86::DIV64r,  X86::MOV32r0, Copy,            X86::RDX, U }, // URem
       }
     }, // i64
   };
@@ -1334,17 +1416,63 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
     if (OpEntry.IsOpSigned)
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
               TII.get(OpEntry.OpSignExtend));
-    else
+    else {
+      unsigned Zero32 = createResultReg(&X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-              TII.get(OpEntry.OpSignExtend), TypeEntry.HighInReg);
+              TII.get(X86::MOV32r0), Zero32);
+
+      // Copy the zero into the appropriate sub/super/identical physical
+      // register. Unfortunately the operations needed are not uniform enough to
+      // fit neatly into the table above.
+      if (VT.SimpleTy == MVT::i16) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(Copy), TypeEntry.HighInReg)
+          .addReg(Zero32, 0, X86::sub_16bit);
+      } else if (VT.SimpleTy == MVT::i32) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(Copy), TypeEntry.HighInReg)
+            .addReg(Zero32);
+      } else if (VT.SimpleTy == MVT::i64) {
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+                TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg)
+            .addImm(0).addReg(Zero32).addImm(X86::sub_32bit);
+      }
+    }
   }
   // Generate the DIV/IDIV instruction.
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
           TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
-  // Copy output register into result register.
-  unsigned ResultReg = createResultReg(TypeEntry.RC);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
-          TII.get(Copy), ResultReg).addReg(OpEntry.DivRemResultReg);
+  // For i8 remainder, we can't reference AH directly, as we'll end
+  // up with bogus copies like %R9B = COPY %AH. Reference AX
+  // instead to prevent AH references in a REX instruction.
+  //
+  // The current assumption of the fast register allocator is that isel
+  // won't generate explicit references to the GPR8_NOREX registers. If
+  // the allocator and/or the backend get enhanced to be more robust in
+  // that regard, this can be, and should be, removed.
+  unsigned ResultReg = 0;
+  if ((I->getOpcode() == Instruction::SRem ||
+       I->getOpcode() == Instruction::URem) &&
+      OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
+    unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
+    unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+            TII.get(Copy), SourceSuperReg).addReg(X86::AX);
+
+    // Shift AX right by 8 bits instead of using AH.
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::SHR16ri),
+            ResultSuperReg).addReg(SourceSuperReg).addImm(8);
+
+    // Now reference the 8-bit subreg of the result.
+    ResultReg = FastEmitInst_extractsubreg(MVT::i8, ResultSuperReg,
+                                           /*Kill=*/true, X86::sub_8bit);
+  }
+  // Copy the result out of the physreg if we haven't already.
+  if (!ResultReg) {
+    ResultReg = createResultReg(TypeEntry.RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Copy), ResultReg)
+        .addReg(OpEntry.DivRemResultReg);
+  }
   UpdateValueMap(I, ResultReg);
 
   return true;
@@ -1643,9 +1771,6 @@ bool X86FastISel::FastLowerArguments() {
   if (!FuncInfo.CanLowerReturn)
     return false;
 
-  if (Subtarget->isTargetWin64())
-    return false;
-
   const Function *F = FuncInfo.Fn;
   if (F->isVarArg())
     return false;
@@ -1653,7 +1778,10 @@ bool X86FastISel::FastLowerArguments() {
   CallingConv::ID CC = F->getCallingConv();
   if (CC != CallingConv::C)
     return false;
-  
+
+  if (Subtarget->isCallingConvWin64(CC))
+    return false;
+
   if (!Subtarget->is64Bit())
     return false;
   
@@ -1697,8 +1825,6 @@ bool X86FastISel::FastLowerArguments() {
   const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
   for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
        I != E; ++I, ++Idx) {
-    if (I->use_empty())
-      continue;
     bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
     const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
     unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
@@ -1757,8 +1883,10 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
   // Handle only C and fastcc calling conventions for now.
   ImmutableCallSite CS(CI);
   CallingConv::ID CC = CS.getCallingConv();
+  bool isWin64 = Subtarget->isCallingConvWin64(CC);
   if (CC != CallingConv::C && CC != CallingConv::Fast &&
-      CC != CallingConv::X86_FastCall)
+      CC != CallingConv::X86_FastCall && CC != CallingConv::X86_64_Win64 &&
+      CC != CallingConv::X86_64_SysV)
     return false;
 
   // fastcc with -tailcallopt is intended to provide a guaranteed
@@ -1772,7 +1900,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
 
   // Don't know how to handle Win64 varargs yet.  Nothing special needed for
   // x86-32.  Special handling for x86-64 is implemented.
-  if (isVarArg && Subtarget->isTargetWin64())
+  if (isVarArg && isWin64)
     return false;
 
   // Fast-isel doesn't know about callee-pop yet.
@@ -1902,7 +2030,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
                  I->getParent()->getContext());
 
   // Allocate shadow area for Win64
-  if (Subtarget->isTargetWin64())
+  if (isWin64)
     CCInfo.AllocateStack(32, 8);
 
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_X86);
@@ -1985,6 +2113,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
     } else {
       unsigned LocMemOffset = VA.getLocMemOffset();
       X86AddressMode AM;
+      const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo*>(
+          getTargetMachine()->getRegisterInfo());
       AM.Base.Reg = RegInfo->getStackRegister();
       AM.Disp = LocMemOffset;
       const Value *ArgVal = ArgVals[VA.getValNo()];
@@ -2016,7 +2146,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
             X86::EBX).addReg(Base);
   }
 
-  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64()) {
+  if (Subtarget->is64Bit() && isVarArg && !isWin64) {
     // Count the number of XMM registers allocated.
     static const uint16_t XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
@@ -2085,7 +2215,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
   if (Subtarget->isPICStyleGOT())
     MIB.addReg(X86::EBX, RegState::Implicit);
 
-  if (Subtarget->is64Bit() && isVarArg && !Subtarget->isTargetWin64())
+  if (Subtarget->is64Bit() && isVarArg && !isWin64)
     MIB.addReg(X86::AL, RegState::Implicit);
 
   // Add implicit physical register uses to the call.
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 0dd034c..38a8351 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -125,6 +125,15 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
       // which requires isImm() to be true
       return 0;
     }
+    break;
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+    if (MI->getOperand(1).getReg() != MI->getOperand(2).getReg()) {
+      // if src1 != src2, then convertToThreeAddress will
+      // need to create a Virtual register, which we cannot do
+      // after register allocation.
+      return 0;
+    }
   }
   return TII->convertToThreeAddress(MFI, MBBI, 0);
 }
@@ -135,8 +144,8 @@ FunctionPass *llvm::createX86FixupLEAs() {
 
 bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
-  TII = Func.getTarget().getInstrInfo();
   TM = &MF->getTarget();
+  TII = TM->getInstrInfo();
 
   DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   // Process all basic blocks.
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 0585b43..48470da 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -115,9 +115,10 @@ namespace {
       unsigned Mask = 0;
       for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
            E = MBB->livein_end(); I != E; ++I) {
-        unsigned Reg = *I - X86::FP0;
-        if (Reg < 8)
-          Mask |= 1 << Reg;
+        unsigned Reg = *I;
+        if (Reg < X86::FP0 || Reg > X86::FP6)
+          continue;
+        Mask |= 1 << (Reg - X86::FP0);
       }
       return Mask;
     }
@@ -893,8 +894,8 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
 
   // Produce implicit-defs for free by using killed registers.
   while (Kills && Defs) {
-    unsigned KReg = CountTrailingZeros_32(Kills);
-    unsigned DReg = CountTrailingZeros_32(Defs);
+    unsigned KReg = countTrailingZeros(Kills);
+    unsigned DReg = countTrailingZeros(Defs);
     DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
     std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
     std::swap(RegMap[KReg], RegMap[DReg]);
@@ -917,7 +918,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
 
   // Manually kill the rest.
   while (Kills) {
-    unsigned KReg = CountTrailingZeros_32(Kills);
+    unsigned KReg = countTrailingZeros(Kills);
     DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
     freeStackSlotBefore(I, KReg);
     Kills &= ~(1 << KReg);
@@ -925,7 +926,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
 
   // Load zeros for all the imp-defs.
   while(Defs) {
-    unsigned DReg = CountTrailingZeros_32(Defs);
+    unsigned DReg = countTrailingZeros(Defs);
     DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
     BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
     pushReg(DReg);
@@ -1636,7 +1637,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
     // Note: this might be a non-optimal pop sequence.  We might be able to do
     // better by trying to pop in stack order or something.
     while (FPKills) {
-      unsigned FPReg = CountTrailingZeros_32(FPKills);
+      unsigned FPReg = countTrailingZeros(FPKills);
       if (isLive(FPReg))
         freeStackSlotAfter(InsertPt, FPReg);
       FPKills &= ~(1U << FPReg);
@@ -1661,6 +1662,7 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
     BuildMI(*MBB, I, MI->getDebugLoc(), TII->get(X86::CALLpcrel32))
       .addExternalSymbol("_ftol2")
       .addReg(X86::ST0, RegState::ImplicitKill)
+      .addReg(X86::ECX, RegState::ImplicitDefine)
       .addReg(X86::EAX, RegState::Define | RegState::Implicit)
       .addReg(X86::EDX, RegState::Define | RegState::Implicit)
       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 42b4e73..a06ba9d 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -307,12 +307,12 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
                                                  unsigned FramePtr) const {
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo &MMI = MF.getMMI();
+  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
 
   // Add callee saved registers to move list.
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
   if (CSI.empty()) return;
 
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
   const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
   bool HasFP = hasFP(MF);
 
@@ -360,276 +360,18 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
     if (HasFP && FramePtr == Reg)
       continue;
 
-    MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
-    MachineLocation CSSrc(Reg);
-    Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+    MMI.addFrameInst(MCCFIInstruction::createOffset(Label, DwarfReg, Offset));
   }
 }
 
-/// getCompactUnwindRegNum - Get the compact unwind number for a given
-/// register. The number corresponds to the enum lists in
-/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) {
-  static const uint16_t CU32BitRegs[] = {
-    X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
-  };
-  static const uint16_t CU64BitRegs[] = {
-    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
-  };
-  const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs;
-  for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
-    if (*CURegs == Reg)
-      return Idx;
-
-  return -1;
-}
-
-// Number of registers that can be saved in a compact unwind encoding.
-#define CU_NUM_SAVED_REGS 6
-
-/// encodeCompactUnwindRegistersWithoutFrame - Create the permutation encoding
-/// used with frameless stacks. It is passed the number of registers to be saved
-/// and an array of the registers saved.
-static uint32_t
-encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
-                                         unsigned RegCount, bool Is64Bit) {
-  // The saved registers are numbered from 1 to 6. In order to encode the order
-  // in which they were saved, we re-number them according to their place in the
-  // register order. The re-numbering is relative to the last re-numbered
-  // register. E.g., if we have registers {6, 2, 4, 5} saved in that order:
-  //
-  //    Orig  Re-Num
-  //    ----  ------
-  //     6       6
-  //     2       2
-  //     4       3
-  //     5       3
-  //
-  for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
-    int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit);
-    if (CUReg == -1) return ~0U;
-    SavedRegs[i] = CUReg;
-  }
-
-  // Reverse the list.
-  std::swap(SavedRegs[0], SavedRegs[5]);
-  std::swap(SavedRegs[1], SavedRegs[4]);
-  std::swap(SavedRegs[2], SavedRegs[3]);
-
-  uint32_t RenumRegs[CU_NUM_SAVED_REGS];
-  for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i) {
-    unsigned Countless = 0;
-    for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
-      if (SavedRegs[j] < SavedRegs[i])
-        ++Countless;
-
-    RenumRegs[i] = SavedRegs[i] - Countless - 1;
-  }
-
-  // Take the renumbered values and encode them into a 10-bit number.
-  uint32_t permutationEncoding = 0;
-  switch (RegCount) {
-  case 6:
-    permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
-                           + 6 * RenumRegs[2] +  2 * RenumRegs[3]
-                           +     RenumRegs[4];
-    break;
-  case 5:
-    permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
-                           + 6 * RenumRegs[3] +  2 * RenumRegs[4]
-                           +     RenumRegs[5];
-    break;
-  case 4:
-    permutationEncoding |=  60 * RenumRegs[2] + 12 * RenumRegs[3]
-                           + 3 * RenumRegs[4] +      RenumRegs[5];
-    break;
-  case 3:
-    permutationEncoding |=  20 * RenumRegs[3] +  4 * RenumRegs[4]
-                           +     RenumRegs[5];
-    break;
-  case 2:
-    permutationEncoding |=   5 * RenumRegs[4] +      RenumRegs[5];
-    break;
-  case 1:
-    permutationEncoding |=       RenumRegs[5];
-    break;
-  }
-
-  assert((permutationEncoding & 0x3FF) == permutationEncoding &&
-         "Invalid compact register encoding!");
-  return permutationEncoding;
-}
-
-/// encodeCompactUnwindRegistersWithFrame - Return the registers encoded for a
-/// compact encoding with a frame pointer.
-static uint32_t
-encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
-                                      bool Is64Bit) {
-  // Encode the registers in the order they were saved, 3-bits per register. The
-  // registers are numbered from 1 to CU_NUM_SAVED_REGS.
-  uint32_t RegEnc = 0;
-  for (int I = CU_NUM_SAVED_REGS - 1, Idx = 0; I != -1; --I) {
-    unsigned Reg = SavedRegs[I];
-    if (Reg == 0) continue;
-
-    int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
-    if (CURegNum == -1) return ~0U;
-
-    // Encode the 3-bit register number in order, skipping over 3-bits for each
-    // register.
-    RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
-  }
-
-  assert((RegEnc & 0x3FFFF) == RegEnc && "Invalid compact register encoding!");
-  return RegEnc;
-}
-
-uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
-  const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
-  unsigned FramePtr = RegInfo->getFrameRegister(MF);
-  unsigned StackPtr = RegInfo->getStackRegister();
-
-  bool Is64Bit = STI.is64Bit();
-  bool HasFP = hasFP(MF);
-
-  unsigned SavedRegs[CU_NUM_SAVED_REGS] = { 0, 0, 0, 0, 0, 0 };
-  unsigned SavedRegIdx = 0;
-
-  unsigned OffsetSize = (Is64Bit ? 8 : 4);
-
-  unsigned PushInstr = (Is64Bit ? X86::PUSH64r : X86::PUSH32r);
-  unsigned PushInstrSize = 1;
-  unsigned MoveInstr = (Is64Bit ? X86::MOV64rr : X86::MOV32rr);
-  unsigned MoveInstrSize = (Is64Bit ? 3 : 2);
-  unsigned SubtractInstrIdx = (Is64Bit ? 3 : 2);
-
-  unsigned StackDivide = (Is64Bit ? 8 : 4);
-
-  unsigned InstrOffset = 0;
-  unsigned StackAdjust = 0;
-  unsigned StackSize = 0;
-
-  MachineBasicBlock &MBB = MF.front(); // Prologue is in entry BB.
-  bool ExpectEnd = false;
-  for (MachineBasicBlock::iterator
-         MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ++MBBI) {
-    MachineInstr &MI = *MBBI;
-    unsigned Opc = MI.getOpcode();
-    if (Opc == X86::PROLOG_LABEL) continue;
-    if (!MI.getFlag(MachineInstr::FrameSetup)) break;
-
-    // We don't exect any more prolog instructions.
-    if (ExpectEnd) return CU::UNWIND_MODE_DWARF;
-
-    if (Opc == PushInstr) {
-      // If there are too many saved registers, we cannot use compact encoding.
-      if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
-
-      unsigned Reg = MI.getOperand(0).getReg();
-      if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
-        ExpectEnd = true;
-        continue;
-      }
-
-      SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
-      StackAdjust += OffsetSize;
-      InstrOffset += PushInstrSize;
-    } else if (Opc == MoveInstr) {
-      unsigned SrcReg = MI.getOperand(1).getReg();
-      unsigned DstReg = MI.getOperand(0).getReg();
-
-      if (DstReg != FramePtr || SrcReg != StackPtr)
-        return CU::UNWIND_MODE_DWARF;
-
-      StackAdjust = 0;
-      memset(SavedRegs, 0, sizeof(SavedRegs));
-      SavedRegIdx = 0;
-      InstrOffset += MoveInstrSize;
-    } else if (Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
-               Opc == X86::SUB32ri || Opc == X86::SUB32ri8) {
-      if (StackSize)
-        // We already have a stack size.
-        return CU::UNWIND_MODE_DWARF;
-
-      if (!MI.getOperand(0).isReg() ||
-          MI.getOperand(0).getReg() != MI.getOperand(1).getReg() ||
-          MI.getOperand(0).getReg() != StackPtr || !MI.getOperand(2).isImm())
-        // We need this to be a stack adjustment pointer. Something like:
-        //
-        //   %RSP<def> = SUB64ri8 %RSP, 48
-        return CU::UNWIND_MODE_DWARF;
-
-      StackSize = MI.getOperand(2).getImm() / StackDivide;
-      SubtractInstrIdx += InstrOffset;
-      ExpectEnd = true;
-    }
-  }
-
-  // Encode that we are using EBP/RBP as the frame pointer.
-  uint32_t CompactUnwindEncoding = 0;
-  StackAdjust /= StackDivide;
-  if (HasFP) {
-    if ((StackAdjust & 0xFF) != StackAdjust)
-      // Offset was too big for compact encoding.
-      return CU::UNWIND_MODE_DWARF;
-
-    // Get the encoding of the saved registers when we have a frame pointer.
-    uint32_t RegEnc = encodeCompactUnwindRegistersWithFrame(SavedRegs, Is64Bit);
-    if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
-
-    CompactUnwindEncoding |= CU::UNWIND_MODE_BP_FRAME;
-    CompactUnwindEncoding |= (StackAdjust & 0xFF) << 16;
-    CompactUnwindEncoding |= RegEnc & CU::UNWIND_BP_FRAME_REGISTERS;
-  } else {
-    ++StackAdjust;
-    uint32_t TotalStackSize = StackAdjust + StackSize;
-    if ((TotalStackSize & 0xFF) == TotalStackSize) {
-      // Frameless stack with a small stack size.
-      CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IMMD;
-
-      // Encode the stack size.
-      CompactUnwindEncoding |= (TotalStackSize & 0xFF) << 16;
-    } else {
-      if ((StackAdjust & 0x7) != StackAdjust)
-        // The extra stack adjustments are too big for us to handle.
-        return CU::UNWIND_MODE_DWARF;
-
-      // Frameless stack with an offset too large for us to encode compactly.
-      CompactUnwindEncoding |= CU::UNWIND_MODE_STACK_IND;
-
-      // Encode the offset to the nnnnnn value in the 'subl $nnnnnn, ESP'
-      // instruction.
-      CompactUnwindEncoding |= (SubtractInstrIdx & 0xFF) << 16;
-
-      // Encode any extra stack stack adjustments (done via push instructions).
-      CompactUnwindEncoding |= (StackAdjust & 0x7) << 13;
-    }
-
-    // Encode the number of registers saved.
-    CompactUnwindEncoding |= (SavedRegIdx & 0x7) << 10;
-
-    // Get the encoding of the saved registers when we don't have a frame
-    // pointer.
-    uint32_t RegEnc =
-      encodeCompactUnwindRegistersWithoutFrame(SavedRegs, SavedRegIdx,
-                                               Is64Bit);
-    if (RegEnc == ~0U) return CU::UNWIND_MODE_DWARF;
-
-    // Encode the register encoding.
-    CompactUnwindEncoding |=
-      RegEnc & CU::UNWIND_FRAMELESS_STACK_REG_PERMUTATION;
-  }
-
-  return CompactUnwindEncoding;
-}
-
 /// usesTheStack - This function checks if any of the users of EFLAGS
 /// copies the EFLAGS. We know that the code that lowers COPY of EFLAGS has
 /// to use the stack, and if we don't adjust the stack we clobber the first
 /// frame index.
 /// See X86InstrInfo::copyPhysReg.
-static bool usesTheStack(MachineFunction &MF) {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
+static bool usesTheStack(const MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
 
   for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
        re = MRI.reg_end(); ri != re; ++ri)
@@ -732,7 +474,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   //        REG < 64                    => DW_CFA_offset + Reg
   //        ELSE                        => DW_CFA_offset_extended
 
-  std::vector<MachineMove> &Moves = MMI.getFrameMoves();
   uint64_t NumBytes = 0;
   int stackGrowth = -SlotSize;
 
@@ -765,20 +506,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
         .addSym(FrameLabel);
 
       // Define the current CFA rule to use the provided offset.
-      if (StackSize) {
-        MachineLocation SPDst(MachineLocation::VirtualFP);
-        MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
-        Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
-      } else {
-        MachineLocation SPDst(StackPtr);
-        MachineLocation SPSrc(StackPtr, stackGrowth);
-        Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
-      }
+      assert(StackSize);
+      MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(FrameLabel, 2 * stackGrowth));
 
       // Change the rule for the FramePtr to be an "offset" rule.
-      MachineLocation FPDst(MachineLocation::VirtualFP, 2 * stackGrowth);
-      MachineLocation FPSrc(FramePtr);
-      Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+      MMI.addFrameInst(MCCFIInstruction::createOffset(FrameLabel, DwarfFramePtr,
+                                                      2 * stackGrowth));
     }
 
     // Update EBP with the new base value.
@@ -794,9 +529,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
         .addSym(FrameLabel);
 
       // Define the current CFA to use the EBP/RBP register.
-      MachineLocation FPDst(FramePtr);
-      MachineLocation FPSrc(MachineLocation::VirtualFP);
-      Moves.push_back(MachineMove(FrameLabel, FPDst, FPSrc));
+      unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
+      MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaRegister(FrameLabel, DwarfFramePtr));
     }
 
     // Mark the FramePtr as live-in in every block except the entry.
@@ -824,10 +559,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       BuildMI(MBB, MBBI, DL, TII.get(X86::PROLOG_LABEL)).addSym(Label);
 
       // Define the current CFA rule to use the provided offset.
-      unsigned Ptr = StackSize ? MachineLocation::VirtualFP : StackPtr;
-      MachineLocation SPDst(Ptr);
-      MachineLocation SPSrc(Ptr, StackOffset);
-      Moves.push_back(MachineMove(Label, SPDst, SPSrc));
+      assert(StackSize);
+      MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(Label, StackOffset));
       StackOffset += stackGrowth;
     }
   }
@@ -872,7 +606,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
   // responsible for adjusting the stack pointer.  Touching the stack at 4K
   // increments is necessary to ensure that the guard pages used by the OS
   // virtual memory manager are allocated in correct sequence.
-  if (NumBytes >= 4096 && STI.isTargetCOFF() && !STI.isTargetEnvMacho()) {
+  if (NumBytes >= 4096 && STI.isOSWindows() && !STI.isTargetEnvMacho()) {
     const char *StackProbeSymbol;
     bool isSPUpdateNeeded = false;
 
@@ -923,11 +657,14 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit)
       .setMIFlag(MachineInstr::FrameSetup);
 
-    // MSVC x64's __chkstk needs to adjust %rsp.
-    // FIXME: %rax preserves the offset and should be available.
-    if (isSPUpdateNeeded)
-      emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
-                   UseLEA, TII, *RegInfo);
+    // MSVC x64's __chkstk does not adjust %rsp itself.
+    // It also does not clobber %rax so we can reuse it when adjusting %rsp.
+    if (isSPUpdateNeeded) {
+      BuildMI(MBB, MBBI, DL, TII.get(X86::SUB64rr), StackPtr)
+        .addReg(StackPtr)
+        .addReg(X86::RAX)
+        .setMIFlag(MachineInstr::FrameSetup);
+    }
 
     if (isEAXAlive) {
         // Restore EAX
@@ -961,27 +698,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
 
     if (!HasFP && NumBytes) {
       // Define the current CFA rule to use the provided offset.
-      if (StackSize) {
-        MachineLocation SPDst(MachineLocation::VirtualFP);
-        MachineLocation SPSrc(MachineLocation::VirtualFP,
-                              -StackSize + stackGrowth);
-        Moves.push_back(MachineMove(Label, SPDst, SPSrc));
-      } else {
-        MachineLocation SPDst(StackPtr);
-        MachineLocation SPSrc(StackPtr, stackGrowth);
-        Moves.push_back(MachineMove(Label, SPDst, SPSrc));
-      }
+      assert(StackSize);
+      MMI.addFrameInst(MCCFIInstruction::createDefCfaOffset(
+          Label, -StackSize + stackGrowth));
     }
 
     // Emit DWARF info specifying the offsets of the callee-saved registers.
     if (PushedRegs)
       emitCalleeSavedFrameMoves(MF, Label, HasFP ? FramePtr : StackPtr);
   }
-
-  // Darwin 10.7 and greater has support for compact unwind encoding.
-  if (STI.getTargetTriple().isMacOSX() &&
-      !STI.getTargetTriple().isMacOSXVersionLT(10, 7))
-    MMI.setCompactUnwindEncoding(getCompactUnwindEncoding(MF));
 }
 
 void X86FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -1336,7 +1061,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   unsigned SlotSize = RegInfo->getSlotSize();
 
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
-  int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+  int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
 
   if (TailCallReturnAddrDelta < 0) {
     // create RETURNADDR area
@@ -1349,7 +1074,7 @@ X86FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     //   }
     //   [EBP]
     MFI->CreateFixedObject(-TailCallReturnAddrDelta,
-                           (-1U*SlotSize)+TailCallReturnAddrDelta, true);
+                           TailCallReturnAddrDelta - SlotSize, true);
   }
 
   if (hasFP(MF)) {
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 6e309d8..3d3b011 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -20,32 +20,6 @@
 
 namespace llvm {
 
-namespace CU {
-
-  /// Compact unwind encoding values.
-  enum CompactUnwindEncodings {
-    /// [RE]BP based frame where [RE]BP is pused on the stack immediately after
-    /// the return address, then [RE]SP is moved to [RE]BP.
-    UNWIND_MODE_BP_FRAME                   = 0x01000000,
-
-    /// A frameless function with a small constant stack size.
-    UNWIND_MODE_STACK_IMMD                 = 0x02000000,
-
-    /// A frameless function with a large constant stack size.
-    UNWIND_MODE_STACK_IND                  = 0x03000000,
-
-    /// No compact unwind encoding is available.
-    UNWIND_MODE_DWARF                      = 0x04000000,
-
-    /// Mask for encoding the frame registers.
-    UNWIND_BP_FRAME_REGISTERS              = 0x00007FFF,
-
-    /// Mask for encoding the frameless registers.
-    UNWIND_FRAMELESS_STACK_REG_PERMUTATION = 0x000003FF
-  };
-
-} // end CU namespace
-
 class MCSymbol;
 class X86TargetMachine;
 
@@ -91,7 +65,6 @@ public:
   int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const;
-  uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
                                      MachineBasicBlock &MBB,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 968b358..36d1690 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -79,7 +79,8 @@ namespace {
     }
 
     bool hasBaseOrIndexReg() const {
-      return IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+      return BaseType == FrameIndexBase ||
+             IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
     }
 
     /// isRIPRelative - Return true if this addressing mode is already RIP
@@ -141,10 +142,6 @@ namespace {
   /// SelectionDAG operations.
   ///
   class X86DAGToDAGISel : public SelectionDAGISel {
-    /// X86Lowering - This object fully describes how to lower LLVM code to an
-    /// X86-specific SelectionDAG.
-    const X86TargetLowering &X86Lowering;
-
     /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
     /// make the right decision when generating code for different targets.
     const X86Subtarget *Subtarget;
@@ -156,7 +153,6 @@ namespace {
   public:
     explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
       : SelectionDAGISel(tm, OptLevel),
-        X86Lowering(*tm.getTargetLowering()),
         Subtarget(&tm.getSubtarget<X86Subtarget>()),
         OptForSize(false) {}
 
@@ -188,7 +184,7 @@ namespace {
     SDNode *Select(SDNode *N);
     SDNode *SelectGather(SDNode *N, unsigned Opc);
     SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
-    SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
+    SDNode *SelectAtomicLoadArith(SDNode *Node, MVT NVT);
 
     bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
     bool MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -200,9 +196,13 @@ namespace {
     bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
                     SDValue &Scale, SDValue &Index, SDValue &Disp,
                     SDValue &Segment);
+    bool SelectMOV64Imm32(SDValue N, SDValue &Imm);
     bool SelectLEAAddr(SDValue N, SDValue &Base,
                        SDValue &Scale, SDValue &Index, SDValue &Disp,
                        SDValue &Segment);
+    bool SelectLEA64_32Addr(SDValue N, SDValue &Base,
+                            SDValue &Scale, SDValue &Index, SDValue &Disp,
+                            SDValue &Segment);
     bool SelectTLSADDRAddr(SDValue N, SDValue &Base,
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
                            SDValue &Segment);
@@ -229,14 +229,15 @@ namespace {
                                    SDValue &Scale, SDValue &Index,
                                    SDValue &Disp, SDValue &Segment) {
       Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
-        CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) :
+        CurDAG->getTargetFrameIndex(AM.Base_FrameIndex,
+                                    getTargetLowering()->getPointerTy()) :
         AM.Base_Reg;
       Scale = getI8Imm(AM.Scale);
       Index = AM.IndexReg;
       // These are 32-bit even in 64-bit mode since RIP relative offset
       // is 32-bit.
       if (AM.GV)
-        Disp = CurDAG->getTargetGlobalAddress(AM.GV, DebugLoc(),
+        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
                                               MVT::i32, AM.Disp,
                                               AM.SymbolFlags);
       else if (AM.CP)
@@ -373,7 +374,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
       else
         Ops.push_back(Chain.getOperand(i));
     SDValue NewChain =
-      CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load),
                       MVT::Other, &Ops[0], Ops.size());
     Ops.clear();
     Ops.push_back(NewChain);
@@ -491,8 +492,8 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
     if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
       continue;
 
-    EVT SrcVT = N->getOperand(0).getValueType();
-    EVT DstVT = N->getValueType(0);
+    MVT SrcVT = N->getOperand(0).getSimpleValueType();
+    MVT DstVT = N->getSimpleValueType(0);
 
     // If any of the sources are vectors, no fp stack involved.
     if (SrcVT.isVector() || DstVT.isVector())
@@ -500,8 +501,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
 
     // If the source and destination are SSE registers, then this is a legal
     // conversion that should not be lowered.
-    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
-    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+    const X86TargetLowering *X86Lowering =
+        static_cast<const X86TargetLowering *>(getTargetLowering());
+    bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
     if (SrcIsSSE && DstIsSSE)
       continue;
 
@@ -517,14 +520,14 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
     // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
     // FPStack has extload and truncstore.  SSE can fold direct loads into other
     // operations.  Based on this, decide what we want to do.
-    EVT MemVT;
+    MVT MemVT;
     if (N->getOpcode() == ISD::FP_ROUND)
       MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
     else
       MemVT = SrcIsSSE ? SrcVT : DstVT;
 
     SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
-    DebugLoc dl = N->getDebugLoc();
+    SDLoc dl(N);
 
     // FIXME: optimize the case where the src/dest is a load or store?
     SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
@@ -781,8 +784,8 @@ static bool FoldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
       Mask != (0xffu << ScaleLog))
     return true;
 
-  EVT VT = N.getValueType();
-  DebugLoc DL = N.getDebugLoc();
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
   SDValue Eight = DAG.getConstant(8, MVT::i8);
   SDValue NewMask = DAG.getConstant(0xff, VT);
   SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
@@ -829,8 +832,8 @@ static bool FoldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
   if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3)
     return true;
 
-  EVT VT = N.getValueType();
-  DebugLoc DL = N.getDebugLoc();
+  MVT VT = N.getSimpleValueType();
+  SDLoc DL(N);
   SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, VT);
   SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
   SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
@@ -886,8 +889,8 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
     return true;
 
   unsigned ShiftAmt = Shift.getConstantOperandVal(1);
-  unsigned MaskLZ = CountLeadingZeros_64(Mask);
-  unsigned MaskTZ = CountTrailingZeros_64(Mask);
+  unsigned MaskLZ = countLeadingZeros(Mask);
+  unsigned MaskTZ = countTrailingZeros(Mask);
 
   // The amount of shift we're trying to fit into the addressing mode is taken
   // from the trailing zeros of the mask.
@@ -902,7 +905,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
 
   // Scale the leading zero count down based on the actual size of the value.
   // Also scale it down based on the size of the shift.
-  MaskLZ -= (64 - X.getValueSizeInBits()) + ShiftAmt;
+  MaskLZ -= (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
 
   // The final check is to ensure that any masked out high bits of X are
   // already known to be zero. Otherwise, the mask has a semantic impact
@@ -912,31 +915,31 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
   // replace them with zero extensions cheaply if necessary.
   bool ReplacingAnyExtend = false;
   if (X.getOpcode() == ISD::ANY_EXTEND) {
-    unsigned ExtendBits =
-      X.getValueSizeInBits() - X.getOperand(0).getValueSizeInBits();
+    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
+                          X.getOperand(0).getSimpleValueType().getSizeInBits();
     // Assume that we'll replace the any-extend with a zero-extend, and
     // narrow the search to the extended value.
     X = X.getOperand(0);
     MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
     ReplacingAnyExtend = true;
   }
-  APInt MaskedHighBits = APInt::getHighBitsSet(X.getValueSizeInBits(),
-                                               MaskLZ);
+  APInt MaskedHighBits =
+    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
   APInt KnownZero, KnownOne;
   DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
   if (MaskedHighBits != KnownZero) return true;
 
   // We've identified a pattern that can be transformed into a single shift
   // and an addressing mode. Make it so.
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   if (ReplacingAnyExtend) {
     assert(X.getValueType() != VT);
     // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
-    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, X.getDebugLoc(), VT, X);
+    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
     InsertDAGNode(DAG, N, NewX);
     X = NewX;
   }
-  DebugLoc DL = N.getDebugLoc();
+  SDLoc DL(N);
   SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, MVT::i8);
   SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
   SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, MVT::i8);
@@ -960,7 +963,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
 
 bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                               unsigned Depth) {
-  DebugLoc dl = N.getDebugLoc();
+  SDLoc dl(N);
   DEBUG({
       dbgs() << "MatchAddress: ";
       AM.dump();
@@ -1057,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
     // We only handle up to 64-bit values here as those are what matter for
     // addressing mode optimizations.
-    if (X.getValueSizeInBits() > 64) break;
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
 
     // The mask used for the transform is expected to be post-shift, but we
     // found the shift first so just apply the shift to the mask before passing
@@ -1242,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
     // We only handle up to 64-bit values here as those are what matter for
     // addressing mode optimizations.
-    if (X.getValueSizeInBits() > 64) break;
+    if (X.getSimpleValueType().getSizeInBits() > 64) break;
 
     if (!isa<ConstantSDNode>(N.getOperand(1)))
       break;
@@ -1321,7 +1324,7 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
   if (MatchAddress(N, AM))
     return false;
 
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   if (AM.BaseType == X86ISelAddressMode::RegBase) {
     if (!AM.Base_Reg.getNode())
       AM.Base_Reg = CurDAG->getRegister(0, VT);
@@ -1380,6 +1383,71 @@ bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Root,
 }
 
 
+bool X86DAGToDAGISel::SelectMOV64Imm32(SDValue N, SDValue &Imm) {
+  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
+    uint64_t ImmVal = CN->getZExtValue();
+    if ((uint32_t)ImmVal != (uint64_t)ImmVal)
+      return false;
+
+    Imm = CurDAG->getTargetConstant(ImmVal, MVT::i64);
+    return true;
+  }
+
+  // In static codegen with small code model, we can get the address of a label
+  // into a register with 'movl'. TableGen has already made sure we're looking
+  // at a label of some kind.
+  assert(N->getOpcode() == X86ISD::Wrapper &&
+         "Unexpected node type for MOV32ri64");
+  N = N.getOperand(0);
+
+  if (N->getOpcode() != ISD::TargetConstantPool &&
+      N->getOpcode() != ISD::TargetJumpTable &&
+      N->getOpcode() != ISD::TargetGlobalAddress &&
+      N->getOpcode() != ISD::TargetExternalSymbol &&
+      N->getOpcode() != ISD::TargetBlockAddress)
+    return false;
+
+  Imm = N;
+  return TM.getCodeModel() == CodeModel::Small;
+}
+
+bool X86DAGToDAGISel::SelectLEA64_32Addr(SDValue N, SDValue &Base,
+                                         SDValue &Scale, SDValue &Index,
+                                         SDValue &Disp, SDValue &Segment) {
+  if (!SelectLEAAddr(N, Base, Scale, Index, Disp, Segment))
+    return false;
+
+  SDLoc DL(N);
+  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
+  if (RN && RN->getReg() == 0)
+    Base = CurDAG->getRegister(0, MVT::i64);
+  else if (Base.getValueType() == MVT::i32 && !dyn_cast<FrameIndexSDNode>(N)) {
+    // Base could already be %rip, particularly in the x32 ABI.
+    Base = SDValue(CurDAG->getMachineNode(
+                       TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+                       CurDAG->getTargetConstant(0, MVT::i64),
+                       Base,
+                       CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                   0);
+  }
+
+  RN = dyn_cast<RegisterSDNode>(Index);
+  if (RN && RN->getReg() == 0)
+    Index = CurDAG->getRegister(0, MVT::i64);
+  else {
+    assert(Index.getValueType() == MVT::i32 &&
+           "Expect to be extending 32-bit registers for use in LEA");
+    Index = SDValue(CurDAG->getMachineNode(
+                        TargetOpcode::SUBREG_TO_REG, DL, MVT::i64,
+                        CurDAG->getTargetConstant(0, MVT::i64),
+                        Index,
+                        CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                    0);
+  }
+
+  return true;
+}
+
 /// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
 /// mode it matches can be cost effectively emitted as an LEA instruction.
 bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
@@ -1398,7 +1466,7 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N,
   assert (T == AM.Segment);
   AM.Segment = Copy;
 
-  EVT VT = N.getValueType();
+  MVT VT = N.getSimpleValueType();
   unsigned Complexity = 0;
   if (AM.BaseType == X86ISelAddressMode::RegBase)
     if (AM.Base_Reg.getNode())
@@ -1487,7 +1555,8 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
 ///
 SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
   unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
-  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+  return CurDAG->getRegister(GlobalBaseReg,
+                             getTargetLowering()->getPointerTy()).getNode();
 }
 
 SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
@@ -1502,7 +1571,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
   const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
-  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node),
                                            MVT::i32, MVT::i32, MVT::Other, Ops);
   cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
   return ResNode;
@@ -1637,8 +1706,8 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
 // + empty, the operand is not needed any more with the new op selected.
 // + non-empty, otherwise.
 static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
-                                                DebugLoc dl,
-                                                enum AtomicOpc &Op, EVT NVT,
+                                                SDLoc dl,
+                                                enum AtomicOpc &Op, MVT NVT,
                                                 SDValue Val) {
   if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
     int64_t CNVal = CN->getSExtValue();
@@ -1685,11 +1754,11 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
   return Val;
 }
 
-SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
+SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
   if (Node->hasAnyUseOfValue(0))
     return 0;
 
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
 
   // Optimize common patterns for __sync_or_and_fetch and similar arith
   // operations where the result is not used. This allows us to use the "lock"
@@ -1725,7 +1794,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
   bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
 
   unsigned Opc = 0;
-  switch (NVT.getSimpleVT().SimpleTy) {
+  switch (NVT.SimpleTy) {
     default: return 0;
     case MVT::i8:
       if (isCN)
@@ -1920,7 +1989,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
     if (ChainCheck)
       // Make a new TokenFactor with all the other input chains except
       // for the load.
-      InputChain = CurDAG->getNode(ISD::TokenFactor, Chain.getDebugLoc(),
+      InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
                                    MVT::Other, &ChainOps[0], ChainOps.size());
   }
   if (!ChainCheck)
@@ -1968,7 +2037,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
   SDValue Segment = CurDAG->getRegister(0, MVT::i32);
   const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue()), VIdx,
                           Disp, Segment, VMask, Chain};
-  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(), VTs, Ops);
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, SDLoc(Node), VTs, Ops);
   // Node has 2 outputs: VDst and MVT::Other.
   // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
   // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
@@ -1979,15 +2048,16 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
 }
 
 SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
-  EVT NVT = Node->getValueType(0);
+  MVT NVT = Node->getSimpleValueType(0);
   unsigned Opc, MOpc;
   unsigned Opcode = Node->getOpcode();
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
 
   DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << '\n');
 
   if (Node->isMachineOpcode()) {
     DEBUG(dbgs() << "== ";  Node->dump(CurDAG); dbgs() << '\n');
+    Node->setNodeId(-1);
     return NULL;   // Already selected.
   }
 
@@ -2013,6 +2083,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     case Intrinsic::x86_avx2_gather_d_d_256:
     case Intrinsic::x86_avx2_gather_q_d:
     case Intrinsic::x86_avx2_gather_q_d_256: {
+      if (!Subtarget->hasAVX2())
+        break;
       unsigned Opc;
       switch (IntNo) {
       default: llvm_unreachable("Impossible intrinsic");
@@ -2117,7 +2189,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       break;
 
     unsigned ShlOp, Op;
-    EVT CstVT = NVT;
+    MVT CstVT = NVT;
 
     // Check the minimum bitwidth for the new constant.
     // TODO: AND32ri is the same as AND64ri32 with zext imm.
@@ -2132,7 +2204,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     if (NVT == CstVT)
       break;
 
-    switch (NVT.getSimpleVT().SimpleTy) {
+    switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i32:
       assert(CstVT == MVT::i8);
@@ -2169,7 +2241,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     SDValue N1 = Node->getOperand(1);
 
     unsigned LoReg;
-    switch (NVT.getSimpleVT().SimpleTy) {
+    switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i8:  LoReg = X86::AL;  Opc = X86::MUL8r; break;
     case MVT::i16: LoReg = X86::AX;  Opc = X86::MUL16r; break;
@@ -2198,7 +2270,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     bool isSigned = Opcode == ISD::SMUL_LOHI;
     bool hasBMI2 = Subtarget->hasBMI2();
     if (!isSigned) {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
       case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
@@ -2208,7 +2280,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
                      MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
       }
     } else {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
       case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
@@ -2335,9 +2407,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
     }
 
-    // Propagate ordering to the last node, for now.
-    CurDAG->AssignOrdering(InFlag.getNode(), CurDAG->GetOrdering(Node));
-
     return NULL;
   }
 
@@ -2348,7 +2417,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
     bool isSigned = Opcode == ISD::SDIVREM;
     if (!isSigned) {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
       case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
@@ -2356,7 +2425,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
       }
     } else {
-      switch (NVT.getSimpleVT().SimpleTy) {
+      switch (NVT.SimpleTy) {
       default: llvm_unreachable("Unsupported VT!");
       case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
       case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
@@ -2366,27 +2435,24 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     }
 
     unsigned LoReg, HiReg, ClrReg;
-    unsigned ClrOpcode, SExtOpcode;
-    switch (NVT.getSimpleVT().SimpleTy) {
+    unsigned SExtOpcode;
+    switch (NVT.SimpleTy) {
     default: llvm_unreachable("Unsupported VT!");
     case MVT::i8:
       LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
-      ClrOpcode  = 0;
       SExtOpcode = X86::CBW;
       break;
     case MVT::i16:
       LoReg = X86::AX;  HiReg = X86::DX;
-      ClrOpcode  = X86::MOV16r0; ClrReg = X86::DX;
+      ClrReg = X86::DX;
       SExtOpcode = X86::CWD;
       break;
     case MVT::i32:
       LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
-      ClrOpcode  = X86::MOV32r0;
       SExtOpcode = X86::CDQ;
       break;
     case MVT::i64:
       LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
-      ClrOpcode  = X86::MOV64r0;
       SExtOpcode = X86::CQO;
       break;
     }
@@ -2424,8 +2490,29 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
           SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
       } else {
         // Zero out the high part, effectively zero extending the input.
-        SDValue ClrNode =
-          SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
+        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);       
+        switch (NVT.SimpleTy) {
+        case MVT::i16:
+          ClrNode =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
+                          CurDAG->getTargetConstant(X86::sub_16bit, MVT::i32)),
+                      0);
+          break;
+        case MVT::i32:
+          break;
+        case MVT::i64:
+          ClrNode =
+              SDValue(CurDAG->getMachineNode(
+                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+                          CurDAG->getTargetConstant(0, MVT::i64), ClrNode,
+                          CurDAG->getTargetConstant(X86::sub_32bit, MVT::i32)),
+                      0);
+          break;
+        default:
+          llvm_unreachable("Unexpected division source");
+        }
+
         InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                       ClrNode, InFlag).getValue(1);
       }
@@ -2446,6 +2533,11 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
     // Prevent use of AH in a REX instruction by referencing AX instead.
     // Shift it down 8 bits.
+    //
+    // The current assumption of the register allocator is that isel
+    // won't generate explicit references to the GPR8_NOREX registers. If
+    // the allocator and/or the backend get enhanced to be more robust in
+    // that regard, this can be, and should be, removed.
     if (HiReg == X86::AH && Subtarget->is64Bit() &&
         !SDValue(Node, 1).use_empty()) {
       SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
@@ -2519,7 +2611,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         // On x86-32, only the ABCD registers have 8-bit subregisters.
         if (!Subtarget->is64Bit()) {
           const TargetRegisterClass *TRC;
-          switch (N0.getValueType().getSimpleVT().SimpleTy) {
+          switch (N0.getSimpleValueType().SimpleTy) {
           case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
           case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
           default: llvm_unreachable("Unsupported TEST operand type!");
@@ -2554,7 +2646,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
 
         // Put the value in an ABCD register.
         const TargetRegisterClass *TRC;
-        switch (N0.getValueType().getSimpleVT().SimpleTy) {
+        switch (N0.getSimpleValueType().SimpleTy) {
         case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
         case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
         case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
@@ -2666,7 +2758,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     EVT LdVT = LoadNode->getMemoryVT();
     unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
     MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
-                                                   Node->getDebugLoc(),
+                                                   SDLoc(Node),
                                                    MVT::i32, MVT::Other, Ops);
     Result->setMemRefs(MemOp, MemOp + 2);
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index f69f5d8..76eeb64 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16,6 +16,7 @@
 #include "X86ISelLowering.h"
 #include "Utils/X86ShuffleDecode.h"
 #include "X86.h"
+#include "X86CallingConv.h"
 #include "X86InstrBuilder.h"
 #include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
@@ -55,20 +56,17 @@ using namespace llvm;
 STATISTIC(NumTailCalls, "Number of tail calls");
 
 // Forward declarations.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2);
 
-/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
-/// sets things up to match to an AVX VEXTRACTF128 instruction or a
-/// simple subregister reference.  Idx is an index in the 128 bits we
-/// want.  It need not be aligned to a 128-bit bounday.  That makes
-/// lowering EXTRACT_VECTOR_ELT operations easier.
-static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
-                                   SelectionDAG &DAG, DebugLoc dl) {
+static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
+                                SelectionDAG &DAG, SDLoc dl,
+                                unsigned vectorWidth) {
+  assert((vectorWidth == 128 || vectorWidth == 256) &&
+         "Unsupported vector width");
   EVT VT = Vec.getValueType();
-  assert(VT.is256BitVector() && "Unexpected vector size!");
   EVT ElVT = VT.getVectorElementType();
-  unsigned Factor = VT.getSizeInBits()/128;
+  unsigned Factor = VT.getSizeInBits()/vectorWidth;
   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
                                   VT.getVectorNumElements()/Factor);
 
@@ -76,13 +74,12 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
   if (Vec.getOpcode() == ISD::UNDEF)
     return DAG.getUNDEF(ResultVT);
 
-  // Extract the relevant 128 bits.  Generate an EXTRACT_SUBVECTOR
-  // we can match to VEXTRACTF128.
-  unsigned ElemsPerChunk = 128 / ElVT.getSizeInBits();
+  // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
+  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
 
-  // This is the index of the first element of the 128-bit chunk
+  // This is the index of the first element of the vectorWidth-bit chunk
   // we want.
-  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
                                * ElemsPerChunk);
 
   // If the input is a buildvector just emit a smaller one.
@@ -95,38 +92,71 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                VecIdx);
 
   return Result;
+
+}
+/// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
+/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
+/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
+/// instructions or a simple subregister reference. Idx is an index in the
+/// 128 bits we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// lowering EXTRACT_VECTOR_ELT operations easier.
+static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, SDLoc dl) {
+  assert((Vec.getValueType().is256BitVector() ||
+          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
+  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
 }
 
-/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
-/// sets things up to match to an AVX VINSERTF128 instruction or a
-/// simple superregister reference.  Idx is an index in the 128 bits
-/// we want.  It need not be aligned to a 128-bit bounday.  That makes
-/// lowering INSERT_VECTOR_ELT operations easier.
-static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
-                                  unsigned IdxVal, SelectionDAG &DAG,
-                                  DebugLoc dl) {
+/// Generate a DAG to grab 256-bits from a 512-bit vector.
+static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
+                                   SelectionDAG &DAG, SDLoc dl) {
+  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
+  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
+}
+
+static SDValue InsertSubVector(SDValue Result, SDValue Vec,
+                               unsigned IdxVal, SelectionDAG &DAG,
+                               SDLoc dl, unsigned vectorWidth) {
+  assert((vectorWidth == 128 || vectorWidth == 256) &&
+         "Unsupported vector width");
   // Inserting UNDEF is Result
   if (Vec.getOpcode() == ISD::UNDEF)
     return Result;
-
   EVT VT = Vec.getValueType();
-  assert(VT.is128BitVector() && "Unexpected vector size!");
-
   EVT ElVT = VT.getVectorElementType();
   EVT ResultVT = Result.getValueType();
 
-  // Insert the relevant 128 bits.
-  unsigned ElemsPerChunk = 128/ElVT.getSizeInBits();
+  // Insert the relevant vectorWidth bits.
+  unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
 
-  // This is the index of the first element of the 128-bit chunk
+  // This is the index of the first element of the vectorWidth-bit chunk
   // we want.
-  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
+  unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
                                * ElemsPerChunk);
 
   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
                      VecIdx);
 }
+/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
+/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
+/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
+/// simple superregister reference.  Idx is an index in the 128 bits
+/// we want.  It need not be aligned to a 128-bit bounday.  That makes
+/// lowering INSERT_VECTOR_ELT operations easier.
+static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
+                                  unsigned IdxVal, SelectionDAG &DAG,
+                                  SDLoc dl) {
+  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
+  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
+}
+
+static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
+                                  unsigned IdxVal, SelectionDAG &DAG,
+                                  SDLoc dl) {
+  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
+  return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
+}
 
 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
 /// instructions. This is used because creating CONCAT_VECTOR nodes of
@@ -134,11 +164,18 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
 /// large BUILD_VECTORS.
 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
                                    unsigned NumElems, SelectionDAG &DAG,
-                                   DebugLoc dl) {
+                                   SDLoc dl) {
   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
 }
 
+static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
+                                   unsigned NumElems, SelectionDAG &DAG,
+                                   SDLoc dl) {
+  SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
+  return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
+}
+
 static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
   const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
   bool is64Bit = Subtarget->is64Bit();
@@ -163,7 +200,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   Subtarget = &TM.getSubtarget<X86Subtarget>();
   X86ScalarSSEf64 = Subtarget->hasSSE2();
   X86ScalarSSEf32 = Subtarget->hasSSE1();
-  RegInfo = TM.getRegisterInfo();
   TD = getDataLayout();
 
   resetOperationActions();
@@ -202,6 +238,8 @@ void X86TargetLowering::resetOperationActions() {
     setSchedulingPreference(Sched::ILP);
   else
     setSchedulingPreference(Sched::RegPressure);
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
 
   // Bypass expensive divides on Atom when compiling with O2
@@ -562,10 +600,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
   }
 
-  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
-  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
-  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
-  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
   if (Subtarget->is64Bit()) {
     setExceptionPointerRegister(X86::RAX);
     setExceptionSelectorRegister(X86::RDX);
@@ -585,10 +619,12 @@ void X86TargetLowering::resetOperationActions() {
   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
-  if (Subtarget->is64Bit()) {
+  if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
+    // TargetInfo::X86_64ABIBuiltinVaList
     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
   } else {
+    // TargetInfo::CharPtrBuiltinVaList
     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
   }
@@ -596,7 +632,7 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
-  if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho())
+  if (Subtarget->isOSWindows() && !Subtarget->isTargetEnvMacho())
     setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
                        MVT::i64 : MVT::i32, Custom);
   else if (TM.Options.EnableSegmentedStacks)
@@ -999,7 +1035,7 @@ void X86TargetLowering::resetOperationActions() {
     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
   }
 
-  if (Subtarget->hasSSE41()) {
+  if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
@@ -1115,9 +1151,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
 
-    setOperationAction(ISD::TRUNCATE,           MVT::v8i16, Custom);
-    setOperationAction(ISD::TRUNCATE,           MVT::v4i32, Custom);
-
     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Custom);
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
@@ -1125,7 +1158,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
 
-    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i32, Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
 
@@ -1158,10 +1190,16 @@ void X86TargetLowering::resetOperationActions() {
 
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
+    setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
+    setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
 
     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
@@ -1262,6 +1300,152 @@ void X86TargetLowering::resetOperationActions() {
     }
   }
 
+  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
+    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
+    addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
+    addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
+
+    addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
+    addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
+
+    setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
+
+    setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
+
+    setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
+    setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
+    setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
+    setOperationAction(ISD::SDIV,               MVT::v16i32, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
+      setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
+      setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
+      setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
+    }
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
+    setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
+
+    setOperationAction(ISD::TRUNCATE,           MVT::i1, Legal);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
+    setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
+    setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
+    setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
+
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
+
+    setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
+    setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
+
+    setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
+
+    setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
+    setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
+
+    setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
+    setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
+
+    setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
+
+    setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
+    setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
+
+    setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
+    setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
+
+    setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
+    setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
+
+    setOperationAction(ISD::AND,                MVT::v8i64, Legal);
+    setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
+    setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
+    setOperationAction(ISD::AND,                MVT::v16i32, Legal);
+    setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
+    setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
+
+    // Custom lower several nodes.
+    for (int i = MVT::FIRST_VECTOR_VALUETYPE;
+             i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+      // Extract subvector is special because the value type
+      // (result) is 256/128-bit but the source is 512-bit wide.
+      if (VT.is128BitVector() || VT.is256BitVector())
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+      if (VT.getVectorElementType() == MVT::i1)
+        setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
+      // Do not attempt to custom lower other non-512-bit vectors
+      if (!VT.is512BitVector())
+        continue;
+
+      if ( EltSize >= 32) {
+        setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
+        setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
+        setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
+        setOperationAction(ISD::VSELECT,             VT, Legal);
+        setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
+        setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
+        setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
+      }
+    }
+    for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
+      MVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to promote non-256-bit vectors
+      if (!VT.is512BitVector())
+        continue;
+
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
+    }
+  }// has  AVX-512
+
   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
   // of this type with custom code.
   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
@@ -1273,6 +1457,7 @@ void X86TargetLowering::resetOperationActions() {
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   // handle type legalization for these operations here.
@@ -1361,8 +1546,17 @@ void X86TargetLowering::resetOperationActions() {
   setPrefFunctionAlignment(4); // 2^4 bytes.
 }
 
-EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
-  if (!VT.isVector()) return MVT::i8;
+EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i8;
+
+  const TargetMachine &TM = getTargetMachine();
+  if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512())
+    switch(VT.getVectorNumElements()) {
+    case  8: return MVT::v8i1;
+    case 16: return MVT::v16i1;
+    }
+
   return VT.changeVectorElementTypeToInteger();
 }
 
@@ -1504,9 +1698,9 @@ X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                     SelectionDAG &DAG) const {
   if (!Subtarget->is64Bit())
-    // This doesn't have DebugLoc associated with it, but is not really the
+    // This doesn't have SDLoc associated with it, but is not really the
     // same as a Register.
-    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy());
+    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
   return Table;
 }
 
@@ -1571,6 +1765,13 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
   return true;
 }
 
+bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
+                                            unsigned DestAS) const {
+  assert(SrcAS != DestAS && "Expected different address spaces!");
+
+  return SrcAS < 256 && DestAS < 256;
+}
+
 //===----------------------------------------------------------------------===//
 //               Return Value Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -1588,12 +1789,17 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
 
+const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+  return ScratchRegs;
+}
+
 SDValue
 X86TargetLowering::LowerReturn(SDValue Chain,
                                CallingConv::ID CallConv, bool isVarArg,
                                const SmallVectorImpl<ISD::OutputArg> &Outs,
                                const SmallVectorImpl<SDValue> &OutVals,
-                               DebugLoc dl, SelectionDAG &DAG) const {
+                               SDLoc dl, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 
@@ -1761,7 +1967,7 @@ SDValue
 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
-                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SDLoc dl, SelectionDAG &DAG,
                                    SmallVectorImpl<SDValue> &InVals) const {
 
   // Assign locations to each value returned by this call.
@@ -1868,7 +2074,7 @@ argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
 static SDValue
 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
-                          DebugLoc dl) {
+                          SDLoc dl) {
   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
 
   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
@@ -1883,13 +2089,19 @@ static bool IsTailCallConvention(CallingConv::ID CC) {
           CC == CallingConv::HiPE);
 }
 
+/// \brief Return true if the calling convention is a C calling convention.
+static bool IsCCallConvention(CallingConv::ID CC) {
+  return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
+          CC == CallingConv::X86_64_SysV);
+}
+
 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
     return false;
 
   CallSite CS(CI);
   CallingConv::ID CalleeCC = CS.getCallingConv();
-  if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C)
+  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
     return false;
 
   return true;
@@ -1906,7 +2118,7 @@ SDValue
 X86TargetLowering::LowerMemArgument(SDValue Chain,
                                     CallingConv::ID CallConv,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     const CCValAssign &VA,
                                     MachineFrameInfo *MFI,
                                     unsigned i) const {
@@ -1948,7 +2160,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                         CallingConv::ID CallConv,
                                         bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                        DebugLoc dl,
+                                        SDLoc dl,
                                         SelectionDAG &DAG,
                                         SmallVectorImpl<SDValue> &InVals)
                                           const {
@@ -1964,7 +2176,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
   MachineFrameInfo *MFI = MF.getFrameInfo();
   bool Is64Bit = Subtarget->is64Bit();
   bool IsWindows = Subtarget->isTargetWindows();
-  bool IsWin64 = Subtarget->isTargetWin64();
+  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
 
   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
          "Var args not supported with calling convention fastcc, ghc or hipe");
@@ -1975,9 +2187,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
                  ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
-  if (IsWin64) {
+  if (IsWin64)
     CCInfo.AllocateStack(32, 8);
-  }
 
   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
 
@@ -2003,12 +2214,18 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
         RC = &X86::FR32RegClass;
       else if (RegVT == MVT::f64)
         RC = &X86::FR64RegClass;
+      else if (RegVT.is512BitVector())
+        RC = &X86::VR512RegClass;
       else if (RegVT.is256BitVector())
         RC = &X86::VR256RegClass;
       else if (RegVT.is128BitVector())
         RC = &X86::VR128RegClass;
       else if (RegVT == MVT::x86mmx)
         RC = &X86::VR64RegClass;
+      else if (RegVT == MVT::v8i1)
+        RC = &X86::VK8RegClass;
+      else if (RegVT == MVT::v16i1)
+        RC = &X86::VK16RegClass;
       else
         llvm_unreachable("Unknown argument type!");
 
@@ -2225,7 +2442,7 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
 SDValue
 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                     SDValue StackPtr, SDValue Arg,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     const CCValAssign &VA,
                                     ISD::ArgFlagsTy Flags) const {
   unsigned LocMemOffset = VA.getLocMemOffset();
@@ -2245,7 +2462,7 @@ SDValue
 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                            SDValue &OutRetAddr, SDValue Chain,
                                            bool IsTailCall, bool Is64Bit,
-                                           int FPDiff, DebugLoc dl) const {
+                                           int FPDiff, SDLoc dl) const {
   // Adjust the Return address stack slot.
   EVT VT = getPointerTy();
   OutRetAddr = getReturnAddressFrameIndex(DAG);
@@ -2261,12 +2478,13 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
 static SDValue
 EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
                          SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
-                         unsigned SlotSize, int FPDiff, DebugLoc dl) {
+                         unsigned SlotSize, int FPDiff, SDLoc dl) {
   // Store the return address to the appropriate stack slot.
   if (!FPDiff) return Chain;
   // Calculate the new stack slot for the return address.
   int NewReturnAddrFI =
-    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
+    MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
+                                         false);
   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
@@ -2278,10 +2496,10 @@ SDValue
 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   CallingConv::ID CallConv              = CLI.CallConv;
@@ -2290,7 +2508,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   MachineFunction &MF = DAG.getMachineFunction();
   bool Is64Bit        = Subtarget->is64Bit();
-  bool IsWin64        = Subtarget->isTargetWin64();
+  bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
   bool IsWindows      = Subtarget->isTargetWindows();
   StructReturnType SR = callIsStructReturn(Outs);
   bool IsSibcall      = false;
@@ -2323,9 +2541,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                  ArgLocs, *DAG.getContext());
 
   // Allocate shadow area for Win64
-  if (IsWin64) {
+  if (IsWin64)
     CCInfo.AllocateStack(32, 8);
-  }
 
   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
 
@@ -2354,7 +2571,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   if (!IsSibcall)
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                                 dl);
 
   SDValue RetAddrFrIdx;
   // Load return address for tail calls.
@@ -2368,6 +2586,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization arguments are handle later.
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
     EVT RegVT = VA.getLocVT();
@@ -2443,7 +2663,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // GOT pointer.
     if (!isTailCall) {
       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
-               DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), getPointerTy())));
+               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
     } else {
       // If we are tail calling and generating PIC/GOT style code load the
       // address of the callee into ECX. The value in ecx is used as target of
@@ -2640,7 +2860,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   if (!IsSibcall && isTailCall) {
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                           DAG.getIntPtrConstant(0, true), InFlag);
+                           DAG.getIntPtrConstant(0, true), InFlag, dl);
     InFlag = Chain.getValue(1);
   }
 
@@ -2699,7 +2919,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                DAG.getIntPtrConstant(NumBytes, true),
                                DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                      true),
-                               InFlag);
+                               InFlag, dl);
     InFlag = Chain.getValue(1);
   }
 
@@ -2747,6 +2967,8 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                SelectionDAG& DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
   const TargetMachine &TM = MF.getTarget();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   unsigned StackAlignment = TFI.getStackAlignment();
   uint64_t AlignMask = StackAlignment - 1;
@@ -2831,13 +3053,12 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
                                                      SelectionDAG &DAG) const {
-  if (!IsTailCallConvention(CalleeCC) &&
-      CalleeCC != CallingConv::C)
+  if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
     return false;
 
   // If -tailcallopt is specified, make fastcc functions tail-callable.
   const MachineFunction &MF = DAG.getMachineFunction();
-  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  const Function *CallerF = MF.getFunction();
 
   // If the function return type is x86_fp80 and the callee return type is not,
   // then the FP_EXTEND of the call result is not a nop. It's not safe to
@@ -2847,6 +3068,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
 
   CallingConv::ID CallerCC = CallerF->getCallingConv();
   bool CCMatch = CallerCC == CalleeCC;
+  bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
+  bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
 
   if (getTargetMachine().Options.GuaranteedTailCallOpt) {
     if (IsTailCallConvention(CalleeCC) && CCMatch)
@@ -2859,6 +3082,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
 
   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
   // emit a special epilogue.
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   if (RegInfo->needsStackRealignment(MF))
     return false;
 
@@ -2878,7 +3103,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
 
     // Optimizing for varargs on Win64 is unlikely to be safe without
     // additional testing.
-    if (Subtarget->isTargetWin64())
+    if (IsCalleeWin64 || IsCallerWin64)
       return false;
 
     SmallVector<CCValAssign, 16> ArgLocs;
@@ -2953,9 +3178,8 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                    getTargetMachine(), ArgLocs, *DAG.getContext());
 
     // Allocate shadow area for Win64
-    if (Subtarget->isTargetWin64()) {
+    if (IsCalleeWin64)
       CCInfo.AllocateStack(32, 8);
-    }
 
     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
     if (CCInfo.getNextStackOffset()) {
@@ -3062,7 +3286,7 @@ static bool isTargetShuffle(unsigned Opcode) {
   }
 }
 
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
                                     SDValue V1, SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3073,7 +3297,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   }
 }
 
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
                                     SDValue V1, unsigned TargetMask,
                                     SelectionDAG &DAG) {
   switch(Opc) {
@@ -3087,7 +3311,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   }
 }
 
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
                                     SDValue V1, SDValue V2, unsigned TargetMask,
                                     SelectionDAG &DAG) {
   switch(Opc) {
@@ -3100,7 +3324,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
   }
 }
 
-static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
   switch(Opc) {
   default: llvm_unreachable("Unknown x86 shuffle node");
@@ -3119,13 +3343,16 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
 
 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
   int ReturnAddrIndex = FuncInfo->getRAIndex();
 
   if (ReturnAddrIndex == 0) {
     // Set up a frame object for the return address.
     unsigned SlotSize = RegInfo->getSlotSize();
-    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
+                                                           -(int64_t)SlotSize,
                                                            false);
     FuncInfo->setRAIndex(ReturnAddrIndex);
   }
@@ -3332,7 +3559,7 @@ static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
 /// the second operand.
-static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
+static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT) {
   if (VT == MVT::v4f32 || VT == MVT::v4i32 )
     return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
   if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -3342,7 +3569,7 @@ static bool isPSHUFDMask(ArrayRef<int> Mask, EVT VT) {
 
 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFHW.
-static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
     return false;
 
@@ -3371,7 +3598,7 @@ static bool isPSHUFHWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 
 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PSHUFLW.
-static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
     return false;
 
@@ -3400,14 +3627,14 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 
 /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
 /// is suitable for input to PALIGNR.
-static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
+static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
                           const X86Subtarget *Subtarget) {
   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
       (VT.is256BitVector() && !Subtarget->hasInt256()))
     return false;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   // Do not handle 64-bit element shuffles with palignr.
@@ -3490,10 +3717,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
 /// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
-                        bool Commuted = false) {
-  if (!HasFp256 && VT.is256BitVector())
-    return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
 
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3502,6 +3726,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
   if (NumLaneElems != 2 && NumLaneElems != 4)
     return false;
 
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  bool symetricMaskRequired =
+    (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
   // VSHUFPSY divides the resulting vector into 4 chunks.
   // The sources are also splitted into 4 chunks, and each destination
   // chunk must come from a different source chunk.
@@ -3521,6 +3749,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
   //
   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
   //
+  SmallVector<int, 4> MaskVal(NumLaneElems, -1);
   unsigned HalfLaneElems = NumLaneElems/2;
   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
     for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -3531,9 +3760,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
       // For VSHUFPSY, the mask of the second half must be the same as the
       // first but with the appropriate offsets. This works in the same way as
       // VPERMILPS works with masks.
-      if (NumElems != 8 || l == 0 || Mask[i] < 0)
+      if (!symetricMaskRequired || Idx < 0)
         continue;
-      if (!isUndefOrEqual(Idx, Mask[i]+l))
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Idx - l;
+        continue;
+      }
+      if ((signed)(Idx - l) != MaskVal[i])
         return false;
     }
   }
@@ -3543,7 +3776,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
 
 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
-static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3562,7 +3795,7 @@ static bool isMOVHLPSMask(ArrayRef<int> Mask, EVT VT) {
 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
 /// <2, 3, 2, 3>
-static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3579,7 +3812,7 @@ static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, EVT VT) {
 
 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
-static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3601,7 +3834,7 @@ static bool isMOVLPMask(ArrayRef<int> Mask, EVT VT) {
 
 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
-static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -3627,8 +3860,8 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
 static
 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
                                SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
-  DebugLoc dl = SVOp->getDebugLoc();
+  MVT VT = SVOp->getSimpleValueType(0);
+  SDLoc dl(SVOp);
 
   if (VT != MVT::v8i32 && VT != MVT::v8f32)
     return SDValue();
@@ -3670,73 +3903,92 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
 
 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
-static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  unsigned NumElts = VT.getVectorNumElements();
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
-         "Unsupported vector type for unpckh");
+  assert(VT.getSizeInBits() >= 128 &&
+         "Unsupported vector type for unpckl");
 
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumLanes; ++l) {
-    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
-         i != (l+1)*NumLaneElts;
-         i += 2, ++j) {
-      int BitI  = Mask[i];
-      int BitI1 = Mask[i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (!isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
           return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j + NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
           return false;
       }
     }
   }
-
   return true;
 }
 
 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
-static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
+static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
                          bool HasInt256, bool V2IsSplat = false) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert(VT.getSizeInBits() >= 128 &&
          "Unsupported vector type for unpckh");
 
-  if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
-      (!HasInt256 || (NumElts != 16 && NumElts != 32)))
+  // AVX defines UNPCK* to operate independently on 128-bit lanes.
+  unsigned NumLanes;
+  unsigned NumOf256BitLanes;
+  unsigned NumElts = VT.getVectorNumElements();
+  if (VT.is256BitVector()) {
+    if (NumElts != 4 && NumElts != 8 &&
+        (!HasInt256 || (NumElts != 16 && NumElts != 32)))
     return false;
+    NumLanes = 2;
+    NumOf256BitLanes = 1;
+  } else if (VT.is512BitVector()) {
+    assert(VT.getScalarType().getSizeInBits() >= 32 &&
+           "Unsupported vector type for unpckh");
+    NumLanes = 2;
+    NumOf256BitLanes = 2;
+  } else {
+    NumLanes = 1;
+    NumOf256BitLanes = 1;
+  }
 
-  // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
-  // independently on 128-bit lanes.
-  unsigned NumLanes = VT.getSizeInBits()/128;
-  unsigned NumLaneElts = NumElts/NumLanes;
+  unsigned NumEltsInStride = NumElts/NumOf256BitLanes;
+  unsigned NumLaneElts = NumEltsInStride/NumLanes;
 
-  for (unsigned l = 0; l != NumLanes; ++l) {
-    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
-         i != (l+1)*NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[i];
-      int BitI1 = Mask[i+1];
-      if (!isUndefOrEqual(BitI, j))
-        return false;
-      if (V2IsSplat) {
-        if (isUndefOrEqual(BitI1, NumElts))
+  for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) {
+    for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) {
+      for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+        int BitI  = Mask[l256*NumEltsInStride+l+i];
+        int BitI1 = Mask[l256*NumEltsInStride+l+i+1];
+        if (!isUndefOrEqual(BitI, j+l256*NumElts))
           return false;
-      } else {
-        if (!isUndefOrEqual(BitI1, j+NumElts))
+        if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts))
+          return false;
+        if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride))
           return false;
       }
     }
@@ -3747,10 +3999,12 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
 /// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   unsigned NumElts = VT.getVectorNumElements();
   bool Is256BitVec = VT.is256BitVector();
 
+  if (VT.is512BitVector())
+    return false;
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
@@ -3770,12 +4024,10 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
-  for (unsigned l = 0; l != NumLanes; ++l) {
-    for (unsigned i = l*NumLaneElts, j = l*NumLaneElts;
-         i != (l+1)*NumLaneElts;
-         i += 2, ++j) {
-      int BitI  = Mask[i];
-      int BitI1 = Mask[i+1];
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
 
       if (!isUndefOrEqual(BitI, j))
         return false;
@@ -3790,9 +4042,12 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
 /// <2, 2, 3, 3>
-static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
+static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   unsigned NumElts = VT.getVectorNumElements();
 
+  if (VT.is512BitVector())
+    return false;
+
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for unpckh");
 
@@ -3805,11 +4060,10 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
-  for (unsigned l = 0; l != NumLanes; ++l) {
-    for (unsigned i = l*NumLaneElts, j = (l*NumLaneElts)+NumLaneElts/2;
-         i != (l+1)*NumLaneElts; i += 2, ++j) {
-      int BitI  = Mask[i];
-      int BitI1 = Mask[i+1];
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
+      int BitI  = Mask[l+i];
+      int BitI1 = Mask[l+i+1];
       if (!isUndefOrEqual(BitI, j))
         return false;
       if (!isUndefOrEqual(BitI1, j))
@@ -3846,7 +4100,7 @@ static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
 /// The first half comes from the second half of V1 and the second half from the
 /// the second half of V2.
-static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   if (!HasFp256 || !VT.is256BitVector())
     return false;
 
@@ -3878,7 +4132,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   unsigned HalfSize = VT.getVectorNumElements()/2;
 
@@ -3899,6 +4153,44 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
   return (FstHalf | (SndHalf << 4));
 }
 
+// Symetric in-lane mask. Each lane has 4 elements (for imm8)
+static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (EltSize < 32)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  Imm8 = 0;
+  if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (Mask[i] < 0)
+        continue;
+      Imm8 |= Mask[i] << (i*2);
+    }
+    return true;
+  }
+
+  unsigned LaneSize = 4;
+  SmallVector<int, 4> MaskVal(LaneSize, -1);
+
+  for (unsigned l = 0; l != NumElts; l += LaneSize) {
+    for (unsigned i = 0; i != LaneSize; ++i) {
+      if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
+        return false;
+      if (Mask[i+l] < 0)
+        continue;
+      if (MaskVal[i] < 0) {
+        MaskVal[i] = Mask[i+l] - l;
+        Imm8 |= MaskVal[i] << (i*2);
+        continue;
+      }
+      if (Mask[i+l] != (signed)(MaskVal[i]+l))
+        return false;
+    }
+  }
+  return true;
+}
+
 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
 /// Note that VPERMIL mask matching is different depending whether theunderlying
@@ -3906,38 +4198,39 @@ static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
 /// to the same elements of the low, but to the higher half of the source.
 /// In VPERMILPD the two lanes could be shuffled independently of each other
 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
-  if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+  if (VT.getSizeInBits() < 256 || EltSize < 32)
     return false;
-
+  bool symetricMaskRequired = (EltSize == 32);
   unsigned NumElts = VT.getVectorNumElements();
-  // Only match 256-bit with 32/64-bit types
-  if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
-    return false;
 
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned LaneSize = NumElts/NumLanes;
+  // 2 or 4 elements in one lane
+  
+  SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
   for (unsigned l = 0; l != NumElts; l += LaneSize) {
     for (unsigned i = 0; i != LaneSize; ++i) {
       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
         return false;
-      if (NumElts != 8 || l == 0)
-        continue;
-      // VPERMILPS handling
-      if (Mask[i] < 0)
-        continue;
-      if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
-        return false;
+      if (symetricMaskRequired) {
+        if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+          ExpectedMaskVal[i] = Mask[i+l] - l;
+          continue;
+        }
+        if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+          return false;
+      }
     }
   }
-
   return true;
 }
 
 /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse
 /// of what x86 movss want. X86 movs requires the lowest  element to be lowest
 /// element of vector 2 and the other elements to come from vector 1 in order.
-static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
+static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
                                bool V2IsSplat = false, bool V2IsUndef = false) {
   if (!VT.is128BitVector())
     return false;
@@ -3961,7 +4254,7 @@ static bool isCommutedMOVLMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
-static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
                            const X86Subtarget *Subtarget) {
   if (!Subtarget->hasSSE3())
     return false;
@@ -3969,7 +4262,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
   if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8))
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
     return false;
 
   // "i+1" is the value the indexed mask element must have
@@ -3984,7 +4278,7 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
-static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
+static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
                            const X86Subtarget *Subtarget) {
   if (!Subtarget->hasSSE3())
     return false;
@@ -3992,7 +4286,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
   unsigned NumElems = VT.getVectorNumElements();
 
   if ((VT.is128BitVector() && NumElems != 4) ||
-      (VT.is256BitVector() && NumElems != 8))
+      (VT.is256BitVector() && NumElems != 8) ||
+      (VT.is512BitVector() && NumElems != 16))
     return false;
 
   // "i" is the value the indexed mask element must have
@@ -4007,7 +4302,7 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 256-bit
 /// version of MOVDDUP.
-static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
+static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
   if (!HasFp256 || !VT.is256BitVector())
     return false;
 
@@ -4027,7 +4322,7 @@ static bool isMOVDDUPYMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to 128-bit
 /// version of MOVDDUP.
-static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
+static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -4041,49 +4336,66 @@ static bool isMOVDDUPMask(ArrayRef<int> Mask, EVT VT) {
   return true;
 }
 
-/// isVEXTRACTF128Index - Return true if the specified
+/// isVEXTRACTIndex - Return true if the specified
 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
-/// suitable for input to VEXTRACTF128.
-bool X86::isVEXTRACTF128Index(SDNode *N) {
+/// suitable for instruction that extract 128 or 256 bit vectors
+static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
     return false;
 
-  // The index should be aligned on a 128-bit boundary.
+  // The index should be aligned on a vecWidth-bit boundary.
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
 
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
-  bool Result = (Index * ElSize) % 128 == 0;
+  bool Result = (Index * ElSize) % vecWidth == 0;
 
   return Result;
 }
 
-/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
+/// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
 /// operand specifies a subvector insert that is suitable for input to
-/// VINSERTF128.
-bool X86::isVINSERTF128Index(SDNode *N) {
+/// insertion of 128 or 256-bit subvectors
+static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
     return false;
-
-  // The index should be aligned on a 128-bit boundary.
+  // The index should be aligned on a vecWidth-bit boundary.
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
 
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
-  bool Result = (Index * ElSize) % 128 == 0;
+  bool Result = (Index * ElSize) % vecWidth == 0;
 
   return Result;
 }
 
+bool X86::isVINSERT128Index(SDNode *N) {
+  return isVINSERTIndex(N, 128);
+}
+
+bool X86::isVINSERT256Index(SDNode *N) {
+  return isVINSERTIndex(N, 256);
+}
+
+bool X86::isVEXTRACT128Index(SDNode *N) {
+  return isVEXTRACTIndex(N, 128);
+}
+
+bool X86::isVEXTRACT256Index(SDNode *N) {
+  return isVEXTRACTIndex(N, 256);
+}
+
 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
 /// Handles 128-bit and 256-bit.
 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert((VT.getSizeInBits() >= 128) &&
          "Unsupported vector type for PSHUF/SHUFP");
 
   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
@@ -4092,10 +4404,10 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
   unsigned NumLanes = VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
-  assert((NumLaneElts == 2 || NumLaneElts == 4) &&
-         "Only supports 2 or 4 elements per lane");
+  assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
+         "Only supports 2, 4 or 8 elements per lane");
 
-  unsigned Shift = (NumLaneElts == 4) ? 1 : 0;
+  unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
   unsigned Mask = 0;
   for (unsigned i = 0; i != NumElts; ++i) {
     int Elt = N->getMaskElt(i);
@@ -4111,7 +4423,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
          "Unsupported vector type for PSHUFHW");
@@ -4135,7 +4447,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
+  MVT VT = N->getSimpleValueType(0);
 
   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
          "Unsupported vector type for PSHUFHW");
@@ -4159,11 +4471,12 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
 /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
 /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+  MVT VT = SVOp->getSimpleValueType(0);
+  unsigned EltSize = VT.is512BitVector() ? 1 :
+    VT.getVectorElementType().getSizeInBits() >> 3;
 
   unsigned NumElts = VT.getVectorNumElements();
-  unsigned NumLanes = VT.getSizeInBits()/128;
+  unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
   unsigned NumLaneElts = NumElts/NumLanes;
 
   int Val = 0;
@@ -4180,61 +4493,64 @@ static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
   return (Val - i) * EltSize;
 }
 
-/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
-/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
-/// instructions.
-unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
+static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
-    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");
+    llvm_unreachable("Illegal extract subvector for VEXTRACT");
 
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
 
-  MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+  MVT VecVT = N->getOperand(0).getSimpleValueType();
   MVT ElVT = VecVT.getVectorElementType();
 
-  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   return Index / NumElemsPerChunk;
 }
 
-/// getInsertVINSERTF128Immediate - Return the appropriate immediate
-/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
-/// instructions.
-unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
+static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
+  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
-    llvm_unreachable("Illegal insert subvector for VINSERTF128");
+    llvm_unreachable("Illegal insert subvector for VINSERT");
 
   uint64_t Index =
     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
 
-  MVT VecVT = N->getValueType(0).getSimpleVT();
+  MVT VecVT = N->getSimpleValueType(0);
   MVT ElVT = VecVT.getVectorElementType();
 
-  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
+  unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
   return Index / NumElemsPerChunk;
 }
 
-/// getShuffleCLImmediate - Return the appropriate immediate to shuffle
-/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
-/// Handles 256-bit.
-static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
-  MVT VT = N->getValueType(0).getSimpleVT();
-
-  unsigned NumElts = VT.getVectorNumElements();
+/// getExtractVEXTRACT128Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
+  return getExtractVEXTRACTImmediate(N, 128);
+}
 
-  assert((VT.is256BitVector() && NumElts == 4) &&
-         "Unsupported vector type for VPERMQ/VPERMPD");
+/// getExtractVEXTRACT256Immediate - Return the appropriate immediate
+/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
+  return getExtractVEXTRACTImmediate(N, 256);
+}
 
-  unsigned Mask = 0;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    int Elt = N->getMaskElt(i);
-    if (Elt < 0)
-      continue;
-    Mask |= Elt << (i*2);
-  }
+/// getInsertVINSERT128Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
+/// and VINSERTI128 instructions.
+unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
+  return getInsertVINSERTImmediate(N, 128);
+}
 
-  return Mask;
+/// getInsertVINSERT256Immediate - Return the appropriate immediate
+/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4
+/// and VINSERTI64x4 instructions.
+unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
+  return getInsertVINSERTImmediate(N, 256);
 }
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
@@ -4249,7 +4565,7 @@ bool X86::isZeroNode(SDValue Elt) {
 /// their permute mask.
 static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                     SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> MaskVec;
 
@@ -4263,7 +4579,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
     }
     MaskVec.push_back(Idx);
   }
-  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+  return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1),
                               SVOp->getOperand(0), &MaskVec[0]);
 }
 
@@ -4271,7 +4587,7 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
 /// match movhlps. The lower half elements should come from upper half of
 /// V1 (and in order), and the upper half elements should come from the upper
 /// half of V2 (and in order).
-static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, EVT VT) {
+static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
   if (VT.getVectorNumElements() != 4)
@@ -4328,7 +4644,7 @@ static bool WillBeConstantPoolLoad(SDNode *N) {
 /// half of V2 (and in order). And since V1 will become the source of the
 /// MOVLP, it must be either a vector load or a scalar load to vector.
 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
-                               ArrayRef<int> Mask, EVT VT) {
+                               ArrayRef<int> Mask, MVT VT) {
   if (!VT.is128BitVector())
     return false;
 
@@ -4396,7 +4712,7 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
 /// getZeroVector - Returns a vector of specified type with all zero elements.
 ///
 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
-                             SelectionDAG &DAG, DebugLoc dl) {
+                             SelectionDAG &DAG, SDLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
 
   // Always build SSE zero vectors as <4 x i32> bitcasted
@@ -4424,6 +4740,11 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
                         array_lengthof(Ops));
     }
+  } else if (VT.is512BitVector()) { // AVX-512
+      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+      SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                        Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
   } else
     llvm_unreachable("Unexpected vector type");
 
@@ -4435,7 +4756,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
 /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately.
 /// Then bitcast to their original type, ensuring they get CSE'd.
 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
-                             DebugLoc dl) {
+                             SDLoc dl) {
   assert(VT.isVector() && "Expected a vector type");
 
   SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
@@ -4469,7 +4790,7 @@ static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
 
 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
 /// operation of specified width.
-static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
                        SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> Mask;
@@ -4480,7 +4801,7 @@ static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
 }
 
 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
-static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
                           SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> Mask;
@@ -4492,7 +4813,7 @@ static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
 }
 
 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
-static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
                           SDValue V2) {
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 8> Mask;
@@ -4508,9 +4829,9 @@ static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
 // Generate shuffles which repeat i16 and i8 several times until they can be
 // represented by v4f32 and then be manipulated by target suported shuffles.
 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
-  EVT VT = V.getValueType();
+  MVT VT = V.getSimpleValueType();
   int NumElems = VT.getVectorNumElements();
-  DebugLoc dl = V.getDebugLoc();
+  SDLoc dl(V);
 
   while (NumElems > 4) {
     if (EltNo < NumElems/2) {
@@ -4526,8 +4847,8 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
 
 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
-  EVT VT = V.getValueType();
-  DebugLoc dl = V.getDebugLoc();
+  MVT VT = V.getSimpleValueType();
+  SDLoc dl(V);
 
   if (VT.is128BitVector()) {
     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
@@ -4552,9 +4873,9 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
 
 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
-  EVT SrcVT = SV->getValueType(0);
+  MVT SrcVT = SV->getSimpleValueType(0);
   SDValue V1 = SV->getOperand(0);
-  DebugLoc dl = SV->getDebugLoc();
+  SDLoc dl(SV);
 
   int EltNo = SV->getSplatIndex();
   int NumElems = SrcVT.getVectorNumElements();
@@ -4575,7 +4896,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
   // instruction because the target has no such instruction. Generate shuffles
   // which repeat i16 and i8 several times until they fit in i32, and then can
   // be manipulated by target suported shuffles.
-  EVT EltVT = SrcVT.getVectorElementType();
+  MVT EltVT = SrcVT.getVectorElementType();
   if (EltVT == MVT::i8 || EltVT == MVT::i16)
     V1 = PromoteSplati8i16(V1, DAG, EltNo);
 
@@ -4597,15 +4918,15 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                            bool IsZero,
                                            const X86Subtarget *Subtarget,
                                            SelectionDAG &DAG) {
-  EVT VT = V2.getValueType();
+  MVT VT = V2.getSimpleValueType();
   SDValue V1 = IsZero
-    ? getZeroVector(VT, Subtarget, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+    ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
   unsigned NumElems = VT.getVectorNumElements();
   SmallVector<int, 16> MaskVec;
   for (unsigned i = 0; i != NumElems; ++i)
     // If this is the insertion idx, put the low elt of V2 here.
     MaskVec.push_back(i == Idx ? NumElems : i);
-  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+  return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
 }
 
 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
@@ -4715,7 +5036,7 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
 
   // Recurse into target specific vector shuffles to find scalars.
   if (isTargetShuffle(Opcode)) {
-    MVT ShufVT = V.getValueType().getSimpleVT();
+    MVT ShufVT = V.getSimpleValueType();
     unsigned NumElems = ShufVT.getVectorNumElements();
     SmallVector<int, 16> ShuffleMask;
     bool IsUnary;
@@ -4756,19 +5077,27 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
 /// shuffle operation which come from a consecutively from a zero. The
 /// search can start in two different directions, from left or right.
-static
-unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems,
-                                  bool ZerosFromLeft, SelectionDAG &DAG) {
-  unsigned i;
-  for (i = 0; i != NumElems; ++i) {
-    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
+/// We count undefs as zeros until PreferredNum is reached.
+static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
+                                         unsigned NumElems, bool ZerosFromLeft,
+                                         SelectionDAG &DAG,
+                                         unsigned PreferredNum = -1U) {
+  unsigned NumZeros = 0;
+  for (unsigned i = 0; i != NumElems; ++i) {
+    unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
-    if (!(Elt.getNode() &&
-         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
+    if (!Elt.getNode())
+      break;
+
+    if (X86::isZeroNode(Elt))
+      ++NumZeros;
+    else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
+      NumZeros = std::min(NumZeros + 1, PreferredNum);
+    else
       break;
   }
 
-  return i;
+  return NumZeros;
 }
 
 /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE)
@@ -4805,9 +5134,11 @@ bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
 /// logical left shift of a vector.
 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
-  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
-              false /* check zeros from right */, DAG);
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(
+      SVOp, NumElems, false /* check zeros from right */, DAG,
+      SVOp->getMaskElt(0));
   unsigned OpSrc;
 
   if (!NumZeros)
@@ -4838,9 +5169,11 @@ static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
 /// logical left shift of a vector.
 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
-  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
-  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
-              true /* check zeros from left */, DAG);
+  unsigned NumElems =
+    SVOp->getSimpleValueType(0).getVectorNumElements();
+  unsigned NumZeros = getNumOfConsecutiveZeros(
+      SVOp, NumElems, true /* check zeros from left */, DAG,
+      NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
   unsigned OpSrc;
 
   if (!NumZeros)
@@ -4873,7 +5206,7 @@ static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
   // Although the logic below support any bitwidth size, there are no
   // shift instructions which handle more than 128-bit vectors.
-  if (!SVOp->getValueType(0).is128BitVector())
+  if (!SVOp->getSimpleValueType(0).is128BitVector())
     return false;
 
   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
@@ -4893,7 +5226,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   if (NumNonZero > 8)
     return SDValue();
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue V(0, 0);
   bool First = true;
   for (unsigned i = 0; i < 16; ++i) {
@@ -4941,7 +5274,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   if (NumNonZero > 4)
     return SDValue();
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue V(0, 0);
   bool First = true;
   for (unsigned i = 0; i < 8; ++i) {
@@ -4967,7 +5300,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
 ///
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                          unsigned NumBits, SelectionDAG &DAG,
-                         const TargetLowering &TLI, DebugLoc dl) {
+                         const TargetLowering &TLI, SDLoc dl) {
   assert(VT.is128BitVector() && "Unknown type for VShift");
   EVT ShVT = MVT::v2i64;
   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
@@ -4978,9 +5311,8 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
 }
 
-SDValue
-X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
-                                          SelectionDAG &DAG) const {
+static SDValue
+LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
 
   // Check if the scalar load can be widened into a vector load. And if
   // the address is "base + cst" see if the cst can be "absorbed" into
@@ -5032,7 +5364,7 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
       return SDValue();
     int64_t StartOffset = Offset & ~(RequiredAlign-1);
     if (StartOffset)
-      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+      Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
 
     int EltNo = (Offset - StartOffset) >> 2;
@@ -5063,7 +5395,8 @@ X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
 /// There's even a handy isZeroNode for that purpose.
 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
-                                        DebugLoc &DL, SelectionDAG &DAG) {
+                                        SDLoc &DL, SelectionDAG &DAG,
+                                        bool isAfterLegalize) {
   EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
@@ -5099,15 +5432,33 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   // load of the entire vector width starting at the base pointer.  If we found
   // consecutive loads for the low half, generate a vzext_load node.
   if (LastLoadedElt == NumElems - 1) {
+
+    if (isAfterLegalize &&
+        !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
+      return SDValue();
+
+    SDValue NewLd = SDValue();
+
     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
-      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                         LDBase->getPointerInfo(),
-                         LDBase->isVolatile(), LDBase->isNonTemporal(),
-                         LDBase->isInvariant(), 0);
-    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
-                       LDBase->getPointerInfo(),
-                       LDBase->isVolatile(), LDBase->isNonTemporal(),
-                       LDBase->isInvariant(), LDBase->getAlignment());
+      NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+                          LDBase->getPointerInfo(),
+                          LDBase->isVolatile(), LDBase->isNonTemporal(),
+                          LDBase->isInvariant(), 0);
+    NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
+                        LDBase->getPointerInfo(),
+                        LDBase->isVolatile(), LDBase->isNonTemporal(),
+                        LDBase->isInvariant(), LDBase->getAlignment());
+
+    if (LDBase->hasAnyUseOfValue(1)) {
+      SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                     SDValue(LDBase, 1),
+                                     SDValue(NewLd.getNode(), 1));
+      DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+      DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+                             SDValue(NewLd.getNode(), 1));
+    }
+
+    return NewLd;
   }
   if (NumElems == 4 && LastLoadedElt == 1 &&
       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
@@ -5144,15 +5495,15 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
 /// a scalar load, or a constant.
 /// The VBROADCAST node is returned when a pattern is found,
 /// or SDValue() otherwise.
-SDValue
-X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
+                                    SelectionDAG &DAG) {
   if (!Subtarget->hasFp256())
     return SDValue();
 
-  MVT VT = Op.getValueType().getSimpleVT();
-  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
 
-  assert((VT.is128BitVector() || VT.is256BitVector()) &&
+  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Unsupported vector type for broadcast.");
 
   SDValue Ld;
@@ -5196,7 +5547,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
           return SDValue();
 
         // Use the register form of the broadcast instruction available on AVX2.
-        if (VT.is256BitVector())
+        if (VT.getSizeInBits() >= 256)
           Sc = Extract128BitVector(Sc, 0, DAG, dl);
         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
       }
@@ -5208,13 +5559,18 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
       // The scalar_to_vector node and the suspected
       // load node must have exactly one user.
       // Constants may have multiple users.
-      if (!ConstSplatVal && (!Sc.hasOneUse() || !Ld.hasOneUse()))
+
+      // AVX-512 has register version of the broadcast
+      bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
+        Ld.getValueType().getSizeInBits() >= 32;
+      if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
+          !hasRegVer))
         return SDValue();
       break;
     }
   }
 
-  bool Is256 = VT.is256BitVector();
+  bool IsGE256 = (VT.getSizeInBits() >= 256);
 
   // Handle the broadcasting a single constant scalar from the constant pool
   // into a vector. On Sandybridge it is still better to load a constant vector
@@ -5224,7 +5580,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
     assert(!CVT.isVector() && "Must not broadcast a vector type");
     unsigned ScalarSize = CVT.getSizeInBits();
 
-    if (ScalarSize == 32 || (Is256 && ScalarSize == 64)) {
+    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
       const Constant *C = 0;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
@@ -5233,7 +5589,8 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
 
       assert(C && "Invalid constant type");
 
-      SDValue CP = DAG.getConstantPool(C, getPointerTy());
+      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+      SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
                        MachinePointerInfo::getConstantPool(),
@@ -5248,14 +5605,14 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
 
   // Handle AVX2 in-register broadcasts.
   if (!IsLoad && Subtarget->hasInt256() &&
-      (ScalarSize == 32 || (Is256 && ScalarSize == 64)))
+      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
 
   // The scalar source must be a normal load.
   if (!IsLoad)
     return SDValue();
 
-  if (ScalarSize == 32 || (Is256 && ScalarSize == 64))
+  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
 
   // The integer check is needed for the 64-bit into 128-bit so it doesn't match
@@ -5269,15 +5626,15 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
-SDValue
-X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
+static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
 
   // Skip if insert_vec_elt is not supported.
-  if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
     return SDValue();
 
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   unsigned NumElems = Op.getNumOperands();
 
   SDValue VecIn1;
@@ -5343,19 +5700,128 @@ X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
   return NV;
 }
 
+// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
+SDValue
+X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
+
+  MVT VT = Op.getSimpleValueType();
+  assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
+         "Unexpected type in LowerBUILD_VECTORvXi1!");
+
+  SDLoc dl(Op);
+  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
+    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+                       Ops, VT.getVectorNumElements());
+  }
+
+  if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+    SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
+    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
+                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
+                       Ops, VT.getVectorNumElements());
+  }
+
+  bool AllContants = true;
+  uint64_t Immediate = 0;
+  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
+    SDValue In = Op.getOperand(idx);
+    if (In.getOpcode() == ISD::UNDEF)
+      continue;
+    if (!isa<ConstantSDNode>(In)) {
+      AllContants = false;
+      break;
+    }
+    if (cast<ConstantSDNode>(In)->getZExtValue())
+      Immediate |= (1ULL << idx);
+  }
+
+  if (AllContants) {
+    SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
+      DAG.getConstant(Immediate, MVT::i16));
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  // Splat vector (with undefs)
+  SDValue In = Op.getOperand(0);
+  for (unsigned i = 1, e = Op.getNumOperands(); i != e; ++i) {
+    if (Op.getOperand(i) != In && Op.getOperand(i).getOpcode() != ISD::UNDEF)
+      llvm_unreachable("Unsupported predicate operation");
+  }
+
+  SDValue EFLAGS, X86CC;
+  if (In.getOpcode() == ISD::SETCC) {
+    SDValue Op0 = In.getOperand(0);
+    SDValue Op1 = In.getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(In.getOperand(2))->get();
+    bool isFP = Op1.getValueType().isFloatingPoint();
+    unsigned X86CCVal = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+
+    assert(X86CCVal != X86::COND_INVALID && "Unsupported predicate operation");
+
+    X86CC = DAG.getConstant(X86CCVal, MVT::i8);
+    EFLAGS = EmitCmp(Op0, Op1, X86CCVal, DAG);
+    EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+  } else if (In.getOpcode() == X86ISD::SETCC) {
+    X86CC = In.getOperand(0);
+    EFLAGS = In.getOperand(1);
+  } else {
+    // The algorithm:
+    //   Bit1 = In & 0x1
+    //   if (Bit1 != 0)
+    //     ZF = 0
+    //   else
+    //     ZF = 1
+    //   if (ZF == 0)
+    //     res = allOnes ### CMOVNE -1, %res
+    //   else
+    //     res = allZero
+    MVT InVT = In.getSimpleValueType();
+    SDValue Bit1 = DAG.getNode(ISD::AND, dl, InVT, In, DAG.getConstant(1, InVT));
+    EFLAGS = EmitTest(Bit1, X86::COND_NE, DAG);
+    X86CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+  }
+
+  if (VT == MVT::v16i1) {
+    SDValue Cst1 = DAG.getConstant(-1, MVT::i16);
+    SDValue Cst0 = DAG.getConstant(0, MVT::i16);
+    SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i16,
+          Cst0, Cst1, X86CC, EFLAGS);
+    return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
+  }
+
+  if (VT == MVT::v8i1) {
+    SDValue Cst1 = DAG.getConstant(-1, MVT::i32);
+    SDValue Cst0 = DAG.getConstant(0, MVT::i32);
+    SDValue CmovOp = DAG.getNode(X86ISD::CMOV, dl, MVT::i32,
+          Cst0, Cst1, X86CC, EFLAGS);
+    CmovOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CmovOp);
+    return DAG.getNode(ISD::BITCAST, dl, VT, CmovOp);
+  }
+  llvm_unreachable("Unsupported predicate operation");
+}
+
 SDValue
 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT ExtVT = VT.getVectorElementType();
   unsigned NumElems = Op.getNumOperands();
 
+  // Generate vectors for predicate vectors.
+  if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
+    return LowerBUILD_VECTORvXi1(Op, DAG);
+
   // Vectors containing all zeros can be matched by pxor and xorps later
   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
-    if (VT == MVT::v4i32 || VT == MVT::v8i32)
+    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
       return Op;
 
     return getZeroVector(VT, Subtarget, DAG, dl);
@@ -5368,10 +5834,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
       return Op;
 
-    return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
+    if (!VT.is512BitVector())
+      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
   }
 
-  SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
   if (Broadcast.getNode())
     return Broadcast;
 
@@ -5404,7 +5871,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
   // Special case for single non-zero, non-undef, element.
   if (NumNonZero == 1) {
-    unsigned Idx = CountTrailingZeros_32(NonZeros);
+    unsigned Idx = countTrailingZeros(NonZeros);
     SDValue Item = Op.getOperand(Idx);
 
     // If this is an insertion of an i64 value on x86-32, and if the top bits of
@@ -5450,7 +5917,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 
       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
-        if (VT.is256BitVector()) {
+        if (VT.is256BitVector() || VT.is512BitVector()) {
           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                              Item, DAG.getIntPtrConstant(0));
@@ -5513,7 +5980,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
       // Check if it's possible to issue this instead.
       // shuffle (vload ptr)), undef, <1, 1, 1, 1>
-      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      unsigned Idx = countTrailingZeros(NonZeros);
       SDValue Item = Op.getOperand(Idx);
       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
@@ -5548,7 +6015,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   if (EVTBits == 64) {
     if (NumNonZero == 1) {
       // One half is zero or undef.
-      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      unsigned Idx = countTrailingZeros(NonZeros);
       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                                  Op.getOperand(Idx));
       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
@@ -5615,7 +6082,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
       V[i] = Op.getOperand(i);
 
     // Check for elements which are consecutive loads.
-    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
+    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
     if (LD.getNode())
       return LD;
 
@@ -5678,22 +6145,25 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
 // to create 256-bit vectors from two other 128-bit ones.
 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
-  MVT ResVT = Op.getValueType().getSimpleVT();
+  SDLoc dl(Op);
+  MVT ResVT = Op.getSimpleValueType();
 
-  assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
+  assert((ResVT.is256BitVector() ||
+          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
 
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   unsigned NumElems = ResVT.getVectorNumElements();
+  if(ResVT.is256BitVector())
+    return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
 
-  return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
+  return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
 }
 
 static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
   assert(Op.getNumOperands() == 2);
 
-  // 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
+  // AVX/AVX-512 can use the vinsertf128 instruction to create 256-bit vectors
   // from two other 128-bit ones.
   return LowerAVXCONCAT_VECTORS(Op, DAG);
 }
@@ -5704,11 +6174,15 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
                            const X86Subtarget *Subtarget, SelectionDAG &DAG) {
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
-  DebugLoc dl = SVOp->getDebugLoc();
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  SDLoc dl(SVOp);
+  MVT VT = SVOp->getSimpleValueType(0);
   MVT EltVT = VT.getVectorElementType();
   unsigned NumElems = VT.getVectorNumElements();
 
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
   if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
     return SDValue();
   if (!Subtarget->hasInt256() && VT == MVT::v16i16)
@@ -5765,7 +6239,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
-  DebugLoc dl = SVOp->getDebugLoc();
+  SDLoc dl(SVOp);
   SmallVector<int, 8> MaskVals;
 
   // Determine if more than 1 of the words in each of the low and high quadwords
@@ -6014,13 +6488,13 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
 // 1. [ssse3] 1 x pshufb
 // 2. [ssse3] 2 x pshufb + 1 x por
 // 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
-static
-SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
-                                 SelectionDAG &DAG,
-                                 const X86TargetLowering &TLI) {
+static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+                                        const X86Subtarget* Subtarget,
+                                        SelectionDAG &DAG) {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
-  DebugLoc dl = SVOp->getDebugLoc();
+  SDLoc dl(SVOp);
   ArrayRef<int> MaskVals = SVOp->getMask();
 
   // Promote splats to a larger type which usually leads to more efficient code.
@@ -6033,7 +6507,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
   // present, fall back to case 3.
 
   // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
-  if (TLI.getSubtarget()->hasSSSE3()) {
+  if (Subtarget->hasSSSE3()) {
     SmallVector<SDValue,16> pshufbMask;
 
     // If all result elements are from one input vector, then only translate
@@ -6146,10 +6620,10 @@ static
 SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                  const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
-  DebugLoc dl = SVOp->getDebugLoc();
+  SDLoc dl(SVOp);
   SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
 
   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -6194,8 +6668,8 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
 static
 SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                  SelectionDAG &DAG) {
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
-  DebugLoc dl = SVOp->getDebugLoc();
+  MVT VT = SVOp->getSimpleValueType(0);
+  SDLoc dl(SVOp);
   unsigned NumElems = VT.getVectorNumElements();
   MVT NewVT;
   unsigned Scale;
@@ -6231,9 +6705,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
 
 /// getVZextMovL - Return a zero-extending vector move low node.
 ///
-static SDValue getVZextMovL(MVT VT, EVT OpVT,
+static SDValue getVZextMovL(MVT VT, MVT OpVT,
                             SDValue SrcOp, SelectionDAG &DAG,
-                            const X86Subtarget *Subtarget, DebugLoc dl) {
+                            const X86Subtarget *Subtarget, SDLoc dl) {
   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
     LoadSDNode *LD = NULL;
     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
@@ -6273,12 +6747,12 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   if (NewOp.getNode())
     return NewOp;
 
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  MVT VT = SVOp->getSimpleValueType(0);
 
   unsigned NumElems = VT.getVectorNumElements();
   unsigned NumLaneElems = NumElems / 2;
 
-  DebugLoc dl = SVOp->getDebugLoc();
+  SDLoc dl(SVOp);
   MVT EltVT = VT.getVectorElementType();
   MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
   SDValue Output[2];
@@ -6384,8 +6858,8 @@ static SDValue
 LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
-  DebugLoc dl = SVOp->getDebugLoc();
-  MVT VT = SVOp->getValueType(0).getSimpleVT();
+  SDLoc dl(SVOp);
+  MVT VT = SVOp->getSimpleValueType(0);
 
   assert(VT.is128BitVector() && "Unsupported vector size");
 
@@ -6535,8 +7009,8 @@ static bool MayFoldVectorLoad(SDValue V) {
 }
 
 static
-SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
+SDValue getMOVDDup(SDValue &Op, SDLoc &dl, SDValue V1, SelectionDAG &DAG) {
+  MVT VT = Op.getSimpleValueType();
 
   // Canonizalize to v2f64.
   V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
@@ -6546,11 +7020,11 @@ SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
 }
 
 static
-SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
+SDValue getMOVLowToHigh(SDValue &Op, SDLoc &dl, SelectionDAG &DAG,
                         bool HasSSE2) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   assert(VT != MVT::v2i64 && "unsupported shuffle type");
 
@@ -6565,10 +7039,10 @@ SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
 }
 
 static
-SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
+SDValue getMOVHighToLow(SDValue &Op, SDLoc &dl, SelectionDAG &DAG) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
          "unsupported shuffle type");
@@ -6581,10 +7055,10 @@ SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
 }
 
 static
-SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
+SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
   unsigned NumElems = VT.getVectorNumElements();
 
   // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
@@ -6638,20 +7112,20 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
 }
 
 // Reduce a vector shuffle to zext.
-SDValue
-X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
+                                    SelectionDAG &DAG) {
   // PMOVZX is only available from SSE41.
   if (!Subtarget->hasSSE41())
     return SDValue();
 
-  EVT VT = Op.getValueType();
+  MVT VT = Op.getSimpleValueType();
 
   // Only AVX2 support 256-bit vector integer extending.
   if (!Subtarget->hasInt256() && VT.is256BitVector())
     return SDValue();
 
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   unsigned NumElems = VT.getVectorNumElements();
@@ -6683,12 +7157,11 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
       return SDValue();
   }
 
-  LLVMContext *Context = DAG.getContext();
   unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
-  EVT NeVT = EVT::getIntegerVT(*Context, NBits);
-  EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
+  MVT NeVT = MVT::getIntegerVT(NBits);
+  MVT NVT = MVT::getVectorVT(NeVT, NumElems >> Shift);
 
-  if (!isTypeLegal(NVT))
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(NVT))
     return SDValue();
 
   // Simplify the operand as it's prepared to be fed into shuffle.
@@ -6696,8 +7169,8 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
   if (V1.getOpcode() == ISD::BITCAST &&
       V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
       V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      V1.getOperand(0)
-        .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+      V1.getOperand(0).getOperand(0)
+        .getSimpleValueType().getSizeInBits() == SignificantBits) {
     // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
     SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
     ConstantSDNode *CIdx =
@@ -6706,19 +7179,19 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
     // selection to fold it. Otherwise, we will short the conversion sequence.
     if (CIdx && CIdx->getZExtValue() == 0 &&
         (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
-      if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+      MVT FullVT = V.getSimpleValueType();
+      MVT V1VT = V1.getSimpleValueType();
+      if (FullVT.getSizeInBits() > V1VT.getSizeInBits()) {
         // The "ext_vec_elt" node is wider than the result node.
         // In this case we should extract subvector from V.
         // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
-        unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
-        EVT FullVT = V.getValueType();
-        EVT SubVecVT = EVT::getVectorVT(*Context, 
-                                        FullVT.getVectorElementType(),
+        unsigned Ratio = FullVT.getSizeInBits() / V1VT.getSizeInBits();
+        MVT SubVecVT = MVT::getVectorVT(FullVT.getVectorElementType(),
                                         FullVT.getVectorNumElements()/Ratio);
-        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V, 
+        V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
                         DAG.getIntPtrConstant(0));
       }
-      V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+      V1 = DAG.getNode(ISD::BITCAST, DL, V1VT, V);
     }
   }
 
@@ -6726,11 +7199,12 @@ X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
 }
 
-SDValue
-X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
+static SDValue
+NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                       SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  MVT VT = Op.getValueType().getSimpleVT();
-  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
 
@@ -6740,13 +7214,13 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
   // Handle splat operations
   if (SVOp->isSplat()) {
     // Use vbroadcast whenever the splat comes from a foldable load
-    SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
+    SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
     if (Broadcast.getNode())
       return Broadcast;
   }
 
   // Check integer expanding shuffles.
-  SDValue NewOp = LowerVectorIntExtend(Op, DAG);
+  SDValue NewOp = LowerVectorIntExtend(Op, Subtarget, DAG);
   if (NewOp.getNode())
     return NewOp;
 
@@ -6764,7 +7238,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        MVT NewVT = NewOp.getSimpleValueType();
         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
                                NewVT, true, false))
           return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
@@ -6773,7 +7247,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
-        MVT NewVT = NewOp.getValueType().getSimpleVT();
+        MVT NewVT = NewOp.getSimpleValueType();
         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
           return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
                               DAG, Subtarget, dl);
@@ -6788,8 +7262,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
-  MVT VT = Op.getValueType().getSimpleVT();
-  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
   unsigned NumElems = VT.getVectorNumElements();
   bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
   bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
@@ -6826,7 +7300,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   // Normalize the input vectors. Here splats, zeroed vectors, profitable
   // narrowing and commutation of operands should be handled. The actual code
   // doesn't include all of those, work in progress...
-  SDValue NewOp = NormalizeVectorShuffle(Op, DAG);
+  SDValue NewOp = NormalizeVectorShuffle(Op, Subtarget, DAG);
   if (NewOp.getNode())
     return NewOp;
 
@@ -6871,6 +7345,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 TargetMask, DAG);
   }
 
+  if (isPALIGNRMask(M, VT, Subtarget))
+    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
+                                getShufflePALIGNRImmediate(SVOp),
+                                DAG);
+
   // Check if this can be converted into a logical shift.
   bool isLeft = false;
   unsigned ShAmt = 0;
@@ -6981,18 +7460,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Normalize the node to match x86 shuffle ops if needed
-  if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+  if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
     return CommuteVectorShuffle(SVOp, DAG);
 
   // The checks below are all present in isShuffleMaskLegal, but they are
   // inlined here right now to enable us to directly emit target specific
   // nodes, and remove one by one until they don't return Op anymore.
 
-  if (isPALIGNRMask(M, VT, Subtarget))
-    return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
-                                getShufflePALIGNRImmediate(SVOp),
-                                DAG);
-
   if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
       SVOp->getSplatIndex() == 0 && V2IsUndef) {
     if (VT == MVT::v2f64 || VT == MVT::v2i64)
@@ -7009,7 +7483,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 getShufflePSHUFLWImmediate(SVOp),
                                 DAG);
 
-  if (isSHUFPMask(M, VT, HasFp256))
+  if (isSHUFPMask(M, VT))
     return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
                                 getShuffleSHUFImmediate(SVOp), DAG);
 
@@ -7028,8 +7502,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
     return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
 
   // Handle VPERMILPS/D* permutations
-  if (isVPERMILPMask(M, VT, HasFp256)) {
-    if (HasInt256 && VT == MVT::v8i32)
+  if (isVPERMILPMask(M, VT)) {
+    if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
       return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
                                   getShuffleSHUFImmediate(SVOp), DAG);
     return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -7045,21 +7519,28 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (BlendOp.getNode())
     return BlendOp;
 
-  if (V2IsUndef && HasInt256 && (VT == MVT::v8i32 || VT == MVT::v8f32)) {
-    SmallVector<SDValue, 8> permclMask;
-    for (unsigned i = 0; i != 8; ++i) {
-      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MVT::i32));
+  unsigned Imm8;
+  if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
+    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
+
+  if ((V2IsUndef && HasInt256 && VT.is256BitVector() && NumElems == 8) ||
+      VT.is512BitVector()) {
+    MVT MaskEltVT = MVT::getIntegerVT(VT.getVectorElementType().getSizeInBits());
+    MVT MaskVectorVT = MVT::getVectorVT(MaskEltVT, NumElems);
+    SmallVector<SDValue, 16> permclMask;
+    for (unsigned i = 0; i != NumElems; ++i) {
+      permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
     }
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32,
-                               &permclMask[0], 8);
-    // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
-    return DAG.getNode(X86ISD::VPERMV, dl, VT,
-                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
-  }
 
-  if (V2IsUndef && HasInt256 && (VT == MVT::v4i64 || VT == MVT::v4f64))
-    return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
-                                getShuffleCLImmediate(SVOp), DAG);
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
+                                &permclMask[0], NumElems);
+    if (V2IsUndef)
+      // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
+      return DAG.getNode(X86ISD::VPERMV, dl, VT,
+                          DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1);
+    return DAG.getNode(X86ISD::VPERMV3, dl, VT,
+                       DAG.getNode(ISD::BITCAST, dl, VT, Mask), V1, V2);
+  }
 
   //===--------------------------------------------------------------------===//
   // Since no target specific shuffle was selected for this generic one,
@@ -7075,7 +7556,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (VT == MVT::v16i8) {
-    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, Subtarget, DAG);
     if (NewOp.getNode())
       return NewOp;
   }
@@ -7099,10 +7580,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
 }
 
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
-  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getSimpleValueType();
+  SDLoc dl(Op);
 
-  if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
+  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
     return SDValue();
 
   if (VT.getSizeInBits() == 8) {
@@ -7163,25 +7644,45 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
 SDValue
 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                            SelectionDAG &DAG) const {
-  if (!isa<ConstantSDNode>(Op.getOperand(1)))
-    return SDValue();
-
+  SDLoc dl(Op);
   SDValue Vec = Op.getOperand(0);
-  MVT VecVT = Vec.getValueType().getSimpleVT();
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = Op.getOperand(1);
+  if (!isa<ConstantSDNode>(Idx)) {
+    if (VecVT.is512BitVector() ||
+        (VecVT.is256BitVector() && Subtarget->hasInt256() &&
+         VecVT.getVectorElementType().getSizeInBits() == 32)) {
+
+      MVT MaskEltVT =
+        MVT::getIntegerVT(VecVT.getVectorElementType().getSizeInBits());
+      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
+                                    MaskEltVT.getSizeInBits());
+      
+      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
+      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
+                                getZeroVector(MaskVT, Subtarget, DAG, dl),
+                                Idx, DAG.getConstant(0, getPointerTy()));
+      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(),
+                        Perm, DAG.getConstant(0, getPointerTy()));
+    }
+    return SDValue();
+  }
 
   // If this is a 256-bit vector result, first extract the 128-bit vector and
   // then extract the element from the 128-bit vector.
-  if (VecVT.is256BitVector()) {
-    DebugLoc dl = Op.getNode()->getDebugLoc();
-    unsigned NumElems = VecVT.getVectorNumElements();
-    SDValue Idx = Op.getOperand(1);
-    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
 
+    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
     // Get the 128-bit vector.
     Vec = Extract128BitVector(Vec, IdxVal, DAG, dl);
+    MVT EltVT = VecVT.getVectorElementType();
 
-    if (IdxVal >= NumElems/2)
-      IdxVal -= NumElems/2;
+    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
+
+    //if (IdxVal >= NumElems/2)
+    //  IdxVal -= NumElems/2;
+    IdxVal -= (IdxVal/ElemsPerChunk)*ElemsPerChunk;
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                        DAG.getConstant(IdxVal, MVT::i32));
   }
@@ -7194,8 +7695,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
       return Res;
   }
 
-  MVT VT = Op.getValueType().getSimpleVT();
-  DebugLoc dl = Op.getDebugLoc();
+  MVT VT = Op.getSimpleValueType();
   // TODO: handle v16i8.
   if (VT.getSizeInBits() == 16) {
     SDValue Vec = Op.getOperand(0);
@@ -7222,7 +7722,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 
     // SHUFPS the element to the lowest double word, then movss.
     int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
-    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                        DAG.getUNDEF(VVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7241,7 +7741,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
     // to a f64mem, the whole operation is folded into a single MOVHPDmr.
     int Mask[2] = { 1, -1 };
-    MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
+    MVT VVT = Op.getOperand(0).getSimpleValueType();
     SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                        DAG.getUNDEF(VVT), Mask);
     return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7252,9 +7752,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
 }
 
 static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   SDValue N0 = Op.getOperand(0);
   SDValue N1 = Op.getOperand(1);
@@ -7306,29 +7806,30 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
 
 SDValue
 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue N0 = Op.getOperand(0);
   SDValue N1 = Op.getOperand(1);
   SDValue N2 = Op.getOperand(2);
 
   // If this is a 256-bit vector result, first extract the 128-bit vector,
   // insert the element into the extracted half and then place it back.
-  if (VT.is256BitVector()) {
+  if (VT.is256BitVector() || VT.is512BitVector()) {
     if (!isa<ConstantSDNode>(N2))
       return SDValue();
 
     // Get the desired 128-bit vector half.
-    unsigned NumElems = VT.getVectorNumElements();
     unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
     SDValue V = Extract128BitVector(N0, IdxVal, DAG, dl);
 
     // Insert the element into the desired half.
-    bool Upper = IdxVal >= NumElems/2;
+    unsigned NumEltsIn128 = 128/EltVT.getSizeInBits();
+    unsigned IdxIn128 = IdxVal - (IdxVal/NumEltsIn128) * NumEltsIn128;
+
     V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
-                 DAG.getConstant(Upper ? IdxVal-NumElems/2 : IdxVal, MVT::i32));
+                    DAG.getConstant(IdxIn128, MVT::i32));
 
     // Insert the changed part back to the 256-bit vector
     return Insert128BitVector(N0, V, IdxVal, DAG, dl);
@@ -7353,17 +7854,16 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
 }
 
 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
-  LLVMContext *Context = DAG.getContext();
-  DebugLoc dl = Op.getDebugLoc();
-  MVT OpVT = Op.getValueType().getSimpleVT();
+  SDLoc dl(Op);
+  MVT OpVT = Op.getSimpleValueType();
 
   // If this is a 256-bit vector result, first insert into a 128-bit
   // vector and then insert into the 256-bit vector.
   if (!OpVT.is128BitVector()) {
     // Insert into a 128-bit vector.
-    EVT VT128 = EVT::getVectorVT(*Context,
-                                 OpVT.getVectorElementType(),
-                                 OpVT.getVectorNumElements() / 2);
+    unsigned SizeFactor = OpVT.getSizeInBits()/128;
+    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
+                                 OpVT.getVectorNumElements() / SizeFactor);
 
     Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
 
@@ -7386,16 +7886,22 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
 // upper bits of a vector.
 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
-  if (Subtarget->hasFp256()) {
-    DebugLoc dl = Op.getNode()->getDebugLoc();
-    SDValue Vec = Op.getNode()->getOperand(0);
-    SDValue Idx = Op.getNode()->getOperand(1);
+  SDLoc dl(Op);
+  SDValue In =  Op.getOperand(0);
+  SDValue Idx = Op.getOperand(1);
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  MVT ResVT   = Op.getSimpleValueType();
+  MVT InVT    = In.getSimpleValueType();
 
-    if (Op.getNode()->getValueType(0).is128BitVector() &&
-        Vec.getNode()->getValueType(0).is256BitVector() &&
+  if (Subtarget->hasFp256()) {
+    if (ResVT.is128BitVector() &&
+        (InVT.is256BitVector() || InVT.is512BitVector()) &&
         isa<ConstantSDNode>(Idx)) {
-      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-      return Extract128BitVector(Vec, IdxVal, DAG, dl);
+      return Extract128BitVector(In, IdxVal, DAG, dl);
+    }
+    if (ResVT.is256BitVector() && InVT.is512BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      return Extract256BitVector(In, IdxVal, DAG, dl);
     }
   }
   return SDValue();
@@ -7407,17 +7913,25 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
                                      SelectionDAG &DAG) {
   if (Subtarget->hasFp256()) {
-    DebugLoc dl = Op.getNode()->getDebugLoc();
+    SDLoc dl(Op.getNode());
     SDValue Vec = Op.getNode()->getOperand(0);
     SDValue SubVec = Op.getNode()->getOperand(1);
     SDValue Idx = Op.getNode()->getOperand(2);
 
-    if (Op.getNode()->getValueType(0).is256BitVector() &&
-        SubVec.getNode()->getValueType(0).is128BitVector() &&
+    if ((Op.getNode()->getSimpleValueType(0).is256BitVector() ||
+         Op.getNode()->getSimpleValueType(0).is512BitVector()) &&
+        SubVec.getNode()->getSimpleValueType(0).is128BitVector() &&
         isa<ConstantSDNode>(Idx)) {
       unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
       return Insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
     }
+
+    if (Op.getNode()->getSimpleValueType(0).is512BitVector() &&
+        SubVec.getNode()->getSimpleValueType(0).is256BitVector() &&
+        isa<ConstantSDNode>(Idx)) {
+      unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+      return Insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
+    }
   }
   return SDValue();
 }
@@ -7449,13 +7963,13 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
   SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                              CP->getAlignment(),
                                              CP->getOffset(), OpFlag);
-  DebugLoc DL = CP->getDebugLoc();
+  SDLoc DL(CP);
   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
   // With PIC, the address is actually $g + Offset.
   if (OpFlag) {
     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                          DAG.getNode(X86ISD::GlobalBaseReg,
-                                     DebugLoc(), getPointerTy()),
+                                     SDLoc(), getPointerTy()),
                          Result);
   }
 
@@ -7481,14 +7995,14 @@ SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                           OpFlag);
-  DebugLoc DL = JT->getDebugLoc();
+  SDLoc DL(JT);
   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
 
   // With PIC, the address is actually $g + Offset.
   if (OpFlag)
     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                          DAG.getNode(X86ISD::GlobalBaseReg,
-                                     DebugLoc(), getPointerTy()),
+                                     SDLoc(), getPointerTy()),
                          Result);
 
   return Result;
@@ -7519,7 +8033,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);
 
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
 
   // With PIC, the address is actually $g + Offset.
@@ -7527,7 +8041,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
       !Subtarget->is64Bit()) {
     Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                          DAG.getNode(X86ISD::GlobalBaseReg,
-                                     DebugLoc(), getPointerTy()),
+                                     SDLoc(), getPointerTy()),
                          Result);
   }
 
@@ -7548,7 +8062,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
   CodeModel::Model M = getTargetMachine().getCodeModel();
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
                                              OpFlags);
 
@@ -7569,7 +8083,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue
-X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
                                       int64_t Offset, SelectionDAG &DAG) const {
   // Create the TargetGlobalAddress node, folding in the constant
   // offset if it is legal.
@@ -7618,7 +8132,7 @@ SDValue
 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
-  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
+  return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
 }
 
 static SDValue
@@ -7627,7 +8141,7 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
            unsigned char OperandFlags, bool LocalDynamic = false) {
   MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                            GA->getValueType(0),
                                            GA->getOffset(),
@@ -7656,10 +8170,10 @@ static SDValue
 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                 const EVT PtrVT) {
   SDValue InFlag;
-  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
+  SDLoc dl(GA);  // ? function entry point might be better
   SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                    DAG.getNode(X86ISD::GlobalBaseReg,
-                                               DebugLoc(), PtrVT), InFlag);
+                                               SDLoc(), PtrVT), InFlag);
   InFlag = Chain.getValue(1);
 
   return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
@@ -7677,7 +8191,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                            SelectionDAG &DAG,
                                            const EVT PtrVT,
                                            bool is64Bit) {
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
 
   // Get the start address of the TLS block for this module.
   X86MachineFunctionInfo* MFI = DAG.getMachineFunction()
@@ -7691,7 +8205,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
   } else {
     SDValue InFlag;
     SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
-        DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag);
+        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
     InFlag = Chain.getValue(1);
     Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
                       X86II::MO_TLSLDM, /*LocalDynamic=*/true);
@@ -7716,16 +8230,15 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                    const EVT PtrVT, TLSModel::Model model,
                                    bool is64Bit, bool isPIC) {
-  DebugLoc dl = GA->getDebugLoc();
+  SDLoc dl(GA);
 
   // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
   Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                          is64Bit ? 257 : 256));
 
-  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
-                                      DAG.getIntPtrConstant(0),
-                                      MachinePointerInfo(Ptr),
-                                      false, false, false, 0);
+  SDValue ThreadPointer =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0),
+                  MachinePointerInfo(Ptr), false, false, false, 0);
 
   unsigned char OperandFlags = 0;
   // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
@@ -7747,21 +8260,20 @@ static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
   // emit "addl x@ntpoff,%eax" (local exec)
   // or "addl x@indntpoff,%eax" (initial exec)
   // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
-  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
-                                           GA->getValueType(0),
-                                           GA->getOffset(), OperandFlags);
+  SDValue TGA =
+      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
+                                 GA->getOffset(), OperandFlags);
   SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
 
   if (model == TLSModel::InitialExec) {
     if (isPIC && !is64Bit) {
       Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
-                          DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT),
+                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                            Offset);
     }
 
     Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
-                         MachinePointerInfo::getGOT(), false, false, false,
-                         0);
+                         MachinePointerInfo::getGOT(), false, false, false, 0);
   }
 
   // The address of the thread local variable is the add of the thread
@@ -7809,7 +8321,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
       OpFlag = X86II::MO_TLVP_PIC_BASE;
     else
       OpFlag = X86II::MO_TLVP;
-    DebugLoc DL = Op.getDebugLoc();
+    SDLoc DL(Op);
     SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                 GA->getValueType(0),
                                                 GA->getOffset(), OpFlag);
@@ -7819,7 +8331,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     if (PIC32)
       Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                            DAG.getNode(X86ISD::GlobalBaseReg,
-                                       DebugLoc(), getPointerTy()),
+                                       SDLoc(), getPointerTy()),
                            Offset);
 
     // Lowering the machine isd will make sure everything is in the right
@@ -7856,7 +8368,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // thread-localness.
     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
       GV = GA->resolveAliasedGlobal(false);
-    DebugLoc dl = GA->getDebugLoc();
+    SDLoc dl(GA);
     SDValue Chain = DAG.getEntryNode();
 
     // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
@@ -7914,11 +8426,16 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   assert(Op.getNumOperands() == 3 && "Not a double-shift!");
   EVT VT = Op.getValueType();
   unsigned VTBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
   SDValue ShOpLo = Op.getOperand(0);
   SDValue ShOpHi = Op.getOperand(1);
   SDValue ShAmt  = Op.getOperand(2);
+  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
+  // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
+  // during isel.
+  SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                  DAG.getConstant(VTBits - 1, MVT::i8));
   SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                      DAG.getConstant(VTBits - 1, MVT::i8))
                        : DAG.getConstant(0, VT);
@@ -7926,12 +8443,15 @@ SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
   SDValue Tmp2, Tmp3;
   if (Op.getOpcode() == ISD::SHL_PARTS) {
     Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
-    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
   } else {
     Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
-    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
   }
 
+  // If the shift amount is larger or equal than the width of a part we can't
+  // rely on the results of shld/shrd. Insert a test and select the appropriate
+  // values for large shift amounts.
   SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                 DAG.getConstant(VTBits, MVT::i8));
   SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
@@ -7973,7 +8493,7 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
     return Op;
   }
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned Size = SrcVT.getSizeInBits()/8;
   MachineFunction &MF = DAG.getMachineFunction();
   int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
@@ -7989,7 +8509,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                      SDValue StackSlot,
                                      SelectionDAG &DAG) const {
   // Build the FILD
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   SDVTList Tys;
   bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
   if (useSSE)
@@ -8064,11 +8584,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
      #endif
   */
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   LLVMContext *Context = DAG.getContext();
 
   // Build some magic constants.
-  const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
+  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
   Constant *C0 = ConstantDataVector::get(*Context, CV0);
   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
 
@@ -8118,7 +8638,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
 // LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                                SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   // FP constant to bias correct the final result.
   SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                    MVT::f64);
@@ -8166,7 +8686,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                                SelectionDAG &DAG) const {
   SDValue N0 = Op.getOperand(0);
   EVT SVT = N0.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
           SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
@@ -8181,7 +8701,7 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                            SelectionDAG &DAG) const {
   SDValue N0 = Op.getOperand(0);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (Op.getValueType().isVector())
     return lowerUINT_TO_FP_vec(Op, DAG);
@@ -8240,7 +8760,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   APInt FF(32, 0x5F800000ULL);
 
   // Check whether the sign bit is set.
-  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+  SDValue SignSet = DAG.getSetCC(dl,
+                                 getSetCCResultType(*DAG.getContext(), MVT::i64),
                                  Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                  ISD::SETLT);
 
@@ -8269,7 +8790,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
 std::pair<SDValue,SDValue>
 X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                     bool IsSigned, bool IsReplace) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   EVT DstTy = Op.getValueType();
 
@@ -8363,10 +8884,10 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
 
 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                               const X86Subtarget *Subtarget) {
-  MVT VT = Op->getValueType(0).getSimpleVT();
+  MVT VT = Op->getSimpleValueType(0);
   SDValue In = Op->getOperand(0);
-  MVT InVT = In.getValueType().getSimpleVT();
-  DebugLoc dl = Op->getDebugLoc();
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
 
   // Optimize vectors in AVX mode:
   //
@@ -8381,7 +8902,8 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   //   Concat upper and lower parts.
   //
 
-  if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
+      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
       ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
     return SDValue();
 
@@ -8403,8 +8925,39 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
-SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
-                                           SelectionDAG &DAG) const {
+static  SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
+                                        SelectionDAG &DAG) {
+  MVT VT = Op->getValueType(0).getSimpleVT();
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getValueType().getSimpleVT();
+  SDLoc DL(Op);
+  unsigned int NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
+  EVT ExtVT = (NumElts == 8)? MVT::v8i64 : MVT::v16i32;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  // Now we have only mask extension
+  assert(InVT.getVectorElementType() == MVT::i1);
+  SDValue Cst = DAG.getTargetConstant(1, ExtVT.getScalarType());
+  const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+
+  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, DL, ExtVT, In, Ld);
+  if (VT.is512BitVector())
+    return Brcst;
+  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Brcst);
+}
+
+static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                               SelectionDAG &DAG) {
   if (Subtarget->hasFp256()) {
     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
     if (Res.getNode())
@@ -8413,12 +8966,16 @@ SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
 
   return SDValue();
 }
-SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  MVT VT = Op.getValueType().getSimpleVT();
+
+static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(0);
-  MVT SVT = In.getValueType().getSimpleVT();
+  MVT SVT = In.getSimpleValueType();
+
+  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
+    return LowerZERO_EXTEND_AVX512(Op, DAG);
 
   if (Subtarget->hasFp256()) {
     SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
@@ -8426,33 +8983,44 @@ SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
       return Res;
   }
 
-  if (!VT.is256BitVector() || !SVT.is128BitVector() ||
-      VT.getVectorNumElements() != SVT.getVectorNumElements())
-    return SDValue();
-
-  assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
-
-  // AVX2 has better support of integer extending.
-  if (Subtarget->hasInt256())
-    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
-
-  SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
-  static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
-  SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
-                           DAG.getVectorShuffle(MVT::v8i16, DL, In,
-                                                DAG.getUNDEF(MVT::v8i16),
-                                                &Mask[0]));
-
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
+         VT.getVectorNumElements() != SVT.getVectorNumElements());
+  return SDValue();
 }
 
 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
-  MVT VT = Op.getValueType().getSimpleVT();
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();  
   SDValue In = Op.getOperand(0);
-  MVT SVT = In.getValueType().getSimpleVT();
-
-  if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+  MVT InVT = In.getSimpleValueType();
+  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
+         "Invalid TRUNCATE operation");
+
+  if (InVT.is512BitVector() || VT.getVectorElementType() == MVT::i1) {
+    if (VT.getVectorElementType().getSizeInBits() >=8)
+      return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
+
+    assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+    unsigned NumElts = InVT.getVectorNumElements();
+    assert ((NumElts == 8 || NumElts == 16) && "Unexpected vector type");
+    if (InVT.getSizeInBits() < 512) {
+      MVT ExtVT = (NumElts == 16)? MVT::v16i32 : MVT::v8i64;
+      In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
+      InVT = ExtVT;
+    }
+    SDValue Cst = DAG.getTargetConstant(1, InVT.getVectorElementType());
+    const Constant *C = (dyn_cast<ConstantSDNode>(Cst))->getConstantIntValue();
+    SDValue CP = DAG.getConstantPool(C, getPointerTy());
+    unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+    SDValue Ld = DAG.getLoad(Cst.getValueType(), DL, DAG.getEntryNode(), CP,
+                           MachinePointerInfo::getConstantPool(),
+                           false, false, false, Alignment);
+    SDValue OneV = DAG.getNode(X86ISD::VBROADCAST, DL, InVT, Ld);
+    SDValue And = DAG.getNode(ISD::AND, DL, InVT, OneV, In);
+    return DAG.getNode(X86ISD::TESTM, DL, VT, And, And);
+  }
+
+  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
     if (Subtarget->hasInt256()) {
       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
@@ -8483,7 +9051,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
   }
 
-  if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
     // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
     if (Subtarget->hasInt256()) {
       In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
@@ -8541,11 +9109,9 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // Handle truncation of V256 to V128 using shuffles.
-  if (!VT.is128BitVector() || !SVT.is256BitVector())
+  if (!VT.is128BitVector() || !InVT.is256BitVector())
     return SDValue();
 
-  assert(VT.getVectorNumElements() != SVT.getVectorNumElements() &&
-         "Invalid op");
   assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
 
   unsigned NumElems = VT.getVectorNumElements();
@@ -8565,11 +9131,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                            SelectionDAG &DAG) const {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   if (VT.isVector()) {
     if (VT == MVT::v8i16)
-      return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT,
-                         DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT,
+                         DAG.getNode(ISD::FP_TO_SINT, SDLoc(Op),
                                      MVT::v8i32, Op.getOperand(0)));
     return SDValue();
   }
@@ -8582,7 +9148,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
 
   if (StackSlot.getNode())
     // Load the result.
-    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                        FIST, StackSlot, MachinePointerInfo(),
                        false, false, false, 0);
 
@@ -8599,7 +9165,7 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
 
   if (StackSlot.getNode())
     // Load the result.
-    return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
+    return DAG.getLoad(Op.getValueType(), SDLoc(Op),
                        FIST, StackSlot, MachinePointerInfo(),
                        false, false, false, 0);
 
@@ -8608,10 +9174,10 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
 }
 
 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc DL = Op.getDebugLoc();
-  MVT VT = Op.getValueType().getSimpleVT();
+  SDLoc DL(Op);
+  MVT VT = Op.getSimpleValueType();
   SDValue In = Op.getOperand(0);
-  MVT SVT = In.getValueType().getSimpleVT();
+  MVT SVT = In.getSimpleValueType();
 
   assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
 
@@ -8622,8 +9188,8 @@ static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
 
 SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
   LLVMContext *Context = DAG.getContext();
-  DebugLoc dl = Op.getDebugLoc();
-  MVT VT = Op.getValueType().getSimpleVT();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT;
   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   if (VT.isVector()) {
@@ -8656,8 +9222,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
   LLVMContext *Context = DAG.getContext();
-  DebugLoc dl = Op.getDebugLoc();
-  MVT VT = Op.getValueType().getSimpleVT();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT;
   unsigned NumElts = VT == MVT::f64 ? 2 : 4;
   if (VT.isVector()) {
@@ -8678,7 +9244,7 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
                              MachinePointerInfo::getConstantPool(),
                              false, false, false, Alignment);
   if (VT.isVector()) {
-    MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+    MVT XORVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits()/64);
     return DAG.getNode(ISD::BITCAST, dl, VT,
                        DAG.getNode(ISD::XOR, dl, XORVT,
                                    DAG.getNode(ISD::BITCAST, dl, XORVT,
@@ -8693,9 +9259,9 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   LLVMContext *Context = DAG.getContext();
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
-  MVT VT = Op.getValueType().getSimpleVT();
-  MVT SrcVT = Op1.getValueType().getSimpleVT();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT SrcVT = Op1.getSimpleValueType();
 
   // If second operand is smaller, extend it first.
   if (SrcVT.bitsLT(VT)) {
@@ -8770,8 +9336,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
 
 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
   SDValue N0 = Op.getOperand(0);
-  DebugLoc dl = Op.getDebugLoc();
-  MVT VT = Op.getValueType().getSimpleVT();
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
 
   // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
   SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
@@ -8781,8 +9347,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
 
 // LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
 //
-SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
-                                                  SelectionDAG &DAG) const {
+static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
   assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
 
   if (!Subtarget->hasSSE41())
@@ -8792,7 +9358,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
     return SDValue();
 
   SDNode *N = Op.getNode();
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   SmallVector<SDValue, 8> Opnds;
   DenseMap<SDValue, unsigned> VecInMap;
@@ -8804,7 +9370,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
   Opnds.push_back(N->getOperand(1));
 
   for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
-    SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
     // BFS traverse all OR'd operands.
     if (I->getOpcode() == ISD::OR) {
       Opnds.push_back(I->getOperand(0));
@@ -8876,7 +9442,7 @@ SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
 /// equivalent.
 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
                                     SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // CF and OF aren't always set the way we want. Determine which
   // of these we need.
@@ -8907,7 +9473,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
   unsigned NumOperands = 0;
 
   // Truncate operations may prevent the merge of the SETCC instruction
-  // and the arithmetic intruction before it. Attempt to truncate the operands
+  // and the arithmetic instruction before it. Attempt to truncate the operands
   // of the arithmetic instruction and use a reduced bit-width instruction.
   bool NeedTruncation = false;
   SDValue ArithOp = Op;
@@ -9015,7 +9581,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
     case ISD::AND: Opcode = X86ISD::AND; break;
     case ISD::OR: {
       if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
-        SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+        SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG);
         if (EFLAGS.getNode())
           return EFLAGS;
       }
@@ -9091,7 +9657,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
     if (C->getAPIntValue() == 0)
       return EmitTest(Op0, X86CC, DAG);
 
-  DebugLoc dl = Op0.getDebugLoc();
+  SDLoc dl(Op0);
   if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
        Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
     // Use SUB instead of CMP to enable CSE between SUB and CMP.
@@ -9118,7 +9684,7 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
   // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
   // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
   // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
-  DebugLoc dl = Cmp.getDebugLoc();
+  SDLoc dl(Cmp);
   SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
   SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
   SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
@@ -9135,7 +9701,7 @@ static bool isAllOnes(SDValue V) {
 /// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
 /// if it's possible.
 SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
-                                     DebugLoc dl, SelectionDAG &DAG) const {
+                                     SDLoc dl, SelectionDAG &DAG) const {
   SDValue Op0 = And.getOperand(0);
   SDValue Op1 = And.getOperand(1);
   if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -9203,16 +9769,61 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
   return SDValue();
 }
 
+/// \brief - Turns an ISD::CondCode into a value suitable for SSE floating point
+/// mask CMPs.
+static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+                              SDValue &Op1) {
+  unsigned SSECC;
+  bool Swap = false;
+
+  // SSE Condition code mapping:
+  //  0 - EQ
+  //  1 - LT
+  //  2 - LE
+  //  3 - UNORD
+  //  4 - NEQ
+  //  5 - NLT
+  //  6 - NLE
+  //  7 - ORD
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  SSECC = 0; break;
+  case ISD::SETOGT:
+  case ISD::SETGT:  Swap = true; // Fallthrough
+  case ISD::SETLT:
+  case ISD::SETOLT: SSECC = 1; break;
+  case ISD::SETOGE:
+  case ISD::SETGE:  Swap = true; // Fallthrough
+  case ISD::SETLE:
+  case ISD::SETOLE: SSECC = 2; break;
+  case ISD::SETUO:  SSECC = 3; break;
+  case ISD::SETUNE:
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETULE: Swap = true; // Fallthrough
+  case ISD::SETUGE: SSECC = 5; break;
+  case ISD::SETULT: Swap = true; // Fallthrough
+  case ISD::SETUGT: SSECC = 6; break;
+  case ISD::SETO:   SSECC = 7; break;
+  case ISD::SETUEQ:
+  case ISD::SETONE: SSECC = 8; break;
+  }
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  return SSECC;
+}
+
 // Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
 // ones, and then concatenate the result back.
 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
 
   assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
          "Unsupported value type for operation");
 
   unsigned NumElems = VT.getVectorNumElements();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue CC = Op.getOperand(2);
 
   // Extract the LHS vectors
@@ -9233,61 +9844,62 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
                      DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
 }
 
+static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  MVT VT = Op.getSimpleValueType();
+
+  assert(Op0.getValueType().getVectorElementType().getSizeInBits() >= 32 &&
+         Op.getValueType().getScalarType() == MVT::i1 &&
+         "Cannot set masked compare for this operation");
+
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  SDLoc dl(Op);
+
+  bool Unsigned = false;
+  unsigned SSECC;
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Unexpected SETCC condition");
+  case ISD::SETNE:  SSECC = 4; break;
+  case ISD::SETEQ:  SSECC = 0; break;
+  case ISD::SETUGT: Unsigned = true;
+  case ISD::SETGT:  SSECC = 6; break; // NLE
+  case ISD::SETULT: Unsigned = true;
+  case ISD::SETLT:  SSECC = 1; break;
+  case ISD::SETUGE: Unsigned = true;
+  case ISD::SETGE:  SSECC = 5; break; // NLT
+  case ISD::SETULE: Unsigned = true;
+  case ISD::SETLE:  SSECC = 2; break;
+  }
+  unsigned  Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
+  return DAG.getNode(Opc, dl, VT, Op0, Op1,
+                     DAG.getConstant(SSECC, MVT::i8));
+
+}
+
 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
                            SelectionDAG &DAG) {
-  SDValue Cond;
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue CC = Op.getOperand(2);
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
   ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
-  bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
-  DebugLoc dl = Op.getDebugLoc();
+  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
+  SDLoc dl(Op);
 
   if (isFP) {
 #ifndef NDEBUG
-    MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
+    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
     assert(EltVT == MVT::f32 || EltVT == MVT::f64);
 #endif
 
-    unsigned SSECC;
-    bool Swap = false;
-
-    // SSE Condition code mapping:
-    //  0 - EQ
-    //  1 - LT
-    //  2 - LE
-    //  3 - UNORD
-    //  4 - NEQ
-    //  5 - NLT
-    //  6 - NLE
-    //  7 - ORD
-    switch (SetCCOpcode) {
-    default: llvm_unreachable("Unexpected SETCC condition");
-    case ISD::SETOEQ:
-    case ISD::SETEQ:  SSECC = 0; break;
-    case ISD::SETOGT:
-    case ISD::SETGT: Swap = true; // Fallthrough
-    case ISD::SETLT:
-    case ISD::SETOLT: SSECC = 1; break;
-    case ISD::SETOGE:
-    case ISD::SETGE: Swap = true; // Fallthrough
-    case ISD::SETLE:
-    case ISD::SETOLE: SSECC = 2; break;
-    case ISD::SETUO:  SSECC = 3; break;
-    case ISD::SETUNE:
-    case ISD::SETNE:  SSECC = 4; break;
-    case ISD::SETULE: Swap = true; // Fallthrough
-    case ISD::SETUGE: SSECC = 5; break;
-    case ISD::SETULT: Swap = true; // Fallthrough
-    case ISD::SETUGT: SSECC = 6; break;
-    case ISD::SETO:   SSECC = 7; break;
-    case ISD::SETUEQ:
-    case ISD::SETONE: SSECC = 8; break;
-    }
-    if (Swap)
-      std::swap(Op0, Op1);
-
+    unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
+    unsigned Opc = X86ISD::CMPP;
+    if (Subtarget->hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+      assert(VT.getVectorNumElements() <= 16);
+      Opc = X86ISD::CMPM;
+    }
     // In the two special cases we can't handle, emit two comparisons.
     if (SSECC == 8) {
       unsigned CC0, CC1;
@@ -9299,14 +9911,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
         CC0 = 7; CC1 = 4; CombineOpc = ISD::AND;
       }
 
-      SDValue Cmp0 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC0, MVT::i8));
-      SDValue Cmp1 = DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                  DAG.getConstant(CC1, MVT::i8));
       return DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
     }
     // Handle all other FP comparisons here.
-    return DAG.getNode(X86ISD::CMPP, dl, VT, Op0, Op1,
+    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, MVT::i8));
   }
 
@@ -9314,25 +9926,63 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   if (VT.is256BitVector() && !Subtarget->hasInt256())
     return Lower256IntVSETCC(Op, DAG);
 
+  bool MaskResult = (VT.getVectorElementType() == MVT::i1);
+  EVT OpVT = Op1.getValueType();
+  if (Subtarget->hasAVX512()) {
+    if (Op1.getValueType().is512BitVector() ||
+        (MaskResult && OpVT.getVectorElementType().getSizeInBits() >= 32))
+      return LowerIntVSETCC_AVX512(Op, DAG);
+
+    // In AVX-512 architecture setcc returns mask with i1 elements,
+    // But there is no compare instruction for i8 and i16 elements.
+    // We are not talking about 512-bit operands in this case, these
+    // types are illegal.
+    if (MaskResult &&
+        (OpVT.getVectorElementType().getSizeInBits() < 32 &&
+         OpVT.getVectorElementType().getSizeInBits() >= 8))
+      return DAG.getNode(ISD::TRUNCATE, dl, VT,
+                         DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
+  }
+
   // We are handling one of the integer comparisons here.  Since SSE only has
   // GT and EQ comparisons for integer, swapping operands and multiple
   // operations may be required for some comparisons.
   unsigned Opc;
-  bool Swap = false, Invert = false, FlipSigns = false;
-
+  bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
+  
   switch (SetCCOpcode) {
   default: llvm_unreachable("Unexpected SETCC condition");
   case ISD::SETNE:  Invert = true;
-  case ISD::SETEQ:  Opc = X86ISD::PCMPEQ; break;
+  case ISD::SETEQ:  Opc = MaskResult? X86ISD::PCMPEQM: X86ISD::PCMPEQ; break;
   case ISD::SETLT:  Swap = true;
-  case ISD::SETGT:  Opc = X86ISD::PCMPGT; break;
+  case ISD::SETGT:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT; break;
   case ISD::SETGE:  Swap = true;
-  case ISD::SETLE:  Opc = X86ISD::PCMPGT; Invert = true; break;
+  case ISD::SETLE:  Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    Invert = true; break;
   case ISD::SETULT: Swap = true;
-  case ISD::SETUGT: Opc = X86ISD::PCMPGT; FlipSigns = true; break;
+  case ISD::SETUGT: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    FlipSigns = true; break;
   case ISD::SETUGE: Swap = true;
-  case ISD::SETULE: Opc = X86ISD::PCMPGT; FlipSigns = true; Invert = true; break;
+  case ISD::SETULE: Opc = MaskResult? X86ISD::PCMPGTM: X86ISD::PCMPGT;
+                    FlipSigns = true; Invert = true; break;
+  }
+  
+  // Special case: Use min/max operations for SETULE/SETUGE
+  MVT VET = VT.getVectorElementType();
+  bool hasMinMax =
+       (Subtarget->hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
+    || (Subtarget->hasSSE2()  && (VET == MVT::i8));
+  
+  if (hasMinMax) {
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETULE: Opc = X86ISD::UMIN; MinMax = true; break;
+    case ISD::SETUGE: Opc = X86ISD::UMAX; MinMax = true; break;
+    }
+    
+    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
   }
+  
   if (Swap)
     std::swap(Op0, Op1);
 
@@ -9366,8 +10016,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
       SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
 
       // Create masks for only the low parts/high parts of the 64 bit integers.
-      const int MaskHi[] = { 1, 1, 3, 3 };
-      const int MaskLo[] = { 0, 0, 2, 2 };
+      static const int MaskHi[] = { 1, 1, 3, 3 };
+      static const int MaskLo[] = { 0, 0, 2, 2 };
       SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
       SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
       SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
@@ -9394,7 +10044,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
       SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
 
       // Make sure the lower and upper halves are both all-ones.
-      const int Mask[] = { 1, 0, 3, 2 };
+      static const int Mask[] = { 1, 0, 3, 2 };
       SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
       Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
 
@@ -9419,20 +10069,23 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
   // If the logical-not of the result is required, perform that now.
   if (Invert)
     Result = DAG.getNOT(dl, Result, VT);
+  
+  if (MinMax)
+    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
 
   return Result;
 }
 
 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 
-  MVT VT = Op.getValueType().getSimpleVT();
+  MVT VT = Op.getSimpleValueType();
 
   if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
 
   assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
   SDValue Op0 = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
 
   // Optimize to BT if possible.
@@ -9469,7 +10122,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
     }
   }
 
-  bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
+  bool isFP = Op1.getSimpleValueType().isFloatingPoint();
   unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
   if (X86CC == X86::COND_INVALID)
     return SDValue();
@@ -9526,9 +10179,31 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond  = Op.getOperand(0);
   SDValue Op1 = Op.getOperand(1);
   SDValue Op2 = Op.getOperand(2);
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
+  EVT VT = Op1.getValueType();
   SDValue CC;
 
+  // Lower fp selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
+  // are available. Otherwise fp cmovs get lowered into a less efficient branch
+  // sequence later on.
+  if (Cond.getOpcode() == ISD::SETCC &&
+      ((Subtarget->hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
+       (Subtarget->hasSSE1() && VT == MVT::f32)) &&
+      VT == Cond.getOperand(0).getValueType() && Cond->hasOneUse()) {
+    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
+    int SSECC = translateX86FSETCC(
+        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+
+    if (SSECC != 8) {
+      unsigned Opcode = VT == MVT::f32 ? X86ISD::FSETCCss : X86ISD::FSETCCsd;
+      SDValue Cmp = DAG.getNode(Opcode, DL, VT, CondOp0, CondOp1,
+                                DAG.getConstant(SSECC, MVT::i8));
+      SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
+      SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
+      return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
+    }
+  }
+
   if (Cond.getOpcode() == ISD::SETCC) {
     SDValue NewCond = LowerSETCC(Cond, DAG);
     if (NewCond.getNode())
@@ -9602,7 +10277,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
     SDValue Cmp = Cond.getOperand(1);
     unsigned Opc = Cmp.getOpcode();
-    MVT VT = Op.getValueType().getSimpleVT();
+    MVT VT = Op.getSimpleValueType();
 
     bool IllegalFPCMov = false;
     if (VT.isFloatingPoint() && !VT.isVector() &&
@@ -9711,15 +10386,50 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
 }
 
-SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  MVT VT = Op->getValueType(0).getSimpleVT();
+static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
   SDValue In = Op->getOperand(0);
-  MVT InVT = In.getValueType().getSimpleVT();
-  DebugLoc dl = Op->getDebugLoc();
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  unsigned int NumElts = VT.getVectorNumElements();
+  if (NumElts != 8 && NumElts != 16)
+    return SDValue();
+
+  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
+    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
+
+  MVT ExtVT = (NumElts == 8) ? MVT::v8i64 : MVT::v16i32;
+  Constant *C = ConstantInt::get(*DAG.getContext(),
+    APInt::getAllOnesValue(ExtVT.getScalarType().getSizeInBits()));
+
+  SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
+  unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
+  SDValue Ld = DAG.getLoad(ExtVT.getScalarType(), dl, DAG.getEntryNode(), CP,
+                          MachinePointerInfo::getConstantPool(),
+                          false, false, false, Alignment);
+  SDValue Brcst = DAG.getNode(X86ISD::VBROADCASTM, dl, ExtVT, In, Ld);
+  if (VT.is512BitVector())
+    return Brcst;
+  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Brcst);
+}
+
+static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget *Subtarget,
+                                SelectionDAG &DAG) {
+  MVT VT = Op->getSimpleValueType(0);
+  SDValue In = Op->getOperand(0);
+  MVT InVT = In.getSimpleValueType();
+  SDLoc dl(Op);
+
+  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
+    return LowerSIGN_EXTEND_AVX512(Op, DAG);
 
   if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
-      (VT != MVT::v8i32 || InVT != MVT::v8i16))
+      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+      (VT != MVT::v16i16 || InVT != MVT::v16i8))
     return SDValue();
 
   if (Subtarget->hasInt256())
@@ -9789,7 +10499,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDValue Cond  = Op.getOperand(1);
   SDValue Dest  = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue CC;
   bool Inverted = false;
 
@@ -10059,12 +10769,13 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
          "This should be used only on Windows targets or when segmented stacks "
          "are being used");
   assert(!Subtarget->isTargetEnvMacho() && "Not implemented");
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Get the inputs.
   SDValue Chain = Op.getOperand(0);
   SDValue Size  = Op.getOperand(1);
-  // FIXME: Ensure alignment here
+  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  EVT VT = Op.getNode()->getValueType(0);
 
   bool Is64Bit = Subtarget->is64Bit();
   EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
@@ -10102,12 +10813,20 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
     Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
-    Flag = Chain.getValue(1);
 
-    Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
-                               SPTy).getValue(1);
+    const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+    unsigned SPReg = RegInfo->getStackRegister();
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
+    Chain = SP.getValue(1);
 
-    SDValue Ops1[2] = { Chain.getValue(0), Chain };
+    if (Align) {
+      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
+                       DAG.getConstant(-(uint64_t)Align, VT));
+      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
+    }
+
+    SDValue Ops1[2] = { SP, Chain };
     return DAG.getMergeValues(Ops1, 2, dl);
   }
 }
@@ -10117,7 +10836,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
 
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   if (!Subtarget->is64Bit() || Subtarget->isTargetWin64()) {
     // vastart just stores the address of the VarArgsFrameIndex slot into the
@@ -10184,7 +10903,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   SDValue SrcPtr = Op.getOperand(1);
   const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
   unsigned Align = Op.getConstantOperandVal(3);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   EVT ArgVT = Op.getNode()->getValueType(0);
   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
@@ -10250,7 +10969,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
   SDValue SrcPtr = Op.getOperand(2);
   const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
   const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                        DAG.getIntPtrConstant(24), 8, /*isVolatile*/false,
@@ -10258,25 +10977,37 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
                        MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
 }
 
+// getTargetVShiftByConstNode - Handle vector element shifts where the shift
+// amount is a constant. Takes immediate version of shift as input.
+static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, EVT VT,
+                                          SDValue SrcOp, uint64_t ShiftAmt,
+                                          SelectionDAG &DAG) {
+
+  // Check for ShiftAmt >= element width
+  if (ShiftAmt >= VT.getVectorElementType().getSizeInBits()) {
+    if (Opc == X86ISD::VSRAI)
+      ShiftAmt = VT.getVectorElementType().getSizeInBits() - 1;
+    else
+      return DAG.getConstant(0, VT);
+  }
+
+  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
+         && "Unknown target vector shift-by-constant node");
+
+  return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
+}
+
 // getTargetVShiftNode - Handle vector element shifts where the shift amount
 // may or may not be a constant. Takes immediate version of shift as input.
-static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
+static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, EVT VT,
                                    SDValue SrcOp, SDValue ShAmt,
                                    SelectionDAG &DAG) {
   assert(ShAmt.getValueType() == MVT::i32 && "ShAmt is not i32");
 
-  if (isa<ConstantSDNode>(ShAmt)) {
-    // Constant may be a TargetConstant. Use a regular constant.
-    uint32_t ShiftAmt = cast<ConstantSDNode>(ShAmt)->getZExtValue();
-    switch (Opc) {
-      default: llvm_unreachable("Unknown target vector shift node");
-      case X86ISD::VSHLI:
-      case X86ISD::VSRLI:
-      case X86ISD::VSRAI:
-        return DAG.getNode(Opc, dl, VT, SrcOp,
-                           DAG.getConstant(ShiftAmt, MVT::i32));
-    }
-  }
+  // Catch shift-by-constant.
+  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
+    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
+                                      CShAmt->getZExtValue(), DAG);
 
   // Change opcode to non-immediate version
   switch (Opc) {
@@ -10304,7 +11035,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
 }
 
 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   switch (IntNo) {
   default: return SDValue();    // Don't custom lower most intrinsics.
@@ -10479,24 +11210,32 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_avx2_pmaxu_b:
   case Intrinsic::x86_avx2_pmaxu_w:
   case Intrinsic::x86_avx2_pmaxu_d:
+  case Intrinsic::x86_avx512_pmaxu_d:
+  case Intrinsic::x86_avx512_pmaxu_q:
   case Intrinsic::x86_sse2_pminu_b:
   case Intrinsic::x86_sse41_pminuw:
   case Intrinsic::x86_sse41_pminud:
   case Intrinsic::x86_avx2_pminu_b:
   case Intrinsic::x86_avx2_pminu_w:
   case Intrinsic::x86_avx2_pminu_d:
+  case Intrinsic::x86_avx512_pminu_d:
+  case Intrinsic::x86_avx512_pminu_q:
   case Intrinsic::x86_sse41_pmaxsb:
   case Intrinsic::x86_sse2_pmaxs_w:
   case Intrinsic::x86_sse41_pmaxsd:
   case Intrinsic::x86_avx2_pmaxs_b:
   case Intrinsic::x86_avx2_pmaxs_w:
   case Intrinsic::x86_avx2_pmaxs_d:
+  case Intrinsic::x86_avx512_pmaxs_d:
+  case Intrinsic::x86_avx512_pmaxs_q:
   case Intrinsic::x86_sse41_pminsb:
   case Intrinsic::x86_sse2_pmins_w:
   case Intrinsic::x86_sse41_pminsd:
   case Intrinsic::x86_avx2_pmins_b:
   case Intrinsic::x86_avx2_pmins_w:
-  case Intrinsic::x86_avx2_pmins_d: {
+  case Intrinsic::x86_avx2_pmins_d: 
+  case Intrinsic::x86_avx512_pmins_d:
+  case Intrinsic::x86_avx512_pmins_q: {
     unsigned Opcode;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -10506,6 +11245,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmaxu_b:
     case Intrinsic::x86_avx2_pmaxu_w:
     case Intrinsic::x86_avx2_pmaxu_d:
+    case Intrinsic::x86_avx512_pmaxu_d:
+    case Intrinsic::x86_avx512_pmaxu_q:
       Opcode = X86ISD::UMAX;
       break;
     case Intrinsic::x86_sse2_pminu_b:
@@ -10514,6 +11255,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pminu_b:
     case Intrinsic::x86_avx2_pminu_w:
     case Intrinsic::x86_avx2_pminu_d:
+    case Intrinsic::x86_avx512_pminu_d:
+    case Intrinsic::x86_avx512_pminu_q:
       Opcode = X86ISD::UMIN;
       break;
     case Intrinsic::x86_sse41_pmaxsb:
@@ -10522,6 +11265,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmaxs_b:
     case Intrinsic::x86_avx2_pmaxs_w:
     case Intrinsic::x86_avx2_pmaxs_d:
+    case Intrinsic::x86_avx512_pmaxs_d:
+    case Intrinsic::x86_avx512_pmaxs_q:
       Opcode = X86ISD::SMAX;
       break;
     case Intrinsic::x86_sse41_pminsb:
@@ -10530,6 +11275,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_avx2_pmins_b:
     case Intrinsic::x86_avx2_pmins_w:
     case Intrinsic::x86_avx2_pmins_d:
+    case Intrinsic::x86_avx512_pmins_d:
+    case Intrinsic::x86_avx512_pmins_q:
       Opcode = X86ISD::SMIN;
       break;
     }
@@ -10542,10 +11289,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_sse2_max_pd:
   case Intrinsic::x86_avx_max_ps_256:
   case Intrinsic::x86_avx_max_pd_256:
+  case Intrinsic::x86_avx512_max_ps_512:
+  case Intrinsic::x86_avx512_max_pd_512:
   case Intrinsic::x86_sse_min_ps:
   case Intrinsic::x86_sse2_min_pd:
   case Intrinsic::x86_avx_min_ps_256:
-  case Intrinsic::x86_avx_min_pd_256: {
+  case Intrinsic::x86_avx_min_pd_256:
+  case Intrinsic::x86_avx512_min_ps_512:
+  case Intrinsic::x86_avx512_min_pd_512:  {
     unsigned Opcode;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -10553,12 +11304,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_sse2_max_pd:
     case Intrinsic::x86_avx_max_ps_256:
     case Intrinsic::x86_avx_max_pd_256:
+    case Intrinsic::x86_avx512_max_ps_512:
+    case Intrinsic::x86_avx512_max_pd_512:
       Opcode = X86ISD::FMAX;
       break;
     case Intrinsic::x86_sse_min_ps:
     case Intrinsic::x86_sse2_min_pd:
     case Intrinsic::x86_avx_min_ps_256:
     case Intrinsic::x86_avx_min_pd_256:
+    case Intrinsic::x86_avx512_min_ps_512:
+    case Intrinsic::x86_avx512_min_pd_512:
       Opcode = X86ISD::FMIN;
       break;
     }
@@ -10629,7 +11384,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_avx2_permd:
   case Intrinsic::x86_avx2_permps:
     // Operands intentionally swapped. Mask is last operand to intrinsic,
-    // but second operand for node/intruction.
+    // but second operand for node/instruction.
     return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                        Op.getOperand(2), Op.getOperand(1));
 
@@ -10704,6 +11459,16 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
+  case Intrinsic::x86_avx512_kortestz:
+  case Intrinsic::x86_avx512_kortestc: {
+    unsigned X86CC = (IntNo == Intrinsic::x86_avx512_kortestz)? X86::COND_E: X86::COND_B;
+    SDValue LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, Op.getOperand(2));
+    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
+    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
+    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
+    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
+  }
 
   // SSE/AVX shift intrinsics
   case Intrinsic::x86_sse2_psll_w:
@@ -10854,8 +11619,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
       X86CC = X86::COND_E;
       break;
     }
-    SmallVector<SDValue, 5> NewOps;
-    NewOps.append(Op->op_begin()+1, Op->op_end());
+    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
     SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -10872,8 +11636,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     else
       Opcode = X86ISD::PCMPESTRI;
 
-    SmallVector<SDValue, 5> NewOps;
-    NewOps.append(Op->op_begin()+1, Op->op_end());
+    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
     return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
   }
@@ -10900,7 +11663,19 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   case Intrinsic::x86_fma_vfmaddsub_ps_256:
   case Intrinsic::x86_fma_vfmaddsub_pd_256:
   case Intrinsic::x86_fma_vfmsubadd_ps_256:
-  case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+  case Intrinsic::x86_fma_vfmsubadd_pd_256:
+  case Intrinsic::x86_fma_vfmadd_ps_512:
+  case Intrinsic::x86_fma_vfmadd_pd_512:
+  case Intrinsic::x86_fma_vfmsub_ps_512:
+  case Intrinsic::x86_fma_vfmsub_pd_512:
+  case Intrinsic::x86_fma_vfnmadd_ps_512:
+  case Intrinsic::x86_fma_vfnmadd_pd_512:
+  case Intrinsic::x86_fma_vfnmsub_ps_512:
+  case Intrinsic::x86_fma_vfnmsub_pd_512:
+  case Intrinsic::x86_fma_vfmaddsub_ps_512:
+  case Intrinsic::x86_fma_vfmaddsub_pd_512:
+  case Intrinsic::x86_fma_vfmsubadd_ps_512:
+  case Intrinsic::x86_fma_vfmsubadd_pd_512: { 
     unsigned Opc;
     switch (IntNo) {
     default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
@@ -10908,36 +11683,48 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     case Intrinsic::x86_fma_vfmadd_pd:
     case Intrinsic::x86_fma_vfmadd_ps_256:
     case Intrinsic::x86_fma_vfmadd_pd_256:
+    case Intrinsic::x86_fma_vfmadd_ps_512:
+    case Intrinsic::x86_fma_vfmadd_pd_512:
       Opc = X86ISD::FMADD;
       break;
     case Intrinsic::x86_fma_vfmsub_ps:
     case Intrinsic::x86_fma_vfmsub_pd:
     case Intrinsic::x86_fma_vfmsub_ps_256:
     case Intrinsic::x86_fma_vfmsub_pd_256:
+    case Intrinsic::x86_fma_vfmsub_ps_512:
+    case Intrinsic::x86_fma_vfmsub_pd_512:
       Opc = X86ISD::FMSUB;
       break;
     case Intrinsic::x86_fma_vfnmadd_ps:
     case Intrinsic::x86_fma_vfnmadd_pd:
     case Intrinsic::x86_fma_vfnmadd_ps_256:
     case Intrinsic::x86_fma_vfnmadd_pd_256:
+    case Intrinsic::x86_fma_vfnmadd_ps_512:
+    case Intrinsic::x86_fma_vfnmadd_pd_512:
       Opc = X86ISD::FNMADD;
       break;
     case Intrinsic::x86_fma_vfnmsub_ps:
     case Intrinsic::x86_fma_vfnmsub_pd:
     case Intrinsic::x86_fma_vfnmsub_ps_256:
     case Intrinsic::x86_fma_vfnmsub_pd_256:
+    case Intrinsic::x86_fma_vfnmsub_ps_512:
+    case Intrinsic::x86_fma_vfnmsub_pd_512:
       Opc = X86ISD::FNMSUB;
       break;
     case Intrinsic::x86_fma_vfmaddsub_ps:
     case Intrinsic::x86_fma_vfmaddsub_pd:
     case Intrinsic::x86_fma_vfmaddsub_ps_256:
     case Intrinsic::x86_fma_vfmaddsub_pd_256:
+    case Intrinsic::x86_fma_vfmaddsub_ps_512:
+    case Intrinsic::x86_fma_vfmaddsub_pd_512:
       Opc = X86ISD::FMADDSUB;
       break;
     case Intrinsic::x86_fma_vfmsubadd_ps:
     case Intrinsic::x86_fma_vfmsubadd_pd:
     case Intrinsic::x86_fma_vfmsubadd_ps_256:
     case Intrinsic::x86_fma_vfmsubadd_pd_256:
+    case Intrinsic::x86_fma_vfmsubadd_ps_512:
+    case Intrinsic::x86_fma_vfmsubadd_pd_512:
       Opc = X86ISD::FMSUBADD;
       break;
     }
@@ -10948,8 +11735,88 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
   }
 }
 
-static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
+static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                             SDValue Base, SDValue Index,
+                             SDValue ScaleOp, SDValue Chain,
+                             const X86Subtarget * Subtarget) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 
+  EVT MaskVT = MVT::getVectorVT(MVT::i1, 
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Mask, SDValue Base,
+                              SDValue Index, SDValue ScaleOp, SDValue Chain,
+                              const X86Subtarget * Subtarget) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  if (Src.getOpcode() == ISD::UNDEF)
+    Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl); 
+  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
+  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+}
+
+static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                              SDValue Src, SDValue Base, SDValue Index,
+                              SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
+  SDLoc dl(Op);
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
+  assert(C && "Invalid scale type");
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
+  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  EVT MaskVT = MVT::getVectorVT(MVT::i1,
+                                Index.getValueType().getVectorNumElements());
+  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  return SDValue(Res, 1);
+}
+
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
+  SDLoc dl(Op);
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
   switch (IntNo) {
   default: return SDValue();    // Don't custom lower most intrinsics.
@@ -10983,7 +11850,144 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                        SDValue(Result.getNode(), 2));
   }
-
+  //int_gather(index, base, scale);
+  case Intrinsic::x86_avx512_gather_qpd_512:
+  case Intrinsic::x86_avx512_gather_qps_512:
+  case Intrinsic::x86_avx512_gather_dpd_512:
+  case Intrinsic::x86_avx512_gather_qpi_512:
+  case Intrinsic::x86_avx512_gather_qpq_512:
+  case Intrinsic::x86_avx512_gather_dpq_512:
+  case Intrinsic::x86_avx512_gather_dps_512:
+  case Intrinsic::x86_avx512_gather_dpi_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
+      case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
+      case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Index = Op.getOperand(2);
+    SDValue Base  = Op.getOperand(3);
+    SDValue Scale = Op.getOperand(4);
+    return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
+  }
+  //int_gather_mask(v1, mask, index, base, scale);
+  case Intrinsic::x86_avx512_gather_qps_mask_512:
+  case Intrinsic::x86_avx512_gather_qpd_mask_512:
+  case Intrinsic::x86_avx512_gather_dpd_mask_512:
+  case Intrinsic::x86_avx512_gather_dps_mask_512:
+  case Intrinsic::x86_avx512_gather_qpi_mask_512:
+  case Intrinsic::x86_avx512_gather_qpq_mask_512:
+  case Intrinsic::x86_avx512_gather_dpi_mask_512:
+  case Intrinsic::x86_avx512_gather_dpq_mask_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_gather_qps_mask_512: 
+        Opc = X86::VGATHERQPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpd_mask_512:
+        Opc = X86::VGATHERQPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpd_mask_512:
+        Opc = X86::VGATHERDPDZrm; break;
+      case Intrinsic::x86_avx512_gather_dps_mask_512:
+        Opc = X86::VGATHERDPSZrm; break;
+      case Intrinsic::x86_avx512_gather_qpi_mask_512:
+        Opc = X86::VPGATHERQDZrm; break;
+      case Intrinsic::x86_avx512_gather_qpq_mask_512:
+        Opc = X86::VPGATHERQQZrm; break;
+      case Intrinsic::x86_avx512_gather_dpi_mask_512:
+        Opc = X86::VPGATHERDDZrm; break;
+      case Intrinsic::x86_avx512_gather_dpq_mask_512:
+        Opc = X86::VPGATHERDQZrm; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Src   = Op.getOperand(2);
+    SDValue Mask  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Base  = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+                          Subtarget);
+  }
+  //int_scatter(base, index, v1, scale);
+  case Intrinsic::x86_avx512_scatter_qpd_512:
+  case Intrinsic::x86_avx512_scatter_qps_512:
+  case Intrinsic::x86_avx512_scatter_dpd_512:
+  case Intrinsic::x86_avx512_scatter_qpi_512:
+  case Intrinsic::x86_avx512_scatter_qpq_512:
+  case Intrinsic::x86_avx512_scatter_dpq_512:
+  case Intrinsic::x86_avx512_scatter_dps_512:
+  case Intrinsic::x86_avx512_scatter_dpi_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_scatter_qpd_512: 
+        Opc = X86::VSCATTERQPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qps_512:
+        Opc = X86::VSCATTERQPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpd_512:
+        Opc = X86::VSCATTERDPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_dps_512:
+        Opc = X86::VSCATTERDPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpi_512:
+        Opc = X86::VPSCATTERQDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpq_512:
+        Opc = X86::VPSCATTERQQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpq_512:
+        Opc = X86::VPSCATTERDQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpi_512:
+        Opc = X86::VPSCATTERDDZmr; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base  = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Src   = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
+  }
+  //int_scatter_mask(base, mask, index, v1, scale);
+  case Intrinsic::x86_avx512_scatter_qps_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpd_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+  case Intrinsic::x86_avx512_scatter_dps_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+  case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+  case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
+    unsigned Opc;
+    switch (IntNo) {
+      default: llvm_unreachable("Unexpected intrinsic!");
+      case Intrinsic::x86_avx512_scatter_qpd_mask_512: 
+        Opc = X86::VSCATTERQPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qps_mask_512:
+        Opc = X86::VSCATTERQPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpd_mask_512:
+        Opc = X86::VSCATTERDPDZmr; break;
+      case Intrinsic::x86_avx512_scatter_dps_mask_512:
+        Opc = X86::VSCATTERDPSZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpi_mask_512:
+        Opc = X86::VPSCATTERQDZmr; break;
+      case Intrinsic::x86_avx512_scatter_qpq_mask_512:
+        Opc = X86::VPSCATTERQQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpq_mask_512:
+        Opc = X86::VPSCATTERDQZmr; break;
+      case Intrinsic::x86_avx512_scatter_dpi_mask_512:
+        Opc = X86::VPSCATTERDDZmr; break;
+    }
+    SDValue Chain = Op.getOperand(0);
+    SDValue Base  = Op.getOperand(2);
+    SDValue Mask  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Src   = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+  }
   // XTEST intrinsics.
   case Intrinsic::x86_xtest: {
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
@@ -11004,13 +12008,14 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
   MFI->setReturnAddressIsTaken(true);
 
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT PtrVT = getPointerTy();
 
   if (Depth > 0) {
     SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    SDValue Offset =
-      DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+    const X86RegisterInfo *RegInfo =
+      static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
+    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
     return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                        DAG.getNode(ISD::ADD, dl, PtrVT,
                                    FrameAddr, Offset),
@@ -11028,8 +12033,10 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   MFI->setFrameAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
           (FrameReg == X86::EBP && VT == MVT::i32)) &&
@@ -11044,6 +12051,8 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                      SelectionDAG &DAG) const {
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
 }
 
@@ -11051,9 +12060,11 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain     = Op.getOperand(0);
   SDValue Offset    = Op.getOperand(1);
   SDValue Handler   = Op.getOperand(2);
-  DebugLoc dl       = Op.getDebugLoc();
+  SDLoc dl      (Op);
 
   EVT PtrVT = getPointerTy();
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
   assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
           (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
@@ -11074,7 +12085,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                      DAG.getVTList(MVT::i32, MVT::Other),
                      Op.getOperand(0), Op.getOperand(1));
@@ -11082,7 +12093,7 @@ SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
 
 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                 SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                      Op.getOperand(0), Op.getOperand(1));
 }
@@ -11097,7 +12108,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   SDValue Trmp = Op.getOperand(1); // trampoline
   SDValue FPtr = Op.getOperand(2); // nested function
   SDValue Nest = Op.getOperand(3); // 'nest' parameter value
-  DebugLoc dl  = Op.getDebugLoc();
+  SDLoc dl (Op);
 
   const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
   const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
@@ -11267,7 +12278,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   const TargetFrameLowering &TFI = *TM.getFrameLowering();
   unsigned StackAlignment = TFI.getStackAlignment();
   EVT VT = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   // Save FP Control Word to stack slot
   int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
@@ -11314,7 +12325,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   EVT OpVT = VT;
   unsigned NumBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   Op = Op.getOperand(0);
   if (VT == MVT::i8) {
@@ -11348,7 +12359,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   EVT OpVT = VT;
   unsigned NumBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   Op = Op.getOperand(0);
   if (VT == MVT::i8) {
@@ -11372,7 +12383,7 @@ static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   unsigned NumBits = VT.getSizeInBits();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   Op = Op.getOperand(0);
 
   // Issue a bsf (scan bits forward) which also sets EFLAGS.
@@ -11398,7 +12409,7 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
          "Unsupported value type for operation");
 
   unsigned NumElems = VT.getVectorNumElements();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Extract the LHS vectors
   SDValue LHS = Op.getOperand(0);
@@ -11434,7 +12445,7 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
 
 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT VT = Op.getValueType();
 
   // Decompose 256-bit ops into smaller 128-bit ops.
@@ -11450,7 +12461,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
            "Should not custom lower when pmuldq is available!");
 
     // Extract the odd parts.
-    const int UnpackMask[] = { 1, -1, 3, -1 };
+    static const int UnpackMask[] = { 1, -1, 3, -1 };
     SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
     SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
 
@@ -11464,12 +12475,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
 
     // Merge the two vectors back together with a shuffle. This expands into 2
     // shuffles.
-    const int ShufMask[] = { 0, 4, 2, 6 };
+    static const int ShufMask[] = { 0, 4, 2, 6 };
     return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
   }
 
-  assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
-         "Only know how to lower V2I64/V4I64 multiply");
+  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
+         "Only know how to lower V2I64/V4I64/V8I64 multiply");
 
   //  Ahi = psrlqi(a, 32);
   //  Bhi = psrlqi(b, 32);
@@ -11482,13 +12493,12 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   //  AhiBlo = psllqi(AhiBlo, 32);
   //  return AloBlo + AloBhi + AhiBlo;
 
-  SDValue ShAmt = DAG.getConstant(32, MVT::i32);
-
-  SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
-  SDValue Bhi = DAG.getNode(X86ISD::VSRLI, dl, VT, B, ShAmt);
+  SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
+  SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
 
   // Bit cast to 32-bit vectors for MULUDQ
-  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 : MVT::v8i32;
+  EVT MulVT = (VT == MVT::v2i64) ? MVT::v4i32 :
+                                  (VT == MVT::v4i64) ? MVT::v8i32 : MVT::v16i32;
   A = DAG.getNode(ISD::BITCAST, dl, MulVT, A);
   B = DAG.getNode(ISD::BITCAST, dl, MulVT, B);
   Ahi = DAG.getNode(ISD::BITCAST, dl, MulVT, Ahi);
@@ -11498,19 +12508,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   SDValue AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
   SDValue AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
 
-  AloBhi = DAG.getNode(X86ISD::VSHLI, dl, VT, AloBhi, ShAmt);
-  AhiBlo = DAG.getNode(X86ISD::VSHLI, dl, VT, AhiBlo, ShAmt);
+  AloBhi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AloBhi, 32, DAG);
+  AhiBlo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, AhiBlo, 32, DAG);
 
   SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
 }
 
-SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
   EVT VT = Op.getValueType();
   EVT EltTy = VT.getVectorElementType();
   unsigned NumElts = VT.getVectorNumElements();
   SDValue N0 = Op.getOperand(0);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   // Lower sdiv X, pow2-const.
   BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
@@ -11518,23 +12528,35 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
     return SDValue();
 
   APInt SplatValue, SplatUndef;
-  unsigned MinSplatBits;
+  unsigned SplatBitSize;
   bool HasAnyUndefs;
-  if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs))
+  if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                          HasAnyUndefs) ||
+      EltTy.getSizeInBits() < SplatBitSize)
     return SDValue();
 
   if ((SplatValue != 0) &&
       (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
-    unsigned lg2 = SplatValue.countTrailingZeros();
+    unsigned Lg2 = SplatValue.countTrailingZeros();
     // Splat the sign bit.
-    SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
-    SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
+    SmallVector<SDValue, 16> Sz(NumElts,
+                                DAG.getConstant(EltTy.getSizeInBits() - 1,
+                                                EltTy));
+    SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
+                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
+                                          NumElts));
     // Add (N0 < 0) ? abs2 - 1 : 0;
-    SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
-    SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
+    SmallVector<SDValue, 16> Amt(NumElts,
+                                 DAG.getConstant(EltTy.getSizeInBits() - Lg2,
+                                                 EltTy));
+    SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
+                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
+                                          NumElts));
     SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
-    SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
-    SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
+    SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
+    SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
+                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
+                                          NumElts));
 
     // If we're dividing by a positive value, we're done.  Otherwise, we must
     // negate the result.
@@ -11551,7 +12573,7 @@ SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                          const X86Subtarget *Subtarget) {
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
 
@@ -11563,23 +12585,26 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
 
       if (VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
           (Subtarget->hasInt256() &&
-           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16))) {
+           (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+          (Subtarget->hasAVX512() &&
+           (VT == MVT::v8i64 || VT == MVT::v16i32))) {
         if (Op.getOpcode() == ISD::SHL)
-          return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
-                             DAG.getConstant(ShiftAmt, MVT::i32));
+          return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                            DAG);
         if (Op.getOpcode() == ISD::SRL)
-          return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
-                             DAG.getConstant(ShiftAmt, MVT::i32));
+          return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                            DAG);
         if (Op.getOpcode() == ISD::SRA && VT != MVT::v2i64 && VT != MVT::v4i64)
-          return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
-                             DAG.getConstant(ShiftAmt, MVT::i32));
+          return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                            DAG);
       }
 
       if (VT == MVT::v16i8) {
         if (Op.getOpcode() == ISD::SHL) {
           // Make a large shift.
-          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                   MVT::v8i16, R, ShiftAmt,
+                                                   DAG); 
           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
           // Zero out the rightmost bits.
           SmallVector<SDValue, 16> V(16,
@@ -11590,8 +12615,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
         }
         if (Op.getOpcode() == ISD::SRL) {
           // Make a large shift.
-          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v8i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                   MVT::v8i16, R, ShiftAmt,
+                                                   DAG);
           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
           // Zero out the leftmost bits.
           SmallVector<SDValue, 16> V(16,
@@ -11622,8 +12648,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       if (Subtarget->hasInt256() && VT == MVT::v32i8) {
         if (Op.getOpcode() == ISD::SHL) {
           // Make a large shift.
-          SDValue SHL = DAG.getNode(X86ISD::VSHLI, dl, MVT::v16i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl,
+                                                   MVT::v16i16, R, ShiftAmt,
+                                                   DAG);
           SHL = DAG.getNode(ISD::BITCAST, dl, VT, SHL);
           // Zero out the rightmost bits.
           SmallVector<SDValue, 32> V(32,
@@ -11634,8 +12661,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
         }
         if (Op.getOpcode() == ISD::SRL) {
           // Make a large shift.
-          SDValue SRL = DAG.getNode(X86ISD::VSRLI, dl, MVT::v16i16, R,
-                                    DAG.getConstant(ShiftAmt, MVT::i32));
+          SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl,
+                                                   MVT::v16i16, R, ShiftAmt,
+                                                   DAG);
           SRL = DAG.getNode(ISD::BITCAST, dl, VT, SRL);
           // Zero out the leftmost bits.
           SmallVector<SDValue, 32> V(32,
@@ -11700,14 +12728,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     default:
       llvm_unreachable("Unknown shift opcode!");
     case ISD::SHL:
-      return DAG.getNode(X86ISD::VSHLI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, R, ShiftAmt,
+                                        DAG);
     case ISD::SRL:
-      return DAG.getNode(X86ISD::VSRLI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt,
+                                        DAG);
     case ISD::SRA:
-      return DAG.getNode(X86ISD::VSRAI, dl, VT, R,
-                         DAG.getConstant(ShiftAmt, MVT::i32));
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, ShiftAmt,
+                                        DAG);
     }
   }
 
@@ -11717,7 +12745,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                                         const X86Subtarget* Subtarget) {
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
 
@@ -11725,7 +12753,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
       VT == MVT::v4i32 || VT == MVT::v8i16 ||
       (Subtarget->hasInt256() &&
        ((VT == MVT::v4i64 && Op.getOpcode() != ISD::SRA) ||
-        VT == MVT::v8i32 || VT == MVT::v16i16))) {
+        VT == MVT::v8i32 || VT == MVT::v16i16)) ||
+       (Subtarget->hasAVX512() && (VT == MVT::v8i64 || VT == MVT::v16i32))) {
     SDValue BaseShAmt;
     EVT EltVT = VT.getVectorElementType();
 
@@ -11793,6 +12822,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
         case MVT::v4i64:
         case MVT::v8i32:
         case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
           return getTargetVShiftNode(X86ISD::VSHLI, dl, VT, R, BaseShAmt, DAG);
         }
       case ISD::SRA:
@@ -11802,6 +12833,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
         case MVT::v8i16:
         case MVT::v8i32:
         case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
           return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, R, BaseShAmt, DAG);
         }
       case ISD::SRL:
@@ -11813,6 +12846,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
         case MVT::v4i64:
         case MVT::v8i32:
         case MVT::v16i16:
+        case MVT::v16i32:
+        case MVT::v8i64:
           return getTargetVShiftNode(X86ISD::VSRLI, dl, VT, R, BaseShAmt, DAG);
         }
       }
@@ -11821,7 +12856,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
   if (!Subtarget->is64Bit() &&
-      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64)) &&
+      (VT == MVT::v2i64 || (Subtarget->hasInt256() && VT == MVT::v4i64) ||
+      (Subtarget->hasAVX512() && VT == MVT::v8i64)) &&
       Amt.getOpcode() == ISD::BITCAST &&
       Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
     Amt = Amt.getOperand(0);
@@ -11850,10 +12886,11 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
   return SDValue();
 }
 
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
+                          SelectionDAG &DAG) {
 
   EVT VT = Op.getValueType();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue R = Op.getOperand(0);
   SDValue Amt = Op.getOperand(1);
   SDValue V;
@@ -11869,6 +12906,8 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
   if (V.getNode())
       return V;
 
+  if (Subtarget->hasAVX512() && (VT == MVT::v16i32 || VT == MVT::v8i64))
+    return Op;
   // AVX2 has VPSLLV/VPSRAV/VPSRLV.
   if (Subtarget->hasInt256()) {
     if (Op.getOpcode() == ISD::SRL &&
@@ -11909,8 +12948,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
 
     // r = VSELECT(r, psllw(r & (char16)15, 4), a);
     SDValue M = DAG.getNode(ISD::AND, dl, VT, R, CM1);
-    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
-                            DAG.getConstant(4, MVT::i32), DAG);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 4, DAG);
     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
 
@@ -11921,8 +12959,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
 
     // r = VSELECT(r, psllw(r & (char16)63, 2), a);
     M = DAG.getNode(ISD::AND, dl, VT, R, CM2);
-    M = getTargetVShiftNode(X86ISD::VSHLI, dl, MVT::v8i16, M,
-                            DAG.getConstant(2, MVT::i32), DAG);
+    M = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, MVT::v8i16, M, 2, DAG);
     M = DAG.getNode(ISD::BITCAST, dl, VT, M);
     R = DAG.getNode(ISD::VSELECT, dl, VT, OpVSel, M, R);
 
@@ -11989,7 +13026,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
   SDValue RHS = N->getOperand(1);
   unsigned BaseOp = 0;
   unsigned Cond = 0;
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   switch (Op.getOpcode()) {
   default: llvm_unreachable("Unknown ovf instruction!");
   case ISD::SADDO:
@@ -12056,7 +13093,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
 
 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                   SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
   EVT VT = Op.getValueType();
 
@@ -12065,7 +13102,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
 
   unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                       ExtraVT.getScalarType().getSizeInBits();
-  SDValue ShAmt = DAG.getConstant(BitsDiff, MVT::i32);
 
   switch (VT.getSimpleVT().SimpleTy) {
     default: return SDValue();
@@ -12099,31 +13135,41 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
       // fall through
     case MVT::v4i32:
     case MVT::v8i16: {
-      // (sext (vzext x)) -> (vsext x)
       SDValue Op0 = Op.getOperand(0);
       SDValue Op00 = Op0.getOperand(0);
       SDValue Tmp1;
       // Hopefully, this VECTOR_SHUFFLE is just a VZEXT.
       if (Op0.getOpcode() == ISD::BITCAST &&
-          Op00.getOpcode() == ISD::VECTOR_SHUFFLE)
-        Tmp1 = LowerVectorIntExtend(Op00, DAG);
-      if (Tmp1.getNode()) {
-        SDValue Tmp1Op0 = Tmp1.getOperand(0);
-        assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
-               "This optimization is invalid without a VZEXT.");
-        return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          Op00.getOpcode() == ISD::VECTOR_SHUFFLE) {
+        // (sext (vzext x)) -> (vsext x)
+        Tmp1 = LowerVectorIntExtend(Op00, Subtarget, DAG);
+        if (Tmp1.getNode()) {
+          EVT ExtraEltVT = ExtraVT.getVectorElementType();
+          // This folding is only valid when the in-reg type is a vector of i8,
+          // i16, or i32.
+          if (ExtraEltVT == MVT::i8 || ExtraEltVT == MVT::i16 ||
+              ExtraEltVT == MVT::i32) {
+            SDValue Tmp1Op0 = Tmp1.getOperand(0);
+            assert(Tmp1Op0.getOpcode() == X86ISD::VZEXT &&
+                   "This optimization is invalid without a VZEXT.");
+            return DAG.getNode(X86ISD::VSEXT, dl, VT, Tmp1Op0.getOperand(0));
+          }
+          Op0 = Tmp1;
+        }
       }
 
       // If the above didn't work, then just use Shift-Left + Shift-Right.
-      Tmp1 = getTargetVShiftNode(X86ISD::VSHLI, dl, VT, Op0, ShAmt, DAG);
-      return getTargetVShiftNode(X86ISD::VSRAI, dl, VT, Tmp1, ShAmt, DAG);
+      Tmp1 = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Op0, BitsDiff,
+                                        DAG);
+      return getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Tmp1, BitsDiff,
+                                        DAG);
     }
   }
 }
 
 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
                                  SelectionDAG &DAG) {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
     cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
   SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
@@ -12160,7 +13206,7 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
                              SelectionDAG &DAG) {
   EVT T = Op.getValueType();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   unsigned Reg = 0;
   unsigned size = 0;
   switch(T.getSimpleVT().SimpleTy) {
@@ -12194,7 +13240,7 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
   assert(Subtarget->is64Bit() && "Result not type legalized?");
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue TheChain = Op.getOperand(0);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
   SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
   SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
@@ -12208,9 +13254,10 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
   return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
 }
 
-SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
-  EVT SrcVT = Op.getOperand(0).getValueType();
-  EVT DstVT = Op.getValueType();
+static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
+  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
+  MVT DstVT = Op.getSimpleValueType();
   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   assert((DstVT == MVT::i64 ||
@@ -12230,7 +13277,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
 
 static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
   SDNode *Node = Op.getNode();
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   EVT T = Node->getValueType(0);
   SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
                               DAG.getConstant(0, T), Node->getOperand(2));
@@ -12246,7 +13293,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
 
 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
   SDNode *Node = Op.getNode();
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
 
   // Convert seq_cst store -> xchg
@@ -12289,25 +13336,26 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
   }
 
   if (!ExtraOp)
-    return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                        Op.getOperand(1));
-  return DAG.getNode(Opc, Op->getDebugLoc(), VTs, Op.getOperand(0),
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
                      Op.getOperand(1), Op.getOperand(2));
 }
 
-SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
+                            SelectionDAG &DAG) {
   assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
 
   // For MacOSX, we want to call an alternative entry point: __sincos_stret,
   // which returns the values as { float, float } (in XMM0) or
   // { double, double } (which is returned in XMM0, XMM1).
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Arg = Op.getOperand(0);
   EVT ArgVT = Arg.getValueType();
   Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
 
-  ArgListTy Args;
-  ArgListEntry Entry;
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
 
   Entry.Node = Arg;
   Entry.Ty = ArgTy;
@@ -12320,7 +13368,8 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   // the small struct {f32, f32} is returned in (eax, edx). For f64,
   // the results are returned via SRet in memory.
   const char *LibcallName =  isF64 ? "__sincos_stret" : "__sincosf_stret";
-  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy());
 
   Type *RetTy = isF64
     ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
@@ -12331,7 +13380,7 @@ SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
                          CallingConv::C, /*isTaillCall=*/false,
                          /*doesNotRet=*/false, /*isReturnValueUsed*/true,
                          Callee, Args, DAG, dl);
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
 
   if (isF64)
     // Returned in xmm0 and xmm1.
@@ -12375,9 +13424,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
   case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
-  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, DAG);
-  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, DAG);
-  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, DAG);
+  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
+  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
+  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
   case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
   case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
@@ -12393,7 +13442,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::VAARG:              return LowerVAARG(Op, DAG);
   case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
-  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, DAG);
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
   case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
   case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
   case ISD::FRAME_TO_ARGS_OFFSET:
@@ -12411,7 +13461,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
   case ISD::SRA:
   case ISD::SRL:
-  case ISD::SHL:                return LowerShift(Op, DAG);
+  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
   case ISD::SADDO:
   case ISD::UADDO:
   case ISD::SSUBO:
@@ -12419,7 +13469,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMULO:
   case ISD::UMULO:              return LowerXALUO(Op, DAG);
   case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
-  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
+  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
   case ISD::ADDC:
   case ISD::ADDE:
   case ISD::SUBC:
@@ -12427,14 +13477,14 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADD:                return LowerADD(Op, DAG);
   case ISD::SUB:                return LowerSUB(Op, DAG);
   case ISD::SDIV:               return LowerSDIV(Op, DAG);
-  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
+  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   }
 }
 
 static void ReplaceATOMIC_LOAD(SDNode *Node,
                                   SmallVectorImpl<SDValue> &Results,
                                   SelectionDAG &DAG) {
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
 
   // Convert wide load -> cmpxchg8b/cmpxchg16b
@@ -12455,7 +13505,7 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
 static void
 ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                         SelectionDAG &DAG, unsigned NewOp) {
-  DebugLoc dl = Node->getDebugLoc();
+  SDLoc dl(Node);
   assert (Node->getValueType(0) == MVT::i64 &&
           "Only know how to expand i64 atomics");
 
@@ -12480,7 +13530,7 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue>&Results,
                                            SelectionDAG &DAG) const {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   switch (N->getOpcode()) {
   default:
@@ -12664,6 +13714,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::SHLD:               return "X86ISD::SHLD";
   case X86ISD::SHRD:               return "X86ISD::SHRD";
   case X86ISD::FAND:               return "X86ISD::FAND";
+  case X86ISD::FANDN:              return "X86ISD::FANDN";
   case X86ISD::FOR:                return "X86ISD::FOR";
   case X86ISD::FXOR:               return "X86ISD::FXOR";
   case X86ISD::FSRL:               return "X86ISD::FSRL";
@@ -12680,6 +13731,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMP:                return "X86ISD::CMP";
   case X86ISD::COMI:               return "X86ISD::COMI";
   case X86ISD::UCOMI:              return "X86ISD::UCOMI";
+  case X86ISD::CMPM:               return "X86ISD::CMPM";
+  case X86ISD::CMPMU:              return "X86ISD::CMPMU";
   case X86ISD::SETCC:              return "X86ISD::SETCC";
   case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
   case X86ISD::FSETCCsd:           return "X86ISD::FSETCCsd";
@@ -12739,6 +13792,9 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
   case X86ISD::VZEXT:              return "X86ISD::VZEXT";
   case X86ISD::VSEXT:              return "X86ISD::VSEXT";
+  case X86ISD::VTRUNC:             return "X86ISD::VTRUNC";
+  case X86ISD::VTRUNCM:            return "X86ISD::VTRUNCM";
+  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VFPROUND:           return "X86ISD::VFPROUND";
   case X86ISD::VSHLDQ:             return "X86ISD::VSHLDQ";
@@ -12752,6 +13808,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::CMPP:               return "X86ISD::CMPP";
   case X86ISD::PCMPEQ:             return "X86ISD::PCMPEQ";
   case X86ISD::PCMPGT:             return "X86ISD::PCMPGT";
+  case X86ISD::PCMPEQM:            return "X86ISD::PCMPEQM";
+  case X86ISD::PCMPGTM:            return "X86ISD::PCMPGTM";
   case X86ISD::ADD:                return "X86ISD::ADD";
   case X86ISD::SUB:                return "X86ISD::SUB";
   case X86ISD::ADC:                return "X86ISD::ADC";
@@ -12766,9 +13824,14 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::BLSI:               return "X86ISD::BLSI";
   case X86ISD::BLSMSK:             return "X86ISD::BLSMSK";
   case X86ISD::BLSR:               return "X86ISD::BLSR";
+  case X86ISD::BZHI:               return "X86ISD::BZHI";
+  case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
   case X86ISD::TESTP:              return "X86ISD::TESTP";
+  case X86ISD::TESTM:              return "X86ISD::TESTM";
+  case X86ISD::KORTEST:            return "X86ISD::KORTEST";
+  case X86ISD::KTEST:              return "X86ISD::KTEST";
   case X86ISD::PALIGNR:            return "X86ISD::PALIGNR";
   case X86ISD::PSHUFD:             return "X86ISD::PSHUFD";
   case X86ISD::PSHUFHW:            return "X86ISD::PSHUFHW";
@@ -12787,9 +13850,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UNPCKL:             return "X86ISD::UNPCKL";
   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
+  case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
   case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
+  case X86ISD::VPERMV3:            return "X86ISD::VPERMV3";
   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
@@ -12875,6 +13940,20 @@ bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   return NumBits1 > NumBits2;
 }
 
+bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  if (!isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Assuming the caller doesn't have a zeroext or signext return parameter,
+  // truncation all the way down to i1 is valid.
+  return true;
+}
+
 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   return isInt<32>(Imm);
 }
@@ -12926,6 +14005,27 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return false;
 }
 
+bool
+X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
+    return false;
+
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
   // i16 instructions are longer (0x66 prefix) and potentially slower.
   return !(VT1 == MVT::i32 && VT2 == MVT::i16);
@@ -12938,37 +14038,46 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
 bool
 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                       EVT VT) const {
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+
   // Very little shuffling can be done for 64-bit vectors right now.
   if (VT.getSizeInBits() == 64)
     return false;
 
   // FIXME: pshufb, blends, shifts.
-  return (VT.getVectorNumElements() == 2 ||
+  return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
-          isMOVLMask(M, VT) ||
-          isSHUFPMask(M, VT, Subtarget->hasFp256()) ||
-          isPSHUFDMask(M, VT) ||
-          isPSHUFHWMask(M, VT, Subtarget->hasInt256()) ||
-          isPSHUFLWMask(M, VT, Subtarget->hasInt256()) ||
-          isPALIGNRMask(M, VT, Subtarget) ||
-          isUNPCKLMask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKHMask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKL_v_undef_Mask(M, VT, Subtarget->hasInt256()) ||
-          isUNPCKH_v_undef_Mask(M, VT, Subtarget->hasInt256()));
+          isMOVLMask(M, SVT) ||
+          isSHUFPMask(M, SVT) ||
+          isPSHUFDMask(M, SVT) ||
+          isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
+          isPALIGNRMask(M, SVT, Subtarget) ||
+          isUNPCKLMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKHMask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKL_v_undef_Mask(M, SVT, Subtarget->hasInt256()) ||
+          isUNPCKH_v_undef_Mask(M, SVT, Subtarget->hasInt256()));
 }
 
 bool
 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                           EVT VT) const {
-  unsigned NumElts = VT.getVectorNumElements();
+  if (!VT.isSimple())
+    return false;
+
+  MVT SVT = VT.getSimpleVT();
+  unsigned NumElts = SVT.getVectorNumElements();
   // FIXME: This collection of masks seems suspect.
   if (NumElts == 2)
     return true;
-  if (NumElts == 4 && VT.is128BitVector()) {
-    return (isMOVLMask(Mask, VT)  ||
-            isCommutedMOVLMask(Mask, VT, true) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasFp256()) ||
-            isSHUFPMask(Mask, VT, Subtarget->hasFp256(), /* Commuted */ true));
+  if (NumElts == 4 && SVT.is128BitVector()) {
+    return (isMOVLMask(Mask, SVT)  ||
+            isCommutedMOVLMask(Mask, SVT, true) ||
+            isSHUFPMask(Mask, SVT) ||
+            isSHUFPMask(Mask, SVT, /* Commuted */ true));
   }
   return false;
 }
@@ -14392,12 +15501,11 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
     } else {
       // __chkstk(MSVCRT): does not update stack pointer.
       // Clobbers R10, R11 and EFLAGS.
-      // FIXME: RAX(allocated size) might be reused and not killed.
       BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
         .addExternalSymbol("__chkstk")
         .addReg(X86::RAX, RegState::Implicit)
         .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
-      // RAX has the offset to subtracted from RSP.
+      // RAX has the offset to be subtracted from RSP.
       BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
         .addReg(X86::RSP)
         .addReg(X86::RAX);
@@ -14589,6 +15697,9 @@ X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
   // Setup
   MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
           .addMBB(restoreMBB);
+
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   MIB.addRegMask(RegInfo->getNoPreservedMask());
   thisMBB->addSuccessor(mainMBB);
   thisMBB->addSuccessor(restoreMBB);
@@ -14634,6 +15745,8 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
     (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
   unsigned Tmp = MRI.createVirtualRegister(RC);
   // Since FP is only updated here but NOT referenced, it's treated as GPR.
+  const X86RegisterInfo *RegInfo =
+    static_cast<const X86RegisterInfo*>(getTargetMachine().getRegisterInfo());
   unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
   unsigned SP = RegInfo->getStackRegister();
 
@@ -14706,6 +15819,9 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
   case X86::CMOV_V8F32:
   case X86::CMOV_V4F64:
   case X86::CMOV_V4I64:
+  case X86::CMOV_V16F32:
+  case X86::CMOV_V8F64:
+  case X86::CMOV_V8I64:
   case X86::CMOV_GR16:
   case X86::CMOV_GR32:
   case X86::CMOV_RFP32:
@@ -15034,7 +16150,7 @@ static bool isShuffleLow128VectorInsertHigh(ShuffleVectorSDNode *SVOp) {
 static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                         TargetLowering::DAGCombinerInfo &DCI,
                                         const X86Subtarget* Subtarget) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
   SDValue V1 = SVOp->getOperand(0);
   SDValue V2 = SVOp->getOperand(1);
@@ -15130,7 +16246,7 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
 static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget *Subtarget) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
 
   // Don't create instructions with illegal types after legalize types has run.
@@ -15154,7 +16270,7 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
   for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
     Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
 
-  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
+  return EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true);
 }
 
 /// PerformTruncateCombine - Converts truncate operation to
@@ -15249,7 +16365,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
 
   // All checks match so transform back to vector_shuffle so that DAG combiner
   // can finish the job
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Create shuffle node taking into account the case that its a unary shuffle
   SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(VT) : InVec.getOperand(1);
@@ -15261,6 +16377,44 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                      EltNo);
 }
 
+/// Extract one bit from mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+static SDValue ExtractBitFromMaskVector(SDNode *N, SelectionDAG &DAG) {
+  SDValue Vec = N->getOperand(0);
+  SDLoc dl(Vec);
+  MVT VecVT = Vec.getSimpleValueType();
+  SDValue Idx = N->getOperand(1);
+  MVT EltVT = N->getSimpleValueType(0);
+  
+  assert((VecVT.getVectorElementType() == MVT::i1 && EltVT == MVT::i8) ||
+         "Unexpected operands in ExtractBitFromMaskVector");
+
+  // variable index
+  if (!isa<ConstantSDNode>(Idx)) {
+    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+                              ExtVT.getVectorElementType(), Ext);
+    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+  MVT ScalarVT = MVT::getIntegerVT(VecVT.getSizeInBits());
+  unsigned MaxShift = VecVT.getSizeInBits() - 1;
+  Vec = DAG.getNode(ISD::BITCAST, dl, ScalarVT, Vec);
+  Vec = DAG.getNode(ISD::SHL, dl, ScalarVT, Vec, 
+              DAG.getConstant(MaxShift - IdxVal, ScalarVT));
+  Vec = DAG.getNode(ISD::SRL, dl, ScalarVT, Vec,
+    DAG.getConstant(MaxShift, ScalarVT));
+
+  if (VecVT == MVT::v16i1) {
+    Vec = DAG.getNode(ISD::BITCAST, dl, MVT::i16, Vec);
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Vec);
+  }
+  return DAG.getNode(ISD::BITCAST, dl, MVT::i8, Vec);
+}
+
 /// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
 /// generation and convert it from being a bunch of shuffles and extracts
 /// to a simple store and scalar loads to extract the elements.
@@ -15271,12 +16425,17 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     return NewOp;
 
   SDValue InputVector = N->getOperand(0);
+
+  if (InputVector.getValueType().getVectorElementType() == MVT::i1 &&
+      !DCI.isBeforeLegalize())
+    return ExtractBitFromMaskVector(N, DAG);
+
   // Detect whether we are trying to convert from mmx to i32 and the bitcast
   // from mmx to v2i32 has a single usage.
   if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
       InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
       InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
-    return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+    return DAG.getNode(X86ISD::MMX_MOVD2W, SDLoc(InputVector),
                        N->getValueType(0),
                        InputVector.getNode()->getOperand(0));
 
@@ -15321,7 +16480,7 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // Ok, we've now decided to do the transformation.
-  DebugLoc dl = InputVector.getDebugLoc();
+  SDLoc dl(InputVector);
 
   // Store the value to a temporary stack slot.
   SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
@@ -15358,24 +16517,28 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
 }
 
 /// \brief Matches a VSELECT onto min/max or return 0 if the node doesn't match.
-static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
-                                   SDValue RHS, SelectionDAG &DAG,
-                                   const X86Subtarget *Subtarget) {
+static std::pair<unsigned, bool>
+matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
+                   SelectionDAG &DAG, const X86Subtarget *Subtarget) {
   if (!VT.isVector())
-    return 0;
+    return std::make_pair(0, false);
 
+  bool NeedSplit = false;
   switch (VT.getSimpleVT().SimpleTy) {
-  default: return 0;
+  default: return std::make_pair(0, false);
   case MVT::v32i8:
   case MVT::v16i16:
   case MVT::v8i32:
     if (!Subtarget->hasAVX2())
-      return 0;
+      NeedSplit = true;
+    if (!Subtarget->hasAVX())
+      return std::make_pair(0, false);
+    break;
   case MVT::v16i8:
   case MVT::v8i16:
   case MVT::v4i32:
     if (!Subtarget->hasSSE2())
-      return 0;
+      return std::make_pair(0, false);
   }
 
   // SSE2 has only a small subset of the operations.
@@ -15386,6 +16549,7 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
 
   ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
 
+  unsigned Opc = 0;
   // Check for x CC y ? x : y.
   if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
       DAG.isEqualTo(RHS, Cond.getOperand(1))) {
@@ -15393,16 +16557,16 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
     default: break;
     case ISD::SETULT:
     case ISD::SETULE:
-      return hasUnsigned ? X86ISD::UMIN : 0;
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
     case ISD::SETUGT:
     case ISD::SETUGE:
-      return hasUnsigned ? X86ISD::UMAX : 0;
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
     case ISD::SETLT:
     case ISD::SETLE:
-      return hasSigned ? X86ISD::SMIN : 0;
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
     case ISD::SETGT:
     case ISD::SETGE:
-      return hasSigned ? X86ISD::SMAX : 0;
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
     }
   // Check for x CC y ? y : x -- a min/max with reversed arms.
   } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
@@ -15411,20 +16575,20 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
     default: break;
     case ISD::SETULT:
     case ISD::SETULE:
-      return hasUnsigned ? X86ISD::UMAX : 0;
+      Opc = hasUnsigned ? X86ISD::UMAX : 0; break;
     case ISD::SETUGT:
     case ISD::SETUGE:
-      return hasUnsigned ? X86ISD::UMIN : 0;
+      Opc = hasUnsigned ? X86ISD::UMIN : 0; break;
     case ISD::SETLT:
     case ISD::SETLE:
-      return hasSigned ? X86ISD::SMAX : 0;
+      Opc = hasSigned ? X86ISD::SMAX : 0; break;
     case ISD::SETGT:
     case ISD::SETGE:
-      return hasSigned ? X86ISD::SMIN : 0;
+      Opc = hasSigned ? X86ISD::SMIN : 0; break;
     }
   }
 
-  return 0;
+  return std::make_pair(Opc, NeedSplit);
 }
 
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
@@ -15432,19 +16596,20 @@ static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget *Subtarget) {
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   SDValue Cond = N->getOperand(0);
   // Get the LHS/RHS of the select.
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
   EVT VT = LHS.getValueType();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   // If we have SSE[12] support, try to form min/max nodes. SSE min/max
   // instructions match the semantics of the common C idiom x<y?x:y but not
   // x<=y?x:y, because of how they handle negative zero (which can be
   // ignored in unsafe-math mode).
   if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
-      VT != MVT::f80 && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+      VT != MVT::f80 && TLI.isTypeLegal(VT) &&
       (Subtarget->hasSSE2() ||
        (Subtarget->hasSSE1() && VT.getScalarType() == MVT::f32))) {
     ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
@@ -15583,6 +16748,22 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   }
 
+  EVT CondVT = Cond.getValueType();
+  if (Subtarget->hasAVX512() && VT.isVector() && CondVT.isVector() &&
+      CondVT.getVectorElementType() == MVT::i1) {
+    // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
+    // lowering on AVX-512. In this case we convert it to
+    // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
+    // The same situation for all 128 and 256-bit vectors of i8 and i16
+    EVT OpVT = LHS.getValueType();
+    if ((OpVT.is128BitVector() || OpVT.is256BitVector()) &&
+        (OpVT.getVectorElementType() == MVT::i8 ||
+         OpVT.getVectorElementType() == MVT::i16)) {
+      Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, OpVT, Cond);
+      DCI.AddToWorklist(Cond.getNode());
+      return DAG.getNode(N->getOpcode(), DL, OpVT, Cond, LHS, RHS);
+    }
+  }
   // If this is a select between two integer constants, try to do some
   // optimizations.
   if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
@@ -15700,16 +16881,19 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     case ISD::SETLT:
     case ISD::SETGT: {
       ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
-      Cond = DAG.getSetCC(Cond.getDebugLoc(), Cond.getValueType(),
+      Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                           Cond.getOperand(0), Cond.getOperand(1), NewCC);
       return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
     }
     }
   }
 
+  // Early exit check
+  if (!TLI.isTypeLegal(VT))
+    return SDValue();
+
   // Match VSELECTs into subs with unsigned saturation.
-  if (!DCI.isBeforeLegalize() &&
-      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
       ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
        (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
@@ -15763,14 +16947,35 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   }
 
   // Try to match a min/max vector operation.
-  if (!DCI.isBeforeLegalize() &&
-      N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
-    if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
-      return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC) {
+    std::pair<unsigned, bool> ret = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget);
+    unsigned Opc = ret.first;
+    bool NeedSplit = ret.second;
+
+    if (Opc && NeedSplit) {
+      unsigned NumElems = VT.getVectorNumElements();
+      // Extract the LHS vectors
+      SDValue LHS1 = Extract128BitVector(LHS, 0, DAG, DL);
+      SDValue LHS2 = Extract128BitVector(LHS, NumElems/2, DAG, DL);
+
+      // Extract the RHS vectors
+      SDValue RHS1 = Extract128BitVector(RHS, 0, DAG, DL);
+      SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, DL);
+
+      // Create min/max for each subvector
+      LHS = DAG.getNode(Opc, DL, LHS1.getValueType(), LHS1, RHS1);
+      RHS = DAG.getNode(Opc, DL, LHS2.getValueType(), LHS2, RHS2);
+
+      // Merge the result
+      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS, RHS);
+    } else if (Opc)
+      return DAG.getNode(Opc, DL, VT, LHS, RHS);
+  }
 
   // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
-  if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
-      Cond.getOpcode() == ISD::SETCC) {
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      // Check if SETCC has already been promoted
+      TLI.getSetCCResultType(*DAG.getContext(), VT) == Cond.getValueType()) {
 
     assert(Cond.getValueType().isVector() &&
            "vector select expects a vector selector!");
@@ -15817,7 +17022,6 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   // to simplify previous instructions.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
       !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
@@ -15826,6 +17030,15 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     if (BitWidth == 1)
       return SDValue();
 
+    // Check all uses of that condition operand to check whether it will be
+    // consumed by non-BLEND instructions, which may depend on all bits are set
+    // properly.
+    for (SDNode::use_iterator I = Cond->use_begin(),
+                              E = Cond->use_end(); I != E; ++I)
+      if (I->getOpcode() != ISD::VSELECT)
+        // TODO: Add other opcodes eventually lowered into BLEND.
+        return SDValue();
+
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
     APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
 
@@ -15976,7 +17189,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
 static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   const X86Subtarget *Subtarget) {
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // If the flag operand isn't dead, don't touch this CMOV.
   if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
@@ -16179,7 +17392,7 @@ static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
   }
   if (MulAmt2 &&
       (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
-    DebugLoc DL = N->getDebugLoc();
+    SDLoc DL(N);
 
     if (isPowerOf2_64(MulAmt2) &&
         !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
@@ -16229,7 +17442,7 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
       APInt ShAmt = N1C->getAPIntValue();
       Mask = Mask.shl(ShAmt);
       if (Mask != 0)
-        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+        return DAG.getNode(ISD::AND, SDLoc(N), VT,
                            N00, DAG.getConstant(Mask, VT));
     }
   }
@@ -16245,7 +17458,39 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
     // hardware support for this operation. This is better expressed as an ADD
     // of two values.
     if (N1C && (1 == N1C->getZExtValue())) {
-      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, N0, N0);
+      return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
+    }
+  }
+
+  return SDValue();
+}
+
+/// \brief Returns a vector of 0s if the node in input is a vector logical
+/// shift by a constant amount which is known to be bigger than or equal 
+/// to the vector element size in bits.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+
+  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+      (!Subtarget->hasInt256() ||
+       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
+    return SDValue();
+
+  SDValue Amt = N->getOperand(1);
+  SDLoc DL(N);
+  if (isSplatVector(Amt.getNode())) {
+    SDValue SclrAmt = Amt->getOperand(0);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+      APInt ShiftAmt = C->getAPIntValue();
+      unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
+
+      // SSE2/AVX2 logical shifts always return a vector of 0s
+      // if the shift amount is bigger than or equal to 
+      // the element size. The constant shift amount will be
+      // encoded as a 8-bit immediate.
+      if (ShiftAmt.trunc(8).uge(MaxAmount))
+        return getZeroVector(VT, Subtarget, DAG, DL);
     }
   }
 
@@ -16261,6 +17506,12 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
     if (V.getNode()) return V;
   }
 
+  if (N->getOpcode() != ISD::SRA) {
+    // Try to fold this logical shift into a zero vector.
+    SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
+    if (V.getNode()) return V;
+  }
+
   return SDValue();
 }
 
@@ -16279,7 +17530,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
     SDValue N1 = N->getOperand(1);
     SDValue CMP0 = N0->getOperand(1);
     SDValue CMP1 = N1->getOperand(1);
-    DebugLoc DL = N->getDebugLoc();
+    SDLoc DL(N);
 
     // The SETCCs should both refer to the same CMP.
     if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
@@ -16398,7 +17649,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
 
   SDValue N0  = Narrow->getOperand(0);
   SDValue N1  = Narrow->getOperand(1);
-  DebugLoc DL = Narrow->getDebugLoc();
+  SDLoc DL(Narrow);
 
   // The Left side has to be a trunc.
   if (N0.getOpcode() != ISD::TRUNCATE)
@@ -16464,33 +17715,80 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   if (R.getNode())
     return R;
 
-  // Create BLSI, and BLSR instructions
+  // Create BLSI, BLSR, and BZHI instructions
   // BLSI is X & (-X)
   // BLSR is X & (X-1)
-  if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
+  // BZHI is X & ((1 << Y) - 1)
+  // BEXTR is ((X >> imm) & (2**size-1))
+  if (VT == MVT::i32 || VT == MVT::i64) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
-    DebugLoc DL = N->getDebugLoc();
-
-    // Check LHS for neg
-    if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
-        isZero(N0.getOperand(0)))
-      return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
-
-    // Check RHS for neg
-    if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
-        isZero(N1.getOperand(0)))
-      return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+    SDLoc DL(N);
+
+    if (Subtarget->hasBMI()) {
+      // Check LHS for neg
+      if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
+          isZero(N0.getOperand(0)))
+        return DAG.getNode(X86ISD::BLSI, DL, VT, N1);
+
+      // Check RHS for neg
+      if (N1.getOpcode() == ISD::SUB && N1.getOperand(1) == N0 &&
+          isZero(N1.getOperand(0)))
+        return DAG.getNode(X86ISD::BLSI, DL, VT, N0);
+
+      // Check LHS for X-1
+      if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
+          isAllOnes(N0.getOperand(1)))
+        return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+
+      // Check RHS for X-1
+      if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
+          isAllOnes(N1.getOperand(1)))
+        return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+    }
+
+    if (Subtarget->hasBMI2()) {
+      // Check for (and (add (shl 1, Y), -1), X)
+      if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::SHL) {
+          SDValue N001 = N00.getOperand(1);
+          assert(N001.getValueType() == MVT::i8 && "unexpected type");
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
+          if (C && C->getZExtValue() == 1)
+            return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
+        }
+      }
 
-    // Check LHS for X-1
-    if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
-        isAllOnes(N0.getOperand(1)))
-      return DAG.getNode(X86ISD::BLSR, DL, VT, N1);
+      // Check for (and X, (add (shl 1, Y), -1))
+      if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
+        SDValue N10 = N1.getOperand(0);
+        if (N10.getOpcode() == ISD::SHL) {
+          SDValue N101 = N10.getOperand(1);
+          assert(N101.getValueType() == MVT::i8 && "unexpected type");
+          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
+          if (C && C->getZExtValue() == 1)
+            return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
+        }
+      }
+    }
 
-    // Check RHS for X-1
-    if (N1.getOpcode() == ISD::ADD && N1.getOperand(0) == N0 &&
-        isAllOnes(N1.getOperand(1)))
-      return DAG.getNode(X86ISD::BLSR, DL, VT, N0);
+    // Check for BEXTR.
+    if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
+        (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
+      ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
+      ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+      if (MaskNode && ShiftNode) {
+        uint64_t Mask = MaskNode->getZExtValue();
+        uint64_t Shift = ShiftNode->getZExtValue();
+        if (isMask_64(Mask)) {
+          uint64_t MaskSize = CountPopulation_64(Mask);
+          if (Shift + MaskSize <= VT.getSizeInBits())
+            return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
+                               DAG.getConstant(Shift | (MaskSize << 8), VT));
+        }
+      }
+    } // BEXTR
 
     return SDValue();
   }
@@ -16504,7 +17802,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Check LHS for vnot
   if (N0.getOpcode() == ISD::XOR &&
@@ -16588,7 +17886,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
       if ((SraAmt + 1) != EltBits)
         return SDValue();
 
-      DebugLoc DL = N->getDebugLoc();
+      SDLoc DL(N);
 
       // Now we know we at least have a plendvb with the mask val.  See if
       // we can form a psignb/w/d.
@@ -16637,7 +17935,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
   if (ShAmt1.getOpcode() == ISD::TRUNCATE)
     ShAmt1 = ShAmt1.getOperand(0);
 
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   unsigned Opc = X86ISD::SHLD;
   SDValue Op0 = N0.getOperand(0);
   SDValue Op1 = N1.getOperand(0);
@@ -16684,7 +17982,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
   // and change it to SUB and CMOV.
@@ -16734,7 +18032,7 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
   // Create BLSMSK instructions by finding X ^ (X-1)
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1 &&
       isAllOnes(N0.getOperand(1)))
@@ -16754,7 +18052,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
   LoadSDNode *Ld = cast<LoadSDNode>(N);
   EVT RegVT = Ld->getValueType(0);
   EVT MemVT = Ld->getMemoryVT();
-  DebugLoc dl = Ld->getDebugLoc();
+  SDLoc dl(Ld);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   unsigned RegSz = RegVT.getSizeInBits();
 
@@ -16949,7 +18247,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
   StoreSDNode *St = cast<StoreSDNode>(N);
   EVT VT = St->getValue().getValueType();
   EVT StVT = St->getMemoryVT();
-  DebugLoc dl = St->getDebugLoc();
+  SDLoc dl(St);
   SDValue StoredVal = St->getOperand(1);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
@@ -17112,8 +18410,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
       return SDValue();
 
-    DebugLoc LdDL = Ld->getDebugLoc();
-    DebugLoc StDL = N->getDebugLoc();
+    SDLoc LdDL(Ld);
+    SDLoc StDL(N);
     // If we are a 64-bit capable x86, lower to a single movq load/store pair.
     // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
     // pair instead.
@@ -17206,7 +18504,7 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
       RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
     return false;
 
-  EVT VT = LHS.getValueType();
+  MVT VT = LHS.getSimpleValueType();
 
   assert((VT.is128BitVector() || VT.is256BitVector()) &&
          "Unsupported vector type for horizontal add/sub");
@@ -17277,23 +18575,24 @@ static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
   //   LHS = VECTOR_SHUFFLE A, B, LMask
   //   RHS = VECTOR_SHUFFLE A, B, RMask
   // Check that the masks correspond to performing a horizontal operation.
-  for (unsigned i = 0; i != NumElts; ++i) {
-    int LIdx = LMask[i], RIdx = RMask[i];
-
-    // Ignore any UNDEF components.
-    if (LIdx < 0 || RIdx < 0 ||
-        (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
-        (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
-      continue;
+  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+    for (unsigned i = 0; i != NumLaneElts; ++i) {
+      int LIdx = LMask[i+l], RIdx = RMask[i+l];
+
+      // Ignore any UNDEF components.
+      if (LIdx < 0 || RIdx < 0 ||
+          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
+          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
+        continue;
 
-    // Check that successive elements are being operated on.  If not, this is
-    // not a horizontal operation.
-    unsigned Src = (i/HalfLaneElts) % 2; // each lane is split between srcs
-    unsigned LaneStart = (i/NumLaneElts) * NumLaneElts;
-    int Index = 2*(i%HalfLaneElts) + NumElts*Src + LaneStart;
-    if (!(LIdx == Index && RIdx == Index + 1) &&
-        !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
-      return false;
+      // Check that successive elements are being operated on.  If not, this is
+      // not a horizontal operation.
+      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
+      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
+      if (!(LIdx == Index && RIdx == Index + 1) &&
+          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
+        return false;
+    }
   }
 
   LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
@@ -17312,7 +18611,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
       isHorizontalBinOp(LHS, RHS, true))
-    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
+    return DAG.getNode(X86ISD::FHADD, SDLoc(N), VT, LHS, RHS);
   return SDValue();
 }
 
@@ -17327,7 +18626,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
   if (((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
        (Subtarget->hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
       isHorizontalBinOp(LHS, RHS, false))
-    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
+    return DAG.getNode(X86ISD::FHSUB, SDLoc(N), VT, LHS, RHS);
   return SDValue();
 }
 
@@ -17364,7 +18663,7 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
     case X86ISD::FMAX:  NewOp = X86ISD::FMAXC; break;
   }
 
-  return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
                      N->getOperand(0), N->getOperand(1));
 }
 
@@ -17381,6 +18680,19 @@ static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
+/// PerformFANDNCombine - Do target-specific dag combines on X86ISD::FANDN nodes
+static SDValue PerformFANDNCombine(SDNode *N, SelectionDAG &DAG) {
+  // FANDN(x, 0.0) -> 0.0
+  // FANDN(0.0, x) -> x
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  return SDValue();
+}
+
 static SDValue PerformBTCombine(SDNode *N,
                                 SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
@@ -17408,12 +18720,12 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
   if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
       VT.getVectorElementType().getSizeInBits() ==
       OpVT.getVectorElementType().getSizeInBits()) {
-    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
+    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
   }
   return SDValue();
 }
 
-static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG, 
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
                                                const X86Subtarget *Subtarget) {
   EVT VT = N->getValueType(0);
   if (!VT.isVector())
@@ -17422,7 +18734,7 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
   // both SSE and AVX2 since there is no sign-extended shift right
@@ -17433,14 +18745,14 @@ static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
       N0.getOpcode() == ISD::SIGN_EXTEND)) {
     SDValue N00 = N0.getOperand(0);
 
-    // EXTLOAD has a better solution on AVX2, 
+    // EXTLOAD has a better solution on AVX2,
     // it may be replaced with X86ISD::VSEXT node.
     if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
       if (!ISD::isNormalLoad(N00.getNode()))
         return SDValue();
 
     if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
-        SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, 
+        SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
                                   N00, N1);
       return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
     }
@@ -17469,7 +18781,7 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
 
 static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
                                  const X86Subtarget* Subtarget) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
 
   // Let legalize expand this if it isn't a legal type yet.
@@ -17514,7 +18826,7 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
   //           (and (i32 x86isd::setcc_carry), 1)
   // This eliminates the zext. This transformation is necessary because
   // ISD::SETCC is always legalized to i8.
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
@@ -17552,17 +18864,17 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(LHS.getOperand(0)))
       if (C->getAPIntValue() == 0 && LHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
                                    LHS.getValueType(), RHS, LHS.getOperand(1));
-        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
                             addV, DAG.getConstant(0, addV.getValueType()), CC);
       }
   if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS.getOperand(0)))
       if (C->getAPIntValue() == 0 && RHS.hasOneUse()) {
-        SDValue addV = DAG.getNode(ISD::ADD, N->getDebugLoc(),
+        SDValue addV = DAG.getNode(ISD::ADD, SDLoc(N),
                                    RHS.getValueType(), LHS, RHS.getOperand(1));
-        return DAG.getSetCC(N->getDebugLoc(), N->getValueType(0),
+        return DAG.getSetCC(SDLoc(N), N->getValueType(0),
                             addV, DAG.getConstant(0, addV.getValueType()), CC);
       }
   return SDValue();
@@ -17571,7 +18883,7 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+static SDValue MaterializeSETB(SDLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
   return DAG.getNode(ISD::AND, DL, MVT::i8,
                      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                  DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
@@ -17582,7 +18894,7 @@ static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
 static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget *Subtarget) {
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   SDValue EFLAGS = N->getOperand(1);
 
@@ -17596,7 +18908,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
     if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
         EFLAGS.getValueType().isInteger() &&
         !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
-      SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                    EFLAGS.getNode()->getVTList(),
                                    EFLAGS.getOperand(1), EFLAGS.getOperand(0));
       SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
@@ -17626,7 +18938,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI,
                                     const X86Subtarget *Subtarget) {
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
   SDValue Chain = N->getOperand(0);
   SDValue Dest = N->getOperand(1);
   SDValue EFLAGS = N->getOperand(3);
@@ -17651,7 +18963,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
 
   // SINT_TO_FP(v4i8) -> SINT_TO_FP(SEXT(v4i8 to v4i32))
   if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
-    DebugLoc dl = N->getDebugLoc();
+    SDLoc dl(N);
     MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
     SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
     return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
@@ -17665,7 +18977,7 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
     if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
         ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
         !XTLI->getSubtarget()->is64Bit() &&
-        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+        VT == MVT::i64) {
       SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
                                           Ld->getChain(), Op0, DAG);
       DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
@@ -17686,7 +18998,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
       // We don't have a good way to replace an EFLAGS use, so only do this when
       // dead right now.
       SDValue(N, 1).use_empty()) {
-    DebugLoc DL = N->getDebugLoc();
+    SDLoc DL(N);
     EVT VT = N->getValueType(0);
     SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
     SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
@@ -17705,7 +19017,7 @@ static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
 //      (sub (sete  X, 0), Y) -> sbb  0, Y
 //      (sub (setne X, 0), Y) -> adc -1, Y
 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
-  DebugLoc DL = N->getDebugLoc();
+  SDLoc DL(N);
 
   // Look through ZExts.
   SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
@@ -17751,7 +19063,7 @@ static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
       isHorizontalBinOp(Op0, Op1, true))
-    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
 
   return OptimizeConditionalInDecrement(N, DAG);
 }
@@ -17771,10 +19083,10 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
         isa<ConstantSDNode>(Op1.getOperand(1))) {
       APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
       EVT VT = Op0.getValueType();
-      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
+      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                    Op1.getOperand(0),
                                    DAG.getConstant(~XorC, VT));
-      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
+      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                          DAG.getConstant(C->getAPIntValue()+1, VT));
     }
   }
@@ -17784,7 +19096,7 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
   if (((Subtarget->hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
        (Subtarget->hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
       isHorizontalBinOp(Op0, Op1, true))
-    return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1);
+    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
 
   return OptimizeConditionalInDecrement(N, DAG);
 }
@@ -17801,7 +19113,7 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
   if (In.getOpcode() != X86ISD::VZEXT)
     return SDValue();
 
-  return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+  return DAG.getNode(X86ISD::VZEXT, SDLoc(N), N->getValueType(0),
                      In.getOperand(0));
 }
 
@@ -17835,6 +19147,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::FMIN:
   case X86ISD::FMAX:        return PerformFMinFMaxCombine(N, DAG);
   case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
+  case X86ISD::FANDN:       return PerformFANDNCombine(N, DAG);
   case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
   case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
   case ISD::ANY_EXTEND:
@@ -17990,6 +19303,22 @@ namespace {
   const VariadicFunction1<bool, StringRef, StringRef, matchAsmImpl> matchAsm={};
 }
 
+static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
+
+  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
+    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
+        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
+
+      if (AsmPieces.size() == 3)
+        return true;
+      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
+        return true;
+    }
+  }
+  return false;
+}
+
 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
   InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
 
@@ -18031,12 +19360,8 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
       const std::string &ConstraintsStr = IA->getConstraintString();
       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (AsmPieces.size() == 4 &&
-          AsmPieces[0] == "~{cc}" &&
-          AsmPieces[1] == "~{dirflag}" &&
-          AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}")
-      return IntrinsicLowering::LowerToByteSwap(CI);
+      if (clobbersFlagRegisters(AsmPieces))
+        return IntrinsicLowering::LowerToByteSwap(CI);
     }
     break;
   case 3:
@@ -18049,11 +19374,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
       const std::string &ConstraintsStr = IA->getConstraintString();
       SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
       array_pod_sort(AsmPieces.begin(), AsmPieces.end());
-      if (AsmPieces.size() == 4 &&
-          AsmPieces[0] == "~{cc}" &&
-          AsmPieces[1] == "~{dirflag}" &&
-          AsmPieces[2] == "~{flags}" &&
-          AsmPieces[3] == "~{fpsr}")
+      if (clobbersFlagRegisters(AsmPieces))
         return IntrinsicLowering::LowerToByteSwap(CI);
     }
 
@@ -18361,7 +19682,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                         getTargetMachine())))
       return;
 
-    Result = DAG.getTargetGlobalAddress(GV, Op.getDebugLoc(),
+    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                         GA->getValueType(0), Offset);
     break;
   }
@@ -18376,7 +19697,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 
 std::pair<unsigned, const TargetRegisterClass*>
 X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                EVT VT) const {
+                                                MVT VT) const {
   // First, see if this is a constraint that directly corresponds to an LLVM
   // register class.
   if (Constraint.size() == 1) {
@@ -18443,7 +19764,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
     case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
       if (!Subtarget->hasSSE1()) break;
 
-      switch (VT.getSimpleVT().SimpleTy) {
+      switch (VT.SimpleTy) {
       default: break;
       // Scalar SSE types.
       case MVT::f32:
@@ -18468,6 +19789,11 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       case MVT::v8f32:
       case MVT::v4f64:
         return std::make_pair(0U, &X86::VR256RegClass);
+      case MVT::v8f64:
+      case MVT::v16f32:
+      case MVT::v16i32:
+      case MVT::v8i64:
+        return std::make_pair(0U, &X86::VR512RegClass);
       }
       break;
     }
@@ -18578,7 +19904,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
     }
   } else if (Res.second == &X86::FR32RegClass ||
              Res.second == &X86::FR64RegClass ||
-             Res.second == &X86::VR128RegClass) {
+             Res.second == &X86::VR128RegClass ||
+             Res.second == &X86::VR256RegClass ||
+             Res.second == &X86::FR32XRegClass ||
+             Res.second == &X86::FR64XRegClass ||
+             Res.second == &X86::VR128XRegClass ||
+             Res.second == &X86::VR256XRegClass ||
+             Res.second == &X86::VR512RegClass) {
     // Handle references to XMM physical registers that got mapped into the
     // wrong class.  This can happen with constraints like {xmm0} where the
     // target independent register mapper will just pick the first match it can
@@ -18592,6 +19924,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
       Res.second = &X86::VR128RegClass;
     else if (X86::VR256RegClass.hasType(VT))
       Res.second = &X86::VR256RegClass;
+    else if (X86::VR512RegClass.hasType(VT))
+      Res.second = &X86::VR512RegClass;
   }
 
   return Res;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 2727e22..bc3dd60 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -53,6 +53,10 @@ namespace llvm {
       /// to X86::XORPS or X86::XORPD.
       FXOR,
 
+      /// FANDN - Bitwise logical ANDNOT of floating point values. This
+      /// corresponds to X86::ANDNPS or X86::ANDNPD.
+      FANDN,
+
       /// FSRL - Bitwise logical right shift of floating point values. These
       /// corresponds to X86::PSRLDQ.
       FSRL,
@@ -250,6 +254,12 @@ namespace llvm {
       // VSEXT - Vector integer signed-extend.
       VSEXT,
 
+      // VTRUNC - Vector integer truncate.
+      VTRUNC,
+
+      // VTRUNC - Vector integer truncate with mask.
+      VTRUNCM,
+
       // VFPEXT - Vector FP extend.
       VFPEXT,
 
@@ -270,6 +280,13 @@ namespace llvm {
 
       // PCMP* - Vector integer comparisons.
       PCMPEQ, PCMPGT,
+      // PCMP*M - Vector integer comparisons, the result is in a mask vector.
+      PCMPEQM, PCMPGTM,
+
+      /// CMPM, CMPMU - Vector comparison generating mask bits for fp and
+      /// integer signed and unsigned data types.
+      CMPM,
+      CMPMU,
 
       // ADD, SUB, SMUL, etc. - Arithmetic operations with FLAGS results.
       ADD, SUB, ADC, SBB, SMUL,
@@ -278,18 +295,27 @@ namespace llvm {
       BLSI,   // BLSI - Extract lowest set isolated bit
       BLSMSK, // BLSMSK - Get mask up to lowest set bit
       BLSR,   // BLSR - Reset lowest set bit
+      BZHI,   // BZHI - Zero high bits
+      BEXTR,  // BEXTR - Bit field extract
 
       UMUL, // LOW, HI, FLAGS = umul LHS, RHS
 
       // MUL_IMM - X86 specific multiply by immediate.
       MUL_IMM,
 
-      // PTEST - Vector bitwise comparisons
+      // PTEST - Vector bitwise comparisons.
       PTEST,
 
-      // TESTP - Vector packed fp sign bitwise comparisons
+      // TESTP - Vector packed fp sign bitwise comparisons.
       TESTP,
 
+      // TESTM - Vector "test" in AVX-512, the result is in a mask vector.
+      TESTM,
+
+      // OR/AND test for masks
+      KORTEST,
+      KTEST,
+
       // Several flavors of instructions with vector shuffle behaviors.
       PALIGNR,
       PSHUFD,
@@ -310,9 +336,13 @@ namespace llvm {
       UNPCKH,
       VPERMILP,
       VPERMV,
+      VPERMV3,
       VPERMI,
       VPERM2X128,
       VBROADCAST,
+      // masked broadcast
+      VBROADCASTM,
+      VINSERT,
 
       // PMULUDQ - Vector multiply packed unsigned doubleword integers
       PMULUDQ,
@@ -434,25 +464,45 @@ namespace llvm {
 
   /// Define some predicates that are used for node matching.
   namespace X86 {
-    /// isVEXTRACTF128Index - Return true if the specified
+    /// isVEXTRACT128Index - Return true if the specified
+    /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
+    /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
+    bool isVEXTRACT128Index(SDNode *N);
+
+    /// isVINSERT128Index - Return true if the specified
+    /// INSERT_SUBVECTOR operand specifies a subvector insert that is
+    /// suitable for input to VINSERTF128, VINSERTI128 instructions.
+    bool isVINSERT128Index(SDNode *N);
+
+    /// isVEXTRACT256Index - Return true if the specified
     /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
-    /// suitable for input to VEXTRACTF128.
-    bool isVEXTRACTF128Index(SDNode *N);
+    /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
+    bool isVEXTRACT256Index(SDNode *N);
 
-    /// isVINSERTF128Index - Return true if the specified
+    /// isVINSERT256Index - Return true if the specified
     /// INSERT_SUBVECTOR operand specifies a subvector insert that is
-    /// suitable for input to VINSERTF128.
-    bool isVINSERTF128Index(SDNode *N);
+    /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
+    bool isVINSERT256Index(SDNode *N);
 
-    /// getExtractVEXTRACTF128Immediate - Return the appropriate
+    /// getExtractVEXTRACT128Immediate - Return the appropriate
     /// immediate to extract the specified EXTRACT_SUBVECTOR index
-    /// with VEXTRACTF128 instructions.
-    unsigned getExtractVEXTRACTF128Immediate(SDNode *N);
+    /// with VEXTRACTF128, VEXTRACTI128 instructions.
+    unsigned getExtractVEXTRACT128Immediate(SDNode *N);
 
-    /// getInsertVINSERTF128Immediate - Return the appropriate
+    /// getInsertVINSERT128Immediate - Return the appropriate
     /// immediate to insert at the specified INSERT_SUBVECTOR index
-    /// with VINSERTF128 instructions.
-    unsigned getInsertVINSERTF128Immediate(SDNode *N);
+    /// with VINSERTF128, VINSERT128 instructions.
+    unsigned getInsertVINSERT128Immediate(SDNode *N);
+
+    /// getExtractVEXTRACT256Immediate - Return the appropriate
+    /// immediate to extract the specified EXTRACT_SUBVECTOR index
+    /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
+    unsigned getExtractVEXTRACT256Immediate(SDNode *N);
+
+    /// getInsertVINSERT256Immediate - Return the appropriate
+    /// immediate to insert at the specified INSERT_SUBVECTOR index
+    /// with VINSERTF64x4, VINSERTI64x4 instructions.
+    unsigned getInsertVINSERT256Immediate(SDNode *N);
 
     /// isZeroNode - Returns true if Elt is a constant zero or a floating point
     /// constant +0.0.
@@ -511,7 +561,7 @@ namespace llvm {
     /// It returns EVT::Other if the type should be determined using generic
     /// target-independent logic.
     virtual EVT
-    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, 
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                         bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                         MachineFunction &MF) const;
 
@@ -563,7 +613,7 @@ namespace llvm {
     virtual const char *getTargetNodeName(unsigned Opcode) const;
 
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
-    virtual EVT getSetCCResultType(EVT VT) const;
+    virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
 
     /// computeMaskedBitsForTargetNode - Determine which of the bits specified
     /// in Mask are known to be either zero or one and return them in the
@@ -610,7 +660,7 @@ namespace llvm {
     /// error, this returns a register number of 0.
     std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   EVT VT) const;
+                                   MVT VT) const;
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
@@ -634,6 +684,8 @@ namespace llvm {
     virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
     virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
 
+    virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+
     /// isZExtFree - Return true if any actual instruction that defines a
     /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
     /// register. This does not necessarily include registers defined in
@@ -646,11 +698,11 @@ namespace llvm {
     virtual bool isZExtFree(EVT VT1, EVT VT2) const;
     virtual bool isZExtFree(SDValue Val, EVT VT2) const;
 
-    /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
-    /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
-    /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
-    /// is expanded to mul + add.
-    virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
 
     /// isNarrowingProfitable - Return true if it's profitable to narrow
     /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
@@ -723,6 +775,8 @@ namespace llvm {
     SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
                       SelectionDAG &DAG) const;
 
+    virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const LLVM_OVERRIDE;
+
     /// \brief Reset the operation actions based on target options.
     virtual void resetOperationActions();
 
@@ -734,7 +788,6 @@ namespace llvm {
     /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
     /// make the right decision when generating code for different targets.
     const X86Subtarget *Subtarget;
-    const X86RegisterInfo *RegInfo;
     const DataLayout *TD;
 
     /// Used to store the TargetOptions so that we don't waste time resetting
@@ -760,16 +813,16 @@ namespace llvm {
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
+                            SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerMemArgument(SDValue Chain,
                              CallingConv::ID CallConv,
                              const SmallVectorImpl<ISD::InputArg> &ArgInfo,
-                             DebugLoc dl, SelectionDAG &DAG,
+                             SDLoc dl, SelectionDAG &DAG,
                              const CCValAssign &VA,  MachineFrameInfo *MFI,
                               unsigned i) const;
     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
-                             DebugLoc dl, SelectionDAG &DAG,
+                             SDLoc dl, SelectionDAG &DAG,
                              const CCValAssign &VA,
                              ISD::ArgFlagsTy Flags) const;
 
@@ -791,7 +844,7 @@ namespace llvm {
     bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv) const;
     SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
                                 SDValue Chain, bool IsTailCall, bool Is64Bit,
-                                int FPDiff, DebugLoc dl) const;
+                                int FPDiff, SDLoc dl) const;
 
     unsigned GetAlignedArgumentStackSize(unsigned StackSize,
                                          SelectionDAG &DAG) const;
@@ -800,37 +853,32 @@ namespace llvm {
                                                bool isSigned,
                                                bool isReplace) const;
 
-    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
-                                   SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+    SDValue LowerGlobalAddress(const GlobalValue *GV, SDLoc dl,
                                int64_t Offset, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
     SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerToBT(SDValue And, ISD::CondCode CC,
-                      DebugLoc dl, SelectionDAG &DAG) const;
+                      SDLoc dl, SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -847,25 +895,13 @@ namespace llvm {
     SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
-
-    // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
-    SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
-    SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
-    SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
-
-    SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
-
-    SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
 
     virtual SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
     virtual SDValue
       LowerCall(CallLoweringInfo &CLI,
@@ -876,7 +912,7 @@ namespace llvm {
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const;
 
     virtual bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
 
@@ -891,6 +927,8 @@ namespace llvm {
                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                    LLVMContext &Context) const;
 
+    virtual const uint16_t *getScratchRegisters(CallingConv::ID CC) const;
+
     /// Utility function to emit atomic-load-arith operations (and, or, xor,
     /// nand, max, min, umax, umin). It takes the corresponding instruction to
     /// expand, the associated machine basic block, and the associated X86
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
new file mode 100644
index 0000000..cb19fbd
--- /dev/null
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -0,0 +1,3526 @@
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8f64  (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v8i64 VR512:$src))),  (v8f64 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))),  (v16f32 VR512:$src)>;
+  def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))),  (v16f32 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+  def : Pat<(v8i64  (bitconvert (v8f64 VR512:$src))),  (v8i64 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))),  (v16i32 VR512:$src)>;
+  def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))),  (v16i32 VR512:$src)>;
+  def : Pat<(v8f64  (bitconvert (v8i64 VR512:$src))),  (v8f64 VR512:$src)>;
+
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128X:$src))), (v2i64 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128X:$src))), (v4i32 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128X:$src))), (v8i16 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128X:$src))), (v16i8 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128X:$src))), (v4f32 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128X:$src))), (v2f64 VR128X:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128X:$src))), (v2f64 VR128X:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+  def : Pat<(v4f64  (bitconvert (v8f32 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v8i32 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v4i64 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v16i16 VR256X:$src))), (v4f64 VR256X:$src)>;
+  def : Pat<(v4f64  (bitconvert (v32i8 VR256X:$src))),  (v4f64 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v8i32 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4i64 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v4f64 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v32i8 VR256X:$src))),  (v8f32 VR256X:$src)>;
+  def : Pat<(v8f32  (bitconvert (v16i16 VR256X:$src))), (v8f32 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8f32 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v8i32 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v4f64 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v32i8 VR256X:$src))),  (v4i64 VR256X:$src)>;
+  def : Pat<(v4i64  (bitconvert (v16i16 VR256X:$src))), (v4i64 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4f64 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v4i64 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8f32 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v8i32 VR256X:$src))),  (v32i8 VR256X:$src)>;
+  def : Pat<(v32i8  (bitconvert (v16i16 VR256X:$src))), (v32i8 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v32i8 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v16i16 VR256X:$src))), (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v8f32 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4i64 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v8i32  (bitconvert (v4f64 VR256X:$src))),  (v8i32 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8f32 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v8i32 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4i64 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v4f64 VR256X:$src))),  (v16i16 VR256X:$src)>;
+  def : Pat<(v16i16 (bitconvert (v32i8 VR256X:$src))),  (v16i16 VR256X:$src)>;
+}
+
+//
+// AVX-512: VPXOR instruction writes zero to its upper part, it's safe build zeros.
+//
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, Predicates = [HasAVX512] in {
+def AVX512_512_SET0 : I<0, Pseudo, (outs VR512:$dst), (ins), "",
+               [(set VR512:$dst, (v16f32 immAllZerosV))]>;
+}
+
+def : Pat<(v8i64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16i32 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v8f64 immAllZerosV), (AVX512_512_SET0)>;
+def : Pat<(v16f32 immAllZerosV), (AVX512_512_SET0)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VECTOR INSERT
+//
+// -- 32x8 form --
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+def VINSERTF32x4rr : AVX512AIi8<0x18, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+          "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTF32x4rm : AVX512AIi8<0x18, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, f128mem:$src2, i8imm:$src3),
+          "vinsertf32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+}
+
+// -- 64x4 fp form --
+let neverHasSideEffects = 1, ExeDomain = SSEPackedDouble in {
+def VINSERTF64x4rr : AVX512AIi8<0x1a, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+          "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTF64x4rm : AVX512AIi8<0x1a, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+          "vinsertf64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+// -- 32x4 integer form --
+let neverHasSideEffects = 1 in {
+def VINSERTI32x4rr : AVX512AIi8<0x38, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR128X:$src2, i8imm:$src3),
+          "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512;
+let mayLoad = 1 in
+def VINSERTI32x4rm : AVX512AIi8<0x38, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, i128mem:$src2, i8imm:$src3),
+          "vinserti32x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+}
+
+let neverHasSideEffects = 1 in {
+// -- 64x4 form --
+def VINSERTI64x4rr : AVX512AIi8<0x3a, MRMSrcReg, (outs VR512:$dst),
+          (ins VR512:$src1, VR256X:$src2, i8imm:$src3),
+          "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W;
+let mayLoad = 1 in
+def VINSERTI64x4rm : AVX512AIi8<0x3a, MRMSrcMem, (outs VR512:$dst),
+          (ins VR512:$src1, i256mem:$src2, i8imm:$src3),
+          "vinserti64x4\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+          []>, EVEX_4V, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (v4f32 VR128X:$src2),
+           (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64  VR512:$src1), (v2f64 VR128X:$src2),
+           (iPTR imm)), (VINSERTF32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (v2i64 VR128X:$src2),
+           (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v4i32 VR128X:$src2),
+           (iPTR imm)), (VINSERTI32x4rr VR512:$src1, VR128X:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+			
+def : Pat<(vinsert128_insert:$ins (v16f32 VR512:$src1), (loadv4f32 addr:$src2),
+           (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1),
+	                (bc_v4i32 (loadv2i64 addr:$src2)),
+           (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8f64  VR512:$src1), (loadv2f64 addr:$src2),
+           (iPTR imm)), (VINSERTF32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (loadv2i64 addr:$src2),
+           (iPTR imm)), (VINSERTI32x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert128_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32  VR512:$src1), (v8f32 VR256X:$src2),
+           (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64  VR512:$src1), (v4f64 VR256X:$src2),
+           (iPTR imm)), (VINSERTF64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i64  VR512:$src1), (v4i64 VR256X:$src2),
+           (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i32 VR512:$src1), (v8i32 VR256X:$src2),
+           (iPTR imm)), (VINSERTI64x4rr VR512:$src1, VR256X:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+def : Pat<(vinsert256_insert:$ins (v16f32  VR512:$src1), (loadv8f32 addr:$src2),
+           (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8f64  VR512:$src1), (loadv4f64 addr:$src2),
+           (iPTR imm)), (VINSERTF64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v8i64  VR512:$src1), (loadv4i64 addr:$src2),
+           (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
+	                (bc_v8i32 (loadv4i64 addr:$src2)),
+           (iPTR imm)), (VINSERTI64x4rm VR512:$src1, addr:$src2,
+                        (INSERT_get_vinsert256_imm VR512:$ins))>;
+
+// vinsertps - insert f32 to XMM
+def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
+      (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
+      "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+      EVEX_4V;
+def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
+      (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
+      "vinsertps{z}\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+      [(set VR128X:$dst, (X86insrtps VR128X:$src1,
+                          (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                          imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 VECTOR EXTRACT
+//---
+let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
+// -- 32x4 form --
+def VEXTRACTF32x4rr : AVX512AIi8<0x19, MRMDestReg, (outs VR128X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512;
+def VEXTRACTF32x4mr : AVX512AIi8<0x19, MRMDestMem, (outs),
+          (ins f128mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextractf32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTF64x4rr : AVX512AIi8<0x1b, MRMDestReg, (outs VR256X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTF64x4mr : AVX512AIi8<0x1b, MRMDestMem, (outs),
+          (ins f256mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextractf64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+let neverHasSideEffects = 1 in {
+// -- 32x4 form --
+def VEXTRACTI32x4rr : AVX512AIi8<0x39, MRMDestReg, (outs VR128X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512;
+def VEXTRACTI32x4mr : AVX512AIi8<0x39, MRMDestMem, (outs),
+          (ins i128mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextracti32x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VT4>;
+
+// -- 64x4 form --
+def VEXTRACTI64x4rr : AVX512AIi8<0x3b, MRMDestReg, (outs VR256X:$dst),
+          (ins VR512:$src1, i8imm:$src2),
+          "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in
+def VEXTRACTI64x4mr : AVX512AIi8<0x3b, MRMDestMem, (outs),
+          (ins i256mem:$dst, VR512:$src1, i8imm:$src2),
+          "vextracti64x4\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          []>, EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT4>;
+}
+
+def : Pat<(vextract128_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+          (v4f32 (VEXTRACTF32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext VR512:$src1, (iPTR imm)),
+          (v4i32 (VEXTRACTF32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+          (v2f64 (VEXTRACTF32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+def : Pat<(vextract128_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+          (v2i64 (VEXTRACTI32x4rr VR512:$src1,
+                  (EXTRACT_get_vextract128_imm VR128X:$ext)))>;
+
+
+def : Pat<(vextract256_extract:$ext (v16f32 VR512:$src1), (iPTR imm)),
+          (v8f32 (VEXTRACTF64x4rr VR512:$src1,
+                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v16i32 VR512:$src1), (iPTR imm)),
+          (v8i32 (VEXTRACTI64x4rr VR512:$src1,
+                    (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8f64 VR512:$src1), (iPTR imm)),
+          (v4f64 (VEXTRACTF64x4rr VR512:$src1,
+                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+def : Pat<(vextract256_extract:$ext (v8i64 VR512:$src1), (iPTR imm)),
+          (v4i64 (VEXTRACTI64x4rr VR512:$src1,
+                  (EXTRACT_get_vextract256_imm VR256X:$ext)))>;
+
+// A 256-bit subvector extract from the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+          (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
+def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+          (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
+def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+          (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
+def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+          (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
+
+// zmm -> xmm
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
+          (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
+          (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
+          (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
+          (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
+
+
+// A 128-bit subvector insert to the first 512-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+def : Pat<(insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
+          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+          sub_ymm)>;
+
+def : Pat<(insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+def : Pat<(insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0)),
+          (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
+
+// vextractps - extract 32 bits from XMM
+def VEXTRACTPSzrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
+      (ins VR128X:$src1, u32u8imm:$src2),
+      "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
+      EVEX;
+
+def VEXTRACTPSzmr : AVX512AIi8<0x17, MRMDestMem, (outs),
+      (ins f32mem:$dst, VR128X:$src1, u32u8imm:$src2),
+      "vextractps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
+                          addr:$dst)]>, EVEX;
+
+//===---------------------------------------------------------------------===//
+// AVX-512 BROADCAST
+//---
+multiclass avx512_fp_broadcast<bits<8> opc, string OpcodeStr, 
+                         RegisterClass DestRC,
+                         RegisterClass SrcRC, X86MemOperand x86memop> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DestRC:$dst), (ins SrcRC:$src),
+         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+         []>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DestRC:$dst), (ins x86memop:$src),
+        !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),[]>, EVEX;
+}
+let ExeDomain = SSEPackedSingle in {
+  defm VBROADCASTSSZ  : avx512_fp_broadcast<0x18, "vbroadcastss{z}", VR512, 
+                                       VR128X, f32mem>,
+                                       EVEX_V512, EVEX_CD8<32, CD8VT1>;
+}
+
+let ExeDomain = SSEPackedDouble in {
+  defm VBROADCASTSDZ  : avx512_fp_broadcast<0x19, "vbroadcastsd{z}", VR512,
+                                       VR128X, f64mem>,
+                                       EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(v16f32 (X86VBroadcast (loadf32 addr:$src))),
+          (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (loadf64 addr:$src))),
+          (VBROADCASTSDZrm addr:$src)>;
+
+def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
+          (VBROADCASTSSZrm addr:$src)>;
+def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
+          (VBROADCASTSDZrm addr:$src)>;
+
+multiclass avx512_int_broadcast_reg<bits<8> opc, string OpcodeStr,
+                          RegisterClass SrcRC, RegisterClass KRC> {
+  def Zrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins SrcRC:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   []>, EVEX, EVEX_V512;
+  def Zkrr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), 
+                   (ins KRC:$mask, SrcRC:$src),
+                   !strconcat(OpcodeStr, 
+                        "\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+                   []>, EVEX, EVEX_V512, EVEX_KZ;
+}
+
+defm VPBROADCASTDr  : avx512_int_broadcast_reg<0x7C, "vpbroadcastd", GR32, VK16WM>;
+defm VPBROADCASTQr  : avx512_int_broadcast_reg<0x7C, "vpbroadcastq", GR64, VK8WM>,
+                                            VEX_W;
+                                            
+def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
+           (VPBROADCASTDrZkrr VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
+
+def : Pat <(v8i64 (X86vzext VK8WM:$mask)),
+           (VPBROADCASTQrZkrr VK8WM:$mask, (i64 (MOV64ri 0x1)))>;
+
+def : Pat<(v16i32 (X86VBroadcast (i32 GR32:$src))),
+        (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (X86VBroadcast (i64 GR64:$src))),
+        (VPBROADCASTQrZrr GR64:$src)>;
+def : Pat<(v8i64 (X86VBroadcastm VK8WM:$mask, (i64 GR64:$src))),
+        (VPBROADCASTQrZkrr VK8WM:$mask, GR64:$src)>;
+
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_i32_512 (i32 GR32:$src))),
+        (VPBROADCASTDrZrr GR32:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_i64_512 (i64 GR64:$src))),
+        (VPBROADCASTQrZrr GR64:$src)>;
+
+multiclass avx512_int_broadcast_rm<bits<8> opc, string OpcodeStr,
+                          X86MemOperand x86memop, PatFrag ld_frag,
+                          RegisterClass DstRC, ValueType OpVT, ValueType SrcVT,
+                          RegisterClass KRC> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins VR128X:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set DstRC:$dst,
+                    (OpVT (X86VBroadcast (SrcVT VR128X:$src))))]>, EVEX;
+  def krr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst), (ins KRC:$mask,
+                                                         VR128X:$src),
+                    !strconcat(OpcodeStr, 
+                    "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                    [(set DstRC:$dst,
+                      (OpVT (X86VBroadcastm KRC:$mask, (SrcVT VR128X:$src))))]>,
+                    EVEX, EVEX_KZ;
+  let mayLoad = 1 in {
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  [(set DstRC:$dst, 
+                    (OpVT (X86VBroadcast (ld_frag addr:$src))))]>, EVEX;
+  def krm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst), (ins KRC:$mask,
+                                                         x86memop:$src),
+                  !strconcat(OpcodeStr, 
+                      "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+                  [(set DstRC:$dst, (OpVT (X86VBroadcastm KRC:$mask, 
+                                     (ld_frag addr:$src))))]>, EVEX, EVEX_KZ;
+  }
+}
+
+defm VPBROADCASTDZ  : avx512_int_broadcast_rm<0x58, "vpbroadcastd", i32mem,
+                      loadi32, VR512, v16i32, v4i32, VK16WM>,
+                      EVEX_V512, EVEX_CD8<32, CD8VT1>;
+defm VPBROADCASTQZ  : avx512_int_broadcast_rm<0x59, "vpbroadcastq", i64mem,
+                      loadi64, VR512, v8i64, v2i64, VK8WM>,  EVEX_V512, VEX_W,
+                      EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(v16i32 (int_x86_avx512_pbroadcastd_512 (v4i32 VR128X:$src))),
+          (VPBROADCASTDZrr VR128X:$src)>;
+def : Pat<(v8i64 (int_x86_avx512_pbroadcastq_512 (v2i64 VR128X:$src))),
+          (VPBROADCASTQZrr VR128X:$src)>;
+
+def : Pat<(v16f32 (X86VBroadcast (v4f32 VR128X:$src))),
+          (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (X86VBroadcast (v2f64 VR128X:$src))),
+          (VBROADCASTSDZrr VR128X:$src)>;
+
+def : Pat<(v16f32 (int_x86_avx512_vbroadcast_ss_ps_512 (v4f32 VR128X:$src))),
+          (VBROADCASTSSZrr VR128X:$src)>;
+def : Pat<(v8f64 (int_x86_avx512_vbroadcast_sd_pd_512 (v2f64 VR128X:$src))),
+          (VBROADCASTSDZrr VR128X:$src)>;
+    
+// Provide fallback in case the load node that is used in the patterns above
+// is used by additional users, which prevents the pattern selection.
+def : Pat<(v16f32 (X86VBroadcast FR32X:$src)),
+          (VBROADCASTSSZrr (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
+def : Pat<(v8f64 (X86VBroadcast FR64X:$src)),
+          (VBROADCASTSDZrr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
+
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8i32 (X86VBroadcastm (v8i1 VK8WM:$mask), (loadi32 addr:$src))),
+           (EXTRACT_SUBREG 
+              (v16i32 (VPBROADCASTDZkrm (COPY_TO_REGCLASS VK8WM:$mask, VK16WM),
+                       addr:$src)), sub_ymm)>;
+}
+//===----------------------------------------------------------------------===//
+// AVX-512 BROADCAST MASK TO VECTOR REGISTER
+//---
+
+multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
+                       RegisterClass DstRC, RegisterClass KRC,
+                       ValueType OpVT, ValueType SrcVT> {
+def rr : AVX512XS8I<opc, MRMDestReg, (outs DstRC:$dst), (ins KRC:$src),
+                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                  []>, EVEX;
+}
+
+defm VPBROADCASTMW2D : avx512_mask_broadcast<0x3A, "vpbroadcastmw2d", VR512,
+                                             VK16, v16i32, v16i1>, EVEX_V512;
+defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", VR512,
+                                            VK8, v8i64, v8i1>, EVEX_V512, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - VPERM
+//
+// -- immediate form --
+multiclass avx512_perm_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         SDNode OpNode, PatFrag mem_frag, 
+                         X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512AIi8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     EVEX;
+  def mi : AVX512AIi8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPERMQZ  : avx512_perm_imm<0x00, "vpermq", VR512, X86VPermi, memopv8i64,
+                        i512mem, v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedDouble in 
+defm VPERMPDZ  : avx512_perm_imm<0x01, "vpermpd", VR512, X86VPermi, memopv8f64, 
+                        f512mem, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM - register form --
+multiclass avx512_perm<bits<8> opc, string OpcodeStr, RegisterClass RC, 
+                     PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT> {
+
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv RC:$src1, RC:$src2)))]>, EVEX_4V;
+
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2),
+                   !strconcat(OpcodeStr,
+                       "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv RC:$src1, (mem_frag addr:$src2))))]>,
+                     EVEX_4V;
+}
+
+defm VPERMDZ   : avx512_perm<0x36, "vpermd",  VR512,  memopv16i32, i512mem,
+                           v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMQZ   : avx512_perm<0x36, "vpermq",  VR512,  memopv8i64,  i512mem, 
+                           v8i64>,  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+let ExeDomain = SSEPackedSingle in
+defm VPERMPSZ  : avx512_perm<0x16, "vpermps", VR512,  memopv16f32, f512mem,
+                           v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMPDZ  : avx512_perm<0x16, "vpermpd", VR512,  memopv8f64, f512mem, 
+                           v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// -- VPERM2I - 3 source operands form --
+multiclass avx512_perm_3src<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                          PatFrag mem_frag, X86MemOperand x86memop,
+                          ValueType OpVT> {
+let Constraints = "$src1 = $dst" in {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv3 RC:$src1, RC:$src2, RC:$src3)))]>,
+                    EVEX_4V;
+
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, x86memop:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (X86VPermv3 RC:$src1, RC:$src2, 
+                      (mem_frag addr:$src3))))]>, EVEX_4V;
+  }
+}
+defm VPERMI2D  : avx512_perm_3src<0x76, "vpermi2d",  VR512, memopv16i32, i512mem, 
+                               v16i32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2Q  : avx512_perm_3src<0x76, "vpermi2q",  VR512, memopv8i64, i512mem, 
+                               v8i64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPERMI2PS : avx512_perm_3src<0x77, "vpermi2ps",  VR512, memopv16f32, i512mem, 
+                               v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPERMI2PD : avx512_perm_3src<0x77, "vpermi2pd",  VR512, memopv8f64, i512mem, 
+                               v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - BLEND using mask
+//
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, Intrinsic Int, 
+                          RegisterClass KRC, RegisterClass RC,
+                          X86MemOperand x86memop, PatFrag mem_frag,
+                          SDNode OpNode, ValueType vt> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+               (ins KRC:$mask, RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+                "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+               [(set RC:$dst, (OpNode KRC:$mask, (vt RC:$src2), 
+                 (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
+  def rr_Int : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+               (ins KRC:$mask, RC:$src1, RC:$src2),
+               !strconcat(OpcodeStr,
+                "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
+               [(set RC:$dst, (Int KRC:$mask, (vt RC:$src2),
+                 (vt RC:$src1)))]>, EVEX_4V, EVEX_K;
+
+  let mayLoad = 1 in {
+    def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                 (ins KRC:$mask, RC:$src1, x86memop:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
+                 []>, 
+                 EVEX_4V, EVEX_K;
+
+    def rm_Int : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+                 (ins KRC:$mask, RC:$src1, x86memop:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $mask, $dst|$dst, $mask, $src1, $src2}"),
+                 [(set RC:$dst, (Int KRC:$mask, (vt RC:$src1),
+                   (mem_frag addr:$src2)))]>,
+                 EVEX_4V, EVEX_K;
+  }
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VBLENDMPSZ : avx512_blendmask<0x65, "vblendmps", 
+                              int_x86_avx512_mskblend_ps_512,
+                              VK16WM, VR512, f512mem,
+                              memopv16f32, vselect, v16f32>, 
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+let ExeDomain = SSEPackedDouble in
+defm VBLENDMPDZ : avx512_blendmask<0x65, "vblendmpd", 
+                              int_x86_avx512_mskblend_pd_512,
+                              VK8WM, VR512, f512mem,
+                              memopv8f64, vselect, v8f64>, 
+                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+defm VPBLENDMDZ : avx512_blendmask<0x64, "vpblendmd", 
+                              int_x86_avx512_mskblend_d_512,
+                              VK16WM, VR512, f512mem, 
+                              memopv16i32, vselect, v16i32>, 
+                              EVEX_CD8<32, CD8VF>, EVEX_V512;
+
+defm VPBLENDMQZ : avx512_blendmask<0x64, "vpblendmq", 
+                              int_x86_avx512_mskblend_q_512, 
+                              VK8WM, VR512, f512mem, 
+                              memopv8i64, vselect, v8i64>, 
+                              VEX_W, EVEX_CD8<64, CD8VF>, EVEX_V512;
+
+let Predicates = [HasAVX512] in {
+def : Pat<(v8f32 (vselect (v8i1 VK8WM:$mask), (v8f32 VR256X:$src1),
+                            (v8f32 VR256X:$src2))),
+            (EXTRACT_SUBREG 
+              (v16f32 (VBLENDMPSZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+
+def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
+                            (v8i32 VR256X:$src2))),
+            (EXTRACT_SUBREG 
+                (v16i32 (VPBLENDMDZrr (COPY_TO_REGCLASS VK8WM:$mask, VK16WM), 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+}
+
+multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX512BI<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))], 
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2)))],
+             IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+}
+
+defm VPCMPEQDZ : avx512_icmp_packed<0x76, "vpcmpeqd", VK16, VR512, i512mem, 
+                           memopv16i32, X86pcmpeqm, v16i32>, EVEX_V512;
+defm VPCMPEQQZ : avx512_icmp_packed<0x29, "vpcmpeqq", VK8, VR512, i512mem, 
+                           memopv8i64, X86pcmpeqm, v8i64>, T8, EVEX_V512, VEX_W;
+
+defm VPCMPGTDZ : avx512_icmp_packed<0x66, "vpcmpgtd", VK16, VR512, i512mem, 
+                           memopv16i32, X86pcmpgtm, v16i32>, EVEX_V512;
+defm VPCMPGTQZ : avx512_icmp_packed<0x37, "vpcmpgtq", VK8, VR512, i512mem, 
+                           memopv8i64, X86pcmpgtm, v8i64>, T8, EVEX_V512, VEX_W;
+
+def : Pat<(v8i1 (X86pcmpgtm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPGTDZrr 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+def : Pat<(v8i1 (X86pcmpeqm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
+            (COPY_TO_REGCLASS (VPCMPEQDZrr 
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm))), VK8)>;
+
+multiclass avx512_icmp_cc<bits<8> opc, RegisterClass KRC,
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt, Operand CC, string asm,
+              string asm_alt> {
+  def rri : AVX512AIi8<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], 
+             IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+  def rmi : AVX512AIi8<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (memop_frag addr:$src2),
+                              imm:$cc))], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  // Accept explicit immediate argument form instead of comparison code.
+  let neverHasSideEffects = 1 in {
+    def rri_alt : AVX512AIi8<opc, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+    def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+  }
+}
+
+defm VPCMPDZ : avx512_icmp_cc<0x1F, VK16, VR512, i512mem, memopv16i32,
+                              X86cmpm, v16i32, AVXCC,
+              "vpcmp${cc}d\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPCMPUDZ : avx512_icmp_cc<0x1E, VK16, VR512, i512mem, memopv16i32,
+                               X86cmpmu, v16i32, AVXCC,
+              "vpcmp${cc}ud\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpud\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQZ : avx512_icmp_cc<0x1F, VK8, VR512, i512mem, memopv8i64,
+                              X86cmpm, v8i64, AVXCC,
+              "vpcmp${cc}q\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQZ : avx512_icmp_cc<0x1E, VK8, VR512, i512mem, memopv8i64,
+                               X86cmpmu, v8i64, AVXCC,
+              "vpcmp${cc}uq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+              "vpcmpuq\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}">,
+              VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+// avx512_cmp_packed - sse 1 & 2 compare packed instructions
+multiclass avx512_cmp_packed<RegisterClass KRC, RegisterClass RC,
+                           X86MemOperand x86memop, Operand CC,
+                           SDNode OpNode, ValueType vt, string asm,
+                           string asm_alt, Domain d> {
+  def rri : AVX512PIi8<0xC2, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2), imm:$cc))], d>;
+  def rmi : AVX512PIi8<0xC2, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
+             [(set KRC:$dst,
+              (OpNode (vt RC:$src1), (memop addr:$src2), imm:$cc))], d>;
+
+  // Accept explicit immediate argument form instead of comparison code.
+  let neverHasSideEffects = 1 in {
+    def rri_alt : AVX512PIi8<0xC2, MRMSrcReg,
+               (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
+               asm_alt, [], d>;
+    def rmi_alt : AVX512PIi8<0xC2, MRMSrcMem,
+               (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
+               asm_alt, [], d>;
+  }
+}
+
+defm VCMPPSZ : avx512_cmp_packed<VK16, VR512, f512mem, AVXCC, X86cmpm, v16f32,
+               "vcmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "vcmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedSingle>, EVEX_4V, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VCMPPDZ : avx512_cmp_packed<VK8, VR512, f512mem, AVXCC, X86cmpm, v8f64,
+               "vcmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               "vcmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+               SSEPackedDouble>, OpSize, EVEX_4V, VEX_W, EVEX_V512,
+               EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i1 (X86cmpm (v8f32 VR256X:$src1), (v8f32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VCMPPSZrri
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpm (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+def : Pat<(v8i1 (X86cmpmu (v8i32 VR256X:$src1), (v8i32 VR256X:$src2), imm:$cc)),
+          (COPY_TO_REGCLASS (VPCMPUDZrri
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)),
+            (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src2, sub_ymm)),
+            imm:$cc), VK8)>;
+               
+// Mask register copy, including
+// - copy between mask registers
+// - load/store mask registers
+// - copy from GPR to mask register and vice versa
+//
+multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
+                         string OpcodeStr, RegisterClass KRC,
+                         ValueType vt, X86MemOperand x86memop> {
+  let neverHasSideEffects = 1 in {
+    def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+    let mayLoad = 1 in
+    def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               [(set KRC:$dst, (vt (load addr:$src)))]>;
+    let mayStore = 1 in
+    def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+  }
+}
+
+multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
+                             string OpcodeStr,
+                             RegisterClass KRC, RegisterClass GRC> {
+  let neverHasSideEffects = 1 in {
+    def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+    def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+  }
+}
+
+let Predicates = [HasAVX512] in {
+  defm KMOVW : avx512_mask_mov<0x90, 0x90, 0x91, "kmovw", VK16, v16i1, i16mem>,
+               VEX, TB;
+  defm KMOVW : avx512_mask_mov_gpr<0x92, 0x93, "kmovw", VK16, GR32>,
+               VEX, TB;
+}
+
+let Predicates = [HasAVX512] in {
+  // GR16 from/to 16-bit mask
+  def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
+            (KMOVWkr (SUBREG_TO_REG (i32 0), GR16:$src, sub_16bit))>;
+  def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
+            (EXTRACT_SUBREG (KMOVWrk VK16:$src), sub_16bit)>;
+
+  // Store kreg in memory
+  def : Pat<(store (v16i1 VK16:$src), addr:$dst),
+            (KMOVWmk addr:$dst, VK16:$src)>;
+
+  def : Pat<(store (v8i1 VK8:$src), addr:$dst),
+            (KMOVWmk addr:$dst, (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16)))>;
+}
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+  // GR from/to 8-bit mask without native support
+  def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
+            (COPY_TO_REGCLASS
+              (KMOVWkr (SUBREG_TO_REG (i32 0), GR8:$src, sub_8bit)),
+              VK8)>;
+  def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
+            (EXTRACT_SUBREG
+              (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
+              sub_8bit)>;
+}
+
+// Mask unary operation
+// - KNOT
+multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
+                         RegisterClass KRC, SDPatternOperator OpNode> {
+  let Predicates = [HasAVX512] in
+    def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               [(set KRC:$dst, (OpNode KRC:$src))]>;
+}
+
+multiclass avx512_mask_unop_w<bits<8> opc, string OpcodeStr,
+                               SDPatternOperator OpNode> {
+  defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                          VEX, TB;
+}
+
+defm KNOT : avx512_mask_unop_w<0x44, "knot", not>;
+
+def : Pat<(xor VK16:$src1, (v16i1 immAllOnesV)), (KNOTWrr VK16:$src1)>;
+def : Pat<(xor VK8:$src1,  (v8i1 immAllOnesV)),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK8:$src1, VK16)), VK8)>;
+
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+def : Pat<(not VK8:$src),
+          (COPY_TO_REGCLASS
+            (KNOTWrr (COPY_TO_REGCLASS VK8:$src, VK16)), VK8)>;
+
+// Mask binary operation
+// - KADD, KAND, KANDN, KOR, KXNOR, KXOR
+multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
+                           RegisterClass KRC, SDPatternOperator OpNode> {
+  let Predicates = [HasAVX512] in
+    def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_binop_w<bits<8> opc, string OpcodeStr,
+                             SDPatternOperator OpNode> {
+  defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                           VEX_4V, VEX_L, TB;
+}
+
+def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
+def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
+
+let isCommutable = 1 in {
+  defm KADD  : avx512_mask_binop_w<0x4a, "kadd",  add>;
+  defm KAND  : avx512_mask_binop_w<0x41, "kand",  and>;
+  let isCommutable = 0 in
+  defm KANDN : avx512_mask_binop_w<0x42, "kandn", andn>;
+  defm KOR   : avx512_mask_binop_w<0x45, "kor",   or>;
+  defm KXNOR : avx512_mask_binop_w<0x46, "kxnor", xnor>;
+  defm KXOR  : avx512_mask_binop_w<0x47, "kxor",  xor>;
+}
+
+multiclass avx512_mask_binop_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
+                VK16:$src1, VK16:$src2),
+              (!cast<Instruction>(InstName##"Wrr") VK16:$src1, VK16:$src2)>;
+}
+
+defm : avx512_mask_binop_int<"kadd",  "KADD">;
+defm : avx512_mask_binop_int<"kand",  "KAND">;
+defm : avx512_mask_binop_int<"kandn", "KANDN">;
+defm : avx512_mask_binop_int<"kor",   "KOR">;
+defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
+defm : avx512_mask_binop_int<"kxor",  "KXOR">;
+// With AVX-512, 8-bit mask is promoted to 16-bit mask.
+multiclass avx512_binop_pat<SDPatternOperator OpNode, Instruction Inst> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(OpNode VK8:$src1, VK8:$src2),
+              (COPY_TO_REGCLASS
+                (Inst (COPY_TO_REGCLASS VK8:$src1, VK16),
+                      (COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
+}
+
+defm : avx512_binop_pat<and,  KANDWrr>;
+defm : avx512_binop_pat<andn, KANDNWrr>;
+defm : avx512_binop_pat<or,   KORWrr>;
+defm : avx512_binop_pat<xnor, KXNORWrr>;
+defm : avx512_binop_pat<xor,  KXORWrr>;
+
+// Mask unpacking
+multiclass avx512_mask_unpck<bits<8> opc, string OpcodeStr,
+                           RegisterClass KRC1, RegisterClass KRC2> {
+  let Predicates = [HasAVX512] in
+    def rr : I<opc, MRMSrcReg, (outs KRC1:$dst), (ins KRC2:$src1, KRC2:$src2),
+               !strconcat(OpcodeStr,
+                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+}
+
+multiclass avx512_mask_unpck_bw<bits<8> opc, string OpcodeStr> {
+  defm BW : avx512_mask_unpck<opc, !strconcat(OpcodeStr, "bw"), VK16, VK8>,
+                            VEX_4V, VEX_L, OpSize, TB;
+}
+
+defm KUNPCK : avx512_mask_unpck_bw<0x4b, "kunpck">;
+
+multiclass avx512_mask_unpck_int<string IntName, string InstName> {
+  let Predicates = [HasAVX512] in
+    def : Pat<(!cast<Intrinsic>("int_x86_"##IntName##"_v16i1")
+                VK8:$src1, VK8:$src2),
+              (!cast<Instruction>(InstName##"BWrr") VK8:$src1, VK8:$src2)>;
+}
+
+defm : avx512_mask_unpck_int<"kunpck", "KUNPCK">;
+// Mask bit testing
+multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                            SDNode OpNode> {
+  let Predicates = [HasAVX512], Defs = [EFLAGS] in
+    def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
+               [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+}
+
+multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  defm W : avx512_mask_testop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                            VEX, TB;
+}
+
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
+defm KTEST   : avx512_mask_testop_w<0x99, "ktest", X86ktest>;
+
+// Mask shift
+multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                             SDNode OpNode> {
+  let Predicates = [HasAVX512] in
+    def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, i8imm:$imm),
+                 !strconcat(OpcodeStr,
+                            "\t{$imm, $src, $dst|$dst, $src, $imm}"),
+                            [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+}
+
+multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
+                               SDNode OpNode> {
+  defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
+                             VEX, OpSize, TA, VEX_W;
+}
+
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", shl>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", srl>;
+
+// Mask setting all 0s or 1s
+multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
+  let Predicates = [HasAVX512] in
+    let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
+      def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+                     [(set KRC:$dst, (VT Val))]>;
+}
+
+multiclass avx512_mask_setop_w<PatFrag Val> {
+  defm B : avx512_mask_setop<VK8,  v8i1, Val>;
+  defm W : avx512_mask_setop<VK16, v16i1, Val>;
+}
+
+defm KSET0 : avx512_mask_setop_w<immAllZerosV>;
+defm KSET1 : avx512_mask_setop_w<immAllOnesV>;
+
+// With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8i1 immAllZerosV), (COPY_TO_REGCLASS (KSET0W), VK8)>;
+  def : Pat<(v8i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK8)>;
+}
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 0))),
+          (v8i1 (COPY_TO_REGCLASS VK16:$src, VK8))>;
+
+def : Pat<(v16i1 (insert_subvector undef, (v8i1 VK8:$src), (iPTR 0))),
+          (v16i1 (COPY_TO_REGCLASS VK8:$src, VK16))>;
+
+def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
+          (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Aligned and unaligned load and store
+//
+
+multiclass avx512_mov_packed<bits<8> opc, RegisterClass RC, RegisterClass KRC,
+                            X86MemOperand x86memop, PatFrag ld_frag, 
+                            string asm, Domain d> {
+let neverHasSideEffects = 1 in
+  def rr : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], d>,
+              EVEX;
+let canFoldAsLoad = 1 in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+               [(set RC:$dst, (ld_frag addr:$src))], d>, EVEX;
+let Constraints = "$src1 = $dst" in {
+  def rrk : AVX512PI<opc, MRMSrcReg, (outs RC:$dst), 
+                                     (ins RC:$src1, KRC:$mask, RC:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), [], d>,
+              EVEX, EVEX_K;
+  def rmk : AVX512PI<opc, MRMSrcMem, (outs RC:$dst),
+                                (ins RC:$src1, KRC:$mask, x86memop:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+               [], d>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVAPSZ : avx512_mov_packed<0x28, VR512, VK16WM, f512mem, alignedloadv16f32,
+                              "vmovaps", SSEPackedSingle>,
+                               EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVAPDZ : avx512_mov_packed<0x28, VR512, VK8WM, f512mem, alignedloadv8f64,
+                              "vmovapd", SSEPackedDouble>,
+                              OpSize, EVEX_V512, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+defm VMOVUPSZ : avx512_mov_packed<0x10, VR512, VK16WM, f512mem, loadv16f32,
+                              "vmovups", SSEPackedSingle>,
+                              EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVUPDZ : avx512_mov_packed<0x10, VR512, VK8WM, f512mem, loadv8f64,
+                              "vmovupd", SSEPackedDouble>,
+                               OpSize, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+def VMOVAPSZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovaps\t{$src, $dst|$dst, $src}",
+                    [(alignedstore512 (v16f32 VR512:$src), addr:$dst)],
+                    SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVAPDZmr : AVX512PI<0x29, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovapd\t{$src, $dst|$dst, $src}",
+                    [(alignedstore512 (v8f64 VR512:$src), addr:$dst)],
+                    SSEPackedDouble>, EVEX, EVEX_V512,
+                    OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+def VMOVUPSZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovups\t{$src, $dst|$dst, $src}",
+                    [(store (v16f32 VR512:$src), addr:$dst)],
+                    SSEPackedSingle>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVUPDZmr : AVX512PI<0x11, MRMDestMem, (outs), (ins f512mem:$dst, VR512:$src),
+                    "vmovupd\t{$src, $dst|$dst, $src}",
+                    [(store (v8f64 VR512:$src), addr:$dst)],
+                    SSEPackedDouble>, EVEX, EVEX_V512,
+                    OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+let neverHasSideEffects = 1 in {
+  def VMOVDQA32rr  : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+                             (ins VR512:$src),
+                             "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+                             EVEX, EVEX_V512;
+  def VMOVDQA64rr  : AVX512BI<0x6F, MRMSrcReg, (outs VR512:$dst),
+                             (ins VR512:$src),
+                             "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+                             EVEX, EVEX_V512, VEX_W;
+let mayStore = 1 in {
+  def VMOVDQA32mr  : AVX512BI<0x7F, MRMDestMem, (outs),
+                     (ins i512mem:$dst, VR512:$src),
+                     "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+                     EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def VMOVDQA64mr  : AVX512BI<0x7F, MRMDestMem, (outs),
+                     (ins i512mem:$dst, VR512:$src),
+                     "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+                     EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+let mayLoad = 1 in {
+def VMOVDQA32rm  : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst), 
+                           (ins i512mem:$src),
+                           "vmovdqa32\t{$src, $dst|$dst, $src}", []>,
+                           EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+def VMOVDQA64rm  : AVX512BI<0x6F, MRMSrcMem, (outs VR512:$dst),
+                           (ins i512mem:$src),
+                           "vmovdqa64\t{$src, $dst|$dst, $src}", []>,
+                           EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+}
+
+// 512-bit aligned load/store
+def : Pat<(alignedloadv8i64 addr:$src),  (VMOVDQA64rm addr:$src)>;
+def : Pat<(alignedloadv16i32 addr:$src), (VMOVDQA32rm addr:$src)>;
+
+def : Pat<(alignedstore512 (v8i64  VR512:$src), addr:$dst),
+          (VMOVDQA64mr addr:$dst, VR512:$src)>;
+def : Pat<(alignedstore512 (v16i32 VR512:$src), addr:$dst),
+          (VMOVDQA32mr addr:$dst, VR512:$src)>;
+
+multiclass avx512_mov_int<bits<8> load_opc, bits<8> store_opc, string asm,
+                          RegisterClass RC, RegisterClass KRC,
+                          PatFrag ld_frag, X86MemOperand x86memop> {
+let neverHasSideEffects = 1 in
+  def rr : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                     !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
+let canFoldAsLoad = 1 in
+  def rm : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                     !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+                     [(set RC:$dst, (ld_frag addr:$src))]>, EVEX;
+let mayStore = 1 in
+  def mr : AVX512XSI<store_opc, MRMDestMem, (outs),
+                     (ins x86memop:$dst, VR512:$src),
+                     !strconcat(asm, "\t{$src, $dst|$dst, $src}"), []>, EVEX;
+let Constraints = "$src1 = $dst" in {
+  def rrk : AVX512XSI<load_opc, MRMSrcReg, (outs RC:$dst),
+                                      (ins RC:$src1, KRC:$mask, RC:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), []>,
+              EVEX, EVEX_K;
+  def rmk : AVX512XSI<load_opc, MRMSrcMem, (outs RC:$dst),
+                                  (ins RC:$src1, KRC:$mask, x86memop:$src2),
+              !strconcat(asm, 
+              "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+               []>, EVEX, EVEX_K;
+}
+}
+
+defm VMOVDQU32 : avx512_mov_int<0x6F, 0x7F, "vmovdqu32", VR512, VK16WM,
+                                memopv16i32, i512mem>,
+                                EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMOVDQU64 : avx512_mov_int<0x6F, 0x7F, "vmovdqu64", VR512, VK8WM,
+                                memopv8i64, i512mem>,
+                                EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+// 512-bit unaligned load/store
+def : Pat<(loadv8i64 addr:$src),         (VMOVDQU64rm addr:$src)>;
+def : Pat<(loadv16i32 addr:$src),        (VMOVDQU32rm addr:$src)>;
+
+def : Pat<(store (v8i64  VR512:$src), addr:$dst),
+          (VMOVDQU64mr addr:$dst, VR512:$src)>;
+def : Pat<(store (v16i32 VR512:$src), addr:$dst),
+          (VMOVDQU32mr addr:$dst, VR512:$src)>;
+
+let AddedComplexity = 20 in {
+def : Pat<(v16f32 (vselect VK16WM:$mask, (v16f32 VR512:$src1), 
+                           (v16f32 VR512:$src2))),
+                  (VMOVUPSZrrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8f64 (vselect VK8WM:$mask, (v8f64 VR512:$src1), 
+                           (v8f64 VR512:$src2))),
+                  (VMOVUPDZrrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+def : Pat<(v16i32 (vselect VK16WM:$mask, (v16i32 VR512:$src1), 
+                           (v16i32 VR512:$src2))),
+                  (VMOVDQU32rrk VR512:$src2, VK16WM:$mask, VR512:$src1)>;
+def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src1), 
+                           (v8i64 VR512:$src2))),
+                  (VMOVDQU64rrk VR512:$src2, VK8WM:$mask, VR512:$src1)>;
+}
+// Move Int Doubleword to Packed Double Int
+//
+def VMOVDI2PDIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
+                        EVEX, VEX_LIG;
+def VMOVDI2PDIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
+                        IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+def VMOV64toPQIZrr : AVX512SI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
+                      "vmovq{z}\t{$src, $dst|$dst, $src}",
+                        [(set VR128X:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))],
+                          IIC_SSE_MOVDQ>, EVEX, VEX_W, VEX_LIG;
+let isCodeGenOnly = 1 in {
+def VMOV64toSDZrr : AVX512SI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "vmovq{z}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))],
+                       IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOVSDto64Zrr : AVX512SI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                         "vmovq{z}\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FR64:$src))],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+}
+def VMOVSDto64Zmr : AVX512SI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         "vmovq{z}\t{$src, $dst|$dst, $src}",
+                         [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                         IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteStore]>,
+                         EVEX_CD8<64, CD8VT1>;
+
+// Move Int Doubleword to Single Scalar
+//
+let isCodeGenOnly = 1 in {
+def VMOVDI2SSZrr  : AVX512SI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert GR32:$src))],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG;
+
+def VMOVDI2SSZrm  : AVX512SI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Packed Doubleword Int to Packed Double Int
+//
+def VMOVPDI2DIZrr  : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src),
+                       "vmovd{z}\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128X:$src),
+                                        (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
+                       EVEX, VEX_LIG;
+def VMOVPDI2DIZmr  : AVX512SI<0x7E, MRMDestMem, (outs),
+                       (ins i32mem:$dst, VR128X:$src),
+                       "vmovd{z}\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128X:$src),
+                                     (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+
+// Move Packed Doubleword Int first element to Doubleword Int
+//
+def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
+                      "vmovq{z}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
+                                                   (iPTR 0)))],
+                      IIC_SSE_MOVD_ToGP>, TB, OpSize, EVEX, VEX_LIG, VEX_W,
+                      Requires<[HasAVX512, In64BitMode]>;
+
+def VMOVPQIto64Zmr : I<0xD6, MRMDestMem, (outs),
+                       (ins i64mem:$dst, VR128X:$src),
+                       "vmovq{z}\t{$src, $dst|$dst, $src}",
+                       [(store (extractelt (v2i64 VR128X:$src), (iPTR 0)),
+                               addr:$dst)], IIC_SSE_MOVDQ>,
+                       EVEX, OpSize, VEX_LIG, VEX_W, TB, EVEX_CD8<64, CD8VT1>,
+                       Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>;
+
+// Move Scalar Single to Double Int
+//
+let isCodeGenOnly = 1 in {
+def VMOVSS2DIZrr  : AVX512SI<0x7E, MRMDestReg, (outs GR32:$dst),
+                      (ins FR32X:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32X:$src))],
+                      IIC_SSE_MOVD_ToGP>, EVEX, VEX_LIG;
+def VMOVSS2DIZmr  : AVX512SI<0x7E, MRMDestMem, (outs),
+                      (ins i32mem:$dst, FR32X:$src),
+                      "vmovd{z}\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
+                      IIC_SSE_MOVDQ>, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+}
+
+// Move Quadword Int to Packed Quadword Int
+//
+def VMOVQI2PQIZrm : AVX512SI<0x6E, MRMSrcMem, (outs VR128X:$dst),
+                      (ins i64mem:$src),
+                      "vmovq{z}\t{$src, $dst|$dst, $src}",
+                      [(set VR128X:$dst,
+                        (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
+                      EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  MOVSS, MOVSD
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_move_scalar <string asm, RegisterClass RC, 
+                              SDNode OpNode, ValueType vt,
+                              X86MemOperand x86memop, PatFrag mem_pat> {
+  def rr : SI<0x10, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, RC:$src2), 
+              !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+              [(set VR128X:$dst, (vt (OpNode VR128X:$src1,
+                                      (scalar_to_vector RC:$src2))))],
+              IIC_SSE_MOV_S_RR>, EVEX_4V, VEX_LIG;
+  def rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set RC:$dst, (mem_pat addr:$src))], IIC_SSE_MOV_S_RM>,
+              EVEX, VEX_LIG;
+  def mr: SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+             !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+             [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+             EVEX, VEX_LIG;
+}
+
+let ExeDomain = SSEPackedSingle in
+defm VMOVSSZ : avx512_move_scalar<"movss{z}", FR32X, X86Movss, v4f32, f32mem,
+                                 loadf32>, XS, EVEX_CD8<32, CD8VT1>;
+
+let ExeDomain = SSEPackedDouble in
+defm VMOVSDZ : avx512_move_scalar<"movsd{z}", FR64X, X86Movsd, v2f64, f64mem,
+                                 loadf64>, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+
+// For the disassembler
+let isCodeGenOnly = 1 in {
+  def VMOVSSZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+                        (ins VR128X:$src1, FR32X:$src2),
+                        "movss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                        IIC_SSE_MOV_S_RR>,
+                        XS, EVEX_4V, VEX_LIG;
+  def VMOVSDZrr_REV : SI<0x11, MRMDestReg, (outs VR128X:$dst),
+                        (ins VR128X:$src1, FR64X:$src2),
+                        "movsd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+                        IIC_SSE_MOV_S_RR>,
+                        XD, EVEX_4V, VEX_LIG, VEX_W;
+}
+
+let Predicates = [HasAVX512] in {
+  let AddedComplexity = 15 in {
+  // Move scalar to XMM zero-extended, zeroing a VR128X then do a
+  // MOVS{S,D} to the lower bits.
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
+            (VMOVSSZrr (v4f32 (V_SET0)), FR32X:$src)>;
+  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
+            (VMOVSSZrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
+            (VMOVSSZrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
+            (VMOVSDZrr (v2f64 (V_SET0)), FR64X:$src)>;
+
+  // Move low f32 and clear high bits.
+  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4f32 (V_SET0)), 
+              (EXTRACT_SUBREG (v8f32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSSZrr (v4i32 (V_SET0)),
+                       (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+  }
+
+  let AddedComplexity = 20 in {
+  // MOVSSrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)>;
+
+  // MOVSDrm zeros the high parts of the register; represent this
+  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
+  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+  def : Pat<(v2f64 (X86vzload addr:$src)),
+            (COPY_TO_REGCLASS (VMOVSDZrm addr:$src), VR128X)>;
+
+  // Represent the same patterns above but in the form they appear for
+  // 256-bit types
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrm addr:$src), sub_xmm)>;
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
+  }
+  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
+                   (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (V_SET0)),
+                                            FR32X:$src)), sub_xmm)>;
+  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
+                   (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (V_SET0)),
+                                     FR64X:$src)), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
+
+  // Move low f64 and clear high bits.
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (VMOVSDZrr (v2f64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (V_SET0)),
+                       (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+  // Extract and store.
+  def : Pat<(store (f32 (vector_extract (v4f32 VR128X:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+  def : Pat<(store (f64 (vector_extract (v2f64 VR128X:$src), (iPTR 0))),
+                   addr:$dst),
+            (VMOVSDZmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128X:$src), FR64X))>;
+
+  // Shuffle with VMOVSS
+  def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4i32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
+  def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
+            (VMOVSSZrr (v4f32 VR128X:$src1),
+                      (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
+
+  // 256-bit variants
+  def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  // Shuffle with VMOVSD
+  def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+
+  // 256-bit variants
+  def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+  def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
+            (SUBREG_TO_REG (i32 0),
+              (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
+                        (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
+              sub_xmm)>;
+
+  def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+  def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
+            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+}
+
+let AddedComplexity = 15 in
+def VMOVZPQILo2PQIZrr : AVX512XSI<0x7E, MRMSrcReg, (outs VR128X:$dst),
+                                (ins VR128X:$src),
+                                "vmovq{z}\t{$src, $dst|$dst, $src}",
+                                [(set VR128X:$dst, (v2i64 (X86vzmovl 
+                                                   (v2i64 VR128X:$src))))],
+                                IIC_SSE_MOVQ_RR>, EVEX, VEX_W;
+
+let AddedComplexity = 20 in
+def VMOVZPQILo2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
+                                 (ins i128mem:$src),
+                                 "vmovq{z}\t{$src, $dst|$dst, $src}",
+                                 [(set VR128X:$dst, (v2i64 (X86vzmovl
+                                                     (loadv2i64 addr:$src))))],
+                                 IIC_SSE_MOVDQ>, EVEX, VEX_W,
+                                 EVEX_CD8<8, CD8VT8>;
+
+let Predicates = [HasAVX512] in {
+  // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
+              (VMOV64toPQIZrr GR64:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIZrr GR32:$src)>;
+              
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (VMOVDI2PDIZrm addr:$src)>;
+    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (VMOVZPQILo2PQIZrm addr:$src)>;
+    def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
+            (VMOVZPQILo2PQIZrr VR128X:$src)>;
+  }
+
+  // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
+  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
+                               (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
+                               (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+            (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+}
+
+def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Integer arithmetic
+//
+multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop, PatFrag scalar_mfrag,
+                        X86MemOperand x86scalar_mop, string BrdcstStr,
+                        OpndItins itins, bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], 
+       itins.rr>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (memop_frag addr:$src2))))],
+                                     itins.rm>, EVEX_4V;
+  def rmb : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+                  ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+       [(set RC:$dst, (OpNode RC:$src1, 
+                       (OpVT (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+                        itins.rm>, EVEX_4V, EVEX_B;
+}
+multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins,
+                         bit IsCommutable = 0> {
+  let isCommutable = IsCommutable in
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       []>, EVEX_4V, VEX_W;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       []>, EVEX_4V, VEX_W;
+}
+
+defm VPADDDZ : avx512_binop_rm<0xFE, "vpaddd", add, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPSUBDZ : avx512_binop_rm<0xFA, "vpsubd", sub, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 0>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPMULLDZ : avx512_binop_rm<0x40, "vpmulld", mul, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPADDQZ : avx512_binop_rm<0xD4, "vpaddq", add, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 1>, 
+                   EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_W;
+
+defm VPSUBQZ : avx512_binop_rm<0xFB, "vpsubq", sub, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMULDQZ : avx512_binop_rm2<0x28, "vpmuldq", v8i64, v16i32,
+                VR512, memopv8i64, i512mem, SSE_INTALU_ITINS_P, 1>, T8,
+                EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+defm VPMULUDQZ : avx512_binop_rm2<0xF4, "vpmuludq", v8i64, v16i32,
+                 VR512, memopv8i64, i512mem, SSE_INTMUL_ITINS_P, 1>, EVEX_V512,
+                 EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v8i64 (X86pmuludq (v16i32 VR512:$src1), (v16i32 VR512:$src2))),
+          (VPMULUDQZrr VR512:$src1, VR512:$src2)>;
+
+defm VPMAXUDZ : avx512_binop_rm<0x3F, "vpmaxud", X86umax, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXUQZ : avx512_binop_rm<0x3F, "vpmaxuq", X86umax, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMAXSDZ : avx512_binop_rm<0x3D, "vpmaxsd", X86smax, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMAXSQZ : avx512_binop_rm<0x3D, "vpmaxsq", X86smax, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINUDZ : avx512_binop_rm<0x3B, "vpminud", X86umin, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINUQZ : avx512_binop_rm<0x3B, "vpminuq", X86umin, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VPMINSDZ : avx512_binop_rm<0x39, "vpminsd", X86smin, v16i32, VR512, memopv16i32,
+                   i512mem, loadi32, i32mem, "{1to16}", SSE_INTALU_ITINS_P, 1>,
+                   T8, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPMINSQZ : avx512_binop_rm<0x39, "vpminsq", X86smin, v8i64, VR512, memopv8i64,
+                   i512mem, loadi64, i64mem, "{1to8}", SSE_INTALU_ITINS_P, 0>,
+                   T8, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - Unpack Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_unpack_fp<bits<8> opc, SDNode OpNode, ValueType vt,
+                                   PatFrag mem_frag, RegisterClass RC,
+                                   X86MemOperand x86memop, string asm,
+                                   Domain d> {
+    def rr : AVX512PI<opc, MRMSrcReg,
+                (outs RC:$dst), (ins RC:$src1, RC:$src2),
+                asm, [(set RC:$dst,
+                           (vt (OpNode RC:$src1, RC:$src2)))],
+                           d>, EVEX_4V;
+    def rm : AVX512PI<opc, MRMSrcMem,
+                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+                asm, [(set RC:$dst,
+                       (vt (OpNode RC:$src1,
+                            (bitconvert (mem_frag addr:$src2)))))],
+                        d>, EVEX_4V;
+}
+
+defm VUNPCKHPSZ: avx512_unpack_fp<0x15, X86Unpckh, v16f32, memopv8f64,
+      VR512, f512mem, "vunpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKHPDZ: avx512_unpack_fp<0x15, X86Unpckh, v8f64, memopv8f64,
+      VR512, f512mem, "vunpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VUNPCKLPSZ: avx512_unpack_fp<0x14, X86Unpckl, v16f32, memopv8f64,
+      VR512, f512mem, "vunpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VUNPCKLPDZ: avx512_unpack_fp<0x14, X86Unpckl, v8f64, memopv8f64,
+      VR512, f512mem, "vunpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+      SSEPackedDouble>, OpSize, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_unpack_int<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+                        X86MemOperand x86memop> {
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1), (OpVT RC:$src2))))], 
+       IIC_SSE_UNPCK>, EVEX_4V;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpVT (OpNode (OpVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))],
+                                     IIC_SSE_UNPCK>, EVEX_4V;
+}
+defm VPUNPCKLDQZ  : avx512_unpack_int<0x62, "vpunpckldq", X86Unpckl, v16i32,
+                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPUNPCKLQDQZ : avx512_unpack_int<0x6C, "vpunpcklqdq", X86Unpckl, v8i64,
+                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPUNPCKHDQZ  : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
+                                VR512, memopv16i32, i512mem>, EVEX_V512,
+                                EVEX_CD8<32, CD8VF>;
+defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
+                                VR512, memopv8i64, i512mem>, EVEX_V512,
+                                VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - PSHUFD
+//
+
+multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                         SDNode OpNode, PatFrag mem_frag, 
+                         X86MemOperand x86memop, ValueType OpVT> {
+  def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+                     EVEX;
+  def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
+                     (ins x86memop:$src1, i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                         "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                       (OpVT (OpNode (mem_frag addr:$src1),
+                              (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+                      i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
+                      memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
+                      EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
+                      memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
+                      VEX_W, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+          (VPERMILPDZri VR512:$src1, imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Logical Instructions
+//===----------------------------------------------------------------------===//
+
+defm VPANDDZ : avx512_binop_rm<0xDB, "vpandd", and, v16i32, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDQZ : avx512_binop_rm<0xDB, "vpandq", and, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPORDZ  : avx512_binop_rm<0xEB, "vpord", or, v16i32, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPORQZ  : avx512_binop_rm<0xEB, "vporq", or, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPXORDZ : avx512_binop_rm<0xEF, "vpxord", xor, v16i32, VR512, memopv16i32,
+                      i512mem, loadi32, i32mem, "{1to16}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPXORQZ : avx512_binop_rm<0xEF, "vpxorq", xor, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 1>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPANDNDZ : avx512_binop_rm<0xDF, "vpandnd", X86andnp, v16i32, VR512,
+                      memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                      SSE_BIT_ITINS_P, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPANDNQZ : avx512_binop_rm<0xDF, "vpandnq", X86andnp, v8i64, VR512, memopv8i64,
+                      i512mem, loadi64, i64mem, "{1to8}", SSE_BIT_ITINS_P, 0>,
+                      EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  FP arithmetic
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins> {
+  defm SSZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss{z}"), OpNode, FR32X,
+                             f32mem, itins.s, 0>, XS, EVEX_4V, VEX_LIG,
+                             EVEX_CD8<32, CD8VT1>;
+  defm SDZ : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd{z}"), OpNode, FR64X,
+                             f64mem, itins.d, 0>, XD, VEX_W, EVEX_4V, VEX_LIG,
+                             EVEX_CD8<64, CD8VT1>;
+}
+
+let isCommutable = 1 in {
+defm VADD : avx512_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>;
+defm VMUL : avx512_binop_s<0x59, "mul", fmul, SSE_ALU_ITINS_S>;
+defm VMIN : avx512_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>;
+defm VMAX : avx512_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>;
+}
+let isCommutable = 0 in {
+defm VSUB : avx512_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>;
+defm VDIV : avx512_binop_s<0x5E, "div", fdiv, SSE_ALU_ITINS_S>;
+}
+
+multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag,
+                           X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+                           string BrdcstStr,
+                           Domain d, OpndItins itins, bit commutable> {
+  let isCommutable = commutable in
+    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
+       EVEX_4V, TB;
+  let mayLoad = 1 in {
+    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
+       !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
+          itins.rm, d>, EVEX_4V, TB;
+    def rmb : PI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+                  ", $src1, $dst|$dst, $src1, ${src2}", BrdcstStr, "}"),
+       [(set RC:$dst, (OpNode RC:$src1, 
+                       (vt (X86VBroadcast (scalar_mfrag addr:$src2)))))],
+       itins.rm, d>, EVEX_4V, EVEX_B, TB;
+    }
+}
+
+defm VADDPSZ : avx512_fp_packed<0x58, "addps", fadd, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle, 
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+                   
+defm VADDPDZ : avx512_fp_packed<0x58, "addpd", fadd, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMULPSZ : avx512_fp_packed<0x59, "mulps", fmul, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMULPDZ : avx512_fp_packed<0x59, "mulpd", fmul, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VMINPSZ : avx512_fp_packed<0x5D, "minps", X86fmin, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VMAXPSZ : avx512_fp_packed<0x5F, "maxps", X86fmax, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 1>,
+                   EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VMINPDZ : avx512_fp_packed<0x5D, "minpd", X86fmin, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VMAXPDZ : avx512_fp_packed<0x5F, "maxpd", X86fmax, VR512, v8f64, f512mem,
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 1>,
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+defm VSUBPSZ : avx512_fp_packed<0x5C, "subps", fsub, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VDIVPSZ : avx512_fp_packed<0x5E, "divps", fdiv, VR512, v16f32, f512mem,
+                   memopv16f32, f32mem, loadf32, "{1to16}", SSEPackedSingle,
+                   SSE_ALU_ITINS_P.s, 0>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VSUBPDZ : avx512_fp_packed<0x5C, "subpd", fsub, VR512, v8f64, f512mem, 
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>, 
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem, 
+                   memopv8f64, f64mem, loadf64, "{1to8}", SSEPackedDouble,
+                   SSE_ALU_ITINS_P.d, 0>, 
+                   EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  VPTESTM instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC, 
+              RegisterClass RC, X86MemOperand x86memop, PatFrag memop_frag, 
+              SDNode OpNode, ValueType vt> {
+  def rr : AVX5128I<opc, MRMSrcReg,
+             (outs KRC:$dst), (ins RC:$src1, RC:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), (vt RC:$src2)))]>, EVEX_4V;
+  def rm : AVX5128I<opc, MRMSrcMem,
+             (outs KRC:$dst), (ins RC:$src1, x86memop:$src2), 
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set KRC:$dst, (OpNode (vt RC:$src1), 
+              (bitconvert (memop_frag addr:$src2))))]>, EVEX_4V;
+}
+
+defm VPTESTMDZ  : avx512_vptest<0x27, "vptestmd", VK16, VR512,  f512mem,
+                              memopv16i32, X86testm, v16i32>, EVEX_V512,
+                              EVEX_CD8<32, CD8VF>;
+defm VPTESTMQZ  : avx512_vptest<0x27, "vptestmq", VK8, VR512,  f512mem,
+                              memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
+                              EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Shift instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
+                         string OpcodeStr, SDNode OpNode, RegisterClass RC,
+                         ValueType vt, X86MemOperand x86memop, PatFrag mem_frag,
+                         RegisterClass KRC> {
+  def ri : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+       (ins RC:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, (i8 imm:$src2))))],
+        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+  def rik : AVX512BIi8<opc, ImmFormR, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+  def mi: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+       (ins x86memop:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (OpNode (mem_frag addr:$src1),
+                     (i8 imm:$src2)))], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+  def mik: AVX512BIi8<opc, ImmFormM, (outs RC:$dst),
+       (ins KRC:$mask, x86memop:$src1, i8imm:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                          RegisterClass RC, ValueType vt, ValueType SrcVT,
+                          PatFrag bc_frag, RegisterClass KRC> {
+  // src2 is always 128-bit
+  def rr : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, VR128X:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1, (SrcVT VR128X:$src2))))],
+        SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V;
+  def rrk : AVX512BI<opc, MRMSrcReg, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, VR128X:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rr>, EVEX_4V, EVEX_K;
+  def rm : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+       [(set RC:$dst, (vt (OpNode RC:$src1,
+                       (bc_frag (memopv2i64 addr:$src2)))))],
+                        SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V;
+  def rmk : AVX512BI<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src1, i128mem:$src2),
+           !strconcat(OpcodeStr,
+                "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}"),
+       [], SSE_INTSHIFT_ITINS_P.rm>, EVEX_4V, EVEX_K;
+}
+
+defm VPSRLDZ : avx512_shift_rmi<0x72, MRM2r, MRM2m, "vpsrld", X86vsrli,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRLDZ : avx512_shift_rrm<0xD2, "vpsrld", X86vsrl,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSRLQZ : avx512_shift_rmi<0x73, MRM2r, MRM2m, "vpsrlq", X86vsrli,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRLQZ : avx512_shift_rrm<0xD3, "vpsrlq", X86vsrl,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSLLDZ : avx512_shift_rmi<0x72, MRM6r, MRM6m, "vpslld", X86vshli,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VF>;
+defm VPSLLDZ : avx512_shift_rrm<0xF2, "vpslld", X86vshl,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSLLQZ : avx512_shift_rmi<0x73, MRM6r, MRM6m, "vpsllq", X86vshli,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSLLQZ : avx512_shift_rrm<0xF3, "vpsllq", X86vshl,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+defm VPSRADZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsrad", X86vsrai,
+                           VR512, v16i32, i512mem, memopv16i32, VK16WM>,
+                           EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VPSRADZ : avx512_shift_rrm<0xE2, "vpsrad", X86vsra,
+                           VR512, v16i32, v4i32, bc_v4i32, VK16WM>, EVEX_V512,
+                           EVEX_CD8<32, CD8VQ>;
+                           
+defm VPSRAQZ : avx512_shift_rmi<0x72, MRM4r, MRM4m, "vpsraq", X86vsrai,
+                           VR512, v8i64, i512mem, memopv8i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VF>, VEX_W;
+defm VPSRAQZ : avx512_shift_rrm<0xE2, "vpsraq", X86vsra,
+                           VR512, v8i64, v2i64, bc_v2i64, VK8WM>, EVEX_V512,
+                           EVEX_CD8<64, CD8VQ>, VEX_W;
+
+//===-------------------------------------------------------------------===//
+// Variable Bit Shifts
+//===-------------------------------------------------------------------===//
+multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           RegisterClass RC, ValueType vt,
+                           X86MemOperand x86memop, PatFrag mem_frag> {
+  def rr  : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+             (ins RC:$src1, RC:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (OpNode RC:$src1, (vt RC:$src2))))]>,
+             EVEX_4V;
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+             (ins RC:$src1, x86memop:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+             [(set RC:$dst,
+               (vt (OpNode RC:$src1, (mem_frag addr:$src2))))]>,
+             EVEX_4V;
+}
+
+defm VPSLLVDZ : avx512_var_shift<0x47, "vpsllvd", shl, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSLLVQZ : avx512_var_shift<0x47, "vpsllvq", shl, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+defm VPSRLVDZ : avx512_var_shift<0x45, "vpsrlvd", srl, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSRLVQZ : avx512_var_shift<0x45, "vpsrlvq", srl, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32, 
+                               i512mem, memopv16i32>, EVEX_V512,
+                               EVEX_CD8<32, CD8VF>;
+defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64, 
+                               i512mem, memopv8i64>, EVEX_V512, VEX_W,
+                               EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT, 
+                        X86MemOperand x86memop, PatFrag memop_frag> {
+def rr  : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
+def rm  : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set RC:$dst,
+                      (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
+}
+
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
+          (VMOVDDUPZrm addr:$src)>;
+
+//===---------------------------------------------------------------------===//
+// Replicate Single FP - MOVSHDUP and MOVSLDUP
+//===---------------------------------------------------------------------===//
+multiclass avx512_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
+                              ValueType vt, RegisterClass RC, PatFrag mem_frag,
+                              X86MemOperand x86memop> {
+  def rr : AVX512XSI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (vt (OpNode RC:$src)))]>, EVEX;
+  let mayLoad = 1 in
+  def rm : AVX512XSI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                      [(set RC:$dst, (OpNode (mem_frag addr:$src)))]>, EVEX;
+}
+
+defm VMOVSHDUPZ  : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
+                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       EVEX_CD8<32, CD8VF>;
+defm VMOVSLDUPZ  : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
+                       v16f32, VR512, memopv16f32, f512mem>, EVEX_V512,
+                       EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movshdup (memopv16i32 addr:$src))),
+           (VMOVSHDUPZrm addr:$src)>;
+def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>;
+def : Pat<(v16i32 (X86Movsldup (memopv16i32 addr:$src))),
+           (VMOVSLDUPZrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Move Low to High and High to Low packed FP Instructions
+//===----------------------------------------------------------------------===//
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+           IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+          (ins VR128X:$src1, VR128X:$src2),
+          "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+          [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+          IIC_SSE_MOV_LH>, EVEX_4V;
+
+let Predicates = [HasAVX512] in {
+  // MOVLHPS patterns
+  def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+  def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+            (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+  // MOVHLPS patterns
+  def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+            (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+}
+
+//===----------------------------------------------------------------------===//
+// FMA - Fused Multiply Operations
+//
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_rm<bits<8> opc, string OpcodeStr,
+            RegisterClass RC, X86MemOperand x86memop,
+            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+  def r: AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+          (ins RC:$src1, RC:$src2, RC:$src3),
+          !strconcat(OpcodeStr,"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          [(set RC:$dst, (OpVT(OpNode RC:$src1, RC:$src2, RC:$src3)))]>;
+
+  let mayLoad = 1 in
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+          (ins RC:$src1, RC:$src2, x86memop:$src3),
+          !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2,
+                                               (mem_frag addr:$src3))))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src2, x86scalar_mop:$src3),
+           !strconcat(OpcodeStr, "\t{${src3}", BrdcstStr, 
+            ", $src2, $dst|$dst, $src2, ${src3}", BrdcstStr, "}"),
+           [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+           (OpVT (X86VBroadcast (scalar_mfrag addr:$src3)))))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADD213PSZ    : avx512_fma3p_rm<0xA8, "vfmadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMSUB213PSZ    : avx512_fma3p_rm<0xAA, "vfmsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADDSUB213PSZ : avx512_fma3p_rm<0xA6, "vfmaddsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmaddsub, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUBADD213PSZ : avx512_fma3p_rm<0xA7, "vfmsubadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsubadd, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMADD213PSZ   : avx512_fma3p_rm<0xAC, "vfnmadd213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFNMSUB213PSZ   : avx512_fma3p_rm<0xAE, "vfnmsub213ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADD213PDZ    : avx512_fma3p_rm<0xA8, "vfmadd213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmadd, v8f64>, EVEX_V512,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUB213PDZ    : avx512_fma3p_rm<0xAA, "vfmsub213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMADDSUB213PDZ : avx512_fma3p_rm<0xA6, "vfmaddsub213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMSUBADD213PDZ : avx512_fma3p_rm<0xA7, "vfmsubadd213pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFNMADD213PDZ : avx512_fma3p_rm<0xAC, "vfnmadd213pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+  defm VFNMSUB213PDZ : avx512_fma3p_rm<0xAE, "vfnmsub213pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+}
+
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3p_m132<bits<8> opc, string OpcodeStr,
+            RegisterClass RC, X86MemOperand x86memop,
+            PatFrag mem_frag, X86MemOperand x86scalar_mop, PatFrag scalar_mfrag,
+            string BrdcstStr, SDNode OpNode, ValueType OpVT> {
+  let mayLoad = 1 in
+  def m: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+          (ins RC:$src1, RC:$src3, x86memop:$src2),
+          !strconcat(OpcodeStr, "\t{$src2, $src3, $dst|$dst, $src3, $src2}"),
+          [(set RC:$dst, (OpVT (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3)))]>;
+   def mb: AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+           (ins RC:$src1, RC:$src3, x86scalar_mop:$src2),
+           !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr, 
+            ", $src3, $dst|$dst, $src3, ${src2}", BrdcstStr, "}"),
+           [(set RC:$dst, (OpNode RC:$src1, 
+           (OpVT (X86VBroadcast (scalar_mfrag addr:$src2))), RC:$src3))]>, EVEX_B;
+}
+} // Constraints = "$src1 = $dst"
+
+
+let ExeDomain = SSEPackedSingle in {
+  defm VFMADD132PSZ    : avx512_fma3p_m132<0x98, "vfmadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMSUB132PSZ    : avx512_fma3p_m132<0x9A, "vfmsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFMADDSUB132PSZ : avx512_fma3p_m132<0x96, "vfmaddsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmaddsub, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFMSUBADD132PSZ : avx512_fma3p_m132<0x97, "vfmsubadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fmsubadd, v16f32>,
+                                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+  defm VFNMADD132PSZ   : avx512_fma3p_m132<0x9C, "vfnmadd132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmadd, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+  defm VFNMSUB132PSZ   : avx512_fma3p_m132<0x9E, "vfnmsub132ps", VR512, f512mem,
+                                    memopv16f32, f32mem, loadf32, "{1to16}",
+                                    X86Fnmsub, v16f32>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VF>;
+}
+let ExeDomain = SSEPackedDouble in {
+  defm VFMADD132PDZ    : avx512_fma3p_m132<0x98, "vfmadd132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmadd, v8f64>, EVEX_V512,
+                                    VEX_W, EVEX_CD8<64, CD8VF>;
+  defm VFMSUB132PDZ    : avx512_fma3p_m132<0x9A, "vfmsub132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMADDSUB132PDZ : avx512_fma3p_m132<0x96, "vfmaddsub132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmaddsub, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFMSUBADD132PDZ : avx512_fma3p_m132<0x97, "vfmsubadd132pd", VR512, f512mem,
+                                    memopv8f64, f64mem, loadf64, "{1to8}",
+                                    X86Fmsubadd, v8f64>, EVEX_V512, VEX_W,
+                                    EVEX_CD8<64, CD8VF>;
+  defm VFNMADD132PDZ : avx512_fma3p_m132<0x9C, "vfnmadd132pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmadd, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+  defm VFNMSUB132PDZ : avx512_fma3p_m132<0x9E, "vfnmsub132pd", VR512, f512mem,
+                                  memopv8f64, f64mem, loadf64, "{1to8}",
+                                  X86Fnmsub, v8f64>, EVEX_V512, VEX_W,
+                                  EVEX_CD8<64, CD8VF>;
+}
+
+// Scalar FMA
+let Constraints = "$src1 = $dst" in {
+multiclass avx512_fma3s_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, 
+                 RegisterClass RC, ValueType OpVT, 
+                 X86MemOperand x86memop, Operand memop, 
+                 PatFrag mem_frag> {
+  let isCommutable = 1 in
+  def r     : AVX512FMA3<opc, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, RC:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+  let mayLoad = 1 in
+  def m     : AVX512FMA3<opc, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, f128mem:$src3),
+                   !strconcat(OpcodeStr,
+                              "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set RC:$dst,
+                     (OpVT (OpNode RC:$src2, RC:$src1,
+                            (mem_frag addr:$src3))))]>;
+}
+
+} // Constraints = "$src1 = $dst"
+
+defm VFMADDSSZ  : avx512_fma3s_rm<0xA9, "vfmadd213ss{z}", X86Fmadd, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMADDSDZ  : avx512_fma3s_rm<0xA9, "vfmadd213sd{z}", X86Fmadd, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFMSUBSSZ  : avx512_fma3s_rm<0xAB, "vfmsub213ss{z}", X86Fmsub, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFMSUBSDZ  : avx512_fma3s_rm<0xAB, "vfmsub213sd{z}", X86Fmsub, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMADDSSZ  : avx512_fma3s_rm<0xAD, "vfnmadd213ss{z}", X86Fnmadd, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMADDSDZ  : avx512_fma3s_rm<0xAD, "vfnmadd213sd{z}", X86Fnmadd, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VFNMSUBSSZ  : avx512_fma3s_rm<0xAF, "vfnmsub213ss{z}", X86Fnmsub, FR32X, 
+                      f32, f32mem, ssmem, loadf32>, EVEX_CD8<32, CD8VT1>;
+defm VFNMSUBSDZ  : avx512_fma3s_rm<0xAF, "vfnmsub213sd{z}", X86Fnmsub, FR64X,
+                      f64, f64mem, sdmem, loadf64>, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vcvtsi<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          X86MemOperand x86memop, string asm> {
+let neverHasSideEffects = 1 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins DstRC:$src1, x86memop:$src),
+              !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
+              EVEX_4V;
+} // neverHasSideEffects = 1
+}
+let Predicates = [HasAVX512] in {
+defm VCVTSI2SSZ   : avx512_vcvtsi<0x2A, GR32, FR32X, i32mem, "cvtsi2ss{l}{z}">,
+                                  XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SSZ : avx512_vcvtsi<0x2A, GR64, FR32X, i64mem, "cvtsi2ss{q}{z}">,
+                                  XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTSI2SDZ   : avx512_vcvtsi<0x2A, GR32, FR64X, i32mem, "cvtsi2sd{l}{z}">,
+                                  XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTSI642SDZ : avx512_vcvtsi<0x2A, GR64, FR64X, i64mem, "cvtsi2sd{q}{z}">,
+                                  XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+          (VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+          (VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (sint_to_fp GR32:$src)),
+          (VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (sint_to_fp GR64:$src)),
+          (VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (sint_to_fp GR32:$src)),
+          (VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (sint_to_fp GR64:$src)),
+          (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+defm VCVTUSI2SSZ   : avx512_vcvtsi<0x7B, GR32, FR32X, i32mem, "cvtusi2ss{l}{z}">,
+                                  XS, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SSZ : avx512_vcvtsi<0x7B, GR64, FR32X, i64mem, "cvtusi2ss{q}{z}">,
+                                  XS, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+defm VCVTUSI2SDZ   : avx512_vcvtsi<0x7B, GR32, FR64X, i32mem, "cvtusi2sd{l}{z}">,
+                                  XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+defm VCVTUSI642SDZ : avx512_vcvtsi<0x7B, GR64, FR64X, i64mem, "cvtusi2sd{q}{z}">,
+                                  XD, VEX_W, VEX_LIG, EVEX_CD8<64, CD8VT1>;
+
+def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+          (VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+          (VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
+
+def : Pat<(f32 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f32 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+def : Pat<(f64 (uint_to_fp GR32:$src)),
+          (VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
+def : Pat<(f64 (uint_to_fp GR64:$src)),
+          (VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Scalar convert from float/double to integer
+//===----------------------------------------------------------------------===//
+multiclass avx512_cvt_s_int<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                          Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
+                          string asm> {
+let neverHasSideEffects = 1 in {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (Int SrcRC:$src))]>, EVEX, VEX_LIG;
+  let mayLoad = 1 in
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"), []>, EVEX, VEX_LIG;
+} // neverHasSideEffects = 1
+}
+let Predicates = [HasAVX512] in {
+// Convert float/double to signed/unsigned int 32/64
+defm VCVTSS2SIZ:    avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse_cvtss2si,
+                                   ssmem, sse_load_f32, "cvtss2si{z}">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z:  avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse_cvtss2si64,
+                                   ssmem, sse_load_f32, "cvtss2si{z}">,
+                                   XS, VEX_W, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USIZ:   avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtss2usi,
+                                   ssmem, sse_load_f32, "cvtss2usi{z}">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+                                   int_x86_avx512_cvtss2usi64, ssmem,
+                                   sse_load_f32, "cvtss2usi{z}">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ:    avx512_cvt_s_int<0x2D, VR128X, GR32, int_x86_sse2_cvtsd2si,
+                                   sdmem, sse_load_f64, "cvtsd2si{z}">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z:  avx512_cvt_s_int<0x2D, VR128X, GR64, int_x86_sse2_cvtsd2si64,
+                                   sdmem, sse_load_f64, "cvtsd2si{z}">,
+                                   XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USIZ:   avx512_cvt_s_int<0x79, VR128X, GR32, int_x86_avx512_cvtsd2usi,
+                                   sdmem, sse_load_f64, "cvtsd2usi{z}">,
+                                   XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2USI64Z: avx512_cvt_s_int<0x79, VR128X, GR64,
+                                   int_x86_avx512_cvtsd2usi64, sdmem,
+                                   sse_load_f64, "cvtsd2usi{z}">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+
+defm Int_VCVTSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+defm Int_VCVTSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+defm Int_VCVTSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+defm Int_VCVTSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+defm Int_VCVTUSI2SSZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_avx512_cvtusi2ss, i32mem, loadi32, "cvtusi2ss{l}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V;
+defm Int_VCVTUSI2SS64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_avx512_cvtusi642ss, i64mem, loadi64, "cvtusi2ss{q}{z}",
+          SSE_CVT_Scalar, 0>, XS, EVEX_4V, VEX_W;
+defm Int_VCVTUSI2SDZ : sse12_cvt_sint_3addr<0x2A, GR32, VR128X,
+          int_x86_avx512_cvtusi2sd, i32mem, loadi32, "cvtusi2sd{l}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V;
+defm Int_VCVTUSI2SD64Z : sse12_cvt_sint_3addr<0x2A, GR64, VR128X,
+          int_x86_avx512_cvtusi642sd, i64mem, loadi64, "cvtusi2sd{q}{z}",
+          SSE_CVT_Scalar, 0>, XD, EVEX_4V, VEX_W;
+
+// Convert float/double to signed/unsigned int 32/64 with truncation
+defm Int_VCVTTSS2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse_cvttss2si,
+                                   ssmem, sse_load_f32, "cvttss2si{z}">,
+                                   XS, EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSS2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+                                   int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+                                   "cvttss2si{z}">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSD2SIZ : avx512_cvt_s_int<0x2C, VR128X, GR32, int_x86_sse2_cvttsd2si,
+                                   sdmem, sse_load_f64, "cvttsd2si{z}">, XD,
+                                   EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSD2SI64Z : avx512_cvt_s_int<0x2C, VR128X, GR64,
+                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+                                   "cvttsd2si{z}">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSS2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+                                   int_x86_avx512_cvttss2usi, ssmem, sse_load_f32,
+                                   "cvttss2si{z}">, XS, EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSS2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+                                   int_x86_avx512_cvttss2usi64, ssmem,
+                                   sse_load_f32, "cvttss2usi{z}">, XS, VEX_W,
+                                   EVEX_CD8<32, CD8VT1>;
+defm Int_VCVTTSD2USIZ : avx512_cvt_s_int<0x78, VR128X, GR32,
+                                   int_x86_avx512_cvttsd2usi,
+                                   sdmem, sse_load_f64, "cvttsd2usi{z}">, XD,
+                                   EVEX_CD8<64, CD8VT1>;
+defm Int_VCVTTSD2USI64Z : avx512_cvt_s_int<0x78, VR128X, GR64,
+                                   int_x86_avx512_cvttsd2usi64, sdmem,
+                                   sse_load_f64, "cvttsd2usi{z}">, XD, VEX_W,
+                                   EVEX_CD8<64, CD8VT1>;
+}
+
+multiclass avx512_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
+                         SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
+                         string asm> {
+  def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpNode SrcRC:$src))]>, EVEX;
+  def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>, EVEX;
+}
+
+defm VCVTTSS2SIZ    : avx512_cvt_s<0x2C, FR32X, GR32, fp_to_sint, f32mem,
+                                  loadf32, "cvttss2si{z}">, XS,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USIZ   : avx512_cvt_s<0x78, FR32X, GR32, fp_to_uint, f32mem,
+                                  loadf32, "cvttss2usi{z}">, XS,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2SI64Z  : avx512_cvt_s<0x2C, FR32X, GR64, fp_to_sint, f32mem,
+                                  loadf32, "cvttss2si{z}">, XS, VEX_W,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSS2USI64Z : avx512_cvt_s<0x78, FR32X, GR64, fp_to_uint, f32mem,
+                                  loadf32, "cvttss2usi{z}">, XS, VEX_W,
+                                  EVEX_CD8<32, CD8VT1>;
+defm VCVTTSD2SIZ    : avx512_cvt_s<0x2C, FR64X, GR32, fp_to_sint, f64mem,
+                                  loadf64, "cvttsd2si{z}">, XD,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USIZ   : avx512_cvt_s<0x78, FR64X, GR32, fp_to_uint, f64mem,
+                                  loadf64, "cvttsd2usi{z}">, XD,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2SI64Z  : avx512_cvt_s<0x2C, FR64X, GR64, fp_to_sint, f64mem,
+                                  loadf64, "cvttsd2si{z}">, XD, VEX_W,
+                                  EVEX_CD8<64, CD8VT1>;
+defm VCVTTSD2USI64Z : avx512_cvt_s<0x78, FR64X, GR64, fp_to_uint, f64mem,
+                                  loadf64, "cvttsd2usi{z}">, XD, VEX_W,
+                                  EVEX_CD8<64, CD8VT1>;
+//===----------------------------------------------------------------------===//
+// AVX-512  Convert form float to double and back
+//===----------------------------------------------------------------------===//
+let neverHasSideEffects = 1 in {
+def VCVTSS2SDZrr : AVX512XSI<0x5A, MRMSrcReg, (outs FR64X:$dst),
+                    (ins FR32X:$src1, FR32X:$src2),
+                    "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSS2SDZrm : AVX512XSI<0x5A, MRMSrcMem, (outs FR64X:$dst),
+                    (ins FR32X:$src1, f32mem:$src2),
+                    "vcvtss2sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    []>, EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+                    EVEX_CD8<32, CD8VT1>;
+
+// Convert scalar double to scalar single
+def VCVTSD2SSZrr  : AVX512XDI<0x5A, MRMSrcReg, (outs FR32X:$dst),
+                      (ins FR64X:$src1, FR64X:$src2),
+                      "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      []>, EVEX_4V, VEX_LIG, VEX_W, Sched<[WriteCvtF2F]>;
+let mayLoad = 1 in
+def VCVTSD2SSZrm  : AVX512XDI<0x5A, MRMSrcMem, (outs FR32X:$dst),
+                      (ins FR64X:$src1, f64mem:$src2),
+                      "vcvtsd2ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      []>, EVEX_4V, VEX_LIG, VEX_W,
+                      Sched<[WriteCvtF2FLd, ReadAfterLd]>, EVEX_CD8<64, CD8VT1>;
+}
+
+def : Pat<(f64 (fextend FR32X:$src)), (VCVTSS2SDZrr FR32X:$src, FR32X:$src)>,
+      Requires<[HasAVX512]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+    (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDZrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+      Requires<[HasAVX512, OptForSize]>;
+
+def : Pat<(extloadf32 addr:$src),
+    (VCVTSS2SDZrr (f32 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
+    Requires<[HasAVX512, OptForSpeed]>;
+
+def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>,
+           Requires<[HasAVX512]>;
+
+multiclass avx512_vcvt_fp<bits<8> opc, string asm, RegisterClass SrcRC, 
+               RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, 
+               X86MemOperand x86memop, ValueType OpVT, ValueType InVT,
+               Domain d> {
+let neverHasSideEffects = 1 in {
+  def rr : AVX512PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"), 
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT SrcRC:$src))))], d>, EVEX;
+  let mayLoad = 1 in
+  def rm : AVX512PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src),
+              !strconcat(asm,"\t{$src, $dst|$dst, $src}"), 
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))], d>, EVEX;
+} // neverHasSideEffects = 1
+}
+
+defm VCVTPD2PSZ : avx512_vcvt_fp<0x5A, "vcvtpd2ps", VR512, VR256X, fround,
+                                memopv8f64, f512mem, v8f32, v8f64,
+                                SSEPackedSingle>, EVEX_V512, VEX_W, OpSize,
+                                EVEX_CD8<64, CD8VF>;
+
+defm VCVTPS2PDZ : avx512_vcvt_fp<0x5A, "vcvtps2pd", VR256X, VR512, fextend,
+                                memopv4f64, f256mem, v8f64, v8f32,
+                                SSEPackedDouble>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512  Vector convert from sign integer to float/double
+//===----------------------------------------------------------------------===//
+
+defm VCVTDQ2PSZ : avx512_vcvt_fp<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp,
+                                memopv8i64, i512mem, v16f32, v16i32,
+                                SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, sint_to_fp,
+                                memopv4i64, i256mem, v8f64, v8i32,
+                                SSEPackedDouble>, EVEX_V512, XS,
+                                EVEX_CD8<32, CD8VH>;
+
+defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint,
+                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 SSEPackedSingle>, EVEX_V512, XS,
+                                 EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint,
+                                 memopv8f64, f512mem, v8i32, v8f64, 
+                                 SSEPackedDouble>, EVEX_V512, OpSize, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+
+defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint,
+                                 memopv16f32, f512mem, v16i32, v16f32,
+                                 SSEPackedSingle>, EVEX_V512, 
+                                 EVEX_CD8<32, CD8VF>;
+
+defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint,
+                                 memopv8f64, f512mem, v8i32, v8f64,
+                                 SSEPackedDouble>, EVEX_V512, VEX_W,
+                                 EVEX_CD8<64, CD8VF>;
+                                 
+defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp,
+                                 memopv4i64, f256mem, v8f64, v8i32,
+                                 SSEPackedDouble>, EVEX_V512, XS,
+                                 EVEX_CD8<32, CD8VH>;
+                                 
+defm VCVTUDQ2PSZ : avx512_vcvt_fp<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp,
+                                 memopv16i32, f512mem, v16f32, v16i32,
+                                 SSEPackedSingle>, EVEX_V512, XD,
+                                 EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr 
+           (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+                                 
+
+def : Pat<(int_x86_avx512_cvtdq2_ps_512 VR512:$src),
+          (VCVTDQ2PSZrr VR512:$src)>;
+def : Pat<(int_x86_avx512_cvtdq2_ps_512 (bitconvert (memopv8i64 addr:$src))),
+          (VCVTDQ2PSZrm addr:$src)>;
+
+def VCVTPS2DQZrr : AVX512BI<0x5B, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                        "vcvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR512:$dst,
+                          (int_x86_avx512_cvt_ps2dq_512 VR512:$src))],
+                        IIC_SSE_CVT_PS_RR>, EVEX, EVEX_V512;
+def VCVTPS2DQZrm : AVX512BI<0x5B, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                        "vcvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR512:$dst,
+                          (int_x86_avx512_cvt_ps2dq_512 (memopv16f32 addr:$src)))],
+                        IIC_SSE_CVT_PS_RM>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(v8f32 (fround (loadv8f64 addr:$src))),
+            (VCVTPD2PSZrm addr:$src)>;
+  def : Pat<(v8f64 (extloadv8f32 addr:$src)),
+            (VCVTPS2PDZrm addr:$src)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Half precision conversion instructions
+//===----------------------------------------------------------------------===//
+multiclass avx512_f16c_ph2ps<RegisterClass destRC, RegisterClass srcRC,
+                             X86MemOperand x86memop, Intrinsic Int> {
+  def rr : AVX5128I<0x13, MRMSrcReg, (outs destRC:$dst), (ins srcRC:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}",
+             [(set destRC:$dst, (Int srcRC:$src))]>, EVEX;
+  let neverHasSideEffects = 1, mayLoad = 1 in
+  def rm : AVX5128I<0x13, MRMSrcMem, (outs destRC:$dst), (ins x86memop:$src),
+             "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, EVEX;
+}
+
+multiclass avx512_f16c_ps2ph<RegisterClass destRC, RegisterClass srcRC,
+                             X86MemOperand x86memop, Intrinsic Int> {
+  def rr : AVX512AIi8<0x1D, MRMDestReg, (outs destRC:$dst),
+               (ins srcRC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+               [(set destRC:$dst, (Int srcRC:$src1, imm:$src2))]>, EVEX;
+  let neverHasSideEffects = 1, mayStore = 1 in
+  def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+               (ins x86memop:$dst, srcRC:$src1, i32i8imm:$src2),
+               "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, EVEX;
+}
+
+defm VCVTPH2PSZ : avx512_f16c_ph2ps<VR512, VR256X, f256mem,
+                                    int_x86_avx512_vcvtph2ps_512>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VH>;
+defm VCVTPS2PHZ : avx512_f16c_ps2ph<VR256X, VR512, f256mem,
+                                    int_x86_avx512_vcvtps2ph_512>, EVEX_V512,
+                                    EVEX_CD8<32, CD8VH>;
+
+let Defs = [EFLAGS], Predicates = [HasAVX512] in {
+  defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
+                                 "ucomiss{z}">, TB, EVEX, VEX_LIG,
+                                 EVEX_CD8<32, CD8VT1>;
+  defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
+                                  "ucomisd{z}">, TB, OpSize, EVEX,
+                                  VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  let Pattern = []<dag> in {
+    defm VCOMISSZ  : sse12_ord_cmp<0x2F, VR128X, undef, v4f32, f128mem, load,
+                                   "comiss{z}">, TB, EVEX, VEX_LIG,
+                                   EVEX_CD8<32, CD8VT1>;
+    defm VCOMISDZ  : sse12_ord_cmp<0x2F, VR128X, undef, v2f64, f128mem, load,
+                                   "comisd{z}">, TB, OpSize, EVEX,
+                                    VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+  defm Int_VUCOMISSZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v4f32, f128mem,
+                            load, "ucomiss">, TB, EVEX, VEX_LIG,
+                            EVEX_CD8<32, CD8VT1>;
+  defm Int_VUCOMISDZ  : sse12_ord_cmp<0x2E, VR128X, X86ucomi, v2f64, f128mem,
+                            load, "ucomisd">, TB, OpSize, EVEX,
+                            VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+  defm Int_VCOMISSZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v4f32, f128mem,
+                            load, "comiss">, TB, EVEX, VEX_LIG,
+                            EVEX_CD8<32, CD8VT1>;
+  defm Int_VCOMISDZ  : sse12_ord_cmp<0x2F, VR128X, X86comi, v2f64, f128mem,
+                            load, "comisd">, TB, OpSize, EVEX,
+                            VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+}
+  
+/// avx512_unop_p - AVX-512 unops in packed form.
+multiclass avx512_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+  def PSZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                        !strconcat(OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))]>,
+                        EVEX, EVEX_V512;
+  def PSZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f256mem:$src),
+                        !strconcat(OpcodeStr,
+                                   "ps\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
+                        EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def PDZr : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                        !strconcat(OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))]>,
+                        EVEX, EVEX_V512, VEX_W;
+  def PDZm : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                        !strconcat(OpcodeStr,
+                                   "pd\t{$src, $dst|$dst, $src}"),
+                        [(set VR512:$dst, (OpNode (memopv16f32 addr:$src)))]>,
+                        EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+/// avx512_fp_unop_p_int - AVX-512 intrinsics unops in packed forms.
+multiclass avx512_fp_unop_p_int<bits<8> opc, string OpcodeStr,
+                              Intrinsic V16F32Int, Intrinsic V8F64Int> {
+  def PSZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr,
+                                      "ps\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V16F32Int VR512:$src))]>, 
+                           EVEX, EVEX_V512;
+  def PSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                         !strconcat(OpcodeStr,
+                         "ps\t{$src, $dst|$dst, $src}"),
+                         [(set VR512:$dst, 
+                           (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
+                         EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def PDZr_Int : AVX5128I<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr,
+                                      "pd\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V8F64Int VR512:$src))]>, 
+                           EVEX, EVEX_V512, VEX_W;
+  def PDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                         !strconcat(OpcodeStr,
+                         "pd\t{$src, $dst|$dst, $src}"),
+                         [(set VR512:$dst, 
+                           (V8F64Int (memopv8f64 addr:$src)))]>,
+                            EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+/// avx512_fp_unop_s - AVX-512 unops in scalar form.
+multiclass avx512_fp_unop_s<bits<8> opc, string OpcodeStr> {
+  let hasSideEffects = 0 in {
+  def SSZr : AVX5128I<opc, MRMSrcReg, (outs FR32X:$dst),
+               (ins FR32X:$src1, FR32X:$src2),
+               !strconcat(OpcodeStr,
+                          "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, EVEX_4V;
+  let mayLoad = 1 in {
+  def SSZm : AVX5128I<opc, MRMSrcMem, (outs FR32X:$dst),
+               (ins FR32X:$src1, f32mem:$src2),
+               !strconcat(OpcodeStr,
+                          "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  def SSZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
+                   (ins VR128X:$src1, ssmem:$src2),
+                   !strconcat(OpcodeStr,
+                              "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  }
+  def SDZr : AVX5128I<opc, MRMSrcReg, (outs FR64X:$dst),
+               (ins FR64X:$src1, FR64X:$src2),
+               !strconcat(OpcodeStr,
+                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+                      EVEX_4V, VEX_W;
+  let mayLoad = 1 in {
+  def SDZm : AVX5128I<opc, MRMSrcMem, (outs FR64X:$dst),
+               (ins FR64X:$src1, f64mem:$src2),
+               !strconcat(OpcodeStr,
+                          "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+               EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  def SDZm_Int : AVX5128I<opc, MRMSrcMem, (outs VR128X:$dst),
+                  (ins VR128X:$src1, sdmem:$src2),
+                   !strconcat(OpcodeStr,
+                              "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  []>, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+}
+}
+
+defm VRCP14   : avx512_fp_unop_s<0x4D, "vrcp14">,
+                avx512_fp_unop_p<0x4C, "vrcp14", X86frcp>,
+                avx512_fp_unop_p_int<0x4C, "vrcp14", 
+                    int_x86_avx512_rcp14_ps_512, int_x86_avx512_rcp14_pd_512>;
+
+defm VRSQRT14  : avx512_fp_unop_s<0x4F, "vrsqrt14">,
+                avx512_fp_unop_p<0x4E, "vrsqrt14", X86frsqrt>,
+                avx512_fp_unop_p_int<0x4E, "vrsqrt14", 
+                    int_x86_avx512_rsqrt14_ps_512, int_x86_avx512_rsqrt14_pd_512>;
+
+def : Pat<(int_x86_avx512_rsqrt14_ss VR128X:$src),
+          (COPY_TO_REGCLASS (VRSQRT14SSZr (f32 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                            VR128X)>;
+def : Pat<(int_x86_avx512_rsqrt14_ss sse_load_f32:$src),
+          (VRSQRT14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+def : Pat<(int_x86_avx512_rcp14_ss VR128X:$src),
+          (COPY_TO_REGCLASS (VRCP14SSZr (f32 (IMPLICIT_DEF)),
+                                      (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                            VR128X)>;
+def : Pat<(int_x86_avx512_rcp14_ss sse_load_f32:$src),
+          (VRCP14SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+let AddedComplexity = 20, Predicates = [HasERI] in {
+defm VRCP28   : avx512_fp_unop_s<0xCB, "vrcp28">,
+                avx512_fp_unop_p<0xCA, "vrcp28", X86frcp>,
+                avx512_fp_unop_p_int<0xCA, "vrcp28",
+                    int_x86_avx512_rcp28_ps_512, int_x86_avx512_rcp28_pd_512>;
+
+defm VRSQRT28  : avx512_fp_unop_s<0xCD, "vrsqrt28">,
+                avx512_fp_unop_p<0xCC, "vrsqrt28", X86frsqrt>,
+                avx512_fp_unop_p_int<0xCC, "vrsqrt28",
+                    int_x86_avx512_rsqrt28_ps_512, int_x86_avx512_rsqrt28_pd_512>;
+}
+
+let Predicates = [HasERI] in {
+  def : Pat<(int_x86_avx512_rsqrt28_ss VR128X:$src),
+            (COPY_TO_REGCLASS (VRSQRT28SSZr (f32 (IMPLICIT_DEF)),
+                                         (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                              VR128X)>;
+  def : Pat<(int_x86_avx512_rsqrt28_ss sse_load_f32:$src),
+            (VRSQRT28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_avx512_rcp28_ss VR128X:$src),
+            (COPY_TO_REGCLASS (VRCP28SSZr (f32 (IMPLICIT_DEF)),
+                                       (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                              VR128X)>;
+  def : Pat<(int_x86_avx512_rcp28_ss sse_load_f32:$src),
+            (VRCP28SSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+}
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              Intrinsic V16F32Int, Intrinsic V8F64Int,
+                              OpndItins itins_s, OpndItins itins_d> {
+  def PSZrr :AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+             !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+             [(set VR512:$dst, (v16f32 (OpNode VR512:$src)))], itins_s.rr>,
+             EVEX, EVEX_V512;
+
+  let mayLoad = 1 in
+  def PSZrm : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+              [(set VR512:$dst, 
+                (OpNode (v16f32 (bitconvert (memopv16f32 addr:$src)))))],
+              itins_s.rm>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+  def PDZrr : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+              [(set VR512:$dst, (v8f64 (OpNode VR512:$src)))], itins_d.rr>,
+              EVEX, EVEX_V512;
+
+  let mayLoad = 1 in
+    def PDZrm : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                [(set VR512:$dst, (OpNode
+                  (v8f64 (bitconvert (memopv16f32 addr:$src)))))],
+                itins_d.rm>, EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+  def PSZr_Int : AVX512PSI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr,
+                                      "ps\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V16F32Int VR512:$src))]>, 
+                           EVEX, EVEX_V512;
+  def PSZm_Int : AVX512PSI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                          !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                          [(set VR512:$dst, 
+                           (V16F32Int (memopv16f32 addr:$src)))]>, EVEX,
+                          EVEX_V512, EVEX_CD8<32, CD8VF>;
+  def PDZr_Int : AVX512PDI<opc, MRMSrcReg, (outs VR512:$dst), (ins VR512:$src),
+                           !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                           [(set VR512:$dst, (V8F64Int VR512:$src))]>, 
+                           EVEX, EVEX_V512, VEX_W;
+  def PDZm_Int : AVX512PDI<opc, MRMSrcMem, (outs VR512:$dst), (ins f512mem:$src),
+                         !strconcat(OpcodeStr,
+                         "pd\t{$src, $dst|$dst, $src}"),
+                         [(set VR512:$dst, (V8F64Int (memopv8f64 addr:$src)))]>,
+                         EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,
+                          Intrinsic F32Int, Intrinsic F64Int,
+                          OpndItins itins_s, OpndItins itins_d> {
+  def SSZr : SI<opc, MRMSrcReg, (outs FR32X:$dst),
+               (ins FR32X:$src1, FR32X:$src2),
+               !strconcat(OpcodeStr,
+                          "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [], itins_s.rr>, XS, EVEX_4V;
+  def SSZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+               (ins VR128X:$src1, VR128X:$src2),
+               !strconcat(OpcodeStr,
+                "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set VR128X:$dst, 
+                 (F32Int VR128X:$src1, VR128X:$src2))],
+               itins_s.rr>, XS, EVEX_4V;
+  let mayLoad = 1 in {
+  def SSZm : SI<opc, MRMSrcMem, (outs FR32X:$dst),
+               (ins FR32X:$src1, f32mem:$src2),
+               !strconcat(OpcodeStr,
+                          "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                      [], itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  def SSZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+                   (ins VR128X:$src1, ssmem:$src2),
+                   !strconcat(OpcodeStr,
+                 "ss{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   [(set VR128X:$dst, 
+                     (F32Int VR128X:$src1, sse_load_f32:$src2))],
+                   itins_s.rm>, XS, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+  }
+  def SDZr : SI<opc, MRMSrcReg, (outs FR64X:$dst),
+               (ins FR64X:$src1, FR64X:$src2),
+               !strconcat(OpcodeStr,
+                          "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+                      XD, EVEX_4V, VEX_W;
+  def SDZr_Int : SIi8<opc, MRMSrcReg, (outs VR128X:$dst),
+               (ins VR128X:$src1, VR128X:$src2),
+               !strconcat(OpcodeStr,
+                "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+               [(set VR128X:$dst, 
+                 (F64Int VR128X:$src1, VR128X:$src2))],
+               itins_s.rr>, XD, EVEX_4V, VEX_W;
+  let mayLoad = 1 in {
+  def SDZm : SI<opc, MRMSrcMem, (outs FR64X:$dst),
+               (ins FR64X:$src1, f64mem:$src2),
+               !strconcat(OpcodeStr,
+                  "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, 
+               XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  def SDZm_Int : SIi8<opc, MRMSrcMem, (outs VR128X:$dst),
+                  (ins VR128X:$src1, sdmem:$src2),
+                   !strconcat(OpcodeStr,
+                  "sd{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                  [(set VR128X:$dst, 
+                    (F64Int VR128X:$src1, sse_load_f64:$src2))]>, 
+                  XD, EVEX_4V, VEX_W, EVEX_CD8<64, CD8VT1>;
+  }
+}
+
+
+defm VSQRT  : avx512_sqrt_scalar<0x51, "sqrt", 
+                int_x86_avx512_sqrt_ss, int_x86_avx512_sqrt_sd, 
+                SSE_SQRTSS, SSE_SQRTSD>,
+              avx512_sqrt_packed<0x51, "vsqrt", fsqrt,
+                int_x86_avx512_sqrt_ps_512, int_x86_avx512_sqrt_pd_512,
+                SSE_SQRTPS, SSE_SQRTPD>;
+
+let Predicates = [HasAVX512] in {
+  def : Pat<(f32 (fsqrt FR32X:$src)),
+            (VSQRTSSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (fsqrt (load addr:$src))),
+            (VSQRTSSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+  def : Pat<(f64 (fsqrt FR64X:$src)),
+            (VSQRTSDZr (f64 (IMPLICIT_DEF)), FR64X:$src)>;
+  def : Pat<(f64 (fsqrt (load addr:$src))),
+            (VSQRTSDZm (f64 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(f32 (X86frsqrt FR32X:$src)),
+            (VRSQRT14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (X86frsqrt (load addr:$src))),
+            (VRSQRT14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(f32 (X86frcp FR32X:$src)),
+            (VRCP14SSZr (f32 (IMPLICIT_DEF)), FR32X:$src)>;
+  def : Pat<(f32 (X86frcp (load addr:$src))),
+            (VRCP14SSZm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[OptForSize]>;
+
+  def : Pat<(int_x86_sse_sqrt_ss VR128X:$src),
+            (COPY_TO_REGCLASS (VSQRTSSZr (f32 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR32)),
+                              VR128X)>;
+  def : Pat<(int_x86_sse_sqrt_ss sse_load_f32:$src),
+            (VSQRTSSZm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
+
+  def : Pat<(int_x86_sse2_sqrt_sd VR128X:$src),
+            (COPY_TO_REGCLASS (VSQRTSDZr (f64 (IMPLICIT_DEF)),
+                                        (COPY_TO_REGCLASS VR128X:$src, FR64)),
+                              VR128X)>;
+  def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
+            (VSQRTSDZm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
+
+
+multiclass avx512_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
+                            X86MemOperand x86memop, RegisterClass RC,
+                            PatFrag mem_frag32, PatFrag mem_frag64,
+                            Intrinsic V4F32Int, Intrinsic V2F64Int,
+                            CD8VForm VForm> {
+let ExeDomain = SSEPackedSingle in {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def PSr : AVX512AIi8<opcps, MRMSrcReg,
+                    (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>;
+
+  // Vector intrinsic operation, mem
+  def PSm : AVX512AIi8<opcps, MRMSrcMem,
+                    (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set RC:$dst,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+                    EVEX_CD8<32, VForm>;
+} // ExeDomain = SSEPackedSingle
+
+let ExeDomain = SSEPackedDouble in {
+  // Vector intrinsic operation, reg
+  def PDr : AVX512AIi8<opcpd, MRMSrcReg,
+                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>;
+
+  // Vector intrinsic operation, mem
+  def PDm : AVX512AIi8<opcpd, MRMSrcMem,
+                     (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2),
+                     !strconcat(OpcodeStr,
+                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                     [(set RC:$dst,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+                     EVEX_CD8<64, VForm>;
+} // ExeDomain = SSEPackedDouble
+}
+
+multiclass avx512_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic F64Int> {
+let ExeDomain = GenericDomain in {
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SSr : AVX512AIi8<opcss, MRMSrcReg,
+      (outs FR32X:$dst), (ins FR32X:$src1, FR32X:$src2, i32i8imm:$src3),
+      !strconcat(OpcodeStr,
+              "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+      []>;
+
+  // Intrinsic operation, reg.
+  def SSr_Int : AVX512AIi8<opcss, MRMSrcReg,
+        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst, (F32Int VR128X:$src1, VR128X:$src2, imm:$src3))]>;
+
+  // Intrinsic operation, mem.
+  def SSm : AVX512AIi8<opcss, MRMSrcMem, (outs VR128X:$dst),
+                     (ins VR128X:$src1, ssmem:$src2, i32i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                   "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     [(set VR128X:$dst, (F32Int VR128X:$src1, 
+                                         sse_load_f32:$src2, imm:$src3))]>,
+                     EVEX_CD8<32, CD8VT1>;
+
+  // Operation, reg.
+  let hasSideEffects = 0 in
+  def SDr : AVX512AIi8<opcsd, MRMSrcReg,
+        (outs FR64X:$dst), (ins FR64X:$src1, FR64X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        []>, VEX_W;
+
+  // Intrinsic operation, reg.
+  def SDr_Int : AVX512AIi8<opcsd, MRMSrcReg,
+        (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst, (F64Int VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+        VEX_W;
+
+  // Intrinsic operation, mem.
+  def SDm : AVX512AIi8<opcsd, MRMSrcMem,
+        (outs VR128X:$dst), (ins VR128X:$src1, sdmem:$src2, i32i8imm:$src3),
+        !strconcat(OpcodeStr,
+                "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+        [(set VR128X:$dst,
+              (F64Int VR128X:$src1, sse_load_f64:$src2, imm:$src3))]>,
+        VEX_W, EVEX_CD8<64, CD8VT1>;
+} // ExeDomain = GenericDomain
+}
+
+let Predicates = [HasAVX512] in {
+  defm VRNDSCALE  : avx512_fp_binop_rm<0x0A, 0x0B, "vrndscale",
+                              int_x86_avx512_rndscale_ss,
+                              int_x86_avx512_rndscale_sd>, EVEX_4V;
+
+  defm VRNDSCALEZ : avx512_fp_unop_rm<0x08, 0x09, "vrndscale", f256mem, VR512,
+                                  memopv16f32, memopv8f64,
+                                  int_x86_avx512_rndscale_ps_512,
+                                  int_x86_avx512_rndscale_pd_512, CD8VF>, 
+                                  EVEX, EVEX_V512;
+}
+
+def : Pat<(ffloor FR32X:$src),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x1))>;
+def : Pat<(f64 (ffloor FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x1))>;
+def : Pat<(f32 (fnearbyint FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0xC))>;
+def : Pat<(f64 (fnearbyint FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0xC))>;
+def : Pat<(f32 (fceil FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x2))>;
+def : Pat<(f64 (fceil FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x2))>;
+def : Pat<(f32 (frint FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x4))>;
+def : Pat<(f64 (frint FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x4))>;
+def : Pat<(f32 (ftrunc FR32X:$src)),
+          (VRNDSCALESSr (f32 (IMPLICIT_DEF)), FR32X:$src, (i32 0x3))>;
+def : Pat<(f64 (ftrunc FR64X:$src)),
+          (VRNDSCALESDr (f64 (IMPLICIT_DEF)), FR64X:$src, (i32 0x3))>;
+
+def : Pat<(v16f32 (ffloor VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x1))>;
+def : Pat<(v16f32 (fnearbyint VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0xC))>;
+def : Pat<(v16f32 (fceil VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x2))>;
+def : Pat<(v16f32 (frint VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x4))>;
+def : Pat<(v16f32 (ftrunc VR512:$src)),
+          (VRNDSCALEZPSr VR512:$src, (i32 0x3))>;
+
+def : Pat<(v8f64 (ffloor VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x1))>;
+def : Pat<(v8f64 (fnearbyint VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0xC))>;
+def : Pat<(v8f64 (fceil VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x2))>;
+def : Pat<(v8f64 (frint VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x4))>;
+def : Pat<(v8f64 (ftrunc VR512:$src)),
+          (VRNDSCALEZPDr VR512:$src, (i32 0x3))>;
+
+//-------------------------------------------------
+// Integer truncate and extend operations
+//-------------------------------------------------
+
+multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
+                          RegisterClass dstRC, RegisterClass srcRC,
+                          RegisterClass KRC, X86MemOperand x86memop> {
+  def rr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins srcRC:$src),
+               !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+               []>, EVEX;
+
+  def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins KRC:$mask, srcRC:$src),
+               !strconcat(OpcodeStr,
+                 "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+               []>, EVEX, EVEX_KZ;
+
+  def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
+               !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+               []>, EVEX;
+}
+defm VPMOVQB    : avx512_trunc_sat<0x32, "vpmovqb",   VR128X, VR512, VK8WM, 
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVSQB   : avx512_trunc_sat<0x22, "vpmovsqb",  VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVUSQB  : avx512_trunc_sat<0x12, "vpmovusqb", VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
+defm VPMOVQW    : avx512_trunc_sat<0x34, "vpmovqw",   VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVSQW   : avx512_trunc_sat<0x24, "vpmovsqw",  VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVUSQW  : avx512_trunc_sat<0x14, "vpmovusqw", VR128X, VR512, VK8WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<16, CD8VQ>;
+defm VPMOVQD    : avx512_trunc_sat<0x35, "vpmovqd",   VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVSQD   : avx512_trunc_sat<0x25, "vpmovsqd",  VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVUSQD  : avx512_trunc_sat<0x15, "vpmovusqd", VR256X, VR512, VK8WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<32, CD8VH>;
+defm VPMOVDW    : avx512_trunc_sat<0x33, "vpmovdw",   VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVSDW   : avx512_trunc_sat<0x23, "vpmovsdw",  VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVUSDW  : avx512_trunc_sat<0x13, "vpmovusdw", VR256X, VR512, VK16WM,
+                                 i256mem>, EVEX_V512, EVEX_CD8<16, CD8VH>;
+defm VPMOVDB    : avx512_trunc_sat<0x31, "vpmovdb",   VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVSDB   : avx512_trunc_sat<0x21, "vpmovsdb",  VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+defm VPMOVUSDB  : avx512_trunc_sat<0x11, "vpmovusdb", VR128X, VR512, VK16WM,
+                                 i128mem>, EVEX_V512, EVEX_CD8<8, CD8VQ>;
+
+def : Pat<(v16i8  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQBrr  VR512:$src)>;
+def : Pat<(v8i16  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQWrr  VR512:$src)>;
+def : Pat<(v16i16 (X86vtrunc (v16i32 VR512:$src))), (VPMOVDWrr  VR512:$src)>;
+def : Pat<(v16i8  (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr  VR512:$src)>;
+def : Pat<(v8i32  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQDrr  VR512:$src)>;
+
+def : Pat<(v16i8  (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+                  (VPMOVDBkrr VK16WM:$mask, VR512:$src)>;
+def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
+                  (VPMOVDWkrr VK16WM:$mask, VR512:$src)>;
+def : Pat<(v8i16  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
+                  (VPMOVQWkrr  VK8WM:$mask, VR512:$src)>;
+def : Pat<(v8i32  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
+                  (VPMOVQDkrr  VK8WM:$mask, VR512:$src)>;
+
+
+multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC,
+                      RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag, 
+                      X86MemOperand x86memop, ValueType OpVT, ValueType InVT> {
+
+  def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+              (ins SrcRC:$src),
+              !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins x86memop:$src),
+              !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+              [(set DstRC:$dst,
+                (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
+              EVEX;
+}
+
+defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext, 
+                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VQ>;
+defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext, 
+                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VO>;
+defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext, 
+                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VH>;
+defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext, 
+                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VQ>;
+defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, X86vzext, 
+                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             EVEX_CD8<32, CD8VH>;
+                             
+defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext, 
+                             memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VQ>;
+defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext, 
+                             memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
+                             EVEX_CD8<8, CD8VO>;
+defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext, 
+                             memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VH>;
+defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext, 
+                             memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
+                             EVEX_CD8<16, CD8VQ>;
+defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext, 
+                             memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
+                             EVEX_CD8<32, CD8VH>;
+
+//===----------------------------------------------------------------------===//
+// GATHER - SCATTER Operations
+
+multiclass avx512_gather<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                       RegisterClass RC, X86MemOperand memop> {
+let mayLoad = 1,
+  Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb" in
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs RC:$dst, KRC:$mask_wb),
+            (ins RC:$src1, KRC:$mask, memop:$src2),
+            !strconcat(OpcodeStr,
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+defm VGATHERDPDZ : avx512_gather<0x92, "vgatherdpd", VK8WM, VR512, vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERDPSZ : avx512_gather<0x92, "vgatherdps", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERQPDZ : avx512_gather<0x93, "vgatherqpd", VK8WM, VR512, vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VGATHERQPSZ : avx512_gather<0x93, "vgatherqps", VK8WM, VR256X, vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+  
+defm VPGATHERDQZ : avx512_gather<0x90, "vpgatherdq", VK8WM, VR512,  vy64xmem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERDDZ : avx512_gather<0x90, "vpgatherdd", VK16WM, VR512, vz32mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPGATHERQQZ : avx512_gather<0x91, "vpgatherqq", VK8WM, VR512,  vz64mem>,
+                                 EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPGATHERQDZ : avx512_gather<0x91, "vpgatherqd", VK8WM, VR256X,  vz64mem>,
+                                 EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+multiclass avx512_scatter<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                       RegisterClass RC, X86MemOperand memop> {
+let mayStore = 1, Constraints = "$mask = $mask_wb" in
+  def mr  : AVX5128I<opc, MRMDestMem, (outs KRC:$mask_wb),
+            (ins memop:$dst, KRC:$mask, RC:$src2),
+            !strconcat(OpcodeStr,
+            "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+            []>, EVEX, EVEX_K;
+}
+
+defm VSCATTERDPDZ : avx512_scatter<0xA2, "vscatterdpd", VK8WM, VR512, vy64xmem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERDPSZ : avx512_scatter<0xA2, "vscatterdps", VK16WM, VR512, vz32mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERQPDZ : avx512_scatter<0xA3, "vscatterqpd", VK8WM, VR512, vz64mem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VSCATTERQPSZ : avx512_scatter<0xA3, "vscatterqps", VK8WM, VR256X, vz64mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+  
+defm VPSCATTERDQZ : avx512_scatter<0xA0, "vpscatterdq", VK8WM, VR512, vy64xmem>,
+                                   EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERDDZ : avx512_scatter<0xA0, "vpscatterdd", VK16WM, VR512, vz32mem>,
+                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
+                                  EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
+                                  EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+
+multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
+                      ValueType vt, string OpcodeStr, PatFrag mem_frag,
+                      Domain d> {
+  def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+                   (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
+  def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+                   (ins RC:$src1, RC:$src2, i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                   [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+                                       (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+                   EVEX_4V, Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPSZ  : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+                  SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VSHUFPDZ  : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+                  SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16i32 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VSHUFPSZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v16i32 (X86Shufp VR512:$src1,
+                    (memopv16i32 addr:$src2), (i8 imm:$imm))),
+          (VSHUFPSZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+def : Pat<(v8i64 (X86Shufp VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VSHUFPDZrri VR512:$src1, VR512:$src2, imm:$imm)>;
+def : Pat<(v8i64 (X86Shufp VR512:$src1,
+                            (memopv8i64 addr:$src2), (i8 imm:$imm))),
+          (VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
+
+multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
+                     (ins RC:$src1, RC:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+  let mayLoad = 1 in
+  def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
+                     (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+                     !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+                     []>, EVEX_4V;
+}
+defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>, 
+                 EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>, 
+                 VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+          (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+
+multiclass avx512_vpabs<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop> {
+  def rr  : AVX5128I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+                    EVEX;
+  def rm  : AVX5128I<opc, MRMSrcMem, (outs VR512:$dst), 
+                   (ins x86memop:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>,
+                   EVEX;
+}
+
+defm VPABSD : avx512_vpabs<0x1E, "vpabsd", VR512, i512mem>, EVEX_V512,
+                        EVEX_CD8<32, CD8VF>;
+defm VPABSQ : avx512_vpabs<0x1F, "vpabsq", VR512, i512mem>, EVEX_V512, VEX_W,
+                        EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_conflict<bits<8> opc, string OpcodeStr, 
+                        RegisterClass RC, RegisterClass KRC, PatFrag memop_frag,
+                        X86MemOperand x86memop, PatFrag scalar_mfrag,
+                        X86MemOperand x86scalar_mop, string BrdcstStr,
+                        Intrinsic Int, Intrinsic maskInt, Intrinsic maskzInt> {
+  def rr : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src),
+       !strconcat(OpcodeStr, "\t{$src, ${dst} |${dst}, $src}"),
+       [(set RC:$dst, (Int RC:$src))]>, EVEX;
+  def rm : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins x86memop:$src),
+       !strconcat(OpcodeStr, "\t{$src, ${dst}|${dst}, $src}"),
+       [(set RC:$dst, (Int (memop_frag addr:$src)))]>, EVEX;
+  def rmb : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins x86scalar_mop:$src),
+       !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+                  ", ${dst}|${dst}, ${src}", BrdcstStr, "}"),
+       []>, EVEX, EVEX_B;
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins KRC:$mask, RC:$src),
+       !strconcat(OpcodeStr,
+                  "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+       [(set RC:$dst, (maskzInt KRC:$mask, RC:$src))]>, EVEX, EVEX_KZ;
+  def rmkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, x86memop:$src),
+       !strconcat(OpcodeStr,
+                  "\t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
+       [(set RC:$dst, (maskzInt KRC:$mask, (memop_frag addr:$src)))]>,
+       EVEX, EVEX_KZ;
+  def rmbkz : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins KRC:$mask, x86scalar_mop:$src),
+       !strconcat(OpcodeStr, "\t{${src}", BrdcstStr,
+                  ", ${dst} {${mask}} {z}|${dst} {${mask}} {z}, ${src}",
+                  BrdcstStr, "}"),
+       []>, EVEX, EVEX_KZ, EVEX_B;
+       
+  let Constraints = "$src1 = $dst" in {
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, RC:$src2),
+       !strconcat(OpcodeStr,
+                  "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+       [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, RC:$src2))]>, EVEX, EVEX_K;
+  def rmk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, x86memop:$src2),
+       !strconcat(OpcodeStr,
+                  "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
+       [(set RC:$dst, (maskInt RC:$src1, KRC:$mask, (memop_frag addr:$src2)))]>, EVEX, EVEX_K;
+  def rmbk : AVX5128I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, KRC:$mask, x86scalar_mop:$src2),
+       !strconcat(OpcodeStr, "\t{${src2}", BrdcstStr,
+                  ", ${dst} {${mask}}|${dst} {${mask}}, ${src2}", BrdcstStr, "}"),
+       []>, EVEX, EVEX_K, EVEX_B;
+   }
+}
+
+let Predicates = [HasCDI] in {
+defm VPCONFLICTD : avx512_conflict<0xC4, "vpconflictd", VR512, VK16WM,
+                    memopv16i32, i512mem, loadi32, i32mem, "{1to16}",
+                    int_x86_avx512_conflict_d_512,
+                    int_x86_avx512_conflict_d_mask_512,
+                    int_x86_avx512_conflict_d_maskz_512>,
+                    EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+defm VPCONFLICTQ : avx512_conflict<0xC4, "vpconflictq", VR512, VK8WM,
+                    memopv8i64, i512mem, loadi64, i64mem, "{1to8}",
+                    int_x86_avx512_conflict_q_512,
+                    int_x86_avx512_conflict_q_mask_512,
+                    int_x86_avx512_conflict_q_maskz_512>,
+                    EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+}
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index 225e972..7fc9c443 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -294,7 +294,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
 // unsigned division/remainder
 let hasSideEffects = 1 in { // so that we don't speculatively execute
 let SchedRW = [WriteIDiv] in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
 def DIV8r  : I<0xF6, MRM6r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
                "div{b}\t$src", [], IIC_DIV8_REG>;
 let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
@@ -310,7 +310,7 @@ def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
 } // SchedRW
 
 let mayLoad = 1 in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
 def DIV8m  : I<0xF6, MRM6m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
                "div{b}\t$src", [], IIC_DIV8_MEM>,
              SchedLoadReg<WriteIDivLd>;
@@ -331,7 +331,7 @@ def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
 
 // Signed division/remainder.
 let SchedRW = [WriteIDiv] in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
 def IDIV8r : I<0xF6, MRM7r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
                "idiv{b}\t$src", [], IIC_IDIV8>;
 let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
@@ -347,7 +347,7 @@ def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
 } // SchedRW
 
 let mayLoad = 1 in {
-let Defs = [AL,EFLAGS,AX], Uses = [AX] in
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
 def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
                "idiv{b}\t$src", [], IIC_IDIV8>,
              SchedLoadReg<WriteIDivLd>;
@@ -497,6 +497,21 @@ def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
                 Requires<[In64BitMode]>;
 } // isConvertibleToThreeAddress = 1, CodeSize = 2
 
+let isCodeGenOnly = 1, CodeSize = 2 in {
+def INC32_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                  "inc{w}\t$dst", [], IIC_UNARY_REG>,
+                OpSize, Requires<[In32BitMode]>;
+def INC32_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                  "inc{l}\t$dst", [], IIC_UNARY_REG>,
+                Requires<[In32BitMode]>;
+def DEC32_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                  "dec{w}\t$dst", [], IIC_UNARY_REG>,
+                OpSize, Requires<[In32BitMode]>;
+def DEC32_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                  "dec{l}\t$dst", [], IIC_UNARY_REG>,
+                Requires<[In32BitMode]>;
+} // isCodeGenOnly = 1, CodeSize = 2
+
 } // Constraints = "$src1 = $dst", SchedRW
 
 let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
@@ -578,7 +593,6 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
 } // CodeSize = 2, SchedRW
 } // Defs = [EFLAGS]
 
-
 /// X86TypeInfo - This is a bunch of information that describes relevant X86
 /// information about value types.  For example, it can tell you what the
 /// register class and preferred load to use.
@@ -726,20 +740,25 @@ class BinOpRR_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRR<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
                   (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2,
-                          EFLAGS))], IIC_BIN_NONMEM>;
+                          EFLAGS))], IIC_BIN_CARRY_NONMEM>;
 
 // BinOpRR_Rev - Instructions like "add reg, reg, reg" (reversed encoding).
-class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                 InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, MRMSrcReg, typeinfo,
         (outs typeinfo.RegClass:$dst),
         (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
-        mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $dst|$dst, $src2}", [], itin>,
     Sched<[WriteALU]> {
   // The disassembler should know about this, but not the asmparser.
   let isCodeGenOnly = 1;
   let hasSideEffects = 0;
 }
 
+// BinOpRR_RDD_Rev - Instructions like "adc reg, reg, reg" (reversed encoding).
+class BinOpRR_RFF_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
+  : BinOpRR_Rev<opcode, mnemonic, typeinfo, IIC_BIN_CARRY_NONMEM>;
+
 // BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
 class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
   : ITy<opcode, MRMSrcReg, typeinfo, (outs),
@@ -753,10 +772,11 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
 
 // BinOpRM - Instructions like "add reg, reg, [mem]".
 class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              dag outlist, list<dag> pattern>
+              dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_MEM>
   : ITy<opcode, MRMSrcMem, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.MemOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
     Sched<[WriteALULd, ReadAfterLd]>;
 
 // BinOpRM_R - Instructions like "add reg, reg, [mem]".
@@ -786,14 +806,15 @@ class BinOpRM_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
             (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2),
-                    EFLAGS))]>;
+                    EFLAGS))], IIC_BIN_CARRY_MEM>;
 
 // BinOpRI - Instructions like "add reg, reg, imm".
 class BinOpRI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              Format f, dag outlist, list<dag> pattern>
+              Format f, dag outlist, list<dag> pattern,
+              InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.ImmOperand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
     Sched<[WriteALU]> {
   let ImmT = typeinfo.ImmEncoding;
 }
@@ -824,14 +845,15 @@ class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
             [(set typeinfo.RegClass:$dst, EFLAGS,
                 (opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
-                        EFLAGS))]>;
+                        EFLAGS))], IIC_BIN_CARRY_NONMEM>;
 
 // BinOpRI8 - Instructions like "add reg, reg, imm8".
 class BinOpRI8<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-               Format f, dag outlist, list<dag> pattern>
+               Format f, dag outlist, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, f, typeinfo, outlist,
         (ins typeinfo.RegClass:$src1, typeinfo.Imm8Operand:$src2),
-        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, IIC_BIN_NONMEM>,
+        mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
     Sched<[WriteALU]> {
   let ImmT = Imm8; // Always 8-bit immediate.
 }
@@ -863,14 +885,14 @@ class BinOpRI8_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
              [(set typeinfo.RegClass:$dst, EFLAGS,
                (opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2,
-                       EFLAGS))]>;
+                       EFLAGS))], IIC_BIN_CARRY_NONMEM>;
 
 // BinOpMR - Instructions like "add [mem], reg".
 class BinOpMR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              list<dag> pattern>
+              list<dag> pattern, InstrItinClass itin = IIC_BIN_MEM>
   : ITy<opcode, MRMDestMem, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.RegClass:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
     Sched<[WriteALULd, WriteRMW]>;
 
 // BinOpMR_RMW - Instructions like "add [mem], reg".
@@ -886,7 +908,7 @@ class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
   : BinOpMR<opcode, mnemonic, typeinfo,
           [(store (opnode (load addr:$dst), typeinfo.RegClass:$src, EFLAGS),
                   addr:$dst),
-           (implicit EFLAGS)]>;
+           (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
 
 // BinOpMR_F - Instructions like "cmp [mem], reg".
 class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
@@ -896,10 +918,11 @@ class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpMI - Instructions like "add [mem], imm".
 class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
-              Format f, list<dag> pattern, bits<8> opcode = 0x80>
+              Format f, list<dag> pattern, bits<8> opcode = 0x80,
+              InstrItinClass itin = IIC_BIN_MEM>
   : ITy<opcode, f, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.ImmOperand:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
     Sched<[WriteALULd, WriteRMW]> {
   let ImmT = typeinfo.ImmEncoding;
 }
@@ -917,7 +940,7 @@ class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
   : BinOpMI<mnemonic, typeinfo, f,
             [(store (opnode (typeinfo.VT (load addr:$dst)),
                             typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
-             (implicit EFLAGS)]>;
+             (implicit EFLAGS)], 0x80, IIC_BIN_CARRY_MEM>;
 
 // BinOpMI_F - Instructions like "cmp [mem], imm".
 class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
@@ -929,10 +952,11 @@ class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
 
 // BinOpMI8 - Instructions like "add [mem], imm8".
 class BinOpMI8<string mnemonic, X86TypeInfo typeinfo,
-               Format f, list<dag> pattern>
+               Format f, list<dag> pattern,
+               InstrItinClass itin = IIC_BIN_MEM>
   : ITy<0x82, f, typeinfo,
         (outs), (ins typeinfo.MemOperand:$dst, typeinfo.Imm8Operand:$src),
-        mnemonic, "{$src, $dst|$dst, $src}", pattern, IIC_BIN_MEM>,
+        mnemonic, "{$src, $dst|$dst, $src}", pattern, itin>,
     Sched<[WriteALULd, WriteRMW]> {
   let ImmT = Imm8; // Always 8-bit immediate.
 }
@@ -951,7 +975,7 @@ class BinOpMI8_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
   : BinOpMI8<mnemonic, typeinfo, f,
              [(store (opnode (load addr:$dst),
                              typeinfo.Imm8Operator:$src, EFLAGS), addr:$dst),
-              (implicit EFLAGS)]>;
+              (implicit EFLAGS)], IIC_BIN_CARRY_MEM>;
 
 // BinOpMI8_F - Instructions like "cmp [mem], imm8".
 class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
@@ -960,18 +984,28 @@ class BinOpMI8_F<string mnemonic, X86TypeInfo typeinfo,
              [(set EFLAGS, (opnode (load addr:$dst),
                                    typeinfo.Imm8Operator:$src))]>;
 
-// BinOpAI - Instructions like "add %eax, %eax, imm".
+// BinOpAI - Instructions like "add %eax, %eax, imm", that imp-def EFLAGS.
 class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
-              Register areg, string operands>
+              Register areg, string operands,
+              InstrItinClass itin = IIC_BIN_NONMEM>
   : ITy<opcode, RawFrm, typeinfo,
         (outs), (ins typeinfo.ImmOperand:$src),
-        mnemonic, operands, []>, Sched<[WriteALU]> {
+        mnemonic, operands, [], itin>, Sched<[WriteALU]> {
   let ImmT = typeinfo.ImmEncoding;
   let Uses = [areg];
-  let Defs = [areg];
+  let Defs = [areg, EFLAGS];
   let hasSideEffects = 0;
 }
 
+// BinOpAI_FF - Instructions like "adc %eax, %eax, imm", that implicitly define
+// and use EFLAGS.
+class BinOpAI_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
+                Register areg, string operands>
+  : BinOpAI<opcode, mnemonic, typeinfo, areg, operands,
+            IIC_BIN_CARRY_NONMEM> {
+  let Uses = [areg, EFLAGS];
+}
+
 /// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
 /// defined with "(set GPR:$dst, EFLAGS, (...".
 ///
@@ -1030,16 +1064,16 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def NAME#16mi   : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
     def NAME#32mi   : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
     def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
-
-    def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
-                             "{$src, %al|AL, $src}">;
-    def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
-                             "{$src, %ax|AX, $src}">;
-    def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
-                             "{$src, %eax|EAX, $src}">;
-    def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
-                             "{$src, %rax|RAX, $src}">;
-  }
+  } // Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                           "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                           "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                           "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                           "{$src, %rax|rax, $src}">;
 }
 
 /// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
@@ -1052,7 +1086,7 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
                           string mnemonic, Format RegMRM, Format MemMRM,
                           SDNode opnode, bit CommutableRR,
                            bit ConvertibleToThreeAddress> {
-  let Defs = [EFLAGS] in {
+  let Uses = [EFLAGS], Defs = [EFLAGS] in {
     let Constraints = "$src1 = $dst" in {
       let isCommutable = CommutableRR,
           isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
@@ -1062,10 +1096,10 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
         def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
       } // isCommutable
 
-      def NAME#8rr_REV  : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
-      def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
-      def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
-      def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+      def NAME#8rr_REV  : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi8>;
+      def NAME#16rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi16>;
+      def NAME#32rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi32>;
+      def NAME#64rr_REV : BinOpRR_RFF_Rev<BaseOpc2, mnemonic, Xi64>;
 
       def NAME#8rm   : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
       def NAME#16rm  : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
@@ -1101,16 +1135,16 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def NAME#16mi   : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
     def NAME#32mi   : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
     def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
-
-    def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
-                             "{$src, %al|AL, $src}">;
-    def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
-                             "{$src, %ax|AX, $src}">;
-    def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
-                             "{$src, %eax|EAX, $src}">;
-    def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
-                             "{$src, %rax|RAX, $src}">;
-  }
+  } // Uses = [EFLAGS], Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI_FF<BaseOpc4, mnemonic, Xi8 , AL,
+                              "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI_FF<BaseOpc4, mnemonic, Xi16, AX,
+                              "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi32, EAX,
+                              "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI_FF<BaseOpc4, mnemonic, Xi64, RAX,
+                              "{$src, %rax|rax, $src}">;
 }
 
 /// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
@@ -1168,16 +1202,16 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
     def NAME#16mi   : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
     def NAME#32mi   : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
     def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
-
-    def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
-                             "{$src, %al|AL, $src}">;
-    def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
-                             "{$src, %ax|AX, $src}">;
-    def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
-                             "{$src, %eax|EAX, $src}">;
-    def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
-                             "{$src, %rax|RAX, $src}">;
-  }
+  } // Defs = [EFLAGS]
+
+  def NAME#8i8   : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+                           "{$src, %al|al, $src}">;
+  def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+                           "{$src, %ax|ax, $src}">;
+  def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+                           "{$src, %eax|eax, $src}">;
+  def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+                           "{$src, %rax|rax, $src}">;
 }
 
 
@@ -1195,12 +1229,10 @@ defm SUB : ArithBinOp_RF<0x28, 0x2A, 0x2C, "sub", MRM5r, MRM5m,
 }
 
 // Arithmetic.
-let Uses = [EFLAGS] in {
-  defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
-                            1, 0>;
-  defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
-                            0, 0>;
-}
+defm ADC : ArithBinOp_RFF<0x10, 0x12, 0x14, "adc", MRM2r, MRM2m, X86adc_flag,
+                          1, 0>;
+defm SBB : ArithBinOp_RFF<0x18, 0x1A, 0x1C, "sbb", MRM3r, MRM3m, X86sbb_flag,
+                          0, 0>;
 
 let isCompare = 1 in {
 defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
@@ -1215,44 +1247,46 @@ defm CMP : ArithBinOp_F<0x38, 0x3A, 0x3C, "cmp", MRM7r, MRM7m, X86cmp, 0, 0>;
 def X86testpat : PatFrag<(ops node:$lhs, node:$rhs),
                          (X86cmp (and_su node:$lhs, node:$rhs), 0)>;
 
-let isCompare = 1, Defs = [EFLAGS] in {
-  let isCommutable = 1 in {
-    def TEST8rr  : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
-    def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
-    def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
-    def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
-  } // isCommutable
-
-  def TEST8rm    : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
-  def TEST16rm   : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
-  def TEST32rm   : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
-  def TEST64rm   : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
-
-  def TEST8ri    : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
-  def TEST16ri   : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
-  def TEST32ri   : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
-  def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
-
-  def TEST8mi    : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
-  def TEST16mi   : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
-  def TEST32mi   : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
-  def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+let isCompare = 1 in {
+  let Defs = [EFLAGS] in {
+    let isCommutable = 1 in {
+      def TEST8rr  : BinOpRR_F<0x84, "test", Xi8 , X86testpat, MRMSrcReg>;
+      def TEST16rr : BinOpRR_F<0x84, "test", Xi16, X86testpat, MRMSrcReg>;
+      def TEST32rr : BinOpRR_F<0x84, "test", Xi32, X86testpat, MRMSrcReg>;
+      def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat, MRMSrcReg>;
+    } // isCommutable
+
+    def TEST8rm    : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
+    def TEST16rm   : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
+    def TEST32rm   : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
+    def TEST64rm   : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+
+    def TEST8ri    : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
+    def TEST16ri   : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
+    def TEST32ri   : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+    def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
+
+    def TEST8mi    : BinOpMI_F<"test", Xi8 , X86testpat, MRM0m, 0xF6>;
+    def TEST16mi   : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
+    def TEST32mi   : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
+    def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
+
+    // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
+    // register class is constrained to GR8_NOREX.
+    let isPseudo = 1 in
+    def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
+                          "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
+  } // Defs = [EFLAGS]
 
   def TEST8i8    : BinOpAI<0xA8, "test", Xi8 , AL,
-                           "{$src, %al|AL, $src}">;
+                           "{$src, %al|al, $src}">;
   def TEST16i16  : BinOpAI<0xA8, "test", Xi16, AX,
-                           "{$src, %ax|AX, $src}">;
+                           "{$src, %ax|ax, $src}">;
   def TEST32i32  : BinOpAI<0xA8, "test", Xi32, EAX,
-                           "{$src, %eax|EAX, $src}">;
+                           "{$src, %eax|eax, $src}">;
   def TEST64i32  : BinOpAI<0xA8, "test", Xi64, RAX,
-                           "{$src, %rax|RAX, $src}">;
-
-  // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
-  // register class is constrained to GR8_NOREX.
-  let isPseudo = 1 in
-  def TEST8ri_NOREX : I<0, Pseudo, (outs), (ins GR8_NOREX:$src, i8imm:$mask),
-                        "", [], IIC_BIN_NONMEM>, Sched<[WriteALU]>;
-}
+                           "{$src, %rax|rax, $src}">;
+} // isCompare
 
 //===----------------------------------------------------------------------===//
 // ANDN Instruction
@@ -1294,12 +1328,12 @@ let neverHasSideEffects = 1 in {
   let isCommutable = 1 in
   def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
-             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul]>;
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMul, WriteIMulH]>;
 
   let mayLoad = 1 in
   def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
              !strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
-             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd]>;
+             [], IIC_MUL8>, T8XD, VEX_4V, Sched<[WriteIMulLd, WriteIMulH]>;
 }
 }
 
@@ -1328,7 +1362,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
   def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
              "adcx{l}\t{$src, $dst|$dst, $src}",
              [], IIC_BIN_MEM>, T8, OpSize;
- 
+
   def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
              "adcx{q}\t{$src, $dst|$dst, $src}",
              [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
@@ -1353,7 +1387,7 @@ let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
   def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
              "adox{l}\t{$src, $dst|$dst, $src}",
              [], IIC_BIN_MEM>, T8XS;
- 
+
   def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
              "adox{q}\t{$src, $dst|$dst, $src}",
              [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index d9ff0c6..7d10b67 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -129,12 +129,13 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
 
 // The MSVC runtime contains an _ftol2 routine for converting floating-point
 // to integer values. It has a strange calling convention: the input is
-// popped from the x87 stack, and the return value is given in EDX:EAX. No
-// other registers (aside from flags) are touched.
+// popped from the x87 stack, and the return value is given in EDX:EAX. ECX is
+// used as a temporary register. No other registers (aside from flags) are
+// touched.
 // Microsoft toolchains do not support 80-bit precision, so a WIN_FTOL_80
 // variant is unnecessary.
 
-let Defs = [EAX, EDX, EFLAGS], FPForm = SpecialFP in {
+let Defs = [EAX, EDX, ECX, EFLAGS], FPForm = SpecialFP in {
   def WIN_FTOL_32 : I<0, Pseudo, (outs), (ins RFP32:$src),
                       "# win32 fptoui",
                       [(X86WinFTOL RFP32:$src)]>,
@@ -216,48 +217,38 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
 // Alias Instructions
 //===----------------------------------------------------------------------===//
 
-// Alias instructions that map movr0 to xor.
+// Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 // FIXME: Set encoding to pseudo.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isCodeGenOnly = 1 in {
-def MOV8r0   : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "",
-                 [(set GR8:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
-
-// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller
-// encoding and avoids a partial-register update sometimes, but doing so
-// at isel time interferes with rematerialization in the current register
-// allocator. For now, this is rewritten when the instruction is lowered
-// to an MCInst.
-def MOV16r0   : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
-                 "",
-                 [(set GR16:$dst, 0)], IIC_ALU_NONMEM>, OpSize,
-                 Sched<[WriteZero]>;
-
-// FIXME: Set encoding to pseudo.
+    isCodeGenOnly = 1 in
 def MOV32r0  : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
                  [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
-}
 
-// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
-// smaller encoding, but doing so at isel time interferes with rematerialization
-// in the current register allocator. For now, this is rewritten when the
-// instruction is lowered to an MCInst.
-// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
-// when we have a better way to specify isel priority.
-let Defs = [EFLAGS], isCodeGenOnly=1,
-    AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOV64r0   : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
-                 [(set GR64:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
+// Other widths can also make use of the 32-bit xor, which may have a smaller
+// encoding and avoid partial register updates.
+def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
+def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
+  let AddedComplexity = 20;
+}
 
 // Materialize i64 constant where top 32-bits are zero. This could theoretically
 // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
 // that would make it more difficult to rematerialize.
 let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isCodeGenOnly = 1 in
-def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
-                        "", [(set GR64:$dst, i64immZExt32:$src)],
-                        IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+    isCodeGenOnly = 1, neverHasSideEffects = 1 in
+def MOV32ri64 : Ii32<0xb8, AddRegFrm, (outs GR32:$dst), (ins i64i32imm:$src),
+                     "", [], IIC_ALU_NONMEM>, Sched<[WriteALU]>;
+
+// This 64-bit pseudo-move can be used for both a 64-bit constant that is
+// actually the zero-extension of a 32-bit constant, and for labels in the
+// x86-64 small code model.
+def mov64imm32 : ComplexPattern<i64, 1, "SelectMOV64Imm32", [imm, X86Wrapper]>;
+
+let AddedComplexity = 1 in
+def : Pat<(i64 mov64imm32:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32ri64 mov64imm32:$src), sub_32bit)>;
 
 // Use sbb to materialize carry bit.
 let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
@@ -893,6 +884,24 @@ let Uses = [EFLAGS], usesCustomInserter = 1 in {
                     [(set VR256:$dst,
                       (v4i64 (X86cmov VR256:$t, VR256:$f, imm:$cond,
                                           EFLAGS)))]>;
+  def CMOV_V8I64 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V8I64 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v8i64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V8F64 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V8F64 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v8f64 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V16F32 : I<0, Pseudo,
+                    (outs VR512:$dst), (ins VR512:$t, VR512:$f, i8imm:$cond),
+                    "#CMOV_V16F32 PSEUDO!",
+                    [(set VR512:$dst,
+                      (v16f32 (X86cmov VR512:$t, VR512:$f, imm:$cond,
+                                          EFLAGS)))]>;
 }
 
 
@@ -926,8 +935,6 @@ def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
 def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
           (MOV32mi addr:$dst, tblockaddress:$src)>;
 
-
-
 // ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
 // code model mode, should use 'movabs'.  FIXME: This is really a hack, the
 //  'movabs' predicate should handle this sort of thing.
@@ -942,20 +949,6 @@ def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
 def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
           (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
 
-// In static codegen with small code model, we can get the address of a label
-// into a register with 'movl'.  FIXME: This is a hack, the 'imm' predicate of
-// the MOV64ri64i32 should accept these.
-def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
-          (MOV64ri64i32 tconstpool  :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
-          (MOV64ri64i32 tjumptable  :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
-          (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
-          (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
-def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
-          (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
-
 // In kernel code model, we can get the address of a label
 // into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of
 // the MOV64ri32 should accept these.
@@ -989,14 +982,12 @@ def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
           (MOV64mi32 addr:$dst, tblockaddress:$src)>,
           Requires<[NearData, IsStatic]>;
 
-
-
 // Calls
 
 // tls has some funny stuff here...
 // This corresponds to movabs $foo@tpoff, %rax
 def : Pat<(i64 (X86Wrapper tglobaltlsaddr :$dst)),
-          (MOV64ri tglobaltlsaddr :$dst)>;
+          (MOV64ri32 tglobaltlsaddr :$dst)>;
 // This corresponds to add $foo@tpoff, %rax
 def : Pat<(add GR64:$src1, (X86Wrapper tglobaltlsaddr :$dst)),
           (ADD64ri32 GR64:$src1, tglobaltlsaddr :$dst)>;
@@ -1119,7 +1110,8 @@ defm : CMOVmr<X86_COND_NO, CMOVO16rm , CMOVO32rm , CMOVO64rm>;
 def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
 def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
 def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
-def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+def : Pat<(zextloadi64i1 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
 
 // extload bool -> extload byte
 // When extloading from 16-bit and smaller memory locations into 64-bit
@@ -1133,14 +1125,16 @@ def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
 def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
 def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
 
-def : Pat<(extloadi64i1 addr:$src),  (MOVZX64rm8  addr:$src)>;
-def : Pat<(extloadi64i8 addr:$src),  (MOVZX64rm8  addr:$src)>;
-def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
 // For other extloads, use subregs, since the high contents of the register are
 // defined after an extload.
+def : Pat<(extloadi64i1 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i8 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+def : Pat<(extloadi64i16 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
 def : Pat<(extloadi64i32 addr:$src),
-          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
-                         sub_32bit)>;
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
 
 // anyext. Define these to do an explicit zero-extend to
 // avoid partial-register updates.
@@ -1152,8 +1146,10 @@ def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
 def : Pat<(i32 (anyext GR16:$src)),
           (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR16:$src, sub_16bit)>;
 
-def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8  GR8  :$src)>;
-def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR8 :$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr8  GR8  :$src), sub_32bit)>;
+def : Pat<(i64 (anyext GR16:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
 def : Pat<(i64 (anyext GR32:$src)),
           (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
 
@@ -1318,13 +1314,19 @@ def : Pat<(and GR16:$src1, 0xff),
 
 // r & (2^32-1) ==> movz
 def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
-          (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
+          (SUBREG_TO_REG (i64 0),
+                         (MOV32rr (EXTRACT_SUBREG GR64:$src, sub_32bit)),
+                         sub_32bit)>;
 // r & (2^16-1) ==> movz
 def : Pat<(and GR64:$src, 0xffff),
-          (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit)))>;
+          (SUBREG_TO_REG (i64 0),
+                      (MOVZX32rr16 (i16 (EXTRACT_SUBREG GR64:$src, sub_16bit))),
+                      sub_32bit)>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR64:$src, 0xff),
-          (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit)))>;
+          (SUBREG_TO_REG (i64 0),
+                         (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
+                         sub_32bit)>;
 // r & (2^8-1) ==> movz
 def : Pat<(and GR32:$src1, 0xff),
            (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 0e69651..e4ccc06 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -49,10 +49,12 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1,
 let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
   def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
                         "jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
+  let hasSideEffects = 0 in
   def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
                        "jmp\t$dst", [], IIC_JMP_REL>;
   // FIXME : Intel syntax for JMP64pcrel32 such that it is not ambiguious
   // with JMP_1.
+  let hasSideEffects = 0 in
   def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
                        "jmpq\t$dst", [], IIC_JMP_REL>;
 }
@@ -60,6 +62,7 @@ let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
 // Conditional Branches.
 let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
   multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
+    let hasSideEffects = 0 in
     def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm, [],
                        IIC_Jcc>;
     def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst), asm,
@@ -85,7 +88,7 @@ defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
 defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
 
 // jcx/jecx/jrcx instructions.
-let isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
   // These are the 32-bit versions of this instruction for the asmparser.  In
   // 32-bit mode, the address size prefix is jcxz and the unprefixed version is
   // jecxz.
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index 6dc7175..4090550 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -14,26 +14,26 @@
 let neverHasSideEffects = 1 in {
   let Defs = [AX], Uses = [AL] in
   def CBW : I<0x98, RawFrm, (outs), (ins),
-              "{cbtw|cbw}", []>, OpSize;   // AX = signext(AL)
+              "{cbtw|cbw}", [], IIC_CBW>, OpSize;   // AX = signext(AL)
   let Defs = [EAX], Uses = [AX] in
   def CWDE : I<0x98, RawFrm, (outs), (ins),
-              "{cwtl|cwde}", []>;   // EAX = signext(AX)
+              "{cwtl|cwde}", [], IIC_CBW>;   // EAX = signext(AX)
 
   let Defs = [AX,DX], Uses = [AX] in
   def CWD : I<0x99, RawFrm, (outs), (ins),
-              "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+              "{cwtd|cwd}", [], IIC_CBW>, OpSize; // DX:AX = signext(AX)
   let Defs = [EAX,EDX], Uses = [EAX] in
   def CDQ : I<0x99, RawFrm, (outs), (ins),
-              "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+              "{cltd|cdq}", [], IIC_CBW>; // EDX:EAX = signext(EAX)
 
 
   let Defs = [RAX], Uses = [EAX] in
   def CDQE : RI<0x98, RawFrm, (outs), (ins),
-               "{cltq|cdqe}", []>;     // RAX = signext(EAX)
+               "{cltq|cdqe}", [], IIC_CBW>;     // RAX = signext(EAX)
 
   let Defs = [RAX,RDX], Uses = [RAX] in
   def CQO  : RI<0x99, RawFrm, (outs), (ins),
-                "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+                "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
 }
 
 
@@ -149,38 +149,24 @@ def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
                        "movz{wq|x}\t{$src, $dst|$dst, $src}", [], IIC_MOVZX>,
                        TB, Sched<[WriteALULd]>;
 
-// FIXME: These should be Pat patterns.
-let isCodeGenOnly = 1 in {
-
-// Use movzbl instead of movzbq when the destination is a register; it's
-// equivalent due to implicit zero-extending, and it has a smaller encoding.
-def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
-                   "", [(set GR64:$dst, (zext GR8:$src))], IIC_MOVZX>, TB,
-                   Sched<[WriteALU]>;
-def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
-                   "", [(set GR64:$dst, (zextloadi64i8 addr:$src))], IIC_MOVZX>,
-                   TB, Sched<[WriteALULd]>;
-// Use movzwl instead of movzwq when the destination is a register; it's
-// equivalent due to implicit zero-extending, and it has a smaller encoding.
-def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
-                   "", [(set GR64:$dst, (zext GR16:$src))], IIC_MOVZX>, TB,
-                   Sched<[WriteALU]>;
-def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
-                   "", [(set GR64:$dst, (zextloadi64i16 addr:$src))],
-                   IIC_MOVZX>, TB, Sched<[WriteALULd]>;
-
-// There's no movzlq instruction, but movl can be used for this purpose, using
-// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
-// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
-// zero-extension, however this isn't possible when the 32-bit value is
-// defined by a truncate or is copied from something where the high bits aren't
-// necessarily all zero. In such cases, we fall back to these explicit zext
-// instructions.
-def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
-                    "", [(set GR64:$dst, (zext GR32:$src))], IIC_MOVZX>,
-                    Sched<[WriteALU]>;
-def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
-                    "", [(set GR64:$dst, (zextloadi64i32 addr:$src))],
-                    IIC_MOVZX>, Sched<[WriteALULd]>;
-}
-
+// 64-bit zero-extension patterns use SUBREG_TO_REG and an operation writing a
+// 32-bit register.
+def : Pat<(i64 (zext GR8:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr8 GR8:$src), sub_32bit)>;
+def : Pat<(zextloadi64i8 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm8 addr:$src), sub_32bit)>;
+
+def : Pat<(i64 (zext GR16:$src)),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16:$src), sub_32bit)>;
+def : Pat<(zextloadi64i16 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOVZX32rm16 addr:$src), sub_32bit)>;
+
+// The preferred way to do 32-bit-to-64-bit zero extension on x86-64 is to use a
+// SUBREG_TO_REG to utilize implicit zero-extension, however this isn't possible
+// when the 32-bit value is defined by a truncate or is copied from something
+// where the high bits aren't necessarily all zero. In such cases, we fall back
+// to these explicit zext instructions.
+def : Pat<(i64 (zext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), (MOV32rr GR32:$src), sub_32bit)>;
+def : Pat<(i64 (zextloadi64i32 addr:$src)),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src), sub_32bit)>;
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 7759a8a2..69cd5a5 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -74,43 +74,43 @@ let neverHasSideEffects = 1 in {
 
 // Fused Multiply-Add
 let ExeDomain = SSEPackedSingle in {
-  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
-                                 memopv8f32, X86Fmadd, v4f32, v8f32>;
-  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
-                                 memopv8f32, X86Fmsub, v4f32, v8f32>;
+  defm VFMADDPS    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", loadv4f32,
+                                 loadv8f32, X86Fmadd, v4f32, v8f32>;
+  defm VFMSUBPS    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", loadv4f32,
+                                 loadv8f32, X86Fmsub, v4f32, v8f32>;
   defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
-                                 memopv4f32, memopv8f32, X86Fmaddsub,
+                                 loadv4f32, loadv8f32, X86Fmaddsub,
                                  v4f32, v8f32>;
   defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
-                                 memopv4f32, memopv8f32, X86Fmsubadd,
+                                 loadv4f32, loadv8f32, X86Fmsubadd,
                                  v4f32, v8f32>;
 }
 
 let ExeDomain = SSEPackedDouble in {
-  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
-                                 memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
-  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
-                                 memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
+  defm VFMADDPD    : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", loadv2f64,
+                                 loadv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
+  defm VFMSUBPD    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", loadv2f64,
+                                 loadv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
   defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
-                                 memopv2f64, memopv4f64, X86Fmaddsub,
+                                 loadv2f64, loadv4f64, X86Fmaddsub,
                                  v2f64, v4f64>, VEX_W;
   defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
-                                 memopv2f64, memopv4f64, X86Fmsubadd,
+                                 loadv2f64, loadv4f64, X86Fmsubadd,
                                  v2f64, v4f64>, VEX_W;
 }
 
 // Fused Negative Multiply-Add
 let ExeDomain = SSEPackedSingle in {
-  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",  memopv4f32,
-                               memopv8f32, X86Fnmadd, v4f32, v8f32>;
-  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",  memopv4f32,
-                               memopv8f32, X86Fnmsub, v4f32, v8f32>;
+  defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps",  loadv4f32,
+                               loadv8f32, X86Fnmadd, v4f32, v8f32>;
+  defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps",  loadv4f32,
+                               loadv8f32, X86Fnmsub, v4f32, v8f32>;
 }
 let ExeDomain = SSEPackedDouble in {
-  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
-                               memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
+  defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", loadv2f64,
+                               loadv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
   defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
-                               memopv2f64, memopv4f64, X86Fnmsub, v2f64,
+                               loadv2f64, loadv4f64, X86Fnmsub, v2f64,
                                v4f64>, VEX_W;
 }
 
@@ -206,25 +206,26 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set RC:$dst,
-             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+             (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
   def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, RC:$src2, x86memop:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
-                           (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+                           (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, MemOp4;
   def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
            (ins RC:$src1, x86memop:$src2, RC:$src3),
            !strconcat(OpcodeStr,
            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
            [(set RC:$dst,
-             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
+             (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
 // For disassembler
 let isCodeGenOnly = 1, hasSideEffects = 0 in
   def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
                (ins RC:$src1, RC:$src2, RC:$src3),
                !strconcat(OpcodeStr,
-               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+               "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+               VEX_LIG;
 }
 
 multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
@@ -235,19 +236,19 @@ multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst,
-                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+                 (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG, MemOp4;
   def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, VR128:$src2, memop:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
-                                  mem_cpat:$src3))]>, VEX_W, MemOp4;
+                                  mem_cpat:$src3))]>, VEX_W, VEX_LIG, MemOp4;
   def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
                (ins VR128:$src1, memop:$src2, VR128:$src3),
                !strconcat(OpcodeStr,
                "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
                [(set VR128:$dst,
-                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+                 (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
 }
 
 multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -338,31 +339,31 @@ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
 
 let ExeDomain = SSEPackedSingle in {
   defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
   defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
-                            memopv4f32, memopv8f32>;
+                            loadv4f32, loadv8f32>;
 }
 
 let ExeDomain = SSEPackedDouble in {
   defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
   defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
-                            memopv2f64, memopv4f64>;
+                            loadv2f64, loadv4f64>;
 }
 
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 2224a08..7c37888 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -229,22 +229,22 @@ class FPrST0PInst<bits<8> o, string asm>
 // of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
 // we have to put some 'r's in and take them out of weird places.
 def ADD_FST0r   : FPST0rInst <0xC0, "fadd\t$op">;
-def ADD_FrST0   : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, ST(0)}">;
+def ADD_FrST0   : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, st(0)}">;
 def ADD_FPrST0  : FPrST0PInst<0xC0, "faddp\t$op">;
 def SUBR_FST0r  : FPST0rInst <0xE8, "fsubr\t$op">;
-def SUB_FrST0   : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, ST(0)}">;
+def SUB_FrST0   : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, st(0)}">;
 def SUB_FPrST0  : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
 def SUB_FST0r   : FPST0rInst <0xE0, "fsub\t$op">;
-def SUBR_FrST0  : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, ST(0)}">;
+def SUBR_FrST0  : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
 def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
 def MUL_FST0r   : FPST0rInst <0xC8, "fmul\t$op">;
-def MUL_FrST0   : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, ST(0)}">;
+def MUL_FrST0   : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, st(0)}">;
 def MUL_FPrST0  : FPrST0PInst<0xC8, "fmulp\t$op">;
 def DIVR_FST0r  : FPST0rInst <0xF8, "fdivr\t$op">;
-def DIV_FrST0   : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, ST(0)}">;
+def DIV_FrST0   : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, st(0)}">;
 def DIV_FPrST0  : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
 def DIV_FST0r   : FPST0rInst <0xF0, "fdiv\t$op">;
-def DIVR_FrST0  : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, ST(0)}">;
+def DIVR_FrST0  : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
 def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
 
 def COM_FST0r   : FPST0rInst <0xD0, "fcom\t$op">;
@@ -337,21 +337,21 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
 let Predicates = [HasCMov] in {
 // These are not factored because there's no clean way to pass DA/DB.
 def CMOVB_F  : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmovb\t{$op, %st(0)|ST(0), $op}">, DA;
+                  "fcmovb\t{$op, %st(0)|st(0), $op}">, DA;
 def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmovbe\t{$op, %st(0)|ST(0), $op}">, DA;
+                  "fcmovbe\t{$op, %st(0)|st(0), $op}">, DA;
 def CMOVE_F  : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmove\t{$op, %st(0)|ST(0), $op}">, DA;
+                  "fcmove\t{$op, %st(0)|st(0), $op}">, DA;
 def CMOVP_F  : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmovu\t {$op, %st(0)|ST(0), $op}">, DA;
+                  "fcmovu\t{$op, %st(0)|st(0), $op}">, DA;
 def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmovnb\t{$op, %st(0)|ST(0), $op}">, DB;
+                  "fcmovnb\t{$op, %st(0)|st(0), $op}">, DB;
 def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmovnbe\t{$op, %st(0)|ST(0), $op}">, DB;
+                  "fcmovnbe\t{$op, %st(0)|st(0), $op}">, DB;
 def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmovne\t{$op, %st(0)|ST(0), $op}">, DB;
+                  "fcmovne\t{$op, %st(0)|st(0), $op}">, DB;
 def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
-                  "fcmovnu\t{$op, %st(0)|ST(0), $op}">, DB;
+                  "fcmovnu\t{$op, %st(0)|st(0), $op}">, DB;
 } // Predicates = [HasCMov]
 
 // Floating point loads & stores.
@@ -578,7 +578,7 @@ def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
 let SchedRW = [WriteALU] in {
 let Defs = [AX], Uses = [FPSW] in
 def FNSTSW16r : I<0xE0, RawFrm,                  // AX = fp flags
-                  (outs), (ins), "fnstsw %ax",
+                  (outs), (ins), "fnstsw\t{%ax|ax}",
                   [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>, DF;
 
 def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control world
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index a71e024..0fd9011 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -96,6 +96,20 @@ def SSEPackedSingle : Domain<1>;
 def SSEPackedDouble : Domain<2>;
 def SSEPackedInt    : Domain<3>;
 
+// Class specifying the vector form of the decompressed
+// displacement of 8-bit.
+class CD8VForm<bits<3> val> {
+  bits<3> Value = val;
+}
+def CD8VF  : CD8VForm<0>;  // v := VL
+def CD8VH  : CD8VForm<1>;  // v := VL/2
+def CD8VQ  : CD8VForm<2>;  // v := VL/4
+def CD8VO  : CD8VForm<3>;  // v := VL/8
+def CD8VT1 : CD8VForm<4>;  // v := 1
+def CD8VT2 : CD8VForm<5>;  // v := 2
+def CD8VT4 : CD8VForm<6>;  // v := 4
+def CD8VT8 : CD8VForm<7>;  // v := 8
+
 // Prefix byte classes which are used to indicate to the ad-hoc machine code
 // emitter that various prefix bytes are required.
 class OpSize { bit hasOpSizePrefix = 1; }
@@ -125,6 +139,7 @@ class T8XS   { bits<5> Prefix = 18; }
 class TAXD   { bits<5> Prefix = 19; }
 class XOP8   { bits<5> Prefix = 20; }
 class XOP9   { bits<5> Prefix = 21; }
+class XOPA   { bits<5> Prefix = 22; }
 class VEX    { bit hasVEXPrefix = 1; }
 class VEX_W  { bit hasVEX_WPrefix = 1; }
 class VEX_4V : VEX { bit hasVEX_4VPrefix = 1; }
@@ -132,6 +147,19 @@ class VEX_4VOp3 : VEX { bit hasVEX_4VOp3Prefix = 1; }
 class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; }
 class VEX_L  { bit hasVEX_L = 1; }
 class VEX_LIG { bit ignoresVEX_L = 1; }
+class EVEX : VEX { bit hasEVEXPrefix = 1; }
+class EVEX_4V : VEX_4V { bit hasEVEXPrefix = 1; }
+class EVEX_K { bit hasEVEX_K = 1; }
+class EVEX_KZ : EVEX_K { bit hasEVEX_Z = 1; }
+class EVEX_B { bit hasEVEX_B = 1; }
+class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
+class EVEX_CD8<int esize, CD8VForm form> {
+  bits<2> EVEX_CD8E = !if(!eq(esize, 8),  0b00,
+                      !if(!eq(esize, 16), 0b01,
+                      !if(!eq(esize, 32), 0b10,
+                      !if(!eq(esize, 64), 0b11, ?))));
+  bits<3> EVEX_CD8V = form.Value;
+}
 class Has3DNow0F0FOpcode  { bit has3DNow0F0FOpcode = 1; }
 class MemOp4 { bit hasMemOp4Prefix = 1; }
 class XOP { bit hasXOP_Prefix = 1; }
@@ -177,6 +205,13 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
                             // to be encoded in a immediate field?
   bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?
   bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit
+  bit hasEVEXPrefix = 0;    // Does this inst require EVEX form?
+  bit hasEVEX_K = 0;        // Does this inst require masking?
+  bit hasEVEX_Z = 0;        // Does this inst set the EVEX_Z field?
+  bit hasEVEX_L2 = 0;       // Does this inst set the EVEX_L2 field?
+  bit hasEVEX_B = 0;        // Does this inst set the EVEX_B field?
+  bits<2> EVEX_CD8E = 0;    // Compressed disp8 form - element-size.
+  bits<3> EVEX_CD8V = 0;    // Compressed disp8 form - vector-width.
   bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding?
   bit hasMemOp4Prefix = 0;  // Same bit as VEX_W, but used for swapping operands
   bit hasXOP_Prefix = 0;    // Does this inst require an XOP prefix?
@@ -200,9 +235,16 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
   let TSFlags{37}    = hasVEX_i8ImmReg;
   let TSFlags{38}    = hasVEX_L;
   let TSFlags{39}    = ignoresVEX_L;
-  let TSFlags{40}    = has3DNow0F0FOpcode;
-  let TSFlags{41}    = hasMemOp4Prefix;
-  let TSFlags{42}    = hasXOP_Prefix;
+  let TSFlags{40}    = hasEVEXPrefix;
+  let TSFlags{41}    = hasEVEX_K;
+  let TSFlags{42}    = hasEVEX_Z;
+  let TSFlags{43}    = hasEVEX_L2;
+  let TSFlags{44}    = hasEVEX_B;
+  let TSFlags{46-45} = EVEX_CD8E;
+  let TSFlags{49-47} = EVEX_CD8V;
+  let TSFlags{50}    = has3DNow0F0FOpcode;
+  let TSFlags{51}    = hasMemOp4Prefix;
+  let TSFlags{52}    = hasXOP_Prefix;
 }
 
 class PseudoI<dag oops, dag iops, list<dag> pattern>
@@ -292,13 +334,17 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
 }
 
 def __xs : XS;
+def __xd : XD;
 
 // SI - SSE 1 & 2 scalar instructions
 class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
          list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin> {
-  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [UseAVX],
+                   !if(!eq(Prefix, __xs.Prefix), [UseSSE1],
+                   !if(!eq(Prefix, __xd.Prefix), [UseSSE2],
+                   !if(hasOpSizePrefix, [UseSSE2], [UseSSE1])))));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -308,8 +354,9 @@ class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin> {
-  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-            !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [UseAVX],
+                   !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2])));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -319,8 +366,9 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
          InstrItinClass itin, Domain d>
       : I<o, F, outs, ins, asm, pattern, itin, d> {
-  let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [HasAVX],
+                   !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
 
   // AVX instructions have a 'v' prefix in the mnemonic
   let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -337,11 +385,12 @@ class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> patter
 class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin, Domain d>
       : Ii8<o, F, outs, ins, asm, pattern, itin, d> {
-  let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
-        !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
+  let Predicates = !if(hasEVEXPrefix /* EVEX */, [HasAVX512],
+                   !if(hasVEXPrefix /* VEX */, [HasAVX],
+                   !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1])));
 
   // AVX instructions have a 'v' prefix in the mnemonic
-  let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
+  let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
 }
 
 // SSE1 Instruction Templates:
@@ -350,7 +399,7 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 //   PSI   - SSE1 instructions with TB prefix.
 //   PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
 //   VSSI  - SSE1 instructions with XS prefix in AVX form.
-//   VPSI  - SSE1 instructions with TB prefix in AVX form.
+//   VPSI  - SSE1 instructions with TB prefix in AVX form, packed single.
 
 class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
           list<dag> pattern, InstrItinClass itin = NoItinerary>
@@ -381,10 +430,13 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
 //   SDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix.
 //   S2SI   - SSE2 instructions with XS prefix.
 //   SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
-//   PDI    - SSE2 instructions with TB and OpSize prefixes.
+//   PDI    - SSE2 instructions with TB and OpSize prefixes, packed double domain.
 //   PDIi8  - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
-//   VSDI   - SSE2 instructions with XD prefix in AVX form.
-//   VPDI   - SSE2 instructions with TB and OpSize prefixes in AVX form.
+//   VSDI   - SSE2 scalar instructions with XD prefix in AVX form.
+//   VPDI   - SSE2 vector instructions with TB and OpSize prefixes in AVX form,
+//                 packed double domain.
+//   VS2I   - SSE2 scalar instructions with TB and OpSize prefixes in AVX form.
+//   S2I    - SSE2 scalar instructions with TB and OpSize prefixes.
 //   MMXSDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
 //               MMX operands.
 //   MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
@@ -413,7 +465,7 @@ class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
 class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
-        Requires<[HasAVX]>;
+        Requires<[UseAVX]>;
 class VS2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
             list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -422,6 +474,14 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
         OpSize, Requires<[HasAVX]>;
+class VS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, TB,
+        OpSize, Requires<[UseAVX]>;
+class S2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, TB,
+        OpSize, Requires<[UseSSE2]>;
 class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
                list<dag> pattern, InstrItinClass itin = NoItinerary>
       : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
@@ -539,12 +599,80 @@ class AVX2AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
       : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
         Requires<[HasAVX2]>;
 
+
+// AVX-512 Instruction Templates:
+//   Instructions introduced in AVX-512 (no SSE equivalent forms)
+//
+//   AVX5128I - AVX-512 instructions with T8 and OpSize prefix.
+//   AVX512AIi8 - AVX-512 instructions with TA, OpSize prefix and ImmT = Imm8.
+//   AVX512PDI  - AVX-512 instructions with TB, OpSize, double packed.
+//   AVX512PSI  - AVX-512 instructions with TB, single packed.
+//   AVX512XS8I - AVX-512 instructions with T8 and XS prefixes.
+//   AVX512XSI  - AVX-512 instructions with XS prefix, generic domain.
+//   AVX512BI   - AVX-512 instructions with TB, OpSize, int packed domain.
+//   AVX512SI   - AVX-512 scalar instructions with TB and OpSize prefixes.
+
+class AVX5128I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8, OpSize,
+        Requires<[HasAVX512]>;
+class AVX512XS8I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8XS,
+        Requires<[HasAVX512]>;
+class AVX512XSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, XS,
+        Requires<[HasAVX512]>;
+class AVX512XDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, XD,
+        Requires<[HasAVX512]>;
+class AVX512BI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+        Requires<[HasAVX512]>;
+class AVX512BIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+        Requires<[HasAVX512]>;
+class AVX512SI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB, OpSize,
+        Requires<[HasAVX512]>;
+class AVX512AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA, OpSize,
+        Requires<[HasAVX512]>;
+class AVX512Ii8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TB,
+      Requires<[HasAVX512]>;
+class AVX512PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB,
+        OpSize, Requires<[HasAVX512]>;
+class AVX512PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
+        Requires<[HasAVX512]>;
+class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+      : Ii8<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
+class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin, d>, TB, Requires<[HasAVX512]>;
+class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag>pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, T8,
+        OpSize, EVEX_4V, Requires<[HasAVX512]>;
+
 // AES Instruction Templates:
 //
 // AES8I
 // These use the same encoding as the SSE4.2 T8 and TA encodings.
 class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
-            list<dag>pattern, InstrItinClass itin = NoItinerary>
+            list<dag>pattern, InstrItinClass itin = IIC_AES>
       : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
         Requires<[HasAES]>;
 
@@ -614,6 +742,13 @@ class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
   let CodeSize = 3;
 }
 
+class RIi64_NOREX<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern, InstrItinClass itin = NoItinerary>
+  : X86Inst<o, f, Imm64, outs, ins, asm, itin> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
 class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : SSI<o, F, outs, ins, asm, pattern, itin>, REX_W;
@@ -626,11 +761,18 @@ class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : VPDI<o, F, outs, ins, asm, pattern, itin>, VEX_W;
+class RS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : S2I<o, F, outs, ins, asm, pattern, itin>, REX_W;
+class VRS2I<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : VS2I<o, F, outs, ins, asm, pattern, itin>, VEX_W;
 
 // MMX Instruction templates
 //
 
 // MMXI   - MMX instructions with TB prefix.
+// MMXI32 - MMX instructions with TB prefix valid only in 32 bit mode.
 // MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
 // MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
 // MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
@@ -640,6 +782,9 @@ class VRPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
 class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, 
            list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX]>;
+class MMXI32<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern, InstrItinClass itin = NoItinerary>
+      : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In32BitMode]>;
 class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, 
              list<dag> pattern, InstrItinClass itin = NoItinerary>
       : I<o, F, outs, ins, asm, pattern, itin>, TB, Requires<[HasMMX,In64BitMode]>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 2a72fb6..1fed424 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -47,6 +47,8 @@ def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
 def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
                         [SDNPCommutative, SDNPAssociative]>;
+def X86fandn   : SDNode<"X86ISD::FANDN",     SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
 def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
 def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
 def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
@@ -103,6 +105,13 @@ def X86vsext   : SDNode<"X86ISD::VSEXT",
                          SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                               SDTCisInt<0>, SDTCisInt<1>]>>;
 
+def X86vtrunc   : SDNode<"X86ISD::VTRUNC",
+                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>]>>;
+def X86vtruncm   : SDNode<"X86ISD::VTRUNCM",
+                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                              SDTCisInt<0>, SDTCisInt<1>,
+                                              SDTCisVec<2>, SDTCisInt<2>]>>;
 def X86vfpext  : SDNode<"X86ISD::VFPEXT",
                         SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
                                              SDTCisFP<0>, SDTCisFP<1>]>>;
@@ -116,6 +125,15 @@ def X86cmpp    : SDNode<"X86ISD::CMPP",      SDTX86VFCMP>;
 def X86pcmpeq  : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
 def X86pcmpgt  : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
 
+def X86IntCmpMask : SDTypeProfile<1, 2,
+    [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<1>]>;
+def X86pcmpeqm  : SDNode<"X86ISD::PCMPEQM", X86IntCmpMask, [SDNPCommutative]>;
+def X86pcmpgtm  : SDNode<"X86ISD::PCMPGTM", X86IntCmpMask>;
+
+def X86CmpMaskCC : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86cmpm   : SDNode<"X86ISD::CMPM",    X86CmpMaskCC>;
+def X86cmpmu  : SDNode<"X86ISD::CMPMU",   X86CmpMaskCC>;
+
 def X86vshl    : SDNode<"X86ISD::VSHL",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                       SDTCisVec<2>]>>;
@@ -136,6 +154,11 @@ def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
 def X86subus   : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
 def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
 def X86testp   : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
+def X86kortest : SDNode<"X86ISD::KORTEST", SDTX86CmpPTest>;
+def X86ktest   : SDNode<"X86ISD::KTEST", SDTX86CmpPTest>;
+def X86testm  : SDNode<"X86ISD::TESTM", SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisVec<1>,
+                                          SDTCisSameAs<2, 1>]>>;
 
 def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
@@ -147,13 +170,17 @@ def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
 def SDTShuff1Op : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
 def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                 SDTCisSameAs<0,2>]>;
+def SDTShuff3Op : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                SDTCisSameAs<0,2>, SDTCisSameAs<0,3>]>;
 
 def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                  SDTCisSameAs<0,1>, SDTCisInt<2>]>;
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                  SDTCisSameAs<0,2>, SDTCisInt<3>]>;
 
-def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcast  : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDTVBroadcastm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>]>;
+
 def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                              SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
 
@@ -188,10 +215,14 @@ def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
 def X86VPermilp  : SDNode<"X86ISD::VPERMILP", SDTShuff2OpI>;
 def X86VPermv    : SDNode<"X86ISD::VPERMV",   SDTShuff2Op>;
 def X86VPermi    : SDNode<"X86ISD::VPERMI",   SDTShuff2OpI>;
+def X86VPermv3   : SDNode<"X86ISD::VPERMV3",  SDTShuff3Op>;
 
 def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
 
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
+def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
+def X86Vinsert   : SDNode<"X86ISD::VINSERT",  SDTypeProfile<1, 3,
+                              [SDTCisSameAs<0, 1>, SDTCisPtrTy<3>]>, []>;
 
 def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
 def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
@@ -229,13 +260,13 @@ def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
 def ssmem : Operand<v4f32> {
   let PrintMethod = "printf32mem";
   let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
+  let ParserMatchClass = X86Mem32AsmOperand;
   let OperandType = "OPERAND_MEMORY";
 }
 def sdmem : Operand<v2f64> {
   let PrintMethod = "printf64mem";
   let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
-  let ParserMatchClass = X86MemAsmOperand;
+  let ParserMatchClass = X86Mem64AsmOperand;
   let OperandType = "OPERAND_MEMORY";
 }
 
@@ -255,9 +286,16 @@ def loadv8f32    : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
 def loadv4f64    : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
 def loadv4i64    : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
 
-// 128-/256-bit extload pattern fragments
+// 512-bit load pattern fragments
+def loadv16f32   : PatFrag<(ops node:$ptr), (v16f32 (load node:$ptr))>;
+def loadv8f64    : PatFrag<(ops node:$ptr), (v8f64 (load node:$ptr))>;
+def loadv16i32   : PatFrag<(ops node:$ptr), (v16i32 (load node:$ptr))>;
+def loadv8i64    : PatFrag<(ops node:$ptr), (v8i64 (load node:$ptr))>;
+
+// 128-/256-/512-bit extload pattern fragments
 def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
 def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
 
 // Like 'store', but always requires 128-bit vector alignment.
 def alignedstore : PatFrag<(ops node:$val, node:$ptr),
@@ -271,6 +309,12 @@ def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
   return cast<StoreSDNode>(N)->getAlignment() >= 32;
 }]>;
 
+// Like 'store', but always requires 512-bit vector alignment.
+def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
+                              (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 64;
+}]>;
+
 // Like 'load', but always requires 128-bit vector alignment.
 def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 16;
@@ -286,6 +330,11 @@ def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
   return cast<LoadSDNode>(N)->getAlignment() >= 32;
 }]>;
 
+// Like 'load', but always requires 512-bit vector alignment.
+def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 64;
+}]>;
+
 def alignedloadfsf32 : PatFrag<(ops node:$ptr),
                                (f32 (alignedload node:$ptr))>;
 def alignedloadfsf64 : PatFrag<(ops node:$ptr),
@@ -309,6 +358,16 @@ def alignedloadv4f64 : PatFrag<(ops node:$ptr),
 def alignedloadv4i64 : PatFrag<(ops node:$ptr),
                                (v4i64 (alignedload256 node:$ptr))>;
 
+// 512-bit aligned load pattern fragments
+def alignedloadv16f32 : PatFrag<(ops node:$ptr),
+                                (v16f32 (alignedload512 node:$ptr))>;
+def alignedloadv16i32 : PatFrag<(ops node:$ptr),
+                                (v16i32 (alignedload512 node:$ptr))>;
+def alignedloadv8f64  : PatFrag<(ops node:$ptr),
+                                (v8f64  (alignedload512 node:$ptr))>;
+def alignedloadv8i64  : PatFrag<(ops node:$ptr),
+                                (v8i64  (alignedload512 node:$ptr))>;
+
 // Like 'load', but uses special alignment checks suitable for use in
 // memory operands in most SSE instructions, which are required to
 // be naturally aligned on some targets but not on others.  If the subtarget
@@ -320,6 +379,16 @@ def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
          || cast<LoadSDNode>(N)->getAlignment() >= 16;
 }]>;
 
+def memop4 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 4;
+}]>;
+
+def memop8 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
 def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
 def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
 
@@ -335,6 +404,12 @@ def memopv8f32 : PatFrag<(ops node:$ptr), (v8f32 (memop node:$ptr))>;
 def memopv4f64 : PatFrag<(ops node:$ptr), (v4f64 (memop node:$ptr))>;
 def memopv4i64 : PatFrag<(ops node:$ptr), (v4i64 (memop node:$ptr))>;
 
+// 512-bit memop pattern fragments
+def memopv16f32 : PatFrag<(ops node:$ptr), (v16f32 (memop4 node:$ptr))>;
+def memopv8f64  : PatFrag<(ops node:$ptr), (v8f64  (memop8 node:$ptr))>;
+def memopv16i32 : PatFrag<(ops node:$ptr), (v16i32 (memop4 node:$ptr))>;
+def memopv8i64  : PatFrag<(ops node:$ptr), (v8i64  (memop8 node:$ptr))>;
+
 // SSSE3 uses MMX registers for some instructions. They aren't aligned on a
 // 16-byte boundary.
 // FIXME: 8 byte alignment for mmx reads is not required
@@ -384,6 +459,10 @@ def bc_v16i16 : PatFrag<(ops node:$in), (v16i16 (bitconvert node:$in))>;
 def bc_v8i32 : PatFrag<(ops node:$in), (v8i32 (bitconvert node:$in))>;
 def bc_v4i64 : PatFrag<(ops node:$in), (v4i64 (bitconvert node:$in))>;
 
+// 512-bit bitconvert pattern fragments
+def bc_v16i32 : PatFrag<(ops node:$in), (v16i32 (bitconvert node:$in))>;
+def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
+
 def vzmovl_v2i64 : PatFrag<(ops node:$src),
                            (bitconvert (v2i64 (X86vzmovl
                              (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
@@ -405,28 +484,54 @@ def BYTE_imm  : SDNodeXForm<imm, [{
   return getI32Imm(N->getZExtValue() >> 3);
 }]>;
 
-// EXTRACT_get_vextractf128_imm xform function: convert extract_subvector index
-// to VEXTRACTF128 imm.
-def EXTRACT_get_vextractf128_imm : SDNodeXForm<extract_subvector, [{
-  return getI8Imm(X86::getExtractVEXTRACTF128Immediate(N));
+// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
+// to VEXTRACTF128/VEXTRACTI128 imm.
+def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
+  return getI8Imm(X86::getExtractVEXTRACT128Immediate(N));
+}]>;
+
+// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
+// VINSERTF128/VINSERTI128 imm.
+def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
+  return getI8Imm(X86::getInsertVINSERT128Immediate(N));
+}]>;
+
+// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
+// to VEXTRACTF64x4 imm.
+def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
+  return getI8Imm(X86::getExtractVEXTRACT256Immediate(N));
 }]>;
 
-// INSERT_get_vinsertf128_imm xform function: convert insert_subvector index to
-// VINSERTF128 imm.
-def INSERT_get_vinsertf128_imm : SDNodeXForm<insert_subvector, [{
-  return getI8Imm(X86::getInsertVINSERTF128Immediate(N));
+// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
+// VINSERTF64x4 imm.
+def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
+  return getI8Imm(X86::getInsertVINSERT256Immediate(N));
 }]>;
 
-def vextractf128_extract : PatFrag<(ops node:$bigvec, node:$index),
+def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
+                                   (extract_subvector node:$bigvec,
+                                                      node:$index), [{
+  return X86::isVEXTRACT128Index(N);
+}], EXTRACT_get_vextract128_imm>;
+
+def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+                                      node:$index),
+                                 (insert_subvector node:$bigvec, node:$smallvec,
+                                                   node:$index), [{
+  return X86::isVINSERT128Index(N);
+}], INSERT_get_vinsert128_imm>;
+
+
+def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
                                    (extract_subvector node:$bigvec,
                                                       node:$index), [{
-  return X86::isVEXTRACTF128Index(N);
-}], EXTRACT_get_vextractf128_imm>;
+  return X86::isVEXTRACT256Index(N);
+}], EXTRACT_get_vextract256_imm>;
 
-def vinsertf128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
+def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
                                       node:$index),
                                  (insert_subvector node:$bigvec, node:$smallvec,
                                                    node:$index), [{
-  return X86::isVINSERTF128Index(N);
-}], INSERT_get_vinsertf128_imm>;
+  return X86::isVINSERT256Index(N);
+}], INSERT_get_vinsert256_imm>;
 
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 7c0423f..2461773 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -35,7 +36,7 @@
 #include "llvm/Target/TargetOptions.h"
 #include <limits>
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "X86GenInstrInfo.inc"
 
 using namespace llvm;
@@ -81,6 +82,7 @@ enum {
   TB_ALIGN_NONE  =    0 << TB_ALIGN_SHIFT,
   TB_ALIGN_16    =   16 << TB_ALIGN_SHIFT,
   TB_ALIGN_32    =   32 << TB_ALIGN_SHIFT,
+  TB_ALIGN_64    =   64 << TB_ALIGN_SHIFT,
   TB_ALIGN_MASK  = 0xff << TB_ALIGN_SHIFT
 };
 
@@ -90,6 +92,9 @@ struct X86OpTblEntry {
   uint16_t Flags;
 };
 
+// Pin the vtable to this file.
+void X86InstrInfo::anchor() {}
+
 X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
   : X86GenInstrInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
                      ? X86::ADJCALLSTACKDOWN64
@@ -97,7 +102,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
                     (tm.getSubtarget<X86Subtarget>().is64Bit()
                      ? X86::ADJCALLSTACKUP64
                      : X86::ADJCALLSTACKUP32)),
-    TM(tm), RI(tm, *this) {
+    TM(tm), RI(tm) {
 
   static const X86OpTblEntry OpTbl2Addr[] = {
     { X86::ADC32ri,     X86::ADC32mi,    0 },
@@ -298,8 +303,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::DIV64r,      X86::DIV64m,        TB_FOLDED_LOAD },
     { X86::DIV8r,       X86::DIV8m,         TB_FOLDED_LOAD },
     { X86::EXTRACTPSrr, X86::EXTRACTPSmr,   TB_FOLDED_STORE },
-    { X86::FsMOVAPDrr,  X86::MOVSDmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
-    { X86::FsMOVAPSrr,  X86::MOVSSmr,       TB_FOLDED_STORE | TB_NO_REVERSE },
     { X86::IDIV16r,     X86::IDIV16m,       TB_FOLDED_LOAD },
     { X86::IDIV32r,     X86::IDIV32m,       TB_FOLDED_LOAD },
     { X86::IDIV64r,     X86::IDIV64m,       TB_FOLDED_LOAD },
@@ -356,8 +359,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::TEST8ri,     X86::TEST8mi,       TB_FOLDED_LOAD },
     // AVX 128-bit versions of foldable instructions
     { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr,  TB_FOLDED_STORE  },
-    { X86::FsVMOVAPDrr, X86::VMOVSDmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
-    { X86::FsVMOVAPSrr, X86::VMOVSSmr,      TB_FOLDED_STORE | TB_NO_REVERSE },
     { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPDrr,   X86::VMOVAPDmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
     { X86::VMOVAPSrr,   X86::VMOVAPSmr,     TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -374,7 +375,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VMOVAPSYrr,  X86::VMOVAPSYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVDQAYrr,  X86::VMOVDQAYmr,    TB_FOLDED_STORE | TB_ALIGN_32 },
     { X86::VMOVUPDYrr,  X86::VMOVUPDYmr,    TB_FOLDED_STORE },
-    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE }
+    { X86::VMOVUPSYrr,  X86::VMOVUPSYmr,    TB_FOLDED_STORE },
+    // AVX-512 foldable instructions
+    { X86::VMOVPDI2DIZrr,X86::VMOVPDI2DIZmr,  TB_FOLDED_STORE }
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
@@ -400,8 +403,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm,         0 },
     { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm,       0 },
     { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm,         0 },
-    { X86::FsMOVAPDrr,      X86::MOVSDrm,             TB_NO_REVERSE },
-    { X86::FsMOVAPSrr,      X86::MOVSSrm,             TB_NO_REVERSE },
     { X86::IMUL16rri,       X86::IMUL16rmi,           0 },
     { X86::IMUL16rri8,      X86::IMUL16rmi8,          0 },
     { X86::IMUL32rri,       X86::IMUL32rmi,           0 },
@@ -444,16 +445,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::MOVSX64rr8,      X86::MOVSX64rm8,          0 },
     { X86::MOVUPDrr,        X86::MOVUPDrm,            TB_ALIGN_16 },
     { X86::MOVUPSrr,        X86::MOVUPSrm,            0 },
-    { X86::MOVZDI2PDIrr,    X86::MOVZDI2PDIrm,        0 },
     { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm,        0 },
     { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm,     TB_ALIGN_16 },
     { X86::MOVZX16rr8,      X86::MOVZX16rm8,          0 },
     { X86::MOVZX32rr16,     X86::MOVZX32rm16,         0 },
     { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8,   0 },
     { X86::MOVZX32rr8,      X86::MOVZX32rm8,          0 },
-    { X86::MOVZX64rr16,     X86::MOVZX64rm16,         0 },
-    { X86::MOVZX64rr32,     X86::MOVZX64rm32,         0 },
-    { X86::MOVZX64rr8,      X86::MOVZX64rm8,          0 },
     { X86::PABSBrr128,      X86::PABSBrm128,          TB_ALIGN_16 },
     { X86::PABSDrr128,      X86::PABSDrm128,          TB_ALIGN_16 },
     { X86::PABSWrr128,      X86::PABSWrm128,          TB_ALIGN_16 },
@@ -496,8 +493,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VCVTSD2SIrr,     X86::VCVTSD2SIrm,         0 },
     { X86::VCVTSS2SI64rr,   X86::VCVTSS2SI64rm,       0 },
     { X86::VCVTSS2SIrr,     X86::VCVTSS2SIrm,         0 },
-    { X86::FsVMOVAPDrr,     X86::VMOVSDrm,            TB_NO_REVERSE },
-    { X86::FsVMOVAPSrr,     X86::VMOVSSrm,            TB_NO_REVERSE },
     { X86::VMOV64toPQIrr,   X86::VMOVQI2PQIrm,        0 },
     { X86::VMOV64toSDrr,    X86::VMOV64toSDrm,        0 },
     { X86::VMOVAPDrr,       X86::VMOVAPDrm,           TB_ALIGN_16 },
@@ -510,7 +505,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VMOVSHDUPrr,     X86::VMOVSHDUPrm,         TB_ALIGN_16 },
     { X86::VMOVUPDrr,       X86::VMOVUPDrm,           0 },
     { X86::VMOVUPSrr,       X86::VMOVUPSrm,           0 },
-    { X86::VMOVZDI2PDIrr,   X86::VMOVZDI2PDIrm,       0 },
     { X86::VMOVZQI2PQIrr,   X86::VMOVZQI2PQIrm,       0 },
     { X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm,    TB_ALIGN_16 },
     { X86::VPABSBrr128,     X86::VPABSBrm128,         0 },
@@ -555,11 +549,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm,     TB_NO_REVERSE },
     { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm,     TB_NO_REVERSE },
 
-    // BMI/BMI2/LZCNT/POPCNT foldable instructions
+    // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
     { X86::BEXTR32rr,       X86::BEXTR32rm,           0 },
     { X86::BEXTR64rr,       X86::BEXTR64rm,           0 },
+    { X86::BEXTRI32ri,      X86::BEXTRI32mi,          0 },
+    { X86::BEXTRI64ri,      X86::BEXTRI64mi,          0 },
+    { X86::BLCFILL32rr,     X86::BLCFILL32rm,         0 },
+    { X86::BLCFILL64rr,     X86::BLCFILL64rm,         0 },
+    { X86::BLCI32rr,        X86::BLCI32rm,            0 },
+    { X86::BLCI64rr,        X86::BLCI64rm,            0 },
+    { X86::BLCIC32rr,       X86::BLCIC32rm,           0 },
+    { X86::BLCIC64rr,       X86::BLCIC64rm,           0 },
+    { X86::BLCMSK32rr,      X86::BLCMSK32rm,          0 },
+    { X86::BLCMSK64rr,      X86::BLCMSK64rm,          0 },
+    { X86::BLCS32rr,        X86::BLCS32rm,            0 },
+    { X86::BLCS64rr,        X86::BLCS64rm,            0 },
+    { X86::BLSFILL32rr,     X86::BLSFILL32rm,         0 },
+    { X86::BLSFILL64rr,     X86::BLSFILL64rm,         0 },
     { X86::BLSI32rr,        X86::BLSI32rm,            0 },
     { X86::BLSI64rr,        X86::BLSI64rm,            0 },
+    { X86::BLSIC32rr,       X86::BLSIC32rm,           0 },
+    { X86::BLSIC64rr,       X86::BLSIC64rm,           0 },
     { X86::BLSMSK32rr,      X86::BLSMSK32rm,          0 },
     { X86::BLSMSK64rr,      X86::BLSMSK64rm,          0 },
     { X86::BLSR32rr,        X86::BLSR32rm,            0 },
@@ -580,9 +590,27 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::SHRX64rr,        X86::SHRX64rm,            0 },
     { X86::SHLX32rr,        X86::SHLX32rm,            0 },
     { X86::SHLX64rr,        X86::SHLX64rm,            0 },
+    { X86::T1MSKC32rr,      X86::T1MSKC32rm,          0 },
+    { X86::T1MSKC64rr,      X86::T1MSKC64rm,          0 },
     { X86::TZCNT16rr,       X86::TZCNT16rm,           0 },
     { X86::TZCNT32rr,       X86::TZCNT32rm,           0 },
     { X86::TZCNT64rr,       X86::TZCNT64rm,           0 },
+    { X86::TZMSK32rr,       X86::TZMSK32rm,           0 },
+    { X86::TZMSK64rr,       X86::TZMSK64rm,           0 },
+
+    // AVX-512 foldable instructions
+    { X86::VMOV64toPQIZrr,  X86::VMOVQI2PQIZrm,       0 },
+    { X86::VMOVDI2SSZrr,    X86::VMOVDI2SSZrm,        0 },
+    { X86::VMOVDQA32rr,     X86::VMOVDQA32rm,         TB_ALIGN_64 },
+    { X86::VMOVDQA64rr,     X86::VMOVDQA64rm,         TB_ALIGN_64 },
+    { X86::VMOVDQU32rr,     X86::VMOVDQU32rm,         0 },
+    { X86::VMOVDQU64rr,     X86::VMOVDQU64rm,         0 },
+
+    // AES foldable instructions
+    { X86::AESIMCrr,              X86::AESIMCrm,              TB_ALIGN_16 },
+    { X86::AESKEYGENASSIST128rr,  X86::AESKEYGENASSIST128rm,  TB_ALIGN_16 },
+    { X86::VAESIMCrr,             X86::VAESIMCrm,             TB_ALIGN_16 },
+    { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, TB_ALIGN_16 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -1180,6 +1208,52 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::PDEP64rr,          X86::PDEP64rm,            0 },
     { X86::PEXT32rr,          X86::PEXT32rm,            0 },
     { X86::PEXT64rr,          X86::PEXT64rm,            0 },
+
+    // AVX-512 foldable instructions
+    { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
+    { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
+    { X86::VADDPSZrr,         X86::VADDPSZrm,           0 },
+    { X86::VADDPDZrr,         X86::VADDPDZrm,           0 },
+    { X86::VSUBPSZrr,         X86::VSUBPSZrm,           0 },
+    { X86::VSUBPDZrr,         X86::VSUBPDZrm,           0 },
+    { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
+    { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
+    { X86::VDIVPSZrr,         X86::VDIVPSZrm,           0 },
+    { X86::VDIVPDZrr,         X86::VDIVPDZrm,           0 },
+    { X86::VMINPSZrr,         X86::VMINPSZrm,           0 },
+    { X86::VMINPDZrr,         X86::VMINPDZrm,           0 },
+    { X86::VMAXPSZrr,         X86::VMAXPSZrm,           0 },
+    { X86::VMAXPDZrr,         X86::VMAXPDZrm,           0 },
+    { X86::VPERMPDZri,        X86::VPERMPDZmi,          0 },
+    { X86::VPERMPSZrr,        X86::VPERMPSZrm,          0 },
+    { X86::VPSLLVDZrr,        X86::VPSLLVDZrm,          0 },
+    { X86::VPSLLVQZrr,        X86::VPSLLVQZrm,          0 },
+    { X86::VPSRAVDZrr,        X86::VPSRAVDZrm,          0 },
+    { X86::VPSRLVDZrr,        X86::VPSRLVDZrm,          0 },
+    { X86::VPSRLVQZrr,        X86::VPSRLVQZrm,          0 },
+    { X86::VSHUFPDZrri,       X86::VSHUFPDZrmi,         0 },
+    { X86::VSHUFPSZrri,       X86::VSHUFPSZrmi,         0 },
+    { X86::VALIGNQrri,        X86::VALIGNQrmi,          0 },
+    { X86::VALIGNDrri,        X86::VALIGNDrmi,          0 },
+
+    // AES foldable instructions
+    { X86::AESDECLASTrr,      X86::AESDECLASTrm,        TB_ALIGN_16 },
+    { X86::AESDECrr,          X86::AESDECrm,            TB_ALIGN_16 },
+    { X86::AESENCLASTrr,      X86::AESENCLASTrm,        TB_ALIGN_16 },
+    { X86::AESENCrr,          X86::AESENCrm,            TB_ALIGN_16 },
+    { X86::VAESDECLASTrr,     X86::VAESDECLASTrm,       TB_ALIGN_16 },
+    { X86::VAESDECrr,         X86::VAESDECrm,           TB_ALIGN_16 },
+    { X86::VAESENCLASTrr,     X86::VAESENCLASTrm,       TB_ALIGN_16 },
+    { X86::VAESENCrr,         X86::VAESENCrm,           TB_ALIGN_16 },
+
+    // SHA foldable instructions
+    { X86::SHA1MSG1rr,        X86::SHA1MSG1rm,          TB_ALIGN_16 },
+    { X86::SHA1MSG2rr,        X86::SHA1MSG2rm,          TB_ALIGN_16 },
+    { X86::SHA1NEXTErr,       X86::SHA1NEXTErm,         TB_ALIGN_16 },
+    { X86::SHA1RNDS4rri,      X86::SHA1RNDS4rmi,        TB_ALIGN_16 },
+    { X86::SHA256MSG1rr,      X86::SHA256MSG1rm,        TB_ALIGN_16 },
+    { X86::SHA256MSG2rr,      X86::SHA256MSG2rm,        TB_ALIGN_16 },
+    { X86::SHA256RNDS2rr,     X86::SHA256RNDS2rm,       TB_ALIGN_16 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1341,6 +1415,11 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
     { X86::VFMSUBADDPD4rr,        X86::VFMSUBADDPD4rm,        TB_ALIGN_16 },
     { X86::VFMSUBADDPS4rrY,       X86::VFMSUBADDPS4rmY,       TB_ALIGN_32 },
     { X86::VFMSUBADDPD4rrY,       X86::VFMSUBADDPD4rmY,       TB_ALIGN_32 },
+    // AVX-512 VPERMI instructions with 3 source operands.
+    { X86::VPERMI2Drr,            X86::VPERMI2Drm,            0 },
+    { X86::VPERMI2Qrr,            X86::VPERMI2Qrm,            0 },
+    { X86::VPERMI2PSrr,           X86::VPERMI2PSrm,           0 },
+    { X86::VPERMI2PDrr,           X86::VPERMI2PDrm,           0 },
   };
 
   for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -1381,7 +1460,6 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
   case X86::MOVSX32rr8:
   case X86::MOVZX32rr8:
   case X86::MOVSX64rr8:
-  case X86::MOVZX64rr8:
     if (!TM.getSubtarget<X86Subtarget>().is64Bit())
       // It's not always legal to reference the low 8-bit of the larger
       // register in 32-bit mode.
@@ -1389,9 +1467,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
   case X86::MOVSX32rr16:
   case X86::MOVZX32rr16:
   case X86::MOVSX64rr16:
-  case X86::MOVZX64rr16:
-  case X86::MOVSX64rr32:
-  case X86::MOVZX64rr32: {
+  case X86::MOVSX64rr32: {
     if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
       // Be conservative.
       return false;
@@ -1404,17 +1480,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
     case X86::MOVSX32rr8:
     case X86::MOVZX32rr8:
     case X86::MOVSX64rr8:
-    case X86::MOVZX64rr8:
       SubIdx = X86::sub_8bit;
       break;
     case X86::MOVSX32rr16:
     case X86::MOVZX32rr16:
     case X86::MOVSX64rr16:
-    case X86::MOVZX64rr16:
       SubIdx = X86::sub_16bit;
       break;
     case X86::MOVSX64rr32:
-    case X86::MOVZX64rr32:
       SubIdx = X86::sub_32bit;
       break;
     }
@@ -1463,6 +1536,8 @@ static bool isFrameLoadOpcode(int Opcode) {
   case X86::VMOVDQAYrm:
   case X86::MMX_MOVD64rm:
   case X86::MMX_MOVQ64rm:
+  case X86::VMOVDQA32rm:
+  case X86::VMOVDQA64rm:
     return true;
   }
 }
@@ -1722,37 +1797,16 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                  unsigned DestReg, unsigned SubIdx,
                                  const MachineInstr *Orig,
                                  const TargetRegisterInfo &TRI) const {
-  DebugLoc DL = Orig->getDebugLoc();
-
-  // MOV32r0 etc. are implemented with xor which clobbers condition code.
-  // Re-materialize them as movri instructions to avoid side effects.
-  bool Clone = true;
+  // MOV32r0 is implemented with a xor which clobbers condition code.
+  // Re-materialize it as movri instructions to avoid side effects.
   unsigned Opc = Orig->getOpcode();
-  switch (Opc) {
-  default: break;
-  case X86::MOV8r0:
-  case X86::MOV16r0:
-  case X86::MOV32r0:
-  case X86::MOV64r0: {
-    if (!isSafeToClobberEFLAGS(MBB, I)) {
-      switch (Opc) {
-      default: llvm_unreachable("Unreachable!");
-      case X86::MOV8r0:  Opc = X86::MOV8ri;  break;
-      case X86::MOV16r0: Opc = X86::MOV16ri; break;
-      case X86::MOV32r0: Opc = X86::MOV32ri; break;
-      case X86::MOV64r0: Opc = X86::MOV64ri64i32; break;
-      }
-      Clone = false;
-    }
-    break;
-  }
-  }
-
-  if (Clone) {
+  if (Opc == X86::MOV32r0 && !isSafeToClobberEFLAGS(MBB, I)) {
+    DebugLoc DL = Orig->getDebugLoc();
+    BuildMI(MBB, I, DL, get(X86::MOV32ri)).addOperand(Orig->getOperand(0))
+      .addImm(0);
+  } else {
     MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
     MBB.insert(I, MI);
-  } else {
-    BuildMI(MBB, I, DL, get(Opc)).addOperand(Orig->getOperand(0)).addImm(0);
   }
 
   MachineInstr *NewMI = prior(I);
@@ -1772,6 +1826,98 @@ static bool hasLiveCondCodeDef(MachineInstr *MI) {
   return false;
 }
 
+/// getTruncatedShiftCount - check whether the shift count for a machine operand
+/// is non-zero.
+inline static unsigned getTruncatedShiftCount(MachineInstr *MI,
+                                              unsigned ShiftAmtOperandIdx) {
+  // The shift count is six bits with the REX.W prefix and five bits without.
+  unsigned ShiftCountMask = (MI->getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
+  unsigned Imm = MI->getOperand(ShiftAmtOperandIdx).getImm();
+  return Imm & ShiftCountMask;
+}
+
+/// isTruncatedShiftCountForLEA - check whether the given shift count is appropriate
+/// can be represented by a LEA instruction.
+inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
+  // Left shift instructions can be transformed into load-effective-address
+  // instructions if we can encode them appropriately.
+  // A LEA instruction utilizes a SIB byte to encode it's scale factor.
+  // The SIB.scale field is two bits wide which means that we can encode any
+  // shift amount less than 4.
+  return ShAmt < 4 && ShAmt > 0;
+}
+
+bool X86InstrInfo::classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+                                  unsigned Opc, bool AllowSP,
+                                  unsigned &NewSrc, bool &isKill, bool &isUndef,
+                                  MachineOperand &ImplicitOp) const {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  const TargetRegisterClass *RC;
+  if (AllowSP) {
+    RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
+  } else {
+    RC = Opc != X86::LEA32r ?
+      &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
+  }
+  unsigned SrcReg = Src.getReg();
+
+  // For both LEA64 and LEA32 the register already has essentially the right
+  // type (32-bit or 64-bit) we may just need to forbid SP.
+  if (Opc != X86::LEA64_32r) {
+    NewSrc = SrcReg;
+    isKill = Src.isKill();
+    isUndef = Src.isUndef();
+
+    if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
+        !MF.getRegInfo().constrainRegClass(NewSrc, RC))
+      return false;
+
+    return true;
+  }
+
+  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
+  // another we need to add 64-bit registers to the final MI.
+  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+    ImplicitOp = Src;
+    ImplicitOp.setImplicit();
+
+    NewSrc = getX86SubSuperRegister(Src.getReg(), MVT::i64);
+    MachineBasicBlock::LivenessQueryResult LQR =
+      MI->getParent()->computeRegisterLiveness(&getRegisterInfo(), NewSrc, MI);
+
+    switch (LQR) {
+    case MachineBasicBlock::LQR_Unknown:
+      // We can't give sane liveness flags to the instruction, abandon LEA
+      // formation.
+      return false;
+    case MachineBasicBlock::LQR_Live:
+      isKill = MI->killsRegister(SrcReg);
+      isUndef = false;
+      break;
+    default:
+      // The physreg itself is dead, so we have to use it as an <undef>.
+      isKill = false;
+      isUndef = true;
+      break;
+    }
+  } else {
+    // Virtual register of the wrong class, we have to create a temporary 64-bit
+    // vreg to feed into the LEA.
+    NewSrc = MF.getRegInfo().createVirtualRegister(RC);
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+            get(TargetOpcode::COPY))
+      .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
+        .addOperand(Src);
+
+    // Which is obviously going to be dead after we're done with it.
+    isKill = true;
+    isUndef = false;
+  }
+
+  // We've set all the parameters without issue.
+  return true;
+}
+
 /// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
 /// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
 /// to a 32-bit superregister and then truncating back down to a 16-bit
@@ -1787,11 +1933,16 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
   bool isDead = MI->getOperand(0).isDead();
   bool isKill = MI->getOperand(1).isKill();
 
-  unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
-    ? X86::LEA64_32r : X86::LEA32r;
   MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
-  unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
   unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
+  unsigned Opc, leaInReg;
+  if (TM.getSubtarget<X86Subtarget>().is64Bit()) {
+    Opc = X86::LEA64_32r;
+    leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+  } else {
+    Opc = X86::LEA32r;
+    leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+  }
 
   // Build and insert into an implicit UNDEF value. This is OK because
   // well be shifting and then extracting the lower 16-bits.
@@ -1841,7 +1992,10 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
       // just a single insert_subreg.
       addRegReg(MIB, leaInReg, true, leaInReg, false);
     } else {
-      leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
+      if (TM.getSubtarget<X86Subtarget>().is64Bit())
+        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
+      else
+        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
       // Build and insert into an implicit UNDEF value. This is OK because
       // well be shifting and then extracting the lower 16-bits.
       BuildMI(*MFI, &*MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF),leaInReg2);
@@ -1891,6 +2045,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                     MachineBasicBlock::iterator &MBBI,
                                     LiveVariables *LV) const {
   MachineInstr *MI = MBBI;
+
+  // The following opcodes also sets the condition code register(s). Only
+  // convert them to equivalent lea if the condition code register def's
+  // are dead!
+  if (hasLiveCondCodeDef(MI))
+    return 0;
+
   MachineFunction &MF = *MI->getParent()->getParent();
   // All instructions input are two-addr instructions.  Get the known operands.
   const MachineOperand &Dest = MI->getOperand(0);
@@ -1935,10 +2096,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   }
   case X86::SHL64ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
-    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
-    // the flags produced by a shift yet, so this is safe.
-    unsigned ShAmt = MI->getOperand(2).getImm();
-    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
 
     // LEA can't handle RSP.
     if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
@@ -1953,29 +2112,34 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   }
   case X86::SHL32ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
-    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
-    // the flags produced by a shift yet, so this is safe.
-    unsigned ShAmt = MI->getOperand(2).getImm();
-    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+
+    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
     // LEA can't handle ESP.
-    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
-        !MF.getRegInfo().constrainRegClass(Src.getReg(),
-                                           &X86::GR32_NOSPRegClass))
+    bool isKill, isUndef;
+    unsigned SrcReg;
+    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                        SrcReg, isKill, isUndef, ImplicitOp))
       return 0;
 
-    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-    NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+    MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
       .addOperand(Dest)
-      .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
+      .addReg(0).addImm(1 << ShAmt)
+      .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+      .addImm(0).addReg(0);
+    if (ImplicitOp.getReg() != 0)
+      MIB.addOperand(ImplicitOp);
+    NewMI = MIB;
+
     break;
   }
   case X86::SHL16ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
-    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
-    // the flags produced by a shift yet, so this is safe.
-    unsigned ShAmt = MI->getOperand(2).getImm();
-    if (ShAmt == 0 || ShAmt >= 4) return 0;
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
 
     if (DisableLEA16)
       return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
@@ -1985,11 +2149,6 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     break;
   }
   default: {
-    // The following opcodes also sets the condition code register(s). Only
-    // convert them to equivalent lea if the condition code register def's
-    // are dead!
-    if (hasLiveCondCodeDef(MI))
-      return 0;
 
     switch (MIOpc) {
     default: return 0;
@@ -1999,17 +2158,20 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
       unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
         : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      const TargetRegisterClass *RC = MIOpc == X86::INC64r ?
-        (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
-        (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
-
-      // LEA can't handle RSP.
-      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
-          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                          SrcReg, isKill, isUndef, ImplicitOp))
         return 0;
 
-      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                        .addOperand(Dest).addOperand(Src), 1);
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+          .addOperand(Dest)
+          .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+
+      NewMI = addOffset(MIB, 1);
       break;
     }
     case X86::INC16r:
@@ -2026,16 +2188,22 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
       unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
         : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
-      const TargetRegisterClass *RC = MIOpc == X86::DEC64r ?
-        (const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
-        (const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
-      // LEA can't handle RSP.
-      if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
-          !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
+
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
+                          SrcReg, isKill, isUndef, ImplicitOp))
         return 0;
 
-      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                        .addOperand(Dest).addOperand(Src), -1);
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+          .addOperand(Dest)
+          .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+
+      NewMI = addOffset(MIB, -1);
+
       break;
     }
     case X86::DEC16r:
@@ -2052,36 +2220,41 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::ADD32rr_DB: {
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       unsigned Opc;
-      const TargetRegisterClass *RC;
-      if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB) {
+      if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
         Opc = X86::LEA64r;
-        RC = &X86::GR64_NOSPRegClass;
-      } else {
+      else
         Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-        RC = &X86::GR32_NOSPRegClass;
-      }
-
 
-      unsigned Src2 = MI->getOperand(2).getReg();
-      bool isKill2 = MI->getOperand(2).isKill();
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                          SrcReg, isKill, isUndef, ImplicitOp))
+        return 0;
 
-      // LEA can't handle RSP.
-      if (TargetRegisterInfo::isVirtualRegister(Src2) &&
-          !MF.getRegInfo().constrainRegClass(Src2, RC))
+      const MachineOperand &Src2 = MI->getOperand(2);
+      bool isKill2, isUndef2;
+      unsigned SrcReg2;
+      MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
+                          SrcReg2, isKill2, isUndef2, ImplicitOp2))
         return 0;
 
-      NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                        .addOperand(Dest),
-                        Src.getReg(), Src.isKill(), Src2, isKill2);
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+        .addOperand(Dest);
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+      if (ImplicitOp2.getReg() != 0)
+        MIB.addOperand(ImplicitOp2);
+
+      NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
 
       // Preserve undefness of the operands.
-      bool isUndef = MI->getOperand(1).isUndef();
-      bool isUndef2 = MI->getOperand(2).isUndef();
       NewMI->getOperand(1).setIsUndef(isUndef);
       NewMI->getOperand(3).setIsUndef(isUndef2);
 
-      if (LV && isKill2)
-        LV->replaceKillInstruction(Src2, MI, NewMI);
+      if (LV && Src2.isKill())
+        LV->replaceKillInstruction(SrcReg2, MI, NewMI);
       break;
     }
     case X86::ADD16rr:
@@ -2120,9 +2293,21 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::ADD32ri8_DB: {
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
-      NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
-                        .addOperand(Dest).addOperand(Src),
-                        MI->getOperand(2).getImm());
+
+      bool isKill, isUndef;
+      unsigned SrcReg;
+      MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
+      if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
+                          SrcReg, isKill, isUndef, ImplicitOp))
+        return 0;
+
+      MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+          .addOperand(Dest)
+          .addReg(SrcReg, getUndefRegState(isUndef) | getKillRegState(isKill));
+      if (ImplicitOp.getReg() != 0)
+        MIB.addOperand(ImplicitOp);
+
+      NewMI = addOffset(MIB, MI->getOperand(2).getImm());
       break;
     }
     case X86::ADD16ri:
@@ -2789,23 +2974,29 @@ static bool isHReg(unsigned Reg) {
 
 // Try and copy between VR128/VR64 and GR64 registers.
 static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
-                                        bool HasAVX) {
+                                        const X86Subtarget& Subtarget) {
+
+
   // SrcReg(VR128) -> DestReg(GR64)
   // SrcReg(VR64)  -> DestReg(GR64)
   // SrcReg(GR64)  -> DestReg(VR128)
   // SrcReg(GR64)  -> DestReg(VR64)
 
+  bool HasAVX = Subtarget.hasAVX();
+  bool HasAVX512 = Subtarget.hasAVX512();
   if (X86::GR64RegClass.contains(DestReg)) {
-    if (X86::VR128RegClass.contains(SrcReg))
+    if (X86::VR128XRegClass.contains(SrcReg))
       // Copy from a VR128 register to a GR64 register.
-      return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
+      return HasAVX512 ? X86::VMOVPQIto64Zrr: (HasAVX ? X86::VMOVPQIto64rr :
+                                               X86::MOVPQIto64rr);
     if (X86::VR64RegClass.contains(SrcReg))
       // Copy from a VR64 register to a GR64 register.
       return X86::MOVSDto64rr;
   } else if (X86::GR64RegClass.contains(SrcReg)) {
     // Copy from a GR64 register to a VR128 register.
-    if (X86::VR128RegClass.contains(DestReg))
-      return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
+    if (X86::VR128XRegClass.contains(DestReg))
+      return HasAVX512 ? X86::VMOV64toPQIZrr: (HasAVX ? X86::VMOV64toPQIrr :
+                                               X86::MOV64toPQIrr);
     // Copy from a GR64 register to a VR64 register.
     if (X86::VR64RegClass.contains(DestReg))
       return X86::MOV64toSDrr;
@@ -2814,14 +3005,30 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
   // SrcReg(FR32) -> DestReg(GR32)
   // SrcReg(GR32) -> DestReg(FR32)
 
-  if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
+  if (X86::GR32RegClass.contains(DestReg) && X86::FR32XRegClass.contains(SrcReg))
     // Copy from a FR32 register to a GR32 register.
-    return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+    return HasAVX512 ? X86::VMOVSS2DIZrr : (HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr);
 
-  if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
+  if (X86::FR32XRegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
     // Copy from a GR32 register to a FR32 register.
-    return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+    return HasAVX512 ? X86::VMOVDI2SSZrr : (HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr);
+  return 0;
+}
 
+static
+unsigned copyPhysRegOpcode_AVX512(unsigned& DestReg, unsigned& SrcReg) {
+  if (X86::VR128XRegClass.contains(DestReg, SrcReg) ||
+      X86::VR256XRegClass.contains(DestReg, SrcReg) ||
+      X86::VR512RegClass.contains(DestReg, SrcReg)) {
+     DestReg = get512BitSuperRegister(DestReg);
+     SrcReg = get512BitSuperRegister(SrcReg);
+     return X86::VMOVAPSZrr;
+  }
+  if ((X86::VK8RegClass.contains(DestReg) ||
+       X86::VK16RegClass.contains(DestReg)) &&
+      (X86::VK8RegClass.contains(SrcReg) ||
+       X86::VK16RegClass.contains(SrcReg)))
+    return X86::KMOVWkk;
   return 0;
 }
 
@@ -2831,7 +3038,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                bool KillSrc) const {
   // First deal with the normal symmetric copies.
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
-  unsigned Opc;
+  bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+  unsigned Opc = 0;
   if (X86::GR64RegClass.contains(DestReg, SrcReg))
     Opc = X86::MOV64rr;
   else if (X86::GR32RegClass.contains(DestReg, SrcReg))
@@ -2849,14 +3057,17 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
              "8-bit H register can not be copied outside GR8_NOREX");
     } else
       Opc = X86::MOV8rr;
-  } else if (X86::VR128RegClass.contains(DestReg, SrcReg))
+  }
+  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
+    Opc = X86::MMX_MOVQ64rr;
+  else if (HasAVX512)
+    Opc = copyPhysRegOpcode_AVX512(DestReg, SrcReg);
+  else if (X86::VR128RegClass.contains(DestReg, SrcReg))
     Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
   else if (X86::VR256RegClass.contains(DestReg, SrcReg))
     Opc = X86::VMOVAPSYrr;
-  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
-    Opc = X86::MMX_MOVQ64rr;
-  else
-    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, HasAVX);
+  if (!Opc)
+    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, TM.getSubtarget<X86Subtarget>());
 
   if (Opc) {
     BuildMI(MBB, MI, DL, get(Opc), DestReg)
@@ -2904,6 +3115,18 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
                                       bool isStackAligned,
                                       const TargetMachine &TM,
                                       bool load) {
+  if (TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+    if (X86::VK8RegClass.hasSubClassEq(RC)  ||
+      X86::VK16RegClass.hasSubClassEq(RC))
+      return load ? X86::KMOVWkm : X86::KMOVWmk;
+    if (RC->getSize() == 4 && X86::FR32XRegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVSSZrm : X86::VMOVSSZmr;
+    if (RC->getSize() == 8 && X86::FR64XRegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVSDZrm : X86::VMOVSDZmr;
+    if (X86::VR512RegClass.hasSubClassEq(RC))
+      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
+  }
+
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
   switch (RC->getSize()) {
   default:
@@ -2945,7 +3168,8 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
     assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
     return load ? X86::LD_Fp80m : X86::ST_FpP80m;
   case 16: {
-    assert(X86::VR128RegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
+    assert((X86::VR128RegClass.hasSubClassEq(RC) ||
+            X86::VR128XRegClass.hasSubClassEq(RC))&& "Unknown 16-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ?
@@ -2957,12 +3181,19 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
         (HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
   }
   case 32:
-    assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+    assert((X86::VR256RegClass.hasSubClassEq(RC) ||
+            X86::VR256XRegClass.hasSubClassEq(RC)) && "Unknown 32-byte regclass");
     // If stack is realigned we can use aligned stores.
     if (isStackAligned)
       return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
     else
       return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
+  case 64:
+    assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
+    if (isStackAligned)
+      return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
+    else
+      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
   }
 }
 
@@ -2989,7 +3220,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   const MachineFunction &MF = *MBB.getParent();
   assert(MF.getFrameInfo()->getObjectSize(FrameIdx) >= RC->getSize() &&
          "Stack slot too small for store");
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
     RI.canRealignStack(MF);
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
@@ -3005,7 +3236,7 @@ void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
                                   MachineInstr::mmo_iterator MMOBegin,
                                   MachineInstr::mmo_iterator MMOEnd,
                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = MMOBegin != MMOEnd &&
                    (*MMOBegin)->getAlignment() >= Alignment;
   unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, TM);
@@ -3025,7 +3256,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                         const TargetRegisterClass *RC,
                                         const TargetRegisterInfo *TRI) const {
   const MachineFunction &MF = *MBB.getParent();
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = (TM.getFrameLowering()->getStackAlignment() >= Alignment) ||
     RI.canRealignStack(MF);
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
@@ -3039,7 +3270,7 @@ void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                                  MachineInstr::mmo_iterator MMOBegin,
                                  MachineInstr::mmo_iterator MMOEnd,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
-  unsigned Alignment = RC->getSize() == 32 ? 32 : 16;
+  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
   bool isAligned = MMOBegin != MMOEnd &&
                    (*MMOBegin)->getAlignment() >= Alignment;
   unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, TM);
@@ -3171,6 +3402,25 @@ inline static bool isRedundantFlagInstr(MachineInstr *FlagI, unsigned SrcReg,
 inline static bool isDefConvertible(MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default: return false;
+
+  // The shift instructions only modify ZF if their shift count is non-zero.
+  // N.B.: The processor truncates the shift count depending on the encoding.
+  case X86::SAR8ri:    case X86::SAR16ri:  case X86::SAR32ri:case X86::SAR64ri:
+  case X86::SHR8ri:    case X86::SHR16ri:  case X86::SHR32ri:case X86::SHR64ri:
+     return getTruncatedShiftCount(MI, 2) != 0;
+
+  // Some left shift instructions can be turned into LEA instructions but only
+  // if their flags aren't used. Avoid transforming such instructions.
+  case X86::SHL8ri:    case X86::SHL16ri:  case X86::SHL32ri:case X86::SHL64ri:{
+    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
+    if (isTruncatedShiftCountForLEA(ShAmt)) return false;
+    return ShAmt != 0;
+  }
+
+  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
+  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
+     return getTruncatedShiftCount(MI, 3) != 0;
+
   case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
   case X86::SUB32ri8:  case X86::SUB16ri:  case X86::SUB16ri8:
   case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
@@ -3200,8 +3450,37 @@ inline static bool isDefConvertible(MachineInstr *MI) {
   case X86::OR8ri:     case X86::OR64rr:   case X86::OR32rr:
   case X86::OR16rr:    case X86::OR8rr:    case X86::OR64rm:
   case X86::OR32rm:    case X86::OR16rm:   case X86::OR8rm:
+  case X86::NEG8r:     case X86::NEG16r:   case X86::NEG32r: case X86::NEG64r:
+  case X86::SAR8r1:    case X86::SAR16r1:  case X86::SAR32r1:case X86::SAR64r1:
+  case X86::SHR8r1:    case X86::SHR16r1:  case X86::SHR32r1:case X86::SHR64r1:
+  case X86::SHL8r1:    case X86::SHL16r1:  case X86::SHL32r1:case X86::SHL64r1:
+  case X86::ADC32ri:   case X86::ADC32ri8:
+  case X86::ADC32rr:   case X86::ADC64ri32:
+  case X86::ADC64ri8:  case X86::ADC64rr:
+  case X86::SBB32ri:   case X86::SBB32ri8:
+  case X86::SBB32rr:   case X86::SBB64ri32:
+  case X86::SBB64ri8:  case X86::SBB64rr:
   case X86::ANDN32rr:  case X86::ANDN32rm:
   case X86::ANDN64rr:  case X86::ANDN64rm:
+  case X86::BEXTR32rr: case X86::BEXTR64rr:
+  case X86::BEXTR32rm: case X86::BEXTR64rm:
+  case X86::BLSI32rr:  case X86::BLSI32rm:
+  case X86::BLSI64rr:  case X86::BLSI64rm:
+  case X86::BLSMSK32rr:case X86::BLSMSK32rm:
+  case X86::BLSMSK64rr:case X86::BLSMSK64rm:
+  case X86::BLSR32rr:  case X86::BLSR32rm:
+  case X86::BLSR64rr:  case X86::BLSR64rm:
+  case X86::BZHI32rr:  case X86::BZHI32rm:
+  case X86::BZHI64rr:  case X86::BZHI64rm:
+  case X86::LZCNT16rr: case X86::LZCNT16rm:
+  case X86::LZCNT32rr: case X86::LZCNT32rm:
+  case X86::LZCNT64rr: case X86::LZCNT64rm:
+  case X86::POPCNT16rr:case X86::POPCNT16rm:
+  case X86::POPCNT32rr:case X86::POPCNT32rm:
+  case X86::POPCNT64rr:case X86::POPCNT64rm:
+  case X86::TZCNT16rr: case X86::TZCNT16rm:
+  case X86::TZCNT32rr: case X86::TZCNT32rm:
+  case X86::TZCNT64rr: case X86::TZCNT64rm:
     return true;
   }
 }
@@ -3308,10 +3587,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
       // MOV32r0 etc. are implemented with xor which clobbers condition code.
       // They are safe to move up, if the definition to EFLAGS is dead and
       // earlier instructions do not read or write EFLAGS.
-      if (!Movr0Inst && (Instr->getOpcode() == X86::MOV8r0 ||
-           Instr->getOpcode() == X86::MOV16r0 ||
-           Instr->getOpcode() == X86::MOV32r0 ||
-           Instr->getOpcode() == X86::MOV64r0) &&
+      if (!Movr0Inst && Instr->getOpcode() == X86::MOV32r0 &&
           Instr->registerDefIsDead(X86::EFLAGS, TRI)) {
         Movr0Inst = Instr;
         continue;
@@ -3420,20 +3696,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
 
   // The instruction to be updated is either Sub or MI.
   Sub = IsCmpZero ? MI : Sub;
-  // Move Movr0Inst to the place right before Sub.
+  // Move Movr0Inst to the appropriate place before Sub.
   if (Movr0Inst) {
-    Sub->getParent()->remove(Movr0Inst);
-    Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
+    // Look backwards until we find a def that doesn't use the current EFLAGS.
+    Def = Sub;
+    MachineBasicBlock::reverse_iterator
+      InsertI = MachineBasicBlock::reverse_iterator(++Def),
+                InsertE = Sub->getParent()->rend();
+    for (; InsertI != InsertE; ++InsertI) {
+      MachineInstr *Instr = &*InsertI;
+      if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
+          Instr->modifiesRegister(X86::EFLAGS, TRI)) {
+        Sub->getParent()->remove(Movr0Inst);
+        Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
+                                   Movr0Inst);
+        break;
+      }
+    }
+    if (InsertI == InsertE)
+      return false;
   }
 
   // Make sure Sub instruction defines EFLAGS and mark the def live.
-  unsigned LastOperand = Sub->getNumOperands() - 1;
-  assert(Sub->getNumOperands() >= 2 &&
-         Sub->getOperand(LastOperand).isReg() &&
-         Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
-         "EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
-  Sub->getOperand(LastOperand).setIsDef(true);
-  Sub->getOperand(LastOperand).setIsDead(false);
+  unsigned i = 0, e = Sub->getNumOperands();
+  for (; i != e; ++i) {
+    MachineOperand &MO = Sub->getOperand(i);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
+      MO.setIsDead(false);
+      break;
+    }
+  }
+  assert(i != e && "Unable to locate a def EFLAGS operand");
+
   CmpInstr->eraseFromParent();
 
   // Modify the condition code of instructions in OpsToUpdate.
@@ -3558,6 +3852,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   case X86::AVX_SET0:
     assert(HasAVX && "AVX not supported");
     return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+  case X86::AVX512_512_SET0:
+    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
   case X86::V_SETALLONES:
     return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
   case X86::AVX2_SETALLONES:
@@ -3565,23 +3861,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
   case X86::TEST8ri_NOREX:
     MI->setDesc(get(X86::TEST8ri));
     return true;
+  case X86::KSET0W: return Expand2AddrUndef(MIB, get(X86::KXORWrr));
+  case X86::KSET1B:
+  case X86::KSET1W: return Expand2AddrUndef(MIB, get(X86::KXNORWrr));
   }
   return false;
 }
 
-MachineInstr*
-X86InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                       int FrameIx, uint64_t Offset,
-                                       const MDNode *MDPtr,
-                                       DebugLoc DL) const {
-  X86AddressMode AM;
-  AM.BaseType = X86AddressMode::FrameIndexBase;
-  AM.Base.FrameIndex = FrameIx;
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(X86::DBG_VALUE));
-  addFullAddress(MIB, AM).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
 static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
                                      const SmallVectorImpl<MachineOperand> &MOs,
                                      MachineInstr *MI,
@@ -3686,18 +3972,11 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
     isTwoAddrFold = true;
   } else if (i == 0) { // If operand 0
-    unsigned Opc = 0;
-    switch (MI->getOpcode()) {
-    default: break;
-    case X86::MOV64r0: Opc = X86::MOV64mi32; break;
-    case X86::MOV32r0: Opc = X86::MOV32mi;   break;
-    case X86::MOV16r0: Opc = X86::MOV16mi;   break;
-    case X86::MOV8r0:  Opc = X86::MOV8mi;    break;
+    if (MI->getOpcode() == X86::MOV32r0) {
+      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+      if (NewMI)
+        return NewMI;
     }
-    if (Opc)
-       NewMI = MakeM0Inst(*this, Opc, MOs, MI);
-    if (NewMI)
-      return NewMI;
 
     OpcodeTablePtr = &RegOp2MemOpTable0;
   } else if (i == 1) {
@@ -3798,18 +4077,6 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
   case X86::RSQRTSSr_Int:
   case X86::SQRTSSr:
   case X86::SQRTSSr_Int:
-  // AVX encoded versions
-  case X86::VCVTSD2SSrr:
-  case X86::Int_VCVTSD2SSrr:
-  case X86::VCVTSS2SDrr:
-  case X86::Int_VCVTSS2SDrr:
-  case X86::VRCPSSr:
-  case X86::VROUNDSDr:
-  case X86::VROUNDSDr_Int:
-  case X86::VROUNDSSr:
-  case X86::VROUNDSSr_Int:
-  case X86::VRSQRTSSr:
-  case X86::VSQRTSSr:
     return true;
   }
 
@@ -3841,10 +4108,77 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
   return 16;
 }
 
+// Return true for any instruction the copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+  switch (Opcode) {
+  case X86::VCVTSI2SSrr:
+  case X86::Int_VCVTSI2SSrr:
+  case X86::VCVTSI2SS64rr:
+  case X86::Int_VCVTSI2SS64rr:
+  case X86::VCVTSI2SDrr:
+  case X86::Int_VCVTSI2SDrr:
+  case X86::VCVTSI2SD64rr:
+  case X86::Int_VCVTSI2SD64rr:
+  case X86::VCVTSD2SSrr:
+  case X86::Int_VCVTSD2SSrr:
+  case X86::VCVTSS2SDrr:
+  case X86::Int_VCVTSS2SDrr:
+  case X86::VRCPSSr:
+  case X86::VROUNDSDr:
+  case X86::VROUNDSDr_Int:
+  case X86::VROUNDSSr:
+  case X86::VROUNDSSr_Int:
+  case X86::VRSQRTSSr:
+  case X86::VSQRTSSr:
+
+  // AVX-512
+  case X86::VCVTSD2SSZrr:
+  case X86::VCVTSS2SDZrr:
+    return true;
+  }
+
+  return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should to be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned X86InstrInfo::
+getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+                     const TargetRegisterInfo *TRI) const {
+  if (!hasUndefRegUpdate(MI->getOpcode()))
+    return 0;
+
+  // Set the OpNum parameter to the first source operand.
+  OpNum = 1;
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+    // Use the same magic number as getPartialRegUpdateClearance.
+    return 16;
+  }
+  return 0;
+}
+
 void X86InstrInfo::
 breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                           const TargetRegisterInfo *TRI) const {
   unsigned Reg = MI->getOperand(OpNum).getReg();
+  // If MI kills this register, the false dependence is already broken.
+  if (MI->killsRegister(Reg, TRI))
+    return;
   if (X86::VR128RegClass.contains(Reg)) {
     // These instructions are all floating point domain, so xorps is the best
     // choice.
@@ -3864,10 +4198,75 @@ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
   MI->addRegisterKilled(Reg, TRI, true);
 }
 
-MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
-                                                  MachineInstr *MI,
-                                           const SmallVectorImpl<unsigned> &Ops,
-                                                  int FrameIndex) const {
+static MachineInstr* foldPatchpoint(MachineFunction &MF,
+                                    MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops,
+                                    int FrameIndex,
+                                    const TargetInstrInfo &TII) {
+  unsigned StartIdx = 0;
+  switch (MI->getOpcode()) {
+  case TargetOpcode::STACKMAP:
+    StartIdx = 2; // Skip ID, nShadowBytes.
+    break;
+  case TargetOpcode::PATCHPOINT: {
+    // For PatchPoint, the call args are not foldable.
+    PatchPointOpers opers(MI);
+    StartIdx = opers.getVarIdx();
+    break;
+  }
+  default:
+    llvm_unreachable("unexpected stackmap opcode");
+  }
+
+  // Return false if any operands requested for folding are not foldable (not
+  // part of the stackmap's live values).
+  for (SmallVectorImpl<unsigned>::const_iterator I = Ops.begin(), E = Ops.end();
+       I != E; ++I) {
+    if (*I < StartIdx)
+      return 0;
+  }
+
+  MachineInstr *NewMI =
+    MF.CreateMachineInstr(TII.get(MI->getOpcode()), MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(MF, NewMI);
+
+  // No need to fold return, the meta data, and function arguments
+  for (unsigned i = 0; i < StartIdx; ++i)
+    MIB.addOperand(MI->getOperand(i));
+
+  for (unsigned i = StartIdx; i < MI->getNumOperands(); ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (std::find(Ops.begin(), Ops.end(), i) != Ops.end()) {
+      assert(MO.getReg() && "patchpoint can only fold a vreg operand");
+      // Compute the spill slot size and offset.
+      const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(MO.getReg());
+      unsigned SpillSize;
+      unsigned SpillOffset;
+      bool Valid = TII.getStackSlotRange(RC, MO.getSubReg(), SpillSize,
+                                         SpillOffset, &MF.getTarget());
+      if (!Valid)
+        report_fatal_error("cannot spill patchpoint subregister operand");
+
+      MIB.addOperand(MachineOperand::CreateImm(StackMaps::IndirectMemRefOp));
+      MIB.addOperand(MachineOperand::CreateImm(SpillSize));
+      MIB.addOperand(MachineOperand::CreateFI(FrameIndex));
+      addOffset(MIB, SpillOffset);
+    }
+    else
+      MIB.addOperand(MO);
+  }
+  return NewMI;
+}
+
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops,
+                                    int FrameIndex) const {
+  // Special case stack map and patch point intrinsics.
+  if (MI->getOpcode() == TargetOpcode::STACKMAP
+      || MI->getOpcode() == TargetOpcode::PATCHPOINT) {
+    return foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
+  }
   // Check switch flag
   if (NoFusing) return NULL;
 
@@ -3881,6 +4280,10 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   unsigned Size = MFI->getObjectSize(FrameIndex);
   unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+  // If the function stack isn't realigned we don't want to fold instructions
+  // that need increased alignment.
+  if (!RI.needsStackRealignment(MF))
+    Alignment = std::min(Alignment, TM.getFrameLowering()->getStackAlignment());
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
     unsigned RCSize = 0;
@@ -3910,6 +4313,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                   MachineInstr *MI,
                                            const SmallVectorImpl<unsigned> &Ops,
                                                   MachineInstr *LoadMI) const {
+  // If loading from a FrameIndex, fold directly from the FrameIndex.
+  unsigned NumOps = LoadMI->getDesc().getNumOperands();
+  int FrameIndex;
+  if (isLoadFromStackSlot(LoadMI, FrameIndex))
+    return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
+
   // Check switch flag
   if (NoFusing) return NULL;
 
@@ -4035,7 +4444,6 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       return NULL;
 
     // Folding a normal load. Just copy the load's address operands.
-    unsigned NumOps = LoadMI->getDesc().getNumOperands();
     for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
       MOs.push_back(LoadMI->getOperand(i));
     break;
@@ -4083,13 +4491,9 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
   if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
   } else if (OpNum == 0) { // If operand 0
-    switch (Opc) {
-    case X86::MOV8r0:
-    case X86::MOV16r0:
-    case X86::MOV32r0:
-    case X86::MOV64r0: return true;
-    default: break;
-    }
+    if (Opc == X86::MOV32r0)
+      return true;
+
     OpcodeTablePtr = &RegOp2MemOpTable0;
   } else if (OpNum == 1) {
     OpcodeTablePtr = &RegOp2MemOpTable1;
@@ -4250,7 +4654,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   std::vector<SDValue> AddrOps;
   std::vector<SDValue> BeforeOps;
   std::vector<SDValue> AfterOps;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   unsigned NumOps = N->getNumOperands();
   for (unsigned i = 0; i != NumOps-1; ++i) {
     SDValue Op = N->getOperand(i);
@@ -4507,6 +4911,167 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
   return true;
 }
 
+bool X86InstrInfo::shouldScheduleAdjacent(MachineInstr* First,
+                                          MachineInstr *Second) const {
+  // Check if this processor supports macro-fusion. Since this is a minor
+  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+  // proxy for SandyBridge+.
+  if (!TM.getSubtarget<X86Subtarget>().hasAVX())
+    return false;
+
+  enum {
+    FuseTest,
+    FuseCmp,
+    FuseInc
+  } FuseKind;
+
+  switch(Second->getOpcode()) {
+  default:
+    return false;
+  case X86::JE_4:
+  case X86::JNE_4:
+  case X86::JL_4:
+  case X86::JLE_4:
+  case X86::JG_4:
+  case X86::JGE_4:
+    FuseKind = FuseInc;
+    break;
+  case X86::JB_4:
+  case X86::JBE_4:
+  case X86::JA_4:
+  case X86::JAE_4:
+    FuseKind = FuseCmp;
+    break;
+  case X86::JS_4:
+  case X86::JNS_4:
+  case X86::JP_4:
+  case X86::JNP_4:
+  case X86::JO_4:
+  case X86::JNO_4:
+    FuseKind = FuseTest;
+    break;
+  }
+  switch (First->getOpcode()) {
+  default:
+    return false;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+  case X86::TEST8ri:
+  case X86::TEST16ri:
+  case X86::TEST32ri:
+  case X86::TEST32i32:
+  case X86::TEST64i32:
+  case X86::TEST64ri32:
+  case X86::TEST8rm:
+  case X86::TEST16rm:
+  case X86::TEST32rm:
+  case X86::TEST64rm:
+  case X86::AND16i16:
+  case X86::AND16ri:
+  case X86::AND16ri8:
+  case X86::AND16rm:
+  case X86::AND16rr:
+  case X86::AND32i32:
+  case X86::AND32ri:
+  case X86::AND32ri8:
+  case X86::AND32rm:
+  case X86::AND32rr:
+  case X86::AND64i32:
+  case X86::AND64ri32:
+  case X86::AND64ri8:
+  case X86::AND64rm:
+  case X86::AND64rr:
+  case X86::AND8i8:
+  case X86::AND8ri:
+  case X86::AND8rm:
+  case X86::AND8rr:
+    return true;
+  case X86::CMP16i16:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP16rm:
+  case X86::CMP16rr:
+  case X86::CMP32i32:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP32rm:
+  case X86::CMP32rr:
+  case X86::CMP64i32:
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP64rm:
+  case X86::CMP64rr:
+  case X86::CMP8i8:
+  case X86::CMP8ri:
+  case X86::CMP8rm:
+  case X86::CMP8rr:
+  case X86::ADD16i16:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri8_DB:
+  case X86::ADD16ri_DB:
+  case X86::ADD16rm:
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+  case X86::ADD32i32:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri8_DB:
+  case X86::ADD32ri_DB:
+  case X86::ADD32rm:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB:
+  case X86::ADD64i32:
+  case X86::ADD64ri32:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8:
+  case X86::ADD64ri8_DB:
+  case X86::ADD64rm:
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD8i8:
+  case X86::ADD8mi:
+  case X86::ADD8mr:
+  case X86::ADD8ri:
+  case X86::ADD8rm:
+  case X86::ADD8rr:
+  case X86::SUB16i16:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB16rm:
+  case X86::SUB16rr:
+  case X86::SUB32i32:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB32rm:
+  case X86::SUB32rr:
+  case X86::SUB64i32:
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB64rm:
+  case X86::SUB64rr:
+  case X86::SUB8i8:
+  case X86::SUB8ri:
+  case X86::SUB8rm:
+  case X86::SUB8rr:
+    return FuseKind == FuseCmp || FuseKind == FuseInc;
+  case X86::INC16r:
+  case X86::INC32r:
+  case X86::INC64_16r:
+  case X86::INC64_32r:
+  case X86::INC64r:
+  case X86::INC8r:
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::DEC64_16r:
+  case X86::DEC64_32r:
+  case X86::DEC64r:
+  case X86::DEC8r:
+    return FuseKind == FuseInc;
+  }
+}
 
 bool X86InstrInfo::
 ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
@@ -4700,6 +5265,37 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
   case X86::VSQRTSSm:
   case X86::VSQRTSSm_Int:
   case X86::VSQRTSSr:
+  case X86::VSQRTPDZrm:
+  case X86::VSQRTPDZrr:
+  case X86::VSQRTPSZrm:
+  case X86::VSQRTPSZrr:
+  case X86::VSQRTSDZm:
+  case X86::VSQRTSDZm_Int:
+  case X86::VSQRTSDZr:
+  case X86::VSQRTSSZm_Int:
+  case X86::VSQRTSSZr:
+  case X86::VSQRTSSZm:
+  case X86::VDIVSDZrm:
+  case X86::VDIVSDZrr:
+  case X86::VDIVSSZrm:
+  case X86::VDIVSSZrr:
+
+  case X86::VGATHERQPSZrm:
+  case X86::VGATHERQPDZrm:
+  case X86::VGATHERDPDZrm:
+  case X86::VGATHERDPSZrm:
+  case X86::VPGATHERQDZrm:
+  case X86::VPGATHERQQZrm:
+  case X86::VPGATHERDDZrm:
+  case X86::VPGATHERDQZrm:
+  case X86::VSCATTERQPDZmr:
+  case X86::VSCATTERQPSZmr:
+  case X86::VSCATTERDPDZmr:
+  case X86::VSCATTERDPSZmr:
+  case X86::VPSCATTERQDZmr:
+  case X86::VPSCATTERQQZmr:
+  case X86::VPSCATTERDDZmr:
+  case X86::VPSCATTERDQZmr:
     return true;
   }
 }
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 260f054..600e392 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -152,6 +152,8 @@ class X86InstrInfo : public X86GenInstrInfo {
                             MemOp2RegOpTableType &M2RTable,
                             unsigned RegOp, unsigned MemOp, unsigned Flags);
 
+  virtual void anchor();
+
 public:
   explicit X86InstrInfo(X86TargetMachine &tm);
 
@@ -192,6 +194,19 @@ public:
                      const MachineInstr *Orig,
                      const TargetRegisterInfo &TRI) const;
 
+  /// Given an operand within a MachineInstr, insert preceding code to put it
+  /// into the right format for a particular kind of LEA instruction. This may
+  /// involve using an appropriate super-register instead (with an implicit use
+  /// of the original) or creating a new virtual register and inserting COPY
+  /// instructions to get the data into the right class.
+  ///
+  /// Reference parameters are set to indicate how caller should add this
+  /// operand to the LEA instruction.
+  bool classifyLEAReg(MachineInstr *MI, const MachineOperand &Src,
+                      unsigned LEAOpcode, bool AllowSP,
+                      unsigned &NewSrc, bool &isKill,
+                      bool &isUndef, MachineOperand &ImplicitOp) const;
+
   /// convertToThreeAddress - This method must be implemented by targets that
   /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
   /// may be able to convert a two-address instruction into a true
@@ -262,12 +277,6 @@ public:
 
   virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
 
-  virtual
-  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
-                                         int FrameIx, uint64_t Offset,
-                                         const MDNode *MDPtr,
-                                         DebugLoc DL) const;
-
   /// foldMemoryOperand - If this target supports it, fold a load or store of
   /// the specified stack slot into the specified machine instruction for the
   /// specified operand(s).  If this is possible, the target should perform the
@@ -332,6 +341,9 @@ public:
                                        int64_t Offset1, int64_t Offset2,
                                        unsigned NumLoads) const;
 
+  virtual bool shouldScheduleAdjacent(MachineInstr* First,
+                                      MachineInstr *Second) const LLVM_OVERRIDE;
+
   virtual void getNoopForMachoTarget(MCInst &NopInst) const;
 
   virtual
@@ -359,6 +371,8 @@ public:
 
   unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
                                         const TargetRegisterInfo *TRI) const;
+  unsigned getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+                                const TargetRegisterInfo *TRI) const;
   void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
                                  const TargetRegisterInfo *TRI) const;
 
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 3380d8c..6e5d543 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -248,11 +248,12 @@ def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;
 def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;
-def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
 
 def X86blsi   : SDNode<"X86ISD::BLSI",   SDTIntUnaryOp>;
 def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>;
 def X86blsr   : SDNode<"X86ISD::BLSR",   SDTIntUnaryOp>;
+def X86bzhi   : SDNode<"X86ISD::BZHI",   SDTIntShiftOp>;
+def X86bextr  : SDNode<"X86ISD::BEXTR",  SDTIntBinOp>;
 
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
 
@@ -278,43 +279,52 @@ def ptr_rc_nosp : PointerLikeRegClass<1>;
 
 // *mem - Operand definitions for the funky X86 addressing mode operands.
 //
-def X86MemAsmOperand : AsmOperandClass { 
- let Name = "Mem"; let PredicateMethod = "isMem"; 
+def X86MemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
 }
-def X86Mem8AsmOperand : AsmOperandClass { 
-  let Name = "Mem8"; let PredicateMethod = "isMem8";
+def X86Mem8AsmOperand : AsmOperandClass {
+  let Name = "Mem8"; let RenderMethod = "addMemOperands";
 }
-def X86Mem16AsmOperand : AsmOperandClass { 
-  let Name = "Mem16"; let PredicateMethod = "isMem16";
+def X86Mem16AsmOperand : AsmOperandClass {
+  let Name = "Mem16"; let RenderMethod = "addMemOperands";
 }
-def X86Mem32AsmOperand : AsmOperandClass { 
-  let Name = "Mem32"; let PredicateMethod = "isMem32";
+def X86Mem32AsmOperand : AsmOperandClass {
+  let Name = "Mem32"; let RenderMethod = "addMemOperands";
 }
-def X86Mem64AsmOperand : AsmOperandClass { 
-  let Name = "Mem64"; let PredicateMethod = "isMem64";
+def X86Mem64AsmOperand : AsmOperandClass {
+  let Name = "Mem64"; let RenderMethod = "addMemOperands";
 }
-def X86Mem80AsmOperand : AsmOperandClass { 
-  let Name = "Mem80"; let PredicateMethod = "isMem80";
+def X86Mem80AsmOperand : AsmOperandClass {
+  let Name = "Mem80"; let RenderMethod = "addMemOperands";
 }
-def X86Mem128AsmOperand : AsmOperandClass { 
-  let Name = "Mem128"; let PredicateMethod = "isMem128";
+def X86Mem128AsmOperand : AsmOperandClass {
+  let Name = "Mem128"; let RenderMethod = "addMemOperands";
 }
-def X86Mem256AsmOperand : AsmOperandClass { 
-  let Name = "Mem256"; let PredicateMethod = "isMem256";
+def X86Mem256AsmOperand : AsmOperandClass {
+  let Name = "Mem256"; let RenderMethod = "addMemOperands";
+}
+def X86Mem512AsmOperand : AsmOperandClass {
+  let Name = "Mem512"; let RenderMethod = "addMemOperands";
 }
 
 // Gather mem operands
 def X86MemVX32Operand : AsmOperandClass {
-  let Name = "MemVX32"; let PredicateMethod = "isMemVX32";
+  let Name = "MemVX32"; let RenderMethod = "addMemOperands";
 }
 def X86MemVY32Operand : AsmOperandClass {
-  let Name = "MemVY32"; let PredicateMethod = "isMemVY32";
+  let Name = "MemVY32"; let RenderMethod = "addMemOperands";
+}
+def X86MemVZ32Operand : AsmOperandClass {
+  let Name = "MemVZ32"; let RenderMethod = "addMemOperands";
 }
 def X86MemVX64Operand : AsmOperandClass {
-  let Name = "MemVX64"; let PredicateMethod = "isMemVX64";
+  let Name = "MemVX64"; let RenderMethod = "addMemOperands";
 }
 def X86MemVY64Operand : AsmOperandClass {
-  let Name = "MemVY64"; let PredicateMethod = "isMemVY64";
+  let Name = "MemVY64"; let RenderMethod = "addMemOperands";
+}
+def X86MemVZ64Operand : AsmOperandClass {
+  let Name = "MemVZ64"; let RenderMethod = "addMemOperands";
 }
 
 def X86AbsMemAsmOperand : AsmOperandClass {
@@ -333,28 +343,36 @@ def opaque48mem : X86MemOperand<"printopaquemem">;
 def opaque80mem : X86MemOperand<"printopaquemem">;
 def opaque512mem : X86MemOperand<"printopaquemem">;
 
-def i8mem   : X86MemOperand<"printi8mem"> { 
+def i8mem   : X86MemOperand<"printi8mem"> {
   let ParserMatchClass = X86Mem8AsmOperand; }
-def i16mem  : X86MemOperand<"printi16mem"> { 
+def i16mem  : X86MemOperand<"printi16mem"> {
   let ParserMatchClass = X86Mem16AsmOperand; }
-def i32mem  : X86MemOperand<"printi32mem"> { 
+def i32mem  : X86MemOperand<"printi32mem"> {
   let ParserMatchClass = X86Mem32AsmOperand; }
-def i64mem  : X86MemOperand<"printi64mem"> { 
+def i64mem  : X86MemOperand<"printi64mem"> {
   let ParserMatchClass = X86Mem64AsmOperand; }
-def i128mem : X86MemOperand<"printi128mem"> { 
+def i128mem : X86MemOperand<"printi128mem"> {
   let ParserMatchClass = X86Mem128AsmOperand; }
-def i256mem : X86MemOperand<"printi256mem"> { 
+def i256mem : X86MemOperand<"printi256mem"> {
   let ParserMatchClass = X86Mem256AsmOperand; }
-def f32mem  : X86MemOperand<"printf32mem"> { 
+def i512mem : X86MemOperand<"printi512mem"> {
+  let ParserMatchClass = X86Mem512AsmOperand; }
+def f32mem  : X86MemOperand<"printf32mem"> {
   let ParserMatchClass = X86Mem32AsmOperand; }
-def f64mem  : X86MemOperand<"printf64mem"> { 
+def f64mem  : X86MemOperand<"printf64mem"> {
   let ParserMatchClass = X86Mem64AsmOperand; }
-def f80mem  : X86MemOperand<"printf80mem"> { 
+def f80mem  : X86MemOperand<"printf80mem"> {
   let ParserMatchClass = X86Mem80AsmOperand; }
-def f128mem : X86MemOperand<"printf128mem"> { 
+def f128mem : X86MemOperand<"printf128mem"> {
   let ParserMatchClass = X86Mem128AsmOperand; }
-def f256mem : X86MemOperand<"printf256mem">{ 
+def f256mem : X86MemOperand<"printf256mem">{
   let ParserMatchClass = X86Mem256AsmOperand; }
+def f512mem : X86MemOperand<"printf512mem">{
+  let ParserMatchClass = X86Mem512AsmOperand; }
+def v512mem : Operand<iPTR> {
+  let PrintMethod = "printf512mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+  let ParserMatchClass = X86Mem512AsmOperand; }
 
 // Gather mem operands
 def vx32mem : X86MemOperand<"printi32mem">{
@@ -369,6 +387,15 @@ def vx64mem : X86MemOperand<"printi64mem">{
 def vy64mem : X86MemOperand<"printi64mem">{
   let MIOperandInfo = (ops ptr_rc, i8imm, VR256, i32imm, i8imm);
   let ParserMatchClass = X86MemVY64Operand; }
+def vy64xmem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR256X, i32imm, i8imm);
+  let ParserMatchClass = X86MemVY64Operand; }
+def vz32mem : X86MemOperand<"printi32mem">{
+  let MIOperandInfo = (ops ptr_rc, i16imm, VR512, i32imm, i8imm);
+  let ParserMatchClass = X86MemVZ32Operand; }
+def vz64mem : X86MemOperand<"printi64mem">{
+  let MIOperandInfo = (ops ptr_rc, i8imm, VR512, i32imm, i8imm);
+  let ParserMatchClass = X86MemVZ64Operand; }
 }
 
 // A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
@@ -412,17 +439,49 @@ let OperandType = "OPERAND_PCREL",
 def i32imm_pcrel : Operand<i32>;
 def i16imm_pcrel : Operand<i16>;
 
-def offset8 : Operand<i64>;
-def offset16 : Operand<i64>;
-def offset32 : Operand<i64>;
-def offset64 : Operand<i64>;
-
 // Branch targets have OtherVT type and print as pc-relative values.
 def brtarget : Operand<OtherVT>;
 def brtarget8 : Operand<OtherVT>;
 
 }
 
+def X86MemOffs8AsmOperand : AsmOperandClass {
+  let Name = "MemOffs8";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem8AsmOperand];
+}
+def X86MemOffs16AsmOperand : AsmOperandClass {
+  let Name = "MemOffs16";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem16AsmOperand];
+}
+def X86MemOffs32AsmOperand : AsmOperandClass {
+  let Name = "MemOffs32";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem32AsmOperand];
+}
+def X86MemOffs64AsmOperand : AsmOperandClass {
+  let Name = "MemOffs64";
+  let RenderMethod = "addMemOffsOperands";
+  let SuperClasses = [X86Mem64AsmOperand];
+}
+
+let OperandType = "OPERAND_MEMORY" in {
+def offset8 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs8AsmOperand;
+  let PrintMethod = "printMemOffs8"; }
+def offset16 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs16AsmOperand;
+  let PrintMethod = "printMemOffs16"; }
+def offset32 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs32AsmOperand;
+  let PrintMethod = "printMemOffs32"; }
+def offset64 : Operand<i64> {
+  let ParserMatchClass = X86MemOffs64AsmOperand;
+  let PrintMethod = "printMemOffs64"; }
+}
+
+
 def SSECC : Operand<i8> {
   let PrintMethod = "printSSECC";
   let OperandType = "OPERAND_IMMEDIATE";
@@ -443,6 +502,14 @@ class ImmZExtAsmOperandClass : AsmOperandClass {
   let RenderMethod = "addImmOperands";
 }
 
+def X86GR32orGR64AsmOperand : AsmOperandClass {
+  let Name = "GR32orGR64";
+}
+
+def GR32orGR64 : RegisterOperand<GR32> {
+  let ParserMatchClass = X86GR32orGR64AsmOperand;
+}
+
 // Sign-extended immediate classes. We don't need to define the full lattice
 // here because there is no instruction with an ambiguity between ImmSExti64i32
 // and ImmSExti32i8.
@@ -523,8 +590,7 @@ def i64i8imm   : Operand<i64> {
 
 def lea64_32mem : Operand<i32> {
   let PrintMethod = "printi32mem";
-  let AsmOperandLowerMethod = "lower_lea64_32mem";
-  let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm, i8imm);
+  let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
   let ParserMatchClass = X86MemAsmOperand;
 }
 
@@ -546,7 +612,7 @@ def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
                                [add, sub, mul, X86mul_imm, shl, or, frameindex],
                                []>;
 // In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
-def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEA64_32Addr",
                                   [add, sub, mul, X86mul_imm, shl, or,
                                    frameindex, X86WrapperRIP],
                                   []>;
@@ -591,13 +657,22 @@ def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
 def HasAVX       : Predicate<"Subtarget->hasAVX()">;
 def HasAVX2      : Predicate<"Subtarget->hasAVX2()">;
 def HasAVX1Only  : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
+def HasAVX512    : Predicate<"Subtarget->hasAVX512()">;
+def UseAVX       : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX512()">;
+def UseAVX2      : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">;
+def NoAVX512       : Predicate<"!Subtarget->hasAVX512()">;
+def HasCDI       : Predicate<"Subtarget->hasCDI()">;
+def HasPFI       : Predicate<"Subtarget->hasPFI()">;
+def HasERI       : Predicate<"Subtarget->hasERI()">;
 
 def HasPOPCNT    : Predicate<"Subtarget->hasPOPCNT()">;
 def HasAES       : Predicate<"Subtarget->hasAES()">;
 def HasPCLMUL    : Predicate<"Subtarget->hasPCLMUL()">;
 def HasFMA       : Predicate<"Subtarget->hasFMA()">;
+def UseFMAOnAVX  : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
 def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
 def HasXOP       : Predicate<"Subtarget->hasXOP()">;
+def HasTBM       : Predicate<"Subtarget->hasTBM()">;
 def HasMOVBE     : Predicate<"Subtarget->hasMOVBE()">;
 def HasRDRAND    : Predicate<"Subtarget->hasRDRAND()">;
 def HasF16C      : Predicate<"Subtarget->hasF16C()">;
@@ -609,9 +684,10 @@ def HasRTM       : Predicate<"Subtarget->hasRTM()">;
 def HasHLE       : Predicate<"Subtarget->hasHLE()">;
 def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
 def HasADX       : Predicate<"Subtarget->hasADX()">;
+def HasSHA       : Predicate<"Subtarget->hasSHA()">;
 def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
 def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
-def HasPrefetchW : Predicate<"Subtarget->has3DNow() || Subtarget->hasPRFCHW()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -804,11 +880,11 @@ def POP32r  : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
                 IIC_POP_REG>;
 def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", [],
                 IIC_POP_REG>, OpSize;
-def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", [],
+def POP16rmm: I<0x8F, MRM0m, (outs), (ins i16mem:$dst), "pop{w}\t$dst", [],
                 IIC_POP_MEM>, OpSize;
 def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", [],
                 IIC_POP_REG>;
-def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", [],
+def POP32rmm: I<0x8F, MRM0m, (outs), (ins i32mem:$dst), "pop{l}\t$dst", [],
                 IIC_POP_MEM>;
 
 def POPF16   : I<0x9D, RawFrm, (outs), (ins), "popf{w}", [], IIC_POP_F>, OpSize;
@@ -852,7 +928,7 @@ def POP64r   : I<0x58, AddRegFrm,
                  (outs GR64:$reg), (ins), "pop{q}\t$reg", [], IIC_POP_REG>;
 def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", [],
                 IIC_POP_REG>;
-def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", [],
+def POP64rmm: I<0x8F, MRM0m, (outs), (ins i64mem:$dst), "pop{q}\t$dst", [],
                 IIC_POP_MEM>;
 } // mayLoad, SchedRW
 let mayStore = 1, SchedRW = [WriteStore] in {
@@ -884,12 +960,12 @@ def PUSHF64    : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
 
 let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
     mayLoad = 1, neverHasSideEffects = 1, SchedRW = [WriteLoad] in {
-def POPA32   : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>,
+def POPA32   : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>,
                Requires<[In32BitMode]>;
 }
 let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
     mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
-def PUSHA32  : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>,
+def PUSHA32  : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>,
                Requires<[In32BitMode]>;
 }
 
@@ -910,53 +986,56 @@ let Defs = [EFLAGS] in {
 def BSF16rr  : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                  "bsf{w}\t{$src, $dst|$dst, $src}",
                  [(set GR16:$dst, EFLAGS, (X86bsf GR16:$src))],
-                  IIC_BSF>, TB, OpSize, Sched<[WriteShift]>;
+                  IIC_BIT_SCAN_REG>, TB, OpSize, Sched<[WriteShift]>;
 def BSF16rm  : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                  "bsf{w}\t{$src, $dst|$dst, $src}",
                  [(set GR16:$dst, EFLAGS, (X86bsf (loadi16 addr:$src)))],
-                  IIC_BSF>, TB, OpSize, Sched<[WriteShiftLd]>;
+                  IIC_BIT_SCAN_MEM>, TB, OpSize, Sched<[WriteShiftLd]>;
 def BSF32rr  : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                  "bsf{l}\t{$src, $dst|$dst, $src}",
-                 [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))], IIC_BSF>, TB,
+                 [(set GR32:$dst, EFLAGS, (X86bsf GR32:$src))],
+                 IIC_BIT_SCAN_REG>, TB,
                Sched<[WriteShift]>;
 def BSF32rm  : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                  "bsf{l}\t{$src, $dst|$dst, $src}",
                  [(set GR32:$dst, EFLAGS, (X86bsf (loadi32 addr:$src)))],
-                 IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+                 IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 def BSF64rr  : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                   "bsf{q}\t{$src, $dst|$dst, $src}",
                   [(set GR64:$dst, EFLAGS, (X86bsf GR64:$src))],
-                  IIC_BSF>, TB, Sched<[WriteShift]>;
+                  IIC_BIT_SCAN_REG>, TB, Sched<[WriteShift]>;
 def BSF64rm  : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                   "bsf{q}\t{$src, $dst|$dst, $src}",
                   [(set GR64:$dst, EFLAGS, (X86bsf (loadi64 addr:$src)))],
-                  IIC_BSF>, TB, Sched<[WriteShiftLd]>;
+                  IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 
 def BSR16rr  : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                  "bsr{w}\t{$src, $dst|$dst, $src}",
-                 [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))], IIC_BSR>,
+                 [(set GR16:$dst, EFLAGS, (X86bsr GR16:$src))],
+                 IIC_BIT_SCAN_REG>,
                  TB, OpSize, Sched<[WriteShift]>;
 def BSR16rm  : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                  "bsr{w}\t{$src, $dst|$dst, $src}",
                  [(set GR16:$dst, EFLAGS, (X86bsr (loadi16 addr:$src)))],
-                 IIC_BSR>, TB,
+                 IIC_BIT_SCAN_MEM>, TB,
                  OpSize, Sched<[WriteShiftLd]>;
 def BSR32rr  : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                  "bsr{l}\t{$src, $dst|$dst, $src}",
-                 [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))], IIC_BSR>, TB,
+                 [(set GR32:$dst, EFLAGS, (X86bsr GR32:$src))],
+                 IIC_BIT_SCAN_REG>, TB,
                Sched<[WriteShift]>;
 def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                  "bsr{l}\t{$src, $dst|$dst, $src}",
                  [(set GR32:$dst, EFLAGS, (X86bsr (loadi32 addr:$src)))],
-                 IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+                 IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 def BSR64rr  : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                   "bsr{q}\t{$src, $dst|$dst, $src}",
-                  [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BSR>, TB,
+                  [(set GR64:$dst, EFLAGS, (X86bsr GR64:$src))], IIC_BIT_SCAN_REG>, TB,
                Sched<[WriteShift]>;
 def BSR64rm  : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                   "bsr{q}\t{$src, $dst|$dst, $src}",
                   [(set GR64:$dst, EFLAGS, (X86bsr (loadi64 addr:$src)))],
-                  IIC_BSR>, TB, Sched<[WriteShiftLd]>;
+                  IIC_BIT_SCAN_MEM>, TB, Sched<[WriteShiftLd]>;
 } // Defs = [EFLAGS]
 
 let SchedRW = [WriteMicrocoded] in {
@@ -1038,44 +1117,67 @@ def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
                       [(store i64immSExt32:$src, addr:$dst)], IIC_MOV_MEM>;
 } // SchedRW
 
+let hasSideEffects = 0 in {
+
 /// moffs8, moffs16 and moffs32 versions of moves.  The immediate is a
 /// 32-bit offset from the PC.  These are only valid in x86-32 mode.
 let SchedRW = [WriteALU] in {
+let mayLoad = 1 in {
 def MOV8o8a : Ii32 <0xA0, RawFrm, (outs), (ins offset8:$src),
-                   "mov{b}\t{$src, %al|AL, $src}", [], IIC_MOV_MEM>,
+                   "mov{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
                    Requires<[In32BitMode]>;
 def MOV16o16a : Ii32 <0xA1, RawFrm, (outs), (ins offset16:$src),
-                      "mov{w}\t{$src, %ax|AL, $src}", [], IIC_MOV_MEM>, OpSize,
+                      "mov{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, OpSize,
                      Requires<[In32BitMode]>;
 def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
-                      "mov{l}\t{$src, %eax|EAX, $src}", [], IIC_MOV_MEM>,
+                      "mov{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
                      Requires<[In32BitMode]>;
+}
+let mayStore = 1 in {
 def MOV8ao8 : Ii32 <0xA2, RawFrm, (outs offset8:$dst), (ins),
-                   "mov{b}\t{%al, $dst|$dst, AL}", [], IIC_MOV_MEM>,
+                   "mov{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
                   Requires<[In32BitMode]>;
 def MOV16ao16 : Ii32 <0xA3, RawFrm, (outs offset16:$dst), (ins),
-                      "mov{w}\t{%ax, $dst|$dst, AL}", [], IIC_MOV_MEM>, OpSize,
+                      "mov{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, OpSize,
                      Requires<[In32BitMode]>;
 def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
-                      "mov{l}\t{%eax, $dst|$dst, EAX}", [], IIC_MOV_MEM>,
+                      "mov{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
                      Requires<[In32BitMode]>;
 }
+}
 
-// FIXME: These definitions are utterly broken
-// Just leave them commented out for now because they're useless outside
-// of the large code model, and most compilers won't generate the instructions
-// in question.
-/*
-def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
-                      "mov{q}\t{$src, %rax|RAX, $src}", []>;
-def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
-                       "mov{q}\t{$src, %rax|RAX, $src}", []>;
-def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
-                       "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
-def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
-                       "mov{q}\t{%rax, $dst|$dst, RAX}", []>;
-*/
-
+// These forms all have full 64-bit absolute addresses in their instructions
+// and use the movabs mnemonic to indicate this specific form.
+let mayLoad = 1 in {
+def MOV64o8a : RIi64_NOREX<0xA0, RawFrm, (outs), (ins offset8:$src),
+                     "movabs{b}\t{$src, %al|al, $src}", []>,
+                     Requires<[In64BitMode]>;
+def MOV64o16a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset16:$src),
+                     "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize,
+                     Requires<[In64BitMode]>;
+def MOV64o32a : RIi64_NOREX<0xA1, RawFrm, (outs), (ins offset32:$src),
+                     "movabs{l}\t{$src, %eax|eax, $src}", []>,
+                     Requires<[In64BitMode]>;
+def MOV64o64a : RIi64<0xA1, RawFrm, (outs), (ins offset64:$src),
+                     "movabs{q}\t{$src, %rax|rax, $src}", []>,
+                     Requires<[In64BitMode]>;
+}
+
+let mayStore = 1 in {
+def MOV64ao8 : RIi64_NOREX<0xA2, RawFrm, (outs offset8:$dst), (ins),
+                     "movabs{b}\t{%al, $dst|$dst, al}", []>,
+                     Requires<[In64BitMode]>;
+def MOV64ao16 : RIi64_NOREX<0xA3, RawFrm, (outs offset16:$dst), (ins),
+                     "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize,
+                     Requires<[In64BitMode]>;
+def MOV64ao32 : RIi64_NOREX<0xA3, RawFrm, (outs offset32:$dst), (ins),
+                     "movabs{l}\t{%eax, $dst|$dst, eax}", []>,
+                     Requires<[In64BitMode]>;
+def MOV64ao64 : RIi64<0xA3, RawFrm, (outs offset64:$dst), (ins),
+                     "movabs{q}\t{%rax, $dst|$dst, rax}", []>,
+                     Requires<[In64BitMode]>;
+}
+} // hasSideEffects = 0
 
 let isCodeGenOnly = 1, hasSideEffects = 0, SchedRW = [WriteMove] in {
 def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
@@ -1127,7 +1229,7 @@ def MOV8rr_NOREX : I<0x88, MRMDestReg,
                      (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
                      "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [], IIC_MOV>,
                    Sched<[WriteMove]>;
-let mayStore = 1 in
+let mayStore = 1, neverHasSideEffects = 1 in
 def MOV8mr_NOREX : I<0x88, MRMDestMem,
                      (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
                      "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", [],
@@ -1408,17 +1510,17 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
 
 // Swap between EAX and other registers.
 def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
-                  "xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
+                  "xchg{w}\t{$src, %ax|ax, $src}", [], IIC_XCHG_REG>, OpSize;
 def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
-                  "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+                  "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
                   Requires<[In32BitMode]>;
 // Uses GR32_NOAX in 64-bit mode to prevent encoding using the 0x90 NOP encoding.
 // xchg %eax, %eax needs to clear upper 32-bits of RAX so is not a NOP.
 def XCHG32ar64 : I<0x90, AddRegFrm, (outs), (ins GR32_NOAX:$src),
-                   "xchg{l}\t{$src, %eax|EAX, $src}", [], IIC_XCHG_REG>,
+                   "xchg{l}\t{$src, %eax|eax, $src}", [], IIC_XCHG_REG>,
                    Requires<[In64BitMode]>;
 def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
-                  "xchg{q}\t{$src, %rax|RAX, $src}", [], IIC_XCHG_REG>;
+                  "xchg{q}\t{$src, %rax|rax, $src}", [], IIC_XCHG_REG>;
 } // SchedRW
 
 let SchedRW = [WriteALU] in {
@@ -1768,6 +1870,30 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in {
                                int_x86_bmi_bzhi_64, loadi64>, VEX_W;
 }
 
+def : Pat<(X86bzhi GR32:$src1, GR8:$src2),
+          (BZHI32rr GR32:$src1,
+                    (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2),
+          (BZHI32rm addr:$src1,
+                    (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi GR64:$src1, GR8:$src2),
+          (BZHI64rr GR64:$src1,
+                    (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2),
+          (BZHI64rm addr:$src1,
+                    (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+let Predicates = [HasBMI] in {
+  def : Pat<(X86bextr GR32:$src1, GR32:$src2),
+            (BEXTR32rr GR32:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
+            (BEXTR32rm addr:$src1, GR32:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, GR64:$src2),
+            (BEXTR64rr GR64:$src1, GR64:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
+            (BEXTR64rm addr:$src1, GR64:$src2)>;
+} // HasBMI
+
 multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
                          X86MemOperand x86memop, Intrinsic Int,
                          PatFrag ld_frag> {
@@ -1792,6 +1918,134 @@ let Predicates = [HasBMI2] in {
 }
 
 //===----------------------------------------------------------------------===//
+// TBM Instructions
+//
+let Predicates = [HasTBM], Defs = [EFLAGS] in {
+
+multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
+                                X86MemOperand x86memop, PatFrag ld_frag,
+                                Intrinsic Int, Operand immtype,
+                                SDPatternOperator immoperator> {
+  def ri : Ii32<opc,  MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
+           XOP, XOPA, VEX;
+  def mi : Ii32<opc,  MRMSrcMem, (outs RC:$dst),
+                (ins x86memop:$src1, immtype:$cntl),
+                !strconcat(OpcodeStr,
+                           "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
+                [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
+           XOP, XOPA, VEX;
+}
+
+defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
+                                     int_x86_tbm_bextri_u32, i32imm, imm>;
+defm BEXTRI64 : tbm_ternary_imm_intr<0x10, GR64, "bextr", i64mem, loadi64,
+                                     int_x86_tbm_bextri_u64, i64i32imm,
+                                     i64immSExt32>, VEX_W;
+
+multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
+                         RegisterClass RC, string OpcodeStr,
+                         X86MemOperand x86memop, PatFrag ld_frag> {
+let hasSideEffects = 0 in {
+  def rr : I<opc,  FormReg, (outs RC:$dst), (ins RC:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP, XOP9, VEX_4V;
+  let mayLoad = 1 in
+  def rm : I<opc,  FormMem, (outs RC:$dst), (ins x86memop:$src),
+             !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
+             []>, XOP, XOP9, VEX_4V;
+}
+}
+
+multiclass tbm_binary_intr<bits<8> opc, string OpcodeStr,
+                           Format FormReg, Format FormMem> {
+  defm NAME#32 : tbm_binary_rm<opc, FormReg, FormMem, GR32, OpcodeStr, i32mem,
+                               loadi32>;
+  defm NAME#64 : tbm_binary_rm<opc, FormReg, FormMem, GR64, OpcodeStr, i64mem,
+                               loadi64>, VEX_W;
+}
+
+defm BLCFILL : tbm_binary_intr<0x01, "blcfill", MRM1r, MRM1m>;
+defm BLCI    : tbm_binary_intr<0x02, "blci", MRM6r, MRM6m>;
+defm BLCIC   : tbm_binary_intr<0x01, "blcic", MRM5r, MRM5m>;
+defm BLCMSK  : tbm_binary_intr<0x02, "blcmsk", MRM1r, MRM1m>;
+defm BLCS    : tbm_binary_intr<0x01, "blcs", MRM3r, MRM3m>;
+defm BLSFILL : tbm_binary_intr<0x01, "blsfill", MRM2r, MRM2m>;
+defm BLSIC   : tbm_binary_intr<0x01, "blsic", MRM6r, MRM6m>;
+defm T1MSKC  : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
+defm TZMSK   : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
+} // HasTBM, EFLAGS
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments to auto generate TBM instructions.
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasTBM] in {
+  def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
+            (BEXTRI32ri GR32:$src1, imm:$src2)>;
+  def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
+            (BEXTRI32mi addr:$src1, imm:$src2)>;
+  def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
+            (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
+  def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
+            (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
+
+  // FIXME: patterns for the load versions are not implemented
+  def : Pat<(and GR32:$src, (add GR32:$src, 1)),
+            (BLCFILL32rr GR32:$src)>;
+  def : Pat<(and GR64:$src, (add GR64:$src, 1)),
+            (BLCFILL64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (not (add GR32:$src, 1))),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (not (add GR64:$src, 1))),
+            (BLCI64rr GR64:$src)>;
+
+  // Extra patterns because opt can optimize the above patterns to this.
+  def : Pat<(or GR32:$src, (sub -2, GR32:$src)),
+            (BLCI32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (sub -2, GR64:$src)),
+            (BLCI64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, 1)),
+            (BLCIC32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, 1)),
+            (BLCIC64rr GR64:$src)>;
+
+  def : Pat<(xor GR32:$src, (add GR32:$src, 1)),
+            (BLCMSK32rr GR32:$src)>;
+  def : Pat<(xor GR64:$src, (add GR64:$src, 1)),
+            (BLCMSK64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, 1)),
+            (BLCS32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, 1)),
+            (BLCS64rr GR64:$src)>;
+
+  def : Pat<(or GR32:$src, (add GR32:$src, -1)),
+            (BLSFILL32rr GR32:$src)>;
+  def : Pat<(or GR64:$src, (add GR64:$src, -1)),
+            (BLSFILL64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, -1)),
+            (BLSIC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, -1)),
+            (BLSIC64rr GR64:$src)>;
+
+  def : Pat<(or (not GR32:$src), (add GR32:$src, 1)),
+            (T1MSKC32rr GR32:$src)>;
+  def : Pat<(or (not GR64:$src), (add GR64:$src, 1)),
+            (T1MSKC64rr GR64:$src)>;
+
+  def : Pat<(and (not GR32:$src), (add GR32:$src, -1)),
+            (TZMSK32rr GR32:$src)>;
+  def : Pat<(and (not GR64:$src), (add GR64:$src, -1)),
+            (TZMSK64rr GR64:$src)>;
+} // HasTBM
+
+//===----------------------------------------------------------------------===//
 // Subsystems.
 //===----------------------------------------------------------------------===//
 
@@ -1815,6 +2069,7 @@ include "X86InstrXOP.td"
 
 // SSE, MMX and 3DNow! vector support.
 include "X86InstrSSE.td"
+include "X86InstrAVX512.td"
 include "X86InstrMMX.td"
 include "X86Instr3DNow.td"
 
@@ -1867,6 +2122,9 @@ def : MnemonicAlias<"pushf",  "pushfl", "att">, Requires<[In32BitMode]>;
 def : MnemonicAlias<"pushf",  "pushfq", "att">, Requires<[In64BitMode]>;
 def : MnemonicAlias<"pushfd", "pushfl", "att">;
 
+def : MnemonicAlias<"popad",   "popa", "intel">, Requires<[In32BitMode]>;
+def : MnemonicAlias<"pushad",  "pusha", "intel">, Requires<[In32BitMode]>;
+
 def : MnemonicAlias<"repe",  "rep",   "att">;
 def : MnemonicAlias<"repz",  "rep",   "att">;
 def : MnemonicAlias<"repnz", "repne", "att">;
@@ -1919,29 +2177,31 @@ def : MnemonicAlias<"fucomip",  "fucompi",  "att">;
 def : MnemonicAlias<"fwait",    "wait",     "att">;
 
 
-class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond>
+class CondCodeAlias<string Prefix,string Suffix, string OldCond, string NewCond,
+                    string VariantName>
   : MnemonicAlias<!strconcat(Prefix, OldCond, Suffix),
-                  !strconcat(Prefix, NewCond, Suffix)>;
+                  !strconcat(Prefix, NewCond, Suffix), VariantName>;
 
 /// IntegerCondCodeMnemonicAlias - This multiclass defines a bunch of
 /// MnemonicAlias's that canonicalize the condition code in a mnemonic, for
 /// example "setz" -> "sete".
-multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix> {
-  def C   : CondCodeAlias<Prefix, Suffix, "c",   "b">;   // setc   -> setb
-  def Z   : CondCodeAlias<Prefix, Suffix, "z" ,  "e">;   // setz   -> sete
-  def NA  : CondCodeAlias<Prefix, Suffix, "na",  "be">;  // setna  -> setbe
-  def NB  : CondCodeAlias<Prefix, Suffix, "nb",  "ae">;  // setnb  -> setae
-  def NC  : CondCodeAlias<Prefix, Suffix, "nc",  "ae">;  // setnc  -> setae
-  def NG  : CondCodeAlias<Prefix, Suffix, "ng",  "le">;  // setng  -> setle
-  def NL  : CondCodeAlias<Prefix, Suffix, "nl",  "ge">;  // setnl  -> setge
-  def NZ  : CondCodeAlias<Prefix, Suffix, "nz",  "ne">;  // setnz  -> setne
-  def PE  : CondCodeAlias<Prefix, Suffix, "pe",  "p">;   // setpe  -> setp
-  def PO  : CondCodeAlias<Prefix, Suffix, "po",  "np">;  // setpo  -> setnp
-
-  def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b">;   // setnae -> setb
-  def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a">;   // setnbe -> seta
-  def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l">;   // setnge -> setl
-  def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g">;   // setnle -> setg
+multiclass IntegerCondCodeMnemonicAlias<string Prefix, string Suffix,
+                                        string V = ""> {
+  def C   : CondCodeAlias<Prefix, Suffix, "c",   "b",  V>; // setc   -> setb
+  def Z   : CondCodeAlias<Prefix, Suffix, "z" ,  "e",  V>; // setz   -> sete
+  def NA  : CondCodeAlias<Prefix, Suffix, "na",  "be", V>; // setna  -> setbe
+  def NB  : CondCodeAlias<Prefix, Suffix, "nb",  "ae", V>; // setnb  -> setae
+  def NC  : CondCodeAlias<Prefix, Suffix, "nc",  "ae", V>; // setnc  -> setae
+  def NG  : CondCodeAlias<Prefix, Suffix, "ng",  "le", V>; // setng  -> setle
+  def NL  : CondCodeAlias<Prefix, Suffix, "nl",  "ge", V>; // setnl  -> setge
+  def NZ  : CondCodeAlias<Prefix, Suffix, "nz",  "ne", V>; // setnz  -> setne
+  def PE  : CondCodeAlias<Prefix, Suffix, "pe",  "p",  V>; // setpe  -> setp
+  def PO  : CondCodeAlias<Prefix, Suffix, "po",  "np", V>; // setpo  -> setnp
+
+  def NAE : CondCodeAlias<Prefix, Suffix, "nae", "b",  V>; // setnae -> setb
+  def NBE : CondCodeAlias<Prefix, Suffix, "nbe", "a",  V>; // setnbe -> seta
+  def NGE : CondCodeAlias<Prefix, Suffix, "nge", "l",  V>; // setnge -> setl
+  def NLE : CondCodeAlias<Prefix, Suffix, "nle", "g",  V>; // setnle -> setg
 }
 
 // Aliases for set<CC>
@@ -1949,9 +2209,11 @@ defm : IntegerCondCodeMnemonicAlias<"set", "">;
 // Aliases for j<CC>
 defm : IntegerCondCodeMnemonicAlias<"j", "">;
 // Aliases for cmov<CC>{w,l,q}
-defm : IntegerCondCodeMnemonicAlias<"cmov", "w">;
-defm : IntegerCondCodeMnemonicAlias<"cmov", "l">;
-defm : IntegerCondCodeMnemonicAlias<"cmov", "q">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "w", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "l", "att">;
+defm : IntegerCondCodeMnemonicAlias<"cmov", "q", "att">;
+// No size suffix for intel-style asm.
+defm : IntegerCondCodeMnemonicAlias<"cmov", "", "intel">;
 
 
 //===----------------------------------------------------------------------===//
@@ -1963,75 +2225,83 @@ def : InstAlias<"aad", (AAD8i8 10)>;
 def : InstAlias<"aam", (AAM8i8 10)>;
 
 // Disambiguate the mem/imm form of bt-without-a-suffix as btl.
-def : InstAlias<"bt $imm, $mem", (BT32mi8 i32mem:$mem, i32i8imm:$imm)>;
+// Likewise for btc/btr/bts.
+def : InstAlias<"bt {$imm, $mem|$mem, $imm}",
+                (BT32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btc {$imm, $mem|$mem, $imm}",
+                (BTC32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"btr {$imm, $mem|$mem, $imm}",
+                (BTR32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
+def : InstAlias<"bts {$imm, $mem|$mem, $imm}",
+                (BTS32mi8 i32mem:$mem, i32i8imm:$imm), 0>;
 
 // clr aliases.
-def : InstAlias<"clrb $reg", (XOR8rr  GR8 :$reg, GR8 :$reg)>;
-def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg)>;
-def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg)>;
-def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg)>;
+def : InstAlias<"clrb $reg", (XOR8rr  GR8 :$reg, GR8 :$reg), 0>;
+def : InstAlias<"clrw $reg", (XOR16rr GR16:$reg, GR16:$reg), 0>;
+def : InstAlias<"clrl $reg", (XOR32rr GR32:$reg, GR32:$reg), 0>;
+def : InstAlias<"clrq $reg", (XOR64rr GR64:$reg, GR64:$reg), 0>;
 
 // div and idiv aliases for explicit A register.
-def : InstAlias<"divb $src, %al",  (DIV8r  GR8 :$src)>;
-def : InstAlias<"divw $src, %ax",  (DIV16r GR16:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32r GR32:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64r GR64:$src)>;
-def : InstAlias<"divb $src, %al",  (DIV8m  i8mem :$src)>;
-def : InstAlias<"divw $src, %ax",  (DIV16m i16mem:$src)>;
-def : InstAlias<"divl $src, %eax", (DIV32m i32mem:$src)>;
-def : InstAlias<"divq $src, %rax", (DIV64m i64mem:$src)>;
-def : InstAlias<"idivb $src, %al",  (IDIV8r  GR8 :$src)>;
-def : InstAlias<"idivw $src, %ax",  (IDIV16r GR16:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32r GR32:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64r GR64:$src)>;
-def : InstAlias<"idivb $src, %al",  (IDIV8m  i8mem :$src)>;
-def : InstAlias<"idivw $src, %ax",  (IDIV16m i16mem:$src)>;
-def : InstAlias<"idivl $src, %eax", (IDIV32m i32mem:$src)>;
-def : InstAlias<"idivq $src, %rax", (IDIV64m i64mem:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8r  GR8 :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16r GR16:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32r GR32:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64r GR64:$src)>;
+def : InstAlias<"div{b}\t{$src, %al|al, $src}", (DIV8m  i8mem :$src)>;
+def : InstAlias<"div{w}\t{$src, %ax|ax, $src}", (DIV16m i16mem:$src)>;
+def : InstAlias<"div{l}\t{$src, %eax|eax, $src}", (DIV32m i32mem:$src)>;
+def : InstAlias<"div{q}\t{$src, %rax|rax, $src}", (DIV64m i64mem:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8r  GR8 :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16r GR16:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32r GR32:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64r GR64:$src)>;
+def : InstAlias<"idiv{b}\t{$src, %al|al, $src}", (IDIV8m  i8mem :$src)>;
+def : InstAlias<"idiv{w}\t{$src, %ax|ax, $src}", (IDIV16m i16mem:$src)>;
+def : InstAlias<"idiv{l}\t{$src, %eax|eax, $src}", (IDIV32m i32mem:$src)>;
+def : InstAlias<"idiv{q}\t{$src, %rax|rax, $src}", (IDIV64m i64mem:$src)>;
 
 
 
 // Various unary fpstack operations default to operating on on ST1.
 // For example, "fxch" -> "fxch %st(1)"
 def : InstAlias<"faddp",        (ADD_FPrST0  ST1), 0>;
-def : InstAlias<"fsubp",        (SUBR_FPrST0 ST1)>;
-def : InstAlias<"fsubrp",       (SUB_FPrST0  ST1)>;
-def : InstAlias<"fmulp",        (MUL_FPrST0  ST1)>;
-def : InstAlias<"fdivp",        (DIVR_FPrST0 ST1)>;
-def : InstAlias<"fdivrp",       (DIV_FPrST0  ST1)>;
-def : InstAlias<"fxch",         (XCH_F       ST1)>;
-def : InstAlias<"fcom",         (COM_FST0r   ST1)>;
-def : InstAlias<"fcomp",        (COMP_FST0r  ST1)>;
-def : InstAlias<"fcomi",        (COM_FIr     ST1)>;
-def : InstAlias<"fcompi",       (COM_FIPr    ST1)>;
-def : InstAlias<"fucom",        (UCOM_Fr     ST1)>;
-def : InstAlias<"fucomp",       (UCOM_FPr    ST1)>;
-def : InstAlias<"fucomi",       (UCOM_FIr    ST1)>;
-def : InstAlias<"fucompi",      (UCOM_FIPr   ST1)>;
+def : InstAlias<"fsub{|r}p",    (SUBR_FPrST0 ST1), 0>;
+def : InstAlias<"fsub{r|}p",    (SUB_FPrST0  ST1), 0>;
+def : InstAlias<"fmulp",        (MUL_FPrST0  ST1), 0>;
+def : InstAlias<"fdiv{|r}p",    (DIVR_FPrST0 ST1), 0>;
+def : InstAlias<"fdiv{r|}p",    (DIV_FPrST0  ST1), 0>;
+def : InstAlias<"fxch",         (XCH_F       ST1), 0>;
+def : InstAlias<"fcom",         (COM_FST0r   ST1), 0>;
+def : InstAlias<"fcomp",        (COMP_FST0r  ST1), 0>;
+def : InstAlias<"fcomi",        (COM_FIr     ST1), 0>;
+def : InstAlias<"fcompi",       (COM_FIPr    ST1), 0>;
+def : InstAlias<"fucom",        (UCOM_Fr     ST1), 0>;
+def : InstAlias<"fucomp",       (UCOM_FPr    ST1), 0>;
+def : InstAlias<"fucomi",       (UCOM_FIr    ST1), 0>;
+def : InstAlias<"fucompi",      (UCOM_FIPr   ST1), 0>;
 
 // Handle fmul/fadd/fsub/fdiv instructions with explicitly written st(0) op.
 // For example, "fadd %st(4), %st(0)" -> "fadd %st(4)".  We also disambiguate
 // instructions like "fadd %st(0), %st(0)" as "fadd %st(0)" for consistency with
 // gas.
 multiclass FpUnaryAlias<string Mnemonic, Instruction Inst, bit EmitAlias = 1> {
- def : InstAlias<!strconcat(Mnemonic, " $op, %st(0)"),
+ def : InstAlias<!strconcat(Mnemonic, "\t{$op, %st(0)|st(0), $op}"),
                  (Inst RST:$op), EmitAlias>;
- def : InstAlias<!strconcat(Mnemonic, " %st(0), %st(0)"),
+ def : InstAlias<!strconcat(Mnemonic, "\t{%st(0), %st(0)|st(0), st(0)}"),
                  (Inst ST0), EmitAlias>;
 }
 
 defm : FpUnaryAlias<"fadd",   ADD_FST0r>;
 defm : FpUnaryAlias<"faddp",  ADD_FPrST0, 0>;
 defm : FpUnaryAlias<"fsub",   SUB_FST0r>;
-defm : FpUnaryAlias<"fsubp",  SUBR_FPrST0>;
+defm : FpUnaryAlias<"fsub{|r}p",  SUBR_FPrST0>;
 defm : FpUnaryAlias<"fsubr",  SUBR_FST0r>;
-defm : FpUnaryAlias<"fsubrp", SUB_FPrST0>;
+defm : FpUnaryAlias<"fsub{r|}p", SUB_FPrST0>;
 defm : FpUnaryAlias<"fmul",   MUL_FST0r>;
 defm : FpUnaryAlias<"fmulp",  MUL_FPrST0>;
 defm : FpUnaryAlias<"fdiv",   DIV_FST0r>;
-defm : FpUnaryAlias<"fdivp",  DIVR_FPrST0>;
+defm : FpUnaryAlias<"fdiv{|r}p",  DIVR_FPrST0>;
 defm : FpUnaryAlias<"fdivr",  DIVR_FST0r>;
-defm : FpUnaryAlias<"fdivrp", DIV_FPrST0>;
+defm : FpUnaryAlias<"fdiv{r|}p", DIV_FPrST0>;
 defm : FpUnaryAlias<"fcomi",   COM_FIr, 0>;
 defm : FpUnaryAlias<"fucomi",  UCOM_FIr, 0>;
 defm : FpUnaryAlias<"fcompi",   COM_FIPr>;
@@ -2041,16 +2311,16 @@ defm : FpUnaryAlias<"fucompi",  UCOM_FIPr>;
 // Handle "f{mulp,addp} st(0), $op" the same as "f{mulp,addp} $op", since they
 // commute.  We also allow fdiv[r]p/fsubrp even though they don't commute,
 // solely because gas supports it.
-def : InstAlias<"faddp %st(0), $op", (ADD_FPrST0 RST:$op), 0>;
-def : InstAlias<"fmulp %st(0), $op", (MUL_FPrST0 RST:$op)>;
-def : InstAlias<"fsubp %st(0), $op", (SUBR_FPrST0 RST:$op)>;
-def : InstAlias<"fsubrp %st(0), $op", (SUB_FPrST0 RST:$op)>;
-def : InstAlias<"fdivp %st(0), $op", (DIVR_FPrST0 RST:$op)>;
-def : InstAlias<"fdivrp %st(0), $op", (DIV_FPrST0 RST:$op)>;
+def : InstAlias<"faddp\t{%st(0), $op|$op, st(0)}", (ADD_FPrST0 RST:$op), 0>;
+def : InstAlias<"fmulp\t{%st(0), $op|$op, st(0)}", (MUL_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{|r}p\t{%st(0), $op|$op, st(0)}", (SUBR_FPrST0 RST:$op)>;
+def : InstAlias<"fsub{r|}p\t{%st(0), $op|$op, st(0)}", (SUB_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{|r}p\t{%st(0), $op|$op, st(0)}", (DIVR_FPrST0 RST:$op)>;
+def : InstAlias<"fdiv{r|}p\t{%st(0), $op|$op, st(0)}", (DIV_FPrST0 RST:$op)>;
 
 // We accept "fnstsw %eax" even though it only writes %ax.
-def : InstAlias<"fnstsw %eax", (FNSTSW16r)>;
-def : InstAlias<"fnstsw %al" , (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%eax|eax}", (FNSTSW16r)>;
+def : InstAlias<"fnstsw\t{%al|al}" , (FNSTSW16r)>;
 def : InstAlias<"fnstsw"     , (FNSTSW16r)>;
 
 // lcall and ljmp aliases.  This seems to be an odd mapping in 64-bit mode, but
@@ -2069,12 +2339,12 @@ def : InstAlias<"imulq $imm, $r",(IMUL64rri32 GR64:$r, GR64:$r,i64i32imm:$imm)>;
 def : InstAlias<"imulq $imm, $r", (IMUL64rri8 GR64:$r, GR64:$r, i64i8imm:$imm)>;
 
 // inb %dx -> inb %al, %dx
-def : InstAlias<"inb %dx", (IN8rr)>;
-def : InstAlias<"inw %dx", (IN16rr)>;
-def : InstAlias<"inl %dx", (IN32rr)>;
-def : InstAlias<"inb $port", (IN8ri i8imm:$port)>;
-def : InstAlias<"inw $port", (IN16ri i8imm:$port)>;
-def : InstAlias<"inl $port", (IN32ri i8imm:$port)>;
+def : InstAlias<"inb\t{%dx|dx}", (IN8rr), 0>;
+def : InstAlias<"inw\t{%dx|dx}", (IN16rr), 0>;
+def : InstAlias<"inl\t{%dx|dx}", (IN32rr), 0>;
+def : InstAlias<"inb\t$port", (IN8ri i8imm:$port), 0>;
+def : InstAlias<"inw\t$port", (IN16ri i8imm:$port), 0>;
+def : InstAlias<"inl\t$port", (IN32ri i8imm:$port), 0>;
 
 
 // jmp and call aliases for lcall and ljmp.  jmp $42,$5 -> ljmp
@@ -2102,7 +2372,7 @@ def : InstAlias<"movq $src, $dst",
 
 // movsd with no operands (as opposed to the SSE scalar move of a double) is an
 // alias for movsl. (as in rep; movsd)
-def : InstAlias<"movsd", (MOVSD)>;
+def : InstAlias<"movsd", (MOVSD), 0>;
 
 // movsx aliases
 def : InstAlias<"movsx $src, $dst", (MOVSX16rr8 GR16:$dst, GR8:$src), 0>;
@@ -2123,12 +2393,12 @@ def : InstAlias<"movzx $src, $dst", (MOVZX64rr16_Q GR64:$dst, GR16:$src), 0>;
 // Note: No GR32->GR64 movzx form.
 
 // outb %dx -> outb %al, %dx
-def : InstAlias<"outb %dx", (OUT8rr)>;
-def : InstAlias<"outw %dx", (OUT16rr)>;
-def : InstAlias<"outl %dx", (OUT32rr)>;
-def : InstAlias<"outb $port", (OUT8ir i8imm:$port)>;
-def : InstAlias<"outw $port", (OUT16ir i8imm:$port)>;
-def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
+def : InstAlias<"outb\t{%dx|dx}", (OUT8rr), 0>;
+def : InstAlias<"outw\t{%dx|dx}", (OUT16rr), 0>;
+def : InstAlias<"outl\t{%dx|dx}", (OUT32rr), 0>;
+def : InstAlias<"outb\t$port", (OUT8ir i8imm:$port), 0>;
+def : InstAlias<"outw\t$port", (OUT16ir i8imm:$port), 0>;
+def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>;
 
 // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
 // effect (both store to a 16-bit mem).  Force to sldtw to avoid ambiguity
@@ -2136,19 +2406,19 @@ def : InstAlias<"outl $port", (OUT32ir i8imm:$port)>;
 def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
 
 // shld/shrd op,op -> shld op, op, CL
-def : InstAlias<"shldw $r2, $r1", (SHLD16rrCL GR16:$r1, GR16:$r2)>;
-def : InstAlias<"shldl $r2, $r1", (SHLD32rrCL GR32:$r1, GR32:$r2)>;
-def : InstAlias<"shldq $r2, $r1", (SHLD64rrCL GR64:$r1, GR64:$r2)>;
-def : InstAlias<"shrdw $r2, $r1", (SHRD16rrCL GR16:$r1, GR16:$r2)>;
-def : InstAlias<"shrdl $r2, $r1", (SHRD32rrCL GR32:$r1, GR32:$r2)>;
-def : InstAlias<"shrdq $r2, $r1", (SHRD64rrCL GR64:$r1, GR64:$r2)>;
-
-def : InstAlias<"shldw $reg, $mem", (SHLD16mrCL i16mem:$mem, GR16:$reg)>;
-def : InstAlias<"shldl $reg, $mem", (SHLD32mrCL i32mem:$mem, GR32:$reg)>;
-def : InstAlias<"shldq $reg, $mem", (SHLD64mrCL i64mem:$mem, GR64:$reg)>;
-def : InstAlias<"shrdw $reg, $mem", (SHRD16mrCL i16mem:$mem, GR16:$reg)>;
-def : InstAlias<"shrdl $reg, $mem", (SHRD32mrCL i32mem:$mem, GR32:$reg)>;
-def : InstAlias<"shrdq $reg, $mem", (SHRD64mrCL i64mem:$mem, GR64:$reg)>;
+def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shld{l}\t{$r2, $r1|$r1, $r2}", (SHLD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shld{q}\t{$r2, $r1|$r1, $r2}", (SHLD64rrCL GR64:$r1, GR64:$r2), 0>;
+def : InstAlias<"shrd{w}\t{$r2, $r1|$r1, $r2}", (SHRD16rrCL GR16:$r1, GR16:$r2), 0>;
+def : InstAlias<"shrd{l}\t{$r2, $r1|$r1, $r2}", (SHRD32rrCL GR32:$r1, GR32:$r2), 0>;
+def : InstAlias<"shrd{q}\t{$r2, $r1|$r1, $r2}", (SHRD64rrCL GR64:$r1, GR64:$r2), 0>;
+
+def : InstAlias<"shld{w}\t{$reg, $mem|$mem, $reg}", (SHLD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shld{l}\t{$reg, $mem|$mem, $reg}", (SHLD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shld{q}\t{$reg, $mem|$mem, $reg}", (SHLD64mrCL i64mem:$mem, GR64:$reg), 0>;
+def : InstAlias<"shrd{w}\t{$reg, $mem|$mem, $reg}", (SHRD16mrCL i16mem:$mem, GR16:$reg), 0>;
+def : InstAlias<"shrd{l}\t{$reg, $mem|$mem, $reg}", (SHRD32mrCL i32mem:$mem, GR32:$reg), 0>;
+def : InstAlias<"shrd{q}\t{$reg, $mem|$mem, $reg}", (SHRD64mrCL i64mem:$mem, GR64:$reg), 0>;
 
 /*  FIXME: This is disabled because the asm matcher is currently incapable of
  *  matching a fixed immediate like $1.
@@ -2179,19 +2449,19 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">;
 FIXME */
 
 // test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
-def : InstAlias<"testb $val, $mem", (TEST8rm  GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"testw $val, $mem", (TEST16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"testl $val, $mem", (TEST32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"testq $val, $mem", (TEST64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm  GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>;
 
 // xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
-def : InstAlias<"xchgb $mem, $val", (XCHG8rm  GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"xchgw $mem, $val", (XCHG16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"xchgl $mem, $val", (XCHG32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"xchgq $mem, $val", (XCHG64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm  GR8 :$val, i8mem :$mem)>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>;
 
 // xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
-def : InstAlias<"xchgw %ax, $src", (XCHG16ar GR16:$src)>;
-def : InstAlias<"xchgl %eax, $src", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
-def : InstAlias<"xchgl %eax, $src", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
-def : InstAlias<"xchgq %rax, $src", (XCHG64ar GR64:$src)>;
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[In32BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 49721df..ba58143 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -189,13 +189,14 @@ multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
 multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
                     RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
                     PatFrag ld_frag, string asm, Domain d> {
-  def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst),(ins DstRC:$src1, SrcRC:$src2),
-              asm, [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], 
-              NoItinerary, d>;
-  def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst),
-                   (ins DstRC:$src1, x86memop:$src2), asm,
-              [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], 
-              NoItinerary, d>;
+  def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst),
+                  (ins DstRC:$src1, SrcRC:$src2), asm,
+                  [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+                  NoItinerary, d>;
+  def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst),
+                  (ins DstRC:$src1, x86memop:$src2), asm,
+                  [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+                  NoItinerary, d>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -203,7 +204,7 @@ multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
 //===----------------------------------------------------------------------===//
 
 def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms",
-                     [(int_x86_mmx_emms)]>;
+                     [(int_x86_mmx_emms)], IIC_MMX_EMMS>;
 
 //===----------------------------------------------------------------------===//
 // MMX Scalar Instructions
@@ -235,10 +236,10 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
                           (MMX_X86movd2w (x86mmx VR64:$src)))],
                           IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>;
 
-let neverHasSideEffects = 1 in
 def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
                              "movd\t{$src, $dst|$dst, $src}",
-                             [], IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
+                             [(set VR64:$dst, (bitconvert GR64:$src))],
+                             IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
 
 // These are 64 bit moves, but since the OS X assembler doesn't
 // recognize a register-register movq, we write them as
@@ -249,10 +250,6 @@ def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
                                "movd\t{$src, $dst|$dst, $src}", 
                              [(set GR64:$dst,
                               (bitconvert VR64:$src))], IIC_MMX_MOV_REG_MM>;
-def MMX_MOVD64rrv164 : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
-                             "movd\t{$src, $dst|$dst, $src}",
-                             [(set VR64:$dst,
-                              (bitconvert GR64:$src))], IIC_MMX_MOV_MM_RM>;
 let neverHasSideEffects = 1 in
 def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}", [],
@@ -288,7 +285,7 @@ def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
                                     (i64 (bitconvert (x86mmx VR64:$src))))))],
                               IIC_MMX_MOVQ_RR>;
 
-let neverHasSideEffects = 1 in
+let isCodeGenOnly = 1, hasSideEffects = 1 in {
 def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
                                (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
                                [], IIC_MMX_MOVQ_RR>;
@@ -296,6 +293,7 @@ def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
 def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
                               (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
                               [], IIC_MMX_MOVQ_RR>;
+}
 } // SchedRW
 
 def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
@@ -303,21 +301,15 @@ def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                          [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)],
                          IIC_MMX_MOVQ_RM>, Sched<[WriteStore]>;
 
-let AddedComplexity = 15 in
-// movd to MMX register zero-extends
-def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
-                             "movd\t{$src, $dst|$dst, $src}",
-              [(set VR64:$dst,
-                    (x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))))],
-                            IIC_MMX_MOV_MM_RM>, Sched<[WriteMove]>;
-let AddedComplexity = 20 in
-def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
-                           (ins i32mem:$src),
-                             "movd\t{$src, $dst|$dst, $src}",
-          [(set VR64:$dst,
-                (x86mmx (X86vzmovl (x86mmx
-                                   (scalar_to_vector (loadi32 addr:$src))))))],
-                            IIC_MMX_MOV_MM_RM>, Sched<[WriteLoad]>;
+let Predicates = [HasMMX] in {
+  let AddedComplexity = 15 in
+  // movd to MMX register zero-extends
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
+            (MMX_MOVD64rr GR32:$src)>;
+  let AddedComplexity = 20 in
+  def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
+            (MMX_MOVD64rm addr:$src)>;
+}
 
 // Arithmetic Instructions
 defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
@@ -357,21 +349,21 @@ defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
 defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
                                    MMX_INTALU_ITINS>;
 defm MMX_PSUBW : MMXI_binop_rm_int<0xF9, "psubw", int_x86_mmx_psub_w,
-                                   MMX_INTALU_ITINS, 1>;
+                                   MMX_INTALU_ITINS>;
 defm MMX_PSUBD : MMXI_binop_rm_int<0xFA, "psubd", int_x86_mmx_psub_d,
-                                   MMX_INTALU_ITINS, 1>;
+                                   MMX_INTALU_ITINS>;
 defm MMX_PSUBQ : MMXI_binop_rm_int<0xFB, "psubq", int_x86_mmx_psub_q,
-                                   MMX_INTALUQ_ITINS, 1>;
+                                   MMX_INTALUQ_ITINS>;
 
 defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b,
-                                   MMX_INTALU_ITINS, 1>;
+                                   MMX_INTALU_ITINS>;
 defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w,
-                                   MMX_INTALU_ITINS, 1>;
+                                   MMX_INTALU_ITINS>;
 
 defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b,
-                                   MMX_INTALU_ITINS, 1>;
+                                   MMX_INTALU_ITINS>;
 defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w,
-                                   MMX_INTALU_ITINS, 1>;
+                                   MMX_INTALU_ITINS>;
 
 defm MMX_PHSUBW  : SS3I_binop_rm_int_mm<0x05, "phsubw", int_x86_ssse3_phsub_w,
                                    MMX_PHADDSUBW>;
@@ -554,18 +546,18 @@ let Constraints = "$src1 = $dst" in {
 
 // Extract / Insert
 def MMX_PEXTRWirri: MMXIi8<0xC5, MRMSrcReg,
-                           (outs GR32:$dst), (ins VR64:$src1, i32i8imm:$src2),
-                           "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                           [(set GR32:$dst, (int_x86_mmx_pextr_w VR64:$src1,
-                                             (iPTR imm:$src2)))],
-                           IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
+                       (outs GR32orGR64:$dst), (ins VR64:$src1, i32i8imm:$src2),
+                       "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR32orGR64:$dst, (int_x86_mmx_pextr_w VR64:$src1,
+                                         (iPTR imm:$src2)))],
+                       IIC_MMX_PEXTR>, Sched<[WriteShuffle]>;
 let Constraints = "$src1 = $dst" in {
   def MMX_PINSRWirri : MMXIi8<0xC4, MRMSrcReg,
                       (outs VR64:$dst), 
-                      (ins VR64:$src1, GR32:$src2, i32i8imm:$src3),
+                      (ins VR64:$src1, GR32orGR64:$src2, i32i8imm:$src3),
                       "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                       [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
-                                        GR32:$src2, (iPTR imm:$src3)))],
+                                        GR32orGR64:$src2, (iPTR imm:$src3)))],
                       IIC_MMX_PINSRW>, Sched<[WriteShuffle]>;
 
   def MMX_PINSRWirmi : MMXIi8<0xC4, MRMSrcMem,
@@ -579,9 +571,10 @@ let Constraints = "$src1 = $dst" in {
 }
 
 // Mask creation
-def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+                          (ins VR64:$src),
                           "pmovmskb\t{$src, $dst|$dst, $src}",
-                          [(set GR32:$dst, 
+                          [(set GR32orGR64:$dst,
                                 (int_x86_mmx_pmovmskb VR64:$src))]>;
 
 
@@ -598,10 +591,10 @@ def : Pat<(x86mmx (MMX_X86movdq2q (loadv2i64 addr:$src))),
 // Misc.
 let SchedRW = [WriteShuffle] in {
 let Uses = [EDI] in
-def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
-                        "maskmovq\t{$mask, $src|$src, $mask}",
-                        [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
-                        IIC_MMX_MASKMOV>;
+def MMX_MASKMOVQ : MMXI32<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                          "maskmovq\t{$mask, $src|$src, $mask}",
+                          [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)],
+                          IIC_MMX_MASKMOV>;
 let Uses = [RDI] in
 def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
                            "maskmovq\t{$mask, $src|$src, $mask}",
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index cce938b..a5debc0 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -151,6 +151,34 @@ def SSE_MOVU_ITINS : OpndItins<
   IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
 >;
 
+def SSE_DPPD_ITINS : OpndItins<
+  IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
+>;
+
+def SSE_DPPS_ITINS : OpndItins<
+  IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
+>;
+
+def DEFAULT_ITINS : OpndItins<
+  IIC_ALU_NONMEM, IIC_ALU_MEM
+>;
+
+def SSE_EXTRACT_ITINS : OpndItins<
+  IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
+>;
+
+def SSE_INSERT_ITINS : OpndItins<
+  IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
+>;
+
+def SSE_MPSADBW_ITINS : OpndItins<
+  IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
+>;
+
+def SSE_PMULLD_ITINS : OpndItins<
+  IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
+>;
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 Instructions Classes
 //===----------------------------------------------------------------------===//
@@ -455,10 +483,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
 // SSE 1 & 2 - Move FP Scalar Instructions
 //
 // Move Instructions. Register-to-register movss/movsd is not used for FR32/64
-// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
-// is used instead. Register-to-register movss/movsd is not modeled as an
-// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
-// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
+// register copies because it's a partial register update; Register-to-register
+// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
+// that the insert be implementable in terms of a copy, and just mentioned, we
+// don't use movss/movsd for copies.
 //===----------------------------------------------------------------------===//
 
 multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
@@ -526,7 +554,7 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in {
 }
 
 // Patterns
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
   let AddedComplexity = 15 in {
   // Move scalar to XMM zero-extended, zeroing a VR128 then do a
   // MOVS{S,D} to the lower bits.
@@ -1074,23 +1102,6 @@ let Predicates = [UseSSE1] in {
             (MOVUPSmr addr:$dst, VR128:$src)>;
 }
 
-// Alias instruction to do FR32 or FR64 reg-to-reg copy using movaps. Upper
-// bits are disregarded. FIXME: Set encoding to pseudo!
-let neverHasSideEffects = 1, SchedRW = [WriteMove] in {
-def FsVMOVAPSrr : VPSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
-                       "movaps\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVA_P_RR>, VEX;
-def FsVMOVAPDrr : VPDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
-                       "movapd\t{$src, $dst|$dst, $src}", [],
-                       IIC_SSE_MOVA_P_RR>, VEX;
-def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
-                     "movaps\t{$src, $dst|$dst, $src}", [],
-                     IIC_SSE_MOVA_P_RR>;
-def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
-                     "movapd\t{$src, $dst|$dst, $src}", [],
-                     IIC_SSE_MOVA_P_RR>;
-}
-
 // Alias instruction to load FR32 or FR64 from f128mem using movaps. Upper
 // bits are disregarded. FIXME: Set encoding to pseudo!
 let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in {
@@ -1103,15 +1114,15 @@ let isCodeGenOnly = 1 in {
                          "movapd\t{$src, $dst|$dst, $src}",
                          [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
                          IIC_SSE_MOVA_P_RM>, VEX;
+  def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                       "movaps\t{$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
+                       IIC_SSE_MOVA_P_RM>;
+  def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                       "movapd\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
+                       IIC_SSE_MOVA_P_RM>;
 }
-def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
-                     "movaps\t{$src, $dst|$dst, $src}",
-                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))],
-                     IIC_SSE_MOVA_P_RM>;
-def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
-                     "movapd\t{$src, $dst|$dst, $src}",
-                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))],
-                     IIC_SSE_MOVA_P_RM>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1327,7 +1338,7 @@ let Predicates = [UseSSE2] in {
 // SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
 //===----------------------------------------------------------------------===//
 
-let AddedComplexity = 20 in {
+let AddedComplexity = 20, Predicates = [UseAVX] in {
   def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                       "movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1358,7 +1369,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
                         IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
   // MOVLHPS patterns
   def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
             (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
@@ -1440,7 +1451,7 @@ let neverHasSideEffects = 1 in {
 
 multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
                           X86MemOperand x86memop, string asm> {
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
   def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
               !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
            Sched<[WriteCvtI2F]>;
@@ -1452,6 +1463,7 @@ let neverHasSideEffects = 1 in {
 } // neverHasSideEffects = 1
 }
 
+let Predicates = [UseAVX] in {
 defm VCVTTSS2SI   : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
                                 "cvttss2si\t{$src, $dst|$dst, $src}",
                                 SSE_CVT_SS2SI_32>,
@@ -1485,7 +1497,7 @@ def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
 def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
-
+}
 // The assembler can recognize rr 64-bit instructions by seeing a rxx
 // register, but the same isn't true when only using memory operands,
 // provide other assembly "l" and "q" forms to address this explicitly
@@ -1499,12 +1511,12 @@ defm VCVTSI2SD   : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
 defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
                                   XD, VEX_4V, VEX_W, VEX_LIG;
 
-def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+let Predicates = [UseAVX] in {
+  def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                 (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
-def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
+  def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                 (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
 
-let Predicates = [HasAVX] in {
   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
             (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
   def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1606,19 +1618,21 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
               itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
+let Predicates = [UseAVX] in {
 defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
                   int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
                   SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
 defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
                     int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
                     SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
-
+}
 defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
                  sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
 defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
                    sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
 
 
+let Predicates = [UseAVX] in {
 defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
           int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
           SSE_CVT_Scalar, 0>, XS, VEX_4V;
@@ -1633,7 +1647,7 @@ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
           int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
           SSE_CVT_Scalar, 0>, XD,
           VEX_4V, VEX_W;
-
+}
 let Constraints = "$src1 = $dst" in {
   defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
                         int_x86_sse_cvtsi2ss, i32mem, loadi32,
@@ -1652,6 +1666,7 @@ let Constraints = "$src1 = $dst" in {
 /// SSE 1 Only
 
 // Aliases for intrinsics
+let Predicates = [UseAVX] in {
 defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                     ssmem, sse_load_f32, "cvttss2si",
                                     SSE_CVT_SS2SI_32>, XS, VEX;
@@ -1666,6 +1681,7 @@ defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                   "cvttsd2si", SSE_CVT_SD2SI>,
                                   XD, VEX, VEX_W;
+}
 defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
                                     ssmem, sse_load_f32, "cvttss2si",
                                     SSE_CVT_SS2SI_32>, XS;
@@ -1679,13 +1695,14 @@ defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
                                   int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
                                   "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
 
+let Predicates = [UseAVX] in {
 defm VCVTSS2SI   : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                   ssmem, sse_load_f32, "cvtss2si",
                                   SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
 defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
                                   ssmem, sse_load_f32, "cvtss2si",
                                   SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
-
+}
 defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
                                ssmem, sse_load_f32, "cvtss2si",
                                SSE_CVT_SS2SI_32>, XS;
@@ -1707,6 +1724,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
                             SSEPackedSingle, SSE_CVT_PS>,
                             TB, Requires<[UseSSE2]>;
 
+let Predicates = [UseAVX] in {
 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
                 (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
 def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1723,6 +1741,7 @@ def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
 def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
                 (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+}
 
 def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
                 (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
@@ -1744,7 +1763,7 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
 /// SSE 2 Only
 
 // Convert scalar double to scalar single
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
                        (ins FR64:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
@@ -1760,7 +1779,7 @@ def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
 }
 
 def : Pat<(f32 (fround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
-          Requires<[HasAVX]>;
+          Requires<[UseAVX]>;
 
 def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
                       "cvtsd2ss\t{$src, $dst|$dst, $src}",
@@ -1778,27 +1797,27 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
-                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
+                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[UseAVX]>,
                        Sched<[WriteCvtF2F]>;
 def Int_VCVTSD2SSrm: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                           VR128:$src1, sse_load_f64:$src2))],
-                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
+                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[UseAVX]>,
                        Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 
 let Constraints = "$src1 = $dst" in {
 def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
-                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
                        IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
                        Sched<[WriteCvtF2F]>;
 def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
-                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                           VR128:$src1, sse_load_f64:$src2))],
                        IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
@@ -1807,7 +1826,7 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
 
 // Convert scalar single to scalar double
 // SSE2 instructions with XS prefix
-let neverHasSideEffects = 1 in {
+let neverHasSideEffects = 1, Predicates = [UseAVX] in {
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
                     (ins FR32:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1824,16 +1843,16 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
 }
 
 def : Pat<(f64 (fextend FR32:$src)),
-    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
 def : Pat<(fextend (loadf32 addr:$src)),
-    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
 
 def : Pat<(extloadf32 addr:$src),
     (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
-    Requires<[HasAVX, OptForSize]>;
+    Requires<[UseAVX, OptForSize]>;
 def : Pat<(extloadf32 addr:$src),
     (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
-    Requires<[HasAVX, OptForSpeed]>;
+    Requires<[UseAVX, OptForSpeed]>;
 
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
                    "cvtss2sd\t{$src, $dst|$dst, $src}",
@@ -1861,14 +1880,14 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
-                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
+                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[UseAVX]>,
                     Sched<[WriteCvtF2F]>;
 def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
-                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
+                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[UseAVX]>,
                     Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
 def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
@@ -1895,7 +1914,7 @@ def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
+                         (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
                        IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1905,7 +1924,7 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
-                          (int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
+                          (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
                         IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtps2dq\t{$src, $dst|$dst, $src}",
@@ -1934,7 +1953,7 @@ def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
 def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_cvtpd2dq (memopv2f64 addr:$src)))]>, VEX,
+                         (int_x86_sse2_cvtpd2dq (loadv2f64 addr:$src)))]>, VEX,
                        Sched<[WriteCvtF2ILd]>;
 
 // YMM only
@@ -1946,7 +1965,7 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (int_x86_avx_cvt_pd2dq_256 (memopv4f64 addr:$src)))]>,
+                         (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
                 (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
@@ -1972,7 +1991,7 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttps2dq
-                                            (memopv4f32 addr:$src)))],
+                                            (loadv4f32 addr:$src)))],
                          IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1982,7 +2001,7 @@ def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
-                                             (memopv8f32 addr:$src)))],
+                                             (loadv8f32 addr:$src)))],
                           IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
                           Sched<[WriteCvtF2ILd]>;
 
@@ -1999,27 +2018,27 @@ def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 let Predicates = [HasAVX] in {
   def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
             (VCVTDQ2PSrr VR128:$src)>;
-  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
             (VCVTDQ2PSrm addr:$src)>;
 
   def : Pat<(int_x86_sse2_cvtdq2ps VR128:$src),
             (VCVTDQ2PSrr VR128:$src)>;
-  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (memopv2i64 addr:$src))),
+  def : Pat<(int_x86_sse2_cvtdq2ps (bc_v4i32 (loadv2i64 addr:$src))),
             (VCVTDQ2PSrm addr:$src)>;
 
   def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
             (VCVTTPS2DQrr VR128:$src)>;
-  def : Pat<(v4i32 (fp_to_sint (memopv4f32 addr:$src))),
+  def : Pat<(v4i32 (fp_to_sint (loadv4f32 addr:$src))),
             (VCVTTPS2DQrm addr:$src)>;
 
   def : Pat<(v8f32 (sint_to_fp (v8i32 VR256:$src))),
             (VCVTDQ2PSYrr VR256:$src)>;
-  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (memopv4i64 addr:$src)))),
+  def : Pat<(v8f32 (sint_to_fp (bc_v8i32 (loadv4i64 addr:$src)))),
             (VCVTDQ2PSYrm addr:$src)>;
 
   def : Pat<(v8i32 (fp_to_sint (v8f32 VR256:$src))),
             (VCVTTPS2DQYrr VR256:$src)>;
-  def : Pat<(v8i32 (fp_to_sint (memopv8f32 addr:$src))),
+  def : Pat<(v8i32 (fp_to_sint (loadv8f32 addr:$src))),
             (VCVTTPS2DQYrm addr:$src)>;
 }
 
@@ -2056,7 +2075,7 @@ def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
 def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
-                                            (memopv2f64 addr:$src)))],
+                                            (loadv2f64 addr:$src)))],
                          IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
 
 // YMM only
@@ -2068,7 +2087,7 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
-                          (int_x86_avx_cvtt_pd2dq_256 (memopv4f64 addr:$src)))],
+                          (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                          IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
@@ -2076,7 +2095,7 @@ def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
             (VCVTTPD2DQYrr VR256:$src)>;
-  def : Pat<(v4i32 (fp_to_sint (memopv4f64 addr:$src))),
+  def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
             (VCVTTPD2DQYrm addr:$src)>;
 } // Predicates = [HasAVX]
 
@@ -2110,7 +2129,7 @@ def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
-                       (int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
+                       (int_x86_avx_cvt_ps2_pd_256 (loadv4f32 addr:$src)))],
                      IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
 }
 
@@ -2140,7 +2159,7 @@ def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst,
                        (int_x86_avx_cvtdq2_pd_256
-                        (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L,
+                        (bitconvert (loadv2i64 addr:$src))))]>, VEX, VEX_L,
                     Sched<[WriteCvtI2FLd]>;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                      "vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2162,7 +2181,7 @@ def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 let Predicates = [HasAVX] in {
   def : Pat<(v4f64 (sint_to_fp (v4i32 VR128:$src))),
             (VCVTDQ2PDYrr VR128:$src)>;
-  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))),
             (VCVTDQ2PDYrm addr:$src)>;
 } // Predicates = [HasAVX]
 
@@ -2181,7 +2200,7 @@ def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
 def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvtpd2psx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (int_x86_sse2_cvtpd2ps (memopv2f64 addr:$src)))],
+                          (int_x86_sse2_cvtpd2ps (loadv2f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
 
 // YMM only
@@ -2193,7 +2212,7 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (int_x86_avx_cvt_pd2_ps_256 (memopv4f64 addr:$src)))],
+                          (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
 def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
                 (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
@@ -2215,13 +2234,13 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
 let Predicates = [HasAVX] in {
   def : Pat<(int_x86_avx_cvtdq2_ps_256 VR256:$src),
             (VCVTDQ2PSYrr VR256:$src)>;
-  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (memopv4i64 addr:$src))),
+  def : Pat<(int_x86_avx_cvtdq2_ps_256 (bitconvert (loadv4i64 addr:$src))),
             (VCVTDQ2PSYrm addr:$src)>;
 
   // Match fround and fextend for 128/256-bit conversions
   def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
             (VCVTPD2PSrr VR128:$src)>;
-  def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+  def : Pat<(v4f32 (X86vfpround (loadv2f64 addr:$src))),
             (VCVTPD2PSXrm addr:$src)>;
   def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
             (VCVTPD2PSYrr VR256:$src)>;
@@ -2299,7 +2318,7 @@ let Constraints = "$src1 = $dst" in {
   defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmpsd, f64, loadf64,
                   "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                   "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                  SSE_ALU_F32S>, // same latency as 32 bit compare
+                  SSE_ALU_F64S>,
                   XD;
 }
 
@@ -2334,7 +2353,7 @@ let Constraints = "$src1 = $dst" in {
                        SSE_ALU_F32S>, XS;
   defm Int_CMPSD  : sse12_cmp_scalar_int<f64mem, SSECC, int_x86_sse2_cmp_sd,
                        "cmp${cc}sd\t{$src, $dst|$dst, $src}",
-                       SSE_ALU_F32S>, // same latency as f32
+                       SSE_ALU_F64S>,
                        XD;
 }
 
@@ -2342,90 +2361,88 @@ let Constraints = "$src1 = $dst" in {
 // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
 multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
                             ValueType vt, X86MemOperand x86memop,
-                            PatFrag ld_frag, string OpcodeStr, Domain d> {
-  def rr: PI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
+                            PatFrag ld_frag, string OpcodeStr> {
+  def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
-                     IIC_SSE_COMIS_RR, d>,
+                     IIC_SSE_COMIS_RR>,
           Sched<[WriteFAdd]>;
-  def rm: PI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
+  def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
                      [(set EFLAGS, (OpNode (vt RC:$src1),
                                            (ld_frag addr:$src2)))],
-                                           IIC_SSE_COMIS_RM, d>,
+                                           IIC_SSE_COMIS_RM>,
           Sched<[WriteFAddLd, ReadAfterLd]>;
 }
 
 let Defs = [EFLAGS] in {
   defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
-                                  "ucomiss", SSEPackedSingle>, TB, VEX, VEX_LIG;
+                                  "ucomiss">, TB, VEX, VEX_LIG;
   defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
-                                  "ucomisd", SSEPackedDouble>, TB, OpSize, VEX,
-                                  VEX_LIG;
+                                  "ucomisd">, TB, OpSize, VEX, VEX_LIG;
   let Pattern = []<dag> in {
     defm VCOMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
-                                    "comiss", SSEPackedSingle>, TB, VEX,
-                                    VEX_LIG;
+                                    "comiss">, TB, VEX, VEX_LIG;
     defm VCOMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
-                                    "comisd", SSEPackedDouble>, TB, OpSize, VEX,
-                                    VEX_LIG;
+                                    "comisd">, TB, OpSize, VEX, VEX_LIG;
   }
 
   defm Int_VUCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
-                            load, "ucomiss", SSEPackedSingle>, TB, VEX;
+                            load, "ucomiss">, TB, VEX;
   defm Int_VUCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
-                            load, "ucomisd", SSEPackedDouble>, TB, OpSize, VEX;
+                            load, "ucomisd">, TB, OpSize, VEX;
 
   defm Int_VCOMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem,
-                            load, "comiss", SSEPackedSingle>, TB, VEX;
+                            load, "comiss">, TB, VEX;
   defm Int_VCOMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem,
-                            load, "comisd", SSEPackedDouble>, TB, OpSize, VEX;
+                            load, "comisd">, TB, OpSize, VEX;
   defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
-                                  "ucomiss", SSEPackedSingle>, TB;
+                                  "ucomiss">, TB;
   defm UCOMISD  : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
-                                  "ucomisd", SSEPackedDouble>, TB, OpSize;
+                                  "ucomisd">, TB, OpSize;
 
   let Pattern = []<dag> in {
     defm COMISS  : sse12_ord_cmp<0x2F, VR128, undef, v4f32, f128mem, load,
-                                    "comiss", SSEPackedSingle>, TB;
+                                    "comiss">, TB;
     defm COMISD  : sse12_ord_cmp<0x2F, VR128, undef, v2f64, f128mem, load,
-                                    "comisd", SSEPackedDouble>, TB, OpSize;
+                                    "comisd">, TB, OpSize;
   }
 
   defm Int_UCOMISS  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v4f32, f128mem,
-                              load, "ucomiss", SSEPackedSingle>, TB;
+                              load, "ucomiss">, TB;
   defm Int_UCOMISD  : sse12_ord_cmp<0x2E, VR128, X86ucomi, v2f64, f128mem,
-                              load, "ucomisd", SSEPackedDouble>, TB, OpSize;
+                              load, "ucomisd">, TB, OpSize;
 
   defm Int_COMISS  : sse12_ord_cmp<0x2F, VR128, X86comi, v4f32, f128mem, load,
-                                  "comiss", SSEPackedSingle>, TB;
+                                  "comiss">, TB;
   defm Int_COMISD  : sse12_ord_cmp<0x2F, VR128, X86comi, v2f64, f128mem, load,
-                                  "comisd", SSEPackedDouble>, TB, OpSize;
+                                  "comisd">, TB, OpSize;
 } // Defs = [EFLAGS]
 
 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
                             Operand CC, Intrinsic Int, string asm,
-                            string asm_alt, Domain d> {
+                            string asm_alt, Domain d,
+                            OpndItins itins = SSE_ALU_F32P> {
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
              [(set RC:$dst, (Int RC:$src1, RC:$src2, imm:$cc))],
-             IIC_SSE_CMPP_RR, d>,
+             itins.rr, d>,
             Sched<[WriteFAdd]>;
   def rmi : PIi8<0xC2, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
              [(set RC:$dst, (Int RC:$src1, (memop addr:$src2), imm:$cc))],
-             IIC_SSE_CMPP_RM, d>,
+             itins.rm, d>,
             Sched<[WriteFAddLd, ReadAfterLd]>;
 
   // Accept explicit immediate argument form instead of comparison code.
   let neverHasSideEffects = 1 in {
     def rri_alt : PIi8<0xC2, MRMSrcReg,
                (outs RC:$dst), (ins RC:$src1, RC:$src2, i8imm:$cc),
-               asm_alt, [], IIC_SSE_CMPP_RR, d>, Sched<[WriteFAdd]>;
+               asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
     def rmi_alt : PIi8<0xC2, MRMSrcMem,
                (outs RC:$dst), (ins RC:$src1, x86memop:$src2, i8imm:$cc),
-               asm_alt, [], IIC_SSE_CMPP_RM, d>,
+               asm_alt, [], itins.rm, d>,
                Sched<[WriteFAddLd, ReadAfterLd]>;
   }
 }
@@ -2450,11 +2467,11 @@ let Constraints = "$src1 = $dst" in {
   defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
                  "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                  "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                 SSEPackedSingle>, TB;
+                 SSEPackedSingle, SSE_ALU_F32P>, TB;
   defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
                  "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                  "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
-                 SSEPackedDouble>, TB, OpSize;
+                 SSEPackedDouble, SSE_ALU_F64P>, TB, OpSize;
 }
 
 let Predicates = [HasAVX] in {
@@ -2514,16 +2531,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
 
 defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
            "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           memopv4f32, SSEPackedSingle>, TB, VEX_4V;
+           loadv4f32, SSEPackedSingle>, TB, VEX_4V;
 defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
            "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
+           loadv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
 defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
-           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
-           memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
 defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
-           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
-           memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
+           "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+           loadv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 
 let Constraints = "$src1 = $dst" in {
   defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2538,13 +2555,13 @@ let Constraints = "$src1 = $dst" in {
 
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (X86Shufp VR128:$src1,
-                       (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
+                       (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
             (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
   def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
             (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
 
   def : Pat<(v2i64 (X86Shufp VR128:$src1,
-                       (memopv2i64 addr:$src2), (i8 imm:$imm))),
+                       (loadv2i64 addr:$src2), (i8 imm:$imm))),
             (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
   def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
             (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
@@ -2553,13 +2570,13 @@ let Predicates = [HasAVX] in {
   def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
             (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
   def : Pat<(v8i32 (X86Shufp VR256:$src1,
-                      (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                      (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
             (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
 
   def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
             (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
   def : Pat<(v4i64 (X86Shufp VR256:$src1,
-                              (memopv4i64 addr:$src2), (i8 imm:$imm))),
+                              (loadv4i64 addr:$src2), (i8 imm:$imm))),
             (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
@@ -2603,29 +2620,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
              Sched<[WriteShuffleLd, ReadAfterLd]>;
 }
 
-defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
+defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
       VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V;
-defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
+defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V;
-defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
+defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V;
-defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
+defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
       VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V;
 
-defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
+defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V, VEX_L;
-defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
+defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
-defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
+defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedSingle>, TB, VEX_4V, VEX_L;
-defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
+defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                      SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
 
@@ -2645,20 +2662,20 @@ let Constraints = "$src1 = $dst" in {
 } // Constraints = "$src1 = $dst"
 
 let Predicates = [HasAVX1Only] in {
-  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
             (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+  def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
             (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
 
-  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+  def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
             (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
             (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
-  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+  def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
             (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
   def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
             (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
@@ -2689,13 +2706,10 @@ let Predicates = [UseSSE2] in {
 /// sse12_extr_sign_mask - sse 1 & 2 unpack and interleave
 multiclass sse12_extr_sign_mask<RegisterClass RC, Intrinsic Int, string asm,
                                 Domain d> {
-  def rr32 : PI<0x50, MRMSrcReg, (outs GR32:$dst), (ins RC:$src),
-                !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
-                     [(set GR32:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
-             Sched<[WriteVecLogic]>;
-  def rr64 : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins RC:$src),
-                !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [],
-                IIC_SSE_MOVMSK, d>, REX_W, Sched<[WriteVecLogic]>;
+  def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
+              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+              [(set GR32orGR64:$dst, (Int RC:$src))], IIC_SSE_MOVMSK, d>,
+              Sched<[WriteVecLogic]>;
 }
 
 let Predicates = [HasAVX] in {
@@ -2712,29 +2726,15 @@ let Predicates = [HasAVX] in {
                                         OpSize, VEX, VEX_L;
 
   def : Pat<(i32 (X86fgetsign FR32:$src)),
-            (VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+            (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
   def : Pat<(i64 (X86fgetsign FR32:$src)),
-            (VMOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>;
+            (SUBREG_TO_REG (i64 0),
+             (VMOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>;
   def : Pat<(i32 (X86fgetsign FR64:$src)),
-            (VMOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>;
+            (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
   def : Pat<(i64 (X86fgetsign FR64:$src)),
-            (VMOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
-  // Assembler Only
-  def VMOVMSKPSr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedSingle>, TB, VEX, Sched<[WriteVecLogic]>;
-  def VMOVMSKPDr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedDouble>, TB,
-             OpSize, VEX, Sched<[WriteVecLogic]>;
-  def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-             "movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedSingle>, TB, VEX, VEX_L, Sched<[WriteVecLogic]>;
-  def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-             "movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
-             SSEPackedDouble>, TB,
-             OpSize, VEX, VEX_L, Sched<[WriteVecLogic]>;
+            (SUBREG_TO_REG (i64 0),
+             (VMOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>;
 }
 
 defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
@@ -2743,16 +2743,18 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
                                      SSEPackedDouble>, TB, OpSize;
 
 def : Pat<(i32 (X86fgetsign FR32:$src)),
-          (MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+          (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128))>,
       Requires<[UseSSE1]>;
 def : Pat<(i64 (X86fgetsign FR32:$src)),
-          (MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
+          (SUBREG_TO_REG (i64 0),
+           (MOVMSKPSrr (COPY_TO_REGCLASS FR32:$src, VR128)), sub_32bit)>,
       Requires<[UseSSE1]>;
 def : Pat<(i32 (X86fgetsign FR64:$src)),
-          (MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+          (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128))>,
       Requires<[UseSSE2]>;
 def : Pat<(i64 (X86fgetsign FR64:$src)),
-          (MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
+          (SUBREG_TO_REG (i64 0),
+           (MOVMSKPDrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_32bit)>,
       Requires<[UseSSE2]>;
 
 //===---------------------------------------------------------------------===//
@@ -2791,7 +2793,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          OpndItins itins, bit IsCommutable = 0> {
 let Predicates = [HasAVX] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
-                    VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
 
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2799,7 +2801,7 @@ let Constraints = "$src1 = $dst" in
 
 let Predicates = [HasAVX2] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
-                               OpVT256, VR256, memopv4i64, i256mem, itins,
+                               OpVT256, VR256, loadv4i64, i256mem, itins,
                                IsCommutable, 0>, VEX_4V, VEX_L;
 }
 
@@ -2839,16 +2841,18 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
 }
 
 // Alias bitwise logical operations using SSE logical ops on packed FP values.
-defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
-              SSE_BIT_ITINS_P>;
-defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
-              SSE_BIT_ITINS_P>;
-defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
-              SSE_BIT_ITINS_P>;
-
-let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
-  defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
+let isCodeGenOnly = 1 in {
+  defm FsAND  : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
                 SSE_BIT_ITINS_P>;
+  defm FsOR   : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+                SSE_BIT_ITINS_P>;
+  defm FsXOR  : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+                SSE_BIT_ITINS_P>;
+
+  let isCommutable = 0 in
+    defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", X86fandn,
+                  SSE_BIT_ITINS_P>;
+}
 
 /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
 ///
@@ -2858,14 +2862,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
         !strconcat(OpcodeStr, "ps"), f256mem,
         [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
         [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
-                           (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
+                           (loadv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
 
   defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f256mem,
         [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                   (bc_v4i64 (v4f64 VR256:$src2))))],
         [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
-                                  (memopv4i64 addr:$src2)))], 0>,
+                                  (loadv4i64 addr:$src2)))], 0>,
                                   TB, OpSize, VEX_4V, VEX_L;
 
   // In AVX no need to add a pattern for 128-bit logical rr ps, because they
@@ -2875,14 +2879,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
   defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem, [],
        [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
-                                 (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
+                                 (loadv2i64 addr:$src2)))], 0>, TB, VEX_4V;
 
   defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem,
        [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                  (bc_v2i64 (v2f64 VR128:$src2))))],
        [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
-                                 (memopv2i64 addr:$src2)))], 0>,
+                                 (loadv2i64 addr:$src2)))], 0>,
                                                  TB, OpSize, VEX_4V;
 
   let Constraints = "$src1 = $dst" in {
@@ -2924,120 +2928,93 @@ let isCommutable = 0 in
 
 /// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
 /// classes below
-multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                  SizeItins itins,
-                                  bit Is2Addr = 1> {
-  defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
-                            OpNode, FR32, f32mem,
-                            itins.s, Is2Addr>, XS;
-  defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
-                            OpNode, FR64, f64mem,
-                            itins.d, Is2Addr>, XD;
-}
-
 multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
                                   SDNode OpNode, SizeItins itins> {
-let Predicates = [HasAVX] in {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
-                               VR128, v4f32, f128mem, memopv4f32,
+                               VR128, v4f32, f128mem, loadv4f32,
                                SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
-                               VR128, v2f64, f128mem, memopv2f64,
+                               VR128, v2f64, f128mem, loadv2f64,
                                SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;
 
   defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
-                        OpNode, VR256, v8f32, f256mem, memopv8f32,
+                        OpNode, VR256, v8f32, f256mem, loadv8f32,
                         SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
   defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
-                        OpNode, VR256, v4f64, f256mem, memopv4f64,
+                        OpNode, VR256, v4f64, f256mem, loadv4f64,
                         SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;
-}
 
-let Constraints = "$src1 = $dst" in {
-  defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
-                            v4f32, f128mem, memopv4f32, SSEPackedSingle,
-                            itins.s, 1>, TB;
-  defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
-                            v2f64, f128mem, memopv2f64, SSEPackedDouble,
-                            itins.d, 1>, TB, OpSize;
-}
-}
-
-multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
-                                      SizeItins itins,
-                                      bit Is2Addr = 1> {
-  defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
-     !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
-     itins.s, Is2Addr>, XS;
-  defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
-     !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
-     itins.d, Is2Addr>, XD;
+  let Constraints = "$src1 = $dst" in {
+    defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
+                              v4f32, f128mem, memopv4f32, SSEPackedSingle,
+                              itins.s>, TB;
+    defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
+                              v2f64, f128mem, memopv2f64, SSEPackedDouble,
+                              itins.d>, TB, OpSize;
+  }
 }
 
-// Binary Arithmetic instructions
-defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>;
-defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>;
-let isCommutable = 0 in {
-  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>;
-  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>;
-  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>;
-  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>;
-}
+multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                  SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                         OpNode, FR32, f32mem, itins.s, 0>, XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                         OpNode, FR64, f64mem, itins.d, 0>, XD, VEX_4V, VEX_LIG;
 
-let isCodeGenOnly = 1 in {
-  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
-  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
+                              OpNode, FR32, f32mem, itins.s>, XS;
+    defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
+                              OpNode, FR64, f64mem, itins.d>, XD;
+  }
 }
 
-defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
-            basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
-              VEX_4V, VEX_LIG;
-defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
-            basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
-              VEX_4V, VEX_LIG;
+multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
+                                      SizeItins itins> {
+  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+                   itins.s, 0>, XS, VEX_4V, VEX_LIG;
+  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+                   itins.d, 0>, XD, VEX_4V, VEX_LIG;
 
-let isCommutable = 0 in {
-  defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
-  defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
-  defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
-  defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
-              basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
-                VEX_4V, VEX_LIG;
+  let Constraints = "$src1 = $dst" in {
+    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "ss"), "", "_ss", ssmem, sse_load_f32,
+                   itins.s>, XS;
+    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, VR128,
+                   !strconcat(OpcodeStr, "sd"), "2", "_sd", sdmem, sse_load_f64,
+                   itins.d>, XD;
+  }
 }
 
-let Constraints = "$src1 = $dst" in {
-  defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
-  defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
-
-  let isCommutable = 0 in {
-    defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
-    defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
-    defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
-    defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
-               basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
-  }
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
+           basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
+           basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
+           basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
+let isCommutable = 0 in {
+  defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
+  defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
+  defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
+  defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
+             basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
 }
 
 let isCodeGenOnly = 1 in {
-  defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
-       VEX_4V, VEX_LIG;
-  defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
-       VEX_4V, VEX_LIG;
-  let Constraints = "$src1 = $dst" in {
-    defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
-    defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
-  }
+  defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+  defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
+             basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
 }
 
 /// Unop Arithmetic
@@ -3049,12 +3026,20 @@ let isCodeGenOnly = 1 in {
 /// And, we have a special variant form for a full-vector intrinsic form.
 
 let Sched = WriteFSqrt in {
-def SSE_SQRTP : OpndItins<
-  IIC_SSE_SQRTP_RR, IIC_SSE_SQRTP_RM
+def SSE_SQRTPS : OpndItins<
+  IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
+>;
+
+def SSE_SQRTSS : OpndItins<
+  IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
 >;
 
-def SSE_SQRTS : OpndItins<
-  IIC_SSE_SQRTS_RR, IIC_SSE_SQRTS_RM
+def SSE_SQRTPD : OpndItins<
+  IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
+>;
+
+def SSE_SQRTSD : OpndItins<
+  IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
 >;
 }
 
@@ -3175,7 +3160,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
-                       [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))],
+                       [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
                        itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
@@ -3185,7 +3170,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
-                        [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
+                        [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
                         itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
@@ -3212,7 +3197,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                           !strconcat("v", OpcodeStr,
                           "ps\t{$src, $dst|$dst, $src}"),
-                          [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+                          [(set VR128:$dst, (V4F32Int (loadv4f32 addr:$src)))],
                           itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                             !strconcat("v", OpcodeStr,
@@ -3223,7 +3208,7 @@ let Predicates = [HasAVX] in {
                           (ins f256mem:$src),
                           !strconcat("v", OpcodeStr,
                                     "ps\t{$src, $dst|$dst, $src}"),
-                          [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
+                          [(set VR256:$dst, (V8F32Int (loadv8f32 addr:$src)))],
                           itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
@@ -3293,7 +3278,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
-                       [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))],
+                       [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
                        itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
   def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
@@ -3303,7 +3288,7 @@ let Predicates = [HasAVX] in {
   def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
-                        [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
+                        [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
                         itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
 }
 
@@ -3319,47 +3304,48 @@ let Predicates = [HasAVX] in {
 
 // Square root.
 defm SQRT  : sse1_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse_sqrt_ss,
-                            SSE_SQRTS>,
-             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>,
+                            SSE_SQRTSS>,
+             sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS>,
              sse2_fp_unop_s<0x51, "sqrt",  fsqrt, int_x86_sse2_sqrt_sd,
-                            SSE_SQRTS>,
-             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>;
+                            SSE_SQRTSD>,
+             sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
 
 // Reciprocal approximations. Note that these typically require refinement
 // in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
-             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
+             sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
              sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
-                                int_x86_avx_rsqrt_ps_256, SSE_SQRTP>;
+                                int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
 defm RCP   : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
              sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
              sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
                                 int_x86_avx_rcp_ps_256, SSE_RCPP>;
 
-def : Pat<(f32 (fsqrt FR32:$src)),
-          (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (fsqrt (load addr:$src))),
-          (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-def : Pat<(f64 (fsqrt FR64:$src)),
-          (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
-def : Pat<(f64 (fsqrt (load addr:$src))),
-          (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-
-def : Pat<(f32 (X86frsqrt FR32:$src)),
-          (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (X86frsqrt (load addr:$src))),
-          (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-
-def : Pat<(f32 (X86frcp FR32:$src)),
-          (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
-def : Pat<(f32 (X86frcp (load addr:$src))),
-          (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
-          Requires<[HasAVX, OptForSize]>;
-
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
+  def : Pat<(f32 (fsqrt FR32:$src)),
+            (VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (fsqrt (load addr:$src))),
+            (VSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+  def : Pat<(f64 (fsqrt FR64:$src)),
+            (VSQRTSDr (f64 (IMPLICIT_DEF)), FR64:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f64 (fsqrt (load addr:$src))),
+            (VSQRTSDm (f64 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+
+  def : Pat<(f32 (X86frsqrt FR32:$src)),
+            (VRSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (X86frsqrt (load addr:$src))),
+            (VRSQRTSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+
+  def : Pat<(f32 (X86frcp FR32:$src)),
+            (VRCPSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
+  def : Pat<(f32 (X86frcp (load addr:$src))),
+            (VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
+            Requires<[HasAVX, OptForSize]>;
+}
+let Predicates = [UseAVX] in {
   def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
             (COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
                                         (COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3373,7 +3359,9 @@ let Predicates = [HasAVX] in {
                               VR128)>;
   def : Pat<(int_x86_sse2_sqrt_sd sse_load_f64:$src),
             (VSQRTSDm_Int (v2f64 (IMPLICIT_DEF)), sse_load_f64:$src)>;
+}
 
+let Predicates = [HasAVX] in {
   def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
             (COPY_TO_REGCLASS (VRSQRTSSr (f32 (IMPLICIT_DEF)),
                                          (COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3657,7 +3645,7 @@ def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                  XS, Requires<[UseSSE2]>;
 }
 
-let mayStore = 1, SchedRW = [WriteStore] in {
+let mayStore = 1, neverHasSideEffects = 1, SchedRW = [WriteStore] in {
 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
@@ -3720,7 +3708,7 @@ multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
                              bit IsCommutable = 0> {
 let Predicates = [HasAVX] in
   defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
-                                 VR128, memopv2i64, i128mem, itins,
+                                 VR128, loadv2i64, i128mem, itins,
                                  IsCommutable, 0>, VEX_4V;
 
 let Constraints = "$src1 = $dst" in
@@ -3729,7 +3717,7 @@ let Constraints = "$src1 = $dst" in
 
 let Predicates = [HasAVX2] in
   defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
-                                   VR256, memopv4i64, i256mem, itins,
+                                   VR256, loadv4i64, i256mem, itins,
                                    IsCommutable, 0>, VEX_4V, VEX_L;
 }
 
@@ -3756,11 +3744,11 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
                        (bc_frag (memopv2i64 addr:$src2)))))], itins.rm>,
       Sched<[WriteVecShiftLd, ReadAfterLd]>;
   def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
-       (ins RC:$src1, i32i8imm:$src2),
+       (ins RC:$src1, i8imm:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>,
+       [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))], itins.ri>,
        Sched<[WriteVecShift]>;
 }
 
@@ -3844,15 +3832,15 @@ defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
 defm PAVGW   : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
                                  int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
 defm PSADBW  : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
-                                 int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>;
+                                 int_x86_avx2_psad_bw, SSE_PMADD, 1>;
 
 let Predicates = [HasAVX] in
 defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
-                              memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
+                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
                               VEX_4V;
 let Predicates = [HasAVX2] in
 defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
-                               VR256, memopv4i64, i256mem,
+                               VR256, loadv4i64, i256mem,
                                SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 let Constraints = "$src1 = $dst" in
 defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
@@ -3988,12 +3976,14 @@ let ExeDomain = SSEPackedInt, SchedRW = [WriteVecShift] in {
                        (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                        "pslldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))]>;
+                         (int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2))],
+                         IIC_SSE_INTSHDQ_P_RI>;
   def PSRLDQri : PDIi8<0x73, MRM3r,
                        (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                        "psrldq\t{$src2, $dst|$dst, $src2}",
                        [(set VR128:$dst,
-                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))]>;
+                         (int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2))],
+                         IIC_SSE_INTSHDQ_P_RI>;
   // PSRADQri doesn't exist in SSE[1-3].
 }
 } // Constraints = "$src1 = $dst"
@@ -4077,14 +4067,14 @@ let Predicates = [HasAVX] in {
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR128:$dst,
                         (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
-                      IIC_SSE_PSHUF>, VEX, Sched<[WriteShuffle]>;
+                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
   def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1, i8imm:$src2),
                       !strconcat("v", OpcodeStr,
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR128:$dst,
-                       (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
-                        (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX,
+                       (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
+                        (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
                   Sched<[WriteShuffleLd]>;
 }
 
@@ -4095,14 +4085,14 @@ let Predicates = [HasAVX2] in {
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR256:$dst,
                          (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
-                       IIC_SSE_PSHUF>, VEX, VEX_L, Sched<[WriteShuffle]>;
+                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
   def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                        (ins i256mem:$src1, i8imm:$src2),
                        !strconcat("v", OpcodeStr,
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR256:$dst,
-                        (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)),
-                         (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L,
+                        (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
+                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
                    Sched<[WriteShuffleLd]>;
 }
 
@@ -4113,14 +4103,14 @@ let Predicates = [UseSSE2] in {
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
-                IIC_SSE_PSHUF>, Sched<[WriteShuffle]>;
+                IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
   def mi : Ii8<0x70, MRMSrcMem,
                (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
                !strconcat(OpcodeStr,
                           "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set VR128:$dst,
                   (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
-                          (i8 imm:$src2))))], IIC_SSE_PSHUF>,
+                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
            Sched<[WriteShuffleLd]>;
 }
 }
@@ -4131,7 +4121,7 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
 
 let Predicates = [HasAVX] in {
-  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
             (VPSHUFDmi addr:$src1, imm:$imm)>;
   def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
             (VPSHUFDri VR128:$src1, imm:$imm)>;
@@ -4254,13 +4244,13 @@ let ExeDomain = SSEPackedInt in {
 multiclass sse2_pinsrw<bit Is2Addr = 1> {
   def rri : Ii8<0xC4, MRMSrcReg,
        (outs VR128:$dst), (ins VR128:$src1,
-        GR32:$src2, i32i8imm:$src3),
+        GR32orGR64:$src2, i32i8imm:$src3),
        !if(Is2Addr,
            "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set VR128:$dst,
-         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))], IIC_SSE_PINSRW>,
-       Sched<[WriteShuffle]>;
+         (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
+       IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
   def rmi : Ii8<0xC4, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1,
                         i16mem:$src2, i32i8imm:$src3),
@@ -4276,29 +4266,24 @@ multiclass sse2_pinsrw<bit Is2Addr = 1> {
 // Extract
 let Predicates = [HasAVX] in
 def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
-                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "vpextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
-                                                imm:$src2))]>, TB, OpSize, VEX,
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))]>, TB, OpSize, VEX,
                 Sched<[WriteShuffle]>;
 def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
-                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    (outs GR32orGR64:$dst), (ins VR128:$src1, i32i8imm:$src2),
                     "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
-                                                imm:$src2))], IIC_SSE_PEXTRW>,
+                    [(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                            imm:$src2))], IIC_SSE_PEXTRW>,
                Sched<[WriteShuffleLd, ReadAfterLd]>;
 
 // Insert
-let Predicates = [HasAVX] in {
-  defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
-  def  VPINSRWrr64i : Ii8<0xC4, MRMSrcReg, (outs VR128:$dst),
-       (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
-       "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-       []>, TB, OpSize, VEX_4V, Sched<[WriteShuffle]>;
-}
+let Predicates = [HasAVX] in
+defm VPINSRW : sse2_pinsrw<0>, TB, OpSize, VEX_4V;
 
-let Constraints = "$src1 = $dst" in
-  defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
+let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
+defm PINSRW : sse2_pinsrw, TB, OpSize;
 
 } // ExeDomain = SSEPackedInt
 
@@ -4308,24 +4293,23 @@ let Constraints = "$src1 = $dst" in
 
 let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
 
-def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
-           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
            IIC_SSE_MOVMSK>, VEX;
-def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
-           "pmovmskb\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK>, VEX;
 
 let Predicates = [HasAVX2] in {
-def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
+def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
+           (ins VR256:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
-           [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
-def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
-           "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
+           [(set GR32orGR64:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>,
+           VEX, VEX_L;
 }
 
-def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
-           [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
+           [(set GR32orGR64:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))],
            IIC_SSE_MOVMSK>;
 
 } // ExeDomain = SSEPackedInt
@@ -4336,25 +4320,25 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
 
 let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
 
-let Uses = [EDI] in
+let Uses = [EDI], Predicates = [HasAVX,In32BitMode] in
 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
            IIC_SSE_MASKMOV>, VEX;
-let Uses = [RDI] in
+let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
            IIC_SSE_MASKMOV>, VEX;
 
-let Uses = [EDI] in
+let Uses = [EDI], Predicates = [UseSSE2,In32BitMode] in
 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
            IIC_SSE_MASKMOV>;
-let Uses = [RDI] in
+let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
 def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
@@ -4369,43 +4353,45 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
 //===---------------------------------------------------------------------===//
 // Move Int Doubleword to Packed Double Int
 //
-def VMOVDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                         VEX, Sched<[WriteMove]>;
-def VMOVDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                         IIC_SSE_MOVDQ>,
                       VEX, Sched<[WriteLoad]>;
-def VMOV64toPQIrr : VRPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
-                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOV64toSDrr : VRPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
-                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+let isCodeGenOnly = 1 in
+def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))],
                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
 
-def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
                   Sched<[WriteMove]>;
-def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                       "movd\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (scalar_to_vector (loadi32 addr:$src))))],
                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v2i64 (scalar_to_vector GR64:$src)))],
                           IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+let isCodeGenOnly = 1 in
+def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}",
                        [(set FR64:$dst, (bitconvert GR64:$src))],
                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
@@ -4413,63 +4399,77 @@ def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
 //===---------------------------------------------------------------------===//
 // Move Int Doubleword to Single Scalar
 //
-def VMOVDI2SSrr  : VPDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert GR32:$src))],
-                      IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-
-def VMOVDI2SSrm  : VPDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
-                      IIC_SSE_MOVDQ>,
-                      VEX, Sched<[WriteLoad]>;
-def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert GR32:$src))],
-                      IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
-
-def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
-                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+let isCodeGenOnly = 1 in {
+  def VMOVDI2SSrr  : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+
+  def VMOVDI2SSrm  : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>,
+                        VEX, Sched<[WriteLoad]>;
+  def MOVDI2SSrr  : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert GR32:$src))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
+
+  def MOVDI2SSrm  : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
+                        IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+}
 
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int to Packed Double Int
 //
-def VMOVPDI2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+def VMOVPDI2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                         (iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
                     Sched<[WriteMove]>;
-def VMOVPDI2DImr  : VPDI<0x7E, MRMDestMem, (outs),
+def VMOVPDI2DImr  : VS2I<0x7E, MRMDestMem, (outs),
                        (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (vector_extract (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
                                      VEX, Sched<[WriteLoad]>;
-def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+def MOVPDI2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
                                         (iPTR 0)))], IIC_SSE_MOVD_ToGP>,
                    Sched<[WriteMove]>;
-def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                        "movd\t{$src, $dst|$dst, $src}",
                        [(store (i32 (vector_extract (v4i32 VR128:$src),
                                      (iPTR 0))), addr:$dst)],
                                      IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
 
+def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
+def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
+
+def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
+        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
+
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int first element to Doubleword Int
 //
 let SchedRW = [WriteMove] in {
-def VMOVPQIto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                          "mov{d|q}\t{$src, $dst|$dst, $src}",
+def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
                           [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                            (iPTR 0)))],
                                                            IIC_SSE_MOVD_ToGP>,
                       VEX;
 
-def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
                                                          (iPTR 0)))],
@@ -4479,121 +4479,112 @@ def MOVPQIto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
 //===---------------------------------------------------------------------===//
 // Bitcast FR64 <-> GR64
 //
-let Predicates = [HasAVX] in
-def VMOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
-                        "vmovq\t{$src, $dst|$dst, $src}",
-                        [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
-                        VEX, Sched<[WriteLoad]>;
-def VMOVSDto64rr : VRPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+let isCodeGenOnly = 1 in {
+  let Predicates = [UseAVX] in
+  def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                          "movq\t{$src, $dst|$dst, $src}",
+                          [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
+                          VEX, Sched<[WriteLoad]>;
+  def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst, (bitconvert FR64:$src))],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
+  def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                           "movq\t{$src, $dst|$dst, $src}",
+                           [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
+                           IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+
+  def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                         "movq\t{$src, $dst|$dst, $src}",
+                         [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
+                         IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
+  def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
                          "mov{d|q}\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (bitconvert FR64:$src))],
-                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
-def VMOVSDto64mr : VRPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                         IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
                          "movq\t{$src, $dst|$dst, $src}",
                          [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
-                         IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
-
-def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
-                       "movq\t{$src, $dst|$dst, $src}",
-                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
-                       IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
-def MOVSDto64rr : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
-                       "mov{d|q}\t{$src, $dst|$dst, $src}",
-                       [(set GR64:$dst, (bitconvert FR64:$src))],
-                       IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSDto64mr : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
-                       "movq\t{$src, $dst|$dst, $src}",
-                       [(store (i64 (bitconvert FR64:$src)), addr:$dst)],
-                       IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+                         IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
 
 //===---------------------------------------------------------------------===//
 // Move Scalar Single to Double Int
 //
-def VMOVSS2DIrr  : VPDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set GR32:$dst, (bitconvert FR32:$src))],
-                      IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
-def VMOVSS2DImr  : VPDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
-                      IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
-def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(set GR32:$dst, (bitconvert FR32:$src))],
-                      IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
-def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
-                      "movd\t{$src, $dst|$dst, $src}",
-                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
-                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+let isCodeGenOnly = 1 in {
+  def VMOVSS2DIrr  : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
+  def VMOVSS2DImr  : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
+  def MOVSS2DIrr  : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set GR32:$dst, (bitconvert FR32:$src))],
+                        IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
+  def MOVSS2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(store (i32 (bitconvert FR32:$src)), addr:$dst)],
+                        IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
+}
 
 //===---------------------------------------------------------------------===//
 // Patterns and instructions to describe movd/movq to XMM register zero-extends
 //
-let SchedRW = [WriteMove] in {
+let isCodeGenOnly = 1, SchedRW = [WriteMove] in {
 let AddedComplexity = 15 in {
-def VMOVZDI2PDIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (v4i32 (X86vzmovl
-                                      (v4i32 (scalar_to_vector GR32:$src)))))],
-                                      IIC_SSE_MOVDQ>, VEX;
-def VMOVZQI2PQIrr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
-                       "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
+def VMOVZQI2PQIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                       "movq\t{$src, $dst|$dst, $src}", // X86-64 only
                        [(set VR128:$dst, (v2i64 (X86vzmovl
                                       (v2i64 (scalar_to_vector GR64:$src)))))],
                                       IIC_SSE_MOVDQ>,
                                       VEX, VEX_W;
-}
-let AddedComplexity = 15 in {
-def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst, (v4i32 (X86vzmovl
-                                      (v4i32 (scalar_to_vector GR32:$src)))))],
-                                      IIC_SSE_MOVDQ>;
-def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+def MOVZQI2PQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
                        "mov{d|q}\t{$src, $dst|$dst, $src}", // X86-64 only
                        [(set VR128:$dst, (v2i64 (X86vzmovl
                                       (v2i64 (scalar_to_vector GR64:$src)))))],
                                       IIC_SSE_MOVDQ>;
 }
-} // SchedRW
+} // isCodeGenOnly, SchedRW
 
-let AddedComplexity = 20, SchedRW = [WriteLoad] in {
-def VMOVZDI2PDIrm : VPDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
-                                                   (loadi32 addr:$src))))))],
-                                                   IIC_SSE_MOVDQ>, VEX;
-def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
-                       "movd\t{$src, $dst|$dst, $src}",
-                       [(set VR128:$dst,
-                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
-                                                   (loadi32 addr:$src))))))],
-                                                   IIC_SSE_MOVDQ>;
-} // AddedComplexity, SchedRW
+let Predicates = [UseAVX] in {
+  let AddedComplexity = 15 in
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (VMOVDI2PDIrr GR32:$src)>;
 
-let Predicates = [HasAVX] in {
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
   let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
-              (VMOVZDI2PDIrm addr:$src)>;
+              (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
-              (VMOVZDI2PDIrm addr:$src)>;
+              (VMOVDI2PDIrm addr:$src)>;
   }
   // Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
   def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                                (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
-            (SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
+            (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
   def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                                (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
             (SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
 }
 
-let Predicates = [UseSSE2], AddedComplexity = 20 in {
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
-            (MOVZDI2PDIrm addr:$src)>;
-  def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
-            (MOVZDI2PDIrm addr:$src)>;
+let Predicates = [UseSSE2] in {
+  let AddedComplexity = 15 in
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
+              (MOVDI2PDIrr GR32:$src)>;
+
+  let AddedComplexity = 20 in {
+    def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+    def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+              (MOVDI2PDIrm addr:$src)>;
+  }
 }
 
 // These are the correct encodings of the instructions so that we know how to
@@ -4602,15 +4593,12 @@ let Predicates = [UseSSE2], AddedComplexity = 20 in {
 def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                 (MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
 def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (MOV64toSDrr FR64:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
                 (MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (MOVSDto64rr GR64:$dst, FR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (VMOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
-def : InstAlias<"movq\t{$src, $dst|$dst, $src}",
-                (MOVZQI2PQIrr VR128:$dst, GR64:$src), 0>;
+// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
+def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
+                (VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Move Quadword
@@ -4625,7 +4613,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
-                    VEX, Requires<[HasAVX]>;
+                    VEX, Requires<[UseAVX]>;
 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
@@ -4638,12 +4626,12 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
 // Move Packed Quadword Int to Quadword Int
 //
 let SchedRW = [WriteStore] in {
-def VMOVPQI2QImr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (vector_extract (v2i64 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
                                     IIC_SSE_MOVDQ>, VEX;
-def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (vector_extract (v2i64 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
@@ -4653,25 +4641,24 @@ def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
 //===---------------------------------------------------------------------===//
 // Store / copy lower 64-bits of a XMM register.
 //
-def VMOVLQ128mr : VPDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def VMOVLQ128mr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>, VEX,
                   Sched<[WriteStore]>;
-def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def MOVLQ128mr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)],
                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
 
-let AddedComplexity = 20 in
+let isCodeGenOnly = 1, AddedComplexity = 20 in {
 def VMOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "vmovq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
                        (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
                                                  (loadi64 addr:$src))))))],
                                                  IIC_SSE_MOVDQ>,
-                     XS, VEX, Requires<[HasAVX]>, Sched<[WriteLoad]>;
+                     XS, VEX, Requires<[UseAVX]>, Sched<[WriteLoad]>;
 
-let AddedComplexity = 20 in
 def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                      "movq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst,
@@ -4679,10 +4666,9 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                                                  (loadi64 addr:$src))))))],
                                                  IIC_SSE_MOVDQ>,
                      XS, Requires<[UseSSE2]>, Sched<[WriteLoad]>;
+}
 
-let Predicates = [HasAVX], AddedComplexity = 20 in {
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (VMOVZQI2PQIrm addr:$src)>;
+let Predicates = [UseAVX], AddedComplexity = 20 in {
   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
             (VMOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)),
@@ -4690,8 +4676,6 @@ let Predicates = [HasAVX], AddedComplexity = 20 in {
 }
 
 let Predicates = [UseSSE2], AddedComplexity = 20 in {
-  def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-            (MOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
             (MOVZQI2PQIrm addr:$src)>;
   def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
@@ -4714,7 +4698,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                     IIC_SSE_MOVQ_RR>,
-                      XS, VEX, Requires<[HasAVX]>;
+                      XS, VEX, Requires<[UseAVX]>;
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -4723,14 +4707,14 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                       XS, Requires<[UseSSE2]>;
 } // SchedRW
 
-let SchedRW = [WriteVecLogicLd] in {
+let isCodeGenOnly = 1, SchedRW = [WriteVecLogicLd] in {
 let AddedComplexity = 20 in
 def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl
                                              (loadv2i64 addr:$src))))],
                                              IIC_SSE_MOVDQ>,
-                      XS, VEX, Requires<[HasAVX]>;
+                      XS, VEX, Requires<[UseAVX]>;
 let AddedComplexity = 20 in {
 def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -4739,49 +4723,19 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                                              IIC_SSE_MOVDQ>,
                       XS, Requires<[UseSSE2]>;
 }
-} // SchedRW
+} // isCodeGenOnly, SchedRW
 
 let AddedComplexity = 20 in {
-  let Predicates = [HasAVX] in {
-    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-              (VMOVZPQILo2PQIrm addr:$src)>;
+  let Predicates = [UseAVX] in {
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
               (VMOVZPQILo2PQIrr VR128:$src)>;
   }
   let Predicates = [UseSSE2] in {
-    def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
-              (MOVZPQILo2PQIrm addr:$src)>;
     def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
               (MOVZPQILo2PQIrr VR128:$src)>;
   }
 }
 
-// Instructions to match in the assembler
-let SchedRW = [WriteMove] in {
-def VMOVQs64rr : VPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
-                      "movq\t{$src, $dst|$dst, $src}", [],
-                      IIC_SSE_MOVDQ>, VEX, VEX_W;
-def VMOVQd64rr : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                      "movq\t{$src, $dst|$dst, $src}", [],
-                      IIC_SSE_MOVDQ>, VEX, VEX_W;
-// Recognize "movd" with GR64 destination, but encode as a "movq"
-def VMOVQd64rr_alt : VPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
-                          "movd\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVDQ>, VEX, VEX_W;
-} // SchedRW
-
-// Instructions for the disassembler
-// xr = XMM register
-// xm = mem64
-
-let SchedRW = [WriteMove] in {
-let Predicates = [HasAVX] in
-def VMOVQxrxr: I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                 "vmovq\t{$src, $dst|$dst, $src}", []>, VEX, XS;
-def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
-                 "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, XS;
-} // SchedRW
-
 //===---------------------------------------------------------------------===//
 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
 //===---------------------------------------------------------------------===//
@@ -4800,13 +4754,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
 
 let Predicates = [HasAVX] in {
   defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
   defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                       v4f32, VR128, memopv4f32, f128mem>, VEX;
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
   defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
   defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                 v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
 }
 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                    memopv4f32, f128mem>;
@@ -4816,19 +4770,19 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (X86Movshdup VR128:$src)),
             (VMOVSHDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVSHDUPrm addr:$src)>;
   def : Pat<(v4i32 (X86Movsldup VR128:$src)),
             (VMOVSLDUPrr VR128:$src)>;
-  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
+  def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
             (VMOVSLDUPrm addr:$src)>;
   def : Pat<(v8i32 (X86Movshdup VR256:$src)),
             (VMOVSHDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (memopv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
             (VMOVSHDUPYrm addr:$src)>;
   def : Pat<(v8i32 (X86Movsldup VR256:$src)),
             (VMOVSLDUPYrr VR256:$src)>;
-  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (memopv4i64 addr:$src)))),
+  def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
             (VMOVSLDUPYrm addr:$src)>;
 }
 
@@ -4882,20 +4836,20 @@ let Predicates = [HasAVX] in {
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
 
 let Predicates = [HasAVX] in {
-  def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+  def : Pat<(X86Movddup (loadv2f64 addr:$src)),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-  def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
+  def : Pat<(X86Movddup (bc_v2f64 (loadv4f32 addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-  def : Pat<(X86Movddup (bc_v2f64 (memopv2i64 addr:$src))),
+  def : Pat<(X86Movddup (bc_v2f64 (loadv2i64 addr:$src))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
   def : Pat<(X86Movddup (bc_v2f64
                              (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
             (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
 
   // 256-bit version
-  def : Pat<(X86Movddup (memopv4f64 addr:$src)),
+  def : Pat<(X86Movddup (loadv4f64 addr:$src)),
             (VMOVDDUPYrm addr:$src)>;
-  def : Pat<(X86Movddup (memopv4i64 addr:$src)),
+  def : Pat<(X86Movddup (loadv4i64 addr:$src)),
             (VMOVDDUPYrm addr:$src)>;
   def : Pat<(X86Movddup (v4i64 (scalar_to_vector (loadi64 addr:$src)))),
             (VMOVDDUPYrm addr:$src)>;
@@ -5097,12 +5051,12 @@ multiclass SS3I_unop_rm_int_y<bits<8> opc, string OpcodeStr,
 // Helper fragments to match sext vXi1 to vXiY.
 def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
                                                VR128:$src))>;
-def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i32 15)))>;
-def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i32 31)))>;
+def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
+def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
 def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
                                                VR256:$src))>;
-def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i32 15)))>;
-def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i32 31)))>;
+def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
+def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
 
 let Predicates = [HasAVX] in {
   defm VPABSB  : SS3I_unop_rm_int<0x1C, "vpabsb",
@@ -5258,34 +5212,34 @@ multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
          (IntId256 VR256:$src1,
-          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+          (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V;
   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBD, 0>, VEX_4V;
   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PHADDSUBD, 0>, VEX_4V;
   defm VPSIGNB    : SS3I_binop_rm<0x08, "vpsignb", X86psign, v16i8, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSIGN, 0>, VEX_4V;
   defm VPSIGNW    : SS3I_binop_rm<0x09, "vpsignw", X86psign, v8i16, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSIGN, 0>, VEX_4V;
   defm VPSIGND    : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v4i32, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSIGN, 0>, VEX_4V;
   defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, VR128,
-                                  memopv2i64, i128mem,
+                                  loadv2i64, i128mem,
                                   SSE_PSHUFB, 0>, VEX_4V;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
@@ -5305,28 +5259,28 @@ defm VPMULHRSW    : SS3I_binop_rm_int<0x0B, "vpmulhrsw",
 let ImmT = NoImm, Predicates = [HasAVX2] in {
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNBY   : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNWY   : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNDY   : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
-                                  memopv4i64, i256mem,
+                                  loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                         int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
@@ -5384,7 +5338,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffle]>;
+      [], IIC_SSE_PALIGNRR>, OpSize, Sched<[WriteShuffle]>;
   let mayLoad = 1 in
   def R128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
@@ -5392,7 +5346,7 @@ multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
                   "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-      [], IIC_SSE_PALIGNR>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
+      [], IIC_SSE_PALIGNRM>, OpSize, Sched<[WriteShuffleLd, ReadAfterLd]>;
   }
 }
 
@@ -5472,28 +5426,29 @@ def MWAITrr   : I<0x01, MRM_C9, (outs), (ins), "mwait",
                 TB, Requires<[HasSSE3]>;
 } // SchedRW
 
-def : InstAlias<"mwait %eax, %ecx", (MWAITrr)>, Requires<[In32BitMode]>;
-def : InstAlias<"mwait %rax, %rcx", (MWAITrr)>, Requires<[In64BitMode]>;
+def : InstAlias<"mwait\t{%eax, %ecx|ecx, eax}", (MWAITrr)>, Requires<[In32BitMode]>;
+def : InstAlias<"mwait\t{%rax, %rcx|rcx, rax}", (MWAITrr)>, Requires<[In64BitMode]>;
 
-def : InstAlias<"monitor %eax, %ecx, %edx", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%eax, %ecx, %edx|edx, ecx, eax}", (MONITORrrr)>,
       Requires<[In32BitMode]>;
-def : InstAlias<"monitor %rax, %rcx, %rdx", (MONITORrrr)>,
+def : InstAlias<"monitor\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORrrr)>,
       Requires<[In64BitMode]>;
 
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Packed Move with Sign/Zero Extend
 //===----------------------------------------------------------------------===//
 
-multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
 
   def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set VR128:$dst,
-         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
-       OpSize;
+         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))],
+         itins.rm>, OpSize;
 }
 
 multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
@@ -5504,22 +5459,23 @@ multiclass SS41I_binop_rm_int16_y<bits<8> opc, string OpcodeStr,
 
   def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                  [(set VR256:$dst, (IntId (load addr:$src)))]>, OpSize;
+                  [(set VR256:$dst, (IntId (load addr:$src)))]>,
+                  OpSize;
 }
 
 let Predicates = [HasAVX] in {
-defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw", int_x86_sse41_pmovsxbw>,
-                                     VEX;
-defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd", int_x86_sse41_pmovsxwd>,
-                                     VEX;
-defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq", int_x86_sse41_pmovsxdq>,
-                                     VEX;
-defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw", int_x86_sse41_pmovzxbw>,
-                                     VEX;
-defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd", int_x86_sse41_pmovzxwd>,
-                                     VEX;
-defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
-                                     VEX;
+defm VPMOVSXBW : SS41I_binop_rm_int8<0x20, "vpmovsxbw",
+                                     int_x86_sse41_pmovsxbw>, VEX;
+defm VPMOVSXWD : SS41I_binop_rm_int8<0x23, "vpmovsxwd",
+                                     int_x86_sse41_pmovsxwd>, VEX;
+defm VPMOVSXDQ : SS41I_binop_rm_int8<0x25, "vpmovsxdq",
+                                     int_x86_sse41_pmovsxdq>, VEX;
+defm VPMOVZXBW : SS41I_binop_rm_int8<0x30, "vpmovzxbw",
+                                     int_x86_sse41_pmovzxbw>, VEX;
+defm VPMOVZXWD : SS41I_binop_rm_int8<0x33, "vpmovzxwd",
+                                     int_x86_sse41_pmovzxwd>, VEX;
+defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq",
+                                     int_x86_sse41_pmovzxdq>, VEX;
 }
 
 let Predicates = [HasAVX2] in {
@@ -5537,12 +5493,12 @@ defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
                                         int_x86_avx2_pmovzxdq>, VEX, VEX_L;
 }
 
-defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
-defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
-defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
-defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
-defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
-defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw,                                       SSE_INTALU_ITINS_P>;
+defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd,                                       SSE_INTALU_ITINS_P>;
+defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq,                                       SSE_INTALU_ITINS_P>;
+defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw,                                       SSE_INTALU_ITINS_P>;
+defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd,                                       SSE_INTALU_ITINS_P>;
+defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq,                                       SSE_INTALU_ITINS_P>;
 
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load.
@@ -5640,32 +5596,39 @@ let Predicates = [HasAVX2] in {
               (VPMOVZXDQYrr VR128:$src)>;
     def : Pat<(v8i32 (X86vzmovly (v8i16 VR128:$src))),
               (VPMOVZXWDYrr VR128:$src)>;
+    def : Pat<(v16i16 (X86vzmovly (v16i8 VR128:$src))),
+              (VPMOVZXBWYrr VR128:$src)>;
   }
 
   def : Pat<(v4i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
   def : Pat<(v8i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+  def : Pat<(v16i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
 }
 
 let Predicates = [HasAVX] in {
   def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+  def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
 }
 
 let Predicates = [UseSSE41] in {
   def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
   def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+  def : Pat<(v8i16 (X86vsmovl (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
 }
 
 
-multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
-                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+                 [(set VR128:$dst, (IntId VR128:$src))], itins.rr>, OpSize;
 
   def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
        [(set VR128:$dst,
-         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))],
+         itins.rm>,
           OpSize;
 }
 
@@ -5704,10 +5667,14 @@ defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
                                        int_x86_avx2_pmovzxwq>, VEX, VEX_L;
 }
 
-defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
-defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
-defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
-defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq,
+                                      SSE_INTALU_ITINS_P>;
 
 let Predicates = [HasAVX] in {
   // Common patterns involving scalar load
@@ -5735,7 +5702,8 @@ let Predicates = [UseSSE41] in {
             (PMOVZXWQrm addr:$src)>;
 }
 
-multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                  !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                  [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
@@ -5774,8 +5742,10 @@ defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
 defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
                                        int_x86_avx2_pmovzxbq>, VEX, VEX_L;
 }
-defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
-defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq,
+                                      SSE_INTALU_ITINS_P>;
+defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq,
+                                      SSE_INTALU_ITINS_P>;
 
 let Predicates = [HasAVX2] in {
   def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
@@ -6027,35 +5997,39 @@ let Predicates = [UseSSE41] in {
 
 /// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
 multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
-  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                  (ins VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
-                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                 [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32orGR64:$dst, (X86pextrb (v16i8 VR128:$src1),
+                                         imm:$src2))]>,
                  OpSize;
   let neverHasSideEffects = 1, mayStore = 1 in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
                  (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
-                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  []>, OpSize;
 // FIXME:
 // There's an AssertZext in the way of writing the store pattern
 // (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
 }
 
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX] in
   defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
-  def  VPEXTRBrr64 : SS4AIi8<0x14, MRMDestReg, (outs GR64:$dst),
-         (ins VR128:$src1, i32i8imm:$src2),
-         "vpextrb\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, OpSize, VEX;
-}
 
 defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
 
 
 /// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
 multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+  let isCodeGenOnly = 1, hasSideEffects = 0 in
+  def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
+                   (ins VR128:$src1, i32i8imm:$src2),
+                   !strconcat(OpcodeStr,
+                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                   []>, OpSize;
+
   let neverHasSideEffects = 1, mayStore = 1 in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
                  (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
@@ -6117,31 +6091,28 @@ defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
 
 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
 /// destination
-multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
-  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
+                            OpndItins itins = DEFAULT_ITINS> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
                  (ins VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                 [(set GR32:$dst,
-                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+                 [(set GR32orGR64:$dst,
+                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
+                    itins.rr>,
            OpSize;
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
                  (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
-                          addr:$dst)]>, OpSize;
+                          addr:$dst)], itins.rm>, OpSize;
 }
 
 let ExeDomain = SSEPackedSingle in {
-  let Predicates = [HasAVX] in {
+  let Predicates = [UseAVX] in
     defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
-    def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst),
-                    (ins VR128:$src1, i32i8imm:$src2),
-                    "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                    []>, OpSize, VEX;
-  }
-  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
+  defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
 }
 
 // Also match an EXTRACTPS store when the store is done as f32 instead of i32.
@@ -6162,13 +6133,13 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
 
 multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
-      (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+      (ins VR128:$src1, GR32orGR64:$src2, i32i8imm:$src3),
       !if(Is2Addr,
         !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
         !strconcat(asm,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+        (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>, OpSize;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
       !if(Is2Addr,
@@ -6241,7 +6212,8 @@ let Constraints = "$src1 = $dst" in
 // are optimized inserts that won't zero arbitrary elements in the destination
 // vector. The next one matches the intrinsic and could zero arbitrary elements
 // in the target vector.
-multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
+multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
+                           OpndItins itins = DEFAULT_ITINS> {
   def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
       (ins VR128:$src1, VR128:$src2, u32u8imm:$src3),
       !if(Is2Addr,
@@ -6249,7 +6221,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
         !strconcat(asm,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
       OpSize;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
@@ -6260,14 +6232,14 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> {
       [(set VR128:$dst,
         (X86insrtps VR128:$src1,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
-                    imm:$src3))]>, OpSize;
+                    imm:$src3))], itins.rm>, OpSize;
 }
 
 let ExeDomain = SSEPackedSingle in {
-  let Predicates = [HasAVX] in
+  let Predicates = [UseAVX] in
     defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
   let Constraints = "$src1 = $dst" in
-    defm INSERTPS : SS41I_insertf32<0x21, "insertps">;
+    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -6285,7 +6257,8 @@ let ExeDomain = SSEPackedSingle in {
                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))]>,
+                    [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>,
                     OpSize;
 
   // Vector intrinsic operation, mem
@@ -6294,7 +6267,8 @@ let ExeDomain = SSEPackedSingle in {
                     !strconcat(OpcodeStr,
                     "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
-                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>,
+                          (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_MEM>,
                     OpSize;
 } // ExeDomain = SSEPackedSingle
 
@@ -6304,7 +6278,8 @@ let ExeDomain = SSEPackedDouble in {
                     (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2),
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))]>,
+                    [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
+                    IIC_SSE_ROUNDPS_REG>,
                     OpSize;
 
   // Vector intrinsic operation, mem
@@ -6313,7 +6288,8 @@ let ExeDomain = SSEPackedDouble in {
                     !strconcat(OpcodeStr,
                     "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                     [(set RC:$dst,
-                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>,
+                          (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
+                          IIC_SSE_ROUNDPS_REG>,
                     OpSize;
 } // ExeDomain = SSEPackedDouble
 }
@@ -6397,11 +6373,11 @@ let ExeDomain = GenericDomain in {
 let Predicates = [HasAVX] in {
   // Intrinsic form
   defm VROUND  : sse41_fp_unop_rm<0x08, 0x09, "vround", f128mem, VR128,
-                                  memopv4f32, memopv2f64,
+                                  loadv4f32, loadv2f64,
                                   int_x86_sse41_round_ps,
                                   int_x86_sse41_round_pd>, VEX;
   defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
-                                  memopv8f32, memopv4f64,
+                                  loadv8f32, loadv4f64,
                                   int_x86_avx_round_ps_256,
                                   int_x86_avx_round_pd_256>, VEX, VEX_L;
   defm VROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
@@ -6539,7 +6515,7 @@ def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                 OpSize, VEX;
 def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
-                [(set EFLAGS,(X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
+                [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
                 OpSize, VEX;
 
 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
@@ -6548,7 +6524,7 @@ def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                 OpSize, VEX, VEX_L;
 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
-                [(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
+                [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
                 OpSize, VEX, VEX_L;
 }
 
@@ -6577,13 +6553,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
 
 let Defs = [EFLAGS], Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedSingle in {
-defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem, loadv4f32, v4f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, loadv8f32, v8f32>,
                             VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
-defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem, loadv2f64, v2f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, loadv4f64, v4f64>,
                             VEX_L;
 }
 }
@@ -6595,30 +6571,33 @@ defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
 let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
   def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
                      "popcnt{w}\t{$src, $dst|$dst, $src}",
-                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)]>,
+                     [(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>,
                      OpSize, XS;
   def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
                      "popcnt{w}\t{$src, $dst|$dst, $src}",
                      [(set GR16:$dst, (ctpop (loadi16 addr:$src))),
-                      (implicit EFLAGS)]>, OpSize, XS;
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, OpSize, XS;
 
   def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
                      "popcnt{l}\t{$src, $dst|$dst, $src}",
-                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)]>,
+                     [(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
+                     IIC_SSE_POPCNT_RR>,
                      XS;
   def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
                      "popcnt{l}\t{$src, $dst|$dst, $src}",
                      [(set GR32:$dst, (ctpop (loadi32 addr:$src))),
-                      (implicit EFLAGS)]>, XS;
+                      (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
 
   def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
                       "popcnt{q}\t{$src, $dst|$dst, $src}",
-                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)]>,
+                      [(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
+                      IIC_SSE_POPCNT_RR>,
                       XS;
   def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
                       "popcnt{q}\t{$src, $dst|$dst, $src}",
                       [(set GR64:$dst, (ctpop (loadi64 addr:$src))),
-                       (implicit EFLAGS)]>, XS;
+                       (implicit EFLAGS)], IIC_SSE_POPCNT_RM>, XS;
 }
 
 
@@ -6646,14 +6625,16 @@ defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
 
 /// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
 multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
-                              Intrinsic IntId128, bit Is2Addr = 1> {
+                              Intrinsic IntId128, bit Is2Addr = 1,
+                              OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
        (ins VR128:$src1, VR128:$src2),
        !if(Is2Addr,
            !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, OpSize;
+       [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
+        itins.rr>, OpSize;
   def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
        (ins VR128:$src1, i128mem:$src2),
        !if(Is2Addr,
@@ -6661,7 +6642,8 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
        [(set VR128:$dst,
          (IntId128 VR128:$src1,
-          (bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
+          (bitconvert (memopv2i64 addr:$src2))))],
+          itins.rm>, OpSize;
 }
 
 /// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
@@ -6677,14 +6659,15 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
        [(set VR256:$dst,
          (IntId256 VR256:$src1,
-          (bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
+          (bitconvert (loadv4i64 addr:$src2))))]>, OpSize;
 }
 
 
 /// SS48I_binop_rm - Simple SSE41 binary operator.
 multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
-                          X86MemOperand x86memop, bit Is2Addr = 1> {
+                          X86MemOperand x86memop, bit Is2Addr = 1,
+                          OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
        (ins RC:$src1, RC:$src2),
@@ -6707,21 +6690,21 @@ let Predicates = [HasAVX] in {
   defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
                                                          0>, VEX_4V;
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
-                                  memopv2i64, i128mem, 0>, VEX_4V;
+                                  loadv2i64, i128mem, 0>, VEX_4V;
   defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
                                                          0>, VEX_4V;
 }
@@ -6731,21 +6714,21 @@ let Predicates = [HasAVX2] in {
   defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
                                         int_x86_avx2_packusdw>, VEX_4V, VEX_L;
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
                                         int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
 }
@@ -6754,22 +6737,23 @@ let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in
   defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
   defm PMINSB   : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMINSD   : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMINUD   : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMINUW   : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXSB   : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXSD   : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXUD   : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
-                                 memopv2i64, i128mem>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
-                                 memopv2i64, i128mem>;
-  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq>;
+                                 memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
+  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq,
+                                     1, SSE_INTMUL_ITINS_P>;
 }
 
 let Predicates = [HasAVX] in {
@@ -6787,15 +6771,16 @@ let Predicates = [HasAVX2] in {
 
 let Constraints = "$src1 = $dst" in {
   defm PMULLD  : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
-                                memopv2i64, i128mem>;
+                                memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
   defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
-                                memopv2i64, i128mem>;
+                                memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
 }
 
 /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
 multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                  Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
-                 X86MemOperand x86memop, bit Is2Addr = 1> {
+                 X86MemOperand x86memop, bit Is2Addr = 1,
+                 OpndItins itins = DEFAULT_ITINS> {
   let isCommutable = 1 in
   def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
         (ins RC:$src1, RC:$src2, u32u8imm:$src3),
@@ -6804,7 +6789,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
             !strconcat(OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
-        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+        [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
         OpSize;
   def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
         (ins RC:$src1, x86memop:$src2, u32u8imm:$src3),
@@ -6815,7 +6800,7 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                 "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
         [(set RC:$dst,
           (IntId RC:$src1,
-           (bitconvert (memop_frag addr:$src2)), imm:$src3))]>,
+           (bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
         OpSize;
 }
 
@@ -6823,40 +6808,40 @@ let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     let ExeDomain = SSEPackedSingle in {
     defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
-                                        VR128, memopv4f32, f128mem, 0>, VEX_4V;
+                                        VR128, loadv4f32, f128mem, 0>, VEX_4V;
     defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
-                                    int_x86_avx_blend_ps_256, VR256, memopv8f32,
+                                    int_x86_avx_blend_ps_256, VR256, loadv8f32,
                                     f256mem, 0>, VEX_4V, VEX_L;
     }
     let ExeDomain = SSEPackedDouble in {
     defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
-                                        VR128, memopv2f64, f128mem, 0>, VEX_4V;
+                                        VR128, loadv2f64, f128mem, 0>, VEX_4V;
     defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
-                                     int_x86_avx_blend_pd_256,VR256, memopv4f64,
+                                     int_x86_avx_blend_pd_256,VR256, loadv4f64,
                                      f256mem, 0>, VEX_4V, VEX_L;
     }
   defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
-                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
+                                      VR128, loadv2i64, i128mem, 0>, VEX_4V;
   defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
-                                      VR128, memopv2i64, i128mem, 0>, VEX_4V;
+                                      VR128, loadv2i64, i128mem, 0>, VEX_4V;
   }
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
-                                   VR128, memopv4f32, f128mem, 0>, VEX_4V;
+                                   VR128, loadv4f32, f128mem, 0>, VEX_4V;
   let ExeDomain = SSEPackedDouble in
   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
-                                   VR128, memopv2f64, f128mem, 0>, VEX_4V;
+                                   VR128, loadv2f64, f128mem, 0>, VEX_4V;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
-                                  VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
+                                  VR256, loadv8f32, i256mem, 0>, VEX_4V, VEX_L;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
-                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
-                                  VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  VR256, loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
   }
 }
 
@@ -6864,21 +6849,27 @@ let Constraints = "$src1 = $dst" in {
   let isCommutable = 0 in {
   let ExeDomain = SSEPackedSingle in
   defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
-                                     VR128, memopv4f32, f128mem>;
+                                     VR128, memopv4f32, f128mem,
+                                     1, SSE_INTALU_ITINS_P>;
   let ExeDomain = SSEPackedDouble in
   defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd,
-                                     VR128, memopv2f64, f128mem>;
+                                     VR128, memopv2f64, f128mem,
+                                     1, SSE_INTALU_ITINS_P>;
   defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw,
-                                     VR128, memopv2i64, i128mem>;
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_INTALU_ITINS_P>;
   defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
-                                     VR128, memopv2i64, i128mem>;
+                                     VR128, memopv2i64, i128mem,
+                                     1, SSE_INTMUL_ITINS_P>;
   }
   let ExeDomain = SSEPackedSingle in
   defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
-                                  VR128, memopv4f32, f128mem>;
+                                  VR128, memopv4f32, f128mem, 1,
+                                  SSE_DPPS_ITINS>;
   let ExeDomain = SSEPackedDouble in
   defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
-                                  VR128, memopv2f64, f128mem>;
+                                  VR128, memopv2f64, f128mem, 1,
+                                  SSE_DPPD_ITINS>;
 }
 
 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
@@ -6905,23 +6896,23 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
 let Predicates = [HasAVX] in {
 let ExeDomain = SSEPackedDouble in {
 defm VBLENDVPD  : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
-                                           memopv2f64, int_x86_sse41_blendvpd>;
+                                           loadv2f64, int_x86_sse41_blendvpd>;
 defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
-                                  memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
+                                  loadv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
 } // ExeDomain = SSEPackedDouble
 let ExeDomain = SSEPackedSingle in {
 defm VBLENDVPS  : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
-                                           memopv4f32, int_x86_sse41_blendvps>;
+                                           loadv4f32, int_x86_sse41_blendvps>;
 defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
-                                  memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
+                                  loadv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
 } // ExeDomain = SSEPackedSingle
 defm VPBLENDVB  : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
-                                           memopv2i64, int_x86_sse41_pblendvb>;
+                                           loadv2i64, int_x86_sse41_pblendvb>;
 }
 
 let Predicates = [HasAVX2] in {
 defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
-                                      memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
+                                      loadv4i64, int_x86_avx2_pblendvb>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
@@ -6974,7 +6965,7 @@ let Predicates = [HasAVX] in {
 let Predicates = [HasAVX2] in {
   def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
                             (v32i8 VR256:$src2))),
-            (VPBLENDVBYrr VR256:$src1, VR256:$src2, VR256:$mask)>;
+            (VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
   def : Pat<(v16i16 (X86Blendi (v16i16 VR256:$src1), (v16i16 VR256:$src2),
                                (imm:$mask))),
             (VPBLENDWYrri VR256:$src1, VR256:$src2, imm:$mask)>;
@@ -6983,13 +6974,14 @@ let Predicates = [HasAVX2] in {
 /// SS41I_ternary_int - SSE 4.1 ternary operator
 let Uses = [XMM0], Constraints = "$src1 = $dst" in {
   multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                               X86MemOperand x86memop, Intrinsic IntId> {
+                               X86MemOperand x86memop, Intrinsic IntId,
+                               OpndItins itins = DEFAULT_ITINS> {
     def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr,
                      "\t{$src2, $dst|$dst, $src2}"),
-                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
-                    OpSize;
+                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
+                    itins.rr>, OpSize;
 
     def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, x86memop:$src2),
@@ -6997,7 +6989,8 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
                      "\t{$src2, $dst|$dst, $src2}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
-                       (bitconvert (mem_frag addr:$src2)), XMM0))]>, OpSize;
+                       (bitconvert (mem_frag addr:$src2)), XMM0))],
+                       itins.rm>, OpSize;
   }
 }
 
@@ -7011,17 +7004,17 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                   int_x86_sse41_pblendvb>;
 
 // Aliases with the implicit xmm0 argument
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
+def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
                 (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
 
 let Predicates = [UseSSE41] in {
@@ -7094,11 +7087,11 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
-                                 memopv2i64, i128mem, 0>, VEX_4V;
+                                 loadv2i64, i128mem, 0>, VEX_4V;
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -7258,70 +7251,100 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
 // crc intrinsic instruction
 // This set of instructions are only rm, the only difference is the size
 // of r and m.
+class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
+                   RegisterClass RCIn, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>;
+
+class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
+                   X86MemOperand x86memop, SDPatternOperator Int> :
+  SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
+         !strconcat(asm, "\t{$src2, $src1|$src1, $src2}"),
+         [(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
+         IIC_CRC32_MEM>;
+
 let Constraints = "$src1 = $dst" in {
-  def CRC32r32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
-                      (ins GR32:$src1, i8mem:$src2),
-                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_8 GR32:$src1,
-                         (load addr:$src2)))]>;
-  def CRC32r32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
-                      (ins GR32:$src1, GR8:$src2),
-                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_8 GR32:$src1, GR8:$src2))]>;
-  def CRC32r32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
-                      (ins GR32:$src1, i16mem:$src2),
-                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_16 GR32:$src1,
-                         (load addr:$src2)))]>,
-                         OpSize;
-  def CRC32r32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
-                      (ins GR32:$src1, GR16:$src2),
-                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_16 GR32:$src1, GR16:$src2))]>,
-                         OpSize;
-  def CRC32r32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
-                      (ins GR32:$src1, i32mem:$src2),
-                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_32 GR32:$src1,
-                         (load addr:$src2)))]>;
-  def CRC32r32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
-                      (ins GR32:$src1, GR32:$src2),
-                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR32:$dst,
-                         (int_x86_sse42_crc32_32_32 GR32:$src1, GR32:$src2))]>;
-  def CRC32r64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
-                      (ins GR64:$src1, i8mem:$src2),
-                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_8 GR64:$src1,
-                         (load addr:$src2)))]>,
-                         REX_W;
-  def CRC32r64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
-                      (ins GR64:$src1, GR8:$src2),
-                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_8 GR64:$src1, GR8:$src2))]>,
-                         REX_W;
-  def CRC32r64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
-                      (ins GR64:$src1, i64mem:$src2),
-                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_64 GR64:$src1,
-                         (load addr:$src2)))]>,
-                         REX_W;
-  def CRC32r64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
-                      (ins GR64:$src1, GR64:$src2),
-                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
-                       [(set GR64:$dst,
-                         (int_x86_sse42_crc32_64_64 GR64:$src1, GR64:$src2))]>,
-                         REX_W;
+  def CRC32r32m8  : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32r8  : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
+                                 int_x86_sse42_crc32_32_8>;
+  def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
+                                 int_x86_sse42_crc32_32_16>, OpSize;
+  def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
+                                 int_x86_sse42_crc32_32_16>, OpSize;
+  def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
+                                 int_x86_sse42_crc32_32_32>;
+  def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
+                                 int_x86_sse42_crc32_32_32>;
+  def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
+                                 int_x86_sse42_crc32_64_64>, REX_W;
+  let hasSideEffects = 0 in {
+    let mayLoad = 1 in
+    def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
+                                   null_frag>, REX_W;
+    def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
+                                   null_frag>, REX_W;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SHA-NI Instructions
+//===----------------------------------------------------------------------===//
+
+multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
+                      bit UsesXMM0 = 0> {
+  def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
+             (ins VR128:$src1, VR128:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+
+  def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
+             (ins VR128:$src1, i128mem:$src2),
+             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             [!if(UsesXMM0,
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
+                  (set VR128:$dst, (IntId VR128:$src1,
+                    (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+}
+
+let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
+  def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
+                         (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
+                            (i8 imm:$src3)))]>, TA;
+  def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
+                         (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                         "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                         [(set VR128:$dst,
+                           (int_x86_sha1rnds4 VR128:$src1,
+                            (bc_v4i32 (memopv2i64 addr:$src2)),
+                            (i8 imm:$src3)))]>, TA;
+
+  defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
+  defm SHA1MSG1  : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
+  defm SHA1MSG2  : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+
+  let Uses=[XMM0] in
+  defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+
+  defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
+  defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
 }
 
+// Aliases with explicit %xmm0
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
+def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
+                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+
 //===----------------------------------------------------------------------===//
 // AES-NI Instructions
 //===----------------------------------------------------------------------===//
@@ -7378,7 +7401,7 @@ let Predicates = [HasAVX, HasAES] in {
   def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
-      [(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
+      [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
       OpSize, VEX;
 }
 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
@@ -7405,7 +7428,7 @@ let Predicates = [HasAVX, HasAES] in {
       (ins i128mem:$src1, i8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
-        (int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
+        (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
       OpSize, VEX;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
@@ -7436,7 +7459,7 @@ def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
-                              (memopv2i64 addr:$src2), imm:$src3))]>;
+                              (loadv2i64 addr:$src2), imm:$src3))]>;
 
 // Carry-less Multiplication instructions
 let Constraints = "$src1 = $dst" in {
@@ -7444,13 +7467,15 @@ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
            (ins VR128:$src1, VR128:$src2, i8imm:$src3),
            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst,
-             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>;
+             (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+             IIC_SSE_PCLMULQDQ_RR>;
 
 def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
            "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
            [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
-                              (memopv2i64 addr:$src2), imm:$src3))]>;
+                              (memopv2i64 addr:$src2), imm:$src3))],
+                              IIC_SSE_PCLMULQDQ_RM>;
 } // Constraints = "$src1 = $dst"
 
 
@@ -7581,62 +7606,62 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
 }
 
 let Predicates = [HasAVX] in {
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
 
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
 let Predicates = [HasAVX1Only] in {
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTF128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
 
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
-                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
-                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
-                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTF128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7656,59 +7681,59 @@ def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
 
 // AVX1 patterns
 let Predicates = [HasAVX] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4f32 (VEXTRACTF128rr
                     (v8f32 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2f64 (VEXTRACTF128rr
                     (v4f64 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
 
-def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v4f32 (vextract128_extract:$ext (v8f32 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v2f64 (vextract128_extract:$ext (v4f64 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
 }
 
 let Predicates = [HasAVX1Only] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2i64 (VEXTRACTF128rr
                   (v4i64 VR256:$src1),
-                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4i32 (VEXTRACTF128rr
                   (v8i32 VR256:$src1),
-                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v8i16 (VEXTRACTF128rr
                   (v16i16 VR256:$src1),
-                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v16i8 (VEXTRACTF128rr
                   (v32i8 VR256:$src1),
-                  (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+                  (EXTRACT_get_vextract128_imm VR128:$ext)))>;
 
-def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+def : Pat<(alignedstore (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
                                 (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
                                 (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
                                 (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
                                 (iPTR imm))), addr:$dst),
           (VEXTRACTF128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -7780,15 +7805,15 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
 
 let ExeDomain = SSEPackedSingle in {
   defm VPERMILPS  : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
-                               memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
+                               loadv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
   defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
-                       memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
+                       loadv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
 }
 let ExeDomain = SSEPackedDouble in {
   defm VPERMILPD  : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
-                               memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
+                               loadv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
   defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
-                       memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
+                       loadv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
 }
 
 let Predicates = [HasAVX] in {
@@ -7796,15 +7821,15 @@ def : Pat<(v8i32 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
           (VPERMILPSYri VR256:$src1, imm:$imm)>;
 def : Pat<(v4i64 (X86VPermilp VR256:$src1, (i8 imm:$imm))),
           (VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (memopv4i64 addr:$src1)),
+def : Pat<(v8i32 (X86VPermilp (bc_v8i32 (loadv4i64 addr:$src1)),
                                (i8 imm:$imm))),
           (VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilp (memopv4i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v4i64 (X86VPermilp (loadv4i64 addr:$src1), (i8 imm:$imm))),
           (VPERMILPDYmi addr:$src1, imm:$imm)>;
 
 def : Pat<(v2i64 (X86VPermilp VR128:$src1, (i8 imm:$imm))),
           (VPERMILPDri VR128:$src1, imm:$imm)>;
-def : Pat<(v2i64 (X86VPermilp (memopv2i64 addr:$src1), (i8 imm:$imm))),
+def : Pat<(v2i64 (X86VPermilp (loadv2i64 addr:$src1), (i8 imm:$imm))),
           (VPERMILPDmi addr:$src1, imm:$imm)>;
 }
 
@@ -7820,7 +7845,7 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
 def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
           "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
                              (i8 imm:$src3)))]>, VEX_4V, VEX_L;
 }
 
@@ -7828,7 +7853,7 @@ let Predicates = [HasAVX] in {
 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
-                  (memopv4f64 addr:$src2), (i8 imm:$imm))),
+                  (loadv4f64 addr:$src2), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
@@ -7843,16 +7868,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 
 def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
-                  (bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                  (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
-                  (memopv4i64 addr:$src2), (i8 imm:$imm))),
+                  (loadv4i64 addr:$src2), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
-                  (bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                  (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
-                  (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                  (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
 }
 
@@ -7896,7 +7921,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
                TA, OpSize, VEX;
 }
 
-let Predicates = [HasAVX, HasF16C] in {
+let Predicates = [HasF16C] in {
   defm VCVTPH2PS  : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
   defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
@@ -7930,9 +7955,9 @@ multiclass AVX2_binop_rmi_int<bits<8> opc, string OpcodeStr,
 
 let isCommutable = 0 in {
 defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
-                                   VR128, memopv2i64, i128mem>;
+                                   VR128, loadv2i64, i128mem>;
 defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
-                                    VR256, memopv4i64, i256mem>, VEX_L;
+                                    VR256, loadv4i64, i256mem>, VEX_L;
 }
 
 def : Pat<(v4i32 (X86Blendi (v4i32 VR128:$src1), (v4i32 VR128:$src2),
@@ -8110,9 +8135,9 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                    VEX_4V, VEX_L;
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, v8f32>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          ValueType OpVT> {
@@ -8132,9 +8157,9 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                               (i8 imm:$src2))))]>, VEX, VEX_L;
 }
 
-defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
+defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64>, VEX_W;
 let ExeDomain = SSEPackedDouble in
-defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
+defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64>, VEX_W;
 
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
@@ -8147,7 +8172,7 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
 def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
           (ins VR256:$src1, f256mem:$src2, i8imm:$src3),
           "vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
+          [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
                              (i8 imm:$src3)))]>, VEX_4V, VEX_L;
 
 let Predicates = [HasAVX2] in {
@@ -8158,13 +8183,13 @@ def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
           (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
 
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (memopv4i64 addr:$src2)),
+def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
                   (i8 imm:$imm))),
           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
 def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
-                   (bc_v16i16 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
+                   (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)),
+def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
                   (i8 imm:$imm))),
           (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
 }
@@ -8186,42 +8211,42 @@ def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
 }
 
 let Predicates = [HasAVX2] in {
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
                                    (iPTR imm)),
           (VINSERTI128rr VR256:$src1, VR128:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
 
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+def : Pat<(vinsert128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
-                                   (bc_v4i32 (memopv2i64 addr:$src2)),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v8i32 VR256:$src1),
+                                   (bc_v4i32 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
-                                   (bc_v16i8 (memopv2i64 addr:$src2)),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v32i8 VR256:$src1),
+                                   (bc_v16i8 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
-                                   (bc_v8i16 (memopv2i64 addr:$src2)),
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
+def : Pat<(vinsert128_insert:$ins (v16i16 VR256:$src1),
+                                   (bc_v8i16 (loadv2i64 addr:$src2)),
                                    (iPTR imm)),
           (VINSERTI128rm VR256:$src1, addr:$src2,
-                         (INSERT_get_vinsertf128_imm VR256:$ins))>;
+                         (INSERT_get_vinsert128_imm VR256:$ins))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8240,39 +8265,39 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
           VEX, VEX_L;
 
 let Predicates = [HasAVX2] in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v2i64 (VEXTRACTI128rr
                     (v4i64 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v4i32 (VEXTRACTI128rr
                     (v8i32 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v8i16 (VEXTRACTI128rr
                     (v16i16 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
+def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
           (v16i8 (VEXTRACTI128rr
                     (v32i8 VR256:$src1),
-                    (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+                    (EXTRACT_get_vextract128_imm VR128:$ext)))>;
 
-def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+def : Pat<(store (v2i64 (vextract128_extract:$ext (v4i64 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v4i32 (vextract128_extract:$ext (v8i32 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v8i16 (vextract128_extract:$ext (v16i16 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
-def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
-                                (iPTR imm))), addr:$dst),
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
+def : Pat<(store (v16i8 (vextract128_extract:$ext (v32i8 VR256:$src1),
+                         (iPTR imm))), addr:$dst),
           (VEXTRACTI128mr addr:$dst, VR256:$src1,
-           (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+           (EXTRACT_get_vextract128_imm VR128:$ext))>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8328,7 +8353,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst,
                (vt128 (OpNode VR128:$src1,
-                       (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>,
+                       (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
              VEX_4V;
   def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
              (ins VR256:$src1, VR256:$src2),
@@ -8341,7 +8366,7 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst,
                (vt256 (OpNode VR256:$src1,
-                       (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
+                       (vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
              VEX_4V, VEX_L;
 }
 
@@ -8367,7 +8392,9 @@ multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
             []>, VEX_4VOp3, VEX_L;
 }
 
-let mayLoad = 1, Constraints = "$src1 = $dst, $mask = $mask_wb" in {
+let mayLoad = 1, Constraints
+  = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+  in {
   defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx64mem, vx64mem>, VEX_W;
   defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx64mem, vy64mem>, VEX_W;
   defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx32mem, vy32mem>;
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index 757dcd0..0191c01 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -26,37 +26,37 @@ def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
 
 // 0F 01 DE
 let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|EAX}", []>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
 
 // 0F 01 D8
 let Uses = [EAX] in
 def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
-                "vmrun\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+                "vmrun\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
 let Uses = [RAX] in
 def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
-                "vmrun\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+                "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
 
 // 0F 01 DA
 let Uses = [EAX] in
 def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
-                "vmload\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+                "vmload\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
 let Uses = [RAX] in
 def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
-                "vmload\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+                "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
 
 // 0F 01 DB
 let Uses = [EAX] in
 def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
-                "vmsave\t{%eax|EAX}", []>, TB, Requires<[In32BitMode]>;
+                "vmsave\t{%eax|eax}", []>, TB, Requires<[In32BitMode]>;
 let Uses = [RAX] in
 def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
-                "vmsave\t{%rax|RAX}", []>, TB, Requires<[In64BitMode]>;
+                "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
 
 // 0F 01 DF
 let Uses = [EAX, ECX] in
 def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
-                "invlpga\t{%ecx, %eax|EAX, ECX}", []>, TB, Requires<[In32BitMode]>;
+                "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[In32BitMode]>;
 let Uses = [RAX, ECX] in
 def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
-                "invlpga\t{%ecx, %rax|RAX, ECX}", []>, TB, Requires<[In64BitMode]>;
+                "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
 
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 89c1a68..1937770 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -18,16 +18,16 @@ let Defs = [EFLAGS] in {
 let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
 let Uses = [CL] in {
 def SHL8rCL  : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1),
-                 "shl{b}\t{%cl, $dst|$dst, CL}",
+                 "shl{b}\t{%cl, $dst|$dst, cl}",
                  [(set GR8:$dst, (shl GR8:$src1, CL))], IIC_SR>;
 def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
-                 "shl{w}\t{%cl, $dst|$dst, CL}",
+                 "shl{w}\t{%cl, $dst|$dst, cl}",
                  [(set GR16:$dst, (shl GR16:$src1, CL))], IIC_SR>, OpSize;
 def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
-                 "shl{l}\t{%cl, $dst|$dst, CL}",
+                 "shl{l}\t{%cl, $dst|$dst, cl}",
                  [(set GR32:$dst, (shl GR32:$src1, CL))], IIC_SR>;
 def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
-                  "shl{q}\t{%cl, $dst|$dst, CL}",
+                  "shl{q}\t{%cl, $dst|$dst, cl}",
                   [(set GR64:$dst, (shl GR64:$src1, CL))], IIC_SR>;
 } // Uses = [CL]
 
@@ -70,17 +70,17 @@ let SchedRW = [WriteShiftLd, WriteRMW] in {
 // using CL?
 let Uses = [CL] in {
 def SHL8mCL  : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
-                 "shl{b}\t{%cl, $dst|$dst, CL}",
+                 "shl{b}\t{%cl, $dst|$dst, cl}",
                  [(store (shl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
 def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
-                 "shl{w}\t{%cl, $dst|$dst, CL}",
+                 "shl{w}\t{%cl, $dst|$dst, cl}",
                  [(store (shl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
                  OpSize;
 def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
-                 "shl{l}\t{%cl, $dst|$dst, CL}",
+                 "shl{l}\t{%cl, $dst|$dst, cl}",
                  [(store (shl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
 def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
-                  "shl{q}\t{%cl, $dst|$dst, CL}",
+                  "shl{q}\t{%cl, $dst|$dst, cl}",
                   [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
 }
 def SHL8mi   : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
@@ -124,16 +124,16 @@ def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
 let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
 let Uses = [CL] in {
 def SHR8rCL  : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src1),
-                 "shr{b}\t{%cl, $dst|$dst, CL}",
+                 "shr{b}\t{%cl, $dst|$dst, cl}",
                  [(set GR8:$dst, (srl GR8:$src1, CL))], IIC_SR>;
 def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
-                 "shr{w}\t{%cl, $dst|$dst, CL}",
+                 "shr{w}\t{%cl, $dst|$dst, cl}",
                  [(set GR16:$dst, (srl GR16:$src1, CL))], IIC_SR>, OpSize;
 def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
-                 "shr{l}\t{%cl, $dst|$dst, CL}",
+                 "shr{l}\t{%cl, $dst|$dst, cl}",
                  [(set GR32:$dst, (srl GR32:$src1, CL))], IIC_SR>;
 def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
-                  "shr{q}\t{%cl, $dst|$dst, CL}",
+                  "shr{q}\t{%cl, $dst|$dst, cl}",
                   [(set GR64:$dst, (srl GR64:$src1, CL))], IIC_SR>;
 }
 
@@ -171,17 +171,17 @@ def SHR64r1  : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
 let SchedRW = [WriteShiftLd, WriteRMW] in {
 let Uses = [CL] in {
 def SHR8mCL  : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
-                 "shr{b}\t{%cl, $dst|$dst, CL}",
+                 "shr{b}\t{%cl, $dst|$dst, cl}",
                  [(store (srl (loadi8 addr:$dst), CL), addr:$dst)], IIC_SR>;
 def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
-                 "shr{w}\t{%cl, $dst|$dst, CL}",
+                 "shr{w}\t{%cl, $dst|$dst, cl}",
                  [(store (srl (loadi16 addr:$dst), CL), addr:$dst)], IIC_SR>,
                  OpSize;
 def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
-                 "shr{l}\t{%cl, $dst|$dst, CL}",
+                 "shr{l}\t{%cl, $dst|$dst, cl}",
                  [(store (srl (loadi32 addr:$dst), CL), addr:$dst)], IIC_SR>;
 def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
-                  "shr{q}\t{%cl, $dst|$dst, CL}",
+                  "shr{q}\t{%cl, $dst|$dst, cl}",
                   [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
 }
 def SHR8mi   : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
@@ -224,19 +224,19 @@ def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
 let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
 let Uses = [CL] in {
 def SAR8rCL  : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
-                 "sar{b}\t{%cl, $dst|$dst, CL}",
+                 "sar{b}\t{%cl, $dst|$dst, cl}",
                  [(set GR8:$dst, (sra GR8:$src1, CL))],
                  IIC_SR>;
 def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
-                 "sar{w}\t{%cl, $dst|$dst, CL}",
+                 "sar{w}\t{%cl, $dst|$dst, cl}",
                  [(set GR16:$dst, (sra GR16:$src1, CL))],
                  IIC_SR>, OpSize;
 def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
-                 "sar{l}\t{%cl, $dst|$dst, CL}",
+                 "sar{l}\t{%cl, $dst|$dst, cl}",
                  [(set GR32:$dst, (sra GR32:$src1, CL))],
                  IIC_SR>;
 def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
-                 "sar{q}\t{%cl, $dst|$dst, CL}",
+                 "sar{q}\t{%cl, $dst|$dst, cl}",
                  [(set GR64:$dst, (sra GR64:$src1, CL))],
                  IIC_SR>;
 }
@@ -283,19 +283,19 @@ def SAR64r1  : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
 let SchedRW = [WriteShiftLd, WriteRMW] in {
 let Uses = [CL] in {
 def SAR8mCL  : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
-                 "sar{b}\t{%cl, $dst|$dst, CL}",
+                 "sar{b}\t{%cl, $dst|$dst, cl}",
                  [(store (sra (loadi8 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
 def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
-                 "sar{w}\t{%cl, $dst|$dst, CL}",
+                 "sar{w}\t{%cl, $dst|$dst, cl}",
                  [(store (sra (loadi16 addr:$dst), CL), addr:$dst)],
                  IIC_SR>, OpSize;
 def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), 
-                 "sar{l}\t{%cl, $dst|$dst, CL}",
+                 "sar{l}\t{%cl, $dst|$dst, cl}",
                  [(store (sra (loadi32 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
 def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), 
-                 "sar{q}\t{%cl, $dst|$dst, CL}",
+                 "sar{q}\t{%cl, $dst|$dst, cl}",
                  [(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
 }
@@ -349,7 +349,7 @@ def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
                  "rcl{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
 let Uses = [CL] in
 def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
-                "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
   
 def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
                 "rcl{w}\t$dst", [], IIC_SR>, OpSize;
@@ -357,7 +357,7 @@ def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
                   "rcl{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
 let Uses = [CL] in
 def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src1),
-                 "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+                 "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
 
 def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
                 "rcl{l}\t$dst", [], IIC_SR>;
@@ -365,7 +365,7 @@ def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
                   "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
 let Uses = [CL] in
 def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src1),
-                 "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                 "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 
 
 def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
@@ -374,7 +374,7 @@ def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
                    "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
 let Uses = [CL] in
 def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src1),
-                  "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                  "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 
 
 def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
@@ -383,7 +383,7 @@ def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$cnt),
                  "rcr{b}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
 let Uses = [CL] in
 def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src1),
-                "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
   
 def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
                 "rcr{w}\t$dst", [], IIC_SR>, OpSize;
@@ -391,7 +391,7 @@ def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$cnt),
                   "rcr{w}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize;
 let Uses = [CL] in
 def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src1),
-                 "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+                 "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
 
 def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
                 "rcr{l}\t$dst", [], IIC_SR>;
@@ -399,7 +399,7 @@ def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$cnt),
                   "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
 let Uses = [CL] in
 def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src1),
-                 "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                 "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
                  
 def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
                  "rcr{q}\t$dst", [], IIC_SR>;
@@ -407,7 +407,7 @@ def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$cnt),
                    "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
 let Uses = [CL] in
 def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src1),
-                  "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                  "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 
 } // Constraints = "$src = $dst"
 
@@ -448,22 +448,22 @@ def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
 
 let Uses = [CL] in {
 def RCL8mCL : I<0xD2, MRM2m, (outs), (ins i8mem:$dst),
-                "rcl{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                "rcl{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
-                 "rcl{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+                 "rcl{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
 def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
-                 "rcl{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                 "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
-                  "rcl{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                  "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 
 def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
-                "rcr{b}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
-                 "rcr{w}\t{%cl, $dst|$dst, CL}", [], IIC_SR>, OpSize;
+                 "rcr{w}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize;
 def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
-                 "rcr{l}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                 "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
-                  "rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
+                  "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
 }
 } // SchedRW
 } // hasSideEffects = 0
@@ -472,16 +472,16 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
 // FIXME: provide shorter instructions when imm8 == 1
 let Uses = [CL] in {
 def ROL8rCL  : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
-                 "rol{b}\t{%cl, $dst|$dst, CL}",
+                 "rol{b}\t{%cl, $dst|$dst, cl}",
                  [(set GR8:$dst, (rotl GR8:$src1, CL))], IIC_SR>;
 def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
-                 "rol{w}\t{%cl, $dst|$dst, CL}",
+                 "rol{w}\t{%cl, $dst|$dst, cl}",
                  [(set GR16:$dst, (rotl GR16:$src1, CL))], IIC_SR>, OpSize;
 def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
-                 "rol{l}\t{%cl, $dst|$dst, CL}",
+                 "rol{l}\t{%cl, $dst|$dst, cl}",
                  [(set GR32:$dst, (rotl GR32:$src1, CL))], IIC_SR>;
 def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
-                  "rol{q}\t{%cl, $dst|$dst, CL}",
+                  "rol{q}\t{%cl, $dst|$dst, cl}",
                   [(set GR64:$dst, (rotl GR64:$src1, CL))], IIC_SR>;
 }
 
@@ -525,19 +525,19 @@ def ROL64r1  : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
 let SchedRW = [WriteShiftLd, WriteRMW] in {
 let Uses = [CL] in {
 def ROL8mCL  : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
-                 "rol{b}\t{%cl, $dst|$dst, CL}",
+                 "rol{b}\t{%cl, $dst|$dst, cl}",
                  [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
 def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
-                 "rol{w}\t{%cl, $dst|$dst, CL}",
+                 "rol{w}\t{%cl, $dst|$dst, cl}",
                  [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)],
                  IIC_SR>, OpSize;
 def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
-                 "rol{l}\t{%cl, $dst|$dst, CL}",
+                 "rol{l}\t{%cl, $dst|$dst, cl}",
                  [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
 def ROL64mCL :  RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
-                   "rol{q}\t{%cl, $dst|$dst, %cl}",
+                   "rol{q}\t{%cl, $dst|$dst, cl}",
                    [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
                    IIC_SR>;
 }
@@ -582,16 +582,16 @@ def ROL64m1  : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
 let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
 let Uses = [CL] in {
 def ROR8rCL  : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
-                 "ror{b}\t{%cl, $dst|$dst, CL}",
+                 "ror{b}\t{%cl, $dst|$dst, cl}",
                  [(set GR8:$dst, (rotr GR8:$src1, CL))], IIC_SR>;
 def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
-                 "ror{w}\t{%cl, $dst|$dst, CL}",
+                 "ror{w}\t{%cl, $dst|$dst, cl}",
                  [(set GR16:$dst, (rotr GR16:$src1, CL))], IIC_SR>, OpSize;
 def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
-                 "ror{l}\t{%cl, $dst|$dst, CL}",
+                 "ror{l}\t{%cl, $dst|$dst, cl}",
                  [(set GR32:$dst, (rotr GR32:$src1, CL))], IIC_SR>;
 def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
-                  "ror{q}\t{%cl, $dst|$dst, CL}",
+                  "ror{q}\t{%cl, $dst|$dst, cl}",
                   [(set GR64:$dst, (rotr GR64:$src1, CL))], IIC_SR>;
 }
 
@@ -635,19 +635,19 @@ def ROR64r1  : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
 let SchedRW = [WriteShiftLd, WriteRMW] in {
 let Uses = [CL] in {
 def ROR8mCL  : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
-                 "ror{b}\t{%cl, $dst|$dst, CL}",
+                 "ror{b}\t{%cl, $dst|$dst, cl}",
                  [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
 def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
-                 "ror{w}\t{%cl, $dst|$dst, CL}",
+                 "ror{w}\t{%cl, $dst|$dst, cl}",
                  [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)],
                  IIC_SR>, OpSize;
 def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), 
-                 "ror{l}\t{%cl, $dst|$dst, CL}",
+                 "ror{l}\t{%cl, $dst|$dst, cl}",
                  [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)],
                  IIC_SR>;
 def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), 
-                  "ror{q}\t{%cl, $dst|$dst, CL}",
+                  "ror{q}\t{%cl, $dst|$dst, cl}",
                   [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
                   IIC_SR>;
 }
@@ -699,35 +699,35 @@ let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
 let Uses = [CL] in {
 def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), 
                    (ins GR16:$src1, GR16:$src2),
-                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))],
                     IIC_SHD16_REG_CL>,
                    TB, OpSize;
 def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), 
                    (ins GR16:$src1, GR16:$src2),
-                   "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))],
                     IIC_SHD16_REG_CL>,
                    TB, OpSize;
 def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), 
                    (ins GR32:$src1, GR32:$src2),
-                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))],
                     IIC_SHD32_REG_CL>, TB;
 def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
                    (ins GR32:$src1, GR32:$src2),
-                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))],
                    IIC_SHD32_REG_CL>, TB;
 def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), 
                     (ins GR64:$src1, GR64:$src2),
-                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                     [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))],
                     IIC_SHD64_REG_CL>, 
                     TB;
 def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), 
                     (ins GR64:$src1, GR64:$src2),
-                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                     [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))],
                     IIC_SHD64_REG_CL>, 
                     TB;
@@ -782,29 +782,29 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
 let SchedRW = [WriteShiftLd, WriteRMW] in {
 let Uses = [CL] in {
 def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
-                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
                      addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
 def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
-                  "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                  "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                   [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
                     addr:$dst)], IIC_SHD16_MEM_CL>, TB, OpSize;
 
 def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
-                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                    [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
                      addr:$dst)], IIC_SHD32_MEM_CL>, TB;
 def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
-                  "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                  "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                   [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
                     addr:$dst)], IIC_SHD32_MEM_CL>, TB;
                     
 def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
-                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                     [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
                       addr:$dst)], IIC_SHD64_MEM_CL>, TB;
 def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
-                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
                     [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
                       addr:$dst)], IIC_SHD64_MEM_CL>, TB;
 }
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index bab3cdd..2196dc3 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -77,43 +77,43 @@ def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iretq", [], IIC_IRET>,
 let SchedRW = [WriteSystem] in {
 let Defs = [AL], Uses = [DX] in
 def IN8rr  : I<0xEC, RawFrm, (outs), (ins),
-               "in{b}\t{%dx, %al|AL, DX}", [], IIC_IN_RR>;
+               "in{b}\t{%dx, %al|al, dx}", [], IIC_IN_RR>;
 let Defs = [AX], Uses = [DX] in
 def IN16rr : I<0xED, RawFrm, (outs), (ins),
-               "in{w}\t{%dx, %ax|AX, DX}", [], IIC_IN_RR>,  OpSize;
+               "in{w}\t{%dx, %ax|ax, dx}", [], IIC_IN_RR>,  OpSize;
 let Defs = [EAX], Uses = [DX] in
 def IN32rr : I<0xED, RawFrm, (outs), (ins),
-               "in{l}\t{%dx, %eax|EAX, DX}", [], IIC_IN_RR>;
+               "in{l}\t{%dx, %eax|eax, dx}", [], IIC_IN_RR>;
 
 let Defs = [AL] in
 def IN8ri  : Ii8<0xE4, RawFrm, (outs), (ins i8imm:$port),
-                  "in{b}\t{$port, %al|AL, $port}", [], IIC_IN_RI>;
+                  "in{b}\t{$port, %al|al, $port}", [], IIC_IN_RI>;
 let Defs = [AX] in
 def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
-                  "in{w}\t{$port, %ax|AX, $port}", [], IIC_IN_RI>, OpSize;
+                  "in{w}\t{$port, %ax|ax, $port}", [], IIC_IN_RI>, OpSize;
 let Defs = [EAX] in
 def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i8imm:$port),
-                  "in{l}\t{$port, %eax|EAX, $port}", [], IIC_IN_RI>;
+                  "in{l}\t{$port, %eax|eax, $port}", [], IIC_IN_RI>;
 
 let Uses = [DX, AL] in
 def OUT8rr  : I<0xEE, RawFrm, (outs), (ins),
-                "out{b}\t{%al, %dx|DX, AL}", [], IIC_OUT_RR>;
+                "out{b}\t{%al, %dx|dx, al}", [], IIC_OUT_RR>;
 let Uses = [DX, AX] in
 def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
-                "out{w}\t{%ax, %dx|DX, AX}", [], IIC_OUT_RR>, OpSize;
+                "out{w}\t{%ax, %dx|dx, ax}", [], IIC_OUT_RR>, OpSize;
 let Uses = [DX, EAX] in
 def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
-                "out{l}\t{%eax, %dx|DX, EAX}", [], IIC_OUT_RR>;
+                "out{l}\t{%eax, %dx|dx, eax}", [], IIC_OUT_RR>;
 
 let Uses = [AL] in
 def OUT8ir  : Ii8<0xE6, RawFrm, (outs), (ins i8imm:$port),
-                   "out{b}\t{%al, $port|$port, AL}", [], IIC_OUT_IR>;
+                   "out{b}\t{%al, $port|$port, al}", [], IIC_OUT_IR>;
 let Uses = [AX] in
 def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
-                   "out{w}\t{%ax, $port|$port, AX}", [], IIC_OUT_IR>, OpSize;
+                   "out{w}\t{%ax, $port|$port, ax}", [], IIC_OUT_IR>, OpSize;
 let Uses = [EAX] in
 def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i8imm:$port),
-                   "out{l}\t{%eax, $port|$port, EAX}", [], IIC_OUT_IR>;
+                   "out{l}\t{%eax, $port|$port, eax}", [], IIC_OUT_IR>;
 
 def IN8  : I<0x6C, RawFrm, (outs), (ins), "ins{b}", [], IIC_INS>;
 def IN16 : I<0x6D, RawFrm, (outs), (ins), "ins{w}", [], IIC_INS>,  OpSize;
@@ -248,75 +248,75 @@ def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
              "ltr{w}\t$src", [], IIC_LTR>, TB;
              
 def PUSHCS16 : I<0x0E, RawFrm, (outs), (ins),
-                 "push{w}\t{%cs|CS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+                 "push{w}\t{%cs|cs}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
                OpSize;
 def PUSHCS32 : I<0x0E, RawFrm, (outs), (ins),
-                 "push{l}\t{%cs|CS}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
+                 "push{l}\t{%cs|cs}", [], IIC_PUSH_CS>, Requires<[In32BitMode]>;
 def PUSHSS16 : I<0x16, RawFrm, (outs), (ins),
-                 "push{w}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+                 "push{w}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
                OpSize;
 def PUSHSS32 : I<0x16, RawFrm, (outs), (ins),
-                 "push{l}\t{%ss|SS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+                 "push{l}\t{%ss|ss}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
 def PUSHDS16 : I<0x1E, RawFrm, (outs), (ins),
-                 "push{w}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+                 "push{w}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
                OpSize;
 def PUSHDS32 : I<0x1E, RawFrm, (outs), (ins),
-                 "push{l}\t{%ds|DS}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+                 "push{l}\t{%ds|ds}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
 def PUSHES16 : I<0x06, RawFrm, (outs), (ins),
-                 "push{w}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
+                 "push{w}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>,
                OpSize;
 def PUSHES32 : I<0x06, RawFrm, (outs), (ins),
-                 "push{l}\t{%es|ES}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
+                 "push{l}\t{%es|es}", [], IIC_PUSH_SR>, Requires<[In32BitMode]>;
                  
 def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
-                 "push{w}\t{%fs|FS}", [], IIC_PUSH_SR>, OpSize, TB;
+                 "push{w}\t{%fs|fs}", [], IIC_PUSH_SR>, OpSize, TB;
 def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
-                 "push{l}\t{%fs|FS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+                 "push{l}\t{%fs|fs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
 def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
-                 "push{w}\t{%gs|GS}", [], IIC_PUSH_SR>, OpSize, TB;
+                 "push{w}\t{%gs|gs}", [], IIC_PUSH_SR>, OpSize, TB;
 def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
-                 "push{l}\t{%gs|GS}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
+                 "push{l}\t{%gs|gs}", [], IIC_PUSH_SR>, TB, Requires<[In32BitMode]>;
 
 def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
-                 "push{q}\t{%fs|FS}", [], IIC_PUSH_SR>, TB;
+                 "push{q}\t{%fs|fs}", [], IIC_PUSH_SR>, TB;
 def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
-                 "push{q}\t{%gs|GS}", [], IIC_PUSH_SR>, TB;
+                 "push{q}\t{%gs|gs}", [], IIC_PUSH_SR>, TB;
 
 // No "pop cs" instruction.
 def POPSS16 : I<0x17, RawFrm, (outs), (ins),
-                "pop{w}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+                "pop{w}\t{%ss|ss}", [], IIC_POP_SR_SS>,
               OpSize, Requires<[In32BitMode]>;
 def POPSS32 : I<0x17, RawFrm, (outs), (ins),
-                "pop{l}\t{%ss|SS}", [], IIC_POP_SR_SS>,
+                "pop{l}\t{%ss|ss}", [], IIC_POP_SR_SS>,
                       Requires<[In32BitMode]>;
                 
 def POPDS16 : I<0x1F, RawFrm, (outs), (ins),
-                "pop{w}\t{%ds|DS}", [], IIC_POP_SR>,
+                "pop{w}\t{%ds|ds}", [], IIC_POP_SR>,
               OpSize, Requires<[In32BitMode]>;
 def POPDS32 : I<0x1F, RawFrm, (outs), (ins),
-                "pop{l}\t{%ds|DS}", [], IIC_POP_SR>,
+                "pop{l}\t{%ds|ds}", [], IIC_POP_SR>,
                       Requires<[In32BitMode]>;
                 
 def POPES16 : I<0x07, RawFrm, (outs), (ins),
-                "pop{w}\t{%es|ES}", [], IIC_POP_SR>,
+                "pop{w}\t{%es|es}", [], IIC_POP_SR>,
               OpSize, Requires<[In32BitMode]>;
 def POPES32 : I<0x07, RawFrm, (outs), (ins),
-                "pop{l}\t{%es|ES}", [], IIC_POP_SR>,
+                "pop{l}\t{%es|es}", [], IIC_POP_SR>,
                       Requires<[In32BitMode]>;
                 
 def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
-                "pop{w}\t{%fs|FS}", [], IIC_POP_SR>, OpSize, TB;
+                "pop{w}\t{%fs|fs}", [], IIC_POP_SR>, OpSize, TB;
 def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
-                "pop{l}\t{%fs|FS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+                "pop{l}\t{%fs|fs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
 def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
-                "pop{q}\t{%fs|FS}", [], IIC_POP_SR>, TB;
+                "pop{q}\t{%fs|fs}", [], IIC_POP_SR>, TB;
                 
 def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
-                "pop{w}\t{%gs|GS}", [], IIC_POP_SR>, OpSize, TB;
+                "pop{w}\t{%gs|gs}", [], IIC_POP_SR>, OpSize, TB;
 def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
-                "pop{l}\t{%gs|GS}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
+                "pop{l}\t{%gs|gs}", [], IIC_POP_SR>, TB, Requires<[In32BitMode]>;
 def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
-                "pop{q}\t{%gs|GS}", [], IIC_POP_SR>, TB;
+                "pop{q}\t{%gs|gs}", [], IIC_POP_SR>, TB;
                  
 
 def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 363a190..59a6f1e 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -37,3 +37,10 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins),
 def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
                  "xabort\t$imm",
                  [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+
+// HLE prefixes
+
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
+
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 2aa08fa..2b6ee5c 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -20,23 +20,21 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
            [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
-  defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
-  defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
-  defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
-  defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
-  defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
-  defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
-  defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
-  defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
-  defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
-  defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
-  defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
-  defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
-  defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
-  defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
-}
+defm VPHSUBWD  : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, memopv2i64>;
+defm VPHSUBDQ  : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, memopv2i64>;
+defm VPHSUBBW  : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, memopv2i64>;
+defm VPHADDWQ  : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, memopv2i64>;
+defm VPHADDWD  : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, memopv2i64>;
+defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, memopv2i64>;
+defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, memopv2i64>;
+defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, memopv2i64>;
+defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, memopv2i64>;
+defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, memopv2i64>;
+defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, memopv2i64>;
+defm VPHADDDQ  : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, memopv2i64>;
+defm VPHADDBW  : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, memopv2i64>;
+defm VPHADDBQ  : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, memopv2i64>;
+defm VPHADDBD  : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, memopv2i64>;
 
 // Scalar load 2 addr operand instructions
 multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
@@ -49,12 +47,10 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VFRCZSS   : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
-                   ssmem, sse_load_f32>;
-  defm VFRCZSD   : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
-                   sdmem, sse_load_f64>;
-}
+defm VFRCZSS   : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss,
+                 ssmem, sse_load_f32>;
+defm VFRCZSD   : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd,
+                 sdmem, sse_load_f64>;
 
 multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
                      PatFrag memop> {
@@ -66,10 +62,8 @@ multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
-  defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
-}
+defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, memopv4f32>;
+defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, memopv2f64>;
 
 multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
                      PatFrag memop> {
@@ -81,12 +75,8 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
            [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256,
-                           memopv8f32>;
-  defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256,
-                           memopv4f64>;
-}
+defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, memopv8f32>;
+defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, memopv4f64>;
 
 multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
@@ -107,20 +97,18 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
              VEX_4VOp3;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
-  defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
-  defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
-  defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
-  defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
-  defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
-  defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
-  defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
-  defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
-  defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
-  defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
-  defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
-}
+defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>;
+defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>;
+defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>;
+defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>;
+defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>;
+defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>;
+defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>;
+defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>;
+defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>;
+defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>;
+defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>;
+defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>;
 
 multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def ri : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
@@ -134,12 +122,10 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
              (Int (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>, VEX;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
-  defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
-  defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
-  defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
-}
+defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>;
+defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>;
+defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>;
+defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>;
 
 // Instruction where second source can be memory, but third must be register
 multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -158,20 +144,18 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
               VR128:$src3))]>, VEX_4V, VEX_I8IMM;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
-  defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
-  defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
-  defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
-  defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
-  defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
-  defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
-  defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
-  defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
-  defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
-  defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
-  defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
-}
+defm VPMADCSWD  : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>;
+defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>;
+defm VPMACSWW   : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>;
+defm VPMACSWD   : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>;
+defm VPMACSSWW  : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>;
+defm VPMACSSWD  : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>;
+defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>;
+defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>;
+defm VPMACSSDD  : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>;
+defm VPMACSDQL  : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>;
+defm VPMACSDQH  : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>;
+defm VPMACSDD   : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>;
 
 // Instruction where second source can be memory, third must be imm8
 multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -190,16 +174,14 @@ multiclass xop4opimm<bits<8> opc, string OpcodeStr, Intrinsic Int> {
               imm:$src3))]>, VEX_4V;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPCOMB  : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
-  defm VPCOMW  : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
-  defm VPCOMD  : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
-  defm VPCOMQ  : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
-  defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
-  defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
-  defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
-  defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
-}
+defm VPCOMB  : xop4opimm<0xCC, "vpcomb", int_x86_xop_vpcomb>;
+defm VPCOMW  : xop4opimm<0xCD, "vpcomw", int_x86_xop_vpcomw>;
+defm VPCOMD  : xop4opimm<0xCE, "vpcomd", int_x86_xop_vpcomd>;
+defm VPCOMQ  : xop4opimm<0xCF, "vpcomq", int_x86_xop_vpcomq>;
+defm VPCOMUB : xop4opimm<0xEC, "vpcomub", int_x86_xop_vpcomub>;
+defm VPCOMUW : xop4opimm<0xED, "vpcomuw", int_x86_xop_vpcomuw>;
+defm VPCOMUD : xop4opimm<0xEE, "vpcomud", int_x86_xop_vpcomud>;
+defm VPCOMUQ : xop4opimm<0xEF, "vpcomuq", int_x86_xop_vpcomuq>;
 
 // Instruction where either second or third source can be memory
 multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
@@ -227,10 +209,8 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            VEX_4V, VEX_I8IMM;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
-  defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
-}
+defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>;
+defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>;
 
 multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
   def rrY : IXOPi8<opc, MRMSrcReg, (outs VR256:$dst),
@@ -257,9 +237,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
            VEX_4V, VEX_I8IMM, VEX_L;
 }
 
-let isAsmParserOnly = 1 in {
-  defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
-}
+defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>;
 
 multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
                   Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> {
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 44d8cce..e99f2d9 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -127,7 +127,7 @@ extern "C" {
     "movaps  %xmm6, 96(%rsp)\n"
     "movaps  %xmm7, 112(%rsp)\n"
     // JIT callee
-#ifdef _WIN64
+#if defined(_WIN64) || defined(__CYGWIN__)
     "subq    $32, %rsp\n"
     "movq    %rbp, %rcx\n"    // Pass prev frame and return address
     "movq    8(%rbp), %rdx\n"
@@ -339,6 +339,7 @@ extern "C" {
 /// must locate the start of the stub or call site and pass it into the JIT
 /// compiler function.
 extern "C" {
+LLVM_ATTRIBUTE_USED // Referenced from inline asm.
 LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
                                                          intptr_t RetAddr) {
   intptr_t *RetAddrLoc = &StackPtr[1];
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index a8a9fd8..6649c82 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -17,6 +17,7 @@
 #include "X86COFFMachineModuleInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -34,14 +35,12 @@ namespace {
 /// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
 class X86MCInstLower {
   MCContext &Ctx;
-  Mangler *Mang;
   const MachineFunction &MF;
   const TargetMachine &TM;
   const MCAsmInfo &MAI;
   X86AsmPrinter &AsmPrinter;
 public:
-  X86MCInstLower(Mangler *mang, const MachineFunction &MF,
-                 X86AsmPrinter &asmprinter);
+  X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
 
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
 
@@ -50,13 +49,16 @@ public:
 
 private:
   MachineModuleInfoMachO &getMachOMMI() const;
+  Mangler *getMang() const {
+    return AsmPrinter.Mang;
+  }
 };
 
 } // end anonymous namespace
 
-X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf,
+X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
                                X86AsmPrinter &asmprinter)
-: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()),
+: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()),
   MAI(*TM.getMCAsmInfo()), AsmPrinter(asmprinter) {}
 
 MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
@@ -81,7 +83,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
         MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
       isImplicitlyPrivate = true;
 
-    Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+    getMang()->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
   } else if (MO.isSymbol()) {
     Name += MAI.getGlobalPrefix();
     Name += MO.getSymbolName();
@@ -110,7 +112,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
-        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
                     !MO.getGlobal()->hasInternalLinkage());
     }
     return Sym;
@@ -124,7 +126,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
-        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
                     !MO.getGlobal()->hasInternalLinkage());
     }
     return Sym;
@@ -140,7 +142,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
     if (MO.isGlobal()) {
       StubSym =
         MachineModuleInfoImpl::
-        StubValueTy(Mang->getSymbol(MO.getGlobal()),
+        StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
                     !MO.getGlobal()->hasInternalLinkage());
     } else {
       Name.erase(Name.end()-5, Name.end());
@@ -225,32 +227,6 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
 }
 
 
-
-static void lower_subreg32(MCInst *MI, unsigned OpNo) {
-  // Convert registers in the addr mode according to subreg32.
-  unsigned Reg = MI->getOperand(OpNo).getReg();
-  if (Reg != 0)
-    MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
-}
-
-static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
-  // Convert registers in the addr mode according to subreg64.
-  for (unsigned i = 0; i != 4; ++i) {
-    if (!MI->getOperand(OpNo+i).isReg()) continue;
-
-    unsigned Reg = MI->getOperand(OpNo+i).getReg();
-    // LEAs can use RIP-relative addressing, and RIP has no sub/super register.
-    if (Reg == 0 || Reg == X86::RIP) continue;
-
-    MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
-  }
-}
-
-/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8.
-static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) {
-  OutMI.setOpcode(NewOpc);
-  lower_subreg32(&OutMI, 0);
-}
 /// LowerUnaryToTwoAddr - R = setb   -> R = sbb R, R
 static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
   OutMI.setOpcode(NewOpc);
@@ -280,6 +256,34 @@ static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
   Inst.addOperand(Saved);
 }
 
+/// \brief If a movsx instruction has a shorter encoding for the used register
+/// simplify the instruction to use it instead.
+static void SimplifyMOVSX(MCInst &Inst) {
+  unsigned NewOpcode = 0;
+  unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
+  switch (Inst.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected instruction!");
+  case X86::MOVSX16rr8:  // movsbw %al, %ax   --> cbtw
+    if (Op0 == X86::AX && Op1 == X86::AL)
+      NewOpcode = X86::CBW;
+    break;
+  case X86::MOVSX32rr16: // movswl %ax, %eax  --> cwtl
+    if (Op0 == X86::EAX && Op1 == X86::AX)
+      NewOpcode = X86::CWDE;
+    break;
+  case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
+    if (Op0 == X86::RAX && Op1 == X86::EAX)
+      NewOpcode = X86::CDQE;
+    break;
+  }
+
+  if (NewOpcode != 0) {
+    Inst = MCInst();
+    Inst.setOpcode(NewOpcode);
+  }
+}
+
 /// \brief Simplify things like MOV32rm to MOV32o32a.
 static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
                                   unsigned Opcode) {
@@ -376,9 +380,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   // Handle a few special cases to eliminate operand modifiers.
 ReSimplify:
   switch (OutMI.getOpcode()) {
-  case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
-    lower_lea64_32mem(&OutMI, 1);
-    // FALL THROUGH.
+  case X86::LEA64_32r:
   case X86::LEA64r:
   case X86::LEA16r:
   case X86::LEA32r:
@@ -388,23 +390,10 @@ ReSimplify:
     assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
            "LEA has segment specified!");
     break;
-  case X86::MOVZX64rr32:  LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
-  case X86::MOVZX64rm32:  LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
-  case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
-  case X86::MOVZX64rr8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
-  case X86::MOVZX64rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
-  case X86::MOVZX64rr16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
-  case X86::MOVZX64rm16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
-  case X86::MOV8r0:       LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
   case X86::MOV32r0:      LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
 
-  case X86::MOV16r0:
-    LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
-    LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
-    break;
-  case X86::MOV64r0:
-    LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV64r0 -> MOV32r0
-    LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+  case X86::MOV32ri64:
+    OutMI.setOpcode(X86::MOV32ri);
     break;
 
   // Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
@@ -598,16 +587,11 @@ ReSimplify:
   case X86::XOR32ri:    SimplifyShortImmForm(OutMI, X86::XOR32i32);  break;
   case X86::XOR64ri32:  SimplifyShortImmForm(OutMI, X86::XOR64i32);  break;
 
-  case X86::MORESTACK_RET:
-    OutMI.setOpcode(X86::RET);
-    break;
-
-  case X86::MORESTACK_RET_RESTORE_R10:
-    OutMI.setOpcode(X86::MOV64rr);
-    OutMI.addOperand(MCOperand::CreateReg(X86::R10));
-    OutMI.addOperand(MCOperand::CreateReg(X86::RAX));
-
-    AsmPrinter.OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+  // Try to shrink some forms of movsx.
+  case X86::MOVSX16rr8:
+  case X86::MOVSX32rr16:
+  case X86::MOVSX64rr32:
+    SimplifyMOVSX(OutMI);
     break;
   }
 }
@@ -691,17 +675,143 @@ static void LowerTlsAddr(MCStreamer &OutStreamer,
     .addExpr(tlsRef));
 }
 
+static std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+parseMemoryOperand(StackMaps::Location::LocationType LocTy, unsigned Size,
+                   MachineInstr::const_mop_iterator MOI,
+                   MachineInstr::const_mop_iterator MOE) {
+
+  typedef StackMaps::Location Location;
+
+  assert(std::distance(MOI, MOE) >= 5 && "Too few operands to encode mem op.");
+
+  const MachineOperand &Base = *MOI;
+  const MachineOperand &Scale = *(++MOI);
+  const MachineOperand &Index = *(++MOI);
+  const MachineOperand &Disp = *(++MOI);
+  const MachineOperand &ZeroReg = *(++MOI);
+
+  // Sanity check for supported operand format.
+  assert(Base.isReg() &&
+         Scale.isImm() && Scale.getImm() == 1 &&
+         Index.isReg() && Index.getReg() == 0 &&
+         Disp.isImm() && ZeroReg.isReg() && (ZeroReg.getReg() == 0) &&
+         "Unsupported x86 memory operand sequence.");
+  (void)Scale;
+  (void)Index;
+  (void)ZeroReg;
+
+  return std::make_pair(
+    Location(LocTy, Size, Base.getReg(), Disp.getImm()), ++MOI);
+}
+
+std::pair<StackMaps::Location, MachineInstr::const_mop_iterator>
+X86AsmPrinter::stackmapOperandParser(MachineInstr::const_mop_iterator MOI,
+                                     MachineInstr::const_mop_iterator MOE,
+                                     const TargetMachine &TM) {
+
+  typedef StackMaps::Location Location;
+
+  const MachineOperand &MOP = *MOI;
+  assert(!MOP.isRegMask() && (!MOP.isReg() || !MOP.isImplicit()) &&
+         "Register mask and implicit operands should not be processed.");
+
+  if (MOP.isImm()) {
+    // Verify anyregcc
+    // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+
+    switch (MOP.getImm()) {
+    default: llvm_unreachable("Unrecognized operand type.");
+    case StackMaps::DirectMemRefOp: {
+      unsigned Size = TM.getDataLayout()->getPointerSizeInBits();
+      assert((Size % 8) == 0 && "Need pointer size in bytes.");
+      Size /= 8;
+      return parseMemoryOperand(StackMaps::Location::Direct, Size,
+                                llvm::next(MOI), MOE);
+    }
+    case StackMaps::IndirectMemRefOp: {
+      ++MOI;
+      int64_t Size = MOI->getImm();
+      assert(Size > 0 && "Need a valid size for indirect memory locations.");
+      return parseMemoryOperand(StackMaps::Location::Indirect, Size,
+                                llvm::next(MOI), MOE);
+    }
+    case StackMaps::ConstantOp: {
+      ++MOI;
+      assert(MOI->isImm() && "Expected constant operand.");
+      int64_t Imm = MOI->getImm();
+      return std::make_pair(
+        Location(Location::Constant, sizeof(int64_t), 0, Imm), ++MOI);
+    }
+    }
+  }
+
+  // Otherwise this is a reg operand. The physical register number will
+  // ultimately be encoded as a DWARF regno. The stack map also records the size
+  // of a spill slot that can hold the register content. (The runtime can
+  // track the actual size of the data type if it needs to.)
+  assert(MOP.isReg() && "Expected register operand here.");
+  assert(TargetRegisterInfo::isPhysicalRegister(MOP.getReg()) &&
+         "Virtreg operands should have been rewritten before now.");
+  const TargetRegisterClass *RC =
+    TM.getRegisterInfo()->getMinimalPhysRegClass(MOP.getReg());
+  assert(!MOP.getSubReg() && "Physical subreg still around.");
+  return std::make_pair(
+    Location(Location::Register, RC->getSize(), MOP.getReg(), 0), ++MOI);
+}
+
+// Lower a stackmap of the form:
+// <id>, <shadowBytes>, ...
+static void LowerSTACKMAP(MCStreamer &OutStreamer,
+                          StackMaps &SM,
+                          const MachineInstr &MI)
+{
+  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+  SM.recordStackMap(MI);
+  // Emit padding.
+  // FIXME: These nops ensure that the stackmap's shadow is covered by
+  // instructions from the same basic block, but the nops should not be
+  // necessary if instructions from the same block follow the stackmap.
+  for (unsigned i = 0; i < NumNOPBytes; ++i)
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
+static void LowerPATCHPOINT(MCStreamer &OutStreamer,
+                            StackMaps &SM,
+                            const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers opers(&MI);
+  unsigned ScratchIdx = opers.getNextScratchIdx();
+  unsigned EncodedBytes = 0;
+  int64_t CallTarget = opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+  if (CallTarget) {
+    // Emit MOV to materialize the target address and the CALL to target.
+    // This is encoded with 12-13 bytes, depending on which register is used.
+    // We conservatively assume that it is 12 bytes and emit in worst case one
+    // extra NOP byte.
+    EncodedBytes = 12;
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64ri)
+                                .addReg(MI.getOperand(ScratchIdx).getReg())
+                                .addImm(CallTarget));
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::CALL64r)
+                                .addReg(MI.getOperand(ScratchIdx).getReg()));
+  }
+  // Emit padding.
+  unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+
+  for (unsigned i = EncodedBytes; i < NumBytes; ++i)
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP));
+}
+
 void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  X86MCInstLower MCInstLowering(Mang, *MF, *this);
+  X86MCInstLower MCInstLowering(*MF, *this);
   switch (MI->getOpcode()) {
   case TargetOpcode::DBG_VALUE:
-    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
-      std::string TmpStr;
-      raw_string_ostream OS(TmpStr);
-      PrintDebugValueComment(MI, OS);
-      OutStreamer.EmitRawText(StringRef(OS.str()));
-    }
-    return;
+    llvm_unreachable("Should be handled target independently");
 
   // Emit nothing here but a comment if we can.
   case X86::Int_MemBarrier:
@@ -786,6 +896,24 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
       .addExpr(DotExpr));
     return;
   }
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(OutStreamer, SM, *MI);
+
+  case X86::MORESTACK_RET:
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+    return;
+
+  case X86::MORESTACK_RET_RESTORE_R10:
+    // Return, then restore R10.
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::RET));
+    OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64rr)
+      .addReg(X86::R10)
+      .addReg(X86::RAX));
+    return;
   }
 
   MCInst TmpInst;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 16886e4..dbda556 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -54,15 +54,14 @@ static cl::opt<bool>
 EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
           cl::desc("Enable use of a base pointer for complex stack frames"));
 
-X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
-                                 const TargetInstrInfo &tii)
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm)
   : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
                          ? X86::RIP : X86::EIP),
                        X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
                        X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
                        (tm.getSubtarget<X86Subtarget>().is64Bit()
                          ? X86::RIP : X86::EIP)),
-                       TM(tm), TII(tii) {
+                       TM(tm) {
   X86_MC::InitLLVM2SEHRegisterMapping(this);
 
   // Cache some information.
@@ -102,8 +101,8 @@ int X86RegisterInfo::getCompactUnwindRegNum(unsigned RegNum, bool isEH) const {
 
 bool
 X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
-  // Only enable when post-RA scheduling is enabled and this is needed.
-  return TM.getSubtargetImpl()->postRAScheduler();
+  // ExeDepsFixer and PostRAScheduler require liveness.
+  return true;
 }
 
 int
@@ -240,8 +239,18 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   case CallingConv::HiPE:
     return CSR_NoRegs_SaveList;
 
+  case CallingConv::WebKit_JS:
+    return CSR_64_SaveList;
+  case CallingConv::AnyReg:
+    return CSR_MostRegs_64_SaveList;
+
   case CallingConv::Intel_OCL_BI: {
     bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+    bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
+    if (HasAVX512 && IsWin64)
+      return CSR_Win64_Intel_OCL_BI_AVX512_SaveList;
+    if (HasAVX512 && Is64Bit)
+      return CSR_64_Intel_OCL_BI_AVX512_SaveList;
     if (HasAVX && IsWin64)
       return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
     if (HasAVX && Is64Bit)
@@ -276,8 +285,13 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 const uint32_t*
 X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+  bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
 
   if (CC == CallingConv::Intel_OCL_BI) {
+    if (IsWin64 && HasAVX512)
+      return CSR_Win64_Intel_OCL_BI_AVX512_RegMask;
+    if (Is64Bit && HasAVX512)
+      return CSR_64_Intel_OCL_BI_AVX512_RegMask;
     if (IsWin64 && HasAVX)
       return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
     if (Is64Bit && HasAVX)
@@ -287,6 +301,8 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
   }
   if (CC == CallingConv::GHC || CC == CallingConv::HiPE)
     return CSR_NoRegs_RegMask;
+  if (CC == CallingConv::WebKit_JS || CC == CallingConv::AnyReg)
+    return CSR_MostRegs_64_RegMask;
   if (!Is64Bit)
     return CSR_32_RegMask;
   if (CC == CallingConv::Cold)
@@ -306,19 +322,19 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
   // Set the stack-pointer register and its aliases as reserved.
-  Reserved.set(X86::RSP);
-  for (MCSubRegIterator I(X86::RSP, this); I.isValid(); ++I)
+  for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
     Reserved.set(*I);
 
   // Set the instruction pointer register and its aliases as reserved.
-  Reserved.set(X86::RIP);
-  for (MCSubRegIterator I(X86::RIP, this); I.isValid(); ++I)
+  for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
+       ++I)
     Reserved.set(*I);
 
   // Set the frame-pointer register and its aliases as reserved if needed.
   if (TFI->hasFP(MF)) {
-    Reserved.set(X86::RBP);
-    for (MCSubRegIterator I(X86::RBP, this); I.isValid(); ++I)
+    for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
+         ++I)
       Reserved.set(*I);
   }
 
@@ -331,8 +347,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
         "Stack realignment in presence of dynamic allocas is not supported with"
         "this calling convention.");
 
-    Reserved.set(getBaseRegister());
-    for (MCSubRegIterator I(getBaseRegister(), this); I.isValid(); ++I)
+    for (MCSubRegIterator I(getBaseRegister(), this, /*IncludeSelf=*/true);
+         I.isValid(); ++I)
       Reserved.set(*I);
   }
 
@@ -345,14 +361,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   Reserved.set(X86::GS);
 
   // Mark the floating point stack registers as reserved.
-  Reserved.set(X86::ST0);
-  Reserved.set(X86::ST1);
-  Reserved.set(X86::ST2);
-  Reserved.set(X86::ST3);
-  Reserved.set(X86::ST4);
-  Reserved.set(X86::ST5);
-  Reserved.set(X86::ST6);
-  Reserved.set(X86::ST7);
+  for (unsigned n = 0; n != 8; ++n)
+    Reserved.set(X86::ST0 + n);
 
   // Reserve the registers that only exist in 64-bit mode.
   if (!Is64Bit) {
@@ -365,19 +375,20 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 
     for (unsigned n = 0; n != 8; ++n) {
       // R8, R9, ...
-      static const uint16_t GPR64[] = {
-        X86::R8,  X86::R9,  X86::R10, X86::R11,
-        X86::R12, X86::R13, X86::R14, X86::R15
-      };
-      for (MCRegAliasIterator AI(GPR64[n], this, true); AI.isValid(); ++AI)
+      for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI)
         Reserved.set(*AI);
 
       // XMM8, XMM9, ...
-      assert(X86::XMM15 == X86::XMM8+7);
       for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI)
         Reserved.set(*AI);
     }
   }
+  if (!Is64Bit || !TM.getSubtarget<X86Subtarget>().hasAVX512()) {
+    for (unsigned n = 16; n != 32; ++n) {
+      for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI)
+        Reserved.set(*AI);
+    }
+  }
 
   return Reserved;
 }
@@ -407,10 +418,11 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
 }
 
 bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
+    return false;
+
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   const MachineRegisterInfo *MRI = &MF.getRegInfo();
-  if (!MF.getTarget().Options.RealignStack)
-    return false;
 
   // Stack realignment requires a frame pointer.  If we already started
   // register allocation with frame pointer elimination, it is too late now.
@@ -507,14 +519,6 @@ unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return TFI->hasFP(MF) ? FramePtr : StackPtr;
 }
 
-unsigned X86RegisterInfo::getEHExceptionRegister() const {
-  llvm_unreachable("What is the exception register");
-}
-
-unsigned X86RegisterInfo::getEHHandlerRegister() const {
-  llvm_unreachable("What is the exception handler register");
-}
-
 namespace llvm {
 unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
                                 bool High) {
@@ -688,4 +692,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
     }
   }
 }
+
+unsigned get512BitSuperRegister(unsigned Reg) {
+  if (Reg >= X86::XMM0 && Reg <= X86::XMM31)
+    return X86::ZMM0 + (Reg - X86::XMM0);
+  if (Reg >= X86::YMM0 && Reg <= X86::YMM31)
+    return X86::ZMM0 + (Reg - X86::YMM0);
+  if (Reg >= X86::ZMM0 && Reg <= X86::ZMM31)
+    return Reg;
+  llvm_unreachable("Unexpected SIMD register");
+}
+
 }
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index b9d7b8c..22251b2 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -27,7 +27,6 @@ namespace llvm {
 class X86RegisterInfo : public X86GenRegisterInfo {
 public:
   X86TargetMachine &TM;
-  const TargetInstrInfo &TII;
 
 private:
   /// Is64Bit - Is the target 64-bits.
@@ -56,7 +55,7 @@ private:
   unsigned BasePtr;
 
 public:
-  X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+  X86RegisterInfo(X86TargetMachine &tm);
 
   // FIXME: This should be tablegen'd like getDwarfRegNum is
   int getSEHRegNum(unsigned i) const;
@@ -127,10 +126,6 @@ public:
   unsigned getBaseRegister() const { return BasePtr; }
   // FIXME: Move to FrameInfok
   unsigned getSlotSize() const { return SlotSize; }
-
-  // Exception handling queries.
-  unsigned getEHExceptionRegister() const;
-  unsigned getEHHandlerRegister() const;
 };
 
 // getX86SubSuperRegister - X86 utility function. It returns the sub or super
@@ -138,6 +133,9 @@ public:
 // e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) return X86:AX
 unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
 
+//get512BitRegister - X86 utility - returns 512-bit super register
+unsigned get512BitSuperRegister(unsigned Reg);
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index be6282a..b802728 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -21,11 +21,12 @@ class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n>
 
 // Subregister indices.
 let Namespace = "X86" in {
-  def sub_8bit    : SubRegIndex;
-  def sub_8bit_hi : SubRegIndex;
-  def sub_16bit   : SubRegIndex;
-  def sub_32bit   : SubRegIndex;
-  def sub_xmm     : SubRegIndex;
+  def sub_8bit    : SubRegIndex<8>;
+  def sub_8bit_hi : SubRegIndex<8, 8>;
+  def sub_16bit   : SubRegIndex<16>;
+  def sub_32bit   : SubRegIndex<32>;
+  def sub_xmm     : SubRegIndex<128>;
+  def sub_ymm     : SubRegIndex<256>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -186,28 +187,53 @@ def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
 def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
 def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
 def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+
+def XMM16:  X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
+def XMM17:  X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
+def XMM18:  X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
+def XMM19:  X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
+def XMM20:  X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
+def XMM21:  X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
+def XMM22:  X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
+def XMM23:  X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
+def XMM24:  X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
+def XMM25:  X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
+def XMM26:  X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
+def XMM27:  X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
+def XMM28:  X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
+def XMM29:  X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
+def XMM30:  X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
+def XMM31:  X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+
 } // CostPerUse
 
-// YMM Registers, used by AVX instructions
+// YMM0-15 registers, used by AVX instructions and
+// YMM16-31 registers, used by AVX-512 instructions.
 let SubRegIndices = [sub_xmm] in {
-def YMM0:  X86Reg<"ymm0",   0, [XMM0]>,  DwarfRegAlias<XMM0>;
-def YMM1:  X86Reg<"ymm1",   1, [XMM1]>,  DwarfRegAlias<XMM1>;
-def YMM2:  X86Reg<"ymm2",   2, [XMM2]>,  DwarfRegAlias<XMM2>;
-def YMM3:  X86Reg<"ymm3",   3, [XMM3]>,  DwarfRegAlias<XMM3>;
-def YMM4:  X86Reg<"ymm4",   4, [XMM4]>,  DwarfRegAlias<XMM4>;
-def YMM5:  X86Reg<"ymm5",   5, [XMM5]>,  DwarfRegAlias<XMM5>;
-def YMM6:  X86Reg<"ymm6",   6, [XMM6]>,  DwarfRegAlias<XMM6>;
-def YMM7:  X86Reg<"ymm7",   7, [XMM7]>,  DwarfRegAlias<XMM7>;
-def YMM8:  X86Reg<"ymm8",   8, [XMM8]>,  DwarfRegAlias<XMM8>;
-def YMM9:  X86Reg<"ymm9",   9, [XMM9]>,  DwarfRegAlias<XMM9>;
-def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>;
-def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>;
-def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>;
-def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>;
-def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>;
-def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>;
+  foreach  Index = 0-31 in {
+    def YMM#Index : X86Reg<"ymm"#Index, Index, [!cast<X86Reg>("XMM"#Index)]>,
+                    DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+  }
+}
+
+// ZMM Registers, used by AVX-512 instructions.
+let SubRegIndices = [sub_ymm] in {
+  foreach  Index = 0-31 in {
+    def ZMM#Index : X86Reg<"zmm"#Index, Index, [!cast<X86Reg>("YMM"#Index)]>,
+                    DwarfRegAlias<!cast<X86Reg>("XMM"#Index)>;
+  }
 }
 
+  // Mask Registers, used by AVX-512 instructions.
+  def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, -2, -2]>;
+  def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, -2, -2]>;
+  def K2 : X86Reg<"k2", 2>, DwarfRegNum<[120, -2, -2]>;
+  def K3 : X86Reg<"k3", 3>, DwarfRegNum<[121, -2, -2]>;
+  def K4 : X86Reg<"k4", 4>, DwarfRegNum<[122, -2, -2]>;
+  def K5 : X86Reg<"k5", 5>, DwarfRegNum<[123, -2, -2]>;
+  def K6 : X86Reg<"k6", 6>, DwarfRegNum<[124, -2, -2]>;
+  def K7 : X86Reg<"k7", 7>, DwarfRegNum<[125, -2, -2]>;
+
 class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
   let Aliases = A;
 }
@@ -421,3 +447,25 @@ def FPCCR : RegisterClass<"X86", [i16], 16, (add FPSW)> {
   let CopyCost = -1;  // Don't allow copying of status registers.
   let isAllocatable = 0;
 }
+
+// AVX-512 vector/mask registers.
+def VR512 : RegisterClass<"X86", [v16f32, v8f64, v16i32, v8i64], 512,
+    (sequence "ZMM%u", 0, 31)>;
+
+// Scalar AVX-512 floating point registers.
+def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
+
+def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
+
+// Extended VR128 and VR256 for AVX-512 instructions
+def VR128X : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                          128, (add FR32X)>;
+def VR256X : RegisterClass<"X86", [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
+                          256, (sequence "YMM%u", 0, 31)>;
+
+def VK8     : RegisterClass<"X86", [v8i1],   8, (sequence "K%u", 0, 7)>;
+def VK16    : RegisterClass<"X86", [v16i1], 16, (add VK8)>;
+
+def VK8WM   : RegisterClass<"X86", [v8i1],   8, (sub VK8, K0)>;
+def VK16WM  : RegisterClass<"X86", [v16i1], 16, (add VK8WM)>;
+
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 84c9203..9748261 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -16,10 +16,13 @@ def HaswellModel : SchedMachineModel {
   // All x86 instructions are modeled as a single micro-op, and HW can decode 4
   // instructions per cycle.
   let IssueWidth = 4;
-  let MinLatency = 0; // 0 = Out-of-order execution.
+  let MicroOpBufferSize = 192; // Based on the reorder buffer.
   let LoadLatency = 4;
-  let ILPWindow = 30;
   let MispredictPenalty = 16;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
 }
 
 let SchedModel = HaswellModel in {
@@ -50,6 +53,12 @@ def HWPort15  : ProcResGroup<[HWPort1, HWPort5]>;
 def HWPort015 : ProcResGroup<[HWPort0, HWPort1, HWPort5]>;
 def HWPort0156: ProcResGroup<[HWPort0, HWPort1, HWPort5, HWPort6]>;
 
+// 60 Entry Unified Scheduler
+def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
+                              HWPort5, HWPort6, HWPort7]> {
+  let BufferSize=60;
+}
+
 // Integer division issued on port 0.
 def HWDivider : ProcResource<1>;
 
@@ -86,6 +95,7 @@ def : WriteRes<WriteZero,  []>;
 
 defm : HWWriteResPair<WriteALU,   HWPort0156, 1>;
 defm : HWWriteResPair<WriteIMul,  HWPort1,   3>;
+def  : WriteRes<WriteIMulH, []> { let Latency = 3; }
 defm : HWWriteResPair<WriteShift, HWPort056,  1>;
 defm : HWWriteResPair<WriteJump,  HWPort5,   1>;
 
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index b36b3ad..3011c6d 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -17,10 +17,13 @@ def SandyBridgeModel : SchedMachineModel {
   // instructions per cycle.
   // FIXME: Identify instructions that aren't a single fused micro-op.
   let IssueWidth = 4;
-  let MinLatency = 0; // 0 = Out-of-order execution.
+  let MicroOpBufferSize = 168; // Based on the reorder buffer.
   let LoadLatency = 4;
-  let ILPWindow = 20;
   let MispredictPenalty = 16;
+
+  // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
 }
 
 let SchedModel = SandyBridgeModel in {
@@ -46,6 +49,11 @@ def SBPort05  : ProcResGroup<[SBPort0, SBPort5]>;
 def SBPort15  : ProcResGroup<[SBPort1, SBPort5]>;
 def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
 
+// 54 Entry Unified Scheduler
+def SBPortAny : ProcResGroup<[SBPort0, SBPort1, SBPort23, SBPort4, SBPort5]> {
+  let BufferSize=54;
+}
+
 // Integer division issued on port 0.
 def SBDivider : ProcResource<1>;
 
@@ -82,6 +90,7 @@ def : WriteRes<WriteZero,  []>;
 
 defm : SBWriteResPair<WriteALU,   SBPort015, 1>;
 defm : SBWriteResPair<WriteIMul,  SBPort1,   3>;
+def  : WriteRes<WriteIMulH, []> { let Latency = 3; }
 defm : SBWriteResPair<WriteShift, SBPort05,  1>;
 defm : SBWriteResPair<WriteJump,  SBPort5,   1>;
 
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 9fbde88..0556437 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -42,6 +42,7 @@ multiclass X86SchedWritePair {
 // Arithmetic.
 defm WriteALU  : X86SchedWritePair; // Simple integer ALU op.
 defm WriteIMul : X86SchedWritePair; // Integer multiplication.
+def  WriteIMulH : SchedWrite;       // Integer multiplication, high part.
 defm WriteIDiv : X86SchedWritePair; // Integer division.
 def  WriteLEA  : SchedWrite;        // LEA instructions can't fold loads.
 
@@ -140,9 +141,12 @@ def IIC_IDIV64      : InstrItinClass;
 // neg/not/inc/dec
 def IIC_UNARY_REG   : InstrItinClass;
 def IIC_UNARY_MEM   : InstrItinClass;
-// add/sub/and/or/xor/adc/sbc/cmp/test
+// add/sub/and/or/xor/sbc/cmp/test
 def IIC_BIN_MEM     : InstrItinClass;
 def IIC_BIN_NONMEM  : InstrItinClass;
+// adc/sbc
+def IIC_BIN_CARRY_MEM     : InstrItinClass;
+def IIC_BIN_CARRY_NONMEM  : InstrItinClass;
 // shift/rotate
 def IIC_SR          : InstrItinClass;
 // shift double
@@ -249,11 +253,11 @@ def IIC_SSE_INTSH_P_RR : InstrItinClass;
 def IIC_SSE_INTSH_P_RM : InstrItinClass;
 def IIC_SSE_INTSH_P_RI : InstrItinClass;
 
-def IIC_SSE_CMPP_RR : InstrItinClass;
-def IIC_SSE_CMPP_RM : InstrItinClass;
+def IIC_SSE_INTSHDQ_P_RI : InstrItinClass;
 
 def IIC_SSE_SHUFP : InstrItinClass;
-def IIC_SSE_PSHUF : InstrItinClass;
+def IIC_SSE_PSHUF_RI : InstrItinClass;
+def IIC_SSE_PSHUF_MI : InstrItinClass;
 
 def IIC_SSE_UNPCK : InstrItinClass;
 
@@ -266,10 +270,14 @@ def IIC_SSE_PINSRW : InstrItinClass;
 def IIC_SSE_PABS_RR : InstrItinClass;
 def IIC_SSE_PABS_RM : InstrItinClass;
 
-def IIC_SSE_SQRTP_RR : InstrItinClass;
-def IIC_SSE_SQRTP_RM : InstrItinClass;
-def IIC_SSE_SQRTS_RR : InstrItinClass;
-def IIC_SSE_SQRTS_RM : InstrItinClass;
+def IIC_SSE_SQRTPS_RR : InstrItinClass;
+def IIC_SSE_SQRTPS_RM : InstrItinClass;
+def IIC_SSE_SQRTSS_RR : InstrItinClass;
+def IIC_SSE_SQRTSS_RM : InstrItinClass;
+def IIC_SSE_SQRTPD_RR : InstrItinClass;
+def IIC_SSE_SQRTPD_RM : InstrItinClass;
+def IIC_SSE_SQRTSD_RR : InstrItinClass;
+def IIC_SSE_SQRTSD_RM : InstrItinClass;
 
 def IIC_SSE_RCPP_RR : InstrItinClass;
 def IIC_SSE_RCPP_RM : InstrItinClass;
@@ -311,7 +319,8 @@ def IIC_SSE_PSIGN_RM : InstrItinClass;
 
 def IIC_SSE_PMADD : InstrItinClass;
 def IIC_SSE_PMULHRSW : InstrItinClass;
-def IIC_SSE_PALIGNR : InstrItinClass;
+def IIC_SSE_PALIGNRR : InstrItinClass;
+def IIC_SSE_PALIGNRM : InstrItinClass;
 def IIC_SSE_MWAIT : InstrItinClass;
 def IIC_SSE_MONITOR : InstrItinClass;
 
@@ -487,8 +496,8 @@ def IIC_PUSH_REG : InstrItinClass;
 def IIC_PUSH_F : InstrItinClass;
 def IIC_PUSH_A : InstrItinClass;
 def IIC_BSWAP : InstrItinClass;
-def IIC_BSF : InstrItinClass;
-def IIC_BSR : InstrItinClass;
+def IIC_BIT_SCAN_MEM : InstrItinClass;
+def IIC_BIT_SCAN_REG : InstrItinClass;
 def IIC_MOVS : InstrItinClass;
 def IIC_STOS : InstrItinClass;
 def IIC_SCAS : InstrItinClass;
@@ -535,6 +544,33 @@ def IIC_BOUND : InstrItinClass;
 def IIC_ARPL_REG : InstrItinClass;
 def IIC_ARPL_MEM : InstrItinClass;
 def IIC_MOVBE : InstrItinClass;
+def IIC_AES   : InstrItinClass;
+def IIC_BLEND_MEM : InstrItinClass;
+def IIC_BLEND_NOMEM : InstrItinClass;
+def IIC_CBW   : InstrItinClass;
+def IIC_CRC32_REG : InstrItinClass;
+def IIC_CRC32_MEM : InstrItinClass;
+def IIC_SSE_DPPD_RR : InstrItinClass;
+def IIC_SSE_DPPD_RM : InstrItinClass;
+def IIC_SSE_DPPS_RR : InstrItinClass;
+def IIC_SSE_DPPS_RM : InstrItinClass;
+def IIC_MMX_EMMS : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RR : InstrItinClass;
+def IIC_SSE_EXTRACTPS_RM : InstrItinClass;
+def IIC_SSE_INSERTPS_RR : InstrItinClass;
+def IIC_SSE_INSERTPS_RM : InstrItinClass;
+def IIC_SSE_MPSADBW_RR : InstrItinClass;
+def IIC_SSE_MPSADBW_RM : InstrItinClass;
+def IIC_SSE_PMULLD_RR : InstrItinClass;
+def IIC_SSE_PMULLD_RM : InstrItinClass;
+def IIC_SSE_ROUNDPS_REG : InstrItinClass;
+def IIC_SSE_ROUNDPS_MEM : InstrItinClass;
+def IIC_SSE_ROUNDPD_REG : InstrItinClass;
+def IIC_SSE_ROUNDPD_MEM : InstrItinClass;
+def IIC_SSE_POPCNT_RR : InstrItinClass;
+def IIC_SSE_POPCNT_RM : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RR : InstrItinClass;
+def IIC_SSE_PCLMULQDQ_RM : InstrItinClass;
 
 def IIC_NOP : InstrItinClass;
 
@@ -546,8 +582,9 @@ def IIC_NOP : InstrItinClass;
 // Resources beyond the decoder operate on micro-ops and are bufferred
 // so adjacent micro-ops don't directly compete.
 //
-// MinLatency=0 indicates that RAW dependencies can be decoded in the
-// same cycle.
+// MicroOpBufferSize > 1 indicates that RAW dependencies can be
+// decoded in the same cycle. The value 32 is a reasonably arbitrary
+// number of in-flight instructions.
 //
 // HighLatency=10 is optimistic. X86InstrInfo::isHighLatencyDef
 // indicates high latency opcodes. Alternatively, InstrItinData
@@ -555,19 +592,15 @@ def IIC_NOP : InstrItinClass;
 // latencies. Since these latencies are not used for pipeline hazards,
 // they do not need to be exact.
 //
-// ILPWindow=10 is an arbitrary threshold that approximates cycles of
-// latency hidden by instruction buffers. The actual value is not very
-// important but should be zero for inorder and nonzero for OOO processors.
-//
-// The GenericModel contains no instruciton itineraries.
+// The GenericModel contains no instruction itineraries.
 def GenericModel : SchedMachineModel {
   let IssueWidth = 4;
-  let MinLatency = 0;
+  let MicroOpBufferSize = 32;
   let LoadLatency = 4;
   let HighLatency = 10;
-  let ILPWindow = 10;
 }
 
 include "X86ScheduleAtom.td"
 include "X86SchedSandyBridge.td"
 include "X86SchedHaswell.td"
+include "X86ScheduleSLM.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index cce8f1b..ba72f29 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the itinerary class data for the Intel Atom (Bonnell)
-// processors.
+// This file defines the itinerary class data for the Intel Atom
+// in order (Saltwell-32nm/Bonnell-45nm) processors.
 //
 //===----------------------------------------------------------------------===//
 
@@ -79,9 +79,12 @@ def AtomItineraries : ProcessorItineraries<
   // neg/not/inc/dec
   InstrItinData<IIC_UNARY_REG, [InstrStage<1, [Port0, Port1]>] >,
   InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [Port0]>] >,
-  // add/sub/and/or/xor/adc/sbc/cmp/test
+  // add/sub/and/or/xor/cmp/test
   InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
   InstrItinData<IIC_BIN_MEM, [InstrStage<1, [Port0]>] >,
+  // adc/sbc
+  InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<1, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<1, [Port0]>] >,
   // shift/rotate
   InstrItinData<IIC_SR, [InstrStage<1, [Port0]>] >,
   // shift double
@@ -203,18 +206,23 @@ def AtomItineraries : ProcessorItineraries<
   InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<3, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [Port0, Port1]>] >,
 
-  InstrItinData<IIC_SSE_CMPP_RR, [InstrStage<6, [Port0, Port1]>] >,
-  InstrItinData<IIC_SSE_CMPP_RM, [InstrStage<7, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [Port0, Port1]>] >,
 
   InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [Port0]>] >,
-  InstrItinData<IIC_SSE_PSHUF, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
 
   InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
 
-  InstrItinData<IIC_SSE_SQRTP_RR, [InstrStage<13, [Port0, Port1]>] >,
-  InstrItinData<IIC_SSE_SQRTP_RM, [InstrStage<14, [Port0, Port1]>] >,
-  InstrItinData<IIC_SSE_SQRTS_RR, [InstrStage<11, [Port0, Port1]>] >,
-  InstrItinData<IIC_SSE_SQRTS_RM, [InstrStage<12, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<70, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<34, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<34, [Port0, Port1]>] >,
+
+  InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<125, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
 
   InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
@@ -273,7 +281,8 @@ def AtomItineraries : ProcessorItineraries<
 
   InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [Port0]>] >,
   InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [Port0]>] >,
-  InstrItinData<IIC_SSE_PALIGNR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [Port0]>] >,
   InstrItinData<IIC_SSE_MWAIT, [InstrStage<46, [Port0, Port1]>] >,
   InstrItinData<IIC_SSE_MONITOR, [InstrStage<45, [Port0, Port1]>] >,
 
@@ -465,8 +474,8 @@ def AtomItineraries : ProcessorItineraries<
   InstrItinData<IIC_PUSH_A, [InstrStage<8, [Port0, Port1]>] >,
 
   InstrItinData<IIC_BSWAP, [InstrStage<1, [Port0]>] >,
-  InstrItinData<IIC_BSF, [InstrStage<16, [Port0, Port1]>] >,
-  InstrItinData<IIC_BSR, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<16, [Port0, Port1]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<16, [Port0, Port1]>] >,
   InstrItinData<IIC_MOVS, [InstrStage<3, [Port0, Port1]>] >,
   InstrItinData<IIC_STOS, [InstrStage<1, [Port0, Port1]>] >,
   InstrItinData<IIC_SCAS, [InstrStage<2, [Port0, Port1]>] >,
@@ -513,6 +522,8 @@ def AtomItineraries : ProcessorItineraries<
   InstrItinData<IIC_ARPL_REG, [InstrStage<24, [Port0, Port1]>] >,
   InstrItinData<IIC_ARPL_MEM, [InstrStage<23, [Port0, Port1]>] >,
   InstrItinData<IIC_MOVBE, [InstrStage<1, [Port0]>] >,
+  InstrItinData<IIC_CBW, [InstrStage<4, [Port0, Port1]>] >,
+  InstrItinData<IIC_MMX_EMMS, [InstrStage<5, [Port0, Port1]>] >,
 
   InstrItinData<IIC_NOP, [InstrStage<1, [Port0, Port1]>] >
   ]>;
@@ -520,11 +531,9 @@ def AtomItineraries : ProcessorItineraries<
 // Atom machine model.
 def AtomModel : SchedMachineModel {
   let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
-  let MinLatency = 1;  // InstrStage cycles overrides MinLatency.
-                       // OperandCycles may be used for expected latency.
+  let MicroOpBufferSize = 0; // In-order execution, always hide latency.
   let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
   let HighLatency = 30;// Expected, may be overriden by OperandCycles.
-  let ILPWindow = 0; // Always try to hide expected latency.
 
   let Itineraries = AtomItineraries;
 }
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
new file mode 100644
index 0000000..6c2a304
--- /dev/null
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -0,0 +1,668 @@
+//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Intel Atom
+// (Silvermont) processor.
+//
+//===----------------------------------------------------------------------===//
+
+def IEC_RSV0 : FuncUnit;
+def IEC_RSV1 : FuncUnit;
+def FPC_RSV0 : FuncUnit;
+def FPC_RSV1 : FuncUnit;
+def MEC_RSV : FuncUnit;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+def SLMItineraries : ProcessorItineraries<
+  [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ],
+  [], [
+  // [InstrStage<N, [FPC_RSV0, FPC_RSV1]>] 
+  // [InstrStage<N, [FPC_RSV0, FPC_RSV1], 0>, InstrStage<N, [MEC_RSV]>]
+  // [InstrStage<N, [IEC_RSV0, IEC_RSV1]>] 
+  // [InstrStage<N, [IEC_RSV0, IEC_RSV1], 0>,InstrStage<N,[MEC_RSV]>]
+  //
+  // Default is 1 cycle, IEC_RSV0 or IEC_RSV1
+  //InstrItinData<IIC_DEFAULT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_ALU_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LEA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LEA_16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // mul
+  InstrItinData<IIC_MUL8, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MUL16_MEM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_MUL16_REG, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MUL32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_MUL32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MUL64, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  // imul by al, ax, eax, rax
+  InstrItinData<IIC_IMUL8, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL16_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL16_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL32_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL64, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  // imul reg by reg|mem
+  InstrItinData<IIC_IMUL16_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL16_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL32_RM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL32_RR, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL64_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL64_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>]  >,
+  // imul reg = reg/mem * imm
+  InstrItinData<IIC_IMUL16_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL32_RRI, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL64_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IMUL16_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL32_RMI, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_IMUL64_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  // idiv - min latency
+  InstrItinData<IIC_IDIV8, [InstrStage<34, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IDIV16, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IDIV32, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IDIV64, [InstrStage<49, [IEC_RSV0, IEC_RSV1]>] >,
+  // div - min latency
+  InstrItinData<IIC_DIV8_REG, [InstrStage<25, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DIV8_MEM, [InstrStage<25, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<25, [MEC_RSV]>] >,
+  InstrItinData<IIC_DIV16, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DIV32, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DIV64, [InstrStage<38, [IEC_RSV0, IEC_RSV1]>] >,
+  // neg/not/inc/dec
+  InstrItinData<IIC_UNARY_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  // add/sub/and/or/xor/adc/sbc/cmp/test
+  InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BIN_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  // adc/sbb
+  InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  // shift/rotate
+  InstrItinData<IIC_SR, [InstrStage<1, [IEC_RSV0], 0>,
+                   InstrStage<1, [MEC_RSV]>] >,
+  // shift double
+  InstrItinData<IIC_SHD16_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD16_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD32_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD64_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD64_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
+  InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+  // cmov
+  InstrItinData<IIC_CMOV16_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMOV16_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMOV32_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMOV32_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMOV64_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMOV64_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
+  // set
+  InstrItinData<IIC_SET_M, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SET_R, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jcc
+  InstrItinData<IIC_Jcc, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jcxz/jecxz/jrcxz
+  InstrItinData<IIC_JCXZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jmp rel
+  InstrItinData<IIC_JMP_REL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // jmp indirect
+  InstrItinData<IIC_JMP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_JMP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  // jmp far
+  InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // loop/loope/loopne
+  InstrItinData<IIC_LOOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LOOPE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LOOPNE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // call - all but reg/imm
+  InstrItinData<IIC_CALL_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CALL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  //ret
+  InstrItinData<IIC_RET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RET_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  //sign extension movs
+  InstrItinData<IIC_MOVSX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  //zero extension movs
+  InstrItinData<IIC_MOVZX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_REP_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_REP_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  // SSE binary operations
+  // arithmetic fp scalar
+  InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<13, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<13, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+
+  // arithmetic fp parallel
+  InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<27, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<27, [MEC_RSV]>] >,
+
+  // bitwise parallel
+  InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  // arithmetic int parallel
+  InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+
+  // multiply int parallel
+  InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [FPC_RSV0], 0>,
+                   InstrStage<5, [MEC_RSV]>] >,
+
+  // shift parallel
+  InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<2, [FPC_RSV0], 0>,
+                   InstrStage<2, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
+
+  InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
+
+  InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [FPC_RSV0], 0>,
+                   InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [FPC_RSV0]>] >,
+
+  InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<26, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<26, [FPC_RSV0], 0>,
+                   InstrStage<26, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<13, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<13, [FPC_RSV0], 0>,
+                   InstrStage<13, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<26, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<26, [FPC_RSV0], 0>,
+                   InstrStage<26, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<13, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<13, [FPC_RSV0], 0>,
+                   InstrStage<13, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<9, [FPC_RSV0], 0>,
+                   InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [FPC_RSV0]>] >,
+  InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [FPC_RSV0], 0>,
+                   InstrStage<4, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOVMSK, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MASKMOV, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_LDDQU, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PAUSE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_STMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<9, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MWAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MONITOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  // conversions
+  // to/from PD ...
+  InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  // to/from PS except to/from PD and PS2PI
+  InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+
+  // MMX MOVs
+  InstrItinData<IIC_MMX_MOV_MM_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // other MMX
+  InstrItinData<IIC_MMX_ALU_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_ALU_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PMUL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PSADBW,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PCK_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PCK_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PSHUF,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PEXTR,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_PINSRW,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // conversions
+  // from/to PD
+  InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // from/to PI
+  InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_CMPX_LOCK, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_FILD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLD,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLD80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_FST,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FST80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FIST,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_FLDZ,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FUCOM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FUCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FCOMI,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNSTSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNSTCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLDCW,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNINIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FFREE,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNCLEX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_WAIT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXAM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FNOP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FLDL,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_F2XM1,  [InstrStage<88, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FYL2X,  [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FPTAN,  [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FPATAN,  [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FXTRACT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FPREM1,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FPSTP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FPREM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FYL2XP1,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FSINCOS,  [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FRNDINT,  [InstrStage<25, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FSCALE,  [InstrStage<74, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_FCOMPP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXSAVE,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXRSTOR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_FXCH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+
+  // System instructions
+  InstrItinData<IIC_CPUID, [InstrStage<60, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INT,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INT3,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INVD,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INVLPG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IRET,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_HLT,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LXS,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LTR,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RDTSC, [InstrStage<30, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RSM,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SIDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SGDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SLDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STR,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SWAPGS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SYSCALL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_IN_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_IN_RI,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_OUT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_OUT_IR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_INS,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_MOV_REG_DR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_DR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // worst case for mov REG_CRx
+  InstrItinData<IIC_MOV_REG_CR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_CR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_MEM_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_SR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_SR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // LAR
+  InstrItinData<IIC_LAR_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LAR_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // LSL
+  InstrItinData<IIC_LSL_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LSL_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_LGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LLDT_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LLDT_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // push control register, segment registers
+  InstrItinData<IIC_PUSH_CS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // pop control register, segment registers
+  InstrItinData<IIC_POP_SR,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_SR_SS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // VERR, VERW
+  InstrItinData<IIC_VERR,     [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_VERW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_VERW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // WRMSR, RDMSR
+  InstrItinData<IIC_WRMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RDMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_RDPMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  // SMSW, LMSW
+  InstrItinData<IIC_SMSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LMSW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LMSW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_ENTER, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LEAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_POP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_REG16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_FD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_POP_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_PUSH_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+
+  InstrItinData<IIC_BSWAP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<10, [MEC_RSV]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_SCAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_MOV_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_AHF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BT_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BT_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BTX_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BTX_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_BTX_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BTX_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XCHG_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XCHG_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_XADD_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XADD_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPXCHG_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<6, [MEC_RSV]>] >,
+  InstrItinData<IIC_CMPXCHG_8B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMPXCHG_16B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_LODS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_OUTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CLTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_STD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_XLAT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_DAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_BOUND, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_ARPL_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_ARPL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_MOVBE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_AES, [InstrStage<8, [FPC_RSV0]>] >,
+  InstrItinData<IIC_BLEND_NOMEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_BLEND_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<10, [MEC_RSV]>] >,
+  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CBW, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CRC32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
+  InstrItinData<IIC_CRC32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
+                  InstrStage<3, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DPPD_RR, [InstrStage<12, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DPPD_RM, [InstrStage<12, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<12, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_DPPS_RR, [InstrStage<15, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_DPPS_RM, [InstrStage<15, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<15, [MEC_RSV]>] >,
+  InstrItinData<IIC_MMX_EMMS, [InstrStage<10, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_EXTRACTPS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_EXTRACTPS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_INSERTPS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_INSERTPS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_MPSADBW_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_MPSADBW_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<1, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PMULLD_RR, [InstrStage<11, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PMULLD_RM, [InstrStage<11, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<11, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ROUNDPS_REG, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ROUNDPS_MEM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<5, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_ROUNDPD_REG, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
+  InstrItinData<IIC_SSE_ROUNDPD_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_POPCNT_RR, [InstrStage<4, [IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_POPCNT_RM, [InstrStage<4, [IEC_RSV1], 0>,
+                  InstrStage<4, [MEC_RSV]>] >,
+  InstrItinData<IIC_SSE_PCLMULQDQ_RR, [InstrStage<10, [IEC_RSV1]>] >,
+  InstrItinData<IIC_SSE_PCLMULQDQ_RM, [InstrStage<10, [IEC_RSV1], 0>,
+                  InstrStage<10, [MEC_RSV]>] >,
+
+  InstrItinData<IIC_NOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >
+  ]>;
+
+// Silvermont machine model.
+def SLMModel : SchedMachineModel {
+  let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
+  let MinLatency = 1;  // InstrStage cycles overrides MinLatency.
+                       // OperandCycles may be used for expected latency.
+  let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
+  let HighLatency = 30;// Expected, may be overriden by OperandCycles.
+
+  let Itineraries = SLMItineraries;
+}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index f934fdd..b9c620f 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -27,7 +27,7 @@ X86SelectionDAGInfo::~X86SelectionDAGInfo() {
 }
 
 SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                                              SDValue Chain,
                                              SDValue Dst, SDValue Src,
                                              SDValue Size, unsigned Align,
@@ -46,8 +46,6 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
       !ConstantSize ||
       ConstantSize->getZExtValue() >
         Subtarget->getMaxInlineSizeThreshold()) {
-    SDValue InFlag(0, 0);
-
     // Check to see if there is a specialized entry-point for memory zeroing.
     ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
 
@@ -175,7 +173,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
 }
 
 SDValue
-X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                         SDValue Chain, SDValue Dst, SDValue Src,
                                         SDValue Size, unsigned Align,
                                         bool isVolatile, bool AlwaysInline,
diff --git a/lib/Target/X86/X86SelectionDAGInfo.h b/lib/Target/X86/X86SelectionDAGInfo.h
index d1d66fe..d728af5 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/lib/Target/X86/X86SelectionDAGInfo.h
@@ -34,7 +34,7 @@ public:
   ~X86SelectionDAGInfo();
 
   virtual
-  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
                                   SDValue Dst, SDValue Src,
                                   SDValue Size, unsigned Align,
@@ -42,7 +42,7 @@ public:
                                   MachinePointerInfo DstPtrInfo) const;
 
   virtual
-  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+  SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
                                   SDValue Dst, SDValue Src,
                                   SDValue Size, unsigned Align,
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 74da2a9..01353b2 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -276,20 +276,29 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
          (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
          (Family == 6 && Model == 0x2A) || // SandyBridge
          (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
-         (Family == 6 && Model == 0x3A))) {// IvyBridge
+         (Family == 6 && Model == 0x3A) || // IvyBridge
+         (Family == 6 && Model == 0x3E) || // IvyBridge EP
+         (Family == 6 && Model == 0x3C) || // Haswell
+         (Family == 6 && Model == 0x3F) || // ...
+         (Family == 6 && Model == 0x45) || // ...
+         (Family == 6 && Model == 0x46))) { // ...
       IsUAMemFast = true;
       ToggleFeature(X86::FeatureFastUAMem);
     }
 
-    // Set processor type. Currently only Atom is detected.
+    // Set processor type. Currently only Atom or Silvermont (SLM) is detected.
     if (Family == 6 &&
-        (Model == 28 || Model == 38 || Model == 39
-         || Model == 53 || Model == 54)) {
+        (Model == 28 || Model == 38 || Model == 39 ||
+         Model == 53 || Model == 54)) {
       X86ProcFamily = IntelAtom;
 
       UseLeaForSP = true;
       ToggleFeature(X86::FeatureLeaForSP);
     }
+    else if (Family == 6 &&
+        (Model == 55 || Model == 74 || Model == 77)) {
+      X86ProcFamily = IntelSLM;
+    }
 
     unsigned MaxExtLevel;
     X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
@@ -351,14 +360,38 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
         HasRTM = true;
         ToggleFeature(X86::FeatureRTM);
       }
-      if (IsIntel && ((EBX >> 19) & 0x1)) {
-        HasADX = true;
-        ToggleFeature(X86::FeatureADX);
+      if (IsIntel && ((EBX >> 16) & 0x1)) {
+        X86SSELevel = AVX512F;
+        ToggleFeature(X86::FeatureAVX512);
       }
       if (IsIntel && ((EBX >> 18) & 0x1)) {
         HasRDSEED = true;
         ToggleFeature(X86::FeatureRDSEED);
       }
+      if (IsIntel && ((EBX >> 19) & 0x1)) {
+        HasADX = true;
+        ToggleFeature(X86::FeatureADX);
+      }
+      if (IsIntel && ((EBX >> 26) & 0x1)) {
+        HasPFI = true;
+        ToggleFeature(X86::FeaturePFI);
+      }
+      if (IsIntel && ((EBX >> 27) & 0x1)) {
+        HasERI = true;
+        ToggleFeature(X86::FeatureERI);
+      }
+      if (IsIntel && ((EBX >> 28) & 0x1)) {
+        HasCDI = true;
+        ToggleFeature(X86::FeatureCDI);
+      }
+      if (IsIntel && ((EBX >> 29) & 0x1)) {
+        HasSHA = true;
+        ToggleFeature(X86::FeatureSHA);
+      }
+    }
+    if (IsAMD && ((ECX >> 21) & 0x1)) {
+      HasTBM = true;
+      ToggleFeature(X86::FeatureTBM);
     }
   }
 }
@@ -416,8 +449,8 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
 
     // Make sure 64-bit features are available in 64-bit mode.
     if (In64BitMode) {
-      HasX86_64 = true; ToggleFeature(X86::Feature64Bit);
-      HasCMov = true;   ToggleFeature(X86::FeatureCMOV);
+      if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); }
+      if (!HasCMov)   { HasCMov   = true; ToggleFeature(X86::FeatureCMOV); }
 
       if (X86SSELevel < SSE2) {
         X86SSELevel = SSE2;
@@ -429,9 +462,9 @@ void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
 
   // CPUName may have been set by the CPU detection code. Make sure the
   // new MCSchedModel is used.
-  InitMCProcessorInfo(CPUName, FS);
+  InitCPUSchedModel(CPUName);
 
-  if (X86ProcFamily == IntelAtom)
+  if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM)
     PostRAScheduler = true;
 
   InstrItins = getInstrItineraryForCPU(CPUName);
@@ -468,6 +501,7 @@ void X86Subtarget::initializeEnvironment() {
   HasFMA = false;
   HasFMA4 = false;
   HasXOP = false;
+  HasTBM = false;
   HasMOVBE = false;
   HasRDRAND = false;
   HasF16C = false;
@@ -477,7 +511,11 @@ void X86Subtarget::initializeEnvironment() {
   HasBMI2 = false;
   HasRTM = false;
   HasHLE = false;
+  HasERI = false;
+  HasCDI = false;
+  HasPFI = false;
   HasADX = false;
+  HasSHA = false;
   HasPRFCHW = false;
   HasRDSEED = false;
   IsBTMemSlow = false;
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 66832b9..dd8c081 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -42,7 +42,7 @@ enum Style {
 class X86Subtarget : public X86GenSubtargetInfo {
 protected:
   enum X86SSEEnum {
-    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
   };
 
   enum X863DNowEnum {
@@ -50,7 +50,7 @@ protected:
   };
 
   enum X86ProcFamilyEnum {
-    Others, IntelAtom
+    Others, IntelAtom, IntelSLM
   };
 
   /// X86ProcFamily - X86 processor family: Intel Atom, and others
@@ -97,6 +97,9 @@ protected:
   /// HasXOP - Target has XOP instructions
   bool HasXOP;
 
+  /// HasTBM - Target has TBM instructions.
+  bool HasTBM;
+
   /// HasMOVBE - True if the processor has the MOVBE instruction.
   bool HasMOVBE;
 
@@ -127,6 +130,9 @@ protected:
   /// HasADX - Processor has ADX instructions.
   bool HasADX;
 
+  /// HasSHA - Processor has SHA instructions.
+  bool HasSHA;
+
   /// HasPRFCHW - Processor has PRFCHW instructions.
   bool HasPRFCHW;
 
@@ -169,6 +175,15 @@ protected:
   ///             address generation (AG) time.
   bool LEAUsesAG;
 
+  /// Processor has AVX-512 PreFetch Instructions
+  bool HasPFI;
+  
+  /// Processor has AVX-512 Exponential and Reciprocal Instructions
+  bool HasERI;
+  
+  /// Processor has AVX-512 Conflict Detection Instructions
+  bool HasCDI;
+  
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment;
@@ -249,6 +264,7 @@ public:
   bool hasSSE42() const { return X86SSELevel >= SSE42; }
   bool hasAVX() const { return X86SSELevel >= AVX; }
   bool hasAVX2() const { return X86SSELevel >= AVX2; }
+  bool hasAVX512() const { return X86SSELevel >= AVX512F; }
   bool hasFp256() const { return hasAVX(); }
   bool hasInt256() const { return hasAVX2(); }
   bool hasSSE4A() const { return HasSSE4A; }
@@ -261,6 +277,7 @@ public:
   // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
   bool hasFMA4() const { return HasFMA4 && !HasFMA; }
   bool hasXOP() const { return HasXOP; }
+  bool hasTBM() const { return HasTBM; }
   bool hasMOVBE() const { return HasMOVBE; }
   bool hasRDRAND() const { return HasRDRAND; }
   bool hasF16C() const { return HasF16C; }
@@ -271,6 +288,7 @@ public:
   bool hasRTM() const { return HasRTM; }
   bool hasHLE() const { return HasHLE; }
   bool hasADX() const { return HasADX; }
+  bool hasSHA() const { return HasSHA; }
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
@@ -282,6 +300,9 @@ public:
   bool padShortFunctions() const { return PadShortFunctions; }
   bool callRegIndirect() const { return CallRegIndirect; }
   bool LEAusesAG() const { return LEAUsesAG; }
+  bool hasCDI() const { return HasCDI; }
+  bool hasPFI() const { return HasPFI; }
+  bool hasERI() const { return HasERI; }
 
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
 
@@ -298,10 +319,8 @@ public:
     return (TargetTriple.getEnvironment() == Triple::ELF ||
             TargetTriple.isOSBinFormatELF());
   }
-  bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
-  bool isTargetNaCl() const {
-    return TargetTriple.getOS() == Triple::NaCl;
-  }
+  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
   bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
   bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
   bool isTargetWindows() const { return TargetTriple.getOS() == Triple::Win32; }
@@ -314,15 +333,14 @@ public:
   }
   bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
 
+  bool isOSWindows() const { return TargetTriple.isOSWindows(); }
+
   bool isTargetWin64() const {
-    // FIXME: x86_64-cygwin has not been released yet.
     return In64BitMode && TargetTriple.isOSWindows();
   }
 
   bool isTargetWin32() const {
-    // FIXME: Cygwin is included for isTargetWin64 -- should it be included
-    // here too?
-    return !In64BitMode && (isTargetMingw() || isTargetWindows());
+    return !In64BitMode && (isTargetCygMing() || isTargetWindows());
   }
 
   bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
@@ -338,7 +356,13 @@ public:
   }
   bool isPICStyleStubAny() const {
     return PICStyle == PICStyles::StubDynamicNoPIC ||
-           PICStyle == PICStyles::StubPIC; }
+           PICStyle == PICStyles::StubPIC;
+  }
+
+  bool isCallingConvWin64(CallingConv::ID CC) const {
+    return (isTargetWin64() && CC != CallingConv::X86_64_SysV) ||
+           CC == CallingConv::X86_64_Win64;
+  }
 
   /// ClassifyGlobalReference - Classify a global variable reference for the
   /// current subtarget according to how we should reference it in a non-pcrel
@@ -361,11 +385,14 @@ public:
   /// memset with zero passed as the second argument. Otherwise it
   /// returns null.
   const char *getBZeroEntry() const;
-  
+
   /// This function returns true if the target has sincos() routine in its
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
 
+  /// Enable the MachineScheduler pass for all X86 subtargets.
+  bool enableMachineScheduler() const LLVM_OVERRIDE { return true; }
+
   /// enablePostRAScheduler - run for Atom optimization.
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                              TargetSubtargetInfo::AntiDepBreakMode& Mode,
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 00fa47f..ddf580f 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -49,6 +49,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
     TLInfo(*this),
     TSInfo(*this),
     JITInfo(*this) {
+  initAsmInfo();
 }
 
 void X86_64TargetMachine::anchor() { }
@@ -69,6 +70,7 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
     TLInfo(*this),
     TSInfo(*this),
     JITInfo(*this) {
+  initAsmInfo();
 }
 
 /// X86TargetMachine ctor - Create an X86 target.
@@ -90,7 +92,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
   } else if (Subtarget.is64Bit()) {
     // PIC in 64 bit mode is always rip-rel.
     Subtarget.setPICStyle(PICStyles::RIPRel);
-  } else if (Subtarget.isTargetCygMing()) {
+  } else if (Subtarget.isTargetCOFF()) {
     Subtarget.setPICStyle(PICStyles::None);
   } else if (Subtarget.isTargetDarwin()) {
     if (getRelocationModel() == Reloc::PIC_)
@@ -112,14 +114,14 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
 // Command line options for x86
 //===----------------------------------------------------------------------===//
 static cl::opt<bool>
-UseVZeroUpper("x86-use-vzeroupper",
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
   cl::desc("Minimize AVX to SSE transition penalty"),
   cl::init(true));
 
 // Temporary option to control early if-conversion for x86 while adding machine
 // models.
 static cl::opt<bool>
-X86EarlyIfConv("x86-early-ifcvt",
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
 	       cl::desc("Enable early if-conversion on X86"));
 
 //===----------------------------------------------------------------------===//
@@ -130,7 +132,7 @@ void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
   // Add first the target-independent BasicTTI pass, then our X86 pass. This
   // allows the X86 pass to delegate to the target independent layer when
   // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createBasicTargetTransformInfoPass(this));
   PM.add(createX86TargetTransformInfoPass(this));
 }
 
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 871dacd..086cd4d 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -25,7 +25,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
   // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
   // is an indirect pc-relative reference.
   if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
-    const MCSymbol *Sym = Mang->getSymbol(GV);
+    const MCSymbol *Sym = getSymbol(*Mang, GV);
     const MCExpr *Res =
       MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
     const MCExpr *Four = MCConstantExpr::Create(4, getContext());
@@ -39,7 +39,7 @@ getTTypeGlobalReference(const GlobalValue *GV, Mangler *Mang,
 MCSymbol *X86_64MachoTargetObjectFile::
 getCFIPersonalitySymbol(const GlobalValue *GV, Mangler *Mang,
                         MachineModuleInfo *MMI) const {
-  return Mang->getSymbol(GV);
+  return getSymbol(*Mang, GV);
 }
 
 void
@@ -47,3 +47,9 @@ X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) {
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
 }
+
+const MCExpr *
+X86LinuxTargetObjectFile::getDebugThreadLocalSymbol(
+    const MCSymbol *Sym) const {
+  return MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
+}
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index 9d26d38..79c861d 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -36,6 +36,9 @@ namespace llvm {
   /// and x86-64.
   class X86LinuxTargetObjectFile : public TargetLoweringObjectFileELF {
     virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    /// \brief Describe a TLS variable address within debug info.
+    virtual const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const;
   };
 
 } // end namespace llvm
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index eba9d78..f88a666 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -33,7 +33,6 @@ void initializeX86TTIPass(PassRegistry &);
 namespace {
 
 class X86TTI : public ImmutablePass, public TargetTransformInfo {
-  const X86TargetMachine *TM;
   const X86Subtarget *ST;
   const X86TargetLowering *TLI;
 
@@ -42,12 +41,12 @@ class X86TTI : public ImmutablePass, public TargetTransformInfo {
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
 public:
-  X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+  X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
   X86TTI(const X86TargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
         TLI(TM->getTargetLowering()) {
     initializeX86TTIPass(*PassRegistry::getPassRegistry());
   }
@@ -101,6 +100,11 @@ public:
                                    unsigned Alignment,
                                    unsigned AddressSpace) const;
 
+  virtual unsigned getAddressComputationCost(Type *PtrTy, bool IsComplex) const;
+  
+  virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
+                                    bool IsPairwiseForm) const;
+
   /// @}
 };
 
@@ -126,8 +130,8 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
   // TODO: Currently the __builtin_popcount() implementation using SSE3
   //   instructions is inefficient. Once the problem is fixed, we should
-  //   call ST->hasSSE3() instead of ST->hasSSE4().
-  return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
+  //   call ST->hasSSE3() instead of ST->hasPOPCNT().
+  return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
@@ -173,7 +177,7 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  static const CostTblEntry<MVT> AVX2CostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
     { ISD::SHL,     MVT::v4i32,    1 },
@@ -196,17 +200,27 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SRA,  MVT::v32i8,  32*10 }, // Scalarized.
     { ISD::SRA,  MVT::v16i16,  16*10 }, // Scalarized.
     { ISD::SRA,  MVT::v4i64,  4*10 }, // Scalarized.
+
+    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
+    { ISD::SDIV,  MVT::v32i8,  32*20 },
+    { ISD::SDIV,  MVT::v16i16, 16*20 },
+    { ISD::SDIV,  MVT::v8i32,  8*20 },
+    { ISD::SDIV,  MVT::v4i64,  4*20 },
+    { ISD::UDIV,  MVT::v32i8,  32*20 },
+    { ISD::UDIV,  MVT::v16i16, 16*20 },
+    { ISD::UDIV,  MVT::v8i32,  8*20 },
+    { ISD::UDIV,  MVT::v4i64,  4*20 },
   };
 
   // Look for AVX2 lowering tricks.
   if (ST->hasAVX2()) {
-    int Idx = CostTableLookup<MVT>(AVX2CostTable, array_lengthof(AVX2CostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(AVX2CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * AVX2CostTable[Idx].Cost;
   }
 
-  static const CostTblEntry<MVT> SSE2UniformConstCostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType>
+  SSE2UniformConstCostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
     // Constant splats are cheaper for the following instructions.
@@ -227,15 +241,13 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
       ST->hasSSE2()) {
-    int Idx = CostTableLookup<MVT>(SSE2UniformConstCostTable,
-                                   array_lengthof(SSE2UniformConstCostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
   }
 
 
-  static const CostTblEntry<MVT> SSE2CostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType> SSE2CostTable[] = {
     // We don't correctly identify costs of casts because they are marked as
     // custom.
     // For some cases, where the shift amount is a scalar we would be able
@@ -258,16 +270,30 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SRA,  MVT::v8i16,  8*10 }, // Scalarized.
     { ISD::SRA,  MVT::v4i32,  4*10 }, // Scalarized.
     { ISD::SRA,  MVT::v2i64,  2*10 }, // Scalarized.
+
+    // It is not a good idea to vectorize division. We have to scalarize it and
+    // in the process we will often end up having to spilling regular
+    // registers. The overhead of division is going to dominate most kernels
+    // anyways so try hard to prevent vectorization of division - it is
+    // generally a bad idea. Assume somewhat arbitrarily that we have to be able
+    // to hide "20 cycles" for each lane.
+    { ISD::SDIV,  MVT::v16i8,  16*20 },
+    { ISD::SDIV,  MVT::v8i16,  8*20 },
+    { ISD::SDIV,  MVT::v4i32,  4*20 },
+    { ISD::SDIV,  MVT::v2i64,  2*20 },
+    { ISD::UDIV,  MVT::v16i8,  16*20 },
+    { ISD::UDIV,  MVT::v8i16,  8*20 },
+    { ISD::UDIV,  MVT::v4i32,  4*20 },
+    { ISD::UDIV,  MVT::v2i64,  2*20 },
   };
 
   if (ST->hasSSE2()) {
-    int Idx = CostTableLookup<MVT>(SSE2CostTable, array_lengthof(SSE2CostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(SSE2CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * SSE2CostTable[Idx].Cost;
   }
 
-  static const CostTblEntry<MVT> AVX1CostTable[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTable[] = {
     // We don't have to scalarize unsupported ops. We can issue two half-sized
     // operations and we only need to extract the upper YMM half.
     // Two ops + 1 extract + 1 insert = 4.
@@ -286,21 +312,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
 
   // Look for AVX1 lowering tricks.
   if (ST->hasAVX() && !ST->hasAVX2()) {
-    int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
-                                   ISD, LT.second);
+    int Idx = CostTableLookup(AVX1CostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * AVX1CostTable[Idx].Cost;
   }
 
   // Custom lowering of vectors.
-  static const CostTblEntry<MVT> CustomLowered[] = {
+  static const CostTblEntry<MVT::SimpleValueType> CustomLowered[] = {
     // A v2i64/v4i64 and multiply is custom lowered as a series of long
     // multiplies(3), shifts(4) and adds(2).
     { ISD::MUL,     MVT::v2i64,    9 },
     { ISD::MUL,     MVT::v4i64,    9 },
   };
-  int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
-                                 ISD, LT.second);
+  int Idx = CostTableLookup(CustomLowered, ISD, LT.second);
   if (Idx != -1)
     return LT.first * CustomLowered[Idx].Cost;
 
@@ -337,7 +361,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   std::pair<unsigned, MVT> LTSrc = TLI->getTypeLegalizationCost(Src);
   std::pair<unsigned, MVT> LTDest = TLI->getTypeLegalizationCost(Dst);
 
-  static const TypeConversionCostTblEntry<MVT> SSE2ConvTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  SSE2ConvTbl[] = {
     // These are somewhat magic numbers justified by looking at the output of
     // Intel's IACA, running some kernels and making sure when we take
     // legalization into account the throughput will be overestimated.
@@ -361,9 +386,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   };
 
   if (ST->hasSSE2() && !ST->hasAVX()) {
-    int Idx = ConvertCostTableLookup<MVT>(SSE2ConvTbl,
-                                          array_lengthof(SSE2ConvTbl),
-                                          ISD, LTDest.second, LTSrc.second);
+    int Idx =
+        ConvertCostTableLookup(SSE2ConvTbl, ISD, LTDest.second, LTSrc.second);
     if (Idx != -1)
       return LTSrc.first * SSE2ConvTbl[Idx].Cost;
   }
@@ -375,13 +399,17 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   if (!SrcTy.isSimple() || !DstTy.isSimple())
     return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
 
-  static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
+  static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+  AVXConversionTbl[] = {
+    { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
+    { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
     { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
     { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
     { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
     { ISD::TRUNCATE,    MVT::v4i32, MVT::v4i64, 1 },
     { ISD::TRUNCATE,    MVT::v8i16, MVT::v8i32, 1 },
+    { ISD::TRUNCATE,    MVT::v16i8, MVT::v16i16, 2 },
 
     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i1,  8 },
     { ISD::SINT_TO_FP,  MVT::v8f32, MVT::v8i8,  8 },
@@ -420,9 +448,8 @@ unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
   };
 
   if (ST->hasAVX()) {
-    int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
-                                 array_lengthof(AVXConversionTbl),
-                                 ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+    int Idx = ConvertCostTableLookup(AVXConversionTbl, ISD, DstTy.getSimpleVT(),
+                                     SrcTy.getSimpleVT());
     if (Idx != -1)
       return AVXConversionTbl[Idx].Cost;
   }
@@ -440,7 +467,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
-  static const CostTblEntry<MVT> SSE42CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTbl[] = {
     { ISD::SETCC,   MVT::v2f64,   1 },
     { ISD::SETCC,   MVT::v4f32,   1 },
     { ISD::SETCC,   MVT::v2i64,   1 },
@@ -449,7 +476,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v16i8,   1 },
   };
 
-  static const CostTblEntry<MVT> AVX1CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTbl[] = {
     { ISD::SETCC,   MVT::v4f64,   1 },
     { ISD::SETCC,   MVT::v8f32,   1 },
     // AVX1 does not support 8-wide integer compare.
@@ -459,7 +486,7 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
     { ISD::SETCC,   MVT::v32i8,   4 },
   };
 
-  static const CostTblEntry<MVT> AVX2CostTbl[] = {
+  static const CostTblEntry<MVT::SimpleValueType> AVX2CostTbl[] = {
     { ISD::SETCC,   MVT::v4i64,   1 },
     { ISD::SETCC,   MVT::v8i32,   1 },
     { ISD::SETCC,   MVT::v16i16,  1 },
@@ -467,19 +494,19 @@ unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   };
 
   if (ST->hasAVX2()) {
-    int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+    int Idx = CostTableLookup(AVX2CostTbl, ISD, MTy);
     if (Idx != -1)
       return LT.first * AVX2CostTbl[Idx].Cost;
   }
 
   if (ST->hasAVX()) {
-    int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+    int Idx = CostTableLookup(AVX1CostTbl, ISD, MTy);
     if (Idx != -1)
       return LT.first * AVX1CostTbl[Idx].Cost;
   }
 
   if (ST->hasSSE42()) {
-    int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+    int Idx = CostTableLookup(SSE42CostTbl, ISD, MTy);
     if (Idx != -1)
       return LT.first * SSE42CostTbl[Idx].Cost;
   }
@@ -511,8 +538,51 @@ unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
 }
 
+unsigned X86TTI::getScalarizationOverhead(Type *Ty, bool Insert,
+                                            bool Extract) const {
+  assert (Ty->isVectorTy() && "Can only scalarize vectors");
+  unsigned Cost = 0;
+
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    if (Insert)
+      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    if (Extract)
+      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+
+  return Cost;
+}
+
 unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                                  unsigned AddressSpace) const {
+  // Handle non power of two vectors such as <3 x float>
+  if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
+    unsigned NumElem = VTy->getVectorNumElements();
+
+    // Handle a few common cases:
+    // <3 x float>
+    if (NumElem == 3 && VTy->getScalarSizeInBits() == 32)
+      // Cost = 64 bit store + extract + 32 bit store.
+      return 3;
+
+    // <3 x double>
+    if (NumElem == 3 && VTy->getScalarSizeInBits() == 64)
+      // Cost = 128 bit store + unpack + 64 bit store.
+      return 3;
+
+    // Assume that all other non power-of-two numbers are scalarized.
+    if (!isPowerOf2_32(NumElem)) {
+      unsigned Cost = TargetTransformInfo::getMemoryOpCost(Opcode,
+                                                           VTy->getScalarType(),
+                                                           Alignment,
+                                                           AddressSpace);
+      unsigned SplitCost = getScalarizationOverhead(Src,
+                                                    Opcode == Instruction::Load,
+                                                    Opcode==Instruction::Store);
+      return NumElem * Cost + SplitCost;
+    }
+  }
+
   // Legalize the type.
   std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -528,3 +598,97 @@ unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
 
   return Cost;
 }
+
+unsigned X86TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  return TargetTransformInfo::getAddressComputationCost(Ty, IsComplex);
+}
+
+unsigned X86TTI::getReductionCost(unsigned Opcode, Type *ValTy,
+                                  bool IsPairwise) const {
+  
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+  
+  MVT MTy = LT.second;
+  
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+ 
+  // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput 
+  // and make it as the cost. 
+ 
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v8i16,   5 },
+  };
+ 
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::FADD,  MVT::v4f64,   5 },
+    { ISD::FADD,  MVT::v8f32,   7 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.5".
+    { ISD::ADD,   MVT::v4i64,   5 },      // The data reported by the IACA tool is "4.8".
+    { ISD::ADD,   MVT::v8i16,   5 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+
+  static const CostTblEntry<MVT::SimpleValueType> SSE42CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v2f64,   2 },
+    { ISD::FADD,  MVT::v4f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   2 },      // The data reported by the IACA tool is "1.6".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "3.3".
+    { ISD::ADD,   MVT::v8i16,   4 },      // The data reported by the IACA tool is "4.3".
+  };
+  
+  static const CostTblEntry<MVT::SimpleValueType> AVX1CostTblNoPairWise[] = {
+    { ISD::FADD,  MVT::v4f32,   3 },
+    { ISD::FADD,  MVT::v4f64,   3 },
+    { ISD::FADD,  MVT::v8f32,   4 },
+    { ISD::ADD,   MVT::v2i64,   1 },      // The data reported by the IACA tool is "1.5".
+    { ISD::ADD,   MVT::v4i32,   3 },      // The data reported by the IACA tool is "2.8".
+    { ISD::ADD,   MVT::v4i64,   3 },
+    { ISD::ADD,   MVT::v8i16,   4 },
+    { ISD::ADD,   MVT::v8i32,   5 },
+  };
+  
+  if (IsPairwise) {
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVX1CostTblPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * AVX1CostTblPairWise[Idx].Cost;
+    }
+  
+    if (ST->hasSSE42()) {
+      int Idx = CostTableLookup(SSE42CostTblPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * SSE42CostTblPairWise[Idx].Cost;
+    }
+  } else {
+    if (ST->hasAVX()) {
+      int Idx = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * AVX1CostTblNoPairWise[Idx].Cost;
+    }
+    
+    if (ST->hasSSE42()) {
+      int Idx = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy);
+      if (Idx != -1)
+        return LT.first * SSE42CostTblNoPairWise[Idx].Cost;
+    }
+  }
+
+  return TargetTransformInfo::getReductionCost(Opcode, ValTy, IsPairwise);
+}
+
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 0f77948..66ae9c2 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -105,23 +105,28 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
 }
 
 static bool isYmmReg(unsigned Reg) {
-  if (Reg >= X86::YMM0 && Reg <= X86::YMM15)
-    return true;
+  return (Reg >= X86::YMM0 && Reg <= X86::YMM31);
+}
 
-  return false;
+static bool isZmmReg(unsigned Reg) {
+  return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31);
 }
 
 static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
   for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
        E = MRI.livein_end(); I != E; ++I)
-    if (isYmmReg(I->first))
+    if (isYmmReg(I->first) || isZmmReg(I->first))
       return true;
 
   return false;
 }
 
 static bool clobbersAllYmmRegs(const MachineOperand &MO) {
-  for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+  for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
+    if (!MO.clobbersPhysReg(reg))
+      return false;
+  }
+  for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
     if (!MO.clobbersPhysReg(reg))
       return false;
   }
@@ -143,6 +148,25 @@ static bool hasYmmReg(MachineInstr *MI) {
   return false;
 }
 
+/// clobbersAnyYmmReg() - Check if any YMM register will be clobbered by this
+/// instruction.
+static bool clobbersAnyYmmReg(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isRegMask())
+      continue;
+    for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) {
+      if (MO.clobbersPhysReg(reg))
+        return true;
+    }
+    for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) {
+      if (MO.clobbersPhysReg(reg))
+        return true;
+    }
+  }
+  return false;
+}
+
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// vzero upper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
@@ -226,8 +250,9 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
   bool BBHasCall = false;
 
   for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
-    MachineInstr *MI = I;
     DebugLoc dl = I->getDebugLoc();
+    MachineInstr *MI = I;
+
     bool isControlFlow = MI->isCall() || MI->isReturn();
 
     // Shortcut: don't need to check regular instructions in dirty state.
@@ -246,6 +271,14 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
     if (!isControlFlow)
       continue;
 
+    // If the call won't clobber any YMM register, skip it as well. It usually
+    // happens on helper function calls (such as '_chkstk', '_ftol2') where
+    // standard calling convention is not used (RegMask is not used to mark
+    // register clobbered and register usage (def/imp-def/use) is well-dfined
+    // and explicitly specified.
+    if (MI->isCall() && !clobbersAnyYmmReg(MI))
+      continue;
+
     BBHasCall = true;
 
     // The VZEROUPPER instruction resets the upper 128 bits of all Intel AVX
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
index d5bfddc..3fa3b34 100644
--- a/lib/Target/XCore/CMakeLists.txt
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -22,10 +22,11 @@ add_llvm_target(XCoreCodeGen
   XCoreSubtarget.cpp
   XCoreTargetMachine.cpp
   XCoreTargetObjectFile.cpp
+  XCoreTargetTransformInfo.cpp
   XCoreSelectionDAGInfo.cpp
   )
 
-add_dependencies(LLVMXCoreCodeGen intrinsics_gen)
+add_dependencies(LLVMXCoreCodeGen XCoreCommonTableGen intrinsics_gen)
 
 add_subdirectory(Disassembler)
 add_subdirectory(InstPrinter)
diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index a2ae40c..9c20abd 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -29,7 +29,7 @@ namespace {
 
 /// \brief A disassembler class for XCore.
 class XCoreDisassembler : public MCDisassembler {
-  const MCRegisterInfo *RegInfo;
+  OwningPtr<const MCRegisterInfo> RegInfo;
 public:
   XCoreDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) :
     MCDisassembler(STI), RegInfo(Info) {}
@@ -42,7 +42,7 @@ public:
                                       raw_ostream &vStream,
                                       raw_ostream &cStream) const;
 
-  const MCRegisterInfo *getRegInfo() const { return RegInfo; }
+  const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
 };
 }
 
@@ -53,7 +53,7 @@ static bool readInstruction16(const MemoryObject &region,
   uint8_t Bytes[4];
 
   // We want to read exactly 2 Bytes of data.
-  if (region.readBytes(address, 2, Bytes, NULL) == -1) {
+  if (region.readBytes(address, 2, Bytes) == -1) {
     size = 0;
     return false;
   }
@@ -69,7 +69,7 @@ static bool readInstruction32(const MemoryObject &region,
   uint8_t Bytes[4];
 
   // We want to read exactly 4 Bytes of data.
-  if (region.readBytes(address, 4, Bytes, NULL) == -1) {
+  if (region.readBytes(address, 4, Bytes) == -1) {
     size = 0;
     return false;
   }
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index 1cfdbda..3d1c474 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -13,7 +13,7 @@ using namespace llvm;
 
 void XCoreMCAsmInfo::anchor() { }
 
-XCoreMCAsmInfo::XCoreMCAsmInfo(const Target &T, StringRef TT) {
+XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
   SupportsDebugInformation = true;
   Data16bitsDirective = "\t.short\t";
   Data32bitsDirective = "\t.long\t";
@@ -23,10 +23,14 @@ XCoreMCAsmInfo::XCoreMCAsmInfo(const Target &T, StringRef TT) {
     
   PrivateGlobalPrefix = ".L";
   AscizDirective = ".asciiz";
-  WeakDefDirective = "\t.weak\t";
-  WeakRefDirective = "\t.weak\t";
+
+  HiddenVisibilityAttr = MCSA_Invalid;
+  HiddenDeclarationVisibilityAttr = MCSA_Invalid;
+  ProtectedVisibilityAttr = MCSA_Invalid;
 
   // Debug
   HasLEB128 = true;
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+  DwarfRegNumForCFI = true;
 }
 
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index 0767775..e53c96b 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -14,16 +14,16 @@
 #ifndef XCORETARGETASMINFO_H
 #define XCORETARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
   class StringRef;
   class Target;
 
-  class XCoreMCAsmInfo : public MCAsmInfo {
+  class XCoreMCAsmInfo : public MCAsmInfoELF {
     virtual void anchor();
   public:
-    explicit XCoreMCAsmInfo(const Target &T, StringRef TT);
+    explicit XCoreMCAsmInfo(StringRef TT);
   };
 
 } // namespace llvm
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index c177365..10bb6df 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -51,13 +51,13 @@ static MCSubtargetInfo *createXCoreMCSubtargetInfo(StringRef TT, StringRef CPU,
   return X;
 }
 
-static MCAsmInfo *createXCoreMCAsmInfo(const Target &T, StringRef TT) {
-  MCAsmInfo *MAI = new XCoreMCAsmInfo(T, TT);
+static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
+                                       StringRef TT) {
+  MCAsmInfo *MAI = new XCoreMCAsmInfo(TT);
 
   // Initial state of the frame pointer is SP.
-  MachineLocation Dst(MachineLocation::VirtualFP);
-  MachineLocation Src(XCore::SP, 0);
-  MAI->addInitialFrameState(0, Dst, Src);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, XCore::SP, 0);
+  MAI->addInitialFrameState(Inst);
 
   return MAI;
 }
diff --git a/lib/Target/XCore/README.txt b/lib/Target/XCore/README.txt
index b69205b..28d551a 100644
--- a/lib/Target/XCore/README.txt
+++ b/lib/Target/XCore/README.txt
@@ -5,3 +5,4 @@ To-do
 * Tailcalls
 * Investigate loop alignment
 * Add builtins
+
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
index 2f375fc..73c310b 100644
--- a/lib/Target/XCore/XCore.h
+++ b/lib/Target/XCore/XCore.h
@@ -31,6 +31,8 @@ namespace llvm {
                                    CodeGenOpt::Level OptLevel);
   ModulePass *createXCoreLowerThreadLocalPass();
 
+  ImmutablePass *createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM);
+
 } // end namespace llvm;
 
 #endif
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index e177ad3..c03dfe6 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -36,6 +36,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
@@ -49,7 +50,6 @@ namespace {
   class XCoreAsmPrinter : public AsmPrinter {
     const XCoreSubtarget &Subtarget;
     XCoreMCInstLower MCInstLowering;
-    void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
   public:
     explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()),
@@ -76,7 +76,6 @@ namespace {
     void EmitInstruction(const MachineInstr *MI);
     void EmitFunctionBodyStart();
     void EmitFunctionBodyEnd();
-    virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
   };
 } // end of anonymous namespace
 
@@ -85,16 +84,17 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
     GV->hasWeakLinkage()) ||
     GV->hasLinkOnceLinkage()) && "Unexpected linkage");
   if (ArrayType *ATy = dyn_cast<ArrayType>(
-    cast<PointerType>(GV->getType())->getElementType())) {
-    OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
-    // FIXME: MCStreamerize.
-    OutStreamer.EmitRawText(StringRef(".globound"));
-    OutStreamer.EmitRawText("\t.set\t" + Twine(Sym->getName()));
-    OutStreamer.EmitRawText(".globound," + Twine(ATy->getNumElements()));
+                        cast<PointerType>(GV->getType())->getElementType())) {
+
+    MCSymbol *SymGlob = OutContext.GetOrCreateSymbol(
+                          Twine(Sym->getName() + StringRef(".globound")));
+    OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Global);
+    OutStreamer.EmitAssignment(SymGlob,
+                               MCConstantExpr::Create(ATy->getNumElements(),
+                                                      OutContext));
     if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
       // TODO Use COMDAT groups for LinkOnceLinkage
-      OutStreamer.EmitRawText(MAI->getWeakDefDirective() +Twine(Sym->getName())+
-                              ".globound");
+      OutStreamer.EmitSymbolAttribute(SymGlob, MCSA_Weak);
     }
   }
 }
@@ -109,7 +109,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
   OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
 
   
-  MCSymbol *GVSym = Mang->getSymbol(GV);
+  MCSymbol *GVSym = getSymbol(GV);
   const Constant *C = GV->getInitializer();
   unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
   
@@ -216,7 +216,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
     O << *MO.getMBB()->getSymbol();
     break;
   case MachineOperand::MO_GlobalAddress:
-    O << *Mang->getSymbol(MO.getGlobal());
+    O << *getSymbol(MO.getGlobal());
     break;
   case MachineOperand::MO_ExternalSymbol:
     O << MO.getSymbolName();
@@ -242,45 +242,14 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum,
 bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                       unsigned AsmVariant,const char *ExtraCode,
                                       raw_ostream &O) {
-  // Does this asm operand have a single letter operand modifier?
-  if (ExtraCode && ExtraCode[0])
-    if (ExtraCode[1] != 0) return true; // Unknown modifier.
-
-    switch (ExtraCode[0]) {
-    default:
-      // See if this is a generic print operand
-      return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
-    }
-
-  printOperand(MI, OpNo, O);
-  return false;
-}
-
-void XCoreAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
-                                             raw_ostream &OS) {
-  unsigned NOps = MI->getNumOperands();
-  assert(NOps == 4);
-  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-  // cast away const; DIetc do not take const operands for some reason.
-  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
-  OS << V.getName();
-  OS << " <- ";
-  // Frame address.  Currently handles register +- offset only.
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
-  OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
-  OS << ']';
-  OS << "+";
-  printOperand(MI, NOps-2, OS);
-}
+  // Print the operand if there is no operand modifier.
+  if (!ExtraCode || !ExtraCode[0]) {
+    printOperand(MI, OpNo, O);
+    return false;
+  }
 
-MachineLocation XCoreAsmPrinter::
-getDebugValueLocation(const MachineInstr *MI) const {
-  // Handles frame addresses emitted in XCoreInstrInfo::emitFrameIndexDebugValue.
-  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm() &&
-         "Unexpected MachineOperand types");
-  return MachineLocation(MI->getOperand(0).getReg(),
-                         MI->getOperand(1).getImm());
+  // Otherwise fallback on the default implementation.
+  return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O);
 }
 
 void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
@@ -288,15 +257,8 @@ void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   raw_svector_ostream O(Str);
 
   switch (MI->getOpcode()) {
-  case XCore::DBG_VALUE: {
-    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
-      SmallString<128> TmpStr;
-      raw_svector_ostream OS(TmpStr);
-      PrintDebugValueComment(MI, OS);
-      OutStreamer.EmitRawText(StringRef(OS.str()));
-    }
-    return;
-  }
+  case XCore::DBG_VALUE:
+    llvm_unreachable("Should be handled target independently");
   case XCore::ADD_2rus:
     if (MI->getOperand(2).getImm() == 0) {
       O << "\tmov "
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index beeb07f..c34b35c 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -30,10 +30,6 @@
 using namespace llvm;
 
 // helper functions. FIXME: Eliminate.
-static inline bool isImmUs(unsigned val) {
-  return val <= 11;
-}
-
 static inline bool isImmU6(unsigned val) {
   return val < (1 << 6);
 }
@@ -92,11 +88,16 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo *MMI = &MF.getMMI();
+  const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
   const XCoreInstrInfo &TII =
     *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
   XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
 
+  if (MFI->getMaxAlignment() > getStackAlignment())
+    report_fatal_error("emitPrologue unsupported alignment: "
+                       + Twine(MFI->getMaxAlignment()));
+
   bool FP = hasFP(MF);
   const AttributeSet &PAL = MF.getFunction()->getAttributes();
 
@@ -116,9 +117,9 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
   }
   bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(MF);
 
+  bool saveLR = XFI->getUsesLR();
   // Do we need to allocate space on the stack?
   if (FrameSize) {
-    bool saveLR = XFI->getUsesLR();
     bool LRSavedOnEntry = false;
     int Opcode;
     if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
@@ -132,34 +133,28 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
     BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
 
     if (emitFrameMoves) {
-      std::vector<MachineMove> &Moves = MMI->getFrameMoves();
-
       // Show update of SP.
       MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
       BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
-
-      MachineLocation SPDst(MachineLocation::VirtualFP);
-      MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize * 4);
-      Moves.push_back(MachineMove(FrameLabel, SPDst, SPSrc));
-
+      MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(FrameLabel,
+                                                             -FrameSize*4));
       if (LRSavedOnEntry) {
-        MachineLocation CSDst(MachineLocation::VirtualFP, 0);
-        MachineLocation CSSrc(XCore::LR);
-        Moves.push_back(MachineMove(FrameLabel, CSDst, CSSrc));
+        unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
+        MMI->addFrameInst(MCCFIInstruction::createOffset(FrameLabel, Reg, 0));
       }
     }
-    if (saveLR) {
-      int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
-      storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl, TII);
-      MBB.addLiveIn(XCore::LR);
+  }
+  if (saveLR) {
+    int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+    storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl, TII);
+    MBB.addLiveIn(XCore::LR);
 
-      if (emitFrameMoves) {
-        MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
-        BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
-        MachineLocation CSDst(MachineLocation::VirtualFP, LRSpillOffset);
-        MachineLocation CSSrc(XCore::LR);
-        MMI->getFrameMoves().push_back(MachineMove(SaveLRLabel, CSDst, CSSrc));
-      }
+    if (emitFrameMoves) {
+      MCSymbol *SaveLRLabel = MMI->getContext().CreateTempSymbol();
+      BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveLRLabel);
+      unsigned Reg = MRI->getDwarfRegNum(XCore::LR, true);
+      MMI->addFrameInst(MCCFIInstruction::createOffset(SaveLRLabel, Reg,
+                                                       LRSpillOffset));
     }
   }
 
@@ -172,37 +167,34 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
     if (emitFrameMoves) {
       MCSymbol *SaveR10Label = MMI->getContext().CreateTempSymbol();
       BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(SaveR10Label);
-      MachineLocation CSDst(MachineLocation::VirtualFP, FPSpillOffset);
-      MachineLocation CSSrc(XCore::R10);
-      MMI->getFrameMoves().push_back(MachineMove(SaveR10Label, CSDst, CSSrc));
+      unsigned Reg = MRI->getDwarfRegNum(XCore::R10, true);
+      MMI->addFrameInst(MCCFIInstruction::createOffset(SaveR10Label, Reg,
+                                                       FPSpillOffset));
     }
     // Set the FP from the SP.
     unsigned FramePtr = XCore::R10;
-    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr)
-      .addImm(0);
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr).addImm(0);
     if (emitFrameMoves) {
       // Show FP is now valid.
       MCSymbol *FrameLabel = MMI->getContext().CreateTempSymbol();
       BuildMI(MBB, MBBI, dl, TII.get(XCore::PROLOG_LABEL)).addSym(FrameLabel);
-      MachineLocation SPDst(FramePtr);
-      MachineLocation SPSrc(MachineLocation::VirtualFP);
-      MMI->getFrameMoves().push_back(MachineMove(FrameLabel, SPDst, SPSrc));
+      unsigned Reg = MRI->getDwarfRegNum(FramePtr, true);
+      MMI->addFrameInst(MCCFIInstruction::createDefCfaRegister(FrameLabel,
+                                                               Reg));
     }
   }
 
   if (emitFrameMoves) {
     // Frame moves for callee saved.
-    std::vector<MachineMove> &Moves = MMI->getFrameMoves();
     std::vector<std::pair<MCSymbol*, CalleeSavedInfo> >&SpillLabels =
         XFI->getSpillLabels();
     for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
       MCSymbol *SpillLabel = SpillLabels[I].first;
       CalleeSavedInfo &CSI = SpillLabels[I].second;
       int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
-      unsigned Reg = CSI.getReg();
-      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
-      MachineLocation CSSrc(Reg);
-      Moves.push_back(MachineMove(SpillLabel, CSDst, CSSrc));
+      unsigned Reg = MRI->getDwarfRegNum(CSI.getReg(), true);
+      MMI->addFrameInst(MCCFIInstruction::createOffset(SpillLabel, Reg,
+                                                       Offset));
     }
   }
 }
@@ -213,6 +205,7 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   const XCoreInstrInfo &TII =
     *static_cast<const XCoreInstrInfo*>(MF.getTarget().getInstrInfo());
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
   DebugLoc dl = MBBI->getDebugLoc();
 
   bool FP = hasFP(MF);
@@ -237,28 +230,32 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
     report_fatal_error("emitEpilogue Frame size too big: " + Twine(FrameSize));
   }
 
-  if (FrameSize) {
-    XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  if (FP) {
+    // Restore R10
+    int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+    FPSpillOffset += FrameSize*4;
+    loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl, TII);
+  }
 
-    if (FP) {
-      // Restore R10
-      int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
-      FPSpillOffset += FrameSize*4;
-      loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl, TII);
-    }
-    bool restoreLR = XFI->getUsesLR();
-    if (restoreLR && MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0) {
-      int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
-      LRSpillOffset += FrameSize*4;
-      loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl, TII);
-      restoreLR = false;
-    }
+  bool restoreLR = XFI->getUsesLR();
+  if (restoreLR &&
+      (FrameSize == 0 || MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0)) {
+    int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+    LRSpillOffset += FrameSize*4;
+    loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl, TII);
+    restoreLR = false;
+  }
+
+  if (FrameSize) {
     if (restoreLR) {
       // Fold prologue into return instruction
+      assert(MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0);
       assert(MBBI->getOpcode() == XCore::RETSP_u6
         || MBBI->getOpcode() == XCore::RETSP_lu6);
       int Opcode = (isU6) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
-      BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+      MachineInstrBuilder MIB  = BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+      for (unsigned i = 3, e = MBBI->getNumOperands(); i < e; ++i)
+        MIB->addOperand(MBBI->getOperand(i)); // copy any variadic operands
       MBB.erase(MBBI);
     } else {
       int Opcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index eb29b50..e28f84f 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -37,13 +37,11 @@ using namespace llvm;
 ///
 namespace {
   class XCoreDAGToDAGISel : public SelectionDAGISel {
-    const XCoreTargetLowering &Lowering;
     const XCoreSubtarget &Subtarget;
 
   public:
     XCoreDAGToDAGISel(XCoreTargetMachine &TM, CodeGenOpt::Level OptLevel)
       : SelectionDAGISel(TM, OptLevel),
-        Lowering(*TM.getTargetLowering()), 
         Subtarget(*TM.getSubtargetImpl()) { }
 
     SDNode *Select(SDNode *N);
@@ -61,7 +59,7 @@ namespace {
       if (!isMask_32(value)) {
         return false;
       }
-      int msksize = 32 - CountLeadingZeros_32(value);
+      int msksize = 32 - countLeadingZeros(value);
       return (msksize >= 1 && msksize <= 8) ||
               msksize == 16 || msksize == 24 || msksize == 32;
     }
@@ -109,7 +107,7 @@ bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
 }
 
 SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   switch (N->getOpcode()) {
   default: break;
   case ISD::Constant: {
@@ -117,7 +115,7 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
     if (immMskBitp(N)) {
       // Transformation function: get the size of a mask
       // Look for the first non-zero bit
-      SDValue MskSize = getI32Imm(32 - CountLeadingZeros_32(Val));
+      SDValue MskSize = getI32Imm(32 - countLeadingZeros((uint32_t)Val));
       return CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
                                     MVT::i32, MskSize);
     }
@@ -125,7 +123,7 @@ SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
       SDValue CPIdx =
         CurDAG->getTargetConstantPool(ConstantInt::get(
                               Type::getInt32Ty(*CurDAG->getContext()), Val),
-                                      TLI.getPointerTy());
+                                      getTargetLowering()->getPointerTy());
       SDNode *node = CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32,
                                             MVT::Other, CPIdx,
                                             CurDAG->getEntryNode());
@@ -204,12 +202,12 @@ replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
   }
   if (!found)
     return SDValue();
-  return CurDAG->getNode(ISD::TokenFactor, Chain->getDebugLoc(), MVT::Other,
+  return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
                          &Ops[0], Ops.size());
 }
 
 SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   // (brind (int_xcore_checkevent (addr)))
   SDValue Chain = N->getOperand(0);
   SDValue Addr = N->getOperand(1);
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 2d27f1a..89ad27d 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -59,6 +59,7 @@ getTargetNodeName(unsigned Opcode) const
     case XCoreISD::CRC8              : return "XCoreISD::CRC8";
     case XCoreISD::BR_JT             : return "XCoreISD::BR_JT";
     case XCoreISD::BR_JT32           : return "XCoreISD::BR_JT32";
+    case XCoreISD::MEMBARRIER        : return "XCoreISD::MEMBARRIER";
     default                          : return NULL;
   }
 }
@@ -79,7 +80,7 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
 
   setStackPointerRegisterToSaveRestore(XCore::SP);
 
-  setSchedulingPreference(Sched::RegPressure);
+  setSchedulingPreference(Sched::Source);
 
   // Use i32 for setcc operations results (slt, sgt, ...).
   setBooleanContents(ZeroOrOneBooleanContent);
@@ -148,6 +149,13 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
 
+  // Exception handling
+  setExceptionPointerRegister(XCore::R0);
+  setExceptionSelectorRegister(XCore::R1);
+
+  // Atomic operations
+  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+
   // TRAMPOLINE is custom lowered.
   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
@@ -166,6 +174,24 @@ XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
   setMinFunctionAlignment(1);
 }
 
+bool XCoreTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  EVT VT1 = Val.getValueType();
+  if (!VT1.isSimple() || !VT1.isInteger() ||
+      !VT2.isSimple() || !VT2.isInteger())
+    return false;
+
+  switch (VT1.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::i8:
+    return true;
+  }
+
+  return false;
+}
+
 SDValue XCoreTargetLowering::
 LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode())
@@ -188,6 +214,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, DAG);
   default:
     llvm_unreachable("unimplemented operand");
   }
@@ -215,7 +242,7 @@ void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
 SDValue XCoreTargetLowering::
 LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
 {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2),
                              Op.getOperand(3), Op.getOperand(4));
   return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0),
@@ -227,7 +254,7 @@ getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
                         SelectionDAG &DAG) const
 {
   // FIXME there is no actual debug info here
-  DebugLoc dl = GA.getDebugLoc();
+  SDLoc dl(GA);
   const GlobalValue *UnderlyingGV = GV;
   // If GV is an alias then use the aliasee to determine the wrapper type
   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
@@ -243,7 +270,7 @@ getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
 SDValue XCoreTargetLowering::
 LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
 {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GN->getGlobal();
   int64_t Offset = GN->getOffset();
@@ -259,15 +286,10 @@ LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
   return GA;
 }
 
-static inline SDValue BuildGetId(SelectionDAG &DAG, DebugLoc dl) {
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
-                     DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
-}
-
 SDValue XCoreTargetLowering::
 LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
 {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
   SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
@@ -280,7 +302,7 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
 {
   ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
   // FIXME there isn't really debug info here
-  DebugLoc dl = CP->getDebugLoc();
+  SDLoc dl(CP);
   EVT PtrVT = Op.getValueType();
   SDValue Res;
   if (CP->isMachineConstantPoolEntry()) {
@@ -303,7 +325,7 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
   SDValue Chain = Op.getOperand(0);
   SDValue Table = Op.getOperand(1);
   SDValue Index = Op.getOperand(2);
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
   unsigned JTI = JT->getIndex();
   MachineFunction &MF = DAG.getMachineFunction();
@@ -322,7 +344,7 @@ LowerBR_JT(SDValue Op, SelectionDAG &DAG) const
 }
 
 SDValue XCoreTargetLowering::
-lowerLoadWordFromAlignedBasePlusOffset(DebugLoc DL, SDValue Chain, SDValue Base,
+lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
                                        int64_t Offset, SelectionDAG &DAG) const
 {
   if ((Offset & 0x3) == 0) {
@@ -388,7 +410,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue Chain = LD->getChain();
   SDValue BasePtr = LD->getBasePtr();
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
 
   if (!LD->isVolatile()) {
     const GlobalValue *GV;
@@ -469,7 +491,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
   SDValue Chain = ST->getChain();
   SDValue BasePtr = ST->getBasePtr();
   SDValue Value = ST->getValue();
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
 
   if (ST->getAlignment() == 2) {
     SDValue Low = Value;
@@ -516,7 +538,7 @@ LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
 {
   assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::SMUL_LOHI &&
          "Unexpected operand to lower!");
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue Zero = DAG.getConstant(0, MVT::i32);
@@ -533,7 +555,7 @@ LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
 {
   assert(Op.getValueType() == MVT::i32 && Op.getOpcode() == ISD::UMUL_LOHI &&
          "Unexpected operand to lower!");
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   SDValue Zero = DAG.getConstant(0, MVT::i32);
@@ -618,7 +640,7 @@ TryExpandADDWithMul(SDNode *N, SelectionDAG &DAG) const
   } else {
     return SDValue();
   }
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   SDValue LL, RL, AddendL, AddendH;
   LL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                    Mul.getOperand(0),  DAG.getConstant(0, MVT::i32));
@@ -677,7 +699,7 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
       return Result;
   }
 
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
 
   // Extract components
   SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
@@ -707,31 +729,33 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
 SDValue XCoreTargetLowering::
 LowerVAARG(SDValue Op, SelectionDAG &DAG) const
 {
-  llvm_unreachable("unimplemented");
-  // FIXME Arguments passed by reference need a extra dereference.
+  // Whist llvm does not support aggregate varargs we can ignore
+  // the possibility of the ValueType being an implicit byVal vararg.
   SDNode *Node = Op.getNode();
-  DebugLoc dl = Node->getDebugLoc();
-  const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
-  EVT VT = Node->getValueType(0);
-  SDValue VAList = DAG.getLoad(getPointerTy(), dl, Node->getOperand(0),
-                               Node->getOperand(1), MachinePointerInfo(V),
+  EVT VT = Node->getValueType(0); // not an aggregate
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  EVT PtrVT = VAListPtr.getValueType();
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  SDLoc dl(Node);
+  SDValue VAList = DAG.getLoad(PtrVT, dl, InChain,
+                               VAListPtr, MachinePointerInfo(SV),
                                false, false, false, 0);
   // Increment the pointer, VAList, to the next vararg
-  SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, getPointerTy(), VAList,
-                     DAG.getConstant(VT.getSizeInBits(),
-                                     getPointerTy()));
+  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAList,
+                                DAG.getIntPtrConstant(VT.getSizeInBits() / 8));
   // Store the incremented VAList to the legalized pointer
-  Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Node->getOperand(1),
-                      MachinePointerInfo(V), false, false, 0);
+  InChain = DAG.getStore(VAList.getValue(1), dl, nextPtr, VAListPtr,
+                         MachinePointerInfo(SV), false, false, 0);
   // Load the actual argument out of the pointer VAList
-  return DAG.getLoad(VT, dl, Tmp3, VAList, MachinePointerInfo(),
+  return DAG.getLoad(VT, dl, InChain, VAList, MachinePointerInfo(),
                      false, false, false, 0);
 }
 
 SDValue XCoreTargetLowering::
 LowerVASTART(SDValue Op, SelectionDAG &DAG) const
 {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   // vastart stores the address of the VarArgsFrameIndex slot into the
   // memory location argument
   MachineFunction &MF = DAG.getMachineFunction();
@@ -743,7 +767,7 @@ LowerVASTART(SDValue Op, SelectionDAG &DAG) const
 
 SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op,
                                             SelectionDAG &DAG) const {
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   // Depths > 0 not supported yet!
   if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
     return SDValue();
@@ -783,7 +807,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue Addr = Trmp;
 
-  DebugLoc dl = Op.getDebugLoc();
+  SDLoc dl(Op);
   OutChains[0] = DAG.getStore(Chain, dl, DAG.getConstant(0x0a3cd805, MVT::i32),
                               Addr, MachinePointerInfo(TrmpAddr), false, false,
                               0);
@@ -817,7 +841,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue XCoreTargetLowering::
 LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
-  DebugLoc DL = Op.getDebugLoc();
+  SDLoc DL(Op);
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
   switch (IntNo) {
     case Intrinsic::xcore_crc8:
@@ -832,6 +856,12 @@ LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+SDValue XCoreTargetLowering::
+LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  return DAG.getNode(XCoreISD::MEMBARRIER, DL, MVT::Other, Op.getOperand(0));
+}
+
 //===----------------------------------------------------------------------===//
 //                      Calling Convention Implementation
 //===----------------------------------------------------------------------===//
@@ -847,10 +877,10 @@ SDValue
 XCoreTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
-  DebugLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SDLoc &dl                             = CLI.DL;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
@@ -883,7 +913,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
                                     const SmallVectorImpl<SDValue> &OutVals,
                                     const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const {
 
   // Analyze operands of the call, assigning locations to each operand.
@@ -901,7 +931,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   unsigned NumBytes = CCInfo.getNextStackOffset();
 
   Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes,
-                                 getPointerTy(), true));
+                                 getPointerTy(), true), dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
   SmallVector<SDValue, 12> MemOpChains;
@@ -991,7 +1021,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   Chain = DAG.getCALLSEQ_END(Chain,
                              DAG.getConstant(NumBytes, getPointerTy(), true),
                              DAG.getConstant(0, getPointerTy(), true),
-                             InFlag);
+                             InFlag, dl);
   InFlag = Chain.getValue(1);
 
   // Handle result values, copying them out of physregs into vregs that we
@@ -1006,7 +1036,7 @@ SDValue
 XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                      CallingConv::ID CallConv, bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                     DebugLoc dl, SelectionDAG &DAG,
+                                     SDLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {
 
   // Assign locations to each value returned by this call.
@@ -1031,13 +1061,17 @@ XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
 //             Formal Arguments Calling Convention Implementation
 //===----------------------------------------------------------------------===//
 
+namespace {
+  struct ArgDataPair { SDValue SDV; ISD::ArgFlagsTy Flags; };
+}
+
 /// XCore formal arguments implementation
 SDValue
 XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
                                           CallingConv::ID CallConv,
                                           bool isVarArg,
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
-                                          DebugLoc dl,
+                                          SDLoc dl,
                                           SelectionDAG &DAG,
                                           SmallVectorImpl<SDValue> &InVals)
                                             const {
@@ -1062,7 +1096,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
                                        bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
                                          &Ins,
-                                       DebugLoc dl,
+                                       SDLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals) const {
   MachineFunction &MF = DAG.getMachineFunction();
@@ -1080,9 +1114,22 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
 
   unsigned LRSaveSize = StackSlotSize;
 
+  // All getCopyFromReg ops must precede any getMemcpys to prevent the
+  // scheduler clobbering a register before it has been copied.
+  // The stages are:
+  // 1. CopyFromReg (and load) arg & vararg registers.
+  // 2. Chain CopyFromReg nodes into a TokenFactor.
+  // 3. Memcpy 'byVal' args & push final InVals.
+  // 4. Chain mem ops nodes into a TokenFactor.
+  SmallVector<SDValue, 4> CFRegNode;
+  SmallVector<ArgDataPair, 4> ArgData;
+  SmallVector<SDValue, 4> MemOps;
+
+  // 1a. CopyFromReg (and load) arg registers.
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
 
     CCValAssign &VA = ArgLocs[i];
+    SDValue ArgIn;
 
     if (VA.isRegLoc()) {
       // Arguments passed in registers
@@ -1099,7 +1146,8 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
       case MVT::i32:
         unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
-        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+        ArgIn = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+        CFRegNode.push_back(ArgIn.getValue(ArgIn->getNumValues() - 1));
       }
     } else {
       // sanity check
@@ -1119,14 +1167,17 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
       // Create the SelectionDAG nodes corresponding to a load
       //from this parameter
       SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
-      InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI),
-                                   false, false, false, 0));
+      ArgIn = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                          MachinePointerInfo::getFixedStack(FI),
+                          false, false, false, 0);
     }
+    const ArgDataPair ADP = { ArgIn, Ins[i].Flags };
+    ArgData.push_back(ADP);
   }
 
+  // 1b. CopyFromReg vararg registers.
   if (isVarArg) {
-    /* Argument registers */
+    // Argument registers
     static const uint16_t ArgRegs[] = {
       XCore::R0, XCore::R1, XCore::R2, XCore::R3
     };
@@ -1134,7 +1185,6 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
     unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs,
                                                      array_lengthof(ArgRegs));
     if (FirstVAReg < array_lengthof(ArgRegs)) {
-      SmallVector<SDValue, 4> MemOps;
       int offset = 0;
       // Save remaining registers, storing higher register numbers at a higher
       // address
@@ -1150,14 +1200,12 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
         unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
         RegInfo.addLiveIn(ArgRegs[i], VReg);
         SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+        CFRegNode.push_back(Val.getValue(Val->getNumValues() - 1));
         // Move argument from virt reg -> stack
         SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                      MachinePointerInfo(), false, false, 0);
         MemOps.push_back(Store);
       }
-      if (!MemOps.empty())
-        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                            &MemOps[0], MemOps.size());
     } else {
       // This will point to the next argument passed via stack.
       XFI->setVarArgsFrameIndex(
@@ -1166,6 +1214,42 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
     }
   }
 
+  // 2. chain CopyFromReg nodes into a TokenFactor.
+  if (!CFRegNode.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &CFRegNode[0],
+                        CFRegNode.size());
+
+  // 3. Memcpy 'byVal' args & push final InVals.
+  // Aggregates passed "byVal" need to be copied by the callee.
+  // The callee will use a pointer to this copy, rather than the original
+  // pointer.
+  for (SmallVectorImpl<ArgDataPair>::const_iterator ArgDI = ArgData.begin(),
+                                                    ArgDE = ArgData.end();
+       ArgDI != ArgDE; ++ArgDI) {
+    if (ArgDI->Flags.isByVal() && ArgDI->Flags.getByValSize()) {
+      unsigned Size = ArgDI->Flags.getByValSize();
+      unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
+      // Create a new object on the stack and copy the pointee into it.
+      int FI = MFI->CreateStackObject(Size, Align, false, false);
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      InVals.push_back(FIN);
+      MemOps.push_back(DAG.getMemcpy(Chain, dl, FIN, ArgDI->SDV,
+                                     DAG.getConstant(Size, MVT::i32),
+                                     Align, false, false,
+                                     MachinePointerInfo(),
+                                     MachinePointerInfo()));
+    } else {
+      InVals.push_back(ArgDI->SDV);
+    }
+  }
+
+  // 4, chain mem ops nodes into a TokenFactor.
+  if (!MemOps.empty()) {
+    MemOps.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0],
+                        MemOps.size());
+  }
+
   return Chain;
 }
 
@@ -1188,7 +1272,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
                                  CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  const SmallVectorImpl<SDValue> &OutVals,
-                                 DebugLoc dl, SelectionDAG &DAG) const {
+                                 SDLoc dl, SelectionDAG &DAG) const {
 
   // CCValAssign - represent the assignment of
   // the return value to a location
@@ -1305,7 +1389,7 @@ XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
-  DebugLoc dl = N->getDebugLoc();
+  SDLoc dl(N);
   switch (N->getOpcode()) {
   default: break;
   case XCoreISD::LADD: {
@@ -1582,7 +1666,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
 std::pair<unsigned, const TargetRegisterClass*>
 XCoreTargetLowering::
 getRegForInlineAsmConstraint(const std::string &Constraint,
-                             EVT VT) const {
+                             MVT VT) const {
   if (Constraint.size() == 1) {
     switch (Constraint[0]) {
     default : break;
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index c7dfa26..bc08497 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -70,7 +70,10 @@ namespace llvm {
       BR_JT,
 
       // Jumptable branch using long branches for each entry.
-      BR_JT32
+      BR_JT32,
+
+      // Memory barrier.
+      MEMBARRIER
     };
   }
 
@@ -83,6 +86,10 @@ namespace llvm {
 
     explicit XCoreTargetLowering(XCoreTargetMachine &TM);
 
+    using TargetLowering::isZExtFree;
+    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+
+
     virtual unsigned getJumpTableEncoding() const;
     virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
 
@@ -115,7 +122,7 @@ namespace llvm {
                               CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::InputArg> &Ins,
-                              DebugLoc dl, SelectionDAG &DAG,
+                              SDLoc dl, SelectionDAG &DAG,
                               SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
                            CallingConv::ID CallConv, bool isVarArg,
@@ -123,17 +130,17 @@ namespace llvm {
                            const SmallVectorImpl<ISD::OutputArg> &Outs,
                            const SmallVectorImpl<SDValue> &OutVals,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
-                            DebugLoc dl, SelectionDAG &DAG,
+                            SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const;
     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
     SDValue getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
                                     SelectionDAG &DAG) const;
-    SDValue lowerLoadWordFromAlignedBasePlusOffset(DebugLoc DL, SDValue Chain,
+    SDValue lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain,
                                                    SDValue Base, int64_t Offset,
                                                    SelectionDAG &DAG) const;
 
@@ -154,11 +161,12 @@ namespace llvm {
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
 
     // Inline asm support
     std::pair<unsigned, const TargetRegisterClass*>
     getRegForInlineAsmConstraint(const std::string &Constraint,
-                                 EVT VT) const;
+                                 MVT VT) const;
 
     // Expand specifics
     SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
@@ -177,7 +185,7 @@ namespace llvm {
                            CallingConv::ID CallConv,
                            bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
-                           DebugLoc dl, SelectionDAG &DAG,
+                           SDLoc dl, SelectionDAG &DAG,
                            SmallVectorImpl<SDValue> &InVals) const;
 
     virtual SDValue
@@ -189,7 +197,7 @@ namespace llvm {
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  DebugLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const;
 
     virtual bool
       CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index e457e0d..33c7f31 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -22,7 +22,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR
+#define GET_INSTRINFO_CTOR_DTOR
 #include "XCoreGenInstrInfo.inc"
 
 namespace llvm {
@@ -39,9 +39,13 @@ namespace XCore {
 
 using namespace llvm;
 
+
+// Pin the vtable to this file.
+void XCoreInstrInfo::anchor() {}
+
 XCoreInstrInfo::XCoreInstrInfo()
   : XCoreGenInstrInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
-    RI(*this) {
+    RI() {
 }
 
 static bool isZeroImm(const MachineOperand &op) {
@@ -386,15 +390,6 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     .addImm(0);
 }
 
-MachineInstr*
-XCoreInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
-                                         uint64_t Offset, const MDNode *MDPtr,
-                                         DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(XCore::DBG_VALUE))
-    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
 /// ReverseBranchCondition - Return the inverse opcode of the 
 /// specified Branch instruction.
 bool XCoreInstrInfo::
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 42eeed8..4429b07 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -24,6 +24,7 @@ namespace llvm {
 
 class XCoreInstrInfo : public XCoreGenInstrInfo {
   const XCoreRegisterInfo RI;
+  virtual void anchor();
 public:
   XCoreInstrInfo();
 
@@ -78,12 +79,6 @@ public:
                                     const TargetRegisterClass *RC,
                                     const TargetRegisterInfo *TRI) const;
 
-  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
-                                                 int FrameIx,
-                                                 uint64_t Offset,
-                                                 const MDNode *MDPtr,
-                                                 DebugLoc DL) const;
-
   virtual bool ReverseBranchCondition(
                             SmallVectorImpl<MachineOperand> &Cond) const;
 };
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index 587166c..934a707 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -70,6 +70,11 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
 def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_XCoreCallSeqEnd,
                            [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
 
+def SDT_XCoreMEMBARRIER : SDTypeProfile<0, 0, []>;
+
+def XCoreMemBarrier : SDNode<"XCoreISD::MEMBARRIER", SDT_XCoreMEMBARRIER,
+                             [SDNPHasChain]>;
+
 //===----------------------------------------------------------------------===//
 // Instruction Pattern Stuff
 //===----------------------------------------------------------------------===//
@@ -84,7 +89,7 @@ def msksize_xform : SDNodeXForm<imm, [{
   // Transformation function: get the size of a mask
   assert(isMask_32(N->getZExtValue()));
   // look for the first non-zero bit
-  return getI32Imm(32 - CountLeadingZeros_32(N->getZExtValue()));
+  return getI32Imm(32 - countLeadingZeros((uint32_t)N->getZExtValue()));
 }]>;
 
 def neg_xform : SDNodeXForm<imm, [{
@@ -279,12 +284,6 @@ multiclass FRU6_LRU6_backwards_branch<bits<6> opc, string OpcStr> {
                     !strconcat(OpcStr, " $a, $b"), []>;
 }
 
-multiclass FRU6_LRU6_cp<bits<6> opc, string OpcStr> {
-  def _ru6: _FRU6<opc, (outs RRegs:$a), (ins i32imm:$b),
-                  !strconcat(OpcStr, " $a, cp[$b]"), []>;
-  def _lru6: _FLRU6<opc, (outs RRegs:$a), (ins i32imm:$b),
-                    !strconcat(OpcStr, " $a, cp[$b]"), []>;
-}
 
 // U6
 multiclass FU6_LU6<bits<10> opc, string OpcStr, SDNode OpNode> {
@@ -349,6 +348,10 @@ let usesCustomInserter = 1 in {
                                  (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
 }
 
+let hasSideEffects = 1 in
+def Int_MemBarrier : PseudoInstXCore<(outs), (ins), "#MEMBARRIER",
+                                     [(XCoreMemBarrier)]>;
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -539,8 +542,13 @@ def STWDP_lru6 : _FLRU6<0b010100, (outs), (ins RRegs:$a, i32imm:$b),
                         [(store RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
 
 //let Uses = [CP] in ..
-let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in
-defm LDWCP : FRU6_LRU6_cp<0b011011, "ldw">;
+let mayLoad = 1, isReMaterializable = 1, neverHasSideEffects = 1 in {
+def LDWCP_ru6 : _FRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+                      "ldw $a, cp[$b]", []>;
+def LDWCP_lru6: _FLRU6<0b011011, (outs RRegs:$a), (ins i32imm:$b),
+                       "ldw $a, cp[$b]",
+                       [(set RRegs:$a, (load (cprelwrapper tglobaladdr:$b)))]>;
+}
 
 let Uses = [SP] in {
 let mayStore=1 in {
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 2e328b4..afce753 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -22,6 +22,9 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/NoFolder.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 
 #define DEBUG_TYPE "xcore-lower-thread-local"
 
@@ -71,13 +74,104 @@ createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
   return ConstantArray::get(NewType, Elements);
 }
 
-static bool hasNonInstructionUse(GlobalVariable *GV) {
-  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); UI != E;
-       ++UI)
-    if (!isa<Instruction>(*UI))
-      return true;
+static Instruction *
+createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
+  IRBuilder<true,NoFolder> Builder(Instr);
+  unsigned OpCode = CE->getOpcode();
+  switch (OpCode) {
+    case Instruction::GetElementPtr: {
+      SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
+      ArrayRef<Value *> CEOps(CEOpVec);
+      return dyn_cast<Instruction>(Builder.CreateInBoundsGEP(CEOps[0],
+                                                             CEOps.slice(1)));
+    }
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      return dyn_cast<Instruction>(
+                  Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
+                                      CE->getOperand(0), CE->getOperand(1),
+                                      CE->getName()));
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      return dyn_cast<Instruction>(
+                  Builder.CreateCast((Instruction::CastOps)OpCode,
+                                     CE->getOperand(0), CE->getType(),
+                                     CE->getName()));
+    default:
+      llvm_unreachable("Unhandled constant expression!\n");
+  }
+}
+
+static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
+  do {
+    SmallVector<WeakVH,8> WUsers;
+    for (Value::use_iterator I = CE->use_begin(), E = CE->use_end();
+         I != E; ++I)
+      WUsers.push_back(WeakVH(*I));
+    std::sort(WUsers.begin(), WUsers.end());
+    WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
+    while (!WUsers.empty())
+      if (WeakVH WU = WUsers.pop_back_val()) {
+        if (PHINode *PN = dyn_cast<PHINode>(WU)) {
+          for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
+            if (PN->getIncomingValue(I) == CE) {
+              BasicBlock *PredBB = PN->getIncomingBlock(I);
+              if (PredBB->getTerminator()->getNumSuccessors() > 1)
+                PredBB = SplitEdge(PredBB, PN->getParent(), P);
+              Instruction *InsertPos = PredBB->getTerminator();
+              Instruction *NewInst = createReplacementInstr(CE, InsertPos);
+              PN->setOperand(I, NewInst);
+            }
+        } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
+          Instruction *NewInst = createReplacementInstr(CE, Instr);
+          Instr->replaceUsesOfWith(CE, NewInst);
+        } else {
+          ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
+          if (!CExpr || !replaceConstantExprOp(CExpr, P))
+            return false;
+        }
+      }
+  } while (CE->hasNUsesOrMore(1)); // We need to check becasue a recursive
+  // sibbling may have used 'CE' when createReplacementInstr was called.
+  CE->destroyConstant();
+  return true;
+}
 
-  return false;
+static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
+  SmallVector<WeakVH,8> WUsers;
+  for (Value::use_iterator I = GV->use_begin(), E = GV->use_end(); I != E; ++I)
+    if (!isa<Instruction>(*I))
+      WUsers.push_back(WeakVH(*I));
+  while (!WUsers.empty())
+    if (WeakVH WU = WUsers.pop_back_val()) {
+      ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
+      if (!CE || !replaceConstantExprOp(CE, P))
+        return false;
+    }
+  return true;
 }
 
 static bool isZeroLengthArray(Type *Ty) {
@@ -92,14 +186,16 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
     return false;
 
   // Skip globals that we can't lower and leave it for the backend to error.
-  if (hasNonInstructionUse(GV) ||
+  if (!rewriteNonInstructionUses(GV, this) ||
       !GV->getType()->isSized() || isZeroLengthArray(GV->getType()))
     return false;
 
   // Create replacement global.
   ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
-  Constant *NewInitializer = createLoweredInitializer(NewType,
-                                                      GV->getInitializer());
+  Constant *NewInitializer = 0;
+  if (GV->hasInitializer())
+    NewInitializer = createLoweredInitializer(NewType,
+                                              GV->getInitializer());
   GlobalVariable *NewGV =
     new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
                        NewInitializer, "", 0, GlobalVariable::NotThreadLocal,
diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp
index f96eda9..def2673 100644
--- a/lib/Target/XCore/XCoreMCInstLower.cpp
+++ b/lib/Target/XCore/XCoreMCInstLower.cpp
@@ -43,7 +43,7 @@ MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO,
       Symbol = MO.getMBB()->getSymbol();
       break;
     case MachineOperand::MO_GlobalAddress:
-      Symbol = Mang->getSymbol(MO.getGlobal());
+      Symbol = Printer.getSymbol(MO.getGlobal());
       Offset += MO.getOffset();
       break;
     case MachineOperand::MO_BlockAddress:
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index 49b5634..dbd2f52 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -37,8 +37,8 @@
 
 using namespace llvm;
 
-XCoreRegisterInfo::XCoreRegisterInfo(const TargetInstrInfo &tii)
-  : XCoreGenRegisterInfo(XCore::LR), TII(tii) {
+XCoreRegisterInfo::XCoreRegisterInfo()
+  : XCoreGenRegisterInfo(XCore::LR) {
 }
 
 // helper functions
@@ -112,6 +112,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   int FrameIndex = FrameOp.getIndex();
 
   MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
   int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
   int StackSize = MF.getFrameInfo()->getStackSize();
@@ -249,6 +250,7 @@ loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
     report_fatal_error("loadConstant value too big " + Twine(Value));
   }
   int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+  const TargetInstrInfo &TII = *MBB.getParent()->getTarget().getInstrInfo();
   BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
 }
 
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 1db3248..2370c62 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -25,8 +25,6 @@ class TargetInstrInfo;
 
 struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
 private:
-  const TargetInstrInfo &TII;
-
   void loadConstant(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator I,
                   unsigned DstReg, int64_t Value, DebugLoc dl) const;
@@ -40,7 +38,7 @@ private:
                   unsigned DstReg, int Offset, DebugLoc dl) const;
 
 public:
-  XCoreRegisterInfo(const TargetInstrInfo &tii);
+  XCoreRegisterInfo();
 
   /// Code Generation virtual methods...
 
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 07e5fff..9ae0b86 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -33,6 +33,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
     FrameLowering(Subtarget),
     TLInfo(*this),
     TSInfo(*this) {
+  initAsmInfo();
 }
 
 namespace {
@@ -69,3 +70,11 @@ bool XCorePassConfig::addInstSelector() {
 extern "C" void LLVMInitializeXCoreTarget() {
   RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
 }
+
+void XCoreTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our XCore pass. This
+  // allows the XCore pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createXCoreTargetTransformInfoPass(this));
+}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index eb9a1aa..a19a677 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -57,6 +57,8 @@ public:
 
   // Pass Pipeline Configuration
   virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+
+  virtual void addAnalysisPasses(PassManagerBase &PM);
 };
 
 } // end namespace llvm
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
new file mode 100644
index 0000000..cc165f7
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
@@ -0,0 +1,83 @@
+//===-- XCoreTargetTransformInfo.cpp - XCore specific TTI pass ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// XCore target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xcoretti"
+#include "XCore.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't havve a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeXCoreTTIPass(PassRegistry &);
+}
+
+namespace {
+
+class XCoreTTI : public ImmutablePass, public TargetTransformInfo {
+public:
+  XCoreTTI() : ImmutablePass(ID) {
+    llvm_unreachable("This pass cannot be directly constructed");
+  }
+
+  XCoreTTI(const XCoreTargetMachine *TM)
+      : ImmutablePass(ID) {
+    initializeXCoreTTIPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual void initializePass() {
+    pushTTIStack(this);
+  }
+
+  virtual void finalizePass() {
+    popTTIStack();
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    TargetTransformInfo::getAnalysisUsage(AU);
+  }
+
+  static char ID;
+
+  virtual void *getAdjustedAnalysisPointer(const void *ID) {
+    if (ID == &TargetTransformInfo::ID)
+      return (TargetTransformInfo*)this;
+    return this;
+  }
+
+  unsigned getNumberOfRegisters(bool Vector) const {
+    if (Vector) {
+       return 0;
+    }
+    return 12;
+  }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(XCoreTTI, TargetTransformInfo, "xcoretti",
+                   "XCore Target Transform Info", true, true, false)
+char XCoreTTI::ID = 0;
+
+
+ImmutablePass *
+llvm::createXCoreTargetTransformInfoPass(const XCoreTargetMachine *TM) {
+  return new XCoreTTI(TM);
+}
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp
index 9f2343b..9251783 100644
--- a/lib/Transforms/Hello/Hello.cpp
+++ b/lib/Transforms/Hello/Hello.cpp
@@ -52,7 +52,7 @@ namespace {
       return false;
     }
 
-    // We don't modify the program, so we preserve all analyses
+    // We don't modify the program, so we preserve all analyses.
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.setPreservesAll();
     }
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index e6fa4ed..df08091 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -88,7 +88,7 @@ char ArgPromotion::ID = 0;
 INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
                 "Promote 'by reference' arguments to scalars", false, false)
 
@@ -126,12 +126,10 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
   if (!F || !F->hasLocalLinkage()) return 0;
 
   // First check: see if there are any pointer arguments!  If not, quick exit.
-  SmallVector<std::pair<Argument*, unsigned>, 16> PointerArgs;
-  unsigned ArgNo = 0;
-  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
-       I != E; ++I, ++ArgNo)
+  SmallVector<Argument*, 16> PointerArgs;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I)
     if (I->getType()->isPointerTy())
-      PointerArgs.push_back(std::pair<Argument*, unsigned>(I, ArgNo));
+      PointerArgs.push_back(I);
   if (PointerArgs.empty()) return 0;
 
   // Second check: make sure that all callers are direct callers.  We can't
@@ -152,15 +150,13 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
   // add it to ArgsToPromote.
   SmallPtrSet<Argument*, 8> ArgsToPromote;
   SmallPtrSet<Argument*, 8> ByValArgsToTransform;
-  for (unsigned i = 0; i != PointerArgs.size(); ++i) {
-    bool isByVal=F->getAttributes().
-      hasAttribute(PointerArgs[i].second+1, Attribute::ByVal);
-    Argument *PtrArg = PointerArgs[i].first;
+  for (unsigned i = 0, e = PointerArgs.size(); i != e; ++i) {
+    Argument *PtrArg = PointerArgs[i];
     Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
 
     // If this is a byval argument, and if the aggregate type is small, just
     // pass the elements, which is always safe.
-    if (isByVal) {
+    if (PtrArg->hasByValAttr()) {
       if (StructType *STy = dyn_cast<StructType>(AgTy)) {
         if (maxElements > 0 && STy->getNumElements() > maxElements) {
           DEBUG(dbgs() << "argpromotion disable promoting argument '"
@@ -205,7 +201,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
     }
     
     // Otherwise, see if we can promote the pointer to its value.
-    if (isSafeToPromoteArgument(PtrArg, isByVal))
+    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValAttr()))
       ArgsToPromote.insert(PtrArg);
   }
 
@@ -221,8 +217,7 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
 static bool AllCallersPassInValidPointerForArgument(Argument *Arg) {
   Function *Callee = Arg->getParent();
 
-  unsigned ArgNo = std::distance(Callee->arg_begin(),
-                                 Function::arg_iterator(Arg));
+  unsigned ArgNo = Arg->getArgNo();
 
   // Look at all call sites of the function.  At this pointer we know we only
   // have direct callees.
@@ -509,7 +504,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   // OriginalLoads - Keep track of a representative load instruction from the
   // original function so that we can tell the alias analysis implementation
   // what the new GEP/Load instructions we are inserting look like.
-  std::map<IndicesVector, LoadInst*> OriginalLoads;
+  // We need to keep the original loads for each argument and the elements
+  // of the argument that are accessed.
+  std::map<std::pair<Argument*, IndicesVector>, LoadInst*> OriginalLoads;
 
   // Attribute - Keep track of the parameter attributes for the arguments
   // that we are *not* promoting. For the ones that we do promote, the parameter
@@ -574,7 +571,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         else
           // Take any load, we will use it only to update Alias Analysis
           OrigLoad = cast<LoadInst>(User->use_back());
-        OriginalLoads[Indices] = OrigLoad;
+        OriginalLoads[std::make_pair(I, Indices)] = OrigLoad;
       }
 
       // Add a parameter to the function for each element passed in.
@@ -681,7 +678,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
         for (ScalarizeTable::iterator SI = ArgIndices.begin(),
                E = ArgIndices.end(); SI != E; ++SI) {
           Value *V = *AI;
-          LoadInst *OrigLoad = OriginalLoads[*SI];
+          LoadInst *OrigLoad = OriginalLoads[std::make_pair(I, *SI)];
           if (!SI->empty()) {
             Ops.reserve(SI->size());
             Type *ElTy = V->getType();
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index a7bf188..d94c0f4 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -93,9 +93,12 @@ bool ConstantMerge::hasKnownAlignment(GlobalVariable *GV) const {
 }
 
 unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
+  unsigned Align = GV->getAlignment();
+  if (Align)
+    return Align;
   if (TD)
     return TD->getPreferredAlignment(GV);
-  return GV->getAlignment();
+  return 0;
 }
 
 bool ConstantMerge::runOnModule(Module &M) {
@@ -210,9 +213,9 @@ bool ConstantMerge::runOnModule(Module &M) {
       // Bump the alignment if necessary.
       if (Replacements[i].first->getAlignment() ||
           Replacements[i].second->getAlignment()) {
-        Replacements[i].second->setAlignment(std::max(
-            Replacements[i].first->getAlignment(),
-            Replacements[i].second->getAlignment()));
+        Replacements[i].second->setAlignment(
+            std::max(getAlignment(Replacements[i].first),
+                     getAlignment(Replacements[i].second)));
       }
 
       // Eliminate any uses of the dead global.
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 49ef1e7..911c14e 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -211,7 +211,9 @@ void DAE::CollectFunctionDIs(Module &M) {
       for (unsigned SPIndex = 0, SPNum = SPs.getNumElements();
            SPIndex < SPNum; ++SPIndex) {
         DISubprogram SP(SPs.getElement(SPIndex));
-        if (!SP.Verify())
+        assert((!SP || SP.isSubprogram()) &&
+          "A MDNode in subprograms of a CU should be null or a DISubprogram.");
+        if (!SP)
           continue;
         if (Function *F = SP.getFunction())
           FunctionDIs[F] = SP;
@@ -263,8 +265,10 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   // to pass in a smaller number of arguments into the new function.
   //
   std::vector<Value*> Args;
-  while (!Fn.use_empty()) {
-    CallSite CS(Fn.use_back());
+  for (Value::use_iterator I = Fn.use_begin(), E = Fn.use_end(); I != E; ) {
+    CallSite CS(*I++);
+    if (!CS)
+      continue;
     Instruction *Call = CS.getInstruction();
 
     // Pass all the same arguments.
@@ -330,6 +334,11 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
   if (DI != FunctionDIs.end())
     DI->second.replaceFunction(NF);
 
+  // Fix up any BlockAddresses that refer to the function.
+  Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
+  // Delete the bitcast that we just created, so that NF does not
+  // appear to be address-taken.
+  NF->removeDeadConstantUsers();
   // Finally, nuke the old function.
   Fn.eraseFromParent();
   return true;
@@ -343,8 +352,22 @@ bool DAE::RemoveDeadArgumentsFromCallers(Function &Fn)
   if (Fn.isDeclaration() || Fn.mayBeOverridden())
     return false;
 
-  // Functions with local linkage should already have been handled.
-  if (Fn.hasLocalLinkage())
+  // Functions with local linkage should already have been handled, except the
+  // fragile (variadic) ones which we can improve here.
+  if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
+    return false;
+
+  // If a function seen at compile time is not necessarily the one linked to
+  // the binary being built, it is illegal to change the actual arguments
+  // passed to it. These functions can be captured by isWeakForLinker().
+  // *NOTE* that mayBeOverridden() is insufficient for this purpose as it
+  // doesn't include linkage types like AvailableExternallyLinkage and
+  // LinkOnceODRLinkage. Take link_odr* as an example, it indicates a set of
+  // *EQUIVALENT* globals that can be merged at link-time. However, the
+  // semantic of *EQUIVALENT*-functions includes parameters. Changing
+  // parameters breaks this assumption.
+  //
+  if (Fn.isWeakForLinker())
     return false;
 
   if (Fn.use_empty())
@@ -604,9 +627,20 @@ void DAE::SurveyFunction(const Function &F) {
   UseVector MaybeLiveArgUses;
   for (Function::const_arg_iterator AI = F.arg_begin(),
        E = F.arg_end(); AI != E; ++AI, ++i) {
-    // See what the effect of this use is (recording any uses that cause
-    // MaybeLive in MaybeLiveArgUses).
-    Liveness Result = SurveyUses(AI, MaybeLiveArgUses);
+    Liveness Result;
+    if (F.getFunctionType()->isVarArg()) {
+      // Variadic functions will already have a va_arg function expanded inside
+      // them, making them potentially very sensitive to ABI changes resulting
+      // from removing arguments entirely, so don't. For example AArch64 handles
+      // register and stack HFAs very differently, and this is reflected in the
+      // IR which has already been generated.
+      Result = Live;
+    } else {
+      // See what the effect of this use is (recording any uses that cause
+      // MaybeLive in MaybeLiveArgUses). 
+      Result = SurveyUses(AI, MaybeLiveArgUses);
+    }
+
     // Mark the result.
     MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses);
     // Clear the vector again for the next iteration.
@@ -695,10 +729,42 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   FunctionType *FTy = F->getFunctionType();
   std::vector<Type*> Params;
 
+  // Keep track of if we have a live 'returned' argument
+  bool HasLiveReturnedArg = false;
+
   // Set up to build a new list of parameter attributes.
   SmallVector<AttributeSet, 8> AttributesVec;
   const AttributeSet &PAL = F->getAttributes();
 
+  // Remember which arguments are still alive.
+  SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+  // Construct the new parameter list from non-dead arguments. Also construct
+  // a new set of parameter attributes to correspond. Skip the first parameter
+  // attribute, since that belongs to the return value.
+  unsigned i = 0;
+  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+       I != E; ++I, ++i) {
+    RetOrArg Arg = CreateArg(F, i);
+    if (LiveValues.erase(Arg)) {
+      Params.push_back(I->getType());
+      ArgAlive[i] = true;
+
+      // Get the original parameter attributes (skipping the first one, that is
+      // for the return value.
+      if (PAL.hasAttributes(i + 1)) {
+        AttrBuilder B(PAL, i + 1);
+        if (B.contains(Attribute::Returned))
+          HasLiveReturnedArg = true;
+        AttributesVec.
+          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
+      }
+    } else {
+      ++NumArgumentsEliminated;
+      DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
+            << ") from " << F->getName() << "\n");
+    }
+  }
+
   // Find out the new return value.
   Type *RetTy = FTy->getReturnType();
   Type *NRetTy = NULL;
@@ -707,7 +773,27 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   // -1 means unused, other numbers are the new index
   SmallVector<int, 5> NewRetIdxs(RetCount, -1);
   std::vector<Type*> RetTypes;
-  if (RetTy->isVoidTy()) {
+
+  // If there is a function with a live 'returned' argument but a dead return
+  // value, then there are two possible actions:
+  // 1) Eliminate the return value and take off the 'returned' attribute on the
+  //    argument.
+  // 2) Retain the 'returned' attribute and treat the return value (but not the
+  //    entire function) as live so that it is not eliminated.
+  // 
+  // It's not clear in the general case which option is more profitable because,
+  // even in the absence of explicit uses of the return value, code generation
+  // is free to use the 'returned' attribute to do things like eliding
+  // save/restores of registers across calls. Whether or not this happens is
+  // target and ABI-specific as well as depending on the amount of register
+  // pressure, so there's no good way for an IR-level pass to figure this out.
+  //
+  // Fortunately, the only places where 'returned' is currently generated by
+  // the FE are places where 'returned' is basically free and almost always a
+  // performance win, so the second option can just be used always for now.
+  //
+  // This should be revisited if 'returned' is ever applied more liberally.
+  if (RetTy->isVoidTy() || HasLiveReturnedArg) {
     NRetTy = RetTy;
   } else {
     StructType *STy = dyn_cast<StructType>(RetTy);
@@ -777,33 +863,6 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
     AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));
 
-  // Remember which arguments are still alive.
-  SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
-  // Construct the new parameter list from non-dead arguments. Also construct
-  // a new set of parameter attributes to correspond. Skip the first parameter
-  // attribute, since that belongs to the return value.
-  unsigned i = 0;
-  for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
-       I != E; ++I, ++i) {
-    RetOrArg Arg = CreateArg(F, i);
-    if (LiveValues.erase(Arg)) {
-      Params.push_back(I->getType());
-      ArgAlive[i] = true;
-
-      // Get the original parameter attributes (skipping the first one, that is
-      // for the return value.
-      if (PAL.hasAttributes(i + 1)) {
-        AttrBuilder B(PAL, i + 1);
-        AttributesVec.
-          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
-      }
-    } else {
-      ++NumArgumentsEliminated;
-      DEBUG(dbgs() << "DAE - Removing argument " << i << " (" << I->getName()
-            << ") from " << F->getName() << "\n");
-    }
-  }
-
   if (PAL.hasAttributes(AttributeSet::FunctionIndex))
     AttributesVec.push_back(AttributeSet::get(F->getContext(),
                                               PAL.getFnAttributes()));
@@ -864,6 +923,13 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
         // Get original parameter attributes, but skip return attributes.
         if (CallPAL.hasAttributes(i + 1)) {
           AttrBuilder B(CallPAL, i + 1);
+          // If the return type has changed, then get rid of 'returned' on the
+          // call site. The alternative is to make all 'returned' attributes on
+          // call sites keep the return value alive just like 'returned'
+          // attributes on function declaration but it's less clearly a win
+          // and this is not an expected case anyway
+          if (NRetTy != RetTy && B.contains(Attribute::Returned))
+            B.removeAttribute(Attribute::Returned);
           AttributesVec.
             push_back(AttributeSet::get(F->getContext(), Args.size(), B));
         }
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index fa3d72d..50fb3e6 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -21,6 +21,38 @@
 #include <algorithm>
 using namespace llvm;
 
+/// Make sure GV is visible from both modules. Delete is true if it is
+/// being deleted from this module.
+/// This also makes sure GV cannot be dropped so that references from
+/// the split module remain valid.
+static void makeVisible(GlobalValue &GV, bool Delete) {
+  bool Local = GV.hasLocalLinkage();
+  if (Local)
+    GV.setVisibility(GlobalValue::HiddenVisibility);
+
+  if (Local || Delete) {
+    GV.setLinkage(GlobalValue::ExternalLinkage);
+    return;
+  }
+
+  if (!GV.hasLinkOnceLinkage()) {
+    assert(!GV.isDiscardableIfUnused());
+    return;
+  }
+
+  // Map linkonce* to weak* so that llvm doesn't drop this GV.
+  switch(GV.getLinkage()) {
+  default:
+    llvm_unreachable("Unexpected linkage");
+  case GlobalValue::LinkOnceAnyLinkage:
+    GV.setLinkage(GlobalValue::WeakAnyLinkage);
+    return;
+  case GlobalValue::LinkOnceODRLinkage:
+    GV.setLinkage(GlobalValue::WeakODRLinkage);
+    return;
+  }
+}
+
 namespace {
   /// @brief A pass to extract specific functions and their dependencies.
   class GVExtractorPass : public ModulePass {
@@ -60,12 +92,7 @@ namespace {
             continue;
         }
 
-        bool Local = I->isDiscardableIfUnused();
-        if (Local)
-          I->setVisibility(GlobalValue::HiddenVisibility);
-
-        if (Local || Delete)
-          I->setLinkage(GlobalValue::ExternalLinkage);
+	makeVisible(*I, Delete);
 
         if (Delete)
           I->setInitializer(0);
@@ -80,12 +107,7 @@ namespace {
             continue;
         }
 
-        bool Local = I->isDiscardableIfUnused();
-        if (Local)
-          I->setVisibility(GlobalValue::HiddenVisibility);
-
-        if (Local || Delete)
-          I->setLinkage(GlobalValue::ExternalLinkage);
+	makeVisible(*I, Delete);
 
         if (Delete)
           I->deleteBody();
@@ -97,12 +119,10 @@ namespace {
         Module::alias_iterator CurI = I;
         ++I;
 
-        if (CurI->isDiscardableIfUnused()) {
-          CurI->setVisibility(GlobalValue::HiddenVisibility);
-          CurI->setLinkage(GlobalValue::ExternalLinkage);
-        }
+	bool Delete = deleteStuff == (bool)Named.count(CurI);
+	makeVisible(*CurI, Delete);
 
-        if (deleteStuff == (bool)Named.count(CurI)) {
+        if (Delete) {
           Type *Ty =  CurI->getType()->getElementType();
 
           CurI->removeFromParent();
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index bc5109b..60e5f06 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -9,14 +9,12 @@
 //
 // This file implements a simple interprocedural pass which walks the
 // call-graph, looking for functions which do not access or only read
-// non-local memory, and marking them readnone/readonly.  In addition,
-// it marks function arguments (of pointer type) 'nocapture' if a call
-// to the function does not create any copies of the pointer value that
-// outlive the call.  This more or less means that the pointer is only
-// dereferenced, and not returned from the function or stored in a global.
-// Finally, well-known library call declarations are marked with all
-// attributes that are consistent with the function's standard definition.
-// This pass is implemented as a bottom-up traversal of the call-graph.
+// non-local memory, and marking them readnone/readonly.  It does the
+// same with function arguments independently, marking them readonly/
+// readnone/nocapture.  Finally, well-known library call declarations
+// are marked with all attributes that are consistent with the
+// function's standard definition. This pass is implemented as a
+// bottom-up traversal of the call-graph.
 //
 //===----------------------------------------------------------------------===//
 
@@ -40,6 +38,8 @@ using namespace llvm;
 STATISTIC(NumReadNone, "Number of functions marked readnone");
 STATISTIC(NumReadOnly, "Number of functions marked readonly");
 STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
+STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
 STATISTIC(NumNoAlias, "Number of function returns marked noalias");
 STATISTIC(NumAnnotated, "Number of attributes added to library functions");
 
@@ -56,8 +56,8 @@ namespace {
     // AddReadAttrs - Deduce readonly/readnone attributes for the SCC.
     bool AddReadAttrs(const CallGraphSCC &SCC);
 
-    // AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
-    bool AddNoCaptureAttrs(const CallGraphSCC &SCC);
+    // AddArgumentAttrs - Deduce nocapture attributes for the SCC.
+    bool AddArgumentAttrs(const CallGraphSCC &SCC);
 
     // IsFunctionMallocLike - Does this function allocate new memory?
     bool IsFunctionMallocLike(Function *F,
@@ -71,36 +71,43 @@ namespace {
 
     void setDoesNotAccessMemory(Function &F) {
       if (!F.doesNotAccessMemory()) {
-	F.setDoesNotAccessMemory();
-	++NumAnnotated;
+        F.setDoesNotAccessMemory();
+        ++NumAnnotated;
       }
     }
 
     void setOnlyReadsMemory(Function &F) {
       if (!F.onlyReadsMemory()) {
-	F.setOnlyReadsMemory();
-	++NumAnnotated;
+        F.setOnlyReadsMemory();
+        ++NumAnnotated;
       }
     }
 
     void setDoesNotThrow(Function &F) {
       if (!F.doesNotThrow()) {
-	F.setDoesNotThrow();
-	++NumAnnotated;
+        F.setDoesNotThrow();
+        ++NumAnnotated;
       }
     }
 
     void setDoesNotCapture(Function &F, unsigned n) {
       if (!F.doesNotCapture(n)) {
-	F.setDoesNotCapture(n);
-	++NumAnnotated;
+        F.setDoesNotCapture(n);
+        ++NumAnnotated;
+      }
+    }
+
+    void setOnlyReadsMemory(Function &F, unsigned n) {
+      if (!F.onlyReadsMemory(n)) {
+        F.setOnlyReadsMemory(n);
+        ++NumAnnotated;
       }
     }
 
     void setDoesNotAlias(Function &F, unsigned n) {
       if (!F.doesNotAlias(n)) {
-	F.setDoesNotAlias(n);
-	++NumAnnotated;
+        F.setDoesNotAlias(n);
+        ++NumAnnotated;
       }
     }
 
@@ -129,7 +136,8 @@ namespace {
 char FunctionAttrs::ID = 0;
 INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(FunctionAttrs, "functionattrs",
                 "Deduce function attributes", false, false)
@@ -343,6 +351,7 @@ namespace {
       Function *F = CS.getCalledFunction();
       if (!F || !SCCNodes.count(F)) { Captured = true; return true; }
 
+      bool Found = false;
       Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
       for (CallSite::arg_iterator PI = CS.arg_begin(), PE = CS.arg_end();
            PI != PE; ++PI, ++AI) {
@@ -353,10 +362,12 @@ namespace {
         }
         if (PI == U) {
           Uses.push_back(AI);
+          Found = true;
           break;
         }
       }
-      assert(!Uses.empty() && "Capturing call-site captured nothing?");
+      assert(Found && "Capturing call-site captured nothing?");
+      (void)Found;
       return false;
     }
 
@@ -394,8 +405,100 @@ namespace llvm {
   };
 }
 
-/// AddNoCaptureAttrs - Deduce nocapture attributes for the SCC.
-bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
+// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
+static Attribute::AttrKind
+determinePointerReadAttrs(Argument *A,
+                          const SmallPtrSet<Argument*, 8> &SCCNodes) {
+                                                       
+  SmallVector<Use*, 32> Worklist;
+  SmallSet<Use*, 32> Visited;
+  int Count = 0;
+
+  bool IsRead = false;
+  // We don't need to track IsWritten. If A is written to, return immediately.
+
+  for (Value::use_iterator UI = A->use_begin(), UE = A->use_end();
+       UI != UE; ++UI) {
+    if (Count++ >= 20)
+      return Attribute::None;
+
+    Use *U = &UI.getUse();
+    Visited.insert(U);
+    Worklist.push_back(U);
+  }
+
+  while (!Worklist.empty()) {
+    Use *U = Worklist.pop_back_val();
+    Instruction *I = cast<Instruction>(U->getUser());
+    Value *V = U->get();
+
+    switch (I->getOpcode()) {
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::PHI:
+    case Instruction::Select:
+      // The original value is not read/written via this if the new value isn't.
+      for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end();
+           UI != UE; ++UI) {
+        Use *U = &UI.getUse();
+        if (Visited.insert(U))
+          Worklist.push_back(U);
+      }
+      break;
+
+    case Instruction::Call:
+    case Instruction::Invoke: {
+      CallSite CS(I);
+      if (CS.doesNotAccessMemory())
+        continue;
+
+      Function *F = CS.getCalledFunction();
+      if (!F) {
+        if (CS.onlyReadsMemory()) {
+          IsRead = true;
+          continue;
+        }
+        return Attribute::None;
+      }
+
+      Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+      CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
+      for (CallSite::arg_iterator A = B; A != E; ++A, ++AI) {
+        if (A->get() == V) {
+          if (AI == AE) {
+            assert(F->isVarArg() &&
+                   "More params than args in non-varargs call.");
+            return Attribute::None;
+          }
+          if (SCCNodes.count(AI))
+            continue;
+          if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(A - B))
+            return Attribute::None;
+          if (!CS.doesNotAccessMemory(A - B))
+            IsRead = true;
+        }
+      }
+      break;
+    }
+
+    case Instruction::Load:
+      IsRead = true;
+      break;
+
+    case Instruction::ICmp:
+    case Instruction::Ret:
+      break;
+
+    default:
+      return Attribute::None;
+    }
+  }
+
+  return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
+}
+
+/// AddArgumentAttrs - Deduce nocapture attributes for the SCC.
+bool FunctionAttrs::AddArgumentAttrs(const CallGraphSCC &SCC) {
   bool Changed = false;
 
   SmallPtrSet<Function*, 8> SCCNodes;
@@ -442,8 +545,11 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
       continue;
     }
 
-    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A!=E; ++A)
-      if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+    for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end();
+         A != E; ++A) {
+      if (!A->getType()->isPointerTy()) continue;
+      bool HasNonLocalUses = false;
+      if (!A->hasNoCaptureAttr()) {
         ArgumentUsesTracker Tracker(SCCNodes);
         PointerMayBeCaptured(A, &Tracker);
         if (!Tracker.Captured) {
@@ -458,12 +564,32 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
             // its particulars for Argument-SCC analysis later.
             ArgumentGraphNode *Node = AG[A];
             for (SmallVectorImpl<Argument*>::iterator UI = Tracker.Uses.begin(),
-                   UE = Tracker.Uses.end(); UI != UE; ++UI)
+                     UE = Tracker.Uses.end(); UI != UE; ++UI) {
               Node->Uses.push_back(AG[*UI]);
+              if (*UI != A)
+                HasNonLocalUses = true;
+            }
           }
         }
         // Otherwise, it's captured. Don't bother doing SCC analysis on it.
       }
+      if (!HasNonLocalUses && !A->onlyReadsMemory()) {
+        // Can we determine that it's readonly/readnone without doing an SCC?
+        // Note that we don't allow any calls at all here, or else our result
+        // will be dependent on the iteration order through the functions in the
+        // SCC.
+        SmallPtrSet<Argument*, 8> Self;
+        Self.insert(A);
+        Attribute::AttrKind R = determinePointerReadAttrs(A, Self);
+        if (R != Attribute::None) {
+          AttrBuilder B;
+          B.addAttribute(R);
+          A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+          Changed = true;
+          R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+        }
+      }
+    }
   }
 
   // The graph we've collected is partial because we stopped scanning for
@@ -482,11 +608,8 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
       // eg. "void f(int* x) { if (...) f(x); }"
       if (ArgumentSCC[0]->Uses.size() == 1 &&
           ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
-        ArgumentSCC[0]->
-          Definition->
-          addAttr(AttributeSet::get(ArgumentSCC[0]->Definition->getContext(),
-                                    ArgumentSCC[0]->Definition->getArgNo() + 1,
-                                    B));
+        Argument *A = ArgumentSCC[0]->Definition;
+        A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
         ++NumNoCapture;
         Changed = true;
       }
@@ -532,6 +655,42 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
       ++NumNoCapture;
       Changed = true;
     }
+
+    // We also want to compute readonly/readnone. With a small number of false
+    // negatives, we can assume that any pointer which is captured isn't going
+    // to be provably readonly or readnone, since by definition we can't
+    // analyze all uses of a captured pointer.
+    //
+    // The false negatives happen when the pointer is captured by a function
+    // that promises readonly/readnone behaviour on the pointer, then the
+    // pointer's lifetime ends before anything that writes to arbitrary memory.
+    // Also, a readonly/readnone pointer may be returned, but returning a
+    // pointer is capturing it.
+
+    Attribute::AttrKind ReadAttr = Attribute::ReadNone;
+    for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+      Argument *A = ArgumentSCC[i]->Definition;
+      Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
+      if (K == Attribute::ReadNone)
+        continue;
+      if (K == Attribute::ReadOnly) {
+        ReadAttr = Attribute::ReadOnly;
+        continue;
+      }
+      ReadAttr = K;
+      break;
+    }
+
+    if (ReadAttr != Attribute::None) {
+      AttrBuilder B;
+      B.addAttribute(ReadAttr);
+      for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+        Argument *A = ArgumentSCC[i]->Definition;
+        A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+        ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+        Changed = true;
+      }
+    }
   }
 
   return Changed;
@@ -678,24 +837,32 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setOnlyReadsMemory(F);
     setDoesNotThrow(F);
     break;
-  case LibFunc::strcpy:
-  case LibFunc::stpcpy:
-  case LibFunc::strcat:
   case LibFunc::strtol:
   case LibFunc::strtod:
   case LibFunc::strtof:
   case LibFunc::strtoul:
   case LibFunc::strtoll:
   case LibFunc::strtold:
+  case LibFunc::strtoull:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
+  case LibFunc::strcpy:
+  case LibFunc::stpcpy:
+  case LibFunc::strcat:
   case LibFunc::strncat:
   case LibFunc::strncpy:
   case LibFunc::stpncpy:
-  case LibFunc::strtoull:
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(1)->isPointerTy())
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::strxfrm:
     if (FTy->getNumParams() != 3 ||
@@ -705,14 +872,15 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
-    break;
-  case LibFunc::strcmp:
-  case LibFunc::strspn:
-  case LibFunc::strncmp:
-  case LibFunc::strcspn:
-  case LibFunc::strcoll:
-  case LibFunc::strcasecmp:
-  case LibFunc::strncasecmp:
+    setOnlyReadsMemory(F, 2);
+    break;
+  case LibFunc::strcmp: //0,1
+    case LibFunc::strspn: // 0,1
+    case LibFunc::strncmp: // 0,1
+    case LibFunc::strcspn: //0,1
+    case LibFunc::strcoll: //0,1
+    case LibFunc::strcasecmp:  // 0,1
+    case LibFunc::strncasecmp: // 
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
         !FTy->getParamType(1)->isPointerTy())
@@ -736,8 +904,15 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::scanf:
+    if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::setbuf:
   case LibFunc::setvbuf:
     if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy())
@@ -753,11 +928,31 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::stat:
+  case LibFunc::statvfs:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::sscanf:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::sprintf:
-  case LibFunc::statvfs:
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
         !FTy->getParamType(1)->isPointerTy())
@@ -765,6 +960,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::snprintf:
     if (FTy->getNumParams() != 3 ||
@@ -774,6 +970,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 3);
+    setOnlyReadsMemory(F, 3);
     break;
   case LibFunc::setitimer:
     if (FTy->getNumParams() != 3 ||
@@ -783,6 +980,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
     setDoesNotCapture(F, 3);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::system:
     if (FTy->getNumParams() != 1 ||
@@ -790,6 +988,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     // May throw; "system" is a valid pthread cancellation point.
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::malloc:
     if (FTy->getNumParams() != 1 ||
@@ -818,6 +1017,12 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
   case LibFunc::modf:
   case LibFunc::modff:
   case LibFunc::modfl:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 2);
+    break;
   case LibFunc::memcpy:
   case LibFunc::memccpy:
   case LibFunc::memmove:
@@ -826,6 +1031,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::memalign:
     if (!FTy->getReturnType()->isPointerTy())
@@ -833,6 +1039,13 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     break;
   case LibFunc::mkdir:
+    if (FTy->getNumParams() == 0 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::mktime:
     if (FTy->getNumParams() == 0 ||
         !FTy->getParamType(0)->isPointerTy())
@@ -856,8 +1069,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     // May throw; "read" is a valid pthread cancellation point.
     setDoesNotCapture(F, 2);
     break;
-  case LibFunc::rmdir:
   case LibFunc::rewind:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
+  case LibFunc::rmdir:
   case LibFunc::remove:
   case LibFunc::realpath:
     if (FTy->getNumParams() < 1 ||
@@ -865,8 +1084,19 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::rename:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::readlink:
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -875,12 +1105,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::write:
     if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy())
       return false;
     // May throw; "write" is a valid pthread cancellation point.
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::bcopy:
     if (FTy->getNumParams() != 3 ||
@@ -890,6 +1122,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::bcmp:
     if (FTy->getNumParams() != 3 ||
@@ -916,6 +1149,12 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     break;
   case LibFunc::chmod:
   case LibFunc::chown:
+    if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::ctermid:
   case LibFunc::clearerr:
   case LibFunc::closedir:
@@ -939,6 +1178,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::fopen:
     if (FTy->getNumParams() != 2 ||
@@ -950,6 +1190,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::fdopen:
     if (FTy->getNumParams() != 2 ||
@@ -959,6 +1201,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::feof:
   case LibFunc::free:
@@ -1004,7 +1247,16 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 3);
+    break;
   case LibFunc::fread:
+    if (FTy->getNumParams() != 4 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(3)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 4);
+    break;
   case LibFunc::fwrite:
     if (FTy->getNumParams() != 4 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -1013,9 +1265,28 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 4);
+    break;
   case LibFunc::fputs:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::fscanf:
   case LibFunc::fprintf:
+    if (FTy->getNumParams() < 2 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::fgetpos:
     if (FTy->getNumParams() < 2 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -1055,6 +1326,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::ungetc:
     if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
@@ -1063,12 +1335,24 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotCapture(F, 2);
     break;
   case LibFunc::uname:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    break;
   case LibFunc::unlink:
+    if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::unsetenv:
     if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::utime:
   case LibFunc::utimes:
@@ -1079,6 +1363,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::putc:
     if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy())
@@ -1093,13 +1379,20 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::pread:
+    if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // May throw; "pread" is a valid pthread cancellation point.
+    setDoesNotCapture(F, 2);
+    break;
   case LibFunc::pwrite:
     if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy())
       return false;
-    // May throw; these are valid pthread cancellation points.
+    // May throw; "pwrite" is a valid pthread cancellation point.
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::putchar:
     setDoesNotThrow(F);
@@ -1114,6 +1407,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::pclose:
     if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
@@ -1126,8 +1421,19 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::vsscanf:
+    if (FTy->getNumParams() != 3 ||
+        !FTy->getParamType(1)->isPointerTy() ||
+        !FTy->getParamType(2)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
+    break;
   case LibFunc::vfscanf:
     if (FTy->getNumParams() != 3 ||
         !FTy->getParamType(1)->isPointerTy() ||
@@ -1136,6 +1442,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::valloc:
     if (!FTy->getReturnType()->isPointerTy())
@@ -1148,6 +1455,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::vfprintf:
   case LibFunc::vsprintf:
@@ -1158,6 +1466,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::vsnprintf:
     if (FTy->getNumParams() != 4 ||
@@ -1167,12 +1476,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 3);
+    setOnlyReadsMemory(F, 3);
     break;
   case LibFunc::open:
     if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy())
       return false;
     // May throw; "open" is a valid pthread cancellation point.
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::opendir:
     if (FTy->getNumParams() != 1 ||
@@ -1182,6 +1493,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::tmpfile:
     if (!FTy->getReturnType()->isPointerTy())
@@ -1210,12 +1522,14 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::lchown:
     if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy())
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::qsort:
     if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy())
@@ -1232,6 +1546,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::dunder_strtok_r:
     if (FTy->getNumParams() != 3 ||
@@ -1239,6 +1554,7 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::under_IO_getc:
     if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy())
@@ -1258,10 +1574,20 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
   case LibFunc::stat64:
   case LibFunc::lstat64:
   case LibFunc::statvfs64:
+    if (FTy->getNumParams() < 1 ||
+        !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    break;
   case LibFunc::dunder_isoc99_sscanf:
     if (FTy->getNumParams() < 1 ||
         !FTy->getParamType(0)->isPointerTy() ||
@@ -1270,6 +1596,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotThrow(F);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::fopen64:
     if (FTy->getNumParams() != 2 ||
@@ -1281,6 +1609,8 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
     setDoesNotAlias(F, 0);
     setDoesNotCapture(F, 1);
     setDoesNotCapture(F, 2);
+    setOnlyReadsMemory(F, 1);
+    setOnlyReadsMemory(F, 2);
     break;
   case LibFunc::fseeko64:
   case LibFunc::ftello64:
@@ -1307,7 +1637,18 @@ bool FunctionAttrs::inferPrototypeAttributes(Function &F) {
       return false;
     // May throw; "open" is a valid pthread cancellation point.
     setDoesNotCapture(F, 1);
+    setOnlyReadsMemory(F, 1);
     break;
+  case LibFunc::gettimeofday:
+    if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy() ||
+        !FTy->getParamType(1)->isPointerTy())
+      return false;
+    // Currently some platforms have the restrict keyword on the arguments to
+    // gettimeofday. To be conservative, do not add noalias to gettimeofday's
+    // arguments.
+    setDoesNotThrow(F);
+    setDoesNotCapture(F, 1);
+    setDoesNotCapture(F, 2);
   default:
     // Didn't mark any attributes.
     return false;
@@ -1339,7 +1680,7 @@ bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) {
 
   bool Changed = annotateLibraryCalls(SCC);
   Changed |= AddReadAttrs(SCC);
-  Changed |= AddNoCaptureAttrs(SCC);
+  Changed |= AddArgumentAttrs(SCC);
   Changed |= AddNoAliasAttrs(SCC);
   return Changed;
 }
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 201f320..901295d 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -179,6 +179,9 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) {
     // any globals used will be marked as needed.
     Function *F = cast<Function>(G);
 
+    if (F->hasPrefixData())
+      MarkUsedGlobalsAsNeeded(F->getPrefixData());
+
     for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
       for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
         for (User::op_iterator U = I->op_begin(), E = I->op_end(); U != E; ++U)
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 0ef900e..2ea89a1 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -37,7 +37,10 @@
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -59,7 +62,6 @@ STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
 STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
 
 namespace {
-  struct GlobalStatus;
   struct GlobalOpt : public ModulePass {
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addRequired<TargetLibraryInfo>();
@@ -79,7 +81,6 @@ namespace {
     bool OptimizeGlobalCtorsList(GlobalVariable *&GCL);
     bool ProcessGlobal(GlobalVariable *GV,Module::global_iterator &GVI);
     bool ProcessInternalGlobal(GlobalVariable *GV,Module::global_iterator &GVI,
-                               const SmallPtrSet<const PHINode*, 16> &PHIUsers,
                                const GlobalStatus &GS);
     bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
 
@@ -97,209 +98,6 @@ INITIALIZE_PASS_END(GlobalOpt, "globalopt",
 
 ModulePass *llvm::createGlobalOptimizerPass() { return new GlobalOpt(); }
 
-namespace {
-
-/// GlobalStatus - As we analyze each global, keep track of some information
-/// about it.  If we find out that the address of the global is taken, none of
-/// this info will be accurate.
-struct GlobalStatus {
-  /// isCompared - True if the global's address is used in a comparison.
-  bool isCompared;
-
-  /// isLoaded - True if the global is ever loaded.  If the global isn't ever
-  /// loaded it can be deleted.
-  bool isLoaded;
-
-  /// StoredType - Keep track of what stores to the global look like.
-  ///
-  enum StoredType {
-    /// NotStored - There is no store to this global.  It can thus be marked
-    /// constant.
-    NotStored,
-
-    /// isInitializerStored - This global is stored to, but the only thing
-    /// stored is the constant it was initialized with.  This is only tracked
-    /// for scalar globals.
-    isInitializerStored,
-
-    /// isStoredOnce - This global is stored to, but only its initializer and
-    /// one other value is ever stored to it.  If this global isStoredOnce, we
-    /// track the value stored to it in StoredOnceValue below.  This is only
-    /// tracked for scalar globals.
-    isStoredOnce,
-
-    /// isStored - This global is stored to by multiple values or something else
-    /// that we cannot track.
-    isStored
-  } StoredType;
-
-  /// StoredOnceValue - If only one value (besides the initializer constant) is
-  /// ever stored to this global, keep track of what value it is.
-  Value *StoredOnceValue;
-
-  /// AccessingFunction/HasMultipleAccessingFunctions - These start out
-  /// null/false.  When the first accessing function is noticed, it is recorded.
-  /// When a second different accessing function is noticed,
-  /// HasMultipleAccessingFunctions is set to true.
-  const Function *AccessingFunction;
-  bool HasMultipleAccessingFunctions;
-
-  /// HasNonInstructionUser - Set to true if this global has a user that is not
-  /// an instruction (e.g. a constant expr or GV initializer).
-  bool HasNonInstructionUser;
-
-  /// AtomicOrdering - Set to the strongest atomic ordering requirement.
-  AtomicOrdering Ordering;
-
-  GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
-                   StoredOnceValue(0), AccessingFunction(0),
-                   HasMultipleAccessingFunctions(false),
-                   HasNonInstructionUser(false), Ordering(NotAtomic) {}
-};
-
-}
-
-/// StrongerOrdering - Return the stronger of the two ordering. If the two
-/// orderings are acquire and release, then return AcquireRelease.
-///
-static AtomicOrdering StrongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
-  if (X == Acquire && Y == Release) return AcquireRelease;
-  if (Y == Acquire && X == Release) return AcquireRelease;
-  return (AtomicOrdering)std::max(X, Y);
-}
-
-/// SafeToDestroyConstant - It is safe to destroy a constant iff it is only used
-/// by constants itself.  Note that constants cannot be cyclic, so this test is
-/// pretty easy to implement recursively.
-///
-static bool SafeToDestroyConstant(const Constant *C) {
-  if (isa<GlobalValue>(C)) return false;
-
-  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
-       ++UI)
-    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
-      if (!SafeToDestroyConstant(CU)) return false;
-    } else
-      return false;
-  return true;
-}
-
-
-/// AnalyzeGlobal - Look at all uses of the global and fill in the GlobalStatus
-/// structure.  If the global has its address taken, return true to indicate we
-/// can't do anything with it.
-///
-static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
-                          SmallPtrSet<const PHINode*, 16> &PHIUsers) {
-  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
-       ++UI) {
-    const User *U = *UI;
-    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
-      GS.HasNonInstructionUser = true;
-
-      // If the result of the constantexpr isn't pointer type, then we won't
-      // know to expect it in various places.  Just reject early.
-      if (!isa<PointerType>(CE->getType())) return true;
-
-      if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
-    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
-      if (!GS.HasMultipleAccessingFunctions) {
-        const Function *F = I->getParent()->getParent();
-        if (GS.AccessingFunction == 0)
-          GS.AccessingFunction = F;
-        else if (GS.AccessingFunction != F)
-          GS.HasMultipleAccessingFunctions = true;
-      }
-      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        GS.isLoaded = true;
-        // Don't hack on volatile loads.
-        if (LI->isVolatile()) return true;
-        GS.Ordering = StrongerOrdering(GS.Ordering, LI->getOrdering());
-      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        // Don't allow a store OF the address, only stores TO the address.
-        if (SI->getOperand(0) == V) return true;
-
-        // Don't hack on volatile stores.
-        if (SI->isVolatile()) return true;
-
-        GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering());
-
-        // If this is a direct store to the global (i.e., the global is a scalar
-        // value, not an aggregate), keep more specific information about
-        // stores.
-        if (GS.StoredType != GlobalStatus::isStored) {
-          if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(
-                                                           SI->getOperand(1))) {
-            Value *StoredVal = SI->getOperand(0);
-
-            if (Constant *C = dyn_cast<Constant>(StoredVal)) {
-              if (C->isThreadDependent()) {
-                // The stored value changes between threads; don't track it.
-                return true;
-              }
-            }
-
-            if (StoredVal == GV->getInitializer()) {
-              if (GS.StoredType < GlobalStatus::isInitializerStored)
-                GS.StoredType = GlobalStatus::isInitializerStored;
-            } else if (isa<LoadInst>(StoredVal) &&
-                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
-              if (GS.StoredType < GlobalStatus::isInitializerStored)
-                GS.StoredType = GlobalStatus::isInitializerStored;
-            } else if (GS.StoredType < GlobalStatus::isStoredOnce) {
-              GS.StoredType = GlobalStatus::isStoredOnce;
-              GS.StoredOnceValue = StoredVal;
-            } else if (GS.StoredType == GlobalStatus::isStoredOnce &&
-                       GS.StoredOnceValue == StoredVal) {
-              // noop.
-            } else {
-              GS.StoredType = GlobalStatus::isStored;
-            }
-          } else {
-            GS.StoredType = GlobalStatus::isStored;
-          }
-        }
-      } else if (isa<BitCastInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<GetElementPtrInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<SelectInst>(I)) {
-        if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
-        // PHI nodes we can check just like select or GEP instructions, but we
-        // have to be careful about infinite recursion.
-        if (PHIUsers.insert(PN))  // Not already visited.
-          if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-      } else if (isa<CmpInst>(I)) {
-        GS.isCompared = true;
-      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
-        if (MTI->isVolatile()) return true;
-        if (MTI->getArgOperand(0) == V)
-          GS.StoredType = GlobalStatus::isStored;
-        if (MTI->getArgOperand(1) == V)
-          GS.isLoaded = true;
-      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
-        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
-        if (MSI->isVolatile()) return true;
-        GS.StoredType = GlobalStatus::isStored;
-      } else {
-        return true;  // Any other non-load instruction might take address!
-      }
-    } else if (const Constant *C = dyn_cast<Constant>(U)) {
-      GS.HasNonInstructionUser = true;
-      // We might have a dead and dangling constant hanging off of here.
-      if (!SafeToDestroyConstant(C))
-        return true;
-    } else {
-      GS.HasNonInstructionUser = true;
-      // Otherwise must be some other user.
-      return true;
-    }
-  }
-
-  return false;
-}
-
 /// isLeakCheckerRoot - Is this global variable possibly used by a leak checker
 /// as a root?  If so, we might not really want to eliminate the stores to it.
 static bool isLeakCheckerRoot(GlobalVariable *GV) {
@@ -433,7 +231,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
         Changed = true;
       }
     } else if (Constant *C = dyn_cast<Constant>(U)) {
-      if (SafeToDestroyConstant(C)) {
+      if (isSafeToDestroyConstant(C)) {
         C->destroyConstant();
         // This could have invalidated UI, start over from scratch.
         Dead.clear();
@@ -470,9 +268,17 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV,
 static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
                                        DataLayout *TD, TargetLibraryInfo *TLI) {
   bool Changed = false;
-  SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end());
+  // Note that we need to use a weak value handle for the worklist items. When
+  // we delete a constant array, we may also be holding pointer to one of its
+  // elements (or an element of one of its elements if we're dealing with an
+  // array of arrays) in the worklist.
+  SmallVector<WeakVH, 8> WorkList(V->use_begin(), V->use_end());
   while (!WorkList.empty()) {
-    User *U = WorkList.pop_back_val();
+    Value *UV = WorkList.pop_back_val();
+    if (!UV)
+      continue;
+
+    User *U = cast<User>(UV);
 
     if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
       if (Init) {
@@ -533,7 +339,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
     } else if (Constant *C = dyn_cast<Constant>(U)) {
       // If we have a chain of dead constantexprs or other things dangling from
       // us, and if they are all dead, nuke them without remorse.
-      if (SafeToDestroyConstant(C)) {
+      if (isSafeToDestroyConstant(C)) {
         C->destroyConstant();
         CleanupConstantGlobalUsers(V, Init, TD, TLI);
         return true;
@@ -548,7 +354,7 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
 static bool isSafeSROAElementUse(Value *V) {
   // We might have a dead and dangling constant hanging off of here.
   if (Constant *C = dyn_cast<Constant>(V))
-    return SafeToDestroyConstant(C);
+    return isSafeToDestroyConstant(C);
 
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) return false;
@@ -1372,8 +1178,7 @@ static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
   } else if (PHINode *PN = dyn_cast<PHINode>(V)) {
     // PN's type is pointer to struct.  Make a new PHI of pointer to struct
     // field.
-    StructType *ST =
-      cast<StructType>(cast<PointerType>(PN->getType())->getElementType());
+    StructType *ST = cast<StructType>(PN->getType()->getPointerElementType());
 
     PHINode *NewPN =
      PHINode::Create(PointerType::getUnqual(ST->getElementType(FieldNo)),
@@ -1504,7 +1309,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
     unsigned TypeSize = TD->getTypeAllocSize(FieldTy);
     if (StructType *ST = dyn_cast<StructType>(FieldTy))
       TypeSize = TD->getStructLayout(ST)->getSizeInBytes();
-    Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+    Type *IntPtrTy = TD->getIntPtrType(CI->getType());
     Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
                                         ConstantInt::get(IntPtrTy, TypeSize),
                                         NElems, 0,
@@ -1734,7 +1539,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
     // If this is a fixed size array, transform the Malloc to be an alloc of
     // structs.  malloc [100 x struct],1 -> malloc struct, 100
     if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
-      Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
+      Type *IntPtrTy = TD->getIntPtrType(CI->getType());
       unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
       Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
       Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
@@ -1916,13 +1721,12 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
   if (!GV->hasLocalLinkage())
     return false;
 
-  SmallPtrSet<const PHINode*, 16> PHIUsers;
   GlobalStatus GS;
 
-  if (AnalyzeGlobal(GV, GS, PHIUsers))
+  if (GlobalStatus::analyzeGlobal(GV, GS))
     return false;
 
-  if (!GS.isCompared && !GV->hasUnnamedAddr()) {
+  if (!GS.IsCompared && !GV->hasUnnamedAddr()) {
     GV->setUnnamedAddr(true);
     NumUnnamed++;
   }
@@ -1930,19 +1734,17 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV,
   if (GV->isConstant() || !GV->hasInitializer())
     return false;
 
-  return ProcessInternalGlobal(GV, GVI, PHIUsers, GS);
+  return ProcessInternalGlobal(GV, GVI, GS);
 }
 
 /// ProcessInternalGlobal - Analyze the specified global variable and optimize
 /// it if possible.  If we make a change, return true.
 bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
                                       Module::global_iterator &GVI,
-                                const SmallPtrSet<const PHINode*, 16> &PHIUsers,
                                       const GlobalStatus &GS) {
   // If this is a first class global and has only one accessing function
-  // and this function is main (which we know is not recursive we can make
-  // this global a local variable) we replace the global with a local alloca
-  // in this function.
+  // and this function is main (which we know is not recursive), we replace
+  // the global with a local alloca in this function.
   //
   // NOTE: It doesn't make sense to promote non single-value types since we
   // are just replacing static memory to stack memory.
@@ -1971,7 +1773,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
 
   // If the global is never loaded (but may be stored to), it is dead.
   // Delete it now.
-  if (!GS.isLoaded) {
+  if (!GS.IsLoaded) {
     DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV);
 
     bool Changed;
@@ -1992,7 +1794,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
     }
     return Changed;
 
-  } else if (GS.StoredType <= GlobalStatus::isInitializerStored) {
+  } else if (GS.StoredType <= GlobalStatus::InitializerStored) {
     DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
     GV->setConstant(true);
 
@@ -2015,7 +1817,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
         GVI = FirstNewGV;  // Don't skip the newly produced globals!
         return true;
       }
-  } else if (GS.StoredType == GlobalStatus::isStoredOnce) {
+  } else if (GS.StoredType == GlobalStatus::StoredOnce) {
     // If the initial value for the global was an undef value, and if only
     // one other value was stored into it, we can just change the
     // initializer to be the stored value, then delete all stores to the
@@ -2048,11 +1850,14 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
 
     // Otherwise, if the global was not a boolean, we can shrink it to be a
     // boolean.
-    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
-      if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
-        ++NumShrunkToBool;
-        return true;
+    if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
+      if (GS.Ordering == NotAtomic) {
+        if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+          ++NumShrunkToBool;
+          return true;
+        }
       }
+    }
   }
 
   return false;
@@ -2210,8 +2015,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
   CSVals[1] = 0;
 
   StructType *StructTy =
-    cast <StructType>(
-    cast<ArrayType>(GCL->getType()->getElementType())->getElementType());
+    cast<StructType>(GCL->getType()->getElementType()->getArrayElementType());
 
   // Create the new init list.
   std::vector<Constant*> CAList;
@@ -2784,7 +2588,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
           Value *Ptr = PtrArg->stripPointerCasts();
           if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
             Type *ElemTy = cast<PointerType>(GV->getType())->getElementType();
-            if (!Size->isAllOnesValue() &&
+            if (TD && !Size->isAllOnesValue() &&
                 Size->getValue().getLimitedValue() >=
                 TD->getTypeStoreSize(ElemTy)) {
               Invariants.insert(GV);
@@ -3041,107 +2845,148 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) {
   return true;
 }
 
-static Value::use_iterator getFirst(Value *V, SmallPtrSet<Use*, 8> &Tried) {
-  for (Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) {
-    Use *U = &I.getUse();
-    if (Tried.count(U))
-      continue;
-
-    User *Usr = *I;
-    GlobalVariable *GV = dyn_cast<GlobalVariable>(Usr);
-    if (!GV || !GV->hasName()) {
-      Tried.insert(U);
-      return I;
-    }
-
-    StringRef Name = GV->getName();
-    if (Name != "llvm.used" && Name != "llvm.compiler_used") {
-      Tried.insert(U);
-      return I;
-    }
-  }
-  return V->use_end();
+static int compareNames(Constant *const *A, Constant *const *B) {
+  return (*A)->getName().compare((*B)->getName());
 }
 
-static bool replaceAllNonLLVMUsedUsesWith(Constant *Old, Constant *New);
-
-static bool replaceUsesOfWithOnConstant(ConstantArray *CA, Value *From,
-                                        Value *ToV, Use *U) {
-  Constant *To = cast<Constant>(ToV);
-
-  SmallVector<Constant*, 8> NewOps;
-  for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
-    Constant *Op = CA->getOperand(i);
-    NewOps.push_back(Op == From ? To : Op);
+static void setUsedInitializer(GlobalVariable &V,
+                               SmallPtrSet<GlobalValue *, 8> Init) {
+  if (Init.empty()) {
+    V.eraseFromParent();
+    return;
   }
 
-  Constant *Replacement = ConstantArray::get(CA->getType(), NewOps);
-  assert(Replacement != CA && "CA didn't contain From!");
+  SmallVector<llvm::Constant *, 8> UsedArray;
+  PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext());
 
-  bool Ret = replaceAllNonLLVMUsedUsesWith(CA, Replacement);
-  if (Replacement->use_empty())
-    Replacement->destroyConstant();
-  if (CA->use_empty())
-    CA->destroyConstant();
-  return Ret;
+  for (SmallPtrSet<GlobalValue *, 8>::iterator I = Init.begin(), E = Init.end();
+       I != E; ++I) {
+    Constant *Cast = llvm::ConstantExpr::getBitCast(*I, Int8PtrTy);
+    UsedArray.push_back(Cast);
+  }
+  // Sort to get deterministic order.
+  array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames);
+  ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size());
+
+  Module *M = V.getParent();
+  V.removeFromParent();
+  GlobalVariable *NV =
+      new GlobalVariable(*M, ATy, false, llvm::GlobalValue::AppendingLinkage,
+                         llvm::ConstantArray::get(ATy, UsedArray), "");
+  NV->takeName(&V);
+  NV->setSection("llvm.metadata");
+  delete &V;
 }
 
-static bool replaceUsesOfWithOnConstant(ConstantExpr *CE, Value *From,
-                                        Value *ToV, Use *U) {
-  Constant *To = cast<Constant>(ToV);
-  SmallVector<Constant*, 8> NewOps;
-  for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) {
-    Constant *Op = CE->getOperand(i);
-    NewOps.push_back(Op == From ? To : Op);
+namespace {
+/// \brief An easy to access representation of llvm.used and llvm.compiler.used.
+class LLVMUsed {
+  SmallPtrSet<GlobalValue *, 8> Used;
+  SmallPtrSet<GlobalValue *, 8> CompilerUsed;
+  GlobalVariable *UsedV;
+  GlobalVariable *CompilerUsedV;
+
+public:
+  LLVMUsed(Module &M) {
+    UsedV = collectUsedGlobalVariables(M, Used, false);
+    CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
+  }
+  typedef SmallPtrSet<GlobalValue *, 8>::iterator iterator;
+  iterator usedBegin() { return Used.begin(); }
+  iterator usedEnd() { return Used.end(); }
+  iterator compilerUsedBegin() { return CompilerUsed.begin(); }
+  iterator compilerUsedEnd() { return CompilerUsed.end(); }
+  bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
+  bool compilerUsedCount(GlobalValue *GV) const {
+    return CompilerUsed.count(GV);
+  }
+  bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
+  bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
+  bool usedInsert(GlobalValue *GV) { return Used.insert(GV); }
+  bool compilerUsedInsert(GlobalValue *GV) { return CompilerUsed.insert(GV); }
+
+  void syncVariablesAndSets() {
+    if (UsedV)
+      setUsedInitializer(*UsedV, Used);
+    if (CompilerUsedV)
+      setUsedInitializer(*CompilerUsedV, CompilerUsed);
   }
+};
+}
 
-  Constant *Replacement = CE->getWithOperands(NewOps);
-  assert(Replacement != CE && "CE didn't contain From!");
+static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
+  if (GA.use_empty()) // No use at all.
+    return false;
 
-  bool Ret = replaceAllNonLLVMUsedUsesWith(CE, Replacement);
-  if (Replacement->use_empty())
-    Replacement->destroyConstant();
-  if (CE->use_empty())
-    CE->destroyConstant();
-  return Ret;
+  assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) &&
+         "We should have removed the duplicated "
+         "element from llvm.compiler.used");
+  if (!GA.hasOneUse())
+    // Strictly more than one use. So at least one is not in llvm.used and
+    // llvm.compiler.used.
+    return true;
+
+  // Exactly one use. Check if it is in llvm.used or llvm.compiler.used.
+  return !U.usedCount(&GA) && !U.compilerUsedCount(&GA);
 }
 
-static bool replaceUsesOfWithOnConstant(Constant *C, Value *From, Value *To,
-                                        Use *U) {
-  if (ConstantArray *CA = dyn_cast<ConstantArray>(C))
-    return replaceUsesOfWithOnConstant(CA, From, To, U);
-  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
-    return replaceUsesOfWithOnConstant(CE, From, To, U);
-  C->replaceUsesOfWithOnConstant(From, To, U);
-  return true;
+static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V,
+                                               const LLVMUsed &U) {
+  unsigned N = 2;
+  assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) &&
+         "We should have removed the duplicated "
+         "element from llvm.compiler.used");
+  if (U.usedCount(&V) || U.compilerUsedCount(&V))
+    ++N;
+  return V.hasNUsesOrMore(N);
 }
 
-static bool replaceAllNonLLVMUsedUsesWith(Constant *Old, Constant *New) {
-  SmallPtrSet<Use*, 8> Tried;
-  bool Ret = false;
-  for (;;) {
-    Value::use_iterator I = getFirst(Old, Tried);
-    if (I == Old->use_end())
-      break;
-    Use &U = I.getUse();
+static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) {
+  if (!GA.hasLocalLinkage())
+    return true;
 
-    // Must handle Constants specially, we cannot call replaceUsesOfWith on a
-    // constant because they are uniqued.
-    if (Constant *C = dyn_cast<Constant>(U.getUser())) {
-      if (!isa<GlobalValue>(C)) {
-        Ret |= replaceUsesOfWithOnConstant(C, Old, New, &U);
-        continue;
-      }
-    }
+  return U.usedCount(&GA) || U.compilerUsedCount(&GA);
+}
 
-    U.set(New);
+static bool hasUsesToReplace(GlobalAlias &GA, LLVMUsed &U, bool &RenameTarget) {
+  RenameTarget = false;
+  bool Ret = false;
+  if (hasUseOtherThanLLVMUsed(GA, U))
     Ret = true;
-  }
-  return Ret;
+
+  // If the alias is externally visible, we may still be able to simplify it.
+  if (!mayHaveOtherReferences(GA, U))
+    return Ret;
+
+  // If the aliasee has internal linkage, give it the name and linkage
+  // of the alias, and delete the alias.  This turns:
+  //   define internal ... @f(...)
+  //   @a = alias ... @f
+  // into:
+  //   define ... @a(...)
+  Constant *Aliasee = GA.getAliasee();
+  GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+  if (!Target->hasLocalLinkage())
+    return Ret;
+
+  // Do not perform the transform if multiple aliases potentially target the
+  // aliasee. This check also ensures that it is safe to replace the section
+  // and other attributes of the aliasee with those of the alias.
+  if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U))
+    return Ret;
+
+  RenameTarget = true;
+  return true;
 }
 
 bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
   bool Changed = false;
+  LLVMUsed Used(M);
+
+  for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.usedBegin(),
+                                               E = Used.usedEnd();
+       I != E; ++I)
+    Used.compilerUsedErase(*I);
 
   for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
        I != E;) {
@@ -3156,38 +3001,29 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
     Constant *Aliasee = J->getAliasee();
     GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
     Target->removeDeadConstantUsers();
-    bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse();
 
     // Make all users of the alias use the aliasee instead.
-    if (replaceAllNonLLVMUsedUsesWith(J, Aliasee)) {
-      ++NumAliasesResolved;
-      Changed = true;
-    }
-    if (!J->use_empty())
+    bool RenameTarget;
+    if (!hasUsesToReplace(*J, Used, RenameTarget))
       continue;
 
-    // If the alias is externally visible, we may still be able to simplify it.
-    if (!J->hasLocalLinkage()) {
-      // If the aliasee has internal linkage, give it the name and linkage
-      // of the alias, and delete the alias.  This turns:
-      //   define internal ... @f(...)
-      //   @a = alias ... @f
-      // into:
-      //   define ... @a(...)
-      if (!Target->hasLocalLinkage())
-        continue;
-
-      // Do not perform the transform if multiple aliases potentially target the
-      // aliasee. This check also ensures that it is safe to replace the section
-      // and other attributes of the aliasee with those of the alias.
-      if (!hasOneUse)
-        continue;
+    J->replaceAllUsesWith(Aliasee);
+    ++NumAliasesResolved;
+    Changed = true;
 
+    if (RenameTarget) {
       // Give the aliasee the name, linkage and other attributes of the alias.
       Target->takeName(J);
       Target->setLinkage(J->getLinkage());
       Target->GlobalValue::copyAttributesFrom(J);
-    }
+
+      if (Used.usedErase(J))
+        Used.usedInsert(Target);
+
+      if (Used.compilerUsedErase(J))
+        Used.compilerUsedInsert(Target);
+    } else if (mayHaveOtherReferences(*J, Used))
+      continue;
 
     // Delete the alias.
     M.getAliasList().erase(J);
@@ -3195,6 +3031,8 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) {
     Changed = true;
   }
 
+  Used.syncVariablesAndSets();
+
   return Changed;
 }
 
@@ -3323,8 +3161,6 @@ bool GlobalOpt::runOnModule(Module &M) {
   // Try to find the llvm.globalctors list.
   GlobalVariable *GlobalCtors = FindGlobalCtors(M);
 
-  Function *CXAAtExitFn = FindCXAAtExit(M, TLI);
-
   bool LocalChange = true;
   while (LocalChange) {
     LocalChange = false;
@@ -3342,7 +3178,9 @@ bool GlobalOpt::runOnModule(Module &M) {
     // Resolve aliases, when possible.
     LocalChange |= OptimizeGlobalAliases(M);
 
-    // Try to remove trivial global destructors.
+    // Try to remove trivial global destructors if they are not removed
+    // already.
+    Function *CXAAtExitFn = FindCXAAtExit(M, TLI);
     if (CXAAtExitFn)
       LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
 
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index a0095da..437597e 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -63,7 +63,7 @@ public:
 char AlwaysInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(AlwaysInliner, "always-inline",
                 "Inliner for always_inline functions", false, false)
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index a4f7026..57379a3 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 
 namespace {
 
-/// \brief Actaul inliner pass implementation.
+/// \brief Actual inliner pass implementation.
 ///
 /// The common implementation of the inlining logic is shared between this
 /// inliner pass and the always inliner pass. The two passes use different cost
@@ -61,7 +61,7 @@ public:
 char SimpleInliner::ID = 0;
 INITIALIZE_PASS_BEGIN(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_DEPENDENCY(InlineCostAnalysis)
 INITIALIZE_PASS_END(SimpleInliner, "inline",
                 "Function Integration/Inlining", false, false)
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 663ddb7..d75d6ca 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -116,7 +116,8 @@ static void AdjustCallerSSPLevel(Function *Caller, Function *Callee) {
 /// any new allocas to the set if not possible.
 static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
                                  InlinedArrayAllocasTy &InlinedArrayAllocas,
-                                 int InlineHistory, bool InsertLifetime) {
+                                 int InlineHistory, bool InsertLifetime,
+                                 const DataLayout *TD) {
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
 
@@ -189,6 +190,14 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
     bool MergedAwayAlloca = false;
     for (unsigned i = 0, e = AllocasForType.size(); i != e; ++i) {
       AllocaInst *AvailableAlloca = AllocasForType[i];
+
+      unsigned Align1 = AI->getAlignment(),
+               Align2 = AvailableAlloca->getAlignment();
+      // If we don't have data layout information, and only one alloca is using
+      // the target default, then we can't safely merge them because we can't
+      // pick the greater alignment.
+      if (!TD && (!Align1 || !Align2) && Align1 != Align2)
+        continue;
       
       // The available alloca has to be in the right function, not in some other
       // function in this SCC.
@@ -206,6 +215,20 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
                    << *AvailableAlloca << '\n');
       
       AI->replaceAllUsesWith(AvailableAlloca);
+
+      if (Align1 != Align2) {
+        if (!Align1 || !Align2) {
+          assert(TD && "DataLayout required to compare default alignments");
+          unsigned TypeAlign = TD->getABITypeAlignment(AI->getAllocatedType());
+
+          Align1 = Align1 ? Align1 : TypeAlign;
+          Align2 = Align2 ? Align2 : TypeAlign;
+        }
+
+        if (Align1 > Align2)
+          AvailableAlloca->setAlignment(AI->getAlignment());
+      }
+
       AI->eraseFromParent();
       MergedAwayAlloca = true;
       ++NumMergedAllocas;
@@ -482,7 +505,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
 
         // Attempt to inline the function.
         if (!InlineCallIfPossible(CS, InlineInfo, InlinedArrayAllocas,
-                                  InlineHistoryID, InsertLifetime))
+                                  InlineHistoryID, InsertLifetime, TD))
           continue;
         ++NumInlined;
         
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index 4bfab5b..64e2ced 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -11,10 +11,17 @@
 // If the function or variable is not in the list of external names given to
 // the pass it is marked as internal.
 //
+// This transformation would not be legal in a regular compilation, but it gets
+// extra information from the linker about what is safe.
+//
+// For example: Internalizing a function with external linkage. Only if we are
+// told it is only used from within this module, it is safe to do it.
+//
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "internalize"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Module.h"
@@ -22,6 +29,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <fstream>
 #include <set>
 using namespace llvm;
@@ -48,10 +57,8 @@ namespace {
   public:
     static char ID; // Pass identification, replacement for typeid
     explicit InternalizePass();
-    explicit InternalizePass(ArrayRef<const char *> exportList);
+    explicit InternalizePass(ArrayRef<const char *> ExportList);
     void LoadFile(const char *Filename);
-    void ClearExportList();
-    void AddToExportList(const std::string &val);
     virtual bool runOnModule(Module &M);
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
@@ -70,15 +77,14 @@ InternalizePass::InternalizePass()
   initializeInternalizePassPass(*PassRegistry::getPassRegistry());
   if (!APIFile.empty())           // If a filename is specified, use it.
     LoadFile(APIFile.c_str());
-  if (!APIList.empty())           // If a list is specified, use it as well.
-    ExternalNames.insert(APIList.begin(), APIList.end());
+  ExternalNames.insert(APIList.begin(), APIList.end());
 }
 
-InternalizePass::InternalizePass(ArrayRef<const char *> exportList)
+InternalizePass::InternalizePass(ArrayRef<const char *> ExportList)
   : ModulePass(ID){
   initializeInternalizePassPass(*PassRegistry::getPassRegistry());
-  for(ArrayRef<const char *>::const_iterator itr = exportList.begin();
-        itr != exportList.end(); itr++) {
+  for(ArrayRef<const char *>::const_iterator itr = ExportList.begin();
+        itr != ExportList.end(); itr++) {
     ExternalNames.insert(*itr);
   }
 }
@@ -99,12 +105,25 @@ void InternalizePass::LoadFile(const char *Filename) {
   }
 }
 
-void InternalizePass::ClearExportList() {
-  ExternalNames.clear();
-}
+static bool shouldInternalize(const GlobalValue &GV,
+                              const std::set<std::string> &ExternalNames) {
+  // Function must be defined here
+  if (GV.isDeclaration())
+    return false;
+
+  // Available externally is really just a "declaration with a body".
+  if (GV.hasAvailableExternallyLinkage())
+    return false;
+
+  // Already has internal linkage
+  if (GV.hasLocalLinkage())
+    return false;
+
+  // Marked to keep external?
+  if (ExternalNames.count(GV.getName()))
+    return false;
 
-void InternalizePass::AddToExportList(const std::string &val) {
-  ExternalNames.insert(val);
+  return true;
 }
 
 bool InternalizePass::runOnModule(Module &M) {
@@ -112,26 +131,40 @@ bool InternalizePass::runOnModule(Module &M) {
   CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
   bool Changed = false;
 
-  // Never internalize functions which code-gen might insert.
-  // FIXME: We should probably add this (and the __stack_chk_guard) via some
-  // type of call-back in CodeGen.
-  ExternalNames.insert("__stack_chk_fail");
+  SmallPtrSet<GlobalValue *, 8> Used;
+  collectUsedGlobalVariables(M, Used, false);
+
+  // We must assume that globals in llvm.used have a reference that not even
+  // the linker can see, so we don't internalize them.
+  // For llvm.compiler.used the situation is a bit fuzzy. The assembler and
+  // linker can drop those symbols. If this pass is running as part of LTO,
+  // one might think that it could just drop llvm.compiler.used. The problem
+  // is that even in LTO llvm doesn't see every reference. For example,
+  // we don't see references from function local inline assembly. To be
+  // conservative, we internalize symbols in llvm.compiler.used, but we
+  // keep llvm.compiler.used so that the symbol is not deleted by llvm.
+  for (SmallPtrSet<GlobalValue *, 8>::iterator I = Used.begin(), E = Used.end();
+       I != E; ++I) {
+    GlobalValue *V = *I;
+    ExternalNames.insert(V->getName());
+  }
 
   // Mark all functions not in the api as internal.
   // FIXME: maybe use private linkage?
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-    if (!I->isDeclaration() &&         // Function must be defined here
-        // Available externally is really just a "declaration with a body".
-        !I->hasAvailableExternallyLinkage() &&
-        !I->hasLocalLinkage() &&  // Can't already have internal linkage
-        !ExternalNames.count(I->getName())) {// Not marked to keep external?
-      I->setLinkage(GlobalValue::InternalLinkage);
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (!shouldInternalize(*I, ExternalNames))
+      continue;
+
+    I->setLinkage(GlobalValue::InternalLinkage);
+
+    if (ExternalNode)
       // Remove a callgraph edge from the external node to this function.
-      if (ExternalNode) ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
-      Changed = true;
-      ++NumFunctions;
-      DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
-    }
+      ExternalNode->removeOneAbstractEdgeTo((*CG)[I]);
+
+    Changed = true;
+    ++NumFunctions;
+    DEBUG(dbgs() << "Internalizing func " << I->getName() << "\n");
+  }
 
   // Never internalize the llvm.used symbol.  It is used to implement
   // attribute((used)).
@@ -146,35 +179,36 @@ bool InternalizePass::runOnModule(Module &M) {
   ExternalNames.insert("llvm.global.annotations");
 
   // Never internalize symbols code-gen inserts.
+  // FIXME: We should probably add this (and the __stack_chk_guard) via some
+  // type of call-back in CodeGen.
+  ExternalNames.insert("__stack_chk_fail");
   ExternalNames.insert("__stack_chk_guard");
 
   // Mark all global variables with initializers that are not in the api as
   // internal as well.
   // FIXME: maybe use private linkage?
   for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-       I != E; ++I)
-    if (!I->isDeclaration() && !I->hasLocalLinkage() &&
-        // Available externally is really just a "declaration with a body".
-        !I->hasAvailableExternallyLinkage() &&
-        !ExternalNames.count(I->getName())) {
-      I->setLinkage(GlobalValue::InternalLinkage);
-      Changed = true;
-      ++NumGlobals;
-      DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
-    }
+       I != E; ++I) {
+    if (!shouldInternalize(*I, ExternalNames))
+      continue;
+
+    I->setLinkage(GlobalValue::InternalLinkage);
+    Changed = true;
+    ++NumGlobals;
+    DEBUG(dbgs() << "Internalized gvar " << I->getName() << "\n");
+  }
 
   // Mark all aliases that are not in the api as internal as well.
   for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
-       I != E; ++I)
-    if (!I->isDeclaration() && !I->hasInternalLinkage() &&
-        // Available externally is really just a "declaration with a body".
-        !I->hasAvailableExternallyLinkage() &&
-        !ExternalNames.count(I->getName())) {
-      I->setLinkage(GlobalValue::InternalLinkage);
-      Changed = true;
-      ++NumAliases;
-      DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
-    }
+       I != E; ++I) {
+    if (!shouldInternalize(*I, ExternalNames))
+      continue;
+
+    I->setLinkage(GlobalValue::InternalLinkage);
+    Changed = true;
+    ++NumAliases;
+    DEBUG(dbgs() << "Internalized alias " << I->getName() << "\n");
+  }
 
   return Changed;
 }
@@ -183,6 +217,6 @@ ModulePass *llvm::createInternalizePass() {
   return new InternalizePass();
 }
 
-ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) {
-  return new InternalizePass(el);
+ModulePass *llvm::createInternalizePass(ArrayRef<const char *> ExportList) {
+  return new InternalizePass(ExportList);
 }
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 4ce749c..3861421 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -210,16 +210,20 @@ private:
 // Any two pointers in the same address space are equivalent, intptr_t and
 // pointers are equivalent. Otherwise, standard type equivalence rules apply.
 bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
+
+  PointerType *PTy1 = dyn_cast<PointerType>(Ty1);
+  PointerType *PTy2 = dyn_cast<PointerType>(Ty2);
+
+  if (TD) {
+    if (PTy1 && PTy1->getAddressSpace() == 0) Ty1 = TD->getIntPtrType(Ty1);
+    if (PTy2 && PTy2->getAddressSpace() == 0) Ty2 = TD->getIntPtrType(Ty2);
+  }
+
   if (Ty1 == Ty2)
     return true;
-  if (Ty1->getTypeID() != Ty2->getTypeID()) {
-    if (TD) {
-      LLVMContext &Ctx = Ty1->getContext();
-      if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true;
-      if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true;
-    }
+
+  if (Ty1->getTypeID() != Ty2->getTypeID())
     return false;
-  }
 
   switch (Ty1->getTypeID()) {
   default:
@@ -241,8 +245,7 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const {
     return true;
 
   case Type::PointerTyID: {
-    PointerType *PTy1 = cast<PointerType>(Ty1);
-    PointerType *PTy2 = cast<PointerType>(Ty2);
+    assert(PTy1 && PTy2 && "Both types must be pointers here.");
     return PTy1->getAddressSpace() == PTy2->getAddressSpace();
   }
 
@@ -352,14 +355,19 @@ bool FunctionComparator::isEquivalentOperation(const Instruction *I1,
 // Determine whether two GEP operations perform the same underlying arithmetic.
 bool FunctionComparator::isEquivalentGEP(const GEPOperator *GEP1,
                                          const GEPOperator *GEP2) {
-  // When we have target data, we can reduce the GEP down to the value in bytes
-  // added to the address.
-  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 1;
-  APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
-  if (TD &&
-      GEP1->accumulateConstantOffset(*TD, Offset1) &&
-      GEP2->accumulateConstantOffset(*TD, Offset2)) {
-    return Offset1 == Offset2;
+  unsigned AS = GEP1->getPointerAddressSpace();
+  if (AS != GEP2->getPointerAddressSpace())
+    return false;
+
+  if (TD) {
+    // When we have target data, we can reduce the GEP down to the value in bytes
+    // added to the address.
+    unsigned BitWidth = TD ? TD->getPointerSizeInBits(AS) : 1;
+    APInt Offset1(BitWidth, 0), Offset2(BitWidth, 0);
+    if (GEP1->accumulateConstantOffset(*TD, Offset1) &&
+        GEP2->accumulateConstantOffset(*TD, Offset2)) {
+      return Offset1 == Offset2;
+    }
   }
 
   if (GEP1->getPointerOperand()->getType() !=
@@ -713,6 +721,19 @@ void MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
   writeThunk(F, G);
 }
 
+// Helper for writeThunk,
+// Selects proper bitcast operation,
+// but a bit simplier then CastInst::getCastOpcode.
+static Value* createCast(IRBuilder<false> &Builder, Value *V, Type *DestTy) {
+  Type *SrcTy = V->getType();
+  if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+    return Builder.CreateIntToPtr(V, DestTy);
+  else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+    return Builder.CreatePtrToInt(V, DestTy);
+  else
+    return Builder.CreateBitCast(V, DestTy);
+}
+
 // Replace G with a simple tail call to bitcast(F). Also replace direct uses
 // of G with bitcast(F). Deletes G.
 void MergeFunctions::writeThunk(Function *F, Function *G) {
@@ -738,7 +759,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   FunctionType *FFTy = F->getFunctionType();
   for (Function::arg_iterator AI = NewG->arg_begin(), AE = NewG->arg_end();
        AI != AE; ++AI) {
-    Args.push_back(Builder.CreateBitCast(AI, FFTy->getParamType(i)));
+    Args.push_back(createCast(Builder, (Value*)AI, FFTy->getParamType(i)));
     ++i;
   }
 
@@ -748,13 +769,7 @@ void MergeFunctions::writeThunk(Function *F, Function *G) {
   if (NewG->getReturnType()->isVoidTy()) {
     Builder.CreateRetVoid();
   } else {
-    Type *RetTy = NewG->getReturnType();
-    if (CI->getType()->isIntegerTy() && RetTy->isPointerTy())
-      Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy));
-    else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy())
-      Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy));
-    else
-      Builder.CreateRet(Builder.CreateBitCast(CI, RetTy));
+    Builder.CreateRet(createCast(Builder, CI, NewG->getReturnType()));
   }
 
   NewG->copyAttributesFrom(G);
@@ -829,6 +844,18 @@ bool MergeFunctions::insert(ComparableFunction &NewF) {
 
   const ComparableFunction &OldF = *Result.first;
 
+  // Don't merge tiny functions, since it can just end up making the function
+  // larger.
+  // FIXME: Should still merge them if they are unnamed_addr and produce an
+  // alias.
+  if (NewF.getFunc()->size() == 1) {
+    if (NewF.getFunc()->front().size() <= 2) {
+      DEBUG(dbgs() << NewF.getFunc()->getName()
+            << " is to small to bother merging\n");
+      return false;
+    }
+  }
+
   // Never thunk a strong function to a weak function.
   assert(!OldF.getFunc()->mayBeOverridden() ||
          NewF.getFunc()->mayBeOverridden());
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 986c0b8..24c5018 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -29,15 +29,20 @@
 using namespace llvm;
 
 static cl::opt<bool>
-RunLoopVectorization("vectorize-loops",
+RunLoopVectorization("vectorize-loops", cl::Hidden,
                      cl::desc("Run the Loop vectorization passes"));
 
 static cl::opt<bool>
-RunSLPVectorization("vectorize-slp",
+LateVectorization("late-vectorize", cl::init(true), cl::Hidden,
+                  cl::desc("Run the vectorization pasess late in the pass "
+                           "pipeline (after the inliner)"));
+
+static cl::opt<bool>
+RunSLPVectorization("vectorize-slp", cl::Hidden,
                     cl::desc("Run the SLP vectorization passes"));
 
 static cl::opt<bool>
-RunBBVectorization("vectorize-slp-aggressive",
+RunBBVectorization("vectorize-slp-aggressive", cl::Hidden,
                     cl::desc("Run the BB vectorization passes"));
 
 static cl::opt<bool>
@@ -49,17 +54,22 @@ static cl::opt<bool> UseNewSROA("use-new-sroa",
   cl::init(true), cl::Hidden,
   cl::desc("Enable the new, experimental SROA pass"));
 
+static cl::opt<bool>
+RunLoopRerolling("reroll-loops", cl::Hidden,
+                 cl::desc("Run the loop rerolling pass"));
+
 PassManagerBuilder::PassManagerBuilder() {
     OptLevel = 2;
     SizeLevel = 0;
     LibraryInfo = 0;
     Inliner = 0;
-    DisableSimplifyLibCalls = false;
     DisableUnitAtATime = false;
     DisableUnrollLoops = false;
     BBVectorize = RunBBVectorization;
     SLPVectorize = RunSLPVectorization;
     LoopVectorize = RunLoopVectorization;
+    LateVectorize = LateVectorization;
+    RerollLoops = RunLoopRerolling;
 }
 
 PassManagerBuilder::~PassManagerBuilder() {
@@ -174,8 +184,6 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   else
     MPM.add(createScalarReplAggregatesPass(-1, false));
   MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
-  if (!DisableSimplifyLibCalls)
-    MPM.add(createSimplifyLibCallsPass());    // Library Call Optimizations
   MPM.add(createJumpThreadingPass());         // Thread jumps.
   MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
@@ -192,8 +200,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
   MPM.add(createLoopDeletionPass());          // Delete dead loops
 
-  if (LoopVectorize && OptLevel > 2)
-    MPM.add(createLoopVectorizePass());
+  if (!LateVectorize && LoopVectorize)
+      MPM.add(createLoopVectorizePass(DisableUnrollLoops));
 
   if (!DisableUnrollLoops)
     MPM.add(createLoopUnrollPass());          // Unroll small loops
@@ -213,16 +221,18 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
 
+  if (RerollLoops)
+    MPM.add(createLoopRerollPass());
   if (SLPVectorize)
-    MPM.add(createSLPVectorizerPass());     // Vectorize parallel scalar chains.
+    MPM.add(createSLPVectorizerPass());   // Vectorize parallel scalar chains.
 
   if (BBVectorize) {
     MPM.add(createBBVectorizePass());
     MPM.add(createInstructionCombiningPass());
     if (OptLevel > 1 && UseGVNAfterVectorization)
-      MPM.add(createGVNPass());                   // Remove redundancies
+      MPM.add(createGVNPass());           // Remove redundancies
     else
-      MPM.add(createEarlyCSEPass());              // Catch trivial redundancies
+      MPM.add(createEarlyCSEPass());      // Catch trivial redundancies
 
     // BBVectorize may have significantly shortened a loop body; unroll again.
     if (!DisableUnrollLoops)
@@ -230,9 +240,25 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   }
 
   MPM.add(createAggressiveDCEPass());         // Delete dead instructions
-  MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
+  MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
   MPM.add(createInstructionCombiningPass());  // Clean up after everything.
 
+  // As an experimental mode, run any vectorization passes in a separate
+  // pipeline from the CGSCC pass manager that runs iteratively with the
+  // inliner.
+  if (LateVectorize && LoopVectorize) {
+    // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
+    // pass manager that we are specifically trying to avoid. To prevent this
+    // we must insert a no-op module pass to reset the pass manager.
+    MPM.add(createBarrierNoopPass());
+
+    // Add the various vectorization passes and relevant cleanup passes for
+    // them since we are no longer in the middle of the main scalar pipeline.
+    MPM.add(createLoopVectorizePass(DisableUnrollLoops));
+    MPM.add(createInstructionCombiningPass());
+    MPM.add(createCFGSimplificationPass());
+  }
+
   if (!DisableUnitAtATime) {
     // FIXME: We shouldn't bother with this anymore.
     MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
@@ -257,11 +283,8 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   // Now that composite has been compiled, scan through the module, looking
   // for a main function.  If main is defined, mark all other functions
   // internal.
-  if (Internalize) {
-    std::vector<const char*> E;
-    E.push_back("main");
-    PM.add(createInternalizePass(E));
-  }
+  if (Internalize)
+    PM.add(createInternalizePass("main"));
 
   // Propagate constants at call sites into the functions they call.  This
   // opens opportunities for globalopt (and inlining) by substituting function
@@ -302,6 +325,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   // The IPO passes may leave cruft around.  Clean up after them.
   PM.add(createInstructionCombiningPass());
   PM.add(createJumpThreadingPass());
+
   // Break up allocas
   if (UseNewSROA)
     PM.add(createSROAPass());
@@ -315,6 +339,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
   PM.add(createLICMPass());                 // Hoist loop invariants.
   PM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
   PM.add(createMemCpyOptPass());            // Remove dead memcpys.
+
   // Nuke dead stores.
   PM.add(createDeadStoreEliminationPass());
 
@@ -379,8 +404,7 @@ LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
 void
 LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
                                                  LLVMBool Value) {
-  PassManagerBuilder *Builder = unwrap(PMB);
-  Builder->DisableSimplifyLibCalls = Value;
+  // NOTE: The simplify-libcalls pass has been removed.
 }
 
 void
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index 73d9323..b160913 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -51,7 +51,7 @@ namespace {
 char PruneEH::ID = 0;
 INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
                 "Remove unused exception handling info", false, false)
-INITIALIZE_AG_DEPENDENCY(CallGraph)
+INITIALIZE_PASS_DEPENDENCY(CallGraph)
 INITIALIZE_PASS_END(PruneEH, "prune-eh",
                 "Remove unused exception handling info", false, false)
 
@@ -145,15 +145,13 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
         NewAttributes.addAttribute(Attribute::NoReturn);
 
       Function *F = (*I)->getFunction();
-      const AttributeSet &PAL = F->getAttributes();
-      const AttributeSet &NPAL =
-        PAL.addAttributes(F->getContext(), AttributeSet::FunctionIndex,
-                          AttributeSet::get(F->getContext(),
-                                            AttributeSet::FunctionIndex,
-                                            NewAttributes));
+      const AttributeSet &PAL = F->getAttributes().getFnAttributes();
+      const AttributeSet &NPAL = AttributeSet::get(
+          F->getContext(), AttributeSet::FunctionIndex, NewAttributes);
+
       if (PAL != NPAL) {
         MadeChange = true;
-        F->setAttributes(NPAL);
+        F->addAttributes(AttributeSet::FunctionIndex, NPAL);
       }
     }
 
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 3396f79..c4f5cfc 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -9,7 +9,7 @@
 //
 // The StripSymbols transformation implements code stripping. Specifically, it
 // can delete:
-// 
+//
 //   * names for virtual registers
 //   * symbols for internal globals and functions
 //   * debug information
@@ -39,7 +39,7 @@ namespace {
     bool OnlyDebugInfo;
   public:
     static char ID; // Pass identification, replacement for typeid
-    explicit StripSymbols(bool ODI = false) 
+    explicit StripSymbols(bool ODI = false)
       : ModulePass(ID), OnlyDebugInfo(ODI) {
         initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
       }
@@ -144,7 +144,7 @@ static void RemoveDeadConstant(Constant *C) {
   assert(C->use_empty() && "Constant is not dead!");
   SmallPtrSet<Constant*, 4> Operands;
   for (unsigned i = 0, e = C->getNumOperands(); i != e; ++i)
-    if (OnlyUsedBy(C->getOperand(i), C)) 
+    if (OnlyUsedBy(C->getOperand(i), C))
       Operands.insert(cast<Constant>(C->getOperand(i)));
   if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
     if (!GV->hasLocalLinkage()) return;   // Don't delete non static globals.
@@ -182,7 +182,7 @@ static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
   for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
     StructType *STy = StructTypes[i];
     if (STy->isLiteral() || STy->getName().empty()) continue;
-    
+
     if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
       continue;
 
@@ -199,7 +199,7 @@ static void findUsedValues(GlobalVariable *LLVMUsed,
   ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
 
   for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
-    if (GlobalValue *GV = 
+    if (GlobalValue *GV =
           dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
       UsedValues.insert(GV);
 }
@@ -217,71 +217,20 @@ static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
       if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
         I->setName("");     // Internal symbols can't participate in linkage
   }
-  
+
   for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
     if (I->hasLocalLinkage() && llvmUsedValues.count(I) == 0)
       if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
         I->setName("");     // Internal symbols can't participate in linkage
     StripSymtab(I->getValueSymbolTable(), PreserveDbgInfo);
   }
-  
+
   // Remove all names from types.
   StripTypeNames(M, PreserveDbgInfo);
 
   return true;
 }
 
-// StripDebugInfo - Strip debug info in the module if it exists.  
-// To do this, we remove llvm.dbg.func.start, llvm.dbg.stoppoint, and 
-// llvm.dbg.region.end calls, and any globals they point to if now dead.
-static bool StripDebugInfo(Module &M) {
-
-  bool Changed = false;
-
-  // Remove all of the calls to the debugger intrinsics, and remove them from
-  // the module.
-  if (Function *Declare = M.getFunction("llvm.dbg.declare")) {
-    while (!Declare->use_empty()) {
-      CallInst *CI = cast<CallInst>(Declare->use_back());
-      CI->eraseFromParent();
-    }
-    Declare->eraseFromParent();
-    Changed = true;
-  }
-
-  if (Function *DbgVal = M.getFunction("llvm.dbg.value")) {
-    while (!DbgVal->use_empty()) {
-      CallInst *CI = cast<CallInst>(DbgVal->use_back());
-      CI->eraseFromParent();
-    }
-    DbgVal->eraseFromParent();
-    Changed = true;
-  }
-
-  for (Module::named_metadata_iterator NMI = M.named_metadata_begin(),
-         NME = M.named_metadata_end(); NMI != NME;) {
-    NamedMDNode *NMD = NMI;
-    ++NMI;
-    if (NMD->getName().startswith("llvm.dbg.")) {
-      NMD->eraseFromParent();
-      Changed = true;
-    }
-  }
-
-  for (Module::iterator MI = M.begin(), ME = M.end(); MI != ME; ++MI)
-    for (Function::iterator FI = MI->begin(), FE = MI->end(); FI != FE;
-         ++FI)
-      for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE;
-           ++BI) {
-        if (!BI->getDebugLoc().isUnknown()) {
-          Changed = true;
-          BI->setDebugLoc(DebugLoc());
-        }
-      }
-
-  return Changed;
-}
-
 bool StripSymbols::runOnModule(Module &M) {
   bool Changed = false;
   Changed |= StripDebugInfo(M);
@@ -307,13 +256,13 @@ bool StripDebugDeclare::runOnModule(Module &M) {
       assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
       CI->eraseFromParent();
       if (Arg1->use_empty()) {
-        if (Constant *C = dyn_cast<Constant>(Arg1)) 
+        if (Constant *C = dyn_cast<Constant>(Arg1))
           DeadConstants.push_back(C);
-        else 
+        else
           RecursivelyDeleteTriviallyDeadInstructions(Arg1);
       }
       if (Arg2->use_empty())
-        if (Constant *C = dyn_cast<Constant>(Arg2)) 
+        if (Constant *C = dyn_cast<Constant>(Arg2))
           DeadConstants.push_back(C);
     }
     Declare->eraseFromParent();
@@ -332,81 +281,107 @@ bool StripDebugDeclare::runOnModule(Module &M) {
   return true;
 }
 
-/// getRealLinkageName - If special LLVM prefix that is used to inform the asm 
-/// printer to not emit usual symbol prefix before the symbol name is used then
-/// return linkage name after skipping this special LLVM prefix.
-static StringRef getRealLinkageName(StringRef LinkageName) {
-  char One = '\1';
-  if (LinkageName.startswith(StringRef(&One, 1)))
-    return LinkageName.substr(1);
-  return LinkageName;
-}
-
+/// Remove any debug info for global variables/functions in the given module for
+/// which said global variable/function no longer exists (i.e. is null).
+///
+/// Debugging information is encoded in llvm IR using metadata. This is designed
+/// such a way that debug info for symbols preserved even if symbols are
+/// optimized away by the optimizer. This special pass removes debug info for
+/// such symbols.
 bool StripDeadDebugInfo::runOnModule(Module &M) {
   bool Changed = false;
 
-  // Debugging infomration is encoded in llvm IR using metadata. This is designed
-  // such a way that debug info for symbols preserved even if symbols are
-  // optimized away by the optimizer. This special pass removes debug info for 
-  // such symbols.
-
-  // llvm.dbg.gv keeps track of debug info for global variables.
-  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.gv")) {
-    SmallVector<MDNode *, 8> MDs;
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      if (DIGlobalVariable(NMD->getOperand(i)).Verify())
-        MDs.push_back(NMD->getOperand(i));
-      else
-        Changed = true;
-    NMD->eraseFromParent();
-    NMD = NULL;
-
-    for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(),
-           E = MDs.end(); I != E; ++I) {
-      GlobalVariable *GV = DIGlobalVariable(*I).getGlobal();
-      if (GV && M.getGlobalVariable(GV->getName(), true)) {
-        if (!NMD)
-          NMD = M.getOrInsertNamedMetadata("llvm.dbg.gv");
-        NMD->addOperand(*I);
-      }
+  LLVMContext &C = M.getContext();
+
+  // Find all debug info in F. This is actually overkill in terms of what we
+  // want to do, but we want to try and be as resilient as possible in the face
+  // of potential debug info changes by using the formal interfaces given to us
+  // as much as possible.
+  DebugInfoFinder F;
+  F.processModule(M);
+
+  // For each compile unit, find the live set of global variables/functions and
+  // replace the current list of potentially dead global variables/functions
+  // with the live list.
+  SmallVector<Value *, 64> LiveGlobalVariables;
+  SmallVector<Value *, 64> LiveSubprograms;
+  DenseSet<const MDNode *> VisitedSet;
+
+  for (DebugInfoFinder::iterator CI = F.compile_unit_begin(),
+         CE = F.compile_unit_end(); CI != CE; ++CI) {
+    // Create our compile unit.
+    DICompileUnit DIC(*CI);
+    assert(DIC.Verify() && "DIC must verify as a DICompileUnit.");
+
+    // Create our live subprogram list.
+    DIArray SPs = DIC.getSubprograms();
+    bool SubprogramChange = false;
+    for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
+      DISubprogram DISP(SPs.getElement(i));
+      assert(DISP.Verify() && "DISP must verify as a DISubprogram.");
+
+      // Make sure we visit each subprogram only once.
+      if (!VisitedSet.insert(DISP).second)
+        continue;
+
+      // If the function referenced by DISP is not null, the function is live.
+      if (DISP.getFunction())
+        LiveSubprograms.push_back(DISP);
       else
-        Changed = true;
+        SubprogramChange = true;
     }
-  }
 
-  // llvm.dbg.sp keeps track of debug info for subprograms.
-  if (NamedMDNode *NMD = M.getNamedMetadata("llvm.dbg.sp")) {
-    SmallVector<MDNode *, 8> MDs;
-    for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i)
-      if (DISubprogram(NMD->getOperand(i)).Verify())
-        MDs.push_back(NMD->getOperand(i));
+    // Create our live global variable list.
+    DIArray GVs = DIC.getGlobalVariables();
+    bool GlobalVariableChange = false;
+    for (unsigned i = 0, e = GVs.getNumElements(); i != e; ++i) {
+      DIGlobalVariable DIG(GVs.getElement(i));
+      assert(DIG.Verify() && "DIG must verify as DIGlobalVariable.");
+
+      // Make sure we only visit each global variable only once.
+      if (!VisitedSet.insert(DIG).second)
+        continue;
+
+      // If the global variable referenced by DIG is not null, the global
+      // variable is live.
+      if (DIG.getGlobal())
+        LiveGlobalVariables.push_back(DIG);
       else
-        Changed = true;
-    NMD->eraseFromParent();
-    NMD = NULL;
-
-    for (SmallVector<MDNode *, 8>::iterator I = MDs.begin(),
-           E = MDs.end(); I != E; ++I) {
-      bool FnIsLive = false;
-      if (Function *F = DISubprogram(*I).getFunction())
-        if (M.getFunction(F->getName()))
-          FnIsLive = true;
-      if (FnIsLive) {
-          if (!NMD)
-            NMD = M.getOrInsertNamedMetadata("llvm.dbg.sp");
-          NMD->addOperand(*I);
-      } else {
-        // Remove llvm.dbg.lv.fnname named mdnode which may have been used
-        // to hold debug info for dead function's local variables.
-        StringRef FName = DISubprogram(*I).getLinkageName();
-        if (FName.empty())
-          FName = DISubprogram(*I).getName();
-        if (NamedMDNode *LVNMD = 
-            M.getNamedMetadata(Twine("llvm.dbg.lv.", 
-                                     getRealLinkageName(FName)))) 
-          LVNMD->eraseFromParent();
-      }
+        GlobalVariableChange = true;
+    }
+
+    // If we found dead subprograms or global variables, replace the current
+    // subprogram list/global variable list with our new live subprogram/global
+    // variable list.
+    if (SubprogramChange) {
+      // Make sure that 9 is still the index of the subprograms. This is to make
+      // sure that an assert is hit if the location of the subprogram array
+      // changes. This is just to make sure that this is updated if such an
+      // event occurs.
+      assert(DIC->getNumOperands() >= 10 &&
+             SPs == DIC->getOperand(9) &&
+             "DICompileUnits is expected to store Subprograms in operand "
+             "9.");
+      DIC->replaceOperandWith(9, MDNode::get(C, LiveSubprograms));
+      Changed = true;
     }
+
+    if (GlobalVariableChange) {
+      // Make sure that 10 is still the index of global variables. This is to
+      // make sure that an assert is hit if the location of the subprogram array
+      // changes. This is just to make sure that this index is updated if such
+      // an event occurs.
+      assert(DIC->getNumOperands() >= 11 &&
+             GVs == DIC->getOperand(10) &&
+             "DICompileUnits is expected to store Global Variables in operand "
+             "10.");
+      DIC->replaceOperandWith(10, MDNode::get(C, LiveGlobalVariables));
+      Changed = true;
+    }
+
+    // Reset lists for the next iteration.
+    LiveSubprograms.clear();
+    LiveGlobalVariables.clear();
   }
 
   return Changed;
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 2a36074..a5eddc2 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -1,4 +1,4 @@
-//===- InstCombine.h - Main InstCombine pass definition -------------------===//
+//===- InstCombine.h - Main InstCombine pass definition ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -158,8 +158,8 @@ public:
                               ConstantInt *DivRHS);
   Instruction *FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *DivI,
                               ConstantInt *DivRHS);
-  Instruction *FoldICmpAddOpCst(ICmpInst &ICI, Value *X, ConstantInt *CI,
-                                ICmpInst::Predicate Pred, Value *TheAdd);
+  Instruction *FoldICmpAddOpCst(Instruction &ICI, Value *X, ConstantInt *CI,
+                                ICmpInst::Predicate Pred);
   Instruction *FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
                            ICmpInst::Predicate Cond, Instruction &I);
   Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
@@ -178,6 +178,7 @@ public:
   Instruction *visitPtrToInt(PtrToIntInst &CI);
   Instruction *visitIntToPtr(IntToPtrInst &CI);
   Instruction *visitBitCast(BitCastInst &CI);
+  Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
   Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
                               Instruction *FI);
   Instruction *FoldSelectIntoOp(SelectInst &SI, Value*, Value*);
@@ -212,8 +213,8 @@ private:
   bool ShouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
   Value *dyn_castFNegVal(Value *V, bool NoSignedZero=false) const;
-  Type *FindElementAtOffset(Type *Ty, int64_t Offset,
-                                  SmallVectorImpl<Value*> &NewIndices);
+  Type *FindElementAtOffset(Type *PtrTy, int64_t Offset,
+                            SmallVectorImpl<Value*> &NewIndices);
   Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
 
   /// ShouldOptimizeCast - Return true if the cast from "V to Ty" actually
@@ -234,6 +235,7 @@ private:
   bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS);
   Value *EmitGEPOffset(User *GEP);
   Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
+  Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask);
 
 public:
   // InsertNewInstBefore - insert an instruction New before instruction Old
@@ -270,7 +272,7 @@ public:
     if (&I == V)
       V = UndefValue::get(I.getType());
 
-    DEBUG(errs() << "IC: Replacing " << I << "\n"
+    DEBUG(dbgs() << "IC: Replacing " << I << "\n"
                     "    with " << *V << '\n');
 
     I.replaceAllUsesWith(V);
@@ -282,7 +284,7 @@ public:
   // instruction.  Instead, visit methods should return the value returned by
   // this function.
   Instruction *EraseInstFromFunction(Instruction &I) {
-    DEBUG(errs() << "IC: ERASE " << I << '\n');
+    DEBUG(dbgs() << "IC: ERASE " << I << '\n');
 
     assert(I.use_empty() && "Cannot erase instruction that is used!");
     // Make sure that we reprocess all operands now that we reduced their
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 166f8df..534feb8 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
@@ -488,7 +489,7 @@ Value *FAddCombine::performFactorization(Instruction *I) {
                       createFSub(AddSub0, AddSub1);
   if (ConstantFP *CFP = dyn_cast<ConstantFP>(NewAddSub)) {
     const APFloat &F = CFP->getValueAPF();
-    if (!F.isNormal() || F.isDenormal())
+    if (!F.isNormal())
       return 0;
   }
 
@@ -659,7 +660,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
     }
   }
 
-  assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) &&
+  assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
          "out-of-bound access");
 
   if (ConstAdd)
@@ -876,7 +877,7 @@ static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
       uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
       uint32_t CSTVal = CST->getLimitedValue(BitWidth);
       CST = ConstantInt::get(V->getType()->getContext(),
-                             APInt(BitWidth, 1).shl(CSTVal));
+                             APInt::getOneBitSet(BitWidth, CSTVal));
       return I->getOperand(0);
     }
   return 0;
@@ -1185,9 +1186,15 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
   if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), TD))
     return ReplaceInstUsesWith(I, V);
 
-  if (isa<Constant>(RHS) && isa<PHINode>(LHS))
-    if (Instruction *NV = FoldOpIntoPhi(I))
-      return NV;
+  if (isa<Constant>(RHS)) {
+    if (isa<PHINode>(LHS))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+
+    if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
+      if (Instruction *NV = FoldOpIntoSelect(I, SI))
+        return NV;
+  }
 
   // -A + B  -->  B - A
   // -A + -B  -->  -(A + B)
@@ -1516,9 +1523,33 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
   if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), TD))
     return ReplaceInstUsesWith(I, V);
 
-  // If this is a 'B = x-(-A)', change to B = x+A...
-  if (Value *V = dyn_castFNegVal(Op1))
-    return BinaryOperator::CreateFAdd(Op0, V);
+  if (isa<Constant>(Op0))
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *NV = FoldOpIntoSelect(I, SI))
+        return NV;
+
+  // If this is a 'B = x-(-A)', change to B = x+A, potentially looking
+  // through FP extensions/truncations along the way.
+  if (Value *V = dyn_castFNegVal(Op1)) {
+    Instruction *NewI = BinaryOperator::CreateFAdd(Op0, V);
+    NewI->copyFastMathFlags(&I);
+    return NewI;
+  }
+  if (FPTruncInst *FPTI = dyn_cast<FPTruncInst>(Op1)) {
+    if (Value *V = dyn_castFNegVal(FPTI->getOperand(0))) {
+      Value *NewTrunc = Builder->CreateFPTrunc(V, I.getType());
+      Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewTrunc);
+      NewI->copyFastMathFlags(&I);
+      return NewI;
+    }
+  } else if (FPExtInst *FPEI = dyn_cast<FPExtInst>(Op1)) {
+    if (Value *V = dyn_castFNegVal(FPEI->getOperand(0))) {
+      Value *NewExt = Builder->CreateFPExt(V, I.getType());
+      Instruction *NewI = BinaryOperator::CreateFAdd(Op0, NewExt);
+      NewI->copyFastMathFlags(&I);
+      return NewI;
+    }
+  }
 
   if (I.hasUnsafeAlgebra()) {
     if (Value *V = FAddCombine(Builder).simplify(&I))
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index ec75dd2..88bb69b 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -173,14 +173,14 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
       // Adding a one to a single bit bit-field should be turned into an XOR
       // of the bit.  First thing to check is to see if this AND is with a
       // single bit constant.
-      const APInt &AndRHSV = cast<ConstantInt>(AndRHS)->getValue();
+      const APInt &AndRHSV = AndRHS->getValue();
 
       // If there is only one bit set.
       if (AndRHSV.isPowerOf2()) {
         // Ok, at this point, we know that we are masking the result of the
         // ADD down to exactly one bit.  If the constant we are adding has
         // no bits set below this bit, then we can eliminate the ADD.
-        const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue();
+        const APInt& AddRHS = OpRHS->getValue();
 
         // Check to see if any bits below the one bit set in AndRHSV are set.
         if ((AddRHS & (AndRHSV-1)) == 0) {
@@ -209,8 +209,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
     uint32_t BitWidth = AndRHS->getType()->getBitWidth();
     uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
     APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal));
-    ConstantInt *CI = ConstantInt::get(AndRHS->getContext(),
-                                       AndRHS->getValue() & ShlMask);
+    ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShlMask);
 
     if (CI->getValue() == ShlMask)
       // Masking out bits that the shift already masks.
@@ -230,8 +229,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
     uint32_t BitWidth = AndRHS->getType()->getBitWidth();
     uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
     APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
-    ConstantInt *CI = ConstantInt::get(Op->getContext(),
-                                       AndRHS->getValue() & ShrMask);
+    ConstantInt *CI = Builder->getInt(AndRHS->getValue() & ShrMask);
 
     if (CI->getValue() == ShrMask)
       // Masking out bits that the shift already masks.
@@ -251,8 +249,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
       uint32_t BitWidth = AndRHS->getType()->getBitWidth();
       uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
       APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
-      Constant *C = ConstantInt::get(Op->getContext(),
-                                     AndRHS->getValue() & ShrMask);
+      Constant *C = Builder->getInt(AndRHS->getValue() & ShrMask);
       if (C == AndRHS) {          // Masking out bits shifted in.
         // (Val ashr C1) & C2 -> (Val lshr C1) & C2
         // Make the argument unsigned.
@@ -279,7 +276,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
 
   if (Inside) {
     if (Lo == Hi)  // Trivially false.
-      return ConstantInt::getFalse(V->getContext());
+      return Builder->getFalse();
 
     // V >= Min && V < Hi --> V < Hi
     if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
@@ -296,7 +293,7 @@ Value *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
   }
 
   if (Lo == Hi)  // Trivially true.
-    return ConstantInt::getTrue(V->getContext());
+    return Builder->getTrue();
 
   // V < Min || V >= Hi -> V > Hi-1
   Hi = SubOne(cast<ConstantInt>(Hi));
@@ -491,6 +488,26 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
   return result;
 }
 
+/// Convert an analysis of a masked ICmp into its equivalent if all boolean
+/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
+/// is adjacent to the corresponding normal flag (recording ==), this just
+/// involves swapping those bits over.
+static unsigned conjugateICmpMask(unsigned Mask) {
+  unsigned NewMask;
+  NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes |
+                     FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed |
+                     FoldMskICmp_BMask_Mixed))
+            << 1;
+
+  NewMask |=
+      (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes |
+               FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed |
+               FoldMskICmp_BMask_NotMixed))
+      >> 1;
+
+  return NewMask;
+}
+
 /// decomposeBitTestICmp - Decompose an icmp into the form ((X & Y) pred Z)
 /// if possible. The returned predicate is either == or !=. Returns false if
 /// decomposition fails.
@@ -551,14 +568,22 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
     L21 = L22 = L1 = 0;
   } else {
     // Look for ANDs in the LHS icmp.
-    if (match(L1, m_And(m_Value(L11), m_Value(L12)))) {
-      if (!match(L2, m_And(m_Value(L21), m_Value(L22))))
-        L21 = L22 = 0;
-    } else {
-      if (!match(L2, m_And(m_Value(L11), m_Value(L12))))
-        return 0;
-      std::swap(L1, L2);
+    if (!L1->getType()->isIntegerTy()) {
+      // You can icmp pointers, for example. They really aren't masks.
+      L11 = L12 = 0;
+    } else if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+      // Any icmp can be viewed as being trivially masked; if it allows us to
+      // remove one, it's worth it.
+      L11 = L1;
+      L12 = Constant::getAllOnesValue(L1->getType());
+    }
+
+    if (!L2->getType()->isIntegerTy()) {
+      // You can icmp pointers, for example. They really aren't masks.
       L21 = L22 = 0;
+    } else if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
+      L21 = L2;
+      L22 = Constant::getAllOnesValue(L2->getType());
     }
   }
 
@@ -579,7 +604,14 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
       return 0;
     }
     E = R2; R1 = 0; ok = true;
-  } else if (match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+  } else if (R1->getType()->isIntegerTy()) {
+    if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+      // As before, model no mask as a trivial mask if it'll let us do an
+      // optimisation.
+      R11 = R1;
+      R12 = Constant::getAllOnesValue(R1->getType());
+    }
+
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
       A = R11; D = R12; E = R2; ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -592,7 +624,12 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
     return 0;
 
   // Look for ANDs in on the right side of the RHS icmp.
-  if (!ok && match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+  if (!ok && R2->getType()->isIntegerTy()) {
+    if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+      R11 = R2;
+      R12 = Constant::getAllOnesValue(R2->getType());
+    }
+
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
       A = R11; D = R12; E = R1; ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
@@ -621,8 +658,7 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
 /// foldLogOpOfMaskedICmps:
 /// try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
 /// into a single (icmp(A & X) ==/!= Y)
-static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
-                                     ICmpInst::Predicate NEWCC,
+static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
                                      llvm::InstCombiner::BuilderTy* Builder) {
   Value *A = 0, *B = 0, *C = 0, *D = 0, *E = 0;
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
@@ -632,8 +668,24 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
   assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
          "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
 
-  if (NEWCC == ICmpInst::ICMP_NE)
-    mask >>= 1; // treat "Not"-states as normal states
+  // In full generality:
+  //     (icmp (A & B) Op C) | (icmp (A & D) Op E)
+  // ==  ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
+  //
+  // If the latter can be converted into (icmp (A & X) Op Y) then the former is
+  // equivalent to (icmp (A & X) !Op Y).
+  //
+  // Therefore, we can pretend for the rest of this function that we're dealing
+  // with the conjunction, provided we flip the sense of any comparisons (both
+  // input and output).
+
+  // In most cases we're going to produce an EQ for the "&&" case.
+  ICmpInst::Predicate NEWCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+  if (!IsAnd) {
+    // Convert the masking analysis into its equivalent with negated
+    // comparisons.
+    mask = conjugateICmpMask(mask);
+  }
 
   if (mask & FoldMskICmp_Mask_AllZeroes) {
     // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
@@ -660,6 +712,40 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
     Value* newAnd = Builder->CreateAnd(A, newAnd1);
     return Builder->CreateICmp(NEWCC, newAnd, A);
   }
+
+  // Remaining cases assume at least that B and D are constant, and depend on
+  // their actual values. This isn't strictly, necessary, just a "handle the
+  // easy cases for now" decision.
+  ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+  if (BCst == 0) return 0;
+  ConstantInt *DCst = dyn_cast<ConstantInt>(D);
+  if (DCst == 0) return 0;
+
+  if (mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+    // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+    // Only valid if one of the masks is a superset of the other (check "B&D" is
+    // the same as either B or D).
+    APInt NewMask = BCst->getValue() & DCst->getValue();
+
+    if (NewMask == BCst->getValue())
+      return LHS;
+    else if (NewMask == DCst->getValue())
+      return RHS;
+  }
+  if (mask & FoldMskICmp_AMask_NotAllOnes) {
+    // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+    // Only valid if one of the masks is a superset of the other (check "B|D" is
+    // the same as either B or D).
+    APInt NewMask = BCst->getValue() | DCst->getValue();
+
+    if (NewMask == BCst->getValue())
+      return LHS;
+    else if (NewMask == DCst->getValue())
+      return RHS;
+  }
   if (mask & FoldMskICmp_BMask_Mixed) {
     // (icmp eq (A & B), C) & (icmp eq (A & D), E)
     // We already know that B & C == C && D & E == E.
@@ -668,14 +754,9 @@ static Value* foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS,
     // contradict, then we can transform to
     // -> (icmp eq (A & (B|D)), (C|E))
     // Currently, we only handle the case of B, C, D, and E being constant.
-    ConstantInt *BCst = dyn_cast<ConstantInt>(B);
-    if (BCst == 0) return 0;
-    ConstantInt *DCst = dyn_cast<ConstantInt>(D);
-    if (DCst == 0) return 0;
     // we can't simply use C and E, because we might actually handle
     //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
     // with B and D, having a single bit set
-
     ConstantInt *CCst = dyn_cast<ConstantInt>(C);
     if (CCst == 0) return 0;
     if (LHSCC != NEWCC)
@@ -718,7 +799,7 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   }
 
   // handle (roughly):  (icmp eq (A & B), C) & (icmp eq (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_EQ, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
     return V;
 
   // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
@@ -852,10 +933,15 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
     case ICmpInst::ICMP_SGT:        // (X != 13 & X s> 15) -> X s> 15
       return RHS;
     case ICmpInst::ICMP_NE:
+      // Special case to get the ordering right when the values wrap around
+      // zero.
+      if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue())
+        std::swap(LHSCst, RHSCst);
       if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
         Constant *AddCST = ConstantExpr::getNeg(LHSCst);
         Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
-        return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1));
+        return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1),
+                                      Val->getName()+".cmp");
       }
       break;                        // (X != 13 & X != 15) -> no change
     }
@@ -943,7 +1029,7 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
         // If either of the constants are nans, then the whole thing returns
         // false.
         if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
-          return ConstantInt::getFalse(LHS->getContext());
+          return Builder->getFalse();
         return Builder->CreateFCmpORD(LHS->getOperand(0), RHS->getOperand(0));
       }
 
@@ -1302,7 +1388,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
 /// always in the local (OverallLeftShift) coordinate space.
 ///
 static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
-                              SmallVector<Value*, 8> &ByteValues) {
+                              SmallVectorImpl<Value *> &ByteValues) {
   if (Instruction *I = dyn_cast<Instruction>(V)) {
     // If this is an or instruction, it may be an inner node of the bswap.
     if (I->getOpcode() == Instruction::Or) {
@@ -1380,7 +1466,7 @@ static bool CollectBSwapParts(Value *V, int OverallLeftShift, uint32_t ByteMask,
   // into a byteswap.  At least one of the two bytes would not be aligned with
   // their ultimate destination.
   if (!isPowerOf2_32(ByteMask)) return true;
-  unsigned InputByteNo = CountTrailingZeros_32(ByteMask);
+  unsigned InputByteNo = countTrailingZeros(ByteMask);
 
   // 2) The input and ultimate destinations must line up: if byte 3 of an i32
   // is demanded, it needs to go into byte 0 of the result.  This means that the
@@ -1457,10 +1543,60 @@ static Instruction *MatchSelectFromAndOr(Value *A, Value *B,
   return 0;
 }
 
+/// IsOneHotValue - Returns true for "one-hot" values (values where at most
+/// one bit can be set).
+static bool IsOneHotValue(Value *V) {
+  // Match 1<<K.
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V))
+    if (BO->getOpcode() == Instruction::Shl) {
+      ConstantInt *One = dyn_cast<ConstantInt>(BO->getOperand(0));
+      return One && One->isOne();
+    }
+
+  // Check for power of two integer constants.
+  if (ConstantInt *K = dyn_cast<ConstantInt>(V))
+    return K->getValue().isPowerOf2();
+
+  return false;
+}
+
 /// FoldOrOfICmps - Fold (icmp)|(icmp) if possible.
 Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
 
+  // Fold (iszero(A & K1) | iszero(A & K2)) ->  (A & (K1 | K2)) != (K1 | K2)
+  // if K1 and K2 are a one-bit mask.
+  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+
+  if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() &&
+      RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
+
+    BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
+    BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
+    if (LAnd && RAnd && LAnd->hasOneUse() && RHS->hasOneUse() &&
+        LAnd->getOpcode() == Instruction::And &&
+        RAnd->getOpcode() == Instruction::And) {
+
+      Value *Mask = 0;
+      Value *Masked = 0;
+      if (LAnd->getOperand(0) == RAnd->getOperand(0) &&
+          IsOneHotValue(LAnd->getOperand(1)) &&
+          IsOneHotValue(RAnd->getOperand(1))) {
+        Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1));
+        Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask);
+      } else if (LAnd->getOperand(1) == RAnd->getOperand(1) &&
+                 IsOneHotValue(LAnd->getOperand(0)) &&
+                 IsOneHotValue(RAnd->getOperand(0))) {
+        Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0));
+        Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask);
+      }
+
+      if (Masked)
+        return Builder->CreateICmp(ICmpInst::ICMP_NE, Masked, Mask);
+    }
+  }
+
   // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
   if (PredicatesFoldable(LHSCC, RHSCC)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
@@ -1477,13 +1613,37 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
 
   // handle (roughly):
   // (icmp ne (A & B), C) | (icmp ne (A & D), E)
-  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, ICmpInst::ICMP_NE, Builder))
+  if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
     return V;
 
-  // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
   Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
-  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
-  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (LHS->hasOneUse() || RHS->hasOneUse()) {
+    // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
+    // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
+    Value *A = 0, *B = 0;
+    if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) {
+      B = Val;
+      if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1))
+        A = Val2;
+      else if (RHSCC == ICmpInst::ICMP_UGT && Val == Val2)
+        A = RHS->getOperand(1);
+    }
+    // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
+    // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
+    else if (RHSCC == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
+      B = Val2;
+      if (LHSCC == ICmpInst::ICMP_ULT && Val2 == LHS->getOperand(1))
+        A = Val;
+      else if (LHSCC == ICmpInst::ICMP_UGT && Val2 == Val)
+        A = LHS->getOperand(1);
+    }
+    if (A && B)
+      return Builder->CreateICmp(
+          ICmpInst::ICMP_UGE,
+          Builder->CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A);
+  }
+
+  // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
   if (LHSCst == 0 || RHSCst == 0) return 0;
 
   if (LHSCst == RHSCst && LHSCC == RHSCC) {
@@ -1588,7 +1748,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
     case ICmpInst::ICMP_NE:          // (X != 13 | X != 15) -> true
     case ICmpInst::ICMP_ULT:         // (X != 13 | X u< 15) -> true
     case ICmpInst::ICMP_SLT:         // (X != 13 | X s< 15) -> true
-      return ConstantInt::getTrue(LHS->getContext());
+      return Builder->getTrue();
     }
   case ICmpInst::ICMP_ULT:
     switch (RHSCC) {
@@ -1640,7 +1800,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
       break;
     case ICmpInst::ICMP_NE:         // (X u> 13 | X != 15) -> true
     case ICmpInst::ICMP_ULT:        // (X u> 13 | X u< 15) -> true
-      return ConstantInt::getTrue(LHS->getContext());
+      return Builder->getTrue();
     case ICmpInst::ICMP_SLT:        // (X u> 13 | X s< 15) -> no change
       break;
     }
@@ -1655,7 +1815,7 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
       break;
     case ICmpInst::ICMP_NE:         // (X s> 13 | X != 15) -> true
     case ICmpInst::ICMP_SLT:        // (X s> 13 | X s< 15) -> true
-      return ConstantInt::getTrue(LHS->getContext());
+      return Builder->getTrue();
     case ICmpInst::ICMP_ULT:        // (X s> 13 | X u< 15) -> no change
       break;
     }
@@ -1676,7 +1836,7 @@ Value *InstCombiner::FoldOrOfFCmps(FCmpInst *LHS, FCmpInst *RHS) {
         // If either of the constants are nans, then the whole thing returns
         // true.
         if (LHSC->getValueAPF().isNaN() || RHSC->getValueAPF().isNaN())
-          return ConstantInt::getTrue(LHS->getContext());
+          return Builder->getTrue();
 
         // Otherwise, no need to compare the two constants, compare the
         // rest.
@@ -1779,8 +1939,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
       Value *Or = Builder->CreateOr(X, RHS);
       Or->takeName(Op0);
       return BinaryOperator::CreateAnd(Or,
-                         ConstantInt::get(I.getContext(),
-                                          RHS->getValue() | C1->getValue()));
+                             Builder->getInt(RHS->getValue() | C1->getValue()));
     }
 
     // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
@@ -1789,8 +1948,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
       Value *Or = Builder->CreateOr(X, RHS);
       Or->takeName(Op0);
       return BinaryOperator::CreateXor(Or,
-                 ConstantInt::get(I.getContext(),
-                                  C1->getValue() & ~RHS->getValue()));
+                            Builder->getInt(C1->getValue() & ~RHS->getValue()));
     }
 
     // Try to fold constant and into select arguments.
@@ -1872,15 +2030,13 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
             ((V1 == B && MaskedValueIsZero(V2, ~C1->getValue())) ||  // (V|N)
              (V2 == B && MaskedValueIsZero(V1, ~C1->getValue()))))   // (N|V)
           return BinaryOperator::CreateAnd(A,
-                               ConstantInt::get(A->getContext(),
-                                                C1->getValue()|C2->getValue()));
+                                Builder->getInt(C1->getValue()|C2->getValue()));
         // Or commutes, try both ways.
         if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
             ((V1 == A && MaskedValueIsZero(V2, ~C2->getValue())) ||  // (V|N)
              (V2 == A && MaskedValueIsZero(V1, ~C2->getValue()))))   // (N|V)
           return BinaryOperator::CreateAnd(B,
-                               ConstantInt::get(B->getContext(),
-                                                C1->getValue()|C2->getValue()));
+                                Builder->getInt(C1->getValue()|C2->getValue()));
 
         // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
         // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
@@ -1891,8 +2047,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
             (C4->getValue() & ~C2->getValue()) == 0) {
           V2 = Builder->CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
           return BinaryOperator::CreateAnd(V2,
-                               ConstantInt::get(B->getContext(),
-                                                C1->getValue()|C2->getValue()));
+                                Builder->getInt(C1->getValue()|C2->getValue()));
         }
       }
     }
@@ -2160,8 +2315,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
         if (CI->hasOneUse() && Op0C->hasOneUse()) {
           Instruction::CastOps Opcode = Op0C->getOpcode();
           if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
-              (RHS == ConstantExpr::getCast(Opcode,
-                                           ConstantInt::getTrue(I.getContext()),
+              (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(),
                                             Op0C->getDestTy()))) {
             CI->setPredicate(CI->getInversePredicate());
             return CastInst::Create(Opcode, CI, Op0C->getType());
@@ -2191,8 +2345,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
                                       Op0I->getOperand(0));
           } else if (RHS->getValue().isSignBit()) {
             // (X + C) ^ signbit -> (X + C + signbit)
-            Constant *C = ConstantInt::get(I.getContext(),
-                                           RHS->getValue() + Op0CI->getValue());
+            Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue());
             return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
 
           }
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 78b4a2c..0cd7b14 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -946,7 +946,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
     int ix = FTy->getNumParams();
     // See if we can optimize any arguments passed through the varargs area of
     // the call.
-    for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
+    for (CallSite::arg_iterator I = CS.arg_begin() + FTy->getNumParams(),
            E = CS.arg_end(); I != E; ++I, ++ix) {
       CastInst *CI = dyn_cast<CastInst>(*I);
       if (CI && isSafeToEliminateVarargsCast(CS, CI, TD, ix)) {
@@ -999,19 +999,15 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
 
   // Check to see if we are changing the return type...
   if (OldRetTy != NewRetTy) {
-    if (Callee->isDeclaration() &&
-        // Conversion is ok if changing from one pointer type to another or from
-        // a pointer to an integer of the same size.
-        !((OldRetTy->isPointerTy() || !TD ||
-           OldRetTy == TD->getIntPtrType(Caller->getContext())) &&
-          (NewRetTy->isPointerTy() || !TD ||
-           NewRetTy == TD->getIntPtrType(Caller->getContext()))))
-      return false;   // Cannot transform this return value.
+    if (!CastInst::isBitCastable(NewRetTy, OldRetTy)) {
+      if (Callee->isDeclaration())
+        return false;   // Cannot transform this return value.
 
-    if (!Caller->use_empty() &&
-        // void -> non-void is handled specially
-        !NewRetTy->isVoidTy() && !CastInst::isCastable(NewRetTy, OldRetTy))
+      if (!Caller->use_empty() &&
+          // void -> non-void is handled specially
+          !NewRetTy->isVoidTy())
       return false;   // Cannot transform this return value.
+    }
 
     if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
       AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
@@ -1036,7 +1032,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
               return false;
   }
 
-  unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+  unsigned NumActualArgs = CS.arg_size();
   unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
 
   CallSite::arg_iterator AI = CS.arg_begin();
@@ -1044,7 +1040,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     Type *ParamTy = FT->getParamType(i);
     Type *ActTy = (*AI)->getType();
 
-    if (!CastInst::isCastable(ActTy, ParamTy))
+    if (!CastInst::isBitCastable(ActTy, ParamTy))
       return false;   // Cannot transform this parameter value.
 
     if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
@@ -1061,20 +1057,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
       if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
         return false;
 
-      Type *CurElTy = cast<PointerType>(ActTy)->getElementType();
+      Type *CurElTy = ActTy->getPointerElementType();
       if (TD->getTypeAllocSize(CurElTy) !=
           TD->getTypeAllocSize(ParamPTy->getElementType()))
         return false;
     }
-
-    // Converting from one pointer type to another or between a pointer and an
-    // integer of the same size is safe even if we do not have a body.
-    bool isConvertible = ActTy == ParamTy ||
-      (TD && ((ParamTy->isPointerTy() ||
-      ParamTy == TD->getIntPtrType(Caller->getContext())) &&
-              (ActTy->isPointerTy() ||
-              ActTy == TD->getIntPtrType(Caller->getContext()))));
-    if (Callee->isDeclaration() && !isConvertible) return false;
   }
 
   if (Callee->isDeclaration()) {
@@ -1141,12 +1128,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   AI = CS.arg_begin();
   for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
     Type *ParamTy = FT->getParamType(i);
+
     if ((*AI)->getType() == ParamTy) {
       Args.push_back(*AI);
     } else {
-      Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
-          false, ParamTy, false);
-      Args.push_back(Builder->CreateCast(opcode, *AI, ParamTy));
+      Args.push_back(Builder->CreateBitCast(*AI, ParamTy));
     }
 
     // Add any parameter attributes.
@@ -1217,9 +1203,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   Value *NV = NC;
   if (OldRetTy != NV->getType() && !Caller->use_empty()) {
     if (!NV->getType()->isVoidTy()) {
-      Instruction::CastOps opcode =
-        CastInst::getCastOpcode(NC, false, OldRetTy, false);
-      NV = NC = CastInst::Create(opcode, NC, OldRetTy);
+      NV = NC = CastInst::Create(CastInst::BitCast, NC, OldRetTy);
       NC->setDebugLoc(Caller->getDebugLoc());
 
       // If this is an invoke instruction, we should insert it after the first
@@ -1287,7 +1271,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
     if (NestTy) {
       Instruction *Caller = CS.getInstruction();
       std::vector<Value*> NewArgs;
-      NewArgs.reserve(unsigned(CS.arg_end()-CS.arg_begin())+1);
+      NewArgs.reserve(CS.arg_size() + 1);
 
       SmallVector<AttributeSet, 8> NewAttrs;
       NewAttrs.reserve(Attrs.getNumSlots() + 1);
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 2ee1278..72377dc 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -677,7 +677,6 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
   case Instruction::Add:
   case Instruction::Sub:
   case Instruction::Mul:
-  case Instruction::Shl:
     if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear) ||
         !CanEvaluateZExtd(I->getOperand(1), Ty, Tmp))
       return false;
@@ -701,6 +700,17 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) {
     // Otherwise, we don't know how to analyze this BitsToClear case yet.
     return false;
 
+  case Instruction::Shl:
+    // We can promote shl(x, cst) if we can promote x.  Since shl overwrites the
+    // upper bits we can reduce BitsToClear by the shift amount.
+    if (ConstantInt *Amt = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      if (!CanEvaluateZExtd(I->getOperand(0), Ty, BitsToClear))
+        return false;
+      uint64_t ShiftAmt = Amt->getZExtValue();
+      BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
+      return true;
+    }
+    return false;
   case Instruction::LShr:
     // We can promote lshr(x, cst) if we can promote x.  This requires the
     // ultimate 'and' to clear out the high zero bits we're clearing out though.
@@ -1219,6 +1229,19 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
     }
   }
 
+  // (fptrunc (select cond, R1, Cst)) -->
+  // (select cond, (fptrunc R1), (fptrunc Cst))
+  SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0));
+  if (SI &&
+      (isa<ConstantFP>(SI->getOperand(1)) ||
+       isa<ConstantFP>(SI->getOperand(2)))) {
+    Value *LHSTrunc = Builder->CreateFPTrunc(SI->getOperand(1),
+                                             CI.getType());
+    Value *RHSTrunc = Builder->CreateFPTrunc(SI->getOperand(2),
+                                             CI.getType());
+    return SelectInst::Create(SI->getOperand(0), LHSTrunc, RHSTrunc);
+  }
+
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI.getOperand(0));
   if (II) {
     switch (II->getIntrinsicID()) {
@@ -1239,9 +1262,14 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
   }
 
   // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x)
+  // Note that we restrict this transformation based on
+  // TLI->has(LibFunc::sqrtf), even for the sqrt intrinsic, because
+  // TLI->has(LibFunc::sqrtf) is sufficient to guarantee that the
+  // single-precision intrinsic can be expanded in the backend.
   CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0));
   if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) &&
-      Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) &&
+      (Call->getCalledFunction()->getName() == TLI->getName(LibFunc::sqrt) ||
+       Call->getCalledFunction()->getIntrinsicID() == Intrinsic::sqrt) &&
       Call->getNumArgOperands() == 1 &&
       Call->hasOneUse()) {
     CastInst *Arg = dyn_cast<CastInst>(Call->getArgOperand(0));
@@ -1252,11 +1280,11 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
         Arg->getOperand(0)->getType()->isFloatTy()) {
       Function *Callee = Call->getCalledFunction();
       Module *M = CI.getParent()->getParent()->getParent();
-      Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf",
-                                                   Callee->getAttributes(),
-                                                   Builder->getFloatTy(),
-                                                   Builder->getFloatTy(),
-                                                   NULL);
+      Constant *SqrtfFunc = (Callee->getIntrinsicID() == Intrinsic::sqrt) ?
+        Intrinsic::getDeclaration(M, Intrinsic::sqrt, Builder->getFloatTy()) :
+        M->getOrInsertFunction("sqrtf", Callee->getAttributes(),
+                               Builder->getFloatTy(), Builder->getFloatTy(),
+                               NULL);
       CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0),
                                        "sqrtfcall");
       ret->setAttributes(Callee->getAttributes());
@@ -1328,14 +1356,18 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
   // If the source integer type is not the intptr_t type for this target, do a
   // trunc or zext to the intptr_t type, then inttoptr of it.  This allows the
   // cast to be exposed to other transforms.
-  if (TD && CI.getOperand(0)->getType()->getScalarSizeInBits() !=
-      TD->getPointerSizeInBits()) {
-    Type *Ty = TD->getIntPtrType(CI.getContext());
-    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
-      Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
-
-    Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
-    return new IntToPtrInst(P, CI.getType());
+
+  if (TD) {
+    unsigned AS = CI.getAddressSpace();
+    if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+        TD->getPointerSizeInBits(AS)) {
+      Type *Ty = TD->getIntPtrType(CI.getContext(), AS);
+      if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
+        Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+
+      Value *P = Builder->CreateZExtOrTrunc(CI.getOperand(0), Ty);
+      return new IntToPtrInst(P, CI.getType());
+    }
   }
 
   if (Instruction *I = commonCastTransforms(CI))
@@ -1360,25 +1392,32 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
       return &CI;
     }
 
+    if (!TD)
+      return commonCastTransforms(CI);
+
     // If the GEP has a single use, and the base pointer is a bitcast, and the
     // GEP computes a constant offset, see if we can convert these three
     // instructions into fewer.  This typically happens with unions and other
     // non-type-safe code.
-    APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
-    if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) &&
+    unsigned AS = GEP->getPointerAddressSpace();
+    unsigned OffsetBits = TD->getPointerSizeInBits(AS);
+    APInt Offset(OffsetBits, 0);
+    BitCastInst *BCI = dyn_cast<BitCastInst>(GEP->getOperand(0));
+    if (GEP->hasOneUse() &&
+        BCI &&
         GEP->accumulateConstantOffset(*TD, Offset)) {
       // Get the base pointer input of the bitcast, and the type it points to.
-      Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
-      Type *GEPIdxTy =
-      cast<PointerType>(OrigBase->getType())->getElementType();
+      Value *OrigBase = BCI->getOperand(0);
       SmallVector<Value*, 8> NewIndices;
-      if (FindElementAtOffset(GEPIdxTy, Offset.getSExtValue(), NewIndices)) {
+      if (FindElementAtOffset(OrigBase->getType(),
+                              Offset.getSExtValue(),
+                              NewIndices)) {
         // If we were able to index down into an element, create the GEP
         // and bitcast the result.  This eliminates one bitcast, potentially
         // two.
         Value *NGEP = cast<GEPOperator>(GEP)->isInBounds() ?
-        Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
-        Builder->CreateGEP(OrigBase, NewIndices);
+          Builder->CreateInBoundsGEP(OrigBase, NewIndices) :
+          Builder->CreateGEP(OrigBase, NewIndices);
         NGEP->takeName(GEP);
 
         if (isa<BitCastInst>(CI))
@@ -1396,16 +1435,22 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
   // If the destination integer type is not the intptr_t type for this target,
   // do a ptrtoint to intptr_t then do a trunc or zext.  This allows the cast
   // to be exposed to other transforms.
-  if (TD && CI.getType()->getScalarSizeInBits() != TD->getPointerSizeInBits()) {
-    Type *Ty = TD->getIntPtrType(CI.getContext());
-    if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
-      Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
 
-    Value *P = Builder->CreatePtrToInt(CI.getOperand(0), Ty);
-    return CastInst::CreateIntegerCast(P, CI.getType(), /*isSigned=*/false);
-  }
+  if (!TD)
+    return commonPointerCastTransforms(CI);
+
+  Type *Ty = CI.getType();
+  unsigned AS = CI.getPointerAddressSpace();
+
+  if (Ty->getScalarSizeInBits() == TD->getPointerSizeInBits(AS))
+    return commonPointerCastTransforms(CI);
 
-  return commonPointerCastTransforms(CI);
+  Type *PtrTy = TD->getIntPtrType(CI.getContext(), AS);
+  if (Ty->isVectorTy()) // Handle vectors of pointers.
+    PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements());
+
+  Value *P = Builder->CreatePtrToInt(CI.getOperand(0), PtrTy);
+  return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
 }
 
 /// OptimizeVectorResize - This input value (which is known to have vector type)
@@ -1478,12 +1523,17 @@ static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
 /// insertions into the vector.  See the example in the comment for
 /// OptimizeIntegerToVectorInsertions for the pattern this handles.
 /// The type of V is always a non-zero multiple of VecEltTy's size.
+/// Shift is the number of bits between the lsb of V and the lsb of
+/// the vector.
 ///
 /// This returns false if the pattern can't be matched or true if it can,
 /// filling in Elements with the elements found here.
-static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
+static bool CollectInsertionElements(Value *V, unsigned Shift,
                                      SmallVectorImpl<Value*> &Elements,
-                                     Type *VecEltTy) {
+                                     Type *VecEltTy, InstCombiner &IC) {
+  assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
+         "Shift should be a multiple of the element type size");
+
   // Undef values never contribute useful bits to the result.
   if (isa<UndefValue>(V)) return true;
 
@@ -1495,8 +1545,12 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
       if (C->isNullValue())
         return true;
 
+    unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
+    if (IC.getDataLayout()->isBigEndian())
+      ElementIndex = Elements.size() - ElementIndex - 1;
+
     // Fail if multiple elements are inserted into this slot.
-    if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0)
+    if (Elements[ElementIndex] != 0)
       return false;
 
     Elements[ElementIndex] = V;
@@ -1512,7 +1566,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
     // it to the right type so it gets properly inserted.
     if (NumElts == 1)
       return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
-                                      ElementIndex, Elements, VecEltTy);
+                                      Shift, Elements, VecEltTy, IC);
 
     // Okay, this is a constant that covers multiple elements.  Slice it up into
     // pieces and insert each element-sized piece into the vector.
@@ -1523,10 +1577,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
     Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
 
     for (unsigned i = 0; i != NumElts; ++i) {
+      unsigned ShiftI = Shift+i*ElementSize;
       Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
-                                                               i*ElementSize));
+                                                                  ShiftI));
       Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
-      if (!CollectInsertionElements(Piece, ElementIndex+i, Elements, VecEltTy))
+      if (!CollectInsertionElements(Piece, ShiftI, Elements, VecEltTy, IC))
         return false;
     }
     return true;
@@ -1539,29 +1594,28 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
   switch (I->getOpcode()) {
   default: return false; // Unhandled case.
   case Instruction::BitCast:
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
   case Instruction::ZExt:
     if (!isMultipleOfTypeSize(
                           I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
                               VecEltTy))
       return false;
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
   case Instruction::Or:
-    return CollectInsertionElements(I->getOperand(0), ElementIndex,
-                                    Elements, VecEltTy) &&
-           CollectInsertionElements(I->getOperand(1), ElementIndex,
-                                    Elements, VecEltTy);
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC) &&
+           CollectInsertionElements(I->getOperand(1), Shift,
+                                    Elements, VecEltTy, IC);
   case Instruction::Shl: {
     // Must be shifting by a constant that is a multiple of the element size.
     ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
     if (CI == 0) return false;
-    if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false;
-    unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy);
-
-    return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift,
-                                    Elements, VecEltTy);
+    Shift += CI->getZExtValue();
+    if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
+    return CollectInsertionElements(I->getOperand(0), Shift,
+                                    Elements, VecEltTy, IC);
   }
 
   }
@@ -1584,12 +1638,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex,
 /// Into two insertelements that do "buildvector{%inc, %inc5}".
 static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI,
                                                 InstCombiner &IC) {
+  // We need to know the target byte order to perform this optimization.
+  if (!IC.getDataLayout()) return 0;
+
   VectorType *DestVecTy = cast<VectorType>(CI.getType());
   Value *IntInput = CI.getOperand(0);
 
   SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
   if (!CollectInsertionElements(IntInput, 0, Elements,
-                                DestVecTy->getElementType()))
+                                DestVecTy->getElementType(), IC))
     return 0;
 
   // If we succeeded, we know that all of the element are specified by Elements
@@ -1775,10 +1832,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     // Okay, we have (bitcast (shuffle ..)).  Check to see if this is
     // a bitcast to a vector with the same # elts.
     if (SVI->hasOneUse() && DestTy->isVectorTy() &&
-        cast<VectorType>(DestTy)->getNumElements() ==
-              SVI->getType()->getNumElements() &&
+        DestTy->getVectorNumElements() == SVI->getType()->getNumElements() &&
         SVI->getType()->getNumElements() ==
-          cast<VectorType>(SVI->getOperand(0)->getType())->getNumElements()) {
+        SVI->getOperand(0)->getType()->getVectorNumElements()) {
       BitCastInst *Tmp;
       // If either of the operands is a cast from CI.getType(), then
       // evaluating the shuffle in the casted destination's type will allow
@@ -1800,3 +1856,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
     return commonPointerCastTransforms(CI);
   return commonCastTransforms(CI);
 }
+
+Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
+  return commonCastTransforms(CI);
+}
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 4c252c0..9bb65ef 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -227,7 +227,8 @@ Instruction *InstCombiner::
 FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
                              CmpInst &ICI, ConstantInt *AndCst) {
   // We need TD information to know the pointer size unless this is inbounds.
-  if (!GEP->isInBounds() && TD == 0) return 0;
+  if (!GEP->isInBounds() && TD == 0)
+    return 0;
 
   Constant *Init = GV->getInitializer();
   if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
@@ -393,16 +394,19 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
   // If the index is larger than the pointer size of the target, truncate the
   // index down like the GEP would do implicitly.  We don't have to do this for
   // an inbounds GEP because the index can't be out of range.
-  if (!GEP->isInBounds() &&
-      Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits())
-    Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext()));
+  if (!GEP->isInBounds()) {
+    Type *IntPtrTy = TD->getIntPtrType(GEP->getType());
+    unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
+    if (Idx->getType()->getPrimitiveSizeInBits() > PtrSize)
+      Idx = Builder->CreateTrunc(Idx, IntPtrTy);
+  }
 
   // If the comparison is only true for one or two elements, emit direct
   // comparisons.
   if (SecondTrueElement != Overdefined) {
     // None true -> false.
     if (FirstTrueElement == Undefined)
-      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(GEP->getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getFalse());
 
     Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
 
@@ -422,7 +426,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
   if (SecondFalseElement != Overdefined) {
     // None false -> true.
     if (FirstFalseElement == Undefined)
-      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(GEP->getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getTrue());
 
     Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
 
@@ -562,16 +566,18 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
     }
   }
 
+
+
   // Okay, we know we have a single variable index, which must be a
   // pointer/array/vector index.  If there is no offset, life is simple, return
   // the index.
-  unsigned IntPtrWidth = TD.getPointerSizeInBits();
+  Type *IntPtrTy = TD.getIntPtrType(GEP->getOperand(0)->getType());
+  unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
   if (Offset == 0) {
     // Cast to intptrty in case a truncation occurs.  If an extension is needed,
     // we don't need to bother extending: the extension won't affect where the
     // computation crosses zero.
     if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) {
-      Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
       VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy);
     }
     return VariableIdx;
@@ -593,7 +599,6 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
     return 0;
 
   // Okay, we can do this evaluation.  Start by converting the index to intptr.
-  Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext());
   if (VariableIdx->getType() != IntPtrTy)
     VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy,
                                             true /*Signed*/);
@@ -647,8 +652,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 
       // If all indices are the same, just compare the base pointers.
       if (IndicesTheSame)
-        return new ICmpInst(ICmpInst::getSignedPredicate(Cond),
-                            GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+        return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
 
       // If we're comparing GEPs with two base pointers that only differ in type
       // and both GEPs have only constant indices or just one use, then fold
@@ -679,7 +683,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
       }
     if (AllZeros)
       return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
-                          ICmpInst::getSwappedPredicate(Cond), I);
+                         ICmpInst::getSwappedPredicate(Cond), I);
 
     // If the other GEP has all zero indices, recurse.
     AllZeros = true;
@@ -712,8 +716,7 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 
       if (NumDifferences == 0)   // SAME GEP?
         return ReplaceInstUsesWith(I, // No comparison is needed here.
-                               ConstantInt::get(Type::getInt1Ty(I.getContext()),
-                                             ICmpInst::isTrueWhenEqual(Cond)));
+                             Builder->getInt1(ICmpInst::isTrueWhenEqual(Cond)));
 
       else if (NumDifferences == 1 && GEPsInBounds) {
         Value *LHSV = GEPLHS->getOperand(DiffOperand);
@@ -739,10 +742,9 @@ Instruction *InstCombiner::FoldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
 }
 
 /// FoldICmpAddOpCst - Fold "icmp pred (X+CI), X".
-Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
+Instruction *InstCombiner::FoldICmpAddOpCst(Instruction &ICI,
                                             Value *X, ConstantInt *CI,
-                                            ICmpInst::Predicate Pred,
-                                            Value *TheAdd) {
+                                            ICmpInst::Predicate Pred) {
   // If we have X+0, exit early (simplifying logic below) and let it get folded
   // elsewhere.   icmp X+0, X  -> icmp X, X
   if (CI->isZero()) {
@@ -752,11 +754,11 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
 
   // (X+4) == X -> false.
   if (Pred == ICmpInst::ICMP_EQ)
-    return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(X->getContext()));
+    return ReplaceInstUsesWith(ICI, Builder->getFalse());
 
   // (X+4) != X -> true.
   if (Pred == ICmpInst::ICMP_NE)
-    return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(X->getContext()));
+    return ReplaceInstUsesWith(ICI, Builder->getTrue());
 
   // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
   // so the values can never be equal.  Similarly for all other "or equals"
@@ -798,7 +800,7 @@ Instruction *InstCombiner::FoldICmpAddOpCst(ICmpInst &ICI,
   // (X+ -1) >s X      --> X <s (MAXSINT-(-1-1))      --> X == -128
 
   assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
-  Constant *C = ConstantInt::get(X->getContext(), CI->getValue()-1);
+  Constant *C = Builder->getInt(CI->getValue()-1);
   return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantExpr::getSub(SMax, C));
 }
 
@@ -921,7 +923,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
   default: llvm_unreachable("Unhandled icmp opcode!");
   case ICmpInst::ICMP_EQ:
     if (LoOverflow && HiOverflow)
-      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getFalse());
     if (HiOverflow)
       return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
                           ICmpInst::ICMP_UGE, X, LoBound);
@@ -932,7 +934,7 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
                                                     DivIsSigned, true));
   case ICmpInst::ICMP_NE:
     if (LoOverflow && HiOverflow)
-      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getTrue());
     if (HiOverflow)
       return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
                           ICmpInst::ICMP_ULT, X, LoBound);
@@ -944,16 +946,16 @@ Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
   case ICmpInst::ICMP_ULT:
   case ICmpInst::ICMP_SLT:
     if (LoOverflow == +1)   // Low bound is greater than input range.
-      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getTrue());
     if (LoOverflow == -1)   // Low bound is less than input range.
-      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getFalse());
     return new ICmpInst(Pred, X, LoBound);
   case ICmpInst::ICMP_UGT:
   case ICmpInst::ICMP_SGT:
     if (HiOverflow == +1)       // High bound greater than input range.
-      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse(ICI.getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getFalse());
     if (HiOverflow == -1)       // High bound less than input range.
-      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue(ICI.getContext()));
+      return ReplaceInstUsesWith(ICI, Builder->getTrue());
     if (Pred == ICmpInst::ICMP_UGT)
       return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
     return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
@@ -1017,7 +1019,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
   // If we are comparing against bits always shifted out, the
   // comparison cannot succeed.
   APInt Comp = CmpRHSV << ShAmtVal;
-  ConstantInt *ShiftedCmpRHS = ConstantInt::get(ICI.getContext(), Comp);
+  ConstantInt *ShiftedCmpRHS = Builder->getInt(Comp);
   if (Shr->getOpcode() == Instruction::LShr)
     Comp = Comp.lshr(ShAmtVal);
   else
@@ -1025,8 +1027,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
 
   if (Comp != CmpRHSV) { // Comparing against a bit that we know is zero.
     bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
-    Constant *Cst = ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
-                                     IsICMP_NE);
+    Constant *Cst = Builder->getInt1(IsICMP_NE);
     return ReplaceInstUsesWith(ICI, Cst);
   }
 
@@ -1039,7 +1040,7 @@ Instruction *InstCombiner::FoldICmpShrCst(ICmpInst &ICI, BinaryOperator *Shr,
   if (Shr->hasOneUse()) {
     // Otherwise strength reduce the shift into an and.
     APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
-    Constant *Mask = ConstantInt::get(ICI.getContext(), Val);
+    Constant *Mask = Builder->getInt(Val);
 
     Value *And = Builder->CreateAnd(Shr->getOperand(0),
                                     Mask, Shr->getName()+".mask");
@@ -1072,7 +1073,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
         APInt NewRHS = RHS->getValue().zext(SrcBits);
         NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits);
         return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0),
-                            ConstantInt::get(ICI.getContext(), NewRHS));
+                            Builder->getInt(NewRHS));
       }
     }
     break;
@@ -1115,8 +1116,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
                                          ? ICI.getUnsignedPredicate()
                                          : ICI.getSignedPredicate();
           return new ICmpInst(Pred, LHSI->getOperand(0),
-                              ConstantInt::get(ICI.getContext(),
-                                               RHSV ^ SignBit));
+                              Builder->getInt(RHSV ^ SignBit));
         }
 
         // (icmp u/s (xor A ~SignBit), C) -> (icmp s/u (xor C ~SignBit), A)
@@ -1127,10 +1127,21 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
                                          : ICI.getSignedPredicate();
           Pred = ICI.getSwappedPredicate(Pred);
           return new ICmpInst(Pred, LHSI->getOperand(0),
-                              ConstantInt::get(ICI.getContext(),
-                                               RHSV ^ NotSignBit));
+                              Builder->getInt(RHSV ^ NotSignBit));
         }
       }
+
+      // (icmp ugt (xor X, C), ~C) -> (icmp ult X, C)
+      //   iff -C is a power of 2
+      if (ICI.getPredicate() == ICmpInst::ICMP_UGT &&
+          XorCST->getValue() == ~RHSV && (RHSV + 1).isPowerOf2())
+        return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0), XorCST);
+
+      // (icmp ult (xor X, C), -C) -> (icmp uge X, C)
+      //   iff -C is a power of 2
+      if (ICI.getPredicate() == ICmpInst::ICMP_ULT &&
+          XorCST->getValue() == -RHSV && RHSV.isPowerOf2())
+        return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0), XorCST);
     }
     break;
   case Instruction::And:         // (icmp pred (and X, AndCST), RHS)
@@ -1187,11 +1198,16 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
       Type *AndTy = AndCST->getType();          // Type of the and.
 
       // We can fold this as long as we can't shift unknown bits
-      // into the mask.  This can only happen with signed shift
-      // rights, as they sign-extend.
+      // into the mask. This can happen with signed shift
+      // rights, as they sign-extend. With logical shifts,
+      // we must still make sure the comparison is not signed
+      // because we are effectively changing the
+      // position of the sign bit (PR17827).
+      // TODO: We can relax these constraints a bit more.
       if (ShAmt) {
-        bool CanFold = Shift->isLogicalShift();
-        if (!CanFold) {
+        bool CanFold = false;
+        unsigned ShiftOpcode = Shift->getOpcode();
+        if (ShiftOpcode == Instruction::AShr) {
           // To test for the bad case of the signed shr, see if any
           // of the bits shifted in could be tested after the mask.
           uint32_t TyBits = Ty->getPrimitiveSizeInBits();
@@ -1201,6 +1217,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
           if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) &
                AndCST->getValue()) == 0)
             CanFold = true;
+        } else if (ShiftOpcode == Instruction::Shl ||
+                   ShiftOpcode == Instruction::LShr) {
+          CanFold = !ICI.isSigned();
         }
 
         if (CanFold) {
@@ -1218,11 +1237,9 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
             // As a special case, check to see if this means that the
             // result is always true or false now.
             if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
-              return ReplaceInstUsesWith(ICI,
-                                       ConstantInt::getFalse(ICI.getContext()));
+              return ReplaceInstUsesWith(ICI, Builder->getFalse());
             if (ICI.getPredicate() == ICmpInst::ICMP_NE)
-              return ReplaceInstUsesWith(ICI,
-                                       ConstantInt::getTrue(ICI.getContext()));
+              return ReplaceInstUsesWith(ICI, Builder->getTrue());
           } else {
             ICI.setOperand(1, NewCst);
             Constant *NewAndCST;
@@ -1284,6 +1301,15 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
               return Res;
           }
     }
+
+    // X & -C == -C -> X >  u ~C
+    // X & -C != -C -> X <= u ~C
+    //   iff C is a power of 2
+    if (ICI.isEquality() && RHS == LHSI->getOperand(1) && (-RHSV).isPowerOf2())
+      return new ICmpInst(
+          ICI.getPredicate() == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_UGT
+                                                  : ICmpInst::ICMP_ULE,
+          LHSI->getOperand(0), SubOne(RHS));
     break;
 
   case Instruction::Or: {
@@ -1325,10 +1351,80 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
   }
 
   case Instruction::Shl: {       // (icmp pred (shl X, ShAmt), CI)
+    uint32_t TypeBits = RHSV.getBitWidth();
     ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
-    if (!ShAmt) break;
+    if (!ShAmt) {
+      Value *X;
+      // (1 << X) pred P2 -> X pred Log2(P2)
+      if (match(LHSI, m_Shl(m_One(), m_Value(X)))) {
+        bool RHSVIsPowerOf2 = RHSV.isPowerOf2();
+        ICmpInst::Predicate Pred = ICI.getPredicate();
+        if (ICI.isUnsigned()) {
+          if (!RHSVIsPowerOf2) {
+            // (1 << X) <  30 -> X <= 4
+            // (1 << X) <= 30 -> X <= 4
+            // (1 << X) >= 30 -> X >  4
+            // (1 << X) >  30 -> X >  4
+            if (Pred == ICmpInst::ICMP_ULT)
+              Pred = ICmpInst::ICMP_ULE;
+            else if (Pred == ICmpInst::ICMP_UGE)
+              Pred = ICmpInst::ICMP_UGT;
+          }
+          unsigned RHSLog2 = RHSV.logBase2();
+
+          // (1 << X) >= 2147483648 -> X >= 31 -> X == 31
+          // (1 << X) >  2147483648 -> X >  31 -> false
+          // (1 << X) <= 2147483648 -> X <= 31 -> true
+          // (1 << X) <  2147483648 -> X <  31 -> X != 31
+          if (RHSLog2 == TypeBits-1) {
+            if (Pred == ICmpInst::ICMP_UGE)
+              Pred = ICmpInst::ICMP_EQ;
+            else if (Pred == ICmpInst::ICMP_UGT)
+              return ReplaceInstUsesWith(ICI, Builder->getFalse());
+            else if (Pred == ICmpInst::ICMP_ULE)
+              return ReplaceInstUsesWith(ICI, Builder->getTrue());
+            else if (Pred == ICmpInst::ICMP_ULT)
+              Pred = ICmpInst::ICMP_NE;
+          }
 
-    uint32_t TypeBits = RHSV.getBitWidth();
+          return new ICmpInst(Pred, X,
+                              ConstantInt::get(RHS->getType(), RHSLog2));
+        } else if (ICI.isSigned()) {
+          if (RHSV.isAllOnesValue()) {
+            // (1 << X) <= -1 -> X == 31
+            if (Pred == ICmpInst::ICMP_SLE)
+              return new ICmpInst(ICmpInst::ICMP_EQ, X,
+                                  ConstantInt::get(RHS->getType(), TypeBits-1));
+
+            // (1 << X) >  -1 -> X != 31
+            if (Pred == ICmpInst::ICMP_SGT)
+              return new ICmpInst(ICmpInst::ICMP_NE, X,
+                                  ConstantInt::get(RHS->getType(), TypeBits-1));
+          } else if (!RHSV) {
+            // (1 << X) <  0 -> X == 31
+            // (1 << X) <= 0 -> X == 31
+            if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+              return new ICmpInst(ICmpInst::ICMP_EQ, X,
+                                  ConstantInt::get(RHS->getType(), TypeBits-1));
+
+            // (1 << X) >= 0 -> X != 31
+            // (1 << X) >  0 -> X != 31
+            if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
+              return new ICmpInst(ICmpInst::ICMP_NE, X,
+                                  ConstantInt::get(RHS->getType(), TypeBits-1));
+          }
+        } else if (ICI.isEquality()) {
+          if (RHSVIsPowerOf2)
+            return new ICmpInst(
+                Pred, X, ConstantInt::get(RHS->getType(), RHSV.logBase2()));
+
+          return ReplaceInstUsesWith(
+              ICI, Pred == ICmpInst::ICMP_EQ ? Builder->getFalse()
+                                             : Builder->getTrue());
+        }
+      }
+      break;
+    }
 
     // Check that the shift amount is in range.  If not, don't perform
     // undefined shifts.  When the shift is visited it will be
@@ -1344,8 +1440,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
                                                                  ShAmt);
       if (Comp != RHS) {// Comparing against a bit that we know is zero.
         bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
-        Constant *Cst =
-          ConstantInt::get(Type::getInt1Ty(ICI.getContext()), IsICMP_NE);
+        Constant *Cst = Builder->getInt1(IsICMP_NE);
         return ReplaceInstUsesWith(ICI, Cst);
       }
 
@@ -1364,9 +1459,8 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
       if (LHSI->hasOneUse()) {
         // Otherwise strength reduce the shift into an and.
         uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
-        Constant *Mask =
-          ConstantInt::get(ICI.getContext(), APInt::getLowBitsSet(TypeBits,
-                                                       TypeBits-ShAmtVal));
+        Constant *Mask = Builder->getInt(APInt::getLowBitsSet(TypeBits,
+                                                          TypeBits - ShAmtVal));
 
         Value *And =
           Builder->CreateAnd(LHSI->getOperand(0),Mask, LHSI->getName()+".mask");
@@ -1451,6 +1545,30 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
         return R;
     break;
 
+  case Instruction::Sub: {
+    ConstantInt *LHSC = dyn_cast<ConstantInt>(LHSI->getOperand(0));
+    if (!LHSC) break;
+    const APInt &LHSV = LHSC->getValue();
+
+    // C1-X <u C2 -> (X|(C2-1)) == C1
+    //   iff C1 & (C2-1) == C2-1
+    //       C2 is a power of 2
+    if (ICI.getPredicate() == ICmpInst::ICMP_ULT && LHSI->hasOneUse() &&
+        RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == (RHSV - 1))
+      return new ICmpInst(ICmpInst::ICMP_EQ,
+                          Builder->CreateOr(LHSI->getOperand(1), RHSV - 1),
+                          LHSC);
+
+    // C1-X >u C2 -> (X|C2) != C1
+    //   iff C1 & C2 == C2
+    //       C2+1 is a power of 2
+    if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() &&
+        (RHSV + 1).isPowerOf2() && (LHSV & RHSV) == RHSV)
+      return new ICmpInst(ICmpInst::ICMP_NE,
+                          Builder->CreateOr(LHSI->getOperand(1), RHSV), LHSC);
+    break;
+  }
+
   case Instruction::Add:
     // Fold: icmp pred (add X, C1), C2
     if (!ICI.isEquality()) {
@@ -1464,20 +1582,38 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
       if (ICI.isSigned()) {
         if (CR.getLower().isSignBit()) {
           return new ICmpInst(ICmpInst::ICMP_SLT, LHSI->getOperand(0),
-                              ConstantInt::get(ICI.getContext(),CR.getUpper()));
+                              Builder->getInt(CR.getUpper()));
         } else if (CR.getUpper().isSignBit()) {
           return new ICmpInst(ICmpInst::ICMP_SGE, LHSI->getOperand(0),
-                              ConstantInt::get(ICI.getContext(),CR.getLower()));
+                              Builder->getInt(CR.getLower()));
         }
       } else {
         if (CR.getLower().isMinValue()) {
           return new ICmpInst(ICmpInst::ICMP_ULT, LHSI->getOperand(0),
-                              ConstantInt::get(ICI.getContext(),CR.getUpper()));
+                              Builder->getInt(CR.getUpper()));
         } else if (CR.getUpper().isMinValue()) {
           return new ICmpInst(ICmpInst::ICMP_UGE, LHSI->getOperand(0),
-                              ConstantInt::get(ICI.getContext(),CR.getLower()));
+                              Builder->getInt(CR.getLower()));
         }
       }
+
+      // X-C1 <u C2 -> (X & -C2) == C1
+      //   iff C1 & (C2-1) == 0
+      //       C2 is a power of 2
+      if (ICI.getPredicate() == ICmpInst::ICMP_ULT && LHSI->hasOneUse() &&
+          RHSV.isPowerOf2() && (LHSV & (RHSV - 1)) == 0)
+        return new ICmpInst(ICmpInst::ICMP_EQ,
+                            Builder->CreateAnd(LHSI->getOperand(0), -RHSV),
+                            ConstantExpr::getNeg(LHSC));
+
+      // X-C1 >u C2 -> (X & ~C2) != C1
+      //   iff C1 & C2 == 0
+      //       C2+1 is a power of 2
+      if (ICI.getPredicate() == ICmpInst::ICMP_UGT && LHSI->hasOneUse() &&
+          (RHSV + 1).isPowerOf2() && (LHSV & RHSV) == 0)
+        return new ICmpInst(ICmpInst::ICMP_NE,
+                            Builder->CreateAnd(LHSI->getOperand(0), ~RHSV),
+                            ConstantExpr::getNeg(LHSC));
     }
     break;
   }
@@ -1555,9 +1691,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
         if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
           Constant *NotCI = ConstantExpr::getNot(RHS);
           if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
-            return ReplaceInstUsesWith(ICI,
-                             ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
-                                       isICMP_NE));
+            return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
         }
         break;
 
@@ -1566,9 +1700,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
           // If bits are being compared against that are and'd out, then the
           // comparison can never succeed!
           if ((RHSV & ~BOC->getValue()) != 0)
-            return ReplaceInstUsesWith(ICI,
-                             ConstantInt::get(Type::getInt1Ty(ICI.getContext()),
-                                       isICMP_NE));
+            return ReplaceInstUsesWith(ICI, Builder->getInt1(isICMP_NE));
 
           // If we have ((X & C) == C), turn it into ((X & C) != 0).
           if (RHS == BOC && RHSV.isPowerOf2())
@@ -1619,7 +1751,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
       case Intrinsic::bswap:
         Worklist.Add(II);
         ICI.setOperand(0, II->getArgOperand(0));
-        ICI.setOperand(1, ConstantInt::get(II->getContext(), RHSV.byteSwap()));
+        ICI.setOperand(1, Builder->getInt(RHSV.byteSwap()));
         return &ICI;
       case Intrinsic::ctlz:
       case Intrinsic::cttz:
@@ -1661,8 +1793,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
   // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
   // integer type is the same size as the pointer type.
   if (TD && LHSCI->getOpcode() == Instruction::PtrToInt &&
-      TD->getPointerSizeInBits() ==
-         cast<IntegerType>(DestTy)->getBitWidth()) {
+      TD->getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth()) {
     Value *RHSOp = 0;
     if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
       RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
@@ -1915,14 +2046,59 @@ static APInt DemandedBitsLHSMask(ICmpInst &I,
 
 }
 
+/// \brief Check if the order of \p Op0 and \p Op1 as operand in an ICmpInst
+/// should be swapped.
+/// The descision is based on how many times these two operands are reused
+/// as subtract operands and their positions in those instructions.
+/// The rational is that several architectures use the same instruction for
+/// both subtract and cmp, thus it is better if the order of those operands
+/// match.
+/// \return true if Op0 and Op1 should be swapped.
+static bool swapMayExposeCSEOpportunities(const Value * Op0,
+                                          const Value * Op1) {
+  // Filter out pointer value as those cannot appears directly in subtract.
+  // FIXME: we may want to go through inttoptrs or bitcasts.
+  if (Op0->getType()->isPointerTy())
+    return false;
+  // Count every uses of both Op0 and Op1 in a subtract.
+  // Each time Op0 is the first operand, count -1: swapping is bad, the
+  // subtract has already the same layout as the compare.
+  // Each time Op0 is the second operand, count +1: swapping is good, the
+  // subtract has a diffrent layout as the compare.
+  // At the end, if the benefit is greater than 0, Op0 should come second to
+  // expose more CSE opportunities.
+  int GlobalSwapBenefits = 0;
+  for (Value::const_use_iterator UI = Op0->use_begin(), UIEnd = Op0->use_end(); UI != UIEnd; ++UI) {
+    const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(*UI);
+    if (!BinOp || BinOp->getOpcode() != Instruction::Sub)
+      continue;
+    // If Op0 is the first argument, this is not beneficial to swap the
+    // arguments.
+    int LocalSwapBenefits = -1;
+    unsigned Op1Idx = 1;
+    if (BinOp->getOperand(Op1Idx) == Op0) {
+      Op1Idx = 0;
+      LocalSwapBenefits = 1;
+    }
+    if (BinOp->getOperand(Op1Idx) != Op1)
+      continue;
+    GlobalSwapBenefits += LocalSwapBenefits;
+  }
+  return GlobalSwapBenefits > 0;
+}
+
 Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
   bool Changed = false;
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  unsigned Op0Cplxity = getComplexity(Op0);
+  unsigned Op1Cplxity = getComplexity(Op1);
 
   /// Orders the operands of the compare so that they are listed from most
   /// complex to least complex.  This puts constants before unary operators,
   /// before binary operators.
-  if (getComplexity(Op0) < getComplexity(Op1)) {
+  if (Op0Cplxity < Op1Cplxity ||
+        (Op0Cplxity == Op1Cplxity &&
+         swapMayExposeCSEOpportunities(Op0, Op1))) {
     I.swapOperands();
     std::swap(Op0, Op1);
     Changed = true;
@@ -2041,19 +2217,19 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
     case ICmpInst::ICMP_ULE:
       assert(!CI->isMaxValue(false));                 // A <=u MAX -> TRUE
       return new ICmpInst(ICmpInst::ICMP_ULT, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+                          Builder->getInt(CI->getValue()+1));
     case ICmpInst::ICMP_SLE:
       assert(!CI->isMaxValue(true));                  // A <=s MAX -> TRUE
       return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+                          Builder->getInt(CI->getValue()+1));
     case ICmpInst::ICMP_UGE:
       assert(!CI->isMinValue(false));                 // A >=u MIN -> TRUE
       return new ICmpInst(ICmpInst::ICMP_UGT, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+                          Builder->getInt(CI->getValue()-1));
     case ICmpInst::ICMP_SGE:
       assert(!CI->isMinValue(true));                  // A >=s MIN -> TRUE
       return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+                          Builder->getInt(CI->getValue()-1));
     }
 
     // If this comparison is a normal comparison, it demands all
@@ -2192,7 +2368,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
       if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
         if (Op1Max == Op0Min+1)        // A <u C -> A == C-1 if min(A)+1 == C
           return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+                              Builder->getInt(CI->getValue()-1));
 
         // (x <u 2147483648) -> (x >s -1)  -> true if sign bit clear
         if (CI->isMinValue(true))
@@ -2211,7 +2387,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
       if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
         if (Op1Min == Op0Max-1)        // A >u C -> A == C+1 if max(a)-1 == C
           return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+                              Builder->getInt(CI->getValue()+1));
 
         // (x >u 2147483647) -> (x <s 0)  -> true if sign bit set
         if (CI->isMaxValue(true))
@@ -2229,7 +2405,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
       if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
         if (Op1Max == Op0Min+1)        // A <s C -> A == C-1 if min(A)+1 == C
           return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()-1));
+                              Builder->getInt(CI->getValue()-1));
       }
       break;
     case ICmpInst::ICMP_SGT:
@@ -2243,7 +2419,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
       if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
         if (Op1Min == Op0Max-1)        // A >s C -> A == C+1 if max(A)-1 == C
           return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
-                          ConstantInt::get(CI->getContext(), CI->getValue()+1));
+                              Builder->getInt(CI->getValue()+1));
       }
       break;
     case ICmpInst::ICMP_SGE:
@@ -2357,7 +2533,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
       case Instruction::IntToPtr:
         // icmp pred inttoptr(X), null -> icmp pred X, 0
         if (RHSC->isNullValue() && TD &&
-            TD->getIntPtrType(RHSC->getContext()) ==
+            TD->getIntPtrType(RHSC->getType()) ==
                LHSI->getOperand(0)->getType())
           return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
                         Constant::getNullValue(LHSI->getOperand(0)->getType()));
@@ -2719,8 +2895,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
         ConstantInt *C1, *C2;
         if (match(B, m_ConstantInt(C1)) &&
             match(D, m_ConstantInt(C2)) && Op1->hasOneUse()) {
-          Constant *NC = ConstantInt::get(I.getContext(),
-                                          C1->getValue() ^ C2->getValue());
+          Constant *NC = Builder->getInt(C1->getValue() ^ C2->getValue());
           Value *Xor = Builder->CreateXor(C, NC);
           return new ICmpInst(I.getPredicate(), A, Xor);
         }
@@ -2781,6 +2956,24 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
                             Builder->CreateTrunc(B, A->getType()));
     }
 
+    // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+    // For lshr and ashr pairs.
+    if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+         match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+        (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+         match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+      unsigned TypeBits = Cst1->getBitWidth();
+      unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+      if (ShAmt < TypeBits && ShAmt != 0) {
+        ICmpInst::Predicate Pred = I.getPredicate() == ICmpInst::ICMP_NE
+                                       ? ICmpInst::ICMP_UGE
+                                       : ICmpInst::ICMP_ULT;
+        Value *Xor = Builder->CreateXor(A, B, I.getName() + ".unshifted");
+        APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+        return new ICmpInst(Pred, Xor, Builder->getInt(CmpVal));
+      }
+    }
+
     // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to
     // "icmp (and X, mask), cst"
     uint64_t ShAmt = 0;
@@ -2811,20 +3004,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
     Value *X; ConstantInt *Cst;
     // icmp X+Cst, X
     if (match(Op0, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op1 == X)
-      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate(), Op0);
+      return FoldICmpAddOpCst(I, X, Cst, I.getPredicate());
 
     // icmp X, X+Cst
     if (match(Op1, m_Add(m_Value(X), m_ConstantInt(Cst))) && Op0 == X)
-      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate(), Op1);
+      return FoldICmpAddOpCst(I, X, Cst, I.getSwappedPredicate());
   }
   return Changed ? &I : 0;
 }
 
-
-
-
-
-
 /// FoldFCmp_IntToFP_Cst - Fold fcmp ([us]itofp x, cst) if possible.
 ///
 Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
@@ -2885,9 +3073,9 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
     Pred = ICmpInst::ICMP_NE;
     break;
   case FCmpInst::FCMP_ORD:
-    return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+    return ReplaceInstUsesWith(I, Builder->getTrue());
   case FCmpInst::FCMP_UNO:
-    return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+    return ReplaceInstUsesWith(I, Builder->getFalse());
   }
 
   IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
@@ -2901,50 +3089,50 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
   if (!LHSUnsigned) {
     // If the RHS value is > SignedMax, fold the comparison.  This handles +INF
     // and large values.
-    APFloat SMax(RHS.getSemantics(), APFloat::fcZero, false);
+    APFloat SMax(RHS.getSemantics());
     SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
                           APFloat::rmNearestTiesToEven);
     if (SMax.compare(RHS) == APFloat::cmpLessThan) {  // smax < 13123.0
       if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_SLT ||
           Pred == ICmpInst::ICMP_SLE)
-        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
-      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+        return ReplaceInstUsesWith(I, Builder->getTrue());
+      return ReplaceInstUsesWith(I, Builder->getFalse());
     }
   } else {
     // If the RHS value is > UnsignedMax, fold the comparison. This handles
     // +INF and large values.
-    APFloat UMax(RHS.getSemantics(), APFloat::fcZero, false);
+    APFloat UMax(RHS.getSemantics());
     UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
                           APFloat::rmNearestTiesToEven);
     if (UMax.compare(RHS) == APFloat::cmpLessThan) {  // umax < 13123.0
       if (Pred == ICmpInst::ICMP_NE  || Pred == ICmpInst::ICMP_ULT ||
           Pred == ICmpInst::ICMP_ULE)
-        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
-      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+        return ReplaceInstUsesWith(I, Builder->getTrue());
+      return ReplaceInstUsesWith(I, Builder->getFalse());
     }
   }
 
   if (!LHSUnsigned) {
     // See if the RHS value is < SignedMin.
-    APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
+    APFloat SMin(RHS.getSemantics());
     SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
                           APFloat::rmNearestTiesToEven);
     if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
       if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
           Pred == ICmpInst::ICMP_SGE)
-        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
-      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+        return ReplaceInstUsesWith(I, Builder->getTrue());
+      return ReplaceInstUsesWith(I, Builder->getFalse());
     }
   } else {
     // See if the RHS value is < UnsignedMin.
-    APFloat SMin(RHS.getSemantics(), APFloat::fcZero, false);
+    APFloat SMin(RHS.getSemantics());
     SMin.convertFromAPInt(APInt::getMinValue(IntWidth), true,
                           APFloat::rmNearestTiesToEven);
     if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
       if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
           Pred == ICmpInst::ICMP_UGE)
-        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
-      return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+        return ReplaceInstUsesWith(I, Builder->getTrue());
+      return ReplaceInstUsesWith(I, Builder->getFalse());
     }
   }
 
@@ -2966,14 +3154,14 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
       switch (Pred) {
       default: llvm_unreachable("Unexpected integer comparison!");
       case ICmpInst::ICMP_NE:  // (float)int != 4.4   --> true
-        return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+        return ReplaceInstUsesWith(I, Builder->getTrue());
       case ICmpInst::ICMP_EQ:  // (float)int == 4.4   --> false
-        return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+        return ReplaceInstUsesWith(I, Builder->getFalse());
       case ICmpInst::ICMP_ULE:
         // (float)int <= 4.4   --> int <= 4
         // (float)int <= -4.4  --> false
         if (RHS.isNegative())
-          return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+          return ReplaceInstUsesWith(I, Builder->getFalse());
         break;
       case ICmpInst::ICMP_SLE:
         // (float)int <= 4.4   --> int <= 4
@@ -2985,7 +3173,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
         // (float)int < -4.4   --> false
         // (float)int < 4.4    --> int <= 4
         if (RHS.isNegative())
-          return ReplaceInstUsesWith(I, ConstantInt::getFalse(I.getContext()));
+          return ReplaceInstUsesWith(I, Builder->getFalse());
         Pred = ICmpInst::ICMP_ULE;
         break;
       case ICmpInst::ICMP_SLT:
@@ -2998,7 +3186,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
         // (float)int > 4.4    --> int > 4
         // (float)int > -4.4   --> true
         if (RHS.isNegative())
-          return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+          return ReplaceInstUsesWith(I, Builder->getTrue());
         break;
       case ICmpInst::ICMP_SGT:
         // (float)int > 4.4    --> int > 4
@@ -3010,7 +3198,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I,
         // (float)int >= -4.4   --> true
         // (float)int >= 4.4    --> int > 4
         if (RHS.isNegative())
-          return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext()));
+          return ReplaceInstUsesWith(I, Builder->getTrue());
         Pred = ICmpInst::ICMP_UGT;
         break;
       case ICmpInst::ICMP_SGE:
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index e2d7966..4c861b3 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -154,7 +154,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
   // Ensure that the alloca array size argument has type intptr_t, so that
   // any casting is exposed early.
   if (TD) {
-    Type *IntPtrTy = TD->getIntPtrType(AI.getContext());
+    Type *IntPtrTy = TD->getIntPtrType(AI.getType());
     if (AI.getArraySize()->getType() != IntPtrTy) {
       Value *V = Builder->CreateIntCast(AI.getArraySize(),
                                         IntPtrTy, false);
@@ -180,12 +180,13 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
       // Now that I is pointing to the first non-allocation-inst in the block,
       // insert our getelementptr instruction...
       //
-      Value *NullIdx =Constant::getNullValue(Type::getInt32Ty(AI.getContext()));
-      Value *Idx[2];
-      Idx[0] = NullIdx;
-      Idx[1] = NullIdx;
+      Type *IdxTy = TD
+                  ? TD->getIntPtrType(AI.getType())
+                  : Type::getInt64Ty(AI.getContext());
+      Value *NullIdx = Constant::getNullValue(IdxTy);
+      Value *Idx[2] = { NullIdx, NullIdx };
       Instruction *GEP =
-           GetElementPtrInst::CreateInBounds(New, Idx, New->getName()+".sub");
+        GetElementPtrInst::CreateInBounds(New, Idx, New->getName() + ".sub");
       InsertNewInstBefore(GEP, *It);
 
       // Now make everything use the getelementptr instead of the original
@@ -262,9 +263,9 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
         for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
           EraseInstFromFunction(*ToDelete[i]);
         Constant *TheSrc = cast<Constant>(Copy->getSource());
-        Instruction *NewI
-          = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
-                                                             AI.getType()));
+        Constant *Cast
+          = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
+        Instruction *NewI = ReplaceInstUsesWith(AI, Cast);
         EraseInstFromFunction(*Copy);
         ++NumGlobalCopies;
         return NewI;
@@ -302,9 +303,11 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
       if (ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
         if (Constant *CSrc = dyn_cast<Constant>(CastOp))
           if (ASrcTy->getNumElements() != 0) {
-            Value *Idxs[2];
-            Idxs[0] = Constant::getNullValue(Type::getInt32Ty(LI.getContext()));
-            Idxs[1] = Idxs[0];
+            Type *IdxTy = TD
+                        ? TD->getIntPtrType(SrcTy)
+                        : Type::getInt64Ty(SrcTy->getContext());
+            Value *Idx = Constant::getNullValue(IdxTy);
+            Value *Idxs[2] = { Idx, Idx };
             CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs);
             SrcTy = cast<PointerType>(CastOp->getType());
             SrcPTy = SrcTy->getElementType();
@@ -315,7 +318,8 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
             SrcPTy->isVectorTy()) &&
           // Do not allow turning this into a load of an integer, which is then
           // casted to a pointer, this pessimizes pointer analysis a lot.
-          (SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
+          (SrcPTy->isPtrOrPtrVectorTy() ==
+           LI.getType()->isPtrOrPtrVectorTy()) &&
           IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
                IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
 
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index ecc9fc3..a759548 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -95,6 +95,25 @@ static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) {
   return MulExt.slt(Min) || MulExt.sgt(Max);
 }
 
+/// \brief A helper routine of InstCombiner::visitMul().
+///
+/// If C is a vector of known powers of 2, then this function returns
+/// a new vector obtained from C replacing each element with its logBase2.
+/// Return a null pointer otherwise.
+static Constant *getLogBase2Vector(ConstantDataVector *CV) {
+  const APInt *IVal;
+  SmallVector<Constant *, 4> Elts;
+
+  for (unsigned I = 0, E = CV->getNumElements(); I != E; ++I) {
+    Constant *Elt = CV->getElementAsConstant(I);
+    if (!match(Elt, m_APInt(IVal)) || !IVal->isPowerOf2())
+      return 0;
+    Elts.push_back(ConstantInt::get(Elt->getType(), IVal->logBase2()));
+  }
+
+  return ConstantVector::get(Elts);
+}
+
 Instruction *InstCombiner::visitMul(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -108,24 +127,37 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
   if (match(Op1, m_AllOnes()))  // X * -1 == 0 - X
     return BinaryOperator::CreateNeg(Op0, I.getName());
 
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
-
-    // ((X << C1)*C2) == (X * (C2 << C1))
-    if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
-      if (SI->getOpcode() == Instruction::Shl)
-        if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
-          return BinaryOperator::CreateMul(SI->getOperand(0),
-                                           ConstantExpr::getShl(CI, ShOp));
-
-    const APInt &Val = CI->getValue();
-    if (Val.isPowerOf2()) {          // Replace X*(2^C) with X << C
-      Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2());
-      BinaryOperator *Shl = BinaryOperator::CreateShl(Op0, NewCst);
-      if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap();
-      if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap();
-      return Shl;
+  // Also allow combining multiply instructions on vectors.
+  {
+    Value *NewOp;
+    Constant *C1, *C2;
+    const APInt *IVal;
+    if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
+                        m_Constant(C1))) &&
+        match(C1, m_APInt(IVal)))
+      // ((X << C1)*C2) == (X * (C2 << C1))
+      return BinaryOperator::CreateMul(NewOp, ConstantExpr::getShl(C1, C2));
+
+    if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
+      Constant *NewCst = 0;
+      if (match(C1, m_APInt(IVal)) && IVal->isPowerOf2())
+        // Replace X*(2^C) with X << C, where C is either a scalar or a splat.
+        NewCst = ConstantInt::get(NewOp->getType(), IVal->logBase2());
+      else if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(C1))
+        // Replace X*(2^C) with X << C, where C is a vector of known
+        // constant powers of 2.
+        NewCst = getLogBase2Vector(CV);
+
+      if (NewCst) {
+        BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
+        if (I.hasNoSignedWrap()) Shl->setHasNoSignedWrap();
+        if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap();
+        return Shl;
+      }
     }
+  }
 
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
     // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
     { Value *X; ConstantInt *C1;
       if (Op0->hasOneUse() &&
@@ -306,13 +338,13 @@ static bool isFMulOrFDivWithConstant(Value *V) {
   if (C0 && C1)
     return false;
 
-  return (C0 && C0->getValueAPF().isNormal()) ||
-         (C1 && C1->getValueAPF().isNormal());
+  return (C0 && C0->getValueAPF().isFiniteNonZero()) ||
+         (C1 && C1->getValueAPF().isFiniteNonZero());
 }
 
 static bool isNormalFp(const ConstantFP *C) {
   const APFloat &Flt = C->getValueAPF();
-  return Flt.isNormal() && !Flt.isDenormal();
+  return Flt.isNormal();
 }
 
 /// foldFMulConst() is a helper routine of InstCombiner::visitFMul().
@@ -342,9 +374,12 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C,
   } else {
     if (C0) {
       // (C0 / X) * C => (C0 * C) / X
-      ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
-      if (isNormalFp(F))
-        R = BinaryOperator::CreateFDiv(F, Opnd1);
+      if (FMulOrDiv->hasOneUse()) {
+        // It would otherwise introduce another div.
+        ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFMul(C0, C));
+        if (isNormalFp(F))
+          R = BinaryOperator::CreateFDiv(F, Opnd1);
+      }
     } else {
       // (X / C1) * C => X * (C/C1) if C/C1 is not a denormal
       ConstantFP *F = cast<ConstantFP>(ConstantExpr::getFDiv(C, C1));
@@ -391,7 +426,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
         return NV;
 
     ConstantFP *C = dyn_cast<ConstantFP>(Op1);
-    if (C && AllowReassociate && C->getValueAPF().isNormal()) {
+    if (C && AllowReassociate && C->getValueAPF().isFiniteNonZero()) {
       // Let MDC denote an expression in one of these forms:
       // X * C, C/X, X/C, where C is a constant.
       //
@@ -418,7 +453,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
           Swap = true;
         }
 
-        if (C1 && C1->getValueAPF().isNormal() &&
+        if (C1 && C1->getValueAPF().isFiniteNonZero() &&
             isFMulOrFDivWithConstant(Opnd0)) {
           Value *M1 = ConstantExpr::getFMul(C1, C);
           Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ?
@@ -428,10 +463,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
             if (Swap && FAddSub->getOpcode() == Instruction::FSub)
               std::swap(M0, M1);
 
-            Value *R = (FAddSub->getOpcode() == Instruction::FAdd) ?
-                        BinaryOperator::CreateFAdd(M0, M1) :
-                        BinaryOperator::CreateFSub(M0, M1);
-            Instruction *RI = cast<Instruction>(R);
+            Instruction *RI = (FAddSub->getOpcode() == Instruction::FAdd)
+                                  ? BinaryOperator::CreateFAdd(M0, M1)
+                                  : BinaryOperator::CreateFSub(M0, M1);
             RI->copyFastMathFlags(&I);
             return RI;
           }
@@ -458,13 +492,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
     }
     // if pattern detected emit alternate sequence
     if (OpX && OpY) {
+      BuilderTy::FastMathFlagGuard Guard(*Builder);
+      Builder->SetFastMathFlags(Log2->getFastMathFlags());
       Log2->setArgOperand(0, OpY);
       Value *FMulVal = Builder->CreateFMul(OpX, Log2);
-      Instruction *FMul = cast<Instruction>(FMulVal);
-      FMul->copyFastMathFlags(Log2);
-      Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX);
-      FSub->copyFastMathFlags(Log2);
-      return FSub;
+      Value *FSub = Builder->CreateFSub(FMulVal, OpX);
+      FSub->takeName(&I);
+      return ReplaceInstUsesWith(I, FSub);
     }
   }
 
@@ -474,6 +508,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
   for (int i = 0; i < 2; i++) {
     bool IgnoreZeroSign = I.hasNoSignedZeros();
     if (BinaryOperator::isFNeg(Opnd0, IgnoreZeroSign)) {
+      BuilderTy::FastMathFlagGuard Guard(*Builder);
+      Builder->SetFastMathFlags(I.getFastMathFlags());
+
       Value *N0 = dyn_castFNegVal(Opnd0, IgnoreZeroSign);
       Value *N1 = dyn_castFNegVal(Opnd1, IgnoreZeroSign);
 
@@ -484,13 +521,9 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
       if (Opnd0->hasOneUse()) {
         // -X * Y => -(X*Y) (Promote negation as high as possible)
         Value *T = Builder->CreateFMul(N0, Opnd1);
-        cast<Instruction>(T)->setDebugLoc(I.getDebugLoc());
-        Instruction *Neg = BinaryOperator::CreateFNeg(T);
-        if (I.getFastMathFlags().any()) {
-          cast<Instruction>(T)->copyFastMathFlags(&I);
-          Neg->copyFastMathFlags(&I);
-        }
-        return Neg;
+        Value *Neg = Builder->CreateFNeg(T);
+        Neg->takeName(&I);
+        return ReplaceInstUsesWith(I, Neg);
       }
     }
 
@@ -513,13 +546,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
           Y = Opnd0_0;
 
         if (Y) {
-          Instruction *T = cast<Instruction>(Builder->CreateFMul(Opnd1, Opnd1));
-          T->copyFastMathFlags(&I);
-          T->setDebugLoc(I.getDebugLoc());
+          BuilderTy::FastMathFlagGuard Guard(*Builder);
+          Builder->SetFastMathFlags(I.getFastMathFlags());
+          Value *T = Builder->CreateFMul(Opnd1, Opnd1);
 
-          Instruction *R = BinaryOperator::CreateFMul(T, Y);
-          R->copyFastMathFlags(&I);
-          return R;
+          Value *R = Builder->CreateFMul(T, Y);
+          R->takeName(&I);
+          return ReplaceInstUsesWith(I, R);
         }
       }
     }
@@ -528,10 +561,10 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
     if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) {
       Value *LHS = Op0, *RHS = Op1;
       Value *B, *C;
-      if (!match(RHS, m_UIToFp(m_Value(C))))
+      if (!match(RHS, m_UIToFP(m_Value(C))))
         std::swap(LHS, RHS);
 
-      if (match(RHS, m_UIToFp(m_Value(C))) && C->getType()->isIntegerTy(1)) {
+      if (match(RHS, m_UIToFP(m_Value(C))) && C->getType()->isIntegerTy(1)) {
         B = LHS;
         Value *Zero = ConstantFP::getNegativeZero(B->getType());
         return SelectInst::Create(C, B, Zero);
@@ -542,10 +575,10 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
     if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) {
       Value *LHS = Op0, *RHS = Op1;
       Value *A, *C;
-      if (!match(RHS, m_FSub(m_FPOne(), m_UIToFp(m_Value(C)))))
+      if (!match(RHS, m_FSub(m_FPOne(), m_UIToFP(m_Value(C)))))
         std::swap(LHS, RHS);
 
-      if (match(RHS, m_FSub(m_FPOne(), m_UIToFp(m_Value(C)))) &&
+      if (match(RHS, m_FSub(m_FPOne(), m_UIToFP(m_Value(C)))) &&
           C->getType()->isIntegerTy(1)) {
         A = LHS;
         Value *Zero = ConstantFP::getNegativeZero(A->getType());
@@ -613,8 +646,7 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) {
         *I = SI->getOperand(NonNullOperand);
         Worklist.Add(BBI);
       } else if (*I == SelectCond) {
-        *I = NonNullOperand == 1 ? ConstantInt::getTrue(BBI->getContext()) :
-                                   ConstantInt::getFalse(BBI->getContext());
+        *I = Builder->getInt1(NonNullOperand == 1);
         Worklist.Add(BBI);
       }
     }
@@ -703,40 +735,124 @@ static Value *dyn_castZExtVal(Value *V, Type *Ty) {
   return 0;
 }
 
-Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
-  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+namespace {
+const unsigned MaxDepth = 6;
+typedef Instruction *(*FoldUDivOperandCb)(Value *Op0, Value *Op1,
+                                          const BinaryOperator &I,
+                                          InstCombiner &IC);
+
+/// \brief Used to maintain state for visitUDivOperand().
+struct UDivFoldAction {
+  FoldUDivOperandCb FoldAction; ///< Informs visitUDiv() how to fold this
+                                ///< operand.  This can be zero if this action
+                                ///< joins two actions together.
+
+  Value *OperandToFold;         ///< Which operand to fold.
+  union {
+    Instruction *FoldResult;    ///< The instruction returned when FoldAction is
+                                ///< invoked.
+
+    size_t SelectLHSIdx;        ///< Stores the LHS action index if this action
+                                ///< joins two actions together.
+  };
+
+  UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
+      : FoldAction(FA), OperandToFold(InputOperand), FoldResult(0) {}
+  UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
+      : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
+};
+}
 
-  if (Value *V = SimplifyUDivInst(Op0, Op1, TD))
-    return ReplaceInstUsesWith(I, V);
+// X udiv 2^C -> X >> C
+static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
+                                    const BinaryOperator &I, InstCombiner &IC) {
+  const APInt &C = cast<Constant>(Op1)->getUniqueInteger();
+  BinaryOperator *LShr = BinaryOperator::CreateLShr(
+      Op0, ConstantInt::get(Op0->getType(), C.logBase2()));
+  if (I.isExact()) LShr->setIsExact();
+  return LShr;
+}
 
-  // Handle the integer div common cases
-  if (Instruction *Common = commonIDivTransforms(I))
-    return Common;
+// X udiv C, where C >= signbit
+static Instruction *foldUDivNegCst(Value *Op0, Value *Op1,
+                                   const BinaryOperator &I, InstCombiner &IC) {
+  Value *ICI = IC.Builder->CreateICmpULT(Op0, cast<ConstantInt>(Op1));
 
-  {
-    // X udiv 2^C -> X >> C
-    // Check to see if this is an unsigned division with an exact power of 2,
-    // if so, convert to a right shift.
-    const APInt *C;
-    if (match(Op1, m_Power2(C))) {
-      BinaryOperator *LShr =
-      BinaryOperator::CreateLShr(Op0,
-                                 ConstantInt::get(Op0->getType(),
-                                                  C->logBase2()));
-      if (I.isExact()) LShr->setIsExact();
-      return LShr;
-    }
+  return SelectInst::Create(ICI, Constant::getNullValue(I.getType()),
+                            ConstantInt::get(I.getType(), 1));
+}
+
+// X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
+static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
+                                InstCombiner &IC) {
+  Instruction *ShiftLeft = cast<Instruction>(Op1);
+  if (isa<ZExtInst>(ShiftLeft))
+    ShiftLeft = cast<Instruction>(ShiftLeft->getOperand(0));
+
+  const APInt &CI =
+      cast<Constant>(ShiftLeft->getOperand(0))->getUniqueInteger();
+  Value *N = ShiftLeft->getOperand(1);
+  if (CI != 1)
+    N = IC.Builder->CreateAdd(N, ConstantInt::get(N->getType(), CI.logBase2()));
+  if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
+    N = IC.Builder->CreateZExt(N, Z->getDestTy());
+  BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
+  if (I.isExact()) LShr->setIsExact();
+  return LShr;
+}
+
+// \brief Recursively visits the possible right hand operands of a udiv
+// instruction, seeing through select instructions, to determine if we can
+// replace the udiv with something simpler.  If we find that an operand is not
+// able to simplify the udiv, we abort the entire transformation.
+static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
+                               SmallVectorImpl<UDivFoldAction> &Actions,
+                               unsigned Depth = 0) {
+  // Check to see if this is an unsigned division with an exact power of 2,
+  // if so, convert to a right shift.
+  if (match(Op1, m_Power2())) {
+    Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
+    return Actions.size();
   }
 
-  if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op1))
     // X udiv C, where C >= signbit
     if (C->getValue().isNegative()) {
-      Value *IC = Builder->CreateICmpULT(Op0, C);
-      return SelectInst::Create(IC, Constant::getNullValue(I.getType()),
-                                ConstantInt::get(I.getType(), 1));
+      Actions.push_back(UDivFoldAction(foldUDivNegCst, C));
+      return Actions.size();
     }
+
+  // X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
+  if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
+      match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
+    Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
+    return Actions.size();
   }
 
+  // The remaining tests are all recursive, so bail out if we hit the limit.
+  if (Depth++ == MaxDepth)
+    return 0;
+
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+    if (size_t LHSIdx = visitUDivOperand(Op0, SI->getOperand(1), I, Actions))
+      if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions)) {
+        Actions.push_back(UDivFoldAction((FoldUDivOperandCb)0, Op1, LHSIdx-1));
+        return Actions.size();
+      }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Value *V = SimplifyUDivInst(Op0, Op1, TD))
+    return ReplaceInstUsesWith(I, V);
+
+  // Handle the integer div common cases
+  if (Instruction *Common = commonIDivTransforms(I))
+    return Common;
+
   // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
   if (ConstantInt *C2 = dyn_cast<ConstantInt>(Op1)) {
     Value *X;
@@ -747,38 +863,6 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
     }
   }
 
-  // X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
-  { const APInt *CI; Value *N;
-    if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) ||
-        match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) {
-      if (*CI != 1)
-        N = Builder->CreateAdd(N,
-                               ConstantInt::get(N->getType(), CI->logBase2()));
-      if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
-        N = Builder->CreateZExt(N, Z->getDestTy());
-      if (I.isExact())
-        return BinaryOperator::CreateExactLShr(Op0, N);
-      return BinaryOperator::CreateLShr(Op0, N);
-    }
-  }
-
-  // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2)
-  // where C1&C2 are powers of two.
-  { Value *Cond; const APInt *C1, *C2;
-    if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) {
-      // Construct the "on true" case of the select
-      Value *TSI = Builder->CreateLShr(Op0, C1->logBase2(), Op1->getName()+".t",
-                                       I.isExact());
-
-      // Construct the "on false" case of the select
-      Value *FSI = Builder->CreateLShr(Op0, C2->logBase2(), Op1->getName()+".f",
-                                       I.isExact());
-
-      // construct the select instruction and return it.
-      return SelectInst::Create(Cond, TSI, FSI);
-    }
-  }
-
   // (zext A) udiv (zext B) --> zext (A udiv B)
   if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
     if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
@@ -786,6 +870,37 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
                                               I.isExact()),
                           I.getType());
 
+  // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
+  SmallVector<UDivFoldAction, 6> UDivActions;
+  if (visitUDivOperand(Op0, Op1, I, UDivActions))
+    for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
+      FoldUDivOperandCb Action = UDivActions[i].FoldAction;
+      Value *ActionOp1 = UDivActions[i].OperandToFold;
+      Instruction *Inst;
+      if (Action)
+        Inst = Action(Op0, ActionOp1, I, *this);
+      else {
+        // This action joins two actions together.  The RHS of this action is
+        // simply the last action we processed, we saved the LHS action index in
+        // the joining action.
+        size_t SelectRHSIdx = i - 1;
+        Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
+        size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
+        Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
+        Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(),
+                                  SelectLHS, SelectRHS);
+      }
+
+      // If this is the last action to process, return it to the InstCombiner.
+      // Otherwise, we insert it before the UDiv and record it so that we may
+      // use it as part of a joining action (i.e., a SelectInst).
+      if (e - i != 1) {
+        Inst->insertBefore(&I);
+        UDivActions[i].FoldResult = Inst;
+      } else
+        return Inst;
+    }
+
   return 0;
 }
 
@@ -846,7 +961,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
 /// FP value and:
 ///    1) 1/C is exact, or
 ///    2) reciprocal is allowed.
-/// If the convertion was successful, the simplified expression "X * 1/C" is
+/// If the conversion was successful, the simplified expression "X * 1/C" is
 /// returned; otherwise, NULL is returned.
 ///
 static Instruction *CvtFDivConstToReciprocal(Value *Dividend,
@@ -856,7 +971,7 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend,
   APFloat Reciprocal(FpVal.getSemantics());
   bool Cvt = FpVal.getExactInverse(&Reciprocal);
 
-  if (!Cvt && AllowReciprocal && FpVal.isNormal()) {
+  if (!Cvt && AllowReciprocal && FpVal.isFiniteNonZero()) {
     Reciprocal = APFloat(FpVal.getSemantics(), 1.0f);
     (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven);
     Cvt = !Reciprocal.isDenormal();
@@ -876,10 +991,19 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
   if (Value *V = SimplifyFDivInst(Op0, Op1, TD))
     return ReplaceInstUsesWith(I, V);
 
+  if (isa<Constant>(Op0))
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
   bool AllowReassociate = I.hasUnsafeAlgebra();
   bool AllowReciprocal = I.hasAllowReciprocal();
 
   if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI))
+        return R;
+
     if (AllowReassociate) {
       ConstantFP *C1 = 0;
       ConstantFP *C2 = Op1C;
@@ -891,14 +1015,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
         //
         Constant *C = ConstantExpr::getFDiv(C1, C2);
         const APFloat &F = cast<ConstantFP>(C)->getValueAPF();
-        if (F.isNormal() && !F.isDenormal())
+        if (F.isNormal())
           Res = BinaryOperator::CreateFMul(X, C);
       } else if (match(Op0, m_FDiv(m_Value(X), m_ConstantFP(C1)))) {
         // (X/C1)/C2 => X /(C2*C1) [=> X * 1/(C2*C1) if reciprocal is allowed]
         //
         Constant *C = ConstantExpr::getFMul(C1, C2);
         const APFloat &F = cast<ConstantFP>(C)->getValueAPF();
-        if (F.isNormal() && !F.isDenormal()) {
+        if (F.isNormal()) {
           Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C),
                                          AllowReciprocal);
           if (!Res)
@@ -939,7 +1063,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
 
     if (Fold) {
       const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF();
-      if (FoldC.isNormal() && !FoldC.isDenormal()) {
+      if (FoldC.isNormal()) {
         Instruction *R = CreateDiv ?
                          BinaryOperator::CreateFDiv(Fold, X) :
                          BinaryOperator::CreateFMul(X, Fold);
@@ -1027,37 +1151,26 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) {
   if (Instruction *common = commonIRemTransforms(I))
     return common;
 
-  // X urem C^2 -> X and C-1
-  { const APInt *C;
-    if (match(Op1, m_Power2(C)))
-      return BinaryOperator::CreateAnd(Op0,
-                                       ConstantInt::get(I.getType(), *C-1));
-  }
+  // (zext A) urem (zext B) --> zext (A urem B)
+  if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
+    if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
+      return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1),
+                          I.getType());
 
-  // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)
-  if (match(Op1, m_Shl(m_Power2(), m_Value()))) {
+  // X urem Y -> X and Y-1, where Y is a power of 2,
+  if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true)) {
     Constant *N1 = Constant::getAllOnesValue(I.getType());
     Value *Add = Builder->CreateAdd(Op1, N1);
     return BinaryOperator::CreateAnd(Op0, Add);
   }
 
-  // urem X, (select Cond, 2^C1, 2^C2) -->
-  //    select Cond, (and X, C1-1), (and X, C2-1)
-  // when C1&C2 are powers of two.
-  { Value *Cond; const APInt *C1, *C2;
-    if (match(Op1, m_Select(m_Value(Cond), m_Power2(C1), m_Power2(C2)))) {
-      Value *TrueAnd = Builder->CreateAnd(Op0, *C1-1, Op1->getName()+".t");
-      Value *FalseAnd = Builder->CreateAnd(Op0, *C2-1, Op1->getName()+".f");
-      return SelectInst::Create(Cond, TrueAnd, FalseAnd);
-    }
+  // 1 urem X -> zext(X != 1)
+  if (match(Op0, m_One())) {
+    Value *Cmp = Builder->CreateICmpNE(Op1, Op0);
+    Value *Ext = Builder->CreateZExt(Cmp, I.getType());
+    return ReplaceInstUsesWith(I, Ext);
   }
 
-  // (zext A) urem (zext B) --> zext (A urem B)
-  if (ZExtInst *ZOp0 = dyn_cast<ZExtInst>(Op0))
-    if (Value *ZOp1 = dyn_castZExtVal(Op1, ZOp0->getSrcTy()))
-      return new ZExtInst(Builder->CreateURem(ZOp0->getOperand(0), ZOp1),
-                          I.getType());
-
   return 0;
 }
 
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index bd14e81..4c6d0c4 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -604,8 +604,6 @@ namespace llvm {
              LHS.Width == RHS.Width;
     }
   };
-  template <>
-  struct isPodLike<LoweredPHIRecord> { static const bool value = true; };
 }
 
 
@@ -688,10 +686,10 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
   // extracted out of it.  First, sort the users by their offset and size.
   array_pod_sort(PHIUsers.begin(), PHIUsers.end());
 
-  DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n';
-            for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
-              errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n';
-        );
+  DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+        for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+          dbgs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';
+    );
 
   // PredValues - This is a temporary used when rewriting PHI nodes.  It is
   // hoisted out here to avoid construction/destruction thrashing.
@@ -772,7 +770,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
       }
       PredValues.clear();
 
-      DEBUG(errs() << "  Made element PHI for offset " << Offset << ": "
+      DEBUG(dbgs() << "  Made element PHI for offset " << Offset << ": "
                    << *EltPHI << '\n');
       ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
     }
@@ -792,7 +790,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
 // PHINode simplification
 //
 Instruction *InstCombiner::visitPHINode(PHINode &PN) {
-  if (Value *V = SimplifyInstruction(&PN, TD))
+  if (Value *V = SimplifyInstruction(&PN, TD, TLI))
     return ReplaceInstUsesWith(PN, V);
 
   // If all PHI operands are the same operation, pull them through the PHI,
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 59502fb..283bec2 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -367,7 +367,7 @@ static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal,
                                   Value *FalseVal,
                                   InstCombiner::BuilderTy *Builder) {
   const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
-  if (!IC || !IC->isEquality())
+  if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy())
     return 0;
 
   Value *CmpLHS = IC->getOperand(0);
@@ -662,7 +662,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
                                 ConstantInt *FalseVal,
                                 InstCombiner::BuilderTy *Builder) {
   const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition());
-  if (!IC || !IC->isEquality())
+  if (!IC || !IC->isEquality() || !SI.getType()->isIntegerTy())
     return 0;
 
   if (!match(IC->getOperand(1), m_Zero()))
@@ -670,8 +670,7 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
 
   ConstantInt *AndRHS;
   Value *LHS = IC->getOperand(0);
-  if (LHS->getType() != SI.getType() ||
-      !match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS))))
+  if (!match(LHS, m_And(m_Value(), m_ConstantInt(AndRHS))))
     return 0;
 
   // If both select arms are non-zero see if we have a select of the form
@@ -705,7 +704,13 @@ static Value *foldSelectICmpAnd(const SelectInst &SI, ConstantInt *TrueVal,
   unsigned ValZeros = ValC->getValue().logBase2();
   unsigned AndZeros = AndRHS->getValue().logBase2();
 
-  Value *V = LHS;
+  // If types don't match we can still convert the select by introducing a zext
+  // or a trunc of the 'and'. The trunc case requires that all of the truncated
+  // bits are zero, we can figure that out by looking at the 'and' mask.
+  if (AndZeros >= ValC->getBitWidth())
+    return 0;
+
+  Value *V = Builder->CreateZExtOrTrunc(LHS, SI.getType());
   if (ValZeros > AndZeros)
     V = Builder->CreateShl(V, ValZeros - AndZeros);
   else if (ValZeros < AndZeros)
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 8add1ea..c831ddd 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -754,7 +754,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
       // If it's known zero, our sign bit is also zero.
       if (LHSKnownZero.isNegative())
-        KnownZero |= LHSKnownZero;
+        KnownZero.setBit(KnownZero.getBitWidth() - 1);
     }
     break;
   case Instruction::URem: {
@@ -808,7 +808,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         // TODO: Could compute known zero/one bits based on the input.
         break;
       }
-      case Intrinsic::x86_sse42_crc32_64_8:
       case Intrinsic::x86_sse42_crc32_64_64:
         KnownZero = APInt::getHighBitsSet(64, 32);
         return 0;
@@ -845,21 +844,26 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
 Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
   Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {
 
-  unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue();
-  unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue();
+  const APInt &ShlOp1 = cast<ConstantInt>(Shl->getOperand(1))->getValue();
+  const APInt &ShrOp1 = cast<ConstantInt>(Shr->getOperand(1))->getValue();
+  if (!ShlOp1 || !ShrOp1)
+      return 0; // Noop.
+
+  Value *VarX = Shr->getOperand(0);
+  Type *Ty = VarX->getType();
+  unsigned BitWidth = Ty->getIntegerBitWidth();
+  if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
+    return 0; // Undef.
+
+  unsigned ShlAmt = ShlOp1.getZExtValue();
+  unsigned ShrAmt = ShrOp1.getZExtValue();
 
   KnownOne.clearAllBits();
   KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
   KnownZero &= DemandedMask;
 
-  if (ShlAmt == 0 || ShrAmt == 0)
-    return 0;
-
-  Value *VarX = Shr->getOperand(0);
-  Type *Ty = VarX->getType();
-
-  APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
-  APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth()));
+  APInt BitMask1(APInt::getAllOnesValue(BitWidth));
+  APInt BitMask2(APInt::getAllOnesValue(BitWidth));
 
   bool isLshr = (Shr->getOpcode() == Instruction::LShr);
   BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 4301ddb..1e72410 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -106,8 +106,8 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) {
 }
 
 // If we have a PHI node with a vector type that has only 2 uses: feed
-// itself and be an operand of extractelemnt at a constant location,
-// try to replace the PHI of the vector type with a PHI of a scalar type
+// itself and be an operand of extractelement at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type.
 Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
   // Verify that the PHI node has exactly 2 uses. Otherwise return NULL.
   if (!PN->hasNUses(2))
@@ -125,17 +125,15 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
   // and that it is a binary operation which is cheap to scalarize.
   // otherwise return NULL.
   if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) ||
-    !(isa<BinaryOperator>(PHIUser)) ||
-    !CheapToScalarize(PHIUser, true))
+      !(isa<BinaryOperator>(PHIUser)) || !CheapToScalarize(PHIUser, true))
     return NULL;
 
   // Create a scalar PHI node that will replace the vector PHI node
   // just before the current PHI node.
-  PHINode * scalarPHI = cast<PHINode>(
-    InsertNewInstWith(PHINode::Create(EI.getType(),
-    PN->getNumIncomingValues(), ""), *PN));
+  PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
+      PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
   // Scalarize each PHI operand.
-  for (unsigned i=0; i < PN->getNumIncomingValues(); i++) {
+  for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
     Value *PHIInVal = PN->getIncomingValue(i);
     BasicBlock *inBB = PN->getIncomingBlock(i);
     Value *Elt = EI.getIndexOperand();
@@ -145,17 +143,17 @@ Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) {
       // scalar PHI and the second operand is extracted from the other
       // vector operand.
       BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
-      unsigned opId = (B0->getOperand(0) == PN) ? 1: 0;
-      Value *Op = Builder->CreateExtractElement(
-        B0->getOperand(opId), Elt, B0->getOperand(opId)->getName()+".Elt");
+      unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
+      Value *Op = InsertNewInstWith(
+          ExtractElementInst::Create(B0->getOperand(opId), Elt,
+                                     B0->getOperand(opId)->getName() + ".Elt"),
+          *B0);
       Value *newPHIUser = InsertNewInstWith(
-        BinaryOperator::Create(B0->getOpcode(), scalarPHI,Op),
-        *B0);
+          BinaryOperator::Create(B0->getOpcode(), scalarPHI, Op), *B0);
       scalarPHI->addIncoming(newPHIUser, inBB);
     } else {
       // Scalarize PHI input:
-      Instruction *newEI =
-        ExtractElementInst::Create(PHIInVal, Elt, "");
+      Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
       // Insert the new instruction into the predecessor basic block.
       Instruction *pos = dyn_cast<Instruction>(PHIInVal);
       BasicBlock::iterator InsertPos;
@@ -224,7 +222,7 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
     if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) {
       Instruction *scalarPHI = scalarizePHI(EI, PN);
       if (scalarPHI)
-        return (scalarPHI);
+        return scalarPHI;
     }
   }
 
@@ -284,6 +282,38 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
         Worklist.AddValue(EE);
         return CastInst::Create(CI->getOpcode(), EE, EI.getType());
       }
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+      if (SI->hasOneUse()) {
+        // TODO: For a select on vectors, it might be useful to do this if it
+        // has multiple extractelement uses. For vector select, that seems to
+        // fight the vectorizer.
+
+        // If we are extracting an element from a vector select or a select on
+        // vectors, a select on the scalars extracted from the vector arguments.
+        Value *TrueVal = SI->getTrueValue();
+        Value *FalseVal = SI->getFalseValue();
+
+        Value *Cond = SI->getCondition();
+        if (Cond->getType()->isVectorTy()) {
+          Cond = Builder->CreateExtractElement(Cond,
+                                               EI.getIndexOperand(),
+                                               Cond->getName() + ".elt");
+        }
+
+        Value *V1Elem
+          = Builder->CreateExtractElement(TrueVal,
+                                          EI.getIndexOperand(),
+                                          TrueVal->getName() + ".elt");
+
+        Value *V2Elem
+          = Builder->CreateExtractElement(FalseVal,
+                                          EI.getIndexOperand(),
+                                          FalseVal->getName() + ".elt");
+        return SelectInst::Create(Cond,
+                                  V1Elem,
+                                  V2Elem,
+                                  SI->getName() + ".elt");
+      }
     }
   }
   return 0;
@@ -296,7 +326,7 @@ static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
                                          SmallVectorImpl<Constant*> &Mask) {
   assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
          "Invalid CollectSingleShuffleElements");
-  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+  unsigned NumElts = V->getType()->getVectorNumElements();
 
   if (isa<UndefValue>(V)) {
     Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
@@ -496,6 +526,254 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
   return 0;
 }
 
+/// Return true if we can evaluate the specified expression tree if the vector
+/// elements were shuffled in a different order.
+static bool CanEvaluateShuffled(Value *V, ArrayRef<int> Mask,
+                                unsigned Depth = 5) {
+  // We can always reorder the elements of a constant.
+  if (isa<Constant>(V))
+    return true;
+
+  // We won't reorder vector arguments. No IPO here.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+
+  // Two users may expect different orders of the elements. Don't try it.
+  if (!I->hasOneUse())
+    return false;
+
+  if (Depth == 0) return false;
+
+  switch (I->getOpcode()) {
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::GetElementPtr: {
+      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
+        if (!CanEvaluateShuffled(I->getOperand(i), Mask, Depth-1))
+          return false;
+      }
+      return true;
+    }
+    case Instruction::InsertElement: {
+      ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
+      if (!CI) return false;
+      int ElementNumber = CI->getLimitedValue();
+
+      // Verify that 'CI' does not occur twice in Mask. A single 'insertelement'
+      // can't put an element into multiple indices.
+      bool SeenOnce = false;
+      for (int i = 0, e = Mask.size(); i != e; ++i) {
+        if (Mask[i] == ElementNumber) {
+          if (SeenOnce)
+            return false;
+          SeenOnce = true;
+        }
+      }
+      return CanEvaluateShuffled(I->getOperand(0), Mask, Depth-1);
+    }
+  }
+  return false;
+}
+
+/// Rebuild a new instruction just like 'I' but with the new operands given.
+/// In the event of type mismatch, the type of the operands is correct.
+static Value *BuildNew(Instruction *I, ArrayRef<Value*> NewOps) {
+  // We don't want to use the IRBuilder here because we want the replacement
+  // instructions to appear next to 'I', not the builder's insertion point.
+  switch (I->getOpcode()) {
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      BinaryOperator *BO = cast<BinaryOperator>(I);
+      assert(NewOps.size() == 2 && "binary operator with #ops != 2");
+      BinaryOperator *New =
+          BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
+                                 NewOps[0], NewOps[1], "", BO);
+      if (isa<OverflowingBinaryOperator>(BO)) {
+        New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
+        New->setHasNoSignedWrap(BO->hasNoSignedWrap());
+      }
+      if (isa<PossiblyExactOperator>(BO)) {
+        New->setIsExact(BO->isExact());
+      }
+      return New;
+    }
+    case Instruction::ICmp:
+      assert(NewOps.size() == 2 && "icmp with #ops != 2");
+      return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
+                          NewOps[0], NewOps[1]);
+    case Instruction::FCmp:
+      assert(NewOps.size() == 2 && "fcmp with #ops != 2");
+      return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
+                          NewOps[0], NewOps[1]);
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt: {
+      // It's possible that the mask has a different number of elements from
+      // the original cast. We recompute the destination type to match the mask.
+      Type *DestTy =
+          VectorType::get(I->getType()->getScalarType(),
+                          NewOps[0]->getType()->getVectorNumElements());
+      assert(NewOps.size() == 1 && "cast with #ops != 1");
+      return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
+                              "", I);
+    }
+    case Instruction::GetElementPtr: {
+      Value *Ptr = NewOps[0];
+      ArrayRef<Value*> Idx = NewOps.slice(1);
+      GetElementPtrInst *GEP = GetElementPtrInst::Create(Ptr, Idx, "", I);
+      GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
+      return GEP;
+    }
+  }
+  llvm_unreachable("failed to rebuild vector instructions");
+}
+
+Value *
+InstCombiner::EvaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
+  // Mask.size() does not need to be equal to the number of vector elements.
+
+  assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
+  if (isa<UndefValue>(V)) {
+    return UndefValue::get(VectorType::get(V->getType()->getScalarType(),
+                                           Mask.size()));
+  }
+  if (isa<ConstantAggregateZero>(V)) {
+    return ConstantAggregateZero::get(
+               VectorType::get(V->getType()->getScalarType(),
+                               Mask.size()));
+  }
+  if (Constant *C = dyn_cast<Constant>(V)) {
+    SmallVector<Constant *, 16> MaskValues;
+    for (int i = 0, e = Mask.size(); i != e; ++i) {
+      if (Mask[i] == -1)
+        MaskValues.push_back(UndefValue::get(Builder->getInt32Ty()));
+      else
+        MaskValues.push_back(Builder->getInt32(Mask[i]));
+    }
+    return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
+                                          ConstantVector::get(MaskValues));
+  }
+
+  Instruction *I = cast<Instruction>(V);
+  switch (I->getOpcode()) {
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::FCmp:
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::Select:
+    case Instruction::GetElementPtr: {
+      SmallVector<Value*, 8> NewOps;
+      bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements());
+      for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
+        Value *V = EvaluateInDifferentElementOrder(I->getOperand(i), Mask);
+        NewOps.push_back(V);
+        NeedsRebuild |= (V != I->getOperand(i));
+      }
+      if (NeedsRebuild) {
+        return BuildNew(I, NewOps);
+      }
+      return I;
+    }
+    case Instruction::InsertElement: {
+      int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
+
+      // The insertelement was inserting at Element. Figure out which element
+      // that becomes after shuffling. The answer is guaranteed to be unique
+      // by CanEvaluateShuffled.
+      bool Found = false;
+      int Index = 0;
+      for (int e = Mask.size(); Index != e; ++Index) {
+        if (Mask[Index] == Element) {
+          Found = true;
+          break;
+        }
+      }
+
+      if (!Found)
+        return UndefValue::get(
+            VectorType::get(V->getType()->getScalarType(), Mask.size()));
+
+      Value *V = EvaluateInDifferentElementOrder(I->getOperand(0), Mask);
+      return InsertElementInst::Create(V, I->getOperand(1),
+                                       Builder->getInt32(Index), "", I);
+    }
+  }
+  llvm_unreachable("failed to reorder elements of vector instruction!");
+}
 
 Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   Value *LHS = SVI.getOperand(0);
@@ -527,9 +805,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   if (LHS == RHS || isa<UndefValue>(LHS)) {
     if (isa<UndefValue>(LHS) && LHS == RHS) {
       // shuffle(undef,undef,mask) -> undef.
-      Value* result = (VWidth == LHSWidth)
+      Value *Result = (VWidth == LHSWidth)
                       ? LHS : UndefValue::get(SVI.getType());
-      return ReplaceInstUsesWith(SVI, result);
+      return ReplaceInstUsesWith(SVI, Result);
     }
 
     // Remap any references to RHS to use LHS.
@@ -576,6 +854,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
     if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
   }
 
+  if (isa<UndefValue>(RHS) && CanEvaluateShuffled(LHS, Mask)) {
+    Value *V = EvaluateInDifferentElementOrder(LHS, Mask);
+    return ReplaceInstUsesWith(SVI, V);
+  }
+
   // If the LHS is a shufflevector itself, see if we can combine it with this
   // one without producing an unusual shuffle.
   // Cases that might be simplified:
diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
index 49efce5..f84db27 100644
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -1,4 +1,4 @@
-//===- InstCombineWorklist.h - Worklist for the InstCombine pass ----------===//
+//===- InstCombineWorklist.h - Worklist for InstCombine pass ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -37,7 +37,7 @@ public:
   /// in it.
   void Add(Instruction *I) {
     if (WorklistMap.insert(std::make_pair(I, Worklist.size())).second) {
-      DEBUG(errs() << "IC: ADD: " << *I << '\n');
+      DEBUG(dbgs() << "IC: ADD: " << *I << '\n');
       Worklist.push_back(I);
     }
   }
@@ -54,7 +54,7 @@ public:
     assert(Worklist.empty() && "Worklist must be empty to add initial group");
     Worklist.reserve(NumEntries+16);
     WorklistMap.resize(NumEntries);
-    DEBUG(errs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
+    DEBUG(dbgs() << "IC: ADDING: " << NumEntries << " instrs to worklist\n");
     for (unsigned Idx = 0; NumEntries; --NumEntries) {
       Instruction *I = List[NumEntries-1];
       WorklistMap.insert(std::make_pair(I, Idx++));
@@ -74,8 +74,7 @@ public:
   }
 
   Instruction *RemoveOne() {
-    Instruction *I = Worklist.back();
-    Worklist.pop_back();
+    Instruction *I = Worklist.pop_back_val();
     WorklistMap.erase(I);
     return I;
   }
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index ec10751..191a101 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -699,7 +699,10 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
       Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
       Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
       Value *InV = 0;
-      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+      // Beware of ConstantExpr:  it may eventually evaluate to getNullValue,
+      // even if currently isNullValue gives false.
+      Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
+      if (InC && !isa<ConstantExpr>(InC))
         InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
       else
         InV = Builder->CreateSelect(PN->getIncomingValue(i),
@@ -755,19 +758,25 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
   return ReplaceInstUsesWith(I, NewPN);
 }
 
-/// FindElementAtOffset - Given a type and a constant offset, determine whether
-/// or not there is a sequence of GEP indices into the type that will land us at
-/// the specified offset.  If so, fill them into NewIndices and return the
-/// resultant element type, otherwise return null.
-Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset,
-                                          SmallVectorImpl<Value*> &NewIndices) {
-  if (!TD) return 0;
-  if (!Ty->isSized()) return 0;
+/// FindElementAtOffset - Given a pointer type and a constant offset, determine
+/// whether or not there is a sequence of GEP indices into the pointed type that
+/// will land us at the specified offset.  If so, fill them into NewIndices and
+/// return the resultant element type, otherwise return null.
+Type *InstCombiner::FindElementAtOffset(Type *PtrTy, int64_t Offset,
+                                        SmallVectorImpl<Value*> &NewIndices) {
+  assert(PtrTy->isPtrOrPtrVectorTy());
+
+  if (!TD)
+    return 0;
+
+  Type *Ty = PtrTy->getPointerElementType();
+  if (!Ty->isSized())
+    return 0;
 
   // Start with the index over the outer type.  Note that the type size
   // might be zero (even if the offset isn't zero) if the indexed type
   // is something like [0 x {int, int}]
-  Type *IntPtrTy = TD->getIntPtrType(Ty->getContext());
+  Type *IntPtrTy = TD->getIntPtrType(PtrTy);
   int64_t FirstIdx = 0;
   if (int64_t TySize = TD->getTypeAllocSize(Ty)) {
     FirstIdx = Offset/TySize;
@@ -1176,6 +1185,22 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
         GetElementPtrInst::Create(Src->getOperand(0), Indices, GEP.getName());
   }
 
+  // Canonicalize (gep i8* X, -(ptrtoint Y)) to (sub (ptrtoint X), (ptrtoint Y))
+  // The GEP pattern is emitted by the SCEV expander for certain kinds of
+  // pointer arithmetic.
+  if (TD && GEP.getNumIndices() == 1 &&
+      match(GEP.getOperand(1), m_Neg(m_PtrToInt(m_Value())))) {
+    unsigned AS = GEP.getPointerAddressSpace();
+    if (GEP.getType() == Builder->getInt8PtrTy(AS) &&
+        GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
+        TD->getPointerSizeInBits(AS)) {
+      Operator *Index = cast<Operator>(GEP.getOperand(1));
+      Value *PtrToInt = Builder->CreatePtrToInt(PtrOp, Index->getType());
+      Value *NewSub = Builder->CreateSub(PtrToInt, Index->getOperand(1));
+      return CastInst::Create(Instruction::IntToPtr, NewSub, GEP.getType());
+    }
+  }
+
   // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
   Value *StrippedPtr = PtrOp->stripPointerCasts();
   PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType());
@@ -1231,13 +1256,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
       // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
       // into:  %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
       Type *SrcElTy = StrippedPtrTy->getElementType();
-      Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+      Type *ResElTy = PtrOp->getType()->getPointerElementType();
       if (TD && SrcElTy->isArrayTy() &&
-          TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+          TD->getTypeAllocSize(SrcElTy->getArrayElementType()) ==
           TD->getTypeAllocSize(ResElTy)) {
-        Value *Idx[2];
-        Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
-        Idx[1] = GEP.getOperand(1);
+        Type *IdxType = TD->getIntPtrType(GEP.getType());
+        Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
         Value *NewGEP = GEP.isInBounds() ?
           Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()) :
           Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
@@ -1261,7 +1285,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
           // Earlier transforms ensure that the index has type IntPtrType, which
           // considerably simplifies the logic by eliminating implicit casts.
-          assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+          assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) &&
                  "Index not cast to pointer width?");
 
           bool NSW;
@@ -1287,8 +1311,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
         // Check that changing to the array element type amounts to dividing the
         // index by a scale factor.
         uint64_t ResSize = TD->getTypeAllocSize(ResElTy);
-        uint64_t ArrayEltSize =
-          TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
+        uint64_t ArrayEltSize
+          = TD->getTypeAllocSize(SrcElTy->getArrayElementType());
         if (ResSize && ArrayEltSize % ResSize == 0) {
           Value *Idx = GEP.getOperand(1);
           unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
@@ -1296,7 +1320,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
 
           // Earlier transforms ensure that the index has type IntPtrType, which
           // considerably simplifies the logic by eliminating implicit casts.
-          assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+          assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) &&
                  "Index not cast to pointer width?");
 
           bool NSW;
@@ -1304,9 +1328,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
             // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
             // If the multiplication NewIdx * Scale may overflow then the new
             // GEP may not be "inbounds".
-            Value *Off[2];
-            Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
-            Off[1] = NewIdx;
+            Value *Off[2] = {
+              Constant::getNullValue(TD->getIntPtrType(GEP.getType())),
+              NewIdx
+            };
+
             Value *NewGEP = GEP.isInBounds() && NSW ?
               Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) :
               Builder->CreateGEP(StrippedPtr, Off, GEP.getName());
@@ -1318,15 +1344,20 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     }
   }
 
+  if (!TD)
+    return 0;
+
   /// See if we can simplify:
   ///   X = bitcast A* to B*
   ///   Y = gep X, <...constant indices...>
   /// into a gep of the original struct.  This is important for SROA and alias
   /// analysis of unions.  If "A" is also a bitcast, wait for A/X to be merged.
   if (BitCastInst *BCI = dyn_cast<BitCastInst>(PtrOp)) {
-    APInt Offset(TD ? TD->getPointerSizeInBits() : 1, 0);
-    if (TD &&
-        !isa<BitCastInst>(BCI->getOperand(0)) &&
+    Value *Operand = BCI->getOperand(0);
+    PointerType *OpType = cast<PointerType>(Operand->getType());
+    unsigned OffsetBits = TD->getPointerTypeSizeInBits(OpType);
+    APInt Offset(OffsetBits, 0);
+    if (!isa<BitCastInst>(Operand) &&
         GEP.accumulateConstantOffset(*TD, Offset) &&
         StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) {
 
@@ -1335,8 +1366,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
       if (!Offset) {
         // If the bitcast is of an allocation, and the allocation will be
         // converted to match the type of the cast, don't touch this.
-        if (isa<AllocaInst>(BCI->getOperand(0)) ||
-            isAllocationFn(BCI->getOperand(0), TLI)) {
+        if (isa<AllocaInst>(Operand) || isAllocationFn(Operand, TLI)) {
           // See if the bitcast simplifies, if so, don't nuke this GEP yet.
           if (Instruction *I = visitBitCast(*BCI)) {
             if (I != BCI) {
@@ -1347,19 +1377,17 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
             return &GEP;
           }
         }
-        return new BitCastInst(BCI->getOperand(0), GEP.getType());
+        return new BitCastInst(Operand, GEP.getType());
       }
 
       // Otherwise, if the offset is non-zero, we need to find out if there is a
       // field at Offset in 'A's type.  If so, we can pull the cast through the
       // GEP.
       SmallVector<Value*, 8> NewIndices;
-      Type *InTy =
-        cast<PointerType>(BCI->getOperand(0)->getType())->getElementType();
-      if (FindElementAtOffset(InTy, Offset.getSExtValue(), NewIndices)) {
+      if (FindElementAtOffset(OpType, Offset.getSExtValue(), NewIndices)) {
         Value *NGEP = GEP.isInBounds() ?
-          Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) :
-          Builder->CreateGEP(BCI->getOperand(0), NewIndices);
+          Builder->CreateInBoundsGEP(Operand, NewIndices) :
+          Builder->CreateGEP(Operand, NewIndices);
 
         if (NGEP->getType() == GEP.getType())
           return ReplaceInstUsesWith(GEP, NGEP);
@@ -1372,8 +1400,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   return 0;
 }
 
-
-
 static bool
 isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
                      const TargetLibraryInfo *TLI) {
@@ -2042,7 +2068,7 @@ Instruction *InstCombiner::visitLandingPadInst(LandingPadInst &LI) {
         continue;
       // If Filter is a subset of LFilter, i.e. every element of Filter is also
       // an element of LFilter, then discard LFilter.
-      SmallVector<Value *, 16>::iterator J = NewClauses.begin() + j;
+      SmallVectorImpl<Value *>::iterator J = NewClauses.begin() + j;
       // If Filter is empty then it is a subset of LFilter.
       if (!FElts) {
         // Discard LFilter.
@@ -2209,7 +2235,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
       // DCE instruction if trivially dead.
       if (isInstructionTriviallyDead(Inst, TLI)) {
         ++NumDeadInst;
-        DEBUG(errs() << "IC: DCE: " << *Inst << '\n');
+        DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
         Inst->eraseFromParent();
         continue;
       }
@@ -2217,7 +2243,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
       // ConstantProp instruction if trivially constant.
       if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0)))
         if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) {
-          DEBUG(errs() << "IC: ConstFold to: " << *C << " from: "
+          DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: "
                        << *Inst << '\n');
           Inst->replaceAllUsesWith(C);
           ++NumConstProp;
@@ -2293,7 +2319,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
 bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
   MadeIRChange = false;
 
-  DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+  DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
                << F.getName() << "\n");
 
   {
@@ -2338,7 +2364,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
 
     // Check to see if we can DCE the instruction.
     if (isInstructionTriviallyDead(I, TLI)) {
-      DEBUG(errs() << "IC: DCE: " << *I << '\n');
+      DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
       EraseInstFromFunction(*I);
       ++NumDeadInst;
       MadeIRChange = true;
@@ -2348,7 +2374,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
     // Instruction isn't dead, see if we can constant propagate it.
     if (!I->use_empty() && isa<Constant>(I->getOperand(0)))
       if (Constant *C = ConstantFoldInstruction(I, TD, TLI)) {
-        DEBUG(errs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
+        DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I << '\n');
 
         // Add operands to the worklist.
         ReplaceInstUsesWith(*I, C);
@@ -2396,13 +2422,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
     std::string OrigI;
 #endif
     DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
-    DEBUG(errs() << "IC: Visiting: " << OrigI << '\n');
+    DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
 
     if (Instruction *Result = visit(*I)) {
       ++NumCombined;
       // Should we replace the old instruction with a new one?
       if (Result != I) {
-        DEBUG(errs() << "IC: Old = " << *I << '\n'
+        DEBUG(dbgs() << "IC: Old = " << *I << '\n'
                      << "    New = " << *Result << '\n');
 
         if (!I->getDebugLoc().isUnknown())
@@ -2431,7 +2457,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
         EraseInstFromFunction(*I);
       } else {
 #ifndef NDEBUG
-        DEBUG(errs() << "IC: Mod = " << OrigI << '\n'
+        DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
                      << "    New = " << *I << '\n');
 #endif
 
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 623c470..d731ec5 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/DIBuilder.h"
@@ -39,13 +40,14 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/BlackList.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/SpecialCaseList.h"
 #include <algorithm>
 #include <string>
 
@@ -56,36 +58,49 @@ static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
 static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
 static const uint64_t kDefaultShort64bitShadowOffset = 0x7FFF8000;  // < 2G.
 static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
+static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa8000;
 
+static const size_t kMinStackMallocSize = 1 << 6;  // 64B
 static const size_t kMaxStackMallocSize = 1 << 16;  // 64K
 static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
 static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
 
-static const char *kAsanModuleCtorName = "asan.module_ctor";
-static const char *kAsanModuleDtorName = "asan.module_dtor";
-static const int   kAsanCtorAndCtorPriority = 1;
-static const char *kAsanReportErrorTemplate = "__asan_report_";
-static const char *kAsanReportLoadN = "__asan_report_load_n";
-static const char *kAsanReportStoreN = "__asan_report_store_n";
-static const char *kAsanRegisterGlobalsName = "__asan_register_globals";
-static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals";
-static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
-static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
-static const char *kAsanInitName = "__asan_init_v3";
-static const char *kAsanHandleNoReturnName = "__asan_handle_no_return";
-static const char *kAsanMappingOffsetName = "__asan_mapping_offset";
-static const char *kAsanMappingScaleName = "__asan_mapping_scale";
-static const char *kAsanStackMallocName = "__asan_stack_malloc";
-static const char *kAsanStackFreeName = "__asan_stack_free";
-static const char *kAsanGenPrefix = "__asan_gen_";
-static const char *kAsanPoisonStackMemoryName = "__asan_poison_stack_memory";
-static const char *kAsanUnpoisonStackMemoryName =
+static const char *const kAsanModuleCtorName = "asan.module_ctor";
+static const char *const kAsanModuleDtorName = "asan.module_dtor";
+static const int         kAsanCtorAndCtorPriority = 1;
+static const char *const kAsanReportErrorTemplate = "__asan_report_";
+static const char *const kAsanReportLoadN = "__asan_report_load_n";
+static const char *const kAsanReportStoreN = "__asan_report_store_n";
+static const char *const kAsanRegisterGlobalsName = "__asan_register_globals";
+static const char *const kAsanUnregisterGlobalsName =
+    "__asan_unregister_globals";
+static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
+static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
+static const char *const kAsanInitName = "__asan_init_v3";
+static const char *const kAsanCovName = "__sanitizer_cov";
+static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
+static const char *const kAsanMappingOffsetName = "__asan_mapping_offset";
+static const char *const kAsanMappingScaleName = "__asan_mapping_scale";
+static const int         kMaxAsanStackMallocSizeClass = 10;
+static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
+static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
+static const char *const kAsanGenPrefix = "__asan_gen_";
+static const char *const kAsanPoisonStackMemoryName =
+    "__asan_poison_stack_memory";
+static const char *const kAsanUnpoisonStackMemoryName =
     "__asan_unpoison_stack_memory";
 
+static const char *const kAsanOptionDetectUAR =
+    "__asan_option_detect_stack_use_after_return";
+
+// These constants must match the definitions in the run-time library.
 static const int kAsanStackLeftRedzoneMagic = 0xf1;
 static const int kAsanStackMidRedzoneMagic = 0xf2;
 static const int kAsanStackRightRedzoneMagic = 0xf3;
 static const int kAsanStackPartialRedzoneMagic = 0xf4;
+#ifndef NDEBUG
+static const int kAsanStackAfterReturnMagic = 0xf5;
+#endif
 
 // Accesses sizes are powers of two: 1, 2, 4, 8, 16.
 static const size_t kNumberOfAccessSizes = 5;
@@ -120,6 +135,8 @@ static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
 // This flag may need to be replaced with -f[no]asan-globals.
 static cl::opt<bool> ClGlobals("asan-globals",
        cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClCoverage("asan-coverage",
+       cl::desc("ASan coverage"), cl::Hidden, cl::init(false));
 static cl::opt<bool> ClInitializers("asan-initialization-order",
        cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
 static cl::opt<bool> ClMemIntrin("asan-memintrin",
@@ -130,6 +147,19 @@ static cl::opt<std::string> ClBlacklistFile("asan-blacklist",
        cl::desc("File containing the list of objects to ignore "
                 "during instrumentation"), cl::Hidden);
 
+// This is an experimental feature that will allow to choose between
+// instrumented and non-instrumented code at link-time.
+// If this option is on, just before instrumenting a function we create its
+// clone; if the function is not changed by asan the clone is deleted.
+// If we end up with a clone, we put the instrumented function into a section
+// called "ASAN" and the uninstrumented function into a section called "NOASAN".
+//
+// This is still a prototype, we need to figure out a way to keep two copies of
+// a function so that the linker can easily choose one of them.
+static cl::opt<bool> ClKeepUninstrumented("asan-keep-uninstrumented-functions",
+       cl::desc("Keep uninstrumented copies of functions"),
+       cl::Hidden, cl::init(false));
+
 // These flags allow to change the shadow mapping.
 // The shadow mapping looks like
 //    Shadow = (Mem >> scale) + (1 << offset_log)
@@ -167,6 +197,13 @@ static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
 static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
                                cl::Hidden, cl::init(-1));
 
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOptimizedAccessesToGlobalArray,
+          "Number of optimized accesses to global arrays");
+STATISTIC(NumOptimizedAccessesToGlobalVar,
+          "Number of optimized accesses to global vars");
+
 namespace {
 /// A set of dynamically initialized globals extracted from metadata.
 class SetOfDynamicallyInitializedGlobals {
@@ -206,8 +243,11 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize,
   llvm::Triple TargetTriple(M.getTargetTriple());
   bool IsAndroid = TargetTriple.getEnvironment() == llvm::Triple::Android;
   bool IsMacOSX = TargetTriple.getOS() == llvm::Triple::MacOSX;
-  bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64;
+  bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
+                 TargetTriple.getArch() == llvm::Triple::ppc64le;
   bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
+  bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips ||
+                  TargetTriple.getArch() == llvm::Triple::mipsel;
 
   ShadowMapping Mapping;
 
@@ -217,7 +257,8 @@ static ShadowMapping getShadowMapping(const Module &M, int LongSize,
   Mapping.OrShadowOffset = !IsPPC64 && !ClShort64BitOffset;
 
   Mapping.Offset = (IsAndroid || ZeroBaseShadow) ? 0 :
-      (LongSize == 32 ? kDefaultShadowOffset32 :
+      (LongSize == 32 ?
+       (IsMIPS32 ? kMIPS32_ShadowOffset32 : kDefaultShadowOffset32) :
        IsPPC64 ? kPPC64_ShadowOffset64 : kDefaultShadowOffset64);
   if (!ZeroBaseShadow && ClShort64BitOffset && IsX86_64 && !IsMacOSX) {
     assert(LongSize == 64);
@@ -285,6 +326,8 @@ struct AddressSanitizer : public FunctionPass {
   bool ShouldInstrumentGlobal(GlobalVariable *G);
   bool LooksLikeCodeInBug11395(Instruction *I);
   void FindDynamicInitializers(Module &M);
+  bool GlobalIsLinkerInitialized(GlobalVariable *G);
+  bool InjectCoverage(Function &F);
 
   bool CheckInitOrder;
   bool CheckUseAfterReturn;
@@ -300,7 +343,8 @@ struct AddressSanitizer : public FunctionPass {
   Function *AsanCtorFunction;
   Function *AsanInitFunction;
   Function *AsanHandleNoReturnFunc;
-  OwningPtr<BlackList> BL;
+  Function *AsanCovFunction;
+  OwningPtr<SpecialCaseList> BL;
   // This array is indexed by AccessIsWrite and log2(AccessSize).
   Function *AsanErrorCallback[2][kNumberOfAccessSizes];
   // This array is indexed by AccessIsWrite.
@@ -340,7 +384,7 @@ class AddressSanitizerModule : public ModulePass {
   SmallString<64> BlacklistFile;
   bool ZeroBaseShadow;
 
-  OwningPtr<BlackList> BL;
+  OwningPtr<SpecialCaseList> BL;
   SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
   Type *IntptrTy;
   LLVMContext *C;
@@ -375,12 +419,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
   uint64_t TotalStackSize;
   unsigned StackAlignment;
 
-  Function *AsanStackMallocFunc, *AsanStackFreeFunc;
+  Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
+           *AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
   Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
 
   // Stores a place and arguments of poisoning/unpoisoning call for alloca.
   struct AllocaPoisonCall {
     IntrinsicInst *InsBefore;
+    AllocaInst *AI;
     uint64_t Size;
     bool DoPoison;
   };
@@ -433,7 +479,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
 
     StackAlignment = std::max(StackAlignment, AI.getAlignment());
     AllocaVec.push_back(&AI);
-    uint64_t AlignedSize =  getAlignedAllocaSize(&AI);
+    uint64_t AlignedSize = getAlignedAllocaSize(&AI);
     TotalStackSize += AlignedSize;
   }
 
@@ -459,7 +505,7 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
     if (!AI) return;
     bool DoPoison = (ID == Intrinsic::lifetime_end);
-    AllocaPoisonCall APC = {&II, SizeValue, DoPoison};
+    AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
     AllocaPoisonCallVec.push_back(APC);
   }
 
@@ -467,33 +513,37 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
   void initializeCallbacks(Module &M);
 
   // Check if we want (and can) handle this alloca.
-  bool isInterestingAlloca(AllocaInst &AI) {
+  bool isInterestingAlloca(AllocaInst &AI) const {
     return (!AI.isArrayAllocation() &&
             AI.isStaticAlloca() &&
+            AI.getAlignment() <= RedzoneSize() &&
             AI.getAllocatedType()->isSized());
   }
 
   size_t RedzoneSize() const {
     return RedzoneSizeForScale(Mapping.Scale);
   }
-  uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
+  uint64_t getAllocaSizeInBytes(AllocaInst *AI) const {
     Type *Ty = AI->getAllocatedType();
     uint64_t SizeInBytes = ASan.TD->getTypeAllocSize(Ty);
     return SizeInBytes;
   }
-  uint64_t getAlignedSize(uint64_t SizeInBytes) {
+  uint64_t getAlignedSize(uint64_t SizeInBytes) const {
     size_t RZ = RedzoneSize();
     return ((SizeInBytes + RZ - 1) / RZ) * RZ;
   }
-  uint64_t getAlignedAllocaSize(AllocaInst *AI) {
+  uint64_t getAlignedAllocaSize(AllocaInst *AI) const {
     uint64_t SizeInBytes = getAllocaSizeInBytes(AI);
     return getAlignedSize(SizeInBytes);
   }
   /// Finds alloca where the value comes from.
   AllocaInst *findAllocaForValue(Value *V);
-  void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
+  void poisonRedZones(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB,
                       Value *ShadowBase, bool DoPoison);
-  void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison);
+  void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
+
+  void SetShadowToStackAfterReturnInlined(IRBuilder<> &IRB, Value *ShadowBase,
+                                          int Size);
 };
 
 }  // namespace
@@ -520,16 +570,16 @@ ModulePass *llvm::createAddressSanitizerModulePass(
 }
 
 static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
-  size_t Res = CountTrailingZeros_32(TypeSize / 8);
+  size_t Res = countTrailingZeros(TypeSize / 8);
   assert(Res < kNumberOfAccessSizes);
   return Res;
 }
 
-// Create a constant for Str so that we can pass it to the run-time lib.
+// \brief Create a constant for Str so that we can pass it to the run-time lib.
 static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) {
   Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
   GlobalVariable *GV = new GlobalVariable(M, StrConst->getType(), true,
-                            GlobalValue::PrivateLinkage, StrConst,
+                            GlobalValue::InternalLinkage, StrConst,
                             kAsanGenPrefix);
   GV->setUnnamedAddr(true);  // Ok to merge these.
   GV->setAlignment(1);  // Strings may not be merged w/o setting align 1.
@@ -620,6 +670,13 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
   return NULL;
 }
 
+bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
+  // If a global variable does not have dynamic initialization we don't
+  // have to instrument it.  However, if a global does not have initializer
+  // at all, we assume it has dynamic initializer (in other TU).
+  return G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G);
+}
+
 void AddressSanitizer::instrumentMop(Instruction *I) {
   bool IsWrite = false;
   Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
@@ -628,13 +685,19 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
     if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
       // If initialization order checking is disabled, a simple access to a
       // dynamically initialized global is always valid.
-      if (!CheckInitOrder)
-        return;
-      // If a global variable does not have dynamic initialization we don't
-      // have to instrument it.  However, if a global does not have initailizer
-      // at all, we assume it has dynamic initializer (in other TU).
-      if (G->hasInitializer() && !DynamicallyInitializedGlobals.Contains(G))
+      if (!CheckInitOrder || GlobalIsLinkerInitialized(G)) {
+        NumOptimizedAccessesToGlobalVar++;
         return;
+      }
+    }
+    ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr);
+    if (CE && CE->isGEPWithNoNotionalOverIndexing()) {
+      if (GlobalVariable *G = dyn_cast<GlobalVariable>(CE->getOperand(0))) {
+        if (CE->getOperand(1)->isNullValue() && GlobalIsLinkerInitialized(G)) {
+          NumOptimizedAccessesToGlobalArray++;
+          return;
+        }
+      }
     }
   }
 
@@ -646,6 +709,11 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
 
   assert((TypeSize % 8) == 0);
 
+  if (IsWrite)
+    NumInstrumentedWrites++;
+  else
+    NumInstrumentedReads++;
+
   // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check.
   if (TypeSize == 8  || TypeSize == 16 ||
       TypeSize == 32 || TypeSize == 64 || TypeSize == 128)
@@ -861,7 +929,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   TD = getAnalysisIfAvailable<DataLayout>();
   if (!TD)
     return false;
-  BL.reset(new BlackList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
   if (BL->isIn(M)) return false;
   C = &(M.getContext());
   int LongSize = TD->getPointerSizeInBits();
@@ -892,8 +960,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
                                                IntptrTy, IntptrTy,
                                                IntptrTy, IntptrTy, NULL);
-  SmallVector<Constant *, 16> Initializers(n), DynamicInit;
-
+  SmallVector<Constant *, 16> Initializers(n);
 
   Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
   assert(CtorFunc);
@@ -929,7 +996,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
     bool GlobalHasDynamicInitializer =
         DynamicallyInitializedGlobals.Contains(G);
     // Don't check initialization order if this global is blacklisted.
-    GlobalHasDynamicInitializer &= !BL->isInInit(*G);
+    GlobalHasDynamicInitializer &= !BL->isIn(*G, "init");
 
     StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL);
     Constant *NewInitializer = ConstantStruct::get(
@@ -939,8 +1006,11 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
     GlobalVariable *Name = createPrivateGlobalForString(M, G->getName());
 
     // Create a new global variable with enough space for a redzone.
+    GlobalValue::LinkageTypes Linkage = G->getLinkage();
+    if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
+      Linkage = GlobalValue::InternalLinkage;
     GlobalVariable *NewGlobal = new GlobalVariable(
-        M, NewTy, G->isConstant(), G->getLinkage(),
+        M, NewTy, G->isConstant(), Linkage,
         NewInitializer, "", G, G->getThreadLocalMode());
     NewGlobal->copyAttributesFrom(G);
     NewGlobal->setAlignment(MinRZ);
@@ -973,7 +1043,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
 
   ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n);
   GlobalVariable *AllGlobals = new GlobalVariable(
-      M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage,
+      M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
       ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
 
   // Create calls for poisoning before initializers run and unpoisoning after.
@@ -1021,6 +1091,8 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
 
   AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
+  AsanCovFunction = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanCovName, IRB.getVoidTy(), IntptrTy, NULL));
   // We insert an empty inline asm after __asan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
@@ -1051,7 +1123,7 @@ bool AddressSanitizer::doInitialization(Module &M) {
 
   if (!TD)
     return false;
-  BL.reset(new BlackList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
   DynamicallyInitializedGlobals.Init(M);
 
   C = &(M.getContext());
@@ -1092,6 +1164,47 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
   return false;
 }
 
+// Poor man's coverage that works with ASan.
+// We create a Guard boolean variable with the same linkage
+// as the function and inject this code into the entry block:
+// if (*Guard) {
+//    __sanitizer_cov(&F);
+//    *Guard = 1;
+// }
+// The accesses to Guard are atomic. The rest of the logic is
+// in __sanitizer_cov (it's fine to call it more than once).
+//
+// This coverage implementation provides very limited data:
+// it only tells if a given function was ever executed.
+// No counters, no per-basic-block or per-edge data.
+// But for many use cases this is what we need and the added slowdown
+// is negligible. This simple implementation will probably be obsoleted
+// by the upcoming Clang-based coverage implementation.
+// By having it here and now we hope to
+//  a) get the functionality to users earlier and
+//  b) collect usage statistics to help improve Clang coverage design.
+bool AddressSanitizer::InjectCoverage(Function &F) {
+  if (!ClCoverage) return false;
+  IRBuilder<> IRB(F.getEntryBlock().getFirstInsertionPt());
+  Type *Int8Ty = IRB.getInt8Ty();
+  GlobalVariable *Guard = new GlobalVariable(
+      *F.getParent(), Int8Ty, false, GlobalValue::PrivateLinkage,
+      Constant::getNullValue(Int8Ty), "__asan_gen_cov_" + F.getName());
+  LoadInst *Load = IRB.CreateLoad(Guard);
+  Load->setAtomic(Monotonic);
+  Load->setAlignment(1);
+  Value *Cmp = IRB.CreateICmpEQ(Constant::getNullValue(Int8Ty), Load);
+  Instruction *Ins = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+  IRB.SetInsertPoint(Ins);
+  // We pass &F to __sanitizer_cov. We could avoid this and rely on
+  // GET_CALLER_PC, but having the PC of the first instruction is just nice.
+  IRB.CreateCall(AsanCovFunction, IRB.CreatePointerCast(&F, IntptrTy));
+  StoreInst *Store = IRB.CreateStore(ConstantInt::get(Int8Ty, 1), Guard);
+  Store->setAtomic(Monotonic);
+  Store->setAlignment(1);
+  return true;
+}
+
 bool AddressSanitizer::runOnFunction(Function &F) {
   if (BL->isIn(F)) return false;
   if (&F == AsanCtorFunction) return false;
@@ -1102,8 +1215,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
   // If needed, insert __asan_init before checking for SanitizeAddress attr.
   maybeInsertAsanInitAtFunctionEntry(F);
 
-  if (!F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::SanitizeAddress))
+  if (!F.hasFnAttribute(Attribute::SanitizeAddress))
     return false;
 
   if (!ClDebugFunc.empty() && ClDebugFunc != F.getName())
@@ -1114,6 +1226,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
   SmallSet<Value*, 16> TempsToInstrument;
   SmallVector<Instruction*, 16> ToInstrument;
   SmallVector<Instruction*, 8> NoReturnCalls;
+  int NumAllocas = 0;
   bool IsWrite;
 
   // Fill the set of memory operations to instrument.
@@ -1132,6 +1245,8 @@ bool AddressSanitizer::runOnFunction(Function &F) {
       } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) {
         // ok, take it.
       } else {
+        if (isa<AllocaInst>(BI))
+          NumAllocas++;
         CallSite CS(BI);
         if (CS) {
           // A call inside BB.
@@ -1148,6 +1263,17 @@ bool AddressSanitizer::runOnFunction(Function &F) {
     }
   }
 
+  Function *UninstrumentedDuplicate = 0;
+  bool LikelyToInstrument =
+      !NoReturnCalls.empty() || !ToInstrument.empty() || (NumAllocas > 0);
+  if (ClKeepUninstrumented && LikelyToInstrument) {
+    ValueToValueMapTy VMap;
+    UninstrumentedDuplicate = CloneFunction(&F, VMap, false);
+    UninstrumentedDuplicate->removeFnAttr(Attribute::SanitizeAddress);
+    UninstrumentedDuplicate->setName("NOASAN_" + F.getName());
+    F.getParent()->getFunctionList().push_back(UninstrumentedDuplicate);
+  }
+
   // Instrument.
   int NumInstrumented = 0;
   for (size_t i = 0, n = ToInstrument.size(); i != n; i++) {
@@ -1172,9 +1298,29 @@ bool AddressSanitizer::runOnFunction(Function &F) {
     IRBuilder<> IRB(CI);
     IRB.CreateCall(AsanHandleNoReturnFunc);
   }
-  DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n");
 
-  return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
+  bool res = NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
+
+  if (InjectCoverage(F))
+    res = true;
+
+  DEBUG(dbgs() << "ASAN done instrumenting: " << res << " " << F << "\n");
+
+  if (ClKeepUninstrumented) {
+    if (!res) {
+      // No instrumentation is done, no need for the duplicate.
+      if (UninstrumentedDuplicate)
+        UninstrumentedDuplicate->eraseFromParent();
+    } else {
+      // The function was instrumented. We must have the duplicate.
+      assert(UninstrumentedDuplicate);
+      UninstrumentedDuplicate->setSection("NOASAN");
+      assert(!F.hasSection());
+      F.setSection("ASAN");
+    }
+  }
+
+  return res;
 }
 
 static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) {
@@ -1217,11 +1363,15 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
 
 void FunctionStackPoisoner::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(*C);
-  AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL));
-  AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction(
-      kAsanStackFreeName, IRB.getVoidTy(),
-      IntptrTy, IntptrTy, IntptrTy, NULL));
+  for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+    std::string Suffix = itostr(i);
+    AsanStackMallocFunc[i] = checkInterfaceFunction(
+        M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
+                              IntptrTy, IntptrTy, NULL));
+    AsanStackFreeFunc[i] = checkInterfaceFunction(M.getOrInsertFunction(
+        kAsanStackFreeNameTemplate + Suffix, IRB.getVoidTy(), IntptrTy,
+        IntptrTy, IntptrTy, NULL));
+  }
   AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
   AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
@@ -1229,7 +1379,7 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
 }
 
 void FunctionStackPoisoner::poisonRedZones(
-  const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase,
+  const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> &IRB, Value *ShadowBase,
   bool DoPoison) {
   size_t ShadowRZSize = RedzoneSize() >> Mapping.Scale;
   assert(ShadowRZSize >= 1 && ShadowRZSize <= 4);
@@ -1270,6 +1420,10 @@ void FunctionStackPoisoner::poisonRedZones(
                                         RedzoneSize(),
                                         1ULL << Mapping.Scale,
                                         kAsanStackPartialRedzoneMagic);
+        Poison =
+            ASan.TD->isLittleEndian()
+                ? support::endian::byte_swap<uint32_t, support::little>(Poison)
+                : support::endian::byte_swap<uint32_t, support::big>(Poison);
       }
       Value *PartialPoison = ConstantInt::get(RZTy, Poison);
       IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy));
@@ -1286,12 +1440,40 @@ void FunctionStackPoisoner::poisonRedZones(
   }
 }
 
+// Fake stack allocator (asan_fake_stack.h) has 11 size classes
+// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
+static int StackMallocSizeClass(uint64_t LocalStackSize) {
+  assert(LocalStackSize <= kMaxStackMallocSize);
+  uint64_t MaxSize = kMinStackMallocSize;
+  for (int i = 0; ; i++, MaxSize *= 2)
+    if (LocalStackSize <= MaxSize)
+      return i;
+  llvm_unreachable("impossible LocalStackSize");
+}
+
+// Set Size bytes starting from ShadowBase to kAsanStackAfterReturnMagic.
+// We can not use MemSet intrinsic because it may end up calling the actual
+// memset. Size is a multiple of 8.
+// Currently this generates 8-byte stores on x86_64; it may be better to
+// generate wider stores.
+void FunctionStackPoisoner::SetShadowToStackAfterReturnInlined(
+    IRBuilder<> &IRB, Value *ShadowBase, int Size) {
+  assert(!(Size % 8));
+  assert(kAsanStackAfterReturnMagic == 0xf5);
+  for (int i = 0; i < Size; i += 8) {
+    Value *p = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+    IRB.CreateStore(ConstantInt::get(IRB.getInt64Ty(), 0xf5f5f5f5f5f5f5f5ULL),
+                    IRB.CreateIntToPtr(p, IRB.getInt64Ty()->getPointerTo()));
+  }
+}
+
 void FunctionStackPoisoner::poisonStack() {
   uint64_t LocalStackSize = TotalStackSize +
                             (AllocaVec.size() + 1) * RedzoneSize();
 
   bool DoStackMalloc = ASan.CheckUseAfterReturn
       && LocalStackSize <= kMaxStackMallocSize;
+  int StackMallocIdx = -1;
 
   assert(AllocaVec.size() > 0);
   Instruction *InsBefore = AllocaVec[0];
@@ -1309,8 +1491,28 @@ void FunctionStackPoisoner::poisonStack() {
   Value *LocalStackBase = OrigStackBase;
 
   if (DoStackMalloc) {
-    LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc,
+    // LocalStackBase = OrigStackBase
+    // if (__asan_option_detect_stack_use_after_return)
+    //   LocalStackBase = __asan_stack_malloc_N(LocalStackBase, OrigStackBase);
+    StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+    assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+    Constant *OptionDetectUAR = F.getParent()->getOrInsertGlobal(
+        kAsanOptionDetectUAR, IRB.getInt32Ty());
+    Value *Cmp = IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUAR),
+                                  Constant::getNullValue(IRB.getInt32Ty()));
+    Instruction *Term =
+        SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+    BasicBlock *CmpBlock = cast<Instruction>(Cmp)->getParent();
+    IRBuilder<> IRBIf(Term);
+    LocalStackBase = IRBIf.CreateCall2(
+        AsanStackMallocFunc[StackMallocIdx],
         ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
+    BasicBlock *SetBlock = cast<Instruction>(LocalStackBase)->getParent();
+    IRB.SetInsertPoint(InsBefore);
+    PHINode *Phi = IRB.CreatePHI(IntptrTy, 2);
+    Phi->addIncoming(OrigStackBase, CmpBlock);
+    Phi->addIncoming(LocalStackBase, SetBlock);
+    LocalStackBase = Phi;
   }
 
   // This string will be parsed by the run-time (DescribeAddressIfStack).
@@ -1322,11 +1524,10 @@ void FunctionStackPoisoner::poisonStack() {
   bool HavePoisonedAllocas = false;
   for (size_t i = 0, n = AllocaPoisonCallVec.size(); i < n; i++) {
     const AllocaPoisonCall &APC = AllocaPoisonCallVec[i];
-    IntrinsicInst *II = APC.InsBefore;
-    AllocaInst *AI = findAllocaForValue(II->getArgOperand(1));
-    assert(AI);
-    IRBuilder<> IRB(II);
-    poisonAlloca(AI, APC.Size, IRB, APC.DoPoison);
+    assert(APC.InsBefore);
+    assert(APC.AI);
+    IRBuilder<> IRB(APC.InsBefore);
+    poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
     HavePoisonedAllocas |= APC.DoPoison;
   }
 
@@ -1384,10 +1585,35 @@ void FunctionStackPoisoner::poisonStack() {
     // Unpoison the stack.
     poisonRedZones(AllocaVec, IRBRet, ShadowBase, false);
     if (DoStackMalloc) {
+      assert(StackMallocIdx >= 0);
       // In use-after-return mode, mark the whole stack frame unaddressable.
-      IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase,
-                         ConstantInt::get(IntptrTy, LocalStackSize),
-                         OrigStackBase);
+      if (StackMallocIdx <= 4) {
+        // For small sizes inline the whole thing:
+        // if LocalStackBase != OrigStackBase:
+        //     memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
+        //     **SavedFlagPtr(LocalStackBase) = 0
+        // FIXME: if LocalStackBase != OrigStackBase don't call poisonRedZones.
+        Value *Cmp = IRBRet.CreateICmpNE(LocalStackBase, OrigStackBase);
+        TerminatorInst *PoisonTerm =
+            SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
+        IRBuilder<> IRBPoison(PoisonTerm);
+        int ClassSize = kMinStackMallocSize << StackMallocIdx;
+        SetShadowToStackAfterReturnInlined(IRBPoison, ShadowBase,
+                                           ClassSize >> Mapping.Scale);
+        Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
+            LocalStackBase,
+            ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
+        Value *SavedFlagPtr = IRBPoison.CreateLoad(
+            IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
+        IRBPoison.CreateStore(
+            Constant::getNullValue(IRBPoison.getInt8Ty()),
+            IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
+      } else {
+        // For larger frames call __asan_stack_free_*.
+        IRBRet.CreateCall3(AsanStackFreeFunc[StackMallocIdx], LocalStackBase,
+                           ConstantInt::get(IntptrTy, LocalStackSize),
+                           OrigStackBase);
+      }
     } else if (HavePoisonedAllocas) {
       // If we poisoned some allocas in llvm.lifetime analysis,
       // unpoison whole stack frame now.
@@ -1402,7 +1628,7 @@ void FunctionStackPoisoner::poisonStack() {
 }
 
 void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
-                                         IRBuilder<> IRB, bool DoPoison) {
+                                         IRBuilder<> &IRB, bool DoPoison) {
   // For now just insert the call to ASan runtime.
   Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
   Value *SizeArg = ConstantInt::get(IntptrTy, Size);
diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp
deleted file mode 100644
index 39de4b0..0000000
--- a/lib/Transforms/Instrumentation/BlackList.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-//===-- BlackList.cpp - blacklist for sanitizers --------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a utility class for instrumentation passes (like AddressSanitizer
-// or ThreadSanitizer) to avoid instrumenting some functions or global
-// variables based on a user-supplied blacklist.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/BlackList.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
-#include <string>
-#include <utility>
-
-namespace llvm {
-
-BlackList::BlackList(const StringRef Path) {
-  // Validate and open blacklist file.
-  if (Path.empty()) return;
-  OwningPtr<MemoryBuffer> File;
-  if (error_code EC = MemoryBuffer::getFile(Path, File)) {
-    report_fatal_error("Can't open blacklist file: " + Path + ": " +
-                       EC.message());
-  }
-
-  // Iterate through each line in the blacklist file.
-  SmallVector<StringRef, 16> Lines;
-  SplitString(File.take()->getBuffer(), Lines, "\n\r");
-  StringMap<std::string> Regexps;
-  for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end();
-       I != E; ++I) {
-    // Ignore empty lines and lines starting with "#"
-    if (I->empty() || I->startswith("#"))
-      continue;
-    // Get our prefix and unparsed regexp.
-    std::pair<StringRef, StringRef> SplitLine = I->split(":");
-    StringRef Prefix = SplitLine.first;
-    std::string Regexp = SplitLine.second;
-    if (Regexp.empty()) {
-      // Missing ':' in the line.
-      report_fatal_error("malformed blacklist line: " + SplitLine.first);
-    }
-
-    // Replace * with .*
-    for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos;
-         pos += strlen(".*")) {
-      Regexp.replace(pos, strlen("*"), ".*");
-    }
-
-    // Check that the regexp is valid.
-    Regex CheckRE(Regexp);
-    std::string Error;
-    if (!CheckRE.isValid(Error)) {
-      report_fatal_error("malformed blacklist regex: " + SplitLine.second +
-          ": " + Error);
-    }
-
-    // Add this regexp into the proper group by its prefix.
-    if (!Regexps[Prefix].empty())
-      Regexps[Prefix] += "|";
-    Regexps[Prefix] += Regexp;
-  }
-
-  // Iterate through each of the prefixes, and create Regexs for them.
-  for (StringMap<std::string>::const_iterator I = Regexps.begin(),
-       E = Regexps.end(); I != E; ++I) {
-    Entries[I->getKey()] = new Regex(I->getValue());
-  }
-}
-
-bool BlackList::isIn(const Function &F) const {
-  return isIn(*F.getParent()) || inSection("fun", F.getName());
-}
-
-bool BlackList::isIn(const GlobalVariable &G) const {
-  return isIn(*G.getParent()) || inSection("global", G.getName());
-}
-
-bool BlackList::isIn(const Module &M) const {
-  return inSection("src", M.getModuleIdentifier());
-}
-
-static StringRef GetGVTypeString(const GlobalVariable &G) {
-  // Types of GlobalVariables are always pointer types.
-  Type *GType = G.getType()->getElementType();
-  // For now we support blacklisting struct types only.
-  if (StructType *SGType = dyn_cast<StructType>(GType)) {
-    if (!SGType->isLiteral())
-      return SGType->getName();
-  }
-  return "<unknown type>";
-}
-
-bool BlackList::isInInit(const GlobalVariable &G) const {
-  return (isIn(*G.getParent()) ||
-          inSection("global-init", G.getName()) ||
-          inSection("global-init-type", GetGVTypeString(G)) ||
-          inSection("global-init-src", G.getParent()->getModuleIdentifier()));
-}
-
-bool BlackList::inSection(const StringRef Section,
-                          const StringRef Query) const {
-  StringMap<Regex*>::const_iterator I = Entries.find(Section);
-  if (I == Entries.end()) return false;
-
-  Regex *FunctionRegex = I->getValue();
-  return FunctionRegex->match(Query);
-}
-
-}  // namespace llvm
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index b094d42..7a9f0f6 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -80,7 +80,7 @@ BasicBlock *BoundsChecking::getTrapBB() {
     return TrapBB;
 
   Function *Fn = Inst->getParent()->getParent();
-  BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint();
+  IRBuilder<>::InsertPointGuard Guard(*Builder);
   TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
   Builder->SetInsertPoint(TrapBB);
 
@@ -91,7 +91,6 @@ BasicBlock *BoundsChecking::getTrapBB() {
   TrapCall->setDebugLoc(Inst->getDebugLoc());
   Builder->CreateUnreachable();
 
-  Builder->SetInsertPoint(PrevInsertPoint);
   return TrapBB;
 }
 
@@ -173,7 +172,8 @@ bool BoundsChecking::runOnFunction(Function &F) {
   TrapBB = 0;
   BuilderTy TheBuilder(F.getContext(), TargetFolder(TD));
   Builder = &TheBuilder;
-  ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext());
+  ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext(),
+                                           /*RoundToAlign=*/true);
   ObjSizeEval = &TheObjSizeEval;
 
   // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 1c9e053..3563593 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,14 +1,11 @@
 add_llvm_library(LLVMInstrumentation
   AddressSanitizer.cpp
-  BlackList.cpp
   BoundsChecking.cpp
-  EdgeProfiling.cpp
+  DataFlowSanitizer.cpp
+  DebugIR.cpp
   GCOVProfiling.cpp
   MemorySanitizer.cpp
   Instrumentation.cpp
-  OptimalEdgeProfiling.cpp
-  PathProfiling.cpp
-  ProfilingUtils.cpp
   ThreadSanitizer.cpp
   )
 
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
new file mode 100644
index 0000000..9b9e725
--- /dev/null
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -0,0 +1,1397 @@
+//===-- DataFlowSanitizer.cpp - dynamic data flow analysis ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
+/// analysis.
+///
+/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
+/// class of bugs on its own.  Instead, it provides a generic dynamic data flow
+/// analysis framework to be used by clients to help detect application-specific
+/// issues within their own code.
+///
+/// The analysis is based on automatic propagation of data flow labels (also
+/// known as taint labels) through a program as it performs computation.  Each
+/// byte of application memory is backed by two bytes of shadow memory which
+/// hold the label.  On Linux/x86_64, memory is laid out as follows:
+///
+/// +--------------------+ 0x800000000000 (top of memory)
+/// | application memory |
+/// +--------------------+ 0x700000008000 (kAppAddr)
+/// |                    |
+/// |       unused       |
+/// |                    |
+/// +--------------------+ 0x200200000000 (kUnusedAddr)
+/// |    union table     |
+/// +--------------------+ 0x200000000000 (kUnionTableAddr)
+/// |   shadow memory    |
+/// +--------------------+ 0x000000010000 (kShadowAddr)
+/// | reserved by kernel |
+/// +--------------------+ 0x000000000000
+///
+/// To derive a shadow memory address from an application memory address,
+/// bits 44-46 are cleared to bring the address into the range
+/// [0x000000008000,0x100000000000).  Then the address is shifted left by 1 to
+/// account for the double byte representation of shadow labels and move the
+/// address into the shadow memory range.  See the function
+/// DataFlowSanitizer::getShadowAddress below.
+///
+/// For more information, please refer to the design document:
+/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SpecialCaseList.h"
+#include <iterator>
+
+using namespace llvm;
+
+// The -dfsan-preserve-alignment flag controls whether this pass assumes that
+// alignment requirements provided by the input IR are correct.  For example,
+// if the input IR contains a load with alignment 8, this flag will cause
+// the shadow load to have alignment 16.  This flag is disabled by default as
+// we have unfortunately encountered too much code (including Clang itself;
+// see PR14291) which performs misaligned access.
+static cl::opt<bool> ClPreserveAlignment(
+    "dfsan-preserve-alignment",
+    cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
+    cl::init(false));
+
+// The ABI list file controls how shadow parameters are passed.  The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI.  Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function is
+// unknown.  The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::opt<std::string> ClABIListFile(
+    "dfsan-abilist",
+    cl::desc("File listing native ABI functions and how the pass treats them"),
+    cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+    "dfsan-args-abi",
+    cl::desc("Use the argument ABI rather than the TLS ABI"),
+    cl::Hidden);
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+    "dfsan-debug-nonzero-labels",
+    cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+             "load or return with a nonzero label"),
+    cl::Hidden);
+
+namespace {
+
+class DataFlowSanitizer : public ModulePass {
+  friend struct DFSanFunction;
+  friend class DFSanVisitor;
+
+  enum {
+    ShadowWidth = 16
+  };
+
+  /// Which ABI should be used for instrumented functions?
+  enum InstrumentedABI {
+    /// Argument and return value labels are passed through additional
+    /// arguments and by modifying the return type.
+    IA_Args,
+
+    /// Argument and return value labels are passed through TLS variables
+    /// __dfsan_arg_tls and __dfsan_retval_tls.
+    IA_TLS
+  };
+
+  /// How should calls to uninstrumented functions be handled?
+  enum WrapperKind {
+    /// This function is present in an uninstrumented form but we don't know
+    /// how it should be handled.  Print a warning and call the function anyway.
+    /// Don't label the return value.
+    WK_Warning,
+
+    /// This function does not write to (user-accessible) memory, and its return
+    /// value is unlabelled.
+    WK_Discard,
+
+    /// This function does not write to (user-accessible) memory, and the label
+    /// of its return value is the union of the label of its arguments.
+    WK_Functional,
+
+    /// Instead of calling the function, a custom wrapper __dfsw_F is called,
+    /// where F is the name of the function.  This function may wrap the
+    /// original function or provide its own implementation.  This is similar to
+    /// the IA_Args ABI, except that IA_Args uses a struct return type to
+    /// pass the return value shadow in a register, while WK_Custom uses an
+    /// extra pointer argument to return the shadow.  This allows the wrapped
+    /// form of the function type to be expressed in C.
+    WK_Custom
+  };
+
+  DataLayout *DL;
+  Module *Mod;
+  LLVMContext *Ctx;
+  IntegerType *ShadowTy;
+  PointerType *ShadowPtrTy;
+  IntegerType *IntptrTy;
+  ConstantInt *ZeroShadow;
+  ConstantInt *ShadowPtrMask;
+  ConstantInt *ShadowPtrMul;
+  Constant *ArgTLS;
+  Constant *RetvalTLS;
+  void *(*GetArgTLSPtr)();
+  void *(*GetRetvalTLSPtr)();
+  Constant *GetArgTLS;
+  Constant *GetRetvalTLS;
+  FunctionType *DFSanUnionFnTy;
+  FunctionType *DFSanUnionLoadFnTy;
+  FunctionType *DFSanUnimplementedFnTy;
+  FunctionType *DFSanSetLabelFnTy;
+  FunctionType *DFSanNonzeroLabelFnTy;
+  Constant *DFSanUnionFn;
+  Constant *DFSanUnionLoadFn;
+  Constant *DFSanUnimplementedFn;
+  Constant *DFSanSetLabelFn;
+  Constant *DFSanNonzeroLabelFn;
+  MDNode *ColdCallWeights;
+  OwningPtr<SpecialCaseList> ABIList;
+  DenseMap<Value *, Function *> UnwrappedFnMap;
+  AttributeSet ReadOnlyNoneAttrs;
+
+  Value *getShadowAddress(Value *Addr, Instruction *Pos);
+  Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
+  bool isInstrumented(const Function *F);
+  bool isInstrumented(const GlobalAlias *GA);
+  FunctionType *getArgsFunctionType(FunctionType *T);
+  FunctionType *getTrampolineFunctionType(FunctionType *T);
+  FunctionType *getCustomFunctionType(FunctionType *T);
+  InstrumentedABI getInstrumentedABI();
+  WrapperKind getWrapperKind(Function *F);
+  void addGlobalNamePrefix(GlobalValue *GV);
+  Function *buildWrapperFunction(Function *F, StringRef NewFName,
+                                 GlobalValue::LinkageTypes NewFLink,
+                                 FunctionType *NewFT);
+  Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
+
+ public:
+  DataFlowSanitizer(StringRef ABIListFile = StringRef(),
+                    void *(*getArgTLS)() = 0, void *(*getRetValTLS)() = 0);
+  static char ID;
+  bool doInitialization(Module &M);
+  bool runOnModule(Module &M);
+};
+
+struct DFSanFunction {
+  DataFlowSanitizer &DFS;
+  Function *F;
+  DataFlowSanitizer::InstrumentedABI IA;
+  bool IsNativeABI;
+  Value *ArgTLSPtr;
+  Value *RetvalTLSPtr;
+  AllocaInst *LabelReturnAlloca;
+  DenseMap<Value *, Value *> ValShadowMap;
+  DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
+  std::vector<std::pair<PHINode *, PHINode *> > PHIFixups;
+  DenseSet<Instruction *> SkipInsts;
+  DenseSet<Value *> NonZeroChecks;
+
+  DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
+      : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()),
+        IsNativeABI(IsNativeABI), ArgTLSPtr(0), RetvalTLSPtr(0),
+        LabelReturnAlloca(0) {}
+  Value *getArgTLSPtr();
+  Value *getArgTLS(unsigned Index, Instruction *Pos);
+  Value *getRetvalTLS();
+  Value *getShadow(Value *V);
+  void setShadow(Instruction *I, Value *Shadow);
+  Value *combineOperandShadows(Instruction *Inst);
+  Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
+                    Instruction *Pos);
+  void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow,
+                   Instruction *Pos);
+};
+
+class DFSanVisitor : public InstVisitor<DFSanVisitor> {
+ public:
+  DFSanFunction &DFSF;
+  DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+
+  void visitOperandShadowInst(Instruction &I);
+
+  void visitBinaryOperator(BinaryOperator &BO);
+  void visitCastInst(CastInst &CI);
+  void visitCmpInst(CmpInst &CI);
+  void visitGetElementPtrInst(GetElementPtrInst &GEPI);
+  void visitLoadInst(LoadInst &LI);
+  void visitStoreInst(StoreInst &SI);
+  void visitReturnInst(ReturnInst &RI);
+  void visitCallSite(CallSite CS);
+  void visitPHINode(PHINode &PN);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+  void visitExtractValueInst(ExtractValueInst &I);
+  void visitInsertValueInst(InsertValueInst &I);
+  void visitAllocaInst(AllocaInst &I);
+  void visitSelectInst(SelectInst &I);
+  void visitMemSetInst(MemSetInst &I);
+  void visitMemTransferInst(MemTransferInst &I);
+};
+
+}
+
+char DataFlowSanitizer::ID;
+INITIALIZE_PASS(DataFlowSanitizer, "dfsan",
+                "DataFlowSanitizer: dynamic data flow analysis.", false, false)
+
+ModulePass *llvm::createDataFlowSanitizerPass(StringRef ABIListFile,
+                                              void *(*getArgTLS)(),
+                                              void *(*getRetValTLS)()) {
+  return new DataFlowSanitizer(ABIListFile, getArgTLS, getRetValTLS);
+}
+
+DataFlowSanitizer::DataFlowSanitizer(StringRef ABIListFile,
+                                     void *(*getArgTLS)(),
+                                     void *(*getRetValTLS)())
+    : ModulePass(ID), GetArgTLSPtr(getArgTLS), GetRetvalTLSPtr(getRetValTLS),
+      ABIList(SpecialCaseList::createOrDie(ABIListFile.empty() ? ClABIListFile
+                                                               : ABIListFile)) {
+}
+
+FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
+  llvm::SmallVector<Type *, 4> ArgTypes;
+  std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
+  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+    ArgTypes.push_back(ShadowTy);
+  if (T->isVarArg())
+    ArgTypes.push_back(ShadowPtrTy);
+  Type *RetType = T->getReturnType();
+  if (!RetType->isVoidTy())
+    RetType = StructType::get(RetType, ShadowTy, (Type *)0);
+  return FunctionType::get(RetType, ArgTypes, T->isVarArg());
+}
+
+FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
+  assert(!T->isVarArg());
+  llvm::SmallVector<Type *, 4> ArgTypes;
+  ArgTypes.push_back(T->getPointerTo());
+  std::copy(T->param_begin(), T->param_end(), std::back_inserter(ArgTypes));
+  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+    ArgTypes.push_back(ShadowTy);
+  Type *RetType = T->getReturnType();
+  if (!RetType->isVoidTy())
+    ArgTypes.push_back(ShadowPtrTy);
+  return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+FunctionType *DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+  assert(!T->isVarArg());
+  llvm::SmallVector<Type *, 4> ArgTypes;
+  for (FunctionType::param_iterator i = T->param_begin(), e = T->param_end();
+       i != e; ++i) {
+    FunctionType *FT;
+    if (isa<PointerType>(*i) && (FT = dyn_cast<FunctionType>(cast<PointerType>(
+                                     *i)->getElementType()))) {
+      ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
+      ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
+    } else {
+      ArgTypes.push_back(*i);
+    }
+  }
+  for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+    ArgTypes.push_back(ShadowTy);
+  Type *RetType = T->getReturnType();
+  if (!RetType->isVoidTy())
+    ArgTypes.push_back(ShadowPtrTy);
+  return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+bool DataFlowSanitizer::doInitialization(Module &M) {
+  DL = getAnalysisIfAvailable<DataLayout>();
+  if (!DL)
+    return false;
+
+  Mod = &M;
+  Ctx = &M.getContext();
+  ShadowTy = IntegerType::get(*Ctx, ShadowWidth);
+  ShadowPtrTy = PointerType::getUnqual(ShadowTy);
+  IntptrTy = DL->getIntPtrType(*Ctx);
+  ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
+  ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+  ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
+
+  Type *DFSanUnionArgs[2] = { ShadowTy, ShadowTy };
+  DFSanUnionFnTy =
+      FunctionType::get(ShadowTy, DFSanUnionArgs, /*isVarArg=*/ false);
+  Type *DFSanUnionLoadArgs[2] = { ShadowPtrTy, IntptrTy };
+  DFSanUnionLoadFnTy =
+      FunctionType::get(ShadowTy, DFSanUnionLoadArgs, /*isVarArg=*/ false);
+  DFSanUnimplementedFnTy = FunctionType::get(
+      Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+  Type *DFSanSetLabelArgs[3] = { ShadowTy, Type::getInt8PtrTy(*Ctx), IntptrTy };
+  DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
+                                        DFSanSetLabelArgs, /*isVarArg=*/false);
+  DFSanNonzeroLabelFnTy = FunctionType::get(
+      Type::getVoidTy(*Ctx), ArrayRef<Type *>(), /*isVarArg=*/false);
+
+  if (GetArgTLSPtr) {
+    Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+    ArgTLS = 0;
+    GetArgTLS = ConstantExpr::getIntToPtr(
+        ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)),
+        PointerType::getUnqual(
+            FunctionType::get(PointerType::getUnqual(ArgTLSTy), (Type *)0)));
+  }
+  if (GetRetvalTLSPtr) {
+    RetvalTLS = 0;
+    GetRetvalTLS = ConstantExpr::getIntToPtr(
+        ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)),
+        PointerType::getUnqual(
+            FunctionType::get(PointerType::getUnqual(ShadowTy), (Type *)0)));
+  }
+
+  ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
+  return true;
+}
+
+bool DataFlowSanitizer::isInstrumented(const Function *F) {
+  return !ABIList->isIn(*F, "uninstrumented");
+}
+
+bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
+  return !ABIList->isIn(*GA, "uninstrumented");
+}
+
+DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
+  return ClArgsABI ? IA_Args : IA_TLS;
+}
+
+DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
+  if (ABIList->isIn(*F, "functional"))
+    return WK_Functional;
+  if (ABIList->isIn(*F, "discard"))
+    return WK_Discard;
+  if (ABIList->isIn(*F, "custom"))
+    return WK_Custom;
+
+  return WK_Warning;
+}
+
+void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
+  std::string GVName = GV->getName(), Prefix = "dfs$";
+  GV->setName(Prefix + GVName);
+
+  // Try to change the name of the function in module inline asm.  We only do
+  // this for specific asm directives, currently only ".symver", to try to avoid
+  // corrupting asm which happens to contain the symbol name as a substring.
+  // Note that the substitution for .symver assumes that the versioned symbol
+  // also has an instrumented name.
+  std::string Asm = GV->getParent()->getModuleInlineAsm();
+  std::string SearchStr = ".symver " + GVName + ",";
+  size_t Pos = Asm.find(SearchStr);
+  if (Pos != std::string::npos) {
+    Asm.replace(Pos, SearchStr.size(),
+                ".symver " + Prefix + GVName + "," + Prefix);
+    GV->getParent()->setModuleInlineAsm(Asm);
+  }
+}
+
+Function *
+DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
+                                        GlobalValue::LinkageTypes NewFLink,
+                                        FunctionType *NewFT) {
+  FunctionType *FT = F->getFunctionType();
+  Function *NewF = Function::Create(NewFT, NewFLink, NewFName,
+                                    F->getParent());
+  NewF->copyAttributesFrom(F);
+  NewF->removeAttributes(
+      AttributeSet::ReturnIndex,
+      AttributeFuncs::typeIncompatible(NewFT->getReturnType(),
+                                       AttributeSet::ReturnIndex));
+
+  BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
+  std::vector<Value *> Args;
+  unsigned n = FT->getNumParams();
+  for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
+    Args.push_back(&*ai);
+  CallInst *CI = CallInst::Create(F, Args, "", BB);
+  if (FT->getReturnType()->isVoidTy())
+    ReturnInst::Create(*Ctx, BB);
+  else
+    ReturnInst::Create(*Ctx, CI, BB);
+
+  return NewF;
+}
+
+Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
+                                                          StringRef FName) {
+  FunctionType *FTT = getTrampolineFunctionType(FT);
+  Constant *C = Mod->getOrInsertFunction(FName, FTT);
+  Function *F = dyn_cast<Function>(C);
+  if (F && F->isDeclaration()) {
+    F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+    BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+    std::vector<Value *> Args;
+    Function::arg_iterator AI = F->arg_begin(); ++AI;
+    for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
+      Args.push_back(&*AI);
+    CallInst *CI =
+        CallInst::Create(&F->getArgumentList().front(), Args, "", BB);
+    ReturnInst *RI;
+    if (FT->getReturnType()->isVoidTy())
+      RI = ReturnInst::Create(*Ctx, BB);
+    else
+      RI = ReturnInst::Create(*Ctx, CI, BB);
+
+    DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
+    Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
+    for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N)
+      DFSF.ValShadowMap[ValAI] = ShadowAI;
+    DFSanVisitor(DFSF).visitCallInst(*CI);
+    if (!FT->getReturnType()->isVoidTy())
+      new StoreInst(DFSF.getShadow(RI->getReturnValue()),
+                    &F->getArgumentList().back(), RI);
+  }
+
+  return C;
+}
+
+bool DataFlowSanitizer::runOnModule(Module &M) {
+  if (!DL)
+    return false;
+
+  if (ABIList->isIn(M, "skip"))
+    return false;
+
+  if (!GetArgTLSPtr) {
+    Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+    ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
+    if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS))
+      G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+  }
+  if (!GetRetvalTLSPtr) {
+    RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
+    if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS))
+      G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+  }
+
+  DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
+  if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
+    F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    F->addAttribute(1, Attribute::ZExt);
+    F->addAttribute(2, Attribute::ZExt);
+  }
+  DFSanUnionLoadFn =
+      Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
+  if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
+    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+  }
+  DFSanUnimplementedFn =
+      Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
+  DFSanSetLabelFn =
+      Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy);
+  if (Function *F = dyn_cast<Function>(DFSanSetLabelFn)) {
+    F->addAttribute(1, Attribute::ZExt);
+  }
+  DFSanNonzeroLabelFn =
+      Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
+
+  std::vector<Function *> FnsToInstrument;
+  llvm::SmallPtrSet<Function *, 2> FnsWithNativeABI;
+  for (Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+    if (!i->isIntrinsic() &&
+        i != DFSanUnionFn &&
+        i != DFSanUnionLoadFn &&
+        i != DFSanUnimplementedFn &&
+        i != DFSanSetLabelFn &&
+        i != DFSanNonzeroLabelFn)
+      FnsToInstrument.push_back(&*i);
+  }
+
+  // Give function aliases prefixes when necessary, and build wrappers where the
+  // instrumentedness is inconsistent.
+  for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
+    GlobalAlias *GA = &*i;
+    ++i;
+    // Don't stop on weak.  We assume people aren't playing games with the
+    // instrumentedness of overridden weak aliases.
+    if (Function *F = dyn_cast<Function>(
+            GA->resolveAliasedGlobal(/*stopOnWeak=*/false))) {
+      bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
+      if (GAInst && FInst) {
+        addGlobalNamePrefix(GA);
+      } else if (GAInst != FInst) {
+        // Non-instrumented alias of an instrumented function, or vice versa.
+        // Replace the alias with a native-ABI wrapper of the aliasee.  The pass
+        // below will take care of instrumenting it.
+        Function *NewF =
+            buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
+        GA->replaceAllUsesWith(NewF);
+        NewF->takeName(GA);
+        GA->eraseFromParent();
+        FnsToInstrument.push_back(NewF);
+      }
+    }
+  }
+
+  AttrBuilder B;
+  B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
+  ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B);
+
+  // First, change the ABI of every function in the module.  ABI-listed
+  // functions keep their original ABI and get a wrapper function.
+  for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+                                         e = FnsToInstrument.end();
+       i != e; ++i) {
+    Function &F = **i;
+    FunctionType *FT = F.getFunctionType();
+
+    bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
+                              FT->getReturnType()->isVoidTy());
+
+    if (isInstrumented(&F)) {
+      // Instrumented functions get a 'dfs$' prefix.  This allows us to more
+      // easily identify cases of mismatching ABIs.
+      if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
+        FunctionType *NewFT = getArgsFunctionType(FT);
+        Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
+        NewF->copyAttributesFrom(&F);
+        NewF->removeAttributes(
+            AttributeSet::ReturnIndex,
+            AttributeFuncs::typeIncompatible(NewFT->getReturnType(),
+                                             AttributeSet::ReturnIndex));
+        for (Function::arg_iterator FArg = F.arg_begin(),
+                                    NewFArg = NewF->arg_begin(),
+                                    FArgEnd = F.arg_end();
+             FArg != FArgEnd; ++FArg, ++NewFArg) {
+          FArg->replaceAllUsesWith(NewFArg);
+        }
+        NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
+
+        for (Function::use_iterator ui = F.use_begin(), ue = F.use_end();
+             ui != ue;) {
+          BlockAddress *BA = dyn_cast<BlockAddress>(ui.getUse().getUser());
+          ++ui;
+          if (BA) {
+            BA->replaceAllUsesWith(
+                BlockAddress::get(NewF, BA->getBasicBlock()));
+            delete BA;
+          }
+        }
+        F.replaceAllUsesWith(
+            ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
+        NewF->takeName(&F);
+        F.eraseFromParent();
+        *i = NewF;
+        addGlobalNamePrefix(NewF);
+      } else {
+        addGlobalNamePrefix(&F);
+      }
+               // Hopefully, nobody will try to indirectly call a vararg
+               // function... yet.
+    } else if (FT->isVarArg()) {
+      UnwrappedFnMap[&F] = &F;
+      *i = 0;
+    } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
+      // Build a wrapper function for F.  The wrapper simply calls F, and is
+      // added to FnsToInstrument so that any instrumentation according to its
+      // WrapperKind is done in the second pass below.
+      FunctionType *NewFT = getInstrumentedABI() == IA_Args
+                                ? getArgsFunctionType(FT)
+                                : FT;
+      Function *NewF = buildWrapperFunction(
+          &F, std::string("dfsw$") + std::string(F.getName()),
+          GlobalValue::LinkOnceODRLinkage, NewFT);
+      if (getInstrumentedABI() == IA_TLS)
+        NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs);
+
+      Value *WrappedFnCst =
+          ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
+      F.replaceAllUsesWith(WrappedFnCst);
+      UnwrappedFnMap[WrappedFnCst] = &F;
+      *i = NewF;
+
+      if (!F.isDeclaration()) {
+        // This function is probably defining an interposition of an
+        // uninstrumented function and hence needs to keep the original ABI.
+        // But any functions it may call need to use the instrumented ABI, so
+        // we instrument it in a mode which preserves the original ABI.
+        FnsWithNativeABI.insert(&F);
+
+        // This code needs to rebuild the iterators, as they may be invalidated
+        // by the push_back, taking care that the new range does not include
+        // any functions added by this code.
+        size_t N = i - FnsToInstrument.begin(),
+               Count = e - FnsToInstrument.begin();
+        FnsToInstrument.push_back(&F);
+        i = FnsToInstrument.begin() + N;
+        e = FnsToInstrument.begin() + Count;
+      }
+    }
+  }
+
+  for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+                                         e = FnsToInstrument.end();
+       i != e; ++i) {
+    if (!*i || (*i)->isDeclaration())
+      continue;
+
+    removeUnreachableBlocks(**i);
+
+    DFSanFunction DFSF(*this, *i, FnsWithNativeABI.count(*i));
+
+    // DFSanVisitor may create new basic blocks, which confuses df_iterator.
+    // Build a copy of the list before iterating over it.
+    llvm::SmallVector<BasicBlock *, 4> BBList;
+    std::copy(df_begin(&(*i)->getEntryBlock()), df_end(&(*i)->getEntryBlock()),
+              std::back_inserter(BBList));
+
+    for (llvm::SmallVector<BasicBlock *, 4>::iterator i = BBList.begin(),
+                                                      e = BBList.end();
+         i != e; ++i) {
+      Instruction *Inst = &(*i)->front();
+      while (1) {
+        // DFSanVisitor may split the current basic block, changing the current
+        // instruction's next pointer and moving the next instruction to the
+        // tail block from which we should continue.
+        Instruction *Next = Inst->getNextNode();
+        // DFSanVisitor may delete Inst, so keep track of whether it was a
+        // terminator.
+        bool IsTerminator = isa<TerminatorInst>(Inst);
+        if (!DFSF.SkipInsts.count(Inst))
+          DFSanVisitor(DFSF).visit(Inst);
+        if (IsTerminator)
+          break;
+        Inst = Next;
+      }
+    }
+
+    // We will not necessarily be able to compute the shadow for every phi node
+    // until we have visited every block.  Therefore, the code that handles phi
+    // nodes adds them to the PHIFixups list so that they can be properly
+    // handled here.
+    for (std::vector<std::pair<PHINode *, PHINode *> >::iterator
+             i = DFSF.PHIFixups.begin(),
+             e = DFSF.PHIFixups.end();
+         i != e; ++i) {
+      for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
+           ++val) {
+        i->second->setIncomingValue(
+            val, DFSF.getShadow(i->first->getIncomingValue(val)));
+      }
+    }
+
+    // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
+    // places (i.e. instructions in basic blocks we haven't even begun visiting
+    // yet).  To make our life easier, do this work in a pass after the main
+    // instrumentation.
+    if (ClDebugNonzeroLabels) {
+      for (DenseSet<Value *>::iterator i = DFSF.NonZeroChecks.begin(),
+                                       e = DFSF.NonZeroChecks.end();
+           i != e; ++i) {
+        Instruction *Pos;
+        if (Instruction *I = dyn_cast<Instruction>(*i))
+          Pos = I->getNextNode();
+        else
+          Pos = DFSF.F->getEntryBlock().begin();
+        while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
+          Pos = Pos->getNextNode();
+        IRBuilder<> IRB(Pos);
+        Instruction *NeInst = cast<Instruction>(
+            IRB.CreateICmpNE(*i, DFSF.DFS.ZeroShadow));
+        BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+            NeInst, /*Unreachable=*/ false, ColdCallWeights));
+        IRBuilder<> ThenIRB(BI);
+        ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn);
+      }
+    }
+  }
+
+  return false;
+}
+
+Value *DFSanFunction::getArgTLSPtr() {
+  if (ArgTLSPtr)
+    return ArgTLSPtr;
+  if (DFS.ArgTLS)
+    return ArgTLSPtr = DFS.ArgTLS;
+
+  IRBuilder<> IRB(F->getEntryBlock().begin());
+  return ArgTLSPtr = IRB.CreateCall(DFS.GetArgTLS);
+}
+
+Value *DFSanFunction::getRetvalTLS() {
+  if (RetvalTLSPtr)
+    return RetvalTLSPtr;
+  if (DFS.RetvalTLS)
+    return RetvalTLSPtr = DFS.RetvalTLS;
+
+  IRBuilder<> IRB(F->getEntryBlock().begin());
+  return RetvalTLSPtr = IRB.CreateCall(DFS.GetRetvalTLS);
+}
+
+Value *DFSanFunction::getArgTLS(unsigned Idx, Instruction *Pos) {
+  IRBuilder<> IRB(Pos);
+  return IRB.CreateConstGEP2_64(getArgTLSPtr(), 0, Idx);
+}
+
+Value *DFSanFunction::getShadow(Value *V) {
+  if (!isa<Argument>(V) && !isa<Instruction>(V))
+    return DFS.ZeroShadow;
+  Value *&Shadow = ValShadowMap[V];
+  if (!Shadow) {
+    if (Argument *A = dyn_cast<Argument>(V)) {
+      if (IsNativeABI)
+        return DFS.ZeroShadow;
+      switch (IA) {
+      case DataFlowSanitizer::IA_TLS: {
+        Value *ArgTLSPtr = getArgTLSPtr();
+        Instruction *ArgTLSPos =
+            DFS.ArgTLS ? &*F->getEntryBlock().begin()
+                       : cast<Instruction>(ArgTLSPtr)->getNextNode();
+        IRBuilder<> IRB(ArgTLSPos);
+        Shadow = IRB.CreateLoad(getArgTLS(A->getArgNo(), ArgTLSPos));
+        break;
+      }
+      case DataFlowSanitizer::IA_Args: {
+        unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2;
+        Function::arg_iterator i = F->arg_begin();
+        while (ArgIdx--)
+          ++i;
+        Shadow = i;
+        assert(Shadow->getType() == DFS.ShadowTy);
+        break;
+      }
+      }
+      NonZeroChecks.insert(Shadow);
+    } else {
+      Shadow = DFS.ZeroShadow;
+    }
+  }
+  return Shadow;
+}
+
+void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
+  assert(!ValShadowMap.count(I));
+  assert(Shadow->getType() == DFS.ShadowTy);
+  ValShadowMap[I] = Shadow;
+}
+
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
+  assert(Addr != RetvalTLS && "Reinstrumenting?");
+  IRBuilder<> IRB(Pos);
+  return IRB.CreateIntToPtr(
+      IRB.CreateMul(
+          IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy), ShadowPtrMask),
+          ShadowPtrMul),
+      ShadowPtrTy);
+}
+
+// Generates IR to compute the union of the two given shadows, inserting it
+// before Pos.  Returns the computed union Value.
+Value *DataFlowSanitizer::combineShadows(Value *V1, Value *V2,
+                                         Instruction *Pos) {
+  if (V1 == ZeroShadow)
+    return V2;
+  if (V2 == ZeroShadow)
+    return V1;
+  if (V1 == V2)
+    return V1;
+  IRBuilder<> IRB(Pos);
+  BasicBlock *Head = Pos->getParent();
+  Value *Ne = IRB.CreateICmpNE(V1, V2);
+  Instruction *NeInst = dyn_cast<Instruction>(Ne);
+  if (NeInst) {
+    BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+        NeInst, /*Unreachable=*/ false, ColdCallWeights));
+    IRBuilder<> ThenIRB(BI);
+    CallInst *Call = ThenIRB.CreateCall2(DFSanUnionFn, V1, V2);
+    Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    Call->addAttribute(1, Attribute::ZExt);
+    Call->addAttribute(2, Attribute::ZExt);
+
+    BasicBlock *Tail = BI->getSuccessor(0);
+    PHINode *Phi = PHINode::Create(ShadowTy, 2, "", Tail->begin());
+    Phi->addIncoming(Call, Call->getParent());
+    Phi->addIncoming(V1, Head);
+    Pos = Phi;
+    return Phi;
+  } else {
+    assert(0 && "todo");
+    return 0;
+  }
+}
+
+// A convenience function which folds the shadows of each of the operands
+// of the provided instruction Inst, inserting the IR before Inst.  Returns
+// the computed union Value.
+Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
+  if (Inst->getNumOperands() == 0)
+    return DFS.ZeroShadow;
+
+  Value *Shadow = getShadow(Inst->getOperand(0));
+  for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
+    Shadow = DFS.combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
+  }
+  return Shadow;
+}
+
+void DFSanVisitor::visitOperandShadowInst(Instruction &I) {
+  Value *CombinedShadow = DFSF.combineOperandShadows(&I);
+  DFSF.setShadow(&I, CombinedShadow);
+}
+
+// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
+// Addr has alignment Align, and take the union of each of those shadows.
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
+                                 Instruction *Pos) {
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+    llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+        AllocaShadowMap.find(AI);
+    if (i != AllocaShadowMap.end()) {
+      IRBuilder<> IRB(Pos);
+      return IRB.CreateLoad(i->second);
+    }
+  }
+
+  uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+  SmallVector<Value *, 2> Objs;
+  GetUnderlyingObjects(Addr, Objs, DFS.DL);
+  bool AllConstants = true;
+  for (SmallVector<Value *, 2>::iterator i = Objs.begin(), e = Objs.end();
+       i != e; ++i) {
+    if (isa<Function>(*i) || isa<BlockAddress>(*i))
+      continue;
+    if (isa<GlobalVariable>(*i) && cast<GlobalVariable>(*i)->isConstant())
+      continue;
+
+    AllConstants = false;
+    break;
+  }
+  if (AllConstants)
+    return DFS.ZeroShadow;
+
+  Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+  switch (Size) {
+  case 0:
+    return DFS.ZeroShadow;
+  case 1: {
+    LoadInst *LI = new LoadInst(ShadowAddr, "", Pos);
+    LI->setAlignment(ShadowAlign);
+    return LI;
+  }
+  case 2: {
+    IRBuilder<> IRB(Pos);
+    Value *ShadowAddr1 =
+        IRB.CreateGEP(ShadowAddr, ConstantInt::get(DFS.IntptrTy, 1));
+    return DFS.combineShadows(IRB.CreateAlignedLoad(ShadowAddr, ShadowAlign),
+                              IRB.CreateAlignedLoad(ShadowAddr1, ShadowAlign),
+                              Pos);
+  }
+  }
+  if (Size % (64 / DFS.ShadowWidth) == 0) {
+    // Fast path for the common case where each byte has identical shadow: load
+    // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
+    // shadow is non-equal.
+    BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
+    IRBuilder<> FallbackIRB(FallbackBB);
+    CallInst *FallbackCall = FallbackIRB.CreateCall2(
+        DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size));
+    FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+
+    // Compare each of the shadows stored in the loaded 64 bits to each other,
+    // by computing (WideShadow rotl ShadowWidth) == WideShadow.
+    IRBuilder<> IRB(Pos);
+    Value *WideAddr =
+        IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+    Value *WideShadow = IRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+    Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy);
+    Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth);
+    Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth);
+    Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
+    Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
+
+    BasicBlock *Head = Pos->getParent();
+    BasicBlock *Tail = Head->splitBasicBlock(Pos);
+    // In the following code LastBr will refer to the previous basic block's
+    // conditional branch instruction, whose true successor is fixed up to point
+    // to the next block during the loop below or to the tail after the final
+    // iteration.
+    BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
+    ReplaceInstWithInst(Head->getTerminator(), LastBr);
+
+    for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size;
+         Ofs += 64 / DFS.ShadowWidth) {
+      BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
+      IRBuilder<> NextIRB(NextBB);
+      WideAddr = NextIRB.CreateGEP(WideAddr, ConstantInt::get(DFS.IntptrTy, 1));
+      Value *NextWideShadow = NextIRB.CreateAlignedLoad(WideAddr, ShadowAlign);
+      ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
+      LastBr->setSuccessor(0, NextBB);
+      LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
+    }
+
+    LastBr->setSuccessor(0, Tail);
+    FallbackIRB.CreateBr(Tail);
+    PHINode *Shadow = PHINode::Create(DFS.ShadowTy, 2, "", &Tail->front());
+    Shadow->addIncoming(FallbackCall, FallbackBB);
+    Shadow->addIncoming(TruncShadow, LastBr->getParent());
+    return Shadow;
+  }
+
+  IRBuilder<> IRB(Pos);
+  CallInst *FallbackCall = IRB.CreateCall2(
+      DFS.DFSanUnionLoadFn, ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size));
+  FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+  return FallbackCall;
+}
+
+void DFSanVisitor::visitLoadInst(LoadInst &LI) {
+  uint64_t Size = DFSF.DFS.DL->getTypeStoreSize(LI.getType());
+  uint64_t Align;
+  if (ClPreserveAlignment) {
+    Align = LI.getAlignment();
+    if (Align == 0)
+      Align = DFSF.DFS.DL->getABITypeAlignment(LI.getType());
+  } else {
+    Align = 1;
+  }
+  IRBuilder<> IRB(&LI);
+  Value *LoadedShadow =
+      DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI);
+  Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
+  Value *CombinedShadow = DFSF.DFS.combineShadows(LoadedShadow, PtrShadow, &LI);
+  if (CombinedShadow != DFSF.DFS.ZeroShadow)
+    DFSF.NonZeroChecks.insert(CombinedShadow);
+
+  DFSF.setShadow(&LI, CombinedShadow);
+}
+
+void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
+                                Value *Shadow, Instruction *Pos) {
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+    llvm::DenseMap<AllocaInst *, AllocaInst *>::iterator i =
+        AllocaShadowMap.find(AI);
+    if (i != AllocaShadowMap.end()) {
+      IRBuilder<> IRB(Pos);
+      IRB.CreateStore(Shadow, i->second);
+      return;
+    }
+  }
+
+  uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+  IRBuilder<> IRB(Pos);
+  Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+  if (Shadow == DFS.ZeroShadow) {
+    IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth);
+    Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
+    Value *ExtShadowAddr =
+        IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
+    IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
+    return;
+  }
+
+  const unsigned ShadowVecSize = 128 / DFS.ShadowWidth;
+  uint64_t Offset = 0;
+  if (Size >= ShadowVecSize) {
+    VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize);
+    Value *ShadowVec = UndefValue::get(ShadowVecTy);
+    for (unsigned i = 0; i != ShadowVecSize; ++i) {
+      ShadowVec = IRB.CreateInsertElement(
+          ShadowVec, Shadow, ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i));
+    }
+    Value *ShadowVecAddr =
+        IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
+    do {
+      Value *CurShadowVecAddr = IRB.CreateConstGEP1_32(ShadowVecAddr, Offset);
+      IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
+      Size -= ShadowVecSize;
+      ++Offset;
+    } while (Size >= ShadowVecSize);
+    Offset *= ShadowVecSize;
+  }
+  while (Size > 0) {
+    Value *CurShadowAddr = IRB.CreateConstGEP1_32(ShadowAddr, Offset);
+    IRB.CreateAlignedStore(Shadow, CurShadowAddr, ShadowAlign);
+    --Size;
+    ++Offset;
+  }
+}
+
+void DFSanVisitor::visitStoreInst(StoreInst &SI) {
+  uint64_t Size =
+      DFSF.DFS.DL->getTypeStoreSize(SI.getValueOperand()->getType());
+  uint64_t Align;
+  if (ClPreserveAlignment) {
+    Align = SI.getAlignment();
+    if (Align == 0)
+      Align = DFSF.DFS.DL->getABITypeAlignment(SI.getValueOperand()->getType());
+  } else {
+    Align = 1;
+  }
+  DFSF.storeShadow(SI.getPointerOperand(), Size, Align,
+                   DFSF.getShadow(SI.getValueOperand()), &SI);
+}
+
+void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
+  visitOperandShadowInst(BO);
+}
+
+void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+  visitOperandShadowInst(GEPI);
+}
+
+void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
+  visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
+  bool AllLoadsStores = true;
+  for (Instruction::use_iterator i = I.use_begin(), e = I.use_end(); i != e;
+       ++i) {
+    if (isa<LoadInst>(*i))
+      continue;
+
+    if (StoreInst *SI = dyn_cast<StoreInst>(*i)) {
+      if (SI->getPointerOperand() == &I)
+        continue;
+    }
+
+    AllLoadsStores = false;
+    break;
+  }
+  if (AllLoadsStores) {
+    IRBuilder<> IRB(&I);
+    DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.ShadowTy);
+  }
+  DFSF.setShadow(&I, DFSF.DFS.ZeroShadow);
+}
+
+void DFSanVisitor::visitSelectInst(SelectInst &I) {
+  Value *CondShadow = DFSF.getShadow(I.getCondition());
+  Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
+  Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
+
+  if (isa<VectorType>(I.getCondition()->getType())) {
+    DFSF.setShadow(
+        &I, DFSF.DFS.combineShadows(
+                CondShadow,
+                DFSF.DFS.combineShadows(TrueShadow, FalseShadow, &I), &I));
+  } else {
+    Value *ShadowSel;
+    if (TrueShadow == FalseShadow) {
+      ShadowSel = TrueShadow;
+    } else {
+      ShadowSel =
+          SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
+    }
+    DFSF.setShadow(&I, DFSF.DFS.combineShadows(CondShadow, ShadowSel, &I));
+  }
+}
+
+void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
+  IRBuilder<> IRB(&I);
+  Value *ValShadow = DFSF.getShadow(I.getValue());
+  IRB.CreateCall3(
+      DFSF.DFS.DFSanSetLabelFn, ValShadow,
+      IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(*DFSF.DFS.Ctx)),
+      IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy));
+}
+
+void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
+  IRBuilder<> IRB(&I);
+  Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
+  Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
+  Value *LenShadow = IRB.CreateMul(
+      I.getLength(),
+      ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
+  Value *AlignShadow;
+  if (ClPreserveAlignment) {
+    AlignShadow = IRB.CreateMul(I.getAlignmentCst(),
+                                ConstantInt::get(I.getAlignmentCst()->getType(),
+                                                 DFSF.DFS.ShadowWidth / 8));
+  } else {
+    AlignShadow = ConstantInt::get(I.getAlignmentCst()->getType(),
+                                   DFSF.DFS.ShadowWidth / 8);
+  }
+  Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
+  DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
+  SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
+  IRB.CreateCall5(I.getCalledValue(), DestShadow, SrcShadow, LenShadow,
+                  AlignShadow, I.getVolatileCst());
+}
+
+void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
+  if (!DFSF.IsNativeABI && RI.getReturnValue()) {
+    switch (DFSF.IA) {
+    case DataFlowSanitizer::IA_TLS: {
+      Value *S = DFSF.getShadow(RI.getReturnValue());
+      IRBuilder<> IRB(&RI);
+      IRB.CreateStore(S, DFSF.getRetvalTLS());
+      break;
+    }
+    case DataFlowSanitizer::IA_Args: {
+      IRBuilder<> IRB(&RI);
+      Type *RT = DFSF.F->getFunctionType()->getReturnType();
+      Value *InsVal =
+          IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
+      Value *InsShadow =
+          IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
+      RI.setOperand(0, InsShadow);
+      break;
+    }
+    }
+  }
+}
+
+void DFSanVisitor::visitCallSite(CallSite CS) {
+  Function *F = CS.getCalledFunction();
+  if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) {
+    visitOperandShadowInst(*CS.getInstruction());
+    return;
+  }
+
+  IRBuilder<> IRB(CS.getInstruction());
+
+  DenseMap<Value *, Function *>::iterator i =
+      DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue());
+  if (i != DFSF.DFS.UnwrappedFnMap.end()) {
+    Function *F = i->second;
+    switch (DFSF.DFS.getWrapperKind(F)) {
+    case DataFlowSanitizer::WK_Warning: {
+      CS.setCalledFunction(F);
+      IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
+                     IRB.CreateGlobalStringPtr(F->getName()));
+      DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+      return;
+    }
+    case DataFlowSanitizer::WK_Discard: {
+      CS.setCalledFunction(F);
+      DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+      return;
+    }
+    case DataFlowSanitizer::WK_Functional: {
+      CS.setCalledFunction(F);
+      visitOperandShadowInst(*CS.getInstruction());
+      return;
+    }
+    case DataFlowSanitizer::WK_Custom: {
+      // Don't try to handle invokes of custom functions, it's too complicated.
+      // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
+      // wrapper.
+      if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
+        FunctionType *FT = F->getFunctionType();
+        FunctionType *CustomFT = DFSF.DFS.getCustomFunctionType(FT);
+        std::string CustomFName = "__dfsw_";
+        CustomFName += F->getName();
+        Constant *CustomF =
+            DFSF.DFS.Mod->getOrInsertFunction(CustomFName, CustomFT);
+        if (Function *CustomFn = dyn_cast<Function>(CustomF)) {
+          CustomFn->copyAttributesFrom(F);
+
+          // Custom functions returning non-void will write to the return label.
+          if (!FT->getReturnType()->isVoidTy()) {
+            CustomFn->removeAttributes(AttributeSet::FunctionIndex,
+                                       DFSF.DFS.ReadOnlyNoneAttrs);
+          }
+        }
+
+        std::vector<Value *> Args;
+
+        CallSite::arg_iterator i = CS.arg_begin();
+        for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
+          Type *T = (*i)->getType();
+          FunctionType *ParamFT;
+          if (isa<PointerType>(T) &&
+              (ParamFT = dyn_cast<FunctionType>(
+                   cast<PointerType>(T)->getElementType()))) {
+            std::string TName = "dfst";
+            TName += utostr(FT->getNumParams() - n);
+            TName += "$";
+            TName += F->getName();
+            Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
+            Args.push_back(T);
+            Args.push_back(
+                IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
+          } else {
+            Args.push_back(*i);
+          }
+        }
+
+        i = CS.arg_begin();
+        for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+          Args.push_back(DFSF.getShadow(*i));
+
+        if (!FT->getReturnType()->isVoidTy()) {
+          if (!DFSF.LabelReturnAlloca) {
+            DFSF.LabelReturnAlloca =
+                new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn",
+                               DFSF.F->getEntryBlock().begin());
+          }
+          Args.push_back(DFSF.LabelReturnAlloca);
+        }
+
+        CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
+        CustomCI->setCallingConv(CI->getCallingConv());
+        CustomCI->setAttributes(CI->getAttributes());
+
+        if (!FT->getReturnType()->isVoidTy()) {
+          LoadInst *LabelLoad = IRB.CreateLoad(DFSF.LabelReturnAlloca);
+          DFSF.setShadow(CustomCI, LabelLoad);
+        }
+
+        CI->replaceAllUsesWith(CustomCI);
+        CI->eraseFromParent();
+        return;
+      }
+      break;
+    }
+    }
+  }
+
+  FunctionType *FT = cast<FunctionType>(
+      CS.getCalledValue()->getType()->getPointerElementType());
+  if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+    for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) {
+      IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)),
+                      DFSF.getArgTLS(i, CS.getInstruction()));
+    }
+  }
+
+  Instruction *Next = 0;
+  if (!CS.getType()->isVoidTy()) {
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      if (II->getNormalDest()->getSinglePredecessor()) {
+        Next = II->getNormalDest()->begin();
+      } else {
+        BasicBlock *NewBB =
+            SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DFS);
+        Next = NewBB->begin();
+      }
+    } else {
+      Next = CS->getNextNode();
+    }
+
+    if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+      IRBuilder<> NextIRB(Next);
+      LoadInst *LI = NextIRB.CreateLoad(DFSF.getRetvalTLS());
+      DFSF.SkipInsts.insert(LI);
+      DFSF.setShadow(CS.getInstruction(), LI);
+      DFSF.NonZeroChecks.insert(LI);
+    }
+  }
+
+  // Do all instrumentation for IA_Args down here to defer tampering with the
+  // CFG in a way that SplitEdge may be able to detect.
+  if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
+    FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
+    Value *Func =
+        IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT));
+    std::vector<Value *> Args;
+
+    CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+    for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+      Args.push_back(*i);
+
+    i = CS.arg_begin();
+    for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+      Args.push_back(DFSF.getShadow(*i));
+
+    if (FT->isVarArg()) {
+      unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
+      ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
+      AllocaInst *VarArgShadow =
+          new AllocaInst(VarArgArrayTy, "", DFSF.F->getEntryBlock().begin());
+      Args.push_back(IRB.CreateConstGEP2_32(VarArgShadow, 0, 0));
+      for (unsigned n = 0; i != e; ++i, ++n) {
+        IRB.CreateStore(DFSF.getShadow(*i),
+                        IRB.CreateConstGEP2_32(VarArgShadow, 0, n));
+        Args.push_back(*i);
+      }
+    }
+
+    CallSite NewCS;
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      NewCS = IRB.CreateInvoke(Func, II->getNormalDest(), II->getUnwindDest(),
+                               Args);
+    } else {
+      NewCS = IRB.CreateCall(Func, Args);
+    }
+    NewCS.setCallingConv(CS.getCallingConv());
+    NewCS.setAttributes(CS.getAttributes().removeAttributes(
+        *DFSF.DFS.Ctx, AttributeSet::ReturnIndex,
+        AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType(),
+                                         AttributeSet::ReturnIndex)));
+
+    if (Next) {
+      ExtractValueInst *ExVal =
+          ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next);
+      DFSF.SkipInsts.insert(ExVal);
+      ExtractValueInst *ExShadow =
+          ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
+      DFSF.SkipInsts.insert(ExShadow);
+      DFSF.setShadow(ExVal, ExShadow);
+      DFSF.NonZeroChecks.insert(ExShadow);
+
+      CS.getInstruction()->replaceAllUsesWith(ExVal);
+    }
+
+    CS.getInstruction()->eraseFromParent();
+  }
+}
+
+void DFSanVisitor::visitPHINode(PHINode &PN) {
+  PHINode *ShadowPN =
+      PHINode::Create(DFSF.DFS.ShadowTy, PN.getNumIncomingValues(), "", &PN);
+
+  // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+  Value *UndefShadow = UndefValue::get(DFSF.DFS.ShadowTy);
+  for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+       ++i) {
+    ShadowPN->addIncoming(UndefShadow, *i);
+  }
+
+  DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+  DFSF.setShadow(&PN, ShadowPN);
+}
diff --git a/lib/Transforms/Instrumentation/DebugIR.cpp b/lib/Transforms/Instrumentation/DebugIR.cpp
new file mode 100644
index 0000000..f50a044
--- /dev/null
+++ b/lib/Transforms/Instrumentation/DebugIR.cpp
@@ -0,0 +1,618 @@
+//===--- DebugIR.cpp - Transform debug metadata to allow debugging IR -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A Module transform pass that emits a succinct version of the IR and replaces
+// the source file metadata to allow debuggers to step through the IR.
+//
+// FIXME: instead of replacing debug metadata, this pass should allow for
+// additional metadata to be used to point capable debuggers to the IR file
+// without destroying the mapping to the original source file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "debug-ir"
+
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/Assembly/AssemblyAnnotationWriter.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+#include "DebugIR.h"
+
+#include <string>
+
+#define STR_HELPER(x) #x
+#define STR(x) STR_HELPER(x)
+
+using namespace llvm;
+
+namespace {
+
+/// Builds a map of Value* to line numbers on which the Value appears in a
+/// textual representation of the IR by plugging into the AssemblyWriter by
+/// masquerading as an AssemblyAnnotationWriter.
+class ValueToLineMap : public AssemblyAnnotationWriter {
+  ValueMap<const Value *, unsigned int> Lines;
+  typedef ValueMap<const Value *, unsigned int>::const_iterator LineIter;
+
+  void addEntry(const Value *V, formatted_raw_ostream &Out) {
+    Out.flush();
+    Lines.insert(std::make_pair(V, Out.getLine() + 1));
+  }
+
+public:
+
+  /// Prints Module to a null buffer in order to build the map of Value pointers
+  /// to line numbers.
+  ValueToLineMap(const Module *M) {
+    raw_null_ostream ThrowAway;
+    M->print(ThrowAway, this);
+  }
+
+  // This function is called after an Instruction, GlobalValue, or GlobalAlias
+  // is printed.
+  void printInfoComment(const Value &V, formatted_raw_ostream &Out) {
+    addEntry(&V, Out);
+  }
+
+  void emitFunctionAnnot(const Function *F, formatted_raw_ostream &Out) {
+    addEntry(F, Out);
+  }
+
+  /// If V appears on a line in the textual IR representation, sets Line to the
+  /// line number and returns true, otherwise returns false.
+  bool getLine(const Value *V, unsigned int &Line) const {
+    LineIter i = Lines.find(V);
+    if (i != Lines.end()) {
+      Line = i->second;
+      return true;
+    }
+    return false;
+  }
+};
+
+/// Removes debug intrisncs like llvm.dbg.declare and llvm.dbg.value.
+class DebugIntrinsicsRemover : public InstVisitor<DebugIntrinsicsRemover> {
+  void remove(Instruction &I) { I.eraseFromParent(); }
+
+public:
+  static void process(Module &M) {
+    DebugIntrinsicsRemover Remover;
+    Remover.visit(&M);
+  }
+  void visitDbgDeclareInst(DbgDeclareInst &I) { remove(I); }
+  void visitDbgValueInst(DbgValueInst &I) { remove(I); }
+  void visitDbgInfoIntrinsic(DbgInfoIntrinsic &I) { remove(I); }
+};
+
+/// Removes debug metadata (!dbg) nodes from all instructions, and optionally
+/// metadata named "llvm.dbg.cu" if RemoveNamedInfo is true.
+class DebugMetadataRemover : public InstVisitor<DebugMetadataRemover> {
+  bool RemoveNamedInfo;
+
+public:
+  static void process(Module &M, bool RemoveNamedInfo = true) {
+    DebugMetadataRemover Remover(RemoveNamedInfo);
+    Remover.run(&M);
+  }
+
+  DebugMetadataRemover(bool RemoveNamedInfo)
+      : RemoveNamedInfo(RemoveNamedInfo) {}
+
+  void visitInstruction(Instruction &I) {
+    if (I.getMetadata(LLVMContext::MD_dbg))
+      I.setMetadata(LLVMContext::MD_dbg, 0);
+  }
+
+  void run(Module *M) {
+    // Remove debug metadata attached to instructions
+    visit(M);
+
+    if (RemoveNamedInfo) {
+      // Remove CU named metadata (and all children nodes)
+      NamedMDNode *Node = M->getNamedMetadata("llvm.dbg.cu");
+      if (Node)
+        M->eraseNamedMetadata(Node);
+    }
+  }
+};
+
+/// Updates debug metadata in a Module:
+///   - changes Filename/Directory to values provided on construction
+///   - adds/updates line number (DebugLoc) entries associated with each
+///     instruction to reflect the instruction's location in an LLVM IR file
+class DIUpdater : public InstVisitor<DIUpdater> {
+  /// Builder of debug information
+  DIBuilder Builder;
+
+  /// Helper for type attributes/sizes/etc
+  DataLayout Layout;
+
+  /// Map of Value* to line numbers
+  const ValueToLineMap LineTable;
+
+  /// Map of Value* (in original Module) to Value* (in optional cloned Module)
+  const ValueToValueMapTy *VMap;
+
+  /// Directory of debug metadata
+  DebugInfoFinder Finder;
+
+  /// Source filename and directory
+  StringRef Filename;
+  StringRef Directory;
+
+  // CU nodes needed when creating DI subprograms
+  MDNode *FileNode;
+  MDNode *LexicalBlockFileNode;
+  const MDNode *CUNode;
+
+  ValueMap<const Function *, MDNode *> SubprogramDescriptors;
+  DenseMap<const Type *, MDNode *> TypeDescriptors;
+
+public:
+  DIUpdater(Module &M, StringRef Filename = StringRef(),
+            StringRef Directory = StringRef(), const Module *DisplayM = 0,
+            const ValueToValueMapTy *VMap = 0)
+      : Builder(M), Layout(&M), LineTable(DisplayM ? DisplayM : &M), VMap(VMap),
+        Finder(), Filename(Filename), Directory(Directory), FileNode(0),
+        LexicalBlockFileNode(0), CUNode(0) {
+    Finder.processModule(M);
+    visit(&M);
+  }
+
+  ~DIUpdater() { Builder.finalize(); }
+
+  void visitModule(Module &M) {
+    if (Finder.compile_unit_count() > 1)
+      report_fatal_error("DebugIR pass supports only a signle compile unit per "
+                         "Module.");
+    createCompileUnit(
+        Finder.compile_unit_count() == 1 ? *Finder.compile_unit_begin() : 0);
+  }
+
+  void visitFunction(Function &F) {
+    if (F.isDeclaration() || findDISubprogram(&F))
+      return;
+
+    StringRef MangledName = F.getName();
+    DICompositeType Sig = createFunctionSignature(&F);
+
+    // find line of function declaration
+    unsigned Line = 0;
+    if (!findLine(&F, Line)) {
+      DEBUG(dbgs() << "WARNING: No line for Function " << F.getName().str()
+                   << "\n");
+      return;
+    }
+
+    Instruction *FirstInst = F.begin()->begin();
+    unsigned ScopeLine = 0;
+    if (!findLine(FirstInst, ScopeLine)) {
+      DEBUG(dbgs() << "WARNING: No line for 1st Instruction in Function "
+                   << F.getName().str() << "\n");
+      return;
+    }
+
+    bool Local = F.hasInternalLinkage();
+    bool IsDefinition = !F.isDeclaration();
+    bool IsOptimized = false;
+
+    int FuncFlags = llvm::DIDescriptor::FlagPrototyped;
+    assert(CUNode && FileNode);
+    DISubprogram Sub = Builder.createFunction(
+        DICompileUnit(CUNode), F.getName(), MangledName, DIFile(FileNode), Line,
+        Sig, Local, IsDefinition, ScopeLine, FuncFlags, IsOptimized, &F);
+    assert(Sub.isSubprogram());
+    DEBUG(dbgs() << "create subprogram mdnode " << *Sub << ": "
+                 << "\n");
+
+    SubprogramDescriptors.insert(std::make_pair(&F, Sub));
+  }
+
+  void visitInstruction(Instruction &I) {
+    DebugLoc Loc(I.getDebugLoc());
+
+    /// If a ValueToValueMap is provided, use it to get the real instruction as
+    /// the line table was generated on a clone of the module on which we are
+    /// operating.
+    Value *RealInst = 0;
+    if (VMap)
+      RealInst = VMap->lookup(&I);
+
+    if (!RealInst)
+      RealInst = &I;
+
+    unsigned Col = 0; // FIXME: support columns
+    unsigned Line;
+    if (!LineTable.getLine(RealInst, Line)) {
+      // Instruction has no line, it may have been removed (in the module that
+      // will be passed to the debugger) so there is nothing to do here.
+      DEBUG(dbgs() << "WARNING: no LineTable entry for instruction " << RealInst
+                   << "\n");
+      DEBUG(RealInst->dump());
+      return;
+    }
+
+    DebugLoc NewLoc;
+    if (!Loc.isUnknown())
+      // I had a previous debug location: re-use the DebugLoc
+      NewLoc = DebugLoc::get(Line, Col, Loc.getScope(RealInst->getContext()),
+                             Loc.getInlinedAt(RealInst->getContext()));
+    else if (MDNode *scope = findScope(&I))
+      NewLoc = DebugLoc::get(Line, Col, scope, 0);
+    else {
+      DEBUG(dbgs() << "WARNING: no valid scope for instruction " << &I
+                   << ". no DebugLoc will be present."
+                   << "\n");
+      return;
+    }
+
+    addDebugLocation(I, NewLoc);
+  }
+
+private:
+
+  void createCompileUnit(MDNode *CUToReplace) {
+    std::string Flags;
+    bool IsOptimized = false;
+    StringRef Producer;
+    unsigned RuntimeVersion(0);
+    StringRef SplitName;
+
+    if (CUToReplace) {
+      // save fields from existing CU to re-use in the new CU
+      DICompileUnit ExistingCU(CUToReplace);
+      Producer = ExistingCU.getProducer();
+      IsOptimized = ExistingCU.isOptimized();
+      Flags = ExistingCU.getFlags();
+      RuntimeVersion = ExistingCU.getRunTimeVersion();
+      SplitName = ExistingCU.getSplitDebugFilename();
+    } else {
+      Producer =
+          "LLVM Version " STR(LLVM_VERSION_MAJOR) "." STR(LLVM_VERSION_MINOR);
+    }
+
+    CUNode =
+        Builder.createCompileUnit(dwarf::DW_LANG_C99, Filename, Directory,
+                                  Producer, IsOptimized, Flags, RuntimeVersion);
+
+    if (CUToReplace)
+      CUToReplace->replaceAllUsesWith(const_cast<MDNode *>(CUNode));
+
+    DICompileUnit CU(CUNode);
+    FileNode = Builder.createFile(Filename, Directory);
+    LexicalBlockFileNode = Builder.createLexicalBlockFile(CU, DIFile(FileNode));
+  }
+
+  /// Returns the MDNode* that represents the DI scope to associate with I
+  MDNode *findScope(const Instruction *I) {
+    const Function *F = I->getParent()->getParent();
+    if (MDNode *ret = findDISubprogram(F))
+      return ret;
+
+    DEBUG(dbgs() << "WARNING: Using fallback lexical block file scope "
+                 << LexicalBlockFileNode << " as scope for instruction " << I
+                 << "\n");
+    return LexicalBlockFileNode;
+  }
+
+  /// Returns the MDNode* that is the descriptor for F
+  MDNode *findDISubprogram(const Function *F) {
+    typedef ValueMap<const Function *, MDNode *>::const_iterator FuncNodeIter;
+    FuncNodeIter i = SubprogramDescriptors.find(F);
+    if (i != SubprogramDescriptors.end())
+      return i->second;
+
+    DEBUG(dbgs() << "searching for DI scope node for Function " << F
+                 << " in a list of " << Finder.subprogram_count()
+                 << " subprogram nodes"
+                 << "\n");
+
+    for (DebugInfoFinder::iterator i = Finder.subprogram_begin(),
+                                   e = Finder.subprogram_end();
+         i != e; ++i) {
+      DISubprogram S(*i);
+      if (S.getFunction() == F) {
+        DEBUG(dbgs() << "Found DISubprogram " << *i << " for function "
+                     << S.getFunction() << "\n");
+        return *i;
+      }
+    }
+    DEBUG(dbgs() << "unable to find DISubprogram node for function "
+                 << F->getName().str() << "\n");
+    return 0;
+  }
+
+  /// Sets Line to the line number on which V appears and returns true. If a
+  /// line location for V is not found, returns false.
+  bool findLine(const Value *V, unsigned &Line) {
+    if (LineTable.getLine(V, Line))
+      return true;
+
+    if (VMap) {
+      Value *mapped = VMap->lookup(V);
+      if (mapped && LineTable.getLine(mapped, Line))
+        return true;
+    }
+    return false;
+  }
+
+  std::string getTypeName(Type *T) {
+    std::string TypeName;
+    raw_string_ostream TypeStream(TypeName);
+    T->print(TypeStream);
+    TypeStream.flush();
+    return TypeName;
+  }
+
+  /// Returns the MDNode that represents type T if it is already created, or 0
+  /// if it is not.
+  MDNode *getType(const Type *T) {
+    typedef DenseMap<const Type *, MDNode *>::const_iterator TypeNodeIter;
+    TypeNodeIter i = TypeDescriptors.find(T);
+    if (i != TypeDescriptors.end())
+      return i->second;
+    return 0;
+  }
+
+  /// Returns a DebugInfo type from an LLVM type T.
+  DIDerivedType getOrCreateType(Type *T) {
+    MDNode *N = getType(T);
+    if (N)
+      return DIDerivedType(N);
+    else if (T->isVoidTy())
+      return DIDerivedType(0);
+    else if (T->isStructTy()) {
+      N = Builder.createStructType(
+          DIScope(LexicalBlockFileNode), T->getStructName(), DIFile(FileNode),
+          0, Layout.getTypeSizeInBits(T), Layout.getABITypeAlignment(T), 0,
+          DIType(0), DIArray(0)); // filled in later
+
+      // N is added to the map (early) so that element search below can find it,
+      // so as to avoid infinite recursion for structs that contain pointers to
+      // their own type.
+      TypeDescriptors[T] = N;
+      DICompositeType StructDescriptor(N);
+
+      SmallVector<Value *, 4> Elements;
+      for (unsigned i = 0; i < T->getStructNumElements(); ++i)
+        Elements.push_back(getOrCreateType(T->getStructElementType(i)));
+
+      // set struct elements
+      StructDescriptor.setTypeArray(Builder.getOrCreateArray(Elements));
+    } else if (T->isPointerTy()) {
+      Type *PointeeTy = T->getPointerElementType();
+      if (!(N = getType(PointeeTy)))
+        N = Builder.createPointerType(
+            getOrCreateType(PointeeTy), Layout.getPointerTypeSizeInBits(T),
+            Layout.getPrefTypeAlignment(T), getTypeName(T));
+    } else if (T->isArrayTy()) {
+      SmallVector<Value *, 1> Subrange;
+      Subrange.push_back(
+          Builder.getOrCreateSubrange(0, T->getArrayNumElements() - 1));
+
+      N = Builder.createArrayType(Layout.getTypeSizeInBits(T),
+                                  Layout.getPrefTypeAlignment(T),
+                                  getOrCreateType(T->getArrayElementType()),
+                                  Builder.getOrCreateArray(Subrange));
+    } else {
+      int encoding = llvm::dwarf::DW_ATE_signed;
+      if (T->isIntegerTy())
+        encoding = llvm::dwarf::DW_ATE_unsigned;
+      else if (T->isFloatingPointTy())
+        encoding = llvm::dwarf::DW_ATE_float;
+
+      N = Builder.createBasicType(getTypeName(T), T->getPrimitiveSizeInBits(),
+                                  0, encoding);
+    }
+    TypeDescriptors[T] = N;
+    return DIDerivedType(N);
+  }
+
+  /// Returns a DebugInfo type that represents a function signature for Func.
+  DICompositeType createFunctionSignature(const Function *Func) {
+    SmallVector<Value *, 4> Params;
+    DIDerivedType ReturnType(getOrCreateType(Func->getReturnType()));
+    Params.push_back(ReturnType);
+
+    const Function::ArgumentListType &Args(Func->getArgumentList());
+    for (Function::ArgumentListType::const_iterator i = Args.begin(),
+                                                    e = Args.end();
+         i != e; ++i) {
+      Type *T(i->getType());
+      Params.push_back(getOrCreateType(T));
+    }
+
+    DIArray ParamArray = Builder.getOrCreateArray(Params);
+    return Builder.createSubroutineType(DIFile(FileNode), ParamArray);
+  }
+
+  /// Associates Instruction I with debug location Loc.
+  void addDebugLocation(Instruction &I, DebugLoc Loc) {
+    MDNode *MD = Loc.getAsMDNode(I.getContext());
+    I.setMetadata(LLVMContext::MD_dbg, MD);
+  }
+};
+
+/// Sets Filename/Directory from the Module identifier and returns true, or
+/// false if source information is not present.
+bool getSourceInfoFromModule(const Module &M, std::string &Directory,
+                             std::string &Filename) {
+  std::string PathStr(M.getModuleIdentifier());
+  if (PathStr.length() == 0 || PathStr == "<stdin>")
+    return false;
+
+  Filename = sys::path::filename(PathStr);
+  SmallVector<char, 16> Path(PathStr.begin(), PathStr.end());
+  sys::path::remove_filename(Path);
+  Directory = StringRef(Path.data(), Path.size());
+  return true;
+}
+
+// Sets Filename/Directory from debug information in M and returns true, or
+// false if no debug information available, or cannot be parsed.
+bool getSourceInfoFromDI(const Module &M, std::string &Directory,
+                         std::string &Filename) {
+  NamedMDNode *CUNode = M.getNamedMetadata("llvm.dbg.cu");
+  if (!CUNode || CUNode->getNumOperands() == 0)
+    return false;
+
+  DICompileUnit CU(CUNode->getOperand(0));
+  if (!CU.Verify())
+    return false;
+
+  Filename = CU.getFilename();
+  Directory = CU.getDirectory();
+  return true;
+}
+
+} // anonymous namespace
+
+namespace llvm {
+
+bool DebugIR::getSourceInfo(const Module &M) {
+  ParsedPath = getSourceInfoFromDI(M, Directory, Filename) ||
+               getSourceInfoFromModule(M, Directory, Filename);
+  return ParsedPath;
+}
+
+bool DebugIR::updateExtension(StringRef NewExtension) {
+  size_t dot = Filename.find_last_of(".");
+  if (dot == std::string::npos)
+    return false;
+
+  Filename.erase(dot);
+  Filename += NewExtension.str();
+  return true;
+}
+
+void DebugIR::generateFilename(OwningPtr<int> &fd) {
+  SmallVector<char, 16> PathVec;
+  fd.reset(new int);
+  sys::fs::createTemporaryFile("debug-ir", "ll", *fd, PathVec);
+  StringRef Path(PathVec.data(), PathVec.size());
+  Filename = sys::path::filename(Path);
+  sys::path::remove_filename(PathVec);
+  Directory = StringRef(PathVec.data(), PathVec.size());
+
+  GeneratedPath = true;
+}
+
+std::string DebugIR::getPath() {
+  SmallVector<char, 16> Path;
+  sys::path::append(Path, Directory, Filename);
+  Path.resize(Filename.size() + Directory.size() + 2);
+  Path[Filename.size() + Directory.size() + 1] = '\0';
+  return std::string(Path.data());
+}
+
+void DebugIR::writeDebugBitcode(const Module *M, int *fd) {
+  OwningPtr<raw_fd_ostream> Out;
+  std::string error;
+
+  if (!fd) {
+    std::string Path = getPath();
+    Out.reset(new raw_fd_ostream(Path.c_str(), error));
+    DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to file "
+                 << Path << "\n");
+  } else {
+    DEBUG(dbgs() << "WRITING debug bitcode from Module " << M << " to fd "
+                 << *fd << "\n");
+    Out.reset(new raw_fd_ostream(*fd, true));
+  }
+
+  M->print(*Out, 0);
+  Out->close();
+}
+
+void DebugIR::createDebugInfo(Module &M, OwningPtr<Module> &DisplayM) {
+  if (M.getFunctionList().size() == 0)
+    // no functions -- no debug info needed
+    return;
+
+  OwningPtr<ValueToValueMapTy> VMap;
+
+  if (WriteSourceToDisk && (HideDebugIntrinsics || HideDebugMetadata)) {
+    VMap.reset(new ValueToValueMapTy);
+    DisplayM.reset(CloneModule(&M, *VMap));
+
+    if (HideDebugIntrinsics)
+      DebugIntrinsicsRemover::process(*DisplayM);
+
+    if (HideDebugMetadata)
+      DebugMetadataRemover::process(*DisplayM);
+  }
+
+  DIUpdater R(M, Filename, Directory, DisplayM.get(), VMap.get());
+}
+
+bool DebugIR::isMissingPath() { return Filename.empty() || Directory.empty(); }
+
+bool DebugIR::runOnModule(Module &M) {
+  OwningPtr<int> fd;
+
+  if (isMissingPath() && !getSourceInfo(M)) {
+    if (!WriteSourceToDisk)
+      report_fatal_error("DebugIR unable to determine file name in input. "
+                         "Ensure Module contains an identifier, a valid "
+                         "DICompileUnit, or construct DebugIR with "
+                         "non-empty Filename/Directory parameters.");
+    else
+      generateFilename(fd);
+  }
+
+  if (!GeneratedPath && WriteSourceToDisk)
+    updateExtension(".debug-ll");
+
+  // Clear line numbers. Keep debug info (if any) if we were able to read the
+  // file name from the DICompileUnit descriptor.
+  DebugMetadataRemover::process(M, !ParsedPath);
+
+  OwningPtr<Module> DisplayM;
+  createDebugInfo(M, DisplayM);
+  if (WriteSourceToDisk) {
+    Module *OutputM = DisplayM.get() ? DisplayM.get() : &M;
+    writeDebugBitcode(OutputM, fd.get());
+  }
+
+  DEBUG(M.dump());
+  return true;
+}
+
+bool DebugIR::runOnModule(Module &M, std::string &Path) {
+  bool result = runOnModule(M);
+  Path = getPath();
+  return result;
+}
+
+} // llvm namespace
+
+char DebugIR::ID = 0;
+INITIALIZE_PASS(DebugIR, "debug-ir", "Enable debugging IR", false, false)
+
+ModulePass *llvm::createDebugIRPass(bool HideDebugIntrinsics,
+                                    bool HideDebugMetadata, StringRef Directory,
+                                    StringRef Filename) {
+  return new DebugIR(HideDebugIntrinsics, HideDebugMetadata, Directory,
+                     Filename);
+}
+
+ModulePass *llvm::createDebugIRPass() { return new DebugIR(); }
diff --git a/lib/Transforms/Instrumentation/DebugIR.h b/lib/Transforms/Instrumentation/DebugIR.h
new file mode 100644
index 0000000..13774cf
--- /dev/null
+++ b/lib/Transforms/Instrumentation/DebugIR.h
@@ -0,0 +1,99 @@
+//===- llvm/Transforms/Instrumentation/DebugIR.h - Interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface of the DebugIR pass. For most users,
+// including Instrumentation.h and calling createDebugIRPass() is sufficient and
+// there is no need to include this file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
+#define LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class DebugIR : public llvm::ModulePass {
+  /// If true, write a source file to disk.
+  bool WriteSourceToDisk;
+
+  /// Hide certain (non-essential) debug information (only relevant if
+  /// createSource is true.
+  bool HideDebugIntrinsics;
+  bool HideDebugMetadata;
+
+  /// The location of the source file.
+  std::string Directory;
+  std::string Filename;
+
+  /// True if a temporary file name was generated.
+  bool GeneratedPath;
+
+  /// True if the file name was read from the Module.
+  bool ParsedPath;
+
+public:
+  static char ID;
+
+  const char *getPassName() const { return "DebugIR"; }
+
+  /// Generate a file on disk to be displayed in a debugger. If Filename and
+  /// Directory are empty, a temporary path will be generated.
+  DebugIR(bool HideDebugIntrinsics, bool HideDebugMetadata,
+          llvm::StringRef Directory, llvm::StringRef Filename)
+      : ModulePass(ID), WriteSourceToDisk(true),
+        HideDebugIntrinsics(HideDebugIntrinsics),
+        HideDebugMetadata(HideDebugMetadata), Directory(Directory),
+        Filename(Filename), GeneratedPath(false), ParsedPath(false) {}
+
+  /// Modify input in-place; do not generate additional files, and do not hide
+  /// any debug intrinsics/metadata that might be present.
+  DebugIR()
+      : ModulePass(ID), WriteSourceToDisk(false), HideDebugIntrinsics(false),
+        HideDebugMetadata(false), GeneratedPath(false), ParsedPath(false) {}
+
+  /// Run pass on M and set Path to the source file path in the output module.
+  bool runOnModule(llvm::Module &M, std::string &Path);
+  bool runOnModule(llvm::Module &M);
+
+private:
+
+  /// Returns the concatenated Directory + Filename, without error checking
+  std::string getPath();
+
+  /// Attempts to read source information from debug information in M, and if
+  /// that fails, from M's identifier. Returns true on success, false otherwise.
+  bool getSourceInfo(const llvm::Module &M);
+
+  /// Replace the extension of Filename with NewExtension, and return true if
+  /// successful. Return false if extension could not be found or Filename is
+  /// empty.
+  bool updateExtension(llvm::StringRef NewExtension);
+
+  /// Generate a temporary filename and open an fd
+  void generateFilename(llvm::OwningPtr<int> &fd);
+
+  /// Creates DWARF CU/Subroutine metadata
+  void createDebugInfo(llvm::Module &M,
+                       llvm::OwningPtr<llvm::Module> &DisplayM);
+
+  /// Returns true if either Directory or Filename is missing, false otherwise.
+  bool isMissingPath();
+
+  /// Write M to disk, optionally passing in an fd to an open file which is
+  /// closed by this function after writing. If no fd is specified, a new file
+  /// is opened, written, and closed.
+  void writeDebugBitcode(const llvm::Module *M, int *fd = 0);
+};
+
+} // llvm namespace
+
+#endif // LLVM_TRANSFORMS_INSTRUMENTATION_DEBUGIR_H
diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
deleted file mode 100644
index a2459fb..0000000
--- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-//===- EdgeProfiling.cpp - Insert counters for edge profiling -------------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments the specified program with counters for edge profiling.
-// Edge profiling can give a reasonable approximation of the hot paths through a
-// program, and is used for a wide variety of program transformations.
-//
-// Note that this implementation is very naive.  We insert a counter for *every*
-// edge in the program, instead of using control flow information to prune the
-// number of counters inserted.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-edge-profiling"
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumEdgesInserted, "The # of edges inserted.");
-
-namespace {
-  class EdgeProfiler : public ModulePass {
-    bool runOnModule(Module &M);
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    EdgeProfiler() : ModulePass(ID) {
-      initializeEdgeProfilerPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual const char *getPassName() const {
-      return "Edge Profiler";
-    }
-  };
-}
-
-char EdgeProfiler::ID = 0;
-INITIALIZE_PASS(EdgeProfiler, "insert-edge-profiling",
-                "Insert instrumentation for edge profiling", false, false)
-
-ModulePass *llvm::createEdgeProfilerPass() { return new EdgeProfiler(); }
-
-bool EdgeProfiler::runOnModule(Module &M) {
-  Function *Main = M.getFunction("main");
-  if (Main == 0) {
-    errs() << "WARNING: cannot insert edge profiling into a module"
-           << " with no main function!\n";
-    return false;  // No main, no instrumentation!
-  }
-
-  std::set<BasicBlock*> BlocksToInstrument;
-  unsigned NumEdges = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    // Reserve space for (0,entry) edge.
-    ++NumEdges;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Keep track of which blocks need to be instrumented.  We don't want to
-      // instrument blocks that are added as the result of breaking critical
-      // edges!
-      BlocksToInstrument.insert(BB);
-      NumEdges += BB->getTerminator()->getNumSuccessors();
-    }
-  }
-
-  Type *ATy = ArrayType::get(Type::getInt32Ty(M.getContext()), NumEdges);
-  GlobalVariable *Counters =
-    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
-                       Constant::getNullValue(ATy), "EdgeProfCounters");
-  NumEdgesInserted = NumEdges;
-
-  // Instrument all of the edges...
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    // Create counter for (0,entry) edge.
-    IncrementCounterInBlock(&F->getEntryBlock(), i++, Counters);
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
-      if (BlocksToInstrument.count(BB)) {  // Don't instrument inserted blocks
-        // Okay, we have to add a counter of each outgoing edge.  If the
-        // outgoing edge is not critical don't split it, just insert the counter
-        // in the source or destination of the edge.
-        TerminatorInst *TI = BB->getTerminator();
-        for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-          // If the edge is critical, split it.
-          SplitCriticalEdge(TI, s, this);
-
-          // Okay, we are guaranteed that the edge is no longer critical.  If we
-          // only have a single successor, insert the counter in this block,
-          // otherwise insert it in the successor block.
-          if (TI->getNumSuccessors() == 1) {
-            // Insert counter at the start of the block
-            IncrementCounterInBlock(BB, i++, Counters, false);
-          } else {
-            // Insert counter at the start of the block
-            IncrementCounterInBlock(TI->getSuccessor(s), i++, Counters);
-          }
-        }
-      }
-  }
-
-  // Add the initialization call to main.
-  InsertProfilingInitCall(Main, "llvm_start_edge_profiling", Counters);
-  return true;
-}
-
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 2edd151..206bffb 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -17,7 +17,6 @@
 #define DEBUG_TYPE "insert-gcov-profiling"
 
 #include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
@@ -34,9 +33,10 @@
 #include "llvm/Support/DebugLoc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/InstIterator.h"
-#include "llvm/Support/PathV2.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
 #include <string>
 #include <utility>
 using namespace llvm;
@@ -102,6 +102,7 @@ namespace {
     Constant *getIncrementIndirectCounterFunc();
     Constant *getEmitFunctionFunc();
     Constant *getEmitArcsFunc();
+    Constant *getSummaryInfoFunc();
     Constant *getDeleteWriteoutFunctionListFunc();
     Constant *getDeleteFlushFunctionListFunc();
     Constant *getEndFileFunc();
@@ -153,10 +154,10 @@ static std::string getFunctionName(DISubprogram SP) {
 namespace {
   class GCOVRecord {
    protected:
-    static const char *LinesTag;
-    static const char *FunctionTag;
-    static const char *BlockTag;
-    static const char *EdgeTag;
+    static const char *const LinesTag;
+    static const char *const FunctionTag;
+    static const char *const BlockTag;
+    static const char *const EdgeTag;
 
     GCOVRecord() {}
 
@@ -170,7 +171,7 @@ namespace {
 
     // Returns the length measured in 4-byte blocks that will be used to
     // represent this string in a GCOV file
-    unsigned lengthOfGCOVString(StringRef s) {
+    static unsigned lengthOfGCOVString(StringRef s) {
       // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs
       // padding out to the next 4-byte word. The length is measured in 4-byte
       // words including padding, not bytes of actual string.
@@ -190,10 +191,10 @@ namespace {
 
     raw_ostream *os;
   };
-  const char *GCOVRecord::LinesTag = "\0\0\x45\x01";
-  const char *GCOVRecord::FunctionTag = "\0\0\0\1";
-  const char *GCOVRecord::BlockTag = "\0\0\x41\x01";
-  const char *GCOVRecord::EdgeTag = "\0\0\x43\x01";
+  const char *const GCOVRecord::LinesTag = "\0\0\x45\x01";
+  const char *const GCOVRecord::FunctionTag = "\0\0\0\1";
+  const char *const GCOVRecord::BlockTag = "\0\0\x41\x01";
+  const char *const GCOVRecord::EdgeTag = "\0\0\x43\x01";
 
   class GCOVFunction;
   class GCOVBlock;
@@ -207,7 +208,7 @@ namespace {
       Lines.push_back(Line);
     }
 
-    uint32_t length() {
+    uint32_t length() const {
       // Here 2 = 1 for string length + 1 for '0' id#.
       return lengthOfGCOVString(Filename) + 2 + Lines.size();
     }
@@ -229,6 +230,15 @@ namespace {
     SmallVector<uint32_t, 32> Lines;
   };
 
+
+  // Sorting function for deterministic behaviour in GCOVBlock::writeOut.
+  struct StringKeySort {
+    bool operator()(StringMapEntry<GCOVLines *> *LHS,
+                    StringMapEntry<GCOVLines *> *RHS) const {
+      return LHS->getKey() < RHS->getKey();
+    }
+  };
+
   // Represent a basic block in GCOV. Each block has a unique number in the
   // function, number of lines belonging to each block, and a set of edges to
   // other blocks.
@@ -248,17 +258,23 @@ namespace {
 
     void writeOut() {
       uint32_t Len = 3;
+      SmallVector<StringMapEntry<GCOVLines *> *, 32> SortedLinesByFile;
       for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
                E = LinesByFile.end(); I != E; ++I) {
         Len += I->second->length();
+        SortedLinesByFile.push_back(&*I);
       }
 
       writeBytes(LinesTag, 4);
       write(Len);
       write(Number);
-      for (StringMap<GCOVLines *>::iterator I = LinesByFile.begin(),
-               E = LinesByFile.end(); I != E; ++I) 
-        I->second->writeOut();
+
+      StringKeySort Sorter;
+      std::sort(SortedLinesByFile.begin(), SortedLinesByFile.end(), Sorter);
+      for (SmallVectorImpl<StringMapEntry<GCOVLines *> *>::iterator
+               I = SortedLinesByFile.begin(), E = SortedLinesByFile.end();
+           I != E; ++I) 
+        (*I)->getValue()->writeOut();
       write(0);
       write(0);
     }
@@ -335,9 +351,10 @@ namespace {
       DEBUG(dbgs() << Blocks.size() << " blocks.\n");
 
       // Emit edges between blocks.
-      for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(),
-               E = Blocks.end(); I != E; ++I) {
-        GCOVBlock &Block = *I->second;
+      if (Blocks.empty()) return;
+      Function *F = Blocks.begin()->first->getParent();
+      for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+        GCOVBlock &Block = *Blocks[I];
         if (Block.OutEdges.empty()) continue;
 
         writeBytes(EdgeTag, 4);
@@ -352,9 +369,8 @@ namespace {
       }
 
       // Emit lines for each block.
-      for (DenseMap<BasicBlock *, GCOVBlock *>::iterator I = Blocks.begin(),
-               E = Blocks.end(); I != E; ++I) {
-        I->second->writeOut();
+      for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
+        Blocks[I]->writeOut();
       }
     }
 
@@ -410,7 +426,7 @@ void GCOVProfiler::emitProfileNotes() {
     DICompileUnit CU(CU_Nodes->getOperand(i));
     std::string ErrorInfo;
     raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo,
-                       raw_fd_ostream::F_Binary);
+                       sys::fs::F_Binary);
     out.write("oncg", 4);
     out.write(ReversedVersion, 4);
     out.write("MVLL", 4);
@@ -418,7 +434,10 @@ void GCOVProfiler::emitProfileNotes() {
     DIArray SPs = CU.getSubprograms();
     for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
       DISubprogram SP(SPs.getElement(i));
-      if (!SP.Verify()) continue;
+      assert((!SP || SP.isSubprogram()) &&
+        "A MDNode in subprograms of a CU should be null or a DISubprogram.");
+      if (!SP)
+        continue;
 
       Function *F = SP.getFunction();
       if (!F) continue;
@@ -467,7 +486,10 @@ bool GCOVProfiler::emitProfileArcs() {
     SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
     for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) {
       DISubprogram SP(SPs.getElement(i));
-      if (!SP.Verify()) continue;
+      assert((!SP || SP.isSubprogram()) &&
+        "A MDNode in subprograms of a CU should be null or a DISubprogram.");
+      if (!SP)
+        continue;
       Function *F = SP.getFunction();
       if (!F) continue;
       if (!Result) Result = true;
@@ -497,15 +519,15 @@ bool GCOVProfiler::emitProfileArcs() {
         TerminatorInst *TI = BB->getTerminator();
         int Successors = isa<ReturnInst>(TI) ? 1 : TI->getNumSuccessors();
         if (Successors) {
-          IRBuilder<> Builder(TI);
-          
           if (Successors == 1) {
+            IRBuilder<> Builder(BB->getFirstInsertionPt());
             Value *Counter = Builder.CreateConstInBoundsGEP2_64(Counters, 0,
                                                                 Edge);
             Value *Count = Builder.CreateLoad(Counter);
             Count = Builder.CreateAdd(Count, Builder.getInt64(1));
             Builder.CreateStore(Count, Counter);
           } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+            IRBuilder<> Builder(BI);
             Value *Sel = Builder.CreateSelect(BI->getCondition(),
                                               Builder.getInt64(Edge),
                                               Builder.getInt64(Edge + 1));
@@ -521,6 +543,7 @@ bool GCOVProfiler::emitProfileArcs() {
             for (int i = 0; i != Successors; ++i)
               ComplexEdgeSuccs.insert(TI->getSuccessor(i));
           }
+
           Edge += Successors;
         }
       }
@@ -532,14 +555,13 @@ bool GCOVProfiler::emitProfileArcs() {
         GlobalVariable *EdgeState = getEdgeStateValue();
         
         for (int i = 0, e = ComplexEdgePreds.size(); i != e; ++i) {
-          IRBuilder<> Builder(ComplexEdgePreds[i+1]->getTerminator());
+          IRBuilder<> Builder(ComplexEdgePreds[i + 1]->getFirstInsertionPt());
           Builder.CreateStore(Builder.getInt32(i), EdgeState);
         }
+
         for (int i = 0, e = ComplexEdgeSuccs.size(); i != e; ++i) {
-          // call runtime to perform increment
-          BasicBlock::iterator InsertPt =
-            ComplexEdgeSuccs[i+1]->getFirstInsertionPt();
-          IRBuilder<> Builder(InsertPt);
+          // Call runtime to perform increment.
+          IRBuilder<> Builder(ComplexEdgeSuccs[i+1]->getFirstInsertionPt());
           Value *CounterPtrArray =
             Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0,
                                                i * ComplexEdgePreds.size());
@@ -577,7 +599,7 @@ bool GCOVProfiler::emitProfileArcs() {
     };
     FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
 
-    // Inialize the environment and register the local writeout and flush
+    // Initialize the environment and register the local writeout and flush
     // functions.
     Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
     Builder.CreateCall2(GCOVInit, WriteoutF, FlushF);
@@ -679,6 +701,11 @@ Constant *GCOVProfiler::getEmitArcsFunc() {
   return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy);
 }
 
+Constant *GCOVProfiler::getSummaryInfoFunc() {
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+  return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
 Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() {
   FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
   return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy);
@@ -725,6 +752,7 @@ Function *GCOVProfiler::insertCounterWriteout(
   Constant *StartFile = getStartFileFunc();
   Constant *EmitFunction = getEmitFunctionFunc();
   Constant *EmitArcs = getEmitArcsFunc();
+  Constant *SummaryInfo = getSummaryInfoFunc();
   Constant *EndFile = getEndFileFunc();
 
   NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
@@ -751,6 +779,7 @@ Function *GCOVProfiler::insertCounterWriteout(
                             Builder.getInt32(Arcs),
                             Builder.CreateConstGEP2_64(GV, 0, 0));
       }
+      Builder.CreateCall(SummaryInfo);
       Builder.CreateCall(EndFile);
     }
   }
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 9f35396..b1bea38 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -24,12 +24,10 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerPass(Registry);
   initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingPass(Registry);
-  initializeEdgeProfilerPass(Registry);
   initializeGCOVProfilerPass(Registry);
-  initializeOptimalEdgeProfilerPass(Registry);
-  initializePathProfilerPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
+  initializeDataFlowSanitizerPass(Registry);
 }
 
 /// LLVMInitializeInstrumentation - C binding for
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4e75904..d547adc 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -66,6 +66,31 @@
 /// avoids storing origin to memory when a fully initialized value is stored.
 /// This way it avoids needless overwritting origin of the 4-byte region on
 /// a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+///                            Atomic handling.
+///
+/// Ideally, every atomic store of application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, atomic store
+/// of two disjoint locations can not be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. Current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store by storing a
+/// clean shadow.
+
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "msan"
@@ -74,6 +99,7 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/ValueMap.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
@@ -90,9 +116,9 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/BlackList.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/SpecialCaseList.h"
 
 using namespace llvm;
 
@@ -156,6 +182,18 @@ static cl::opt<std::string>  ClBlacklistFile("msan-blacklist",
        cl::desc("File containing the list of functions where MemorySanitizer "
                 "should not report bugs"), cl::Hidden);
 
+// Experimental. Wraps all indirect calls in the instrumented code with
+// a call to the given function. This is needed to assist the dynamic
+// helper tool (MSanDR) to regain control on transition between instrumented and
+// non-instrumented code.
+static cl::opt<std::string> ClWrapIndirectCalls("msan-wrap-indirect-calls",
+       cl::desc("Wrap indirect calls with a given function"),
+       cl::Hidden);
+
+static cl::opt<bool> ClWrapIndirectCallsFast("msan-wrap-indirect-calls-fast",
+       cl::desc("Do not wrap indirect calls with target in the same module"),
+       cl::Hidden, cl::init(true));
+
 namespace {
 
 /// \brief An instrumentation pass implementing detection of uninitialized
@@ -167,12 +205,12 @@ class MemorySanitizer : public FunctionPass {
  public:
   MemorySanitizer(bool TrackOrigins = false,
                   StringRef BlacklistFile = StringRef())
-    : FunctionPass(ID),
-      TrackOrigins(TrackOrigins || ClTrackOrigins),
-      TD(0),
-      WarningFn(0),
-      BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
-                                          : BlacklistFile) { }
+      : FunctionPass(ID),
+        TrackOrigins(TrackOrigins || ClTrackOrigins),
+        TD(0),
+        WarningFn(0),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile : BlacklistFile),
+        WrapIndirectCalls(!ClWrapIndirectCalls.empty()) {}
   const char *getPassName() const { return "MemorySanitizer"; }
   bool runOnFunction(Function &F);
   bool doInitialization(Module &M);
@@ -206,13 +244,16 @@ class MemorySanitizer : public FunctionPass {
   /// function.
   GlobalVariable *OriginTLS;
 
+  GlobalVariable *MsandrModuleStart;
+  GlobalVariable *MsandrModuleEnd;
+
   /// \brief The run-time callback to print a warning.
   Value *WarningFn;
   /// \brief Run-time helper that copies origin info for a memory range.
   Value *MsanCopyOriginFn;
   /// \brief Run-time helper that generates a new origin value for a stack
   /// allocation.
-  Value *MsanSetAllocaOriginFn;
+  Value *MsanSetAllocaOrigin4Fn;
   /// \brief Run-time helper that poisons stack on function entry.
   Value *MsanPoisonStackFn;
   /// \brief MSan runtime replacements for memmove, memcpy and memset.
@@ -228,13 +269,19 @@ class MemorySanitizer : public FunctionPass {
   MDNode *ColdCallWeights;
   /// \brief Branch weights for origin store.
   MDNode *OriginStoreWeights;
-  /// \bried Path to blacklist file.
+  /// \brief Path to blacklist file.
   SmallString<64> BlacklistFile;
   /// \brief The blacklist.
-  OwningPtr<BlackList> BL;
+  OwningPtr<SpecialCaseList> BL;
   /// \brief An empty volatile inline asm that prevents callback merge.
   InlineAsm *EmptyAsm;
 
+  bool WrapIndirectCalls;
+  /// \brief Run-time wrapper for indirect calls.
+  Value *IndirectCallWrapperFn;
+  // Argument and return type of IndirectCallWrapperFn: void (*f)(void).
+  Type *AnyFunctionPtrTy;
+
   friend struct MemorySanitizerVisitor;
   friend struct VarArgAMD64Helper;
 };
@@ -280,9 +327,9 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
   MsanCopyOriginFn = M.getOrInsertFunction(
     "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(),
     IRB.getInt8PtrTy(), IntptrTy, NULL);
-  MsanSetAllocaOriginFn = M.getOrInsertFunction(
-    "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
-    IRB.getInt8PtrTy(), NULL);
+  MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
+    "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+    IRB.getInt8PtrTy(), IntptrTy, NULL);
   MsanPoisonStackFn = M.getOrInsertFunction(
     "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL);
   MemmoveFn = M.getOrInsertFunction(
@@ -299,35 +346,53 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
   RetvalTLS = new GlobalVariable(
     M, ArrayType::get(IRB.getInt64Ty(), 8), false,
     GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0,
-    GlobalVariable::GeneralDynamicTLSModel);
+    GlobalVariable::InitialExecTLSModel);
   RetvalOriginTLS = new GlobalVariable(
     M, OriginTy, false, GlobalVariable::ExternalLinkage, 0,
-    "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+    "__msan_retval_origin_tls", 0, GlobalVariable::InitialExecTLSModel);
 
   ParamTLS = new GlobalVariable(
     M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
     GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0,
-    GlobalVariable::GeneralDynamicTLSModel);
+    GlobalVariable::InitialExecTLSModel);
   ParamOriginTLS = new GlobalVariable(
     M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage,
-    0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+    0, "__msan_param_origin_tls", 0, GlobalVariable::InitialExecTLSModel);
 
   VAArgTLS = new GlobalVariable(
     M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
     GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0,
-    GlobalVariable::GeneralDynamicTLSModel);
+    GlobalVariable::InitialExecTLSModel);
   VAArgOverflowSizeTLS = new GlobalVariable(
     M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0,
     "__msan_va_arg_overflow_size_tls", 0,
-    GlobalVariable::GeneralDynamicTLSModel);
+    GlobalVariable::InitialExecTLSModel);
   OriginTLS = new GlobalVariable(
     M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0,
-    "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+    "__msan_origin_tls", 0, GlobalVariable::InitialExecTLSModel);
 
   // We insert an empty inline asm after __msan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
+
+  if (WrapIndirectCalls) {
+    AnyFunctionPtrTy =
+        PointerType::getUnqual(FunctionType::get(IRB.getVoidTy(), false));
+    IndirectCallWrapperFn = M.getOrInsertFunction(
+        ClWrapIndirectCalls, AnyFunctionPtrTy, AnyFunctionPtrTy, NULL);
+  }
+
+  if (ClWrapIndirectCallsFast) {
+    MsandrModuleStart = new GlobalVariable(
+        M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
+        0, "__executable_start");
+    MsandrModuleStart->setVisibility(GlobalVariable::HiddenVisibility);
+    MsandrModuleEnd = new GlobalVariable(
+        M, IRB.getInt32Ty(), false, GlobalValue::ExternalLinkage,
+        0, "_end");
+    MsandrModuleEnd->setVisibility(GlobalVariable::HiddenVisibility);
+  }
 }
 
 /// \brief Module-level initialization.
@@ -337,7 +402,7 @@ bool MemorySanitizer::doInitialization(Module &M) {
   TD = getAnalysisIfAvailable<DataLayout>();
   if (!TD)
     return false;
-  BL.reset(new BlackList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
   C = &(M.getContext());
   unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0);
   switch (PtrSize) {
@@ -365,11 +430,13 @@ bool MemorySanitizer::doInitialization(Module &M) {
   appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction(
                       "__msan_init", IRB.getVoidTy(), NULL)), 0);
 
-  new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
-                     IRB.getInt32(TrackOrigins), "__msan_track_origins");
+  if (TrackOrigins)
+    new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+                       IRB.getInt32(TrackOrigins), "__msan_track_origins");
 
-  new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
-                     IRB.getInt32(ClKeepGoing), "__msan_keep_going");
+  if (ClKeepGoing)
+    new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+                       IRB.getInt32(ClKeepGoing), "__msan_keep_going");
 
   return true;
 }
@@ -420,27 +487,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   MemorySanitizer &MS;
   SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
   ValueMap<Value*, Value*> ShadowMap, OriginMap;
+  OwningPtr<VarArgHelper> VAHelper;
+
+  // The following flags disable parts of MSan instrumentation based on
+  // blacklist contents and command-line options.
   bool InsertChecks;
   bool LoadShadow;
-  OwningPtr<VarArgHelper> VAHelper;
+  bool PoisonStack;
+  bool PoisonUndef;
+  bool CheckReturnValue;
 
   struct ShadowOriginAndInsertPoint {
-    Instruction *Shadow;
-    Instruction *Origin;
+    Value *Shadow;
+    Value *Origin;
     Instruction *OrigIns;
-    ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I)
+    ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
       : Shadow(S), Origin(O), OrigIns(I) { }
     ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { }
   };
   SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
   SmallVector<Instruction*, 16> StoreList;
+  SmallVector<CallSite, 16> IndirectCallList;
 
   MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
       : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
-    LoadShadow = InsertChecks =
-        !MS.BL->isIn(F) &&
-        F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::SanitizeMemory);
+    bool SanitizeFunction = !MS.BL->isIn(F) && F.getAttributes().hasAttribute(
+                                                   AttributeSet::FunctionIndex,
+                                                   Attribute::SanitizeMemory);
+    InsertChecks = SanitizeFunction;
+    LoadShadow = SanitizeFunction;
+    PoisonStack = SanitizeFunction && ClPoisonStack;
+    PoisonUndef = SanitizeFunction && ClPoisonUndef;
+    // FIXME: Consider using SpecialCaseList to specify a list of functions that
+    // must always return fully initialized values. For now, we hardcode "main".
+    CheckReturnValue = SanitizeFunction && (F.getName() == "main");
 
     DEBUG(if (!InsertChecks)
           dbgs() << "MemorySanitizer is not inserting checks into '"
@@ -454,7 +534,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       IRBuilder<> IRB(&I);
       Value *Val = I.getValueOperand();
       Value *Addr = I.getPointerOperand();
-      Value *Shadow = getShadow(Val);
+      Value *Shadow = I.isAtomic() ? getCleanShadow(Val) : getShadow(Val);
       Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
 
       StoreInst *NewSI =
@@ -463,7 +543,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       (void)NewSI;
 
       if (ClCheckAccessAddress)
-        insertCheck(Addr, &I);
+        insertShadowCheck(Addr, &I);
+
+      if (I.isAtomic())
+        I.setOrdering(addReleaseOrdering(I.getOrdering()));
 
       if (MS.TrackOrigins) {
         unsigned Alignment = std::max(kMinOriginAlignment, I.getAlignment());
@@ -473,11 +556,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         } else {
           Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
 
-          Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow);
           // TODO(eugenis): handle non-zero constant shadow by inserting an
           // unconditional check (can not simply fail compilation as this could
           // be in the dead code).
-          if (Cst)
+          if (isa<Constant>(ConvertedShadow))
             continue;
 
           Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
@@ -495,12 +577,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   void materializeChecks() {
     for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) {
-      Instruction *Shadow = InstrumentationList[i].Shadow;
+      Value *Shadow = InstrumentationList[i].Shadow;
       Instruction *OrigIns = InstrumentationList[i].OrigIns;
       IRBuilder<> IRB(OrigIns);
       DEBUG(dbgs() << "  SHAD0 : " << *Shadow << "\n");
       Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
       DEBUG(dbgs() << "  SHAD1 : " << *ConvertedShadow << "\n");
+      // See the comment in materializeStores().
+      if (isa<Constant>(ConvertedShadow))
+        continue;
       Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
                                     getCleanShadow(ConvertedShadow), "_mscmp");
       Instruction *CheckTerm =
@@ -510,7 +595,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
       IRB.SetInsertPoint(CheckTerm);
       if (MS.TrackOrigins) {
-        Instruction *Origin = InstrumentationList[i].Origin;
+        Value *Origin = InstrumentationList[i].Origin;
         IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0),
                         MS.OriginTLS);
       }
@@ -522,6 +607,48 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     DEBUG(dbgs() << "DONE:\n" << F);
   }
 
+  void materializeIndirectCalls() {
+    for (size_t i = 0, n = IndirectCallList.size(); i < n; i++) {
+      CallSite CS = IndirectCallList[i];
+      Instruction *I = CS.getInstruction();
+      BasicBlock *B = I->getParent();
+      IRBuilder<> IRB(I);
+      Value *Fn0 = CS.getCalledValue();
+      Value *Fn = IRB.CreateBitCast(Fn0, MS.AnyFunctionPtrTy);
+
+      if (ClWrapIndirectCallsFast) {
+        // Check that call target is inside this module limits.
+        Value *Start =
+            IRB.CreateBitCast(MS.MsandrModuleStart, MS.AnyFunctionPtrTy);
+        Value *End = IRB.CreateBitCast(MS.MsandrModuleEnd, MS.AnyFunctionPtrTy);
+
+        Value *NotInThisModule = IRB.CreateOr(IRB.CreateICmpULT(Fn, Start),
+                                              IRB.CreateICmpUGE(Fn, End));
+
+        PHINode *NewFnPhi =
+            IRB.CreatePHI(Fn0->getType(), 2, "msandr.indirect_target");
+
+        Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+            cast<Instruction>(NotInThisModule),
+            /* Unreachable */ false, MS.ColdCallWeights);
+
+        IRB.SetInsertPoint(CheckTerm);
+        // Slow path: call wrapper function to possibly transform the call
+        // target.
+        Value *NewFn = IRB.CreateBitCast(
+            IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
+
+        NewFnPhi->addIncoming(Fn0, B);
+        NewFnPhi->addIncoming(NewFn, dyn_cast<Instruction>(NewFn)->getParent());
+        CS.setCalledFunction(NewFnPhi);
+      } else {
+        Value *NewFn = IRB.CreateBitCast(
+            IRB.CreateCall(MS.IndirectCallWrapperFn, Fn), Fn0->getType());
+        CS.setCalledFunction(NewFn);
+      }
+    }
+  }
+
   /// \brief Add MemorySanitizer instrumentation to a function.
   bool runOnFunction() {
     MS.initializeCallbacks(*F.getParent());
@@ -564,6 +691,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     // Insert shadow value checks.
     materializeChecks();
 
+    // Wrap indirect calls.
+    materializeIndirectCalls();
+
     return true;
   }
 
@@ -741,7 +871,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       return Shadow;
     }
     if (UndefValue *U = dyn_cast<UndefValue>(V)) {
-      Value *AllOnes = ClPoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+      Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
       DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
       (void)U;
       return AllOnes;
@@ -768,14 +898,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
           if (AI->hasByValAttr()) {
             // ByVal pointer itself has clean shadow. We copy the actual
             // argument shadow to the underlying memory.
+            // Figure out maximal valid memcpy alignment.
+            unsigned ArgAlign = AI->getParamAlignment();
+            if (ArgAlign == 0) {
+              Type *EltType = A->getType()->getPointerElementType();
+              ArgAlign = MS.TD->getABITypeAlignment(EltType);
+            }
+            unsigned CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
             Value *Cpy = EntryIRB.CreateMemCpy(
-              getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB),
-              Base, Size, AI->getParamAlignment());
+                getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB), Base, Size,
+                CopyAlign);
             DEBUG(dbgs() << "  ByValCpy: " << *Cpy << "\n");
             (void)Cpy;
             *ShadowPtr = getCleanShadow(V);
           } else {
-            *ShadowPtr = EntryIRB.CreateLoad(Base);
+            *ShadowPtr = EntryIRB.CreateAlignedLoad(Base, kShadowTLSAlignment);
           }
           DEBUG(dbgs() << "  ARG:    "  << *AI << " ==> " <<
                 **ShadowPtr << "\n");
@@ -784,7 +921,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
             setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
           }
         }
-        ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+        ArgOffset += DataLayout::RoundUpAlignment(Size, kShadowTLSAlignment);
       }
       assert(*ShadowPtr && "Could not find shadow for an argument");
       return *ShadowPtr;
@@ -820,20 +957,63 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// \brief Remember the place where a shadow check should be inserted.
   ///
   /// This location will be later instrumented with a check that will print a
-  /// UMR warning in runtime if the value is not fully defined.
-  void insertCheck(Value *Val, Instruction *OrigIns) {
-    assert(Val);
+  /// UMR warning in runtime if the shadow value is not 0.
+  void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
+    assert(Shadow);
     if (!InsertChecks) return;
-    Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
-    if (!Shadow) return;
 #ifndef NDEBUG
     Type *ShadowTy = Shadow->getType();
     assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
            "Can only insert checks for integer and vector shadow types");
 #endif
-    Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
     InstrumentationList.push_back(
-      ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+        ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+  }
+
+  /// \brief Remember the place where a shadow check should be inserted.
+  ///
+  /// This location will be later instrumented with a check that will print a
+  /// UMR warning in runtime if the value is not fully defined.
+  void insertShadowCheck(Value *Val, Instruction *OrigIns) {
+    assert(Val);
+    Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+    if (!Shadow) return;
+    Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+    insertShadowCheck(Shadow, Origin, OrigIns);
+  }
+
+  AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
+    switch (a) {
+      case NotAtomic:
+        return NotAtomic;
+      case Unordered:
+      case Monotonic:
+      case Release:
+        return Release;
+      case Acquire:
+      case AcquireRelease:
+        return AcquireRelease;
+      case SequentiallyConsistent:
+        return SequentiallyConsistent;
+    }
+    llvm_unreachable("Unknown ordering");
+  }
+
+  AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
+    switch (a) {
+      case NotAtomic:
+        return NotAtomic;
+      case Unordered:
+      case Monotonic:
+      case Acquire:
+        return Acquire;
+      case Release:
+      case AcquireRelease:
+        return AcquireRelease;
+      case SequentiallyConsistent:
+        return SequentiallyConsistent;
+    }
+    llvm_unreachable("Unknown ordering");
   }
 
   // ------------------- Visitors.
@@ -844,7 +1024,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   /// Optionally, checks that the load address is fully defined.
   void visitLoadInst(LoadInst &I) {
     assert(I.getType()->isSized() && "Load type must have size");
-    IRBuilder<> IRB(&I);
+    IRBuilder<> IRB(I.getNextNode());
     Type *ShadowTy = getShadowTy(&I);
     Value *Addr = I.getPointerOperand();
     if (LoadShadow) {
@@ -856,7 +1036,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     }
 
     if (ClCheckAccessAddress)
-      insertCheck(I.getPointerOperand(), &I);
+      insertShadowCheck(I.getPointerOperand(), &I);
+
+    if (I.isAtomic())
+      I.setOrdering(addAcquireOrdering(I.getOrdering()));
 
     if (MS.TrackOrigins) {
       if (LoadShadow) {
@@ -877,9 +1060,40 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     StoreList.push_back(&I);
   }
 
+  void handleCASOrRMW(Instruction &I) {
+    assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
+
+    IRBuilder<> IRB(&I);
+    Value *Addr = I.getOperand(0);
+    Value *ShadowPtr = getShadowPtr(Addr, I.getType(), IRB);
+
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+
+    // Only test the conditional argument of cmpxchg instruction.
+    // The other argument can potentially be uninitialized, but we can not
+    // detect this situation reliably without possible false positives.
+    if (isa<AtomicCmpXchgInst>(I))
+      insertShadowCheck(I.getOperand(1), &I);
+
+    IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
+
+    setShadow(&I, getCleanShadow(&I));
+  }
+
+  void visitAtomicRMWInst(AtomicRMWInst &I) {
+    handleCASOrRMW(I);
+    I.setOrdering(addReleaseOrdering(I.getOrdering()));
+  }
+
+  void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+    handleCASOrRMW(I);
+    I.setOrdering(addReleaseOrdering(I.getOrdering()));
+  }
+
   // Vector manipulation.
   void visitExtractElementInst(ExtractElementInst &I) {
-    insertCheck(I.getOperand(1), &I);
+    insertShadowCheck(I.getOperand(1), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
               "_msprop"));
@@ -887,7 +1101,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   void visitInsertElementInst(InsertElementInst &I) {
-    insertCheck(I.getOperand(2), &I);
+    insertShadowCheck(I.getOperand(2), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
               I.getOperand(2), "_msprop"));
@@ -895,7 +1109,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   }
 
   void visitShuffleVectorInst(ShuffleVectorInst &I) {
-    insertCheck(I.getOperand(2), &I);
+    insertShadowCheck(I.getOperand(2), &I);
     IRBuilder<> IRB(&I);
     setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
               I.getOperand(2), "_msprop"));
@@ -1094,18 +1308,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   /// \brief Cast between two shadow types, extending or truncating as
   /// necessary.
-  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) {
+  Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+                          bool Signed = false) {
     Type *srcTy = V->getType();
     if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
         dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
-      return IRB.CreateIntCast(V, dstTy, false);
+      return IRB.CreateIntCast(V, dstTy, Signed);
     size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
     size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
     Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
     Value *V2 =
-      IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false);
+      IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
     return IRB.CreateBitCast(V2, dstTy);
     // TODO: handle struct types.
   }
@@ -1130,7 +1345,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
   void handleDiv(Instruction &I) {
     IRBuilder<> IRB(&I);
     // Strict on the second argument.
-    insertCheck(I.getOperand(1), &I);
+    insertShadowCheck(I.getOperand(1), &I);
     setShadow(&I, getShadow(&I, 0));
     setOrigin(&I, getOrigin(&I, 0));
   }
@@ -1413,7 +1628,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
 
     if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+      insertShadowCheck(Addr, &I);
 
     // FIXME: use ClStoreCleanOrigin
     // FIXME: factor out common code from materializeStores
@@ -1440,9 +1655,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       setShadow(&I, getCleanShadow(&I));
     }
 
-
     if (ClCheckAccessAddress)
-      insertCheck(Addr, &I);
+      insertShadowCheck(Addr, &I);
 
     if (MS.TrackOrigins) {
       if (LoadShadow)
@@ -1539,11 +1753,119 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, getOrigin(Op));
   }
 
+  // \brief Instrument vector convert instrinsic.
+  //
+  // This function instruments intrinsics like cvtsi2ss:
+  // %Out = int_xxx_cvtyyy(%ConvertOp)
+  // or
+  // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+  // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+  // number \p Out elements, and (if has 2 arguments) copies the rest of the
+  // elements from \p CopyOp.
+  // In most cases conversion involves floating-point value which may trigger a
+  // hardware exception when not fully initialized. For this reason we require
+  // \p ConvertOp[0:NumUsedElements] to be fully initialized and trap otherwise.
+  // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+  // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+  // return a fully initialized value.
+  void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements) {
+    IRBuilder<> IRB(&I);
+    Value *CopyOp, *ConvertOp;
+
+    switch (I.getNumArgOperands()) {
+    case 2:
+      CopyOp = I.getArgOperand(0);
+      ConvertOp = I.getArgOperand(1);
+      break;
+    case 1:
+      ConvertOp = I.getArgOperand(0);
+      CopyOp = NULL;
+      break;
+    default:
+      llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+    }
+
+    // The first *NumUsedElements* elements of ConvertOp are converted to the
+    // same number of output elements. The rest of the output is copied from
+    // CopyOp, or (if not available) filled with zeroes.
+    // Combine shadow for elements of ConvertOp that are used in this operation,
+    // and insert a check.
+    // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+    // int->any conversion.
+    Value *ConvertShadow = getShadow(ConvertOp);
+    Value *AggShadow = 0;
+    if (ConvertOp->getType()->isVectorTy()) {
+      AggShadow = IRB.CreateExtractElement(
+          ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+      for (int i = 1; i < NumUsedElements; ++i) {
+        Value *MoreShadow = IRB.CreateExtractElement(
+            ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+        AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+      }
+    } else {
+      AggShadow = ConvertShadow;
+    }
+    assert(AggShadow->getType()->isIntegerTy());
+    insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+    // Build result shadow by zero-filling parts of CopyOp shadow that come from
+    // ConvertOp.
+    if (CopyOp) {
+      assert(CopyOp->getType() == I.getType());
+      assert(CopyOp->getType()->isVectorTy());
+      Value *ResultShadow = getShadow(CopyOp);
+      Type *EltTy = ResultShadow->getType()->getVectorElementType();
+      for (int i = 0; i < NumUsedElements; ++i) {
+        ResultShadow = IRB.CreateInsertElement(
+            ResultShadow, ConstantInt::getNullValue(EltTy),
+            ConstantInt::get(IRB.getInt32Ty(), i));
+      }
+      setShadow(&I, ResultShadow);
+      setOrigin(&I, getOrigin(CopyOp));
+    } else {
+      setShadow(&I, getCleanShadow(&I));
+    }
+  }
+
   void visitIntrinsicInst(IntrinsicInst &I) {
     switch (I.getIntrinsicID()) {
     case llvm::Intrinsic::bswap:
       handleBswap(I);
       break;
+    case llvm::Intrinsic::x86_avx512_cvtsd2usi64:
+    case llvm::Intrinsic::x86_avx512_cvtsd2usi:
+    case llvm::Intrinsic::x86_avx512_cvtss2usi64:
+    case llvm::Intrinsic::x86_avx512_cvtss2usi:
+    case llvm::Intrinsic::x86_avx512_cvttss2usi64:
+    case llvm::Intrinsic::x86_avx512_cvttss2usi:
+    case llvm::Intrinsic::x86_avx512_cvttsd2usi64:
+    case llvm::Intrinsic::x86_avx512_cvttsd2usi:
+    case llvm::Intrinsic::x86_avx512_cvtusi2sd:
+    case llvm::Intrinsic::x86_avx512_cvtusi2ss:
+    case llvm::Intrinsic::x86_avx512_cvtusi642sd:
+    case llvm::Intrinsic::x86_avx512_cvtusi642ss:
+    case llvm::Intrinsic::x86_sse2_cvtsd2si64:
+    case llvm::Intrinsic::x86_sse2_cvtsd2si:
+    case llvm::Intrinsic::x86_sse2_cvtsd2ss:
+    case llvm::Intrinsic::x86_sse2_cvtsi2sd:
+    case llvm::Intrinsic::x86_sse2_cvtsi642sd:
+    case llvm::Intrinsic::x86_sse2_cvtss2sd:
+    case llvm::Intrinsic::x86_sse2_cvttsd2si64:
+    case llvm::Intrinsic::x86_sse2_cvttsd2si:
+    case llvm::Intrinsic::x86_sse_cvtsi2ss:
+    case llvm::Intrinsic::x86_sse_cvtsi642ss:
+    case llvm::Intrinsic::x86_sse_cvtss2si64:
+    case llvm::Intrinsic::x86_sse_cvtss2si:
+    case llvm::Intrinsic::x86_sse_cvttss2si64:
+    case llvm::Intrinsic::x86_sse_cvttss2si:
+      handleVectorConvertIntrinsic(I, 1);
+      break;
+    case llvm::Intrinsic::x86_sse2_cvtdq2pd:
+    case llvm::Intrinsic::x86_sse2_cvtps2pd:
+    case llvm::Intrinsic::x86_sse_cvtps2pi:
+    case llvm::Intrinsic::x86_sse_cvttps2pi:
+      handleVectorConvertIntrinsic(I, 2);
+      break;
     default:
       if (!handleUnknownIntrinsic(I))
         visitInstruction(I);
@@ -1589,6 +1911,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       }
     }
     IRBuilder<> IRB(&I);
+
+    if (MS.WrapIndirectCalls && !CS.getCalledFunction())
+      IndirectCallList.push_back(CS);
+
     unsigned ArgOffset = 0;
     DEBUG(dbgs() << "  CallSite: " << I << "\n");
     for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
@@ -1632,7 +1958,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     DEBUG(dbgs() << "  done with call args\n");
 
     FunctionType *FT =
-      cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0));
+      cast<FunctionType>(CS.getCalledValue()->getType()->getContainedType(0));
     if (FT->isVarArg()) {
       VAHelper->visitCallSite(CS, IRB);
     }
@@ -1671,12 +1997,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   void visitReturnInst(ReturnInst &I) {
     IRBuilder<> IRB(&I);
-    if (Value *RetVal = I.getReturnValue()) {
-      // Set the shadow for the RetVal.
+    Value *RetVal = I.getReturnValue();
+    if (!RetVal) return;
+    Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+    if (CheckReturnValue) {
+      insertShadowCheck(RetVal, &I);
+      Value *Shadow = getCleanShadow(RetVal);
+      IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+    } else {
       Value *Shadow = getShadow(RetVal);
-      Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
-      DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n");
       IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+      // FIXME: make it conditional if ClStoreCleanOrigin==0
       if (MS.TrackOrigins)
         IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
     }
@@ -1694,20 +2025,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
 
   void visitAllocaInst(AllocaInst &I) {
     setShadow(&I, getCleanShadow(&I));
-    if (!ClPoisonStack) return;
     IRBuilder<> IRB(I.getNextNode());
     uint64_t Size = MS.TD->getTypeAllocSize(I.getAllocatedType());
-    if (ClPoisonStackWithCall) {
+    if (PoisonStack && ClPoisonStackWithCall) {
       IRB.CreateCall2(MS.MsanPoisonStackFn,
                       IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
                       ConstantInt::get(MS.IntptrTy, Size));
     } else {
       Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
-      IRB.CreateMemSet(ShadowBase, IRB.getInt8(ClPoisonStackPattern),
-                       Size, I.getAlignment());
+      Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
+      IRB.CreateMemSet(ShadowBase, PoisonValue, Size, I.getAlignment());
     }
 
-    if (MS.TrackOrigins) {
+    if (PoisonStack && MS.TrackOrigins) {
       setOrigin(&I, getCleanOrigin());
       SmallString<2048> StackDescriptionStorage;
       raw_svector_ostream StackDescription(StackDescriptionStorage);
@@ -1720,18 +2050,34 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       Value *Descr =
           createPrivateNonConstGlobalForString(*F.getParent(),
                                                StackDescription.str());
-      IRB.CreateCall3(MS.MsanSetAllocaOriginFn,
+
+      IRB.CreateCall4(MS.MsanSetAllocaOrigin4Fn,
                       IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
                       ConstantInt::get(MS.IntptrTy, Size),
-                      IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()));
+                      IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
+                      IRB.CreatePointerCast(&F, MS.IntptrTy));
     }
   }
 
   void visitSelectInst(SelectInst& I) {
     IRBuilder<> IRB(&I);
-    setShadow(&I,  IRB.CreateSelect(I.getCondition(),
-              getShadow(I.getTrueValue()), getShadow(I.getFalseValue()),
-              "_msprop"));
+    // a = select b, c, d
+    Value *S = IRB.CreateSelect(I.getCondition(), getShadow(I.getTrueValue()),
+                                getShadow(I.getFalseValue()));
+    if (I.getType()->isAggregateType()) {
+      // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
+      // an extra "select". This results in much more compact IR.
+      // Sa = select Sb, poisoned, (select b, Sc, Sd)
+      S = IRB.CreateSelect(getShadow(I.getCondition()),
+                           getPoisonedShadow(getShadowTy(I.getType())), S,
+                           "_msprop_select_agg");
+    } else {
+      // Sa = (sext Sb) | (select b, Sc, Sd)
+      S = IRB.CreateOr(S, CreateShadowCast(IRB, getShadow(I.getCondition()),
+                                           S->getType(), true),
+                       "_msprop_select");
+    }
+    setShadow(&I, S);
     if (MS.TrackOrigins) {
       // Origins are always i32, so any vector conditions must be flattened.
       // FIXME: consider tracking vector origins for app vectors?
@@ -1766,7 +2112,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
     DEBUG(dbgs() << "   ResShadow:  " << *ResShadow << "\n");
     setShadow(&I, ResShadow);
-    setOrigin(&I, getCleanOrigin());
+    setOriginForNaryOp(I);
   }
 
   void visitInsertValueInst(InsertValueInst &I) {
@@ -1779,7 +2125,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
     DEBUG(dbgs() << "   Res:        " << *Res << "\n");
     setShadow(&I, Res);
-    setOrigin(&I, getCleanOrigin());
+    setOriginForNaryOp(I);
   }
 
   void dumpInst(Instruction &I) {
@@ -1802,7 +2148,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       dumpInst(I);
     DEBUG(dbgs() << "DEFAULT: " << I << "\n");
     for (size_t i = 0, n = I.getNumOperands(); i < n; i++)
-      insertCheck(I.getOperand(i), &I);
+      insertShadowCheck(I.getOperand(i), &I);
     setShadow(&I, getCleanShadow(&I));
     setOrigin(&I, getCleanOrigin());
   }
@@ -1956,16 +2302,35 @@ struct VarArgAMD64Helper : public VarArgHelper {
       Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
       Value *OverflowArgAreaShadowPtr =
         MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
-      Value *SrcPtr =
-        getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset);
+      Value *SrcPtr = IRB.CreateConstGEP1_32(VAArgTLSCopy, AMD64FpEndOffset);
       IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
     }
   }
 };
 
-VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+/// \brief A no-op implementation of VarArgHelper.
+struct VarArgNoOpHelper : public VarArgHelper {
+  VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
+                   MemorySanitizerVisitor &MSV) {}
+
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {}
+
+  void visitVAStartInst(VAStartInst &I) {}
+
+  void visitVACopyInst(VACopyInst &I) {}
+
+  void finalizeInstrumentation() {}
+};
+
+VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
                                  MemorySanitizerVisitor &Visitor) {
-  return new VarArgAMD64Helper(Func, Msan, Visitor);
+  // VarArg handling is only implemented on AMD64. False positives are possible
+  // on other platforms.
+  llvm::Triple TargetTriple(Func.getParent()->getTargetTriple());
+  if (TargetTriple.getArch() == llvm::Triple::x86_64)
+    return new VarArgAMD64Helper(Func, Msan, Visitor);
+  else
+    return new VarArgNoOpHelper(Func, Msan, Visitor);
 }
 
 }  // namespace
diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
deleted file mode 100644
index b45aef65..0000000
--- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
+++ /dev/null
@@ -1,225 +0,0 @@
-//===- OptimalEdgeProfiling.cpp - Insert counters for opt. edge profiling -===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments the specified program with counters for edge profiling.
-// Edge profiling can give a reasonable approximation of the hot paths through a
-// program, and is used for a wide variety of program transformations.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-optimal-edge-profiling"
-#include "llvm/Transforms/Instrumentation.h"
-#include "MaximumSpanningTree.h"
-#include "ProfilingUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-STATISTIC(NumEdgesInserted, "The # of edges inserted.");
-
-namespace {
-  class OptimalEdgeProfiler : public ModulePass {
-    bool runOnModule(Module &M);
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    OptimalEdgeProfiler() : ModulePass(ID) {
-      initializeOptimalEdgeProfilerPass(*PassRegistry::getPassRegistry());
-    }
-
-    void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequiredID(ProfileEstimatorPassID);
-      AU.addRequired<ProfileInfo>();
-    }
-
-    virtual const char *getPassName() const {
-      return "Optimal Edge Profiler";
-    }
-  };
-}
-
-char OptimalEdgeProfiler::ID = 0;
-INITIALIZE_PASS_BEGIN(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
-                "Insert optimal instrumentation for edge profiling",
-                false, false)
-INITIALIZE_PASS_DEPENDENCY(ProfileEstimatorPass)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(OptimalEdgeProfiler, "insert-optimal-edge-profiling",
-                "Insert optimal instrumentation for edge profiling",
-                false, false)
-
-ModulePass *llvm::createOptimalEdgeProfilerPass() {
-  return new OptimalEdgeProfiler();
-}
-
-inline static void printEdgeCounter(ProfileInfo::Edge e,
-                                    BasicBlock* b,
-                                    unsigned i) {
-  DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \
-               << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n");
-}
-
-bool OptimalEdgeProfiler::runOnModule(Module &M) {
-  Function *Main = M.getFunction("main");
-  if (Main == 0) {
-    errs() << "WARNING: cannot insert edge profiling into a module"
-           << " with no main function!\n";
-    return false;  // No main, no instrumentation!
-  }
-
-  // NumEdges counts all the edges that may be instrumented. Later on its
-  // decided which edges to actually instrument, to achieve optimal profiling.
-  // For the entry block a virtual edge (0,entry) is reserved, for each block
-  // with no successors an edge (BB,0) is reserved. These edges are necessary
-  // to calculate a truly optimal maximum spanning tree and thus an optimal
-  // instrumentation.
-  unsigned NumEdges = 0;
-
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    // Reserve space for (0,entry) edge.
-    ++NumEdges;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Keep track of which blocks need to be instrumented.  We don't want to
-      // instrument blocks that are added as the result of breaking critical
-      // edges!
-      if (BB->getTerminator()->getNumSuccessors() == 0) {
-        // Reserve space for (BB,0) edge.
-        ++NumEdges;
-      } else {
-        NumEdges += BB->getTerminator()->getNumSuccessors();
-      }
-    }
-  }
-
-  // In the profiling output a counter for each edge is reserved, but only few
-  // are used. This is done to be able to read back in the profile without
-  // calulating the maximum spanning tree again, instead each edge counter that
-  // is not used is initialised with -1 to signal that this edge counter has to
-  // be calculated from other edge counters on reading the profile info back
-  // in.
-
-  Type *Int32 = Type::getInt32Ty(M.getContext());
-  ArrayType *ATy = ArrayType::get(Int32, NumEdges);
-  GlobalVariable *Counters =
-    new GlobalVariable(M, ATy, false, GlobalValue::InternalLinkage,
-                       Constant::getNullValue(ATy), "OptEdgeProfCounters");
-  NumEdgesInserted = 0;
-
-  std::vector<Constant*> Initializer(NumEdges);
-  Constant *Zero = ConstantInt::get(Int32, 0);
-  Constant *Uncounted = ConstantInt::get(Int32, ProfileInfoLoader::Uncounted);
-
-  // Instrument all of the edges not in MST...
-  unsigned i = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
-    if (F->isDeclaration()) continue;
-    DEBUG(dbgs() << "Working on " << F->getName() << "\n");
-
-    // Calculate a Maximum Spanning Tree with the edge weights determined by
-    // ProfileEstimator. ProfileEstimator also assign weights to the virtual
-    // edges (0,entry) and (BB,0) (for blocks with no successors) and this
-    // edges also participate in the maximum spanning tree calculation.
-    // The third parameter of MaximumSpanningTree() has the effect that not the
-    // actual MST is returned but the edges _not_ in the MST.
-
-    ProfileInfo::EdgeWeights ECs =
-      getAnalysis<ProfileInfo>(*F).getEdgeWeights(F);
-    std::vector<ProfileInfo::EdgeWeight> EdgeVector(ECs.begin(), ECs.end());
-    MaximumSpanningTree<BasicBlock> MST(EdgeVector);
-    std::stable_sort(MST.begin(), MST.end());
-
-    // Check if (0,entry) not in the MST. If not, instrument edge
-    // (IncrementCounterInBlock()) and set the counter initially to zero, if
-    // the edge is in the MST the counter is initialised to -1.
-
-    BasicBlock *entry = &(F->getEntryBlock());
-    ProfileInfo::Edge edge = ProfileInfo::getEdge(0, entry);
-    if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-      printEdgeCounter(edge, entry, i);
-      IncrementCounterInBlock(entry, i, Counters); ++NumEdgesInserted;
-      Initializer[i++] = (Zero);
-    } else{
-      Initializer[i++] = (Uncounted);
-    }
-
-    // InsertedBlocks contains all blocks that were inserted for splitting an
-    // edge, this blocks do not have to be instrumented.
-    DenseSet<BasicBlock*> InsertedBlocks;
-    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
-      // Check if block was not inserted and thus does not have to be
-      // instrumented.
-      if (InsertedBlocks.count(BB)) continue;
-
-      // Okay, we have to add a counter of each outgoing edge not in MST. If
-      // the outgoing edge is not critical don't split it, just insert the
-      // counter in the source or destination of the edge. Also, if the block
-      // has no successors, the virtual edge (BB,0) is processed.
-      TerminatorInst *TI = BB->getTerminator();
-      if (TI->getNumSuccessors() == 0) {
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB, 0);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-          printEdgeCounter(edge, BB, i);
-          IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          Initializer[i++] = (Zero);
-        } else{
-          Initializer[i++] = (Uncounted);
-        }
-      }
-      for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
-        BasicBlock *Succ = TI->getSuccessor(s);
-        ProfileInfo::Edge edge = ProfileInfo::getEdge(BB,Succ);
-        if (!std::binary_search(MST.begin(), MST.end(), edge)) {
-
-          // If the edge is critical, split it.
-          bool wasInserted = SplitCriticalEdge(TI, s, this);
-          Succ = TI->getSuccessor(s);
-          if (wasInserted)
-            InsertedBlocks.insert(Succ);
-
-          // Okay, we are guaranteed that the edge is no longer critical.  If
-          // we only have a single successor, insert the counter in this block,
-          // otherwise insert it in the successor block.
-          if (TI->getNumSuccessors() == 1) {
-            // Insert counter at the start of the block
-            printEdgeCounter(edge, BB, i);
-            IncrementCounterInBlock(BB, i, Counters); ++NumEdgesInserted;
-          } else {
-            // Insert counter at the start of the block
-            printEdgeCounter(edge, Succ, i);
-            IncrementCounterInBlock(Succ, i, Counters); ++NumEdgesInserted;
-          }
-          Initializer[i++] = (Zero);
-        } else {
-          Initializer[i++] = (Uncounted);
-        }
-      }
-    }
-  }
-
-  // Check if the number of edges counted at first was the number of edges we
-  // considered for instrumentation.
-  assert(i == NumEdges && "the number of edges in counting array is wrong");
-
-  // Assign the now completely defined initialiser to the array.
-  Constant *init = ConstantArray::get(ATy, Initializer);
-  Counters->setInitializer(init);
-
-  // Add the initialization call to main.
-  InsertProfilingInitCall(Main, "llvm_start_opt_edge_profiling", Counters);
-  return true;
-}
-
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
deleted file mode 100644
index 7de7326..0000000
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ /dev/null
@@ -1,1424 +0,0 @@
-//===- PathProfiling.cpp - Inserts counters for path profiling ------------===//
-//
-//                      The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass instruments functions for Ball-Larus path profiling.  Ball-Larus
-// profiling converts the CFG into a DAG by replacing backedges with edges
-// from entry to the start block and from the end block to exit.  The paths
-// along the new DAG are enumrated, i.e. each path is given a path number.
-// Edges are instrumented to increment the path number register, such that the
-// path number register will equal the path number of the path taken at the
-// exit.
-//
-// This file defines classes for building a CFG for use with different stages
-// in the Ball-Larus path profiling instrumentation [Ball96].  The
-// requirements are formatting the llvm CFG into the Ball-Larus DAG, path
-// numbering, finding a spanning tree, moving increments from the spanning
-// tree to chords.
-//
-// Terms:
-// DAG            - Directed Acyclic Graph.
-// Ball-Larus DAG - A CFG with an entry node, an exit node, and backedges
-//                  removed in the following manner.  For every backedge
-//                  v->w, insert edge ENTRY->w and edge v->EXIT.
-// Path Number    - The number corresponding to a specific path through a
-//                  Ball-Larus DAG.
-// Spanning Tree  - A subgraph, S, is a spanning tree if S covers all
-//                  vertices and is a tree.
-// Chord          - An edge not in the spanning tree.
-//
-// [Ball96]
-//  T. Ball and J. R. Larus. "Efficient Path Profiling."
-//  International Symposium on Microarchitecture, pages 46-57, 1996.
-//  http://portal.acm.org/citation.cfm?id=243857
-//
-// [Ball94]
-//  Thomas Ball.  "Efficiently Counting Program Events with Support for
-//  On-line queries."
-//  ACM Transactions on Programmmg Languages and Systems, Vol 16, No 5,
-//  September 1994, Pages 1399-1410.
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "insert-path-profiling"
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "ProfilingUtils.h"
-#include "llvm/Analysis/PathNumbering.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/TypeBuilder.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <vector>
-
-#define HASH_THRESHHOLD 100000
-
-using namespace llvm;
-
-namespace {
-class BLInstrumentationNode;
-class BLInstrumentationEdge;
-class BLInstrumentationDag;
-
-// ---------------------------------------------------------------------------
-// BLInstrumentationNode extends BallLarusNode with member used by the
-// instrumentation algortihms.
-// ---------------------------------------------------------------------------
-class BLInstrumentationNode : public BallLarusNode {
-public:
-  // Creates a new BLInstrumentationNode from a BasicBlock.
-  BLInstrumentationNode(BasicBlock* BB);
-
-  // Get/sets the Value corresponding to the pathNumber register,
-  // constant or phinode.  Used by the instrumentation code to remember
-  // path number Values.
-  Value* getStartingPathNumber();
-  void setStartingPathNumber(Value* pathNumber);
-
-  Value* getEndingPathNumber();
-  void setEndingPathNumber(Value* pathNumber);
-
-  // Get/set the PHINode Instruction for this node.
-  PHINode* getPathPHI();
-  void setPathPHI(PHINode* pathPHI);
-
-private:
-
-  Value* _startingPathNumber; // The Value for the current pathNumber.
-  Value* _endingPathNumber; // The Value for the current pathNumber.
-  PHINode* _pathPHI; // The PHINode for current pathNumber.
-};
-
-// --------------------------------------------------------------------------
-// BLInstrumentationEdge extends BallLarusEdge with data about the
-// instrumentation that will end up on each edge.
-// --------------------------------------------------------------------------
-class BLInstrumentationEdge : public BallLarusEdge {
-public:
-  BLInstrumentationEdge(BLInstrumentationNode* source,
-                        BLInstrumentationNode* target);
-
-  // Sets the target node of this edge.  Required to split edges.
-  void setTarget(BallLarusNode* node);
-
-  // Get/set whether edge is in the spanning tree.
-  bool isInSpanningTree() const;
-  void setIsInSpanningTree(bool isInSpanningTree);
-
-  // Get/ set whether this edge will be instrumented with a path number
-  // initialization.
-  bool isInitialization() const;
-  void setIsInitialization(bool isInitialization);
-
-  // Get/set whether this edge will be instrumented with a path counter
-  // increment.  Notice this is incrementing the path counter
-  // corresponding to the path number register.  The path number
-  // increment is determined by getIncrement().
-  bool isCounterIncrement() const;
-  void setIsCounterIncrement(bool isCounterIncrement);
-
-  // Get/set the path number increment that this edge will be instrumented
-  // with.  This is distinct from the path counter increment and the
-  // weight.  The counter increment counts the number of executions of
-  // some path, whereas the path number keeps track of which path number
-  // the program is on.
-  long getIncrement() const;
-  void setIncrement(long increment);
-
-  // Get/set whether the edge has been instrumented.
-  bool hasInstrumentation();
-  void setHasInstrumentation(bool hasInstrumentation);
-
-  // Returns the successor number of this edge in the source.
-  unsigned getSuccessorNumber();
-
-private:
-  // The increment that the code will be instrumented with.
-  long long _increment;
-
-  // Whether this edge is in the spanning tree.
-  bool _isInSpanningTree;
-
-  // Whether this edge is an initialiation of the path number.
-  bool _isInitialization;
-
-  // Whether this edge is a path counter increment.
-  bool _isCounterIncrement;
-
-  // Whether this edge has been instrumented.
-  bool _hasInstrumentation;
-};
-
-// ---------------------------------------------------------------------------
-// BLInstrumentationDag extends BallLarusDag with algorithms that
-// determine where instrumentation should be placed.
-// ---------------------------------------------------------------------------
-class BLInstrumentationDag : public BallLarusDag {
-public:
-  BLInstrumentationDag(Function &F);
-
-  // Returns the Exit->Root edge. This edge is required for creating
-  // directed cycles in the algorithm for moving instrumentation off of
-  // the spanning tree
-  BallLarusEdge* getExitRootEdge();
-
-  // Returns an array of phony edges which mark those nodes
-  // with function calls
-  BLEdgeVector getCallPhonyEdges();
-
-  // Gets/sets the path counter array
-  GlobalVariable* getCounterArray();
-  void setCounterArray(GlobalVariable* c);
-
-  // Calculates the increments for the chords, thereby removing
-  // instrumentation from the spanning tree edges. Implementation is based
-  // on the algorithm in Figure 4 of [Ball94]
-  void calculateChordIncrements();
-
-  // Updates the state when an edge has been split
-  void splitUpdate(BLInstrumentationEdge* formerEdge, BasicBlock* newBlock);
-
-  // Calculates a spanning tree of the DAG ignoring cycles.  Whichever
-  // edges are in the spanning tree will not be instrumented, but this
-  // implementation does not try to minimize the instrumentation overhead
-  // by trying to find hot edges.
-  void calculateSpanningTree();
-
-  // Pushes initialization further down in order to group the first
-  // increment and initialization.
-  void pushInitialization();
-
-  // Pushes the path counter increments up in order to group the last path
-  // number increment.
-  void pushCounters();
-
-  // Removes phony edges from the successor list of the source, and the
-  // predecessor list of the target.
-  void unlinkPhony();
-
-  // Generate dot graph for the function
-  void generateDotGraph();
-
-protected:
-  // BLInstrumentationDag creates BLInstrumentationNode objects in this
-  // method overriding the creation of BallLarusNode objects.
-  //
-  // Allows subclasses to determine which type of Node is created.
-  // Override this method to produce subclasses of BallLarusNode if
-  // necessary.
-  virtual BallLarusNode* createNode(BasicBlock* BB);
-
-  // BLInstrumentationDag create BLInstrumentationEdges.
-  //
-  // Allows subclasses to determine which type of Edge is created.
-  // Override this method to produce subclasses of BallLarusEdge if
-  // necessary.  Parameters source and target will have been created by
-  // createNode and can be cast to the subclass of BallLarusNode*
-  // returned by createNode.
-  virtual BallLarusEdge* createEdge(
-    BallLarusNode* source, BallLarusNode* target, unsigned edgeNumber);
-
-private:
-  BLEdgeVector _treeEdges; // All edges in the spanning tree.
-  BLEdgeVector _chordEdges; // All edges not in the spanning tree.
-  GlobalVariable* _counterArray; // Array to store path counters
-
-  // Removes the edge from the appropriate predecessor and successor lists.
-  void unlinkEdge(BallLarusEdge* edge);
-
-  // Makes an edge part of the spanning tree.
-  void makeEdgeSpanning(BLInstrumentationEdge* edge);
-
-  // Pushes initialization and calls itself recursively.
-  void pushInitializationFromEdge(BLInstrumentationEdge* edge);
-
-  // Pushes path counter increments up recursively.
-  void pushCountersFromEdge(BLInstrumentationEdge* edge);
-
-  // Depth first algorithm for determining the chord increments.f
-  void calculateChordIncrementsDfs(
-    long weight, BallLarusNode* v, BallLarusEdge* e);
-
-  // Determines the relative direction of two edges.
-  int calculateChordIncrementsDir(BallLarusEdge* e, BallLarusEdge* f);
-};
-
-// ---------------------------------------------------------------------------
-// PathProfiler is a module pass which instruments path profiling instructions
-// ---------------------------------------------------------------------------
-class PathProfiler : public ModulePass {
-private:
-  // Current context for multi threading support.
-  LLVMContext* Context;
-
-  // Which function are we currently instrumenting
-  unsigned currentFunctionNumber;
-
-  // The function prototype in the profiling runtime for incrementing a
-  // single path counter in a hash table.
-  Constant* llvmIncrementHashFunction;
-  Constant* llvmDecrementHashFunction;
-
-  // Instruments each function with path profiling.  'main' is instrumented
-  // with code to save the profile to disk.
-  bool runOnModule(Module &M);
-
-  // Analyzes the function for Ball-Larus path profiling, and inserts code.
-  void runOnFunction(std::vector<Constant*> &ftInit, Function &F, Module &M);
-
-  // Creates an increment constant representing incr.
-  ConstantInt* createIncrementConstant(long incr, int bitsize);
-
-  // Creates an increment constant representing the value in
-  // edge->getIncrement().
-  ConstantInt* createIncrementConstant(BLInstrumentationEdge* edge);
-
-  // Finds the insertion point after pathNumber in block.  PathNumber may
-  // be NULL.
-  BasicBlock::iterator getInsertionPoint(
-    BasicBlock* block, Value* pathNumber);
-
-  // Inserts source's pathNumber Value* into target.  Target may or may not
-  // have multiple predecessors, and may or may not have its phiNode
-  // initalized.
-  void pushValueIntoNode(
-    BLInstrumentationNode* source, BLInstrumentationNode* target);
-
-  // Inserts source's pathNumber Value* into the appropriate slot of
-  // target's phiNode.
-  void pushValueIntoPHI(
-    BLInstrumentationNode* target, BLInstrumentationNode* source);
-
-  // The Value* in node, oldVal,  is updated with a Value* correspodning to
-  // oldVal + addition.
-  void insertNumberIncrement(BLInstrumentationNode* node, Value* addition,
-                             bool atBeginning);
-
-  // Creates a counter increment in the given node.  The Value* in node is
-  // taken as the index into a hash table.
-  void insertCounterIncrement(
-    Value* incValue,
-    BasicBlock::iterator insertPoint,
-    BLInstrumentationDag* dag,
-    bool increment = true);
-
-  // A PHINode is created in the node, and its values initialized to -1U.
-  void preparePHI(BLInstrumentationNode* node);
-
-  // Inserts instrumentation for the given edge
-  //
-  // Pre: The edge's source node has pathNumber set if edge is non zero
-  // path number increment.
-  //
-  // Post: Edge's target node has a pathNumber set to the path number Value
-  // corresponding to the value of the path register after edge's
-  // execution.
-  void insertInstrumentationStartingAt(
-    BLInstrumentationEdge* edge,
-    BLInstrumentationDag* dag);
-
-  // If this edge is a critical edge, then inserts a node at this edge.
-  // This edge becomes the first edge, and a new BallLarusEdge is created.
-  bool splitCritical(BLInstrumentationEdge* edge, BLInstrumentationDag* dag);
-
-  // Inserts instrumentation according to the marked edges in dag.  Phony
-  // edges must be unlinked from the DAG, but accessible from the
-  // backedges.  Dag must have initializations, path number increments, and
-  // counter increments present.
-  //
-  // Counter storage is created here.
-  void insertInstrumentation( BLInstrumentationDag& dag, Module &M);
-
-public:
-  static char ID; // Pass identification, replacement for typeid
-  PathProfiler() : ModulePass(ID) {
-    initializePathProfilerPass(*PassRegistry::getPassRegistry());
-  }
-
-  virtual const char *getPassName() const {
-    return "Path Profiler";
-  }
-};
-} // end anonymous namespace
-
-// Should we print the dot-graphs
-static cl::opt<bool> DotPathDag("path-profile-pathdag", cl::Hidden,
-        cl::desc("Output the path profiling DAG for each function."));
-
-// Register the path profiler as a pass
-char PathProfiler::ID = 0;
-INITIALIZE_PASS(PathProfiler, "insert-path-profiling",
-                "Insert instrumentation for Ball-Larus path profiling",
-                false, false)
-
-ModulePass *llvm::createPathProfilerPass() { return new PathProfiler(); }
-
-namespace llvm {
-  class PathProfilingFunctionTable {};
-
-  // Type for global array storing references to hashes or arrays
-  template<bool xcompile> class TypeBuilder<PathProfilingFunctionTable,
-                                            xcompile> {
-  public:
-    static StructType *get(LLVMContext& C) {
-      return( StructType::get(
-                TypeBuilder<types::i<32>, xcompile>::get(C), // type
-                TypeBuilder<types::i<32>, xcompile>::get(C), // array size
-                TypeBuilder<types::i<8>*, xcompile>::get(C), // array/hash ptr
-                NULL));
-    }
-  };
-
-  typedef TypeBuilder<PathProfilingFunctionTable, true>
-  ftEntryTypeBuilder;
-
-  // BallLarusEdge << operator overloading
-  raw_ostream& operator<<(raw_ostream& os,
-                          const BLInstrumentationEdge& edge)
-      LLVM_ATTRIBUTE_USED;
-  raw_ostream& operator<<(raw_ostream& os,
-                          const BLInstrumentationEdge& edge) {
-    os << "[" << edge.getSource()->getName() << " -> "
-       << edge.getTarget()->getName() << "] init: "
-       << (edge.isInitialization() ? "yes" : "no")
-       << " incr:" << edge.getIncrement() << " cinc: "
-       << (edge.isCounterIncrement() ? "yes" : "no");
-    return(os);
-  }
-}
-
-// Creates a new BLInstrumentationNode from a BasicBlock.
-BLInstrumentationNode::BLInstrumentationNode(BasicBlock* BB) :
-  BallLarusNode(BB),
-  _startingPathNumber(NULL), _endingPathNumber(NULL), _pathPHI(NULL) {}
-
-// Constructor for BLInstrumentationEdge.
-BLInstrumentationEdge::BLInstrumentationEdge(BLInstrumentationNode* source,
-                                             BLInstrumentationNode* target)
-  : BallLarusEdge(source, target, 0),
-    _increment(0), _isInSpanningTree(false), _isInitialization(false),
-    _isCounterIncrement(false), _hasInstrumentation(false) {}
-
-// Sets the target node of this edge.  Required to split edges.
-void BLInstrumentationEdge::setTarget(BallLarusNode* node) {
-  _target = node;
-}
-
-// Returns whether this edge is in the spanning tree.
-bool BLInstrumentationEdge::isInSpanningTree() const {
-  return(_isInSpanningTree);
-}
-
-// Sets whether this edge is in the spanning tree.
-void BLInstrumentationEdge::setIsInSpanningTree(bool isInSpanningTree) {
-  _isInSpanningTree = isInSpanningTree;
-}
-
-// Returns whether this edge will be instrumented with a path number
-// initialization.
-bool BLInstrumentationEdge::isInitialization() const {
-  return(_isInitialization);
-}
-
-// Sets whether this edge will be instrumented with a path number
-// initialization.
-void BLInstrumentationEdge::setIsInitialization(bool isInitialization) {
-  _isInitialization = isInitialization;
-}
-
-// Returns whether this edge will be instrumented with a path counter
-// increment.  Notice this is incrementing the path counter
-// corresponding to the path number register.  The path number
-// increment is determined by getIncrement().
-bool BLInstrumentationEdge::isCounterIncrement() const {
-  return(_isCounterIncrement);
-}
-
-// Sets whether this edge will be instrumented with a path counter
-// increment.
-void BLInstrumentationEdge::setIsCounterIncrement(bool isCounterIncrement) {
-  _isCounterIncrement = isCounterIncrement;
-}
-
-// Gets the path number increment that this edge will be instrumented
-// with.  This is distinct from the path counter increment and the
-// weight.  The counter increment is counts the number of executions of
-// some path, whereas the path number keeps track of which path number
-// the program is on.
-long BLInstrumentationEdge::getIncrement() const {
-  return(_increment);
-}
-
-// Set whether this edge will be instrumented with a path number
-// increment.
-void BLInstrumentationEdge::setIncrement(long increment) {
-  _increment = increment;
-}
-
-// True iff the edge has already been instrumented.
-bool BLInstrumentationEdge::hasInstrumentation() {
-  return(_hasInstrumentation);
-}
-
-// Set whether this edge has been instrumented.
-void BLInstrumentationEdge::setHasInstrumentation(bool hasInstrumentation) {
-  _hasInstrumentation = hasInstrumentation;
-}
-
-// Returns the successor number of this edge in the source.
-unsigned BLInstrumentationEdge::getSuccessorNumber() {
-  BallLarusNode* sourceNode = getSource();
-  BallLarusNode* targetNode = getTarget();
-  BasicBlock* source = sourceNode->getBlock();
-  BasicBlock* target = targetNode->getBlock();
-
-  if(source == NULL || target == NULL)
-    return(0);
-
-  TerminatorInst* terminator = source->getTerminator();
-
-        unsigned i;
-  for(i=0; i < terminator->getNumSuccessors(); i++) {
-    if(terminator->getSuccessor(i) == target)
-      break;
-  }
-
-  return(i);
-}
-
-// BLInstrumentationDag constructor initializes a DAG for the given Function.
-BLInstrumentationDag::BLInstrumentationDag(Function &F) : BallLarusDag(F),
-                                                          _counterArray(0) {
-}
-
-// Returns the Exit->Root edge. This edge is required for creating
-// directed cycles in the algorithm for moving instrumentation off of
-// the spanning tree
-BallLarusEdge* BLInstrumentationDag::getExitRootEdge() {
-  BLEdgeIterator erEdge = getExit()->succBegin();
-  return(*erEdge);
-}
-
-BLEdgeVector BLInstrumentationDag::getCallPhonyEdges () {
-  BLEdgeVector callEdges;
-
-  for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
-       edge != end; edge++ ) {
-    if( (*edge)->getType() == BallLarusEdge::CALLEDGE_PHONY )
-      callEdges.push_back(*edge);
-  }
-
-  return callEdges;
-}
-
-// Gets the path counter array
-GlobalVariable* BLInstrumentationDag::getCounterArray() {
-  return _counterArray;
-}
-
-void BLInstrumentationDag::setCounterArray(GlobalVariable* c) {
-  _counterArray = c;
-}
-
-// Calculates the increment for the chords, thereby removing
-// instrumentation from the spanning tree edges. Implementation is based on
-// the algorithm in Figure 4 of [Ball94]
-void BLInstrumentationDag::calculateChordIncrements() {
-  calculateChordIncrementsDfs(0, getRoot(), NULL);
-
-  BLInstrumentationEdge* chord;
-  for(BLEdgeIterator chordEdge = _chordEdges.begin(),
-      end = _chordEdges.end(); chordEdge != end; chordEdge++) {
-    chord = (BLInstrumentationEdge*) *chordEdge;
-    chord->setIncrement(chord->getIncrement() + chord->getWeight());
-  }
-}
-
-// Updates the state when an edge has been split
-void BLInstrumentationDag::splitUpdate(BLInstrumentationEdge* formerEdge,
-                                       BasicBlock* newBlock) {
-  BallLarusNode* oldTarget = formerEdge->getTarget();
-  BallLarusNode* newNode = addNode(newBlock);
-  formerEdge->setTarget(newNode);
-  newNode->addPredEdge(formerEdge);
-
-  DEBUG(dbgs() << "  Edge split: " << *formerEdge << "\n");
-
-  oldTarget->removePredEdge(formerEdge);
-  BallLarusEdge* newEdge = addEdge(newNode, oldTarget,0);
-
-  if( formerEdge->getType() == BallLarusEdge::BACKEDGE ||
-                        formerEdge->getType() == BallLarusEdge::SPLITEDGE) {
-                newEdge->setType(formerEdge->getType());
-    newEdge->setPhonyRoot(formerEdge->getPhonyRoot());
-    newEdge->setPhonyExit(formerEdge->getPhonyExit());
-    formerEdge->setType(BallLarusEdge::NORMAL);
-                formerEdge->setPhonyRoot(NULL);
-    formerEdge->setPhonyExit(NULL);
-  }
-}
-
-// Calculates a spanning tree of the DAG ignoring cycles.  Whichever
-// edges are in the spanning tree will not be instrumented, but this
-// implementation does not try to minimize the instrumentation overhead
-// by trying to find hot edges.
-void BLInstrumentationDag::calculateSpanningTree() {
-  std::stack<BallLarusNode*> dfsStack;
-
-  for(BLNodeIterator nodeIt = _nodes.begin(), end = _nodes.end();
-      nodeIt != end; nodeIt++) {
-    (*nodeIt)->setColor(BallLarusNode::WHITE);
-  }
-
-  dfsStack.push(getRoot());
-  while(dfsStack.size() > 0) {
-    BallLarusNode* node = dfsStack.top();
-    dfsStack.pop();
-
-    if(node->getColor() == BallLarusNode::WHITE)
-      continue;
-
-    BallLarusNode* nextNode;
-    bool forward = true;
-    BLEdgeIterator succEnd = node->succEnd();
-
-    node->setColor(BallLarusNode::WHITE);
-    // first iterate over successors then predecessors
-    for(BLEdgeIterator edge = node->succBegin(), predEnd = node->predEnd();
-        edge != predEnd; edge++) {
-      if(edge == succEnd) {
-        edge = node->predBegin();
-        forward = false;
-      }
-
-      // Ignore split edges
-      if ((*edge)->getType() == BallLarusEdge::SPLITEDGE)
-        continue;
-
-      nextNode = forward? (*edge)->getTarget(): (*edge)->getSource();
-      if(nextNode->getColor() != BallLarusNode::WHITE) {
-        nextNode->setColor(BallLarusNode::WHITE);
-        makeEdgeSpanning((BLInstrumentationEdge*)(*edge));
-      }
-    }
-  }
-
-  for(BLEdgeIterator edge = _edges.begin(), end = _edges.end();
-      edge != end; edge++) {
-    BLInstrumentationEdge* instEdge = (BLInstrumentationEdge*) (*edge);
-      // safe since createEdge is overriden
-    if(!instEdge->isInSpanningTree() && (*edge)->getType()
-        != BallLarusEdge::SPLITEDGE)
-      _chordEdges.push_back(instEdge);
-  }
-}
-
-// Pushes initialization further down in order to group the first
-// increment and initialization.
-void BLInstrumentationDag::pushInitialization() {
-  BLInstrumentationEdge* exitRootEdge =
-                (BLInstrumentationEdge*) getExitRootEdge();
-  exitRootEdge->setIsInitialization(true);
-  pushInitializationFromEdge(exitRootEdge);
-}
-
-// Pushes the path counter increments up in order to group the last path
-// number increment.
-void BLInstrumentationDag::pushCounters() {
-  BLInstrumentationEdge* exitRootEdge =
-    (BLInstrumentationEdge*) getExitRootEdge();
-  exitRootEdge->setIsCounterIncrement(true);
-  pushCountersFromEdge(exitRootEdge);
-}
-
-// Removes phony edges from the successor list of the source, and the
-// predecessor list of the target.
-void BLInstrumentationDag::unlinkPhony() {
-  BallLarusEdge* edge;
-
-  for(BLEdgeIterator next = _edges.begin(),
-      end = _edges.end(); next != end; next++) {
-    edge = (*next);
-
-    if( edge->getType() == BallLarusEdge::BACKEDGE_PHONY ||
-        edge->getType() == BallLarusEdge::SPLITEDGE_PHONY ||
-        edge->getType() == BallLarusEdge::CALLEDGE_PHONY ) {
-      unlinkEdge(edge);
-    }
-  }
-}
-
-// Generate a .dot graph to represent the DAG and pathNumbers
-void BLInstrumentationDag::generateDotGraph() {
-  std::string errorInfo;
-  std::string functionName = getFunction().getName().str();
-  std::string filename = "pathdag." + functionName + ".dot";
-
-  DEBUG (dbgs() << "Writing '" << filename << "'...\n");
-  raw_fd_ostream dotFile(filename.c_str(), errorInfo);
-
-  if (!errorInfo.empty()) {
-    errs() << "Error opening '" << filename.c_str() <<"' for writing!";
-    errs() << "\n";
-    return;
-  }
-
-  dotFile << "digraph " << functionName << " {\n";
-
-  for( BLEdgeIterator edge = _edges.begin(), end = _edges.end();
-       edge != end; edge++) {
-    std::string sourceName = (*edge)->getSource()->getName();
-    std::string targetName = (*edge)->getTarget()->getName();
-
-    dotFile << "\t\"" << sourceName.c_str() << "\" -> \""
-            << targetName.c_str() << "\" ";
-
-    long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
-
-    switch( (*edge)->getType() ) {
-    case BallLarusEdge::NORMAL:
-      dotFile << "[label=" << inc << "] [color=black];\n";
-      break;
-
-    case BallLarusEdge::BACKEDGE:
-      dotFile << "[color=cyan];\n";
-      break;
-
-    case BallLarusEdge::BACKEDGE_PHONY:
-      dotFile << "[label=" << inc
-              << "] [color=blue];\n";
-      break;
-
-    case BallLarusEdge::SPLITEDGE:
-      dotFile << "[color=violet];\n";
-      break;
-
-    case BallLarusEdge::SPLITEDGE_PHONY:
-      dotFile << "[label=" << inc << "] [color=red];\n";
-      break;
-
-    case BallLarusEdge::CALLEDGE_PHONY:
-      dotFile << "[label=" << inc     << "] [color=green];\n";
-      break;
-    }
-  }
-
-  dotFile << "}\n";
-}
-
-// Allows subclasses to determine which type of Node is created.
-// Override this method to produce subclasses of BallLarusNode if
-// necessary. The destructor of BallLarusDag will call free on each pointer
-// created.
-BallLarusNode* BLInstrumentationDag::createNode(BasicBlock* BB) {
-  return( new BLInstrumentationNode(BB) );
-}
-
-// Allows subclasses to determine which type of Edge is created.
-// Override this method to produce subclasses of BallLarusEdge if
-// necessary. The destructor of BallLarusDag will call free on each pointer
-// created.
-BallLarusEdge* BLInstrumentationDag::createEdge(BallLarusNode* source,
-                                                BallLarusNode* target, unsigned edgeNumber) {
-  // One can cast from BallLarusNode to BLInstrumentationNode since createNode
-  // is overriden to produce BLInstrumentationNode.
-  return( new BLInstrumentationEdge((BLInstrumentationNode*)source,
-                                    (BLInstrumentationNode*)target) );
-}
-
-// Sets the Value corresponding to the pathNumber register, constant,
-// or phinode.  Used by the instrumentation code to remember path
-// number Values.
-Value* BLInstrumentationNode::getStartingPathNumber(){
-  return(_startingPathNumber);
-}
-
-// Sets the Value of the pathNumber.  Used by the instrumentation code.
-void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) {
-  DEBUG(dbgs() << "  SPN-" << getName() << " <-- " << (pathNumber ?
-                                                       pathNumber->getName() :
-                                                       "unused") << "\n");
-  _startingPathNumber = pathNumber;
-}
-
-Value* BLInstrumentationNode::getEndingPathNumber(){
-  return(_endingPathNumber);
-}
-
-void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) {
-  DEBUG(dbgs() << "  EPN-" << getName() << " <-- "
-               << (pathNumber ? pathNumber->getName() : "unused") << "\n");
-  _endingPathNumber = pathNumber;
-}
-
-// Get the PHINode Instruction for this node.  Used by instrumentation
-// code.
-PHINode* BLInstrumentationNode::getPathPHI() {
-  return(_pathPHI);
-}
-
-// Set the PHINode Instruction for this node.  Used by instrumentation
-// code.
-void BLInstrumentationNode::setPathPHI(PHINode* pathPHI) {
-  _pathPHI = pathPHI;
-}
-
-// Removes the edge from the appropriate predecessor and successor
-// lists.
-void BLInstrumentationDag::unlinkEdge(BallLarusEdge* edge) {
-  if(edge == getExitRootEdge())
-    DEBUG(dbgs() << " Removing exit->root edge\n");
-
-  edge->getSource()->removeSuccEdge(edge);
-  edge->getTarget()->removePredEdge(edge);
-}
-
-// Makes an edge part of the spanning tree.
-void BLInstrumentationDag::makeEdgeSpanning(BLInstrumentationEdge* edge) {
-  edge->setIsInSpanningTree(true);
-  _treeEdges.push_back(edge);
-}
-
-// Pushes initialization and calls itself recursively.
-void BLInstrumentationDag::pushInitializationFromEdge(
-  BLInstrumentationEdge* edge) {
-  BallLarusNode* target;
-
-  target = edge->getTarget();
-  if( target->getNumberPredEdges() > 1 || target == getExit() ) {
-    return;
-  } else {
-    for(BLEdgeIterator next = target->succBegin(),
-          end = target->succEnd(); next != end; next++) {
-      BLInstrumentationEdge* intoEdge = (BLInstrumentationEdge*) *next;
-
-      // Skip split edges
-      if (intoEdge->getType() == BallLarusEdge::SPLITEDGE)
-        continue;
-
-      intoEdge->setIncrement(intoEdge->getIncrement() +
-                             edge->getIncrement());
-      intoEdge->setIsInitialization(true);
-      pushInitializationFromEdge(intoEdge);
-    }
-
-    edge->setIncrement(0);
-    edge->setIsInitialization(false);
-  }
-}
-
-// Pushes path counter increments up recursively.
-void BLInstrumentationDag::pushCountersFromEdge(BLInstrumentationEdge* edge) {
-  BallLarusNode* source;
-
-  source = edge->getSource();
-  if(source->getNumberSuccEdges() > 1 || source == getRoot()
-     || edge->isInitialization()) {
-    return;
-  } else {
-    for(BLEdgeIterator previous = source->predBegin(),
-          end = source->predEnd(); previous != end; previous++) {
-      BLInstrumentationEdge* fromEdge = (BLInstrumentationEdge*) *previous;
-
-      // Skip split edges
-      if (fromEdge->getType() == BallLarusEdge::SPLITEDGE)
-        continue;
-
-      fromEdge->setIncrement(fromEdge->getIncrement() +
-                             edge->getIncrement());
-      fromEdge->setIsCounterIncrement(true);
-      pushCountersFromEdge(fromEdge);
-    }
-
-    edge->setIncrement(0);
-    edge->setIsCounterIncrement(false);
-  }
-}
-
-// Depth first algorithm for determining the chord increments.
-void BLInstrumentationDag::calculateChordIncrementsDfs(long weight,
-                                                       BallLarusNode* v, BallLarusEdge* e) {
-  BLInstrumentationEdge* f;
-
-  for(BLEdgeIterator treeEdge = _treeEdges.begin(),
-        end = _treeEdges.end(); treeEdge != end; treeEdge++) {
-    f = (BLInstrumentationEdge*) *treeEdge;
-    if(e != f && v == f->getTarget()) {
-      calculateChordIncrementsDfs(
-        calculateChordIncrementsDir(e,f)*(weight) +
-        f->getWeight(), f->getSource(), f);
-    }
-    if(e != f && v == f->getSource()) {
-      calculateChordIncrementsDfs(
-        calculateChordIncrementsDir(e,f)*(weight) +
-        f->getWeight(), f->getTarget(), f);
-    }
-  }
-
-  for(BLEdgeIterator chordEdge = _chordEdges.begin(),
-        end = _chordEdges.end(); chordEdge != end; chordEdge++) {
-    f = (BLInstrumentationEdge*) *chordEdge;
-    if(v == f->getSource() || v == f->getTarget()) {
-      f->setIncrement(f->getIncrement() +
-                      calculateChordIncrementsDir(e,f)*weight);
-    }
-  }
-}
-
-// Determines the relative direction of two edges.
-int BLInstrumentationDag::calculateChordIncrementsDir(BallLarusEdge* e,
-                                                      BallLarusEdge* f) {
-  if( e == NULL)
-    return(1);
-  else if(e->getSource() == f->getTarget()
-          || e->getTarget() == f->getSource())
-    return(1);
-
-  return(-1);
-}
-
-// Creates an increment constant representing incr.
-ConstantInt* PathProfiler::createIncrementConstant(long incr,
-                                                   int bitsize) {
-  return(ConstantInt::get(IntegerType::get(*Context, 32), incr));
-}
-
-// Creates an increment constant representing the value in
-// edge->getIncrement().
-ConstantInt* PathProfiler::createIncrementConstant(
-  BLInstrumentationEdge* edge) {
-  return(createIncrementConstant(edge->getIncrement(), 32));
-}
-
-// Finds the insertion point after pathNumber in block.  PathNumber may
-// be NULL.
-BasicBlock::iterator PathProfiler::getInsertionPoint(BasicBlock* block, Value*
-                                                     pathNumber) {
-  if(pathNumber == NULL || isa<ConstantInt>(pathNumber)
-     || (((Instruction*)(pathNumber))->getParent()) != block) {
-    return(block->getFirstInsertionPt());
-  } else {
-    Instruction* pathNumberInst = (Instruction*) (pathNumber);
-    BasicBlock::iterator insertPoint;
-    BasicBlock::iterator end = block->end();
-
-    for(insertPoint = block->begin();
-        insertPoint != end; insertPoint++) {
-      Instruction* insertInst = &(*insertPoint);
-
-      if(insertInst == pathNumberInst)
-        return(++insertPoint);
-    }
-
-    return(insertPoint);
-  }
-}
-
-// A PHINode is created in the node, and its values initialized to -1U.
-void PathProfiler::preparePHI(BLInstrumentationNode* node) {
-  BasicBlock* block = node->getBlock();
-  BasicBlock::iterator insertPoint = block->getFirstInsertionPt();
-  pred_iterator PB = pred_begin(node->getBlock()),
-          PE = pred_end(node->getBlock());
-  PHINode* phi = PHINode::Create(Type::getInt32Ty(*Context),
-                                 std::distance(PB, PE), "pathNumber",
-                                 insertPoint );
-  node->setPathPHI(phi);
-  node->setStartingPathNumber(phi);
-  node->setEndingPathNumber(phi);
-
-  for(pred_iterator predIt = PB; predIt != PE; predIt++) {
-    BasicBlock* pred = (*predIt);
-
-    if(pred != NULL)
-      phi->addIncoming(createIncrementConstant((long)-1, 32), pred);
-  }
-}
-
-// Inserts source's pathNumber Value* into target.  Target may or may not
-// have multiple predecessors, and may or may not have its phiNode
-// initalized.
-void PathProfiler::pushValueIntoNode(BLInstrumentationNode* source,
-                                     BLInstrumentationNode* target) {
-  if(target->getBlock() == NULL)
-    return;
-
-
-  if(target->getNumberPredEdges() <= 1) {
-    assert(target->getStartingPathNumber() == NULL &&
-           "Target already has path number");
-    target->setStartingPathNumber(source->getEndingPathNumber());
-    target->setEndingPathNumber(source->getEndingPathNumber());
-    DEBUG(dbgs() << "  Passing path number"
-          << (source->getEndingPathNumber() ? "" : " (null)")
-          << " value through.\n");
-  } else {
-    if(target->getPathPHI() == NULL) {
-      DEBUG(dbgs() << "  Initializing PHI node for block '"
-            << target->getName() << "'\n");
-      preparePHI(target);
-    }
-    pushValueIntoPHI(target, source);
-    DEBUG(dbgs() << "  Passing number value into PHI for block '"
-          << target->getName() << "'\n");
-  }
-}
-
-// Inserts source's pathNumber Value* into the appropriate slot of
-// target's phiNode.
-void PathProfiler::pushValueIntoPHI(BLInstrumentationNode* target,
-                                    BLInstrumentationNode* source) {
-  PHINode* phi = target->getPathPHI();
-  assert(phi != NULL && "  Tried to push value into node with PHI, but node"
-         " actually had no PHI.");
-  phi->removeIncomingValue(source->getBlock(), false);
-  phi->addIncoming(source->getEndingPathNumber(), source->getBlock());
-}
-
-// The Value* in node, oldVal,  is updated with a Value* correspodning to
-// oldVal + addition.
-void PathProfiler::insertNumberIncrement(BLInstrumentationNode* node,
-                                         Value* addition, bool atBeginning) {
-  BasicBlock* block = node->getBlock();
-  assert(node->getStartingPathNumber() != NULL);
-  assert(node->getEndingPathNumber() != NULL);
-
-  BasicBlock::iterator insertPoint;
-
-  if( atBeginning )
-    insertPoint = block->getFirstInsertionPt();
-  else
-    insertPoint = block->getTerminator();
-
-  DEBUG(errs() << "  Creating addition instruction.\n");
-  Value* newpn = BinaryOperator::Create(Instruction::Add,
-                                        node->getStartingPathNumber(),
-                                        addition, "pathNumber", insertPoint);
-
-  node->setEndingPathNumber(newpn);
-
-  if( atBeginning )
-    node->setStartingPathNumber(newpn);
-}
-
-// Creates a counter increment in the given node.  The Value* in node is
-// taken as the index into an array or hash table.  The hash table access
-// is a call to the runtime.
-void PathProfiler::insertCounterIncrement(Value* incValue,
-                                          BasicBlock::iterator insertPoint,
-                                          BLInstrumentationDag* dag,
-                                          bool increment) {
-  // Counter increment for array
-  if( dag->getNumberOfPaths() <= HASH_THRESHHOLD ) {
-    // Get pointer to the array location
-    std::vector<Value*> gepIndices(2);
-    gepIndices[0] = Constant::getNullValue(Type::getInt32Ty(*Context));
-    gepIndices[1] = incValue;
-
-    GetElementPtrInst* pcPointer =
-      GetElementPtrInst::Create(dag->getCounterArray(), gepIndices,
-                                "counterInc", insertPoint);
-
-    // Load from the array - call it oldPC
-    LoadInst* oldPc = new LoadInst(pcPointer, "oldPC", insertPoint);
-
-    // Test to see whether adding 1 will overflow the counter
-    ICmpInst* isMax = new ICmpInst(insertPoint, CmpInst::ICMP_ULT, oldPc,
-                                   createIncrementConstant(0xffffffff, 32),
-                                   "isMax");
-
-    // Select increment for the path counter based on overflow
-    SelectInst* inc =
-      SelectInst::Create( isMax, createIncrementConstant(increment?1:-1,32),
-                          createIncrementConstant(0,32),
-                          "pathInc", insertPoint);
-
-    // newPc = oldPc + inc
-    BinaryOperator* newPc = BinaryOperator::Create(Instruction::Add,
-                                                   oldPc, inc, "newPC",
-                                                   insertPoint);
-
-    // Store back in to the array
-    new StoreInst(newPc, pcPointer, insertPoint);
-  } else { // Counter increment for hash
-    std::vector<Value*> args(2);
-    args[0] = ConstantInt::get(Type::getInt32Ty(*Context),
-                               currentFunctionNumber);
-    args[1] = incValue;
-
-    CallInst::Create(
-      increment ? llvmIncrementHashFunction : llvmDecrementHashFunction,
-      args, "", insertPoint);
-  }
-}
-
-// Inserts instrumentation for the given edge
-//
-// Pre: The edge's source node has pathNumber set if edge is non zero
-// path number increment.
-//
-// Post: Edge's target node has a pathNumber set to the path number Value
-// corresponding to the value of the path register after edge's
-// execution.
-//
-// FIXME: This should be reworked so it's not recursive.
-void PathProfiler::insertInstrumentationStartingAt(BLInstrumentationEdge* edge,
-                                                   BLInstrumentationDag* dag) {
-  // Mark the edge as instrumented
-  edge->setHasInstrumentation(true);
-  DEBUG(dbgs() << "\nInstrumenting edge: " << (*edge) << "\n");
-
-  // create a new node for this edge's instrumentation
-  splitCritical(edge, dag);
-
-  BLInstrumentationNode* sourceNode = (BLInstrumentationNode*)edge->getSource();
-  BLInstrumentationNode* targetNode = (BLInstrumentationNode*)edge->getTarget();
-  BLInstrumentationNode* instrumentNode;
-  BLInstrumentationNode* nextSourceNode;
-
-  bool atBeginning = false;
-
-  // Source node has only 1 successor so any information can be simply
-  // inserted in to it without splitting
-  if( sourceNode->getBlock() && sourceNode->getNumberSuccEdges() <= 1) {
-    DEBUG(dbgs() << "  Potential instructions to be placed in: "
-          << sourceNode->getName() << " (at end)\n");
-    instrumentNode = sourceNode;
-    nextSourceNode = targetNode; // ... since we never made any new nodes
-  }
-
-  // The target node only has one predecessor, so we can safely insert edge
-  // instrumentation into it. If there was splitting, it must have been
-  // successful.
-  else if( targetNode->getNumberPredEdges() == 1 ) {
-    DEBUG(dbgs() << "  Potential instructions to be placed in: "
-          << targetNode->getName() << " (at beginning)\n");
-    pushValueIntoNode(sourceNode, targetNode);
-    instrumentNode = targetNode;
-    nextSourceNode = NULL; // ... otherwise we'll just keep splitting
-    atBeginning = true;
-  }
-
-  // Somehow, splitting must have failed.
-  else {
-    errs() << "Instrumenting could not split a critical edge.\n";
-    DEBUG(dbgs() << "  Couldn't split edge " << (*edge) << ".\n");
-    return;
-  }
-
-  // Insert instrumentation if this is a back or split edge
-  if( edge->getType() == BallLarusEdge::BACKEDGE ||
-      edge->getType() == BallLarusEdge::SPLITEDGE ) {
-    BLInstrumentationEdge* top =
-      (BLInstrumentationEdge*) edge->getPhonyRoot();
-    BLInstrumentationEdge* bottom =
-      (BLInstrumentationEdge*) edge->getPhonyExit();
-
-    assert( top->isInitialization() && " Top phony edge did not"
-            " contain a path number initialization.");
-    assert( bottom->isCounterIncrement() && " Bottom phony edge"
-            " did not contain a path counter increment.");
-
-    // split edge has yet to be initialized
-    if( !instrumentNode->getEndingPathNumber() ) {
-      instrumentNode->setStartingPathNumber(createIncrementConstant(0,32));
-      instrumentNode->setEndingPathNumber(createIncrementConstant(0,32));
-    }
-
-    BasicBlock::iterator insertPoint = atBeginning ?
-      instrumentNode->getBlock()->getFirstInsertionPt() :
-      instrumentNode->getBlock()->getTerminator();
-
-    // add information from the bottom edge, if it exists
-    if( bottom->getIncrement() ) {
-      Value* newpn =
-        BinaryOperator::Create(Instruction::Add,
-                               instrumentNode->getStartingPathNumber(),
-                               createIncrementConstant(bottom),
-                               "pathNumber", insertPoint);
-      instrumentNode->setEndingPathNumber(newpn);
-    }
-
-    insertCounterIncrement(instrumentNode->getEndingPathNumber(),
-                           insertPoint, dag);
-
-    if( atBeginning )
-      instrumentNode->setStartingPathNumber(createIncrementConstant(top));
-
-    instrumentNode->setEndingPathNumber(createIncrementConstant(top));
-
-    // Check for path counter increments
-    if( top->isCounterIncrement() ) {
-      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
-                             instrumentNode->getBlock()->getTerminator(),dag);
-      instrumentNode->setEndingPathNumber(0);
-    }
-  }
-
-  // Insert instrumentation if this is a normal edge
-  else {
-    BasicBlock::iterator insertPoint = atBeginning ?
-      instrumentNode->getBlock()->getFirstInsertionPt() :
-      instrumentNode->getBlock()->getTerminator();
-
-    if( edge->isInitialization() ) { // initialize path number
-      instrumentNode->setEndingPathNumber(createIncrementConstant(edge));
-    } else if( edge->getIncrement() )       {// increment path number
-      Value* newpn =
-        BinaryOperator::Create(Instruction::Add,
-                               instrumentNode->getStartingPathNumber(),
-                               createIncrementConstant(edge),
-                               "pathNumber", insertPoint);
-      instrumentNode->setEndingPathNumber(newpn);
-
-      if( atBeginning )
-        instrumentNode->setStartingPathNumber(newpn);
-    }
-
-    // Check for path counter increments
-    if( edge->isCounterIncrement() ) {
-      insertCounterIncrement(instrumentNode->getEndingPathNumber(),
-                             insertPoint, dag);
-      instrumentNode->setEndingPathNumber(0);
-    }
-  }
-
-  // Push it along
-  if (nextSourceNode && instrumentNode->getEndingPathNumber())
-    pushValueIntoNode(instrumentNode, nextSourceNode);
-
-  // Add all the successors
-  for( BLEdgeIterator next = targetNode->succBegin(),
-         end = targetNode->succEnd(); next != end; next++ ) {
-    // So long as it is un-instrumented, add it to the list
-    if( !((BLInstrumentationEdge*)(*next))->hasInstrumentation() )
-      insertInstrumentationStartingAt((BLInstrumentationEdge*)*next,dag);
-    else
-      DEBUG(dbgs() << "  Edge " << *(BLInstrumentationEdge*)(*next)
-            << " already instrumented.\n");
-  }
-}
-
-// Inserts instrumentation according to the marked edges in dag.  Phony edges
-// must be unlinked from the DAG, but accessible from the backedges.  Dag
-// must have initializations, path number increments, and counter increments
-// present.
-//
-// Counter storage is created here.
-void PathProfiler::insertInstrumentation(
-  BLInstrumentationDag& dag, Module &M) {
-
-  BLInstrumentationEdge* exitRootEdge =
-    (BLInstrumentationEdge*) dag.getExitRootEdge();
-  insertInstrumentationStartingAt(exitRootEdge, &dag);
-
-  // Iterate through each call edge and apply the appropriate hash increment
-  // and decrement functions
-  BLEdgeVector callEdges = dag.getCallPhonyEdges();
-  for( BLEdgeIterator edge = callEdges.begin(),
-         end = callEdges.end(); edge != end; edge++ ) {
-    BLInstrumentationNode* node =
-      (BLInstrumentationNode*)(*edge)->getSource();
-    BasicBlock::iterator insertPoint = node->getBlock()->getFirstInsertionPt();
-
-    // Find the first function call
-    while( ((Instruction&)(*insertPoint)).getOpcode() != Instruction::Call )
-      insertPoint++;
-
-    DEBUG(dbgs() << "\nInstrumenting method call block '"
-                 << node->getBlock()->getName() << "'\n");
-    DEBUG(dbgs() << "   Path number initialized: "
-                 << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n");
-
-    Value* newpn;
-    if( node->getStartingPathNumber() ) {
-      long inc = ((BLInstrumentationEdge*)(*edge))->getIncrement();
-      if ( inc )
-        newpn = BinaryOperator::Create(Instruction::Add,
-                                       node->getStartingPathNumber(),
-                                       createIncrementConstant(inc,32),
-                                       "pathNumber", insertPoint);
-      else
-        newpn = node->getStartingPathNumber();
-    } else {
-      newpn = (Value*)createIncrementConstant(
-        ((BLInstrumentationEdge*)(*edge))->getIncrement(), 32);
-    }
-
-    insertCounterIncrement(newpn, insertPoint, &dag);
-    insertCounterIncrement(newpn, node->getBlock()->getTerminator(),
-                           &dag, false);
-  }
-}
-
-// Entry point of the module
-void PathProfiler::runOnFunction(std::vector<Constant*> &ftInit,
-                                 Function &F, Module &M) {
-  // Build DAG from CFG
-  BLInstrumentationDag dag = BLInstrumentationDag(F);
-  dag.init();
-
-  // give each path a unique integer value
-  dag.calculatePathNumbers();
-
-  // modify path increments to increase the efficiency
-  // of instrumentation
-  dag.calculateSpanningTree();
-  dag.calculateChordIncrements();
-  dag.pushInitialization();
-  dag.pushCounters();
-  dag.unlinkPhony();
-
-  // potentially generate .dot graph for the dag
-  if (DotPathDag)
-    dag.generateDotGraph ();
-
-  // Should we store the information in an array or hash
-  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD ) {
-    Type* t = ArrayType::get(Type::getInt32Ty(*Context),
-                                   dag.getNumberOfPaths());
-
-    dag.setCounterArray(new GlobalVariable(M, t, false,
-                                           GlobalValue::InternalLinkage,
-                                           Constant::getNullValue(t), ""));
-  }
-
-  insertInstrumentation(dag, M);
-
-  // Add to global function reference table
-  unsigned type;
-  Type* voidPtr = TypeBuilder<types::i<8>*, true>::get(*Context);
-
-  if( dag.getNumberOfPaths() <= HASH_THRESHHOLD )
-    type = ProfilingArray;
-  else
-    type = ProfilingHash;
-
-  std::vector<Constant*> entryArray(3);
-  entryArray[0] = createIncrementConstant(type,32);
-  entryArray[1] = createIncrementConstant(dag.getNumberOfPaths(),32);
-  entryArray[2] = dag.getCounterArray() ?
-    ConstantExpr::getBitCast(dag.getCounterArray(), voidPtr) :
-    Constant::getNullValue(voidPtr);
-
-  StructType* at = ftEntryTypeBuilder::get(*Context);
-  ConstantStruct* functionEntry =
-    (ConstantStruct*)ConstantStruct::get(at, entryArray);
-  ftInit.push_back(functionEntry);
-}
-
-// Output the bitcode if we want to observe instrumentation changess
-#define PRINT_MODULE dbgs() <<                               \
-  "\n\n============= MODULE BEGIN ===============\n" << M << \
-  "\n============== MODULE END ================\n"
-
-bool PathProfiler::runOnModule(Module &M) {
-  Context = &M.getContext();
-
-  DEBUG(dbgs()
-        << "****************************************\n"
-        << "****************************************\n"
-        << "**                                    **\n"
-        << "**   PATH PROFILING INSTRUMENTATION   **\n"
-        << "**                                    **\n"
-        << "****************************************\n"
-        << "****************************************\n");
-
-  // No main, no instrumentation!
-  Function *Main = M.getFunction("main");
-
-  // Using fortran? ... this kind of works
-  if (!Main)
-    Main = M.getFunction("MAIN__");
-
-  if (!Main) {
-    errs() << "WARNING: cannot insert path profiling into a module"
-           << " with no main function!\n";
-    return false;
-  }
-
-  llvmIncrementHashFunction = M.getOrInsertFunction(
-    "llvm_increment_path_count",
-    Type::getVoidTy(*Context), // return type
-    Type::getInt32Ty(*Context), // function number
-    Type::getInt32Ty(*Context), // path number
-    NULL );
-
-  llvmDecrementHashFunction = M.getOrInsertFunction(
-    "llvm_decrement_path_count",
-    Type::getVoidTy(*Context), // return type
-    Type::getInt32Ty(*Context), // function number
-    Type::getInt32Ty(*Context), // path number
-    NULL );
-
-  std::vector<Constant*> ftInit;
-  unsigned functionNumber = 0;
-  for (Module::iterator F = M.begin(), E = M.end(); F != E; F++) {
-    if (F->isDeclaration())
-      continue;
-
-    DEBUG(dbgs() << "Function: " << F->getName() << "\n");
-    functionNumber++;
-
-    // set function number
-    currentFunctionNumber = functionNumber;
-    runOnFunction(ftInit, *F, M);
-  }
-
-  Type *t = ftEntryTypeBuilder::get(*Context);
-  ArrayType* ftArrayType = ArrayType::get(t, ftInit.size());
-  Constant* ftInitConstant = ConstantArray::get(ftArrayType, ftInit);
-
-  DEBUG(dbgs() << " ftArrayType:" << *ftArrayType << "\n");
-
-  GlobalVariable* functionTable =
-    new GlobalVariable(M, ftArrayType, false, GlobalValue::InternalLinkage,
-                       ftInitConstant, "functionPathTable");
-  Type *eltType = ftArrayType->getTypeAtIndex((unsigned)0);
-  InsertProfilingInitCall(Main, "llvm_start_path_profiling", functionTable,
-                          PointerType::getUnqual(eltType));
-
-  DEBUG(PRINT_MODULE);
-
-  return true;
-}
-
-// If this edge is a critical edge, then inserts a node at this edge.
-// This edge becomes the first edge, and a new BallLarusEdge is created.
-// Returns true if the edge was split
-bool PathProfiler::splitCritical(BLInstrumentationEdge* edge,
-                                 BLInstrumentationDag* dag) {
-  unsigned succNum = edge->getSuccessorNumber();
-  BallLarusNode* sourceNode = edge->getSource();
-  BallLarusNode* targetNode = edge->getTarget();
-  BasicBlock* sourceBlock = sourceNode->getBlock();
-  BasicBlock* targetBlock = targetNode->getBlock();
-
-  if(sourceBlock == NULL || targetBlock == NULL
-     || sourceNode->getNumberSuccEdges() <= 1
-     || targetNode->getNumberPredEdges() == 1 ) {
-    return(false);
-  }
-
-  TerminatorInst* terminator = sourceBlock->getTerminator();
-
-  if( SplitCriticalEdge(terminator, succNum, this, false)) {
-    BasicBlock* newBlock = terminator->getSuccessor(succNum);
-    dag->splitUpdate(edge, newBlock);
-    return(true);
-  } else
-    return(false);
-}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.cpp b/lib/Transforms/Instrumentation/ProfilingUtils.cpp
deleted file mode 100644
index 4b3de6d..0000000
--- a/lib/Transforms/Instrumentation/ProfilingUtils.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-//===- ProfilingUtils.cpp - Helper functions shared by profilers ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a few helper functions which are used by profile
-// instrumentation code to instrument the code.  This allows the profiler pass
-// to worry about *what* to insert, and these functions take care of *how* to do
-// it.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ProfilingUtils.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-
-void llvm::InsertProfilingInitCall(Function *MainFn, const char *FnName,
-                                   GlobalValue *Array,
-                                   PointerType *arrayType) {
-  LLVMContext &Context = MainFn->getContext();
-  Type *ArgVTy =
-    PointerType::getUnqual(Type::getInt8PtrTy(Context));
-  PointerType *UIntPtr = arrayType ? arrayType :
-    Type::getInt32PtrTy(Context);
-  Module &M = *MainFn->getParent();
-  Constant *InitFn = M.getOrInsertFunction(FnName, Type::getInt32Ty(Context),
-                                           Type::getInt32Ty(Context),
-                                           ArgVTy, UIntPtr,
-                                           Type::getInt32Ty(Context),
-                                           (Type *)0);
-
-  // This could force argc and argv into programs that wouldn't otherwise have
-  // them, but instead we just pass null values in.
-  std::vector<Value*> Args(4);
-  Args[0] = Constant::getNullValue(Type::getInt32Ty(Context));
-  Args[1] = Constant::getNullValue(ArgVTy);
-
-  // Skip over any allocas in the entry block.
-  BasicBlock *Entry = MainFn->begin();
-  BasicBlock::iterator InsertPos = Entry->begin();
-  while (isa<AllocaInst>(InsertPos)) ++InsertPos;
-
-  std::vector<Constant*> GEPIndices(2,
-                             Constant::getNullValue(Type::getInt32Ty(Context)));
-  unsigned NumElements = 0;
-  if (Array) {
-    Args[2] = ConstantExpr::getGetElementPtr(Array, GEPIndices);
-    NumElements =
-      cast<ArrayType>(Array->getType()->getElementType())->getNumElements();
-  } else {
-    // If this profiling instrumentation doesn't have a constant array, just
-    // pass null.
-    Args[2] = ConstantPointerNull::get(UIntPtr);
-  }
-  Args[3] = ConstantInt::get(Type::getInt32Ty(Context), NumElements);
-
-  CallInst *InitCall = CallInst::Create(InitFn, Args, "newargc", InsertPos);
-
-  // If argc or argv are not available in main, just pass null values in.
-  Function::arg_iterator AI;
-  switch (MainFn->arg_size()) {
-  default:
-  case 2:
-    AI = MainFn->arg_begin(); ++AI;
-    if (AI->getType() != ArgVTy) {
-      Instruction::CastOps opcode = CastInst::getCastOpcode(AI, false, ArgVTy,
-                                                            false);
-      InitCall->setArgOperand(1,
-          CastInst::Create(opcode, AI, ArgVTy, "argv.cast", InitCall));
-    } else {
-      InitCall->setArgOperand(1, AI);
-    }
-    /* FALL THROUGH */
-
-  case 1:
-    AI = MainFn->arg_begin();
-    // If the program looked at argc, have it look at the return value of the
-    // init call instead.
-    if (!AI->getType()->isIntegerTy(32)) {
-      Instruction::CastOps opcode;
-      if (!AI->use_empty()) {
-        opcode = CastInst::getCastOpcode(InitCall, true, AI->getType(), true);
-        AI->replaceAllUsesWith(
-          CastInst::Create(opcode, InitCall, AI->getType(), "", InsertPos));
-      }
-      opcode = CastInst::getCastOpcode(AI, true,
-                                       Type::getInt32Ty(Context), true);
-      InitCall->setArgOperand(0,
-          CastInst::Create(opcode, AI, Type::getInt32Ty(Context),
-                           "argc.cast", InitCall));
-    } else {
-      AI->replaceAllUsesWith(InitCall);
-      InitCall->setArgOperand(0, AI);
-    }
-
-  case 0: break;
-  }
-}
-
-void llvm::IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
-                                   GlobalValue *CounterArray, bool beginning) {
-  // Insert the increment after any alloca or PHI instructions...
-  BasicBlock::iterator InsertPos = beginning ? BB->getFirstInsertionPt() :
-                                   BB->getTerminator();
-  while (isa<AllocaInst>(InsertPos))
-    ++InsertPos;
-
-  LLVMContext &Context = BB->getContext();
-
-  // Create the getelementptr constant expression
-  std::vector<Constant*> Indices(2);
-  Indices[0] = Constant::getNullValue(Type::getInt32Ty(Context));
-  Indices[1] = ConstantInt::get(Type::getInt32Ty(Context), CounterNum);
-  Constant *ElementPtr =
-    ConstantExpr::getGetElementPtr(CounterArray, Indices);
-
-  // Load, increment and store the value back.
-  Value *OldVal = new LoadInst(ElementPtr, "OldFuncCounter", InsertPos);
-  Value *NewVal = BinaryOperator::Create(Instruction::Add, OldVal,
-                                 ConstantInt::get(Type::getInt32Ty(Context), 1),
-                                         "NewFuncCounter", InsertPos);
-  new StoreInst(NewVal, ElementPtr, InsertPos);
-}
-
-void llvm::InsertProfilingShutdownCall(Function *Callee, Module *Mod) {
-  // llvm.global_dtors is an array of type { i32, void ()* }. Prepare those
-  // types.
-  Type *GlobalDtorElems[2] = {
-    Type::getInt32Ty(Mod->getContext()),
-    FunctionType::get(Type::getVoidTy(Mod->getContext()), false)->getPointerTo()
-  };
-  StructType *GlobalDtorElemTy =
-      StructType::get(Mod->getContext(), GlobalDtorElems, false);
-
-  // Construct the new element we'll be adding.
-  Constant *Elem[2] = {
-    ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 65535),
-    ConstantExpr::getBitCast(Callee, GlobalDtorElems[1])
-  };
-
-  // If llvm.global_dtors exists, make a copy of the things in its list and
-  // delete it, to replace it with one that has a larger array type.
-  std::vector<Constant *> dtors;
-  if (GlobalVariable *GlobalDtors = Mod->getNamedGlobal("llvm.global_dtors")) {
-    if (ConstantArray *InitList =
-        dyn_cast<ConstantArray>(GlobalDtors->getInitializer())) {
-      for (unsigned i = 0, e = InitList->getType()->getNumElements();
-           i != e; ++i)
-        dtors.push_back(cast<Constant>(InitList->getOperand(i)));
-    }
-    GlobalDtors->eraseFromParent();
-  }
-
-  // Build up llvm.global_dtors with our new item in it.
-  GlobalVariable *GlobalDtors = new GlobalVariable(
-      *Mod, ArrayType::get(GlobalDtorElemTy, 1), false,
-      GlobalValue::AppendingLinkage, NULL, "llvm.global_dtors");
-                                    
-  dtors.push_back(ConstantStruct::get(GlobalDtorElemTy, Elem));
-  GlobalDtors->setInitializer(ConstantArray::get(
-      cast<ArrayType>(GlobalDtors->getType()->getElementType()), dtors));
-}
diff --git a/lib/Transforms/Instrumentation/ProfilingUtils.h b/lib/Transforms/Instrumentation/ProfilingUtils.h
deleted file mode 100644
index 09b2217..0000000
--- a/lib/Transforms/Instrumentation/ProfilingUtils.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//===- ProfilingUtils.h - Helper functions shared by profilers --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a few helper functions which are used by profile
-// instrumentation code to instrument the code.  This allows the profiler pass
-// to worry about *what* to insert, and these functions take care of *how* to do
-// it.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef PROFILINGUTILS_H
-#define PROFILINGUTILS_H
-
-namespace llvm {
-  class BasicBlock;
-  class Function;
-  class GlobalValue;
-  class Module;
-  class PointerType;
-
-  void InsertProfilingInitCall(Function *MainFn, const char *FnName,
-                               GlobalValue *Arr = 0,
-                               PointerType *arrayType = 0);
-  void IncrementCounterInBlock(BasicBlock *BB, unsigned CounterNum,
-                               GlobalValue *CounterArray,
-                               bool beginning = true);
-  void InsertProfilingShutdownCall(Function *Callee, Module *Mod);
-}
-
-#endif
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 299060a..89fb746 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -41,8 +41,8 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/BlackList.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/SpecialCaseList.h"
 
 using namespace llvm;
 
@@ -99,7 +99,7 @@ struct ThreadSanitizer : public FunctionPass {
   DataLayout *TD;
   Type *IntptrTy;
   SmallString<64> BlacklistFile;
-  OwningPtr<BlackList> BL;
+  OwningPtr<SpecialCaseList> BL;
   IntegerType *OrdTy;
   // Callbacks to run-time library are computed in doInitialization.
   Function *TsanFuncEntry;
@@ -227,7 +227,7 @@ bool ThreadSanitizer::doInitialization(Module &M) {
   TD = getAnalysisIfAvailable<DataLayout>();
   if (!TD)
     return false;
-  BL.reset(new BlackList(BlacklistFile));
+  BL.reset(SpecialCaseList::createOrDie(BlacklistFile));
 
   // Always insert a call to __tsan_init into the module's CTORs.
   IRBuilder<> IRB(M.getContext());
@@ -240,12 +240,8 @@ bool ThreadSanitizer::doInitialization(Module &M) {
 }
 
 static bool isVtableAccess(Instruction *I) {
-  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) {
-    if (Tag->getNumOperands() < 1) return false;
-    if (MDString *Tag1 = dyn_cast<MDString>(Tag->getOperand(0))) {
-      if (Tag1->getString() == "vtable pointer") return true;
-    }
-  }
+  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+    return Tag->isTBAAVtableAccess();
   return false;
 }
 
@@ -362,7 +358,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
   // (e.g. variables that do not escape, etc).
 
   // Instrument memory accesses.
-  if (ClInstrumentMemoryAccesses)
+  if (ClInstrumentMemoryAccesses && F.hasFnAttribute(Attribute::SanitizeThread))
     for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
       Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
     }
@@ -579,7 +575,7 @@ int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) {
     // Ignore all unusual sizes.
     return -1;
   }
-  size_t Idx = CountTrailingZeros_32(TypeSize / 8);
+  size_t Idx = countTrailingZeros(TypeSize / 8);
   assert(Idx < kNumberOfAccessSizes);
   return Idx;
 }
diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
new file mode 100644
index 0000000..4eac39d
--- /dev/null
+++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -0,0 +1,186 @@
+//===- ARCRuntimeEntryPoints.h - ObjC ARC Optimization --*- C++ -*---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file contains a class ARCRuntimeEntryPoints for use in
+/// creating/managing references to entry points to the arc objective c runtime.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
+#define LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
+
+#include "ObjCARC.h"
+
+namespace llvm {
+namespace objcarc {
+
+/// Declarations for ObjC runtime functions and constants. These are initialized
+/// lazily to avoid cluttering up the Module with unused declarations.
+class ARCRuntimeEntryPoints {
+public:
+  enum EntryPointType {
+    EPT_AutoreleaseRV,
+    EPT_Release,
+    EPT_Retain,
+    EPT_RetainBlock,
+    EPT_Autorelease,
+    EPT_StoreStrong,
+    EPT_RetainRV,
+    EPT_RetainAutorelease,
+    EPT_RetainAutoreleaseRV
+  };
+
+  ARCRuntimeEntryPoints() : TheModule(0),
+                            AutoreleaseRV(0),
+                            Release(0),
+                            Retain(0),
+                            RetainBlock(0),
+                            Autorelease(0),
+                            StoreStrong(0),
+                            RetainRV(0),
+                            RetainAutorelease(0),
+                            RetainAutoreleaseRV(0) { }
+
+  ~ARCRuntimeEntryPoints() { }
+
+  void Initialize(Module *M) {
+    TheModule = M;
+    AutoreleaseRV = 0;
+    Release = 0;
+    Retain = 0;
+    RetainBlock = 0;
+    Autorelease = 0;
+    StoreStrong = 0;
+    RetainRV = 0;
+    RetainAutorelease = 0;
+    RetainAutoreleaseRV = 0;
+  }
+
+  Constant *get(const EntryPointType entry) {
+    assert(TheModule != 0 && "Not initialized.");
+
+    switch (entry) {
+    case EPT_AutoreleaseRV:
+      return getI8XRetI8XEntryPoint(AutoreleaseRV,
+                                    "objc_autoreleaseReturnValue", true);
+    case EPT_Release:
+      return getVoidRetI8XEntryPoint(Release, "objc_release");
+    case EPT_Retain:
+      return getI8XRetI8XEntryPoint(Retain, "objc_retain", true);
+    case EPT_RetainBlock:
+      return getI8XRetI8XEntryPoint(RetainBlock, "objc_retainBlock", false);
+    case EPT_Autorelease:
+      return getI8XRetI8XEntryPoint(Autorelease, "objc_autorelease", true);
+    case EPT_StoreStrong:
+      return getI8XRetI8XXI8XEntryPoint(StoreStrong, "objc_storeStrong");
+    case EPT_RetainRV:
+      return getI8XRetI8XEntryPoint(RetainRV,
+                                    "objc_retainAutoreleasedReturnValue", true);
+    case EPT_RetainAutorelease:
+      return getI8XRetI8XEntryPoint(RetainAutorelease, "objc_retainAutorelease",
+                                    true);
+    case EPT_RetainAutoreleaseRV:
+      return getI8XRetI8XEntryPoint(RetainAutoreleaseRV,
+                                    "objc_retainAutoreleaseReturnValue", true);
+    }
+
+    llvm_unreachable("Switch should be a covered switch.");
+  }
+
+private:
+  /// Cached reference to the module which we will insert declarations into.
+  Module *TheModule;
+
+  /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+  Constant *AutoreleaseRV;
+  /// Declaration for ObjC runtime function objc_release.
+  Constant *Release;
+  /// Declaration for ObjC runtime function objc_retain.
+  Constant *Retain;
+  /// Declaration for ObjC runtime function objc_retainBlock.
+  Constant *RetainBlock;
+  /// Declaration for ObjC runtime function objc_autorelease.
+  Constant *Autorelease;
+  /// Declaration for objc_storeStrong().
+  Constant *StoreStrong;
+  /// Declaration for objc_retainAutoreleasedReturnValue().
+  Constant *RetainRV;
+  /// Declaration for objc_retainAutorelease().
+  Constant *RetainAutorelease;
+  /// Declaration for objc_retainAutoreleaseReturnValue().
+  Constant *RetainAutoreleaseRV;
+
+  Constant *getVoidRetI8XEntryPoint(Constant *&Decl,
+                                    const char *Name) {
+    if (Decl)
+      return Decl;
+
+    LLVMContext &C = TheModule->getContext();
+    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
+    AttributeSet Attr =
+      AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
+                                          /*isVarArg=*/false);
+    return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
+  }
+
+  Constant *getI8XRetI8XEntryPoint(Constant *& Decl,
+                                   const char *Name,
+                                   bool NoUnwind = false) {
+    if (Decl)
+      return Decl;
+
+    LLVMContext &C = TheModule->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *Params[] = { I8X };
+    FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false);
+    AttributeSet Attr = AttributeSet();
+
+    if (NoUnwind)
+      Attr = Attr.addAttribute(C, AttributeSet::FunctionIndex,
+                               Attribute::NoUnwind);
+
+    return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
+  }
+
+  Constant *getI8XRetI8XXI8XEntryPoint(Constant *&Decl,
+                                       const char *Name) {
+    if (Decl)
+      return Decl;
+
+    LLVMContext &C = TheModule->getContext();
+    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+    Type *I8XX = PointerType::getUnqual(I8X);
+    Type *Params[] = { I8XX, I8X };
+
+    AttributeSet Attr =
+      AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
+                                  Attribute::NoUnwind);
+    Attr = Attr.addAttribute(C, 1, Attribute::NoCapture);
+
+    FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
+                                          /*isVarArg=*/false);
+
+    return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
+  }
+
+}; // class ARCRuntimeEntryPoints
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_SCALAR_ARCRUNTIMEENTRYPOINTS_H
diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.h b/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 24d358b..617cdf3 100644
--- a/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -1,4 +1,4 @@
-//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h
index 39670f3..8044494 100644
--- a/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/lib/Transforms/ObjCARC/ObjCARC.h
@@ -1,4 +1,4 @@
-//===- ObjCARC.h - ObjC ARC Optimization --------------*- mode: c++ -*-----===//
+//===- ObjCARC.h - ObjC ARC Optimization --------------*- C++ -*-----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -286,7 +286,9 @@ static inline void EraseInstruction(Instruction *CI) {
 
   if (!Unused) {
     // Replace the return value with the argument.
-    assert(IsForwarding(GetBasicInstructionClass(CI)) &&
+    assert((IsForwarding(GetBasicInstructionClass(CI)) ||
+            (IsNoopOnNull(GetBasicInstructionClass(CI)) &&
+             isa<ConstantPointerNull>(OldArg))) &&
            "Can't delete non-forwarding instruction with users!");
     CI->replaceAllUsesWith(OldArg);
   }
diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
index 46b2de7..d18667b 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.cpp
@@ -1,4 +1,4 @@
-//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -*- mode: c++ -*---===//
+//===- ObjCARCAliasAnalysis.cpp - ObjC ARC Optimization -------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
index 7abe995..41ccfe2 100644
--- a/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
+++ b/lib/Transforms/ObjCARC/ObjCARCAliasAnalysis.h
@@ -1,4 +1,4 @@
-//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- mode: c++ -*-----===//
+//===- ObjCARCAliasAnalysis.h - ObjC ARC Optimization -*- C++ -*-----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index c43f4f4..9d80037 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -28,6 +28,7 @@
 
 #define DEBUG_TYPE "objc-arc-contract"
 #include "ObjCARC.h"
+#include "ARCRuntimeEntryPoints.h"
 #include "DependencyAnalysis.h"
 #include "ProvenanceAnalysis.h"
 #include "llvm/ADT/Statistic.h"
@@ -52,23 +53,11 @@ namespace {
     AliasAnalysis *AA;
     DominatorTree *DT;
     ProvenanceAnalysis PA;
+    ARCRuntimeEntryPoints EP;
 
     /// A flag indicating whether this optimization pass should run.
     bool Run;
 
-    /// Declarations for ObjC runtime functions, for use in creating calls to
-    /// them. These are initialized lazily to avoid cluttering up the Module
-    /// with unused declarations.
-
-    /// Declaration for objc_storeStrong().
-    Constant *StoreStrongCallee;
-    /// Declaration for objc_retainAutorelease().
-    Constant *RetainAutoreleaseCallee;
-    /// Declaration for objc_retainAutoreleaseReturnValue().
-    Constant *RetainAutoreleaseRVCallee;
-    /// Declaration for objc_retainAutoreleasedReturnValue().
-    Constant *RetainRVCallee;
-
     /// The inline asm string to insert between calls and RetainRV calls to make
     /// the optimization work on targets which need it.
     const MDString *RetainRVMarker;
@@ -78,11 +67,6 @@ namespace {
     /// "tail".
     SmallPtrSet<CallInst *, 8> StoreStrongCalls;
 
-    Constant *getStoreStrongCallee(Module *M);
-    Constant *getRetainRVCallee(Module *M);
-    Constant *getRetainAutoreleaseCallee(Module *M);
-    Constant *getRetainAutoreleaseRVCallee(Module *M);
-
     bool OptimizeRetainCall(Function &F, Instruction *Retain);
 
     bool ContractAutorelease(Function &F, Instruction *Autorelease,
@@ -125,74 +109,6 @@ void ObjCARCContract::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
 }
 
-Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
-  if (!StoreStrongCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *I8XX = PointerType::getUnqual(I8X);
-    Type *Params[] = { I8XX, I8X };
-
-    AttributeSet Attr = AttributeSet()
-      .addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                    Attribute::NoUnwind)
-      .addAttribute(M->getContext(), 1, Attribute::NoCapture);
-
-    StoreStrongCallee =
-      M->getOrInsertFunction(
-        "objc_storeStrong",
-        FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
-        Attr);
-  }
-  return StoreStrongCallee;
-}
-
-Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
-  if (!RetainAutoreleaseCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    RetainAutoreleaseCallee =
-      M->getOrInsertFunction("objc_retainAutorelease", FTy, Attribute);
-  }
-  return RetainAutoreleaseCallee;
-}
-
-Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
-  if (!RetainAutoreleaseRVCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    RetainAutoreleaseRVCallee =
-      M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
-                             Attribute);
-  }
-  return RetainAutoreleaseRVCallee;
-}
-
-Constant *ObjCARCContract::getRetainRVCallee(Module *M) {
-  if (!RetainRVCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    RetainRVCallee =
-      M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
-                             Attribute);
-  }
-  return RetainRVCallee;
-}
-
 /// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
 /// return value. We do this late so we do not disrupt the dataflow analysis in
 /// ObjCARCOpt.
@@ -222,7 +138,8 @@ ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) {
 
   // We do not have to worry about tail calls/does not throw since
   // retain/retainRV have the same properties.
-  cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent()));
+  Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_RetainRV);
+  cast<CallInst>(Retain)->setCalledFunction(Decl);
 
   DEBUG(dbgs() << "New: " << *Retain << "\n");
   return true;
@@ -272,10 +189,10 @@ ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease,
                   "                                      Old Retain: "
                << *Retain << "\n");
 
-  if (Class == IC_AutoreleaseRV)
-    Retain->setCalledFunction(getRetainAutoreleaseRVCallee(F.getParent()));
-  else
-    Retain->setCalledFunction(getRetainAutoreleaseCallee(F.getParent()));
+  Constant *Decl = EP.get(Class == IC_AutoreleaseRV ?
+                          ARCRuntimeEntryPoints::EPT_RetainAutoreleaseRV :
+                          ARCRuntimeEntryPoints::EPT_RetainAutorelease);
+  Retain->setCalledFunction(Decl);
 
   DEBUG(dbgs() << "                                      New Retain: "
                << *Retain << "\n");
@@ -356,9 +273,8 @@ void ObjCARCContract::ContractRelease(Instruction *Release,
     Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
   if (Args[1]->getType() != I8X)
     Args[1] = new BitCastInst(Args[1], I8X, "", Store);
-  CallInst *StoreStrong =
-    CallInst::Create(getStoreStrongCallee(BB->getParent()->getParent()),
-                     Args, "", Store);
+  Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_StoreStrong);
+  CallInst *StoreStrong = CallInst::Create(Decl, Args, "", Store);
   StoreStrong->setDoesNotThrow();
   StoreStrong->setDebugLoc(Store->getDebugLoc());
 
@@ -381,11 +297,7 @@ bool ObjCARCContract::doInitialization(Module &M) {
   if (!Run)
     return false;
 
-  // These are initialized lazily.
-  StoreStrongCallee = 0;
-  RetainAutoreleaseCallee = 0;
-  RetainAutoreleaseRVCallee = 0;
-  RetainRVCallee = 0;
+  EP.Initialize(&M);
 
   // Initialize RetainRVMarker.
   RetainRVMarker = 0;
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 43e2e20..2976df6 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -26,10 +26,12 @@
 
 #define DEBUG_TYPE "objc-arc-opts"
 #include "ObjCARC.h"
+#include "ARCRuntimeEntryPoints.h"
 #include "DependencyAnalysis.h"
 #include "ObjCARCAliasAnalysis.h"
 #include "ProvenanceAnalysis.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
@@ -107,6 +109,12 @@ namespace {
       return std::make_pair(Vector.begin() + Pair.first->second, false);
     }
 
+    iterator find(const KeyT &Key) {
+      typename MapTy::iterator It = Map.find(Key);
+      if (It == Map.end()) return Vector.end();
+      return Vector.begin() + It->second;
+    }
+
     const_iterator find(const KeyT &Key) const {
       typename MapTy::const_iterator It = Map.find(Key);
       if (It == Map.end()) return Vector.end();
@@ -168,91 +176,40 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
   return 0;
 }
 
-/// \brief Test whether the given retainable object pointer escapes.
-///
-/// This differs from regular escape analysis in that a use as an
-/// argument to a call is not considered an escape.
-///
-static bool DoesRetainableObjPtrEscape(const User *Ptr) {
-  DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Target: " << *Ptr << "\n");
-
-  // Walk the def-use chains.
+/// This is a wrapper around getUnderlyingObjCPtr along the lines of
+/// GetUnderlyingObjects except that it returns early when it sees the first
+/// alloca.
+static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V) {
+  SmallPtrSet<const Value *, 4> Visited;
   SmallVector<const Value *, 4> Worklist;
-  Worklist.push_back(Ptr);
-  // If Ptr has any operands add them as well.
-  for (User::const_op_iterator I = Ptr->op_begin(), E = Ptr->op_end(); I != E;
-       ++I) {
-    Worklist.push_back(*I);
-  }
-
-  // Ensure we do not visit any value twice.
-  SmallPtrSet<const Value *, 8> VisitedSet;
-
+  Worklist.push_back(V);
   do {
-    const Value *V = Worklist.pop_back_val();
+    const Value *P = Worklist.pop_back_val();
+    P = GetUnderlyingObjCPtr(P);
 
-    DEBUG(dbgs() << "Visiting: " << *V << "\n");
+    if (isa<AllocaInst>(P))
+      return true;
 
-    for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end();
-         UI != UE; ++UI) {
-      const User *UUser = *UI;
+    if (!Visited.insert(P))
+      continue;
 
-      DEBUG(dbgs() << "User: " << *UUser << "\n");
+    if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) {
+      Worklist.push_back(SI->getTrueValue());
+      Worklist.push_back(SI->getFalseValue());
+      continue;
+    }
 
-      // Special - Use by a call (callee or argument) is not considered
-      // to be an escape.
-      switch (GetBasicInstructionClass(UUser)) {
-      case IC_StoreWeak:
-      case IC_InitWeak:
-      case IC_StoreStrong:
-      case IC_Autorelease:
-      case IC_AutoreleaseRV: {
-        DEBUG(dbgs() << "User copies pointer arguments. Pointer Escapes!\n");
-        // These special functions make copies of their pointer arguments.
-        return true;
-      }
-      case IC_IntrinsicUser:
-        // Use by the use intrinsic is not an escape.
-        continue;
-      case IC_User:
-      case IC_None:
-        // Use by an instruction which copies the value is an escape if the
-        // result is an escape.
-        if (isa<BitCastInst>(UUser) || isa<GetElementPtrInst>(UUser) ||
-            isa<PHINode>(UUser) || isa<SelectInst>(UUser)) {
-
-          if (VisitedSet.insert(UUser)) {
-            DEBUG(dbgs() << "User copies value. Ptr escapes if result escapes."
-                  " Adding to list.\n");
-            Worklist.push_back(UUser);
-          } else {
-            DEBUG(dbgs() << "Already visited node.\n");
-          }
-          continue;
-        }
-        // Use by a load is not an escape.
-        if (isa<LoadInst>(UUser))
-          continue;
-        // Use by a store is not an escape if the use is the address.
-        if (const StoreInst *SI = dyn_cast<StoreInst>(UUser))
-          if (V != SI->getValueOperand())
-            continue;
-        break;
-      default:
-        // Regular calls and other stuff are not considered escapes.
-        continue;
-      }
-      // Otherwise, conservatively assume an escape.
-      DEBUG(dbgs() << "Assuming ptr escapes.\n");
-      return true;
+    if (const PHINode *PN = dyn_cast<const PHINode>(P)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+        Worklist.push_back(PN->getIncomingValue(i));
+      continue;
     }
   } while (!Worklist.empty());
 
-  // No escapes found.
-  DEBUG(dbgs() << "Ptr does not escape.\n");
   return false;
 }
 
+
 /// @}
 ///
 /// \defgroup ARCOpt ARC Optimization.
@@ -300,18 +257,18 @@ STATISTIC(NumNoops,       "Number of no-op objc calls eliminated");
 STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
 STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
 STATISTIC(NumRets,        "Number of return value forwarding "
-                          "retain+autoreleaes eliminated");
+                          "retain+autoreleases eliminated");
 STATISTIC(NumRRs,         "Number of retain+release paths eliminated");
 STATISTIC(NumPeeps,       "Number of calls peephole-optimized");
+#ifndef NDEBUG
 STATISTIC(NumRetainsBeforeOpt,
-          "Number of retains before optimization.");
+          "Number of retains before optimization");
 STATISTIC(NumReleasesBeforeOpt,
-          "Number of releases before optimization.");
-#ifndef NDEBUG
+          "Number of releases before optimization");
 STATISTIC(NumRetainsAfterOpt,
-          "Number of retains after optimization.");
+          "Number of retains after optimization");
 STATISTIC(NumReleasesAfterOpt,
-          "Number of releases after optimization.");
+          "Number of releases after optimization");
 #endif
 
 namespace {
@@ -414,14 +371,20 @@ namespace {
     /// sequence.
     SmallPtrSet<Instruction *, 2> ReverseInsertPts;
 
+    /// If this is true, we cannot perform code motion but can still remove
+    /// retain/release pairs.
+    bool CFGHazardAfflicted;
+
     RRInfo() :
-      KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {}
+      KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0),
+      CFGHazardAfflicted(false) {}
 
     void clear();
 
-    bool IsTrackingImpreciseReleases() {
-      return ReleaseMetadata != 0;
-    }
+    /// Conservatively merge the two RRInfo. Returns true if a partial merge has
+    /// occured, false otherwise.
+    bool Merge(const RRInfo &Other);
+
   };
 }
 
@@ -431,6 +394,30 @@ void RRInfo::clear() {
   ReleaseMetadata = 0;
   Calls.clear();
   ReverseInsertPts.clear();
+  CFGHazardAfflicted = false;
+}
+
+bool RRInfo::Merge(const RRInfo &Other) {
+    // Conservatively merge the ReleaseMetadata information.
+    if (ReleaseMetadata != Other.ReleaseMetadata)
+      ReleaseMetadata = 0;
+
+    // Conservatively merge the boolean state.
+    KnownSafe &= Other.KnownSafe;
+    IsTailCallRelease &= Other.IsTailCallRelease;
+    CFGHazardAfflicted |= Other.CFGHazardAfflicted;
+
+    // Merge the call sets.
+    Calls.insert(Other.Calls.begin(), Other.Calls.end());
+
+    // Merge the insert point sets. If there are any differences,
+    // that makes this a partial merge.
+    bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
+    for (SmallPtrSet<Instruction *, 2>::const_iterator
+         I = Other.ReverseInsertPts.begin(),
+         E = Other.ReverseInsertPts.end(); I != E; ++I)
+      Partial |= ReverseInsertPts.insert(*I);
+    return Partial;
 }
 
 namespace {
@@ -445,22 +432,59 @@ namespace {
     bool Partial;
 
     /// The current position in the sequence.
-    Sequence Seq : 8;
+    unsigned char Seq : 8;
 
-  public:
     /// Unidirectional information about the current sequence.
-    ///
-    /// TODO: Encapsulate this better.
     RRInfo RRI;
 
+  public:
     PtrState() : KnownPositiveRefCount(false), Partial(false),
                  Seq(S_None) {}
 
+
+    bool IsKnownSafe() const {
+      return RRI.KnownSafe;
+    }
+
+    void SetKnownSafe(const bool NewValue) {
+      RRI.KnownSafe = NewValue;
+    }
+
+    bool IsTailCallRelease() const {
+      return RRI.IsTailCallRelease;
+    }
+
+    void SetTailCallRelease(const bool NewValue) {
+      RRI.IsTailCallRelease = NewValue;
+    }
+
+    bool IsTrackingImpreciseReleases() const {
+      return RRI.ReleaseMetadata != 0;
+    }
+
+    const MDNode *GetReleaseMetadata() const {
+      return RRI.ReleaseMetadata;
+    }
+
+    void SetReleaseMetadata(MDNode *NewValue) {
+      RRI.ReleaseMetadata = NewValue;
+    }
+
+    bool IsCFGHazardAfflicted() const {
+      return RRI.CFGHazardAfflicted;
+    }
+
+    void SetCFGHazardAfflicted(const bool NewValue) {
+      RRI.CFGHazardAfflicted = NewValue;
+    }
+
     void SetKnownPositiveRefCount() {
+      DEBUG(dbgs() << "Setting Known Positive.\n");
       KnownPositiveRefCount = true;
     }
 
     void ClearKnownPositiveRefCount() {
+      DEBUG(dbgs() << "Clearing Known Positive.\n");
       KnownPositiveRefCount = false;
     }
 
@@ -474,7 +498,7 @@ namespace {
     }
 
     Sequence GetSeq() const {
-      return Seq;
+      return static_cast<Sequence>(Seq);
     }
 
     void ClearSequenceProgress() {
@@ -489,13 +513,34 @@ namespace {
     }
 
     void Merge(const PtrState &Other, bool TopDown);
+
+    void InsertCall(Instruction *I) {
+      RRI.Calls.insert(I);
+    }
+
+    void InsertReverseInsertPt(Instruction *I) {
+      RRI.ReverseInsertPts.insert(I);
+    }
+
+    void ClearReverseInsertPts() {
+      RRI.ReverseInsertPts.clear();
+    }
+
+    bool HasReverseInsertPts() const {
+      return !RRI.ReverseInsertPts.empty();
+    }
+
+    const RRInfo &GetRRInfo() const {
+      return RRI;
+    }
   };
 }
 
 void
 PtrState::Merge(const PtrState &Other, bool TopDown) {
-  Seq = MergeSeqs(Seq, Other.Seq, TopDown);
-  KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
+  Seq = MergeSeqs(static_cast<Sequence>(Seq), static_cast<Sequence>(Other.Seq),
+                  TopDown);
+  KnownPositiveRefCount &= Other.KnownPositiveRefCount;
 
   // If we're not in a sequence (anymore), drop all associated state.
   if (Seq == S_None) {
@@ -508,22 +553,11 @@ PtrState::Merge(const PtrState &Other, bool TopDown) {
     // mixing them is unsafe.
     ClearSequenceProgress();
   } else {
-    // Conservatively merge the ReleaseMetadata information.
-    if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata)
-      RRI.ReleaseMetadata = 0;
-
-    RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe;
-    RRI.IsTailCallRelease = RRI.IsTailCallRelease &&
-                            Other.RRI.IsTailCallRelease;
-    RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end());
-
-    // Merge the insert point sets. If there are any differences,
-    // that makes this a partial merge.
-    Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size();
-    for (SmallPtrSet<Instruction *, 2>::const_iterator
-         I = Other.RRI.ReverseInsertPts.begin(),
-         E = Other.RRI.ReverseInsertPts.end(); I != E; ++I)
-      Partial |= RRI.ReverseInsertPts.insert(*I);
+    // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
+    // point, we know that currently we are not partial. Stash whether or not
+    // the merge operation caused us to undergo a partial merging of reverse
+    // insertion points.
+    Partial = RRI.Merge(Other.RRI);
   }
 }
 
@@ -556,7 +590,9 @@ namespace {
     SmallVector<BasicBlock *, 2> Succs;
 
   public:
-    BBState() : TopDownPathCount(0), BottomUpPathCount(0) {}
+    static const unsigned OverflowOccurredValue;
+
+    BBState() : TopDownPathCount(0), BottomUpPathCount(0) { }
 
     typedef MapTy::iterator ptr_iterator;
     typedef MapTy::const_iterator ptr_const_iterator;
@@ -587,14 +623,26 @@ namespace {
     /// definition.
     void SetAsExit()  { BottomUpPathCount = 1; }
 
+    /// Attempt to find the PtrState object describing the top down state for
+    /// pointer Arg. Return a new initialized PtrState describing the top down
+    /// state for Arg if we do not find one.
     PtrState &getPtrTopDownState(const Value *Arg) {
       return PerPtrTopDown[Arg];
     }
 
+    /// Attempt to find the PtrState object describing the bottom up state for
+    /// pointer Arg. Return a new initialized PtrState describing the bottom up
+    /// state for Arg if we do not find one.
     PtrState &getPtrBottomUpState(const Value *Arg) {
       return PerPtrBottomUp[Arg];
     }
 
+    /// Attempt to find the PtrState object describing the bottom up state for
+    /// pointer Arg.
+    ptr_iterator findPtrBottomUpState(const Value *Arg) {
+      return PerPtrBottomUp.find(Arg);
+    }
+
     void clearBottomUpPointers() {
       PerPtrBottomUp.clear();
     }
@@ -608,27 +656,38 @@ namespace {
     void MergePred(const BBState &Other);
     void MergeSucc(const BBState &Other);
 
-    /// Return the number of possible unique paths from an entry to an exit
+    /// Compute the number of possible unique paths from an entry to an exit
     /// which pass through this block. This is only valid after both the
     /// top-down and bottom-up traversals are complete.
-    unsigned GetAllPathCount() const {
-      assert(TopDownPathCount != 0);
-      assert(BottomUpPathCount != 0);
-      return TopDownPathCount * BottomUpPathCount;
+    ///
+    /// Returns true if overflow occured. Returns false if overflow did not
+    /// occur.
+    bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
+      if (TopDownPathCount == OverflowOccurredValue ||
+          BottomUpPathCount == OverflowOccurredValue)
+        return true;
+      unsigned long long Product =
+        (unsigned long long)TopDownPathCount*BottomUpPathCount;
+      // Overflow occured if any of the upper bits of Product are set or if all
+      // the lower bits of Product are all set.
+      return (Product >> 32) ||
+             ((PathCount = Product) == OverflowOccurredValue);
     }
 
     // Specialized CFG utilities.
     typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator;
-    edge_iterator pred_begin() { return Preds.begin(); }
-    edge_iterator pred_end() { return Preds.end(); }
-    edge_iterator succ_begin() { return Succs.begin(); }
-    edge_iterator succ_end() { return Succs.end(); }
+    edge_iterator pred_begin() const { return Preds.begin(); }
+    edge_iterator pred_end() const { return Preds.end(); }
+    edge_iterator succ_begin() const { return Succs.begin(); }
+    edge_iterator succ_end() const { return Succs.end(); }
 
     void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
     void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
 
     bool isExit() const { return Succs.empty(); }
   };
+
+  const unsigned BBState::OverflowOccurredValue = 0xffffffff;
 }
 
 void BBState::InitFromPred(const BBState &Other) {
@@ -644,13 +703,25 @@ void BBState::InitFromSucc(const BBState &Other) {
 /// The top-down traversal uses this to merge information about predecessors to
 /// form the initial state for a new block.
 void BBState::MergePred(const BBState &Other) {
+  if (TopDownPathCount == OverflowOccurredValue)
+    return;
+
   // Other.TopDownPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   TopDownPathCount += Other.TopDownPathCount;
 
+  // In order to be consistent, we clear the top down pointers when by adding
+  // TopDownPathCount becomes OverflowOccurredValue even though "true" overflow
+  // has not occured.
+  if (TopDownPathCount == OverflowOccurredValue) {
+    clearTopDownPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
   if (TopDownPathCount < Other.TopDownPathCount) {
+    TopDownPathCount = OverflowOccurredValue;
     clearTopDownPointers();
     return;
   }
@@ -676,13 +747,25 @@ void BBState::MergePred(const BBState &Other) {
 /// The bottom-up traversal uses this to merge information about successors to
 /// form the initial state for a new block.
 void BBState::MergeSucc(const BBState &Other) {
+  if (BottomUpPathCount == OverflowOccurredValue)
+    return;
+
   // Other.BottomUpPathCount can be 0, in which case it is either dead or a
   // loop backedge. Loop backedges are special.
   BottomUpPathCount += Other.BottomUpPathCount;
 
+  // In order to be consistent, we clear the top down pointers when by adding
+  // BottomUpPathCount becomes OverflowOccurredValue even though "true" overflow
+  // has not occured.
+  if (BottomUpPathCount == OverflowOccurredValue) {
+    clearBottomUpPointers();
+    return;
+  }
+
   // Check for overflow. If we have overflow, fall back to conservative
   // behavior.
   if (BottomUpPathCount < Other.BottomUpPathCount) {
+    BottomUpPathCount = OverflowOccurredValue;
     clearBottomUpPointers();
     return;
   }
@@ -991,25 +1074,14 @@ namespace {
   class ObjCARCOpt : public FunctionPass {
     bool Changed;
     ProvenanceAnalysis PA;
+    ARCRuntimeEntryPoints EP;
+
+    // This is used to track if a pointer is stored into an alloca.
+    DenseSet<const Value *> MultiOwnersSet;
 
     /// A flag indicating whether this optimization pass should run.
     bool Run;
 
-    /// Declarations for ObjC runtime functions, for use in creating calls to
-    /// them. These are initialized lazily to avoid cluttering up the Module
-    /// with unused declarations.
-
-    /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
-    Constant *AutoreleaseRVCallee;
-    /// Declaration for ObjC runtime function objc_release.
-    Constant *ReleaseCallee;
-    /// Declaration for ObjC runtime function objc_retain.
-    Constant *RetainCallee;
-    /// Declaration for ObjC runtime function objc_retainBlock.
-    Constant *RetainBlockCallee;
-    /// Declaration for ObjC runtime function objc_autorelease.
-    Constant *AutoreleaseCallee;
-
     /// Flags which determine whether each of the interesting runtine functions
     /// is in fact used in the current function.
     unsigned UsedInThisFunction;
@@ -1032,19 +1104,9 @@ namespace {
     unsigned ARCAnnotationProvenanceSourceMDKind;
 #endif // ARC_ANNOATIONS
 
-    Constant *getAutoreleaseRVCallee(Module *M);
-    Constant *getReleaseCallee(Module *M);
-    Constant *getRetainCallee(Module *M);
-    Constant *getRetainBlockCallee(Module *M);
-    Constant *getAutoreleaseCallee(Module *M);
-
-    bool IsRetainBlockOptimizable(const Instruction *Inst);
-
     bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
     void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
                                    InstructionClass &Class);
-    bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock,
-                                 InstructionClass &Class);
     void OptimizeIndividualCalls(Function &F);
 
     void CheckForCFGHazards(const BasicBlock *BB,
@@ -1078,9 +1140,9 @@ namespace {
                                MapVector<Value *, RRInfo> &Retains,
                                DenseMap<Value *, RRInfo> &Releases,
                                Module *M,
-                               SmallVector<Instruction *, 4> &NewRetains,
-                               SmallVector<Instruction *, 4> &NewReleases,
-                               SmallVector<Instruction *, 8> &DeadInsts,
+                               SmallVectorImpl<Instruction *> &NewRetains,
+                               SmallVectorImpl<Instruction *> &NewReleases,
+                               SmallVectorImpl<Instruction *> &DeadInsts,
                                RRInfo &RetainsToMove,
                                RRInfo &ReleasesToMove,
                                Value *Arg,
@@ -1133,101 +1195,6 @@ void ObjCARCOpt::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
 }
 
-bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) {
-  // Without the magic metadata tag, we have to assume this might be an
-  // objc_retainBlock call inserted to convert a block pointer to an id,
-  // in which case it really is needed.
-  if (!Inst->getMetadata(CopyOnEscapeMDKind))
-    return false;
-
-  // If the pointer "escapes" (not including being used in a call),
-  // the copy may be needed.
-  if (DoesRetainableObjPtrEscape(Inst))
-    return false;
-
-  // Otherwise, it's not needed.
-  return true;
-}
-
-Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
-  if (!AutoreleaseRVCallee) {
-    LLVMContext &C = M->getContext();
-    Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
-    Type *Params[] = { I8X };
-    FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    AutoreleaseRVCallee =
-      M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
-                             Attribute);
-  }
-  return AutoreleaseRVCallee;
-}
-
-Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
-  if (!ReleaseCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    ReleaseCallee =
-      M->getOrInsertFunction(
-        "objc_release",
-        FunctionType::get(Type::getVoidTy(C), Params, /*isVarArg=*/false),
-        Attribute);
-  }
-  return ReleaseCallee;
-}
-
-Constant *ObjCARCOpt::getRetainCallee(Module *M) {
-  if (!RetainCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    RetainCallee =
-      M->getOrInsertFunction(
-        "objc_retain",
-        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
-        Attribute);
-  }
-  return RetainCallee;
-}
-
-Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
-  if (!RetainBlockCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    // objc_retainBlock is not nounwind because it calls user copy constructors
-    // which could theoretically throw.
-    RetainBlockCallee =
-      M->getOrInsertFunction(
-        "objc_retainBlock",
-        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
-        AttributeSet());
-  }
-  return RetainBlockCallee;
-}
-
-Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
-  if (!AutoreleaseCallee) {
-    LLVMContext &C = M->getContext();
-    Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeSet Attribute =
-      AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
-    AutoreleaseCallee =
-      M->getOrInsertFunction(
-        "objc_autorelease",
-        FunctionType::get(Params[0], Params, /*isVarArg=*/false),
-        Attribute);
-  }
-  return AutoreleaseCallee;
-}
-
 /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
 /// not a return value.  Or, if it can be paired with an
 /// objc_autoreleaseReturnValue, delete the pair and return true.
@@ -1281,7 +1248,8 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
                   "objc_retain since the operand is not a return value.\n"
                   "Old = " << *RetainRV << "\n");
 
-  cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent()));
+  Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+  cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
 
   DEBUG(dbgs() << "New = " << *RetainRV << "\n");
 
@@ -1318,8 +1286,8 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
                   "Old = " << *AutoreleaseRV << "\n");
 
   CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
-  AutoreleaseRVCI->
-    setCalledFunction(getAutoreleaseCallee(F.getParent()));
+  Constant *NewDecl = EP.get(ARCRuntimeEntryPoints::EPT_Autorelease);
+  AutoreleaseRVCI->setCalledFunction(NewDecl);
   AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
   Class = IC_Autorelease;
 
@@ -1327,40 +1295,6 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
 
 }
 
-// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain
-// calls.
-//
-// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and
-// does not escape (following the rules of block escaping), strength reduce the
-// objc_retainBlock to an objc_retain.
-//
-// TODO: If an objc_retainBlock call is dominated period by a previous
-// objc_retainBlock call, strength reduce the objc_retainBlock to an
-// objc_retain.
-bool
-ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst,
-                                    InstructionClass &Class) {
-  assert(GetBasicInstructionClass(Inst) == Class);
-  assert(IC_RetainBlock == Class);
-
-  // If we can not optimize Inst, return false.
-  if (!IsRetainBlockOptimizable(Inst))
-    return false;
-
-  Changed = true;
-  ++NumPeeps;
-
-  DEBUG(dbgs() << "Strength reduced retainBlock => retain.\n");
-  DEBUG(dbgs() << "Old: " << *Inst << "\n");
-  CallInst *RetainBlock = cast<CallInst>(Inst);
-  RetainBlock->setCalledFunction(getRetainCallee(F.getParent()));
-  // Remove copy_on_escape metadata.
-  RetainBlock->setMetadata(CopyOnEscapeMDKind, 0);
-  Class = IC_Retain;
-  DEBUG(dbgs() << "New: " << *Inst << "\n");
-  return true;
-}
-
 /// Visit each call, one at a time, and make simplifications without doing any
 /// additional analysis.
 void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
@@ -1437,15 +1371,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
       }
       break;
     }
-    case IC_RetainBlock:
-      // If we strength reduce an objc_retainBlock to an objc_retain, continue
-      // onto the objc_retain peephole optimizations. Otherwise break.
-      if (!OptimizeRetainBlockCall(F, Inst, Class))
-        break;
-      // FALLTHROUGH
-    case IC_Retain:
-      ++NumRetainsBeforeOpt;
-      break;
     case IC_RetainRV:
       if (OptimizeRetainRVCall(F, Inst))
         continue;
@@ -1453,9 +1378,6 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
     case IC_AutoreleaseRV:
       OptimizeAutoreleaseRVCall(F, Inst, Class);
       break;
-    case IC_Release:
-      ++NumReleasesBeforeOpt;
-      break;
     }
 
     // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
@@ -1469,9 +1391,10 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
 
         // Create the declaration lazily.
         LLVMContext &C = Inst->getContext();
-        CallInst *NewCall =
-          CallInst::Create(getReleaseCallee(F.getParent()),
-                           Call->getArgOperand(0), "", Call);
+
+        Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release);
+        CallInst *NewCall = CallInst::Create(Decl, Call->getArgOperand(0), "",
+                                             Call);
         NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None));
 
         DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
@@ -1639,13 +1562,15 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq,
                                  PtrState &S,
                                  bool &SomeSuccHasSame,
                                  bool &AllSuccsHaveSame,
+                                 bool &NotAllSeqEqualButKnownSafe,
                                  bool &ShouldContinue) {
   switch (SuccSSeq) {
   case S_CanRelease: {
-    if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) {
+    if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) {
       S.ClearSequenceProgress();
       break;
     }
+    S.SetCFGHazardAfflicted(true);
     ShouldContinue = true;
     break;
   }
@@ -1655,8 +1580,10 @@ static void CheckForUseCFGHazard(const Sequence SuccSSeq,
   case S_Stop:
   case S_Release:
   case S_MovableRelease:
-    if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+    if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
       AllSuccsHaveSame = false;
+    else
+      NotAllSeqEqualButKnownSafe = true;
     break;
   case S_Retain:
     llvm_unreachable("bottom-up pointer in retain state!");
@@ -1672,7 +1599,8 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
                                         const bool SuccSRRIKnownSafe,
                                         PtrState &S,
                                         bool &SomeSuccHasSame,
-                                        bool &AllSuccsHaveSame) {
+                                        bool &AllSuccsHaveSame,
+                                        bool &NotAllSeqEqualButKnownSafe) {
   switch (SuccSSeq) {
   case S_CanRelease:
     SomeSuccHasSame = true;
@@ -1681,8 +1609,10 @@ static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
   case S_Release:
   case S_MovableRelease:
   case S_Use:
-    if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe)
+    if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
       AllSuccsHaveSame = false;
+    else
+      NotAllSeqEqualButKnownSafe = true;
     break;
   case S_Retain:
     llvm_unreachable("bottom-up pointer in retain state!");
@@ -1718,6 +1648,7 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
     const TerminatorInst *TI = cast<TerminatorInst>(&BB->back());
     bool SomeSuccHasSame = false;
     bool AllSuccsHaveSame = true;
+    bool NotAllSeqEqualButKnownSafe = false;
 
     succ_const_iterator SI(TI), SE(TI, false);
 
@@ -1742,24 +1673,24 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
 
       // If we have S_Use or S_CanRelease, perform our check for cfg hazard
       // checks.
-      const bool SuccSRRIKnownSafe = SuccS.RRI.KnownSafe;
+      const bool SuccSRRIKnownSafe = SuccS.IsKnownSafe();
 
       // *NOTE* We do not use Seq from above here since we are allowing for
       // S.GetSeq() to change while we are visiting basic blocks.
       switch(S.GetSeq()) {
       case S_Use: {
         bool ShouldContinue = false;
-        CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
-                             SomeSuccHasSame, AllSuccsHaveSame,
+        CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame,
+                             AllSuccsHaveSame, NotAllSeqEqualButKnownSafe,
                              ShouldContinue);
         if (ShouldContinue)
           continue;
         break;
       }
       case S_CanRelease: {
-        CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe,
-                                    S, SomeSuccHasSame,
-                                    AllSuccsHaveSame);
+        CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
+                                    SomeSuccHasSame, AllSuccsHaveSame,
+                                    NotAllSeqEqualButKnownSafe);
         break;
       }
       case S_Retain:
@@ -1774,8 +1705,15 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
     // If the state at the other end of any of the successor edges
     // matches the current state, require all edges to match. This
     // guards against loops in the middle of a sequence.
-    if (SomeSuccHasSame && !AllSuccsHaveSame)
+    if (SomeSuccHasSame && !AllSuccsHaveSame) {
       S.ClearSequenceProgress();
+    } else if (NotAllSeqEqualButKnownSafe) {
+      // If we would have cleared the state foregoing the fact that we are known
+      // safe, stop code motion. This is because whether or not it is safe to
+      // remove RR pairs via KnownSafe is an orthogonal concept to whether we
+      // are allowed to perform code motion.
+      S.SetCFGHazardAfflicted(true);
+    }
   }
 }
 
@@ -1812,10 +1750,10 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
     ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq);
     S.ResetSequenceProgress(NewSeq);
-    S.RRI.ReleaseMetadata = ReleaseMetadata;
-    S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
-    S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
-    S.RRI.Calls.insert(Inst);
+    S.SetReleaseMetadata(ReleaseMetadata);
+    S.SetKnownSafe(S.HasKnownPositiveRefCount());
+    S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall());
+    S.InsertCall(Inst);
     S.SetKnownPositiveRefCount();
     break;
   }
@@ -1839,14 +1777,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
     case S_Use:
       // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
       // imprecise release, clear our reverse insertion points.
-      if (OldSeq != S_Use || S.RRI.IsTrackingImpreciseReleases())
-        S.RRI.ReverseInsertPts.clear();
+      if (OldSeq != S_Use || S.IsTrackingImpreciseReleases())
+        S.ClearReverseInsertPts();
       // FALL THROUGH
     case S_CanRelease:
       // Don't do retain+release tracking for IC_RetainRV, because it's
       // better to let it remain as the first instruction after a call.
       if (Class != IC_RetainRV)
-        Retains[Inst] = S.RRI;
+        Retains[Inst] = S.GetRRInfo();
       S.ClearSequenceProgress();
       break;
     case S_None:
@@ -1866,6 +1804,28 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
   case IC_None:
     // These are irrelevant.
     return NestingDetected;
+  case IC_User:
+    // If we have a store into an alloca of a pointer we are tracking, the
+    // pointer has multiple owners implying that we must be more conservative.
+    //
+    // This comes up in the context of a pointer being ``KnownSafe''. In the
+    // presense of a block being initialized, the frontend will emit the
+    // objc_retain on the original pointer and the release on the pointer loaded
+    // from the alloca. The optimizer will through the provenance analysis
+    // realize that the two are related, but since we only require KnownSafe in
+    // one direction, will match the inner retain on the original pointer with
+    // the guard release on the original pointer. This is fixed by ensuring that
+    // in the presense of allocas we only unconditionally remove pointers if
+    // both our retain and our release are KnownSafe.
+    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+      if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand())) {
+        BBState::ptr_iterator I = MyStates.findPtrBottomUpState(
+          StripPointerCastsAndObjCCalls(SI->getValueOperand()));
+        if (I != MyStates.bottom_up_ptr_end())
+          MultiOwnersSet.insert(I->first);
+      }
+    }
+    break;
   default:
     break;
   }
@@ -1908,14 +1868,14 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
       if (CanUse(Inst, Ptr, PA, Class)) {
         DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr
               << "\n");
-        assert(S.RRI.ReverseInsertPts.empty());
+        assert(!S.HasReverseInsertPts());
         // If this is an invoke instruction, we're scanning it as part of
         // one of its successor blocks, since we can't insert code after it
         // in its own block, and we don't want to split critical edges.
         if (isa<InvokeInst>(Inst))
-          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+          S.InsertReverseInsertPt(BB->getFirstInsertionPt());
         else
-          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+          S.InsertReverseInsertPt(llvm::next(BasicBlock::iterator(Inst)));
         S.SetSeq(S_Use);
         ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use);
       } else if (Seq == S_Release && IsUser(Class)) {
@@ -1924,12 +1884,12 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
         // Non-movable releases depend on any possible objc pointer use.
         S.SetSeq(S_Stop);
         ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop);
-        assert(S.RRI.ReverseInsertPts.empty());
+        assert(!S.HasReverseInsertPts());
         // As above; handle invoke specially.
         if (isa<InvokeInst>(Inst))
-          S.RRI.ReverseInsertPts.insert(BB->getFirstInsertionPt());
+          S.InsertReverseInsertPt(BB->getFirstInsertionPt());
         else
-          S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst)));
+          S.InsertReverseInsertPt(llvm::next(BasicBlock::iterator(Inst)));
       }
       break;
     case S_Stop:
@@ -2049,8 +2009,8 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
 
       ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain);
       S.ResetSequenceProgress(S_Retain);
-      S.RRI.KnownSafe = S.HasKnownPositiveRefCount();
-      S.RRI.Calls.insert(Inst);
+      S.SetKnownSafe(S.HasKnownPositiveRefCount());
+      S.InsertCall(Inst);
     }
 
     S.SetKnownPositiveRefCount();
@@ -2073,12 +2033,12 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
     case S_Retain:
     case S_CanRelease:
       if (OldSeq == S_Retain || ReleaseMetadata != 0)
-        S.RRI.ReverseInsertPts.clear();
+        S.ClearReverseInsertPts();
       // FALL THROUGH
     case S_Use:
-      S.RRI.ReleaseMetadata = ReleaseMetadata;
-      S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
-      Releases[Inst] = S.RRI;
+      S.SetReleaseMetadata(ReleaseMetadata);
+      S.SetTailCallRelease(cast<CallInst>(Inst)->isTailCall());
+      Releases[Inst] = S.GetRRInfo();
       ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None);
       S.ClearSequenceProgress();
       break;
@@ -2122,8 +2082,8 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
       case S_Retain:
         S.SetSeq(S_CanRelease);
         ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease);
-        assert(S.RRI.ReverseInsertPts.empty());
-        S.RRI.ReverseInsertPts.insert(Inst);
+        assert(!S.HasReverseInsertPts());
+        S.InsertReverseInsertPt(Inst);
 
         // One call can't cause a transition from S_Retain to S_CanRelease
         // and S_CanRelease to S_Use. If we've made the first transition,
@@ -2350,8 +2310,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
     Instruction *InsertPt = *PI;
     Value *MyArg = ArgTy == ParamTy ? Arg :
                    new BitCastInst(Arg, ParamTy, "", InsertPt);
-    CallInst *Call =
-      CallInst::Create(getRetainCallee(M), MyArg, "", InsertPt);
+    Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+    CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
     Call->setDoesNotThrow();
     Call->setTailCall();
 
@@ -2364,8 +2324,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg,
     Instruction *InsertPt = *PI;
     Value *MyArg = ArgTy == ParamTy ? Arg :
                    new BitCastInst(Arg, ParamTy, "", InsertPt);
-    CallInst *Call = CallInst::Create(getReleaseCallee(M), MyArg,
-                                      "", InsertPt);
+    Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Release);
+    CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
     // Attach a clang.imprecise_release metadata tag, if appropriate.
     if (MDNode *M = ReleasesToMove.ReleaseMetadata)
       Call->setMetadata(ImpreciseReleaseMDKind, M);
@@ -2403,17 +2363,20 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
                                   MapVector<Value *, RRInfo> &Retains,
                                   DenseMap<Value *, RRInfo> &Releases,
                                   Module *M,
-                                  SmallVector<Instruction *, 4> &NewRetains,
-                                  SmallVector<Instruction *, 4> &NewReleases,
-                                  SmallVector<Instruction *, 8> &DeadInsts,
+                                  SmallVectorImpl<Instruction *> &NewRetains,
+                                  SmallVectorImpl<Instruction *> &NewReleases,
+                                  SmallVectorImpl<Instruction *> &DeadInsts,
                                   RRInfo &RetainsToMove,
                                   RRInfo &ReleasesToMove,
                                   Value *Arg,
                                   bool KnownSafe,
                                   bool &AnyPairsCompletelyEliminated) {
   // If a pair happens in a region where it is known that the reference count
-  // is already incremented, we can similarly ignore possible decrements.
+  // is already incremented, we can similarly ignore possible decrements unless
+  // we are dealing with a retainable object with multiple provenance sources.
   bool KnownSafeTD = true, KnownSafeBU = true;
+  bool MultipleOwners = false;
+  bool CFGHazardAfflicted = false;
 
   // Connect the dots between the top-down-collected RetainsToMove and
   // bottom-up-collected ReleasesToMove to form sets of related calls.
@@ -2432,6 +2395,8 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
       assert(It != Retains.end());
       const RRInfo &NewRetainRRI = It->second;
       KnownSafeTD &= NewRetainRRI.KnownSafe;
+      MultipleOwners =
+        MultipleOwners || MultiOwnersSet.count(GetObjCArg(NewRetain));
       for (SmallPtrSet<Instruction *, 2>::const_iterator
              LI = NewRetainRRI.Calls.begin(),
              LE = NewRetainRRI.Calls.end(); LI != LE; ++LI) {
@@ -2441,10 +2406,27 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
         if (Jt == Releases.end())
           return false;
         const RRInfo &NewRetainReleaseRRI = Jt->second;
-        assert(NewRetainReleaseRRI.Calls.count(NewRetain));
+
+        // If the release does not have a reference to the retain as well,
+        // something happened which is unaccounted for. Do not do anything.
+        //
+        // This can happen if we catch an additive overflow during path count
+        // merging.
+        if (!NewRetainReleaseRRI.Calls.count(NewRetain))
+          return false;
+
         if (ReleasesToMove.Calls.insert(NewRetainRelease)) {
-          OldDelta -=
-            BBStates[NewRetainRelease->getParent()].GetAllPathCount();
+
+          // If we overflow when we compute the path count, don't remove/move
+          // anything.
+          const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
+          unsigned PathCount = BBState::OverflowOccurredValue;
+          if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+            return false;
+          assert(PathCount != BBState::OverflowOccurredValue &&
+                 "PathCount at this point can not be "
+                 "OverflowOccurredValue.");
+          OldDelta -= PathCount;
 
           // Merge the ReleaseMetadata and IsTailCallRelease values.
           if (FirstRelease) {
@@ -2469,8 +2451,18 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
                    RE = NewRetainReleaseRRI.ReverseInsertPts.end();
                  RI != RE; ++RI) {
               Instruction *RIP = *RI;
-              if (ReleasesToMove.ReverseInsertPts.insert(RIP))
-                NewDelta -= BBStates[RIP->getParent()].GetAllPathCount();
+              if (ReleasesToMove.ReverseInsertPts.insert(RIP)) {
+                // If we overflow when we compute the path count, don't
+                // remove/move anything.
+                const BBState &RIPBBState = BBStates[RIP->getParent()];
+                PathCount = BBState::OverflowOccurredValue;
+                if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+                  return false;
+                assert(PathCount != BBState::OverflowOccurredValue &&
+                       "PathCount at this point can not be "
+                       "OverflowOccurredValue.");
+                NewDelta -= PathCount;
+              }
             }
           NewReleases.push_back(NewRetainRelease);
         }
@@ -2488,6 +2480,7 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
       assert(It != Releases.end());
       const RRInfo &NewReleaseRRI = It->second;
       KnownSafeBU &= NewReleaseRRI.KnownSafe;
+      CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
       for (SmallPtrSet<Instruction *, 2>::const_iterator
              LI = NewReleaseRRI.Calls.begin(),
              LE = NewReleaseRRI.Calls.end(); LI != LE; ++LI) {
@@ -2497,10 +2490,25 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
         if (Jt == Retains.end())
           return false;
         const RRInfo &NewReleaseRetainRRI = Jt->second;
-        assert(NewReleaseRetainRRI.Calls.count(NewRelease));
+
+        // If the retain does not have a reference to the release as well,
+        // something happened which is unaccounted for. Do not do anything.
+        //
+        // This can happen if we catch an additive overflow during path count
+        // merging.
+        if (!NewReleaseRetainRRI.Calls.count(NewRelease))
+          return false;
+
         if (RetainsToMove.Calls.insert(NewReleaseRetain)) {
-          unsigned PathCount =
-            BBStates[NewReleaseRetain->getParent()].GetAllPathCount();
+          // If we overflow when we compute the path count, don't remove/move
+          // anything.
+          const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
+          unsigned PathCount = BBState::OverflowOccurredValue;
+          if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+            return false;
+          assert(PathCount != BBState::OverflowOccurredValue &&
+                 "PathCount at this point can not be "
+                 "OverflowOccurredValue.");
           OldDelta += PathCount;
           OldCount += PathCount;
 
@@ -2512,7 +2520,16 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
                  RI != RE; ++RI) {
               Instruction *RIP = *RI;
               if (RetainsToMove.ReverseInsertPts.insert(RIP)) {
-                PathCount = BBStates[RIP->getParent()].GetAllPathCount();
+                // If we overflow when we compute the path count, don't
+                // remove/move anything.
+                const BBState &RIPBBState = BBStates[RIP->getParent()];
+
+                PathCount = BBState::OverflowOccurredValue;
+                if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+                  return false;
+                assert(PathCount != BBState::OverflowOccurredValue &&
+                       "PathCount at this point can not be "
+                       "OverflowOccurredValue.");
                 NewDelta += PathCount;
                 NewCount += PathCount;
               }
@@ -2525,9 +2542,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
     if (NewRetains.empty()) break;
   }
 
-  // If the pointer is known incremented or nested, we can safely delete the
-  // pair regardless of what's between them.
-  if (KnownSafeTD || KnownSafeBU) {
+  // If the pointer is known incremented in 1 direction and we do not have
+  // MultipleOwners, we can safely remove the retain/releases. Otherwise we need
+  // to be known safe in both directions.
+  bool UnconditionallySafe = (KnownSafeTD && KnownSafeBU) ||
+    ((KnownSafeTD || KnownSafeBU) && !MultipleOwners);
+  if (UnconditionallySafe) {
     RetainsToMove.ReverseInsertPts.clear();
     ReleasesToMove.ReverseInsertPts.clear();
     NewCount = 0;
@@ -2538,6 +2558,14 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState>
     // less aggressive solution which is.
     if (NewDelta != 0)
       return false;
+
+    // At this point, we are not going to remove any RR pairs, but we still are
+    // able to move RR pairs. If one of our pointers is afflicted with
+    // CFGHazards, we cannot perform such code motion so exit early.
+    const bool WillPerformCodeMotion = RetainsToMove.ReverseInsertPts.size() ||
+      ReleasesToMove.ReverseInsertPts.size();
+    if (CFGHazardAfflicted && WillPerformCodeMotion)
+      return false;
   }
 
   // Determine whether the original call points are balanced in the retain and
@@ -2685,9 +2713,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
           Changed = true;
           // If the load has a builtin retain, insert a plain retain for it.
           if (Class == IC_LoadWeakRetained) {
-            CallInst *CI =
-              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
-                               "", Call);
+            Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+            CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
             CI->setTailCall();
           }
           // Zap the fully redundant load.
@@ -2715,9 +2742,8 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
           Changed = true;
           // If the load has a builtin retain, insert a plain retain for it.
           if (Class == IC_LoadWeakRetained) {
-            CallInst *CI =
-              CallInst::Create(getRetainCallee(F.getParent()), EarlierCall,
-                               "", Call);
+            Constant *Decl = EP.get(ARCRuntimeEntryPoints::EPT_Retain);
+            CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
             CI->setTailCall();
           }
           // Zap the fully redundant load.
@@ -2801,23 +2827,29 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
 /// Identify program paths which execute sequences of retains and releases which
 /// can be eliminated.
 bool ObjCARCOpt::OptimizeSequences(Function &F) {
-  /// Releases, Retains - These are used to store the results of the main flow
-  /// analysis. These use Value* as the key instead of Instruction* so that the
-  /// map stays valid when we get around to rewriting code and calls get
-  /// replaced by arguments.
+  // Releases, Retains - These are used to store the results of the main flow
+  // analysis. These use Value* as the key instead of Instruction* so that the
+  // map stays valid when we get around to rewriting code and calls get
+  // replaced by arguments.
   DenseMap<Value *, RRInfo> Releases;
   MapVector<Value *, RRInfo> Retains;
 
-  /// This is used during the traversal of the function to track the
-  /// states for each identified object at each block.
+  // This is used during the traversal of the function to track the
+  // states for each identified object at each block.
   DenseMap<const BasicBlock *, BBState> BBStates;
 
   // Analyze the CFG of the function, and all instructions.
   bool NestingDetected = Visit(F, BBStates, Retains, Releases);
 
   // Transform.
-  return PerformCodePlacement(BBStates, Retains, Releases, F.getParent()) &&
-         NestingDetected;
+  bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains,
+                                                           Releases,
+                                                           F.getParent());
+
+  // Cleanup.
+  MultiOwnersSet.clear();
+
+  return AnyPairsCompletelyEliminated && NestingDetected;
 }
 
 /// Check if there is a dependent call earlier that does not have anything in
@@ -3025,12 +3057,8 @@ bool ObjCARCOpt::doInitialization(Module &M) {
   // they are not, because they return their argument value. And objc_release
   // calls finalizers which can have arbitrary side effects.
 
-  // These are initialized lazily.
-  AutoreleaseRVCallee = 0;
-  ReleaseCallee = 0;
-  RetainCallee = 0;
-  RetainBlockCallee = 0;
-  AutoreleaseCallee = 0;
+  // Initialize our runtime entry point cache.
+  EP.Initialize(&M);
 
   return false;
 }
@@ -3050,6 +3078,12 @@ bool ObjCARCOpt::runOnFunction(Function &F) {
 
   PA.setAA(&getAnalysis<AliasAnalysis>());
 
+#ifndef NDEBUG
+  if (AreStatisticsEnabled()) {
+    GatherStatistics(F, false);
+  }
+#endif
+
   // This pass performs several distinct transformations. As a compile-time aid
   // when compiling code that isn't ObjC, skip these if the relevant ObjC
   // library functions aren't declared.
diff --git a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
index 03e12d4..53c077e 100644
--- a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp
@@ -1,4 +1,4 @@
-//===- ObjCARCUtil.cpp - ObjC ARC Optimization --------*- mode: c++ -*-----===//
+//===- ObjCARCUtil.cpp - ObjC ARC Optimization ----------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -112,6 +112,8 @@ InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) {
           .Case("objc_retain_autorelease",    IC_FusedRetainAutorelease)
           .Case("objc_retainAutorelease",     IC_FusedRetainAutorelease)
           .Case("objc_retainAutoreleaseReturnValue",IC_FusedRetainAutoreleaseRV)
+          .Case("objc_sync_enter", IC_User)
+          .Case("objc_sync_exit", IC_User)
           .Default(IC_CallOrUser);
 
       // Argument is i8**
diff --git a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index ec449fd8e..a13fb9e 100644
--- a/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -1,4 +1,4 @@
-//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- mode: c++ -*-----===//
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index a097308..a3eb07a9 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -83,7 +83,7 @@ bool ADCE::runOnFunction(Function& F) {
       I->dropAllReferences();
     }
 
-  for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(),
+  for (SmallVectorImpl<Instruction *>::iterator I = worklist.begin(),
        E = worklist.end(); I != E; ++I) {
     ++NumRemoved;
     (*I)->eraseFromParent();
diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
deleted file mode 100644
index e755008..0000000
--- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp
+++ /dev/null
@@ -1,152 +0,0 @@
-//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a very simple profile guided basic block placement
-// algorithm.  The idea is to put frequently executed blocks together at the
-// start of the function, and hopefully increase the number of fall-through
-// conditional branches.  If there is no profile information for a particular
-// function, this pass basically orders blocks in depth-first order
-//
-// The algorithm implemented here is basically "Algo1" from "Profile Guided Code
-// Positioning" by Pettis and Hansen, except that it uses basic block counts
-// instead of edge counts.  This should be improved in many ways, but is very
-// simple for now.
-//
-// Basically we "place" the entry block, then loop over all successors in a DFO,
-// placing the most frequently executed successor until we run out of blocks.  I
-// told you this was _extremely_ simplistic. :) This is also much slower than it
-// could be.  When it becomes important, this pass will be rewritten to use a
-// better algorithm, and then we can worry about efficiency.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "block-placement"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CFG.h"
-#include <set>
-using namespace llvm;
-
-STATISTIC(NumMoved, "Number of basic blocks moved");
-
-namespace {
-  struct BlockPlacement : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    BlockPlacement() : FunctionPass(ID) {
-      initializeBlockPlacementPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual bool runOnFunction(Function &F);
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesCFG();
-      AU.addRequired<ProfileInfo>();
-      //AU.addPreserved<ProfileInfo>();  // Does this work?
-    }
-  private:
-    /// PI - The profile information that is guiding us.
-    ///
-    ProfileInfo *PI;
-
-    /// NumMovedBlocks - Every time we move a block, increment this counter.
-    ///
-    unsigned NumMovedBlocks;
-
-    /// PlacedBlocks - Every time we place a block, remember it so we don't get
-    /// into infinite loops.
-    std::set<BasicBlock*> PlacedBlocks;
-
-    /// InsertPos - This an iterator to the next place we want to insert a
-    /// block.
-    Function::iterator InsertPos;
-
-    /// PlaceBlocks - Recursively place the specified blocks and any unplaced
-    /// successors.
-    void PlaceBlocks(BasicBlock *BB);
-  };
-}
-
-char BlockPlacement::ID = 0;
-INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement",
-                "Profile Guided Basic Block Placement", false, false)
-INITIALIZE_AG_DEPENDENCY(ProfileInfo)
-INITIALIZE_PASS_END(BlockPlacement, "block-placement",
-                "Profile Guided Basic Block Placement", false, false)
-
-FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); }
-
-bool BlockPlacement::runOnFunction(Function &F) {
-  PI = &getAnalysis<ProfileInfo>();
-
-  NumMovedBlocks = 0;
-  InsertPos = F.begin();
-
-  // Recursively place all blocks.
-  PlaceBlocks(F.begin());
-
-  PlacedBlocks.clear();
-  NumMoved += NumMovedBlocks;
-  return NumMovedBlocks != 0;
-}
-
-
-/// PlaceBlocks - Recursively place the specified blocks and any unplaced
-/// successors.
-void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
-  assert(!PlacedBlocks.count(BB) && "Already placed this block!");
-  PlacedBlocks.insert(BB);
-
-  // Place the specified block.
-  if (&*InsertPos != BB) {
-    // Use splice to move the block into the right place.  This avoids having to
-    // remove the block from the function then readd it, which causes a bunch of
-    // symbol table traffic that is entirely pointless.
-    Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
-    Blocks.splice(InsertPos, Blocks, BB);
-
-    ++NumMovedBlocks;
-  } else {
-    // This block is already in the right place, we don't have to do anything.
-    ++InsertPos;
-  }
-
-  // Keep placing successors until we run out of ones to place.  Note that this
-  // loop is very inefficient (N^2) for blocks with many successors, like switch
-  // statements.  FIXME!
-  while (1) {
-    // Okay, now place any unplaced successors.
-    succ_iterator SI = succ_begin(BB), E = succ_end(BB);
-
-    // Scan for the first unplaced successor.
-    for (; SI != E && PlacedBlocks.count(*SI); ++SI)
-      /*empty*/;
-    if (SI == E) return;  // No more successors to place.
-
-    double MaxExecutionCount = PI->getExecutionCount(*SI);
-    BasicBlock *MaxSuccessor = *SI;
-
-    // Scan for more frequently executed successors
-    for (; SI != E; ++SI)
-      if (!PlacedBlocks.count(*SI)) {
-        double Count = PI->getExecutionCount(*SI);
-        if (Count > MaxExecutionCount ||
-            // Prefer to not disturb the code.
-            (Count == MaxExecutionCount && *SI == &*InsertPos)) {
-          MaxExecutionCount = Count;
-          MaxSuccessor = *SI;
-        }
-      }
-
-    // Now that we picked the maximally executed successor, place it.
-    PlaceBlocks(MaxSuccessor);
-  }
-}
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index fd55e08..626c810 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,6 +1,5 @@
 add_llvm_library(LLVMScalarOpts
   ADCE.cpp
-  BasicBlockPlacement.cpp
   CodeGenPrepare.cpp
   ConstantProp.cpp
   CorrelatedValuePropagation.cpp
@@ -17,19 +16,23 @@ add_llvm_library(LLVMScalarOpts
   LoopInstSimplify.cpp
   LoopRotation.cpp
   LoopStrengthReduce.cpp
+  LoopRerollPass.cpp
   LoopUnrollPass.cpp
   LoopUnswitch.cpp
   LowerAtomic.cpp
   MemCpyOptimizer.cpp
+  PartiallyInlineLibCalls.cpp
   Reassociate.cpp
   Reg2Mem.cpp
+  SampleProfile.cpp
   SCCP.cpp
   SROA.cpp
   Scalar.cpp
   ScalarReplAggregates.cpp
   SimplifyCFGPass.cpp
-  SimplifyLibCalls.cpp
+  FlattenCFGPass.cpp
   Sink.cpp
+  StructurizeCFG.cpp
   TailRecursionElimination.cpp
   )
 
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index f0d29c8..007e9b7 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -22,7 +22,6 @@
 #include "llvm/Analysis/DominatorInternals.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Assembly/Writer.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -76,10 +75,10 @@ namespace {
   class CodeGenPrepare : public FunctionPass {
     /// TLI - Keep a pointer of a TargetLowering to consult for determining
     /// transformation profitability.
+    const TargetMachine *TM;
     const TargetLowering *TLI;
     const TargetLibraryInfo *TLInfo;
     DominatorTree *DT;
-    ProfileInfo *PFI;
 
     /// CurInstIterator - As we scan instructions optimizing them, this is the
     /// next instruction to optimize.  Xforms that can invalidate this should
@@ -100,8 +99,8 @@ namespace {
 
   public:
     static char ID; // Pass identification, replacement for typeid
-    explicit CodeGenPrepare(const TargetLowering *tli = 0)
-      : FunctionPass(ID), TLI(tli) {
+    explicit CodeGenPrepare(const TargetMachine *TM = 0)
+      : FunctionPass(ID), TM(TM), TLI(0) {
         initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
       }
     bool runOnFunction(Function &F);
@@ -110,7 +109,6 @@ namespace {
 
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addPreserved<DominatorTree>();
-      AU.addPreserved<ProfileInfo>();
       AU.addRequired<TargetLibraryInfo>();
     }
 
@@ -139,17 +137,17 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
 INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare",
                 "Optimize for code generation", false, false)
 
-FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) {
-  return new CodeGenPrepare(TLI);
+FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
+  return new CodeGenPrepare(TM);
 }
 
 bool CodeGenPrepare::runOnFunction(Function &F) {
   bool EverMadeChange = false;
 
   ModifiedDT = false;
+  if (TM) TLI = TM->getTargetLowering();
   TLInfo = &getAnalysis<TargetLibraryInfo>();
   DT = getAnalysisIfAvailable<DominatorTree>();
-  PFI = getAnalysisIfAvailable<ProfileInfo>();
   OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
                                            Attribute::OptimizeForSize);
 
@@ -205,7 +203,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
       SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
 
       DeleteDeadBlock(BB);
-      
+
       for (SmallVectorImpl<BasicBlock*>::iterator
              II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
         if (pred_begin(*II) == pred_end(*II))
@@ -440,10 +438,6 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
     DT->changeImmediateDominator(DestBB, NewIDom);
     DT->eraseNode(BB);
   }
-  if (PFI) {
-    PFI->replaceAllUses(BB, DestBB);
-    PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB));
-  }
   BB->eraseFromParent();
   ++NumBlocksElim;
 
@@ -830,7 +824,7 @@ struct ExtAddrMode : public TargetLowering::AddrMode {
   ExtAddrMode() : BaseReg(0), ScaledReg(0) {}
   void print(raw_ostream &OS) const;
   void dump() const;
-  
+
   bool operator==(const ExtAddrMode& O) const {
     return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
            (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
@@ -838,10 +832,12 @@ struct ExtAddrMode : public TargetLowering::AddrMode {
   }
 };
 
+#ifndef NDEBUG
 static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
   AM.print(OS);
   return OS;
 }
+#endif
 
 void ExtAddrMode::print(raw_ostream &OS) const {
   bool NeedPlus = false;
@@ -866,7 +862,6 @@ void ExtAddrMode::print(raw_ostream &OS) const {
     OS << (NeedPlus ? " + " : "")
        << Scale << "*";
     WriteAsOperand(OS, ScaledReg, /*PrintType=*/false);
-    NeedPlus = true;
   }
 
   OS << ']';
@@ -891,16 +886,16 @@ class AddressingModeMatcher {
   /// the memory instruction that we're computing this address for.
   Type *AccessTy;
   Instruction *MemoryInst;
-  
+
   /// AddrMode - This is the addressing mode that we're building up.  This is
   /// part of the return value of this addressing mode matching stuff.
   ExtAddrMode &AddrMode;
-  
+
   /// IgnoreProfitability - This is set to true when we should not do
   /// profitability checks.  When true, IsProfitableToFoldIntoAddressingMode
   /// always returns true.
   bool IgnoreProfitability;
-  
+
   AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
                         const TargetLowering &T, Type *AT,
                         Instruction *MI, ExtAddrMode &AM)
@@ -908,7 +903,7 @@ class AddressingModeMatcher {
     IgnoreProfitability = false;
   }
 public:
-  
+
   /// Match - Find the maximal addressing mode that a load/store of V can fold,
   /// give an access type of AccessTy.  This returns a list of involved
   /// instructions in AddrModeInsts.
@@ -918,7 +913,7 @@ public:
                            const TargetLowering &TLI) {
     ExtAddrMode Result;
 
-    bool Success = 
+    bool Success =
       AddressingModeMatcher(AddrModeInsts, TLI, AccessTy,
                             MemoryInst, Result).MatchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -943,11 +938,11 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
   // mode.  Just process that directly.
   if (Scale == 1)
     return MatchAddr(ScaleReg, Depth);
-  
+
   // If the scale is 0, it takes nothing to add this.
   if (Scale == 0)
     return true;
-  
+
   // If we already have a scale of this value, we can add to it, otherwise, we
   // need an available scale field.
   if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
@@ -966,7 +961,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
 
   // It was legal, so commit it.
   AddrMode = TestAddrMode;
-  
+
   // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
   // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
   // X*Scale + C*Scale to addr mode.
@@ -975,7 +970,7 @@ bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
       match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
     TestAddrMode.ScaledReg = AddLHS;
     TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
-      
+
     // If this addressing mode is legal, commit it and remember that we folded
     // this instruction.
     if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
@@ -1026,7 +1021,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
                                                unsigned Depth) {
   // Avoid exponential behavior on extremely deep expression trees.
   if (Depth >= 5) return false;
-  
+
   switch (Opcode) {
   case Instruction::PtrToInt:
     // PtrToInt is always a noop, as we know that the int type is pointer sized.
@@ -1034,7 +1029,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
   case Instruction::IntToPtr:
     // This inttoptr is a no-op if the integer type is pointer sized.
     if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
-        TLI.getPointerTy())
+        TLI.getPointerTy(AddrInst->getType()->getPointerAddressSpace()))
       return MatchAddr(AddrInst->getOperand(0), Depth);
     return false;
   case Instruction::BitCast:
@@ -1055,16 +1050,16 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
         MatchAddr(AddrInst->getOperand(0), Depth+1))
       return true;
-    
+
     // Restore the old addr mode info.
     AddrMode = BackupAddrMode;
     AddrModeInsts.resize(OldSize);
-    
+
     // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
     if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
         MatchAddr(AddrInst->getOperand(1), Depth+1))
       return true;
-    
+
     // Otherwise we definitely can't merge the ADD in.
     AddrMode = BackupAddrMode;
     AddrModeInsts.resize(OldSize);
@@ -1081,7 +1076,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     int64_t Scale = RHS->getSExtValue();
     if (Opcode == Instruction::Shl)
       Scale = 1LL << Scale;
-    
+
     return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
   }
   case Instruction::GetElementPtr: {
@@ -1089,7 +1084,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
     // one variable offset.
     int VariableOperand = -1;
     unsigned VariableScale = 0;
-    
+
     int64_t ConstantOffset = 0;
     const DataLayout *TD = TLI.getDataLayout();
     gep_type_iterator GTI = gep_type_begin(AddrInst);
@@ -1107,14 +1102,14 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
           // We only allow one variable index at the moment.
           if (VariableOperand != -1)
             return false;
-          
+
           // Remember the variable index.
           VariableOperand = i;
           VariableScale = TypeSize;
         }
       }
     }
-    
+
     // A common case is for the GEP to only do a constant offset.  In this case,
     // just add it to the disp field and check validity.
     if (VariableOperand == -1) {
@@ -1208,7 +1203,7 @@ bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
         AddrModeInsts.push_back(I);
         return true;
       }
-      
+
       // It isn't profitable to do this, roll back.
       //cerr << "NOT FOLDING: " << *I;
       AddrMode = BackupAddrMode;
@@ -1254,7 +1249,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
   TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
-    
+
     // Compute the constraint code and ConstraintType to use.
     TLI.ComputeConstraintToUse(OpInfo, SDValue());
 
@@ -1279,7 +1274,7 @@ static bool FindAllMemoryUses(Instruction *I,
   // If we already considered this instruction, we're done.
   if (!ConsideredInsts.insert(I))
     return false;
-  
+
   // If this is an obviously unfoldable instruction, bail out.
   if (!MightBeFoldableInst(I))
     return true;
@@ -1293,24 +1288,24 @@ static bool FindAllMemoryUses(Instruction *I,
       MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
       continue;
     }
-    
+
     if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
       unsigned opNo = UI.getOperandNo();
       if (opNo == 0) return true; // Storing addr, not into addr.
       MemoryUses.push_back(std::make_pair(SI, opNo));
       continue;
     }
-    
+
     if (CallInst *CI = dyn_cast<CallInst>(U)) {
       InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
       if (!IA) return true;
-      
+
       // If this is a memory operand, we're cool, otherwise bail out.
       if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
         return true;
       continue;
     }
-    
+
     if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
                           TLI))
       return true;
@@ -1328,17 +1323,17 @@ bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
   // If Val is either of the known-live values, we know it is live!
   if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
     return true;
-  
+
   // All values other than instructions and arguments (e.g. constants) are live.
   if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
-  
+
   // If Val is a constant sized alloca in the entry block, it is live, this is
   // true because it is just a reference to the stack/frame pointer, which is
   // live for the whole function.
   if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
     if (AI->isStaticAlloca())
       return true;
-  
+
   // Check to see if this value is already used in the memory instruction's
   // block.  If so, it's already live into the block at the very least, so we
   // can reasonably fold it.
@@ -1370,7 +1365,7 @@ bool AddressingModeMatcher::
 IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
                                      ExtAddrMode &AMAfter) {
   if (IgnoreProfitability) return true;
-  
+
   // AMBefore is the addressing mode before this instruction was folded into it,
   // and AMAfter is the addressing mode after the instruction was folded.  Get
   // the set of registers referenced by AMAfter and subtract out those
@@ -1381,7 +1376,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
   // BaseReg and ScaleReg (global addresses are always available, as are any
   // folded immediates).
   Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
-  
+
   // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
   // lifetime wasn't extended by adding this instruction.
   if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
@@ -1402,7 +1397,7 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
   if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
     return false;  // Has a non-memory, non-foldable use!
-  
+
   // Now that we know that all uses of this instruction are part of a chain of
   // computation involving only operations that could theoretically be folded
   // into a memory use, loop over each of these uses and see if they could
@@ -1411,15 +1406,14 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
   for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
     Instruction *User = MemoryUses[i].first;
     unsigned OpNo = MemoryUses[i].second;
-    
+
     // Get the access type of this use.  If the use isn't a pointer, we don't
     // know what it accesses.
     Value *Address = User->getOperand(OpNo);
     if (!Address->getType()->isPointerTy())
       return false;
-    Type *AddressAccessTy =
-      cast<PointerType>(Address->getType())->getElementType();
-    
+    Type *AddressAccessTy = Address->getType()->getPointerElementType();
+
     // Do a match against the root of this address, ignoring profitability. This
     // will tell us if the addressing mode for the memory operation will
     // *actually* cover the shared instruction.
@@ -1434,10 +1428,10 @@ IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
     if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
                   I) == MatchedAddrModeInsts.end())
       return false;
-    
+
     MatchedAddrModeInsts.clear();
   }
-  
+
   return true;
 }
 
@@ -1572,9 +1566,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   } else {
     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
                  << *MemoryInst);
-    Type *IntPtrTy =
-          TLI->getDataLayout()->getIntPtrType(AccessTy->getContext());
-
+    Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(Addr->getType());
     Value *Result = 0;
 
     // Start with the base register. Do this first so that subsequent address
@@ -1893,7 +1885,8 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
     // trivial PHI, go ahead and zap it here.
-    if (Value *V = SimplifyInstruction(P)) {
+    if (Value *V = SimplifyInstruction(P, TLI ? TLI->getDataLayout() : 0,
+                                       TLInfo, DT)) {
       P->replaceAllUsesWith(V);
       P->eraseFromParent();
       ++NumPHIsElim;
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 3c08634..5266894 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -72,11 +72,6 @@ namespace {
 }
 
 namespace llvm {
-// SimpleValue is POD.
-template<> struct isPodLike<SimpleValue> {
-  static const bool value = true;
-};
-
 template<> struct DenseMapInfo<SimpleValue> {
   static inline SimpleValue getEmptyKey() {
     return DenseMapInfo<Instruction*>::getEmptyKey();
@@ -220,11 +215,6 @@ namespace {
 }
 
 namespace llvm {
-  // CallValue is POD.
-  template<> struct isPodLike<CallValue> {
-    static const bool value = true;
-  };
-
   template<> struct DenseMapInfo<CallValue> {
     static inline CallValue getEmptyKey() {
       return DenseMapInfo<Instruction*>::getEmptyKey();
diff --git a/lib/Transforms/Scalar/FlattenCFGPass.cpp b/lib/Transforms/Scalar/FlattenCFGPass.cpp
new file mode 100644
index 0000000..e7de07f
--- /dev/null
+++ b/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -0,0 +1,79 @@
+//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements flattening of CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "flattencfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+namespace {
+struct FlattenCFGPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+public:
+  FlattenCFGPass() : FunctionPass(ID) {
+    initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F);
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<AliasAnalysis>();
+  }
+
+private:
+  AliasAnalysis *AA;
+};
+}
+
+char FlattenCFGPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+                      false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+                    false)
+
+// Public interface to the FlattenCFG pass
+FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
+
+/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+  bool Changed = false;
+  bool LocalChange = true;
+  while (LocalChange) {
+    LocalChange = false;
+
+    // Loop over all of the basic blocks and remove them if they are unneeded...
+    //
+    for (Function::iterator BBIt = F.begin(); BBIt != F.end();) {
+      if (FlattenCFG(BBIt++, AA)) {
+        LocalChange = true;
+      }
+    }
+    Changed |= LocalChange;
+  }
+  return Changed;
+}
+
+bool FlattenCFGPass::runOnFunction(Function &F) {
+  AA = &getAnalysis<AliasAnalysis>();
+  bool EverChanged = false;
+  // iterativelyFlattenCFG can make some blocks dead.
+  while (iterativelyFlattenCFG(F, AA)) {
+    removeUnreachableBlocks(F);
+    EverChanged = true;
+  }
+  return EverChanged;
+}
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index f350b9b..6af269d 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -21,8 +21,10 @@
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -45,6 +47,7 @@
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <vector>
 using namespace llvm;
 using namespace PatternMatch;
 
@@ -505,7 +508,9 @@ namespace {
     enum ValType {
       SimpleVal,  // A simple offsetted value that is accessed.
       LoadVal,    // A value produced by a load.
-      MemIntrin   // A memory intrinsic which is loaded from.
+      MemIntrin,  // A memory intrinsic which is loaded from.
+      UndefVal    // A UndefValue representing a value from dead block (which
+                  // is not yet physically removed from the CFG). 
     };
   
     /// V - The value that is live out of the block.
@@ -543,10 +548,20 @@ namespace {
       Res.Offset = Offset;
       return Res;
     }
-  
+
+    static AvailableValueInBlock getUndef(BasicBlock *BB) {
+      AvailableValueInBlock Res;
+      Res.BB = BB;
+      Res.Val.setPointer(0);
+      Res.Val.setInt(UndefVal);
+      Res.Offset = 0;
+      return Res;
+    }
+
     bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
     bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
     bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+    bool isUndefValue() const { return Val.getInt() == UndefVal; }
   
     Value *getSimpleValue() const {
       assert(isSimpleValue() && "Wrong accessor");
@@ -574,6 +589,7 @@ namespace {
     DominatorTree *DT;
     const DataLayout *TD;
     const TargetLibraryInfo *TLI;
+    SetVector<BasicBlock *> DeadBlocks;
 
     ValueTable VN;
 
@@ -692,9 +708,13 @@ namespace {
     void cleanupGlobalSets();
     void verifyRemoved(const Instruction *I) const;
     bool splitCriticalEdges();
+    BasicBlock *splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ);
     unsigned replaceAllDominatedUsesWith(Value *From, Value *To,
                                          const BasicBlockEdge &Root);
     bool propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root);
+    bool processFoldableCondBr(BranchInst *BI);
+    void addDeadBlock(BasicBlock *BB);
+    void assignValNumForDeadCode();
   };
 
   char GVN::ID = 0;
@@ -1068,14 +1088,15 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
   if (Offset == -1)
     return Offset;
 
+  unsigned AS = Src->getType()->getPointerAddressSpace();
   // Otherwise, see if we can constant fold a load from the constant with the
   // offset applied as appropriate.
   Src = ConstantExpr::getBitCast(Src,
-                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+                                 Type::getInt8PtrTy(Src->getContext(), AS));
   Constant *OffsetCst =
     ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
   Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
-  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
   if (ConstantFoldLoadFromConstPtr(Src, &TD))
     return Offset;
   return -1;
@@ -1152,7 +1173,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
     Type *DestPTy =
       IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
     DestPTy = PointerType::get(DestPTy,
-                       cast<PointerType>(PtrVal->getType())->getAddressSpace());
+                               PtrVal->getType()->getPointerAddressSpace());
     Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
     PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
     LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
@@ -1227,15 +1248,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
   // Otherwise, this is a memcpy/memmove from a constant global.
   MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
   Constant *Src = cast<Constant>(MTI->getSource());
+  unsigned AS = Src->getType()->getPointerAddressSpace();
 
   // Otherwise, see if we can constant fold a load from the constant with the
   // offset applied as appropriate.
   Src = ConstantExpr::getBitCast(Src,
-                                 llvm::Type::getInt8PtrTy(Src->getContext()));
+                                 Type::getInt8PtrTy(Src->getContext(), AS));
   Constant *OffsetCst =
-  ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
   Src = ConstantExpr::getGetElementPtr(Src, OffsetCst);
-  Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy));
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
   return ConstantFoldLoadFromConstPtr(Src, &TD);
 }
 
@@ -1250,8 +1272,10 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
   // just use the dominating value directly.
   if (ValuesPerBlock.size() == 1 &&
       gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
-                                               LI->getParent()))
+                                               LI->getParent())) {
+    assert(!ValuesPerBlock[0].isUndefValue() && "Dead BB dominate this block");
     return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn);
+  }
 
   // Otherwise, we have to construct SSA form.
   SmallVector<PHINode*, 8> NewPHIs;
@@ -1321,7 +1345,7 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
                    << *getCoercedLoadValue() << '\n'
                    << *Res << '\n' << "\n\n\n");
     }
-  } else {
+  } else if (isMemIntrinValue()) {
     const DataLayout *TD = gvn.getDataLayout();
     assert(TD && "Need target data to handle type mismatch case");
     Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
@@ -1329,6 +1353,10 @@ Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) c
     DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
                  << "  " << *getMemIntrinValue() << '\n'
                  << *Res << '\n' << "\n\n\n");
+  } else {
+    assert(isUndefValue() && "Should be UndefVal");
+    DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+    return UndefValue::get(LoadTy);
   }
   return Res;
 }
@@ -1352,6 +1380,13 @@ void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
     BasicBlock *DepBB = Deps[i].getBB();
     MemDepResult DepInfo = Deps[i].getResult();
 
+    if (DeadBlocks.count(DepBB)) {
+      // Dead dependent mem-op disguise as a load evaluating the same value
+      // as the load in question.
+      ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+      continue;
+    }
+
     if (!DepInfo.isDef() && !DepInfo.isClobber()) {
       UnavailableBlocks.push_back(DepBB);
       continue;
@@ -1513,7 +1548,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
   for (unsigned i = 0, e = UnavailableBlocks.size(); i != e; ++i)
     FullyAvailableBlocks[UnavailableBlocks[i]] = false;
 
-  SmallVector<std::pair<TerminatorInst*, unsigned>, 4> NeedToSplit;
+  SmallVector<BasicBlock *, 4> CriticalEdgePred;
   for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB);
        PI != E; ++PI) {
     BasicBlock *Pred = *PI;
@@ -1536,20 +1571,14 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
         return false;
       }
 
-      unsigned SuccNum = GetSuccessorNumber(Pred, LoadBB);
-      NeedToSplit.push_back(std::make_pair(Pred->getTerminator(), SuccNum));
+      CriticalEdgePred.push_back(Pred);
     }
   }
 
-  if (!NeedToSplit.empty()) {
-    toSplit.append(NeedToSplit.begin(), NeedToSplit.end());
-    return false;
-  }
-
   // Decide whether PRE is profitable for this load.
   unsigned NumUnavailablePreds = PredLoads.size();
   assert(NumUnavailablePreds != 0 &&
-         "Fully available value should be eliminated above!");
+         "Fully available value should already be eliminated!");
 
   // If this load is unavailable in multiple predecessors, reject it.
   // FIXME: If we could restructure the CFG, we could make a common pred with
@@ -1558,6 +1587,17 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
   if (NumUnavailablePreds != 1)
       return false;
 
+  // Split critical edges, and update the unavailable predecessors accordingly.
+  for (SmallVectorImpl<BasicBlock *>::iterator I = CriticalEdgePred.begin(),
+         E = CriticalEdgePred.end(); I != E; I++) {
+    BasicBlock *OrigPred = *I;
+    BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+    PredLoads.erase(OrigPred);
+    PredLoads[NewPred] = 0;
+    DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+                 << LoadBB->getName() << '\n');
+  }
+
   // Check if the load can safely be moved to all the unavailable predecessors.
   bool CanDoPRE = true;
   SmallVector<Instruction*, 8> NewInsts;
@@ -1594,7 +1634,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
       if (MD) MD->removeInstruction(I);
       I->eraseFromParent();
     }
-    return false;
+    // HINT:Don't revert the edge-splitting as following transformation may 
+    // also need to split these critial edges.
+    return !CriticalEdgePred.empty();
   }
 
   // Okay, we can eliminate this load by inserting a reload in the predecessor
@@ -2181,11 +2223,13 @@ bool GVN::processInstruction(Instruction *I) {
   // For conditional branches, we can perform simple conditional propagation on
   // the condition value itself.
   if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
-    if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+    if (!BI->isConditional())
       return false;
 
-    Value *BranchCond = BI->getCondition();
+    if (isa<Constant>(BI->getCondition()))
+      return processFoldableCondBr(BI);
 
+    Value *BranchCond = BI->getCondition();
     BasicBlock *TrueSucc = BI->getSuccessor(0);
     BasicBlock *FalseSucc = BI->getSuccessor(1);
     // Avoid multiple edges early.
@@ -2297,25 +2341,30 @@ bool GVN::runOnFunction(Function& F) {
   while (ShouldContinue) {
     DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
     ShouldContinue = iterateOnFunction(F);
-    if (splitCriticalEdges())
-      ShouldContinue = true;
     Changed |= ShouldContinue;
     ++Iteration;
   }
 
   if (EnablePRE) {
+    // Fabricate val-num for dead-code in order to suppress assertion in
+    // performPRE().
+    assignValNumForDeadCode();
     bool PREChanged = true;
     while (PREChanged) {
       PREChanged = performPRE(F);
       Changed |= PREChanged;
     }
   }
+
   // FIXME: Should perform GVN again after PRE does something.  PRE can move
   // computations into blocks where they become fully redundant.  Note that
   // we can't do this until PRE's critical edge splitting updates memdep.
   // Actually, when this happens, we should just fully integrate PRE into GVN.
 
   cleanupGlobalSets();
+  // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
+  // iteration. 
+  DeadBlocks.clear();
 
   return Changed;
 }
@@ -2326,6 +2375,9 @@ bool GVN::processBlock(BasicBlock *BB) {
   // (and incrementing BI before processing an instruction).
   assert(InstrsToErase.empty() &&
          "We expect InstrsToErase to be empty across iterations");
+  if (DeadBlocks.count(BB))
+    return false;
+
   bool ChangedFunction = false;
 
   for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
@@ -2344,7 +2396,7 @@ bool GVN::processBlock(BasicBlock *BB) {
     if (!AtStart)
       --BI;
 
-    for (SmallVector<Instruction*, 4>::iterator I = InstrsToErase.begin(),
+    for (SmallVectorImpl<Instruction *>::iterator I = InstrsToErase.begin(),
          E = InstrsToErase.end(); I != E; ++I) {
       DEBUG(dbgs() << "GVN removed: " << **I << '\n');
       if (MD) MD->removeInstruction(*I);
@@ -2543,6 +2595,15 @@ bool GVN::performPRE(Function &F) {
   return Changed;
 }
 
+/// Split the critical edge connecting the given two blocks, and return
+/// the block inserted to the critical edge.
+BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+  BasicBlock *BB = SplitCriticalEdge(Pred, Succ, this);
+  if (MD)
+    MD->invalidateCachedPredecessors();
+  return BB;
+}
+
 /// splitCriticalEdges - Split critical edges found during the previous
 /// iteration that may enable further optimization.
 bool GVN::splitCriticalEdges() {
@@ -2569,9 +2630,18 @@ bool GVN::iterateOnFunction(Function &F) {
        RE = RPOT.end(); RI != RE; ++RI)
     Changed |= processBlock(*RI);
 #else
+  // Save the blocks this function have before transformation begins. GVN may
+  // split critical edge, and hence may invalidate the RPO/DT iterator.
+  //
+  std::vector<BasicBlock *> BBVect;
+  BBVect.reserve(256);
   for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()),
        DE = df_end(DT->getRootNode()); DI != DE; ++DI)
-    Changed |= processBlock(DI->getBlock());
+    BBVect.push_back(DI->getBlock());
+
+  for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
+       I != E; I++)
+    Changed |= processBlock(*I);
 #endif
 
   return Changed;
@@ -2601,3 +2671,133 @@ void GVN::verifyRemoved(const Instruction *Inst) const {
     }
   }
 }
+
+// BB is declared dead, which implied other blocks become dead as well. This
+// function is to add all these blocks to "DeadBlocks". For the dead blocks'
+// live successors, update their phi nodes by replacing the operands
+// corresponding to dead blocks with UndefVal.
+//
+void GVN::addDeadBlock(BasicBlock *BB) {
+  SmallVector<BasicBlock *, 4> NewDead;
+  SmallSetVector<BasicBlock *, 4> DF;
+
+  NewDead.push_back(BB);
+  while (!NewDead.empty()) {
+    BasicBlock *D = NewDead.pop_back_val();
+    if (DeadBlocks.count(D))
+      continue;
+
+    // All blocks dominated by D are dead.
+    SmallVector<BasicBlock *, 8> Dom;
+    DT->getDescendants(D, Dom);
+    DeadBlocks.insert(Dom.begin(), Dom.end());
+    
+    // Figure out the dominance-frontier(D).
+    for (SmallVectorImpl<BasicBlock *>::iterator I = Dom.begin(),
+           E = Dom.end(); I != E; I++) {
+      BasicBlock *B = *I;
+      for (succ_iterator SI = succ_begin(B), SE = succ_end(B); SI != SE; SI++) {
+        BasicBlock *S = *SI;
+        if (DeadBlocks.count(S))
+          continue;
+
+        bool AllPredDead = true;
+        for (pred_iterator PI = pred_begin(S), PE = pred_end(S); PI != PE; PI++)
+          if (!DeadBlocks.count(*PI)) {
+            AllPredDead = false;
+            break;
+          }
+
+        if (!AllPredDead) {
+          // S could be proved dead later on. That is why we don't update phi
+          // operands at this moment.
+          DF.insert(S);
+        } else {
+          // While S is not dominated by D, it is dead by now. This could take
+          // place if S already have a dead predecessor before D is declared
+          // dead.
+          NewDead.push_back(S);
+        }
+      }
+    }
+  }
+
+  // For the dead blocks' live successors, update their phi nodes by replacing
+  // the operands corresponding to dead blocks with UndefVal.
+  for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+        I != E; I++) {
+    BasicBlock *B = *I;
+    if (DeadBlocks.count(B))
+      continue;
+
+    SmallVector<BasicBlock *, 4> Preds(pred_begin(B), pred_end(B));
+    for (SmallVectorImpl<BasicBlock *>::iterator PI = Preds.begin(),
+           PE = Preds.end(); PI != PE; PI++) {
+      BasicBlock *P = *PI;
+
+      if (!DeadBlocks.count(P))
+        continue;
+
+      if (isCriticalEdge(P->getTerminator(), GetSuccessorNumber(P, B))) {
+        if (BasicBlock *S = splitCriticalEdges(P, B))
+          DeadBlocks.insert(P = S);
+      }
+
+      for (BasicBlock::iterator II = B->begin(); isa<PHINode>(II); ++II) {
+        PHINode &Phi = cast<PHINode>(*II);
+        Phi.setIncomingValue(Phi.getBasicBlockIndex(P),
+                             UndefValue::get(Phi.getType()));
+      }
+    }
+  }
+}
+
+// If the given branch is recognized as a foldable branch (i.e. conditional
+// branch with constant condition), it will perform following analyses and
+// transformation.
+//  1) If the dead out-coming edge is a critical-edge, split it. Let 
+//     R be the target of the dead out-coming edge.
+//  1) Identify the set of dead blocks implied by the branch's dead outcoming
+//     edge. The result of this step will be {X| X is dominated by R}
+//  2) Identify those blocks which haves at least one dead prodecessor. The
+//     result of this step will be dominance-frontier(R).
+//  3) Update the PHIs in DF(R) by replacing the operands corresponding to 
+//     dead blocks with "UndefVal" in an hope these PHIs will optimized away.
+//
+// Return true iff *NEW* dead code are found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+  if (!BI || BI->isUnconditional())
+    return false;
+
+  ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+  if (!Cond)
+    return false;
+
+  BasicBlock *DeadRoot = Cond->getZExtValue() ? 
+                         BI->getSuccessor(1) : BI->getSuccessor(0);
+  if (DeadBlocks.count(DeadRoot))
+    return false;
+
+  if (!DeadRoot->getSinglePredecessor())
+    DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+  addDeadBlock(DeadRoot);
+  return true;
+}
+
+// performPRE() will trigger assert if it come across an instruciton without
+// associated val-num. As it normally has far more live instructions than dead
+// instructions, it makes more sense just to "fabricate" a val-number for the
+// dead code than checking if instruction involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+  for (SetVector<BasicBlock *>::iterator I = DeadBlocks.begin(),
+        E = DeadBlocks.end(); I != E; I++) {
+    BasicBlock *BB = *I;
+    for (BasicBlock::iterator II = BB->begin(), EE = BB->end();
+          II != EE; II++) {
+      Instruction *Inst = &*II;
+      unsigned ValNum = VN.lookup_or_add(Inst);
+      addToLeaderTable(ValNum, Inst, BB);
+    }
+  }
+}
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
index 4796eb2..954e545 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -72,15 +72,13 @@ using namespace llvm;
 
 static cl::opt<bool>
 EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
-                  	cl::desc("Enable global merge pass on constants"),
-                  	cl::init(false));
+                         cl::desc("Enable global merge pass on constants"),
+                         cl::init(false));
 
 STATISTIC(NumMerged      , "Number of globals merged");
 namespace {
   class GlobalMerge : public FunctionPass {
-    /// TLI - Keep a pointer of a TargetLowering to consult for determining
-    /// target type sizes.
-    const TargetLowering *TLI;
+    const TargetMachine *TM;
 
     bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                  Module &M, bool isConst, unsigned AddrSpace) const;
@@ -104,8 +102,8 @@ namespace {
 
   public:
     static char ID;             // Pass identification, replacement for typeid.
-    explicit GlobalMerge(const TargetLowering *tli = 0)
-      : FunctionPass(ID), TLI(tli) {
+    explicit GlobalMerge(const TargetMachine *TM = 0)
+      : FunctionPass(ID), TM(TM) {
       initializeGlobalMergePass(*PassRegistry::getPassRegistry());
     }
 
@@ -144,6 +142,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge",
 
 bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                           Module &M, bool isConst, unsigned AddrSpace) const {
+  const TargetLowering *TLI = TM->getTargetLowering();
   const DataLayout *TD = TLI->getDataLayout();
 
   // FIXME: Infer the maximum possible offset depending on the actual users
@@ -234,6 +233,7 @@ void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
 bool GlobalMerge::doInitialization(Module &M) {
   DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
                                                         BSSGlobals;
+  const TargetLowering *TLI = TM->getTargetLowering();
   const DataLayout *TD = TLI->getDataLayout();
   unsigned MaxOffset = TLI->getMaximalGlobalOffset();
   bool Changed = false;
@@ -305,6 +305,6 @@ bool GlobalMerge::doFinalization(Module &M) {
   return false;
 }
 
-Pass *llvm::createGlobalMergePass(const TargetLowering *tli) {
-  return new GlobalMerge(tli);
+Pass *llvm::createGlobalMergePass(const TargetMachine *TM) {
+  return new GlobalMerge(TM);
 }
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 8e76c78..235aaaa 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -532,7 +532,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
         // and varies predictably *inside* the loop.  Evaluate the value it
         // contains when the loop exits, if possible.
         const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
-        if (!SE->isLoopInvariant(ExitValue, L))
+        if (!SE->isLoopInvariant(ExitValue, L) ||
+            !isSafeToExpand(ExitValue, *SE))
           continue;
 
         // Computing the value outside of the loop brings no benefit if :
@@ -1479,8 +1480,14 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
   if (IndVar->getType()->isPointerTy()
       && !IVCount->getType()->isPointerTy()) {
 
+    // IVOffset will be the new GEP offset that is interpreted by GEP as a
+    // signed value. IVCount on the other hand represents the loop trip count,
+    // which is an unsigned value. FindLoopCounter only allows induction
+    // variables that have a positive unit stride of one. This means we don't
+    // have to handle the case of negative offsets (yet) and just need to zero
+    // extend IVCount.
     Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
-    const SCEV *IVOffset = SE->getTruncateOrSignExtend(IVCount, OfsTy);
+    const SCEV *IVOffset = SE->getTruncateOrZeroExtend(IVCount, OfsTy);
 
     // Expand the code for the iteration count.
     assert(SE->isLoopInvariant(IVOffset, L) &&
@@ -1492,7 +1499,7 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
     assert(AR->getStart() == SE->getSCEV(GEPBase) && "bad loop counter");
     // We could handle pointer IVs other than i8*, but we need to compensate for
     // gep index scaling. See canExpandBackedgeTakenCount comments.
-    assert(SE->getSizeOfExpr(
+    assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
              cast<PointerType>(GEPBase->getType())->getElementType())->isOne()
            && "unit stride pointer IV must be i8*");
 
@@ -1506,9 +1513,10 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L,
     // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
     //
     // Valid Cases: (1) both integers is most common; (2) both may be pointers
-    // for simple memset-style loops; (3) IVInit is an integer and IVCount is a
-    // pointer may occur when enable-iv-rewrite generates a canonical IV on top
-    // of case #2.
+    // for simple memset-style loops.
+    //
+    // IVInit integer and IVCount pointer would only occur if a canonical IV
+    // were generated on top of case #2, which is not expected.
 
     const SCEV *IVLimit = 0;
     // For unit stride, IVCount = Start + BECount with 2's complement overflow.
@@ -1552,44 +1560,23 @@ LinearFunctionTestReplace(Loop *L,
                           SCEVExpander &Rewriter) {
   assert(canExpandBackedgeTakenCount(L, SE) && "precondition");
 
-  // LFTR can ignore IV overflow and truncate to the width of
-  // BECount. This avoids materializing the add(zext(add)) expression.
-  Type *CntTy = BackedgeTakenCount->getType();
-
+  // Initialize CmpIndVar and IVCount to their preincremented values.
+  Value *CmpIndVar = IndVar;
   const SCEV *IVCount = BackedgeTakenCount;
 
   // If the exiting block is the same as the backedge block, we prefer to
   // compare against the post-incremented value, otherwise we must compare
   // against the preincremented value.
-  Value *CmpIndVar;
   if (L->getExitingBlock() == L->getLoopLatch()) {
     // Add one to the "backedge-taken" count to get the trip count.
-    // If this addition may overflow, we have to be more pessimistic and
-    // cast the induction variable before doing the add.
-    const SCEV *N =
-      SE->getAddExpr(IVCount, SE->getConstant(IVCount->getType(), 1));
-    if (CntTy == IVCount->getType())
-      IVCount = N;
-    else {
-      const SCEV *Zero = SE->getConstant(IVCount->getType(), 0);
-      if ((isa<SCEVConstant>(N) && !N->isZero()) ||
-          SE->isLoopEntryGuardedByCond(L, ICmpInst::ICMP_NE, N, Zero)) {
-        // No overflow. Cast the sum.
-        IVCount = SE->getTruncateOrZeroExtend(N, CntTy);
-      } else {
-        // Potential overflow. Cast before doing the add.
-        IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy);
-        IVCount = SE->getAddExpr(IVCount, SE->getConstant(CntTy, 1));
-      }
-    }
+    // This addition may overflow, which is valid as long as the comparison is
+    // truncated to BackedgeTakenCount->getType().
+    IVCount = SE->getAddExpr(BackedgeTakenCount,
+                             SE->getConstant(BackedgeTakenCount->getType(), 1));
     // The BackedgeTaken expression contains the number of times that the
     // backedge branches to the loop header.  This is one less than the
     // number of times the loop executes, so use the incremented indvar.
     CmpIndVar = IndVar->getIncomingValueForBlock(L->getExitingBlock());
-  } else {
-    // We must use the preincremented value...
-    IVCount = SE->getTruncateOrZeroExtend(IVCount, CntTy);
-    CmpIndVar = IndVar;
   }
 
   Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE);
@@ -1612,12 +1599,40 @@ LinearFunctionTestReplace(Loop *L,
                << "  IVCount:\t" << *IVCount << "\n");
 
   IRBuilder<> Builder(BI);
-  if (SE->getTypeSizeInBits(CmpIndVar->getType())
-      > SE->getTypeSizeInBits(ExitCnt->getType())) {
-    CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
-                                    "lftr.wideiv");
-  }
 
+  // LFTR can ignore IV overflow and truncate to the width of
+  // BECount. This avoids materializing the add(zext(add)) expression.
+  unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
+  unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
+  if (CmpIndVarSize > ExitCntSize) {
+    const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+    const SCEV *ARStart = AR->getStart();
+    const SCEV *ARStep = AR->getStepRecurrence(*SE);
+    // For constant IVCount, avoid truncation.
+    if (isa<SCEVConstant>(ARStart) && isa<SCEVConstant>(IVCount)) {
+      const APInt &Start = cast<SCEVConstant>(ARStart)->getValue()->getValue();
+      APInt Count = cast<SCEVConstant>(IVCount)->getValue()->getValue();
+      // Note that the post-inc value of BackedgeTakenCount may have overflowed
+      // above such that IVCount is now zero.
+      if (IVCount != BackedgeTakenCount && Count == 0) {
+        Count = APInt::getMaxValue(Count.getBitWidth()).zext(CmpIndVarSize);
+        ++Count;
+      }
+      else
+        Count = Count.zext(CmpIndVarSize);
+      APInt NewLimit;
+      if (cast<SCEVConstant>(ARStep)->getValue()->isNegative())
+        NewLimit = Start - Count;
+      else
+        NewLimit = Start + Count;
+      ExitCnt = ConstantInt::get(CmpIndVar->getType(), NewLimit);
+
+      DEBUG(dbgs() << "  Widen RHS:\t" << *ExitCnt << "\n");
+    } else {
+      CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+                                      "lftr.wideiv");
+    }
+  }
   Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
   Value *OrigCond = BI->getCondition();
   // It's tempting to use replaceAllUsesWith here to fully replace the old
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index b61c5ba..b3ec2fc 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LazyValueInfo.h"
@@ -129,6 +130,7 @@ namespace {
     bool ProcessBranchOnXOR(BinaryOperator *BO);
 
     bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
+    bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
   };
 }
 
@@ -775,7 +777,11 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
           return true;
         }
       }
+
     }
+
+    if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
+      return true;
   }
 
   // Check for some cases that are worth simplifying.  Right now we want to look
@@ -821,7 +827,6 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
   return false;
 }
 
-
 /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
 /// load instruction, eliminate it by replacing it with a PHI node.  This is an
 /// important optimization that encourages jump threading, and needs to be run
@@ -836,6 +841,12 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   if (LoadBB->getSinglePredecessor())
     return false;
 
+  // If the load is defined in a landing pad, it can't be partially redundant,
+  // because the edges between the invoke and the landing pad cannot have other
+  // instructions between them.
+  if (LoadBB->isLandingPad())
+    return false;
+
   Value *LoadedPtr = LI->getOperand(0);
 
   // If the loaded operand is defined in the LoadBB, it can't be available.
@@ -1615,4 +1626,80 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
   return true;
 }
 
+/// TryToUnfoldSelect - Look for blocks of the form
+/// bb1:
+///   %a = select
+///   br bb
+///
+/// bb2:
+///   %p = phi [%a, %bb] ...
+///   %c = icmp %p
+///   br i1 %c
+///
+/// And expand the select into a branch structure if one of its arms allows %c
+/// to be folded. This later enables threading from bb1 over bb2.
+bool JumpThreading::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
+  BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+  PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
+  Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
+
+  if (!CondBr || !CondBr->isConditional() || !CondLHS ||
+      CondLHS->getParent() != BB)
+    return false;
+
+  for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
+    BasicBlock *Pred = CondLHS->getIncomingBlock(I);
+    SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
 
+    // Look if one of the incoming values is a select in the corresponding
+    // predecessor.
+    if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
+      continue;
+
+    BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+    if (!PredTerm || !PredTerm->isUnconditional())
+      continue;
+
+    // Now check if one of the select values would allow us to constant fold the
+    // terminator in BB. We don't do the transform if both sides fold, those
+    // cases will be threaded in any case.
+    LazyValueInfo::Tristate LHSFolds =
+        LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
+                                CondRHS, Pred, BB);
+    LazyValueInfo::Tristate RHSFolds =
+        LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
+                                CondRHS, Pred, BB);
+    if ((LHSFolds != LazyValueInfo::Unknown ||
+         RHSFolds != LazyValueInfo::Unknown) &&
+        LHSFolds != RHSFolds) {
+      // Expand the select.
+      //
+      // Pred --
+      //  |    v
+      //  |  NewBB
+      //  |    |
+      //  |-----
+      //  v
+      // BB
+      BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
+                                             BB->getParent(), BB);
+      // Move the unconditional branch to NewBB.
+      PredTerm->removeFromParent();
+      NewBB->getInstList().insert(NewBB->end(), PredTerm);
+      // Create a conditional branch and update PHI nodes.
+      BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
+      CondLHS->setIncomingValue(I, SI->getFalseValue());
+      CondLHS->addIncoming(SI->getTrueValue(), NewBB);
+      // The select is now dead.
+      SI->eraseFromParent();
+
+      // Update any other PHI nodes in BB.
+      for (BasicBlock::iterator BI = BB->begin();
+           PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
+        if (Phi != CondLHS)
+          Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
+      return true;
+    }
+  }
+  return false;
+}
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 0b62050..9e39d2e 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -51,8 +51,8 @@ namespace {
     }
 
   private:
-    bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks,
-                    SmallVector<BasicBlock*, 4> &exitBlocks,
+    bool isLoopDead(Loop *L, SmallVectorImpl<BasicBlock *> &exitingBlocks,
+                    SmallVectorImpl<BasicBlock *> &exitBlocks,
                     bool &Changed, BasicBlock *Preheader);
 
   };
@@ -77,8 +77,8 @@ Pass *llvm::createLoopDeletionPass() {
 /// checked for unique exit and exiting blocks, and that the code is in LCSSA
 /// form.
 bool LoopDeletion::isLoopDead(Loop *L,
-                              SmallVector<BasicBlock*, 4> &exitingBlocks,
-                              SmallVector<BasicBlock*, 4> &exitBlocks,
+                              SmallVectorImpl<BasicBlock *> &exitingBlocks,
+                              SmallVectorImpl<BasicBlock *> &exitBlocks,
                               bool &Changed, BasicBlock *Preheader) {
   BasicBlock *exitBlock = exitBlocks[0];
 
@@ -209,7 +209,7 @@ bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) {
     // Move all of the block's children to be children of the preheader, which
     // allows us to remove the domtree entry for the block.
     ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end());
-    for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(),
+    for (SmallVectorImpl<DomTreeNode *>::iterator DI = ChildNodes.begin(),
          DE = ChildNodes.end(); DI != DE; ++DI) {
       DT.changeImmediateDominator(*DI, DT[preheader]);
     }
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 8258719..952b76b 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -81,7 +81,7 @@ namespace {
     /// Return the condition of the branch terminating the given basic block.
     static Value *getBrCondtion(BasicBlock *);
 
-    /// Derive the precondition block (i.e the block that guards the loop 
+    /// Derive the precondition block (i.e the block that guards the loop
     /// preheader) from the given preheader.
     static BasicBlock *getPrecondBb(BasicBlock *PreHead);
   };
@@ -111,7 +111,7 @@ namespace {
     /// beween a variable and zero, and if the variable is non-zero, the
     /// control yeilds to the loop entry. If the branch matches the behavior,
     /// the variable involved in the comparion is returned. This function will
-    /// be called to see if the precondition and postcondition of the loop 
+    /// be called to see if the precondition and postcondition of the loop
     /// are in desirable form.
     Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
 
@@ -274,11 +274,11 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
 //
 //===----------------------------------------------------------------------===//
 
-// This fucntion will return true iff the given block contains nothing but goto. 
-// A typical usage of this function is to check if the preheader fucntion is 
-// "almost" empty such that generated intrinsic function can be moved across 
-// preheader and to be placed at the end of the preconditiona block without 
-// concerning of breaking data dependence.
+// This function will return true iff the given block contains nothing but goto.
+// A typical usage of this function is to check if the preheader function is
+// "almost" empty such that generated intrinsic functions can be moved across
+// the preheader and be placed at the end of the precondition block without
+// the concern of breaking data dependence.
 bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
   if (BranchInst *Br = getBranch(BB)) {
     return Br->isUnconditional() && BB->size() == 1;
@@ -314,7 +314,7 @@ bool NclPopcountRecognize::preliminaryScreen() {
   if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
     return false;
 
-  // Counting population are usually conducted by few arithmetic instrutions.
+  // Counting population are usually conducted by few arithmetic instructions.
   // Such instructions can be easilly "absorbed" by vacant slots in a
   // non-compact loop. Therefore, recognizing popcount idiom only makes sense
   // in a compact loop.
@@ -339,7 +339,7 @@ bool NclPopcountRecognize::preliminaryScreen() {
   PreCondBB = LIRUtil::getPrecondBb(PreHead);
   if (!PreCondBB)
     return false;
- 
+
   return true;
 }
 
@@ -504,7 +504,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
   // Assuming before transformation, the loop is following:
   //  if (x) // the precondition
   //     do { cnt++; x &= x - 1; } while(x);
- 
+
   // Step 1: Insert the ctpop instruction at the end of the precondition block
   IRBuilderTy Builder(PreCondBr);
   Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
@@ -611,7 +611,7 @@ void NclPopcountRecognize::transform(Instruction *CntInst,
   SE->forgetLoop(CurLoop);
 }
 
-CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, 
+CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder,
                                                       Value *Val, DebugLoc DL) {
   Value *Ops[] = { Val };
   Type *Tys[] = { Val->getType() };
@@ -667,13 +667,13 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
   if (!getDataLayout())
     return false;
 
-  // set DT 
+  // set DT
   (void)getDominatorTree();
 
   LoopInfo &LI = getAnalysis<LoopInfo>();
   TLI = &getAnalysis<TargetLibraryInfo>();
 
-  // set TLI 
+  // set TLI
   (void)getTargetLibraryInfo();
 
   SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -953,6 +953,8 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
   Value *SplatValue = isBytewiseValue(StoredVal);
   Constant *PatternValue = 0;
 
+  unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+
   // If we're allowed to form a memset, and the stored value would be acceptable
   // for memset, use it.
   if (SplatValue && TLI->has(LibFunc::memset) &&
@@ -961,8 +963,10 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
       CurLoop->isLoopInvariant(SplatValue)) {
     // Keep and use SplatValue.
     PatternValue = 0;
-  } else if (TLI->has(LibFunc::memset_pattern16) &&
+  } else if (DestAS == 0 &&
+             TLI->has(LibFunc::memset_pattern16) &&
              (PatternValue = getMemSetPatternValue(StoredVal, *TD))) {
+    // Don't create memset_pattern16s with address spaces.
     // It looks like we can use PatternValue!
     SplatValue = 0;
   } else {
@@ -978,20 +982,20 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
   IRBuilder<> Builder(Preheader->getTerminator());
   SCEVExpander Expander(*SE, "loop-idiom");
 
+  Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+
   // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
   // this into a memset in the loop preheader now if we want.  However, this
   // would be unsafe to do if there is anything else in the loop that may read
   // or write to the aliased location.  Check for any overlap by generating the
   // base pointer and checking the region.
-  unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
   Value *BasePtr =
-    Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
+    Expander.expandCodeFor(Ev->getStart(), DestInt8PtrTy,
                            Preheader->getTerminator());
 
-
   if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
                             CurLoop, BECount,
-                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
+                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) {
     Expander.clear();
     // If we generated new code for the base pointer, clean up.
     deleteIfDeadInstruction(BasePtr, *SE, TLI);
@@ -1002,27 +1006,35 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
-  Type *IntPtr = TD->getIntPtrType(DestPtr->getContext());
+  Type *IntPtr = Builder.getIntPtrTy(TD, DestAS);
   BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
 
   const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
                                          SCEV::FlagNUW);
-  if (StoreSize != 1)
+  if (StoreSize != 1) {
     NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
                                SCEV::FlagNUW);
+  }
 
   Value *NumBytes =
     Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
 
   CallInst *NewCall;
-  if (SplatValue)
-    NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment);
-  else {
+  if (SplatValue) {
+    NewCall = Builder.CreateMemSet(BasePtr,
+                                   SplatValue,
+                                   NumBytes,
+                                   StoreAlignment);
+  } else {
+    // Everything is emitted in default address space
+    Type *Int8PtrTy = DestInt8PtrTy;
+
     Module *M = TheStore->getParent()->getParent()->getParent();
     Value *MSP = M->getOrInsertFunction("memset_pattern16",
                                         Builder.getVoidTy(),
-                                        Builder.getInt8PtrTy(),
-                                        Builder.getInt8PtrTy(), IntPtr,
+                                        Int8PtrTy,
+                                        Int8PtrTy,
+                                        IntPtr,
                                         (void*)0);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
@@ -1032,7 +1044,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
                                             PatternValue, ".memset_pattern");
     GV->setUnnamedAddr(true); // Ok to merge these.
     GV->setAlignment(16);
-    Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy());
+    Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
     NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes);
   }
 
@@ -1108,17 +1120,17 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
   // pointer size if it isn't already.
-  Type *IntPtr = TD->getIntPtrType(SI->getContext());
-  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+  Type *IntPtrTy = Builder.getIntPtrTy(TD, SI->getPointerAddressSpace());
+  BECount = SE->getTruncateOrZeroExtend(BECount, IntPtrTy);
 
-  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1),
+  const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtrTy, 1),
                                          SCEV::FlagNUW);
   if (StoreSize != 1)
-    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+    NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize),
                                SCEV::FlagNUW);
 
   Value *NumBytes =
-    Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator());
+    Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator());
 
   CallInst *NewCall =
     Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes,
diff --git a/lib/Transforms/Scalar/LoopRerollPass.cpp b/lib/Transforms/Scalar/LoopRerollPass.cpp
new file mode 100644
index 0000000..335af81
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -0,0 +1,1184 @@
+//===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reroll"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
+  cl::desc("The maximum increment for loop rerolling"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+//   for (int i = 0; i < 500; i += 3) {
+//     foo(i);
+//     foo(i+1);
+//     foo(i+2);
+//   }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+//   for (int i = 0; i < 500; ++i)
+//     foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1                <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2                <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+//   for (int i = 0; i < 500; ++i) {
+//     x[3*i] = foo(0);
+//     x[3*i+1] = foo(0);
+//     x[3*i+2] = foo(0);
+//   }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+//   for (int i = 0; i < 1500; ++i)
+//     x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
+namespace {
+  class LoopReroll : public LoopPass {
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopReroll() : LoopPass(ID) {
+      initializeLoopRerollPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addPreserved<DominatorTree>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<TargetLibraryInfo>();
+    }
+
+protected:
+    AliasAnalysis *AA;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
+    DataLayout *DL;
+    TargetLibraryInfo *TLI;
+    DominatorTree *DT;
+
+    typedef SmallVector<Instruction *, 16> SmallInstructionVector;
+    typedef SmallSet<Instruction *, 16>   SmallInstructionSet;
+
+    // A chain of isomorphic instructions, indentified by a single-use PHI,
+    // representing a reduction. Only the last value may be used outside the
+    // loop.
+    struct SimpleLoopReduction {
+      SimpleLoopReduction(Instruction *P, Loop *L)
+        : Valid(false), Instructions(1, P) {
+        assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+        add(L);
+      }
+
+      bool valid() const {
+        return Valid;
+      }
+
+      Instruction *getPHI() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.front();
+      }
+
+      Instruction *getReducedValue() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.back();
+      }
+
+      Instruction *get(size_t i) const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions[i+1];
+      }
+
+      Instruction *operator [] (size_t i) const { return get(i); }
+
+      // The size, ignoring the initial PHI.
+      size_t size() const {
+        assert(Valid && "Using invalid reduction");
+        return Instructions.size()-1;
+      }
+
+      typedef SmallInstructionVector::iterator iterator;
+      typedef SmallInstructionVector::const_iterator const_iterator;
+
+      iterator begin() {
+        assert(Valid && "Using invalid reduction");
+        return llvm::next(Instructions.begin());
+      }
+
+      const_iterator begin() const {
+        assert(Valid && "Using invalid reduction");
+        return llvm::next(Instructions.begin());
+      }
+
+      iterator end() { return Instructions.end(); }
+      const_iterator end() const { return Instructions.end(); }
+
+    protected:
+      bool Valid;
+      SmallInstructionVector Instructions;
+
+      void add(Loop *L);
+    };
+
+    // The set of all reductions, and state tracking of possible reductions
+    // during loop instruction processing.
+    struct ReductionTracker {
+      typedef SmallVector<SimpleLoopReduction, 16> SmallReductionVector;
+
+      // Add a new possible reduction.
+      void addSLR(SimpleLoopReduction &SLR) {
+        PossibleReds.push_back(SLR);
+      }
+
+      // Setup to track possible reductions corresponding to the provided
+      // rerolling scale. Only reductions with a number of non-PHI instructions
+      // that is divisible by the scale are considered. Three instructions sets
+      // are filled in:
+      //   - A set of all possible instructions in eligible reductions.
+      //   - A set of all PHIs in eligible reductions
+      //   - A set of all reduced values (last instructions) in eligible reductions.
+      void restrictToScale(uint64_t Scale,
+                           SmallInstructionSet &PossibleRedSet,
+                           SmallInstructionSet &PossibleRedPHISet,
+                           SmallInstructionSet &PossibleRedLastSet) {
+        PossibleRedIdx.clear();
+        PossibleRedIter.clear();
+        Reds.clear();
+
+        for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+          if (PossibleReds[i].size() % Scale == 0) {
+            PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+            PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+      
+            PossibleRedSet.insert(PossibleReds[i].getPHI());
+            PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+            for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
+                 JE = PossibleReds[i].end(); J != JE; ++J) {
+              PossibleRedSet.insert(*J);
+              PossibleRedIdx[*J] = i;
+            }
+          }
+      }
+
+      // The functions below are used while processing the loop instructions.
+
+      // Are the two instructions both from reductions, and furthermore, from
+      // the same reduction?
+      bool isPairInSame(Instruction *J1, Instruction *J2) {
+        DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+        if (J1I != PossibleRedIdx.end()) {
+          DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+          if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+            return true;
+        }
+
+        return false;
+      }
+
+      // The two provided instructions, the first from the base iteration, and
+      // the second from iteration i, form a matched pair. If these are part of
+      // a reduction, record that fact.
+      void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+        if (PossibleRedIdx.count(J1)) {
+          assert(PossibleRedIdx.count(J2) &&
+                 "Recording reduction vs. non-reduction instruction?");
+
+          PossibleRedIter[J1] = 0;
+          PossibleRedIter[J2] = i;
+
+          int Idx = PossibleRedIdx[J1];
+          assert(Idx == PossibleRedIdx[J2] &&
+                 "Recording pair from different reductions?");
+          Reds.insert(Idx);
+        }
+      }
+
+      // The functions below can be called after we've finished processing all
+      // instructions in the loop, and we know which reductions were selected.
+
+      // Is the provided instruction the PHI of a reduction selected for
+      // rerolling?
+      bool isSelectedPHI(Instruction *J) {
+        if (!isa<PHINode>(J))
+          return false;
+
+        for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+             RI != RIE; ++RI) {
+          int i = *RI;
+          if (cast<Instruction>(J) == PossibleReds[i].getPHI())
+            return true;
+        }
+
+        return false;
+      }
+
+      bool validateSelected();
+      void replaceSelected();
+
+    protected:
+      // The vector of all possible reductions (for any scale).
+      SmallReductionVector PossibleReds;
+
+      DenseMap<Instruction *, int> PossibleRedIdx;
+      DenseMap<Instruction *, int> PossibleRedIter;
+      DenseSet<int> Reds;
+    };
+
+    void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+    void collectPossibleReductions(Loop *L,
+           ReductionTracker &Reductions);
+    void collectInLoopUserSet(Loop *L,
+           const SmallInstructionVector &Roots,
+           const SmallInstructionSet &Exclude,
+           const SmallInstructionSet &Final,
+           DenseSet<Instruction *> &Users);
+    void collectInLoopUserSet(Loop *L,
+           Instruction * Root,
+           const SmallInstructionSet &Exclude,
+           const SmallInstructionSet &Final,
+           DenseSet<Instruction *> &Users);
+    bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
+                          Instruction *&IV,
+                          SmallInstructionVector &LoopIncs);
+    bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV,
+                         SmallVector<SmallInstructionVector, 32> &Roots,
+                         SmallInstructionSet &AllRoots,
+                         SmallInstructionVector &LoopIncs);
+    bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
+                ReductionTracker &Reductions);
+  };
+}
+
+char LoopReroll::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
+
+Pass *llvm::createLoopRerollPass() {
+  return new LoopReroll;
+}
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+  for (Value::use_iterator UI = I->use_begin(),
+       UIE = I->use_end(); UI != UIE; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    if (!L->contains(User))
+      return true;
+  }
+
+  return false;
+}
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+                                    SmallInstructionVector &PossibleIVs) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    if (!isa<PHINode>(I))
+      continue;
+    if (!I->getType()->isIntegerTy())
+      continue;
+
+    if (const SCEVAddRecExpr *PHISCEV =
+        dyn_cast<SCEVAddRecExpr>(SE->getSCEV(I))) {
+      if (PHISCEV->getLoop() != L)
+        continue;
+      if (!PHISCEV->isAffine())
+        continue;
+      if (const SCEVConstant *IncSCEV =
+          dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE))) {
+        if (!IncSCEV->getValue()->getValue().isStrictlyPositive())
+          continue;
+        if (IncSCEV->getValue()->uge(MaxInc))
+          continue;
+
+        DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " <<
+              *PHISCEV << "\n");
+        PossibleIVs.push_back(I);
+      }
+    }
+  }
+}
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+  assert(!Valid && "Cannot add to an already-valid chain");
+
+  // The reduction variable must be a chain of single-use instructions
+  // (including the PHI), except for the last value (which is used by the PHI
+  // and also outside the loop).
+  Instruction *C = Instructions.front();
+
+  do {
+    C = cast<Instruction>(*C->use_begin());
+    if (C->hasOneUse()) {
+      if (!C->isBinaryOp())
+        return;
+
+      if (!(isa<PHINode>(Instructions.back()) ||
+            C->isSameOperationAs(Instructions.back())))
+        return;
+
+      Instructions.push_back(C);
+    }
+  } while (C->hasOneUse());
+
+  if (Instructions.size() < 2 ||
+      !C->isSameOperationAs(Instructions.back()) ||
+      C->use_begin() == C->use_end())
+    return;
+
+  // C is now the (potential) last instruction in the reduction chain.
+  for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end();
+       UI != UIE; ++UI) {
+    // The only in-loop user can be the initial PHI.
+    if (L->contains(cast<Instruction>(*UI)))
+      if (cast<Instruction>(*UI ) != Instructions.front())
+        return;
+  }
+
+  Instructions.push_back(C);
+  Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+  ReductionTracker &Reductions) {
+  BasicBlock *Header = L->getHeader();
+  for (BasicBlock::iterator I = Header->begin(),
+       IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+    if (!isa<PHINode>(I))
+      continue;
+    if (!I->getType()->isSingleValueType())
+      continue;
+
+    SimpleLoopReduction SLR(I, L);
+    if (!SLR.valid())
+      continue;
+
+    DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
+          SLR.size() << " chained instructions)\n");
+    Reductions.addSLR(SLR);
+  }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+//   1. Instructions in the set of excluded instructions are never added to the
+//   use set (even if they are users). This is used, for example, to exclude
+//   including root increments in the use set of the primary IV.
+//
+//   2. Instructions in the set of final instructions are added to the use set
+//   if they are users, but their users are not added. This is used, for
+//   example, to prevent a reduction update from forcing all later reduction
+//   updates into the use set.
+void LoopReroll::collectInLoopUserSet(Loop *L,
+  Instruction *Root, const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  SmallInstructionVector Queue(1, Root);
+  while (!Queue.empty()) {
+    Instruction *I = Queue.pop_back_val();
+    if (!Users.insert(I).second)
+      continue;
+
+    if (!Final.count(I))
+      for (Value::use_iterator UI = I->use_begin(),
+           UIE = I->use_end(); UI != UIE; ++UI) {
+        Instruction *User = cast<Instruction>(*UI);
+        if (PHINode *PN = dyn_cast<PHINode>(User)) {
+          // Ignore "wrap-around" uses to PHIs of this loop's header.
+          if (PN->getIncomingBlock(UI) == L->getHeader())
+            continue;
+        }
+  
+        if (L->contains(User) && !Exclude.count(User)) {
+          Queue.push_back(User);
+        }
+      }
+
+    // We also want to collect single-user "feeder" values.
+    for (User::op_iterator OI = I->op_begin(),
+         OIE = I->op_end(); OI != OIE; ++OI) {
+      if (Instruction *Op = dyn_cast<Instruction>(*OI))
+        if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+            !Final.count(Op))
+          Queue.push_back(Op);
+    }
+  }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::collectInLoopUserSet(Loop *L,
+  const SmallInstructionVector &Roots,
+  const SmallInstructionSet &Exclude,
+  const SmallInstructionSet &Final,
+  DenseSet<Instruction *> &Users) {
+  for (SmallInstructionVector::const_iterator I = Roots.begin(),
+       IE = Roots.end(); I != IE; ++I)
+    collectInLoopUserSet(L, *I, Exclude, Final, Users);
+}
+
+static bool isSimpleLoadStore(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isSimple();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isSimple();
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+    return !MI->isVolatile();
+  return false;
+}
+
+// Recognize loops that are setup like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs.
+bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
+                                  Instruction *&IV,
+                                  SmallInstructionVector &LoopIncs) {
+  // This is a special case: here we're looking for all uses (except for
+  // the increment) to be multiplied by a common factor. The increment must
+  // be by one. This is to capture loops like:
+  //   for (int i = 0; i < 500; ++i) {
+  //     foo(3*i); foo(3*i+1); foo(3*i+2);
+  //   }
+  if (RealIV->getNumUses() != 2)
+    return false;
+  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(RealIV));
+  Instruction *User1 = cast<Instruction>(*RealIV->use_begin()),
+              *User2 = cast<Instruction>(*llvm::next(RealIV->use_begin()));
+  if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
+    return false;
+  const SCEVAddRecExpr *User1SCEV =
+                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User1)),
+                       *User2SCEV =
+                         dyn_cast<SCEVAddRecExpr>(SE->getSCEV(User2));
+  if (!User1SCEV || !User1SCEV->isAffine() ||
+      !User2SCEV || !User2SCEV->isAffine())
+    return false;
+
+  // We assume below that User1 is the scale multiply and User2 is the
+  // increment. If this can't be true, then swap them.
+  if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) {
+    std::swap(User1, User2);
+    std::swap(User1SCEV, User2SCEV);
+  }
+
+  if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE))
+    return false;
+  assert(User2SCEV->getStepRecurrence(*SE)->isOne() &&
+         "Invalid non-unit step for multiplicative scaling");
+  LoopIncs.push_back(User2);
+
+  if (const SCEVConstant *MulScale =
+      dyn_cast<SCEVConstant>(User1SCEV->getStepRecurrence(*SE))) {
+    // Make sure that both the start and step have the same multiplier.
+    if (RealIVSCEV->getStart()->getType() != MulScale->getType())
+      return false;
+    if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) !=
+        User1SCEV->getStart())
+      return false;
+
+    ConstantInt *MulScaleCI = MulScale->getValue();
+    if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc))
+      return false;
+    Scale = MulScaleCI->getZExtValue();
+    IV = User1;
+  } else
+    return false;
+
+  DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n");
+  return true;
+}
+
+// Collect all root increments with respect to the provided induction variable
+// (normally the PHI, but sometimes a multiply). A root increment is an
+// instruction, normally an add, with a positive constant less than Scale. In a
+// rerollable loop, each of these increments is the root of an instruction
+// graph isomorphic to the others. Also, we collect the final induction
+// increment (the increment equal to the Scale), and its users in LoopIncs.
+bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale,
+                                 Instruction *IV,
+                                 SmallVector<SmallInstructionVector, 32> &Roots,
+                                 SmallInstructionSet &AllRoots,
+                                 SmallInstructionVector &LoopIncs) {
+  for (Value::use_iterator UI = IV->use_begin(),
+       UIE = IV->use_end(); UI != UIE; ++UI) {
+    Instruction *User = cast<Instruction>(*UI);
+    if (!SE->isSCEVable(User->getType()))
+      continue;
+    if (User->getType() != IV->getType())
+      continue;
+    if (!L->contains(User))
+      continue;
+    if (hasUsesOutsideLoop(User, L))
+      continue;
+
+    if (const SCEVConstant *Diff = dyn_cast<SCEVConstant>(SE->getMinusSCEV(
+          SE->getSCEV(User), SE->getSCEV(IV)))) {
+      uint64_t Idx = Diff->getValue()->getValue().getZExtValue();
+      if (Idx > 0 && Idx < Scale) {
+        Roots[Idx-1].push_back(User);
+        AllRoots.insert(User);
+      } else if (Idx == Scale && Inc > 1) {
+        LoopIncs.push_back(User);
+      }
+    }
+  }
+
+  if (Roots[0].empty())
+    return false;
+  bool AllSame = true;
+  for (unsigned i = 1; i < Scale-1; ++i)
+    if (Roots[i].size() != Roots[0].size()) {
+      AllSame = false;
+      break;
+    }
+
+  if (!AllSame)
+    return false;
+
+  return true;
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+  // For a non-associative reduction, the chain entries must appear in order.
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int PrevIter = 0, BaseCount = 0, Count = 0;
+    for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
+         JE = PossibleReds[i].end(); J != JE; ++J) {
+	// Note that all instructions in the chain must have been found because
+	// all instructions in the function must have been assigned to some
+	// iteration.
+      int Iter = PossibleRedIter[*J];
+      if (Iter != PrevIter && Iter != PrevIter + 1 &&
+          !PossibleReds[i].getReducedValue()->isAssociative()) {
+        DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
+                        *J << "\n");
+        return false;
+      }
+
+      if (Iter != PrevIter) {
+        if (Count != BaseCount) {
+          DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
+                " reduction use count " << Count <<
+                " is not equal to the base use count " <<
+                BaseCount << "\n");
+          return false;
+        }
+
+        Count = 0;
+      }
+
+      ++Count;
+      if (Iter == 0)
+        ++BaseCount;
+
+      PrevIter = Iter;
+    }
+  }
+
+  return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+  // Fixup reductions to refer to the last instruction associated with the
+  // first iteration (not the last).
+  for (DenseSet<int>::iterator RI = Reds.begin(), RIE = Reds.end();
+       RI != RIE; ++RI) {
+    int i = *RI;
+    int j = 0;
+    for (int e = PossibleReds[i].size(); j != e; ++j)
+      if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+        --j;
+        break;
+      }
+
+    // Replace users with the new end-of-chain value.
+    SmallInstructionVector Users;
+    for (Value::use_iterator UI =
+           PossibleReds[i].getReducedValue()->use_begin(),
+         UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI)
+      Users.push_back(cast<Instruction>(*UI));
+
+    for (SmallInstructionVector::iterator J = Users.begin(),
+         JE = Users.end(); J != JE; ++J)
+      (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+                              PossibleReds[i][j]);
+  }
+}
+
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1                <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2                <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1  <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+// be intermixed with eachother. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+//   - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+//     the next unmatched instruction in f(%iv.(i+1)).
+//   - Ensure that both matched instructions don't have any external users
+//     (with the exception of last-in-chain reduction instructions).
+//   - Track the (aliasing) write set, and other side effects, of all
+//     instructions that belong to future iterations that come before the matched
+//     instructions. If the matched instructions read from that write set, then
+//     f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+//     f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+//     if any of these future instructions had side effects (could not be
+//     speculatively executed), and so do the matched instructions, when we
+//     cannot reorder those side-effect-producing instructions, and rerolling
+//     fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+                        const SCEV *IterCount,
+                        ReductionTracker &Reductions) {
+  const SCEVAddRecExpr *RealIVSCEV = cast<SCEVAddRecExpr>(SE->getSCEV(IV));
+  uint64_t Inc = cast<SCEVConstant>(RealIVSCEV->getOperand(1))->
+                   getValue()->getZExtValue();
+  // The collection of loop increment instructions.
+  SmallInstructionVector LoopIncs;
+  uint64_t Scale = Inc;
+
+  // The effective induction variable, IV, is normally also the real induction
+  // variable. When we're dealing with a loop like:
+  //   for (int i = 0; i < 500; ++i)
+  //     x[3*i] = ...;
+  //     x[3*i+1] = ...;
+  //     x[3*i+2] = ...;
+  // then the real IV is still i, but the effective IV is (3*i).
+  Instruction *RealIV = IV;
+  if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
+    return false;
+
+  assert(Scale <= MaxInc && "Scale is too large");
+  assert(Scale > 1 && "Scale must be at least 2");
+
+  // The set of increment instructions for each increment value.
+  SmallVector<SmallInstructionVector, 32> Roots(Scale-1);
+  SmallInstructionSet AllRoots;
+  if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
+    return false;
+
+  DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
+                  *RealIV << "\n");
+
+  // An array of just the possible reductions for this scale factor. When we
+  // collect the set of all users of some root instructions, these reduction
+  // instructions are treated as 'final' (their uses are not considered).
+  // This is important because we don't want the root use set to search down
+  // the reduction chain.
+  SmallInstructionSet PossibleRedSet;
+  SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
+  Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
+                             PossibleRedLastSet);
+
+  // We now need to check for equivalence of the use graph of each root with
+  // that of the primary induction variable (excluding the roots). Our goal
+  // here is not to solve the full graph isomorphism problem, but rather to
+  // catch common cases without a lot of work. As a result, we will assume
+  // that the relative order of the instructions in each unrolled iteration
+  // is the same (although we will not make an assumption about how the
+  // different iterations are intermixed). Note that while the order must be
+  // the same, the instructions may not be in the same basic block.
+  SmallInstructionSet Exclude(AllRoots);
+  Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+  DenseSet<Instruction *> BaseUseSet;
+  collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);
+
+  DenseSet<Instruction *> AllRootUses;
+  std::vector<DenseSet<Instruction *> > RootUseSets(Scale-1);
+
+  bool MatchFailed = false;
+  for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
+    DenseSet<Instruction *> &RootUseSet = RootUseSets[i];
+    collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
+                         PossibleRedSet, RootUseSet);
+
+    DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
+                    " vs. iteration increment " << (i+1) <<
+                    " use set size: " << RootUseSet.size() << "\n");
+
+    if (BaseUseSet.size() != RootUseSet.size()) {
+      MatchFailed = true;
+      break;
+    }
+
+    // In addition to regular aliasing information, we need to look for
+    // instructions from later (future) iterations that have side effects
+    // preventing us from reordering them past other instructions with side
+    // effects.
+    bool FutureSideEffects = false;
+    AliasSetTracker AST(*AA);
+
+    // The map between instructions in f(%iv.(i+1)) and f(%iv).
+    DenseMap<Value *, Value *> BaseMap;
+
+    assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
+    for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
+         JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
+      if (cast<Instruction>(J1) == RealIV)
+        continue;
+      if (cast<Instruction>(J1) == IV)
+        continue;
+      if (!BaseUseSet.count(J1))
+        continue;
+      if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
+        continue;
+
+      while (J2 != JE && (!RootUseSet.count(J2) ||
+             std::find(Roots[i].begin(), Roots[i].end(), J2) !=
+               Roots[i].end())) {
+        // As we iterate through the instructions, instructions that don't
+        // belong to previous iterations (or the base case), must belong to
+        // future iterations. We want to track the alias set of writes from
+        // previous iterations.
+        if (!isa<PHINode>(J2) && !BaseUseSet.count(J2) &&
+            !AllRootUses.count(J2)) {
+          if (J2->mayWriteToMemory())
+            AST.add(J2);
+
+          // Note: This is specifically guarded by a check on isa<PHINode>,
+          // which while a valid (somewhat arbitrary) micro-optimization, is
+          // needed because otherwise isSafeToSpeculativelyExecute returns
+          // false on PHI nodes.
+          if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
+            FutureSideEffects = true; 
+        }
+
+        ++J2;
+      }
+
+      if (!J1->isSameOperationAs(J2)) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 << "\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // Make sure that this instruction, which is in the use set of this
+      // root instruction, does not also belong to the base set or the set of
+      // some previous root instruction.
+      if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 << " (prev. case overlap)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // Make sure that we don't alias with any instruction in the alias set
+      // tracker. If we do, then we depend on a future iteration, and we
+      // can't reroll.
+      if (J2->mayReadFromMemory()) {
+        for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
+             K != KE && !MatchFailed; ++K) {
+          if (K->aliasesUnknownInst(J2, *AA)) {
+            DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                            " vs. " << *J2 << " (depends on future store)\n");
+            MatchFailed = true;
+            break;
+          }
+        }
+      }
+
+      // If we've past an instruction from a future iteration that may have
+      // side effects, and this instruction might also, then we can't reorder
+      // them, and this matching fails. As an exception, we allow the alias
+      // set tracker to handle regular (simple) load/store dependencies.
+      if (FutureSideEffects &&
+            ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) ||
+             (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 <<
+                        " (side effects prevent reordering)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      // For instructions that are part of a reduction, if the operation is
+      // associative, then don't bother matching the operands (because we
+      // already know that the instructions are isomorphic, and the order
+      // within the iteration does not matter). For non-associative reductions,
+      // we do need to match the operands, because we need to reject
+      // out-of-order instructions within an iteration!
+      // For example (assume floating-point addition), we need to reject this:
+      //   x += a[i]; x += b[i];
+      //   x += a[i+1]; x += b[i+1];
+      //   x += b[i+2]; x += a[i+2];
+      bool InReduction = Reductions.isPairInSame(J1, J2);
+
+      if (!(InReduction && J1->isAssociative())) {
+        bool Swapped = false, SomeOpMatched = false;;
+        for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
+          Value *Op2 = J2->getOperand(j);
+
+	  // If this is part of a reduction (and the operation is not
+	  // associatve), then we match all operands, but not those that are
+	  // part of the reduction.
+          if (InReduction)
+            if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+              if (Reductions.isPairInSame(J2, Op2I))
+                continue;
+
+          DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+          if (BMI != BaseMap.end())
+            Op2 = BMI->second;
+          else if (std::find(Roots[i].begin(), Roots[i].end(),
+                             (Instruction*) Op2) != Roots[i].end())
+            Op2 = IV;
+
+          if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+	    // If we've not already decided to swap the matched operands, and
+	    // we've not already matched our first operand (note that we could
+	    // have skipped matching the first operand because it is part of a
+	    // reduction above), and the instruction is commutative, then try
+	    // the swapped match.
+            if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
+                J1->getOperand(!j) == Op2) {
+              Swapped = true;
+            } else {
+              DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                              " vs. " << *J2 << " (operand " << j << ")\n");
+              MatchFailed = true;
+              break;
+            }
+          }
+
+          SomeOpMatched = true;
+        }
+      }
+
+      if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
+          (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
+        DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
+                        " vs. " << *J2 << " (uses outside loop)\n");
+        MatchFailed = true;
+        break;
+      }
+
+      if (!MatchFailed)
+        BaseMap.insert(std::pair<Value *, Value *>(J2, J1));
+
+      AllRootUses.insert(J2);
+      Reductions.recordPair(J1, J2, i+1);
+
+      ++J2;
+    }
+  }
+
+  if (MatchFailed)
+    return false;
+
+  DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
+                  *RealIV << "\n");
+
+  DenseSet<Instruction *> LoopIncUseSet;
+  collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
+                       SmallInstructionSet(), LoopIncUseSet);
+  DEBUG(dbgs() << "LRR: Loop increment set size: " <<
+                  LoopIncUseSet.size() << "\n");
+
+  // Make sure that all instructions in the loop have been included in some
+  // use set.
+  for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
+       J != JE; ++J) {
+    if (isa<DbgInfoIntrinsic>(J))
+      continue;
+    if (cast<Instruction>(J) == RealIV)
+      continue;
+    if (cast<Instruction>(J) == IV)
+      continue;
+    if (BaseUseSet.count(J) || AllRootUses.count(J) ||
+        (LoopIncUseSet.count(J) && (J->isTerminator() ||
+                                    isSafeToSpeculativelyExecute(J, DL))))
+      continue;
+
+    if (AllRoots.count(J))
+      continue;
+
+    if (Reductions.isSelectedPHI(J))
+      continue;
+
+    DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
+                    " unprocessed instruction found: " << *J << "\n");
+    MatchFailed = true;
+    break;
+  }
+
+  if (MatchFailed)
+    return false;
+
+  DEBUG(dbgs() << "LRR: all instructions processed from " <<
+                  *RealIV << "\n");
+
+  if (!Reductions.validateSelected())
+    return false;
+
+  // At this point, we've validated the rerolling, and we're committed to
+  // making changes!
+
+  Reductions.replaceSelected();
+
+  // Remove instructions associated with non-base iterations.
+  for (BasicBlock::reverse_iterator J = Header->rbegin();
+       J != Header->rend();) {
+    if (AllRootUses.count(&*J)) {
+      Instruction *D = &*J;
+      DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
+      D->eraseFromParent();
+      continue;
+    }
+
+    ++J; 
+  }
+
+  // Insert the new induction variable.
+  const SCEV *Start = RealIVSCEV->getStart();
+  if (Inc == 1)
+    Start = SE->getMulExpr(Start,
+                           SE->getConstant(Start->getType(), Scale));
+  const SCEVAddRecExpr *H =
+    cast<SCEVAddRecExpr>(SE->getAddRecExpr(Start,
+                           SE->getConstant(RealIVSCEV->getType(), 1),
+                           L, SCEV::FlagAnyWrap));
+  { // Limit the lifetime of SCEVExpander.
+    SCEVExpander Expander(*SE, "reroll");
+    PHINode *NewIV =
+      cast<PHINode>(Expander.expandCodeFor(H, IV->getType(),
+                                           Header->begin()));
+    for (DenseSet<Instruction *>::iterator J = BaseUseSet.begin(),
+         JE = BaseUseSet.end(); J != JE; ++J)
+      (*J)->replaceUsesOfWith(IV, NewIV);
+
+    if (BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator())) {
+      if (LoopIncUseSet.count(BI)) {
+        const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
+        if (Inc == 1)
+          ICSCEV =
+            SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));
+        Value *IC;
+        if (isa<SCEVConstant>(ICSCEV)) {
+          IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI);
+        } else {
+          BasicBlock *Preheader = L->getLoopPreheader();
+          if (!Preheader)
+            Preheader = InsertPreheaderForLoop(L, this);
+
+          IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(),
+                                      Preheader->getTerminator());
+        }
+ 
+        Value *NewIVNext = NewIV->getIncomingValueForBlock(Header); 
+        Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC,
+                                   "exitcond");
+        BI->setCondition(Cond);
+
+        if (BI->getSuccessor(1) != Header)
+          BI->swapSuccessors();
+      }
+    }
+  }
+
+  SimplifyInstructionsInBlock(Header, DL, TLI);
+  DeleteDeadPHIs(Header, TLI);
+  ++NumRerolledLoops;
+  return true;
+}
+
+bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  AA = &getAnalysis<AliasAnalysis>();
+  LI = &getAnalysis<LoopInfo>();
+  SE = &getAnalysis<ScalarEvolution>();
+  TLI = &getAnalysis<TargetLibraryInfo>();
+  DL = getAnalysisIfAvailable<DataLayout>();
+  DT = &getAnalysis<DominatorTree>();
+
+  BasicBlock *Header = L->getHeader();
+  DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
+        "] Loop %" << Header->getName() << " (" <<
+        L->getNumBlocks() << " block(s))\n");
+
+  bool Changed = false;
+
+  // For now, we'll handle only single BB loops.
+  if (L->getNumBlocks() > 1)
+    return Changed;
+
+  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+    return Changed;
+
+  const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
+  const SCEV *IterCount =
+    SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1));
+  DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
+
+  // First, we need to find the induction variable with respect to which we can
+  // reroll (there may be several possible options).
+  SmallInstructionVector PossibleIVs;
+  collectPossibleIVs(L, PossibleIVs);
+
+  if (PossibleIVs.empty()) {
+    DEBUG(dbgs() << "LRR: No possible IVs found\n");
+    return Changed;
+  }
+
+  ReductionTracker Reductions;
+  collectPossibleReductions(L, Reductions);
+
+  // For each possible IV, collect the associated possible set of 'root' nodes
+  // (i+1, i+2, etc.).
+  for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
+       IE = PossibleIVs.end(); I != IE; ++I)
+    if (reroll(*I, L, Header, IterCount, Reductions)) {
+      Changed = true;
+      break;
+    }
+
+  return Changed;
+}
+
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 73e44d7..eff5268 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -774,6 +774,16 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
 }
 
 namespace {
+class LSRUse;
+}
+// Check if it is legal to fold 2 base registers.
+static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
+                             const Formula &F);
+// Get the cost of the scaling factor used in F for LU.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                     const LSRUse &LU, const Formula &F);
+
+namespace {
 
 /// Cost - This class is used to measure and compare candidate formulae.
 class Cost {
@@ -785,11 +795,12 @@ class Cost {
   unsigned NumBaseAdds;
   unsigned ImmCost;
   unsigned SetupCost;
+  unsigned ScaleCost;
 
 public:
   Cost()
     : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0) {}
+      SetupCost(0), ScaleCost(0) {}
 
   bool operator<(const Cost &Other) const;
 
@@ -799,9 +810,9 @@ public:
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
-             | ImmCost | SetupCost) != ~0u)
+             | ImmCost | SetupCost | ScaleCost) != ~0u)
       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
-           & ImmCost & SetupCost) == ~0u);
+           & ImmCost & SetupCost & ScaleCost) == ~0u);
   }
 #endif
 
@@ -810,12 +821,14 @@ public:
     return NumRegs == ~0u;
   }
 
-  void RateFormula(const Formula &F,
+  void RateFormula(const TargetTransformInfo &TTI,
+                   const Formula &F,
                    SmallPtrSet<const SCEV *, 16> &Regs,
                    const DenseSet<const SCEV *> &VisitedRegs,
                    const Loop *L,
                    const SmallVectorImpl<int64_t> &Offsets,
                    ScalarEvolution &SE, DominatorTree &DT,
+                   const LSRUse &LU,
                    SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
 
   void print(raw_ostream &OS) const;
@@ -900,12 +913,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
   }
 }
 
-void Cost::RateFormula(const Formula &F,
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+                       const Formula &F,
                        SmallPtrSet<const SCEV *, 16> &Regs,
                        const DenseSet<const SCEV *> &VisitedRegs,
                        const Loop *L,
                        const SmallVectorImpl<int64_t> &Offsets,
                        ScalarEvolution &SE, DominatorTree &DT,
+                       const LSRUse &LU,
                        SmallPtrSet<const SCEV *, 16> *LoserRegs) {
   // Tally up the registers.
   if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -932,7 +947,12 @@ void Cost::RateFormula(const Formula &F,
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
   if (NumBaseParts > 1)
-    NumBaseAdds += NumBaseParts - 1;
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F));
+
+  // Accumulate non-free scaling amounts.
+  ScaleCost += getScalingFactorCost(TTI, LU, F);
 
   // Tally up the non-zero immediates.
   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@@ -955,6 +975,7 @@ void Cost::Loose() {
   NumBaseAdds = ~0u;
   ImmCost = ~0u;
   SetupCost = ~0u;
+  ScaleCost = ~0u;
 }
 
 /// operator< - Choose the lower cost.
@@ -967,6 +988,8 @@ bool Cost::operator<(const Cost &Other) const {
     return NumIVMuls < Other.NumIVMuls;
   if (NumBaseAdds != Other.NumBaseAdds)
     return NumBaseAdds < Other.NumBaseAdds;
+  if (ScaleCost != Other.ScaleCost)
+    return ScaleCost < Other.ScaleCost;
   if (ImmCost != Other.ImmCost)
     return ImmCost < Other.ImmCost;
   if (SetupCost != Other.SetupCost)
@@ -983,6 +1006,8 @@ void Cost::print(raw_ostream &OS) const {
   if (NumBaseAdds != 0)
     OS << ", plus " << NumBaseAdds << " base add"
        << (NumBaseAdds == 1 ? "" : "s");
+  if (ScaleCost != 0)
+    OS << ", plus " << ScaleCost << " scale cost";
   if (ImmCost != 0)
     OS << ", plus " << ImmCost << " imm cost";
   if (SetupCost != 0)
@@ -1145,6 +1170,13 @@ public:
   /// may be used.
   bool AllFixupsOutsideLoop;
 
+  /// RigidFormula is set to true to guarantee that this use will be associated
+  /// with a single formula--the one that initially matched. Some SCEV
+  /// expressions cannot be expanded. This allows LSR to consider the registers
+  /// used by those expressions without the need to expand them later after
+  /// changing the formula.
+  bool RigidFormula;
+
   /// WidestFixupType - This records the widest use type for any fixup using
   /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
   /// max fixup widths to be equivalent, because the narrower one may be relying
@@ -1163,6 +1195,7 @@ public:
                                       MinOffset(INT64_MAX),
                                       MaxOffset(INT64_MIN),
                                       AllFixupsOutsideLoop(true),
+                                      RigidFormula(false),
                                       WidestFixupType(0) {}
 
   bool HasFormulaWithSameRegs(const Formula &F) const;
@@ -1189,6 +1222,9 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
 /// InsertFormula - If the given formula has not yet been inserted, add it to
 /// the list, and return true. Return false otherwise.
 bool LSRUse::InsertFormula(const Formula &F) {
+  if (!Formulae.empty() && RigidFormula)
+    return false;
+
   SmallVector<const SCEV *, 4> Key = F.BaseRegs;
   if (F.ScaledReg) Key.push_back(F.ScaledReg);
   // Unstable sort by host order ok, because this is only used for uniquifying.
@@ -1359,6 +1395,66 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
                     F.BaseOffset, F.HasBaseReg, F.Scale);
 }
 
+static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
+                             const Formula &F) {
+  // If F is used as an Addressing Mode, it may fold one Base plus one
+  // scaled register. If the scaled register is nil, do as if another
+  // element of the base regs is a 1-scaled register.
+  // This is possible if BaseRegs has at least 2 registers.
+
+  // If this is not an address calculation, this is not an addressing mode
+  // use.
+  if (LU.Kind !=  LSRUse::Address)
+    return false;
+
+  // F is already scaled.
+  if (F.Scale != 0)
+    return false;
+
+  // We need to keep one register for the base and one to scale.
+  if (F.BaseRegs.size() < 2)
+    return false;
+
+  return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+                    F.BaseGV, F.BaseOffset, F.HasBaseReg, 1);
+ }
+
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                     const LSRUse &LU, const Formula &F) {
+  if (!F.Scale)
+    return 0;
+  assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                    LU.AccessTy, F) && "Illegal formula in use.");
+
+  switch (LU.Kind) {
+  case LSRUse::Address: {
+    // Check the scaling factor cost with both the min and max offsets.
+    int ScaleCostMinOffset =
+      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
+                               F.BaseOffset + LU.MinOffset,
+                               F.HasBaseReg, F.Scale);
+    int ScaleCostMaxOffset =
+      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
+                               F.BaseOffset + LU.MaxOffset,
+                               F.HasBaseReg, F.Scale);
+
+    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+           "Legal addressing mode has an illegal cost!");
+    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+  }
+  case LSRUse::ICmpZero:
+    // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg.
+    // Therefore, return 0 in case F.Scale == -1.
+    return F.Scale != -1;
+
+  case LSRUse::Basic:
+  case LSRUse::Special:
+    return 0;
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                              LSRUse::KindType Kind, Type *AccessTy,
                              GlobalValue *BaseGV, int64_t BaseOffset,
@@ -1664,7 +1760,7 @@ void LSRInstance::OptimizeShadowIV() {
     IVUsers::const_iterator CandidateUI = UI;
     ++UI;
     Instruction *ShadowUse = CandidateUI->getUser();
-    Type *DestTy = NULL;
+    Type *DestTy = 0;
     bool IsSigned = false;
 
     /* If shadow use is a int->float cast then insert a second IV
@@ -1726,7 +1822,7 @@ void LSRInstance::OptimizeShadowIV() {
       continue;
 
     /* Initialize new IV, double d = 0.0 in above example. */
-    ConstantInt *C = NULL;
+    ConstantInt *C = 0;
     if (Incr->getOperand(0) == PH)
       C = dyn_cast<ConstantInt>(Incr->getOperand(1));
     else if (Incr->getOperand(1) == PH)
@@ -2858,7 +2954,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 
         // x == y  -->  x - y == 0
         const SCEV *N = SE.getSCEV(NV);
-        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) {
+        if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
           // S is normalized, so normalize N before folding it into S
           // to keep the result normalized.
           N = TransformForPostIncUse(Normalize, N, CI, 0,
@@ -2901,6 +2997,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
 /// and loop-computable portions.
 void
 LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+  // Mark uses whose expressions cannot be expanded.
+  if (!isSafeToExpand(S, SE))
+    LU.RigidFormula = true;
+
   Formula F;
   F.InitialMatch(S, L, SE);
   bool Inserted = InsertFormula(LU, LUIdx, F);
@@ -3048,7 +3148,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
       if (Remainder)
         Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
     }
-    return NULL;
+    return 0;
   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
     // Split a non-zero base out of an addrec.
     if (AR->getStart()->isZero())
@@ -3060,7 +3160,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
     // does not pertain to this loop.
     if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
       Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
-      Remainder = NULL;
+      Remainder = 0;
     }
     if (Remainder != AR->getStart()) {
       if (!Remainder)
@@ -3082,7 +3182,7 @@ static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
         CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
       if (Remainder)
         Ops.push_back(SE.getMulExpr(C, Remainder));
-      return NULL;
+      return 0;
     }
   }
   return S;
@@ -3607,7 +3707,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
                    abs64(NewF.BaseOffset)) &&
                   (C->getValue()->getValue() +
                    NewF.BaseOffset).countTrailingZeros() >=
-                   CountTrailingZeros_64(NewF.BaseOffset))
+                   countTrailingZeros<uint64_t>(NewF.BaseOffset))
                 goto skip_formula;
 
           // Ok, looks good.
@@ -3690,7 +3790,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
       // the corresponding bad register from the Regs set.
       Cost CostF;
       Regs.clear();
-      CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
+      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
                         &LoserRegs);
       if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
@@ -3726,7 +3826,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
 
         Cost CostBest;
         Regs.clear();
-        CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
+        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
+                             DT, LU);
         if (CostF < CostBest)
           std::swap(F, Best);
         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
@@ -4079,7 +4180,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     // the current best, prune the search at that point.
     NewCost = CurCost;
     NewRegs = CurRegs;
-    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
+    NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
+                        LU);
     if (NewCost < SolutionCost) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
@@ -4266,6 +4368,8 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
                            SCEVExpander &Rewriter,
                            SmallVectorImpl<WeakVH> &DeadInsts) const {
   const LSRUse &LU = Uses[LF.LUIdx];
+  if (LU.RigidFormula)
+    return LF.OperandValToReplace;
 
   // Determine an input position which will be dominated by the operands and
   // which will dominate the result.
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 80d060b..08ac38d 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -49,12 +49,17 @@ namespace {
   class LoopUnroll : public LoopPass {
   public:
     static char ID; // Pass ID, replacement for typeid
-    LoopUnroll(int T = -1, int C = -1,  int P = -1) : LoopPass(ID) {
+    LoopUnroll(int T = -1, int C = -1, int P = -1, int R = -1) : LoopPass(ID) {
       CurrentThreshold = (T == -1) ? UnrollThreshold : unsigned(T);
       CurrentCount = (C == -1) ? UnrollCount : unsigned(C);
       CurrentAllowPartial = (P == -1) ? UnrollAllowPartial : (bool)P;
+      CurrentRuntime = (R == -1) ? UnrollRuntime : (bool)R;
 
       UserThreshold = (T != -1) || (UnrollThreshold.getNumOccurrences() > 0);
+      UserAllowPartial = (P != -1) ||
+                         (UnrollAllowPartial.getNumOccurrences() > 0);
+      UserRuntime = (R != -1) || (UnrollRuntime.getNumOccurrences() > 0);
+      UserCount = (C != -1) || (UnrollCount.getNumOccurrences() > 0);
 
       initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
     }
@@ -75,7 +80,11 @@ namespace {
     unsigned CurrentCount;
     unsigned CurrentThreshold;
     bool     CurrentAllowPartial;
+    bool     CurrentRuntime;
+    bool     UserCount;            // CurrentCount is user-specified.
     bool     UserThreshold;        // CurrentThreshold is user-specified.
+    bool     UserAllowPartial;     // CurrentAllowPartial is user-specified.
+    bool     UserRuntime;          // CurrentRuntime is user-specified.
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
 
@@ -110,8 +119,9 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
-  return new LoopUnroll(Threshold, Count, AllowPartial);
+Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
+                                 int Runtime) {
+  return new LoopUnroll(Threshold, Count, AllowPartial, Runtime);
 }
 
 /// ApproximateLoopSize - Approximate the size of the loop.
@@ -145,16 +155,24 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
         << "] Loop %" << Header->getName() << "\n");
   (void)Header;
 
+  TargetTransformInfo::UnrollingPreferences UP;
+  UP.Threshold = CurrentThreshold;
+  UP.OptSizeThreshold = OptSizeUnrollThreshold;
+  UP.Count = CurrentCount;
+  UP.Partial = CurrentAllowPartial;
+  UP.Runtime = CurrentRuntime;
+  TTI.getUnrollingPreferences(L, UP);
+
   // Determine the current unrolling threshold.  While this is normally set
   // from UnrollThreshold, it is overridden to a smaller value if the current
   // function is marked as optimize-for-size, and the unroll threshold was
   // not user specified.
-  unsigned Threshold = CurrentThreshold;
+  unsigned Threshold = UserThreshold ? CurrentThreshold : UP.Threshold;
   if (!UserThreshold &&
       Header->getParent()->getAttributes().
         hasAttribute(AttributeSet::FunctionIndex,
                      Attribute::OptimizeForSize))
-    Threshold = OptSizeUnrollThreshold;
+    Threshold = UP.OptSizeThreshold;
 
   // Find trip count and trip multiple if count is not available
   unsigned TripCount = 0;
@@ -167,11 +185,14 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     TripCount = SE->getSmallConstantTripCount(L, LatchBlock);
     TripMultiple = SE->getSmallConstantTripMultiple(L, LatchBlock);
   }
+
+  bool Runtime = UserRuntime ? CurrentRuntime : UP.Runtime;
+
   // Use a default unroll-count if the user doesn't specify a value
   // and the trip count is a run-time value.  The default is different
   // for run-time or compile-time trip count loops.
-  unsigned Count = CurrentCount;
-  if (UnrollRuntime && CurrentCount == 0 && TripCount == 0)
+  unsigned Count = UserCount ? CurrentCount : UP.Count;
+  if (Runtime && Count == 0 && TripCount == 0)
     Count = UnrollRuntimeCount;
 
   if (Count == 0) {
@@ -204,7 +225,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
     if (TripCount != 1 && Size > Threshold) {
       DEBUG(dbgs() << "  Too large to fully unroll with count: " << Count
             << " because size: " << Size << ">" << Threshold << "\n");
-      if (!CurrentAllowPartial && !(UnrollRuntime && TripCount == 0)) {
+      bool AllowPartial = UserAllowPartial ? CurrentAllowPartial : UP.Partial;
+      if (!AllowPartial && !(Runtime && TripCount == 0)) {
         DEBUG(dbgs() << "  will not try to unroll partially because "
               << "-unroll-allow-partial not given\n");
         return false;
@@ -215,7 +237,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
         while (Count != 0 && TripCount%Count != 0)
           Count--;
       }
-      else if (UnrollRuntime) {
+      else if (Runtime) {
         // Reduce unroll count to be a lower power-of-two value
         while (Count != 0 && Size > Threshold) {
           Count >>= 1;
@@ -231,7 +253,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
   }
 
   // Unroll the loop.
-  if (!UnrollLoop(L, Count, TripCount, UnrollRuntime, TripMultiple, LI, &LPM))
+  if (!UnrollLoop(L, Count, TripCount, Runtime, TripMultiple, LI, &LPM))
     return false;
 
   return true;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 0e8199f..c4ebfd5 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -87,8 +87,8 @@ namespace {
     typedef LoopPropsMap::iterator LoopPropsMapIt;
 
     LoopPropsMap LoopsProperties;
-    UnswitchedValsMap* CurLoopInstructions;
-    LoopProperties* CurrentLoopProperties;
+    UnswitchedValsMap *CurLoopInstructions;
+    LoopProperties *CurrentLoopProperties;
 
     // Max size of code we can produce on remained iterations.
     unsigned MaxSize;
@@ -96,30 +96,30 @@ namespace {
     public:
 
       LUAnalysisCache() :
-        CurLoopInstructions(NULL), CurrentLoopProperties(NULL),
+        CurLoopInstructions(0), CurrentLoopProperties(0),
         MaxSize(Threshold)
       {}
 
       // Analyze loop. Check its size, calculate is it possible to unswitch
       // it. Returns true if we can unswitch this loop.
-      bool countLoop(const Loop* L, const TargetTransformInfo &TTI);
+      bool countLoop(const Loop *L, const TargetTransformInfo &TTI);
 
       // Clean all data related to given loop.
-      void forgetLoop(const Loop* L);
+      void forgetLoop(const Loop *L);
 
       // Mark case value as unswitched.
       // Since SI instruction can be partly unswitched, in order to avoid
       // extra unswitching in cloned loops keep track all unswitched values.
-      void setUnswitched(const SwitchInst* SI, const Value* V);
+      void setUnswitched(const SwitchInst *SI, const Value *V);
 
       // Check was this case value unswitched before or not.
-      bool isUnswitched(const SwitchInst* SI, const Value* V);
+      bool isUnswitched(const SwitchInst *SI, const Value *V);
 
       // Clone all loop-unswitch related loop properties.
       // Redistribute unswitching quotas.
       // Note, that new loop data is stored inside the VMap.
-      void cloneData(const Loop* NewLoop, const Loop* OldLoop,
-                     const ValueToValueMapTy& VMap);
+      void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                     const ValueToValueMapTy &VMap);
   };
 
   class LoopUnswitch : public LoopPass {
@@ -151,8 +151,8 @@ namespace {
     static char ID; // Pass ID, replacement for typeid
     explicit LoopUnswitch(bool Os = false) :
       LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
-      currentLoop(NULL), DT(NULL), loopHeader(NULL),
-      loopPreheader(NULL) {
+      currentLoop(0), DT(0), loopHeader(0),
+      loopPreheader(0) {
         initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
       }
 
@@ -196,7 +196,7 @@ namespace {
 
     /// Split all of the edges from inside the loop to their exit blocks.
     /// Update the appropriate Phi nodes as we do so.
-    void SplitExitEdges(Loop *L, const SmallVector<BasicBlock *, 8> &ExitBlocks);
+    void SplitExitEdges(Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks);
 
     bool UnswitchIfProfitable(Value *LoopCond, Constant *Val);
     void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
@@ -212,8 +212,6 @@ namespace {
                                         Instruction *InsertPt);
 
     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
-    void RemoveBlockIfDead(BasicBlock *BB,
-                           std::vector<Instruction*> &Worklist, Loop *l);
     void RemoveLoopFromHierarchy(Loop *L);
     bool IsTrivialUnswitchCondition(Value *Cond, Constant **Val = 0,
                                     BasicBlock **LoopExit = 0);
@@ -225,12 +223,14 @@ namespace {
 // it. Returns true if we can unswitch this loop.
 bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
 
-  std::pair<LoopPropsMapIt, bool> InsertRes =
+  LoopPropsMapIt PropsIt;
+  bool Inserted;
+  llvm::tie(PropsIt, Inserted) =
       LoopsProperties.insert(std::make_pair(L, LoopProperties()));
 
-  LoopProperties& Props = InsertRes.first->second;
+  LoopProperties &Props = PropsIt->second;
 
-  if (InsertRes.second) {
+  if (Inserted) {
     // New loop.
 
     // Limit the number of instructions to avoid causing significant code
@@ -242,8 +242,7 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
     // consideration code simplification opportunities and code that can
     // be shared by the resultant unswitched loops.
     CodeMetrics Metrics;
-    for (Loop::block_iterator I = L->block_begin(),
-           E = L->block_end();
+    for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
          I != E; ++I)
       Metrics.analyzeBasicBlock(*I, TTI);
 
@@ -253,17 +252,16 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
 
     if (Metrics.notDuplicatable) {
       DEBUG(dbgs() << "NOT unswitching loop %"
-            << L->getHeader()->getName() << ", contents cannot be "
-            << "duplicated!\n");
+                   << L->getHeader()->getName() << ", contents cannot be "
+                   << "duplicated!\n");
       return false;
     }
   }
 
   if (!Props.CanBeUnswitchedCount) {
     DEBUG(dbgs() << "NOT unswitching loop %"
-          << L->getHeader()->getName() << ", cost too high: "
-          << L->getBlocks().size() << "\n");
-
+                 << L->getHeader()->getName() << ", cost too high: "
+                 << L->getBlocks().size() << "\n");
     return false;
   }
 
@@ -275,41 +273,41 @@ bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI) {
 }
 
 // Clean all data related to given loop.
-void LUAnalysisCache::forgetLoop(const Loop* L) {
+void LUAnalysisCache::forgetLoop(const Loop *L) {
 
   LoopPropsMapIt LIt = LoopsProperties.find(L);
 
   if (LIt != LoopsProperties.end()) {
-    LoopProperties& Props = LIt->second;
+    LoopProperties &Props = LIt->second;
     MaxSize += Props.CanBeUnswitchedCount * Props.SizeEstimation;
     LoopsProperties.erase(LIt);
   }
 
-  CurrentLoopProperties = NULL;
-  CurLoopInstructions = NULL;
+  CurrentLoopProperties = 0;
+  CurLoopInstructions = 0;
 }
 
 // Mark case value as unswitched.
 // Since SI instruction can be partly unswitched, in order to avoid
 // extra unswitching in cloned loops keep track all unswitched values.
-void LUAnalysisCache::setUnswitched(const SwitchInst* SI, const Value* V) {
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
   (*CurLoopInstructions)[SI].insert(V);
 }
 
 // Check was this case value unswitched before or not.
-bool LUAnalysisCache::isUnswitched(const SwitchInst* SI, const Value* V) {
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
   return (*CurLoopInstructions)[SI].count(V);
 }
 
 // Clone all loop-unswitch related loop properties.
 // Redistribute unswitching quotas.
 // Note, that new loop data is stored inside the VMap.
-void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop,
-                     const ValueToValueMapTy& VMap) {
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+                                const ValueToValueMapTy &VMap) {
 
-  LoopProperties& NewLoopProps = LoopsProperties[NewLoop];
-  LoopProperties& OldLoopProps = *CurrentLoopProperties;
-  UnswitchedValsMap& Insts = OldLoopProps.UnswitchedVals;
+  LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
+  LoopProperties &OldLoopProps = *CurrentLoopProperties;
+  UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
 
   // Reallocate "can-be-unswitched quota"
 
@@ -324,9 +322,9 @@ void LUAnalysisCache::cloneData(const Loop* NewLoop, const Loop* OldLoop,
   // for new loop switches we clone info about values that was
   // already unswitched and has redundant successors.
   for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
-    const SwitchInst* OldInst = I->first;
-    Value* NewI = VMap.lookup(OldInst);
-    const SwitchInst* NewInst = cast_or_null<SwitchInst>(NewI);
+    const SwitchInst *OldInst = I->first;
+    Value *NewI = VMap.lookup(OldInst);
+    const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
     assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
 
     NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
@@ -458,14 +456,14 @@ bool LoopUnswitch::processCurrentLoop() {
         // Find a value to unswitch on:
         // FIXME: this should chose the most expensive case!
         // FIXME: scan for a case with a non-critical edge?
-        Constant *UnswitchVal = NULL;
+        Constant *UnswitchVal = 0;
 
         // Do not process same value again and again.
         // At this point we have some cases already unswitched and
         // some not yet unswitched. Let's find the first not yet unswitched one.
         for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
              i != e; ++i) {
-          Constant* UnswitchValCandidate = i.getCaseValue();
+          Constant *UnswitchValCandidate = i.getCaseValue();
           if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
             UnswitchVal = UnswitchValCandidate;
             break;
@@ -511,7 +509,8 @@ static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
     // Already visited. Without more analysis, this could indicate an infinite
     // loop.
     return false;
-  } else if (!L->contains(BB)) {
+  }
+  if (!L->contains(BB)) {
     // Otherwise, this is a loop exit, this is fine so long as this is the
     // first exit.
     if (ExitBB != 0) return false;
@@ -595,11 +594,11 @@ bool LoopUnswitch::IsTrivialUnswitchCondition(Value *Cond, Constant **Val,
     // on already unswitched cases.
     for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
          i != e; ++i) {
-      BasicBlock* LoopExitCandidate;
+      BasicBlock *LoopExitCandidate;
       if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
                                                i.getCaseSuccessor()))) {
         // Okay, we found a trivial case, remember the value that is trivial.
-        ConstantInt* CaseVal = i.getCaseValue();
+        ConstantInt *CaseVal = i.getCaseValue();
 
         // Check that it was not unswitched before, since already unswitched
         // trivial vals are looks trivial too.
@@ -752,7 +751,7 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond,
 /// SplitExitEdges - Split all of the edges from inside the loop to their exit
 /// blocks.  Update the appropriate Phi nodes as we do so.
 void LoopUnswitch::SplitExitEdges(Loop *L,
-                                const SmallVector<BasicBlock *, 8> &ExitBlocks){
+                               const SmallVectorImpl<BasicBlock *> &ExitBlocks){
 
   for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
     BasicBlock *ExitBlock = ExitBlocks[i];
@@ -854,9 +853,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
 
     // If the successor of the exit block had PHI nodes, add an entry for
     // NewExit.
-    PHINode *PN;
-    for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) {
-      PN = cast<PHINode>(I);
+    for (BasicBlock::iterator I = ExitSucc->begin();
+         PHINode *PN = dyn_cast<PHINode>(I); ++I) {
       Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
       ValueToValueMapTy::iterator It = VMap.find(V);
       if (It != VMap.end()) V = It->second;
@@ -864,8 +862,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
     }
 
     if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
-      PN = PHINode::Create(LPad->getType(), 0, "",
-                           ExitSucc->getFirstInsertionPt());
+      PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+                                    ExitSucc->getFirstInsertionPt());
 
       for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
            I != E; ++I) {
@@ -946,117 +944,6 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
   ++NumSimplify;
 }
 
-/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop
-/// information, and remove any dead successors it has.
-///
-void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
-                                     std::vector<Instruction*> &Worklist,
-                                     Loop *L) {
-  if (pred_begin(BB) != pred_end(BB)) {
-    // This block isn't dead, since an edge to BB was just removed, see if there
-    // are any easy simplifications we can do now.
-    if (BasicBlock *Pred = BB->getSinglePredecessor()) {
-      // If it has one pred, fold phi nodes in BB.
-      while (isa<PHINode>(BB->begin()))
-        ReplaceUsesOfWith(BB->begin(),
-                          cast<PHINode>(BB->begin())->getIncomingValue(0),
-                          Worklist, L, LPM);
-
-      // If this is the header of a loop and the only pred is the latch, we now
-      // have an unreachable loop.
-      if (Loop *L = LI->getLoopFor(BB))
-        if (loopHeader == BB && L->contains(Pred)) {
-          // Remove the branch from the latch to the header block, this makes
-          // the header dead, which will make the latch dead (because the header
-          // dominates the latch).
-          LPM->deleteSimpleAnalysisValue(Pred->getTerminator(), L);
-          Pred->getTerminator()->eraseFromParent();
-          new UnreachableInst(BB->getContext(), Pred);
-
-          // The loop is now broken, remove it from LI.
-          RemoveLoopFromHierarchy(L);
-
-          // Reprocess the header, which now IS dead.
-          RemoveBlockIfDead(BB, Worklist, L);
-          return;
-        }
-
-      // If pred ends in a uncond branch, add uncond branch to worklist so that
-      // the two blocks will get merged.
-      if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
-        if (BI->isUnconditional())
-          Worklist.push_back(BI);
-    }
-    return;
-  }
-
-  DEBUG(dbgs() << "Nuking dead block: " << *BB);
-
-  // Remove the instructions in the basic block from the worklist.
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
-    RemoveFromWorklist(I, Worklist);
-
-    // Anything that uses the instructions in this basic block should have their
-    // uses replaced with undefs.
-    // If I is not void type then replaceAllUsesWith undef.
-    // This allows ValueHandlers and custom metadata to adjust itself.
-    if (!I->getType()->isVoidTy())
-      I->replaceAllUsesWith(UndefValue::get(I->getType()));
-  }
-
-  // If this is the edge to the header block for a loop, remove the loop and
-  // promote all subloops.
-  if (Loop *BBLoop = LI->getLoopFor(BB)) {
-    if (BBLoop->getLoopLatch() == BB) {
-      RemoveLoopFromHierarchy(BBLoop);
-      if (currentLoop == BBLoop) {
-        currentLoop = 0;
-        redoLoop = false;
-      }
-    }
-  }
-
-  // Remove the block from the loop info, which removes it from any loops it
-  // was in.
-  LI->removeBlock(BB);
-
-
-  // Remove phi node entries in successors for this block.
-  TerminatorInst *TI = BB->getTerminator();
-  SmallVector<BasicBlock*, 4> Succs;
-  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
-    Succs.push_back(TI->getSuccessor(i));
-    TI->getSuccessor(i)->removePredecessor(BB);
-  }
-
-  // Unique the successors, remove anything with multiple uses.
-  array_pod_sort(Succs.begin(), Succs.end());
-  Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end());
-
-  // Remove the basic block, including all of the instructions contained in it.
-  LPM->deleteSimpleAnalysisValue(BB, L);
-  BB->eraseFromParent();
-  // Remove successor blocks here that are not dead, so that we know we only
-  // have dead blocks in this list.  Nondead blocks have a way of becoming dead,
-  // then getting removed before we revisit them, which is badness.
-  //
-  for (unsigned i = 0; i != Succs.size(); ++i)
-    if (pred_begin(Succs[i]) != pred_end(Succs[i])) {
-      // One exception is loop headers.  If this block was the preheader for a
-      // loop, then we DO want to visit the loop so the loop gets deleted.
-      // We know that if the successor is a loop header, that this loop had to
-      // be the preheader: the case where this was the latch block was handled
-      // above and headers can only have two predecessors.
-      if (!LI->isLoopHeader(Succs[i])) {
-        Succs.erase(Succs.begin()+i);
-        --i;
-      }
-    }
-
-  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
-    RemoveBlockIfDead(Succs[i], Worklist, L);
-}
-
 /// RemoveLoopFromHierarchy - We have discovered that the specified loop has
 /// become unwrapped, either because the backedge was deleted, or because the
 /// edge into the header was removed.  If the edge into the header from the
@@ -1088,7 +975,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
   std::vector<Instruction*> Worklist;
   LLVMContext &Context = Val->getContext();
 
-
   // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
   // in the loop with the appropriate one directly.
   if (IsEqual || (isa<ConstantInt>(Val) &&
@@ -1108,8 +994,8 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
       Worklist.push_back(U);
     }
 
-    for (std::vector<Instruction*>::iterator UI = Worklist.begin();
-         UI != Worklist.end(); ++UI)
+    for (std::vector<Instruction*>::iterator UI = Worklist.begin(),
+         UE = Worklist.end(); UI != UE; ++UI)
       (*UI)->replaceUsesOfWith(LIC, Replacement);
 
     SimplifyCode(Worklist, L);
@@ -1266,23 +1152,6 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
         continue;
       }
 
-      if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){
-        // Conditional branch.  Turn it into an unconditional branch, then
-        // remove dead blocks.
-        continue;  // FIXME: Enable.
-
-        DEBUG(dbgs() << "Folded branch: " << *BI);
-        BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue());
-        BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue());
-        DeadSucc->removePredecessor(BI->getParent(), true);
-        Worklist.push_back(BranchInst::Create(LiveSucc, BI));
-        LPM->deleteSimpleAnalysisValue(BI, L);
-        BI->eraseFromParent();
-        RemoveFromWorklist(BI, Worklist);
-        ++NumSimplify;
-
-        RemoveBlockIfDead(DeadSucc, Worklist, L);
-      }
       continue;
     }
   }
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index be0f0e8..9912d3d 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -170,14 +170,17 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
   // pessimize the llvm optimizer.
   //
   // Since we don't have perfect knowledge here, make some assumptions: assume
-  // the maximum GPR width is the same size as the pointer size and assume that
-  // this width can be stored.  If so, check to see whether we will end up
-  // actually reducing the number of stores used.
+  // the maximum GPR width is the same size as the largest legal integer
+  // size. If so, check to see whether we will end up actually reducing the
+  // number of stores used.
   unsigned Bytes = unsigned(End-Start);
-  unsigned NumPointerStores = Bytes/TD.getPointerSize();
+  unsigned MaxIntSize = TD.getLargestLegalIntTypeSize();
+  if (MaxIntSize == 0)
+    MaxIntSize = 1;
+  unsigned NumPointerStores = Bytes / MaxIntSize;
 
   // Assume the remaining bytes if any are done a byte at a time.
-  unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize();
+  unsigned NumByteStores = Bytes - NumPointerStores * MaxIntSize;
 
   // If we will reduce the # stores (according to this heuristic), do the
   // transformation.  This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
@@ -465,7 +468,7 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst,
       AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
 
     // Zap all the stores.
-    for (SmallVector<Instruction*, 16>::const_iterator
+    for (SmallVectorImpl<Instruction *>::const_iterator
          SI = Range.TheStores.begin(),
          SE = Range.TheStores.end(); SI != SE; ++SI) {
       MD->removeInstruction(*SI);
@@ -626,8 +629,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
       return false;
 
     Type *StructTy = cast<PointerType>(A->getType())->getElementType();
-    uint64_t destSize = TD->getTypeAllocSize(StructTy);
+    if (!StructTy->isSized()) {
+      // The call may never return and hence the copy-instruction may never
+      // be executed, and therefore it's not safe to say "the destination
+      // has at least <cpyLen> bytes, as implied by the copy-instruction",
+      return false;
+    }
 
+    uint64_t destSize = TD->getTypeAllocSize(StructTy);
     if (destSize < srcSize)
       return false;
   } else {
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
new file mode 100644
index 0000000..15cee44
--- /dev/null
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -0,0 +1,156 @@
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+namespace {
+  class PartiallyInlineLibCalls : public FunctionPass {
+  public:
+    static char ID;
+
+    PartiallyInlineLibCalls() :
+      FunctionPass(ID) {
+      initializePartiallyInlineLibCallsPass(*PassRegistry::getPassRegistry());
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+    virtual bool runOnFunction(Function &F);
+
+  private:
+    /// Optimize calls to sqrt.
+    bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+                      BasicBlock &CurrBB, Function::iterator &BB);
+  };
+
+  char PartiallyInlineLibCalls::ID = 0;
+}
+
+INITIALIZE_PASS(PartiallyInlineLibCalls, "partially-inline-libcalls",
+                "Partially inline calls to library functions", false, false)
+
+void PartiallyInlineLibCalls::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetLibraryInfo>();
+  AU.addRequired<TargetTransformInfo>();
+  FunctionPass::getAnalysisUsage(AU);
+}
+
+bool PartiallyInlineLibCalls::runOnFunction(Function &F) {
+  bool Changed = false;
+  Function::iterator CurrBB;
+  TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
+  const TargetTransformInfo *TTI = &getAnalysis<TargetTransformInfo>();
+  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+    CurrBB = BB++;
+
+    for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+         II != IE; ++II) {
+      CallInst *Call = dyn_cast<CallInst>(&*II);
+      Function *CalledFunc;
+
+      if (!Call || !(CalledFunc = Call->getCalledFunction()))
+        continue;
+
+      // Skip if function either has local linkage or is not a known library
+      // function.
+      LibFunc::Func LibFunc;
+      if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() ||
+          !TLI->getLibFunc(CalledFunc->getName(), LibFunc))
+        continue;
+
+      switch (LibFunc) {
+      case LibFunc::sqrtf:
+      case LibFunc::sqrt:
+        if (TTI->haveFastSqrt(Call->getType()) &&
+            optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+          break;
+        continue;
+      default:
+        continue;
+      }
+
+      Changed = true;
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+bool PartiallyInlineLibCalls::optimizeSQRT(CallInst *Call,
+                                           Function *CalledFunc,
+                                           BasicBlock &CurrBB,
+                                           Function::iterator &BB) {
+  // There is no need to change the IR, since backend will emit sqrt
+  // instruction if the call has already been marked read-only.
+  if (Call->onlyReadsMemory())
+    return false;
+
+  // Do the following transformation:
+  //
+  // (before)
+  // dst = sqrt(src)
+  //
+  // (after)
+  // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+  // if (v0 is a NaN)
+  //   v1 = sqrt(src)         # library call.
+  // dst = phi(v0, v1)
+  //
+
+  // Move all instructions following Call to newly created block JoinBB.
+  // Create phi and replace all uses.
+  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode(), this);
+  IRBuilder<> Builder(JoinBB, JoinBB->begin());
+  PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+  Call->replaceAllUsesWith(Phi);
+
+  // Create basic block LibCallBB and insert a call to library function sqrt.
+  BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+                                             CurrBB.getParent(), JoinBB);
+  Builder.SetInsertPoint(LibCallBB);
+  Instruction *LibCall = Call->clone();
+  Builder.Insert(LibCall);
+  Builder.CreateBr(JoinBB);
+
+  // Add attribute "readnone" so that backend can use a native sqrt instruction
+  // for this call. Insert a FP compare instruction and a conditional branch
+  // at the end of CurrBB.
+  Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+  CurrBB.getTerminator()->eraseFromParent();
+  Builder.SetInsertPoint(&CurrBB);
+  Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+  Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+  // Add phi operands.
+  Phi->addIncoming(Call, &CurrBB);
+  Phi->addIncoming(LibCall, LibCallBB);
+
+  BB = JoinBB;
+  return true;
+}
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+  return new PartiallyInlineLibCalls();
+}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index a3c241d..328a9c5 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -122,7 +122,6 @@ namespace {
   class XorOpnd {
   public:
     XorOpnd(Value *V);
-    const XorOpnd &operator=(const XorOpnd &That);
 
     bool isInvalid() const { return SymbolicPart == 0; }
     bool isOrExpr() const { return isOr; }
@@ -225,15 +224,6 @@ XorOpnd::XorOpnd(Value *V) {
   isOr = true;
 }
 
-const XorOpnd &XorOpnd::operator=(const XorOpnd &That) {
-  OrigVal = That.OrigVal;
-  SymbolicPart = That.SymbolicPart;
-  ConstPart = That.ConstPart;
-  SymbolicRank = That.SymbolicRank;
-  isOr = That.isOr;
-  return *this;
-}
-
 char Reassociate::ID = 0;
 INITIALIZE_PASS(Reassociate, "reassociate",
                 "Reassociate expressions", false, false)
@@ -251,21 +241,24 @@ static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
 }
 
 static bool isUnmovableInstruction(Instruction *I) {
-  if (I->getOpcode() == Instruction::PHI ||
-      I->getOpcode() == Instruction::LandingPad ||
-      I->getOpcode() == Instruction::Alloca ||
-      I->getOpcode() == Instruction::Load ||
-      I->getOpcode() == Instruction::Invoke ||
-      (I->getOpcode() == Instruction::Call &&
-       !isa<DbgInfoIntrinsic>(I)) ||
-      I->getOpcode() == Instruction::UDiv ||
-      I->getOpcode() == Instruction::SDiv ||
-      I->getOpcode() == Instruction::FDiv ||
-      I->getOpcode() == Instruction::URem ||
-      I->getOpcode() == Instruction::SRem ||
-      I->getOpcode() == Instruction::FRem)
+  switch (I->getOpcode()) {
+  case Instruction::PHI:
+  case Instruction::LandingPad:
+  case Instruction::Alloca:
+  case Instruction::Load:
+  case Instruction::Invoke:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
     return true;
-  return false;
+  case Instruction::Call:
+    return !isa<DbgInfoIntrinsic>(I);
+  default:
+    return false;
+  }
 }
 
 void Reassociate::BuildRankMap(Function &F) {
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index e30a274..4364720 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -214,7 +214,7 @@ public:
   /// This returns true if the block was not considered live before.
   bool MarkBlockExecutable(BasicBlock *BB) {
     if (!BBExecutable.insert(BB)) return false;
-    DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << "\n");
+    DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
     BBWorkList.push_back(BB);  // Add the block to the work list!
     return true;
   }
@@ -427,7 +427,7 @@ private:
       // feasible that wasn't before.  Revisit the PHI nodes in the block
       // because they have potentially new operands.
       DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
-            << " -> " << Dest->getName() << "\n");
+            << " -> " << Dest->getName() << '\n');
 
       PHINode *PN;
       for (BasicBlock::iterator I = Dest->begin();
@@ -439,7 +439,7 @@ private:
   // getFeasibleSuccessors - Return a vector of booleans to indicate which
   // successors are reachable from a given terminator instruction.
   //
-  void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs);
+  void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);
 
   // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
   // block to the 'To' basic block is currently feasible.
@@ -501,7 +501,7 @@ private:
 
   void visitInstruction(Instruction &I) {
     // If a new instruction is added to LLVM that we don't handle.
-    dbgs() << "SCCP: Don't know how to handle: " << I;
+    dbgs() << "SCCP: Don't know how to handle: " << I << '\n';
     markAnythingOverdefined(&I);   // Just in case
   }
 };
@@ -513,7 +513,7 @@ private:
 // successors are reachable from a given terminator instruction.
 //
 void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
-                                       SmallVector<bool, 16> &Succs) {
+                                       SmallVectorImpl<bool> &Succs) {
   Succs.resize(TI.getNumSuccessors());
   if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
     if (BI->isUnconditional()) {
@@ -1604,7 +1604,7 @@ bool SCCP::runOnFunction(Function &F) {
 
       Constant *Const = IV.isConstant()
         ? IV.getConstant() : UndefValue::get(Inst->getType());
-      DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst);
+      DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst << '\n');
 
       // Replaces all of the uses of a variable with uses of the constant.
       Inst->replaceAllUsesWith(Const);
@@ -1812,7 +1812,7 @@ bool IPSCCP::runOnModule(Module &M) {
 
         Constant *Const = IV.isConstant()
           ? IV.getConstant() : UndefValue::get(Inst->getType());
-        DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst);
+        DEBUG(dbgs() << "  Constant: " << *Const << " = " << *Inst << '\n');
 
         // Replaces all of the uses of a variable with uses of the
         // constant.
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index d073e78..9f3fc83 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -47,6 +47,7 @@
 #include "llvm/InstVisitor.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@@ -58,9 +59,9 @@ using namespace llvm;
 
 STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
 STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
-STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions");
-STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found");
-STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
+STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
 STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
 STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
 STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
@@ -110,17 +111,39 @@ typedef llvm::IRBuilder<false, ConstantFolder,
 }
 
 namespace {
-/// \brief A common base class for representing a half-open byte range.
-struct ByteRange {
+/// \brief A used slice of an alloca.
+///
+/// This structure represents a slice of an alloca used by some instruction. It
+/// stores both the begin and end offsets of this use, a pointer to the use
+/// itself, and a flag indicating whether we can classify the use as splittable
+/// or not when forming partitions of the alloca.
+class Slice {
   /// \brief The beginning offset of the range.
   uint64_t BeginOffset;
 
   /// \brief The ending offset, not included in the range.
   uint64_t EndOffset;
 
-  ByteRange() : BeginOffset(), EndOffset() {}
-  ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
-      : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
+  /// \brief Storage for both the use of this slice and whether it can be
+  /// split.
+  PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
+
+public:
+  Slice() : BeginOffset(), EndOffset() {}
+  Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
+      : BeginOffset(BeginOffset), EndOffset(EndOffset),
+        UseAndIsSplittable(U, IsSplittable) {}
+
+  uint64_t beginOffset() const { return BeginOffset; }
+  uint64_t endOffset() const { return EndOffset; }
+
+  bool isSplittable() const { return UseAndIsSplittable.getInt(); }
+  void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
+
+  Use *getUse() const { return UseAndIsSplittable.getPointer(); }
+
+  bool isDead() const { return getUse() == 0; }
+  void kill() { UseAndIsSplittable.setPointer(0); }
 
   /// \brief Support for ordering ranges.
   ///
@@ -128,173 +151,67 @@ struct ByteRange {
   /// always increasing, and within equal start offsets, the end offsets are
   /// decreasing. Thus the spanning range comes first in a cluster with the
   /// same start position.
-  bool operator<(const ByteRange &RHS) const {
-    if (BeginOffset < RHS.BeginOffset) return true;
-    if (BeginOffset > RHS.BeginOffset) return false;
-    if (EndOffset > RHS.EndOffset) return true;
+  bool operator<(const Slice &RHS) const {
+    if (beginOffset() < RHS.beginOffset()) return true;
+    if (beginOffset() > RHS.beginOffset()) return false;
+    if (isSplittable() != RHS.isSplittable()) return !isSplittable();
+    if (endOffset() > RHS.endOffset()) return true;
     return false;
   }
 
   /// \brief Support comparison with a single offset to allow binary searches.
-  friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
-    return LHS.BeginOffset < RHSOffset;
+  friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
+                                              uint64_t RHSOffset) {
+    return LHS.beginOffset() < RHSOffset;
   }
-
   friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
-                                              const ByteRange &RHS) {
-    return LHSOffset < RHS.BeginOffset;
+                                              const Slice &RHS) {
+    return LHSOffset < RHS.beginOffset();
   }
 
-  bool operator==(const ByteRange &RHS) const {
-    return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
+  bool operator==(const Slice &RHS) const {
+    return isSplittable() == RHS.isSplittable() &&
+           beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
   }
-  bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
+  bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
 };
-
-/// \brief A partition of an alloca.
-///
-/// This structure represents a contiguous partition of the alloca. These are
-/// formed by examining the uses of the alloca. During formation, they may
-/// overlap but once an AllocaPartitioning is built, the Partitions within it
-/// are all disjoint.
-struct Partition : public ByteRange {
-  /// \brief Whether this partition is splittable into smaller partitions.
-  ///
-  /// We flag partitions as splittable when they are formed entirely due to
-  /// accesses by trivially splittable operations such as memset and memcpy.
-  bool IsSplittable;
-
-  /// \brief Test whether a partition has been marked as dead.
-  bool isDead() const {
-    if (BeginOffset == UINT64_MAX) {
-      assert(EndOffset == UINT64_MAX);
-      return true;
-    }
-    return false;
-  }
-
-  /// \brief Kill a partition.
-  /// This is accomplished by setting both its beginning and end offset to
-  /// the maximum possible value.
-  void kill() {
-    assert(!isDead() && "He's Dead, Jim!");
-    BeginOffset = EndOffset = UINT64_MAX;
-  }
-
-  Partition() : ByteRange(), IsSplittable() {}
-  Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
-      : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
-};
-
-/// \brief A particular use of a partition of the alloca.
-///
-/// This structure is used to associate uses of a partition with it. They
-/// mark the range of bytes which are referenced by a particular instruction,
-/// and includes a handle to the user itself and the pointer value in use.
-/// The bounds of these uses are determined by intersecting the bounds of the
-/// memory use itself with a particular partition. As a consequence there is
-/// intentionally overlap between various uses of the same partition.
-class PartitionUse : public ByteRange {
-  /// \brief Combined storage for both the Use* and split state.
-  PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit;
-
-public:
-  PartitionUse() : ByteRange(), UsePtrAndIsSplit() {}
-  PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U,
-               bool IsSplit)
-      : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {}
-
-  /// \brief The use in question. Provides access to both user and used value.
-  ///
-  /// Note that this may be null if the partition use is *dead*, that is, it
-  /// should be ignored.
-  Use *getUse() const { return UsePtrAndIsSplit.getPointer(); }
-
-  /// \brief Set the use for this partition use range.
-  void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); }
-
-  /// \brief Whether this use is split across multiple partitions.
-  bool isSplit() const { return UsePtrAndIsSplit.getInt(); }
-};
-}
+} // end anonymous namespace
 
 namespace llvm {
-template <> struct isPodLike<Partition> : llvm::true_type {};
-template <> struct isPodLike<PartitionUse> : llvm::true_type {};
+template <typename T> struct isPodLike;
+template <> struct isPodLike<Slice> {
+   static const bool value = true;
+};
 }
 
 namespace {
-/// \brief Alloca partitioning representation.
+/// \brief Representation of the alloca slices.
 ///
-/// This class represents a partitioning of an alloca into slices, and
-/// information about the nature of uses of each slice of the alloca. The goal
-/// is that this information is sufficient to decide if and how to split the
-/// alloca apart and replace slices with scalars. It is also intended that this
-/// structure can capture the relevant information needed both to decide about
-/// and to enact these transformations.
-class AllocaPartitioning {
+/// This class represents the slices of an alloca which are formed by its
+/// various uses. If a pointer escapes, we can't fully build a representation
+/// for the slices used and we reflect that in this structure. The uses are
+/// stored, sorted by increasing beginning offset and with unsplittable slices
+/// starting at a particular offset before splittable slices.
+class AllocaSlices {
 public:
-  /// \brief Construct a partitioning of a particular alloca.
-  ///
-  /// Construction does most of the work for partitioning the alloca. This
-  /// performs the necessary walks of users and builds a partitioning from it.
-  AllocaPartitioning(const DataLayout &TD, AllocaInst &AI);
+  /// \brief Construct the slices of a particular alloca.
+  AllocaSlices(const DataLayout &DL, AllocaInst &AI);
 
   /// \brief Test whether a pointer to the allocation escapes our analysis.
   ///
-  /// If this is true, the partitioning is never fully built and should be
+  /// If this is true, the slices are never fully built and should be
   /// ignored.
   bool isEscaped() const { return PointerEscapingInstr; }
 
-  /// \brief Support for iterating over the partitions.
+  /// \brief Support for iterating over the slices.
   /// @{
-  typedef SmallVectorImpl<Partition>::iterator iterator;
-  iterator begin() { return Partitions.begin(); }
-  iterator end() { return Partitions.end(); }
+  typedef SmallVectorImpl<Slice>::iterator iterator;
+  iterator begin() { return Slices.begin(); }
+  iterator end() { return Slices.end(); }
 
-  typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
-  const_iterator begin() const { return Partitions.begin(); }
-  const_iterator end() const { return Partitions.end(); }
-  /// @}
-
-  /// \brief Support for iterating over and manipulating a particular
-  /// partition's uses.
-  ///
-  /// The iteration support provided for uses is more limited, but also
-  /// includes some manipulation routines to support rewriting the uses of
-  /// partitions during SROA.
-  /// @{
-  typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
-  use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
-  use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
-  use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
-  use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
-
-  typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
-  const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
-  const_use_iterator use_begin(const_iterator I) const {
-    return Uses[I - begin()].begin();
-  }
-  const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
-  const_use_iterator use_end(const_iterator I) const {
-    return Uses[I - begin()].end();
-  }
-
-  unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); }
-  unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); }
-  const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const {
-    return Uses[PIdx][UIdx];
-  }
-  const PartitionUse &getUse(const_iterator I, unsigned UIdx) const {
-    return Uses[I - begin()][UIdx];
-  }
-
-  void use_push_back(unsigned Idx, const PartitionUse &PU) {
-    Uses[Idx].push_back(PU);
-  }
-  void use_push_back(const_iterator I, const PartitionUse &PU) {
-    Uses[I - begin()].push_back(PU);
-  }
+  typedef SmallVectorImpl<Slice>::const_iterator const_iterator;
+  const_iterator begin() const { return Slices.begin(); }
+  const_iterator end() const { return Slices.end(); }
   /// @}
 
   /// \brief Allow iterating the dead users for this alloca.
@@ -320,66 +237,12 @@ public:
   dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
   /// @}
 
-  /// \brief MemTransferInst auxiliary data.
-  /// This struct provides some auxiliary data about memory transfer
-  /// intrinsics such as memcpy and memmove. These intrinsics can use two
-  /// different ranges within the same alloca, and provide other challenges to
-  /// correctly represent. We stash extra data to help us untangle this
-  /// after the partitioning is complete.
-  struct MemTransferOffsets {
-    /// The destination begin and end offsets when the destination is within
-    /// this alloca. If the end offset is zero the destination is not within
-    /// this alloca.
-    uint64_t DestBegin, DestEnd;
-
-    /// The source begin and end offsets when the source is within this alloca.
-    /// If the end offset is zero, the source is not within this alloca.
-    uint64_t SourceBegin, SourceEnd;
-
-    /// Flag for whether an alloca is splittable.
-    bool IsSplittable;
-  };
-  MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
-    return MemTransferInstData.lookup(&II);
-  }
-
-  /// \brief Map from a PHI or select operand back to a partition.
-  ///
-  /// When manipulating PHI nodes or selects, they can use more than one
-  /// partition of an alloca. We store a special mapping to allow finding the
-  /// partition referenced by each of these operands, if any.
-  iterator findPartitionForPHIOrSelectOperand(Use *U) {
-    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
-      = PHIOrSelectOpMap.find(U);
-    if (MapIt == PHIOrSelectOpMap.end())
-      return end();
-
-    return begin() + MapIt->second.first;
-  }
-
-  /// \brief Map from a PHI or select operand back to the specific use of
-  /// a partition.
-  ///
-  /// Similar to mapping these operands back to the partitions, this maps
-  /// directly to the use structure of that partition.
-  use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) {
-    SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
-      = PHIOrSelectOpMap.find(U);
-    assert(MapIt != PHIOrSelectOpMap.end());
-    return Uses[MapIt->second.first].begin() + MapIt->second.second;
-  }
-
-  /// \brief Compute a common type among the uses of a particular partition.
-  ///
-  /// This routines walks all of the uses of a particular partition and tries
-  /// to find a common type between them. Untyped operations such as memset and
-  /// memcpy are ignored.
-  Type *getCommonType(iterator I) const;
-
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   void print(raw_ostream &OS, const_iterator I, StringRef Indent = "  ") const;
-  void printUsers(raw_ostream &OS, const_iterator I,
+  void printSlice(raw_ostream &OS, const_iterator I,
                   StringRef Indent = "  ") const;
+  void printUse(raw_ostream &OS, const_iterator I,
+                StringRef Indent = "  ") const;
   void print(raw_ostream &OS) const;
   void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
   void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
@@ -387,47 +250,36 @@ public:
 
 private:
   template <typename DerivedT, typename RetT = void> class BuilderBase;
-  class PartitionBuilder;
-  friend class AllocaPartitioning::PartitionBuilder;
-  class UseBuilder;
-  friend class AllocaPartitioning::UseBuilder;
+  class SliceBuilder;
+  friend class AllocaSlices::SliceBuilder;
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   /// \brief Handle to alloca instruction to simplify method interfaces.
   AllocaInst &AI;
 #endif
 
-  /// \brief The instruction responsible for this alloca having no partitioning.
+  /// \brief The instruction responsible for this alloca not having a known set
+  /// of slices.
   ///
   /// When an instruction (potentially) escapes the pointer to the alloca, we
-  /// store a pointer to that here and abort trying to partition the alloca.
-  /// This will be null if the alloca is partitioned successfully.
+  /// store a pointer to that here and abort trying to form slices of the
+  /// alloca. This will be null if the alloca slices are analyzed successfully.
   Instruction *PointerEscapingInstr;
 
-  /// \brief The partitions of the alloca.
+  /// \brief The slices of the alloca.
   ///
-  /// We store a vector of the partitions over the alloca here. This vector is
-  /// sorted by increasing begin offset, and then by decreasing end offset. See
-  /// the Partition inner class for more details. Initially (during
-  /// construction) there are overlaps, but we form a disjoint sequence of
-  /// partitions while finishing construction and a fully constructed object is
-  /// expected to always have this as a disjoint space.
-  SmallVector<Partition, 8> Partitions;
-
-  /// \brief The uses of the partitions.
-  ///
-  /// This is essentially a mapping from each partition to a list of uses of
-  /// that partition. The mapping is done with a Uses vector that has the exact
-  /// same number of entries as the partition vector. Each entry is itself
-  /// a vector of the uses.
-  SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;
+  /// We store a vector of the slices formed by uses of the alloca here. This
+  /// vector is sorted by increasing begin offset, and then the unsplittable
+  /// slices before the splittable ones. See the Slice inner class for more
+  /// details.
+  SmallVector<Slice, 8> Slices;
 
   /// \brief Instructions which will become dead if we rewrite the alloca.
   ///
-  /// Note that these are not separated by partition. This is because we expect
-  /// a partitioned alloca to be completely rewritten or not rewritten at all.
-  /// If rewritten, all these instructions can simply be removed and replaced
-  /// with undef as they come from outside of the allocated space.
+  /// Note that these are not separated by slice. This is because we expect an
+  /// alloca to be completely rewritten or not rewritten at all. If rewritten,
+  /// all these instructions can simply be removed and replaced with undef as
+  /// they come from outside of the allocated space.
   SmallVector<Instruction *, 8> DeadUsers;
 
   /// \brief Operands which will become dead if we rewrite the alloca.
@@ -439,26 +291,6 @@ private:
   /// want to swap this particular input for undef to simplify the use lists of
   /// the alloca.
   SmallVector<Use *, 8> DeadOperands;
-
-  /// \brief The underlying storage for auxiliary memcpy and memset info.
-  SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;
-
-  /// \brief A side datastructure used when building up the partitions and uses.
-  ///
-  /// This mapping is only really used during the initial building of the
-  /// partitioning so that we can retain information about PHI and select nodes
-  /// processed.
-  SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;
-
-  /// \brief Auxiliary information for particular PHI or select operands.
-  SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;
-
-  /// \brief A utility routine called from the constructor.
-  ///
-  /// This does what it says on the tin. It is the key of the alloca partition
-  /// splitting and merging. After it is called we have the desired disjoint
-  /// collection of partitions.
-  void splitAndMergePartitions();
 };
 }
 
@@ -474,29 +306,35 @@ static Value *foldSelectInst(SelectInst &SI) {
   return 0;
 }
 
-/// \brief Builder for the alloca partitioning.
+/// \brief Builder for the alloca slices.
 ///
-/// This class builds an alloca partitioning by recursively visiting the uses
-/// of an alloca and splitting the partitions for each load and store at each
-/// offset.
-class AllocaPartitioning::PartitionBuilder
-    : public PtrUseVisitor<PartitionBuilder> {
-  friend class PtrUseVisitor<PartitionBuilder>;
-  friend class InstVisitor<PartitionBuilder>;
-  typedef PtrUseVisitor<PartitionBuilder> Base;
+/// This class builds a set of alloca slices by recursively visiting the uses
+/// of an alloca and making a slice for each load and store at each offset.
+class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
+  friend class PtrUseVisitor<SliceBuilder>;
+  friend class InstVisitor<SliceBuilder>;
+  typedef PtrUseVisitor<SliceBuilder> Base;
 
   const uint64_t AllocSize;
-  AllocaPartitioning &P;
+  AllocaSlices &S;
+
+  SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
+  SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
 
-  SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
+  /// \brief Set to de-duplicate dead instructions found in the use walk.
+  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
 
 public:
-  PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P)
-      : PtrUseVisitor<PartitionBuilder>(DL),
-        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())),
-        P(P) {}
+  SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &S)
+      : PtrUseVisitor<SliceBuilder>(DL),
+        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), S(S) {}
 
 private:
+  void markAsDead(Instruction &I) {
+    if (VisitedDeadInsts.insert(&I))
+      S.DeadUsers.push_back(&I);
+  }
+
   void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                  bool IsSplittable = false) {
     // Completely skip uses which have a zero size or start either before or
@@ -505,9 +343,9 @@ private:
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
                    << " which has zero size or starts outside of the "
                    << AllocSize << " byte alloca:\n"
-                   << "    alloca: " << P.AI << "\n"
+                   << "    alloca: " << S.AI << "\n"
                    << "       use: " << I << "\n");
-      return;
+      return markAsDead(I);
     }
 
     uint64_t BeginOffset = Offset.getZExtValue();
@@ -523,13 +361,26 @@ private:
     if (Size > AllocSize - BeginOffset) {
       DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
                    << " to remain within the " << AllocSize << " byte alloca:\n"
-                   << "    alloca: " << P.AI << "\n"
+                   << "    alloca: " << S.AI << "\n"
                    << "       use: " << I << "\n");
       EndOffset = AllocSize;
     }
 
-    Partition New(BeginOffset, EndOffset, IsSplittable);
-    P.Partitions.push_back(New);
+    S.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+  }
+
+  void visitBitCastInst(BitCastInst &BC) {
+    if (BC.use_empty())
+      return markAsDead(BC);
+
+    return Base::visitBitCastInst(BC);
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+    if (GEPI.use_empty())
+      return markAsDead(GEPI);
+
+    return Base::visitGetElementPtrInst(GEPI);
   }
 
   void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
@@ -580,9 +431,9 @@ private:
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset
                    << " which extends past the end of the " << AllocSize
                    << " byte alloca:\n"
-                   << "    alloca: " << P.AI << "\n"
+                   << "    alloca: " << S.AI << "\n"
                    << "       use: " << SI << "\n");
-      return;
+      return markAsDead(SI);
     }
 
     assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
@@ -597,7 +448,7 @@ private:
     if ((Length && Length->getValue() == 0) ||
         (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
       // Zero-length mem transfer intrinsics can be ignored entirely.
-      return;
+      return markAsDead(II);
 
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
@@ -613,7 +464,7 @@ private:
     if ((Length && Length->getValue() == 0) ||
         (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
       // Zero-length mem transfer intrinsics can be ignored entirely.
-      return;
+      return markAsDead(II);
 
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
@@ -622,63 +473,44 @@ private:
     uint64_t Size = Length ? Length->getLimitedValue()
                            : AllocSize - RawOffset;
 
-    MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
-
-    // Only intrinsics with a constant length can be split.
-    Offsets.IsSplittable = Length;
+    // Check for the special case where the same exact value is used for both
+    // source and dest.
+    if (*U == II.getRawDest() && *U == II.getRawSource()) {
+      // For non-volatile transfers this is a no-op.
+      if (!II.isVolatile())
+        return markAsDead(II);
 
-    if (*U == II.getRawDest()) {
-      Offsets.DestBegin = RawOffset;
-      Offsets.DestEnd = RawOffset + Size;
-    }
-    if (*U == II.getRawSource()) {
-      Offsets.SourceBegin = RawOffset;
-      Offsets.SourceEnd = RawOffset + Size;
+      return insertUse(II, Offset, Size, /*IsSplittable=*/false);
     }
 
-    // If we have set up end offsets for both the source and the destination,
-    // we have found both sides of this transfer pointing at the same alloca.
-    bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd;
-    if (SeenBothEnds && II.getRawDest() != II.getRawSource()) {
-      unsigned PrevIdx = MemTransferPartitionMap[&II];
+    // If we have seen both source and destination for a mem transfer, then
+    // they both point to the same alloca.
+    bool Inserted;
+    SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
+    llvm::tie(MTPI, Inserted) =
+        MemTransferSliceMap.insert(std::make_pair(&II, S.Slices.size()));
+    unsigned PrevIdx = MTPI->second;
+    if (!Inserted) {
+      Slice &PrevP = S.Slices[PrevIdx];
 
       // Check if the begin offsets match and this is a non-volatile transfer.
       // In that case, we can completely elide the transfer.
-      if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
-        P.Partitions[PrevIdx].kill();
-        return;
+      if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
+        PrevP.kill();
+        return markAsDead(II);
       }
 
       // Otherwise we have an offset transfer within the same alloca. We can't
       // split those.
-      P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false;
-    } else if (SeenBothEnds) {
-      // Handle the case where this exact use provides both ends of the
-      // operation.
-      assert(II.getRawDest() == II.getRawSource());
-
-      // For non-volatile transfers this is a no-op.
-      if (!II.isVolatile())
-        return;
-
-      // Otherwise just suppress splitting.
-      Offsets.IsSplittable = false;
+      PrevP.makeUnsplittable();
     }
 
-
     // Insert the use now that we've fixed up the splittable nature.
-    insertUse(II, Offset, Size, Offsets.IsSplittable);
-
-    // Setup the mapping from intrinsic to partition of we've not seen both
-    // ends of this transfer.
-    if (!SeenBothEnds) {
-      unsigned NewIdx = P.Partitions.size() - 1;
-      bool Inserted
-        = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second;
-      assert(Inserted &&
-             "Already have intrinsic in map but haven't seen both ends");
-      (void)Inserted;
-    }
+    insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
+
+    // Check that we ended up with a valid index in the map.
+    assert(S.Slices[PrevIdx].getUse()->getUser() == &II &&
+           "Map index doesn't point back to a slice with this user.");
   }
 
   // Disable SRoA for any intrinsics except for lifetime invariants.
@@ -702,7 +534,7 @@ private:
 
   Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
     // We consider any PHI or select that results in a direct load or store of
-    // the same offset to be a viable use for partitioning purposes. These uses
+    // the same offset to be a viable use for slicing purposes. These uses
     // are considered unsplittable and the size is the maximum loaded or stored
     // size.
     SmallPtrSet<Instruction *, 4> Visited;
@@ -747,234 +579,36 @@ private:
 
   void visitPHINode(PHINode &PN) {
     if (PN.use_empty())
-      return;
+      return markAsDead(PN);
     if (!IsOffsetKnown)
       return PI.setAborted(&PN);
 
     // See if we already have computed info on this node.
-    std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
-    if (PHIInfo.first) {
-      PHIInfo.second = true;
-      insertUse(PN, Offset, PHIInfo.first);
-      return;
+    uint64_t &PHISize = PHIOrSelectSizes[&PN];
+    if (!PHISize) {
+      // This is a new PHI node, check for an unsafe use of the PHI node.
+      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHISize))
+        return PI.setAborted(UnsafeI);
     }
 
-    // Check for an unsafe use of the PHI node.
-    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
-      return PI.setAborted(UnsafeI);
-
-    insertUse(PN, Offset, PHIInfo.first);
-  }
-
-  void visitSelectInst(SelectInst &SI) {
-    if (SI.use_empty())
-      return;
-    if (Value *Result = foldSelectInst(SI)) {
-      if (Result == *U)
-        // If the result of the constant fold will be the pointer, recurse
-        // through the select as if we had RAUW'ed it.
-        enqueueUsers(SI);
-
-      return;
-    }
-    if (!IsOffsetKnown)
-      return PI.setAborted(&SI);
-
-    // See if we already have computed info on this node.
-    std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
-    if (SelectInfo.first) {
-      SelectInfo.second = true;
-      insertUse(SI, Offset, SelectInfo.first);
-      return;
-    }
-
-    // Check for an unsafe use of the PHI node.
-    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
-      return PI.setAborted(UnsafeI);
-
-    insertUse(SI, Offset, SelectInfo.first);
-  }
-
-  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
-  void visitInstruction(Instruction &I) {
-    PI.setAborted(&I);
-  }
-};
-
-/// \brief Use adder for the alloca partitioning.
-///
-/// This class adds the uses of an alloca to all of the partitions which they
-/// use. For splittable partitions, this can end up doing essentially a linear
-/// walk of the partitions, but the number of steps remains bounded by the
-/// total result instruction size:
-/// - The number of partitions is a result of the number unsplittable
-///   instructions using the alloca.
-/// - The number of users of each partition is at worst the total number of
-///   splittable instructions using the alloca.
-/// Thus we will produce N * M instructions in the end, where N are the number
-/// of unsplittable uses and M are the number of splittable. This visitor does
-/// the exact same number of updates to the partitioning.
-///
-/// In the more common case, this visitor will leverage the fact that the
-/// partition space is pre-sorted, and do a logarithmic search for the
-/// partition needed, making the total visit a classical ((N + M) * log(N))
-/// complexity operation.
-class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> {
-  friend class PtrUseVisitor<UseBuilder>;
-  friend class InstVisitor<UseBuilder>;
-  typedef PtrUseVisitor<UseBuilder> Base;
-
-  const uint64_t AllocSize;
-  AllocaPartitioning &P;
-
-  /// \brief Set to de-duplicate dead instructions found in the use walk.
-  SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
-
-public:
-  UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
-      : PtrUseVisitor<UseBuilder>(TD),
-        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
-        P(P) {}
-
-private:
-  void markAsDead(Instruction &I) {
-    if (VisitedDeadInsts.insert(&I))
-      P.DeadUsers.push_back(&I);
-  }
-
-  void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) {
-    // If the use has a zero size or extends outside of the allocation, record
-    // it as a dead use for elimination later.
-    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize))
-      return markAsDead(User);
-
-    uint64_t BeginOffset = Offset.getZExtValue();
-    uint64_t EndOffset = BeginOffset + Size;
-
-    // Clamp the end offset to the end of the allocation. Note that this is
-    // formulated to handle even the case where "BeginOffset + Size" overflows.
-    assert(AllocSize >= BeginOffset); // Established above.
-    if (Size > AllocSize - BeginOffset)
-      EndOffset = AllocSize;
-
-    // NB: This only works if we have zero overlapping partitions.
-    iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset);
-    if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset)
-      I = llvm::prior(I);
-    iterator E = P.end();
-    bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset;
-    for (; I != E && I->BeginOffset < EndOffset; ++I) {
-      PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
-                         std::min(I->EndOffset, EndOffset), U, IsSplit);
-      P.use_push_back(I, NewPU);
-      if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
-        P.PHIOrSelectOpMap[U]
-          = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
-    }
-  }
-
-  void visitBitCastInst(BitCastInst &BC) {
-    if (BC.use_empty())
-      return markAsDead(BC);
-
-    return Base::visitBitCastInst(BC);
-  }
-
-  void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
-    if (GEPI.use_empty())
-      return markAsDead(GEPI);
-
-    return Base::visitGetElementPtrInst(GEPI);
-  }
-
-  void visitLoadInst(LoadInst &LI) {
-    assert(IsOffsetKnown);
-    uint64_t Size = DL.getTypeStoreSize(LI.getType());
-    insertUse(LI, Offset, Size);
-  }
-
-  void visitStoreInst(StoreInst &SI) {
-    assert(IsOffsetKnown);
-    uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType());
-
-    // If this memory access can be shown to *statically* extend outside the
-    // bounds of of the allocation, it's behavior is undefined, so simply
-    // ignore it. Note that this is more strict than the generic clamping
-    // behavior of insertUse.
-    if (Offset.isNegative() || Size > AllocSize ||
-        Offset.ugt(AllocSize - Size))
-      return markAsDead(SI);
-
-    insertUse(SI, Offset, Size);
-  }
-
-  void visitMemSetInst(MemSetInst &II) {
-    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    if ((Length && Length->getValue() == 0) ||
-        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
-      return markAsDead(II);
-
-    assert(IsOffsetKnown);
-    insertUse(II, Offset, Length ? Length->getLimitedValue()
-                                 : AllocSize - Offset.getLimitedValue());
-  }
-
-  void visitMemTransferInst(MemTransferInst &II) {
-    ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    if ((Length && Length->getValue() == 0) ||
-        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
-      return markAsDead(II);
-
-    assert(IsOffsetKnown);
-    uint64_t Size = Length ? Length->getLimitedValue()
-                           : AllocSize - Offset.getLimitedValue();
-
-    const MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
-    if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
-        Offsets.DestBegin == Offsets.SourceBegin)
-      return markAsDead(II); // Skip identity transfers without side-effects.
-
-    insertUse(II, Offset, Size);
-  }
-
-  void visitIntrinsicInst(IntrinsicInst &II) {
-    assert(IsOffsetKnown);
-    assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
-           II.getIntrinsicID() == Intrinsic::lifetime_end);
-
-    ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
-    insertUse(II, Offset, std::min(Length->getLimitedValue(),
-                                   AllocSize - Offset.getLimitedValue()));
-  }
-
-  void insertPHIOrSelect(Instruction &User, const APInt &Offset) {
-    uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
-
     // For PHI and select operands outside the alloca, we can't nuke the entire
     // phi or select -- the other side might still be relevant, so we special
     // case them here and use a separate structure to track the operands
     // themselves which should be replaced with undef.
-    if ((Offset.isNegative() && Offset.uge(Size)) ||
+    // FIXME: This should instead be escaped in the event we're instrumenting
+    // for address sanitization.
+    if ((Offset.isNegative() && (-Offset).uge(PHISize)) ||
         (!Offset.isNegative() && Offset.uge(AllocSize))) {
-      P.DeadOperands.push_back(U);
+      S.DeadOperands.push_back(U);
       return;
     }
 
-    insertUse(User, Offset, Size);
-  }
-
-  void visitPHINode(PHINode &PN) {
-    if (PN.use_empty())
-      return markAsDead(PN);
-
-    assert(IsOffsetKnown);
-    insertPHIOrSelect(PN, Offset);
+    insertUse(PN, Offset, PHISize);
   }
 
   void visitSelectInst(SelectInst &SI) {
     if (SI.use_empty())
       return markAsDead(SI);
-
     if (Value *Result = foldSelectInst(SI)) {
       if (Result == *U)
         // If the result of the constant fold will be the pointer, recurse
@@ -983,276 +617,106 @@ private:
       else
         // Otherwise the operand to the select is dead, and we can replace it
         // with undef.
-        P.DeadOperands.push_back(U);
+        S.DeadOperands.push_back(U);
 
       return;
     }
+    if (!IsOffsetKnown)
+      return PI.setAborted(&SI);
 
-    assert(IsOffsetKnown);
-    insertPHIOrSelect(SI, Offset);
-  }
-
-  /// \brief Unreachable, we've already visited the alloca once.
-  void visitInstruction(Instruction &I) {
-    llvm_unreachable("Unhandled instruction in use builder.");
-  }
-};
-
-void AllocaPartitioning::splitAndMergePartitions() {
-  size_t NumDeadPartitions = 0;
-
-  // Track the range of splittable partitions that we pass when accumulating
-  // overlapping unsplittable partitions.
-  uint64_t SplitEndOffset = 0ull;
-
-  Partition New(0ull, 0ull, false);
-
-  for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
-    ++j;
-
-    if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
-      assert(New.BeginOffset == New.EndOffset);
-      New = Partitions[i];
-    } else {
-      assert(New.IsSplittable);
-      New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
-    }
-    assert(New.BeginOffset != New.EndOffset);
-
-    // Scan the overlapping partitions.
-    while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
-      // If the new partition we are forming is splittable, stop at the first
-      // unsplittable partition.
-      if (New.IsSplittable && !Partitions[j].IsSplittable)
-        break;
-
-      // Grow the new partition to include any equally splittable range. 'j' is
-      // always equally splittable when New is splittable, but when New is not
-      // splittable, we may subsume some (or part of some) splitable partition
-      // without growing the new one.
-      if (New.IsSplittable == Partitions[j].IsSplittable) {
-        New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
-      } else {
-        assert(!New.IsSplittable);
-        assert(Partitions[j].IsSplittable);
-        SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
-      }
-
-      Partitions[j].kill();
-      ++NumDeadPartitions;
-      ++j;
-    }
-
-    // If the new partition is splittable, chop off the end as soon as the
-    // unsplittable subsequent partition starts and ensure we eventually cover
-    // the splittable area.
-    if (j != e && New.IsSplittable) {
-      SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
-      New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
+    // See if we already have computed info on this node.
+    uint64_t &SelectSize = PHIOrSelectSizes[&SI];
+    if (!SelectSize) {
+      // This is a new Select, check for an unsafe use of it.
+      if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectSize))
+        return PI.setAborted(UnsafeI);
     }
 
-    // Add the new partition if it differs from the original one and is
-    // non-empty. We can end up with an empty partition here if it was
-    // splittable but there is an unsplittable one that starts at the same
-    // offset.
-    if (New != Partitions[i]) {
-      if (New.BeginOffset != New.EndOffset)
-        Partitions.push_back(New);
-      // Mark the old one for removal.
-      Partitions[i].kill();
-      ++NumDeadPartitions;
+    // For PHI and select operands outside the alloca, we can't nuke the entire
+    // phi or select -- the other side might still be relevant, so we special
+    // case them here and use a separate structure to track the operands
+    // themselves which should be replaced with undef.
+    // FIXME: This should instead be escaped in the event we're instrumenting
+    // for address sanitization.
+    if ((Offset.isNegative() && Offset.uge(SelectSize)) ||
+        (!Offset.isNegative() && Offset.uge(AllocSize))) {
+      S.DeadOperands.push_back(U);
+      return;
     }
 
-    New.BeginOffset = New.EndOffset;
-    if (!New.IsSplittable) {
-      New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
-      if (j != e && !Partitions[j].IsSplittable)
-        New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
-      New.IsSplittable = true;
-      // If there is a trailing splittable partition which won't be fused into
-      // the next splittable partition go ahead and add it onto the partitions
-      // list.
-      if (New.BeginOffset < New.EndOffset &&
-          (j == e || !Partitions[j].IsSplittable ||
-           New.EndOffset < Partitions[j].BeginOffset)) {
-        Partitions.push_back(New);
-        New.BeginOffset = New.EndOffset = 0ull;
-      }
-    }
+    insertUse(SI, Offset, SelectSize);
   }
 
-  // Re-sort the partitions now that they have been split and merged into
-  // disjoint set of partitions. Also remove any of the dead partitions we've
-  // replaced in the process.
-  std::sort(Partitions.begin(), Partitions.end());
-  if (NumDeadPartitions) {
-    assert(Partitions.back().isDead());
-    assert((ptrdiff_t)NumDeadPartitions ==
-           std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
+  /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+  void visitInstruction(Instruction &I) {
+    PI.setAborted(&I);
   }
-  Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
-}
+};
 
-AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
+AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
     :
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
       AI(AI),
 #endif
       PointerEscapingInstr(0) {
-  PartitionBuilder PB(TD, AI, *this);
-  PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+  SliceBuilder PB(DL, AI, *this);
+  SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
   if (PtrI.isEscaped() || PtrI.isAborted()) {
     // FIXME: We should sink the escape vs. abort info into the caller nicely,
-    // possibly by just storing the PtrInfo in the AllocaPartitioning.
+    // possibly by just storing the PtrInfo in the AllocaSlices.
     PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
                                                   : PtrI.getAbortingInst();
     assert(PointerEscapingInstr && "Did not track a bad instruction");
     return;
   }
 
+  Slices.erase(std::remove_if(Slices.begin(), Slices.end(),
+                              std::mem_fun_ref(&Slice::isDead)),
+               Slices.end());
+
   // Sort the uses. This arranges for the offsets to be in ascending order,
   // and the sizes to be in descending order.
-  std::sort(Partitions.begin(), Partitions.end());
-
-  // Remove any partitions from the back which are marked as dead.
-  while (!Partitions.empty() && Partitions.back().isDead())
-    Partitions.pop_back();
-
-  if (Partitions.size() > 1) {
-    // Intersect splittability for all partitions with equal offsets and sizes.
-    // Then remove all but the first so that we have a sequence of non-equal but
-    // potentially overlapping partitions.
-    for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
-         I = J) {
-      ++J;
-      while (J != E && *I == *J) {
-        I->IsSplittable &= J->IsSplittable;
-        ++J;
-      }
-    }
-    Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
-                     Partitions.end());
-
-    // Split splittable and merge unsplittable partitions into a disjoint set
-    // of partitions over the used space of the allocation.
-    splitAndMergePartitions();
-  }
-
-  // Record how many partitions we end up with.
-  NumAllocaPartitions += Partitions.size();
-  MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca);
-
-  // Now build up the user lists for each of these disjoint partitions by
-  // re-walking the recursive users of the alloca.
-  Uses.resize(Partitions.size());
-  UseBuilder UB(TD, AI, *this);
-  PtrI = UB.visitPtr(AI);
-  assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!");
-  assert(!PtrI.isAborted() && "Early aborted the visit of the pointer.");
-
-  unsigned NumUses = 0;
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS)
-  for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx)
-    NumUses += Uses[Idx].size();
-#endif
-  NumAllocaPartitionUses += NumUses;
-  MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca);
+  std::sort(Slices.begin(), Slices.end());
 }
 
-Type *AllocaPartitioning::getCommonType(iterator I) const {
-  Type *Ty = 0;
-  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
-    Use *U = UI->getUse();
-    if (!U)
-      continue; // Skip dead uses.
-    if (isa<IntrinsicInst>(*U->getUser()))
-      continue;
-    if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
-      continue;
-
-    Type *UserTy = 0;
-    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser()))
-      UserTy = LI->getType();
-    else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser()))
-      UserTy = SI->getValueOperand()->getType();
-    else
-      return 0; // Bail if we have weird uses.
-
-    if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
-      // If the type is larger than the partition, skip it. We only encounter
-      // this for split integer operations where we want to use the type of the
-      // entity causing the split.
-      if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8)
-        continue;
-
-      // If we have found an integer type use covering the alloca, use that
-      // regardless of the other types, as integers are often used for a "bucket
-      // of bits" type.
-      return ITy;
-    }
-
-    if (Ty && Ty != UserTy)
-      return 0;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
-    Ty = UserTy;
-  }
-  return Ty;
+void AllocaSlices::print(raw_ostream &OS, const_iterator I,
+                         StringRef Indent) const {
+  printSlice(OS, I, Indent);
+  printUse(OS, I, Indent);
 }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-
-void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
-                               StringRef Indent) const {
-  OS << Indent << "partition #" << (I - begin())
-     << " [" << I->BeginOffset << "," << I->EndOffset << ")"
-     << (I->IsSplittable ? " (splittable)" : "")
-     << (Uses[I - begin()].empty() ? " (zero uses)" : "")
-     << "\n";
+void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
+                              StringRef Indent) const {
+  OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
+     << " slice #" << (I - begin())
+     << (I->isSplittable() ? " (splittable)" : "") << "\n";
 }
 
-void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
-                                    StringRef Indent) const {
-  for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
-    if (!UI->getUse())
-      continue; // Skip dead uses.
-    OS << Indent << "  [" << UI->BeginOffset << "," << UI->EndOffset << ") "
-       << "used by: " << *UI->getUse()->getUser() << "\n";
-    if (MemTransferInst *II =
-            dyn_cast<MemTransferInst>(UI->getUse()->getUser())) {
-      const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
-      bool IsDest;
-      if (!MTO.IsSplittable)
-        IsDest = UI->BeginOffset == MTO.DestBegin;
-      else
-        IsDest = MTO.DestBegin != 0u;
-      OS << Indent << "    (original " << (IsDest ? "dest" : "source") << ": "
-         << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
-         << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
-    }
-  }
+void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
+                            StringRef Indent) const {
+  OS << Indent << "  used by: " << *I->getUse()->getUser() << "\n";
 }
 
-void AllocaPartitioning::print(raw_ostream &OS) const {
+void AllocaSlices::print(raw_ostream &OS) const {
   if (PointerEscapingInstr) {
-    OS << "No partitioning for alloca: " << AI << "\n"
+    OS << "Can't analyze slices for alloca: " << AI << "\n"
        << "  A pointer to this alloca escaped by:\n"
        << "  " << *PointerEscapingInstr << "\n";
     return;
   }
 
-  OS << "Partitioning of alloca: " << AI << "\n";
-  for (const_iterator I = begin(), E = end(); I != E; ++I) {
+  OS << "Slices of alloca: " << AI << "\n";
+  for (const_iterator I = begin(), E = end(); I != E; ++I)
     print(OS, I);
-    printUsers(OS, I);
-  }
 }
 
-void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
-void AllocaPartitioning::dump() const { print(dbgs()); }
+void AllocaSlices::dump(const_iterator I) const { print(dbgs(), I); }
+void AllocaSlices::dump() const { print(dbgs()); }
 
 #endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 
-
 namespace {
 /// \brief Implementation of LoadAndStorePromoter for promoting allocas.
 ///
@@ -1269,12 +733,13 @@ class AllocaPromoter : public LoadAndStorePromoter {
   SmallVector<DbgValueInst *, 4> DVIs;
 
 public:
-  AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+  AllocaPromoter(const SmallVectorImpl<Instruction *> &Insts, SSAUpdater &S,
                  AllocaInst &AI, DIBuilder &DIB)
-    : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
+      : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
 
   void run(const SmallVectorImpl<Instruction*> &Insts) {
-    // Remember which alloca we're promoting (for isInstInList).
+    // Retain the debug information attached to the alloca for use when
+    // rewriting loads and stores.
     if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
       for (Value::use_iterator UI = DebugNode->use_begin(),
                                UE = DebugNode->use_end();
@@ -1286,7 +751,9 @@ public:
     }
 
     LoadAndStorePromoter::run(Insts);
-    AI.eraseFromParent();
+
+    // While we have the debug information, clear it off of the alloca. The
+    // caller takes care of deleting the alloca.
     while (!DDIs.empty())
       DDIs.pop_back_val()->eraseFromParent();
     while (!DVIs.empty())
@@ -1295,13 +762,34 @@ public:
 
   virtual bool isInstInList(Instruction *I,
                             const SmallVectorImpl<Instruction*> &Insts) const {
+    Value *Ptr;
     if (LoadInst *LI = dyn_cast<LoadInst>(I))
-      return LI->getOperand(0) == &AI;
-    return cast<StoreInst>(I)->getPointerOperand() == &AI;
+      Ptr = LI->getOperand(0);
+    else
+      Ptr = cast<StoreInst>(I)->getPointerOperand();
+
+    // Only used to detect cycles, which will be rare and quickly found as
+    // we're walking up a chain of defs rather than down through uses.
+    SmallPtrSet<Value *, 4> Visited;
+
+    do {
+      if (Ptr == &AI)
+        return true;
+
+      if (BitCastInst *BCI = dyn_cast<BitCastInst>(Ptr))
+        Ptr = BCI->getOperand(0);
+      else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
+        Ptr = GEPI->getPointerOperand();
+      else
+        return false;
+
+    } while (Visited.insert(Ptr));
+
+    return false;
   }
 
   virtual void updateDebugInfo(Instruction *Inst) const {
-    for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
+    for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
            E = DDIs.end(); I != E; ++I) {
       DbgDeclareInst *DDI = *I;
       if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
@@ -1309,7 +797,7 @@ public:
       else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
         ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
     }
-    for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
+    for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(),
            E = DVIs.end(); I != E; ++I) {
       DbgValueInst *DVI = *I;
       Value *Arg = 0;
@@ -1360,7 +848,7 @@ class SROA : public FunctionPass {
   const bool RequiresDomTree;
 
   LLVMContext *C;
-  const DataLayout *TD;
+  const DataLayout *DL;
   DominatorTree *DT;
 
   /// \brief Worklist of alloca instructions to simplify.
@@ -1390,10 +878,25 @@ class SROA : public FunctionPass {
   /// \brief A collection of alloca instructions we can directly promote.
   std::vector<AllocaInst *> PromotableAllocas;
 
+  /// \brief A worklist of PHIs to speculate prior to promoting allocas.
+  ///
+  /// All of these PHIs have been checked for the safety of speculation and by
+  /// being speculated will allow promoting allocas currently in the promotable
+  /// queue.
+  SetVector<PHINode *, SmallVector<PHINode *, 2> > SpeculatablePHIs;
+
+  /// \brief A worklist of select instructions to speculate prior to promoting
+  /// allocas.
+  ///
+  /// All of these select instructions have been checked for the safety of
+  /// speculation and by being speculated will allow promoting allocas
+  /// currently in the promotable queue.
+  SetVector<SelectInst *, SmallVector<SelectInst *, 2> > SpeculatableSelects;
+
 public:
   SROA(bool RequiresDomTree = true)
       : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
-        C(0), TD(0), DT(0) {
+        C(0), DL(0), DT(0) {
     initializeSROAPass(*PassRegistry::getPassRegistry());
   }
   bool runOnFunction(Function &F);
@@ -1404,13 +907,13 @@ public:
 
 private:
   friend class PHIOrSelectSpeculator;
-  friend class AllocaPartitionRewriter;
-  friend class AllocaPartitionVectorRewriter;
+  friend class AllocaSliceRewriter;
 
-  bool rewriteAllocaPartition(AllocaInst &AI,
-                              AllocaPartitioning &P,
-                              AllocaPartitioning::iterator PI);
-  bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
+  bool rewritePartition(AllocaInst &AI, AllocaSlices &S,
+                        AllocaSlices::iterator B, AllocaSlices::iterator E,
+                        int64_t BeginOffset, int64_t EndOffset,
+                        ArrayRef<AllocaSlices::iterator> SplitUses);
+  bool splitAlloca(AllocaInst &AI, AllocaSlices &S);
   bool runOnAlloca(AllocaInst &AI);
   void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
   bool promoteAllocas(Function &F);
@@ -1429,286 +932,255 @@ INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
                     false, false)
 
-namespace {
-/// \brief Visitor to speculate PHIs and Selects where possible.
-class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> {
-  // Befriend the base class so it can delegate to private visit methods.
-  friend class llvm::InstVisitor<PHIOrSelectSpeculator>;
-
-  const DataLayout &TD;
-  AllocaPartitioning &P;
-  SROA &Pass;
+/// Walk the range of a partitioning looking for a common type to cover this
+/// sequence of slices.
+static Type *findCommonType(AllocaSlices::const_iterator B,
+                            AllocaSlices::const_iterator E,
+                            uint64_t EndOffset) {
+  Type *Ty = 0;
+  bool IgnoreNonIntegralTypes = false;
+  for (AllocaSlices::const_iterator I = B; I != E; ++I) {
+    Use *U = I->getUse();
+    if (isa<IntrinsicInst>(*U->getUser()))
+      continue;
+    if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
+      continue;
 
-public:
-  PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass)
-    : TD(TD), P(P), Pass(Pass) {}
-
-  /// \brief Visit the users of an alloca partition and rewrite them.
-  void visitUsers(AllocaPartitioning::const_iterator PI) {
-    // Note that we need to use an index here as the underlying vector of uses
-    // may be grown during speculation. However, we never need to re-visit the
-    // new uses, and so we can use the initial size bound.
-    for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
-      const PartitionUse &PU = P.getUse(PI, Idx);
-      if (!PU.getUse())
-        continue; // Skip dead use.
-
-      visit(cast<Instruction>(PU.getUse()->getUser()));
+    Type *UserTy = 0;
+    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+      UserTy = LI->getType();
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+      UserTy = SI->getValueOperand()->getType();
+    } else {
+      IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
+      continue;
     }
-  }
 
-private:
-  // By default, skip this instruction.
-  void visitInstruction(Instruction &I) {}
-
-  /// PHI instructions that use an alloca and are subsequently loaded can be
-  /// rewritten to load both input pointers in the pred blocks and then PHI the
-  /// results, allowing the load of the alloca to be promoted.
-  /// From this:
-  ///   %P2 = phi [i32* %Alloca, i32* %Other]
-  ///   %V = load i32* %P2
-  /// to:
-  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
-  ///   ...
-  ///   %V2 = load i32* %Other
-  ///   ...
-  ///   %V = phi [i32 %V1, i32 %V2]
-  ///
-  /// We can do this to a select if its only uses are loads and if the operands
-  /// to the select can be loaded unconditionally.
-  ///
-  /// FIXME: This should be hoisted into a generic utility, likely in
-  /// Transforms/Util/Local.h
-  bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
-    // For now, we can only do this promotion if the load is in the same block
-    // as the PHI, and if there are no stores between the phi and load.
-    // TODO: Allow recursive phi users.
-    // TODO: Allow stores.
-    BasicBlock *BB = PN.getParent();
-    unsigned MaxAlign = 0;
-    for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
-         UI != UE; ++UI) {
-      LoadInst *LI = dyn_cast<LoadInst>(*UI);
-      if (LI == 0 || !LI->isSimple()) return false;
-
-      // For now we only allow loads in the same block as the PHI.  This is
-      // a common case that happens when instcombine merges two loads through
-      // a PHI.
-      if (LI->getParent() != BB) return false;
-
-      // Ensure that there are no instructions between the PHI and the load that
-      // could store.
-      for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
-        if (BBI->mayWriteToMemory())
-          return false;
-
-      MaxAlign = std::max(MaxAlign, LI->getAlignment());
-      Loads.push_back(LI);
+    if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
+      // If the type is larger than the partition, skip it. We only encounter
+      // this for split integer operations where we want to use the type of the
+      // entity causing the split. Also skip if the type is not a byte width
+      // multiple.
+      if (ITy->getBitWidth() % 8 != 0 ||
+          ITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+        continue;
+
+      // If we have found an integer type use covering the alloca, use that
+      // regardless of the other types, as integers are often used for
+      // a "bucket of bits" type.
+      //
+      // NB: This *must* be the only return from inside the loop so that the
+      // order of slices doesn't impact the computed type.
+      return ITy;
+    } else if (IgnoreNonIntegralTypes) {
+      continue;
     }
 
-    // We can only transform this if it is safe to push the loads into the
-    // predecessor blocks. The only thing to watch out for is that we can't put
-    // a possibly trapping load in the predecessor if it is a critical edge.
-    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
-      TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
-      Value *InVal = PN.getIncomingValue(Idx);
-
-      // If the value is produced by the terminator of the predecessor (an
-      // invoke) or it has side-effects, there is no valid place to put a load
-      // in the predecessor.
-      if (TI == InVal || TI->mayHaveSideEffects())
-        return false;
+    if (Ty && Ty != UserTy)
+      IgnoreNonIntegralTypes = true; // Give up on anything but an iN type.
 
-      // If the predecessor has a single successor, then the edge isn't
-      // critical.
-      if (TI->getNumSuccessors() == 1)
-        continue;
+    Ty = UserTy;
+  }
+  return Ty;
+}
 
-      // If this pointer is always safe to load, or if we can prove that there
-      // is already a load in the block, then we can move the load to the pred
-      // block.
-      if (InVal->isDereferenceablePointer() ||
-          isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
-        continue;
+/// PHI instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers in the pred blocks and then PHI the
+/// results, allowing the load of the alloca to be promoted.
+/// From this:
+///   %P2 = phi [i32* %Alloca, i32* %Other]
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   ...
+///   %V2 = load i32* %Other
+///   ...
+///   %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operands
+/// to the select can be loaded unconditionally.
+///
+/// FIXME: This should be hoisted into a generic utility, likely in
+/// Transforms/Util/Local.h
+static bool isSafePHIToSpeculate(PHINode &PN,
+                                 const DataLayout *DL = 0) {
+  // For now, we can only do this promotion if the load is in the same block
+  // as the PHI, and if there are no stores between the phi and load.
+  // TODO: Allow recursive phi users.
+  // TODO: Allow stores.
+  BasicBlock *BB = PN.getParent();
+  unsigned MaxAlign = 0;
+  bool HaveLoad = false;
+  for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end(); UI != UE;
+       ++UI) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+    if (LI == 0 || !LI->isSimple())
+      return false;
 
+    // For now we only allow loads in the same block as the PHI.  This is
+    // a common case that happens when instcombine merges two loads through
+    // a PHI.
+    if (LI->getParent() != BB)
       return false;
-    }
 
-    return true;
+    // Ensure that there are no instructions between the PHI and the load that
+    // could store.
+    for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
+      if (BBI->mayWriteToMemory())
+        return false;
+
+    MaxAlign = std::max(MaxAlign, LI->getAlignment());
+    HaveLoad = true;
   }
 
-  void visitPHINode(PHINode &PN) {
-    DEBUG(dbgs() << "    original: " << PN << "\n");
+  if (!HaveLoad)
+    return false;
 
-    SmallVector<LoadInst *, 4> Loads;
-    if (!isSafePHIToSpeculate(PN, Loads))
-      return;
+  // We can only transform this if it is safe to push the loads into the
+  // predecessor blocks. The only thing to watch out for is that we can't put
+  // a possibly trapping load in the predecessor if it is a critical edge.
+  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+    TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+    Value *InVal = PN.getIncomingValue(Idx);
+
+    // If the value is produced by the terminator of the predecessor (an
+    // invoke) or it has side-effects, there is no valid place to put a load
+    // in the predecessor.
+    if (TI == InVal || TI->mayHaveSideEffects())
+      return false;
 
-    assert(!Loads.empty());
+    // If the predecessor has a single successor, then the edge isn't
+    // critical.
+    if (TI->getNumSuccessors() == 1)
+      continue;
 
-    Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
-    IRBuilderTy PHIBuilder(&PN);
-    PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
-                                          PN.getName() + ".sroa.speculated");
+    // If this pointer is always safe to load, or if we can prove that there
+    // is already a load in the block, then we can move the load to the pred
+    // block.
+    if (InVal->isDereferenceablePointer() ||
+        isSafeToLoadUnconditionally(InVal, TI, MaxAlign, DL))
+      continue;
 
-    // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
-    // matter which one we get and if any differ.
-    LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
-    MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
-    unsigned Align = SomeLoad->getAlignment();
+    return false;
+  }
 
-    // Rewrite all loads of the PN to use the new PHI.
-    do {
-      LoadInst *LI = Loads.pop_back_val();
-      LI->replaceAllUsesWith(NewPN);
-      Pass.DeadInsts.insert(LI);
-    } while (!Loads.empty());
-
-    // Inject loads into all of the pred blocks.
-    for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
-      BasicBlock *Pred = PN.getIncomingBlock(Idx);
-      TerminatorInst *TI = Pred->getTerminator();
-      Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
-      Value *InVal = PN.getIncomingValue(Idx);
-      IRBuilderTy PredBuilder(TI);
-
-      LoadInst *Load
-        = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
-                                         Pred->getName()));
-      ++NumLoadsSpeculated;
-      Load->setAlignment(Align);
-      if (TBAATag)
-        Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
-      NewPN->addIncoming(Load, Pred);
-
-      Instruction *Ptr = dyn_cast<Instruction>(InVal);
-      if (!Ptr)
-        // No uses to rewrite.
-        continue;
+  return true;
+}
 
-      // Try to lookup and rewrite any partition uses corresponding to this phi
-      // input.
-      AllocaPartitioning::iterator PI
-        = P.findPartitionForPHIOrSelectOperand(InUse);
-      if (PI == P.end())
-        continue;
+static void speculatePHINodeLoads(PHINode &PN) {
+  DEBUG(dbgs() << "    original: " << PN << "\n");
+
+  Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+  IRBuilderTy PHIBuilder(&PN);
+  PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+                                        PN.getName() + ".sroa.speculated");
+
+  // Get the TBAA tag and alignment to use from one of the loads.  It doesn't
+  // matter which one we get and if any differ.
+  LoadInst *SomeLoad = cast<LoadInst>(*PN.use_begin());
+  MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+  unsigned Align = SomeLoad->getAlignment();
+
+  // Rewrite all loads of the PN to use the new PHI.
+  while (!PN.use_empty()) {
+    LoadInst *LI = cast<LoadInst>(*PN.use_begin());
+    LI->replaceAllUsesWith(NewPN);
+    LI->eraseFromParent();
+  }
+
+  // Inject loads into all of the pred blocks.
+  for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+    BasicBlock *Pred = PN.getIncomingBlock(Idx);
+    TerminatorInst *TI = Pred->getTerminator();
+    Value *InVal = PN.getIncomingValue(Idx);
+    IRBuilderTy PredBuilder(TI);
+
+    LoadInst *Load = PredBuilder.CreateLoad(
+        InVal, (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
+    ++NumLoadsSpeculated;
+    Load->setAlignment(Align);
+    if (TBAATag)
+      Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+    NewPN->addIncoming(Load, Pred);
+  }
+
+  DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
+  PN.eraseFromParent();
+}
 
-      // Replace the Use in the PartitionUse for this operand with the Use
-      // inside the load.
-      AllocaPartitioning::use_iterator UI
-        = P.findPartitionUseForPHIOrSelectOperand(InUse);
-      assert(isa<PHINode>(*UI->getUse()->getUser()));
-      UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex()));
-    }
-    DEBUG(dbgs() << "          speculated to: " << *NewPN << "\n");
-  }
-
-  /// Select instructions that use an alloca and are subsequently loaded can be
-  /// rewritten to load both input pointers and then select between the result,
-  /// allowing the load of the alloca to be promoted.
-  /// From this:
-  ///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
-  ///   %V = load i32* %P2
-  /// to:
-  ///   %V1 = load i32* %Alloca      -> will be mem2reg'd
-  ///   %V2 = load i32* %Other
-  ///   %V = select i1 %cond, i32 %V1, i32 %V2
-  ///
-  /// We can do this to a select if its only uses are loads and if the operand
-  /// to the select can be loaded unconditionally.
-  bool isSafeSelectToSpeculate(SelectInst &SI,
-                               SmallVectorImpl<LoadInst *> &Loads) {
-    Value *TValue = SI.getTrueValue();
-    Value *FValue = SI.getFalseValue();
-    bool TDerefable = TValue->isDereferenceablePointer();
-    bool FDerefable = FValue->isDereferenceablePointer();
-
-    for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
-         UI != UE; ++UI) {
-      LoadInst *LI = dyn_cast<LoadInst>(*UI);
-      if (LI == 0 || !LI->isSimple()) return false;
-
-      // Both operands to the select need to be dereferencable, either
-      // absolutely (e.g. allocas) or at this point because we can see other
-      // accesses to it.
-      if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
-                                                      LI->getAlignment(), &TD))
-        return false;
-      if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
-                                                      LI->getAlignment(), &TD))
-        return false;
-      Loads.push_back(LI);
-    }
+/// Select instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers and then select between the result,
+/// allowing the load of the alloca to be promoted.
+/// From this:
+///   %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+///   %V = load i32* %P2
+/// to:
+///   %V1 = load i32* %Alloca      -> will be mem2reg'd
+///   %V2 = load i32* %Other
+///   %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand
+/// to the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst &SI, const DataLayout *DL = 0) {
+  Value *TValue = SI.getTrueValue();
+  Value *FValue = SI.getFalseValue();
+  bool TDerefable = TValue->isDereferenceablePointer();
+  bool FDerefable = FValue->isDereferenceablePointer();
+
+  for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end(); UI != UE;
+       ++UI) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI);
+    if (LI == 0 || !LI->isSimple())
+      return false;
 
-    return true;
+    // Both operands to the select need to be dereferencable, either
+    // absolutely (e.g. allocas) or at this point because we can see other
+    // accesses to it.
+    if (!TDerefable &&
+        !isSafeToLoadUnconditionally(TValue, LI, LI->getAlignment(), DL))
+      return false;
+    if (!FDerefable &&
+        !isSafeToLoadUnconditionally(FValue, LI, LI->getAlignment(), DL))
+      return false;
   }
 
-  void visitSelectInst(SelectInst &SI) {
-    DEBUG(dbgs() << "    original: " << SI << "\n");
-
-    // If the select isn't safe to speculate, just use simple logic to emit it.
-    SmallVector<LoadInst *, 4> Loads;
-    if (!isSafeSelectToSpeculate(SI, Loads))
-      return;
+  return true;
+}
 
-    IRBuilderTy IRB(&SI);
-    Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
-    AllocaPartitioning::iterator PIs[2];
-    PartitionUse PUs[2];
-    for (unsigned i = 0, e = 2; i != e; ++i) {
-      PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]);
-      if (PIs[i] != P.end()) {
-        // If the pointer is within the partitioning, remove the select from
-        // its uses. We'll add in the new loads below.
-        AllocaPartitioning::use_iterator UI
-          = P.findPartitionUseForPHIOrSelectOperand(Ops[i]);
-        PUs[i] = *UI;
-        // Clear out the use here so that the offsets into the use list remain
-        // stable but this use is ignored when rewriting.
-        UI->setUse(0);
-      }
-    }
+static void speculateSelectInstLoads(SelectInst &SI) {
+  DEBUG(dbgs() << "    original: " << SI << "\n");
 
-    Value *TV = SI.getTrueValue();
-    Value *FV = SI.getFalseValue();
-    // Replace the loads of the select with a select of two loads.
-    while (!Loads.empty()) {
-      LoadInst *LI = Loads.pop_back_val();
+  IRBuilderTy IRB(&SI);
+  Value *TV = SI.getTrueValue();
+  Value *FV = SI.getFalseValue();
+  // Replace the loads of the select with a select of two loads.
+  while (!SI.use_empty()) {
+    LoadInst *LI = cast<LoadInst>(*SI.use_begin());
+    assert(LI->isSimple() && "We only speculate simple loads");
 
-      IRB.SetInsertPoint(LI);
-      LoadInst *TL =
+    IRB.SetInsertPoint(LI);
+    LoadInst *TL =
         IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
-      LoadInst *FL =
+    LoadInst *FL =
         IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
-      NumLoadsSpeculated += 2;
-
-      // Transfer alignment and TBAA info if present.
-      TL->setAlignment(LI->getAlignment());
-      FL->setAlignment(LI->getAlignment());
-      if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
-        TL->setMetadata(LLVMContext::MD_tbaa, Tag);
-        FL->setMetadata(LLVMContext::MD_tbaa, Tag);
-      }
+    NumLoadsSpeculated += 2;
 
-      Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
-                                  LI->getName() + ".sroa.speculated");
+    // Transfer alignment and TBAA info if present.
+    TL->setAlignment(LI->getAlignment());
+    FL->setAlignment(LI->getAlignment());
+    if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
+      TL->setMetadata(LLVMContext::MD_tbaa, Tag);
+      FL->setMetadata(LLVMContext::MD_tbaa, Tag);
+    }
 
-      LoadInst *Loads[2] = { TL, FL };
-      for (unsigned i = 0, e = 2; i != e; ++i) {
-        if (PIs[i] != P.end()) {
-          Use *LoadUse = &Loads[i]->getOperandUse(0);
-          assert(PUs[i].getUse()->get() == LoadUse->get());
-          PUs[i].setUse(LoadUse);
-          P.use_push_back(PIs[i], PUs[i]);
-        }
-      }
+    Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+                                LI->getName() + ".sroa.speculated");
 
-      DEBUG(dbgs() << "          speculated to: " << *V << "\n");
-      LI->replaceAllUsesWith(V);
-      Pass.DeadInsts.insert(LI);
-    }
+    DEBUG(dbgs() << "          speculated to: " << *V << "\n");
+    LI->replaceAllUsesWith(V);
+    LI->eraseFromParent();
   }
-};
+  SI.eraseFromParent();
 }
 
 /// \brief Build a GEP out of a base pointer and indices.
@@ -1737,7 +1209,7 @@ static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
 /// TargetTy. If we can't find one with the same type, we at least try to use
 /// one with the same size. If none of that works, we just produce the GEP as
 /// indicated by Indices to have the correct offset.
-static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD,
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
                                     Value *BasePtr, Type *Ty, Type *TargetTy,
                                     SmallVectorImpl<Value *> &Indices) {
   if (Ty == TargetTy)
@@ -1754,7 +1226,7 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD,
       ElementTy = SeqTy->getElementType();
       // Note that we use the default address space as this index is over an
       // array or a vector, not a pointer.
-      Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0)));
+      Indices.push_back(IRB.getInt(APInt(DL.getPointerSizeInBits(0), 0)));
     } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
       if (STy->element_begin() == STy->element_end())
         break; // Nothing left to descend into.
@@ -1775,12 +1247,12 @@ static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD,
 ///
 /// This is the recursive step for getNaturalGEPWithOffset that walks down the
 /// element types adding appropriate indices for the GEP.
-static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
                                        Value *Ptr, Type *Ty, APInt &Offset,
                                        Type *TargetTy,
                                        SmallVectorImpl<Value *> &Indices) {
   if (Offset == 0)
-    return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices);
+    return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices);
 
   // We can't recurse through pointer types.
   if (Ty->isPointerTy())
@@ -1790,7 +1262,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
   // extremely poorly defined currently. The long-term goal is to remove GEPing
   // over a vector from the IR completely.
   if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
-    unsigned ElementSizeInBits = TD.getTypeSizeInBits(VecTy->getScalarType());
+    unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
     if (ElementSizeInBits % 8)
       return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
     APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
@@ -1799,20 +1271,20 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
       return 0;
     Offset -= NumSkippedElements * ElementSize;
     Indices.push_back(IRB.getInt(NumSkippedElements));
-    return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
+    return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
                                     Offset, TargetTy, Indices);
   }
 
   if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
     Type *ElementTy = ArrTy->getElementType();
-    APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+    APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
     APInt NumSkippedElements = Offset.sdiv(ElementSize);
     if (NumSkippedElements.ugt(ArrTy->getNumElements()))
       return 0;
 
     Offset -= NumSkippedElements * ElementSize;
     Indices.push_back(IRB.getInt(NumSkippedElements));
-    return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+    return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                     Indices);
   }
 
@@ -1820,18 +1292,18 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
   if (!STy)
     return 0;
 
-  const StructLayout *SL = TD.getStructLayout(STy);
+  const StructLayout *SL = DL.getStructLayout(STy);
   uint64_t StructOffset = Offset.getZExtValue();
   if (StructOffset >= SL->getSizeInBytes())
     return 0;
   unsigned Index = SL->getElementContainingOffset(StructOffset);
   Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
   Type *ElementTy = STy->getElementType(Index);
-  if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
+  if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
     return 0; // The offset points into alignment padding.
 
   Indices.push_back(IRB.getInt32(Index));
-  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                   Indices);
 }
 
@@ -1845,7 +1317,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD,
 /// Indices, and setting Ty to the result subtype.
 ///
 /// If no natural GEP can be constructed, this function returns null.
-static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD,
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
                                       Value *Ptr, APInt Offset, Type *TargetTy,
                                       SmallVectorImpl<Value *> &Indices) {
   PointerType *Ty = cast<PointerType>(Ptr->getType());
@@ -1858,14 +1330,14 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD,
   Type *ElementTy = Ty->getElementType();
   if (!ElementTy->isSized())
     return 0; // We can't GEP through an unsized element.
-  APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+  APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
   if (ElementSize == 0)
     return 0; // Zero-length arrays can't help us build a natural GEP.
   APInt NumSkippedElements = Offset.sdiv(ElementSize);
 
   Offset -= NumSkippedElements * ElementSize;
   Indices.push_back(IRB.getInt(NumSkippedElements));
-  return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+  return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
                                   Indices);
 }
 
@@ -1884,7 +1356,7 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD,
 /// properties. The algorithm tries to fold as many constant indices into
 /// a single GEP as possible, thus making each GEP more independent of the
 /// surrounding code.
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD,
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL,
                              Value *Ptr, APInt Offset, Type *PointerTy) {
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
@@ -1908,7 +1380,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD,
     // First fold any existing GEPs into the offset.
     while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
       APInt GEPOffset(Offset.getBitWidth(), 0);
-      if (!GEP->accumulateConstantOffset(TD, GEPOffset))
+      if (!GEP->accumulateConstantOffset(DL, GEPOffset))
         break;
       Offset += GEPOffset;
       Ptr = GEP->getPointerOperand();
@@ -1918,7 +1390,7 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD,
 
     // See if we can perform a natural GEP here.
     Indices.clear();
-    if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
+    if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
                                            Indices)) {
       if (P->getType() == PointerTy) {
         // Zap any offset pointer that we ended up computing in previous rounds.
@@ -1989,6 +1461,10 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
   if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
     return false;
 
+  // We can convert pointers to integers and vice-versa. Same for vectors
+  // of pointers and integers.
+  OldTy = OldTy->getScalarType();
+  NewTy = NewTy->getScalarType();
   if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
     if (NewTy->isPointerTy() && OldTy->isPointerTy())
       return true;
@@ -2007,24 +1483,126 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
 /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
 /// two types for viability with this routine.
 static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
-                           Type *Ty) {
-  assert(canConvertValue(DL, V->getType(), Ty) &&
-         "Value not convertable to type");
-  if (V->getType() == Ty)
+                           Type *NewTy) {
+  Type *OldTy = V->getType();
+  assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+  if (OldTy == NewTy)
     return V;
-  if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType()))
-    if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty))
+
+  if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy))
+    if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy))
       if (NewITy->getBitWidth() > OldITy->getBitWidth())
         return IRB.CreateZExt(V, NewITy);
-  if (V->getType()->isIntegerTy() && Ty->isPointerTy())
-    return IRB.CreateIntToPtr(V, Ty);
-  if (V->getType()->isPointerTy() && Ty->isIntegerTy())
-    return IRB.CreatePtrToInt(V, Ty);
 
-  return IRB.CreateBitCast(V, Ty);
+  // See if we need inttoptr for this type pair. A cast involving both scalars
+  // and vectors requires and additional bitcast.
+  if (OldTy->getScalarType()->isIntegerTy() &&
+      NewTy->getScalarType()->isPointerTy()) {
+    // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+                                NewTy);
+
+    return IRB.CreateIntToPtr(V, NewTy);
+  }
+
+  // See if we need ptrtoint for this type pair. A cast involving both scalars
+  // and vectors requires and additional bitcast.
+  if (OldTy->getScalarType()->isPointerTy() &&
+      NewTy->getScalarType()->isIntegerTy()) {
+    // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+    if (OldTy->isVectorTy() && !NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+    if (!OldTy->isVectorTy() && NewTy->isVectorTy())
+      return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+                               NewTy);
+
+    return IRB.CreatePtrToInt(V, NewTy);
+  }
+
+  return IRB.CreateBitCast(V, NewTy);
 }
 
-/// \brief Test whether the given alloca partition can be promoted to a vector.
+/// \brief Test whether the given slice use can be promoted to a vector.
+///
+/// This function is called to test each entry in a partioning which is slated
+/// for a single slice.
+static bool isVectorPromotionViableForSlice(
+    const DataLayout &DL, AllocaSlices &S, uint64_t SliceBeginOffset,
+    uint64_t SliceEndOffset, VectorType *Ty, uint64_t ElementSize,
+    AllocaSlices::const_iterator I) {
+  // First validate the slice offsets.
+  uint64_t BeginOffset =
+      std::max(I->beginOffset(), SliceBeginOffset) - SliceBeginOffset;
+  uint64_t BeginIndex = BeginOffset / ElementSize;
+  if (BeginIndex * ElementSize != BeginOffset ||
+      BeginIndex >= Ty->getNumElements())
+    return false;
+  uint64_t EndOffset =
+      std::min(I->endOffset(), SliceEndOffset) - SliceBeginOffset;
+  uint64_t EndIndex = EndOffset / ElementSize;
+  if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
+    return false;
+
+  assert(EndIndex > BeginIndex && "Empty vector!");
+  uint64_t NumElements = EndIndex - BeginIndex;
+  Type *SliceTy =
+      (NumElements == 1) ? Ty->getElementType()
+                         : VectorType::get(Ty->getElementType(), NumElements);
+
+  Type *SplitIntTy =
+      Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
+
+  Use *U = I->getUse();
+
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+    if (MI->isVolatile())
+      return false;
+    if (!I->isSplittable())
+      return false; // Skip any unsplittable intrinsics.
+  } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+    // Disable vector promotion when there are loads or stores of an FCA.
+    return false;
+  } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+    if (LI->isVolatile())
+      return false;
+    Type *LTy = LI->getType();
+    if (SliceBeginOffset > I->beginOffset() ||
+        SliceEndOffset < I->endOffset()) {
+      assert(LTy->isIntegerTy());
+      LTy = SplitIntTy;
+    }
+    if (!canConvertValue(DL, SliceTy, LTy))
+      return false;
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+    if (SI->isVolatile())
+      return false;
+    Type *STy = SI->getValueOperand()->getType();
+    if (SliceBeginOffset > I->beginOffset() ||
+        SliceEndOffset < I->endOffset()) {
+      assert(STy->isIntegerTy());
+      STy = SplitIntTy;
+    }
+    if (!canConvertValue(DL, STy, SliceTy))
+      return false;
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+/// \brief Test whether the given alloca partitioning and range of slices can be
+/// promoted to a vector.
 ///
 /// This is a quick test to check whether we can rewrite a particular alloca
 /// partition (and its newly formed alloca) into a vector alloca with only
@@ -2032,75 +1610,103 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
 /// SSA value. We only can ensure this for a limited set of operations, and we
 /// don't want to do the rewrites unless we are confident that the result will
 /// be promotable, so we have an early test here.
-static bool isVectorPromotionViable(const DataLayout &TD,
-                                    Type *AllocaTy,
-                                    AllocaPartitioning &P,
-                                    uint64_t PartitionBeginOffset,
-                                    uint64_t PartitionEndOffset,
-                                    AllocaPartitioning::const_use_iterator I,
-                                    AllocaPartitioning::const_use_iterator E) {
+static bool
+isVectorPromotionViable(const DataLayout &DL, Type *AllocaTy, AllocaSlices &S,
+                        uint64_t SliceBeginOffset, uint64_t SliceEndOffset,
+                        AllocaSlices::const_iterator I,
+                        AllocaSlices::const_iterator E,
+                        ArrayRef<AllocaSlices::iterator> SplitUses) {
   VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
   if (!Ty)
     return false;
 
-  uint64_t ElementSize = TD.getTypeSizeInBits(Ty->getScalarType());
+  uint64_t ElementSize = DL.getTypeSizeInBits(Ty->getScalarType());
 
   // While the definition of LLVM vectors is bitpacked, we don't support sizes
   // that aren't byte sized.
   if (ElementSize % 8)
     return false;
-  assert((TD.getTypeSizeInBits(Ty) % 8) == 0 &&
+  assert((DL.getTypeSizeInBits(Ty) % 8) == 0 &&
          "vector size not a multiple of element size?");
   ElementSize /= 8;
 
-  for (; I != E; ++I) {
-    Use *U = I->getUse();
-    if (!U)
-      continue; // Skip dead use.
-
-    uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
-    uint64_t BeginIndex = BeginOffset / ElementSize;
-    if (BeginIndex * ElementSize != BeginOffset ||
-        BeginIndex >= Ty->getNumElements())
+  for (; I != E; ++I)
+    if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
+                                         SliceEndOffset, Ty, ElementSize, I))
       return false;
-    uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
-    uint64_t EndIndex = EndOffset / ElementSize;
-    if (EndIndex * ElementSize != EndOffset ||
-        EndIndex > Ty->getNumElements())
+
+  for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
+                                                        SUE = SplitUses.end();
+       SUI != SUE; ++SUI)
+    if (!isVectorPromotionViableForSlice(DL, S, SliceBeginOffset,
+                                         SliceEndOffset, Ty, ElementSize, *SUI))
       return false;
 
-    assert(EndIndex > BeginIndex && "Empty vector!");
-    uint64_t NumElements = EndIndex - BeginIndex;
-    Type *PartitionTy
-      = (NumElements == 1) ? Ty->getElementType()
-                           : VectorType::get(Ty->getElementType(), NumElements);
+  return true;
+}
 
-    if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
-      if (MI->isVolatile())
-        return false;
-      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
-        const AllocaPartitioning::MemTransferOffsets &MTO
-          = P.getMemTransferOffsets(*MTI);
-        if (!MTO.IsSplittable)
-          return false;
-      }
-    } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
-      // Disable vector promotion when there are loads or stores of an FCA.
+/// \brief Test whether a slice of an alloca is valid for integer widening.
+///
+/// This implements the necessary checking for the \c isIntegerWideningViable
+/// test below on a single slice of the alloca.
+static bool isIntegerWideningViableForSlice(const DataLayout &DL,
+                                            Type *AllocaTy,
+                                            uint64_t AllocBeginOffset,
+                                            uint64_t Size, AllocaSlices &S,
+                                            AllocaSlices::const_iterator I,
+                                            bool &WholeAllocaOp) {
+  uint64_t RelBegin = I->beginOffset() - AllocBeginOffset;
+  uint64_t RelEnd = I->endOffset() - AllocBeginOffset;
+
+  // We can't reasonably handle cases where the load or store extends past
+  // the end of the aloca's type and into its padding.
+  if (RelEnd > Size)
+    return false;
+
+  Use *U = I->getUse();
+
+  if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+    if (LI->isVolatile())
       return false;
-    } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
-      if (LI->isVolatile())
-        return false;
-      if (!canConvertValue(TD, PartitionTy, LI->getType()))
-        return false;
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
-      if (SI->isVolatile())
+    if (RelBegin == 0 && RelEnd == Size)
+      WholeAllocaOp = true;
+    if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
         return false;
-      if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy))
+    } else if (RelBegin != 0 || RelEnd != Size ||
+               !canConvertValue(DL, AllocaTy, LI->getType())) {
+      // Non-integer loads need to be convertible from the alloca type so that
+      // they are promotable.
+      return false;
+    }
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+    Type *ValueTy = SI->getValueOperand()->getType();
+    if (SI->isVolatile())
+      return false;
+    if (RelBegin == 0 && RelEnd == Size)
+      WholeAllocaOp = true;
+    if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+      if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
         return false;
-    } else {
+    } else if (RelBegin != 0 || RelEnd != Size ||
+               !canConvertValue(DL, ValueTy, AllocaTy)) {
+      // Non-integer stores need to be convertible to the alloca type so that
+      // they are promotable.
       return false;
     }
+  } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+    if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+      return false;
+    if (!I->isSplittable())
+      return false; // Skip any unsplittable intrinsics.
+  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+    if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+        II->getIntrinsicID() != Intrinsic::lifetime_end)
+      return false;
+  } else {
+    return false;
   }
+
   return true;
 }
 
@@ -2110,97 +1716,50 @@ static bool isVectorPromotionViable(const DataLayout &TD,
 /// This is a quick test to check whether we can rewrite the integer loads and
 /// stores to a particular alloca into wider loads and stores and be able to
 /// promote the resulting alloca.
-static bool isIntegerWideningViable(const DataLayout &TD,
-                                    Type *AllocaTy,
-                                    uint64_t AllocBeginOffset,
-                                    AllocaPartitioning &P,
-                                    AllocaPartitioning::const_use_iterator I,
-                                    AllocaPartitioning::const_use_iterator E) {
-  uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+static bool
+isIntegerWideningViable(const DataLayout &DL, Type *AllocaTy,
+                        uint64_t AllocBeginOffset, AllocaSlices &S,
+                        AllocaSlices::const_iterator I,
+                        AllocaSlices::const_iterator E,
+                        ArrayRef<AllocaSlices::iterator> SplitUses) {
+  uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
   // Don't create integer types larger than the maximum bitwidth.
   if (SizeInBits > IntegerType::MAX_INT_BITS)
     return false;
 
   // Don't try to handle allocas with bit-padding.
-  if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
+  if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
     return false;
 
   // We need to ensure that an integer type with the appropriate bitwidth can
   // be converted to the alloca type, whatever that is. We don't want to force
   // the alloca itself to have an integer type if there is a more suitable one.
   Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
-  if (!canConvertValue(TD, AllocaTy, IntTy) ||
-      !canConvertValue(TD, IntTy, AllocaTy))
+  if (!canConvertValue(DL, AllocaTy, IntTy) ||
+      !canConvertValue(DL, IntTy, AllocaTy))
     return false;
 
-  uint64_t Size = TD.getTypeStoreSize(AllocaTy);
-
-  // Check the uses to ensure the uses are (likely) promotable integer uses.
-  // Also ensure that the alloca has a covering load or store. We don't want
-  // to widen the integer operations only to fail to promote due to some other
-  // unsplittable entry (which we may make splittable later).
-  bool WholeAllocaOp = false;
-  for (; I != E; ++I) {
-    Use *U = I->getUse();
-    if (!U)
-      continue; // Skip dead use.
+  uint64_t Size = DL.getTypeStoreSize(AllocaTy);
 
-    uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
-    uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
+  // While examining uses, we ensure that the alloca has a covering load or
+  // store. We don't want to widen the integer operations only to fail to
+  // promote due to some other unsplittable entry (which we may make splittable
+  // later). However, if there are only splittable uses, go ahead and assume
+  // that we cover the alloca.
+  bool WholeAllocaOp = (I != E) ? false : DL.isLegalInteger(SizeInBits);
 
-    // We can't reasonably handle cases where the load or store extends past
-    // the end of the aloca's type and into its padding.
-    if (RelEnd > Size)
+  for (; I != E; ++I)
+    if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
+                                         S, I, WholeAllocaOp))
       return false;
 
-    if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
-      if (LI->isVolatile())
-        return false;
-      if (RelBegin == 0 && RelEnd == Size)
-        WholeAllocaOp = true;
-      if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
-        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
-          return false;
-        continue;
-      }
-      // Non-integer loads need to be convertible from the alloca type so that
-      // they are promotable.
-      if (RelBegin != 0 || RelEnd != Size ||
-          !canConvertValue(TD, AllocaTy, LI->getType()))
-        return false;
-    } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
-      Type *ValueTy = SI->getValueOperand()->getType();
-      if (SI->isVolatile())
-        return false;
-      if (RelBegin == 0 && RelEnd == Size)
-        WholeAllocaOp = true;
-      if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
-        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
-          return false;
-        continue;
-      }
-      // Non-integer stores need to be convertible to the alloca type so that
-      // they are promotable.
-      if (RelBegin != 0 || RelEnd != Size ||
-          !canConvertValue(TD, ValueTy, AllocaTy))
-        return false;
-    } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
-      if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
-        return false;
-      if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) {
-        const AllocaPartitioning::MemTransferOffsets &MTO
-          = P.getMemTransferOffsets(*MTI);
-        if (!MTO.IsSplittable)
-          return false;
-      }
-    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
-      if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
-          II->getIntrinsicID() != Intrinsic::lifetime_end)
-        return false;
-    } else {
+  for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
+                                                        SUE = SplitUses.end();
+       SUI != SUE; ++SUI)
+    if (!isIntegerWideningViableForSlice(DL, AllocaTy, AllocBeginOffset, Size,
+                                         S, *SUI, WholeAllocaOp))
       return false;
-    }
-  }
+
   return WholeAllocaOp;
 }
 
@@ -2335,19 +1894,19 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
 }
 
 namespace {
-/// \brief Visitor to rewrite instructions using a partition of an alloca to
-/// use a new alloca.
+/// \brief Visitor to rewrite instructions using p particular slice of an alloca
+/// to use a new alloca.
 ///
 /// Also implements the rewriting to vector-based accesses when the partition
 /// passes the isVectorPromotionViable predicate. Most of the rewriting logic
 /// lives here.
-class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
-                                                   bool> {
+class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
   // Befriend the base class so it can delegate to private visit methods.
-  friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
+  friend class llvm::InstVisitor<AllocaSliceRewriter, bool>;
+  typedef llvm::InstVisitor<AllocaSliceRewriter, bool> Base;
 
-  const DataLayout &TD;
-  AllocaPartitioning &P;
+  const DataLayout &DL;
+  AllocaSlices &S;
   SROA &Pass;
   AllocaInst &OldAI, &NewAI;
   const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
@@ -2372,106 +1931,112 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
   // integer type will be stored here for easy access during rewriting.
   IntegerType *IntTy;
 
-  // The offset of the partition user currently being rewritten.
+  // The offset of the slice currently being rewritten.
   uint64_t BeginOffset, EndOffset;
+  bool IsSplittable;
   bool IsSplit;
   Use *OldUse;
   Instruction *OldPtr;
 
+  // Output members carrying state about the result of visiting and rewriting
+  // the slice of the alloca.
+  bool IsUsedByRewrittenSpeculatableInstructions;
+
   // Utility IR builder, whose name prefix is setup for each visited use, and
   // the insertion point is set to point to the user.
   IRBuilderTy IRB;
 
 public:
-  AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P,
-                          AllocaPartitioning::iterator PI,
-                          SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
-                          uint64_t NewBeginOffset, uint64_t NewEndOffset)
-    : TD(TD), P(P), Pass(Pass),
-      OldAI(OldAI), NewAI(NewAI),
-      NewAllocaBeginOffset(NewBeginOffset),
-      NewAllocaEndOffset(NewEndOffset),
-      NewAllocaTy(NewAI.getAllocatedType()),
-      VecTy(), ElementTy(), ElementSize(), IntTy(),
-      BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(),
-      IRB(NewAI.getContext(), ConstantFolder()) {
-  }
-
-  /// \brief Visit the users of the alloca partition and rewrite them.
-  bool visitUsers(AllocaPartitioning::const_use_iterator I,
-                  AllocaPartitioning::const_use_iterator E) {
-    if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
-                                NewAllocaBeginOffset, NewAllocaEndOffset,
-                                I, E)) {
-      ++NumVectorized;
-      VecTy = cast<VectorType>(NewAI.getAllocatedType());
-      ElementTy = VecTy->getElementType();
-      assert((TD.getTypeSizeInBits(VecTy->getScalarType()) % 8) == 0 &&
+  AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &S, SROA &Pass,
+                      AllocaInst &OldAI, AllocaInst &NewAI,
+                      uint64_t NewBeginOffset, uint64_t NewEndOffset,
+                      bool IsVectorPromotable = false,
+                      bool IsIntegerPromotable = false)
+      : DL(DL), S(S), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+        NewAllocaBeginOffset(NewBeginOffset), NewAllocaEndOffset(NewEndOffset),
+        NewAllocaTy(NewAI.getAllocatedType()),
+        VecTy(IsVectorPromotable ? cast<VectorType>(NewAllocaTy) : 0),
+        ElementTy(VecTy ? VecTy->getElementType() : 0),
+        ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
+        IntTy(IsIntegerPromotable
+                  ? Type::getIntNTy(
+                        NewAI.getContext(),
+                        DL.getTypeSizeInBits(NewAI.getAllocatedType()))
+                  : 0),
+        BeginOffset(), EndOffset(), IsSplittable(), IsSplit(), OldUse(),
+        OldPtr(), IsUsedByRewrittenSpeculatableInstructions(false),
+        IRB(NewAI.getContext(), ConstantFolder()) {
+    if (VecTy) {
+      assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
              "Only multiple-of-8 sized vector elements are viable");
-      ElementSize = TD.getTypeSizeInBits(VecTy->getScalarType()) / 8;
-    } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(),
-                                       NewAllocaBeginOffset, P, I, E)) {
-      IntTy = Type::getIntNTy(NewAI.getContext(),
-                              TD.getTypeSizeInBits(NewAI.getAllocatedType()));
+      ++NumVectorized;
     }
+    assert((!IsVectorPromotable && !IsIntegerPromotable) ||
+           IsVectorPromotable != IsIntegerPromotable);
+  }
+
+  bool visit(AllocaSlices::const_iterator I) {
     bool CanSROA = true;
-    for (; I != E; ++I) {
-      if (!I->getUse())
-        continue; // Skip dead uses.
-      BeginOffset = I->BeginOffset;
-      EndOffset = I->EndOffset;
-      IsSplit = I->isSplit();
-      OldUse = I->getUse();
-      OldPtr = cast<Instruction>(OldUse->get());
-
-      Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
-      IRB.SetInsertPoint(OldUserI);
-      IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
-      IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
-                        ".");
-
-      CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
-    }
-    if (VecTy) {
+    BeginOffset = I->beginOffset();
+    EndOffset = I->endOffset();
+    IsSplittable = I->isSplittable();
+    IsSplit =
+        BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+
+    OldUse = I->getUse();
+    OldPtr = cast<Instruction>(OldUse->get());
+
+    Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+    IRB.SetInsertPoint(OldUserI);
+    IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+    IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+
+    CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
+    if (VecTy || IntTy)
       assert(CanSROA);
-      VecTy = 0;
-      ElementTy = 0;
-      ElementSize = 0;
-    }
-    if (IntTy) {
-      assert(CanSROA);
-      IntTy = 0;
-    }
     return CanSROA;
   }
 
+  /// \brief Query whether this slice is used by speculatable instructions after
+  /// rewriting.
+  ///
+  /// These instructions (PHIs and Selects currently) require the alloca slice
+  /// to run back through the rewriter. Thus, they are promotable, but not on
+  /// this iteration. This is distinct from a slice which is unpromotable for
+  /// some other reason, in which case we don't even want to perform the
+  /// speculation. This can be querried at any time and reflects whether (at
+  /// that point) a visit call has rewritten a speculatable instruction on the
+  /// current slice.
+  bool isUsedByRewrittenSpeculatableInstructions() const {
+    return IsUsedByRewrittenSpeculatableInstructions;
+  }
+
 private:
+  // Make sure the other visit overloads are visible.
+  using Base::visit;
+
   // Every instruction which can end up as a user must have a rewrite rule.
   bool visitInstruction(Instruction &I) {
     DEBUG(dbgs() << "    !!!! Cannot rewrite: " << I << "\n");
     llvm_unreachable("No rewrite rule for this instruction!");
   }
 
-  Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) {
-    assert(BeginOffset >= NewAllocaBeginOffset);
-    APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
-    return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy);
+  Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, uint64_t Offset,
+                              Type *PointerTy) {
+    assert(Offset >= NewAllocaBeginOffset);
+    return getAdjustedPtr(IRB, DL, &NewAI, APInt(DL.getPointerSizeInBits(),
+                                                 Offset - NewAllocaBeginOffset),
+                          PointerTy);
   }
 
   /// \brief Compute suitable alignment to access an offset into the new alloca.
   unsigned getOffsetAlign(uint64_t Offset) {
     unsigned NewAIAlign = NewAI.getAlignment();
     if (!NewAIAlign)
-      NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType());
+      NewAIAlign = DL.getABITypeAlignment(NewAI.getAllocatedType());
     return MinAlign(NewAIAlign, Offset);
   }
 
-  /// \brief Compute suitable alignment to access this partition of the new
-  /// alloca.
-  unsigned getPartitionAlign() {
-    return getOffsetAlign(BeginOffset - NewAllocaBeginOffset);
-  }
-
   /// \brief Compute suitable alignment to access a type at an offset of the
   /// new alloca.
   ///
@@ -2479,15 +2044,7 @@ private:
   /// otherwise returns the maximal suitable alignment.
   unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) {
     unsigned Align = getOffsetAlign(Offset);
-    return Align == TD.getABITypeAlignment(Ty) ? 0 : Align;
-  }
-
-  /// \brief Compute suitable alignment to access a type at the beginning of
-  /// this partition of the new alloca.
-  ///
-  /// See \c getOffsetTypeAlign for details; this routine delegates to it.
-  unsigned getPartitionTypeAlign(Type *Ty) {
-    return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
+    return Align == DL.getABITypeAlignment(Ty) ? 0 : Align;
   }
 
   unsigned getIndex(uint64_t Offset) {
@@ -2505,9 +2062,10 @@ private:
       Pass.DeadInsts.insert(I);
   }
 
-  Value *rewriteVectorizedLoadInst() {
-    unsigned BeginIndex = getIndex(BeginOffset);
-    unsigned EndIndex = getIndex(EndOffset);
+  Value *rewriteVectorizedLoadInst(uint64_t NewBeginOffset,
+                                   uint64_t NewEndOffset) {
+    unsigned BeginIndex = getIndex(NewBeginOffset);
+    unsigned EndIndex = getIndex(NewEndOffset);
     assert(EndIndex > BeginIndex && "Empty vector!");
 
     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
@@ -2515,16 +2073,17 @@ private:
     return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
   }
 
-  Value *rewriteIntegerLoad(LoadInst &LI) {
+  Value *rewriteIntegerLoad(LoadInst &LI, uint64_t NewBeginOffset,
+                            uint64_t NewEndOffset) {
     assert(IntTy && "We cannot insert an integer to the alloca");
     assert(!LI.isVolatile());
     Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                      "load");
-    V = convertValue(TD, IRB, V, IntTy);
-    assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
-    uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-    if (Offset > 0 || EndOffset < NewAllocaEndOffset)
-      V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
+    V = convertValue(DL, IRB, V, IntTy);
+    assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+    uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+    if (Offset > 0 || NewEndOffset < NewAllocaEndOffset)
+      V = extractInteger(DL, IRB, V, cast<IntegerType>(LI.getType()), Offset,
                          "extract");
     return V;
   }
@@ -2534,37 +2093,44 @@ private:
     Value *OldOp = LI.getOperand(0);
     assert(OldOp == OldPtr);
 
-    uint64_t Size = EndOffset - BeginOffset;
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+    uint64_t Size = NewEndOffset - NewBeginOffset;
 
     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), Size * 8)
                              : LI.getType();
     bool IsPtrAdjusted = false;
     Value *V;
     if (VecTy) {
-      V = rewriteVectorizedLoadInst();
+      V = rewriteVectorizedLoadInst(NewBeginOffset, NewEndOffset);
     } else if (IntTy && LI.getType()->isIntegerTy()) {
-      V = rewriteIntegerLoad(LI);
-    } else if (BeginOffset == NewAllocaBeginOffset &&
-               canConvertValue(TD, NewAllocaTy, LI.getType())) {
+      V = rewriteIntegerLoad(LI, NewBeginOffset, NewEndOffset);
+    } else if (NewBeginOffset == NewAllocaBeginOffset &&
+               canConvertValue(DL, NewAllocaTy, LI.getType())) {
       V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                 LI.isVolatile(), "load");
     } else {
       Type *LTy = TargetTy->getPointerTo();
-      V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
-                                getPartitionTypeAlign(TargetTy),
-                                LI.isVolatile(), "load");
+      V = IRB.CreateAlignedLoad(
+          getAdjustedAllocaPtr(IRB, NewBeginOffset, LTy),
+          getOffsetTypeAlign(TargetTy, NewBeginOffset - NewAllocaBeginOffset),
+          LI.isVolatile(), "load");
       IsPtrAdjusted = true;
     }
-    V = convertValue(TD, IRB, V, TargetTy);
+    V = convertValue(DL, IRB, V, TargetTy);
 
     if (IsSplit) {
       assert(!LI.isVolatile());
       assert(LI.getType()->isIntegerTy() &&
              "Only integer type loads and stores are split");
-      assert(Size < TD.getTypeStoreSize(LI.getType()) &&
+      assert(Size < DL.getTypeStoreSize(LI.getType()) &&
              "Split load isn't smaller than original load");
       assert(LI.getType()->getIntegerBitWidth() ==
-             TD.getTypeStoreSizeInBits(LI.getType()) &&
+             DL.getTypeStoreSizeInBits(LI.getType()) &&
              "Non-byte-multiple bit width");
       // Move the insertion point just past the load so that we can refer to it.
       IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
@@ -2574,7 +2140,7 @@ private:
       // LI only used for this computation.
       Value *Placeholder
         = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
-      V = insertInteger(TD, IRB, Placeholder, V, BeginOffset,
+      V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset,
                         "insert");
       LI.replaceAllUsesWith(V);
       Placeholder->replaceAllUsesWith(&LI);
@@ -2589,24 +2155,26 @@ private:
     return !LI.isVolatile() && !IsPtrAdjusted;
   }
 
-  bool rewriteVectorizedStoreInst(Value *V,
-                                  StoreInst &SI, Value *OldOp) {
-    unsigned BeginIndex = getIndex(BeginOffset);
-    unsigned EndIndex = getIndex(EndOffset);
-    assert(EndIndex > BeginIndex && "Empty vector!");
-    unsigned NumElements = EndIndex - BeginIndex;
-    assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
-    Type *PartitionTy
-      = (NumElements == 1) ? ElementTy
-                           : VectorType::get(ElementTy, NumElements);
-    if (V->getType() != PartitionTy)
-      V = convertValue(TD, IRB, V, PartitionTy);
-
-    // Mix in the existing elements.
-    Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
-                                       "load");
-    V = insertVector(IRB, Old, V, BeginIndex, "vec");
+  bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
+                                  uint64_t NewBeginOffset,
+                                  uint64_t NewEndOffset) {
+    if (V->getType() != VecTy) {
+      unsigned BeginIndex = getIndex(NewBeginOffset);
+      unsigned EndIndex = getIndex(NewEndOffset);
+      assert(EndIndex > BeginIndex && "Empty vector!");
+      unsigned NumElements = EndIndex - BeginIndex;
+      assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+      Type *SliceTy =
+          (NumElements == 1) ? ElementTy
+                             : VectorType::get(ElementTy, NumElements);
+      if (V->getType() != SliceTy)
+        V = convertValue(DL, IRB, V, SliceTy);
 
+      // Mix in the existing elements.
+      Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+                                         "load");
+      V = insertVector(IRB, Old, V, BeginIndex, "vec");
+    }
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
     Pass.DeadInsts.insert(&SI);
 
@@ -2615,19 +2183,20 @@ private:
     return true;
   }
 
-  bool rewriteIntegerStore(Value *V, StoreInst &SI) {
+  bool rewriteIntegerStore(Value *V, StoreInst &SI,
+                           uint64_t NewBeginOffset, uint64_t NewEndOffset) {
     assert(IntTy && "We cannot extract an integer from the alloca");
     assert(!SI.isVolatile());
-    if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+    if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                          "oldload");
-      Old = convertValue(TD, IRB, Old, IntTy);
+      Old = convertValue(DL, IRB, Old, IntTy);
       assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
       uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-      V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
+      V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset,
                         "insert");
     }
-    V = convertValue(TD, IRB, V, NewAllocaTy);
+    V = convertValue(DL, IRB, V, NewAllocaTy);
     StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
     Pass.DeadInsts.insert(&SI);
     (void)Store;
@@ -2648,37 +2217,45 @@ private:
       if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
         Pass.PostPromotionWorklist.insert(AI);
 
-    uint64_t Size = EndOffset - BeginOffset;
-    if (Size < TD.getTypeStoreSize(V->getType())) {
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+    uint64_t Size = NewEndOffset - NewBeginOffset;
+    if (Size < DL.getTypeStoreSize(V->getType())) {
       assert(!SI.isVolatile());
-      assert(IsSplit && "A seemingly split store isn't splittable");
       assert(V->getType()->isIntegerTy() &&
              "Only integer type loads and stores are split");
       assert(V->getType()->getIntegerBitWidth() ==
-             TD.getTypeStoreSizeInBits(V->getType()) &&
+             DL.getTypeStoreSizeInBits(V->getType()) &&
              "Non-byte-multiple bit width");
       IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
-      V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
+      V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset,
                          "extract");
     }
 
     if (VecTy)
-      return rewriteVectorizedStoreInst(V, SI, OldOp);
+      return rewriteVectorizedStoreInst(V, SI, OldOp, NewBeginOffset,
+                                        NewEndOffset);
     if (IntTy && V->getType()->isIntegerTy())
-      return rewriteIntegerStore(V, SI);
+      return rewriteIntegerStore(V, SI, NewBeginOffset, NewEndOffset);
 
     StoreInst *NewSI;
-    if (BeginOffset == NewAllocaBeginOffset &&
-        EndOffset == NewAllocaEndOffset &&
-        canConvertValue(TD, V->getType(), NewAllocaTy)) {
-      V = convertValue(TD, IRB, V, NewAllocaTy);
+    if (NewBeginOffset == NewAllocaBeginOffset &&
+        NewEndOffset == NewAllocaEndOffset &&
+        canConvertValue(DL, V->getType(), NewAllocaTy)) {
+      V = convertValue(DL, IRB, V, NewAllocaTy);
       NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                      SI.isVolatile());
     } else {
-      Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo());
-      NewSI = IRB.CreateAlignedStore(V, NewPtr,
-                                     getPartitionTypeAlign(V->getType()),
-                                     SI.isVolatile());
+      Value *NewPtr = getAdjustedAllocaPtr(IRB, NewBeginOffset,
+                                           V->getType()->getPointerTo());
+      NewSI = IRB.CreateAlignedStore(
+          V, NewPtr, getOffsetTypeAlign(
+                         V->getType(), NewBeginOffset - NewAllocaBeginOffset),
+          SI.isVolatile());
     }
     (void)NewSI;
     Pass.DeadInsts.insert(&SI);
@@ -2729,9 +2306,12 @@ private:
     // If the memset has a variable size, it cannot be split, just adjust the
     // pointer to the new alloca.
     if (!isa<Constant>(II.getLength())) {
-      II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+      assert(!IsSplit);
+      assert(BeginOffset >= NewAllocaBeginOffset);
+      II.setDest(
+          getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType()));
       Type *CstTy = II.getAlignmentCst()->getType();
-      II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign()));
+      II.setAlignment(ConstantInt::get(CstTy, getOffsetAlign(BeginOffset)));
 
       deleteIfTriviallyDead(OldPtr);
       return false;
@@ -2743,21 +2323,26 @@ private:
     Type *AllocaTy = NewAI.getAllocatedType();
     Type *ScalarTy = AllocaTy->getScalarType();
 
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+    uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset;
+
     // If this doesn't map cleanly onto the alloca type, and that type isn't
     // a single value type, just emit a memset.
     if (!VecTy && !IntTy &&
-        (BeginOffset != NewAllocaBeginOffset ||
-         EndOffset != NewAllocaEndOffset ||
+        (BeginOffset > NewAllocaBeginOffset ||
+         EndOffset < NewAllocaEndOffset ||
          !AllocaTy->isSingleValueType() ||
-         !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) ||
-         TD.getTypeSizeInBits(ScalarTy)%8 != 0)) {
+         !DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy)) ||
+         DL.getTypeSizeInBits(ScalarTy)%8 != 0)) {
       Type *SizeTy = II.getLength()->getType();
-      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
-      CallInst *New
-        = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
-                                                II.getRawDest()->getType()),
-                           II.getValue(), Size, getPartitionAlign(),
-                           II.isVolatile());
+      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+      CallInst *New = IRB.CreateMemSet(
+          getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getRawDest()->getType()),
+          II.getValue(), Size, getOffsetAlign(SliceOffset), II.isVolatile());
       (void)New;
       DEBUG(dbgs() << "          to: " << *New << "\n");
       return false;
@@ -2774,15 +2359,15 @@ private:
       // If this is a memset of a vectorized alloca, insert it.
       assert(ElementTy == ScalarTy);
 
-      unsigned BeginIndex = getIndex(BeginOffset);
-      unsigned EndIndex = getIndex(EndOffset);
+      unsigned BeginIndex = getIndex(NewBeginOffset);
+      unsigned EndIndex = getIndex(NewEndOffset);
       assert(EndIndex > BeginIndex && "Empty vector!");
       unsigned NumElements = EndIndex - BeginIndex;
       assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
 
       Value *Splat =
-          getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8);
-      Splat = convertValue(TD, IRB, Splat, ElementTy);
+          getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
+      Splat = convertValue(DL, IRB, Splat, ElementTy);
       if (NumElements > 1)
         Splat = getVectorSplat(Splat, NumElements);
 
@@ -2794,32 +2379,31 @@ private:
       // set integer.
       assert(!II.isVolatile());
 
-      uint64_t Size = EndOffset - BeginOffset;
+      uint64_t Size = NewEndOffset - NewBeginOffset;
       V = getIntegerSplat(II.getValue(), Size);
 
       if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
                     EndOffset != NewAllocaBeginOffset)) {
         Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                            "oldload");
-        Old = convertValue(TD, IRB, Old, IntTy);
-        assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
-        uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-        V = insertInteger(TD, IRB, Old, V, Offset, "insert");
+        Old = convertValue(DL, IRB, Old, IntTy);
+        uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+        V = insertInteger(DL, IRB, Old, V, Offset, "insert");
       } else {
         assert(V->getType() == IntTy &&
                "Wrong type for an alloca wide integer!");
       }
-      V = convertValue(TD, IRB, V, AllocaTy);
+      V = convertValue(DL, IRB, V, AllocaTy);
     } else {
       // Established these invariants above.
-      assert(BeginOffset == NewAllocaBeginOffset);
-      assert(EndOffset == NewAllocaEndOffset);
+      assert(NewBeginOffset == NewAllocaBeginOffset);
+      assert(NewEndOffset == NewAllocaEndOffset);
 
-      V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8);
+      V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
       if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
         V = getVectorSplat(V, AllocaVecTy->getNumElements());
 
-      V = convertValue(TD, IRB, V, AllocaTy);
+      V = convertValue(DL, IRB, V, AllocaTy);
     }
 
     Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
@@ -2835,21 +2419,25 @@ private:
 
     DEBUG(dbgs() << "    original: " << II << "\n");
 
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
     assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
     bool IsDest = II.getRawDest() == OldPtr;
 
-    const AllocaPartitioning::MemTransferOffsets &MTO
-      = P.getMemTransferOffsets(II);
-
     // Compute the relative offset within the transfer.
-    unsigned IntPtrWidth = TD.getPointerSizeInBits();
-    APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
-                                                       : MTO.SourceBegin));
+    unsigned IntPtrWidth = DL.getPointerSizeInBits();
+    APInt RelOffset(IntPtrWidth, NewBeginOffset - BeginOffset);
 
     unsigned Align = II.getAlignment();
+    uint64_t SliceOffset = NewBeginOffset - NewAllocaBeginOffset;
     if (Align > 1)
-      Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
-                       MinAlign(II.getAlignment(), getPartitionAlign()));
+      Align =
+          MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
+                   MinAlign(II.getAlignment(), getOffsetAlign(SliceOffset)));
 
     // For unsplit intrinsics, we simply modify the source and destination
     // pointers in place. This isn't just an optimization, it is a matter of
@@ -2858,12 +2446,14 @@ private:
     // a variable length. We may also be dealing with memmove instead of
     // memcpy, and so simply updating the pointers is the necessary for us to
     // update both source and dest of a single call.
-    if (!MTO.IsSplittable) {
+    if (!IsSplittable) {
       Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
       if (IsDest)
-        II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+        II.setDest(
+            getAdjustedAllocaPtr(IRB, BeginOffset, II.getRawDest()->getType()));
       else
-        II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
+        II.setSource(getAdjustedAllocaPtr(IRB, BeginOffset,
+                                          II.getRawSource()->getType()));
 
       Type *CstTy = II.getAlignmentCst()->getType();
       II.setAlignment(ConstantInt::get(CstTy, Align));
@@ -2881,24 +2471,21 @@ private:
     // If this doesn't map cleanly onto the alloca type, and that type isn't
     // a single value type, just emit a memcpy.
     bool EmitMemCpy
-      = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset ||
-                             EndOffset != NewAllocaEndOffset ||
+      = !VecTy && !IntTy && (BeginOffset > NewAllocaBeginOffset ||
+                             EndOffset < NewAllocaEndOffset ||
                              !NewAI.getAllocatedType()->isSingleValueType());
 
     // If we're just going to emit a memcpy, the alloca hasn't changed, and the
     // size hasn't been shrunk based on analysis of the viable range, this is
     // a no-op.
     if (EmitMemCpy && &OldAI == &NewAI) {
-      uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin;
-      uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd;
       // Ensure the start lines up.
-      assert(BeginOffset == OrigBegin);
-      (void)OrigBegin;
+      assert(NewBeginOffset == BeginOffset);
 
       // Rewrite the size as needed.
-      if (EndOffset != OrigEnd)
+      if (NewEndOffset != EndOffset)
         II.setLength(ConstantInt::get(II.getLength()->getType(),
-                                      EndOffset - BeginOffset));
+                                      NewEndOffset - NewBeginOffset));
       return false;
     }
     // Record this instruction for deletion.
@@ -2917,13 +2504,13 @@ private:
 
       // Compute the other pointer, folding as much as possible to produce
       // a single, simple GEP in most cases.
-      OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
+      OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy);
 
-      Value *OurPtr
-        = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
-                                           : II.getRawSource()->getType());
+      Value *OurPtr = getAdjustedAllocaPtr(
+          IRB, NewBeginOffset,
+          IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType());
       Type *SizeTy = II.getLength()->getType();
-      Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
+      Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
 
       CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
                                        IsDest ? OtherPtr : OurPtr,
@@ -2939,11 +2526,11 @@ private:
     if (!Align)
       Align = 1;
 
-    bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset &&
-                         EndOffset == NewAllocaEndOffset;
-    uint64_t Size = EndOffset - BeginOffset;
-    unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0;
-    unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0;
+    bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
+                         NewEndOffset == NewAllocaEndOffset;
+    uint64_t Size = NewEndOffset - NewBeginOffset;
+    unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
+    unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
     unsigned NumElements = EndIndex - BeginIndex;
     IntegerType *SubIntTy
       = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
@@ -2960,7 +2547,7 @@ private:
       OtherPtrTy = SubIntTy->getPointerTo();
     }
 
-    Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy);
+    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, RelOffset, OtherPtrTy);
     Value *DstPtr = &NewAI;
     if (!IsDest)
       std::swap(SrcPtr, DstPtr);
@@ -2973,10 +2560,9 @@ private:
     } else if (IntTy && !IsWholeAlloca && !IsDest) {
       Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                   "load");
-      Src = convertValue(TD, IRB, Src, IntTy);
-      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
-      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-      Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract");
+      Src = convertValue(DL, IRB, Src, IntTy);
+      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+      Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
     } else {
       Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
                                   "copyload");
@@ -2989,11 +2575,10 @@ private:
     } else if (IntTy && !IsWholeAlloca && IsDest) {
       Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
                                          "oldload");
-      Old = convertValue(TD, IRB, Old, IntTy);
-      assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
-      uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
-      Src = insertInteger(TD, IRB, Old, Src, Offset, "insert");
-      Src = convertValue(TD, IRB, Src, NewAllocaTy);
+      Old = convertValue(DL, IRB, Old, IntTy);
+      uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+      Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
+      Src = convertValue(DL, IRB, Src, NewAllocaTy);
     }
 
     StoreInst *Store = cast<StoreInst>(
@@ -3009,13 +2594,20 @@ private:
     DEBUG(dbgs() << "    original: " << II << "\n");
     assert(II.getArgOperand(1) == OldPtr);
 
+    // Compute the intersecting offset range.
+    assert(BeginOffset < NewAllocaEndOffset);
+    assert(EndOffset > NewAllocaBeginOffset);
+    uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+    uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
     // Record this instruction for deletion.
     Pass.DeadInsts.insert(&II);
 
     ConstantInt *Size
       = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
-                         EndOffset - BeginOffset);
-    Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
+                         NewEndOffset - NewBeginOffset);
+    Value *Ptr =
+        getAdjustedAllocaPtr(IRB, NewBeginOffset, II.getArgOperand(1)->getType());
     Value *New;
     if (II.getIntrinsicID() == Intrinsic::lifetime_start)
       New = IRB.CreateLifetimeStart(Ptr, Size);
@@ -3029,30 +2621,45 @@ private:
 
   bool visitPHINode(PHINode &PN) {
     DEBUG(dbgs() << "    original: " << PN << "\n");
+    assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
+    assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
 
     // We would like to compute a new pointer in only one place, but have it be
     // as local as possible to the PHI. To do that, we re-use the location of
     // the old pointer, which necessarily must be in the right position to
     // dominate the PHI.
-    IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr));
+    IRBuilderTy PtrBuilder(OldPtr);
     PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) +
                              ".");
 
-    Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
+    Value *NewPtr =
+        getAdjustedAllocaPtr(PtrBuilder, BeginOffset, OldPtr->getType());
     // Replace the operands which were using the old pointer.
     std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
 
     DEBUG(dbgs() << "          to: " << PN << "\n");
     deleteIfTriviallyDead(OldPtr);
-    return false;
+
+    // Check whether we can speculate this PHI node, and if so remember that
+    // fact and queue it up for another iteration after the speculation
+    // occurs.
+    if (isSafePHIToSpeculate(PN, &DL)) {
+      Pass.SpeculatablePHIs.insert(&PN);
+      IsUsedByRewrittenSpeculatableInstructions = true;
+      return true;
+    }
+
+    return false; // PHIs can't be promoted on their own.
   }
 
   bool visitSelectInst(SelectInst &SI) {
     DEBUG(dbgs() << "    original: " << SI << "\n");
     assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
            "Pointer isn't an operand!");
+    assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
+    assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
 
-    Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
+    Value *NewPtr = getAdjustedAllocaPtr(IRB, BeginOffset, OldPtr->getType());
     // Replace the operands which were using the old pointer.
     if (SI.getOperand(1) == OldPtr)
       SI.setOperand(1, NewPtr);
@@ -3061,7 +2668,17 @@ private:
 
     DEBUG(dbgs() << "          to: " << SI << "\n");
     deleteIfTriviallyDead(OldPtr);
-    return false;
+
+    // Check whether we can speculate this select instruction, and if so
+    // remember that fact and queue it up for another iteration after the
+    // speculation occurs.
+    if (isSafeSelectToSpeculate(SI, &DL)) {
+      Pass.SpeculatableSelects.insert(&SI);
+      IsUsedByRewrittenSpeculatableInstructions = true;
+      return true;
+    }
+
+    return false; // Selects can't be promoted on their own.
   }
 
 };
@@ -3077,7 +2694,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   // Befriend the base class so it can delegate to private visit methods.
   friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
 
-  const DataLayout &TD;
+  const DataLayout &DL;
 
   /// Queue of pointer uses to analyze and potentially rewrite.
   SmallVector<Use *, 8> Queue;
@@ -3090,7 +2707,7 @@ class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
   Use *U;
 
 public:
-  AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {}
+  AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
 
   /// Rewrite loads and stores through a pointer and all pointers derived from
   /// it.
@@ -3319,12 +2936,12 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
 /// when the size or offset cause either end of type-based partition to be off.
 /// Also, this is a best-effort routine. It is reasonable to give up and not
 /// return a type if necessary.
-static Type *getTypePartition(const DataLayout &TD, Type *Ty,
+static Type *getTypePartition(const DataLayout &DL, Type *Ty,
                               uint64_t Offset, uint64_t Size) {
-  if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
-    return stripAggregateTypeWrapping(TD, Ty);
-  if (Offset > TD.getTypeAllocSize(Ty) ||
-      (TD.getTypeAllocSize(Ty) - Offset) < Size)
+  if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
+    return stripAggregateTypeWrapping(DL, Ty);
+  if (Offset > DL.getTypeAllocSize(Ty) ||
+      (DL.getTypeAllocSize(Ty) - Offset) < Size)
     return 0;
 
   if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
@@ -3333,7 +2950,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
       return 0;
 
     Type *ElementTy = SeqTy->getElementType();
-    uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+    uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
     uint64_t NumSkippedElements = Offset / ElementSize;
     if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) {
       if (NumSkippedElements >= ArrTy->getNumElements())
@@ -3350,12 +2967,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
       if ((Offset + Size) > ElementSize)
         return 0;
       // Recurse through the element type trying to peel off offset bytes.
-      return getTypePartition(TD, ElementTy, Offset, Size);
+      return getTypePartition(DL, ElementTy, Offset, Size);
     }
     assert(Offset == 0);
 
     if (Size == ElementSize)
-      return stripAggregateTypeWrapping(TD, ElementTy);
+      return stripAggregateTypeWrapping(DL, ElementTy);
     assert(Size > ElementSize);
     uint64_t NumElements = Size / ElementSize;
     if (NumElements * ElementSize != Size)
@@ -3367,7 +2984,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
   if (!STy)
     return 0;
 
-  const StructLayout *SL = TD.getStructLayout(STy);
+  const StructLayout *SL = DL.getStructLayout(STy);
   if (Offset >= SL->getSizeInBytes())
     return 0;
   uint64_t EndOffset = Offset + Size;
@@ -3378,7 +2995,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
   Offset -= SL->getElementOffset(Index);
 
   Type *ElementTy = STy->getElementType(Index);
-  uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+  uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
   if (Offset >= ElementSize)
     return 0; // The offset points into alignment padding.
 
@@ -3386,12 +3003,12 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
   if (Offset > 0 || Size < ElementSize) {
     if ((Offset + Size) > ElementSize)
       return 0;
-    return getTypePartition(TD, ElementTy, Offset, Size);
+    return getTypePartition(DL, ElementTy, Offset, Size);
   }
   assert(Offset == 0);
 
   if (Size == ElementSize)
-    return stripAggregateTypeWrapping(TD, ElementTy);
+    return stripAggregateTypeWrapping(DL, ElementTy);
 
   StructType::element_iterator EI = STy->element_begin() + Index,
                                EE = STy->element_end();
@@ -3414,7 +3031,7 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
   // Try to build up a sub-structure.
   StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
                                       STy->isPacked());
-  const StructLayout *SubSL = TD.getStructLayout(SubTy);
+  const StructLayout *SubSL = DL.getStructLayout(SubTy);
   if (Size != SubSL->getSizeInBytes())
     return 0; // The sub-struct doesn't have quite the size needed.
 
@@ -3431,113 +3048,280 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty,
 /// appropriate new offsets. It also evaluates how successful the rewrite was
 /// at enabling promotion and if it was successful queues the alloca to be
 /// promoted.
-bool SROA::rewriteAllocaPartition(AllocaInst &AI,
-                                  AllocaPartitioning &P,
-                                  AllocaPartitioning::iterator PI) {
-  uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
-  bool IsLive = false;
-  for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
-                                        UE = P.use_end(PI);
-       UI != UE && !IsLive; ++UI)
-    if (UI->getUse())
-      IsLive = true;
-  if (!IsLive)
-    return false; // No live uses left of this partition.
-
-  DEBUG(dbgs() << "Speculating PHIs and selects in partition "
-               << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");
-
-  PHIOrSelectSpeculator Speculator(*TD, P, *this);
-  DEBUG(dbgs() << "  speculating ");
-  DEBUG(P.print(dbgs(), PI, ""));
-  Speculator.visitUsers(PI);
+bool SROA::rewritePartition(AllocaInst &AI, AllocaSlices &S,
+                            AllocaSlices::iterator B, AllocaSlices::iterator E,
+                            int64_t BeginOffset, int64_t EndOffset,
+                            ArrayRef<AllocaSlices::iterator> SplitUses) {
+  assert(BeginOffset < EndOffset);
+  uint64_t SliceSize = EndOffset - BeginOffset;
 
   // Try to compute a friendly type for this partition of the alloca. This
   // won't always succeed, in which case we fall back to a legal integer type
   // or an i8 array of an appropriate size.
-  Type *AllocaTy = 0;
-  if (Type *PartitionTy = P.getCommonType(PI))
-    if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
-      AllocaTy = PartitionTy;
-  if (!AllocaTy)
-    if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
-                                             PI->BeginOffset, AllocaSize))
-      AllocaTy = PartitionTy;
-  if ((!AllocaTy ||
-       (AllocaTy->isArrayTy() &&
-        AllocaTy->getArrayElementType()->isIntegerTy())) &&
-      TD->isLegalInteger(AllocaSize * 8))
-    AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
-  if (!AllocaTy)
-    AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
-  assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
+  Type *SliceTy = 0;
+  if (Type *CommonUseTy = findCommonType(B, E, EndOffset))
+    if (DL->getTypeAllocSize(CommonUseTy) >= SliceSize)
+      SliceTy = CommonUseTy;
+  if (!SliceTy)
+    if (Type *TypePartitionTy = getTypePartition(*DL, AI.getAllocatedType(),
+                                                 BeginOffset, SliceSize))
+      SliceTy = TypePartitionTy;
+  if ((!SliceTy || (SliceTy->isArrayTy() &&
+                    SliceTy->getArrayElementType()->isIntegerTy())) &&
+      DL->isLegalInteger(SliceSize * 8))
+    SliceTy = Type::getIntNTy(*C, SliceSize * 8);
+  if (!SliceTy)
+    SliceTy = ArrayType::get(Type::getInt8Ty(*C), SliceSize);
+  assert(DL->getTypeAllocSize(SliceTy) >= SliceSize);
+
+  bool IsVectorPromotable = isVectorPromotionViable(
+      *DL, SliceTy, S, BeginOffset, EndOffset, B, E, SplitUses);
+
+  bool IsIntegerPromotable =
+      !IsVectorPromotable &&
+      isIntegerWideningViable(*DL, SliceTy, BeginOffset, S, B, E, SplitUses);
 
   // Check for the case where we're going to rewrite to a new alloca of the
   // exact same type as the original, and with the same access offsets. In that
   // case, re-use the existing alloca, but still run through the rewriter to
   // perform phi and select speculation.
   AllocaInst *NewAI;
-  if (AllocaTy == AI.getAllocatedType()) {
-    assert(PI->BeginOffset == 0 &&
+  if (SliceTy == AI.getAllocatedType()) {
+    assert(BeginOffset == 0 &&
            "Non-zero begin offset but same alloca type");
-    assert(PI == P.begin() && "Begin offset is zero on later partition");
     NewAI = &AI;
+    // FIXME: We should be able to bail at this point with "nothing changed".
+    // FIXME: We might want to defer PHI speculation until after here.
   } else {
     unsigned Alignment = AI.getAlignment();
     if (!Alignment) {
       // The minimum alignment which users can rely on when the explicit
       // alignment is omitted or zero is that required by the ABI for this
       // type.
-      Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
+      Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
     }
-    Alignment = MinAlign(Alignment, PI->BeginOffset);
+    Alignment = MinAlign(Alignment, BeginOffset);
     // If we will get at least this much alignment from the type alone, leave
     // the alloca's alignment unconstrained.
-    if (Alignment <= TD->getABITypeAlignment(AllocaTy))
+    if (Alignment <= DL->getABITypeAlignment(SliceTy))
       Alignment = 0;
-    NewAI = new AllocaInst(AllocaTy, 0, Alignment,
-                           AI.getName() + ".sroa." + Twine(PI - P.begin()),
-                           &AI);
+    NewAI = new AllocaInst(SliceTy, 0, Alignment,
+                           AI.getName() + ".sroa." + Twine(B - S.begin()), &AI);
     ++NumNewAllocas;
   }
 
   DEBUG(dbgs() << "Rewriting alloca partition "
-               << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
-               << *NewAI << "\n");
+               << "[" << BeginOffset << "," << EndOffset << ") to: " << *NewAI
+               << "\n");
 
-  // Track the high watermark of the post-promotion worklist. We will reset it
-  // to this point if the alloca is not in fact scheduled for promotion.
+  // Track the high watermark on several worklists that are only relevant for
+  // promoted allocas. We will reset it to this point if the alloca is not in
+  // fact scheduled for promotion.
   unsigned PPWOldSize = PostPromotionWorklist.size();
+  unsigned SPOldSize = SpeculatablePHIs.size();
+  unsigned SSOldSize = SpeculatableSelects.size();
+  unsigned NumUses = 0;
+
+  AllocaSliceRewriter Rewriter(*DL, S, *this, AI, *NewAI, BeginOffset,
+                               EndOffset, IsVectorPromotable,
+                               IsIntegerPromotable);
+  bool Promotable = true;
+  for (ArrayRef<AllocaSlices::iterator>::const_iterator SUI = SplitUses.begin(),
+                                                        SUE = SplitUses.end();
+       SUI != SUE; ++SUI) {
+    DEBUG(dbgs() << "  rewriting split ");
+    DEBUG(S.printSlice(dbgs(), *SUI, ""));
+    Promotable &= Rewriter.visit(*SUI);
+    ++NumUses;
+  }
+  for (AllocaSlices::iterator I = B; I != E; ++I) {
+    DEBUG(dbgs() << "  rewriting ");
+    DEBUG(S.printSlice(dbgs(), I, ""));
+    Promotable &= Rewriter.visit(I);
+    ++NumUses;
+  }
+
+  NumAllocaPartitionUses += NumUses;
+  MaxUsesPerAllocaPartition =
+      std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition);
 
-  AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
-                                   PI->BeginOffset, PI->EndOffset);
-  DEBUG(dbgs() << "  rewriting ");
-  DEBUG(P.print(dbgs(), PI, ""));
-  bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
-  if (Promotable) {
+  if (Promotable && !Rewriter.isUsedByRewrittenSpeculatableInstructions()) {
     DEBUG(dbgs() << "  and queuing for promotion\n");
     PromotableAllocas.push_back(NewAI);
-  } else if (NewAI != &AI) {
+  } else if (NewAI != &AI ||
+             (Promotable &&
+              Rewriter.isUsedByRewrittenSpeculatableInstructions())) {
     // If we can't promote the alloca, iterate on it to check for new
     // refinements exposed by splitting the current alloca. Don't iterate on an
     // alloca which didn't actually change and didn't get promoted.
+    //
+    // Alternatively, if we could promote the alloca but have speculatable
+    // instructions then we will speculate them after finishing our processing
+    // of the original alloca. Mark the new one for re-visiting in the next
+    // iteration so the speculated operations can be rewritten.
+    //
+    // FIXME: We should actually track whether the rewriter changed anything.
     Worklist.insert(NewAI);
   }
 
   // Drop any post-promotion work items if promotion didn't happen.
-  if (!Promotable)
+  if (!Promotable) {
     while (PostPromotionWorklist.size() > PPWOldSize)
       PostPromotionWorklist.pop_back();
+    while (SpeculatablePHIs.size() > SPOldSize)
+      SpeculatablePHIs.pop_back();
+    while (SpeculatableSelects.size() > SSOldSize)
+      SpeculatableSelects.pop_back();
+  }
 
   return true;
 }
 
-/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
-bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
+namespace {
+struct IsSliceEndLessOrEqualTo {
+  uint64_t UpperBound;
+
+  IsSliceEndLessOrEqualTo(uint64_t UpperBound) : UpperBound(UpperBound) {}
+
+  bool operator()(const AllocaSlices::iterator &I) {
+    return I->endOffset() <= UpperBound;
+  }
+};
+}
+
+static void
+removeFinishedSplitUses(SmallVectorImpl<AllocaSlices::iterator> &SplitUses,
+                        uint64_t &MaxSplitUseEndOffset, uint64_t Offset) {
+  if (Offset >= MaxSplitUseEndOffset) {
+    SplitUses.clear();
+    MaxSplitUseEndOffset = 0;
+    return;
+  }
+
+  size_t SplitUsesOldSize = SplitUses.size();
+  SplitUses.erase(std::remove_if(SplitUses.begin(), SplitUses.end(),
+                                 IsSliceEndLessOrEqualTo(Offset)),
+                  SplitUses.end());
+  if (SplitUsesOldSize == SplitUses.size())
+    return;
+
+  // Recompute the max. While this is linear, so is remove_if.
+  MaxSplitUseEndOffset = 0;
+  for (SmallVectorImpl<AllocaSlices::iterator>::iterator
+           SUI = SplitUses.begin(),
+           SUE = SplitUses.end();
+       SUI != SUE; ++SUI)
+    MaxSplitUseEndOffset = std::max((*SUI)->endOffset(), MaxSplitUseEndOffset);
+}
+
+/// \brief Walks the slices of an alloca and form partitions based on them,
+/// rewriting each of their uses.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &S) {
+  if (S.begin() == S.end())
+    return false;
+
+  unsigned NumPartitions = 0;
   bool Changed = false;
-  for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
-       ++PI)
-    Changed |= rewriteAllocaPartition(AI, P, PI);
+  SmallVector<AllocaSlices::iterator, 4> SplitUses;
+  uint64_t MaxSplitUseEndOffset = 0;
+
+  uint64_t BeginOffset = S.begin()->beginOffset();
+
+  for (AllocaSlices::iterator SI = S.begin(), SJ = llvm::next(SI), SE = S.end();
+       SI != SE; SI = SJ) {
+    uint64_t MaxEndOffset = SI->endOffset();
+
+    if (!SI->isSplittable()) {
+      // When we're forming an unsplittable region, it must always start at the
+      // first slice and will extend through its end.
+      assert(BeginOffset == SI->beginOffset());
+
+      // Form a partition including all of the overlapping slices with this
+      // unsplittable slice.
+      while (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
+        if (!SJ->isSplittable())
+          MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
+        ++SJ;
+      }
+    } else {
+      assert(SI->isSplittable()); // Established above.
+
+      // Collect all of the overlapping splittable slices.
+      while (SJ != SE && SJ->beginOffset() < MaxEndOffset &&
+             SJ->isSplittable()) {
+        MaxEndOffset = std::max(MaxEndOffset, SJ->endOffset());
+        ++SJ;
+      }
+
+      // Back up MaxEndOffset and SJ if we ended the span early when
+      // encountering an unsplittable slice.
+      if (SJ != SE && SJ->beginOffset() < MaxEndOffset) {
+        assert(!SJ->isSplittable());
+        MaxEndOffset = SJ->beginOffset();
+      }
+    }
+
+    // Check if we have managed to move the end offset forward yet. If so,
+    // we'll have to rewrite uses and erase old split uses.
+    if (BeginOffset < MaxEndOffset) {
+      // Rewrite a sequence of overlapping slices.
+      Changed |=
+          rewritePartition(AI, S, SI, SJ, BeginOffset, MaxEndOffset, SplitUses);
+      ++NumPartitions;
+
+      removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset, MaxEndOffset);
+    }
+
+    // Accumulate all the splittable slices from the [SI,SJ) region which
+    // overlap going forward.
+    for (AllocaSlices::iterator SK = SI; SK != SJ; ++SK)
+      if (SK->isSplittable() && SK->endOffset() > MaxEndOffset) {
+        SplitUses.push_back(SK);
+        MaxSplitUseEndOffset = std::max(SK->endOffset(), MaxSplitUseEndOffset);
+      }
+
+    // If we're already at the end and we have no split uses, we're done.
+    if (SJ == SE && SplitUses.empty())
+      break;
+
+    // If we have no split uses or no gap in offsets, we're ready to move to
+    // the next slice.
+    if (SplitUses.empty() || (SJ != SE && MaxEndOffset == SJ->beginOffset())) {
+      BeginOffset = SJ->beginOffset();
+      continue;
+    }
+
+    // Even if we have split slices, if the next slice is splittable and the
+    // split slices reach it, we can simply set up the beginning offset of the
+    // next iteration to bridge between them.
+    if (SJ != SE && SJ->isSplittable() &&
+        MaxSplitUseEndOffset > SJ->beginOffset()) {
+      BeginOffset = MaxEndOffset;
+      continue;
+    }
+
+    // Otherwise, we have a tail of split slices. Rewrite them with an empty
+    // range of slices.
+    uint64_t PostSplitEndOffset =
+        SJ == SE ? MaxSplitUseEndOffset : SJ->beginOffset();
+
+    Changed |= rewritePartition(AI, S, SJ, SJ, MaxEndOffset, PostSplitEndOffset,
+                                SplitUses);
+    ++NumPartitions;
+
+    if (SJ == SE)
+      break; // Skip the rest, we don't need to do any cleanup.
+
+    removeFinishedSplitUses(SplitUses, MaxSplitUseEndOffset,
+                            PostSplitEndOffset);
+
+    // Now just reset the begin offset for the next iteration.
+    BeginOffset = SJ->beginOffset();
+  }
+
+  NumAllocaPartitions += NumPartitions;
+  MaxPartitionsPerAlloca =
+      std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca);
 
   return Changed;
 }
@@ -3545,7 +3329,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
 /// \brief Analyze an alloca for SROA.
 ///
 /// This analyzes the alloca to ensure we can reason about it, builds
-/// a partitioning of the alloca, and then hands it off to be split and
+/// the slices of the alloca, and then hands it off to be split and
 /// rewritten as needed.
 bool SROA::runOnAlloca(AllocaInst &AI) {
   DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
@@ -3559,32 +3343,32 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
 
   // Skip alloca forms that this analysis can't handle.
   if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
-      TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
+      DL->getTypeAllocSize(AI.getAllocatedType()) == 0)
     return false;
 
   bool Changed = false;
 
   // First, split any FCA loads and stores touching this alloca to promote
   // better splitting and promotion opportunities.
-  AggLoadStoreRewriter AggRewriter(*TD);
+  AggLoadStoreRewriter AggRewriter(*DL);
   Changed |= AggRewriter.rewrite(AI);
 
-  // Build the partition set using a recursive instruction-visiting builder.
-  AllocaPartitioning P(*TD, AI);
-  DEBUG(P.print(dbgs()));
-  if (P.isEscaped())
+  // Build the slices using a recursive instruction-visiting builder.
+  AllocaSlices S(*DL, AI);
+  DEBUG(S.print(dbgs()));
+  if (S.isEscaped())
     return Changed;
 
   // Delete all the dead users of this alloca before splitting and rewriting it.
-  for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
-                                              DE = P.dead_user_end();
+  for (AllocaSlices::dead_user_iterator DI = S.dead_user_begin(),
+                                        DE = S.dead_user_end();
        DI != DE; ++DI) {
     Changed = true;
     (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
     DeadInsts.insert(*DI);
   }
-  for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
-                                            DE = P.dead_op_end();
+  for (AllocaSlices::dead_op_iterator DO = S.dead_op_begin(),
+                                      DE = S.dead_op_end();
        DO != DE; ++DO) {
     Value *OldV = **DO;
     // Clobber the use with an undef value.
@@ -3596,11 +3380,21 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
       }
   }
 
-  // No partitions to split. Leave the dead alloca for a later pass to clean up.
-  if (P.begin() == P.end())
+  // No slices to split. Leave the dead alloca for a later pass to clean up.
+  if (S.begin() == S.end())
     return Changed;
 
-  return splitAlloca(AI, P) || Changed;
+  Changed |= splitAlloca(AI, S);
+
+  DEBUG(dbgs() << "  Speculating PHIs\n");
+  while (!SpeculatablePHIs.empty())
+    speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+
+  DEBUG(dbgs() << "  Speculating Selects\n");
+  while (!SpeculatableSelects.empty())
+    speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+
+  return Changed;
 }
 
 /// \brief Delete the dead instructions accumulated in this run.
@@ -3635,6 +3429,15 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
   }
 }
 
+static void enqueueUsersInWorklist(Instruction &I,
+                                   SmallVectorImpl<Instruction *> &Worklist,
+                                   SmallPtrSet<Instruction *, 8> &Visited) {
+  for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+       ++UI)
+    if (Visited.insert(cast<Instruction>(*UI)))
+      Worklist.push_back(cast<Instruction>(*UI));
+}
+
 /// \brief Promote the allocas, using the best available technique.
 ///
 /// This attempts to promote whatever allocas have been identified as viable in
@@ -3659,25 +3462,28 @@ bool SROA::promoteAllocas(Function &F) {
   DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
   SSAUpdater SSA;
   DIBuilder DIB(*F.getParent());
-  SmallVector<Instruction*, 64> Insts;
+  SmallVector<Instruction *, 64> Insts;
+
+  // We need a worklist to walk the uses of each alloca.
+  SmallVector<Instruction *, 8> Worklist;
+  SmallPtrSet<Instruction *, 8> Visited;
+  SmallVector<Instruction *, 32> DeadInsts;
 
   for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
     AllocaInst *AI = PromotableAllocas[Idx];
-    for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
-         UI != UE;) {
-      Instruction *I = cast<Instruction>(*UI++);
+    Insts.clear();
+    Worklist.clear();
+    Visited.clear();
+
+    enqueueUsersInWorklist(*AI, Worklist, Visited);
+
+    while (!Worklist.empty()) {
+      Instruction *I = Worklist.pop_back_val();
+
       // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
       // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
       // leading to them) here. Eventually it should use them to optimize the
       // scalar values produced.
-      if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
-        assert(onlyUsedByLifetimeMarkers(I) &&
-               "Found a bitcast used outside of a lifetime marker.");
-        while (!I->use_empty())
-          cast<Instruction>(*I->use_begin())->eraseFromParent();
-        I->eraseFromParent();
-        continue;
-      }
       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
         assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
                II->getIntrinsicID() == Intrinsic::lifetime_end);
@@ -3685,10 +3491,30 @@ bool SROA::promoteAllocas(Function &F) {
         continue;
       }
 
-      Insts.push_back(I);
+      // Push the loads and stores we find onto the list. SROA will already
+      // have validated that all loads and stores are viable candidates for
+      // promotion.
+      if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        assert(LI->getType() == AI->getAllocatedType());
+        Insts.push_back(LI);
+        continue;
+      }
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        assert(SI->getValueOperand()->getType() == AI->getAllocatedType());
+        Insts.push_back(SI);
+        continue;
+      }
+
+      // For everything else, we know that only no-op bitcasts and GEPs will
+      // make it this far, just recurse through them and recall them for later
+      // removal.
+      DeadInsts.push_back(I);
+      enqueueUsersInWorklist(*I, Worklist, Visited);
     }
     AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
-    Insts.clear();
+    while (!DeadInsts.empty())
+      DeadInsts.pop_back_val()->eraseFromParent();
+    AI->eraseFromParent();
   }
 
   PromotableAllocas.clear();
@@ -3712,8 +3538,8 @@ namespace {
 bool SROA::runOnFunction(Function &F) {
   DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
   C = &F.getContext();
-  TD = getAnalysisIfAvailable<DataLayout>();
-  if (!TD) {
+  DL = getAnalysisIfAvailable<DataLayout>();
+  if (!DL) {
     DEBUG(dbgs() << "  Skipping SROA -- no target data!\n");
     return false;
   }
diff --git a/lib/Transforms/Scalar/SampleProfile.cpp b/lib/Transforms/Scalar/SampleProfile.cpp
new file mode 100644
index 0000000..9bcd702
--- /dev/null
+++ b/lib/Transforms/Scalar/SampleProfile.cpp
@@ -0,0 +1,479 @@
+//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
+//
+//                      The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SampleProfileLoader transformation. This pass
+// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
+// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
+// profile information in the given profile.
+//
+// This pass generates branch weight annotations on the IR:
+//
+// - prof: Represents branch weights. This annotation is added to branches
+//      to indicate the weights of each edge coming out of the branch.
+//      The weight of each edge is the weight of the target block for
+//      that edge. The weight of a block B is computed as the maximum
+//      number of samples found in B.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sample-profile"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+// Command line option to specify the file to read samples from. This is
+// mainly used for debugging.
+static cl::opt<std::string> SampleProfileFile(
+    "sample-profile-file", cl::init(""), cl::value_desc("filename"),
+    cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+
+namespace {
+/// \brief Sample-based profile reader.
+///
+/// Each profile contains sample counts for all the functions
+/// executed. Inside each function, statements are annotated with the
+/// collected samples on all the instructions associated with that
+/// statement.
+///
+/// For this to produce meaningful data, the program needs to be
+/// compiled with some debug information (at minimum, line numbers:
+/// -gline-tables-only). Otherwise, it will be impossible to match IR
+/// instructions to the line numbers collected by the profiler.
+///
+/// From the profile file, we are interested in collecting the
+/// following information:
+///
+/// * A list of functions included in the profile (mangled names).
+///
+/// * For each function F:
+///   1. The total number of samples collected in F.
+///
+///   2. The samples collected at each line in F. To provide some
+///      protection against source code shuffling, line numbers should
+///      be relative to the start of the function.
+class SampleProfile {
+public:
+  SampleProfile(StringRef F) : Profiles(0), Filename(F) {}
+
+  void dump();
+  void loadText();
+  void loadNative() { llvm_unreachable("not implemented"); }
+  bool emitAnnotations(Function &F);
+  void printFunctionProfile(raw_ostream &OS, StringRef FName);
+  void dumpFunctionProfile(StringRef FName);
+
+protected:
+  typedef DenseMap<uint32_t, uint32_t> BodySampleMap;
+  typedef DenseMap<BasicBlock *, uint32_t> BlockWeightMap;
+
+  /// \brief Representation of the runtime profile for a function.
+  ///
+  /// This data structure contains the runtime profile for a given
+  /// function. It contains the total number of samples collected
+  /// in the function and a map of samples collected in every statement.
+  struct FunctionProfile {
+    /// \brief Total number of samples collected inside this function.
+    ///
+    /// Samples are cumulative, they include all the samples collected
+    /// inside this function and all its inlined callees.
+    unsigned TotalSamples;
+
+    // \brief Total number of samples collected at the head of the function.
+    unsigned TotalHeadSamples;
+
+    /// \brief Map line offsets to collected samples.
+    ///
+    /// Each entry in this map contains the number of samples
+    /// collected at the corresponding line offset. All line locations
+    /// are an offset from the start of the function.
+    BodySampleMap BodySamples;
+
+    /// \brief Map basic blocks to their computed weights.
+    ///
+    /// The weight of a basic block is defined to be the maximum
+    /// of all the instruction weights in that block.
+    BlockWeightMap BlockWeights;
+  };
+
+  uint32_t getInstWeight(Instruction &I, unsigned FirstLineno,
+                         BodySampleMap &BodySamples);
+  uint32_t computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
+                              BodySampleMap &BodySamples);
+
+  /// \brief Map every function to its associated profile.
+  ///
+  /// The profile of every function executed at runtime is collected
+  /// in the structure FunctionProfile. This maps function objects
+  /// to their corresponding profiles.
+  StringMap<FunctionProfile> Profiles;
+
+  /// \brief Path name to the file holding the profile data.
+  ///
+  /// The format of this file is defined by each profiler
+  /// independently. If possible, the profiler should have a text
+  /// version of the profile format to be used in constructing test
+  /// cases and debugging.
+  StringRef Filename;
+};
+
+/// \brief Loader class for text-based profiles.
+///
+/// This class defines a simple interface to read text files containing
+/// profiles. It keeps track of line number information and location of
+/// the file pointer. Users of this class are responsible for actually
+/// parsing the lines returned by the readLine function.
+///
+/// TODO - This does not really belong here. It is a generic text file
+/// reader. It should be moved to the Support library and made more general.
+class ExternalProfileTextLoader {
+public:
+  ExternalProfileTextLoader(StringRef F) : Filename(F) {
+    error_code EC;
+    EC = MemoryBuffer::getFile(Filename, Buffer);
+    if (EC)
+      report_fatal_error("Could not open profile file " + Filename + ": " +
+                         EC.message());
+    FP = Buffer->getBufferStart();
+    Lineno = 0;
+  }
+
+  /// \brief Read a line from the mapped file.
+  StringRef readLine() {
+    size_t Length = 0;
+    const char *start = FP;
+    while (FP != Buffer->getBufferEnd() && *FP != '\n') {
+      Length++;
+      FP++;
+    }
+    if (FP != Buffer->getBufferEnd())
+      FP++;
+    Lineno++;
+    return StringRef(start, Length);
+  }
+
+  /// \brief Return true, if we've reached EOF.
+  bool atEOF() const { return FP == Buffer->getBufferEnd(); }
+
+  /// \brief Report a parse error message and stop compilation.
+  void reportParseError(Twine Msg) const {
+    report_fatal_error(Filename + ":" + Twine(Lineno) + ": " + Msg + "\n");
+  }
+
+private:
+  /// \brief Memory buffer holding the text file.
+  OwningPtr<MemoryBuffer> Buffer;
+
+  /// \brief Current position into the memory buffer.
+  const char *FP;
+
+  /// \brief Current line number.
+  int64_t Lineno;
+
+  /// \brief Path name where to the profile file.
+  StringRef Filename;
+};
+
+/// \brief Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader : public FunctionPass {
+public:
+  // Class identification, replacement for typeinfo
+  static char ID;
+
+  SampleProfileLoader(StringRef Name = SampleProfileFile)
+      : FunctionPass(ID), Profiler(0), Filename(Name) {
+    initializeSampleProfileLoaderPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool doInitialization(Module &M);
+
+  void dump() { Profiler->dump(); }
+
+  virtual const char *getPassName() const { return "Sample profile pass"; }
+
+  virtual bool runOnFunction(Function &F);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesCFG();
+  }
+
+protected:
+  /// \brief Profile reader object.
+  OwningPtr<SampleProfile> Profiler;
+
+  /// \brief Name of the profile file to load.
+  StringRef Filename;
+};
+}
+
+/// \brief Print the function profile for \p FName on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param FName Name of the function to print.
+void SampleProfile::printFunctionProfile(raw_ostream &OS, StringRef FName) {
+  FunctionProfile FProfile = Profiles[FName];
+  OS << "Function: " << FName << ", " << FProfile.TotalSamples << ", "
+     << FProfile.TotalHeadSamples << ", " << FProfile.BodySamples.size()
+     << " sampled lines\n";
+  for (BodySampleMap::const_iterator SI = FProfile.BodySamples.begin(),
+                                     SE = FProfile.BodySamples.end();
+       SI != SE; ++SI)
+    OS << "\tline offset: " << SI->first
+       << ", number of samples: " << SI->second << "\n";
+  OS << "\n";
+}
+
+/// \brief Dump the function profile for \p FName.
+///
+/// \param FName Name of the function to print.
+void SampleProfile::dumpFunctionProfile(StringRef FName) {
+  printFunctionProfile(dbgs(), FName);
+}
+
+/// \brief Dump all the function profiles found.
+void SampleProfile::dump() {
+  for (StringMap<FunctionProfile>::const_iterator I = Profiles.begin(),
+                                                  E = Profiles.end();
+       I != E; ++I)
+    dumpFunctionProfile(I->getKey());
+}
+
+/// \brief Load samples from a text file.
+///
+/// The file is divided in two segments:
+///
+/// Symbol table (represented with the string "symbol table")
+///    Number of symbols in the table
+///    symbol 1
+///    symbol 2
+///    ...
+///    symbol N
+///
+/// Function body profiles
+///    function1:total_samples:total_head_samples:number_of_locations
+///    location_offset_1: number_of_samples
+///    location_offset_2: number_of_samples
+///    ...
+///    location_offset_N: number_of_samples
+///
+/// Function names must be mangled in order for the profile loader to
+/// match them in the current translation unit.
+///
+/// Since this is a flat profile, a function that shows up more than
+/// once gets all its samples aggregated across all its instances.
+/// TODO - flat profiles are too imprecise to provide good optimization
+/// opportunities. Convert them to context-sensitive profile.
+///
+/// This textual representation is useful to generate unit tests and
+/// for debugging purposes, but it should not be used to generate
+/// profiles for large programs, as the representation is extremely
+/// inefficient.
+void SampleProfile::loadText() {
+  ExternalProfileTextLoader Loader(Filename);
+
+  // Read the symbol table.
+  StringRef Line = Loader.readLine();
+  if (Line != "symbol table")
+    Loader.reportParseError("Expected 'symbol table', found " + Line);
+  int NumSymbols;
+  Line = Loader.readLine();
+  if (Line.getAsInteger(10, NumSymbols))
+    Loader.reportParseError("Expected a number, found " + Line);
+  for (int I = 0; I < NumSymbols; I++) {
+    StringRef FName = Loader.readLine();
+    FunctionProfile &FProfile = Profiles[FName];
+    FProfile.BodySamples.clear();
+    FProfile.TotalSamples = 0;
+    FProfile.TotalHeadSamples = 0;
+  }
+
+  // Read the profile of each function. Since each function may be
+  // mentioned more than once, and we are collecting flat profiles,
+  // accumulate samples as we parse them.
+  Regex HeadRE("^([^:]+):([0-9]+):([0-9]+):([0-9]+)$");
+  Regex LineSample("^([0-9]+): ([0-9]+)$");
+  while (!Loader.atEOF()) {
+    SmallVector<StringRef, 4> Matches;
+    Line = Loader.readLine();
+    if (!HeadRE.match(Line, &Matches))
+      Loader.reportParseError("Expected 'mangled_name:NUM:NUM:NUM', found " +
+                              Line);
+    assert(Matches.size() == 5);
+    StringRef FName = Matches[1];
+    unsigned NumSamples, NumHeadSamples, NumSampledLines;
+    Matches[2].getAsInteger(10, NumSamples);
+    Matches[3].getAsInteger(10, NumHeadSamples);
+    Matches[4].getAsInteger(10, NumSampledLines);
+    FunctionProfile &FProfile = Profiles[FName];
+    FProfile.TotalSamples += NumSamples;
+    FProfile.TotalHeadSamples += NumHeadSamples;
+    BodySampleMap &SampleMap = FProfile.BodySamples;
+    unsigned I;
+    for (I = 0; I < NumSampledLines && !Loader.atEOF(); I++) {
+      Line = Loader.readLine();
+      if (!LineSample.match(Line, &Matches))
+        Loader.reportParseError("Expected 'NUM: NUM', found " + Line);
+      assert(Matches.size() == 3);
+      unsigned LineOffset, NumSamples;
+      Matches[1].getAsInteger(10, LineOffset);
+      Matches[2].getAsInteger(10, NumSamples);
+      SampleMap[LineOffset] += NumSamples;
+    }
+
+    if (I < NumSampledLines)
+      Loader.reportParseError("Unexpected end of file");
+  }
+}
+
+/// \brief Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// function. We use \p FirstLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using \p BodySamples.
+///
+/// \param Inst Instruction to query.
+/// \param FirstLineno Line number of the first instruction in the function.
+/// \param BodySamples Map of relative source line locations to samples.
+///
+/// \returns The profiled weight of I.
+uint32_t SampleProfile::getInstWeight(Instruction &Inst, unsigned FirstLineno,
+                                      BodySampleMap &BodySamples) {
+  unsigned LOffset = Inst.getDebugLoc().getLine() - FirstLineno + 1;
+  return BodySamples.lookup(LOffset);
+}
+
+/// \brief Compute the weight of a basic block.
+///
+/// The weight of basic block \p B is the maximum weight of all the
+/// instructions in B.
+///
+/// \param B The basic block to query.
+/// \param FirstLineno The line number for the first line in the
+///     function holding B.
+/// \param BodySamples The map containing all the samples collected in that
+///     function.
+///
+/// \returns The computed weight of B.
+uint32_t SampleProfile::computeBlockWeight(BasicBlock *B, unsigned FirstLineno,
+                                           BodySampleMap &BodySamples) {
+  // If we've computed B's weight before, return it.
+  Function *F = B->getParent();
+  FunctionProfile &FProfile = Profiles[F->getName()];
+  std::pair<BlockWeightMap::iterator, bool> Entry =
+      FProfile.BlockWeights.insert(std::make_pair(B, 0));
+  if (!Entry.second)
+    return Entry.first->second;
+
+  // Otherwise, compute and cache B's weight.
+  uint32_t Weight = 0;
+  for (BasicBlock::iterator I = B->begin(), E = B->end(); I != E; ++I) {
+    uint32_t InstWeight = getInstWeight(*I, FirstLineno, BodySamples);
+    if (InstWeight > Weight)
+      Weight = InstWeight;
+  }
+  Entry.first->second = Weight;
+  return Weight;
+}
+
+/// \brief Generate branch weight metadata for all branches in \p F.
+///
+/// For every branch instruction B in \p F, we compute the weight of the
+/// target block for each of the edges out of B. This is the weight
+/// that we associate with that branch.
+///
+/// TODO - This weight assignment will most likely be wrong if the
+/// target branch has more than two predecessors. This needs to be done
+/// using some form of flow propagation.
+///
+/// Once all the branch weights are computed, we emit the MD_prof
+/// metadata on B using the computed values.
+///
+/// \param F The function to query.
+bool SampleProfile::emitAnnotations(Function &F) {
+  bool Changed = false;
+  FunctionProfile &FProfile = Profiles[F.getName()];
+  unsigned FirstLineno = inst_begin(F)->getDebugLoc().getLine();
+  MDBuilder MDB(F.getContext());
+
+  // Clear the block weights cache.
+  FProfile.BlockWeights.clear();
+
+  // When we find a branch instruction: For each edge E out of the branch,
+  // the weight of E is the weight of the target block.
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    BasicBlock *B = I;
+    TerminatorInst *TI = B->getTerminator();
+    if (TI->getNumSuccessors() == 1)
+      continue;
+    if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+      continue;
+
+    SmallVector<uint32_t, 4> Weights;
+    unsigned NSuccs = TI->getNumSuccessors();
+    for (unsigned I = 0; I < NSuccs; ++I) {
+      BasicBlock *Succ = TI->getSuccessor(I);
+      uint32_t Weight =
+          computeBlockWeight(Succ, FirstLineno, FProfile.BodySamples);
+      Weights.push_back(Weight);
+    }
+
+    TI->setMetadata(llvm::LLVMContext::MD_prof,
+                    MDB.createBranchWeights(Weights));
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+char SampleProfileLoader::ID = 0;
+INITIALIZE_PASS(SampleProfileLoader, "sample-profile", "Sample Profile loader",
+                false, false)
+
+bool SampleProfileLoader::runOnFunction(Function &F) {
+  return Profiler->emitAnnotations(F);
+}
+
+bool SampleProfileLoader::doInitialization(Module &M) {
+  Profiler.reset(new SampleProfile(Filename));
+  Profiler->loadText();
+  return true;
+}
+
+FunctionPass *llvm::createSampleProfileLoaderPass() {
+  return new SampleProfileLoader(SampleProfileFile);
+}
+
+FunctionPass *llvm::createSampleProfileLoaderPass(StringRef Name) {
+  return new SampleProfileLoader(Name);
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 8a9c7da..857597e 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
 /// ScalarOpts library.
 void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeADCEPass(Registry);
-  initializeBlockPlacementPass(Registry);
+  initializeSampleProfileLoaderPass(Registry);
   initializeCodeGenPreparePass(Registry);
   initializeConstantPropagationPass(Registry);
   initializeCorrelatedValuePropagationPass(Registry);
@@ -44,12 +44,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLoopInstSimplifyPass(Registry);
   initializeLoopRotatePass(Registry);
   initializeLoopStrengthReducePass(Registry);
+  initializeLoopRerollPass(Registry);
   initializeLoopUnrollPass(Registry);
   initializeLoopUnswitchPass(Registry);
   initializeLoopIdiomRecognizePass(Registry);
   initializeLowerAtomicPass(Registry);
   initializeLowerExpectIntrinsicPass(Registry);
   initializeMemCpyOptPass(Registry);
+  initializePartiallyInlineLibCallsPass(Registry);
   initializeReassociatePass(Registry);
   initializeRegToMemPass(Registry);
   initializeSCCPPass(Registry);
@@ -58,7 +60,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeSROA_DTPass(Registry);
   initializeSROA_SSAUpPass(Registry);
   initializeCFGSimplifyPassPass(Registry);
-  initializeSimplifyLibCallsPass(Registry);
+  initializeStructurizeCFGPass(Registry);
   initializeSinkingPass(Registry);
   initializeTailCallElimPass(Registry);
 }
@@ -111,6 +113,10 @@ void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopRotatePass());
 }
 
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopRerollPass());
+}
+
 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopUnrollPass());
 }
@@ -123,6 +129,10 @@ void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createMemCpyOptPass());
 }
 
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
 void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createPromoteMemoryToRegisterPass());
 }
@@ -149,7 +159,7 @@ void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
 }
 
 void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
-  unwrap(PM)->add(createSimplifyLibCallsPass());
+  // NOTE: The simplify-libcalls pass has been removed.
 }
 
 void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index bfde334..57b290e 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -166,21 +166,21 @@ namespace {
     void DeleteDeadInstructions();
 
     void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
-                              SmallVector<AllocaInst*, 32> &NewElts);
+                              SmallVectorImpl<AllocaInst *> &NewElts);
     void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
-                        SmallVector<AllocaInst*, 32> &NewElts);
+                        SmallVectorImpl<AllocaInst *> &NewElts);
     void RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
-                    SmallVector<AllocaInst*, 32> &NewElts);
+                    SmallVectorImpl<AllocaInst *> &NewElts);
     void RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
                                   uint64_t Offset,
-                                  SmallVector<AllocaInst*, 32> &NewElts);
+                                  SmallVectorImpl<AllocaInst *> &NewElts);
     void RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
                                       AllocaInst *AI,
-                                      SmallVector<AllocaInst*, 32> &NewElts);
+                                      SmallVectorImpl<AllocaInst *> &NewElts);
     void RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
-                                       SmallVector<AllocaInst*, 32> &NewElts);
+                                       SmallVectorImpl<AllocaInst *> &NewElts);
     void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
-                                      SmallVector<AllocaInst*, 32> &NewElts);
+                                      SmallVectorImpl<AllocaInst *> &NewElts);
     bool ShouldAttemptScalarRepl(AllocaInst *AI);
   };
 
@@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
   if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy())
     SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth));
   else if (SV->getType()->isPointerTy())
-    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext()));
+    SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType()));
 
   // Zero extend or truncate the value if needed.
   if (SV->getType() != AllocaType) {
@@ -1066,12 +1066,12 @@ public:
 
     LoadAndStorePromoter::run(Insts);
     AI->eraseFromParent();
-    for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(),
+    for (SmallVectorImpl<DbgDeclareInst *>::iterator I = DDIs.begin(),
            E = DDIs.end(); I != E; ++I) {
       DbgDeclareInst *DDI = *I;
       DDI->eraseFromParent();
     }
-    for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(),
+    for (SmallVectorImpl<DbgValueInst *>::iterator I = DVIs.begin(),
            E = DVIs.end(); I != E; ++I) {
       DbgValueInst *DVI = *I;
       DVI->eraseFromParent();
@@ -1086,7 +1086,7 @@ public:
   }
 
   virtual void updateDebugInfo(Instruction *Inst) const {
-    for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
+    for (SmallVectorImpl<DbgDeclareInst *>::const_iterator I = DDIs.begin(),
            E = DDIs.end(); I != E; ++I) {
       DbgDeclareInst *DDI = *I;
       if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
@@ -1094,7 +1094,7 @@ public:
       else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
         ConvertDebugDeclareToDebugValue(DDI, LI, *DIB);
     }
-    for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
+    for (SmallVectorImpl<DbgValueInst *>::const_iterator I = DVIs.begin(),
            E = DVIs.end(); I != E; ++I) {
       DbgValueInst *DVI = *I;
       Value *Arg = NULL;
@@ -1865,7 +1865,7 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) {
 /// Offset indicates the position within AI that is referenced by this
 /// instruction.
 void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
-                                SmallVector<AllocaInst*, 32> &NewElts) {
+                                SmallVectorImpl<AllocaInst *> &NewElts) {
   for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) {
     Use &TheUse = UI.getUse();
     Instruction *User = cast<Instruction>(*UI++);
@@ -1979,7 +1979,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset,
 /// RewriteBitCast - Update a bitcast reference to the alloca being replaced
 /// and recursively continue updating all of its uses.
 void SROA::RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset,
-                          SmallVector<AllocaInst*, 32> &NewElts) {
+                          SmallVectorImpl<AllocaInst *> &NewElts) {
   RewriteForScalarRepl(BC, AI, Offset, NewElts);
   if (BC->getOperand(0) != AI)
     return;
@@ -2037,7 +2037,7 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset,
 /// elements of the alloca that are being split apart, and if so, rewrite
 /// the GEP to be relative to the new element.
 void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
-                      SmallVector<AllocaInst*, 32> &NewElts) {
+                      SmallVectorImpl<AllocaInst *> &NewElts) {
   uint64_t OldOffset = Offset;
   SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end());
   // If the GEP was dynamic then it must have been a dynamic vector lookup.
@@ -2099,7 +2099,7 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset,
 /// to mark the lifetime of the scalarized memory.
 void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
                                     uint64_t Offset,
-                                    SmallVector<AllocaInst*, 32> &NewElts) {
+                                    SmallVectorImpl<AllocaInst *> &NewElts) {
   ConstantInt *OldSize = cast<ConstantInt>(II->getArgOperand(0));
   // Put matching lifetime markers on everything from Offset up to
   // Offset+OldSize.
@@ -2153,9 +2153,10 @@ void SROA::RewriteLifetimeIntrinsic(IntrinsicInst *II, AllocaInst *AI,
 
 /// RewriteMemIntrinUserOfAlloca - MI is a memcpy/memset/memmove from or to AI.
 /// Rewrite it to copy or set the elements of the scalarized memory.
-void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
-                                        AllocaInst *AI,
-                                        SmallVector<AllocaInst*, 32> &NewElts) {
+void
+SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
+                                   AllocaInst *AI,
+                                   SmallVectorImpl<AllocaInst *> &NewElts) {
   // If this is a memcpy/memmove, construct the other pointer as the
   // appropriate type.  The "Other" pointer is the pointer that goes to memory
   // that doesn't have anything to do with the alloca that we are promoting. For
@@ -2189,7 +2190,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
     if (OtherPtr == AI || OtherPtr == NewElts[0]) {
       // This code will run twice for a no-op memcpy -- once for each operand.
       // Put only one reference to MI on the DeadInsts list.
-      for (SmallVector<Value*, 32>::const_iterator I = DeadInsts.begin(),
+      for (SmallVectorImpl<Value *>::const_iterator I = DeadInsts.begin(),
              E = DeadInsts.end(); I != E; ++I)
         if (*I == MI) return;
       DeadInsts.push_back(MI);
@@ -2326,8 +2327,9 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst,
 /// RewriteStoreUserOfWholeAlloca - We found a store of an integer that
 /// overwrites the entire allocation.  Extract out the pieces of the stored
 /// integer and store them individually.
-void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
-                                         SmallVector<AllocaInst*, 32> &NewElts){
+void
+SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
+                                    SmallVectorImpl<AllocaInst *> &NewElts) {
   // Extract each element out of the integer according to its structure offset
   // and store the element value to the individual alloca.
   Value *SrcVal = SI->getOperand(0);
@@ -2440,8 +2442,9 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI,
 
 /// RewriteLoadUserOfWholeAlloca - We found a load of the entire allocation to
 /// an integer.  Load the individual pieces to form the aggregate value.
-void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
-                                        SmallVector<AllocaInst*, 32> &NewElts) {
+void
+SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
+                                   SmallVectorImpl<AllocaInst *> &NewElts) {
   // Extract each element out of the NewElts according to its structure offset
   // and form the result value.
   Type *AllocaEltTy = AI->getAllocatedType();
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index c243d34..8371f6d 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -41,187 +41,31 @@ using namespace llvm;
 STATISTIC(NumSimpl, "Number of blocks simplified");
 
 namespace {
-  struct CFGSimplifyPass : public FunctionPass {
-    static char ID; // Pass identification, replacement for typeid
-    CFGSimplifyPass() : FunctionPass(ID) {
-      initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
-    }
-
-    virtual bool runOnFunction(Function &F);
+struct CFGSimplifyPass : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  CFGSimplifyPass() : FunctionPass(ID) {
+    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+  virtual bool runOnFunction(Function &F);
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<TargetTransformInfo>();
-    }
-  };
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<TargetTransformInfo>();
+  }
+};
 }
 
 char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG",
-                      false, false)
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                      false)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
-INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG",
-                    false, false)
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+                    false)
 
 // Public interface to the CFGSimplification pass
 FunctionPass *llvm::createCFGSimplificationPass() {
   return new CFGSimplifyPass();
 }
 
-/// changeToUnreachable - Insert an unreachable instruction before the specified
-/// instruction, making it and the rest of the code in the block dead.
-static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
-  BasicBlock *BB = I->getParent();
-  // Loop over all of the successors, removing BB's entry from any PHI
-  // nodes.
-  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-    (*SI)->removePredecessor(BB);
-
-  // Insert a call to llvm.trap right before this.  This turns the undefined
-  // behavior into a hard fail instead of falling through into random code.
-  if (UseLLVMTrap) {
-    Function *TrapFn =
-      Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
-    CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
-    CallTrap->setDebugLoc(I->getDebugLoc());
-  }
-  new UnreachableInst(I->getContext(), I);
-
-  // All instructions after this are dead.
-  BasicBlock::iterator BBI = I, BBE = BB->end();
-  while (BBI != BBE) {
-    if (!BBI->use_empty())
-      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
-    BB->getInstList().erase(BBI++);
-  }
-}
-
-/// changeToCall - Convert the specified invoke into a normal call.
-static void changeToCall(InvokeInst *II) {
-  SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
-  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
-  NewCall->takeName(II);
-  NewCall->setCallingConv(II->getCallingConv());
-  NewCall->setAttributes(II->getAttributes());
-  NewCall->setDebugLoc(II->getDebugLoc());
-  II->replaceAllUsesWith(NewCall);
-
-  // Follow the call by a branch to the normal destination.
-  BranchInst::Create(II->getNormalDest(), II);
-
-  // Update PHI nodes in the unwind destination
-  II->getUnwindDest()->removePredecessor(II->getParent());
-  II->eraseFromParent();
-}
-
-static bool markAliveBlocks(BasicBlock *BB,
-                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
-
-  SmallVector<BasicBlock*, 128> Worklist;
-  Worklist.push_back(BB);
-  Reachable.insert(BB);
-  bool Changed = false;
-  do {
-    BB = Worklist.pop_back_val();
-
-    // Do a quick scan of the basic block, turning any obviously unreachable
-    // instructions into LLVM unreachable insts.  The instruction combining pass
-    // canonicalizes unreachable insts into stores to null or undef.
-    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
-      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
-        if (CI->doesNotReturn()) {
-          // If we found a call to a no-return function, insert an unreachable
-          // instruction after it.  Make sure there isn't *already* one there
-          // though.
-          ++BBI;
-          if (!isa<UnreachableInst>(BBI)) {
-            // Don't insert a call to llvm.trap right before the unreachable.
-            changeToUnreachable(BBI, false);
-            Changed = true;
-          }
-          break;
-        }
-      }
-
-      // Store to undef and store to null are undefined and used to signal that
-      // they should be changed to unreachable by passes that can't modify the
-      // CFG.
-      if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
-        // Don't touch volatile stores.
-        if (SI->isVolatile()) continue;
-
-        Value *Ptr = SI->getOperand(1);
-
-        if (isa<UndefValue>(Ptr) ||
-            (isa<ConstantPointerNull>(Ptr) &&
-             SI->getPointerAddressSpace() == 0)) {
-          changeToUnreachable(SI, true);
-          Changed = true;
-          break;
-        }
-      }
-    }
-
-    // Turn invokes that call 'nounwind' functions into ordinary calls.
-    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
-      Value *Callee = II->getCalledValue();
-      if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
-        changeToUnreachable(II, true);
-        Changed = true;
-      } else if (II->doesNotThrow()) {
-        if (II->use_empty() && II->onlyReadsMemory()) {
-          // jump to the normal destination branch.
-          BranchInst::Create(II->getNormalDest(), II);
-          II->getUnwindDest()->removePredecessor(II->getParent());
-          II->eraseFromParent();
-        } else
-          changeToCall(II);
-        Changed = true;
-      }
-    }
-
-    Changed |= ConstantFoldTerminator(BB, true);
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.insert(*SI))
-        Worklist.push_back(*SI);
-  } while (!Worklist.empty());
-  return Changed;
-}
-
-/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
-/// if they are in a dead cycle.  Return true if a change was made, false
-/// otherwise.
-static bool removeUnreachableBlocksFromFn(Function &F) {
-  SmallPtrSet<BasicBlock*, 128> Reachable;
-  bool Changed = markAliveBlocks(F.begin(), Reachable);
-
-  // If there are unreachable blocks in the CFG...
-  if (Reachable.size() == F.size())
-    return Changed;
-
-  assert(Reachable.size() < F.size());
-  NumSimpl += F.size()-Reachable.size();
-
-  // Loop over all of the basic blocks that are not reachable, dropping all of
-  // their internal references...
-  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
-    if (Reachable.count(BB))
-      continue;
-
-    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
-      if (Reachable.count(*SI))
-        (*SI)->removePredecessor(BB);
-    BB->dropAllReferences();
-  }
-
-  for (Function::iterator I = ++F.begin(); I != F.end();)
-    if (!Reachable.count(I))
-      I = F.getBasicBlockList().erase(I);
-    else
-      ++I;
-
-  return true;
-}
-
 /// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
 /// node) return blocks, merge them together to promote recursive block merging.
 static bool mergeEmptyReturnBlocks(Function &F) {
@@ -326,7 +170,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
 bool CFGSimplifyPass::runOnFunction(Function &F) {
   const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfo>();
   const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
-  bool EverChanged = removeUnreachableBlocksFromFn(F);
+  bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
   EverChanged |= iterativelySimplifyCFG(F, TTI, TD);
 
@@ -334,16 +178,16 @@ bool CFGSimplifyPass::runOnFunction(Function &F) {
   if (!EverChanged) return false;
 
   // iterativelySimplifyCFG can (rarely) make some loops dead.  If this happens,
-  // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
+  // removeUnreachableBlocks is needed to nuke them, which means we should
   // iterate between the two optimizations.  We structure the code like this to
   // avoid reruning iterativelySimplifyCFG if the second pass of
-  // removeUnreachableBlocksFromFn doesn't do anything.
-  if (!removeUnreachableBlocksFromFn(F))
+  // removeUnreachableBlocks doesn't do anything.
+  if (!removeUnreachableBlocks(F))
     return true;
 
   do {
     EverChanged = iterativelySimplifyCFG(F, TTI, TD);
-    EverChanged |= removeUnreachableBlocksFromFn(F);
+    EverChanged |= removeUnreachableBlocks(F);
   } while (EverChanged);
 
   return true;
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
deleted file mode 100644
index 3514e6c..0000000
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ /dev/null
@@ -1,247 +0,0 @@
-//===- SimplifyLibCalls.cpp - Optimize specific well-known library calls --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a simple pass that applies a variety of small
-// optimizations for calls to specific well-known function calls (e.g. runtime
-// library functions).   Any optimization that takes the very simple form
-// "replace call to library function with simpler code that provides the same
-// result" belongs in this file.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "simplify-libcalls"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Config/config.h"            // FIXME: Shouldn't depend on host!
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-using namespace llvm;
-
-
-//===----------------------------------------------------------------------===//
-// Optimizer Base Class
-//===----------------------------------------------------------------------===//
-
-/// This class is the abstract base class for the set of optimizations that
-/// corresponds to one library call.
-namespace {
-class LibCallOptimization {
-protected:
-  Function *Caller;
-  const DataLayout *TD;
-  const TargetLibraryInfo *TLI;
-  LLVMContext* Context;
-public:
-  LibCallOptimization() { }
-  virtual ~LibCallOptimization() {}
-
-  /// CallOptimizer - This pure virtual method is implemented by base classes to
-  /// do various optimizations.  If this returns null then no transformation was
-  /// performed.  If it returns CI, then it transformed the call and CI is to be
-  /// deleted.  If it returns something else, replace CI with the new value and
-  /// delete CI.
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
-    =0;
-
-  Value *OptimizeCall(CallInst *CI, const DataLayout *TD,
-                      const TargetLibraryInfo *TLI, IRBuilder<> &B) {
-    Caller = CI->getParent()->getParent();
-    this->TD = TD;
-    this->TLI = TLI;
-    if (CI->getCalledFunction())
-      Context = &CI->getCalledFunction()->getContext();
-
-    // We never change the calling convention.
-    if (CI->getCallingConv() != llvm::CallingConv::C)
-      return NULL;
-
-    return CallOptimizer(CI->getCalledFunction(), CI, B);
-  }
-};
-} // End anonymous namespace.
-
-
-//===----------------------------------------------------------------------===//
-// SimplifyLibCalls Pass Implementation
-//===----------------------------------------------------------------------===//
-
-namespace {
-  /// This pass optimizes well known library functions from libc and libm.
-  ///
-  class SimplifyLibCalls : public FunctionPass {
-    TargetLibraryInfo *TLI;
-
-    StringMap<LibCallOptimization*> Optimizations;
-  public:
-    static char ID; // Pass identification
-    SimplifyLibCalls() : FunctionPass(ID) {
-      initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
-    }
-    void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
-    void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt);
-
-    void InitOptimizations();
-    bool runOnFunction(Function &F);
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.addRequired<TargetLibraryInfo>();
-    }
-  };
-} // end anonymous namespace.
-
-char SimplifyLibCalls::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SimplifyLibCalls, "simplify-libcalls",
-                      "Simplify well-known library calls", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
-INITIALIZE_PASS_END(SimplifyLibCalls, "simplify-libcalls",
-                    "Simplify well-known library calls", false, false)
-
-// Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createSimplifyLibCallsPass() {
-  return new SimplifyLibCalls();
-}
-
-void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) {
-  if (TLI->has(F))
-    Optimizations[TLI->getName(F)] = Opt;
-}
-
-void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
-                              LibCallOptimization* Opt) {
-  if (TLI->has(F1) && TLI->has(F2))
-    Optimizations[TLI->getName(F1)] = Opt;
-}
-
-/// Optimizations - Populate the Optimizations map with all the optimizations
-/// we know.
-void SimplifyLibCalls::InitOptimizations() {
-}
-
-
-/// runOnFunction - Top level algorithm.
-///
-bool SimplifyLibCalls::runOnFunction(Function &F) {
-  TLI = &getAnalysis<TargetLibraryInfo>();
-
-  if (Optimizations.empty())
-    InitOptimizations();
-
-  const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
-
-  IRBuilder<> Builder(F.getContext());
-
-  bool Changed = false;
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
-      // Ignore non-calls.
-      CallInst *CI = dyn_cast<CallInst>(I++);
-      if (!CI || CI->hasFnAttr(Attribute::NoBuiltin)) continue;
-
-      // Ignore indirect calls and calls to non-external functions.
-      Function *Callee = CI->getCalledFunction();
-      if (Callee == 0 || !Callee->isDeclaration() ||
-          !(Callee->hasExternalLinkage() || Callee->hasDLLImportLinkage()))
-        continue;
-
-      // Ignore unknown calls.
-      LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
-      if (!LCO) continue;
-
-      // Set the builder to the instruction after the call.
-      Builder.SetInsertPoint(BB, I);
-
-      // Use debug location of CI for all new instructions.
-      Builder.SetCurrentDebugLocation(CI->getDebugLoc());
-
-      // Try to optimize this call.
-      Value *Result = LCO->OptimizeCall(CI, TD, TLI, Builder);
-      if (Result == 0) continue;
-
-      DEBUG(dbgs() << "SimplifyLibCalls simplified: " << *CI;
-            dbgs() << "  into: " << *Result << "\n");
-
-      // Something changed!
-      Changed = true;
-
-      // Inspect the instruction after the call (which was potentially just
-      // added) next.
-      I = CI; ++I;
-
-      if (CI != Result && !CI->use_empty()) {
-        CI->replaceAllUsesWith(Result);
-        if (!Result->hasName())
-          Result->takeName(CI);
-      }
-      CI->eraseFromParent();
-    }
-  }
-  return Changed;
-}
-
-// TODO:
-//   Additional cases that we need to add to this file:
-//
-// cbrt:
-//   * cbrt(expN(X))  -> expN(x/3)
-//   * cbrt(sqrt(x))  -> pow(x,1/6)
-//   * cbrt(sqrt(x))  -> pow(x,1/9)
-//
-// exp, expf, expl:
-//   * exp(log(x))  -> x
-//
-// log, logf, logl:
-//   * log(exp(x))   -> x
-//   * log(x**y)     -> y*log(x)
-//   * log(exp(y))   -> y*log(e)
-//   * log(exp2(y))  -> y*log(2)
-//   * log(exp10(y)) -> y*log(10)
-//   * log(sqrt(x))  -> 0.5*log(x)
-//   * log(pow(x,y)) -> y*log(x)
-//
-// lround, lroundf, lroundl:
-//   * lround(cnst) -> cnst'
-//
-// pow, powf, powl:
-//   * pow(exp(x),y)  -> exp(x*y)
-//   * pow(sqrt(x),y) -> pow(x,y*0.5)
-//   * pow(pow(x,y),z)-> pow(x,y*z)
-//
-// round, roundf, roundl:
-//   * round(cnst) -> cnst'
-//
-// signbit:
-//   * signbit(cnst) -> cnst'
-//   * signbit(nncst) -> 0 (if pstv is a non-negative constant)
-//
-// sqrt, sqrtf, sqrtl:
-//   * sqrt(expN(x))  -> expN(x*0.5)
-//   * sqrt(Nroot(x)) -> pow(x,1/(2*N))
-//   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
-//
-// strchr:
-//   * strchr(p, 0) -> strlen(p)
-// tan, tanf, tanl:
-//   * tan(atan(x)) -> x
-//
-// trunc, truncf, truncl:
-//   * trunc(cnst) -> cnst'
-//
-//
diff --git a/lib/Target/R600/AMDGPUStructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index dea43b8..5045ff8f 100644
--- a/lib/Target/R600/AMDGPUStructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUStructurizeCFG.cpp -  ------------------===//
+//===-- StructurizeCFG.cpp ------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,24 +6,17 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-/// \file
-/// The pass implemented in this file transforms the programs control flow
-/// graph into a form that's suitable for code generation on hardware that
-/// implements control flow by execution masking. This currently includes all
-/// AMD GPUs but may as well be useful for other types of hardware.
-//
-//===----------------------------------------------------------------------===//
 
-#include "AMDGPU.h"
-#include "llvm/ADT/SCCIterator.h"
+#define DEBUG_TYPE "structurizecfg"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SCCIterator.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/RegionPass.h"
 #include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Support/PatternMatch.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -52,14 +45,13 @@ typedef DenseMap<BasicBlock *, BasicBlock*> BB2BBMap;
 
 // The name for newly created blocks.
 
-static const char *FlowBlockName = "Flow";
+static const char *const FlowBlockName = "Flow";
 
 /// @brief Find the nearest common dominator for multiple BasicBlocks
 ///
-/// Helper class for AMDGPUStructurizeCFG
+/// Helper class for StructurizeCFG
 /// TODO: Maybe move into common code
 class NearestCommonDominator {
-
   DominatorTree *DT;
 
   DTN2UnsignedMap IndexMap;
@@ -77,7 +69,6 @@ public:
 
   /// \brief Add BB to the resulting dominator
   void addBlock(BasicBlock *BB, bool Remember = true) {
-
     DomTreeNode *Node = DT->getNode(BB);
 
     if (Result == 0) {
@@ -131,7 +122,7 @@ public:
 /// | |
 /// 2 |
 /// | /
-/// |/   
+/// |/
 /// 3
 /// ||   Where:
 /// | |  1 = "If" block, calculates the condition
@@ -164,10 +155,7 @@ public:
 /// while the true side continues the general flow. So the loop condition
 /// consist of a network of PHI nodes where the true incoming values expresses
 /// breaks and the false values expresses continue states.
-class AMDGPUStructurizeCFG : public RegionPass {
-
-  static char ID;
-
+class StructurizeCFG : public RegionPass {
   Type *Boolean;
   ConstantInt *BoolTrue;
   ConstantInt *BoolFalse;
@@ -239,10 +227,11 @@ class AMDGPUStructurizeCFG : public RegionPass {
   void rebuildSSA();
 
 public:
-  AMDGPUStructurizeCFG():
-    RegionPass(ID) {
+  static char ID;
 
-    initializeRegionInfoPass(*PassRegistry::getPassRegistry());
+  StructurizeCFG() :
+    RegionPass(ID) {
+    initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
   }
 
   using Pass::doInitialization;
@@ -251,24 +240,31 @@ public:
   virtual bool runOnRegion(Region *R, RGPassManager &RGM);
 
   virtual const char *getPassName() const {
-    return "AMDGPU simplify control flow";
+    return "Structurize control flow";
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const {
-
+    AU.addRequiredID(LowerSwitchID);
     AU.addRequired<DominatorTree>();
     AU.addPreserved<DominatorTree>();
     RegionPass::getAnalysisUsage(AU);
   }
-
 };
 
 } // end anonymous namespace
 
-char AMDGPUStructurizeCFG::ID = 0;
+char StructurizeCFG::ID = 0;
+
+INITIALIZE_PASS_BEGIN(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_DEPENDENCY(RegionInfo)
+INITIALIZE_PASS_END(StructurizeCFG, "structurizecfg", "Structurize the CFG",
+                    false, false)
 
 /// \brief Initialize the types and constants used in the pass
-bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
+bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
   LLVMContext &Context = R->getEntry()->getContext();
 
   Boolean = Type::getInt1Ty(Context);
@@ -280,7 +276,7 @@ bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
 }
 
 /// \brief Build up the general order of nodes
-void AMDGPUStructurizeCFG::orderNodes() {
+void StructurizeCFG::orderNodes() {
   scc_iterator<Region *> I = scc_begin(ParentRegion),
                          E = scc_end(ParentRegion);
   for (Order.clear(); I != E; ++I) {
@@ -290,8 +286,7 @@ void AMDGPUStructurizeCFG::orderNodes() {
 }
 
 /// \brief Determine the end of the loops
-void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) {
-
+void StructurizeCFG::analyzeLoops(RegionNode *N) {
   if (N->isSubRegion()) {
     // Test for exit as back edge
     BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
@@ -313,8 +308,7 @@ void AMDGPUStructurizeCFG::analyzeLoops(RegionNode *N) {
 }
 
 /// \brief Invert the given condition
-Value *AMDGPUStructurizeCFG::invert(Value *Condition) {
-
+Value *StructurizeCFG::invert(Value *Condition) {
   // First: Check if it's a constant
   if (Condition == BoolTrue)
     return BoolFalse;
@@ -329,39 +323,49 @@ Value *AMDGPUStructurizeCFG::invert(Value *Condition) {
   if (match(Condition, m_Not(m_Value(Condition))))
     return Condition;
 
-  // Third: Check all the users for an invert
-  BasicBlock *Parent = cast<Instruction>(Condition)->getParent();
-  for (Value::use_iterator I = Condition->use_begin(),
-       E = Condition->use_end(); I != E; ++I) {
+  if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
+    // Third: Check all the users for an invert
+    BasicBlock *Parent = Inst->getParent();
+    for (Value::use_iterator I = Condition->use_begin(),
+           E = Condition->use_end(); I != E; ++I) {
 
-    Instruction *User = dyn_cast<Instruction>(*I);
-    if (!User || User->getParent() != Parent)
-      continue;
+      Instruction *User = dyn_cast<Instruction>(*I);
+      if (!User || User->getParent() != Parent)
+        continue;
+
+      if (match(*I, m_Not(m_Specific(Condition))))
+        return *I;
+    }
+
+    // Last option: Create a new instruction
+    return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+  }
 
-    if (match(*I, m_Not(m_Specific(Condition))))
-      return *I;
+  if (Argument *Arg = dyn_cast<Argument>(Condition)) {
+    BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
+    return BinaryOperator::CreateNot(Condition,
+                                     Arg->getName() + ".inv",
+                                     EntryBlock.getTerminator());
   }
 
-  // Last option: Create a new instruction
-  return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
+  llvm_unreachable("Unhandled condition to invert");
 }
 
 /// \brief Build the condition for one edge
-Value *AMDGPUStructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
-                                            bool Invert) {
+Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+                                      bool Invert) {
   Value *Cond = Invert ? BoolFalse : BoolTrue;
   if (Term->isConditional()) {
     Cond = Term->getCondition();
 
-    if (Idx != Invert)
+    if (Idx != (unsigned)Invert)
       Cond = invert(Cond);
   }
   return Cond;
 }
 
 /// \brief Analyze the predecessors of each block and build up predicates
-void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) {
-
+void StructurizeCFG::gatherPredicates(RegionNode *N) {
   RegionInfo *RI = ParentRegion->getRegionInfo();
   BasicBlock *BB = N->getEntry();
   BBPredicates &Pred = Predicates[BB];
@@ -398,7 +402,7 @@ void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) {
             }
           }
           Pred[*PI] = buildCondition(Term, i, false);
- 
+
         } else {
           // Back edge
           LPred[*PI] = buildCondition(Term, i, true);
@@ -425,8 +429,7 @@ void AMDGPUStructurizeCFG::gatherPredicates(RegionNode *N) {
 }
 
 /// \brief Collect various loop and predicate infos
-void AMDGPUStructurizeCFG::collectInfos() {
-
+void StructurizeCFG::collectInfos() {
   // Reset predicate
   Predicates.clear();
 
@@ -452,7 +455,7 @@ void AMDGPUStructurizeCFG::collectInfos() {
 }
 
 /// \brief Insert the missing branch conditions
-void AMDGPUStructurizeCFG::insertConditions(bool Loops) {
+void StructurizeCFG::insertConditions(bool Loops) {
   BranchVector &Conds = Loops ? LoopConds : Conditions;
   Value *Default = Loops ? BoolTrue : BoolFalse;
   SSAUpdater PhiInserter;
@@ -501,7 +504,7 @@ void AMDGPUStructurizeCFG::insertConditions(bool Loops) {
 
 /// \brief Remove all PHI values coming from "From" into "To" and remember
 /// them in DeletedPhis
-void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
   PhiMap &Map = DeletedPhis[To];
   for (BasicBlock::iterator I = To->begin(), E = To->end();
        I != E && isa<PHINode>(*I);) {
@@ -515,7 +518,7 @@ void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
 }
 
 /// \brief Add a dummy PHI value as soon as we knew the new predecessor
-void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
   for (BasicBlock::iterator I = To->begin(), E = To->end();
        I != E && isa<PHINode>(*I);) {
 
@@ -527,8 +530,7 @@ void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
 }
 
 /// \brief Add the real PHI value as soon as everything is set up
-void AMDGPUStructurizeCFG::setPhiValues() {
-
+void StructurizeCFG::setPhiValues() {
   SSAUpdater Updater;
   for (BB2BBVecMap::iterator AI = AddedPhis.begin(), AE = AddedPhis.end();
        AI != AE; ++AI) {
@@ -576,7 +578,7 @@ void AMDGPUStructurizeCFG::setPhiValues() {
 }
 
 /// \brief Remove phi values from all successors and then remove the terminator.
-void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) {
+void StructurizeCFG::killTerminator(BasicBlock *BB) {
   TerminatorInst *Term = BB->getTerminator();
   if (!Term)
     return;
@@ -591,9 +593,8 @@ void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) {
 }
 
 /// \brief Let node exit(s) point to NewExit
-void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
-                                      bool IncludeDominator) {
-
+void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+                                bool IncludeDominator) {
   if (Node->isSubRegion()) {
     Region *SubRegion = Node->getNodeAs<Region>();
     BasicBlock *OldExit = SubRegion->getExit();
@@ -639,7 +640,7 @@ void AMDGPUStructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
 }
 
 /// \brief Create a new flow node and update dominator tree and region info
-BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
   LLVMContext &Context = Func->getContext();
   BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
                        Order.back()->getEntry();
@@ -651,8 +652,7 @@ BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Dominator) {
 }
 
 /// \brief Create a new or reuse the previous node as flow node
-BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) {
-
+BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
   BasicBlock *Entry = PrevNode->getEntry();
 
   if (!PrevNode->isSubRegion()) {
@@ -660,7 +660,7 @@ BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) {
     if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
       return Entry;
 
-  } 
+  }
 
   // create a new flow node
   BasicBlock *Flow = getNextFlow(Entry);
@@ -672,9 +672,8 @@ BasicBlock *AMDGPUStructurizeCFG::needPrefix(bool NeedEmpty) {
 }
 
 /// \brief Returns the region exit if possible, otherwise just a new flow node
-BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow,
-                                              bool ExitUseAllowed) {
-
+BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
+                                        bool ExitUseAllowed) {
   if (Order.empty() && ExitUseAllowed) {
     BasicBlock *Exit = ParentRegion->getExit();
     DT->changeImmediateDominator(Exit, Flow);
@@ -685,12 +684,12 @@ BasicBlock *AMDGPUStructurizeCFG::needPostfix(BasicBlock *Flow,
 }
 
 /// \brief Set the previous node
-void AMDGPUStructurizeCFG::setPrevNode(BasicBlock *BB) {
+void StructurizeCFG::setPrevNode(BasicBlock *BB) {
   PrevNode =  ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB) : 0;
 }
 
 /// \brief Does BB dominate all the predicates of Node ?
-bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
   BBPredicates &Preds = Predicates[Node->getEntry()];
   for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end();
        PI != PE; ++PI) {
@@ -702,8 +701,7 @@ bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node)
 }
 
 /// \brief Can we predict that this node will always be called?
-bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) {
-
+bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
   BBPredicates &Preds = Predicates[Node->getEntry()];
   bool Dominated = false;
 
@@ -726,9 +724,8 @@ bool AMDGPUStructurizeCFG::isPredictableTrue(RegionNode *Node) {
 }
 
 /// Take one node from the order vector and wire it up
-void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed,
-                                    BasicBlock *LoopEnd) {
-
+void StructurizeCFG::wireFlow(bool ExitUseAllowed,
+                              BasicBlock *LoopEnd) {
   RegionNode *Node = Order.pop_back_val();
   Visited.insert(Node->getEntry());
 
@@ -763,8 +760,8 @@ void AMDGPUStructurizeCFG::wireFlow(bool ExitUseAllowed,
   }
 }
 
-void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed,
-                                       BasicBlock *LoopEnd) {
+void StructurizeCFG::handleLoops(bool ExitUseAllowed,
+                                 BasicBlock *LoopEnd) {
   RegionNode *Node = Order.back();
   BasicBlock *LoopStart = Node->getEntry();
 
@@ -782,6 +779,20 @@ void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed,
     handleLoops(false, LoopEnd);
   }
 
+  // If the start of the loop is the entry block, we can't branch to it so
+  // insert a new dummy entry block.
+  Function *LoopFunc = LoopStart->getParent();
+  if (LoopStart == &LoopFunc->getEntryBlock()) {
+    LoopStart->setName("entry.orig");
+
+    BasicBlock *NewEntry =
+      BasicBlock::Create(LoopStart->getContext(),
+                         "entry",
+                         LoopFunc,
+                         LoopStart);
+    BranchInst::Create(LoopStart, NewEntry);
+  }
+
   // Create an extra loop end node
   LoopEnd = needPrefix(false);
   BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
@@ -793,8 +804,7 @@ void AMDGPUStructurizeCFG::handleLoops(bool ExitUseAllowed,
 
 /// After this function control flow looks like it should be, but
 /// branches and PHI nodes only have undefined conditions.
-void AMDGPUStructurizeCFG::createFlow() {
-
+void StructurizeCFG::createFlow() {
   BasicBlock *Exit = ParentRegion->getExit();
   bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
 
@@ -818,7 +828,7 @@ void AMDGPUStructurizeCFG::createFlow() {
 
 /// Handle a rare case where the disintegrated nodes instructions
 /// no longer dominate all their uses. Not sure if this is really nessasary
-void AMDGPUStructurizeCFG::rebuildSSA() {
+void StructurizeCFG::rebuildSSA() {
   SSAUpdater Updater;
   for (Region::block_iterator I = ParentRegion->block_begin(),
                               E = ParentRegion->block_end();
@@ -859,7 +869,7 @@ void AMDGPUStructurizeCFG::rebuildSSA() {
 }
 
 /// \brief Run the transformation for each region found
-bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
+bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   if (R->isTopLevelRegion())
     return false;
 
@@ -891,6 +901,6 @@ bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
 }
 
 /// \brief Create the pass
-Pass *llvm::createAMDGPUStructurizeCFGPass() {
-  return new AMDGPUStructurizeCFG();
+Pass *llvm::createStructurizeCFGPass() {
+  return new StructurizeCFG();
 }
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 2002e68..9fb8ddc 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -53,6 +53,7 @@
 #define DEBUG_TYPE "tailcallelim"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/InlineCost.h"
@@ -69,6 +70,7 @@
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -97,16 +99,16 @@ namespace {
     bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
                                     BasicBlock *&OldEntry,
                                     bool &TailCallsAreMarkedTail,
-                                    SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                    SmallVectorImpl<PHINode *> &ArgumentPHIs,
                                     bool CannotTailCallElimCallsMarkedTail);
     bool FoldReturnAndProcessPred(BasicBlock *BB,
                                   ReturnInst *Ret, BasicBlock *&OldEntry,
                                   bool &TailCallsAreMarkedTail,
-                                  SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                  SmallVectorImpl<PHINode *> &ArgumentPHIs,
                                   bool CannotTailCallElimCallsMarkedTail);
     bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
                                bool &TailCallsAreMarkedTail,
-                               SmallVector<PHINode*, 8> &ArgumentPHIs,
+                               SmallVectorImpl<PHINode *> &ArgumentPHIs,
                                bool CannotTailCallElimCallsMarkedTail);
     bool CanMoveAboveCall(Instruction *I, CallInst *CI);
     Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
@@ -129,34 +131,44 @@ void TailCallElim::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<TargetTransformInfo>();
 }
 
-/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by
-/// callees of this function.  We only do very simple analysis right now, this
-/// could be expanded in the future to use mod/ref information for particular
-/// call sites if desired.
-static bool AllocaMightEscapeToCalls(AllocaInst *AI) {
-  // FIXME: do simple 'address taken' analysis.
-  return true;
+/// CanTRE - Scan the specified basic block for alloca instructions.
+/// If it contains any that are variable-sized or not in the entry block,
+/// returns false.
+static bool CanTRE(AllocaInst *AI) {
+  // Because of PR962, we don't TRE allocas outside the entry block.
+
+  // If this alloca is in the body of the function, or if it is a variable
+  // sized allocation, we cannot tail call eliminate calls marked 'tail'
+  // with this mechanism.
+  BasicBlock *BB = AI->getParent();
+  return BB == &BB->getParent()->getEntryBlock() &&
+         isa<ConstantInt>(AI->getArraySize());
 }
 
-/// CheckForEscapingAllocas - Scan the specified basic block for alloca
-/// instructions.  If it contains any that might be accessed by calls, return
-/// true.
-static bool CheckForEscapingAllocas(BasicBlock *BB,
-                                    bool &CannotTCETailMarkedCall) {
-  bool RetVal = false;
-  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
-    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
-      RetVal |= AllocaMightEscapeToCalls(AI);
-
-      // If this alloca is in the body of the function, or if it is a variable
-      // sized allocation, we cannot tail call eliminate calls marked 'tail'
-      // with this mechanism.
-      if (BB != &BB->getParent()->getEntryBlock() ||
-          !isa<ConstantInt>(AI->getArraySize()))
-        CannotTCETailMarkedCall = true;
-    }
-  return RetVal;
-}
+namespace {
+struct AllocaCaptureTracker : public CaptureTracker {
+  AllocaCaptureTracker() : Captured(false) {}
+
+  void tooManyUses() LLVM_OVERRIDE { Captured = true; }
+
+  bool shouldExplore(Use *U) LLVM_OVERRIDE {
+    Value *V = U->getUser();
+    if (isa<CallInst>(V) || isa<InvokeInst>(V))
+      UsesAlloca.insert(V);
+    return true;
+  }
+
+  bool captured(Use *U) LLVM_OVERRIDE {
+    if (isa<ReturnInst>(U->getUser()))
+      return false;
+    Captured = true;
+    return true;
+  }
+
+  bool Captured;
+  SmallPtrSet<const Value *, 16> UsesAlloca;
+};
+} // end anonymous namespace
 
 bool TailCallElim::runOnFunction(Function &F) {
   // If this function is a varargs function, we won't be able to PHI the args
@@ -168,41 +180,44 @@ bool TailCallElim::runOnFunction(Function &F) {
   bool TailCallsAreMarkedTail = false;
   SmallVector<PHINode*, 8> ArgumentPHIs;
   bool MadeChange = false;
-  bool FunctionContainsEscapingAllocas = false;
 
-  // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls
+  // CanTRETailMarkedCall - If false, we cannot perform TRE on tail calls
   // marked with the 'tail' attribute, because doing so would cause the stack
-  // size to increase (real TCE would deallocate variable sized allocas, TCE
+  // size to increase (real TRE would deallocate variable sized allocas, TRE
   // doesn't).
-  bool CannotTCETailMarkedCall = false;
-
-  // Loop over the function, looking for any returning blocks, and keeping track
-  // of whether this function has any non-trivially used allocas.
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall)
-      break;
-
-    FunctionContainsEscapingAllocas |=
-      CheckForEscapingAllocas(BB, CannotTCETailMarkedCall);
+  bool CanTRETailMarkedCall = true;
+
+  // Find calls that can be marked tail.
+  AllocaCaptureTracker ACT;
+  for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB) {
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+        CanTRETailMarkedCall &= CanTRE(AI);
+        PointerMayBeCaptured(AI, &ACT);
+        // If any allocas are captured, exit.
+        if (ACT.Captured)
+          return false;
+      }
+    }
   }
 
-  /// FIXME: The code generator produces really bad code when an 'escaping
-  /// alloca' is changed from being a static alloca to being a dynamic alloca.
-  /// Until this is resolved, disable this transformation if that would ever
-  /// happen.  This bug is PR962.
-  if (FunctionContainsEscapingAllocas)
-    return false;
-
-  // Second pass, change any tail calls to loops.
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
-      bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
-                                          ArgumentPHIs,CannotTCETailMarkedCall);
-      if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
-        Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
-                                          TailCallsAreMarkedTail, ArgumentPHIs,
-                                          CannotTCETailMarkedCall);
-      MadeChange |= Change;
+  // Second pass, change any tail recursive calls to loops.
+  //
+  // FIXME: The code generator produces really bad code when an 'escaping
+  // alloca' is changed from being a static alloca to being a dynamic alloca.
+  // Until this is resolved, disable this transformation if that would ever
+  // happen.  This bug is PR962.
+  if (ACT.UsesAlloca.empty()) {
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+      if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
+        bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                            ArgumentPHIs, !CanTRETailMarkedCall);
+        if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
+          Change = FoldReturnAndProcessPred(BB, Ret, OldEntry,
+                                            TailCallsAreMarkedTail, ArgumentPHIs,
+                                            !CanTRETailMarkedCall);
+        MadeChange |= Change;
+      }
     }
   }
 
@@ -223,16 +238,24 @@ bool TailCallElim::runOnFunction(Function &F) {
     }
   }
 
-  // Finally, if this function contains no non-escaping allocas, or calls
-  // setjmp, mark all calls in the function as eligible for tail calls
-  //(there is no stack memory for them to access).
-  if (!FunctionContainsEscapingAllocas && !F.callsFunctionThatReturnsTwice())
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+  // At this point, we know that the function does not have any captured
+  // allocas. If additionally the function does not call setjmp, mark all calls
+  // in the function that do not access stack memory with the tail keyword. This
+  // implies ensuring that there does not exist any path from a call that takes
+  // in an alloca but does not capture it and the call which we wish to mark
+  // with "tail".
+  if (!F.callsFunctionThatReturnsTwice()) {
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
         if (CallInst *CI = dyn_cast<CallInst>(I)) {
-          CI->setTailCall();
-          MadeChange = true;
+          if (!ACT.UsesAlloca.count(CI)) {
+            CI->setTailCall();
+            MadeChange = true;
+          }
         }
+      }
+    }
+  }
 
   return MadeChange;
 }
@@ -424,7 +447,7 @@ TailCallElim::FindTRECandidate(Instruction *TI,
 bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
                                        BasicBlock *&OldEntry,
                                        bool &TailCallsAreMarkedTail,
-                                       SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                       SmallVectorImpl<PHINode *> &ArgumentPHIs,
                                        bool CannotTailCallElimCallsMarkedTail) {
   // If we are introducing accumulator recursion to eliminate operations after
   // the call instruction that are both associative and commutative, the initial
@@ -600,7 +623,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
 bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
                                        ReturnInst *Ret, BasicBlock *&OldEntry,
                                        bool &TailCallsAreMarkedTail,
-                                       SmallVector<PHINode*, 8> &ArgumentPHIs,
+                                       SmallVectorImpl<PHINode *> &ArgumentPHIs,
                                        bool CannotTailCallElimCallsMarkedTail) {
   bool Change = false;
 
@@ -634,10 +657,11 @@ bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB,
   return Change;
 }
 
-bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
-                                         bool &TailCallsAreMarkedTail,
-                                         SmallVector<PHINode*, 8> &ArgumentPHIs,
-                                       bool CannotTailCallElimCallsMarkedTail) {
+bool
+TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+                                    bool &TailCallsAreMarkedTail,
+                                    SmallVectorImpl<PHINode *> &ArgumentPHIs,
+                                    bool CannotTailCallElimCallsMarkedTail) {
   CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail);
   if (!CI)
     return false;
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index ba99d2e..12de9ee 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -14,6 +14,7 @@
 
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
@@ -170,7 +171,7 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, Pass *P) {
       if (DomTreeNode *DTN = DT->getNode(BB)) {
         DomTreeNode *PredDTN = DT->getNode(PredBB);
         SmallVector<DomTreeNode*, 8> Children(DTN->begin(), DTN->end());
-        for (SmallVector<DomTreeNode*, 8>::iterator DI = Children.begin(),
+        for (SmallVectorImpl<DomTreeNode *>::iterator DI = Children.begin(),
              DE = Children.end(); DI != DE; ++DI)
           DT->changeImmediateDominator(*DI, PredDTN);
 
@@ -235,22 +236,6 @@ void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
   ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
 }
 
-/// GetSuccessorNumber - Search for the specified successor of basic block BB
-/// and return its position in the terminator instruction's list of
-/// successors.  It is an error to call this with a block that is not a
-/// successor.
-unsigned llvm::GetSuccessorNumber(BasicBlock *BB, BasicBlock *Succ) {
-  TerminatorInst *Term = BB->getTerminator();
-#ifndef NDEBUG
-  unsigned e = Term->getNumSuccessors();
-#endif
-  for (unsigned i = 0; ; ++i) {
-    assert(i != e && "Didn't find edge?");
-    if (Term->getSuccessor(i) == Succ)
-      return i;
-  }
-}
-
 /// SplitEdge -  Split the edge connecting specified block. Pass P must
 /// not be NULL.
 BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
@@ -263,7 +248,6 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, Pass *P) {
 
   // If the edge isn't critical, then BB has a single successor or Succ has a
   // single pred.  Split the block.
-  BasicBlock::iterator SplitPoint;
   if (BasicBlock *SP = Succ->getSinglePredecessor()) {
     // If the successor only has a single pred, split the top of the successor
     // block.
@@ -416,8 +400,12 @@ static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
       // If all incoming values for the new PHI would be the same, just don't
       // make a new PHI.  Instead, just remove the incoming values from the old
       // PHI.
-      for (unsigned i = 0, e = Preds.size(); i != e; ++i)
-        PN->removeIncomingValue(Preds[i], false);
+      for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+        // Explicitly check the BB index here to handle duplicates in Preds.
+        int Idx = PN->getBasicBlockIndex(Preds[i]);
+        if (Idx >= 0)
+          PN->removeIncomingValue(Idx, false);
+      }
     } else {
       // If the values coming into the block are not the same, we need a PHI.
       // Create the new PHI node, insert it into NewBB at the end of the block
@@ -598,52 +586,6 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
   }
 }
 
-/// FindFunctionBackedges - Analyze the specified function to find all of the
-/// loop backedges in the function and return them.  This is a relatively cheap
-/// (compared to computing dominators and loop info) analysis.
-///
-/// The output is added to Result, as pairs of <from,to> edge info.
-void llvm::FindFunctionBackedges(const Function &F,
-     SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) {
-  const BasicBlock *BB = &F.getEntryBlock();
-  if (succ_begin(BB) == succ_end(BB))
-    return;
-
-  SmallPtrSet<const BasicBlock*, 8> Visited;
-  SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
-  SmallPtrSet<const BasicBlock*, 8> InStack;
-
-  Visited.insert(BB);
-  VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
-  InStack.insert(BB);
-  do {
-    std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
-    const BasicBlock *ParentBB = Top.first;
-    succ_const_iterator &I = Top.second;
-
-    bool FoundNew = false;
-    while (I != succ_end(ParentBB)) {
-      BB = *I++;
-      if (Visited.insert(BB)) {
-        FoundNew = true;
-        break;
-      }
-      // Successor is in VisitStack, it's a back edge.
-      if (InStack.count(BB))
-        Result.push_back(std::make_pair(ParentBB, BB));
-    }
-
-    if (FoundNew) {
-      // Go down one level if there is a unvisited successor.
-      InStack.insert(BB);
-      VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
-    } else {
-      // Go up one level.
-      InStack.erase(VisitStack.pop_back_val().first);
-    }
-  } while (!VisitStack.empty());
-}
-
 /// FoldReturnIntoUncondBranch - This method duplicates the specified return
 /// instruction into a predecessor which ends in an unconditional branch. If
 /// the return instruction returns a value defined by a PHI, propagate the
@@ -726,3 +668,104 @@ TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp,
   ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
   return CheckTerm;
 }
+
+/// GetIfCondition - Given a basic block (BB) with two predecessors,
+/// check to see if the merge at this block is due
+/// to an "if condition".  If so, return the boolean condition that determines
+/// which entry into BB will be taken.  Also, return by references the block
+/// that will be entered from if the condition is true, and the block that will
+/// be entered if the condition is false.
+///
+/// This does no checking to see if the true/false blocks have large or unsavory
+/// instructions in them.
+Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+                             BasicBlock *&IfFalse) {
+  PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
+  BasicBlock *Pred1 = NULL;
+  BasicBlock *Pred2 = NULL;
+
+  if (SomePHI) {
+    if (SomePHI->getNumIncomingValues() != 2)
+      return NULL;
+    Pred1 = SomePHI->getIncomingBlock(0);
+    Pred2 = SomePHI->getIncomingBlock(1);
+  } else {
+    pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+    if (PI == PE) // No predecessor
+      return NULL;
+    Pred1 = *PI++;
+    if (PI == PE) // Only one predecessor
+      return NULL;
+    Pred2 = *PI++;
+    if (PI != PE) // More than two predecessors
+      return NULL;
+  }
+
+  // We can only handle branches.  Other control flow will be lowered to
+  // branches if possible anyway.
+  BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
+  BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
+  if (Pred1Br == 0 || Pred2Br == 0)
+    return 0;
+
+  // Eliminate code duplication by ensuring that Pred1Br is conditional if
+  // either are.
+  if (Pred2Br->isConditional()) {
+    // If both branches are conditional, we don't have an "if statement".  In
+    // reality, we could transform this case, but since the condition will be
+    // required anyway, we stand no chance of eliminating it, so the xform is
+    // probably not profitable.
+    if (Pred1Br->isConditional())
+      return 0;
+
+    std::swap(Pred1, Pred2);
+    std::swap(Pred1Br, Pred2Br);
+  }
+
+  if (Pred1Br->isConditional()) {
+    // The only thing we have to watch out for here is to make sure that Pred2
+    // doesn't have incoming edges from other blocks.  If it does, the condition
+    // doesn't dominate BB.
+    if (Pred2->getSinglePredecessor() == 0)
+      return 0;
+
+    // If we found a conditional branch predecessor, make sure that it branches
+    // to BB and Pred2Br.  If it doesn't, this isn't an "if statement".
+    if (Pred1Br->getSuccessor(0) == BB &&
+        Pred1Br->getSuccessor(1) == Pred2) {
+      IfTrue = Pred1;
+      IfFalse = Pred2;
+    } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+               Pred1Br->getSuccessor(1) == BB) {
+      IfTrue = Pred2;
+      IfFalse = Pred1;
+    } else {
+      // We know that one arm of the conditional goes to BB, so the other must
+      // go somewhere unrelated, and this must not be an "if statement".
+      return 0;
+    }
+
+    return Pred1Br->getCondition();
+  }
+
+  // Ok, if we got here, both predecessors end with an unconditional branch to
+  // BB.  Don't panic!  If both blocks only have a single (identical)
+  // predecessor, and THAT is a conditional branch, then we're all ok!
+  BasicBlock *CommonPred = Pred1->getSinglePredecessor();
+  if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor())
+    return 0;
+
+  // Otherwise, if this is a conditional branch, then we can use it!
+  BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
+  if (BI == 0) return 0;
+
+  assert(BI->isConditional() && "Two successors but not conditional?");
+  if (BI->getSuccessor(0) == Pred1) {
+    IfTrue = Pred1;
+    IfFalse = Pred2;
+  } else {
+    IfTrue = Pred2;
+    IfFalse = Pred1;
+  }
+  return BI->getCondition();
+}
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 8513772..0e7f7f7 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -19,9 +19,9 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
@@ -44,7 +44,6 @@ namespace {
     virtual void getAnalysisUsage(AnalysisUsage &AU) const {
       AU.addPreserved<DominatorTree>();
       AU.addPreserved<LoopInfo>();
-      AU.addPreserved<ProfileInfo>();
 
       // No loop canonicalization guarantees are broken by this pass.
       AU.addPreservedID(LoopSimplifyID);
@@ -84,39 +83,6 @@ bool BreakCriticalEdges::runOnFunction(Function &F) {
 //    Implementation of the external critical edge manipulation functions
 //===----------------------------------------------------------------------===//
 
-// isCriticalEdge - Return true if the specified edge is a critical edge.
-// Critical edges are edges from a block with multiple successors to a block
-// with multiple predecessors.
-//
-bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum,
-                          bool AllowIdenticalEdges) {
-  assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!");
-  if (TI->getNumSuccessors() == 1) return false;
-
-  const BasicBlock *Dest = TI->getSuccessor(SuccNum);
-  const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest);
-
-  // If there is more than one predecessor, this is a critical edge...
-  assert(I != E && "No preds, but we have an edge to the block?");
-  const BasicBlock *FirstPred = *I;
-  ++I;        // Skip one edge due to the incoming arc from TI.
-  if (!AllowIdenticalEdges)
-    return I != E;
-
-  // If AllowIdenticalEdges is true, then we allow this edge to be considered
-  // non-critical iff all preds come from TI's block.
-  while (I != E) {
-    const BasicBlock *P = *I;
-    if (P != FirstPred)
-      return true;
-    // Note: leave this as is until no one ever compiles with either gcc 4.0.1
-    // or Xcode 2. This seems to work around the pred_iterator assert in PR 2207
-    E = pred_end(P);
-    ++I;
-  }
-  return false;
-}
-
 /// createPHIsForSplitLoopExit - When a loop exit edge is split, LCSSA form
 /// may require new PHIs in the new exit block. This function inserts the
 /// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB
@@ -245,10 +211,9 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
 
   DominatorTree *DT = P->getAnalysisIfAvailable<DominatorTree>();
   LoopInfo *LI = P->getAnalysisIfAvailable<LoopInfo>();
-  ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
 
   // If we have nothing to update, just return.
-  if (DT == 0 && LI == 0 && PI == 0)
+  if (DT == 0 && LI == 0)
     return NewBB;
 
   // Now update analysis information.  Since the only predecessor of NewBB is
@@ -401,9 +366,5 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum,
     }
   }
 
-  // Update ProfileInfo if it is around.
-  if (PI)
-    PI->splitEdge(TIBB, DestBB, NewBB, MergeIdenticalEdges);
-
   return NewBB;
 }
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index b71628b..5afd6b8 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMTransformUtils
   CmpInstAnalysis.cpp
   CodeExtractor.cpp
   DemoteRegToStack.cpp
+  GlobalStatus.cpp
   InlineFunction.cpp
   InstructionNamer.cpp
   IntegerDivision.cpp
@@ -25,9 +26,11 @@ add_llvm_library(LLVMTransformUtils
   PromoteMemoryToRegister.cpp
   SSAUpdater.cpp
   SimplifyCFG.cpp
+  FlattenCFG.cpp
   SimplifyIndVar.cpp
   SimplifyInstructions.cpp
   SimplifyLibCalls.cpp
+  SpecialCaseList.cpp
   UnifyFunctionExitNodes.cpp
   Utils.cpp
   ValueMapper.cpp
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index be8d39e..d105f5e 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -78,7 +78,8 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
                              bool ModuleLevelChanges,
                              SmallVectorImpl<ReturnInst*> &Returns,
                              const char *NameSuffix, ClonedCodeInfo *CodeInfo,
-                             ValueMapTypeRemapper *TypeMapper) {
+                             ValueMapTypeRemapper *TypeMapper,
+                             ValueMaterializer *Materializer) {
   assert(NameSuffix && "NameSuffix cannot be null!");
 
 #ifndef NDEBUG
@@ -147,7 +148,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     for (BasicBlock::iterator II = BB->begin(); II != BB->end(); ++II)
       RemapInstruction(II, VMap,
                        ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
-                       TypeMapper);
+                       TypeMapper, Materializer);
 }
 
 /// CloneFunction - Return a copy of the specified function, but without
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index f7c659f..6f008644 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -277,8 +277,8 @@ void CodeExtractor::splitReturnBlocks() {
 
         DomTreeNode *NewNode = DT->addNewBlock(New, *I);
 
-        for (SmallVector<DomTreeNode*, 8>::iterator I = Children.begin(),
-               E = Children.end(); I != E; ++I) 
+        for (SmallVectorImpl<DomTreeNode *>::iterator I = Children.begin(),
+               E = Children.end(); I != E; ++I)
           DT->changeImmediateDominator(*I, NewNode);
       }
     }
@@ -665,8 +665,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
     TheSwitch->setCondition(call);
     TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
     // Remove redundant case
-    SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1);
-    TheSwitch->removeCase(ToBeRemoved);
+    TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
     break;
   }
 }
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
index db525cd..0723b35 100644
--- a/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -10,6 +10,7 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
diff --git a/lib/Transforms/Utils/FlattenCFG.cpp b/lib/Transforms/Utils/FlattenCFG.cpp
new file mode 100644
index 0000000..1da226b
--- /dev/null
+++ b/lib/Transforms/Utils/FlattenCFG.cpp
@@ -0,0 +1,486 @@
+//===- FlatternCFG.cpp - Code to perform CFG flattening ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Reduce conditional branches in CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "flattencfg"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+namespace {
+class FlattenCFGOpt {
+  AliasAnalysis *AA;
+  /// \brief Use parallel-and or parallel-or to generate conditions for
+  /// conditional branches.
+  bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0);
+  /// \brief If \param BB is the merge block of an if-region, attempt to merge
+  /// the if-region with an adjacent if-region upstream if two if-regions
+  /// contain identical instructions.
+  bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0);
+  /// \brief Compare a pair of blocks: \p Block1 and \p Block2, which
+  /// are from two if-regions whose entry blocks are \p Head1 and \p
+  /// Head2.  \returns true if \p Block1 and \p Block2 contain identical
+  /// instructions, and have no memory reference alias with \p Head2.
+  /// This is used as a legality check for merging if-regions.
+  bool CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
+                            BasicBlock *Block1, BasicBlock *Block2);
+
+public:
+  FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {}
+  bool run(BasicBlock *BB);
+};
+}
+
+/// If \param [in] BB has more than one predecessor that is a conditional
+/// branch, attempt to use parallel and/or for the branch condition. \returns
+/// true on success.
+///
+/// Before:
+///   ......
+///   %cmp10 = fcmp une float %tmp1, %tmp2
+///   br i1 %cmp1, label %if.then, label %lor.rhs
+///
+/// lor.rhs:
+///   ......
+///   %cmp11 = fcmp une float %tmp3, %tmp4
+///   br i1 %cmp11, label %if.then, label %ifend
+///
+/// if.end:  // the merge block
+///   ......
+///
+/// if.then: // has two predecessors, both of them contains conditional branch.
+///   ......
+///   br label %if.end;
+///
+/// After:
+///  ......
+///  %cmp10 = fcmp une float %tmp1, %tmp2
+///  ......
+///  %cmp11 = fcmp une float %tmp3, %tmp4
+///  %cmp12 = or i1 %cmp10, %cmp11    // parallel-or mode.
+///  br i1 %cmp12, label %if.then, label %ifend
+///
+///  if.end:
+///    ......
+///
+///  if.then:
+///    ......
+///    br label %if.end;
+///
+///  Current implementation handles two cases.
+///  Case 1: \param BB is on the else-path.
+///
+///          BB1
+///        /     |
+///       BB2    |
+///      /   \   |
+///     BB3   \  |     where, BB1, BB2 contain conditional branches.
+///      \    |  /     BB3 contains unconditional branch.
+///       \   | /      BB4 corresponds to \param BB which is also the merge.
+///  BB => BB4
+///
+///
+///  Corresponding source code:
+///
+///  if (a == b && c == d)
+///    statement; // BB3
+///
+///  Case 2: \param BB BB is on the then-path.
+///
+///             BB1
+///          /      |
+///         |      BB2
+///         \    /    |  where BB1, BB2 contain conditional branches.
+///  BB =>   BB3      |  BB3 contains unconditiona branch and corresponds
+///           \     /    to \param BB.  BB4 is the merge.
+///             BB4
+///
+///  Corresponding source code:
+///
+///  if (a == b || c == d)
+///    statement;  // BB3
+///
+///  In both cases,  \param BB is the common successor of conditional branches.
+///  In Case 1, \param BB (BB4) has an unconditional branch (BB3) as
+///  its predecessor.  In Case 2, \param BB (BB3) only has conditional branches
+///  as its predecessors.
+///
+bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder,
+                                         Pass *P) {
+  PHINode *PHI = dyn_cast<PHINode>(BB->begin());
+  if (PHI)
+    return false; // For simplicity, avoid cases containing PHI nodes.
+
+  BasicBlock *LastCondBlock = NULL;
+  BasicBlock *FirstCondBlock = NULL;
+  BasicBlock *UnCondBlock = NULL;
+  int Idx = -1;
+
+  // Check predecessors of \param BB.
+  SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
+  for (SmallPtrSetIterator<BasicBlock *> PI = Preds.begin(), PE = Preds.end();
+       PI != PE; ++PI) {
+    BasicBlock *Pred = *PI;
+    BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator());
+
+    // All predecessors should terminate with a branch.
+    if (!PBI)
+      return false;
+
+    BasicBlock *PP = Pred->getSinglePredecessor();
+
+    if (PBI->isUnconditional()) {
+      // Case 1: Pred (BB3) is an unconditional block, it should
+      // have a single predecessor (BB2) that is also a predecessor
+      // of \param BB (BB4) and should not have address-taken.
+      // There should exist only one such unconditional
+      // branch among the predecessors.
+      if (UnCondBlock || !PP || (Preds.count(PP) == 0) ||
+          Pred->hasAddressTaken())
+        return false;
+
+      UnCondBlock = Pred;
+      continue;
+    }
+
+    // Only conditional branches are allowed beyond this point.
+    assert(PBI->isConditional());
+
+    // Condition's unique use should be the branch instruction.
+    Value *PC = PBI->getCondition();
+    if (!PC || !PC->hasOneUse())
+      return false;
+
+    if (PP && Preds.count(PP)) {
+      // These are internal condition blocks to be merged from, e.g.,
+      // BB2 in both cases.
+      // Should not be address-taken.
+      if (Pred->hasAddressTaken())
+        return false;
+
+      // Instructions in the internal condition blocks should be safe
+      // to hoist up.
+      for (BasicBlock::iterator BI = Pred->begin(), BE = PBI; BI != BE;) {
+        Instruction *CI = BI++;
+        if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI))
+          return false;
+      }
+    } else {
+      // This is the condition block to be merged into, e.g. BB1 in
+      // both cases.
+      if (FirstCondBlock)
+        return false;
+      FirstCondBlock = Pred;
+    }
+
+    // Find whether BB is uniformly on the true (or false) path
+    // for all of its predecessors.
+    BasicBlock *PS1 = PBI->getSuccessor(0);
+    BasicBlock *PS2 = PBI->getSuccessor(1);
+    BasicBlock *PS = (PS1 == BB) ? PS2 : PS1;
+    int CIdx = (PS1 == BB) ? 0 : 1;
+
+    if (Idx == -1)
+      Idx = CIdx;
+    else if (CIdx != Idx)
+      return false;
+
+    // PS is the successor which is not BB. Check successors to identify
+    // the last conditional branch.
+    if (Preds.count(PS) == 0) {
+      // Case 2.
+      LastCondBlock = Pred;
+    } else {
+      // Case 1
+      BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator());
+      if (BPS && BPS->isUnconditional()) {
+        // Case 1: PS(BB3) should be an unconditional branch.
+        LastCondBlock = Pred;
+      }
+    }
+  }
+
+  if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
+    return false;
+
+  TerminatorInst *TBB = LastCondBlock->getTerminator();
+  BasicBlock *PS1 = TBB->getSuccessor(0);
+  BasicBlock *PS2 = TBB->getSuccessor(1);
+  BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
+  BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator());
+
+  // If PS1 does not jump into PS2, but PS2 jumps into PS1,
+  // attempt branch inversion.
+  if (!PBI1 || !PBI1->isUnconditional() ||
+      (PS1->getTerminator()->getSuccessor(0) != PS2)) {
+    // Check whether PS2 jumps into PS1.
+    if (!PBI2 || !PBI2->isUnconditional() ||
+        (PS2->getTerminator()->getSuccessor(0) != PS1))
+      return false;
+
+    // Do branch inversion.
+    BasicBlock *CurrBlock = LastCondBlock;
+    bool EverChanged = false;
+    while (1) {
+      BranchInst *BI = dyn_cast<BranchInst>(CurrBlock->getTerminator());
+      CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+      CmpInst::Predicate Predicate = CI->getPredicate();
+      // Cannonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq
+      if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) {
+        CI->setPredicate(ICmpInst::getInversePredicate(Predicate));
+        BI->swapSuccessors();
+        EverChanged = true;
+      }
+      if (CurrBlock == FirstCondBlock)
+        break;
+      CurrBlock = CurrBlock->getSinglePredecessor();
+    }
+    return EverChanged;
+  }
+
+  // PS1 must have a conditional branch.
+  if (!PBI1 || !PBI1->isUnconditional())
+    return false;
+
+  // PS2 should not contain PHI node.
+  PHI = dyn_cast<PHINode>(PS2->begin());
+  if (PHI)
+    return false;
+
+  // Do the transformation.
+  BasicBlock *CB;
+  BranchInst *PBI = dyn_cast<BranchInst>(FirstCondBlock->getTerminator());
+  bool Iteration = true;
+  IRBuilder<>::InsertPointGuard Guard(Builder);
+  Value *PC = PBI->getCondition();
+
+  do {
+    CB = PBI->getSuccessor(1 - Idx);
+    // Delete the conditional branch.
+    FirstCondBlock->getInstList().pop_back();
+    FirstCondBlock->getInstList()
+        .splice(FirstCondBlock->end(), CB->getInstList());
+    PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+    Value *CC = PBI->getCondition();
+    // Merge conditions.
+    Builder.SetInsertPoint(PBI);
+    Value *NC;
+    if (Idx == 0)
+      // Case 2, use parallel or.
+      NC = Builder.CreateOr(PC, CC);
+    else
+      // Case 1, use parallel and.
+      NC = Builder.CreateAnd(PC, CC);
+
+    PBI->replaceUsesOfWith(CC, NC);
+    PC = NC;
+    if (CB == LastCondBlock)
+      Iteration = false;
+    // Remove internal conditional branches.
+    CB->dropAllReferences();
+    // make CB unreachable and let downstream to delete the block.
+    new UnreachableInst(CB->getContext(), CB);
+  } while (Iteration);
+
+  DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
+  return true;
+}
+
+/// Compare blocks from two if-regions, where \param Head1 is the entry of the
+/// 1st if-region. \param Head2 is the entry of the 2nd if-region. \param
+/// Block1 is a block in the 1st if-region to compare. \param Block2 is a block
+//  in the 2nd if-region to compare.  \returns true if \param Block1 and \param
+/// Block2 have identical instructions and do not have memory reference alias
+/// with \param Head2.
+///
+bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
+                                         BasicBlock *Block1,
+                                         BasicBlock *Block2) {
+  TerminatorInst *PTI2 = Head2->getTerminator();
+  Instruction *PBI2 = Head2->begin();
+
+  bool eq1 = (Block1 == Head1);
+  bool eq2 = (Block2 == Head2);
+  if (eq1 || eq2) {
+    // An empty then-path or else-path.
+    return (eq1 == eq2);
+  }
+
+  // Check whether instructions in Block1 and Block2 are identical
+  // and do not alias with instructions in Head2.
+  BasicBlock::iterator iter1 = Block1->begin();
+  BasicBlock::iterator end1 = Block1->getTerminator();
+  BasicBlock::iterator iter2 = Block2->begin();
+  BasicBlock::iterator end2 = Block2->getTerminator();
+
+  while (1) {
+    if (iter1 == end1) {
+      if (iter2 != end2)
+        return false;
+      break;
+    }
+
+    if (!iter1->isIdenticalTo(iter2))
+      return false;
+
+    // Illegal to remove instructions with side effects except
+    // non-volatile stores.
+    if (iter1->mayHaveSideEffects()) {
+      Instruction *CurI = &*iter1;
+      StoreInst *SI = dyn_cast<StoreInst>(CurI);
+      if (!SI || SI->isVolatile())
+        return false;
+    }
+
+    // For simplicity and speed, data dependency check can be
+    // avoided if read from memory doesn't exist.
+    if (iter1->mayReadFromMemory())
+      return false;
+
+    if (iter1->mayWriteToMemory()) {
+      for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) {
+        if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) {
+          // Check alias with Head2.
+          if (!AA || AA->alias(iter1, BI))
+            return false;
+        }
+      }
+    }
+    ++iter1;
+    ++iter2;
+  }
+
+  return true;
+}
+
+/// Check whether \param BB is the merge block of a if-region.  If yes, check
+/// whether there exists an adjacent if-region upstream, the two if-regions
+/// contain identical instructions and can be legally merged.  \returns true if
+/// the two if-regions are merged.
+///
+/// From:
+/// if (a)
+///   statement;
+/// if (b)
+///   statement;
+///
+/// To:
+/// if (a || b)
+///   statement;
+///
+bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder,
+                                  Pass *P) {
+  BasicBlock *IfTrue2, *IfFalse2;
+  Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
+  Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2);
+  if (!CInst2)
+    return false;
+
+  BasicBlock *SecondEntryBlock = CInst2->getParent();
+  if (SecondEntryBlock->hasAddressTaken())
+    return false;
+
+  BasicBlock *IfTrue1, *IfFalse1;
+  Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
+  Instruction *CInst1 = dyn_cast_or_null<Instruction>(IfCond1);
+  if (!CInst1)
+    return false;
+
+  BasicBlock *FirstEntryBlock = CInst1->getParent();
+
+  // Either then-path or else-path should be empty.
+  if ((IfTrue1 != FirstEntryBlock) && (IfFalse1 != FirstEntryBlock))
+    return false;
+  if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock))
+    return false;
+
+  TerminatorInst *PTI2 = SecondEntryBlock->getTerminator();
+  Instruction *PBI2 = SecondEntryBlock->begin();
+
+  if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1,
+                            IfTrue2))
+    return false;
+
+  if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfFalse1,
+                            IfFalse2))
+    return false;
+
+  // Check whether \param SecondEntryBlock has side-effect and is safe to
+  // speculate.
+  for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) {
+    Instruction *CI = BI;
+    if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||
+        !isSafeToSpeculativelyExecute(CI))
+      return false;
+  }
+
+  // Merge \param SecondEntryBlock into \param FirstEntryBlock.
+  FirstEntryBlock->getInstList().pop_back();
+  FirstEntryBlock->getInstList()
+      .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
+  BranchInst *PBI = dyn_cast<BranchInst>(FirstEntryBlock->getTerminator());
+  Value *CC = PBI->getCondition();
+  BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+  BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+  Builder.SetInsertPoint(PBI);
+  Value *NC = Builder.CreateOr(CInst1, CC);
+  PBI->replaceUsesOfWith(CC, NC);
+  Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
+
+  // Remove IfTrue1
+  if (IfTrue1 != FirstEntryBlock) {
+    IfTrue1->dropAllReferences();
+    IfTrue1->eraseFromParent();
+  }
+
+  // Remove IfFalse1
+  if (IfFalse1 != FirstEntryBlock) {
+    IfFalse1->dropAllReferences();
+    IfFalse1->eraseFromParent();
+  }
+
+  // Remove \param SecondEntryBlock
+  SecondEntryBlock->dropAllReferences();
+  SecondEntryBlock->eraseFromParent();
+  DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
+  return true;
+}
+
+bool FlattenCFGOpt::run(BasicBlock *BB) {
+  bool Changed = false;
+  assert(BB && BB->getParent() && "Block not embedded in function!");
+  assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+  IRBuilder<> Builder(BB);
+
+  if (FlattenParallelAndOr(BB, Builder))
+    return true;
+
+  if (MergeIfRegion(BB, Builder))
+    return true;
+
+  return Changed;
+}
+
+/// FlattenCFG - This function is used to flatten a CFG.  For
+/// example, it uses parallel-and and parallel-or mode to collapse
+//  if-conditions and merge if-regions with identical statements.
+///
+bool llvm::FlattenCFG(BasicBlock *BB, AliasAnalysis *AA) {
+  return FlattenCFGOpt(AA).run(BB);
+}
diff --git a/lib/Transforms/Utils/GlobalStatus.cpp b/lib/Transforms/Utils/GlobalStatus.cpp
new file mode 100644
index 0000000..5f0a563
--- /dev/null
+++ b/lib/Transforms/Utils/GlobalStatus.cpp
@@ -0,0 +1,183 @@
+//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+/// Return the stronger of the two ordering. If the two orderings are acquire
+/// and release, then return AcquireRelease.
+///
+static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+  if (X == Acquire && Y == Release)
+    return AcquireRelease;
+  if (Y == Acquire && X == Release)
+    return AcquireRelease;
+  return (AtomicOrdering)std::max(X, Y);
+}
+
+/// It is safe to destroy a constant iff it is only used by constants itself.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool llvm::isSafeToDestroyConstant(const Constant *C) {
+  if (isa<GlobalValue>(C))
+    return false;
+
+  for (Value::const_use_iterator UI = C->use_begin(), E = C->use_end(); UI != E;
+       ++UI)
+    if (const Constant *CU = dyn_cast<Constant>(*UI)) {
+      if (!isSafeToDestroyConstant(CU))
+        return false;
+    } else
+      return false;
+  return true;
+}
+
+static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
+                             SmallPtrSet<const PHINode *, 16> &PhiUsers) {
+  for (Value::const_use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;
+       ++UI) {
+    const User *U = *UI;
+    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+      GS.HasNonInstructionUser = true;
+
+      // If the result of the constantexpr isn't pointer type, then we won't
+      // know to expect it in various places.  Just reject early.
+      if (!isa<PointerType>(CE->getType()))
+        return true;
+
+      if (analyzeGlobalAux(CE, GS, PhiUsers))
+        return true;
+    } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
+      if (!GS.HasMultipleAccessingFunctions) {
+        const Function *F = I->getParent()->getParent();
+        if (GS.AccessingFunction == 0)
+          GS.AccessingFunction = F;
+        else if (GS.AccessingFunction != F)
+          GS.HasMultipleAccessingFunctions = true;
+      }
+      if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        GS.IsLoaded = true;
+        // Don't hack on volatile loads.
+        if (LI->isVolatile())
+          return true;
+        GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
+      } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Don't allow a store OF the address, only stores TO the address.
+        if (SI->getOperand(0) == V)
+          return true;
+
+        // Don't hack on volatile stores.
+        if (SI->isVolatile())
+          return true;
+
+        GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
+
+        // If this is a direct store to the global (i.e., the global is a scalar
+        // value, not an aggregate), keep more specific information about
+        // stores.
+        if (GS.StoredType != GlobalStatus::Stored) {
+          if (const GlobalVariable *GV =
+                  dyn_cast<GlobalVariable>(SI->getOperand(1))) {
+            Value *StoredVal = SI->getOperand(0);
+
+            if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+              if (C->isThreadDependent()) {
+                // The stored value changes between threads; don't track it.
+                return true;
+              }
+            }
+
+            if (StoredVal == GV->getInitializer()) {
+              if (GS.StoredType < GlobalStatus::InitializerStored)
+                GS.StoredType = GlobalStatus::InitializerStored;
+            } else if (isa<LoadInst>(StoredVal) &&
+                       cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+              if (GS.StoredType < GlobalStatus::InitializerStored)
+                GS.StoredType = GlobalStatus::InitializerStored;
+            } else if (GS.StoredType < GlobalStatus::StoredOnce) {
+              GS.StoredType = GlobalStatus::StoredOnce;
+              GS.StoredOnceValue = StoredVal;
+            } else if (GS.StoredType == GlobalStatus::StoredOnce &&
+                       GS.StoredOnceValue == StoredVal) {
+              // noop.
+            } else {
+              GS.StoredType = GlobalStatus::Stored;
+            }
+          } else {
+            GS.StoredType = GlobalStatus::Stored;
+          }
+        }
+      } else if (isa<BitCastInst>(I)) {
+        if (analyzeGlobalAux(I, GS, PhiUsers))
+          return true;
+      } else if (isa<GetElementPtrInst>(I)) {
+        if (analyzeGlobalAux(I, GS, PhiUsers))
+          return true;
+      } else if (isa<SelectInst>(I)) {
+        if (analyzeGlobalAux(I, GS, PhiUsers))
+          return true;
+      } else if (const PHINode *PN = dyn_cast<PHINode>(I)) {
+        // PHI nodes we can check just like select or GEP instructions, but we
+        // have to be careful about infinite recursion.
+        if (PhiUsers.insert(PN)) // Not already visited.
+          if (analyzeGlobalAux(I, GS, PhiUsers))
+            return true;
+      } else if (isa<CmpInst>(I)) {
+        GS.IsCompared = true;
+      } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+        if (MTI->isVolatile())
+          return true;
+        if (MTI->getArgOperand(0) == V)
+          GS.StoredType = GlobalStatus::Stored;
+        if (MTI->getArgOperand(1) == V)
+          GS.IsLoaded = true;
+      } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+        assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+        if (MSI->isVolatile())
+          return true;
+        GS.StoredType = GlobalStatus::Stored;
+      } else if (ImmutableCallSite C = I) {
+        if (!C.isCallee(UI))
+          return true;
+        GS.IsLoaded = true;
+      } else {
+        return true; // Any other non-load instruction might take address!
+      }
+    } else if (const Constant *C = dyn_cast<Constant>(U)) {
+      GS.HasNonInstructionUser = true;
+      // We might have a dead and dangling constant hanging off of here.
+      if (!isSafeToDestroyConstant(C))
+        return true;
+    } else {
+      GS.HasNonInstructionUser = true;
+      // Otherwise must be some other user.
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
+  SmallPtrSet<const PHINode *, 16> PhiUsers;
+  return analyzeGlobalAux(V, GS, PhiUsers);
+}
+
+GlobalStatus::GlobalStatus()
+    : IsCompared(false), IsLoaded(false), StoredType(NotStored),
+      StoredOnceValue(0), AccessingFunction(0),
+      HasMultipleAccessingFunctions(false), HasNonInstructionUser(false),
+      Ordering(NotAtomic) {}
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index dabb67b9..d021bce 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -193,7 +193,8 @@ static bool HandleCallsInBlockInlinedThroughInvoke(BasicBlock *BB,
     CallInst *CI = dyn_cast<CallInst>(I);
 
     // If this call cannot unwind, don't convert it to an invoke.
-    if (!CI || CI->doesNotThrow())
+    // Inline asm calls cannot throw.
+    if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
       continue;
 
     // Convert this function call into an invoke instruction.  First, split the
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index 2d1b166..f15e8d5 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -55,7 +55,6 @@ namespace {
     DominatorTree *DT;
     LoopInfo *LI;
     ScalarEvolution *SE;
-    std::vector<BasicBlock*> LoopBlocks;
     PredIteratorCache PredCache;
     Loop *L;
     
@@ -82,11 +81,6 @@ namespace {
       // Check the special guarantees that LCSSA makes.
       assert(L->isLCSSAForm(*DT) && "LCSSA form not preserved!");
     }
-
-    /// inLoop - returns true if the given block is within the current loop
-    bool inLoop(BasicBlock *B) const {
-      return std::binary_search(LoopBlocks.begin(), LoopBlocks.end(), B);
-    }
   };
 }
   
@@ -129,11 +123,6 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
   if (ExitBlocks.empty())
     return false;
   
-  // Speed up queries by creating a sorted vector of blocks.
-  LoopBlocks.clear();
-  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());
-  array_pod_sort(LoopBlocks.begin(), LoopBlocks.end());
-  
   // Look at all the instructions in the loop, checking to see if they have uses
   // outside the loop.  If so, rewrite those uses.
   bool MadeChange = false;
@@ -198,7 +187,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
     if (PHINode *PN = dyn_cast<PHINode>(U))
       UserBB = PN->getIncomingBlock(UI);
     
-    if (InstBB != UserBB && !inLoop(UserBB))
+    if (InstBB != UserBB && !L->contains(UserBB))
       UsesToRewrite.push_back(&UI.getUse());
   }
 
@@ -244,7 +233,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
       // If the exit block has a predecessor not within the loop, arrange for
       // the incoming value use corresponding to that predecessor to be
       // rewritten in terms of a different LCSSA PHI.
-      if (!inLoop(*PI))
+      if (!L->contains(*PI))
         UsesToRewrite.push_back(
           &PN->getOperandUse(
             PN->getOperandNumForIncomingValue(PN->getNumIncomingValues()-1)));
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 12e5b3e..2768041 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -16,10 +16,10 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/DIBuilder.h"
 #include "llvm/DebugInfo.h"
@@ -43,6 +43,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
+
 //===----------------------------------------------------------------------===//
 //  Local constant propagation.
 //
@@ -84,7 +86,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
       BI->eraseFromParent();
       return true;
     }
-    
+
     if (Dest2 == Dest1) {       // Conditional branch to same location?
       // This branch matches something like this:
       //     br bool %cond, label %Dest, label %Dest
@@ -104,7 +106,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
     }
     return false;
   }
-  
+
   if (SwitchInst *SI = dyn_cast<SwitchInst>(T)) {
     // If we are switching on a constant, we can convert the switch into a
     // single branch instruction!
@@ -188,38 +190,33 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
         RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
       return true;
     }
-    
+
     if (SI->getNumCases() == 1) {
       // Otherwise, we can fold this switch into a conditional branch
       // instruction if it has only one non-default destination.
       SwitchInst::CaseIt FirstCase = SI->case_begin();
-      IntegersSubset& Case = FirstCase.getCaseValueEx();
-      if (Case.isSingleNumber()) {
-        // FIXME: Currently work with ConstantInt based numbers.
-        Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
-             Case.getSingleNumber(0).toConstantInt(),
-            "cond");
-
-        // Insert the new branch.
-        BranchInst *NewBr = Builder.CreateCondBr(Cond,
-                                FirstCase.getCaseSuccessor(),
-                                SI->getDefaultDest());
-        MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
-        if (MD && MD->getNumOperands() == 3) {
-          ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
-          ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
-          assert(SICase && SIDef);
-          // The TrueWeight should be the weight for the single case of SI.
-          NewBr->setMetadata(LLVMContext::MD_prof,
-                 MDBuilder(BB->getContext()).
-                 createBranchWeights(SICase->getValue().getZExtValue(),
-                                     SIDef->getValue().getZExtValue()));
-        }
+      Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+          FirstCase.getCaseValue(), "cond");
 
-        // Delete the old switch.
-        SI->eraseFromParent();
-        return true;
+      // Insert the new branch.
+      BranchInst *NewBr = Builder.CreateCondBr(Cond,
+                                               FirstCase.getCaseSuccessor(),
+                                               SI->getDefaultDest());
+      MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+      if (MD && MD->getNumOperands() == 3) {
+        ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
+        ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+        assert(SICase && SIDef);
+        // The TrueWeight should be the weight for the single case of SI.
+        NewBr->setMetadata(LLVMContext::MD_prof,
+                        MDBuilder(BB->getContext()).
+                        createBranchWeights(SICase->getValue().getZExtValue(),
+                                            SIDef->getValue().getZExtValue()));
       }
+
+      // Delete the old switch.
+      SI->eraseFromParent();
+      return true;
     }
     return false;
   }
@@ -231,7 +228,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
       BasicBlock *TheOnlyDest = BA->getBasicBlock();
       // Insert the new branch.
       Builder.CreateBr(TheOnlyDest);
-      
+
       for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
         if (IBI->getDestination(i) == TheOnlyDest)
           TheOnlyDest = 0;
@@ -242,7 +239,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
       IBI->eraseFromParent();
       if (DeleteDeadConditions)
         RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
-      
+
       // If we didn't find our destination in the IBI successor list, then we
       // have undefined behavior.  Replace the unconditional branch with an
       // 'unreachable' instruction.
@@ -250,11 +247,11 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
         BB->getTerminator()->eraseFromParent();
         new UnreachableInst(BB->getContext(), BB);
       }
-      
+
       return true;
     }
   }
-  
+
   return false;
 }
 
@@ -321,10 +318,10 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI))
     return false;
-  
+
   SmallVector<Instruction*, 16> DeadInsts;
   DeadInsts.push_back(I);
-  
+
   do {
     I = DeadInsts.pop_back_val();
 
@@ -333,9 +330,9 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
     for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
       Value *OpV = I->getOperand(i);
       I->setOperand(i, 0);
-      
+
       if (!OpV->use_empty()) continue;
-    
+
       // If the operand is an instruction that became dead as we nulled out the
       // operand, and if it is 'trivially' dead, delete it in a future loop
       // iteration.
@@ -343,7 +340,7 @@ llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
         if (isInstructionTriviallyDead(OpI, TLI))
           DeadInsts.push_back(OpI);
     }
-    
+
     I->eraseFromParent();
   } while (!DeadInsts.empty());
 
@@ -415,7 +412,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
     Instruction *Inst = BI++;
 
     WeakVH BIHandle(BI);
-    if (recursivelySimplifyInstruction(Inst, TD)) {
+    if (recursivelySimplifyInstruction(Inst, TD, TLI)) {
       MadeChange = true;
       if (BIHandle != BI)
         BI = BB->begin();
@@ -450,12 +447,12 @@ void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
   // This only adjusts blocks with PHI nodes.
   if (!isa<PHINode>(BB->begin()))
     return;
-  
+
   // Remove the entries for Pred from the PHI nodes in BB, but do not simplify
   // them down.  This will leave us with single entry phi nodes and other phis
   // that can be removed.
   BB->removePredecessor(Pred, true);
-  
+
   WeakVH PhiIt = &BB->front();
   while (PHINode *PN = dyn_cast<PHINode>(PhiIt)) {
     PhiIt = &*++BasicBlock::iterator(cast<Instruction>(PhiIt));
@@ -486,10 +483,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
     PN->replaceAllUsesWith(NewVal);
     PN->eraseFromParent();
   }
-  
+
   BasicBlock *PredBB = DestBB->getSinglePredecessor();
   assert(PredBB && "Block doesn't have a single predecessor!");
-  
+
   // Zap anything that took the address of DestBB.  Not doing this will give the
   // address an invalid value.
   if (DestBB->hasAddressTaken()) {
@@ -500,10 +497,10 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
                                                      BA->getType()));
     BA->destroyConstant();
   }
-  
+
   // Anything that branched to PredBB now branches to DestBB.
   PredBB->replaceAllUsesWith(DestBB);
-  
+
   // Splice all the instructions from PredBB to DestBB.
   PredBB->getTerminator()->eraseFromParent();
   DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
@@ -515,25 +512,27 @@ void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB, Pass *P) {
       DT->changeImmediateDominator(DestBB, PredBBIDom);
       DT->eraseNode(PredBB);
     }
-    ProfileInfo *PI = P->getAnalysisIfAvailable<ProfileInfo>();
-    if (PI) {
-      PI->replaceAllUses(PredBB, DestBB);
-      PI->removeEdge(ProfileInfo::getEdge(PredBB, DestBB));
-    }
   }
   // Nuke BB.
   PredBB->eraseFromParent();
 }
 
+/// CanMergeValues - Return true if we can choose one of these values to use
+/// in place of the other. Note that we will always choose the non-undef
+/// value to keep.
+static bool CanMergeValues(Value *First, Value *Second) {
+  return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
+}
+
 /// CanPropagatePredecessorsForPHIs - Return true if we can fold BB, an
-/// almost-empty BB ending in an unconditional branch to Succ, into succ.
+/// almost-empty BB ending in an unconditional branch to Succ, into Succ.
 ///
 /// Assumption: Succ is the single successor for BB.
 ///
 static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
   assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
 
-  DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into " 
+  DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
         << Succ->getName() << "\n");
   // Shortcut, if there is only a single predecessor it must be BB and merging
   // is always safe
@@ -555,9 +554,10 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
       for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
         BasicBlock *IBB = PN->getIncomingBlock(PI);
         if (BBPreds.count(IBB) &&
-            BBPN->getIncomingValueForBlock(IBB) != PN->getIncomingValue(PI)) {
-          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " 
-                << Succ->getName() << " is conflicting with " 
+            !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
+                            PN->getIncomingValue(PI))) {
+          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
+                << Succ->getName() << " is conflicting with "
                 << BBPN->getName() << " with regard to common predecessor "
                 << IBB->getName() << "\n");
           return false;
@@ -570,8 +570,9 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
         // one for BB, in which case this phi node will not prevent the merging
         // of the block.
         BasicBlock *IBB = PN->getIncomingBlock(PI);
-        if (BBPreds.count(IBB) && Val != PN->getIncomingValue(PI)) {
-          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in " 
+        if (BBPreds.count(IBB) &&
+            !CanMergeValues(Val, PN->getIncomingValue(PI))) {
+          DEBUG(dbgs() << "Can't fold, phi node " << PN->getName() << " in "
                 << Succ->getName() << " is conflicting with regard to common "
                 << "predecessor " << IBB->getName() << "\n");
           return false;
@@ -583,6 +584,139 @@ static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
   return true;
 }
 
+typedef SmallVector<BasicBlock *, 16> PredBlockVector;
+typedef DenseMap<BasicBlock *, Value *> IncomingValueMap;
+
+/// \brief Determines the value to use as the phi node input for a block.
+///
+/// Select between \p OldVal any value that we know flows from \p BB
+/// to a particular phi on the basis of which one (if either) is not
+/// undef. Update IncomingValues based on the selected value.
+///
+/// \param OldVal The value we are considering selecting.
+/// \param BB The block that the value flows in from.
+/// \param IncomingValues A map from block-to-value for other phi inputs
+/// that we have examined.
+///
+/// \returns the selected value.
+static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
+                                          IncomingValueMap &IncomingValues) {
+  if (!isa<UndefValue>(OldVal)) {
+    assert((!IncomingValues.count(BB) ||
+            IncomingValues.find(BB)->second == OldVal) &&
+           "Expected OldVal to match incoming value from BB!");
+
+    IncomingValues.insert(std::make_pair(BB, OldVal));
+    return OldVal;
+  }
+
+  IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+  if (It != IncomingValues.end()) return It->second;
+
+  return OldVal;
+}
+
+/// \brief Create a map from block to value for the operands of a
+/// given phi.
+///
+/// Create a map from block to value for each non-undef value flowing
+/// into \p PN.
+///
+/// \param PN The phi we are collecting the map for.
+/// \param IncomingValues [out] The map from block to value for this phi.
+static void gatherIncomingValuesToPhi(PHINode *PN,
+                                      IncomingValueMap &IncomingValues) {
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *BB = PN->getIncomingBlock(i);
+    Value *V = PN->getIncomingValue(i);
+
+    if (!isa<UndefValue>(V))
+      IncomingValues.insert(std::make_pair(BB, V));
+  }
+}
+
+/// \brief Replace the incoming undef values to a phi with the values
+/// from a block-to-value map.
+///
+/// \param PN The phi we are replacing the undefs in.
+/// \param IncomingValues A map from block to value.
+static void replaceUndefValuesInPhi(PHINode *PN,
+                                    const IncomingValueMap &IncomingValues) {
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    Value *V = PN->getIncomingValue(i);
+
+    if (!isa<UndefValue>(V)) continue;
+
+    BasicBlock *BB = PN->getIncomingBlock(i);
+    IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+    if (It == IncomingValues.end()) continue;
+
+    PN->setIncomingValue(i, It->second);
+  }
+}
+
+/// \brief Replace a value flowing from a block to a phi with
+/// potentially multiple instances of that value flowing from the
+/// block's predecessors to the phi.
+///
+/// \param BB The block with the value flowing into the phi.
+/// \param BBPreds The predecessors of BB.
+/// \param PN The phi that we are updating.
+static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
+                                                const PredBlockVector &BBPreds,
+                                                PHINode *PN) {
+  Value *OldVal = PN->removeIncomingValue(BB, false);
+  assert(OldVal && "No entry in PHI for Pred BB!");
+
+  IncomingValueMap IncomingValues;
+
+  // We are merging two blocks - BB, and the block containing PN - and
+  // as a result we need to redirect edges from the predecessors of BB
+  // to go to the block containing PN, and update PN
+  // accordingly. Since we allow merging blocks in the case where the
+  // predecessor and successor blocks both share some predecessors,
+  // and where some of those common predecessors might have undef
+  // values flowing into PN, we want to rewrite those values to be
+  // consistent with the non-undef values.
+
+  gatherIncomingValuesToPhi(PN, IncomingValues);
+
+  // If this incoming value is one of the PHI nodes in BB, the new entries
+  // in the PHI node are the entries from the old PHI.
+  if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+    PHINode *OldValPN = cast<PHINode>(OldVal);
+    for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) {
+      // Note that, since we are merging phi nodes and BB and Succ might
+      // have common predecessors, we could end up with a phi node with
+      // identical incoming branches. This will be cleaned up later (and
+      // will trigger asserts if we try to clean it up now, without also
+      // simplifying the corresponding conditional branch).
+      BasicBlock *PredBB = OldValPN->getIncomingBlock(i);
+      Value *PredVal = OldValPN->getIncomingValue(i);
+      Value *Selected = selectIncomingValueForBlock(PredVal, PredBB,
+                                                    IncomingValues);
+
+      // And add a new incoming value for this predecessor for the
+      // newly retargeted branch.
+      PN->addIncoming(Selected, PredBB);
+    }
+  } else {
+    for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) {
+      // Update existing incoming values in PN for this
+      // predecessor of BB.
+      BasicBlock *PredBB = BBPreds[i];
+      Value *Selected = selectIncomingValueForBlock(OldVal, PredBB,
+                                                    IncomingValues);
+
+      // And add a new incoming value for this predecessor for the
+      // newly retargeted branch.
+      PN->addIncoming(Selected, PredBB);
+    }
+  }
+
+  replaceUndefValuesInPhi(PN, IncomingValues);
+}
+
 /// TryToSimplifyUncondBranchFromEmptyBlock - BB is known to contain an
 /// unconditional branch, and contains no instructions other than PHI nodes,
 /// potential side-effect free intrinsics and the branch.  If possible,
@@ -595,7 +729,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
   // We can't eliminate infinite loops.
   BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
   if (BB == Succ) return false;
-  
+
   // Check to see if merging these blocks would cause conflicts for any of the
   // phi nodes in BB or Succ. If not, we can safely merge.
   if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
@@ -629,39 +763,21 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
   }
 
   DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
-  
+
   if (isa<PHINode>(Succ->begin())) {
     // If there is more than one pred of succ, and there are PHI nodes in
     // the successor, then we need to add incoming edges for the PHI nodes
     //
-    const SmallVector<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
-    
+    const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB));
+
     // Loop over all of the PHI nodes in the successor of BB.
     for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
       PHINode *PN = cast<PHINode>(I);
-      Value *OldVal = PN->removeIncomingValue(BB, false);
-      assert(OldVal && "No entry in PHI for Pred BB!");
-      
-      // If this incoming value is one of the PHI nodes in BB, the new entries
-      // in the PHI node are the entries from the old PHI.
-      if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
-        PHINode *OldValPN = cast<PHINode>(OldVal);
-        for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i)
-          // Note that, since we are merging phi nodes and BB and Succ might
-          // have common predecessors, we could end up with a phi node with
-          // identical incoming branches. This will be cleaned up later (and
-          // will trigger asserts if we try to clean it up now, without also
-          // simplifying the corresponding conditional branch).
-          PN->addIncoming(OldValPN->getIncomingValue(i),
-                          OldValPN->getIncomingBlock(i));
-      } else {
-        // Add an incoming value for each of the new incoming values.
-        for (unsigned i = 0, e = BBPreds.size(); i != e; ++i)
-          PN->addIncoming(OldVal, BBPreds[i]);
-      }
+
+      redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN);
     }
   }
-  
+
   if (Succ->getSinglePredecessor()) {
     // BB is the only predecessor of Succ, so Succ will end up with exactly
     // the same predecessors BB had.
@@ -676,7 +792,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
       PN->eraseFromParent();
     }
   }
-    
+
   // Everything that jumped to BB now goes to Succ.
   BB->replaceAllUsesWith(Succ);
   if (!Succ->hasName()) Succ->takeName(BB);
@@ -784,7 +900,7 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
     // the final program then it is impossible for us to reliably enforce the
     // preferred alignment.
     if (GV->isWeakForLinker()) return Align;
-    
+
     if (GV->getAlignment() >= PrefAlign)
       return GV->getAlignment();
     // We can only increase the alignment of the global if it has no alignment
@@ -804,26 +920,27 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
 /// and it is more than the alignment of the ultimate object, see if we can
 /// increase the alignment of the ultimate object, making this check succeed.
 unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
-                                          const DataLayout *TD) {
+                                          const DataLayout *DL) {
   assert(V->getType()->isPointerTy() &&
          "getOrEnforceKnownAlignment expects a pointer!");
-  unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
+  unsigned BitWidth = DL ? DL->getPointerTypeSizeInBits(V->getType()) : 64;
+
   APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
-  ComputeMaskedBits(V, KnownZero, KnownOne, TD);
+  ComputeMaskedBits(V, KnownZero, KnownOne, DL);
   unsigned TrailZ = KnownZero.countTrailingOnes();
-  
-  // Avoid trouble with rediculously large TrailZ values, such as
+
+  // Avoid trouble with ridiculously large TrailZ values, such as
   // those computed from a null pointer.
   TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1));
-  
+
   unsigned Align = 1u << std::min(BitWidth - 1, TrailZ);
-  
+
   // LLVM doesn't support alignments larger than this currently.
   Align = std::min(Align, +Value::MaximumAlignment);
-  
+
   if (PrefAlign > Align)
-    Align = enforceKnownAlignment(V, Align, PrefAlign, TD);
-    
+    Align = enforceKnownAlignment(V, Align, PrefAlign, DL);
+
   // We don't need to make any adjustment.
   return Align;
 }
@@ -854,7 +971,9 @@ static bool LdStHasDebugValue(DIVariable &DIVar, Instruction *I) {
 bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
                                            StoreInst *SI, DIBuilder &Builder) {
   DIVariable DIVar(DDI->getVariable());
-  if (!DIVar.Verify())
+  assert((!DIVar || DIVar.isVariable()) &&
+         "Variable in DbgDeclareInst should be either null or a DIVariable.");
+  if (!DIVar)
     return false;
 
   if (LdStHasDebugValue(DIVar, SI))
@@ -888,16 +1007,18 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
 bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI,
                                            LoadInst *LI, DIBuilder &Builder) {
   DIVariable DIVar(DDI->getVariable());
-  if (!DIVar.Verify())
+  assert((!DIVar || DIVar.isVariable()) &&
+         "Variable in DbgDeclareInst should be either null or a DIVariable.");
+  if (!DIVar)
     return false;
 
   if (LdStHasDebugValue(DIVar, LI))
     return true;
 
-  Instruction *DbgVal = 
+  Instruction *DbgVal =
     Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0,
                                     DIVar, LI);
-  
+
   // Propagate any debug metadata from the store onto the dbg.value.
   DebugLoc LIDL = LI->getDebugLoc();
   if (!LIDL.isUnknown())
@@ -921,10 +1042,14 @@ bool llvm::LowerDbgDeclare(Function &F) {
   if (Dbgs.empty())
     return false;
 
-  for (SmallVector<DbgDeclareInst *, 4>::iterator I = Dbgs.begin(),
+  for (SmallVectorImpl<DbgDeclareInst *>::iterator I = Dbgs.begin(),
          E = Dbgs.end(); I != E; ++I) {
     DbgDeclareInst *DDI = *I;
-    if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) {
+    AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+    // If this is an alloca for a scalar variable, insert a dbg.value
+    // at each load and store to the alloca and erase the dbg.declare.
+    if (AI && !AI->isArrayAllocation()) {
+
       // We only remove the dbg.declare intrinsic if all uses are
       // converted to dbg.value intrinsics.
       bool RemoveDDI = true;
@@ -961,7 +1086,9 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
   if (!DDI)
     return false;
   DIVariable DIVar(DDI->getVariable());
-  if (!DIVar.Verify())
+  assert((!DIVar || DIVar.isVariable()) &&
+         "Variable in DbgDeclareInst should be either null or a DIVariable.");
+  if (!DIVar)
     return false;
 
   // Create a copy of the original DIDescriptor for user variable, appending
@@ -990,33 +1117,153 @@ bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
   return true;
 }
 
-bool llvm::removeUnreachableBlocks(Function &F) {
-  SmallPtrSet<BasicBlock*, 16> Reachable;
+/// changeToUnreachable - Insert an unreachable instruction before the specified
+/// instruction, making it and the rest of the code in the block dead.
+static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+  BasicBlock *BB = I->getParent();
+  // Loop over all of the successors, removing BB's entry from any PHI
+  // nodes.
+  for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+    (*SI)->removePredecessor(BB);
+
+  // Insert a call to llvm.trap right before this.  This turns the undefined
+  // behavior into a hard fail instead of falling through into random code.
+  if (UseLLVMTrap) {
+    Function *TrapFn =
+      Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
+    CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+    CallTrap->setDebugLoc(I->getDebugLoc());
+  }
+  new UnreachableInst(I->getContext(), I);
+
+  // All instructions after this are dead.
+  BasicBlock::iterator BBI = I, BBE = BB->end();
+  while (BBI != BBE) {
+    if (!BBI->use_empty())
+      BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+    BB->getInstList().erase(BBI++);
+  }
+}
+
+/// changeToCall - Convert the specified invoke into a normal call.
+static void changeToCall(InvokeInst *II) {
+  SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
+  CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
+  NewCall->takeName(II);
+  NewCall->setCallingConv(II->getCallingConv());
+  NewCall->setAttributes(II->getAttributes());
+  NewCall->setDebugLoc(II->getDebugLoc());
+  II->replaceAllUsesWith(NewCall);
+
+  // Follow the call by a branch to the normal destination.
+  BranchInst::Create(II->getNormalDest(), II);
+
+  // Update PHI nodes in the unwind destination
+  II->getUnwindDest()->removePredecessor(II->getParent());
+  II->eraseFromParent();
+}
+
+static bool markAliveBlocks(BasicBlock *BB,
+                            SmallPtrSet<BasicBlock*, 128> &Reachable) {
+
   SmallVector<BasicBlock*, 128> Worklist;
-  Worklist.push_back(&F.getEntryBlock());
-  Reachable.insert(&F.getEntryBlock());
+  Worklist.push_back(BB);
+  Reachable.insert(BB);
+  bool Changed = false;
   do {
-    BasicBlock *BB = Worklist.pop_back_val();
+    BB = Worklist.pop_back_val();
+
+    // Do a quick scan of the basic block, turning any obviously unreachable
+    // instructions into LLVM unreachable insts.  The instruction combining pass
+    // canonicalizes unreachable insts into stores to null or undef.
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;++BBI){
+      if (CallInst *CI = dyn_cast<CallInst>(BBI)) {
+        if (CI->doesNotReturn()) {
+          // If we found a call to a no-return function, insert an unreachable
+          // instruction after it.  Make sure there isn't *already* one there
+          // though.
+          ++BBI;
+          if (!isa<UnreachableInst>(BBI)) {
+            // Don't insert a call to llvm.trap right before the unreachable.
+            changeToUnreachable(BBI, false);
+            Changed = true;
+          }
+          break;
+        }
+      }
+
+      // Store to undef and store to null are undefined and used to signal that
+      // they should be changed to unreachable by passes that can't modify the
+      // CFG.
+      if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
+        // Don't touch volatile stores.
+        if (SI->isVolatile()) continue;
+
+        Value *Ptr = SI->getOperand(1);
+
+        if (isa<UndefValue>(Ptr) ||
+            (isa<ConstantPointerNull>(Ptr) &&
+             SI->getPointerAddressSpace() == 0)) {
+          changeToUnreachable(SI, true);
+          Changed = true;
+          break;
+        }
+      }
+    }
+
+    // Turn invokes that call 'nounwind' functions into ordinary calls.
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
+      Value *Callee = II->getCalledValue();
+      if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+        changeToUnreachable(II, true);
+        Changed = true;
+      } else if (II->doesNotThrow()) {
+        if (II->use_empty() && II->onlyReadsMemory()) {
+          // jump to the normal destination branch.
+          BranchInst::Create(II->getNormalDest(), II);
+          II->getUnwindDest()->removePredecessor(II->getParent());
+          II->eraseFromParent();
+        } else
+          changeToCall(II);
+        Changed = true;
+      }
+    }
+
+    Changed |= ConstantFoldTerminator(BB, true);
     for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
       if (Reachable.insert(*SI))
         Worklist.push_back(*SI);
   } while (!Worklist.empty());
+  return Changed;
+}
+
+/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// if they are in a dead cycle.  Return true if a change was made, false
+/// otherwise.
+bool llvm::removeUnreachableBlocks(Function &F) {
+  SmallPtrSet<BasicBlock*, 128> Reachable;
+  bool Changed = markAliveBlocks(F.begin(), Reachable);
 
+  // If there are unreachable blocks in the CFG...
   if (Reachable.size() == F.size())
-    return false;
+    return Changed;
 
   assert(Reachable.size() < F.size());
-  for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
-    if (Reachable.count(I))
+  NumRemoved += F.size()-Reachable.size();
+
+  // Loop over all of the basic blocks that are not reachable, dropping all of
+  // their internal references...
+  for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) {
+    if (Reachable.count(BB))
       continue;
 
-    for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
       if (Reachable.count(*SI))
-        (*SI)->removePredecessor(I);
-    I->dropAllReferences();
+        (*SI)->removePredecessor(BB);
+    BB->dropAllReferences();
   }
 
-  for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;)
+  for (Function::iterator I = ++F.begin(); I != F.end();)
     if (!Reachable.count(I))
       I = F.getBasicBlockList().erase(I);
     else
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 37819cc..6d5f16c 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -59,6 +59,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 using namespace llvm;
 
 STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
@@ -100,16 +101,16 @@ namespace {
   private:
     bool ProcessLoop(Loop *L, LPPassManager &LPM);
     BasicBlock *RewriteLoopExitBlock(Loop *L, BasicBlock *Exit);
-    BasicBlock *InsertPreheaderForLoop(Loop *L);
     Loop *SeparateNestedLoop(Loop *L, LPPassManager &LPM,
                              BasicBlock *Preheader);
     BasicBlock *InsertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader);
-    void PlaceSplitBlockCarefully(BasicBlock *NewBB,
-                                  SmallVectorImpl<BasicBlock*> &SplitPreds,
-                                  Loop *L);
   };
 }
 
+static void PlaceSplitBlockCarefully(BasicBlock *NewBB,
+                                     SmallVectorImpl<BasicBlock*> &SplitPreds,
+                                     Loop *L);
+
 char LoopSimplify::ID = 0;
 INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
                 "Canonicalize natural loops", true, false)
@@ -208,7 +209,7 @@ ReprocessLoop:
   // Does the loop already have a preheader?  If so, don't insert one.
   BasicBlock *Preheader = L->getLoopPreheader();
   if (!Preheader) {
-    Preheader = InsertPreheaderForLoop(L);
+    Preheader = InsertPreheaderForLoop(L, this);
     if (Preheader) {
       ++NumInserted;
       Changed = true;
@@ -367,7 +368,7 @@ ReprocessLoop:
 /// preheader, this method is called to insert one.  This method has two phases:
 /// preheader insertion and analysis updating.
 ///
-BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) {
+BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, Pass *PP) {
   BasicBlock *Header = L->getHeader();
 
   // Compute the set of predecessors of the loop that are not in the loop.
@@ -390,11 +391,11 @@ BasicBlock *LoopSimplify::InsertPreheaderForLoop(Loop *L) {
   BasicBlock *PreheaderBB;
   if (!Header->isLandingPad()) {
     PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader",
-                                         this);
+                                         PP);
   } else {
     SmallVector<BasicBlock*, 2> NewBBs;
     SplitLandingPadPredecessors(Header, OutsideBlocks, ".preheader",
-                                ".split-lp", this, NewBBs);
+                                ".split-lp", PP, NewBBs);
     PreheaderBB = NewBBs[0];
   }
 
@@ -491,9 +492,9 @@ static PHINode *FindPHIToPartitionLoops(Loop *L, DominatorTree *DT,
 // PlaceSplitBlockCarefully - If the block isn't already, move the new block to
 // right after some 'outside block' block.  This prevents the preheader from
 // being placed inside the loop body, e.g. when the loop hasn't been rotated.
-void LoopSimplify::PlaceSplitBlockCarefully(BasicBlock *NewBB,
-                                       SmallVectorImpl<BasicBlock*> &SplitPreds,
-                                            Loop *L) {
+void PlaceSplitBlockCarefully(BasicBlock *NewBB,
+                              SmallVectorImpl<BasicBlock*> &SplitPreds,
+                              Loop *L) {
   // Check to see if NewBB is already well placed.
   Function::iterator BBI = NewBB; --BBI;
   for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index cb581b3..162807d 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -90,7 +90,8 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
   // Move all definitions in the successor to the predecessor...
   OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
 
-  std::string OldName = BB->getName();
+  // OldName will be valid until erased.
+  StringRef OldName = BB->getName();
 
   // Erase basic block from the function...
 
@@ -102,12 +103,13 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI,
     }
   }
   LI->removeBlock(BB);
-  BB->eraseFromParent();
 
   // Inherit predecessor's name if it exists...
   if (!OldName.empty() && !OnlyPred->hasName())
     OnlyPred->setName(OldName);
 
+  BB->eraseFromParent();
+
   return OnlyPred;
 }
 
@@ -239,8 +241,6 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount,
     DEBUG(dbgs() << "!\n");
   }
 
-  std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
-
   bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
   BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
 
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index 4aee8ff..e017f50 100644
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -29,7 +29,7 @@
 
 using namespace llvm;
 
-STATISTIC(IfHandled, "Number of 'expect' intrinsic intructions handled");
+STATISTIC(IfHandled, "Number of 'expect' intrinsic instructions handled");
 
 static cl::opt<uint32_t>
 LikelyBranchWeight("likely-branch-weight", cl::Hidden, cl::init(64),
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index 9ec84d7..9799a30 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -61,6 +61,8 @@ static cl::opt<bool> ExpensiveEHSupport("enable-correct-eh-support",
 
 namespace {
   class LowerInvoke : public FunctionPass {
+    const TargetMachine *TM;
+
     // Used for both models.
     Constant *AbortFn;
 
@@ -70,15 +72,12 @@ namespace {
     Constant *SetJmpFn, *LongJmpFn, *StackSaveFn, *StackRestoreFn;
     bool useExpensiveEHSupport;
 
-    // We peek in TLI to grab the target's jmp_buf size and alignment
-    const TargetLowering *TLI;
-
   public:
     static char ID; // Pass identification, replacement for typeid
-    explicit LowerInvoke(const TargetLowering *tli = NULL,
+    explicit LowerInvoke(const TargetMachine *TM = 0,
                          bool useExpensiveEHSupport = ExpensiveEHSupport)
-      : FunctionPass(ID), useExpensiveEHSupport(useExpensiveEHSupport),
-        TLI(tli) {
+      : FunctionPass(ID), TM(TM),
+        useExpensiveEHSupport(useExpensiveEHSupport) {
       initializeLowerInvokePass(*PassRegistry::getPassRegistry());
     }
     bool doInitialization(Module &M);
@@ -108,12 +107,9 @@ INITIALIZE_PASS(LowerInvoke, "lowerinvoke",
 char &llvm::LowerInvokePassID = LowerInvoke::ID;
 
 // Public Interface To the LowerInvoke pass.
-FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI) {
-  return new LowerInvoke(TLI, ExpensiveEHSupport);
-}
-FunctionPass *llvm::createLowerInvokePass(const TargetLowering *TLI,
+FunctionPass *llvm::createLowerInvokePass(const TargetMachine *TM,
                                           bool useExpensiveEHSupport) {
-  return new LowerInvoke(TLI, useExpensiveEHSupport);
+  return new LowerInvoke(TM, useExpensiveEHSupport || ExpensiveEHSupport);
 }
 
 // doInitialization - Make sure that there is a prototype for abort in the
@@ -122,6 +118,7 @@ bool LowerInvoke::doInitialization(Module &M) {
   Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
   if (useExpensiveEHSupport) {
     // Insert a type for the linked list of jump buffers.
+    const TargetLowering *TLI = TM ? TM->getTargetLowering() : 0;
     unsigned JBSize = TLI ? TLI->getJumpBufSize() : 0;
     JBSize = JBSize ? JBSize : 200;
     Type *JmpBufTy = ArrayType::get(VoidPtrTy, JBSize);
@@ -349,7 +346,6 @@ splitLiveRangesLiveAcrossInvokes(SmallVectorImpl<InvokeInst*> &Invokes) {
       // Scan all of the uses and see if the live range is live across an unwind
       // edge.  If we find a use live across an invoke edge, create an alloca
       // and spill the value.
-      std::set<InvokeInst*> InvokesWithStoreInserted;
 
       // Find all of the blocks that this value is live in.
       std::set<BasicBlock*> LiveBBs;
@@ -430,6 +426,7 @@ bool LowerInvoke::insertExpensiveEHSupport(Function &F) {
     // Create an alloca for the incoming jump buffer ptr and the new jump buffer
     // that needs to be restored on all exits from the function.  This is an
     // alloca because the value needs to be live across invokes.
+    const TargetLowering *TLI = TM ? TM->getTargetLowering() : 0;
     unsigned Align = TLI ? TLI->getJumpBufAlignment() : 0;
     AllocaInst *JmpBuf =
       new AllocaInst(JBLinkTy, 0, Align,
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 955b853..2d2a8a5 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -66,6 +66,18 @@ namespace {
                              BasicBlock* OrigBlock, BasicBlock* Default);
     unsigned Clusterify(CaseVector& Cases, SwitchInst *SI);
   };
+
+  /// The comparison function for sorting the switch case values in the vector.
+  /// WARNING: Case ranges should be disjoint!
+  struct CaseCmp {
+    bool operator () (const LowerSwitch::CaseRange& C1,
+                      const LowerSwitch::CaseRange& C2) {
+
+      const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low);
+      const ConstantInt* CI2 = cast<const ConstantInt>(C2.High);
+      return CI1->getValue().slt(CI2->getValue());
+    }
+  };
 }
 
 char LowerSwitch::ID = 0;
@@ -147,7 +159,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End,
   Function::iterator FI = OrigBlock;
   F->getBasicBlockList().insert(++FI, NewNode);
 
-  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT,
+  ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
                                 Val, Pivot.Low, "Pivot");
   NewNode->getInstList().push_back(Comp);
   BranchInst::Create(LBranch, RBranch, Comp, NewNode);
@@ -222,34 +234,40 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val,
 
 // Clusterify - Transform simple list of Cases into list of CaseRange's
 unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
-
-  IntegersSubsetToBB TheClusterifier;
+  unsigned numCmps = 0;
 
   // Start with "simple" cases
-  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-       i != e; ++i) {
-    BasicBlock *SuccBB = i.getCaseSuccessor();
-    IntegersSubset CaseRanges = i.getCaseValueEx();
-    TheClusterifier.add(CaseRanges, SuccBB);
-  }
-  
-  TheClusterifier.optimize();
+  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
+    Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
+                              i.getCaseSuccessor()));
   
-  size_t numCmps = 0;
-  for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(),
-       e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
-    IntegersSubsetToBB::Cluster &C = *i;
-    
-    // FIXME: Currently work with ConstantInt based numbers.
-    // Changing it to APInt based is a pretty heavy for this commit.
-    Cases.push_back(CaseRange(C.first.getLow().toConstantInt(),
-                              C.first.getHigh().toConstantInt(), C.second));
-    if (C.first.isSingleNumber())
+  std::sort(Cases.begin(), Cases.end(), CaseCmp());
+
+  // Merge case into clusters
+  if (Cases.size()>=2)
+    for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) {
+      int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue();
+      int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue();
+      BasicBlock* nextBB = J->BB;
+      BasicBlock* currentBB = I->BB;
+
+      // If the two neighboring cases go to the same destination, merge them
+      // into a single case.
+      if ((nextValue-currentValue==1) && (currentBB == nextBB)) {
+        I->High = J->High;
+        J = Cases.erase(J);
+      } else {
+        I = J++;
+      }
+    }
+
+  for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) {
+    if (I->Low != I->High)
       // A range counts double, since it requires two compares.
       ++numCmps;
   }
 
-  return numCmps;  
+  return numCmps;
 }
 
 // processSwitchInst - Replace the specified switch instruction with a sequence
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
index 3716f58..c3704531 100644
--- a/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -53,7 +53,7 @@ namespace {
     }
 
     bool runOnModule(Module &M) {
-      static const char *metaNames[] = {
+      static const char *const metaNames[] = {
         // See http://en.wikipedia.org/wiki/Metasyntactic_variable
         "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
         "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index d090b48..ff6e6f9 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
@@ -62,3 +63,20 @@ void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority) {
 void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority) {
   appendToGlobalArray("llvm.global_dtors", M, F, Priority);
 }
+
+GlobalVariable *
+llvm::collectUsedGlobalVariables(Module &M, SmallPtrSet<GlobalValue *, 8> &Set,
+                                 bool CompilerUsed) {
+  const char *Name = CompilerUsed ? "llvm.compiler.used" : "llvm.used";
+  GlobalVariable *GV = M.getGlobalVariable(Name);
+  if (!GV || !GV->hasInitializer())
+    return GV;
+
+  const ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+  for (unsigned I = 0, E = Init->getNumOperands(); I != E; ++I) {
+    Value *Op = Init->getOperand(I);
+    GlobalValue *G = cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases());
+    Set.insert(G);
+  }
+  return GV;
+}
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index de335ec..8f6eee3 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -27,8 +27,8 @@
 
 #define DEBUG_TYPE "mem2reg"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -56,36 +56,13 @@ STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
 STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
 
-namespace llvm {
-template<>
-struct DenseMapInfo<std::pair<BasicBlock*, unsigned> > {
-  typedef std::pair<BasicBlock*, unsigned> EltTy;
-  static inline EltTy getEmptyKey() {
-    return EltTy(reinterpret_cast<BasicBlock*>(-1), ~0U);
-  }
-  static inline EltTy getTombstoneKey() {
-    return EltTy(reinterpret_cast<BasicBlock*>(-2), 0U);
-  }
-  static unsigned getHashValue(const std::pair<BasicBlock*, unsigned> &Val) {
-    using llvm::hash_value;
-    return static_cast<unsigned>(hash_value(Val));
-  }
-  static bool isEqual(const EltTy &LHS, const EltTy &RHS) {
-    return LHS == RHS;
-  }
-};
-}
-
-/// isAllocaPromotable - Return true if this alloca is legal for promotion.
-/// This is true if there are only loads and stores to the alloca.
-///
 bool llvm::isAllocaPromotable(const AllocaInst *AI) {
   // FIXME: If the memory unit is of pointer or integer type, we can permit
   // assignments to subsections of the memory unit.
 
   // Only allow direct and non-volatile loads and stores...
   for (Value::const_use_iterator UI = AI->use_begin(), UE = AI->use_end();
-       UI != UE; ++UI) {   // Loop over all of the uses of the alloca
+       UI != UE; ++UI) { // Loop over all of the uses of the alloca
     const User *U = *UI;
     if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
       // Note that atomic loads can be transformed; atomic semantics do
@@ -94,7 +71,7 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
         return false;
     } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
       if (SI->getOperand(0) == AI)
-        return false;   // Don't allow a store OF the AI, only INTO the AI.
+        return false; // Don't allow a store OF the AI, only INTO the AI.
       // Note that atomic stores can be transformed; atomic semantics do
       // not have any meaning for a local alloca.
       if (SI->isVolatile())
@@ -124,243 +101,217 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) {
 }
 
 namespace {
-  struct AllocaInfo;
-
-  // Data package used by RenamePass()
-  class RenamePassData {
-  public:
-    typedef std::vector<Value *> ValVector;
-    
-    RenamePassData() : BB(NULL), Pred(NULL), Values() {}
-    RenamePassData(BasicBlock *B, BasicBlock *P,
-                   const ValVector &V) : BB(B), Pred(P), Values(V) {}
-    BasicBlock *BB;
-    BasicBlock *Pred;
-    ValVector Values;
-    
-    void swap(RenamePassData &RHS) {
-      std::swap(BB, RHS.BB);
-      std::swap(Pred, RHS.Pred);
-      Values.swap(RHS.Values);
+
+struct AllocaInfo {
+  SmallVector<BasicBlock *, 32> DefiningBlocks;
+  SmallVector<BasicBlock *, 32> UsingBlocks;
+
+  StoreInst *OnlyStore;
+  BasicBlock *OnlyBlock;
+  bool OnlyUsedInOneBlock;
+
+  Value *AllocaPointerVal;
+  DbgDeclareInst *DbgDeclare;
+
+  void clear() {
+    DefiningBlocks.clear();
+    UsingBlocks.clear();
+    OnlyStore = 0;
+    OnlyBlock = 0;
+    OnlyUsedInOneBlock = true;
+    AllocaPointerVal = 0;
+    DbgDeclare = 0;
+  }
+
+  /// Scan the uses of the specified alloca, filling in the AllocaInfo used
+  /// by the rest of the pass to reason about the uses of this alloca.
+  void AnalyzeAlloca(AllocaInst *AI) {
+    clear();
+
+    // As we scan the uses of the alloca instruction, keep track of stores,
+    // and decide whether all of the loads and stores to the alloca are within
+    // the same basic block.
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+         UI != E;) {
+      Instruction *User = cast<Instruction>(*UI++);
+
+      if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+        // Remember the basic blocks which define new values for the alloca
+        DefiningBlocks.push_back(SI->getParent());
+        AllocaPointerVal = SI->getOperand(0);
+        OnlyStore = SI;
+      } else {
+        LoadInst *LI = cast<LoadInst>(User);
+        // Otherwise it must be a load instruction, keep track of variable
+        // reads.
+        UsingBlocks.push_back(LI->getParent());
+        AllocaPointerVal = LI;
+      }
+
+      if (OnlyUsedInOneBlock) {
+        if (OnlyBlock == 0)
+          OnlyBlock = User->getParent();
+        else if (OnlyBlock != User->getParent())
+          OnlyUsedInOneBlock = false;
+      }
     }
-  };
-  
-  /// LargeBlockInfo - This assigns and keeps a per-bb relative ordering of
-  /// load/store instructions in the block that directly load or store an alloca.
+
+    DbgDeclare = FindAllocaDbgDeclare(AI);
+  }
+};
+
+// Data package used by RenamePass()
+class RenamePassData {
+public:
+  typedef std::vector<Value *> ValVector;
+
+  RenamePassData() : BB(NULL), Pred(NULL), Values() {}
+  RenamePassData(BasicBlock *B, BasicBlock *P, const ValVector &V)
+      : BB(B), Pred(P), Values(V) {}
+  BasicBlock *BB;
+  BasicBlock *Pred;
+  ValVector Values;
+
+  void swap(RenamePassData &RHS) {
+    std::swap(BB, RHS.BB);
+    std::swap(Pred, RHS.Pred);
+    Values.swap(RHS.Values);
+  }
+};
+
+/// \brief This assigns and keeps a per-bb relative ordering of load/store
+/// instructions in the block that directly load or store an alloca.
+///
+/// This functionality is important because it avoids scanning large basic
+/// blocks multiple times when promoting many allocas in the same block.
+class LargeBlockInfo {
+  /// \brief For each instruction that we track, keep the index of the
+  /// instruction.
   ///
-  /// This functionality is important because it avoids scanning large basic
-  /// blocks multiple times when promoting many allocas in the same block.
-  class LargeBlockInfo {
-    /// InstNumbers - For each instruction that we track, keep the index of the
-    /// instruction.  The index starts out as the number of the instruction from
-    /// the start of the block.
-    DenseMap<const Instruction *, unsigned> InstNumbers;
-  public:
-    
-    /// isInterestingInstruction - This code only looks at accesses to allocas.
-    static bool isInterestingInstruction(const Instruction *I) {
-      return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
-             (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
-    }
-    
-    /// getInstructionIndex - Get or calculate the index of the specified
-    /// instruction.
-    unsigned getInstructionIndex(const Instruction *I) {
-      assert(isInterestingInstruction(I) &&
-             "Not a load/store to/from an alloca?");
-      
-      // If we already have this instruction number, return it.
-      DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
-      if (It != InstNumbers.end()) return It->second;
-      
-      // Scan the whole block to get the instruction.  This accumulates
-      // information for every interesting instruction in the block, in order to
-      // avoid gratuitus rescans.
-      const BasicBlock *BB = I->getParent();
-      unsigned InstNo = 0;
-      for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end();
-           BBI != E; ++BBI)
-        if (isInterestingInstruction(BBI))
-          InstNumbers[BBI] = InstNo++;
-      It = InstNumbers.find(I);
-      
-      assert(It != InstNumbers.end() && "Didn't insert instruction?");
+  /// The index starts out as the number of the instruction from the start of
+  /// the block.
+  DenseMap<const Instruction *, unsigned> InstNumbers;
+
+public:
+
+  /// This code only looks at accesses to allocas.
+  static bool isInterestingInstruction(const Instruction *I) {
+    return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+           (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+  }
+
+  /// Get or calculate the index of the specified instruction.
+  unsigned getInstructionIndex(const Instruction *I) {
+    assert(isInterestingInstruction(I) &&
+           "Not a load/store to/from an alloca?");
+
+    // If we already have this instruction number, return it.
+    DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+    if (It != InstNumbers.end())
       return It->second;
-    }
-    
-    void deleteValue(const Instruction *I) {
-      InstNumbers.erase(I);
-    }
-    
-    void clear() {
-      InstNumbers.clear();
-    }
-  };
-
-  struct PromoteMem2Reg {
-    /// Allocas - The alloca instructions being promoted.
-    ///
-    std::vector<AllocaInst*> Allocas;
-    DominatorTree &DT;
-    DIBuilder *DIB;
-
-    /// AST - An AliasSetTracker object to update.  If null, don't update it.
-    ///
-    AliasSetTracker *AST;
-    
-    /// AllocaLookup - Reverse mapping of Allocas.
-    ///
-    DenseMap<AllocaInst*, unsigned>  AllocaLookup;
-
-    /// NewPhiNodes - The PhiNodes we're adding.  That map is used to simplify
-    /// some Phi nodes as we iterate over it, so it should have deterministic
-    /// iterators.  We could use a MapVector, but since we already maintain a
-    /// map from BasicBlock* to a stable numbering (BBNumbers), the DenseMap is
-    /// more efficient (also supports removal).
-    ///
-    DenseMap<std::pair<unsigned, unsigned>, PHINode*> NewPhiNodes;
-    
-    /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
-    /// it corresponds to.
-    DenseMap<PHINode*, unsigned> PhiToAllocaMap;
-    
-    /// PointerAllocaValues - If we are updating an AliasSetTracker, then for
-    /// each alloca that is of pointer type, we keep track of what to copyValue
-    /// to the inserted PHI nodes here.
-    ///
-    std::vector<Value*> PointerAllocaValues;
-
-    /// AllocaDbgDeclares - For each alloca, we keep track of the dbg.declare
-    /// intrinsic that describes it, if any, so that we can convert it to a
-    /// dbg.value intrinsic if the alloca gets promoted.
-    SmallVector<DbgDeclareInst*, 8> AllocaDbgDeclares;
-
-    /// Visited - The set of basic blocks the renamer has already visited.
-    ///
-    SmallPtrSet<BasicBlock*, 16> Visited;
-
-    /// BBNumbers - Contains a stable numbering of basic blocks to avoid
-    /// non-determinstic behavior.
-    DenseMap<BasicBlock*, unsigned> BBNumbers;
-
-    /// DomLevels - Maps DomTreeNodes to their level in the dominator tree.
-    DenseMap<DomTreeNode*, unsigned> DomLevels;
-
-    /// BBNumPreds - Lazily compute the number of predecessors a block has.
-    DenseMap<const BasicBlock*, unsigned> BBNumPreds;
-  public:
-    PromoteMem2Reg(const std::vector<AllocaInst*> &A, DominatorTree &dt,
-                   AliasSetTracker *ast)
-      : Allocas(A), DT(dt), DIB(0), AST(ast) {}
-    ~PromoteMem2Reg() {
-      delete DIB;
-    }
 
-    void run();
+    // Scan the whole block to get the instruction.  This accumulates
+    // information for every interesting instruction in the block, in order to
+    // avoid gratuitus rescans.
+    const BasicBlock *BB = I->getParent();
+    unsigned InstNo = 0;
+    for (BasicBlock::const_iterator BBI = BB->begin(), E = BB->end(); BBI != E;
+         ++BBI)
+      if (isInterestingInstruction(BBI))
+        InstNumbers[BBI] = InstNo++;
+    It = InstNumbers.find(I);
+
+    assert(It != InstNumbers.end() && "Didn't insert instruction?");
+    return It->second;
+  }
 
-    /// dominates - Return true if BB1 dominates BB2 using the DominatorTree.
-    ///
-    bool dominates(BasicBlock *BB1, BasicBlock *BB2) const {
-      return DT.dominates(BB1, BB2);
-    }
+  void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
 
-  private:
-    void RemoveFromAllocasList(unsigned &AllocaIdx) {
-      Allocas[AllocaIdx] = Allocas.back();
-      Allocas.pop_back();
-      --AllocaIdx;
-    }
+  void clear() { InstNumbers.clear(); }
+};
 
-    unsigned getNumPreds(const BasicBlock *BB) {
-      unsigned &NP = BBNumPreds[BB];
-      if (NP == 0)
-        NP = std::distance(pred_begin(BB), pred_end(BB))+1;
-      return NP-1;
-    }
+struct PromoteMem2Reg {
+  /// The alloca instructions being promoted.
+  std::vector<AllocaInst *> Allocas;
+  DominatorTree &DT;
+  DIBuilder DIB;
 
-    void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
-                                 AllocaInfo &Info);
-    void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, 
-                             const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
-                             SmallPtrSet<BasicBlock*, 32> &LiveInBlocks);
-    
-    void RewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
-                                  LargeBlockInfo &LBI);
-    void PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
-                                  LargeBlockInfo &LBI);
-    
-    void RenamePass(BasicBlock *BB, BasicBlock *Pred,
-                    RenamePassData::ValVector &IncVals,
-                    std::vector<RenamePassData> &Worklist);
-    bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
-  };
-  
-  struct AllocaInfo {
-    SmallVector<BasicBlock*, 32> DefiningBlocks;
-    SmallVector<BasicBlock*, 32> UsingBlocks;
-    
-    StoreInst  *OnlyStore;
-    BasicBlock *OnlyBlock;
-    bool OnlyUsedInOneBlock;
-    
-    Value *AllocaPointerVal;
-    DbgDeclareInst *DbgDeclare;
-    
-    void clear() {
-      DefiningBlocks.clear();
-      UsingBlocks.clear();
-      OnlyStore = 0;
-      OnlyBlock = 0;
-      OnlyUsedInOneBlock = true;
-      AllocaPointerVal = 0;
-      DbgDeclare = 0;
-    }
-    
-    /// AnalyzeAlloca - Scan the uses of the specified alloca, filling in our
-    /// ivars.
-    void AnalyzeAlloca(AllocaInst *AI) {
-      clear();
-
-      // As we scan the uses of the alloca instruction, keep track of stores,
-      // and decide whether all of the loads and stores to the alloca are within
-      // the same basic block.
-      for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
-           UI != E;)  {
-        Instruction *User = cast<Instruction>(*UI++);
-
-        if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
-          // Remember the basic blocks which define new values for the alloca
-          DefiningBlocks.push_back(SI->getParent());
-          AllocaPointerVal = SI->getOperand(0);
-          OnlyStore = SI;
-        } else {
-          LoadInst *LI = cast<LoadInst>(User);
-          // Otherwise it must be a load instruction, keep track of variable
-          // reads.
-          UsingBlocks.push_back(LI->getParent());
-          AllocaPointerVal = LI;
-        }
-        
-        if (OnlyUsedInOneBlock) {
-          if (OnlyBlock == 0)
-            OnlyBlock = User->getParent();
-          else if (OnlyBlock != User->getParent())
-            OnlyUsedInOneBlock = false;
-        }
-      }
-      
-      DbgDeclare = FindAllocaDbgDeclare(AI);
-    }
-  };
+  /// An AliasSetTracker object to update.  If null, don't update it.
+  AliasSetTracker *AST;
 
-  typedef std::pair<DomTreeNode*, unsigned> DomTreeNodePair;
+  /// Reverse mapping of Allocas.
+  DenseMap<AllocaInst *, unsigned> AllocaLookup;
 
-  struct DomTreeNodeCompare {
-    bool operator()(const DomTreeNodePair &LHS, const DomTreeNodePair &RHS) {
-      return LHS.second < RHS.second;
-    }
-  };
-}  // end of anonymous namespace
+  /// \brief The PhiNodes we're adding.
+  ///
+  /// That map is used to simplify some Phi nodes as we iterate over it, so
+  /// it should have deterministic iterators.  We could use a MapVector, but
+  /// since we already maintain a map from BasicBlock* to a stable numbering
+  /// (BBNumbers), the DenseMap is more efficient (also supports removal).
+  DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
+
+  /// For each PHI node, keep track of which entry in Allocas it corresponds
+  /// to.
+  DenseMap<PHINode *, unsigned> PhiToAllocaMap;
+
+  /// If we are updating an AliasSetTracker, then for each alloca that is of
+  /// pointer type, we keep track of what to copyValue to the inserted PHI
+  /// nodes here.
+  std::vector<Value *> PointerAllocaValues;
+
+  /// For each alloca, we keep track of the dbg.declare intrinsic that
+  /// describes it, if any, so that we can convert it to a dbg.value
+  /// intrinsic if the alloca gets promoted.
+  SmallVector<DbgDeclareInst *, 8> AllocaDbgDeclares;
+
+  /// The set of basic blocks the renamer has already visited.
+  ///
+  SmallPtrSet<BasicBlock *, 16> Visited;
+
+  /// Contains a stable numbering of basic blocks to avoid non-determinstic
+  /// behavior.
+  DenseMap<BasicBlock *, unsigned> BBNumbers;
+
+  /// Maps DomTreeNodes to their level in the dominator tree.
+  DenseMap<DomTreeNode *, unsigned> DomLevels;
+
+  /// Lazily compute the number of predecessors a block has.
+  DenseMap<const BasicBlock *, unsigned> BBNumPreds;
+
+public:
+  PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+                 AliasSetTracker *AST)
+      : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
+        DIB(*DT.getRoot()->getParent()->getParent()), AST(AST) {}
+
+  void run();
+
+private:
+  void RemoveFromAllocasList(unsigned &AllocaIdx) {
+    Allocas[AllocaIdx] = Allocas.back();
+    Allocas.pop_back();
+    --AllocaIdx;
+  }
+
+  unsigned getNumPreds(const BasicBlock *BB) {
+    unsigned &NP = BBNumPreds[BB];
+    if (NP == 0)
+      NP = std::distance(pred_begin(BB), pred_end(BB)) + 1;
+    return NP - 1;
+  }
+
+  void DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
+                               AllocaInfo &Info);
+  void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
+                           const SmallPtrSet<BasicBlock *, 32> &DefBlocks,
+                           SmallPtrSet<BasicBlock *, 32> &LiveInBlocks);
+  void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+                  RenamePassData::ValVector &IncVals,
+                  std::vector<RenamePassData> &Worklist);
+  bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+};
+
+} // end of anonymous namespace
 
 static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   // Knowing that this alloca is promotable, we know that it's safe to kill all
@@ -388,10 +339,191 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   }
 }
 
+/// \brief Rewrite as many loads as possible given a single store.
+///
+/// When there is only a single store, we can use the domtree to trivially
+/// replace all of the dominated loads with the stored value. Do so, and return
+/// true if this has successfully promoted the alloca entirely. If this returns
+/// false there were some loads which were not dominated by the single store
+/// and thus must be phi-ed with undef. We fall back to the standard alloca
+/// promotion algorithm in that case.
+static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+                                     LargeBlockInfo &LBI,
+                                     DominatorTree &DT,
+                                     AliasSetTracker *AST) {
+  StoreInst *OnlyStore = Info.OnlyStore;
+  bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+  BasicBlock *StoreBB = OnlyStore->getParent();
+  int StoreIndex = -1;
+
+  // Clear out UsingBlocks.  We will reconstruct it here if needed.
+  Info.UsingBlocks.clear();
+
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
+    Instruction *UserInst = cast<Instruction>(*UI++);
+    if (!isa<LoadInst>(UserInst)) {
+      assert(UserInst == OnlyStore && "Should only have load/stores");
+      continue;
+    }
+    LoadInst *LI = cast<LoadInst>(UserInst);
+
+    // Okay, if we have a load from the alloca, we want to replace it with the
+    // only value stored to the alloca.  We can do this if the value is
+    // dominated by the store.  If not, we use the rest of the mem2reg machinery
+    // to insert the phi nodes as needed.
+    if (!StoringGlobalVal) { // Non-instructions are always dominated.
+      if (LI->getParent() == StoreBB) {
+        // If we have a use that is in the same block as the store, compare the
+        // indices of the two instructions to see which one came first.  If the
+        // load came before the store, we can't handle it.
+        if (StoreIndex == -1)
+          StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+        if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+          // Can't handle this load, bail out.
+          Info.UsingBlocks.push_back(StoreBB);
+          continue;
+        }
+
+      } else if (LI->getParent() != StoreBB &&
+                 !DT.dominates(StoreBB, LI->getParent())) {
+        // If the load and store are in different blocks, use BB dominance to
+        // check their relationships.  If the store doesn't dom the use, bail
+        // out.
+        Info.UsingBlocks.push_back(LI->getParent());
+        continue;
+      }
+    }
+
+    // Otherwise, we *can* safely rewrite this load.
+    Value *ReplVal = OnlyStore->getOperand(0);
+    // If the replacement value is the load, this must occur in unreachable
+    // code.
+    if (ReplVal == LI)
+      ReplVal = UndefValue::get(LI->getType());
+    LI->replaceAllUsesWith(ReplVal);
+    if (AST && LI->getType()->isPointerTy())
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+
+  // Finally, after the scan, check to see if the store is all that is left.
+  if (!Info.UsingBlocks.empty())
+    return false; // If not, we'll have to fall back for the remainder.
+
+  // Record debuginfo for the store and remove the declaration's
+  // debuginfo.
+  if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+    DIBuilder DIB(*AI->getParent()->getParent()->getParent());
+    ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, DIB);
+    DDI->eraseFromParent();
+    LBI.deleteValue(DDI);
+  }
+  // Remove the (now dead) store and alloca.
+  Info.OnlyStore->eraseFromParent();
+  LBI.deleteValue(Info.OnlyStore);
+
+  if (AST)
+    AST->deleteValue(AI);
+  AI->eraseFromParent();
+  LBI.deleteValue(AI);
+  return true;
+}
+
+/// Many allocas are only used within a single basic block.  If this is the
+/// case, avoid traversing the CFG and inserting a lot of potentially useless
+/// PHI nodes by just performing a single linear pass over the basic block
+/// using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// return true.  This is necessary in cases where, due to control flow, the
+/// alloca is potentially undefined on some control flow paths.  e.g. code like
+/// this is potentially correct:
+///
+///   for (...) { if (c) { A = undef; undef = B; } }
+///
+/// ... so long as A is not used before undef is set.
+static void promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
+                                     LargeBlockInfo &LBI,
+                                     AliasSetTracker *AST) {
+  // The trickiest case to handle is when we have large blocks. Because of this,
+  // this code is optimized assuming that large blocks happen.  This does not
+  // significantly pessimize the small block case.  This uses LargeBlockInfo to
+  // make it efficient to get the index of various operations in the block.
+
+  // Walk the use-def list of the alloca, getting the locations of all stores.
+  typedef SmallVector<std::pair<unsigned, StoreInst *>, 64> StoresByIndexTy;
+  StoresByIndexTy StoresByIndex;
+
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;
+       ++UI)
+    if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
+      StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+  // Sort the stores by their index, making it efficient to do a lookup with a
+  // binary search.
+  std::sort(StoresByIndex.begin(), StoresByIndex.end(), less_first());
+
+  // Walk all of the loads from this alloca, replacing them with the nearest
+  // store above them, if any.
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
+    LoadInst *LI = dyn_cast<LoadInst>(*UI++);
+    if (!LI)
+      continue;
+
+    unsigned LoadIdx = LBI.getInstructionIndex(LI);
+
+    // Find the nearest store that has a lower index than this load.
+    StoresByIndexTy::iterator I =
+        std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
+                         std::make_pair(LoadIdx, static_cast<StoreInst *>(0)),
+                         less_first());
+
+    if (I == StoresByIndex.begin())
+      // If there is no store before this load, the load takes the undef value.
+      LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+    else
+      // Otherwise, there was a store before this load, the load takes its value.
+      LI->replaceAllUsesWith(llvm::prior(I)->second->getOperand(0));
+
+    if (AST && LI->getType()->isPointerTy())
+      AST->deleteValue(LI);
+    LI->eraseFromParent();
+    LBI.deleteValue(LI);
+  }
+
+  // Remove the (now dead) stores and alloca.
+  while (!AI->use_empty()) {
+    StoreInst *SI = cast<StoreInst>(AI->use_back());
+    // Record debuginfo for the store before removing it.
+    if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+      DIBuilder DIB(*AI->getParent()->getParent()->getParent());
+      ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+    }
+    SI->eraseFromParent();
+    LBI.deleteValue(SI);
+  }
+
+  if (AST)
+    AST->deleteValue(AI);
+  AI->eraseFromParent();
+  LBI.deleteValue(AI);
+
+  // The alloca's debuginfo can be removed as well.
+  if (DbgDeclareInst *DDI = Info.DbgDeclare) {
+    DDI->eraseFromParent();
+    LBI.deleteValue(DDI);
+  }
+
+  ++NumLocalPromoted;
+}
+
 void PromoteMem2Reg::run() {
   Function &F = *DT.getRoot()->getParent();
 
-  if (AST) PointerAllocaValues.resize(Allocas.size());
+  if (AST)
+    PointerAllocaValues.resize(Allocas.size());
   AllocaDbgDeclares.resize(Allocas.size());
 
   AllocaInfo Info;
@@ -400,8 +532,7 @@ void PromoteMem2Reg::run() {
   for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
     AllocaInst *AI = Allocas[AllocaNum];
 
-    assert(isAllocaPromotable(AI) &&
-           "Cannot promote non-promotable alloca!");
+    assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
 
@@ -409,7 +540,8 @@ void PromoteMem2Reg::run() {
 
     if (AI->use_empty()) {
       // If there are no uses of the alloca, just delete it now.
-      if (AST) AST->deleteValue(AI);
+      if (AST)
+        AST->deleteValue(AI);
       AI->eraseFromParent();
 
       // Remove the alloca from the Allocas list, since it has been processed
@@ -417,7 +549,7 @@ void PromoteMem2Reg::run() {
       ++NumDeadAlloca;
       continue;
     }
-    
+
     // Calculate the set of read and write-locations for each alloca.  This is
     // analogous to finding the 'uses' and 'definitions' of each variable.
     Info.AnalyzeAlloca(AI);
@@ -425,75 +557,27 @@ void PromoteMem2Reg::run() {
     // If there is only a single store to this value, replace any loads of
     // it that are directly dominated by the definition with the value stored.
     if (Info.DefiningBlocks.size() == 1) {
-      RewriteSingleStoreAlloca(AI, Info, LBI);
-
-      // Finally, after the scan, check to see if the store is all that is left.
-      if (Info.UsingBlocks.empty()) {
-        // Record debuginfo for the store and remove the declaration's 
-        // debuginfo.
-        if (DbgDeclareInst *DDI = Info.DbgDeclare) {
-          if (!DIB)
-            DIB = new DIBuilder(*DDI->getParent()->getParent()->getParent());
-          ConvertDebugDeclareToDebugValue(DDI, Info.OnlyStore, *DIB);
-          DDI->eraseFromParent();
-        }
-        // Remove the (now dead) store and alloca.
-        Info.OnlyStore->eraseFromParent();
-        LBI.deleteValue(Info.OnlyStore);
-
-        if (AST) AST->deleteValue(AI);
-        AI->eraseFromParent();
-        LBI.deleteValue(AI);
-        
+      if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AST)) {
         // The alloca has been processed, move on.
         RemoveFromAllocasList(AllocaNum);
-        
         ++NumSingleStore;
         continue;
       }
     }
-    
+
     // If the alloca is only read and written in one basic block, just perform a
     // linear sweep over the block to eliminate it.
     if (Info.OnlyUsedInOneBlock) {
-      PromoteSingleBlockAlloca(AI, Info, LBI);
-      
-      // Finally, after the scan, check to see if the stores are all that is
-      // left.
-      if (Info.UsingBlocks.empty()) {
-        
-        // Remove the (now dead) stores and alloca.
-        while (!AI->use_empty()) {
-          StoreInst *SI = cast<StoreInst>(AI->use_back());
-          // Record debuginfo for the store before removing it.
-          if (DbgDeclareInst *DDI = Info.DbgDeclare) {
-            if (!DIB)
-              DIB = new DIBuilder(*SI->getParent()->getParent()->getParent());
-            ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
-          }
-          SI->eraseFromParent();
-          LBI.deleteValue(SI);
-        }
-        
-        if (AST) AST->deleteValue(AI);
-        AI->eraseFromParent();
-        LBI.deleteValue(AI);
-        
-        // The alloca has been processed, move on.
-        RemoveFromAllocasList(AllocaNum);
-        
-        // The alloca's debuginfo can be removed as well.
-        if (DbgDeclareInst *DDI = Info.DbgDeclare)
-          DDI->eraseFromParent();
+      promoteSingleBlockAlloca(AI, Info, LBI, AST);
 
-        ++NumLocalPromoted;
-        continue;
-      }
+      // The alloca has been processed, move on.
+      RemoveFromAllocasList(AllocaNum);
+      continue;
     }
 
     // If we haven't computed dominator tree levels, do so now.
     if (DomLevels.empty()) {
-      SmallVector<DomTreeNode*, 32> Worklist;
+      SmallVector<DomTreeNode *, 32> Worklist;
 
       DomTreeNode *Root = DT.getRootNode();
       DomLevels[Root] = 0;
@@ -522,10 +606,11 @@ void PromoteMem2Reg::run() {
     // stored into the alloca.
     if (AST)
       PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal;
-      
+
     // Remember the dbg.declare intrinsic describing this alloca, if any.
-    if (Info.DbgDeclare) AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare;
-    
+    if (Info.DbgDeclare)
+      AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare;
+
     // Keep the reverse mapping of the 'Allocas' array for the rename pass.
     AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
 
@@ -540,8 +625,7 @@ void PromoteMem2Reg::run() {
     return; // All of the allocas must have been trivial!
 
   LBI.clear();
-  
-  
+
   // Set the incoming values for the basic block to be null values for all of
   // the alloca's.  We do this in case there is a load of a value that has not
   // been stored yet.  In this case, it will get this null value.
@@ -562,7 +646,7 @@ void PromoteMem2Reg::run() {
     // RenamePass may add new worklist entries.
     RenamePass(RPD.BB, RPD.Pred, RPD.Values, RenamePassWorkList);
   } while (!RenamePassWorkList.empty());
-  
+
   // The renamer uses the Visited set to avoid infinite loops.  Clear it now.
   Visited.clear();
 
@@ -575,7 +659,8 @@ void PromoteMem2Reg::run() {
     // tree. Just delete the users now.
     if (!A->use_empty())
       A->replaceAllUsesWith(UndefValue::get(A->getType()));
-    if (AST) AST->deleteValue(A);
+    if (AST)
+      AST->deleteValue(A);
     A->eraseFromParent();
   }
 
@@ -591,13 +676,15 @@ void PromoteMem2Reg::run() {
   bool EliminatedAPHI = true;
   while (EliminatedAPHI) {
     EliminatedAPHI = false;
-    
+
     // Iterating over NewPhiNodes is deterministic, so it is safe to try to
     // simplify and RAUW them as we go.  If it was not, we could add uses to
     // the values we replace with in a non deterministic order, thus creating
     // non deterministic def->use chains.
-    for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
-           NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) {
+    for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+             I = NewPhiNodes.begin(),
+             E = NewPhiNodes.end();
+         I != E;) {
       PHINode *PN = I->second;
 
       // If this PHI node merges one value and/or undefs, get the value.
@@ -613,15 +700,17 @@ void PromoteMem2Reg::run() {
       ++I;
     }
   }
-  
+
   // At this point, the renamer has added entries to PHI nodes for all reachable
   // code.  Unfortunately, there may be unreachable blocks which the renamer
   // hasn't traversed.  If this is the case, the PHI nodes may not
   // have incoming values for all predecessors.  Loop over all PHI nodes we have
   // created, inserting undef values if they are missing any incoming values.
   //
-  for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
-         NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) {
+  for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+           I = NewPhiNodes.begin(),
+           E = NewPhiNodes.end();
+       I != E; ++I) {
     // We want to do this once per basic block.  As such, only process a block
     // when we find the PHI that is the first entry in the block.
     PHINode *SomePHI = I->second;
@@ -636,21 +725,20 @@ void PromoteMem2Reg::run() {
       continue;
 
     // Get the preds for BB.
-    SmallVector<BasicBlock*, 16> Preds(pred_begin(BB), pred_end(BB));
-    
+    SmallVector<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
+
     // Ok, now we know that all of the PHI nodes are missing entries for some
     // basic blocks.  Start by sorting the incoming predecessors for efficient
     // access.
     std::sort(Preds.begin(), Preds.end());
-    
+
     // Now we loop through all BB's which have entries in SomePHI and remove
     // them from the Preds list.
     for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
       // Do a log(n) search of the Preds list for the entry we want.
-      SmallVector<BasicBlock*, 16>::iterator EntIt =
-        std::lower_bound(Preds.begin(), Preds.end(),
-                         SomePHI->getIncomingBlock(i));
-      assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i)&&
+      SmallVectorImpl<BasicBlock *>::iterator EntIt = std::lower_bound(
+          Preds.begin(), Preds.end(), SomePHI->getIncomingBlock(i));
+      assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
              "PHI node has entry for a block which is not a predecessor!");
 
       // Remove the entry
@@ -670,39 +758,41 @@ void PromoteMem2Reg::run() {
         SomePHI->addIncoming(UndefVal, Preds[pred]);
     }
   }
-        
+
   NewPhiNodes.clear();
 }
 
+/// \brief Determine which blocks the value is live in.
+///
+/// These are blocks which lead to uses.  Knowing this allows us to avoid
+/// inserting PHI nodes into blocks which don't lead to uses (thus, the
+/// inserted phi nodes would be dead).
+void PromoteMem2Reg::ComputeLiveInBlocks(
+    AllocaInst *AI, AllocaInfo &Info,
+    const SmallPtrSet<BasicBlock *, 32> &DefBlocks,
+    SmallPtrSet<BasicBlock *, 32> &LiveInBlocks) {
 
-/// ComputeLiveInBlocks - Determine which blocks the value is live in.  These
-/// are blocks which lead to uses.  Knowing this allows us to avoid inserting
-/// PHI nodes into blocks which don't lead to uses (thus, the inserted phi nodes
-/// would be dead).
-void PromoteMem2Reg::
-ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info, 
-                    const SmallPtrSet<BasicBlock*, 32> &DefBlocks,
-                    SmallPtrSet<BasicBlock*, 32> &LiveInBlocks) {
-  
   // To determine liveness, we must iterate through the predecessors of blocks
   // where the def is live.  Blocks are added to the worklist if we need to
   // check their predecessors.  Start with all the using blocks.
-  SmallVector<BasicBlock*, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
-                                                   Info.UsingBlocks.end());
-  
+  SmallVector<BasicBlock *, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
+                                                    Info.UsingBlocks.end());
+
   // If any of the using blocks is also a definition block, check to see if the
   // definition occurs before or after the use.  If it happens before the use,
   // the value isn't really live-in.
   for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
     BasicBlock *BB = LiveInBlockWorklist[i];
-    if (!DefBlocks.count(BB)) continue;
-    
+    if (!DefBlocks.count(BB))
+      continue;
+
     // Okay, this is a block that both uses and defines the value.  If the first
     // reference to the alloca is a def (store), then we know it isn't live-in.
-    for (BasicBlock::iterator I = BB->begin(); ; ++I) {
+    for (BasicBlock::iterator I = BB->begin();; ++I) {
       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
-        if (SI->getOperand(1) != AI) continue;
-        
+        if (SI->getOperand(1) != AI)
+          continue;
+
         // We found a store to the alloca before a load.  The alloca is not
         // actually live-in here.
         LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
@@ -710,73 +800,76 @@ ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
         --i, --e;
         break;
       }
-      
+
       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        if (LI->getOperand(0) != AI) continue;
-        
+        if (LI->getOperand(0) != AI)
+          continue;
+
         // Okay, we found a load before a store to the alloca.  It is actually
         // live into this block.
         break;
       }
     }
   }
-  
+
   // Now that we have a set of blocks where the phi is live-in, recursively add
   // their predecessors until we find the full region the value is live.
   while (!LiveInBlockWorklist.empty()) {
     BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
-    
+
     // The block really is live in here, insert it into the set.  If already in
     // the set, then it has already been processed.
     if (!LiveInBlocks.insert(BB))
       continue;
-    
+
     // Since the value is live into BB, it is either defined in a predecessor or
     // live into it to.  Add the preds to the worklist unless they are a
     // defining block.
     for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
       BasicBlock *P = *PI;
-      
+
       // The value is not live into a predecessor if it defines the value.
       if (DefBlocks.count(P))
         continue;
-      
+
       // Otherwise it is, add to the worklist.
       LiveInBlockWorklist.push_back(P);
     }
   }
 }
 
-/// DetermineInsertionPoint - At this point, we're committed to promoting the
-/// alloca using IDF's, and the standard SSA construction algorithm.  Determine
-/// which blocks need phi nodes and see if we can optimize out some work by
-/// avoiding insertion of dead phi nodes.
+/// At this point, we're committed to promoting the alloca using IDF's, and the
+/// standard SSA construction algorithm.  Determine which blocks need phi nodes
+/// and see if we can optimize out some work by avoiding insertion of dead phi
+/// nodes.
 void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
                                              AllocaInfo &Info) {
   // Unique the set of defining blocks for efficient lookup.
-  SmallPtrSet<BasicBlock*, 32> DefBlocks;
+  SmallPtrSet<BasicBlock *, 32> DefBlocks;
   DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end());
 
   // Determine which blocks the value is live in.  These are blocks which lead
   // to uses.
-  SmallPtrSet<BasicBlock*, 32> LiveInBlocks;
+  SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
   ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
 
   // Use a priority queue keyed on dominator tree level so that inserted nodes
   // are handled from the bottom of the dominator tree upwards.
+  typedef std::pair<DomTreeNode *, unsigned> DomTreeNodePair;
   typedef std::priority_queue<DomTreeNodePair, SmallVector<DomTreeNodePair, 32>,
-                              DomTreeNodeCompare> IDFPriorityQueue;
+                              less_second> IDFPriorityQueue;
   IDFPriorityQueue PQ;
 
-  for (SmallPtrSet<BasicBlock*, 32>::const_iterator I = DefBlocks.begin(),
-       E = DefBlocks.end(); I != E; ++I) {
+  for (SmallPtrSet<BasicBlock *, 32>::const_iterator I = DefBlocks.begin(),
+                                                     E = DefBlocks.end();
+       I != E; ++I) {
     if (DomTreeNode *Node = DT.getNode(*I))
       PQ.push(std::make_pair(Node, DomLevels[Node]));
   }
 
-  SmallVector<std::pair<unsigned, BasicBlock*>, 32> DFBlocks;
-  SmallPtrSet<DomTreeNode*, 32> Visited;
-  SmallVector<DomTreeNode*, 32> Worklist;
+  SmallVector<std::pair<unsigned, BasicBlock *>, 32> DFBlocks;
+  SmallPtrSet<DomTreeNode *, 32> Visited;
+  SmallVector<DomTreeNode *, 32> Worklist;
   while (!PQ.empty()) {
     DomTreeNodePair RootPair = PQ.top();
     PQ.pop();
@@ -836,179 +929,22 @@ void PromoteMem2Reg::DetermineInsertionPoint(AllocaInst *AI, unsigned AllocaNum,
     QueuePhiNode(DFBlocks[i].second, AllocaNum, CurrentVersion);
 }
 
-/// RewriteSingleStoreAlloca - If there is only a single store to this value,
-/// replace any loads of it that are directly dominated by the definition with
-/// the value stored.
-void PromoteMem2Reg::RewriteSingleStoreAlloca(AllocaInst *AI,
-                                              AllocaInfo &Info,
-                                              LargeBlockInfo &LBI) {
-  StoreInst *OnlyStore = Info.OnlyStore;
-  bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
-  BasicBlock *StoreBB = OnlyStore->getParent();
-  int StoreIndex = -1;
-
-  // Clear out UsingBlocks.  We will reconstruct it here if needed.
-  Info.UsingBlocks.clear();
-  
-  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ) {
-    Instruction *UserInst = cast<Instruction>(*UI++);
-    if (!isa<LoadInst>(UserInst)) {
-      assert(UserInst == OnlyStore && "Should only have load/stores");
-      continue;
-    }
-    LoadInst *LI = cast<LoadInst>(UserInst);
-    
-    // Okay, if we have a load from the alloca, we want to replace it with the
-    // only value stored to the alloca.  We can do this if the value is
-    // dominated by the store.  If not, we use the rest of the mem2reg machinery
-    // to insert the phi nodes as needed.
-    if (!StoringGlobalVal) {  // Non-instructions are always dominated.
-      if (LI->getParent() == StoreBB) {
-        // If we have a use that is in the same block as the store, compare the
-        // indices of the two instructions to see which one came first.  If the
-        // load came before the store, we can't handle it.
-        if (StoreIndex == -1)
-          StoreIndex = LBI.getInstructionIndex(OnlyStore);
-
-        if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
-          // Can't handle this load, bail out.
-          Info.UsingBlocks.push_back(StoreBB);
-          continue;
-        }
-        
-      } else if (LI->getParent() != StoreBB &&
-                 !dominates(StoreBB, LI->getParent())) {
-        // If the load and store are in different blocks, use BB dominance to
-        // check their relationships.  If the store doesn't dom the use, bail
-        // out.
-        Info.UsingBlocks.push_back(LI->getParent());
-        continue;
-      }
-    }
-    
-    // Otherwise, we *can* safely rewrite this load.
-    Value *ReplVal = OnlyStore->getOperand(0);
-    // If the replacement value is the load, this must occur in unreachable
-    // code.
-    if (ReplVal == LI)
-      ReplVal = UndefValue::get(LI->getType());
-    LI->replaceAllUsesWith(ReplVal);
-    if (AST && LI->getType()->isPointerTy())
-      AST->deleteValue(LI);
-    LI->eraseFromParent();
-    LBI.deleteValue(LI);
-  }
-}
-
-namespace {
-
-/// StoreIndexSearchPredicate - This is a helper predicate used to search by the
-/// first element of a pair.
-struct StoreIndexSearchPredicate {
-  bool operator()(const std::pair<unsigned, StoreInst*> &LHS,
-                  const std::pair<unsigned, StoreInst*> &RHS) {
-    return LHS.first < RHS.first;
-  }
-};
-
-}
-
-/// PromoteSingleBlockAlloca - Many allocas are only used within a single basic
-/// block.  If this is the case, avoid traversing the CFG and inserting a lot of
-/// potentially useless PHI nodes by just performing a single linear pass over
-/// the basic block using the Alloca.
-///
-/// If we cannot promote this alloca (because it is read before it is written),
-/// return true.  This is necessary in cases where, due to control flow, the
-/// alloca is potentially undefined on some control flow paths.  e.g. code like
-/// this is potentially correct:
-///
-///   for (...) { if (c) { A = undef; undef = B; } }
-///
-/// ... so long as A is not used before undef is set.
+/// \brief Queue a phi-node to be added to a basic-block for a specific Alloca.
 ///
-void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
-                                              LargeBlockInfo &LBI) {
-  // The trickiest case to handle is when we have large blocks. Because of this,
-  // this code is optimized assuming that large blocks happen.  This does not
-  // significantly pessimize the small block case.  This uses LargeBlockInfo to
-  // make it efficient to get the index of various operations in the block.
-  
-  // Clear out UsingBlocks.  We will reconstruct it here if needed.
-  Info.UsingBlocks.clear();
-  
-  // Walk the use-def list of the alloca, getting the locations of all stores.
-  typedef SmallVector<std::pair<unsigned, StoreInst*>, 64> StoresByIndexTy;
-  StoresByIndexTy StoresByIndex;
-  
-  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
-       UI != E; ++UI) 
-    if (StoreInst *SI = dyn_cast<StoreInst>(*UI))
-      StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
-
-  // If there are no stores to the alloca, just replace any loads with undef.
-  if (StoresByIndex.empty()) {
-    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) 
-      if (LoadInst *LI = dyn_cast<LoadInst>(*UI++)) {
-        LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
-        if (AST && LI->getType()->isPointerTy())
-          AST->deleteValue(LI);
-        LBI.deleteValue(LI);
-        LI->eraseFromParent();
-      }
-    return;
-  }
-  
-  // Sort the stores by their index, making it efficient to do a lookup with a
-  // binary search.
-  std::sort(StoresByIndex.begin(), StoresByIndex.end());
-  
-  // Walk all of the loads from this alloca, replacing them with the nearest
-  // store above them, if any.
-  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E;) {
-    LoadInst *LI = dyn_cast<LoadInst>(*UI++);
-    if (!LI) continue;
-    
-    unsigned LoadIdx = LBI.getInstructionIndex(LI);
-    
-    // Find the nearest store that has a lower than this load. 
-    StoresByIndexTy::iterator I = 
-      std::lower_bound(StoresByIndex.begin(), StoresByIndex.end(),
-                       std::pair<unsigned, StoreInst*>(LoadIdx, static_cast<StoreInst*>(0)),
-                       StoreIndexSearchPredicate());
-    
-    // If there is no store before this load, then we can't promote this load.
-    if (I == StoresByIndex.begin()) {
-      // Can't handle this load, bail out.
-      Info.UsingBlocks.push_back(LI->getParent());
-      continue;
-    }
-      
-    // Otherwise, there was a store before this load, the load takes its value.
-    --I;
-    LI->replaceAllUsesWith(I->second->getOperand(0));
-    if (AST && LI->getType()->isPointerTy())
-      AST->deleteValue(LI);
-    LI->eraseFromParent();
-    LBI.deleteValue(LI);
-  }
-}
-
-// QueuePhiNode - queues a phi-node to be added to a basic-block for a specific
-// Alloca returns true if there wasn't already a phi-node for that variable
-//
+/// Returns true if there wasn't already a phi-node for that variable
 bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
                                   unsigned &Version) {
   // Look up the basic-block in question.
   PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
 
   // If the BB already has a phi node added for the i'th alloca then we're done!
-  if (PN) return false;
+  if (PN)
+    return false;
 
   // Create a PhiNode using the dereferenced type... and add the phi-node to the
   // BasicBlock.
   PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),
-                       Allocas[AllocaNo]->getName() + "." + Twine(Version++), 
+                       Allocas[AllocaNo]->getName() + "." + Twine(Version++),
                        BB->begin());
   ++NumPHIInsert;
   PhiToAllocaMap[PN] = AllocaNo;
@@ -1019,10 +955,11 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
   return true;
 }
 
-// RenamePass - Recursively traverse the CFG of the function, renaming loads and
-// stores to the allocas which we are promoting.  IncomingVals indicates what
-// value each Alloca contains on exit from the predecessor block Pred.
-//
+/// \brief Recursively traverse the CFG of the function, renaming loads and
+/// stores to the allocas which we are promoting.
+///
+/// IncomingVals indicates what value each Alloca contains on exit from the
+/// predecessor block Pred.
 void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
                                 RenamePassData::ValVector &IncomingVals,
                                 std::vector<RenamePassData> &Worklist) {
@@ -1040,48 +977,49 @@ NextIteration:
       // inserted by this pass of mem2reg will have the same number of incoming
       // operands so far.  Remember this count.
       unsigned NewPHINumOperands = APN->getNumOperands();
-      
-      unsigned NumEdges = 0;
-      for (succ_iterator I = succ_begin(Pred), E = succ_end(Pred); I != E; ++I)
-        if (*I == BB)
-          ++NumEdges;
+
+      unsigned NumEdges = std::count(succ_begin(Pred), succ_end(Pred), BB);
       assert(NumEdges && "Must be at least one edge from Pred to BB!");
-      
+
       // Add entries for all the phis.
       BasicBlock::iterator PNI = BB->begin();
       do {
         unsigned AllocaNo = PhiToAllocaMap[APN];
-        
+
         // Add N incoming values to the PHI node.
         for (unsigned i = 0; i != NumEdges; ++i)
           APN->addIncoming(IncomingVals[AllocaNo], Pred);
-        
+
         // The currently active variable for this block is now the PHI.
         IncomingVals[AllocaNo] = APN;
-        
+
         // Get the next phi node.
         ++PNI;
         APN = dyn_cast<PHINode>(PNI);
-        if (APN == 0) break;
-        
+        if (APN == 0)
+          break;
+
         // Verify that it is missing entries.  If not, it is not being inserted
         // by this mem2reg invocation so we want to ignore it.
       } while (APN->getNumOperands() == NewPHINumOperands);
     }
   }
-  
+
   // Don't revisit blocks.
-  if (!Visited.insert(BB)) return;
+  if (!Visited.insert(BB))
+    return;
 
-  for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II); ) {
+  for (BasicBlock::iterator II = BB->begin(); !isa<TerminatorInst>(II);) {
     Instruction *I = II++; // get the instruction, increment iterator
 
     if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
       AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
-      if (!Src) continue;
-  
-      DenseMap<AllocaInst*, unsigned>::iterator AI = AllocaLookup.find(Src);
-      if (AI == AllocaLookup.end()) continue;
+      if (!Src)
+        continue;
+
+      DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src);
+      if (AI == AllocaLookup.end())
+        continue;
 
       Value *V = IncomingVals[AI->second];
 
@@ -1094,30 +1032,29 @@ NextIteration:
       // Delete this instruction and mark the name as the current holder of the
       // value
       AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
-      if (!Dest) continue;
-      
+      if (!Dest)
+        continue;
+
       DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
       if (ai == AllocaLookup.end())
         continue;
-      
+
       // what value were we writing?
       IncomingVals[ai->second] = SI->getOperand(0);
       // Record debuginfo for the store before removing it.
-      if (DbgDeclareInst *DDI = AllocaDbgDeclares[ai->second]) {
-        if (!DIB)
-          DIB = new DIBuilder(*SI->getParent()->getParent()->getParent());
-        ConvertDebugDeclareToDebugValue(DDI, SI, *DIB);
-      }
+      if (DbgDeclareInst *DDI = AllocaDbgDeclares[ai->second])
+        ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
       BB->getInstList().erase(SI);
     }
   }
 
   // 'Recurse' to our successors.
   succ_iterator I = succ_begin(BB), E = succ_end(BB);
-  if (I == E) return;
+  if (I == E)
+    return;
 
   // Keep track of the successors so we don't visit the same successor twice
-  SmallPtrSet<BasicBlock*, 8> VisitedSuccs;
+  SmallPtrSet<BasicBlock *, 8> VisitedSuccs;
 
   // Handle the first successor without using the worklist.
   VisitedSuccs.insert(*I);
@@ -1132,18 +1069,11 @@ NextIteration:
   goto NextIteration;
 }
 
-/// PromoteMemToReg - Promote the specified list of alloca instructions into
-/// scalar registers, inserting PHI nodes as appropriate.  This function does
-/// not modify the CFG of the function at all.  All allocas must be from the
-/// same function.
-///
-/// If AST is specified, the specified tracker is updated to reflect changes
-/// made to the IR.
-///
-void llvm::PromoteMemToReg(const std::vector<AllocaInst*> &Allocas,
-                           DominatorTree &DT, AliasSetTracker *AST) {
+void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+                           AliasSetTracker *AST) {
   // If there is nothing to do, bail out...
-  if (Allocas.empty()) return;
+  if (Allocas.empty())
+    return;
 
   PromoteMem2Reg(Allocas, DT, AST).run();
 }
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 9d90fbe..30adbfa 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -42,8 +42,6 @@ SSAUpdater::~SSAUpdater() {
   delete static_cast<AvailableValsTy*>(AV);
 }
 
-/// Initialize - Reset this object to get ready for a new set of SSA
-/// updates with type 'Ty'.  PHI nodes get a name based on 'Name'.
 void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
   if (AV == 0)
     AV = new AvailableValsTy();
@@ -53,14 +51,10 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
   ProtoName = Name;
 }
 
-/// HasValueForBlock - Return true if the SSAUpdater already has a value for
-/// the specified block.
 bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
   return getAvailableVals(AV).count(BB);
 }
 
-/// AddAvailableValue - Indicate that a rewritten value is available in the
-/// specified block with the specified value.
 void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
   assert(ProtoType != 0 && "Need to initialize SSAUpdater");
   assert(ProtoType == V->getType() &&
@@ -68,10 +62,8 @@ void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
   getAvailableVals(AV)[BB] = V;
 }
 
-/// IsEquivalentPHI - Check if PHI has the same incoming value as specified
-/// in ValueMapping for each predecessor block.
 static bool IsEquivalentPHI(PHINode *PHI,
-                            DenseMap<BasicBlock*, Value*> &ValueMapping) {
+                          SmallDenseMap<BasicBlock*, Value*, 8> &ValueMapping) {
   unsigned PHINumValues = PHI->getNumIncomingValues();
   if (PHINumValues != ValueMapping.size())
     return false;
@@ -86,32 +78,11 @@ static bool IsEquivalentPHI(PHINode *PHI,
   return true;
 }
 
-/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
-/// live at the end of the specified block.
 Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
   Value *Res = GetValueAtEndOfBlockInternal(BB);
   return Res;
 }
 
-/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
-/// is live in the middle of the specified block.
-///
-/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
-/// important case: if there is a definition of the rewritten value after the
-/// 'use' in BB.  Consider code like this:
-///
-///      X1 = ...
-///   SomeBB:
-///      use(X)
-///      X2 = ...
-///      br Cond, SomeBB, OutBB
-///
-/// In this case, there are two values (X1 and X2) added to the AvailableVals
-/// set by the client of the rewriter, and those values are both live out of
-/// their respective blocks.  However, the use of X happens in the *middle* of
-/// a block.  Because of this, we need to insert a new PHI node in SomeBB to
-/// merge the appropriate values, and this value isn't live out of the block.
-///
 Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
   // If there is no definition of the renamed variable in this block, just use
   // GetValueAtEndOfBlock to do our work.
@@ -165,8 +136,8 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
   // Otherwise, we do need a PHI: check to see if we already have one available
   // in this block that produces the right value.
   if (isa<PHINode>(BB->begin())) {
-    DenseMap<BasicBlock*, Value*> ValueMapping(PredValues.begin(),
-                                               PredValues.end());
+    SmallDenseMap<BasicBlock*, Value*, 8> ValueMapping(PredValues.begin(),
+                                                       PredValues.end());
     PHINode *SomePHI;
     for (BasicBlock::iterator It = BB->begin();
          (SomePHI = dyn_cast<PHINode>(It)); ++It) {
@@ -203,8 +174,6 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
   return InsertedPHI;
 }
 
-/// RewriteUse - Rewrite a use of the symbolic value.  This handles PHI nodes,
-/// which use their value in the corresponding predecessor.
 void SSAUpdater::RewriteUse(Use &U) {
   Instruction *User = cast<Instruction>(U.getUser());
 
@@ -222,10 +191,6 @@ void SSAUpdater::RewriteUse(Use &U) {
   U.set(V);
 }
 
-/// RewriteUseAfterInsertions - Rewrite a use, just like RewriteUse.  However,
-/// this version of the method can rewrite uses in the same block as a
-/// definition, because it assumes that all uses of a value are below any
-/// inserted values.
 void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
   Instruction *User = cast<Instruction>(U.getUser());
   
@@ -238,8 +203,6 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
   U.set(V);
 }
 
-/// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template,
-/// specialized for SSAUpdater.
 namespace llvm {
 template<>
 class SSAUpdaterTraits<SSAUpdater> {
@@ -342,10 +305,9 @@ public:
 
 } // End llvm namespace
 
-/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
-/// for the specified BB and if so, return it.  If not, construct SSA form by
-/// first calculating the required placement of PHIs and then inserting new
-/// PHIs where needed.
+/// Check to see if AvailableVals has an entry for the specified BB and if so,
+/// return it.  If not, construct SSA form by first calculating the required
+/// placement of PHIs and then inserting new PHIs where needed.
 Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
   AvailableValsTy &AvailableVals = getAvailableVals(AV);
   if (Value *V = AvailableVals[BB])
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 052ad85..ff50b12 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -40,12 +41,14 @@
 #include "llvm/Support/ConstantRange.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/NoFolder.h"
+#include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <algorithm>
 #include <map>
 #include <set>
 using namespace llvm;
+using namespace PatternMatch;
 
 static cl::opt<unsigned>
 PHINodeFoldingThreshold("phi-node-folding-threshold", cl::Hidden, cl::init(1),
@@ -88,7 +91,6 @@ namespace {
 class SimplifyCFGOpt {
   const TargetTransformInfo &TTI;
   const DataLayout *const TD;
-
   Value *isValueEqualityComparison(TerminatorInst *TI);
   BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
                                std::vector<ValueEqualityComparisonCase> &Cases);
@@ -194,94 +196,7 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
     PN->addIncoming(PN->getIncomingValueForBlock(ExistPred), NewPred);
 }
 
-
-/// GetIfCondition - Given a basic block (BB) with two predecessors (and at
-/// least one PHI node in it), check to see if the merge at this block is due
-/// to an "if condition".  If so, return the boolean condition that determines
-/// which entry into BB will be taken.  Also, return by references the block
-/// that will be entered from if the condition is true, and the block that will
-/// be entered if the condition is false.
-///
-/// This does no checking to see if the true/false blocks have large or unsavory
-/// instructions in them.
-static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
-                             BasicBlock *&IfFalse) {
-  PHINode *SomePHI = cast<PHINode>(BB->begin());
-  assert(SomePHI->getNumIncomingValues() == 2 &&
-         "Function can only handle blocks with 2 predecessors!");
-  BasicBlock *Pred1 = SomePHI->getIncomingBlock(0);
-  BasicBlock *Pred2 = SomePHI->getIncomingBlock(1);
-
-  // We can only handle branches.  Other control flow will be lowered to
-  // branches if possible anyway.
-  BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
-  BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
-  if (Pred1Br == 0 || Pred2Br == 0)
-    return 0;
-
-  // Eliminate code duplication by ensuring that Pred1Br is conditional if
-  // either are.
-  if (Pred2Br->isConditional()) {
-    // If both branches are conditional, we don't have an "if statement".  In
-    // reality, we could transform this case, but since the condition will be
-    // required anyway, we stand no chance of eliminating it, so the xform is
-    // probably not profitable.
-    if (Pred1Br->isConditional())
-      return 0;
-
-    std::swap(Pred1, Pred2);
-    std::swap(Pred1Br, Pred2Br);
-  }
-
-  if (Pred1Br->isConditional()) {
-    // The only thing we have to watch out for here is to make sure that Pred2
-    // doesn't have incoming edges from other blocks.  If it does, the condition
-    // doesn't dominate BB.
-    if (Pred2->getSinglePredecessor() == 0)
-      return 0;
-
-    // If we found a conditional branch predecessor, make sure that it branches
-    // to BB and Pred2Br.  If it doesn't, this isn't an "if statement".
-    if (Pred1Br->getSuccessor(0) == BB &&
-        Pred1Br->getSuccessor(1) == Pred2) {
-      IfTrue = Pred1;
-      IfFalse = Pred2;
-    } else if (Pred1Br->getSuccessor(0) == Pred2 &&
-               Pred1Br->getSuccessor(1) == BB) {
-      IfTrue = Pred2;
-      IfFalse = Pred1;
-    } else {
-      // We know that one arm of the conditional goes to BB, so the other must
-      // go somewhere unrelated, and this must not be an "if statement".
-      return 0;
-    }
-
-    return Pred1Br->getCondition();
-  }
-
-  // Ok, if we got here, both predecessors end with an unconditional branch to
-  // BB.  Don't panic!  If both blocks only have a single (identical)
-  // predecessor, and THAT is a conditional branch, then we're all ok!
-  BasicBlock *CommonPred = Pred1->getSinglePredecessor();
-  if (CommonPred == 0 || CommonPred != Pred2->getSinglePredecessor())
-    return 0;
-
-  // Otherwise, if this is a conditional branch, then we can use it!
-  BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
-  if (BI == 0) return 0;
-
-  assert(BI->isConditional() && "Two successors but not conditional?");
-  if (BI->getSuccessor(0) == Pred1) {
-    IfTrue = Pred1;
-    IfFalse = Pred2;
-  } else {
-    IfTrue = Pred2;
-    IfFalse = Pred1;
-  }
-  return BI->getCondition();
-}
-
-/// ComputeSpeculuationCost - Compute an abstract "cost" of speculating the
+/// ComputeSpeculationCost - Compute an abstract "cost" of speculating the
 /// given instruction, which is assumed to be safe to speculate. 1 means
 /// cheap, 2 means less cheap, and UINT_MAX means prohibitively expensive.
 static unsigned ComputeSpeculationCost(const User *I) {
@@ -432,7 +347,24 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
   // If this is an icmp against a constant, handle this as one of the cases.
   if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
     if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) {
+      Value *RHSVal;
+      ConstantInt *RHSC;
+
       if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ:ICmpInst::ICMP_NE)) {
+        // (x & ~2^x) == y --> x == y || x == y|2^x
+        // This undoes a transformation done by instcombine to fuse 2 compares.
+        if (match(ICI->getOperand(0),
+                  m_And(m_Value(RHSVal), m_ConstantInt(RHSC)))) {
+          APInt Not = ~RHSC->getValue();
+          if (Not.isPowerOf2()) {
+            Vals.push_back(C);
+            Vals.push_back(
+                ConstantInt::get(C->getContext(), C->getValue() | Not));
+            UsedICmps++;
+            return RHSVal;
+          }
+        }
+
         UsedICmps++;
         Vals.push_back(C);
         return I->getOperand(0);
@@ -443,6 +375,13 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
       ConstantRange Span =
         ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue());
 
+      // Shift the range if the compare is fed by an add. This is the range
+      // compare idiom as emitted by instcombine.
+      bool hasAdd =
+          match(I->getOperand(0), m_Add(m_Value(RHSVal), m_ConstantInt(RHSC)));
+      if (hasAdd)
+        Span = Span.subtract(RHSC->getValue());
+
       // If this is an and/!= check then we want to optimize "x ugt 2" into
       // x != 0 && x != 1.
       if (!isEQ)
@@ -455,7 +394,7 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
       for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
         Vals.push_back(ConstantInt::get(V->getContext(), Tmp));
       UsedICmps++;
-      return I->getOperand(0);
+      return hasAdd ? RHSVal : I->getOperand(0);
     }
     return 0;
   }
@@ -533,15 +472,17 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) {
   } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
     if (BI->isConditional() && BI->getCondition()->hasOneUse())
       if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition()))
-        if ((ICI->getPredicate() == ICmpInst::ICMP_EQ ||
-             ICI->getPredicate() == ICmpInst::ICMP_NE) &&
-            GetConstantInt(ICI->getOperand(1), TD))
+        if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), TD))
           CV = ICI->getOperand(0);
 
   // Unwrap any lossless ptrtoint cast.
-  if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext()))
-    if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV))
-      CV = PTII->getOperand(0);
+  if (TD && CV) {
+    if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
+      Value *Ptr = PTII->getPointerOperand();
+      if (PTII->getType() == TD->getIntPtrType(Ptr->getType()))
+        CV = Ptr;
+    }
+  }
   return CV;
 }
 
@@ -763,9 +704,10 @@ namespace {
   };
 }
 
-static int ConstantIntSortPredicate(const void *P1, const void *P2) {
-  const ConstantInt *LHS = *(const ConstantInt*const*)P1;
-  const ConstantInt *RHS = *(const ConstantInt*const*)P2;
+static int ConstantIntSortPredicate(ConstantInt *const *P1,
+                                    ConstantInt *const *P2) {
+  const ConstantInt *LHS = *P1;
+  const ConstantInt *RHS = *P2;
   if (LHS->getValue().ult(RHS->getValue()))
     return 1;
   if (LHS->getValue() == RHS->getValue())
@@ -988,7 +930,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
       // Convert pointer to int before we switch.
       if (CV->getType()->isPointerTy()) {
         assert(TD && "Cannot switch on pointer without DataLayout");
-        CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
+        CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getType()),
                                     "magicptr");
       }
 
@@ -1083,9 +1025,9 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) {
       (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)))
     return false;
 
-  // If we get here, we can hoist at least one instruction.
   BasicBlock *BIParent = BI->getParent();
 
+  bool Changed = false;
   do {
     // If we are hoisting the terminator instruction, don't move one (making a
     // broken BB), instead clone it, and remove BI.
@@ -1100,6 +1042,7 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) {
       I2->replaceAllUsesWith(I1);
     I1->intersectOptionalDataWith(I2);
     I2->eraseFromParent();
+    Changed = true;
 
     I1 = BB1_Itr++;
     I2 = BB2_Itr++;
@@ -1119,7 +1062,23 @@ static bool HoistThenElseCodeToIf(BranchInst *BI) {
 HoistTerminator:
   // It may not be possible to hoist an invoke.
   if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
-    return true;
+    return Changed;
+
+  for (succ_iterator SI = succ_begin(BB1), E = succ_end(BB1); SI != E; ++SI) {
+    PHINode *PN;
+    for (BasicBlock::iterator BBI = SI->begin();
+         (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      if (BB1V == BB2V)
+        continue;
+
+      if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
+        return Changed;
+      if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
+        return Changed;
+    }
+  }
 
   // Okay, it is safe to hoist the terminator.
   Instruction *NT = I1->clone();
@@ -1362,8 +1321,8 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
 ///
 /// \return The pointer to the value of the previous store if the store can be
 ///         hoisted into the predecessor block. 0 otherwise.
-Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
-                              BasicBlock *StoreBB, BasicBlock *EndBB) {
+static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
+                                     BasicBlock *StoreBB, BasicBlock *EndBB) {
   StoreInst *StoreToHoist = dyn_cast<StoreInst>(I);
   if (!StoreToHoist)
     return 0;
@@ -1522,18 +1481,23 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
     Value *OrigV = PN->getIncomingValueForBlock(BB);
     Value *ThenV = PN->getIncomingValueForBlock(ThenBB);
 
+    // FIXME: Try to remove some of the duplication with HoistThenElseCodeToIf.
     // Skip PHIs which are trivial.
     if (ThenV == OrigV)
       continue;
 
     HaveRewritablePHIs = true;
-    ConstantExpr *CE = dyn_cast<ConstantExpr>(ThenV);
-    if (!CE)
+    ConstantExpr *OrigCE = dyn_cast<ConstantExpr>(OrigV);
+    ConstantExpr *ThenCE = dyn_cast<ConstantExpr>(ThenV);
+    if (!OrigCE && !ThenCE)
       continue; // Known safe and cheap.
 
-    if (!isSafeToSpeculativelyExecute(CE))
+    if ((ThenCE && !isSafeToSpeculativelyExecute(ThenCE)) ||
+        (OrigCE && !isSafeToSpeculativelyExecute(OrigCE)))
       return false;
-    if (ComputeSpeculationCost(CE) > PHINodeFoldingThreshold)
+    unsigned OrigCost = OrigCE ? ComputeSpeculationCost(OrigCE) : 0;
+    unsigned ThenCost = ThenCE ? ComputeSpeculationCost(ThenCE) : 0;
+    if (OrigCost + ThenCost > 2 * PHINodeFoldingThreshold)
       return false;
 
     // Account for the cost of an unfolded ConstantExpr which could end up
@@ -1598,6 +1562,19 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) {
   return true;
 }
 
+/// \returns True if this block contains a CallInst with the NoDuplicate
+/// attribute.
+static bool HasNoDuplicateCall(const BasicBlock *BB) {
+  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    const CallInst *CI = dyn_cast<CallInst>(I);
+    if (!CI)
+      continue;
+    if (CI->cannotDuplicate())
+      return true;
+  }
+  return false;
+}
+
 /// BlockIsSimpleEnoughToThreadThrough - Return true if we can thread a branch
 /// across this block.
 static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
@@ -1645,6 +1622,8 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) {
   // Now we know that this block has multiple preds and two succs.
   if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
 
+  if (HasNoDuplicateCall(BB)) return false;
+
   // Okay, this is a simple enough basic block.  See if any phi values are
   // constants.
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
@@ -2111,14 +2090,19 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     // Ensure that any values used in the bonus instruction are also used
     // by the terminator of the predecessor.  This means that those values
     // must already have been resolved, so we won't be inhibiting the
-    // out-of-order core by speculating them earlier.
-    if (BonusInst) {
+    // out-of-order core by speculating them earlier. We also allow
+    // instructions that are used by the terminator's condition because it
+    // exposes more merging opportunities.
+    bool UsedByBranch = (BonusInst && BonusInst->hasOneUse() &&
+                         *BonusInst->use_begin() == Cond);
+
+    if (BonusInst && !UsedByBranch) {
       // Collect the values used by the bonus inst
       SmallPtrSet<Value*, 4> UsedValues;
       for (Instruction::op_iterator OI = BonusInst->op_begin(),
            OE = BonusInst->op_end(); OI != OE; ++OI) {
         Value *V = *OI;
-        if (!isa<Constant>(V))
+        if (!isa<Constant>(V) && !isa<Argument>(V))
           UsedValues.insert(V);
       }
 
@@ -2829,7 +2813,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD,
   if (CompVal->getType()->isPointerTy()) {
     assert(TD && "Cannot switch on pointer without DataLayout");
     CompVal = Builder.CreatePtrToInt(CompVal,
-                                     TD->getIntPtrType(CompVal->getContext()),
+                                     TD->getIntPtrType(CompVal->getType()),
                                      "magicptr");
   }
 
@@ -3202,7 +3186,7 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
 /// and use it to remove dead cases.
 static bool EliminateDeadSwitchCases(SwitchInst *SI) {
   Value *Cond = SI->getCondition();
-  unsigned Bits = cast<IntegerType>(Cond->getType())->getBitWidth();
+  unsigned Bits = Cond->getType()->getIntegerBitWidth();
   APInt KnownZero(Bits, 0), KnownOne(Bits, 0);
   ComputeMaskedBits(Cond, KnownZero, KnownOne);
 
@@ -3307,7 +3291,7 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
   for (ForwardingNodesMap::iterator I = ForwardingNodes.begin(),
        E = ForwardingNodes.end(); I != E; ++I) {
     PHINode *Phi = I->first;
-    SmallVector<int,4> &Indexes = I->second;
+    SmallVectorImpl<int> &Indexes = I->second;
 
     if (Indexes.size() < 2) continue;
 
@@ -3345,28 +3329,10 @@ static Constant *LookupConstant(Value *V,
 /// simple instructions such as binary operations where both operands are
 /// constant or can be replaced by constants from the ConstantPool. Returns the
 /// resulting constant on success, 0 otherwise.
-static Constant *ConstantFold(Instruction *I,
-                         const SmallDenseMap<Value*, Constant*>& ConstantPool) {
-  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
-    Constant *A = LookupConstant(BO->getOperand(0), ConstantPool);
-    if (!A)
-      return 0;
-    Constant *B = LookupConstant(BO->getOperand(1), ConstantPool);
-    if (!B)
-      return 0;
-    return ConstantExpr::get(BO->getOpcode(), A, B);
-  }
-
-  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
-    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
-    if (!A)
-      return 0;
-    Constant *B = LookupConstant(I->getOperand(1), ConstantPool);
-    if (!B)
-      return 0;
-    return ConstantExpr::getCompare(Cmp->getPredicate(), A, B);
-  }
-
+static Constant *
+ConstantFold(Instruction *I,
+             const SmallDenseMap<Value *, Constant *> &ConstantPool,
+             const DataLayout *DL) {
   if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
     Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
     if (!A)
@@ -3378,25 +3344,32 @@ static Constant *ConstantFold(Instruction *I,
     return 0;
   }
 
-  if (CastInst *Cast = dyn_cast<CastInst>(I)) {
-    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
-    if (!A)
+  SmallVector<Constant *, 4> COps;
+  for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
+    if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
+      COps.push_back(A);
+    else
       return 0;
-    return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy());
   }
 
-  return 0;
+  if (CmpInst *Cmp = dyn_cast<CmpInst>(I))
+    return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
+                                           COps[1], DL);
+
+  return ConstantFoldInstOperands(I->getOpcode(), I->getType(), COps, DL);
 }
 
 /// GetCaseResults - Try to determine the resulting constant values in phi nodes
 /// at the common destination basic block, *CommonDest, for one of the case
 /// destionations CaseDest corresponding to value CaseVal (0 for the default
 /// case), of a switch instruction SI.
-static bool GetCaseResults(SwitchInst *SI,
-                           ConstantInt *CaseVal,
-                           BasicBlock *CaseDest,
-                           BasicBlock **CommonDest,
-                           SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) {
+static bool
+GetCaseResults(SwitchInst *SI,
+               ConstantInt *CaseVal,
+               BasicBlock *CaseDest,
+               BasicBlock **CommonDest,
+               SmallVectorImpl<std::pair<PHINode *, Constant *> > &Res,
+               const DataLayout *DL) {
   // The block from which we enter the common destination.
   BasicBlock *Pred = SI->getParent();
 
@@ -3415,7 +3388,7 @@ static bool GetCaseResults(SwitchInst *SI,
     } else if (isa<DbgInfoIntrinsic>(I)) {
       // Skip debug intrinsic.
       continue;
-    } else if (Constant *C = ConstantFold(I, ConstantPool)) {
+    } else if (Constant *C = ConstantFold(I, ConstantPool, DL)) {
       // Instruction is side-effect free and constant.
       ConstantPool.insert(std::make_pair(I, C));
     } else {
@@ -3469,7 +3442,7 @@ namespace {
     SwitchLookupTable(Module &M,
                       uint64_t TableSize,
                       ConstantInt *Offset,
-               const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+             const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values,
                       Constant *DefaultValue,
                       const DataLayout *TD);
 
@@ -3516,7 +3489,7 @@ namespace {
 SwitchLookupTable::SwitchLookupTable(Module &M,
                                      uint64_t TableSize,
                                      ConstantInt *Offset,
-               const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+             const SmallVectorImpl<std::pair<ConstantInt*, Constant*> >& Values,
                                      Constant *DefaultValue,
                                      const DataLayout *TD)
     : SingleValue(0), BitMap(0), BitMapElementTy(0), Array(0) {
@@ -3643,7 +3616,7 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD,
 }
 
 /// ShouldBuildLookupTable - Determine whether a lookup table should be built
-/// for this switch, based on the number of caes, size of the table and the
+/// for this switch, based on the number of cases, size of the table and the
 /// types of the results.
 static bool ShouldBuildLookupTable(SwitchInst *SI,
                                    uint64_t TableSize,
@@ -3739,7 +3712,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
     typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
     ResultsTy Results;
     if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
-                        Results))
+                        Results, TD))
       return false;
 
     // Append the result from this case to the list for each phi.
@@ -3753,7 +3726,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
   // Get the resulting values for the default case.
   SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
   if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest,
-                      DefaultResultsList))
+                      DefaultResultsList, TD))
     return false;
   for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
     PHINode *PHI = DefaultResultsList[I].first;
@@ -3774,14 +3747,32 @@ static bool SwitchToLookupTable(SwitchInst *SI,
                                             CommonDest->getParent(),
                                             CommonDest);
 
-  // Check whether the condition value is within the case range, and branch to
-  // the new BB.
+  // Compute the table index value.
   Builder.SetInsertPoint(SI);
   Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
                                         "switch.tableidx");
-  Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
-      MinCaseVal->getType(), TableSize));
-  Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+
+  // Compute the maximum table size representable by the integer type we are
+  // switching upon.
+  unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+  uint64_t MaxTableSize = CaseSize > 63? UINT64_MAX : 1ULL << CaseSize;
+  assert(MaxTableSize >= TableSize &&
+         "It is impossible for a switch to have more entries than the max "
+         "representable value of its input integer type's size.");
+
+  // If we have a fully covered lookup table, unconditionally branch to the
+  // lookup table BB. Otherwise, check if the condition value is within the case
+  // range. If it is so, branch to the new BB. Otherwise branch to SI's default
+  // destination.
+  const bool GeneratingCoveredLookupTable = MaxTableSize == TableSize;
+  if (GeneratingCoveredLookupTable) {
+    Builder.CreateBr(LookupBB);
+    SI->getDefaultDest()->removePredecessor(SI->getParent());
+  } else {
+    Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
+                                         MinCaseVal->getType(), TableSize));
+    Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+  }
 
   // Populate the BB that does the lookups.
   Builder.SetInsertPoint(LookupBB);
@@ -3810,9 +3801,11 @@ static bool SwitchToLookupTable(SwitchInst *SI,
     Builder.CreateBr(CommonDest);
 
   // Remove the switch.
-  for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) {
+  for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
     BasicBlock *Succ = SI->getSuccessor(i);
-    if (Succ == SI->getDefaultDest()) continue;
+
+    if (Succ == SI->getDefaultDest())
+      continue;
     Succ->removePredecessor(SI->getParent());
   }
   SI->eraseFromParent();
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 41c207c..bf3442a 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -119,7 +119,7 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand)
         return 0;
 
       D = ConstantInt::get(UseInst->getContext(),
-                           APInt(BitWidth, 1).shl(D->getZExtValue()));
+                           APInt::getOneBitSet(BitWidth, D->getZExtValue()));
     }
     FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
   }
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 6bea2dd..15b3e66 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
@@ -26,11 +27,16 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 
 using namespace llvm;
 
+static cl::opt<bool>
+ColdErrorCalls("error-reporting-is-cold",  cl::init(true),
+  cl::Hidden, cl::desc("Treat error-reporting calls as cold"));
+
 /// This class is the abstract base class for the set of optimizations that
 /// corresponds to one library call.
 namespace {
@@ -118,6 +124,21 @@ static bool callHasFloatingPointArgument(const CallInst *CI) {
   return false;
 }
 
+/// \brief Check whether the overloaded unary floating point function
+/// corresponing to \a Ty is available.
+static bool hasUnaryFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+                            LibFunc::Func DoubleFn, LibFunc::Func FloatFn,
+                            LibFunc::Func LongDoubleFn) {
+  switch (Ty->getTypeID()) {
+  case Type::FloatTyID:
+    return TLI->has(FloatFn);
+  case Type::DoubleTyID:
+    return TLI->has(DoubleFn);
+  default:
+    return TLI->has(LongDoubleFn);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Fortified Library Call Optimizations
 //===----------------------------------------------------------------------===//
@@ -477,7 +498,7 @@ struct StrChrOpt : public LibCallOptimization {
 
     // Compute the offset, make sure to handle the case when we're searching for
     // zero (a weird way to spell strlen).
-    size_t I = CharC->getSExtValue() == 0 ?
+    size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
         Str.size() : Str.find(CharC->getSExtValue());
     if (I == StringRef::npos) // Didn't find the char.  strchr returns null.
       return Constant::getNullValue(CI->getType());
@@ -513,7 +534,7 @@ struct StrRChrOpt : public LibCallOptimization {
     }
 
     // Compute the offset.
-    size_t I = CharC->getSExtValue() == 0 ?
+    size_t I = (0xFF & CharC->getSExtValue()) == 0 ?
         Str.size() : Str.rfind(CharC->getSExtValue());
     if (I == StringRef::npos) // Didn't find the char. Return null.
       return Constant::getNullValue(CI->getType());
@@ -774,7 +795,7 @@ struct StrPBrkOpt : public LibCallOptimization {
     // Constant folding.
     if (HasS1 && HasS2) {
       size_t I = S1.find_first_of(S2);
-      if (I == std::string::npos) // No match.
+      if (I == StringRef::npos) // No match.
         return Constant::getNullValue(CI->getType());
 
       return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
@@ -912,7 +933,7 @@ struct StrStrOpt : public LibCallOptimization {
 
     // If both strings are known, constant fold it.
     if (HasStr1 && HasStr2) {
-      std::string::size_type Offset = SearchStr.find(ToFindStr);
+      size_t Offset = SearchStr.find(ToFindStr);
 
       if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
         return Constant::getNullValue(CI->getType());
@@ -1031,7 +1052,7 @@ struct MemSetOpt : public LibCallOptimization {
     if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
         !FT->getParamType(0)->isPointerTy() ||
         !FT->getParamType(1)->isIntegerTy() ||
-        FT->getParamType(2) != TD->getIntPtrType(*Context))
+        FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)))
       return 0;
 
     // memset(p, v, n) -> llvm.memset(p, v, n, 1)
@@ -1133,9 +1154,13 @@ struct PowOpt : public UnsafeFPLibCallOptimization {
 
     Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
     if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
-      if (Op1C->isExactlyValue(1.0))  // pow(1.0, x) -> 1.0
+      // pow(1.0, x) -> 1.0
+      if (Op1C->isExactlyValue(1.0))
         return Op1C;
-      if (Op1C->isExactlyValue(2.0))  // pow(2.0, x) -> exp2(x)
+      // pow(2.0, x) -> exp2(x)
+      if (Op1C->isExactlyValue(2.0) &&
+          hasUnaryFloatFn(TLI, Op1->getType(), LibFunc::exp2, LibFunc::exp2f,
+                          LibFunc::exp2l))
         return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
     }
 
@@ -1145,7 +1170,11 @@ struct PowOpt : public UnsafeFPLibCallOptimization {
     if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
       return ConstantFP::get(CI->getType(), 1.0);
 
-    if (Op2C->isExactlyValue(0.5)) {
+    if (Op2C->isExactlyValue(0.5) &&
+        hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::sqrt, LibFunc::sqrtf,
+                        LibFunc::sqrtl) &&
+        hasUnaryFloatFn(TLI, Op2->getType(), LibFunc::fabs, LibFunc::fabsf,
+                        LibFunc::fabsl)) {
       // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
       // This is faster than calling pow, and still handles negative zero
       // and negative infinity correctly.
@@ -1178,7 +1207,7 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization {
   virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
     Value *Ret = NULL;
     if (UnsafeFPShrink && Callee->getName() == "exp2" &&
-        TLI->has(LibFunc::exp2)) {
+        TLI->has(LibFunc::exp2f)) {
       UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
       Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
     }
@@ -1229,6 +1258,155 @@ struct Exp2Opt : public UnsafeFPLibCallOptimization {
   }
 };
 
+struct SinCosPiOpt : public LibCallOptimization {
+  SinCosPiOpt() {}
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Make sure the prototype is as expected, otherwise the rest of the
+    // function is probably invalid and likely to abort.
+    if (!isTrigLibCall(CI))
+      return 0;
+
+    Value *Arg = CI->getArgOperand(0);
+    SmallVector<CallInst *, 1> SinCalls;
+    SmallVector<CallInst *, 1> CosCalls;
+    SmallVector<CallInst *, 1> SinCosCalls;
+
+    bool IsFloat = Arg->getType()->isFloatTy();
+
+    // Look for all compatible sinpi, cospi and sincospi calls with the same
+    // argument. If there are enough (in some sense) we can make the
+    // substitution.
+    for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+         UI != UE; ++UI)
+      classifyArgUse(*UI, CI->getParent(), IsFloat, SinCalls, CosCalls,
+                     SinCosCalls);
+
+    // It's only worthwhile if both sinpi and cospi are actually used.
+    if (SinCosCalls.empty() && (SinCalls.empty() || CosCalls.empty()))
+      return 0;
+
+    Value *Sin, *Cos, *SinCos;
+    insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos,
+                     SinCos);
+
+    replaceTrigInsts(SinCalls, Sin);
+    replaceTrigInsts(CosCalls, Cos);
+    replaceTrigInsts(SinCosCalls, SinCos);
+
+    return 0;
+  }
+
+  bool isTrigLibCall(CallInst *CI) {
+    Function *Callee = CI->getCalledFunction();
+    FunctionType *FT = Callee->getFunctionType();
+
+    // We can only hope to do anything useful if we can ignore things like errno
+    // and floating-point exceptions.
+    bool AttributesSafe = CI->hasFnAttr(Attribute::NoUnwind) &&
+                          CI->hasFnAttr(Attribute::ReadNone);
+
+    // Other than that we need float(float) or double(double)
+    return AttributesSafe && FT->getNumParams() == 1 &&
+           FT->getReturnType() == FT->getParamType(0) &&
+           (FT->getParamType(0)->isFloatTy() ||
+            FT->getParamType(0)->isDoubleTy());
+  }
+
+  void classifyArgUse(Value *Val, BasicBlock *BB, bool IsFloat,
+                      SmallVectorImpl<CallInst *> &SinCalls,
+                      SmallVectorImpl<CallInst *> &CosCalls,
+                      SmallVectorImpl<CallInst *> &SinCosCalls) {
+    CallInst *CI = dyn_cast<CallInst>(Val);
+
+    if (!CI)
+      return;
+
+    Function *Callee = CI->getCalledFunction();
+    StringRef FuncName = Callee->getName();
+    LibFunc::Func Func;
+    if (!TLI->getLibFunc(FuncName, Func) || !TLI->has(Func) ||
+        !isTrigLibCall(CI))
+      return;
+
+    if (IsFloat) {
+      if (Func == LibFunc::sinpif)
+        SinCalls.push_back(CI);
+      else if (Func == LibFunc::cospif)
+        CosCalls.push_back(CI);
+      else if (Func == LibFunc::sincospi_stretf)
+        SinCosCalls.push_back(CI);
+    } else {
+      if (Func == LibFunc::sinpi)
+        SinCalls.push_back(CI);
+      else if (Func == LibFunc::cospi)
+        CosCalls.push_back(CI);
+      else if (Func == LibFunc::sincospi_stret)
+        SinCosCalls.push_back(CI);
+    }
+  }
+
+  void replaceTrigInsts(SmallVectorImpl<CallInst*> &Calls, Value *Res) {
+    for (SmallVectorImpl<CallInst*>::iterator I = Calls.begin(),
+           E = Calls.end();
+         I != E; ++I) {
+      LCS->replaceAllUsesWith(*I, Res);
+    }
+  }
+
+  void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
+                        bool UseFloat, Value *&Sin, Value *&Cos,
+                        Value *&SinCos) {
+    Type *ArgTy = Arg->getType();
+    Type *ResTy;
+    StringRef Name;
+
+    Triple T(OrigCallee->getParent()->getTargetTriple());
+    if (UseFloat) {
+      Name = "__sincospi_stretf";
+
+      assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+      // x86_64 can't use {float, float} since that would be returned in both
+      // xmm0 and xmm1, which isn't what a real struct would do.
+      ResTy = T.getArch() == Triple::x86_64
+                  ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+                  : static_cast<Type *>(StructType::get(ArgTy, ArgTy, NULL));
+    } else {
+      Name = "__sincospi_stret";
+      ResTy = StructType::get(ArgTy, ArgTy, NULL);
+    }
+
+    Module *M = OrigCallee->getParent();
+    Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
+                                           ResTy, ArgTy, NULL);
+
+    if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+      // If the argument is an instruction, it must dominate all uses so put our
+      // sincos call there.
+      BasicBlock::iterator Loc = ArgInst;
+      B.SetInsertPoint(ArgInst->getParent(), ++Loc);
+    } else {
+      // Otherwise (e.g. for a constant) the beginning of the function is as
+      // good a place as any.
+      BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+      B.SetInsertPoint(&EntryBB, EntryBB.begin());
+    }
+
+    SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+    if (SinCos->getType()->isStructTy()) {
+      Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+      Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+    } else {
+      Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+                                   "sinpi");
+      Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+                                   "cospi");
+    }
+  }
+
+};
+
 //===----------------------------------------------------------------------===//
 // Integer Library Call Optimizations
 //===----------------------------------------------------------------------===//
@@ -1333,6 +1511,54 @@ struct ToAsciiOpt : public LibCallOptimization {
 // Formatting and IO Library Call Optimizations
 //===----------------------------------------------------------------------===//
 
+struct ErrorReportingOpt : public LibCallOptimization {
+  ErrorReportingOpt(int S = -1) : StreamArg(S) {}
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &) {
+    // Error reporting calls should be cold, mark them as such.
+    // This applies even to non-builtin calls: it is only a hint and applies to
+    // functions that the frontend might not understand as builtins.
+
+    // This heuristic was suggested in:
+    // Improving Static Branch Prediction in a Compiler
+    // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
+    // Proceedings of PACT'98, Oct. 1998, IEEE
+
+    if (!CI->hasFnAttr(Attribute::Cold) && isReportingError(Callee, CI)) {
+      CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
+    }
+
+    return 0;
+  }
+
+protected:
+  bool isReportingError(Function *Callee, CallInst *CI) {
+    if (!ColdErrorCalls)
+      return false;
+ 
+    if (!Callee || !Callee->isDeclaration())
+      return false;
+
+    if (StreamArg < 0)
+      return true;
+
+    // These functions might be considered cold, but only if their stream
+    // argument is stderr.
+
+    if (StreamArg >= (int) CI->getNumArgOperands())
+      return false;
+    LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
+    if (!LI)
+      return false;
+    GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+    if (!GV || !GV->isDeclaration())
+      return false;
+    return GV->getName() == "stderr";
+  }
+
+  int StreamArg;
+};
+
 struct PrintFOpt : public LibCallOptimization {
   Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
                                    IRBuilder<> &B) {
@@ -1361,7 +1587,7 @@ struct PrintFOpt : public LibCallOptimization {
 
     // printf("foo\n") --> puts("foo")
     if (FormatStr[FormatStr.size()-1] == '\n' &&
-        FormatStr.find('%') == std::string::npos) {  // no format characters.
+        FormatStr.find('%') == StringRef::npos) { // No format characters.
       // Create a string literal with no \n on it.  We expect the constant merge
       // pass to be run after this pass, to merge duplicate strings.
       FormatStr = FormatStr.drop_back();
@@ -1513,6 +1739,9 @@ struct SPrintFOpt : public LibCallOptimization {
 struct FPrintFOpt : public LibCallOptimization {
   Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
                                    IRBuilder<> &B) {
+    ErrorReportingOpt ER(/* StreamArg = */ 0);
+    (void) ER.callOptimizer(Callee, CI, B);
+
     // All the optimizations depend on the format string.
     StringRef FormatStr;
     if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
@@ -1590,6 +1819,9 @@ struct FPrintFOpt : public LibCallOptimization {
 
 struct FWriteOpt : public LibCallOptimization {
   virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    ErrorReportingOpt ER(/* StreamArg = */ 3);
+    (void) ER.callOptimizer(Callee, CI, B);
+
     // Require a pointer, an integer, an integer, a pointer, returning integer.
     FunctionType *FT = Callee->getFunctionType();
     if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
@@ -1623,6 +1855,9 @@ struct FWriteOpt : public LibCallOptimization {
 
 struct FPutsOpt : public LibCallOptimization {
   virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    ErrorReportingOpt ER(/* StreamArg = */ 1);
+    (void) ER.callOptimizer(Callee, CI, B);
+
     // These optimizations require DataLayout.
     if (!TD) return 0;
 
@@ -1741,6 +1976,7 @@ static MemSetOpt MemSet;
 // Math library call optimizations.
 static UnaryDoubleFPOpt UnaryDoubleFP(false);
 static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+static SinCosPiOpt SinCosPi;
 
   // Integer library call optimizations.
 static FFSOpt FFS;
@@ -1750,6 +1986,9 @@ static IsAsciiOpt IsAscii;
 static ToAsciiOpt ToAscii;
 
 // Formatting and IO library call optimizations.
+static ErrorReportingOpt ErrorReporting;
+static ErrorReportingOpt ErrorReporting0(0);
+static ErrorReportingOpt ErrorReporting1(1);
 static PrintFOpt PrintF;
 static SPrintFOpt SPrintF;
 static FPrintFOpt FPrintF;
@@ -1825,6 +2064,11 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
       case LibFunc::cos:
       case LibFunc::cosl:
         return &Cos;
+      case LibFunc::sinpif:
+      case LibFunc::sinpi:
+      case LibFunc::cospif:
+      case LibFunc::cospi:
+        return &SinCosPi;
       case LibFunc::powf:
       case LibFunc::pow:
       case LibFunc::powl:
@@ -1859,6 +2103,13 @@ LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) {
         return &FPuts;
       case LibFunc::puts:
         return &Puts;
+      case LibFunc::perror:
+        return &ErrorReporting;
+      case LibFunc::vfprintf:
+      case LibFunc::fiprintf:
+        return &ErrorReporting0;
+      case LibFunc::fputc:
+        return &ErrorReporting1;
       case LibFunc::ceil:
       case LibFunc::fabs:
       case LibFunc::floor:
@@ -1940,7 +2191,7 @@ LibCallSimplifier::~LibCallSimplifier() {
 }
 
 Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
-  if (CI->hasFnAttr(Attribute::NoBuiltin)) return 0;
+  if (CI->isNoBuiltin()) return 0;
   return Impl->optimizeCall(CI);
 }
 
@@ -1950,3 +2201,53 @@ void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
 }
 
 }
+
+// TODO:
+//   Additional cases that we need to add to this file:
+//
+// cbrt:
+//   * cbrt(expN(X))  -> expN(x/3)
+//   * cbrt(sqrt(x))  -> pow(x,1/6)
+//   * cbrt(sqrt(x))  -> pow(x,1/9)
+//
+// exp, expf, expl:
+//   * exp(log(x))  -> x
+//
+// log, logf, logl:
+//   * log(exp(x))   -> x
+//   * log(x**y)     -> y*log(x)
+//   * log(exp(y))   -> y*log(e)
+//   * log(exp2(y))  -> y*log(2)
+//   * log(exp10(y)) -> y*log(10)
+//   * log(sqrt(x))  -> 0.5*log(x)
+//   * log(pow(x,y)) -> y*log(x)
+//
+// lround, lroundf, lroundl:
+//   * lround(cnst) -> cnst'
+//
+// pow, powf, powl:
+//   * pow(exp(x),y)  -> exp(x*y)
+//   * pow(sqrt(x),y) -> pow(x,y*0.5)
+//   * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// round, roundf, roundl:
+//   * round(cnst) -> cnst'
+//
+// signbit:
+//   * signbit(cnst) -> cnst'
+//   * signbit(nncst) -> 0 (if pstv is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+//   * sqrt(expN(x))  -> expN(x*0.5)
+//   * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+//   * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+// strchr:
+//   * strchr(p, 0) -> strlen(p)
+// tan, tanf, tanl:
+//   * tan(atan(x)) -> x
+//
+// trunc, truncf, truncl:
+//   * trunc(cnst) -> cnst'
+//
+//
diff --git a/lib/Transforms/Utils/SpecialCaseList.cpp b/lib/Transforms/Utils/SpecialCaseList.cpp
new file mode 100644
index 0000000..2ef692c
--- /dev/null
+++ b/lib/Transforms/Utils/SpecialCaseList.cpp
@@ -0,0 +1,222 @@
+//===-- SpecialCaseList.cpp - special case list for sanitizers ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class for instrumentation passes (like AddressSanitizer
+// or ThreadSanitizer) to avoid instrumenting some functions or global
+// variables, or to instrument some functions or global variables in a specific
+// way, based on a user-supplied list.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SpecialCaseList.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <string>
+#include <utility>
+
+namespace llvm {
+
+/// Represents a set of regular expressions.  Regular expressions which are
+/// "literal" (i.e. no regex metacharacters) are stored in Strings, while all
+/// others are represented as a single pipe-separated regex in RegEx.  The
+/// reason for doing so is efficiency; StringSet is much faster at matching
+/// literal strings than Regex.
+struct SpecialCaseList::Entry {
+  StringSet<> Strings;
+  Regex *RegEx;
+
+  Entry() : RegEx(0) {}
+
+  bool match(StringRef Query) const {
+    return Strings.count(Query) || (RegEx && RegEx->match(Query));
+  }
+};
+
+SpecialCaseList::SpecialCaseList() : Entries() {}
+
+SpecialCaseList *SpecialCaseList::create(
+    const StringRef Path, std::string &Error) {
+  if (Path.empty())
+    return new SpecialCaseList();
+  OwningPtr<MemoryBuffer> File;
+  if (error_code EC = MemoryBuffer::getFile(Path, File)) {
+    Error = (Twine("Can't open file '") + Path + "': " + EC.message()).str();
+    return 0;
+  }
+  return create(File.get(), Error);
+}
+
+SpecialCaseList *SpecialCaseList::create(
+    const MemoryBuffer *MB, std::string &Error) {
+  OwningPtr<SpecialCaseList> SCL(new SpecialCaseList());
+  if (!SCL->parse(MB, Error))
+    return 0;
+  return SCL.take();
+}
+
+SpecialCaseList *SpecialCaseList::createOrDie(const StringRef Path) {
+  std::string Error;
+  if (SpecialCaseList *SCL = create(Path, Error))
+    return SCL;
+  report_fatal_error(Error);
+}
+
+bool SpecialCaseList::parse(const MemoryBuffer *MB, std::string &Error) {
+  // Iterate through each line in the blacklist file.
+  SmallVector<StringRef, 16> Lines;
+  SplitString(MB->getBuffer(), Lines, "\n\r");
+  StringMap<StringMap<std::string> > Regexps;
+  assert(Entries.empty() &&
+         "parse() should be called on an empty SpecialCaseList");
+  int LineNo = 1;
+  for (SmallVectorImpl<StringRef>::iterator I = Lines.begin(), E = Lines.end();
+       I != E; ++I, ++LineNo) {
+    // Ignore empty lines and lines starting with "#"
+    if (I->empty() || I->startswith("#"))
+      continue;
+    // Get our prefix and unparsed regexp.
+    std::pair<StringRef, StringRef> SplitLine = I->split(":");
+    StringRef Prefix = SplitLine.first;
+    if (SplitLine.second.empty()) {
+      // Missing ':' in the line.
+      Error = (Twine("Malformed line ") + Twine(LineNo) + ": '" +
+               SplitLine.first + "'").str();
+      return false;
+    }
+
+    std::pair<StringRef, StringRef> SplitRegexp = SplitLine.second.split("=");
+    std::string Regexp = SplitRegexp.first;
+    StringRef Category = SplitRegexp.second;
+
+    // Backwards compatibility.
+    if (Prefix == "global-init") {
+      Prefix = "global";
+      Category = "init";
+    } else if (Prefix == "global-init-type") {
+      Prefix = "type";
+      Category = "init";
+    } else if (Prefix == "global-init-src") {
+      Prefix = "src";
+      Category = "init";
+    }
+
+    // See if we can store Regexp in Strings.
+    if (Regex::isLiteralERE(Regexp)) {
+      Entries[Prefix][Category].Strings.insert(Regexp);
+      continue;
+    }
+
+    // Replace * with .*
+    for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos;
+         pos += strlen(".*")) {
+      Regexp.replace(pos, strlen("*"), ".*");
+    }
+
+    // Check that the regexp is valid.
+    Regex CheckRE(Regexp);
+    std::string REError;
+    if (!CheckRE.isValid(REError)) {
+      Error = (Twine("Malformed regex in line ") + Twine(LineNo) + ": '" +
+               SplitLine.second + "': " + REError).str();
+      return false;
+    }
+
+    // Add this regexp into the proper group by its prefix.
+    if (!Regexps[Prefix][Category].empty())
+      Regexps[Prefix][Category] += "|";
+    Regexps[Prefix][Category] += "^" + Regexp + "$";
+  }
+
+  // Iterate through each of the prefixes, and create Regexs for them.
+  for (StringMap<StringMap<std::string> >::const_iterator I = Regexps.begin(),
+                                                          E = Regexps.end();
+       I != E; ++I) {
+    for (StringMap<std::string>::const_iterator II = I->second.begin(),
+                                                IE = I->second.end();
+         II != IE; ++II) {
+      Entries[I->getKey()][II->getKey()].RegEx = new Regex(II->getValue());
+    }
+  }
+  return true;
+}
+
+SpecialCaseList::~SpecialCaseList() {
+  for (StringMap<StringMap<Entry> >::iterator I = Entries.begin(),
+                                              E = Entries.end();
+       I != E; ++I) {
+    for (StringMap<Entry>::const_iterator II = I->second.begin(),
+                                          IE = I->second.end();
+         II != IE; ++II) {
+      delete II->second.RegEx;
+    }
+  }
+}
+
+bool SpecialCaseList::isIn(const Function& F, const StringRef Category) const {
+  return isIn(*F.getParent(), Category) ||
+         inSectionCategory("fun", F.getName(), Category);
+}
+
+static StringRef GetGlobalTypeString(const GlobalValue &G) {
+  // Types of GlobalVariables are always pointer types.
+  Type *GType = G.getType()->getElementType();
+  // For now we support blacklisting struct types only.
+  if (StructType *SGType = dyn_cast<StructType>(GType)) {
+    if (!SGType->isLiteral())
+      return SGType->getName();
+  }
+  return "<unknown type>";
+}
+
+bool SpecialCaseList::isIn(const GlobalVariable &G,
+                           const StringRef Category) const {
+  return isIn(*G.getParent(), Category) ||
+         inSectionCategory("global", G.getName(), Category) ||
+         inSectionCategory("type", GetGlobalTypeString(G), Category);
+}
+
+bool SpecialCaseList::isIn(const GlobalAlias &GA,
+                           const StringRef Category) const {
+  if (isIn(*GA.getParent(), Category))
+    return true;
+
+  if (isa<FunctionType>(GA.getType()->getElementType()))
+    return inSectionCategory("fun", GA.getName(), Category);
+
+  return inSectionCategory("global", GA.getName(), Category) ||
+         inSectionCategory("type", GetGlobalTypeString(GA), Category);
+}
+
+bool SpecialCaseList::isIn(const Module &M, const StringRef Category) const {
+  return inSectionCategory("src", M.getModuleIdentifier(), Category);
+}
+
+bool SpecialCaseList::inSectionCategory(const StringRef Section,
+                                        const StringRef Query,
+                                        const StringRef Category) const {
+  StringMap<StringMap<Entry> >::const_iterator I = Entries.find(Section);
+  if (I == Entries.end()) return false;
+  StringMap<Entry>::const_iterator II = I->second.find(Category);
+  if (II == I->second.end()) return false;
+
+  return II->getValue().match(Query);
+}
+
+}  // namespace llvm
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index 544c5ee..457fc80 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -22,14 +22,22 @@ using namespace llvm;
 
 // Out of line method to get vtable etc for class.
 void ValueMapTypeRemapper::anchor() {}
+void ValueMaterializer::anchor() {}
 
 Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
-                      ValueMapTypeRemapper *TypeMapper) {
+                      ValueMapTypeRemapper *TypeMapper,
+                      ValueMaterializer *Materializer) {
   ValueToValueMapTy::iterator I = VM.find(V);
   
   // If the value already exists in the map, use it.
   if (I != VM.end() && I->second) return I->second;
   
+  // If we have a materializer and it can materialize a value, use that.
+  if (Materializer) {
+    if (Value *NewV = Materializer->materializeValueFor(const_cast<Value*>(V)))
+      return VM[V] = NewV;
+  }
+
   // Global values do not need to be seeded into the VM if they
   // are using the identity mapping.
   if (isa<GlobalValue>(V) || isa<MDString>(V))
@@ -64,7 +72,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
     for (unsigned i = 0, e = MD->getNumOperands(); i != e; ++i) {
       Value *OP = MD->getOperand(i);
       if (OP == 0) continue;
-      Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper);
+      Value *Mapped_OP = MapValue(OP, VM, Flags, TypeMapper, Materializer);
       // Use identity map if Mapped_Op is null and we can ignore missing
       // entries.
       if (Mapped_OP == OP ||
@@ -79,7 +87,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
         if (Op == 0)
           Elts.push_back(0);
         else {
-          Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper);
+          Value *Mapped_Op = MapValue(Op, VM, Flags, TypeMapper, Materializer);
           // Use identity map if Mapped_Op is null and we can ignore missing
           // entries.
           if (Mapped_Op == 0 && (Flags & RF_IgnoreMissingEntries))
@@ -109,9 +117,9 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
   
   if (BlockAddress *BA = dyn_cast<BlockAddress>(C)) {
     Function *F = 
-      cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper));
+      cast<Function>(MapValue(BA->getFunction(), VM, Flags, TypeMapper, Materializer));
     BasicBlock *BB = cast_or_null<BasicBlock>(MapValue(BA->getBasicBlock(), VM,
-                                                       Flags, TypeMapper));
+                                                       Flags, TypeMapper, Materializer));
     return VM[V] = BlockAddress::get(F, BB ? BB : BA->getBasicBlock());
   }
   
@@ -121,7 +129,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
   Value *Mapped = 0;
   for (; OpNo != NumOperands; ++OpNo) {
     Value *Op = C->getOperand(OpNo);
-    Mapped = MapValue(Op, VM, Flags, TypeMapper);
+    Mapped = MapValue(Op, VM, Flags, TypeMapper, Materializer);
     if (Mapped != C) break;
   }
   
@@ -149,7 +157,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
     // Map the rest of the operands that aren't processed yet.
     for (++OpNo; OpNo != NumOperands; ++OpNo)
       Ops.push_back(MapValue(cast<Constant>(C->getOperand(OpNo)), VM,
-                             Flags, TypeMapper));
+                             Flags, TypeMapper, Materializer));
   }
   
   if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
@@ -173,10 +181,11 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
 /// current values into those specified by VMap.
 ///
 void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
-                            RemapFlags Flags, ValueMapTypeRemapper *TypeMapper){
+                            RemapFlags Flags, ValueMapTypeRemapper *TypeMapper,
+                            ValueMaterializer *Materializer){
   // Remap operands.
   for (User::op_iterator op = I->op_begin(), E = I->op_end(); op != E; ++op) {
-    Value *V = MapValue(*op, VMap, Flags, TypeMapper);
+    Value *V = MapValue(*op, VMap, Flags, TypeMapper, Materializer);
     // If we aren't ignoring missing entries, assert that something happened.
     if (V != 0)
       *op = V;
@@ -204,7 +213,7 @@ void llvm::RemapInstruction(Instruction *I, ValueToValueMapTy &VMap,
   for (SmallVectorImpl<std::pair<unsigned, MDNode *> >::iterator
        MI = MDs.begin(), ME = MDs.end(); MI != ME; ++MI) {
     MDNode *Old = MI->second;
-    MDNode *New = MapValue(Old, VMap, Flags, TypeMapper);
+    MDNode *New = MapValue(Old, VMap, Flags, TypeMapper, Materializer);
     if (New != Old)
       I->setMetadata(MI->first, New);
   }
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 17900da..c5e1dcb 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -356,7 +356,7 @@ namespace {
                      Instruction *J, unsigned o, bool IBeforeJ);
 
     void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
-                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+                     Instruction *J, SmallVectorImpl<Value *> &ReplacedOperands,
                      bool IBeforeJ);
 
     void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
@@ -533,7 +533,7 @@ namespace {
       default: break;
       case Instruction::GetElementPtr:
         // We mark this instruction as zero-cost because scalar GEPs are usually
-        // lowered to the intruction addressing mode. At the moment we don't
+        // lowered to the instruction addressing mode. At the moment we don't
         // generate vector GEPs.
         return 0;
       case Instruction::Br:
@@ -625,10 +625,10 @@ namespace {
         ConstantInt *IntOff = ConstOffSCEV->getValue();
         int64_t Offset = IntOff->getSExtValue();
 
-        Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+        Type *VTy = IPtr->getType()->getPointerElementType();
         int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
 
-        Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType();
+        Type *VTy2 = JPtr->getType()->getPointerElementType();
         if (VTy != VTy2 && Offset < 0) {
           int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2);
           OffsetInElmts = Offset/VTy2TSS;
@@ -1182,6 +1182,8 @@ namespace {
       // Look for an instruction with which to pair instruction *I...
       DenseSet<Value *> Users;
       AliasSetTracker WriteSet(*AA);
+      if (I->mayWriteToMemory()) WriteSet.add(I);
+
       bool JAfterStart = IAfterStart;
       BasicBlock::iterator J = llvm::next(I);
       for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
@@ -1403,6 +1405,8 @@ namespace {
 
       DenseSet<Value *> Users;
       AliasSetTracker WriteSet(*AA);
+      if (I->mayWriteToMemory()) WriteSet.add(I);
+
       for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) {
         (void) trackUsesOfI(Users, WriteSet, I, J);
 
@@ -1602,7 +1606,7 @@ namespace {
         DenseSet<ValuePair> CurrentPairs;
 
         bool CanAdd = true;
-        for (SmallVector<ValuePairWithDepth, 8>::iterator C2
+        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
               = BestChildren.begin(), E2 = BestChildren.end();
              C2 != E2; ++C2) {
           if (C2->first.first == C->first.first ||
@@ -1642,7 +1646,7 @@ namespace {
         if (!CanAdd) continue;
 
         // And check the queue too...
-        for (SmallVector<ValuePairWithDepth, 32>::iterator C2 = Q.begin(),
+        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2 = Q.begin(),
              E2 = Q.end(); C2 != E2; ++C2) {
           if (C2->first.first == C->first.first ||
               C2->first.first == C->first.second ||
@@ -1691,7 +1695,7 @@ namespace {
         // to an already-selected child. Check for this here, and if a
         // conflict is found, then remove the previously-selected child
         // before adding this one in its place.
-        for (SmallVector<ValuePairWithDepth, 8>::iterator C2
+        for (SmallVectorImpl<ValuePairWithDepth>::iterator C2
               = BestChildren.begin(); C2 != BestChildren.end();) {
           if (C2->first.first == C->first.first ||
               C2->first.first == C->first.second ||
@@ -1706,7 +1710,7 @@ namespace {
         BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
       }
 
-      for (SmallVector<ValuePairWithDepth, 8>::iterator C
+      for (SmallVectorImpl<ValuePairWithDepth>::iterator C
             = BestChildren.begin(), E2 = BestChildren.end();
            C != E2; ++C) {
         size_t DepthF = getDepthFactor(C->first.first);
@@ -2227,11 +2231,12 @@ namespace {
     // The pointer value is taken to be the one with the lowest offset.
     Value *VPtr = IPtr;
 
-    Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
-    Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
+    Type *ArgTypeI = IPtr->getType()->getPointerElementType();
+    Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
     Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
-    Type *VArgPtrType = PointerType::get(VArgType,
-      cast<PointerType>(IPtr->getType())->getAddressSpace());
+    Type *VArgPtrType
+      = PointerType::get(VArgType,
+                         IPtr->getType()->getPointerAddressSpace());
     return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
                         /* insert before */ I);
   }
@@ -2240,7 +2245,7 @@ namespace {
                      unsigned MaskOffset, unsigned NumInElem,
                      unsigned NumInElem1, unsigned IdxOffset,
                      std::vector<Constant*> &Mask) {
-    unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements();
+    unsigned NumElem1 = J->getType()->getVectorNumElements();
     for (unsigned v = 0; v < NumElem1; ++v) {
       int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
       if (m < 0) {
@@ -2267,18 +2272,18 @@ namespace {
     Type *ArgTypeJ = J->getType();
     Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
 
-    unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements();
+    unsigned NumElemI = ArgTypeI->getVectorNumElements();
 
     // Get the total number of elements in the fused vector type.
     // By definition, this must equal the number of elements in
     // the final mask.
-    unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
+    unsigned NumElem = VArgType->getVectorNumElements();
     std::vector<Constant*> Mask(NumElem);
 
     Type *OpTypeI = I->getOperand(0)->getType();
-    unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements();
+    unsigned NumInElemI = OpTypeI->getVectorNumElements();
     Type *OpTypeJ = J->getOperand(0)->getType();
-    unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements();
+    unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
 
     // The fused vector will be:
     // -----------------------------------------------------
@@ -2340,6 +2345,12 @@ namespace {
     return ExpandedIEChain;
   }
 
+  static unsigned getNumScalarElements(Type *Ty) {
+    if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+      return VecTy->getNumElements();
+    return 1;
+  }
+
   // Returns the value to be used as the specified operand of the vector
   // instruction that fuses I with J.
   Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
@@ -2355,17 +2366,8 @@ namespace {
     Instruction *L = I, *H = J;
     Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
 
-    unsigned numElemL;
-    if (ArgTypeL->isVectorTy())
-      numElemL = cast<VectorType>(ArgTypeL)->getNumElements();
-    else
-      numElemL = 1;
-
-    unsigned numElemH;
-    if (ArgTypeH->isVectorTy())
-      numElemH = cast<VectorType>(ArgTypeH)->getNumElements();
-    else
-      numElemH = 1;
+    unsigned numElemL = getNumScalarElements(ArgTypeL);
+    unsigned numElemH = getNumScalarElements(ArgTypeH);
 
     Value *LOp = L->getOperand(o);
     Value *HOp = H->getOperand(o);
@@ -2426,11 +2428,12 @@ namespace {
 
       if (CanUseInputs) {
         unsigned LOpElem =
-          cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType())
-            ->getNumElements();
+          cast<Instruction>(LOp)->getOperand(0)->getType()
+            ->getVectorNumElements();
+
         unsigned HOpElem =
-          cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType())
-            ->getNumElements();
+          cast<Instruction>(HOp)->getOperand(0)->getType()
+            ->getVectorNumElements();
 
         // We have one or two input vectors. We need to map each index of the
         // operands to the index of the original vector.
@@ -2646,14 +2649,14 @@ namespace {
                                            getReplacementName(IBeforeJ ? I : J,
                                                               true, o, 1));
         }
-  
+
         NHOp->insertBefore(IBeforeJ ? J : I);
         HOp = NHOp;
       }
     }
 
     if (ArgType->isVectorTy()) {
-      unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+      unsigned numElem = VArgType->getVectorNumElements();
       std::vector<Constant*> Mask(numElem);
       for (unsigned v = 0; v < numElem; ++v) {
         unsigned Idx = v;
@@ -2687,7 +2690,7 @@ namespace {
   // to the vector instruction that fuses I with J.
   void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
                      Instruction *I, Instruction *J,
-                     SmallVector<Value *, 3> &ReplacedOperands,
+                     SmallVectorImpl<Value *> &ReplacedOperands,
                      bool IBeforeJ) {
     unsigned NumOperands = I->getNumOperands();
 
@@ -2746,16 +2749,8 @@ namespace {
       VectorType *VType = getVecTypeForPair(IType, JType);
       unsigned numElem = VType->getNumElements();
 
-      unsigned numElemI, numElemJ;
-      if (IType->isVectorTy())
-        numElemI = cast<VectorType>(IType)->getNumElements();
-      else
-        numElemI = 1;
-
-      if (JType->isVectorTy())
-        numElemJ = cast<VectorType>(JType)->getNumElements();
-      else
-        numElemJ = 1;
+      unsigned numElemI = getNumScalarElements(IType);
+      unsigned numElemJ = getNumScalarElements(JType);
 
       if (IType->isVectorTy()) {
         std::vector<Constant*> Mask1(numElemI), Mask2(numElemI);
@@ -2804,6 +2799,8 @@ namespace {
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
+
     for (; cast<Instruction>(L) != J; ++L)
       (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs);
 
@@ -2824,6 +2821,8 @@ namespace {
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
+
     for (; cast<Instruction>(L) != J;) {
       if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSetPairs)) {
         // Move this instruction
@@ -2853,6 +2852,7 @@ namespace {
 
     DenseSet<Value *> Users;
     AliasSetTracker WriteSet(*AA);
+    if (I->mayWriteToMemory()) WriteSet.add(I);
 
     // Note: We cannot end the loop when we reach J because J could be moved
     // farther down the use chain by another instruction pairing. Also, J
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 7ae082f..07967d8 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -3,7 +3,6 @@ add_llvm_library(LLVMVectorize
   Vectorize.cpp
   LoopVectorize.cpp
   SLPVectorizer.cpp
-  VecUtils.cpp
   )
 
 add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 08d3725..5e75871 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -47,13 +47,15 @@
 
 #include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopIterator.h"
@@ -119,11 +121,14 @@ static const unsigned TinyTripCountUnrollThreshold = 128;
 /// than this number of comparisons.
 static const unsigned RuntimeMemoryCheckThreshold = 8;
 
-/// We use a metadata with this name  to indicate that a scalar loop was
-/// vectorized and that we don't need to re-vectorize it if we run into it
-/// again.
-static const char*
-AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
+/// Maximum simd width.
+static const unsigned MaxVectorWidth = 64;
+
+/// Maximum vectorization unroll count.
+static const unsigned MaxUnrollFactor = 16;
+
+/// The cost of a loop that is considered 'small' by the unroller.
+static const unsigned SmallLoopCost = 20;
 
 namespace {
 
@@ -166,7 +171,9 @@ public:
     updateAnalysis();
   }
 
-private:
+  virtual ~InnerLoopVectorizer() {}
+
+protected:
   /// A small list of PHINodes.
   typedef SmallVector<PHINode*, 4> PhiVector;
   /// When we unroll loops we have multiple vector values for each scalar.
@@ -174,6 +181,11 @@ private:
   /// originated from one scalar instruction.
   typedef SmallVector<Value*, 2> VectorParts;
 
+  // When we if-convert we need create edge masks. We have to cache values so
+  // that we don't end up with exponential recursion/IR.
+  typedef DenseMap<std::pair<BasicBlock*, BasicBlock*>,
+                   VectorParts> EdgeMaskCache;
+
   /// Add code that checks at runtime if the accessed arrays overlap.
   /// Returns the comparator value or NULL if no check is needed.
   Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
@@ -181,7 +193,13 @@ private:
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
-  void vectorizeLoop(LoopVectorizationLegality *Legal);
+  virtual void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+  /// \brief The Loop exit block may have single value PHI nodes where the
+  /// incoming value is 'Undef'. While vectorizing we only handled real values
+  /// that were defined inside the loop. Here we fix the 'undef case'.
+  /// See PR14725.
+  void fixLCSSAPHIs();
 
   /// A helper function that computes the predicate of the block BB, assuming
   /// that the header block of the loop is set to True. It returns the *entry*
@@ -195,16 +213,23 @@ private:
   void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
                             PhiVector *PV);
 
+  /// Vectorize a single PHINode in a block. This method handles the induction
+  /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+  /// arbitrary length vectors.
+  void widenPHIInstruction(Instruction *PN, VectorParts &Entry,
+                           LoopVectorizationLegality *Legal,
+                           unsigned UF, unsigned VF, PhiVector *PV);
+
   /// Insert the new loop to the loop hierarchy and pass manager
   /// and update the analysis passes.
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
   /// of scalars.
-  void scalarizeInstruction(Instruction *Instr);
+  virtual void scalarizeInstruction(Instruction *Instr);
 
   /// Vectorize Load and Store instructions,
-  void vectorizeMemoryInstruction(Instruction *Instr,
+  virtual void vectorizeMemoryInstruction(Instruction *Instr,
                                   LoopVectorizationLegality *Legal);
 
   /// Create a broadcast instruction. This method generates a broadcast
@@ -212,12 +237,12 @@ private:
   /// value. If this is the induction variable then we extend it to N, N+1, ...
   /// this is needed because each iteration in the loop corresponds to a SIMD
   /// element.
-  Value *getBroadcastInstrs(Value *V);
+  virtual Value *getBroadcastInstrs(Value *V);
 
   /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
   /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
   /// The sequence starts at StartIndex.
-  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
+  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
 
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
@@ -227,7 +252,7 @@ private:
   VectorParts &getVectorValue(Value *V);
 
   /// Generate a shuffle sequence that will reverse the vector Vec.
-  Value *reverseVector(Value *Vec);
+  virtual Value *reverseVector(Value *Vec);
 
   /// This is a helper class that holds the vectorizer state. It maps scalar
   /// instructions to vector instructions. When the code is 'unrolled' then
@@ -285,6 +310,8 @@ private:
   /// The vectorization SIMD factor to use. Each vector will have this many
   /// vector elements.
   unsigned VF;
+
+protected:
   /// The vectorization unroll factor to use. Each scalar is vectorized to this
   /// many different vector instructions.
   unsigned UF;
@@ -313,10 +340,57 @@ private:
   PHINode *Induction;
   /// The induction variable of the old basic block.
   PHINode *OldInduction;
+  /// Holds the extended (to the widest induction type) start index.
+  Value *ExtendedIdx;
   /// Maps scalars to widened vectors.
   ValueMap WidenMap;
+  EdgeMaskCache MaskCache;
 };
 
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+  InnerLoopUnroller(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
+                    DominatorTree *DT, DataLayout *DL,
+                    const TargetLibraryInfo *TLI, unsigned UnrollFactor) :
+    InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
+
+private:
+  virtual void scalarizeInstruction(Instruction *Instr);
+  virtual void vectorizeMemoryInstruction(Instruction *Instr,
+                                          LoopVectorizationLegality *Legal);
+  virtual Value *getBroadcastInstrs(Value *V);
+  virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
+  virtual Value *reverseVector(Value *Vec);
+};
+
+/// \brief Look for a meaningful debug location on the instruction or it's
+/// operands.
+static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+  if (!I)
+    return I;
+
+  DebugLoc Empty;
+  if (I->getDebugLoc() != Empty)
+    return I;
+
+  for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
+    if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
+      if (OpInst->getDebugLoc() != Empty)
+        return OpInst;
+  }
+
+  return I;
+}
+
+/// \brief Set the debug location in the builder using the debug location in the
+/// instruction.
+static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
+    B.SetCurrentDebugLocation(Inst->getDebugLoc());
+  else
+    B.SetCurrentDebugLocation(DebugLoc());
+}
+
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -333,10 +407,10 @@ private:
 class LoopVectorizationLegality {
 public:
   LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
-                            DominatorTree *DT, TargetTransformInfo* TTI,
-                            AliasAnalysis *AA, TargetLibraryInfo *TLI)
-      : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
-        Induction(0), HasFunNoNaNAttr(false) {}
+                            DominatorTree *DT, TargetLibraryInfo *TLI)
+      : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
+        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
+        MaxSafeDepDistBytes(-1U) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -372,7 +446,7 @@ public:
     MRK_FloatMax
   };
 
-  /// This POD struct holds information about reduction variables.
+  /// This struct holds information about reduction variables.
   struct ReductionDescriptor {
     ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
       Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
@@ -409,8 +483,8 @@ public:
     MinMaxReductionKind MinMaxKind;
   };
 
-  // This POD struct holds information about the memory runtime legality
-  // check that a group of pointers do not overlap.
+  /// This struct holds information about the memory runtime legality
+  /// check that a group of pointers do not overlap.
   struct RuntimePointerCheck {
     RuntimePointerCheck() : Need(false) {}
 
@@ -420,10 +494,13 @@ public:
       Pointers.clear();
       Starts.clear();
       Ends.clear();
+      IsWritePtr.clear();
+      DependencySetId.clear();
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr,
+                unsigned DepSetId);
 
     /// This flag indicates if we need to add the runtime check.
     bool Need;
@@ -435,9 +512,12 @@ public:
     SmallVector<const SCEV*, 2> Ends;
     /// Holds the information if this pointer is used for writing to memory.
     SmallVector<bool, 2> IsWritePtr;
+    /// Holds the id of the set of pointers that could be dependent because of a
+    /// shared underlying object.
+    SmallVector<unsigned, 2> DependencySetId;
   };
 
-  /// A POD for saving information about induction variables.
+  /// A struct for saving information about induction variables.
   struct InductionInfo {
     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
     InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
@@ -455,11 +535,6 @@ public:
   /// induction descriptor.
   typedef MapVector<PHINode*, InductionInfo> InductionList;
 
-  /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
-  /// respective Store/Load instruction(s) to calculate aliasing.
-  typedef MapVector<Value*, Instruction* > AliasMap;
-  typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
-
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -474,6 +549,9 @@ public:
   /// Returns the induction variables found in the loop.
   InductionList *getInductionVars() { return &Inductions; }
 
+  /// Returns the widest induction type.
+  Type *getWidestInductionType() { return WidestIndTy; }
+
   /// Returns True if V is an induction variable in this loop.
   bool isInductionVariable(const Value *V);
 
@@ -503,6 +581,9 @@ public:
   /// This function returns the identity element (or neutral element) for
   /// the operation K.
   static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
+
+  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -523,8 +604,9 @@ private:
   void collectLoopUniforms();
 
   /// Return true if all of the instructions in the block can be speculatively
-  /// executed.
-  bool blockCanBePredicated(BasicBlock *BB);
+  /// executed. \p SafePtrs is a list of addresses that are known to be legal
+  /// and we know that we can read from them without segfault.
+  bool blockCanBePredicated(BasicBlock *BB, SmallPtrSet<Value *, 8>& SafePtrs);
 
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
@@ -543,16 +625,6 @@ private:
   /// Returns the induction kind of Phi. This function may return NoInduction
   /// if the PHI is not an induction variable.
   InductionKind isInductionVariable(PHINode *Phi);
-  /// Return true if can compute the address bounds of Ptr within the loop.
-  bool hasComputableBounds(Value *Ptr);
-  /// Return true if there is the chance of write reorder.
-  bool hasPossibleGlobalWriteReorder(Value *Object,
-                                     Instruction *Inst,
-                                     AliasMultiMap &WriteObjects,
-                                     unsigned MaxByteWidth);
-  /// Return the AA location for a load or a store.
-  AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
-
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -562,10 +634,6 @@ private:
   DataLayout *DL;
   /// Dominators.
   DominatorTree *DT;
-  /// Target Info.
-  TargetTransformInfo *TTI;
-  /// Alias Analysis.
-  AliasAnalysis *AA;
   /// Target Library Info.
   TargetLibraryInfo *TLI;
 
@@ -580,6 +648,8 @@ private:
   /// Notice that inductions don't need to start at zero and that induction
   /// variables can be pointers.
   InductionList Inductions;
+  /// Holds the widest induction type encountered.
+  Type *WidestIndTy;
 
   /// Allowed outside users. This holds the reduction
   /// vars which can be accessed from outside the loop.
@@ -592,6 +662,8 @@ private:
   RuntimePointerCheck PtrRtCheck;
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
+
+  unsigned MaxSafeDepDistBytes;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -684,12 +756,140 @@ private:
   const TargetLibraryInfo *TLI;
 };
 
+/// Utility class for getting and setting loop vectorizer hints in the form
+/// of loop metadata.
+struct LoopVectorizeHints {
+  /// Vectorization width.
+  unsigned Width;
+  /// Vectorization unroll factor.
+  unsigned Unroll;
+
+  LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
+  : Width(VectorizationFactor)
+  , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
+  , LoopID(L->getLoopID()) {
+    getHints(L);
+    // The command line options override any loop metadata except for when
+    // width == 1 which is used to indicate the loop is already vectorized.
+    if (VectorizationFactor.getNumOccurrences() > 0 && Width != 1)
+      Width = VectorizationFactor;
+    if (VectorizationUnroll.getNumOccurrences() > 0)
+      Unroll = VectorizationUnroll;
+
+    DEBUG(if (DisableUnrolling && Unroll == 1)
+            dbgs() << "LV: Unrolling disabled by the pass manager\n");
+  }
+
+  /// Return the loop vectorizer metadata prefix.
+  static StringRef Prefix() { return "llvm.vectorizer."; }
+
+  MDNode *createHint(LLVMContext &Context, StringRef Name, unsigned V) {
+    SmallVector<Value*, 2> Vals;
+    Vals.push_back(MDString::get(Context, Name));
+    Vals.push_back(ConstantInt::get(Type::getInt32Ty(Context), V));
+    return MDNode::get(Context, Vals);
+  }
+
+  /// Mark the loop L as already vectorized by setting the width to 1.
+  void setAlreadyVectorized(Loop *L) {
+    LLVMContext &Context = L->getHeader()->getContext();
+
+    Width = 1;
+
+    // Create a new loop id with one more operand for the already_vectorized
+    // hint. If the loop already has a loop id then copy the existing operands.
+    SmallVector<Value*, 4> Vals(1);
+    if (LoopID)
+      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i)
+        Vals.push_back(LoopID->getOperand(i));
+
+    Vals.push_back(createHint(Context, Twine(Prefix(), "width").str(), Width));
+    Vals.push_back(createHint(Context, Twine(Prefix(), "unroll").str(), 1));
+
+    MDNode *NewLoopID = MDNode::get(Context, Vals);
+    // Set operand 0 to refer to the loop id itself.
+    NewLoopID->replaceOperandWith(0, NewLoopID);
+
+    L->setLoopID(NewLoopID);
+    if (LoopID)
+      LoopID->replaceAllUsesWith(NewLoopID);
+
+    LoopID = NewLoopID;
+  }
+
+private:
+  MDNode *LoopID;
+
+  /// Find hints specified in the loop metadata.
+  void getHints(const Loop *L) {
+    if (!LoopID)
+      return;
+
+    // First operand should refer to the loop id itself.
+    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+      const MDString *S = 0;
+      SmallVector<Value*, 4> Args;
+
+      // The expected hint is either a MDString or a MDNode with the first
+      // operand a MDString.
+      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+        if (!MD || MD->getNumOperands() == 0)
+          continue;
+        S = dyn_cast<MDString>(MD->getOperand(0));
+        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+          Args.push_back(MD->getOperand(i));
+      } else {
+        S = dyn_cast<MDString>(LoopID->getOperand(i));
+        assert(Args.size() == 0 && "too many arguments for MDString");
+      }
+
+      if (!S)
+        continue;
+
+      // Check if the hint starts with the vectorizer prefix.
+      StringRef Hint = S->getString();
+      if (!Hint.startswith(Prefix()))
+        continue;
+      // Remove the prefix.
+      Hint = Hint.substr(Prefix().size(), StringRef::npos);
+
+      if (Args.size() == 1)
+        getHint(Hint, Args[0]);
+    }
+  }
+
+  // Check string hint with one operand.
+  void getHint(StringRef Hint, Value *Arg) {
+    const ConstantInt *C = dyn_cast<ConstantInt>(Arg);
+    if (!C) return;
+    unsigned Val = C->getZExtValue();
+
+    if (Hint == "width") {
+      if (isPowerOf2_32(Val) && Val <= MaxVectorWidth)
+        Width = Val;
+      else
+        DEBUG(dbgs() << "LV: ignoring invalid width hint metadata\n");
+    } else if (Hint == "unroll") {
+      if (isPowerOf2_32(Val) && Val <= MaxUnrollFactor)
+        Unroll = Val;
+      else
+        DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
+    } else {
+      DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
+    }
+  }
+};
+
 /// The LoopVectorize Pass.
 struct LoopVectorize : public LoopPass {
   /// Pass identification, replacement for typeid
   static char ID;
 
-  explicit LoopVectorize() : LoopPass(ID) {
+  explicit LoopVectorize(bool NoUnrolling = false)
+    : LoopPass(ID), DisableUnrolling(NoUnrolling) {
     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
   }
 
@@ -698,8 +898,8 @@ struct LoopVectorize : public LoopPass {
   LoopInfo *LI;
   TargetTransformInfo *TTI;
   DominatorTree *DT;
-  AliasAnalysis *AA;
   TargetLibraryInfo *TLI;
+  bool DisableUnrolling;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
     // We only vectorize innermost loops.
@@ -711,19 +911,30 @@ struct LoopVectorize : public LoopPass {
     LI = &getAnalysis<LoopInfo>();
     TTI = &getAnalysis<TargetTransformInfo>();
     DT = &getAnalysis<DominatorTree>();
-    AA = getAnalysisIfAvailable<AliasAnalysis>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
+    // If the target claims to have no vector registers don't attempt
+    // vectorization.
+    if (!TTI->getNumberOfRegisters(true))
+      return false;
+
     if (DL == NULL) {
-      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout\n");
       return false;
     }
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
 
+    LoopVectorizeHints Hints(L, DisableUnrolling);
+
+    if (Hints.Width == 1 && Hints.Unroll == 1) {
+      DEBUG(dbgs() << "LV: Not vectorizing.\n");
+      return false;
+    }
+
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
+    LoopVectorizationLegality LVL(L, SE, DL, DT, TLI);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing.\n");
       return false;
@@ -749,23 +960,30 @@ struct LoopVectorize : public LoopPass {
 
     // Select the optimal vectorization factor.
     LoopVectorizationCostModel::VectorizationFactor VF;
-    VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+    VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
     // Select the unroll factor.
-    unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
-                                        VF.Width, VF.Cost);
+    unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
+                                        VF.Cost);
+
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
+          F->getParent()->getModuleIdentifier() << '\n');
+    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << '\n');
 
     if (VF.Width == 1) {
       DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
-      return false;
+      if (UF == 1)
+        return false;
+      // We decided not to vectorize, but we may want to unroll.
+      InnerLoopUnroller Unroller(L, SE, LI, DT, DL, TLI, UF);
+      Unroller.vectorize(&LVL);
+    } else {
+      // If we decided that it is *legal* to vectorize the loop then do it.
+      InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
+      LB.vectorize(&LVL);
     }
 
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
-          F->getParent()->getModuleIdentifier()<<"\n");
-    DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
-
-    // If we decided that it is *legal* to vectorize the loop then do it.
-    InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
-    LB.vectorize(&LVL);
+    // Mark the loop as already vectorized to avoid vectorizing again.
+    Hints.setAlreadyVectorized(L);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
     return true;
@@ -795,38 +1013,34 @@ struct LoopVectorize : public LoopPass {
 void
 LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
                                                        Loop *Lp, Value *Ptr,
-                                                       bool WritePtr) {
+                                                       bool WritePtr,
+                                                       unsigned DepSetId) {
   const SCEV *Sc = SE->getSCEV(Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
   assert(AR && "Invalid addrec expression");
-  const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
+  const SCEV *Ex = SE->getBackedgeTakenCount(Lp);
   const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
   Pointers.push_back(Ptr);
   Starts.push_back(AR->getStart());
   Ends.push_back(ScEnd);
   IsWritePtr.push_back(WritePtr);
+  DependencySetId.push_back(DepSetId);
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Save the current insertion location.
-  Instruction *Loc = Builder.GetInsertPoint();
-
   // We need to place the broadcast of invariant variables outside the loop.
   Instruction *Instr = dyn_cast<Instruction>(V);
   bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
   bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
 
   // Place the code for broadcasting invariant variables in the new preheader.
+  IRBuilder<>::InsertPointGuard Guard(Builder);
   if (Invariant)
     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
 
   // Broadcast the scalar into all locations in the vector.
   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
 
-  // Restore the builder insertion point.
-  if (Invariant)
-    Builder.SetInsertPoint(Loc);
-
   return Shuf;
 }
 
@@ -853,10 +1067,35 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+static unsigned getGEPInductionOperand(DataLayout *DL,
+                                       const GetElementPtrInst *Gep) {
+  unsigned LastOperand = Gep->getNumOperands() - 1;
+  unsigned GEPAllocSize = DL->getTypeAllocSize(
+      cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+
+  // Walk backwards and try to peel off zeros.
+  while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
+    // Find the type we're currently indexing into.
+    gep_type_iterator GEPTI = gep_type_begin(Gep);
+    std::advance(GEPTI, LastOperand - 1);
+
+    // If it's a type with the same allocation size as the result of the GEP we
+    // can peel off the zero index.
+    if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+      break;
+    --LastOperand;
+  }
+
+  return LastOperand;
+}
+
 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
   // Make sure that the pointer does not point to structs.
-  if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+  if (Ptr->getType()->getPointerElementType()->isAggregateType())
     return 0;
 
   // If this value is a pointer induction variable we know it is consecutive.
@@ -874,8 +1113,6 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
     return 0;
 
   unsigned NumOperands = Gep->getNumOperands();
-  Value *LastIndex = Gep->getOperand(NumOperands - 1);
-
   Value *GpPtr = Gep->getPointerOperand();
   // If this GEP value is a consecutive pointer induction variable and all of
   // the indices are constant then we know it is consecutive. We can
@@ -899,14 +1136,18 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
       return -1;
   }
 
-  // Check that all of the gep indices are uniform except for the last.
-  for (unsigned i = 0; i < NumOperands - 1; ++i)
-    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+  unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
+
+  // Check that all of the gep indices are uniform except for our induction
+  // operand.
+  for (unsigned i = 0; i != NumOperands; ++i)
+    if (i != InductionOperand &&
+        !SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
       return 0;
 
-  // We can emit wide load/stores only if the last index is the induction
-  // variable.
-  const SCEV *Last = SE->getSCEV(LastIndex);
+  // We can emit wide load/stores only if the last non-zero index is the
+  // induction variable.
+  const SCEV *Last = SE->getSCEV(Gep->getOperand(InductionOperand));
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
     const SCEV *Step = AR->getStepRecurrence(*SE);
 
@@ -964,7 +1205,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
-
+  // An alignment of 0 means target abi alignment. We need to use the scalar's
+  // target abi alignment in such a case.
+  if (!Alignment)
+    Alignment = DL->getABITypeAlignment(ScalarDataTy);
+  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
 
@@ -985,6 +1230,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   // Handle consecutive loads/stores.
   GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
   if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+    setDebugLocFromInst(Builder, Gep);
     Value *PtrOperand = Gep->getPointerOperand();
     Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
     FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
@@ -995,26 +1241,40 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
     Gep2->setName("gep.indvar.base");
     Ptr = Builder.Insert(Gep2);
   } else if (Gep) {
+    setDebugLocFromInst(Builder, Gep);
     assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
                                OrigLoop) && "Base ptr must be invariant");
 
     // The last index does not have to be the induction. It can be
     // consecutive and be a function of the index. For example A[I+1];
     unsigned NumOperands = Gep->getNumOperands();
-
-    Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
-    VectorParts &GEPParts = getVectorValue(LastGepOperand);
-    Value *LastIndex = GEPParts[0];
-    LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
+    unsigned InductionOperand = getGEPInductionOperand(DL, Gep);
     // Create the new GEP with the new induction variable.
     GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-    Gep2->setOperand(NumOperands - 1, LastIndex);
-    Gep2->setName("gep.indvar.idx");
+
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      Value *GepOperand = Gep->getOperand(i);
+      Instruction *GepOperandInst = dyn_cast<Instruction>(GepOperand);
+
+      // Update last index or loop invariant instruction anchored in loop.
+      if (i == InductionOperand ||
+          (GepOperandInst && OrigLoop->contains(GepOperandInst))) {
+        assert((i == InductionOperand ||
+               SE->isLoopInvariant(SE->getSCEV(GepOperandInst), OrigLoop)) &&
+               "Must be last index or loop invariant");
+
+        VectorParts &GEPParts = getVectorValue(GepOperand);
+        Value *Index = GEPParts[0];
+        Index = Builder.CreateExtractElement(Index, Zero);
+        Gep2->setOperand(i, Index);
+        Gep2->setName("gep.indvar.idx");
+      }
+    }
     Ptr = Builder.Insert(Gep2);
   } else {
     // Use the induction element ptr.
     assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+    setDebugLocFromInst(Builder, Ptr);
     VectorParts &PtrVal = getVectorValue(Ptr);
     Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
   }
@@ -1023,8 +1283,11 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   if (SI) {
     assert(!Legal->isUniform(SI->getPointerOperand()) &&
            "We do not allow storing to uniform addresses");
+    setDebugLocFromInst(Builder, SI);
+    // We don't want to update the value in the map as it might be used in
+    // another expression. So don't use a reference type for "StoredVal".
+    VectorParts StoredVal = getVectorValue(SI->getValueOperand());
 
-    VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
     for (unsigned Part = 0; Part < UF; ++Part) {
       // Calculate the pointer for the specific unroll-part.
       Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
@@ -1039,11 +1302,16 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
       }
 
-      Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+      Value *VecPtr = Builder.CreateBitCast(PartPtr,
+                                            DataTy->getPointerTo(AddressSpace));
       Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
     }
+    return;
   }
 
+  // Handle loads.
+  assert(LI && "Must have a load instruction");
+  setDebugLocFromInst(Builder, LI);
   for (unsigned Part = 0; Part < UF; ++Part) {
     // Calculate the pointer for the specific unroll-part.
     Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
@@ -1055,7 +1323,8 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
     }
 
-    Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+    Value *VecPtr = Builder.CreateBitCast(PartPtr,
+                                          DataTy->getPointerTo(AddressSpace));
     Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
     cast<LoadInst>(LI)->setAlignment(Alignment);
     Entry[Part] = Reverse ? reverseVector(LI) :  LI;
@@ -1067,6 +1336,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<VectorParts, 4> Params;
 
+  setDebugLocFromInst(Builder, Instr);
+
   // Find all of the vectorized parameters.
   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
     Value *SrcOp = Instr->getOperand(op);
@@ -1112,7 +1383,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       Instruction *Cloned = Instr->clone();
       if (!IsVoidRetTy)
         Cloned->setName(Instr->getName() + ".cloned");
-      // Replace the operands of the cloned instrucions with extracted scalars.
+      // Replace the operands of the cloned instructions with extracted scalars.
       for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
         Value *Op = Params[op][Part];
         // Param is a vector. Need to extract the right lane.
@@ -1142,16 +1413,13 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   if (!PtrRtCheck->Need)
     return NULL;
 
-  Instruction *MemoryRuntimeCheck = 0;
   unsigned NumPointers = PtrRtCheck->Pointers.size();
-  SmallVector<Value* , 2> Starts;
-  SmallVector<Value* , 2> Ends;
+  SmallVector<TrackingVH<Value> , 2> Starts;
+  SmallVector<TrackingVH<Value> , 2> Ends;
 
+  LLVMContext &Ctx = Loc->getContext();
   SCEVExpander Exp(*SE, "induction");
 
-  // Use this type for pointer arithmetic.
-  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
-
   for (unsigned i = 0; i < NumPointers; ++i) {
     Value *Ptr = PtrRtCheck->Pointers[i];
     const SCEV *Sc = SE->getSCEV(Ptr);
@@ -1162,7 +1430,11 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
       Starts.push_back(Ptr);
       Ends.push_back(Ptr);
     } else {
-      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr << '\n');
+      unsigned AS = Ptr->getType()->getPointerAddressSpace();
+
+      // Use this type for pointer arithmetic.
+      Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
 
       Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
       Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
@@ -1172,17 +1444,32 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   }
 
   IRBuilder<> ChkBuilder(Loc);
-
+  // Our instructions might fold to a constant.
+  Value *MemoryRuntimeCheck = 0;
   for (unsigned i = 0; i < NumPointers; ++i) {
     for (unsigned j = i+1; j < NumPointers; ++j) {
       // No need to check if two readonly pointers intersect.
       if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
         continue;
 
-      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
-      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
-      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
-      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy, "bc");
+      // Only need to check pointers between two different dependency sets.
+      if (PtrRtCheck->DependencySetId[i] == PtrRtCheck->DependencySetId[j])
+       continue;
+
+      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+             "Trying to bounds check pointers with different address spaces");
+
+      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
+      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
 
       Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
       Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
@@ -1190,12 +1477,17 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
       if (MemoryRuntimeCheck)
         IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
                                          "conflict.rdx");
-
-      MemoryRuntimeCheck = cast<Instruction>(IsConflict);
+      MemoryRuntimeCheck = IsConflict;
     }
   }
 
-  return MemoryRuntimeCheck;
+  // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in a
+  // the block.
+  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+                                                 ConstantInt::getTrue(Ctx));
+  ChkBuilder.Insert(Check, "memcheck.conflict");
+  return Check;
 }
 
 void
@@ -1234,23 +1526,27 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
-  // Mark the old scalar loop with metadata that tells us not to vectorize this
-  // loop again if we run into it.
-  MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None);
-  OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
-
   // Some loops have a single integer induction variable, while other loops
   // don't. One example is c++ iterators that often have multiple pointer
   // induction variables. In the code below we also support a case where we
   // don't have a single induction variable.
   OldInduction = Legal->getInduction();
-  Type *IdxTy = OldInduction ? OldInduction->getType() :
-  DL->getIntPtrType(SE->getContext());
+  Type *IdxTy = Legal->getWidestInductionType();
 
   // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+  const SCEV *ExitCount = SE->getBackedgeTakenCount(OrigLoop);
   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
 
+  // The exit count might have the type of i64 while the phi is i32. This can
+  // happen if we have an induction variable that is sign extended before the
+  // compare. The only way that we get a backedge taken count is that the
+  // induction variable was signed and as such will not overflow. In such a case
+  // truncation is legal.
+  if (ExitCount->getType()->getPrimitiveSizeInBits() >
+      IdxTy->getPrimitiveSizeInBits())
+    ExitCount = SE->getTruncateOrNoop(ExitCount, IdxTy);
+
+  ExitCount = SE->getNoopOrZeroExtend(ExitCount, IdxTy);
   // Get the total trip count from the count by adding 1.
   ExitCount = SE->getAddExpr(ExitCount,
                              SE->getConstant(ExitCount->getType(), 1));
@@ -1266,9 +1562,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // The loop index does not have to start at Zero. Find the original start
   // value from the induction PHI node. If we don't have an induction variable
   // then we know that it starts at zero.
-  Value *StartIdx = OldInduction ?
-  OldInduction->getIncomingValueForBlock(BypassBlock):
-  ConstantInt::get(IdxTy, 0);
+  Builder.SetInsertPoint(BypassBlock->getTerminator());
+  Value *StartIdx = ExtendedIdx = OldInduction ?
+    Builder.CreateZExt(OldInduction->getIncomingValueForBlock(BypassBlock),
+                       IdxTy):
+    ConstantInt::get(IdxTy, 0);
 
   assert(BypassBlock && "Invalid loop structure");
   LoopBypassBlocks.push_back(BypassBlock);
@@ -1283,11 +1581,28 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   BasicBlock *ScalarPH =
   MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
 
+  // Create and register the new vector loop.
+  Loop* Lp = new Loop();
+  Loop *ParentLoop = OrigLoop->getParentLoop();
+
+  // Insert the new loop into the loop nest and register the new basic blocks
+  // before calling any utilities such as SCEV that require valid LoopInfo.
+  if (ParentLoop) {
+    ParentLoop->addChildLoop(Lp);
+    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
+    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  } else {
+    LI->addTopLevelLoop(Lp);
+  }
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
-  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstNonPHI());
 
   // Generate the induction variable.
+  setDebugLocFromInst(Builder, getDebugLocFromInstOrOperands(OldInduction));
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
@@ -1296,6 +1611,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // This is the IR builder that we use to add all of the logic for bypassing
   // the new vector loop.
   IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
+  setDebugLocFromInst(BypassBuilder,
+                      getDebugLocFromInstOrOperands(OldInduction));
 
   // We may need to extend the index in case there is a type mismatch.
   // We know that the count starts at zero and does not overflow.
@@ -1334,6 +1651,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     // Create a new block containing the memory check.
     BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
                                                           "vector.memcheck");
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(CheckBlock, LI->getBase());
     LoopBypassBlocks.push_back(CheckBlock);
 
     // Replace the branch into the memory check block with a conditional branch
@@ -1362,76 +1681,101 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   PHINode *ResumeIndex = 0;
   LoopVectorizationLegality::InductionList::iterator I, E;
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+  // Set builder to point to last bypass block.
+  BypassBuilder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
   for (I = List->begin(), E = List->end(); I != E; ++I) {
     PHINode *OrigPhi = I->first;
     LoopVectorizationLegality::InductionInfo II = I->second;
-    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+
+    Type *ResumeValTy = (OrigPhi == OldInduction) ? IdxTy : OrigPhi->getType();
+    PHINode *ResumeVal = PHINode::Create(ResumeValTy, 2, "resume.val",
                                          MiddleBlock->getTerminator());
+    // We might have extended the type of the induction variable but we need a
+    // truncated version for the scalar loop.
+    PHINode *TruncResumeVal = (OrigPhi == OldInduction) ?
+      PHINode::Create(OrigPhi->getType(), 2, "trunc.resume.val",
+                      MiddleBlock->getTerminator()) : 0;
+
     Value *EndValue = 0;
     switch (II.IK) {
     case LoopVectorizationLegality::IK_NoInduction:
       llvm_unreachable("Unknown induction");
     case LoopVectorizationLegality::IK_IntInduction: {
-      // Handle the integer induction counter:
+      // Handle the integer induction counter.
       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
-      assert(OrigPhi == OldInduction && "Unknown integer PHI");
-      // We know what the end value is.
-      EndValue = IdxEndRoundDown;
-      // We also know which PHI node holds it.
-      ResumeIndex = ResumeVal;
+
+      // We have the canonical induction variable.
+      if (OrigPhi == OldInduction) {
+        // Create a truncated version of the resume value for the scalar loop,
+        // we might have promoted the type to a larger width.
+        EndValue =
+          BypassBuilder.CreateTrunc(IdxEndRoundDown, OrigPhi->getType());
+        // The new PHI merges the original incoming value, in case of a bypass,
+        // or the value at the end of the vectorized loop.
+        for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+          TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+        TruncResumeVal->addIncoming(EndValue, VecBody);
+
+        // We know what the end value is.
+        EndValue = IdxEndRoundDown;
+        // We also know which PHI node holds it.
+        ResumeIndex = ResumeVal;
+        break;
+      }
+
+      // Not the canonical induction variable - add the vector loop count to the
+      // start value.
+      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
+                                                   II.StartValue->getType(),
+                                                   "cast.crd");
+      EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
       break;
     }
     case LoopVectorizationLegality::IK_ReverseIntInduction: {
       // Convert the CountRoundDown variable to the PHI size.
-      unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
-      unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
-      Value *CRD = CountRoundDown;
-      if (CRDSize > IISize)
-        CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
-                               II.StartValue->getType(), "tr.crd",
-                               LoopBypassBlocks.back()->getTerminator());
-      else if (CRDSize < IISize)
-        CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
-                               II.StartValue->getType(),
-                               "sext.crd",
-                               LoopBypassBlocks.back()->getTerminator());
-      // Handle reverse integer induction counter:
-      EndValue =
-        BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
-                                  LoopBypassBlocks.back()->getTerminator());
+      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
+                                                   II.StartValue->getType(),
+                                                   "cast.crd");
+      // Handle reverse integer induction counter.
+      EndValue = BypassBuilder.CreateSub(II.StartValue, CRD, "rev.ind.end");
       break;
     }
     case LoopVectorizationLegality::IK_PtrInduction: {
       // For pointer induction variables, calculate the offset using
       // the end index.
-      EndValue =
-        GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
-                                  LoopBypassBlocks.back()->getTerminator());
+      EndValue = BypassBuilder.CreateGEP(II.StartValue, CountRoundDown,
+                                         "ptr.ind.end");
       break;
     }
     case LoopVectorizationLegality::IK_ReversePtrInduction: {
       // The value at the end of the loop for the reverse pointer is calculated
       // by creating a GEP with a negative index starting from the start value.
       Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
-      Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown,
-                                  "rev.ind.end",
-                                  LoopBypassBlocks.back()->getTerminator());
-      EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx,
-                                  "rev.ptr.ind.end",
-                                  LoopBypassBlocks.back()->getTerminator());
+      Value *NegIdx = BypassBuilder.CreateSub(Zero, CountRoundDown,
+                                              "rev.ind.end");
+      EndValue = BypassBuilder.CreateGEP(II.StartValue, NegIdx,
+                                         "rev.ptr.ind.end");
       break;
     }
     }// end of case
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
-      ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) {
+      if (OrigPhi == OldInduction)
+        ResumeVal->addIncoming(StartIdx, LoopBypassBlocks[I]);
+      else
+        ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
+    }
     ResumeVal->addIncoming(EndValue, VecBody);
 
     // Fix the scalar body counter (PHI node).
     unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
-    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+    // The old inductions phi node in the scalar body needs the truncated value.
+    if (OrigPhi == OldInduction)
+      OrigPhi->setIncomingValue(BlockIdx, TruncResumeVal);
+    else
+      OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
   }
 
   // If we are generating a new induction variable then we also need to
@@ -1476,24 +1820,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
-  // Create and register the new vector loop.
-  Loop* Lp = new Loop();
-  Loop *ParentLoop = OrigLoop->getParentLoop();
-
-  // Insert the new loop into the loop nest and register the new basic blocks.
-  if (ParentLoop) {
-    ParentLoop->addChildLoop(Lp);
-    for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
-      ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
-    ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
-    ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
-    ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
-  } else {
-    LI->addTopLevelLoop(Lp);
-  }
-
-  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
   // Save the state.
   LoopVectorPreHeader = VectorPH;
   LoopScalarPreHeader = ScalarPH;
@@ -1501,6 +1827,9 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopExitBlock = ExitBlock;
   LoopVectorBody = VecBody;
   LoopScalarBody = OldBasicBlock;
+
+  LoopVectorizeHints Hints(Lp, true);
+  Hints.setAlreadyVectorized(Lp);
 }
 
 /// This function returns the identity element (or neutral element) for
@@ -1530,6 +1859,31 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
   }
 }
 
+static Intrinsic::ID checkUnaryFloatSignature(const CallInst &I,
+                                              Intrinsic::ID ValidIntrinsicID) {
+  if (I.getNumArgOperands() != 1 ||
+      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+      I.getType() != I.getArgOperand(0)->getType() ||
+      !I.onlyReadsMemory())
+    return Intrinsic::not_intrinsic;
+
+  return ValidIntrinsicID;
+}
+
+static Intrinsic::ID checkBinaryFloatSignature(const CallInst &I,
+                                               Intrinsic::ID ValidIntrinsicID) {
+  if (I.getNumArgOperands() != 2 ||
+      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+      !I.getArgOperand(1)->getType()->isFloatingPointTy() ||
+      I.getType() != I.getArgOperand(0)->getType() ||
+      I.getType() != I.getArgOperand(1)->getType() ||
+      !I.onlyReadsMemory())
+    return Intrinsic::not_intrinsic;
+
+  return ValidIntrinsicID;
+}
+
+
 static Intrinsic::ID
 getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
   // If we have an intrinsic call, check if it is trivially vectorizable.
@@ -1544,14 +1898,18 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
     case Intrinsic::log10:
     case Intrinsic::log2:
     case Intrinsic::fabs:
+    case Intrinsic::copysign:
     case Intrinsic::floor:
     case Intrinsic::ceil:
     case Intrinsic::trunc:
     case Intrinsic::rint:
     case Intrinsic::nearbyint:
+    case Intrinsic::round:
     case Intrinsic::pow:
     case Intrinsic::fma:
     case Intrinsic::fmuladd:
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
       return II->getIntrinsicID();
     default:
       return Intrinsic::not_intrinsic;
@@ -1564,8 +1922,9 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
   LibFunc::Func Func;
   Function *F = CI->getCalledFunction();
   // We're going to make assumptions on the semantics of the functions, check
-  // that the target knows that it's available in this environment.
-  if (!F || !TLI->getLibFunc(F->getName(), Func))
+  // that the target knows that it's available in this environment and it does
+  // not have local linkage.
+  if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(F->getName(), Func))
     return Intrinsic::not_intrinsic;
 
   // Otherwise check if we have a call to a function that can be turned into a
@@ -1576,59 +1935,67 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
   case LibFunc::sin:
   case LibFunc::sinf:
   case LibFunc::sinl:
-    return Intrinsic::sin;
+    return checkUnaryFloatSignature(*CI, Intrinsic::sin);
   case LibFunc::cos:
   case LibFunc::cosf:
   case LibFunc::cosl:
-    return Intrinsic::cos;
+    return checkUnaryFloatSignature(*CI, Intrinsic::cos);
   case LibFunc::exp:
   case LibFunc::expf:
   case LibFunc::expl:
-    return Intrinsic::exp;
+    return checkUnaryFloatSignature(*CI, Intrinsic::exp);
   case LibFunc::exp2:
   case LibFunc::exp2f:
   case LibFunc::exp2l:
-    return Intrinsic::exp2;
+    return checkUnaryFloatSignature(*CI, Intrinsic::exp2);
   case LibFunc::log:
   case LibFunc::logf:
   case LibFunc::logl:
-    return Intrinsic::log;
+    return checkUnaryFloatSignature(*CI, Intrinsic::log);
   case LibFunc::log10:
   case LibFunc::log10f:
   case LibFunc::log10l:
-    return Intrinsic::log10;
+    return checkUnaryFloatSignature(*CI, Intrinsic::log10);
   case LibFunc::log2:
   case LibFunc::log2f:
   case LibFunc::log2l:
-    return Intrinsic::log2;
+    return checkUnaryFloatSignature(*CI, Intrinsic::log2);
   case LibFunc::fabs:
   case LibFunc::fabsf:
   case LibFunc::fabsl:
-    return Intrinsic::fabs;
+    return checkUnaryFloatSignature(*CI, Intrinsic::fabs);
+  case LibFunc::copysign:
+  case LibFunc::copysignf:
+  case LibFunc::copysignl:
+    return checkBinaryFloatSignature(*CI, Intrinsic::copysign);
   case LibFunc::floor:
   case LibFunc::floorf:
   case LibFunc::floorl:
-    return Intrinsic::floor;
+    return checkUnaryFloatSignature(*CI, Intrinsic::floor);
   case LibFunc::ceil:
   case LibFunc::ceilf:
   case LibFunc::ceill:
-    return Intrinsic::ceil;
+    return checkUnaryFloatSignature(*CI, Intrinsic::ceil);
   case LibFunc::trunc:
   case LibFunc::truncf:
   case LibFunc::truncl:
-    return Intrinsic::trunc;
+    return checkUnaryFloatSignature(*CI, Intrinsic::trunc);
   case LibFunc::rint:
   case LibFunc::rintf:
   case LibFunc::rintl:
-    return Intrinsic::rint;
+    return checkUnaryFloatSignature(*CI, Intrinsic::rint);
   case LibFunc::nearbyint:
   case LibFunc::nearbyintf:
   case LibFunc::nearbyintl:
-    return Intrinsic::nearbyint;
+    return checkUnaryFloatSignature(*CI, Intrinsic::nearbyint);
+  case LibFunc::round:
+  case LibFunc::roundf:
+  case LibFunc::roundl:
+    return checkUnaryFloatSignature(*CI, Intrinsic::round);
   case LibFunc::pow:
   case LibFunc::powf:
   case LibFunc::powl:
-    return Intrinsic::pow;
+    return checkBinaryFloatSignature(*CI, Intrinsic::pow);
   }
 
   return Intrinsic::not_intrinsic;
@@ -1690,7 +2057,8 @@ Value *createMinMaxOp(IRBuilder<> &Builder,
   }
 
   Value *Cmp;
-  if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax)
+  if (RK == LoopVectorizationLegality::MRK_FloatMin ||
+      RK == LoopVectorizationLegality::MRK_FloatMax)
     Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
   else
     Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
@@ -1699,6 +2067,54 @@ Value *createMinMaxOp(IRBuilder<> &Builder,
   return Select;
 }
 
+namespace {
+struct CSEDenseMapInfo {
+  static bool canHandle(Instruction *I) {
+    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+  }
+  static inline Instruction *getEmptyKey() {
+    return DenseMapInfo<Instruction *>::getEmptyKey();
+  }
+  static inline Instruction *getTombstoneKey() {
+    return DenseMapInfo<Instruction *>::getTombstoneKey();
+  }
+  static unsigned getHashValue(Instruction *I) {
+    assert(canHandle(I) && "Unknown instruction!");
+    return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+                                                           I->value_op_end()));
+  }
+  static bool isEqual(Instruction *LHS, Instruction *RHS) {
+    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+        LHS == getTombstoneKey() || RHS == getTombstoneKey())
+      return LHS == RHS;
+    return LHS->isIdenticalTo(RHS);
+  }
+};
+}
+
+///\brief Perform cse of induction variable instructions.
+static void cse(BasicBlock *BB) {
+  // Perform simple cse.
+  SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+    Instruction *In = I++;
+
+    if (!CSEDenseMapInfo::canHandle(In))
+      continue;
+
+    // Check if we can replace this instruction with any of the
+    // visited instructions.
+    if (Instruction *V = CSEMap.lookup(In)) {
+      In->replaceAllUsesWith(V);
+      In->eraseFromParent();
+      continue;
+    }
+
+    CSEMap[In] = In;
+  }
+}
+
 void
 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -1750,6 +2166,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
     (*Legal->getReductionVars())[RdxPhi];
 
+    setDebugLocFromInst(Builder, RdxDesc.StartValue);
+
     // We need to generate a reduction vector from the incoming scalar.
     // To do so, we need to generate the 'identity' vector and overide
     // one of the elements with the incoming scalar reduction. We need
@@ -1767,18 +2185,31 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
         RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
       // MinMax reduction have the start value as their identify.
-      VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
-                                                         "minmax.ident");
+      if (VF == 1) {
+        VectorStart = Identity = RdxDesc.StartValue;
+      } else {
+        VectorStart = Identity = Builder.CreateVectorSplat(VF,
+                                                           RdxDesc.StartValue,
+                                                           "minmax.ident");
+      }
     } else {
+      // Handle other reduction kinds:
       Constant *Iden =
-        LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
-                                                        VecTy->getScalarType());
-      Identity = ConstantVector::getSplat(VF, Iden);
-
-      // This vector is the Identity vector where the first element is the
-      // incoming scalar reduction.
-      VectorStart = Builder.CreateInsertElement(Identity,
-                                                RdxDesc.StartValue, Zero);
+      LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+                                                      VecTy->getScalarType());
+      if (VF == 1) {
+        Identity = Iden;
+        // This vector is the Identity vector where the first element is the
+        // incoming scalar reduction.
+        VectorStart = RdxDesc.StartValue;
+      } else {
+        Identity = ConstantVector::getSplat(VF, Iden);
+
+        // This vector is the Identity vector where the first element is the
+        // incoming scalar reduction.
+        VectorStart = Builder.CreateInsertElement(Identity,
+                                                  RdxDesc.StartValue, Zero);
+      }
     }
 
     // Fix the vector-loop phi.
@@ -1793,7 +2224,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *LoopVal = RdxPhi->getIncomingValueForBlock(Latch);
     VectorParts &Val = getVectorValue(LoopVal);
     for (unsigned part = 0; part < UF; ++part) {
-      // Make sure to add the reduction stat value only to the 
+      // Make sure to add the reduction stat value only to the
       // first unroll part.
       Value *StartVal = (part == 0) ? VectorStart : Identity;
       cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
@@ -1807,6 +2238,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
 
     VectorParts RdxParts;
+    setDebugLocFromInst(Builder, RdxDesc.LoopExitInstr);
     for (unsigned part = 0; part < UF; ++part) {
       // This PHINode contains the vectorized reduction variable, or
       // the initial value vector, if we bypass the vector loop.
@@ -1822,6 +2254,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // Reduce all of the unrolled parts into a single vector.
     Value *ReducedPartRdx = RdxParts[0];
     unsigned Op = getReductionBinOp(RdxDesc.Kind);
+    setDebugLocFromInst(Builder, ReducedPartRdx);
     for (unsigned part = 1; part < UF; ++part) {
       if (Op != Instruction::ICmp && Op != Instruction::FCmp)
         ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
@@ -1832,37 +2265,40 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                         ReducedPartRdx, RdxParts[part]);
     }
 
-    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
-    // and vector ops, reducing the set of values being computed by half each
-    // round.
-    assert(isPowerOf2_32(VF) &&
-           "Reduction emission only supported for pow2 vectors!");
-    Value *TmpVec = ReducedPartRdx;
-    SmallVector<Constant*, 32> ShuffleMask(VF, 0);
-    for (unsigned i = VF; i != 1; i >>= 1) {
-      // Move the upper half of the vector to the lower half.
-      for (unsigned j = 0; j != i/2; ++j)
-        ShuffleMask[j] = Builder.getInt32(i/2 + j);
-
-      // Fill the rest of the mask with undef.
-      std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
-                UndefValue::get(Builder.getInt32Ty()));
-
-      Value *Shuf =
+    if (VF > 1) {
+      // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+      // and vector ops, reducing the set of values being computed by half each
+      // round.
+      assert(isPowerOf2_32(VF) &&
+             "Reduction emission only supported for pow2 vectors!");
+      Value *TmpVec = ReducedPartRdx;
+      SmallVector<Constant*, 32> ShuffleMask(VF, 0);
+      for (unsigned i = VF; i != 1; i >>= 1) {
+        // Move the upper half of the vector to the lower half.
+        for (unsigned j = 0; j != i/2; ++j)
+          ShuffleMask[j] = Builder.getInt32(i/2 + j);
+
+        // Fill the rest of the mask with undef.
+        std::fill(&ShuffleMask[i/2], ShuffleMask.end(),
+                  UndefValue::get(Builder.getInt32Ty()));
+
+        Value *Shuf =
         Builder.CreateShuffleVector(TmpVec,
                                     UndefValue::get(TmpVec->getType()),
                                     ConstantVector::get(ShuffleMask),
                                     "rdx.shuf");
 
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
-                                     "bin.rdx");
-      else
-        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
-    }
+        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+          TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+                                       "bin.rdx");
+        else
+          TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
+      }
 
-    // The result is in the first element of the vector.
-    Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+      // The result is in the first element of the vector.
+      ReducedPartRdx = Builder.CreateExtractElement(TmpVec,
+                                                    Builder.getInt32(0));
+    }
 
     // Now, we need to fix the users of the reduction variable
     // inside and outside of the scalar remainder loop.
@@ -1871,7 +2307,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
          LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
       PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
-      if (!LCSSAPhi) continue;
+      if (!LCSSAPhi) break;
 
       // All PHINodes need to have a single entry edge, or two if
       // we already fixed them.
@@ -1881,7 +2317,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
       // incoming bypass edge.
       if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
         // Add an edge coming from the bypass.
-        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
         break;
       }
     }// end of the LCSSA phi scan.
@@ -1893,29 +2329,38 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
     // Pick the other block.
     int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, ReducedPartRdx);
     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
 
-  // The Loop exit block may have single value PHI nodes where the incoming
-  // value is 'undef'. While vectorizing we only handled real values that
-  // were defined inside the loop. Here we handle the 'undef case'.
-  // See PR14725.
+  fixLCSSAPHIs();
+
+  // Remove redundant induction instructions.
+  cse(LoopVectorBody);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
   for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
        LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
     PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
-    if (!LCSSAPhi) continue;
+    if (!LCSSAPhi) break;
     if (LCSSAPhi->getNumIncomingValues() == 1)
       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                             LoopMiddleBlock);
   }
-}
+} 
 
 InnerLoopVectorizer::VectorParts
 InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
   assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
          "Invalid edge");
 
+  // Look for cached value.
+  std::pair<BasicBlock*, BasicBlock*> Edge(Src, Dst);
+  EdgeMaskCache::iterator ECEntryIt = MaskCache.find(Edge);
+  if (ECEntryIt != MaskCache.end())
+    return ECEntryIt->second;
+
   VectorParts SrcMask = createBlockInMask(Src);
 
   // The terminator has to be a branch inst!
@@ -1931,9 +2376,12 @@ InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
 
     for (unsigned part = 0; part < UF; ++part)
       EdgeMask[part] = Builder.CreateAnd(EdgeMask[part], SrcMask[part]);
+
+    MaskCache[Edge] = EdgeMask;
     return EdgeMask;
   }
 
+  MaskCache[Edge] = SrcMask;
   return SrcMask;
 }
 
@@ -1961,154 +2409,185 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
   return BlockMask;
 }
 
-void
-InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
-                                          BasicBlock *BB, PhiVector *PV) {
-  // For each instruction in the old loop.
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    VectorParts &Entry = WidenMap.get(it);
-    switch (it->getOpcode()) {
-    case Instruction::Br:
-      // Nothing to do for PHIs and BR, since we already took care of the
-      // loop control flow instructions.
-      continue;
-    case Instruction::PHI:{
-      PHINode* P = cast<PHINode>(it);
-      // Handle reduction variables:
-      if (Legal->getReductionVars()->count(P)) {
-        for (unsigned part = 0; part < UF; ++part) {
-          // This is phase one of vectorizing PHIs.
-          Type *VecTy = VectorType::get(it->getType(), VF);
-          Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
-                                        LoopVectorBody-> getFirstInsertionPt());
-        }
-        PV->push_back(P);
-        continue;
-      }
+void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
+                                              InnerLoopVectorizer::VectorParts &Entry,
+                                              LoopVectorizationLegality *Legal,
+                                              unsigned UF, unsigned VF, PhiVector *PV) {
+  PHINode* P = cast<PHINode>(PN);
+  // Handle reduction variables:
+  if (Legal->getReductionVars()->count(P)) {
+    for (unsigned part = 0; part < UF; ++part) {
+      // This is phase one of vectorizing PHIs.
+      Type *VecTy = (VF == 1) ? PN->getType() :
+      VectorType::get(PN->getType(), VF);
+      Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
+                                    LoopVectorBody-> getFirstInsertionPt());
+    }
+    PV->push_back(P);
+    return;
+  }
 
-      // Check for PHI nodes that are lowered to vector selects.
-      if (P->getParent() != OrigLoop->getHeader()) {
-        // We know that all PHIs in non header blocks are converted into
-        // selects, so we don't have to worry about the insertion order and we
-        // can just use the builder.
-        // At this point we generate the predication tree. There may be
-        // duplications since this is a simple recursive scan, but future
-        // optimizations will clean it up.
-
-        unsigned NumIncoming = P->getNumIncomingValues();
-        assert(NumIncoming > 1 && "Invalid PHI");
-
-        // Generate a sequence of selects of the form:
-        // SELECT(Mask3, In3,
-        //      SELECT(Mask2, In2,
-        //                   ( ...)))
-        for (unsigned In = 0; In < NumIncoming; In++) {
-          VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
-                                            P->getParent());
-          VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
-
-          for (unsigned part = 0; part < UF; ++part) {
-            // We don't need to 'select' the first PHI operand because it is
-            // the default value if all of the other masks don't match.
-            if (In == 0)
-              Entry[part] = In0[part];
-            else
-              // Select between the current value and the previous incoming edge
-              // based on the incoming mask.
-              Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
-                                                 Entry[part], "predphi");
-          }
-        }
-        continue;
+  setDebugLocFromInst(Builder, P);
+  // Check for PHI nodes that are lowered to vector selects.
+  if (P->getParent() != OrigLoop->getHeader()) {
+    // We know that all PHIs in non header blocks are converted into
+    // selects, so we don't have to worry about the insertion order and we
+    // can just use the builder.
+    // At this point we generate the predication tree. There may be
+    // duplications since this is a simple recursive scan, but future
+    // optimizations will clean it up.
+
+    unsigned NumIncoming = P->getNumIncomingValues();
+
+    // Generate a sequence of selects of the form:
+    // SELECT(Mask3, In3,
+    //      SELECT(Mask2, In2,
+    //                   ( ...)))
+    for (unsigned In = 0; In < NumIncoming; In++) {
+      VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
+                                        P->getParent());
+      VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+
+      for (unsigned part = 0; part < UF; ++part) {
+        // We might have single edge PHIs (blocks) - use an identity
+        // 'select' for the first PHI operand.
+        if (In == 0)
+          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+                                             In0[part]);
+        else
+          // Select between the current value and the previous incoming edge
+          // based on the incoming mask.
+          Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+                                             Entry[part], "predphi");
       }
+    }
+    return;
+  }
 
-      // This PHINode must be an induction variable.
-      // Make sure that we know about it.
-      assert(Legal->getInductionVars()->count(P) &&
-             "Not an induction variable");
+  // This PHINode must be an induction variable.
+  // Make sure that we know about it.
+  assert(Legal->getInductionVars()->count(P) &&
+         "Not an induction variable");
 
-      LoopVectorizationLegality::InductionInfo II =
-        Legal->getInductionVars()->lookup(P);
+  LoopVectorizationLegality::InductionInfo II =
+  Legal->getInductionVars()->lookup(P);
 
-      switch (II.IK) {
-      case LoopVectorizationLegality::IK_NoInduction:
-        llvm_unreachable("Unknown induction");
-      case LoopVectorizationLegality::IK_IntInduction: {
-        assert(P == OldInduction && "Unexpected PHI");
-        Value *Broadcasted = getBroadcastInstrs(Induction);
+  switch (II.IK) {
+    case LoopVectorizationLegality::IK_NoInduction:
+      llvm_unreachable("Unknown induction");
+    case LoopVectorizationLegality::IK_IntInduction: {
+      assert(P->getType() == II.StartValue->getType() && "Types must match");
+      Type *PhiTy = P->getType();
+      Value *Broadcasted;
+      if (P == OldInduction) {
+        // Handle the canonical induction variable. We might have had to
+        // extend the type.
+        Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
+      } else {
+        // Handle other induction variables that are now based on the
+        // canonical one.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
+                                                 "normalized.idx");
+        NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
+        Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
+                                        "offset.idx");
+      }
+      Broadcasted = getBroadcastInstrs(Broadcasted);
+      // After broadcasting the induction variable we need to make the vector
+      // consecutive by adding 0, 1, 2, etc.
+      for (unsigned part = 0; part < UF; ++part)
+        Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
+      return;
+    }
+    case LoopVectorizationLegality::IK_ReverseIntInduction:
+    case LoopVectorizationLegality::IK_PtrInduction:
+    case LoopVectorizationLegality::IK_ReversePtrInduction:
+      // Handle reverse integer and pointer inductions.
+      Value *StartIdx = ExtendedIdx;
+      // This is the normalized GEP that starts counting at zero.
+      Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                               "normalized.idx");
+
+      // Handle the reverse integer induction variable case.
+      if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
+        IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+        Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+                                               "resize.norm.idx");
+        Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
+                                               "reverse.idx");
+
+        // This is a new value so do not hoist it out.
+        Value *Broadcasted = getBroadcastInstrs(ReverseInd);
         // After broadcasting the induction variable we need to make the
-        // vector consecutive by adding 0, 1, 2 ...
+        // vector consecutive by adding  ... -3, -2, -1, 0.
         for (unsigned part = 0; part < UF; ++part)
-          Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
-        continue;
+          Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
+                                             true);
+        return;
       }
-      case LoopVectorizationLegality::IK_ReverseIntInduction:
-      case LoopVectorizationLegality::IK_PtrInduction:
-      case LoopVectorizationLegality::IK_ReversePtrInduction:
-        // Handle reverse integer and pointer inductions.
-        Value *StartIdx = 0;
-        // If we have a single integer induction variable then use it.
-        // Otherwise, start counting at zero.
-        if (OldInduction) {
-          LoopVectorizationLegality::InductionInfo OldII =
-            Legal->getInductionVars()->lookup(OldInduction);
-          StartIdx = OldII.StartValue;
-        } else {
-          StartIdx = ConstantInt::get(Induction->getType(), 0);
-        }
-        // This is the normalized GEP that starts counting at zero.
-        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
-                                                 "normalized.idx");
 
-        // Handle the reverse integer induction variable case.
-        if (LoopVectorizationLegality::IK_ReverseIntInduction == II.IK) {
-          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
-          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
-                                                 "resize.norm.idx");
-          Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
-                                                 "reverse.idx");
-
-          // This is a new value so do not hoist it out.
-          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
-          // After broadcasting the induction variable we need to make the
-          // vector consecutive by adding  ... -3, -2, -1, 0.
-          for (unsigned part = 0; part < UF; ++part)
-            Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
-                                               true);
+      // Handle the pointer induction variable case.
+      assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+      // Is this a reverse induction ptr or a consecutive induction ptr.
+      bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
+                      II.IK);
+
+      // This is the vector of results. Notice that we don't generate
+      // vector geps because scalar geps result in better code.
+      for (unsigned part = 0; part < UF; ++part) {
+        if (VF == 1) {
+          int EltIndex = (part) * (Reverse ? -1 : 1);
+          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+          Value *GlobalIdx;
+          if (Reverse)
+            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+          else
+            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+
+          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                             "next.gep");
+          Entry[part] = SclrGep;
           continue;
         }
 
-        // Handle the pointer induction variable case.
-        assert(P->getType()->isPointerTy() && "Unexpected type.");
-
-        // Is this a reverse induction ptr or a consecutive induction ptr.
-        bool Reverse = (LoopVectorizationLegality::IK_ReversePtrInduction ==
-                        II.IK);
-
-        // This is the vector of results. Notice that we don't generate
-        // vector geps because scalar geps result in better code.
-        for (unsigned part = 0; part < UF; ++part) {
-          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
-          for (unsigned int i = 0; i < VF; ++i) {
-            int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
-            Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
-            Value *GlobalIdx;
-            if (!Reverse)
-              GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
-            else
-              GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
-
-            Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
-                                               "next.gep");
-            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
-                                                 Builder.getInt32(i),
-                                                 "insert.gep");
-          }
-          Entry[part] = VecVal;
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          int EltIndex = (i + part * VF) * (Reverse ? -1 : 1);
+          Constant *Idx = ConstantInt::get(Induction->getType(), EltIndex);
+          Value *GlobalIdx;
+          if (!Reverse)
+            GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+          else
+            GlobalIdx = Builder.CreateSub(Idx, NormalizedIdx, "gep.ridx");
+
+          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                             "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
         }
-        continue;
+        Entry[part] = VecVal;
       }
+      return;
+  }
+}
 
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+                                          BasicBlock *BB, PhiVector *PV) {
+  // For each instruction in the old loop.
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    VectorParts &Entry = WidenMap.get(it);
+    switch (it->getOpcode()) {
+    case Instruction::Br:
+      // Nothing to do for PHIs and BR, since we already took care of the
+      // loop control flow instructions.
+      continue;
+    case Instruction::PHI:{
+      // Vectorize PHINodes.
+      widenPHIInstruction(it, Entry, Legal, UF, VF, PV);
+      continue;
     }// End of PHI.
 
     case Instruction::Add:
@@ -2131,6 +2610,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
     case Instruction::Xor: {
       // Just widen binops.
       BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+      setDebugLocFromInst(Builder, BinOp);
       VectorParts &A = getVectorValue(it->getOperand(0));
       VectorParts &B = getVectorValue(it->getOperand(1));
 
@@ -2157,6 +2637,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       // instruction with a scalar condition. Otherwise, use vector-select.
       bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(it->getOperand(0)),
                                                OrigLoop);
+      setDebugLocFromInst(Builder, it);
 
       // The condition can be loop invariant  but still defined inside the
       // loop. This means that we can't just use the original 'cond' value.
@@ -2165,8 +2646,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       VectorParts &Cond = getVectorValue(it->getOperand(0));
       VectorParts &Op0  = getVectorValue(it->getOperand(1));
       VectorParts &Op1  = getVectorValue(it->getOperand(2));
-      Value *ScalarCond = Builder.CreateExtractElement(Cond[0],
-                                                       Builder.getInt32(0));
+
+      Value *ScalarCond = (VF == 1) ? Cond[0] :
+        Builder.CreateExtractElement(Cond[0], Builder.getInt32(0));
+
       for (unsigned Part = 0; Part < UF; ++Part) {
         Entry[Part] = Builder.CreateSelect(
           InvariantCond ? ScalarCond : Cond[Part],
@@ -2181,6 +2664,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       // Widen compares. Generate vector compares.
       bool FCmp = (it->getOpcode() == Instruction::FCmp);
       CmpInst *Cmp = dyn_cast<CmpInst>(it);
+      setDebugLocFromInst(Builder, it);
       VectorParts &A = getVectorValue(it->getOperand(0));
       VectorParts &B = getVectorValue(it->getOperand(1));
       for (unsigned Part = 0; Part < UF; ++Part) {
@@ -2211,6 +2695,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
       CastInst *CI = dyn_cast<CastInst>(it);
+      setDebugLocFromInst(Builder, it);
       /// Optimize the special case where the source is the induction
       /// variable. Notice that we can only optimize the 'trunc' case
       /// because: a. FP conversions lose precision, b. sext/zext may wrap,
@@ -2225,7 +2710,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         break;
       }
       /// Vectorize casts.
-      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+      Type *DestTy = (VF == 1) ? CI->getType() :
+                                 VectorType::get(CI->getType(), VF);
 
       VectorParts &A = getVectorValue(it->getOperand(0));
       for (unsigned Part = 0; Part < UF; ++Part)
@@ -2237,20 +2723,32 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       // Ignore dbg intrinsics.
       if (isa<DbgInfoIntrinsic>(it))
         break;
+      setDebugLocFromInst(Builder, it);
 
       Module *M = BB->getParent()->getParent();
       CallInst *CI = cast<CallInst>(it);
       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
       assert(ID && "Not an intrinsic call!");
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        SmallVector<Value*, 4> Args;
-        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
-          VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
-          Args.push_back(Arg[Part]);
+      switch (ID) {
+      case Intrinsic::lifetime_end:
+      case Intrinsic::lifetime_start:
+        scalarizeInstruction(it);
+        break;
+      default:
+        for (unsigned Part = 0; Part < UF; ++Part) {
+          SmallVector<Value *, 4> Args;
+          for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
+            VectorParts &Arg = getVectorValue(CI->getArgOperand(i));
+            Args.push_back(Arg[Part]);
+          }
+          Type *Tys[] = {CI->getType()};
+          if (VF > 1)
+            Tys[0] = VectorType::get(CI->getType()->getScalarType(), VF);
+
+          Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+          Entry[Part] = Builder.CreateCall(F, Args);
         }
-        Type *Tys[] = { VectorType::get(CI->getType()->getScalarType(), VF) };
-        Function *F = Intrinsic::getDeclaration(M, ID, Tys);
-        Entry[Part] = Builder.CreateCall(F, Args);
+        break;
       }
       break;
     }
@@ -2283,24 +2781,65 @@ void InnerLoopVectorizer::updateAnalysis() {
   DEBUG(DT->verifyAnalysis());
 }
 
+/// \brief Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    PHINode *Phi = dyn_cast<PHINode>(I);
+    if (!Phi)
+      return true;
+    for (unsigned p = 0, e = Phi->getNumIncomingValues(); p != e; ++p)
+      if (Constant *C = dyn_cast<Constant>(Phi->getIncomingValue(p)))
+        if (C->canTrap())
+          return false;
+  }
+  return true;
+}
+
 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
   if (!EnableIfConversion)
     return false;
 
   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+
+  // A list of pointers that we can safely read and write to.
+  SmallPtrSet<Value *, 8> SafePointes;
+
+  // Collect safe addresses.
+  for (Loop::block_iterator BI = TheLoop->block_begin(),
+         BE = TheLoop->block_end(); BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
+
+    if (blockNeedsPredication(BB))
+      continue;
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        SafePointes.insert(LI->getPointerOperand());
+      else if (StoreInst *SI = dyn_cast<StoreInst>(I))
+        SafePointes.insert(SI->getPointerOperand());
+    }
+  }
 
   // Collect the blocks that need predication.
-  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
-    BasicBlock *BB = LoopBlocks[i];
+  BasicBlock *Header = TheLoop->getHeader();
+  for (Loop::block_iterator BI = TheLoop->block_begin(),
+         BE = TheLoop->block_end(); BI != BE; ++BI) {
+    BasicBlock *BB = *BI;
 
     // We don't support switch statements inside loops.
     if (!isa<BranchInst>(BB->getTerminator()))
       return false;
 
     // We must be able to predicate all blocks that need to be predicated.
-    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
+    if (blockNeedsPredication(BB)) {
+      if (!blockCanBePredicated(BB, SafePointes))
+        return false;
+    } else if (BB != Header && !canIfConvertPHINodes(BB))
       return false;
+
   }
 
   // We can if-convert this loop.
@@ -2325,27 +2864,26 @@ bool LoopVectorizationLegality::canVectorize() {
   if (!TheLoop->getExitingBlock())
     return false;
 
-  unsigned NumBlocks = TheLoop->getNumBlocks();
+  // We need to have a loop header.
+  DEBUG(dbgs() << "LV: Found a loop: " <<
+        TheLoop->getHeader()->getName() << '\n');
 
   // Check if we can if-convert non single-bb loops.
+  unsigned NumBlocks = TheLoop->getNumBlocks();
   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
     DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
     return false;
   }
 
-  // We need to have a loop header.
-  BasicBlock *Latch = TheLoop->getLoopLatch();
-  DEBUG(dbgs() << "LV: Found a loop: " <<
-        TheLoop->getHeader()->getName() << "\n");
-
   // ScalarEvolution needs to be able to find the exit count.
-  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
+  const SCEV *ExitCount = SE->getBackedgeTakenCount(TheLoop);
   if (ExitCount == SE->getCouldNotCompute()) {
     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
     return false;
   }
 
   // Do not loop-vectorize loops with a tiny trip count.
+  BasicBlock *Latch = TheLoop->getLoopLatch();
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
   if (TC > 0u && TC < TinyTripCountVectorThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
@@ -2378,6 +2916,26 @@ bool LoopVectorizationLegality::canVectorize() {
   return true;
 }
 
+static Type *convertPointerToIntegerType(DataLayout &DL, Type *Ty) {
+  if (Ty->isPointerTy())
+    return DL.getIntPtrType(Ty);
+
+  // It is possible that char's or short's overflow when we ask for the loop's
+  // trip count, work around this by changing the type size.
+  if (Ty->getScalarSizeInBits() < 32)
+    return Type::getInt32Ty(Ty->getContext());
+
+  return Ty;
+}
+
+static Type* getWiderType(DataLayout &DL, Type *Ty0, Type *Ty1) {
+  Ty0 = convertPointerToIntegerType(DL, Ty0);
+  Ty1 = convertPointerToIntegerType(DL, Ty1);
+  if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+    return Ty0;
+  return Ty1;
+}
+
 /// \brief Check that the instruction has outside loop users and is not an
 /// identified reduction variable.
 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
@@ -2391,7 +2949,7 @@ static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
       Instruction *U = cast<Instruction>(*I);
       // This user may be a reduction exit value.
       if (!TheLoop->contains(U)) {
-        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+        DEBUG(dbgs() << "LV: Found an outside user for : " << *U << '\n');
         return true;
       }
     }
@@ -2402,13 +2960,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
   BasicBlock *Header = TheLoop->getHeader();
 
-  // If we marked the scalar loop as "already vectorized" then no need
-  // to vectorize it again.
-  if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
-    DEBUG(dbgs() << "LV: This loop was vectorized before\n");
-    return false;
-  }
-
   // Look for the attribute signaling the absence of NaNs.
   Function &F = *Header->getParent();
   if (F.hasFnAttribute("no-nans-fp-math"))
@@ -2425,10 +2976,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
          ++it) {
 
       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+        Type *PhiTy = Phi->getType();
         // Check that this PHI type is allowed.
-        if (!Phi->getType()->isIntegerTy() &&
-            !Phi->getType()->isFloatingPointTy() &&
-            !Phi->getType()->isPointerTy()) {
+        if (!PhiTy->isIntegerTy() &&
+            !PhiTy->isFloatingPointTy() &&
+            !PhiTy->isPointerTy()) {
           DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
           return false;
         }
@@ -2456,17 +3008,29 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         InductionKind IK = isInductionVariable(Phi);
 
         if (IK_NoInduction != IK) {
+          // Get the widest type.
+          if (!WidestIndTy)
+            WidestIndTy = convertPointerToIntegerType(*DL, PhiTy);
+          else
+            WidestIndTy = getWiderType(*DL, PhiTy, WidestIndTy);
+
           // Int inductions are special because we only allow one IV.
           if (IK == IK_IntInduction) {
-            if (Induction) {
-              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
-              return false;
-            }
-            Induction = Phi;
+            // Use the phi node with the widest type as induction. Use the last
+            // one if there are multiple (no good reason for doing this other
+            // than it is expedient).
+            if (!Induction || PhiTy == WidestIndTy)
+              Induction = Phi;
           }
 
           DEBUG(dbgs() << "LV: Found an induction variable.\n");
           Inductions[Phi] = InductionInfo(StartValue, IK);
+
+          // Until we explicitly handle the case of an induction variable with
+          // an outside loop user we have to give up vectorizing this loop.
+          if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+            return false;
+
           continue;
         }
 
@@ -2503,7 +3067,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           continue;
         }
         if (AddReductionVar(Phi, RK_FloatMinMax)) {
-          DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
+          DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<
+                "\n");
           continue;
         }
 
@@ -2520,9 +3085,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
       }
 
       // Check that the instruction return type is vectorizable.
-      if (!VectorType::isValidElementType(it->getType()) &&
-          !it->getType()->isVoidTy()) {
-        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+      // Also, we can't vectorize extractelement instructions.
+      if ((!VectorType::isValidElementType(it->getType()) &&
+           !it->getType()->isVoidTy()) || isa<ExtractElementInst>(it)) {
+        DEBUG(dbgs() << "LV: Found unvectorizable type.\n");
         return false;
       }
 
@@ -2544,7 +3110,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
-    assert(getInductionVars()->size() && "No induction variables");
+    if (Inductions.empty())
+      return false;
   }
 
   return true;
@@ -2573,59 +3140,715 @@ void LoopVectorizationLegality::collectLoopUniforms() {
     Uniforms.insert(I);
 
     // Insert all operands.
-    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
-      Worklist.push_back(I->getOperand(i));
-    }
+    Worklist.insert(Worklist.end(), I->op_begin(), I->op_end());
   }
 }
 
-AliasAnalysis::Location
-LoopVectorizationLegality::getLoadStoreLocation(Instruction *Inst) {
-  if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
-    return AA->getLocation(Store);
-  else if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
-    return AA->getLocation(Load);
+namespace {
+/// \brief Analyses memory accesses in a loop.
+///
+/// Checks whether run time pointer checks are needed and builds sets for data
+/// dependence checking.
+class AccessAnalysis {
+public:
+  /// \brief Read or write access location.
+  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+
+  /// \brief Set of potential dependent memory accesses.
+  typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
+
+  AccessAnalysis(DataLayout *Dl, DepCandidates &DA) :
+    DL(Dl), DepCands(DA), AreAllWritesIdentified(true),
+    AreAllReadsIdentified(true), IsRTCheckNeeded(false) {}
+
+  /// \brief Register a load  and whether it is only read from.
+  void addLoad(Value *Ptr, bool IsReadOnly) {
+    Accesses.insert(MemAccessInfo(Ptr, false));
+    if (IsReadOnly)
+      ReadOnlyPtr.insert(Ptr);
+  }
 
-  llvm_unreachable("Should be either load or store instruction");
+  /// \brief Register a store.
+  void addStore(Value *Ptr) {
+    Accesses.insert(MemAccessInfo(Ptr, true));
+  }
+
+  /// \brief Check whether we can check the pointers at runtime for
+  /// non-intersection.
+  bool canCheckPtrAtRT(LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
+                       unsigned &NumComparisons, ScalarEvolution *SE,
+                       Loop *TheLoop, bool ShouldCheckStride = false);
+
+  /// \brief Goes over all memory accesses, checks whether a RT check is needed
+  /// and builds sets of dependent accesses.
+  void buildDependenceSets() {
+    // Process read-write pointers first.
+    processMemAccesses(false);
+    // Next, process read pointers.
+    processMemAccesses(true);
+  }
+
+  bool isRTCheckNeeded() { return IsRTCheckNeeded; }
+
+  bool isDependencyCheckNeeded() { return !CheckDeps.empty(); }
+  void resetDepChecks() { CheckDeps.clear(); }
+
+  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
+
+private:
+  typedef SetVector<MemAccessInfo> PtrAccessSet;
+  typedef DenseMap<Value*, MemAccessInfo> UnderlyingObjToAccessMap;
+
+  /// \brief Go over all memory access or only the deferred ones if
+  /// \p UseDeferred is true and check whether runtime pointer checks are needed
+  /// and build sets of dependency check candidates.
+  void processMemAccesses(bool UseDeferred);
+
+  /// Set of all accesses.
+  PtrAccessSet Accesses;
+
+  /// Set of access to check after all writes have been processed.
+  PtrAccessSet DeferredAccesses;
+
+  /// Map of pointers to last access encountered.
+  UnderlyingObjToAccessMap ObjToLastAccess;
+
+  /// Set of accesses that need a further dependence check.
+  MemAccessInfoSet CheckDeps;
+
+  /// Set of pointers that are read only.
+  SmallPtrSet<Value*, 16> ReadOnlyPtr;
+
+  /// Set of underlying objects already written to.
+  SmallPtrSet<Value*, 16> WriteObjects;
+
+  DataLayout *DL;
+
+  /// Sets of potentially dependent accesses - members of one set share an
+  /// underlying pointer. The set "CheckDeps" identfies which sets really need a
+  /// dependence check.
+  DepCandidates &DepCands;
+
+  bool AreAllWritesIdentified;
+  bool AreAllReadsIdentified;
+  bool IsRTCheckNeeded;
+};
+
+} // end anonymous namespace
+
+/// \brief Check whether a pointer can participate in a runtime bounds check.
+static bool hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
+  const SCEV *PtrScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+  if (!AR)
+    return false;
+
+  return AR->isAffine();
 }
 
-bool
-LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
-                                                Value *Object,
-                                                Instruction *Inst,
-                                                AliasMultiMap& WriteObjects,
-                                                unsigned MaxByteWidth) {
+/// \brief Check the stride of the pointer and ensure that it does not wrap in
+/// the address space.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+                        const Loop *Lp);
+
+bool AccessAnalysis::canCheckPtrAtRT(
+                       LoopVectorizationLegality::RuntimePointerCheck &RtCheck,
+                        unsigned &NumComparisons, ScalarEvolution *SE,
+                        Loop *TheLoop, bool ShouldCheckStride) {
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  unsigned NumReadPtrChecks = 0;
+  unsigned NumWritePtrChecks = 0;
+  bool CanDoRT = true;
+
+  bool IsDepCheckNeeded = isDependencyCheckNeeded();
+  // We assign consecutive id to access from different dependence sets.
+  // Accesses within the same set don't need a runtime check.
+  unsigned RunningDepId = 1;
+  DenseMap<Value *, unsigned> DepSetId;
+
+  for (PtrAccessSet::iterator AI = Accesses.begin(), AE = Accesses.end();
+       AI != AE; ++AI) {
+    const MemAccessInfo &Access = *AI;
+    Value *Ptr = Access.getPointer();
+    bool IsWrite = Access.getInt();
+
+    // Just add write checks if we have both.
+    if (!IsWrite && Accesses.count(MemAccessInfo(Ptr, true)))
+      continue;
+
+    if (IsWrite)
+      ++NumWritePtrChecks;
+    else
+      ++NumReadPtrChecks;
+
+    if (hasComputableBounds(SE, Ptr) &&
+        // When we run after a failing dependency check we have to make sure we
+        // don't have wrapping pointers.
+        (!ShouldCheckStride || isStridedPtr(SE, DL, Ptr, TheLoop) == 1)) {
+      // The id of the dependence set.
+      unsigned DepId;
+
+      if (IsDepCheckNeeded) {
+        Value *Leader = DepCands.getLeaderValue(Access).getPointer();
+        unsigned &LeaderId = DepSetId[Leader];
+        if (!LeaderId)
+          LeaderId = RunningDepId++;
+        DepId = LeaderId;
+      } else
+        // Each access has its own dependence set.
+        DepId = RunningDepId++;
+
+      RtCheck.insert(SE, TheLoop, Ptr, IsWrite, DepId);
+
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *Ptr << '\n');
+    } else {
+      CanDoRT = false;
+    }
+  }
 
-  AliasAnalysis::Location ThisLoc = getLoadStoreLocation(Inst);
+  if (IsDepCheckNeeded && CanDoRT && RunningDepId == 2)
+    NumComparisons = 0; // Only one dependence set.
+  else {
+    NumComparisons = (NumWritePtrChecks * (NumReadPtrChecks +
+                                           NumWritePtrChecks - 1));
+  }
 
-  std::vector<Instruction*>::iterator
-              it = WriteObjects[Object].begin(),
-              end = WriteObjects[Object].end();
+  // If the pointers that we would use for the bounds comparison have different
+  // address spaces, assume the values aren't directly comparable, so we can't
+  // use them for the runtime check. We also have to assume they could
+  // overlap. In the future there should be metadata for whether address spaces
+  // are disjoint.
+  unsigned NumPointers = RtCheck.Pointers.size();
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i + 1; j < NumPointers; ++j) {
+      // Only need to check pointers between two different dependency sets.
+      if (RtCheck.DependencySetId[i] == RtCheck.DependencySetId[j])
+       continue;
+
+      Value *PtrI = RtCheck.Pointers[i];
+      Value *PtrJ = RtCheck.Pointers[j];
+
+      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+      if (ASi != ASj) {
+        DEBUG(dbgs() << "LV: Runtime check would require comparison between"
+                       " different address spaces\n");
+        return false;
+      }
+    }
+  }
+
+  return CanDoRT;
+}
+
+static bool isFunctionScopeIdentifiedObject(Value *Ptr) {
+  return isNoAliasArgument(Ptr) || isNoAliasCall(Ptr) || isa<AllocaInst>(Ptr);
+}
 
-  for (; it != end; ++it) {
-    Instruction* I = *it;
-    if (I == Inst)
+void AccessAnalysis::processMemAccesses(bool UseDeferred) {
+  // We process the set twice: first we process read-write pointers, last we
+  // process read-only pointers. This allows us to skip dependence tests for
+  // read-only pointers.
+
+  PtrAccessSet &S = UseDeferred ? DeferredAccesses : Accesses;
+  for (PtrAccessSet::iterator AI = S.begin(), AE = S.end(); AI != AE; ++AI) {
+    const MemAccessInfo &Access = *AI;
+    Value *Ptr = Access.getPointer();
+    bool IsWrite = Access.getInt();
+
+    DepCands.insert(Access);
+
+    // Memorize read-only pointers for later processing and skip them in the
+    // first round (they need to be checked after we have seen all write
+    // pointers). Note: we also mark pointer that are not consecutive as
+    // "read-only" pointers (so that we check "a[b[i]] +="). Hence, we need the
+    // second check for "!IsWrite".
+    bool IsReadOnlyPtr = ReadOnlyPtr.count(Ptr) && !IsWrite;
+    if (!UseDeferred && IsReadOnlyPtr) {
+      DeferredAccesses.insert(Access);
       continue;
+    }
+
+    bool NeedDepCheck = false;
+    // Check whether there is the possiblity of dependency because of underlying
+    // objects being the same.
+    typedef SmallVector<Value*, 16> ValueVector;
+    ValueVector TempObjects;
+    GetUnderlyingObjects(Ptr, TempObjects, DL);
+    for (ValueVector::iterator UI = TempObjects.begin(), UE = TempObjects.end();
+         UI != UE; ++UI) {
+      Value *UnderlyingObj = *UI;
+
+      // If this is a write then it needs to be an identified object.  If this a
+      // read and all writes (so far) are identified function scope objects we
+      // don't need an identified underlying object but only an Argument (the
+      // next write is going to invalidate this assumption if it is
+      // unidentified).
+      // This is a micro-optimization for the case where all writes are
+      // identified and we have one argument pointer.
+      // Otherwise, we do need a runtime check.
+      if ((IsWrite && !isFunctionScopeIdentifiedObject(UnderlyingObj)) ||
+          (!IsWrite && (!AreAllWritesIdentified ||
+                        !isa<Argument>(UnderlyingObj)) &&
+           !isIdentifiedObject(UnderlyingObj))) {
+        DEBUG(dbgs() << "LV: Found an unidentified " <<
+              (IsWrite ?  "write" : "read" ) << " ptr: " << *UnderlyingObj <<
+              "\n");
+        IsRTCheckNeeded = (IsRTCheckNeeded ||
+                           !isIdentifiedObject(UnderlyingObj) ||
+                           !AreAllReadsIdentified);
+
+        if (IsWrite)
+          AreAllWritesIdentified = false;
+        if (!IsWrite)
+          AreAllReadsIdentified = false;
+      }
+
+      // If this is a write - check other reads and writes for conflicts.  If
+      // this is a read only check other writes for conflicts (but only if there
+      // is no other write to the ptr - this is an optimization to catch "a[i] =
+      // a[i] + " without having to do a dependence check).
+      if ((IsWrite || IsReadOnlyPtr) && WriteObjects.count(UnderlyingObj))
+        NeedDepCheck = true;
+
+      if (IsWrite)
+        WriteObjects.insert(UnderlyingObj);
+
+      // Create sets of pointers connected by shared underlying objects.
+      UnderlyingObjToAccessMap::iterator Prev =
+        ObjToLastAccess.find(UnderlyingObj);
+      if (Prev != ObjToLastAccess.end())
+        DepCands.unionSets(Access, Prev->second);
+
+      ObjToLastAccess[UnderlyingObj] = Access;
+    }
+
+    if (NeedDepCheck)
+      CheckDeps.insert(Access);
+  }
+}
+
+namespace {
+/// \brief Checks memory dependences among accesses to the same underlying
+/// object to determine whether there vectorization is legal or not (and at
+/// which vectorization factor).
+///
+/// This class works under the assumption that we already checked that memory
+/// locations with different underlying pointers are "must-not alias".
+/// We use the ScalarEvolution framework to symbolically evalutate access
+/// functions pairs. Since we currently don't restructure the loop we can rely
+/// on the program order of memory accesses to determine their safety.
+/// At the moment we will only deem accesses as safe for:
+///  * A negative constant distance assuming program order.
+///
+///      Safe: tmp = a[i + 1];     OR     a[i + 1] = x;
+///            a[i] = tmp;                y = a[i];
+///
+///   The latter case is safe because later checks guarantuee that there can't
+///   be a cycle through a phi node (that is, we check that "x" and "y" is not
+///   the same variable: a header phi can only be an induction or a reduction, a
+///   reduction can't have a memory sink, an induction can't have a memory
+///   source). This is important and must not be violated (or we have to
+///   resort to checking for cycles through memory).
+///
+///  * A positive constant distance assuming program order that is bigger
+///    than the biggest memory access.
+///
+///     tmp = a[i]        OR              b[i] = x
+///     a[i+2] = tmp                      y = b[i+2];
+///
+///     Safe distance: 2 x sizeof(a[0]), and 2 x sizeof(b[0]), respectively.
+///
+///  * Zero distances and all accesses have the same size.
+///
+class MemoryDepChecker {
+public:
+  typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
+  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+
+  MemoryDepChecker(ScalarEvolution *Se, DataLayout *Dl, const Loop *L)
+      : SE(Se), DL(Dl), InnermostLoop(L), AccessIdx(0),
+        ShouldRetryWithRuntimeCheck(false) {}
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a write access.
+  void addAccess(StoreInst *SI) {
+    Value *Ptr = SI->getPointerOperand();
+    Accesses[MemAccessInfo(Ptr, true)].push_back(AccessIdx);
+    InstMap.push_back(SI);
+    ++AccessIdx;
+  }
+
+  /// \brief Register the location (instructions are given increasing numbers)
+  /// of a write access.
+  void addAccess(LoadInst *LI) {
+    Value *Ptr = LI->getPointerOperand();
+    Accesses[MemAccessInfo(Ptr, false)].push_back(AccessIdx);
+    InstMap.push_back(LI);
+    ++AccessIdx;
+  }
+
+  /// \brief Check whether the dependencies between the accesses are safe.
+  ///
+  /// Only checks sets with elements in \p CheckDeps.
+  bool areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                   MemAccessInfoSet &CheckDeps);
+
+  /// \brief The maximum number of bytes of a vector register we can vectorize
+  /// the accesses safely with.
+  unsigned getMaxSafeDepDistBytes() { return MaxSafeDepDistBytes; }
+
+  /// \brief In same cases when the dependency check fails we can still
+  /// vectorize the loop with a dynamic array access check.
+  bool shouldRetryWithRuntimeCheck() { return ShouldRetryWithRuntimeCheck; }
+
+private:
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  const Loop *InnermostLoop;
+
+  /// \brief Maps access locations (ptr, read/write) to program order.
+  DenseMap<MemAccessInfo, std::vector<unsigned> > Accesses;
+
+  /// \brief Memory access instructions in program order.
+  SmallVector<Instruction *, 16> InstMap;
+
+  /// \brief The program order index to be used for the next instruction.
+  unsigned AccessIdx;
+
+  // We can access this many bytes in parallel safely.
+  unsigned MaxSafeDepDistBytes;
+
+  /// \brief If we see a non constant dependence distance we can still try to
+  /// vectorize this loop with runtime checks.
+  bool ShouldRetryWithRuntimeCheck;
+
+  /// \brief Check whether there is a plausible dependence between the two
+  /// accesses.
+  ///
+  /// Access \p A must happen before \p B in program order. The two indices
+  /// identify the index into the program order map.
+  ///
+  /// This function checks  whether there is a plausible dependence (or the
+  /// absence of such can't be proved) between the two accesses. If there is a
+  /// plausible dependence but the dependence distance is bigger than one
+  /// element access it records this distance in \p MaxSafeDepDistBytes (if this
+  /// distance is smaller than any other distance encountered so far).
+  /// Otherwise, this function returns true signaling a possible dependence.
+  bool isDependent(const MemAccessInfo &A, unsigned AIdx,
+                   const MemAccessInfo &B, unsigned BIdx);
+
+  /// \brief Check whether the data dependence could prevent store-load
+  /// forwarding.
+  bool couldPreventStoreLoadForward(unsigned Distance, unsigned TypeByteSize);
+};
+
+} // end anonymous namespace
+
+static bool isInBoundsGep(Value *Ptr) {
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
+    return GEP->isInBounds();
+  return false;
+}
 
-    AliasAnalysis::Location ThatLoc = getLoadStoreLocation(I);
-    if (AA->alias(ThisLoc.getWithNewSize(MaxByteWidth),
-                  ThatLoc.getWithNewSize(MaxByteWidth)))
+/// \brief Check whether the access through \p Ptr has a constant stride.
+static int isStridedPtr(ScalarEvolution *SE, DataLayout *DL, Value *Ptr,
+                        const Loop *Lp) {
+  const Type *Ty = Ptr->getType();
+  assert(Ty->isPointerTy() && "Unexpected non ptr");
+
+  // Make sure that the pointer does not point to aggregate types.
+  const PointerType *PtrTy = cast<PointerType>(Ty);
+  if (PtrTy->getElementType()->isAggregateType()) {
+    DEBUG(dbgs() << "LV: Bad stride - Not a pointer to a scalar type" << *Ptr <<
+          "\n");
+    return 0;
+  }
+
+  const SCEV *PtrScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+  if (!AR) {
+    DEBUG(dbgs() << "LV: Bad stride - Not an AddRecExpr pointer "
+          << *Ptr << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  // The accesss function must stride over the innermost loop.
+  if (Lp != AR->getLoop()) {
+    DEBUG(dbgs() << "LV: Bad stride - Not striding over innermost loop " <<
+          *Ptr << " SCEV: " << *PtrScev << "\n");
+  }
+
+  // The address calculation must not wrap. Otherwise, a dependence could be
+  // inverted.
+  // An inbounds getelementptr that is a AddRec with a unit stride
+  // cannot wrap per definition. The unit stride requirement is checked later.
+  // An getelementptr without an inbounds attribute and unit stride would have
+  // to access the pointer value "0" which is undefined behavior in address
+  // space 0, therefore we can also vectorize this case.
+  bool IsInBoundsGEP = isInBoundsGep(Ptr);
+  bool IsNoWrapAddRec = AR->getNoWrapFlags(SCEV::NoWrapMask);
+  bool IsInAddressSpaceZero = PtrTy->getAddressSpace() == 0;
+  if (!IsNoWrapAddRec && !IsInBoundsGEP && !IsInAddressSpaceZero) {
+    DEBUG(dbgs() << "LV: Bad stride - Pointer may wrap in the address space "
+          << *Ptr << " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  // Check the step is constant.
+  const SCEV *Step = AR->getStepRecurrence(*SE);
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C) {
+    DEBUG(dbgs() << "LV: Bad stride - Not a constant strided " << *Ptr <<
+          " SCEV: " << *PtrScev << "\n");
+    return 0;
+  }
+
+  int64_t Size = DL->getTypeAllocSize(PtrTy->getElementType());
+  const APInt &APStepVal = C->getValue()->getValue();
+
+  // Huge step value - give up.
+  if (APStepVal.getBitWidth() > 64)
+    return 0;
+
+  int64_t StepVal = APStepVal.getSExtValue();
+
+  // Strided access.
+  int64_t Stride = StepVal / Size;
+  int64_t Rem = StepVal % Size;
+  if (Rem)
+    return 0;
+
+  // If the SCEV could wrap but we have an inbounds gep with a unit stride we
+  // know we can't "wrap around the address space". In case of address space
+  // zero we know that this won't happen without triggering undefined behavior.
+  if (!IsNoWrapAddRec && (IsInBoundsGEP || IsInAddressSpaceZero) &&
+      Stride != 1 && Stride != -1)
+    return 0;
+
+  return Stride;
+}
+
+bool MemoryDepChecker::couldPreventStoreLoadForward(unsigned Distance,
+                                                    unsigned TypeByteSize) {
+  // If loads occur at a distance that is not a multiple of a feasible vector
+  // factor store-load forwarding does not take place.
+  // Positive dependences might cause troubles because vectorizing them might
+  // prevent store-load forwarding making vectorized code run a lot slower.
+  //   a[i] = a[i-3] ^ a[i-8];
+  //   The stores to a[i:i+1] don't align with the stores to a[i-3:i-2] and
+  //   hence on your typical architecture store-load forwarding does not take
+  //   place. Vectorizing in such cases does not make sense.
+  // Store-load forwarding distance.
+  const unsigned NumCyclesForStoreLoadThroughMemory = 8*TypeByteSize;
+  // Maximum vector factor.
+  unsigned MaxVFWithoutSLForwardIssues = MaxVectorWidth*TypeByteSize;
+  if(MaxSafeDepDistBytes < MaxVFWithoutSLForwardIssues)
+    MaxVFWithoutSLForwardIssues = MaxSafeDepDistBytes;
+
+  for (unsigned vf = 2*TypeByteSize; vf <= MaxVFWithoutSLForwardIssues;
+       vf *= 2) {
+    if (Distance % vf && Distance / vf < NumCyclesForStoreLoadThroughMemory) {
+      MaxVFWithoutSLForwardIssues = (vf >>=1);
+      break;
+    }
+  }
+
+  if (MaxVFWithoutSLForwardIssues< 2*TypeByteSize) {
+    DEBUG(dbgs() << "LV: Distance " << Distance <<
+          " that could cause a store-load forwarding conflict\n");
+    return true;
+  }
+
+  if (MaxVFWithoutSLForwardIssues < MaxSafeDepDistBytes &&
+      MaxVFWithoutSLForwardIssues != MaxVectorWidth*TypeByteSize)
+    MaxSafeDepDistBytes = MaxVFWithoutSLForwardIssues;
+  return false;
+}
+
+bool MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
+                                   const MemAccessInfo &B, unsigned BIdx) {
+  assert (AIdx < BIdx && "Must pass arguments in program order");
+
+  Value *APtr = A.getPointer();
+  Value *BPtr = B.getPointer();
+  bool AIsWrite = A.getInt();
+  bool BIsWrite = B.getInt();
+
+  // Two reads are independent.
+  if (!AIsWrite && !BIsWrite)
+    return false;
+
+  const SCEV *AScev = SE->getSCEV(APtr);
+  const SCEV *BScev = SE->getSCEV(BPtr);
+
+  int StrideAPtr = isStridedPtr(SE, DL, APtr, InnermostLoop);
+  int StrideBPtr = isStridedPtr(SE, DL, BPtr, InnermostLoop);
+
+  const SCEV *Src = AScev;
+  const SCEV *Sink = BScev;
+
+  // If the induction step is negative we have to invert source and sink of the
+  // dependence.
+  if (StrideAPtr < 0) {
+    //Src = BScev;
+    //Sink = AScev;
+    std::swap(APtr, BPtr);
+    std::swap(Src, Sink);
+    std::swap(AIsWrite, BIsWrite);
+    std::swap(AIdx, BIdx);
+    std::swap(StrideAPtr, StrideBPtr);
+  }
+
+  const SCEV *Dist = SE->getMinusSCEV(Sink, Src);
+
+  DEBUG(dbgs() << "LV: Src Scev: " << *Src << "Sink Scev: " << *Sink
+        << "(Induction step: " << StrideAPtr <<  ")\n");
+  DEBUG(dbgs() << "LV: Distance for " << *InstMap[AIdx] << " to "
+        << *InstMap[BIdx] << ": " << *Dist << "\n");
+
+  // Need consecutive accesses. We don't want to vectorize
+  // "A[B[i]] += ..." and similar code or pointer arithmetic that could wrap in
+  // the address space.
+  if (!StrideAPtr || !StrideBPtr || StrideAPtr != StrideBPtr){
+    DEBUG(dbgs() << "Non-consecutive pointer access\n");
+    return true;
+  }
+
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
+  if (!C) {
+    DEBUG(dbgs() << "LV: Dependence because of non constant distance\n");
+    ShouldRetryWithRuntimeCheck = true;
+    return true;
+  }
+
+  Type *ATy = APtr->getType()->getPointerElementType();
+  Type *BTy = BPtr->getType()->getPointerElementType();
+  unsigned TypeByteSize = DL->getTypeAllocSize(ATy);
+
+  // Negative distances are not plausible dependencies.
+  const APInt &Val = C->getValue()->getValue();
+  if (Val.isNegative()) {
+    bool IsTrueDataDependence = (AIsWrite && !BIsWrite);
+    if (IsTrueDataDependence &&
+        (couldPreventStoreLoadForward(Val.abs().getZExtValue(), TypeByteSize) ||
+         ATy != BTy))
       return true;
+
+    DEBUG(dbgs() << "LV: Dependence is negative: NoDep\n");
+    return false;
+  }
+
+  // Write to the same location with the same size.
+  // Could be improved to assert type sizes are the same (i32 == float, etc).
+  if (Val == 0) {
+    if (ATy == BTy)
+      return false;
+    DEBUG(dbgs() << "LV: Zero dependence difference but different types\n");
+    return true;
+  }
+
+  assert(Val.isStrictlyPositive() && "Expect a positive value");
+
+  // Positive distance bigger than max vectorization factor.
+  if (ATy != BTy) {
+    DEBUG(dbgs() <<
+          "LV: ReadWrite-Write positive dependency with different types\n");
+    return false;
   }
+
+  unsigned Distance = (unsigned) Val.getZExtValue();
+
+  // Bail out early if passed-in parameters make vectorization not feasible.
+  unsigned ForcedFactor = VectorizationFactor ? VectorizationFactor : 1;
+  unsigned ForcedUnroll = VectorizationUnroll ? VectorizationUnroll : 1;
+
+  // The distance must be bigger than the size needed for a vectorized version
+  // of the operation and the size of the vectorized operation must not be
+  // bigger than the currrent maximum size.
+  if (Distance < 2*TypeByteSize ||
+      2*TypeByteSize > MaxSafeDepDistBytes ||
+      Distance < TypeByteSize * ForcedUnroll * ForcedFactor) {
+    DEBUG(dbgs() << "LV: Failure because of Positive distance "
+        << Val.getSExtValue() << '\n');
+    return true;
+  }
+
+  MaxSafeDepDistBytes = Distance < MaxSafeDepDistBytes ?
+    Distance : MaxSafeDepDistBytes;
+
+  bool IsTrueDataDependence = (!AIsWrite && BIsWrite);
+  if (IsTrueDataDependence &&
+      couldPreventStoreLoadForward(Distance, TypeByteSize))
+     return true;
+
+  DEBUG(dbgs() << "LV: Positive distance " << Val.getSExtValue() <<
+        " with max VF = " << MaxSafeDepDistBytes / TypeByteSize << '\n');
+
   return false;
 }
 
+bool
+MemoryDepChecker::areDepsSafe(AccessAnalysis::DepCandidates &AccessSets,
+                              MemAccessInfoSet &CheckDeps) {
+
+  MaxSafeDepDistBytes = -1U;
+  while (!CheckDeps.empty()) {
+    MemAccessInfo CurAccess = *CheckDeps.begin();
+
+    // Get the relevant memory access set.
+    EquivalenceClasses<MemAccessInfo>::iterator I =
+      AccessSets.findValue(AccessSets.getLeaderValue(CurAccess));
+
+    // Check accesses within this set.
+    EquivalenceClasses<MemAccessInfo>::member_iterator AI, AE;
+    AI = AccessSets.member_begin(I), AE = AccessSets.member_end();
+
+    // Check every access pair.
+    while (AI != AE) {
+      CheckDeps.erase(*AI);
+      EquivalenceClasses<MemAccessInfo>::member_iterator OI = llvm::next(AI);
+      while (OI != AE) {
+        // Check every accessing instruction pair in program order.
+        for (std::vector<unsigned>::iterator I1 = Accesses[*AI].begin(),
+             I1E = Accesses[*AI].end(); I1 != I1E; ++I1)
+          for (std::vector<unsigned>::iterator I2 = Accesses[*OI].begin(),
+               I2E = Accesses[*OI].end(); I2 != I2E; ++I2) {
+            if (*I1 < *I2 && isDependent(*AI, *I1, *OI, *I2))
+              return false;
+            if (*I2 < *I1 && isDependent(*OI, *I2, *AI, *I1))
+              return false;
+          }
+        ++OI;
+      }
+      AI++;
+    }
+  }
+  return true;
+}
+
 bool LoopVectorizationLegality::canVectorizeMemory() {
 
   typedef SmallVector<Value*, 16> ValueVector;
   typedef SmallPtrSet<Value*, 16> ValueSet;
+
   // Holds the Load and Store *instructions*.
   ValueVector Loads;
   ValueVector Stores;
+
+  // Holds all the different accesses in the loop.
+  unsigned NumReads = 0;
+  unsigned NumReadWrites = 0;
+
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+  MemoryDepChecker DepChecker(SE, DL, TheLoop);
 
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
@@ -2639,6 +3862,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       // but is not a load, then we quit. Notice that we don't handle function
       // calls that read or write.
       if (it->mayReadFromMemory()) {
+        // Many math library functions read the rounding mode. We will only
+        // vectorize a loop if it contains known function calls that don't set
+        // the flag. Therefore, it is safe to ignore this read from memory.
+        CallInst *Call = dyn_cast<CallInst>(it);
+        if (Call && getIntrinsicIDForCall(Call, TLI))
+          continue;
+
         LoadInst *Ld = dyn_cast<LoadInst>(it);
         if (!Ld) return false;
         if (!Ld->isSimple() && !IsAnnotatedParallel) {
@@ -2646,6 +3876,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
           return false;
         }
         Loads.push_back(Ld);
+        DepChecker.addAccess(Ld);
         continue;
       }
 
@@ -2658,9 +3889,10 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
           return false;
         }
         Stores.push_back(St);
+        DepChecker.addAccess(St);
       }
-    } // next instr.
-  } // next block.
+    } // Next instr.
+  } // Next block.
 
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
@@ -2672,10 +3904,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
-  // Holds the read and read-write *pointers* that we find. These maps hold
-  // unique values for pointers (so no need for multi-map).
-  AliasMap Reads;
-  AliasMap ReadWrites;
+  AccessAnalysis::DepCandidates DependentAccesses;
+  AccessAnalysis Accesses(DL, DependentAccesses);
 
   // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
   // multiple times on the same object. If the ptr is accessed twice, once
@@ -2694,10 +3924,12 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       return false;
     }
 
-    // If we did *not* see this pointer before, insert it to
-    // the read-write list. At this phase it is only a 'write' list.
-    if (Seen.insert(Ptr))
-      ReadWrites.insert(std::make_pair(Ptr, ST));
+    // If we did *not* see this pointer before, insert it to  the read-write
+    // list. At this phase it is only a 'write' list.
+    if (Seen.insert(Ptr)) {
+      ++NumReadWrites;
+      Accesses.addStore(Ptr);
+    }
   }
 
   if (IsAnnotatedParallel) {
@@ -2718,51 +3950,44 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     // If the address of i is unknown (for example A[B[i]]) then we may
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
-    if (Seen.insert(Ptr) || 0 == isConsecutivePtr(Ptr))
-      Reads.insert(std::make_pair(Ptr, LD));
+    bool IsReadOnlyPtr = false;
+    if (Seen.insert(Ptr) || !isStridedPtr(SE, DL, Ptr, TheLoop)) {
+      ++NumReads;
+      IsReadOnlyPtr = true;
+    }
+    Accesses.addLoad(Ptr, IsReadOnlyPtr);
   }
 
   // If we write (or read-write) to a single destination and there are no
   // other reads in this loop then is it safe to vectorize.
-  if (ReadWrites.size() == 1 && Reads.size() == 0) {
+  if (NumReadWrites == 1 && NumReads == 0) {
     DEBUG(dbgs() << "LV: Found a write-only loop!\n");
     return true;
   }
 
-  unsigned NumReadPtrs = 0;
-  unsigned NumWritePtrs = 0;
+  // Build dependence sets and check whether we need a runtime pointer bounds
+  // check.
+  Accesses.buildDependenceSets();
+  bool NeedRTCheck = Accesses.isRTCheckNeeded();
 
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
-  bool CanDoRT = true;
-  AliasMap::iterator MI, ME;
-  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
-    Value *V = (*MI).first;
-    if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V, true);
-      NumWritePtrs++;
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
-    } else {
-      CanDoRT = false;
-      break;
-    }
-  }
-  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
-    Value *V = (*MI).first;
-    if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V, false);
-      NumReadPtrs++;
-      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
-    } else {
-      CanDoRT = false;
-      break;
-    }
-  }
+  unsigned NumComparisons = 0;
+  bool CanDoRT = false;
+  if (NeedRTCheck)
+    CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE, TheLoop);
+
+
+  DEBUG(dbgs() << "LV: We need to do " << NumComparisons <<
+        " pointer comparisons.\n");
 
-  // Check that we did not collect too many pointers or found a
-  // unsizeable pointer.
-  unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
-  DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
+  // If we only have one set of dependences to check pointers among we don't
+  // need a runtime check.
+  if (NumComparisons == 0 && NeedRTCheck)
+    NeedRTCheck = false;
+
+  // Check that we did not collect too many pointers or found an unsizeable
+  // pointer.
   if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
     CanDoRT = false;
@@ -2772,122 +3997,69 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
   }
 
-  bool NeedRTCheck = false;
-
-  // Biggest vectorized access possible, vector width * unroll factor.
-  // TODO: We're being very pessimistic here, find a way to know the
-  // real access width before getting here.
-  unsigned MaxByteWidth = (TTI->getRegisterBitWidth(true) / 8) *
-                           TTI->getMaximumUnrollFactor();
-  // Now that the pointers are in two lists (Reads and ReadWrites), we
-  // can check that there are no conflicts between each of the writes and
-  // between the writes to the reads.
-  // Note that WriteObjects duplicates the stores (indexed now by underlying
-  // objects) to avoid pointing to elements inside ReadWrites.
-  // TODO: Maybe create a new type where they can interact without duplication.
-  AliasMultiMap WriteObjects;
-  ValueVector TempObjects;
-
-  // Check that the read-writes do not conflict with other read-write
-  // pointers.
-  bool AllWritesIdentified = true;
-  for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
-    Value *Val = (*MI).first;
-    Instruction *Inst = (*MI).second;
-
-    GetUnderlyingObjects(Val, TempObjects, DL);
-    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
-         UI != UE; ++UI) {
-      if (!isIdentifiedObject(*UI)) {
-        DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **UI <<"\n");
-        NeedRTCheck = true;
-        AllWritesIdentified = false;
-      }
+  if (NeedRTCheck && !CanDoRT) {
+    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
+          "the array bounds.\n");
+    PtrRtCheck.reset();
+    return false;
+  }
 
-      // Never seen it before, can't alias.
-      if (WriteObjects[*UI].empty()) {
-        DEBUG(dbgs() << "LV: Adding Underlying value:" << **UI <<"\n");
-        WriteObjects[*UI].push_back(Inst);
-        continue;
-      }
-      // Direct alias found.
-      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
-        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
-              << **UI <<"\n");
-        return false;
-      }
-      DEBUG(dbgs() << "LV: Found a conflicting global value:"
-            << **UI <<"\n");
-      DEBUG(dbgs() << "LV: While examining store:" << *Inst <<"\n");
-      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
-
-      // If global alias, make sure they do alias.
-      if (hasPossibleGlobalWriteReorder(*UI,
-                                        Inst,
-                                        WriteObjects,
-                                        MaxByteWidth)) {
-        DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
-                     << "\n");
+  PtrRtCheck.Need = NeedRTCheck;
+
+  bool CanVecMem = true;
+  if (Accesses.isDependencyCheckNeeded()) {
+    DEBUG(dbgs() << "LV: Checking memory dependencies\n");
+    CanVecMem = DepChecker.areDepsSafe(DependentAccesses,
+                                       Accesses.getDependenciesToCheck());
+    MaxSafeDepDistBytes = DepChecker.getMaxSafeDepDistBytes();
+
+    if (!CanVecMem && DepChecker.shouldRetryWithRuntimeCheck()) {
+      DEBUG(dbgs() << "LV: Retrying with memory checks\n");
+      NeedRTCheck = true;
+
+      // Clear the dependency checks. We assume they are not needed.
+      Accesses.resetDepChecks();
+
+      PtrRtCheck.reset();
+      PtrRtCheck.Need = true;
+
+      CanDoRT = Accesses.canCheckPtrAtRT(PtrRtCheck, NumComparisons, SE,
+                                         TheLoop, true);
+      // Check that we did not collect too many pointers or found an unsizeable
+      // pointer.
+      if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
+        DEBUG(dbgs() << "LV: Can't vectorize with memory checks\n");
+        PtrRtCheck.reset();
         return false;
       }
 
-      // Didn't alias, insert into map for further reference.
-      WriteObjects[*UI].push_back(Inst);
+      CanVecMem = true;
     }
-    TempObjects.clear();
   }
 
-  /// Check that the reads don't conflict with the read-writes.
-  for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
-    Value *Val = (*MI).first;
-    GetUnderlyingObjects(Val, TempObjects, DL);
-    for (ValueVector::iterator UI=TempObjects.begin(), UE=TempObjects.end();
-         UI != UE; ++UI) {
-      // If all of the writes are identified then we don't care if the read
-      // pointer is identified or not.
-      if (!AllWritesIdentified && !isIdentifiedObject(*UI)) {
-        DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **UI <<"\n");
-        NeedRTCheck = true;
-      }
+  DEBUG(dbgs() << "LV: We" << (NeedRTCheck ? "" : " don't") <<
+        " need a runtime memory check.\n");
 
-      // Never seen it before, can't alias.
-      if (WriteObjects[*UI].empty())
-        continue;
-      // Direct alias found.
-      if (!AA || dyn_cast<GlobalValue>(*UI) == NULL) {
-        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
-              << **UI <<"\n");
-        return false;
-      }
-      DEBUG(dbgs() << "LV: Found a global value:  "
-            << **UI <<"\n");
-      Instruction *Inst = (*MI).second;
-      DEBUG(dbgs() << "LV: While examining load:" << *Inst <<"\n");
-      DEBUG(dbgs() << "LV: On value:" << *Val <<"\n");
-
-      // If global alias, make sure they do alias.
-      if (hasPossibleGlobalWriteReorder(*UI,
-                                        Inst,
-                                        WriteObjects,
-                                        MaxByteWidth)) {
-        DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
-                     << "\n");
-        return false;
-      }
-    }
-    TempObjects.clear();
-  }
+  return CanVecMem;
+}
 
-  PtrRtCheck.Need = NeedRTCheck;
-  if (NeedRTCheck && !CanDoRT) {
-    DEBUG(dbgs() << "LV: We can't vectorize because we can't find " <<
-          "the array bounds.\n");
-    PtrRtCheck.reset();
-    return false;
+static bool hasMultipleUsesOf(Instruction *I,
+                              SmallPtrSet<Instruction *, 8> &Insts) {
+  unsigned NumUses = 0;
+  for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use) {
+    if (Insts.count(dyn_cast<Instruction>(*Use)))
+      ++NumUses;
+    if (NumUses > 1)
+      return true;
   }
 
-  DEBUG(dbgs() << "LV: We "<< (NeedRTCheck ? "" : "don't") <<
-        " need a runtime memory check.\n");
+  return false;
+}
+
+static bool areAllUsesIn(Instruction *I, SmallPtrSet<Instruction *, 8> &Set) {
+  for(User::op_iterator Use = I->op_begin(), E = I->op_end(); Use != E; ++Use)
+    if (!Set.count(dyn_cast<Instruction>(*Use)))
+      return false;
   return true;
 }
 
@@ -2909,116 +4081,154 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // This includes users of the reduction, variables (which form a cycle
   // which ends in the phi node).
   Instruction *ExitInstruction = 0;
-  // Indicates that we found a binary operation in our scan.
-  bool FoundBinOp = false;
+  // Indicates that we found a reduction operation in our scan.
+  bool FoundReduxOp = false;
 
-  // Iter is our iterator. We start with the PHI node and scan for all of the
-  // users of this instruction. All users must be instructions that can be
-  // used as reduction variables (such as ADD). We may have a single
-  // out-of-block user. The cycle must end with the original PHI.
-  Instruction *Iter = Phi;
+  // We start with the PHI node and scan for all of the users of this
+  // instruction. All users must be instructions that can be used as reduction
+  // variables (such as ADD). We must have a single out-of-block user. The cycle
+  // must include the original PHI.
+  bool FoundStartPHI = false;
 
   // To recognize min/max patterns formed by a icmp select sequence, we store
   // the number of instruction we saw from the recognized min/max pattern,
-  // such that we don't stop when we see the phi has two uses (one by the select
-  // and one by the icmp) and to make sure we only see exactly the two
-  // instructions.
+  //  to make sure we only see exactly the two instructions.
   unsigned NumCmpSelectPatternInst = 0;
   ReductionInstDesc ReduxDesc(false, 0);
 
-  // Avoid cycles in the chain.
   SmallPtrSet<Instruction *, 8> VisitedInsts;
-  while (VisitedInsts.insert(Iter)) {
-    // If the instruction has no users then this is a broken
-    // chain and can't be a reduction variable.
-    if (Iter->use_empty())
+  SmallVector<Instruction *, 8> Worklist;
+  Worklist.push_back(Phi);
+  VisitedInsts.insert(Phi);
+
+  // A value in the reduction can be used:
+  //  - By the reduction:
+  //      - Reduction operation:
+  //        - One use of reduction value (safe).
+  //        - Multiple use of reduction value (not safe).
+  //      - PHI:
+  //        - All uses of the PHI must be the reduction (safe).
+  //        - Otherwise, not safe.
+  //  - By one instruction outside of the loop (safe).
+  //  - By further instructions outside of the loop (not safe).
+  //  - By an instruction that is not part of the reduction (not safe).
+  //    This is either:
+  //      * An instruction type other than PHI or the reduction operation.
+  //      * A PHI in the header other than the initial PHI.
+  while (!Worklist.empty()) {
+    Instruction *Cur = Worklist.back();
+    Worklist.pop_back();
+
+    // No Users.
+    // If the instruction has no users then this is a broken chain and can't be
+    // a reduction variable.
+    if (Cur->use_empty())
       return false;
 
-    // Did we find a user inside this loop already ?
-    bool FoundInBlockUser = false;
-    // Did we reach the initial PHI node already ?
-    bool FoundStartPHI = false;
+    bool IsAPhi = isa<PHINode>(Cur);
 
-    // Is this a bin op ?
-    FoundBinOp |= !isa<PHINode>(Iter);
+    // A header PHI use other than the original PHI.
+    if (Cur != Phi && IsAPhi && Cur->getParent() == Phi->getParent())
+      return false;
 
-    // For each of the *users* of iter.
-    for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
-         it != e; ++it) {
-      Instruction *U = cast<Instruction>(*it);
-      // We already know that the PHI is a user.
-      if (U == Phi) {
-        FoundStartPHI = true;
-        continue;
-      }
+    // Reductions of instructions such as Div, and Sub is only possible if the
+    // LHS is the reduction variable.
+    if (!Cur->isCommutative() && !IsAPhi && !isa<SelectInst>(Cur) &&
+        !isa<ICmpInst>(Cur) && !isa<FCmpInst>(Cur) &&
+        !VisitedInsts.count(dyn_cast<Instruction>(Cur->getOperand(0))))
+      return false;
+
+    // Any reduction instruction must be of one of the allowed kinds.
+    ReduxDesc = isReductionInstr(Cur, Kind, ReduxDesc);
+    if (!ReduxDesc.IsReduction)
+      return false;
+
+    // A reduction operation must only have one use of the reduction value.
+    if (!IsAPhi && Kind != RK_IntegerMinMax && Kind != RK_FloatMinMax &&
+        hasMultipleUsesOf(Cur, VisitedInsts))
+      return false;
+
+    // All inputs to a PHI node must be a reduction value.
+    if(IsAPhi && Cur != Phi && !areAllUsesIn(Cur, VisitedInsts))
+      return false;
+
+    if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(Cur) ||
+                                     isa<SelectInst>(Cur)))
+      ++NumCmpSelectPatternInst;
+    if (Kind == RK_FloatMinMax && (isa<FCmpInst>(Cur) ||
+                                   isa<SelectInst>(Cur)))
+      ++NumCmpSelectPatternInst;
+
+    // Check  whether we found a reduction operator.
+    FoundReduxOp |= !IsAPhi;
+
+    // Process users of current instruction. Push non PHI nodes after PHI nodes
+    // onto the stack. This way we are going to have seen all inputs to PHI
+    // nodes once we get to them.
+    SmallVector<Instruction *, 8> NonPHIs;
+    SmallVector<Instruction *, 8> PHIs;
+    for (Value::use_iterator UI = Cur->use_begin(), E = Cur->use_end(); UI != E;
+         ++UI) {
+      Instruction *Usr = cast<Instruction>(*UI);
 
       // Check if we found the exit user.
-      BasicBlock *Parent = U->getParent();
+      BasicBlock *Parent = Usr->getParent();
       if (!TheLoop->contains(Parent)) {
-        // Exit if you find multiple outside users.
-        if (ExitInstruction != 0)
+        // Exit if you find multiple outside users or if the header phi node is
+        // being used. In this case the user uses the value of the previous
+        // iteration, in which case we would loose "VF-1" iterations of the
+        // reduction operation if we vectorize.
+        if (ExitInstruction != 0 || Cur == Phi)
           return false;
-        ExitInstruction = Iter;
-      }
 
-      // We allow in-loop PHINodes which are not the original reduction PHI
-      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
-      // structure) then don't skip this PHI.
-      if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
-          U->getParent() != TheLoop->getHeader() &&
-          TheLoop->contains(U) &&
-          Iter->hasNUsesOrMore(2))
-        continue;
+        // The instruction used by an outside user must be the last instruction
+        // before we feed back to the reduction phi. Otherwise, we loose VF-1
+        // operations on the value.
+        if (std::find(Phi->op_begin(), Phi->op_end(), Cur) == Phi->op_end())
+         return false;
 
-      // We can't have multiple inside users except for a combination of
-      // icmp/select both using the phi.
-      if (FoundInBlockUser && !NumCmpSelectPatternInst)
-        return false;
-      FoundInBlockUser = true;
-
-      // Any reduction instr must be of one of the allowed kinds.
-      ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
-      if (!ReduxDesc.IsReduction)
-        return false;
+        ExitInstruction = Cur;
+        continue;
+      }
 
-      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U)))
-          ++NumCmpSelectPatternInst;
-      if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U)))
-          ++NumCmpSelectPatternInst;
+      // Process instructions only once (termination).
+      if (VisitedInsts.insert(Usr)) {
+        if (isa<PHINode>(Usr))
+          PHIs.push_back(Usr);
+        else
+          NonPHIs.push_back(Usr);
+      }
+      // Remember that we completed the cycle.
+      if (Usr == Phi)
+        FoundStartPHI = true;
+    }
+    Worklist.append(PHIs.begin(), PHIs.end());
+    Worklist.append(NonPHIs.begin(), NonPHIs.end());
+  }
 
-      // Reductions of instructions such as Div, and Sub is only
-      // possible if the LHS is the reduction variable.
-      if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
-          !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter)
-        return false;
+  // This means we have seen one but not the other instruction of the
+  // pattern or more than just a select and cmp.
+  if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
+      NumCmpSelectPatternInst != 2)
+    return false;
 
-      Iter = ReduxDesc.PatternLastInst;
-    }
+  if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
+    return false;
 
-    // This means we have seen one but not the other instruction of the
-    // pattern or more than just a select and cmp.
-    if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
-        NumCmpSelectPatternInst != 2)
-      return false;
+  // We found a reduction var if we have reached the original phi node and we
+  // only have a single instruction with out-of-loop users.
 
-    // We found a reduction var if we have reached the original
-    // phi node and we only have a single instruction with out-of-loop
-    // users.
-    if (FoundStartPHI) {
-      // This instruction is allowed to have out-of-loop users.
-      AllowedExit.insert(ExitInstruction);
+  // This instruction is allowed to have out-of-loop users.
+  AllowedExit.insert(ExitInstruction);
 
-      // Save the description of this reduction variable.
-      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
-                             ReduxDesc.MinMaxKind);
-      Reductions[Phi] = RD;
-      // We've ended the cycle. This is a reduction variable if we have an
-      // outside user and it has a binary op.
-      return FoundBinOp && ExitInstruction;
-    }
-  }
+  // Save the description of this reduction variable.
+  ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
+                         ReduxDesc.MinMaxKind);
+  Reductions[Phi] = RD;
+  // We've ended the cycle. This is a reduction variable if we have an
+  // outside user and it has a binary op.
 
-  return false;
+  return true;
 }
 
 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
@@ -3169,12 +4379,28 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
   return !DT->dominates(BB, Latch);
 }
 
-bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
+bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
+                                            SmallPtrSet<Value *, 8>& SafePtrs) {
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    // We don't predicate loads/stores at the moment.
-    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+    // We might be able to hoist the load.
+    if (it->mayReadFromMemory()) {
+      LoadInst *LI = dyn_cast<LoadInst>(it);
+      if (!LI || !SafePtrs.count(LI->getPointerOperand()))
+        return false;
+    }
+
+    // We don't predicate stores at the moment.
+    if (it->mayWriteToMemory() || it->mayThrow())
       return false;
 
+    // Check that we don't have a constant expression that can trap as operand.
+    for (Instruction::op_iterator OI = it->op_begin(), OE = it->op_end();
+         OI != OE; ++OI) {
+      if (Constant *C = dyn_cast<Constant>(*OI))
+        if (C->canTrap())
+          return false;
+    }
+
     // The instructions below can trap.
     switch (it->getOpcode()) {
     default: continue;
@@ -3189,15 +4415,6 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
   return true;
 }
 
-bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
-  const SCEV *PhiScev = SE->getSCEV(Ptr);
-  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
-  if (!AR)
-    return false;
-
-  return AR->isAffine();
-}
-
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
                                                       unsigned UserVF) {
@@ -3210,13 +4427,19 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   // Find the trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
-  DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n");
+  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
 
   unsigned WidestType = getWidestType();
   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+  unsigned MaxSafeDepDist = -1U;
+  if (Legal->getMaxSafeDepDistBytes() != -1U)
+    MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
+  WidestRegister = ((WidestRegister < MaxSafeDepDist) ?
+                    WidestRegister : MaxSafeDepDist);
   unsigned MaxVectorSize = WidestRegister / WidestType;
   DEBUG(dbgs() << "LV: The Widest type: " << WidestType << " bits.\n");
-  DEBUG(dbgs() << "LV: The Widest register is:" << WidestRegister << "bits.\n");
+  DEBUG(dbgs() << "LV: The Widest register is: "
+          << WidestRegister << " bits.\n");
 
   if (MaxVectorSize == 0) {
     DEBUG(dbgs() << "LV: The target has no vector registers.\n");
@@ -3252,7 +4475,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   if (UserVF != 0) {
     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n");
+    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
 
     Factor.Width = UserVF;
     return Factor;
@@ -3260,13 +4483,13 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
 
   float Cost = expectedCost(1);
   unsigned Width = 1;
-  DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
   for (unsigned i=2; i <= VF; i*=2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
     float VectorCost = expectedCost(i) / (float)i;
-    DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+    DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
           (int)VectorCost << ".\n");
     if (VectorCost < Cost) {
       Cost = VectorCost;
@@ -3347,6 +4570,10 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   if (OptForSize)
     return 1;
 
+  // We used the distance for the unroll factor.
+  if (Legal->getMaxSafeDepDistBytes() != -1U)
+    return 1;
+
   // Do not unroll loops with a relatively small trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop,
                                               TheLoop->getLoopLatch());
@@ -3386,8 +4613,20 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   else if (UF < 1)
     UF = 1;
 
-  if (Legal->getReductionVars()->size()) {
-    DEBUG(dbgs() << "LV: Unrolling because of reductions. \n");
+  bool HasReductions = Legal->getReductionVars()->size();
+
+  // Decide if we want to unroll if we decided that it is legal to vectorize
+  // but not profitable.
+  if (VF == 1) {
+    if (TheLoop->getNumBlocks() > 1 || !HasReductions ||
+        LoopCost > SmallLoopCost)
+      return 1;
+
+    return UF;
+  }
+
+  if (HasReductions) {
+    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
     return UF;
   }
 
@@ -3395,14 +4634,14 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
   // We assume that the cost overhead is 1 and we use the cost model
   // to estimate the cost of the loop and unroll until the cost of the
   // loop overhead is about 5% of the cost of the loop.
-  DEBUG(dbgs() << "LV: Loop cost is "<< LoopCost <<" \n");
-  if (LoopCost < 20) {
-    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost. \n");
-    unsigned NewUF = 20/LoopCost + 1;
+  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  if (LoopCost < SmallLoopCost) {
+    DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
+    unsigned NewUF = SmallLoopCost / (LoopCost + 1);
     return std::min(NewUF, UF);
   }
 
-  DEBUG(dbgs() << "LV: Not Unrolling. \n");
+  DEBUG(dbgs() << "LV: Not Unrolling.\n");
   return 1;
 }
 
@@ -3503,16 +4742,16 @@ LoopVectorizationCostModel::calculateRegisterUsage() {
     MaxUsage = std::max(MaxUsage, OpenIntervals.size());
 
     DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # " <<
-          OpenIntervals.size() <<"\n");
+          OpenIntervals.size() << '\n');
 
     // Add the current instruction to the list of open intervals.
     OpenIntervals.insert(I);
   }
 
   unsigned Invariant = LoopInvariants.size();
-  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << " \n");
-  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << " \n");
-  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << " \n");
+  DEBUG(dbgs() << "LV(REG): Found max usage: " << MaxUsage << '\n');
+  DEBUG(dbgs() << "LV(REG): Found invariant usage: " << Invariant << '\n');
+  DEBUG(dbgs() << "LV(REG): LoopSize: " << R.NumInstructions << '\n');
 
   R.LoopInvariantRegs = Invariant;
   R.MaxLocalUsers = MaxUsage;
@@ -3535,15 +4774,15 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
         continue;
 
       unsigned C = getInstructionCost(it, VF);
-      Cost += C;
-      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
-            VF << " For instruction: "<< *it << "\n");
+      BlockCost += C;
+      DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF " <<
+            VF << " For instruction: " << *it << '\n');
     }
 
     // We assume that if-converted blocks have a 50% chance of being executed.
     // When the code is scalar then some of the blocks are avoided due to CF.
     // When the code is vectorized we execute all code paths.
-    if (Legal->blockNeedsPredication(*bb) && VF == 1)
+    if (VF == 1 && Legal->blockNeedsPredication(*bb))
       BlockCost /= 2;
 
     Cost += BlockCost;
@@ -3552,6 +4791,59 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   return Cost;
 }
 
+/// \brief Check whether the address computation for a non-consecutive memory
+/// access looks like an unlikely candidate for being merged into the indexing
+/// mode.
+///
+/// We look for a GEP which has one index that is an induction variable and all
+/// other indices are loop invariant. If the stride of this access is also
+/// within a small bound we decide that this address computation can likely be
+/// merged into the addressing mode.
+/// In all other cases, we identify the address computation as complex.
+static bool isLikelyComplexAddressComputation(Value *Ptr,
+                                              LoopVectorizationLegality *Legal,
+                                              ScalarEvolution *SE,
+                                              const Loop *TheLoop) {
+  GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!Gep)
+    return true;
+
+  // We are looking for a gep with all loop invariant indices except for one
+  // which should be an induction variable.
+  unsigned NumOperands = Gep->getNumOperands();
+  for (unsigned i = 1; i < NumOperands; ++i) {
+    Value *Opd = Gep->getOperand(i);
+    if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+        !Legal->isInductionVariable(Opd))
+      return true;
+  }
+
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Make sure that the step
+  // can likely be merged into the address computation.
+  unsigned MaxMergeDistance = 64;
+
+  const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Ptr));
+  if (!AddRec)
+    return true;
+
+  // Check the step is constant.
+  const SCEV *Step = AddRec->getStepRecurrence(*SE);
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C)
+    return true;
+
+  const APInt &APStepVal = C->getValue()->getValue();
+
+  // Huge step value - give up.
+  if (APStepVal.getBitWidth() > 64)
+    return true;
+
+  int64_t StepVal = APStepVal.getSExtValue();
+
+  return StepVal > MaxMergeDistance;
+}
+
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -3647,6 +4939,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
     if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
+      bool IsComplexComputation =
+        isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
       unsigned Cost = 0;
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
@@ -3662,7 +4956,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       }
 
       // The cost of the scalar loads/stores.
-      Cost += VF * TTI.getAddressComputationCost(ValTy->getScalarType());
+      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
       Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                        Alignment, AS);
       return Cost;
@@ -3743,15 +5037,17 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
 char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_AG_DEPENDENCY(TargetTransformInfo)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
 
 namespace llvm {
-  Pass *createLoopVectorizePass() {
-    return new LoopVectorize();
+  Pass *createLoopVectorizePass(bool NoUnrolling) {
+    return new LoopVectorize(NoUnrolling);
   }
 }
 
@@ -3766,3 +5062,96 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
 
   return false;
 }
+
+
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+  // Holds vector parameters or scalars, in case of uniform vals.
+  SmallVector<VectorParts, 4> Params;
+
+  setDebugLocFromInst(Builder, Instr);
+
+  // Find all of the vectorized parameters.
+  for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+    Value *SrcOp = Instr->getOperand(op);
+
+    // If we are accessing the old induction variable, use the new one.
+    if (SrcOp == OldInduction) {
+      Params.push_back(getVectorValue(SrcOp));
+      continue;
+    }
+
+    // Try using previously calculated values.
+    Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+    // If the src is an instruction that appeared earlier in the basic block
+    // then it should already be vectorized.
+    if (SrcInst && OrigLoop->contains(SrcInst)) {
+      assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
+      // The parameter is a vector value from earlier.
+      Params.push_back(WidenMap.get(SrcInst));
+    } else {
+      // The parameter is a scalar from outside the loop. Maybe even a constant.
+      VectorParts Scalars;
+      Scalars.append(UF, SrcOp);
+      Params.push_back(Scalars);
+    }
+  }
+
+  assert(Params.size() == Instr->getNumOperands() &&
+         "Invalid number of operands");
+
+  // Does this instruction return a value ?
+  bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+  Value *UndefVec = IsVoidRetTy ? 0 :
+  UndefValue::get(Instr->getType());
+  // Create a new entry in the WidenMap and initialize it to Undef or Null.
+  VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+
+  // For each vector unroll 'part':
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    // For each scalar that we create:
+
+    Instruction *Cloned = Instr->clone();
+      if (!IsVoidRetTy)
+        Cloned->setName(Instr->getName() + ".cloned");
+      // Replace the operands of the cloned instructions with extracted scalars.
+      for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+        Value *Op = Params[op][Part];
+        Cloned->setOperand(op, Op);
+      }
+
+      // Place the cloned scalar in the new loop.
+      Builder.Insert(Cloned);
+
+      // If the original scalar returns a value we need to place it in a vector
+      // so that future users will be able to use it.
+      if (!IsVoidRetTy)
+        VecResults[Part] = Cloned;
+  }
+}
+
+void
+InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr,
+                                              LoopVectorizationLegality*) {
+  return scalarizeInstruction(Instr);
+}
+
+Value *InnerLoopUnroller::reverseVector(Value *Vec) {
+  return Vec;
+}
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) {
+  return V;
+}
+
+Value *InnerLoopUnroller::getConsecutiveVector(Value* Val, int StartIdx,
+                                               bool Negate) {
+  // When unrolling and the VF is 1, we only need to add a simple scalar.
+  Type *ITy = Val->getType();
+  assert(!ITy->isVectorTy() && "Val must be a scalar");
+  Constant *C = ConstantInt::get(ITy, StartIdx, Negate);
+  return Builder.CreateAdd(Val, C, "induction");
+}
+
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index cc30cc9..c72b51f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16,18 +16,23 @@
 //
 //===----------------------------------------------------------------------===//
 #define SV_NAME "slp-vectorizer"
-#define DEBUG_TYPE SV_NAME
+#define DEBUG_TYPE "SLP"
 
-#include "VecUtils.h"
 #include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -35,19 +40,1717 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 #include <map>
 
 using namespace llvm;
 
 static cl::opt<int>
-SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
-                 cl::desc("Only vectorize trees if the gain is above this "
-                          "number. (gain = -cost of vectorization)"));
+    SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+                     cl::desc("Only vectorize if you gain more than this "
+                              "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
+                   cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+    "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+    cl::desc(
+        "Attempt to vectorize horizontal reductions feeding into a store"));
+
 namespace {
 
+static const unsigned MinVecRegSize = 128;
+
+static const unsigned RecursionMaxDepth = 12;
+
+/// A helper class for numbering instructions in multiple blocks.
+/// Numbers start at zero for each basic block.
+struct BlockNumbering {
+
+  BlockNumbering(BasicBlock *Bb) : BB(Bb), Valid(false) {}
+
+  BlockNumbering() : BB(0), Valid(false) {}
+
+  void numberInstructions() {
+    unsigned Loc = 0;
+    InstrIdx.clear();
+    InstrVec.clear();
+    // Number the instructions in the block.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      InstrIdx[it] = Loc++;
+      InstrVec.push_back(it);
+      assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
+    }
+    Valid = true;
+  }
+
+  int getIndex(Instruction *I) {
+    assert(I->getParent() == BB && "Invalid instruction");
+    if (!Valid)
+      numberInstructions();
+    assert(InstrIdx.count(I) && "Unknown instruction");
+    return InstrIdx[I];
+  }
+
+  Instruction *getInstruction(unsigned loc) {
+    if (!Valid)
+      numberInstructions();
+    assert(InstrVec.size() > loc && "Invalid Index");
+    return InstrVec[loc];
+  }
+
+  void forget() { Valid = false; }
+
+private:
+  /// The block we are numbering.
+  BasicBlock *BB;
+  /// Is the block numbered.
+  bool Valid;
+  /// Maps instructions to numbers and back.
+  SmallDenseMap<Instruction *, int> InstrIdx;
+  /// Maps integers to Instructions.
+  SmallVector<Instruction *, 32> InstrVec;
+};
+
+/// \returns the parent basic block if all of the instructions in \p VL
+/// are in the same block or null otherwise.
+static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  if (!I0)
+    return 0;
+  BasicBlock *BB = I0->getParent();
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I)
+      return 0;
+
+    if (BB != I->getParent())
+      return 0;
+  }
+  return BB;
+}
+
+/// \returns True if all of the values in \p VL are constants.
+static bool allConstant(ArrayRef<Value *> VL) {
+  for (unsigned i = 0, e = VL.size(); i < e; ++i)
+    if (!isa<Constant>(VL[i]))
+      return false;
+  return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+  for (unsigned i = 1, e = VL.size(); i < e; ++i)
+    if (VL[i] != VL[0])
+      return false;
+  return true;
+}
+
+/// \returns The opcode if all of the Instructions in \p VL have the same
+/// opcode, or zero.
+static unsigned getSameOpcode(ArrayRef<Value *> VL) {
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  if (!I0)
+    return 0;
+  unsigned Opcode = I0->getOpcode();
+  for (int i = 1, e = VL.size(); i < e; i++) {
+    Instruction *I = dyn_cast<Instruction>(VL[i]);
+    if (!I || Opcode != I->getOpcode())
+      return 0;
+  }
+  return Opcode;
+}
+
+/// \returns \p I after propagating metadata from \p VL.
+static Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL) {
+  Instruction *I0 = cast<Instruction>(VL[0]);
+  SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+  I0->getAllMetadataOtherThanDebugLoc(Metadata);
+
+  for (unsigned i = 0, n = Metadata.size(); i != n; ++i) {
+    unsigned Kind = Metadata[i].first;
+    MDNode *MD = Metadata[i].second;
+
+    for (int i = 1, e = VL.size(); MD && i != e; i++) {
+      Instruction *I = cast<Instruction>(VL[i]);
+      MDNode *IMD = I->getMetadata(Kind);
+
+      switch (Kind) {
+      default:
+        MD = 0; // Remove unknown metadata
+        break;
+      case LLVMContext::MD_tbaa:
+        MD = MDNode::getMostGenericTBAA(MD, IMD);
+        break;
+      case LLVMContext::MD_fpmath:
+        MD = MDNode::getMostGenericFPMath(MD, IMD);
+        break;
+      }
+    }
+    I->setMetadata(Kind, MD);
+  }
+  return I;
+}
+
+/// \returns The type that all of the values in \p VL have or null if there
+/// are different types.
+static Type* getSameType(ArrayRef<Value *> VL) {
+  Type *Ty = VL[0]->getType();
+  for (int i = 1, e = VL.size(); i < e; i++)
+    if (VL[i]->getType() != Ty)
+      return 0;
+
+  return Ty;
+}
+
+/// \returns True if the ExtractElement instructions in VL can be vectorized
+/// to use the original vector.
+static bool CanReuseExtract(ArrayRef<Value *> VL) {
+  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
+  // Check if all of the extracts come from the same vector and from the
+  // correct offset.
+  Value *VL0 = VL[0];
+  ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
+  Value *Vec = E0->getOperand(0);
+
+  // We have to extract from the same vector type.
+  unsigned NElts = Vec->getType()->getVectorNumElements();
+
+  if (NElts != VL.size())
+    return false;
+
+  // Check that all of the indices extract from the correct offset.
+  ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
+  if (!CI || CI->getZExtValue())
+    return false;
+
+  for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+    ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+    ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+
+    if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
+      return false;
+  }
+
+  return true;
+}
+
+static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+                                           SmallVectorImpl<Value *> &Left,
+                                           SmallVectorImpl<Value *> &Right) {
+
+  SmallVector<Value *, 16> OrigLeft, OrigRight;
+
+  bool AllSameOpcodeLeft = true;
+  bool AllSameOpcodeRight = true;
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    Instruction *I = cast<Instruction>(VL[i]);
+    Value *V0 = I->getOperand(0);
+    Value *V1 = I->getOperand(1);
+
+    OrigLeft.push_back(V0);
+    OrigRight.push_back(V1);
+
+    Instruction *I0 = dyn_cast<Instruction>(V0);
+    Instruction *I1 = dyn_cast<Instruction>(V1);
+
+    // Check whether all operands on one side have the same opcode. In this case
+    // we want to preserve the original order and not make things worse by
+    // reordering.
+    AllSameOpcodeLeft = I0;
+    AllSameOpcodeRight = I1;
+
+    if (i && AllSameOpcodeLeft) {
+      if(Instruction *P0 = dyn_cast<Instruction>(OrigLeft[i-1])) {
+        if(P0->getOpcode() != I0->getOpcode())
+          AllSameOpcodeLeft = false;
+      } else
+        AllSameOpcodeLeft = false;
+    }
+    if (i && AllSameOpcodeRight) {
+      if(Instruction *P1 = dyn_cast<Instruction>(OrigRight[i-1])) {
+        if(P1->getOpcode() != I1->getOpcode())
+          AllSameOpcodeRight = false;
+      } else
+        AllSameOpcodeRight = false;
+    }
+
+    // Sort two opcodes. In the code below we try to preserve the ability to use
+    // broadcast of values instead of individual inserts.
+    // vl1 = load
+    // vl2 = phi
+    // vr1 = load
+    // vr2 = vr2
+    //    = vl1 x vr1
+    //    = vl2 x vr2
+    // If we just sorted according to opcode we would leave the first line in
+    // tact but we would swap vl2 with vr2 because opcode(phi) > opcode(load).
+    //    = vl1 x vr1
+    //    = vr2 x vl2
+    // Because vr2 and vr1 are from the same load we loose the opportunity of a
+    // broadcast for the packed right side in the backend: we have [vr1, vl2]
+    // instead of [vr1, vr2=vr1].
+    if (I0 && I1) {
+       if(!i && I0->getOpcode() > I1->getOpcode()) {
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else if (i && I0->getOpcode() > I1->getOpcode() && Right[i-1] != I1) {
+         // Try not to destroy a broad cast for no apparent benefit.
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else if (i && I0->getOpcode() == I1->getOpcode() && Right[i-1] ==  I0) {
+         // Try preserve broadcasts.
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else if (i && I0->getOpcode() == I1->getOpcode() && Left[i-1] == I1) {
+         // Try preserve broadcasts.
+         Left.push_back(I1);
+         Right.push_back(I0);
+       } else {
+         Left.push_back(I0);
+         Right.push_back(I1);
+       }
+       continue;
+    }
+    // One opcode, put the instruction on the right.
+    if (I0) {
+      Left.push_back(V1);
+      Right.push_back(I0);
+      continue;
+    }
+    Left.push_back(V0);
+    Right.push_back(V1);
+  }
+
+  bool LeftBroadcast = isSplat(Left);
+  bool RightBroadcast = isSplat(Right);
+
+  // Don't reorder if the operands where good to begin with.
+  if (!(LeftBroadcast || RightBroadcast) &&
+      (AllSameOpcodeRight || AllSameOpcodeLeft)) {
+    Left = OrigLeft;
+    Right = OrigRight;
+  }
+}
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+public:
+  typedef SmallVector<Value *, 8> ValueList;
+  typedef SmallVector<Instruction *, 16> InstrList;
+  typedef SmallPtrSet<Value *, 16> ValueSet;
+  typedef SmallVector<StoreInst *, 8> StoreList;
+
+  BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
+          TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
+          DominatorTree *Dt) :
+    F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt),
+    Builder(Se->getContext()) {
+      // Setup the block numbering utility for all of the blocks in the
+      // function.
+      for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+        BasicBlock *BB = it;
+        BlocksNumbers[BB] = BlockNumbering(BB);
+      }
+    }
+
+  /// \brief Vectorize the tree that starts with the elements in \p VL.
+  /// Returns the vectorized root.
+  Value *vectorizeTree();
+
+  /// \returns the vectorization cost of the subtree that starts at \p VL.
+  /// A negative number means that this is profitable.
+  int getTreeCost();
+
+  /// Construct a vectorizable tree that starts at \p Roots and is possibly
+  /// used by a reduction of \p RdxOps.
+  void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0);
+
+  /// Clear the internal data structures that are created by 'buildTree'.
+  void deleteTree() {
+    RdxOps = 0;
+    VectorizableTree.clear();
+    ScalarToTreeEntry.clear();
+    MustGather.clear();
+    ExternalUses.clear();
+    MemBarrierIgnoreList.clear();
+  }
+
+  /// \returns true if the memory operations A and B are consecutive.
+  bool isConsecutiveAccess(Value *A, Value *B);
+
+  /// \brief Perform LICM and CSE on the newly generated gather sequences.
+  void optimizeGatherSequence();
+private:
+  struct TreeEntry;
+
+  /// \returns the cost of the vectorizable entry.
+  int getEntryCost(TreeEntry *E);
+
+  /// This is the recursive part of buildTree.
+  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+
+  /// Vectorize a single entry in the tree.
+  Value *vectorizeTree(TreeEntry *E);
+
+  /// Vectorize a single entry in the tree, starting in \p VL.
+  Value *vectorizeTree(ArrayRef<Value *> VL);
+
+  /// \returns the pointer to the vectorized value if \p VL is already
+  /// vectorized, or NULL. They may happen in cycles.
+  Value *alreadyVectorized(ArrayRef<Value *> VL) const;
+
+  /// \brief Take the pointer operand from the Load/Store instruction.
+  /// \returns NULL if this is not a valid Load/Store instruction.
+  static Value *getPointerOperand(Value *I);
+
+  /// \brief Take the address space operand from the Load/Store instruction.
+  /// \returns -1 if this is not a valid Load/Store instruction.
+  static unsigned getAddressSpaceOperand(Value *I);
+
+  /// \returns the scalarization cost for this type. Scalarization in this
+  /// context means the creation of vectors from a group of scalars.
+  int getGatherCost(Type *Ty);
+
+  /// \returns the scalarization cost for this list of values. Assuming that
+  /// this subtree gets vectorized, we may need to extract the values from the
+  /// roots. This method calculates the cost of extracting the values.
+  int getGatherCost(ArrayRef<Value *> VL);
+
+  /// \returns the AA location that is being access by the instruction.
+  AliasAnalysis::Location getLocation(Instruction *I);
+
+  /// \brief Checks if it is possible to sink an instruction from
+  /// \p Src to \p Dst.
+  /// \returns the pointer to the barrier instruction if we can't sink.
+  Value *getSinkBarrier(Instruction *Src, Instruction *Dst);
+
+  /// \returns the index of the last instruction in the BB from \p VL.
+  int getLastIndex(ArrayRef<Value *> VL);
+
+  /// \returns the Instruction in the bundle \p VL.
+  Instruction *getLastInstruction(ArrayRef<Value *> VL);
+
+  /// \brief Set the Builder insert point to one after the last instruction in
+  /// the bundle
+  void setInsertPointAfterBundle(ArrayRef<Value *> VL);
+
+  /// \returns a vector from a collection of scalars in \p VL.
+  Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
+
+  /// \returns whether the VectorizableTree is fully vectoriable and will
+  /// be beneficial even the tree height is tiny.
+  bool isFullyVectorizableTinyTree(); 
+
+  struct TreeEntry {
+    TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
+    NeedToGather(0) {}
+
+    /// \returns true if the scalars in VL are equal to this entry.
+    bool isSame(ArrayRef<Value *> VL) const {
+      assert(VL.size() == Scalars.size() && "Invalid size");
+      return std::equal(VL.begin(), VL.end(), Scalars.begin());
+    }
+
+    /// A vector of scalars.
+    ValueList Scalars;
+
+    /// The Scalars are vectorized into this value. It is initialized to Null.
+    Value *VectorizedValue;
+
+    /// The index in the basic block of the last scalar.
+    int LastScalarIndex;
+
+    /// Do we need to gather this sequence ?
+    bool NeedToGather;
+  };
+
+  /// Create a new VectorizableTree entry.
+  TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
+    VectorizableTree.push_back(TreeEntry());
+    int idx = VectorizableTree.size() - 1;
+    TreeEntry *Last = &VectorizableTree[idx];
+    Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+    Last->NeedToGather = !Vectorized;
+    if (Vectorized) {
+      Last->LastScalarIndex = getLastIndex(VL);
+      for (int i = 0, e = VL.size(); i != e; ++i) {
+        assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
+        ScalarToTreeEntry[VL[i]] = idx;
+      }
+    } else {
+      Last->LastScalarIndex = 0;
+      MustGather.insert(VL.begin(), VL.end());
+    }
+    return Last;
+  }
+
+  /// -- Vectorization State --
+  /// Holds all of the tree entries.
+  std::vector<TreeEntry> VectorizableTree;
+
+  /// Maps a specific scalar to its tree entry.
+  SmallDenseMap<Value*, int> ScalarToTreeEntry;
+
+  /// A list of scalars that we found that we need to keep as scalars.
+  ValueSet MustGather;
+
+  /// This POD struct describes one external user in the vectorized tree.
+  struct ExternalUser {
+    ExternalUser (Value *S, llvm::User *U, int L) :
+      Scalar(S), User(U), Lane(L){};
+    // Which scalar in our function.
+    Value *Scalar;
+    // Which user that uses the scalar.
+    llvm::User *User;
+    // Which lane does the scalar belong to.
+    int Lane;
+  };
+  typedef SmallVector<ExternalUser, 16> UserList;
+
+  /// A list of values that need to extracted out of the tree.
+  /// This list holds pairs of (Internal Scalar : External User).
+  UserList ExternalUses;
+
+  /// A list of instructions to ignore while sinking
+  /// memory instructions. This map must be reset between runs of getCost.
+  ValueSet MemBarrierIgnoreList;
+
+  /// Holds all of the instructions that we gathered.
+  SetVector<Instruction *> GatherSeq;
+  /// A list of blocks that we are going to CSE.
+  SmallSet<BasicBlock *, 8> CSEBlocks;
+
+  /// Numbers instructions in different blocks.
+  DenseMap<BasicBlock *, BlockNumbering> BlocksNumbers;
+
+  /// Reduction operators.
+  ValueSet *RdxOps;
+
+  // Analysis and block reference.
+  Function *F;
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+  LoopInfo *LI;
+  DominatorTree *DT;
+  /// Instruction builder to construct the vectorized tree.
+  IRBuilder<> Builder;
+};
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) {
+  deleteTree();
+  RdxOps = Rdx;
+  if (!getSameType(Roots))
+    return;
+  buildTree_rec(Roots, 0);
+
+  // Collect the values that we need to extract from the tree.
+  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+    TreeEntry *Entry = &VectorizableTree[EIdx];
+
+    // For each lane:
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+
+      // No need to handle users of gathered values.
+      if (Entry->NeedToGather)
+        continue;
+
+      for (Value::use_iterator User = Scalar->use_begin(),
+           UE = Scalar->use_end(); User != UE; ++User) {
+        DEBUG(dbgs() << "SLP: Checking user:" << **User << ".\n");
+
+        // Skip in-tree scalars that become vectors.
+        if (ScalarToTreeEntry.count(*User)) {
+          DEBUG(dbgs() << "SLP: \tInternal user will be removed:" <<
+                **User << ".\n");
+          int Idx = ScalarToTreeEntry[*User]; (void) Idx;
+          assert(!VectorizableTree[Idx].NeedToGather && "Bad state");
+          continue;
+        }
+        Instruction *UserInst = dyn_cast<Instruction>(*User);
+        if (!UserInst)
+          continue;
+
+        // Ignore uses that are part of the reduction.
+        if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end())
+          continue;
+
+        DEBUG(dbgs() << "SLP: Need to extract:" << **User << " from lane " <<
+              Lane << " from " << *Scalar << ".\n");
+        ExternalUses.push_back(ExternalUser(Scalar, *User, Lane));
+      }
+    }
+  }
+}
+
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+  bool SameTy = getSameType(VL); (void)SameTy;
+  assert(SameTy && "Invalid types!");
+
+  if (Depth == RecursionMaxDepth) {
+    DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+    newTreeEntry(VL, false);
+    return;
+  }
+
+  // Don't handle vectors.
+  if (VL[0]->getType()->isVectorTy()) {
+    DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+    newTreeEntry(VL, false);
+    return;
+  }
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    if (SI->getValueOperand()->getType()->isVectorTy()) {
+      DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+      newTreeEntry(VL, false);
+      return;
+    }
+
+  // If all of the operands are identical or constant we have a simple solution.
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
+      !getSameOpcode(VL)) {
+    DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+    newTreeEntry(VL, false);
+    return;
+  }
+
+  // We now know that this is a vector of instructions of the same type from
+  // the same block.
+
+  // Check if this is a duplicate of another entry.
+  if (ScalarToTreeEntry.count(VL[0])) {
+    int Idx = ScalarToTreeEntry[VL[0]];
+    TreeEntry *E = &VectorizableTree[Idx];
+    for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+      DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
+      if (E->Scalars[i] != VL[i]) {
+        DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+        newTreeEntry(VL, false);
+        return;
+      }
+    }
+    DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+    return;
+  }
+
+  // Check that none of the instructions in the bundle are already in the tree.
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    if (ScalarToTreeEntry.count(VL[i])) {
+      DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+            ") is already in tree.\n");
+      newTreeEntry(VL, false);
+      return;
+    }
+  }
+
+  // If any of the scalars appears in the table OR it is marked as a value that
+  // needs to stat scalar then we need to gather the scalars.
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
+      DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
+      newTreeEntry(VL, false);
+      return;
+    }
+  }
+
+  // Check that all of the users of the scalars that we want to vectorize are
+  // schedulable.
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  int MyLastIndex = getLastIndex(VL);
+  BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+
+  for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+    Instruction *Scalar = cast<Instruction>(VL[i]);
+    DEBUG(dbgs() << "SLP: Checking users of  " << *Scalar << ". \n");
+    for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
+         U != UE; ++U) {
+      DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n");
+      Instruction *User = dyn_cast<Instruction>(*U);
+      if (!User) {
+        DEBUG(dbgs() << "SLP: Gathering due unknown user. \n");
+        newTreeEntry(VL, false);
+        return;
+      }
+
+      // We don't care if the user is in a different basic block.
+      BasicBlock *UserBlock = User->getParent();
+      if (UserBlock != BB) {
+        DEBUG(dbgs() << "SLP: User from a different basic block "
+              << *User << ". \n");
+        continue;
+      }
+
+      // If this is a PHINode within this basic block then we can place the
+      // extract wherever we want.
+      if (isa<PHINode>(*User)) {
+        DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n");
+        continue;
+      }
+
+      // Check if this is a safe in-tree user.
+      if (ScalarToTreeEntry.count(User)) {
+        int Idx = ScalarToTreeEntry[User];
+        int VecLocation = VectorizableTree[Idx].LastScalarIndex;
+        if (VecLocation <= MyLastIndex) {
+          DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n");
+          newTreeEntry(VL, false);
+          return;
+        }
+        DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" <<
+              VecLocation << " vector value (" << *Scalar << ") at #"
+              << MyLastIndex << ".\n");
+        continue;
+      }
+
+      // This user is part of the reduction.
+      if (RdxOps && RdxOps->count(User))
+        continue;
+
+      // Make sure that we can schedule this unknown user.
+      BlockNumbering &BN = BlocksNumbers[BB];
+      int UserIndex = BN.getIndex(User);
+      if (UserIndex < MyLastIndex) {
+
+        DEBUG(dbgs() << "SLP: Can't schedule extractelement for "
+              << *User << ". \n");
+        newTreeEntry(VL, false);
+        return;
+      }
+    }
+  }
+
+  // Check that every instructions appears once in this bundle.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i)
+    for (unsigned j = i+1; j < e; ++j)
+      if (VL[i] == VL[j]) {
+        DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+        newTreeEntry(VL, false);
+        return;
+      }
+
+  // Check that instructions in this bundle don't reference other instructions.
+  // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4.
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
+         U != UE; ++U) {
+      for (unsigned j = 0; j < e; ++j) {
+        if (i != j && *U == VL[j]) {
+          DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n");
+          newTreeEntry(VL, false);
+          return;
+        }
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+
+  unsigned Opcode = getSameOpcode(VL);
+
+  // Check if it is safe to sink the loads or the stores.
+  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+    Instruction *Last = getLastInstruction(VL);
+
+    for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+      if (VL[i] == Last)
+        continue;
+      Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
+      if (Barrier) {
+        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
+              << "\n because of " << *Barrier << ".  Gathering.\n");
+        newTreeEntry(VL, false);
+        return;
+      }
+    }
+  }
+
+  switch (Opcode) {
+    case Instruction::PHI: {
+      PHINode *PH = dyn_cast<PHINode>(VL0);
+
+      // Check for terminator values (e.g. invoke).
+      for (unsigned j = 0; j < VL.size(); ++j)
+        for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+          TerminatorInst *Term = dyn_cast<TerminatorInst>(cast<PHINode>(VL[j])->getIncomingValue(i));
+          if (Term) {
+            DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
+            newTreeEntry(VL, false);
+            return;
+          }
+        }
+
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
+
+        buildTree_rec(Operands, Depth + 1);
+      }
+      return;
+    }
+    case Instruction::ExtractElement: {
+      bool Reuse = CanReuseExtract(VL);
+      if (Reuse) {
+        DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+      }
+      newTreeEntry(VL, Reuse);
+      return;
+    }
+    case Instruction::Load: {
+      // Check if the loads are consecutive or of we need to swizzle them.
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
+        LoadInst *L = cast<LoadInst>(VL[i]);
+        if (!L->isSimple() || !isConsecutiveAccess(VL[i], VL[i + 1])) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
+          return;
+        }
+      }
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+      return;
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      Type *SrcTy = VL0->getOperand(0)->getType();
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+        if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+          return;
+        }
+      }
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth+1);
+      }
+      return;
+    }
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      // Check that all of the compares have the same predicate.
+      CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+      Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
+      for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+        CmpInst *Cmp = cast<CmpInst>(VL[i]);
+        if (Cmp->getPredicate() != P0 ||
+            Cmp->getOperand(0)->getType() != ComparedTy) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+          return;
+        }
+      }
+
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth+1);
+      }
+      return;
+    }
+    case Instruction::Select:
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+
+      // Sort operands of the instructions so that each side is more likely to
+      // have the same opcode.
+      if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+        ValueList Left, Right;
+        reorderInputsAccordingToOpcode(VL, Left, Right);
+        buildTree_rec(Left, Depth + 1);
+        buildTree_rec(Right, Depth + 1);
+        return;
+      }
+
+      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+        ValueList Operands;
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < VL.size(); ++j)
+          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+        buildTree_rec(Operands, Depth+1);
+      }
+      return;
+    }
+    case Instruction::Store: {
+      // Check if the stores are consecutive or of we need to swizzle them.
+      for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+        if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+          newTreeEntry(VL, false);
+          DEBUG(dbgs() << "SLP: Non consecutive store.\n");
+          return;
+        }
+
+      newTreeEntry(VL, true);
+      DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+
+      ValueList Operands;
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+
+      // We can ignore these values because we are sinking them down.
+      MemBarrierIgnoreList.insert(VL.begin(), VL.end());
+      buildTree_rec(Operands, Depth + 1);
+      return;
+    }
+    default:
+      newTreeEntry(VL, false);
+      DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+      return;
+  }
+}
+
+int BoUpSLP::getEntryCost(TreeEntry *E) {
+  ArrayRef<Value*> VL = E->Scalars;
+
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  if (E->NeedToGather) {
+    if (allConstant(VL))
+      return 0;
+    if (isSplat(VL)) {
+      return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+    }
+    return getGatherCost(E->Scalars);
+  }
+
+  assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
+         "Invalid VL");
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  unsigned Opcode = VL0->getOpcode();
+  switch (Opcode) {
+    case Instruction::PHI: {
+      return 0;
+    }
+    case Instruction::ExtractElement: {
+      if (CanReuseExtract(VL))
+        return 0;
+      return getGatherCost(VecTy);
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      Type *SrcTy = VL0->getOperand(0)->getType();
+
+      // Calculate the cost of this instruction.
+      int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+                                                         VL0->getType(), SrcTy);
+
+      VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+      return VecCost - ScalarCost;
+    }
+    case Instruction::FCmp:
+    case Instruction::ICmp:
+    case Instruction::Select:
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      // Calculate the cost of this instruction.
+      int ScalarCost = 0;
+      int VecCost = 0;
+      if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
+          Opcode == Instruction::Select) {
+        VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+        ScalarCost = VecTy->getNumElements() *
+        TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
+        VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+      } else {
+        // Certain instructions can be cheaper to vectorize if they have a
+        // constant second vector operand.
+        TargetTransformInfo::OperandValueKind Op1VK =
+            TargetTransformInfo::OK_AnyValue;
+        TargetTransformInfo::OperandValueKind Op2VK =
+            TargetTransformInfo::OK_UniformConstantValue;
+
+        // Check whether all second operands are constant.
+        for (unsigned i = 0; i < VL.size(); ++i)
+          if (!isa<ConstantInt>(cast<Instruction>(VL[i])->getOperand(1))) {
+            Op2VK = TargetTransformInfo::OK_AnyValue;
+            break;
+          }
+
+        ScalarCost =
+            VecTy->getNumElements() *
+            TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK);
+        VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK);
+      }
+      return VecCost - ScalarCost;
+    }
+    case Instruction::Load: {
+      // Cost of wide load - cost of scalar loads.
+      int ScalarLdCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+      int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, 1, 0);
+      return VecLdCost - ScalarLdCost;
+    }
+    case Instruction::Store: {
+      // We know that we can merge the stores. Calculate the cost.
+      int ScalarStCost = VecTy->getNumElements() *
+      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+      int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, 1, 0);
+      return VecStCost - ScalarStCost;
+    }
+    default:
+      llvm_unreachable("Unknown instruction");
+  }
+}
+
+bool BoUpSLP::isFullyVectorizableTinyTree() {
+  DEBUG(dbgs() << "SLP: Check whether the tree with height " <<
+        VectorizableTree.size() << " is fully vectorizable .\n");
+
+  // We only handle trees of height 2.
+  if (VectorizableTree.size() != 2)
+    return false;
+
+  // Gathering cost would be too much for tiny trees.
+  if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) 
+    return false; 
+
+  return true; 
+}
+
+int BoUpSLP::getTreeCost() {
+  int Cost = 0;
+  DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
+        VectorizableTree.size() << ".\n");
+
+  // We only vectorize tiny trees if it is fully vectorizable.
+  if (VectorizableTree.size() < 3 && !isFullyVectorizableTinyTree()) {
+    if (!VectorizableTree.size()) {
+      assert(!ExternalUses.size() && "We should not have any external users");
+    }
+    return INT_MAX;
+  }
+
+  unsigned BundleWidth = VectorizableTree[0].Scalars.size();
+
+  for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
+    int C = getEntryCost(&VectorizableTree[i]);
+    DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
+          << *VectorizableTree[i].Scalars[0] << " .\n");
+    Cost += C;
+  }
+
+  int ExtractCost = 0;
+  for (UserList::iterator I = ExternalUses.begin(), E = ExternalUses.end();
+       I != E; ++I) {
+
+    VectorType *VecTy = VectorType::get(I->Scalar->getType(), BundleWidth);
+    ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+                                           I->Lane);
+  }
+
+
+  DEBUG(dbgs() << "SLP: Total Cost " << Cost + ExtractCost<< ".\n");
+  return  Cost + ExtractCost;
+}
+
+int BoUpSLP::getGatherCost(Type *Ty) {
+  int Cost = 0;
+  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+  return Cost;
+}
+
+int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
+  // Find the type of the operands in VL.
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+  // Find the cost of inserting/extracting values from the vector.
+  return getGatherCost(VecTy);
+}
+
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return AA->getLocation(SI);
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return AA->getLocation(LI);
+  return AliasAnalysis::Location();
+}
+
+Value *BoUpSLP::getPointerOperand(Value *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerOperand();
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->getPointerOperand();
+  return 0;
+}
+
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
+  if (LoadInst *L = dyn_cast<LoadInst>(I))
+    return L->getPointerAddressSpace();
+  if (StoreInst *S = dyn_cast<StoreInst>(I))
+    return S->getPointerAddressSpace();
+  return -1;
+}
+
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
+  Value *PtrA = getPointerOperand(A);
+  Value *PtrB = getPointerOperand(B);
+  unsigned ASA = getAddressSpaceOperand(A);
+  unsigned ASB = getAddressSpaceOperand(B);
+
+  // Check that the address spaces match and that the pointers are valid.
+  if (!PtrA || !PtrB || (ASA != ASB))
+    return false;
+
+  // Make sure that A and B are different pointers of the same type.
+  if (PtrA == PtrB || PtrA->getType() != PtrB->getType())
+    return false;
+
+  unsigned PtrBitWidth = DL->getPointerSizeInBits(ASA);
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  APInt Size(PtrBitWidth, DL->getTypeStoreSize(Ty));
+
+  APInt OffsetA(PtrBitWidth, 0), OffsetB(PtrBitWidth, 0);
+  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetA);
+  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(*DL, OffsetB);
+
+  APInt OffsetDelta = OffsetB - OffsetA;
+
+  // Check if they are based on the same pointer. That makes the offsets
+  // sufficient.
+  if (PtrA == PtrB)
+    return OffsetDelta == Size;
+
+  // Compute the necessary base pointer delta to have the necessary final delta
+  // equal to the size.
+  APInt BaseDelta = Size - OffsetDelta;
+
+  // Otherwise compute the distance with SCEV between the base pointers.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *C = SE->getConstant(BaseDelta);
+  const SCEV *X = SE->getAddExpr(PtrSCEVA, C);
+  return X == PtrSCEVB;
+}
+
+Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
+  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
+  BasicBlock::iterator I = Src, E = Dst;
+  /// Scan all of the instruction from SRC to DST and check if
+  /// the source may alias.
+  for (++I; I != E; ++I) {
+    // Ignore store instructions that are marked as 'ignore'.
+    if (MemBarrierIgnoreList.count(I))
+      continue;
+    if (Src->mayWriteToMemory()) /* Write */ {
+      if (!I->mayReadOrWriteMemory())
+        continue;
+    } else /* Read */ {
+      if (!I->mayWriteToMemory())
+        continue;
+    }
+    AliasAnalysis::Location A = getLocation(&*I);
+    AliasAnalysis::Location B = getLocation(Src);
+
+    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
+      return I;
+  }
+  return 0;
+}
+
+int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
+  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+  BlockNumbering &BN = BlocksNumbers[BB];
+
+  int MaxIdx = BN.getIndex(BB->getFirstNonPHI());
+  for (unsigned i = 0, e = VL.size(); i < e; ++i)
+    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+  return MaxIdx;
+}
+
+Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
+  BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
+  assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
+  BlockNumbering &BN = BlocksNumbers[BB];
+
+  int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
+  for (unsigned i = 1, e = VL.size(); i < e; ++i)
+    MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
+  Instruction *I = BN.getInstruction(MaxIdx);
+  assert(I && "bad location");
+  return I;
+}
+
+void BoUpSLP::setInsertPointAfterBundle(ArrayRef<Value *> VL) {
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  Instruction *LastInst = getLastInstruction(VL);
+  BasicBlock::iterator NextInst = LastInst;
+  ++NextInst;
+  Builder.SetInsertPoint(VL0->getParent(), NextInst);
+  Builder.SetCurrentDebugLocation(VL0->getDebugLoc());
+}
+
+Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+  Value *Vec = UndefValue::get(Ty);
+  // Generate the 'InsertElement' instruction.
+  for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
+    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
+    if (Instruction *Insrt = dyn_cast<Instruction>(Vec)) {
+      GatherSeq.insert(Insrt);
+      CSEBlocks.insert(Insrt->getParent());
+
+      // Add to our 'need-to-extract' list.
+      if (ScalarToTreeEntry.count(VL[i])) {
+        int Idx = ScalarToTreeEntry[VL[i]];
+        TreeEntry *E = &VectorizableTree[Idx];
+        // Find which lane we need to extract.
+        int FoundLane = -1;
+        for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
+          // Is this the lane of the scalar that we are looking for ?
+          if (E->Scalars[Lane] == VL[i]) {
+            FoundLane = Lane;
+            break;
+          }
+        }
+        assert(FoundLane >= 0 && "Could not find the correct lane");
+        ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane));
+      }
+    }
+  }
+
+  return Vec;
+}
+
+Value *BoUpSLP::alreadyVectorized(ArrayRef<Value *> VL) const {
+  SmallDenseMap<Value*, int>::const_iterator Entry
+    = ScalarToTreeEntry.find(VL[0]);
+  if (Entry != ScalarToTreeEntry.end()) {
+    int Idx = Entry->second;
+    const TreeEntry *En = &VectorizableTree[Idx];
+    if (En->isSame(VL) && En->VectorizedValue)
+      return En->VectorizedValue;
+  }
+  return 0;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+  if (ScalarToTreeEntry.count(VL[0])) {
+    int Idx = ScalarToTreeEntry[VL[0]];
+    TreeEntry *E = &VectorizableTree[Idx];
+    if (E->isSame(VL))
+      return vectorizeTree(E);
+  }
+
+  Type *ScalarTy = VL[0]->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+  return Gather(VL, VecTy);
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+  IRBuilder<>::InsertPointGuard Guard(Builder);
+
+  if (E->VectorizedValue) {
+    DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+    return E->VectorizedValue;
+  }
+
+  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+  Type *ScalarTy = VL0->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
+    ScalarTy = SI->getValueOperand()->getType();
+  VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+
+  if (E->NeedToGather) {
+    setInsertPointAfterBundle(E->Scalars);
+    return Gather(E->Scalars, VecTy);
+  }
+
+  unsigned Opcode = VL0->getOpcode();
+  assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
+
+  switch (Opcode) {
+    case Instruction::PHI: {
+      PHINode *PH = dyn_cast<PHINode>(VL0);
+      Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
+      Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+      PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+      E->VectorizedValue = NewPhi;
+
+      // PHINodes may have multiple entries from the same block. We want to
+      // visit every block once.
+      SmallSet<BasicBlock*, 4> VisitedBBs;
+
+      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+        ValueList Operands;
+        BasicBlock *IBB = PH->getIncomingBlock(i);
+
+        if (!VisitedBBs.insert(IBB)) {
+          NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+          continue;
+        }
+
+        // Prepare the operand vector.
+        for (unsigned j = 0; j < E->Scalars.size(); ++j)
+          Operands.push_back(cast<PHINode>(E->Scalars[j])->
+                             getIncomingValueForBlock(IBB));
+
+        Builder.SetInsertPoint(IBB->getTerminator());
+        Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+        Value *Vec = vectorizeTree(Operands);
+        NewPhi->addIncoming(Vec, IBB);
+      }
+
+      assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+             "Invalid number of incoming values");
+      return NewPhi;
+    }
+
+    case Instruction::ExtractElement: {
+      if (CanReuseExtract(E->Scalars)) {
+        Value *V = VL0->getOperand(0);
+        E->VectorizedValue = V;
+        return V;
+      }
+      return Gather(E->Scalars, VecTy);
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      ValueList INVL;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+        INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+
+      setInsertPointAfterBundle(E->Scalars);
+
+      Value *InVec = vectorizeTree(INVL);
+
+      if (Value *V = alreadyVectorized(E->Scalars))
+        return V;
+
+      CastInst *CI = dyn_cast<CastInst>(VL0);
+      Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+      E->VectorizedValue = V;
+      return V;
+    }
+    case Instruction::FCmp:
+    case Instruction::ICmp: {
+      ValueList LHSV, RHSV;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+        LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+        RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+      }
+
+      setInsertPointAfterBundle(E->Scalars);
+
+      Value *L = vectorizeTree(LHSV);
+      Value *R = vectorizeTree(RHSV);
+
+      if (Value *V = alreadyVectorized(E->Scalars))
+        return V;
+
+      CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+      Value *V;
+      if (Opcode == Instruction::FCmp)
+        V = Builder.CreateFCmp(P0, L, R);
+      else
+        V = Builder.CreateICmp(P0, L, R);
+
+      E->VectorizedValue = V;
+      return V;
+    }
+    case Instruction::Select: {
+      ValueList TrueVec, FalseVec, CondVec;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+        CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+        TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+        FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+      }
+
+      setInsertPointAfterBundle(E->Scalars);
+
+      Value *Cond = vectorizeTree(CondVec);
+      Value *True = vectorizeTree(TrueVec);
+      Value *False = vectorizeTree(FalseVec);
+
+      if (Value *V = alreadyVectorized(E->Scalars))
+        return V;
+
+      Value *V = Builder.CreateSelect(Cond, True, False);
+      E->VectorizedValue = V;
+      return V;
+    }
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      ValueList LHSVL, RHSVL;
+      if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
+        reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL);
+      else
+        for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+          LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+          RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+        }
+
+      setInsertPointAfterBundle(E->Scalars);
+
+      Value *LHS = vectorizeTree(LHSVL);
+      Value *RHS = vectorizeTree(RHSVL);
+
+      if (LHS == RHS && isa<Instruction>(LHS)) {
+        assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
+      }
+
+      if (Value *V = alreadyVectorized(E->Scalars))
+        return V;
+
+      BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+      E->VectorizedValue = V;
+
+      if (Instruction *I = dyn_cast<Instruction>(V))
+        return propagateMetadata(I, E->Scalars);
+
+      return V;
+    }
+    case Instruction::Load: {
+      // Loads are inserted at the head of the tree because we don't want to
+      // sink them all the way down past store instructions.
+      setInsertPointAfterBundle(E->Scalars);
+
+      LoadInst *LI = cast<LoadInst>(VL0);
+      unsigned AS = LI->getPointerAddressSpace();
+
+      Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
+                                            VecTy->getPointerTo(AS));
+      unsigned Alignment = LI->getAlignment();
+      LI = Builder.CreateLoad(VecPtr);
+      LI->setAlignment(Alignment);
+      E->VectorizedValue = LI;
+      return propagateMetadata(LI, E->Scalars);
+    }
+    case Instruction::Store: {
+      StoreInst *SI = cast<StoreInst>(VL0);
+      unsigned Alignment = SI->getAlignment();
+      unsigned AS = SI->getPointerAddressSpace();
+
+      ValueList ValueOp;
+      for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+        ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+
+      setInsertPointAfterBundle(E->Scalars);
+
+      Value *VecValue = vectorizeTree(ValueOp);
+      Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
+                                            VecTy->getPointerTo(AS));
+      StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+      S->setAlignment(Alignment);
+      E->VectorizedValue = S;
+      return propagateMetadata(S, E->Scalars);
+    }
+    default:
+    llvm_unreachable("unknown inst");
+  }
+  return 0;
+}
+
+Value *BoUpSLP::vectorizeTree() {
+  Builder.SetInsertPoint(F->getEntryBlock().begin());
+  vectorizeTree(&VectorizableTree[0]);
+
+  DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size() << " values .\n");
+
+  // Extract all of the elements with the external uses.
+  for (UserList::iterator it = ExternalUses.begin(), e = ExternalUses.end();
+       it != e; ++it) {
+    Value *Scalar = it->Scalar;
+    llvm::User *User = it->User;
+
+    // Skip users that we already RAUW. This happens when one instruction
+    // has multiple uses of the same value.
+    if (std::find(Scalar->use_begin(), Scalar->use_end(), User) ==
+        Scalar->use_end())
+      continue;
+    assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
+
+    int Idx = ScalarToTreeEntry[Scalar];
+    TreeEntry *E = &VectorizableTree[Idx];
+    assert(!E->NeedToGather && "Extracting from a gather list");
+
+    Value *Vec = E->VectorizedValue;
+    assert(Vec && "Can't find vectorizable value");
+
+    Value *Lane = Builder.getInt32(it->Lane);
+    // Generate extracts for out-of-tree users.
+    // Find the insertion point for the extractelement lane.
+    if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
+      Builder.SetInsertPoint(PN->getParent()->getFirstInsertionPt());
+      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      CSEBlocks.insert(PN->getParent());
+      User->replaceUsesOfWith(Scalar, Ex);
+    } else if (isa<Instruction>(Vec)){
+      if (PHINode *PH = dyn_cast<PHINode>(User)) {
+        for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
+          if (PH->getIncomingValue(i) == Scalar) {
+            Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+            Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+            CSEBlocks.insert(PH->getIncomingBlock(i));
+            PH->setOperand(i, Ex);
+          }
+        }
+      } else {
+        Builder.SetInsertPoint(cast<Instruction>(User));
+        Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+        CSEBlocks.insert(cast<Instruction>(User)->getParent());
+        User->replaceUsesOfWith(Scalar, Ex);
+     }
+    } else {
+      Builder.SetInsertPoint(F->getEntryBlock().begin());
+      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      CSEBlocks.insert(&F->getEntryBlock());
+      User->replaceUsesOfWith(Scalar, Ex);
+    }
+
+    DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+  }
+
+  // For each vectorized value:
+  for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+    TreeEntry *Entry = &VectorizableTree[EIdx];
+
+    // For each lane:
+    for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+      Value *Scalar = Entry->Scalars[Lane];
+
+      // No need to handle users of gathered values.
+      if (Entry->NeedToGather)
+        continue;
+
+      assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
+      Type *Ty = Scalar->getType();
+      if (!Ty->isVoidTy()) {
+        for (Value::use_iterator User = Scalar->use_begin(),
+             UE = Scalar->use_end(); User != UE; ++User) {
+          DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
+
+          assert((ScalarToTreeEntry.count(*User) ||
+                  // It is legal to replace the reduction users by undef.
+                  (RdxOps && RdxOps->count(*User))) &&
+                 "Replacing out-of-tree value with undef");
+        }
+        Value *Undef = UndefValue::get(Ty);
+        Scalar->replaceAllUsesWith(Undef);
+      }
+      DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+      cast<Instruction>(Scalar)->eraseFromParent();
+    }
+  }
+
+  for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+    BlocksNumbers[it].forget();
+  }
+  Builder.ClearInsertionPoint();
+
+  return VectorizableTree[0].VectorizedValue;
+}
+
+class DTCmp {
+  const DominatorTree *DT;
+
+public:
+  DTCmp(const DominatorTree *DT) : DT(DT) {}
+  bool operator()(const BasicBlock *A, const BasicBlock *B) const {
+    return DT->properlyDominates(A, B);
+  }
+};
+
+void BoUpSLP::optimizeGatherSequence() {
+  DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+        << " gather sequences instructions.\n");
+  // LICM InsertElementInst sequences.
+  for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
+       e = GatherSeq.end(); it != e; ++it) {
+    InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
+
+    if (!Insert)
+      continue;
+
+    // Check if this block is inside a loop.
+    Loop *L = LI->getLoopFor(Insert->getParent());
+    if (!L)
+      continue;
+
+    // Check if it has a preheader.
+    BasicBlock *PreHeader = L->getLoopPreheader();
+    if (!PreHeader)
+      continue;
+
+    // If the vector or the element that we insert into it are
+    // instructions that are defined in this basic block then we can't
+    // hoist this instruction.
+    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
+    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
+    if (CurrVec && L->contains(CurrVec))
+      continue;
+    if (NewElem && L->contains(NewElem))
+      continue;
+
+    // We can hoist this instruction. Move it to the pre-header.
+    Insert->moveBefore(PreHeader->getTerminator());
+  }
+
+  // Sort blocks by domination. This ensures we visit a block after all blocks
+  // dominating it are visited.
+  SmallVector<BasicBlock *, 8> CSEWorkList(CSEBlocks.begin(), CSEBlocks.end());
+  std::stable_sort(CSEWorkList.begin(), CSEWorkList.end(), DTCmp(DT));
+
+  // Perform O(N^2) search over the gather sequences and merge identical
+  // instructions. TODO: We can further optimize this scan if we split the
+  // instructions into different buckets based on the insert lane.
+  SmallVector<Instruction *, 16> Visited;
+  for (SmallVectorImpl<BasicBlock *>::iterator I = CSEWorkList.begin(),
+                                               E = CSEWorkList.end();
+       I != E; ++I) {
+    assert((I == CSEWorkList.begin() || !DT->dominates(*I, *llvm::prior(I))) &&
+           "Worklist not sorted properly!");
+    BasicBlock *BB = *I;
+    // For all instructions in blocks containing gather sequences:
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+      Instruction *In = it++;
+      if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+        continue;
+
+      // Check if we can replace this instruction with any of the
+      // visited instructions.
+      for (SmallVectorImpl<Instruction *>::iterator v = Visited.begin(),
+                                                    ve = Visited.end();
+           v != ve; ++v) {
+        if (In->isIdenticalTo(*v) &&
+            DT->dominates((*v)->getParent(), In->getParent())) {
+          In->replaceAllUsesWith(*v);
+          In->eraseFromParent();
+          In = 0;
+          break;
+        }
+      }
+      if (In) {
+        assert(std::find(Visited.begin(), Visited.end(), In) == Visited.end());
+        Visited.push_back(In);
+      }
+    }
+  }
+  CSEBlocks.clear();
+  GatherSeq.clear();
+}
+
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
-  typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap;
+  typedef SmallVector<StoreInst *, 8> StoreList;
+  typedef MapVector<Value *, StoreList> StoreListMap;
 
   /// Pass identification, replacement for typeid
   static char ID;
@@ -61,6 +1764,7 @@ struct SLPVectorizer : public FunctionPass {
   TargetTransformInfo *TTI;
   AliasAnalysis *AA;
   LoopInfo *LI;
+  DominatorTree *DT;
 
   virtual bool runOnFunction(Function &F) {
     SE = &getAnalysis<ScalarEvolution>();
@@ -68,41 +1772,50 @@ struct SLPVectorizer : public FunctionPass {
     TTI = &getAnalysis<TargetTransformInfo>();
     AA = &getAnalysis<AliasAnalysis>();
     LI = &getAnalysis<LoopInfo>();
+    DT = &getAnalysis<DominatorTree>();
 
     StoreRefs.clear();
     bool Changed = false;
 
+    // If the target claims to have no vector registers don't attempt
+    // vectorization.
+    if (!TTI->getNumberOfRegisters(true))
+      return false;
+
     // Must have DataLayout. We can't require it because some tests run w/o
     // triple.
     if (!DL)
       return false;
 
-    for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) {
-      BasicBlock *BB = it;
-      bool BBChanged = false;
+    // Don't vectorize when the attribute NoImplicitFloat is used.
+    if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+      return false;
 
-      // Use the bollom up slp vectorizer to construct chains that start with
-      // he store instructions.
-      BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB));
+    DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
 
-      // Vectorize trees that end at reductions.
-      BBChanged |= vectorizeReductions(BB, R);
+    // Use the bollom up slp vectorizer to construct chains that start with
+    // he store instructions.
+    BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
+
+    // Scan the blocks in the function in post order.
+    for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
+         e = po_end(&F.getEntryBlock()); it != e; ++it) {
+      BasicBlock *BB = *it;
 
       // Vectorize trees that end at stores.
       if (unsigned count = collectStores(BB, R)) {
         (void)count;
-        DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n");
-        BBChanged |= vectorizeStoreChains(R);
+        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
+        Changed |= vectorizeStoreChains(R);
       }
 
-      // Try to hoist some of the scalarization code to the preheader.
-      if (BBChanged) hoistGatherSequence(LI, BB, R);
-
-      Changed |= BBChanged;
+      // Vectorize trees that end at reductions.
+      Changed |= vectorizeChainsInBlock(BB, R);
     }
 
     if (Changed) {
-      DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n");
+      R.optimizeGatherSequence();
+      DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
       DEBUG(verifyFunction(F));
     }
     return Changed;
@@ -114,6 +1827,10 @@ struct SLPVectorizer : public FunctionPass {
     AU.addRequired<AliasAnalysis>();
     AU.addRequired<TargetTransformInfo>();
     AU.addRequired<LoopInfo>();
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTree>();
+    AU.setPreservesCFG();
   }
 
 private:
@@ -125,29 +1842,149 @@ private:
   unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
 
   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
-  bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R);
+  bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
 
   /// \brief Try to vectorize a list of operands.
+  /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
 
   /// \brief Try to vectorize a chain that may start at the operands of \V;
-  bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R);
+  bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
 
   /// \brief Vectorize the stores that were collected in StoreRefs.
   bool vectorizeStoreChains(BoUpSLP &R);
 
-  /// \brief Try to hoist gather sequences outside of the loop in cases where
-  /// all of the sources are loop invariant.
-  void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R);
+  /// \brief Scan the basic block and look for patterns that are likely to start
+  /// a vectorization chain.
+  bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
 
-  /// \brief Scan the basic block and look for reductions that may start a
-  /// vectorization chain.
-  bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R);
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
+                           BoUpSLP &R);
 
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
+                       BoUpSLP &R);
 private:
   StoreListMap StoreRefs;
 };
 
+/// \brief Check that the Values in the slice in VL array are still existant in
+/// the WeakVH array.
+/// Vectorization of part of the VL array may cause later values in the VL array
+/// to become invalid. We track when this has happened in the WeakVH array.
+static bool hasValueBeenRAUWed(ArrayRef<Value *> &VL,
+                               SmallVectorImpl<WeakVH> &VH,
+                               unsigned SliceBegin,
+                               unsigned SliceSize) {
+  for (unsigned i = SliceBegin; i < SliceBegin + SliceSize; ++i)
+    if (VH[i] != VL[i])
+      return true;
+
+  return false;
+}
+
+bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
+                                          int CostThreshold, BoUpSLP &R) {
+  unsigned ChainLen = Chain.size();
+  DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+        << "\n");
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+  unsigned VF = MinVecRegSize / Sz;
+
+  if (!isPowerOf2_32(Sz) || VF < 2)
+    return false;
+
+  // Keep track of values that were delete by vectorizing in the loop below.
+  SmallVector<WeakVH, 8> TrackValues(Chain.begin(), Chain.end());
+
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+    if (i + VF > e)
+      break;
+
+    // Check that a previous iteration of this loop did not delete the Value.
+    if (hasValueBeenRAUWed(Chain, TrackValues, i, VF))
+      continue;
+
+    DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+          << "\n");
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+    R.buildTree(Operands);
+
+    int Cost = R.getTreeCost();
+
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
+                                    int costThreshold, BoUpSLP &R) {
+  SetVector<Value *> Heads, Tails;
+  SmallDenseMap<Value *, Value *> ConsecutiveChain;
+
+  // We may run into multiple chains that merge into a single chain. We mark the
+  // stores that we vectorized so that we don't visit the same store twice.
+  BoUpSLP::ValueSet VectorizedStores;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of stores that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i) {
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j)
+        continue;
+
+      if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+  }
+
+  // For stores that start but don't end a link in the chain:
+  for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+       it != e; ++it) {
+    if (Tails.count(*it))
+      continue;
+
+    // We found a store instr that starts a chain. Now follow the chain and try
+    // to vectorize it.
+    BoUpSLP::ValueList Operands;
+    Value *I = *it;
+    // Collect the chain into a list.
+    while (Tails.count(I) || Heads.count(I)) {
+      if (VectorizedStores.count(I))
+        break;
+      Operands.push_back(I);
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
+
+    // Mark the vectorized stores so that we don't vectorize them again.
+    if (Vectorized)
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+    Changed |= Vectorized;
+  }
+
+  return Changed;
+}
+
+
 unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
   unsigned count = 0;
   StoreRefs.clear();
@@ -156,15 +1993,17 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
     if (!SI)
       continue;
 
+    // Don't touch volatile stores.
+    if (!SI->isSimple())
+      continue;
+
     // Check that the pointer points to scalars.
     Type *Ty = SI->getValueOperand()->getType();
     if (Ty->isAggregateType() || Ty->isVectorTy())
       return 0;
 
-    // Find the base of the GEP.
-    Value *Ptr = SI->getPointerOperand();
-    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
-      Ptr = GEP->getPointerOperand();
+    // Find the base pointer.
+    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
 
     // Save the store locations.
     StoreRefs[Ptr].push_back(SI);
@@ -173,34 +2012,83 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
   return count;
 }
 
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R) {
-  if (!A || !B) return false;
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+  if (!A || !B)
+    return false;
   Value *VL[] = { A, B };
   return tryToVectorizeList(VL, R);
 }
 
 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
-  DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+  if (VL.size() < 2)
+    return false;
+
+  DEBUG(dbgs() << "SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+
+  // Check that all of the parts are scalar instructions of the same type.
+  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+  if (!I0)
+    return false;
+
+  unsigned Opcode0 = I0->getOpcode();
+
+  Type *Ty0 = I0->getType();
+  unsigned Sz = DL->getTypeSizeInBits(Ty0);
+  unsigned VF = MinVecRegSize / Sz;
 
-  // Check that all of the parts are scalar.
   for (int i = 0, e = VL.size(); i < e; ++i) {
     Type *Ty = VL[i]->getType();
     if (Ty->isAggregateType() || Ty->isVectorTy())
-      return 0;
+      return false;
+    Instruction *Inst = dyn_cast<Instruction>(VL[i]);
+    if (!Inst || Inst->getOpcode() != Opcode0)
+      return false;
   }
 
-  int Cost = R.getTreeCost(VL);
-  int ExtrCost = R.getScalarizationCost(VL);
-  DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
-        " Cost of extract:" << ExtrCost << ".\n");
-  if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
-  DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
-  R.vectorizeArith(VL);
-  return true;
+  bool Changed = false;
+
+  // Keep track of values that were delete by vectorizing in the loop below.
+  SmallVector<WeakVH, 8> TrackValues(VL.begin(), VL.end());
+
+  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+    unsigned OpsWidth = 0;
+
+    if (i + VF > e)
+      OpsWidth = e - i;
+    else
+      OpsWidth = VF;
+
+    if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+      break;
+
+    // Check that a previous iteration of this loop did not delete the Value.
+    if (hasValueBeenRAUWed(VL, TrackValues, i, OpsWidth))
+      continue;
+
+    DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+                 << "\n");
+    ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
+
+    R.buildTree(Ops);
+    int Cost = R.getTreeCost();
+
+    if (Cost < -SLPCostThreshold) {
+      DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
+      R.vectorizeTree();
+
+      // Move to the next bundle.
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  return Changed;
 }
 
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V,  BoUpSLP &R) {
-  if (!V) return false;
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
+  if (!V)
+    return false;
+
   // Try to vectorize V.
   if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R))
     return true;
@@ -237,38 +2125,502 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V,  BoUpSLP &R) {
   return 0;
 }
 
-bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+/// \brief Generate a shuffle mask to be used in a reduction tree.
+///
+/// \param VecLen The length of the vector to be reduced.
+/// \param NumEltsToRdx The number of elements that should be reduced in the
+///        vector.
+/// \param IsPairwise Whether the reduction is a pairwise or splitting
+///        reduction. A pairwise reduction will generate a mask of 
+///        <0,2,...> or <1,3,..> while a splitting reduction will generate
+///        <2,3, undef,undef> for a vector of 4 and NumElts = 2.
+/// \param IsLeft True will generate a mask of even elements, odd otherwise.
+static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
+                                   bool IsPairwise, bool IsLeft,
+                                   IRBuilder<> &Builder) {
+  assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
+
+  SmallVector<Constant *, 32> ShuffleMask(
+      VecLen, UndefValue::get(Builder.getInt32Ty()));
+
+  if (IsPairwise)
+    // Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+  else
+    // Move the upper half of the vector to the lower half.
+    for (unsigned i = 0; i != NumEltsToRdx; ++i)
+      ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+
+  return ConstantVector::get(ShuffleMask);
+}
+
+
+/// Model horizontal reductions.
+///
+/// A horizontal reduction is a tree of reduction operations (currently add and
+/// fadd) that has operations that can be put into a vector as its leaf.
+/// For example, this tree:
+///
+/// mul mul mul mul
+///  \  /    \  /
+///   +       +
+///    \     /
+///       +
+/// This tree has "mul" as its reduced values and "+" as its reduction
+/// operations. A reduction might be feeding into a store or a binary operation
+/// feeding a phi.
+///    ...
+///    \  /
+///     +
+///     |
+///  phi +=
+///
+///  Or:
+///    ...
+///    \  /
+///     +
+///     |
+///   *p =
+///
+class HorizontalReduction {
+  SmallPtrSet<Value *, 16> ReductionOps;
+  SmallVector<Value *, 32> ReducedVals;
+
+  BinaryOperator *ReductionRoot;
+  PHINode *ReductionPHI;
+
+  /// The opcode of the reduction.
+  unsigned ReductionOpcode;
+  /// The opcode of the values we perform a reduction on.
+  unsigned ReducedValueOpcode;
+  /// The width of one full horizontal reduction operation.
+  unsigned ReduxWidth;
+  /// Should we model this reduction as a pairwise reduction tree or a tree that
+  /// splits the vector in halves and adds those halves.
+  bool IsPairwiseReduction;
+
+public:
+  HorizontalReduction()
+    : ReductionRoot(0), ReductionPHI(0), ReductionOpcode(0),
+    ReducedValueOpcode(0), ReduxWidth(0), IsPairwiseReduction(false) {}
+
+  /// \brief Try to find a reduction tree.
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 DataLayout *DL) {
+    assert((!Phi ||
+            std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
+           "Thi phi needs to use the binary operator");
+
+    // We could have a initial reductions that is not an add.
+    //  r *= v1 + v2 + v3 + v4
+    // In such a case start looking for a tree rooted in the first '+'.
+    if (Phi) {
+      if (B->getOperand(0) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(1));
+      } else if (B->getOperand(1) == Phi) {
+        Phi = 0;
+        B = dyn_cast<BinaryOperator>(B->getOperand(0));
+      }
+    }
+
+    if (!B)
+      return false;
+
+    Type *Ty = B->getType();
+    if (Ty->isVectorTy())
+      return false;
+
+    ReductionOpcode = B->getOpcode();
+    ReducedValueOpcode = 0;
+    ReduxWidth = MinVecRegSize / DL->getTypeSizeInBits(Ty);
+    ReductionRoot = B;
+    ReductionPHI = Phi;
+
+    if (ReduxWidth < 4)
+      return false;
+
+    // We currently only support adds.
+    if (ReductionOpcode != Instruction::Add &&
+        ReductionOpcode != Instruction::FAdd)
+      return false;
+
+    // Post order traverse the reduction tree starting at B. We only handle true
+    // trees containing only binary operators.
+    SmallVector<std::pair<BinaryOperator *, unsigned>, 32> Stack;
+    Stack.push_back(std::make_pair(B, 0));
+    while (!Stack.empty()) {
+      BinaryOperator *TreeN = Stack.back().first;
+      unsigned EdgeToVist = Stack.back().second++;
+      bool IsReducedValue = TreeN->getOpcode() != ReductionOpcode;
+
+      // Only handle trees in the current basic block.
+      if (TreeN->getParent() != B->getParent())
+        return false;
+
+      // Each tree node needs to have one user except for the ultimate
+      // reduction.
+      if (!TreeN->hasOneUse() && TreeN != B)
+        return false;
+
+      // Postorder vist.
+      if (EdgeToVist == 2 || IsReducedValue) {
+        if (IsReducedValue) {
+          // Make sure that the opcodes of the operations that we are going to
+          // reduce match.
+          if (!ReducedValueOpcode)
+            ReducedValueOpcode = TreeN->getOpcode();
+          else if (ReducedValueOpcode != TreeN->getOpcode())
+            return false;
+          ReducedVals.push_back(TreeN);
+        } else {
+          // We need to be able to reassociate the adds.
+          if (!TreeN->isAssociative())
+            return false;
+          ReductionOps.insert(TreeN);
+        }
+        // Retract.
+        Stack.pop_back();
+        continue;
+      }
+
+      // Visit left or right.
+      Value *NextV = TreeN->getOperand(EdgeToVist);
+      BinaryOperator *Next = dyn_cast<BinaryOperator>(NextV);
+      if (Next)
+        Stack.push_back(std::make_pair(Next, 0));
+      else if (NextV != Phi)
+        return false;
+    }
+    return true;
+  }
+
+  /// \brief Attempt to vectorize the tree found by
+  /// matchAssociativeReduction.
+  bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+    if (ReducedVals.empty())
+      return false;
+
+    unsigned NumReducedVals = ReducedVals.size();
+    if (NumReducedVals < ReduxWidth)
+      return false;
+
+    Value *VectorizedTree = 0;
+    IRBuilder<> Builder(ReductionRoot);
+    FastMathFlags Unsafe;
+    Unsafe.setUnsafeAlgebra();
+    Builder.SetFastMathFlags(Unsafe);
+    unsigned i = 0;
+
+    for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) {
+      ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth);
+      V.buildTree(ValsToReduce, &ReductionOps);
+
+      // Estimate cost.
+      int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]);
+      if (Cost >= -SLPCostThreshold)
+        break;
+
+      DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
+                   << ". (HorRdx)\n");
+
+      // Vectorize a tree.
+      DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+      Value *VectorizedRoot = V.vectorizeTree();
+
+      // Emit a reduction.
+      Value *ReducedSubTree = emitReduction(VectorizedRoot, Builder);
+      if (VectorizedTree) {
+        Builder.SetCurrentDebugLocation(Loc);
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedSubTree, "bin.rdx");
+      } else
+        VectorizedTree = ReducedSubTree;
+    }
+
+    if (VectorizedTree) {
+      // Finish the reduction.
+      for (; i < NumReducedVals; ++i) {
+        Builder.SetCurrentDebugLocation(
+          cast<Instruction>(ReducedVals[i])->getDebugLoc());
+        VectorizedTree = createBinOp(Builder, ReductionOpcode, VectorizedTree,
+                                     ReducedVals[i]);
+      }
+      // Update users.
+      if (ReductionPHI) {
+        assert(ReductionRoot != NULL && "Need a reduction operation");
+        ReductionRoot->setOperand(0, VectorizedTree);
+        ReductionRoot->setOperand(1, ReductionPHI);
+      } else
+        ReductionRoot->replaceAllUsesWith(VectorizedTree);
+    }
+    return VectorizedTree != 0;
+  }
+
+private:
+
+  /// \brief Calcuate the cost of a reduction.
+  int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal) {
+    Type *ScalarTy = FirstReducedVal->getType();
+    Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+
+    int PairwiseRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, true);
+    int SplittingRdxCost = TTI->getReductionCost(ReductionOpcode, VecTy, false);
+
+    IsPairwiseReduction = PairwiseRdxCost < SplittingRdxCost;
+    int VecReduxCost = IsPairwiseReduction ? PairwiseRdxCost : SplittingRdxCost;
+
+    int ScalarReduxCost =
+        ReduxWidth * TTI->getArithmeticInstrCost(ReductionOpcode, VecTy);
+
+    DEBUG(dbgs() << "SLP: Adding cost " << VecReduxCost - ScalarReduxCost
+                 << " for reduction that starts with " << *FirstReducedVal
+                 << " (It is a "
+                 << (IsPairwiseReduction ? "pairwise" : "splitting")
+                 << " reduction)\n");
+
+    return VecReduxCost - ScalarReduxCost;
+  }
+
+  static Value *createBinOp(IRBuilder<> &Builder, unsigned Opcode, Value *L,
+                            Value *R, const Twine &Name = "") {
+    if (Opcode == Instruction::FAdd)
+      return Builder.CreateFAdd(L, R, Name);
+    return Builder.CreateBinOp((Instruction::BinaryOps)Opcode, L, R, Name);
+  }
+
+  /// \brief Emit a horizontal reduction of the vectorized value.
+  Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder) {
+    assert(VectorizedValue && "Need to have a vectorized tree node");
+    Instruction *ValToReduce = dyn_cast<Instruction>(VectorizedValue);
+    assert(isPowerOf2_32(ReduxWidth) &&
+           "We only handle power-of-two reductions for now");
+
+    Value *TmpVec = ValToReduce;
+    for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
+      if (IsPairwiseReduction) {
+        Value *LeftMask =
+          createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
+        Value *RightMask =
+          createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+
+        Value *LeftShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
+        Value *RightShuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
+          "rdx.shuf.r");
+        TmpVec = createBinOp(Builder, ReductionOpcode, LeftShuf, RightShuf,
+                             "bin.rdx");
+      } else {
+        Value *UpperHalf =
+          createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
+        Value *Shuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
+        TmpVec = createBinOp(Builder, ReductionOpcode, TmpVec, Shuf, "bin.rdx");
+      }
+    }
+
+    // The result is in the first element of the vector.
+    return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+  }
+};
+
+/// \brief Recognize construction of vectors like
+///  %ra = insertelement <4 x float> undef, float %s0, i32 0
+///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
+///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
+///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
+///
+/// Returns true if it matches
+///
+static bool findBuildVector(InsertElementInst *IE,
+                            SmallVectorImpl<Value *> &Ops) {
+  if (!isa<UndefValue>(IE->getOperand(0)))
+    return false;
+
+  while (true) {
+    Ops.push_back(IE->getOperand(1));
+
+    if (IE->use_empty())
+      return false;
+
+    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
+    if (!NextUse)
+      return true;
+
+    // If this isn't the final use, make sure the next insertelement is the only
+    // use. It's OK if the final constructed vector is used multiple times
+    if (!IE->hasOneUse())
+      return false;
+
+    IE = NextUse;
+  }
+
+  return false;
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+  return V->getType() < V2->getType();
+}
+
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   bool Changed = false;
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    if (isa<DbgInfoIntrinsic>(it)) continue;
+  SmallVector<Value *, 4> Incoming;
+  SmallSet<Value *, 16> VisitedInstrs;
+
+  bool HaveVectorizedPhiNodes = true;
+  while (HaveVectorizedPhiNodes) {
+    HaveVectorizedPhiNodes = false;
+
+    // Collect the incoming values from the PHIs.
+    Incoming.clear();
+    for (BasicBlock::iterator instr = BB->begin(), ie = BB->end(); instr != ie;
+         ++instr) {
+      PHINode *P = dyn_cast<PHINode>(instr);
+      if (!P)
+        break;
+
+      if (!VisitedInstrs.count(P))
+        Incoming.push_back(P);
+    }
+
+    // Sort by type.
+    std::stable_sort(Incoming.begin(), Incoming.end(), PhiTypeSorterFunc);
+
+    // Try to vectorize elements base on their type.
+    for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+                                           E = Incoming.end();
+         IncIt != E;) {
+
+      // Look for the next elements with the same type.
+      SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+      while (SameTypeIt != E &&
+             (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+        VisitedInstrs.insert(*SameTypeIt);
+        ++SameTypeIt;
+      }
+
+      // Try to vectorize them.
+      unsigned NumElts = (SameTypeIt - IncIt);
+      DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
+      if (NumElts > 1 &&
+          tryToVectorizeList(ArrayRef<Value *>(IncIt, NumElts), R)) {
+        // Success start over because instructions might have been changed.
+        HaveVectorizedPhiNodes = true;
+        Changed = true;
+        break;
+      }
+
+      // Start over at the next instruction of a differnt type (or the end).
+      IncIt = SameTypeIt;
+    }
+  }
+
+  VisitedInstrs.clear();
+
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; it++) {
+    // We may go through BB multiple times so skip the one we have checked.
+    if (!VisitedInstrs.insert(it))
+      continue;
+
+    if (isa<DbgInfoIntrinsic>(it))
+      continue;
 
     // Try to vectorize reductions that use PHINodes.
     if (PHINode *P = dyn_cast<PHINode>(it)) {
       // Check that the PHI is a reduction PHI.
-      if (P->getNumIncomingValues() != 2) return Changed;
-      Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) :
-                    (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) :
-                     0));
+      if (P->getNumIncomingValues() != 2)
+        return Changed;
+      Value *Rdx =
+          (P->getIncomingBlock(0) == BB
+               ? (P->getIncomingValue(0))
+               : (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : 0));
       // Check if this is a Binary Operator.
       BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx);
       if (!BI)
         continue;
 
-      Value *Inst = BI->getOperand(0);
-      if (Inst == P) Inst = BI->getOperand(1);
-      Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R);
+      // Try to match and vectorize a horizontal reduction.
+      HorizontalReduction HorRdx;
+      if (ShouldVectorizeHor &&
+          HorRdx.matchAssociativeReduction(P, BI, DL) &&
+          HorRdx.tryToReduce(R, TTI)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
+     Value *Inst = BI->getOperand(0);
+      if (Inst == P)
+        Inst = BI->getOperand(1);
+
+      if (tryToVectorize(dyn_cast<BinaryOperator>(Inst), R)) {
+        // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid value.
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+        continue;
+      }
+
       continue;
     }
 
+    // Try to vectorize horizontal reductions feeding into a store.
+    if (ShouldStartVectorizeHorAtStore)
+      if (StoreInst *SI = dyn_cast<StoreInst>(it))
+        if (BinaryOperator *BinOp =
+                dyn_cast<BinaryOperator>(SI->getValueOperand())) {
+          HorizontalReduction HorRdx;
+          if (((HorRdx.matchAssociativeReduction(0, BinOp, DL) &&
+                HorRdx.tryToReduce(R, TTI)) ||
+               tryToVectorize(BinOp, R))) {
+            Changed = true;
+            it = BB->begin();
+            e = BB->end();
+            continue;
+          }
+        }
+
     // Try to vectorize trees that start at compare instructions.
     if (CmpInst *CI = dyn_cast<CmpInst>(it)) {
       if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) {
-        Changed |= true;
+        Changed = true;
+        // We would like to start over since some instructions are deleted
+        // and the iterator may become invalid value.
+        it = BB->begin();
+        e = BB->end();
         continue;
       }
-      for (int i = 0; i < 2; ++i)
-        if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i)))
-          Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+
+      for (int i = 0; i < 2; ++i) {
+         if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) {
+            if (tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R)) {
+              Changed = true;
+              // We would like to start over since some instructions are deleted
+              // and the iterator may become invalid value.
+              it = BB->begin();
+              e = BB->end();
+            }
+         }
+      }
+      continue;
+    }
+
+    // Try to vectorize trees that start at insertelement instructions.
+    if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
+      SmallVector<Value *, 8> Ops;
+      if (!findBuildVector(IE, Ops))
+        continue;
+
+      if (tryToVectorizeList(Ops, R)) {
+        Changed = true;
+        it = BB->begin();
+        e = BB->end();
+      }
+
       continue;
     }
   }
@@ -284,51 +2636,19 @@ bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
     if (it->second.size() < 2)
       continue;
 
-    DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " <<
-          it->second.size() << ".\n");
+    DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+          << it->second.size() << ".\n");
 
-    Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+    // Process the stores in chunks of 16.
+    for (unsigned CI = 0, CE = it->second.size(); CI < CE; CI+=16) {
+      unsigned Len = std::min<unsigned>(CE - CI, 16);
+      ArrayRef<StoreInst *> Chunk(&it->second[CI], Len);
+      Changed |= vectorizeStores(Chunk, -SLPCostThreshold, R);
+    }
   }
   return Changed;
 }
 
-void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB,
-                                        BoUpSLP &R) {
-  // Check if this block is inside a loop.
-  Loop *L = LI->getLoopFor(BB);
-  if (!L)
-    return;
-
-  // Check if it has a preheader.
-  BasicBlock *PreHeader = L->getLoopPreheader();
-  if (!PreHeader)
-    return;
-
-  // Mark the insertion point for the block.
-  Instruction *Location = PreHeader->getTerminator();
-
-  BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions();
-  for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end();
-       it != e; ++it) {
-    InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it);
-
-    // The InsertElement sequence can be simplified into a constant.
-    if (!Insert)
-      continue;
-
-    // If the vector or the element that we insert into it are
-    // instructions that are defined in this basic block then we can't
-    // hoist this instruction.
-    Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0));
-    Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1));
-    if (CurrVec && L->contains(CurrVec)) continue;
-    if (NewElem && L->contains(NewElem)) continue;
-
-    // We can hoist this instruction. Move it to the pre-header.
-    Insert->moveBefore(Location);
-  }
-}
-
 } // end anonymous namespace
 
 char SLPVectorizer::ID = 0;
@@ -341,8 +2661,5 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
 
 namespace llvm {
-  Pass *createSLPVectorizerPass() {
-    return new SLPVectorizer();
-  }
+Pass *createSLPVectorizerPass() { return new SLPVectorizer(); }
 }
-
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
deleted file mode 100644
index 9b94366..0000000
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ /dev/null
@@ -1,730 +0,0 @@
-//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "SLP"
-
-#include "VecUtils.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <map>
-
-using namespace llvm;
-
-static const unsigned MinVecRegSize = 128;
-
-static const unsigned RecursionMaxDepth = 6;
-
-namespace llvm {
-
-BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl,
-                 TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) :
-  BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp)  {
-  numberInstructions();
-}
-
-void BoUpSLP::numberInstructions() {
-  int Loc = 0;
-  InstrIdx.clear();
-  InstrVec.clear();
-  // Number the instructions in the block.
-  for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) {
-    InstrIdx[it] = Loc++;
-    InstrVec.push_back(it);
-    assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation");
-  }
-}
-
-Value *BoUpSLP::getPointerOperand(Value *I) {
-  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand();
-  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand();
-  return 0;
-}
-
-unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
-  if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace();
-  if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace();
-  return -1;
-}
-
-bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
-  Value *PtrA = getPointerOperand(A);
-  Value *PtrB = getPointerOperand(B);
-  unsigned ASA = getAddressSpaceOperand(A);
-  unsigned ASB = getAddressSpaceOperand(B);
-
-  // Check that the address spaces match and that the pointers are valid.
-  if (!PtrA || !PtrB || (ASA != ASB)) return false;
-
-  // Check that A and B are of the same type.
-  if (PtrA->getType() != PtrB->getType()) return false;
-
-  // Calculate the distance.
-  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
-  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
-  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
-  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
-
-  // Non constant distance.
-  if (!ConstOffSCEV) return false;
-
-  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
-  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
-  // The Instructions are connsecutive if the size of the first load/store is
-  // the same as the offset.
-  int64_t Sz = DL->getTypeStoreSize(Ty);
-  return ((-Offset) == Sz);
-}
-
-bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
-  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
-  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
-  unsigned VF = MinVecRegSize / Sz;
-
-  if (!isPowerOf2_32(Sz) || VF < 2) return false;
-
-  bool Changed = false;
-  // Look for profitable vectorizable trees at all offsets, starting at zero.
-  for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
-    if (i + VF > e) return Changed;
-    DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
-    ArrayRef<Value *> Operands = Chain.slice(i, VF);
-
-    int Cost = getTreeCost(Operands);
-    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
-    if (Cost < CostThreshold) {
-      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
-      vectorizeTree(Operands, VF);
-      i += VF - 1;
-      Changed = true;
-    }
-  }
-
-  return Changed;
-}
-
-bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
-  ValueSet Heads, Tails;
-  SmallDenseMap<Value*, Value*> ConsecutiveChain;
-
-  // We may run into multiple chains that merge into a single chain. We mark the
-  // stores that we vectorized so that we don't visit the same store twice.
-  ValueSet VectorizedStores;
-  bool Changed = false;
-
-  // Do a quadratic search on all of the given stores and find
-  // all of the pairs of loads that follow each other.
-  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
-    for (unsigned j = 0; j < e; ++j) {
-      if (i == j) continue;
-      if (isConsecutiveAccess(Stores[i], Stores[j])) {
-        Tails.insert(Stores[j]);
-        Heads.insert(Stores[i]);
-        ConsecutiveChain[Stores[i]] = Stores[j];
-      }
-    }
-
-  // For stores that start but don't end a link in the chain:
-  for (ValueSet::iterator it = Heads.begin(), e = Heads.end();it != e; ++it) {
-    if (Tails.count(*it)) continue;
-
-    // We found a store instr that starts a chain. Now follow the chain and try
-    // to vectorize it.
-    ValueList Operands;
-    Value *I = *it;
-    // Collect the chain into a list.
-    while (Tails.count(I) || Heads.count(I)) {
-      if (VectorizedStores.count(I)) break;
-      Operands.push_back(I);
-      // Move to the next value in the chain.
-      I = ConsecutiveChain[I];
-    }
-
-    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
-
-    // Mark the vectorized stores so that we don't vectorize them again.
-    if (Vectorized)
-      VectorizedStores.insert(Operands.begin(), Operands.end());
-    Changed |= Vectorized;
-  }
-
-  return Changed;
-}
-
-int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
-  // Find the type of the operands in VL.
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-  // Find the cost of inserting/extracting values from the vector.
-  return getScalarizationCost(VecTy);
-}
-
-int BoUpSLP::getScalarizationCost(Type *Ty) {
-  int Cost = 0;
-  for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
-    Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-  return Cost;
-}
-
-AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
-  if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI);
-  if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI);
-  return AliasAnalysis::Location();
-}
-
-Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) {
-  assert(Src->getParent() == Dst->getParent() && "Not the same BB");
-  BasicBlock::iterator I = Src, E = Dst;
-  /// Scan all of the instruction from SRC to DST and check if
-  /// the source may alias.
-  for (++I; I != E; ++I) {
-    // Ignore store instructions that are marked as 'ignore'.
-    if (MemBarrierIgnoreList.count(I)) continue;
-    if (Src->mayWriteToMemory()) /* Write */ {
-      if (!I->mayReadOrWriteMemory()) continue;
-    } else /* Read */ {
-      if (!I->mayWriteToMemory()) continue;
-    }
-    AliasAnalysis::Location A = getLocation(&*I);
-    AliasAnalysis::Location B = getLocation(Src);
-
-    if (!A.Ptr || !B.Ptr || AA->alias(A, B))
-      return I;
-  }
-  return 0;
-}
-
-void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) {
-  Value *Vec = vectorizeTree(Operands, Operands.size());
-  BasicBlock::iterator Loc = cast<Instruction>(Vec);
-  IRBuilder<> Builder(++Loc);
-  // After vectorizing the operands we need to generate extractelement
-  // instructions and replace all of the uses of the scalar values with
-  // the values that we extracted from the vectorized tree.
-  for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
-    Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
-    Operands[i]->replaceAllUsesWith(S);
-  }
-}
-
-int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
-  // Get rid of the list of stores that were removed, and from the
-  // lists of instructions with multiple users.
-  MemBarrierIgnoreList.clear();
-  LaneMap.clear();
-  MultiUserVals.clear();
-  MustScalarize.clear();
-
-  // Scan the tree and find which value is used by which lane, and which values
-  // must be scalarized.
-  getTreeUses_rec(VL, 0);
-
-  // Check that instructions with multiple users can be vectorized. Mark unsafe
-  // instructions.
-  for (ValueSet::iterator it = MultiUserVals.begin(),
-       e = MultiUserVals.end(); it != e; ++it) {
-    // Check that all of the users of this instr are within the tree
-    // and that they are all from the same lane.
-    int Lane = -1;
-    for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end();
-         I != E; ++I) {
-      if (LaneMap.find(*I) == LaneMap.end()) {
-        MustScalarize.insert(*it);
-        DEBUG(dbgs()<<"SLP: Adding " << **it <<
-              " to MustScalarize because of an out of tree usage.\n");
-        break;
-      }
-      if (Lane == -1) Lane = LaneMap[*I];
-      if (Lane != LaneMap[*I]) {
-        MustScalarize.insert(*it);
-        DEBUG(dbgs()<<"Adding " << **it <<
-              " to MustScalarize because multiple lane use it: "
-              << Lane << " and " << LaneMap[*I] << ".\n");
-        break;
-      }
-    }
-  }
-
-  // Now calculate the cost of vectorizing the tree.
-  return getTreeCost_rec(VL, 0);
-}
-
-void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  if (Depth == RecursionMaxDepth) return;
-
-  // Don't handle vectors.
-  if (VL[0]->getType()->isVectorTy()) return;
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    if (SI->getValueOperand()->getType()->isVectorTy()) return;
-
-  // Check if all of the operands are constants.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If one of the instructions is out of this BB, we need to scalarize all.
-    if (I && I->getParent() != BB) return;
-  }
-
-  // If all of the operands are identical or constant we have a simple solution.
-  if (AllConst || AllSameScalar) return;
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0) return;
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode()) return;
-  }
-
-  // Mark instructions with multiple users.
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // Remember to check if all of the users of this instr are vectorized
-    // within our tree.
-    if (I && I->getNumUses() > 1) MultiUserVals.insert(I);
-  }
-
-  for (int i = 0, e = VL.size(); i < e; ++i) {
-    // Check that the instruction is only used within
-    // one lane.
-    if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return;
-    // Make this instruction as 'seen' and remember the lane.
-    LaneMap[VL[i]] = i;
-  }
-
-  switch (Opcode) {
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast:
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::FDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor: {
-      for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-        ValueList Operands;
-        // Prepare the operand vector.
-        for (unsigned j = 0; j < VL.size(); ++j)
-          Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-        getTreeUses_rec(Operands, Depth+1);
-      }
-      return;
-    }
-    case Instruction::Store: {
-      ValueList Operands;
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      getTreeUses_rec(Operands, Depth+1);
-      return;
-    }
-    default:
-    return;
-  }
-}
-
-int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
-  Type *ScalarTy = VL[0]->getType();
-
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-
-  /// Don't mess with vectors.
-  if (ScalarTy->isVectorTy()) return max_cost;
-  VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-
-  if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy);
-
-  // Check if all of the operands are constants.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  bool MustScalarizeFlag = false;
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // Must have a single use.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    MustScalarizeFlag |= MustScalarize.count(VL[i]);
-    // This instruction is outside the basic block.
-    if (I && I->getParent() != BB)
-      return getScalarizationCost(VecTy);
-  }
-
-  // Is this a simple vector constant.
-  if (AllConst) return 0;
-
-  // If all of the operands are identical we can broadcast them.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (AllSameScalar) {
-    // If we are in a loop, and this is not an instruction (e.g. constant or
-    // argument) or the instruction is defined outside the loop then assume
-    // that the cost is zero.
-    if (L && (!VL0 || !L->contains(VL0)))
-      return 0;
-
-    // We need to broadcast the scalar.
-    return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
-  }
-
-  // If this is not a constant, or a scalar from outside the loop then we
-  // need to scalarize it.
-  if (MustScalarizeFlag)
-    return getScalarizationCost(VecTy);
-
-  if (!VL0) return getScalarizationCost(VecTy);
-  assert(VL0->getParent() == BB && "Wrong BB");
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VL.size(); i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy);
-  }
-
-  // Check if it is safe to sink the loads or the stores.
-  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
-    int MaxIdx = InstrIdx[VL0];
-    for (unsigned i = 1, e = VL.size(); i < e; ++i )
-      MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
-
-    Instruction *Last = InstrVec[MaxIdx];
-    for (unsigned i = 0, e = VL.size(); i < e; ++i ) {
-      if (VL[i] == Last) continue;
-      Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last);
-      if (Barrier) {
-        DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " <<
-              *Last << "\n because of " << *Barrier << "\n");
-        return max_cost;
-      }
-    }
-  }
-
-  switch (Opcode) {
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    int Cost = 0;
-    ValueList Operands;
-    Type *SrcTy = VL0->getOperand(0)->getType();
-    // Prepare the operand vector.
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      // Check that the casted type is the same for all users.
-      if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
-        return getScalarizationCost(VecTy);
-    }
-
-    Cost += getTreeCost_rec(Operands, Depth+1);
-    if (Cost >= max_cost) return max_cost;
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                       VL0->getType(), SrcTy);
-
-    VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-    int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    int Cost = 0;
-    // Calculate the cost of all of the operands.
-    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
-      ValueList Operands;
-      // Prepare the operand vector.
-      for (unsigned j = 0; j < VL.size(); ++j)
-        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
-      Cost += getTreeCost_rec(Operands, Depth+1);
-      if (Cost >= max_cost) return max_cost;
-    }
-
-    // Calculate the cost of this instruction.
-    int ScalarCost = VecTy->getNumElements() *
-      TTI->getArithmeticInstrCost(Opcode, ScalarTy);
-
-    int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
-    Cost += (VecCost - ScalarCost);
-    return Cost;
-  }
-  case Instruction::Load: {
-    // If we are scalarize the loads, add the cost of forming the vector.
-    for (unsigned i = 0, e = VL.size()-1; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i], VL[i+1]))
-        return getScalarizationCost(VecTy);
-
-    // Cost of wide load - cost of scalar loads.
-    int ScalarLdCost = VecTy->getNumElements() *
-      TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
-    return VecLdCost - ScalarLdCost;
-  }
-  case Instruction::Store: {
-    // We know that we can merge the stores. Calculate the cost.
-    int ScalarStCost = VecTy->getNumElements() *
-      TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
-    int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0);
-    int StoreCost = VecStCost - ScalarStCost;
-
-    ValueList Operands;
-    for (unsigned j = 0; j < VL.size(); ++j) {
-      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
-      MemBarrierIgnoreList.insert(VL[j]);
-    }
-
-    int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
-    return TotalCost;
-  }
-  default:
-    // Unable to vectorize unknown instructions.
-    return getScalarizationCost(VecTy);
-  }
-}
-
-Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) {
-  int MaxIdx = InstrIdx[BB->getFirstNonPHI()];
-  for (unsigned i = 0; i < VF; ++i )
-    MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]);
-  return InstrVec[MaxIdx + 1];
-}
-
-Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
-  IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements()));
-  Value *Vec = UndefValue::get(Ty);
-  for (unsigned i=0; i < Ty->getNumElements(); ++i) {
-    // Generate the 'InsertElement' instruction.
-    Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i));
-    // Remember that this instruction is used as part of a 'gather' sequence.
-    // The caller of the bottom-up slp vectorizer can try to hoist the sequence
-    // if the users are outside of the basic block.
-    GatherInstructions.push_back(Vec);
-  }
-
-  return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
-  Value *V = vectorizeTree_rec(VL, VF);
-  // We moved some instructions around. We have to number them again
-  // before we can do any analysis.
-  numberInstructions();
-  MustScalarize.clear();
-  return V;
-}
-
-Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
-  Type *ScalarTy = VL[0]->getType();
-  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
-    ScalarTy = SI->getValueOperand()->getType();
-  VectorType *VecTy = VectorType::get(ScalarTy, VF);
-
-  // Check if all of the operands are constants or identical.
-  bool AllConst = true;
-  bool AllSameScalar = true;
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    AllConst &= isa<Constant>(VL[i]);
-    AllSameScalar &= (VL[0] == VL[i]);
-    // The instruction must be in the same BB, and it must be vectorizable.
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB))
-      return Scalarize(VL, VecTy);
-  }
-
-  // Check that this is a simple vector constant.
-  if (AllConst || AllSameScalar) return Scalarize(VL, VecTy);
-
-  // Scalarize unknown structures.
-  Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
-  if (!VL0) return Scalarize(VL, VecTy);
-
-  if (VectorizedValues.count(VL0)) return VectorizedValues[VL0];
-
-  unsigned Opcode = VL0->getOpcode();
-  for (unsigned i = 0, e = VF; i < e; ++i) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    // If not all of the instructions are identical then we have to scalarize.
-    if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy);
-  }
-
-  switch (Opcode) {
-  case Instruction::ZExt:
-  case Instruction::SExt:
-  case Instruction::FPToUI:
-  case Instruction::FPToSI:
-  case Instruction::FPExt:
-  case Instruction::PtrToInt:
-  case Instruction::IntToPtr:
-  case Instruction::SIToFP:
-  case Instruction::UIToFP:
-  case Instruction::Trunc:
-  case Instruction::FPTrunc:
-  case Instruction::BitCast: {
-    ValueList INVL;
-    for (int i = 0; i < VF; ++i)
-      INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-    Value *InVec = vectorizeTree_rec(INVL, VF);
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    CastInst *CI = dyn_cast<CastInst>(VL0);
-    Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Add:
-  case Instruction::FAdd:
-  case Instruction::Sub:
-  case Instruction::FSub:
-  case Instruction::Mul:
-  case Instruction::FMul:
-  case Instruction::UDiv:
-  case Instruction::SDiv:
-  case Instruction::FDiv:
-  case Instruction::URem:
-  case Instruction::SRem:
-  case Instruction::FRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
-  case Instruction::And:
-  case Instruction::Or:
-  case Instruction::Xor: {
-    ValueList LHSVL, RHSVL;
-    for (int i = 0; i < VF; ++i) {
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
-    }
-
-    Value *RHS = vectorizeTree_rec(RHSVL, VF);
-    Value *LHS = vectorizeTree_rec(LHSVL, VF);
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
-    VectorizedValues[VL0] = V;
-    return V;
-  }
-  case Instruction::Load: {
-    LoadInst *LI = cast<LoadInst>(VL0);
-    unsigned Alignment = LI->getAlignment();
-
-    // Check if all of the loads are consecutive.
-    for (unsigned i = 1, e = VF; i < e; ++i)
-      if (!isConsecutiveAccess(VL[i-1], VL[i]))
-        return Scalarize(VL, VecTy);
-
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
-                                          VecTy->getPointerTo());
-    LI = Builder.CreateLoad(VecPtr);
-    LI->setAlignment(Alignment);
-    VectorizedValues[VL0] = LI;
-    return LI;
-  }
-  case Instruction::Store: {
-    StoreInst *SI = cast<StoreInst>(VL0);
-    unsigned Alignment = SI->getAlignment();
-
-    ValueList ValueOp;
-    for (int i = 0; i < VF; ++i)
-      ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
-
-    Value *VecValue = vectorizeTree_rec(ValueOp, VF);
-
-    IRBuilder<> Builder(GetLastInstr(VL, VF));
-    Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(),
-                                          VecTy->getPointerTo());
-    Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
-
-    for (int i = 0; i < VF; ++i)
-      cast<Instruction>(VL[i])->eraseFromParent();
-    return 0;
-  }
-  default:
-    Value *S = Scalarize(VL, VecTy);
-    VectorizedValues[VL0] = S;
-    return S;
-  }
-}
-
-} // end of namespace
diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h
deleted file mode 100644
index 5456c6c..0000000
--- a/lib/Transforms/Vectorize/VecUtils.h
+++ /dev/null
@@ -1,164 +0,0 @@
-//===- VecUtils.h - Vectorization Utilities -------------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of classes and functions manipulate vectors and chains of
-// vectors.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include <vector>
-
-namespace llvm {
-
-class BasicBlock; class Instruction; class Type;
-class VectorType; class StoreInst; class Value;
-class ScalarEvolution; class DataLayout;
-class TargetTransformInfo; class AliasAnalysis;
-class Loop;
-
-/// Bottom Up SLP vectorization utility class.
-struct BoUpSLP  {
-  typedef SmallVector<Value*, 8> ValueList;
-  typedef SmallPtrSet<Value*, 16> ValueSet;
-  typedef SmallVector<StoreInst*, 8> StoreList;
-  static const int max_cost = 1<<20;
-
-  // \brief C'tor.
-  BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl,
-         TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp);
-
-  /// \brief Take the pointer operand from the Load/Store instruction.
-  /// \returns NULL if this is not a valid Load/Store instruction.
-  static Value *getPointerOperand(Value *I);
-
-  /// \brief Take the address space operand from the Load/Store instruction.
-  /// \returns -1 if this is not a valid Load/Store instruction.
-  static unsigned getAddressSpaceOperand(Value *I);
-
-  /// \returns true if the memory operations A and B are consecutive.
-  bool isConsecutiveAccess(Value *A, Value *B);
-
-  /// \brief Vectorize the tree that starts with the elements in \p VL.
-  /// \returns the vectorized value.
-  Value *vectorizeTree(ArrayRef<Value *> VL, int VF);
-
-  /// \returns the vectorization cost of the subtree that starts at \p VL.
-  /// A negative number means that this is profitable.
-  int getTreeCost(ArrayRef<Value *> VL);
-
-  /// \returns the scalarization cost for this list of values. Assuming that
-  /// this subtree gets vectorized, we may need to extract the values from the
-  /// roots. This method calculates the cost of extracting the values.
-  int getScalarizationCost(ArrayRef<Value *> VL);
-
-  /// \brief Attempts to order and vectorize a sequence of stores. This
-  /// function does a quadratic scan of the given stores.
-  /// \returns true if the basic block was modified.
-  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
-
-  /// \brief Vectorize a group of scalars into a vector tree.
-  void vectorizeArith(ArrayRef<Value *> Operands);
-
-  /// \returns the list of new instructions that were added in order to collect
-  /// scalars into vectors. This list can be used to further optimize the gather
-  /// sequences.
-  ValueList &getGatherSeqInstructions() {return GatherInstructions; }
-
-private:
-  /// \brief This method contains the recursive part of getTreeCost.
-  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This recursive method looks for vectorization hazards such as
-  /// values that are used by multiple users and checks that values are used
-  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
-  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
-
-  /// \brief This method contains the recursive part of vectorizeTree.
-  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
-
-  /// \brief Number all of the instructions in the block.
-  void numberInstructions();
-
-  ///  \brief Vectorize a sorted sequence of stores.
-  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
-
-  /// \returns the scalarization cost for this type. Scalarization in this
-  /// context means the creation of vectors from a group of scalars.
-  int getScalarizationCost(Type *Ty);
-
-  /// \returns the AA location that is being access by the instruction.
-  AliasAnalysis::Location getLocation(Instruction *I);
-
-  /// \brief Checks if it is possible to sink an instruction from
-  /// \p Src to \p Dst.
-  /// \returns the pointer to the barrier instruction if we can't sink.
-  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
-
-  /// \returns the instruction that appears last in the BB from \p VL.
-  /// Only consider the first \p VF elements.
-  Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF);
-
-  /// \returns a vector from a collection of scalars in \p VL.
-  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
-
-private:
-  /// Maps instructions to numbers and back.
-  SmallDenseMap<Value*, int> InstrIdx;
-  /// Maps integers to Instructions.
-  std::vector<Instruction*> InstrVec;
-
-  // -- containers that are used during getTreeCost -- //
-
-  /// Contains values that must be scalarized because they are used
-  /// by multiple lanes, or by users outside the tree.
-  /// NOTICE: The vectorization methods also use this set.
-  ValueSet MustScalarize;
-
-  /// Contains a list of values that are used outside the current tree. This
-  /// set must be reset between runs.
-  ValueSet MultiUserVals;
-  /// Maps values in the tree to the vector lanes that uses them. This map must
-  /// be reset between runs of getCost.
-  std::map<Value*, int> LaneMap;
-  /// A list of instructions to ignore while sinking
-  /// memory instructions. This map must be reset between runs of getCost.
-  SmallPtrSet<Value *, 8> MemBarrierIgnoreList;
-
-  // -- Containers that are used during vectorizeTree -- //
-
-  /// Maps between the first scalar to the vector. This map must be reset
-  ///between runs.
-  DenseMap<Value*, Value*> VectorizedValues;
-
-  // -- Containers that are used after vectorization by the caller -- //
-
-  /// A list of instructions that are used when gathering scalars into vectors.
-  /// In many cases these instructions can be hoisted outside of the BB.
-  /// Iterating over this list is faster than calling LICM.
-  ValueList GatherInstructions;
-
-  // Analysis and block reference.
-  BasicBlock *BB;
-  ScalarEvolution *SE;
-  DataLayout *DL;
-  TargetTransformInfo *TTI;
-  AliasAnalysis *AA;
-  Loop *L;
-};
-
-} // end of namespace
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
author	dim <dim@FreeBSD.org>	2013-12-22 00:04:03 +0000
committer	dim <dim@FreeBSD.org>	2013-12-22 00:04:03 +0000
commit	8cf58e3ee36bd550746fca361a894e2727485200 (patch)
tree	2ba0398b4c42ad4f55561327538044fd2c925a8b /lib
parent	aa45f148926e3461a1fd8b10c990f0a51a908cc9 (diff)
download	FreeBSD-src-8cf58e3ee36bd550746fca361a894e2727485200.zip FreeBSD-src-8cf58e3ee36bd550746fca361a894e2727485200.tar.gz